Example #1
def chip_bed_tomat0(id_names, chip_peaks_file, annotation_file, output_path=None, window_size=0,
                    gene_body_flag=False, tss_flag=False):
    """
    Process a BED file of peaks into a integer peak-count matrix
    :param chip_peaks_file: list(str)
        List of paths to a BED file
    :param output_path: str
        Path to the output TSV file
    :param annotation_file: str
        Path to the GTF annotation file
    :param window_size: int
        Window on each side of a gene to include a peak in the count
        100 means 100bp up from start and 100bp down from end
    :return gene_counts: pd.DataFrame
        Integer count matrix of peaks per gene
    """

    # Convert paths to absolutes
    output_path = file_path_abs(output_path)
    annotation_file = file_path_abs(annotation_file)

    # Load annotations into a dataframe with pybedtools
    # Adjust the start and stop positions to account for a flanking window
    genes = load_gtf_to_dataframe(annotation_file)

    if gene_body_flag:
        genes = open_window(genes, window_size)
    if tss_flag:
        genes = open_tss(genes, window_size)


    prior_data = pd.DataFrame(index=genes[GTF_GENENAME])
    for id_name, peak_file in zip(id_names, chip_peaks_file):
        # Load BED file into a dataframe with pybedtools
        peak_file = file_path_abs(peak_file)
        chip_peaks = pybedtools.BedTool(peak_file).to_dataframe()
        gene_counts = get_peaks_in_features(genes, chip_peaks)

        # Get non-zero quantiles and use them to bin peak overlap by length
        quantiles = gene_counts.loc[gene_counts[SEQ_COUNTS] != 0, SEQ_COUNTS].quantile(PEAK_QUANTILES)
        gene_counts[SEQ_BIN] = 0

        for i, qval in enumerate(quantiles.sort_values(ascending=True)):
            gene_counts.loc[gene_counts[SEQ_COUNTS] >= qval, SEQ_BIN] = i + 1

        # Rename the column with ID and reindex for join
        gene_counts = gene_counts.rename({SEQ_BIN: id_name}, axis=1).set_index(GTF_GENENAME).drop([SEQ_COUNTS], axis=1)
        prior_data = prior_data.join(gene_counts, on=[GTF_GENENAME])

    if output_path is not None:
        prior_data.to_csv(output_path, sep="\t")

    return prior_data
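
The quantile-binning step above converts raw peak-overlap counts into small ordinal bins, using quantiles computed over the non-zero counts only. A minimal self-contained sketch of that pattern (the column names and the four-quantile split below are placeholders, not the module's SEQ_COUNTS, SEQ_BIN, or PEAK_QUANTILES constants):

import pandas as pd

counts = pd.DataFrame({"count": [0, 0, 3, 10, 25, 100]})

# Quantile thresholds are computed from the non-zero counts only
quantiles = counts.loc[counts["count"] != 0, "count"].quantile([0.25, 0.5, 0.75, 1.0])

# Everything starts in bin 0; each row is bumped into the highest bin whose threshold it reaches
counts["bin"] = 0
for i, qval in enumerate(quantiles.sort_values(ascending=True)):
    counts.loc[counts["count"] >= qval, "bin"] = i + 1

print(counts)  # zero-overlap rows stay in bin 0, the largest counts end up in the top bin
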
Example #2
def get_srr_files(srr_list,
                  target_path,
                  num_workers=5,
                  prefetch_options=PREFETCH_OPTIONS):
    """
    Take a list of SRR ID strings, download them async with num_workers concurrent jobs, and return a list of the
    paths to the SRR files that have been downloaded.
    :param srr_list: list(str)
        List of SRA IDs to acquire from NCBI
    :param target_path: str
        Target path for the SRA files
    :param num_workers: int
        Number of concurrent jobs to run
    :param prefetch_options: list(str)
        Any additional command line arguments to pass to prefetch
    :return:
    """
    sem = asyncio.Semaphore(num_workers)

    srr_file_names = [
        os.path.join(file_path_abs(target_path), sid + SRA_EXTENSION)
        for sid in srr_list
    ]
    tasks = [
        _get_srr(sid, sfn, sem, prefetch_options=prefetch_options)
        for sid, sfn in zip(srr_list, srr_file_names)
    ]

    try:
        return asyncio.get_event_loop().run_until_complete(
            asyncio.gather(*tasks))
    except RuntimeError:
        return asyncio.new_event_loop().run_until_complete(
            asyncio.gather(*tasks))
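
The semaphore is what bounds concurrency here: all of the _get_srr coroutines are created up front, but only num_workers of them can be downloading at any one time. A minimal self-contained sketch of the same pattern, with asyncio.sleep standing in for the prefetch call:

import asyncio

async def bounded_job(job_id, semaphore):
    # Only num_workers coroutines can hold the semaphore at once
    async with semaphore:
        await asyncio.sleep(0.1)  # placeholder for the actual download
        return job_id

async def run_all(job_ids, num_workers=5):
    sem = asyncio.Semaphore(num_workers)
    return await asyncio.gather(*(bounded_job(j, sem) for j in job_ids))

print(asyncio.run(run_all(range(10))))
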
Example #3
def atac_tomat0(srr_ids, output_path, star_reference_genome, gzip_output=False, cores=4, star_jobs=2, star_args=None,
                min_quality=None):
    """
    Download ATAC-seq SRA runs, unpack them to FASTQ, align them with STAR, and sort the alignments into BAM files
    """

    star_args = [] if star_args is None else star_args

    output_path = file_path_abs(output_path)
    os.makedirs(output_path, exist_ok=True)

    # Download all the SRR files
    print("Downloading SRR files")
    os.makedirs(os.path.join(output_path, SRR_SUBPATH), exist_ok=True)
    srr_file_names = get_srr_files(srr_ids, os.path.join(output_path, SRR_SUBPATH), num_workers=cores)

    # Unpack all the SRR files into FASTQ files
    print("Unpacking SRR files")
    os.makedirs(os.path.join(output_path, FASTQ_SUBPATH), exist_ok=True)
    fastq_file_names = unpack_srr_files(srr_ids, srr_file_names, os.path.join(output_path, FASTQ_SUBPATH),
                                        num_workers=cores)

    # Run all the FASTQ files through STAR to align
    print("Aligning FASTQ files")
    os.makedirs(os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), exist_ok=True)
    thread_count = max(int(cores / len(srr_ids)), int(cores / star_jobs))
    sam_file_names = star_align_fastqs(srr_ids, fastq_file_names, star_reference_genome,
                                       os.path.join(output_path, STAR_ALIGNMENT_SUBPATH),
                                       num_workers=star_jobs, threads_per_worker=thread_count, star_options=star_args)

    # Sort all the SAM files into BAM files
    print("Sorting SAM files into BAM files")
    os.makedirs(os.path.join(output_path, BAM_SUBPATH), exist_ok=True)
    bam_file_names = sam_sort(srr_ids, sam_file_names, os.path.join(output_path, BAM_SUBPATH), min_quality=min_quality,
                              num_workers=cores)
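
The threads_per_worker value handed to STAR is derived from the overall core budget: with 8 cores, 6 SRR IDs, and star_jobs=2, max(int(8 / 6), int(8 / 2)) gives 4 threads per STAR worker, so the two concurrent workers together use all 8 cores. A small sketch of that calculation (the function name is illustrative, not part of the module):

def star_threads(cores, n_srr, star_jobs):
    # Give each concurrent STAR worker as many threads as the core budget allows
    return max(int(cores / n_srr), int(cores / star_jobs))

print(star_threads(8, 6, 2))  # 4 threads per worker, 2 workers running at once
print(star_threads(4, 8, 2))  # 2 threads per worker
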
Example #4
def check_list_of_files_exist(file_list):
    """
    Check a list of file names and return the subset that exists (or an empty list if none exist)
    :param file_list: list(str)
        List of file names
    :return existing_file_list: list(str)
        List of files that exist
    """

    existing_file_list = []

    for file_name in file_list:
        if os.path.exists(file_path_abs(file_name)):
            existing_file_list.append(file_name)

    return existing_file_list
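
The same filter can be written as a single list comprehension; a self-contained equivalent, using os.path.expanduser and os.path.abspath directly in place of the module's file_path_abs helper (which, per the test in Example #5 below, expands and absolutizes the path):

import os

def existing_files(file_list):
    # Keep only the names whose expanded absolute path exists on disk
    return [f for f in file_list if os.path.exists(os.path.abspath(os.path.expanduser(f)))]

print(existing_files(["~", "/does/not/exist.fastq.gz"]))  # ['~']
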
Example #5
def test_file_path_abs(self):
    self.assertEqual(os.path.abspath(os.path.expanduser("~")), utils.file_path_abs("~"))
Example #6
def srr_tomat0(srr_ids,
               output_path,
               star_reference_genome,
               annotation_file,
               gzip_output=False,
               cores=4,
               star_jobs=2,
               star_args=None):
    """
    Download SRA runs, unpack them to FASTQ, align them with STAR, count the alignments with HTSeq, and write
    raw count, FPKM, and TPM matrices to TSV files
    """
    star_args = [] if star_args is None else star_args

    output_path = file_path_abs(output_path)
    os.makedirs(output_path, exist_ok=True)

    # Download all the SRR files
    print("Downloading SRR files")
    os.makedirs(os.path.join(output_path, SRR_SUBPATH), exist_ok=True)
    srr_file_names = get_srr_files(srr_ids,
                                   os.path.join(output_path, SRR_SUBPATH),
                                   num_workers=cores)

    # Unpack all the SRR files into FASTQ files
    print("Unpacking SRR files")
    os.makedirs(os.path.join(output_path, FASTQ_SUBPATH), exist_ok=True)
    fastq_file_names = unpack_srr_files(srr_ids,
                                        srr_file_names,
                                        os.path.join(output_path,
                                                     FASTQ_SUBPATH),
                                        num_workers=cores)

    # Run all the FASTQ files through STAR to align
    print("Aligning FASTQ files")
    os.makedirs(os.path.join(output_path, STAR_ALIGNMENT_SUBPATH),
                exist_ok=True)
    thread_count = max(int(cores / len(srr_ids)), int(cores / star_jobs))
    sam_file_names = star_align_fastqs(srr_ids,
                                       fastq_file_names,
                                       star_reference_genome,
                                       os.path.join(output_path,
                                                    STAR_ALIGNMENT_SUBPATH),
                                       num_workers=star_jobs,
                                       threads_per_worker=thread_count,
                                       star_options=star_args)

    # Run all the SAM files through HTSeq.count to count
    print("Counting SAM alignments")
    os.makedirs(os.path.join(output_path, HTSEQ_ALIGNMENT_SUBPATH),
                exist_ok=True)
    count_file_names = htseq_count_aligned(srr_ids,
                                           sam_file_names,
                                           annotation_file,
                                           os.path.join(
                                               output_path,
                                               HTSEQ_ALIGNMENT_SUBPATH),
                                           num_workers=cores)

    # Convert the count files into a matrix and save it to a TSV
    print("Assembling result matrix")
    count_matrix, count_metadata = pileup_raw_counts(srr_ids, count_file_names)
    count_matrix_file_name = os.path.join(output_path, OUTPUT_COUNT_FILE_NAME)

    # Save the raw counts file
    if gzip_output:
        count_matrix.to_csv(count_matrix_file_name + ".gz",
                            compression='gzip',
                            sep="\t")
    else:
        count_matrix.to_csv(count_matrix_file_name, sep="\t")

    # Save the count metadata file
    count_metadata.to_csv(os.path.join(output_path,
                                       OUTPUT_COUNT_METADATA_NAME),
                          sep="\t")

    # Normalize to FPKM
    print("Normalizing result matrix to FPKM")
    normalized_count_matrix_fpkm = normalize_matrix_to_fpkm(
        count_matrix, annotation_file)
    fpkm_file_name = os.path.join(output_path, OUTPUT_FPKM_FILE_NAME)

    # Save the normalized counts file
    if gzip_output:
        normalized_count_matrix_fpkm.to_csv(fpkm_file_name + ".gz",
                                            compression='gzip',
                                            sep="\t")
    else:
        normalized_count_matrix_fpkm.to_csv(fpkm_file_name, sep="\t")

    # Normalize to TPM
    print("Normalizing result matrix to TPM")
    normalized_count_matrix_tpm = normalize_matrix_to_tpm(
        count_matrix, annotation_file)
    tpmx_file_name = os.path.join(output_path, OUTPUT_TPM_FILE_NAME)

    # Save the normalized counts file
    if gzip_output:
        normalized_count_matrix_tpm.to_csv(tpmx_file_name + ".gz",
                                           compression='gzip',
                                           sep="\t")
    else:
        normalized_count_matrix_tpm.to_csv(tpmx_file_name, sep="\t")

    print("Count file {sh} generated from {srlen} SRA files".format(
        sh=count_matrix.shape, srlen=len(srr_ids)))
    failed_counts = list(map(lambda x: x is None, count_file_names))

    if any(failed_counts):
        print("{n} Sequence Records could not be counted:".format(
            n=sum(failed_counts)),
              end="")
        print("\n\t".join(
            [sid for sid, fail in zip(srr_ids, failed_counts) if fail]))

    return count_matrix
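
The gzip-or-plain TSV write is repeated three times above (raw counts, FPKM, TPM). A small helper like the following captures that pattern; it is a sketch for illustration, not part of the module:

import pandas as pd

def write_tsv(df, file_name, gzip_output=False):
    # Append .gz and compress when requested, otherwise write a plain TSV
    if gzip_output:
        df.to_csv(file_name + ".gz", compression="gzip", sep="\t")
    else:
        df.to_csv(file_name, sep="\t")

counts = pd.DataFrame({"SRR0000001": [10, 0, 3]}, index=["geneA", "geneB", "geneC"])
write_tsv(counts, "counts.tsv")         # plain TSV
write_tsv(counts, "counts.tsv", True)   # writes counts.tsv.gz
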
Example #7
async def _unpack_srr(srr_id, srr_file_name, target_path, semaphore):
    """

    :param srr_id: str
        NCBI SRR ID string
    :param srr_file_name: str
        The complete path to the SRR file
    :param target_path: str
        The path to put the FASTQ file(s)
    :param semaphore: asyncio.Semaphore
        Semaphore for resource utilization
    :return: list(str)
        List of paths to the FASTQ file(s) created, or [None] if the SRR file is missing or fastq-dump failed
    """
    async with semaphore:

        if srr_file_name is None:
            return [None]

        # Check and see if this has already been done
        output_file_names = list(
            map(lambda x: os.path.join(file_path_abs(target_path), srr_id + x),
                POSSIBLE_FASTQ_EXTENSIONS))
        files_created = check_list_of_files_exist(output_file_names)

        # If the file is already unpacked, don't do anything
        if len(files_created) > 0:
            print("{id} exists in path {path} ({files})".format(
                id=srr_id, path=target_path, files=" ".join(files_created)))
            return files_created

        # Build a fastq-dump call and execute it
        fastq_dump_call = [
            FASTQDUMP_EXECUTABLE_PATH, "--gzip", "--split-files", "--outdir",
            target_path, srr_file_name
        ]

        print(" ".join(fastq_dump_call))

        # Run fastq-dump and get the files that were created from it
        return_code = 0
        try:
            process = await asyncio.create_subprocess_exec(*fastq_dump_call)
            return_code = await process.wait()
            file_output = check_list_of_files_exist(output_file_names)
        except:
            return_code = 1
            file_output = [None]
            raise
        finally:
            # If the fastq-dump failed, clean up the files associated with it and then move on
            if int(return_code) != 0:
                print("NCBI fastq-dump failed for {id} ({file})".format(
                    id=srr_id, file=srr_file_name))
                files_created = check_list_of_files_exist(output_file_names)
                for f in files_created:
                    try:
                        os.remove(f)
                    except FileNotFoundError:
                        pass
                file_output = [None]

        # Find out which read files were created by looking into the output folder
        return file_output
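
The core pattern in _unpack_srr is: build an argument list, run it with asyncio.create_subprocess_exec, and treat a nonzero exit code as a failure whose partial outputs should be removed so a retry starts clean. A self-contained sketch of that pattern, using the POSIX true command in place of fastq-dump:

import asyncio
import os

async def run_and_clean(cmd, output_files):
    process = await asyncio.create_subprocess_exec(*cmd)
    return_code = await process.wait()
    if return_code != 0:
        # Remove any partial outputs so a later retry starts from scratch
        for f in output_files:
            try:
                os.remove(f)
            except FileNotFoundError:
                pass
        return None
    return output_files

print(asyncio.run(run_and_clean(["true"], ["out_1.fastq.gz"])))  # ['out_1.fastq.gz']
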
Example #8
async def _star_align(srr_id,
                      fastq_file_names,
                      reference_genome,
                      output_path,
                      semaphore,
                      threads_per_worker=5,
                      star_options=STAR_DEFAULT_COUNT_OPTIONS):
    """
    Align an individual set of FASTQs from an SRA to the reference genome
    :param srr_id: str
        NCBI SRR ID string
    :param fastq_file_names: list(str)
        A list of FASTQ files for the SRR ID
    :param reference_genome: str
        A path to the STAR reference genome
    :param output_path: str
        A path to the output
    :param semaphore: asyncio.Semaphore
        Semaphore for resource utilization
    :param threads_per_worker: int
        Number of threads to assign to each job in STAR (--runThreadN)
    :param star_options: list(str)
        A list of options to pass to the STAR aligner
    :return output_file: str
        The path to the SAM file generated by STAR
    """
    async with semaphore:

        if fastq_file_names[0] is None:
            return None

        os.makedirs(output_path, exist_ok=True)

        output_file = os.path.join(file_path_abs(output_path),
                                   STAR_ALIGNMENT_FILE_NAME)

        if os.path.exists(output_file):
            print("{id} SAM alignment file exists ({path})".format(
                id=srr_id, path=output_path))
            return output_file

        # Build the STAR executable call
        star_call = [
            STAR_EXECUTABLE_PATH, "--runThreadN",
            str(threads_per_worker), "--runMode", "alignReads",
            "--readFilesCommand", "zcat", "--genomeDir", reference_genome,
            "--outFileNamePrefix",
            os.path.join(file_path_abs(output_path), ''), "--readFilesIn",
            *fastq_file_names, "--outFilterType", "BySJout"
        ]

        # Add in any additional options
        star_call.extend(star_options)

        print(" ".join(star_call))
        process = await asyncio.create_subprocess_exec(*star_call)
        code = await process.wait()

        if int(code) != 0:
            print("STAR failed for {id} ({files})".format(
                id=srr_id, files=" ".join(fastq_file_names)))
            return None

        return output_file
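
For reference, this is roughly what the assembled call list expands to before star_options are appended; the SRR ID, paths, and the bare "STAR" executable name are placeholders:

fastq_file_names = ["SRR0000000_1.fastq.gz", "SRR0000000_2.fastq.gz"]
star_call = [
    "STAR", "--runThreadN", "5", "--runMode", "alignReads",
    "--readFilesCommand", "zcat", "--genomeDir", "/path/to/star_index",
    "--outFileNamePrefix", "/path/to/output/SRR0000000/",
    "--readFilesIn", *fastq_file_names, "--outFilterType", "BySJout",
]
print(" ".join(star_call))
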
Example #9
def star_mkref(output_path,
               genome_file=None,
               annotation_file=None,
               default_genome=None,
               star_options=STAR_DEFAULT_MKREF_OPTIONS,
               cores=1,
               gff_annotations=None,
               star_executable=STAR_EXECUTABLE_PATH,
               move_files=True):
    """
    Make a reference genome index for STAR to align reads to
    :param output_path: str
        Path to output reference index into
    :param genome_file: list(str)
        Genome sequences (usually FASTA)
    :param annotation_file: str
        Annotation file (usually GTF or GFF)
    :param default_genome: str
        A string to identify one of the common genomes
        This will cause the genome data to be downloaded from ENSEMBL
    :param star_options: list
        A list of additional options to pass to STAR
    :param cores: int
        Number of cores to pass to STAR
    :param gff_annotations: bool
        Flag for GFF3 (instead of GTF) annotations. If None, it will autodetect .gff files.
    :param star_executable: str
        Path to the STAR executable
    :param move_files: bool
        Move the genome/annotation files to a `files` path in the STAR reference genome. If false, just copy.
    :return output_path: str
        Location where the reference genome has been created
    """

    # Get default genome files from the internet if needed
    if (genome_file is None
            or annotation_file is None) and default_genome is None:
        raise ValueError(
            "star_mkref() requires (genome_file AND annotation_file) OR default_genome to be passed"
        )
    elif default_genome is not None:
        ((genome_url, genome_file),
         (annotation_url,
          annotation_file)) = get_genome_file_locs(default_genome)
        genome_file = [get_file_from_url(genome_url, genome_file)]
        annotation_file = get_file_from_url(annotation_url, annotation_file)

    # Create the output path
    output_path = file_path_abs(output_path)
    os.makedirs(output_path, exist_ok=True)

    # Uncompress the genome file if it's gzipped
    for i, gf in enumerate(genome_file):
        if gf.endswith(".gz"):
            subprocess.call(["gunzip", gf])
            genome_file[i] = gf[:-3]

    # Uncompress the annotation file if it's gzipped
    if annotation_file.endswith(".gz"):
        subprocess.call(["gunzip", annotation_file])
        annotation_file = annotation_file[:-3]

    # Build the STAR executable call
    star_call = [
        star_executable, "--outFileNamePrefix",
        os.path.join(file_path_abs(output_path), ''), "--runThreadN",
        str(cores), "--runMode", "genomeGenerate", "--genomeDir", output_path,
        "--genomeFastaFiles", *genome_file, "--sjdbGTFfile", annotation_file
    ]

    # Add any passed-in options
    star_call.extend(star_options)

    # Set a flag for STAR if it's a small genome
    # Sum file sizes as a proxy for genome size (approximately correct for ASCII files)
    star_sa_idx_size = sum(map(lambda x: os.path.getsize(x), genome_file))
    # Calculate the --genomeSAindexNbases value using the STAR manual's formula: min(14, log2(GenomeLength) / 2 - 1)
    star_sa_idx_size = int(np.floor(np.log2(star_sa_idx_size) / 2 - 1))
    if star_sa_idx_size < 14:
        star_call.extend(["--genomeSAindexNbases", str(star_sa_idx_size)])

    # Set a flag for STAR if the annotation file is GFF3
    if (gff_annotations is None
            and ".gff" in annotation_file) or gff_annotations:
        star_call.extend(["--sjdbGTFtagExonParentTranscript", "Parent"])

    # Execute STAR
    print(" ".join(star_call))
    subprocess.call(star_call)

    output_file_path = os.path.join(output_path, "files")
    os.makedirs(output_file_path, exist_ok=True)

    if move_files:
        file_func = os.rename
    else:
        file_func = shutil.copy2

    for file in genome_file:
        file_func(file, os.path.join(output_file_path, os.path.basename(file)))
    file_func(
        annotation_file,
        os.path.join(output_file_path, os.path.basename(annotation_file)))

    return output_path
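
The small-genome adjustment above matters for yeast-sized or bacterial genomes: STAR's default --genomeSAindexNbases of 14 is tuned for mammalian-scale genomes, and the manual's formula min(14, log2(GenomeLength) / 2 - 1) scales it down for smaller ones. A self-contained sketch of that calculation for a hypothetical 12 Mb genome:

import numpy as np

genome_size = 12_000_000  # ~12 Mb, roughly yeast-sized
sa_index_nbases = min(14, int(np.floor(np.log2(genome_size) / 2 - 1)))
print(sa_index_nbases)  # 10; mammalian-sized genomes keep the default of 14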