def chip_bed_tomat0(id_names, chip_peaks_file, annotation_file, output_path=None, window_size=0,
                    gene_body_flag=False, tss_flag=False):
    """
    Process BED files of peaks into an integer peak-count matrix

    :param id_names: list(str)
        List of sample IDs (one per BED file) to use as column names
    :param chip_peaks_file: list(str)
        List of paths to BED peak files
    :param output_path: str
        Path to the output TSV file
    :param annotation_file: str
        Path to the GTF annotation file
    :param window_size: int
        Window on each side of a gene to include a peak in the count
        100 means 100bp up from start and 100bp down from end
    :param gene_body_flag: bool
        Count peaks that overlap the (windowed) gene body
    :param tss_flag: bool
        Count peaks that overlap a window around the transcription start site
    :return prior_data: pd.DataFrame
        Integer count matrix of peaks per gene
    """

    # Convert paths to absolutes
    output_path = file_path_abs(output_path) if output_path is not None else None
    annotation_file = file_path_abs(annotation_file)

    # Load annotations into a dataframe with pybedtools
    # Adjust the start and stop positions to account for a flanking window
    genes = load_gtf_to_dataframe(annotation_file)

    if gene_body_flag:
        genes = open_window(genes, window_size)

    if tss_flag:
        genes = open_tss(genes, window_size)

    prior_data = pd.DataFrame(index=genes[GTF_GENENAME])

    for id_name, peak_file in zip(id_names, chip_peaks_file):

        # Load BED file into a dataframe with pybedtools
        peak_file = file_path_abs(peak_file)
        chip_peaks = pybedtools.BedTool(peak_file).to_dataframe()
        gene_counts = get_peaks_in_features(genes, chip_peaks)

        # Get non-zero quantiles and use them to bin peak overlap by length
        quantiles = gene_counts.loc[gene_counts[SEQ_COUNTS] != 0, SEQ_COUNTS].quantile(PEAK_QUANTILES)
        gene_counts[SEQ_BIN] = 0

        for i, qval in enumerate(quantiles.sort_values(ascending=True)):
            gene_counts.loc[gene_counts[SEQ_COUNTS] >= qval, SEQ_BIN] = i + 1

        # Rename the column with ID and reindex for join
        gene_counts = gene_counts.rename(columns={SEQ_BIN: id_name}).set_index(GTF_GENENAME).drop([SEQ_COUNTS], axis=1)
        prior_data = prior_data.join(gene_counts, on=[GTF_GENENAME])

    if output_path is not None:
        prior_data.to_csv(output_path, sep="\t")

    return prior_data

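# Hedged usage sketch (illustrative only, not part of the module): how chip_bed_tomat0 might be
# called for two ChIP-seq samples. The sample IDs, BED/GTF paths, and output file below are
# hypothetical; the function joins the binned peak counts into one gene-by-sample matrix.
def _example_chip_prior():
    chip_prior = chip_bed_tomat0(
        id_names=["TF1_rep1", "TF1_rep2"],                 # hypothetical sample IDs
        chip_peaks_file=["tf1_rep1.bed", "tf1_rep2.bed"],  # hypothetical BED files
        annotation_file="annotations.gtf",                 # hypothetical GTF file
        output_path="chip_prior.tsv",
        window_size=1000,
        gene_body_flag=True)
    print(chip_prior.head())
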
def get_srr_files(srr_list, target_path, num_workers=5, prefetch_options=PREFETCH_OPTIONS):
    """
    Take a list of SRR ID strings, download them asynchronously with num_workers concurrent jobs,
    and return a list of the paths to the SRR files that have been downloaded.

    :param srr_list: list(str)
        List of SRA IDs to acquire from NCBI
    :param target_path: str
        Target path for the SRA files
    :param num_workers: int
        Number of concurrent jobs to run
    :param prefetch_options: list(str)
        Any additional command line arguments to pass to prefetch
    :return srr_file_names: list(str)
        Paths to the downloaded SRR files, in the same order as srr_list
    """
    sem = asyncio.Semaphore(num_workers)

    srr_file_names = list(map(lambda x: os.path.join(file_path_abs(target_path), x + SRA_EXTENSION),
                              srr_list))

    tasks = [_get_srr(sid, sfn, sem, prefetch_options=prefetch_options)
             for sid, sfn in zip(srr_list, srr_file_names)]

    try:
        return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks))
    except RuntimeError:
        return asyncio.new_event_loop().run_until_complete(asyncio.gather(*tasks))

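# Hedged usage sketch (illustrative only): download three hypothetical SRA runs with two
# concurrent prefetch jobs. The accessions and target directory are made up; the returned
# local .sra paths line up with the input IDs.
def _example_get_srr_files():
    srr_ids = ["SRR0000001", "SRR0000002", "SRR0000003"]  # hypothetical accessions
    sra_paths = get_srr_files(srr_ids, "data/sra", num_workers=2)
    for sid, path in zip(srr_ids, sra_paths):
        print(sid, "->", path)
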
def atac_tomat0(srr_ids, output_path, star_reference_genome, gzip_output=False, cores=4, star_jobs=2,
                star_args=None, min_quality=None):

    star_args = [] if star_args is None else star_args

    output_path = file_path_abs(output_path)
    os.makedirs(output_path, exist_ok=True)

    # Download all the SRR files
    print("Downloading SRR files")
    os.makedirs(os.path.join(output_path, SRR_SUBPATH), exist_ok=True)
    srr_file_names = get_srr_files(srr_ids, os.path.join(output_path, SRR_SUBPATH), num_workers=cores)

    # Unpack all the SRR files into FASTQ files
    print("Unpacking SRR files")
    os.makedirs(os.path.join(output_path, FASTQ_SUBPATH), exist_ok=True)
    fastq_file_names = unpack_srr_files(srr_ids, srr_file_names, os.path.join(output_path, FASTQ_SUBPATH),
                                        num_workers=cores)

    # Run all the FASTQ files through STAR to align
    print("Aligning FASTQ files")
    os.makedirs(os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), exist_ok=True)
    thread_count = max(int(cores / len(srr_ids)), int(cores / star_jobs))
    sam_file_names = star_align_fastqs(srr_ids, fastq_file_names, star_reference_genome,
                                       os.path.join(output_path, STAR_ALIGNMENT_SUBPATH),
                                       num_workers=star_jobs, threads_per_worker=thread_count,
                                       star_options=star_args)

    # Sort all the SAM files into BAM files
    print("Sorting SAM files into BAM files")
    os.makedirs(os.path.join(output_path, BAM_SUBPATH), exist_ok=True)
    bam_file_names = sam_sort(srr_ids, sam_file_names, os.path.join(output_path, BAM_SUBPATH),
                              min_quality=min_quality, num_workers=cores)

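# Hedged usage sketch (illustrative only, not part of the module): run the ATAC-seq download,
# align, and sort steps for two hypothetical SRA runs. The accessions, STAR index path, output
# directory, and MAPQ cutoff below are assumptions; outputs are written under output_path.
def _example_atac_pipeline():
    atac_tomat0(srr_ids=["SRR0000001", "SRR0000002"],   # hypothetical accessions
                output_path="atacseq_out",
                star_reference_genome="star_index",     # prebuilt index, e.g. from star_mkref()
                cores=8,
                star_jobs=2,
                min_quality=30)
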
def check_list_of_files_exist(file_list):
    """
    Check a list of file names and return the subset of the list that exists
    (or an empty list if none exist)

    :param file_list: list(str)
        List of file names
    :return existing_file_list: list(str)
        List of files that exist
    """

    existing_file_list = []

    for file_name in file_list:
        if os.path.exists(file_path_abs(file_name)):
            existing_file_list.append(file_name)

    return existing_file_list

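# Hedged usage sketch: check which of several candidate FASTQ outputs already exist so a rerun
# can be skipped. The file names here are hypothetical.
def _example_check_existing():
    candidates = ["fastq/SRR0000001_1.fastq.gz", "fastq/SRR0000001_2.fastq.gz"]
    existing = check_list_of_files_exist(candidates)
    if existing:
        print("Already unpacked:", " ".join(existing))
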
def test_file_path_abs(self):
    self.assertEqual(os.path.abspath(os.path.expanduser("~")), utils.file_path_abs("~"))

def srr_tomat0(srr_ids, output_path, star_reference_genome, annotation_file, gzip_output=False, cores=4,
               star_jobs=2, star_args=None):

    star_args = [] if star_args is None else star_args

    output_path = file_path_abs(output_path)
    os.makedirs(output_path, exist_ok=True)

    # Download all the SRR files
    print("Downloading SRR files")
    os.makedirs(os.path.join(output_path, SRR_SUBPATH), exist_ok=True)
    srr_file_names = get_srr_files(srr_ids, os.path.join(output_path, SRR_SUBPATH), num_workers=cores)

    # Unpack all the SRR files into FASTQ files
    print("Unpacking SRR files")
    os.makedirs(os.path.join(output_path, FASTQ_SUBPATH), exist_ok=True)
    fastq_file_names = unpack_srr_files(srr_ids, srr_file_names, os.path.join(output_path, FASTQ_SUBPATH),
                                        num_workers=cores)

    # Run all the FASTQ files through STAR to align
    print("Aligning FASTQ files")
    os.makedirs(os.path.join(output_path, STAR_ALIGNMENT_SUBPATH), exist_ok=True)
    thread_count = max(int(cores / len(srr_ids)), int(cores / star_jobs))
    sam_file_names = star_align_fastqs(srr_ids, fastq_file_names, star_reference_genome,
                                       os.path.join(output_path, STAR_ALIGNMENT_SUBPATH),
                                       num_workers=star_jobs, threads_per_worker=thread_count,
                                       star_options=star_args)

    # Run all the SAM files through HTSeq.count to count
    print("Counting SAM alignments")
    os.makedirs(os.path.join(output_path, HTSEQ_ALIGNMENT_SUBPATH), exist_ok=True)
    count_file_names = htseq_count_aligned(srr_ids, sam_file_names, annotation_file,
                                           os.path.join(output_path, HTSEQ_ALIGNMENT_SUBPATH),
                                           num_workers=cores)

    # Convert the count files into a matrix and save it to a TSV
    print("Assembling result matrix")
    count_matrix, count_metadata = pileup_raw_counts(srr_ids, count_file_names)
    count_matrix_file_name = os.path.join(output_path, OUTPUT_COUNT_FILE_NAME)

    # Save the raw counts file
    if gzip_output:
        count_matrix.to_csv(count_matrix_file_name + ".gz", compression='gzip', sep="\t")
    else:
        count_matrix.to_csv(count_matrix_file_name, sep="\t")

    # Save the count metadata file
    count_metadata.to_csv(os.path.join(output_path, OUTPUT_COUNT_METADATA_NAME), sep="\t")

    # Normalize to FPKM
    print("Normalizing result matrix to FPKM")
    normalized_count_matrix_fpkm = normalize_matrix_to_fpkm(count_matrix, annotation_file)
    fpkm_file_name = os.path.join(output_path, OUTPUT_FPKM_FILE_NAME)

    # Save the normalized counts file
    if gzip_output:
        normalized_count_matrix_fpkm.to_csv(fpkm_file_name + ".gz", compression='gzip', sep="\t")
    else:
        normalized_count_matrix_fpkm.to_csv(fpkm_file_name, sep="\t")

    # Normalize to TPM
    print("Normalizing result matrix to TPM")
    normalized_count_matrix_tpm = normalize_matrix_to_tpm(count_matrix, annotation_file)
    tpmx_file_name = os.path.join(output_path, OUTPUT_TPM_FILE_NAME)

    # Save the normalized counts file
    if gzip_output:
        normalized_count_matrix_tpm.to_csv(tpmx_file_name + ".gz", compression='gzip', sep="\t")
    else:
        normalized_count_matrix_tpm.to_csv(tpmx_file_name, sep="\t")

    print("Count file {sh} generated from {srlen} SRA files".format(sh=count_matrix.shape,
                                                                    srlen=len(srr_ids)))

    failed_counts = list(map(lambda x: x is None, count_file_names))
    if any(failed_counts):
        print("{n} Sequence Records could not be counted:".format(n=sum(failed_counts)), end="")
        print("\n\t".join([sid for sid, fail in zip(srr_ids, failed_counts) if fail]))

    return count_matrix

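# Hedged usage sketch: run the full SRR-to-expression pipeline for two hypothetical SRA runs.
# The accessions, STAR index path, GTF path, and output directory are assumptions for
# illustration only; the raw, FPKM, and TPM matrices are written under output_path.
def _example_srr_pipeline():
    counts = srr_tomat0(
        srr_ids=["SRR0000001", "SRR0000002"],     # hypothetical accessions
        output_path="rnaseq_out",
        star_reference_genome="star_index",       # prebuilt index, e.g. from star_mkref()
        annotation_file="annotations.gtf",
        gzip_output=True,
        cores=8,
        star_jobs=2)
    print(counts.shape)
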
async def _unpack_srr(srr_id, srr_file_name, target_path, semaphore):
    """
    Unpack a single SRR file into gzipped FASTQ file(s) with fastq-dump

    :param srr_id: str
        NCBI SRR ID string
    :param srr_file_name: str
        The complete path to the SRR file
    :param target_path: str
        The path to put the FASTQ file(s)
    :param semaphore: asyncio.Semaphore
        Semaphore for resource utilization
    :return file_output: list(str)
        Paths to the FASTQ file(s) that were created, or [None] on failure
    """
    async with semaphore:
        if srr_file_name is None:
            return [None]

        # Check and see if this has already been done
        output_file_names = list(map(lambda x: os.path.join(file_path_abs(target_path), srr_id + x),
                                     POSSIBLE_FASTQ_EXTENSIONS))
        files_created = check_list_of_files_exist(output_file_names)

        # If the file is already unpacked, don't do anything
        if len(files_created) > 0:
            print("{id} exists in path {path} ({files})".format(id=srr_id, path=target_path,
                                                                files=" ".join(files_created)))
            return files_created

        # Build a fastq-dump call and execute it
        fastq_dump_call = [FASTQDUMP_EXECUTABLE_PATH, "--gzip", "--split-files",
                           "--outdir", target_path, srr_file_name]

        print(" ".join(fastq_dump_call))

        # Run fastq-dump and get the files that were created from it
        return_code = 0
        try:
            process = await asyncio.create_subprocess_exec(*fastq_dump_call)
            return_code = await process.wait()
            file_output = check_list_of_files_exist(output_file_names)
        except:
            return_code = 1
            file_output = [None]
            raise
        finally:
            # If the fastq-dump failed, clean up the files associated with it and then move on
            if int(return_code) != 0:
                print("NCBI fastq-dump failed for {id} ({file})".format(id=srr_id, file=srr_file_name))
                files_created = check_list_of_files_exist(output_file_names)
                for f in files_created:
                    try:
                        os.remove(f)
                    except FileNotFoundError:
                        pass
                file_output = [None]

        # Find out which read files were created by looking into the output folder
        return file_output

async def _star_align(srr_id, fastq_file_names, reference_genome, output_path, semaphore,
                      threads_per_worker=5, star_options=STAR_DEFAULT_COUNT_OPTIONS):
    """
    Align an individual set of FASTQs from an SRA to the reference genome

    :param srr_id: str
        NCBI SRR ID string
    :param fastq_file_names: list(str)
        A list of FASTQ files for the SRR ID
    :param reference_genome: str
        A path to the STAR reference genome
    :param output_path: str
        A path to the output
    :param semaphore: asyncio.Semaphore
        Semaphore for resource utilization
    :param threads_per_worker: int
        Number of threads to assign to each job in STAR (--runThreadN)
    :param star_options: list(str)
        A list of options to pass to the STAR aligner
    :return output_file: str
        The path to the SAM file generated by STAR
    """
    async with semaphore:
        if fastq_file_names[0] is None:
            return None

        try:
            os.makedirs(output_path)
        except FileExistsError:
            pass

        output_file = os.path.join(file_path_abs(output_path), STAR_ALIGNMENT_FILE_NAME)

        if os.path.exists(output_file):
            print("{id} SAM alignment file exists ({path})".format(id=srr_id, path=output_path))
            return output_file

        # Build the STAR executable call
        star_call = [STAR_EXECUTABLE_PATH,
                     "--runThreadN", str(threads_per_worker),
                     "--runMode", "alignReads",
                     "--readFilesCommand", "zcat",
                     "--genomeDir", reference_genome,
                     "--outFileNamePrefix", os.path.join(file_path_abs(output_path), ''),
                     "--readFilesIn", *fastq_file_names,
                     "--outFilterType", "BySJout"]

        # Add in any additional options
        star_call.extend(star_options)

        print(" ".join(star_call))
        process = await asyncio.create_subprocess_exec(*star_call)
        code = await process.wait()

        if int(code) != 0:
            print("STAR failed for {id} ({files})".format(id=srr_id, files=" ".join(fastq_file_names)))
            return None

        return output_file

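# Hedged usage sketch: how the per-sample _star_align coroutines are typically fanned out with a
# shared semaphore to cap concurrent STAR jobs, mirroring the event-loop pattern used by
# get_srr_files() above. The SRR IDs, FASTQ names, and paths are hypothetical.
def _example_star_align_batch():
    sem = asyncio.Semaphore(2)  # at most two STAR processes at once
    jobs = {"SRR0000001": ["SRR0000001_1.fastq.gz", "SRR0000001_2.fastq.gz"],
            "SRR0000002": ["SRR0000002_1.fastq.gz", "SRR0000002_2.fastq.gz"]}
    tasks = [_star_align(sid, fastqs, "star_index", os.path.join("star_out", sid), sem,
                         threads_per_worker=4)
             for sid, fastqs in jobs.items()]
    return asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks))
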
def star_mkref(output_path, genome_file=None, annotation_file=None, default_genome=None,
               star_options=STAR_DEFAULT_MKREF_OPTIONS, cores=1, gff_annotations=None,
               star_executable=STAR_EXECUTABLE_PATH, move_files=True):
    """
    Make a reference genome index for STAR to align reads to

    :param output_path: str
        Path to output reference index into
    :param genome_file: list(str)
        Genome sequences (usually FASTA)
    :param annotation_file: str
        Annotation file (usually GTF or GFF)
    :param default_genome: str
        A string to identify one of the common genomes
        This will cause the genome data to be downloaded from ENSEMBL
    :param star_options: list
        A list of additional options to pass to STAR
    :param cores: int
        Number of cores to pass to STAR
    :param gff_annotations: bool
        Flag for GFF3 (instead of GTF) annotations. If None, it will autodetect .gff files.
    :param star_executable: str
        Path to the STAR executable
    :param move_files: bool
        Move the genome/annotation files to a `files` path in the STAR reference genome.
        If False, just copy.
    :return output_path: str
        Location where the reference genome has been created
    """

    # Get default genome files from the internet if needed
    if (genome_file is None or annotation_file is None) and default_genome is None:
        raise ValueError("star_mkref() requires (genome_file AND annotation_file) OR default_genome to be passed")
    elif default_genome is not None:
        ((genome_url, genome_file), (annotation_url, annotation_file)) = get_genome_file_locs(default_genome)
        genome_file = [get_file_from_url(genome_url, genome_file)]
        annotation_file = get_file_from_url(annotation_url, annotation_file)

    # Create the output path
    output_path = file_path_abs(output_path)
    try:
        os.makedirs(output_path)
    except FileExistsError:
        pass

    # Uncompress the genome file if it's gzipped
    for i, gf in enumerate(genome_file):
        if gf.endswith(".gz"):
            subprocess.call(["gunzip", gf])
            genome_file[i] = gf[:-3]

    # Uncompress the annotation file if it's gzipped
    if annotation_file.endswith(".gz"):
        subprocess.call(["gunzip", annotation_file])
        annotation_file = annotation_file[:-3]

    # Build the STAR executable call
    star_call = [star_executable,
                 "--outFileNamePrefix", os.path.join(file_path_abs(output_path), ''),
                 "--runThreadN", str(cores),
                 "--runMode", "genomeGenerate",
                 "--genomeDir", output_path,
                 "--genomeFastaFiles", *genome_file,
                 "--sjdbGTFfile", annotation_file]

    # Add any passed-in options
    star_call.extend(star_options)

    # Set a flag for STAR if it's a small genome
    # Sum file sizes as a proxy for genome size (approximately correct for ASCII files)
    star_sa_idx_size = sum(map(lambda x: os.path.getsize(x), genome_file))
    # Calculate genomeSAindexNbases value with the weird equation from the STAR manual
    star_sa_idx_size = int(np.floor(np.log2(star_sa_idx_size) / 2 - 1))

    if star_sa_idx_size < 14:
        star_call.extend(["--genomeSAindexNbases", str(star_sa_idx_size)])

    # Set a flag for STAR if the annotation file is GFF3
    if (gff_annotations is None and ".gff" in annotation_file) or gff_annotations:
        star_call.extend(["--sjdbGTFtagExonParentTranscript", "Parent"])

    # Execute STAR
    print(" ".join(star_call))
    subprocess.call(star_call)

    output_file_path = os.path.join(output_path, "files")
    try:
        os.mkdir(output_file_path)
    except FileExistsError:
        pass

    file_func = os.rename if move_files else shutil.copy2

    for file in genome_file:
        file_func(file, os.path.join(output_file_path, os.path.basename(file)))

    file_func(annotation_file, os.path.join(output_file_path, os.path.basename(annotation_file)))

    return output_path

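# Hedged usage sketch: two ways star_mkref() might be called. The default_genome keyword and the
# local file names below are hypothetical; the supported genome keywords depend on what
# get_genome_file_locs() actually provides.
def _example_star_mkref():
    # Option 1 (hypothetical keyword): download a preconfigured genome and build the index
    index_path = star_mkref("star_index", default_genome="yeast", cores=8)

    # Option 2: supply local files; note genome_file must be a list of FASTA paths
    index_path = star_mkref("star_index",
                            genome_file=["genome.fa.gz"],          # hypothetical FASTA
                            annotation_file="annotations.gtf.gz",  # hypothetical GTF
                            cores=8)
    return index_path
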