def get_conservation(transcript_list, output_file, max_dS_threshold=None, max_omega_threshold=None):
    """
    Get the conservation for a list of sequences and only keep those that pass.

    Args:
        transcript_list (dict): dict containing transcript id, the cds and the ortholog seqs
        output_file (str): path to output file
        max_dS_threshold (float): if set, the dS threshold you wish alignments to be below
        max_omega_threshold (float): if set, the omega threshold you wish alignments to be below
    """
    print("Getting the most conserved ortholog for each transcript...")
    temp_dir = "temp_conservation_files"
    gen.create_output_directories(temp_dir)
    # get a list of the transcript ids
    transcript_ids = list(transcript_list.keys())
    # transcript_ids = transcript_ids[:200]
    # run this linearly because it doesn't like being parallelised
    # outputs = run_conservation_check(transcript_ids, transcript_list, max_dS_threshold, max_omega_threshold, temp_dir)
    outputs = gen.run_parallel_function(
        transcript_ids,
        [transcript_list, max_dS_threshold, max_omega_threshold, temp_dir],
        run_conservation_check,
        parallel=False)
    # remove the old output file if there is one
    gen.remove_file(output_file)
    # now concat the output files
    args = ["cat"] + outputs
    gen.run_process(args, file_for_output=output_file)
    gen.remove_directory(temp_dir)

def bam_nm_filter(input_bam, output, nm_less_equal_to=None):
    '''
    Filter bam reads by NM value.
    nm_less_equal_to: the maximum NM value you wish to keep.
    '''
    # use an explicit None check so an NM threshold of 0 is accepted
    if nm_less_equal_to is None:
        print("Please provide NM filter value.")
        raise Exception
    # create output file; if a .bam is requested, write to a .sam first
    if output[-4:] == ".bam":
        output_file = "{0}.sam".format(output[:-4])
    else:
        output_file = output
    sam_output = gen.run_process(["samtools", "view", "-h", input_bam])
    # create grep args and include header fields if they exist
    grep_args = ["^@"]
    # for each NM value up to and including the threshold, create a grep arg
    for i in range(nm_less_equal_to + 1):
        grep_args.append("\|\tNM:i:{0}\t".format(i))
    grep_args = "".join(grep_args)
    gen.run_process(["grep", grep_args], input_to_pipe=sam_output, file_for_output=output_file)
    # if wanting to create a bam, create the bam and delete the sam
    if output != output_file:
        samtools_args = ["samtools", "view", "-bh", output_file]
        gen.run_process(samtools_args, file_for_output=output)
        gen.remove_file(output_file)

def blast_sequences(fasta_file, database_path, output_file, evalue=None):
    """
    Given a fasta file and a database, run a blast.

    Args:
        fasta_file (str): path to fasta file containing sequences to blast
        database_path (str): path to local blast database
        output_file (str): path to output file
        evalue (str): if set, the e-value threshold to pass to blastn (defaults to "1e-04")
    """
    print("BLASTing sequences...")
    if not evalue:
        evalue = "1e-04"
    # isinstance rather than comparing type() to the string "str", which is always unequal
    elif not isinstance(evalue, str):
        print("evalue must be a string!")
        raise Exception
    # run blast (the BLAST+ binary is lowercase "blastn")
    args = [
        "blastn", "-task", "blastn", "-query", fasta_file, "-db",
        database_path, "-out", output_file, "-outfmt", "10", "-evalue",
        evalue, "-num_threads", str(int(os.cpu_count() - 3))
    ]
    gen.run_process(args)

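# Usage sketch (hypothetical file paths): build a nucleotide database with
# make_blast_database below, then BLAST a set of queries against it. Note the
# e-value is passed as a string, as blast_sequences expects.
def _example_blast_workflow():
    make_blast_database("subjects.fa", "blast_db/subjects")
    blast_sequences("queries.fa", "blast_db/subjects", "blast_hits.csv", evalue="1e-10")
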
def retrieve_bams_core(all_files, local_directory, host, user, password, ftp_directory, expect_string):
    '''
    Core function parallelized in retrieve_bams above.
    '''
    # connect to FTP server
    ftp = gen.ftp_connect(host, user, password, directory=ftp_directory)
    # loop over .bam files
    for pos, bam_file in enumerate(all_files):
        expect_file = "temp_data/expect_file{0}.txt".format(random.random())
        start_time = time.time()
        print("{0}/{1}".format(pos, len(all_files)))
        local_bam_file = "{0}/{1}".format(local_directory, bam_file)
        # retrieve current file
        if not os.path.isfile(local_bam_file):
            ftp = gen.ftp_retrieve(ftp, host, user, password, ftp_directory, bam_file, destination=local_directory)
        # transfer file to Watson
        current_expect_string = str.replace(expect_string, "foo", bam_file)
        with open(expect_file, "w") as e_file:
            e_file.write(current_expect_string)
        gen.run_process(["expect", expect_file])
        print("Transferred to Watson.")
        gen.remove_file(expect_file)
        gen.remove_file(local_bam_file)
        # round to 3 decimal places (the precision argument belongs inside round())
        print("Time spent: {0} minutes.\n".format(
            round((time.time() - start_time) / 60, 3)))
        ftp = gen.ftp_check(ftp, host, user, password, ftp_directory)
    ftp.quit()

def fasta_from_intervals(bed_file, fasta_file, genome_fasta, force_strand=True, names=False):
    """
    Takes a bed file and creates a fasta file with the corresponding sequences.
    Credit: Rosina Savisaar

    Args:
        bed_file (str): the bed file path to create fasta from
        fasta_file (str): the output fasta file path
        genome_fasta (str): the file path to the genome fasta file
        names (bool): if False, the fasta record names are generated from the
            sequence coordinates; if True, they correspond to whatever is in
            the 'name' field of the bed file
    """
    # if the index file exists, check whether the expected features are present
    genome_fasta_index = genome_fasta + '.fai'
    if os.path.exists(genome_fasta_index):
        bed_chrs = sorted(list(set([entry[0] for entry in gen.read_many_fields(bed_file, "\t")])))
        index_chrs = sorted(list(set([entry[0] for entry in gen.read_many_fields(genome_fasta_index, "\t")])))
        # if the bed file uses chromosomes missing from the index, force a rebuild
        if not set(bed_chrs).issubset(set(index_chrs)):
            gen.remove_file(genome_fasta_index)
    bedtools_args = ["bedtools", "getfasta", "-s", "-fi", genome_fasta, "-bed", bed_file, "-fo", fasta_file]
    if not force_strand:
        del bedtools_args[2]
    if names:
        bedtools_args.append("-name")
    gen.run_process(bedtools_args)
    # uppercase the sequences and rewrite the fasta
    names, seqs = gen.read_fasta(fasta_file)
    seqs = [i.upper() for i in seqs]
    gen.write_to_fasta(names, seqs, fasta_file)

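# Minimal usage sketch for fasta_from_intervals (hypothetical paths): extract
# strand-aware exon sequences, naming each record after the bed 'name' field.
def _example_fasta_from_intervals():
    fasta_from_intervals("exons.bed", "exons.fa", "genome.fa", force_strand=True, names=True)
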
def convert_bed(input_bed, output_bed=None, to_hg38=True):
    """
    Convert bed file chromosome names between the hg37 and hg38 conventions,
    i.e. strip or add the "chr" prefix.

    Args:
        input_bed (str): path to bed file
        output_bed (str): if set, path to output file; otherwise input_bed is overwritten
        to_hg38 (bool): if set, convert to hg38, else convert to hg37
    """
    # create temp file if no output file is given
    if not output_bed:
        file_to_write = "temp_files/{0}.bed".format(random.random())
    else:
        file_to_write = output_bed
    entries = gen.read_many_fields(input_bed, "\t")
    with open(file_to_write, "w") as outfile:
        for entry in entries:
            if to_hg38:
                # remove the "chr" prefix by slicing; str.strip("chr") would
                # also eat any leading/trailing c, h and r characters
                if entry[0].startswith("chr"):
                    entry[0] = entry[0][3:]
            else:
                entry[0] = "chr{0}".format(entry[0])
            outfile.write("{0}\n".format("\t".join(entry)))
    # overwrite the input file if no output file was given
    if not output_bed:
        gen.run_process(["mv", file_to_write, input_bed])

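# Illustration of the prefix convention convert_bed toggles (hypothetical paths):
# hg37-style "chr1" becomes "1" when to_hg38 is True, and vice versa.
def _example_convert_bed():
    convert_bed("coords_hg37.bed", "coords_hg38.bed", to_hg38=True)   # chr1 -> 1
    convert_bed("coords_hg38.bed", "coords_hg37.bed", to_hg38=False)  # 1 -> chr1
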
def bam_xt_filter(input_bam, output, xt_filter=None):
    '''
    Filter a bam/sam file by XT tag.
    '''
    if not xt_filter:
        print("Please specify XT filter.")
        raise Exception
    # create output file; if a .bam is requested, write to a .sam first
    if output[-4:] == ".bam":
        output_file = "{0}.sam".format(output[:-4])
    else:
        output_file = output
    sam_output = gen.run_process(["samtools", "view", "-h", input_bam])
    grep_args = []
    # get header lines
    grep_args.append("^@")
    # get XT values matching xt_filter
    grep_args.append("\|\tXT:A:{0}\t".format(xt_filter))
    grep_args = "".join(grep_args)
    gen.run_process(["grep", grep_args], input_to_pipe=sam_output, file_for_output=output_file)
    # if wanting to create a bam, create the bam and delete the sam
    if output != output_file:
        samtools_args = ["samtools", "view", "-bh", output_file]
        gen.run_process(samtools_args, file_for_output=output)
        gen.remove_file(output_file)

def extract_exons(gtf, bed):
    '''
    Given a GTF file, extract exon coordinates and write them to .bed.
    EX.: extract_exons("../source_data/Homo_sapiens.GRCh37.87.gtf",
                       "../source_data/Homo_sapiens.GRCh37.87_exons.bed")
    '''
    # extract exons from GTF
    exons = gen.run_process(["grep", "\texon\t", gtf])
    # filter down to only protein-coding ones
    exons = gen.run_process(["grep", "transcript_biotype \"protein_coding\""], input_to_pipe=exons)
    # split lines
    exons = [i.split("\t") for i in exons.split("\n")]
    # format as .bed. Switch to base 0.
    # require all 9 GTF fields so indexing i[8] can't raise
    exons = [["chr{0}".format(i[0]), int(i[3]) - 1, i[4], i[8], ".", i[6]] for i in exons if len(i) >= 9]
    # pre-compile regex
    trans_regex = re.compile("(?<=transcript_id \")ENST[0-9]*")
    exon_no_regex = re.compile("(?<=exon_number \")[0-9]*")
    # extract transcript IDs and exon numbers
    for pos, exon in enumerate(exons):
        to_parse = exon[3]
        trans = re.search(trans_regex, to_parse).group(0)
        exon_no = re.search(exon_no_regex, to_parse).group(0)
        exons[pos][3] = "{0}.{1}".format(trans, exon_no)
    # write to bed
    with open(bed, "w") as file:
        for exon in exons:
            file.write("{0}\n".format("\t".join([str(i) for i in exon])))

def sort_bed(input_file_name, output_file_name):
    '''
    Sort a bed file.
    '''
    # This is done via a temp file so that you can specify the same file as input
    # and output, and thus overwrite the unsorted file with the sorted one.
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    gen.run_process(["sort-bed", input_file_name], file_for_output=temp_file_name)
    gen.run_process(["mv", temp_file_name, output_file_name])
    gen.remove_file(temp_file_name)

def test_phase_bams(self):
    snps = "test_data/bam_ops/test_phase_bams/snps.bed"
    sam = "test_data/bam_ops/test_phase_bams/reads.sam"
    bam = "test_data/bam_ops/test_phase_bams/reads.bam"
    gen.run_process(["samtools", "view", "-S", "-b", sam], file_for_output=bam)
    expected = "test_data/bam_ops/test_phase_bams/expected.sam"
    observed = "test_data/bam_ops/test_phase_bams/observed.sam"
    gen.remove_file(observed)
    phase_bams(snps, bam, "HG3", observed)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)

def test_bam_xt_filter(self):
    input_bam = "test_data/bam_ops/test_bam_xt_filter/input_bam.bam"
    expected = "test_data/bam_ops/test_bam_xt_filter/expected_bam_xt_filter.sam"
    observed = "test_data/bam_ops/test_bam_xt_filter/observed_bam_xt_filter.bam"
    observed_sam_file = "test_data/bam_ops/test_bam_xt_filter/observed_bam_xt_filter.sam"
    bam_xt_filter(input_bam, observed, xt_filter="U")
    # convert bam to sam to check correct output
    samtools_args = ["samtools", "view", observed]
    gen.run_process(samtools_args, file_for_output=observed_sam_file)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed_sam_file, "\t")
    self.assertEqual(expected, observed)

def test_bam_flag_filter_unmapped_reads(self):
    input_bam = "test_data/bam_ops/test_bam_flag_filter_unmapped_reads/input_bam.bam"
    expected = "test_data/bam_ops/test_bam_flag_filter_unmapped_reads/expected_flag_filtered_unmapped_reads.sam"
    observed = "test_data/bam_ops/test_bam_flag_filter_unmapped_reads/observed_flag_filtered_unmapped_reads_bam.bam"
    observed_sam_output = "test_data/bam_ops/test_bam_flag_filter_unmapped_reads/observed_flag_filtered_unmapped_reads.sam"
    bam_flag_filter(input_bam, observed, get_unmapped_reads=True)
    # convert bam to sam to check correct output
    samtools_args = ["samtools", "view", observed]
    gen.run_process(samtools_args, file_for_output=observed_sam_output)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed_sam_output, "\t")
    self.assertEqual(expected, observed)

def make_blast_database(fasta_file, database_path):
    """
    Make a BLAST database.
    """
    print("Making BLAST database...")
    args = [
        "makeblastdb", "-in", fasta_file, "-out", database_path, "-dbtype",
        "nucl"
    ]
    gen.run_process(args)

def test_bam_quality_filter_less_than(self):
    input_bam = "test_data/bam_ops/test_bam_quality_filter_less_than/test_bam.bam"
    expected = "test_data/bam_ops/test_bam_quality_filter_less_than/expected_bam_quality_filter_less_than.sam"
    observed = "test_data/bam_ops/test_bam_quality_filter_less_than/observed_bam_quality_filter_less_than.bam"
    observed_sam_output = "test_data/bam_ops/test_bam_quality_filter_less_than/observed_bam_quality_filter_less_than.sam"
    expected = gen.read_many_fields(expected, "\t")
    bam_quality_filter(input_bam, observed, quality_less_than_equal_to=250)
    # convert bam to sam to check correct output
    # use samtools to extract in the same format as sam
    samtools_args = ["samtools", "view", observed]
    gen.run_process(samtools_args, file_for_output=observed_sam_output)
    observed = gen.read_many_fields(observed_sam_output, "\t")
    self.assertEqual(expected, observed)

def merge_bams(bam_list, output_file):
    '''
    Merge a list of bam files to the defined output file.
    '''
    # set up args; -r attaches a filename RG tag to each read
    args = ["samtools", "merge", "-r"]
    # force overwrite if the output file already exists
    if os.path.exists(output_file):
        args.append("-f")
    args.append(output_file)
    # add each input file to the argument list
    args.extend(bam_list)
    gen.run_process(args)

def test_intersect_bam(self):
    bam_file = "test_data/bam_ops/test_intersect_bam/test_input_bam.bam"
    bed_file = "test_data/bam_ops/test_intersect_bam/test_input_bed.bed"
    observed_bam_output = "test_data/bam_ops/test_intersect_bam/observed_bam_intersect.bam"
    observed_bed_output = "test_data/bam_ops/test_intersect_bam/observed_bam_intersect.bed"
    expected_bed_output = "test_data/bam_ops/test_intersect_bam/expected_intersect_bed.bed"
    intersect_bed(bam_file, bed_file, output_file=observed_bam_output, intersect_bam=True)
    expected = gen.read_many_fields(expected_bed_output, "\t")
    # convert bam to bed to check correct output
    # use samtools to extract in the same format as bed
    samtools_args = ["samtools", "view", observed_bam_output]
    gen.run_process(samtools_args, file_for_output=observed_bed_output)
    observed = gen.read_many_fields(observed_bed_output, "\t")
    self.assertEqual(observed, expected)

def test_merge_bams(self):
    input_bam1 = "test_data/bam_ops/test_merge_bams/input1.bam"
    input_bam2 = "test_data/bam_ops/test_merge_bams/input2.bam"
    input_list = [input_bam1, input_bam2]
    expected = "test_data/bam_ops/test_merge_bams/expected_merge_bams.sam"
    observed = "test_data/bam_ops/test_merge_bams/observed_merge_bams.bam"
    observed_sam_output = "test_data/bam_ops/test_merge_bams/observed_merge_bams.sam"
    merge_bams(input_list, observed)
    # convert bam to sam to check correct output
    samtools_args = ["samtools", "view", observed]
    gen.run_process(samtools_args, file_for_output=observed_sam_output)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed_sam_output, "\t")
    self.assertEqual(expected, observed)

def extract_gtf_features(gtf, features, bed):
    """
    Extract given feature coordinates from a .gtf file and write to .bed.

    Args:
        gtf (str): path to gtf file
        features (list): list of features to extract
        bed (str): path to output bed file
    """
    feature_list = []
    # iterate over the desired features
    for feature in features:
        # extract feature from GTF
        gtf_features = gen.run_process(
            ["grep", "\t{0}\t".format(feature), gtf])
        # filter down to only protein-coding ones
        gtf_features = gen.run_process(
            ["grep", "transcript_biotype \"protein_coding\""],
            input_to_pipe=gtf_features)
        # split lines
        gtf_features = [i.split("\t") for i in gtf_features.split("\n")]
        # record which feature each line came from
        for i in gtf_features:
            i.append(feature)
        # append to list
        feature_list.extend(gtf_features)
    # format as .bed. Switch to base 0.
    # require all 9 GTF fields plus the appended feature so indexing i[8] can't raise
    gtf_features = [[
        "chr{0}".format(i[0]), int(i[3]) - 1, i[4], i[8], i[-1], i[6]
    ] for i in feature_list if len(i) >= 10]
    # pre-compile regex
    trans_regex = re.compile("(?<=transcript_id \")ENST[0-9]*")
    exon_no_regex = re.compile("(?<=exon_number \")[0-9]*")
    gene_regex = re.compile("(?<=gene_id \")ENSG[0-9]*")
    # extract transcript IDs, feature numbers and gene IDs
    for pos, feature in enumerate(gtf_features):
        to_parse = feature[3]
        trans = re.search(trans_regex, to_parse).group(0)
        exon_no = re.search(exon_no_regex, to_parse).group(0)
        gene_id = re.search(gene_regex, to_parse).group(0)
        gtf_features[pos][3] = "{0}.{1}".format(trans, exon_no)
        gtf_features[pos].append(gene_id)
    # write to bed
    with open(bed, "w") as file:
        for feature in gtf_features:
            file.write("{0}\n".format("\t".join([str(i) for i in feature])))

def sort_bed(input_file, output_file):
    """
    Sort a bed file.

    Args:
        input_file (str): path to the input file
        output_file (str): path to the output file
    """
    # use a temp file so a file can be sorted in place under the same name
    gen.create_output_directories("temp_data")
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    gen.run_process(["sortBed", "-i", input_file], file_for_output=temp_file_name)
    gen.run_process(["mv", temp_file_name, output_file])
    gen.remove_file(temp_file_name)

def remove_bed_overlaps(input_file, output_file):
    '''
    Given a bed file, only leave non-overlapping elements, regardless of the
    strand of the overlap. Adapted from a function written by RS.

    Args:
        input_file (str): path to the input file to remove overlaps from
        output_file (str): path to the output file
    '''
    # check how many columns there are in the bed file
    with open(input_file) as file:
        line = file.readline()
        column_number = line.count("\t") + 1
    # merge overlapping intervals and have bedtools count how many of the elements
    # from the original file contribute to each interval in the new file
    # note that bedtools takes the column numbers in base 1
    if column_number > 3:
        columns = ",".join([str(i) for i in range(4, column_number + 1)] + ["1"])
        operations = ",".join(
            ["distinct" for i in range(4, column_number + 1)] + ["count"])
    else:
        columns = "1"
        operations = "count"
    merge_result = gen.run_process([
        "bedtools", "merge", "-i", input_file, "-c", columns, "-o", operations
    ])
    # only leave those intervals that do not result from a merge and delete the counts column
    merge_result = merge_result.split("\n")
    with open(output_file, "w") as outfile:
        for line in merge_result:
            if line[-2:] == "\t1":
                outfile.write("{0}\n".format(line[:-2]))

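# Sketch of remove_bed_overlaps (hypothetical paths). bedtools merge expects
# coordinate-sorted input, so the file is sorted first; only intervals built
# from exactly one input element (count == 1) survive.
def _example_remove_bed_overlaps():
    sort_bed("elements.bed", "elements.bed")
    remove_bed_overlaps("elements.bed", "non_overlapping_elements.bed")
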
def get_genome_bed_from_fasta_index(features_bed, fasta_index, output_file):
    """
    Given a list of features, get the genome coordinates as a bed file.

    Args:
        features_bed (str): path to bed file containing features
        fasta_index (str): path to fasta index file
        output_file (str): path to output file
    """
    # get all the chromosomes required, removing any "chr" prefix by slicing
    # (str.strip("chr") would also eat trailing c, h and r characters)
    first_column = [
        i[3:] if i.startswith("chr") else i for i in list(
            set(
                gen.run_process(["awk", "{print $1}"],
                                file_for_input=features_bed).split('\n')))
        if len(i)
    ]
    # get index lines
    index = gen.read_many_fields(fasta_index, "\t")
    with open(output_file, "w") as outfile:
        for i in index:
            if i[0] in first_column:
                start = 0
                length = int(i[1])
                out_info = [i[0], start, start + length, ".", "."]
                # write one whole-chromosome entry per strand
                outfile.write("{0}\t+\n{0}\t-\n".format("\t".join(
                    gen.stringify(out_info))))

def check_coding(exons_file, CDSs_file, outfile, remove_overlapping=False):
    '''
    Given a bed file of exon coordinates and a bed file of CDS coordinates,
    write a new bed file that only contains those exon coordinates from the
    former file that 1) are fully coding and 2) are internal.
    NB! Assumes that all the coordinates are from non-overlapping transcripts.
    If this is not the case, set remove_overlapping to True and it'll remove
    overlapping intervals.
    '''
    if remove_overlapping:
        bmo.sort_bed(exons_file, exons_file)
        remove_overlaps(exons_file, exons_file)
    # filter out anything that isn't fully coding
    # you have to write_both because you want to make sure that exons
    # haven't been kept because of an overlap to a transcript that doesn't appear in the exons file
    temp_file = "temp_data/temp{0}.txt".format(random.random())
    bmo.intersect_bed(exons_file, CDSs_file, overlap=1, overlap_rec=True,
                      output_file=temp_file, force_strand=True, write_both=True,
                      no_dups=False, no_name_check=False)
    # filter out terminal exons
    # in theory, there shouldn't be any left after the previous step
    # in practice, there may be unannotated UTRs, so it looks like we have a fully coding terminal exon,
    # whereas in reality, the exon is only partially coding
    temp_file2 = "temp_data/temp{0}.txt".format(random.random())
    with open(temp_file2, "w") as o_file:
        # figure out the rank of the last exon for each transcript
        filt_exons = gen.read_many_fields(exons_file, "\t")
        filt_exons = [i for i in filt_exons if len(i) > 3]
        names = [i[3].split(".") for i in filt_exons]
        names = gen.list_to_dict(names, 0, 1, as_list=True)
        names = {i: max([int(j) for j in names[i]]) for i in names}
        coding_exons = gen.read_many_fields(temp_file, "\t")
        for exon in coding_exons:
            overlap_name = exon[9].split(".")
            if overlap_name[0] in names:
                name = exon[3].split(".")
                # skip first and last exons
                if name[-1] != "1":
                    last_exon = names[name[0]]
                    if int(name[-1]) != last_exon:
                        exon = [str(i) for i in exon[:6]]
                        o_file.write("\t".join(exon))
                        o_file.write("\n")
    bmo.sort_bed(temp_file2, temp_file2)
    gen.run_process(["mergeBed", "-i", temp_file2, "-c", "4,5,6",
                     "-o", "distinct,distinct,distinct"], file_for_output=outfile)
    gen.remove_file(temp_file)
    gen.remove_file(temp_file2)

def test_tabix_samples2(self):
    bed_file = "test_data/snp_ops/test_tabix_samples2/test_tabix.bed"
    with open("test_data/snp_ops/test_tabix_samples2/expected_test_tabix_samples2.txt") as file:
        expected = "".join(file)
    observed = "test_data/snp_ops/test_tabix_samples2/observed_test_tabix_samples2.txt"
    gen.remove_file(observed)
    gen.remove_file(observed + ".gz")
    vcf_folder = "../source_data/per_sample_vcfs"
    panel_file = "../source_data/integrated_call_samples_v3.20130502.ALL.panel"
    tabix_samples(bed_file, observed + ".gz", panel_file, vcf_folder, samples=["NA18917", "NA19024"])
    gen.run_process(["bgzip", "-d", observed + ".gz"])
    with open(observed) as file:
        observed = "".join(file)
    # normalise randomised temp file names before comparing
    expected = re.sub("0\.[0-9]*\.vcf", "N.vcf", expected)
    observed = re.sub("0\.[0-9]*\.vcf", "N.vcf", observed)
    expected = re.sub("source_[0-9]*\.[0-9]*", "source_N", expected)
    observed = re.sub("source_[0-9]*\.[0-9]*", "source_N", observed)
    self.assertEqual(observed, expected)

def run_bedops(A_file, B_file, force_strand=False, write_both=False, chrom=None,
               overlap=None, sort=False, output_file=None, intersect=False,
               hit_number=None, no_dups=False, overlap_rec=None, intersect_bam=None):
    '''
    See intersect_bed for details.
    '''
    if intersect:
        command = "--intersect"
    else:
        command = "--element-of"
    if sort:
        sort_bed(A_file, A_file)
        sort_bed(B_file, B_file)
    bedops_args = ["bedops", "--chrom", "foo", command, "1", A_file, B_file]
    if overlap:
        bedops_args[4] = overlap
    if chrom:
        bedops_args[2] = chrom
        if intersect:
            # --intersect takes no overlap argument, so drop the placeholder
            del bedops_args[4]
    else:
        # no chromosome restriction, so drop the --chrom placeholder
        del bedops_args[1:3]
        if intersect:
            del bedops_args[2]
    if force_strand:
        print("Bedops can't search by strand! Either use bedtools or separate input data by strand!")
        raise Exception
    if write_both:
        print("Bedops can't write both features!")
        raise Exception
    if hit_number:
        print("Bedops hasn't been set up to count the number of overlapping elements. Use bedtools!")
        raise Exception
    if no_dups:
        print("Bedops doesn't print duplicates by default!")
    if overlap_rec:
        print("Bedops hasn't been set up to filter by overlap in second file!")
    if intersect_bam:
        print("Use bedtools to intersect bam and bed!")
        raise Exception
    bedops_output = gen.run_process(bedops_args, file_for_output=output_file)
    return bedops_output

def extract_seqs(source_path, genome_fasta, output_bed, output_fasta, output_seq_fasta,
                 mapping_file, codes_file, exclude_XY=None, hg38=None, NONCODE=None):
    """
    Generate a file containing the exon sequences for a given .bed file.

    Args:
        source_path (str): the source path for the origin .gtf file
        genome_fasta (str): the source path for the genome fasta
        output_bed (str): output .bed file to contain the exon info
        output_fasta (str): output fasta containing exon sequences
        output_seq_fasta (str): output fasta containing the full sequences built from the exons
        mapping_file (str): file mapping transcripts to genes, used to keep one transcript per gene
        codes_file (str): used for NONCODE sequences to get the lincRNA
        exclude_XY (bool): if true, exclude cases on the X and Y chr
        hg38 (bool): if true, use hg38
        NONCODE (bool): if true, using NONCODE sequences
    """
    # create the exon bed file
    full_bed = "{0}/full_{1}".format("/".join(output_bed.split('/')[:-1]), output_bed.split("/")[-1])
    entries_to_bed(source_path, full_bed, exclude_XY, hg38=hg38, NONCODE=NONCODE)
    # generate the fasta from the file
    full_exon_fasta = "{0}/full_{1}".format("/".join(output_fasta.split('/')[:-1]), output_fasta.split("/")[-1])
    fasta_from_intervals(full_bed, full_exon_fasta, genome_fasta, names=True)
    # build the sequences from the exons
    full_seq_fasta = "{0}/full_{1}".format("/".join(output_seq_fasta.split('/')[:-1]), output_seq_fasta.split("/")[-1])
    build_seqs_from_exons_fasta(full_exon_fasta, full_seq_fasta)
    # keep only sequences at least 200 nt long
    length_filter_fasta = "{0}/length_filtered_{1}".format("/".join(output_seq_fasta.split('/')[:-1]), output_seq_fasta.split("/")[-1])
    ops.filter_seq_lengths(full_seq_fasta, length_filter_fasta, 200)
    # filter to only keep one transcript per gene
    unique_transcripts_fasta = "{0}/unique_gene_filtered_{1}".format("/".join(output_seq_fasta.split('/')[:-1]), output_seq_fasta.split("/")[-1])
    ops.uniquify_lincRNA_transcripts(length_filter_fasta, mapping_file, unique_transcripts_fasta)
    if NONCODE:
        # get only those that are lincRNA
        ops.get_passed_NONCODE_codes(unique_transcripts_fasta, codes_file, mapping_file, output_seq_fasta, "0001")
    else:
        # otherwise we don't need the step above, so copy to file
        gen.run_process(["cp", unique_transcripts_fasta, output_seq_fasta])
    # filter bed file from fasta
    ops.filter_bed_from_fasta(full_bed, output_seq_fasta, output_bed)
    # now just get the exon seqs from these entries
    fasta_from_intervals(output_bed, output_fasta, genome_fasta, names=True)

def convert2bed(input_file_name, output_file_name, group_flags_from=None):
    '''
    Converts an input file (sam, bam, gtf, gff...) to a bed file using bedops.
    Set 'group_flags_from' to an integer if you want to group all the fields from
    a certain field onwards. For instance, if you set it to 5, all of the fields
    from the 5th onward will be turned into a comma-separated string and stored
    as one field in the .bed file. Note that you cannot group all the fields in
    a row (i.e. you can't set it to 0).
    '''
    extension = gen.get_extension(input_file_name, 3, ["sam", "bam", "gtf", "gff"])
    bed_data = gen.run_process(
        ["convert2bed", "--input={0}".format(extension)],
        file_for_input=input_file_name,
        file_for_output=output_file_name)
    if group_flags_from:
        temp_file_name = "temp_data/temp_bed_file{0}.bed".format(
            random.random())
        # the parameter was renamed from group_flags: it used to shadow the
        # module-level group_flags() helper, making this call fail
        group_flags(output_file_name, temp_file_name, group_flags_from)
        gen.run_process(["mv", temp_file_name, output_file_name])
        print("Grouped flags.")
    print("Converted data from {0} to bed.".format(extension))

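# Hypothetical call: convert a bam to bed, then collapse everything from the
# 5th field onward into one comma-separated field via the group_flags() helper.
def _example_convert2bed():
    convert2bed("alignments.bam", "alignments.bed", group_flags_from=5)
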
def get_descriptions(names, gtf, out_file):
    '''
    Given a set of Ensembl transcript identifiers and a GTF file, determine
    the corresponding "gene name" for each transcript identifier.
    '''
    # [A-Za-z] rather than [A-z], which also matches the punctuation between Z and a
    name_regex = re.compile("(?<=gene_name \")[A-Za-z0-9\.\-\/\(\)]*(?=\")")
    trans_regex = re.compile("(?<=transcript_id \")[A-Za-z0-9]*(?=\")")
    transcript_lines = gen.run_process(["grep", "\ttranscript\t", gtf])
    transcript_lines = transcript_lines.split("\n")
    with open(out_file, "w") as file:
        for line in transcript_lines:
            if len(line) > 1:
                trans = re.search(trans_regex, line).group(0)
                if trans in names:
                    description = re.search(name_regex, line).group(0)
                    file.write("{0}\t{1}\n".format(trans, description))

def bam_quality_filter(input_bam, output_bam, quality_greater_than_equal_to=None,
                       quality_less_than_equal_to=None):
    '''
    Filters bam reads by quality.
    quality_greater_than_equal_to: the lower threshold for quality control
    quality_less_than_equal_to: the upper threshold for quality control
    '''
    samtools_args = ["samtools", "view", "-h"]
    # if neither threshold is specified (use None checks so 0 is a valid threshold)
    if quality_greater_than_equal_to is None and quality_less_than_equal_to is None:
        print("You must specify one threshold to filter reads by.")
        raise Exception
    # if both thresholds are specified
    if quality_greater_than_equal_to is not None and quality_less_than_equal_to is not None:
        # create temp file
        gen.create_directory("temp_data/")
        temp_file = "temp_data/{0}.{1}.bam".format(
            os.path.split(output_bam)[1][:-4], random.random())
        # first get everything below the upper threshold
        # samtools -q keeps reads at or above the threshold and -U writes the rest,
        # so add 1 to the threshold to keep reads equal to it
        args = samtools_args.copy()
        upper_limit = quality_less_than_equal_to + 1
        args.extend(["-q", str(upper_limit), input_bam, "-U", temp_file])
        gen.run_process(args)
        # second get everything above the lower threshold
        args = samtools_args.copy()
        args.extend(["-bq", str(quality_greater_than_equal_to), temp_file])
        gen.run_process(args, file_for_output=output_bam)
        # cleanup files
        gen.remove_file(temp_file)
    # if only the lower threshold is specified
    elif quality_greater_than_equal_to is not None:
        samtools_args.extend(["-bq", str(quality_greater_than_equal_to), input_bam])
        gen.run_process(samtools_args, file_for_output=output_bam)
    # if only the upper threshold is specified
    else:
        upper_limit = quality_less_than_equal_to + 1
        samtools_args.extend(["-q", str(upper_limit), input_bam, "-U", output_bam])
        gen.run_process(samtools_args)

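# Usage sketch (hypothetical paths): keep reads with mapping quality between
# 20 and 250 inclusive, exercising the two-pass branch above.
def _example_bam_quality_filter():
    bam_quality_filter("reads.bam", "quality_filtered.bam",
                       quality_greater_than_equal_to=20,
                       quality_less_than_equal_to=250)
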
def filter_fasta_from_bed(bed_file, input_fasta, output_fasta, filter_column=3):
    """
    Given a bed file, filter a fasta file to contain only entries with ids in
    the given column.

    Args:
        bed_file (str): path to bed file containing entries
        input_fasta (str): path to fasta file containing the sequences to filter
        output_fasta (str): path to output fasta file
        filter_column (int): base 0 index of the column to use for filtering
    """
    # get the ids from the given column (cut is base 1, hence the +1)
    ids = [i for i in gen.run_process(["cut", "-f{0}".format(filter_column + 1), bed_file]).split("\n") if i]
    # read in the sequences
    names, seqs = gen.read_fasta(input_fasta)
    # filter and output to file
    with open(output_fasta, "w") as outfile:
        for i, name in enumerate(names):
            if name in ids:
                outfile.write(">{0}\n{1}\n".format(name, seqs[i]))

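# Sketch (hypothetical paths): keep only fasta records whose names appear in the
# bed 'name' field; filter_column is base 0, so 3 selects the fourth column.
def _example_filter_fasta_from_bed():
    filter_fasta_from_bed("regions.bed", "all_seqs.fa", "kept_seqs.fa", filter_column=3)
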
def filter_fasta_intervals_from_fasta(intervals_fasta, fasta, output):
    '''
    Given a fasta file and a fasta intervals file, filter the intervals file to
    only leave records where the 'name' field appears among the names in the
    fasta file. Write to fasta.
    '''
    # if the output file already exists, write to a temp file first so the
    # output path can also be used as an input
    output_exists = False
    if Path(output).exists():
        output_exists = True
        temp_file_name = "{0}.{1}{2}".format(os.path.splitext(output)[0], random.random(), os.path.splitext(output)[1])
    else:
        temp_file_name = output
    # fish out the names in the fasta
    fasta_names = gen.run_process(["grep", ">", fasta])
    fasta_names = fasta_names.split("\n")
    # remove tag and newline from each name
    fasta_names = [(i.lstrip(">")).rstrip("\n") for i in fasta_names]
    # remove any potential blank entries (check each name, not the list length)
    fasta_names = [i for i in fasta_names if len(i) > 3]
    # read in the interval data
    fasta_interval_names, fasta_interval_seqs = gen.read_fasta(intervals_fasta)
    id_regex = re.compile("^(\w+).*")
    with open(temp_file_name, "w") as file:
        for i, interval in enumerate(fasta_interval_names):
            # search for the sample name
            id = re.search(id_regex, interval)
            if id:
                trans_id = id.group(1)
                # if the sample name is in the fasta names, output to file
                if trans_id in fasta_names:
                    file.write(">{0}\n{1}\n".format(fasta_interval_names[i], fasta_interval_seqs[i]))
    # remove old file, replace with new
    if output_exists:
        os.remove(output)
        shutil.move(temp_file_name, output)