def get_full_transcripts(cds_fasta, exons_fasta, output_file):
    """
    Join the exons of each transcript into a single spliced sequence and
    write the result to a fasta file.

    Exon records are expected to be named "<transcript>.<exon number>(...)".
    The exons of a transcript are concatenated in ascending exon number.

    Args:
        cds_fasta (str): path to the CDS fasta. The file is read, but its
            contents do not influence the output.
        exons_fasta (str): path to the fasta containing the exon sequences
        output_file (str): path to the output fasta
    """

    # The CDS records are not needed to build the transcripts; the read is
    # kept only so that an unreadable CDS file still fails as it did before.
    gen.read_fasta(cds_fasta)

    exon_names, exon_seqs = gen.read_fasta(exons_fasta)

    # transcript id -> {exon number: exon sequence}
    exons_by_transcript = collections.defaultdict(dict)
    for i, name in enumerate(exon_names):
        name_parts = name.split(".")
        transcript_id = name_parts[0]
        exon_number = int(name_parts[1].split("(")[0])
        exons_by_transcript[transcript_id][exon_number] = exon_seqs[i]

    with open(output_file, "w") as outfile:
        for transcript_id, exons in exons_by_transcript.items():
            spliced = "".join(exons[exon_number] for exon_number in sorted(exons))
            outfile.write(">{0}\n{1}\n".format(transcript_id, spliced))
def test_filter_fasta_intervals_from_fasta(self):
    """Run filter_fasta_intervals_from_fasta on the fixture files and
    compare the fasta it writes with the expected fasta."""
    source_fasta = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/test_filter_fasta_intervals_from_fasta_fasta.fasta"
    intervals_fasta = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/test_filter_fasta_intervals_from_fasta_intervals.fasta"
    expected_path = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/expected_filtered_intervals.fasta"
    observed_path = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/observed_filtered_intervals.fasta"

    # clear any output left over from a previous run
    gen.remove_file(observed_path)

    filter_fasta_intervals_from_fasta(intervals_fasta, source_fasta, observed_path)

    self.assertEqual(gen.read_fasta(expected_path), gen.read_fasta(observed_path))
def test_sim_cds_seqs(self):
    """Feed the codons of the input sequences to so.sim_cds_seqs (seed=1)
    and compare the result with the sequences of the expected fasta."""
    input_file = "test_data/sim_ops/test_sim_cds_seqs/input.fasta"
    expected_file = "test_data/sim_ops/test_sim_cds_seqs/expected.fasta"
    input_seqs = gen.read_fasta(input_file)[1]
    expected = gen.read_fasta(expected_file)[1]

    # cut every sequence into consecutive 3-character chunks
    codon_sets = []
    for seq in input_seqs:
        codon_sets.append(re.findall(".{3}", seq))

    # the first and last chunk are passed separately from the interior ones
    interior_codons = []
    for codon_set in codon_sets:
        interior_codons.append(codon_set[1:-1])
    first_codons = []
    for codon_set in codon_sets:
        first_codons.append(codon_set[0])
    last_codons = []
    for codon_set in codon_sets:
        last_codons.append(codon_set[-1])

    observed = so.sim_cds_seqs(interior_codons, first_codons, last_codons, seed=1)
    self.assertEqual(expected, observed)
def test_fasta_from_intervals(self):
    """Run fasta_from_intervals on the fixture bed and genome files and
    compare the fasta it writes with the expected fasta."""
    bed_file = "test_data/bed_ops/test_fasta_from_intervals/test_bed_for_fasta_conversion.bed"
    genome_file = "test_data/bed_ops/test_fasta_from_intervals/test_genome.fa"
    expected_path = "test_data/bed_ops/test_fasta_from_intervals/expected_converted_fasta.fasta"
    observed_path = "test_data/bed_ops/test_fasta_from_intervals/observed_converted_fasta.fasta"

    # clear any output left over from a previous run
    gen.remove_file(observed_path)
    expected = gen.read_fasta(expected_path)

    fasta_from_intervals(bed_file, observed_path, genome_file)

    self.assertEqual(gen.read_fasta(observed_path), expected)
def test_extract_nt_indices(self):
    """Run extract_nt_indices on the fixture fasta and compare the four
    per-nucleotide output files with their expected counterparts."""
    fasta_file = "test_data/bed_ops/test_extract_nt_indices/test_fasta_input.fasta"
    observed_files = {
        "A": "test_data/bed_ops/test_extract_nt_indices/observed_indices_a.fasta",
        "C": "test_data/bed_ops/test_extract_nt_indices/observed_indices_c.fasta",
        "G": "test_data/bed_ops/test_extract_nt_indices/observed_indices_g.fasta",
        "T": "test_data/bed_ops/test_extract_nt_indices/observed_indices_t.fasta",
    }
    expected_files = {
        "A": "test_data/bed_ops/test_extract_nt_indices/expected_indices_a.fasta",
        "C": "test_data/bed_ops/test_extract_nt_indices/expected_indices_c.fasta",
        "G": "test_data/bed_ops/test_extract_nt_indices/expected_indices_g.fasta",
        "T": "test_data/bed_ops/test_extract_nt_indices/expected_indices_t.fasta",
    }
    nts = ("A", "C", "G", "T")

    # clear any outputs left over from a previous run
    for nt in nts:
        gen.remove_file(observed_files[nt])

    extract_nt_indices(fasta_file, observed_files)

    expected = {}
    for nt in nts:
        expected[nt] = gen.read_fasta(expected_files[nt])
    observed = {}
    for nt in nts:
        observed[nt] = gen.read_fasta(observed_files[nt])

    for nt in nts:
        self.assertEqual(observed[nt], expected[nt])
def get_exon_flank_reading_frame(coding_exons_fasta, full_exons_fasta, output_file):
    """
    Write the reading frame in which each flank of an exon starts.

    A frame is a position modulo 3 within the concatenated exons of the
    transcript: 0 for the first codon position, 1 for the second and 2
    for the third. Only exons longer than 138 nts get an entry.

    Args:
        coding_exons_fasta (str): path to file containing coding exon sequences
        full_exons_fasta (str): path to file containing all exon sequences
        output_file (str): path to the output file
    """

    full_names, full_seqs = gen.read_fasta(full_exons_fasta)
    coding_names = gen.read_fasta(coding_exons_fasta)[0]

    # transcript id -> {exon number: sequence}, taken from the full exon set
    exons_by_transcript = {}
    for index, name in enumerate(full_names):
        transcript_id = name.split(".")[0]
        exon_number = int(name.split(".")[1].split("(")[0])
        seq = full_seqs[index]
        exons_by_transcript.setdefault(transcript_id, {})[exon_number] = seq

    # (transcript id, exon number) -> (5' flank frame, 3' flank frame)
    flank_frames = {}
    for transcript_id, exons in exons_by_transcript.items():
        # number of nts in the exons that precede the current one
        offset = 0
        for exon_number in sorted(exons):
            seq = exons[exon_number]
            # sequence needs to be longer than 138 nts to have two flanks
            if len(seq) > 138:
                five_prime_start = offset + 2
                three_prime_start = offset + len(seq) - 69
                flank_frames[(transcript_id, exon_number)] = (
                    five_prime_start % 3,
                    three_prime_start % 3,
                )
            offset += len(seq)

    # report only the coding exons that received a pair of frames
    with open(output_file, "w") as outfile:
        for name in coding_names:
            transcript_id = name.split(".")[0]
            exon_number = int(name.split(".")[1].split("(")[0])
            frames = flank_frames.get((transcript_id, exon_number))
            if frames is None:
                continue
            outfile.write(">{0}\n{1},{2}\n".format(name, frames[0], frames[1]))
def test_extract_cds_from_bed(self):
    """Run extract_cds_from_bed on the fixture bed and genome files and
    compare the fasta it writes with the expected fasta."""
    bed_file = "./test_data/bed_ops/test_extract_cds_from_bed/test_extract_cds_from_bed_bed.bed"
    genome_file = "./test_data/bed_ops/test_extract_cds_from_bed/test_extract_cds_from_bed_genome.fa"
    expected_path = "./test_data/bed_ops/test_extract_cds_from_bed/expected_test_extract_cds_from_bed_fasta.fasta"
    observed_path = "./test_data/bed_ops/test_extract_cds_from_bed/observed_test_extract_cds_from_bed_fasta.fasta"
    intervals_path = "./test_data/bed_ops/test_extract_cds_from_bed/observed_intervals.fasta"

    # clear any outputs left over from a previous run
    for stale in (observed_path, intervals_path):
        gen.remove_file(stale)
    expected = gen.read_fasta(expected_path)

    extract_cds_from_bed(bed_file, observed_path, genome_file, intervals_path)

    self.assertEqual(gen.read_fasta(observed_path), expected)
def test_extract_cds(self):
    """Run extract_cds (full_chr_name=True) on the fixture gtf and genome
    files and compare the fasta it writes with the expected fasta."""
    gtf_file = "./test_data/bed_ops/test_extract_cds/test_extract_cds.gtf"
    genome_file = "./test_data/bed_ops/test_extract_cds/test_extract_cds_genome.fa"
    bed_output = "./test_data/bed_ops/test_extract_cds/test_extract_cds.bed"
    expected_path = "./test_data/bed_ops/test_extract_cds/expected_test_extract_cds_fasta.fasta"
    observed_path = "./test_data/bed_ops/test_extract_cds/observed_test_extract_cds_fasta.fasta"

    # clear any output left over from a previous run
    gen.remove_file(observed_path)
    expected = gen.read_fasta(expected_path)

    extract_cds(gtf_file, bed_output, observed_path, genome_file, full_chr_name=True)

    self.assertEqual(gen.read_fasta(observed_path), expected)
def get_passed_NONCODE_codes(input_fasta, codes_file, mapping_file, output_fasta, code):
    """
    Write only the sequences whose gene carries the requested NONCODE code.

    Each fasta name is looked up in the mapping file to find its gene, and
    the gene is looked up in the codes file to find its code. A name or
    gene missing from those files raises KeyError.

    Args:
        input_fasta (str): path to input fasta file
        codes_file (str): path to tab-separated file; first column is the
            gene, second column is its code
        mapping_file (str): path to tab-separated transcript-gene mapping
            file; first column is the transcript (anything after the first
            "." is dropped), second column is the gene
        output_fasta (str): path to output fasta
        code (str): code to look for. As string because cant pass 0001 through
    """

    code_by_gene = {}
    for row in gen.read_many_fields(codes_file, "\t"):
        code_by_gene[row[0]] = row[1]

    gene_by_transcript = {}
    for row in gen.read_many_fields(mapping_file, "\t"):
        gene_by_transcript[row[0].split(".")[0]] = row[1]

    names, seqs = gen.read_fasta(input_fasta)

    with open(output_fasta, "w") as outfile:
        for index, name in enumerate(names):
            gene = gene_by_transcript[name]
            if code_by_gene[gene] != code:
                continue
            outfile.write(">{0}\n{1}\n".format(name, seqs[index]))
def fasta_from_intervals(bed_file, fasta_file, genome_fasta, force_strand=True, names=False):
    """
    Create a fasta file holding the sequences of the intervals in a bed
    file, using `bedtools getfasta`. The sequences in the resulting file
    are upper-cased.
    Credit: Rosina Savisaar

    Args:
        bed_file (str): the bed file path to create fasta from
        fasta_file (str): the output fasta file path
        genome_fasta (str): the file path to the genome fasta file
        force_strand (bool): if True, the `-s` option is passed to bedtools
        names (bool): if True, the `-name` option is passed to bedtools so
            the fasta names come from the 'name' field of the bed file;
            if False, the names are generated from the sequence coordinates
    """

    # If an index for the genome already exists but does not list every
    # chromosome used in the bed file, delete it
    # (presumably so that bedtools builds a fresh one).
    index_path = genome_fasta + '.fai'
    if os.path.exists(index_path):
        bed_chroms = {row[0] for row in gen.read_many_fields(bed_file, "\t")}
        indexed_chroms = {row[0] for row in gen.read_many_fields(index_path, "\t")}
        if not bed_chroms <= indexed_chroms:
            gen.remove_file(index_path)

    command = ["bedtools", "getfasta"]
    if force_strand:
        command.append("-s")
    command.extend(["-fi", genome_fasta, "-bed", bed_file, "-fo", fasta_file])
    if names:
        command.append("-name")
    gen.run_process(command)

    # rewrite the bedtools output with upper-case sequences
    record_names, record_seqs = gen.read_fasta(fasta_file)
    upper_seqs = []
    for seq in record_seqs:
        upper_seqs.append(seq.upper())
    gen.write_to_fasta(record_names, upper_seqs, fasta_file)
def uniquify_transcripts(input_fasta, transcript_gene_links, output_fasta):
    """
    Given a fasta and the links between genes and transcripts, filter to
    leave only one transcript per gene: the longest one. When several
    transcripts share the maximum length, the first one listed is kept.

    Args:
        input_fasta (str): path to the input fasta
        transcript_gene_links (dict): gene id -> list of transcript ids
        output_fasta (str): path to the output fasta

    Raises:
        ValueError: if a linked transcript is not present in the fasta, or
            a gene has no transcripts.
    """

    names, seqs = gen.read_fasta(input_fasta)

    # Map each name to the position of its first occurrence once, instead
    # of scanning the whole name list for every transcript.
    index_by_name = {}
    for index, name in enumerate(names):
        index_by_name.setdefault(name, index)

    def sequence_of(transcript_id):
        # keep the error type that list.index() used to raise
        if transcript_id not in index_by_name:
            raise ValueError("{0!r} is not in list".format(transcript_id))
        return seqs[index_by_name[transcript_id]]

    with open(output_fasta, "w") as outfile:
        for gene_id in sorted(transcript_gene_links):
            transcript_ids = transcript_gene_links[gene_id]
            # max() returns the first of several equally long transcripts
            longest = max(transcript_ids, key=lambda t: len(sequence_of(t)))
            outfile.write(">{0}\n{1}\n".format(longest, sequence_of(longest)))
def run_exon_simulation(motif_file, exon_fasta, output_dir, required_simulations, output_file):
    """
    Run simulation that picks hexamers from the exon sequences and write
    the real and simulated counts to a csv file.

    Args:
        motif_file (str): path to comma-separated motif file; rows whose
            first field starts with "#" are skipped
        exon_fasta (str): path to fasta containing the exon sequences
        output_dir (str): not used by this function
        required_simulations (int): number of simulations to run
        output_file (str): path to the output csv ("sim,count" header)
    """

    exon_seqs = gen.read_fasta(exon_fasta)[1]

    # exons needs to be >= 16 to get the two exon ends
    usable_exons = []
    for exon in exon_seqs:
        if len(exon) >= 16:
            usable_exons.append(exon)

    # get motifs, avoid header if there is one
    motifs = []
    for row in gen.read_many_fields(motif_file, ","):
        if row[0][0] != "#":
            motifs.append(row[0])

    real_count = se.get_stop_codon_count(motifs)

    simulation_ids = list(range(required_simulations))
    workers = gen.run_in_parallel(simulation_ids, ["foo", usable_exons, motifs], simulate_motifs)

    simulated_counts = []
    for worker in workers:
        simulated_counts.extend(worker.get())

    with open(output_file, "w") as outfile:
        outfile.write('sim,count\n')
        outfile.write('real,{0}\n'.format(real_count))
        # simulations are numbered from 1
        for number, count in enumerate(simulated_counts, 1):
            outfile.write('{0},{1}\n'.format(number, count))
def extract_nt_indices(fasta_file, output_files):
    """
    Extract the indices for each nt given a fasta file.

    For every nucleotide, a fasta-like file is written with one record per
    sequence that contains the nucleotide: the sequence name followed by a
    comma-separated list of the 0-based positions where it occurs.

    Args:
        fasta_file (str): path to the input fasta
        output_files (dict): nucleotide -> output path, with the keys
            "A", "C", "G" and "T"
    """

    nts = ["A", "C", "G", "T"]
    names, seqs = gen.read_fasta(fasta_file)

    # One entry per name; if a name occurs more than once the last
    # sequence wins.
    seq_by_id = {}
    for name, seq in zip(names, seqs):
        seq_by_id[name.strip('>')] = seq

    for nt in nts:
        # the context manager closes the file even if writing fails
        with open(output_files[nt], "w") as outfile:
            for seq_id, seq in seq_by_id.items():
                positions = [str(pos) for pos, base in enumerate(seq) if base == nt]
                if positions:
                    outfile.write(">{0}\n".format(seq_id))
                    outfile.write("{0}\n".format(",".join(positions)))
def list_from_fasta(file):
    """
    Read a fasta file into a dictionary.

    Args:
        file (str): path to the fasta file

    Returns:
        dict: record name, cut at the first "(", -> sequence. When two
            records share the same cut name the later one wins.
    """
    record_names, record_seqs = gen.read_fasta(file)
    return {
        record_name.split('(')[0]: record_seqs[index]
        for index, record_name in enumerate(record_names)
    }
def extract_second_seqs(input_bed, input_file, genome_fasta, output_dir):
    """
    Extract the second set of sequences.

    Keeps the bed entries whose name is in the lncRNA id list, extracts
    their exon sequences, joins multi exon transcripts and filters the
    transcripts to paralogous families. Intermediate files are written
    next to input_bed; the id list and the blast files go in output_dir.

    Args:
        input_bed (str): path to the input bed file
        input_file (str): path to the file passed to extract_lncrna_only
        genome_fasta (str): path to the genome fasta
        output_dir (str): path to the output directory
    """

    # get a set of ids that correspond only to lincrna entries
    id_file = "{0}/lncrna_ids.txt".format(output_dir)
    extract_lncrna_only(input_file, id_file)

    # now keep only the bed entries that are in the id list
    filtered_bed = "{0}.filtered".format(input_bed)
    # read_many_fields gives one list of fields per line, so the rows are
    # flattened into a set of ids before testing membership.
    # NOTE(review): assumes every field of the id file is an id - confirm
    # against extract_lncrna_only.
    ids = {field for row in gen.read_many_fields(id_file, "\t") for field in row}
    bed_entries = gen.read_many_fields(input_bed, "\t")
    with open(filtered_bed, "w") as outfile:
        for entry in bed_entries:
            if entry[3] in ids:
                outfile.write("{0}\n".format("\t".join(entry)))

    # now write the bed to an exon bed
    exons_bed = "{0}.exons.bed".format(input_bed)
    fo.entries_to_bed(filtered_bed, exons_bed, hg38=True)

    # now get the exon sequences
    exons_fasta = "{0}.exons.fasta".format(input_bed)
    fo.fasta_from_intervals(exons_bed, exons_fasta, genome_fasta, force_strand=True, names=True)

    # now generate the full transcript for multi exon transcripts
    transcripts_fasta = "{0}.multi_exon_transcripts.fasta".format(input_bed)
    names, seqs = gen.read_fasta(exons_fasta)
    # transcript id -> {exon number: sequence}; the exon number is the
    # last "."-separated field of the name before any "("
    seq_list = collections.defaultdict(dict)
    for i, name in enumerate(names):
        name_fields = name.split("(")[0].split(".")
        transcript_id = ".".join(name_fields[:-1])
        exon = int(name_fields[-1])
        seq_list[transcript_id][exon] = seqs[i]

    with open(transcripts_fasta, "w") as outfile:
        for transcript_id in sorted(seq_list):
            exons = seq_list[transcript_id]
            if len(exons) > 1:
                seq = "".join(exons[exon] for exon in sorted(exons))
                if "N" not in seq and len(seq) >= 200:
                    # "." is replaced by ":" in the written name;
                    # presumably "." would interfere with sorting later.
                    written_id = ":".join(transcript_id.split("."))
                    outfile.write(">{0}\n{1}\n".format(written_id, seq))

    # blast to get paralogous families
    # NOTE(review): "bast_db" looks like a typo for "blast_db"; the
    # directory name is left unchanged.
    blast_db_path = "{0}/bast_db".format(output_dir)
    output_blast_file = "{0}/blast_output.csv".format(output_dir)
    families_file = "{0}/families.txt".format(output_dir)
    gen.create_output_directories(blast_db_path)
    cons.filter_families(transcripts_fasta, output_blast_file, families_file, database_path=blast_db_path, clean_run=True)
def test_fasta_sequence_quality_control(self):
    """Run check_sequence_quality with every check enabled on the fixture
    fasta and compare the surviving records with the expected fasta."""
    input_path = './test_data/bed_ops/test_fasta_sequence_quality_control/test_fasta_sequence_quality_control.fasta'
    expected_path = './test_data/bed_ops/test_fasta_sequence_quality_control/expected_test_fasta_sequence_quality_control.fasta'

    fasta_parts = gen.read_fasta(input_path)
    checks = {
        "check_acgt": True,
        "check_stop": True,
        "check_start": True,
        "check_length": True,
        "check_inframe_stop": True,
    }
    kept_names, kept_seqs = check_sequence_quality(fasta_parts[0], fasta_parts[1], **checks)

    expected = gen.read_fasta(expected_path)
    self.assertEqual((kept_names, kept_seqs), expected)
def test_extract_cds_clean_chrom(self):
    """Run extract_cds (clean_chrom_only=True) on the fixture gtf and
    genome files and compare the fasta it writes with the expected fasta."""
    gtf_file = "./test_data/bed_ops/test_extract_cds_clean_chrom/test_extract_cds.gtf"
    genome_file = "./test_data/bed_ops/test_extract_cds_clean_chrom/test_extract_cds_genome.fa"
    bed_output = "./test_data/bed_ops/test_extract_cds_clean_chrom/test_extract_cds.bed"
    expected_path = "./test_data/bed_ops/test_extract_cds_clean_chrom/expected_test_extract_cds_fasta.fasta"
    observed_path = "./test_data/bed_ops/test_extract_cds_clean_chrom/observed_test_extract_cds_fasta.fasta"
    intervals_path = "./test_data/bed_ops/test_extract_cds_clean_chrom/observed_intervals.fasta"

    # clear any outputs left over from a previous run
    for stale in (observed_path, intervals_path):
        gen.remove_file(stale)
    expected = gen.read_fasta(expected_path)

    extract_cds(gtf_file, bed_output, observed_path, genome_file, intervals_path, clean_chrom_only=True)

    self.assertEqual(gen.read_fasta(observed_path), expected)
def intron_hexamer_test(input_fasta, motif_file, output_directory, output_file, required_simulations = None, families_file = None):
    """
    Generate random hexamers from introns and calculate purine content.

    The purine and nucleotide content of the real motifs is written to
    the output file, followed by one row per file of simulated motifs
    found in "<output_directory>/random_hexamers". That directory is
    removed at the end.

    Args:
        input_fasta (str): path to intron fasta
        motif_file (str): path to file containing real motifs
        output_directory (str): path to output directory
        output_file (str): path to output file
        required_simulations (int): if set, the number of simulations to
            run. If not set no simulations are generated and only the
            files already present in the directory are used.
        families_file (str): if set, path to families file
    """

    hexamers_dir = "{0}/random_hexamers".format(output_directory)
    gen.create_output_directories(hexamers_dir)

    # get the motifs
    motifs = sequo.read_motifs(motif_file)

    # if there are not enough simulations, generate them
    existing_simulations = len(os.listdir(hexamers_dir))
    if required_simulations and existing_simulations < required_simulations:
        required = list(range(required_simulations - existing_simulations))
        names, seqs = gen.read_fasta(input_fasta)
        # group the sequences by the part of the name before the first "."
        seqs_list = collections.defaultdict(list)
        for i, name in enumerate(names):
            seqs_list[name.split(".")[0]].append(seqs[i])
        if families_file:
            seqs_list = sequo.pick_random_family_member(families_file, seqs_list)

        all_seqs = []
        for group in seqs_list:
            all_seqs.extend(seqs_list[group])
        # the sequences are joined with "X" as a separator
        full_seq = "X".join(all_seqs)
        simopc.run_simulation_function(required, [full_seq, motifs, hexamers_dir], sequo.locate_random_motifs, sim_run = False)

    # calculate the purine contents
    real_purine_content = sequo.calc_purine_content(motifs)
    real_nt_content = sequo.calc_nucleotide_content(motifs)

    test_purine_content = []
    test_nt_content = []
    for file in os.listdir(hexamers_dir):
        filepath = "{0}/{1}".format(hexamers_dir, file)
        test_motifs = sequo.read_motifs(filepath)
        test_purine_content.append(sequo.calc_purine_content(test_motifs))
        test_nt_content.append(sequo.calc_nucleotide_content(test_motifs))

    with open(output_file, "w") as outfile:
        outfile.write("id,purine_content,a_content,c_content,g_content,t_content\n")
        real_nts = [real_nt_content[nt] for nt in sorted(real_nt_content)]
        outfile.write("real,{0},{1}\n".format(real_purine_content, ",".join(gen.stringify(real_nts))))
        # simulated rows are numbered from 1
        for i, (purine_content, nt_content) in enumerate(zip(test_purine_content, test_nt_content)):
            sim_nts = [nt_content[nt] for nt in sorted(nt_content)]
            outfile.write("{0},{1},{2}\n".format(i+1, purine_content, ",".join(gen.stringify(sim_nts))))

    # remove the output directory
    gen.remove_directory(hexamers_dir)
def test_extract_cds_from_bed_quality_control(self):
    """Run extract_cds_from_bed with the five quality checks enabled and
    compare the fasta it writes with the expected fasta."""
    bed_file = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/test_extract_cds_from_bed_quality_control_bed.bed"
    genome_file = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/test_extract_cds_from_bed_quality_control_genome.fa"
    expected_path = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/expected_test_extract_cds_from_bed_quality_control_fasta.fasta"
    observed_path = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/observed_test_extract_cds_from_bed_quality_control_fasta.fasta"
    intervals_path = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/observed_intervals.fasta"

    # clear any outputs left over from a previous run
    for stale in (observed_path, intervals_path):
        gen.remove_file(stale)
    expected = gen.read_fasta(expected_path)

    checks = {
        "check_acgt": True,
        "check_start": True,
        "check_length": True,
        "check_stop": True,
        "check_inframe_stop": True,
    }
    extract_cds_from_bed(bed_file, observed_path, genome_file, intervals_path, **checks)

    self.assertEqual(gen.read_fasta(observed_path), expected)
def get_coding_exons(coding_exons_fasta):
    """
    Read the coding exons of a fasta file into a nested dictionary.

    Record names are expected to look like "<transcript>.<exon number>(...)".

    Args:
        coding_exons_fasta (str): path to the coding exons fasta

    Returns:
        collections.defaultdict: transcript id -> {exon number: sequence}.
            Looking up a missing transcript creates an empty entry for it.
    """

    def new_transcript_entry():
        # same nested default factory as before, so lookups of missing
        # keys behave exactly as they always did
        return collections.defaultdict(lambda: collections.defaultdict())

    coding_exons = collections.defaultdict(new_transcript_entry)

    record_names, record_seqs = gen.read_fasta(coding_exons_fasta)
    for index, record_name in enumerate(record_names):
        fields = record_name.split('(')[0].split('.')
        transcript = fields[0]
        exon = int(fields[1])
        seq = record_seqs[index]
        coding_exons[transcript][exon] = seq

    return coding_exons
def build_sequences(input_fasta, input_stops_fasta, output_fasta):
    """
    Build one sequence per transcript from its parts.

    The parts of a transcript are joined in ascending exon number and
    its stop codon records are appended to the end. Transcripts are
    written sorted by id.

    Args:
        input_fasta (str): path to input fasta; names are expected to
            look like "<transcript>.<exon number>(...)"
        input_stops_fasta (str): path to fasta containing the stop codons
        output_fasta (str): path for output fasta containing built sequences
    """

    names, seqs = gen.read_fasta(input_fasta)
    stop_names, stop_seqs = gen.read_fasta(input_stops_fasta)

    # transcript id -> {exon number: sequence}
    parts_by_transcript = {}
    for index, name in enumerate(names):
        name_fields = name.split('(')[0].split('.')
        transcript_id = name_fields[0]
        exon_id = int(name_fields[1])
        seq = seqs[index]
        parts_by_transcript.setdefault(transcript_id, {})[exon_id] = seq

    # transcript id -> parts in ascending exon number
    ordered_parts = {}
    for transcript_id, parts in parts_by_transcript.items():
        ordered_parts[transcript_id] = [parts[exon_id] for exon_id in sorted(parts)]

    # add the stop codons; a transcript that only has a stop codon record
    # still gets an entry
    for index, stop_name in enumerate(stop_names):
        transcript_id = stop_name.split(".")[0]
        ordered_parts.setdefault(transcript_id, []).append(stop_seqs[index])

    with open(output_fasta, "w") as outfile:
        for transcript_id in sorted(ordered_parts):
            built = "".join(ordered_parts[transcript_id])
            outfile.write(">{0}\n{1}\n".format(transcript_id, built))
def get_exon_reading_frame(coding_exons_fasta, full_exons_fasta, output_file):
    """
    Get the reading frame an exon starts in. If the first position of
    the codon this is 0, second is 1 and last is 2.

    The frame of an exon is the combined length of the transcript's
    preceding exons (in ascending exon number, taken from the full exon
    set) modulo 3. Coding exons that do not appear in the full exon set
    are left out of the output.

    Args:
        coding_exons_fasta (str): path to file containing coding exon sequences
        full_exons_fasta (str): path to file containing all exon sequences
        output_file (str): path to the output file
    """

    def parse_name(name):
        # "<transcript>.<exon number>(...)" -> (transcript, exon number)
        name_parts = name.split(".")
        return name_parts[0], int(name_parts[1].split("(")[0])

    # read in the coding exons and all exons
    full_names, full_seqs = gen.read_fasta(full_exons_fasta)
    full_coding_names = gen.read_fasta(coding_exons_fasta)[0]

    # transcript id -> {exon number: sequence}
    full_exon_seqs = collections.defaultdict(dict)
    for i, name in enumerate(full_names):
        transcript_id, exon_id = parse_name(name)
        full_exon_seqs[transcript_id][exon_id] = full_seqs[i]

    # (transcript id, exon number) -> reading frame the exon starts in
    reading_frames = {}
    for transcript_id, exons in full_exon_seqs.items():
        seq_length = 0
        for exon_id in sorted(exons):
            reading_frames[(transcript_id, exon_id)] = seq_length % 3
            seq_length += len(exons[exon_id])

    # now output the reading frames for just the coding exons
    with open(output_file, "w") as outfile:
        for name in full_coding_names:
            frame = reading_frames.get(parse_name(name))
            # Skip exons without a frame instead of writing the repr of
            # an empty default entry into the file.
            if frame is None:
                continue
            outfile.write(">{0}\n{1}\n".format(name, frame))
def get_utrs(transcript_fasta, cds_fasta, output_file):
    """
    Write the part of each transcript that precedes its CDS.

    For every CDS, the first occurrence of the CDS sequence is located
    in the transcript of the same name. Everything before it is written
    to the output fasta. Nothing is written when the CDS is not found in
    the transcript or starts at its first position.

    Args:
        transcript_fasta (str): path to the fasta of full transcripts
        cds_fasta (str): path to the fasta of CDS sequences
        output_file (str): path to the output fasta

    Raises:
        KeyError: if a CDS name has no transcript of the same name.
    """

    cds_names, cds_seqs = gen.read_fasta(cds_fasta)
    cds_list = {name: cds_seqs[i] for i, name in enumerate(cds_names)}

    transcript_names, transcript_seqs = gen.read_fasta(transcript_fasta)
    transcript_list = {
        name: transcript_seqs[i]
        for i, name in enumerate(transcript_names)
    }

    with open(output_file, "w") as outfile:
        for name, cds in cds_list.items():
            transcript = transcript_list[name]
            # find() gives -1 when the CDS is absent, so "not found" needs
            # no exception handling and write errors are no longer hidden
            cds_start_index = transcript.find(cds)
            if cds_start_index > 0:
                outfile.write(">{0}\n{1}\n".format(
                    name, transcript[:cds_start_index]))
def filter_coding_sequences(input_fasta, output_fasta):
    """
    Quality filter coding sequences.

    A sequence is kept when it starts with ATG, ends with a stop codon,
    has a length that is a multiple of 3, contains only A, C, T and G,
    and has no stop codon among the in-frame codons between the first
    and the last codon. The number of sequences before and after
    filtering is printed.

    Args:
        input_fasta (str): path to fasta file containing input CDS sequences
        output_fasta (str): path to output file to put sequences that pass filtering
    """

    stop_codons = ["TAA", "TAG", "TGA"]
    non_actg_regex = re.compile("[^ACTG]")
    codon_regex = re.compile(".{3}")

    def passes_filters(seq):
        # the first codon has to be ATG
        if seq[:3] != "ATG":
            return False
        # the last codon has to be a stop codon
        if seq[-3:] not in stop_codons:
            return False
        # the length has to be a multiple of 3
        if len(seq) % 3 != 0:
            return False
        # no characters other than A, C, T, G
        if non_actg_regex.search(seq) is not None:
            return False
        # no in frame stop codons
        for codon in codon_regex.findall(seq[3:-3]):
            if codon in stop_codons:
                return False
        return True

    names, seqs = gen.read_fasta(input_fasta)
    print("{0} sequences prior to filtering...".format(len(seqs)))

    pass_count = 0
    with open(output_fasta, "w") as outfile:
        for index, name in enumerate(names):
            seq = seqs[index]
            if not passes_filters(seq):
                continue
            outfile.write(">{0}\n{1}\n".format(name, seq))
            pass_count += 1

    print("{0} sequences after filtering...".format(pass_count))
def extract_cds_from_bed(bed_file, output_fasta, genome_fasta, fasta_interval_file, check_acgt=None, check_start=None, check_length=None, check_stop=None, check_inframe_stop=None, all_checks=None, uniquify=False):
    """
    Extract the CDS of each sample in a bed file to a fasta file.

    The interval sequences are written to fasta_interval_file first,
    grouped by the part of the name before the first ".", ordered by
    exon number and joined. Intervals that are a 3-nt stop codon, or
    whose name ends in ".stop_codon", always go last.

    Args:
        bed_file (str): path to the bed file holding the CDS intervals
        output_fasta (str): path to the output fasta
        genome_fasta (str): path to the genome fasta
        fasta_interval_file (str): path for the fasta of the intervals
        check_acgt, check_start, check_length, check_stop,
        check_inframe_stop, all_checks: passed to check_sequence_quality,
            which is run when any of them is truthy
        uniquify (bool): if True, leave only one transcript per gene
    """

    stop_codons = ('TAA', 'TAG', 'TGA')
    # high number so when sorted the stop codon is the last part appended
    stop_exon_id = 9999999

    # create fasta file with extracted parts, then read it back
    fasta_from_intervals(bed_file, fasta_interval_file, genome_fasta, names=True)
    interval_names, interval_seqs = gen.read_fasta(fasta_interval_file)

    # sample id -> {exon number: sequence}
    parts_by_sample = {}
    for index, interval_name in enumerate(interval_names):
        seq = interval_seqs[index]
        name_fields = interval_name.split('.')
        sample_id = name_fields[0]
        if seq in stop_codons or name_fields[-1] == "stop_codon":
            exon_id = stop_exon_id
        else:
            exon_id = int(name_fields[1])
        parts_by_sample.setdefault(sample_id, {})[exon_id] = seq

    # build each CDS from its parts, samples in sorted order
    names = sorted(parts_by_sample)
    seqs = []
    for sample_id in names:
        parts = parts_by_sample[sample_id]
        seqs.append("".join([parts[exon_id] for exon_id in sorted(parts)]))

    # perform sequence quality control checks
    checks_requested = (check_acgt, check_stop, check_start, check_length, check_inframe_stop, all_checks)
    if any(checks_requested):
        names, seqs = check_sequence_quality(names, seqs, check_acgt, check_stop, check_start, check_length, check_inframe_stop, all_checks)

    if uniquify:
        # leave only one transcript per gene
        gene_to_trans = link_genes_and_transcripts(bed_file)
        names, seqs = uniquify_trans(names, seqs, gene_to_trans)
        print("After leaving only one transcript per gene, {0} sequences remain.".format(len(seqs)))

    # write to output fasta file
    gen.write_to_fasta(names, seqs, output_fasta)
def filter_bed_from_fasta(bed, fasta, out_bed, families_file = None):
    """
    Given a bed file and a fasta file, filter the bed file to only leave
    records where the 'name' field appears among the names in the fasta
    file. Write to out_bed, replacing it if it already exists.

    If a families_file is given, leave only one member per family: the
    one with the longest sequence in the fasta (the first one listed when
    lengths are tied).

    Args:
        bed (str): path to the bed file to filter
        fasta (str): path to the fasta file holding the names to keep
        out_bed (str): path to the filtered bed file
        families_file (str): if set, path to the families file
    """

    # If the output already exists, write to a temporary file beside it
    # and swap the two once writing has finished.
    output_exists = Path(out_bed).exists()
    if output_exists:
        root, extension = os.path.splitext(out_bed)
        temp_file_name = "{0}.{1}{2}".format(root, random.random(), extension)
    else:
        temp_file_name = out_bed

    fasta_names, fasta_seqs = gen.read_fasta(fasta)
    # a set makes each membership test constant time
    kept_names = set(fasta_names)

    # read in family information and pick one transcript per family
    if families_file:
        # length of the first sequence recorded under each name
        length_by_name = {}
        for name, seq in zip(fasta_names, fasta_seqs):
            length_by_name.setdefault(name, len(seq))

        # drop family members that are not in the fasta
        families = [
            [member for member in family if member in length_by_name]
            for family in gen.read_families(families_file)
        ]
        family_members = {member for family in families for member in family}

        # first fish out singletons, then add the longest of each family
        kept_names = {name for name in fasta_names if name not in family_members}
        for family in families:
            if family:
                kept_names.add(max(family, key=length_by_name.__getitem__))

    # rows with fewer than four fields (e.g. empty lines) have no name
    bed_data = [row for row in gen.read_many_fields(bed, "\t") if len(row) > 3]

    # captures the leading run of word characters (letters, digits and
    # underscores) of the name field
    id_regex = re.compile(r"^(\w+).*")
    with open(temp_file_name, "w") as file:
        for line in bed_data:
            idn = id_regex.search(line[3])
            if idn and idn.group(1) in kept_names:
                file.write("\t".join(line))
                file.write("\n")

    # remove old file, replace with new
    if output_exists:
        os.remove(out_bed)
        shutil.move(temp_file_name, out_bed)
def get_seq_list(fasta_file, with_chr=False, full_seq_list=None):
    """
    Group the sequences of a fasta file by id.

    The id of a record is its name cut at the first "(".

    Args:
        fasta_file (str): path to the fasta file
        with_chr (bool): if True, also drop the last "."-separated field
            of the id
        full_seq_list: if given and non-empty, only ids contained in it
            are kept

    Returns:
        dict: id -> list of sequences, in file order
    """
    record_names, record_seqs = gen.read_fasta(fasta_file)

    grouped = {}
    for index, record_name in enumerate(record_names):
        key = record_name.split("(")[0]
        if with_chr:
            # everything before the last "."; empty when there is no "."
            key = key.rpartition(".")[0]
        if full_seq_list and key not in full_seq_list:
            continue
        grouped.setdefault(key, []).append(record_seqs[index])

    return grouped
def filter_seq_lengths(input_file, output_file, length):
    """
    Given a fasta file, keep only the sequences that are at least the
    given length.

    Args:
        input_file (str): path to input fasta
        output_file (str): path to output fasta
        length (int): minimum length; sequences of exactly this length
            are retained
    """
    record_names, record_seqs = gen.read_fasta(input_file)

    with open(output_file, "w") as outfile:
        for index, record_name in enumerate(record_names):
            seq = record_seqs[index]
            if len(seq) < length:
                continue
            outfile.write(">{0}\n{1}\n".format(record_name, seq))
def test_extract_cds_quality_control(self):
    """Run extract_cds with the five quality checks enabled and compare
    the fasta it writes with the expected fasta."""
    gtf_file = "./test_data/bed_ops/test_extract_cds_quality_control/test_extract_cds_quality_control.gtf"
    genome_file = "./test_data/bed_ops/test_extract_cds_quality_control/test_extract_cds_quality_control_genome.fa"
    expected_path = "./test_data/bed_ops/test_extract_cds_quality_control/expected_test_extract_cds_quality_control_fasta.fasta"
    observed_path = "./test_data/bed_ops/test_extract_cds_quality_control/observed_test_extract_cds_quality_control_fasta.fasta"
    intervals_path = "./test_data/bed_ops/test_extract_cds_quality_control/observed_intervals.fasta"
    bed_output = "./test_data/bed_ops/test_extract_cds_quality_control/observed_test_extract_cds_quality_control_cds.bed"

    # clear any outputs left over from a previous run
    for stale in (observed_path, intervals_path, bed_output):
        gen.remove_file(stale)
    expected = gen.read_fasta(expected_path)

    checks = {
        "check_acgt": True,
        "check_start": True,
        "check_length": True,
        "check_stop": True,
        "check_inframe_stop": True,
    }
    extract_cds(gtf_file, bed_output, observed_path, genome_file, intervals_path, **checks)

    self.assertEqual(gen.read_fasta(observed_path), expected)
def get_fasta_exon_intervals(intervals_fasta):
    """
    Get a list of exon seqs.

    Record names are expected to look like "<transcript>.<exon number>(...)".
    A record whose sequence is exactly a stop codon is stored under exon
    number 99999 instead of its own number.

    Args:
        intervals_fasta (str): path to the fasta of exon intervals

    Returns:
        collections.defaultdict: transcript id -> {exon number: sequence}
    """
    stop_codons = ("TAA", "TAG", "TGA")
    stop_exon_id = 99999

    record_names, record_seqs = gen.read_fasta(intervals_fasta)

    exon_interval_list = collections.defaultdict(lambda: collections.defaultdict())
    for index, record_name in enumerate(record_names):
        transcript_id = record_name.split('.')[0]
        # the exon number is parsed for stop codon records as well
        exon = int(record_name.split('(')[0].split('.')[1])
        seq = record_seqs[index]
        exon_id = stop_exon_id if seq in stop_codons else exon
        exon_interval_list[transcript_id][exon_id] = seq

    return exon_interval_list