Esempio n. 1
0
def get_full_transcripts(cds_fasta, exons_fasta, output_file):
    """
    Join the exons of each transcript, in ascending exon-number order, and
    write the resulting sequences to a fasta file.

    Exon record names are parsed as "<transcript_id>.<exon_number>" with an
    optional "(...)" suffix after the exon number.

    Args:
        cds_fasta (str): path to the CDS fasta. It is not used to build the
            output and is no longer read; the parameter is kept so existing
            callers keep working.
        exons_fasta (str): path to the fasta file containing exon sequences
        output_file (str): path to the output fasta file
    """

    exon_names, exon_seqs = gen.read_fasta(exons_fasta)

    # transcript id -> {exon number -> exon sequence}
    exons_by_transcript = collections.defaultdict(dict)
    for name, seq in zip(exon_names, exon_seqs):
        name_parts = name.split(".")
        transcript_id = name_parts[0]
        exon_number = int(name_parts[1].split("(")[0])
        exons_by_transcript[transcript_id][exon_number] = seq

    with open(output_file, "w") as outfile:
        for transcript_id, exons in exons_by_transcript.items():
            # sort numerically so exon 10 follows exon 9 rather than exon 1
            transcript_seq = "".join(exons[number] for number in sorted(exons))
            outfile.write(">{0}\n{1}\n".format(transcript_id, transcript_seq))
Esempio n. 2
0
 def test_filter_fasta_intervals_from_fasta(self):
     # Run filter_fasta_intervals_from_fasta on the test inputs and compare
     # the records it writes with the stored expected fasta.
     source_fasta = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/test_filter_fasta_intervals_from_fasta_fasta.fasta"
     interval_fasta = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/test_filter_fasta_intervals_from_fasta_intervals.fasta"
     expected_path = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/expected_filtered_intervals.fasta"
     observed_path = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/observed_filtered_intervals.fasta"
     # delete the observed file before it is regenerated
     gen.remove_file(observed_path)
     filter_fasta_intervals_from_fasta(interval_fasta, source_fasta,
                                       observed_path)
     expected_records = gen.read_fasta(expected_path)
     observed_records = gen.read_fasta(observed_path)
     self.assertEqual(expected_records, observed_records)
Esempio n. 3
0
 def test_sim_cds_seqs(self):
     # Split every input sequence into codons, hand the first codons, last
     # codons and the codons in between to so.sim_cds_seqs with seed=1, and
     # compare the result with the stored expected sequences.
     input_seqs = gen.read_fasta(
         "test_data/sim_ops/test_sim_cds_seqs/input.fasta")[1]
     expected_seqs = gen.read_fasta(
         "test_data/sim_ops/test_sim_cds_seqs/expected.fasta")[1]
     first_codons = []
     last_codons = []
     inner_codons = []
     for seq in input_seqs:
         seq_codons = re.findall(".{3}", seq)
         first_codons.append(seq_codons[0])
         last_codons.append(seq_codons[-1])
         inner_codons.append(seq_codons[1:-1])
     simulated = so.sim_cds_seqs(inner_codons, first_codons, last_codons,
                                 seed=1)
     self.assertEqual(expected_seqs, simulated)
Esempio n. 4
0
 def test_fasta_from_intervals(self):
     # Convert the test bed file to fasta and compare the written records
     # with the stored expected fasta.
     bed_path = "test_data/bed_ops/test_fasta_from_intervals/test_bed_for_fasta_conversion.bed"
     genome_path = "test_data/bed_ops/test_fasta_from_intervals/test_genome.fa"
     observed_path = "test_data/bed_ops/test_fasta_from_intervals/observed_converted_fasta.fasta"
     expected_path = "test_data/bed_ops/test_fasta_from_intervals/expected_converted_fasta.fasta"
     # delete the observed file before it is regenerated
     gen.remove_file(observed_path)
     expected_records = gen.read_fasta(expected_path)
     fasta_from_intervals(bed_path, observed_path, genome_path)
     observed_records = gen.read_fasta(observed_path)
     self.assertEqual(observed_records, expected_records)
Esempio n. 5
0
 def test_extract_nt_indices(self):
     # Extract the per-nucleotide index files for the test fasta and compare
     # each of the four outputs with its stored expected file.
     fasta_file = "test_data/bed_ops/test_extract_nt_indices/test_fasta_input.fasta"
     observed_files = {
         "A": "test_data/bed_ops/test_extract_nt_indices/observed_indices_a.fasta",
         "C": "test_data/bed_ops/test_extract_nt_indices/observed_indices_c.fasta",
         "G": "test_data/bed_ops/test_extract_nt_indices/observed_indices_g.fasta",
         "T": "test_data/bed_ops/test_extract_nt_indices/observed_indices_t.fasta",
     }
     expected_files = {
         "A": "test_data/bed_ops/test_extract_nt_indices/expected_indices_a.fasta",
         "C": "test_data/bed_ops/test_extract_nt_indices/expected_indices_c.fasta",
         "G": "test_data/bed_ops/test_extract_nt_indices/expected_indices_g.fasta",
         "T": "test_data/bed_ops/test_extract_nt_indices/expected_indices_t.fasta",
     }
     nucleotides = ("A", "C", "G", "T")
     # delete the observed files before they are regenerated
     for nt in nucleotides:
         gen.remove_file(observed_files[nt])
     extract_nt_indices(fasta_file, observed_files)
     # read everything first, then assert, so a missing file surfaces before
     # any comparison is made
     expected_records = {
         nt: gen.read_fasta(expected_files[nt]) for nt in nucleotides
     }
     observed_records = {
         nt: gen.read_fasta(observed_files[nt]) for nt in nucleotides
     }
     for nt in nucleotides:
         self.assertEqual(observed_records[nt], expected_records[nt])
Esempio n. 6
0
def get_exon_flank_reading_frame(coding_exons_fasta, full_exons_fasta,
                                 output_file):
    """
    Write the reading frames in which the two flanks of each coding exon
    start. A frame of 0 is the first codon position, 1 the second and 2 the
    last.

    Frames are computed from the cumulative length of a transcript's exons
    taken in ascending exon-number order. Only exons longer than 138 nts get
    an entry; for those, the five prime flank start is taken as the exon
    offset + 2 and the three prime flank start as the exon end - 69.

    Args:
        coding_exons_fasta (str): path to file containing coding exon sequences
        full_exons_fasta (str): path to file containing all exon sequences
        output_file (str): path to the output file
    """

    def parse_name(record_name):
        # "<transcript>.<exon>(...)" -> (transcript, exon number)
        fields = record_name.split(".")
        return fields[0], int(fields[1].split("(")[0])

    full_names, full_seqs = gen.read_fasta(full_exons_fasta)
    coding_names = gen.read_fasta(coding_exons_fasta)[0]

    # transcript -> {exon number -> sequence} for every exon
    exons_by_transcript = {}
    for index, record_name in enumerate(full_names):
        transcript, exon_number = parse_name(record_name)
        exon_seq = full_seqs[index]
        exons_by_transcript.setdefault(transcript, {})[exon_number] = exon_seq

    # (transcript, exon number) -> (five prime frame, three prime frame)
    flank_frames = {}
    for transcript, exons in exons_by_transcript.items():
        offset = 0
        for exon_number in sorted(exons):
            exon_length = len(exons[exon_number])
            # sequence needs to be longer than 138 nts to have two flanks
            if exon_length > 138:
                flank_frames[(transcript, exon_number)] = (
                    (offset + 2) % 3,
                    (offset + exon_length - 69) % 3,
                )
            offset += exon_length

    # only the coding exons are written, in the order they were read
    with open(output_file, "w") as outfile:
        for record_name in coding_names:
            frames = flank_frames.get(parse_name(record_name))
            if frames is None:
                continue
            outfile.write(">{0}\n{1},{2}\n".format(record_name, frames[0],
                                                   frames[1]))
Esempio n. 7
0
 def test_extract_cds_from_bed(self):
     # Extract CDS sequences from the test bed file and compare the written
     # fasta with the stored expected fasta.
     bed_path = "./test_data/bed_ops/test_extract_cds_from_bed/test_extract_cds_from_bed_bed.bed"
     genome_path = "./test_data/bed_ops/test_extract_cds_from_bed/test_extract_cds_from_bed_genome.fa"
     observed_path = "./test_data/bed_ops/test_extract_cds_from_bed/observed_test_extract_cds_from_bed_fasta.fasta"
     intervals_path = "./test_data/bed_ops/test_extract_cds_from_bed/observed_intervals.fasta"
     expected_path = "./test_data/bed_ops/test_extract_cds_from_bed/expected_test_extract_cds_from_bed_fasta.fasta"
     # delete both generated files before they are rewritten
     for generated in (observed_path, intervals_path):
         gen.remove_file(generated)
     expected_records = gen.read_fasta(expected_path)
     extract_cds_from_bed(bed_path, observed_path, genome_path,
                          intervals_path)
     observed_records = gen.read_fasta(observed_path)
     self.assertEqual(observed_records, expected_records)
Esempio n. 8
0
 def test_extract_cds(self):
     # Run extract_cds with full_chr_name=True on the test GTF and genome and
     # compare the written fasta with the stored expected fasta.
     gtf_path = "./test_data/bed_ops/test_extract_cds/test_extract_cds.gtf"
     genome_path = "./test_data/bed_ops/test_extract_cds/test_extract_cds_genome.fa"
     bed_path = "./test_data/bed_ops/test_extract_cds/test_extract_cds.bed"
     observed_path = "./test_data/bed_ops/test_extract_cds/observed_test_extract_cds_fasta.fasta"
     expected_path = "./test_data/bed_ops/test_extract_cds/expected_test_extract_cds_fasta.fasta"
     # delete the observed file before it is regenerated
     gen.remove_file(observed_path)
     expected_records = gen.read_fasta(expected_path)
     extract_cds(gtf_path, bed_path, observed_path, genome_path,
                 full_chr_name=True)
     observed_records = gen.read_fasta(observed_path)
     self.assertEqual(observed_records, expected_records)
Esempio n. 9
0
def get_passed_NONCODE_codes(input_fasta, codes_file, mapping_file,
                             output_fasta, code):
    """
    Write only the sequences whose gene carries the requested NONCODE code.

    Each fasta record name is looked up in the mapping file to get its gene,
    and the gene is looked up in the codes file to get its code. A name or
    gene missing from those files raises KeyError.

    Args:
        input_fasta (str): path to input fasta file
        codes_file (str): path to tab-separated file; first column is the
            gene, second column its code
        mapping_file (str): path to tab-separated transcript-gene mapping
            file; the first column (truncated at its first ".") is the
            transcript, the second column the gene
        output_fasta (str): path to output fasta
        code (str): code to keep. Compared as a string, so values such as
            "0001" keep their leading zeros
    """

    gene_codes = {}
    for fields in gen.read_many_fields(codes_file, "\t"):
        gene_codes[fields[0]] = fields[1]

    transcript_genes = {}
    for fields in gen.read_many_fields(mapping_file, "\t"):
        transcript_genes[fields[0].split(".")[0]] = fields[1]

    names, seqs = gen.read_fasta(input_fasta)
    with open(output_fasta, "w") as outfile:
        for index, name in enumerate(names):
            gene = transcript_genes[name]
            if gene_codes[gene] != code:
                continue
            outfile.write(">{0}\n{1}\n".format(name, seqs[index]))
Esempio n. 10
0
def fasta_from_intervals(bed_file, fasta_file, genome_fasta, force_strand = True, names = False):
    """
    Create a fasta file holding the sequences of the intervals in a bed file
    by running "bedtools getfasta", then rewrite that fasta with all
    sequences upper-cased.
    Credit: Rosina Savisaar

    Args:
        bed_file (str): the bed file path to create fasta from
        fasta_file (str): the output fasta file path
        genome_fasta (str): the file path to the genome fasta file
        force_strand (bool): if True, "-s" is passed to bedtools getfasta;
            if False it is left out.
        names (bool): if True, "-name" is passed to bedtools getfasta so the
            record names come from the 'name' field of the bed file; if
            False, bedtools generates the names from the coordinates.
    """

    # An existing index that lacks any chromosome named in the bed file is
    # deleted (presumably so that it gets rebuilt -- confirm against
    # bedtools behaviour).
    index_path = genome_fasta + '.fai'
    if os.path.exists(index_path):
        bed_chroms = {fields[0] for fields in gen.read_many_fields(bed_file, "\t")}
        indexed_chroms = {fields[0] for fields in gen.read_many_fields(index_path, "\t")}
        if not bed_chroms.issubset(indexed_chroms):
            gen.remove_file(index_path)

    command = ["bedtools", "getfasta"]
    if force_strand:
        command.append("-s")
    command.extend(["-fi", genome_fasta, "-bed", bed_file, "-fo", fasta_file])
    if names:
        command.append("-name")
    gen.run_process(command)

    # rewrite the output with every sequence in upper case
    record_names, record_seqs = gen.read_fasta(fasta_file)
    upper_seqs = [seq.upper() for seq in record_seqs]
    gen.write_to_fasta(record_names, upper_seqs, fasta_file)
Esempio n. 11
0
def uniquify_transcripts(input_fasta, transcript_gene_links, output_fasta):
    """
    Given a fasta and the links between genes and transcripts, filter to only
    leave one transcript per gene. Choose the longest transcript; when
    several share the maximum length, the one listed first for the gene wins.

    Genes are written in sorted order of their ids.

    Args:
        input_fasta (str): path to the input fasta
        transcript_gene_links (dict): gene id -> list of transcript ids
        output_fasta (str): path to the output fasta

    Raises:
        ValueError: if a gene has no transcripts, or a linked transcript is
            not present in the input fasta.
    """

    names, seqs = gen.read_fasta(input_fasta)

    # Build the name -> sequence lookup once instead of scanning the name
    # list for every transcript. setdefault keeps the first record when a
    # name is duplicated, which is what list.index() used to select.
    seq_by_name = {}
    for name, seq in zip(names, seqs):
        seq_by_name.setdefault(name, seq)

    def transcript_length(transcript_id):
        return len(seq_by_name[transcript_id])

    with open(output_fasta, "w") as outfile:
        for gene_id in sorted(transcript_gene_links):
            try:
                # max() returns the first of equally long transcripts
                longest = max(transcript_gene_links[gene_id],
                              key=transcript_length)
            except KeyError as missing:
                # keep the exception type raised by the list.index() lookup
                raise ValueError(
                    "transcript {0} of gene {1} is not in {2}".format(
                        missing.args[0], gene_id, input_fasta)) from None
            outfile.write(">{0}\n{1}\n".format(longest, seq_by_name[longest]))
Esempio n. 12
0
def run_exon_simulation(motif_file, exon_fasta, output_dir, required_simulations, output_file):
    '''
    Run simulation that picks hexamers from the exon sequences and write the
    real and simulated counts to a csv file.

    The csv has a "sim,count" header, a "real" row holding
    se.get_stop_codon_count of the real motifs, then one row per simulated
    count numbered from 1. output_dir is not used by this function.
    '''

    # exons need to be >= 16 to get the two exon ends
    eligible_exons = [
        seq for seq in gen.read_fasta(exon_fasta)[1] if len(seq) >= 16
    ]

    # first field of every line, skipping lines that start with "#" (header)
    motifs = []
    for fields in gen.read_many_fields(motif_file, ","):
        if fields[0][0] != "#":
            motifs.append(fields[0])

    real_count = se.get_stop_codon_count(motifs)

    simulation_ids = list(range(required_simulations))
    # "foo" is passed through as the first extra argument exactly as before;
    # how simulate_motifs uses it is not visible from here
    workers = gen.run_in_parallel(simulation_ids, ["foo", eligible_exons, motifs], simulate_motifs)

    simulated_counts = []
    for worker in workers:
        simulated_counts.extend(worker.get())

    with open(output_file, "w") as outfile:
        outfile.write('sim,count\n')
        outfile.write('real,{0}\n'.format(real_count))
        for sim_number, count in enumerate(simulated_counts, start=1):
            outfile.write('{0},{1}\n'.format(sim_number, count))
Esempio n. 13
0
def extract_nt_indices(fasta_file, output_files):
    '''
    Extract the indices for each nt given a fasta file.

    For every record, the 0-based positions of each of A, C, G and T are
    written to that nucleotide's output file as a fasta-style entry: a
    ">name" line followed by the comma-separated positions. A record with no
    occurrence of a nucleotide is left out of that nucleotide's file.

    Args:
        fasta_file (str): path to the input fasta
        output_files (dict): output path per nucleotide, of the form
            {"A": "filepath_for_A", "C": "filepath_for_C",
             "G": "filepath_for_G", "T": "filepath_for_T"}
    '''
    import contextlib

    nts = ["A", "C", "G", "T"]
    names, seqs = gen.read_fasta(fasta_file)

    # record name -> {nt -> list of positions as strings}
    indices = {}
    for name, seq in zip(names, seqs):
        record_id = name.strip('>')
        indices[record_id] = {
            nt: [str(match.start()) for match in re.finditer(nt, seq)]
            for nt in nts
        }

    # ExitStack closes every opened file even if opening a later one or a
    # write fails part way through
    with contextlib.ExitStack() as stack:
        outfiles = {
            nt: stack.enter_context(open(output_files[nt], "w"))
            for nt in nts
        }
        for record_id, positions_by_nt in indices.items():
            for nt, positions in positions_by_nt.items():
                if positions:
                    outfiles[nt].write(">{0}\n{1}\n".format(
                        record_id, ",".join(positions)))
Esempio n. 14
0
def list_from_fasta(file):
    """
    Read a fasta file into a dict mapping each record name, truncated at its
    first "(", to the record's sequence. When two records share the same
    truncated name, the later one wins.

    Args:
        file (str): path to the fasta file

    Returns:
        dict: truncated record name -> sequence
    """
    record_names, record_seqs = gen.read_fasta(file)
    return {
        record_name.split('(')[0]: record_seqs[index]
        for index, record_name in enumerate(record_names)
    }
Esempio n. 15
0
def extract_second_seqs(input_bed, input_file, genome_fasta, output_dir):
    """
    Extract the second set of sequences.

    Steps, all visible in the body below:
      1. write the lncRNA ids found in input_file to
         "<output_dir>/lncrna_ids.txt" (via extract_lncrna_only)
      2. keep the bed entries whose 4th column is one of those ids
         ("<input_bed>.filtered")
      3. convert them to an exon bed and fetch the exon sequences
      4. join the exons of every multi exon transcript and keep transcripts
         that contain no "N" and are at least 200 nts long
      5. run cons.filter_families on the resulting fasta

    Args:
        input_bed (str): path to the input bed file; also used as the prefix
            of every intermediate file
        input_file (str): path to the file passed to extract_lncrna_only
        genome_fasta (str): path to the genome fasta
        output_dir (str): directory for the id file and the blast outputs
    """
    # get a set of ids that correspond only to lincrna entries
    id_file = "{0}/lncrna_ids.txt".format(output_dir)
    extract_lncrna_only(input_file, id_file)

    # now keep only the bed entries that are in the id list
    filtered_bed = "{0}.filtered".format(input_bed)
    # read_many_fields yields one list of fields per line, so testing a name
    # against the raw rows never matched. Flatten to a set of strings.
    # NOTE(review): every field of the id file is treated as an id -- confirm
    # against the format written by extract_lncrna_only.
    ids = {
        field
        for fields in gen.read_many_fields(id_file, "\t")
        for field in fields
    }
    bed_entries = gen.read_many_fields(input_bed, "\t")
    with open(filtered_bed, "w") as outfile:
        for entry in bed_entries:
            if entry[3] in ids:
                outfile.write("{0}\n".format("\t".join(entry)))

    # now write the bed to an exon bed
    exons_bed = "{0}.exons.bed".format(input_bed)
    fo.entries_to_bed(filtered_bed, exons_bed, hg38=True)
    # now get the exon sequences
    exons_fasta = "{0}.exons.fasta".format(input_bed)
    fo.fasta_from_intervals(exons_bed,
                            exons_fasta,
                            genome_fasta,
                            force_strand=True,
                            names=True)

    # now generate the full transcript for multi exon transcripts
    transcripts_fasta = "{0}.multi_exon_transcripts.fasta".format(input_bed)
    names, seqs = gen.read_fasta(exons_fasta)
    # transcript id -> {exon number -> sequence}; the exon number is the
    # last "."-separated part of the name before any "("
    exons_by_transcript = collections.defaultdict(dict)
    for name, seq in zip(names, seqs):
        name_parts = name.split("(")[0].split(".")
        transcript_id = ".".join(name_parts[:-1])
        exons_by_transcript[transcript_id][int(name_parts[-1])] = seq
    with open(transcripts_fasta, "w") as outfile:
        for transcript_id in sorted(exons_by_transcript):
            exons = exons_by_transcript[transcript_id]
            if len(exons) < 2:
                continue
            transcript_seq = "".join(exons[number] for number in sorted(exons))
            if "N" not in transcript_seq and len(transcript_seq) >= 200:
                # convert names to : here as otherwise it will run sorting later
                outfile.write(">{0}\n{1}\n".format(
                    transcript_id.replace(".", ":"), transcript_seq))

    # blast to get paralogous families
    # (these paths used an undefined name and a malformed "{0/" field)
    blast_db_path = "{0}/bast_db".format(output_dir)
    output_blast_file = "{0}/blast_output.csv".format(output_dir)
    families_file = "{0}/families.txt".format(output_dir)
    gen.create_output_directories(blast_db_path)
    cons.filter_families(transcripts_fasta,
                         output_blast_file,
                         families_file,
                         database_path=blast_db_path,
                         clean_run=True)
Esempio n. 16
0
 def test_fasta_sequence_quality_control(self):
     # Run check_sequence_quality with every check switched on and compare
     # the surviving names and sequences with the stored expected fasta.
     input_path = './test_data/bed_ops/test_fasta_sequence_quality_control/test_fasta_sequence_quality_control.fasta'
     expected_path = './test_data/bed_ops/test_fasta_sequence_quality_control/expected_test_fasta_sequence_quality_control.fasta'
     input_records = gen.read_fasta(input_path)
     checks = {
         "check_acgt": True,
         "check_stop": True,
         "check_start": True,
         "check_length": True,
         "check_inframe_stop": True,
     }
     names, seqs = check_sequence_quality(input_records[0],
                                          input_records[1], **checks)
     observed_records = (names, seqs)
     expected_records = gen.read_fasta(expected_path)
     self.assertEqual(observed_records, expected_records)
Esempio n. 17
0
 def test_extract_cds_clean_chrom(self):
     # Run extract_cds with clean_chrom_only=True on the test GTF and genome
     # and compare the written fasta with the stored expected fasta.
     gtf_path = "./test_data/bed_ops/test_extract_cds_clean_chrom/test_extract_cds.gtf"
     genome_path = "./test_data/bed_ops/test_extract_cds_clean_chrom/test_extract_cds_genome.fa"
     bed_path = "./test_data/bed_ops/test_extract_cds_clean_chrom/test_extract_cds.bed"
     observed_path = "./test_data/bed_ops/test_extract_cds_clean_chrom/observed_test_extract_cds_fasta.fasta"
     intervals_path = "./test_data/bed_ops/test_extract_cds_clean_chrom/observed_intervals.fasta"
     expected_path = "./test_data/bed_ops/test_extract_cds_clean_chrom/expected_test_extract_cds_fasta.fasta"
     # delete both generated files before they are rewritten
     for generated in (observed_path, intervals_path):
         gen.remove_file(generated)
     expected_records = gen.read_fasta(expected_path)
     extract_cds(gtf_path, bed_path, observed_path, genome_path,
                 intervals_path, clean_chrom_only=True)
     observed_records = gen.read_fasta(observed_path)
     self.assertEqual(observed_records, expected_records)
Esempio n. 18
0
def intron_hexamer_test(input_fasta, motif_file, output_directory, output_file, required_simulations = None, families_file = None):
    """
    Generate random hexamers from introns and calculate purine content

    Random motif sets are generated into
    "<output_directory>/random_hexamers" when fewer than
    required_simulations files exist there. The purine and nucleotide
    content of the real motifs and of every file in that directory is then
    written to output_file as csv, and the directory is removed.

    Args:
        input_fasta (str): path to intron fasta
        motif_file (str): path to file containing real motifs
        output_directory (str): path to output directory
        output_file (str): path to output file
        required_simulations (int): if set, the number of simulations to run.
            When None (or 0) no new simulations are generated.
        families_file (str): if set, path to families file
    """

    hexamers_dir = "{0}/random_hexamers".format(output_directory)
    gen.create_output_directories(hexamers_dir)
    # get the motifs
    motifs = sequo.read_motifs(motif_file)
    # if there are not enough simulations, generate them; the explicit
    # truthiness test keeps the None default from reaching the comparison
    existing_simulations = len(os.listdir(hexamers_dir))
    if required_simulations and existing_simulations < required_simulations:
        required = list(range(required_simulations - existing_simulations))
        # the sequences come from input_fasta (the name used here before was
        # never defined)
        names, seqs = gen.read_fasta(input_fasta)
        # group sequences by the part of the name before the first "."
        seqs_list = collections.defaultdict(list)
        for name, seq in zip(names, seqs):
            seqs_list[name.split(".")[0]].append(seq)

        if families_file:
            seqs_list = sequo.pick_random_family_member(families_file, seqs_list)

        all_seqs = []
        for group in seqs_list:
            all_seqs.extend(seqs_list[group])
        # sequences are joined with "X" as a separator character
        full_seq = "X".join(all_seqs)
        simopc.run_simulation_function(required, [full_seq, motifs, hexamers_dir], sequo.locate_random_motifs, sim_run = False)

    # calculate the purine contents
    real_purine_content = sequo.calc_purine_content(motifs)
    real_nt_content = sequo.calc_nucleotide_content(motifs)

    test_purine_content = []
    test_nt_content = []
    for file in os.listdir(hexamers_dir):
        filepath = "{0}/{1}".format(hexamers_dir, file)
        test_motifs = sequo.read_motifs(filepath)
        test_purine_content.append(sequo.calc_purine_content(test_motifs))
        test_nt_content.append(sequo.calc_nucleotide_content(test_motifs))

    with open(output_file, "w") as outfile:
        outfile.write("id,purine_content,a_content,c_content,g_content,t_content\n")
        # nucleotide contents are written in sorted key order
        real_nt_values = [real_nt_content[nt] for nt in sorted(real_nt_content)]
        outfile.write("real,{0},{1}\n".format(real_purine_content, ",".join(gen.stringify(real_nt_values))))
        simulated = zip(test_purine_content, test_nt_content)
        for sim_number, (purine_content, nt_content) in enumerate(simulated, start=1):
            nt_values = [nt_content[nt] for nt in sorted(nt_content)]
            outfile.write("{0},{1},{2}\n".format(sim_number, purine_content, ",".join(gen.stringify(nt_values))))

    # remove the output directory
    gen.remove_directory(hexamers_dir)
Esempio n. 19
0
 def test_extract_cds_from_bed_quality_control(self):
     # Extract CDS sequences with every quality check switched on and
     # compare the written fasta with the stored expected fasta.
     bed_path = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/test_extract_cds_from_bed_quality_control_bed.bed"
     genome_path = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/test_extract_cds_from_bed_quality_control_genome.fa"
     observed_path = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/observed_test_extract_cds_from_bed_quality_control_fasta.fasta"
     intervals_path = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/observed_intervals.fasta"
     expected_path = "./test_data/bed_ops/test_extract_cds_from_bed_quality_control/expected_test_extract_cds_from_bed_quality_control_fasta.fasta"
     # delete both generated files before they are rewritten
     for generated in (observed_path, intervals_path):
         gen.remove_file(generated)
     expected_records = gen.read_fasta(expected_path)
     checks = {
         "check_acgt": True,
         "check_start": True,
         "check_length": True,
         "check_stop": True,
         "check_inframe_stop": True,
     }
     extract_cds_from_bed(bed_path, observed_path, genome_path,
                          intervals_path, **checks)
     observed_records = gen.read_fasta(observed_path)
     self.assertEqual(observed_records, expected_records)
Esempio n. 20
0
def get_coding_exons(coding_exons_fasta):
    """
    Read a fasta of coding exons into a nested mapping of
    transcript -> exon number -> sequence.

    Record names are parsed as "<transcript>.<exon number>", ignoring
    anything from the first "(" onwards.

    Args:
        coding_exons_fasta (str): path to the coding exons fasta

    Returns:
        collections.defaultdict: transcript id -> {exon number (int) -> seq}
    """
    exon_names, exon_seqs = gen.read_fasta(coding_exons_fasta)

    # the nested defaultdict type is part of what callers receive
    coding_exons = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict()))

    for position, record_name in enumerate(exon_names):
        id_fields = record_name.split('(')[0].split('.')
        transcript_id = id_fields[0]
        exon_number = int(id_fields[1])
        coding_exons[transcript_id][exon_number] = exon_seqs[position]

    return coding_exons
Esempio n. 21
0
def build_sequences(input_fasta, input_stops_fasta, output_fasta):
    """
    Build one sequence per transcript by joining its parts in ascending
    exon-number order and appending its stop codon record(s).

    Part names are parsed as "<transcript>.<exon number>", ignoring anything
    from the first "(" onwards. Stop records are matched on the text before
    the first "." of their name. Transcripts are written in sorted order.

    Args:
        input_fasta (str): path to input fasta
        input_stops_fasta (str): path to fasta containing the stop codons
        output_fasta (str): path for output fasta containing built sequences
    """

    names, seqs = gen.read_fasta(input_fasta)
    stop_names, stop_seqs = gen.read_fasta(input_stops_fasta)

    # transcript -> {exon number -> sequence}
    parts_by_transcript = {}
    for index, name in enumerate(names):
        id_fields = name.split('(')[0].split('.')
        transcript_id = id_fields[0]
        exon_number = int(id_fields[1])
        part_seq = seqs[index]
        parts_by_transcript.setdefault(transcript_id, {})[exon_number] = part_seq

    # transcript -> ordered list of sequence pieces
    pieces_by_transcript = {}
    for transcript_id, parts in parts_by_transcript.items():
        pieces_by_transcript[transcript_id] = [
            parts[exon_number] for exon_number in sorted(parts)
        ]

    # stop codons go last; a stop without any exon still gets an entry
    for index, stop_name in enumerate(stop_names):
        transcript_id = stop_name.split(".")[0]
        stop_seq = stop_seqs[index]
        pieces_by_transcript.setdefault(transcript_id, []).append(stop_seq)

    with open(output_fasta, "w") as outfile:
        for transcript_id in sorted(pieces_by_transcript):
            built_seq = "".join(pieces_by_transcript[transcript_id])
            outfile.write(">{0}\n{1}\n".format(transcript_id, built_seq))
Esempio n. 22
0
def get_exon_reading_frame(coding_exons_fasta, full_exons_fasta, output_file):
    """
    Get the reading frame an exon starts in. If the first position of the codon
    this is 0, second is 1 and last is 2.

    The frame of an exon is the cumulative length of the preceding exons of
    the same transcript (in ascending exon-number order) modulo 3. Coding
    exons that are not present in the full exon set are skipped, as in
    get_exon_flank_reading_frame, instead of being written with an empty
    placeholder value.

    Args:
        coding_exons_fasta (str): path to file containing coding exon sequences
        full_exons_fasta (str): path to file containing all exon sequences
        output_file (str): path to the output file
    """

    def parse_name(record_name):
        # "<transcript>.<exon>(...)" -> (transcript, exon number)
        fields = record_name.split(".")
        return fields[0], int(fields[1].split("(")[0])

    # read in the coding exons and all exons
    full_names, full_seqs = gen.read_fasta(full_exons_fasta)
    full_coding_names = gen.read_fasta(coding_exons_fasta)[0]

    # transcript -> {exon number -> sequence} for every exon
    full_exon_seqs = collections.defaultdict(dict)
    for name, seq in zip(full_names, full_seqs):
        transcript_id, exon_id = parse_name(name)
        full_exon_seqs[transcript_id][exon_id] = seq

    # (transcript, exon number) -> reading frame the exon starts in
    reading_frames = {}
    for transcript_id, exons in full_exon_seqs.items():
        seq_length = 0
        for exon_id in sorted(exons):
            reading_frames[(transcript_id, exon_id)] = seq_length % 3
            seq_length += len(exons[exon_id])

    # now output the reading frames for just the coding exons
    with open(output_file, "w") as outfile:
        for name in full_coding_names:
            key = parse_name(name)
            if key in reading_frames:
                outfile.write(">{0}\n{1}\n".format(name, reading_frames[key]))
Esempio n. 23
0
def get_utrs(transcript_fasta, cds_fasta, output_file):
    """
    Write, for every CDS, the part of its transcript that precedes the CDS.

    A record is written only when the CDS is found inside the transcript of
    the same name and does not start at position 0; CDSs that are absent
    from their transcript are skipped.

    Args:
        transcript_fasta (str): path to fasta of full transcript sequences
        cds_fasta (str): path to fasta of CDS sequences, named like the
            transcripts
        output_file (str): path to the output fasta

    Raises:
        KeyError: if a CDS name has no record in transcript_fasta.
    """

    cds_names, cds_seqs = gen.read_fasta(cds_fasta)
    cds_list = dict(zip(cds_names, cds_seqs))

    transcript_names, transcript_seqs = gen.read_fasta(transcript_fasta)
    transcript_list = dict(zip(transcript_names, transcript_seqs))

    with open(output_file, "w") as outfile:
        for transcript_id, cds in cds_list.items():
            transcript = transcript_list[transcript_id]
            # find() returns -1 when the CDS is absent, so no exception has
            # to be swallowed and genuine errors (e.g. a failed write) are
            # no longer hidden
            cds_start_index = transcript.find(cds)
            if cds_start_index > 0:
                outfile.write(">{0}\n{1}\n".format(
                    transcript_id, transcript[:cds_start_index]))
Esempio n. 24
0
def filter_coding_sequences(input_fasta, output_fasta):
    """
    Quality filter coding sequences

    A sequence is kept only when it starts with ATG, ends with a stop codon
    (TAA, TAG or TGA), has a length that is a multiple of 3, contains only
    the characters A, C, T and G, and has no in frame stop codon between the
    first and last codon. The number of sequences before and after
    filtering is printed.

    Args:
        input_fasta (str): path to fasta file containing input CDS sequences
        output_fasta (str): path to output file to put sequences that pass filtering
    """

    # compile regex searches
    non_actg_pattern = re.compile("[^ACTG]")
    codon_pattern = re.compile(".{3}")

    stop_codons = ["TAA", "TAG", "TGA"]

    def passes_filters(seq):
        # first codon must be ATG
        if seq[:3] != "ATG":
            return False
        # last codon must be a stop codon
        if seq[-3:] not in stop_codons:
            return False
        # length must be a multiple of 3
        if len(seq) % 3 != 0:
            return False
        # no characters other than A, C, T, G
        if non_actg_pattern.search(seq) is not None:
            return False
        # no stop codon among the in frame codons between start and stop
        inner_codons = codon_pattern.findall(seq[3:-3])
        return not any(codon in stop_codons for codon in inner_codons)

    # read the sequences
    names, seqs = gen.read_fasta(input_fasta)

    print("{0} sequences prior to filtering...".format(len(seqs)))

    kept = 0
    with open(output_fasta, "w") as outfile:
        for index, name in enumerate(names):
            seq = seqs[index]
            if not passes_filters(seq):
                continue
            outfile.write(">{0}\n{1}\n".format(name, seq))
            kept += 1

    print("{0} sequences after filtering...".format(kept))
Esempio n. 25
0
def extract_cds_from_bed(bed_file, output_fasta, genome_fasta, fasta_interval_file, check_acgt=None, check_start=None, check_length=None, check_stop=None, check_inframe_stop=None, all_checks=None, uniquify = False):
    '''
    Extract the CDS of every sample in a bed file to a fasta file.

    The bed intervals are first written to fasta_interval_file (named after
    the bed 'name' field), then the parts of each sample are joined in
    ascending part-number order with any stop codon part placed last.
    Optional quality checks and a one-transcript-per-gene filter are applied
    before the result is written to output_fasta.
    Ex.: extract_cds_from_bed('../feature_file.bed', '../output_file_fasta.fasta', '../source_data/genome_fasta_file.fa', '../intervals.fasta')
    '''
    stop_codons = ['TAA', 'TAG', 'TGA']
    # sorts after every real part number, so the stop codon is appended last
    stop_part_number = 9999999

    # create fasta file with extracted parts and read it back
    fasta_from_intervals(bed_file, fasta_interval_file, genome_fasta, names = True)
    part_names, part_seqs = gen.read_fasta(fasta_interval_file)

    # sample -> {part number -> sequence}
    parts_by_sample = {}
    for index, part_name in enumerate(part_names):
        part_seq = part_seqs[index]
        # a part whose whole sequence is a stop codon is labelled as one
        if part_seq in stop_codons:
            part_name = part_name + '.stop_codon'
        name_fields = part_name.split('.')
        if name_fields[-1] == "stop_codon":
            part_number = stop_part_number
        else:
            part_number = int(name_fields[1])
        parts_by_sample.setdefault(name_fields[0], {})[part_number] = part_seq

    # concatenate the parts of each sample, samples in sorted order
    names = sorted(parts_by_sample)
    seqs = []
    for sample in names:
        sample_parts = parts_by_sample[sample]
        seqs.append("".join(sample_parts[number] for number in sorted(sample_parts)))

    # perform sequence quality control checks
    requested_checks = (check_acgt, check_stop, check_start, check_length, check_inframe_stop, all_checks)
    if any(requested_checks):
        names, seqs = check_sequence_quality(names, seqs, check_acgt, check_stop, check_start, check_length, check_inframe_stop, all_checks)

    if uniquify:
        # leave only one transcript per gene
        gene_to_trans = link_genes_and_transcripts(bed_file)
        names, seqs = uniquify_trans(names, seqs, gene_to_trans)
        print("After leaving only one transcript per gene, {0} sequences remain.".format(len(seqs)))

    # write to output fasta file
    gen.write_to_fasta(names, seqs, output_fasta)
Esempio n. 26
0
def filter_bed_from_fasta(bed, fasta, out_bed, families_file = None):
    '''
    Filter a bed file down to the records that match an entry in a fasta file.

    A record is kept when the leading run of word characters in its 'name'
    field (column 4) is among the retained fasta names. Records with fewer
    than four fields are dropped. The result is written to out_bed.

    If a families_file is given, only one member per family is retained:
    the one with the longest sequence in the fasta (the first such member
    on ties). Fasta names that belong to no family are always retained.

    Args:
        bed: path to the input bed file (tab-separated).
        fasta: path to the fasta file whose entry names act as the filter.
        out_bed: path to the output bed file. If it already exists, the
            output is first written to a temporary sibling file which then
            replaces it.
        families_file: optional path passed to gen.read_families
            (presumably yields a list of lists of names -- confirm against
            the gen module).
    '''
    # When the output already exists, write to a temporary sibling file and
    # swap it in at the end rather than writing to out_bed directly.
    output_exists = Path(out_bed).exists()
    if output_exists:
        out_root, out_ext = os.path.splitext(out_bed)
        temp_file_name = "{0}.{1}{2}".format(out_root, random.random(), out_ext)
    else:
        temp_file_name = out_bed

    fasta_names, fasta_seqs = gen.read_fasta(fasta)
    # set for O(1) membership tests in the loops below
    retained_names = set(fasta_names)

    #read in family information and pick one transcript per family
    if families_file:
        # sequence length per name; setdefault keeps the first occurrence,
        # matching what list.index() would have selected
        seq_lengths = {}
        for name, seq in zip(fasta_names, fasta_seqs):
            seq_lengths.setdefault(name, len(seq))
        families = gen.read_families(families_file)
        #make sure the families don't contain transcripts that are not in the fasta
        families = [[member for member in family if member in retained_names]
                    for family in families]
        family_members = set(gen.flatten(families))
        #first fish out singletons (names that belong to no family)
        retained_names = {name for name in fasta_names if name not in family_members}
        #then keep the longest member of every non-empty family
        for family in families:
            if family:
                # max() returns the first maximal element, i.e. the earliest
                # family member on ties
                retained_names.add(max(family, key=seq_lengths.__getitem__))

    bed_data = gen.read_many_fields(bed, "\t")
    #drop records that have no name field (fewer than 4 columns), e.g. empty lines
    bed_data = [record for record in bed_data if len(record) > 3]
    # ^(\w+) captures the leading run of letters/digits/underscores of the
    # name field, i.e. everything before the first '.', '(' or other
    # non-word character; the trailing .* is irrelevant to the capture.
    id_regex = re.compile(r"^(\w+).*")
    with open(temp_file_name, "w") as file:
        for record in bed_data:
            idn = id_regex.search(record[3])
            #filter bed data
            if idn and idn.group(1) in retained_names:
                file.write("\t".join(record))
                file.write("\n")
    #remove old file, replace with new
    if output_exists:
        os.remove(out_bed)
        shutil.move(temp_file_name, out_bed)
Esempio n. 27
0
def get_seq_list(fasta_file, with_chr=False, full_seq_list=None):
    """
    Group the sequences of a fasta file by entry id.

    The id of an entry is the part of its name before the first "(".

    Args:
        fasta_file: path to the fasta file, read with gen.read_fasta.
        with_chr: if truthy, the last "."-delimited field of the id is
            dropped (presumably a chromosome suffix -- confirm with callers).
        full_seq_list: if truthy, only ids contained in it are kept;
            otherwise every entry is kept.

    Returns:
        dict mapping each id to the list of its sequences, in file order.
    """
    names, seqs = gen.read_fasta(fasta_file)
    grouped = collections.defaultdict(list)
    for position, name in enumerate(names):
        seq_id = name.split("(")[0]
        if with_chr:
            seq_id = ".".join(seq_id.split(".")[:-1])
        # skip ids outside the requested subset, when a subset was given
        if full_seq_list and seq_id not in full_seq_list:
            continue
        grouped[seq_id].append(seqs[position])
    # hand back a plain dict so missing keys raise instead of being created
    return dict(grouped)
Esempio n. 28
0
def filter_seq_lengths(input_file, output_file, length):
    """
    Copy the entries of a fasta file whose sequence is at least `length` long.

    Args:
        input_file (str): path to input fasta
        output_file (str): path to output fasta
        length (int): minimum sequence length to retain (inclusive)
    """

    entry_names, entry_seqs = gen.read_fasta(input_file)
    # lazily format the surviving entries; consumed while the file is open
    kept_entries = (
        ">{0}\n{1}\n".format(entry_name, entry_seqs[idx])
        for idx, entry_name in enumerate(entry_names)
        if len(entry_seqs[idx]) >= length
    )
    with open(output_file, "w") as outfile:
        outfile.writelines(kept_entries)
Esempio n. 29
0
 def test_extract_cds_quality_control(self):
     # Runs extract_cds with every individual quality-control check switched
     # on and compares the fasta it writes against the stored expectation.
     gtf_file = "./test_data/bed_ops/test_extract_cds_quality_control/test_extract_cds_quality_control.gtf"
     genome_file = "./test_data/bed_ops/test_extract_cds_quality_control/test_extract_cds_quality_control_genome.fa"
     bed_output = "./test_data/bed_ops/test_extract_cds_quality_control/observed_test_extract_cds_quality_control_cds.bed"
     observed_fasta = "./test_data/bed_ops/test_extract_cds_quality_control/observed_test_extract_cds_quality_control_fasta.fasta"
     intervals = "./test_data/bed_ops/test_extract_cds_quality_control/observed_intervals.fasta"
     # clear outputs left over from a previous run
     for stale_output in (observed_fasta, intervals, bed_output):
         gen.remove_file(stale_output)
     expected_entries = gen.read_fasta(
         "./test_data/bed_ops/test_extract_cds_quality_control/expected_test_extract_cds_quality_control_fasta.fasta"
     )
     extract_cds(
         gtf_file,
         bed_output,
         observed_fasta,
         genome_file,
         intervals,
         check_acgt=True,
         check_start=True,
         check_length=True,
         check_stop=True,
         check_inframe_stop=True,
     )
     observed_entries = gen.read_fasta(observed_fasta)
     self.assertEqual(observed_entries, expected_entries)
Esempio n. 30
0
def get_fasta_exon_intervals(intervals_fasta):
    '''
    Read a fasta of exon intervals into a nested mapping:
    transcript id -> exon id -> sequence.

    The transcript id is the part of the entry name before the first '.';
    the exon id is the integer in the second '.'-delimited field of the
    name (taken before any '('). An entry whose sequence is exactly a stop
    codon (TAA, TAG or TGA) is stored under exon id 99999 instead.
    '''
    stop_codons = ("TAA", "TAG", "TGA")
    names, seqs = gen.read_fasta(intervals_fasta)
    intervals = collections.defaultdict(collections.defaultdict)
    for position, name in enumerate(names):
        transcript_id = name.split('.')[0]
        # parsed for every entry, stop codons included
        exon_number = int(name.split('(')[0].split('.')[1])
        seq = seqs[position]
        # stop codons get a large fixed key rather than their own exon number
        exon_key = 99999 if seq in stop_codons else exon_number
        intervals[transcript_id][exon_key] = seq
    return intervals