Ejemplo n.º 1
0
def get_donor_acceptor_sequences(genome_fasta_file,
                                 scaffold_gff3_file,
                                 window_size=10):
    """
    Collect the donor/acceptor sequence pair of every intron in a genome.

    genome_fasta_file and scaffold_gff3_file are passed to
    parse_fasta.get_all_sequences() and parse_gff3.parse_gff3() respectively
    and are expected to describe the same genome.

    Returns a list of (donor_seq, acceptor_seq) tuples. A junction is left
    out whenever slice_window() gives back a falsy value for either side.

    NOTE(review): window_size is accepted but never used in this function;
    slice_window() is called without it. Confirm whether it should be
    forwarded.
    """
    genome_fasta = parse_fasta.get_all_sequences(genome_fasta_file, 'fasta')
    scaffold_gff3 = parse_gff3.parse_gff3(scaffold_gff3_file, 'exon')

    # junctions['scaffold_name'] = sorted, de-duplicated list of
    # (donor_location, acceptor_location) tuples
    junctions = {}
    for scaf in scaffold_gff3:
        unique_pairs = set()
        genes = scaffold_gff3[scaf]
        for gene in genes:
            transcripts = genes[gene].mRNAs
            for tx in transcripts:
                exons = transcripts[tx].details['exon']

                # a first exon whose start exceeds its end marks a
                # reverse-complemented transcript
                is_reversed = exons[0][0] > exons[0][1]

                starts = [start for start, end in exons]
                ends = [end for start, end in exons]
                if is_reversed:
                    # reversed transcripts are recorded with negated locations
                    starts = [-loc for loc in starts]
                    ends = [-loc for loc in ends]

                # an intron runs from the end of one exon (donor) to the
                # start of the following exon (acceptor)
                unique_pairs.update(zip(ends[:-1], starts[1:]))

        junctions[scaf] = sorted(unique_pairs)

    # turn every location pair into a pair of sequences
    donor_acceptor_sequences = []
    for scaf in scaffold_gff3:
        for donor_loc, acceptor_loc in junctions[scaf]:
            donor_seq = slice_window(genome_fasta[scaf], donor_loc)
            acceptor_seq = slice_window(genome_fasta[scaf], acceptor_loc)
            if not (donor_seq and acceptor_seq):
                continue
            donor_acceptor_sequences.append((donor_seq, acceptor_seq))

    return donor_acceptor_sequences
Ejemplo n.º 2
0
parser = argparse.ArgumentParser(description="""
CDS files do not contain intronic regions - this script reads the gff3 file
containing start and end coordinates for genes, and extracts the genic 
(i.e. exonic + intronic) sequences for each gene.""")

# both positional arguments are opened for reading by argparse itself
parser.add_argument('genome_fasta',
                    metavar="fasta_file",
                    type=argparse.FileType('r'),
                    help="FASTA file of the genome.")
parser.add_argument('scaffold_gff3',
                    metavar="gff3_file",
                    type=argparse.FileType('r'),
                    help="corresponding gff3 file of the genome.")

args = parser.parse_args()

# load the genome sequences and the gene-level annotation into memory
genome_fasta = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta')
scaffold_gff3 = parse_gff3.parse_gff3(args.scaffold_gff3, 'gene')

# print one FASTA record per annotated gene, covering its whole genic span
for scaf in scaffold_gff3:
    for gene in scaffold_gff3[scaf]:
        coords = scaffold_gff3[scaf][gene].coords
        span_start, span_end = min(coords), max(coords)
        genic_seq = genome_fasta[scaf][span_start:span_end]

        # start > end marks a gene on the Crick strand, which is reported
        # as its reverse complement
        if coords[0] > coords[1]:
            genic_seq = reverse_complement(genic_seq)

        print('>' + gene)
        print(genic_seq)
Ejemplo n.º 3
0
gene.""")

# positional arguments: the genome FASTA file and its matching gff3
# annotation; argparse opens both for reading
parser.add_argument('genome_fasta',
                    metavar="fasta_file",
                    type=argparse.FileType('r'),
                    help="FASTA file of the genome.")
parser.add_argument('scaffold_gff3',
                    metavar="gff3_file",
                    type=argparse.FileType('r'),
                    help="corresponding gff3 file of the genome.")

args = parser.parse_args()

# read genome sequences and exon-level annotation into memory
genome_fasta = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta')
scaffold_gff3 = parse_gff3.parse_gff3(args.scaffold_gff3, 'exon')

# pick longest transcript in the gff3 file
scaffold_gff3 = parse_gff3.pick_longest_mRNA(scaffold_gff3)

# walk over every gene of every scaffold in the parsed gff3 annotation
for scaf in scaffold_gff3:
    for gene in scaffold_gff3[scaf]:
        gene_coords = scaffold_gff3[scaf][gene].coords
        # start > end is taken to mean the gene lies on the Crick strand
        gene_on_crick = gene_coords[0] > gene_coords[1]

        # use the first transcript key of the gene (presumably the only one
        # left after pick_longest_mRNA -- confirm) and fetch its exon coords
        tx = list(scaffold_gff3[scaf][gene].mRNAs.keys())[0]
        mrna_coords = scaffold_gff3[scaf][gene].mRNAs[tx].details['exon']

        # accumulator for this gene's exon sequence
        exon_seq = ''
        for i in mrna_coords:
Ejemplo n.º 4
0
#!/usr/bin/env python
from parse_gff3 import parse_gff3
import fasta
import sys

# Exactly two arguments are expected: the input GFF file and the path of the
# FASTA file to write.
if len(sys.argv) != 3:
    # Interpolate the script name into the %s placeholder (otherwise a
    # literal "%s" is shown) and exit non-zero, since a wrong invocation is
    # an error.
    print("Usage: %s gff_file output_fasta_file" % sys.argv[0])
    sys.exit(1)

# Build one FASTA record per parsed GFF entry: the header is the entry name
# followed by the reference sequence of its first element, and the body is
# the entry's protein sequence.
g = parse_gff3(sys.argv[1])
f = fasta.Fasta()
for x in g:
    f.add_seq(fasta.Sequence(x.name + ' ' + x[0].reference_sequence, x.protein_seq))
f.save_to(sys.argv[2])
Ejemplo n.º 5
0
#!/usr/bin/env python
from parse_gff3 import parse_gff3
import fasta
import sys

# Exactly two arguments are expected: the input GFF file and the path of the
# FASTA file to write.
if len(sys.argv) != 3:
    # Interpolate the script name into the %s placeholder (otherwise a
    # literal "%s" is shown) and exit non-zero, since a wrong invocation is
    # an error.
    print("Usage: %s gff_file output_fasta_file" % sys.argv[0])
    sys.exit(1)

# Build one FASTA record per parsed GFF entry: the header is the entry name
# followed by the reference sequence of its first element, and the body is
# the entry's protein sequence.
g = parse_gff3(sys.argv[1])
f = fasta.Fasta()
for x in g:
    f.add_seq(
        fasta.Sequence(x.name + ' ' + x[0].reference_sequence, x.protein_seq))
f.save_to(sys.argv[2])
Ejemplo n.º 6
0
def generate_relative_locations(n):
    '''
    Return the midpoints of n equal-width slices of the range {0..1}.

    Slice number k (counting from 0) has its midpoint at (k + 0.5) / n, so
    n == 5 gives [0.1, 0.3, 0.5, 0.7, 0.9].
    '''
    midpoints = []
    for k in range(n):
        midpoints.append((k + 0.5) / n)
    return midpoints


# load the sequences from the genome FASTA file
genome_sequences = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta')

# parse gene and exon coordinates from the .gff3 file, then pick the longest
# mRNA
scaffold_gff3 = parse_gff3.parse_gff3(args.genome_gff3, 'exon')
scaffold_gff3 = parse_gff3.pick_longest_mRNA(scaffold_gff3)

# reverse lookup, gene name -> scaffold name, so that gene coords can be
# retrieved when only the gene name is known
gene_to_scaffold = {}
for s in scaffold_gff3:
    for g in scaffold_gff3[s]:
        gene_to_scaffold[g] = s

# print header row for results
print('Gene',
      'Intron relative location',
      'Scaffold',
      'Desired region',
      'Outer region',
      'Outer amplicon length',
        print "Specify either gff2 or gff3 but not both."
        sys.exit(1)

    # The FASTA database option is mandatory; stop with an error without it.
    if o.fas is None:
        print "Specify the fasta database file."
        sys.exit(1)

    # Write to stdout unless an output path was given, in which case the
    # path is replaced by a file object opened for writing.
    if o.output is None:
        o.output = sys.stdout
    else:
        o.output = file(o.output, "w")

    # Read the FASTA database given by o.fas.
    fas = fasta.Fasta()
    fas.read_from(o.fas)

    # Parse the annotation with the parser matching the supplied option;
    # gff2 is used whenever o.gff2 is set.
    if o.gff2:
        gff = parse_gff2.parse_gff2(o.gff2)
    else:
        gff = parse_gff3.parse_gff3(o.gff3)

    # Build one fasta.Sequence per GFF entry. The header is
    # "<reference_sequence> <start> <stop> <strand>" and the body is the
    # [start - 1:stop] slice of the referenced sequence (presumably turning
    # 1-based inclusive GFF coordinates into a Python slice -- confirm).
    # g.start and g.stop are concatenated as strings and converted with
    # int() only for slicing.
    try:
        l = [fasta.Sequence(
                g.reference_sequence + ' ' + g.start + ' ' + g.stop
                + ' ' + g.strand,
                fas[g.reference_sequence].sequence[int(g.start)
                                                   - 1:int(g.stop)])
         for g in gff]
        print >> o.output, '\n'.join(imap(str, l))
    except KeyError, ke:
        # Reported as a reference sequence missing from the FASTA database.
        print "Sequence was not found in fasta file :", str(ke)
        sys.exit(1)

    # The FASTA database option is mandatory; stop with an error without it.
    if o.fas is None:
        print "Specify the fasta database file."
        sys.exit(1)

    # Write to stdout unless an output path was given, in which case the
    # path is replaced by a file object opened for writing.
    if o.output is None:
        o.output = sys.stdout
    else:
        o.output = file(o.output, "w")

    # Read the FASTA database given by o.fas.
    fas = fasta.Fasta()
    fas.read_from(o.fas)

    # Parse the annotation with the parser matching the supplied option;
    # gff2 is used whenever o.gff2 is set.
    if o.gff2:
        gff = parse_gff2.parse_gff2(o.gff2)
    else:
        gff = parse_gff3.parse_gff3(o.gff3)

    # Build one fasta.Sequence per GFF entry. The header is
    # "<reference_sequence> <start> <stop> <strand>" and the body is the
    # [start - 1:stop] slice of the referenced sequence (presumably turning
    # 1-based inclusive GFF coordinates into a Python slice -- confirm).
    # g.start and g.stop are concatenated as strings and converted with
    # int() only for slicing.
    try:
        l = [
            fasta.Sequence(
                g.reference_sequence + ' ' + g.start + ' ' + g.stop + ' ' +
                g.strand, fas[g.reference_sequence].sequence[int(g.start) -
                                                             1:int(g.stop)])
            for g in gff
        ]
        print >> o.output, '\n'.join(imap(str, l))
    except KeyError, ke:
        # Reported as a reference sequence missing from the FASTA database.
        print "Sequence was not found in fasta file :", str(ke)