コード例 #1
0
def get_donor_acceptor_sequences(genome_fasta_file,
                                 scaffold_gff3_file,
                                 window_size=10):
    """
    Collect the sequences around both ends of every intron described by a
    FASTA file and a GFF3 file of the same genome.

    The exon coordinates of each transcript are turned into (donor, acceptor)
    location pairs: the donor is the second coordinate of an exon, the
    acceptor is the first coordinate of the exon that follows it. Pairs are
    de-duplicated and sorted per scaffold, then each location is handed to
    slice_window() together with the scaffold sequence.

    Returns a list of (donor_seq, acceptor_seq) tuples. A pair is left out
    when slice_window() returns a falsy value for either location.

    NOTE(review): window_size is accepted but never referenced in this
    function - confirm whether it was meant to be forwarded to slice_window().
    """
    genome_fasta = parse_fasta.get_all_sequences(genome_fasta_file, 'fasta')
    scaffold_gff3 = parse_gff3.parse_gff3(scaffold_gff3_file, 'exon')

    # step 1: gather the unique, sorted (donor, acceptor) location pairs
    #   junctions_by_scaffold['scaffold_name'] = [(donor, acceptor), ...]
    junctions_by_scaffold = {}
    for scaf in scaffold_gff3:
        pairs = []
        for gene in scaffold_gff3[scaf]:
            transcripts = scaffold_gff3[scaf][gene].mRNAs
            for tx in transcripts:
                exons = transcripts[tx].details['exon']

                # the first exon decides the orientation: a first coordinate
                # larger than the second one marks a reverse-complemented
                # transcript, whose locations are stored negated
                if exons[0][0] > exons[0][1]:
                    exon_ends = [-end for _start, end in exons]
                    exon_starts = [-start for start, _end in exons]
                else:
                    exon_ends = [end for _start, end in exons]
                    exon_starts = [start for start, _end in exons]

                # intron i sits between the end of exon i and the start of
                # exon i + 1
                pairs.extend(zip(exon_ends[:-1], exon_starts[1:]))

        junctions_by_scaffold[scaf] = sorted(set(pairs))

    # step 2: translate the locations into sequences
    donor_acceptor_sequences = []
    for scaf in scaffold_gff3:
        for donor_loc, acceptor_loc in junctions_by_scaffold[scaf]:
            donor_seq = slice_window(genome_fasta[scaf], donor_loc)
            acceptor_seq = slice_window(genome_fasta[scaf], acceptor_loc)
            if not (donor_seq and acceptor_seq):
                continue
            donor_acceptor_sequences.append((donor_seq, acceptor_seq))

    return donor_acceptor_sequences
コード例 #2
0
# command-line interface: two positional input files, both opened for reading
parser = argparse.ArgumentParser(description="""
CDS files do not contain intronic regions - this script reads the gff3 file
containing start and end coordinates for genes, and extracts the genic 
(i.e. exonic + intronic) sequences for each gene.""")

parser.add_argument(
    'genome_fasta', metavar="fasta_file", type=argparse.FileType('r'),
    help="FASTA file of the genome.")
parser.add_argument(
    'scaffold_gff3', metavar="gff3_file", type=argparse.FileType('r'),
    help="corresponding gff3 file of the genome.")

args = parser.parse_args()

# load the genome sequences and the gene-level annotation into memory
genome_fasta = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta')
scaffold_gff3 = parse_gff3.parse_gff3(args.scaffold_gff3, 'gene')

# print one FASTA-style record (">name" line, then sequence line) per gene
for scaf in scaffold_gff3:
    for gene in scaffold_gff3[scaf]:
        gene_coords = scaffold_gff3[scaf][gene].coords

        # a first coordinate larger than the second marks a gene whose slice
        # has to be reverse-complemented before printing
        on_crick = gene_coords[0] > gene_coords[1]

        # slice between the smaller and the larger coordinate, whatever the
        # order in which they were stored
        lower, upper = min(gene_coords), max(gene_coords)
        window = genome_fasta[scaf][lower:upper]
        genic_seq = reverse_complement(window) if on_crick else window

        print('>' + gene)
        print(genic_seq)
コード例 #3
0
gene.""")

# two positional arguments; argparse opens both files for reading
parser.add_argument(
    'genome_fasta', metavar="fasta_file", type=argparse.FileType('r'),
    help="FASTA file of the genome.")
parser.add_argument(
    'scaffold_gff3', metavar="gff3_file", type=argparse.FileType('r'),
    help="corresponding gff3 file of the genome.")

args = parser.parse_args()

# load the genome sequences into memory
genome_fasta = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta')

# load the exon-level annotation and pass it straight through
# pick_longest_mRNA(), so only the longest transcript is used downstream
scaffold_gff3 = parse_gff3.pick_longest_mRNA(
    parse_gff3.parse_gff3(args.scaffold_gff3, 'exon'))

# read the positions from the cov file
for scaf in scaffold_gff3:
    for gene in scaffold_gff3[scaf]:
        gene_coords = scaffold_gff3[scaf][gene].coords
        gene_on_crick = gene_coords[0] > gene_coords[1]

        tx = list(scaffold_gff3[scaf][gene].mRNAs.keys())[0]
        mrna_coords = scaffold_gff3[scaf][gene].mRNAs[tx].details['exon']

        exon_seq = ''
        for i in mrna_coords:
コード例 #4
0
#!/usr/bin/env python
from parse_gff3 import parse_gff3
import fasta
import sys

# Exactly two command-line arguments are required: the GFF file to read and
# the FASTA file to write.
if len(sys.argv) != 3:
    # Fill the "%s" placeholder with the name the script was invoked as;
    # without the interpolation a literal "%s" would be shown to the user.
    # The single-argument print(...) form is valid on Python 2 and Python 3.
    print("Usage: %s gff_file output_fasta_file" % sys.argv[0])
    # NOTE(review): exit status 0 is kept as-is for backward compatibility,
    # although a usage error would conventionally exit non-zero.
    sys.exit(0)

# Parse the GFF file and write one FASTA record per parsed entry.
g = parse_gff3(sys.argv[1])
f = fasta.Fasta()
for x in g:
    # record title: x.name, a space, then x[0].reference_sequence;
    # record body: x.protein_seq
    f.add_seq(fasta.Sequence(x.name + ' ' + x[0].reference_sequence, x.protein_seq))
f.save_to(sys.argv[2])
コード例 #5
0
#!/usr/bin/env python
from parse_gff3 import parse_gff3
import fasta
import sys

# Exactly two command-line arguments are required: the GFF file to read and
# the FASTA file to write.
if len(sys.argv) != 3:
    # Fill the "%s" placeholder with the name the script was invoked as;
    # without the interpolation a literal "%s" would be shown to the user.
    # The single-argument print(...) form is valid on Python 2 and Python 3.
    print("Usage: %s gff_file output_fasta_file" % sys.argv[0])
    # NOTE(review): exit status 0 is kept as-is for backward compatibility,
    # although a usage error would conventionally exit non-zero.
    sys.exit(0)

# Parse the GFF file and write one FASTA record per parsed entry.
g = parse_gff3(sys.argv[1])
f = fasta.Fasta()
for x in g:
    # record title: x.name, a space, then x[0].reference_sequence;
    # record body: x.protein_seq
    f.add_seq(
        fasta.Sequence(x.name + ' ' + x[0].reference_sequence, x.protein_seq))
f.save_to(sys.argv[2])
コード例 #6
0
def generate_relative_locations(n):
    '''
    Cut the range {0..1} into n equally wide sub-ranges and return the
    midpoint of each one, in ascending order.

    e.g. n == 5 gives [0.1, 0.3, 0.5, 0.7, 0.9]; n == 0 gives [].
    '''
    midpoints = []
    for division in range(n):
        # the midpoint of a division lies half a division past its left edge
        midpoints.append((division + 0.5) / n)

    return midpoints


# load the genome sequences
genome_sequences = parse_fasta.get_all_sequences(args.genome_fasta, 'fasta')

# load gene and exon coordinates from the .gff3 file, then narrow the
# annotation down with pick_longest_mRNA()
scaffold_gff3 = parse_gff3.parse_gff3(args.genome_gff3, 'exon')
scaffold_gff3 = parse_gff3.pick_longest_mRNA(scaffold_gff3)

# reverse lookup from gene name to the scaffold that carries it, so that gene
# coords can be fetched when only the gene name is known
gene_to_scaffold = {g: s for s in scaffold_gff3 for g in scaffold_gff3[s]}

# print header row for results
print('Gene',
      'Intron relative location',
      'Scaffold',
      'Desired region',
      'Outer region',
      'Outer amplicon length',
コード例 #7
0
        print "Specify either gff2 or gff3 but not both."
        sys.exit(1)

    if o.fas is None:
        print "Specify the fasta database file."
        sys.exit(1)

    if o.output is None:
        o.output = sys.stdout
    else:
        o.output = file(o.output, "w")

    fas = fasta.Fasta()
    fas.read_from(o.fas)

    if o.gff2:
        gff = parse_gff2.parse_gff2(o.gff2)
    else:
        gff = parse_gff3.parse_gff3(o.gff3)

    try:
        l = [fasta.Sequence(
                g.reference_sequence + ' ' + g.start + ' ' + g.stop
                + ' ' + g.strand,
                fas[g.reference_sequence].sequence[int(g.start)
                                                   - 1:int(g.stop)])
         for g in gff]
        print >> o.output, '\n'.join(imap(str, l))
    except KeyError, ke:
        print "Sequence was not found in fasta file :", str(ke)
コード例 #8
0
        sys.exit(1)

    if o.fas is None:
        print "Specify the fasta database file."
        sys.exit(1)

    if o.output is None:
        o.output = sys.stdout
    else:
        o.output = file(o.output, "w")

    fas = fasta.Fasta()
    fas.read_from(o.fas)

    if o.gff2:
        gff = parse_gff2.parse_gff2(o.gff2)
    else:
        gff = parse_gff3.parse_gff3(o.gff3)

    try:
        l = [
            fasta.Sequence(
                g.reference_sequence + ' ' + g.start + ' ' + g.stop + ' ' +
                g.strand, fas[g.reference_sequence].sequence[int(g.start) -
                                                             1:int(g.stop)])
            for g in gff
        ]
        print >> o.output, '\n'.join(imap(str, l))
    except KeyError, ke:
        print "Sequence was not found in fasta file :", str(ke)