Example 1
def main(fasta, prefix):
    # load data
    prots = {}
    with open(fasta, 'r') as infile:
        for rec in fasta_iter(infile):
            prot = Protein(rec)
            prots[prot.name] = prot

    # look for subsumed sequences
    subsumed = set()
    cnt = 0
    for key, value in prots.items():
        cnt += 1
        if cnt % 100 == 0:
            print(f'{cnt} proteins checked')
        qry = value.sqn
        for other, other_prot in prots.items():
            # skip comparing a sequence against itself
            if key == other:
                continue
            subj = other_prot.sqn
            if qry in subj:
                subsumed.add(key)
                break

    # write out non-subsumed sequences
    print(f'Number of input sequences is {cnt}')
    print(f'Number of sequences subsumed within another is {len(subsumed)}')
    outname = '-'.join([prefix, 'nonSubsumed.fa'])
    with open(outname, 'w') as outfile:
        for key, value in prots.items():
            if key not in subsumed:
                print(value.print_prot(), file=outfile)
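All of these examples lean on a fasta_iter helper, and Example 1 additionally on a Protein wrapper, neither of which is shown. Below is a minimal sketch of what they are assumed to look like, inferred purely from how they are used above; the real fastatools implementations may differ.

# Assumed helpers (sketch, not the original code): fasta_iter yields
# (header, sequence) tuples; Protein exposes .name, .sqn and print_prot().

def fasta_iter(handle):
    """Yield (header, sequence) tuples from an open FASTA file handle."""
    header, chunks = None, []
    for line in handle:
        line = line.strip()
        if line.startswith('>'):
            if header is not None:
                yield header, ''.join(chunks)
            # header truncated at the first whitespace (assumption)
            header, chunks = line[1:].split()[0], []
        elif line:
            chunks.append(line)
    if header is not None:
        yield header, ''.join(chunks)


class Protein:
    """Thin wrapper around a (header, sequence) record."""
    def __init__(self, rec):
        self.name, self.sqn = rec

    def print_prot(self):
        return f'>{self.name}\n{self.sqn}'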
Example 2
def main():
    """distribute-gff.py
    
    Extract annotations on the given scaffolds

    usage: distribute-gff.py annotations.gff sequences.fa
    output: sequences.gff

    """

    gff = sys.argv[1]
    fasta = sys.argv[2]

    seqs = set()

    with open(fasta, "r") as fastafile:
        for rec in fasta_iter(fastafile):
            seqs.add(rec[0])

    # strip the .fa/.fasta extension to get the base name of the FASTA file
    fastaname = re.sub(r"\.(fasta|fa)$", "", fasta)

    with open(fastaname + ".gff", "w") as outfile, open(gff, "r") as file_object:
        for line in file_object:
            # pass over comment/header lines
            if line[0] == "#":
                continue

            feature = GFF(line)

            # keep only features on scaffolds present in the FASTA file
            if feature.seqid in seqs:
                outfile.write(line)
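The GFF class used here (and again in Example 5 as gfftools.GFF) is not shown either. A minimal sketch, assuming a standard nine-column GFF3 line; the attribute names are inferred from how the object is used above.

# Assumed GFF line parser (sketch based on usage, not the original gfftools code)

class GFF:
    """Parse one tab-separated GFF3 feature line."""
    def __init__(self, line):
        fields = line.rstrip('\n').split('\t')
        self.seqid = fields[0]
        self.source = fields[1]
        self.type = fields[2]
        self.start = fields[3]
        self.end = fields[4]
        self.score = fields[5]
        self.strand = fields[6]
        self.phase = fields[7]
        self.attributes = fields[8]
        # Parent= may list several parents separated by commas
        self.parent = []
        for attr in self.attributes.split(';'):
            if attr.startswith('Parent='):
                self.parent = attr[len('Parent='):].split(',')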
Example 3
def main():
    # read in gene locus_tags
    tags = 'version3-gene-locus-tags.tsv'
    genes = {}
    with open(tags, 'r') as infile:
        for line in infile:
            gid, loc = line.strip().split('\t')
            genes[gid] = Gene(gid)
            genes[gid].add_tag(loc)
    # read in sequence IDs
    transcripts = '/projects/bullfrog_assembly/genome/ARCS/annotation/fresh-Oct2018/shared-with-uvic/compare-proteins/v3-proteins.fa'
    with open(transcripts, 'r') as infile:
        for sid, sqn in fasta_iter(infile):
            gid = sid.split('-mRNA')[0]
            try:
                genes[gid].add_mrna(sid)
            except KeyError:
                print(f'{sid} not in gene table!')

    # write out all tags
    outname = 'version3-transcript-locus-tags.tsv'
    with open(outname, 'w') as outfile:
        for this_gene in genes.values():
            for this_mrna, this_tag in this_gene.get_tags():
                outfile.write(f'{this_mrna}\t{this_tag}\n')
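The Gene class is another assumed helper. This is a minimal sketch based only on how it is called above; in particular, pairing every mRNA with the gene's single locus tag is an assumption, and the real class may derive per-transcript tags differently.

# Assumed Gene container (sketch): one locus tag per gene, many mRNAs

class Gene:
    def __init__(self, gid):
        self.gid = gid
        self.locus_tag = None
        self.mrnas = []

    def add_tag(self, tag):
        self.locus_tag = tag

    def add_mrna(self, mrna_id):
        self.mrnas.append(mrna_id)

    def get_tags(self):
        """Return (mRNA_id, locus_tag) pairs for this gene."""
        return [(mrna, self.locus_tag) for mrna in self.mrnas]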
Example 4
#!/usr/bin/env python

import sys

from fastatools import fasta_iter

infile = sys.argv[1]

bueno = ["A", "C", "G", "T", "N"]

with open(infile, "r") as fasta:
    for rec in fasta_iter(fasta):
        seqn = rec[1].upper()
        nam = ">" + rec[0]
        print(nam)
        for base in seqn:
            if base not in bueno:
                sys.stdout.write("N")
                continue
            sys.stdout.write(base)
        sys.stdout.write("\n")
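The per-base loop above can also be collapsed into a single regular-expression substitution per record. A hedged alternative sketch with the same masking behaviour (anything outside A/C/G/T/N becomes N), still relying on the assumed fasta_iter helper:

import re
import sys

from fastatools import fasta_iter  # assumed helper, as in the original

with open(sys.argv[1], "r") as fasta:
    for name, seq in fasta_iter(fasta):
        print(">" + name)
        # replace every character that is not A, C, G, T or N with N
        print(re.sub(r"[^ACGTN]", "N", seq.upper()))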
Example 5
def main():
    """Extract CDS coordinates for each mRNA in a gff file,
    then print the corresponding sequence.
    The subfeatures MUST be sorted by their start coordinate

    Usage: extract-cds.py genes.gff genome.fa > genes-cds.fa
    """

    if len(sys.argv) != 3:
        print(main.__doc__)
        sys.exit(1)

    gff = sys.argv[1]
    fasta = sys.argv[2]

    cds_coords = {}

    with open(gff, "r") as file_object:
        for line in file_object:
            if line[0] == "#":
                sys.stdout.write(line)
                continue

            feature = gfftools.GFF(line)

            if feature.type == "CDS":
                for parent in feature.parent:
                    if parent in cds_coords:
                        if feature.strand == "+":
                            cds_coords[parent]['coords'].append(
                                [feature.start, feature.end])

                        else:
                            cds_coords[parent]['coords'].insert(
                                0, [feature.start, feature.end])

                    else:
                        cds_coords[parent] = {
                            'coords': [[feature.start, feature.end]],
                            'scaf': feature.seqid,
                            'strand': feature.strand
                        }

    sequences = {}

    with open(fasta, "r") as file_object:
        for rec in fastatools.fasta_iter(file_object):
            sequences[rec[0]] = rec[1]

    for record in cds_coords:
        whole = ""
        seqid = cds_coords[record]['scaf']
        for coords in cds_coords[record]['coords']:
            # GFF coordinates are 1-based and inclusive; Python slicing is
            # 0-based and half-open, hence start - 1 (watch for off-by-ones)
            if cds_coords[record]['strand'] == "-":
                piece = fastatools.revcomp(
                    sequences[seqid][int(coords[0]) - 1:int(coords[1])])
            else:
                piece = sequences[seqid][int(coords[0]) - 1:int(coords[1])]

            whole += piece

        # all CDS pieces collected for this mRNA; write the stitched sequence
        sys.stdout.write(">" + record + "_CDS\n" + whole + "\n")
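fastatools.revcomp is assumed but not shown. A minimal sketch of a reverse-complement helper consistent with how it is used above:

# Assumed reverse-complement helper (sketch)

_COMPLEMENT = str.maketrans('ACGTNacgtn', 'TGCANtgcan')

def revcomp(seq):
    """Return the reverse complement of a nucleotide sequence."""
    return seq.translate(_COMPLEMENT)[::-1]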
Example 6
args = parser.parse_args()

oldsqn = args.OldFASTA
if args.NewFASTA:
    newsqn = args.NewFASTA

# read in old scaffs with sqn as key and id as value
# then read in new scaffs and compare sqn to keys and write out old and new ids upon match
# or, if only one FASTA given, output unique sequences

seqdict = {}

with open("collisions.txt", "w") as collisions, open(oldsqn, "r") as infile:
    for rec in fasta_iter(infile):
        seqid = rec[0]
        sqn = rec[1]
        if sqn in seqdict:
            print("Collision between " + seqid + " AND previously added " + seqdict[sqn],
                  file=collisions)
        else:
            seqdict[sqn] = seqid

if args.NewFASTA:
    print "OldID\tNewID"

    with open(newsqn,"r") as infile:
        for rec in fasta_iter(infile):
            seqid = rec[0]
            sqn = rec[1]