Example #1
0
def Initialize():
    header,seq = fasta.load('MHC_hg18.fa')
    sixFrameIter = sequence.sixFrameTranslationIter(seq)
    writer = fasta.MfaWriter('6frames.fa')
    for frame,p in sixFrameIter:
        print 'Frame:', frame
        writer.write('%s:%i' % (header,frame),p)
    writer.close()
    sys.exit()
Example #2
0
def Initialize():
    header, seq = fasta.load('MHC_hg18.fa')
    sixFrameIter = sequence.sixFrameTranslationIter(seq)
    writer = fasta.MfaWriter('6frames.fa')
    for frame, p in sixFrameIter:
        print 'Frame:', frame
        writer.write('%s:%i' % (header, frame), p)
    writer.close()
    sys.exit()
Example #3
0
def loadCdsSeq(codingSeqFnL):
    '''Build a dictionary of coding sequences keyed by locus tag.

    codingSeqFnL is a list of NCBI coding sequence (fna) file names.  For
    every (header, sequence) pair produced by fasta.load, the locus tag is
    the header text between 'locus_tag=' and the next ']'.  When a tag
    occurs more than once, the sequence read last is the one kept.
    '''
    cdsByLocusTagD = {}
    for codingSeqFn in codingSeqFnL:
        for hd, cdsSeq in fasta.load(codingSeqFn):
            # Everything after the 'locus_tag=' marker, then cut at ']'.
            afterMarker = hd.split('locus_tag=')[1]
            cdsByLocusTagD[afterMarker.split(']')[0]] = cdsSeq
    return cdsByLocusTagD
Example #4
0
def loadProt(protFnL):
    '''Load protein sequences from a list of fasta files.

    Returns a dictionary mapping gene name to sequence.  The gene name is
    the first whitespace-delimited word of a record's header with its
    first character dropped (presumably the '>' marker).  A name that
    occurs more than once keeps the sequence read last.
    '''
    protSeqD = {}
    for protFn in protFnL:
        protSeqD.update(
            (hd.split()[0][1:], protSeq) for hd, protSeq in fasta.load(protFn))
    return protSeqD
Example #5
0
def loadProt(protFnL):
    '''Collect the sequences of several fasta files into one dictionary.

    protFnL is a list of fasta file names.  Each record is stored under
    its gene name: the first whitespace-delimited word of the header,
    minus its first character (presumably the '>' marker).  Later records
    overwrite earlier ones that have the same name.
    '''
    geneSeqD = {}
    for fastaFn in protFnL:
        for record in fasta.load(fastaFn):
            hd, protSeq = record
            firstWord = hd.split()[0]
            geneSeqD[firstWord[1:]] = protSeq
    return geneSeqD
Example #6
0
def createGeneSpeciesMap(protFile):
    f=open(protFile,'r')
     
    while True:
        s=f.readline()
        if s=='':
            break
        s=s.rstrip('\n')
        L=s.rstrip().split('/')
        k=L[2].split('.')[0]
        print k,
        
        strainInfo=fasta.load(s)
        for gene in strainInfo:
            q=gene[0]
            q=q[1:]
            q=q.split()
            q=q[0]
            print q,
        print

    return
except:
    print """
Your input files cannot be found.
"""
    sys.exit(2)

# Read in the cd-hit clustered file.
# cluster_file is bound earlier in the script (outside this excerpt); .read()
# is called on it, so it is presumably an already-open file object.
# NOTE(review): the bare except also traps KeyboardInterrupt and SystemExit.
try:
    cluster_list = cluster_file.read()
except:
    print 'Cannot open', cluster_file, '\n'
    sys.exit(2)

# Read in the FASTA file and check to make sure it's in FASTA format.
# Any exception raised by fasta.load is reported as "not a fasta file" and
# ends the script with exit status 2.
try:
    fasta_dict_raw = fasta.load(fasta_file)
except:
    print '\n', sys.argv[2], 'does not appear to be a fasta file\n'
    sys.exit(2)


fasta_dict = {}

# Re-key the records: keep only the text before the first space of each
# original key.  This is also how cd-hit takes the name, so the keys will
# match.  fasta_dict_raw is iterated and indexed by key here, so it is
# presumably a dict (TODO confirm against fasta.load).  If two original keys
# share the same first word, the one visited last wins.

for fasta_key in fasta_dict_raw:
    new_fasta = fasta_key.split(' ')
    new_key = new_fasta[0]
    fasta_dict[new_key] = fasta_dict_raw[fasta_key]
Example #8
0
# Command-line interface: two positional arguments (GFF file, fasta reference)
# plus an optional output file name and the feature types to extract.
usage = "%prog [options] <gff file> <fasta reference sequence>"
parser = OptionParser(usage=usage)
parser.add_option("-o", "--output", dest="oFilename",
  help="Output filename", default=None)
parser.add_option("-f", "--features", dest="features",
  help="Features to extract", default=['exon'])
# sys.argv is passed whole, so args[0] is the script name and the two
# positional arguments are args[1] and args[2].
options, args = parser.parse_args(sys.argv)

# The default is a list, but a value given with -f arrives as one string,
# which would turn the later "f.type in options.features" check into a
# substring test.  Normalise it to a list of feature names; several names may
# be given separated by commas.
if not isinstance(options.features, list):
    options.features = [
        _feat.strip() for _feat in options.features.split(',') if _feat.strip()]

# Exactly two positional arguments are required (plus the script name).
if len(args)!=3:
    sys.exit(__doc__)

gffFilename = args[1]
faFilename = args[2]

data = gff.load(gffFilename)
header,seq = fasta.load(faFilename)

if options.oFilename:
    oFile = open(options.oFilename, 'w')
else:
    oFile = sys.stdout

writer = fasta.MfaWriter(oFile)
for name in data:
    s = []
    extrema = []
    for f in data[name]:
        if f.type in options.features:
            if f.strand=='+':
                start,end = f.start,f.end
                _seq = seq[start-1:end]
Example #9
0
    action="store_true",
    dest="complement",
    help="Complement sequence", 
    default=False)
parser.add_option(
    "-b", "--reverseComplement", "--revComp",
    action="store_true",
    dest="reverseComplement",
    help="Reverse complement sequence", default=False)
# sys.argv is passed whole, so args[0] is the script name and the positional
# arguments start at args[1]: input file, start, end.
options, args = parser.parse_args(sys.argv)

iFilename = args[1]
start = int(args[2])
end = int(args[3])

# start/end are treated as 1-based, inclusive coordinates: the slice begins at
# start-1 and stops after position end.
header,seq = fasta.load(iFilename)
s = seq[start-1:end]

# Output header: original header plus the extracted range and, when a
# transform is applied, a suffix naming it.
h = '%s %i-%i' % (header,start,end)
# The elif chain applies at most one transform; when several flags are set the
# first one in this order (reverse, complement, reverseComplement) wins.
# options.reverse and options.width are declared outside this excerpt.
if options.reverse:
    s = sequence.reverse(s)
    h += '(r)'
elif options.complement:
    s = sequence.complement(s)
    h += '(c)'
elif options.reverseComplement:
    s = sequence.reverse_complement(s)
    h += '(rc)'

fasta.pretty(h, s, width=options.width)
Example #10
0
if __name__ == "__main__":


    if len(sys.argv) !=4:
        sys.stderr.write("""
        Usage: python seedAlign.fa protDB.fa outfile.txt

""")
        sys.exit(-1)

    seedAlignFN = sys.argv[1]
    dbSeqFN = sys.argv[2]
    outFN = sys.argv[3]

    # create profile HMM based on seed alignment
    seedAlignL = [seq for header,seq in fasta.load(seedAlignFN)]

    # load db
    dbL = fasta.load(dbSeqFN)

    outL=[]
    model=hmmmodel.HmmModel(seedAlignFN)
    for hd,sseq in dbL:
        # load db
        print("aligning: "+hd)
        alignment= hmmAlign.HmmAlign(sseq,model)

        score=alignment.subtractShuffleMean()
        
        outL.append((score,hd))
Example #11
0
#!/usr/bin/python
# chrom.py

import sys
import fasta

# Command line: <assembly file> <contigs fasta file>.
assembly_file = sys.argv[1]
contigs_file = sys.argv[2]

# Contig records keyed by their name attribute.
contigs = {}

for f in fasta.load(open(contigs_file)):
    contigs[f.name] = f

chrom = "chrI"

# Emit a single fasta record on stdout: the header line here, then the
# sequence pieces written back to back with no line breaks.
print ">" + chrom
for line in open(assembly_file):
    # Whitespace-separated columns used below: [1] start, [2] end,
    # [3] element name, [5] strand.
    fields = line.strip().split()
    s, e = int(fields[1]), int(fields[2])
    l = e - s
    if fields[3][0:3] == "gap":
        # Names beginning with "gap" become a run of N of length end - start.
        sys.stdout.write("N" * l)
    else:
        strand = fields[5]
        # '+' and '.' write the contig as stored, '-' writes its reverse
        # complement; any other strand value writes nothing.
        if strand == "+":
            sys.stdout.write(contigs[fields[3]].seq)
        elif strand == "-":
            sys.stdout.write(contigs[fields[3]].reverse_complement().seq)
        elif strand == ".":
            sys.stdout.write(contigs[fields[3]].seq)
    sys.exit(2)


# Check to make sure the input file exists and can be opened.
# filename is bound earlier in the script, outside this excerpt.
# NOTE(review): the bare except also traps KeyboardInterrupt and SystemExit.

try:
   fasta_file = open(filename)
except:
   print "This file could not be opened"
   sys.exit(2)


# Check to make sure the input file is in FASTA format: any exception raised
# by fasta.load is reported as "not a fasta file".
# NOTE(review): this failure path exits with status 0, while the open failure
# above exits with status 2 -- confirm whether that is intentional.

try:
   fasta_data = fasta.load(fasta_file)
except:
   print 'This file does not seem to be a fasta file.  Please try again with a fasta file'
   sys.exit(0)


# Write out the FASTA input file to a new file, because CD-HIT doesn't handle
# all input file types correctly.
# The temporary copy lives at a fixed name under dirname (bound outside this
# excerpt).


new_fasta_file = dirname+'/tmp/input_fasta_file.fa'

try: 
   input_fasta_file = open(new_fasta_file, 'wt')
except:
   print 'Cannot open', new_fasta_file, 'for writing a temporary fasta file'
Example #13
0
#!/usr/bin/env python

"""
testHmmer.py

Author: Tony Papenfuss
Date: Fri Sep  1 09:09:02 EST 2006

"""

import os, sys, re
import hmmer4, fasta, sequence


h,s = fasta.load('seq/HLA-A.fa')
L = len(s)

if False:
    domains = hmmer4.load_domains('hmmer/6frames.txt')
    for d in domains:
        p = hmmer4.parseSixFrameHeader(d.accession)
        print d
        print p.name, p.frame
        gStart,gEnd,strand = hmmer4.convert6FrameToGenomic(d.sStart,d.sEnd,p.frame,L)
        print gStart,gEnd,strand
        if strand=='+':
            dna = s[gStart-1:gEnd]
            print len(dna), len(dna) % 3==0
            print sequence.codons(dna, remainder=True)
            print sequence.translate(dna)
        else:
Example #14
0
except:
    print """
Your input files cannot be found.
"""
    sys.exit(2)

# Read in the cd-hit clustered file.
# cluster_file is bound earlier in the script (outside this excerpt); .read()
# is called on it, so it is presumably an already-open file object.
# NOTE(review): the bare except also traps KeyboardInterrupt and SystemExit.
try:
    cluster_list = cluster_file.read()
except:
    print 'Cannot open', cluster_file, '\n'
    sys.exit(2)

# Read in the FASTA file and check to make sure it's in FASTA format.
# Any exception raised by fasta.load is reported as "not a fasta file" and
# ends the script with exit status 2.
try:
    fasta_dict_raw = fasta.load(fasta_file)
except:
    print '\n', sys.argv[2], 'does not appear to be a fasta file\n'
    sys.exit(2)

fasta_dict = {}

# Re-key the records: keep only the text before the first space of each
# original key.  This is also how cd-hit takes the name, so the keys will
# match.  fasta_dict_raw is iterated and indexed by key here, so it is
# presumably a dict (TODO confirm against fasta.load).  If two original keys
# share the same first word, the one visited last wins.

for fasta_key in fasta_dict_raw:
    new_fasta = fasta_key.split(' ')
    new_key = new_fasta[0]
    fasta_dict[new_key] = fasta_dict_raw[fasta_key]

# Output file for the list of all the sequences in each cluster
Example #15
0
#!/usr/bin/env python
"""
extractORFs.py

Author: Tony Papenfuss
Date: Wed Aug 23 08:52:58 EST 2006

"""

import os, sys
import re, copy
import fasta, sequence, hmmer3

# Single command-line argument: the fasta file to scan.
seqFilename = sys.argv[1]

header, seq = fasta.load(seqFilename)
# Keep only the first whitespace-delimited word of the header.
header = header.split()[0]
L = len(seq)

# Matches a literal '*' (presumably the stop symbol in a translated frame --
# TODO confirm against sequence.sixFrameTranslationIter).
pattern = re.compile('\*')
minLen = 10

sixFrameIter = sequence.sixFrameTranslationIter(seq)

# Records go to standard output; progress messages in the loop below go to
# standard error.
writer = fasta.MfaWriter(sys.stdout)
i = 0
for frame, p in sixFrameIter:
    print >> sys.stderr, 'Frame:', frame

    matchIter = pattern.finditer(p)
    match = matchIter.next()
Example #16
0

def Initialize():
    header,seq = fasta.load('MHC_hg18.fa')
    sixFrameIter = sequence.sixFrameTranslationIter(seq)
    writer = fasta.MfaWriter('6frames.fa')
    for frame,p in sixFrameIter:
        print 'Frame:', frame
        writer.write('%s:%i' % (header,frame),p)
    writer.close()
    sys.exit()


# Initialize()

header,seq = fasta.load('MHC_hg18.fa')
L = len(seq)
hstart = header.split()[0]

pattern = re.compile('\*|X{200,}')
minLen = 20

# sixFrameIter = sequence.sixFrameTranslationIter(seq)
sixFrameIter = fasta.load_iter('6frames.fa')

writer = fasta.MfaWriter('ORFs.fa')
i = 0
for h,p in sixFrameIter:
    hmmerFrame = int(h.split(':')[-1])
    frame = hmmer.hmmer2frame[hmmerFrame]
    print >> sys.stderr, 'Frame:', frame
Example #17
0

def Initialize():
    header, seq = fasta.load('MHC_hg18.fa')
    sixFrameIter = sequence.sixFrameTranslationIter(seq)
    writer = fasta.MfaWriter('6frames.fa')
    for frame, p in sixFrameIter:
        print 'Frame:', frame
        writer.write('%s:%i' % (header, frame), p)
    writer.close()
    sys.exit()


# Initialize()

header, seq = fasta.load('MHC_hg18.fa')
L = len(seq)
hstart = header.split()[0]

pattern = re.compile('\*|X{200,}')
minLen = 20

# sixFrameIter = sequence.sixFrameTranslationIter(seq)
sixFrameIter = fasta.load_iter('6frames.fa')

writer = fasta.MfaWriter('ORFs.fa')
i = 0
for h, p in sixFrameIter:
    hmmerFrame = int(h.split(':')[-1])
    frame = hmmer.hmmer2frame[hmmerFrame]
    print >> sys.stderr, 'Frame:', frame
Example #18
0
    f = open(aabrhAlignmentFN, "w")

    randStr = str(random.randrange(1e5))
    intempAlignFN = "/tmp/tempAlign" + randStr + ".fa"
    outtempAlignFN = "/tmp/tempAlign" + randStr + ".afa"

    for orthos in aabrhL:
        tempf = open(intempAlignFN, "w")
        writeSeqBlock(tempf, orthos, seqD)
        tempf.close()

        # align the temp file
        os.system("muscle -in " + intempAlignFN + " -out " + outtempAlignFN)

        # load aligned file into a sequence dict
        alSeqL = fasta.load(outtempAlignFN)
        alSeqD = {}
        for hd, sq in alSeqL:
            alSeqD[hd[1:]] = sq

        # write aligned fasta block into main output file, in the
        # original order (muscle messes up this order).
        for gene in orthos:
            commonName, locusTag, descrip, chrom, start, end, strand = geneInfoD[
                gene]
            f.write(">" + gene + " " + locusTag + "\n")
            f.write(alSeqD[gene] + "\n")
        f.write("\n")

    # delete the temp files
    os.system("rm " + intempAlignFN)
Example #19
0
"""
extractORFs.py

Author: Tony Papenfuss
Date: Wed Aug 23 08:52:58 EST 2006

"""

import os, sys
import re, copy
import fasta, sequence, hmmer3


# Single command-line argument: the fasta file to scan.
seqFilename = sys.argv[1]

header,seq = fasta.load(seqFilename)
# Keep only the first whitespace-delimited word of the header.
header = header.split()[0]
L = len(seq)

# Matches a literal '*' (presumably the stop symbol in a translated frame --
# TODO confirm against sequence.sixFrameTranslationIter).
pattern = re.compile('\*')
minLen = 10

sixFrameIter = sequence.sixFrameTranslationIter(seq)

# Records go to standard output; progress messages in the loop below go to
# standard error.
writer = fasta.MfaWriter(sys.stdout)
i = 0
for frame,p in sixFrameIter:
    print >> sys.stderr, 'Frame:', frame
    
    matchIter = pattern.finditer(p)
    match = matchIter.next()
Example #20
0
Date: Wed Aug 23 08:52:58 EST 2006

"""

import os, sys
import re, copy
import fasta, sequence

# Delimiter pattern handed to sequence.extractOrfsIter: a literal '*' or a run
# of 200 or more 'X' characters.  The previous literal wrapped this
# alternation in [...], which made it a character class matching any single
# one of '*', '|', 'X', '{', '2', '0', ',' or '}'.
pattern = re.compile(r'\*|X{200,}')
minLen = 20

# i is rebound by the for loop over orfIter below.
i = 0
writer = fasta.MfaWriter('ORFs.fa')

# Single command-line argument: the fasta file holding the DNA sequence.
filename = sys.argv[1]
header, dna = fasta.load(filename)
header = header.strip()

# Iterator consumed below as (i, gStart, gEnd, orf) tuples.
orfIter = sequence.extractOrfsIter(dna, minLen=minLen, pattern=pattern)
for i, gStart, gEnd, orf in orfIter:
    h = '%s.%i.%i-%i  Length=%i' % (header, i, gStart, gEnd, len(orf))
    writer.write(h, orf)

    fasta.pretty(h, orf)

    if gStart < gEnd:
        s = dna[gStart - 1:gEnd]
        print gStart, gEnd, len(s), len(s) % 3 == 0
        print sequence.codons(s, remainder=True)
        print sequence.translate(s)
    else:
Example #21
0
#!/usr/bin/env python

"""
orfTest.py

Author: Tony Papenfuss
Date: Tue Aug 22 20:14:57 EST 2006

"""

import os, sys
import fasta, sequence


header,seq = fasta.load('NKC.fa')
orfIterator = fasta.load_iter('ORFs.fa')
writer = fasta.MfaWriter('ORFs2.fa')

for h,orf in orfIterator:
    chrom,block,orfId,limits = h.split()[0].split('.')
    start,end = limits.split('-')
    start = int(start)
    end = int(end)
    
    if start>end:
        strand = '-'
        start,end = end,start
        s = sequence.translate(sequence.reverseComplement(seq[start-1:end]))
    else:
        strand = '+'
        s = sequence.translate(seq[start-1:end])
Example #22
0
                  help="Complement sequence",
                  default=False)
parser.add_option("-b",
                  "--reverseComplement",
                  "--revComp",
                  action="store_true",
                  dest="reverseComplement",
                  help="Reverse complement sequence",
                  default=False)
# sys.argv is passed whole, so args[0] is the script name and the positional
# arguments start at args[1]: input file, start, end.
options, args = parser.parse_args(sys.argv)

iFilename = args[1]
start = int(args[2])
end = int(args[3])

# start/end are treated as 1-based, inclusive coordinates: the slice begins at
# start - 1 and stops after position end.
header, seq = fasta.load(iFilename)
s = seq[start - 1:end]

# Output header: original header plus the extracted range and, when a
# transform is applied, a suffix naming it.
h = '%s %i-%i' % (header, start, end)
# The elif chain applies at most one transform; when several flags are set the
# first one in this order (reverse, complement, reverseComplement) wins.
# options.reverse and options.width are declared outside this excerpt.
if options.reverse:
    s = sequence.reverse(s)
    h += '(r)'
elif options.complement:
    s = sequence.complement(s)
    h += '(c)'
elif options.reverseComplement:
    s = sequence.reverse_complement(s)
    h += '(rc)'

fasta.pretty(h, s, width=options.width)
Example #23
0
#!/usr/bin/python
# Usage: chromByLen.py 454LargeContigs.fna chrI

import fasta
import sys

# Spacer placed after each contig: 100 N characters.
gap = "N" * 100

# Command line: <contigs fasta> <chromosome name> <output directory>.
contig_file, cn, outdir = sys.argv[1], sys.argv[2], sys.argv[3]


# Load fasta file w/ contigs sequences. Typically, 454LargeContigs.fna
# Records are keyed by their name attribute.  sys.argv[1] is the same value as
# contig_file above.
contigs = dict([(x.name, x) for x in fasta.load(open(sys.argv[1]))])

# get sorted list of contigs by length, longest first; tuples compare on
# length and then on name
lengths = [(contigs[x].length, x) for x in contigs]
lengths.sort(reverse=True)

# Same value cn already received from the tuple assignment above.
cn = sys.argv[2]

# output BED tracks: one file for contig intervals, one for the gaps
cf = open(outdir + '/contigs.bed', 'w')
gf = open(outdir + '/gaps.bed', 'w')

cstart = 0
gn = 1
for length, name in lengths:
    print >> cf, cn, cstart, cstart+length, name, 1000, '+'
    cstart = cstart + length
    print >> gf, cn, cstart, cstart+100, 'gap%s' % gn
    cstart = cstart + len(gap)
Example #24
0
    sys.exit(2)


# Check to make sure the input file exists and can be opened.
# filename is bound earlier in the script, outside this excerpt.
# NOTE(review): the bare except also traps KeyboardInterrupt and SystemExit.

try:
   fasta_file = open(filename)
except:
   print "This file could not be opened"
   sys.exit(2)


# Check to make sure the input file is in FASTA format: any exception raised
# by fasta.load is reported as "not a fasta file".
# NOTE(review): this failure path exits with status 0, while the open failure
# above exits with status 2 -- confirm whether that is intentional.

try:
   fasta_data = fasta.load(fasta_file)
except:
   print 'This file does not seem to be a fasta file.  Please try again with a fasta file'
   sys.exit(0)


# Write out the FASTA input file to a new file, because CD-HIT doesn't handle
# all input file types correctly.
# The temporary copy lives at a fixed name under dirname (bound outside this
# excerpt).


new_fasta_file = dirname+'/tmp/input_fasta_file.fa'

try: 
   input_fasta_file = open(new_fasta_file, 'wt')
except:
   print 'Cannot open', new_fasta_file, 'for writing a temporary fasta file'
Example #25
0
"""

import os, sys
import re, copy
import fasta, sequence


# Delimiter pattern handed to sequence.extractOrfsIter: a literal '*' or a run
# of 200 or more 'X' characters.  The previous literal wrapped this
# alternation in [...], which made it a character class matching any single
# one of '*', '|', 'X', '{', '2', '0', ',' or '}'.
pattern = re.compile(r'\*|X{200,}')
minLen = 20

# i is rebound by the for loop over orfIter below.
i = 0
writer = fasta.MfaWriter('ORFs.fa')

# Single command-line argument: the fasta file holding the DNA sequence.
filename = sys.argv[1]
header,dna = fasta.load(filename)
header = header.strip()

# Iterator consumed below as (i, gStart, gEnd, orf) tuples.
orfIter = sequence.extractOrfsIter(dna, minLen=minLen, pattern=pattern)
for i,gStart,gEnd,orf in orfIter:
    h = '%s.%i.%i-%i  Length=%i' % (header,i,gStart,gEnd,len(orf))
    writer.write(h, orf)
    
    fasta.pretty(h, orf)
    
    if gStart<gEnd:
        s = dna[gStart-1:gEnd]
        print gStart, gEnd, len(s), len(s) % 3==0
        print sequence.codons(s, remainder=True)
        print sequence.translate(s)
    else:
Example #26
0
#!/usr/bin/env python
"""codingscore2.py <fasta file> <ghmm cfg> <gff file>
In windows of step evaluates log(P(S|q = coding) / P(S|q = noncoding))
and outputs to file
"""

from common.GHMM import *
from common.feature import *
import fasta, sys

# Exactly three command-line arguments are required; otherwise the module
# docstring is printed as usage text.
# NOTE(review): the bad-arguments path exits with status 0 -- confirm that
# reporting success here is intended.
if len(sys.argv) != 4:
    print 'Invalid arguments'
    print __doc__
    sys.exit(0)

# Inputs, in argument order: fasta file, GHMM configuration, feature file.
head,seq = fasta.load(sys.argv[1])
ghmm = GHMM(sys.argv[2])
features = Features(sys.argv[3])
# sequenceDict is supplied by one of the star imports above; the meaning of
# its second argument (5) is not visible from here.
sequence = sequenceDict(seq, 5)
# Output is written next to the input file, with '.scr' appended to its name.
outfile = open(sys.argv[1] + '.scr', 'w')

posscorecnc = []
posscore = []

for ref in features:
    pos = 0
    for generef in features[ref]:
        gene = features[ref][generef]
        genecoords = gene.coords
        for (start, end) in genecoords:
            coding1 = max(ghmm.content['eintn'].probEmit(sequence, start, end+1, 0, '+'), ghmm.content['eintn'].probEmit(sequence, start, end+1, 1, '+'), ghmm.content['eintn'].probEmit(sequence, start, end+1, 2, '+'))
Example #27
0
    f=open(aabrhAlignmentFN,"w")

    randStr = str(random.randrange(1e5))
    intempAlignFN="/tmp/tempAlign"+randStr+".fa"
    outtempAlignFN="/tmp/tempAlign"+randStr+".afa"
   
    for orthos in aabrhL:
        tempf = open(intempAlignFN,"w")
        writeSeqBlock(tempf,orthos,seqD)
        tempf.close()

        # align the temp file
        os.system("muscle -in "+ intempAlignFN + " -out " + outtempAlignFN)

        # load aligned file into a sequence dict
        alSeqL=fasta.load(outtempAlignFN)
        alSeqD={}
        for hd,sq in alSeqL:
            alSeqD[hd[1:]]=sq

        # write aligned fasta block into main output file, in the
        # original order (muscle messes up this order).
        for gene in orthos:
            commonName,locusTag,descrip,chrom,start,end,strand=geneInfoD[gene]
            f.write(">"+gene+" "+locusTag+"\n")
            f.write(alSeqD[gene]+"\n")
        f.write("\n")

    # delete the temp files
    os.system("rm "+intempAlignFN)
    os.system("rm "+outtempAlignFN)
Example #28
0
#!/usr/bin/python
# Usage: chromByLen.py 454LargeContigs.fna chrI

import fasta
import sys

# Spacer placed after each contig: 100 N characters.
gap = "N" * 100

# Command line: <contigs fasta> <chromosome name> <output directory>.
contig_file, cn, outdir = sys.argv[1], sys.argv[2], sys.argv[3]

# Load fasta file w/ contigs sequences. Typically, 454LargeContigs.fna
# Records are keyed by their name attribute.  sys.argv[1] is the same value as
# contig_file above.
contigs = dict([(x.name, x) for x in fasta.load(open(sys.argv[1]))])

# get sorted list of contigs by length, longest first; tuples compare on
# length and then on name
lengths = [(contigs[x].length, x) for x in contigs]
lengths.sort(reverse=True)

# Same value cn already received from the tuple assignment above.
cn = sys.argv[2]

# output BED tracks: one file for contig intervals, one for the gaps
cf = open(outdir + '/contigs.bed', 'w')
gf = open(outdir + '/gaps.bed', 'w')

# cstart is the running offset along the chromosome; gn numbers the gaps
# starting at 1.
cstart = 0
gn = 1
for length, name in lengths:
    # Contig interval [cstart, cstart + length), score 1000, '+' strand.
    print >> cf, cn, cstart, cstart + length, name, 1000, '+'
    cstart = cstart + length
    # Gap interval that follows the contig.  The literal 100 here equals
    # len(gap), which is what advances cstart on the next line.
    print >> gf, cn, cstart, cstart + 100, 'gap%s' % gn
    cstart = cstart + len(gap)
    gn += 1
Example #29
0
#!/usr/bin/python
# chrom.py

import sys
import fasta

# Command line: <assembly file> <contigs fasta file>.
assembly_file = sys.argv[1]
contigs_file  = sys.argv[2]

# Contig records keyed by their name attribute.
contigs = {}

for f in fasta.load(open(contigs_file)):
    contigs[f.name] = f
    
chrom = 'chrI'

# Emit a single fasta record on stdout: the header line here, then the
# sequence pieces written back to back with no line breaks.
print '>' + chrom
for line in open(assembly_file):
    # Whitespace-separated columns used below: [1] start, [2] end,
    # [3] element name, [5] strand.
    fields = line.strip().split()
    s, e = int(fields[1]), int(fields[2])
    l = e - s
    if fields[3][0:3] == 'gap':
        # Names beginning with 'gap' become a run of N of length end - start.
        sys.stdout.write('N' * l)
    else:
        strand = fields[5]
        # '+' and '.' write the contig as stored, '-' writes its reverse
        # complement; any other strand value writes nothing.
        if strand == '+':
            sys.stdout.write(contigs[fields[3]].seq)
        elif strand == '-':
            sys.stdout.write(contigs[fields[3]].reverse_complement().seq)
        elif strand == '.':
            sys.stdout.write(contigs[fields[3]].seq)
Example #30
0
import sys, fasta
import hmmmodel
import hmmAlign

#### Main

if __name__ == "__main__":

    # Exactly two command-line arguments are expected after the script name.
    argCount = len(sys.argv) - 1
    if argCount != 2:
        sys.stderr.write("""
        Usage: python seedAlign.fa protSeq.fa

""")
        sys.exit(-1)

    seedAlignFN, protSeqFN = sys.argv[1:]

    # Build the profile HMM from the seed alignment file.
    model = hmmmodel.HmmModel(seedAlignFN)

    # Only the first record of the protein file is scored; element 1 of that
    # record is used as the sequence.
    firstRecord = fasta.load(protSeqFN)[0]
    sseq = firstRecord[1]

    # Align the sequence against the model and print the value returned by
    # subtractShuffleMean().
    alignment = hmmAlign.HmmAlign(sseq, model)
    score = alignment.subtractShuffleMean()
    print(score)