Beispiel #1
0
    def match_fasta(self, fasta_text, do_align=False):
        """Match every entry found in a FASTA-formatted string.

        ``fasta_text`` is handed to ``fasta(..., 's')`` and each entry read
        from it becomes one query, labelled with the entry's name.  All
        queries are built first; they are matched afterwards, in order.

        ``do_align`` is accepted but is not used by this method.

        Returns a list with one ``self.__mm.match(...)`` result per entry.
        """
        pending = []
        for record in fasta(fasta_text, 's').readentries():
            labelled = self.__qm.new_query(record['sequence'])
            labelled.label = record['name']
            pending.append(labelled)

        matches = []
        for item in pending:
            matches.append(self.__mm.match(item))
        return matches
Beispiel #2
0
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import sys

sys.path.append('../../scripts')
from utils import *

## Metadata table for this dataset
metadata = pd.read_csv('metadata.csv', index_col=0)

## Read all FASTA entries, then close the reader
ff = fasta('non_2011.fasta', 'r')
data = ff.readentries()
ff.close()

hids, seqs, sites = [], [], []

# Four sequences are shorter than all the rest; anything of 350 characters
# or fewer is dropped.
for e in data:
	if len(e['sequence']) <= 350:
		continue
	# the id is the first whitespace-separated token of the entry name
	hids.append(e['name'].split()[0])
	seqs.append(e['sequence'])
	sites.append(seq2sites(e['sequence']))

## Validate
passed_validation = True
Beispiel #3
0
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import sys

sys.path.append('../../scripts')
from utils import *

## Metadata table; the sequenced region comes from the first row's SeqRange
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix[0,'SeqRange'])

## Read all FASTA entries, then close the reader
ff = fasta('tishkoff2007.fasta', 'rU')
data = ff.readentries()
ff.close()

# ids of entries that are excluded from the lists built below
drop = ['San_43', 'San_67', 'tzbg040', 'tzdt045', 'tzhz108', 'tzhz130', 'tzhz131']

hids, seqs = [], []

for e in data:
	name_parts = e['name'].split()
	if name_parts[0] in drop:
		continue
	# the id is the first whitespace-separated token of the entry name
	hids.append(name_parts[0])
	seqs.append(e['sequence'])

## Validate
passed_validation = True
Beispiel #4
0
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import sys
import re

sys.path.append('../../scripts')
from utils import *

## Metadata table; the sequenced region comes from the first row's SeqRange
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix[0, 'SeqRange'])

## Read all FASTA entries, then close the reader
ff = fasta('krings1999.fasta', 'r')
data = ff.readentries()
ff.close()

hids, sites = [], []

# The whole entry name is the id; every sequence is converted to sites.
for entry in data:
	hids.append(entry['name'])
	sites.append(seq2sites(entry['sequence']))

# Some of the sequences are short.  Many only miss a base or two at the
# beginning or the end - those are kept.  A few miss large chunks of the
# end - those are dropped.
# NOTE(review): presumably positions into `data`/`sites`; confirm against
# the code that consumes these two lists.
ok = [41, 42, 43, 44, 54, 55, 83, 85, 89, 136, 137, 157, 178]
skip = [8, 123, 124]
Beispiel #5
0
import sys

sys.path.append('../../scripts')
from utils import *

## Metadata table; the sequenced region comes from the first row's SeqRange
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix[0, 'SeqRange'])

## Sample info, re-indexed by an 'HGDP' + zero-padded (width 5) identifier
sinfo = pd.read_csv('HGDP_info.csv', index_col=0)
newindices = ['HGDP' + str(x).zfill(5) for x in sinfo.index]
sinfo['hgdpid'] = newindices
sinfo = sinfo.set_index('hgdpid')

## Read all FASTA entries, then close the reader
ff = fasta('hgdp_africa.fasta', 'r')
data = ff.readentries()
ff.close()

hids, sites = [], []

for entry in data:
	words = entry['name'].split()
	# the id is the fifth whitespace-separated token of the entry name
	hids.append(words[4])
	sites.append(seq2sites(entry['sequence']))

# Three sequences have an 'N' at around 309 that breaks validation.
# It is treated as a heteroplasmy of the T there and ignored.
skip = [64, 67, 73]
Beispiel #6
0
def process_seq2sites(form):
    """Process data submitted in seq2sites form.

    The submission is taken from ``form.cleaned_data['query']`` unless a
    file was uploaded, in which case the file's content replaces it.
    Surrounding whitespace is stripped before anything else is checked.
    An empty submission redirects back to the 'seq2sites' view.

    Content starting with '>' is parsed as FASTA (one name/sequence pair
    per entry); anything else is treated as a single unnamed sequence.
    Sequences are upper-cased and every match of ``RE_NON_IUPAC`` is
    removed.  FASTA submissions with more than ``MAX_SEQS`` sequences are
    rejected.  Each sequence is converted with ``seq2sites``; a conversion
    failure is reported in that sequence's result line instead of aborting
    the whole request.
    """

    # submission validation and error reporting
    problems = []
    valid = True

    # first, just assume whatever is in the textarea is the submission
    # even if that may be nothing
    content = form.cleaned_data['query']

    # then check to see if a file was supplied, and if so, replace the
    # previously assumed content with the file data
    uploaded = form.cleaned_data['file']
    if uploaded is not None:
        if uploaded.multiple_chunks():
            pass
            # error - return with error
        content = uploaded.read()

    # Strip surrounding whitespace.  strip() returns a new string, so the
    # result has to be rebound; otherwise whitespace-only submissions pass
    # the emptiness check and leading blank lines defeat FASTA detection.
    content = content.strip()

    # make sure something was submitted
    if not content:
        return HttpResponseRedirect(reverse('seq2sites'))

    # determine format
    if content.startswith('>'):
        format = 'fasta'
    else:
        format = 'single_seq'

    # pull names and sequence out of submitted content
    names = []
    seqs = []
    if format == 'fasta':
        try:
            # collect into temporaries so a parse error midway leaves
            # names/seqs empty rather than half filled
            fnames = []
            fseqs = []
            for entry in fasta(content, 's'):
                fnames.append(entry['name'])
                fseqs.append(RE_NON_IUPAC.sub('', entry['sequence'].upper()))
            names = fnames
            seqs = fseqs
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt
            # and SystemExit are not swallowed
            valid = False
            problems.append('There was an error in the FASTA format')
    else:
        names = ['']
        seqs = [RE_NON_IUPAC.sub('', content.upper())]

    # enforce limits for multisequence submissions
    if format == 'fasta' and len(seqs) > MAX_SEQS:
        valid = False
        problems.append('Too many sequences submitted; current maximum allowed is %d' % MAX_SEQS)

    if valid:
        result_lines = []
        sites_by_line = []
        for seq in seqs:
            try:
                sites = seq2sites(seq)
                sites_by_line.append(sites)
                result_lines.append(sites2str(sites))
            except Exception as e:
                # report the failure on this line and keep going; note that
                # sites_by_line gets no entry for a failed sequence
                result_lines.append('There was an error: %s' % e)

        results = [Result(name, line) for name, line in zip(names, result_lines)]

        c = Context({'results': results})
Beispiel #7
0
from string import translate
import pandas as pd
import re
import sys

sys.path.append('../../scripts')
from utils import *

## load metadata
# SeqRange of the first row holds two ';'-separated ranges, one per region
metadata = pd.read_csv('metadata.csv', index_col=0)
regionstr = metadata.ix[0,'SeqRange']
regionparts = regionstr.split(';')
region1 = range2region(regionparts[0])
region2 = range2region(regionparts[1])

## Read all FASTA entries, then close the reader
ff = fasta('knight2003.fasta', 'r')
data = ff.readentries()
ff.close()

hids = []
seqs = []
sites = []

for e in data:
	# Each sequence is split at the first occurrence of GATCACA and the
	# two halves are converted separately.
	# NOTE(review): presumably the motif marks where the second region
	# starts - confirm.
	m = re.search(r'GATCACA', e['sequence'])
	if m is None:
		# Fail with a clear message, before the lists are touched, instead
		# of an AttributeError on m.start() after hids/seqs were extended.
		raise ValueError('GATCACA not found in sequence %s; cannot split it' % e['name'])
	hids.append(e['name'].split()[0])
	seqs.append(e['sequence'])
	cut = m.start()
	seq1 = e['sequence'][:cut]
	seq2 = e['sequence'][cut:]
	sites.append(seq2sites(seq1) + seq2sites(seq2))
Beispiel #8
0
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import sys
import re

sys.path.append('../../scripts')
from utils import *

## Metadata table; the sequenced region comes from the first row's SeqRange
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix[0, 'SeqRange'])

## Read all FASTA entries, then close the reader
ff = fasta('barbieri2012.fasta', 'r')
data = ff.readentries()
ff.close()

hids, groups, sites, sequences = [], [], [], []

for entry in data:
	# first token of the name is the id, second token is the group
	words = entry['name'].split()
	if entry['sequence'].count('N') > 10:
		# more than 10 Ns: report the entry and leave it out
		print("Too many Ns in %s (%s), skipping" % (words[0], words[1]))
		continue
	hids.append(words[0])
	groups.append(words[1])
	sites.append(seq2sites(entry['sequence']))
Beispiel #9
0
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import sys

sys.path.append('../../scripts')
from utils import *

## Metadata table; the sequenced region comes from the first row's SeqRange
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix[0, 'SeqRange'])

## Read all FASTA entries, then close the reader
ff = fasta('coia2005.fasta', 'r')
data = ff.readentries()
ff.close()

hids, sites = [], []

for entry in data:
	words = entry['name'].split()
	# the id is the fifth whitespace-separated token of the entry name
	hids.append(words[4])
	sites.append(seq2sites(entry['sequence']))

# validate
passed_validation = True

for i in range(len(sites)):
	seq1 = data[i]['sequence']
	if not seq1 == translate(sites2seq(sites[i], region), None, '-'):
Beispiel #10
0
from string import translate
import pandas as pd
import sys

sys.path.append('../../scripts')
from utils import *

## Metadata table.  SeqRange of the first row holds three ';'-separated
## ranges: the first becomes region1, the other two are joined into region2.
metadata = pd.read_csv('metadata.csv', index_col=0)
regionstr = metadata.ix[0,'SeqRange']
regionparts = regionstr.split(';')
region1 = range2region(regionparts[0])
region2 = range2region(regionparts[1]) + range2region(regionparts[2])


## Read the entries of both FASTA files, closing each reader once read
ff = fasta('Poloni2009HVR1.fasta', 'r')
data1 = ff.readentries()
ff.close()

ff = fasta('Poloni2009HVR2.fasta', 'r')
data2 = ff.readentries()
ff.close()

# sequences keyed by the first whitespace-separated token of the entry name
hvr1, hvr2 = {}, {}

for e in data1:
	k = e['name'].split()[0]
	hvr1[k] = e['sequence']

for e in data2:
Beispiel #11
0
sys.path.append("../../scripts")
from utils import *

## Metadata table; the sequenced region comes from the first row's SeqRange
metadata = pd.read_csv("metadata.csv", index_col=0)
region = range2region(metadata.ix[0, "SeqRange"])

## Group table, re-indexed by the upper-cased form of its original index
groups = pd.read_csv("barbieri2014_groups.csv", index_col=0)
newindex = [x.upper() for x in groups.index]
groups.index = pd.Index(newindex)

hids, seqs = [], []

for filename in ["barbieri2013a.fasta", "barbieri2013b.fasta", "barbieri2013c.fasta", "barbieri2014.fasta"]:
    ff = fasta(filename, "r")
    data = ff.readentries()
    ff.close()

    for e in data:
        # entries with 20 or more Ns are left out
        if e["sequence"].count("N") >= 20:
            continue
        # the id is the fifth token of the entry name, upper-cased
        hid = e["name"].split()[4].upper()
        # only ids present in the group table are kept
        if hid not in groups.index:
            continue
        hids.append(hid)
        seqs.append(e["sequence"])

sites = []
for s in seqs:
    sites.append(seq2sites(s, ambig_cutoff=20))

## Validate
Beispiel #12
0
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import re
import sys

sys.path.append('../../scripts')
from utils import *

## Metadata table; the sequenced region comes from the 'Nai' row's SeqRange
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix['Nai', 'SeqRange'])

## Read all FASTA entries, then close the reader
ff = fasta('brandstaetter2004.fasta', 'r')
data = ff.readentries()
ff.close()

## Validate
# Each sequence is converted to sites and back; the reconstruction with
# '-' characters removed has to equal the sequence that was converted.
passed_validation = True

for entry in data:
	seq1 = entry['sequence']
	# Brandstatter et al. put an N at the end of an unstable poly-C run at
	# the end of HVR3 in 5 samples.  That spurious N messes with the
	# conversion utility, so a trailing 'NACA' is replaced by 'ACA'.
	if seq1.endswith('NACA'):
		seq1 = seq1[:-4] + 'ACA'
	mysites = seq2sites(seq1)
	if seq1 != translate(sites2seq(mysites, region), None, '-'):
		passed_validation = False
from oldowan.fasta import fasta

DLOOPplus = range(15800,16570) + range(1,1500)

dloops_fn   = os.path.join(here, 'dloops.fasta')
ofn         = os.path.join(here, 'fail_dloop.txt')
fail_fn     = os.path.join(here, 'dloop_expect_to_fail.txt')

expect_to_fail = []
for line in open(fail_fn, 'U'):
    expect_to_fail.append(int(line.strip()[:-1]))

of = open(ofn, 'w')

count = 0
for entry in fasta(dloops_fn):
    count += 1
    if count not in expect_to_fail:
        try:
            sites = seq2sites(entry["sequence"])
            seq = sites2seq(sites, region=DLOOPplus)
            seq = seq.replace('-', '')
            if entry["sequence"] in seq:
                print count, entry["name"], sites2str(sites)
            else:
                print 'FAILED', count, entry["name"]
                of.write('%d, %s\n' % (count, entry["name"]))
        except Exception, e:
            print 'FAILED',e, count, entry["name"]
            of.write('%d, %s\n' % (count, entry["name"]))
            of.flush()