def match_fasta(self, fasta_text, do_align=False):
    """Run a match for every entry parsed from a FASTA-formatted string.

    Each entry's ``'sequence'`` is turned into a query with
    ``self.__qm.new_query`` and labelled with the entry's ``'name'``.
    All queries are built first; only then is each one passed to
    ``self.__mm.match``.

    Args:
        fasta_text: FASTA content, handed to ``fasta`` with mode ``'s'``.
        do_align: Accepted but not used by this method.

    Returns:
        A list holding one ``self.__mm.match`` result per entry, in the
        order the entries were read.
    """
    queries = []
    for entry in fasta(fasta_text, 's').readentries():
        query = self.__qm.new_query(entry['sequence'])
        query.label = entry['name']
        queries.append(query)
    # Matching starts only after every query has been constructed.
    return [self.__mm.match(query) for query in queries]
# Loads 'non_2011.fasta', keeps the entries whose sequence is longer than
# 350 characters, and records for each kept entry its id (first word of the
# FASTA name), its raw sequence and the result of seq2sites().
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import sys

# utils lives outside this directory, so the path must be extended first.
sys.path.append('../../scripts')
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)

ff = fasta('non_2011.fasta', 'r')
try:
    data = ff.readentries()
finally:
    # Release the handle even when parsing the FASTA file fails.
    ff.close()

hids = []
seqs = []
sites = []

# four sequences are shorter than all the rest, will drop them
for e in data:
    if len(e['sequence']) > 350:
        hids.append(e['name'].split()[0])
        seqs.append(e['sequence'])
        sites.append(seq2sites(e['sequence']))

## Validate
passed_validation = True
# Loads 'tishkoff2007.fasta' and collects the id (first word of the FASTA
# name) and raw sequence of every entry whose id is not in the drop list.
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import sys

# utils lives outside this directory, so the path must be extended first.
sys.path.append('../../scripts')
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
# Sequenced region, derived from the first metadata row's 'SeqRange' value.
region = range2region(metadata.ix[0,'SeqRange'])

ff = fasta('tishkoff2007.fasta', 'rU')
try:
    data = ff.readentries()
finally:
    # Release the handle even when parsing the FASTA file fails.
    ff.close()

# Sample ids that are excluded from processing.
drop = ['San_43', 'San_67', 'tzbg040', 'tzdt045', 'tzhz108', 'tzhz130', 'tzhz131']

hids = []
seqs = []
for e in data:
    name_parts = e['name'].split()
    if name_parts[0] not in drop:
        hids.append(name_parts[0])
        seqs.append(e['sequence'])

## Validate
passed_validation = True
# Loads 'krings1999.fasta' and records, for every entry, its full FASTA name
# and the result of seq2sites() on its sequence.
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import sys
import re

# utils lives outside this directory, so the path must be extended first.
sys.path.append('../../scripts')
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
# Sequenced region, derived from the first metadata row's 'SeqRange' value.
region = range2region(metadata.ix[0, 'SeqRange'])

ff = fasta('krings1999.fasta', 'r')
try:
    data = ff.readentries()
finally:
    # Release the handle even when parsing the FASTA file fails.
    ff.close()

hids = []
sites = []
for entry in data:
    hids.append(entry['name'])
    sites.append(seq2sites(entry['sequence']))

# some of the sequences are short. Many are just missing a base or two
# from the beginning or the end - will keep those
# a few are missing large chunks of the end, so will drop those
# NOTE(review): these look like positions into `data`/`sites` - confirm
# against the code that consumes them.
ok = [41, 42, 43, 44, 54, 55, 83, 85, 89, 136, 137, 157, 178]
skip = [8, 123, 124]
import sys

# utils lives outside this directory, so the path must be extended first.
sys.path.append('../../scripts')
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
# Sequenced region, derived from the first metadata row's 'SeqRange' value.
region = range2region(metadata.ix[0, 'SeqRange'])

## load sample info
sinfo = pd.read_csv('HGDP_info.csv', index_col=0)
# Re-key the sample table by 'HGDP' + the original index zero-padded to
# five characters (e.g. 1 -> 'HGDP00001').
newindices = ['HGDP' + str(x).zfill(5) for x in sinfo.index]
sinfo['hgdpid'] = newindices
sinfo = sinfo.set_index('hgdpid')

ff = fasta('hgdp_africa.fasta', 'r')
try:
    data = ff.readentries()
finally:
    # Release the handle even when parsing the FASTA file fails.
    ff.close()

hids = []
sites = []
for entry in data:
    words = entry['name'].split()
    # The sample id is the fifth whitespace-separated word of the FASTA name.
    hids.append(words[4])
    sites.append(seq2sites(entry['sequence']))

# three sequences have an 'N' at around 309 that breaks validation
# this will be treated as a heteroplasmy of the T there and ignored
skip = [64, 67, 73]
def process_seq2sites(form):
    """Process data submitted in seq2sites form.

    The submission is taken from the ``'query'`` field of
    ``form.cleaned_data`` unless a ``'file'`` was supplied, in which case the
    file's contents replace it. Content starting with ``'>'`` is parsed as
    FASTA; anything else is treated as a single unnamed sequence. Every
    sequence is upper-cased and filtered through ``RE_NON_IUPAC`` before
    being converted with ``seq2sites``.

    Args:
        form: A validated form exposing ``cleaned_data`` with the keys
            ``'query'`` and ``'file'``.

    Returns:
        An ``HttpResponseRedirect`` to the ``'seq2sites'`` URL when nothing
        was submitted.
    """
    # submission validation and error reporting
    problems = []
    valid = True

    # first, just assume whatever is in the textarea is the submission
    # even if that may be nothing
    content = form.cleaned_data['query']

    # then check to see if a file was supplied, and if so, replace the
    # previously assumed content with the file data
    if form.cleaned_data['file'] is not None:
        if form.cleaned_data['file'].multiple_chunks():
            # TODO: oversized uploads are not rejected yet; the file is
            # still read in full below.
            pass  # error - return with error
        content = form.cleaned_data['file'].read()

    # clear off any surrounding whitespace; strings are immutable, so the
    # stripped copy has to be assigned back for the checks below to see it
    content = content.strip()

    # make sure something was submitted
    if len(content) == 0:
        valid = False
        return HttpResponseRedirect(reverse('seq2sites'))

    # determine format
    format = None
    if content.startswith('>'):
        format = 'fasta'
    else:
        format = 'single_seq'

    # pull names and sequence out of submitted content
    names = []
    seqs = []
    if format == 'fasta':
        try:
            # Collect into temporaries so a parse failure part-way through
            # leaves `names` and `seqs` empty rather than half-filled.
            fnames = []
            fseqs = []
            for entry in fasta(content, 's'):
                fnames.append(entry['name'])
                fseqs.append(RE_NON_IUPAC.sub('', entry['sequence'].upper()))
            names = fnames
            seqs = fseqs
        except Exception:
            valid = False
            problems.append('There was an error in the FASTA format')
    else:
        names = ['']
        seqs = [RE_NON_IUPAC.sub('', content.upper())]

    # enforce limits for multisequence submissions
    if format == 'fasta':
        if len(seqs) > MAX_SEQS:
            valid = False
            problems.append('Too many sequences submitted; current maximum allowed is %d' % MAX_SEQS)

    if valid:
        result_lines = []
        sites_by_line = []
        for seq in seqs:
            try:
                sites = seq2sites(seq)
                sites_by_line.append(sites)
                result_lines.append(sites2str(sites))
            except Exception as e:
                # NOTE(review): a failed conversion adds a line to
                # result_lines but nothing to sites_by_line, so the two
                # lists can fall out of step - confirm that is intended.
                result_lines.append('There was an error: %s' % e)

        results = [Result(name, line) for name, line in zip(names, result_lines)]
        c = Context({'results': results})
from string import translate
import pandas as pd
import re
import sys

# utils lives outside this directory, so the path must be extended first.
sys.path.append('../../scripts')
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
# 'SeqRange' of the first metadata row holds two ';'-separated ranges.
regionstr = metadata.ix[0,'SeqRange']
regionparts = regionstr.split(';')
region1 = range2region(regionparts[0])
region2 = range2region(regionparts[1])

ff = fasta('knight2003.fasta', 'r')
try:
    data = ff.readentries()
finally:
    # Release the handle even when parsing the FASTA file fails.
    ff.close()

hids = []
seqs = []
sites = []
for e in data:
    hids.append(e['name'].split()[0])
    seqs.append(e['sequence'])
    # Each sequence is cut at the first 'GATCACA' and the two pieces are
    # converted separately.
    # NOTE(review): presumably the motif marks where the second region
    # starts - confirm.
    m = re.search(r'GATCACA', e['sequence'])
    if m is None:
        # Fail with the offending entry named instead of an opaque
        # AttributeError on `m.start()`.
        raise ValueError("no 'GATCACA' motif found in sequence %r" % e['name'])
    cut = m.start()
    seq1 = e['sequence'][:cut]
    seq2 = e['sequence'][cut:]
    sites.append(seq2sites(seq1) + seq2sites(seq2))
from oldowan.mtconvert import seq2sites, sites2seq, str2sites from oldowan.fasta import fasta from string import translate import pandas as pd import sys import re sys.path.append('../../scripts') from utils import * ## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) region = range2region(metadata.ix[0, 'SeqRange']) ff = fasta('barbieri2012.fasta', 'r') data = ff.readentries() ff.close() hids = [] groups = [] sites = [] sequences = [] for entry in data: words = entry['name'].split() if entry['sequence'].count('N') > 10: print "Too many Ns in %s (%s), skipping" % (words[0], words[1]) else: hids.append(words[0]) groups.append(words[1]) sites.append(seq2sites(entry['sequence']))
from oldowan.mtconvert import seq2sites, sites2seq, str2sites from oldowan.fasta import fasta from string import translate import pandas as pd import sys sys.path.append('../../scripts') from utils import * ## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) region = range2region(metadata.ix[0, 'SeqRange']) ff = fasta('coia2005.fasta', 'r') data = ff.readentries() ff.close() hids = [] sites = [] for entry in data: words = entry['name'].split() hids.append(words[4]) sites.append(seq2sites(entry['sequence'])) # validate passed_validation = True for i in range(len(sites)): seq1 = data[i]['sequence'] if not seq1 == translate(sites2seq(sites[i], region), None, '-'):
from string import translate import pandas as pd import sys sys.path.append('../../scripts') from utils import * ## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) regionstr = metadata.ix[0,'SeqRange'] regionparts = regionstr.split(';') region1 = range2region(regionparts[0]) region2 = range2region(regionparts[1]) + range2region(regionparts[2]) ff = fasta('Poloni2009HVR1.fasta', 'r') data1 = ff.readentries() ff.close() ff = fasta('Poloni2009HVR2.fasta', 'r') data2 = ff.readentries() ff.close() hvr1 = {} hvr2 = {} for e in data1: k = e['name'].split()[0] hvr1[k] = e['sequence'] for e in data2:
# utils lives outside this directory, so the path must be extended first.
sys.path.append("../../scripts")
from utils import *

## load metadata
metadata = pd.read_csv("metadata.csv", index_col=0)
# Sequenced region, derived from the first metadata row's "SeqRange" value.
region = range2region(metadata.ix[0, "SeqRange"])

# Upper-case the group table's index so ids can be matched regardless of case.
groups = pd.read_csv("barbieri2014_groups.csv", index_col=0)
newindex = [x.upper() for x in groups.index]
groups.index = pd.Index(newindex)

hids = []
seqs = []
for filename in ["barbieri2013a.fasta", "barbieri2013b.fasta", "barbieri2013c.fasta", "barbieri2014.fasta"]:
    ff = fasta(filename, "r")
    try:
        data = ff.readentries()
    finally:
        # Release the handle even when parsing the FASTA file fails.
        ff.close()
    for e in data:
        # Keep only entries with fewer than 20 "N" characters whose id
        # (fifth word of the FASTA name, upper-cased) is in the group table.
        if e["sequence"].count("N") < 20:
            hid = e["name"].split()[4].upper()
            if hid in groups.index:
                hids.append(hid)
                seqs.append(e["sequence"])

sites = []
for s in seqs:
    sites.append(seq2sites(s, ambig_cutoff=20))

## Validate
# Loads 'brandstaetter2004.fasta' and checks, for every entry, that
# converting the sequence with seq2sites() and rebuilding it with
# sites2seq() gives back the same sequence.
from oldowan.mtconvert import seq2sites, sites2seq, str2sites
from oldowan.fasta import fasta
from string import translate
import pandas as pd
import re
import sys

# utils lives outside this directory, so the path must be extended first.
sys.path.append('../../scripts')
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
# Sequenced region, taken from the 'SeqRange' value of the 'Nai' metadata row.
region = range2region(metadata.ix['Nai', 'SeqRange'])

ff = fasta('brandstaetter2004.fasta', 'r')
try:
    data = ff.readentries()
finally:
    # Release the handle even when parsing the FASTA file fails.
    ff.close()

## Validate
passed_validation = True

for entry in data:
    seq1 = entry['sequence']
    # Brandstatter et al. put an N at the end of an unstable poly-C run at the end
    # of HVR3 in 5 samples. This spurious N messes with my conversion utility,
    # so I strip it out.
    if seq1.endswith('NACA'):
        seq1 = seq1[:-4] + 'ACA'
    mysites = seq2sites(seq1)
    # Compare the original with the rebuilt sequence after deleting '-'.
    if not seq1 == translate(sites2seq(mysites, region), None, '-'):
        passed_validation = False
from oldowan.fasta import fasta DLOOPplus = range(15800,16570) + range(1,1500) dloops_fn = os.path.join(here, 'dloops.fasta') ofn = os.path.join(here, 'fail_dloop.txt') fail_fn = os.path.join(here, 'dloop_expect_to_fail.txt') expect_to_fail = [] for line in open(fail_fn, 'U'): expect_to_fail.append(int(line.strip()[:-1])) of = open(ofn, 'w') count = 0 for entry in fasta(dloops_fn): count += 1 if count not in expect_to_fail: try: sites = seq2sites(entry["sequence"]) seq = sites2seq(sites, region=DLOOPplus) seq = seq.replace('-', '') if entry["sequence"] in seq: print count, entry["name"], sites2str(sites) else: print 'FAILED', count, entry["name"] of.write('%d, %s\n' % (count, entry["name"])) except Exception, e: print 'FAILED',e, count, entry["name"] of.write('%d, %s\n' % (count, entry["name"])) of.flush()