# NOTE(review): the original line breaks and indentation of this script have
# been lost; every statement below sits on one physical line.
# As written this is not valid Python: an `if` cannot follow `for ...:` on the
# same line, and everything from `##` to the end of the line is a comment.
#
# The leading `for x in e['features'] ... return None` text looks like the tail
# of a function whose `def` line is outside this view -- presumably
# `getisolate`, which is called further along; confirm against the original.
# It returns y[1] for a tuple y whose first item is 'isolate' inside a feature
# whose first item is 'source', and None otherwise.
#
# The remaining text reads metadata.csv, splits metadata.ix[0, 'SeqRange'] on
# ';' into two parts passed to range2region, reads hvr1.gb with read_genbank,
# and appends to hids1 / groups1 / seqs1 (isolate id, last word of the note,
# sequence). `sites1` is created but not used in the visible text.
# How the appends nest under the two `is not None` checks cannot be determined
# from the collapsed text -- restore from the original file, do not guess.
for x in e['features']: if x[0] == 'source': for y in x[1]: if isinstance(y, tuple): if y[0] == 'isolate': return y[1] return None ## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) regionparts = metadata.ix[0, 'SeqRange'].split(';') region1 = range2region(regionparts[0]) region2 = range2region(regionparts[1]) entries1 = read_genbank('hvr1.gb', what='filename') hids1 = [] groups1 = [] seqs1 = [] sites1 = [] for e in entries1: hid = getisolate(e) if hid is not None: hids1.append(hid) n = getnote(e) if n is not None: parts = n.split() groups1.append(parts[-1]) seqs1.append(e['sequence'])
# NOTE(review): the original line breaks and indentation of this script have
# been lost; every statement below sits on one physical line.
# As written, only `from genbank import read_genbank` is executed: everything
# from `##` to the end of the line is parsed as a comment.
#
# The collapsed text reads metadata.csv and splits metadata.ix[0, 'SeqRange']
# on ';' into two parts passed to range2region. It then fills `counts` from the
# comma-separated lines of vigilant1991_counts.txt (first field -> int of the
# second field) and reads vigilant1991.txt with read_genbank.
# For each entry, the fourth word of e['definition'] (name[3]) is used as the
# isolate id. Entries whose sequence contains more than 10 'n' characters are
# reported with a Python 2 `print` statement; the others are appended to
# hids / seqs.
# `pd` and `range2region` are not imported anywhere in the visible text.
#
# The text ends on `if name[3] == '63':` with no body, so the rest of the loop
# is outside this view and the original layout cannot be reconstructed here.
from genbank import read_genbank ## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) regionstr = metadata.ix[0,'SeqRange'] regionparts = regionstr.split(';') region1 = range2region(regionparts[0]) region2 = range2region(regionparts[1]) counts = {} with open('vigilant1991_counts.txt', 'rU') as f: for line in f: parts = line.strip().split(',') counts[parts[0]] = int(parts[1]) entries = read_genbank('vigilant1991.txt', what='filename') hids = [] pops = [] seqs = [] sites = [] for i in range(len(entries)): e = entries[i] name = e['definition'].split() if e['sequence'].count('n') > 10: print 'skipping isolate %s' % name[3] else: hids.append(name[3]) seqs.append(e['sequence']) if name[3] == '63':
# Go through the GenBank file, extract ids and population attributions,
# drop the Mandenka and Yoruba (already in db from Lippold et al. 2014),
# and write out in FASTA.
import sys

from oldowan.fasta import fasta

# The local helper modules live in the shared scripts directory, so the path
# must be extended before they are imported.
sys.path.append('../../scripts')
from utils import *
from genbank import read_genbank

entries = read_genbank('barbieri2012.gb', what='filename')


def getpop(x):
    """Return the 'pop_variant' value of a GenBank entry, or None.

    Only the first feature of the entry is inspected (``x['features'][0]``,
    presumably the 'source' feature -- confirm against the parser output).
    Its second item is scanned for a tuple whose first item is
    ``'pop_variant'``; the second item of that tuple is returned.

    Returns None when no such tuple is present.
    """
    qualifiers = x['features'][0][1]
    for qualifier in qualifiers:
        # Non-tuple items are ignored.
        if isinstance(qualifier, tuple) and qualifier[0] == 'pop_variant':
            return qualifier[1]
    return None


def getid(x):
    """Return the fourth whitespace-separated word of ``x['definition']``.

    Raises IndexError if the definition has fewer than four words.
    """
    return x['definition'].split()[3]


ff = fasta('barbieri2012.fasta', 'w')
for i, entry in enumerate(entries):
    hid = getid(entry)
    # Ids starting with MAN or YOR are skipped (see the header comment).
    if not hid.startswith('MAN') and not hid.startswith('YOR'):
        pop = getpop(entry)
        seq = entry['sequence'].upper()
        newentry = {'name': "%s %s" % (hid, pop), 'sequence': seq}
        # NOTE(review): the visible source ends here. `newentry` is built but
        # not yet written to `ff` within this view -- presumably the write
        # follows; confirm against the original file.