Beispiel #1
0
	for x in e['features']:
		if x[0] == 'source':
			for y in x[1]:
				if isinstance(y, tuple):
					if y[0] == 'isolate':
						return y[1]
	return None


## load metadata
# Read metadata.csv, using its first column as the row index.
metadata = pd.read_csv('metadata.csv', index_col=0)
# Take the 'SeqRange' cell of row 0 and split it on ';'; the first two pieces
# are converted below.
# NOTE(review): DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0.
# Confirm whether row 0 is meant positionally or as an index label before
# porting this to .iloc / .loc.
regionparts = metadata.ix[0, 'SeqRange'].split(';')
# range2region is defined elsewhere; each range string is converted on its own.
region1 = range2region(regionparts[0])
region2 = range2region(regionparts[1])

# Parse the records in 'hvr1.gb' (read_genbank is defined elsewhere;
# what='filename' presumably marks the first argument as a path -- TODO confirm).
entries1 = read_genbank('hvr1.gb', what='filename')

# Accumulators for the values pulled out of each entry.
hids1 = []
groups1 = []
seqs1 = []
sites1 = []  # not appended to in the visible part of the loop below

for e in entries1:
	# Keep the isolate id only when getisolate() returns something.
	hid = getisolate(e)
	if hid is not None:
		hids1.append(hid)
	# The last whitespace-separated word of the note is stored as the group.
	n = getnote(e)
	if n is not None:
		parts = n.split()
		groups1.append(parts[-1])
	# NOTE(review): the sequence is appended for every entry, whereas hids1 and
	# groups1 are appended conditionally, so the three lists can end up with
	# different lengths -- confirm this is intended.
	seqs1.append(e['sequence'])
Beispiel #2
0
from genbank import read_genbank

## load metadata
# Read metadata.csv, using its first column as the row index.
metadata = pd.read_csv('metadata.csv', index_col=0)
# NOTE(review): DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0.
# Confirm whether row 0 is meant positionally or as an index label before
# porting this to .iloc / .loc.
regionstr = metadata.ix[0,'SeqRange']
# The cell is split on ';'; the first two pieces are converted below.
regionparts = regionstr.split(';')
# range2region is defined elsewhere; each range string is converted on its own.
region1 = range2region(regionparts[0])
region2 = range2region(regionparts[1])

# Build a mapping from the first comma-separated field of each line to the
# integer value of the second field.
counts = {}
# 'rU' requests universal-newline handling (Python 2).
# NOTE(review): the 'U' mode flag is rejected by Python 3.11+; drop it when
# porting.
with open('vigilant1991_counts.txt', 'rU') as f:
	for line in f:
		# NOTE(review): a blank line or a line without a comma raises IndexError
		# on parts[1]; a non-numeric second field raises ValueError.
		parts = line.strip().split(',')
		counts[parts[0]] = int(parts[1])

# Parse the records in 'vigilant1991.txt' (read_genbank is imported from the
# local genbank module; what='filename' presumably marks the first argument
# as a path -- TODO confirm).
entries = read_genbank('vigilant1991.txt', what='filename')

# Accumulators for the loop that follows; its visible part appends to hids
# and seqs.
hids = []
pops = []
seqs = []
sites = []

for i in range(len(entries)):
	e = entries[i]
	name = e['definition'].split()
	if e['sequence'].count('n') > 10:
		print 'skipping isolate %s' % name[3]
	else:
		hids.append(name[3])
		seqs.append(e['sequence'])
		if name[3] == '63':
Beispiel #3
0
# Go through the GenBank file and extract ids and population attributions,
# drop the Mandenka and Yoruba samples (already in the db from Lippold et al. 2014),
# and write the remaining sequences out in FASTA format.
import sys
from oldowan.fasta import fasta

sys.path.append('../../scripts')
from utils import *
from genbank import read_genbank

# Parse the records in 'barbieri2012.gb' (what='filename' presumably marks the
# first argument as a path -- TODO confirm against genbank.read_genbank).
entries = read_genbank('barbieri2012.gb', what='filename')

def getpop(x):
	"""Return the 'pop_variant' value of an entry's first feature, or None.

	The second element of x['features'][0] (presumably the GenBank 'source'
	feature) is scanned in order. Only tuple items are considered; the
	second element of the first tuple whose first element equals
	'pop_variant' is returned. If nothing matches, the result is None.
	"""
	qualifiers = x['features'][0][1]
	hits = (q[1] for q in qualifiers if isinstance(q, tuple) and q[0] == 'pop_variant')
	return next(hits, None)

def getid(x):
	"""Return the fourth whitespace-separated word of x['definition'].

	Raises IndexError when the definition has fewer than four words.
	"""
	return x['definition'].split()[3]

# Open 'barbieri2012.fasta' in 'w' mode through oldowan's fasta wrapper.
ff = fasta('barbieri2012.fasta', 'w')

for i in range(len(entries)):
	hid = getid(entries[i])
	# Skip ids starting with 'MAN' or 'YOR' (the Mandenka and Yoruba samples the
	# file header says are already in the db).
	if not hid.startswith('MAN') and not hid.startswith('YOR'):
		pop = getpop(entries[i])
		# Sequences are upper-cased before being stored in the record.
		seq = entries[i]['sequence'].upper()
		# Record name is "<id> <population>".
		# NOTE(review): getpop() returns None when no 'pop_variant' is found, in
		# which case the name ends in the literal text "None".
		newentry = {'name': "%s %s" % (hid, pop), 'sequence':seq}