def test_haplotype_2236():
    """Check that haplotype 2236 survives a sites -> sequence -> sites round trip."""
    sites = str2sites('16126C 16163G 16185.1T 16185.2T 16189d 16294T 16519C')
    seq = sites2seq(sites, region=range(16000, 16570))
    rts = seq2sites(seq)  # rts: round trip sites
    # Single-argument parenthesised print gives the same output as the
    # Python 2 print statement and is also valid Python 3 syntax.
    print('EXP: %s' % sites)
    print('OBS: %s' % rts)
    assert sites == rts
def test_haplotype_2911():
    """Check that haplotype 2911 survives a sites -> sequence -> sites round trip."""
    sites = str2sites('16051G 16129C 16182d 16183d 16193.1C 16193.2C 16362C 16519C')
    seq = sites2seq(sites, region=range(16000, 16570))
    rts = seq2sites(seq)  # rts: round trip sites
    # Single-argument parenthesised print gives the same output as the
    # Python 2 print statement and is also valid Python 3 syntax.
    print('EXP: %s' % sites)
    print('OBS: %s' % rts)
    assert sites == rts
def test_haplotype_3070():
    """Check that haplotype 3070 survives a sites -> sequence -> sites round trip."""
    sites = str2sites('16093C 16183d 16184d 16191.1T 16191.2T 16270T')
    seq = sites2seq(sites, region=range(16000, 16570))
    rts = seq2sites(seq)  # rts: round trip sites
    # Single-argument parenthesised print gives the same output as the
    # Python 2 print statement and is also valid Python 3 syntax.
    print('EXP: %s' % sites)
    print('OBS: %s' % rts)
    assert sites == rts
def test_haplotype_4827():
    """Check that haplotype 4827 survives a sites -> sequence -> sites round trip."""
    sites = str2sites('16172C 16183d 16193.1C 16193.2C 16223T 16320T')
    seq = sites2seq(sites, region=range(16000, 16570))
    rts = seq2sites(seq)  # rts: round trip sites
    # Single-argument parenthesised print gives the same output as the
    # Python 2 print statement and is also valid Python 3 syntax.
    print('EXP: %s' % sites)
    print('OBS: %s' % rts)
    assert sites == rts
Beispiel #5
0
def process_sites2seq(form):
    """Process data submitted in sites2seq form.

    The submission is taken from the ``query`` field of
    ``form.cleaned_data``, or from the uploaded ``file`` when one was
    supplied.  Each line is one entry whose layout is selected by the
    ``format`` field:

    * ``name_and_motif``   -- ``<name> <motif>``
    * ``name_n_and_motif`` -- ``<name> <n> <motif>`` (``n`` must be digits)
    * anything else        -- the whole line is the motif; the entry is
      named ``Seq<line number>`` and ``n`` is 1

    Commas and semicolons are treated as spaces.  When every line parses,
    each motif is converted with ``str2sites`` and ``sites2seq`` and the
    name/sequence pair is repeated ``n`` times.  Formatting and conversion
    problems are collected in ``problems`` and clear the ``valid`` flag.

    NOTE(review): the portion of this function visible here ends after the
    conversion loop and returns nothing; confirm how ``valid``,
    ``problems``, ``pnames`` and ``pseqs`` are meant to reach the caller.
    """

    problems = []
    valid = True

    # first, just assume whatever is in the textarea is the submission
    # even if that may be nothing
    content = form.cleaned_data['query']

    # then check to see if a file was supplied, and if so, replace the
    # previously assumed content with the file data
    upload = form.cleaned_data['file']
    if upload is not None:
        if upload.multiple_chunks():
            # TODO: the original intent was to return with an error here;
            # for now the upload is still read in full below.
            pass
        content = upload.read()

    entry_format = form.cleaned_data['format']
    names = []
    ns = []
    motifs = []
    for count, curr_line in enumerate(content.strip().split('\n'), 1):
        line = re.sub(r'[,;]', ' ', curr_line)

        # defaults, used as-is when the line carries only a motif
        name = 'Seq%s' % count
        n = 1
        motif = line
        if entry_format == 'name_and_motif':
            # Split on runs of spaces so that "name, motif" (separator
            # followed by a space) parses the same as "name,motif".
            fields = re.split(r' +', line, maxsplit=1)
            if len(fields) == 2:
                name, motif = fields
            else:
                valid = False
                msg = 'The entry "%s" is not correctly formatted' % curr_line
                problems.append(msg)
        elif entry_format == 'name_n_and_motif':
            # Same reasoning: "name, 3, motif" must not yield an empty
            # field between the two spaces left by the separators.
            fields = re.split(r' +', line, maxsplit=2)
            if len(fields) == 3:
                name, n, motif = fields
                if re.match(r'^[0-9]+$', n) is None:
                    valid = False
                    problems.append("One of the given 'N's is not a number")
                else:
                    n = int(n)
            else:
                valid = False
                msg = 'The entry "%s" is not correctly formatted' % curr_line
                problems.append(msg)
        names.append(name)
        ns.append(n)
        motifs.append(motif)

    if valid:
        pnames = []
        pseqs = []
        for name, n, motif in zip(names, ns, motifs):
            try:
                sites = str2sites(motif)
                seq = sites2seq(sites, region=form.cleaned_data['output'], add16k=form.cleaned_data['add16k'])
            except Exception as e:
                # any conversion failure is reported rather than raised
                valid = False
                problems.append(e)
            else:
                # repeat the entry n times in the output
                pnames.extend([name] * n)
                pseqs.extend([seq] * n)
Beispiel #6
0
# Read every entry of the FASTA file, then release the handle.
ff = fasta('brandstaetter2004.fasta', 'r')
data = ff.readentries()
ff.close()

## Validate
# Each sequence is converted to sites and back; any entry whose rebuilt
# sequence differs from the input is reported by name.
passed_validation = True

for entry in data:
	seq1 = entry['sequence']
	# Brandstatter et al. put an N at the end of an unstable poly-C run at the end
	#    of HVR3 in 5 samples. This spurious N messes with my conversion utility,
	#    so I strip it out.
	if seq1.endswith('NACA'):
		seq1 = seq1[:-4] + 'ACA'
	mysites = seq2sites(seq1)
	# the rebuilt sequence has its '-' characters removed before comparing
	if translate(sites2seq(mysites, region), None, '-') == seq1:
		continue
	passed_validation = False
	print(entry['name'])

if passed_validation:
	with open('processed.csv', 'w') as f:
		for entry in data:
			name_parts = entry['name'].split()
			origid = name_parts[0]
			key = name_parts[0][:3]
			m = re.search(r'[0-9]+', name_parts[0])
			counter = m.group()
			newid = metadata.ix[key,'NewPrefix'] + counter
			seq = entry['sequence']
			# Brandstatter et al. put an N at the end of an unstable poly-C run at the end
			#    of HVR3 in 5 samples. This spurious N messes with my conversion utility,
Beispiel #7
0
# four sequences are shorter than all the rest, will drop them
# (only entries whose sequence is longer than 350 characters are kept)

for e in data:
	if len(e['sequence']) <= 350:
		continue
	hids.append(e['name'].split()[0])
	seqs.append(e['sequence'])
	sites.append(seq2sites(e['sequence']))

## Validate
# Rebuild every kept sequence from its sites, using the range recorded in
# the metadata for the first two characters of its id, and report the
# index and id of any entry that does not reproduce its input.
passed_validation = True

for i in range(len(sites)):
	hid = hids[i]
	key = hid[:2]
	region = range2region(metadata.ix[key, 'SeqRange'])
	seq = translate(sites2seq(sites[i], region), None, '-')
	if seq != seqs[i]:
		passed_validation = False
		print('%s %s' % (i, hid))

if passed_validation:
	counter = {}
	for k in metadata.index:
		counter[k] = 0
	with open('processed.csv', 'w') as f:
		for i in range(len(sites)):
			hid = hids[i]
			grp = hid[:2]
			mysites = ' '.join([str(x) for x in sites[i]])
			prefix = metadata.ix[grp,'NewPrefix']
			counter[grp] += 1
Beispiel #8
0
	assert len(values) == len(sitenums2)

	variants = []
	for j in range(len(values)):
		if values[j] != '.':
			variants.append(sitenums2[j] + values[j])

	sites2.append(str2sites(' '.join(variants), add16k=True))

newcounts2 = [int(x) for x in counts2]

## Validate variant sites
# Both site collections get the same check: sites -> sequence -> sites.
# When the recovered sites differ, the entry still passes as long as both
# site lists produce the same sequence once '-' characters are removed.
passed_validation = True

for i in range(len(sites)):
	seq = sites2seq(sites[i], region)
	mysites = seq2sites(seq)
	if sites[i] == mysites:
		continue
	if translate(seq, None, '-') != translate(sites2seq(mysites, region), None, '-'):
		passed_validation = False
		print(i)

for i in range(len(sites2)):
	seq = sites2seq(sites2[i], region)
	mysites = seq2sites(seq)
	if sites2[i] == mysites:
		continue
	if translate(seq, None, '-') != translate(sites2seq(mysites, region), None, '-'):
		passed_validation = False
		print(i)

if passed_validation:
Beispiel #9
0
			found = False
			for f in e['features']:
				if f[0] == 'misc_feature' and f[1][1][1] == 'segment 1':
					c = int(f[1][0].split('..')[1]) - 1
					s = e['sequence']
					sites.append(seq2sites(s[:c]) + seq2sites(s[c:]))
					found = True
			if not found:
				print 'problem with isolate %s' % name[3]

# Vigilant GenBank data have variable sequence lengths
# normalize all sites to specified range in the metadata file
mysites = []

for i in range(len(sites)):
	# rebuild each of the two ranges separately with '-' removed, then
	# derive the sites again from those fixed-range sequences
	seq1 = translate(sites2seq(sites[i], region1), None, '-')
	seq2 = translate(sites2seq(sites[i], region2), None, '-')
	s = seq2sites(seq1) + seq2sites(seq2)
	# stored as a single space-separated string per sample
	as_text = [str(x) for x in s]
	mysites.append(' '.join(as_text))

# one counter per metadata row, all starting at zero
counter = {}
for k in metadata.index:
	counter[k] = 0
with open('processed.csv', 'w') as f:
	for i in range(len(sites)):
		hid = hids[i]
		grp = pops[i]
		if grp in metadata.index:
			repeat = 1
			if hid in counts.keys():
				repeat = counts[hid]
Beispiel #10
0
	data = f.readlines()

hids = []
sites = []

# each line: an id followed by whitespace-separated sites
for l in data:
	e = l.strip().split()
	hids.append(e[0])
	sites.append(' '.join(e[1:]))

## Validate
# Round trip every entry; a difference in the recovered sites is only an
# error when the two site lists also rebuild to different sequences.
passed_validation = True

for i in range(len(sites)):
	curr_sites = str2sites(sites[i], add16k=True)
	seq = translate(sites2seq(curr_sites, region), None, '-')
	mysites = seq2sites(seq)
	if mysites == curr_sites:
		continue
	myseq = translate(sites2seq(mysites, region), None, '-')
	if seq != myseq:
		passed_validation = False
		print('%s %s' % (i, hids[i]))

if passed_validation:
	counter = {}
	for k in metadata.index:
		counter[k] = 0
	with open('processed.csv', 'w') as f:
		for i in range(len(sites)):
			hid = hids[i]
			seq = translate(sites2seq(sites[i], region, add16k=True), None, '-') 
Beispiel #11
0
## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix[0,'SeqRange'])

with open('boattini2013.csv', 'rU') as f:
	data = f.readlines()

## Validate variant sites
# Column 1 of each comma-separated line holds the sites.  They are
# compared, in sorted order, with the sites recovered from the rebuilt
# sequence; on a mismatch the two rebuilt sequences are compared instead.
passed_validation = True

for l in data:
	parts = l.strip().split(',')
	sites = ' '.join(sorted(parts[1].split()))
	seq1 = sites2seq(parts[1], region)
	mysites = seq2sites(seq1)
	if sites == ' '.join([str(x) for x in mysites]):
		continue
	if translate(seq1, None, '-') != translate(sites2seq(mysites, region), None, '-'):
		passed_validation = False
		print(l)

if passed_validation:
	with open('processed.csv', 'w') as f:
		for l in data:
			parts = l.strip().split(',')
			origid = parts[0]
			key = parts[0][:3]
			m = re.search(r'[0-9]+', parts[0])
			counter = m.group()
			newid = metadata.ix[key,'NewPrefix'] + counter.zfill(3)
Beispiel #12
0
		if val.startswith('d'):
			s[j].value = '-'
			newsites[i].append(s[j])
			if len(val) > 2:
				p = Polymorphism(s[j].position+1,0,'-')
				newsites[i].append(p)
		elif val.startswith('i'):
			if val.startswith('ii'):
				val = val[1:]
			inserts = list(val[1:])
			for k in range(len(inserts)):
				p = Polymorphism(s[j].position, k+1, inserts[k])
				newsites[i].append(p)
		elif val.startswith('.'):
			pos = s[j].position
			p = Polymorphism(pos, 1, sites2seq('', (pos,pos)))
			newsites[i].append(p)
		else:
			newsites[i].append(s[j])

## Validate
# Round trip every rewritten site list: sites -> sequence -> sites.
# The flag is cleared only when the recovered sites differ AND the two
# site lists rebuild to different sequences.
passed_validation = True

for i in range(len(newsites)):
	curr_sites = newsites[i]
	# sequence rebuilt from the sites, with '-' characters removed
	seq = translate(sites2seq(curr_sites, region), None, '-')
	mysites = seq2sites(seq)
	if not mysites == curr_sites:
		# differing site lists are acceptable if they give the same sequence
		myseq = translate(sites2seq(mysites, region), None, '-')
		if not seq == myseq:
			passed_validation = False
Beispiel #13
0
sites = []

# Columns used: 0 = sample id, 1 = group, 3 = hyphen-separated sites.
with open('tofanelli2009.csv', 'rU') as f:
	reader = csv.reader(f)
	next(reader) # skip past header
	for row in reader:
		hids.append(row[0])
		groups.append(row[1])
		sites_str = ' '.join(row[3].split('-'))
		sites.append(str2sites(sites_str, add16k=True))

## Validate
# Round trip every entry; when the recovered sites differ, the entry still
# passes if both site lists rebuild to the same sequence.
passed_validation = True

for i in range(len(sites)):
	seq = sites2seq(sites[i], region)
	mysites = seq2sites(seq)
	if not mysites == sites[i]:
		myseq = translate(sites2seq(mysites, region), None, '-')
		# myseq has its '-' characters removed, so seq must be stripped the
		# same way; otherwise any '-' in seq makes the comparison fail even
		# when the underlying sequences agree.
		if not translate(seq, None, '-') == myseq:
			passed_validation = False
			print('%s %s' % (i, hids[i]))

counter = {}
for k in metadata.index:
	counter[k] = 0
	
if passed_validation:
	with open('processed.csv', 'w') as f:
		for i in range(len(groups)):
			key = groups[i]
Beispiel #14
0
    row = len(iids) - 1
    counts[row,] = [int(x) for x in parts[1:3]]
    sites.append(" ".join(parts[3:]))

## Validate
passed_validation = True

# there are sites in the source table that are not actual variant sites
# sequence 9 (index 8) has 263A as a variant
# sequence 12 (index 11) has 16223C as a variant
not_polys = [Polymorphism(263, 0, "A"), Polymorphism(16223, 0, "C")]

# Strict check: the sites recovered from region1 followed by those from
# region2 must equal the listed sites with the non-variant entries removed.
for i in range(len(sites)):
    curr_sites = sites[i]
    curr_polys = [x for x in str2sites(curr_sites) if x not in not_polys]
    cseq1 = sites2seq(curr_sites, region1)
    mysites1 = seq2sites(cseq1)
    cseq2 = sites2seq(curr_sites, region2)
    mysites2 = seq2sites(cseq2)
    mysites = mysites1 + mysites2
    if mysites != curr_polys:
        passed_validation = False
        print(iids[i])

if passed_validation:
    counters = [1] * 2
    with open("processed.csv", "w") as f:
        for i in range(len(sites)):
            curr_sites = str2sites(sites[i])
            mysites = [x for x in curr_sites if x not in not_polys]
            mysites = " ".join([str(x) for x in mysites])
Beispiel #15
0
					newrow.append('%s.%s%s' % (loc, str(i+1), nuc[i % len(nuc)]))
			else:
				newrow.append(s)
		else:
			newrow.append(s)
	newsites.append(' '.join(newrow))

## Validate
# Round trip every entry; when the recovered sites differ, the entry still
# passes if both site lists describe the same sequence.
passed_validation = True

for i in range(len(newsites)):
	curr_sites = str2sites(newsites[i])
	# some entries have data outside explicitly sequenced 15900-640 region
	# get rid of extra sites
	curr_sites = [x for x in curr_sites if x.position >= 15900 or x.position <= 640]
	seq = sites2seq(curr_sites, region)
	mysites = seq2sites(seq)
	if not mysites == curr_sites:
		myseq = sites2seq(mysites, region)
		# Two site lists can describe the same sequence while placing an
		# indel at different positions, so compare with '-' removed.
		# NOTE(review): assumes sites2seq returns a plain string that may
		# contain '-' for deleted positions - confirm.
		if not seq.replace('-', '') == myseq.replace('-', ''):
			passed_validation = False
			print('%s %s' % (i, hids[i]))

if passed_validation:
	counter = {}
	for i in metadata.index:
		counter[i] = 0
	with open('processed.csv', 'w') as f:
		for i in range(len(newsites)):
			curr_sites = str2sites(newsites[i])
			# some entries have data outside explicitly sequenced 15900-640 region
Beispiel #16
0
    reader = csv.reader(f)
    reader.next()  # skip past header
    for row in reader:
        if row[4] == "Senegalese":
            hids.append(row[0])
            hvr1.append(str2sites(row[2], add16k=True))
            hvr2.append(str2sites(row[3]))

# one combined site list per sample: HVR1 sites followed by HVR2 sites
for i in range(len(hids)):
    sites.append(hvr1[i] + hvr2[i])

## Validate variant sites
# Round trip every sample; a difference in the recovered sites is only an
# error when the two site lists also rebuild to different sequences
# (compared with "-" characters removed).
passed_validation = True

for i in range(len(hids)):
    seq = sites2seq(sites[i], region)
    mysites = seq2sites(seq)
    if sites[i] == mysites:
        continue
    if translate(seq, None, "-") != translate(sites2seq(mysites, region), None, "-"):
        passed_validation = False
        print(i)


if passed_validation:
    counter = 0
    prefix = metadata.ix[0, "NewPrefix"]
    with open("processed.csv", "w") as f:
        for i in range(len(hids)):
            counter = counter + 1
            newid = prefix + str(counter).zfill(3)
            seq = sites2seq(sites[i], region)
Beispiel #17
0
grps = []
hids = []
sites = []

# comma-separated columns used: 1 = group, 4 = sample id, 5 = sites
for l in data:
	e = l.strip().split(',')
	grps.append(e[1])
	hids.append(e[4])
	sites.append(e[5])

## Validate
# Round trip every entry (the rebuilt sequence is upper-cased first); a
# difference in the recovered sites is only an error when the two site
# lists also rebuild to different sequences.
passed_validation = True

for i in range(len(sites)):
	curr_sites = str2sites(sites[i])
	seq = translate(sites2seq(curr_sites, region), None, '-').upper()
	mysites = seq2sites(seq)
	if mysites == curr_sites:
		continue
	myseq = translate(sites2seq(mysites, region), None, '-')
	if seq != myseq:
		passed_validation = False
		print(i)


if passed_validation:
	counter = {}
	for k in metadata.index:
		counter[k] = 0
	with open('processed.csv', 'w') as f:
		for i in range(len(sites)):
			grp = grps[i]
Beispiel #18
0
# some of the sequences are short. Many are just missing a base or two
# from the beginning or the end - will keep those
# a few are missing large chunks of the end, so will drop those
ok = [41, 42, 43, 44, 54, 55, 83, 85, 89, 136, 137, 157, 178]
skip = [8, 123, 124]

# validate
# Entries listed in either ok or skip are exempt; every other sequence
# (upper-cased) must equal the sequence rebuilt from its sites.
passed_validation = True

for i in range(len(sites)):
	if i in ok or i in skip:
		continue
	seq1 = data[i]['sequence'].upper()
	if seq1 != translate(sites2seq(sites[i], region), None, '-'):
		passed_validation = False
		print('%s %s' % (i, hids[i]))

# one counter per group name, all starting at zero
counter = {}
for k in metadata.GroupName:
	counter[k] = 0

if passed_validation:
	with open('processed.csv', 'w') as f:
		for i in range(len(sites)):
			if i not in skip:
				hid = hids[i]
				key = None
				prefix = None
				for pattern in metadata.index:
Beispiel #19
0
            m = re.match(r"([0-9]+)[A-Z]/([A-Z])", s)
            s = "%s%s" % m.groups()
        elif ".1" in s:
            s = s + "C"
        elif ".2" in s:
            h2f.append(s[:-1] + "1C")
            s = s + "C"
        h2f.append(s)
    hvr2[i] = " ".join(h2f)

## Validate
# HVR1 and HVR2 are round-tripped separately for every entry.  A segment
# fails only when the recovered sites differ and the two site lists also
# rebuild to different sequences; failures print the index and segment.
passed_validation = True

for i in range(len(freq)):
    # HVR1
    curr_sites = str2sites(hvr1[i], add16k=True)
    seq = translate(sites2seq(curr_sites, region1), None, "-")
    mysites = seq2sites(seq)
    if mysites != curr_sites:
        myseq = translate(sites2seq(mysites, region1), None, "-")
        if seq != myseq:
            passed_validation = False
            print("%s %s" % (i, "hvr1"))
    # HVR2
    curr_sites = str2sites(hvr2[i])
    seq = translate(sites2seq(curr_sites, region2), None, "-")
    mysites = seq2sites(seq)
    if mysites != curr_sites:
        myseq = translate(sites2seq(mysites, region2), None, "-")
        if seq != myseq:
            passed_validation = False
            print("%s %s" % (i, "hvr2"))
Beispiel #20
0
# ids excluded from processing
drop = ['San_43', 'San_67', 'tzbg040', 'tzdt045', 'tzhz108', 'tzhz130', 'tzhz131']

hids = []
seqs = []

# the id is the first whitespace-separated token of the entry name
for e in data:
	name_parts = e['name'].split()
	if name_parts[0] in drop:
		continue
	hids.append(name_parts[0])
	seqs.append(e['sequence'])

## Validate
# Every kept sequence must be reproduced by converting it to sites and back.
passed_validation = True

for i in range(len(seqs)):
	mysites = seq2sites(seqs[i])
	myseq = translate(sites2seq(mysites, region), None, '-')
	if seqs[i] != myseq:
		passed_validation = False
		print('%s %s' % (i, hids[i]))

# Output is written only when every sequence validated.
if passed_validation:
	with open('processed.csv', 'w') as f:
		for i in range(len(seqs)):
			mysites = ' '.join([str(x) for x in seq2sites(seqs[i])])
			origid = hids[i]
			# new id: prefix looked up by the first four characters of the
			# original id, plus the part after them (up to any '_'),
			# zero-padded to three characters
			prefix = metadata.ix[origid[:4],'NewPrefix']
			num = origid[4:].split('_')[0].zfill(3)
			newid = prefix + num
			f.write('%s,%s,%s\n' % (newid, origid, mysites))
Beispiel #21
0
# comma-separated columns used: 0 = id, 2 = sites, 4 onwards = counts
for i in range(len(data)):
	x = data[i].strip().split(',')
	hids.append(x[0])
	sites.append(x[2])
	count = x[4:]
	# an empty cell among the first five count columns is treated as zero
	for j in range(5):
		if count[j] == '':
			count[j] = '0'
	counts[i,] = [int(y) for y in count]

## Validate
# Round trip every entry; when the recovered sites differ, the entry still
# passes if both site lists rebuild to the same sequence.
passed_validation = True

for i in range(len(sites)):
	curr_sites = str2sites(sites[i])
	seq = sites2seq(curr_sites, region)
	mysites = seq2sites(seq)
	if not mysites == curr_sites:
		myseq = translate(sites2seq(mysites, region), None, '-')
		# myseq has its '-' characters removed, so seq must be stripped the
		# same way; otherwise any '-' in seq makes the comparison fail even
		# when the underlying sequences agree.
		if not translate(seq, None, '-') == myseq:
			passed_validation = False
			print('%s %s' % (i, hids[i]))

if passed_validation:
	counter = [0] * 5
	with open('processed.csv', 'w') as f:
		for i in range(len(sites)):
			hid = hids[i]
			curr_sites = str2sites(sites[i])
			seq = sites2seq(curr_sites, region)
			mysites = ' '.join([str(x) for x in seq2sites(seq)])
Beispiel #22
0
groups = []
sites = []

# Columns used: 0 = sample id, 1 = group, 3 = sites.
with open('defilippo2010.csv', 'rU') as f:
	reader = csv.reader(f)
	next(reader) # skip past header
	for row in reader:
		hids.append(row[0])
		groups.append(row[1])
		sites.append(str2sites(row[3]))

## Validate
# Round trip every entry; when the recovered sites differ, the entry still
# passes if both site lists rebuild to the same sequence.
passed_validation = True

for i in range(len(sites)):
	seq = sites2seq(sites[i], region)
	mysites = seq2sites(seq)
	if not mysites == sites[i]:
		myseq = translate(sites2seq(mysites, region), None, '-')
		# myseq has its '-' characters removed, so seq must be stripped the
		# same way; otherwise any '-' in seq makes the comparison fail even
		# when the underlying sequences agree.
		if not translate(seq, None, '-') == myseq:
			passed_validation = False
			print('%s %s' % (i, hids[i]))

counter = {}
for k in metadata.index:
	counter[k] = 0
	
if passed_validation:
	with open('processed.csv', 'w') as f:
		for i in range(len(groups)):
			key = groups[i]
Beispiel #23
0
			elif j == 84:
				y.append('309.2C')
			elif j == 85:
				y.append('313.1C')
			elif j == 86:
				y.append('315.1C')
			else:
				y.append('%s%s' % (positions[j], x[j]))
	sites.append(' '.join(y))

## Validate
# Each entry is round-tripped over region1 and region2 separately and the
# recovered sites are concatenated.  When they differ from the listed
# sites, the entry still passes if both rebuild to the same sequences.
passed_validation = True

for i in range(len(sites)):
	curr_sites = str2sites(sites[i])
	cseq1 = sites2seq(curr_sites, region1)
	cseq2 = sites2seq(curr_sites, region2)
	mysites1 = seq2sites(cseq1)
	mysites2 = seq2sites(cseq2)
	mysites = mysites1 + mysites2
	if not mysites == curr_sites:
		seq = cseq1 + cseq2
		myseq = translate(sites2seq(mysites, region1), None, '-') + translate(sites2seq(mysites, region2), None, '-')
		# myseq has its '-' characters removed, so seq must be stripped the
		# same way; otherwise any '-' in seq makes the comparison fail even
		# when the underlying sequences agree.
		if not translate(seq, None, '-') == myseq:
			passed_validation = False
			print('%s %s' % (i, hids[i]))

if passed_validation:
	count = 0
	prefix = metadata.ix[0,'NewPrefix']
	with open('processed.csv', 'w') as f: