def test_haplotype_2236(): sites = str2sites('16126C 16163G 16185.1T 16185.2T 16189d 16294T 16519C') seq = sites2seq(sites, region=range(16000,16570)) rts = seq2sites(seq) # rts: round trip sites print 'EXP: %s' % sites print 'OBS: %s' % rts assert sites == rts
def test_haplotype_2911(): sites = str2sites('16051G 16129C 16182d 16183d 16193.1C 16193.2C 16362C 16519C') seq = sites2seq(sites, region=range(16000,16570)) rts = seq2sites(seq) # rts: round trip sites print 'EXP: %s' % sites print 'OBS: %s' % rts assert sites == rts
def test_haplotype_3070(): sites = str2sites('16093C 16183d 16184d 16191.1T 16191.2T 16270T') seq = sites2seq(sites, region=range(16000,16570)) rts = seq2sites(seq) # rts: round trip sites print 'EXP: %s' % sites print 'OBS: %s' % rts assert sites == rts
def test_haplotype_4827(): sites = str2sites('16172C 16183d 16193.1C 16193.2C 16223T 16320T') seq = sites2seq(sites, region=range(16000,16570)) rts = seq2sites(seq) # rts: round trip sites print 'EXP: %s' % sites print 'OBS: %s' % rts assert sites == rts
def process_sites2seq(form): """Process data submitted in sites2seq form""" problems = [] valid = True # first, just assume whatever is in the textarea is the submission # even if that may be nothing content = form.cleaned_data['query'] # then check to see if a file was supplied, and if so, replace the # previously assumed content with the file data if form.cleaned_data['file'] is not None: if form.cleaned_data['file'].multiple_chunks(): pass # error - return with error content = form.cleaned_data['file'].read() content_lines = content.strip().split('\n') names = [] ns = [] motifs = [] count = 0 for curr_line in content_lines: line = re.sub(r'[,;]', ' ', curr_line) count += 1 name = 'Seq%s' % count n = 1 motif = line if form.cleaned_data['format'] == 'name_and_motif': split = line.split(' ', 1) if len(split) == 2: name, motif = split else: valid = False msg = 'The entry "%s" is not correctly formatted' % curr_line problems.append(msg) elif form.cleaned_data['format'] == 'name_n_and_motif': split = line.split(' ', 2) if len(split) == 3: name, n, motif = split if re.match(r'^[0-9]+$', n) is None: valid = False problems.append("One of the given 'N's is not a number") else: n = int(n) else: valid = False msg = 'The entry "%s" is not correctly formatted' % curr_line problems.append(msg) names.append(name) ns.append(n) motifs.append(motif) if valid: pnames = [] pseqs = [] for name,n,motif in zip(names,ns,motifs): try: sites = str2sites(motif) seq = sites2seq(sites, region=form.cleaned_data['output'], add16k=form.cleaned_data['add16k']) for i in range(n): pnames.append(name) pseqs.append(seq) except Exception, e: valid = False problems.append(e)
# Load the Brandstaetter 2004 FASTA entries, validate that sites derived from
# each sequence round-trip back to the same sequence, then begin writing the
# processed CSV.  (Chunk is cut off mid-loop at the end.)
ff = fasta('brandstaetter2004.fasta', 'r')
data = ff.readentries()
ff.close()

## Validate
passed_validation = True
for entry in data:
    seq1 = entry['sequence']
    # Brandstatter et al. put an N at the end of an unstable poly-C run at the end
    # of HVR3 in 5 samples. This spurious N messes with my conversion utility,
    # so I strip it out.
    if seq1.endswith('NACA'):
        seq1 = seq1[:-4] + 'ACA'
    mysites = seq2sites(seq1)
    # round-trip: sites -> sequence (gaps stripped) must equal the original
    if not seq1 == translate(sites2seq(mysites, region), None, '-'):
        passed_validation = False
        print entry['name']

if passed_validation:
    with open('processed.csv', 'w') as f:
        for entry in data:
            name_parts = entry['name'].split()
            origid = name_parts[0]
            # first three characters of the original id select the metadata row
            key = name_parts[0][:3]
            m = re.search(r'[0-9]+', name_parts[0])
            counter = m.group()
            newid = metadata.ix[key,'NewPrefix'] + counter
            seq = entry['sequence']
            # Brandstatter et al. put an N at the end of an unstable poly-C run at the end
            # of HVR3 in 5 samples. This spurious N messes with my conversion utility,
            # NOTE(review): chunk truncated here; the N-stripping and the
            # f.write(...) presumably follow outside the visible region.
# four sequences are shorter than all the rest, will drop them for e in data: if len(e['sequence']) > 350: hids.append(e['name'].split()[0]) seqs.append(e['sequence']) sites.append(seq2sites(e['sequence'])) ## Validate passed_validation = True for i in range(len(sites)): hid = hids[i] key = hid[:2] region = range2region(metadata.ix[key, 'SeqRange']) seq = translate(sites2seq(sites[i], region), None, '-') if not seq == seqs[i]: passed_validation = False print i, hids[i] if passed_validation: counter = {} for k in metadata.index: counter[k] = 0 with open('processed.csv', 'w') as f: for i in range(len(sites)): hid = hids[i] grp = hid[:2] mysites = ' '.join([str(x) for x in sites[i]]) prefix = metadata.ix[grp,'NewPrefix'] counter[grp] += 1
# NOTE(review): this chunk begins mid-loop — the statements up to
# `newcounts2 = ...` appear to belong to an enclosing `for` over input rows
# whose header is outside the visible region.
    assert len(values) == len(sitenums2)
    variants = []
    # '.' means "reference allele at this site"; anything else is a variant
    for j in range(len(values)):
        if values[j] != '.':
            variants.append(sitenums2[j] + values[j])
    sites2.append(str2sites(' '.join(variants), add16k=True))
newcounts2 = [int(x) for x in counts2]

## Validate variant sites
passed_validation = True
for i in range(len(sites)):
    seq = sites2seq(sites[i], region)
    mysites = seq2sites(seq)
    if not sites[i] == mysites:
        # site lists may differ in representation; accept if the rebuilt
        # sequences (gaps stripped) still match
        if not translate(seq, None, '-') == translate(sites2seq(mysites, region), None, '-'):
            passed_validation = False
            print i
for i in range(len(sites2)):
    seq = sites2seq(sites2[i], region)
    mysites = seq2sites(seq)
    if not sites2[i] == mysites:
        if not translate(seq, None, '-') == translate(sites2seq(mysites, region), None, '-'):
            passed_validation = False
            print i
if passed_validation:
    # NOTE(review): chunk truncated here; the export block presumably follows.
# NOTE(review): this chunk appears to begin inside a loop over GenBank
# entries (`e` is used but not defined here); the loop header is outside
# the visible region.
found = False
for f in e['features']:
    # split the sequence at the end of the annotated 'segment 1' feature and
    # convert each segment to sites independently
    if f[0] == 'misc_feature' and f[1][1][1] == 'segment 1':
        c = int(f[1][0].split('..')[1]) - 1
        s = e['sequence']
        sites.append(seq2sites(s[:c]) + seq2sites(s[c:]))
        found = True
if not found:
    print 'problem with isolate %s' % name[3]

# Vigilant GenBank data have variable sequence lengths
# normalize all sites to specified range in the metadata file
mysites = []
for i in range(len(sites)):
    seq1 = translate(sites2seq(sites[i], region1), None, '-')
    seq2 = translate(sites2seq(sites[i], region2), None, '-')
    s = seq2sites(seq1) + seq2sites(seq2)
    mysites.append(' '.join([str(x) for x in s]))

counter = {}
for k in metadata.index:
    counter[k] = 0
with open('processed.csv', 'w') as f:
    for i in range(len(sites)):
        hid = hids[i]
        grp = pops[i]
        if grp in metadata.index:
            # samples may appear multiple times; `counts` gives the repeat count
            repeat = 1
            if hid in counts.keys():
                repeat = counts[hid]
                # NOTE(review): chunk truncated here; the write loop presumably
                # follows outside the visible region.
# Parse whitespace-delimited lines of "id site site ...", validate each site
# list by round-tripping through a sequence, then begin the export.
# (Chunk is cut off mid-loop at the end; `f` and `region` come from outside.)
data = f.readlines()
hids = []
sites = []
for l in data:
    e = l.strip().split()
    hids.append(e[0])
    sites.append(' '.join(e[1:]))

## Validate
passed_validation = True
for i in range(len(sites)):
    curr_sites = str2sites(sites[i], add16k=True)
    seq = translate(sites2seq(curr_sites, region), None, '-')
    mysites = seq2sites(seq)
    if not mysites == curr_sites:
        # accept differing site representations if the sequences still agree
        myseq = translate(sites2seq(mysites, region), None, '-')
        if not seq == myseq:
            passed_validation = False
            print i, hids[i]

if passed_validation:
    counter = {}
    for k in metadata.index:
        counter[k] = 0
    with open('processed.csv', 'w') as f:
        for i in range(len(sites)):
            hid = hids[i]
            # NOTE(review): sites[i] is the raw motif string here, while the
            # validation pass fed str2sites(...) output to sites2seq — confirm
            # sites2seq accepts both forms.
            seq = translate(sites2seq(sites[i], region, add16k=True), None, '-')
            # NOTE(review): chunk truncated here.
## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) region = range2region(metadata.ix[0,'SeqRange']) with open('boattini2013.csv', 'rU') as f: data = f.readlines() ## Validate variant sites passed_validation = True for l in data: parts = l.strip().split(',') sites = parts[1].split() sites.sort() sites = ' '.join(sites) seq1 = sites2seq(parts[1], region) mysites = seq2sites(seq1) if not sites == ' '.join([str(x) for x in mysites]): if not translate(seq1, None, '-') == translate(sites2seq(mysites, region), None, '-'): passed_validation = False print l if passed_validation: with open('processed.csv', 'w') as f: for l in data: parts = l.strip().split(',') origid = parts[0] key = parts[0][:3] m = re.search(r'[0-9]+', parts[0]) counter = m.group() newid = metadata.ix[key,'NewPrefix'] + counter.zfill(3)
# NOTE(review): this chunk begins mid-loop — `i`, `j`, `s`, `val`, and
# `newsites` are bound by enclosing loops whose headers are outside the
# visible region.  The branch below recodes shorthand suffixes into
# Polymorphism objects.
        if val.startswith('d'):
            # 'd' = deletion at this position
            s[j].value = '-'
            newsites[i].append(s[j])
            if len(val) > 2:
                # longer code: deletion spans the following position too
                p = Polymorphism(s[j].position+1,0,'-')
                newsites[i].append(p)
        elif val.startswith('i'):
            # 'i' = insertion; 'ii' doubles the inserted run
            if val.startswith('ii'):
                val = val[1:]
            inserts = list(val[1:])
            for k in range(len(inserts)):
                p = Polymorphism(s[j].position, k+1, inserts[k])
                newsites[i].append(p)
        elif val.startswith('.'):
            # '.' = reference base; take it from the rCRS at this position
            pos = s[j].position
            p = Polymorphism(pos, 1, sites2seq('', (pos,pos)))
            newsites[i].append(p)
        else:
            newsites[i].append(s[j])

## Validate
passed_validation = True
for i in range(len(newsites)):
    curr_sites = newsites[i]
    seq = translate(sites2seq(curr_sites, region), None, '-')
    mysites = seq2sites(seq)
    if not mysites == curr_sites:
        # accept differing representations if rebuilt sequences still agree
        myseq = translate(sites2seq(mysites, region), None, '-')
        if not seq == myseq:
            passed_validation = False
            # NOTE(review): chunk truncated here.
# Read Tofanelli 2009 CSV (sites are '-'-delimited in column 3), validate by
# round-tripping each site list through a sequence, then begin the export.
# (Chunk is cut off mid-loop at the end; `hids`/`groups` come from outside.)
sites = []
with open('tofanelli2009.csv', 'rU') as f:
    reader = csv.reader(f)
    reader.next() # skip past header
    for row in reader:
        hids.append(row[0])
        groups.append(row[1])
        sites_str = ' '.join(row[3].split('-'))
        sites.append(str2sites(sites_str, add16k=True))

## Validate
passed_validation = True
for i in range(len(sites)):
    seq = sites2seq(sites[i], region)
    mysites = seq2sites(seq)
    if not mysites == sites[i]:
        # accept differing representations if rebuilt sequences still agree
        myseq = translate(sites2seq(mysites, region), None, '-')
        if not seq == myseq:
            passed_validation = False
            print i, hids[i]

counter = {}
for k in metadata.index:
    counter[k] = 0

if passed_validation:
    with open('processed.csv', 'w') as f:
        for i in range(len(groups)):
            key = groups[i]
            # NOTE(review): chunk truncated here.
# NOTE(review): this chunk begins mid-loop — `parts`, `iids`, `counts` are
# bound by an enclosing input loop whose header is outside the visible region.
        row = len(iids) - 1
        counts[row,] = [int(x) for x in parts[1:3]]
        sites.append(" ".join(parts[3:]))

## Validate
passed_validation = True
# there are sites in the source table that are not actual variant sites
# sequence 9 (index 8) has 263A as a variant
# sequence 12 (index 11) has 16223C as a variant
not_polys = [Polymorphism(263, 0, "A"), Polymorphism(16223, 0, "C")]
for i in range(len(sites)):
    curr_sites = sites[i]
    # expected sites with the known-spurious entries filtered out
    curr_polys = [x for x in str2sites(curr_sites) if x not in not_polys]
    cseq1 = sites2seq(curr_sites, region1)
    cseq2 = sites2seq(curr_sites, region2)
    mysites1 = seq2sites(cseq1)
    mysites2 = seq2sites(cseq2)
    mysites = mysites1 + mysites2
    if not mysites == curr_polys:
        passed_validation = False
        print iids[i]

if passed_validation:
    counters = [1] * 2
    with open("processed.csv", "w") as f:
        for i in range(len(sites)):
            curr_sites = str2sites(sites[i])
            mysites = [x for x in curr_sites if x not in not_polys]
            mysites = " ".join([str(x) for x in mysites])
            # NOTE(review): chunk truncated here; the f.write(...) presumably
            # follows outside the visible region.
# NOTE(review): this chunk begins mid-`if`-chain inside enclosing loops —
# `newrow`, `loc`, `i`, `nuc`, `s`, `newsites` are bound outside the visible
# region.  The fragment renumbers insertion sites.
            newrow.append('%s.%s%s' % (loc, str(i+1), nuc[i % len(nuc)]))
        else:
            newrow.append(s)
    else:
        newrow.append(s)
newsites.append(' '.join(newrow))

## Validate
passed_validation = True
for i in range(len(newsites)):
    curr_sites = str2sites(newsites[i])
    # some entries have data outside explicitly sequenced 15900-640 region
    # get rid of extra sites
    # (the region wraps the rCRS origin, hence the `or` between the bounds)
    curr_sites = [x for x in curr_sites if x.position >= 15900 or x.position <= 640]
    seq = sites2seq(curr_sites, region)
    mysites = seq2sites(seq)
    if not mysites == curr_sites:
        # accept differing representations if the rebuilt sequences agree
        myseq = sites2seq(mysites, region)
        if not seq == myseq:
            passed_validation = False
            print i, hids[i]

if passed_validation:
    counter = {}
    for i in metadata.index:
        counter[i] = 0
    with open('processed.csv', 'w') as f:
        for i in range(len(newsites)):
            curr_sites = str2sites(newsites[i])
            # some entries have data outside explicitly sequenced 15900-640 region
            # NOTE(review): chunk truncated here; the same positional filter
            # presumably repeats outside the visible region.
# Keep only the "Senegalese" rows, parse HVR1 (16k offset) and HVR2 site
# motifs separately, concatenate them per sample, validate, then export.
# (`f`, `hids`, `hvr1`, `hvr2`, `sites`, `region`, `metadata` come from
# outside the visible region; chunk is cut off mid-loop at the end.)
reader = csv.reader(f)
reader.next() # skip past header
for row in reader:
    if row[4] == "Senegalese":
        hids.append(row[0])
        hvr1.append(str2sites(row[2], add16k=True))
        hvr2.append(str2sites(row[3]))
for i in range(len(hids)):
    sites.append(hvr1[i] + hvr2[i])

## Validate variant sites
passed_validation = True
for i in range(len(hids)):
    seq = sites2seq(sites[i], region)
    mysites = seq2sites(seq)
    if not sites[i] == mysites:
        # accept differing representations if rebuilt sequences still agree
        if not translate(seq, None, "-") == translate(sites2seq(mysites, region), None, "-"):
            passed_validation = False
            print i

if passed_validation:
    counter = 0
    prefix = metadata.ix[0, "NewPrefix"]
    with open("processed.csv", "w") as f:
        for i in range(len(hids)):
            counter = counter + 1
            newid = prefix + str(counter).zfill(3)
            seq = sites2seq(sites[i], region)
            # NOTE(review): chunk truncated here.
# Parse comma-separated rows (group in column 1, id in column 4, sites in
# column 5), validate via sequence round-trip, then begin the export.
# (`data`, `region`, `metadata` come from outside; chunk is cut off at the end.)
grps = []
hids = []
sites = []
for l in data:
    e = l.strip().split(',')
    grps.append(e[1])
    hids.append(e[4])
    sites.append(e[5])

## Validate
passed_validation = True
for i in range(len(sites)):
    curr_sites = str2sites(sites[i])
    # .upper() here: source sequences are compared case-insensitively
    seq = translate(sites2seq(curr_sites, region), None, '-').upper()
    mysites = seq2sites(seq)
    if not mysites == curr_sites:
        # accept differing representations if rebuilt sequences still agree
        myseq = translate(sites2seq(mysites, region), None, '-')
        if not seq == myseq:
            passed_validation = False
            print i

if passed_validation:
    counter = {}
    for k in metadata.index:
        counter[k] = 0
    with open('processed.csv', 'w') as f:
        for i in range(len(sites)):
            grp = grps[i]
            # NOTE(review): chunk truncated here.
# some of the sequences are short. Many are just missing a base or two # from the beginning or the end - will keep those # a few are missing large chunks of the end, so will drop those ok = [41, 42, 43, 44, 54, 55, 83, 85, 89, 136, 137, 157, 178] skip = [8, 123, 124] # validate passed_validation = True for i in range(len(sites)): if i in ok or i in skip: pass else: seq1 = data[i]['sequence'].upper() if not seq1 == translate(sites2seq(sites[i], region), None, '-'): passed_validation = False print i, hids[i] counter = {} for k in metadata.GroupName: counter[k] = 0 if passed_validation: with open('processed.csv', 'w') as f: for i in range(len(sites)): if i not in skip: hid = hids[i] key = None prefix = None for pattern in metadata.index:
# NOTE(review): this chunk begins mid-loop — `s`, `h2f`, `hvr2`, `i`, `freq`
# are bound by an enclosing loop whose header is outside the visible region.
# The fragment normalizes HVR2 site tokens (heteroplasmy "123X/Y" and bare
# ".1"/".2" insertion codes get an explicit base appended).
        m = re.match(r"([0-9]+)[A-Z]/([A-Z])", s)
        s = "%s%s" % m.groups()
    elif ".1" in s:
        s = s + "C"
    elif ".2" in s:
        # a ".2" insertion implies the ".1" insertion as well
        h2f.append(s[:-1] + "1C")
        s = s + "C"
    h2f.append(s)
hvr2[i] = " ".join(h2f)

## Validate
passed_validation = True
for i in range(len(freq)):
    curr_sites = str2sites(hvr1[i], add16k=True)
    seq = translate(sites2seq(curr_sites, region1), None, "-")
    mysites = seq2sites(seq)
    if not mysites == curr_sites:
        # accept differing representations if rebuilt sequences still agree
        myseq = translate(sites2seq(mysites, region1), None, "-")
        if not seq == myseq:
            passed_validation = False
            print i, "hvr1"
    curr_sites = str2sites(hvr2[i])
    seq = translate(sites2seq(curr_sites, region2), None, "-")
    mysites = seq2sites(seq)
    if not mysites == curr_sites:
        myseq = translate(sites2seq(mysites, region2), None, "-")
        if not seq == myseq:
            passed_validation = False
            print i, "hvr2"
# Drop known-bad samples by name, validate every remaining sequence by a
# sites round-trip, then write the processed CSV.
# (`data`, `region`, `metadata` come from outside the visible region.)
drop = ['San_43', 'San_67', 'tzbg040', 'tzdt045', 'tzhz108', 'tzhz130', 'tzhz131']
hids = []
seqs = []
for e in data:
    name_parts = e['name'].split()
    if name_parts[0] not in drop:
        hids.append(name_parts[0])
        seqs.append(e['sequence'])

## Validate
passed_validation = True
for i in range(len(seqs)):
    mysites = seq2sites(seqs[i])
    # sites -> sequence (gaps stripped) must reproduce the original sequence
    myseq = translate(sites2seq(mysites, region), None, '-')
    if not seqs[i] == myseq:
        passed_validation = False
        print i, hids[i]

if passed_validation:
    with open('processed.csv', 'w') as f:
        for i in range(len(seqs)):
            mysites = ' '.join([str(x) for x in seq2sites(seqs[i])])
            origid = hids[i]
            # first four characters of the id select the metadata row
            prefix = metadata.ix[origid[:4],'NewPrefix']
            num = origid[4:].split('_')[0].zfill(3)
            newid = prefix + num
            f.write('%s,%s,%s\n' % (newid, origid, mysites))
# Parse comma-separated rows into ids, site motifs, and a 5-column count
# matrix (blank counts become 0), validate, then begin the export.
# (`data`, `hids`, `sites`, `counts`, `region` come from outside; chunk is
# cut off mid-loop at the end.)
for i in range(len(data)):
    x = data[i].strip().split(',')
    hids.append(x[0])
    sites.append(x[2])
    count = x[4:]
    for j in range(5):
        if count[j] == '':
            count[j] = '0'
    counts[i,] = [int(y) for y in count]

## Validate
passed_validation = True
for i in range(len(sites)):
    curr_sites = str2sites(sites[i])
    seq = sites2seq(curr_sites, region)
    mysites = seq2sites(seq)
    if not mysites == curr_sites:
        # accept differing representations if rebuilt sequences still agree
        myseq = translate(sites2seq(mysites, region), None, '-')
        if not seq == myseq:
            passed_validation = False
            print i, hids[i]

if passed_validation:
    counter = [0] * 5
    with open('processed.csv', 'w') as f:
        for i in range(len(sites)):
            hid = hids[i]
            curr_sites = str2sites(sites[i])
            seq = sites2seq(curr_sites, region)
            mysites = ' '.join([str(x) for x in seq2sites(seq)])
            # NOTE(review): chunk truncated here.
# Read De Filippo 2010 CSV (id, group, sites in column 3), validate via a
# sequence round-trip, then begin the export.  (`hids`, `region`, `metadata`
# come from outside; chunk is cut off mid-loop at the end.)
groups = []
sites = []
with open('defilippo2010.csv', 'rU') as f:
    reader = csv.reader(f)
    reader.next() # skip past header
    for row in reader:
        hids.append(row[0])
        groups.append(row[1])
        sites.append(str2sites(row[3]))

## Validate
passed_validation = True
for i in range(len(sites)):
    seq = sites2seq(sites[i], region)
    mysites = seq2sites(seq)
    if not mysites == sites[i]:
        # accept differing representations if rebuilt sequences still agree
        myseq = translate(sites2seq(mysites, region), None, '-')
        if not seq == myseq:
            passed_validation = False
            print i, hids[i]

counter = {}
for k in metadata.index:
    counter[k] = 0

if passed_validation:
    with open('processed.csv', 'w') as f:
        for i in range(len(groups)):
            key = groups[i]
            # NOTE(review): chunk truncated here.
# NOTE(review): this chunk begins mid-`if`/`elif` chain inside enclosing
# loops — `j`, `x`, `y`, `positions`, `sites` are bound outside the visible
# region.  Columns 84-86 are hard-coded insertion sites; other columns are
# position + allele.
        elif j == 84:
            y.append('309.2C')
        elif j == 85:
            y.append('313.1C')
        elif j == 86:
            y.append('315.1C')
        else:
            y.append('%s%s' % (positions[j], x[j]))
    sites.append(' '.join(y))

## Validate
passed_validation = True
for i in range(len(sites)):
    curr_sites = str2sites(sites[i])
    # the data spans two regions; validate each half independently
    cseq1 = sites2seq(curr_sites, region1)
    cseq2 = sites2seq(curr_sites, region2)
    mysites1 = seq2sites(cseq1)
    mysites2 = seq2sites(cseq2)
    mysites = mysites1 + mysites2
    if not mysites == curr_sites:
        # accept differing representations if rebuilt sequences still agree
        seq = cseq1 + cseq2
        myseq = translate(sites2seq(mysites, region1), None, '-') + translate(sites2seq(mysites, region2), None, '-')
        if not seq == myseq:
            passed_validation = False
            print i, hids[i]

if passed_validation:
    count = 0
    prefix = metadata.ix[0,'NewPrefix']
    with open('processed.csv', 'w') as f:
        # NOTE(review): chunk truncated here; the export loop presumably
        # follows outside the visible region.