def test_haplotype_2236():
    """Check that haplotype 2236 survives sites -> sequence -> sites unchanged."""
    expected = str2sites('16126C 16163G 16185.1T 16185.2T 16189d 16294T 16519C')
    sequence = sites2seq(expected, region=range(16000,16570))
    observed = seq2sites(sequence)  # the round-tripped sites
    # Echo both lists so a failing run shows where they diverge.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
def test_haplotype_3070():
    """Check that haplotype 3070 survives sites -> sequence -> sites unchanged."""
    expected = str2sites('16093C 16183d 16184d 16191.1T 16191.2T 16270T')
    sequence = sites2seq(expected, region=range(16000,16570))
    observed = seq2sites(sequence)  # the round-tripped sites
    # Echo both lists so a failing run shows where they diverge.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
def test_haplotype_2911():
    """Check that haplotype 2911 survives sites -> sequence -> sites unchanged."""
    expected = str2sites('16051G 16129C 16182d 16183d 16193.1C 16193.2C 16362C 16519C')
    sequence = sites2seq(expected, region=range(16000,16570))
    observed = seq2sites(sequence)  # the round-tripped sites
    # Echo both lists so a failing run shows where they diverge.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
def test_haplotype_4827():
    """Check that haplotype 4827 survives sites -> sequence -> sites unchanged."""
    expected = str2sites('16172C 16183d 16193.1C 16193.2C 16223T 16320T')
    sequence = sites2seq(expected, region=range(16000,16570))
    observed = seq2sites(sequence)  # the round-tripped sites
    # Echo both lists so a failing run shows where they diverge.
    print('EXP: %s' % expected)
    print('OBS: %s' % observed)
    assert expected == observed
if "/" in s: m = re.match(r"([0-9]+)[A-Z]/([A-Z])", s) s = "%s%s" % m.groups() elif ".1" in s: s = s + "C" elif ".2" in s: h2f.append(s[:-1] + "1C") s = s + "C" h2f.append(s) hvr2[i] = " ".join(h2f) ## Validate passed_validation = True for i in range(len(freq)): curr_sites = str2sites(hvr1[i], add16k=True) seq = translate(sites2seq(curr_sites, region1), None, "-") mysites = seq2sites(seq) if not mysites == curr_sites: myseq = translate(sites2seq(mysites, region1), None, "-") if not seq == myseq: passed_validation = False print i, "hvr1" curr_sites = str2sites(hvr2[i]) seq = translate(sites2seq(curr_sites, region2), None, "-") mysites = seq2sites(seq) if not mysites == curr_sites: myseq = translate(sites2seq(mysites, region2), None, "-") if not seq == myseq: passed_validation = False print i, "hvr2"
for i in range(len(data)): x = data[i].strip().split(',') hids.append(x[0]) sites.append(x[2]) count = x[4:] for j in range(5): if count[j] == '': count[j] = '0' counts[i,] = [int(y) for y in count] ## Validate passed_validation = True for i in range(len(sites)): curr_sites = str2sites(sites[i]) seq = sites2seq(curr_sites, region) mysites = seq2sites(seq) if not mysites == curr_sites: myseq = translate(sites2seq(mysites, region), None, '-') if not seq == myseq: passed_validation = False print i, hids[i] if passed_validation: counter = [0] * 5 with open('processed.csv', 'w') as f: for i in range(len(sites)): hid = hids[i] curr_sites = str2sites(sites[i]) seq = sites2seq(curr_sites, region)
## load metadata
metadata = pd.read_csv("metadata.csv", index_col=0)
region = range2region("16030-16569;1-600")

hids, hvr1, hvr2 = [], [], []
with open("stefflova2009.csv", "rU") as handle:
    rows = csv.reader(handle)
    next(rows)  # skip past header
    for row in rows:
        # only rows whose fifth column reads "Senegalese" are kept
        if row[4] != "Senegalese":
            continue
        hids.append(row[0])
        hvr1.append(str2sites(row[2], add16k=True))
        hvr2.append(str2sites(row[3]))

# one combined list per kept individual: its hvr1 entry followed by its hvr2 entry
sites = [first + second for first, second in zip(hvr1, hvr2)]

## Validate variant sites
passed_validation = True
for i, expected in enumerate(sites):
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if not expected == mysites:
        # A differing site list only counts as a failure when the two
        # sequences also differ after translate(..., None, "-")
        # (presumably this deletes "-" characters; confirm against utils).
        stripped = translate(seq, None, "-")
        restripped = translate(sites2seq(mysites, region), None, "-")
        if not stripped == restripped:
            passed_validation = False
            print(i)
sys.path.append('../../scripts')
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)

hids, sites = [], []
with open('alabri2012.csv', 'rU') as handle:
    rows = csv.reader(handle)
    next(rows)  # skip past header
    for row in rows:
        hids.append(row[0])
        sites.append(str2sites(row[4]))

## Validate
passed_validation = True
for i, (hid, expected) in enumerate(zip(hids, sites)):
    # the region is looked up in the metadata under the first two
    # characters of the individual's id
    region = range2region(metadata.ix[hid[:2], 'SeqRange'])
    seq = translate(sites2seq(expected, region), None, '-')
    mysites = seq2sites(seq)
    if not mysites == expected:
        # a differing site list is only reported when it also yields a
        # different sequence
        myseq = translate(sites2seq(mysites, region), None, '-')
        if not seq == myseq:
            passed_validation = False
            print('%s %s' % (i, hid))

counter = {}
locations[-4] = locations[-4] + '.1' f.readline() # skip past anderson sequence for line in f: parts = line.split() # drop non-Mbundu (AN9) and individuals without HVR2 (AN130, AN42) if parts[0] not in ['AN9', 'AN130', 'AN42']: hids.append(parts[0]) bits = [x for x in parts[1]] + [x for x in parts[2]] assert len(bits) == len(locations) variants = [] for i in range(len(locations)): if bits[i] != '.': variants.append(locations[i]+bits[i]) sites.append(str2sites(' '.join(variants))) ## Validate passed_validation = True for i in range(len(hids)): seq = sites2seq(sites[i], region) mysites = seq2sites(seq) if not sites[i] == mysites: if not translate(seq, None, '-') == translate(sites2seq(mysites, region), None, '-'): passed_validation = False print i, hids[i] if passed_validation: counter = 0
for i in range(len(sites)): s = sites[i].split() s2 = [] for x in s: if 'G/A' in x: x = x[:3] + 'R' elif '-' in x: x = x[:3] + x[-1] s2.append(x) newsites.append(' '.join(s2)) ## Validate passed_validation = True for i in range(len(newsites)): curr_sites = str2sites(newsites[i], add16k=True) seq = sites2seq(curr_sites, region) mysites = seq2sites(seq) if not mysites == curr_sites: myseq = translate(sites2seq(mysites, region), None, '-') if not seq == myseq: passed_validation = False print i if passed_validation: counter = pd.Series([0] * counts.shape[1], index=counts.columns) with open('processed.csv', 'w') as f: for i in range(len(newsites)): s = newsites[i] curr_sites = str2sites(s, add16k=True) seq = sites2seq(curr_sites, region)
## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)

groups, hids, hvr1, hvr2 = [], [], [], []
with open('gomes2015.csv', 'rU') as handle:
    rows = csv.reader(handle)
    next(rows)  # skip past header
    for row in rows:
        groups.append(row[3])
        hids.append(row[1])
        hvr1.append(str2sites(row[6], add16k=True))
        hvr2.append(str2sites(row[8]))

# one combined list per row: its hvr1 entry followed by its hvr2 entry
sites = [first + second for first, second in zip(hvr1, hvr2)]

## Validate variant sites
passed_validation = True
for i, (group, expected) in enumerate(zip(groups, sites)):
    # each group has its own region, taken from the metadata table
    region = range2region(metadata.ix[group, 'SeqRange'])
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if not expected == mysites:
        # a differing site list only fails validation when the sequences
        # still differ after translate(..., None, '-')
        stripped = translate(seq, None, '-')
        restripped = translate(sites2seq(mysites, region), None, '-')
        if not stripped == restripped:
            passed_validation = False
# Build one site number per position by joining the i-th items of the
# second field of each of the three header rows.
sitenums = [
    header[0][1][i] + header[1][1][i] + header[2][1][i]
    for i in range(len(header[0][1]))
]
# NOTE(review): entries equal to ' ' are dropped here -- confirm this literal
# is the intended blank-column marker.
sitenums = [x for x in sitenums if x != ' ']

for line in data:
    values = line.split()
    assert len(values) == len(sitenums)
    # a '.' value contributes no variant; anything else is appended to its
    # site number
    variants = [sitenums[j] + val for j, val in enumerate(values) if val != '.']
    sites.append(str2sites(' '.join(variants), add16k=True))

# convert the count cells to integers; a '\xc9' cell is recorded as 0
newcounts = [
    [0 if cell == '\xc9' else int(cell) for cell in count_row]
    for count_row in counts
]

header2, hids2, data2, counts2 = [], [], [], []
with open('gonzalez2006_haplotypes.csv', 'rU') as f: f.readline() # skip past header data = f.readlines() hids = [] sites = [] for l in data: parts = l.strip().split(',') hids.append(parts[0]) sites.append('%s %s' % (parts[1],parts[2])) ## need to preprocess sites data because Gonzalez et al. use some nonstandard notation newsites = [] for i in range(len(sites)): s = str2sites(sites[i]) newsites.append([]) for j in range(len(s)): val = s[j].value if val.startswith('d'): s[j].value = '-' newsites[i].append(s[j]) if len(val) > 2: p = Polymorphism(s[j].position+1,0,'-') newsites[i].append(p) elif val.startswith('i'): if val.startswith('ii'): val = val[1:] inserts = list(val[1:]) for k in range(len(inserts)): p = Polymorphism(s[j].position, k+1, inserts[k])
def process_sites2seq(form):
    """Process data submitted in sites2seq form.

    The submission is read from ``form.cleaned_data``: the ``query`` text,
    or the contents of the uploaded ``file`` when one was supplied.  Each
    line is one entry and is split according to
    ``form.cleaned_data['format']``:

    * ``'name_and_motif'``   -- ``<name> <motif>``
    * ``'name_n_and_motif'`` -- ``<name> <n> <motif>``
    * any other value        -- the whole line is the motif, named ``SeqN``

    Commas and semicolons are accepted as separators.  Formatting problems
    are collected as messages in ``problems`` and clear ``valid``.  When
    every line is well formed, each motif is converted with ``str2sites``
    and ``sites2seq`` and its name/sequence pair is recorded ``n`` times;
    an exception raised by a conversion is appended to ``problems``.
    """
    problems = []
    valid = True
    cleaned = form.cleaned_data

    # first, just assume whatever is in the textarea is the submission
    # even if that may be nothing
    content = cleaned['query']

    # then check to see if a file was supplied, and if so, replace the
    # previously assumed content with the file data
    upload = cleaned['file']
    if upload is not None:
        if upload.multiple_chunks():
            # TODO: error - return with error.  Not handled yet, so the
            # file is still read below.
            pass
        content = upload.read()

    entry_format = cleaned['format']
    content_lines = content.strip().split('\n')
    names = []
    ns = []
    motifs = []
    for count, curr_line in enumerate(content_lines, 1):
        line = re.sub(r'[,;]', ' ', curr_line)
        name = 'Seq%s' % count
        n = 1
        motif = line
        # Fields are split on *runs* of spaces: a separator such as ", "
        # becomes two spaces after the substitution above, and splitting on
        # a single space would then produce an empty field.  A trailing
        # separator still yields an empty motif.
        if entry_format == 'name_and_motif':
            fields = re.split(r' +', line, maxsplit=1)
            if len(fields) == 2:
                name, motif = fields
            else:
                valid = False
                msg = 'The entry "%s" is not correctly formatted' % curr_line
                problems.append(msg)
        elif entry_format == 'name_n_and_motif':
            fields = re.split(r' +', line, maxsplit=2)
            if len(fields) == 3:
                name, n, motif = fields
                if re.match(r'^[0-9]+$', n) is None:
                    valid = False
                    problems.append("One of the given 'N's is not a number")
                else:
                    n = int(n)
            else:
                valid = False
                msg = 'The entry "%s" is not correctly formatted' % curr_line
                problems.append(msg)

        names.append(name)
        ns.append(n)
        motifs.append(motif)

    if valid:
        pnames = []
        pseqs = []
        for name, n, motif in zip(names, ns, motifs):
            try:
                sites = str2sites(motif)
                seq = sites2seq(sites, region=cleaned['output'],
                                add16k=cleaned['add16k'])
                # repeat the entry once per requested copy
                pnames.extend([name] * n)
                pseqs.extend([seq] * n)
            except Exception as e:
                # Deliberately broad: any conversion failure is reported
                # back as a problem instead of aborting the submission.
                valid = False
                problems.append(e)
## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) region = range2region(metadata.ix[0, 'SeqRange']) hids = [] groups = [] sites = [] with open('tofanelli2009.csv', 'rU') as f: reader = csv.reader(f) reader.next() # skip past header for row in reader: hids.append(row[0]) groups.append(row[1]) sites_str = ' '.join(row[3].split('-')) sites.append(str2sites(sites_str, add16k=True)) ## Validate passed_validation = True for i in range(len(sites)): seq = sites2seq(sites[i], region) mysites = seq2sites(seq) if not mysites == sites[i]: myseq = translate(sites2seq(mysites, region), None, '-') if not seq == myseq: passed_validation = False print i, hids[i] counter = {} for k in metadata.index:
iids.append(parts[0]) row = len(iids) - 1 counts[row,] = [int(x) for x in parts[1:3]] sites.append(" ".join(parts[3:])) ## Validate passed_validation = True # there are sites in the source table that are not actual variant sites # sequence 9 (index 8) has 263A as a variant # sequence 12 (index 11) has 16223C as a variant not_polys = [Polymorphism(263, 0, "A"), Polymorphism(16223, 0, "C")] for i in range(len(sites)): curr_sites = sites[i] curr_polys = [x for x in str2sites(curr_sites) if x not in not_polys] cseq1 = sites2seq(curr_sites, region1) cseq2 = sites2seq(curr_sites, region2) mysites1 = seq2sites(cseq1) mysites2 = seq2sites(cseq2) mysites = mysites1 + mysites2 if not mysites == curr_polys: passed_validation = False print iids[i] if passed_validation: counters = [1] * 2 with open("processed.csv", "w") as f: for i in range(len(sites)): curr_sites = str2sites(sites[i]) mysites = [x for x in curr_sites if x not in not_polys]
for j in range(counts.shape[1]): if count[j] == '': count[j] = '0' counts[i,] = [int(y) for y in count] counts = pd.DataFrame(counts, columns=popnames) ## Validate passed_validation = True # use larger region for validation region = range2region('16000-16400') for i in range(len(sites)): x = sites[i] curr_sites = str2sites(x, add16k=True) seq = sites2seq(curr_sites, region) mysites = seq2sites(seq) if not mysites == curr_sites: myseq = translate(sites2seq(mysites, region), None, '-') if not seq == myseq: passed_validation = False print i, hids[i] if passed_validation: counter = pd.Series([0] * counts.shape[1], index=counts.columns) with open('processed.csv', 'w') as f: for i in range(len(sites)): hid = hids[i] for pop in metadata.index: prefix = metadata.ix[pop,'NewPrefix']
def new_query_from_sites(self, sites, label='Query', add16k=False):
    """Build a ``MotifQuery`` whose defining polymorphisms come from `sites`.

    `sites` and `add16k` are handed to ``str2sites``; its result and `label`
    are passed to the ``MotifQuery`` constructor.
    """
    polymorphisms = str2sites(sites, add16k)
    return MotifQuery(defining_polymorphisms=polymorphisms, label=label)
from utils import * ## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) region = range2region(metadata.ix[0,'SeqRange']) hids = [] sites = [] with open('poetsch2013.csv', 'rU') as f: reader = csv.reader(f) reader.next() # skip past header for row in reader: hids.append(row[0]) sitestr = ' '.join(row[1:]) sites.append(str2sites(sitestr)) ## Validate variant sites passed_validation = True for i in range(len(hids)): seq = sites2seq(sites[i], region) mysites = seq2sites(seq) if not sites[i] == mysites: if not translate(seq, None, '-') == translate(sites2seq(mysites, region), None, '-'): passed_validation = False print i if passed_validation:
with open('salas2002_haplotypes.txt', 'rU') as f: data = f.readlines() hids = [] sites = [] for l in data: e = l.strip().split() hids.append(e[0]) sites.append(' '.join(e[1:])) ## Validate passed_validation = True for i in range(len(sites)): curr_sites = str2sites(sites[i], add16k=True) seq = translate(sites2seq(curr_sites, region), None, '-') mysites = seq2sites(seq) if not mysites == curr_sites: myseq = translate(sites2seq(mysites, region), None, '-') if not seq == myseq: passed_validation = False print i, hids[i] if passed_validation: counter = {} for k in metadata.index: counter[k] = 0 with open('processed.csv', 'w') as f: for i in range(len(sites)): hid = hids[i]
count = parts[1][0] nuc = parts[1][1:] count = int(count) * len(nuc) for i in range(int(count)): newrow.append('%s.%s%s' % (loc, str(i+1), nuc[i % len(nuc)])) else: newrow.append(s) else: newrow.append(s) newsites.append(' '.join(newrow)) ## Validate passed_validation = True for i in range(len(newsites)): curr_sites = str2sites(newsites[i]) # some entries have data outside explicitly sequenced 15900-640 region # get rid of extra sites curr_sites = [x for x in curr_sites if x.position >= 15900 or x.position <= 640] seq = sites2seq(curr_sites, region) mysites = seq2sites(seq) if not mysites == curr_sites: myseq = sites2seq(mysites, region) if not seq == myseq: passed_validation = False print i, hids[i] if passed_validation: counter = {} for i in metadata.index: counter[i] = 0
elif x == '309.1': x = '309.1C' elif x == '315.1': x = '315.1C' elif '(' in x: x = '' return x for i in range(len(hvr1)): hvr1[i] = [fix2(x) for x in hvr1[i]] for i in range(len(hvr2)): hvr2[i] = [fix2(x) for x in hvr2[i]] for i in range(len(hvr1)): sites.append(str2sites(' '.join(hvr1[i]), add16k=True) + str2sites(' '.join(hvr2[i]))) ## Validate variant sites passed_validation = True for i in range(len(sites)): seq = sites2seq(sites[i], region) mysites = seq2sites(seq) if not sites[i] == mysites: if not translate(seq, None, '-') == translate(sites2seq(mysites, region), None, '-'): passed_validation = False print i if passed_validation: counter = [0] * 3 with open('processed.csv', 'w') as f:
with open('batini2011.csv', 'rU') as handle:
    rows = csv.reader(handle)
    header = next(rows)
    # header cells 3..12 become popnames; the same columns of every data
    # row are stored as that row's counts
    popnames = header[3:13]
    for row in rows:
        hids.append(row[0])
        data.append(row[13].split(','))
        counts.append(row[3:13])

# convert counts to integers
newcounts = [[int(x) for x in count_row] for count_row in counts]

sites.extend(str2sites(' '.join(entry)) for entry in data)

## Validate variant sites
passed_validation = True
for i, expected in enumerate(sites):
    seq = sites2seq(expected, region).upper()
    mysites = seq2sites(seq)
    if not expected == mysites:
        # a differing site list only fails validation when the sequences
        # still differ after translate(..., None, '-')
        stripped = translate(seq, None, '-')
        restripped = translate(sites2seq(mysites, region), None, '-')
        if not stripped == restripped:
            passed_validation = False
            print(i)

# one zeroed counter per entry of popnames
counter = dict.fromkeys(popnames, 0)
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix[0,'SeqRange'])

counts = []
sites = []
popnames = None
with open('cerny2011.csv', 'rU') as f:
    reader = csv.reader(f)
    header = next(reader)
    # popnames are the header cells from the third column onwards; the same
    # columns of every data row are stored as that row's counts
    popnames = header[2:]
    for row in reader:
        sites.append(str2sites(row[0], add16k=True))
        counts.append(row[2:])


def convert(x):
    """Return the integer value of a count cell, treating '' as 0."""
    return 0 if x == '' else int(x)


# Plain ``int`` is the dtype the ``np.int`` alias stood for; the alias was
# deprecated in NumPy 1.20 and removed in 1.24, so the builtin is used.
countm = np.zeros((len(counts), len(popnames)), dtype=int)
for i, count_row in enumerate(counts):
    countm[i] = [convert(x) for x in count_row]

## Validate
passed_validation = True
from utils import * ## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) region = range2region(metadata.ix[0,'SeqRange']) hids = [] sites = [] with open('podgorna2013.csv', 'rU') as f: reader = csv.reader(f) reader.next() # skip past header for row in reader: if row[0] != 'DAZ_40': hids.append(row[0]) sites.append(str2sites(row[2])) ## Validate variant sites passed_validation = True for i in range(len(hids)): seq = sites2seq(sites[i], region) mysites = seq2sites(seq) if not sites[i] == mysites: if not translate(seq, None, '-') == translate(sites2seq(mysites, region), None, '-'): passed_validation = False print i if passed_validation:
## load metadata metadata = pd.read_csv('metadata.csv', index_col=0) region = range2region('16024-16569;1-397') hids = [] groups = [] sites = [] with open('defilippo2010.csv', 'rU') as f: reader = csv.reader(f) reader.next() # skip past header for row in reader: hids.append(row[0]) groups.append(row[1]) sites.append(str2sites(row[3])) ## Validate passed_validation = True for i in range(len(sites)): seq = sites2seq(sites[i], region) mysites = seq2sites(seq) if not mysites == sites[i]: myseq = translate(sites2seq(mysites, region), None, '-') if not seq == myseq: passed_validation = False print i, hids[i] counter = {} for k in metadata.index:
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region(metadata.ix[0,'SeqRange'])

# missing cells in the counts table are filled with 0
counts = pd.read_csv('plaster2011_counts.csv', index_col=0).fillna(0)

hids, sites = [], []
with open('plaster2011_haplotypes.csv', 'rU') as handle:
    rows = csv.reader(handle)
    next(rows)  # skip past header
    for row in rows:
        hids.append(row[0])
        # the second column is comma separated; str2sites receives the
        # pieces joined by single spaces
        parts = row[1].split(',')
        sites.append(str2sites(' '.join(parts), add16k=True))

## Validate
passed_validation = True
for i, (hid, expected) in enumerate(zip(hids, sites)):
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if not expected == mysites:
        # a differing site list only fails validation when the sequences
        # still differ after translate(..., None, '-')
        stripped = translate(seq, None, '-')
        restripped = translate(sites2seq(mysites, region), None, '-')
        if not stripped == restripped:
            passed_validation = False
            print('%s %s' % (i, hid))

# one zeroed counter per column of the counts table
counter = dict.fromkeys(counts.columns, 0)