def test_haplotype_2236():
    """Check that haplotype 2236 survives a sites -> sequence -> sites round trip.

    The motif is parsed with ``str2sites``, rendered with ``sites2seq`` over
    ``range(16000, 16570)`` and read back with ``seq2sites``; the recovered
    sites must equal the parsed ones.
    """
    sites = str2sites('16126C 16163G 16185.1T 16185.2T 16189d 16294T 16519C')
    seq = sites2seq(sites, region=range(16000, 16570))
    rts = seq2sites(seq)  # rts: round trip sites
    # Parenthesized single-argument print behaves the same on Python 2 and 3.
    print('EXP: %s' % sites)
    print('OBS: %s' % rts)
    assert sites == rts
def test_haplotype_3070():
    """Check that haplotype 3070 survives a sites -> sequence -> sites round trip.

    The motif is parsed with ``str2sites``, rendered with ``sites2seq`` over
    ``range(16000, 16570)`` and read back with ``seq2sites``; the recovered
    sites must equal the parsed ones.
    """
    sites = str2sites('16093C 16183d 16184d 16191.1T 16191.2T 16270T')
    seq = sites2seq(sites, region=range(16000, 16570))
    rts = seq2sites(seq)  # rts: round trip sites
    # Parenthesized single-argument print behaves the same on Python 2 and 3.
    print('EXP: %s' % sites)
    print('OBS: %s' % rts)
    assert sites == rts
def test_haplotype_2911():
    """Check that haplotype 2911 survives a sites -> sequence -> sites round trip.

    The motif is parsed with ``str2sites``, rendered with ``sites2seq`` over
    ``range(16000, 16570)`` and read back with ``seq2sites``; the recovered
    sites must equal the parsed ones.
    """
    sites = str2sites('16051G 16129C 16182d 16183d 16193.1C 16193.2C 16362C 16519C')
    seq = sites2seq(sites, region=range(16000, 16570))
    rts = seq2sites(seq)  # rts: round trip sites
    # Parenthesized single-argument print behaves the same on Python 2 and 3.
    print('EXP: %s' % sites)
    print('OBS: %s' % rts)
    assert sites == rts
def test_haplotype_4827():
    """Check that haplotype 4827 survives a sites -> sequence -> sites round trip.

    The motif is parsed with ``str2sites``, rendered with ``sites2seq`` over
    ``range(16000, 16570)`` and read back with ``seq2sites``; the recovered
    sites must equal the parsed ones.
    """
    sites = str2sites('16172C 16183d 16193.1C 16193.2C 16223T 16320T')
    seq = sites2seq(sites, region=range(16000, 16570))
    rts = seq2sites(seq)  # rts: round trip sites
    # Parenthesized single-argument print behaves the same on Python 2 and 3.
    print('EXP: %s' % sites)
    print('OBS: %s' % rts)
    assert sites == rts
Exemple #5
0
        # Entries containing "/" are reduced to the leading digits plus the
        # letter after the slash (the letter before the slash is dropped).
        # NOTE(review): re.match returns None when s does not start with the
        # expected pattern, in which case m.groups() would raise.
        if "/" in s:
            m = re.match(r"([0-9]+)[A-Z]/([A-Z])", s)
            s = "%s%s" % m.groups()
        elif ".1" in s:
            # a ".1" entry gets "C" appended
            s = s + "C"
        elif ".2" in s:
            # a ".2" entry first emits a companion entry in which the last
            # character of s is replaced by "1C" (presumably s ends in "2"),
            # then gets "C" appended itself
            h2f.append(s[:-1] + "1C")
            s = s + "C"
        h2f.append(s)
    # store the rewritten entries back as one space-separated string
    hvr2[i] = " ".join(h2f)

## Validate
passed_validation = True

# (label printed on failure, motif strings, region, extra str2sites keywords)
hvr_checks = (
    ("hvr1", hvr1, region1, {"add16k": True}),
    ("hvr2", hvr2, region2, {}),
)

for i in range(len(freq)):
    for label, motifs, target_region, parse_kwargs in hvr_checks:
        curr_sites = str2sites(motifs[i], **parse_kwargs)
        seq = translate(sites2seq(curr_sites, target_region), None, "-")
        mysites = seq2sites(seq)
        if mysites == curr_sites:
            continue
        # The site lists differ; the entry still passes when the recovered
        # sites rebuild the same sequence.
        myseq = translate(sites2seq(mysites, target_region), None, "-")
        if seq == myseq:
            continue
        passed_validation = False
        print("%s %s" % (i, label))
Exemple #6
0
for i, line in enumerate(data):
    x = line.strip().split(',')
    hids.append(x[0])
    sites.append(x[2])
    count = x[4:]
    # An empty cell in any of the first five count columns becomes '0'.
    for j in range(5):
        count[j] = count[j] or '0'
    counts[i,] = [int(y) for y in count]

## Validate
passed_validation = True

for i, raw_sites in enumerate(sites):
    curr_sites = str2sites(raw_sites)
    seq = sites2seq(curr_sites, region)
    mysites = seq2sites(seq)
    if mysites == curr_sites:
        continue
    # The site lists differ; the entry still passes when the recovered sites
    # rebuild the same sequence.
    # NOTE(review): only the rebuilt sequence goes through translate(); seq is
    # compared as produced by sites2seq. Confirm that is intended.
    myseq = translate(sites2seq(mysites, region), None, '-')
    if seq == myseq:
        continue
    passed_validation = False
    print('%s %s' % (i, hids[i]))

# Output is only produced when no entry failed validation.
if passed_validation:
	# five counters, all starting at zero
	counter = [0] * 5
	with open('processed.csv', 'w') as f:
		for i in range(len(sites)):
			hid = hids[i]
			# re-parse the raw site string and rebuild its sequence over region
			curr_sites = str2sites(sites[i])
			seq = sites2seq(curr_sites, region)
			# NOTE(review): the rest of this loop body lies outside this excerpt
Exemple #7
0
## load metadata
metadata = pd.read_csv("metadata.csv", index_col=0)
region = range2region("16030-16569;1-600")

hids, hvr1, hvr2, sites = [], [], [], []

with open("stefflova2009.csv", "rU") as f:
    reader = csv.reader(f)
    next(reader)  # discard the header row
    for row in reader:
        # only rows whose fifth column reads "Senegalese" are kept
        if row[4] != "Senegalese":
            continue
        hids.append(row[0])
        hvr1.append(str2sites(row[2], add16k=True))
        hvr2.append(str2sites(row[3]))

# one combined list per kept row: hvr1 sites followed by hvr2 sites
sites.extend(first + second for first, second in zip(hvr1, hvr2))

## Validate variant sites
passed_validation = True

for i in range(len(hids)):
    expected = sites[i]
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if expected == mysites:
        continue
    # Differing site lists are tolerated when both sequences agree after
    # translate(..., None, "-").
    original_seq = translate(seq, None, "-")
    rebuilt_seq = translate(sites2seq(mysites, region), None, "-")
    if original_seq == rebuilt_seq:
        continue
    passed_validation = False
    print(i)
Exemple #8
0
sys.path.append('../../scripts')
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)

hids, sites = [], []

with open('alabri2012.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader)  # the header row carries no data
    for row in reader:
        # first column: haplotype id, fifth column: sites string
        hid, motif = row[0], row[4]
        hids.append(hid)
        sites.append(str2sites(motif))

## Validate
passed_validation = True

for i in range(len(sites)):
    # The first two characters of the haplotype id select the metadata row
    # whose 'SeqRange' defines the region.  ``.loc`` performs the same
    # label-based lookup as the deprecated ``.ix`` indexer, which was removed
    # in pandas 1.0.
    region = range2region(metadata.loc[hids[i][:2], 'SeqRange'])
    seq = translate(sites2seq(sites[i], region), None, '-')
    mysites = seq2sites(seq)
    if not mysites == sites[i]:
        # differing site lists are tolerated when they rebuild the same sequence
        myseq = translate(sites2seq(mysites, region), None, '-')
        if not seq == myseq:
            passed_validation = False
            print('%s %s' % (i, hids[i]))

counter = {}
Exemple #9
0
	# the fourth-from-last location label gets a '.1' suffix
	locations[-4] = locations[-4] + '.1'

	f.readline() # skip past anderson sequence

	for line in f:
		parts = line.split()
		# drop non-Mbundu (AN9) and individuals without HVR2 (AN130, AN42)
		if parts[0] not in ['AN9', 'AN130', 'AN42']:
			hids.append(parts[0])
			# one entry per character of the second and third fields
			bits = [x for x in parts[1]] + [x for x in parts[2]]
			assert len(bits) == len(locations)
			# '.' entries are skipped; any other character is appended to its
			# location label to form a variant
			variants = []
			for i in range(len(locations)):
				if bits[i] != '.':
					variants.append(locations[i]+bits[i])
			sites.append(str2sites(' '.join(variants)))

## Validate
passed_validation = True

for i in range(len(hids)):
    expected = sites[i]
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if expected == mysites:
        continue
    # Differing site lists are tolerated when both sequences agree after
    # translate(..., None, '-').
    stripped_seq = translate(seq, None, '-')
    stripped_rebuilt = translate(sites2seq(mysites, region), None, '-')
    if stripped_seq == stripped_rebuilt:
        continue
    passed_validation = False
    print('%s %s' % (i, hids[i]))


if passed_validation:
    counter = 0
Exemple #10
0
for raw in sites:
    s2 = []
    for x in raw.split():
        if 'G/A' in x:
            # keep the first three characters and mark the entry with 'R'
            x = x[:3] + 'R'
        elif '-' in x:
            # keep the first three characters plus the final character
            x = x[:3] + x[-1]
        s2.append(x)
    newsites.append(' '.join(s2))

## Validate
passed_validation = True

for i, raw_sites in enumerate(newsites):
    curr_sites = str2sites(raw_sites, add16k=True)
    seq = sites2seq(curr_sites, region)
    mysites = seq2sites(seq)
    if mysites == curr_sites:
        continue
    # The site lists differ; the entry still passes when the recovered sites
    # rebuild the same sequence.
    myseq = translate(sites2seq(mysites, region), None, '-')
    if seq == myseq:
        continue
    passed_validation = False
    print(i)

# Output is only produced when no entry failed validation.
if passed_validation:
	# one zero-initialised counter per column of counts
	counter = pd.Series([0] * counts.shape[1], index=counts.columns)
	with open('processed.csv', 'w') as f:
		for i in range(len(newsites)):
			s = newsites[i]
			# re-parse the normalized site string and rebuild its sequence
			curr_sites = str2sites(s, add16k=True)
			seq = sites2seq(curr_sites, region)
			# NOTE(review): the rest of this loop body lies outside this excerpt
Exemple #11
0
## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)

groups, hids, hvr1, hvr2, sites = [], [], [], [], []

with open('gomes2015.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader)  # discard the header row
    for row in reader:
        groups.append(row[3])
        hids.append(row[1])
        hvr1.append(str2sites(row[6], add16k=True))
        hvr2.append(str2sites(row[8]))

# one combined list per row: hvr1 sites followed by hvr2 sites
sites.extend(first + second for first, second in zip(hvr1, hvr2))

## Validate variant sites
passed_validation = True

for i in range(len(groups)):
	# Each group names the metadata row whose 'SeqRange' defines its region.
	# .loc performs the same label-based lookup as the deprecated .ix indexer,
	# which was removed in pandas 1.0.
	region = range2region(metadata.loc[groups[i],'SeqRange'])
	seq = sites2seq(sites[i], region)
	mysites = seq2sites(seq)
	if not sites[i] == mysites:
		# differing site lists are tolerated when both rebuild the same sequence
		if not translate(seq, None, '-') == translate(sites2seq(mysites, region), None, '-'):
			passed_validation = False
Exemple #12
0
# Each label is spread over three header rows: concatenate the i-th elements
# of the rows, then discard labels that consist of three spaces.
sitenums = [
    header[0][1][i] + header[1][1][i] + header[2][1][i]
    for i in range(len(header[0][1]))
]
sitenums = [x for x in sitenums if x != '   ']

for line in data:
    values = line.split()
    assert len(values) == len(sitenums)

    # '.' entries are skipped; every other value is appended to its site label
    variants = [
        sitenums[j] + value
        for j, value in enumerate(values)
        if value != '.'
    ]

    sites.append(str2sites(' '.join(variants), add16k=True))

newcounts = []
for row_counts in counts:
    # a '\xc9' cell is recorded as 0; every other cell must parse as an int
    newcounts.append([0 if cell == '\xc9' else int(cell) for cell in row_counts])

header2, hids2, data2, counts2 = [], [], [], []
Exemple #13
0
with open('gonzalez2006_haplotypes.csv', 'rU') as f:
    f.readline()  # the header line is discarded
    data = f.readlines()

hids, sites = [], []

for l in data:
    parts = l.strip().split(',')
    hid, first, second = parts[0], parts[1], parts[2]
    hids.append(hid)
    # second and third columns are combined into one sites string
    sites.append('%s %s' % (first, second))

## need to preprocess sites data because Gonzalez et al. use some nonstandard notation
newsites = []
for i in range(len(sites)):
	s = str2sites(sites[i])
	# newsites[i] collects the rewritten polymorphisms for entry i
	newsites.append([])
	for j in range(len(s)):
		val = s[j].value
		if val.startswith('d'):
			# values starting with 'd' are rewritten to '-'
			s[j].value = '-'
			newsites[i].append(s[j])
			# a value longer than two characters adds a second '-' at the
			# next position (presumably a two-base deletion; confirm)
			if len(val) > 2:
				p = Polymorphism(s[j].position+1,0,'-')
				newsites[i].append(p)
		elif val.startswith('i'):
			# a doubled leading 'i' is collapsed to a single one
			if val.startswith('ii'):
				val = val[1:]
			# each character after the 'i' becomes its own Polymorphism at
			# the same position, numbered 1, 2, ... in the second argument
			inserts = list(val[1:])
			for k in range(len(inserts)):
				p = Polymorphism(s[j].position, k+1, inserts[k])
				# NOTE(review): the rest of this loop lies outside this excerpt
Exemple #14
0
def process_sites2seq(form):
    """Process data submitted in sites2seq form.

    The submission is taken from ``form.cleaned_data``: the uploaded ``file``
    when one was supplied, otherwise the ``query`` text.  Each line is one
    entry; commas and semicolons are treated as spaces.  Depending on
    ``form.cleaned_data['format']`` a line is split into a name and a motif
    (``'name_and_motif'``), into a name, a repeat count and a motif
    (``'name_n_and_motif'``), or is used whole as a motif named
    ``Seq<line number>`` with a count of 1.

    If every line parsed cleanly, each motif is converted with ``str2sites``
    and ``sites2seq`` and its name and sequence are collected ``n`` times.
    Malformed lines and conversion errors are recorded in ``problems`` and
    clear the ``valid`` flag instead of being raised.

    NOTE(review): how ``problems``, ``pnames`` and ``pseqs`` are handed back
    to the caller is not visible in this part of the function.
    """

    problems = []
    valid = True

    # first, just assume whatever is in the textarea is the submission
    # even if that may be nothing
    content = form.cleaned_data['query']

    # then check to see if a file was supplied, and if so, replace the
    # previously assumed content with the file data
    if form.cleaned_data['file'] is not None:
        if form.cleaned_data['file'].multiple_chunks():
            # TODO: multi-chunk uploads are meant to be rejected here
            # ("return with error"); for now they are read like any other.
            pass
        content = form.cleaned_data['file'].read()

    # the input format does not change from line to line
    entry_format = form.cleaned_data['format']

    content_lines = content.strip().split('\n')
    names = []
    ns = []
    motifs = []
    count = 0
    for curr_line in content_lines:
        line = re.sub(r'[,;]', ' ', curr_line)

        # defaults, used as-is when the line is a bare motif
        count += 1
        name = 'Seq%s' % count
        n = 1
        motif = line
        if entry_format == 'name_and_motif':
            split = line.split(' ', 1)
            if len(split) == 2:
                name, motif = split
            else:
                valid = False
                msg = 'The entry "%s" is not correctly formatted' % curr_line
                problems.append(msg)
        elif entry_format == 'name_n_and_motif':
            split = line.split(' ', 2)
            if len(split) == 3:
                name, n, motif = split
                if re.match(r'^[0-9]+$', n) is None:
                    valid = False
                    problems.append("One of the given 'N's is not a number")
                else:
                    n = int(n)
            else:
                valid = False
                msg = 'The entry "%s" is not correctly formatted' % curr_line
                problems.append(msg)
        names.append(name)
        ns.append(n)
        motifs.append(motif)

    if valid:
        pnames = []
        pseqs = []
        for name, n, motif in zip(names, ns, motifs):
            try:
                sites = str2sites(motif)
                seq = sites2seq(sites, region=form.cleaned_data['output'], add16k=form.cleaned_data['add16k'])
                # an entry with count n contributes n identical sequences
                for i in range(n):
                    pnames.append(name)
                    pseqs.append(seq)
            except Exception as e:
                # "as" is accepted by Python 2.6+ and Python 3 alike
                valid = False
                problems.append(e)
Exemple #15
0
## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
# NOTE(review): .ix is deprecated in pandas; confirm whether 0 is meant as a
# position or a label before switching to .iloc/.loc.
region = range2region(metadata.ix[0, 'SeqRange'])

hids, groups, sites = [], [], []

with open('tofanelli2009.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader)  # discard the header row
    for row in reader:
        hids.append(row[0])
        groups.append(row[1])
        # the fourth column separates sites with '-'; str2sites gets spaces
        sites.append(str2sites(row[3].replace('-', ' '), add16k=True))

## Validate
passed_validation = True

for i, expected in enumerate(sites):
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if mysites == expected:
        continue
    # The site lists differ; the entry still passes when the recovered sites
    # rebuild the same sequence.
    myseq = translate(sites2seq(mysites, region), None, '-')
    if seq == myseq:
        continue
    passed_validation = False
    print('%s %s' % (i, hids[i]))

counter = {}
for k in metadata.index:
Exemple #16
0
    # remember the id; its index in iids doubles as the row index into counts
    iids.append(parts[0])
    row = len(iids) - 1
    # the second and third fields are integer counts
    counts[row,] = [int(x) for x in parts[1:3]]
    # every remaining field is kept, joined into one space-separated string
    sites.append(" ".join(parts[3:]))

## Validate
passed_validation = True

# there are sites in the source table that are not actual variant sites
# sequence 9 (index 8) has 263A as a variant
# sequence 12 (index 11) has 16223C as a variant
not_polys = [Polymorphism(263, 0, "A"), Polymorphism(16223, 0, "C")]

for i, curr_sites in enumerate(sites):
    # expected result: the parsed sites minus the known non-variants
    curr_polys = [x for x in str2sites(curr_sites) if x not in not_polys]
    # build both region sequences first, then read the sites back from each
    cseq1, cseq2 = [sites2seq(curr_sites, r) for r in (region1, region2)]
    mysites = seq2sites(cseq1) + seq2sites(cseq2)
    if mysites == curr_polys:
        continue
    passed_validation = False
    print(iids[i])

# Output is only produced when no entry failed validation.
if passed_validation:
    # two counters, both starting at 1
    counters = [1] * 2
    with open("processed.csv", "w") as f:
        for i in range(len(sites)):
            curr_sites = str2sites(sites[i])
            # drop the entries listed in not_polys
            mysites = [x for x in curr_sites if x not in not_polys]
            # NOTE(review): the rest of this loop body lies outside this excerpt
Exemple #17
0
        # empty cells are replaced by '0' before the integer conversion
        for j in range(counts.shape[1]):
        	if count[j] == '':
        		count[j] = '0'
        counts[i,] = [int(y) for y in count]

# wrap the filled counts in a DataFrame with one column per entry of popnames
counts = pd.DataFrame(counts, columns=popnames)

## Validate
passed_validation = True

# use larger region for validation
region = range2region('16000-16400')

for i, x in enumerate(sites):
    curr_sites = str2sites(x, add16k=True)
    seq = sites2seq(curr_sites, region)
    mysites = seq2sites(seq)
    if mysites == curr_sites:
        continue
    # The site lists differ; the entry still passes when the recovered sites
    # rebuild the same sequence.
    myseq = translate(sites2seq(mysites, region), None, '-')
    if seq == myseq:
        continue
    passed_validation = False
    print('%s %s' % (i, hids[i]))

# Output is only produced when no entry failed validation.
if passed_validation:
	# one zero-initialised counter per column of counts
	counter = pd.Series([0] * counts.shape[1], index=counts.columns)
	with open('processed.csv', 'w') as f:
		for i in range(len(sites)):
			hid = hids[i]
			for pop in metadata.index:
				# NOTE(review): .ix is deprecated in pandas; with a label from
				# metadata.index this is a label lookup (.loc equivalent)
				prefix = metadata.ix[pop,'NewPrefix']
				# NOTE(review): the rest of this loop lies outside this excerpt
Exemple #18
0
    def new_query_from_sites(self, sites, label='Query', add16k=False):
        """Build a MotifQuery whose defining polymorphisms come from ``sites``.

        ``sites`` and ``add16k`` are handed to ``str2sites``; the parsed result
        is passed to ``MotifQuery`` together with ``label``.
        """
        polymorphisms = str2sites(sites, add16k)
        return MotifQuery(defining_polymorphisms=polymorphisms, label=label)
Exemple #19
0
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
# NOTE(review): .ix is deprecated in pandas; confirm whether 0 is meant as a
# position or a label before switching to .iloc/.loc.
region = range2region(metadata.ix[0,'SeqRange'])

hids, sites = [], []

with open('poetsch2013.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader)  # discard the header row
    for row in reader:
        hids.append(row[0])
        # every column after the id contributes to the sites string
        sites.append(str2sites(' '.join(row[1:])))


## Validate variant sites
passed_validation = True

for i in range(len(hids)):
    expected = sites[i]
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if expected == mysites:
        continue
    # Differing site lists are tolerated when both sequences agree after
    # translate(..., None, '-').
    original_seq = translate(seq, None, '-')
    rebuilt_seq = translate(sites2seq(mysites, region), None, '-')
    if original_seq == rebuilt_seq:
        continue
    passed_validation = False
    print(i)


if passed_validation:
Exemple #20
0
with open('salas2002_haplotypes.txt', 'rU') as f:
    data = f.readlines()

hids, sites = [], []

for l in data:
    # whitespace-separated fields: the id first, then the sites
    fields = l.split()
    hids.append(fields[0])
    sites.append(' '.join(fields[1:]))

## Validate
passed_validation = True

for i, raw_sites in enumerate(sites):
    curr_sites = str2sites(raw_sites, add16k=True)
    seq = translate(sites2seq(curr_sites, region), None, '-')
    mysites = seq2sites(seq)
    if mysites == curr_sites:
        continue
    # The site lists differ; the entry still passes when the recovered sites
    # rebuild the same sequence.
    myseq = translate(sites2seq(mysites, region), None, '-')
    if seq == myseq:
        continue
    passed_validation = False
    print('%s %s' % (i, hids[i]))

# Output is only produced when no entry failed validation.
if passed_validation:
	# one zero-initialised counter per metadata index entry
	counter = {}
	for k in metadata.index:
		counter[k] = 0
	with open('processed.csv', 'w') as f:
		for i in range(len(sites)):
			hid = hids[i]
			# NOTE(review): the rest of this loop body lies outside this excerpt
Exemple #21
0
				# first character of parts[1] is a count, the rest is kept in nuc
				count = parts[1][0]
				nuc = parts[1][1:]
				# number of entries to emit: count times the length of nuc
				count = int(count) * len(nuc)
				# emit '<loc>.<k><char>' for k = 1..count, cycling through nuc
				for i in range(int(count)):
					newrow.append('%s.%s%s' % (loc, str(i+1), nuc[i % len(nuc)]))
			else:
				newrow.append(s)
		else:
			newrow.append(s)
	# one space-separated string per processed row
	newsites.append(' '.join(newrow))

## Validate
passed_validation = True

for i, raw_sites in enumerate(newsites):
    # some entries have data outside explicitly sequenced 15900-640 region
    # get rid of extra sites
    curr_sites = [
        x for x in str2sites(raw_sites)
        if x.position >= 15900 or x.position <= 640
    ]
    seq = sites2seq(curr_sites, region)
    mysites = seq2sites(seq)
    if mysites == curr_sites:
        continue
    # The site lists differ; the entry still passes when the recovered sites
    # rebuild the same sequence.
    myseq = sites2seq(mysites, region)
    if seq == myseq:
        continue
    passed_validation = False
    print('%s %s' % (i, hids[i]))

if passed_validation:
    # one zero-initialised counter per metadata index entry
    counter = {}
    for i in metadata.index:
        counter[i] = 0
Exemple #22
0
	# bare '309.1' and '315.1' entries get a 'C' appended
	elif x == '309.1':
		x = '309.1C'
	elif x == '315.1':
		x = '315.1C'
	# anything containing '(' is replaced by an empty string
	elif '(' in x:
		x = ''
	return x	

# run every entry of both regions through fix2, replacing each row in place
for i, entry in enumerate(hvr1):
    hvr1[i] = [fix2(x) for x in entry]

for i, entry in enumerate(hvr2):
    hvr2[i] = [fix2(x) for x in entry]

# one combined list per row: hvr1 sites (with add16k) followed by hvr2 sites
for i in range(len(hvr1)):
    hvr1_sites = str2sites(' '.join(hvr1[i]), add16k=True)
    hvr2_sites = str2sites(' '.join(hvr2[i]))
    sites.append(hvr1_sites + hvr2_sites)

## Validate variant sites
passed_validation = True

for i, expected in enumerate(sites):
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if expected == mysites:
        continue
    # Differing site lists are tolerated when both sequences agree after
    # translate(..., None, '-').
    original_seq = translate(seq, None, '-')
    rebuilt_seq = translate(sites2seq(mysites, region), None, '-')
    if original_seq == rebuilt_seq:
        continue
    passed_validation = False
    print(i)

if passed_validation:
	counter = [0] * 3
	with open('processed.csv', 'w') as f:
Exemple #23
0
with open('batini2011.csv', 'rU') as f:
    reader = csv.reader(f)
    header = next(reader)
    # columns 3..12 hold the counts; their header cells name them
    popnames = header[3:13]
    for row in reader:
        hids.append(row[0])
        data.append(row[13].split(','))
        counts.append(row[3:13])

# convert counts to integers
newcounts = [[int(x) for x in row_counts] for row_counts in counts]

for entry in data:
    sites.append(str2sites(' '.join(entry)))

## Validate variant sites
passed_validation = True

for i, expected in enumerate(sites):
    seq = sites2seq(expected, region).upper()
    mysites = seq2sites(seq)
    if expected == mysites:
        continue
    # Differing site lists are tolerated when both sequences agree after
    # translate(..., None, '-').
    original_seq = translate(seq, None, '-')
    rebuilt_seq = translate(sites2seq(mysites, region), None, '-')
    if original_seq == rebuilt_seq:
        continue
    passed_validation = False
    print(i)

# one zero-initialised counter per entry of popnames
counter = dict.fromkeys(popnames, 0)
Exemple #24
0
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
# NOTE(review): .ix is deprecated in pandas; confirm whether 0 is meant as a
# position or a label before switching to .iloc/.loc.
region = range2region(metadata.ix[0,'SeqRange'])

counts, sites = [], []
popnames = None

with open('cerny2011.csv', 'rU') as f:
    reader = csv.reader(f)
    header = next(reader)
    # everything from the third column on is a per-column count
    popnames = header[2:]
    for row in reader:
        sites.append(str2sites(row[0], add16k=True))
        counts.append(row[2:])

def convert(x):
    """Return 0 for an empty string, otherwise ``int(x)``."""
    return 0 if x == '' else int(x)

# One row per haplotype, one column per entry of popnames.  The builtin int is
# exactly what the alias np.int referred to; the alias itself was deprecated in
# NumPy 1.20 and removed in 1.24.
countm = np.zeros((len(counts), len(popnames)), dtype=int)

# empty cells become 0 via convert(); everything else is parsed as an integer
for i, row_counts in enumerate(counts):
    countm[i] = [convert(x) for x in row_counts]

## Validate
passed_validation = True
Exemple #25
0
from utils import *

## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
# NOTE(review): .ix is deprecated in pandas; confirm whether 0 is meant as a
# position or a label before switching to .iloc/.loc.
region = range2region(metadata.ix[0,'SeqRange'])

hids, sites = [], []

with open('podgorna2013.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader)  # discard the header row
    for row in reader:
        # the row with id 'DAZ_40' is left out
        if row[0] == 'DAZ_40':
            continue
        hids.append(row[0])
        sites.append(str2sites(row[2]))


## Validate variant sites
passed_validation = True

for i in range(len(hids)):
    expected = sites[i]
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if expected == mysites:
        continue
    # Differing site lists are tolerated when both sequences agree after
    # translate(..., None, '-').
    original_seq = translate(seq, None, '-')
    rebuilt_seq = translate(sites2seq(mysites, region), None, '-')
    if original_seq == rebuilt_seq:
        continue
    passed_validation = False
    print(i)


if passed_validation:
Exemple #26
0
## load metadata
metadata = pd.read_csv('metadata.csv', index_col=0)
region = range2region('16024-16569;1-397')

hids, groups, sites = [], [], []

with open('defilippo2010.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader)  # discard the header row
    for row in reader:
        hids.append(row[0])
        groups.append(row[1])
        sites.append(str2sites(row[3]))

## Validate
passed_validation = True

for i, expected in enumerate(sites):
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if mysites == expected:
        continue
    # The site lists differ; the entry still passes when the recovered sites
    # rebuild the same sequence.
    myseq = translate(sites2seq(mysites, region), None, '-')
    if seq == myseq:
        continue
    passed_validation = False
    print('%s %s' % (i, hids[i]))

counter = {}
for k in metadata.index:
Exemple #27
0
metadata = pd.read_csv('metadata.csv', index_col=0)
# NOTE(review): .ix is deprecated in pandas; confirm whether 0 is meant as a
# position or a label before switching to .iloc/.loc.
region = range2region(metadata.ix[0,'SeqRange'])

# missing counts are read as NaN and replaced by 0
counts = pd.read_csv('plaster2011_counts.csv', index_col=0).fillna(0)

hids, sites = [], []

with open('plaster2011_haplotypes.csv', 'rU') as f:
    reader = csv.reader(f)
    next(reader)  # discard the header row
    for row in reader:
        hids.append(row[0])
        # the second column lists sites separated by commas; str2sites gets spaces
        sites.append(str2sites(row[1].replace(',', ' '), add16k=True))

## Validate
passed_validation = True

for i in range(len(hids)):
    expected = sites[i]
    seq = sites2seq(expected, region)
    mysites = seq2sites(seq)
    if expected == mysites:
        continue
    # Differing site lists are tolerated when both sequences agree after
    # translate(..., None, '-').
    original_seq = translate(seq, None, '-')
    rebuilt_seq = translate(sites2seq(mysites, region), None, '-')
    if original_seq == rebuilt_seq:
        continue
    passed_validation = False
    print('%s %s' % (i, hids[i]))

# one zero-initialised counter per column of counts
counter = dict.fromkeys(counts.columns, 0)