def _run(self):
    """Write per-year counts of first mutation reports and of epdata reports."""
    with base.connect() as con:
        cur = con.cursor()

        # First report of each unique mutation (excluding exac and 2016),
        # counted per year of first publication.
        query = (
            'select year, count(old) as n from ('
            'SELECT old, idx, new, pub, min(year) as year'
            ' FROM report inner join publication'
            ' on report.pub == publication.key'
            ' where pub != "exac" and year != 2016'
            ' group by idx, new)'
            ' group by year'
        )
        filename = self.data_out('first-reports.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            writer = self.csv_writer(f)
            writer.writerow(['year', 'first_reports'])
            for record in cur.execute(query):
                writer.writerow([record['year'], record['n']])

        # All epdata reports (excluding 2016), counted per year.
        query = (
            'select year, count(old) as n from ('
            'SELECT old, idx, new, pub, year'
            ' FROM epdata inner join publication'
            ' on epdata.pub == publication.key'
            ' where year != 2016)'
            ' group by year'
        )
        filename = self.data_out('epdata-reports.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            writer = self.csv_writer(f)
            writer.writerow(['year', 'epdata_reports'])
            for record in cur.execute(query):
                writer.writerow([record['year'], record['n']])
def expected(self, simple_weights=False):
    """
    Returns a 2d matrix of expected amino acid substitutions ratios.

    Uses weights based on single nucleotide substitutions. By default,
    these take into account rates derived from the human genome. To
    disable this, set simple_weights = True.
    """
    acids, acid_map = self.acids()
    size = len(acids)
    matrix = np.zeros((size, size))
    with base.connect() as con:
        cur = con.cursor()
        field = 'simple_weight as weight' if simple_weights else 'weight'
        query = 'select old, new, ' + field + ' from mutation_possible'
        for record in cur.execute(query):
            i = acid_map[record['old']]
            j = acid_map[record['new']]
            matrix[i, j] += record['weight']
            if i == j:
                # Debug output: a substitution to the same acid is suspect
                print(record['old'], record['new'])
    return matrix / np.sum(matrix)
def _run(self):
    """Check every epdata mutation-publication link is also in `report`."""
    with base.connect() as con:
        cur = con.cursor()

        # Load all epdata mentions
        epdata_mentions = set(
            (old, idx, new, pub) for old, idx, new, pub in
            cur.execute('select old, idx, new, pub from epdata'))

        # Load all reports
        reports = set(
            (old, idx, new, pub) for old, idx, new, pub in
            cur.execute('select old, idx, new, pub from report'))

        # Links present in epdata but missing from report are an error
        diff = epdata_mentions - reports
        if diff:
            print('-' * 60)
            print('EPData establishes mutation-publication links not'
                  ' mentioned in report table')
            print('-' * 60)
            for old, idx, new, pub in diff:
                print(old + str(idx) + new + ' : ' + pub)
            import sys
            sys.exit(1)
        print('[ok] EPData links all listed in reports')
def main():
    """Create drones, register the deployment targets and run the reactor."""
    # Create drones
    drones.main()

    # Domains configuration: register the deployment targets
    for i in xrange(0, 18):
        hosts.add_host('target%i' % i, 'deploy')

    # Connect with all drone relays
    hosts_map = hosts.get_hosts_list()
    dlist = base.connect(hosts_map)

    # Run the deploy phase once every connection is up
    wait = defer.DeferredList(dlist)
    wait.addCallback(deploy_phase)

    # Start the Twisted reactor
    reactor.run()
def _run(self):
    """Print publications reporting five or more mutations, most first."""
    with base.connect() as con:
        cur = con.cursor()
        query = ('select pub, count(idx) as muts from report group by pub'
                 ' order by muts desc')
        for record in cur.execute(query):
            # Results are sorted descending, so we can stop at the first
            # publication below the threshold.
            if record['muts'] < 5:
                break
            print(str(record['muts']) + ' :: ' + record['pub'])
def gather(self, q, fname1, fname2):
    """
    Query the database with ``q`` and write two csv files relating the
    number of activation (na) and inactivation (ni) measurements to the
    standard deviations (stda, stdi) of the measured values.

    Rows with zero measurements are skipped. Summary statistics are
    printed to stdout before writing.
    """
    # Query db
    na, ni = [], []
    stda, stdi = [], []
    # NOTE(review): sema/semi are gathered but never used downstream;
    # kept in case a caller relies on the row fields existing.
    sema, semi = [], []
    with base.connect() as con:
        c = con.cursor()
        for row in c.execute(q):
            if row['na'] > 0:
                na.append(row['na'])
                stda.append(row['stda'])
                sema.append(row['sema'])
            if row['ni'] > 0:
                ni.append(row['ni'])
                stdi.append(row['stdi'])
                semi.append(row['semi'])

    # Calculate and print summary statistics
    stda = np.array(stda)
    stdi = np.array(stdi)
    print('std a mean: ' + str(np.mean(stda)))
    print('std i mean: ' + str(np.mean(stdi)))
    print('2sr a mean: ' + str(4 * np.mean(stda)))
    print('2sr i mean: ' + str(4 * np.mean(stdi)))
    print('std a min: ' + str(np.min(stda)))
    print('std a max: ' + str(np.max(stda)))
    print('std i min: ' + str(np.min(stdi)))
    print('std i max: ' + str(np.max(stdi)))
    # (Removed: a dead `if False:` debug-plot block and two unused
    # `data = []` locals.)

    # Write files
    fname1 = self.data_out(fname1)
    print('Writing to ' + fname1)
    with open(fname1, 'w') as f:
        csv = self.csv_writer(f)
        csv.writerow(['na', 'stda'])
        for k, n in enumerate(na):
            csv.writerow((n, stda[k]))
    fname2 = self.data_out(fname2)
    print('Writing to ' + fname2)
    with open(fname2, 'w') as f:
        csv = self.csv_writer(f)
        csv.writerow(['ni', 'stdi'])
        for k, n in enumerate(ni):
            csv.writerow((n, stdi[k]))
    print('Done')
def _run(self):
    """Write sliding-window averages of mutation count and alignment scores."""
    with base.connect() as con:
        cur = con.cursor()

        # Get positions (may have gaps!)
        idx = np.array([
            r[0] for r in cur.execute('select idx from scn5a order by idx')])

        # Get mutations for each position (positions start at 1)
        mut = np.zeros(idx.shape)
        query = 'select distinct idx, new from report where pub != "exac"'
        for r in cur.execute(query):
            mut[r[0] - 1] = 1

        # Get Human-squid-eel and domain alignment score
        hse = np.zeros(idx.shape, dtype=float)
        dom = np.zeros(idx.shape, dtype=float)
        query = 'select idx, hse, dom from conservedness order by idx'
        for k, r in enumerate(cur.execute(query)):
            assert (r[0] == 1 + k)  # Score should be stored for each idx
            hse[k] = r[1]
            dom[k] = r[2]

        #
        # 1. Sliding window averages for mutation count, hse score and
        #    dom score.
        #
        radius = 5
        ms = window(mut, radius)
        hs = window(hse, radius)
        ds = window(dom, radius)
        basename = 'windowed-averages'

        filename = self.data_out(basename + '.txt')
        print('Writing info to ' + filename)
        with open(filename, 'w') as f:
            f.write('Mutation count, human-squid-eel and domain-alignment'
                    ' score were measured per position, and then averaged'
                    ' using a sliding window with radius ' + str(radius) +
                    ' leading to a window size of ' + str(1 + 2 * radius) +
                    '. At the borders, where the window is smaller, the'
                    ' average is computed by dividing through the effective'
                    ' window size at that point.')

        filename = self.data_out(basename + '.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            # Use a distinct name for the csv writer to avoid shadowing
            # the database cursor.
            writer = self.csv_writer(f)
            writer.writerow([
                'position', 'mutation_count', 'hse_score', 'dom_score',
            ])
            hs = iter(hs)
            ds = iter(ds)
            for k, m in enumerate(ms):
                writer.writerow([k + 1, m, next(hs), next(ds)])
def connect_next(): global index if index >= len(hosts): print 'Successful' reactor.stop() return host = hosts[index] dlist = base.connect((host,)) dlist[0].addCallback(connected, host) dlist[0].addErrback(errback, host)
def _run(self):
    """
    Creates the weka (arff) files.

    Two discretized data sets (dva and dvi) plus one class file per
    outcome, all restricted to deltas only, without moments.
    """
    with base.connect() as connection:
        # Discretized data sets (same calls as before, deduplicated)
        for name, dva in (('dva-disc', True), ('dvi-disc', False)):
            filename = self.data_out(name + '.arff')
            self.create_discretized(
                connection,
                filename,
                dva=dva,
                moments=False,
                deltas_only=True,
            )
        # One class file per outcome
        for output in ('zero', 'act', 'inact', 'late', 'changed'):
            filename = self.data_out(output + '.arff')
            self.create_class(
                connection, filename, output=output, deltas_only=True)
def observed(self):
    """
    Returns a matrix of observed amino acid substitution ratios.

    Each unique (old, idx, new) mutation report (excluding the "exac"
    publication) is counted once; the matrix is normalised to sum to 1.
    """
    acids, acid_map = self.acids()
    n = len(acids)
    matrix = np.zeros((n, n), dtype=float)
    with base.connect() as con:
        c = con.cursor()
        q = 'select distinct old, idx, new from report where pub != "exac"'
        # (Removed an unused enumerate index from the original loop.)
        for row in c.execute(q):
            matrix[acid_map[row['old']], acid_map[row['new']]] += 1
    return matrix / np.sum(matrix)
def main():
    """Create drones, register the load hosts and start the reactor."""
    # Create drones
    drones.main()

    # Add hosts
    hosts.add_host('load0', 'deploy')
    hosts.add_host('load1', 'deploy')

    # Connect with all drone relays
    dlist = base.connect(hosts.get_hosts_list())

    # Run the deploy phase once every connection is made
    wait = defer.DeferredList(dlist)
    wait.addCallback(deploy_phase)

    # Start the Twisted reactor
    reactor.run()
def main():
    """Create drones, connect to the action host and start the reactor."""
    # Create drones
    drones.main()

    # Add hosts
    hosts.add_host('192.168.96.6', 'action')

    # Connect with all drone relays
    deferList = base.connect(hosts.get_hosts_list())
    wait = defer.DeferredList(deferList)

    # Decide what to do after connection setup
    wait.addCallback(test0)

    # Start the Twisted reactor
    reactor.run()
def gather(self, q):
    """
    Query the database with ``q`` and print summary statistics of the
    (tmin, tmax) ranges and their midpoints (tmid).
    """
    # Query db
    tmin, tmax, tmid = [], [], []
    with base.connect() as con:
        c = con.cursor()
        for row in c.execute(q):
            tmin.append(row['tmin'])
            tmax.append(row['tmax'])
            tmid.append(0.5 * (row['tmin'] + row['tmax']))

    # Create arrays of tmin, tmax, tmid
    tmin = np.array(tmin)
    tmax = np.array(tmax)
    tmid = np.array(tmid)
    # Bug fix: this line was labelled "TMid" but computed np.mean(tmin);
    # use tmid to match the label (and the std on the next line).
    print('TMid, mean: ' + str(np.mean(tmid)))
    print('TMid, std : ' + str(np.std(tmid)))
    print('2Sigma range: ' + str(4 * np.std(tmid)))
    # Overall extent: smallest tmin to largest tmax
    print('Min, max: ' + str(np.min(tmin)) + ', ' + str(np.max(tmax)))
    print('Done')
def main():
    """Create drones, register the action targets and start the reactor."""
    # Create drones
    drones.main()

    # Add hosts
    for i in xrange(0, 18):
        hosts.add_host('target%i' % i, 'action')

    # Connect with all drone relays
    dlist = base.connect(hosts.get_hosts_list())

    # Deploy once connected; report connection failures via the errback
    wait = defer.DeferredList(dlist)
    wait.addCallback(deploy_phase)
    wait.addErrback(error)

    # Start the Twisted reactor
    reactor.run()
def _run(self):
    """Write the diagram (x, y) location of every isoform-b position."""
    with base.connect() as con:
        cur = con.cursor()

        # Retrieve all isoform-b indices, in order
        indices = [row['idx'] for row in cur.execute(
            'select * from scn5a_isoform_b order by idx')]

        # Map each index to its diagram location
        locations = {}
        for row in cur.execute('select * from scn5a_diagram order by idx'):
            locations[row['idx']] = (row['x'], row['y'])

        # Write to file
        filename = self.data_out('diagram_isoform_b.csv')
        print('Writing ' + filename)
        with open(filename, 'w') as f:
            writer = self.csv_writer(f)
            writer.writerow(['idx', 'x', 'y'])
            for idx in indices:
                x, y = locations[idx]
                writer.writerow([idx, x, y])
def _run(self):
    """Write voltage shifts alongside conservedness scores per index."""
    with base.connect() as con:
        cur = con.cursor()

        # Load known shifts (only rows with both dva and dvi)
        print('Loading voltage shift data')
        query = ('select * from epdata_filtered'
                 ' where dva is not null and dvi is not null')
        mutations = [
            (int(row['idx']), float(row['dva']), float(row['dvi']))
            for row in cur.execute(query)]

        # Load conservedness scores
        doms = {}
        hses = {}
        for row in cur.execute('select * from conservedness'):
            doms[row['idx']] = row['dom']
            hses[row['idx']] = row['hse']

        # Store for graphing
        filename = self.data_out('voltage-shift-conservedness.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            writer = self.csv_writer(f)
            writer.writerow(['idx', 'dom', 'hse', 'dva', 'dvi', 'window'])
            for idx, dva, dvi in mutations:
                # "window" is the gap between inactivation and activation
                writer.writerow(
                    [idx, doms[idx], hses[idx], dva, dvi, dvi - dva])
        print('Done')
def _run(self):
    """Print counts of wild-type midpoint measurements per isoform."""
    with base.connect() as con:
        cur = con.cursor()
        # totals[0] = all rows; totals[1..4] = with-b1, HEK, Oocyte, CHO
        totals = [0] * 5

        def show(letter=None):
            # Base query: one isoform, or rows with unknown sequence
            q = 'select count(pub) from midpoints_wt'
            if letter:
                q += ' where sequence = "' + letter + '"'
            else:
                q += ' where sequence is null'
            n = next(cur.execute(q))[0]
            totals[0] += n
            if letter:
                print('Isoform ' + letter + ': ' + str(n))
            else:
                print('Unknown : ' + str(n))
            # Sub-counts: with beta1 subunit, then per expression system
            subqueries = [
                (' and beta1="yes"', ' with b1 : '),
                (' and cell="HEK"', ' HEK : '),
                (' and cell="Oocyte"', ' Oocyte : '),
                (' and cell="CHO"', ' CHO : '),
            ]
            for slot, (clause, label) in enumerate(subqueries, start=1):
                n = next(cur.execute(q + clause))[0]
                totals[slot] += n
                print(label + str(n))
            print('')

        show('a')
        show('b')
        show('astar')
        show('bstar')
        show(None)
        print('Totals: ' + str(totals))
def _run(self):
    """Write a csv of voltage shifts (dva, dvi) per mutation index."""
    with base.connect() as con:
        cur = con.cursor()
        print('Loading voltage shift data')
        mutations = []
        for row in cur.execute('select * from epdata'):
            dva = row['dva']
            dvi = row['dvi']
            # Convert known values to float; leave None as-is
            if dva is not None:
                dva = float(dva)
            if dvi is not None:
                dvi = float(dvi)
            mutations.append([row['idx'], dva, dvi])

        filename = self.data_out('voltage-shift-indices.csv')
        print('Writing ' + filename)
        with open(filename, 'w') as f:
            writer = self.csv_writer(f)
            writer.writerow(['idx', 'dva', 'dvi'])
            for mutation in mutations:
                writer.writerow(mutation)
        print('Done')
def _run(self):
    """Write diagram locations of isoform-b positions with a NavAb match."""
    with base.connect() as con:
        cur = con.cursor()

        # Get scn5a isoform b indices
        isoform_b = set()
        for row in cur.execute('select * from scn5a_isoform_b order by idx'):
            isoform_b.add(row['idx'])

        # Get a map from index to diagram location
        locations = {}
        for row in cur.execute('select * from scn5a_diagram order by idx'):
            locations[row['idx']] = (row['x'], row['y'])

        # Get scn5a indices matched to a NavAb acid, write to file
        filename = self.data_out('diagram-navab-cover.csv')
        with open(filename, 'w') as f:
            writer = self.csv_writer(f)
            writer.writerow(('idx', 'x', 'y'))
            query = 'select scn5a from navab_to_scn5a order by scn5a;'
            for row in cur.execute(query):
                idx = row['scn5a']
                if idx in isoform_b:
                    x, y = locations[idx]
                    writer.writerow((idx, x, y))
def acids(self):
    """
    Returns a sequence of amino acid codes and a reverse lookup dict.

    (So a sequence [K, D, E, ...] and a dict {K:0, D:1, E:2, ...}).

    The result is computed once and cached on the instance.
    """
    if self._acids is None:
        # Fixed ordering of the twenty amino acids. (A dead `if False:`
        # branch that read the same ordering from the `acid` table was
        # removed.)
        self._acids = 'KDERHQNSPTGACWFLIMYV'
        assert (len(self._acids) == 20)
        self._acid_map = dict(zip(self._acids, range(len(self._acids))))
    return self._acids, self._acid_map
def _run(self):
    """
    Count, per domain / region type / region, how many mutations cause
    each category of functional change, and write absolute plus
    row-normalised csv files for each grouping.

    (The original repeated the same count-and-write logic three times;
    it is extracted into inner helpers with identical behaviour.)
    """
    with base.connect() as con:
        c = con.cursor()

        # Change selecting queries
        condition_names = [
            'zero', 'act', 'inact', 'late', 'changed', 'unchanged',
        ]
        conditions = [
            'zero > 0',
            'act > 0',
            'inact > 0',
            'late > 0',
            # Changed / no change reported:
            '(zero > 0 or act > 0 or inact > 0 or late > 0)',
            '(zero < 1 and act < 1 and inact < 1 and late < 1)',
        ]

        def count_changes(field, names):
            # For each name, count epdata_annotated rows matching each
            # condition, restricted to rows where `field` equals the name.
            # Note that epdata_annotated contains each unique mutation only
            # once (although its fields such as `act` show a sum of votes
            # for whether or not it was affected, based on all available
            # reports).
            all_counts = []
            for name in names:
                q = 'select count(idx) from epdata_annotated'
                q += ' where ' + field + ' = "' + name + '"'
                counts = []
                for condition in conditions:
                    qc = q + ' and ' + condition
                    counts.append(c.execute(qc).fetchone()[0])
                all_counts.append(counts)
            return all_counts

        def write_files(absname, relname, names, all_counts):
            # Absolute counts
            filename = self.data_out(absname)
            print('Writing data to ' + filename)
            with open(filename, 'w') as f:
                w = self.csv_writer(f)
                w.writerow(['xaxis', 'xlabel'] + condition_names)
                for k, name in enumerate(names):
                    w.writerow([k + 1, name] + all_counts[k])
            # Relative counts (`or 1` guards against division by zero)
            filename = self.data_out(relname)
            print('Writing data to ' + filename)
            with open(filename, 'w') as f:
                w = self.csv_writer(f)
                w.writerow(['xaxis', 'xlabel'] + condition_names)
                for k, name in enumerate(names):
                    r = np.array(all_counts[k])
                    s = np.sum(r) or 1
                    w.writerow([k + 1, name] + list(r / s))

        # 1. Number of changes per domain
        domain_names = [
            r['name'] for r in c.execute('select name from domain')]
        domain_counts = count_changes('domain', domain_names)
        write_files(
            'changes-1-per-domain.csv',
            'changes-relative-1-per-domain.csv',
            domain_names, domain_counts)

        # 2. Number of changes per region type
        regtype_names = [
            r['name'] for r in c.execute('select name from regtype')]
        regtype_counts = count_changes('regtype', regtype_names)
        write_files(
            'changes-2-per-regtype.csv',
            'changes-relative-2-per-regtype.csv',
            regtype_names, regtype_counts)

        # 3. Number of changes per region (ordered by start position)
        region_names = [
            r['name'] for r in c.execute(
                'select name from region order by start')]
        region_counts = count_changes('region', region_names)
        write_files(
            'changes-3-per-region.csv',
            'changes-relative-3-per-region.csv',
            region_names, region_counts)
def make(self, alpha):
    """
    Alpha=True, make file for alpha subunits
    Alpha=False, make file for beta subunits

    Writes three csv files per subunit type: absolute counts per year,
    per-year fractions, and cumulative per-year fractions.
    """
    with base.connect() as con:
        c = con.cursor()
        # Get years to find data for
        q = 'select min(year)'
        q += ' from epdata inner join publication'
        q += ' on epdata.pub = publication.key'
        year1 = next(c.execute(q))[0]
        years = range(year1, 2016)
        # Gather data: maps subunit name -> list of counts per year
        data = {}
        if alpha:
            # Alpha subunits
            names = []
            q = 'select distinct sequence from epdata'
            for row in c.execute(q):
                names.append(row['sequence'])
            # The null sequence is handled separately as 'Unknown' below
            names.remove(None)
            q = 'select year, count(old) as n'
            q += ' from epdata inner join publication'
            q += ' on epdata.pub = publication.key'
            q += ' where sequence = ? and year < 2016'
            q += ' group by year'
            for name in names:
                ydata = [0]*len(years)
                for row in c.execute(q, (name,)):
                    ydata[row['year'] - year1] = row['n']
                data[name] = ydata
            q = 'select year, count(old) as n'
            q += ' from epdata inner join publication'
            q += ' on epdata.pub = publication.key'
            q += ' where sequence is null and year < 2016'
            q += ' group by year'
            ydata = [0]*len(years)
            for row in c.execute(q):
                ydata[row['year'] - year1] = row['n']
            data['Unknown'] = ydata
            names.append('Unknown')
            # Tweak order: fixed plotting order for the known sequences
            tweak = [
                'achen',
                'bstar',
                'astar',
                'b',
                'a',
                'Unknown',
            ]
            if set(names) != set(tweak):
                raise Exception('Tried to tweak order of lines, but custom'
                                ' order is lacking some values! Should have: '
                                + ','.join(names))
            names = tweak
        else:
            # Beta subunits
            names = ['yes', 'no']
            q = 'select year, count(old) as n'
            q += ' from epdata inner join publication'
            q += ' on epdata.pub = publication.key'
            q += ' where beta1 = ? and year < 2016'
            q += ' group by year'
            for name in names:
                ydata = [0]*len(years)
                for row in c.execute(q, (name,)):
                    ydata[row['year'] - year1] = row['n']
                data[name] = ydata
        # Filenames
        if alpha:
            basename = 'alpha_per_year'
        else:
            basename = 'beta1_per_year'
        # Write output: one column per name, iterated in lockstep
        iters = []
        iters.append(iter(years))
        year_rows = []
        for name in names:
            iters.append(iter(data[name]))
        filename = self.data_out(basename + '.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(['year'] + names)
            for k, year in enumerate(years):
                row = [next(i) for i in iters]
                w.writerow(row)
                # Keep the counts (without the year) for the fraction files
                year_rows.append(np.array(row[1:]))
        # Plot same data but as fraction (rows with zero total left as-is)
        for k, row in enumerate(year_rows):
            if np.sum(row) > 0:
                year_rows[k] = row / np.sum(row)
        filename = self.data_out(basename + '_frac.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(['year'] + names)
            for k, year in enumerate(years):
                w.writerow([year] + list(year_rows[k] * 100))
        # Plot same data but as cumulative fractions
        for k, row in enumerate(year_rows):
            offset = 0
            for i, x in enumerate(row):
                year_rows[k][i] += offset
                offset += x
        filename = self.data_out(basename + '_cumfrac.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(['year'] + names)
            for k, year in enumerate(years):
                w.writerow([year] + list(year_rows[k]))
import base
import scrapper
import msvcrt as m

# Interactive setup: prepare the database schema, then scrape both
# clothing categories into it.
base.clear()
print("First step")
print("Create connection to database")
print("Make sure u have mysql database, user with have privileges to it and he using plugin mysql_native_password")
print("")
base.raw_print("Any key to continue ")
m.getch()

database = base.connect()
if base.create_schema(database):
    print("Operation end successfully")
    print("")
    print("Start webscraping")
    women_url = "https://www.zalando.pl/odziez-damska"
    men_url = "https://www.zalando.pl/odziez-meska/"
    scrapper.scrap(women_url, "kobiety", database)
    scrapper.scrap(men_url, "mezczyzni", database)
else:
    print("Error occur during creating schema")
def _run(self):
    """
    Write NavAb-mapped SCN5A position data:

    1. Physical locations of positions with epdata, with mutations but
       no epdata, and with neither (only positions with a NavAb
       equivalent).
    2. Locations of mutations with their midpoint shifts (dva, dvi).
    3. Translated NavAb positions, grouped by SCN5A region.
    """
    with base.connect() as con:
        c = con.cursor()

        #
        # 1. Location files
        #

        # Get all positions with mutations (no exac)
        q = 'select distinct idx from report where pub != "exac"'
        mutations = set()
        for row in c.execute(q):
            mutations.add(row['idx'])

        # Get all positions with epdata
        epdata = set()
        for row in c.execute('select distinct idx from epdata'):
            epdata.add(row['idx'])

        # Get NavAb-SCN5A translation
        navab2scn5a = {}
        for row in c.execute('select * from navab_to_scn5a'):
            navab2scn5a[row['navab']] = row['scn5a']

        # Write locations of SCN5A mutations, where known.
        # (Changed try/finally to `with`: the original could raise a
        # NameError in `finally` if a later open() failed.)
        file1 = self.data_out('navab-locations-epdata.csv')
        file2 = self.data_out('navab-locations-mutation.csv')
        file3 = self.data_out('navab-locations-free.csv')
        with open(file1, 'w') as f1, open(file2, 'w') as f2, \
                open(file3, 'w') as f3:
            c1 = self.csv_writer(f1)
            c2 = self.csv_writer(f2)
            c3 = self.csv_writer(f3)
            data = ['idx', 'x', 'y', 'z', 'r', 't']
            c1.writerow(data)
            c2.writerow(data)
            c3.writerow(data)
            print('Writing data to ' + file1)
            print(' and ' + file2)
            print(' and ' + file3)
            for row in c.execute('select * from navab_locations'):
                # Get scn5a equivalent; skip if none is known
                try:
                    idx = navab2scn5a[row['key']]
                except KeyError:
                    continue
                data = [
                    idx, row['x'], row['y'], row['z'], row['r'], row['t']]
                if idx in epdata:
                    c1.writerow(data)
                elif idx in mutations:
                    c2.writerow(data)
                else:
                    c3.writerow(data)

        #
        # 2. Positions of mutations and the associated dvi and dva
        #

        # Get midpoint shifts
        q = 'select * from epdata_filtered'
        q += ' where dva is not null and dvi is not null'
        names = {}
        dvi = {}
        dva = {}
        for row in c.execute(q):
            names[row['idx']] = row['old'] + str(row['idx']) + row['new']
            dvi[row['idx']] = row['dvi']
            dva[row['idx']] = row['dva']

        # Write file with distance to pore and midpoint shifts
        file1 = self.data_out('voltage-shift-navab-locations.csv')
        print('Writing data to ' + file1)
        with open(file1, 'w') as f1:
            c1 = self.csv_writer(f1)
            c1.writerow([
                'name', 'r', 'dva', 'dvi', 'dva_abs', 'dvi_abs',
                'sum_abs', 'window',
            ])
            for row in c.execute('select * from navab_locations'):
                # Get scn5a equivalent or skip
                try:
                    idx = navab2scn5a[row['key']]
                except KeyError:
                    continue
                # Get epdata or skip
                try:
                    name = names[idx]
                    da = dva[idx]
                    di = dvi[idx]
                except KeyError:
                    continue
                # Bug fix: the dva_abs/dvi_abs columns were swapped, and
                # sum_abs was computed as abs(di) + abs(di).
                c1.writerow([
                    name, row['r'], da, di, abs(da), abs(di),
                    abs(da) + abs(di), di - da,
                ])

        #
        # 3. Translated NavAb positions, grouped by SCN5A region
        #

        # Get regions, scn5a idx to region mapping
        regions = []
        scn5a_regions = {}
        for row in c.execute('select * from region order by start'):
            regions.append(row['name'])
            for idx in range(row['start'], 1 + row['end']):
                scn5a_regions[idx] = row['name']

        # Create one file per region
        path = self.data_out('navab_location_regions')
        if not os.path.isdir(path):
            os.makedirs(path)
        files = []
        csvs = {}
        try:
            for region in regions:
                # Bug fix: the space-sanitised name was computed and then
                # discarded; use it in the actual filename.
                filename = os.path.join(
                    path, region.replace(' ', '-') + '.csv')
                print('Writing data to ' + filename)
                f = open(filename, 'w')
                files.append(f)
                csv = self.csv_writer(f)
                csvs[region] = csv
                csv.writerow(['idx', 'x', 'y', 'z'])
            # Get and write data
            q = 'select * from navab_locations order by key'
            for row in c.execute(q):
                # Get scn5a equivalent or skip
                try:
                    idx = navab2scn5a[row['key']]
                except KeyError:
                    continue
                # Write to correct file
                csvs[scn5a_regions[idx]].writerow(
                    [idx, row['x'], row['y'], row['z']])
        finally:
            for f in files:
                f.close()
def _run(self):
    """
    Count, per amino acid, how many mutations from and to that acid
    cause each type of functional change, and write absolute plus
    row-normalised csv files for "from", "to" and combined counts.
    """
    with base.connect() as con:
        c = con.cursor()
        # Change selecting queries
        conditions = [
            'zero > 0',
            'act > 0',
            'inact > 0',
            'late > 0',
        ]
        # Acids
        acids = [
            str(r['key'])
            for r in c.execute('select key from acid order by rowid')
        ]
        # Number of changes per acid_from: votes are summed per unique
        # (idx, new) mutation, then mutations matching each condition
        # are counted.
        counts_fr = []
        for acid in acids:
            q = 'select count(idx) from ('
            q += ' select idx, sum(zero) as zero, sum(act) as act,'
            q += ' sum(inact) as inact, sum(late) as late'
            q += ' from epdata_annotated'
            q += ' where old = "' + acid + '"'
            counts = []
            for condition in conditions:
                qc = q + ' and ' + condition + ' group by idx, new)'
                counts.append(c.execute(qc).fetchone()[0])
            counts_fr.append(counts)
        # Write files
        filename = self.data_out('acid-changes-1-from.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(['xaxis', 'xlabel', 'zero', 'act', 'inact', 'late'])
            for k, name in enumerate(acids):
                w.writerow([k + 1, name] + counts_fr[k])
        # Normalise each row; the epsilon avoids division by zero
        relative_fr = np.array(counts_fr, dtype=float)
        relative_fr /= (np.sum(relative_fr, axis=1).reshape(20, 1) + 1e-12)
        filename = self.data_out('acid-changes-relative-1-from.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(['xaxis', 'xlabel', 'zero', 'act', 'inact', 'late'])
            for k, name in enumerate(acids):
                w.writerow([k + 1, name] + list(relative_fr[k]))
        # Number of changes per acid_to (same query, filtered on `new`)
        counts_to = []
        for acid in acids:
            q = 'select count(idx) from ('
            q += ' select idx, sum(zero) as zero, sum(act) as act,'
            q += ' sum(inact) as inact, sum(late) as late'
            q += ' from epdata_annotated'
            q += ' where new = "' + acid + '"'
            counts = []
            for condition in conditions:
                qc = q + ' and ' + condition + ' group by idx, new)'
                counts.append(c.execute(qc).fetchone()[0])
            counts_to.append(counts)
        # Write file
        filename = self.data_out('acid-changes-2-to.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(['xaxis', 'xlabel', 'zero', 'act', 'inact', 'late'])
            for k, name in enumerate(acids):
                w.writerow([k + 1, name] + counts_to[k])
        relative_to = np.array(counts_to, dtype=float)
        relative_to /= (np.sum(relative_to, axis=1).reshape(20, 1) + 1e-12)
        filename = self.data_out('acid-changes-relative-2-to.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(['xaxis', 'xlabel', 'zero', 'act', 'inact', 'late'])
            for k, name in enumerate(acids):
                w.writerow([k + 1, name] + list(relative_to[k]))
        # Write file combined data (from + to, per acid)
        filename = self.data_out('acid-changes-3-combined.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(['xaxis', 'xlabel', 'zero', 'act', 'inact', 'late'])
            for k, name in enumerate(acids):
                counts = []
                for i in range(len(conditions)):
                    counts.append(counts_fr[k][i] + counts_to[k][i])
                w.writerow([k + 1, name] + counts)
        relative_sum = np.array(counts_to, dtype=float)
        relative_sum += np.array(counts_fr, dtype=float)
        relative_sum /= (np.sum(relative_sum, axis=1).reshape(20, 1) + 1e-12)
        filename = self.data_out('acid-changes-relative-3-combined.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(['xaxis', 'xlabel', 'zero', 'act', 'inact', 'late'])
            for k, name in enumerate(acids):
                w.writerow([k + 1, name] + list(relative_sum[k]))
def _run(self):
    """
    Print summary statistics: counts of positions, publications,
    journals, mutations, and electrophysiological (EP) outcomes.
    """
    with base.connect() as con:
        c = con.cursor()

        # Count positions
        n = next(c.execute('select count(idx) from scn5a;'))[0]
        print('Positions: ' + str(n))

        # Count articles
        n = next(c.execute('select count(key) from publication;'))[0]
        print('Articles: ' + str(n))

        # Count journals
        n = next(c.execute('select count(key) from journal;'))[0]
        print('Journals: ' + str(n))

        # Count mutations
        # Note: `mutation` lists only unique (old, idx, new) pairs, not
        # reports of mutations!
        n = next(c.execute('select count(old) from mutation;'))[0]
        print('Mutations : ' + str(n))

        # Count unique mutation positions
        q = 'select count(idx) from '\
            '(select distinct idx from mutation);'
        n = next(c.execute(q))[0]
        print('Positions with mutations: ' + str(n))

        # Count mutations (no exac)
        q = 'select count(idx) from '\
            '(select distinct idx, new from report where pub != "exac");'
        n = next(c.execute(q))[0]
        print('Mutations (no exac) : ' + str(n))

        # Count unique mutation positions (no exac)
        q = 'select count(distinct idx) from report where pub != "exac"'
        n = next(c.execute(q))[0]
        print('Positions with mutations (no exac): ' + str(n))

        # EP Data measurements
        n = next(c.execute('select count(idx) from epdata;'))[0]
        print('EP measurements: ' + str(n))

        # Mutations with measured EP
        q = 'select count(idx) from epdata_outcomes'
        n = next(c.execute(q))[0]
        print('Mutations with epdata: ' + str(n))

        # Positions with measured EP
        q = 'select count(distinct idx) from epdata_outcomes'
        n = next(c.execute(q))[0]
        print('Positions with epdata: ' + str(n))

        # Mutations / positions with any change to EP
        q = 'select count(idx) from epdata_outcomes'
        q += ' where zero>0 or act>0 or inact>0 or late>0'
        n = next(c.execute(q))[0]
        print('Mutations with change to epdata: ' + str(n))
        q = 'select count(distinct idx) from epdata_outcomes'
        q += ' where zero>0 or act>0 or inact>0 or late>0'
        n = next(c.execute(q))[0]
        print('Positions with change to epdata: ' + str(n))
        # Bug fix: removed a dead 'select distinct idx ...' query that
        # was built here but overwritten before ever being executed.

        # Mutations / positions without any change to EP
        q = 'select count(idx) from epdata_outcomes'
        q += ' where zero<1 and act<1 and inact<1 and late<1'
        n = next(c.execute(q))[0]
        print('Mutations with no change to epdata: ' + str(n))
        q = 'select count(distinct idx) from epdata_outcomes'
        q += ' where zero<1 and act<1 and inact<1 and late<1'
        n = next(c.execute(q))[0]
        print('Positions with no change to epdata: ' + str(n))

        # Mutations and positions per individual outcome flag
        # (deduplicated; output order and text unchanged)
        for flag in ('zero', 'act', 'inact', 'late'):
            q = 'select count(idx) from ' \
                '(select distinct idx, new from epdata_outcomes where ' \
                + flag + '>0);'
            n = next(c.execute(q))[0]
            print('Mutations with ' + flag + '=1: ' + str(n))
            q = 'select count(idx) from ' \
                '(select distinct idx from epdata_outcomes where ' \
                + flag + '>0);'
            n = next(c.execute(q))[0]
            print('Positions with ' + flag + '=1: ' + str(n))
def connect(database, username=None, password=None, environment=None): """Creates a database connection. """ return Connection(base.connect(database, username, password, environment))
def _run(self):
    """Compare conservedness scores (hse, dom) for positions with and
    without reported (non-ExAC) mutations.

    Prints summary statistics and writes box-plot data files via
    ``self.data_out`` / ``self.csv_writer``.
    """
    with base.connect() as con:
        c = con.cursor()
        # Get positions (may have gaps!)
        idx = []
        for r in c.execute('select idx from scn5a order by idx'):
            idx.append(r[0])
        idx = np.array(idx)
        # Get mutations for each position
        q = 'select distinct idx, new from report where pub != "exac"'
        mut = np.zeros(idx.shape)
        for r in c.execute(q):
            mut[r[0] - 1] = 1  # Positions start at 1
        # Get Human-squid-eel and domain alignment score
        hse = np.zeros(idx.shape, dtype=float)
        dom = np.zeros(idx.shape, dtype=float)
        q = 'select idx, hse, dom from conservedness order by idx'
        for k, r in enumerate(c.execute(q)):
            assert (r[0] - 1 == k)  # Score should be stored for each idx
            hse[k] = r[1]
            dom[k] = r[2]
        #
        # 1. Overal mean mutation count and hse/dom scores
        # (Text output only)
        #
        hse_mean = np.mean(hse)
        hse_stdd = np.std(hse)
        dom_mean = np.mean(dom)
        dom_stdd = np.std(dom)
        print('Mean hse: ' + str(hse_mean) + ', std: ' + str(hse_stdd))
        print('Mean dom: ' + str(dom_mean) + ', std: ' + str(dom_stdd))
        #
        # 2. Position and dom/hse scores for positions with and without
        # mutations.
        # (Text output only)
        #
        # Boolean masks: mut > 0 selects positions with at least one
        # reported mutation, mut == 0 those without.
        idx_idx = idx[mut > 0]
        hse_idx = hse[mut > 0]
        dom_idx = dom[mut > 0]
        idx_neg = idx[mut == 0]
        hse_neg = hse[mut == 0]
        dom_neg = dom[mut == 0]
        hse_idx_mean = np.mean(hse_idx)
        hse_idx_stdd = np.std(hse_idx)
        dom_idx_mean = np.mean(dom_idx)
        dom_idx_stdd = np.std(dom_idx)
        hse_neg_mean = np.mean(hse_neg)
        hse_neg_stdd = np.std(hse_neg)
        dom_neg_mean = np.mean(dom_neg)
        dom_neg_stdd = np.std(dom_neg)
        print('HSE score:')
        print(' Mean, with mutations: ' + str(hse_idx_mean)
              + ', std: ' + str(hse_idx_stdd))
        print(' Mean, no mutations : ' + str(hse_neg_mean)
              + ', std: ' + str(hse_neg_stdd))
        print('DOM score:')
        print(' Mean, with mutations: ' + str(dom_idx_mean)
              + ', std: ' + str(dom_idx_stdd))
        print(' Mean, no mutations : ' + str(dom_neg_mean)
              + ', std: ' + str(dom_neg_stdd))
        #
        # 3. HSE and DOM score for positions with and without mutations
        # (For use in a box-plot)
        #
        basename = 'score-with-mutations'
        filename = self.data_out(basename + '.txt')
        print('Writing info to ' + filename)
        with open(filename, 'w') as f:
            f.write('Scores for positions with mutations (idx, hse, dom)')
        filename = self.data_out(basename + '.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            # NOTE: `c` is rebound here from the db cursor to a csv
            # writer; all queries have completed by this point.
            c = self.csv_writer(f)
            c.writerow(['position', 'hse-score', 'dom-score'])
            h = iter(hse_idx)
            d = iter(dom_idx)
            for p in idx_idx:
                c.writerow([p, next(h), next(d)])
        basename = 'score-without-mutations'
        filename = self.data_out(basename + '.txt')
        print('Writing info to ' + filename)
        with open(filename, 'w') as f:
            f.write(
                'Scores for positions without mutations (idx, hse, dom)')
        filename = self.data_out(basename + '.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            c = self.csv_writer(f)
            c.writerow(['position', 'hse-score', 'dom-score'])
            h = iter(hse_neg)
            d = iter(dom_neg)
            for p in idx_neg:
                c.writerow([p, next(h), next(d)])
        # Write labels used to create box plots
        basename = 'score-with-without-mutations-labels'
        filename = self.data_out(basename + '.csv')
        print('Writing label info to ' + filename)
        with open(filename, 'w') as f:
            c = self.csv_writer(f)
            c.writerow(['HSE-'])
            c.writerow(['HSE+'])
            c.writerow(['DOM-'])
            c.writerow(['DOM+'])
def _run(self):
    """Compute mutation densities per region and per region type, and
    write them to .txt (description) and .csv (data) output files."""
    print('Calculating...')
    # Get region names and positions
    position_count = 0  # Number of positions
    region_name = []  # Region name
    region_first = []  # First position in region
    region_final = []  # Final position in region
    region_size = []  # Number of positions in region
    region_map = {}  # Maps positions to regions
    region_regtype = []  # Region "regtype" values
    with base.connect() as con:
        c = con.cursor()
        q = 'select name, start, end, regtype from region order by start'
        for r in c.execute(q):
            k = len(region_name)
            region_name.append(r[0])
            region_first.append(r[1])
            region_final.append(r[2])
            region_size.append(1 + r[2] - r[1])
            for p in range(r[1], r[2] + 1):
                region_map[p] = k
            region_regtype.append(r[3])
            # NOTE(review): assumes regions are contiguous and ordered,
            # so the last region's `end` is the total position count --
            # confirm against the region table.
            position_count = r[2]
        # Number of regions
        region_count = len(region_name)
        # Number of mutations per region
        region_mutations = [0] * region_count
        mutation_count = 0
        q = 'select distinct idx, new from report where pub != "exac"'
        for r in c.execute(q):
            region_mutations[region_map[r[0]]] += 1
            mutation_count += 1
        # Get average density
        mutation_density = float(mutation_count) / position_count
        # Get mutation density and relative density, per region
        # NOTE(review): relative density is computed by SUBTRACTING the
        # global density, while the .txt description written below says
        # it is a ratio -- one of the two looks wrong; confirm intent.
        region_density = [0] * region_count
        region_reldens = [0] * region_count
        for k, count in enumerate(region_mutations):
            density = float(count) / region_size[k]
            reldens = density - mutation_density if density != 0 else 0
            region_density[k] = density
            region_reldens[k] = reldens
        # Region types
        regtype_name = []
        regtype_map = {}
        q = 'select name from regtype order by rowid'
        for k, r in enumerate(c.execute(q)):
            regtype_name.append(r[0])
            regtype_map[r[0]] = k
        # Count number of regtypes
        regtype_count = len(regtype_name)
        # Count mutations per regtype
        regtype_mutations = [0] * regtype_count
        # Count positions per regtype
        regtype_size = [0] * regtype_count
        for k, t in enumerate(region_regtype):
            t = regtype_map[t]
            regtype_mutations[t] += region_mutations[k]
            regtype_size[t] += region_size[k]
        # Density and relative density per regtype
        regtype_density = [0] * regtype_count
        regtype_reldens = [0] * regtype_count
        for k, count in enumerate(regtype_size):
            density = float(regtype_mutations[k]) / count
            reldens = density - mutation_density if density != 0 else 0
            regtype_density[k] = density
            regtype_reldens[k] = reldens
        #
        # Write results
        #
        # 1. Global mutation density
        basename = 'mutation-density-global'
        filename = self.data_out(basename + '.txt')
        print('Writing info to ' + filename)
        with open(filename, 'w') as f:
            f.write('Number of positions in scn5a, mutations found, global'
                    ' mutation density.')
        filename = self.data_out(basename + '.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow(
                ['total_positions', 'total_mutations', 'total_density'])
            w.writerow([position_count, mutation_count, mutation_density])
        # 2. Mutation density in regions
        basename = 'mutation-density-regions'
        filename = self.data_out(basename + '.txt')
        print('Writing info to ' + filename)
        with open(filename, 'w') as f:
            f.write(
                'Region name, region start, region end, region size,'
                ' mutation density in region and mutation density relative'
                ' to global. Finally, density and relative density in'
                ' percentages. Relative density is calculated as'
                ' (density in segment) / total density. Except where'
                ' (density in segment) == 0, there, relative density is'
                ' set to 0')
        filename = self.data_out(basename + '.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow([
                'name',
                'idx',
                'first',
                'final',
                'count',
                'mutations',
                'density',
                'reldens',
                'pdensity',
                'preldens',
            ])
            for k, name in enumerate(region_name):
                w.writerow([
                    name,
                    k,
                    region_first[k],
                    region_final[k],
                    region_size[k],
                    region_mutations[k],
                    region_density[k],
                    region_reldens[k],
                    region_density[k] * 100,
                    region_reldens[k] * 100,
                ])
        # 3. Mutation density in region types
        basename = 'mutation-density-regtypes'
        filename = self.data_out(basename + '.txt')
        print('Writing info to ' + filename)
        with open(filename, 'w') as f:
            f.write('Regtype name, regtype size, mutation density and'
                    ' relative mutation density.')
        filename = self.data_out(basename + '.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow([
                'name',
                'idx',
                'mutations',
                'density',
                'reldens',
                'pdensity',
                'preldens',
            ])
            for k, name in enumerate(regtype_name):
                w.writerow([
                    name,
                    k,
                    regtype_mutations[k],
                    regtype_density[k],
                    regtype_reldens[k],
                    regtype_density[k] * 100,
                    regtype_reldens[k] * 100,
                ])
def gather(self, q, fname1, fname2): # Query db data = [] with base.connect() as con: c = con.cursor() for row in c.execute(q): data.append(( row['pub'], row['va'], #row['stda'], 2 * row['stda'], row['vi'], #row['stdi'], 2 * row['stdi'], row['va'] - row['vi'], )) # Write file fname1 = self.data_out(fname1) print('Writing to ' + fname1) with open(fname1, 'w') as f: csv = self.csv_writer(f) csv.writerow(['pub', 'va', '+-', 'vi', '+-', 'dv']) for row in data: csv.writerow(row) print('Collected data from ' + str(len(data)) + ' reports.') # # Correct with linear regression # print('Subtracting linear regression...') # Gather data va = [] vi = [] for row in data: va.append(row[1]) vi.append(row[3]) va = np.array(va) vi = np.array(vi) # Fit line b, a = np.polyfit(va, vi, 1) if DEBUG: import matplotlib.pyplot as pl pl.figure() pl.plot(va, vi, 'o') x = np.linspace(np.min(va) - 10, np.max(va) + 10, 1000) y = a + b * x pl.plot(x, y) # Subtract if DEBUG: pl.figure() pl.plot(va, vi - (a + b * va), 'o') pl.show() print('Coefficients: ' + str(a) + ', ' + str(b)) # Write file fname2 = self.data_out(fname2) print('Writing to ' + fname2) with open(fname2, 'w') as f: csv = self.csv_writer(f) csv.writerow(['pub', 'va', '+-', 'vic', '+-']) for k, row in enumerate(data): row = list(row[:-1]) row[3] = row[3] - (a + b * row[1]) csv.writerow(row) # Get Pearson correlation coefficient print('Pearson correlation coefficient: ' + str(np.corrcoef(va, vi)[1, 0])) print('Done')
def _run(self):
    """Write the EP data as a LaTeX longtable to epdata-table2.tex."""
    # Collect tex references (publication key -> bibtex key)
    refs = {}
    with base.connect() as con:
        c = con.cursor()
        q = 'select key, tex from publication_tex'
        for k, row in enumerate(c.execute(q)):
            refs[row['key']] = row['tex']
    # Create table file
    filename = self.data_out('epdata-table2.tex')
    fields = [
        'pub', 'old', 'idx', 'new', 'dva', 'dvi', 'zero', 'act', 'inact',
        'late', 'sequence', 'cell', 'beta1',
    ]

    # Sequence formatting
    def seq(s):
        # Map internal sequence codes to display strings
        if s == 'astar':
            return 'a*'
        elif s == 'bstar':
            return 'b*'
        elif s == 'achen':
            return 'a**'
        elif s is None:
            return ''
        return s

    def cell(c):
        # Abbreviate cell-type names for the table
        if c in ['HEK', 'CHO']:
            return c
        elif c == 'Mouse myocyte':
            return 'MM'
        elif c == 'Oocyte':
            return 'Ooc.'
        elif c is None:
            return ''
        return c

    def yesno(x):
        # Ternary outcome: 1 -> 'yes', -1 -> 'no', anything else -> ''
        return 'yes' if x == 1 else ('no' if x == -1 else '')

    # Create table
    with open(filename, 'w') as f:
        # Header
        size = 'tiny'
        f.write('\\begin{' + size + '}\n')
        f.write('\\startrowcolors\n')
        f.write('\\begin{longtable}{p{4cm}|l|llll|ll|lll}\n')
        f.write('\\caption{\\label{tab:epdata}EP Data} \\\\\n')
        f.write('\\hline\n')
        f.write('Publication')
        f.write(' & Mutation')
        f.write(' & Act. & Inact. & Late & Zero')
        f.write(' & ${\Delta}V_a$ & ${\Delta}V_i$')
        f.write(' & Cell & $\\alpha$ & $\\beta1$ \\\\\n')
        f.write('\\hline\n')
        # NOTE(review): no '\n' after \endfirsthead (unlike \endhead
        # below); TeX tolerates this but it looks unintentional.
        f.write('\\endfirsthead')
        f.write('\\hline\n')
        f.write('\\rowcolor{white}\n')
        f.write('Publication')
        f.write(' & Mutation')
        f.write(' & Act. & Inact. & Late & Zero')
        f.write(' & ${\Delta}V_a$ & ${\Delta}V_i$')
        f.write(' & Cell & $\\alpha$ & $\\beta1$ \\\\\n')
        f.write('\\hline\n')
        f.write('\\endhead\n')
        f.write('\\hline\n')
        f.write('\\endfoot\n')
        # Body
        form = '{:.3g}'
        with base.connect() as con:
            c = con.cursor()
            q = 'select ' + ', '.join(fields) + ' from epdata'
            q += ' order by idx, new'
            for k, row in enumerate(c.execute(q)):
                x = []
                x.append(row['pub'] + '\\cite{' + refs[row['pub']] + '}')
                x.append(row['old'] + str(row['idx']) + row['new'])
                x.append(yesno(row['act']))
                x.append(yesno(row['inact']))
                x.append(yesno(row['late']))
                x.append(yesno(row['zero']))
                x.append(
                    '' if row['dva'] is None else form.format(row['dva']))
                x.append(
                    '' if row['dvi'] is None else form.format(row['dvi']))
                x.append(cell(row['cell']))
                x.append(seq(row['sequence']))
                # NOTE(review): beta1 is appended as-is; join() below
                # requires it to be a string -- confirm the column type.
                x.append(row['beta1'])
                f.write(' & '.join(x) + ' \\\\\n')
        # Footer
        f.write('\\end{longtable}\n')
        f.write('\\end{' + size + '}\n')
def _run(self): with base.connect() as con: c = con.cursor() # Mutations with multiple epdata reports names = [] parts = [] counts = [] q = 'select old, idx, new, n from (' q += 'select *, count(idx) as n from epdata group by idx, new' q += ') where n > 1 order by n desc, idx, new' for row in c.execute(q): names.append(row['old'] + str(row['idx']) + row['new']) parts.append((row['old'], row['idx'], row['new'])) counts.append(row['n']) # Get data for mutations with multiple reports fields = [ 'act', 'inact', 'late', 'zero', 'sequence', 'cell', 'beta1', 'pub' ] data = {} q = 'select * from epdata' q += ' where old=? and idx=? and new=?' for k, name in enumerate(names): d = {} for field in fields: d[field] = [] for row in c.execute(q, parts[k]): for field in fields: d[field].append(row[field]) data[name] = d # Get list of mutations with inconsistencies ters = ['act', 'inact', 'late'] bins = ['zero'] issues = [] for k, name in enumerate(names): d = data[name] for t in ters: if -1 in d[t] and 1 in d[t]: issues.append(name) break else: # If didn't break for b in bins: if 0 in d[t] and 1 in d[t]: issues.append(name) break # Show output def pront(name): print(name) d = data[name] for k in range(len(d['act'])): for f in fields: print(d[f][k], end=' ') print('') for name in names: if name not in issues: pront(name) print() print('=' * 40) print('Mutations with doubles: ' + str(len(counts))) print('Mutations with issues: ' + str(len(issues))) print('=' * 40) print() for name in issues: pront(name)
def _run(self):
    """Compute global mutation density and densities per domain, region
    type, and region, writing one CSV file for each level.

    Relative densities are the per-group density minus the global
    density.
    """
    with base.connect() as con:
        c = con.cursor()
        #
        # Global mutation density
        #
        # Get total number of positions
        q = 'select count(idx) from scn5a'
        total_length = c.execute(q).fetchone()[0]
        # Get total number of positions with a (non exac) mutation
        q = 'select count(idx) from mutation_no_exac'
        total_count = c.execute(q).fetchone()[0]
        # Global mutation density
        global_density = total_count / total_length
        #
        # Mutation density per domain
        #
        # Domain names
        domain_names = [
            r['name'] for r in c.execute('select name from domain')]
        # Domain lengths
        # NOTE(review): queries are built by string concatenation; names
        # come from this database, not user input, so injection risk is
        # low, but parameterized queries would be safer.
        domain_lengths = []
        for name in domain_names:
            q = 'select sum(length) from region'
            q += ' where domain = "' + name + '"'
            domain_lengths.append(c.execute(q).fetchone()[0])
        # Number of mutations per domain
        domain_count = []
        for name in domain_names:
            q = 'select count(idx)'
            q += ' from mutation_no_exac_annotated'
            q += ' where domain = "' + name + '"'
            domain_count.append(c.execute(q).fetchone()[0])
        # Mutation density per domain
        domain_density = np.array(domain_count) / np.array(domain_lengths)
        # Relative mutation density per domain
        domain_reldens = domain_density - global_density
        # Write file
        filename = self.data_out('density-1-per-domain.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow([
                'xaxis',
                'xlabel',
                'density',
                'relative-density',
            ])
            for k, name in enumerate(domain_names):
                w.writerow([
                    k + 1,
                    name,
                    domain_density[k],
                    domain_reldens[k],
                ])
        #
        # Mutation density per region type
        #
        # Region type names
        regtype_names = [
            r['name'] for r in c.execute('select name from regtype')]
        # Regtype lengths
        regtype_lengths = []
        for name in regtype_names:
            q = 'select sum(length) from region'
            q += ' where regtype = "' + name + '"'
            regtype_lengths.append(c.execute(q).fetchone()[0])
        # Number of mutations per regtype
        regtype_count = []
        for name in regtype_names:
            q = 'select count(idx)'
            q += ' from mutation_no_exac_annotated'
            q += ' where regtype = "' + name + '"'
            regtype_count.append(c.execute(q).fetchone()[0])
        # Mutation density per regtype
        regtype_density = np.array(regtype_count)/np.array(regtype_lengths)
        # Relative mutation density per regtype
        regtype_reldens = regtype_density - global_density
        # Write file
        filename = self.data_out('density-2-per-regtype.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow([
                'xaxis',
                'xlabel',
                'density',
                'relative-density',
            ])
            for k, name in enumerate(regtype_names):
                w.writerow([
                    k + 1,
                    name,
                    regtype_density[k],
                    regtype_reldens[k],
                ])
        #
        # Mutation density per region
        #
        # Region names
        q = 'select name from region order by start'
        region_names = [r['name'] for r in c.execute(q)]
        # Region lengths
        region_lengths = []
        for name in region_names:
            q = 'select length from region'
            q += ' where name = "' + name + '"'
            region_lengths.append(c.execute(q).fetchone()[0])
        # Number of mutations per region
        region_count = []
        for name in region_names:
            q = 'select count(idx)'
            q += ' from mutation_no_exac_annotated'
            q += ' where region = "' + name + '"'
            region_count.append(c.execute(q).fetchone()[0])
        # Mutation density per region
        region_density = np.array(region_count) / np.array(region_lengths)
        # Relative mutation density per region
        region_reldens = region_density - global_density
        # Write file
        filename = self.data_out('density-3-per-region.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow([
                'xaxis',
                'xlabel',
                'density',
                'relative-density',
            ])
            for k, name in enumerate(region_names):
                w.writerow([
                    k + 1,
                    name,
                    region_density[k],
                    region_reldens[k],
                ])
def _run(self):
    """Relate changes in amino acid properties to EP outcomes.

    For each EP outcome group (zero, act, inact, late, plus derived
    'unchanged' and 'changed' groups) gathers the property deltas
    (new acid value minus old acid value), writes box-plot CSV files,
    and runs ANOVA and Kruskal-Wallis tests per property.
    """
    with base.connect() as con:
        c = con.cursor()
        # Get acid properties
        properties = [
            'average_residue_mass',
            'percent_buried_residues',
            'v_waals',
            'polarity_ranking',
            'charge',
            'hydrophobicity',
            'helix_propensity',
        ]
        # pvalues maps acid key -> {property: value}
        pvalues = {}
        for row in c.execute('select * from acid'):
            vs = {}
            for p in properties:
                vs[p] = row[p]
            pvalues[row['key']] = vs
        # Gather delta-properties, per outcome
        # Note that epdata_outcomes contains each mutation only once
        outcomes = ['zero', 'act', 'inact', 'late']
        ovalues = {}
        for o in outcomes:
            ovalues[o] = {}
            for p in properties:
                ovalues[o][p] = []
        for row in c.execute('select * from epdata_outcomes'):
            old, new = row['old'], row['new']
            for o in outcomes:
                if row[o] > 0:
                    for p in properties:
                        delta = pvalues[new][p] - pvalues[old][p]
                        ovalues[o][p].append(delta)
        # Add unchanged outcome
        # NOTE(review): these two groups query `epdata` (all reports),
        # not `epdata_outcomes` (unique mutations) -- presumably
        # intentional; confirm.
        outcomes += ['unchanged']
        ovalues['unchanged'] = {}
        for p in properties:
            ovalues['unchanged'][p] = []
        q = 'select * from epdata where'
        q += ' (act < 1 and inact < 1 and zero < 1 and late < 1)'
        for row in c.execute(q):
            old, new = row['old'], row['new']
            for p in properties:
                delta = pvalues[new][p] - pvalues[old][p]
                ovalues['unchanged'][p].append(delta)
        # Add changed outcome
        outcomes += ['changed']
        ovalues['changed'] = {}
        for p in properties:
            ovalues['changed'][p] = []
        q = 'select * from epdata where'
        q += ' (act > 0 or inact > 0 or zero > 0 or late > 0)'
        for row in c.execute(q):
            old, new = row['old'], row['new']
            for p in properties:
                delta = pvalues[new][p] - pvalues[old][p]
                ovalues['changed'][p].append(delta)
        # Store
        basename = 'deltas-'
        w = 0.3  # Half-width of the x jitter band around each outcome
        for k, o in enumerate(outcomes):
            filename = self.data_out(
                basename + str(1 + k) + '-' + o + '.csv')
            print('Writing ' + filename)
            with open(filename, 'w') as f:
                # NOTE: `c` is rebound here from the db cursor to a csv
                # writer; all queries have completed by this point.
                c = self.csv_writer(f)
                c.writerow(['x'] + properties)
                n = len(ovalues[o][properties[0]])
                iters = [iter(ovalues[o][p]) for p in properties]
                for x in np.linspace(1 + k - w, 1 + k + w, n):
                    c.writerow([x] + [next(i) for i in iters])
        filename = self.data_out('deltas-labels.csv')
        print('Writing ' + filename)
        with open(filename, 'w') as f:
            f.write('x, label\n')
            for k, o in enumerate(outcomes):
                f.write(str(k + 1) + ',' + o + '\n')
        #
        # Now run a test, for each property (so independent tests!).
        #
        # Data isn't normal, so we use a Kruskal Wallis test.
        #
        # In each test, test if we can reject the null hypothesis that the
        # results in each outcome group are from the same distribution.
        #
        # This gives us a pvalue. If the pvalue is low, we can say the
        # groups are different.
        #
        print('-' * 40)
        print('Comparing 5 outcomes (not including `changed`)')
        print('-' * 40)
        for p in properties:
            print(p)
            # Gather samples for each outcome, not including `changed`
            groups = []
            for o in outcomes[:-1]:
                groups.append(np.array(ovalues[o][p]))
            # Perform test
            statistic, pvalue = sp.stats.f_oneway(*groups)
            print('Anova: ' + str(statistic) + ', ' + str(pvalue))
            statistic, pvalue = sp.stats.kruskal(*groups)
            print('Kruskal-Wallis: ' + str(statistic) + ', ' + str(pvalue))
        #
        # But... samples can be in multiple changed groups, so maybe
        # compare them individually with `unchanged` as well
        #
        print('-' * 40)
        print('Comparing 5 outcomes with `unchanged`')
        print('-' * 40)
        for p in properties:
            print(p)
            # Gather samples for each outcome
            groups = []
            for o in outcomes:
                groups.append(np.array(ovalues[o][p]))
            # Perform tests
            # groups[4] is the `unchanged` group; note this also prints
            # an 'Unchanged vs unchanged' self-comparison at i == 4.
            for i, o in enumerate(outcomes):
                print('Unchanged vs ' + o)
                statistic, pvalue = sp.stats.kruskal(groups[4], groups[i])
                print('Kruskal-Wallis: ' + str(statistic) + ', '
                      + str(pvalue))
            print('- ' * 20)
def _run(self):
    """Relate Gonnet substitution scores to measured voltage shifts and
    write raw and binned CSV output files."""
    with base.connect() as con:
        c = con.cursor()
        # Load gonnet scores (symmetric: stored under both key orders)
        print('Loading gonnet scores')
        scores = {}
        for row in c.execute('select * from gonnet_score'):
            scores[row['key1'] + row['key2']] = row['score']
            scores[row['key2'] + row['key1']] = row['score']
        # Load voltage shifts
        print('Loading voltage shift data')
        q = 'select * from epdata_filtered'
        q += ' where dva is not null and dvi is not null'
        mutations = []
        for row in c.execute(q):
            mutations.append([
                int(scores[row['old'] + row['new']]),
                float(row['dva']),
                float(row['dvi']),
            ])
        # Create file relating the two
        filename = self.data_out('voltage-shift-gonnet-scores.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow([
                'score',
                'dva',
                'dvi',
            ])
            for mutation in mutations:
                w.writerow(mutation)
        #
        # Divide the score axis into bins, for each bin, calculate
        # (1/bin_size) * sum( dva )
        # and
        # (1/bin_size) * sum( abs(dva) )
        # then repeat with dvi
        #
        print('Calculating binned sum of squares')
        # Create bins: centers -5.5 .. 5.5, width 1
        lo = -5.5
        hi = 5.5
        bw = 1
        centers = np.arange(lo, hi + bw, bw)
        lower = centers - bw * 0.5
        upper = centers + bw * 0.5
        # Gather shifts in bins
        dva = [[] for x in centers]
        dvi = [[] for x in centers]
        for mutation in mutations:
            score, da, di = mutation
            # NOTE(review): raises IndexError if a score falls outside
            # [-6, 6) -- assumed not to occur in this data set; confirm.
            i = np.where((score >= lower) * (score < upper))[0][0]
            dva[i].append(da)
            dvi[i].append(di)
        # Calculate stats and write to file
        # Empty bins are replaced by [0] so mean/std are defined
        dva = [np.array(x) if x else np.array([0]) for x in dva]
        dvi = [np.array(x) if x else np.array([0]) for x in dvi]
        print([np.mean(x) for x in dva])
        print([np.mean(x) for x in dvi])
        filename = self.data_out('voltage-shift-gonnet-scores-binned.csv')
        print('Writing data to ' + filename)
        with open(filename, 'w') as f:
            w = self.csv_writer(f)
            w.writerow([
                'center',
                'dva-mean',
                'dvi-mean',
                'dva-std',
                'dvi-std',
                'dva-abs-mean',
                'dvi-abs-mean',
            ])
            data = [
                centers,
                [np.mean(x) for x in dva],
                [np.mean(x) for x in dvi],
                [np.std(x) for x in dva],
                [np.std(x) for x in dvi],
                [np.mean(np.abs(x)) for x in dva],
                [np.mean(np.abs(x)) for x in dvi],
            ]
            # One row per bin: transpose the column-wise stats
            data = np.array([np.array(x) for x in data]).transpose()
            for row in data:
                w.writerow(row)
        print('Done')
def _run(self): # Collect tex references refs = {} with base.connect() as con: c = con.cursor() q = 'select key, tex from publication_tex' for k, row in enumerate(c.execute(q)): refs[row['key']] = row['tex'] # Create table file filename = self.data_out('midpoint-table.tex') fields = [ 'pub', 'va', 'na', 'stda', 'vi', 'ni', 'stdi', 'sequence', 'cell', 'beta1', ] # Sequence formatting def seq(s): if s == 'astar': return 'a*' elif s == 'bstar': return 'b*' elif s is None: return '?' return s # Create table with open(filename, 'w') as f: # Header size = 'footnotesize' f.write('\\begin{' + size + '}\n') f.write('\\startrowcolors\n') f.write('\\begin{longtable}{p{5cm}|lll|lll|lll}\n') f.write('\\caption{\\label{midpoints}Midpoints} \\\\\n') f.write('\\hline\n') f.write('Publication') f.write(' & $V_a$ & $\sigma_a$ & $n_a$') f.write(' & $V_i$ & $\sigma_i$ & $n_i$') f.write(' & Cell & $\\alpha$ & $\\beta1$ \\\\\n') f.write('\\hline\n') f.write('\\endfirsthead') f.write('\\hline\n') f.write('\\rowcolor{white}\n') f.write('Publication') f.write(' & $V_a$ & $\sigma_a$ & $n_a$') f.write(' & $V_i$ & $\sigma_i$ & $n_i$') f.write(' & Cell & $\\alpha$ & $\\beta1$ \\\\\n') f.write('\\hline\n') f.write('\\endhead\n') f.write('\\hline\n') f.write('\\endfoot\n') # Body form = '{:.3g}' with base.connect() as con: c = con.cursor() q = 'select ' + ', '.join(fields) + ' from midpoints_wt' for k, row in enumerate(c.execute(q)): x = [] x.append('\\citet{' + refs[row['pub']] + '}') if row['na'] != 0: x.append(form.format(row['va'])) x.append(form.format(row['stda'])) x.append(form.format(row['na'])) else: x.append('&&') if row['ni'] != 0: x.append(form.format(row['vi'])) x.append(form.format(row['stdi'])) x.append(form.format(row['ni'])) else: x.append('&&') x.append(row['cell'].replace('Oocyte', 'Ooc.')) x.append(seq(row['sequence'])) x.append(row['beta1']) f.write(' & '.join(x) + ' \\\\\n') # Footer f.write('\\end{longtable}\n') f.write('\\end{' + size + '}\n')