def get_variants_mc3(syn):
    """Reads mutation data from the MC3 synapse file.

    Args:
        syn (Synapse): A logged-in synapseclient instance.

    Returns:
        muts (pandas DataFrame): One row per record read from the file, with
            columns 'Gene', 'Form', 'Sample', 'Protein', 'Transcript',
            'Exon', 'depth', 'ref_count', 'alt_count', 'SIFT' and
            'PolyPhen'. 'Sample' is cut down to its first four
            dash-separated fields, 'PolyPhen' holds the parsed score (0 where
            the file has '.'), and 'SIFT' holds one minus the parsed score
            (0 where the file has '.').

    Raises:
        OSError: If the file still cannot be read after ten attempts.

    Examples:
        >>> import synapseclient
        >>> syn = synapseclient.Synapse()
        >>> syn.login()
        >>> muts = get_variants_mc3(syn)

    """
    mc3 = syn.get('syn7824274')

    # defines which mutation annotation MAF columns to use
    use_cols = [0, 8, 15, 36, 37, 38, 39, 40, 41, 71, 72]
    use_names = ['Gene', 'Form', 'Sample', 'Protein', 'Transcript', 'Exon',
                 'depth', 'ref_count', 'alt_count', 'SIFT', 'PolyPhen']

    # imports mutation data into a DataFrame; the read is retried on OSError
    # and the last error is re-raised once the attempts are used up, rather
    # than carrying on with `muts` never having been assigned
    max_tries = 10
    for attempt in range(max_tries):
        try:
            muts = pd.read_csv(mc3.path, engine='python', usecols=use_cols,
                               sep='\t', header=None, names=use_names,
                               comment='#', skiprows=1)
            break
        except OSError:
            if attempt == max_tries - 1:
                raise

    # annotation values are either '.' or carry the score in trailing
    # parentheses; everything up to the last '(' and the closing ')' is
    # dropped and the remainder is converted with the builtin float
    # (np.float was only an alias for it and no longer exists in NumPy)
    for annt, null_val in zip(['PolyPhen', 'SIFT'], [0, 1]):
        muts[annt] = muts[annt].apply(
            lambda val: (float(gsub(r'\)$', '', gsub(r'^.*\(', '', val)))
                         if val != '.' else null_val)
        )

    # parses sample barcodes down to their first four fields
    muts.Sample = muts.Sample.apply(lambda smp: '-'.join(smp.split('-')[:4]))

    # SIFT is stored inverted, so its null value of 1 becomes 0
    muts.SIFT = 1 - muts.SIFT

    return muts
def get_mtree_newick(mtree):
    """Build the Newick tree format representation of a MuTree.

    Branches are visited in order of their labels. A branch that is itself a
    MuTree contributes its own representation wrapped in parentheses (with
    its trailing comma removed) ahead of the branch label; the label '.' is
    written as '*none*'. At depth zero the final trailing comma is replaced
    by the terminating ';'.
    """
    pieces = []

    for label, branch in sorted(mtree, key=lambda pair: pair[0]):
        if isinstance(branch, MuTree):
            subtree = gsub(',$', '', get_mtree_newick(branch))
            pieces.append('(' + subtree + ')')

        if label == ".":
            pieces.append('{*none*},')
        else:
            pieces.append('{' + label + '},')

    newick = ''.join(pieces)

    # only the top of the tree closes the string
    if mtree.depth == 0:
        newick = gsub(',$', '', newick) + ';'

    return newick
def get_variants_mc3(syn):
    """Reads mutation data from the MC3 synapse file.

    Args:
        syn (Synapse): A logged-in synapseclient instance.

    Returns:
        muts (pandas DataFrame): One row per record read from the file, with
            columns 'Gene', 'Form', 'Sample', 'Protein', 'Transcript',
            'Exon' and 'PolyPhen'. 'Sample' is cut down to its first four
            dash-separated fields. 'PolyPhen' holds the text found inside the
            trailing parentheses of the annotation (still a string), or the
            integer 0 where the file has '.'.

    Examples:
        >>> import synapseclient
        >>> syn = synapseclient.Synapse()
        >>> syn.login()
        >>> muts = get_variants_mc3(syn)

    """
    mc3 = syn.get('syn7824274')

    # defines which mutation annotation MAF columns to use
    use_cols = [0, 8, 15, 36, 37, 38, 72]
    use_names = [
        'Gene', 'Form', 'Sample', 'Protein', 'Transcript', 'Exon', 'PolyPhen'
    ]

    # imports mutation data into a DataFrame
    muts = pd.read_csv(mc3.path, usecols=use_cols, sep='\t', header=None,
                       names=use_names, comment='#', skiprows=1)

    # parses sample barcodes down to their first four fields; str.join gives
    # the same string as folding with reduce, without needing functools
    muts['Sample'] = [
        '-'.join(s.split('-', 4)[:4]) for s in muts['Sample']
    ]

    # strips everything up to the last '(' and the closing ')'; raw strings
    # keep the regex escapes from being read as (invalid) string escapes
    muts['PolyPhen'] = [
        gsub(r'\)$', '', gsub(r'^.*\(', '', x)) if x != '.' else 0
        for x in muts['PolyPhen']
    ]

    return muts
def eval_node(self, variables, fallback):
    """Evaluate this node's full value as a Python expression.

    Every backslash is removed from ``self.full_value()`` and the remaining
    text is passed to ``eval``.

    Args:
        variables: Not used directly by this function. Because ``eval`` is
            called without explicit namespaces, the evaluated expression can
            still see this function's local names, ``variables`` included.
        fallback: Value returned when the text is not valid Python syntax.

    Returns:
        The result of evaluating the expression, or ``fallback`` if
        evaluation raises ``SyntaxError``. Any other exception raised by the
        expression propagates to the caller.
    """
    # Python's re module has no gsub(), and a lone backslash is not a valid
    # regex pattern either; a literal replacement is all that is needed here.
    expression = self.full_value().replace("\\", "")

    try:
        # NOTE(review): eval executes arbitrary code. This is only safe if
        # the node text comes from a trusted source -- confirm against callers.
        return eval(expression)
    except SyntaxError:
        return fallback
def get_newick(self):
    """Build the Newick tree format representation of this MuTree.

    Branches are visited in the order given by ``sort_iter``. A branch that
    is itself a MuTree contributes its own representation wrapped in
    parentheses (with its trailing comma removed) ahead of the branch label;
    the label "0" is written as '*none*'. At depth zero the final trailing
    comma is replaced by the terminating ';'.
    """
    pieces = []

    for label, branch in self.sort_iter():
        if isinstance(branch, MuTree):
            subtree = gsub(',$', '', branch.get_newick())
            pieces.append('(' + subtree + ')')

        if label == "0":
            pieces.append('{*none*},')
        else:
            pieces.append('{' + label + '},')

    newick = ''.join(pieces)

    # only the top of the tree closes the string
    if self.depth == 0:
        newick = gsub(',$', '', newick) + ';'

    return newick
def sort_iter(self):
    """Iterate over the branches of the tree, in numeric order where the
    mutation level allows it.

    For the 'Exon' and 'Location' levels the (label, branch) pairs of
    ``self._child`` are sorted by an integer derived from the label: the part
    before the first '/' with every non-digit character replaced by '0'. The
    label '.' is reported as "0". Any other level falls back to the tree's
    own iterator.
    """
    if self.mut_level not in ['Exon', 'Location']:
        return self.__iter__()

    def numeric_key(entry):
        # only the text ahead of the first '/' takes part in the ordering
        leading = entry[0].split('/')[0]
        return int(gsub('[^0-9]', '0', leading))

    relabelled = [
        ("0", branch) if lbl == '.' else (lbl, branch)
        for lbl, branch in self._child.items()
    ]

    return iter(sorted(relabelled, key=numeric_key))
def get_gencode():
    """Gets annotation data for protein-coding genes on non-sex chromosomes
    from a Gencode file.

    Returns
    -------
    gn_annot : dict
        Dictionary keyed by gene ID (the 'gene_id' attribute with its
        '.<digits>' version suffix and quotes removed). Each value is a dict
        holding 'chr', 'Start' and 'End' from the record's columns plus
        every two-token attribute parsed from its info field. Quotes are
        stripped from 'gene_name'; other attribute values are kept as they
        appear in the file.

    """
    annot = pd.read_csv(DATA_PATH + "gencode.v22.annotation.gtf.gz",
                        usecols=[0, 2, 3, 4, 8],
                        names=['Chr', 'Type', 'Start', 'End', 'Info'],
                        sep='\t', header=None, comment='#')

    # filter out annotation records that aren't
    # genes on non-sex chromosomes
    chroms_use = ['chr' + str(i + 1) for i in range(22)]
    annot = annot.loc[annot['Type'] == 'gene', ]
    annot = annot.loc[annot['Chr'].isin(chroms_use), ]

    # parse the info field to get each gene's annotation data
    gn_annot = {}
    for chrom, _, start, end, info in annot.values:
        fields = {'chr': chrom, 'Start': start, 'End': end}

        # attributes are '; '-separated 'key value' pairs; anything that
        # does not split into exactly two tokens is ignored
        fields.update(
            pair for pair in (attr.split(' ') for attr in info.split('; '))
            if len(pair) == 2
        )

        # attribute values keep their surrounding quotes at this point
        if fields['gene_type'] != '"protein_coding"':
            continue

        fields['gene_name'] = fields['gene_name'].replace('"', '')
        gene_id = gsub(r'\.[0-9]+', '', fields['gene_id']).replace('"', '')
        gn_annot[gene_id] = fields

    return gn_annot
def __str__(self):
    """Printing a MuTree shows each of the branches of the tree and
    the samples at the end of each branch.

    Each branch is written as '<mut_level> IS <label>'. A branch that is
    itself a MuTree is printed recursively on a new line, indented one tab
    per level of depth. Otherwise the branch's contents are listed
    comma-separated, or summarised as a count when there are more than
    eight of them.
    """
    new_str = self.mut_level

    for nm, mut in self:
        new_str += ' IS {}'.format(nm)

        if isinstance(mut, MuTree):
            new_str += (' AND ' + '\n'
                        + '\t' * (self.depth + 1) + str(mut))

        # if we have reached a leaf node, print the samples
        elif len(mut) > 8:
            new_str += ': ({} samples)'.format(len(mut))

        else:
            # join formats each item exactly as the reduce-based fold did,
            # but also copes with an empty leaf and needs no functools
            new_str += ': {}'.format(
                ','.join('{}'.format(smp) for smp in mut))

        new_str += ('\n' + '\t' * self.depth)

    # drop the newline left behind by the last branch
    new_str = gsub('\n$', '', new_str)

    return new_str
def no_attrib_error(module, text):
    """Report a missing attribute and return an HTML error snippet.

    Only the first 20 characters of ``text`` are used, with any line breaks
    ('\\r\\n' or '\\n') removed. A one-line diagnostic is printed and the same
    information is returned wrapped in a ``<div class="general-error">``.

    Args:
        module: Name inserted into the messages.
        text: Text whose leading excerpt is quoted in the messages.

    Returns:
        str: The HTML snippet describing the error.
    """
    output = """<div class="%s">%s</div>"""

    # re.sub is the substitution function; Python's re has no gsub()
    text = re.sub(r"\r\n|\n", "", text[0:20])

    # a single parenthesised argument prints the same on Python 2 and 3
    print("* Error: '%s' with text: %s...\n" % (module, text[0:20]))

    # NOTE(review): module and text are inserted into the markup unescaped;
    # confirm callers never pass untrusted input here.
    return output % ("general-error", "Error! No attribue is found for '%s' with text => %s ..." % (module, text[0:20]))
def plot_performance(clf_set='base', mtype_set='default'):
    """Plots boxplots of classifier performance for a set of mutations.

    Loads the 'baseline' output for the given classifier and mutation type
    sets, draws one boxplot panel per entry of ``mtype_list[mtype_set]`` on a
    2x3 grid (one box per classifier, in the order of ``clf_list[clf_set]``),
    and saves the figure under ``base_dir + '/plots/'``.

    Args:
        clf_set (str): Key into ``clf_list`` choosing the classifiers.
        mtype_set (str): Key into ``mtype_list`` choosing the mutation types.

    Returns:
        None. The plot is written to a PNG file.
    """
    out_data = load_output('baseline', clf_set, mtype_set)
    alg_order = [clf.__name__ for clf in clf_list[clf_set]]

    # gets AUC data, sets up plot and subplots
    auc_data = [x['AUC'] for x in out_data]
    auc_min = min([min(x.values()) for x in auc_data]) * 0.9
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(7, 11))

    for i, gene in enumerate(mtype_list[mtype_set]):
        # panels are filled row by row across the three columns
        ax = axes[i // 3, i % 3]

        # cast performance data into matrix format
        perf_data = pd.DataFrame(
            [{k[0].split('_')[0]: v for k, v in x.items() if k[1] == gene}
             for x in auc_data])

        # reorder columns to follow alg_order; the indices are integer
        # positions, so .iloc is the replacement for the removed .ix
        col_names = list(perf_data.columns)
        alg_indx = [col_names.index(x) for x in alg_order]
        perf_data = perf_data.iloc[:, alg_indx]

        # create and plot the subplot titles describing mutation types
        gene_lbl = '{}-{}'.format(gene[0],
                                  gsub('(-|, )', '\n', str(gene[1])))
        ax.set_title(gene_lbl, fontsize=13)

        # plot the boxes showing performances
        ax.boxplot(x=np.array(perf_data),
                   boxprops={'linewidth': 1.5},
                   medianprops={'linewidth': 3, 'color': '#960c20'},
                   flierprops={'markersize': 2})

        # label x-axis ticks with algorithm names if we are on bottom row
        if (i // 3) == 1:
            ax.set_xticklabels(perf_data.columns,
                               fontsize=12, rotation=45, ha='right')
        else:
            ax.set_xticklabels(np.repeat('', len(alg_indx)))

        # add y-axis title if we are on left-most column
        if (i % 3) == 0:
            ax.set_ylabel('AUC', fontsize=19)
        else:
            ax.set_yticklabels([])

        # add dotted line at AUC=0.5, set AUC axis limits
        ax.plot(list(range(len(alg_indx) + 2)),
                np.repeat(0.5, len(alg_indx) + 2),
                c="black", lw=0.8, ls='--', alpha=0.8)
        ax.set_ylim(auc_min, 1.0)

    # tweak subplot spacing and save plot to file
    plt.tight_layout(w_pad=-1.2, h_pad=1.5)
    plt.savefig(base_dir + '/plots/'
                + get_set_plotlbl(clf_set) + '_' + get_set_plotlbl(mtype_set)
                + '__performance.png',
                dpi=700)
def get_variant_data(cohort, var_source, **var_args):
    """Load variant calls from one of the supported sources.

    Args:
        cohort (str): Cohort name; used to locate the archive when
            ``var_source`` is 'Firehose'.
        var_source (str): One of 'mc3', 'Firehose' or 'BMEG'.
        **var_args: Source-specific options. For 'mc3': ``syn``, a Synapse
            client whose ``get`` is called to fetch the data file, and
            optionally ``mut_fields``, an iterable of field names to load in
            addition to 'Sample' and 'Filter' (all fields are loaded when it
            is missing or None).

    Returns:
        pandas DataFrame: The variant data for the requested source.

    Raises:
        ValueError: If ``var_source`` is not recognized.
        OSError: If the MC3 file still cannot be read after ten attempts.
    """
    if var_source == 'mc3':
        mc3 = var_args['syn'].get('syn7824274')

        # (field name, MAF column index) pairs available for loading
        field_dict = (('Gene', 0), ('Chr', 4), ('Start', 5), ('End', 6),
                      ('Strand', 7), ('Form', 8), ('RefAllele', 10),
                      ('TumorAllele', 12), ('Sample', 15), ('HGVS', 34),
                      ('Protein', 36), ('Transcript', 37), ('Exon', 38),
                      ('depth', 39), ('ref_count', 40), ('alt_count', 41),
                      ('SIFT', 71), ('PolyPhen', 72), ('Filter', 108))

        if 'mut_fields' not in var_args or var_args['mut_fields'] is None:
            use_fields, use_cols = tuple(zip(*field_dict))

        else:
            # 'Sample' and 'Filter' are always loaded since both are used
            # further down regardless of what was requested
            keep_fields = {'Sample', 'Filter'} | set(var_args['mut_fields'])
            use_fields, use_cols = tuple(
                zip(*[(name, col) for name, col in field_dict
                      if name in keep_fields]))

        # imports mutation data into a DataFrame; the read is retried on
        # OSError and the last error is re-raised once the attempts are used
        # up, rather than carrying on with `var_data` never assigned
        # TODO: handle I/O errors on the cohort/experiment level?
        max_tries = 10
        for attempt in range(max_tries):
            try:
                var_data = pd.read_csv(mc3.path, engine='c', dtype='object',
                                       sep='\t', header=None,
                                       usecols=use_cols, names=use_fields,
                                       comment='#', skiprows=1)
                break

            except OSError:
                if attempt == max_tries - 1:
                    raise

        # TODO: more fine-grained Filtering control?
        var_data = var_data.loc[
            ~var_data.Filter.str.contains('nonpreferredpair')]

        # annotation values are either '.' or carry the score in trailing
        # parentheses; the builtin float replaces the removed np.float alias
        for annt, null_val in zip(['PolyPhen', 'SIFT'], [0, 1]):
            if annt in var_data:
                var_data[annt] = var_data[annt].apply(
                    lambda val: (
                        float(gsub(r'\)$', '', gsub(r'^.*\(', '', val)))
                        if val != '.' else null_val
                    ))

                # SIFT is stored inverted, so its null value of 1 becomes 0
                if annt == 'SIFT':
                    var_data[annt] = 1 - var_data[annt]

        # parses sample barcodes down to their first four fields
        var_data.Sample = var_data.Sample.apply(
            lambda smp: '-'.join(smp.split('-')[:4]))

    elif var_source == 'Firehose':
        tar_path = glob.glob(
            os.path.join(
                data_dir, "stddata__2016_01_28", cohort, "20160128",
                "*Mutation_Packager_Oncotated_Calls.Level_3*tar.gz"))[0]

        mut_list = []

        # the context manager closes the archive even if reading fails
        with tarfile.open(tar_path) as mut_tar:
            for mut_fl in mut_tar.getmembers():

                # members that cannot be read as a mutation table are
                # skipped on purpose (best effort); catching Exception
                # instead of everything lets Ctrl-C still interrupt
                try:
                    mut_tbl = pd.read_csv(
                        BytesIO(mut_tar.extractfile(mut_fl).read()),
                        sep='\t', skiprows=4, usecols=[0, 8, 15, 37, 41],
                        names=['Gene', 'Form', 'Sample', 'Exon', 'Protein'])

                except Exception:
                    print("Skipping mutations for {}".format(mut_fl))

                else:
                    mut_list.append(mut_tbl)

        # bound to var_data so this branch reaches the shared return below
        var_data = pd.concat(mut_list)
        var_data.Sample = var_data.Sample.apply(
            lambda smp: "-".join(smp.split("-")[:4]))

    elif var_source == 'BMEG':
        # NOTE(review): sample_list, gene_list and mut_fields are not defined
        # in this function; presumably they are module-level names or were
        # meant to come from var_args -- confirm before relying on this
        # branch.
        oph = Ophion("http://bmeg.io")
        mut_list = {samp: {} for samp in sample_list}
        gene_lbls = ["gene:" + gn for gn in gene_list]

        print(oph.query().has(
            "gid", "biosample:" + sample_list[0]).incoming(
                "variantInBiosample").outEdge(
                    "variantInGene").mark("variant").inVertex().has(
                        "gid", oph.within(gene_lbls)).count().execute())

        for samp in sample_list:
            for i in oph.query().has("gid", "biosample:" + samp)\
                    .incoming("variantInBiosample")\
                    .outEdge("variantInGene").mark("variant")\
                    .inVertex().has("gid", oph.within(gene_lbls))\
                    .mark("gene").select(["gene", "variant"]).execute():
                dt = json.loads(i)
                gene_name = dt["gene"]["properties"]["symbol"]
                mut_list[samp][gene_name] = {
                    k: v for k, v in dt["variant"]["properties"].items()
                    if k in mut_fields
                }

        # bound to var_data so this branch reaches the shared return below
        var_data = pd.DataFrame(mut_list)

    else:
        raise ValueError("Unrecognized source of variant data!")

    return var_data
def subn_filter(s, find, replace, count=0):
    """A non-optimal implementation of a regex filter.

    Args:
        s: The string to run the substitution on.
        find: Regular expression pattern to search for.
        replace: Replacement text (or callable) handed to ``re.sub``.
        count: Maximum number of occurrences to replace; 0 replaces all.

    Returns:
        str: ``s`` with matches of ``find`` replaced by ``replace``.
    """
    # re.sub takes (pattern, repl, string, count); Python's re has no gsub()
    return re.sub(find, replace, s, count=count)