def main_cosmic(options):
    """Main function used to process COSMIC data."""
    # get configs
    in_opts = _utils.get_input_config('classifier')
    out_opts = _utils.get_output_config('features')
    count_opts = _utils.get_output_config('feature_matrix')
    # result_opts = _utils.get_input_config('result')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    conn = sqlite3.connect(db_cfg['db'])
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    mut_df = psql.frame_query(sql, con=conn)
    conn.close()

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # save features to text file
    cols = all_features.columns.tolist()
    new_order = ['gene'] + cols[:cols.index('gene')] + cols[cols.index('gene')+1:]
    all_features = all_features[new_order]  # make the gene name the first column
    out_path = _utils.save_dir + in_opts['gene_features'] if not options['output'] else options['output']
    all_features.to_csv(out_path, sep='\t', index=False)
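# --- illustrative sketch, not part of the module above ---
# A minimal sketch of two steps from main_cosmic that are easy to trip over
# when reusing the code: pulling the mutations table into a DataFrame (newer
# pandas releases replaced psql.frame_query with pandas.read_sql_query) and
# moving the 'gene' column to the front. The database path in the usage
# comment is hypothetical.
import sqlite3
import pandas as pd

def load_mutations(db_path):
    """Read the mutations table into a pandas DataFrame."""
    conn = sqlite3.connect(db_path)
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    mut_df = pd.read_sql_query(sql, con=conn)
    conn.close()
    return mut_df

def gene_column_first(df):
    """Return df with the 'gene' column moved to the first position."""
    cols = df.columns.tolist()
    return df[['gene'] + cols[:cols.index('gene')] + cols[cols.index('gene') + 1:]]

# usage (hypothetical path): mut_df = load_mutations('data/2020plus.db')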
def main(db_path):
    """Merge MAF and COSMIC mutations into a single mutation table."""
    db_opts = _utils.get_db_config('2020plus')
    out_db = db_opts['db']
    out_db = db_path if db_path else out_db

    # drop table if exists
    # _utils.drop_table('mutation', out_db, kind='sqlite')

    cols_of_interest = ['Gene', 'Tumor_Sample', 'Tumor_Type',
                        'Chromosome', 'Start_Position', 'End_Position',
                        'Variant_Classification', 'Reference_Allele',
                        'Tumor_Allele', 'Protein_Change', 'DNA_Change']
    data_type = ['TEXT', 'TEXT', 'TEXT', 'TEXT', 'INT', 'INT',
                 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT']
    _utils.create_empty_table('mutation', out_db, cols_of_interest, data_type)

    conn = sqlite3.connect(out_db)  # open connection
    cur = conn.cursor()
    #col_info_list = [' '.join(x) for x in zip(cols_of_interest, data_type)]
    #col_info_str = ', '.join(col_info_list)
    #sql = "CREATE TABLE mutation({0});".format(col_info_str)
    #cur.execute(sql)
    #conn.commit()

    # copy MAF-derived mutations into the consolidated mutation table
    maf_mutation_cols = ['Gene_Symbol'] + cols_of_interest[1:-1]
    sql = ("INSERT INTO mutation ({0}) "
           "  SELECT {1}"
           "  FROM maf_mutation".format(', '.join(cols_of_interest[:-1]),
                                        ', '.join(maf_mutation_cols)))
    cur.execute(sql)
    conn.commit()

    # add COSMIC mutations only for tumor samples not already covered by the MAF data
    cols_of_interest = cols_of_interest[:-5] + cols_of_interest[-2:] + ['Variant_Classification']
    cosmic_col_list = ['Gene', 'SampleName', 'PrimaryTissue',
                       'hg19chrom', 'hg19start', 'hg19end',
                       'AminoAcid', 'Nucleotide', 'Variant_Classification']
    mut_cols = ', '.join(cols_of_interest)
    cosmic_cols = ', '.join(cosmic_col_list)
    sql = ("INSERT INTO mutation({0}) "
           "  SELECT {1} "
           "  FROM cosmic_mutation cm"
           "  WHERE cm.SampleName NOT IN ("
           "      SELECT DISTINCT(m.Tumor_Sample) "
           "      FROM mutation m "
           "  ) ".format(mut_cols, cosmic_cols))
    cur.execute(sql)
    conn.commit()

    # mark missing DNA changes as unknown
    sql = ("UPDATE mutation SET DNA_Change='c.?' "
           "WHERE DNA_Change IS NULL")
    cur.execute(sql)
    conn.commit()
    conn.close()
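# A self-contained sketch (table contents and sample names are made up) of the
# INSERT ... SELECT pattern used above: COSMIC rows are copied into the
# consolidated mutation table only for tumor samples that are not already
# present from the MAF data, so the same sample is never counted twice.
import sqlite3

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute("CREATE TABLE mutation (Gene TEXT, Tumor_Sample TEXT)")
cur.execute("CREATE TABLE cosmic_mutation (Gene TEXT, SampleName TEXT)")
cur.execute("INSERT INTO mutation VALUES ('TP53', 'sample-1')")
cur.executemany("INSERT INTO cosmic_mutation VALUES (?, ?)",
                [('TP53', 'sample-1'), ('KRAS', 'sample-2')])
cur.execute("INSERT INTO mutation (Gene, Tumor_Sample) "
            "  SELECT Gene, SampleName "
            "  FROM cosmic_mutation cm "
            "  WHERE cm.SampleName NOT IN ("
            "      SELECT DISTINCT(m.Tumor_Sample) FROM mutation m)")
conn.commit()
print(cur.execute("SELECT * FROM mutation").fetchall())
# [('TP53', 'sample-1'), ('KRAS', 'sample-2')] -- only sample-2 was added
conn.close()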
def main(hypermutator_count, mut_path, db_path, no_cosmic_flag, opts):
    """Concatenates all the mutation data from tab delimited files in
    the cosmic directory. Next, saves the results to a sqlite db.

    Parameters
    ----------
    hypermutator_count : int
        remove samples with too many mutations
    mut_path : str
        Either path to directory containing contents of COSMIC's
        genes.tgz file or decompressed CosmicMutantExport.tsv. If
        empty string, just use path from config file.
    db_path : str
        path to save sqlite database. If string is empty, use path
        from config.
    no_cosmic_flag : bool
        indicates not to use cosmic mutations
    opts : dict
        additional options (e.g. only_genome_wide, use_unknown_status)
    """
    # get input/output configurations
    in_opts = _utils.get_input_config('input')
    cosmic_path = in_opts['cosmic_path']
    out_opts = _utils.get_output_config('gene_tsv')
    out_path = out_opts['gene_tsv']
    cnv_path = out_opts['cnv_tsv']
    db_opts = _utils.get_db_config('2020plus')
    out_db = db_opts['db']

    # check if user specifies non standard db path
    out_db = db_path if db_path else out_db

    # save info into a txt file and sqlite3 database
    if not no_cosmic_flag:
        cosmic_path = mut_path if mut_path else cosmic_path
        if os.path.isdir(cosmic_path):
            # concatenate all gene files
            concatenate_genes(out_path, cosmic_path)
            # save database
            save_db(hypermutator_count, out_path, out_db,
                    is_genes_tgz=True,
                    only_genome_wide=opts['only_genome_wide'],
                    use_unknown_status=opts['use_unknown_status'])
        elif os.path.isfile(cosmic_path):
            # save database
            save_db(hypermutator_count, cosmic_path, out_db,
                    is_genes_tgz=False,
                    only_genome_wide=opts['only_genome_wide'],
                    use_unknown_status=opts['use_unknown_status'])
        else:
            raise ValueError('Please specify a valid path to COSMIC data')
    else:
        # create an empty table if cosmic not wanted
        create_empty_cosmic_mutation_table(out_db)
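# A hedged usage sketch for the COSMIC loader above. The cutoff, path, and
# option values here are hypothetical; in the real pipeline they come from the
# command line and the config file, and an empty db_path means "use the
# configured database".
cosmic_opts = {'only_genome_wide': True, 'use_unknown_status': False}
main(hypermutator_count=500,
     mut_path='data/CosmicMutantExport.tsv',  # or a directory of per-gene files
     db_path='',
     no_cosmic_flag=False,
     opts=cosmic_opts)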
def main(maf_path, db_path, hypermutator_count):
    """Save MAF mutations into the database's maf_mutation table."""
    # get db info
    db_opts = _utils.get_db_config('2020plus')
    out_db = db_opts['db']
    out_db = db_path if db_path else out_db

    # update database maf_mutation table
    if maf_path:
        # add to database if they specify a MAF file
        save_db(maf_path, out_db, hypermutator_count)
    else:
        # else just create an empty maf_mutation table
        create_empty_maf_mutation_table(out_db)
def wrapper_retrieve_gene_features(opts):
    """Wrapper around the retrieve_gene_features function in the
    features module.

    Parameters
    ----------
    opts : dict
        command line options

    Returns
    -------
    additional_features : pd.DataFrame
    """
    # get additional features
    db_cfg = _utils.get_db_config('2020plus')
    conn = sqlite3.connect(db_cfg['db'])
    additional_features = retrieve_gene_features(conn, opts, get_entropy=False)
    conn.close()
    return additional_features
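# A sketch of the same connection-management pattern with contextlib.closing,
# which closes the sqlite connection even if feature retrieval raises. The
# retrieval function is passed in as an argument here (a hypothetical stand-in
# for retrieve_gene_features) so the snippet stays self-contained.
import sqlite3
from contextlib import closing

def retrieve_features_safely(db_path, opts, retrieve_func):
    with closing(sqlite3.connect(db_path)) as conn:
        return retrieve_func(conn, opts, get_entropy=False)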
def main(db_path):
    """Build the gene_features table (gene length, MutSigCV covariates,
    BioGRID stats) and save it to the database."""
    # get config files
    in_opts = _utils.get_input_config('input')
    db_opts = _utils.get_db_config('2020plus')

    # get absolute path for cosmic data
    cosmic_path = os.path.join(_utils.proj_dir, in_opts['cosmic_path'])

    # get data for gene_features table
    logger.info('Processing features for gene_features table ...')
    if os.path.isdir(cosmic_path):
        gene_length = recursive_gene_length(in_opts['fasta_dir'])
        genes, lengths = zip(*gene_length.items())
        gene_length_df = pd.DataFrame({'gene': genes, 'gene length': lengths})
    else:
        gene_length_df = pd.read_csv(cosmic_path, sep='\t')
        gene_length_df = gene_length_df[['Gene name', 'Gene CDS length']]
        gene_length_df = gene_length_df.rename(columns={'Gene name': 'gene',
                                                        'Gene CDS length': 'gene length'})
        gene_length_df.drop_duplicates(cols=['gene'], inplace=True)

    # merge in data from mutsig and biogrid
    mutsigcv_feature_path = os.path.join(_utils.proj_dir, in_opts['mutsigcv_features'])
    df = pd.read_csv(mutsigcv_feature_path, sep='\t')
    df = pd.merge(gene_length_df, df, how='left', on='gene')  # merge the data frames
    biogrid_path = os.path.join(_utils.proj_dir, 'data/biogrid_stats.txt')
    biogrid_df = pd.read_csv(biogrid_path, sep='\t')
    df = pd.merge(df, biogrid_df, how='left', on='gene')

    # path to database
    db_path = db_path if db_path else db_opts['db']

    logger.info('Finished processing features for gene_features table.')

    # save database
    save_db(df, db_path)
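# A self-contained sketch (gene names, lengths, and feature columns are
# illustrative) of the left-merge chain above that assembles the gene_features
# table: gene length first, then MutSigCV covariates, then BioGRID stats, all
# keyed on 'gene'. how='left' keeps every gene from the length table; genes
# missing from a feature source simply get NaN. Note that drop_duplicates with
# the cols= keyword is from older pandas; newer releases use subset= instead.
import pandas as pd

gene_length_df = pd.DataFrame({'gene': ['TP53', 'KRAS', 'BRAF'],
                               'gene length': [1182, 567, 2301]})
mutsig_df = pd.DataFrame({'gene': ['TP53', 'BRAF'],
                          'expression': [5.2, 3.8]})
biogrid_df = pd.DataFrame({'gene': ['TP53', 'KRAS'],
                           'gene_degree': [200, 150]})

df = pd.merge(gene_length_df, mutsig_df, how='left', on='gene')
df = pd.merge(df, biogrid_df, how='left', on='gene')
print(df)
# TP53 keeps all three features; KRAS has NaN expression; BRAF has NaN gene_degree.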