コード例 #1
0
ファイル: features.py プロジェクト: KarchinLab/2020plus
def main_cosmic(options):
    """Generate gene features from the mutations table and save them.

    Reads the mutation columns from the ``mutations`` table of the sqlite
    database named in the '2020plus' db config, passes them to
    ``futils.generate_features``, moves the ``gene`` column to the front
    and writes the result as a tab-delimited text file.

    Parameters
    ----------
    options : dict
        command line options. ``options['output']`` is the output path;
        if it is empty, the path is built from ``_utils.save_dir`` and the
        'gene_features' entry of the 'classifier' input config. The dict
        is also passed unchanged to ``futils.generate_features``.

    Raises
    ------
    ValueError
        if the generated features do not contain a ``gene`` column
    """
    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    conn = sqlite3.connect(db_cfg['db'])
    try:
        # read_sql replaces the deprecated/removed frame_query
        # NOTE(review): assumes psql is pandas.io.sql -- confirm
        mut_df = psql.read_sql(sql, con=conn)
    finally:
        # release the connection even if the query fails
        conn.close()

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # make the gene name the first column, keeping the order of the rest
    other_cols = all_features.columns.tolist()
    other_cols.remove('gene')
    all_features = all_features[['gene'] + other_cols]

    # save features to text file (user supplied path wins over the config)
    out_path = options['output']
    if not out_path:
        out_path = _utils.save_dir + in_opts['gene_features']
    all_features.to_csv(out_path, sep='\t', index=False)
コード例 #2
0
def main_cosmic(options):
    """Generate gene features from the mutations table and save them.

    Reads the mutation columns from the ``mutations`` table of the sqlite
    database named in the '2020plus' db config, passes them to
    ``futils.generate_features``, moves the ``gene`` column to the front
    and writes the result as a tab-delimited text file.

    Parameters
    ----------
    options : dict
        command line options. ``options['output']`` is the output path;
        if it is empty, the path is built from ``_utils.save_dir`` and the
        'gene_features' entry of the 'classifier' input config. The dict
        is also passed unchanged to ``futils.generate_features``.

    Raises
    ------
    ValueError
        if the generated features do not contain a ``gene`` column
    """
    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    conn = sqlite3.connect(db_cfg['db'])
    try:
        # read_sql replaces the deprecated/removed frame_query
        # NOTE(review): assumes psql is pandas.io.sql -- confirm
        mut_df = psql.read_sql(sql, con=conn)
    finally:
        # release the connection even if the query fails
        conn.close()

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # make the gene name the first column, keeping the order of the rest
    other_cols = all_features.columns.tolist()
    other_cols.remove('gene')
    all_features = all_features[['gene'] + other_cols]

    # save features to text file (user supplied path wins over the config)
    out_path = options['output']
    if not out_path:
        out_path = _utils.save_dir + in_opts['gene_features']
    all_features.to_csv(out_path, sep='\t', index=False)
コード例 #3
0
def main(db_path):
    """Fill the ``mutation`` table from ``maf_mutation`` and ``cosmic_mutation``.

    Creates an empty ``mutation`` table, copies the rows of
    ``maf_mutation`` into it, then adds the rows of ``cosmic_mutation``
    whose sample name is not already present as a ``Tumor_Sample``, and
    finally sets ``DNA_Change`` to ``'c.?'`` wherever it is NULL.

    Parameters
    ----------
    db_path : str
        path to the sqlite database. If empty, the path from the
        '2020plus' db config is used.
    """
    db_opts = _utils.get_db_config('2020plus')
    out_db = db_path if db_path else db_opts['db']

    cols_of_interest = [
        'Gene', 'Tumor_Sample', 'Tumor_Type', 'Chromosome', 'Start_Position',
        'End_Position', 'Variant_Classification', 'Reference_Allele',
        'Tumor_Allele', 'Protein_Change', 'DNA_Change'
    ]
    data_type = [
        'TEXT', 'TEXT', 'TEXT', 'TEXT', 'INT', 'INT', 'TEXT', 'TEXT', 'TEXT',
        'TEXT', 'TEXT'
    ]
    _utils.create_empty_table('mutation', out_db, cols_of_interest, data_type)

    # the MAF insert fills every column except DNA_Change; the source
    # table names the gene column Gene_Symbol instead of Gene
    maf_target_cols = cols_of_interest[:-1]
    maf_source_cols = ['Gene_Symbol'] + cols_of_interest[1:-1]
    maf_sql = ("INSERT INTO mutation ({0}) "
               "    SELECT {1}"
               "    FROM maf_mutation".format(', '.join(maf_target_cols),
                                              ', '.join(maf_source_cols)))

    # the COSMIC insert fills the first six columns, Protein_Change,
    # DNA_Change and Variant_Classification (no allele columns)
    cosmic_target_cols = cols_of_interest[:-5] + cols_of_interest[-2:] + [
        'Variant_Classification'
    ]
    cosmic_source_cols = [
        'Gene', 'SampleName', 'PrimaryTissue', 'hg19chrom', 'hg19start',
        'hg19end', 'AminoAcid', 'Nucleotide', 'Variant_Classification'
    ]
    # NOTE(review): NOT IN matches no row at all if the subquery yields a
    # NULL -- confirm Tumor_Sample is never NULL in maf_mutation
    cosmic_sql = ("INSERT INTO mutation({0}) "
                  "    SELECT {1} "
                  "    FROM cosmic_mutation cm"
                  "    WHERE cm.SampleName NOT IN ("
                  "        SELECT DISTINCT(m.Tumor_Sample) "
                  "        FROM mutation m "
                  "    ) ".format(', '.join(cosmic_target_cols),
                                  ', '.join(cosmic_source_cols)))

    # the space before WHERE was missing in the concatenated literal
    fill_sql = "UPDATE mutation SET DNA_Change='c.?' WHERE DNA_Change IS NULL"

    conn = sqlite3.connect(out_db)  # open connection
    try:
        cur = conn.cursor()
        # commit after each step so earlier inserts persist if a later
        # statement fails, as in the original sequence
        for statement in (maf_sql, cosmic_sql, fill_sql):
            cur.execute(statement)
            conn.commit()
    finally:
        # always release the connection, even when a statement raises
        conn.close()
コード例 #4
0
ファイル: gene_tsv.py プロジェクト: yuanjingnan/2020plus
def main(hypermutator_count, mut_path, db_path, no_cosmic_flag, opts):
    """Concatenates all the mutation data from tab delimited files in
    the cosmic directory. Next, saves the results to a sqlite db.

    Parameters
    ----------
    hypermutator_count : int
        remove samples with too many mutations
    mut_path : str
        Either path to directory containing contents of COSMIC's
        genes.tgz file or decompressed CosmicMutantExport.tsv.
        If empty string, just use path from config file.
    db_path : str
        path to save sqlite database. If string is empty,
        use path from config.
    no_cosmic_flag : bool
        indicates not to use cosmic mutations
    opts : dict
        options forwarded to ``save_db``; the keys 'only_genome_wide'
        and 'use_unknown_status' are read

    Raises
    ------
    ValueError
        if cosmic mutations are wanted but the COSMIC path is neither a
        directory nor a file
    """
    # get input/output configurations
    in_opts = _utils.get_input_config('input')
    out_opts = _utils.get_output_config('gene_tsv')
    db_opts = _utils.get_db_config('2020plus')

    # check if user specifies non standard db path
    out_db = db_path if db_path else db_opts['db']

    if no_cosmic_flag:
        # create an empty table if cosmic not wanted
        create_empty_cosmic_mutation_table(out_db)
        return

    # user supplied path takes precedence over the configured one
    cosmic_path = mut_path if mut_path else in_opts['cosmic_path']
    if os.path.isdir(cosmic_path):
        # concatenate all gene files into a single txt file first
        mutation_file = out_opts['gene_tsv']
        concatenate_genes(mutation_file, cosmic_path)
        is_genes_tgz = True
    elif os.path.isfile(cosmic_path):
        # a single file is saved to the database directly
        mutation_file = cosmic_path
        is_genes_tgz = False
    else:
        raise ValueError('Please specify a valid path to COSMIC data')

    # save database
    save_db(hypermutator_count,
            mutation_file,
            out_db,
            is_genes_tgz=is_genes_tgz,
            only_genome_wide=opts['only_genome_wide'],
            use_unknown_status=opts['use_unknown_status'])
コード例 #5
0
ファイル: gene_maf.py プロジェクト: KarchinLab/2020plus
def main(maf_path, db_path, hypermutator_count):
    """Update the maf_mutation table of the sqlite database.

    Parameters
    ----------
    maf_path : str
        path to a MAF file. If empty, only an empty maf_mutation
        table is created.
    db_path : str
        path to the sqlite database. If empty, the path from the
        '2020plus' db config is used.
    hypermutator_count : int
        passed through to ``save_db`` together with the MAF path
    """
    # resolve which database to write to
    configured_db = _utils.get_db_config('2020plus')['db']
    target_db = db_path or configured_db

    # no MAF file given: just create an empty maf_mutation table
    if not maf_path:
        create_empty_maf_mutation_table(target_db)
        return

    # otherwise add the contents of the MAF file to the database
    save_db(maf_path, target_db, hypermutator_count)
コード例 #6
0
ファイル: gene_maf.py プロジェクト: yuanjingnan/2020plus
def main(maf_path, db_path, hypermutator_count):
    """Update the maf_mutation table of the sqlite database.

    Parameters
    ----------
    maf_path : str
        path to a MAF file. If empty, only an empty maf_mutation
        table is created.
    db_path : str
        path to the sqlite database. If empty, the path from the
        '2020plus' db config is used.
    hypermutator_count : int
        passed through to ``save_db`` together with the MAF path
    """
    # resolve which database to write to
    configured_db = _utils.get_db_config('2020plus')['db']
    target_db = db_path or configured_db

    # no MAF file given: just create an empty maf_mutation table
    if not maf_path:
        create_empty_maf_mutation_table(target_db)
        return

    # otherwise add the contents of the MAF file to the database
    save_db(maf_path, target_db, hypermutator_count)
コード例 #7
0
ファイル: feature_utils.py プロジェクト: MartinFXP/2020plus
def wrapper_retrieve_gene_features(opts):
    """Wrapper around the retrieve_gene_features function in the
    features module.

    Opens a connection to the sqlite database named in the '2020plus'
    db config, calls ``retrieve_gene_features`` with ``get_entropy=False``
    and closes the connection again, even if the call raises.

    Parameters
    ----------
    opts : dict
        command line options

    Returns
    -------
    additional_features : pd.DataFrame

    """
    # get additional features
    db_cfg = _utils.get_db_config('2020plus')
    conn = sqlite3.connect(db_cfg['db'])
    try:
        return retrieve_gene_features(conn, opts, get_entropy=False)
    finally:
        # do not leak the connection when feature retrieval fails
        conn.close()
コード例 #8
0
ファイル: feature_utils.py プロジェクト: KarchinLab/2020plus
def wrapper_retrieve_gene_features(opts):
    """Wrapper around the retrieve_gene_features function in the
    features module.

    Opens a connection to the sqlite database named in the '2020plus'
    db config, calls ``retrieve_gene_features`` with ``get_entropy=False``
    and closes the connection again, even if the call raises.

    Parameters
    ----------
    opts : dict
        command line options

    Returns
    -------
    additional_features : pd.DataFrame

    """
    # get additional features
    db_cfg = _utils.get_db_config('2020plus')
    conn = sqlite3.connect(db_cfg['db'])
    try:
        return retrieve_gene_features(conn, opts, get_entropy=False)
    finally:
        # do not leak the connection when feature retrieval fails
        conn.close()
コード例 #9
0
ファイル: gene_features.py プロジェクト: yuanjingnan/2020plus
def main(db_path):
    """Build the gene_features data and save it to the database.

    Gene lengths come either from ``recursive_gene_length`` (when the
    configured COSMIC path is a directory) or from the 'Gene name' and
    'Gene CDS length' columns of the COSMIC file. They are left-merged
    on ``gene`` with the MutSigCV features and the BioGRID stats, and the
    merged data frame is handed to ``save_db``.

    Parameters
    ----------
    db_path : str
        path to the sqlite database. If empty, the path from the
        '2020plus' db config is used.
    """
    # get config files
    in_opts = _utils.get_input_config('input')
    db_opts = _utils.get_db_config('2020plus')

    # get absolute path for cosmic data
    cosmic_path = os.path.join(_utils.proj_dir, in_opts['cosmic_path'])

    # get data for gene_features table
    logger.info('Processing features for gene_features table ...')
    if os.path.isdir(cosmic_path):
        gene_length = recursive_gene_length(in_opts['fasta_dir'])
        # built from (gene, length) pairs so an empty mapping gives an
        # empty frame instead of failing while unpacking zip()
        gene_length_df = pd.DataFrame(list(gene_length.items()),
                                      columns=['gene', 'gene length'])
    else:
        gene_length_df = pd.read_csv(cosmic_path, sep='\t')
        gene_length_df = gene_length_df[['Gene name', 'Gene CDS length']]
        gene_length_df = gene_length_df.rename(columns={
            'Gene name': 'gene',
            'Gene CDS length': 'gene length'
        })
        # keep one row per gene; `subset` replaces the removed `cols`
        # keyword, and reassigning avoids mutating a slice in place
        gene_length_df = gene_length_df.drop_duplicates(subset=['gene'])

    # merge in data from mutsig and biogrid
    mutsigcv_feature_path = os.path.join(_utils.proj_dir,
                                         in_opts['mutsigcv_features'])
    mutsigcv_df = pd.read_csv(mutsigcv_feature_path, sep='\t')
    # left merges keep every gene that has a length
    df = pd.merge(gene_length_df, mutsigcv_df, how='left', on='gene')
    biogrid_path = os.path.join(_utils.proj_dir, 'data/biogrid_stats.txt')
    biogrid_df = pd.read_csv(biogrid_path, sep='\t')
    df = pd.merge(df, biogrid_df, how='left', on='gene')

    # path to database
    db_path = db_path if db_path else db_opts['db']

    logger.info('Finished processing features for gene_features table.')

    # save database
    save_db(df, db_path)
コード例 #10
0
ファイル: gene_features.py プロジェクト: KarchinLab/2020plus
def main(db_path):
    """Build the gene_features data and save it to the database.

    Gene lengths come either from ``recursive_gene_length`` (when the
    configured COSMIC path is a directory) or from the 'Gene name' and
    'Gene CDS length' columns of the COSMIC file. They are left-merged
    on ``gene`` with the MutSigCV features and the BioGRID stats, and the
    merged data frame is handed to ``save_db``.

    Parameters
    ----------
    db_path : str
        path to the sqlite database. If empty, the path from the
        '2020plus' db config is used.
    """
    # get config files
    in_opts = _utils.get_input_config('input')
    db_opts = _utils.get_db_config('2020plus')

    # get absolute path for cosmic data
    cosmic_path = os.path.join(_utils.proj_dir, in_opts['cosmic_path'])

    # get data for gene_features table
    logger.info('Processing features for gene_features table ...')
    if os.path.isdir(cosmic_path):
        gene_length = recursive_gene_length(in_opts['fasta_dir'])
        # built from (gene, length) pairs so an empty mapping gives an
        # empty frame instead of failing while unpacking zip()
        gene_length_df = pd.DataFrame(list(gene_length.items()),
                                      columns=['gene', 'gene length'])
    else:
        gene_length_df = pd.read_csv(cosmic_path, sep='\t')
        gene_length_df = gene_length_df[['Gene name', 'Gene CDS length']]
        gene_length_df = gene_length_df.rename(columns={
            'Gene name': 'gene',
            'Gene CDS length': 'gene length'
        })
        # keep one row per gene; `subset` replaces the removed `cols`
        # keyword, and reassigning avoids mutating a slice in place
        gene_length_df = gene_length_df.drop_duplicates(subset=['gene'])

    # merge in data from mutsig and biogrid
    mutsigcv_feature_path = os.path.join(_utils.proj_dir,
                                         in_opts['mutsigcv_features'])
    mutsigcv_df = pd.read_csv(mutsigcv_feature_path, sep='\t')
    # left merges keep every gene that has a length
    df = pd.merge(gene_length_df, mutsigcv_df, how='left', on='gene')
    biogrid_path = os.path.join(_utils.proj_dir, 'data/biogrid_stats.txt')
    biogrid_df = pd.read_csv(biogrid_path, sep='\t')
    df = pd.merge(df, biogrid_df, how='left', on='gene')

    # path to database
    db_path = db_path if db_path else db_opts['db']

    logger.info('Finished processing features for gene_features table.')

    # save database
    save_db(df, db_path)