Example #1
def identify_data(config):
    """Gets the metadata info from database
    """
    log = configure_logging('cnv', 'logs/cnv_extract.log')
    log.info('start cnv extract')
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect file names from db")
    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='Copy Number Results-SNP'
        AND DatafileUploaded='true'
        AND DatafileNameKey LIKE '%.nocnv_hg19.seg.txt'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # build per-file output keys; because the number of rows exceeds the GCS-to-BigQuery
    # import limitation, rotate the files across multiple virtual directories
    dir_prefix = config['cnv']['output_dir_prefix']
    dir_suffixes = config['cnv']['output_dir_suffixes']
    num_suffix = len(dir_suffixes)
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        metadata_df.loc[i, 'OutDatafileNameKey'] = dir_prefix + dir_suffixes[i % num_suffix] \
                             + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
                                               .map(lambda code: config['sample_code2letter'][code])

    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
                                               .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'cnv.transform.parse_cnv'

    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    log.info("\tAfter filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape)))

    log.info('finished cnv extract')
    return metadata_df
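
A minimal sketch of the config shape this CNV extractor reads; the keys mirror the lookups in the code above, but every value here is a placeholder:

config = {
    'cloudsql': {'host': '127.0.0.1', 'db': 'metadata', 'user': 'etl', 'passwd': 'secret'},
    'cnv': {
        'output_dir_prefix': 'etl/cnv/',            # hypothetical GCS prefix
        'output_dir_suffixes': ['dir0/', 'dir1/'],  # rows rotate across these suffixes
    },
    'sample_code2letter': {'01': 'TP', '10': 'NB', '50': 'CELLC'},  # illustrative mapping
}
metadata_df = identify_data(config)  # requires a reachable CloudSQL instance
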
Example #2
def identify_data(config):
    """Gets the metadata info from database
    """
    # setup logging
    log = configure_logging('methylation_extract',
                            'logs/methylation_extract.log')
    log.info('start methylation extract')

    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis
        FROM metadata_data
        WHERE DataType='DNA Methylation'
        AND DatafileUploaded='true'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    log.info("\tselect file names from db")
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # build per-platform output file keys
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        platform = row['Platform']
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['methylation']['output_dir']\
                            + platform + '/' + metadata['DatafileName'] + '.csv'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
                                               .map(lambda code: config['sample_code2letter'][code])

    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
                                               .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df[
        'transform_function'] = 'methylation.transform.parse_methylation'

    log.info("\tFound {0} rows, columns.".format(str(metadata_df.shape)))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    log.info("\tAfter filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape)))

    log.info('finished methylation extract')
    return metadata_df
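
Each extractor ends with the same boolean-flag filter; a toy illustration of its behavior on a made-up frame (any flag column beyond is_tumor is hypothetical):

import pandas as pd

df = pd.DataFrame({
    'DatafileName': ['a.txt', 'b.txt', 'c.txt'],
    'is_tumor': [True, False, True],
    'is_valid': [True, True, True],   # hypothetical extra flag column
})
flag_columns = [c for c in df.columns.values if c.startswith('is_')]
df = df[df[flag_columns].all(axis=1)]  # keep rows where every 'is_' flag is True
print df  # only 'a.txt' and 'c.txt' remain
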
Example #3
def identify_data(config):
    """Gets the metadata info from database
    """
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='Expression-miRNA'
        AND DatafileUploaded='true'
        AND DatafileNameKey LIKE '%.mirna.quantification.txt'
        AND IncludeForAnalysis='yes'
        """
    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # normalize platform names and build output file keys
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        if 'IlluminaHiSeq' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaHiSeq'
            platform = 'IlluminaHiSeq'
        elif 'IlluminaGA' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaGA'
            platform = 'IlluminaGA'
        else:
            raise Exception('Did not match any given platform')

        metadata_df.loc[i, 'OutDatafileNameKey'] = config['mirna']['mirna']['output_dir']\
                            + platform + '/' + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
                                               .map(lambda code: config['sample_code2letter'][code])

    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
                                               .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'mirna.mirna.transform.parse_mirna'

    print "Found {0} rows, columns.".format(str(metadata_df.shape))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    print "After filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape))

    return metadata_df
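
The IlluminaHiSeq/IlluminaGA branching above (and in Example #4 below) could be factored into a small helper; a sketch, where normalize_platform is not part of the original code:

def normalize_platform(platform):
    """Collapses a raw platform string to its canonical short name."""
    for canonical in ('IlluminaHiSeq', 'IlluminaGA'):
        if canonical in platform:
            return canonical
    raise Exception('Did not match any given platform')

# inside the loop, the two branches would then reduce to:
#     platform = normalize_platform(metadata['Platform'])
#     metadata_df.loc[i, 'Platform'] = platform
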
Example #4
def identify_data(config):
    """Gets the metadata info from database
    """
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='RSEM_genes_normalized'
        AND DatafileUploaded='true'
        AND DatafileNameKey LIKE '%.rsem.genes.normalized_results'
        AND IncludeForAnalysis='yes'
        """

    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # normalize platform names and build output file keys
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        if 'IlluminaHiSeq' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaHiSeq'
            platform = 'IlluminaHiSeq'
        elif 'IlluminaGA' in metadata['Platform']:
            metadata_df.loc[i, 'Platform'] = 'IlluminaGA'
            platform = 'IlluminaGA'
        else:
            raise Exception('Did not match any given platform')

        metadata_df.loc[i, 'OutDatafileNameKey'] = config['mrna']['unc']['output_dir']\
                            + platform + '/' + metadata['DatafileName'] + '.json'

    metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
                                               .map(lambda code: config['sample_code2letter'][code])

    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
                                               .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # tag CELLC samples
    metadata_df['is_tumor'] = (metadata_df['SampleTypeLetterCode'] != "CELLC")
    metadata_df['transform_function'] = 'mrna.unc.transform.parse_unc'

    print "Found {0} rows, columns." .format(str(metadata_df.shape))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [key for key in metadata_df.columns.values if key.startswith("is_")]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    print "After filtering: Found {0} rows, columns." .format(str(metadata_df.shape))

    return metadata_df
Example #5
def identify_data(config):
    """Gets the metadata info from database
    """
    # cloudSql connection params
    host = config["cloudsql"]["host"]
    database = config["cloudsql"]["db"]
    user = config["cloudsql"]["user"]
    passwd = config["cloudsql"]["passwd"]

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE DataType='Expression-Protein'
        AND DatafileUploaded='true'
        AND IncludeForAnalysis='yes'
        """

    #       AND DatafileNameKey LIKE '%protein_expression.%.txt'
    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # build output file keys
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        metadata_df.loc[i, "OutDatafileNameKey"] = config["protein"]["output_dir"] + metadata["DatafileName"] + ".json"

    metadata_df.loc[:, "SampleTypeLetterCode"] = metadata_df["SampleTypeCode"].map(
        lambda code: config["sample_code2letter"][code]
    )

    metadata_df.loc[:, "DatafileNameKey"] = metadata_df["DatafileNameKey"].map(
        lambda inputfile: re.sub(r"^/", "", inputfile)
    )

    # tag CELLC samples
    metadata_df["is_tumor"] = metadata_df["SampleTypeLetterCode"] != "CELLC"
    metadata_df["transform_function"] = "protein.transform.parse_protein"

    print "Found {0} rows, columns.".format(str(metadata_df.shape))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [key for key in metadata_df.columns.values if key.startswith("is_")]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]

    print "After filtering: Found {0} rows, columns.".format(str(metadata_df.shape))

    return metadata_df
Example #6
def process_sql2bq(config, etl_config, sql, log):
    """Runs the SQL against CloudSQL, uploads the result to GCS as
    newline-delimited JSON, and loads it into the configured BigQuery table.
    """
    log.info("\t\tselect annotation records from db")
    metadata_df = read_mysql_query(config['cloudsql']['host'],
                                   config['cloudsql']['db'],
                                   config['cloudsql']['user'],
                                   config['cloudsql']['passwd'], sql)
    log.info("\t\tFound {0} rows, columns.".format(str(metadata_df.shape)))
    log.info("\tupload data to GCS.")
    project_id = config['cloud_projects']['open']
    bucket_name = config['buckets']['open']
    gcs_file_path = config['buckets']['folders'][
        'base_run_folder'] + 'etl/annotation'
    gcs = GcsConnector(project_id, bucket_name)
    gcs.convert_df_to_njson_and_upload(metadata_df,
                                       gcs_file_path + '/annotation.json',
                                       logparam=log)
    log.info('create the BigQuery table')
    Etl().load(project_id, [etl_config['bq_dataset']],
               [etl_config['bq_table']], [etl_config['schema_file']],
               ['gs://' + bucket_name + '/' + gcs_file_path],
               [etl_config['write_disposition']], 1, log)
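
A sketch of the etl_config dict that process_sql2bq expects, inferred from the keys it reads; all values are placeholders:

etl_config = {
    'bq_dataset': 'tcga_metadata',             # hypothetical dataset name
    'bq_table': 'annotation',                  # hypothetical table name
    'schema_file': 'schemas/annotation.json',  # hypothetical schema path
    'write_disposition': 'WRITE_TRUNCATE',     # standard BigQuery disposition
}
# process_sql2bq(config, etl_config, sql, log) then writes the query result to
# gs://<open bucket>/<base_run_folder>etl/annotation/annotation.json and loads it
# into <bq_dataset>.<bq_table>.
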
Example #7
def identify_data(config):
    """Gets the metadata info from database
    """
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']
#    sqlquery = """
#        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
#              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
#              DatafileUploaded, IncludeForAnalysis, DataCenterName
#        FROM metadata_data
#        WHERE DatafileUploaded='true'
#        AND DatafileNameKey LIKE '%.maf'
#        AND IncludeForAnalysis='yes'
#        """

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE Datatype='Mutations'
        AND DatafileNameKey LIKE '%.maf'
        AND DatafileUploaded='true'
        AND IncludeForAnalysis='yes'
        """

    #sqlquery = """ select datafilename, datafilenamekey from metadata_data where 0 < instr(datafilename, 'maf') and 'true' = datafileuploaded and 0 = instr(datafilename, 'protected') group by datafilename, dataarchivename;
#"""
    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # build output file keys
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['mirna']['mirna']['output_dir']\
                            + metadata['DatafileName'] + '.json'

    #metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
    #                                           .map(lambda code: config['sample_code2letter'][code])

    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
                                               .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # set the transform function for downstream parsing
    metadata_df['transform_function'] = 'mirna.mirna.transform.parse_mirna'

    print "Found {0} rows, columns." .format(str(metadata_df.shape))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [key for key in metadata_df.columns.values if key.startswith("is_")]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]
    # write the filtered metadata to an Excel workbook for review
    # (assumes pandas is imported as pd; the output filename is a placeholder)
    writer = pd.ExcelWriter('maf_files_metadata.xlsx')
    metadata_df.to_excel(writer, 'maf_files_metadata_table')
    writer.save()
    print "After filtering: Found {0} rows, columns." .format(str(metadata_df.shape))

    return metadata_df
Example #8
def identify_data(config):
    """Gets the metadata info from database
    """
    log = configure_logging('data_et', 'logs/data_et.log')
    log.info('start data extract and transform')
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect data records from db")
    sqlquery = """
        SELECT 
            ParticipantBarcode,
            SampleBarcode,
            AliquotBarcode,
            AliquotUUID,
            DataArchiveName,
            DataArchiveURL,
            DataArchiveVersion,
            DataCenterCode,
            DataCenterName,
            DataCenterType,
            DatafileMD5,
            DatafileName,
            DatafileNameKey,
            DatafileUploaded,
            DataLevel,
            Datatype,
            GenomeReference,
            IncludeForAnalysis,
            MAGETabArchiveName,
            MAGETabArchiveURL,
            Pipeline,
            Platform,
            Project,
            Repository,
            SampleType,
            SampleTypeCode,
            SDRFFileName,
            SDRFFileNameKey,
            SecurityProtocol,
            Species,
            Study,
            wasDerivedFrom,
            library_strategy,
            state,
            reason_for_state,
            analysis_id,
            analyte_code,
            last_modified,
            platform_full_name,
            GG_dataset_id,
            GG_readgroupset_id
        FROM metadata_data
        WHERE Project = 'TCGA'
            AND study = '%s'
            AND DatafileUploaded='true'
            AND DatafileNameKey is not null
            AND IncludeForAnalysis='yes'
        """

    studies = config['all_tumor_types']
    for study in studies:
        # connect to db and get results in a dataframe
        log.info("\tselect data records from db for %s" % (study))
        metadata_df = read_mysql_query(host, database, user, passwd, sqlquery % (study))
        log.info("\tFound {0} rows, columns." .format(str(metadata_df.shape)))
    
        project_id = config['project_id']
        bucket_name = config['buckets']['open']
        gcs = GcsConnector(project_id, bucket_name)
        log.info("\tupload data to GCS.")
        gcs.convert_df_to_njson_and_upload(metadata_df, config['data']['output_dir'] + config['data']['bq_table'] + '_%s.json' % (study))
    
    log.info('finished extract and transform')
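
The loop substitutes each study name into the query with plain %-formatting; for a single study the expansion and the resulting GCS object name look like this (the study name is illustrative):

study = 'BRCA'                        # illustrative tumor type
query_for_study = sqlquery % (study)  # ... AND study = 'BRCA' ...
# the per-study result is uploaded as
#   config['data']['output_dir'] + config['data']['bq_table'] + '_BRCA.json'
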
Example #9
def identify_data(config):
    """Gets the metadata info from database
    """
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']
    #    sqlquery = """
    #        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
    #              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
    #              DatafileUploaded, IncludeForAnalysis, DataCenterName
    #        FROM metadata_data
    #        WHERE DatafileUploaded='true'
    #        AND DatafileNameKey LIKE '%.maf'
    #        AND IncludeForAnalysis='yes'
    #        """

    sqlquery = """
        SELECT ParticipantBarcode, SampleBarcode, AliquotBarcode, Pipeline, Platform,
              SampleType, SampleTypeCode, Study, DatafileName, DatafileNameKey, Datatype,
              DatafileUploaded, IncludeForAnalysis, DataCenterName
        FROM metadata_data
        WHERE Datatype='Mutations'
        AND DatafileNameKey LIKE '%.maf'
        AND DatafileUploaded='true'
        AND IncludeForAnalysis='yes'
        """

    #sqlquery = """ select datafilename, datafilenamekey from metadata_data where 0 < instr(datafilename, 'maf') and 'true' = datafileuploaded and 0 = instr(datafilename, 'protected') group by datafilename, dataarchivename;
    #"""
    # connect to db and get results in a dataframe
    metadata_df = read_mysql_query(host, database, user, passwd, sqlquery)

    # build output file keys
    for i, row in metadata_df.iterrows():
        metadata = row.to_dict()
        metadata_df.loc[i, 'OutDatafileNameKey'] = config['mirna']['mirna']['output_dir']\
                            + metadata['DatafileName'] + '.json'

    #metadata_df.loc[:, 'SampleTypeLetterCode'] = metadata_df['SampleTypeCode']\
    #                                           .map(lambda code: config['sample_code2letter'][code])

    metadata_df.loc[:, 'DatafileNameKey'] = metadata_df['DatafileNameKey']\
                                               .map(lambda inputfile: re.sub(r"^/", "", inputfile))

    # set the transform function for downstream parsing
    metadata_df['transform_function'] = 'mirna.mirna.transform.parse_mirna'

    print "Found {0} rows, columns.".format(str(metadata_df.shape))

    # Filter - check all "is_" fields - remember all 'is_' fields must be boolean
    all_flag_columns = [
        key for key in metadata_df.columns.values if key.startswith("is_")
    ]
    flag_df = metadata_df[all_flag_columns]
    metadata_df = metadata_df[flag_df.all(axis=1)]
    # write the filtered metadata to an Excel workbook for review
    # (assumes pandas is imported as pd; the output filename is a placeholder)
    writer = pd.ExcelWriter('maf_files_metadata.xlsx')
    metadata_df.to_excel(writer, 'maf_files_metadata_table')
    writer.save()
    print "After filtering: Found {0} rows, columns.".format(
        str(metadata_df.shape))

    return metadata_df