コード例 #1
0
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = list(
        [k for d, k in data_df['SampleBarcode'].iteritems()])
    file_list = list([k for d, k in data_df['filenamepath'].iteritems()])
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)
コード例 #2
0
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = list([k for d, k in data_df['SampleBarcode'].iteritems()])
    file_list = list([k for d, k in data_df['filenamepath'].iteritems()])
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)
コード例 #3
0
def insert_metadata(data_df, metadata, table):
    """Insert one metadata row per unique sample barcode into *table*.

    Args:
        data_df: dataframe with a 'sample_barcode' column.
        metadata: base metadata dict copied into each row.
        table: CloudSQL table name passed to update_metadata_data_list.
    """
    # A set comprehension deduplicates directly, avoiding the throwaway
    # intermediate list of the old list(set([...])) form.
    sample_barcodes = list({v for _, v in data_df['sample_barcode'].iteritems()})
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(table, sample_metadata_list)
コード例 #4
0
def insert_metadata(data_df, metadata, table):
    """Write one metadata record per distinct sample barcode to *table*."""
    # Collect the distinct barcodes found in the dataframe.
    unique_barcodes = list(set(v for _, v in data_df['sample_barcode'].iteritems()))
    rows = []
    for code in unique_barcodes:
        row = metadata.copy()
        row['sample_barcode'] = code
        rows.append(row)
    update_metadata_data_list(table, rows)
コード例 #5
0
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'], project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
コード例 #6
0
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj[
                    'sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['Project'] = metadata['project_id']
                new_df_obj['Study'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][
                    k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][
                    k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][
                    k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(
        set([k for d, k in new_df['SampleBarcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'],
                                           metadata['data_type'],
                                           sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'],
                                         metadata['study_id'], project_id,
                                         bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(new_df,
                                       outfilename,
                                       metadata=metadata,
                                       tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id,
                            bq_dataset,
                            table_name,
                            schema,
                            source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)