Example #1
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    print "Loading Clinical data into BigQuery..."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['clinical']['bq_table'],
        schemas_dir + config['clinical']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['clinical']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    print "*"*30
    print "Loading Biospecimen data into BigQuery..."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['biospecimen']['bq_table'],
        schemas_dir + config['biospecimen']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['biospecimen']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
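
All of these examples delegate to load_data_from_file.run, whose parameters are listed in the docstring but whose body is not shown here. The following is a minimal sketch of what such a helper might look like using the current google-cloud-bigquery client; the original project predates that library, so the function name, the JSON schema-file layout, and the handling of poll_interval and num_retries are assumptions.

import json

from google.cloud import bigquery


def run(project_id, dataset_id, table_name, schema_file, data_path,
        source_format='NEWLINE_DELIMITED_JSON', write_disposition='WRITE_EMPTY',
        poll_interval=30, num_retries=5):
    """Sketch of a GCS-to-BigQuery load helper (defaults are assumptions)."""
    client = bigquery.Client(project=project_id)

    # Assumption: schema_file is a standard BigQuery JSON schema, i.e. a list of
    # {"name", "type", "mode"} dicts (nested RECORD fields omitted for brevity).
    with open(schema_file) as fh:
        fields = json.load(fh)
    schema = [bigquery.SchemaField(f['name'], f['type'], mode=f.get('mode', 'NULLABLE'))
              for f in fields]

    job_config = bigquery.LoadJobConfig(
        schema=schema,
        source_format=source_format,          # 'NEWLINE_DELIMITED_JSON' or 'CSV'
        write_disposition=write_disposition,  # 'WRITE_EMPTY', 'WRITE_APPEND', 'WRITE_TRUNCATE'
    )

    destination = '{0}.{1}.{2}'.format(project_id, dataset_id, table_name)
    job = client.load_table_from_uri(data_path, destination, job_config=job_config)

    # result() blocks until the load job finishes; poll_interval and num_retries are
    # accepted only to mirror the signature described in the docstrings above.
    job.result()
    return job
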
Example #2
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading mRNA unc HiSeq data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['mrna']['unc']['bq_table_hiseq'],
    #    schemas_dir + config['mrna']['unc']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +\
    #        config['mrna']['unc']['output_dir'] + 'IlluminaHiSeq/*',
    #    'NEWLINE_DELIMITED_JSON',
    #    'WRITE_EMPTY'
    #)
    print "*"*30
    print "Loading mRNA unc GA data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mrna']['unc']['bq_table_ga'],
        schemas_dir + config['mrna']['unc']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['mrna']['unc']['output_dir'] + 'IlluminaGA/*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
Example #3
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('data_load', 'logs/data_load.log')
    log.info('begin load of data into bigquery')

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    log.info("\tLoading Data data into BigQuery...")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['data']['bq_table'],
        schemas_dir + config['data']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['data']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )

    log.info('finished load of data into bigquery')
Example #4
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    schemas_dir = os.environ.get("SCHEMA_DIR", "schemas/")

    print "Loading Isoform HiSeq data into BigQuery.."
    load_data_from_file.run(
        config["project_id"],
        config["bq_dataset"],
        config["mirna"]["isoform"]["bq_table_hiseq"],
        schemas_dir + config["mirna"]["isoform"]["schema_file"],
        "gs://" + config["buckets"]["open"] + "/" + config["mirna"]["isoform"]["output_dir"] + "IlluminaHiSeq/*",
        "NEWLINE_DELIMITED_JSON",
        "WRITE_EMPTY",
    )
    print "*" * 30
    print "Loading Isoform GA data into BigQuery.."
    load_data_from_file.run(
        config["project_id"],
        config["bq_dataset"],
        config["mirna"]["isoform"]["bq_table_ga"],
        schemas_dir + config["mirna"]["isoform"]["schema_file"],
        "gs://" + config["buckets"]["open"] + "/" + config["mirna"]["isoform"]["output_dir"] + "IlluminaGA/*",
        "NEWLINE_DELIMITED_JSON",
        "WRITE_EMPTY",
    )
Example #5
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +\
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)
    print "*" * 30
    print "Loading Methylation 27K data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['methylation']['bq_table'],
        schemas_dir + config['methylation']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['methylation']['output_dir'] + 'HumanMethylation27/*',
        'CSV',
        'WRITE_APPEND'
    )
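
The only functional difference from the commented-out 450K block above is the write disposition, which tells BigQuery what to do when the destination table already holds data:

# BigQuery write dispositions, passed straight through to the load job:
#   'WRITE_EMPTY'    - load only if the destination table is empty, otherwise fail
#   'WRITE_APPEND'   - append rows to the existing table (used here for the 27K data)
#   'WRITE_TRUNCATE' - replace the table contents
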
Example #6
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    print "Loading Isoform HiSeq data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna']['isoform']['bq_table_hiseq'],
        schemas_dir + config['mirna']['isoform']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['mirna']['isoform']['output_dir'] + 'IlluminaHiSeq/*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    print "*"*30
    print "Loading Isoform GA data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna']['isoform']['bq_table_ga'],
        schemas_dir + config['mirna']['isoform']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['mirna']['isoform']['output_dir'] + 'IlluminaGA/*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
Example #7
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    print "Loading Clinical data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['clinical']['bq_table'],
        schemas_dir + config['clinical']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['clinical']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    print "*"*30
    print "Loading Biospecimen data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['biospecimen']['bq_table'],
        schemas_dir + config['biospecimen']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['biospecimen']['output_dir'] + 'IlluminaGA/*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
Example #8
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +\
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)
    print "*"*30
    print "Loading Methylation 27K data into BigQuery.."
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['methylation']['bq_table'],
        schemas_dir + config['methylation']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['methylation']['output_dir'] + 'HumanMethylation27/*',
        'CSV',
        'WRITE_APPEND'
    )
Example #9
def load():
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    print "*" * 30
    print "Loading CCLE GCT data into BigQuery.."
    load_data_from_file.run(
        '', '', '', 'schema.json',
        'intermediary/CCLE_mrna_expr/bq_data_files/ccle_mrna_expr.csv', 'CSV',
        'WRITE_EMPTY')
Example #10
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    load_data_from_file.run(
        config['project_id'], 'genomic_reference',
        config['mirtarbase']['bq_table'],
        schemas_dir + config['mirtarbase']['schema_file'], 'gs://' +
        config['buckets']['open'] + '/' + config['mirtarbase']['output_file'],
        'NEWLINE_DELIMITED_JSON', 'WRITE_EMPTY')
Example #11
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['tcga_annotations']['bq_table'],
        schemas_dir + config['tcga_annotations']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' + config['tcga_annotations']['output_file'],
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
Example #12
def load(project_id, bq_datasets, bq_tables, schema_files, gcs_file_paths,
         write_dispositions, log):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """
    log.info('\tbegin load of %s data into bigquery' % (gcs_file_paths))
    sep = ''
    for index in range(len(bq_datasets)):
        log.info("%s\t\tLoading %s table into BigQuery.." %
                 (sep, bq_datasets[index]))
        load_data_from_file.run(project_id, bq_datasets[index],
                                bq_tables[index], schema_files[index],
                                gcs_file_paths[index] + '/*',
                                'NEWLINE_DELIMITED_JSON',
                                write_dispositions[index])
        sep = '\n\t\t"*"*30\n'

    log.info('done load %s of data into bigquery' % (gcs_file_paths))
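
Example #12 walks its parallel lists by index; a zip-based variant of the same loop (a sketch that assumes the same load_data_from_file.run interface) avoids the manual indexing:

def load(project_id, bq_datasets, bq_tables, schema_files, gcs_file_paths,
         write_dispositions, log):
    """zip-based variant of Example #12 (sketch)."""
    log.info('\tbegin load of %s data into bigquery' % (gcs_file_paths,))
    for dataset, table, schema_file, gcs_path, disposition in zip(
            bq_datasets, bq_tables, schema_files, gcs_file_paths, write_dispositions):
        log.info('\t\tLoading %s table into BigQuery..' % dataset)
        load_data_from_file.run(project_id, dataset, table, schema_file,
                                gcs_path + '/*', 'NEWLINE_DELIMITED_JSON',
                                disposition)
        log.info('*' * 30)
    log.info('done load of %s data into bigquery' % (gcs_file_paths,))
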
Example #13
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('cnv_load', 'logs/cnv_load.log')
    log.info('begin load of cnv into bigquery')
    
    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +\
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)
    dir_prefix = config['cnv']['output_dir_prefix']
    dir_suffixes = config['cnv']['output_dir_suffixes']
    for dir_suffix in dir_suffixes:
        log.info("\tLoading CNV data into BigQuery from %s..." % (dir_prefix + dir_suffix))
        load_data_from_file.run(
            config['project_id'],
            config['bq_dataset'],
            config['cnv']['bq_table'],
            schemas_dir + config['cnv']['schema_file'],
            'gs://' + config['buckets']['open'] + '/' +\
                dir_prefix + dir_suffix + '*',
            'NEWLINE_DELIMITED_JSON',
            'WRITE_APPEND'
        )
        log.info("*"*30)

    log.info('finished load of CNV into bigquery')
Example #14
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('mirna_isoform_matrix_load',
                            'logs/mirna_isoform_matrix_load.log')
    log.info('begin load of mirna isoform matrix into bigquery')

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    log.info("\tLoading Isoform HiSeq matrix data into BigQuery..")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna_isoform_matrix']['bq_table_hiseq'],
        schemas_dir + config['mirna_isoform_matrix']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['mirna_isoform_matrix']['IlluminaHiSeq']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_EMPTY'
    )
    log.info("*" * 30)
    log.info("\tLoading Isoform GA matrix data into BigQuery..")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['mirna_isoform_matrix']['bq_table_ga'],
        schemas_dir + config['mirna_isoform_matrix']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['mirna_isoform_matrix']['IlluminaGA']['output_dir'] + '*',
        'NEWLINE_DELIMITED_JSON',
        'WRITE_APPEND'
    )

    log.info('done load of mirna isoform matrix into bigquery')
Example #15
def load(config):
    """
    Load the BigQuery table.
    load_data_from_file accepts the following params:
    project_id, dataset_id, table_name, schema_file, data_path,
          source_format, write_disposition, poll_interval, num_retries
    """
    log = configure_logging('methylation_split', 'logs/methylation_load.log')
    log.info('begin load of methylation into bigquery')

    schemas_dir = os.environ.get('SCHEMA_DIR', 'schemas/')

    #print "Loading Methylation 450K data into BigQuery.."
    #load_data_from_file.run(
    #    config['project_id'],
    #    config['bq_dataset'],
    #    config['methylation']['bq_table'],
    #    schemas_dir + config['methylation']['schema_file'],
    #    'gs://' + config['buckets']['open'] + '/' +\
    #        config['methylation']['output_dir'] + 'HumanMethylation450/*',
    #    'CSV',
    #    'WRITE_EMPTY'
    #)
    log.info("\tLoading Methylation data into BigQuery...")
    load_data_from_file.run(
        config['project_id'],
        config['bq_dataset'],
        config['methylation']['bq_table'],
        schemas_dir + config['methylation']['schema_file'],
        'gs://' + config['buckets']['open'] + '/' +\
            config['methylation']['output_dir'] + '*',
        'CSV',
        'WRITE_APPEND'
    )

    main(config, log)

    log.info('finished load of methylation into bigquery')
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
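
For reference, the pd.merge call above combines the per-file dataframes with an outer join on sample_barcode, so a sample present in only one file still gets a row (with NaN in the other file's columns). A small standalone illustration with hypothetical values:

import pandas as pd

# toy stand-ins for two user files keyed by sample_barcode
df_a = pd.DataFrame({'sample_barcode': ['S1', 'S2'], 'expr': [1.2, 3.4]})
df_b = pd.DataFrame({'sample_barcode': ['S2', 'S3'], 'meth': [0.8, 0.1]})

merged = pd.merge(df_a, df_b, on='sample_barcode', how='outer')
# merged has rows for S1, S2 and S3; S1 has NaN for 'meth', S3 has NaN for 'expr'
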
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj[
                    'sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['Project'] = metadata['project_id']
                new_df_obj['Study'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][
                    k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][
                    k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][
                    k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(
        set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'],
                                           metadata['data_type'],
                                           sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'],
                                         metadata['study_id'], project_id,
                                         bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(new_df,
                                       outfilename,
                                       metadata=metadata,
                                       tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id,
                            bq_dataset,
                            table_name,
                            schema,
                            source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
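
The column loop in parse_file is effectively a wide-to-long reshape: the downloaded matrix has one column per sample barcode, and each cell becomes a (sample_barcode, Level) row. The same shape can be produced with pandas.melt; a sketch with hypothetical values:

import pandas as pd

# toy stand-in for the downloaded matrix: an annotation column plus one column per sample
wide_df = pd.DataFrame({
    'Symbol': ['TP53', 'EGFR'],
    'TCGA-01': [5.1, 2.2],
    'TCGA-02': [4.8, 1.9],
})

# melt keeps 'Symbol' as an id column and turns every sample column into a row,
# matching the per-cell rows that parse_file builds by hand
long_df = pd.melt(wide_df, id_vars=['Symbol'],
                  var_name='sample_barcode', value_name='Level')
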
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'], project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)