def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = list([k for d, k in data_df['SampleBarcode'].iteritems()])
    file_list = list([k for d, k in data_df['filenamepath'].iteritems()])
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)
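
# Hedged sketch (not from the original source): a self-contained, toy version of
# the per-sample metadata expansion done at the end of parse_file() above; the
# barcode values and metadata keys are placeholders.
import pandas as pd

toy_df = pd.DataFrame({
    'SampleBarcode': ['TCGA-OR-A5J1-01A', 'TCGA-OR-A5J2-01A'],
    'filenamepath': ['gs://bucket/a.txt', 'gs://bucket/b.txt'],
})
base_metadata = {'DataType': 'mrna'}  # placeholder metadata
sample_metadata_list = []
for barcode, path in zip(toy_df['SampleBarcode'], toy_df['filenamepath']):
    entry = dict(base_metadata, sample_barcode=barcode,
                 file_path=path.replace('gs://', ''))
    sample_metadata_list.append(entry)
print(sample_metadata_list)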
Example #2
def get_sdrf_info(project_id, bucket_name, disease_codes, header,
                  set_index_col, search_patterns):

    client = storage.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    sdrf_info = pd.DataFrame()
    for disease_code in disease_codes:
        for blob in bucket.list_blobs(prefix=disease_code):
            sdrf_filename = blob.name
            if not all(x in sdrf_filename for x in search_patterns):
                continue
            print(sdrf_filename)

            filebuffer = gcs.download_blob_to_file(sdrf_filename)
            # convert to a dataframe
            sdrf_df = convert_file_to_dataframe(filebuffer, skiprows=0)

            sdrf_df = cleanup_dataframe(sdrf_df)

            sdrf_df['Study'] = disease_code

            try:
                sdrf_df = sdrf_df.set_index(set_index_col)
            except KeyError:
                # fall back to the standard SDRF data-file column
                sdrf_df = sdrf_df.set_index("Derived_Array_Data_File")

            sdrf_info = sdrf_info.append(sdrf_df)

    print("Done loading SDRF files.")
    return sdrf_info
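
# Hedged usage sketch (not from the original source): the project, bucket,
# disease codes and search patterns below are placeholders for illustration.
sdrf_info = get_sdrf_info(
    project_id='my-gcp-project',
    bucket_name='my-open-bucket',
    disease_codes=['BRCA', 'LUAD'],
    header=0,
    set_index_col='Derived_Array_Data_File',
    search_patterns=['.sdrf.txt'],
)
print(sdrf_info.head())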
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = list(
        [k for d, k in data_df['SampleBarcode'].iteritems()])
    file_list = list([k for d, k in data_df['filenamepath'].iteritems()])
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)
Example #4
    def process_paths(self, config, outputdir, data_type, paths, program_name,
                      project, file2info, log):
        if 0 != len(paths) % 3:
            raise RuntimeError(
                'need to process the three RNA files per sample together.  adjust the configuration option \'download_files_per\' accordingly'
            )
        types = config[program_name]['process_files']['datatype2bqscript'][
            'Gene Expression Quantification']['analysis_types']
        idents = set()
        count = 0
        associated_paths = [None] * 3
        complete_df = None
        for path in paths:
            fields = path.split('/')
            info = file2info[fields[-2] + '/' + fields[-1]]
            type_index = types.index(info['analysis']['workflow_type'])
            if associated_paths[type_index]:
                raise ValueError(
                    'files in bad order, found two of the same type for %s:%s'
                    % (associated_paths[type_index], fields[1]))
            idents.add(fields[1].split('.')[0])
            if 1 < len(idents):
                raise ValueError(
                    'files in bad order, found two different identifiers %s:%s'
                    % (associated_paths[type_index], fields[1]))
            associated_paths[type_index] = path
            count += 1
            if 0 == count % 3:
                merge_df = self.process_per_sample_files(
                    config, outputdir, associated_paths, types, info,
                    program_name, project, log)
                if complete_df is None:
                    complete_df = merge_df
                else:
                    complete_df = pd.concat([complete_df, merge_df],
                                            ignore_index=True)
                idents = set()
                count = 0
                associated_paths = [None] * 3

        # add unversioned gene column
        complete_df['Ensembl_gene_ID'] = complete_df[
            'Ensembl_versioned_gene_ID'].str.split('.').str[0]

        # clean-up dataframe
        log.info('\t\tcalling cleanup_dataframe() for %s' % (paths))
        complete_df = cleanup_dataframe(complete_df, log)
        log.info('\t\tdone calling cleanup_dataframe() for %s' % (paths))

        log.info('\tcomplete data frame(%d):\n%s\n%s' %
                 (len(complete_df), complete_df.head(3), complete_df.tail(3)))
        return complete_df
def parse_annotations(config):
    """
    Download the annotation file using urllib2.Request
    and convert it into a dataframe
    """
    # connect to google.cloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    gcs = GcsConnector(project_id, bucket_name)

    # use Request to get the json data
    print 'Downloading the TCGA annotations file'
    request = Request(config['tcga_annotations']['input_file'])
    response = urlopen(request)
    elevations = response.read()
    data = json.loads(elevations)

    all_annotations = []
    for annotation in data['dccAnnotation']:
        annotation = normalize_json(annotation)
        if annotation["items_item"].startswith('TCGA-') and len(
                annotation["items_item"]) >= 12:
            item = annotation["items_item"]
            annotation['ParticipantBarcode'] = "-".join(item.split("-")[0:3])
            if (len(annotation["items_item"]) > 12 and
                    annotation["items_itemType_itemTypeName"] != "Patient"):
                annotation['SampleBarcode'] = "-".join(item.split("-")[0:4])
        if annotation["items_itemType_itemTypeName"] == "Aliquot":
            aliquot = annotation["items_item"]
            if aliquot.startswith('TCGA-') and len(aliquot) == 28:
                annotation['AliquotBarcode'] = aliquot

        all_annotations.append(annotation)

    #------------
    # transform
    #------------
    data_df = pd.DataFrame(all_annotations)
    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(data_df)
    # rename columns
    data_df = rename_columns(data_df)
    print data_df

    #----------
    # upload
    #----------
    # upload the contents of the dataframe in njson format
    print "Uploading the njson file"
    status = gcs.convert_df_to_njson_and_upload(
        data_df, config['tcga_annotations']['output_file'])
    print status
def parse_annotations(config):
    """
    Download the annotation file using urllib2.Request
    and convert it into a dataframe
    """
    # connect to gcloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    gcs = GcsConnector(project_id, bucket_name)

    # use Request to get the json data
    print 'Downloading the TCGA annotations file'
    request = Request(config['tcga_annotations']['input_file'])
    response = urlopen(request)
    elevations = response.read()
    data = json.loads(elevations)

    all_annotations = []
    for annotation in data['dccAnnotation']:
        annotation = normalize_json(annotation)
        if annotation["items_item"].startswith('TCGA-') and len(annotation["items_item"]) >= 12:
            item = annotation["items_item"]
            annotation['ParticipantBarcode'] = "-".join(item.split("-")[0:3])
            if (len(annotation["items_item"]) > 12 and
                    annotation["items_itemType_itemTypeName"] != "Patient"):
                annotation['SampleBarcode'] = "-".join(item.split("-")[0:4])
        if annotation["items_itemType_itemTypeName"] == "Aliquot":
            aliquot = annotation["items_item"]
            if aliquot.startswith('TCGA-') and len(aliquot) == 28:
                annotation['AliquotBarcode'] = aliquot

        all_annotations.append(annotation)

    #------------
    # transform
    #------------
    data_df = pd.DataFrame(all_annotations)
    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(data_df)
    # rename columns
    data_df = rename_columns(data_df)
    print data_df

    #----------
    # upload
    #----------
    # upload the contents of the dataframe in njson format
    print "Uploading the njson file"
    status = gcs.convert_df_to_njson_and_upload(data_df, config['tcga_annotations']['output_file'])
    print status
Example #7
def convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=0):
    """
    Function to connect to google cloud storage, download the file,
    and convert to a dataframe
    """

    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=skiprows)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)

    return data_df
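
# Hedged usage sketch (not from the original source): project, bucket and object
# names are placeholders.
gcs = GcsConnector('my-gcp-project', 'my-open-bucket')
data_df = convert_blob_to_dataframe(gcs, 'my-gcp-project', 'my-open-bucket',
                                    'tcga/some_table.txt', skiprows=1)
print('%d rows downloaded and cleaned' % len(data_df))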
def melt_matrix(matrix_file, Platform, studies_map, config, log):
    """
    # melt matrix
    """
    log.info('\tbegin melt matrix: \'%s\'' % (matrix_file))
    # begin parsing the data
    data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
    data_df2 = data_df2.set_index(["Gene"])

    # create a StringIO object with this info
    # call utils.convert_file_to_dataframe(buffer, sep=",")
    # call tools.cleanup_dataframe()
    # gcs.convert_df_to_njson_and_upload()
    log.info('\t\tstart processing saved matrix.  size: %s' % (len(data_df2)))
    mod = max(int(len(data_df2) / 20), 1)  # avoid modulo-by-zero for small matrices
    count = 0
    buf = StringIO()
    buf.write(
        "ParticipantBarcode	SampleBarcode	AliquotBarcode	SampleTypeLetterCode	Study	Platform	mirna_id	mirna_accession	normalized_count\n"
    )
    for i, j in data_df2.T.iteritems():
        if 0 == count % mod:
            log.info('\t\t\tprocessed %s lines' % (count))
        count += 1
        for k, m in j.iteritems():
            # drop the platform suffixes; str.replace is used because
            # str.strip() removes matching characters, not a suffix
            aliquot = k.replace(".mirbase20", "")
            aliquot = aliquot.replace(".hg19", "")
            SampleBarcode = "-".join(aliquot.split("-")[0:4])
            ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
            SampleTypeLetterCode = config["sample_code2letter"][aliquot.split(
                "-")[3][0:2]]
            Study = studies_map[aliquot].upper()
            buf.write("\t".join(
                map(str, (ParticipantBarcode, SampleBarcode,
                          aliquot, SampleTypeLetterCode, Study, Platform,
                          i.split(".")[0], i.split(".")[1], m))) + '\n')
    log.info('\t\tprocessed %s total lines' % (count))

    file_name = matrix_file.split('/')[-1]
    log.info('\t\tsave %s to GCS' % file_name)
    buf.seek(0)
    df = convert_file_to_dataframe(buf)
    df = cleanup_dataframe(df)
    gcs = GcsConnector(config['project_id'], config['buckets']['open'])
    gcs.convert_df_to_njson_and_upload(
        df, config['mirna_isoform_matrix'][Platform]['output_dir'] + file_name)
    log.info('\t\tcompleted save to GCS')
    log.info('\tfinished melt matrix')
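
# Hedged sketch (not from the original source): how the suffix removal and
# barcode slicing in melt_matrix() behave on a toy column name.  str.strip()
# is avoided because it removes matching characters, not a suffix.
toy_column = 'TCGA-OR-A5J1-01A-11D-A29J-05.mirbase20.hg19'
aliquot = toy_column.replace('.mirbase20', '').replace('.hg19', '')
sample_barcode = '-'.join(aliquot.split('-')[0:4])       # TCGA-OR-A5J1-01A
participant_barcode = '-'.join(aliquot.split('-')[0:3])  # TCGA-OR-A5J1
sample_type_code = aliquot.split('-')[3][0:2]            # '01'
print(aliquot, sample_barcode, participant_barcode, sample_type_code)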
Example #9
def parse_mirtarbase(config):
    """
    Download mirtarbase file with urllib2.Request
    Convert to a dataframe
    """
    # use Request to get the data (change the link to get the most recent file)
    request = Request(config['mirtarbase']['input_file'])
    socket = urlopen(request)

    # load into a pandas dataframe
    xlsd = pd.ExcelFile(socket, dtype=object)
    df_sheet1 = xlsd.parse(0)  # parse first sheet (converts into a dataframe)
    print 'Found {0} rows in the file'.format(len(df_sheet1))

    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(df_sheet1)

    replace_column_names = {
        'Species_miRNA': 'miRNA_Species',
        'Species_Target_Gene': 'Target_Gene_Species',
        'Target_Gene_Entrez_Gene_ID': 'Target_Gene_EntrezID',
        'miRTarBase_ID': 'miRTarBaseID'
    }
    data_df.columns = [replace_column_names[x] if x in replace_column_names\
                                               else x for x in data_df.columns]
    # convert to int
    data_df['Target_Gene_EntrezID'] = data_df['Target_Gene_EntrezID']\
                                        .map(lambda x: str(int(float(x))), na_action='ignore')

    #-------
    # upload
    #-------
    # connect to google.cloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    # connect to google storage
    gcs = GcsConnector(project_id, bucket_name)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(
        data_df, config['mirtarbase']['output_file'])
    print status
def parse_mirtarbase(config):
    """
    Download mirtarbase file with urllib2.Request
    Convert to a dataframe
    """
    # use Request to get the data (change the link to get the most recent file)
    request = Request(config['mirtarbase']['input_file'])
    socket = urlopen(request)

    # load into a pandas dataframe
    xlsd = pd.ExcelFile(socket, dtype=object)
    df_sheet1 = xlsd.parse(0)  # parse first sheet (converts into a dataframe)
    print 'Found {0} rows in the file'.format(len(df_sheet1))

    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(df_sheet1)

    replace_column_names = {
        'Species_miRNA': 'miRNA_Species',
        'Species_Target_Gene': 'Target_Gene_Species',
        'Target_Gene_Entrez_Gene_ID': 'Target_Gene_EntrezID',
        'miRTarBase_ID': 'miRTarBaseID'
    }
    data_df.columns = [replace_column_names[x] if x in replace_column_names\
                                               else x for x in data_df.columns]
    # convert to int
    data_df['Target_Gene_EntrezID'] = data_df['Target_Gene_EntrezID']\
                                        .map(lambda x: str(int(float(x))), na_action='ignore')

    #-------
    # upload
    #-------
    # connect to gcloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    # connect to google storage
    gcs = GcsConnector(project_id, bucket_name)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, config['mirtarbase']['output_file'])
    print status
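
# Hedged sketch (not from the original source): demonstrates, on a toy frame,
# the column rename and the float -> int -> str Entrez ID normalisation used in
# parse_mirtarbase() above.
import pandas as pd

toy_df = pd.DataFrame({'Target_Gene_Entrez_Gene_ID': [7157.0, None, 672.0]})
toy_df = toy_df.rename(columns={'Target_Gene_Entrez_Gene_ID': 'Target_Gene_EntrezID'})
toy_df['Target_Gene_EntrezID'] = toy_df['Target_Gene_EntrezID'].map(
    lambda x: str(int(float(x))), na_action='ignore')
print(toy_df)  # 7157.0 -> '7157', NaN left as NaN, 672.0 -> '672'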
def main():
    """Example to download a file from the Google Storage, transform,
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'TCGA-OR-A5J1-01A-11D-A29J-05.txt'
    outfilename = ''

    # read the stringIO/file into a pandas dataframe
    # load the file into a table
    data_df = pandas.read_table(filename, sep="\t", skiprows=1,
                                lineterminator='\n', comment='#')

    # clean up the dataframe for upload to BigQuery
    data_df = cleanup_dataframe(data_df)

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")\
                .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)

    print status
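
# Hedged sketch (not from the original source): what the chained
# DataFrame.query() filters above keep, on a toy methylation-style frame.
import pandas as pd

toy_df = pd.DataFrame({
    'Chromosome': ['1', '1', '2'],
    'Genomic_Coordinate': [25000000, 1000, 30000000],
    'Beta_value': [0.5, 0.9, 0.1],
})
kept = (toy_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
              .query("Beta_value > 0.2"))
print(kept)  # only the first row passes both filters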
Example #12
    def process_paths(self, config, outputdir, data_type, paths, program_name,
                      project, file2info, log):
        count = 0
        complete_df = None
        log.info('\tprocessing %d paths for %s:%s' %
                 (len(paths), data_type, project))
        for path in paths:
            count += 1
            fields = path.split('/')
            info = file2info[fields[-2] + '/' + fields[-1]]
            if self.skip_file(config, data_type, path, program_name, file2info,
                              info, log):
                continue
            file_df = self.process_file(config, outputdir, data_type, path,
                                        info, program_name, project, log)
            if complete_df is None:
                complete_df = file_df
            else:
                complete_df = pd.concat([complete_df, file_df],
                                        ignore_index=True)
            if 0 == count % 128:
                log.info('\t\tprocessed %s path: %s' % (count, path))
        log.info('\tdone processing %d paths for %s:%s' %
                 (len(paths), data_type, project))

        # clean-up dataframe
        if complete_df is not None:
            log.info('\t\tcalling cleanup_dataframe() for %s' % (paths))
            complete_df = cleanup_dataframe(complete_df, log)
            log.info('\t\tdone calling cleanup_dataframe() for %s' % (paths))
            log.info(
                '\tcomplete data frame(%d):\n%s\n%s' %
                (len(complete_df), complete_df.head(3), complete_df.tail(3)))
        else:
            log.info('\tno complete data frame created')
        return complete_df
Example #13
def convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=0, log = None):
    """
    Function to connect to google cloud storage, download the file,
    and convert to a dataframe
    """

    try:
        logit(log, 'calling download_blob_to_file() for %s' % (filename), 'info')
        filebuffer = gcs.download_blob_to_file(filename)
        logit(log, 'done calling download_blob_to_file() for %s' % (filename), 'info')
    
        # convert blob into dataframe
        logit(log, 'calling convert_file_to_dataframe() for %s' % (filename), 'info')
        data_df = convert_file_to_dataframe(filebuffer, skiprows=skiprows)
        logit(log, 'done calling convert_file_to_dataframe() for %s' % (filename), 'info')
    
        # clean-up dataframe
        logit(log, 'calling cleanup_dataframe() for %s' % (filename), 'info')
        data_df = cleanup_dataframe(data_df)
        logit(log, 'done calling cleanup_dataframe() for %s' % (filename), 'info')
    except Exception as e:
        logit(log, 'problem in convert_blob_to_dataframe(%s): %s' % (filename, e), 'exception')
        # re-raise so the caller sees the failure instead of an unbound data_df
        raise

    return data_df
Example #14
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns,
                             sample_code2letter, oncotator_object_path):
    study = data_library['Study'].iloc[0]

    # this needed to stop pandas from converting them to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe; we use this to merge the per-file dataframes
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1

        log.info(
            '-' * 10 +
            "{0}: Processing file {1}".format(file_count, oncotator_file) +
            '-' * 10)

        try:
            # convert the file to a dataframe
            filename = oncotator_object_path + oncotator_file
            with open(filename) as infile:
                filestring = StringIO(infile.read())
            df = convert_file_to_dataframe(filestring)
            try:
                df = cleanup_dataframe(df)
            except RuntimeError as re:
                log.warning('%s: problem cleaning dataframe for %s: %s' %
                            (study, filename, re))
        except Exception as e:
            print e
            raise

        if df.empty:
            log.debug('empty dataframe for file: ' + str(oncotator_file))
            continue
        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

        log.info('-' * 10 + "{0}: Finished file {1}. rows: {2}".format(
            file_count, oncotator_file, len(df)) + '-' * 10)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:

        # remove duplicates; various rules; see check_duplicates

        log.info(
            '\tcalling check_duplicates to collapse aliquots with %s rows' %
            (len(disease_bigdata_df)))
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(
            disease_bigdata_df, sample_code2letter, log)
        log.info(
            '\tfinished check_duplicates to collapse aliquots with %s rows' %
            (len(disease_bigdata_df)))

        # enforce unique mutation--previous
        # unique_mutation = ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']
        # enforce unique mutation
        unique_mutation = [
            'Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome', 'Start_Position',
            'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele1',
            'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode'
        ]
        # merge mutations from multiple centers
        log.info('\tconsolidate the centers for duplicate mutations into list')
        seencenters = set()

        def concatcenters(df_group):
            if len(df_group) > 1:
                centers = list(set(df_group['Center'].tolist()))
                uniquecenters = set()
                delim = config['maf']['center_delim']
                for center in centers:
                    fields = center.split(delim)
                    for field in fields:
                        uniquecenters.add(field)
                sortedunique = delim.join(sorted(list(uniquecenters)))
                df_group.loc[:, 'Center'] = sortedunique
                if sortedunique not in seencenters:
                    log.info('unique centers: %s' % sortedunique)
                    seencenters.add(sortedunique)
            return df_group

        disease_bigdata_df = disease_bigdata_df.groupby(unique_mutation).apply(
            concatcenters)
        log.info('\tfinished consolidating centers for duplicate mutations')

        # enforce unique mutation
        log.info(
            '\tcalling remove_duplicates to collapse mutations with %s rows' %
            (len(disease_bigdata_df)))
        disease_bigdata_df = remove_duplicates(disease_bigdata_df,
                                               unique_mutation)
        log.info(
            '\tfinished remove_duplicates to collapse mutations with %s rows' %
            (len(disease_bigdata_df)))

        # convert the disease_bigdata_df to new-line JSON and then upload the file
        file_to_upload = StringIO()

        log.info('writing %s rows' % (len(disease_bigdata_df)))
        for _, rec in disease_bigdata_df.iterrows():
            file_to_upload.write(
                rec.convert_objects(convert_numeric=False).to_json() + "\n")
        file_to_upload.seek(0)
        with open(oncotator_object_path + "{0}.json".format(study),
                  'w') as outfile:
            outfile.write(file_to_upload.getvalue())
    else:
        log.warning('Empty dataframe for %s in %s!' % (oncotator_file, study))
    return True
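
# Hedged sketch (not from the original source): a minimal, self-contained version
# of the concatcenters() group-apply above -- duplicate mutations get their
# 'Center' values merged into one sorted, delimiter-joined string.
import pandas as pd

toy_df = pd.DataFrame({
    'Chromosome': ['1', '1', '2'],
    'Start_Position': [100, 100, 200],
    'Center': ['broad.mit.edu', 'bcgsc.ca;broad.mit.edu', 'bcgsc.ca'],
})

def consolidate_centers(group, delim=';'):
    if len(group) > 1:
        centers = set()
        for value in group['Center']:
            centers.update(value.split(delim))
        group.loc[:, 'Center'] = delim.join(sorted(centers))
    return group

toy_df = toy_df.groupby(['Chromosome', 'Start_Position']).apply(consolidate_centers)
print(toy_df)  # both chr1:100 rows now read 'bcgsc.ca;broad.mit.edu'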
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
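
# Hedged sketch (not from the original source): the outer merge on
# 'sample_barcode' that combines the per-file dataframes above, on toy data.
import pandas as pd

file1_df = pd.DataFrame({'sample_barcode': ['S1', 'S2'], 'gene_exp': [1.2, 3.4]})
file2_df = pd.DataFrame({'sample_barcode': ['S2', 'S3'], 'protein': [0.7, 0.9]})
merged_df = pd.merge(file1_df, file2_df, on='sample_barcode', how='outer')
print(merged_df)  # S1 and S3 carry NaN for the columns missing in the other file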
def main():
    """Parse GCT file, merge with barcodes info, melt(tidy) 
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'ccle/mRNA-gene-exp/CCLE_Expression_Entrez_2012-09-29.gct'
    outfilename = 'test'
    writer = ExcelWriter('ccle.xlsx')

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #------------------------------
    # load the GCT file
    #  * the file has duplicate samples (columns)
    #------------------------------
    # To remove duplicates, load only the first few lines of the file, take the
    # unique column names, and re-select the dataframe.
    # This is a hack, but I can't find an elegant way to remove duplicates.
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct',
                           sep='\t',
                           skiprows=2,
                           mangle_dupe_cols=False,
                           nrows=2)
    unique_columns = np.unique(gct_df.columns)
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct',
                           sep='\t',
                           skiprows=2,
                           mangle_dupe_cols=True)
    # clean-up the dataset/dataframe
    gct_df = cleanup_dataframe(gct_df)
    gct_df = gct_df[unique_columns]

    # remove any gene_id starting with 'AFFX-'
    gct_df[gct_df['Name'].str.startswith('AFFX-')].to_excel(
        writer, sheet_name="affy_info")
    gct_df = gct_df[~gct_df['Name'].str.startswith('AFFX-')]

    #------------------------------
    # HGNC validation
    #-----------------------------
    hgnc_df = hgnc_validation.get_hgnc_map()
    hgnc_df.to_excel(writer, sheet_name="hgnc_info")
    hgnc_dict = dict(zip(hgnc_df.entrez_id, hgnc_df.symbol))
    gct_df['HGNC_gene_symbol'] = gct_df['Name'].map(
        lambda gene_id: hgnc_dict.get(gene_id.replace('_at', ''), np.nan))
    gct_df[['HGNC_gene_symbol', 'Name',
            'Description']].to_excel(writer, sheet_name="gene_info")
    gct_df['Name'] = gct_df['Name'].map(
        lambda gene_id: gene_id.replace('_at', ''))

    #------------------------------
    # barcodes info
    #------------------------------
    barcodes_filename = 'ccle/mRNA-gene-exp/mRNA_names.out.tsv'
    filebuffer = gcs.download_blob_to_file(barcodes_filename)
    barcodes_df = pd.read_table(
        filebuffer,
        header=None,
        names=['ParticipantBarcode', 'SampleBarcode',
               'CCLE_long_name'])  # convert into dataframe
    barcodes_df = cleanup_dataframe(barcodes_df)  # clean-up dataframe

    #------------------------------
    # ignore (drop) all of the columns from the gene-expression matrix
    # that don't have corresponding Participant and Sample barcodes
    #------------------------------
    columns_df = pd.DataFrame(unique_columns)
    columns_df.columns = ['CCLE_long_name']
    samples_map_df = pd.merge(columns_df,
                              barcodes_df,
                              on='CCLE_long_name',
                              how='inner')
    samples_map_df.to_excel(writer, sheet_name="sample_info")

    # select columns that are overlapping
    overlapping_samples = samples_map_df['CCLE_long_name'].tolist()
    overlapping_samples = overlapping_samples + [
        'Name', 'Description', 'HGNC_gene_symbol'
    ]
    gct_df = gct_df[overlapping_samples]
    print gct_df

    # melt the matrix
    value_vars = [
        col for col in gct_df.columns
        if col not in ['Name', 'Description', 'HGNC_gene_symbol']
    ]
    melted_df = pd.melt(gct_df,
                        id_vars=['Name', 'Description', 'HGNC_gene_symbol'],
                        value_vars=value_vars)
    melted_df = melted_df.rename(
        columns={
            'Name': 'gene_id',
            'Description': 'original_gene_symbol',
            'variable': 'CCLE_long_name',
            'value': 'RMA_normalized_expression'
        })

    # merge to get barcode information
    # changed from outer join to inner join. In this case it shouldn't matter, since we already did an inner join
    # while selecting the samples above.
    data_df = pd.merge(melted_df,
                       samples_map_df,
                       on='CCLE_long_name',
                       how='inner')
    data_df['Platform'] = "Affymetrix U133 Plus 2.0"

    # reorder columns
    col_order = [
        "ParticipantBarcode", "SampleBarcode", "CCLE_long_name", "gene_id",
        "HGNC_gene_symbol", "original_gene_symbol", "Platform",
        "RMA_normalized_expression"
    ]
    data_df = data_df[col_order]

    # upload the contents of the dataframe in CSV format
    print "Convert to CSV"
    outfilename = "tcga/intermediary/CCLE_mrna_expr/bq_data_files/ccle_mrna_expr.csv"
    df_string = data_df.to_csv(index=False, header=False)
    status = gcs.upload_blob_from_string(outfilename, df_string)
    print status

    # save the excel file
    writer.save()
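
# Hedged sketch (not from the original source): a toy version of the HGNC symbol
# lookup above -- strip the Affymetrix '_at' suffix, then map Entrez IDs to gene
# symbols, falling back to NaN for unknown IDs.  The mapping values are made up.
import numpy as np
import pandas as pd

hgnc_dict = {'7157': 'TP53', '672': 'BRCA1'}  # hypothetical entrez_id -> symbol map
names = pd.Series(['7157_at', '672_at', '999999_at'])
symbols = names.map(lambda gene_id: hgnc_dict.get(gene_id.replace('_at', ''), np.nan))
print(symbols.tolist())  # ['TP53', 'BRCA1', nan]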
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj[
                    'sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['Project'] = metadata['project_id']
                new_df_obj['Study'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][
                    k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][
                    k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][
                    k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(
        set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'],
                                           metadata['data_type'],
                                           sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'],
                                         metadata['study_id'], project_id,
                                         bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(new_df,
                                       outfilename,
                                       metadata=metadata,
                                       tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id,
                            bq_dataset,
                            table_name,
                            schema,
                            source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
Example #18
    def melt_matrix(self, matrix_file, platform, file2info, program_name,
                    config, log):
        """
        # melt matrix
        """
        log.info('\t\t\tbegin melt matrix: \'%s\'' % (matrix_file))
        # begin parsing the data
        data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
        data_df2 = data_df2.set_index(["Gene"])

        # create a StringIO object with this info
        # call utils.convert_file_to_dataframe(buffer, sep=",")
        # call tools.cleanup_dataframe()
        # gcs.convert_df_to_njson_and_upload()
        log.info('\t\t\t\tstart processing saved matrix.  size: %s' %
                 (len(data_df2)))
        mod = max(int(len(data_df2) / 20), 1)  # avoid modulo-by-zero for small matrices
        count = 0
        total_count = 0
        buf = StringIO()
        buf.write(
            "sample_barcode	mirna_id	mirna_accession	normalized_count	platform	project_short_name	program_name	sample_type_code"
            +
            "	file_name	file_gdc_id	aliquot_barcode	case_barcode	case_gdc_id	sample_gdc_id	aliquot_gdc_id\n"
        )
        for i, j in data_df2.T.iteritems():
            for k, m in j.iteritems():
                aliquot = file2info[k]['aliquot_barcode']
                SampleBarcode = "-".join(aliquot.split("-")[0:4])
                ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
                SampleTypeCode = aliquot.split("-")[3][0:2]
                info = file2info[k]
                line = "\t".join(
                    map(str, (SampleBarcode, i.split(".")[0], i.split(".")[1],
                              m, platform, info['project_short_name'],
                              info['program_name'], SampleTypeCode,
                              info['file_name'], info['file_gdc_id'], aliquot,
                              ParticipantBarcode, info['case_gdc_id'],
                              info['sample_gdc_id'],
                              info['aliquot_gdc_id']))) + '\n'
                buf.write(line)
                total_count += 1
            if 0 == count % mod:
                log.info('\t\t\t\t\tprocessed %s lines:\n%s' % (count, line))
                file_name = '%s_%s' % (matrix_file.split('/')[-1], count)
                log.info('\t\t\t\tsave %s to GCS' % file_name)
                buf.seek(0)
                df = convert_file_to_dataframe(buf)
                df = cleanup_dataframe(df, log)
                gcs = GcsConnector(config['cloud_projects']['open'],
                                   config['buckets']['open'])
                gcs.convert_df_to_njson_and_upload(
                    df,
                    config[program_name]['process_files']['datatype2bqscript']
                    ['Isoform Expression Quantification']['gcs_output_path'] +
                    file_name,
                    logparam=log)
                buf = StringIO()
                buf.write(
                    "sample_barcode	mirna_id	mirna_accession	normalized_count	platform	project_short_name	program_name	sample_type_code"
                    +
                    "	file_name	file_gdc_id	aliquot_barcode	case_barcode	case_gdc_id	sample_gdc_id	aliquot_gdc_id\n"
                )
            count += 1
        log.info('\t\t\t\tprocessed %s total lines created %s records' %
                 (count, total_count))

        log.info('\t\t\t\tcompleted save to GCS')
        log.info('\t\t\tfinished melt matrix')
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'], project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
Example #20
    # (this 'else' is the tail of an empty-dataframe check whose 'if' branch is
    # not included in this excerpt)
    else:
        raise Exception('Empty dataframe!')
    return True

if __name__ == '__main__':

    config = json.load(open(sys.argv[1]))
  
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    sample_code2letter = config['sample_code2letter']
 
    # get disease_codes/studies (TODO: this must be changed to get the disease code from the file name)
    df = convert_file_to_dataframe(open(sys.argv[2]))
    df = cleanup_dataframe(df)
    studies = list(set(df['Study'].tolist()))

    # get bq columns (this allows the user to select the columns
    # without worrying about the index, case-sensitiveness, etc.)
    selected_columns = pd.read_table(sys.argv[3], names=['bq_columns'])
    transposed = selected_columns.T
    transposed.columns = transposed.loc['bq_columns']
    transposed = cleanup_dataframe(transposed)
    bq_columns = transposed.columns.values

    # submit threads by disease code
    pm = process_manager.ProcessManager(max_workers=33, db='maf.db', table='task_queue_status')
    for idx, df_group in df.groupby(['Study']):
        future = pm.submit(process_oncotator_output, project_id, bucket_name, df_group, bq_columns, sample_code2letter)
        #process_oncotator_output( project_id, bucket_name, df_group, bq_columns, sample_code2letter)
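
# Hedged sketch (not from the original source): what the transpose trick above
# does to the selected-columns file, using an in-memory stand-in for the
# one-column input; cleanup_dataframe() is assumed to normalise the header names.
import pandas as pd

selected_columns = pd.DataFrame({'bq_columns': ['Hugo_Symbol', 'Chromosome',
                                                'Start_Position']})
transposed = selected_columns.T
transposed.columns = transposed.loc['bq_columns']
print(list(transposed.columns))  # ['Hugo_Symbol', 'Chromosome', 'Start_Position']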
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)