Example #1
def search_files(config):

    # project id
    project_id = config['project_id']
    # bucket name
    bucket_name = config['buckets']['open']

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    #---------------------
    # search for MAF files
    #---------------------
    maf_file = re.compile("^.*.maf$")
    # search only these tumor types defined in config files
    search_tumors = ["tcga/" + d.lower() for d in config['all_tumor_types']]
    data_library = gcs.search_files(search_patterns=['.maf'],
                                    regex_search_pattern=maf_file,
                                    prefixes=search_tumors)
    data_library['basefilename'] = map(
        lambda x: os.path.splitext(os.path.basename(x))[0],
        data_library['filename'])
    data_library['unique_filename'] = format_dupe_values(
        data_library['basefilename'])

    return data_library
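# A minimal usage sketch for search_files() above, assuming the config layout the
# function reads ('project_id', 'buckets', 'all_tumor_types') and that the project's
# GcsConnector helper is importable. All values below are placeholders, not real settings.
example_config = {
    'project_id': 'my-project-id',          # hypothetical
    'buckets': {'open': 'my-open-bucket'},  # hypothetical
    'all_tumor_types': ['BRCA', 'LUAD']     # hypothetical tumor-type codes
}
maf_library = search_files(example_config)
print(maf_library[['filename', 'unique_filename']].head())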
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = list([k for d, k in data_df['SampleBarcode'].iteritems()])
    file_list = list([k for d, k in data_df['filenamepath'].iteritems()])
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)
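# A self-contained sketch of the per-sample metadata expansion done at the end of
# parse_file() above: the file-level metadata dict is copied once per
# (barcode, file path) pair. All values are placeholders.
base_metadata = {'DataType': 'user_gen', 'Pipeline': 'example'}     # hypothetical metadata
barcodes = ['TCGA-AA-0001-01A', 'TCGA-AA-0002-01A']                 # hypothetical barcodes
paths = ['gs://my-bucket/a.txt', 'gs://my-bucket/b.txt']            # hypothetical file paths
sample_rows = []
for barcode, path in zip(barcodes, paths):
    row = base_metadata.copy()
    row['sample_barcode'] = barcode
    row['file_path'] = path.replace('gs://', '')
    sample_rows.append(row)
print(sample_rows)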
Example #3
def upload_batch_etl(self, config, outputdir, paths, file2info, endpt_type,
                     program_name, project, data_type, log):
    log.info('\tstart upload_batch_etl() for %s and %s' %
             (project, data_type))
    try:
        complete_df = self.process_paths(config, outputdir, data_type,
                                         paths, program_name, project,
                                         file2info, log)
        if complete_df is not None:
            etl_uploaded = True
            gcs = GcsConnector(config['cloud_projects']['open'],
                               config['buckets']['open'])
            keyname = config['buckets']['folders'][
                'base_run_folder'] + 'etl/%s/%s/%s/%s' % (
                    endpt_type, project, data_type, paths[0].replace(
                        '/', '_'))
            log.info('\t\tstart convert and upload %s to the cloud' %
                     (keyname))
            gcs.convert_df_to_njson_and_upload(complete_df,
                                               keyname,
                                               logparam=log)
            log.info('\t\tfinished convert and upload %s to the cloud' %
                     (keyname))
        else:
            etl_uploaded = False
            log.info('\t\tno upload for this batch of files')
    except Exception as e:
        log.exception('problem finishing the etl: %s' % (e))
        raise
    log.info('\tfinished upload_batch_etl() for %s and %s' %
             (project, data_type))
    return etl_uploaded
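# A small sketch of how the GCS object key is assembled in upload_batch_etl() above;
# the folder, endpoint, project, data type and path values are placeholders standing
# in for the config entries and batch parameters.
base_run_folder = 'etl-runs/run-01/'          # hypothetical config['buckets']['folders']['base_run_folder']
endpt_type = 'current'                        # hypothetical
project = 'TCGA-BRCA'                         # hypothetical
data_type = 'Gene Expression Quantification'  # hypothetical
first_path = 'files/abc123/expr.txt'          # hypothetical paths[0]
keyname = base_run_folder + 'etl/%s/%s/%s/%s' % (
    endpt_type, project, data_type, first_path.replace('/', '_'))
print(keyname)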
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('protein', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=['Protein_Name'])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df,
                                                outfilename,
                                                metadata=metadata)
    return status
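# An illustrative call of parse_protein() above; every argument is a placeholder and
# the project's GcsConnector/gcutils modules must be importable for this to actually run.
status = parse_protein(
    'my-project-id',                              # hypothetical project
    'my-open-bucket',                             # hypothetical bucket
    'tcga/brca/protein/example_input.txt',        # hypothetical input blob
    'tcga/brca/protein/example_output.json',      # hypothetical output blob
    {'AliquotBarcode': 'TCGA-AA-0001-01A-11R-A000-01'})  # hypothetical metadata
print(status)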
Example #5
def get_sdrf_info(project_id, bucket_name, disease_codes, header, set_index_col, search_patterns):

    client = storage.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    sdrf_info = pd.DataFrame()
    for disease_code in disease_codes:
        for blob in bucket.list_blobs(prefix=disease_code):
            sdrf_filename = blob.name
            if not all(x in sdrf_filename for x in search_patterns):
                continue
            print(sdrf_filename)

            filebuffer = gcs.download_blob_to_file(sdrf_filename)
            # convert to a dataframe
            sdrf_df = convert_file_to_dataframe(filebuffer, skiprows=0)

            sdrf_df = cleanup_dataframe(sdrf_df)

            sdrf_df["Study"] = disease_code

            try:
                sdrf_df = sdrf_df.set_index(set_index_col)
            except:
                sdrf_df = sdrf_df.set_index("Derived_Array_Data_File")

            sdrf_info = sdrf_info.append(sdrf_df)

    print("Done loading SDRF files.")
    return sdrf_info
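# A hedged example of driving get_sdrf_info() above; the disease codes and search
# patterns are illustrative placeholders, not a definitive list.
sdrf_info = get_sdrf_info(
    project_id='my-project-id',                    # hypothetical
    bucket_name='my-open-bucket',                  # hypothetical
    disease_codes=['brca', 'luad'],                # hypothetical bucket prefixes
    header=0,
    set_index_col='Derived_Array_Data_File',
    search_patterns=['.sdrf.txt'])                 # hypothetical filename patterns
print(sdrf_info.shape)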
Example #6
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    log = configure_logging('mirna.isoform.transform', "logs/mirna_isoform_transform_" + metadata['AliquotBarcode'] + '.log')
    try:
        log.info('start transform of %s' % (metadata['AliquotBarcode']))
        # connect to the cloud bucket
        gcs = GcsConnector(project_id, bucket_name)
    
        #main steps: download, convert to df, cleanup, transform, add metadata
        log.info('\tadd changes and metadata for %s' % (metadata['AliquotBarcode']))
        data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, log=log)
        data_df = additional_changes(data_df)
        data_df = add_metadata(data_df, metadata)
    
        # upload the contents of the dataframe in njson format
        status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
        log.info('finished transform of %s' % (metadata['AliquotBarcode']))
    except Exception as e:
        log.exception('problem transforming %s' % (metadata['AliquotBarcode']))
        raise e
    return status
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = list(
        [k for d, k in data_df['SampleBarcode'].iteritems()])
    file_list = list([k for d, k in data_df['filenamepath'].iteritems()])
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)
Example #8
def get_sdrf_info(project_id, bucket_name, disease_codes, header,
                  set_index_col, search_patterns):

    client = storage.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    sdrf_info = pd.DataFrame()
    for disease_code in disease_codes:
        for blob in bucket.list_blobs(prefix=disease_code):
            sdrf_filename = blob.name
            if not all(x in sdrf_filename for x in search_patterns):
                continue
            print(sdrf_filename)

            filebuffer = gcs.download_blob_to_file(sdrf_filename)
            # convert to a dataframe
            sdrf_df = convert_file_to_dataframe(filebuffer, skiprows=0)

            sdrf_df = cleanup_dataframe(sdrf_df)

            sdrf_df['Study'] = disease_code

            try:
                sdrf_df = sdrf_df.set_index(set_index_col)
            except:
                sdrf_df = sdrf_df.set_index("Derived_Array_Data_File")

            sdrf_info = sdrf_info.append(sdrf_df)

    print("Done loading SDRF files.")
    return sdrf_info
def parse_methylation(project_id, bucket_name, filename, outfilename,
                      metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform',
                      "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)
    data_df.columns = [
        'Probe_Id', "Beta_Value", "Gene_Symbol", "Chromosome",
        "Genomic_Coordinate"
    ]

    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe in njson format
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
def main():
    """Example to download a file from the Google Storage, transform,
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = ''
    outfilename = ''

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")\
                 .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)

    print status
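# A self-contained sketch of the chained DataFrame.query() filtering used in main()
# above, run on a tiny in-memory frame instead of a downloaded methylation file.
import pandas as pd

df = pd.DataFrame({
    'Chromosome': ['1', '1', '2'],
    'Genomic_Coordinate': [25000000, 10000000, 30000000],
    'Beta_value': [0.50, 0.90, 0.10],
})
filtered = (df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")
              .query("Beta_value > 0.2"))
print(filtered)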
Example #11
def test_blob_exists(project_id, bucket_name, df):
    """
    Checks if the DatafileNameKey blob exists in the bucket
    """
    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    for idx, row in df.iterrows():
        records = row.to_dict()
        print records['DatafileNameKey']
        blob_status = gcs.check_blob_exists(records['DatafileNameKey'])
        assert (blob_status is True), 'Blob does not exist'
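# A sketch of the dataframe test_blob_exists() above expects: a 'DatafileNameKey'
# column holding object paths. The paths, project and bucket are placeholders.
import pandas as pd

df = pd.DataFrame({'DatafileNameKey': ['tcga/brca/file1.txt',     # hypothetical
                                       'tcga/luad/file2.txt']})   # hypothetical
test_blob_exists('my-project-id', 'my-open-bucket', df)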
Example #12
def test_blob_exists(project_id, bucket_name, df):
    """
    Checks if the DatafileNameKey blob exists in the bucket
    """
    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    for idx, row in df.iterrows():
        records = row.to_dict()
        print records['DatafileNameKey']
        blob_status = gcs.check_blob_exists(records['DatafileNameKey'])
        assert (blob_status is True), 'Blob does not exist'
def parse_annotations(config):
    """
    Download the annotation file using urllib2.Request
    Convert the file into a dataframe
    """
    # connect to google.cloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    gcs = GcsConnector(project_id, bucket_name)

    # use Request to get the json data
    print 'Downloading the TCGA annotations file'
    request = Request(config['tcga_annotations']['input_file'])
    response = urlopen(request)
    elevations = response.read()
    data = json.loads(elevations)

    all_annotations = []
    for annotation in data['dccAnnotation']:
        annotation = normalize_json(annotation)
        if annotation["items_item"].startswith('TCGA-') and len(
                annotation["items_item"]) >= 12:
            item = annotation["items_item"]
            annotation['ParticipantBarcode'] = "-".join(item.split("-")[0:3])
            if (len(annotation["items_item"]) > 12 and
                    annotation["items_itemType_itemTypeName"] != "Patient"):
                annotation['SampleBarcode'] = "-".join(item.split("-")[0:4])
        if annotation["items_itemType_itemTypeName"] == "Aliquot":
            aliquot = annotation["items_item"]
            if aliquot.startswith('TCGA-') and len(aliquot) == 28:
                annotation['AliquotBarcode'] = aliquot

        all_annotations.append(annotation)

    #------------
    # transform
    #------------
    data_df = pd.DataFrame(all_annotations)
    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(data_df)
    # rename columns
    data_df = rename_columns(data_df)
    print data_df

    #----------
    # upload
    #----------
    # upload the contents of the dataframe in njson format
    print "Uploading the njson file"
    status = gcs.convert_df_to_njson_and_upload(
        data_df, config['tcga_annotations']['output_file'])
    print status
def parse_annotations(config):
    """
    Download the annotation file using urllib2.Request
    Convert the file into a dataframe
    """
    # connect to gcloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    gcs = GcsConnector(project_id, bucket_name)

    # use Request to get the json data
    print 'Downloading the TCGA annotations file'
    request = Request(config['tcga_annotations']['input_file'])
    response = urlopen(request)
    elevations = response.read()
    data = json.loads(elevations)

    all_annotations = []
    for annotation in data['dccAnnotation']:
        annotation = normalize_json(annotation)
        if annotation["items_item"].startswith('TCGA-') and len(annotation["items_item"]) >= 12:
            item = annotation["items_item"]
            annotation['ParticipantBarcode'] = "-".join(item.split("-")[0:3])
            if (len(annotation["items_item"]) > 12 and
                    annotation["items_itemType_itemTypeName"] != "Patient"):
                annotation['SampleBarcode'] = "-".join(item.split("-")[0:4])
        if annotation["items_itemType_itemTypeName"] == "Aliquot":
            aliquot = annotation["items_item"]
            if aliquot.startswith('TCGA-') and len(aliquot) == 28:
                annotation['AliquotBarcode'] = aliquot

        all_annotations.append(annotation)

    #------------
    # transform
    #------------
    data_df = pd.DataFrame(all_annotations)
    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(data_df)
    # rename columns
    data_df = rename_columns(data_df)
    print data_df

    #----------
    # upload
    #----------
    # upload the contents of the dataframe in njson format
    print "Uploading the njson file"
    status = gcs.convert_df_to_njson_and_upload(data_df, config['tcga_annotations']['output_file'])
    print status
def melt_matrix(matrix_file, Platform, studies_map, config, log):
    """
    # melt matrix
    """
    log.info('\tbegin melt matrix: \'%s\'' % (matrix_file))
    # begin parsing the data
    data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
    data_df2 = data_df2.set_index(["Gene"])

    # create a StringIO object with this info
    # call utils.convert_file_to_dataframe(buffer, sep=",")
    # call tools.cleanup_dataframe()
    # gcs.convert_df_to_njson_and_upload()
    log.info('\t\tstart processing saved matrix.  size: %s' % (len(data_df2)))
    mod = int(len(data_df2) / 20)
    count = 0
    buf = StringIO()
    buf.write(
        "ParticipantBarcode	SampleBarcode	AliquotBarcode	SampleTypeLetterCode	Study	Platform	mirna_id	mirna_accession	normalized_count\n"
    )
    for i, j in data_df2.T.iteritems():
        if 0 == count % mod:
            log.info('\t\t\tprocessed %s lines' % (count))
        count += 1
        for k, m in j.iteritems():
            aliquot = k.strip(".mirbase20")
            aliquot = aliquot.strip(".hg19")
            SampleBarcode = "-".join(aliquot.split("-")[0:4])
            ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
            SampleTypeLetterCode = config["sample_code2letter"][aliquot.split(
                "-")[3][0:2]]
            Study = studies_map[aliquot].upper()
            buf.write("\t".join(
                map(str, (ParticipantBarcode, SampleBarcode,
                          aliquot, SampleTypeLetterCode, Study, Platform,
                          i.split(".")[0], i.split(".")[1], m))) + '\n')
    log.info('\t\tprocessed %s total lines' % (count))

    file_name = matrix_file.split('/')[-1]
    log.info('\t\tsave %s to GCS' % file_name)
    buf.seek(0)
    df = convert_file_to_dataframe(buf)
    df = cleanup_dataframe(df)
    gcs = GcsConnector(config['project_id'], config['buckets']['open'])
    gcs.convert_df_to_njson_and_upload(
        df, config['mirna_isoform_matrix'][Platform]['output_dir'] + file_name)
    log.info('\t\tcompleted save to GCS')
    log.info('\tfinished melt matrix')
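# An alternative, self-contained sketch of the reshaping step using pandas.melt on a
# tiny mirna-by-aliquot matrix; melt_matrix() above performs the same "wide to long"
# transformation row by row while also deriving the barcode and study columns.
# Column names and values here are placeholders.
import pandas as pd

matrix = pd.DataFrame(
    {'TCGA-AA-0001-01A-11R-A000-13': [1.5, 0.0],     # hypothetical aliquot columns
     'TCGA-AA-0002-01A-11R-A000-13': [2.5, 3.0]},
    index=pd.Index(['hsa-mir-21.MIMAT0000076', 'hsa-mir-22.MIMAT0000077'], name='Gene'))
tidy = pd.melt(matrix.reset_index(),
               id_vars='Gene',
               var_name='AliquotBarcode',
               value_name='normalized_count')
print(tidy)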
Example #16
def main(config):

    #    etl = util.DataETL("isb-cgc", "isb-cgc-open") # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to bucket to get files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile("^.*.isoform.quantification.txt.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=['tcga/intermediary/mirna/isoform/'])
    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace(
            '.json', ''))
    data_library = data_library.query('size > 0')

    conn = sqlite3.connect('../etl-mirna-isoform.db')
    sql = 'SELECT * from {0}'.format('task_queue')
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()

    with open('downloadedfiles.txt') as f:
        lines = f.read().splitlines()

    all_files_df = all_files_df[(all_files_df.DatafileName.isin(
        data_library.basefilename))]
    all_files_df = all_files_df[~(all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    print data_library

    conn = sqlite3.connect('etl-isoform-download.db')
    submit_to_queue(data_library, conn, 'task_queue')
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        print 'completed: ', len(queue_df2)
        queue_df = queue_df[~(
            queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        print 'Not completed: ', len(queue_df)
    except Exception, e:
        pass
Example #17
def main(configfilename):
    """
    Prefer 450K files over 270k (delete transformed 270K files)
    """
    with open(configfilename) as configfile:
        config = json.load(configfile)

    print '*'*30 + "\nExtract - methylation"
    data_library = methylation.extract.identify_data(config)
    hiseq_aliquots = data_library.query('Platform == "HumanMethylation450"')['AliquotBarcode'].drop_duplicates().values.tolist()
    IlluminaGA_df = data_library.query('Platform == "HumanMethylation27"')
    IlluminaHiSeq_GA_df =  IlluminaGA_df[ (IlluminaGA_df['AliquotBarcode'].isin(hiseq_aliquots))]

    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    gcs = GcsConnector(project_id, bucket_name)
    for _, row in IlluminaHiSeq_GA_df.iterrows():
        blob = row.to_dict()['OutDatafileNameKey']
        gcs.delete_blob(blob)
def parse_cnv(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('cnv', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
    return status
Example #19
def parse_mirtarbase(config):
    """
    Download mirtarbase file with urllib2.Request
    Convert to a dataframe
    """
    # use Request to get the data (change the link to get the most recent file)
    request = Request(config['mirtarbase']['input_file'])
    socket = urlopen(request)

    # load into a pandas dataframe
    xlsd = pd.ExcelFile(socket, dtype=object)
    df_sheet1 = xlsd.parse(0)  # parse first sheet (converts into a dataframe)
    print 'Found {0} rows in the file'.format(len(df_sheet1))

    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(df_sheet1)

    replace_column_names = {
        'Species_miRNA': 'miRNA_Species',
        'Species_Target_Gene': 'Target_Gene_Species',
        'Target_Gene_Entrez_Gene_ID': 'Target_Gene_EntrezID',
        'miRTarBase_ID': 'miRTarBaseID'
    }
    data_df.columns = [replace_column_names[x] if x in replace_column_names\
                                               else x for x in data_df.columns]
    # convert to int
    data_df['Target_Gene_EntrezID'] = data_df['Target_Gene_EntrezID']\
                                        .map(lambda x: str(int(float(x))), na_action='ignore')

    #-------
    # upload
    #-------
    # connect to google.cloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    # connect to google storage
    gcs = GcsConnector(project_id, bucket_name)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(
        data_df, config['mirtarbase']['output_file'])
    print status
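# An equivalent, self-contained sketch of the column renaming in parse_mirtarbase()
# above, using DataFrame.rename instead of the list comprehension; the mapping keys
# mirror replace_column_names from the function, and the extra column is a placeholder.
import pandas as pd

df = pd.DataFrame(columns=['Species_miRNA', 'Target_Gene_Entrez_Gene_ID', 'Other_Column'])
df = df.rename(columns={'Species_miRNA': 'miRNA_Species',
                        'Target_Gene_Entrez_Gene_ID': 'Target_Gene_EntrezID'})
print(list(df.columns))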
Example #20
def download_antibody_annotation_files(config, log):
    object_key_template = config['protein']['aa_object_key_template']
    aa_file_dir = config['protein']['aa_file_dir']
    gcs = GcsConnector(config['project_id'], config['buckets']['open'])
    studies = config['all_tumor_types']
    nonrppa_studies = config['protein']['nonrppa']

    log.info('\tstart downloading antibody annotation files to %s from %s:%s' %
             (aa_file_dir, config['project_id'], config['buckets']['open']))
    if not os.path.isdir(aa_file_dir):
        os.makedirs(aa_file_dir)
    for study in studies:
        if study in nonrppa_studies:
            continue
        keypath = object_key_template % (study.lower(), study.upper())
        log.info('\t\tdownloading %s' % (keypath))
        tmpfile = gcs.download_blob_to_file(keypath)
        with open(aa_file_dir + keypath[keypath.rindex('/'):], 'w') as outfile:
            outfile.write(tmpfile.getvalue())
    log.info('\tdone downloading antibody annotation files')
def parse_mirtarbase(config):
    """
    Download mirtarbase file with urllib2.Request
    Convert to a dataframe
    """
    # use Request to get the data (change the link to get the most recent file)
    request = Request(config['mirtarbase']['input_file'])
    socket = urlopen(request)

    # load into a pandas dataframe
    xlsd = pd.ExcelFile(socket, dtype=object)
    df_sheet1 = xlsd.parse(0)  # parse first sheet (converts into a dataframe)
    print 'Found {0} rows in the file'.format(len(df_sheet1))

    # clean up the dataframe to upload to BigQuery
    data_df = cleanup_dataframe(df_sheet1)

    replace_column_names = {
        'Species_miRNA': 'miRNA_Species',
        'Species_Target_Gene': 'Target_Gene_Species',
        'Target_Gene_Entrez_Gene_ID': 'Target_Gene_EntrezID',
        'miRTarBase_ID': 'miRTarBaseID'
    }
    data_df.columns = [replace_column_names[x] if x in replace_column_names\
                                               else x for x in data_df.columns]
    # convert to int
    data_df['Target_Gene_EntrezID'] = data_df['Target_Gene_EntrezID']\
                                        .map(lambda x: str(int(float(x))), na_action='ignore')

    #-------
    # upload
    #-------
    # connect to gcloud wrapper
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    # connect to google storage
    gcs = GcsConnector(project_id, bucket_name)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, config['mirtarbase']['output_file'])
    print status
def main(config):

#    etl = util.DataETL("isb-cgc", "isb-cgc-open") # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
   
    # connect to bucket to get files 
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile("^.*.isoform.quantification.txt.json$")
    data_library = gcs.search_files(search_patterns=['.isoform.quantification.txt'], regex_search_pattern=isoform_file, prefixes=['tcga/intermediary/mirna/isoform/'])
    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(lambda x: os.path.splitext(os.path.basename(x))[0].replace('.json', ''))
    data_library = data_library.query('size > 0')
    

    conn = sqlite3.connect('../etl-mirna-isoform.db')
    sql = 'SELECT * from {0}'.format('task_queue')
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()

    with open('downloadedfiles.txt') as f:
        lines = f.read().splitlines()

    all_files_df = all_files_df[ (all_files_df.DatafileName.isin(data_library.basefilename))]
    all_files_df = all_files_df[ ~ (all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    print data_library
    
    conn = sqlite3.connect('etl-isoform-download.db')
    submit_to_queue(data_library, conn, 'task_queue')
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        print 'completed: ', len(queue_df2)
        queue_df = queue_df[ ~(queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        print 'Not completed: ',  len(queue_df)
    except Exception, e:
        pass
def parse_isoform(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform',
                      "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name,
                                                filename)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename)
    return status
Example #24
def search_files(config):

    # project id
    project_id = config['project_id']
    # bucket name 
    bucket_name = config['buckets']['open']
   
    # connect to google cloud storage 
    gcs = GcsConnector(project_id, bucket_name)

    #---------------------    
    # search for MAF files
    #---------------------
    maf_file = re.compile("^.*.maf$")
    # search only these tumor types defined in config files
    search_tumors = ["tcga/" + d.lower() for d in config['all_tumor_types']]
    data_library = gcs.search_files(search_patterns=['.maf'], regex_search_pattern=maf_file, prefixes=search_tumors)
    data_library['basefilename'] = map(lambda x: os.path.splitext(os.path.basename(x))[0], data_library['filename']) 
    data_library['unique_filename'] = format_dupe_values(data_library['basefilename'])

    return data_library
Example #25
def process_sql2bq(config, etl_config, sql, log):
    log.info("\t\tselect annotation records from db")
    metadata_df = read_mysql_query(config['cloudsql']['host'],
                                   config['cloudsql']['db'],
                                   config['cloudsql']['user'],
                                   config['cloudsql']['passwd'], sql)
    log.info("\t\tFound {0} rows, columns.".format(str(metadata_df.shape)))
    log.info("\tupload data to GCS.")
    project_id = config['cloud_projects']['open']
    bucket_name = config['buckets']['open']
    gcs_file_path = config['buckets']['folders'][
        'base_run_folder'] + 'etl/annotation'
    gcs = GcsConnector(project_id, bucket_name)
    gcs.convert_df_to_njson_and_upload(metadata_df,
                                       gcs_file_path + '/annotation.json',
                                       logparam=log)
    log.info('create the BigQuery table')
    Etl().load(project_id, [etl_config['bq_dataset']],
               [etl_config['bq_table']], [etl_config['schema_file']],
               ['gs://' + bucket_name + '/' + gcs_file_path],
               [etl_config['write_disposition']], 1, log)
def main():
    """Example to download a file from the Google Storage, transform,
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'TCGA-OR-A5J1-01A-11D-A29J-05.txt'
    outfilename = ''

    # read the stringIO/file into a pandas dataframe
    # load the file into a table
    data_df = pandas.read_table(filename, sep="\t", skiprows=1,
                                lineterminator='\n', comment='#')

    # clean up the dataframe for upload to BigQuery
    data_df = cleanup_dataframe(data_df)

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")\
                .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)

    print status
def parse_protein(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging("protein", "logs/" + metadata["AliquotBarcode"] + ".log")

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)
    data_df = additional_changes(data_df)
    data_df = add_metadata(data_df, metadata)

    # validation
    tests.assert_notnull_property(data_df, columns_list=["Protein_Name"])

    # upload the contents of the dataframe in njson format
    status = gcs.convert_df_to_njson_and_upload(data_df, outfilename, metadata=metadata)
    return status
def parse_methylation(project_id, bucket_name, filename, outfilename, metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('mirna.isoform', "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=1)
    data_df.columns = ['Probe_Id', "Beta_Value", "Gene_Symbol", "Chromosome", "Genomic_Coordinate"]

    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe in njson format
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
def main():
    """
    Pipeline
    """

    #--------------
    # methylation
    #--------------
    config = json.load(open(sys.argv[1]))
    project_id = config['project_id']
    bucket_name = config['buckets']['open']


    print '*'*30 + "\nExtract - methylation"
    data_library = methylation.extract.identify_data(config)
    hiseq_aliquots = data_library.query('Platform == "HumanMethylation450"')['AliquotBarcode'].drop_duplicates().values.tolist()
    IlluminaGA_df = data_library.query('Platform == "HumanMethylation27"')
    IlluminaHiSeq_GA_df =  IlluminaGA_df[ (IlluminaGA_df['AliquotBarcode'].isin(hiseq_aliquots))]

    gcs = GcsConnector(project_id, bucket_name)
    for idx, row in IlluminaHiSeq_GA_df.iterrows():
        blob = row.to_dict()['OutDatafileNameKey']
        print gcs.delete_blob(blob)
def main():
    """Example to download a file from the Google Storage, transform,
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = ''
    outfilename = ''

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to df
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)

    #---------------------------------------------------------
    # get required information
    # get chromosome 1 and Genomic_Coordinate > 20000000
    #---------------------------------------------------------
    data_df = (data_df.query("Chromosome == '1' and Genomic_Coordinate > 20000000")\
                 .query("Beta_value > 0.2"))
    # we can assign this query to a new dataframe and have new data

    # upload the contents of the dataframe in njson format to google storage
    # set metadata on the blob/object
    metadata = {'info': 'etl-test'}
    status = gcs.convert_df_to_njson_and_upload(data_df,
                                                outfilename,
                                                metadata=metadata)

    print status
def main():
    """
    Pipeline
    """

    #--------------
    # methylation
    #--------------
    config = json.load(open(sys.argv[1]))
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    print '*' * 30 + "\nExtract - methylation"
    data_library = methylation.extract.identify_data(config)
    hiseq_aliquots = data_library.query('Platform == "HumanMethylation450"')[
        'AliquotBarcode'].drop_duplicates().values.tolist()
    IlluminaGA_df = data_library.query('Platform == "HumanMethylation27"')
    IlluminaHiSeq_GA_df = IlluminaGA_df[(
        IlluminaGA_df['AliquotBarcode'].isin(hiseq_aliquots))]

    gcs = GcsConnector(project_id, bucket_name)
    for idx, row in IlluminaHiSeq_GA_df.iterrows():
        blob = row.to_dict()['OutDatafileNameKey']
        print gcs.delete_blob(blob)
def generate_oncotator_inputfiles(project_id, bucket_name, filename,
                                  outputfilename, oncotator_columns):

    print(filename)

    # NEW connection
    gcs = GcsConnector(project_id, bucket_name)

    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    try:
        maf_df = convert_file_to_dataframe(filebuffer)
    except:
        print 'problem converting %s to a dataframe' % (filename)
        raise

    # clean-up dataframe
    maf_df = cleanup_dataframe(maf_df)

    print maf_df.columns
    # lowercase the column names (WHY?)
    maf_df.columns = map(lambda x: x.lower(), maf_df.columns)

    #--------------------------------------------
    # data - manipulation
    #--------------------------------------------
    maf_df["ncbi_build"] = maf_df["ncbi_build"].replace({
        'hg19': '37',
        'GRCh37': '37',
        'GRCh37-lite': '37'
    })

    #---------------------------------------------
    ## Filters
    ## remember all the column names are lowercase
    #---------------------------------------------
    filters = {
        "chromosome": map(str, range(1, 23)) + ['X', 'Y'],
        "mutation_status": ['somatic', 'Somatic'],
        "sequencer": ['Illumina HiSeq', 'Illumina GAIIx', 'Illumina MiSeq'],
        "ncbi_build": ['37']
    }

    filter_checklist_df = maf_df.isin(filters)

    filter_string = ((filter_checklist_df["chromosome"] == True)
                     & (filter_checklist_df["mutation_status"] == True)
                     & (filter_checklist_df["sequencer"] == True)
                     & (filter_checklist_df["ncbi_build"] == True))

    maf_df = maf_df[filter_string]

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    # oncotator needs these columns
    replace_column_names = {
        "ncbi_build": 'build',
        'chromosome': 'chr',
        'start_position': 'start',
        'end_position': 'end',
        'reference_allele': 'ref_allele',
        'tumor_seq_allele1': 'tum_allele1',
        'tumor_seq_allele2': 'tum_allele2',
        'tumor_sample_barcode': 'tumor_barcode',
        'matched_norm_sample_barcode': 'normal_barcode'
    }

    # replace columns with new headings; just name change
    for rcol in replace_column_names:
        maf_df.columns = [
            replace_column_names[x] if x == rcol else x for x in maf_df.columns
        ]
        oncotator_columns = [
            replace_column_names[y] if y == rcol else y
            for y in oncotator_columns
        ]

    # remove/mangle any duplicate columns (we are naming them a, a.1, a.2, etc.)
    maf_df.columns = mangle_dupe_cols(maf_df.columns.values)

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    oncotator_df = maf_df[oncotator_columns]

    print "df_columns", len(oncotator_df.columns)

    df_stringIO = oncotator_df.to_csv(sep='\t',
                                      index=False,
                                      columns=oncotator_columns)

    # upload the file
    gcs.upload_blob_from_string(outputfilename, df_stringIO)

    return True
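# A self-contained sketch of the DataFrame.isin() row filter used in
# generate_oncotator_inputfiles() above, on a tiny MAF-like frame with placeholder rows.
import pandas as pd

maf = pd.DataFrame({
    'chromosome': ['1', 'MT', 'X'],
    'mutation_status': ['Somatic', 'Germline', 'somatic'],
})
filters = {'chromosome': [str(c) for c in range(1, 23)] + ['X', 'Y'],
           'mutation_status': ['somatic', 'Somatic']}
checklist = maf.isin(filters)
keep = checklist['chromosome'] & checklist['mutation_status']
print(maf[keep])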
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting into data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Use a temporary file location (in case we don't have write permissions on the user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
def generate_oncotator_inputfiles(project_id, bucket_name, filename, outputfilename, oncotator_columns):

    print (filename)
    
    # NEW connection
    gcs = GcsConnector(project_id, bucket_name)

    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    maf_df = convert_file_to_dataframe(filebuffer)

    # clean-up dataframe
    maf_df = cleanup_dataframe(maf_df)

    print maf_df.columns
    # lowercase the column names (WHY?)
    maf_df.columns = map(lambda x: x.lower(), maf_df.columns) 
    
    #--------------------------------------------
    # data - manipulation
    #--------------------------------------------
    maf_df["ncbi_build"] = maf_df["ncbi_build"].replace({ 'hg19': '37'
                                  ,'GRCh37': '37'
                                  ,'GRCh37-lite': '37'
                                 })

   
    #---------------------------------------------
    ## Filters
    ## remember all the column names are lowercase
    #---------------------------------------------
    filters = {
        "chromosome" : map(str,range(1,23)) + ['X', 'Y']
        ,"mutation_status": ['somatic', 'Somatic']
        ,"sequencer": ['Illumina HiSeq', 'Illumina GAIIx', 'Illumina MiSeq']
        ,"ncbi_build" : ['37']
    }

    filter_checklist_df = maf_df.isin(filters)
    
    filter_string = (
                       (filter_checklist_df["chromosome"] == True)
                        &   
                       (filter_checklist_df["mutation_status"] == True)
                        &
                       (filter_checklist_df["sequencer"] == True)
                        &
                       (filter_checklist_df["ncbi_build"] == True)
                    )

    maf_df = maf_df[filter_string]

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------
   
    # oncotator needs these columns
    replace_column_names = {
        "ncbi_build" : 'build'
       ,'chromosome' : 'chr'
       ,'start_position' : 'start'
       ,'end_position' : 'end'
       ,'reference_allele' : 'ref_allele'
       ,'tumor_seq_allele1' : 'tum_allele1'
       ,'tumor_seq_allele2' : 'tum_allele2'
       ,'tumor_sample_barcode': 'tumor_barcode'
       ,'matched_norm_sample_barcode': 'normal_barcode'
    }

    # replace columns with new headings; just name change
    for rcol in replace_column_names:
        maf_df.columns = [replace_column_names[x] if x==rcol else x for x in maf_df.columns]
        oncotator_columns = [replace_column_names[y] if y==rcol else y for y in oncotator_columns]         

    # remove/mangle any duplicate columns (we are naming them a, a.1, a.2, etc.)
    maf_df.columns = mangle_dupe_cols(maf_df.columns.values)

    #---------------------      
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    oncotator_df = maf_df[oncotator_columns]

    print "df_columns", len(oncotator_df.columns)
   
    df_stringIO =  oncotator_df.to_csv(sep='\t', index=False, columns= oncotator_columns)

    # upload the file
    gcs.upload_blob_from_string(outputfilename, df_stringIO)
    
    return True
def main():
    """Parse GCT file, merge with barcodes info, melt(tidy) 
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'ccle/mRNA-gene-exp/CCLE_Expression_Entrez_2012-09-29.gct'
    outfilename = 'test'
    writer = ExcelWriter('ccle.xlsx')

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #------------------------------
    # load the GCT file
    #  * the file has duplicate samples (columns)
    #------------------------------
    # To remove duplicates, load only the first few lines of the file, take the unique column names, and select those columns from the dataframe
    # this is a hack, but I can't find an elegant way to remove duplicates
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct',
                           sep='\t',
                           skiprows=2,
                           mangle_dupe_cols=False,
                           nrows=2)
    unique_columns = np.unique(gct_df.columns)
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct',
                           sep='\t',
                           skiprows=2,
                           mangle_dupe_cols=True)
    # clean-up the dataset/dataframe
    gct_df = cleanup_dataframe(gct_df)
    gct_df = gct_df[unique_columns]

    # remove any gene_id starting with 'AFFX-'
    gct_df[gct_df['Name'].str.startswith('AFFX-')].to_excel(
        writer, sheet_name="affy_info")
    gct_df = gct_df[~gct_df['Name'].str.startswith('AFFX-')]

    #------------------------------
    # HGNC validation
    #-----------------------------
    hgnc_df = hgnc_validation.get_hgnc_map()
    hgnc_df.to_excel(writer, sheet_name="hgnc_info")
    hgnc_dict = dict(zip(hgnc_df.entrez_id, hgnc_df.symbol))
    gct_df['HGNC_gene_symbol'] = gct_df['Name'].map(
        lambda gene_id: hgnc_dict.get(gene_id.replace('_at', ''), np.nan))
    gct_df[['HGNC_gene_symbol', 'Name',
            'Description']].to_excel(writer, sheet_name="gene_info")
    gct_df['Name'] = gct_df['Name'].map(
        lambda gene_id: gene_id.replace('_at', ''))

    #------------------------------
    # barcodes info
    #------------------------------
    barcodes_filename = 'ccle/mRNA-gene-exp/mRNA_names.out.tsv'
    filebuffer = gcs.download_blob_to_file(barcodes_filename)
    barcodes_df = pd.read_table(
        filebuffer,
        header=None,
        names=['ParticipantBarcode', 'SampleBarcode',
               'CCLE_long_name'])  # convert into dataframe
    barcodes_df = cleanup_dataframe(barcodes_df)  # clean-up dataframe

    #------------------------------
    # ignore (drop) all of the columns from the gene-expression matrix
    #that don't have corresponding Participant and Sample barcodes,
    #------------------------------
    columns_df = pd.DataFrame(unique_columns)
    columns_df.columns = ['CCLE_long_name']
    samples_map_df = pd.merge(columns_df,
                              barcodes_df,
                              on='CCLE_long_name',
                              how='inner')
    samples_map_df.to_excel(writer, sheet_name="sample_info")

    # select columns that are overlapping
    overlapping_samples = samples_map_df['CCLE_long_name'].tolist()
    overlapping_samples = overlapping_samples + [
        'Name', 'Description', 'HGNC_gene_symbol'
    ]
    gct_df = gct_df[overlapping_samples]
    print gct_df

    # melt the matrix
    value_vars = [
        col for col in gct_df.columns
        if col not in ['Name', 'Description', 'HGNC_gene_symbol']
    ]
    melted_df = pd.melt(gct_df,
                        id_vars=['Name', 'Description', 'HGNC_gene_symbol'],
                        value_vars=value_vars)
    melted_df = melted_df.rename(
        columns={
            'Name': 'gene_id',
            'Description': 'original_gene_symbol',
            'variable': 'CCLE_long_name',
            'value': 'RMA_normalized_expression'
        })

    # merge to get barcode information
    # changed from outer join to inner join. In this case it shouldn't matter, since we already did an inner join
    # while selecting the samples above.
    data_df = pd.merge(melted_df,
                       samples_map_df,
                       on='CCLE_long_name',
                       how='inner')
    data_df['Platform'] = "Affymetrix U133 Plus 2.0"

    # reorder columns
    col_order = [
        "ParticipantBarcode", "SampleBarcode", "CCLE_long_name", "gene_id",
        "HGNC_gene_symbol", "original_gene_symbol", "Platform",
        "RMA_normalized_expression"
    ]
    data_df = data_df[col_order]

    # upload the contents of the dataframe in CSV format
    print "Convert to CSV"
    outfilename = "tcga/intermediary/CCLE_mrna_expr/bq_data_files/ccle_mrna_expr.csv"
    df_string = data_df.to_csv(index=False, header=False)
    status = gcs.upload_blob_from_string(outfilename, df_string)
    print status

    # save the excel file
    writer.save()
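# A self-contained sketch of the HGNC symbol lookup in main() above: Entrez-style
# gene IDs are mapped through a dict after stripping the '_at' suffix. The id/symbol
# pairs here are placeholders.
import numpy as np
import pandas as pd

hgnc_dict = {'7157': 'TP53', '672': 'BRCA1'}        # hypothetical entrez_id -> symbol map
gct = pd.DataFrame({'Name': ['7157_at', '672_at', '999_at']})
gct['HGNC_gene_symbol'] = gct['Name'].map(
    lambda gene_id: hgnc_dict.get(gene_id.replace('_at', ''), np.nan))
print(gct)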
Example #36
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns, sample_code2letter):
    study = data_library['Study'].iloc[0]

    # this is needed to stop pandas from converting them to FLOAT
    dtype = {
        "Transcript_Exon" : "object"
       ,"NCBI_Build" : "object"
       ,"COSMIC_Total_Alterations_In_Gene" : "object"
       ,"CCLE_ONCOMAP_Total_Mutations_In_Gene" : "object"
       ,"HGNC_HGNC_ID" : "object"
       ,"UniProt_AApos" : "object"
       ,"Transcript_Position" : "object"
       ,"HGNC_OMIM_ID_Supplied_By_NCBI" : "object"
      }


    file_count = 0
   
    # create an empty dataframe. we use this to merge dataframes
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1

        log.info('-'*10 + "{0}: Processing file {1}".format(file_count, oncotator_file) + '-'*10)

        try:
            gcs = GcsConnector(project_id, bucket_name)
            # convert the file to a dataframe
            filename = 'tcga/intermediary/MAF/oncotator_output_files/' + oncotator_file
            df = gcutils.convert_blob_to_dataframe(gcs, project_id, bucket_name, filename)
        except Exception as e:
            print e
            raise

        if df.empty:
            log.debug('empty dataframe for file: ' + str(oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]
       
        # format oncotator columns; name changes etc 
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)
            

    # this is a merged dataframe
    if not disease_bigdata_df.empty:
        
        # remove duplicates from the merged dataframe; various rules; see check_duplicates
        df = check_duplicates.remove_maf_duplicates(disease_bigdata_df, sample_code2letter)

        # enforce unique mutation
        unique_mutation = ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']

        # merge mutations from multiple centers
        concat_df = []
        for idx, df_group in df.groupby(unique_mutation):
            if len(df_group) > 1:
                # tolist; unique list; sort; concat
                df_group.loc[:,'Center'] = ";".join(map(str,sorted(list(set(df_group['Center'].tolist())))))
            concat_df.append(df_group)
        df = pd.concat(concat_df)

        # enforce unique mutation
        df = remove_duplicates(df, unique_mutation)

        # convert the deduplicated df to new-line JSON and then upload the file
        gcs.convert_df_to_njson_and_upload(df, "tcga/intermediary/MAF/bigquery_data_files/{0}.json".format(study))

    else:
        raise Exception('Empty dataframe!')
    return True
Example #37
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns,
                             sample_code2letter, oncotator_object_path,
                             oncotator_object_output_path):
    study = data_library['Study'].iloc[0]

    # this is needed to stop pandas from converting these columns to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe. we use this to merge dataframe
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1

        log.info(
            '-' * 10 +
            "{0}: Processing file {1}".format(file_count, oncotator_file) +
            '-' * 10)

        try:
            # convert the file to a dataframe
            filename = oncotator_object_path + oncotator_file
            gcs = GcsConnector(project_id, bucket_name)
            log.info('%s: converting %s to dataframe' % (study, filename))
            df = gcutils.convert_blob_to_dataframe(gcs,
                                                   project_id,
                                                   bucket_name,
                                                   filename,
                                                   log=log)
            log.info('%s: done converting %s to dataframe' % (study, filename))
        except RuntimeError as rte:
            log.warning('%s: problem cleaning dataframe for %s: %s' %
                        (study, filename, rte))
            continue  # skip this file; no dataframe was produced
        except Exception as e:
            log.exception('%s: problem converting to dataframe for %s: %s' %
                          (study, filename, e))
            raise e

        if df.empty:
            log.warning('%s: empty dataframe for file: %s' %
                        (study, oncotator_file))
            continue

        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

        log.info('-' * 10 + "{0}: Finished file({3}) {1}. rows: {2}".format(
            file_count, oncotator_file, len(df), study) + '-' * 10)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:
        # remove duplicates; various rules; see check_duplicates

        log.info(
            '\tcalling check_duplicates to collapse aliquots with %s rows' %
            (len(disease_bigdata_df)))
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(
            disease_bigdata_df, sample_code2letter, log)
        log.info(
            '\tfinished check_duplicates to collapse aliquots with %s rows' %
            (len(disease_bigdata_df)))

        # enforce unique mutation--previous
        # unique_mutation = ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']
        # enforce unique mutation
        unique_mutation = [
            'Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome', 'Start_Position',
            'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele1',
            'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode'
        ]
        # merge mutations from multiple centers
        log.info(
            '\tconsolidate the centers for duplicate mutations into list for %s'
            % (study))
        seencenters = set()

        def concatcenters(df_group):
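            # For one unique_mutation group: when the mutation was reported by more than
            # one center, split each Center value on the configured delimiter, de-duplicate,
            # sort, and re-join so every row in the group carries the same consolidated string.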
            if len(df_group) > 1:
                centers = list(set(df_group['Center'].tolist()))
                uniquecenters = set()
                delim = config['maf']['center_delim']
                for center in centers:
                    fields = center.split(delim)
                    for field in fields:
                        uniquecenters.add(field)
                sortedunique = delim.join(sorted(list(uniquecenters)))
                df_group.loc[:, 'Center'] = sortedunique
                if sortedunique not in seencenters:
                    log.info('unique centers: %s' % sortedunique)
                    seencenters.add(sortedunique)
            return df_group

        disease_bigdata_df = disease_bigdata_df.groupby(unique_mutation).apply(
            concatcenters)
        log.info(
            '\tfinished consolidating centers for duplicate mutations for %s' %
            (study))

        # enforce unique mutation
        log.info(
            '\tcalling remove_duplicates to collapse mutations with %s rows for %s'
            % (len(disease_bigdata_df), study))
        disease_bigdata_df = remove_duplicates(disease_bigdata_df,
                                               unique_mutation)
        log.info(
            '\tfinished remove_duplicates to collapse mutations with %s rows for %s'
            % (len(disease_bigdata_df), study))

        # convert the disease_bigdata_df to new-line JSON and upload the file
        uploadpath = oncotator_object_output_path + "{0}.json".format(study)
        log.info('%s: uploading %s to GCS' % (study, uploadpath))
        gcs.convert_df_to_njson_and_upload(disease_bigdata_df, uploadpath)
        log.info('%s: done uploading %s to GCS' % (study, uploadpath))

    else:
        log.warning('%s: merged dataframe is empty!' % study)
    return True
Exemple #38
0
def identify_data(config):
    """Gets the metadata info from database
    """
    log = configure_logging('data_et', 'logs/data_et.log')
    log.info('start data extract and transform')
    # cloudSql connection params
    host = config['cloudsql']['host']
    database = config['cloudsql']['db']
    user = config['cloudsql']['user']
    passwd = config['cloudsql']['passwd']

    log.info("\tselect data records from db")
    sqlquery = """
        SELECT 
            ParticipantBarcode,
            SampleBarcode,
            AliquotBarcode,
            AliquotUUID,
            DataArchiveName,
            DataArchiveURL,
            DataArchiveVersion,
            DataCenterCode,
            DataCenterName,
            DataCenterType,
            DatafileMD5,
            DatafileName,
            DatafileNameKey,
            DatafileUploaded,
            DataLevel,
            Datatype,
            GenomeReference,
            IncludeForAnalysis,
            MAGETabArchiveName,
            MAGETabArchiveURL,
            Pipeline,
            Platform,
            Project,
            Repository,
            SampleType,
            SampleTypeCode,
            SDRFFileName,
            SDRFFileNameKey,
            SecurityProtocol,
            Species,
            Study,
            wasDerivedFrom,
            library_strategy,
            state,
            reason_for_state,
            analysis_id,
            analyte_code,
            last_modified,
            platform_full_name,
            GG_dataset_id,
            GG_readgroupset_id
        FROM metadata_data
        WHERE Project = 'TCGA'
            AND study = '%s'
            AND DatafileUploaded='true'
            AND DatafileNameKey is not null
            AND IncludeForAnalysis='yes'
        """

    studies = config['all_tumor_types']
    for study in studies:
        # connect to db and get results in a dataframe
        log.info("\tselect data records from db for %s" % (study))
        metadata_df = read_mysql_query(host, database, user, passwd, sqlquery % (study))
        log.info("\tFound {0} rows, columns." .format(str(metadata_df.shape)))
    
        project_id = config['project_id']
        bucket_name = config['buckets']['open']
        gcs = GcsConnector(project_id, bucket_name)
        log.info("\tupload data to GCS.")
        gcs.convert_df_to_njson_and_upload(metadata_df, config['data']['output_dir'] + config['data']['bq_table'] + '_%s.json' % (study))
    
    log.info('finished extract and transform')
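
read_mysql_query is imported from elsewhere in this codebase; assuming it simply wraps pandas.read_sql over a MySQL DB-API connection, a minimal stand-in could look like the sketch below (pymysql and the function name are assumptions, not part of this repo):

import pandas as pd
import pymysql  # assumed driver; the original may use a different MySQL client


def read_mysql_query_sketch(host, database, user, passwd, sqlquery):
    # Run the query against the Cloud SQL / MySQL instance and return the rows as a DataFrame.
    conn = pymysql.connect(host=host, user=user, password=passwd, database=database)
    try:
        return pd.read_sql(sqlquery, conn)
    finally:
        conn.close()
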
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
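    # Columns named in column_map are annotation columns (e.g. Symbol/ID/TAB); every
    # other column header is a sample barcode, and each of its values becomes one
    # long-format row below.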
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['Project'] = metadata['project_id']
                new_df_obj['Study'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(
        set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'],
                                           metadata['data_type'],
                                           sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'],
                                         metadata['study_id'], project_id,
                                         bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(new_df,
                                       outfilename,
                                       metadata=metadata,
                                       tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id,
                            bq_dataset,
                            table_name,
                            schema,
                            source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
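
The column loop in parse_file above is essentially a wide-to-long reshape: the column_map columns are kept as annotations and every sample-barcode column becomes (sample_barcode, Level) rows. A minimal, self-contained pandas sketch of that reshaping (toy column names; it does not reproduce the repo's metadata handling):

import pandas as pd

# Wide matrix: one annotation column plus one column per sample barcode.
wide = pd.DataFrame({
    'Symbol': ['TP53', 'EGFR'],
    'TCGA-AA-0001-01A': [1.2, 3.4],
    'TCGA-AA-0002-01A': [5.6, 7.8],
})

# Melt the sample columns into rows, matching the (sample_barcode, Level) shape built above.
long_df = pd.melt(wide, id_vars=['Symbol'],
                  var_name='sample_barcode', value_name='Level')
print(long_df)

pd.melt keeps the annotation columns as identifiers and turns each remaining column name into the sample_barcode value, one row per (sample, value) pair.
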
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'], project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
Exemple #42
0
    def melt_matrix(self, matrix_file, platform, file2info, program_name,
                    config, log):
        """
        # melt matrix
        """
        log.info('\t\t\tbegin melt matrix: \'%s\'' % (matrix_file))
        # begin parsing the data
        data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
        data_df2 = data_df2.set_index(["Gene"])

        # create a StringIO object with this info
        # call utils.convert_file_to_dataframe(buffer, sep=",")
        # call tools.cleanup_dataframe()
        # gcs.convert_df_to_njson_and_upload()
        log.info('\t\t\t\tstart processing saved matrix.  size: %s' %
                 (len(data_df2)))
        mod = max(int(len(data_df2) / 20), 1)  # guard against tiny matrices (avoid modulo by zero)
        count = 0
        total_count = 0
        buf = StringIO()
        buf.write(
            "sample_barcode	mirna_id	mirna_accession	normalized_count	platform	project_short_name	program_name	sample_type_code"
            +
            "	file_name	file_gdc_id	aliquot_barcode	case_barcode	case_gdc_id	sample_gdc_id	aliquot_gdc_id\n"
        )
        for i, j in data_df2.T.iteritems():
            for k, m in j.iteritems():
                aliquot = file2info[k]['aliquot_barcode']
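                # TCGA aliquot barcodes are dash-delimited (e.g. TCGA-02-0001-01C-01D-0182-01):
                # fields 1-4 form the sample barcode, fields 1-3 the case/participant
                # barcode, and the first two characters of field 4 are the sample type code.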
                SampleBarcode = "-".join(aliquot.split("-")[0:4])
                ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
                SampleTypeCode = aliquot.split("-")[3][0:2]
                info = file2info[k]
                line = "\t".join(
                    map(str, (SampleBarcode, i.split(".")[0], i.split(".")[1],
                              m, platform, info['project_short_name'],
                              info['program_name'], SampleTypeCode,
                              info['file_name'], info['file_gdc_id'], aliquot,
                              ParticipantBarcode, info['case_gdc_id'],
                              info['sample_gdc_id'],
                              info['aliquot_gdc_id']))) + '\n'
                buf.write(line)
                total_count += 1
            if 0 == count % mod:
                log.info('\t\t\t\t\tprocessed %s lines:\n%s' % (count, line))
                file_name = '%s_%s' % (matrix_file.split('/')[-1], count)
                log.info('\t\t\t\tsave %s to GCS' % file_name)
                buf.seek(0)
                df = convert_file_to_dataframe(buf)
                df = cleanup_dataframe(df, log)
                gcs = GcsConnector(config['cloud_projects']['open'],
                                   config['buckets']['open'])
                gcs.convert_df_to_njson_and_upload(
                    df,
                    config[program_name]['process_files']['datatype2bqscript']
                    ['Isoform Expression Quantification']['gcs_output_path'] +
                    file_name,
                    logparam=log)
                buf = StringIO()
                buf.write(
                    "sample_barcode	mirna_id	mirna_accession	normalized_count	platform	project_short_name	program_name	sample_type_code"
                    +
                    "	file_name	file_gdc_id	aliquot_barcode	case_barcode	case_gdc_id	sample_gdc_id	aliquot_gdc_id\n"
                )
            count += 1
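        # NOTE: any rows written to buf after the last `count % mod == 0` flush are
        # not uploaded by this loop.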
        log.info('\t\t\t\tprocessed %s total lines, created %s records' %
                 (count, total_count))

        log.info('\t\t\t\tcompleted save to GCS')
        log.info('\t\t\tfinished melt matrix')
def main(config):

    log_filename = 'etl_download_isoform.log'
    log_name = 'etl_download_isoform'
    log = configure_logging(log_name, log_filename)
    log.info('begin downloading isoform files')
    #    etl = util.DataETL("isb-cgc", "isb-cgc-open") # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to bucket to get files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile("^.*.isoform.quantification.txt.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=[config['mirna_isoform_matrix']['isoform_gcs_dir']])
    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace(
            '.json', ''))
    data_library = data_library.query('size > 0')

    log.info('\tbegin selecting isoform files from the SQLite isoform db')
    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    sql = 'SELECT * from {0}'.format('task_queue')
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()
    log.info('\tfinished selecting isoform files')

    log.info('\tbegin reading list of previously downloaded files')
    with open(config['mirna_isoform_matrix']
              ['isoform_download_prev_files']) as f:
        lines = f.read().splitlines()
    log.info('\tfinished reading list of previously downloaded files')

    log.info(
        'filter files.\n\tfiles in cloud storage: %s\n\tfiles previously marked to download: %s\n%s\n'
        % (len(data_library), len(all_files_df), data_library))
    all_files_df = all_files_df[(all_files_df.DatafileName.isin(
        data_library.basefilename))]
    all_files_df = all_files_df[~(all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    log.info('finished filter files: %s\n%s\n' %
             (len(data_library), data_library))

    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    submit_to_queue(data_library, conn, 'task_queue', log)
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        conn = sqlite3.connect('isoform_download.db')
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        log.info('\tso far completed: %s' % (len(queue_df2)))
        queue_df = queue_df[~(
            queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        log.info('\tso far not completed: %s' % (len(queue_df)))
    except Exception:
        log.exception(
            '\n++++++++++++++++++++++\n\tproblem filtering completed jobs, ignoring\n++++++++++++++++++++++\n'
        )

    # -----------------------------------------------------
    # thread this with concurrent futures
    #------------------------------------------------------
    log.info('\tsubmit jobs to process manager')
    pm = process_manager.ProcessManager(max_workers=200,
                                        db='isoform_download.db',
                                        table='task_queue_status',
                                        log=log)
    for count, df in data_library.iterrows():
        row = df.to_dict()
        if 0 == count % 512:
            time.sleep(10)
        if 0 == count % 2048:
            log.info('\t\tsubmitting %s file: %s' %
                     (count, row['DatafileName']))
        if not os.path.isdir(
                config['mirna_isoform_matrix']['isoform_download_dir'] +
                row['Platform']):
            os.makedirs(
                config['mirna_isoform_matrix']['isoform_download_dir'] +
                row['Platform'])
        outfilename = config['mirna_isoform_matrix'][
            'isoform_download_dir'] + row['Platform'] + "/" + row[
                'DatafileName']
        pm.submit(download_file, project_id, bucket_name,
                  row['DatafileNameKey'], outfilename, '')
        time.sleep(0.2)
    log.info('\tsubmitted %s total jobs to process manager' % (count))

    log.info('\tstart process manager completion check')
    pm.start()

    log.info('finished downloading isoform files')