Example #1
def main(datatype, config_file, max_workers, dry_run, create_new, debug):
    """
    Pipeline
    """
    # config params
    config = json.load(open(config_file))
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    table_task_queue = 'task_queue'
    table_task_queue_status = 'task_queue_status' 
    db_filename = 'etl-{0}.db'.format(datatype)
    log_filename = 'etl_{0}.log'.format(datatype)
    log_name = 'etl_{0}'.format(datatype)

    log.info('start pipeline for %s' % (datatype))
    # check if the table exists and issue warning
    if os.path.exists(db_filename):
        log.warning('Using the already available database file - {0}'.format(db_filename))
        time.sleep(2)
    
    # connect to the database
    conn = sqlite3.connect(db_filename, check_same_thread=False)
   
    #-------------------------------
    # Submit to task queue
    #-------------------------------
    print "="*30 + "\nQuerying Google Cloud SQL metadata_data table"
    queue_df = extract_functions[datatype](config)
    submit_to_queue(queue_df, conn, table_task_queue)

    #--------------
    # Tests
    #--------------
    tests.assert_notnull_property(queue_df, columns_list=[
        'SampleTypeCode', 'SampleTypeLetterCode', 'Study', 'Platform',
        'SampleBarcode', 'OutDatafileNameKey', 'ParticipantBarcode',
        'DatafileNameKey', 'AliquotBarcode'])

    if create_new:
        # delete the old queue (task_queue_status) and re-run
        conn.execute('DROP TABLE IF EXISTS {0}'.format(table_task_queue_status))

    # Validate and get the diff; restart ETL; this also takes care of errors
    queue_df = validate_and_get_diff(conn, queue_df, table_task_queue_status)

    if debug:
        # debug mode runs top 30 rows
        log.debug('Running in debug mode (first 30 records)')
        queue_df = queue_df.head(30)

    if dry_run:
        log.info('finished dry run for %s' % (datatype))
        sys.exit()


    #--------------------------------------------
    # Execution
    #------------------------------------------------------
    pmr = process_manager.ProcessManager(max_workers=max_workers, db=db_filename, table=table_task_queue_status, log=log)
    for index, row in queue_df.iterrows():
        metadata = row.to_dict()
        inputfilename = metadata['DatafileNameKey']
        outputfilename = metadata['OutDatafileNameKey']
        # transform
        # transform_functions[datatype](project_id, bucket_name,
        #                               inputfilename, outputfilename, metadata)
        future = pmr.submit(transform_functions[datatype], project_id, bucket_name,
                            inputfilename, outputfilename, metadata)
    
        time.sleep(0.1 + 0.5 * random.random())
        if index % 100 == 0:
            time.sleep(5)

    pmr.start()
    log.info('finished pipeline for %s' % (datatype))
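
submit_to_queue and validate_and_get_diff are project helpers that are not part of this snippet. Judging from the isoform examples further down, the diff step drops rows already recorded as error-free in task_queue_status; a minimal sketch under that assumption (hypothetical, not the project's actual implementation):

import pandas as pd

def validate_and_get_diff(conn, queue_df, table_task_queue_status):
    """Return only the queue rows that have not yet completed without errors."""
    try:
        sql = 'SELECT * FROM {0} WHERE errors="None"'.format(table_task_queue_status)
        done_df = pd.read_sql_query(sql, conn)
        return queue_df[~queue_df.DatafileNameKey.isin(done_df.DatafileNameKey)]
    except Exception:
        # first run: the status table does not exist yet, so nothing is completed
        return queue_df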
# Throughout the script we use lowercase heading names
oncotator_columns = [
    line.rstrip('\n').lower()
    for line in config['maf']['oncotator_input_columns']
]
oncotator_input_files_dest = config['maf']['oncotator_input_files_dest']

#-----------------------------------
# Extract
#-----------------------------------
data_library = extract.search_files(config)
# log all files found
writer = ExcelWriter('maf_part1.log.xlsx')
data_library.to_excel(writer, "maf_files")
writer.save()

#-----------------------------------------------------
# Execution
#------------------------------------------------------
pm = process_manager.ProcessManager(max_workers=20)
for index, row in data_library.iterrows():
    inputfilename = row['filename']
    outputfilename = oncotator_input_files_dest + row[
        'unique_filename'].replace(".maf", ".txt")
    # transform
    future = pm.submit(transform.generate_oncotator_inputfiles, project_id,
                       bucket_name, inputfilename, outputfilename,
                       oncotator_columns)
    time.sleep(0.2)
pm.start()
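
Every example uses the same ProcessManager pattern: queue each job with submit(), then call start() to run them, optionally tracking status in the SQLite db/table passed to the constructor. A rough sketch of that pattern with concurrent.futures; the real class lives in the project's process_manager module and also records per-job status and errors:

from concurrent import futures

class ProcessManagerSketch(object):
    """Illustration only: collect jobs with submit(), run them with start()."""

    def __init__(self, max_workers=10, db=None, table=None, log=None):
        self.max_workers = max_workers
        self.db, self.table, self.log = db, table, log
        self.jobs = []  # (fn, args) tuples queued by submit()

    def submit(self, fn, *args):
        self.jobs.append((fn, args))

    def start(self):
        with futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            running = [executor.submit(fn, *args) for fn, args in self.jobs]
            for future in futures.as_completed(running):
                future.result()  # surface any worker exception
                # the real class would also update the status table here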
Example #3
    return True

if __name__ == '__main__':

    config = json.load(open(sys.argv[1]))
  
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    sample_code2letter = config['sample_code2letter']
 
    # get disease codes/studies (TODO: this must be changed to get the disease code from the file name)
    df = convert_file_to_dataframe(open(sys.argv[2]))
    df = cleanup_dataframe(df)
    studies = list(set(df['Study'].tolist()))

    # get bq columns (this allows the user to select the columns
    # without worrying about the index, case-sensitiveness, etc.)
    selected_columns = pd.read_table(sys.argv[3], names=['bq_columns'])
    transposed = selected_columns.T
    transposed.columns = transposed.loc['bq_columns']
    transposed = cleanup_dataframe(transposed)
    bq_columns = transposed.columns.values

    # submit threads by disease code
    pm = process_manager.ProcessManager(max_workers=33, db='maf.db', table='task_queue_status')
    for idx, df_group in df.groupby(['Study']):
        future = pm.submit(process_oncotator_output, project_id, bucket_name, df_group, bq_columns, sample_code2letter)
        #process_oncotator_output( project_id, bucket_name, df_group, bq_columns, sample_code2letter)
        time.sleep(0.2)
    pm.start()
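
submit_to_queue also comes from the project and is not shown in these snippets. Given that later code reads the queue back with SELECT * from task_queue, a plausible minimal sketch is just a DataFrame-to-SQLite write (hypothetical; the real helper may do more validation and logging):

import pandas as pd

def submit_to_queue(queue_df, conn, table_name, log=None):
    """Write the work queue DataFrame into the given SQLite table."""
    queue_df.to_sql(table_name, conn, if_exists='replace', index=False)
    if log:
        log.info('submitted %s rows to %s' % (len(queue_df), table_name))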
def main(config):

    log_filename = 'etl_download_isoform.log'
    log_name = 'etl_download_isoform'
    log = configure_logging(log_name, log_filename)
    log.info('begin downloading isoform files')
    #    etl = util.DataETL("isb-cgc", "isb-cgc-open") # this starts a new connection
    project_id = config['project_id']
    bucket_name = config['buckets']['open']

    # connect to bucket to get files
    gcs = GcsConnector(project_id, bucket_name)
    isoform_file = re.compile(r"^.*\.isoform\.quantification\.txt\.json$")
    data_library = gcs.search_files(
        search_patterns=['.isoform.quantification.txt'],
        regex_search_pattern=isoform_file,
        prefixes=[config['mirna_isoform_matrix']['isoform_gcs_dir']])
    # we are eliminating bad files - size 0; could be hg18 etc
    data_library.loc[:, 'basefilename'] = data_library['filename'].map(
        lambda x: os.path.splitext(os.path.basename(x))[0].replace(
            '.json', ''))
    data_library = data_library.query('size > 0')

    log.info('\tbegin selecting isoform files from the SQLite isoform db')
    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    sql = 'SELECT * from {0}'.format('task_queue')
    all_files_df = pd.read_sql_query(sql, conn)
    conn.close()
    log.info('\tfinished selecting isoform files')

    log.info('\tbegin reading the list of previously downloaded files')
    with open(config['mirna_isoform_matrix']
              ['isoform_download_prev_files']) as f:
        lines = f.read().splitlines()
    log.info('\tfinished reading the list of previously downloaded files')

    log.info(
        'filter files.\n\tfiles in cloud storage: %s\n\tfiles previously marked to download: %s\n%s\n'
        % (len(data_library), len(all_files_df), data_library))
    all_files_df = all_files_df[(all_files_df.DatafileName.isin(
        data_library.basefilename))]
    all_files_df = all_files_df[~(all_files_df.DatafileName.isin(lines))]
    data_library = all_files_df
    log.info('finished filter files: %s\n%s\n' %
             (len(data_library), data_library))

    conn = sqlite3.connect(config['mirna_isoform_matrix']['isoform_file_db'])
    submit_to_queue(data_library, conn, 'task_queue', log)
    queue_df = data_library

    # restart ETL; this gets the diff; also takes care of errors
    try:
        conn = sqlite3.connect('isoform_download.db')
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        log.info('\tso far completed: %s' % (len(queue_df2)))
        queue_df = queue_df[~(
            queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        log.info('\tso far not completed: %s' % (len(queue_df)))
    except Exception:
        log.exception(
            '\n++++++++++++++++++++++\n\tproblem filtering completed jobs, ignoring\n++++++++++++++++++++++\n'
        )

    # -----------------------------------------------------
    # thread this with concurrent futures
    #------------------------------------------------------
    log.info('\tsubmit jobs to process manager')
    pm = process_manager.ProcessManager(max_workers=200,
                                        db='isoform_download.db',
                                        table='task_queue_status',
                                        log=log)
    for count, df in data_library.iterrows():
        row = df.to_dict()
        if 0 == count % 512:
            time.sleep(10)
        if 0 == count % 2048:
            log.info('\t\tsubmitting %s file: %s' %
                     (count, row['DatafileName']))
        if not os.path.isdir(
                config['mirna_isoform_matrix']['isoform_download_dir'] +
                row['Platform']):
            os.makedirs(
                config['mirna_isoform_matrix']['isoform_download_dir'] +
                row['Platform'])
        outfilename = config['mirna_isoform_matrix'][
            'isoform_download_dir'] + row['Platform'] + "/" + row[
                'DatafileName']
        pm.submit(download_file, project_id, bucket_name,
                  row['DatafileNameKey'], outfilename, '')
        time.sleep(0.2)
    log.info('\tsubmitted %s total jobs to process manager' % (count))

    log.info('\tstart process manager completion check')
    pm.start()

    log.info('finished downloading isoform files')
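
download_file is another helper that is not included in the snippet; the call suggests it copies one object from the open GCS bucket to a local path. A minimal sketch with the google-cloud-storage client (the trailing '' argument of the original call is omitted here because its purpose is not visible from this code):

from google.cloud import storage

def download_file(project_id, bucket_name, blob_name, outfilename):
    """Illustration only: download a single GCS object to a local file."""
    client = storage.Client(project=project_id)
    bucket = client.bucket(bucket_name)
    bucket.blob(blob_name).download_to_filename(outfilename)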
Example #5
    # restart ETL; this gets the diff; also takes care of errors
    try:
        sql = 'SELECT * from task_queue_status where errors="None"'
        queue_df2 = pd.read_sql_query(sql, conn)
        print('completed:', len(queue_df2))
        queue_df = queue_df[~(
            queue_df.DatafileNameKey.isin(queue_df2.DatafileNameKey))]
        print('Not completed:', len(queue_df))
    except Exception:
        pass  # status table may not exist yet (first run); process everything

    # -----------------------------------------------------
    # thread this with concurrent futures
    #------------------------------------------------------
    pm = process_manager.ProcessManager(max_workers=200,
                                        db='isoform_download',
                                        table='task_queue_status')
    for i, df in data_library.iterrows():
        row = df.to_dict()
        print(row['DatafileName'])
        outfilename = "/mnt/datadisk-3/isoform_files/" + row[
            'Platform'] + "/" + row['DatafileName']
        future = pm.submit(download_file, project_id, bucket_name,
                           row['DatafileNameKey'], outfilename, '')
        time.sleep(0.2)

    pm.start()


if __name__ == "__main__":
    config = json.load(open(sys.argv[1]))
Example #6
# Extract
#-----------------------------------
data_library = extract.search_files(config)
# log all files found
writer = ExcelWriter('maf_part1.log.xlsx')
data_library.to_excel(writer, "maf_files")
writer.save()

#-----------------------------------------------------
# Execution
#------------------------------------------------------
log_filename = 'etl_maf_part1.log'
log_name = 'etl_maf_part1'
log = configure_logging(log_name, log_filename)
log.info('start maf part1 pipeline')
pm = process_manager.ProcessManager(max_workers=20,
                                    db='maf1.db',
                                    table='task_queue_status',
                                    log=log)
for index, row in data_library.iterrows():
    inputfilename = row['filename']
    outputfilename = oncotator_input_files_dest + row[
        'unique_filename'].replace(".maf", ".txt")
    # transform
    future = pm.submit(transform.generate_oncotator_inputfiles, project_id,
                       bucket_name, inputfilename, outputfilename,
                       oncotator_columns)
    time.sleep(0.2)
pm.start()
log.info('finished maf part1 pipeline')
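
configure_logging(log_name, log_filename) is used in several of these examples but defined elsewhere in the project; a minimal sketch, assuming it simply returns a named logger with a file handler attached:

import logging

def configure_logging(log_name, log_filename):
    """Illustration only: a named logger that writes to the given file."""
    logger = logging.getLogger(log_name)
    logger.setLevel(logging.DEBUG)
    handler = logging.FileHandler(log_filename)
    handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    return logger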