Esempio n. 1
0
def get_colossus_tantalus_data(directory):
    """Load the QC tables under ``directory`` and concatenate them per table.

    Args:
        directory: Location handed unchanged to ``load_qc_data``.

    Returns:
        A ``collections.defaultdict`` keyed by table name. Each value is the
        result of ``pd.concat(..., ignore_index=True)`` over the entries
        collected for that table.
    """
    tables = collections.defaultdict(list)

    # Collect the loaded entries under their table name.
    for name, frame in load_qc_data(directory).items():
        tables[name].append(frame)

    # Swap every list of entries for one concatenated object with a fresh
    # index. The comprehension is fully built before the mapping is updated.
    tables.update({
        name: pd.concat(frames, ignore_index=True)
        for name, frames in tables.items()
    })
    return tables
Esempio n. 2
0
def load_data(directory, dashboard_id, host, port):
    """Load the QC tables under ``directory`` and push one index per data type.

    For every entry of ``constants.DATA_TYPES`` the module-level function
    ``get_<index_type>_data`` is called with the concatenated tables, and its
    result is passed to ``load_records`` under the index name
    ``<dashboard_id lowercased>_<index_type>``.

    Args:
        directory: Location handed unchanged to ``load_qc_data``.
        dashboard_id: String identifier; lowercased to build index names.
        host: Passed through to ``load_records``.
        port: Passed through to ``load_records``.

    Raises:
        NameError: If no ``get_<index_type>_data`` name exists in this module.
    """
    logger.info("LOADING DATA: " + dashboard_id)

    hmmcopy_data = collections.defaultdict(list)

    for table_name, data in load_qc_data(directory).items():
        hmmcopy_data[table_name].append(data)
    for table_name in hmmcopy_data:
        hmmcopy_data[table_name] = pd.concat(hmmcopy_data[table_name],
                                             ignore_index=True)

    logger.info(f'loading hmmcopy data with tables {hmmcopy_data.keys()}')

    for index_type in constants.DATA_TYPES:
        index_name = f"{dashboard_id.lower()}_{index_type}"
        logger.info(f"Index {index_name}")

        # Resolve the getter by name instead of eval()-ing a built string:
        # same module-level lookup, but nothing in index_type is ever
        # executed as code.
        getter_name = f"get_{index_type}_data"
        get_data = globals().get(getter_name)
        if get_data is None:
            # Keep the exception type the eval()-based lookup produced.
            raise NameError(f"name '{getter_name}' is not defined")

        data = get_data(hmmcopy_data)

        logger.info(f"dataframe for {index_name} has shape {data.shape}")
        load_records(data, index_name, host, port)
Esempio n. 3
0
def test_load_local_qc_data(results_dir):
    """Load the QC tables from ``results_dir`` and hand them to ``test_qc_data``."""
    test_qc_data(load_qc_data(results_dir))
Esempio n. 4
0
def load_ticket(
    jira_ticket,
    ip_address,
    local_cache_directory=None,
    ticket_directory=None,
    description=None,
    title=None,
    sample_id=None,
    cell_subset_count=None,
    cell_ids=None,
    experimental_condition_override=None,
):
    """Load the hmmcopy QC results of one ticket into Elasticsearch indices.

    The results are read with ``load_qc_data`` either from
    ``<local_cache_directory>/<jira_ticket>`` (after ``cache_qc_results`` has
    been called) or from every directory in ``ticket_directory``. One index
    named ``<jira_ticket lowercased>_<type>`` is written for each of ``qc``,
    ``segs``, ``bins`` and ``gc_bias``, then the published dashboard record
    is loaded through ``AnalysisLoader``.

    Args:
        jira_ticket: Ticket identifier; used for index names and written to
            the ``sample_id`` column of every loaded table.
        ip_address: Elasticsearch host.
        local_cache_directory: Cache root; mutually exclusive with
            ``ticket_directory``.
        ticket_directory: Sized iterable of result directories; mutually
            exclusive with ``local_cache_directory``.
        description: Accepted but not used in this function.
        title: Accepted but not used in this function.
        sample_id: When given, tables are filtered with
            ``filter_by_sample_id``.
        cell_subset_count: When given, only the first N ``cell_id`` values of
            the ``annotation_metrics`` table are kept.
        cell_ids: Explicit cell ids to keep; an empty collection means
            "no filter". Cannot be combined with ``cell_subset_count``.
        experimental_condition_override: When given, replaces the
            ``experimental_condition`` column of every table that has one.

    Returns:
        None. Also returns early, after logging a message, when both
        ``cell_subset_count`` and ``cell_ids`` are supplied.

    Raises:
        ValueError: If both or neither of ``local_cache_directory`` and
            ``ticket_directory`` are supplied.
    """
    # Both parameters default to None, so guard before calling len(); the
    # declared defaults must reach the ValueError below, not a TypeError.
    has_ticket_directory = (
        ticket_directory is not None and len(ticket_directory) > 0)

    if (local_cache_directory is not None) == has_ticket_directory:
        raise ValueError(
            'must specify one of local_cache_directory or ticket_directory')

    # Treat an empty collection of cell ids the same as "not given".
    if cell_ids is not None and len(cell_ids) == 0:
        cell_ids = None

    if cell_subset_count and cell_ids is not None:
        logging.info(
            'Sorry, --cell_subset_count and --cell_ids arguments cannot be used together')
        return

    logging.info(f'jira ticket {jira_ticket}')

    if local_cache_directory is not None:
        cache_qc_results(jira_ticket, local_cache_directory)
        ticket_directory = [os.path.join(local_cache_directory, jira_ticket)]

    # Gather each table from every directory, then collapse to one object
    # per table with a fresh index.
    hmmcopy_data = collections.defaultdict(list)
    for d in ticket_directory:
        for table_name, data in load_qc_data(d).items():
            hmmcopy_data[table_name].append(data)
    for table_name in hmmcopy_data:
        hmmcopy_data[table_name] = pd.concat(
            hmmcopy_data[table_name], ignore_index=True)

    if experimental_condition_override is not None:
        for data in hmmcopy_data.values():
            if 'experimental_condition' in data:
                data['experimental_condition'] = experimental_condition_override

    logging.info(f'loading hmmcopy data with tables {hmmcopy_data.keys()}')

    if sample_id is not None:
        logging.info(f'filtering hmmcopy data by sample={sample_id}')
        filter_by_sample_id(hmmcopy_data, sample_id)

    elasticsearch_client = ElasticsearchClient(host=ip_address)

    if cell_subset_count is not None:
        cell_ids = hmmcopy_data['annotation_metrics']['cell_id'].iloc[:cell_subset_count].values

    index_get_data = {
        "qc": get_qc_data,
        "segs": get_segs_data,
        "bins": get_bins_data,
        "gc_bias": get_gc_bias_data,
    }

    for index_type, get_data in index_get_data.items():
        index_name = f"{jira_ticket.lower()}_{index_type}"
        logging.info(f"Index {index_name}")

        init_load(elasticsearch_client, index_name,)

        data = get_data(hmmcopy_data)

        # Subset cells; the gc_bias table is deliberately left unfiltered.
        if cell_ids is not None and index_type != 'gc_bias':
            data = data[data['cell_id'].isin(cell_ids)]

        logging.info(f"dataframe for {index_name} has shape {data.shape}")

        data['caller'] = caller_map[index_type]
        data['sample_id'] = jira_ticket

        load_index(elasticsearch_client, index_name, data,)

    logging.info(
        f"loading published dashboard record {jira_ticket}")

    AnalysisLoader().load_data(jira_ticket, ip_address, 9200)
Esempio n. 5
0
def test_load_local_qc_data(results_dir):
    """Load the QC tables from ``results_dir``, check them, and log success.

    The tables returned by ``load_qc_data`` are passed to ``test_qc_data``;
    the success message is logged only after that call returns.
    """
    test_qc_data(load_qc_data(results_dir))
    logging.info(f'successfully loaded results from {results_dir}')