Example #1
0
def test_DropDFPipe():
    """Smoke test: DropDFPipe wrapped around an embedding pipe can serve a slice."""
    source = loader.LoaderPipe()
    source.load(query, filters_dict)
    metaphlan_cols = loader.all_columns_metaphlan()
    example_pipe = dataset.ExampleEmbeddingPipe(source, examples_columns=metaphlan_cols)
    drop_pipe = dataset.DropDFPipe(example_pipe)
    drop_pipe[0:10]  # slicing must not raise; the result is intentionally unused
Example #2
0
def test_ExampleEmbeddingPipe():
    """Smoke test: ExampleEmbeddingPipe can produce a batch from a loaded pipe."""
    source = loader.LoaderPipe()
    source.load(query, filters_dict)
    metaphlan_cols = loader.all_columns_metaphlan()
    example_pipe = dataset.ExampleEmbeddingPipe(source, examples_columns=metaphlan_cols)
    batch = example_pipe[0:10]  # slicing must not raise
Example #3
0
def test_MetagenomePipe():
    """End-to-end pipeline smoke test.

    Chains loader -> example embedding -> label embedding -> drop, then
    checks that a batch sliced from the final stage still exposes the
    expected columns.
    """
    source = loader.LoaderPipe()
    source.load(query, filters_dict)
    metaphlan_cols = loader.all_columns_metaphlan()
    with_examples = dataset.ExampleEmbeddingPipe(source, examples_columns=metaphlan_cols)
    with_labels = dataset.LabelEmbeddingPipe(with_examples, labels_column='label')
    final_stage = dataset.DropDFPipe(with_labels)
    batch = final_stage[20:30]
    expected_columns = ('SampleID', 'label', 'examples')
    for column in expected_columns:
        assert column in batch.columns
def test_save_intermediate_file():
    """save_intermediate_file writes a labeled CSV: one column per
    interpolated feature plus a 'label' column.

    Verifies row count, column count, and presence of 'label'.
    """
    labels_dict = loader.get_file_labels(query, filters_dict, keys)
    interpolate_columns = loader.all_columns_metaphlan()
    # This is a file path, not a directory, despite the original name.
    out_path = os.path.join(butterfree.test_dir,
                            'save_intermediate_file_test')
    # Remove any leftover file from a previous run. Catch only OSError
    # (matching the cleanup style used elsewhere in this file) rather than
    # a bare except, which would also swallow KeyboardInterrupt/SystemExit.
    try:
        os.remove(out_path)
    except OSError:
        pass
    loader.save_intermediate_file(labels_dict, out_path, interpolate_columns)
    df = pd.read_csv(out_path, index_col=0)
    assert len(df) > 3000  # presumably the fixture yields >3000 samples — TODO confirm
    assert len(df.columns) == len(interpolate_columns) + 1  # features + 'label'
    assert 'label' in df.columns
Example #5
0
def make_test_examples():

    logging.info("Downloading data from database and file system.")
    keys = ['body_site', 'title', 'path']
    query = "SELECT {0} FROM annotations where annotation = 'metaphlan_bugs_list';".format(
        ', '.join(keys))
    diseases = loader.get_unique_phenotypes('disease')
    filters_dict = {
        disease:
        "SELECT sampleID, title, body_site FROM phenotypes WHERE disease LIKE '%{0}%'"
        .format(disease)
        for disease in diseases
    }

    ladder = loader.LoaderPipe()
    ladder.load(query, filters_dict)
    cols = loader.all_columns_metaphlan()
    exembedder = dataset.ExampleEmbeddingPipe(ladder, examples_columns=cols)
    laembedder = dataset.LabelEmbeddingPipe(exembedder, labels_column='label')

    dropped = Message(laembedder[0:len(laembedder)].tensors())
    dropped['SampleID'] = ladder[0:len(ladder)]['SampleID']

    logging.info("Saving data")

    try:  # Remove file if it exists
        os.remove('SampleID.csv')
    except OSError:
        pass
    try:
        os.remove('examples.torch')
    except OSError:
        pass
    try:
        os.remove('label.torch')
    except OSError:
        pass

    dropped[['SampleID']].to_csv(
        'SampleID.csv', index=False
    )  # TODO: Make this the default way that Messages are saved.
    torch.save(dropped['examples'], 'examples.torch')
    torch.save(dropped['label'], 'label.torch')
    query = "SELECT {0} FROM annotations where annotation = 'metaphlan_bugs_list';".format(
        ', '.join(keys))
    exact_filters_dict = {
        disease:
        "SELECT sampleID, title, body_site FROM phenotypes WHERE disease = '{0}'"
        .format(disease)
        for disease in exact_study_labels
    }
    approximate_filters_dict = {
        disease:
        "SELECT sampleID, title, body_site FROM phenotypes WHERE disease LIKE '%{0}%'"
        .format(disease)
        for disease in approximate_study_labels
    }
    filters_dict = {**exact_filters_dict, **approximate_filters_dict}
    cols = loader.all_columns_metaphlan()
    loaderpipe = loader.LoaderPipe(interpolate_columns=cols)
    loaderpipe.load(query, filters_dict)
    example_embedder = dataset.ExampleEmbeddingPipe(loaderpipe,
                                                    examples_columns=cols)
    labels_embedder = dataset.LabelEmbeddingPipe(example_embedder,
                                                 labels_column='label')
    labels_dict = labels_embedder.labels_dict

    dropped = Message(labels_embedder[0:len(labels_embedder)].tensors())
    dropped['named_label'] = loaderpipe['label']
    dropped['SampleID'] = loaderpipe[0:len(loaderpipe)]['SampleID']
    dropped['label_index'] = torch.LongTensor([
        dataset.vector_to_single_index(x) for x in dropped['label']
    ])  # For cross entropy loss for single classification
    shuffle_dropped = ShufflerPipe(dropped)