Ejemplo n.º 1
0
def predict_links(
    input_path: str,
    num_walks: int,
    walk_length: int,
    dimension: int,
    window_size: int,
) -> Tuple[float, float]:
    """Train a Gat2Vec embedding and score the labelled disease-gene pairs.

    A model is trained with the given walk/embedding hyper-parameters, every
    row of the labels file is scored with ``calculate_prob`` and the scores
    are compared against the ``label`` column.

    Args:
        input_path: Path handed to ``Gat2Vec`` as both of its first two
            positional arguments (presumably input and output directory --
            confirm against ``Gat2Vec``) and used to locate the labels file.
        num_walks: First positional argument of ``train_gat2vec``.
        walk_length: Second positional argument of ``train_gat2vec``.
        dimension: Third positional argument of ``train_gat2vec``.
        window_size: Fourth positional argument of ``train_gat2vec``.

    Returns:
        ``(auc, aps)``: the ROC AUC score and the average precision score of
        the computed probabilities against the labels.
    """
    trainer = Gat2Vec(
        input_path,
        input_path,
        label=False,
        tr=gat2vec_config.training_ratio,
    )
    trained = trainer.train_gat2vec(
        num_walks, walk_length, dimension, window_size, output=True,
    )
    embedding = get_embeddingDF(trained)

    # Tab-separated, header-less file with three columns.
    labels_file = gat2vec_paths.get_labels_path(input_path)
    label_table = pd.read_csv(
        labels_file,
        sep='\t',
        header=None,
        names=['disease', 'gene', 'label'],
    )

    # Only the first row's disease entry is read; it is reused for every gene.
    disease = label_table.iloc[0, 0]
    scores = []
    for gene in label_table['gene']:
        scores.append(calculate_prob(embedding[disease], embedding[gene]))
    label_table['prob'] = pd.Series(scores)

    truth = label_table['label']
    predicted = label_table['prob']
    return (
        roc_auc_score(truth, predicted),
        average_precision_score(truth, predicted),
    )
Ejemplo n.º 2
0
    def get_prediction_probs_for_entire_set(self, model):
        """Fit a one-vs-rest logistic regression and return its probabilities.

        The embedding obtained from ``model`` is restricted to the rows in
        ``self.label_ind``; the classifier is fitted on those rows with
        ``self.labels`` and then asked for probabilities on the very same
        rows (no train/test split happens here).

        Args:
            model: Passed unchanged to ``parsers.get_embeddingDF``.

        Returns:
            The output of ``predict_proba`` for the selected embedding rows.
        """
        features = parsers.get_embeddingDF(model)[self.label_ind, :]

        classifier = OneVsRestClassifier(
            linear_model.LogisticRegression(solver='lbfgs'))
        classifier.fit(features, self.labels)  # for multi-class classification

        probabilities = classifier.predict_proba(features)
        # The logged ROC AUC is computed from column index 1 only.
        second_column = probabilities[:, 1]
        logger.debug('ROC: %.2f', roc_auc_score(self.labels, second_column))

        return probabilities
Ejemplo n.º 3
0
    def evaluate(self, model, label=False, evaluation_scheme="tr"):
        """Evaluate an embedding with the classifier from ``get_classifier``.

        Args:
            model: Passed to ``parsers.get_embeddingDF`` when ``label`` is
                falsy. When ``label`` is truthy it is replaced, per entry of
                ``self.TR``, by ``paths.get_embedding_path_wl(...)``.
            label: When truthy, the embedding is reloaded for every entry of
                ``self.TR`` instead of being read once from ``model``.
            evaluation_scheme: ``"cv"`` runs ``evaluate_cv`` with ``5`` as its
                third argument; ``"tr"`` runs ``evaluate_tr`` once per entry
                of ``self.TR``. Any other value is only accepted when
                ``label`` is truthy (it then behaves like ``"tr"``).

        Returns:
            A DataFrame built from the collected results, grouped by the
            ``"TR"`` column and averaged.

        Raises:
            ValueError: If ``evaluation_scheme`` is neither ``"cv"`` nor
                ``"tr"`` while ``label`` is falsy. Previously this case ran
                to the end and failed with ``UnboundLocalError``.
        """
        # Fail fast: with an unknown scheme and no label mode, neither branch
        # below would ever assign ``results``.
        if evaluation_scheme not in ("cv", "tr") and not label:
            raise ValueError(
                "Unknown evaluation_scheme {!r}; expected 'cv' or 'tr'".format(
                    evaluation_scheme))

        # Placeholder kept from the original: this is the value that reaches
        # evaluate_cv / evaluate_tr when no embedding gets loaded below.
        # NOTE(review): confirm those methods tolerate it.
        embedding = 0
        clf = self.get_classifier()

        if not label:
            embedding = parsers.get_embeddingDF(model)

        if evaluation_scheme == "cv":
            results = self.evaluate_cv(clf, embedding, 5)
        else:
            # Reached for "tr", or for any scheme other than "cv" when
            # ``label`` is truthy.
            results = defaultdict(list)
            for tr in self.TR:
                logger.debug("TR ... %s", tr)
                if label:
                    model = paths.get_embedding_path_wl(
                        self.dataset_dir, self.output_dir, tr)
                    if isinstance(model, str):
                        embedding = parsers.get_embeddingDF(model)
                # NOTE(review): ``update`` replaces existing keys instead of
                # extending their lists -- confirm that results from several
                # training ratios are meant to overwrite each other.
                results.update(self.evaluate_tr(clf, embedding, tr))

        logger.debug("Training Finished")

        df = pd.DataFrame(results)
        # ``axis=0`` is the groupby default; the explicit keyword is
        # deprecated in recent pandas, so it is left out.
        return df.groupby(by="TR").mean()
Ejemplo n.º 4
0
def predict_links_cv(network, dataset, use_dge, num_walks, walk_length,
                     dimension, window_size) -> pd.DataFrame:
    """Repeat cross-validated link prediction ten times and average the scores.

    For each of the 10 repetitions, every label split produced by
    ``network.write_gat2vec_cv_split`` is processed as follows: a Gat2Vec
    model is trained in ``g2v_path``, each gene of the split is scored with
    ``calculate_prob`` against the disease row, and ROC AUC and average
    precision are recorded.

    Args:
        network: Object providing ``write_gat2vec_cv_split`` and
            ``get_index_for_disease``.
        dataset: Mapped through ``dataset_to_disease_abv`` to look up the
            disease identifier; also stored in the ``'dge'`` result column.
        use_dge: Forwarded as ``use_dge_data``; also stored in the ``'eval'``
            result column.
        num_walks: First positional argument of ``train_gat2vec``.
        walk_length: Second positional argument of ``train_gat2vec``.
        dimension: Third positional argument of ``train_gat2vec``.
        window_size: Fourth positional argument of ``train_gat2vec``.

    Returns:
        A DataFrame with the mean of the recorded metrics per
        ``('TR', 'dge', 'eval')`` group, where ``'TR'`` holds the repetition
        index, with the group keys restored as ordinary columns.
    """
    do_id = disease_identifiers[dataset_to_disease_abv(dataset)]
    results = defaultdict(list)

    for repetition in range(10):
        cv_splits = network.write_gat2vec_cv_split(
            home_dir=g2v_path,
            disease_id=do_id,
            filter_pleiotropic_targets=True,
            use_dge_data=use_dge,
        )
        for cv_labels in cv_splits:
            trainer = Gat2Vec(
                input_dir=g2v_path,
                output_dir=g2v_path,
                label=False,
                tr=gat2vec_config.training_ratio,
            )
            trained = trainer.train_gat2vec(
                num_walks, walk_length, dimension, window_size, output=True,
            )
            model_df = get_embeddingDF(trained)

            # One row per (gene, label) entry of the current split.
            labels = pd.DataFrame(
                data=list(cv_labels.items()),
                columns=['gene', 'label'],
            )
            disease_idx = network.get_index_for_disease(do_id)

            scores = []
            for gene_idx in labels['gene']:
                scores.append(
                    calculate_prob(model_df[disease_idx], model_df[gene_idx]))
            labels['prob'] = pd.Series(scores)

            auc = roc_auc_score(labels['label'], labels['prob'])
            aps = average_precision_score(labels['label'], labels['prob'])

            # Same column order as the final frame: TR, dge, eval, auc, aps.
            row = (
                ('TR', repetition),
                ('dge', dataset),
                ('eval', use_dge),
                ('auc', auc),
                ('aps', aps),
            )
            for column, value in row:
                results[column].append(value)
        logger.debug(results)

    summary = pd.DataFrame(results)
    grouped = summary.groupby(by=['TR', 'dge', 'eval'])
    return grouped.mean().reset_index()