Example #1
0
def main(
    in_corpus,
    concept_field,
    cat_field,
    out_indices,
    out_cat_indices,
    out_raw2lemma,
    out_cat_raw2lemma,
):
    """Extract keyword and category concepts from a corpus and save them.

    Two independent ``ml.ConceptExtractor`` instances are populated from the
    same corpus -- one reading ``concept_field``, the other ``cat_field`` --
    and each one is written out through its ``to_jsons`` method.

    Args:
        in_corpus: Corpus handed to ``ConceptExtractor.from_corpus``.
        concept_field: Name of the corpus field read for keywords.
        cat_field: Name of the corpus field read for categories.
        out_indices: Destination for the keyword indices.
        out_cat_indices: Destination for the category indices.
        out_raw2lemma: Destination for the keyword raw2lemma output.
        out_cat_raw2lemma: Destination for the category raw2lemma output.
    """
    LOG.info(f"Corpus: {in_corpus}")
    LOG.info(f"Keyword Field: {concept_field}")
    LOG.info(f"Category Field: {cat_field}")

    # Keyword pass.
    keyword_extractor = ml.ConceptExtractor()
    keyword_extractor.from_corpus(in_corpus, concept_field)
    LOG.info(f"Output keyword indices: {out_indices}")
    LOG.info(f"Output keyword raw2lemma: {out_raw2lemma}")
    keyword_extractor.to_jsons(out_indices, out_raw2lemma)

    # Category pass: same corpus, different field, separate extractor.
    LOG.info("Extracting categories.")
    category_extractor = ml.ConceptExtractor()
    category_extractor.from_corpus(in_corpus, cat_field)
    LOG.info(f"Output category indices: {out_cat_indices}")
    LOG.info(f"Output category raw2lemma: {out_cat_raw2lemma}")
    category_extractor.to_jsons(out_cat_indices, out_cat_raw2lemma)
Example #2
0
def process(in_corpus, out_dir, abstract_field, concept_field, term_types,
            batch_size, n_threads):
    """Run feature and concept extraction on a corpus, writing into out_dir.

    Args:
        in_corpus: Corpus passed to both extractors.
        out_dir: Output directory; created (with parents) when missing.
            It must support ``mkdir`` and the ``/`` operator.
        abstract_field: Field forwarded to the feature extractor.
        concept_field: Field forwarded to the concept extractor.
        term_types: Forwarded to ``from_corpus_to_jsonlines``.
        batch_size: Forwarded to ``from_corpus_to_jsonlines``.
        n_threads: Forwarded to ``from_corpus_to_jsonlines``.

    Returns:
        The ``(FeatureExtractor, ConceptExtractor)`` pair that was used.
    """
    out_dir.mkdir(exist_ok=True, parents=True)

    # All three outputs live directly under out_dir.
    out_features, out_indices, out_raw2lemma = (
        out_dir / file_name for file_name in (FEATURES, INDICES, RAW2LEMMA)
    )

    feature_extractor = ml.FeatureExtractor()
    feature_extractor.from_corpus_to_jsonlines(
        in_corpus,
        out_features,
        abstract_field,
        term_types,
        batch_size,
        n_threads,
    )

    concept_extractor = ml.ConceptExtractor()
    concept_extractor.from_corpus(in_corpus, concept_field)
    concept_extractor.to_jsons(out_indices, out_raw2lemma)

    return feature_extractor, concept_extractor
Example #3
0
def main(
    experiment_name,
    out_store,
    out_cat_preds,
    gt_batch_size,
    limit=None,
):
    """Predict on the test split of an experiment and build ground truth.

    Inputs are read from locations derived from ``experiment_name`` under
    ``data/interim/`` and ``models/`` (hard-coded, see the TODOs below).

    Args:
        experiment_name: Name used to build every input path.
        out_store: Store passed to ``make_predictions`` and
            ``create_ground_truth``.
        out_cat_preds: Target of ``np.save`` for the category predictions.
        gt_batch_size: Batch size forwarded to ``create_ground_truth``.
        limit: When not ``None``, only the first ``limit`` test indices are
            used to select feature rows for prediction.
    """
    LOG.info("Loading test data and models.")
    # TODO: paths should be put into main function
    test_inds = np.load(f"data/interim/{experiment_name}/test_inds.npy")
    train_inds = np.load(f"data/interim/{experiment_name}/train_inds.npy")
    feature_matrix = joblib.load(
        f"data/interim/{experiment_name}/feature_matrix.jbl"
    )
    in_cat_models = Path(f"models/{experiment_name}/categories/models/")
    in_kwd_models = Path(f"models/{experiment_name}/keywords/models/")

    if limit is not None:
        LOG.info(f"Limiting to {limit} test records.")
    csc_features = feature_matrix.tocsc()
    # TODO: How does this affect indices?
    # NOTE(review): only the feature rows are truncated here; the full
    # test_inds is still handed to create_ground_truth below -- confirm
    # that this is intended when limiting.
    selected_rows = test_inds if limit is None else test_inds[0:limit]
    test_features = csc_features[selected_rows, :]

    LOG.info("Making predictions.")
    _categories, concepts_with_classifiers, category_predictions = (
        make_predictions(
            in_cat_models,
            in_kwd_models,
            test_features,
            out_store,
        )
    )
    np.save(out_cat_preds, category_predictions)

    LOG.info("Creating ground truth data.")
    # TODO: these paths should be provided as args
    kwd_ext = ml.ConceptExtractor()
    kwd_ext.from_jsons(
        f"data/interim/{experiment_name}/kwd_indices.json",
        f"models/{experiment_name}/kwd_raw2lemma.json",
    )
    create_ground_truth(
        store=out_store,
        dataset="ground_truth",
        kwd_ext=kwd_ext,
        concepts_with_classifiers=concepts_with_classifiers,
        batch_size=gt_batch_size,
        train_inds=train_inds,
        test_inds=test_inds,
    )
Example #4
0
def main(
    in_feature_matrix,
    in_ind_train,
    in_ind_test,
    in_cat_indices,
    in_cat_raw2lemma,
    in_config,
    out_dir,
):
    """Train category classifiers with a grid-searched SGD classifier.

    Args:
        in_feature_matrix: Path loaded with ``joblib.load``.
        in_ind_train: Path to the training indices (``np.load``).
        in_ind_test: Path to the test indices (``np.load``).
        in_cat_indices: Category indices JSON for the extractor.
        in_cat_raw2lemma: Category raw2lemma JSON for the extractor.
        in_config: YAML config; must contain ``min_concept_occurrence``.
        out_dir: Base output directory; models go to
            ``out_dir / OUT_MODELS_DIR``.
    """
    with open(in_config, "r") as config_file:
        config = yaml.safe_load(config_file)

    features = joblib.load(in_feature_matrix)
    train_indices = np.load(in_ind_train)
    test_indices = np.load(in_ind_test)

    LOG.info(
        f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}."
    )
    cat_ext = ml.ConceptExtractor()
    cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma)

    # Candidate weightings: the positive class (1) is up-weighted against
    # the negative class (0), which stays at 1.
    class_weight_options = [
        {1: positive_weight, 0: 1} for positive_weight in (10, 5, 20)
    ]
    # requires loss function with predict_proba
    # NOTE(review): newer scikit-learn releases spell this loss "log_loss";
    # confirm against the pinned version before upgrading.
    paramgrid = {
        "alpha": [0.01, 0.001, 0.0001],
        "class_weight": class_weight_options,
        "max_iter": [1],
        "loss": ["log"],
    }
    # requires GridSearchCV
    clf = GridSearchCV(
        estimator=SGDClassifier(),
        param_grid=paramgrid,
        scoring="f1",
    )
    out_models = out_dir / OUT_MODELS_DIR
    trainer = ConceptTrainer(cat_ext, clf)

    trainer.train_concepts(
        features,
        train_indices,
        test_indices,
        out_models,
        config["min_concept_occurrence"],
    )
    LOG.info("Complete.")
Example #5
0
def main(
    in_feature_matrix,
    in_ind_train,
    in_ind_test,
    in_kwd_indices,
    in_cat_indices,
    in_kwd_raw2lemma,
    in_cat_raw2lemma,
    in_config,
    out_dir,
    topics=True,
):
    """Train keyword classifiers, optionally once per topic, then generally.

    Args:
        in_feature_matrix: Path loaded with ``joblib.load``.
        in_ind_train: Path to the training indices (``np.load``).
        in_ind_test: Path to the test indices (``np.load``).
        in_kwd_indices: Keyword indices JSON for the keyword extractor.
        in_cat_indices: Category indices JSON for the category extractor.
        in_kwd_raw2lemma: Keyword raw2lemma JSON.
        in_cat_raw2lemma: Category raw2lemma JSON.
        in_config: YAML config; must contain ``min_concept_occurrence``.
        out_dir: Base output directory; models go to
            ``out_dir / OUT_MODELS_DIR``.
        topics: When truthy, one training run is made per entry of the
            category extractor's ``concept_index_mapping`` before the
            general run. The general run always happens.
    """
    with open(in_config, "r") as config_file:
        config = yaml.safe_load(config_file)

    features = joblib.load(in_feature_matrix)
    train_indices = np.load(in_ind_train)
    test_indices = np.load(in_ind_test)

    LOG.info(
        f"Loading keyword extractor from {in_kwd_indices} and {in_kwd_raw2lemma}."
    )
    kwd_ext = ml.ConceptExtractor()
    kwd_ext.from_jsons(in_kwd_indices, in_kwd_raw2lemma)

    LOG.info(
        f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}."
    )
    cat_ext = ml.ConceptExtractor()
    cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma)

    # Candidate weightings: the positive class (1) is up-weighted against
    # the negative class (0), which stays at 1.
    class_weight_options = [
        {1: positive_weight, 0: 1} for positive_weight in (10, 5, 20)
    ]
    # requires loss function with predict_proba
    # NOTE(review): newer scikit-learn releases spell this loss "log_loss";
    # confirm against the pinned version before upgrading.
    paramgrid = {
        "alpha": [0.01, 0.001, 0.0001],
        "class_weight": class_weight_options,
        "max_iter": [5],
        "loss": ["log"],
    }
    # requires GridSearchCV
    clf = GridSearchCV(
        estimator=SGDClassifier(),
        param_grid=paramgrid,
        scoring="f1",
        n_jobs=-1,
    )
    out_models = out_dir / OUT_MODELS_DIR
    trainer = tr.ConceptTrainer(kwd_ext, clf)
    doc_topic_indices = cat_ext.concept_index_mapping

    if topics:
        LOG.info(
            f"Training one set for each of {len(doc_topic_indices)} topics divisions."
        )
        # One training run per topic, using that topic's mapping entry.
        for topic_name, topic_doc_index in doc_topic_indices.items():
            trainer.train_concepts(
                features,
                train_indices,
                test_indices,
                out_models,
                config["min_concept_occurrence"],
                topic_name,
                topic_doc_index,
            )

    # The general run is made regardless of the ``topics`` flag.
    LOG.info("Training one general set")
    trainer.train_concepts(
        features,
        train_indices,
        test_indices,
        out_models,
        config["min_concept_occurrence"],
    )
    LOG.info("Complete.")