def main(in_features, in_config, out_feature_dir, out_vectorizer):
    """Vectorize extracted features and persist the results.

    Reads the YAML config, loads features from ``in_features``, weights and
    limits them according to the config, fits a ``DictVectorizer`` on the
    result, and then writes out the vectorizer, the feature matrix, and the
    row indices of a 90/10 train/test split.

    Args:
        in_features: location of the features file handed to
            ``FeatureExtractor.from_jsonlines``.
        in_config: path to a YAML file providing the ``weights``,
            ``min_feature_occurrence`` and ``max_feature_occurrence`` keys.
        out_feature_dir: directory receiving the feature matrix and the
            ``train_inds.npy`` / ``test_inds.npy`` files. It is combined
            with file names using ``/``, so it must support that operator.
        out_vectorizer: destination for the fitted vectorizer.
    """
    with open(in_config, "r") as config_file:
        settings = yaml.safe_load(config_file)

    LOG.info(f"Loading features from {in_features}.")
    extractor = ml.FeatureExtractor()
    extractor.from_jsonlines(in_features)
    weighted = extractor.weight_terms(settings["weights"])
    min_count = settings["min_feature_occurrence"]
    max_count = settings["max_feature_occurrence"]
    limited = extractor.limit_features(weighted, min_count, max_count)

    vectorizer = DictVectorizer()
    matrix = vectorizer.fit_transform(limited)

    matrix_path = out_feature_dir / FEATURE_MATRIX
    LOG.info(f"Outputting vectorizer to {out_vectorizer}.")
    joblib.dump(vectorizer, out_vectorizer)
    LOG.info(f"Outputting feature matrix to {matrix_path}.")
    joblib.dump(matrix, matrix_path)

    # Split the matrix together with its row indices; only the index halves
    # (positions 2 and 3 of the result) are kept and saved.
    row_ids = np.array(range(matrix.shape[0]))
    split = train_test_split(
        matrix,
        row_ids,
        test_size=0.10,
        random_state=42,
    )
    ind_train, ind_test = split[2], split[3]
    np.save(out_feature_dir / "train_inds.npy", ind_train)
    np.save(out_feature_dir / "test_inds.npy", ind_test)
Example #2
0
    def vectorize(
        self,
        texts: List[str],
        weights: Dict[str, int],
        batch_size: int = 1000,
        n_threads: int = cpu_count(),
    ) -> Tuple[List[Dict[str, str]], np.array]:
        """
        Transform texts into a matrix of features.

        The texts are run through ``ml.extract_features_from_abstracts``,
        which writes to a temporary file; the features are loaded back from
        that file, weighted, and passed to ``self.vectorizer.transform``.

        Args:
            texts: texts to transform
            weights: how to weight different types of features
            batch_size: batch size forwarded to feature extraction
            n_threads: number of threads forwarded to feature extraction

        Returns:
            features: the ``features`` attribute of the feature extractor
                after the extracted features have been loaded
            feature_matrix: matrix representation of features for each document

        Raises:
            AssertionError: if ``self.vectorizer`` is None.
        """
        # Explicit check instead of ``assert``: asserts are stripped under
        # ``python -O``, and the old message expression (``LOG.exception``)
        # evaluated to None. AssertionError is kept so existing callers that
        # catch it keep working.
        if self.vectorizer is None:
            message = "Must initialize vectorizer."
            LOG.error(message)
            raise AssertionError(message)

        fe = ml.FeatureExtractor()
        # NOTE(review): the temp file is handed over by name while it is
        # still open here; per the tempfile docs, reopening by name is not
        # portable to Windows -- confirm the target platforms.
        with NamedTemporaryFile() as tmp_features_loc:
            tmp_features = tmp_features_loc.name
            ml.extract_features_from_abstracts(
                texts, tmp_features, batch_size, n_threads
            )
            # Load inside the ``with`` block: the file is removed on exit.
            fe.from_jsonlines(tmp_features)
        weighted_features = fe.weight_terms(weights)
        feature_matrix = self.vectorizer.transform(weighted_features)
        return fe.features, feature_matrix
Example #3
0
def topic_model(topic_model_dir, processed_dir, topic_weights, min_feature,
                max_feature):
    """Fit a 3-component LDA model on processed features and save it.

    Loads features from ``processed_dir / FEATURES``, weights and limits
    them, vectorizes the result with a ``DictVectorizer``, fits a
    ``LatentDirichletAllocation`` model, and dumps the vectorizer, feature
    matrix, model, and transformed output into ``topic_model_dir``.

    Args:
        topic_model_dir: output directory; created if missing (its parent
            is not created here).
        processed_dir: directory holding the processed features file.
        topic_weights: weights passed to ``FeatureExtractor.weight_terms``.
        min_feature: lower bound passed to ``limit_features``.
        max_feature: upper bound passed to ``limit_features``.

    Returns:
        The output of ``fit_transform`` on the vectorized features.
    """
    topic_model_dir.mkdir(exist_ok=True)

    extractor = ml.FeatureExtractor()
    extractor.from_jsonlines(processed_dir / FEATURES)
    weighted = extractor.weight_terms(topic_weights)
    limited = extractor.limit_features(weighted, min_feature, max_feature)

    vectorizer = DictVectorizer()
    matrix = vectorizer.fit_transform(limited)

    lda_params = {
        "n_components": 3,
        "max_iter": 5,
        "learning_method": "online",
        "learning_offset": 50.0,
        "random_state": 0,
    }
    lda = LatentDirichletAllocation(**lda_params)
    doc_topic_distr = lda.fit_transform(matrix)

    # Persist every artifact, in the same order as before.
    artifacts = (
        (vectorizer, TOPIC_VECTORIZER),
        (matrix, TOPIC_FEATURE_MATRIX),
        (lda, TOPIC_MODEL),
        (doc_topic_distr, DOC_TOPIC_DISTR),
    )
    for artifact, filename in artifacts:
        joblib.dump(artifact, topic_model_dir / filename)

    return doc_topic_distr
Example #4
0
def main(in_corpus, abstract_field, out_features, batch_size, n_threads):
    """Extract features from a corpus and write them out.

    Logs the run parameters and delegates the work to
    ``FeatureExtractor.from_corpus_to_jsonlines``.

    Args:
        in_corpus: location of the input corpus.
        abstract_field: name of the corpus field to extract features from.
        out_features: destination for the processed features.
        batch_size: batch size forwarded to the extractor.
        n_threads: thread count forwarded to the extractor.
    """
    LOG.info(f"Extracting features from corpus at {in_corpus}.")
    LOG.info(f"Using field: {abstract_field}.")
    extractor = ml.FeatureExtractor()
    LOG.info(f"Using batch_size {batch_size} with {n_threads} threads.")
    LOG.info(f"Outputting processed features to {out_features}.")

    # Positional order matters: corpus, output, field, batch size, threads.
    extraction_args = (
        in_corpus,
        out_features,
        abstract_field,
        batch_size,
        n_threads,
    )
    extractor.from_corpus_to_jsonlines(*extraction_args)
Example #5
0
def process(in_corpus, out_dir, abstract_field, concept_field, term_types,
            batch_size, n_threads):
    """Extract features and concepts from a corpus into ``out_dir``.

    Creates ``out_dir`` (including parents), writes features via a
    ``FeatureExtractor``, then loads concepts with a ``ConceptExtractor``
    and writes them via ``to_jsons``.

    Args:
        in_corpus: location of the input corpus.
        out_dir: output directory for the features, indices and raw2lemma
            files.
        abstract_field: corpus field used for feature extraction.
        concept_field: corpus field used for concept extraction.
        term_types: term types forwarded to the feature extractor.
        batch_size: batch size forwarded to the feature extractor.
        n_threads: thread count forwarded to the feature extractor.

    Returns:
        A ``(feature_extractor, concept_extractor)`` tuple.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    features_path, indices_path, raw2lemma_path = (
        out_dir / filename for filename in (FEATURES, INDICES, RAW2LEMMA)
    )

    feature_extractor = ml.FeatureExtractor()
    feature_extractor.from_corpus_to_jsonlines(
        in_corpus,
        features_path,
        abstract_field,
        term_types,
        batch_size,
        n_threads,
    )

    concept_extractor = ml.ConceptExtractor()
    concept_extractor.from_corpus(in_corpus, concept_field)
    concept_extractor.to_jsons(indices_path, raw2lemma_path)

    return feature_extractor, concept_extractor