def main(in_features, in_config, out_feature_dir, out_vectorizer):
    """Vectorize extracted features and persist the artifacts.

    Loads the weighting/limiting configuration, fits a DictVectorizer on the
    weighted features, and writes the vectorizer, the feature matrix, and a
    90/10 train/test index split to disk.
    """
    with open(in_config, "r") as f0:
        config = yaml.safe_load(f0)

    LOG.info(f"Loading features from {in_features}.")
    fe = ml.FeatureExtractor()
    fe.from_jsonlines(in_features)

    # Weight feature types per the config, then drop features outside the
    # configured occurrence bounds.
    weighted_features = fe.weight_terms(config["weights"])
    limited_features = fe.limit_features(
        weighted_features,
        config["min_feature_occurrence"],
        config["max_feature_occurrence"],
    )

    v = DictVectorizer()
    X = v.fit_transform(limited_features)

    out_feature_matrix = out_feature_dir / FEATURE_MATRIX
    LOG.info(f"Outputting vectorizer to {out_vectorizer}.")
    joblib.dump(v, out_vectorizer)
    LOG.info(f"Outputting feature matrix to {out_feature_matrix}.")
    joblib.dump(X, out_feature_matrix)

    # Persist a reproducible 90/10 split as row indices rather than as
    # copies of the matrix.
    _, _, ind_train, ind_test = train_test_split(
        X, np.arange(X.shape[0]), test_size=0.10, random_state=42
    )
    np.save(out_feature_dir / "train_inds.npy", ind_train)
    np.save(out_feature_dir / "test_inds.npy", ind_test)

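# Hedged usage sketch, not part of the pipeline above: reload the artifacts
# written by main() and slice out the training rows. The helper name
# _load_train_split is hypothetical; it assumes the same FEATURE_MATRIX
# filename constant and the index files saved by main().
def _load_train_split(out_feature_dir, out_vectorizer):
    v = joblib.load(out_vectorizer)
    X = joblib.load(out_feature_dir / FEATURE_MATRIX)
    ind_train = np.load(out_feature_dir / "train_inds.npy")
    # Scipy sparse matrices support integer-array row indexing, so the saved
    # indices recover the exact training rows.
    return v, X[ind_train]
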
def vectorize(
    self,
    texts: List[str],
    weights: Dict[str, int],
    batch_size: int = 1000,
    n_threads: int = cpu_count(),
) -> Tuple[List[Dict[str, str]], np.ndarray]:
    """
    Transform texts into a matrix of features.

    Args:
        texts: texts to transform
        weights: how to weight different types of features
        batch_size: batch size to pass to nlp.pipe
        n_threads: number of threads to use

    Returns:
        features: extracted feature dicts, one per document
        feature_matrix: matrix representation of features for each document
    """
    if self.vectorizer is None:
        LOG.error("Must initialize vectorizer.")
        raise ValueError("Must initialize vectorizer.")

    fe = ml.FeatureExtractor()
    # Extract features to a temporary JSON-lines file, then read them back
    # before the temporary file is deleted on context exit.
    with NamedTemporaryFile() as tmp_features_loc:
        tmp_features = tmp_features_loc.name
        ml.extract_features_from_abstracts(
            texts, tmp_features, batch_size, n_threads
        )
        fe.from_jsonlines(tmp_features)

    weighted_features = fe.weight_terms(weights)
    # Use the already-fitted vectorizer so new texts are mapped into the
    # training feature space.
    feature_matrix = self.vectorizer.transform(weighted_features)
    return fe.features, feature_matrix

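# Minimal usage sketch for vectorize(), assuming the enclosing object already
# holds a fitted DictVectorizer on self.vectorizer (e.g. restored via
# joblib.load); the variable name and weight keys are illustrative only:
#
#     features, X_new = clf.vectorize(
#         ["Some abstract text ...", "Another abstract ..."],
#         weights={"term": 1},
#     )
#     assert X_new.shape[0] == len(features)  # one matrix row per document
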
def topic_model(topic_model_dir, processed_dir, topic_weights, min_feature,
                max_feature):
    """Fit an online LDA topic model over the processed features and persist
    the vectorizer, feature matrix, model, and document-topic distribution.
    """
    topic_model_dir.mkdir(exist_ok=True)

    tfe = ml.FeatureExtractor()
    tfe.from_jsonlines(processed_dir / FEATURES)
    topic_weighted_features = tfe.weight_terms(topic_weights)
    topic_limited_features = tfe.limit_features(
        topic_weighted_features, min_feature, max_feature
    )

    topic_v = DictVectorizer()
    topic_X = topic_v.fit_transform(topic_limited_features)

    model = LatentDirichletAllocation(
        n_components=3,
        max_iter=5,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    )
    # fit_transform returns the per-document topic distribution directly.
    doc_topic_distr = model.fit_transform(topic_X)

    out_vectorizer = topic_model_dir / TOPIC_VECTORIZER
    out_feature_matrix = topic_model_dir / TOPIC_FEATURE_MATRIX
    out_model = topic_model_dir / TOPIC_MODEL
    out_doc_topic_distr = topic_model_dir / DOC_TOPIC_DISTR
    joblib.dump(topic_v, out_vectorizer)
    joblib.dump(topic_X, out_feature_matrix)
    joblib.dump(model, out_model)
    joblib.dump(doc_topic_distr, out_doc_topic_distr)

    return doc_topic_distr

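# Illustrative helper, not in the original pipeline: list the highest-weight
# terms per fitted LDA topic. Uses only public scikit-learn API
# (LatentDirichletAllocation.components_ and, in scikit-learn >= 1.0,
# DictVectorizer.get_feature_names_out).
def top_topic_terms(model, vectorizer, n_top=10):
    terms = np.asarray(vectorizer.get_feature_names_out())
    # components_ has shape (n_topics, n_features); argsort each row to rank
    # terms by topic weight, descending.
    return [
        terms[comp.argsort()[::-1][:n_top]].tolist()
        for comp in model.components_
    ]
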
def main(in_corpus, abstract_field, out_features, batch_size, n_threads):
    """Extract features from a corpus and write them out as JSON-lines."""
    LOG.info(f"Extracting features from corpus at {in_corpus}.")
    LOG.info(f"Using field: {abstract_field}.")
    fe = ml.FeatureExtractor()
    LOG.info(f"Using batch_size {batch_size} with {n_threads} threads.")
    LOG.info(f"Outputting processed features to {out_features}.")
    fe.from_corpus_to_jsonlines(
        in_corpus, out_features, abstract_field, batch_size, n_threads
    )

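# Hedged sketch: from_corpus_to_jsonlines writes one JSON record per line, so
# the output can be streamed back a document at a time without loading the
# whole file (the record structure is whatever ml.FeatureExtractor emits):
#
#     import json
#     with open(out_features) as fh:
#         for line in fh:
#             doc_features = json.loads(line)
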
def process(in_corpus, out_dir, abstract_field, concept_field, term_types,
            batch_size, n_threads):
    """Run feature and concept extraction over a corpus, writing the
    features, concept indices, and raw-to-lemma map under out_dir.
    """
    out_dir.mkdir(exist_ok=True, parents=True)
    out_features = out_dir / FEATURES
    out_indices = out_dir / INDICES
    out_raw2lemma = out_dir / RAW2LEMMA

    fe = ml.FeatureExtractor()
    fe.from_corpus_to_jsonlines(
        in_corpus, out_features, abstract_field, term_types, batch_size,
        n_threads
    )

    ce = ml.ConceptExtractor()
    ce.from_corpus(in_corpus, concept_field)
    ce.to_jsons(out_indices, out_raw2lemma)
    return fe, ce
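
# Minimal usage sketch for process(); the paths, field names, and term types
# below are illustrative assumptions, not values taken from this repo:
#
#     from pathlib import Path
#     fe, ce = process(
#         in_corpus=Path("corpus.jsonl"),
#         out_dir=Path("processed"),
#         abstract_field="abstract",
#         concept_field="concepts",
#         term_types=["ENTITY", "NOUN_CHUNK"],
#         batch_size=1000,
#         n_threads=4,
#     )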