def main(in_features, in_config, out_feature_dir, out_vectorizer):
    """Vectorize extracted features and persist the artifacts.

    Loads the weighting/limiting configuration, fits a DictVectorizer on the
    weighted features, and writes the vectorizer, the feature matrix, and a
    90/10 train/test index split to disk.
    """
    with open(in_config, "r") as f0:
        config = yaml.safe_load(f0)

    LOG.info(f"Loading features from {in_features}.")
    fe = ml.FeatureExtractor()
    fe.from_jsonlines(in_features)

    # Weight feature types per the config, then drop features outside the
    # configured occurrence bounds.
    weighted_features = fe.weight_terms(config["weights"])
    limited_features = fe.limit_features(
        weighted_features,
        config["min_feature_occurrence"],
        config["max_feature_occurrence"],
    )

    v = DictVectorizer()
    X = v.fit_transform(limited_features)

    out_feature_matrix = out_feature_dir / FEATURE_MATRIX
    LOG.info(f"Outputting vectorizer to {out_vectorizer}.")
    joblib.dump(v, out_vectorizer)
    LOG.info(f"Outputting feature matrix to {out_feature_matrix}.")
    joblib.dump(X, out_feature_matrix)

    # Persist a reproducible 90/10 split as row indices rather than as
    # copies of the matrix.
    _, _, ind_train, ind_test = train_test_split(
        X, np.arange(X.shape[0]), test_size=0.10, random_state=42
    )
    np.save(out_feature_dir / "train_inds.npy", ind_train)
    np.save(out_feature_dir / "test_inds.npy", ind_test)

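# Hedged usage sketch, not part of the pipeline above: reload the artifacts
# written by main() and slice out the training rows. The helper name
# _load_train_split is hypothetical; it assumes the same FEATURE_MATRIX
# filename constant and the index files saved by main().
def _load_train_split(out_feature_dir, out_vectorizer):
    v = joblib.load(out_vectorizer)
    X = joblib.load(out_feature_dir / FEATURE_MATRIX)
    ind_train = np.load(out_feature_dir / "train_inds.npy")
    # Scipy sparse matrices support integer-array row indexing, so the saved
    # indices recover the exact training rows.
    return v, X[ind_train]
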
def vectorize(
    self,
    texts: List[str],
    weights: Dict[str, int],
    batch_size: int = 1000,
    n_threads: int = cpu_count(),
) -> Tuple[List[Dict[str, str]], np.ndarray]:
    """
    Transform texts into a matrix of features.

    Args:
        texts: texts to transform
        weights: how to weight different types of features
        batch_size: batch size to pass to nlp.pipe
        n_threads: number of threads to use

    Returns:
        features: extracted feature dicts, one per document
        feature_matrix: matrix representation of features for each document
    """
    if self.vectorizer is None:
        LOG.error("Must initialize vectorizer.")
        raise ValueError("Must initialize vectorizer.")

    fe = ml.FeatureExtractor()
    # Extract features to a temporary JSON-lines file, then read them back
    # before the temporary file is deleted on context exit.
    with NamedTemporaryFile() as tmp_features_loc:
        tmp_features = tmp_features_loc.name
        ml.extract_features_from_abstracts(
            texts, tmp_features, batch_size, n_threads
        )
        fe.from_jsonlines(tmp_features)

    weighted_features = fe.weight_terms(weights)
    # Use the already-fitted vectorizer so new texts are mapped into the
    # training feature space.
    feature_matrix = self.vectorizer.transform(weighted_features)
    return fe.features, feature_matrix

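# Minimal usage sketch for vectorize(), assuming the enclosing object already
# holds a fitted DictVectorizer on self.vectorizer (e.g. restored via
# joblib.load); the variable name and weight keys are illustrative only:
#
#     features, X_new = clf.vectorize(
#         ["Some abstract text ...", "Another abstract ..."],
#         weights={"term": 1},
#     )
#     assert X_new.shape[0] == len(features)  # one matrix row per document
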
def topic_model(topic_model_dir, processed_dir, topic_weights, min_feature,
                max_feature):
    """Fit an online LDA topic model over the processed features and persist
    the vectorizer, feature matrix, model, and document-topic distribution.
    """
    topic_model_dir.mkdir(exist_ok=True)

    tfe = ml.FeatureExtractor()
    tfe.from_jsonlines(processed_dir / FEATURES)
    topic_weighted_features = tfe.weight_terms(topic_weights)
    topic_limited_features = tfe.limit_features(
        topic_weighted_features, min_feature, max_feature
    )

    topic_v = DictVectorizer()
    topic_X = topic_v.fit_transform(topic_limited_features)

    model = LatentDirichletAllocation(
        n_components=3,
        max_iter=5,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    )
    # fit_transform returns the per-document topic distribution directly.
    doc_topic_distr = model.fit_transform(topic_X)

    out_vectorizer = topic_model_dir / TOPIC_VECTORIZER
    out_feature_matrix = topic_model_dir / TOPIC_FEATURE_MATRIX
    out_model = topic_model_dir / TOPIC_MODEL
    out_doc_topic_distr = topic_model_dir / DOC_TOPIC_DISTR
    joblib.dump(topic_v, out_vectorizer)
    joblib.dump(topic_X, out_feature_matrix)
    joblib.dump(model, out_model)
    joblib.dump(doc_topic_distr, out_doc_topic_distr)

    return doc_topic_distr

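# Illustrative helper, not in the original pipeline: list the highest-weight
# terms per fitted LDA topic. Uses only public scikit-learn API
# (LatentDirichletAllocation.components_ and, in scikit-learn >= 1.0,
# DictVectorizer.get_feature_names_out).
def top_topic_terms(model, vectorizer, n_top=10):
    terms = np.asarray(vectorizer.get_feature_names_out())
    # components_ has shape (n_topics, n_features); argsort each row to rank
    # terms by topic weight, descending.
    return [
        terms[comp.argsort()[::-1][:n_top]].tolist()
        for comp in model.components_
    ]
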
def main(in_corpus, abstract_field, out_features, batch_size, n_threads):
    """Extract features from a corpus and write them out as JSON-lines."""
    LOG.info(f"Extracting features from corpus at {in_corpus}.")
    LOG.info(f"Using field: {abstract_field}.")
    fe = ml.FeatureExtractor()
    LOG.info(f"Using batch_size {batch_size} with {n_threads} threads.")
    LOG.info(f"Outputting processed features to {out_features}.")
    fe.from_corpus_to_jsonlines(
        in_corpus, out_features, abstract_field, batch_size, n_threads
    )

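# Hedged sketch: from_corpus_to_jsonlines writes one JSON record per line, so
# the output can be streamed back a document at a time without loading the
# whole file (the record structure is whatever ml.FeatureExtractor emits):
#
#     import json
#     with open(out_features) as fh:
#         for line in fh:
#             doc_features = json.loads(line)
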
def process(in_corpus, out_dir, abstract_field, concept_field, term_types,
            batch_size, n_threads):
    """Run feature and concept extraction over a corpus, writing the
    features, concept indices, and raw-to-lemma map under out_dir.
    """
    out_dir.mkdir(exist_ok=True, parents=True)
    out_features = out_dir / FEATURES
    out_indices = out_dir / INDICES
    out_raw2lemma = out_dir / RAW2LEMMA

    fe = ml.FeatureExtractor()
    fe.from_corpus_to_jsonlines(
        in_corpus, out_features, abstract_field, term_types, batch_size,
        n_threads
    )

    ce = ml.ConceptExtractor()
    ce.from_corpus(in_corpus, concept_field)
    ce.to_jsons(out_indices, out_raw2lemma)
    return fe, ce
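
# Minimal usage sketch for process(); the paths, field names, and term types
# below are illustrative assumptions, not values taken from this repo:
#
#     from pathlib import Path
#     fe, ce = process(
#         in_corpus=Path("corpus.jsonl"),
#         out_dir=Path("processed"),
#         abstract_field="abstract",
#         concept_field="concepts",
#         term_types=["ENTITY", "NOUN_CHUNK"],
#         batch_size=1000,
#         n_threads=4,
#     )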