def main(
    in_corpus,
    concept_field,
    cat_field,
    out_indices,
    out_cat_indices,
    out_raw2lemma,
    out_cat_raw2lemma,
):
    """Extract keyword and category concepts from a corpus and dump them.

    Two separate ``ml.ConceptExtractor`` instances are filled from the same
    corpus, one reading ``concept_field`` and one reading ``cat_field``, and
    each is written out through ``to_jsons``.

    Args:
        in_corpus: Corpus passed to ``ConceptExtractor.from_corpus``.
        concept_field: Field name logged as the keyword field.
        cat_field: Field name logged as the category field.
        out_indices: Output location for the keyword indices.
        out_cat_indices: Output location for the category indices.
        out_raw2lemma: Output location for the keyword raw2lemma data.
        out_cat_raw2lemma: Output location for the category raw2lemma data.
    """

    def extract_and_dump(field, label, indices_path, raw2lemma_path):
        # Same steps for both passes; only ``label`` changes the log wording.
        extractor = ml.ConceptExtractor()
        extractor.from_corpus(in_corpus, field)
        LOG.info(f"Output {label} indices: {indices_path}")
        LOG.info(f"Output {label} raw2lemma: {raw2lemma_path}")
        extractor.to_jsons(indices_path, raw2lemma_path)

    LOG.info(f"Corpus: {in_corpus}")
    LOG.info(f"Keyword Field: {concept_field}")
    LOG.info(f"Category Field: {cat_field}")

    extract_and_dump(concept_field, "keyword", out_indices, out_raw2lemma)

    LOG.info("Extracting categories.")
    extract_and_dump(cat_field, "category", out_cat_indices, out_cat_raw2lemma)
def process(in_corpus, out_dir, abstract_field, concept_field, term_types,
            batch_size, n_threads):
    """Run feature and concept extraction on a corpus, writing into ``out_dir``.

    ``out_dir`` is created (including parents) if it does not exist. Features
    are written through ``FeatureExtractor.from_corpus_to_jsonlines`` and the
    concepts through ``ConceptExtractor.to_jsons``.

    Args:
        in_corpus: Corpus passed to both extractors.
        out_dir: Output directory; must support ``mkdir`` and the ``/``
            operator (presumably a ``pathlib.Path`` -- confirm with callers).
        abstract_field: Forwarded to the feature extractor.
        concept_field: Forwarded to the concept extractor.
        term_types: Forwarded to the feature extractor.
        batch_size: Forwarded to the feature extractor.
        n_threads: Forwarded to the feature extractor.

    Returns:
        A ``(feature_extractor, concept_extractor)`` tuple.
    """
    out_dir.mkdir(exist_ok=True, parents=True)
    features_path, indices_path, raw2lemma_path = (
        out_dir / filename for filename in (FEATURES, INDICES, RAW2LEMMA)
    )

    feature_extractor = ml.FeatureExtractor()
    feature_extractor.from_corpus_to_jsonlines(
        in_corpus,
        features_path,
        abstract_field,
        term_types,
        batch_size,
        n_threads,
    )

    concept_extractor = ml.ConceptExtractor()
    concept_extractor.from_corpus(in_corpus, concept_field)
    concept_extractor.to_jsons(indices_path, raw2lemma_path)

    return feature_extractor, concept_extractor
def main(
    experiment_name,
    out_store,
    out_cat_preds,
    gt_batch_size,
    limit=None,
):
    """Predict on the test split of an experiment and build its ground truth.

    Inputs are read from fixed locations under ``data/interim/`` and
    ``models/`` derived from ``experiment_name``.

    Args:
        experiment_name: Name used to build every input path.
        out_store: Passed to ``make_predictions`` and, as ``store``, to
            ``create_ground_truth``.
        out_cat_preds: Target of ``np.save`` for the category predictions.
        gt_batch_size: ``batch_size`` forwarded to ``create_ground_truth``.
        limit: When not ``None``, only the first ``limit`` test indices are
            used to select rows of the feature matrix.
    """
    LOG.info("Loading test data and models.")
    # TODO: paths should be put into main function
    interim_dir = f"data/interim/{experiment_name}"
    models_dir = f"models/{experiment_name}"

    test_inds = np.load(f"{interim_dir}/test_inds.npy")
    train_inds = np.load(f"{interim_dir}/train_inds.npy")
    feature_matrix = joblib.load(f"{interim_dir}/feature_matrix.jbl")
    in_cat_models = Path(f"{models_dir}/categories/models/")
    in_kwd_models = Path(f"{models_dir}/keywords/models/")

    if limit is None:
        selected_rows = test_inds
    else:
        LOG.info(f"Limiting to {limit} test records.")
        # TODO: How does this affect indices?
        selected_rows = test_inds[0:limit]
    feature_matrix_test = feature_matrix.tocsc()[selected_rows, :]

    LOG.info("Making predictions.")
    prediction_results = make_predictions(
        in_cat_models,
        in_kwd_models,
        feature_matrix_test,
        out_store,
    )
    # The first element (the categories) is not used any further here.
    _categories, concepts_with_classifiers, cat_preds = prediction_results
    # NOTE(review): with ``limit`` set, the predictions cover only the first
    # ``limit`` test rows while the ground truth below still receives the
    # full ``test_inds`` -- confirm the two stay aligned.
    np.save(out_cat_preds, cat_preds)

    LOG.info("Creating ground truth data.")
    # TODO: these paths should be provided as args
    kwd_ext = ml.ConceptExtractor()
    kwd_ext.from_jsons(
        f"{interim_dir}/kwd_indices.json",
        f"{models_dir}/kwd_raw2lemma.json",
    )
    create_ground_truth(
        store=out_store,
        dataset="ground_truth",
        kwd_ext=kwd_ext,
        concepts_with_classifiers=concepts_with_classifiers,
        batch_size=gt_batch_size,
        train_inds=train_inds,
        test_inds=test_inds,
    )
def main(
    in_feature_matrix,
    in_ind_train,
    in_ind_test,
    in_cat_indices,
    in_cat_raw2lemma,
    in_config,
    out_dir,
):
    """Train category classifiers with a grid-searched ``SGDClassifier``.

    Args:
        in_feature_matrix: File loaded with ``joblib.load``.
        in_ind_train: ``.npy`` file with the training indices.
        in_ind_test: ``.npy`` file with the test indices.
        in_cat_indices: Category indices JSON for ``ConceptExtractor``.
        in_cat_raw2lemma: Category raw2lemma JSON for ``ConceptExtractor``.
        in_config: YAML file; its ``min_concept_occurrence`` key is read.
        out_dir: Base output location; models go to
            ``out_dir / OUT_MODELS_DIR``.
    """
    with open(in_config, "r") as config_file:
        settings = yaml.safe_load(config_file)

    features = joblib.load(in_feature_matrix)
    train_indices = np.load(in_ind_train)
    test_indices = np.load(in_ind_test)

    LOG.info(
        f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}."
    )
    cat_ext = ml.ConceptExtractor()
    cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma)

    # requires loss function with predict_proba
    search_space = {
        "alpha": [0.01, 0.001, 0.0001],
        "class_weight": [
            {1: positive_weight, 0: 1} for positive_weight in (10, 5, 20)
        ],
        "max_iter": [1],
        "loss": ["log"],
    }
    # requires GridSearchCV
    grid_search = GridSearchCV(SGDClassifier(), search_space, scoring="f1")

    models_dir = out_dir / OUT_MODELS_DIR
    concept_trainer = ConceptTrainer(cat_ext, grid_search)
    concept_trainer.train_concepts(
        features,
        train_indices,
        test_indices,
        models_dir,
        settings["min_concept_occurrence"],
    )
    LOG.info("Complete.")
def main(
    in_feature_matrix,
    in_ind_train,
    in_ind_test,
    in_kwd_indices,
    in_cat_indices,
    in_kwd_raw2lemma,
    in_cat_raw2lemma,
    in_config,
    out_dir,
    topics=True,
):
    """Train keyword classifiers, optionally once per category ("topic").

    A general set of keyword models is always trained. When ``topics`` is
    truthy, an additional set is trained first for every entry of the
    category extractor's ``concept_index_mapping``.

    Args:
        in_feature_matrix: File loaded with ``joblib.load``.
        in_ind_train: ``.npy`` file with the training indices.
        in_ind_test: ``.npy`` file with the test indices.
        in_kwd_indices: Keyword indices JSON for ``ConceptExtractor``.
        in_cat_indices: Category indices JSON for ``ConceptExtractor``.
        in_kwd_raw2lemma: Keyword raw2lemma JSON for ``ConceptExtractor``.
        in_cat_raw2lemma: Category raw2lemma JSON for ``ConceptExtractor``.
        in_config: YAML file; its ``min_concept_occurrence`` key is read.
        out_dir: Base output location; models go to
            ``out_dir / OUT_MODELS_DIR``.
        topics: Whether to train the per-topic sets before the general one.
    """
    with open(in_config, "r") as config_file:
        settings = yaml.safe_load(config_file)

    features = joblib.load(in_feature_matrix)
    train_indices = np.load(in_ind_train)
    test_indices = np.load(in_ind_test)

    LOG.info(
        f"Loading keyword extractor from {in_kwd_indices} and {in_kwd_raw2lemma}."
    )
    kwd_ext = ml.ConceptExtractor()
    kwd_ext.from_jsons(in_kwd_indices, in_kwd_raw2lemma)

    LOG.info(
        f"Loading category extractor from {in_cat_indices} and {in_cat_raw2lemma}."
    )
    cat_ext = ml.ConceptExtractor()
    cat_ext.from_jsons(in_cat_indices, in_cat_raw2lemma)

    # requires loss function with predict_proba
    search_space = {
        "alpha": [0.01, 0.001, 0.0001],
        "class_weight": [
            {1: positive_weight, 0: 1} for positive_weight in (10, 5, 20)
        ],
        "max_iter": [5],
        "loss": ["log"],
    }
    # requires GridSearchCV
    grid_search = GridSearchCV(
        SGDClassifier(),
        search_space,
        scoring="f1",
        n_jobs=-1,
    )

    models_dir = out_dir / OUT_MODELS_DIR
    concept_trainer = tr.ConceptTrainer(kwd_ext, grid_search)
    topic_to_doc_index = cat_ext.concept_index_mapping

    if topics:
        LOG.info(
            f"Training one set for each of {len(topic_to_doc_index)} topics divisions."
        )
        for topic_name, topic_doc_index in topic_to_doc_index.items():
            concept_trainer.train_concepts(
                features,
                train_indices,
                test_indices,
                models_dir,
                settings["min_concept_occurrence"],
                topic_name,
                topic_doc_index,
            )

    # The general (topic-independent) set is trained regardless of ``topics``.
    LOG.info("Training one general set")
    concept_trainer.train_concepts(
        features,
        train_indices,
        test_indices,
        models_dir,
        settings["min_concept_occurrence"],
    )
    LOG.info("Complete.")