def _feature_count(vectorizer):
    """Return how many feature names *vectorizer* reports.

    scikit-learn deprecated ``get_feature_names`` in 1.0 and removed it in
    1.2 in favour of ``get_feature_names_out``. The newer accessor is used
    when the object has it; otherwise the legacy method is called, so older
    installs keep working unchanged.
    """
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    return len(names_getter())


def count_vectorize(corpus, table_name, model_from_pickle, input_type="content"):
    """Turn *corpus* into a count matrix, reusing a pickled vectorizer if allowed.

    Args:
        corpus: The documents handed to the vectorizer's ``fit_transform`` /
            ``transform``.
        table_name: Name used to locate the pickle directory
            ``../data/<table_name>/pickles/``.
        model_from_pickle: When truthy and ``CountVectorizer.pkl`` exists in
            that directory, the vectorizer is loaded from it instead of
            being fitted.
        input_type: Forwarded as the ``input`` argument of
            ``CountVectorizerProgressBar`` when a new vectorizer is built.

    Returns:
        A ``(count_vectorizer, cv_corpus)`` tuple.
    """
    pickle_path = f"../data/{table_name}/pickles/CountVectorizer.pkl"

    # Stays None on the pickle path so we know the corpus still needs to be
    # pushed through the loaded vectorizer below.
    cv_corpus = None

    if model_from_pickle and os.path.exists(pickle_path):
        count_vectorizer = pickle_load(pickle_path)
    else:
        g.debug("Vectorizing documents...")
        count_vectorizer = CountVectorizerProgressBar(
            input=input_type,
            max_features=g.MAX_FEATURES,
            min_df=g.MIN_DF,
            max_df=g.MAX_DF,
            stop_words=get_stopwords(),
            tokenizer=tokenize,
            ngram_range=(1, g.N_GRAMS),
            strip_accents="ascii",
            dtype=np.uint16,
            progress_bar_clear_when_done=True,
        )
        cv_corpus = count_vectorizer.fit_transform(corpus)
        # we can delete this to take up less memory (useful for pickling)
        count_vectorizer.stop_words_ = None
        g.debug(" -> Done!", 1)

    g.debug(f" -> Loaded vectorizer with {_feature_count(count_vectorizer)} features!", 1)

    if cv_corpus is None:
        g.debug("Transforming corpus...")
        cv_corpus = count_vectorizer.transform(corpus)
        g.debug(" -> Done!", 1)

    g.debug(f" -> Loaded {cv_corpus.shape[0]} documents with {cv_corpus.shape[1]} features!", 1)
    return count_vectorizer, cv_corpus
def get_cached_corpus(table_name, name):
    """Load a previously cached corpus and its document ids, if both exist.

    Args:
        table_name: Name used to locate ``../data/<table_name>/pickles/``.
        name: Prefix of the cache files (``<name>_corpus.pkl`` and
            ``<name>_doc_ids.txt``).

    Returns:
        ``(doc_ids, cv_corpus)`` when both cache files are present,
        otherwise ``(None, None)``.
    """
    corpus_path = f"../data/{table_name}/pickles/{name}_corpus.pkl"
    doc_ids_path = f"../data/{table_name}/pickles/{name}_doc_ids.txt"

    # Both files are required; a partial cache counts as a miss.
    cache_complete = os.path.exists(corpus_path) and os.path.exists(doc_ids_path)
    if not cache_complete:
        return None, None

    doc_ids = load_doc_ids(doc_ids_path)
    cv_corpus = pickle_load(corpus_path)
    return doc_ids, cv_corpus
def main():
    """Fit the topic model on the pickled corpus and cache the results.

    Reads an optional ``-t/--topics`` integer from the command line (falling
    back to ``g.N_TOPICS`` when it is absent or 0), builds the model with
    ``create_topic_model``, pickles the returned ``nmf`` and ``W`` under
    ``data/pickles/``, and finally regenerates the cached wordclouds.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--topics", type=int, help="The number of latent topics to extract.")
    args = parser.parse_args()
    n_topics = args.topics or g.N_TOPICS

    corpus = pickle_load("data/pickles/corpus.pkl")
    nmf, W = create_topic_model(corpus, n_topics)
    pickle_dump(nmf, "data/pickles/nmf.pkl")
    pickle_dump(W, "data/pickles/W.pkl")

    # cache the wordclouds
    from wordclouds import cache_wordclouds

    vectorizer = pickle_load("data/pickles/vectorizer.pkl")

    # scikit-learn deprecated ``get_feature_names`` in 1.0 and removed it in
    # 1.2. Prefer ``get_feature_names_out`` when the unpickled object offers
    # it, converting to a list so ``cache_wordclouds`` keeps receiving a list
    # of names; otherwise fall back to the legacy method.
    names_out = getattr(vectorizer, "get_feature_names_out", None)
    if names_out is not None:
        feature_names = list(names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    # Scale every column of W by that column's maximum.
    W_normalized = W / W.max(axis=0)
    cache_wordclouds(corpus, feature_names, nmf.components_, W_normalized)
def cv_to_tfidf(cv_corpus, table_name, model_from_pickle):
    """Convert a count matrix to TF-IDF, reusing a pickled transformer if allowed.

    Args:
        cv_corpus: The count vectors to transform.
        table_name: Name used to locate ``../data/<table_name>/pickles/``.
        model_from_pickle: When truthy and ``TfidfTransformer.pkl`` exists in
            that directory, the transformer is loaded from it and only
            applied; otherwise a new ``TfidfTransformer(sublinear_tf=True)``
            is fitted on *cv_corpus*.

    Returns:
        A ``(tfidf_transformer, tfidf_corpus)`` tuple.
    """
    pickle_path = f"../data/{table_name}/pickles/TfidfTransformer.pkl"
    reuse_pickled = model_from_pickle and os.path.exists(pickle_path)

    if reuse_pickled:
        # Already-fitted transformer: only apply it to the new counts.
        tfidf_transformer = pickle_load(pickle_path)
        g.debug("Transforming corpus to TF-IDF...")
        tfidf_corpus = tfidf_transformer.transform(cv_corpus)
        g.debug(" -> Done!", 1)
    else:
        # No usable pickle: fit and transform in one pass.
        g.debug("Transforming to TF-IDF vector...")
        tfidf_transformer = TfidfTransformer(sublinear_tf=True)
        tfidf_corpus = tfidf_transformer.fit_transform(cv_corpus)
        g.debug(" -> Done!", 1)

    n_rows, n_cols = tfidf_corpus.shape[0], tfidf_corpus.shape[1]
    g.debug(f" -> {n_rows} count vectors with {n_cols} features transformed!", 1)
    return tfidf_transformer, tfidf_corpus
from corpus import Corpus import globals as g import re regex = re.compile( r"(BACKGROUND|Background\:)[\s\S]+(?=(Requirements\:|REQUIREMENTS|Summary\:|SUMMARY))" ) app = Flask(__name__) app.secret_key = os.urandom(24) app.add_template_global(str, "str") app.add_template_global(int, "int") app.add_template_global(round, "round") topics = json.load(open("data/json/topics.json")) corpus = pickle_load("data/pickles/corpus.pkl") vectorizer = pickle_load("data/pickles/vectorizer.pkl") nmf = pickle_load("data/pickles/nmf.pkl") W = pickle_load("data/pickles/W.pkl") W_normalized = W / W.max(axis=0) def naics_descriptions(doc_ids): from database import get_connection query = f""" SELECT opportunity_id, naics_description FROM import.govwin_opportunity WHERE opportunity_id IN ({", ".join([str(x) for x in doc_ids])}) """