Example #1
def count_vectorize(corpus, table_name, model_from_pickle, input_type="content"):
    """Fit a CountVectorizer on the corpus (or load a pickled one); return (vectorizer, count matrix)."""
    cv_corpus = None


    pickle_path = f"../data/{table_name}/pickles/CountVectorizer.pkl"
    if model_from_pickle and os.path.exists(pickle_path):
        count_vectorizer = pickle_load(pickle_path)

    else:
        g.debug("Vectorizing documents...")
        count_vectorizer = CountVectorizerProgressBar(
            input=input_type,
            max_features=g.MAX_FEATURES,
            min_df=g.MIN_DF,
            max_df=g.MAX_DF,
            stop_words=get_stopwords(),
            tokenizer=tokenize,
            ngram_range=(1, g.N_GRAMS),
            strip_accents="ascii",
            dtype=np.uint16,
            progress_bar_clear_when_done=True,
        )
        cv_corpus = count_vectorizer.fit_transform(corpus)
        count_vectorizer.stop_words_ = None  # we can delete this to take up less memory (useful for pickling)
        g.debug(" -> Done!", 1)

    g.debug(f" -> Loaded vectorizer with {len(count_vectorizer.get_feature_names())} features!", 1)


    if cv_corpus is None:
        g.debug("Transforming corpus...")
        cv_corpus = count_vectorizer.transform(corpus)
        g.debug(" -> Done!", 1)


    g.debug(f" -> Loaded {cv_corpus.shape[0]} documents with {cv_corpus.shape[1]} features!", 1)
    return count_vectorizer, cv_corpus
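
A minimal usage sketch for count_vectorize, assuming the surrounding module (g, get_stopwords, tokenize, CountVectorizerProgressBar) is importable; the document list and table name here are hypothetical:

# Hypothetical usage: `docs` is any iterable of raw document strings.
docs = ["first document text", "second document text"]
vectorizer, counts = count_vectorize(docs, table_name="opportunities", model_from_pickle=False)
print(counts.shape)  # (n_documents, n_features)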
Example #2
def get_cached_corpus(table_name, name):
    """Return (doc_ids, cv_corpus) from the pickle cache, or (None, None) on a miss."""
    corpus_path = f"../data/{table_name}/pickles/{name}_corpus.pkl"
    ids_path = f"../data/{table_name}/pickles/{name}_doc_ids.txt"
    if os.path.exists(corpus_path) and os.path.exists(ids_path):
        doc_ids = load_doc_ids(ids_path)
        cv_corpus = pickle_load(corpus_path)
        return doc_ids, cv_corpus
    else:
        return None, None
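
A small usage sketch for the cache helper above; the table and cache names are hypothetical:

# Hypothetical usage: fall back to re-vectorizing on a cache miss.
doc_ids, cv_corpus = get_cached_corpus("opportunities", "tfidf")
if cv_corpus is None:
    pass  # cache miss: rebuild the corpus and re-cache it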
Example #3
def main():

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        "--topics",
                        type=int,
                        help="The number of latent topics to extract.")

    args = parser.parse_args()
    n_topics = args.topics if args.topics else g.N_TOPICS

    corpus = pickle_load("data/pickles/corpus.pkl")
    nmf, W = create_topic_model(corpus, n_topics)

    pickle_dump(nmf, "data/pickles/nmf.pkl")
    pickle_dump(W, "data/pickles/W.pkl")

    # cache the wordclouds
    from wordclouds import cache_wordclouds
    vectorizer = pickle_load("data/pickles/vectorizer.pkl")
    W_normalized = W / W.max(axis=0)  # scale each topic column so its max weight is 1
    cache_wordclouds(corpus, vectorizer.get_feature_names(), nmf.components_,
                     W_normalized)
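
For context, a short sketch of inspecting the fitted model from main(), assuming create_topic_model returns a scikit-learn NMF whose components_ matrix is (n_topics, n_features); print_top_terms is a hypothetical helper, not part of the project:

import numpy as np

def print_top_terms(nmf, feature_names, n_terms=10):
    # For each topic, list the highest-weighted vocabulary terms.
    for topic_idx, weights in enumerate(nmf.components_):
        top = np.argsort(weights)[::-1][:n_terms]
        print(f"Topic {topic_idx}: " + ", ".join(feature_names[i] for i in top))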
Example #4
def cv_to_tfidf(cv_corpus, table_name, model_from_pickle):
    """Transform a count matrix to TF-IDF (or load a pickled transformer); return (transformer, tfidf matrix)."""
    tfidf_corpus = None


    pickle_path = f"../data/{table_name}/pickles/TfidfTransformer.pkl"
    if model_from_pickle and os.path.exists(pickle_path):
        tfidf_transformer = pickle_load(pickle_path)

    else:
        g.debug("Transforming to TF-IDF vector...")
        tfidf_transformer = TfidfTransformer(sublinear_tf=True)
        tfidf_corpus = tfidf_transformer.fit_transform(cv_corpus)
        g.debug(" -> Done!", 1)


    if tfidf_corpus is None:
        g.debug("Transforming corpus to TF-IDF...")
        tfidf_corpus = tfidf_transformer.transform(cv_corpus)
        g.debug(" -> Done!", 1)


    g.debug(f" -> {tfidf_corpus.shape[0]} count vectors with {tfidf_corpus.shape[1]} features transformed!", 1)
    return tfidf_transformer, tfidf_corpus
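
The two helpers compose into a simple pipeline; a hedged sketch, reusing the hypothetical names from the earlier examples:

# Hypothetical pipeline: raw documents -> counts -> TF-IDF.
vectorizer, cv_corpus = count_vectorize(docs, "opportunities", model_from_pickle=True)
tfidf_transformer, tfidf_corpus = cv_to_tfidf(cv_corpus, "opportunities", model_from_pickle=True)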
Example #5
import json
import os
import re

from flask import Flask

from corpus import Corpus
import globals as g

# Note: pickle_load is a project helper; its import is not shown in this snippet.

# Capture the "Background" section of a document, stopping (via lookahead)
# before a "Requirements" or "Summary" heading.
regex = re.compile(
    r"(BACKGROUND|Background\:)[\s\S]+(?=(Requirements\:|REQUIREMENTS|Summary\:|SUMMARY))"
)

app = Flask(__name__)
app.secret_key = os.urandom(24)

# Expose a few Python built-ins to Jinja templates.
app.add_template_global(str, "str")
app.add_template_global(int, "int")
app.add_template_global(round, "round")

with open("data/json/topics.json") as f:
    topics = json.load(f)
corpus = pickle_load("data/pickles/corpus.pkl")
vectorizer = pickle_load("data/pickles/vectorizer.pkl")
nmf = pickle_load("data/pickles/nmf.pkl")
W = pickle_load("data/pickles/W.pkl")
W_normalized = W / W.max(axis=0)  # scale each topic column so its max weight is 1


def naics_descriptions(doc_ids):
    """Look up the NAICS description for each opportunity id in doc_ids."""
    from database import get_connection
    query = f"""
        SELECT opportunity_id, naics_description
        FROM import.govwin_opportunity
        WHERE opportunity_id IN ({", ".join([str(x) for x in doc_ids])})
    """