Ejemplo n.º 1
0
Archivo: main.py Proyecto: samiroid/fra
def vectorize_docket(docket, vocab):
    """Turn a docket's query and comment texts into vocabulary index sequences.

    Reads ``<docket>_queries.csv`` and ``<docket>_comments.csv`` from
    OUTPUT_TXT, drops rows whose ``clean_text`` is missing, and maps each
    remaining text to a sequence of vocabulary indices.

    Parameters
    ----------
    docket : str
        Docket identifier used to locate the two CSV files.
    vocab : dict
        Word-to-index mapping consumed by ``vectorizer.docs2idx``.

    Returns
    -------
    tuple
        ``(query_idxs, comment_idxs)`` as produced by ``vectorizer.docs2idx``.
    """
    # Load both CSVs and discard rows without usable cleaned text.
    queries = pd.read_csv(OUTPUT_TXT + "{}_queries.csv".format(docket)).dropna(subset=['clean_text'])
    comments = pd.read_csv(OUTPUT_TXT + "{}_comments.csv".format(docket)).dropna(subset=['clean_text'])
    query_idxs, _ = vectorizer.docs2idx(queries["clean_text"], vocab)
    comment_idxs, _ = vectorizer.docs2idx(comments["clean_text"], vocab)
    return query_idxs, comment_idxs
Ejemplo n.º 2
0
def train_topic_model():
    """Fit an LDA topic model on the corpus and pickle it with the vocabulary.

    Loads the pickled vocabulary and the raw corpus lines, converts the
    documents to index sequences, builds a sparse bag-of-words count matrix,
    fits an LDA model (N_TOPICS topics, LDA_EPOCHS iterations), and writes
    ``[topic_model, vocab]`` to ``OUTPUT_PKL/lda.pkl``.
    """
    with open(VOCABULARY_PATH, "rb") as fh:
        vocab = pickle.load(fh)
    with open(CORPUS, "r") as fh:
        documents = fh.readlines()
    doc_idxs, _ = vectorizer.docs2idx(documents, vocab)
    # Sparse doc-term counts; LDA expects integer counts, hence the cast.
    bow = features.BOW_freq(doc_idxs, len(vocab), sparse=True).astype('int32')
    model = lda.LDA(n_topics=N_TOPICS, n_iter=LDA_EPOCHS)
    model.fit(bow)
    # Persist the fitted model together with the vocabulary it was built on.
    with open(OUTPUT_PKL + "/lda.pkl", "wb") as fh:
        pickle.dump([model, vocab], fh)
Ejemplo n.º 3
0
def compute_IDF():
    """Compute an inverse-document-frequency vector for the corpus.

    Loads the pickled vocabulary and the raw corpus lines, counts in how
    many documents each vocabulary index occurs, converts those counts to
    IDF scores via ``getIDF``, and pickles a dense vector (indexed by
    vocabulary id) to ``OUTPUT_PKL/IDF.pkl``.

    Fix vs. original: the output vector is sized ``len(vocab)`` rather than
    ``len(idfs)``. The old sizing used the number of *observed* terms, so any
    vocabulary term that never appears in the corpus made the vector too
    short and ``idfvec[index]`` raised IndexError. Unseen terms now simply
    keep an IDF of 0.0. The pointless str()/int() round-trip on integer
    indices is also removed.
    """
    with open(VOCABULARY_PATH, "rb") as f:
        vocab = pickle.load(f)
    with open(CORPUS, "r") as f:
        all_text = f.readlines()
    # Map documents to vocabulary-index sequences.
    all_idxs, _ = vectorizer.docs2idx(all_text, vocab)
    ndocs = len(all_idxs)
    # Document frequency: count each index once per document via set().
    docfreq = Counter(x for xs in all_idxs for x in set(xs))
    # Inverse document frequencies, keyed directly by integer vocab index.
    idfs = {w: getIDF(ndocs, df) for w, df in docfreq.items()}
    # Dense IDF vector indexed by vocab id; terms unseen in the corpus stay 0.0.
    idfvec = np.zeros(len(vocab))
    for w, v in idfs.items():
        idfvec[w] = v
    with open(OUTPUT_PKL + "/IDF.pkl", "wb") as f:
        pickle.dump(idfvec, f)