def TFIDF_vectors(docket, vocab, idfs):
    """Return dense TF-IDF matrices for a docket's queries and comments.

    Builds dense bag-of-words frequency matrices over *vocab* and scales
    every column in place by the precomputed idf weight vector *idfs*.
    Returns the (queries, comments) pair of weighted matrices.
    """
    query_idxs, comment_idxs = vectorize_docket(docket, vocab)
    vocab_size = len(vocab)
    query_mat = features.BOW_freq(query_idxs, vocab_size, sparse=False)
    comment_mat = features.BOW_freq(comment_idxs, vocab_size, sparse=False)
    # In-place column-wise idf weighting (TF -> TF-IDF).
    query_mat *= idfs
    comment_mat *= idfs
    return query_mat, comment_mat
def LDA_vectors(docket, vocab, topic_model):
    """Return LDA topic-distribution matrices for a docket.

    Builds sparse bag-of-words count matrices for the docket's queries and
    comments, then projects each through *topic_model* to obtain per-document
    topic distributions. Returns the (queries, comments) pair.
    """
    query_idxs, comment_idxs = vectorize_docket(docket, vocab)
    vocab_size = len(vocab)
    query_bow = features.BOW_freq(query_idxs, vocab_size, sparse=True)
    comment_bow = features.BOW_freq(comment_idxs, vocab_size, sparse=True)
    # The lda package requires integer count matrices.
    query_topics = topic_model.transform(query_bow.astype('int32'))
    comment_topics = topic_model.transform(comment_bow.astype('int32'))
    return query_topics, comment_topics
def vectorize_TFIDF(target_dockets):
    """Build TF-IDF vectors for each docket and save them as sparse .npz files.

    Loads the shared vocabulary and the precomputed IDF weight vector, then
    for each docket in *target_dockets* builds sparse bag-of-words frequency
    matrices for its queries and comments, applies the idf weighting, and
    writes each matrix to OUTPUT_VECTORS as a compressed CSC matrix.
    """
    with open(VOCABULARY_PATH, "rb") as f:
        vocab = pickle.load(f)
    with open(OUTPUT_PKL + "/IDF.pkl", "rb") as f:
        idfvec = pickle.load(f)
    print("[building TF-IDF vectors]")
    for docket in target_dockets:
        print("[vectorizing docket: {}]".format(docket))
        queries, comments = vectorize_docket(docket, vocab)
        Q = features.BOW_freq(queries, len(vocab), sparse=True)
        C = features.BOW_freq(comments, len(vocab), sparse=True)
        # BUG FIX: the loaded IDF vector was previously never applied — raw
        # term frequencies were being saved under the *_tf-idf filenames
        # (the dense TFIDF_vectors path was commented out, presumably for
        # memory). Apply the idf weights sparsely: .multiply broadcasts the
        # idf row vector over every document row without densifying.
        Q = Q.multiply(idfvec).tocsc()
        C = C.multiply(idfvec).tocsc()
        # Save as compressed sparse column matrices.
        sp.sparse.save_npz(OUTPUT_VECTORS + "{}_queries_tf-idf".format(docket), Q)
        sp.sparse.save_npz(OUTPUT_VECTORS + "{}_comments_tf-idf".format(docket), C)
    print("[done]")
def vectorize_LDA(target_dockets):
    """Build LDA topic vectors for each docket and save them with np.save.

    Loads the shared vocabulary and the trained topic model, then for each
    docket builds sparse bag-of-words matrices for its queries and comments,
    transforms them into topic distributions, and writes the resulting dense
    arrays to OUTPUT_VECTORS.
    """
    with open(VOCABULARY_PATH, "rb") as f:
        vocab = pickle.load(f)
    # The pickle holds [topic_model, vocab]; only the model is needed here.
    with open(OUTPUT_PKL + "/lda.pkl", "rb") as f:
        topic_model, _ = pickle.load(f)
    print("[building LDA vectors]")
    vocab_size = len(vocab)
    for docket in target_dockets:
        print("[vectorizing docket: {}]".format(docket))
        query_idxs, comment_idxs = vectorize_docket(docket, vocab)
        query_bow = features.BOW_freq(query_idxs, vocab_size, sparse=True)
        comment_bow = features.BOW_freq(comment_idxs, vocab_size, sparse=True)
        # The lda package requires integer count matrices.
        query_topics = topic_model.transform(query_bow.astype('int32'))
        comment_topics = topic_model.transform(comment_bow.astype('int32'))
        with open(OUTPUT_VECTORS + "{}_queries_lda.np".format(docket), "wb") as f:
            np.save(f, query_topics)
        with open(OUTPUT_VECTORS + "{}_comments_lda.np".format(docket), "wb") as f:
            np.save(f, comment_topics)
    print("[done]")
def train_topic_model():
    """Train an LDA topic model on the full corpus and pickle it.

    Reads the corpus line by line, converts it to a sparse integer
    bag-of-words matrix over the shared vocabulary, fits an LDA model with
    N_TOPICS topics for LDA_EPOCHS iterations, and dumps [model, vocab]
    to OUTPUT_PKL/lda.pkl.
    """
    with open(VOCABULARY_PATH, "rb") as f:
        vocab = pickle.load(f)
    with open(CORPUS, "r") as f:
        corpus_lines = f.readlines()
    corpus_idxs, _ = vectorizer.docs2idx(corpus_lines, vocab)
    # The lda package requires an integer count matrix.
    bow = features.BOW_freq(corpus_idxs, len(vocab), sparse=True).astype('int32')
    topic_model = lda.LDA(n_topics=N_TOPICS, n_iter=LDA_EPOCHS)
    topic_model.fit(bow)
    # Persist the model together with the vocabulary it was trained on.
    with open(OUTPUT_PKL + "/lda.pkl", "wb") as f:
        pickle.dump([topic_model, vocab], f)