def rf_predictor(incident):
    """Classify an incident using the pickled random-forest pipeline.

    Loads the persisted model, count vectorizer and tf-idf transformer,
    vectorizes the preprocessed incident text, and returns the predicted
    label together with the top class probability as a percent string.

    Args:
        incident: raw incident text accepted by ``rf_preprocess``.

    Returns:
        tuple: (predicted label, probability string such as "97.5%").
    """
    # Use context managers so the file handles are closed even if
    # pickle.load raises — the original left all three handles open.
    with open("Models/rf/randomforest_model.sav", 'rb') as f:
        rf_model = pickle.load(f)
    with open("Models/rf/randomforest_vector.pickel", "rb") as f:
        cv = pickle.load(f)
    with open("Models/rf/randomforest_transformer.pickel", "rb") as f:
        tf = pickle.load(f)

    vect_rf = cv.transform(rf_preprocess(incident))
    trans_rf = tf.transform(vect_rf)
    rf_pred = rf_model.predict(trans_rf)[0]
    prob = rf_model.predict_proba(trans_rf)
    rf_prob = str(np.round(np.max(prob) * 100, 2)) + "%"
    return rf_pred, rf_prob
def lda_analysis(users):
    """Fit (or load) an LDA topic model over every user tweet plus one
    synthetic document per fact topic, print the top words of each topic,
    and publish the text->index map and per-text topic distributions as
    module globals.

    Args:
        users: iterable of user objects exposing ``.tweets``, each tweet a
            dict with a ``'text'`` key.

    Returns:
        tuple: (lda_text_to_id, lda_topics_per_text).
    """
    global lda_text_to_id, lda_topics_per_text
    max_vocab = 1000
    num_topics = 50
    words_per_topic = 20

    print("Constructing user docs")
    # One document per tweet, flattened across all users.
    docs = [tweet['text'] for user in users for tweet in user.tweets]
    # Append the joined terms of each fact topic as extra documents.
    fact_topics = build_fact_topics()
    docs.extend(' '.join(terms) for terms in fact_topics['fact_terms'].values)
    print(docs[:5])

    print("TF fitting user docs")
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=max_vocab,
                                    stop_words='english')
    tf_vectorizer.fit(docs)
    doc_term_matrix = tf_vectorizer.transform(docs)

    if NEW_LDA_MODEL:
        print("Training new LDA model")
        lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5,
                                        learning_method='online',
                                        learning_offset=50., random_state=0)
        lda.fit(doc_term_matrix)
        with open('model_data/lda_model', 'wb') as model_file:
            pickle.dump(lda, model_file)
    else:
        with open('model_data/lda_model', 'rb') as model_file:
            lda = pickle.load(model_file)

    lda_text_to_id = {text: idx for idx, text in enumerate(docs)}
    lda_topics_per_text = lda.transform(doc_term_matrix)

    feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        # Highest-weighted terms for this topic, strongest first.
        top_terms = [feature_names[i]
                     for i in topic.argsort()[:-words_per_topic - 1:-1]]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()
    return lda_text_to_id, lda_topics_per_text
def main(_):
    """Classify companies' business-scope text and export the results.

    Trains/loads the term-frequency vectorizer and classifier, predicts an
    industry code for every row of the input CSV, joins the human-readable
    industry label, writes the merged table to disk, and prints a preview.
    """
    config = Config()
    model = Models(config)
    tf = model.trainTf()
    clf = model.trianModel()  # NOTE(review): "trian" spelling comes from the Models API

    # NOTE(review): "scapeOfBesiness" / "secnduName" match the data schema —
    # do not "fix" these keys without changing the upstream files too.
    frame = pd.read_csv(config.FLAGS.trainfile_dir)
    frame["cut"] = frame["scapeOfBesiness"].apply(cut)
    features = tf.transform(frame["cut"])
    frame["secInduCode"] = pd.DataFrame(clf.predict(features))
    labels = pd.DataFrame.from_dict(getLabels())
    frame = frame.merge(labels, on="secInduCode", how="left")
    frame.to_csv(
        "D:/[email protected]/Company22.csv", index=None, encoding="utf8")
    print(frame[["companyName", "secnduName"]].head())
# Evaluate duplicate-question detection: two questions are predicted
# duplicates when the cosine similarity of their tf-idf vectors > 0.7;
# accuracy is measured against the gold labels in `dup`.
vectorizer = TfidfVectorizer()
tf = vectorizer.fit(vocabulary_list)

score = 0
for question_a, question_b, label in zip(q1, q2, dup):
    # transform expects an iterable of documents, hence the 1-element lists.
    vec_a = tf.transform([question_a]).toarray()
    vec_b = tf.transform([question_b]).toarray()
    sim = cosine_similarity(vec_a, vec_b)[0][0]
    pred = 1 if sim > 0.7 else 0
    if pred == label:
        score += 1
print("acc: ", (score / len(q1)) * 100)
def get_feature_vector(test_data, vect):
    """Run test documents through the count -> percentile-select -> tf-idf
    pipeline and return the final feature matrix.

    NOTE(review): `ptile` and `tf` are module-level objects fitted
    elsewhere; only `vect` is passed in.

    Args:
        test_data: sequence whose first element is the iterable of documents.
        vect: fitted count vectorizer.

    Returns:
        The transformed feature matrix.
    """
    counts = vect.transform(test_data[0])
    selected = ptile.transform(counts)
    return tf.transform(selected)