def get_analyzed_feed():
    """Classify every feed entry and yield ``(text, predicted_category)`` pairs.

    A TF-IDF + multinomial Naive Bayes model is trained from scratch on each
    call, using the labelled corpus returned by ``get_corpus_for_sp()``, and
    is then applied to the texts obtained from ``app.get_list_of_feed()``.

    Yields:
        tuple: ``(text, category)`` where ``text`` is the first element of a
        feed row and ``category`` is the label predicted for that text.

    Notes:
        * This is a generator: nothing runs until it is iterated.
        * The corpus is shuffled with the unseeded ``random`` module and only
          the first 75% of it is used for fitting, so predictions may differ
          from one call to the next.
        * An empty feed yields nothing and skips training altogether.
    """
    import random

    # Imported lazily, as in the original code (presumably to avoid a
    # circular import with ``app`` -- confirm before hoisting to file level).
    from app import get_list_of_feed

    # Each feed row is a sequence whose first element is the text to classify.
    texts = [row[0] for row in get_list_of_feed()]
    if not texts:
        # Nothing to classify, so do not pay for loading and fitting a model.
        return

    # Flatten {category: [post, ...]} into (category, post) pairs and shuffle
    # them so the training slice below is not dominated by a single category.
    corpus = get_corpus_for_sp()
    labelled = [
        (category, post)
        for category, posts in corpus.items()
        for post in posts
    ]
    random.shuffle(labelled)

    # Fit on the first 75% of the shuffled corpus.  The remaining 25% used to
    # be vectorised and scored, but that result was discarded, so the
    # held-out part is simply left unused here.
    train_fraction = 0.75
    split_size = int(len(labelled) * train_fraction)
    train_pairs = labelled[:split_size]
    train_categories = [category for category, _ in train_pairs]
    train_posts = [post for _, post in train_pairs]

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    classifier = MultinomialNB(alpha=.01)
    classifier.fit(vectorizer.fit_transform(train_posts), train_categories)

    # Reuse the fitted vocabulary for the feed texts and pair each text with
    # its predicted label.
    predicted = classifier.predict(vectorizer.transform(texts))
    for text, category in zip(texts, predicted):
        yield (text, category)
"headers, signatures, and quoting.")
# NOTE(review): the line above is the tail of a call that starts outside this
# chunk (it looks like the end of a help string for an option definition --
# confirm against the full file).

# Parse the command line.  `op` is created outside this chunk.
(opts, args) = op.parse_args()
# This script accepts options only; any positional argument is an error.
if len(args) > 0:
    op.error("this script takes no arguments.")
    # NOTE(review): if `op` is an optparse.OptionParser, op.error() already
    # terminates the process, which would make this exit unreachable -- confirm.
    sys.exit(1)

# Disabled usage output, kept from the original script.
#print(__doc__)
#op.print_help()
#print()

# Load the labelled corpus; `.items()` is iterated as
# (category, posts_list) pairs below.
res = get_corpus_for_sp()

# Flatten the corpus into a single list of (category, post) tuples.
pairs = []
for category, posts_list in res.items():
    for post in posts_list:
        pairs.append((category, post))

# Shuffle in place so posts of the same category are no longer contiguous.
# No seed is set here, so the order differs from run to run.
import random
random.shuffle(pairs)

# Split the shuffled pairs into two parallel lists: data[i] is a post and
# target[i] is the category it was filed under.
data, target = [], []
for category, post in pairs:
    target.append(category)
    data.append(post)