Example #1
0
def get_analyzed_feed():
    """Classify every feed entry with a Naive Bayes model trained on the corpus.

    This is a generator, so the feed is fetched and the classifier is
    trained when iteration starts, not when the function is called.

    The texts to classify are the first element of each row returned by
    ``app.get_list_of_feed()``.  The training corpus comes from
    ``get_corpus_for_sp()``, whose result is iterated as a mapping from a
    category to a list of posts.  The ``(category, post)`` pairs are shuffled
    and the first 75% of them are used to fit a TF-IDF vectorizer and a
    ``MultinomialNB`` classifier; the remaining pairs are not used.

    Yields:
        ``(text, predicted_category)`` tuples, one per feed entry, in feed
        order.  Nothing is yielded when the feed is empty.
    """
    # Function-scope import kept from the original code (presumably to avoid
    # a circular import with ``app`` -- TODO confirm).
    from app import get_list_of_feed
    import random

    texts = [row[0] for row in get_list_of_feed()]
    if not texts:
        # Nothing to classify: skip loading the corpus and training a model
        # whose predictions would never be used.
        return

    corpus = get_corpus_for_sp()
    pairs = [
        (category, post)
        for category, posts_list in corpus.items()
        for post in posts_list
    ]
    # Shuffle so the training slice below mixes all categories instead of
    # following the iteration order of the corpus mapping.
    random.shuffle(pairs)

    # Only the leading share of the shuffled pairs is used for training.  The
    # held-out remainder used to be vectorized and predicted, but that result
    # was discarded, so the evaluation step has been removed.
    SPLIT_PERC = 0.75
    split_size = int(len(pairs) * SPLIT_PERC)
    train_pairs = pairs[:split_size]
    train_data = [post for _, post in train_pairs]
    y_train = [category for category, _ in train_pairs]

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(train_data)

    clf = MultinomialNB(alpha=.01)
    clf.fit(X_train, y_train)

    # Reuse the fitted vocabulary so feed texts land in the same feature
    # space the classifier was trained on.
    predicted = clf.predict(vectorizer.transform(texts))

    for text, category in zip(texts, predicted):
        yield (text, category)
Example #2
0
                       "headers, signatures, and quoting.")

    (opts, args) = op.parse_args()
    if len(args) > 0:
        op.error("this script takes no arguments.")
        sys.exit(1)

    #print(__doc__)
    #op.print_help()
    #print()





    res = get_corpus_for_sp()

    pairs = []
    for category, posts_list in res.items():
        for post in posts_list:
            pairs.append((category, post))

    import random
    random.shuffle(pairs)

    data, target = [], []
    for category, post in pairs:
        target.append(category)
        data.append(post)