コード例 #1
0
def train_word2vec(categories, comments, n_dim):
    """Train a word2vec model and a linear classifier on top of it.

    Each entry of ``comments`` is tokenized with ``tokenize_document``; the
    tokenized documents are used both to build the word2vec model and, via
    ``w2vectorize``, to produce the feature vectors the classifier is
    fitted on.

    Args:
        categories: Training labels, passed as the targets to
            ``SGDClassifier.fit``.
        comments: Iterable of raw documents to tokenize.
        n_dim: Dimension value forwarded to both ``word2vec_model`` and
            ``w2vectorize``.

    Returns:
        A ``(model, classifier)`` tuple: the word2vec model and the fitted
        ``SGDClassifier``.
    """
    from feature_extraction import tokenize_document, word2vec_model
    from sklearn.linear_model import SGDClassifier

    tokenized = [tokenize_document(text) for text in comments]
    w2v = word2vec_model(tokenized, n_dim)

    # NOTE(review): w2vectorize is not imported here; presumably it is
    # defined at module level alongside this function -- confirm.
    features = w2vectorize(tokenized, w2v, n_dim)

    # Log loss with an L1 penalty, optimized by SGD.
    clf = SGDClassifier(penalty='l1', loss='log')
    clf.fit(features, categories)

    return w2v, clf
コード例 #2
0
def train_word2vec(categories, comments, n_dim):
    """Build a word2vec model from ``comments`` and fit a classifier on it.

    The comments are tokenized with ``tokenize_document``, a word2vec model
    is created from the tokens by ``word2vec_model``, the tokens are turned
    into training vectors by ``w2vectorize``, and an ``SGDClassifier`` is
    fitted on those vectors against ``categories``.

    Args:
        categories: Target labels handed to ``SGDClassifier.fit``.
        comments: Iterable of raw documents.
        n_dim: Dimension value given to ``word2vec_model`` and
            ``w2vectorize``.

    Returns:
        Tuple ``(model, classifier)`` holding the word2vec model and the
        fitted classifier.
    """
    from feature_extraction import tokenize_document, word2vec_model
    from sklearn.linear_model import SGDClassifier

    token_lists = [tokenize_document(comment) for comment in comments]
    embedding_model = word2vec_model(token_lists, n_dim)

    # NOTE(review): w2vectorize is resolved from the enclosing module scope,
    # not imported in this function -- verify it is defined there.
    training_vectors = w2vectorize(token_lists, embedding_model, n_dim)

    sgd = SGDClassifier(loss='log', penalty='l1')
    sgd.fit(training_vectors, categories)

    return embedding_model, sgd
コード例 #3
0
        print "\n"
        print collocations
        print "\n"

        similar_words(comments, "fakeinsult")

        model = language_model(comments)

        print "\nSamples: "
        import pprint
        printer = pprint.PrettyPrinter(indent=4)
        printer.pprint(model["sound"].samples())

        print "\n"

        model = word2vec_model(comments)
        print model.similarity('retarded', 'loser')

    # Optional word2vec pipeline: runs only when the boolean 'WordVec' option
    # in EXECUTION_SECTION of the configuration is true.
    if config_parser.getboolean(EXECUTION_SECTION, 'WordVec'):
        from train import train_word2vec
        from train import w2vectorize
        from feature_extraction import tokenize_document
        # Train on the training set, passing 500 as the dimension argument.
        model, classifier = train_word2vec(categories, comments, 500)
        # Tokenize the held-out comments with stopwords='english'.
        # NOTE(review): confirm that train_word2vec tokenizes its input with
        # the same stopwords setting, otherwise train and test preprocessing
        # may differ.
        test_documents = [
            tokenize_document(document, stopwords='english')
            for document in test_comments
        ]
        # Vectorize with the same dimension value (500) used for training.
        test_vecs = w2vectorize(test_documents, model, 500)
        predictions = classifier.predict(test_vecs)
        print "\nWord2Vec Model Result\n"
        # Report the predictions against the true test labels.
        prediction_info(predictions, test_categories)
コード例 #4
0
ファイル: main.py プロジェクト: chrisdamba/trolling_detection
        print "\n"


        similar_words(comments, "fakeinsult")


        model = language_model(comments)

        print "\nSamples: "
        import pprint
        printer = pprint.PrettyPrinter(indent=4)
        printer.pprint(model["sound"].samples())

        print "\n"

        model = word2vec_model(comments)
        print model.similarity('retarded', 'loser')



    # Optional word2vec pipeline: runs only when the boolean 'WordVec' option
    # in EXECUTION_SECTION of the configuration is true.
    if config_parser.getboolean(EXECUTION_SECTION, 'WordVec'):
        from train import train_word2vec
        from train import w2vectorize
        from feature_extraction import tokenize_document
        # Train on the training set, passing 500 as the dimension argument.
        model, classifier = train_word2vec(categories, comments, 500)
        # Tokenize the held-out comments with stopwords='english'.
        # NOTE(review): confirm that train_word2vec tokenizes its input with
        # the same stopwords setting, otherwise train and test preprocessing
        # may differ.
        test_documents = [tokenize_document(document, stopwords='english') for document in test_comments]
        # Vectorize with the same dimension value (500) used for training.
        test_vecs = w2vectorize(test_documents, model, 500)
        predictions = classifier.predict(test_vecs)
        print "\nWord2Vec Model Result\n"
        # Report the predictions against the true test labels.
        prediction_info(predictions, test_categories)