コード例 #1
0
def classificar(train_data,
                test_data,
                my_tags,
                binary,
                dirW2V,
                language,
                clean,
                show_confusion_graphic,
                file_result=None):
    """Classify plots from averaged word vectors with KNN and logistic regression.

    Word vectors are loaded from ``dirW2V`` (word2vec format) and normalized
    in place. The ``'plot'`` column of both frames is tokenized with
    ``w2v_tokenize_text`` and turned into features by ``word_averaging_list``.
    Two classifiers are fitted on the training tags and each one is scored
    with ``Avaliar.evaluate_prediction`` against ``test_data.tag``.

    Args:
        train_data: Frame with ``'plot'`` and ``'tag'`` columns used for fitting.
        test_data: Frame with ``'plot'`` and ``'tag'`` columns used for scoring.
        my_tags: Tag collection forwarded to ``Avaliar.evaluate_prediction``.
        binary: Forwarded to ``load_word2vec_format``.
        dirW2V: Path of the word2vec file to load.
        language: Forwarded to ``w2v_tokenize_text``.
        clean: Forwarded to ``w2v_tokenize_text``.
        show_confusion_graphic: Forwarded to ``Avaliar.evaluate_prediction``.
        file_result: Forwarded to ``Avaliar.evaluate_prediction``.

    Returns:
        A pair ``(predictions, accuracies)``; each is a 2-tuple ordered as
        (KNN, logistic regression). The accuracy entries are whatever
        ``Avaliar.evaluate_prediction`` returns.
    """
    wv = Word2Vec.load_word2vec_format(fname=dirW2V, binary=binary)
    wv.init_sims(replace=True)

    def tokenize_row(row):
        return w2v_tokenize_text(row['plot'], language, clean)

    def evaluate(predicted):
        return Avaliar.evaluate_prediction(predicted,
                                           test_data.tag,
                                           my_tags,
                                           test_data.tag.unique(),
                                           show_confusion_graphic,
                                           file_result=file_result)

    # Test rows are tokenized before training rows, as in the original flow.
    test_tokens = test_data.apply(tokenize_row, axis=1).values
    train_tokens = train_data.apply(tokenize_row, axis=1).values

    train_features = word_averaging_list(wv, train_tokens)
    test_features = word_averaging_list(wv, test_tokens)

    # K-nearest neighbours: brute-force search with the cosine metric.
    knn = KNeighborsClassifier(n_neighbors=3,
                               n_jobs=1,
                               algorithm='brute',
                               metric='cosine')
    knn.fit(train_features, train_data.tag)
    knn_predicted = knn.predict(test_features)
    knn_accuracy = evaluate(knn_predicted)

    # Logistic regression.
    logreg = linear_model.LogisticRegression(n_jobs=1, C=1e5).fit(
        train_features, train_data['tag'])
    logreg_predicted = logreg.predict(test_features)
    logreg_accuracy = evaluate(logreg_predicted)

    return (knn_predicted, logreg_predicted), (knn_accuracy, logreg_accuracy)
コード例 #2
0
ファイル: BagOfWords.py プロジェクト: matszrmn/TCC
def classificar(train_data,
                test_data,
                my_tags,
                language,
                max_features,
                show_confusion_graphic,
                file_result=None):
    """Classify plots with a bag-of-words logistic regression.

    A ``CountVectorizer`` (NLTK word tokenizer, ``language`` stop words, at
    most ``max_features`` terms) is fitted on ``train_data['plot']``. A
    logistic regression is then trained on the resulting counts and
    ``train_data['tag']``. Prediction and scoring on ``test_data`` are
    delegated to ``Avaliar.predict``.

    Args:
        train_data: Frame with ``'plot'`` and ``'tag'`` columns used for fitting.
        test_data: Frame passed to ``Avaliar.predict``; its ``tag`` column
            provides the unique target tags.
        my_tags: Tag collection forwarded to ``Avaliar.predict``.
        language: Used as the vectorizer's ``stop_words`` argument.
        max_features: Vocabulary size cap for the vectorizer.
        show_confusion_graphic: Forwarded to ``Avaliar.predict``.
        file_result: Forwarded to ``Avaliar.predict``.

    Returns:
        The ``(prediction, accuracy)`` pair produced by ``Avaliar.predict``.
    """
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=nltk.word_tokenize,
        preprocessor=None,
        stop_words=language,
        max_features=max_features,
    )
    bow_train = vectorizer.fit_transform(train_data['plot'])

    # To persist the model: joblib.dump(classifier, 'filename.pkl') to write
    # it and joblib.load('filename.pkl') to read it back.
    classifier = linear_model.LogisticRegression(n_jobs=1, C=1e5).fit(
        bow_train, train_data['tag'])

    predicted, score = Avaliar.predict(
        vectorizer,
        classifier,
        test_data,
        my_tags,
        test_data.tag.unique(),
        show_confusion_graphic,
        file_result=file_result,
    )
    return predicted, score
コード例 #3
0
def classificar(train_data, test_data, my_tags, number_iters, size, alpha, clean, show_box_plot, show_confusion_graphic, file_result=None):
    """Classify plots by training one Word2Vec model per tag.

    A base model builds its vocabulary from all training sentences whose tag
    is in ``my_tags``. One deep copy of it per tag is then trained only on
    that tag's sentences. ``docprob`` scores the test documents against the
    per-tag models, and the tag whose column is largest is the prediction.

    Args:
        train_data: Training data passed to ``plots``.
        test_data: Test data passed to ``plots``; its ``tag`` column provides
            the unique target tags for evaluation.
        my_tags: Sequence of tags; one model is trained per entry, and the
            position of a tag is the index used to map ``docprob`` columns
            back to tags.
        number_iters: Passed as ``iter`` to the base ``Word2Vec`` model.
        size: Passed as ``size`` to the base ``Word2Vec`` model.
        alpha: Passed as ``alpha`` to the base ``Word2Vec`` model.
        clean: Forwarded to ``plots``.
        show_box_plot: When truthy, ``print_box_plot`` is called with the
            test items, the scores and the tags.
        show_confusion_graphic: Forwarded to ``Avaliar.evaluate_prediction``.
        file_result: Forwarded to ``Avaliar.evaluate_prediction``.

    Returns:
        ``(prediction, accuracy)`` where ``accuracy`` is whatever
        ``Avaliar.evaluate_prediction`` returns.
    """
    def tag_sentences(reviews, stars=my_tags):
        # ``stars`` must be a collection of tags, never a bare tag: with a
        # plain string, ``in`` would silently become a substring test.
        for r in reviews:
            if r['y'] in stars:
                for s in r['x']:
                    yield s

    revtrain = list(plots(train_data, clean))
    revtest = list(plots(test_data, clean))

    np.random.shuffle(revtrain)

    # ``size`` and ``alpha`` used to be passed only to a throwaway model and
    # therefore never reached the models that are actually trained.
    basemodel = Word2Vec(workers=multiprocessing.cpu_count(),  # parallel training
                         iter=number_iters,  # number of training iterations
                         hs=1, negative=0,  # hierarchical softmax, no negative sampling
                         size=size, alpha=alpha)
    basemodel.build_vocab(tag_sentences(revtrain))

    genremodels = [deepcopy(basemodel) for _ in my_tags]
    for model, tag in zip(genremodels, my_tags):
        # Wrap the tag in a list so membership is an exact match.
        slist = list(tag_sentences(revtrain, [tag]))
        model.train(slist, total_examples=len(slist))

    # To persist the models: joblib.dump(genremodels, 'filename.pkl') to
    # write them and joblib.load('filename.pkl') to read them back.

    # Predict: choose, per test document, the tag whose model scores highest.
    probs = docprob([r['x'] for r in revtest], genremodels)
    prediction = probs.idxmax(axis=1).apply(lambda x: my_tags[x])

    if show_box_plot:
        print_box_plot(revtest, probs, my_tags)

    target = [r['y'] for r in revtest]
    accuracy = Avaliar.evaluate_prediction(prediction, target, my_tags, test_data.tag.unique(), show_confusion_graphic, file_result=file_result)
    return prediction, accuracy
コード例 #4
0
ファイル: WMDistanceV2.py プロジェクト: matszrmn/TCC
def classificar(train_data,
                test_data,
                my_tags,
                dirW2V,
                binary,
                dirData,
                shapeLin,
                shapeCol,
                language,
                show_confusion_graphic,
                file_result=None):
    """Classify plots with a 1-nearest-neighbour ``WordMoversKNN`` model.

    Steps:
      1. Load word vectors from ``dirW2V``, normalize them, and dump the
         normalized matrix to ``dirData + "embed.dat"`` and the vocabulary
         (one word per line, ordered by vocabulary index) to
         ``dirData + "embed.vocab"``.
      2. Reopen the matrix read-only and keep only the rows of words that
         also occur in the tokenized training plots.
      3. Count those words in the train and test plots, fit ``WordMoversKNN``
         on the training counts and predict the test tags.

    Args:
        train_data: Frame with ``'plot'`` and ``'tag'`` columns used for fitting.
        test_data: Frame with ``'plot'`` and ``'tag'`` columns used for scoring.
        my_tags: Tag collection forwarded to ``Avaliar.evaluate_prediction``.
        dirW2V: Path of the word2vec file to load.
        binary: Forwarded to ``load_word2vec_format``.
        dirData: Prefix for the two scratch files. It is concatenated
            directly with the file names, so a directory must end with a
            path separator.
        shapeLin: Row count used when reopening ``embed.dat``; expected to
            match the row count of the matrix written in step 1.
        shapeCol: Column count used when reopening ``embed.dat``; expected
            to match the column count of the matrix written in step 1.
        language: Forwarded to ``w2v_tokenize_text`` and used as the
            ``stop_words`` argument of the first ``CountVectorizer``.
        show_confusion_graphic: Forwarded to ``Avaliar.evaluate_prediction``.
        file_result: Forwarded to ``Avaliar.evaluate_prediction``.

    Returns:
        ``(prediction, accuracy)`` where ``accuracy`` is whatever
        ``Avaliar.evaluate_prediction`` returns.
    """
    wv = KeyedVectors.load_word2vec_format(dirW2V, binary=binary)
    wv.init_sims(replace=True)

    fp = np.memmap(dirData + "embed.dat",
                   dtype=np.double,
                   mode='w+',
                   shape=wv.syn0norm.shape)
    fp[:] = wv.syn0norm[:]
    # Make sure the data is on disk before the file is reopened below.
    fp.flush()

    with open(dirData + "embed.vocab", "w") as f:
        for _, w in sorted(
            (voc.index, word) for word, voc in wv.vocab.items()):
            print(w, file=f)
    del fp, wv

    W = np.memmap(dirData + "embed.dat",
                  dtype=np.double,
                  mode="r",
                  shape=(shapeLin, shapeCol))
    with open(dirData + "embed.vocab") as f:
        vocab_list = [line.strip() for line in f]

    # Word -> row index in W.
    vocab_dict = {w: k for k, w in enumerate(vocab_list)}

    # Iterate the column itself rather than indexing it with 0..n-1: integer
    # indexing on a Series is label-based and breaks (KeyError or wrong row
    # order) when the frame does not carry a default RangeIndex.
    train_tokenized = [
        w2v_tokenize_text(plot, language) for plot in train_data['plot']
    ]

    flat_train_tokenized = [
        item for sublist in train_tokenized for item in sublist
    ]

    vect = CountVectorizer(stop_words=language).fit(flat_train_tokenized)
    del flat_train_tokenized  # maybe remove

    # Keep only the training words that have an embedding.
    common = [word for word in vect.get_feature_names() if word in vocab_dict]
    W_common = W[[vocab_dict[w] for w in common]]
    del W, vocab_dict  # maybe remove

    vect = CountVectorizer(vocabulary=common, dtype=np.double)
    del common  # maybe remove

    X_train = vect.fit_transform(train_data['plot'])
    del train_tokenized  # maybe remove

    X_test = vect.transform(test_data['plot'])

    knn = WordMoversKNN(n_neighbors=1, W_embed=W_common, verbose=5, n_jobs=7)
    del W_common  # maybe remove

    knn.fit(X_train, train_data['tag'])
    del X_train  # maybe remove

    prediction = knn.predict(X_test)
    del X_test  # maybe remove

    accuracy = Avaliar.evaluate_prediction(prediction,
                                           test_data.tag,
                                           my_tags,
                                           test_data.tag.unique(),
                                           show_confusion_graphic,
                                           file_result=file_result)
    return prediction, accuracy
コード例 #5
0
ファイル: Doc2Vec.py プロジェクト: matszrmn/TCC
def classificar(train_data,
                test_data,
                my_tags,
                seed,
                clean,
                show_confusion_graphic,
                file_result=None):
    """Classify plots from Doc2Vec vectors with three strategies.

    Each row becomes a ``TaggedDocument`` whose words come from
    ``tokenize_text(row['plot'], clean)`` and whose only tag is the row's
    ``tag``. A ``Doc2Vec`` model is trained on the training documents, and
    vectors are then inferred for the train and test documents. Three
    predictions are produced and each is scored with
    ``Avaliar.evaluate_prediction``:

      1. The tag of the most similar document vector
         (``docvecs.most_similar`` with ``topn=1``).
      2. Logistic regression on the inferred vectors.
      3. Logistic regression with ``random_state=42``, after the model's
         ``seed`` and ``random`` are reset from ``seed`` and the test
         vectors are inferred again.

    Args:
        train_data: Frame with ``'plot'`` and ``'tag'`` columns used for fitting.
        test_data: Frame with ``'plot'`` and ``'tag'`` columns used for scoring.
        my_tags: Tag collection forwarded to ``Avaliar.evaluate_prediction``.
        seed: Seed applied to the Doc2Vec model before the third strategy.
        clean: Forwarded to ``tokenize_text``.
        show_confusion_graphic: Forwarded to ``Avaliar.evaluate_prediction``.
        file_result: Forwarded to ``Avaliar.evaluate_prediction``.

    Returns:
        A pair ``(predictions, accuracies)``; each is a 3-tuple in the
        strategy order above. The accuracy entries are whatever
        ``Avaliar.evaluate_prediction`` returns.
    """
    def to_tagged(row):
        return TaggedDocument(words=tokenize_text(row['plot'], clean),
                              tags=[row.tag])

    train_docs = train_data.apply(to_tagged, axis=1)
    test_docs = test_data.apply(to_tagged, axis=1)

    trainsent = train_docs.values
    testsent = test_docs.values
    model = Doc2Vec(trainsent, workers=1, size=5, iter=20, dm=1)

    def infer_all(documents):
        # Yields two tuples: the first tag of every document and its vector.
        pairs = [(doc.tags[0], model.infer_vector(doc.words, steps=20))
                 for doc in documents]
        return zip(*pairs)

    def evaluate(predicted, targets):
        return Avaliar.evaluate_prediction(predicted,
                                           targets,
                                           my_tags,
                                           test_data.tag.unique(),
                                           show_confusion_graphic,
                                           title=str(model),
                                           file_result=file_result)

    train_targets, train_regressors = infer_all(trainsent)
    test_targets, test_regressors = infer_all(testsent)

    # To persist the model: joblib.dump(model, 'filename.pkl') to write it
    # and joblib.load('filename.pkl') to read it back.

    # Strategy 1: tag of the nearest document vector.
    nearest_tags = [
        model.docvecs.most_similar([vector], topn=1)[0][0]
        for vector in test_regressors
    ]
    nearest_accuracy = evaluate(nearest_tags, test_targets)

    # Strategy 2: logistic regression on the inferred vectors.
    plain_logreg = linear_model.LogisticRegression(n_jobs=1, C=1e5).fit(
        train_regressors, train_targets)
    plain_prediction = plain_logreg.predict(test_regressors)
    plain_accuracy = evaluate(plain_prediction, test_targets)

    # Strategy 3: reseed the model, infer the test vectors again, and fit a
    # logistic regression with a fixed random state.
    model.seed = seed
    model.random = random.RandomState(seed)
    test_targets, test_regressors = infer_all(testsent)

    seeded_logreg = linear_model.LogisticRegression(
        n_jobs=1, C=1e5, random_state=42).fit(train_regressors, train_targets)
    seeded_prediction = seeded_logreg.predict(test_regressors)
    seeded_accuracy = evaluate(seeded_prediction, test_targets)

    ListPredictions = (nearest_tags, plain_prediction, seeded_prediction)
    ListAccuracy = (nearest_accuracy, plain_accuracy, seeded_accuracy)
    return ListPredictions, ListAccuracy