def classificar(train_data, test_data, my_tags, binary, dirW2V, language,
                clean, show_confusion_graphic, file_result=None):
    """Classify plots from averaged word vectors with k-NN and logistic regression.

    Loads word vectors from ``dirW2V``, tokenizes the ``plot`` column of both
    frames with ``w2v_tokenize_text``, turns each token list into a feature
    vector via ``word_averaging_list`` and fits two classifiers on the
    training features: a 3-neighbour brute-force cosine k-NN and a logistic
    regression.

    Args:
        train_data: Frame with ``plot`` and ``tag`` columns used for fitting.
        test_data: Frame with ``plot`` and ``tag`` columns used for evaluation.
        my_tags: Forwarded to ``Avaliar.evaluate_prediction``.
        binary: Forwarded to ``load_word2vec_format`` as ``binary``.
        dirW2V: Path of the word2vec file to load.
        language: Forwarded to ``w2v_tokenize_text``.
        clean: Forwarded to ``w2v_tokenize_text``.
        show_confusion_graphic: Forwarded to ``Avaliar.evaluate_prediction``.
        file_result: Forwarded to ``Avaliar.evaluate_prediction``.

    Returns:
        A pair ``(predictions, accuracies)``; each is a 2-tuple ordered as
        (k-NN, logistic regression). The accuracy entries are whatever
        ``Avaliar.evaluate_prediction`` returns.
    """
    def tokenize_row(row):
        return w2v_tokenize_text(row['plot'], language, clean)

    def score(predicted):
        return Avaliar.evaluate_prediction(
            predicted, test_data.tag, my_tags, test_data.tag.unique(),
            show_confusion_graphic, file_result=file_result)

    embeddings = Word2Vec.load_word2vec_format(fname=dirW2V, binary=binary)
    embeddings.init_sims(replace=True)

    # The test frame is tokenized before the training frame.
    test_tokens = test_data.apply(tokenize_row, axis=1).values
    train_tokens = train_data.apply(tokenize_row, axis=1).values

    train_features = word_averaging_list(embeddings, train_tokens)
    test_features = word_averaging_list(embeddings, test_tokens)

    # k-NN
    knn = KNeighborsClassifier(n_neighbors=3, n_jobs=1, algorithm='brute',
                               metric='cosine')
    knn.fit(train_features, train_data.tag)
    knn_predicted = knn.predict(test_features)
    knn_accuracy = score(knn_predicted)

    # Logistic regression
    logistic = linear_model.LogisticRegression(n_jobs=1, C=1e5)
    logistic = logistic.fit(train_features, train_data['tag'])
    logistic_predicted = logistic.predict(test_features)
    logistic_accuracy = score(logistic_predicted)

    all_predictions = (knn_predicted, logistic_predicted)
    all_accuracies = (knn_accuracy, logistic_accuracy)
    return all_predictions, all_accuracies
def classificar(train_data, test_data, my_tags, language, max_features,
                show_confusion_graphic, file_result=None):
    """Classify plots with bag-of-words counts and logistic regression.

    Fits a word-level ``CountVectorizer`` (tokenized by
    ``nltk.word_tokenize``) on the training ``plot`` column, trains a logistic
    regression on the resulting counts and delegates prediction and scoring
    of ``test_data`` to ``Avaliar.predict``.

    Args:
        train_data: Frame with ``plot`` and ``tag`` columns used for fitting.
        test_data: Frame handed to ``Avaliar.predict``; its ``tag`` column
            supplies the unique tags that are also passed along.
        my_tags: Forwarded to ``Avaliar.predict``.
        language: Passed to ``CountVectorizer`` as ``stop_words``.
        max_features: Passed to ``CountVectorizer`` as ``max_features``.
        show_confusion_graphic: Forwarded to ``Avaliar.predict``.
        file_result: Forwarded to ``Avaliar.predict``.

    Returns:
        The ``(prediction, accuracy)`` pair produced by ``Avaliar.predict``.
    """
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=nltk.word_tokenize,
        preprocessor=None,
        stop_words=language,
        max_features=max_features,
    )
    bag_of_words = vectorizer.fit_transform(train_data['plot'])

    classifier = linear_model.LogisticRegression(n_jobs=1, C=1e5)
    classifier = classifier.fit(bag_of_words, train_data['tag'])
    # Persistence hook: the fitted classifier can be stored/restored with
    # joblib.dump(classifier, 'filename.pkl') / joblib.load('filename.pkl').

    outcome = Avaliar.predict(
        vectorizer, classifier, test_data, my_tags, test_data.tag.unique(),
        show_confusion_graphic, file_result=file_result)
    prediction, accuracy = outcome
    return prediction, accuracy
def classificar(train_data, test_data, my_tags, number_iters, size, alpha,
                clean, show_box_plot, show_confusion_graphic,
                file_result=None):
    """Train one Word2Vec model per tag and classify by best-scoring model.

    A base hierarchical-softmax Word2Vec model builds its vocabulary from the
    sentences of every training document whose tag is in ``my_tags``. One
    deep copy per tag is then trained only on that tag's sentences. Test
    documents are scored with ``docprob`` and each is assigned the tag whose
    column holds the row maximum.

    Fixes relative to the previous version:
      * Each per-tag model is trained with ``[tag]`` instead of the bare tag.
        Passing the bare tag made ``r['y'] in stars`` a substring test for
        string tags (and a ``TypeError`` for non-string tags).
      * ``size`` and ``alpha`` are now applied to the base model. They were
        previously given to a ``Word2Vec`` instance that was discarded, so
        they had no effect on training.

    Args:
        train_data: Input for ``plots`` yielding the training documents.
        test_data: Input for ``plots`` yielding the test documents; its
            ``tag`` column supplies the unique tags passed to the evaluator.
        my_tags: Sequence of tags; one model is trained per entry.
        number_iters: Passed to ``Word2Vec`` as ``iter``.
        size: Passed to ``Word2Vec`` as ``size``.
        alpha: Passed to ``Word2Vec`` as ``alpha``.
        clean: Forwarded to ``plots``.
        show_box_plot: When truthy, ``print_box_plot`` is called with the
            test documents, the scores and ``my_tags``.
        show_confusion_graphic: Forwarded to ``Avaliar.evaluate_prediction``.
        file_result: Forwarded to ``Avaliar.evaluate_prediction``.

    Returns:
        ``(prediction, accuracy)`` where ``prediction`` holds one tag per
        test document and ``accuracy`` is whatever
        ``Avaliar.evaluate_prediction`` returns.
    """
    def tag_sentences(reviews, stars=my_tags):
        """Yield each sentence of every review whose tag is in ``stars``."""
        for r in reviews:
            if r['y'] in stars:
                for s in r['x']:
                    yield s

    revtrain = list(plots(train_data, clean))
    revtest = list(plots(test_data, clean))
    np.random.shuffle(revtrain)

    basemodel = Word2Vec(
        workers=multiprocessing.cpu_count(),  # train in parallel
        iter=number_iters,                    # number of training passes
        hs=1, negative=0,                     # hierarchical softmax only
        size=size,
        alpha=alpha,
    )
    basemodel.build_vocab(tag_sentences(revtrain))

    # Every tag starts from the same vocabulary and initial weights.
    genremodels = [deepcopy(basemodel) for _ in my_tags]
    for model, tag in zip(genremodels, my_tags):
        # Wrap the tag in a list so membership is an equality test.
        slist = list(tag_sentences(revtrain, [tag]))
        model.train(slist, total_examples=len(slist))

    # Persistence hook: genremodels can be stored/restored with
    # joblib.dump(genremodels, 'filename.pkl') / joblib.load('filename.pkl').

    # Predict: pick, per document, the column with the highest score and
    # use that column label as an index into my_tags.
    probs = docprob([r['x'] for r in revtest], genremodels)
    prediction = probs.idxmax(axis=1).apply(lambda x: my_tags[x])

    if show_box_plot:
        print_box_plot(revtest, probs, my_tags)

    target = [r['y'] for r in revtest]
    accuracy = Avaliar.evaluate_prediction(
        prediction, target, my_tags, test_data.tag.unique(),
        show_confusion_graphic, file_result=file_result)
    return prediction, accuracy
def classificar(train_data, test_data, my_tags, dirW2V, binary, dirData,
                shapeLin, shapeCol, language, show_confusion_graphic,
                file_result=None):
    """Classify plots with a 1-nearest-neighbour Word Mover's Distance model.

    Steps:
      1. Load the word vectors from ``dirW2V``, normalise them and dump them
         to ``dirData + "embed.dat"`` (memmap) and ``dirData + "embed.vocab"``
         (one word per line, ordered by vocabulary index).
      2. Re-open both files, keep only the words that also occur in the
         tokenized training plots, and build a ``CountVectorizer`` restricted
         to that common vocabulary.
      3. Fit ``WordMoversKNN`` on the training counts and predict the test
         counts.

    Fix relative to the previous version: training plots were fetched with
    ``train_data['plot'][i]`` for ``i in range(len(train_data))``, which is a
    label lookup and fails (or picks other rows) when the frame's index is
    not exactly ``0..n-1``. The column is now iterated directly.

    Args:
        train_data: Frame with ``plot`` and ``tag`` columns used for fitting.
        test_data: Frame with ``plot`` and ``tag`` columns used for evaluation.
        my_tags: Forwarded to ``Avaliar.evaluate_prediction``.
        dirW2V: Path of the word2vec file to load.
        binary: Forwarded to ``load_word2vec_format`` as ``binary``.
        dirData: Prefix (concatenated as-is) for the ``embed.dat`` and
            ``embed.vocab`` files written and read here.
        shapeLin: Row count used when re-opening ``embed.dat``.
        shapeCol: Column count used when re-opening ``embed.dat``.
        language: Forwarded to ``w2v_tokenize_text`` and used as
            ``stop_words`` for the vocabulary-building ``CountVectorizer``.
        show_confusion_graphic: Forwarded to ``Avaliar.evaluate_prediction``.
        file_result: Forwarded to ``Avaliar.evaluate_prediction``.

    Returns:
        ``(prediction, accuracy)`` where ``accuracy`` is whatever
        ``Avaliar.evaluate_prediction`` returns.
    """
    wv = KeyedVectors.load_word2vec_format(dirW2V, binary=binary)
    wv.init_sims(replace=True)

    # Dump the normalised vectors and the index-ordered vocabulary to disk.
    fp = np.memmap(dirData + "embed.dat", dtype=np.double, mode='w+',
                   shape=wv.syn0norm.shape)
    fp[:] = wv.syn0norm[:]
    # NOTE(review): both vocabulary file handles use the platform default
    # text encoding since none is given.
    with open(dirData + "embed.vocab", "w") as f:
        for _, w in sorted((voc.index, word)
                           for word, voc in wv.vocab.items()):
            print(w, file=f)
    del fp, wv

    # NOTE(review): the file above was written with shape
    # wv.syn0norm.shape; (shapeLin, shapeCol) is presumably expected to
    # match it - confirm with callers.
    W = np.memmap(dirData + "embed.dat", dtype=np.double, mode="r",
                  shape=(shapeLin, shapeCol))
    with open(dirData + "embed.vocab") as f:
        vocab_dict = {line.strip(): k for k, line in enumerate(f.readlines())}

    # Iterate the column values directly so the result does not depend on
    # the frame's index labels.
    train_tokenized = [w2v_tokenize_text(plot, language)
                       for plot in train_data['plot']]
    flat_train_tokenized = [item
                            for sublist in train_tokenized
                            for item in sublist]
    del train_tokenized  # only the flattened tokens are needed from here on

    vect = CountVectorizer(stop_words=language).fit(flat_train_tokenized)
    del flat_train_tokenized

    # Keep only the words present in both the corpus and the embeddings.
    common = [word for word in vect.get_feature_names()
              if word in vocab_dict]
    W_common = W[[vocab_dict[w] for w in common]]
    del W, vocab_dict

    vect = CountVectorizer(vocabulary=common, dtype=np.double)
    del common

    X_train = vect.fit_transform(train_data['plot'])
    X_test = vect.transform(test_data['plot'])

    knn = WordMoversKNN(n_neighbors=1, W_embed=W_common, verbose=5, n_jobs=7)
    del W_common
    knn.fit(X_train, train_data['tag'])
    del X_train
    prediction = knn.predict(X_test)
    del X_test

    accuracy = Avaliar.evaluate_prediction(
        prediction, test_data.tag, my_tags, test_data.tag.unique(),
        show_confusion_graphic, file_result=file_result)
    return prediction, accuracy
def classificar(train_data, test_data, my_tags, seed, clean,
                show_confusion_graphic, file_result=None):
    """Classify plots with Doc2Vec vectors using three strategies.

    Every row of both frames becomes a ``TaggedDocument`` whose words come
    from ``tokenize_text`` and whose single tag is the row's ``tag``. A
    Doc2Vec model is trained on the training documents and used to infer a
    vector for every training and test document. Three predictions follow:

      1. the most similar document vector (``docvecs.most_similar`` with
         ``topn=1``) for each inferred test vector;
      2. a logistic regression fitted on the inferred training vectors;
      3. the same, but with the model's ``seed``/``random`` reset from
         ``seed`` before re-inferring the test vectors, and with the
         regression created with ``random_state=42``. The training vectors
         are not re-inferred for this step.

    Args:
        train_data: Frame with ``plot`` and ``tag`` columns used for fitting.
        test_data: Frame with ``plot`` and ``tag`` columns used for evaluation.
        my_tags: Forwarded to ``Avaliar.evaluate_prediction``.
        seed: Seed applied to the Doc2Vec model before the third strategy.
        clean: Forwarded to ``tokenize_text``.
        show_confusion_graphic: Forwarded to ``Avaliar.evaluate_prediction``.
        file_result: Forwarded to ``Avaliar.evaluate_prediction``.

    Returns:
        A pair ``(predictions, accuracies)``; each is a 3-tuple in the
        strategy order above. The accuracy entries are whatever
        ``Avaliar.evaluate_prediction`` returns.
    """
    def to_tagged(row):
        return TaggedDocument(words=tokenize_text(row['plot'], clean),
                              tags=[row.tag])

    def infer_all(documents):
        # Returns (targets, vectors) as two parallel tuples.
        pairs = [(doc.tags[0], model.infer_vector(doc.words, steps=20))
                 for doc in documents]
        return zip(*pairs)

    def score(predicted, expected):
        return Avaliar.evaluate_prediction(
            predicted, expected, my_tags, test_data.tag.unique(),
            show_confusion_graphic, title=str(model),
            file_result=file_result)

    train_docs = train_data.apply(to_tagged, axis=1).values
    test_docs = test_data.apply(to_tagged, axis=1).values

    model = Doc2Vec(train_docs, workers=1, size=5, iter=20, dm=1)

    train_targets, train_vectors = infer_all(train_docs)
    test_targets, test_vectors = infer_all(test_docs)
    # Persistence hook: the model can be stored/restored with
    # joblib.dump(model, 'filename.pkl') / joblib.load('filename.pkl').

    # Strategy 1: nearest document vector
    nearest_predicted = []
    for vector in test_vectors:
        best_match = model.docvecs.most_similar([vector], topn=1)[0]
        nearest_predicted.append(best_match[0])
    nearest_accuracy = score(nearest_predicted, test_targets)

    # Strategy 2: logistic regression on the inferred vectors
    first_logreg = linear_model.LogisticRegression(n_jobs=1, C=1e5).fit(
        train_vectors, train_targets)
    first_predicted = first_logreg.predict(test_vectors)
    first_accuracy = score(first_predicted, test_targets)

    # Strategy 3: reseed the model, re-infer the test vectors only
    model.seed = seed
    model.random = random.RandomState(seed)
    test_targets, test_vectors = infer_all(test_docs)
    second_logreg = linear_model.LogisticRegression(
        n_jobs=1, C=1e5, random_state=42).fit(train_vectors, train_targets)
    second_predicted = second_logreg.predict(test_vectors)
    second_accuracy = score(second_predicted, test_targets)

    all_predictions = (nearest_predicted, first_predicted, second_predicted)
    all_accuracies = (nearest_accuracy, first_accuracy, second_accuracy)
    return all_predictions, all_accuracies