def predict_batch(self, batch, top_n=1):
    """Predict a batch of document - question pairs."""
    documents, questions, candidates = [], [], []
    for b in batch:
        documents.append(b[0])
        questions.append(b[1])
        candidates.append(b[2] if len(b) == 3 else None)
    candidates = candidates if any(candidates) else None

    # Tokenize the inputs, perhaps multi-processed.
    if self.workers:
        q_tokens = self.workers.map_async(tokenize, questions)
        c_tokens = self.workers.map_async(tokenize, documents)
        q_tokens = list(q_tokens.get())
        c_tokens = list(c_tokens.get())
    else:
        q_tokens = list(map(self.tokenizer.tokenize, questions))
        c_tokens = list(map(self.tokenizer.tokenize, documents))

    examples = []
    for i in range(len(questions)):
        examples.append({
            'id': i,
            'question': q_tokens[i].words(),
            'question_char': q_tokens[i].chars(),
            'qlemma': q_tokens[i].lemmas(),
            'qpos': q_tokens[i].pos(),
            'qner': q_tokens[i].entities(),
            'document': c_tokens[i].words(),
            'document_char': c_tokens[i].chars(),
            'clemma': c_tokens[i].lemmas(),
            'cpos': c_tokens[i].pos(),
            'cner': c_tokens[i].entities(),
        })

    # Stick document tokens in candidates for decoding
    if candidates:
        candidates = [{'input': c_tokens[i], 'cands': candidates[i]}
                      for i in range(len(candidates))]

    # Build the batch and run it through the model
    batch_exs = batchify([vectorize(e, self.model) for e in examples])
    s, e, score = self.model.predict(batch_exs, candidates, top_n)

    # Retrieve the predicted spans
    results = []
    for i in range(len(s)):
        predictions = []
        for j in range(len(s[i])):
            span = c_tokens[i].slice(s[i][j], e[i][j] + 1).untokenize()
            predictions.append((span, score[i][j]))
        results.append(predictions)
    return results
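# --- Usage sketch (not from the original source) -------------------------
# A minimal, hypothetical call of predict_batch above: each batch item is a
# (document, question) pair, optionally with a candidate-answer list. The
# `predictor` object and the example texts are assumptions for illustration.
batch = [
    ("Paris is the capital and largest city of France.",
     "What is the capital of France?"),
    ("The quick brown fox jumps over the lazy dog.",
     "What does the fox jump over?"),
]
results = predictor.predict_batch(batch, top_n=2)
# `results` holds one list per input pair; each entry is a (span, score) tuple.
for preds in results:
    for span, score in preds:
        print(span, float(score))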
def __getitem__(self, index):
    return vectorize(self.examples[index], self.model)
def __getitem__(self, index):
    return vectorize(self.examples[index], self.model, self.single_answer)
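# --- Illustrative sketch (not from the original source) ------------------
# The two __getitem__ variants above read like methods of dataset wrappers
# around vectorize (one without and one with a single_answer flag). Below is
# a minimal sketch of such a wrapper, assuming a PyTorch-style Dataset; the
# class name and constructor arguments are assumptions, not a confirmed API.
from torch.utils.data import Dataset

class ReaderDataset(Dataset):
    def __init__(self, examples, model, single_answer=False):
        self.examples = examples
        self.model = model
        self.single_answer = single_answer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        # Delegate feature construction to vectorize, as in the snippets above.
        return vectorize(self.examples[index], self.model, self.single_answer)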
import pandas as pd
import vector as v
import preprocessing as p
import cluster2 as c
import classifier as r

# Load the raw tweet data.
a = pd.read_csv("Z:/TermPaper/twitter_cred-master/data.csv")

print("cleaning....")
doc, id1 = p.clean(a)

print("vectorizing....")
dvec, global_vector = v.vectorize(doc)

print("clustering....")
g, t = c.cluster(dvec, global_vector, id1)

cnt = 0
x = []
print(len(t))

print("credibility calculating")
r.classifier(g)
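# --- Illustrative stand-in (not the project's own code) ------------------
# The pipeline above calls v.vectorize(doc) to turn cleaned tweets into
# document vectors plus a global feature space. The project's implementation
# is not shown here; this sketches one common way such a step is done, using
# TF-IDF from scikit-learn. The function and variable names are assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_documents(docs):
    """Return per-document TF-IDF vectors and the shared vocabulary."""
    tfidf = TfidfVectorizer(stop_words="english")
    doc_vectors = tfidf.fit_transform(docs)      # one sparse row per document
    vocabulary = tfidf.get_feature_names_out()   # global feature space
    return doc_vectors, vocabulary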
from vector import get_word_features, vectorize, get_words, naive_bayes_vector
# get_data is used below; it is assumed to live alongside the other helpers.
from vector import get_data
import cPickle
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from maxent import Maxent
import numpy
from naive_bayes import NaiveBayesClassifier
from svm_classifier import SVM

data = get_data('shortdatabase.csv')
word_features = get_word_features(data['tweet'])
word_features = sorted(word_features)
word_vector = vectorize(word_features, data['tweet'], data['sentiment'])

# Split the vectorized examples into feature vectors and labels.
vector = []
labels = []
for example in word_vector:
    vector.append(example[0])
    labels.append(example[1])

print "Stage 1: Word Polarity"
print "training bayesian network"
words = get_words("features.txt")
bayes_vector = naive_bayes_vector(words, data['tweet'], data['sentiment'])
#print bayes_vector
NaiveBayesClassifier.train(bayes_vector)
#gnb = BernoulliNB()