def tokenize_query(query, ds):
    """
    Returns a dictionary with structure {term: frequency}.
    Preprocesses the input query string using the sklearn TfidfVectorizer's
    preprocessor and tokenizer before looking terms up in the vocabulary.
    """
    print("tokenize_query", file=sys.stderr)
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    # The with-block closes the file automatically; no explicit f.close() needed.
    with open(os.path.join(BASE, ds, 'vocab_to_ix.json')) as f:
        vocab_to_ix = json.load(f)
    prepro_q = tfidf_preprocessor(query)
    q_tokens = tfidf_tokenizer(prepro_q)
    gc.collect()
    query_dict_ix = defaultdict(int)
    query_dict_term = defaultdict(int)
    for tok in q_tokens:
        tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
        if tfidf_vocab_ix != -1:
            query_dict_ix[tfidf_vocab_ix] += 1
            query_dict_term[tok] += 1
    expanded_query_dict = expand_query(query_dict_ix, query_dict_term, vocab_to_ix)
    gc.collect()
    return expanded_query_dict
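# Hedged sketch (not part of the pipeline above): shows in isolation what the
# build_preprocessor()/build_tokenizer() pair used throughout these snippets
# does. The sample text and the _demo_* name are illustrative assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer

def _demo_preprocess_and_tokenize():
    helper = TfidfVectorizer(stop_words='english')
    prep = helper.build_preprocessor()  # lowercases (and strips accents if configured)
    tok = helper.build_tokenizer()      # default token pattern: runs of 2+ word characters
    print(tok(prep("Oil Prices Surge, Again!")))  # -> ['oil', 'prices', 'surge', 'again']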
def tokenize_query(query, ds, vocab_to_ix, words_compressed, docs_compressed, ATN_word_to_ix):
    """
    Returns a dictionary with structure {term: frequency}.
    Preprocesses the input query string using the sklearn TfidfVectorizer,
    then hands the term counts to expand_query for query expansion.
    """
    print("tokenize_query", file=sys.stderr)
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    prepro_q = tfidf_preprocessor(query)
    q_tokens = tfidf_tokenizer(prepro_q)
    gc.collect()
    query_dict_ix = defaultdict(int)
    query_dict_term = defaultdict(int)
    for tok in q_tokens:
        tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
        if tfidf_vocab_ix != -1:
            query_dict_ix[tfidf_vocab_ix] += 1
            query_dict_term[tok] += 1
    print("lending control to expand_query", file=sys.stderr)
    expanded_query_dict = expand_query(query_dict_ix, query_dict_term, vocab_to_ix,
                                       words_compressed, docs_compressed, ATN_word_to_ix)
    gc.collect()
    return expanded_query_dict
def vectorize_reu_iden():
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    news = pd.read_csv('data/reu_identifiers.csv', names=['date', 'id', 'title'],
                       usecols=['id', 'title'])
    news = news[news['title'].notnull()]
    news = news[2283884:]  # 2016 onwards
    news = news.reset_index(drop=True)  # reindex/reset_index returns a copy, so assign it
    gc.collect()
    article_tf = {}
    doc_freq = defaultdict(int)
    unique_toks = set()
    for ix, story in news.iterrows():
        tf_dict = defaultdict(int)
        # Apply the preprocessor before tokenizing so tokens are lowercased consistently
        tokens = tfidf_tokenizer(tfidf_preprocessor(story['title']))
        story_unique_toks = set(tokens)
        for tok in tokens:
            tf_dict[tok] += 1
        for tok in story_unique_toks:
            unique_toks.add(tok)
            doc_freq[tok] += 1
        article_tf[story['id']] = tf_dict
    gc.collect()
    return article_tf, doc_freq, unique_toks
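# Hedged sketch: the same tf/df bookkeeping as vectorize_reu_iden, run over a
# tiny in-memory corpus instead of the Reuters CSV; titles and ids are made up.
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

def _demo_tf_df():
    helper = TfidfVectorizer(stop_words='english')
    prep, tok = helper.build_preprocessor(), helper.build_tokenizer()
    titles = {1: "Oil prices rise", 2: "Oil demand falls"}
    article_tf, doc_freq = {}, defaultdict(int)
    for doc_id, title in titles.items():
        tokens = tok(prep(title))
        tf = defaultdict(int)
        for t in tokens:
            tf[t] += 1          # term frequency within this title
        for t in set(tokens):
            doc_freq[t] += 1    # document frequency across titles
        article_tf[doc_id] = tf
    print(dict(doc_freq))       # 'oil' appears in both titles -> 2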
def tokenize_query(query):
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    # The with-block closes the file automatically; no explicit f.close() needed.
    with open(os.path.join(os.path.dirname(__file__), 'reuters/vocab_to_ix.json')) as f:
        vocab_to_ix = json.load(f)
    prepro_q = tfidf_preprocessor(query)
    q_tokens = tfidf_tokenizer(prepro_q)
    gc.collect()
    query_dict = defaultdict(int)
    for tok in q_tokens:
        tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
        if tfidf_vocab_ix != -1:
            query_dict[tfidf_vocab_ix] += 1
    gc.collect()
    return query_dict
unclassified_features = vectorizer.transform(unclassified_df['Tweet'])

# Get predictions from the Naive Bayes classifier
unclassified_tweet_sentiments_bayes = classifier_bayes.predict(unclassified_features)

# Store the sentiment in a new column; NOTE: 0 is negative, 4 is positive
unclassified_df['Sentiment'] = unclassified_tweet_sentiments_bayes
unclassified_df.head()

# Next, the tweets need to be classified by major political party. In the
# Canadian context there are four categories to consider:
# ***'Liberal', 'Conservative', 'NDP', 'Others'***
# Since each tweet must be assigned to a party, a simple word-frequency
# counter is used for the assignment.

# Preprocessor and tokenizer code
preprocessor = vectorizer.build_preprocessor()
tokenizer = vectorizer.build_tokenizer()

# Defining the bag_of_words function
def bag_of_words(tw):
    '''(str) -> dict
    Input: a string tw (a tweet line)
    Output: a python dictionary mapping each unigram to its count
    '''
    unigram_ls = tokenizer(preprocessor(tw))
    # Create an empty dictionary
    bag_words = {}
    # Run through the tokenized unigram list, counting each occurrence
    for unigram in unigram_ls:
        bag_words[unigram] = bag_words.get(unigram, 0) + 1
    return bag_words
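# Hedged usage sketch for bag_of_words; the tweet text is made up and the exact
# token set depends on how `vectorizer` above was configured.
example_counts = bag_of_words("Vote vote for change in Canada")
print(example_counts)  # e.g. {'vote': 2, 'for': 1, 'change': 1, 'in': 1, 'canada': 1}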
X_test = np.array([''.join(el) for el in nyt_data[trainset_size + 1:len(nyt_data)]])
y_test = np.array([el for el in nyt_labels[trainset_size + 1:len(nyt_labels)]])

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                             strip_accents='unicode', norm='l2')

test_string = str(nyt_data[0])

print("Example string: " + test_string)
print("Preprocessed string: " + vectorizer.build_preprocessor()(test_string))
print("Tokenized string: " + str(vectorizer.build_tokenizer()(test_string)))
print("N-gram data string: " + str(vectorizer.build_analyzer()(test_string)))

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

svm_classifier = LinearSVC().fit(X_train, y_train)
class SplitVectorizer():

    def __init__(self, tfidf_model=None, input_file_name=None, type_analyzer='word',
                 n_gram_range=(1, 2), Xy='X', vectorize=False):
        if tfidf_model is None:
            assert input_file_name is not None  # Give a model or input text
            self.model = TfidfVectorizer(analyzer=type_analyzer, ngram_range=n_gram_range)
        else:
            self.model = tfidf_model
        self.XY = Xy
        self.input_file = input_file_name
        self.vectorize = vectorize

    def fit(self, X=None, y=None):
        with open(self.input_file) as f:
            self.model.fit(f)
        self.analyzer = self.model.build_analyzer()
        self.prep = self.model.build_preprocessor()
        self.tokenizer = self.model.build_tokenizer()
        # Inverse vocabulary: column index -> term
        self.vocab = {ix: w for w, ix in self.model.vocabulary_.items()}
        return self

    def get_matrices(self):
        self.docs_X = []
        self.docs_Y = []
        with open(self.input_file) as f:
            for a in f:
                x = self.tokenizer(self.prep(a))
                dl = len(x)
                # First half of each document goes to X, second half to Y
                self.docs_X.append(" ".join(x[:dl // 2]))
                self.docs_Y.append(" ".join(x[dl // 2:]))
        return self.model.transform(self.docs_X), self.model.transform(self.docs_Y)

    def Tx(self, x):
        if self.vectorize:
            return self.model.transform([x])
        return self.analyzer(x)

    def __iter__(self):
        with open(self.input_file) as f:
            for a in f:
                x = self.tokenizer(self.prep(a))
                dl = len(x)
                if self.XY == 'X':
                    yield self.Tx(" ".join(x[:dl // 2]))
                elif self.XY == 'Y':
                    yield self.Tx(" ".join(x[dl // 2:]))
                elif self.XY == 'join':
                    yield (self.Tx(" ".join(x[:dl // 2])),
                           self.Tx(" ".join(x[dl // 2:])))
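# Hedged usage sketch for SplitVectorizer; 'corpus.txt' (one document per line)
# is a hypothetical file name.
sv = SplitVectorizer(input_file_name='corpus.txt', Xy='join').fit()
X_half, Y_half = sv.get_matrices()  # tf-idf matrices for first/second document halves
for first, second in sv:            # analyzer output for each half, per document
    print(first, second)
    break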
test_data[i, 1] = 0
count_neg_test = count_neg_test + 1

label_test = test_data[:, 1]

# vctr = CountVectorizer(stop_words='english', min_df=1)
# vctr2 = HashingVectorizer(stop_words='english')
# Initialising the vectorizer; TF-IDF gives roughly 1 percent better accuracy
# than the alternatives above.
vctr = TfidfVectorizer(stop_words='english')

count_pos = 0
count_neg = 0

######################################################################################################
prep = vctr.build_preprocessor()
tok = vctr.build_tokenizer()

train = []
test = []
for i in range(len(train_data)):  # preprocessing of the train data
    string = prep(train_data[i, 0].lower())
    train.append(' '.join(tok(string)))
for i in range(len(test_data)):  # preprocessing of the test data
    string = prep(test_data[i, 0].lower())
    test.append(' '.join(tok(string)))
######################################################################################################

# Fit the bag-of-words vocabulary using the TF-IDF vectorizer
train_data1 = vctr.fit_transform(train).toarray()
# X_test = vctr.transform(test).toarray()
y_train = np.asarray(label_train, dtype="|S6")
y_train = y_train.astype(int)

clf1 = GradientBoostingClassifier(n_estimators=500)  # initialising classifiers
X_test = np.array([''.join(el) for el in nyt_data[trainset_size + 1:len(nyt_data)]])
y_test = np.array([el for el in nyt_labels[trainset_size + 1:len(nyt_labels)]])

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                             strip_accents='unicode', norm='l2')

test_string = str(nyt_data[0])

print("Example string: " + test_string)
print("Preprocessed string: " + vectorizer.build_preprocessor()(test_string))
print("Tokenized string: " + str(vectorizer.build_tokenizer()(test_string)))
print("N-gram data string: " + str(vectorizer.build_analyzer()(test_string)))
print("\n")

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = nb_classifier.predict(X_test)

print("MODEL: Multinomial Naive Bayes\n")
print('The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted)))
print('The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted)))
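# Hedged sketch of the preprocessor/tokenizer/analyzer distinction printed
# above, on a made-up string: the analyzer chains preprocessing and
# tokenization, then drops stop words and emits the configured n-grams.
_v = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
_s = "The Quick Fox"
print(_v.build_preprocessor()(_s))  # 'the quick fox'
print(_v.build_tokenizer()(_s))     # ['The', 'Quick', 'Fox'] (no lowercasing here)
print(_v.build_analyzer()(_s))      # ['quick', 'fox', 'quick fox']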
def main():
    seed = 9001
    combined_data = read_all_data()

    # Create train/test split of data
    x_train, x_test, y_train, y_test = train_test_split(
        combined_data["headline"],
        combined_data["is_clickbait"],
        random_state=seed)

    if len(sys.argv) > 1:
        print()
        print("Loading pickle...")
        print()
        pipe = utils.unpickle_gzip("models/pipeline.pickle.gz")
        # Recover the fitted vectorizer from the pipeline so it is available
        # for the word-frequency plot below.
        tf_v = pipe.named_steps["tfidfvectorizer"]
    else:
        print()
        print("Training...")
        print()
        # Instantiate TfidfVectorizer to translate text data to feature vectors
        # so they can be used as inputs for an estimator
        tf_v = TfidfVectorizer(strip_accents='unicode')

        # Pair the vectorizer with an estimator in a single pipeline
        clf = LogisticRegressionCV(
            cv=5,
            solver='saga',
            random_state=seed,
        )
        pipe = make_pipeline(tf_v, clf)
        pipe.fit(x_train, y_train)

    print()
    print("Predicting...")
    print()
    predictions = pipe.predict(x_test)
    utils.print_evaluation(y_test, predictions)

    if len(sys.argv) <= 1:
        print()
        print("Pickling...")
        print()
        utils.pickle_gzip(pipe, "models/pipeline.pickle.gz")

    # CANNOT RUN DUE TO MEMORY
    # rfc = RandomForestClassifier(
    #     n_jobs=-1,
    #     n_estimators=1000,
    #     random_state=seed,
    #     verbose=3)
    # predictions = rfc.predict(x_test)
    # utils.print_evaluation(y_test, predictions)

    print("\n\nPlotting frequency of word use . . .")
    plot_split_word_freqs(combined_data,
                          tf_v.build_preprocessor(),
                          tf_v.build_tokenizer())
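# Hedged sketch: how make_pipeline names its steps, which is what the
# named_steps["tfidfvectorizer"] lookup in main() relies on. Toy objects only.
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

_demo_pipe = make_pipeline(TfidfVectorizer(), LogisticRegression())
print(list(_demo_pipe.named_steps))  # ['tfidfvectorizer', 'logisticregression']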