def tokenize_query(query, ds):
    """
    Returns a dictionary with structure {term: frequency}. The input query
    string is preprocessed and tokenized with scikit-learn's TfidfVectorizer.
    """
    print("tokenize_query", file=sys.stderr)
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    with open(os.path.join(BASE, ds, 'vocab_to_ix.json')) as f:
        vocab_to_ix = json.load(f)
        prepro_q = tfidf_preprocessor(query)
        q_tokens = tfidf_tokenizer(prepro_q)
        gc.collect()
        query_dict_ix = defaultdict(int)
        query_dict_term = defaultdict(int)
        for tok in q_tokens:
            tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
            if tfidf_vocab_ix != -1:
                query_dict_ix[tfidf_vocab_ix] += 1
                query_dict_term[tok] += 1
        expanded_query_dict = expand_query(query_dict_ix, query_dict_term,
                                           vocab_to_ix)
        gc.collect()
        return expanded_query_dict
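
# A minimal, self-contained sketch of the preprocess/tokenize/count pattern the
# function above relies on; the sample query and the counts in the final comment
# are illustrative only.
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

helper = TfidfVectorizer(stop_words='english')
preprocess = helper.build_preprocessor()
tokenize = helper.build_tokenizer()

query = "Central banks raise interest rates; banks react"
term_freq = defaultdict(int)
for tok in tokenize(preprocess(query)):
    term_freq[tok] += 1
# term_freq -> {'central': 1, 'banks': 2, 'raise': 1, 'interest': 1,
#               'rates': 1, 'react': 1}
# Note: stop-word filtering happens in build_analyzer(), not build_tokenizer().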
Example 2
def tokenize_query(query, ds, vocab_to_ix, words_compressed, docs_compressed,
                   ATN_word_to_ix):
    """
    Returns a dictionary with structure {term: frequency}. The input query
    string is preprocessed and tokenized with scikit-learn's TfidfVectorizer.
    """
    print("tokenize_query", file=sys.stderr)
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    prepro_q = tfidf_preprocessor(query)
    q_tokens = tfidf_tokenizer(prepro_q)
    gc.collect()
    query_dict_ix = defaultdict(int)
    query_dict_term = defaultdict(int)
    for tok in q_tokens:
        tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
        if tfidf_vocab_ix != -1:
            query_dict_ix[tfidf_vocab_ix] += 1
            query_dict_term[tok] += 1
    print("handing control to expand_query", file=sys.stderr)
    expanded_query_dict = expand_query(query_dict_ix, query_dict_term, vocab_to_ix,
                                       words_compressed, docs_compressed, ATN_word_to_ix)
    gc.collect()
    return expanded_query_dict
def vectorize_reu_iden():
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()

    news = pd.read_csv('data/reu_identifiers.csv',
                       names=['date', 'id', 'title'],
                       usecols=['id', 'title'])
    news = news[news['title'].notnull()]
    news = news[2283884:]  # 2016 onward
    news = news.reset_index(drop=True)
    gc.collect()

    article_tf = {}
    doc_freq = defaultdict(int)
    unique_toks = set()
    for ix, story in news.iterrows():
        tf_dict = defaultdict(int)
        tokens = tfidf_tokenizer(tfidf_preprocessor(story['title']))  # apply the preprocessor before tokenizing
        story_unique_toks = set(tokens)

        for tok in tokens:
            tf_dict[tok] += 1

        for tok in story_unique_toks:
            unique_toks.add(tok)
            doc_freq[tok] += 1

        article_tf[story['id']] = tf_dict

    gc.collect()

    return article_tf, doc_freq, unique_toks
def tokenize_query(query):
    helper = TfidfVectorizer(min_df=3, stop_words='english',  dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    with open(os.path.join(os.path.dirname(__file__), 'reuters/vocab_to_ix.json')) as f:
        vocab_to_ix = json.load(f)
        prepro_q = tfidf_preprocessor(query)
        q_tokens = tfidf_tokenizer(prepro_q)
        gc.collect()
        query_dict = defaultdict(int)
        for tok in q_tokens:
            tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
            if tfidf_vocab_ix != -1:
                query_dict[tfidf_vocab_ix] += 1
        gc.collect()
        return query_dict
unclassified_features = vectorizer.transform(unclassified_df['Tweet'])
# Get predictions from the Naive Bayes classifier
unclassified_tweet_sentiments_bayes = classifier_bayes.predict(
    unclassified_features)

# Store the sentiment in a new column, NOTE 0 is negative, 4 is positive
unclassified_df['Sentiment'] = unclassified_tweet_sentiments_bayes

unclassified_df.head()

# Next we need code to classify the tweets by major political party. In the Canadian
# context, four party categories are considered: 'Liberal', 'Conservative', 'NDP', and 'Others'.
# Since each tweet needs to be assigned to a party, a simple word-frequency counter
# is used to make the assignment (see the sketch after the bag_of_words function below).

# Preprocessor and tokenizer code

preprocessor = vectorizer.build_preprocessor()
tokenizer = vectorizer.build_tokenizer()


# Defining the bag_of_words function
def bag_of_words(tw):
    '''(str) -> dict
    Input: a string tw (a tweet line)
    Output: a dictionary mapping each unigram in tw to its frequency
    '''

    unigram_ls = tokenizer(preprocessor(tw))

    # Create an empty dictionary
    bag_words = {}
    # Run through the tokenized unigram list and count each unigram
    for unigram in unigram_ls:
        bag_words[unigram] = bag_words.get(unigram, 0) + 1
    return bag_words
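
# A minimal sketch of the party-assignment step described above. The keyword
# lists here are illustrative assumptions, not the original author's lists.
party_keywords = {
    'Liberal': {'liberal', 'trudeau', 'lpc'},
    'Conservative': {'conservative', 'tory', 'cpc'},
    'NDP': {'ndp', 'singh'},
}


def assign_party(tw):
    '''(str) -> str
    Return the party whose keywords occur most often in the tweet tw,
    or 'Others' if no party keyword occurs at all.
    '''
    counts = bag_of_words(tw)
    scores = {party: sum(counts.get(k, 0) for k in kws)
              for party, kws in party_keywords.items()}
    best_party, best_score = max(scores.items(), key=lambda kv: kv[1])
    return best_party if best_score > 0 else 'Others'


# Example usage (assumes unclassified_df still has its 'Tweet' column):
# unclassified_df['Party'] = unclassified_df['Tweet'].apply(assign_party)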
Example 6
X_test = np.array([''.join(el) for el in nyt_data[trainset_size + 1:len(nyt_data)]])
y_test = np.array([el for el in nyt_labels[trainset_size + 1:len(nyt_labels)]])

#print(X_train)

vectorizer = TfidfVectorizer(min_df=2,
                             ngram_range=(1, 2),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')

test_string = str(nyt_data[0])

print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))


X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

svm_classifier = LinearSVC().fit(X_train, y_train)

Example 7
class SplitVectorizer():
    def __init__(self, tfidf_model=None,
                        input_file_name=None,
                        type_analyzer='word',
                        n_gram_range=(1, 2),
                        Xy='X',
                        vectorize=False):
        if tfidf_model is None:
            assert input_file_name is not None  # need either a model or an input file
            self.model = TfidfVectorizer(analyzer=type_analyzer,
                                         ngram_range=n_gram_range)
        elif input_file_name is None:
            self.model = tfidf_model
        else:  # both a fitted model and an input file were given; prefer the model
            self.model = tfidf_model

        self.XY = Xy
        self.input_file = input_file_name
        self.vectorize = vectorize

    def fit(self, X=None, y=None):
        with open(self.input_file) as f:
            self.model.fit(f)

        self.analyzer = self.model.build_analyzer()
        self.prep = self.model.build_preprocessor()
        self.tokenizer = self.model.build_tokenizer()
        self.vocab = {self.model.vocabulary_[w]: w
                      for w in self.model.vocabulary_}

        return self

    def get_matrices(self):
        self.docs_X = []
        self.docs_Y = []
        with open(self.input_file) as f:
            for line in f:
                x = self.tokenizer(self.prep(line))
                dl = len(x)
                self.docs_X.append(" ".join(x[:dl // 2]))
                self.docs_Y.append(" ".join(x[dl // 2:]))
        return self.model.transform(self.docs_X), \
               self.model.transform(self.docs_Y)

    def Tx(self, x):
        if self.vectorize:
            return self.model.transform([x])
        else:
            return self.analyzer(x)

    def __iter__(self):
        with open(self.input_file) as f:
            for line in f:
                x = self.tokenizer(self.prep(line))
                dl = len(x)

                if self.XY == 'X':
                    yield self.Tx(" ".join(x[:dl // 2]))
                elif self.XY == 'Y':
                    yield self.Tx(" ".join(x[dl // 2:]))
                elif self.XY == 'join':
                    yield (self.Tx(" ".join(x[:dl // 2])),
                           self.Tx(" ".join(x[dl // 2:])))
Example 8
        test_data[i,1] = 0
        count_neg_test = count_neg_test + 1  # label 0 is negative, so count it as a negative test example
        
label_test = test_data[:,1]
#vctr =  CountVectorizer(stop_words='english',min_df = 1)
#vctr2 = HashingVectorizer(stop_words='english') 
vctr = TfidfVectorizer(stop_words='english')  # initialising the vectorizer; TF-IDF gives about 1 percent better accuracy than the other vectorizers
count_pos = 0
count_neg = 0

######################################################################################################
train = []
test = []
preprocess = vctr.build_preprocessor()  # build the callables once, outside the loops
tokenize = vctr.build_tokenizer()

for i in range(len(train_data)):           # processing of the train data
    string = train_data[i, 0]
    tokens = tokenize(preprocess(string.lower()))
    train.append(' '.join(tokens))

for i in range(len(test_data)):            # processing of the test data
    string = test_data[i, 0]
    tokens = tokenize(preprocess(string.lower()))
    test.append(' '.join(tokens))

######################################################################################################
train_data1 = vctr.fit_transform(train).toarray()  # fit the TF-IDF bag-of-words vocabulary on the training data
#X_test = vctr.transform(test).toarray()
y_train = np.asarray(label_train, dtype="|S6")
y_train = y_train.astype(int)
clf1 = GradientBoostingClassifier(n_estimators=500)  # initialising classifier
Example 9
X_test = np.array([''.join(el) for el in nyt_data[trainset_size+1:len(nyt_data)]]) 
y_test = np.array([el for el in nyt_labels[trainset_size+1:len(nyt_labels)]]) 

#print(X_train)

vectorizer = TfidfVectorizer(min_df=2,
                             ngram_range=(1, 2),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')

test_string = str(nyt_data[0])

print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))
print "\n"
 
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
Example 10
def main():
    seed = 9001

    combined_data = read_all_data()

    # Create train/test split of data
    x_train, x_test, y_train, y_test = train_test_split(
        combined_data["headline"],
        combined_data["is_clickbait"],
        random_state=seed)

    if len(sys.argv) > 1:
        print()
        print("Loading pickle...")
        print()

        pipe = utils.unpickle_gzip("models/pipeline.pickle.gz")
    else:
        print()
        print("Training...")
        print()

        # Instantiate TfidfVectorizer to translate text data into feature vectors
        # so that they can be used as inputs for an estimator
        tf_v = TfidfVectorizer(strip_accents='unicode')

        # With the vectorizer defined, set up the estimator
        clf = LogisticRegressionCV(
            cv=5,
            solver='saga',
            random_state=seed,
        )

        pipe = make_pipeline(tf_v, clf)

        pipe.fit(x_train, y_train)

    print()
    print("Predicting...")
    print()

    predictions = pipe.predict(x_test)
    utils.print_evaluation(y_test, predictions)

    if len(sys.argv) <= 1:
        print()
        print("Pickling...")
        print()

        utils.pickle_gzip(pipe, "models/pipeline.pickle.gz")

    # CANNOT RUN DUE TO MEMORY
    # rfc = RandomForestClassifier(
    #     n_jobs=-1,
    #     n_estimators=1000,
    #     random_state=seed,
    #     verbose=3)
    # predictions = rfc.predict(x_test)
    # utils.print_evaluation(y_test, predictions)

    print("\n\nPlotting frequency of word use . . .")
    # Pull the vectorizer out of the pipeline so this also works when the
    # pipeline was loaded from the pickle (tf_v is only defined in the training branch).
    tf_v = pipe.named_steps['tfidfvectorizer']
    plot_split_word_freqs(combined_data, tf_v.build_preprocessor(),
                          tf_v.build_tokenizer())