import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer


def file():
    cats = ['alt.atheism', 'sci.electronics']

    newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

    newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
    vectorizer = TfidfVectorizer()  # tokenize every document and build the tf-idf statistics

    vectors_train = vectorizer.fit_transform(newsgroups_train.data)
    vectors = vectorizer.transform(newsgroups_test.data)
    print(vectors.shape[1])
    #f=open('test_all.txt','wb')
    for j in range(0, vectors.shape[0]):
        item_id = list()
        tokens = vectorizer.build_tokenizer()(newsgroups_test.data[j])  # extract the token list for this document
        #print tokens

        word_sort = np.argsort(-vectors[j].data)
        print('node ' + str(j))
        for i in range(0, len(word_sort)):
            word = vectorizer.get_feature_names()[vectors[j].indices[word_sort[i]]]  # this is the tf-idf term
            for line in range(0, len(tokens)):
                if tokens[line].lower() == word:
                    item_id.append((line, word_sort[i]))

        pos_item = sorted(item_id, key=lambda jj: jj[0], reverse=True)  # extract the tf-idf terms, sorted by position

        word_word = np.zeros([len(word_sort), len(word_sort)])
        for p in range(0, len(pos_item)):
            if p < (len(pos_item) - 1):
                ki = word_sort[pos_item[p][1]]
                kj = word_sort[pos_item[p + 1][1]]
                word_word[ki, kj] = word_word[ki, kj] + 1
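
# A minimal, self-contained sketch of the core trick used above -- mapping a sparse
# tf-idf row back to vocabulary terms via .data and .indices (toy corpus invented for
# illustration; on older scikit-learn use get_feature_names() instead):
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "the dog chased the cat"]
vec = TfidfVectorizer()
X = vec.fit_transform(docs)
terms = vec.get_feature_names_out()

row = X[1]                       # sparse tf-idf row of the second document
order = np.argsort(-row.data)    # positions of the nonzero values, highest tf-idf first
for pos in order:
    print(terms[row.indices[pos]], round(row.data[pos], 3))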
Example 2
def tokenize_query(query, ds, vocab_to_ix, words_compressed, docs_compressed,
                   ATN_word_to_ix):
    """
    Returns a dictionary with structure {term : frequency}. Also preprocesses
    the input query string using the Sklearn TfidfVectorizer.
    """
    print >> sys.stderr, "tokenize_query"
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    prepro_q = tfidf_preprocessor(query)
    q_tokens = tfidf_tokenizer(prepro_q)
    gc.collect()
    query_dict_ix = defaultdict(int)
    query_dict_term = defaultdict(int)
    for tok in q_tokens:
        tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
        if tfidf_vocab_ix != -1:
            query_dict_ix[vocab_to_ix[tok]] += 1
            query_dict_term[tok] += 1
    print >> sys.stderr, "lending control to expand query"
    expanded_query_dict = expand_query(query_dict_ix, query_dict_term, vocab_to_ix, \
     words_compressed, docs_compressed, ATN_word_to_ix)
    gc.collect()
    return expanded_query_dict
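
# The function above relies on build_preprocessor() and build_tokenizer() working on an
# unfitted TfidfVectorizer -- no corpus is needed just to normalize and split a query.
# A hedged sketch of that idea (the query string and vocabulary here are made up):
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

helper = TfidfVectorizer(stop_words='english')
preprocess = helper.build_preprocessor()   # lowercasing / accent stripping
tokenize = helper.build_tokenizer()        # regex-based word splitting

vocab_to_ix = {'solar': 0, 'panel': 1, 'battery': 2}   # hypothetical vocabulary
query = "Solar panels and BATTERY storage"

counts = defaultdict(int)
for tok in tokenize(preprocess(query)):
    if tok in vocab_to_ix:
        counts[tok] += 1
print(dict(counts))   # frequencies of the query tokens that appear in the vocabulary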
def tokenize_query(query, ds):
    """
    Returns a dictionary with structure {term : frequency}. Also preprocesses
    the input query string using the Sklearn TfidfVectorizer.
    """
    print >> sys.stderr, "tokenize_query"
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    with open(os.path.join(BASE, os.path.join(ds, 'vocab_to_ix.json'))) as f:
        vocab_to_ix = json.load(f)
        prepro_q = tfidf_preprocessor(query)
        q_tokens = tfidf_tokenizer(prepro_q)
        gc.collect()
        query_dict_ix = defaultdict(int)
        query_dict_term = defaultdict(int)
        for tok in q_tokens:
            tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
            if tfidf_vocab_ix != -1:
                query_dict_ix[vocab_to_ix[tok]] += 1
                query_dict_term[tok] += 1
        expanded_query_dict = expand_query(query_dict_ix, query_dict_term,
                                           vocab_to_ix)
        gc.collect()
        return expanded_query_dict
def vectorize_reu_iden():
    helper = TfidfVectorizer(min_df=3, stop_words='english', dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()

    news = pd.read_csv('data/reu_identifiers.csv',
                       names=['date', 'id', 'title'],
                       usecols=['id', 'title'])
    news = news[news['title'].notnull()]
    news = news[2283884:]  # 2016 onward
    news = news.reset_index(drop=True)
    gc.collect()

    article_tf = {}
    doc_freq = defaultdict(lambda: 0)
    unique_toks = set()
    for ix, story in news.iterrows():
        tf_dict = defaultdict(lambda: 0)
        tokens = tfidf_tokenizer(story['title'])
        story_unique_toks = set(tokens)

        for tok in tokens:
            tf_dict[tok] += 1

        for tok in story_unique_toks:
            unique_toks.add(tok)
            doc_freq[tok] += 1

        article_tf[story['id']] = tf_dict

    gc.collect()

    return article_tf, doc_freq, unique_toks
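
# vectorize_reu_iden() only collects raw term and document frequencies; a rough sketch of
# the follow-up step that turns them into smoothed tf-idf weights, assuming article_tf,
# doc_freq and the document count n_docs come from a run like the one above:
import math

def tfidf_weights(article_tf, doc_freq, n_docs):
    """Convert raw counts into smoothed tf-idf weights per article."""
    weights = {}
    for article_id, tf_dict in article_tf.items():
        weights[article_id] = {
            term: tf * (math.log((1 + n_docs) / (1 + doc_freq[term])) + 1)
            for term, tf in tf_dict.items()
        }
    return weights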
Example 6
    def corpusweights(s):

        records = s.maC.getRecords()
        tfidf = []
        topics = []
        corpus = []
        for record in records:
            if (record['domain'] == s.domain):
                topics.append(record['topic'])
                corpus.append(record['body'])
        if s.verbose:
            s.logger.info("corpusweights : topics : " + str(topics) +
                          " len(corpus) : " + str(len(corpus)))

        tf = TfidfVectorizer(analyzer='word',
                             ngram_range=(1, 3),
                             stop_words='english',
                             lowercase=True,
                             token_pattern='[A-Za-z]{2,}')
        tokenize = tf.build_tokenizer()
        tfidf_matrix = tf.fit_transform(corpus)
        feature_names = tf.get_feature_names()
        dense_tfidf_matrix = tfidf_matrix.todense()

        for i in range(0, len(dense_tfidf_matrix)):
            topic = dense_tfidf_matrix[i].tolist()[0]
            # filter out phrases with a score of 0
            phrase_scores = [
                pair for pair in zip(range(0, len(topic)), topic)
                if pair[1] > 0
            ]
            sorted_phrase_scores = sorted(phrase_scores,
                                          key=lambda t: t[1] * -1)

            # find the min and max score for normalization by grabbing the scores of
            # the first and last elements in the sorted list
            max_score = sorted_phrase_scores[0][1]
            min_score = sorted_phrase_scores[len(sorted_phrase_scores) - 1][1]

            tfidf.append(dict({'topic': topics[i], 'phrases': []}))
            for phrase, score in [(feature_names[word_id], score)
                                  for (word_id, score) in sorted_phrase_scores
                                  ]:
                # normalize scores to a 0-1 range, then squash with 1 - 100**(-x)
                normalized_score = (score - min_score) / (max_score - min_score)
                normalized_score = 1 - 100 ** (-normalized_score)
                tfidf[i]['phrases'].append(
                    dict({
                        'phrase': phrase,
                        'score': normalized_score
                    }))

        return tfidf
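
# The two-step rescaling at the end of corpusweights() first min-max normalizes each score
# to [0, 1] and then applies 1 - 100**(-x), which pushes mid-range scores strongly toward 1
# (the top score maps to 0.99, the bottom to 0). A tiny illustration of the curve:
for x in (0.0, 0.1, 0.25, 0.5, 1.0):
    print(x, round(1 - 100 ** (-x), 3))
# 0.0 0.0 / 0.1 0.369 / 0.25 0.684 / 0.5 0.9 / 1.0 0.99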
Example 7
def create_vocab(train):
    init_time = time.time()
    vocab = set()
    t = TfidfVectorizer()
    tokenizer = t.build_tokenizer()
    for ex in train[0]:
        vocab.update(tokenizer(ex))
    end_time = time.time()
    print("it took " + str(end_time - init_time) + "to create the vocabulary")
    return vocab
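
# Hypothetical usage of create_vocab(); the indexing train[0] suggests train is a
# (texts, labels) pair, so a toy value of that shape is assumed here:
train = (["the quick brown fox", "jumps over the lazy dog"], [0, 1])
vocab = create_vocab(train)
print(sorted(vocab))
# ['brown', 'dog', 'fox', 'jumps', 'lazy', 'over', 'quick', 'the']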
def tokenize(observations):
    vectorizer = TfidfVectorizer(
        strip_accents='unicode',
        lowercase=True,
        analyzer='word',
    )
    tokenizer = vectorizer.build_tokenizer()
    observations['aft_comment'] = observations['aft_comment'].astype(str)
    tokenized_text = observations['aft_comment'].apply(tokenizer).values
    return tokenized_text
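
# Quick check of tokenize() on a toy frame (the column name is taken from the snippet,
# the comments are invented). It also shows that build_tokenizer() only splits on the
# token pattern -- lowercasing happens inside the full analyzer, not here:
import pandas as pd

observations = pd.DataFrame({'aft_comment': ["Great article!", "Needs more sources."]})
print(tokenize(observations))   # one token list per comment; note the case is preserved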
Example 9
class TfidfTokenizerWrapper(AbstractTokenizer):
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tokenizer = self.vectorizer.build_tokenizer()

    def tokenize(self, text):
        return self.tokenizer(text)

    def convert_tokens_to_ids(self, text_list):
        return self.vectorizer.fit_transform(
            (' '.join(tokenized_text)
             for tokenized_text in text_list)).toarray()
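
# Hypothetical usage of the wrapper above (AbstractTokenizer is assumed to be importable
# from the surrounding project). Despite its name, convert_tokens_to_ids() returns dense
# tf-idf rows rather than integer ids:
wrapper = TfidfTokenizerWrapper()
docs = ["the cat sat", "the dog barked"]
token_lists = [wrapper.tokenize(d) for d in docs]
matrix = wrapper.convert_tokens_to_ids(token_lists)
print(matrix.shape)   # one row per document, one column per non-stop-word term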
Example 11
class Analyzer(object):
    def __init__(self):
        self.tfidf = TfidfVectorizer(min_df=1, binary=False, ngram_range=(1, 3), tokenizer=Tokenizer())
        self.tokens = self.tfidf.build_tokenizer()
        self.ngram = self.tfidf.build_analyzer()

    def __call__(self, sentence):
        ret = self.ngram(sentence)
        terms = self.tokens(sentence)
        for term in terms:
            cate = term_category(term)
            if term != cate:
                ret.append(cate)
        return ret
Example 12
class Vectorizer(object):
    def __init__(self):
        self.count_vec = TfidfVectorizer(binary = True,
                                         ngram_range = (1, 3),
                                         tokenizer = Tokenizer())

        self.last_vec = CountVectorizer(binary = True, ngram_range = (1, 1), tokenizer = Tokenizer())


    def collect_last_term(self, X):
        X_last = list()
        tokens = self.last_vec.build_tokenizer()
        _logger.debug("Extracting last term for each sentence")
        for sent in X:
            X_last.append(tokens(sent)[-1])
        _logger.debug("Fitting last-term vectorizer")
        return X_last
        

    def fit(self, X, y = None):
        _logger.debug("Fitting count vectorizer")
        self.count_vec.fit(X)
        X_last = self.collect_last_term(X)
        self.last_vec.fit(X_last)
        return self

    def transform(self, X, y = None):
        #return self.count_vec.transform(X)
        _logger.debug("Doing tfidf transform")
        Xc = self.count_vec.transform(X)

        X_last = self.collect_last_term(X)
        _logger.debug("Doing last term transform")
        Xl = self.last_vec.transform(X_last)
        _logger.debug("stacking features")
        ret = sparse.hstack([Xc, Xl])
        
        tokens = self.count_vec.build_tokenizer()
        l = list()
        for sent in X:
            terms = tokens(sent)
            l.append(1 if  ("__LOCATION__" in terms and "__ORGNIZATION__" in terms) else 0)

        l = np.array(l)
        l.shape = len(l), 1
        ret = sparse.hstack([ret, l])
        _logger.debug("vectorization transform done")

        return ret
Example 13
    def find_tfidf(self):
        ''' pre-calculate tfidf '''
        print('Finding tfidf...')
        stop_words = set(stopwords.words('english'))
        vectorizer = TfidfVectorizer(lowercase=False,
                                     ngram_range=self.ngrams,
                                     norm='l2',
                                     smooth_idf=True,
                                     stop_words=stop_words,
                                     min_df=2,
                                     max_df=0.8)
        data = self.data[self.description].apply(self.remove_html)
        self.tfidf = vectorizer.fit_transform(data)
        self.tfidf_indices = vectorizer.get_feature_names()
        self.tokenizer = vectorizer.build_tokenizer()
Example 14
File: encode.py Project: mbilab/rct
def tfidf_sequential_model(data, only_overall=True, **kwargs):
    X = field_array(data, 'Text')
    X.append(' '.join(X))
    tfidfer = TfidfVectorizer(**kwargs)
    tfidfer.fit(X)
    values = tfidfer.transform(X)
    if only_overall:
        values = values[-1]
    terms = tfidfer.get_feature_names()
    n = len(terms)
    s = sorted(range(n), key=lambda k: values[0,k], reverse=True)
    for i in range(n):
        print('%s\t%s\t%s' % (terms[s[i]], values[0,s[i]], (i+1) / n))
    return {
            'terms': terms,
            'tokenizer': tfidfer.build_tokenizer(),
            'values': values,
            }
def tokenize_query(query):
    helper = TfidfVectorizer(min_df=3, stop_words='english',  dtype=np.int16)
    tfidf_preprocessor = helper.build_preprocessor()
    tfidf_tokenizer = helper.build_tokenizer()
    with open(os.path.join(os.path.dirname(__file__), 'reuters/vocab_to_ix.json')) as f:
        #vocab_to_ix = json.load(open('vocab_to_ix.json'))
        vocab_to_ix= json.load(f)
        prepro_q = tfidf_preprocessor(query)
        q_tokens = tfidf_tokenizer(prepro_q)
        gc.collect()
        query_dict = defaultdict(int)
        for tok in q_tokens:
            tfidf_vocab_ix = vocab_to_ix.get(tok, -1)
            if tfidf_vocab_ix != -1:
                query_dict[vocab_to_ix[tok]] += 1
        gc.collect()
        return query_dict
Example 16
    def transform_cnn_data(self, X_raw, feat_and_param):
        feat_and_param['feats']['ngram_range'] = (1,1)
        feat_and_param['feats']['use_idf'] = False
        feat_and_param['feats']['binary'] = False

        vectorizer = TfidfVectorizer(**feat_and_param['feats'])
        vectorizer.fit(X_raw)
        tokenizer = vectorizer.build_tokenizer()
        X_raw_tokenized = [tokenizer(ex) for ex in X_raw]
        train_X = []
        for example in X_raw_tokenized:
            for i in range(len(example)):
                example[i] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", example[i])
            train_X.append([vectorizer.transform(example)])
        index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()}
        for key in index_to_word:
            index_to_word[key] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", index_to_word[key])
        return train_X, index_to_word
Example 17
    def transform_cnn_data(self, X_raw, feat_and_param):
        #DEBUGGING
        feat_and_param['feats']['ngram_range'] = (1,1)
        feat_and_param['feats']['use_idf'] = False
        feat_and_param['feats']['binary'] = False

        vectorizer = TfidfVectorizer(**feat_and_param['feats'])
        vectorizer.fit(X_raw)
        tokenizer = vectorizer.build_tokenizer()
        X_raw_tokenized = [tokenizer(ex) for ex in X_raw]
        train_X = []
        for example in X_raw_tokenized:
            for i in range(len(example)):
                example[i] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", example[i])
            train_X.append([vectorizer.transform(example)])
        index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()}
        #for key in index_to_word:
        #    index_to_word[key] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", index_to_word[key])
        return train_X, index_to_word
class CleanAndVectorize(object):
    def __init__(self, **kwargs):
        max_df = kwargs.get('max_df', .9)
        max_features = kwargs.get('max_features', 1000)
        self.vectorizer = TfidfVectorizer(strip_accents='unicode',
                                          lowercase=True,
                                          analyzer='word',
                                          max_df=max_df,
                                          max_features=max_features)
        self.tokenizer = self.vectorizer.build_tokenizer()
        self.cols_to_extract = [
            'aft_id', 'aft_page', 'aft_page_revision', 'aft_user',
            'aft_user_text', 'aft_comment', 'aft_noaction',
            'aft_inappropriate', 'aft_helpful', 'aft_unhelpful', 'aft_rating'
        ]

    def process(self,
                observations,
                save_tokens=False,
                remove_zero=True,
                debug=False,
                add_rating=False):
        if debug:
            observations = observations.sample(debug)
        observations = observations[self.cols_to_extract]
        observations['aft_comment'] = observations['aft_comment'].astype(str)
        observations['aft_net_sign_helpful'] = np.sign(
            observations['aft_helpful'] -
            observations['aft_unhelpful']).astype(int)
        if remove_zero:
            observations = observations.loc[
                observations['aft_net_sign_helpful'] != 0]
        if save_tokens:
            observations['tokenized_text'] = observations['aft_comment'].apply(
                self.tokenizer)
        #observations['feature_vector'] = self.vectorizer.fit_transform(observations['aft_comment'].values).toarray().tolist()
        feature_vectors = self.vectorizer.fit_transform(
            observations['aft_comment'].values)
        if add_rating:
            feature_vectors = hstack(
                (feature_vectors, observations['aft_rating'].values[:, None]))
        return observations, feature_vectors
Example 19
def process_joke(joke):
    data = {}

    # Lowercase text.
    joke.text = joke.text.lower()

    # Replace text with dict.
    stop_words = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer()
    tokenizer = vectorizer.build_tokenizer()

    def tokenize_text(text, prefix=''):
        d = {}
        for term in tokenizer(text):
            if term in stop_words:
                continue
            d[prefix + term] = d.get(prefix + term, 0) + 1
        return d

    data.update(tokenize_text(joke.text, 't_'))
    data.update({('cat_' + cat): 1 for cat in joke.categories})
    data.update({('subcat_' + cat): 1 for cat in joke.subcategories})

    return data
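
# Hypothetical call of process_joke(); requires the NLTK 'stopwords' corpus to be
# downloaded, and the joke object is invented for illustration:
from types import SimpleNamespace

joke = SimpleNamespace(text="Why did the chicken cross the road?",
                       categories=["animals"], subcategories=["classic"])
print(process_joke(joke))
# {'t_chicken': 1, 't_cross': 1, 't_road': 1, 'cat_animals': 1, 'subcat_classic': 1}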
Example 20
def main():


    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b')  #, tokenizer=StemTokenizer())

    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10  # max(10, args.fixk)

    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))


    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so it is recognized as the end of a sentence
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()

    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)

    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target

    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")

    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector)

    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)


    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector)

    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)

    sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### TESTING THE CLASSIFERS

    test_target, test_data = split_data_sentences(data.test,sent_detector)
    test_data_bow = vct.transform(test_data)

    #pred_sent = sent_clf.predict(test_data_bow)
    pred_ora = exp_clf.predict(test_data_bow)
    y_probas = sent_clf.predict_proba(test_data_bow)
    pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)]
    ## just based on one class probability
    # order = np.argsort(y_probas[:,0])
    order = np.argsort(y_probas.max(axis=1))
    print "ORACLE\tSENTENCE\tMAX-SENT"
    # for i in order[:500]:
    #     print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
    for i in order[-500:]:
        print pred_ora[i],pred_sent[i], y_probas[i,0], test_data[i]
    print "Accuracy of Sentences Classifier", metrics.accuracy_score(test_target, pred_sent)
    print "Class distribution: %s" % pred_sent.sum()
    print "Size of data: %s" % pred_sent.shape[0]
    sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000]
    clf = linear_model.LogisticRegression(penalty='l1', C=1)
    bootstrap = rand.permutation(len(test_data))
    x = []
    y = []
    for s in sizes:
        indices = bootstrap[:s]

        train_x = expert_data.sentence.train.bow[indices[:s]]
        train_y = expert_data.sentence.train.target[indices[:s]]

        clf.fit(train_x, train_y)

        predictions = clf.predict(test_data_bow)
        scores = metrics.accuracy_score(test_target,predictions)
        ## print(clf.__class__.__name__)
        print("Accuracy {0}: {1}".format(s, scores))
        y.append(scores)
    plt.clf()
    plt.title("Accuracy")
    plt.xlabel("Labels")
    plt.ylabel("Accuracy")
    plt.plot(sizes, y, '--bo', label="sent")
    plt.legend()
    plt.show()
Example 21
class FeatureExtractor:

	vectorizer = None
	feature_names = None
	feature_matrix = None

	def train_extractor_from_lines(self, train_lines, labels, test_lines):
		self.vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_features=DISTINCT_WORDS_CNT)
		self.vectorizer.fit(train_lines + test_lines)

		pass

	def load_vectorizer(self):
		input_file = open('../models/tfidf_vectorizer.pkl', 'rb')
		self.vectorizer = pickle.load(input_file)
		input_file.close()
		pass

	def save_vectorizer(self):
		output_file = open('../models/tfidf_vectorizer.pkl', 'wb')
		pickle.dump(self.vectorizer, output_file)
		output_file.close()
		pass

	def train_extractor(self, full = False):

		if not full:
			train_lines = file2lines('../data/train_lite.csv')
			labels = file2labels('../data/train_lite.csv')
			test_lines = file2lines('../data/test_lite.csv')
		else:
			train_lines = file2lines('../data/train.csv')
			labels = file2labels('../data/train.csv')
			test_lines = file2lines('../data/test.csv')

		self.train_extractor_from_lines(train_lines, labels, test_lines)

		pass

	def lines2words(self, lines):
		self.tokenizer = self.vectorizer.build_tokenizer()

		return [self.tokenizer(line) for line in lines]

	def lines2features(self, lines, use_tense = False):
		"""
		returns DataFrame(feature_matrix, feature_name)

		['word_rainny', 'word_sunny'],
		array([
			[1, 0.4, 0.2],
			[0.2, 1, 0.2],
		])
		"""
		self.feature_names = []
		self.feature_matrix = None

		# tf-idf features
		data = self.vectorizer.transform(lines).toarray()

		self.feature_names = self.vectorizer.get_feature_names()
		self.feature_matrix = data

		# additional features
		add_features = []
		important_words = ['cloud', 'cold', 'dry', 'hot', 'humid', 'hurricane', 'ice', 'rain', 'snow', 'storm', 'sunny', 'tornado', 'wind']
		self.feature_names = self.feature_names + ['impt_words:' + word for word in important_words]
		if use_tense:
			self.feature_names = self.feature_names + ['past_tense_num', 'present_tense_num']

		all_words = self.lines2words(lines)
		for words in all_words:
			# important words
			important_words_ftr = [int(word in words) for word in important_words]
			add_features.append(important_words_ftr)

			# tense
			if use_tense:
				tagz = [tag for _, tag in nltk.pos_tag(words)]
				past_num = len([v for v in tagz if v == 'VBD'])
				present_num = len([v for v in tagz if v in ['VBP', 'VB']])

				add_features[-1].extend([past_num, present_num])  # add tense counts to this line's feature row
    	
		self.feature_matrix = np.hstack((self.feature_matrix, add_features))

		return DataFrame(self.feature_matrix, columns = self.feature_names)
Example 22
        count_pos_test = count_neg_test + 1
        
label_test = test_data[:,1]
#vctr =  CountVectorizer(stop_words='english',min_df = 1)
#vctr2 = HashingVectorizer(stop_words='english') 
vctr = TfidfVectorizer(stop_words='english')  # initialising vectorizers; TF-IDF gives about 1 percent better accuracy than the other vectorizers
count_pos = 0
count_neg = 0

######################################################################################################
train = []
test = []
for i in range(len(train_data)):           #processing of the train data
    string = train_data[i,0]            
    string = vctr.build_preprocessor()(string.lower()) 
    string = vctr.build_tokenizer()(string.lower())
    train.append(' '.join(string))

for i in range(len(test_data)):            #processing of the test data  
    string = test_data[i,0]
    string = vctr.build_preprocessor()(string.lower()) 
    string = vctr.build_tokenizer()(string.lower())
    test.append(' '.join(string)) 

######################################################################################################
train_data1 = vctr.fit_transform(train).toarray() #fitting the dictionary for bag of words model using TF-IDF vectorizers
#X_test = vctr.transform(test).toarray()
y_train = np.asarray(label_train, dtype="|S6")
y_train = y_train.astype(int)
clf1 =   GradientBoostingClassifier(n_estimators = 500) #initialising classifiers
clf2 =   AdaBoostClassifier(n_estimators = 500)

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        doc = str(doc)
        s = "".join(doc.split("__EOS__"))
        doc = s.translate(str.maketrans('', '', string.punctuation))
        tokens = nltk.word_tokenize(doc)
        bi = list(p1+" "+p2 for p1,p2 in nltk.bigrams(tokens))
        tokens.extend(bi)
        return [self.wnl.lemmatize(t) for t in tokens]            


if _use_TFIDF_ :
    #vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,2), min_df=1, stop_words=stoplist, max_features=no_of_features, tokenizer=LemmaTokenizer())
    vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,2), min_df=1, stop_words=stoplist, max_features=no_of_features)
    func_tokenizer = vectorizer.build_tokenizer()

'''
Earlier I was using two separate functions for tokenization and data preprocessing.
Later I implemented the LemmaTokenizer class for this.
'''

def ispunct(some_string):
    return not any(char.isalnum() for char in some_string)
    

def get_tokens(s):
#   Tokenize into words in sentences. Returns list of strs
    retval = []
    sents = sent_tokenize(s)
    for sent in sents:
Example 24
# -*- coding: utf-8 -*-
from gensim import corpora, models, matutils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer

dataset = fetch_20newsgroups(categories=['alt.atheism', 'talk.religion.misc', 'sci.space'])  # take only 3 categories
vect = TfidfVectorizer()
tok = vect.build_tokenizer()  # tokenizes everything well
texts=[]
lem=WordNetLemmatizer()
lemms=[]
#for text in dataset.data:
#    for token in tok(text):
#        lemms.append(lem.lemmatize(token))
#    texts.append(lemms)
#models = models.Word2Vec(texts,size=100, window=5,min_count=5,workers=4)
#models.save('texts.dat')

model = models.Word2Vec.load('texts.dat')
#print(model['theory'])
#print(model.similarity('man','car'))
#print(model.most_similar(positive=['man'],negative=['computer']))
print(model.doesnt_match("car wheel glass engine".split()))
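
# The commented-out preprocessing above appends the same ever-growing lemms list once per
# document; a corrected sketch of that lemmatize-and-train step, keeping the snippet's own
# variable names and its (older gensim) size argument:
texts = []
for text in dataset.data:
    lemms = [lem.lemmatize(token) for token in tok(text)]   # fresh token list per document
    texts.append(lemms)
model = models.Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
model.save('texts.dat')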
Example 25
class SplitVectorizer():
    def __init__(self, tfidf_model=None,
                        input_file_name=None,
                        type_analyzer='word',
                        n_gram_range=(1, 2),
                        Xy='X',
                        vectorize=False):
        if tfidf_model is None:
            assert input_file_name is not None  # Give model or input text
            self.model = TfidfVectorizer(analyzer=type_analyzer,
                                                ngram_range=n_gram_range)
        elif input_file_name is None:
            assert tfidf_model is not None  # Give model or input text
            self.model = tfidf_model

        elif None not in [input_file_name, tfidf_model]:
            self.model = tfidf_model

        self.XY = Xy
        self.input_file = input_file_name
        self.vectorize = vectorize

    def fit(self, X=None, y=None):
        with open(self.input_file) as f:
            self.model.fit(f)

        self.analyzer = self.model.build_analyzer()
        self.prep = self.model.build_preprocessor()
        self.tokenizer = self.model.build_tokenizer()
        self.vocab = {self.model.vocabulary_[w]: w
				for w in self.model.vocabulary_}

        return self

    def get_matrices(self):
        self.docs_X = []
        self.docs_Y = []
        for a in open(self.input_file):
            x = self.tokenizer(self.prep(a))
            dl = len(x)
            self.docs_X.append(" ".join(x[:int(dl/2)]))
            self.docs_Y.append(" ".join(x[int(dl/2):]))
        return self.model.transform(self.docs_X), \
               self.model.transform(self.docs_Y)

    def Tx(self, x):
        if self.vectorize:
            return self.model.transform([x])
        else:
            return self.analyzer(x)

    def __iter__(self):
        for a in open(self.input_file):
            x = self.tokenizer(self.prep(a))
            dl = len(x)

            if self.XY == 'X':
                yield self.Tx(" ".join(x[:int(dl/2)]))
            elif self.XY == 'Y':
                yield self.Tx(" ".join(x[int(dl/2):]))
            elif self.XY == 'join':
                yield self.Tx(" ".join(x[:int(dl/2)])), \
			self.Tx(" ".join(x[int(dl/2):]))
class CleanAndVectorize(object):
    def __init__(self, en_kvs_path, **kwargs):
        max_df = kwargs.get('max_df', .9)
        max_features = kwargs.get('max_features', 1000)
        self.tfidf_vectorizer = TfidfVectorizer(strip_accents='unicode',
                                                lowercase=True,
                                                analyzer='word',
                                                max_df=max_df,
                                                max_features=max_features)
        self.w2v_vectorizer = KeyedVectors.load(en_kvs_path, mmap='r')
        self.tokenizer = self.tfidf_vectorizer.build_tokenizer()
        self.cols_to_extract = [
            'aft_id', 'aft_page', 'aft_page_revision', 'aft_user',
            'aft_user_text', 'aft_comment', 'aft_noaction',
            'aft_inappropriate', 'aft_helpful', 'aft_unhelpful', 'aft_rating'
        ]

    def get_token_vector(self, token):
        if token in self.w2v_vectorizer:
            return self.w2v_vectorizer[token]
        else:
            return np.zeros(self.w2v_vectorizer.vector_size)

    def get_sentence_vector(self, token_list):
        vector_list = np.array([self.get_token_vector(x) for x in token_list])
        sentence_vector = np.mean(vector_list, axis=0)
        return sentence_vector

    def get_feature_vector(self, observation, add_rating=False):
        feature_vector = self.get_sentence_vector(
            observation['tokenized_text'])
        if add_rating:
            feature_vector = np.append(feature_vector,
                                       observation['aft_rating'])
        feature_vector = feature_vector.tolist()
        return feature_vector

    def process(self,
                observations,
                save_tokens=False,
                remove_zero=True,
                debug=False,
                add_rating=False):
        if debug:
            observations = observations.sample(debug)
        observations = observations[self.cols_to_extract]
        observations['aft_comment'] = observations['aft_comment'].astype(str)
        observations['aft_net_sign_helpful'] = np.sign(
            observations['aft_helpful'] -
            observations['aft_unhelpful']).astype(int)
        if remove_zero:
            observations = observations.loc[
                observations['aft_net_sign_helpful'] != 0]
        observations['tokenized_text'] = observations['aft_comment'].apply(
            self.tokenizer)
        observations = observations.loc[
            observations['tokenized_text'].apply(len) > 0]
        observations['feature_vector'] = observations[[
            'tokenized_text', 'aft_rating'
        ]].apply(self.get_feature_vector, axis=1, add_rating=add_rating)
        if not save_tokens:
            observations.drop(labels='tokenized_text', axis=1, inplace=True)
        return observations
Example 27
class NLPPipeline():
    def __init__(self, text, Y, train_size=.85):
        self.model_builders = {'dtc': dtc, 'rfc': rfc}
        steps = ['tfidf', 'feature_engineering', 'lda', 'model']
        self.pipeline_dic = {step: None for step in steps}
        self.text_train, self.text_test, self.Y_train, self.Y_test = split(
            text, Y, train_size=train_size, stratify=Y)
        self.keep_tfidf = lambda tfidf_dic: tfidf_dic == self.pipeline_dic['tfidf']
        self.keep_features = lambda features_dic: features_dic == self.pipeline_dic['features']
        self.prob_info = lambda prob: -prob * np.log(prob)
        self.pipeline_dic = {step: "Default" for step in steps}
        self.train_size = train_size

    def update_tfidf(self, tfidf_dic):
        self.pipeline_dic['tfidf'] = tfidf_dic
        self.tfidf = TfidfVectorizer(**tfidf_dic)
        self.tfidf_train = self.tfidf.fit_transform(self.text_train)
        self.tfidf_train = self.tfidf_train.toarray()
        self.tokenizer = self.tfidf.build_tokenizer()
        self.tfidf_test = self.tfidf.transform(self.text_test)
        self.tfidf_test = self.tfidf_test.toarray()
        self.feature_names = self.tfidf.get_feature_names()

    def update_lda(self, lda_dic):
        def calc_topics_words(num_top_words):
            topics_words = []

            for ix, topic in enumerate(self.lda.components_):
                top_word_inds = topic.argsort()[:-num_top_words - 1:-1]
                topic_words = set(
                    [self.feature_names[i] for i in top_word_inds])
                topics_words.append(topic_words)

            return topics_words

        num_top_words = lda_dic.get('num_top_words', 10)
        lda_model_dic = {
            k: v
            for k, v in lda_dic.items() if k != 'num_top_words'
        }
        self.lda = LDA(**lda_model_dic)
        self.lda.fit_transform(self.tfidf_train)
        self.topics_words = calc_topics_words(num_top_words)

    def calc_entropy(self, text):
        word_counts = defaultdict(int)
        text_size = float(len(text))

        for word in text:
            word_counts[word] += 1

        word_counts = np.array(list(word_counts.values()))
        word_probs = word_counts / text_size
        entropy = -1 * sum(map(self.prob_info, word_probs))

        return entropy

    def calc_lda_features(self, tokenized_text):
        num_topics = len(self.topics_words)
        unique_words = set(tokenized_text)
        num_unique_words = float(len(unique_words))
        lda_features = [
            len(unique_words.intersection(topic_words)) / num_unique_words
            for topic_words in self.topics_words
        ]

        return lda_features

    def calc_sentiment_features(self, text):
        min_polarity, max_polarity = -.1, .1
        blob = TextBlob(text)
        polarities = [
            sentence.sentiment.polarity for sentence in blob.sentences
        ]
        polarities = [round(polarity, 2) for polarity in polarities]
        polarity_entropy = self.calc_entropy(polarities)
        polarity_var = np.var(polarities)
        num_pos_sents = len(
            [polarity for polarity in polarities if polarity > max_polarity])
        num_neg_sents = len(
            [polarity for polarity in polarities if polarity < min_polarity])
        num_sents = float(len(polarities))

        pos_sent_freq, neg_sent_freq = num_pos_sents / num_sents, num_neg_sents / num_sents
        num_neutral_sents = num_sents - num_pos_sents - num_neg_sents
        max_pol, min_pol = np.max(polarities) if polarities else 0, min(
            polarities) if polarities else 0
        subjectivities = [
            sentence.sentiment.subjectivity for sentence in blob.sentences
        ]
        subjectivities = [round(x, 2) for x in subjectivities]
        subj_var = np.var(subjectivities)
        max_subj, min_subj = np.max(subjectivities) if polarities else 0, min(
            subjectivities) if polarities else 0
        sentiment_features = [
            polarity_entropy, polarity_var, num_pos_sents, num_neg_sents,
            num_neutral_sents, pos_sent_freq, neg_sent_freq, num_sents,
            max_pol, min_pol, subj_var, max_subj, min_subj
        ]

        return sentiment_features

    def update_features(self, features_dic):
        def calc_features(text):
            words = self.tokenizer(text)
            entropy = self.calc_entropy(words)
            lda_features = self.calc_lda_features(words)
            sentiment_features = self.calc_sentiment_features(text)
            features = [entropy, *lda_features, *sentiment_features]

            return features

        self.pipeline_dic['features'] = features_dic
        self.update_lda(features_dic)
        self.X_train = np.hstack(
            (self.tfidf_train,
             np.array(
                 [np.array(calc_features(text)) for text in self.text_train])))
        self.X_test = np.hstack(
            (self.tfidf_test,
             np.array(
                 [np.array(calc_features(text)) for text in self.text_test])))

    def grid_search(self, step_grids):
        def get_step_dics(grid):
            param_names = list(grid.keys())
            param_val_combos = list(product(*list(grid.values())))
            num_params = len(param_names)
            step_dics = [{
                param_names[j]: param_val_combo[j]
                for j in range(num_params)
            } for param_val_combo in param_val_combos]

            return step_dics

        steps = list(step_grids.keys())
        num_steps = len(steps)
        grids = list(step_grids.values())
        step_dics = list(map(get_step_dics, grids))
        pipeline_combos = list(product(*step_dics))
        pipeline_dics = [{
            steps[i]: pipeline_combo[i]
            for i in range(num_steps)
        } for pipeline_combo in pipeline_combos]
        pipeline_scores = [[pipeline_dic,
                            self.score(pipeline_dic)]
                           for pipeline_dic in pipeline_dics]
        pipeline_scores.sort(key=lambda x: x[1], reverse=True)

        return pipeline_scores

    def score(self, pipeline_dic):
        tfidf_vectorizer = TfidfVectorizer(**pipeline_dic['tfidf'])
        keep_tfidf = self.keep_tfidf(pipeline_dic['tfidf'])

        if not keep_tfidf:
            self.update_tfidf(pipeline_dic['tfidf'])

        keep_features = keep_tfidf and self.keep_features(
            pipeline_dic['features'])

        if not keep_features:
            self.update_features(pipeline_dic['features'])

        self.model_builder = self.model_builders[pipeline_dic['model']['type']]
        model_dic = {
            key: value
            for key, value in pipeline_dic['model'].items() if key != 'type'
        }
        self.model = self.model_builder(**model_dic)
        self.model.fit(self.X_train, self.Y_train)
        Y_pred = self.model.predict(self.X_test)
        score = accuracy(Y_pred, self.Y_test)
        print(f"Params = {pipeline_dic}, score = {score}. \n")

        return score
Example 28
    def visit(self, featureset):
        try:
            # TODO: outsource into method "set_tokenizer" (tokenizer as member - no extraction_target required then)
            tokenizer = None
            if self._extraction_target == "word":
                tokenizer = LemmaTokenizer(LanguageProcessor())
            elif self._extraction_target == "pos":
                tokenizer = POSTokenizer(LanguageProcessor())
            elif self._extraction_target == "ne_simple":
                tokenizer = NamedEntityTokenizer(LanguageProcessor())
            elif self._extraction_target == "ne_detailed":
                tokenizer = NamedEntityTokenizer(LanguageProcessor(),
                                                 detailed=True)
            elif self._extraction_target.startswith("wordlist"):
                path = self._extraction_target.split("_")[1]
                tokenizer = WordlistEntryTokenizer(LanguageProcessor(),
                                                   wordlist=path)

            # TODO: outsource into method "set_vectorizer" (vectorizer as member - no measure required then)
            print(self._ngram)
            print(self._column)
            vectorizer = None
            binary = self._measure == "presence" or self._extraction_type == "presence"
            if self._ngram is None:
                if self._measure == "tfidf":
                    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
                else:
                    vectorizer = CountVectorizer(tokenizer=tokenizer,
                                                 binary=binary)
            else:
                if self._measure == "tfidf":
                    vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                                 ngram_range=self._ngram)
                else:
                    vectorizer = CountVectorizer(tokenizer=tokenizer,
                                                 ngram_range=self._ngram,
                                                 binary=binary)
            temp_column = featureset.get_featureset()[self._column]
            temp_column = temp_column.values

            new_column = []
            "Note: Presence and Count for every(einzeln) feature or for all(alle) feature"
            if self._extraction_type == "bow" or self._extraction_type == "ngram":
                # Return Matrix
                new_column = list(
                    vectorizer.fit_transform(temp_column).toarray())
            elif self._extraction_type == "list":
                # Return String Array
                analyzer = vectorizer.build_tokenizer()
                for row in temp_column:
                    print(row)
                    print(analyzer(row))
                    new_column.append(analyzer(row))
            elif self._extraction_type == "presence":
                # Return Numeric Array
                analyzer = vectorizer.build_tokenizer()
                for row in temp_column:
                    new_column.append(1 if len(analyzer(row)) > 0 else 0)
                    # new_column.append(len(analyzer(row)) > 0)
            elif self._extraction_type == "count":
                # Return Numeric Array
                analyzer = vectorizer.build_tokenizer()
                for row in temp_column:
                    new_column.append(len(analyzer(row)))
            return new_column
        except Exception as error:
            util.print_error("Failed to use Language Processor " + str(error))
            util.print_detailed_error()
Example 29
class file_index(object):
    "Use n_jobs = 1 for now."

    def __init__(self,
                 input_file,
                 index_file=None,
                 mmap=True,
                 wsize=10,
                 vectorizer=None,
                 encoding='latin1',
                 sampsize=50,
                 n_jobs=1,
                 chunk_size=1000,
                 verbose=True):

        self.mmap = mmap
        self.memory = ":memory:"
        if not (vectorizer is None):
            self.vectorizer = vectorizer
            self.tokenizer = vectorizer.build_tokenizer()
        self.encoder = encoding
        self.index_file = self.memory if (
            not index_file or index_file == ":memory:") else index_file
        self.chunk_size = chunk_size
        self.input_file = input_file
        self.wsize = wsize
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.sampsize = sampsize
        if not os.path.exists(self.index_file):
            self.connect()
            self.cursor.execute("create table words (word text, coo text)")
        else:
            self.connect()
            self.load_input()

    def __enter__(self):
        self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.disconnect()
        return self

    def windows(self, word):
        if self.n_jobs != 1:
            self.connect()

        try:
            self.index_lines
        except AttributeError:
            self.load_input()

        if self.sampsize > 0:
            query = "select * from words where word=? order by random() limit ?"
            t = (word, self.sampsize)
            self.cursor.execute(query, t)
        else:
            query = "select * from words where word=?"
            self.cursor.execute(query, (word, ))

        coordinates = self.str2tup([t for w, t in self.cursor.fetchall()])

        windows = []
        for r, w in coordinates:
            try:
                ln = self.index_lines[r].split()  #decode("utf-8").split()
            except UnicodeDecodeError:
                continue
            except AttributeError:
                print(
                    "\nCall 'load_input()' method before querying windows.\n")
                raise

            start = min(len(ln[0:w]), self.wsize)
            windows.append(ln[w - start:w] + ln[w + 1:w + (self.wsize + 1)])

        if self.verbose > 10:
            logging.info("Got windows for '%s'\n" % word)
        return windows

    def fit(self):
        with open(self.input_file,
                  mode='rb') as f:  # encoding='latin-1', mode='rb') as f:
            if self.index_file != self.memory and self.chunk_size > 0:
                c = 0
                ck = 0
                for n, row in enumerate(enumerate(f)):
                    #st()
                    self.index_row(n, row[1])
                    if c == self.chunk_size:
                        c = 0
                        self.conn.commit()
                        if self.verbose > 5:
                            logging.info(
                                "Saved index chunk %d into index file %s \n" %
                                (ck, self.index_file))
                        ck += 1
                    c += 1

            else:
                if self.verbose:
                    logging.info("Creating index in-memory database... \n")

                for n, row in enumerate(get_binary(self.input_file)):
                    self.index_row(n, row)

            try:
                self.cursor.execute("create index idxword on words(word)")
                self.conn.commit()
                # Getting properties
                self.cursor.execute("SELECT * FROM words")
                self.vocab = list(set([r[0] for r in self.cursor.fetchall()]))
                self.vocab_size = len(self.vocab)

                if self.verbose:
                    logging.info("Saved index into index file datbase %s\n" %
                                 self.index_file)
                return self
            except:
                print("Database couldn't be created... EXIT error.")
                raise

    def load_input(self):
        """ Call this method when a prefitted index db file already exists"""
        with open(self.input_file,
                  mode='rb') as fc:  # encoding=self.encoder, mode='rb') as fc:
            self.index_lines = fc.readlines()

        self.cursor.execute("SELECT * FROM words")
        self.vocab = list(set([r[0] for r in self.cursor.fetchall()]))
        self.vocab_size = len(self.vocab)
        logging.info("Loaded index database properties and connections..")
        # Return pointer to the index
        return self

    def connect(self):
        self.conn = sqlite3.connect(self.index_file, check_same_thread=False)
        self.cursor = self.conn.cursor()
        return self

    def disconnect(self):
        self.conn.commit()
        self.conn.close()
        return self

    def tup2str(self, t):
        if isinstance(t, list):
            return [str(a) + ',' + str(b) for a, b in t]
        else:
            return str(t[0]) + ',' + str(t[1])

    def str2tup(self, t):
        if isinstance(t, list):
            r = []
            for x in t:
                r.append(self.str2tup(x))
            return r
        else:
            a, b = t.split(',')
            return (int(a), int(b))

    def index_row(self, line_id, row, conn=None):
        if self.n_jobs != 1 and self.n_jobs != 0:
            cursor = conn.cursor()
        else:
            cursor = self.cursor

        for of, word in enumerate(self.tokenize(row)):
            if word is None: continue
            t = (word, self.tup2str((line_id, of)))
            insert = "INSERT INTO words VALUES (?, ?)"
            try:
                cursor.execute(insert, t)
            except sqlite3.OperationalError:
                print("Problems to create word table '%s'.\n" % word)
                self.disconnect()
                raise

    def tokenize(self, string):
        if getattr(self, "tokenizer", None):
            if self.vectorizer.lowercase:
                try:
                    string = string.decode(errors="replace").lower()
                except Exception as e:
                    logging.info(
                        "Problems occurred while indexing row: {}\nEXCEPTION: {}"
                        .format(string, e))
                    return None
            return [w.encode() for w in self.tokenizer(string)]
        else:
            self.vectorizer = TfidfVectorizer()
            self.tokenizer = self.vectorizer.build_tokenizer()
            return self.tokenize(string)
Example 30
def main():
    seed = 9001

    combined_data = read_all_data()

    # Create train/test split of data
    x_train, x_test, y_train, y_test = train_test_split(
        combined_data["headline"],
        combined_data["is_clickbait"],
        random_state=seed)

    if len(sys.argv) > 1:
        print()
        print("Loading pickle...")
        print()

        pipe = utils.unpickle_gzip("models/pipeline.pickle.gz")
    else:
        print()
        print("Training...")
        print()

        # Instantiate TfidfVectorizer to translate text data to feature vectors
        # such that they can be used as inputs for an estimator
        tf_v = TfidfVectorizer(strip_accents='unicode')

        # With the vectorizer set up, load an estimator
        clf = LogisticRegressionCV(
            cv=5,
            solver='saga',
            random_state=seed,
        )

        pipe = make_pipeline(tf_v, clf)

        pipe.fit(x_train, y_train)

    print()
    print("Predicting...")
    print()

    predictions = pipe.predict(x_test)
    utils.print_evaluation(y_test, predictions)

    if len(sys.argv) <= 1:
        print()
        print("Pickling...")
        print()

        utils.pickle_gzip(pipe, "models/pipeline.pickle.gz")

    # CANNOT RUN DUE TO MEMORY
    # rfc = RandomForestClassifier(
    #     n_jobs=-1,
    #     n_estimators=1000,
    #     random_state=seed,
    #     verbose=3)
    # predictions = rfc.predict(x_test)
    # utils.print_evaluation(y_test, predictions)

    print("\n\nPlotting frequency of word use . . .")
    plot_split_word_freqs(combined_data, tf_v.build_preprocessor(),
                          tf_v.build_tokenizer())
Example 31
    def visit(self, featureset):
        try:
            # TODO: outsource into method "set_tokenizer" (tokenizer as member - no extraction_target required then)
            tokenizer = None
            if self._extraction_target == "word":
                tokenizer = LemmaTokenizer(LanguageProcessor())
            elif self._extraction_target == "pos":
                tokenizer = POSTokenizer(LanguageProcessor())
            elif self._extraction_target == "ne_simple":
                tokenizer = NamedEntityTokenizer(LanguageProcessor())
            elif self._extraction_target == "ne_detailed":
                tokenizer = NamedEntityTokenizer(LanguageProcessor(), detailed=True)
            elif self._extraction_target.startswith("wordlist"):
                path = self._extraction_target.split("_")[1]
                tokenizer = WordlistEntryTokenizer(LanguageProcessor(), wordlist=path)

            # TODO: outsource into method "set_vectorizer" (vectorizer as member - no measure required then)
            print(self._ngram)
            print(self._column)
            vectorizer = None
            binary = self._measure == "presence" or self._extraction_type == "presence"
            if self._ngram is None:
                if self._measure == "tfidf":
                    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
                else:
                    # TODO: this is absolute term frequency - what about relative frequency?
                    #   For n-grams this is not easy:
                    #   - we would need to count the number of n-grams in each document and divide each
                    #     feature generated from the document's n-gram counts by that amount
                    #   For named entities:
                    #   - count the words inside named entities (not just the number of NEs) and divide
                    #     by the number of tokens in the document
                    #   ...

                    vectorizer = CountVectorizer(tokenizer=tokenizer, binary=binary)
            else:
                if self._measure == "tfidf":
                    vectorizer = TfidfVectorizer(tokenizer=tokenizer, ngram_range=self._ngram)
                else:
                    vectorizer = CountVectorizer(tokenizer=tokenizer, ngram_range=self._ngram, binary=binary)
            temp_column = featureset.get_featureset()[self._column]
            temp_column = temp_column.values

            new_column = []
            "Note: Presence and Count for every(einzeln) feature or for all(alle) feature"
            if self._extraction_type == "bow" or self._extraction_type == "ngram":
                # Return Matrix
                new_column = list(vectorizer.fit_transform(temp_column).toarray())
            elif self._extraction_type == "list":
                # Return String Array
                analyzer = vectorizer.build_tokenizer()
                for row in temp_column:
                    print(row)
                    print(analyzer(row))
                    new_column.append(analyzer(row))
            elif self._extraction_type == "presence":
                # Return Numeric Array
                analyzer = vectorizer.build_tokenizer()
                for row in temp_column:
                    new_column.append(1 if len(analyzer(row)) > 0 else 0)
                    # new_column.append(len(analyzer(row)) > 0)
            elif self._extraction_type == "count":
                # Return Numeric Array
                analyzer = vectorizer.build_tokenizer()
                for row in temp_column:
                    new_column.append(len(analyzer(row)))
            return new_column
        except Exception as error:
            util.print_error("Failed to use Language Processor " + str(error))
            util.print_detailed_error()
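
# Hedged sketch of the relative term frequency raised in the TODO above (illustration only,
# not part of the original class): divide each document's raw counts by that document's total.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

sample_docs = ["the cat sat on the mat", "the dog barked"]
cv = CountVectorizer()
counts = cv.fit_transform(sample_docs).toarray().astype(float)
row_totals = counts.sum(axis=1, keepdims=True)
relative_tf = counts / row_totals  # safe here because both sample documents are non-empty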
Esempio n. 32
0
    def __init__(self, vectorizer: TfidfVectorizer):
        self.vectorizer = vectorizer
        self.vocab = vectorizer.get_feature_names()
        self.tokenizer = vectorizer.build_tokenizer()
Esempio n. 33
0
if __name__ == '__main__':
    #load the vocab file
    train_list = get_values('train.txt')
    test_list = get_values('test.txt')
    valid_list = get_values('valid.txt')
    targetlist = valid_list

    l = 100000
    targetlist = targetlist[:l]

    all_docs_str, all_docs_list = makedocs()
    file = open("evidence_dev.txt", "w", encoding='utf-8')
    cnt = 0
    wholestring = ""
    tiv = TfidfVectorizer(stop_words="english").fit(all_docs_str)
    tokenizer = tiv.build_tokenizer()
    all_docs_numpy = tiv.transform(all_docs_str)
    all_docs_text_numpy = np.array(all_docs_list)

    #all doc key
    doc_keys = pickle.load(file=open("dockey.pkl", 'rb'))

    # approach for finding the relevant docs
    importantword = False
    for crset in tqdm(targetlist):

        #find doc
        #temp = TfidfVectorizer(stop_words="english").fit([crset])

        #print(set(co_command.keys()).intersection(set(temp.vocabulary_.keys())))
        #if set(co_command.keys()).intersection(set(temp.vocabulary_.keys())) is None:
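
        # The original loop body is cut off in this excerpt. As a hedged illustration only
        # (not the original code), one common way to rank documents for the current claim
        # `crset` with the fitted vectorizer is a sparse dot product plus a top-k argsort:
        claim_vec = tiv.transform([crset])
        scores = all_docs_numpy.dot(claim_vec.T).toarray().ravel()
        top_k = scores.argsort()[::-1][:5]      # indices of the 5 best-matching documents
        top_docs = all_docs_text_numpy[top_k]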
Esempio n. 34
0
class FeatureExtractor:

    vectorizer = None
    feature_names = None
    feature_matrix = None
    features = None

    def train_extractor_from_lines(self, lines):
        self.vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                          max_features=DISTINCT_WORDS_CNT)
        self.vectorizer.fit(lines)

        pass

    def train_extractor(self, full=False):
        lines = dir2lines_labels('data/train_lite.csv')
        self.train_extractor_from_lines(lines)

        pass

    def get_features_distance_matrix(self, feature_df):
        mat = np.zeros((len(feature_df), len(feature_df)))

        for index1, row1 in feature_df.iterrows():
            for index2, row2 in feature_df.iterrows():
                mat[index1, index2] = self.feature_distance(row1, row2)

        return mat

    def get_lines_distance_matrix(self, lines):

        feature_df = self.lines2features(lines)

        self.features = feature_df

        return self.get_features_distance_matrix(feature_df)

    def lines2features(self, lines, use_tense=False):
        """
        returns DataFrame(feature_matrix, feature_name)

        ['word_rainy', 'word_sunny'],
        array([
                [1, 0.4, 0.2],
                [0.2, 1, 0.2],
        ])
        """
        self.feature_names = []
        self.feature_matrix = None

        # tf-idf features
        data = self.vectorizer.transform(lines).toarray()

        self.feature_names = self.vectorizer.get_feature_names()
        self.feature_matrix = data

        # additional features
        add_features = []
        important_words = [
            'sunny', 'wind', 'humid', 'hot', 'cold', 'dry', 'ice', 'rain',
            'snow', 'tornado', 'storm', 'hurricane'
        ]
        important_words = [
            'cloud', 'cold', 'dry', 'hot', 'humid', 'hurricane', 'ice', 'rain',
            'snow', 'storm', 'sunny', 'tornado', 'wind'
        ]
        self.feature_names = self.feature_names + [
            'impt_words:' + word for word in important_words
        ]
        if use_tense:
            self.feature_names = self.feature_names + [
                'past_tense_num', 'present_tense_num'
            ]

        all_words = self.lines2words(lines)
        for words in all_words:
            # important words
            important_words_ftr = [
                int(word in words) for word in important_words
            ]
            add_features.append(important_words_ftr)

            # tense
            if use_tense:
                # `words` is already a token list, so tag it directly; list() handles Python 3's lazy zip
                tagz = list(zip(*nltk.pos_tag(words)))[1]
                past_num = len([v for v in tagz if v == 'VBD'])
                present_num = len([v for v in tagz if v in ['VBP', 'VB']])

                add_features.append([past_num, present_num])

        self.feature_matrix = np.hstack((self.feature_matrix, add_features))

        return DataFrame(self.feature_matrix, columns=self.feature_names)

    def feature_distance(self, feature_vector1, feature_vector2):
        # preliminary version
        return 1 - np.dot(feature_vector1, feature_vector2) / np.sqrt(
            (np.dot(feature_vector1, feature_vector1) *
             np.dot(feature_vector2, feature_vector2)))

    def lines2words(self, lines):
        self.tokenizer = self.vectorizer.build_tokenizer()

        return [self.tokenizer(line) for line in lines]

    def load_vectorizer(self):
        input_file = open('models/tfidf_vectorizer.pkl', 'rb')
        self.vectorizer = pickle.load(input_file)
        input_file.close()
        pass

    def save_vectorizer(self):
        output_file = open('models/tfidf_vectorizer.pkl', 'wb')
        pickle.dump(self.vectorizer, output_file)
        output_file.close()
        pass
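
# Hedged usage sketch for the FeatureExtractor above. `tokenizer` and DISTINCT_WORDS_CNT
# are module-level names the class expects; they are assumed to exist and are not defined
# in this excerpt, and the sample lines are placeholders.
extractor = FeatureExtractor()
sample_lines = ["A sunny and dry afternoon", "Heavy rain and a cold wind"]
extractor.train_extractor_from_lines(sample_lines)
feature_df = extractor.lines2features(sample_lines)          # tf-idf plus important-word columns
distance_matrix = extractor.get_features_distance_matrix(feature_df)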
Esempio n. 35
0
X_test = np.array([''.join(el) for el in nyt_data[trainset_size+1:len(nyt_data)]]) 
y_test = np.array([el for el in nyt_labels[trainset_size+1:len(nyt_labels)]]) 

#print(X_train)

vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 2), 
 stop_words='english', 
 strip_accents='unicode', 
 norm='l2')
 
test_string = unicode(nyt_data[0])

print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))
print "\n"
 
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
Esempio n. 36
0
def weighted_embeddings(esco_df, eperusteet_df, model):
    """
    Create TFIDF weighted embeddings for ESCO and ePerusteet.
    The input sentences should be separated with newlines.

    Args:
        esco_df (DataFrame) : Requires cols 'label' and 'text', where 'text' contains textual representation of ESCO.
        eperusteet_df (DataFrame) : Requires cols 'label' and 'text', where 'text' contains textual representation of ePerusteet.
        model (fasttext.model) : Model for word-embeddings.

    Return:
        X_esco (xArray) : Embeddings for ESCO texts.
        X_eperusteet (xArray) : Embeddings for ePerusteet texts.
    """
    assert isinstance(esco_df, pd.DataFrame)
    assert isinstance(eperusteet_df, pd.DataFrame)

    text_esco = esco_df["text"]
    text_eperusteet = eperusteet_df["text"]

    # Do not sort - so we can re-split later using the indices
    combined_texts = pd.concat([text_esco, text_eperusteet], sort=False)

    vectorizer = TfidfVectorizer()
    vectorizer.fit(combined_texts)
    tokenizer = vectorizer.build_tokenizer()
    feature_array = vectorizer.get_feature_names()

    identifiers = []
    embeddings = []

    for _, row in tqdm(esco_df.iterrows(),
                       total=esco_df.shape[0],
                       desc="Computing embeddings for ESCOs"):
        identifiers.append(row["label"])

        texts = row["text"].split("\n")

        # Take average over the sentences
        competence_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                            dims=["embedding"])

        for text in texts:

            sentence_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                              dims=["embedding"])

            weights = vectorizer.transform([text])

            nonzero_indexes = weights.nonzero()
            weights = np.asarray(weights[nonzero_indexes][0]).reshape((-1, ))
            weights = [w / sum(weights) for w in weights]

            weight_dict = {
                feature_array[idx]: weights[i]
                for i, idx in enumerate(nonzero_indexes[1])
            }

            for word in text.split(" "):
                try:
                    token = tokenizer(word)[0]
                except IndexError:
                    continue
                weight = weight_dict[token]
                sentence_embedding += (model[word] * weight)

            competence_embedding += sentence_embedding

        # Average over the sentences; guard against an empty list to avoid dividing by zero
        if texts:
            competence_embedding = competence_embedding / len(texts)

        embeddings.append(competence_embedding)

    embeddings = np.stack(embeddings, axis=0)

    esco_embeddings = xr.DataArray(embeddings,
                                   coords={"ESCO": identifiers},
                                   dims=["ESCO", "embedding"])

    identifiers = []
    embeddings = []

    for _, row in tqdm(eperusteet_df.iterrows(),
                       total=eperusteet_df.shape[0],
                       desc="Computing embeddings for ePerusteet"):
        identifiers.append(row["label"])

        texts = row["text"].split("\n")

        # Take average over the sentences
        degree_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                        dims=["embedding"])

        for text in texts:
            sentence_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                              dims=["embedding"])

            weights = vectorizer.transform([text])

            nonzero_indexes = weights.nonzero()
            weights = np.asarray(weights[nonzero_indexes][0]).reshape((-1, ))
            weights = [w / sum(weights) for w in weights]

            weights = {
                feature_array[idx]: weights[i]
                for i, idx in enumerate(nonzero_indexes[1])
            }

            for word in text.split(" "):
                try:
                    token = tokenizer(word)[0]
                except IndexError:
                    continue
                weight = weights[token]
                sentence_embedding += (model[word] * weight)

            degree_embedding += sentence_embedding

        # Average over the sentences; guard against an empty list to avoid dividing by zero
        if texts:
            degree_embedding = degree_embedding / len(texts)

        embeddings.append(degree_embedding)

    embeddings = np.stack(embeddings, axis=0)

    eperusteet_embeddings = xr.DataArray(embeddings,
                                         coords={"ePerusteet": identifiers},
                                         dims=["ePerusteet", "embedding"])

    return esco_embeddings, eperusteet_embeddings
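
# Hedged usage sketch for weighted_embeddings. The model path, labels and texts below are
# placeholders only; a real call needs a downloaded fastText model and the actual dataframes.
# Placeholder texts are kept lower-case because build_tokenizer() does not lowercase,
# while the fitted vocabulary is lower-cased.
import fasttext
import pandas as pd

ft_model = fasttext.load_model("cc.fi.300.bin")   # assumed local fastText binary

sample_esco = pd.DataFrame({
    "label": ["esco-0001"],
    "text": ["first sentence of the skill description\nsecond sentence"],
})
sample_eperusteet = pd.DataFrame({
    "label": ["eperusteet-0001"],
    "text": ["one sentence describing the degree module"],
})

X_esco, X_eperusteet = weighted_embeddings(sample_esco, sample_eperusteet, ft_model)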
Esempio n. 37
0
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b')#, tokenizer=StemTokenizer())
    #
    # vct = CountVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=True, ngram_range=(1, 1),
    #                       token_pattern='\\b\\w+\\b')#, tokenizer=StemTokenizer())


    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10 # max(10, args.fixk)

    # if args.fixk < 0:
    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))
    print ("Vectorizer: %s" % vct)
    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so it is recognized as the end of a sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    # labels, sent_train = split_data_sentences(data.train, sent_detector)
    #
    # data.train.data = sent_train
    # data.train.target = np.array(labels)

    # labels, sent_train = split_data_sentences(data.test, sent_detector)
    # data.test.data = sent_train
    # data.test.target = np.array(labels)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset
    # data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)


    #### EXPERT CLASSIFIER

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert


    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    # clf = set_classifier(args.classifier)

    student = structured.AALStructured(model=clf, accuracy_model=None, budget=args.budget, seed=args.seed, vcn=vct,
                                       subpool=250, cost_model=cost_model)
    student.set_score_model(exp_clf)



    print "\nStudent Classifier: %s" % clf


    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print ("Sentence Classification")
    t0 = time.time()
    tac = []
    tau = []

    # prediction = exp_clf.predict(data.train.bow)



    print ("Prepare test ... ")
    ## create sentences from documents based on first k
    ## random
    ## best sentence
    filtered_data = []
    bestk = []
    bestk_max = []
    random_k = []
    print "First k=1"
    for iDoc, y in zip(data.train.data, data.train.target):
        doc_sent = split_into_sentences([iDoc], sent_detector, vct)
        random_k.append(doc_sent[random.randint(0, len(doc_sent)-1)])
        scores = [best_score_max(iSent, y, exp_clf) for iSent in vct.transform(doc_sent)]
        best = np.argmax(scores)
        bestk_max.append(doc_sent[best])
        scores = [best_score(iSent, y, exp_clf) for iSent in vct.transform(doc_sent)]
        best = np.argmax(scores)
        bestk.append(doc_sent[best])
        filtered_data.append(doc_sent[0])

    test_firstk = vct.transform(filtered_data)
    test_random = vct.transform(random_k)
    test_best = vct.transform(bestk)
    test_best_max = vct.transform(bestk_max)
    targets = data.train.target



    print"*"*80
    accu, auc, predictions = evaluate(exp_clf, test_random, targets, vct)
    print "RND: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0])
    accu, auc, predictions = evaluate(exp_clf, test_firstk, targets, vct)
    print "FIRSTK: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0])
    accu, auc, predictions = evaluate(exp_clf, test_best, targets, vct)
    print "BEST: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0])
    accu, auc, predictions = evaluate(exp_clf, test_best_max, targets, vct)
    print "BESTMAX: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0])


    # print"*"*80
    # print "STUDENT"
    # clf.fit(test_random, targets)
    # accu, auc, predictions = evaluate(clf, data.test.bow, data.test.target, vct)
    # print "RND: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0])
    # clf.fit(test_firstk, targets)
    # accu, auc, predictions = evaluate(clf, data.test.bow, data.test.target, vct)
    # print "FIRSTK: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0])
    # clf.fit(test_best, targets)
    # accu, auc, predictions = evaluate(clf, data.test.bow, data.test.target, vct)
    # print "BEST: ACCU:{}\t AUC:{} \t Predictions:{}".format(accu, auc, predictions.shape[0])

    print("Elapsed time %.3f" % (time.time() - t0))
Esempio n. 38
0
__author__ = '315-4'
# -*- coding: utf-8 -*-
from gensim import corpora, models, matutils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

dataset=fetch_20newsgroups()  # dataset: the 20 newsgroups collection

vect = TfidfVectorizer()  # converter into a TF-IDF matrix
tok = vect.build_tokenizer()  # tokenizer

texts = []
# tokenize the texts
for text in dataset.data:
    texts.append(tok(text))

# now gensim comes into play
# Convert document (a list of words) into the bag-of-words
dictionary = corpora.Dictionary(texts)  # build the dictionary (the set of tokens)
corpus = [dictionary.doc2bow(text) for text in texts]  # the corpus

new_vec = dictionary.doc2bow((tok('Hello world')))  # this is not used anywhere

# Train the LDA model
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,\
                               num_topics=100, update_every=1, chunksize=10000, passes=1)

# print the matrix V of the UEV decomposition (i.e. the learned topics)
for item in lda.print_topics(100):
    print (item)
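
# Hedged illustration of what doc2bow returns: a list of (token_id, count) pairs; tokens
# missing from the dictionary are simply dropped. The ids below depend on the corpus.
example_bow = dictionary.doc2bow(['hello', 'world', 'hello'])
print(example_bow)  # e.g. [(id_of_hello, 2), (id_of_world, 1)] if both tokens are in the dictionary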
    
Esempio n. 39
0
X_test = np.array([''.join(el) for el in nyt_data[trainset_size + 1:len(nyt_data)]])
y_test = np.array([el for el in nyt_labels[trainset_size + 1:len(nyt_labels)]])

#print(X_train)

vectorizer = TfidfVectorizer(min_df=2,
                             ngram_range=(1, 2),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')

test_string = unicode(nyt_data[0])

print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))


X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)






svm_classifier = LinearSVC().fit(X_train, y_train)
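
# Hedged illustration (not from the original script) of what the analyzer built above produces
# with stop_words='english' and ngram_range=(1, 2): unigrams plus bigrams, stop words removed.
print vectorizer.build_analyzer()(u"The quick brown fox")
# expected along the lines of: [u'quick', u'brown', u'fox', u'quick brown', u'brown fox']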


Esempio n. 40
0
class NLPPipeline():
    def __init__(self, text, Y, train_size=.85):
        self.model_builders = {'dtc': dtc, 'rfc': rfc}
        steps = ['tfidf', 'feature_engineering', 'lda', 'model']
        self.pipeline_dic = {step: None for step in steps}
        self.text_train, self.text_test, self.Y_train, self.Y_test = split(
            text, Y, train_size=train_size, stratify=Y)
        self.keep_tfidf = lambda tfidf_dic: (tfidf_dic == self.pipeline_dic[
            'tfidf'])
        self.keep_features = lambda features_dic: (features_dic == self.
                                                   pipeline_dic['features'])
        self.prob_info = lambda prob: -prob * np.log(prob)
        self.pipeline_dic = {step: "Default" for step in steps}
        self.train_size = train_size

    def update_tfidf(self, tfidf_dic):
        self.pipeline_dic['tfidf'] = tfidf_dic
        self.tfidf = TfidfVectorizer(**tfidf_dic)
        self.tfidf_train = self.tfidf.fit_transform(self.text_train)
        self.tfidf_train = self.tfidf_train.toarray()
        self.tokenizer = self.tfidf.build_tokenizer()
        self.tfidf_test = self.tfidf.transform(self.text_test)
        self.tfidf_test = self.tfidf_test.toarray()
        self.feature_names = self.tfidf.get_feature_names()

    def update_lda(self, lda_dic):
        def calc_topics_words(num_top_words):
            topics_words = []

            for ix, topic in enumerate(self.lda.components_):
                top_word_inds = topic.argsort()[:-num_top_words - 1:-1]
                topic_words = set(
                    [self.feature_names[i] for i in top_word_inds])
                topics_words.append(topic_words)

            return topics_words

        num_top_words = lda_dic[
            'num_top_words'] if 'num_top_words' in lda_dic else 10
        lda_model_dic = {
            k: v
            for k, v in lda_dic.items() if k != 'num_top_words'
        }
        self.lda = LDA(**lda_model_dic)
        self.lda.fit_transform(self.tfidf_train)
        self.topics_words = calc_topics_words(num_top_words)

    def calc_entropy(self, text):
        '''There are many equivalent ways to calculate entropy; this one seems to be the fastest (about 5x faster than scipy's entropy function).'''

        word_counts = defaultdict(int)
        text_size = float(len(text))

        for word in text:
            word_counts[word] += 1

        word_counts = np.array(list(word_counts.values()))
        word_probs = word_counts / text_size
        # prob_info already returns -p * log(p), so summing the terms gives the (non-negative) entropy
        entropy = sum(map(self.prob_info, word_probs))

        return entropy

    def calc_lda_features(self, tokenized_text):
        num_topics = len(self.topics_words)
        unique_words = set(tokenized_text)
        num_unique_words = float(len(unique_words))
        lda_features = [
            len(unique_words.intersection(topic_words)) / num_unique_words
            for topic_words in self.topics_words
        ]

        return lda_features

    def calc_sentiment_features(self, text):
        min_polarity, max_polarity = -.1, .1
        blob = TextBlob(text)
        polarities = [
            sentence.sentiment.polarity for sentence in blob.sentences
        ]
        polarities = [round(polarity, 2) for polarity in polarities]
        polarity_entropy = self.calc_entropy(polarities)
        polarity_var = np.var(polarities)
        num_pos_sents = len(
            [polarity for polarity in polarities if polarity > max_polarity])
        num_neg_sents = len(
            [polarity for polarity in polarities if polarity < min_polarity])
        num_sents = float(len(polarities))

        pos_sent_freq, neg_sent_freq = num_pos_sents / num_sents, num_neg_sents / num_sents
        num_neutral_sents = num_sents - num_pos_sents - num_neg_sents
        max_pol, min_pol = np.max(polarities) if polarities else 0, min(
            polarities) if polarities else 0
        subjectivities = [
            sentence.sentiment.subjectivity for sentence in blob.sentences
        ]
        subjectivities = [round(x, 2) for x in subjectivities]
        subj_var = np.var(subjectivities)
        max_subj, min_subj = np.max(subjectivities) if polarities else 0, min(
            subjectivities) if polarities else 0
        sentiment_features = [
            polarity_entropy, polarity_var, num_pos_sents, num_neg_sents,
            num_neutral_sents, pos_sent_freq, neg_sent_freq, num_sents,
            max_pol, min_pol, subj_var, max_subj, min_subj
        ]

        return sentiment_features

    def update_features(self, features_dic):
        """
		From a dictionary containing parameter labels and values used for building features (currently just LDA),
		updates feature matrices by re-calculating features for each text. 
		
		Arguments
		features_dic (dictionary): A dictionary with string parameter names as keys and ints/floats as values. 

		Example: 
		features_dic = {'n_components': 10, 'n_words': 10}
		"""
        def calc_features(text):
            words = self.tokenizer(text)
            entropy = self.calc_entropy(words)
            lda_features = self.calc_lda_features(words)
            sentiment_features = self.calc_sentiment_features(text)
            features = [entropy, *lda_features, *sentiment_features]

            return features

        self.pipeline_dic['features'] = features_dic
        self.update_lda(features_dic)
        self.X_train = np.hstack(
            (self.tfidf_train,
             np.array(
                 [np.array(calc_features(text)) for text in self.text_train])))
        self.X_test = np.hstack(
            (self.tfidf_test,
             np.array(
                 [np.array(calc_features(text)) for text in self.text_test])))

    def grid_search(self, step_grids):
        """
		From a nested dictionary containing grids for each pipeline step, fit and score each possible pipeline
		permutation (nested permutation of the step permutations).
		
		Arguments
		step_grids: A nested dictionary containing the step grid for each step. 
					Example: step_grids = {'tfidf': {'min_df': [0.1]},
					                       'features': {'n_components': [10], 'num_top_words': [10]},
					                       'model': {'type': ['rfc']}}

		Returns
		pipeline_scores: A sorted list of 2-tuples containing the pipeline dictionary and score of each pipeline permutation. 
		"""
        def get_step_perms(grid):
            """
			From grid (dict) mapping each parameter name to a list of values for that parameter,
			returns the list of all permutations (dicts) that can be made by choosing a different value
			for each parameter from its values list.
			
			Arguments 
			grid ({string: list}): A dictionary mapping parameter names to a list of parameter values. 
								   Example: grid = {'min_df': [0.1], 'max_df': [0.8, 0.9]}

			Returns 
			step_perms ([dict]): A list of all dictionary permutations for the step that can be made 
								 by choosing different parameter values from each parameter's domain. 

								 Example: For the above grid example, we'd have
								 step_perms = [{'min_df': 0.1, 'max_df: 0.8'}, {'min_df: 0.1', max_df: 0.9}]

			"""

            param_names = list(grid.keys())
            param_val_perms = list(product(*list(grid.values())))
            num_params = len(param_names)
            step_perms = [{
                param_names[j]: param_val_perm[j]
                for j in range(num_params)
            } for param_val_perm in param_val_perms]

            return step_perms

        steps = list(step_grids.keys())
        num_steps = len(steps)
        grids = list(step_grids.values())
        step_perms = list(map(get_step_perms, grids))
        pipeline_perms = list(product(*step_perms))
        pipeline_perms = [{
            steps[i]: pipeline_perm[i]
            for i in range(num_steps)
        } for pipeline_perm in pipeline_perms]
        pipeline_scores = [[
            pipeline_perm, round(self.score(pipeline_perm), 3)
        ] for pipeline_perm in pipeline_perms]
        pipeline_scores.sort(key=lambda x: x[1], reverse=True)

        return pipeline_scores

    def score(self, pipeline_dic):
        tfidf_vectorizer = TfidfVectorizer(**pipeline_dic['tfidf'])
        keep_tfidf = self.keep_tfidf(pipeline_dic['tfidf'])

        if not keep_tfidf:
            self.update_tfidf(pipeline_dic['tfidf'])

        keep_features = keep_tfidf and self.keep_features(
            pipeline_dic['features'])

        if not keep_features:
            self.update_features(pipeline_dic['features'])

        self.model_builder = self.model_builders[pipeline_dic['model']['type']]
        model_dic = {
            key: value
            for key, value in pipeline_dic['model'].items() if key != 'type'
        }
        self.model = self.model_builder(**model_dic)
        self.model.fit(self.X_train, self.Y_train)
        Y_pred = self.model.predict(self.X_test)
        score = accuracy(Y_pred, self.Y_test)
        print(f"Params = {pipeline_dic}, score = {round(score, 3)}. \n")

        return score
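
# Hedged cross-check (not part of the original class) for NLPPipeline.calc_entropy: as the
# docstring notes, scipy.stats.entropy over the raw word counts gives the same value, since
# it normalizes the counts and uses the natural logarithm by default.
from collections import Counter
from scipy.stats import entropy

sample_tokens = ["rain", "rain", "sun", "wind"]
print(entropy(list(Counter(sample_tokens).values())))   # ~1.0397, matching calc_entropy(sample_tokens)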
Esempio n. 41
0
class FeatureExtractor:

    vectorizer = None
    feature_names = None
    feature_matrix = None

    def train_extractor_from_lines(self, train_lines, labels, test_lines):
        self.vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                          max_features=DISTINCT_WORDS_CNT)
        self.vectorizer.fit(train_lines + test_lines)

        pass

    def load_vectorizer(self):
        input_file = open('../models/tfidf_vectorizer.pkl', 'rb')
        self.vectorizer = pickle.load(input_file)
        input_file.close()
        pass

    def save_vectorizer(self):
        output_file = open('../models/tfidf_vectorizer.pkl', 'wb')
        pickle.dump(self.vectorizer, output_file)
        output_file.close()
        pass

    def train_extractor(self, full=False):

        if not full:
            train_lines = file2lines('../data/train_lite.csv')
            labels = file2labels('../data/train_lite.csv')
            test_lines = file2lines('../data/test_lite.csv')
        else:
            train_lines = file2lines('../data/train.csv')
            labels = file2labels('../data/train.csv')
            test_lines = file2lines('../data/test.csv')

        self.train_extractor_from_lines(train_lines, labels, test_lines)

        pass

    def lines2words(self, lines):
        self.tokenizer = self.vectorizer.build_tokenizer()

        return [self.tokenizer(line) for line in lines]

    def lines2features(self, lines, use_tense=False):
        """
                returns DataFrame(feature_matrix, feature_name)

                ['word_rainy', 'word_sunny'],
                array([
                        [1, 0.4, 0.2],
                        [0.2, 1, 0.2],
                ])
                """
        self.feature_names = []
        self.feature_matrix = None

        # tf-idf features
        data = self.vectorizer.transform(lines).toarray()

        self.feature_names = self.vectorizer.get_feature_names()
        self.feature_matrix = data

        # additional features
        add_features = []
        important_words = [
            'sunny', 'wind', 'humid', 'hot', 'cold', 'dry', 'ice', 'rain',
            'snow', 'tornado', 'storm', 'hurricane'
        ]
        important_words = [
            'cloud', 'cold', 'dry', 'hot', 'humid', 'hurricane', 'ice', 'rain',
            'snow', 'storm', 'sunny', 'tornado', 'wind'
        ]
        self.feature_names = self.feature_names + [
            'impt_words:' + word for word in important_words
        ]
        if use_tense:
            self.feature_names = self.feature_names + [
                'past_tense_num', 'present_tense_num'
            ]

        all_words = self.lines2words(lines)
        for words in all_words:
            # important words
            important_words_ftr = [
                int(word in words) for word in important_words
            ]
            add_features.append(important_words_ftr)

            # tense
            if use_tense:
                # `words` is already a token list, so tag it directly; list() handles Python 3's lazy zip
                tagz = list(zip(*nltk.pos_tag(words)))[1]
                past_num = len([v for v in tagz if v == 'VBD'])
                present_num = len([v for v in tagz if v in ['VBP', 'VB']])

                add_features.append([past_num, present_num])

        self.feature_matrix = np.hstack((self.feature_matrix, add_features))

        return DataFrame(self.feature_matrix, columns=self.feature_names)
Esempio n. 42
0
# Get predictions from the Naive Bayes classifier
unclassified_tweet_sentiments_bayes = classifier_bayes.predict(
    unclassified_features)

# Store the sentiment in a new column, NOTE 0 is negative, 4 is positive
unclassified_df['Sentiment'] = unclassified_tweet_sentiments_bayes

unclassified_df.head()

# Next, classify the tweets by major political party. In the Canadian context there are 4 party categories to consider: ***'Liberal', 'Conservative', 'NDP', 'Others'***
# Since each tweet needs to be assigned to a party, a simple word-frequency counting approach is used for the assignment (see the sketch after this example)

# Preprocessor and tokenizer code

preprocessor = vectorizer.build_preprocessor()
tokenizer = vectorizer.build_tokenizer()


# Defining the bag_of_words function
def bag_of_words(tw):
    '''(str) -> dict
    Input: a string tw (a tweet line)
    Output: a dictionary mapping each unigram to its frequency
    '''

    unigram_ls = tokenizer(preprocessor(tw))

    # Create an empty dictionary
    bag_words = {}
    # Run through the tokenized unigram list and count each unigram
    for item in unigram_ls:
        bag_words[item] = bag_words.get(item, 0) + 1

    return bag_words
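
# Hedged sketch of the party-assignment step described above; the keyword sets are
# illustrative placeholders (only the four party categories come from the comments above),
# and the preprocessor is assumed to lowercase the text (the sklearn default).
party_keywords = {
    'Liberal': {'liberal'},
    'Conservative': {'conservative'},
    'NDP': {'ndp'},
}


def assign_party(tw):
    counts = bag_of_words(tw)
    scores = {party: sum(counts.get(w, 0) for w in words)
              for party, words in party_keywords.items()}
    best_party, best_score = max(scores.items(), key=lambda kv: kv[1])
    return best_party if best_score > 0 else 'Others'


print(assign_party("The Liberal plan for housing ..."))   # -> 'Liberal'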
Esempio n. 43
0
__author__ = 'Alena'
from sklearn.datasets import fetch_20newsgroups
dataset=fetch_20newsgroups()
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
tok = vect.build_tokenizer()
texts = []

Y = vect.fit_transform(dataset.data)
first = Y.getcol(0)
second = Y.getcol(1)
word1 = []
for i, el in enumerate(first):
    word1.append(first._get_single_element(i, 0))
word2 = []
for i, el in enumerate(second):
    word2.append(second._get_single_element(i, 0))

# L1 (Manhattan) distance between the first two tf-idf columns
distance = 0
for i in range(len(word2)):
    distance += abs(word1[i] - word2[i])
print(distance)