Example #1
class UnitClassifier(Trainer):
    def __init__(self, x, y, train_ratio):
        super(UnitClassifier, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))

        clf = RandomForestClassifier(verbose=0, n_jobs=-1, n_estimators=20)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'misclass': np.mean(y_pred != self._y_test),
            'report': classification_report(self._y_test, y_pred,
                                            target_names=self._model.classes_)
        }
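For reference, the CountVectorizer plus TfidfTransformer pair used in Fit and Preprocess above can be collapsed into a single TfidfVectorizer. A minimal sketch (the surrounding Trainer machinery is assumed to stay the same):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()                       # CountVectorizer + TfidfTransformer in one object
x_train_tfidf = vectorizer.fit_transform(["first training text", "second training text"])
x_new_tfidf = vectorizer.transform(["unseen text"])  # reuses the fitted vocabulary and idf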
Example #2
def race_tfidf(data, can_be_noun_arg, stop_words):
    print 
    data = data.groupby('race')['last']
    data = dict(list(data))
    docs = []
    for k in data:
        docs.append(' '.join(data[k]))
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(docs)
    #print counts.todense().shape
    tfidf = TfidfTransformer(norm="l2", sublinear_tf='True')
    tfidf.fit(counts)
    #print "IDF:", tfidf.idf_.shape
    tf_idf_matrix = tfidf.transform(counts)
    freqs = {}
    sorted_voc = sorted(count_vectorizer.vocabulary_.iteritems(), key=operator.itemgetter(1))
    terms,_ = zip(*sorted_voc)
    for i,k in enumerate(data.keys()):
        # make list
        row = np.array(tf_idf_matrix.todense()[i,:])[0].tolist()
        freq = zip(terms, row)
        freqs[k] = sorted(freq, reverse=True, key=lambda x: x[1])
        print freqs[k][:5]
    #print tf_idf_matrix.todense().shape
    return freqs
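The loop above densifies the whole tf-idf matrix just to rank terms per group; a small helper sketch that reads the top-n terms from a single sparse row instead (tf_idf_matrix and terms as built in the function above):

import numpy as np

def top_terms(tf_idf_matrix, terms, i, n=5):
    """Top-n (term, weight) pairs for row i of a sparse tf-idf matrix."""
    row = tf_idf_matrix.getrow(i).toarray().ravel()
    top = np.argsort(row)[::-1][:n]
    return [(terms[j], row[j]) for j in top]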
Example #3
 def cal_weight(self, key_words):
     """
     计算获取特征词后的权重信息
     :param key_words: [{'sentence': {}}, ...] or [{}, ...] 有可能是测试集数据有可能是训练集数据
     :return:
     """
     print "Cal Weight: ", time.strftime('%Y-%m-%d %H:%M:%S')
     if not self.istrain:
         dir_ = os.path.join(TEXT_OUT, "key_words")
         filename = self.__class__.__name__ + ".txt" if self.subjective else self.__class__.__name__ + "_objective.txt"
         url = os.path.join(dir_, filename)
         train_key_words = FileUtil.read(url)
     else:
         train_key_words = key_words
     train_key_words = [d.get("sentence") if "sentence" in d else d for d in train_key_words]
     key_words = [d.get("sentence") if "sentence" in d else d for d in key_words]
     # compute tf (normalize each document's term counts to frequencies)
     key_words = [{k: v / sum(d.values()) for k, v in d.items()} for d in key_words]
     fit_train_key_words = Feature_Hasher.transform(train_key_words)
     fit_key_words = Feature_Hasher.transform(key_words)
     tfidf = TfidfTransformer()
     # fit the idf on the training key words
     tfidf.fit(fit_train_key_words)
     weight_matrix = tfidf.transform(fit_key_words)
     print "Cal Weight Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
     print
     return weight_matrix
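Feature_Hasher and FileUtil above are project-level helpers that are not shown. A self-contained sketch of the same hashing-then-idf pattern with sklearn's own FeatureHasher (the toy dicts are made up):

from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import TfidfTransformer

hasher = FeatureHasher(n_features=2 ** 16, alternate_sign=False)  # keep hashed counts non-negative
train_counts = hasher.transform([{'good': 2, 'movie': 1}, {'bad': 1, 'movie': 1}])
test_counts = hasher.transform([{'good': 1, 'film': 1}])

tfidf = TfidfTransformer()
tfidf.fit(train_counts)                      # idf learned from the training side only
test_weights = tfidf.transform(test_counts)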
Example #4
def tfidf_score(train_set, test_set):

    stopwords = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords))
    #Remove all the None Types from the input datasets
    train_set = filter(None, train_set)
    test_set = filter(None, test_set)
    vectorizer.fit_transform(train_set)
    #print "Word Index is {0} \n".format(vectorizer.vocabulary_)
    smatrix = vectorizer.transform(test_set)
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(smatrix)
    #print "IDF scores:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(smatrix)
    pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T
    msum = tf_idf_matrix.sum(axis=1)
    cos_sum = pairwise_similarity.sum(axis=1)
    mlist = msum.tolist()
    cos_sim = cos_sum.tolist()
    count = 0
    tfidfscores = {}
    for s in train_set:
        tfidfscores[s] = []
        tfidfscores[s].append(mlist[count][0])
        tfidfscores[s].append(cos_sim[count][0])
        count += 1
    return tfidfscores
class FeatureVectorTfIdf(object):

    def __init__(self):
        self.cvector_obj = CountVectorizer()
        self.tfidf_obj = TfidfTransformer(norm="l2")

    def extract_features(self, data):
        # fit_transform learns the vocabulary from the training data and extracts count features

        count_vect = self.cvector_obj.fit_transform(data)
        return self.tf_idf_vector(count_vect)

        # You can get the list of features
        # print self.cvector_obj.get_feature_names()

    def word_vectors_tfidf(self, vect_data):
        '''convert documents into matrix'''

        vect_data = self.cvector_obj.transform(vect_data)
        return self.tf_idf_vector(vect_data)

    def tf_idf_vector(self, vect_data):
        '''convert document count matrix into a tf-idf matrix'''
        self.tfidf_obj.fit(vect_data)

        '''convert into tf-idf matrix'''
        tfidf = self.tfidf_obj.transform(vect_data)
        return tfidf
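Note that tf_idf_vector refits self.tfidf_obj on every call, so word_vectors_tfidf re-learns the idf from whatever data it is given. If the usual fit-on-train, transform-on-test behaviour is wanted, a sketch of that variant (the class name here is made up):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

class FeatureVectorTfIdfFitOnce(object):

    def __init__(self):
        self.cvector_obj = CountVectorizer()
        self.tfidf_obj = TfidfTransformer(norm="l2")

    def extract_features(self, data):
        counts = self.cvector_obj.fit_transform(data)   # learn the vocabulary from training data
        self.tfidf_obj.fit(counts)                       # learn the idf from training data, once
        return self.tfidf_obj.transform(counts)

    def word_vectors_tfidf(self, vect_data):
        counts = self.cvector_obj.transform(vect_data)
        return self.tfidf_obj.transform(counts)          # reuse the fitted idf for new documents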
Example #6
def tf_idf_features(train_ls):
    train_set = train_ls #Documents
    vectorizer = CountVectorizer()#stop_words = stopWords
    transformer = TfidfTransformer()
    trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
    transformer.fit(trainVectorizerArray)
    return transformer.transform(trainVectorizerArray).toarray()
def tf_idf(tag_matrix):
    #calculate TF-IDF
    tfidf = TfidfTransformer(None, use_idf=True)
    tfidf.fit(tag_matrix)
    tag_matrix = tfidf.transform(tag_matrix)
    dense_tag_matrix = tag_matrix.todense()
    return dense_tag_matrix
Example #8
def tfidf_step_by_step():
    """ Example of calculating TF-IDF for OSM nodes.
    Document is a list of keys.
    """

    learn_data_set = documents_gen()
    test_data_set = documents_gen()

    # calculate term-frequency
    vectorizer = CountVectorizer(stop_words=stop_words,
        token_pattern='[a-z0-9_\-:]+')
    vectorizer.fit_transform(learn_data_set)
    #pprint.pprint(vectorizer.vocabulary_)

    # freq_term_matrix is a sparse matrix (elemens stored in Coordinate format
    # http://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_.28COO.29 )
    freq_term_matrix = vectorizer.transform(test_data_set)
    # freq_term_matrix.todense()

    # l2 - Euclidean normalization
    # http://en.wikipedia.org/wiki/Norm_%28mathematics%29#Euclidean_norm
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)

    tf_idf = tfidf.transform(freq_term_matrix)

    pprint.pprint(tf_idf.todense())
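documents_gen is not shown; given the docstring ("Document is a list of keys"), it presumably yields one space-joined string of tag keys per OSM node. A hypothetical stand-in, for illustration only:

def documents_gen():
    """Hypothetical stand-in: yield each OSM node as a space-joined string of its tag keys."""
    nodes = [
        {'amenity': 'cafe', 'name': 'Some Cafe', 'addr:city': 'Sometown'},
        {'highway': 'bus_stop', 'name': 'Main St'},
    ]
    for tags in nodes:
        yield ' '.join(tags.keys())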
def cal_product_title_tfidf():

    #PART I compute the tf-idf for product title
    print "\nBegins,compute the tf-idf for product title ..."


    print "\nStemming product_title..."
    AllSet['product_title'] = AllSet['product_title'].map(lambda x : stem_process(x))
    product_title = AllSet['product_title']

    print "\nGet the (product title vocabulary)-(search term) frequency matrix..."
    search_vect_tittle = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_tittle.fit(product_title)#learn the vocabulary
    search_tittle_fq_matrix = search_vect_tittle.transform(search_term) #get the (product title vocabulary)-(search term) frequency matrix

    print "\nGet the (product title vocabulary)-(product_title) frequency matrix"
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit_transform(product_title)#learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title) #get the (product title vocabulary)-(product_title) frequency matrix

    print "\nGet the idf matrix"
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix) # get idf for each vocabulary
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix) #get the idf matrix

    print "\nCompute the result of tf-idf for product title ..."
    tf_idf_title_result = [] #compute the result of tf-idf for product title
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append((np.multiply(tf_idf_title_matrix[index], search_tittle_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],"product_title_tfidf": tf_idf_title_result}).to_csv('product_title_tfidf.csv', index=False)

    return 0
def cal_product_description_tfidf():
    #PART II compute the tf-idf for product description
    global AllSet  # AllSet is re-assigned by the merge below, so it must be declared global
    print "\nBegin computing the tf-idf for product description ..."
    product_description_data = pd.read_csv('product_descriptions.csv')

    print "\nMerge the product description into database..."
    AllSet = pd.merge( AllSet , product_description_data, how='left', on='product_uid')

    print "\nStemming the product description ..."
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description=AllSet['product_description']

    print "\nGet the (product description vocabulary)-(search term) frequency matrix..."
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_descrip.fit(product_description)#learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term) #get the (product description vocabulary)-(search term) frequency matrix

    print "\nGet the (product description vocabulary)-(product_description) frequency matrix..."
    description_vect = CountVectorizer(stop_words ='english')
    description_vect.fit_transform(product_description)#learn the vocabulary
    description_fq_matrix = description_vect.transform(product_description) #get the (product description vocabulary)-(product_description) frequency matrix

    print "\nGet the idf matrix..."
    tfidf_transformer = TfidfTransformer(norm="l2",smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix) # get idf for each vocabulary
    tf_idf_descrip_matrix  = tfidf_transformer.transform(description_fq_matrix) #get the idf matrix


    print "\nCompute the result of tf-idf for product description ..."
    tf_idf_descrip_result=[]#compute the result of tf-idf for product title
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append((np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id":AllSet['id'],"product_description_tfidf": tf_idf_descrip_result}).to_csv('product_description_tfidf.csv', index=False)
Example #11
class CaloriesRegressor(Trainer):
    def __init__(self, x, y, train_ratio):
        super(CaloriesRegressor, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))

        clf = RandomForestRegressor(verbose=0, n_jobs=-1, n_estimators=100)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'median_absolute_error':
            median_absolute_error(self._y_test, y_pred),
            'mean_squared_error': mean_squared_error(self._y_test, y_pred),
            'explained_variance_score':
            explained_variance_score(self._y_test, y_pred),
        }
Example #12
def return_idf(instances, labels):

    transformer = TfidfTransformer(smooth_idf=True)
    transformer.fit(instances)
    idf = dict.fromkeys(range(instances.shape[1]), 0)
    for feature,value in enumerate(list(transformer._idf_diag.data)):
        idf[feature] = value
    return idf
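return_idf reads the private _idf_diag attribute; the fitted idf weights are also available through the public idf_ attribute, which avoids depending on sklearn internals (the labels argument above is unused). A sketch of the same dictionary built from idf_:

from sklearn.feature_extraction.text import TfidfTransformer

def return_idf_public(instances):
    transformer = TfidfTransformer(smooth_idf=True)
    transformer.fit(instances)
    # idf_ holds one smoothed idf value per feature column
    return dict(enumerate(transformer.idf_))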
Example #13
def TextTransform(X, Xtest = None):
    Write("Process Data with TFIDF...\n")
    tfidf = TfidfTransformer()
    if Xtest is None:
        X = tfidf.fit_transform(X).toarray()
        return X
    else:
        tfidf.fit(X)
        return tfidf.transform(X).toarray(), tfidf.transform(Xtest).toarray()
Example #14
def setup(train, test, binaryOpt = False):
    count_vectorizer = CountVectorizer(binary = binaryOpt)
    count_vectorizer.fit_transform(train)
    freq_term_matrix = count_vectorizer.transform(test)
    if binaryOpt:
        return freq_term_matrix
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    return tf_idf_matrix
	def do_idf_original(self):
		"""Calcul de l'idf directement avec sklearn

		On calcule ici l'idf directement avec les classe de sklearn. On obtient le même résultat que do_idf_variante.
		Calculer nous même l'idf nous permet de mieux contrôler ce que l'on fait, notamment sur la variante utilisée.
		"""
		tfidf_transformer = TfidfTransformer()
		tfidf_transformer.fit(self.tfidf_matrix)

		self.idf = tfidf_transformer.idf_
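As the docstring notes, computing the idf by hand gives control over the exact variant. For comparison, TfidfTransformer's default (smooth_idf=True) uses idf(t) = ln((1 + n) / (1 + df(t))) + 1; a sketch that reproduces it, assuming the input is a documents-by-terms count matrix such as self.tfidf_matrix above:

import numpy as np

def manual_smoothed_idf(count_matrix):
    """Reproduce TfidfTransformer's default (smooth_idf=True) idf values."""
    n_docs, n_terms = count_matrix.shape
    df = np.bincount(count_matrix.nonzero()[1], minlength=n_terms)  # document frequency per term
    return np.log((1.0 + n_docs) / (1.0 + df)) + 1.0                # should match tfidf_transformer.idf_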
Example #16
def create_tf_idf(bow):
    """ (SPARSE VERSION) Reads the bag of words representation from GridFS, then generates the TF-IDF representation """

    print "Creating TF-IDF Bag of Words"

    transformer = TfidfTransformer(norm=u'l2', use_idf=True)
    transformer.fit(bow)
    tf_idf = transformer.transform(bow)

    return tf_idf
 def _collect(self, splited_words_list, sentence_size):
     print "Collection datas: ", time.strftime('%Y-%m-%d %H:%M:%S')
     data = [d.get("sentence") for d in splited_words_list[: sentence_size]]
     class_label = [d.get("emotion-1-type") for d in splited_words_list[: sentence_size]]
     fit_data = Feature_Hasher.transform(data)
     tfidf = TfidfTransformer()
     tfidf.fit(fit_data)
     a = tfidf.transform(fit_data)
     print "Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
     return a, class_label, []
Example #18
def tfidf_normalize(articles_with_id):
    global NON_STOPWORD_LIMIT
    stemmed_articles_with_id = [(aid, stem_article(article)) for (aid, article) in articles_with_id]
    stemmed_articles = [article for (aid, article) in stemmed_articles_with_id]
    # test_set = train_set
    # instantiate vectorizer with English language, using stopwords and set min_df, max_df parameters and the tokenizer
    vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1, token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b")
    # by applying the vectorizer instance to the train set
    # it will create a vocabulary from all the words that appear in at least min_df and in no more than max_df
    # documents in the train_set
    vectorizer.fit_transform(stemmed_articles)
    # vectorizer transform will apply the vocabulary from the train set to the test set. In my case,
    # they are the same set: whole Wikipedia.
    # this means that each article will get representation based on the words from the vocabulary and
    # their TF-IDF values in the Scipy sparse output matrix
    freq_term_matrix = vectorizer.transform(stemmed_articles)
    long_articles_with_id = []
    assert freq_term_matrix.shape[0] == len(articles_with_id)
    for (i, article_with_id) in zip(xrange(freq_term_matrix.shape[0]), stemmed_articles_with_id):
        row = freq_term_matrix.getrow(i)
        if row.getnnz() >= NON_STOPWORD_LIMIT:
            long_articles_with_id.append(article_with_id)

    long_articles = [article for (aid, article) in long_articles_with_id]

    vectorizer = CountVectorizer(stop_words="english", min_df=3, max_df=0.1, token_pattern=r"\b[a-zA-Z][a-zA-Z]+\b")
    vectorizer.fit_transform(long_articles)

    freq_term_matrix = vectorizer.transform(long_articles)

    # Gabrilovich says that they threshold TF on 3 (remove word-article association if that word
    # does not appear at least 3 times in that single article
    # freq_term_matrix.data *= freq_term_matrix.data>=3
    # freq_term_matrix.eliminate_zeros() # I think this is not necessary...
    # this is a log transformation as applied in (Gabrilovich, 2009), i.e., that is
    # how he defines TF values. In case of TF = 0, this shall not affect such value
    # freq_term_matrix.data = 1 + np.log( freq_term_matrix.data )
    # instantiate the tf-idf transformer
    tfidf = TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=True)
    # tfidf uses the freq_term_matrix to calculate IDF values for each word (element of the vocabulary)
    tfidf.fit(freq_term_matrix)
    # finally, tfidf will calculate TFIDF values with transform()
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    # tf_idf_matrix.data = np.log(np.log(tf_idf_matrix.data))
    tf_idf_matrix = normalize(tf_idf_matrix, norm="l2", axis=0, copy=False)
    # now we put our matrix to CSC format (as it helps with accessing columns for inversing the vectors to
    # words' concept vectors)
    tf_idf_matrix = tf_idf_matrix.tocsc()
    # we need vocabulary_ to be accessible by the index of the word so we inverse the keys and values of the
    # dictionary and put them to new dictionary word_index
    word_index = dict((v, k) for k, v in vectorizer.vocabulary_.iteritems())
    M, N = tf_idf_matrix.shape
    print "Articles: ", M
    print "Words: ", N
    return tf_idf_matrix, word_index, long_articles_with_id
    def test_same_idf_diag(self):
        X, X_rdd = self.generate_dataset(4, 1000, None)

        local = TfidfTransformer()
        dist = SparkTfidfTransformer()

        local.fit(X)
        dist.fit(X_rdd)

        assert_array_almost_equal(local._idf_diag.toarray(),
                                  dist._idf_diag.toarray())
Example #20
def data_pro():
    [[corpus_train, target_train], [corpus_test, target_test]] = load_data()
    count_v1 = CountVectorizer()  # converts the texts into a term-count matrix; element a[i][j] is the count of word j in document i
    counts_train = count_v1.fit_transform(corpus_train)  # fit_transform turns the texts into the term-count matrix
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf_train = transformer.fit(counts_train).transform(counts_train)  # fit then transform to compute tf-idf
    weight_train = tfidf_train.toarray()  # weight[i][j] is the tf-idf value of word j in document i
    count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_)  # share the vocabulary between the two CountVectorizers
    counts_test = count_v2.fit_transform(corpus_test)  # fit_transform turns the texts into the term-count matrix
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf_test = transformer.fit(counts_train).transform(counts_test)  # idf fitted on the training counts, applied to the test counts
    weight_test = tfidf_test.toarray()  # weight[i][j] is the tf-idf value of word j in document i
    return [[weight_train, target_train], [weight_test, target_test]]
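Two smaller idioms for data_pro: once count_v1 is fitted its vocabulary can be reused directly, and a single fitted transformer can weight both splits. A sketch with the same variable names:

counts_test = count_v1.transform(corpus_test)   # no second CountVectorizer is needed once the vocabulary is fixed

transformer = TfidfTransformer()
transformer.fit(counts_train)                                  # idf learned from the training corpus only
weight_train = transformer.transform(counts_train).toarray()
weight_test = transformer.transform(counts_test).toarray()     # same idf applied to the test counts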
Example #21
    def __init__(self, feature='tfidf', **kwargs):
        super(IMDB, self).__init__(**kwargs)
        if self.conf is not None:
            feature = self.conf.get('feature', 'tfidf')
        if feature.startswith('tfidf'):
            max_features = 5000
            (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
        else:
            (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=None, 
                    skip_top=0, maxlen=None, seed=113, start_char=1, oov_char=2, index_from=3)
        X, y = self.get_data_by_imageset(X_train, y_train, X_test, y_test)
        print('data_set={}, Average sequence length: {}'.format(self.data_set, np.mean(list(map(len, X)))))

        #feature
        if feature == 'origin':
            maxlen = 400
            X = sequence.pad_sequences(X, maxlen=maxlen)
        elif feature == 'tfidf':
            from sklearn.feature_extraction.text import TfidfTransformer
            transformer = TfidfTransformer(smooth_idf=False)
            #transformer = TfidfTransformer(smooth_idf=True)
            X_train_bin = np.zeros((len(X_train), max_features), dtype=np.int16)
            X_bin = np.zeros((len(X), max_features), dtype=np.int16)
            for i, X_i in enumerate(X_train):
                X_train_bin[i, :] = np.bincount(X_i, minlength=max_features)
            for i, X_i in enumerate(X):
                X_bin[i, :] = np.bincount(X_i, minlength=max_features)
            transformer.fit(X_train_bin)
            X = transformer.transform(X_bin)
            X = np.asarray(X.todense())
        elif feature == 'tfidf_seq':
            from sklearn.feature_extraction.text import TfidfTransformer
            transformer = TfidfTransformer(smooth_idf=False)
            maxlen = 400
            N = len(X)
            X_bin = np.zeros((N, max_features), dtype=np.int16)
            for i, X_i in enumerate(X):
                X_bin_i = np.bincount(X_i)
                X_bin[i, :len(X_bin_i)] = X_bin_i
            tfidf = transformer.fit_transform(X_bin)
            tfidf = np.asarray(tfidf.todense())
            X_id = sequence.pad_sequences(X, maxlen=maxlen)
            X = np.zeros(X_id.shape, dtype=np.float32)
            for i in range(N):
                X[i, :] = tfidf[i][X_id[i]]
        else:
            raise ValueError('Unkown feature: ', feature)

        X = X[:,np.newaxis,:,np.newaxis]
        self.X = self.init_layout_X(X)
        self.y = self.init_layout_y(y)
def tf(train,test):
    """Transform feature vectors: TF"""
    trf = TfidfTransformer(use_idf=False)
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train,test
def tfidf(train,test):
    """Transform feature vectors: TFIDF"""
    trf = TfidfTransformer()
    trf = trf.fit(train)
    train = trf.transform(train)
    test = trf.transform(test)
    return train,test
Example #24
def tfidf_raw(cnt_articles, article_ids, train_set_dict):
    # use the whole (Wiki) set as both the train and test set
    train_set = train_set_dict.values()
    test_set = train_set
    # train_set = ("The sky is blue.", "The sun is bright.")
    # test_set = ("The sun in the sky is bright.","We can see the shining sun, the bright sun.")
    vectorizer = CountVectorizer(stop_words="english")
    # instantiate vectorizer with English language, using stopwords and set min_df, max_df parameters and the tokenizer
    # vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.7, token_pattern=r'\b[a-zA-Z][a-zA-Z]+\b')
    # by applying the vectorizer instance to the train set
    # it will create a vocabulary from all the words that appear in at least min_df and in no more than max_df
    # documents in the train_set
    vectorizer.fit_transform(train_set)
    # print "Vocabulary:", vectorizer.vocabulary_
    # vectorizer transform will apply the vocabulary from the train set to the test set. In my case,
    # they are the same set: whole Wikipedia.
    # this means that each article will get representation based on the words from the vocabulary and
    # their TF-IDF values in the Scipy sparse output matrix
    freq_term_matrix = vectorizer.transform(test_set)
    print freq_term_matrix.todense()
    # this is a log transformation as applied in (Gabrilovich, 2009), i.e., that is
    # how he defines TF values. In case of TF = 0, this shall not affect such value
    # freq_term_matrix.data = 1 + np.log( freq_term_matrix.data )
    # instantiate the tf-idf transformer
    tfidf = TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=True)
    # print tfidf
    # tfidf uses the freq_term_matrix to calculate IDF values for each word (element of the vocabulary)
    tfidf.fit(freq_term_matrix)
    # print tfidf.idf_
    # finally, tfidf will calculate TFIDF values with transform()
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print
    # print tf_idf_matrix.todense()
    print
    # now we put our matrix to CSC format (as it helps with accessing columns for inversing the vectors to
    # words' concept vectors)
    CSC_matrix = tf_idf_matrix.tocsc()
    CSC_matrix = normalize(CSC_matrix, norm="l2", axis=0, copy=False)
    # we need vocabulary_ to be accessible by the index of the word so we inverse the keys and values of the
    # dictionary and put them to new dictionary word_index
    word_index = dict((v, k) for k, v in vectorizer.vocabulary_.iteritems())
    print word_index
    M, N = CSC_matrix.shape
    print "Articles: ", M
    print "Words: ", N
    return M, N, CSC_matrix, word_index
Example #25
 def preprocess(self):
     count_vect = CountVectorizer(stop_words='english')
     tokens = [self.tokenize(line) for line in self.data]
     count_vect.fit(tokens)
     self.count_vect = count_vect
     self.tfidf_vect = TfidfVectorizer(stop_words='english')
     self.tfidf_vect.fit(tokens)
     self.X_tfidfVect = self.tfidf_vect.transform(tokens)
     X_train_counts = count_vect.transform(tokens)
     self.X = X_train_counts
     tf_transformer = TfidfTransformer()
     tf_transformer.fit(X_train_counts)
     X_train_counts = tf_transformer.transform(X_train_counts)
     self.tf_transformer = tf_transformer
     self.X_tfidf = X_train_counts
     self.query_vector = self.tfidf_vect.transform([self.tokenize(self.query)])
     voc = count_vect.vocabulary_
     self.vocabulary = [(v,k) for k, v in voc.iteritems()]
     sorted_voc = sorted(self.vocabulary,key=lambda value:value[0],reverse=False)
     self.vocabulary = [k for v,k in sorted_voc]
Example #26
def get_keyphrase_data(home, group, max_keyphrases, update=False):
  start = time.time()
  path = '%s/output/entity/%s/keyphrase-data.txt' % (home, group)
  fp = codecs.open(path, 'r', 'UTF-8')
  data = simplejson.load(fp)
  fp.close()
  nrow = data["next-mention-index"]
  ncol = data["next-kp-index"]  
  path = '%s/output/entity/%s/kp-fof-sparse.npz' % (home, group)
  if update or not os.path.exists(path):
    print "Updating first-order features..."
    keyphrase_map = {}
    S = lil_matrix((nrow, ncol))
    for mention_id, mention_index in data['mention-index-map'].iteritems():
      keyphrases = sorted(data['mention-kps'][mention_id], key=lambda x: -x[1])
      keyphrases = keyphrases[:max_keyphrases]
      keyphrase_map[mention_index] = keyphrases
      for kp_index, _, frequency in keyphrases: 
          S[mention_index, kp_index] = 1.0
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(S)
    S = tfidf.transform(S)
    S = S.tocsc()
    sums = S.sum(axis=0)
    np.savez(path, S.data, S.indices, S.indptr, sums)
    fp = codecs.open('%s/output/entity/%s/keyphrase-map.json' % (home, group), 'w', 'UTF-8')
    simplejson.dump(keyphrase_map, fp, indent=4)
    fp.close()
  else: print "Loading keyphrase data..."  
  npzfile = np.load(path)
  S = csc_matrix((npzfile['arr_0'], npzfile['arr_1'], npzfile['arr_2']), shape=(nrow, ncol))
  sums = npzfile['arr_3']
  fp = codecs.open('%s/output/entity/%s/keyphrase-map.json' % (home, group), 'r', 'UTF-8')
  keyphrase_map = simplejson.load(fp)
  fp.close()
  keyphrase_map = {int(key):value for key, value in keyphrase_map.iteritems()}
  finish = time.time()
  print '\ttook %0.3f s' % (finish-start)
  return data, keyphrase_map, S, sums
Example #27
def get_matrices(good_deals,bad_deals,test_deals):
    """ Return the training and testing matrices with labels """
    # Generate labels for good and bad deals
    labels=[0]*len(good_deals)+[1]*len(bad_deals)
    
    deals = good_deals+bad_deals
    
    # Instance of vectorizer that records counts of terms
    count_vectorizer = CountVectorizer()
    
    # Fit the vectorizer on the training data and transform both sets into count matrices
    train=count_vectorizer.fit_transform(deals)    
    test = count_vectorizer.transform(test_deals)
    
    # Initialize the TF-IDF transformer; fit the idf on the training counts only
    tfidf = TfidfTransformer()
    tfidf.fit(train)
    train_mat=tfidf.transform(train)
    test_mat = tfidf.transform(test)
    
    return train_mat.todense(),labels,test_mat.todense()
Example #28
class TFIDF(object):

    def __init__(self, preprocessing=None, binary=False):
        super(TFIDF, self).__init__()

        if preprocessing is None:
            preprocessing = TextPreProcessing()
        self.preprocessing = preprocessing

        _ = lambda x: x
        self._cv = CountVectorizer(tokenizer=_, analyzer=_, preprocessor=_,
                                   binary=binary)
        self._tfidf = TfidfTransformer(
            norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

    def fit(self, documents):
        counts = self._cv.fit_transform(
            [self.preprocessing(d) for d in documents])
        self._tfidf.fit(counts)

    def transform(self, documents):
        counts = self._cv.transform([self.preprocessing(d) for d in documents])
        return self._tfidf.transform(counts)
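A possible usage sketch for the TFIDF class above. Because identity functions are passed as tokenizer, analyzer and preprocessor, the class expects pre-tokenized documents; the token lists below and the identity preprocessing are assumptions for illustration:

docs = [["good", "movie"], ["bad", "movie"], ["good", "plot", "good", "cast"]]

model = TFIDF(preprocessing=lambda tokens: tokens)   # identity preprocessing for the sketch
model.fit(docs)
weights = model.transform(docs)                      # sparse tf-idf matrix, one row per document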
Example #29
def vectorize_data(train_data, test_data):
    global app_vocabulary

    from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer  
    vocabulary=[x.replace('.', '_') for x in list(app_vocabulary)]
    save_vocabulary(vocabulary, 'vocabulary.txt')
    # train_data=[x.replace('.','_') for x in train_data]
    # test_data=[x.replace('.','_') for x in test_data]

    count_v1= CountVectorizer(vocabulary=vocabulary)
    # import pdb;pdb.set_trace()
    counts_train = count_v1.fit_transform(train_data)
    # print "the shape of train is "+repr(counts_train.shape)  
    
    count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_)
    counts_test = count_v2.fit_transform(test_data)

    save_vocabulary(count_v2.get_feature_names(), 'vocabulary.txt')

    tfidftransformer=TfidfTransformer()
    tfidf_train=tfidftransformer.fit(counts_train).transform(counts_train)
    tfidf_test=tfidftransformer.fit(counts_test).transform(counts_test)

    return tfidf_train, tfidf_test
def provide_idf_transformer_and_idf_dtm(vectorizer_params,count_dtm):
    """
    #TODO: update docu
    Takes count_dtm and transforms it to idf_dtm.
    
    :param count_dtm - scipy.sparse.csr.csr_matrix (n_samples, n_features)
    
    :return tfidf_transformer_fit - fitted TfidfTransformer
    :return dtm_idf - scipy.sparse.csr.csr_matrix (n_samples, n_features) with 
    idf.
    """
    
    tfidf_transformer = TfidfTransformer(use_idf=True, sublinear_tf=False, 
                                norm=vectorizer_params[DEFAULT_NORM_KEY])
    
    tfidf_transformer_fit = tfidf_transformer.fit(count_dtm)
    
    #the document_term_matrix with idf*tf
    dtm_idf = tfidf_transformer_fit.transform(count_dtm)
        
    return tfidf_transformer_fit, dtm_idf
Example #31
# "We can see the shining sun, the bright sun.")


count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit_transform(docs)

print "Vocabulary:", count_vectorizer.vocabulary_
print "size of terms",  len(count_vectorizer.vocabulary_)

freq_term_matrix = count_vectorizer.transform(docs)
#print freq_term_matrix.todense()

from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
#print "IDF:", tfidf.idf_

tf_idf_matrix = tfidf.transform(freq_term_matrix)
print len(tf_idf_matrix.todense())

#print tf_idf_matrix.todense()[1,2]
#print type(count_vectorizer.vocabulary_.keys()[0])
#print count_vectorizer.vocabulary_.keys()[0].encode('ascii','replace')

#+++++++++++++++++++numpy to json, too slow
# results=[]
# for index in range(0, len(keys)-550):
#     result = {}
#     result["business_id"]=keys[index]
#     for i in range(0,len(count_vectorizer.vocabulary_.keys())-35800):
Example #32
def PredictionScoreLeaveOneOut(X, y, limit, columnName):
    from sklearn.metrics import f1_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.svm import SVC, LinearSVC
    import matplotlib.pyplot as plt

    names = [
        "Linear SVM", "Nearest Neighbors", "RBF SVM", "Decision Tree",
        "Random Forest", "AdaBoost", "Naive Bayes"
    ]
    # names = ["Linear SVM","Linear SVM","Linear SVM","Linear SVM"]

    classifiers = [
        SVC(kernel="linear", C=0.025, probability=True),
        KNeighborsClassifier(3),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB()
    ]

    outFile = open('output.txt', 'a')

    vec = DictVectorizer()

    for name, clf in zip(names, classifiers):
        try:
            accuracy = 0.0
            count = 0.0
            total_accuracy = 0.0
            total_f1 = 0.0
            total_precision = 0.0
            total_recall = 0.0

            count = 1.0

            from sklearn.model_selection import LeaveOneOut
            loo = LeaveOneOut()
            loo.get_n_splits(X)

            # print(loo)
            y_test_all = []
            y_pred_all = []
            accuracy_total = 0
            count = 0
            for train_index, test_index in loo.split(X):
                # print("TRAIN:", train_index, "TEST:", test_index)
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                from sklearn.feature_extraction.text import CountVectorizer
                count_vect = CountVectorizer()
                X_train_fit = count_vect.fit(X_train)
                X_train_counts = X_train_fit.transform(X_train)
                X_test_counts = X_train_fit.transform(X_test)

                from sklearn.feature_extraction.text import TfidfTransformer
                tfidf_transformer = TfidfTransformer()
                fit = tfidf_transformer.fit(X_train_counts)
                X_train_tfidf = fit.transform(X_train_counts)
                X_test_tfidf = fit.transform(X_test_counts)

                X_train_counts = X_train_tfidf
                X_test_counts = X_test_tfidf
                try:
                    clf.fit(X_train_counts.toarray(), y_train)
                    accuracy_total += clf.score(X_test_counts.toarray(),
                                                y_test)
                    count += 1
                    y_pred = clf.predict(X_test_counts.toarray())
                    #
                    # binary_predictions = [x if x == 'good' else 0 for x in y_pred]
                    # binary_predictions = [x if x == 0 else 1 for x in binary_predictions]
                    #
                    # binary_labels = [x if x == 'good' else 0 for x in y_test]
                    # binary_labels = [x if x == 0 else 1 for x in binary_labels]
                    y_pred_all.append(y_pred[0])
                    y_test_all.append(y_test[0])

                except BaseException as b:
                    print(b)

            f1 = f1_score(y_test_all, y_pred_all, average='weighted')
            precision = precision_score(y_test_all,
                                        y_pred_all,
                                        average='weighted')
            recall = recall_score(y_test_all, y_pred_all, average='weighted')

            print(
                str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" +
                str(accuracy_total / count) + "\t" + str(f1) + "\t" +
                str(precision) + "\t" + str(recall))
            outFile.write(
                str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" +
                str(accuracy_total / count) + "\t" + str(f1) + "\t" +
                str(precision) + "\t" + str(recall) + "\n")
            # acc, f1,prc,rec = classify(clf,X_train,X_test,y_train,y_test)
            #
            # total_accuracy +=acc
            # total_f1 += f1
            # total_precision += prc
            # total_recall += rec

        except BaseException as b:
            print(b)
    outFile.close()
    if labels[i] == 'ham':
        labels[i] = 0
        ham_index.append(i)
    elif labels[i] == 'spam':
        labels[i] = 1
        spam_index.append(i)
    else:
        print('UNIDENTIFIED LABEL AT INDEX: ' + str(i))

#count the occurrence of each word
CV = CountVectorizer()
TF = TfidfTransformer()
#fit and transform the features
features_count = CV.fit_transform(
    features)  #contains a count of each word in each sms
TF.fit(features_count)
features_tfidf = TF.transform(features_count)  #TfIdf representation

#convert to dense arrays in order to separate into test and training data and perform feature selection
features_count = features_count.toarray()
features_tfidf = features_tfidf.toarray()

#separate into training and testing data
num = round(rows / 20)
spam_num = len(spam_index)
ham_num = len(ham_index)
#select 10% of the rows for testing
random_ham_index = random.sample(range(0, ham_num), num)
random_spam_index = random.sample(range(0, spam_num), num)

hams = [ham_index[i] for i in random_ham_index]
Example #34
    with open(output, 'wb') as fd:
        pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
    pass


os.makedirs(sys.argv[2], exist_ok=True)

# Generate train feature matrix
df_train = get_df(train_input)
train_words = np.array(df_train.text.str.lower().values.astype('U'))

bag_of_words = CountVectorizer(stop_words='english',
                               max_features=max_features,
                               ngram_range=(1, ngrams))

bag_of_words.fit(train_words)
train_words_binary_matrix = bag_of_words.transform(train_words)
tfidf = TfidfTransformer(smooth_idf=False)
tfidf.fit(train_words_binary_matrix)
train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)

save_matrix(df_train, train_words_tfidf_matrix, train_output)

# Generate test feature matrix
df_test = get_df(test_input)
test_words = np.array(df_test.text.str.lower().values.astype('U'))
test_words_binary_matrix = bag_of_words.transform(test_words)
test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)

save_matrix(df_test, test_words_tfidf_matrix, test_output)
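The script above persists only the transformed matrices; if the same featurization has to be applied to data that arrives later, the fitted bag_of_words and tfidf objects can be pickled as well (the file name is illustrative):

import pickle

with open('featurizer.pkl', 'wb') as fd:
    # persist the fitted vectorizer and transformer alongside the matrices
    pickle.dump({'bag_of_words': bag_of_words, 'tfidf': tfidf}, fd, pickle.HIGHEST_PROTOCOL)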
Example #35
    'OtherP'
]

X = data[features[0]]
Y = data['ope']
print "X", len(X)  #(48701, 55)
print "Y", len(Y)  #(48701, 1)
topic_train = X
''' Vectorizing X '''
vect = CountVectorizer()
vect.fit(topic_train)
topic_train_dtm = vect.transform(topic_train)
print "topic train dym shape", topic_train_dtm.shape
''' TFIDF Transform '''
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(topic_train_dtm)
topic_train_tfidf_dtm = tfidf.transform(topic_train_dtm)
print "topic_train_tfidf_dtm size", topic_train_tfidf_dtm.shape  #(48701, 96459)
''' Getting the LIWC features '''
lwic = data[features[1:]]
print "lwic.shape : ", lwic.shape
lwic_array = lwic.as_matrix()
lwic_array_numpy = np.array(lwic_array)  #(48701, 54)
print "lwic_array_numpy.shape : ", lwic_array_numpy.shape
''' Only TOPICS '''
X_matrix = topic_train_tfidf_dtm
print "X_matrix shape : ", X_matrix.shape  #(48701, 96513)

Y_matrix = np.array(Y).reshape(len(Y), 1)
print "Y_matrix shape : ", Y_matrix.shape  #(48701, 1)
Example #36
    def train(self,
              train_data_file,
              test_data_file,
              chain_tgt_fields,
              metadata_file=None):
        print('start train')
        raw_train = pickle.load(open(train_data_file, 'rb'))
        raw_test = pickle.load(open(test_data_file, 'rb'))

        print('get count matrix')
        train_x_count_matrix = get_count_matrix(raw_train['x'],
                                                raw_train['src_size'])
        test_x_count_matrix = get_count_matrix(raw_test['x'],
                                               raw_train['src_size'])

        classification_model_num = len(raw_train['y'])
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(train_x_count_matrix)

        train_x = tfidf_transformer.transform(train_x_count_matrix)
        test_x = tfidf_transformer.transform(test_x_count_matrix)

        # train_y = binarize_label(raw_train['y'], raw_train['tgt_size'])
        # test_y = binarize_label(raw_test['y'], raw_train['tgt_size'])

        chains = []
        choose_feature_list = []
        train_chain_pred_history = sp.csr_matrix([])
        feature_num = 20000
        model_type = 'svm'
        for i in range(classification_model_num):
            chains.append(self.get_model(model_type))

        chain_train_x = train_x
        print(chain_train_x.shape)
        for idx, tgt_field in enumerate(chain_tgt_fields):
            print(tgt_field)
            # chain.fit(chain_train_x, raw_train['y'][tgt_field])
            # train_pred = chain.predict(chain_train_x)
            # train_pred_pro = sp.csr_matrix(chain.predict_proba(chain_train_x))

            choose_feature = self.feature_slect(train_x,
                                                raw_train['y'][tgt_field],
                                                feature_num)
            if train_chain_pred_history.shape[1] != 0:
                chain_train_x = sp.hstack(
                    [train_x[:, choose_feature], train_chain_pred_history])
            else:
                chain_train_x = train_x[:, choose_feature]
            choose_feature_list.append(choose_feature)
            print(chain_train_x.shape)
            chains[idx], train_pred_pro = self.get_fit(
                chains[idx],
                model_type,
                chain_train_x,
                raw_train['y'][tgt_field],
            )
            if train_chain_pred_history.shape[1] != 0:
                train_chain_pred_history = sp.hstack(
                    [train_chain_pred_history, train_pred_pro])
            else:
                train_chain_pred_history = train_pred_pro
            # chain_train_x = sp.hstack([chain_train_x, train_pred_pro])
            # chain_train_x = sp.csr_matrix(chain_train_x)
            # print(train_pred)
        self.chains = chains
        self.chain_tgt_fields = chain_tgt_fields
        metadata = {'tfidf': tfidf_transformer, 'model': self}
        pickle.dump(metadata, open(metadata_file, 'wb'))

        chain_test_x = test_x
        test_chain_pred_history = sp.csr_matrix([])
        for chain, tgt_field, choose_feature in zip(chains, chain_tgt_fields,
                                                    choose_feature_list):
            # preds = chain.predict(chain_test_x)
            # test_pred_pro = sp.csr_matrix(chain.predict_proba(chain_test_x))
            if test_chain_pred_history.shape[1] != 0:
                chain_test_x = sp.hstack(
                    [test_x[:, choose_feature], test_chain_pred_history])
            else:
                chain_test_x = test_x[:, choose_feature]
            test_pred_pro, test_pred = self.get_predict(
                chain, model_type, chain_test_x)

            if test_chain_pred_history.shape[1] != 0:
                test_chain_pred_history = sp.hstack(
                    [test_chain_pred_history, test_pred_pro])
            else:
                test_chain_pred_history = test_pred_pro
            # chain_test_x = sp.hstack([chain_test_x, test_pred_pro])
            # chain_test_x = sp.csr_matrix(chain_test_x)

            print(tgt_field)
            print(
                metrics.classification_report(raw_test['y'][tgt_field],
                                              test_pred,
                                              digits=4))
Example #37
def parse_group(group):
    group_id = '-' + group
    offset = 0
    all_posts = []

    r = requests.get(
        'https://api.vk.com/method/wall.get',
        params={
            'owner_id': group_id,
            'offset': offset,
            'count': 10,
            'access_token':
            'd933e827d933e827d933e82762d95bd7acdd933d933e827857a5be3f0d490a5fdc7bfbe',
            'v': '5.95'
        })
    posts = r.json()['response']['items']
    all_posts.extend(posts)

    data_posts = []
    likes_response = []
    all_likes = []

    for p in all_posts:
        data_posts.append(get_data(p))
        r = requests.get(
            'https://api.vk.com/method/likes.getList',
            params={
                'owner_id': group_id,
                'offset': offset,
                'type': 'post',
                'item_id': p['id'],
                'filter': 'likes',
                'friends_only': 0,
                'extended': 1,
                'count': p['likes']['count'],
                'access_token':
                'd933e827d933e827d933e82762d95bd7acdd933d933e827857a5be3f0d490a5fdc7bfbe',
                'v': '5.95'
            })
        likes_response.extend(r.json()['response']['items'])

    for like_response in likes_response:
        like = Like(group_id, like_response['id'], like_response['type'],
                    like_response['first_name'], like_response['last_name'])
        all_likes.append(like)
    write_likes_json(all_likes, group_id)

    write_posts_json(data_posts, group_id)
    my_stop_words = get_stop_words('ru')

    vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=my_stop_words)
    X = vectorizer.fit_transform([data_post.text for data_post in data_posts])
    idf = vectorizer.idf_

    #***************

    cv = CountVectorizer(max_df=0.85,
                         stop_words=my_stop_words,
                         max_features=10000)
    word_count_vector = cv.fit_transform(
        [data_post.text for data_post in data_posts])
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)
    feature_names = cv.get_feature_names()
    #all keywords
    keywords = []
    morph = pymorphy2.MorphAnalyzer()

    #generate tf-idf for the given document
    for data_post in data_posts:
        tf_idf_vector = tfidf_transformer.transform(
            cv.transform([data_post.text]))

        #sort the tf-idf vectors by descending order of scores
        sorted_items = sort_coo(tf_idf_vector.tocoo())

        #extract only the top n; n here is 1
        results = extract_topn_from_vector(feature_names, sorted_items, 1)
        result = ''
        if results:
            result = next(iter(results))
        if result != '' and not result.isdigit():
            result = morph.parse(result)[0].normal_form
        if len(result) > 2:
            keyword = KeyWord(data_post.id, result, 1)
            keywords.append(keyword)
    return data_posts, keywords
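parse_group depends on sort_coo and extract_topn_from_vector; sort_coo appears in a later snippet, but extract_topn_from_vector is never shown. A hedged reconstruction based on how its result is used as a keyword-to-score mapping (the real helper may differ):

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Hypothetical helper: map the top-n features to their tf-idf scores."""
    results = {}
    for idx, score in sorted_items[:topn]:
        results[feature_names[idx]] = round(score, 3)
    return results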
Example #38
print("Using the quora data set...")
print("Total data set size" , len(test.question1))
print("Training data set size" , len(test.question1)//18)

for i in range(len(test.question1)//18):
    train_set.append(test.question1[i])
    train_set.append(test.question2[i])

vectorizer = CountVectorizer(stop_words = stopWords)
transformer = TfidfTransformer()

print("Count vector is built")

trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
transformer.fit(trainVectorizerArray)

print("Tf-Idf vector is learnt from count vector")

output = []
test_set = []
for i in range(10000):
    test_set = []
    test_set.append(test.question1[i])
    test_set.append(test.question2[i])
    testVectorizerArray = vectorizer.transform(test_set).toarray()
    tfidf = transformer.transform(testVectorizerArray)
    similarity = 1 - spatial.distance.cosine((tfidf.todense())[0][0], (tfidf.todense())[1][0])
    output.append([similarity, test.is_duplicate[i]])

for i in range(1,51):
Example #39
    scaler = StandardScaler()
    tfidf = TfidfTransformer(norm=None)
    dense = Data_Utils.DenseTransformer()

    for train, test in skf.split(CU_X, Y):
        #train split
        CU_train_data = CU_X[train]
        train_labels = Y[train]
        
        #test split
        CU_eval_data = CU_X[test]
        eval_labels = Y[test]

        # tf-idf
        tfidf.fit(CU_train_data)
        CU_train_data = dense.transform(tfidf.transform(CU_train_data))
        CU_eval_data = dense.transform(tfidf.transform(CU_eval_data))
        
        # standardization
        scaler.fit(CU_train_data)
        CU_train_data = scaler.transform(CU_train_data)
        CU_eval_data = scaler.transform(CU_eval_data)

        # normalization
        CU_train_data = normalize(CU_train_data)
        CU_eval_data = normalize(CU_eval_data)

        train_data =  CU_train_data
        eval_data = CU_eval_data
Example #40
def query(Category, category_obj):
    base_url = 'http://export.arxiv.org/api/query?'
    search_query = Category
    query = 'search_query=%s&max_results=30&sortBy=submittedDate&sortOrder=descending' % (
        search_query)
    feedparser._FeedParserMixin.namespaces[
        'http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
    feedparser._FeedParserMixin.namespaces[
        'http://arxiv.org/schemas/atom'] = 'arxiv'
    with libreq.urlopen(base_url + query) as url:
        response = url.read()
    feed = feedparser.parse(response)
    #date = the_date
    corpus_entry = []
    exists = False
    count = 0
    all_dates = []
    for entry in feed.entries:
        # print (entry.title + " " + entry.published + "\n")
        #if entry.published[0:10] == date:
        print('entry published ', entry.published, type(entry.published))
        date_ = entry.published[:10]
        date = datetime.strptime(date_, '%Y-%m-%d').date()
        all_dates.append(date)
        corpus_entry.append(entry)
        #xists = True
    # If date does not exist just returns most recent articles
    # if exists == False:
    # 	if count == 0:
    # 		count = 1
    # 		if entry.published[0:10] == date:
    # 			return False
    # 		else:
    # 			date = entry.published[0:10]
    # 	if entry.published[0:10] == date:
    # 		corpus_entry.append(entry)

    for paper in corpus_entry:
        paper.summary = prePro(paper.summary.lower())
    stop_Words = stop_words.ENGLISH_STOP_WORDS
    # Dictionary here
    corpusSumm = []
    for paper in corpus_entry:
        corpusSumm.append(paper.summary)
    cv = CountVectorizer(max_df=.85, stop_words=stop_Words)
    word_count_vector = cv.fit_transform(corpusSumm)
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    feature_names = cv.get_feature_names()
    i = 0
    d_i = 0
    for paper in corpus_entry:
        i += 1
        tf_idf_vector = tfidf_transformer.transform(
            cv.transform([paper.summary]))
        sorted_items = sort_coo(tf_idf_vector.tocoo())
        keywords = extract_topn_from_vector(feature_names, sorted_items)
        top3_sentences = []
        top3_scores = []
        top3_breakdown = []
        for sentence in paper.summary.split("."):
            # how many scores are higher in top3 than this sentence, if all 3, delete and replace, otherwise delete lowest
            higherScores = 0
            sentenceTotal = 0
            theSentence = []
            breakdown = []
            index = 0
            # keep track of words / sentence to get average score
            word_count = 0
            for word in sentence.split(" "):
                # Add up all of the tf_idf scores
                if word.lower() in keywords:
                    sentenceTotal = sentenceTotal + keywords[word.lower()]
                    breakdown.append(keywords[word.lower()])
                # Average by word
                theSentence.append(word.lower())
                breakdown.append("0")
                word_count = word_count + 1
            sentenceTotal = sentenceTotal / word_count
            min_score = 1000
            # print (theSentence,sentenceTotal,word_count)
            # get index of min score and append if should
            if top3_sentences:
                # print (top3_scores)
                if len(top3_scores) == 3:
                    for idx, score in enumerate(top3_scores):
                        if score > sentenceTotal:
                            higherScores = higherScores + 1
                        elif score < min_score:
                            index = idx
                            min_score = score
                    if higherScores < 3:
                        del top3_sentences[index]
                        del top3_scores[index]
                        top3_sentences.append(sentence)
                        top3_scores.append(sentenceTotal)
                        top3_breakdown.append(breakdown)

                else:
                    top3_sentences.append(sentence)
                    top3_scores.append(sentenceTotal)
                    top3_breakdown.append(breakdown)
            else:
                top3_sentences.append(sentence)
                top3_scores.append(sentenceTotal)
                top3_breakdown.append(breakdown)

        three_sentences = {}
        k = 0
        for sentence in top3_sentences:
            # if sentence == '':
            # 	print(sentence)
            # else:
            # 	if sentence[0] == ' ':
            # 		print(sentence[1:] + "\n")
            # 	else:
            # 		print(sentence + "\n")
            three_sentences[k] = {"sentence": sentence}
            k += 1

        obj, created = Articles.objects.get_or_create(link=paper.link,
                                                      defaults={
                                                          'title': paper.title,
                                                          'sentence':
                                                          three_sentences,
                                                          'category':
                                                          category_obj,
                                                          'date':
                                                          all_dates[d_i]
                                                      })
        print('obj created ', created, ' date', all_dates[d_i])
        d_i += 1
Example #41
top3_words = get_top_three_words(dt['text'], n=20)

top3_df = pd.DataFrame(top3_words)

top3_df.columns = ["Tri-gram", "Freq"]

print(top3_df)

# Barplot of the most frequent tri-grams
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# read test docs into a dataframe and concatenate title and body
df_test = dataframe[15000:]
df_test['text'] = df_test['Title'] + df_test['Body']
df_test['text'] = df_test['text'].apply(lambda x: cleanse_text(x))

# get test docs into a list
docs_test = df_test['text'].tolist()


def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def apply_tf_idf(X):
    transformer = TfidfTransformer(smooth_idf=True,
                                   sublinear_tf=True,
                                   use_idf=True)
    transformer.fit(X)
    return transformer
Example #43
def PredictionScoreLeaveOneOutSpecifyClassifier(X, y, limit, columnName,
                                                classifierNames, classifiers):
    from sklearn.metrics import f1_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.svm import SVC, LinearSVC
    import matplotlib.pyplot as plt

    names = classifierNames

    outFile = open('output.txt', 'a')

    vec = DictVectorizer()

    for name, clf in zip(names, classifiers):
        try:
            accuracy = 0.0
            count = 0.0
            total_accuracy = 0.0
            total_f1 = 0.0
            total_precision = 0.0
            total_recall = 0.0

            count = 1.0

            from sklearn.model_selection import LeaveOneOut
            loo = LeaveOneOut()
            loo.get_n_splits(X)

            # print(loo)
            y_test_all = []
            y_pred_all = []
            accuracy_total = 0
            count = 0
            for train_index, test_index in loo.split(X):
                # print("TRAIN:", train_index, "TEST:", test_index)
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                from sklearn.feature_extraction.text import CountVectorizer
                count_vect = CountVectorizer()
                X_train_fit = count_vect.fit(X_train)
                X_train_counts = X_train_fit.transform(X_train)
                X_test_counts = X_train_fit.transform(X_test)
                #
                from sklearn.feature_extraction.text import TfidfTransformer
                tfidf_transformer = TfidfTransformer()
                fit = tfidf_transformer.fit(X_train_counts)
                X_train_tfidf = fit.transform(X_train_counts)
                X_test_tfidf = fit.transform(X_test_counts)

                X_train_counts = X_train_tfidf
                X_test_counts = X_test_tfidf
                try:
                    clf.fit(X_train_counts.toarray(), y_train)
                    accuracy_total += clf.score(X_test_counts.toarray(),
                                                y_test)
                    count += 1
                    y_pred = clf.predict(X_test_counts.toarray())
                    #
                    # binary_predictions = [x if x == 'good' else 0 for x in y_pred]
                    # binary_predictions = [x if x == 0 else 1 for x in binary_predictions]
                    #
                    # binary_labels = [x if x == 'good' else 0 for x in y_test]
                    # binary_labels = [x if x == 0 else 1 for x in binary_labels]
                    y_pred_all.append(y_pred[0])
                    y_test_all.append(y_test[0])

                except BaseException as b:
                    print(b)

            f1 = f1_score(y_test_all, y_pred_all, average='weighted')
            precision = precision_score(y_test_all,
                                        y_pred_all,
                                        average='weighted')
            recall = recall_score(y_test_all, y_pred_all, average='weighted')

            print(
                str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" +
                str(accuracy_total / count) + "\t" + str(f1) + "\t" +
                str(precision) + "\t" + str(recall))
            outFile.write(
                str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" +
                str(accuracy_total / count) + "\t" + str(f1) + "\t" +
                str(precision) + "\t" + str(recall) + "\n")
            # acc, f1,prc,rec = classify(clf,X_train,X_test,y_train,y_test)
            #
            # total_accuracy +=acc
            # total_f1 += f1
            # total_precision += prc
            # total_recall += rec

        except BaseException as b:
            print(b)
    outFile.close()
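A hedged usage sketch for the leave-one-out scorer above (my addition); the toy texts, labels, limit, and column name are placeholder assumptions, and the function's module-level imports (e.g. DictVectorizer) are assumed to be in place.

# Illustrative call only -- the data and parameter values below are assumptions.
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

X = np.array(["good product works fine", "terrible quality broke quickly",
              "great value for money", "awful experience do not buy",
              "works exactly as described"])
y = np.array(["good", "bad", "good", "bad", "good"])

PredictionScoreLeaveOneOutSpecifyClassifier(
    X, y,
    limit=len(X),
    columnName="review_text",
    classifierNames=["MultinomialNB", "LinearSVC"],
    classifiers=[MultinomialNB(), LinearSVC()])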
Example #44
0
print()
print('TF-IDF Embedding Using Scikit-learn')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

cv = CountVectorizer()
tf = cv.fit_transform(texts)

# Vocabulary
vocabulary = cv.get_feature_names()
print(vocabulary[:10])
print(vocabulary[-10:])

tt = TfidfTransformer()
tt.fit(tf)
tf_idf = tt.transform(tf)

print()
print('tf-idf matrix (CountVectorizer + TfidfTransformer)')
print(tf_idf.shape)
print(tf_idf.toarray())

tv = TfidfVectorizer()
tf_idf_2 = tv.fit_transform(texts)

print()
print('tf-idf matrix (TfidfVectorizer)')
print(tf_idf_2.toarray())
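
One added check (my addition, using the texts, tf_idf, and tf_idf_2 variables from the block above): with default settings, the two-step CountVectorizer + TfidfTransformer route and the one-step TfidfVectorizer route produce identical matrices, since both share the same tokenization, idf smoothing, and l2 normalization.

# Sanity check (not in the original snippet).
import numpy as np
assert np.allclose(tf_idf.toarray(), tf_idf_2.toarray())
print('two-step and one-step tf-idf matrices match')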
"""import random
len_query = 5
def tfidf_process(data):
    from sklearn.feature_extraction.text import TfidfTransformer
    transformer = TfidfTransformer()
    transformer = transformer.fit(data)
    return transformer
Example #46
0
    dataFrame.at[i, "AuthorName"] = dataFrame_final.at[i, "AuthorName"]
    dataFrame = dataFrame.reindex(np.random.permutation(dataFrame.index))
#########################

########################
print("\n\n", "::::::>>>>>>> NAIVE <<<<<<<::::::")
# Multinomial Naive Bayes with CountVectorizer + TfidfTransformer features
X_train, X_test, y_train, y_test = train_test_split(dataFrame["Content"],
                                                    dataFrame["AuthorName"],
                                                    test_size=0.2,
                                                    random_state=1)
count_vect = CountVectorizer()
fiter = count_vect.fit(X_train)
X_train_counts = fiter.transform(X_train)
tfidf_transformer = TfidfTransformer()
fiter2 = tfidf_transformer.fit(X_train_counts)
X_train_tfidf = fiter2.transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

print(">>> Naive Score:")
print(clf.score(fiter2.transform(fiter.transform(X_test)), y_test))

#Cross Validation

X = dataFrame['Content']
y = dataFrame['AuthorName']
scores1 = cross_val_score(clf, fiter2.transform(fiter.transform(X)), y, cv=10)
print(">>> Cross Validation Score:")
print(scores1)
########################
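As a sketch of an alternative layout (my addition, assuming the same dataFrame with "Content" and "AuthorName" columns), chaining the steps in a scikit-learn Pipeline lets cross_val_score refit the CountVectorizer and TfidfTransformer on each training fold instead of reusing transformers fitted once on the training split.

# Alternative sketch, not the original code.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('counts', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('nb', MultinomialNB()),
])
pipeline_scores = cross_val_score(text_clf, dataFrame['Content'],
                                  dataFrame['AuthorName'], cv=10)
print(">>> Pipeline Cross Validation Score:")
print(pipeline_scores)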
Example #47
0
def generateLazyLoadForModel2(useIntegrated,
                              category,
                              platform,
                              uniqueFileConvention,
                              dataFileConvention,
                              test_size=defaultTestSize):
    if os.path.isfile('data/' + category + '/' + dataFileConvention +
                      '_dataset.csv'):
        print(dataFileConvention + '_dataset.csv' +
              ' already generated only shuffling right now')
        with open(
                'data/' + category + '/' + dataFileConvention + '_dataset.csv',
                'r') as f:
            l = list(csv.reader(f))
            firstHeadersRow = l[0]
            l = l[1:]
        random.shuffle(l)
        with open(
                'data/' + category + '/' + dataFileConvention + '_dataset.csv',
                'w') as f:
            csv.writer(f).writerows([firstHeadersRow] + l)
        return
    print(dataFileConvention + '_dataset.csv' + ' not found')
    print(dataFileConvention + '_dataset.csv' + ' generating')
    probs = []
    if len(generateLazyLoadForModel2.probs) == 0:
        generateLazyLoadForModel2.probs = get_all_probs_without_category_NA(
            useIntegrated, platform)
    probs = generateLazyLoadForModel2.probs

    random.shuffle(probs)
    train_set = tuple([prob.modified_description for prob in probs])

    prob_class = []
    for prob in probs:
        prob_class.append(1.0 if category in prob.category else 0.0)

    print 'Test Size: ' + str((test_size))
    print 'Total: ' + str((len(probs)))
    print 'Train Set Length: ' + str(len(train_set))

    timeStart = time.time()

    if os.path.isfile(PlatformType.platformString[platform] + '_tfidfMatrix_' +
                      '.pickle'):
        print('Loading tfidf matrix from pickle')
        with open(
                PlatformType.platformString[platform] + '_tfidfMatrix_' +
                '.pickle', 'rb') as f:
            tf_idf_matrix = pickle.load(f)
    else:
        print('Building tfidf matrix and dumping in pickle')
        count_vectorizer = CountVectorizer(stop_words='english')
        count_vectorizer.fit_transform(train_set)
        freq_term_matrix = count_vectorizer.transform(train_set)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        tf_idf_matrix = tfidf.transform(freq_term_matrix)
        with open(
                PlatformType.platformString[platform] + '_tfidfMatrix_' +
                '.pickle', 'wb') as f:
            pickle.dump(tf_idf_matrix, f)

    # print(str(tf_idf_matrix))
    numpyAr = tf_idf_matrix.toarray()
    np.set_printoptions(threshold='nan')
    print 'Tfidf Feature Size: ' + str(len(numpyAr[0]) + 1)
    list_of_features_needed = []
    keepPercentage = 0.05

    if os.path.isfile(PlatformType.platformString[platform] +
                      '_list_of_features_aftertfidf_' + str(keepPercentage) +
                      '.pickle'):
        print('List of features after tfidf already found, reading from there')
        with open(
                PlatformType.platformString[platform] +
                '_list_of_features_aftertfidf_' + str(keepPercentage) +
                '.pickle', 'rb') as f:
            list_of_features_needed = pickle.load(f)
    else:
        print('Making List of features after tfidf and dumping as pickle')
        for cat in categories:
            list_of_features_needed = list_of_features_needed + get_categorywise_features(
                numpyAr, cat, probs, keepPercentage)

        list_of_features_needed = list(set(list_of_features_needed))
        list_of_features_needed.sort()
        with open(
                PlatformType.platformString[platform] +
                '_list_of_features_aftertfidf_' + str(keepPercentage) +
                '.pickle', 'wb') as f:
            pickle.dump(list_of_features_needed, f)

    # print(str(list_of_features_needed))
    numpyArrayList = []
    for i in range(len(numpyAr)):
        newRowList = []
        for j in list_of_features_needed:
            newRowList.append(numpyAr[i][j])
        numpyArrayList = numpyArrayList + [np.array(newRowList)]
    numpyAr = np.array(numpyArrayList)
    print(numpyAr.shape)

    # Applying SVD to reduce features
    # svd = TruncatedSVD(int(0.1*len(numpyAr[0] + 1)))
    # lsa = make_pipeline(svd, Normalizer(copy = False))
    # reduced_lsa_features = lsa.fit_transform(tf_idf_matrix)
    # numpyAr = reduced_lsa_features
    #
    print 'Reduced Feature Size: ' + str(len(numpyAr[0]) + 1)
    print('Time taken for generating data: ' + str(time.time() - timeStart))
    if not os.path.exists('data/' + category):
        os.makedirs('data/' + category)
    print('Currently on cat: ' + str(category))
    with open('data/' + category + '/' + dataFileConvention + '_dataset.csv',
              'w') as f:
        print('Writing ' + 'data/' + category + '/' + dataFileConvention +
              '_dataset.csv')
        writer = csv.writer(f)
        i = 0
        for row in numpyAr:
            writer.writerow(list(row) + [prob_class[i]])
            i += 1
Example #48
0
    post = " ".join(post)
    return post


my_data['processed_text'] = my_data.title + my_data.body
my_data['processed_text'] = my_data.processed_text.apply(text_processing)

doc = my_data['processed_text'].tolist()  # list from the "processed_text" column
cv = CountVectorizer(analyzer='word',
                     stop_words='english',
                     max_df=0.85,
                     ngram_range=(1, 2))
word_count = cv.fit_transform(my_data['processed_text'])

tfidf_transf = TfidfTransformer()
tfidf_transf.fit(word_count)
tf_idf_matrix = tfidf_transf.transform(word_count).toarray()

dict_sent = {}
for doc_nr in range(0, len(tf_idf_matrix)):
    df_tfidf = pd.DataFrame(tf_idf_matrix[doc_nr],
                            index=cv.get_feature_names(),
                            columns=['tf_idf_scores'])
    df_tfidf = df_tfidf.sort_values(by=['tf_idf_scores'], ascending=False).head(3)
    dict_sent[doc_nr] = df_tfidf.index.values.tolist()

df_results = pd.DataFrame(columns=['Title', 'Top_3_topics'])
df_results['Title'] = my_data['title'].values
df_results['index'] = df_results.reset_index().index
df_results['Top_3_topics'] = df_results['index'].map(dict_sent)
Example #49
0
def tf_idf(X):
    """Fit a TfidfTransformer on the count matrix X and return the tf-idf weighted matrix."""
    tfidfTransformer = TfidfTransformer(use_idf=True)
    tfidfTransformer.fit(X)
    X = tfidfTransformer.transform(X, copy=True)
    return X
Example #50
0
documents = [data[d]['plot'] for d in range(l)]
titles = [data[d]['title'] for d in range(l)]

LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(documents)
stemmer = nltk.stem.porter.PorterStemmer()

# print LemVectorizer.vocabulary_

tf_matrix = LemVectorizer.transform(documents).toarray()
# print tf_matrix
# print tf_matrix.shape

from sklearn.feature_extraction.text import TfidfTransformer
tfidfTran = TfidfTransformer(norm="l2")
tfidfTran.fit(tf_matrix)
# print tfidfTran.idf_

import math


def idf(n, df):
    result = math.log((n + 1.0) / (df + 1.0)) + 1
    return result


# print "The idf for terms that appear in one document: " + str(idf(4,1))
# print "The idf for terms that appear in two documents: " + str(idf(4,2))

tfidf_matrix = tfidfTran.transform(tf_matrix)
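
The hand-rolled idf() above matches what TfidfTransformer computes with its default smooth_idf=True: idf = ln((n + 1) / (df + 1)) + 1. A short verification sketch (my addition, using an assumed toy count matrix):

# Verification sketch, not part of the original example.
import numpy as np

counts = np.array([[3, 0, 1],
                   [2, 0, 0],
                   [3, 0, 0],
                   [4, 0, 0]])
n_docs = counts.shape[0]
doc_freq = (counts > 0).sum(axis=0)

manual_idf = np.array([idf(n_docs, df) for df in doc_freq])
learned_idf = TfidfTransformer(norm="l2").fit(counts).idf_
assert np.allclose(manual_idf, learned_idf)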
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range = (1,3),
    max_df = 0.9261187281287935,
    min_df = 4
)
corpus_data_fitted = vectorizer.fit(X_train_full)
# save the fitted vocabulary for future predictions
pickle.dump(corpus_data_fitted.vocabulary_,
            open('models/vocabulary.p', 'wb'))
corpus_data_features = corpus_data_fitted.transform(X_train_full)


# Transform count matrix to a normalized tf-idf representation
tfidf_transformer = TfidfTransformer()
corpus_data_tfidf_fitted = tfidf_transformer.fit(corpus_data_features)
# save the fitted TfidfTransformer for future predictions
pickle.dump(corpus_data_tfidf_fitted,
            open('models/corpus_data_tfidf_fitted.p', 'wb'))
corpus_data_features_tfidf = corpus_data_tfidf_fitted\
                              .transform(corpus_data_features)


# TRAIN CLASSIFIER
print("Training Logistic Regression classifier... "
      "This takes around 10 minutes!")
clf_logreg = LogisticRegression(max_iter=100, n_jobs=-1, C=3.41)\
              .fit(corpus_data_features_tfidf, y_train_full)


# SAVE MODEL
print("Saving the trained classifier in models/ folder...")
Example #52
0
class SklearnGridSearch:
    """
    Run grid search with selected sklearn classifiers
    """
    grid = {
        "RandomForestClassifier" : {
            "criterion": ["gini", "entropy"],
            "max_depth": [5, 10, 20, 40, 100, 200],
            "min_samples_leaf": [1, 2, 4, 8]
        },
        "MLPClassifier" : {
            "activation": ["logistic", "relu"],
            "hidden_layer_sizes": [(10,), (20, ), (50, )]
        }
    }

    def __init__(self, classifier_type: str, parameters: Dict, verbose=False):
        """
        Initialize the classifier
        :param classifier_type: The name of the classifier
        :param parameters: Parameter settings to check
        :param verbose: verbosity true or false
        """
        self.verbose = verbose

        if classifier_type not in self.grid:
            raise Exception("Unsupported classifier type {0}. Use one of {1}".format(classifier_type, self.grid.keys()))

        if classifier_type == "RandomForestClassifier":
            self.sklearn_classifier = RandomForestClassifier()
        elif classifier_type == "MLPClassifier":
            self.sklearn_classifier = MLPClassifier()

        self.parameters = parameters

        self.classifier_type = classifier_type
        self.count_vectorizer = CountVectorizer(min_df=10, max_df=0.8, ngram_range=(1, 1))
        self.tfidf_transformer = TfidfTransformer(use_idf=True)

    def grid_search(self, training_data: str, text_label: str, class_label: str) -> Dict:
        """
        Run the grid search over the configured parameter grid
        :param training_data: File name. Training data is one json per line
        :param text_label: Json field which contains the text
        :param class_label: Json field which contains the label for the classes to train
        :return: The best parameter combination found by GridSearchCV
        """

        data_train = create_data_table_from_training_file(training_data, text_label, class_label, 10000)
        print("INFO: grid evaluation with {0} data points".format(len(data_train)))
        data_train = data_train.fillna(0)

        matrix_train_counts = self.count_vectorizer.fit_transform(data_train.text)
        self.tfidf_transformer = self.tfidf_transformer.fit(matrix_train_counts)
        matrix_train_tf = self.tfidf_transformer.transform(matrix_train_counts)
        matrix_train_tf = matrix_train_tf.toarray()

        grid_search = GridSearchCV(self.sklearn_classifier, self.parameters, n_jobs=10)
        grid_search.fit(matrix_train_tf, data_train.label)
        print(grid_search.best_params_)
        return grid_search.best_params_
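A hedged usage sketch for the class above (my addition); the training file path and JSON field names are placeholders, and the parameter grid is pulled from the class-level grid attribute.

# Illustrative only -- 'train.jsonl', 'text', and 'label' are placeholder names.
searcher = SklearnGridSearch(
    classifier_type="RandomForestClassifier",
    parameters=SklearnGridSearch.grid["RandomForestClassifier"],
    verbose=True)
best_params = searcher.grid_search(training_data="train.jsonl",
                                   text_label="text",
                                   class_label="label")
print(best_params)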
Example #53
0
qualified_descriptions = getDescriptionsFromSheet(qualified_sheet)
qualified_clean_descriptions = cleanUp(qualified_descriptions)
disqualified_sheet = getSheet('input/disqualified.xlsx')
disqualified_descriptions = getDescriptionsFromSheet(disqualified_sheet)
disqualified_clean_descriptions = cleanUp(disqualified_descriptions, True)

X = qualified_clean_descriptions + disqualified_clean_descriptions
Y = createLabelArray(qualified_clean_descriptions,
                     disqualified_clean_descriptions)
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=42)
vectorizer.fit(X_train)
X_train, X_test = transformByVectorizer(X_train, X_test, vectorizer)
tfidf_transformer.fit(X_train)
X_train, X_test = transformByTfIdf(X_train, X_test, tfidf_transformer)

# gnb, gnb_score=runGaussianNB(X_train, X_test, Y_train, Y_test)
forest_score, forest = runForest(X_train, X_test, Y_train, Y_test)

print 'Random Forest score: ', forest_score
# print 'Random Gaussian Naive Bayes score: ', gnb_score

with open('forest', 'wb') as f:
    cPickle.dump(forest, f)

# with open('gnb', 'wb') as fi:
#    cPickle.dump(gnb, fi)

with open('vectorizer', 'wb') as file:
    cPickle.dump(vectorizer, file)
Example #54
0
# In[13]:


from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer 


# In[14]:


count_v1 = CountVectorizer(max_df=0.4, max_features=3000)  # only 3000 features are kept because of memory constraints
counts_train = count_v1.fit_transform(train_data_list)
print("the shape of train is " + repr(counts_train.shape))
tfidftransformer = TfidfTransformer()
tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train)
tfidf_ndarray = tfidf_train.toarray() 


# ### K-Means clustering

# In[15]:


from sklearn.cluster import KMeans


# In[16]:


kmeans = KMeans(n_clusters=10, random_state=0).fit(tfidf_ndarray)
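
One note on the block above (my addition): scikit-learn's KMeans also accepts the sparse tf-idf matrix directly, so the dense toarray() copy is optional when memory is tight; a minimal variant sketch, reusing tfidf_train from above:

# Variant sketch: cluster the sparse tf-idf matrix without densifying it.
kmeans_sparse = KMeans(n_clusters=10, random_state=0).fit(tfidf_train)
print(kmeans_sparse.labels_[:20])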
Example #55
0
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert_equal(counts_test[0, vocabulary["salad"]], 1)
        assert_equal(counts_test[0, vocabulary["tomato"]], 1)
        assert_equal(counts_test[0, vocabulary["water"]], 1)

        # stop word from the fixed list
        assert_false("the" in vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words or extraction
        # artifacts)
        assert_false("copyright" in vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, vocabulary["coke"]], 0)
        assert_equal(counts_test[0, vocabulary["burger"]], 0)
        assert_equal(counts_test[0, vocabulary["beer"]], 0)
        assert_equal(counts_test[0, vocabulary["pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert_equal(len(t1.idf_), len(v1.vocabulary_))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert_equal(t2.idf_, None)

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    assert_raises(ValueError, t3.transform, counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    assert_raises(ValueError, t3.transform, X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')
    assert_false(tv.fixed_vocabulary)

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    assert_equal(v3.build_preprocessor(), strip_accents_ascii)

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    assert_raises(ValueError, v3.build_preprocessor)

    # error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
    assert_raises(ValueError, v3.build_analyzer)
Example #56
0
    def createFeatureVectorsLazy(self, input_dir, label, chunks_record, flat_list=False, chunks=True):
        print("in createFeatureVectorsLazy")
        function_words_vectorizer = CountVectorizer(token_pattern=r"[a-z]*'[a-z]*|(?u)\b\w\w+\b|...|.|!|\?|\"|\'",
                                                    vocabulary=self.function_words_list)
        POS_vectorizer = CountVectorizer(ngram_range=(3, 3), vocabulary=self.POS_trigrams_list, stop_words=None,
                                         tokenizer=BinaryNLIClassifier.Tokenize, lowercase=False)
        function_word_feature_vector = []
        POS_feature_vector = []
        text = []

        for f in os.listdir(input_dir):
            with open(os.path.join(input_dir, f), 'rb') as fh:
                # print("file = {}".format(f))
                text.append(pickle.load(fh))
                # text = pickle.load(fh)
                if flat_list:
                    text = [item for sublist in text for item in sublist]
        text = [item for sublist in text for item in sublist]
        shuffle(text)
        try:
            text = [s.split(BEGIN_SENTENCE)[1] for s in text]
        except:
            print(text)
        # text = [s.split(END_SENTENCE)[0] for s in text]
        chunk = 0
        token_counter = 0
        lemmas = {}
        pos = {}
        lemmas[chunk] = []
        pos[chunk] = []
        pos_seq = ""
        lemmas_seq = ""
        for sentence in text:

            sentence = sentence.strip()
            if len(sentence) > 0 and chunks and token_counter >= CHUNK_SIZE:
                lemmas[chunk].append(lemmas_seq)
                pos[chunk].append(pos_seq)
                chunks_record.append("{}_{}".format(os.path.basename(os.path.normpath(input_dir)), chunk))
                chunk += 1
                pos[chunk] = []
                lemmas[chunk] = []
                token_counter = 0
                pos_seq = ""
                lemmas_seq = ""

            token_counter += len(sentence.split(" "))
            # print(token_counter)
            pos_seq += BEGIN_SENTENCE + " "
            for token in sentence.split(" "):
                try:
                    lemma_pos = token.split(SEPERATOR)
                    lemmas_seq += lemma_pos[0] + " "
                    pos_seq += lemma_pos[1] + " "
                    lemmas_seq += ' '

                except:
                    continue
            pos_seq += END_SENTENCE + ' '

        if not chunks:
            lemmas[chunk].append(lemmas_seq)
            pos[chunk].append(pos_seq)
            chunks_record.append("{}_{}".format(f, chunk))
            chunk += 1

        for index in range(chunk):
            if len(lemmas[index]) == 0:
                continue
            # print(index)
            function_word_feature_vector.append(function_words_vectorizer.fit_transform(lemmas[index]))
            POS_feature_vector.append(POS_vectorizer.fit_transform(pos[index]))
            with open(os.path.join(TOEFL_DEBUG, "{}_{}".format(f, index)), 'w', encoding='utf-8') as chunk_out:
                for sent in lemmas[index]:
                    # sent = (sent.split(END_SENTENCE)[0]).strip()

                    chunk_out.write(sent + "\n")
            # print(pos[index])
            # print(POS_feature_vector[index])
            # print(lemmas[index])
            # print(function_word_feature_vector[index])
            # print(len(function_word_feature_vector))

        # break
        print("chunk = {}".format(chunk))
        result_func_words = vstack(function_word_feature_vector)
        result_POS_trigrams = vstack(POS_feature_vector)
        final_feature_vector_structure = hstack([result_func_words, result_POS_trigrams], format='csr')
        # final_feature_vector_structure = result_POS_trigrams
        # final_feature_vector_structure = result_func_words
        tf_idf_transformer = TfidfTransformer()
        tf_idf_transformer.fit(final_feature_vector_structure)
        # apply the learned tf-idf weighting to the stacked feature matrix
        final_feature_vector_structure = tf_idf_transformer.transform(final_feature_vector_structure)
        labels = [label] * final_feature_vector_structure.shape[0]
        return final_feature_vector_structure, labels
Example #57
0
                   encoding='utf-8').read().split('\n')
all_text = train_texts + test_texts

print('(2) doc to var...')
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_v0 = CountVectorizer()
counts_all = count_v0.fit_transform(all_text)
count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_train = count_v1.fit_transform(train_texts)
print("the shape of train is " + repr(counts_train.shape))
count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
counts_test = count_v2.fit_transform(test_texts)
print("the shape of test is " + repr(counts_test.shape))

tfidftransformer = TfidfTransformer()
train_data = tfidftransformer.fit(counts_train).transform(counts_train)
# reuse the idf weights learned on the training set for the test set
test_data = tfidftransformer.transform(counts_test)

x_train = train_data
y_train = train_labels
x_test = test_data
y_test = test_labels

print('(3) Naive Bayes...')
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
clf = MultinomialNB(alpha=0.01)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
num = 0
preds = preds.tolist()
Example #58
0
class SequenceBagOfWordsSVMClassifier(SupervisedLearnerPrimitiveBase[Inputs,
                                                                     Outputs,
                                                                     Params]):
    """
    BBN D3M Naive Sequence Classifier

    Arguments:
        hp_seed ... 
        hp_splice ... 
    """

    __author__ = 'BBN'
    __metadata__ = {
        "common_name": "Discontinuity-based segmentation",
        "algorithm_type": ["Bayesian"],
        "original_name":
        "bbn_primitives.time_series.SequenceBagOfWordsSVMClassifier",
        "task_type": ["Modeling"],
        "learning_type": ["Supervised learning"],
        "compute_resources": {
            "sample_size": [],
            "sample_unit": [],
            "disk_per_node": [],
            "expected_running_time": [],
            "gpus_per_node": [],
            "cores_per_node": [],
            "mem_per_gpu": [],
            "mem_per_node": [],
            "num_nodes": [],
        },
        "handles_regression": False,
        "handles_classification": False,
        "handles_multiclass": False,
        "handles_multilabel": False,
    }

    def __init__(
        self,
        hp_seed: int = 0,
        hp_splice: int = 0,
    ):
        super().__init__()

        np.random.seed(hp_seed)

        self.hp_splice = hp_splice

        self.training_inputs = None
        self.training_outputs = None
        self.tfidf = None
        self.vocab = None
        self.model = None
        self.fitted = False

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self.training_inputs = inputs
        self.training_outputs = outputs
        self.fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> None:
        """
        Arguments
            - inputs: List(d3m_ndarray)
        """
        if self.fitted:
            return

        if self.training_inputs is None or self.training_outputs is None:
            raise Exception('Missing training data')

        with stopit.ThreadingTimeout(timeout) as timer:
            #            if self.hp_splice > 1:
            #                spliced_data = list()
            #                for cinput in self.training_inputs:
            #                    framer = SignalFramer(
            #                        sampling_rate = 1,
            #                        frame_length_s = self.hp_splice,
            #                        frame_shift_s = 1,
            #                        flatten_output = True,
            #                    )
            #                    cdata = frames.produce([cinput])[0]
            #                    spliced_data.append(cdata)
            #            else:
            #                spliced_data = self.training_inputs
            self.vocab = seq_vocab(self.training_inputs)
            train_x = seq_to_tokenfreq_csr(self.training_inputs, self.vocab)
            train_y = self.training_outputs

            self.tfidf = TfidfTransformer(norm='l1')
            train_x_tfid = self.tfidf.fit(train_x).transform(train_x)

            # Build a classification model
            svm = sklearn.svm.SVC(probability=True)
            svm.fit(train_x_tfid, train_y)

            self.model = svm
            self.fitted = True

        if timer.state == timer.EXECUTED:
            return
        else:
            raise TimeoutError(
                'SequenceBagOfWordsSVMClassifier exceeded time limit')

    def produce(self,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> Outputs:
        """
        Arguments:
            - inputs: List(d3m_ndarray)

        Returns:
            - List(int)
        """
        with stopit.ThreadingTimeout(timeout) as timer:
            x = seq_to_tokenfreq_csr(inputs, self.vocab)
            train_x_tfid = self.tfidf.transform(x)
            pred = self.model.predict(train_x_tfid)
            outputs = [cpred for cpred in pred]

        if timer.state == timer.EXECUTED:
            return outputs
        else:
            raise TimeoutError(
                'SequenceBagOfWordsSVMClassifier exceeded time limit')

    def get_params(self) -> Params:
        return Params(coefficient=self.model.coef_)

    def set_params(self, *, params: Params) -> None:
        self.model.coef_ = params.coefficient
Example #59
0
def Train():
    CU_X, Y = create_Preturbed_Dataset(inputFile='CASIS-25_CU.txt')

    fold_accuracy = []

    for repeat in range(1):  # it was 10 initially
        #-----------------------------Classifiers------------------------

        # Multilayer Perceptron
        mlp = MLPClassifier(hidden_layer_sizes=(95, 25),
                            activation='relu',
                            max_iter=1000)

        # Data Manipulation, Preprocessing, Training and Testing

        # 4-Fold CrossValidation with Shuffling
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

        scaler = StandardScaler()
        tfidf = TfidfTransformer(norm=None)
        dense = DenseTransformer()

        for train, test in skf.split(CU_X, Y):
            #train split
            CU_train_data = CU_X[train]
            train_labels = Y[train]

            #test split
            CU_eval_data = CU_X[test]
            eval_labels = Y[test]

            # tf-idf
            tfidf.fit(CU_train_data)
            CU_train_data = dense.transform(tfidf.transform(CU_train_data))
            CU_eval_data = dense.transform(tfidf.transform(CU_eval_data))

            # standardization
            scaler.fit(CU_train_data)
            CU_train_data = scaler.transform(CU_train_data)
            CU_eval_data = scaler.transform(CU_eval_data)

            # normalization
            CU_train_data = normalize(CU_train_data)
            CU_eval_data = normalize(CU_eval_data)

            train_data = CU_train_data
            eval_data = CU_eval_data

            # training
            mlp.fit(train_data, train_labels)

            # evaluation
            mlp_acc = mlp.score(eval_data, eval_labels)

    print('MLP Accuracy = ', mlp_acc)

    #---------------------------------------------------------------------------
    # Save the Trained Models Now
    path = "Trained_Models/"
    dump(mlp, open(path + 'mlp.pkl', 'wb'))
    print('***************************')
    print('Trained and Saved the Model')
    print('***************************')
    return mlp
Example #60
0
def run_classifer(X_train, s_train, y_train, X_test, s_test, y_test):
    s_train = np.array(s_train)  # samples x features
    s_test = np.array(s_test)

    num_labels = 15
    batch_size = 100

    stemmer = sb.SnowballStemmer('english')

    swlist = sw.words('english')
    swlist += [stemmer.stem(w) for w in swlist]
    swlist += [
        "'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe', 'dure',
        'might', 'must', "n't", 'need', 'onc', 'onli', 'ourselv', 'sha',
        'themselv', 'veri', 'whi', 'wo', 'would', 'yourselv'
    ]  #complained about not having these as stop words
    pubs = [
        'buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart', 'fox',
        'guardian', 'review', 'theatlant'
    ]
    punct = [
    ]  #[':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']', '…', '?','—', '‘', '$'] #gonna leave these in for now

    swlist += pubs
    swlist += punct
    if sys.argv[4].lower() == 'true':
        tkzr = StemTokenizer()
    else:
        tkzr = None

    if sys.argv[5].lower() != 'true':
        swlist = []

    #what features are we using?
    if sys.argv[7].lower() == 'word':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train = tfidf_transformer.transform(X_train)
        X_test = tfidf_transformer.transform(X_test)

    elif sys.argv[7].lower() == 'topic':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train = lda_model.transform(X_train)
        X_test = lda_model.transform(X_test)

    elif sys.argv[7].lower() == 'style':
        X_train = csr_matrix(s_train)
        X_test = csr_matrix(s_test)

    elif sys.argv[7].lower() == 'all':
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)

        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train_tf = tfidf_transformer.transform(X_train)
        X_test_tf = tfidf_transformer.transform(X_test)
        print(type(X_train_tf))

        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train_lda = lda_model.transform(X_train)
        X_test_lda = lda_model.transform(X_test)
        print(type(X_train_lda))

        X_train = csr_matrix(
            sparse.hstack(
                [X_train_tf,
                 csr_matrix(X_train_lda),
                 csr_matrix(s_train)]))
        X_test = csr_matrix(
            sparse.hstack(
                [X_test_tf,
                 csr_matrix(X_test_lda),
                 csr_matrix(s_test)]))

        print(type(X_train))

        # sparse.save_npz("X_train" + sys.argv[6] + ".npz", X_train)
        # sparse.save_npz("X_test" + sys.argv[6] + ".npz", X_test)

    else:
        sys.exit('unknown features')

    encoder = LabelBinarizer()
    encoder.fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)

    # np.save('X_train.npy', X_train)
    # np.save('X_test.npy', X_test)
    # np.save('y_train.npy', y_train)
    # np.save('y_test.npy', y_test)

    # sparse.save_npz("y_train" + sys.argv[6] + ".npz", y_train)
    # sparse.save_npz("y_test" + sys.argv[6] + ".npz", y_test)

    # load everything back
    # X_train = sparse.load_npz("X_train.npz")

    input_dim = X_train.shape[1]
    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim, )))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=5,
                        verbose=1,
                        validation_split=0.1)

    # model.model.save(sys.argv[6] + '.h5')

    # X_train = np.load('X_train.npy')
    # X_test = np.load('X_test.npy')
    # y_train = np.load('y_train.npy')
    # y_test = np.load('y_test.npy')

    # model = keras.models.load_model(sys.argv[6] + '.h5')
    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)

    print('Test accuracy:', score[1])

    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    predicted = np.argmax(y_pred, axis=1)
    p, r, fs, s = precision_recall_fscore_support(np.argmax(y_test, axis=1),
                                                  predicted)
    print(p, r, fs, s)