class TF_Transformer(base.BaseEstimator, base.TransformerMixin):
	def __init__(self):

		self.cv_bi = CountVectorizer(min_df=2,max_df=0.7,ngram_range=(1,2))
		self.tfidf_trans = TfidfTransformer()
		self.SVD_trans = TruncatedSVD(n_components=300)

    # X is a list of Fit_Review named tuples, y is None
	def fit(self, X, y=None):

		texts = [review.text for review in X]

		counts = self.cv_bi.fit_transform(texts)
		counts_tfidf = self.tfidf_trans.fit_transform(counts)
		self.SVD_trans.fit(counts_tfidf)

		return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
	def transform(self, X):

		texts = [review.text for review in X]

		counts = self.cv_bi.transform(texts)
		counts_tfidf = self.tfidf_trans.transform(counts)
		counts_trunc = self.SVD_trans.transform(counts_tfidf)

		return counts_trunc
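
A minimal usage sketch for the transformer above (not part of the original source): it assumes `reviews` and `labels` are training data with enough documents for min_df=2 and TruncatedSVD(n_components=300) to be valid, and chains the custom transformer with a classifier in a scikit-learn Pipeline.

# Hypothetical usage sketch: `reviews`/`labels`/`new_reviews` are assumed data,
# large enough for min_df=2 and the 300 SVD components to be satisfiable.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ("tfidf_svd", TF_Transformer()),            # the custom transformer defined above
    ("clf", LogisticRegression(max_iter=1000)),
])
# pipeline.fit(reviews, labels)
# predictions = pipeline.predict(new_reviews)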
Example #2
def check_webshell(clf,dir):
    all=0
    all_php=0
    webshell=0

    webshell_files_list = load_files_re(webshell_dir)
    CV = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(webshell_files_list).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit_transform(x)


    g = os.walk(dir)
    for path, d, filelist in g:
        for filename in filelist:
            filepath = os.path.join(path, filename)
            t = load_file(filepath)
            t_list=[]
            t_list.append(t)
            x2 = CV.transform(t_list).toarray()
            x2 = transformer.transform(x2).toarray()
            y_pred = clf.predict(x2)
            all+=1
            if filename.endswith('.php'):
                all_php+=1
            if y_pred[0] == 1:
                print "%s is webshell" % fulepath
                webshell+=1

    print "Scan %d files(%d php files),%d files is webshell" %(all,all_php,webshell)
def Bags_Of_Words(train_fp, test_fp, freq_flag):
    # This function will return a dictionary containing two bags of words:
    # one for the training set and one for the test set

    # train_fp is the filepath to the training dataset
    # test_fp is the filepath to the test dataset
    # freq_flag determines whether to weight the bags of words by tf-idf frequency or by raw occurrence counts
    # Could affect how accurate our model is

    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()

    df_train = pd.read_csv(train_fp)
    df_test = pd.read_csv(test_fp)

    # Only keep parent comments
    # df = df[df.comment_under_post != False]
    X_train_counts = count_vect.fit_transform(df_train.body)
    X_test_counts = count_vect.transform(df_test.body)

    # Takes comment length into account: transforms the matrix into the frequency of a particular word, not simply its occurrence
    if freq_flag:
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
        # reuse the idf fitted on the training counts; do not refit on the test set
        X_test_tfidf = tfidf_transformer.transform(X_test_counts)
        return {"train": X_train_tfidf, "test": X_test_tfidf}
    else:
        return {"train": X_train_counts, "test": X_test_counts}
Example #4
def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of features with
    # only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    with warnings.catch_warnings(record=True) as w:
        tfidf = tr.fit_transform(X).toarray()
        assert_equal(len(w), 1)
        # For Python 3 compatibility
        if hasattr(w[0].message, 'args'):
            assert_true("divide by zero" in w[0].message.args[0])
        else:
            assert_true("divide by zero" in w[0].message)
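
The two idf variants these smoothing tests exercise can be written out directly; this is a small sketch of the standard scikit-learn formulas, showing why a feature column of all zeros (document frequency 0) only blows up when smooth_idf=False.

import numpy as np

def idf(df, n_docs, smooth):
    # smooth_idf=True:  idf = ln((1 + n) / (1 + df)) + 1  -> always finite
    # smooth_idf=False: idf = ln(n / df) + 1              -> divide by zero when df == 0
    if smooth:
        return np.log((1.0 + n_docs) / (1.0 + df)) + 1.0
    return np.log(n_docs / df) + 1.0

df = np.array([3.0, 2.0, 0.0])          # the last feature never occurs
print(idf(df, 3, smooth=True))          # finite everywhere
print(idf(df, 3, smooth=False))         # inf in the last slot, with a RuntimeWarning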
def work_with_simple_bag_of_words():
    count = CountVectorizer()
    docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet',
    ])
    bag = count.fit_transform(docs)
    print(count.vocabulary_)
    print(bag.toarray())

    np.set_printoptions(precision=2)
    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    print(tfidf.fit_transform(bag).toarray())

    tf_is = 2  # term frequency of 'is' in the third document
    n_docs = 3
    idf_is = np.log((n_docs+1) / (3+1))  # 'is' appears in all 3 documents, so df = 3
    tfidf_is = tf_is * (idf_is + 1)
    print("tf-idf of term 'is' = %.2f" % tfidf_is)

    tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
    raw_tfidf = tfidf.fit_transform(bag).toarray()[-1]
    print(raw_tfidf)

    l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
    print(l2_tfidf)
Example #6
def TFIDF():
    global segcont
    global weight
    global we
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(segcont))
    word = vectorizer.get_feature_names()  # all keywords across the documents
    weight = tfidf.toarray()  # the corresponding tf-idf matrix
    del segcont

    seg = []
    for i in range(len(weight)):
        enstr = ""
        for j in range(len(word)):
            if weight[i][j] >= 0.1:  # keep only terms whose tf-idf weight is at least 0.1
                enstr = enstr + " " + word[j]
        seg.append(enstr)

    del weight
    vec = CountVectorizer()
    tra = TfidfTransformer()
    tidf = tra.fit_transform(vec.fit_transform(seg))
    wo = vec.get_feature_names()
    we = tidf.toarray()

    def runAnalysis(self):
        
        trainingData = np.loadtxt(open(self.training_file, 'rb'), delimiter = ',', skiprows = 0);
        testData = np.loadtxt(open(self.test_file,'rb'), delimiter = ',', skiprows = 0);
        
        #trainingData = np.genfromtxt(open(self.training_file,'rb'),delimiter=',');
        #testData = np.genfromtxt(open(self.testData,'rb'),delimiter=',');       
        xTrain =  trainingData[:, :trainingData.shape[1]-1]
        yTrain = trainingData[:,trainingData.shape[1]-1]
                  
        xTest = testData[:, :testData.shape[1] -1]
        yTest = testData[:, testData.shape[1]-1]
        
        #evaluateCorrelationResults(xTrain, yTrain)
        
        #xTrain,xTest = transform(xTrain,yTrain,xTest)
        
        #tf-idf transformation
        transformer = TfidfTransformer()
        xTrain = transformer.fit_transform(xTrain)
        xTest = transformer.transform(xTest)  # reuse the idf fitted on the training data

        appendDataTofile("Training dimension -> ",xTrain.shape)
        appendDataTofile("Testing dimension ->  ",xTest.shape)
        
        
        
                
        #MultinomialNB classification
        """appendDataTofile("MultiNB");
        yPred = classify(lambda:naive_bayes.MultinomialNB(),xTrain,xTest,yTrain,yTest)"""
        
        #Logistic Regression classification
        #penalty="l1",C=0.5,intercept_scaling=2
        """appendDataTofile("Log regression");
        yPred = classify(lambda:linear_model.LogisticRegression(),
                 xTrain,xTest,yTrain,yTest)"""
                                  
        #SVM based classification
        appendDataTofile("SVM");
        #C=8.0,gamma=0.10,kernel='rbf',probability=True,shrinking=True
        #yPred = classify(lambda:svm.SVC(),
        #        xTrain,xTest,yTrain,yTest)
        
        #Grid search SVM
        yPred =  gridSearchCVforSVM(xTrain,xTest,yTrain,yTest)
        
        #yPred =  clusterify(lambda:KMeans(n_clusters=3),xTrain,xTest,yTrain,yTest)
         
        """yPred = classify(lambda:KNeighborsClassifier(),xTrain,xTest,yTrain,yTest)""" 
        """yPred = classify(lambda: linear_model.RidgeClassifierCV,xTrain,xTest,yTrain,yTest)"""
        
        outputFile = open("../results.txt", 'w+')
        rows = len(yPred)
        #outputFile.write("ID\tSentiment\n");
        for i in range(0,rows):
            outputFile.write(str(yPred[i])+"\n")
        outputFile.close()
Example #8
    def linearSVC_prediction(self):
        tfidf = TfidfTransformer()
        X = tfidf.fit_transform(self.dvec.fit_transform(self.words))

        c_tfidf = TfidfTransformer()
        c_X = c_tfidf.fit_transform(self.c_dvec.fit_transform(self.c_words))

        self.svc = LinearSVC()
        self.svc.fit(X, self.scores)

        self.c_svc = LinearSVC()
        self.c_svc.fit(c_X, self.c_scores)
Example #9
def test_tf_idf_smoothing():
    X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1.0, 1.0, 1.0])

    # this is robust to features with only zeros
    X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())
Example #10
def handle_doc(word_set,rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path+docs)
        print "start to handle the -->  "+docs
        for file_d in files:
            d_path = rs_path+docs+'/'+file_d
            #get the single file path
            with open(d_path,'rb') as text_file:
                str_tmp = ''
                file_lines = text_file.readlines()
                for line in file_lines:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(line,pattern)
                    for t in tokens:
                        if t.lower() in word_set:
                            str_tmp += t.lower()
                            str_tmp += ' '
                doc_matrix.append(str_tmp)
                doc_cat.append(cat_dic[docs])
            text_file.close()
    str_tmp = ''
    for sw in word_set:
        str_tmp += sw
        str_tmp += ' '
    doc_matrix.append(str_tmp)
    doc_cat.append('NAN')
    vectorizer = CountVectorizer()
    doc_num = vectorizer.fit_transform(doc_matrix)
    tfidf = TfidfTransformer()
    doc_tfidf = tfidf.fit_transform(doc_num)
    return doc_tfidf[:-1,:],doc_cat[:-1]
def bayes_tfidf(prefix, sufix, dic_fn):
    """
    prefix example: ./data/single_label_sen/sen_spanish_protest
    sufix example: pop_cat
    """

    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"

    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix
    
    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}


    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    # construct the word count matrix
    count_vect = CountVectorizer()
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    # construct tfidf matrix
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count)
    test_set_x = tfidf_transformer.transform(test_set_count)

    print "start the model"
    test_score = bayes_experiment([train_set_x, train_y], [test_set_x, test_y])
    return test_score
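
A hedged call sketch for bayes_tfidf following the docstring's naming convention; the dictionary path and the bayes_experiment helper are assumed to exist as in the original project.

# Hypothetical call (the .dic path is a placeholder):
# test_score = bayes_tfidf("./data/single_label_sen/sen_spanish_protest",
#                          "pop_cat",
#                          "./data/pop_cat.dic")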
Example #12
def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of features with
    # only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    clean_warning_registry()
    with warnings.catch_warnings(record=True) as w:
        1. / np.array([0.])
        numpy_provides_div0_warning = len(w) == 1

    in_warning_message = 'divide by zero'
    tfidf = assert_warns_message(RuntimeWarning, in_warning_message,
                                 tr.fit_transform, X).toarray()
    if not numpy_provides_div0_warning:
        raise SkipTest("Numpy does not provide div 0 warnings.")
Example #13
def get_topic_tfidf(cor_list, topic_num, path_base):
    # CountVectorizer converts the texts into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    vectorizer = CountVectorizer()
    # TfidfTransformer computes the tf-idf weight of every word
    transformer = TfidfTransformer()

    corpus_split = list()

    for i in range(topic_num):
        corpus_split.append(list())
        for j in topics_list[i]:
            # put each comment into the list of its topic
            corpus_split[i].append(cor_list[j])
        print('length of topic list', i, ':', len(corpus_split[i]))

        # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
        tfidf = transformer.fit_transform(
            vectorizer.fit_transform(corpus_split[i]))
        # get all the words in the bag-of-words model
        word = vectorizer.get_feature_names()
        # extract the tf-idf matrix; a[i][j] is the tf-idf weight of word j in document i
        weight = tfidf.toarray()

        path = path_base + str(i)

        with open(path, 'w') as f:
            for m in range(len(weight)):
                for n in range(len(word)):
                    # f.write(word[n])
                    # f.write(' ')
                    f.write(str(weight[m][n]))
                    f.write(' ')
                f.write('\n')
Example #14
def tfidf(fileList):
    segPath = sys.path[0] + '/seg_result'
    corpus = []  # holds the word-segmentation result of each document
    for eachFile in fileList:
        fileName = segPath + '/' + eachFile
        f = open(fileName,'r+')
        content = f.read()
        corpus.append(content)
    vectorizer = CountVectorizer()  # converts the texts into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
    word = vectorizer.get_feature_names()  # get all the words in the bag-of-words model
    weight = tfidf.toarray()  # extract the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
    # create a tfidf folder to save the tf-idf results
    tfidfFilePath = os.getcwd() + '/tfidfFile'
    if not os.path.exists(tfidfFilePath):
        os.mkdir(tfidfFilePath)
    for i in range(len(weight)):
        print u"--------Writing all the tf-idf in the", i, u" file into ", tfidfFilePath + '/' + str(i) + '.txt', "--------"
        name = tfidfFilePath + '/' + string.zfill(i, 5) + '.txt'
        f = open(name,'w+')
        for j in range(len(word)):
            #f.write(word[j] + "    " + str(weight[i][j]) + "\n")
            #f.write(str(weight[i][j]) + "\n")
            f.write(word[j] + "\n")
        f.close()
Example #15
def test_pickling_transformer():
    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    orig = TfidfTransformer().fit(X)
    s = pickle.dumps(orig)
    copy = pickle.loads(s)
    assert_equal(type(copy), orig.__class__)
    assert_array_equal(copy.fit_transform(X).toarray(), orig.fit_transform(X).toarray())
Example #16
def get_feature_by_bag_tfidf():
    global white_count
    global black_count
    global max_features
    print "max_features=%d" % max_features
    x=[]
    y=[]

    webshell_files_list = load_files_re(webshell_dir)
    y1=[1]*len(webshell_files_list)
    black_count=len(webshell_files_list)

    wp_files_list =load_files_re(whitefile_dir)
    y2=[0]*len(wp_files_list)

    white_count=len(wp_files_list)


    x=webshell_files_list+wp_files_list
    y=y1+y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore",max_features=max_features,
                                       token_pattern = r'\b\w+\b',min_df=1, max_df=1.0)
    x=CV.fit_transform(x).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x)
    x = x_tfidf.toarray()

    return x,y
Example #17
def tf_idf(seg_files):
    seg_path = './segfile/'
    corpus = []
    for file in seg_files:
        fname = seg_path + file
        f = open(fname, 'r+')
        content = f.read()
        f.close()
        corpus.append(content)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()
    weight = tfidf.toarray()

    save_path = './tfidffile'
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    for i in range(len(weight)):
        print('--------Writing all the tf-idf in the', i, u' file into ', save_path + '/' + string.zfill(i, 5) + '.txt',
              '--------')
        f = open(save_path + '/' + string.zfill(i, 5) + '.txt', 'w+')
        for j in range(len(word)):
            f.write(word[j] + ' ' + str(weight[i][j]) + '\r\n')
        f.close()
Example #18
def load_dataset(prefix, sufix, dic_fn, vocab_fn='./data/english_review.trn-100000.vocab'):
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"

    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}
    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]
    
    vocab = [l.strip().lower().split("\t")[0] for l in open(vocab_fn)]
    count_vect = CountVectorizer(vocabulary=vocab)
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count).toarray()
    test_set_x = tfidf_transformer.transform(test_set_count).toarray()

    train_shared_x, train_shared_y = shared_dataset([train_set_x, train_y]) 
    test_shared_x, test_shared_y = shared_dataset([test_set_x, test_y]) 
    return [(train_shared_x, train_shared_y), (test_shared_x, test_shared_y)]
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names
    
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape
    
    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape
    
    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)
    
    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()
    
    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
        
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
def extract_text_features(train_data, test_data):
    """
    Returns one type of training and test data features.
        1) Term Frequency times Inverse Document Frequency (tf-idf): X_train_tfidf, X_test_tfidf

    Parameters
    ----------
    train_data : List[str]
        Training data as a list. Only 30,000 randomly sampled reviews are used for efficiency.
    test_data : List[str]
        Test data as a list

    Returns
    -------
    Tuple(scipy.sparse.csr.csr_matrix,.., list)
        Returns X_train_tfidf, X_test_tfidf, vocab as a tuple.
    """
    
    # set up a count vectorizer that removes english stopwords when building a term-doc matrix
    count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
    # build the term frequency per document matrix from a random sublist of 30,000 documents
    train_counts = count_vect.fit_transform(random.sample(train_data, 30000))
    test_counts = count_vect.transform(test_data)
    tfidf_transformer = TfidfTransformer()

    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)
    
    vocab = count_vect.get_feature_names()
    
    return (train_tfidf, test_tfidf, vocab)
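
A hedged usage sketch for extract_text_features: train_docs and test_docs are hypothetical lists of raw review strings, the training list must hold at least 30,000 items for the random.sample call above, and the nltk stopwords corpus is assumed to be downloaded.

# Hypothetical usage:
# train_docs, test_docs = load_reviews()          # assumed helper returning raw strings
# X_train_tfidf, X_test_tfidf, vocab = extract_text_features(train_docs, test_docs)
# print(X_train_tfidf.shape, X_test_tfidf.shape, len(vocab))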
Example #21
class VectorModel(object):
    
    def __init__(self , list_of_comments=None):
        self.__list_of_comments = list_of_comments
        self.__vectorizer = []
        self.__corpus_simple_vector = []
        self.__transformer = []
        self.__corpus_tf_idf = []
        #self.prepare_models()
    
    def prepare_models(self):
        self.__vectorizer = CountVectorizer()
        vector = self.__vectorizer.fit_transform(self.__list_of_comments)
        self.__corpus_simple_vector = vector.toarray()
        self.__transformer = TfidfTransformer()
        tfidf = self.__transformer.fit_transform(self.__corpus_simple_vector)
        self.__corpus_tf_idf = tfidf.toarray()
        return [self.__vectorizer , self.__corpus_simple_vector , self.__transformer , self.__corpus_tf_idf]
    
    def set_models(self , vectorizer , transformer):    
        self.__vectorizer = vectorizer
        self.__transformer = transformer
        
    
    def get_comment_frequency_vector(self , comments):
        vec_comments = []
        for i in comments:
            vec_comments.append(i)
        vectores = self.__vectorizer.transform(vec_comments).toarray()
        return vectores
    
    def get_comment_tf_idf_vector(self , comments):
        vector = self.get_comment_frequency_vector(comments)
        result = self.__transformer.transform(vector).toarray()
        return result
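
A hedged usage sketch for VectorModel with a few illustrative comments; the sklearn imports are assumed to be in scope as in the rest of these examples.

comments = ["the product arrived on time",
            "terrible support, the product broke",
            "support was quick and helpful"]
model = VectorModel(comments)
vectorizer, count_matrix, transformer, tfidf_matrix = model.prepare_models()
print(model.get_comment_tf_idf_vector(["the support was terrible"]).shape)  # (1, n_features)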
Example #22
def tfidf(corpus, word_category, file_to_write):
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    weight = tfidf.toarray()
    sum_weight = np.sum(weight, axis=0)
    word = vectorizer.get_feature_names()
    word_and_weight = []
    for i in range(len(sum_weight)):
        word_and_weight.append([word[i], sum_weight[i]])
    word_and_weight.sort(key=lambda key: key[1], reverse=True)
    f = open(file_to_write, "w+")
    result = []
    for j in range(len(word_and_weight)):
        try:
            f.write(
                word_and_weight[j][0]
                + " "
                + str(word_and_weight[j][1])
                + " "
                + word_category[word_and_weight[j][0]]
                + "\n"
            )
            result.append([word_and_weight[j][0], word_and_weight[j][1], word_category[word_and_weight[j][0]]])
        except KeyError:  # word missing from word_category
            continue
    f.close()
    return result
Example #23
def LR_modeling(file_name, k, AUC=True, weight=False):
    raw_data = pd.read_csv(file_name)
    raw_data = raw_data.drop(['issue', 'field'], axis=1)
    X = raw_data.drop('panelvote', axis=1)
    y = raw_data['panelvote']
    tfidf = TfidfTransformer(norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
    X = tfidf.fit_transform(X.values)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.8, random_state=42)
    lr = LogisticRegression(C=1)
    lr.fit(X_train, y_train)
    auc = np.mean(cross_validation.cross_val_score(lr, X, y, scoring="roc_auc"))
    if AUC == True:
        print "AUC for %s on the test data = %.3f" % (file_name, auc)
    if weight == False:
        top_positive, top_negative = get_top_k_nocoeff(lr.coef_[0], k)
        return raw_data.columns[top_positive], raw_data.columns[top_negative]
    else:
        top_positive, top_negative = get_top_k(lr.coef_[0], k)
        final_pos = {}
        final_neg = {}
        for i in top_positive.keys():
            final_pos[raw_data.columns[i]] = top_positive[i]
        for j in top_negative.keys():
            final_neg[raw_data.columns[j]] = top_negative[j]
        pos = sorted(final_pos.items(), key=operator.itemgetter(1), reverse=True)
        neg = sorted(final_neg.items(), key=operator.itemgetter(1))
        return pos, neg
Example #24
def text_sentiment(docs_new):
   docs_new=[docs_new]
   twenty_train= load_files('./Sentiment')  #the complete data is in this directory; like comp.graphics etc
   count_vect = CountVectorizer()
   X_train_counts = count_vect.fit_transform(twenty_train.data)
   tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
   X_train_tf = tf_transformer.transform(X_train_counts)
   tfidf_transformer = TfidfTransformer()
   X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

   # Fit a classifier on the training set
   #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
   #f = open('my_classifier.pickle', 'wb')
   #pickle.dump(clf, f)
   #f = open('my_classifier.pickle',)
   #clf = pickle.load(f)
   #f.close()
   # save the classifier
   #with open('my_sentiment.pkl', 'wb') as fid:
      #cPickle.dump(clf, fid)    

   # load it again
   with open('my_sentiment.pkl', 'rb') as fid:
      clf = cPickle.load(fid)
   X_new_counts = count_vect.transform(docs_new)
   X_new_tfidf = tfidf_transformer.transform(X_new_counts)

   predicted = clf.predict(X_new_tfidf)
   return twenty_train.target_names[predicted[0]]
Example #25
class TfIdfMixin(RawBOWMixin):
    def build(self):
        if not hasattr(self, "_tfidf_transformer"):
            self._tfidf_transformer = None

        if self._tfidf_transformer is None:
            self._tfidf_transformer = TfidfTransformer()
                    #input=u"content", preprocessor=lambda x: x,
                    #tokenizer=lambda x: x)

    def process(self, input_df, ndarray_data):
        X = self._tfidf_transformer.fit_transform(
            ndarray_data["RawBOWMatrix"])
        ndarray_data["TfIdfMatrix"] = X
        return input_df, ndarray_data

    def requires(self):
        return []

    def returns(self):
        return []

    def ndarray_requires(self):
        return ["RawBOWMatrix",]

    def ndarray_returns(self):
        return ["TfIdfMatrix"]

    def name(self):
        return "TfIdfMixin"
Example #26
def get_tf_idf(x_array):
	print ('start get tf-idf array...')
	transformer = TfidfTransformer()
	tfidf = transformer.fit_transform(x_array)
	tfidf_array = tfidf.toarray()
	print ('ok...\n')
	return tfidf_array
Example #27
    def classify(self):
        # 1. load Corpus from files
        #(corpus, labels) = self.loadFile("dataless/20NG/train_rho0.2_epsilon0.3_window_default")
        (corpus, labels) = self.loadFile("dataless/20NG/20ng-train-no-stop.txt")
        print set(labels)
        m = self.loadSenna("../senna/embeddings/embeddings.txt","../senna/hash/words.lst") #dict{str: np.array()} 
        #m = Word2Vec.load_word2vec_format("vectors/whole/new_c_e_train_neg10size400min_count1", binary=True) 
        #words = set(m.index2word)
        #words = set(m.keys())
        #print corpus
        #print labels
        # 2. Encode Feature Matrix
        cv = CountVectorizer(min_df=1)
        X = cv.fit_transform(corpus) # Frequency
        #print "Frequency:",X
        #print cv.get_feature_names()
        transformer = TfidfTransformer()
        X = transformer.fit_transform(X) # TF-IDF weighted entities
        #print "Tf-idf:",X
        # 3. calculate final vectors to predict labels
        # print X[0]
        pre_vectors = self.pre_vectors(X, cv ,m)


        # 3. Encode label vector
        le = preprocessing.LabelEncoder()
        Y = le.fit_transform(labels)
Example #28
def estimation(file='song_text.txt', separator=u'--text--'):
    arr = text_split_line(file, u'--text--')
    dvect = data_vector(arr)
    target = dvect[0]
    text = dvect[1]
    dic = dvect[2]      # for converting target integer to artist name
#    print (target)
#    print (dic)
    count_vect = CountVectorizer()
    word_vect = count_vect.fit_transform(text)
    tfidf_transformer = TfidfTransformer()
    vect_tfidf = tfidf_transformer.fit_transform(word_vect)
    machine = svm.SVC(probability=True) # one of the best for text, see tutorial working with text
    machine.fit(vect_tfidf, target)
    print (machine.score(vect_tfidf, target))
    prediction = machine.predict(vect_tfidf)        # accuracy test (tutorial)
    print (u'model predictive accuracy:  {:.1%}'
           .format(np.mean(prediction == target)))
    new_texts = [text[500], text[2345], text[-2], text[0], text[5893]]
    new_data = count_vect.transform(new_texts)
    new_tfidf = tfidf_transformer.transform(new_data)
    prediction = machine.predict(new_tfidf)
    for i in range(len(new_texts)):
        print (u'{}\t=> {}'.format(new_texts[i].splitlines()[:2],
                                  dic[prediction[i]]))
    return
Example #29
 def getContextFeature(self):
     import time
     print 'start to get Context Feature'
     start = time.time()
     
     from sklearn.feature_extraction.text import TfidfTransformer
     from sklearn.feature_extraction.text import CountVectorizer
     # for a large corpus we need to pass in an iterator rather than a full in-memory list
     corpus = self.getIterText()
     #transfer the text into word frequency matrix
     vectorizer = CountVectorizer()
     transformer = TfidfTransformer()
     tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))
     
     print 'get word'
     word=vectorizer.get_feature_names()
     print 'get weight'
     weight=tfidf
     
     print 'weight type:', type(weight)
     #print weight
     end = time.time()
     
     print 'total time: \t', end-start
     return weight,word
Example #30
    def tf_idf(self, **kwargs):
        """Perform tf-idf transformation."""
        tfid = TfidfTransformer(**kwargs)

        tfidf_matrix = tfid.fit_transform(self.matrix)
        return Space(tfidf_matrix, self.row_labels, self.column_labels)
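
A hedged usage sketch, assuming the Space constructor takes (matrix, row_labels, column_labels) as implied by the return statement above and stores the matrix on self.matrix.

import numpy as np
counts = np.array([[2, 0, 1],
                   [0, 3, 1]])
space = Space(counts, ["doc0", "doc1"], ["apple", "banana", "cherry"])
weighted = space.tf_idf(smooth_idf=True, norm="l2")   # kwargs go straight to TfidfTransformer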
Example #31
file_name = "segment.p"
df = pickle.load(open(file_name, "rb"))
df['new_words'] = df["words"].map(lambda x: ("".join(re.findall(ur'[\u4e00-\u9fff]+', x))).strip())
df['token_words'] = df['new_words'].map(lambda x: " ".join(jieba.lcut(x, cut_all=False)))
df = df.reindex(np.random.permutation(df.index))
corpus = []
label = []
org_word = []
for i in range(len(df)):
	corpus = corpus + [df['token_words'][i] for x in range(df['num_term'][i]+1)]
	label = label + [df['label'][i] for x in range(df['num_term'][i]+1)]
	org_word = org_word + [df['new_words'][i] for x in range(df['num_term'][i]+1)]

vectorizer=CountVectorizer()
transformer=TfidfTransformer()
tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
label = np.asarray(label)
idf = transformer.idf_


# non_word_value = 9.2564775671942705

clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(weight, label)
# final_results = clf.predict(weight)

test_files=os.listdir(test_dir)
alldict=[]
for j1,i1 in enumerate(test_files):
    print("  . Most correlated unigrams:\n       . {}".format(
        '\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format(
        '\n       . '.join(bigrams[-N:])))

# splitting to training and test set
X_train, X_test, y_train, y_test = train_test_split(
    dataset['consumer_complaint_narrative'],
    dataset['product'],
    random_state=0)

# fitting the countvectorizer and tfidf transformer to the x_train dataset
count_vect = CountVectorizer()
X_train_count_vect = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count_vect)

#Fitting the dataset to the Naive bayes classifier
classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# sample predictions
classifier.predict(
    count_vect.transform([
        'This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine.'
    ]))
classifier.predict(
    count_vect.transform([
        "I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX"
    ]))

dataset[
    infile = open(wordEmbeddingModel,'rb')
    results = pickle.load(infile)

    #infile = open('../wordEmbeddings/wikiVectors','rb')

    print('is creating feature matrix...')
    proto_matrix = append_features(results)
    fea = np.matrix(proto_matrix)
    fea = np.nan_to_num(fea)

    y = getLabel(results)
    y = np.array(y)

    print('tifidf word vectors')
    tfidf_transformer = TfidfTransformer()
    X_vec = tfidf_transformer.fit_transform(fea).toarray()

    #reduce dimension
    reducer = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
    reducer.fit(X_vec)
    X_vec = reducer.transform(X_vec)


    print('load LIWC data...')
    text_liwc = pd.read_csv('../data/LIWC_self_label_valence.csv')
    liwc = text_liwc.loc[:,'function':'OtherP'].values

    ####combine with liwc
    X = np.concatenate((X_vec, liwc), axis=1)

    #Normalize data, convert it to unit vectors
Example #34
def IncreasingFIT1():
    global recordAccuracy
    recordAccuracy = []
    vectorizer = CountVectorizer(stop_words=stopwordslist)
    transformer = TfidfTransformer()
    global total_vect_time, parsing_time, vectorizing_time, oldVocubularysave, newVocubularysave
    # for T in range(TrainDataSize):
    global T
    for T in range(updatesize):
        tick = time.time()
        # X_train = vectorizer.transform(xtrain[i])

        count = vectorizer.fit_transform(xtrain[T])
        X_train = transformer.fit_transform(count)

        # ----------------------------------------
        VocubularyList = vectorizer.get_feature_names()

        # vectorizer = CountVectorizer(stop_words=None,vocabulary=VocubularyList)

        # tfidf = X_train.toarray().T

        print(X_train.shape)
        model1 = SelectKBest(chi2, k=1)
        X_chi2 = model1.fit_transform(X_train, ytrain[T])
        print(X_chi2.shape)
        print(model1.scores_.shape)

        j = 0
        for i in VocubularyList:
            # print(i,",",vectorizer2.vocabulary_[i],",",max(tfidf[vectorizer2.vocabulary_[i]]))
            newVocubularysave.append({
                "name": i,
                'numb': vectorizer.vocabulary_[i],
                'value': model1.scores_[j]
            })
            j = j + 1

        print("get newVocubularysave!")
        # newVocubularysave=oldVocubularysave
        newVocubularysave = oldVocubularysave + newVocubularysave
        newVocubularysave = sortbyword(newVocubularysave, FeatureSpaceSize,
                                       'value')
        # print(newVocubularysave)
        l = []
        for numV in newVocubularysave:
            l.append(numV['name'])
        # print(l)
        print("========================================================")
        print("========================================================")
        oldVocubularysave = newVocubularysave

        # /-----------------------------------------

        total_vect_time += time.time() - tick

        # processing of the test set -----------------

        tick = time.time()
        parsing_time = time.time() - tick
        tick = time.time()

        vectorizing_time = time.time() - tick
        test_stats['n_test'] += len(ytest)
        test_stats['n_test_pos'] += sum(ytest)
        # end of dataset text vectorisation (hashing trick) -------------------------------------------------------

        print(len(newVocubularysave))
        joblib.dump(newVocubularysave, "VocubularySave.v")

        print('starting incremental training...')
        IncreasingFIT()
        # IncreasingFIT()
        print('done...')
Example #35
def IncreasingFIT():
    global total_vect_time
    classifiers = {
        'SGD': SGDClassifier(),
        'Perceptron': Perceptron(),
        'NB Multinomial': MultinomialNB(alpha=0.01),
        'Passive-Aggressive': PassiveAggressiveClassifier(),
    }

    Vocubularysave = []
    if os.path.exists("VocubularySave.v"):
        Vocubularysave = joblib.load("VocubularySave.v")
    VocubularyList = []
    for numV in Vocubularysave:
        VocubularyList.append(numV['name'])
    vectorizer = CountVectorizer(stop_words=None, vocabulary=VocubularyList)
    transformer = TfidfTransformer()

    count = vectorizer.fit_transform(xtest)
    X_test = transformer.fit_transform(count)

    for i in range(TrainDataSize):
        tick = time.time()

        # X_train = vectorizer.transform(xtrain[i])
        count = vectorizer.fit_transform(xtrain[i])
        X_train = transformer.fit_transform(count)

        total_vect_time += time.time() - tick

        for cls_name, cls_useless in partial_fit_classifiers.items():
            cls = classifiers[cls_name]

            tick = time.time()
            # update estimator with examples in the current mini-batch
            # print(X_train)

            cls.partial_fit(X_train, ytrain[i], classes=all_classes)

            # if i % printjumpsize == 0:
            if i == (TrainDataSize - 1):
                # accumulate test accuracy stats
                cls_stats[cls_name]['total_fit_time'] += time.time() - tick
                cls_stats[cls_name]['n_train'] += X_train.shape[0]
                cls_stats[cls_name]['n_train_pos'] += sum(ytrain[i])
                tick = time.time()

                # measure test accuracy
                cls_stats[cls_name]['accuracy'] = cls.score(X_test, ytest)

                cls_stats[cls_name]['prediction_time'] = time.time() - tick
                acc_history = (cls_stats[cls_name]['accuracy'],
                               cls_stats[cls_name]['n_train'])
                cls_stats[cls_name]['accuracy_history'].append(acc_history)
                run_history = (cls_stats[cls_name]['accuracy'],
                               total_vect_time +
                               cls_stats[cls_name]['total_fit_time'])
                cls_stats[cls_name]['runtime_history'].append(run_history)

                # accumulate test accuracy stats
                if T == 0:
                    print(progress(cls_name, cls_stats[cls_name]))
                if T != 0:
                    AccuracyAverage[cls_name]['total_fit_time'] += time.time(
                    ) - tick
                    AccuracyAverage[cls_name]['n_train'] += X_train.shape[0]
                    AccuracyAverage[cls_name]['n_train_pos'] += sum(ytrain[i])
                    tick = time.time()

                    # measure test accuracy
                    AccuracyAverage[cls_name]['accuracy'] += cls.score(
                        X_test, ytest)
                    RecordOneAccuracy[cls_name]['accuracy'] += cls.score(
                        X_test, ytest)
                    acc_history = (AccuracyAverage[cls_name]['accuracy'],
                                   AccuracyAverage[cls_name]['n_train'])
                    AccuracyAverage[cls_name]['accuracy_history'].append(
                        acc_history)
                    run_history += (
                        AccuracyAverage[cls_name]['accuracy'],
                        total_vect_time +
                        AccuracyAverage[cls_name]['total_fit_time'])
                    AccuracyAverage[cls_name]['runtime_history'].append(
                        run_history)

                    recordAccuracy.append(RecordOneAccuracy)

                    print(progress2(cls_name, AccuracyAverage[cls_name], T))
Example #36
print(y[:250])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.02,random_state=0)

print(99999999999999999999)

vectorizer=CountVectorizer(ngram_range=(1,2))

training_features=vectorizer.fit_transform(X_train)

print(8888888888888888888888)

np.asarray(training_features)
   #now to create the tfidf thing

tfidf_vec=TfidfTransformer()
X_train_tfidfvec=tfidf_vec.fit_transform(training_features)
print(77777777777777777777777777)

#classifier=svm.SVC(probability=True)
#classifier = KNeighborsClassifier(n_neighbors=5)
classifier= MultinomialNB()
print(666666666666666666666666)


classifier.fit(X_train_tfidfvec,y_train)

print(5555555555555555)


testing_features=vectorizer.transform(X_test)
Example #37
def main(_):
	np.random.seed(3)  # fix the seed so every run produces the same random numbers
	TIME_STEPS = FLAGS.N
	IMPUT_SIZE = 625	
	BATCH_SIZE = 30
	BATCH_INDEX = 0
	OUTPUT_SIZE = 2
	CELL_SIZE = 175
	LR = 0.001

	totalData = []
	totalDataLabel = []
	counter = 0
	totalDoc = 0
	totalpost = 0
	tdlist1 = 0
	Pos = 0
	Neg = 0
	maxpost = 0
	minpost = 62827

	thulac_pip = thulac.thulac(seg_only=True)  # word segmentation only, no POS tagging
	EventList = GetEventList()

	print("Generating BlackList with N = ", TIME_STEPS, " ...")
	for event in EventList:
		totalDoc += 1
		Eid = event["eid"]
		Label = event["label"]
		# print("Eid : ", Eid, "Label: ", Label)
		WeiboPostIdList = event["posts"]
		if len(WeiboPostIdList) == 1:
			tdlist1 += 1
			continue
		if len(WeiboPostIdList) >= maxpost:
			maxpost = len(WeiboPostIdList)
		if len(WeiboPostIdList) <= minpost:
			minpost = len(WeiboPostIdList)

		event_file_path = os.path.join(Weibo_Json_Dir, Eid + ".json")
		event_file = open(event_file_path, "r")
		event_json = json.load(event_file)

		WeiboPostList = []
		index = 0
		for WeiboPostId in WeiboPostIdList:
			totalpost += 1
			WeiboJson = event_json[index]
			index += 1
			WeiboText = WeiboJson["text"]
			Time = WeiboJson["t"]
			WeiboPost = {"text" : WeiboText, "time" : Time}
			WeiboPostList.append(WeiboPost)
		if Label == "0":
			Pos += 1
		else:
			Neg += 1
		#Sort by time
		WeiboPostList = sorted(WeiboPostList, key=lambda k: k['time'])

		# find the time interval of the weibo posts
		TotalTimeLine = WeiboPostList[-1]['time']-WeiboPostList[0]['time']
		IntervalTime = TotalTimeLine/TIME_STEPS
		k = 0
		PreConInt = []
		while True:
			k += 1
			WeiboIndex = 0
			output = []
			if TotalTimeLine == 0:	
				for weibo in WeiboPostList:
					weibo_text = thulac_pip.cut(weibo["text"], text=True)
					output.append(weibo_text)
				break
			Start = WeiboPostList[0]['time']
			Interval = int(TotalTimeLine/IntervalTime)
			Intset = []
			for inter in range(0,Interval):
				empty = 0
				interval = []
				for q in range(WeiboIndex,len(WeiboPostList)):
					if WeiboPostList[q]['time'] >= Start and WeiboPostList[q]['time'] < Start+IntervalTime:
						empty += 1
						weibo_text = thulac_pip.cut(WeiboPostList[q]["text"], text=True)
						interval.append(weibo_text)
					# remember where the first weibo beyond this interval is, so the next pass can start from it
					elif WeiboPostList[q]['time'] >= Start+IntervalTime:
						WeiboIndex = q-1
						break
				# empty interval
				if empty == 0:
					output.append([])
				else:
					#add the last weibo
					if WeiboPostList[-1]['time'] == Start+IntervalTime:
						weibo_text = thulac_pip.cut(WeiboPostList[-1]["text"], text=True)
						interval.append(weibo_text)
					Intset.append(inter)
					output.append(interval)
				Start = Start+IntervalTime
			ConInt = ContinuousInterval(Intset)
			if len(ConInt)<TIME_STEPS and len(ConInt) > len(PreConInt):
				IntervalTime = int(IntervalTime*0.5)
				PreConInt = ConInt
				if IntervalTime == 0:
					output = output[ConInt[0]:ConInt[-1]+1]
					break
			else:
				# print(len(ConInt))
				output = output[ConInt[0]:ConInt[-1]+1]
				break
		counter+=1
		event_file.close()
		# print (counter)
		# join all the text of each interval into one string
		for q in range(0,len(output)):
			output[q] = ''.join(s for s in output[q])

		try:
		# Calculate tf-idf
			vectorizer = CountVectorizer()
			transformer = TfidfTransformer()
		#print(output)
			tf = vectorizer.fit_transform(output)
			tfidf = transformer.fit_transform(tf)
		# Debug
		# print(tfidf.toarray())
			Allvocabulary = vectorizer.get_feature_names()
		except ValueError:
			BlackList.append(Eid)
			continue

		# print(vectorizer.get_feature_names())
		Input = []

		for interval in tfidf.toarray():
			interval = sorted(interval,reverse=True)
			while len(interval) < IMPUT_SIZE:
				interval.append(0.0)
			Input.append(interval[:IMPUT_SIZE])
		if len(Input) < TIME_STEPS:
			for q in range(0,TIME_STEPS-len(Input)):
				Input.insert(0,[0.0] * IMPUT_SIZE)
		totalData.append(Input[:TIME_STEPS])
		totalDataLabel.append(Label)

	file_name = CX_WORD_DIR + "/BlackLists/BlackList" + str(FLAGS.N) + ".txt"
	f = open(file_name,'w')
	f.write(str(BlackList))
	f.close()
	print("Generating BlackList with N = ", TIME_STEPS, " done.")
def tfidfTransform(matrix):
    tfidf_transformer = TfidfTransformer()
    tfidf_matrix = tfidf_transformer.fit_transform(matrix)
    return tfidf_matrix
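
A minimal usage sketch for tfidfTransform: any document-term count matrix works, and TfidfTransformer is assumed to be imported as in the rest of these examples.

import numpy as np
counts = np.array([[3, 0, 1],
                   [0, 2, 0],
                   [1, 1, 1]])
print(tfidfTransform(counts).toarray())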
Example #39
twenty_test = fetch_20newsgroups(subset='test',
                                 categories=categories,
                                 shuffle=True)
print(len(twenty_train.data))
print(len(twenty_test.data))
print(twenty_train.target_names)
print("\n".join(twenty_train.data[0].split("\n")))
print(twenty_train.target[0])
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(twenty_train.data)
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
X_train_tfidf.shape
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

mod = MultinomialNB()
mod.fit(X_train_tfidf, twenty_train.target)
X_test_tf = count_vect.transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = mod.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(twenty_test.target, predicted))
print(
    classification_report(twenty_test.target,
                          predicted,
                          target_names=twenty_test.target_names))
print("confusion matrix is \n", confusion_matrix(twenty_test.target,
Example #40
def cluster_rows(sliced_data,
                 n_clusters=2,
                 cluster_method='PDN',
                 n_iters=100,
                 n_restarts=3,
                 cluster_prep_method=None,
                 cluster_penalty=1.0,
                 rand_gen=None,
                 sklearn_args=None):
    """
    A wrapper to abstract from the implemented clustering method

    cluster_method = GMM | DPGMM | HOEM
    """

    clustering = None

    #
    # slicing the data

    # allIndexes = numpy.arange(0, sliced_data.shape[0])
    # zerorowsIdx = allIndexes[numpy.sum(sliced_data, 1) == 0]
    # datarowsIdx = allIndexes[numpy.sum(sliced_data, 1) > 0]
    # clustering_data = numpy.delete(sliced_data, zerorowsIdx, 0)
    clustering_data = sliced_data

    if cluster_prep_method == "tf-idf":
        tfidf_transformer = TfidfTransformer()
        clustering_data = tfidf_transformer.fit_transform(clustering_data)
    elif cluster_prep_method == "log+1":
        clustering_data = numpy.log(clustering_data + 1)
    elif cluster_prep_method == "sqrt":
        clustering_data = numpy.sqrt(clustering_data)
        # clustering_data = clustering_data
        # row_sums = clustering_data.sum(axis=1) + 0.001
        # clustering_data = clustering_data / row_sums[:, numpy.newaxis]
        # clustering_data = numpy.sqrt(clustering_data)

    # sliced_data_sum = numpy.sum(sliced_data, axis=1)
    # sliced_data = sliced_data / sliced_data_sum[:, numpy.newaxis]
    # sliced_data = numpy.sqrt(sliced_data)

    print("RUNNING CLUSTERING dims: " + str(sliced_data.shape) + " into: " +
          str(n_clusters) + " method: " + cluster_method + " pre: " +
          str(cluster_prep_method))
    #if sliced_data.shape[1] == 1:
    #    print("V" + str(data_slice.feature_ids))

    start_t = perf_counter()
    if cluster_method == 'PDN':

        assert cluster_prep_method == None

        clustering = ABPDN.pdnClustering(clustering_data,
                                         nM=n_clusters,
                                         maxIters=n_iters,
                                         max_depth=5)

    elif cluster_method == 'GMM':
        clustering_data = numpy.log(clustering_data + 1)
        #
        # retrieving other properties
        cov_type = sklearn_args['covariance_type'] \
            if 'covariance_type' in sklearn_args else 'diag'
        #
        # creating the cluster from sklearn
        gmm_c = sklearn.mixture.GMM(n_components=n_clusters,
                                    covariance_type=cov_type,
                                    random_state=rand_gen,
                                    n_iter=n_iters,
                                    n_init=n_restarts)

        #
        # fitting to training set
        try:
            gmm_c.fit(clustering_data)
        except Exception:
            pass

        #
        # getting the cluster assignment
        clustering = gmm_c.predict(clustering_data)

    elif cluster_method == "KMeans":
        clustering = KMeans(n_clusters=n_clusters,
                            random_state=rand_gen,
                            n_jobs=1).fit_predict(clustering_data)

    elif cluster_method == "RandomPartition":
        clustering = above(make_planes(1, clustering_data.shape[1]),
                           clustering_data)[:, 0]

    elif cluster_method == 'DPGMM':
        #
        # retrieving other properties
        cov_type = sklearn_args['covariance_type'] \
            if 'covariance_type' in sklearn_args else 'diag'
        verbose = sklearn_args['verbose']\
            if 'verbose' in sklearn_args else False

        dpgmm_c = sklearn.mixture.DPGMM(n_components=n_clusters,
                                        covariance_type=cov_type,
                                        random_state=rand_gen,
                                        n_iter=n_iters,
                                        alpha=cluster_penalty,
                                        verbose=verbose)

        #
        # fitting to training set
        dpgmm_c.fit(clustering_data)

        #
        # getting the cluster assignment
        clustering = dpgmm_c.predict(clustering_data)

    elif cluster_method == 'HOEM':
        raise NotImplementedError('Hard Online EM is not implemented yet')
    else:
        raise Exception('Clustering method not valid')

    end_t = perf_counter()

    print('Clustering done in %f secs' % (end_t - start_t))

    #    nI = sliced_data.shape[0]
    #    uniqueNi = len(set([tuple(x) for x in clustering_data]))
    #    print(nI, uniqueNi, sum(clustering))
    # guarantee that we have a partition
    #if sum(clustering) == 0:
    #split evenly in n clusters
    #    clustering = numpy.asarray((list(range(n_clusters))*math.ceil(nI/n_clusters))[0:nI])

    # print(sliced_data)
    print(list(map(lambda c: numpy.sum(clustering == c), range(n_clusters))))

    # clusteringComplete = numpy.zeros(data_slice.instance_ids.shape)
    # clusteringComplete[zerorowsIdx] = n_clusters
    # clusteringComplete[datarowsIdx] = clustering
    # return retrieve_clustering(clustering, data_slice.instance_ids[datarowsIdx])
    return clustering
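
A hedged call sketch for cluster_rows using the KMeans branch with the 'tf-idf' preparation; it assumes the module-level imports the function relies on (KMeans, TfidfTransformer, perf_counter, numpy) and an older scikit-learn in which KMeans still accepts n_jobs, as the code above does.

rng = numpy.random.RandomState(42)
toy_counts = rng.poisson(1.0, size=(40, 12))       # 40 rows of toy word counts
assignments = cluster_rows(toy_counts,
                           n_clusters=3,
                           cluster_method='KMeans',
                           cluster_prep_method='tf-idf',
                           rand_gen=42)             # forwarded as KMeans random_state
print(assignments[:10])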
Example #41
def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting_scheme='inverseDocumentFrequency', norm='l2', smooth_idf=True, sublinear_tf=False, output_type=False):
    corpus = np.array(table[input_col])
    if max_df == None:
        max_df = len(corpus)
    tf_vectorizer = CountVectorizer(stop_words='english', max_df=max_df, min_df=min_df, max_features=num_voca)
    tf_vectorizer.fit(corpus)
    csr_matrix_tf = tf_vectorizer.transform(corpus)
    tfidf_vectorizer = TfidfTransformer(norm=norm, use_idf=True, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    csr_matrix_tfidf = tfidf_vectorizer.fit_transform(csr_matrix_tf)

    voca_dict = sorted(tf_vectorizer.vocabulary_.items(), key=itemgetter(1))
    len_voca = len(voca_dict)
    
    # tf-idf table

    tfidf_table = pd.DataFrame()
    document_list = []
    docID_list = []
    if output_type == False:
        vocabulary_list = []
        label_table = pd.DataFrame()
        for doc in range(len(corpus)):
            docID_list += ['doc_{}'.format(doc) for _ in range(len_voca)]
            document_list += [str(corpus[doc]) for _ in range(len_voca)]
            vocabulary_list += [voca_dict[j][0] for j in range(len_voca)]
        label_table['document_id'] = docID_list
        label_table[input_col] = document_list
        label_table['vocabulary'] = vocabulary_list
        tfidf_table = label_table
        tfidf_table['frequency'] = np.ravel(csr_matrix_tf.todense())
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf score'] = np.ravel(csr_matrix_tfidf.todense())
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf score'] = list(map(float, np.array(tfidf_table['frequency'])))
    
    elif output_type == True:
        for doc in range(len(corpus)):
            docID_list += ['doc_{}'.format(doc) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
            document_list += [str(corpus[doc]) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
        tfidf_table['document_id'] = docID_list
        tfidf_table[input_col] = document_list
        tfidf_table['vocabulary'] = [voca_dict[i][0] for i in csr_matrix_tf.indices]
        tfidf_table['frequency'] = csr_matrix_tf.data
        data_list = []
        for doc in range(len(corpus)):
            data_list += [csr_matrix_tfidf.data[i]  for i in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])][::-1]
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf score'] = data_list
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf score'] = list(map(float, np.array(tfidf_table['frequency'])))

    else:
        raise_runtime_error("Please check 'output_type'.")
        
        # idf table
    
    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = [voca_dict[j][0] for j in range(len(voca_dict))]
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
        idf_table['idf weight'] = float(1)
        
    params = {
        'Input Column': input_col,
        'Max DF': max_df,
        'Min DF': min_df,
        'Number of Vocabularies': num_voca,
        'IDF Weighting Scheme': idf_weighting_scheme,
        'Norm': norm,
        'Smooth IDF': smooth_idf,
        'Sublinear TF': sublinear_tf,
        'Remove Zero Counts': output_type
    }
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# TF-IDF Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    |{display_params}
    |
    |### IDF Table
    |
    |{idf_table}
    |
    |### TFIDF Table
    |
    |{tfidf_table}
    |
    """.format(display_params=dict2MD(params), idf_table=pandasDF2MD(idf_table, num_rows=200), tfidf_table=pandasDF2MD(tfidf_table, num_rows=200))))

    model = _model_dict('tfidf')
    model['csr_matrix_tf'] = csr_matrix_tf
    model['csr_matrix_tfidf'] = csr_matrix_tfidf
    model['parameter'] = params
    model['idf_table'] = idf_table
    model['tfidf_table'] = tfidf_table
    model['_repr_brtc_'] = rb.get()
    
    return {'model' : model}
Example #42
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Variety))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(
        unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(
        bigrams[-N:])))

X_train, X_test, y_train, y_test = train_test_split(df['Description'],
                                                    df['Variety'],
                                                    random_state=0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, y_train)

# The classifier was fit on tf-idf features, so new documents must go through the same transform
print(clf.predict(tfidf_transformer.transform(
    count_vect.transform(["Tannins and acidity"]))))
#we get pinot noir as the prediction

print(clf.predict(tfidf_transformer.transform(
    count_vect.transform(["A rich blend of blackberry, strong flavors"]))))
#we get red blend

X_train, X_test, y_train, y_test = train_test_split(df['Description'],
                                                    df['Variety'],
                                                    random_state=0)
count_vect = CountVectorizer()
        #      from_data.append(1)

        from_data.append(0 if name == "sara" else 1)

        email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

# The string that you get for word_data[152]
word_data[152]

### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(word_data)
# Note: X is already tf-idf weighted by the TfidfVectorizer above, so this extra
# TfidfTransformer pass is redundant and its output is not used below
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)

vector = vectorizer.get_feature_names()

# How many unique words are there in your TfIdf?
print len(vector)
# What is word number 34597 in your TfIdf?
vector[34597]
Example #44
0

def remove_stopwords(text, stopwords):
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stopwords]
    return " ".join(filtered_tokens)


if __name__ == "__main__":
    corpus = [
        "我 来到 北京 清华大学", "他 来到 了 网易 杭研 大厦", "小明 硕士 毕业 与 中国 科学院", "我 爱 北京 天安门"
    ]
    vectorizer = CountVectorizer(
    )  # converts the words in the texts into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf = transformer.fit_transform(vectorizer.fit_transform(
        corpus))  # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
    word = vectorizer.get_feature_names()  # all words in the bag-of-words model
    print(word)
    print(len(word))
    weight = tfidf.toarray()  # extract the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
    print(weight.shape)
    for i in range(
            len(weight)):  # print each document's tf-idf weights: the outer loop walks the documents, the inner loop the words
        print("------- tf-idf weights of the words in document", i, "------")
        for j in range(len(word)):
            print(word[j], weight[i][j])

    vectorizer = TfidfVectorizer(encoding="utf8")
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_
    print(idf)
    "Football": 2,
    "Film": 3,
    "Technology": 4
})
Y = df["Category"]

count_vect = CountVectorizer(stop_words=sw)
vectorizer = TfidfTransformer()

# In[7]:

test = pd.read_csv('test_set.csv', sep='\t')
testX = test["Title"] + test["Content"]

X_train_counts = count_vect.fit_transform(X)
X_train_counts = vectorizer.fit_transform(X_train_counts)
svd = TruncatedSVD(n_components=200)
X_lsi = svd.fit_transform(X_train_counts)

parameters = {'C': [1, 10]}
svr = svm.LinearSVC()
clf = GridSearchCV(svr, parameters)

clf = clf.fit(X_lsi, Y)

X_test_counts = count_vect.transform(testX)
X_test_counts = vectorizer.transform(X_test_counts)
X_test_counts = svd.transform(X_test_counts)
predicted = clf.predict(X_test_counts)

output = np.zeros((len(predicted), 2), dtype=object)
Example #46
0
    news_train_data.append(x[2])

news_train_data_target = []
for x in news_train:
    news_train_data_target.append(x[0])

#build a vocabulary of characteristic features and convert the documents into feature vectors
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(news_train_data)
#compute term frequencies and inverse document frequencies
tf_transformer = TfidfTransformer(use_idf=False).fit(
    X_train_counts)  #fit the estimator on the data
X_train_tf = tf_transformer.transform(
    X_train_counts)  #transform the count matrix into tf form
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(
    X_train_counts)  #or do both steps at once, which is faster
#train the classifier
clf = MultinomialNB().fit(
    X_train_tfidf,
    news_train_data_target)  #multinomial Naive Bayes classifier
docs_new = ['В Ираке новые танки']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
#######################
#To chain vectorizer => transformer => classifier, use a Pipeline (a usage sketch follows the definition below)
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
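# A minimal usage sketch of the pipeline defined above: fit the whole chain in one call,
# reusing the training data and the query document from earlier in this example
text_clf.fit(news_train_data, news_train_data_target)
print(text_clf.predict(docs_new))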
def docdir_handler_tfidf(dir_path, f, stop_word_list=stop_words, stop_word_pattern_list=stop_word_patterns, scale=0.9, frange=(0, )):
    '''
    Walk every document in a directory, segment each one, and call the callback f on each document's words.
    A tf-idf pass is run first to drop unimportant words: words whose corpus-wide tf-idf rank falls
    below the `scale` fraction (default 0.9) are filtered out before f is called.
    :param dir_path:
    :param f: f(index, word) means "this word occurs in document number index"; use globals, closures or references to pass values out or act on them
    :return: all file names and the original documents

    An example of f, which joins each document's words into one string and keeps them in a list:
    corpus = []
    def f(index, word):
        while(len(corpus) <= index):
            corpus.append('')
        corpus[index] += ' ' + word

    '''
    filenames = []
    docs = []
    corpus = []
    filtered_words = set()
    print('start cut....')
    print('start filter stopword...')
    for index, filename in enumerate(os.listdir(dir_path)):
        if (len(frange) > 0 and frange[0] > index) or (len(frange) > 1 and frange[1] <= index):
            continue
        filenames.append(filename)
        try:
            td_file = open(os.path.join(dir_path, filename))
            td_content = td_file.read()
        finally:
            td_file.close()
        docs.append(td_content)
        seg_list = jieba.cut(td_content)
        for word in seg_list:
            word = word.strip()
            # check whether the token is a stopword
            if len(word) > 0 and word not in filtered_words and word not in stop_word_list and not pattern_check(word, stop_word_pattern_list):
                while (len(corpus) <= index):
                    corpus.append([])
                corpus[index].append(word)
            else:
                # remember removed words to speed up later checks
                filtered_words.add(word)
    vectorizer = CountVectorizer(dtype=np.int32)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(
        vectorizer.fit_transform([' '.join(doc) for doc in corpus]))
    vocas = vectorizer.get_feature_names()
    # weight = tfidf.toarray()
    weight = tfidf
    tfidf_filtered_count = 0
    # compute the tf-idf aggregated over all documents
    col_weight = np.sum(weight, 0)
    ti = list(np.asarray(vocas)[np.argsort(col_weight)])
    for index, row in enumerate(weight):
        # per-document tf-idf ranking (unused alternative)
        # ti = list(np.asarray(vocas)[np.argsort(row)])
        for word in corpus[index]:
            try:
                if ti.index(word) >= len(vocas) * scale:
                    f(index, word)
            except ValueError:  # the word is not in the vectorizer vocabulary
                f(index, word)
            else:
                tfidf_filtered_count += 1
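# A usage sketch of the callback pattern described in the docstring above. 'docs/' is a
# placeholder directory of text files, and the callback mirrors the example from the docstring;
# per the docstring, the function hands back the file names and the raw documents.
corpus_out = []

def collect(index, word):
    while len(corpus_out) <= index:
        corpus_out.append('')
    corpus_out[index] += ' ' + word

filenames, raw_docs = docdir_handler_tfidf('docs/', collect)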
Example #48
0
def create_word_matrix(text, way=1):
    """
    中文文本调用分词;
    轻量级调用根据需求调用方法一二三;
    重量级调用方法四
    """
    if way in [1, 2, 3, 4]:
        # represent the review text as a bag of words
        if way == 1:
            from sklearn.feature_extraction.text import CountVectorizer
            vectorizer = CountVectorizer()
            X_tr_bow = vectorizer.fit_transform(text)
            X_te_bow = vectorizer.transform(text)  # split into training and test sets
            #word_llen = len(vectorizer.vocabulary_)
            #print(vectorizer.get_feature_names())
            word_matrix = X_tr_bow.toarray()

            return X_tr_bow, X_te_bow, word_matrix

        if way == 2:
            # build tf-idf from the bag-of-words matrix
            from sklearn.feature_extraction.text import CountVectorizer
            from sklearn.feature_extraction.text import TfidfTransformer

            # the bag-of-words matrices must be rebuilt here, since the way == 1 branch does not run
            vectorizer = CountVectorizer()
            X_tr_bow = vectorizer.fit_transform(text)
            X_te_bow = vectorizer.transform(text)

            tfidf_trfm = TfidfTransformer(norm=None)
            X_tr_tfidf = tfidf_trfm.fit_transform(X_tr_bow)
            X_te_tfidf = tfidf_trfm.transform(X_te_bow)

            return X_tr_tfidf, X_te_tfidf

        if way == 3:
            # normalize the tf-idf matrix
            from sklearn.feature_extraction.text import CountVectorizer
            vectorizer = CountVectorizer()
            X_tr_bow = vectorizer.fit_transform(text)
            X_te_bow = vectorizer.transform(text)  # split into training and test sets
            word_llen = len(vectorizer.vocabulary_)
            from sklearn.feature_extraction.text import TfidfTransformer
            from sklearn.preprocessing import StandardScaler
            tfidf_trfm = TfidfTransformer(norm=None)
            X_tr_tfidf = tfidf_trfm.fit_transform(X_tr_bow)
            X_te_tfidf = tfidf_trfm.transform(X_te_bow)
            from sklearn.preprocessing import Normalizer
            scaler = Normalizer().fit(X_tr_tfidf)
            normalized_X = scaler.transform(X_tr_tfidf)
            normalized_X_test = scaler.transform(X_te_tfidf)

            return normalized_X, normalized_X_test

        if way == 4:
            import jieba
            from sklearn.feature_extraction.text import TfidfVectorizer
            all_list = [" ".join(jieba.cut(s, cut_all=False))
                        for s in text]  # build the initial segmented word list
            stpwrdpath = r"E:\MyMySql\feature\data\stopword.txt"
            with open(stpwrdpath, 'rb') as fp:
                stopword = fp.read().decode('utf-8')  # read the stopwords
            stopwordlist = stopword.splitlines()  # convert the stopwords to a list
            tfidf = TfidfVectorizer(stop_words=stopwordlist)  # build the tf-idf vectorizer
            X_tf = tfidf.fit_transform(all_list).toarray()
            X_tr_tfidf = X_tf[:-1]
            X_te_tfidf = X_tf[-1]

            return X_tr_tfidf, X_te_tfidf
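# A usage sketch of create_word_matrix (illustration only). The input must already be
# word-segmented Chinese joined with spaces, since CountVectorizer splits on whitespace;
# the two review strings below are made-up examples.
reviews = ["这家 餐厅 很 好吃", "服务 态度 一般 价格 便宜"]
X_tr_bow, X_te_bow, word_matrix = create_word_matrix(reviews, way=1)
X_tr_tfidf, X_te_tfidf = create_word_matrix(reviews, way=2)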
Example #49
0
count_vect = CountVectorizer(ngram_range=(1, 1), max_df=0.1)
#print(count_vect)
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_counts = count_vect.fit_transform(data[:1050])
#print(X_train_counts)
# X_train_counts2 = count_vect.transform(data[5000:])
testData = count_vect.transform(testData)
#print(testData)

# print (X_train_counts.shape ,X_train_counts2.shape )

# # X_train_counts = vstack([X_train_counts, X_train_counts2]).toarray()
# x1 = X_train_counts.toarray().tolist()
# x2 = X_train_counts2.toarray().tolist()
# xf = x1 + x2
# train = np.asarray(xf)

trainData = tfidf_transformer.fit_transform(X_train_counts)
# trainData2 = tfidf_transformer.transform(X_train_counts[10000:])

# trainData = [*trainData , *trainData2]
# print (trainData.shape , trainData2.shape)
testData = tfidf_transformer.transform(testData)
# print (trainData.shape)

clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0)
clf.fit(trainData.toarray(), labels[:1050])
ans = clf.predict(testData.toarray())
for x in ans:
    print(x)
    if word not in stopwords.words('english')
]

#lowercase#
labeldata1.data = labeldata1.data.str.lower()

#remove punctuation (note: str.strip only removes leading/trailing question marks)#
labeldata1.data = labeldata1.data.str.strip("?")

# create count vector
count_vect = CountVectorizer()
counts = count_vect.fit_transform(labeldata1.data)

# create tfidf matrix
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(counts)

# To check which model suits the data best, initialize several classification models.
model = []

model.append(('SVM', SVC()))
model.append(('LDA', LinearDiscriminantAnalysis()))
model.append(('CART', DecisionTreeClassifier()))
model.append(('NB', GaussianNB()))
model.append(('RF', RandomForestClassifier()))

# Estimate each model's accuracy with k-fold cross validation
results = []
names = []
scoring = 'accuracy'
for name, model in model:
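    # Sketch only: the original loop body is cut off in this snippet. Assuming `y` holds the
    # label column (it is not shown above), a typical 10-fold evaluation of each model would be:
    from sklearn.model_selection import KFold, cross_val_score
    kfold = KFold(n_splits=10, shuffle=True, random_state=7)
    # GaussianNB and LinearDiscriminantAnalysis need dense input, hence .toarray()
    cv_results = cross_val_score(model, tfidf.toarray(), y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))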
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 18 09:08:44 2018

@author: xsxsz
"""

import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB

train=datasets.fetch_20newsgroups(subset='train')
test=datasets.fetch_20newsgroups(subset='test',shuffle=True,random_state=10)

# TfidfTransformer expects a term-count matrix, so vectorize the raw text first
count_vect=CountVectorizer()
X_train_counts=count_vect.fit_transform(train.data)
X_test_counts=count_vect.transform(test.data)

tfidf=TfidfTransformer()

X_train_tfidf=tfidf.fit_transform(X_train_counts)
X_test_tfidf=tfidf.transform(X_test_counts)

# GaussianNB needs dense arrays, which is memory-hungry on 20newsgroups;
# MultinomialNB is the usual choice for sparse text features
clf=GaussianNB()
clf.fit(X_train_tfidf.toarray(),train.target)

predict=clf.predict(X_test_tfidf.toarray())
print(metrics.classification_report(test.target,predict,target_names=test.target_names))
Example #52
0
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
vector = CountVectorizer()
x = vector.fit_transform(corpus)
word = vector.get_feature_names()

print(x.toarray())
print(word)

transform = TfidfTransformer()
print(transform)
tfidf = transform.fit_transform(x)
print(tfidf.toarray())
Example #53
0
def get_tf_idf(X_train_count):
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
    print "TF IDF Done", X_train_tfidf.shape
    return X_train_tfidf
Example #54
0
def BOW_NN(directory):
    corpus, label = read_documents(directory)
    print "Corpus Size: " + str(len(corpus))
    print "Label Size: " + str(len(label))

    """
    vectorize the corpus and convert it to a matrix. in the matrix, a row is a document, a column is a token(word).
    """
    vectorizer = CountVectorizer(min_df=1)
    sparse_matrix = vectorizer.fit_transform(corpus)
    print("Original BOW Matrix Shape: ")
    print(sparse_matrix.toarray().shape)


    """
    tf-idf weighting
    """
    transformer = TfidfTransformer(smooth_idf=True)
    tfidf = transformer.fit_transform(sparse_matrix)
    tfidf = tfidf.todense()
    tfidf = np.array(tfidf)

    """
    #SVD for LSA
    """
    svd = TruncatedSVD(300)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced_matrix = lsa.fit_transform(tfidf)
    print(reduced_matrix.shape)

    print("Reduced Matrix Shape: ")
    print reduced_matrix.shape

    #print("Tf-idf Matrix Shape: ")
    #print tfidf.shape
    cv_index = []

    #Generate a list of random numbers for cross validation use.
    for i in range(len(corpus)):
        cv_index.append(np.random.randint(0, 10))

    #Start 10-fold Cross Validation:
    score_array = []
    max_score = 0.0

    for j in range (10):
        #datasets, labelsets = make_idx_data_cv(tfidf, label, cv_index, j)
        datasets, labelsets = make_idx_data_cv(reduced_matrix, label, cv_index, j)

        """
        #Neural Networks Classification
        """
        print "====================Start Neural Networks Classifier Training " + str(j+1) + "=============================="
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(500, ), random_state=1)
        print "Training Dataset Shape: "
        print datasets[0].shape

        clf.fit(datasets[0], labelsets[0])
        print "Training Complete."

        #Start predicting testing corpus.
        print "Start Predicting..."
        score = clf.score(datasets[1], labelsets[1])
        score_array.append(score)
        #if score > max_score:
            #max_score = score
            # Save classifier using Joblib. this method can only pickle a model to the disk
            # To load back the model, use "clf = joblib.load("filename.pkl")."
            #joblib.dump(clf, 'E:/A1113/FYP/BloombergNews/BloombergNews/SVM+BOW.pkl')
        print "Testing data accuracy :"
        print score

    print "====================Cross Validation Complete=============================="
    #print "Highest accuracy is " + str(max_score)
    print "Average accuracy is " + str(np.mean(score_array))
Example #55
0
while os.path.exists(splitfilename):
    with open(splitfilename, 'r+', encoding='UTF-8-sig',
              errors='ignore') as wf:
        word_lst = []
        a = wf.read()
        #word_lst = list(a.split(','))
        a = "".join(a)
        c.append(a)
        #word_lst.append(a.split(' '))

    num += 1
    splitfilename = "split_text/split" + str(num) + ".txt"

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(c))

word = vectorizer.get_feature_names()  # keywords across all the texts
weight = tfidf.toarray()  # the corresponding tf-idf matrix
#print(weight)

# n_clusters sets the number of clusters to produce (177 here)
kmeans = KMeans(n_clusters=177, random_state=0).fit(weight)
# center holds each cluster's centroid, stored in the df_center DataFrame for labelling the data
center = kmeans.cluster_centers_
df_center = pd.DataFrame(center)
# record each point's cluster assignment
labels = kmeans.labels_
# print(labels)
# print(kmeans.labels_)
print(kmeans.cluster_centers_)
Example #56
0
Vectorize with TF-IDF weighting
TfidfTransformer()
norm='l2' sets how each document's feature vector is normalized.
L2 : scale each vector so the sum of the squared elements is 1; this is the default (Euclidean length)
L1 : scale each vector so the sum of the absolute values is 1 (Manhattan length)
smooth_idf=False
whether to build features by adding a small value (smoothing) to entries that come out as zero, or to leave them as they are
sublinear_tf=False
use_idf=True
whether to build the features with TF-IDF or with raw term frequency
'''
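# A small self-contained sketch of the norm options described above. This block is an added
# illustration, not part of the original script; the tiny count matrix below is made up.
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer as _TfidfDemo

_demo_counts = np.array([[2, 1, 0], [1, 1, 1], [0, 2, 3]])
_l2 = _TfidfDemo(norm='l2').fit_transform(_demo_counts).toarray()
_l1 = _TfidfDemo(norm='l1').fit_transform(_demo_counts).toarray()
print((_l2 ** 2).sum(axis=1))   # every row sums to ~1.0 under L2 normalization
print(np.abs(_l1).sum(axis=1))  # every row sums to ~1.0 under L1 normalization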
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
# print(transformer) # TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False, use_idf=True)
feature_tfidf = transformer.fit_transform(feature_vector)
'''
%%time 
feature_tfidf.shape
'''
tfidf_freq = pd.DataFrame(feature_tfidf.toarray(), columns=vocab)

df_tfidf = pd.DataFrame(tfidf_freq.sum())
df_tfidf_top = df_tfidf.sort_values(by=0, ascending=False)
# print(df_tfidf_top.head())
'''
Clustering
    - K-Means
    - MiniBatchKMeans
https://scikit-learn.org/stable/auto_examples/cluster/plot_mini_batch_kmeans.html
'''
Example #57
0
# print(df.head())
df = pd.read_csv('./movie_data.csv')

from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining', 'The weather is sweet',
    'The sun is shining and the weather is sweet'
])

bag = count.fit_transform(docs)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform((count.fit_transform(docs))).toarray())

# preprocess the text with regular expressions
import re


def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace(
        '-', '')
    return text


df['review'] = df['review'].apply(preprocessor)
Example #58
0
for index, row in clickbait.iterrows():
    if isinstance(row["status_message_without_tags"], str):
        texts.append(row["status_message_without_tags"])
        y.append('c')

# min_df is the minimum frequency
word_vectorizer = CountVectorizer(analyzer='word', min_df=4)
X = word_vectorizer.fit_transform(texts)
#print("Shape of X: ", X.shape)
#print("Vocabulary", word_vectorizer.vocabulary_)
#print("Vocabulary length = ", len(word_vectorizer.vocabulary_))

# 'Normalize' the count matrix X (more precisely, scale down the impact of the most frequent terms)
tfid = TfidfTransformer()
X_normalized = tfid.fit_transform(X)
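# A quick added check of what the normalization above does: every row of X_normalized has
# unit L2 norm, so long posts no longer dominate purely because they contain more words.
import numpy as np
row_norms = np.sqrt(np.asarray(X_normalized.multiply(X_normalized).sum(axis=1)).ravel())
print("Row norms after tf-idf:", row_norms[:5])  # ~1.0 for every document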

#####################################################
# 3 CLASSIFICATION FOR RAW X AND X NORMALIZED       #
#####################################################

model(X, y)

# But normalized measures of precision and recall improve

model(X_normalized, y)

# def feature_extraction(text):
#    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
#    extracted_features = [len(text), text.count('!'), len(urls)]
#    return extracted_features
Example #59
0
		


count_vect1 = CountVectorizer(min_df=3, stop_words='english')


# class_1_Count = count_vect1.fit_transform(class_1.data)
# class_1_tfidf = tfidf_transform.fit_transform(class_1_Count)
# print "Number of terms in class1 data TF-IDF representation:",class_1_tfidf.shape

# class_2_Count = count_vect1.fit_transform(class_2.data)
# class_2_tfidf = tfidf_transform.fit_transform(class_2_Count)
# print "Number of terms in class2 data TF-IDF representation:",class_2_tfidf.shape

totalCount = count_vect1.fit_transform(total.data)
totalData_tfidf = tfidf_transform.fit_transform(totalCount)
#print "Number of terms in combined data TFxIDF representation:",totalData_tfidf.shape

labels = [ int(x / 4) for x in total.target]



svd = TruncatedSVD(n_components=2)
totalLSI = svd.fit_transform(totalData_tfidf)
kmeans = KMeans(n_clusters=2, n_init=30).fit(totalLSI)

x1 = totalLSI[kmeans.labels_ == 0][:, 0]
y1 = totalLSI[kmeans.labels_ == 0][:, 1]

plt.plot(x1,y1,'r+', label='Computer Technology')
x2 = totalLSI[kmeans.labels_ == 1][:, 0]
Example #60
-1
def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing make IDF fragile in the presence of feature with
    # only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    # First we need to verify that numpy here provides div 0 warnings
    with warnings.catch_warnings(record=True) as w:
        1. / np.array([0.])
        numpy_provides_div0_warning = len(w) == 1

    with warnings.catch_warnings(record=True) as w:
        tfidf = tr.fit_transform(X).toarray()
        if not numpy_provides_div0_warning:
            raise SkipTest("Numpy does not provide div 0 warnings.")
        assert_equal(len(w), 1)
        # For Python 3 compatibility
        if hasattr(w[0].message, 'args'):
            assert_true("divide by zero" in w[0].message.args[0])
        else:
            assert_true("divide by zero" in w[0].message)