from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def preprocess():
    # load_file() is assumed to return (documents, labels)
    data, target = load_file()
    # binary=True records word presence/absence instead of raw counts
    count_vectorizer = CountVectorizer(binary=True)
    data = count_vectorizer.fit_transform(data)

    # use_idf=False keeps plain (normalized) term frequencies, no idf weighting
    tfidf_data = TfidfTransformer(use_idf=False).fit_transform(data)

    return tfidf_data
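A minimal sketch of driving preprocess(), assuming a hypothetical load_file() that returns parallel lists of documents and labels (the real one is not shown in the example):

# Hypothetical stand-in for load_file() -- an assumption, not part of the original
def load_file():
    docs = ['good movie', 'bad movie', 'great plot', 'terrible acting']
    labels = [1, 0, 1, 0]
    return docs, labels

tfidf_data = preprocess()
print(tfidf_data.shape)  # (n_documents, n_unique_terms)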
Example #2
            label_inds.append(label)
            corpus.append(' '.join(doc))
    # print(len(corpus))

    label_inds = np.array(label_inds)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    # print(len(vectorizer.get_feature_names()))
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    # print(tfidf.toarray().shape)
    # print(label_inds.shape)

    # Random 80/20 split; the mask selects the training rows
    train_mask = np.random.rand(len(corpus)) < 0.8
    tfidf_dense = tfidf.toarray()  # densify once instead of twice
    train_X = tfidf_dense[train_mask]
    train_Y = label_inds[train_mask]
    test_X = tfidf_dense[~train_mask]
    test_Y = label_inds[~train_mask]
    # print(train_X.shape, test_Y.shape)

    #  KNN
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(train_X, train_Y)
    print(clf.score(test_X, test_Y))

    #  SVM
    clf = SVC()
    clf.fit(train_X, train_Y)
    print(clf.score(test_X, test_Y))
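The random mask above gives a slightly different split on every run; a sketch of the same split done with scikit-learn's train_test_split, keeping the matrix sparse (both KNeighborsClassifier and SVC accept sparse input):

from sklearn.model_selection import train_test_split

# tfidf and label_inds as built above
train_X, test_X, train_Y, test_Y = train_test_split(
    tfidf, label_inds, test_size=0.2, random_state=42)
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(train_X, train_Y)
print(clf.score(test_X, test_Y))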
Example #4
content = [open(os.path.join(PATH, 'text', str(f))).read() for f in metadata.id]

# Go from text to document-term-matrix ('vectorize' data)
vectorizer = CountVectorizer(decode_error = 'ignore')
X = vectorizer.fit_transform(content)

# Select the 50 best features by chi-squared score
select = SelectKBest(chi2, k=50)
select.fit(X[train,:], y[train])

# Xs = matrix restricted to the 50 selected features
Xs = select.transform(X)

# Weighting the matrix with tf-idf
Xw = TfidfTransformer().fit_transform(Xs)
Xw = Xw.toarray()
# Extract feature names
feature_names = np.array(vectorizer.get_feature_names())[select.get_support()]

# Build an Orange data table; this is not entirely straightforward.
# The meta id must exist before the domain that references it is built.
id = Orange.feature.Descriptor.new_meta_id()
features = [Orange.feature.Continuous(str(b)) for b in feature_names]
classes = Orange.feature.Discrete("class", values=['0', '1'])
domain = Orange.data.Domain(features, classes, id)
feat_list = []
for i in range(Xw.shape[0]):
    # Labels >= 3 are treated as unknown (None)
    feat_list.append(list(Xw[i, :]) + [int(y[i]) if y[i] < 3 else None])
out_data = Orange.data.Table(domain, feat_list)
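The vectorize/select/weight steps chain naturally; a sketch of the same chi2 selection and tf-idf weighting expressed as a scikit-learn Pipeline (the Orange table construction is left out, and unlike the original the vocabulary here comes from the training rows only):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2

pipe = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore')),
    ('select', SelectKBest(chi2, k=50)),
    ('tfidf', TfidfTransformer()),
])
# train is assumed to be an array of row indices, as the slicing above suggests
pipe.fit([content[i] for i in train], y[train])
Xw = pipe.transform(content).toarray()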
Example #5
            # Strip HTML, lower-case, remove punctuation, tokenize
            soup = BeautifulSoup(text, "lxml")
            text = soup.get_text()
            text = text.lower()
            text = text.translate(trantab)
            text = tokenize(text)
            corpus.append(text)
        df[colname] = corpus
    return df


posts_clean = clean_text(posts)
title_tfidf = TfidfTransformer().fit_transform(
    CountVectorizer().fit_transform(posts_clean['title']))
content_tfidf = TfidfTransformer().fit_transform(
    CountVectorizer().fit_transform(posts_clean['content']))

# Note: densifying just to concatenate and re-sparsify is wasteful;
# scipy.sparse.hstack([title_tfidf, content_tfidf]) does this directly.
X = np.concatenate((title_tfidf.toarray(), content_tfidf.toarray()), axis=1)
X = scipy.sparse.csr_matrix(X)
y = multi_labels

#%% classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
    BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier,
    VotingClassifier)
from sklearn.linear_model import (SGDClassifier, LogisticRegression,
    RidgeClassifier, PassiveAggressiveClassifier)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
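None of the imported classifiers is actually fit in this fragment; a sketch of training one of them on the sparse X, assuming multi_labels is a binary indicator matrix (one column per tag), wrapped in OneVsRestClassifier for multi-label output:

from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = OneVsRestClassifier(LinearSVC())  # one binary LinearSVC per label
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # subset accuracy for multi-label y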
Example #6
import jieba
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_tfidf_dictionary():
    # train is a DataFrame defined elsewhere in the source file;
    # the 'title_spilt' column name is kept as it appears there.
    tfidf = TfidfTransformer(norm=None).fit_transform(
        CountVectorizer().fit_transform(train['title_spilt']))
    return tfidf.toarray()
# Load stop words; a set makes the membership test in get_data fast
stop = set()
with open('stop_words.txt', 'r', encoding='utf-8') as f:
    for line in f:
        stop.update(line.split())


def get_data(x):
    # Segment Chinese text with jieba, dropping stop words and single characters
    words = jieba.cut(x)
    kept = [w for w in words if len(w) >= 2 and w not in stop]
    return ' '.join(kept)


# Text preprocessing
textitor = map(get_data, content)

# Convert the text to a tf-idf matrix
vectorize = CountVectorizer()
vt = vectorize.fit_transform(textitor)
tfidf = TfidfTransformer().fit_transform(vt)
word = vectorize.get_feature_names()
weight = tfidf.toarray()
print(DataFrame(weight, columns=word))
'''
x = sorted([ (n,i.sum()) for n, i in enumerate(weight.T)],key = lambda x : x[1], reverse = True)
for i in range(5):
    print(word[x[i][0]],x[i][1])
'''
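get_tfidf_dictionary never builds an actual dictionary; a sketch of mapping each term to its summed tf-idf weight, reusing word and weight from above:

# Map each vocabulary term to its total tf-idf weight across all documents
tfidf_dict = dict(zip(word, weight.sum(axis=0)))
# Five heaviest terms, mirroring the commented-out snippet above
for term, score in sorted(tfidf_dict.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print(term, score)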
Example #8
print('Result demo array is {}'.format(res_demo.toarray()))

# res_demo is a 2-d document-term matrix.
# Notice the 2 in the second row: 'bad' appears twice in that
# sentence, so the 2 corresponds to the word 'bad'.
print('Feature list: {}'.format(cv_demo.get_feature_names()))

print('The data type of bow matrix {}'.format(type(cv_matrix)))
print('Shape of the matrix {}'.format(cv_matrix.get_shape()))
print('Size of the matrix is: {}'.format(sys.getsizeof(cv_matrix)))
print(cv.get_feature_names())
print(cv_matrix.toarray())

normal_matrix = TfidfTransformer().fit_transform(cv_matrix)
print(normal_matrix.toarray())

print(normal_matrix.T.toarray())
# Similarity graph: dot products between tf-idf rows
res_graph = normal_matrix * normal_matrix.T
# plt.spy(res_graph)

# In NetworkX >= 3.0 this function is nx.from_scipy_sparse_array
nx_graph = nx.from_scipy_sparse_matrix(res_graph)
nx.draw_circular(nx_graph)
print('Number of edges {}'.format(nx_graph.number_of_edges()))
print('Number of vertices {}'.format(nx_graph.number_of_nodes()))
# plt.show()
print('The memory used by the graph in Bytes is: {}'.format(
    sys.getsizeof(nx_graph)))

ranks = nx.pagerank(nx_graph)
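ranks maps each node index to its PageRank score; a sketch of turning that into an extractive summary, assuming a sentences list aligned with the rows of cv_matrix (not shown in this fragment):

# sentences is assumed to hold the original sentences, one per matrix row
top = sorted(ranks, key=ranks.get, reverse=True)[:3]
summary = ' '.join(sentences[i] for i in sorted(top))  # restore document order
print(summary)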
Example #9
# In[13]:

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
bow = cv.fit_transform(train['Message'])

# In[14]:

#converting bow to tfidf

# In[15]:

from sklearn.feature_extraction.text import TfidfTransformer
bow = TfidfTransformer().fit_transform(bow)
# Densify and convert to nested lists so extra features can be appended per row
bow = bow.toarray()
bow = bow.tolist()

# In[16]:

#concatenating length to bow

# In[62]:

# Append each message's length as an extra feature (the dataset has 5572 rows)
for i in range(len(bow)):
    bow[i].append(train['Length'].iloc[i])
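Densifying the whole matrix just to append one column is memory-hungry for larger corpora; a sketch of the same concatenation done sparsely, assuming bow is still the sparse matrix returned by TfidfTransformer (i.e. before the toarray()/tolist() conversion above):

import scipy.sparse

lengths = train['Length'].values.reshape(-1, 1)
bow_sparse = scipy.sparse.hstack([bow, lengths]).tocsr()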

# In[18]:

#splitting train_test
Example #10
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Corpus
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
# Convert the words in the text into a term-frequency matrix
vectorizer = CountVectorizer()
# Count the occurrences of each word
X = vectorizer.fit_transform(corpus)
# Compute the tf-idf values
tfidf = TfidfTransformer().fit_transform(X)
# Get all the keywords in the bag-of-words vocabulary
word = vectorizer.get_feature_names()
print(word)
# Inspect the term-frequency result
print(X.toarray())
print(tfidf.toarray())
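CountVectorizer followed by TfidfTransformer is exactly what TfidfVectorizer does in one step; a sketch of the equivalent call on the same corpus:

from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer()  # CountVectorizer + TfidfTransformer in one estimator
tfidf2 = tv.fit_transform(corpus)
print(tfidf2.toarray())  # matches print(tfidf.toarray()) above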