Example 1
    def to_vector(self, title_list):

        vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=self.MAX_DF)
        vectorizer.max_features = self.MAX_FEATURES
        vectorizer.fit(title_list)
        tf = vectorizer.transform(title_list)

        lsa = TruncatedSVD(self.LSA_DIM)
        lsa.fit(tf)
        tf = lsa.transform(tf)
        return tf, vectorizer, lsa
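The returned vectorizer and lsa objects can be reused to project unseen titles into the same reduced space. A minimal usage sketch, assuming model is an instance of the class above and new_titles is a list of strings (both names are assumptions, not part of the original project):

# hypothetical caller code
tf, vectorizer, lsa = model.to_vector(title_list)
new_tf = lsa.transform(vectorizer.transform(new_titles))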
Example 2
def get_keywords(docs, max_feature, stopwords=None):
    vectorizer = TfidfVectorizer(max_features=max_feature,
                                 min_df=3,
                                 stop_words=stopwords)
    try:
        vectorizer.fit(docs)
    except ValueError:
        # min_df=3 can prune the whole vocabulary on a small corpus; retry with relaxed limits
        vectorizer.min_df = 1
        vectorizer.max_features = 30
        vectorizer.fit(docs)
    return vectorizer.vocabulary_
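Assigning min_df and max_features as attributes works because scikit-learn estimators only read their hyperparameters when fit is called. The same fallback can be written with set_params; a minimal sketch, reusing the docs and stopwords names assumed above:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=max_feature, min_df=3, stop_words=stopwords)
try:
    vectorizer.fit(docs)
except ValueError:
    # retry with relaxed pruning thresholds
    vectorizer.set_params(min_df=1, max_features=30)
    vectorizer.fit(docs)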
Example 3
def to_vector(self, text_set, MAX_DF, MAX_FEATURES, LSA_DIM):
    '''
    Convert to a bag-of-words representation and reduce its dimensionality.
    '''
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF, stop_words=stopwords)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(text_set)

    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)

    return X, lsa, vectorizer
Example 4
def transform_data(filename, MAX_DF=0.9, MAX_FEATURES=500, LSA_DIM=100):
    '''MeCab template: read the file, morphologically analyze the titles, reduce the
    dimensionality and normalize. Returns the dataset and the matrix of titles.'''
    data = pd.read_csv(filename)
    title = []
    for i in data.index:
        title.append(data.loc[i, 'Title'])

    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF, stop_words=stopwords)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(title)

    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)

    return data, X
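A minimal usage sketch for the function above; the CSV path and the cluster count are assumptions, not part of the original project:

from sklearn.cluster import KMeans

data, X = transform_data('articles.csv')        # hypothetical input file
km = KMeans(n_clusters=8, n_init=10).fit(X)     # cluster the normalized LSA vectors
data['cluster'] = km.labels_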
Example 5
def main(filename):
    # load tweets
    tweets = get_tweets_from_csv(filename)

    # feature extraction
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(tweets)

    # dimensionality reduction by LSA
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)

    # clustering by KMeans
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS,
                             init='k-means++',
                             batch_size=1000,
                             n_init=10,
                             max_no_improvement=10,
                             verbose=True)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS,
                    init='k-means++',
                    n_init=1,
                    verbose=True)
    km.fit(X)
    labels = km.labels_

    transformed = km.transform(X)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]

    # sort by distance
    clusters = []
    for i in range(NUM_CLUSTERS):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(tweets[int(j)])
        clusters.append(cluster)

    return clusters
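The TfidfVectorizer -> TruncatedSVD -> Normalizer sequence used here (and in the following examples) can also be packaged as a single scikit-learn pipeline. A sketch, with placeholder constants standing in for MAX_DF, MAX_FEATURES and LSA_DIM:

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

lsa_pipeline = make_pipeline(
    TfidfVectorizer(max_df=0.8, max_features=10000),   # placeholder values
    TruncatedSVD(n_components=100),
    Normalizer(copy=False),
)
X = lsa_pipeline.fit_transform(tweets)   # same X as the three explicit steps above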
Example 6
	def handle(self, *args, **options):
		# tweets
		ret    = Timeline.objects.all()[:100]
		tweets = [r.body for r in ret]
		
		# feature extraction
		vectorizer = TfidfVectorizer(analyzer = self.__analyzer, max_df = MAX_DF)
		vectorizer.max_features = MAX_FEATURES
		x = vectorizer.fit_transform(tweets)

		# dimensionality reduction by LSA
		lsa = TruncatedSVD(LSA_DIM)
		x = lsa.fit_transform(x)
		x = Normalizer(copy=False).fit_transform(x)

		# clustering by KMeans
		if MINIBATCH:
			km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++',batch_size=1000,n_init=10,max_no_improvement=10)
		else:
			km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1)
		
		km.fit(x)
		labels = km.labels_

		transformed = km.transform(x)
		dists = np.zeros(labels.shape)
		for i in range(len(labels)):
			dists[i] = transformed[i, labels[i]]

		# sort by distance
		clusters = []
		for i in range(NUM_CLUSTERS):
			cluster = []
			ii = np.where(labels == i)[0]
			dd = dists[ii]
			di = np.vstack([dd,ii]).transpose().tolist()
			di.sort()
			for d, j in di:
				cluster.append(tweets[int(j)])
			clusters.append(cluster)

		for i, cluster in enumerate(clusters):
			for c in cluster:
				print("%s: %s" % (i, c))
Example 7
def main(filename):
    # load tweets
    tweets = get_tweets_from_csv(filename)
    # print tweets
 
    # feature extraction
    vectorizer = TfidfVectorizer(analyzer=analyzer, max_df=MAX_DF)
    vectorizer.max_features = MAX_FEATURES
    X = vectorizer.fit_transform(tweets)
    # dimensionality reduction by LSA
    lsa = TruncatedSVD(LSA_DIM)
    X = lsa.fit_transform(X)
    X = Normalizer(copy=False).fit_transform(X)
 
    # clustering by KMeans
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++', batch_size=1000, n_init=10, max_no_improvement=10, verbose=True)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1, verbose=True)
    km.fit(X)
    labels = km.labels_
 
    transformed = km.transform(X)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]
 
    # sort by distance
    clusters = []
    for i in range(NUM_CLUSTERS):
        cluster = []
        ii = np.where(labels==i)[0]
        dd = dists[ii]
        di = np.vstack([dd,ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(tweets[int(j)])
        clusters.append(cluster)
 
    return clusters
Example 8
    # preprocess the sentences
    train['sentence'] = preprocess(train)
    test['sentence'] = preprocess(test)

    # if there are enough training samples, even the label ratios out
    if train.shape[0] > 1000:
        train = training_sample(train)

    list_tokens = train['sentence'].apply(lambda x: x.split(' '))
    test_tokens = test['sentence'].apply(lambda x: x.split(' '))

    # if there are more than 1000 training samples, limit the max_features to 1000 as otherwise it will exceed memory
    # try tfidf
    vectorizer = TfidfVectorizer()
    if train.shape[0] > 1000:
        vectorizer.max_features = 1000
    vectorizer.fit(train['sentence'])
    selected_features = vectorizer.get_feature_names()

    # try bow
    # tokenizer = Tokenizer(num_words=1000, lower=True)
    # tokenizer.fit_on_texts(train['sentence'].values)
    # selected_features = list(tokenizer.word_index.keys())[:1000]

    # w2v
    model = Word2Vec(list_tokens, size=100, window=5, min_count=1)
    # fast text
    # model = FastText(size=100, window=3, min_count=1)
    # model.build_vocab(sentences=list_tokens)
    model.train(list_tokens, total_examples=len(list_tokens), epochs=30)
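A note on the gensim calls above: passing list_tokens to the Word2Vec constructor already trains the model, so the explicit model.train(...) call runs additional epochs on top of that. Under gensim 4.x the size argument was renamed to vector_size; a sketch of the equivalent single call, assuming that version:

from gensim.models import Word2Vec

# gensim 4.x names: size -> vector_size, iter -> epochs
model = Word2Vec(list_tokens, vector_size=100, window=5, min_count=1, epochs=30)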
Example 9
X = dataset.iloc[:, 1].values
y = dataset.iloc[:, 0].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# Applying TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer()
v.max_features = 5000
X_train = v.fit_transform(X_train).toarray()
X_test = v.transform(X_test).toarray()

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB  # GaussianNB

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
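The snippet stops right after this import; a minimal continuation under the names defined above would be:

cm = confusion_matrix(y_test, y_pred)
print(cm)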