def train_lda():
    """Train an LDA topic model on the TF-IDF-weighted corpus.

    Loads the persisted TfidfModel and Dictionary (paths from `conf`),
    fits an LdaModel with `conf.num_topics` topics, pickles each
    document's topic distribution back-to-back into `conf.corpus_topics`,
    saves the model to `conf.lda`, and returns it.
    """
    tfidf = models.TfidfModel.load(conf.tfidf)
    corpus_tfidf = tfidf[corpus.load_corpus()]
    lda = models.LdaModel(corpus_tfidf,
                          id2word=corpora.Dictionary.load(conf.dictionary),
                          num_topics=conf.num_topics)
    corpus_topics = lda[corpus_tfidf]
    # Serialize every per-document topic vector into one contiguous buffer.
    objs = bytearray()
    for obj in corpus_topics:
        objs += pickle.dumps(obj)
    # Fix: use a context manager so the file is closed even if write() raises
    # (the original open/write/close leaked the handle on error).
    with open(conf.corpus_topics, 'wb') as f:
        f.write(objs)
    lda.save(conf.lda)
    return lda
# Human-readable names for the POS tags of interest.
a = {
    "NN": "Common Noun",
    "NNP": "Proper Noun",
}

# Which held-out document to test on: taken from argv[1], defaulting to 2
# when no argument is given; abort on a non-integer argument.
try:
    TEST = int(sys.argv[1])
except ValueError:
    print("Enter 0,1,2 to specify testing document")
    exit()
except IndexError:
    TEST = 2

from sklearn.tree import DecisionTreeClassifier

train = corpus.load_corpus(all=True)
statistic = analytics.load_analytics(train)

# For every word, remember the tag it is most frequently observed with.
heighest_probabilty = {
    word: max(tag_counts.items(), key=lambda kv: kv[1])[0]
    for word, tag_counts in statistic.items()
}

X_train_raw, Y_train_raw = extract_feature(data=train)

# Global label_encoder to encode X values
global_label_encoder, global_hot_encoder = set_encoder(Y_train_raw)

print("Training Global Classifer ....")
X_train, Y_train = encode_features(X_train_raw, Y_train_raw,
                                   global_label_encoder, global_hot_encoder)
global_clf = DecisionTreeClassifier()
global_clf.fit(X_train, Y_train)
print("Completed")
def tsne(docs, target, outpath, **kwargs):
    """Render a t-SNE projection of `docs` (colored by `target`) to `outpath`.

    Extra keyword arguments are forwarded to TSNEVisualizer.
    """
    # Create a new figure and axes
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Visualize the frequency distribution
    visualizer = TSNEVisualizer(ax=ax, **kwargs)
    visualizer.fit(docs, target)
    visualizer.poof(outpath=outpath)


if __name__ == '__main__':
    # Load and vectorize the corpus
    corpus = load_corpus("../../../examples/data/hobbies")
    tfidf = TfidfVectorizer()

    docs = tfidf.fit_transform(corpus.data)
    target = corpus.target

    # Whole corpus visualization
    tsne(docs, target, "images/tsne_all_docs.png")

    # No labels
    tsne(docs, None, "images/tsne_no_labels.png", labels=["documents"])

    # Apply clustering instead of class names.
    # NOTE(review): this chunk appears to end mid-script — presumably a
    # tsne() call with the cluster labels follows; confirm in the full file.
    clusters = KMeans(n_clusters=5)
    clusters.fit(docs)
from sklearn.model_selection import KFold, StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from six.moves import xrange, input  # pylint: disable=redefined-builtin
from six import text_type
from os.path import join

from utils import plot_confusion_matrix, label_classification_report, print_cm
from corpus import load_corpus
from feats import sent2features, sent2labels, sent2tokens, pos_feats, pos_word_feats, crf_feats
import pickle

# Fetch the NLTK data the taggers/tokenizers below depend on.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

label_set, train_set, test_set = load_corpus()

# Directory where trained models are persisted.
MODEL_ROOT = './models/'

# Fix: `print "..."` is Python 2-only syntax, yet this module imports `six`
# for 2/3 compatibility. The call form prints identically on Python 2
# (parentheses around a single expression) and is valid on Python 3.
print("\nExtracting word features ...")

# uncomment below lines to use these features but they perform inferior to the ones used in demo
# WORD FEATS
#train_featuresets = [(word_feats(sample[0]), sample[1]) for sample in train_set]
#test_featuresets = [(word_feats(sample[0]), sample[1]) for sample in test_set]

# POS FEATS
#train_featuresets = [(pos_feats(sample[0]), sample[1]) for sample in train_set]
#test_featuresets = [(pos_feats(sample[0]), sample[1]) for sample in test_set]

# WORD+POS FEATS
"""Train a decision-tree tagger on the corpus and print held-out accuracy."""
from features import extract_feature, set_encoder, encode_features
from corpus import load_corpus
from sklearn.tree import DecisionTreeClassifier

# Build training matrices from the full corpus and fit the encoders on it.
X_train_raw, Y_train_raw = extract_feature(data=load_corpus())
label_encoder, hot_encoder = set_encoder(Y_train_raw)
X_train, Y_train = encode_features(X_train_raw, Y_train_raw, label_encoder, hot_encoder)

# sklearn's fit() returns the estimator, so construct and train in one step.
clf = DecisionTreeClassifier().fit(X_train, Y_train)

# Evaluate on the held-out slice, encoded with the same (training) encoders.
X_test_raw, Y_test_raw = extract_feature(load_corpus(last=True))
X_test, Y_test = encode_features(X_test_raw, Y_test_raw, label_encoder, hot_encoder)
print(clf.score(X_test, Y_test))
# A simple probablistic tagger # The goal is to beat this accuracy using ML import analytics, corpus statistic = analytics.load_analytics() heighest_probabilty = {} for i in statistic: heighest_probabilty[i] = max(statistic[i].items(), key=lambda x: x[1])[0] test = corpus.load_corpus(last=True) test_dict = [] for i in test: for j in i: test_dict.append(j) hit = 0 miss = 0 unknwon = 0 ambiguity_miss = 0 unknwon_ambiguity = 0 a = 1 for i in test_dict: try: if heighest_probabilty[i[0]] == i[1]: hit += 1 else: if i[1] in statistic[i[0]]: ambiguity_miss += 1 miss += 1 else: if len(statistic[i[0]].keys()) == 1: a += 1 print("ambigity:", statistic[i[0]], i)
def init_tfidf():
    """Fit a TF-IDF model on the corpus, persist it to conf.tfidf, return it."""
    model = models.TfidfModel(corpus.load_corpus())
    model.save(conf.tfidf)
    return model
# -*- coding: utf-8 -*- import os import random import math from numpy import random as nprand import copy as cp import corpus as cputil corpus = cputil.load_corpus("corpus-formatted.csv") cc = [([w for w in doc if len(w)>1 and not w.isdigit() and not w.lower().islower()],ul) for doc,ul in corpus] corpus = [d for d in cc if len(d[0])>43] class TopicOverTime: def __init__(self,corpus,alpha=0.1,beta=0.01,gamma=0.01,C=20,n_iter=300): self.corpus = corpus self.corpus_user = [[u for t,u in ul] for _,ul in self.corpus] self.corpus_timestamp = [[t for t,u in ul] for _,ul in self.corpus] self.M = len(self.corpus) self.NU = sum(map(len,self.corpus_user)) self.udic = list(set([u for d in self.corpus_user for u in d])) self.usize = len(self.udic) self.C = C self.alpha = alpha self.gamma = gamma self.n_iter = n_iter self.communities = [nprand.randint(0,self.C,size=l) for l in map(len,self.corpus_user)]
        return self.word_list

    # NOTE(review): the method the `return` above belongs to starts before
    # this chunk; only its final line is visible here.

    def set_clf(self, clf):
        # Attach the classifier used for this object.
        self.clf = clf

    def get_clf(self):
        # Return the classifier previously attached with set_clf().
        return self.clf

    def __str__(self):
        # Words are concatenated with no separator.
        return "".join(self.word_list)

#def get_

from sklearn.tree import DecisionTreeClassifier

train = corpus.load_corpus(test=TEST)
statistic = analytics.load_analytics(train)

# For every word, remember the tag it most frequently appears with.
heighest_probabilty = {}
for i in statistic:
    heighest_probabilty[i] = max(statistic[i].items(), key=lambda x: x[1])[0]

X_train_raw, Y_train_raw = extract_feature(data=train)

#Global label_encoder to encode X values
global_label_encoder, global_hot_encoder = set_encoder(Y_train_raw)

print("Training Global Classifer ....")
X_train, Y_train = encode_features(X_train_raw, Y_train_raw, global_label_encoder, global_hot_encoder)
global_clf = DecisionTreeClassifier()
global_clf.fit(X_train, Y_train)
# -*- coding: utf-8 -*- import os import random import math from numpy import random as nprand import copy as cp import corpus as cputil corpus = cputil.load_corpus("corpus-formatted.csv") cc = [([ w for w in doc if len(w) > 1 and not w.isdigit() and not w.lower().islower() ], ul) for doc, ul in corpus] corpus = [d for d in cc if len(d[0]) > 43] class TopicOverTime: def __init__(self, corpus, alpha=0.1, beta=0.01, gamma=0.01, C=20, n_iter=300): self.corpus = corpus self.corpus_user = [[u for t, u in ul] for _, ul in self.corpus] self.corpus_timestamp = [[t for t, u in ul] for _, ul in self.corpus] self.M = len(self.corpus) self.NU = sum(map(len, self.corpus_user))
# -*- coding: utf-8 -*-
import os
import random
import math
from numpy import random as nprand
import copy as cp

import corpus as cputil

corpus = cputil.load_corpus("corpus_filtered.final")
# Fix: `filter(...)` returns a lazy iterator on Python 3, which breaks the
# later `len(self.corpus)` and any repeated iteration in UserLDA.__init__.
# A list comprehension yields the same list on Python 2 and is correct on 3.
corpus = [c for c in corpus if len(c[0]) > 48]


class UserLDA:
    """LDA-style model over parallel per-document word and user lists."""

    # NOTE(review): __init__ appears to continue past the end of this chunk;
    # only the initial assignments are visible here.
    def __init__(self, corpus, alpha=0.1, beta=0.01, gamma=0.01, K=20, n_iter=300):
        self.corpus = corpus
        # Each document is a (words, users) pair.
        self.corpus_word = [w for w, _ in self.corpus]
        self.corpus_user = [u for _, u in self.corpus]
        self.M = len(self.corpus)                   # number of documents
        self.NW = sum(map(len, self.corpus_word))   # total word tokens
        self.NU = sum(map(len, self.corpus_user))   # total user tokens
        self.wdic = list(set([w for d in self.corpus_word for w in d]))
        self.wsize = len(self.wdic)                 # vocabulary size
        self.udic = list(set([u for d in self.corpus_user for u in d]))
        self.usize = len(self.udic)                 # number of distinct users
        self.K = K                                  # number of topics
        # NOTE(review): C mirrors K (communities == topics?) — confirm.
        self.C = K
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma