Example #1
def main():
    n_samples = 2000
    n_features = 1000
    n_topics = 20
    n_top_words = 15

    dataset = fetch_20newsgroups(
        shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9, max_features=n_features, min_df=2, stop_words='english')

    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])
    lda = LDA(n_topics=n_topics, kappa=0.7, tau0=1024., n_jobs=4, random_state=0)
    
    feature_names = vectorizer.get_feature_names()
    start_time = time.time()
    lda.fit(doc_word_count)
    end_time = time.time()
    # print(feature_names[:10])
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

    print('run time = %.3f seconds' % (end_time - start_time))
def main():
    n_samples = 2000
    n_features = 1000
    n_topics = 20
    n_top_words = 15

    dataset = fetch_20newsgroups(shuffle=True,
                                 random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9,
                                 max_features=n_features,
                                 min_df=2,
                                 stop_words='english')

    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])
    lda = LDA(n_topics=n_topics,
              kappa=0.7,
              tau0=1024.,
              n_jobs=4,
              random_state=0)

    feature_names = vectorizer.get_feature_names()
    start_time = time.time()
    lda.fit(doc_word_count)
    end_time = time.time()
    # print(feature_names[:10])
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

    print('run time = %.3f seconds' % (end_time - start_time))
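
For reference, here is the same pipeline written against the current scikit-learn API, where the estimator is named LatentDirichletAllocation, n_topics becomes n_components, and the online-variational parameters kappa/tau0 correspond to learning_decay/learning_offset. This is a minimal sketch under those assumptions, not a drop-in replacement for the wrapper used above.

import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


def main_sklearn(n_samples=2000, n_features=1000, n_topics=20, n_top_words=15):
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9, max_features=n_features,
                                 min_df=2, stop_words='english')
    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])

    # learning_decay / learning_offset play the role of kappa / tau0 above
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    learning_method='online',
                                    learning_decay=0.7,
                                    learning_offset=1024.,
                                    n_jobs=4,
                                    random_state=0)
    start = time.time()
    lda.fit(doc_word_count)
    print('run time = %.3f seconds' % (time.time() - start))

    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        top = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d: %s" % (topic_idx, " ".join(top)))
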
Example #3
 def get_lda_feats(self, args, group_key, indiv_key):
     d={}
     for ii in glob.iglob(args.input_big5_folder + '/*.csv'):
         print '\n--> Adding lda data from file: %s'%(ii)
         if ii.endswith('.gz'):
             csvobj = csv.reader(gzip.open(ii), delimiter=',')   
         else:
             csvobj = csv.reader(open(ii, 'rb'), delimiter=',')
 
         header=csvobj.next()
         ind_group=header.index(group_key) # year
         ind_indiv=header.index(indiv_key) # id
         
         for jj in csvobj:
             try:
                 group=jj[ind_group].strip()
                 if not d.has_key(group):
                     d[group]=[]
                     ldaa = LDA(args.feat_folder+'/' + 'lda-' + args.num_topics + 
                               '.'+self.feat_name+'-' + group + '.post')
                 
                 indiv=jj[ind_indiv].strip()
                 d[group].append(ldaa.posterior_feats(indiv))
                 print indiv
             except:
                 continue
     return d
def configure(config):
    global lda, train_topics
    lda = LDA(config['lda_model'], config['lda_dict'])
    lda.load()
    logging.info('Loading training corpus topics')
    with open(config['train_topics']) as fp:
        train_topics = cPickle.load(fp)
    logging.info('Read topics for %d sentences', len(train_topics))
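
A hedged sketch of how these globals might be consumed afterwards; the actual scorer is not part of this excerpt, and the cosine-similarity feature below is only an illustrative assumption built on lda.topic_vector (shown in a later example) and the unpickled train_topics matrix.

import numpy as np

def topic_similarity(sentence, train_index):
    # topic vector of the incoming sentence vs. the precomputed topics of one training sentence
    v = lda.topic_vector(sentence.split())
    t = train_topics[train_index]
    denom = np.linalg.norm(v) * np.linalg.norm(t)
    return float(np.dot(v, t) / denom) if denom else 0.0
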
Example #5
def main():
    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    model = LDA(n_topic=20)
    model.fit(corpus, valid_split=0.1, n_iter=10)

    perplexity = model.perplexity(corpus.docs)
    print(perplexity)
Example #6
def extract_aspects_for_reviews_v1(topic_num=10):
    '''
        Load the model from file, then obtain topics for every review.
        Some terms may not be in model.id2term; this version ignores them.
    '''

    raw_review_filename = 'raw_reviews.ldapre'
    raw_texts = load_reviews(raw_review_filename)

    #train LDA
    #lda_model = LDA(K=K, doc_set=raw_texts)
    #lda_model.train()
    #lda_model.save(yelp_dir + 'review_t%s.lda' % K)

    # return the topic for every review
    version = 'v1'
    model_filename = yelp_dir + 'lda_%s/review_t%s.lda' % (version, topic_num)
    res = []
    model = LDA(model_filename=model_filename, load_from_file=True)
    res_dir = yelp_dir + 'aspects/lda_%s/' % version
    res_filename = 'review_topic%s.res' % topic_num
    fw = open(res_dir + res_filename, 'w+')
    fw.write('#review_id\trate\ttopic_res\traw_text\n')
    model_topic_filename = 'topic%s.res' % topic_num
    tn = 0.0

    start = time.time()
    for ind, t in enumerate(raw_texts):
        rid = t[0]
        rate = float(t[1])
        terms = t[2:]
        topic_ids = model.get_document_topics(terms)
        #checked_res = model.check_existence_doc_term(terms)
        #if checked_res:
        #    print 'ind=%s, rid=%s, not_exist:%s, raw_texts=%s' % (ind,rid,checked_res,terms)
        tn += len(topic_ids)

        topic_str = '|'.join(
            ['%s,%s' % (t, round(p, 4)) for t, p in topic_ids])
        line = '%s\t%s\t%s\t%s' % (unicode2str(rid), rate,
                                   unicode2str(topic_str), '\t'.join(
                                       [unicode2str(t) for t in terms]))
        #line = '\t'.join([unicode2str(t) for t in terms])
        fw.write(line + '\n')
        if (ind + 1) % 100000 == 0:
            print 'cost  %.1fmin in this round, processed %s review:\n%s\n' % (
                (time.time() - start) / 60.0, ind + 1, line)
            start = time.time()
    fw.close()

    topics_res = model.print_topics(topic_num)
    fw = open(res_dir + model_topic_filename, 'w+')
    fw.write('\n'.join(['%s,%s' % (t, unicode2str(r)) for t, r in topics_res]))
    fw.close()
    print 'finished extracting aspects for %s reviews (avg=%s), saved in %s, corpus topics in %s' % (
        len(raw_texts), tn / len(raw_texts), res_filename,
        model_topic_filename)
Example #7
class TopicModelingLDA(object):
	# wrapper around the LDA library
	# characterizes topics using several scores found in the literature

	def __init__(self,corpus,metrics_criteria='simple'):
		super(TopicModelingLDA, self).__init__()
		self.corpus = corpus
		self.select_metric_criteria(metrics_criteria)
		self.model = None
		self.topic_words = None
		self.top_words = None
		self.all_words = []

	def fit(self,num_topic=5,n_iter=1500):
		count_vect = CountVectorizer()
		x_train_counts = count_vect.fit_transform(self.corpus)
		self.model = LDA(n_topics=num_topic, n_iter=n_iter, random_state=1)
		self.model.fit(x_train_counts)

		self.topic_words = self.model.topic_word_
		self.vocabulary = count_vect.get_feature_names()

	def select_metric_criteria(self,metrics_criteria):
		if metrics_criteria == 'term_score':
			self.metrics = TopicTermScore()
		else:
			self.metrics = TopicSimpleScore()

	def get_highest_scores(self,k_top=10):
		# topic_words is a matrix of shape (number of topics, words)
		# row k gives the word distribution of topic k
		num_topics = len(self.topic_words) 
		print ("Numero de topicos",num_topics)
		top_words = []
		self.top_words = {}

		for topic_k in range(num_topics):
			scores = []
			for v,word in enumerate(self.vocabulary):
				score = self.metrics.calculate(self.topic_words,topic_k,v)
				scores.append((word,score))
			scores.sort(key=lambda tup: tup[1]) 
			scores = scores[-k_top:]
			
			print ("Topico %d"%(topic_k))
			for word,score in scores:
				print ("%s,%.4f"%(word,score))
			print ("")

			self.top_words[topic_k] = [{'word':word,'score':score} for word,score in scores]
			self.all_words += [ word for word,score in scores]

		return self.top_words

	def get_all_words(self):
		return self.all_words
def DocIndex():
    core = TermiteCore(request, response)
    lda = LDA(request)
    docIndex, docMaxCount = lda.GetDocIndex()
    return core.GenerateResponse(
        lda.params, {
            'docCount': len(docIndex),
            'docMaxCount': docMaxCount,
            'DocIndex': docIndex
        })
def TermIndex():
    core = TermiteCore(request, response)
    lda = LDA(request)
    termIndex, termMaxCount = lda.GetTermIndex()
    return core.GenerateResponse(
        lda.params, {
            'termCount': len(termIndex),
            'termMaxCount': termMaxCount,
            'TermIndex': termIndex
        })
def TopicIndex():
    core = TermiteCore(request, response)
    lda = LDA(request)
    topicIndex, topicMaxCount = lda.GetTopicIndex()
    return core.GenerateResponse(
        lda.params, {
            'topicCount': len(topicIndex),
            'topicMaxCount': topicMaxCount,
            'TopicIndex': topicIndex
        })
    def generate_topics(self):
        file_to_tokens = self._get_normalized_corpus(self.files)

        np_matrix = self._get_document_term_matrix(file_to_tokens)
        model = LDA(n_topics=self.n_topics,
                    n_iter=self.n_iter,
                    random_state=self.random_state)
        model.fit(np_matrix)

        self._lda_model = model
    def test_aggregate(self):
        data = [['123', 'some text'], ['123', 'some more text'], ['123', 'even more text'],
                ['456', 'some stuff'],['456', 'some more stuff'], ['456', 'even more stuff'],
                ['789','just a little thing.']]

        df = pd.DataFrame(data=data, columns=['asin','review_text'])
        bcr = LDA()
        result = bcr.aggregate_df(df, 'asin', 'review_text')
        # This should aggregate to only three rows.
        self.assertEqual(len(result),3)
        print(result)
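
The test above only checks the row count; below is a minimal sketch of the grouping behavior it expects from aggregate_df (an assumption about the wrapper, not its actual implementation).

import pandas as pd

def aggregate_df_sketch(df, key_col, text_col):
    # one row per key, with all texts for that key joined into a single string
    return df.groupby(key_col)[text_col].apply(' '.join).reset_index()
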
 def test_there_is_data(self):
     bcr = LDA()
     categories, category_ids, asin_counts = bcr.get_categories('category.csv')
     i = 0
     for category in categories:
         asin_count = asin_counts[i]
         i = i + 1
         dataset = bcr.get_dataset('category.csv', category)
         print("Dataset {} has {} records.".format(category, len(dataset)))
         self.assertEqual(len(dataset), asin_count)
         print("Displaying the first 10 elements in the dataset...")
         print(dataset.head(10))
Example #14
def train(request):
    if request.is_ajax():
        if request.method == 'GET':
            print 'train func'
            db = StateModel(state_name='lock_model', status=1)
            db.save()

            # neural_network = neural_net(75, 3)
            # neural_network.create_struct(150)
            file_train = settings.STATICFILES_DIRS[
                0] + 'main_app/media/train_data.csv'

            lda = LDA(75, 3)
            #neural_network.data_input(feature_list, answer, "train")
            #neural_network.file_input(file_train)
            lda.file_input(file_train)
            #neural_network.file_input(file_test, type_set='test')
            #test_data, test_answer = neural_network.get_test_data()
            # neural_network.training(5000)
            # neural_network.save_model(settings.STATICFILES_DIRS[0])
            lda.training()
            lda.save_model(settings.STATICFILES_DIRS[0])
            #print neural_network.predict(test_data[0])
            #print test_answer[0]
            db = StateModel(state_name='lock_model', status=0)
            db.save()
            return HttpResponse('OK')
Example #15
def main(model, dic, corpus, output):
    logging.basicConfig(level=logging.INFO)
    lda = LDA(model, dic)
    lda.load()
    topics = []
    with open(corpus) as fp:
        n_sentences = sum(1 for line in fp)
    logging.info('Computing topic vectors for %d sentences', n_sentences)
    bar = pb.ProgressBar(widgets=[pb.Percentage(), pb.Bar(), pb.ETA()], maxval=n_sentences)
    with open(corpus) as fp:
        for sentence in bar(fp):
            topics.append(lda.topic_vector(sentence.split()))
    logging.info('Saving topic information to %s', output)
    with open(output, 'w') as fp:
        cPickle.dump(np.vstack(topics), fp, protocol=cPickle.HIGHEST_PROTOCOL)
Example #16
 def train_model(self, filename, model_name):
     self.create_label_corpus(filename)
     self.lda = LDA(self.options.K, self.options.alpha, self.options.beta)
     self.lda.set_corpus(self.labelset, self.corpus, self.labels)
     print "M=%d, V=%d, L=%d, K=%d" % (len(self.corpus), len(self.lda.vocas), len(self.labelset), self.options.K)
     for index in range(self.options.iteration):
         sys.stderr.write("-- %d : %.4f\n" % (index, self.lda.perplexity()))
     print "perplexity : %.4f" % self.lda.perplexity()
     phi = self.lda.phi()
     theta = self.lda.theta()
     new_stopword = []
     for k, label in enumerate(self.labelset):
         print "\n-- label %d : %s" % (k, label)
         for w in numpy.argsort(-phi[k]):
             print "%s: %f" % (self.lda.vocas[w], phi[k,w])
     self.save_model(model_name)
Example #17
def train_acc(data_path, algorithm_name):
    print(data_path)
    x, y, test_x, test_y = data.run(data_path)
    clf = None
    if algorithm_name == "gnb":
        clf = GNB()
        print("gnb instance.")
    elif algorithm_name == "lda":
        clf = LDA()
        print("lda instance.")
    elif algorithm_name == "qda":
        clf = QDA()
        print("qda instance.")
    else:
        print("NO Implement")
        return "NO Implement"

    num = 0
    clf.fit(x, y)
    train_result = clf.predict(x)
    for i in range(len(train_result)):
        if train_result[i] == y[i]:
            num += 1

    return num / len(y)
Example #18
def plot_unsmoothed():
    corpus, T = generate_corpus()
    L = LDA(T)
    L.train(corpus, verbose=False)
    fig, axes = plt.subplots(1, 2)
    ax1 = sns.heatmap(L.beta, xticklabels=[], yticklabels=[], ax=axes[0])
    ax1.set_xlabel("Topics")
    ax1.set_ylabel("Words")
    ax1.set_title("Recovered topic-word distribution")

    ax2 = sns.heatmap(L.gamma, xticklabels=[], yticklabels=[], ax=axes[1])
    ax2.set_xlabel("Topics")
    ax2.set_ylabel("Documents")
    ax2.set_title("Recovered document-topic distribution")
    plt.savefig("img/plot_unsmoothed.png", dpi=300)
    plt.close("all")
Example #19
	def fit(self,num_topic=5,n_iter=1500):
		count_vect = CountVectorizer()
		x_train_counts = count_vect.fit_transform(self.corpus)
		self.model = LDA(n_topics=num_topic, n_iter=n_iter, random_state=1)
		self.model.fit(x_train_counts)

		self.topic_words = self.model.topic_word_
		self.vocabulary = count_vect.get_feature_names()
Example #20
def train(request):
    if request.is_ajax():
        if request.method == 'GET':
            print 'train func'
            db = StateModel(state_name='lock_model', status=1)
            db.save()

            # neural_network = neural_net(75, 3)
            # neural_network.create_struct(150)
            file_train = settings.STATICFILES_DIRS[0]+'main_app/media/train_data.csv'

            lda = LDA(75,3)
            #neural_network.data_input(feature_list, answer, "train")
            #neural_network.file_input(file_train)
            lda.file_input(file_train)
            #neural_network.file_input(file_test, type_set='test')
            #test_data, test_answer = neural_network.get_test_data()
            # neural_network.training(5000)
            # neural_network.save_model(settings.STATICFILES_DIRS[0])
            lda.training()
            lda.save_model(settings.STATICFILES_DIRS[0])
            #print neural_network.predict(test_data[0])
            #print test_answer[0]
            db = StateModel(state_name='lock_model', status=0)
            db.save()
            return HttpResponse('OK')
Example #21
def lda(fname,
        indF,
        nTopics=20,
        iterations=50,
        fmax=math.inf,
        ofhead='cancer_py_cust_gvLDA_'):
    cts = pd.read_csv(fname + '.csv', header=0, index_col=0, dtype={0: str})
    ind = pd.read_csv(indF + '.csv', header=None)
    patID = cts.index
    gvID = cts.columns
    rows = np.where(ind > 0)[0]
    splits = np.max(np.array(ind))
    patID = patID[rows]
    phi = cts.iloc[rows]
    ind = ind.iloc[rows, 0]

    for i in range(1, splits + 1):

        ofname = ofhead + str(nTopics) + '_' + str(i)

        # training set
        rowsT = np.where(ind != i)
        X = np.asarray(phi.iloc[rowsT])
        cols = X.sum(axis=0) < fmax
        X = X[:, cols]

        # valid set
        rowsV = np.where(ind == i)
        X_test = np.asarray(phi.iloc[rowsV])
        X_test = X_test[:, cols]

        lda = LDA(nTopics)
        patTop, gvTop = lda.train(X, iters=iterations)
        ofname = 'data/' + ofname
        gvTop = pd.DataFrame(gvTop)
        gvTop.columns = np.asarray(gvID)[cols]
        gvTop.to_csv(ofname + '_genes.csv')
        pd.DataFrame(lda.alpha).to_csv(ofname + '_alpha.csv')
        patTop = pd.DataFrame(patTop)
        patTop.index = patID[rowsT]
        patTop.to_csv(ofname + '_train.csv')
        patTop = lda.predict(X_test, iters=iterations)
        patTop = pd.DataFrame(patTop)
        patTop.index = patID[rowsV]
        patTop.to_csv(ofname + '_valid.csv')
def _getLDA(text, label, n_topic_words):
    vectorizer = CountVectorizer(min_df=100, max_df=5000)
    transformer = TfidfTransformer()
    df = vectorizer.fit_transform(text)
    tfidf_word_name = vectorizer.get_feature_names()

    model = LDA(n_topics=20, n_iter=1000, random_state=1)
    model.fit(df)
    Dump(model, 'LDA_model', 'joblib')
    topic_word = model.topic_word_
    doc_topic = model.doc_topic_
    with open('topic_word.txt', 'w') as f:
        n_top_words = 300
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(tfidf_word_name)[np.argsort(
                topic_dist)][:-(n_top_words + 1):-1]
            f.write('Topic {}: {}'.format(i, ' '.join(topic_words)) + '\n')
    return topic_word, doc_topic
Example #23
def main(model, dic, corpus, output):
    logging.basicConfig(level=logging.INFO)
    lda = LDA(model, dic)
    lda.load()
    topics = []
    with open(corpus) as fp:
        n_sentences = sum(1 for line in fp)
    logging.info('Computing topic vectors for %d sentences', n_sentences)
    bar = pb.ProgressBar(widgets=[pb.Percentage(),
                                  pb.Bar(),
                                  pb.ETA()],
                         maxval=n_sentences)
    with open(corpus) as fp:
        for sentence in bar(fp):
            topics.append(lda.topic_vector(sentence.split()))
    logging.info('Saving topic information to %s', output)
    with open(output, 'w') as fp:
        cPickle.dump(np.vstack(topics), fp, protocol=cPickle.HIGHEST_PROTOCOL)
Example #24
    def __init__(self, docs, K, alpha, eta):
        LDA.__init__(self, docs, K, alpha, eta)

        ### Gibbs sampler related data structures ###

        # C_VK[w,k] := number of times word w is assigned to topic k
        self.C_VK = np.zeros((self.V, self.K), dtype=int)
        # C_DK[d,k] := number of times topic k is present in document d
        self.C_DK = np.zeros((self.D, self.K), dtype=int)

        # Cache these values as we go (equivalent to performing column sums for above matrices)
        # For each document, total number of topics assigned
        self.total_topics_per_doc = np.zeros(self.D)
        # For each topic, total number of words assigned to it
        self.total_words_per_topic = np.zeros(self.K)

        # Save results here
        self.log_prob = []
        self.samples = []
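
Those count matrices are exactly what the per-token collapsed Gibbs update needs. Below is a sketch of the standard conditional, assuming symmetric alpha and eta priors and that the current token's counts have already been decremented; this is the textbook update, not code taken from this repository.

import numpy as np

def resample_topic(self, d, w):
    # p(z = k | rest) is proportional to
    #   (C_DK[d, k] + alpha) * (C_VK[w, k] + eta) / (total_words_per_topic[k] + V * eta)
    weights = ((self.C_DK[d, :] + self.alpha) *
               (self.C_VK[w, :] + self.eta) /
               (self.total_words_per_topic + self.V * self.eta))
    weights /= weights.sum()
    return np.random.choice(self.K, p=weights)
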
Example #25
def learn_topics(textpath, topicnum):

    with open(textpath) as f:
        texts = f.readlines()

    # Get vocabulary and word counts.  Use lowercase unigrams with at least
    # 3 alphabetical, non-numeric characters; punctuation is treated as a
    # separator.  (max_features=None keeps the full vocabulary.)
    CVzer = CountVectorizer(token_pattern=r"(?u)\b[^\W\d]{3,}\b",
                            max_features=None,
                            lowercase=True)
    doc_vcnts = CVzer.fit_transform(texts)
    vocabulary = CVzer.get_feature_names()

    # Learn topics.  The refresh argument controls print frequency.
    lda_model = LDA(topicnum, n_iter=8000, refresh=2000) 
    doc_topic = lda_model.fit_transform(doc_vcnts)
    topic_word = lda_model.topic_word_

    return doc_topic, topic_word, vocabulary
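
A short usage sketch for the return values: topic_word has one row per topic over the same vocabulary order, and doc_topic has one row per document. The input file name below is hypothetical.

import numpy as np

doc_topic, topic_word, vocabulary = learn_topics('texts.txt', topicnum=20)

# top words per topic
for k, dist in enumerate(topic_word):
    top = np.array(vocabulary)[np.argsort(dist)][:-11:-1]
    print('Topic {}: {}'.format(k, ' '.join(top)))

# dominant topic of each document
print(doc_topic.argmax(axis=1))
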
Example #26
def fit_reuters():
    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    model = LDA(n_topic=20)
    model.fit(corpus, n_iter=50)

    model.save_model(protocol=2)
Example #27
def exampleLDAExecution():
    X = data.load_reuters()
    vocab = data.load_reuters_vocab()
    titles = data.load_reuters_titles()

    # document-term matrix
    X = data.load_reuters()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))

    # the vocab
    vocab = data.load_reuters_vocab()
    print("type(vocab): {}".format(type(vocab)))
    print("len(vocab): {}\n".format(len(vocab)))

    # titles for each story
    titles = data.load_reuters_titles()
    print("type(titles): {}".format(type(titles)))
    print("len(titles): {}\n".format(len(titles)))

    doc_id = 0
    word_id = 3117

    print("doc id: {} word id: {}".format(doc_id, word_id))
    print("-- count: {}".format(X[doc_id, word_id]))
    print("-- word : {}".format(vocab[word_id]))
    print("-- doc  : {}".format(titles[doc_id]))

    model = LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available

    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 10
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    doc_topic = model.doc_topic_
    for i in range(10):
        print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))
def RunLDA(FileLocation, NumDocs, NumTopics):
    # In order to create a Term Document matrix,
    # We read in every file and then make a list containing the body of 
    # all of the articles
    fin=open(FileLocation,'r')
    #Will need to store the urls when we make the tdm
    UrlArray = []
    #Create TDM object. It will also remove stopwords
    TDM = TermDocumentMatrix(simple_tokenize_remove_stopwords)
    # Add each article to the TDM object. Also create a list of urls.
    # This is a massive corpus, so we only process the first NumDocs articles.
    for i in range(NumDocs):
        Article = next(fin)
        UrlArray.append(re.split(r'\t',Article)[0])
        TDM.add_doc(re.split(r'\t',Article)[1])
    # TDM.rows() is an iterable; convert it to a list so it can be fed to numpy.
    X = list(TDM.rows())
    # Oddly enough, the first row of the .rows() iterable in TDM returns a
    # list of all of the words used. Think of it as a header row.
    Vocab = X[0]
    Y = []
    #creating a 2d list containing the rows of the document matrix
    for i in range(len(X)-1):
        Y.append(X[i+1])
    # Create the LDA model object with NumTopics topics.
    model = LDA(n_topics=NumTopics, n_iter=1500, random_state=1)
    # Make a numpy Array to use as input
    Yarray = np.asarray(Y)
    # Fit the model. This process is similar to scikit-learn's algorithms.
    model.fit(Yarray)
    TopicWords = []
    topic_word = model.topic_word_
    n_top_words = 50
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(Vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        TopicWords.append(topic_words)
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))    
Example #29
def test_lda(model_file, dict_file, dbs_dir):
    """ Run training and display test results if visualize is true

  Args:
    model_file(str): saved model file to continue training on
    dict_file(str): dict_file path to load dictionary from 
    dbs_dir(str): dir path to load databases from 
  """

    assert (os.path.isdir(dbs_dir)), "Invalid data directory path"
    lda = LDA()
    print('Loading existing dictionary...')
    lda.load_dict_from_disk(dict_file)
    test_results = list()
    #Iterate over all data and train model
    for root, dirs, files in os.walk(dbs_dir):
        #Iterate over sub-dirs
        for d in files:
            db = Database()
            #Load database object from saved file
            db.load_from_disk(dbs_dir + '/' + d)

            #Add database to model
            lda.add_database(db)
            #Test model
            test_results.append(lda.test(model_file, db_name=db.get_name()))
            lda.remove_database(db.get_name())

            del db
            gc.collect()

    #Print test results
    for idx, i in enumerate(test_results):
        print('Test results for database {}'.format(idx))
        for j in i[0]:
            print('Topic: {} has probability: {}'.format(j[0], j[1]))
        counter = 0
        for k in i[1]:
            print('Topic {} has topic-coherence score: {}'.format(
                counter, k[1]))
            counter += 1

    print(lda.model.show_topics())
Example #30
class TestLDA(unittest.TestCase):
    """
    Test the LDA class.
    """
    def setUp(self):
        self.description_csv = pd.read_csv("docs/description.csv")
        self.description_1000_csv = pd.read_csv("docs/description_1000.csv")
        self.dp = DocsPreprocessor()
        self.description_1000 = self.dp.process(self.description_1000_csv)
        self.lda = LDA(self.description_1000)

    def test_1(self):
        k_values, coherence_values, topic_list = self.lda.compute_coherence_values(
            5, 20, 5)
Example #31
def appDescriptionsLDA():
    X = data.load_reuters()
    vocab = data.load_reuters_vocab()
    titles = data.load_reuters_titles()

    print(X)
    print(vocab)
    print(titles)

    X.shape
    X.sum()
    model = LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available

    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 10
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    doc_topic = model.doc_topic_
    for i in range(10):
        print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))
Example #32
def main(dataset, compute_errors=True, plot_boundaries=True, save=False):
    """ Fit the four models on the training sets, depending on the parameters
    compute the accuracy et plot the boundary
    args :: dataset : array(str) """

    filename = "data/" + dataset + ".train"
    x_train, y_train = read_file(filename)
    filename = "data/" + dataset + ".test"
    x_test, y_test = read_file(filename)

    models = [
        LDA(x_train, y_train),
        LinearRegression(x_train, y_train),
        LogisiticRegression(x_train, y_train),
        QDA(x_train, y_train)
    ]

    model_names = ["LDA", "LinearRegression", "LogisiticRegression", "QDA"]
    for i, model in enumerate(models):
        model_name = model_names[i]
        model.fit()
        if compute_errors:
            y_pred_train = [model.predict(x) for x in x_train]
            e = accuracy(y_train, y_pred_train)
            print("Accuracy with " + model_name)
            print("Training: ", e)
            y_pred_test = [model.predict(x) for x in x_test]
            e = accuracy(y_test, y_pred_test)
            print("Testing: ", e)
        if plot_boundaries:
            model.plot_boundary()
            plt.scatter(model.x[:, 0], model.x[:, 1], c=model.y, s=1)
            title = "Model: " + model_name + ", " + dataset + " (Train)"
            plt.title(title)
            if save:
                plt.savefig("figs/" + model_name + "_" + dataset[-1] +
                            "Train.png")
            plt.show()
            model.plot_boundary()
            plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test, s=1)
            title = "Model: " + model_name + ", " + dataset + " (Test)"
            plt.title(title)
            if save:
                plt.savefig("figs/" + model_name + "_" + dataset[-1] +
                            "Test.png")
            plt.show()
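
The read_file and accuracy helpers called above are not shown in this excerpt. Below is a minimal sketch of accuracy consistent with how it is called (two equal-length label sequences), offered as an assumption rather than the repository's implementation.

import numpy as np

def accuracy(y_true, y_pred):
    # fraction of labels predicted correctly
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return float((y_true == y_pred).mean())
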
Example #33
def output_reuters():
    model = LDA()
    model.load_model()

    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    corpus.load_vocabulary(menu_path + 'reuters.tokens')
    corpus.load_context(menu_path + 'reuters.titles')

    topic_word = model.topic_word(n_top_word=10, corpus=corpus)
    print('\n'.join(map(str, topic_word)))

    document_topic = model.document_topic(n_top_topic=1, corpus=corpus, limit=10)
    print('\n'.join(map(str, document_topic)))
Example #34
    def __init__(self, block, num_blocks, n_classes, lda_args):
        super(ResNet, self).__init__()
        self.lda_args = lda_args
        if self.lda_args:  # LDA
            self.in_planes = 32
            self.out_planes = 16
        else:  # Usual CNN with CE loss
            self.in_planes = 32
            self.out_planes = 16  # 64

        self.conv1 = nn.Conv2d(3,
                               self.in_planes,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_planes)
        self.layer1 = self._make_layer(block,
                                       self.out_planes * 1,
                                       num_blocks[0],
                                       stride=1)
        self.layer2 = self._make_layer(block,
                                       self.out_planes * 2,
                                       num_blocks[1],
                                       stride=2)
        self.layer3 = self._make_layer(block,
                                       self.out_planes * 4,
                                       num_blocks[2],
                                       stride=2)
        self.layer4 = self._make_layer(block,
                                       self.out_planes * 8,
                                       num_blocks[3],
                                       stride=2)
        if self.lda_args:
            self.lda = LDA(n_classes, lda_args['lamb'])
        else:
            self.linear = nn.Linear(self.out_planes * 8 * block.expansion,
                                    n_classes)
Example #35

train_re_path = '../data/train/relevant.txt'
train_ir_path = '../data/train/irrelevant.txt'
test2_ir_path = '../data/test2/irrelevant.txt'
test2_re_path = '../data/test2/relevant.txt'
test1_ir_path = '../data/test1/irrelevant.txt'
test1_re_path = '../data/test1/relevant.txt'


words_dict, idx_dict = create_dict(full_path, stop_words)

train_X = load_data(train_path)
train_X = word_to_idx(train_X, words_dict)

lda = LDA(5)

lda.fit(train_X, words_dict.items())

test1_re_X = load_data(test1_re_path)
test1_re_X = word_to_idx(test1_re_X, words_dict)
test1_ir_X = load_data(test1_ir_path)
test1_ir_X = word_to_idx(test1_ir_X, words_dict)

test2_re_X = load_data(test2_re_path)
test2_re_X = word_to_idx(test2_re_X, words_dict)
test2_ir_X = load_data(test2_ir_path)
test2_ir_X = word_to_idx(test2_ir_X, words_dict)

target_X = load_data(target_path)
target_X = word_to_idx(target_X, words_dict)
Example #36
class Classifier:

    def __init__(self, options):
        self.options = options
        self.file_dir = "./build/"
        self.labels= []
        self.corpus = []

        if not os.path.exists(self.file_dir):
            os.makedirs("build")

        self.stopwords = self.get_stopwords()


    def train_model(self, filename, model_name):
        self.create_label_corpus(filename)
        self.lda = LDA(self.options.K, self.options.alpha, self.options.beta)
        self.lda.set_corpus(self.labelset, self.corpus, self.labels)
        print "M=%d, V=%d, L=%d, K=%d" % (len(self.corpus), len(self.lda.vocas), len(self.labelset), self.options.K)
        for index in range(self.options.iteration):
            sys.stderr.write("-- %d : %.4f\n" % (index, self.lda.perplexity()))
        print "perplexity : %.4f" % self.lda.perplexity()
        phi = self.lda.phi()
        theta = self.lda.theta()
        new_stopword = []
        for k, label in enumerate(self.labelset):
            print "\n-- label %d : %s" % (k, label)
            for w in numpy.argsort(-phi[k]):
                print "%s: %f" % (self.lda.vocas[w], phi[k,w])
        self.save_model(model_name)

    def lemmatize(self, string):
        return WordNetLemmatizer().lemmatize(string, pos='v')

    def create_label_corpus(self,filename):
        with open(os.path.join(self.file_dir,filename)) as model:
            for row in model:
                label_class_list = []
                selected_words = []

                split_row = row.lower().split("\"|\"")
                label_array = self.filter_split(split_row[0])
                # Create Unicoded label_type
                for label_type in self.filter_split(split_row[1]):
                    label_class_list.append(unicode(label_type,"utf-8"))

                for word in label_array:
                    lemmatized_word = self.lemmatize(word)
                    if word not in self.stopwords and len(word) > 2 and not bool(re.search(r'\d',lemmatized_word)) and lemmatized_word not in self.stopwords:
                        selected_words.append(lemmatized_word)

                self.corpus.append(selected_words)
                self.labels.append(label_class_list)
                self.labelset = list(set(reduce(list.__add__, self.labels)))

    def filter_split(self,label):
        return re.sub(r'\W+',' ',label).split()

    def classify(self,model_name,label):
        self.lda = self.load_model(model_name)
        self.stopwords = self.get_stopwords()
        result_vector = numpy.zeros(self.lda.K)
        phi = self.lda.phi()
        label_array = self.filter_split(label)

        for word in label_array:
            for r in range(self.lda.K):
                lemmatized_word = self.lemmatize(word)
                if word not in self.stopwords and len(word) > 2 and not bool(re.search(r'\d',lemmatized_word)) and lemmatized_word not in self.stopwords and lemmatized_word in self.lda.vocas_id:
                    result_vector[r] += phi[r,self.lda.vocas_id[lemmatized_word]]

        result = 0
        if result_vector.argmax() == 0:
            v = max(n for n in result_vector if n != max(result_vector))
            result = numpy.argwhere(result_vector == v)
        else:
            result = result_vector.argmax()
        print self.lda.labelmap.keys()[self.lda.labelmap.values().index(result)]
        return self.lda.labelmap.keys()[self.lda.labelmap.values().index(result)]

    def save_model(self, model_name):
        with open(os.path.join(self.file_dir,model_name + "_trained.p"),'wb') as model_file:
            pickle.dump(self.lda,model_file,protocol=pickle.HIGHEST_PROTOCOL)

    def load_model(self,model_name):
        if os.path.isfile(os.path.join(self.file_dir,model_name+ "_trained.p")):
            with open(os.path.join(self.file_dir,model_name + "_trained.p"),'rb') as model_file:
                return pickle.load(model_file)
        else:
            print "Trained model for %s is not found in \"%s\" directory" % ((model_name), (file_dir))
            print "Please train the model"

    def get_stopwords(self):
        return Stopword(self.file_dir).get_stopwords()
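
A hypothetical usage sketch for the Classifier above; the option values, file name, and model name are assumptions, and the options object only needs the K, alpha, beta, and iteration attributes that train_model reads.

class DemoOptions(object):
    K = 10
    alpha = 0.001
    beta = 0.001
    iteration = 100

clf = Classifier(DemoOptions())
clf.train_model("labels.csv", "demo")   # trains the labeled LDA and pickles it under ./build/
print(clf.classify("demo", "some text to label"))
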
Example #37
alpha = 1./5
lmda = 1./2

#Top down LDA data
X = sp.coo_matrix((M, V)).tolil()
beta = np.zeros((K, V))
for k in range(K):
    beta[k, :] = np.random.dirichlet(np.ones(V)*lmda)
for d in range(M):
    theta_d = np.random.dirichlet(np.ones(K)*alpha)
    zs = np.random.choice(np.arange(K), size=numwords, p=theta_d)
    for z in zs:
        w_n = np.random.choice(np.arange(V), p=beta[z, :])
        X[d, w_n] += 1

lda = LDA(alpha=alpha, lmda=lmda, nr_em_epochs=10)

print "No collapsing"
props, word_props, log_Xsno, perpno = lda.gibbs_sample(X)
# plt.plot(range(len(log_Xs)), log_Xs, '*-')
# plt.show()
#
# plt.plot(range(len(perp)), perp, 'o-')
# plt.show()
print "Perplexity:"
print perpno

print "logX:"
print log_Xsno

print "All collapsed"
    def clustering_measure(self, n_cluster):
        km = KMeans(n_cluster)
        km.fit(self.doc_features)
        print("Adjusted Rand-Index: %.3f"
              % metrics.adjusted_rand_score(self.doc_class, km.labels_))

    def cross_validation(self):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            self.doc_features, self.doc_class, test_size=0.4, random_state=0)
        clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
        print ("Cross-Validation Score: %.3f" % clf.score(X_test, y_test))


if __name__ == '__main__':
    # load dataset
    dataset = CNN()
    dataset.load_data('/home/yi/Dropbox/workspace/data/cnn/')

    # train lda
    lda = LDA(5)
    lda.initialize(dataset.data_matrix)
    #lda.load_label('labels.txt', dataset.dictionary)
    for iter in range(20):
        lda.fit(dataset.data_matrix)
    lda.fininsh()
    lda.print_top_words(dataset.dictionary, 10)

    # evaluate lda
    eval = Evaluator(dataset, lda)
    eval.clustering_measure(n_cluster=5)
    eval.cross_validation()
Example #39
# bow = bow / bow.sum(axis=1)[:, None]

# Number of docs
n_docs = bow.shape[0]
# Number of unique words in the vocabulary
n_vocab = bow.shape[1]
# Number of dimensions in a single word vector
n_units = 256
# number of topics
n_topics = 20
batchsize = 128
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]

model = LDA(n_docs, n_topics, n_units, n_vocab)
if os.path.exists('lda.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(50000000):
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = prepare_topics(p, f, w, words)
Example #40
        [4, 5, 8, 9, ],
    ]
    #~ l = [
        #~ [1,2,3,4,5,],
        #~ [1,2,3,4,5,],
        #~ [6,7,8,9,10,],
        #~ [6,7,8,9,10,],
        #~ [1,2,3,4,5,],
    #~ ]
    for d in l:
        yield d


if __name__ == "__main__":
    # Format: (name of analysis, number of topics, alpha, beta, burn, length, dataset feature vector iterator)
    given = [
        #~ ("test", 2, 0.1, 0.1, 100, 10, test_data),
        ('state_of_the_union', 5, 0.1, 0.1, 499, 1, state_of_the_union),
    ]
    
    for settings in given:
        analysis = LDA(settings[1], settings[2], settings[3], settings[4], settings[5])
        print(settings[0])
        analysis.run_analysis(settings[6]())
        analysis.print_topics(10)
        with io.open('results_%s.json'%(settings[0]), 'w', encoding='utf-8', errors='ignore') as f:
            f.write(unicode(json.dumps(analysis.log_likelihoods)))
        
        
        
Example #41
from lda import LDA, _doc_update, _slice_doc_update
import pickle
import numpy as np

np.seterr(divide="raise")

from data.datafile import AADataFile
dfile = pickle.load(open("data/datafile.pkl"))

dt = dfile.DT
te = dfile.TE

f = te.toarray().argmax(axis=1)

lda = LDA(K=10, n_jobs=8, nr_em_epochs=20)

perp, b, g, e = lda.fit(dt, f)
def __init__(self, n_topics, alpha=0.1, beta=0.01, random_state=0):
    # forward the caller's arguments rather than re-hardcoding the defaults
    LDA.__init__(self, n_topics, alpha=alpha, beta=beta, random_state=random_state)
from lda import LDA
from settings import demo_dataset_dir


model = LDA()
model.train(dataset_dir=demo_dataset_dir, output_final_result=True)
Example #44
    def run(self, mode,  cntStatus = True, saveVid = False, showVid = True ):
        lbp = lbp_feature()
        # neural_network = neural_net(75, 3)
        # neural_network.create_struct(150)
        # neural_network.load_model(settings.STATICFILES_DIRS[0])
        lda = LDA(75, 3)
        #lda.create_struct(150)
        if mode == 'predict':
            lda.load_model(settings.STATICFILES_DIRS[0])
        self.video.set(cv2.cv.CV_CAP_PROP_POS_MSEC, 0)
        kernel = np.ones((10, 10), np.uint8)
        lanes = [[] for x in range(self.totalLane)]
        totalCars = [0] * self.totalLane
        num_car_detect = 0
        self.timer = threading.Timer(5.0, self.progress)
        self.timer.start()
        while self.video.isOpened():
            ret, frame = self.video.read()
            if not ret:
                break
            frameOrigin = deepcopy(frame)
            res = frame
            self.num_frame +=1
            for point in self.lanePoints:
                cv2.polylines(frame, [point], True, (0, 255, 0), 3)

            filteredFrame = cv2.GaussianBlur(frame, (5, 5), 0)
            if self.fgMask is None:
                self.fgMask = self.subtractor.apply(filteredFrame, -1)
                test = deepcopy(self.fgMask)
            self.fgMask = self.subtractor.apply(filteredFrame, self.fgMask, -1)
            self.fgMask = cv2.dilate(self.fgMask, kernel, iterations=1)
            self.fgMask = cv2.erode(self.fgMask, kernel, iterations=1)

            self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_CLOSE, np.ones((30, 30), np.uint8))
            self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_CLOSE, np.ones((30, 30), np.uint8))
            self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_OPEN, np.ones((5, 5), np.uint8))
            tempMask = deepcopy(self.fgMask)
            carImg = cv2.bitwise_and(frameOrigin, frameOrigin, mask=self.fgMask)
# Section tracking and Detection
            contours, hrc = cv2.findContours(tempMask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_KCOS)
            isIn = [False] * self.totalLane
            laneObj = [[] for x in range(self.totalLane)]

            outLane = [[] for x in range(self.totalLane)]
            for obj in contours:
                moment = cv2.moments(obj)
                if moment['m00'] == 0:
                    continue
                cx = int(moment['m10']/moment['m00'])
                cy = int(moment['m01']/moment['m00'])
                pX, pY, w, h = cv2.boundingRect(obj)

                isNotLane = True
                for numLane in range(len(self.laneContours)):
                    if cv2.pointPolygonTest(self.laneContours[numLane][0], (cx, cy), False) == 1:
                        car_object = {"centroid": (cx, cy+h/2), "origin": (pX, pY), "height": h, "width": w}
                        laneObj[numLane].append(car_object)
                        isNotLane = False
                        break
                if isNotLane:
                    for numLane in range(len(self.laneContours)):
                        lanePoint =  self.lanePoints[numLane]

                        if cx >= lanePoint[3][0][0] and cx <= lanePoint[2][0][0]\
                                and cy >= lanePoint[3][0][1]  and cy <= lanePoint[3][0][1]+50:
                            car_object = {"centroid": (cx, cy+h/2), "origin": (pX, pY), "height": h, "width": w}
                            outLane[numLane].append(car_object)

            for numLane in range(len(self.laneContours)):
                for i in outLane[numLane]:
                    diffRange = 50
                    foundedObj = None
                    for j in lanes[numLane]:
                        diff = math.fabs(j["point"][0][0] - i["centroid"][0]) + math.fabs(j["point"][0][1] - i["centroid"][1])
                        if diff < diffRange:
                            diffRange = diff
                            foundedObj = j
                    if foundedObj is not None:
                        totalCars[numLane] += 1
                        originX = i["origin"][0]
                        originY = i["origin"][1]
                        crop_img = frameOrigin[originY:originY + i["height"], originX:originX+i["width"]]
                        normal_image = cv2.resize(crop_img, (64, 64))
                        num_car_detect += 1
                        if mode == 'train':
                            directory = settings.STATICFILES_DIRS[0]+'main_app/media/train_image/'
                            if not os.path.exists(directory):
                                os.makedirs(directory)
                            cv2.imwrite(directory + 'car'+str(num_car_detect)+'.png', crop_img)
                        if mode == 'predict':
                            height, width, channels = crop_img.shape
                            size_data = [height/100.0, width/100.0, height * width/10000.0]
                            lbp.read_image(normal_image)
                            feature = lbp.extract_feature(size_data[0], size_data[1], size_data[2])
                            #answer = neural_network.predict(feature)
                            answer = int(lda.predict(feature))
                            save_type(self.video_name, answer, self.num_frame)
                            if answer == 2:
                                self.typeCar["small"] += 1
                            elif answer == 1:
                                self.typeCar["medium"] += 1
                            else:
                                self.typeCar["large"] += 1
                            print answer
                            file_name = self.video_name[:self.video_name.find('.avi')] + '.png'
                            path = settings.STATICFILES_DIRS[0]+'main_app/media/result_image/'+str(num_car_detect)+'-'+str(answer)+'-'+file_name
                            cv2.imwrite(path, crop_img)
                        lanes[numLane].remove(foundedObj)

                for i in lanes[numLane]:
                    i["stat"] = False
                for i in laneObj[numLane]:
                    diffRange = 50
                    foundedObj = None
                    for j in lanes[numLane]:
                        diff = math.fabs(j["point"][0][0] - i["centroid"][0]) + math.fabs(j["point"][0][1] - i["centroid"][1])

                        if diff < diffRange:
                            diffRange = diff
                            foundedObj = j
                    if foundedObj is not None:
                        foundedObj["point"].insert(0, i["centroid"])
                        foundedObj["stat"] = True
                    else:
                        lanes[numLane].append({ "point": [i["centroid"]], "stat": True })
                tempLane = []
                for i in lanes[numLane]:
                    if i["stat"]:
                        tempLane.append(i)
                        cv2.polylines(res, np.int32([i["point"]]), False, (0, 255, 255), 3)
                lanes[numLane] = tempLane

# Section Draw TrackLine
            for obj in contours:
                moment = cv2.moments(obj)
                if moment['m00'] == 0:
                    continue
                pX, pY, w, h = cv2.boundingRect(obj)
                cx = int(moment['m10']/moment['m00'])
                cy = int(moment['m01']/moment['m00'])+h/2
                cv2.circle(res, (cx, cy), 3, (0, 0, 255), 4)
                distance = []
                for numLane in range(len(self.laneContours)):
                    distance.append(cv2.pointPolygonTest(self.laneContours[numLane][0], (cx, cy), False))
                for numLane in range(len(self.laneContours)):
                    if distance[numLane] == 1:
                        isIn[numLane] = True
                        cv2.rectangle(res, (pX, pY), (pX+w, pY+h), (0, 255, 255), 2)
                        if self.lanes[numLane]["is_empty"]:
                            self.lanes[numLane]["is_empty"] = False
                            self.lanes[numLane]["pts"].append((cx, cy))
                        else:
                            self.lanes[numLane]["pts"].insert(0, (cx, cy))
                        break
                    else:
                        cv2.rectangle(res, (pX, pY), (pX+w, pY+h), (255, 255, 0), 2)
            for i in range(0, self.totalLane):
                if isIn[i]:
                    if showVid:
                        pass
                else:
                    self.lanes[numLane]["is_empty"] = True
                    self.lanes[numLane]["pts"] = []
            if cntStatus:
                cv2.putText(res, 'lane1: '+str(totalCars[0]), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.putText(res, 'lane2: '+str(totalCars[1]), (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (125, 0, 255), 2)
                cv2.putText(res, 'truck/bus: '+str(self.typeCar["large"]), (400, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
                cv2.putText(res, 'small car: '+str(self.typeCar["medium"]), (400, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
                cv2.putText(res, 'motorcycle: '+str(self.typeCar["small"]), (400, 110), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

            if showVid:
                resMask = cv2.bitwise_and(frame, frame, mask=~self.fgMask)
                cv2.imshow('frame', res)
                if cv2.waitKey(5) & 0xFF == ord('q'):
                    cv2.imwrite('tesf.png', frameOrigin)
                    cv2.imwrite('tesM.png', self.fgMask)
                    break
        self.timer.cancel()
        update_progress(self.video_name, self.num_frame, self.total_frame)
        print totalCars
        self.video.release()
        cv2.destroyAllWindows()
        print self.typeCar
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 

if __name__ == '__main__':
	startTime = datetime.datetime.now()  
	documentList = ["./texts/t11.txt","./texts/t22.txt"] 
	# documentList = ["./texts/test_shak1.txt"] 
	# documentList = ["./texts/shak.txt"] 
	totalDocs = len(documentList) 
	# Add language check on init and load correct stopwords list   
	stopList = stopwords.words('english') 
	# Init weighting libraries 
	TfIdf = TfIdf(documentList, stopList) 
	LSI = LSI(documentList, stopList) 
	LDA = LDA(documentList, stopList) 
	# Loop to get this argument 
	print "Ready " 
	while 1:
		try:
			line = sys.stdin.readline()
			print (TfIdf.runQuery(line)) 
			print (LSI.runQuery(line)) 
			print (LDA.runQuery(line)) 
		except KeyboardInterrupt:
			break
		if not line:
			break 



Example #46
import pickle

from lda import LDA

from data.datafile import AADataFile
dfile = pickle.load(open("data/datafile.pkl"))

dt = dfile.DT
te = dfile.TE


lda = LDA(K=10, n_jobs=8, nr_em_epochs=20)

perp, b, g = lda.fit(dt)