Example #1
0
def create_topics(choise):
    articles_by_author = {}
    article_authors = db.documents.find(types[choise], {
        '_id': 1,
        'authors': 1
    })
    for article in article_authors:
        for author in article['authors']:
            if articles_by_author.get(author['name'], -1) == -1:
                articles_by_author[author['name']] = [article['_id']]
            else:
                articles_by_author[author['name']] += [article['_id']]

    for author in articles_by_author:
        no_articles = len(articles_by_author[author])
        lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
        query = {'_id': {'$in': articles_by_author[author]}}
        idx = 0
        no_topics = number_topics(no_articles)
        print author, 'topics for:', len(
            articles_by_author[author]), 'articles, no topics', no_topics
        for topic in lda.apply(query=query,
                               num_topics=no_topics,
                               num_words=10,
                               iterations=1500)[0]:
            t = ""
            for elem in topic:
                t += elem[1] + " "
            print idx, t
            idx += 1
def create_topics(choise):
	affiliation_articles = {}
	idx = 0
	for f in onlyfiles:
		d_id = int(f[:-4])
		types[choise]['_id'] = d_id
		selected = db.documents.find_one(types[choise])
		if selected:
			with codecs.open(mypath + '/' + f) as open_file:
				l_emails = []
				for line in open_file:
					if '@' in line:
						for affiliation in extract_affiliation(line):
							if affiliation_articles.get(affiliation, -1) == -1:
								affiliation_articles[affiliation] = [d_id]
							else:
								affiliation_articles[affiliation] = list(set(affiliation_articles[affiliation] + [d_id]))

	for affiliation in affiliation_articles:
		no_articles = len(affiliation_articles[affiliation])
		lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
		query = {'_id': {'$in': affiliation_articles[affiliation]}}
		idx = 0
		no_topics = number_topics(no_articles)
		print affiliation, 'topics for ', len(affiliation_articles[affiliation]), 'articles, no topics', no_topics
		for topic in lda.apply(query=query, num_topics=no_topics, num_words=10, iterations=1500)[0]:
			t = ""
			for elem in topic:
				t += elem[1] + " "
			print idx, t
			idx += 1
Example #3
0
File: cats.py Project: MeryemB/CATS
def thread_lda(k, db_name, query):
    """Run LDA for one database and publish the scored topics.

    Marks ``db_name`` as running in the module-level ``lda_running`` dict
    and clears its entry in ``lda_results``, then calls ``lda.apply`` with
    ``k`` topics.  The second element of the result is iterated as
    documents, each a sequence of ``(topic index, weight)`` pairs whose
    weights are summed per topic.  The published value is a list of
    ``[index, summed weight, results[0][index]]`` entries, one per topic.

    Args:
        k: number of topics to request.
        db_name: database name; also the key used in ``lda_running`` and
            ``lda_results``.
        query: query forwarded unchanged to ``lda.apply``.

    If building or applying the model raises, the exception propagates,
    ``lda_results[db_name]`` stays ``None`` and the running flag is still
    reset, so the database is not reported as running forever.
    """
    lda_running[db_name] = True
    lda_results[db_name] = None
    try:
        lda = LDA(dbname=db_name, host=host, port=port)
        results = lda.apply(query=query, num_topics=k, num_words=10,
                            iterations=500)
        scores = [0] * k
        for doc in results[1]:
            for topic in doc:
                scores[int(topic[0])] += float(topic[1])
        # Store the results before the flag is cleared, so a reader that
        # sees "not running" never finds the results still missing.
        lda_results[db_name] = [[i, scores[i], results[0][i]]
                                for i in range(k)]
    finally:
        lda_running[db_name] = False
Example #4
0
def create_topics(choise):
    affiliation_articles = {}
    idx = 0
    for f in onlyfiles:
        d_id = int(f[:-4])
        types[choise]['_id'] = d_id
        selected = db.documents.find_one(types[choise])
        if selected:
            with codecs.open(mypath + '/' + f) as open_file:
                l_emails = []
                for line in open_file:
                    if '@' in line:
                        for affiliation in extract_affiliation(line):
                            if affiliation_articles.get(affiliation, -1) == -1:
                                affiliation_articles[affiliation] = [d_id]
                            else:
                                affiliation_articles[affiliation] = list(
                                    set(affiliation_articles[affiliation] +
                                        [d_id]))

    for affiliation in affiliation_articles:
        no_articles = len(affiliation_articles[affiliation])
        lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
        query = {'_id': {'$in': affiliation_articles[affiliation]}}
        idx = 0
        no_topics = number_topics(no_articles)
        print affiliation, 'topics for ', len(
            affiliation_articles[affiliation]
        ), 'articles, no topics', no_topics
        for topic in lda.apply(query=query,
                               num_topics=no_topics,
                               num_words=10,
                               iterations=1500)[0]:
            t = ""
            for elem in topic:
                t += elem[1] + " "
            print idx, t
            idx += 1
def create_topics(choise):
	articles_by_author = {}
	article_authors = db.documents.find(types[choise], {'_id': 1, 'authors': 1})
	for article in article_authors:
		for author in article['authors']:
			if articles_by_author.get(author['name'], -1) == -1:
				articles_by_author[author['name']] = [article['_id']]
			else:
				articles_by_author[author['name']] += [article['_id']]

	for author in articles_by_author:
		no_articles = len(articles_by_author[author])
		lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
		query = {'_id': {'$in': articles_by_author[author]}}
		idx = 0
		no_topics = number_topics(no_articles)
		print author, 'topics for:', len(articles_by_author[author]), 'articles, no topics', no_topics
		for topic in lda.apply(query=query, num_topics=no_topics, num_words=10, iterations=1500)[0]:
			t = ""
			for elem in topic:
				t += elem[1] + " "
			print idx, t
			idx += 1