def create_topics(choise):
    """Print LDA topics computed separately for each author's articles.

    Documents matching the filter ``types[choise]`` are read from
    ``db.documents`` (only ``_id`` and ``authors`` are projected) and their
    ids are grouped by ``author['name']``.  For every author a new ``LDA``
    instance is created and ``apply`` is run on that author's article ids;
    the first element of its result is iterated as the list of topics.

    Output, per author: one header line, then one line per topic made of
    the topic index followed by ``elem[1]`` of every topic entry
    (presumably the word -- confirm against ``LDA.apply``).

    Args:
        choise: key into the module-level ``types`` mapping that selects
            the MongoDB filter to use.
    """
    articles_by_author = {}
    cursor = db.documents.find(types[choise], {'_id': 1, 'authors': 1})
    for article in cursor:
        # A document without (or with an empty/None) 'authors' field is
        # skipped instead of aborting the whole run with a KeyError.
        for author in article.get('authors') or []:
            articles_by_author.setdefault(author['name'], []).append(
                article['_id'])

    for author, article_ids in articles_by_author.items():
        no_articles = len(article_ids)
        lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
        query = {'_id': {'$in': article_ids}}
        no_topics = number_topics(no_articles)
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('%s topics for: %s articles, no topics %s'
              % (author, no_articles, no_topics))
        topics = lda.apply(query=query, num_topics=no_topics, num_words=10,
                           iterations=1500)[0]
        for idx, topic in enumerate(topics):
            # Every word keeps its trailing space, as in the original output.
            words = ''.join(elem[1] + ' ' for elem in topic)
            print('%s %s' % (idx, words))
def create_topics(choise):
    """Print LDA topics computed separately for each affiliation's articles.

    Every name in ``onlyfiles`` is read as ``<document id>`` followed by
    four characters (presumably a file extension such as ``.txt``).  If a
    document with that id also matches the filter ``types[choise]``, the
    file under ``mypath`` is scanned: each line containing ``'@'`` is passed
    to ``extract_affiliation`` and the document id is recorded under every
    affiliation returned.  For every affiliation a new ``LDA`` instance is
    then created and ``apply`` is run on the collected ids; the first
    element of its result is iterated as the list of topics.

    Output, per affiliation: one header line, then one line per topic made
    of the topic index followed by ``elem[1]`` of every topic entry
    (presumably the word -- confirm against ``LDA.apply``).

    Args:
        choise: key into the module-level ``types`` mapping that selects
            the MongoDB filter to use.

    Raises:
        ValueError: if a file name minus its last four characters is not
            an integer.
    """
    affiliation_articles = {}
    for f in onlyfiles:
        d_id = int(f[:-4])
        # Build a fresh filter for each file.  Writing '_id' into
        # types[choise] itself would leave a stale '_id' in the shared
        # module-level filter for every later user of it.
        selection = dict(types[choise], _id=d_id)
        if not db.documents.find_one(selection):
            continue
        with codecs.open(mypath + '/' + f) as open_file:
            for line in open_file:
                if '@' not in line:
                    continue
                for affiliation in extract_affiliation(line):
                    # A set keeps the ids unique without rebuilding a
                    # list on every hit.
                    affiliation_articles.setdefault(
                        affiliation, set()).add(d_id)

    for affiliation, ids in affiliation_articles.items():
        # '$in' needs a list; sorting makes the query deterministic.
        article_ids = sorted(ids)
        no_articles = len(article_ids)
        lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
        query = {'_id': {'$in': article_ids}}
        no_topics = number_topics(no_articles)
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.  The
        # double space after "for" reproduces the original output.
        print('%s topics for  %s articles, no topics %s'
              % (affiliation, no_articles, no_topics))
        topics = lda.apply(query=query, num_topics=no_topics, num_words=10,
                           iterations=1500)[0]
        for idx, topic in enumerate(topics):
            # Every word keeps its trailing space, as in the original output.
            words = ''.join(elem[1] + ' ' for elem in topic)
            print('%s %s' % (idx, words))
def thread_lda(k, db_name, query):
    """Run LDA on ``db_name`` and publish per-topic totals in module state.

    Progress and output are exposed through the module-level dicts
    ``lda_running`` and ``lda_results``, both keyed by ``db_name``:

    * ``lda_running[db_name]`` is True while the computation is in
      progress and is always reset to False on exit, even on failure.
    * ``lda_results[db_name]`` is None until the run succeeds, then a list
      of ``[topic_index, summed_score, results[0][topic_index]]`` entries,
      one per topic index in ``range(k)``.

    The score of a topic is the sum of ``float(entry[1])`` over every entry
    of every item in ``results[1]`` whose ``int(entry[0])`` equals the
    topic index (presumably per-document topic weights -- confirm against
    ``LDA.apply``).

    Args:
        k: number of topics requested from ``LDA.apply``; also the length
            of the published result list.
        db_name: database name passed to ``LDA`` and used as the key in
            ``lda_running`` / ``lda_results``.
        query: passed through unchanged to ``LDA.apply``.
    """
    # Both dicts are only mutated in place, never rebound, so no ``global``
    # declaration is required.
    lda_running[db_name] = True
    lda_results[db_name] = None
    try:
        lda = LDA(dbname=db_name, host=host, port=port)
        results = lda.apply(query=query, num_topics=k, num_words=10,
                            iterations=500)
        scores = [0] * k
        for doc in results[1]:
            for topic in doc:
                scores[int(topic[0])] += float(topic[1])
        # Publish the results BEFORE clearing the running flag, so a poller
        # that sees "not running" never reads a still-empty result.
        lda_results[db_name] = [
            [i, scores[i], results[0][i]] for i in range(k)
        ]
    finally:
        # Always clear the flag.  Otherwise an exception in LDA would leave
        # the database marked as "running" forever.
        lda_running[db_name] = False
def create_topics(choise):
    """Print LDA topics computed separately for each affiliation's articles.

    Every name in ``onlyfiles`` is read as ``<document id>`` followed by
    four characters (presumably a file extension such as ``.txt``).  If a
    document with that id also matches the filter ``types[choise]``, the
    file under ``mypath`` is scanned: each line containing ``'@'`` is passed
    to ``extract_affiliation`` and the document id is recorded under every
    affiliation returned.  For every affiliation a new ``LDA`` instance is
    then created and ``apply`` is run on the collected ids; the first
    element of its result is iterated as the list of topics.

    Output, per affiliation: one header line, then one line per topic made
    of the topic index followed by ``elem[1]`` of every topic entry
    (presumably the word -- confirm against ``LDA.apply``).

    Args:
        choise: key into the module-level ``types`` mapping that selects
            the MongoDB filter to use.

    Raises:
        ValueError: if a file name minus its last four characters is not
            an integer.
    """
    affiliation_articles = {}
    for f in onlyfiles:
        d_id = int(f[:-4])
        # Build a fresh filter for each file.  Writing '_id' into
        # types[choise] itself would leave a stale '_id' in the shared
        # module-level filter for every later user of it.
        selection = dict(types[choise], _id=d_id)
        if not db.documents.find_one(selection):
            continue
        with codecs.open(mypath + '/' + f) as open_file:
            for line in open_file:
                if '@' not in line:
                    continue
                for affiliation in extract_affiliation(line):
                    # A set keeps the ids unique without rebuilding a
                    # list on every hit.
                    affiliation_articles.setdefault(
                        affiliation, set()).add(d_id)

    for affiliation, ids in affiliation_articles.items():
        # '$in' needs a list; sorting makes the query deterministic.
        article_ids = sorted(ids)
        no_articles = len(article_ids)
        lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
        query = {'_id': {'$in': article_ids}}
        no_topics = number_topics(no_articles)
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.  The
        # double space after "for" reproduces the original output.
        print('%s topics for  %s articles, no topics %s'
              % (affiliation, no_articles, no_topics))
        topics = lda.apply(query=query, num_topics=no_topics, num_words=10,
                           iterations=1500)[0]
        for idx, topic in enumerate(topics):
            # Every word keeps its trailing space, as in the original output.
            words = ''.join(elem[1] + ' ' for elem in topic)
            print('%s %s' % (idx, words))
def create_topics(choise):
    """Print LDA topics computed separately for each author's articles.

    Documents matching the filter ``types[choise]`` are read from
    ``db.documents`` (only ``_id`` and ``authors`` are projected) and their
    ids are grouped by ``author['name']``.  For every author a new ``LDA``
    instance is created and ``apply`` is run on that author's article ids;
    the first element of its result is iterated as the list of topics.

    Output, per author: one header line, then one line per topic made of
    the topic index followed by ``elem[1]`` of every topic entry
    (presumably the word -- confirm against ``LDA.apply``).

    Args:
        choise: key into the module-level ``types`` mapping that selects
            the MongoDB filter to use.
    """
    articles_by_author = {}
    cursor = db.documents.find(types[choise], {'_id': 1, 'authors': 1})
    for article in cursor:
        # A document without (or with an empty/None) 'authors' field is
        # skipped instead of aborting the whole run with a KeyError.
        for author in article.get('authors') or []:
            articles_by_author.setdefault(author['name'], []).append(
                article['_id'])

    for author, article_ids in articles_by_author.items():
        no_articles = len(article_ids)
        lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
        query = {'_id': {'$in': article_ids}}
        no_topics = number_topics(no_articles)
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('%s topics for: %s articles, no topics %s'
              % (author, no_articles, no_topics))
        topics = lda.apply(query=query, num_topics=no_topics, num_words=10,
                           iterations=1500)[0]
        for idx, topic in enumerate(topics):
            # Every word keeps its trailing space, as in the original output.
            words = ''.join(elem[1] + ' ' for elem in topic)
            print('%s %s' % (idx, words))