Example #1
0
def create_topics(choise):
    """Print LDA topics computed separately for each author's articles.

    Reads the ``_id`` and ``authors`` fields of every document matching the
    filter ``types[choise]``, groups the article ids by ``author['name']``,
    then runs ``LDA.apply`` once per author over that author's articles and
    prints a header line followed by one numbered line per topic.

    Args:
        choise: Key into the ``types`` mapping (defined outside this
            function) selecting the MongoDB filter for the documents.

    Returns:
        None. All results are written to standard output.
    """
    articles_by_author = {}
    cursor = db.documents.find(types[choise], {'_id': 1, 'authors': 1})
    for article in cursor:
        # A projection does not guarantee the field is present, so a missing
        # (or null) ``authors`` is treated as "no authors" rather than
        # aborting the whole run with a KeyError.
        for author in article.get('authors') or []:
            articles_by_author.setdefault(author['name'], []).append(
                article['_id'])

    for author, article_ids in articles_by_author.items():
        no_articles = len(article_ids)
        # A new LDA object is built for every author, as before; whether an
        # instance can be reused across calls is not visible from here.
        lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
        no_topics = number_topics(no_articles)
        # Single-argument print(...) yields the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('%s topics for: %s articles, no topics %s'
              % (author, no_articles, no_topics))
        topics = lda.apply(query={'_id': {'$in': article_ids}},
                           num_topics=no_topics,
                           num_words=10,
                           iterations=1500)[0]
        for idx, topic in enumerate(topics):
            # Element 1 of each entry is the text to show (presumably the
            # word of a (weight, word) pair -- confirm against LDA.apply).
            # Every word keeps its trailing space to match the old output.
            words = ''.join(elem[1] + ' ' for elem in topic)
            print('%s %s' % (idx, words))
Example #2
0
def create_topics(choise):
    """Print LDA topics computed separately for each affiliation's articles.

    For every file name in ``onlyfiles`` the document id is taken from the
    name without its last four characters (presumably a ``.xxx`` extension
    -- confirm). If a document with that id also matches the filter
    ``types[choise]``, each line of the file containing ``'@'`` is passed to
    ``extract_affiliation`` and the id is recorded under every affiliation
    returned. ``LDA.apply`` is then run once per affiliation and a header
    line plus one numbered line per topic is printed.

    Args:
        choise: Key into the ``types`` mapping (defined outside this
            function) selecting the MongoDB filter for the documents.

    Returns:
        None. All results are written to standard output.

    Raises:
        ValueError: If a file name minus its last four characters is not an
            integer.
    """
    affiliation_articles = {}
    for f in onlyfiles:
        d_id = int(f[:-4])
        # Work on a copy of the shared filter: writing ``_id`` into
        # ``types[choise]`` itself would leave a stale id constraint behind
        # for every later user of that filter.
        selector = dict(types[choise])
        selector['_id'] = d_id
        if not db.documents.find_one(selector):
            continue
        with codecs.open(mypath + '/' + f) as open_file:
            for line in open_file:
                if '@' not in line:
                    continue
                for affiliation in extract_affiliation(line):
                    # A set per affiliation deduplicates ids without
                    # rebuilding a list on every occurrence.
                    affiliation_articles.setdefault(
                        affiliation, set()).add(d_id)

    for affiliation, id_set in affiliation_articles.items():
        # The ``$in`` operand has to be a list; a set cannot be encoded.
        article_ids = list(id_set)
        no_articles = len(article_ids)
        # A new LDA object is built for every affiliation, as before; whether
        # an instance can be reused across calls is not visible from here.
        lda = LDA(dbname=dbname, host='localhost', port=27017, language='FR')
        no_topics = number_topics(no_articles)
        # Single-argument print(...) yields the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('%s topics for  %s articles, no topics %s'
              % (affiliation, no_articles, no_topics))
        topics = lda.apply(query={'_id': {'$in': article_ids}},
                           num_topics=no_topics,
                           num_words=10,
                           iterations=1500)[0]
        for idx, topic in enumerate(topics):
            # Element 1 of each entry is the text to show (presumably the
            # word of a (weight, word) pair -- confirm against LDA.apply).
            # Every word keeps its trailing space to match the old output.
            words = ''.join(elem[1] + ' ' for elem in topic)
            print('%s %s' % (idx, words))