def author_centrality(titles_to_authors):
    # Build a bipartite graph of title and author nodes, then run PageRank
    # over it to estimate each author's centrality.
    author_graph = digraph()
    author_graph.add_nodes(map(lambda x: u"title_%s" % x, titles_to_authors.keys()))
    author_graph.add_nodes(list(set(
        [u'author_%s' % author[u'user']
         for authors in titles_to_authors.values()
         for author in authors])))

    for title in titles_to_authors:
        log.debug(u"Working on title: %s" % title)
        for author in titles_to_authors[title]:
            try:
                author_graph.add_edge(
                    (u'title_%s' % title, u'author_%s' % author[u'user']))
            except AdditionError:
                pass

    # Keep only author nodes and strip the "author_" prefix from each key.
    centralities = dict([
        ('_'.join(item[0].split('_')[1:]), item[1])
        for item in pagerank(author_graph).items()
        if item[0].startswith(u'author_')])

    # Scale raw PageRank values into a common range.
    centrality_scaler = MinMaxScaler(centralities.values())
    return dict([(cent_author, centrality_scaler.scale(cent_val))
                 for cent_author, cent_val in centralities.items()])
def get_title_top_authors(args, all_titles, all_revisions):
    # Compute the top contributing authors for each title in parallel.
    pool = multiprocessing.Pool(processes=args.processes)
    title_top_authors = {}
    r = pool.map_async(
        get_contributing_authors_safe,
        [(title_obj, all_revisions.get(title_obj[u'title'], []))
         for title_obj in all_titles],
        callback=title_top_authors.update)
    r.wait()
    if len(title_top_authors) == 0:
        log.info(u"No title top authors for wiki %s" % args.wiki_id)
        log.info(r.get())
        sys.exit(1)

    contribs = [author[u'contribs']
                for title in title_top_authors
                for author in title_top_authors[title]]
    if len(contribs) == 0:
        log.info(u"No contributions for wiki %s" % args.wiki_id)
        sys.exit(1)

    # Scale each author's contribution score into a common range.
    contribs_scaler = MinMaxScaler(contribs)
    scaled_title_top_authors = {}
    for title, authors in title_top_authors.items():
        new_authors = []
        for author in authors:
            author[u'contribs'] = contribs_scaler.scale(author[u'contribs'])
            new_authors.append(author)
        scaled_title_top_authors[title] = new_authors
    return scaled_title_top_authors
def main():
    use_caching()
    args = get_args()
    set_global_num_processes(args.num_processes)
    api_data = get_api_data(args.wiki_id)

    workbook = xlwt.Workbook()
    pages_sheet = workbook.add_sheet("Pages by Authority")
    pages_sheet.write(0, 0, "Page")
    pages_sheet.write(0, 1, "Authority")

    print "Getting Page Data..."
    page_authority = get_page_authority(api_data)

    print "Writing Page Data..."
    pages, authorities = zip(*page_authority)
    scaler = MinMaxScaler(authorities, enforced_min=0, enforced_max=100)
    for i, page in enumerate(pages):
        # xlwt sheets are capped at 65536 rows; stop well before that.
        if i > 65000:
            break
        pages_sheet.write(i+1, 0, page)
        pages_sheet.write(i+1, 1, scaler.scale(authorities[i]))

    print "Getting Author and Topic Data..."
    author_authority = get_author_authority(api_data)
    topic_authority = sorted(WikiTopicsToAuthorityService().get_value(args.wiki_id),
                             key=lambda y: y[1]['authority'], reverse=True)

    print "Writing Author Data..."
    authors_sheet = workbook.add_sheet("Authors by Authority")
    authors_sheet.write(0, 0, "Author")
    authors_sheet.write(0, 1, "Authority")
    authors_topics_sheet = workbook.add_sheet("Topics for Best Authors")
    authors_topics_sheet.write(0, 0, "Author")
    authors_topics_sheet.write(0, 1, "Topic")
    authors_topics_sheet.write(0, 2, "Rank")
    authors_topics_sheet.write(0, 3, "Score")

    # why is total_authority not there?
    all_total_authorities = [author.get('total_authority', 0) for author in author_authority]
    scaler = MinMaxScaler(all_total_authorities, enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, author in enumerate(author_authority):
        authors_sheet.write(i+1, 0, author['name'])
        authors_sheet.write(i+1, 1, scaler.scale(author['total_authority']))
        for rank, topic in enumerate(author['topics'][:10]):
            if pivot_counter > 65000:
                break
            authors_topics_sheet.write(pivot_counter, 0, author['name'])
            authors_topics_sheet.write(pivot_counter, 1, topic[0])
            authors_topics_sheet.write(pivot_counter, 2, rank+1)
            authors_topics_sheet.write(pivot_counter, 3, topic[1])
            pivot_counter += 1
        if i > 65000:
            break

    print "Writing Topic Data"
    topics_sheet = workbook.add_sheet("Topics by Authority")
    topics_sheet.write(0, 0, "Topic")
    topics_sheet.write(0, 1, "Authority")
    topics_authors_sheet = workbook.add_sheet("Authors for Best Topics")
    topics_authors_sheet.write(0, 0, "Topic")
    topics_authors_sheet.write(0, 1, "Author")
    topics_authors_sheet.write(0, 2, "Rank")
    topics_authors_sheet.write(0, 3, "Authority")
    scaler = MinMaxScaler([x[1].get('authority', 0) for x in topic_authority],
                          enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, topic in enumerate(topic_authority):
        topics_sheet.write(i+1, 0, topic[0])
        topics_sheet.write(i+1, 1, scaler.scale(topic[1]['authority']))
        authors = topic[1]['authors']
        for rank, author in enumerate(authors[:10]):
            if pivot_counter > 65000:
                break
            topics_authors_sheet.write(pivot_counter, 0, topic[0])
            topics_authors_sheet.write(pivot_counter, 1, author['author'])
            topics_authors_sheet.write(pivot_counter, 2, rank+1)
            topics_authors_sheet.write(pivot_counter, 3, author['topic_authority'])
            pivot_counter += 1
        if i > 65000:
            break

    print "Saving to Excel"
    wiki_name = api_data['url'].replace('http://', '').replace('.wikia', '').replace('.com/', '')
    fname = "%s-%s-authority-data-%s.xls" % (
        args.wiki_id, wiki_name, datetime.strftime(datetime.now(), '%Y-%m-%d-%H-%M'))
    workbook.save(fname)
    if args.send_to_s3:
        bucket = connect_s3().get_bucket('nlp-data')
        k = bucket.new_key('authority/%s/%s' % (args.wiki_id, fname))
        k.set_contents_from_filename(fname)
    print fname
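

# Minimal entry-point guard, assuming this module is meant to be invoked
# directly as a script (not confirmed by the original source).
if __name__ == '__main__':
    main()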