def analyze_all_user_pages_globally():
    """
    Aggregate per-wiki user-page docs into one global doc per user, then
    write scaled authority/contribs scores back to the all-user-pages
    collection.
    """
    collection = solr.all_user_pages_collection()
    new_docs = {}
    # BUG FIX: the loop reads doc['contribs_f'], but the original query only
    # requested "id,doc_authority_f", so every access raised KeyError.
    for doc in solr.get_all_docs_by_query(collection, '*:*',
                                          fields="id,doc_authority_f,contribs_f"):
        # doc ids are "<wiki-id>_<user-id>"; keep only the trailing user id
        user_id = doc['id'].split('_').pop()
        if user_id in new_docs:
            new_docs[user_id]['total_authority_f']['set'] += doc['doc_authority_f']
            new_docs[user_id]['total_contribs_f']['set'] += doc['contribs_f']
        else:
            new_docs[user_id] = {
                'id': user_id,
                'total_authority_f': {'set': doc['doc_authority_f']},
                'total_contribs_f': {'set': doc['contribs_f']}
            }
    # BUG FIX: scale the numeric values, not the {'set': ...} wrapper dicts
    # (the original passed the dicts themselves into MinMaxScaler and then
    # multiplied two dicts together, which raises TypeError).
    authorities = [doc['total_authority_f']['set'] for doc in new_docs.values()]
    contribs = [doc['total_contribs_f']['set'] for doc in new_docs.values()]
    authority_scaler = MinMaxScaler(authorities)
    contrib_scaler = MinMaxScaler(contribs)
    for doc in new_docs.values():
        scaled_authority = authority_scaler.scale(doc['total_authority_f']['set'])
        scaled_contribs = contrib_scaler.scale(doc['total_contribs_f']['set'])
        # wrap in {'set': ...} like every other field, so Solr treats these
        # as atomic updates (consistent with the sibling analyze_* functions)
        doc['scaled_authority_f'] = {'set': scaled_authority}
        doc['contribs_scaled_f'] = {'set': scaled_contribs}
        doc['scaled_authority_contribs_f'] = {'set': scaled_authority * scaled_contribs}
    collection.add(new_docs.values())
    collection.commit()
def author_centrality(titles_to_authors):
    """
    Identifies the centrality of an author

    :param titles_to_authors: a dict keying title strings to the authors associated
    :type titles_to_authors: dict

    :return: a dict matching author to centrality
    :rtype: dict
    """
    graph = digraph()
    # one node per title
    graph.add_nodes([u"title_%s" % title for title in titles_to_authors.keys()])
    # one node per distinct author across every title
    distinct_author_nodes = set()
    for author_list in titles_to_authors.values():
        for author in author_list:
            distinct_author_nodes.add(u'author_%s' % author[u'user'])
    graph.add_nodes(list(distinct_author_nodes))
    # bipartite edges from each title to its authors; a duplicate edge
    # raises AdditionError and is simply skipped
    for title, author_list in titles_to_authors.items():
        for author in author_list:
            try:
                graph.add_edge((u'title_%s' % title, u'author_%s' % author[u'user']))
            except AdditionError:
                pass
    # keep only author nodes from the pagerank result, stripping the prefix
    centralities = {}
    for node, score in pagerank(graph).items():
        if node.startswith(u'author_'):
            centralities['_'.join(node.split('_')[1:])] = score
    centrality_scaler = MinMaxScaler(centralities.values())
    return dict((author_name, centrality_scaler.scale(score))
                for author_name, score in centralities.items())
def analyze_wikis_globally(): print "Analyzing Wikis..." wiki_collection = solr.existing_collection(solr.global_collection()) wiki_docs = [doc for doc in solr.get_all_docs_by_query(wiki_collection, '*:*')] scaler = MinMaxScaler([doc['total_authority_f'] for doc in wiki_docs]) new_docs = [] for doc in wiki_docs: new_docs.append({'id': doc['id'], 'scaled_authority_f': {'set': scaler.scale(doc['total_authority_f'])}}) if len(new_docs) > 10: try: wiki_collection.add(new_docs) except ReadTimeout: sleep(5) try: wiki_collection.add(new_docs) except ReadTimeout: pass new_docs = [] try: wiki_collection.add(new_docs) except ReadTimeout: sleep(5) try: wiki_collection.add(new_docs) except ReadTimeout: pass wiki_collection.commit()
def get_title_top_authors(wiki_id, api_url, all_titles, all_revisions): """ Creates a dictionary of titles and its top authors :param wiki_id: the ID of the wiki :type wiki_id: int :param api_url: the API URL of the wiki :type api_url: str :param all_titles: a list of all title objects :type all_titles: list :param all_revisions: a dict keying titles to revisions :type all_revisions: dict :return: a dict keying title to top authors :rtype: dict """ print "Initializing edit distance data" all_title_len = len(all_titles) group_map = [] for i in range(0, all_title_len, 25): print "%d/%d" % (i, all_title_len) group_map.append(group(prime_edit_distance.s(wiki_id, api_url, title_obj, all_revisions[title_obj[u'title']]) for title_obj in all_titles[i:i+100])()) print "Waiting on initialization to complete" readies = len(filter(lambda x: x.ready(), group_map)) group_size = len(group_map) while False in map(lambda x: x.ready(), group_map): new_readies = len(filter(lambda x: x.ready(), group_map)) if new_readies > readies: print "%d/%d" % (new_readies, group_size) readies = new_readies time.sleep(1) print "Getting contributing authors for titles" title_to_authors = group(get_contributing_authors.s(wiki_id, api_url, title_obj, all_revisions[title_obj[u'title']]) for title_obj in all_titles)().get() contribs_scaler = MinMaxScaler([author[u'contribs'] for title, authors in title_to_authors for author in authors]) print "Scaling top authors" scaled_title_top_authors = {} for title, authors in title_to_authors: new_authors = [] for author in authors: author[u'contribs'] = contribs_scaler.scale(author[u'contribs']) new_authors.append(author) scaled_title_top_authors[title] = new_authors return scaled_title_top_authors
def get_title_top_authors(wiki_id, api_url, all_titles, all_revisions): """ Creates a dictionary of titles and its top authors :param wiki_id: the ID of the wiki :type wiki_id: int :param api_url: the API URL of the wiki :type api_url: str :param all_titles: a list of all title objects :type all_titles: list :param all_revisions: a dict keying titles to revisions :type all_revisions: dict :return: a dict keying title to top authors :rtype: dict """ print "Getting contributing authors for titles" futures = group(get_contributing_authors.s(wiki_id, api_url, title_obj, all_revisions[title_obj[u'title']]) for title_obj in all_titles if title_obj[u'title'] in all_revisions)() future_len = len(futures) cc = futures.completed_count() while not futures.ready(): new_cc = futures.completed_count() if new_cc > cc: print "%d/%d" % (new_cc, future_len) cc = new_cc time.sleep(1) title_to_authors = get_with_backoff(futures, []) if not title_to_authors: print "Failed to get title to authors. Connection failure?" return contribs_scaler = MinMaxScaler([author[u'contribs'] for title, authors in title_to_authors for author in authors]) print "Scaling top authors" scaled_title_top_authors = {} for title, authors in title_to_authors: new_authors = [] for author in authors: author[u'contribs'] = contribs_scaler.scale(author[u'contribs']) new_authors.append(author) scaled_title_top_authors[title] = new_authors return scaled_title_top_authors
def analyze_pages_globally(): print "Analyzing All Pages..." page_collection = solr.all_pages_collection() authorities = [] for page_doc in solr.get_all_docs_by_query(page_collection, '*:*'): authorities.append(page_doc['authority_f']) page_scaler = MinMaxScaler(authorities) docs = [] counter = 0 for page_doc in solr.get_all_docs_by_query(page_collection, '*:*'): docs.append({'id': page_doc['id'], 'scaled_authority_f': {'set': page_scaler.scale(page_doc['authority_f'])}}) counter += 1 if counter % 500: page_collection.add(docs) docs = [] page_collection.commit()
def analyze_users_globally(): print "Analyzing Users..." user_collection = solr.existing_collection(solr.user_collection()) wiki_user_collection = solr.wiki_user_collection() id_to_docs = dict() for user_doc in solr.get_all_docs_by_query(wiki_user_collection, '*:*'): # these are gonna be wiki-id_user-id doc_id = user_doc['id'].split('_').pop() if doc_id not in id_to_docs: id_to_docs[doc_id] = dict(id=doc_id, attr_entities={'set': []}, name_s={'set': user_doc['name_s']}, name_txt_en={'set': user_doc['name_txt_en']}, wikis_is={'set': []}, attr_wikis={'set': []}, authorities_fs={'set': []}, total_authority_f={'set': 0}, scaled_authority_f={'set': 0}) try: map(id_to_docs[doc_id]['attr_entities']['set'].append, user_doc['attr_entities']) id_to_docs[doc_id]['wikis_is']['set'].append(user_doc['wiki_id_i']) id_to_docs[doc_id]['attr_wikis']['set'].append(user_doc['wiki_name_txt']) id_to_docs[doc_id]['authorities_fs']['set'].append(user_doc['total_page_authority_f']) except KeyError: pass # zero f***s id_to_total_authorities = dict([(uid, sum(doc['authorities_fs']['set'])) for uid, doc in id_to_docs.items()]) user_scaler = MinMaxScaler(id_to_total_authorities.values()) for uid, total_authority in id_to_total_authorities.items(): id_to_docs[uid]['total_authority_f']['set'] = total_authority id_to_docs[uid]['scaled_authority_f']['set'] = user_scaler.scale(total_authority) user_collection.add(id_to_docs.values()) user_collection.commit()
def ingest_data(wiki_id):
    """
    Create Solr documents for a given wiki ID

    Fetches wiki details from the Wikia API, indexes every page with its
    entities and authority score, then builds and indexes per-wiki user
    docs and topic docs. Heavy on remote I/O (HTTP, Solr, celery tasks).

    :param wiki_id: the ID of the wiki (int or str)
    :type wiki_id: int

    :return: None on early exit, False if no page-to-entities data exists
    """
    # make sure all pages and all user pages exists
    solr.existing_collection(solr.all_pages_collection())
    solr.existing_collection(solr.all_user_pages_collection())
    resp = requests.get(u'http://www.wikia.com/api/v1/Wikis/Details', params={u'ids': wiki_id})
    items = resp.json()['items']
    # NOTE(review): JSON object keys are strings — if wiki_id is passed as
    # an int this membership test (and the lookup below) presumably always
    # misses; confirm callers pass a str.
    if wiki_id not in items:
        print u"Wiki doesn't exist?"
        return
    api_data = items[wiki_id]
    # base wiki doc; {'set': ...} values are Solr atomic-update directives
    wiki_data = {
        'id': api_data['id'],
        'wam_f': {'set': api_data['wam_score']},
        'title_s': {'set': api_data['title']},
        'attr_title': {'set': api_data['title']},
        'attr_desc': {'set': api_data['desc']}
    }
    # copy every numeric stat into a dynamic *_i field
    for key in api_data['stats'].keys():
        wiki_data['%s_i' % key] = {'set': api_data['stats'][key]}
    wiki_api_data = requests.get(u'%swikia.php' % (api_data[u'url']),
                                 params={u'method': u'getForWiki',
                                         u'service': u'CrossWikiCore',
                                         u'controller': u'WikiaSearchIndexerController'}).json()[u'contents']
    wiki_data[u'hub_s'] = wiki_api_data[u'hub_s']
    # easier
    api_data[u'hub_s'] = wiki_api_data[u'hub_s']
    collection = solr.existing_collection(solr.collection_for_wiki(wiki_id))
    use_caching(is_read_only=True, shouldnt_compute=True)
    wpe = WikiPageToEntitiesService().get_value(wiki_id)
    if not wpe:
        print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", wiki_id
        return False
    documents = []
    grouped_futures = []
    pages_to_authority = WikiAuthorityService().get_value(str(wiki_data['id']))
    # build a page doc per entity-bearing page, dispatching async indexing
    # tasks in batches of 1500 docs (sub-grouped 15 at a time)
    for counter, (doc_id, entity_data) in enumerate(wpe.items()):
        documents.append({
            'id': doc_id,
            # NOTE(review): entity_data.get(u'titles') has no default — if a
            # page has no 'titles' key this concatenation raises TypeError;
            # confirm the service guarantees the key.
            'attr_entities': {'set': list(set(entity_data.get(u'redirects', {}).values()
                                              + entity_data.get(u'titles')))},
            'type_s': {'set': 'Page'},
            'authority_f': {'set': pages_to_authority.get(doc_id, 0)},
            'hub_s': wiki_api_data['hub_s']
        })
        if counter != 0 and counter % 1500 == 0:
            grouped_futures.append(
                group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
            )
            documents = []
    # dispatch whatever is left over from the final partial batch
    grouped_futures.append(
        group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
    )
    # block on completion of all grouped futures
    completed = 0
    total = 0
    # NOTE(review): `> 1` exits while one group is still pending — looks
    # like it should be `> 0`; get_with_backoff below presumably absorbs
    # the stragglers. Confirm before changing.
    while len(filter(lambda x: not x.ready(), grouped_futures)) > 1:
        new_completed = 0
        new_total = 0
        for future in grouped_futures:
            new_completed += future.completed_count()
            new_total += len(future.results)
        if completed != new_completed or total != new_total:
            completed = new_completed
            total = new_total
            print "Grouped Tasks: (%d/%d)" % (completed, total)
        sleep(2)
    # flatten each future's result lists into one deduplicated tuple list
    all_user_tuples = []
    for future in grouped_futures:
        result = get_with_backoff(future, [])
        map(all_user_tuples.extend, result)
    all_user_tuples = list(set(all_user_tuples))
    if not all_user_tuples:
        print "Empty user tuples, bailing"
        return
    # assign the unique user ids to the first variable, and the unique usernames to the second
    all_user_ids, all_users = zip(*all_user_tuples)
    collection.commit()
    solr.all_pages_collection().commit()
    solr.all_user_pages_collection().commit()
    # repeat each entity `count` times so term frequency reflects the count
    wiki_data['attr_entities'] = {'set': []}
    for count, entities in WikiEntitiesService().get_value(str(wiki_id)).items():
        for entity in entities:
            map(wiki_data['attr_entities']['set'].append, [entity] * int(count))  # goddamnit count isn't int
    wiki_data['user_ids_is'] = {'set': all_user_ids}
    wiki_data['attr_users'] = {'set': all_users}
    wiki_data['total_authority_f'] = {'set': sum(pages_to_authority.values())}
    wiki_data['authorities_fs'] = {'set': pages_to_authority.values()}
    wiki_collection = solr.existing_collection(solr.global_collection())
    wiki_collection.add([wiki_data])
    wiki_collection.commit()
    print "Committed wiki data"
    print "Retrieving user docs..."
    # fan out one doc-building task per user tuple and poll until done
    futures = group(build_wiki_user_doc.s(api_data, user_tuple) for user_tuple in all_user_tuples)()
    future_result_len = len(futures.results)
    while not futures.ready():
        print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)
    user_docs = get_with_backoff(futures, [])
    if not user_docs:
        print "User docs was empty. Possibly connection problems."
        return
    # scale authority and contribs across this wiki's users, then store
    # both scaled values and their product
    authority_scaler = MinMaxScaler([doc['total_page_authority_f']['set'] for doc in user_docs])
    contribs_scaler = MinMaxScaler([doc['total_contribs_f']['set'] for doc in user_docs])
    for doc in user_docs:
        scaled_authority = authority_scaler.scale(doc['total_page_authority_f']['set'])
        scaled_contribs = contribs_scaler.scale(doc['total_contribs_f']['set'])
        doc['scaled_authority_f'] = {'set': scaled_authority}
        doc['scaled_contribs_f'] = {'set': scaled_contribs}
        doc['scaled_contribs_authority_f'] = {'set': scaled_authority * scaled_contribs}
    wiki_user_collection = solr.existing_collection(solr.wiki_user_collection())
    wiki_user_collection.add(user_docs)
    wiki_user_collection.commit()
    print "Analyzing topics"
    # one task per distinct entity; progress printed every 5th poll
    futures = group(get_wiki_topic_doc.s(wiki_data['id'], topic)
                    for topic in list(set(wiki_data['attr_entities']['set'])))()
    future_result_len = len(futures.results)
    counter = 0
    while not futures.ready():
        if counter % 5 == 0:
            print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)
        counter += 1
    topic_docs = get_with_backoff(futures, [])
    if not topic_docs:
        print "No topics, probably a connection error"
        return
    # topic docs go both to the wiki's own collection and the global one
    collection.add(topic_docs)
    collection.commit()
    topic_collection = solr.existing_collection(solr.all_topics_collection())
    topic_collection.add(topic_docs)
    topic_collection.commit()