def run(self):
    """ Drops all indices """
    main_coll = solr.global_collection()
    # drop the per-wiki collection for every wiki id listed in the global index
    for wiki_doc in solr.get_all_docs_by_query(main_coll, '*:*', fields='id'):
        solr.collection_for_wiki(wiki_doc['id']).drop()
    main_coll.drop()
    # then drop each of the shared cross-wiki collections
    for shared_coll in (solr.all_pages_collection(),
                        solr.all_topics_collection(),
                        solr.all_user_pages_collection(),
                        solr.wiki_user_collection(),
                        solr.user_collection()):
        shared_coll.drop()
def get_row(self):
    """ Returns the row from the DB as a dict

    :return: row data
    :rtype: dict
    """
    # id is keyed as "<page_id>_<wiki_id>"; return the first match (None if absent)
    query = 'id:%s_%s' % (str(self.page_id), str(self.wiki_id))
    matches = solr.get_all_docs_by_query(solr.all_pages_collection(), query)
    for match in matches:
        return match
def get_row(self):
    """ Returns the row from the DB as a dict

    :return: row data
    :rtype: dict
    """
    # only request the fields this model declares; return first match (None if absent)
    requested_fields = ','.join(self.fields)
    matches = solr.get_all_docs_by_query(solr.all_pages_collection(),
                                         'id:%s' % self.doc_id,
                                         fields=requested_fields)
    for match in matches:
        return match
def run(self): """ Drops all indices """ global_coll = solr.global_collection() print 'global' global_coll.optimize() for doc in solr.get_all_docs_by_query(global_coll, '*:*', fields='id'): print doc['id'] solr.collection_for_wiki(doc['id']).optimize() print 'all pages' solr.all_pages_collection().optimize() print 'all topics' solr.all_topics_collection().optimize() print 'all user pages' solr.all_user_pages_collection().optimize() print 'wiki user' solr.wiki_user_collection().optimize() print 'user' solr.user_collection().optimize()
def analyze_pages_globally(): print "Analyzing All Pages..." page_collection = solr.all_pages_collection() authorities = [] for page_doc in solr.get_all_docs_by_query(page_collection, '*:*'): authorities.append(page_doc['authority_f']) page_scaler = MinMaxScaler(authorities) docs = [] counter = 0 for page_doc in solr.get_all_docs_by_query(page_collection, '*:*'): docs.append({'id': page_doc['id'], 'scaled_authority_f': {'set': page_scaler.scale(page_doc['authority_f'])}}) counter += 1 if counter % 500: page_collection.add(docs) docs = [] page_collection.commit()
def get_pages(self, limit=10, offset=None, **kwargs):
    """ Gets most authoritative pages for a topic using Authority DB and Wikia API data

    :param limit: Number of results we want
    :type limit: int
    :param offset: offset
    :type offset: int
    :return: a list of objects reflecting page results
    :rtype: list
    """
    # boost by the globally scaled authority score; strip any 'q' from kwargs
    return solr.get_docs_by_query_with_limit(solr.all_pages_collection(),
                                             self.topic,
                                             limit=limit,
                                             offset=offset,
                                             boost='scaled_authority_f',
                                             fields=','.join(PageModel.fields),
                                             **sans_q(kwargs))
def get_pages(self, limit=10, offset=None, for_api=False):
    """ Gets most authoritative pages for a topic using Authority DB and Wikia API data

    :param limit: Number of results we want
    :type limit: int
    :param offset: offset
    :type offset: int
    :param for_api: if it's for the api, we add less
    :type for_api: bool
    :return: a list of objects reflecting page results
    :rtype: list

    NOTE(review): ``for_api`` is accepted but not used anywhere in the body.
    """
    all_pages = solr.all_pages_collection()
    return solr.get_docs_by_query_with_limit(all_pages,
                                             self.topic,
                                             limit=limit,
                                             offset=offset,
                                             boost='scaled_authority_f')
def ingest_data(wiki_id):
    """ Create Solr documents for a given wiki ID

    Fetches wiki metadata from the Wikia API, builds page docs from the
    wiki-page-to-entities service, fans the page docs out to celery
    ``add_with_metadata`` tasks in groups, then writes the aggregated wiki doc,
    per-user docs and topic docs into their collections.

    :param wiki_id: the ID of the wiki (int or str)
    :type wiki_id: int
    :return: None on early exit / completion; False when no entity data exists
    """
    # make sure all pages and all user pages exists
    solr.existing_collection(solr.all_pages_collection())
    solr.existing_collection(solr.all_user_pages_collection())
    # pull top-level wiki metadata (title, desc, wam, stats) from the public API
    resp = requests.get(u'http://www.wikia.com/api/v1/Wikis/Details', params={u'ids': wiki_id})
    items = resp.json()['items']
    if wiki_id not in items:
        print u"Wiki doesn't exist?"
        return
    api_data = items[wiki_id]
    # base wiki doc; dynamic-field suffixes (_f, _s, _i) follow Solr conventions,
    # {'set': ...} values are Solr atomic updates
    wiki_data = {
        'id': api_data['id'],
        'wam_f': {'set': api_data['wam_score']},
        'title_s': {'set': api_data['title']},
        'attr_title': {'set': api_data['title']},
        'attr_desc': {'set': api_data['desc']}
    }
    for key in api_data['stats'].keys():
        wiki_data['%s_i' % key] = {'set': api_data['stats'][key]}
    # ask the wiki itself for its search-indexer metadata (hub name)
    wiki_api_data = requests.get(u'%swikia.php' % (api_data[u'url']),
                                 params={u'method': u'getForWiki',
                                         u'service': u'CrossWikiCore',
                                         u'controller': u'WikiaSearchIndexerController'}).json()[u'contents']
    wiki_data[u'hub_s'] = wiki_api_data[u'hub_s']
    # easier -- downstream tasks read hub_s off api_data directly
    api_data[u'hub_s'] = wiki_api_data[u'hub_s']
    collection = solr.existing_collection(solr.collection_for_wiki(wiki_id))
    use_caching(is_read_only=True, shouldnt_compute=True)
    wpe = WikiPageToEntitiesService().get_value(wiki_id)
    if not wpe:
        print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", wiki_id
        return False
    documents = []
    grouped_futures = []
    pages_to_authority = WikiAuthorityService().get_value(str(wiki_data['id']))
    # build a page doc per entity-bearing page; every 1500 docs, dispatch a
    # celery group of add_with_metadata tasks (15 docs per task)
    for counter, (doc_id, entity_data) in enumerate(wpe.items()):
        documents.append({
            'id': doc_id,
            'attr_entities': {'set': list(set(entity_data.get(u'redirects', {}).values()
                                              + entity_data.get(u'titles')))},
            'type_s': {'set': 'Page'},
            'authority_f': {'set': pages_to_authority.get(doc_id, 0)},
            'hub_s': wiki_api_data['hub_s']
        })
        if counter != 0 and counter % 1500 == 0:
            grouped_futures.append(
                group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
            )
            documents = []
    # dispatch whatever is left over from the last partial batch
    grouped_futures.append(
        group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
    )
    # block on completion of all grouped futures
    # NOTE(review): `> 1` stops polling while one group may still be pending --
    # looks like it was meant to be `> 0`; get_with_backoff below presumably
    # covers the stragglers. Confirm before changing.
    completed = 0
    total = 0
    while len(filter(lambda x: not x.ready(), grouped_futures)) > 1:
        new_completed = 0
        new_total = 0
        for future in grouped_futures:
            new_completed += future.completed_count()
            new_total += len(future.results)
        if completed != new_completed or total != new_total:
            completed = new_completed
            total = new_total
            print "Grouped Tasks: (%d/%d)" % (completed, total)
        sleep(2)
    # each task returns (user_id, user_name) tuples; flatten and dedupe
    all_user_tuples = []
    for future in grouped_futures:
        result = get_with_backoff(future, [])
        # Python 2 eager map used purely for its side effect on all_user_tuples
        map(all_user_tuples.extend, result)
    all_user_tuples = list(set(all_user_tuples))
    if not all_user_tuples:
        print "Empty user tuples, bailing"
        return
    # assign the unique user ids to the first variable, and the unique usernames to the second
    all_user_ids, all_users = zip(*all_user_tuples)
    # commit the page/user-page docs the grouped tasks added
    collection.commit()
    solr.all_pages_collection().commit()
    solr.all_user_pages_collection().commit()
    # repeat each entity `count` times so term frequency reflects its weight
    wiki_data['attr_entities'] = {'set': []}
    for count, entities in WikiEntitiesService().get_value(str(wiki_id)).items():
        for entity in entities:
            map(wiki_data['attr_entities']['set'].append, [entity] * int(count))  # goddamnit count isn't int
    wiki_data['user_ids_is'] = {'set': all_user_ids}
    wiki_data['attr_users'] = {'set': all_users}
    wiki_data['total_authority_f'] = {'set': sum(pages_to_authority.values())}
    wiki_data['authorities_fs'] = {'set': pages_to_authority.values()}
    wiki_collection = solr.existing_collection(solr.global_collection())
    wiki_collection.add([wiki_data])
    wiki_collection.commit()
    print "Committed wiki data"
    print "Retrieving user docs..."
    # build a per-wiki user doc for every contributing user, via celery
    futures = group(build_wiki_user_doc.s(api_data, user_tuple) for user_tuple in all_user_tuples)()
    future_result_len = len(futures.results)
    while not futures.ready():
        print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)
    user_docs = get_with_backoff(futures, [])
    if not user_docs:
        print "User docs was empty. Possibly connection problems."
        return
    # scale each user's authority and contribs to [0, 1] within this wiki
    authority_scaler = MinMaxScaler([doc['total_page_authority_f']['set'] for doc in user_docs])
    contribs_scaler = MinMaxScaler([doc['total_contribs_f']['set'] for doc in user_docs])
    for doc in user_docs:
        scaled_authority = authority_scaler.scale(doc['total_page_authority_f']['set'])
        scaled_contribs = contribs_scaler.scale(doc['total_contribs_f']['set'])
        doc['scaled_authority_f'] = {'set': scaled_authority}
        doc['scaled_contribs_f'] = {'set': scaled_contribs}
        # combined score: product of the two scaled metrics
        doc['scaled_contribs_authority_f'] = {'set': scaled_authority * scaled_contribs}
    wiki_user_collection = solr.existing_collection(solr.wiki_user_collection())
    wiki_user_collection.add(user_docs)
    wiki_user_collection.commit()
    print "Analyzing topics"
    # one topic doc per distinct entity seen on this wiki, again via celery
    futures = group(get_wiki_topic_doc.s(wiki_data['id'], topic)
                    for topic in list(set(wiki_data['attr_entities']['set'])))()
    future_result_len = len(futures.results)
    counter = 0
    while not futures.ready():
        if counter % 5 == 0:
            print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)
        counter += 1
    topic_docs = get_with_backoff(futures, [])
    if not topic_docs:
        print "No topics, probably a connection error"
        return
    # topic docs go to both the per-wiki collection and the global topics collection
    collection.add(topic_docs)
    collection.commit()
    topic_collection = solr.existing_collection(solr.all_topics_collection())
    topic_collection.add(topic_docs)
    topic_collection.commit()
def add_with_metadata(wiki_data, docs):
    """ For a grouping of docs, gets metadata from SearchController, and generates user documents.
    Then commits. Let's hear it for asynchronous request handling.

    :param wiki_data: a dict representing the data we've retrieved from the Wikia API
    :type wiki_data: dict
    :param docs: a list of docs ready to be uploaded to solr
    :type docs: list
    :return: a list of tuples including user ids and accompanying user name
    :rtype: list
    """
    # iter_grouper pads groups with None; drop the padding
    docs = filter(lambda x: x, docs)
    # doc ids are "<wiki_id>_<page_id>"; the search controller wants bare page ids
    params = {
        u'controller': u'WikiaSearchIndexerController',
        u'method': u'get',
        u'service': u'All',
        u'ids': u'|'.join([doc['id'].split('_').pop() for doc in docs if doc])  # doc can be none here LOL
    }
    r = requests.get(u"%swikia.php" % wiki_data['url'], params=params)
    response = r.json()
    contents = response.get('contents', [])
    author_pages = []
    pa = PageAuthorityService()
    # keys are (userid, username) tuples; dict used as a set for dedupe
    user_dict = {}
    for doc in docs:
        # enrich the page doc with title/url/hub from the search-index response
        for search_doc in contents:
            if 'id' not in search_doc:
                continue
            if doc['id'] == search_doc['id']:
                doc.update(dict(
                    attr_title=search_doc['title_en'],
                    title_s=search_doc['title_en'],
                    url_s=search_doc['url'],
                    hub_s=search_doc['hub']
                ))
        users_txt = []
        user_ids_is = []
        total_contribs_f = 0.0
        pa_response = pa.get(doc['id'])
        if pa_response['status'] != 200:
            # skip pages we couldn't get contributor data for
            continue
        wiki_id, page_id = doc['id'].split('_')
        # one "PageUser" doc per (page, contributor) pair
        for contrib in pa_response[doc['id']]:
            user_dict[(contrib['userid'], contrib['user'])] = 1
            users_txt.append(contrib['user'])
            user_ids_is.append(contrib['userid'])
            total_contribs_f += contrib['contribs']
            author_pages.append({
                'id': '%s_%s' % (doc['id'], contrib['userid']),
                'doc_id_s': {'set': doc['id']},
                'wiki_id_i': wiki_id,
                'page_id_i': page_id,
                'user_id_i': '%s' % contrib['userid'],
                'type_s': {'set': 'PageUser'},
                'name_txt_en': {'set': contrib['user']},
                'name_s': {'set': contrib['user']},
                'contribs_f': {'set': contrib['contribs']},
                'attr_entities': {'set': doc['attr_entities']['set']},
                'doc_authority_f': {'set': doc['authority_f']['set']},
                # per-user share of this page's authority, weighted by contribs
                'user_page_authority_f': {'set': contrib['contribs'] * doc['authority_f']['set']},
                'hub_s': doc['hub_s']
            })
        # aggregate contributor info back onto the page doc itself
        doc['attr_users'] = {'set': users_txt}
        doc['user_ids_is'] = {'set': user_ids_is}
        doc['total_contribs_f'] = {'set': total_contribs_f}
    # per-wiki collection gets both kinds; the cross-wiki collections are split
    update_docs = list(docs) + list(author_pages)
    solr.collection_for_wiki(wiki_data['id']).add(update_docs)
    solr.all_pages_collection().add(docs)
    solr.all_user_pages_collection().add(author_pages)
    return user_dict.keys()