def aggregate_global_topic(topic):
    """
    Aggregates per-wiki topic docs for one topic into a single global
    topic atomic-update document.

    :param topic: the topic to aggregate across all wikis
    :type topic: str
    :return: the solr document to add for the GlobalTopic
    :rtype: dict
    """
    collection = solr.all_topics_collection()
    total_authorities = []
    all_user_id_dict = {}
    all_user_name_dict = {}
    all_wikis = []
    for doc in solr.get_all_docs_by_query(collection, topic):
        total_authorities.append(doc['total_authority_f'])
        # Bug fix: per-wiki topic docs store user ids under 'user_ids_is'
        # (see get_wiki_topic_doc); the old guard on 'user_id_is' never
        # matched, so user ids were silently dropped.
        if 'user_ids_is' in doc:
            for user_id in doc['user_ids_is']:
                all_user_id_dict[user_id] = True
        if 'user_names_ss' in doc:
            for user_name in doc['user_names_ss']:
                all_user_name_dict[user_name] = True
        if 'wiki_id_i' in doc:
            all_wikis.append(doc['wiki_id_i'])
    total_authority = sum(total_authorities)
    # Bug fix: the average was total_authority / total_authority (always
    # 1.0); divide by the number of contributing docs instead.
    avg_authority = 0
    if total_authorities and total_authority > 0:
        avg_authority = total_authority / float(len(total_authorities))
    return {
        'id': topic,
        'topic_s': {'set': topic},
        'wikis_is': {'set': all_wikis},
        'user_ids_is': {'set': all_user_id_dict.keys()},
        'user_names_ss': {'set': all_user_name_dict.keys()},
        'total_authority_f': {'set': total_authority},
        'avg_authority_f': {'set': avg_authority},
        'type_s': {'set': 'GlobalTopic'}
    }
def analyze_wikis_globally(): print "Analyzing Wikis..." wiki_collection = solr.existing_collection(solr.global_collection()) wiki_docs = [doc for doc in solr.get_all_docs_by_query(wiki_collection, '*:*')] scaler = MinMaxScaler([doc['total_authority_f'] for doc in wiki_docs]) new_docs = [] for doc in wiki_docs: new_docs.append({'id': doc['id'], 'scaled_authority_f': {'set': scaler.scale(doc['total_authority_f'])}}) if len(new_docs) > 10: try: wiki_collection.add(new_docs) except ReadTimeout: sleep(5) try: wiki_collection.add(new_docs) except ReadTimeout: pass new_docs = [] try: wiki_collection.add(new_docs) except ReadTimeout: sleep(5) try: wiki_collection.add(new_docs) except ReadTimeout: pass wiki_collection.commit()
def analyze_all_user_pages_globally():
    """
    Rolls per-page user docs up to per-user totals, min-max scales both
    authority and contributions, and writes the merged docs back.
    """
    collection = solr.all_user_pages_collection()
    new_docs = {}
    # Bug fix: the loop reads doc['contribs_f'] but the field list omitted
    # it, so every iteration raised KeyError.
    for doc in solr.get_all_docs_by_query(collection, '*:*',
                                          fields="id,doc_authority_f,contribs_f"):
        # doc ids end in the user id (assumes <prefix>_<user_id> — TODO confirm)
        user_id = doc['id'].split('_').pop()
        if user_id in new_docs:
            new_docs[user_id]['total_authority_f']['set'] += doc['doc_authority_f']
            new_docs[user_id]['total_contribs_f']['set'] += doc['contribs_f']
        else:
            new_docs[user_id] = {
                'id': user_id,
                'total_authority_f': {'set': doc['doc_authority_f']},
                'total_contribs_f': {'set': doc['contribs_f']}
            }
    authorities, contribs = [], []
    for doc in new_docs.values():
        # Bug fix: totals live under the atomic-update 'set' key; the old
        # code fed the {'set': ...} dicts themselves into the scalers.
        authorities.append(doc['total_authority_f']['set'])
        contribs.append(doc['total_contribs_f']['set'])
    authorityscaler = MinMaxScaler(authorities)
    contribscaler = MinMaxScaler(contribs)
    for doc in new_docs.values():
        doc['scaled_authority_f'] = authorityscaler.scale(doc['total_authority_f']['set'])
        doc['contribs_scaled_f'] = contribscaler.scale(doc['total_contribs_f']['set'])
        doc['scaled_authority_contribs_f'] = doc['scaled_authority_f'] * doc['contribs_scaled_f']
    collection.add(new_docs.values())
    collection.commit()
def all_wikis(args):
    """
    Accesses all wikis from database

    :param args: unused; kept for interface compatibility
    :return: dict keying wiki name to ids
    :rtype: dict
    """
    # Bug fix: a solr sort clause needs an explicit direction; use
    # 'scaled_authority_f desc' to match the sibling all_wikis() helper.
    return solr.get_all_docs_by_query(solr.global_collection(), '*:*',
                                      sort='scaled_authority_f desc')
def get_all_pages(self):
    """
    Optimized for all pages

    :return: dict of pages
    :rtype: dict
    """
    # Bug fix: a solr sort clause needs an explicit direction; order by
    # authority descending to match the module's other sorted queries.
    return solr.get_all_docs_by_query(solr.collection_for_wiki(self.wiki_id),
                                      'type_s:Page',
                                      sort='authority_f desc')
def analyze_pages_globally(): print "Analyzing All Pages..." page_collection = solr.all_pages_collection() authorities = [] for page_doc in solr.get_all_docs_by_query(page_collection, '*:*'): authorities.append(page_doc['authority_f']) page_scaler = MinMaxScaler(authorities) docs = [] counter = 0 for page_doc in solr.get_all_docs_by_query(page_collection, '*:*'): docs.append({'id': page_doc['id'], 'scaled_authority_f': {'set': page_scaler.scale(page_doc['authority_f'])}}) counter += 1 if counter % 500: page_collection.add(docs) docs = [] page_collection.commit()
def get_row(self):
    """
    Returns the row from the DB as a dict

    :return: row data
    :rtype: dict
    """
    query = 'name_txt_en:"%s"' % self.user_name
    matches = solr.get_all_docs_by_query(solr.user_collection(), query)
    # Only the first match matters; fall through to None when there is none.
    for match in matches:
        return match['attr_entities']
def get_all_authors(self):
    """
    Optimized to get all authors

    :return: an OrderedDict with author dicts
    :rtype: collections.OrderedDict
    """
    # Idiom fix: dropped the stray trailing semicolon.
    return solr.get_all_docs_by_query(solr.wiki_user_collection(),
                                      'wiki_id_i:%s' % self.wiki_id)
def get_all_users(self):
    """
    Optimized to get all users

    :return: an OrderedDict with user dicts
    :rtype: collections.OrderedDict
    """
    query = 'wiki_id_i:%s' % self.wiki_id
    field_list = ','.join(UserModel.fields)
    return solr.get_all_docs_by_query(solr.wiki_user_collection(),
                                      query,
                                      fields=field_list)
def get_row(self):
    """
    Returns the row from the DB as a dict

    :return: row data
    :rtype: dict
    """
    doc_id = '%s_%s' % (self.page_id, self.wiki_id)
    matches = solr.get_all_docs_by_query(solr.all_pages_collection(),
                                         'id:%s' % doc_id)
    # Return the first match only; None if nothing matched.
    for match in matches:
        return match
def get_row(self):
    """
    Gets the database for this wiki

    :rtype: dict
    :return: a dict representing the row and its column titles
    """
    query = 'id:%s' % str(self.wiki_id)
    # Return the first matching doc from the global collection, if any.
    for match in solr.get_all_docs_by_query(solr.global_collection(), query):
        return match
def all_wikis():
    """
    Accesses all wikis from database

    :return: dict keying wiki name to ids
    :rtype: dict
    """
    field_list = ','.join(WikiModel.fields)
    return solr.get_all_docs_by_query(solr.global_collection(),
                                      '*:*',
                                      sort='scaled_authority_f desc',
                                      fields=field_list)
def get_row(self):
    """
    Returns the row from the DB as a dict

    :return: row data
    :rtype: dict
    """
    field_list = ','.join(self.fields)
    matches = solr.get_all_docs_by_query(solr.all_pages_collection(),
                                         'id:%s' % self.doc_id,
                                         fields=field_list)
    # First match wins; implicitly None when the query is empty.
    for match in matches:
        return match
def get_row(self):
    """
    Returns the row from the DB as a dict

    :return: row data
    :rtype: dict
    """
    field_list = ','.join(self.fields + ['attr_entities'])
    matches = solr.get_all_docs_by_query(solr.user_collection(),
                                         "*:*",
                                         fields=field_list)
    # Return only the first doc; None when the collection is empty.
    for match in matches:
        return match
def run(self):
    """
    Drops all indices
    """
    global_coll = solr.global_collection()
    # Drop every per-wiki collection first, then the shared collections.
    for wiki_doc in solr.get_all_docs_by_query(global_coll, '*:*', fields='id'):
        solr.collection_for_wiki(wiki_doc['id']).drop()
    global_coll.drop()
    shared_collections = (solr.all_pages_collection(),
                          solr.all_topics_collection(),
                          solr.all_user_pages_collection(),
                          solr.wiki_user_collection(),
                          solr.user_collection())
    for coll in shared_collections:
        coll.drop()
def build_wiki_user_doc(wiki_data, user_tuple):
    """
    Retrieves data from wiki collection to generate a user document at the wiki level

    :param wiki_data: a dict representing the data we've retrieved from the Wikia API
    :type wiki_data: dict
    :param user_tuple: a tuple containing user id and user name
    :type user_tuple: tuple
    :return: the document we want to add to solr; we will commit in bulk instead of
             blasting the network
    :rtype: dict
    """
    user_id, user_name = user_tuple
    collection = solr.collection_for_wiki(str(wiki_data['id']))
    user_doc = {
        'id': '%d_%d' % (wiki_data['id'], user_id),
        'user_id_i': user_id,
        'wiki_id_i': wiki_data['id'],
        'wiki_name_txt': wiki_data['title'],
        'name_s': {'set': user_name},
        'type_s': {'set': 'WikiUser'},
        'name_txt_en': {'set': user_name},
        'hub_s': wiki_data['hub_s']
    }
    doc_ids = []
    entities = []
    authorities = []
    contribs = []
    query = 'type_s:PageUser AND user_id_i:%d' % user_id
    for doc in solr.get_all_docs_by_query(collection, query):
        doc_ids.append(doc['doc_id_s'])
        # Idiom fix: extend instead of map(list.append, ...) — map for side
        # effects is an anti-pattern (and a no-op on Python 3).
        if 'attr_entities' in doc:
            entities.extend(doc['attr_entities'])
        if 'user_page_authority_f' in doc:
            authorities.append(doc['user_page_authority_f'])
        if 'contribs_f' in doc:
            contribs.append(doc['contribs_f'])
    total_authorities = sum(authorities)
    total_contribs = sum(contribs)
    user_doc['doc_ids_ss'] = {'set': doc_ids}
    user_doc['attr_entities'] = {'set': entities}
    user_doc['total_page_authority_f'] = {'set': total_authorities}
    user_doc['total_contribs_f'] = {'set': total_contribs}
    user_doc['page_authority_fs'] = {'set': authorities}
    user_doc['contribs_fs'] = {'set': contribs}
    user_doc['total_contribs_authority_f'] = {'set': total_authorities * total_contribs}
    return user_doc
def get_topics_for_wiki(self, wiki_id, limit=10, offset=0, **kwargs):
    """
    Gets most important topics for this user on this wiki

    :param wiki_id: the wiki id
    :type wiki_id: str
    :param limit: limit
    :type limit: int
    :param offset: offset
    :type offset: int
    :return: ordered dict of topic name to auth or a list of dicts for api
    :rtype: collections.OrderedDict|list
    """
    # Bug fix: wiki-user docs store the compound '<wiki_id>_<user_id>'
    # value as the doc id (see build_wiki_user_doc); 'user_id_i' holds
    # only the bare user id, so the old query could never match.
    query = 'id:%d_%d' % (wiki_id, self.user_id)
    field_list = ','.join(TopicModel.fields + ['attr_entities'])
    for doc in solr.get_all_docs_by_query(solr.wiki_user_collection(),
                                          query,
                                          fields=field_list):
        return doc['attr_entities']
def get_topics_for_wiki(self, wiki_id, limit=10, offset=0, for_api=False):
    """
    Gets most important topics for this user on this wiki

    :param wiki_id: the wiki id
    :type wiki_id: str
    :param limit: limit
    :type limit: int
    :param offset: offset
    :type offset: int
    :param for_api: if it's for the api, we fix the naming
    :type for_api: bool
    :return: ordered dict of topic name to auth or a list of dicts for api
    :rtype: collections.OrderedDict|list
    """
    # Bug fix: wiki_id was ignored, so the first matching doc from ANY wiki
    # was returned; scope the query to the requested wiki (wiki-user docs
    # carry both wiki_id_i and name_txt_en — see build_wiki_user_doc).
    query = 'wiki_id_i:%s AND name_txt_en:"%s"' % (wiki_id, self.user_name)
    for doc in solr.get_all_docs_by_query(solr.wiki_user_collection(), query):
        return doc['attr_entities']
def run(self): """ Drops all indices """ global_coll = solr.global_collection() print 'global' global_coll.optimize() for doc in solr.get_all_docs_by_query(global_coll, '*:*', fields='id'): print doc['id'] solr.collection_for_wiki(doc['id']).optimize() print 'all pages' solr.all_pages_collection().optimize() print 'all topics' solr.all_topics_collection().optimize() print 'all user pages' solr.all_user_pages_collection().optimize() print 'wiki user' solr.wiki_user_collection().optimize() print 'user' solr.user_collection().optimize()
def get_wiki_topic_doc(wiki_id, topic):
    """
    Create a solr doc for a given topic based on all matching pages for a wiki

    :param wiki_id: the ID of the wiki
    :type wiki_id: str
    :param topic: the topic we're creating a document for
    :type topic: str
    :return: the solr document we want to add
    :rtype: dict
    """
    collection = solr.collection_for_wiki(wiki_id)
    authorities = []
    all_user_id_dict = {}
    all_user_name_dict = {}
    query = 'type_s:Page AND attr_entities:"%s"' % topic
    for doc in solr.get_all_docs_by_query(collection, query):
        # Bug fix: the guard checked 'user_id_is' but the loop read
        # 'user_ids_is', raising KeyError whenever the guard key existed.
        # Assumes page docs use 'user_ids_is' — TODO confirm schema.
        if 'user_ids_is' in doc:
            for user_id in doc['user_ids_is']:
                all_user_id_dict[user_id] = True
        if 'attr_users' in doc:
            for user_name in doc['attr_users']:
                all_user_name_dict[user_name] = True
        if 'authority_f' in doc:
            authorities.append(doc['authority_f'])
    total_authority = sum(authorities)
    # Bug fix: guard against ZeroDivisionError when no page had authority_f.
    if authorities:
        avg_authority = total_authority / float(len(authorities))
    else:
        avg_authority = 0.0
    return {
        'id': '%s_%s' % (wiki_id, topic),
        'wiki_id_i': wiki_id,
        'topic_s': topic,
        'topic_txt_en': topic,
        'type_s': {'set': 'Topic'},
        'user_ids_is': {'set': all_user_id_dict.keys()},
        'user_names_ss': {'set': all_user_name_dict.keys()},
        'total_authority_f': {'set': total_authority},
        'avg_authority_f': {'set': avg_authority}
    }
def analyze_users_globally(): print "Analyzing Users..." user_collection = solr.existing_collection(solr.user_collection()) wiki_user_collection = solr.wiki_user_collection() id_to_docs = dict() for user_doc in solr.get_all_docs_by_query(wiki_user_collection, '*:*'): # these are gonna be wiki-id_user-id doc_id = user_doc['id'].split('_').pop() if doc_id not in id_to_docs: id_to_docs[doc_id] = dict(id=doc_id, attr_entities={'set': []}, name_s={'set': user_doc['name_s']}, name_txt_en={'set': user_doc['name_txt_en']}, wikis_is={'set': []}, attr_wikis={'set': []}, authorities_fs={'set': []}, total_authority_f={'set': 0}, scaled_authority_f={'set': 0}) try: map(id_to_docs[doc_id]['attr_entities']['set'].append, user_doc['attr_entities']) id_to_docs[doc_id]['wikis_is']['set'].append(user_doc['wiki_id_i']) id_to_docs[doc_id]['attr_wikis']['set'].append(user_doc['wiki_name_txt']) id_to_docs[doc_id]['authorities_fs']['set'].append(user_doc['total_page_authority_f']) except KeyError: pass # zero f***s id_to_total_authorities = dict([(uid, sum(doc['authorities_fs']['set'])) for uid, doc in id_to_docs.items()]) user_scaler = MinMaxScaler(id_to_total_authorities.values()) for uid, total_authority in id_to_total_authorities.items(): id_to_docs[uid]['total_authority_f']['set'] = total_authority id_to_docs[uid]['scaled_authority_f']['set'] = user_scaler.scale(total_authority) user_collection.add(id_to_docs.values()) user_collection.commit()