def insert_entities(args): try: use_caching(is_read_only=True, shouldnt_compute=True) db, cursor = get_db_and_cursor(args) wpe = WikiPageToEntitiesService().get_value(args.wid) if not wpe: print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid return False print u"Priming entity data on", args.wid for page, entity_data in wpe.items(): entity_list = map( my_escape, list( set( entity_data.get(u'redirects', {}).values() + entity_data.get(u'titles')))) for i in range(0, len(entity_list), 50): cursor.execute(u""" INSERT IGNORE INTO topics (name) VALUES ("%s") """ % u'"), ("'.join(entity_list[i:i + 50])) db.commit() return args except Exception as e: print e, traceback.format_exc() return False
def get_data(wid):
    """
    Build the combined term list for every document of a wiki.

    For each doc id present in the page-heads mapping, the result holds the
    preprocessed entity titles, redirect keys, redirect values and unique
    heads, followed by whatever the S3 feature file
    ``feature-data/page-<wid>.json`` lists for that doc id (appended without
    preprocessing).

    :param wid: wiki id; passed to both services and used in the S3 key name
    :return: ``(doc_id, terms)`` pairs, as produced by ``dict.items()``
    """
    log(wid)
    use_caching(shouldnt_compute=True)

    # TODO: this should come from CombinedEntitiesService.
    heads_by_doc = WikiToPageHeadsService().get_value(wid, {})
    entities_by_doc = WikiPageToEntitiesService().get_value(wid, {})

    if heads_by_doc == {}:
        log(wid, "no heads")
    if entities_by_doc == {}:
        log(wid, "no entities")

    s3_key = bucket.get_key('feature-data/page-%s.json' % wid)
    s3_features = json.loads(s3_key.get_contents_as_string())

    combined = {}
    for doc_id in heads_by_doc:
        page_entities = entities_by_doc.get(
            doc_id, {'titles': [], 'redirects': {}})
        redirects = page_entities['redirects']
        unique_heads = list(set(heads_by_doc.get(doc_id, [])))

        raw_terms = (page_entities['titles']
                     + redirects.keys()
                     + redirects.values()
                     + unique_heads)
        processed = map(preprocess, raw_terms)
        # The S3-provided features are appended as they are.
        combined[doc_id] = processed + s3_features.get(doc_id, [])

    return combined.items()
def get_data_wid(wid): print wid use_caching(shouldnt_compute=True) #should be CombinedEntitiesService yo doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {}) doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {}) doc_ids_combined = {} if doc_ids_to_heads == {}: print wid, "no heads" if doc_ids_to_entities == {}: print wid, "no entities" for doc_id in doc_ids_to_heads: entity_response = doc_ids_to_entities.get(doc_id, { 'titles': [], 'redirects': {} }) doc_ids_combined[doc_id] = map( preprocess, entity_response['titles'] + entity_response['redirects'].keys() + entity_response['redirects'].values() + list(set(doc_ids_to_heads.get(doc_id, [])))) return doc_ids_combined.items()
def insert_contrib_data(args): try: use_caching(is_read_only=True, shouldnt_compute=True) db, cursor = get_db_and_cursor(args) wpe = WikiPageToEntitiesService().get_value(args.wid) if not wpe: print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid return False authority_dict_fixed = get_authority_dict_fixed(args) if not authority_dict_fixed: return False print u"Inserting page and author and contrib data for wiki", args.wid for doc_id in authority_dict_fixed: wiki_id, article_id = doc_id.split(u'_') entity_data = wpe.get(doc_id, {}) entity_list = filter( lambda x: x, map( lambda x: x.strip(), map( my_escape, list( set( entity_data.get(u'redirects', {}).values() + entity_data.get(u'titles', [])))))) cursor.execute(u""" SELECT topic_id FROM topics WHERE name IN ("%s") """ % (u'", "'.join(entity_list))) topic_ids = list(set([result[0] for result in cursor.fetchall()])) for topic_id in topic_ids: sql = u""" INSERT IGNORE INTO articles_topics (article_id, wiki_id, topic_id) VALUES (%s, %s, %s) """ % (article_id, wiki_id, topic_id) cursor.execute(sql) db.commit() cursor = db.cursor() for contribs in PageAuthorityService().get_value(doc_id, []): cursor.execute(u""" INSERT IGNORE INTO users (user_id, user_name) VALUES (%d, "%s") """ % (contribs[u'userid'], my_escape(contribs[u'user']))) db.commit() cursor.execute(u""" INSERT INTO articles_users (article_id, wiki_id, user_id, contribs) VALUES (%s, %s, %d, %s) """ % (article_id, wiki_id, contribs[u'userid'], contribs[u'contribs'])) db.commit() local_authority = contribs[ u'contribs'] * authority_dict_fixed.get(doc_id, 0) for topic_id in topic_ids: cursor.execute(u""" INSERT INTO topics_users (user_id, topic_id, local_authority) VALUES (%d, %s, %s) ON DUPLICATE KEY UPDATE local_authority = local_authority + %s """ % (contribs[u'userid'], topic_id, local_authority, local_authority)) db.commit() db.commit() print u"Done with", args.wid return args except Exception as e: print e, traceback.format_exc() return False
def entities(wid):
    """
    Fetch the page-to-entities data for a wiki.

    :param wid: wiki id passed to ``WikiPageToEntitiesService.get_value``
    :return: whatever the service returns for ``wid``; ``{}`` is supplied as
        the second argument (presumably the fallback when nothing is stored)
    """
    service = WikiPageToEntitiesService()
    page_entities = service.get_value(wid, {})
    return page_entities