def process_entities(args):
    wikisite = args.wikisite
    if wikisite == "zhwiki":
        title_db, redir_db = 1, 3
    elif wikisite == "enwiki":
        title_db, redir_db = 4, 5
    else:
        raise ValueError("unsupported wikisite: %s" % wikisite)
    title_idx = get_redis_title(title_db)
    redir_idx = get_redis_redir(redir_db)

    logging.info('collect titles for entities...')
    # thousands level
    title_bulk = set()
    for i, entity in enumerate(reader(args.entities)):
        try:
            title = entity['sitelinks'][wikisite]['title']
        except KeyError:
            continue
        title = canonicalize(title, redir_idx)
        title_bulk.add(title)
    logging.info("got %d titles out of %d entities" % (len(title_bulk), i))
    logging.debug('====================')
    logging.debug('required: ' + u','.join(title_bulk).encode('utf-8'))

    title_text = bulk_query_wikipage(title_idx, title_bulk, redir_idx)
    logging.info("found page amount: %d" % len(title_text))
    logging.debug('====================')
    logging.debug('got text: ' + u','.join(title_text.keys()).encode('utf-8'))

    output = open(args.output, 'w')
    for i, entity in enumerate(reader(args.entities)):
        try:
            title = entity['sitelinks'][wikisite]['title']
        except KeyError:
            title = ""
        title = canonicalize(title, redir_idx)
        text = title_text.get(title)
        if text is None:
            if title:
                logging.warning("specified title %s not found" % title.encode('utf-8'))
            text = ""
        else:
            logging.debug("title %s found" % title.encode('utf-8'))
        entity[wikisite] = text
        output.write(json.dumps(entity) + '\n')
        if i % 500 == 0:
            logging.info("%d entities processed" % i)
            output.flush()
    output.close()

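# The Redis-backed helpers used above (get_redis_title / get_redis_redir /
# canonicalize) live elsewhere in this repo. The sketch below only illustrates
# one plausible wiring, assuming redis-py with page titles stored as plain
# string keys; the db layout and key schema here are assumptions, not the
# repo's actual implementation.
def _sketch_get_redis_index(db):
    # hypothetical stand-in for get_redis_title / get_redis_redir
    import redis
    return redis.StrictRedis(host='localhost', port=6379, db=db)


def _sketch_canonicalize(title, redir_idx):
    # hypothetical stand-in for canonicalize: follow one redirect hop if the
    # title is recorded as a redirect source, otherwise keep it unchanged
    target = redir_idx.get(title.encode('utf-8'))
    return target.decode('utf-8') if target else title
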
def process_entities(args):
    logging.info('building title to wiki filename mapping...')
    title_idx = load_idx_mapping(args.wiki_index)
    logging.info('building title redirect mapping...')
    redir_idx = load_redir_mapping(args.redir_file)

    logging.info('collect titles for entities...')
    # thousands level
    wikisite = args.wikisite
    title_bulk = set()
    for i, entity in enumerate(reader(args.entities)):
        try:
            title = entity['sitelinks'][wikisite]['title']
        except KeyError:
            continue
        title = redirect(redir_idx, title)
        title = cc.convert(title)
        title_bulk.add(title)
    logging.info("got %d titles out of %d entities" % (len(title_bulk), i))

    title_text = bulk_query_wikipage(title_idx, title_bulk)
    logging.info("found page amount: %d" % len(title_text))
    logging.debug('====================')
    logging.debug('got text: ' + u','.join(title_text.keys()).encode('utf-8'))
    logging.debug('--------------------')
    logging.debug('required: ' + u','.join(title_bulk).encode('utf-8'))

    output = open(args.output, 'w')
    for i, entity in enumerate(reader(args.entities)):
        try:
            title = entity['sitelinks'][wikisite]['title']
        except KeyError:
            title = ""
        title = redirect(redir_idx, title)
        title = cc.convert(title)
        text = title_text.get(title)
        if text is None:
            if title:
                logging.warning("specified title %s not found" % title.encode('utf-8'))
            text = ""
        else:
            logging.debug("title %s found" % title.encode('utf-8'))
        entity[wikisite] = text
        output.write(json.dumps(entity) + '\n')
        if i % 500 == 0:
            logging.info("%d entities processed" % i)
            output.flush()
    output.close()

def find_neighbor(args):
    filelist = [l.rstrip() for l in args.filelist]
    qids = set(v for _, _, v in next_neighbor_id(reader(args.entities)))
    written_qids = set()
    output = open(args.output, 'w')
    for d in xrange(args.depth):
        logging.info('========> %d-hop neighbors amount: %d' % (d + 1, len(qids)))
        if args.debug:
            logging.debug(u','.join(qids).encode('utf-8'))
            break
        next_qids = set()
        for i, e in enumerate(next_required_entity_from_files(qids, filelist)):
            if e['id'] not in written_qids:
                output.write(json.dumps(e) + '\n')
                written_qids.add(e['id'])
            for v in next_neighbor_of_entity(e):
                next_qids.add(v)
            if i % 1000 == 0:
                logging.info("found %d neighbors so far by reading files, %d have been written"
                             % (i, len(written_qids)))
                output.flush()
        qids = next_qids
    output.close()

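# next_neighbor_id / next_neighbor_of_entity are defined elsewhere in this
# repo. A minimal sketch of the latter, assuming it mirrors the inline logic of
# the Redis-based find_neighbor variant further down (claim_value() plus a
# Q-id pattern check); claim_value and re are assumed to be available:
def _sketch_next_neighbor_of_entity(entity):
    for claims in entity.get('claims', {}).itervalues():
        for c in claims:
            v = claim_value(c)
            if v is not None and re.match(r'^Q\d+$', v):
                yield v
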
def collect_categories(args):
    categories = set()
    for i, entity in enumerate(reader(args.input)):
        if i % 20000 == 0:
            logging.info("%d data (first) processed: %s"
                         % (i, datetime.datetime.now().strftime('%m%d-%H:%M:%S')))
        if args.debug and i / 20000 == 1:
            break
        try:
            subclass_claims = entity['claims']['P279']
            values = filter(None, (claim_value(claim) for claim in subclass_claims))
            # An entity with a non-empty 'subclass_of' property is itself a class.
            # The parent entities indicated by the 'subclass_of' property are all classes.
            # If the class is an 'instance_of' another entity, the other one must also be a class.
            if values:
                categories.add(entity['id'])
                categories.update(values)
                instance_claims = entity['claims']['P31']
                values = filter(None, (claim_value(claim) for claim in instance_claims))
                categories.update(values)
        except KeyError:
            continue
    return categories

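# claim_value() is defined elsewhere in this repo. A plausible sketch, assuming
# it returns the target entity id of a wikibase-entityid claim in the standard
# Wikidata JSON layout and None for anything else:
def _sketch_claim_value(claim):
    try:
        dv = claim['mainsnak']['datavalue']
        if dv['type'] == 'wikibase-entityid':
            return dv['value']['id']
    except KeyError:
        pass
    return None
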
def stat(args):
    for i, entity in enumerate(reader(args.entity_file)):
        if i / 50000 == 1:
            break
        for claims in entity['claims'].itervalues():
            for c in claims:
                # datatype, datavalue.type, datavalue.value
                x = extract_kv(c)
                print entity['id'].encode('utf-8') + '\t' + u'\t'.join(x).encode('utf-8')

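# extract_kv() lives elsewhere; judging by the comment above it returns the
# (datatype, datavalue.type, datavalue.value) triple of a claim. A rough sketch
# under that assumption, for illustration only:
def _sketch_extract_kv(claim):
    snak = claim.get('mainsnak', {})
    dv = snak.get('datavalue', {})
    return (snak.get('datatype', u''),
            dv.get('type', u''),
            unicode(dv.get('value', u'')))
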
def next_required_entity_from_files(qids, filelist):
    i = 0
    for f in filelist:
        logging.info("reading file %s ..." % f)
        for e in reader(f):
            if i % 10000 == 0:
                logging.info('read %d input entities in file' % i)
            i += 1
            qid = e['id']
            if qid in qids:
                yield e

def extract(args):
    categories = collect_categories(args)
    logging.info('got all categories, count %d' % len(categories))
    logging.debug('first 100 categories repr: ' + ','.join(repr(x) for x in list(categories)[:100]))
    output = open(args.output, 'w')
    for i, entity in enumerate(reader(args.input)):
        if i % 20000 == 0:
            logging.info("%d data (first) processed: %s"
                         % (i, datetime.datetime.now().strftime('%m%d-%H:%M:%S')))
        if args.debug and i / 20000 == 1:
            break
        try:
            qid = entity['id']
        except KeyError:
            continue
        if qid not in categories:
            continue
        if 'claims' not in entity:
            continue
        claims = entity['claims']
        subclass_claims = claims['P279'] if 'P279' in claims else []  # subclass_of
        instance_claims = claims['P31'] if 'P31' in claims else []    # instance_of
        if len(subclass_claims) + len(instance_claims) == 0:
            continue
        subclass_values = filter(None, (claim_value(claim) for claim in subclass_claims))
        instance_values = filter(None, (claim_value(claim) for claim in instance_claims))
        new_entity = {}
        new_entity['id'] = entity['id']
        try:
            new_entity['enlabel'] = entity['labels']['en']['value']
        except KeyError:
            new_entity['enlabel'] = ""
        try:
            new_entity['zhlabel'] = entity['labels']['zh']['value']
        except KeyError:
            new_entity['zhlabel'] = ""
        new_entity['pids'] = subclass_values + instance_values
        output.write(json.dumps(new_entity) + "\n")
    output.close()

def main(filename, dbname, colname):
    conn = pymongo.MongoClient()
    db = conn.get_database(dbname)
    col = db.get_collection(colname)
    logging.getLogger().setLevel(logging.INFO)
    for obj in reader(filename):
        try:
            col.insert_one(obj)
            logging.info('inserted id=%s' % str(obj['id']))
        except pymongo.errors.DuplicateKeyError:
            logging.info('duplicated id=%s' % str(obj['id']))
            continue
        except Exception:
            logging.warning('exception when writing obj %s' % str(obj['id']))

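# The DuplicateKeyError branch above only catches repeated Wikidata ids if the
# collection has a unique index on 'id' (otherwise MongoDB only enforces
# uniqueness of its own '_id'). A one-time setup sketch, using the same
# dbname/colname the caller passes to main():
def ensure_unique_id_index(dbname, colname):
    conn = pymongo.MongoClient()
    col = conn.get_database(dbname).get_collection(colname)
    col.create_index([('id', pymongo.ASCENDING)], unique=True)
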
def bulk_query_wikipage(title_idx, titles, redir_idx={}):
    filelist = sorted(set(filter(None, (title_idx.get(t) for t in titles))))
    logging.info('there are %d files to read, given these %d titles' % (len(filelist), len(titles)))
    texts = {}
    # scan all filelist
    for f in filelist:
        logging.info("reading file %s .." % f)
        for page in reader(f):
            try:
                title = page['title']
                title = cc.convert(redirect(redir_idx, title))
                if title in titles:
                    texts[title] = page['text']
            except:
                continue
    return texts

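# Usage sketch for bulk_query_wikipage: title_idx only needs a .get(title)
# that returns the extracted-dump file holding that page, so a plain dict works
# as well as the Redis handle used by the enwiki/zhwiki pipeline above. The
# title and filename below are purely illustrative.
def _example_bulk_query_wikipage():
    title_idx = {u'Earth': 'extracted/AA/wiki_00'}  # hypothetical index entry
    pages = bulk_query_wikipage(title_idx, set([u'Earth']))
    return pages.get(u'Earth', u'')
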
def bulk_query_wikidata(qids, wikidata_idx):
    filelist = sorted(set(mquery_redis_idx(qids, wikidata_idx)))
    logging.info('there are %d files to read for %d entities' % (len(filelist), len(qids)))
    entities = {}
    for f in filelist:
        logging.info("reading file %s ..." % f)
        for i, entity in enumerate(reader(f)):
            try:
                qid = entity['id']
            except KeyError:
                continue
            if qid in qids and qid not in entities:
                entities[qid] = entity
            if i % 10000 == 0:
                logging.info("%d items processed, current %s" % (i, qid))
        logging.info("%d items cumulated after reading file %s" % (len(entities), f))
    return entities

def find_neighbor(args):
    wikidata_idx = get_redis_wikidata()
    es = list(x for x in reader(args.entities))
    logging.info('read %d input entities' % len(es))
    output = open(args.output, 'w')
    for l in xrange(args.depth):
        if len(es) == 0 and l < args.depth:
            logging.info("early breaking at layer %d because of empty entities" % l)
            break
        logging.info('find %d-hop neighbors for %d inputs' % (l, len(es)))
        qids = set(filter(lambda v: v is not None and re.match(r'^Q\d+$', v),
                          (claim_value(c)
                           for e in es
                           for _, cs in e['claims'].iteritems()
                           for c in cs)))
        logging.debug('qids: %s' % u','.join(qids).encode('utf-8'))
        logging.info("there're %d neighbors to read" % len(qids))
        es_d = bulk_query_wikidata(qids, wikidata_idx)
        es = [x for x in es_d.itervalues()]
        for e in es:
            output.write(json.dumps(e) + '\n')
    output.close()

def find_entities_by_kinships(args, kinships):
    logging.info('now filtering entities by kinships...')
    # iterate over the outputs
    output = open(args.output, 'w')
    dataset = reader_for_list(args.input_filelist) if args.input_filelist else reader(args.input)
    for i, entity in enumerate(dataset):
        if 'claims' not in entity:
            continue
        claims = entity['claims']
        subclass_claims = claims['P279'] if 'P279' in claims else []  # subclass_of
        instance_claims = claims['P31'] if 'P31' in claims else []    # instance_of
        if len(subclass_claims) + len(instance_claims) == 0:
            continue
        categories = filter(lambda x: x is not None, (claim_value(claim) for claim in subclass_claims))
        classes = filter(lambda x: x is not None, (claim_value(claim) for claim in instance_claims))
        if any(x in kinships for x in categories + classes):
            output.write(json.dumps(entity) + '\n')
        if i % 20000 == 0:
            logging.info('categories: %s, classes: %s' % (repr(categories), repr(classes)))
            logging.info('%d entities iterated over: %s'
                         % (i, datetime.datetime.now().strftime('%Y%m%d%H%M%S')))
    output.close()

trainer_id = int(sys.argv[1])  # trainer id for each guest

job_path = "fl_job_config"
job = FLRunTimeJob()
job.load_trainer_job(job_path, trainer_id)
job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
# print(job._trainer_send_program)

trainer = FLTrainerFactory().create_fl_trainer(job)
use_cuda = False
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer._current_ep = "127.0.0.1:8192"
trainer.start(place=place)
trainer._logger.setLevel(logging.DEBUG)

g = reader()
if trainer_id > 0:
    for i in range(trainer_id):
        next(g)
data = next(g)
print(data)

output_folder = "fl_model"
step_i = 0
while not trainer.stop():
    step_i += 1
    print("batch %d start train" % step_i)
    trainer.run(feed=data, fetch=[])
    if trainer_id == 0:
        print("start saving model")
        trainer.save_inference_program(output_folder)

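# reader() is defined elsewhere in this example; the skipping loop above
# expects it to yield one feed dict per trainer. A toy sketch of such a
# generator -- the feed names "x" and "y" and the shapes are assumptions for
# illustration only, not the actual job's feed variables:
def _sketch_reader():
    import numpy as np
    while True:
        yield {
            "x": np.random.random((64, 13)).astype('float32'),  # hypothetical feature batch
            "y": np.random.random((64, 1)).astype('float32'),   # hypothetical label batch
        }
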
def dataextractor(filelist):
    for f in open(filelist):
        f = f.rstrip()
        for i, entity in enumerate(reader(f)):
            qid = entity['id']
            yield qid, f, i + 1

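# dataextractor() pairs every entity id with the dump file it lives in, which
# is the kind of qid -> filename mapping the Redis-backed wikidata index used
# by bulk_query_wikidata() relies on. A loading sketch, assuming redis-py; the
# host/port/db and the plain SET-per-qid layout are assumptions, not the
# repo's actual index-building script:
def _sketch_build_redis_index(filelist, db=0):
    import redis
    r = redis.StrictRedis(host='localhost', port=6379, db=db)
    pipe = r.pipeline()
    for n, (qid, fname, _) in enumerate(dataextractor(filelist)):
        pipe.set(qid, fname)
        if n % 10000 == 0:
            pipe.execute()
    pipe.execute()
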