def worker():
    tokenizer = rltk.CrfTokenizer()

    # load datasets
    ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                           record_class=IMDBRecord,
                           adapter=rltk.MemoryKeyValueAdapter())
    ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                          record_class=AFIRecord,
                          adapter=rltk.MemoryKeyValueAdapter())

    valid_match = []
    for r_imdb in ds_imdb:
        # test this record against every AFI record, keeping the best match
        optimum = (None, MY_TRESH)
        for r_afi in ds_afi:
            result, confidence = rule_based_method(r_imdb, r_afi)
            if result and confidence > optimum[1]:
                optimum = (r_afi, confidence)
        if optimum[0] is not None:
            r_afi, confidence = optimum
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': r_afi.raw_object['url']
            })
        else:
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': None
            })

    with open(result_file, 'w') as fout:
        fout.write(json.dumps(valid_match, indent=4))
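# worker() assumes a rule_based_method(r_imdb, r_afi) helper returning a
# (matched, confidence) pair; its real implementation is not shown in this
# section. A minimal sketch, assuming the name_string and year properties
# defined on the record classes below (the year filter and scoring rule are
# illustrative, not from the source):
def rule_based_method(r_imdb, r_afi):
    # reject pairs whose release years are both known but disagree
    if r_imdb.year and r_afi.year and r_imdb.year != r_afi.year:
        return False, 0.0
    # otherwise score on title similarity
    confidence = rltk.jaro_winkler_similarity(r_imdb.name_string, r_afi.name_string)
    return confidence > MY_TRESH, confidence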
def entity_links_stage_4():
    # load datasets
    ds_issue_location = rltk.Dataset(reader=rltk.JsonLinesReader('ISSUE_LOCATIONS_DICT.jl'),
                                     record_class=LocationRecord,
                                     adapter=rltk.MemoryAdapter())
    ds_wikia_location = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_LOCATIONS_DICT.jl'),
                                     record_class=LocationRecord,
                                     adapter=rltk.MemoryAdapter())

    # print some entries
    print(ds_issue_location.generate_dataframe().head(5))
    print(ds_wikia_location.generate_dataframe().head(5))

    tot_counter = 0
    for item in ds_issue_location:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_location, False)
        if res_id is not None:
            print('[%003d]: [%s] ---%03.02f%%--- [%s]'
                  % (tot_counter, item.id, res_conf * 100, res_id))
            SIM_LOCATIONS__ISSUE_TO_WIKIA[item.id] = (res_id, res_conf)

    with open('SIM_LOCATIONS__ISSUE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_LOCATIONS__ISSUE_TO_WIKIA: ' + str(len(SIM_LOCATIONS__ISSUE_TO_WIKIA)))
        json.dump(SIM_LOCATIONS__ISSUE_TO_WIKIA, outfile, indent=2)
def entity_links_stage_1():
    # load datasets
    ds_movie_char = rltk.Dataset(reader=rltk.JsonLinesReader('MOVIE_CHARS_DICT.jl'),
                                 record_class=MovieCharRecord,
                                 adapter=rltk.MemoryAdapter())
    ds_wikia_char = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_CHARS_DICT.jl'),
                                 record_class=WikiaCharRecord,
                                 adapter=rltk.MemoryAdapter())

    # print some entries
    print(ds_movie_char.generate_dataframe().head(5))
    print(ds_wikia_char.generate_dataframe().head(5))

    tot_counter = 0
    for item in ds_movie_char:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_char)
        if res_id is not None:
            print('[%003d]: [%s] ---%03.02f%%--- [%s]'
                  % (tot_counter, item.id, res_conf * 100, res_id))
            SIM_CHARS__MOVIE_TO_WIKIA[item.id] = (res_id, res_conf)

    with open('SIM_CHARS__MOVIE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_CHARS__MOVIE_TO_WIKIA: ' + str(len(SIM_CHARS__MOVIE_TO_WIKIA)))
        json.dump(SIM_CHARS__MOVIE_TO_WIKIA, outfile, indent=2)
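# Both stage functions above rely on a match_record_to_ds(record, dataset, ...)
# helper that returns the best-matching record id and its confidence, or None
# when nothing clears the threshold. That helper is not shown in this section;
# a minimal sketch, assuming a name_string property on the records and an
# illustrative 0.8 threshold (the strict flag mirrors the third positional
# argument passed in stage 4 and is unused in this sketch):
def match_record_to_ds(record, target_ds, strict=True, threshold=0.8):
    best_id, best_conf = None, threshold
    for candidate in target_ds:
        conf = rltk.jaro_winkler_similarity(record.name_string, candidate.name_string)
        if conf > best_conf:
            best_id, best_conf = candidate.id, conf
    return best_id, best_conf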
def id(self):
    return self.raw_object['ident']

@rltk.cached_property
def first_name(self):
    return self.raw_object['name'].split(' ')[0]

@rltk.cached_property
def last_name(self):
    return self.raw_object['name'].split(' ')[1]

ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1, adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2, adapter=rltk.MemoryAdapter())

# for r in ds1:
#     print(r.id, r.first_name, r.last_name)
# for r in ds2:
#     print(r.id, r.first_name, r.last_name)

block_writer = rltk.BlockFileWriter('blocks.jl')
# block_writer = rltk.BlockArrayWriter()
block_writer.write('1', 'a')
block_writer.write('2', 'b')
block_writer.write('2', 'd')
block_writer.write('1', 'a')
block_writer.flush()  # flush / close must be called before the blocks can be read later
@rltk.cached_property
def genre_set(self):
    return set(self.genre_string.split(','))

@rltk.cached_property
def year(self):
    match = re.search(r'(\d{4})', self.date_string)
    if match:
        return str(match.group(0))
    else:
        return ''

imdb_file = 'imdb.jl'
afi_file = 'afi.jl'

ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                       record_class=IMDBRecord,
                       adapter=rltk.MemoryKeyValueAdapter())
ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                      record_class=AFIRecord,
                      adapter=rltk.MemoryKeyValueAdapter())

def name_similarity(r_imdb, r_afi):
    s1 = r_imdb.name_string
    s2 = r_afi.name_string
    return rltk.jaro_winkler_similarity(s1, s2)

def genre_similarity(r_imdb, r_afi):
    s1 = r_imdb.genre_set
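    # the original snippet is truncated at this point; a plausible completion,
    # assuming a Jaccard comparison over the two genre sets (the metric choice
    # is an assumption, not from the source):
    s2 = r_afi.genre_set
    return rltk.jaccard_index_similarity(s1, s2)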
def create_dataset(input_file: str, rcrd_class: rltk.Record) -> rltk.Dataset:
    '''Create an rltk dataset from a given .jl (JSON Lines) file.'''
    assert Path(input_file).suffix == '.jl'
    return rltk.Dataset(reader=rltk.JsonLinesReader(input_file),
                        record_class=rcrd_class,
                        adapter=rltk.MemoryKeyValueAdapter())
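# usage sketch for the helper above, reusing the IMDB record class from the
# earlier snippet (the file name is illustrative):
ds_imdb = create_dataset('imdb.jl', IMDBRecord)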
@rltk.cached_property
def first_name(self):
    return self.raw_object['name'].split(' ')[0]

@rltk.cached_property
def last_name(self):
    return self.raw_object['name'].split(' ')[1]

@property
def full_name(self):
    return self.first_name + ' ' + self.last_name

ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','), record_class=Record1)
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2)

ngram = rltk.NGramTokenizer()
bg = rltk.TokenBlockGenerator()
# block each dataset on character 3-grams of the first name, persisted in LevelDB
block1 = bg.block(ds1, function_=lambda r: ngram.basic(r.first_name, 3),
                  block=rltk.Block(
                      rltk.LevelDbKeySetAdapter('block_store', 'b1', clean=True)))
block2 = bg.block(ds2, function_=lambda r: ngram.basic(r.first_name, 3),
                  block=rltk.Block(
                      rltk.LevelDbKeySetAdapter('block_store', 'b2', clean=True)))
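# to consume the blocks, the two per-dataset blocks can be merged and used to
# restrict candidate pairs; a minimal sketch using rltk's pairing helper:
block = bg.generate(block1, block2)
for r1, r2 in rltk.get_record_pairs(ds1, ds2, block=block):
    print(r1.full_name, '<->', r2.full_name)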
    return self.raw_object['product_name']

@rltk.cached_property
def brand(self):
    return set(self.raw_object['brand'])

@rltk.cached_property
def ingredients(self):
    return set(self.raw_object['ingredients_ids'])

product_file = './output/sephora_skincare_product_ingredient_list.jl'
with open(product_file) as json_products:
    products = [json.loads(line) for line in json_products]

ds_products = rltk.Dataset(reader=rltk.JsonLinesReader(product_file),
                           record_class=Product,
                           adapter=rltk.MemoryKeyValueAdapter())
df_products = ds_products.generate_dataframe()

def name_token_similarity(prod1, prod2):
    '''Dice similarity between the two products' name-token sets.'''
    set1 = prod1.name_tokens
    set2 = prod2.name_tokens
    return rltk.dice_similarity(set1, set2)

def name_string_similarity(prod1, prod2):
    s1 = prod1.name_string
    s2 = prod2.name_string
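    # the original snippet is truncated at this point; a plausible completion,
    # assuming an edit-distance comparison on the full name strings (the metric
    # choice is an assumption, not from the source):
    return rltk.levenshtein_similarity(s1, s2)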
    return rltk.hybrid_jaccard_similarity(r_museum.name_tokens, r_ulan.name_tokens,
                                          threshold=0.67)

if __name__ == '__main__':
    ulan_ds_adapter = rltk.RedisKeyValueAdapter('127.0.0.1', key_prefix='ulan_ds_')
    bg = rltk.TokenBlockGenerator()
    ulan_block = rltk.Block(
        rltk.RedisKeySetAdapter('127.0.0.1', key_prefix='ulan_block_'))

    # pre-compute caches for the ULAN data
    if rltk.cli.confirm('Regenerate ULAN data caches?', default=False):
        ds_ulan = rltk.Dataset(
            reader=rltk.JsonLinesReader('../../datasets/museum/ulan.json'),
            record_class=RecordULAN,
            adapter=ulan_ds_adapter)
        b_ulan = bg.block(ds_ulan, function_=block_on_name_prefix, block=ulan_block)

    # load ULAN from the cached adapter
    ds_ulan = rltk.Dataset(adapter=ulan_ds_adapter)
    b_ulan = ulan_block

    # compare against the museums' data
    museums = list(
        map(lambda x: os.path.splitext(os.path.basename(x))[0],
            glob.glob('../../datasets/museum/*.json')))
    museums.remove('ulan')
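    # the per-museum matching loop is not shown in this section; a minimal
    # sketch, assuming a hypothetical RecordMuseum class, the
    # block_on_name_prefix function used above, and file paths mirroring the
    # glob above:
    for museum in museums:
        ds_museum = rltk.Dataset(
            reader=rltk.JsonLinesReader('../../datasets/museum/%s.json' % museum),
            record_class=RecordMuseum,
            adapter=rltk.MemoryKeyValueAdapter())
        b_museum = bg.block(ds_museum, function_=block_on_name_prefix)
        block = bg.generate(b_museum, b_ulan)
        for r_museum, r_ulan in rltk.get_record_pairs(ds_museum, ds_ulan, block=block):
            print(museum, r_museum.id, r_ulan.id)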