def worker():
    tokenizer = rltk.CrfTokenizer()

    # load Datasets
    ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                           record_class=IMDBRecord,
                           adapter=rltk.MemoryKeyValueAdapter())
    ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                          record_class=AFIRecord,
                          adapter=rltk.MemoryKeyValueAdapter())
    valid_match = []
    for r_imdb in ds_imdb:
        # test this record with AFI records
        optimum = (None, MY_TRESH)
        for r_afi in ds_afi:
            result, confidence = rule_based_method(r_imdb, r_afi)
            if result and confidence > optimum[1]:
                optimum = (r_afi, confidence)

        if optimum[0] is not None:
            r_afi, confidence = optimum
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': r_afi.raw_object['url']
            })
        else:
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': None
            })

    with open(result_file, 'w') as fout:
        fout.write(json.dumps(valid_match, indent=4))

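
# NOTE: worker() above relies on a rule_based_method(r_imdb, r_afi) helper and a
# MY_TRESH threshold that are not shown in this snippet. The sketch below is a
# hypothetical reconstruction only, combining the name/genre similarities defined
# in Example #5 with illustrative weights; it is not the original implementation.
def rule_based_method(r_imdb, r_afi):
    # weighted blend of field similarities (the 0.7/0.3 weights are assumptions)
    score = (0.7 * name_similarity(r_imdb, r_afi)
             + 0.3 * genre_similarity(r_imdb, r_afi))
    # return (is_match, confidence), matching how worker() unpacks the result
    return score > MY_TRESH, score
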
def entity_links_stage_4():
    # load Datasets
    ds_issue_location = rltk.Dataset(reader=rltk.JsonLinesReader('ISSUE_LOCATIONS_DICT.jl'), record_class=LocationRecord, adapter=rltk.MemoryAdapter())
    ds_wikia_location = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_LOCATIONS_DICT.jl'), record_class=LocationRecord, adapter=rltk.MemoryAdapter())
    # print some entries
    print(ds_issue_location.generate_dataframe().head(5))
    print(ds_wikia_location.generate_dataframe().head(5))
    tot_counter = 0
    for item in ds_issue_location:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_location, False)
        if res_id is not None:
            print('[%03d]: [%s] ---%03.02f%%--- [%s]' % (tot_counter, item.id, res_conf * 100, res_id))
            SIM_LOCATIONS__ISSUE_TO_WIKIA[item.id] = (res_id, res_conf)
    with open('SIM_LOCATIONS__ISSUE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_LOCATIONS__ISSUE_TO_WIKIA: ' + str(len(SIM_LOCATIONS__ISSUE_TO_WIKIA)))
        json.dump(SIM_LOCATIONS__ISSUE_TO_WIKIA, outfile, indent=2)

def entity_links_stage_1():
    # load Datasets
    ds_movie_char = rltk.Dataset(reader=rltk.JsonLinesReader('MOVIE_CHARS_DICT.jl'), record_class=MovieCharRecord, adapter=rltk.MemoryAdapter())
    ds_wikia_char = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_CHARS_DICT.jl'), record_class=WikiaCharRecord, adapter=rltk.MemoryAdapter())
    # print some entries
    print(ds_movie_char.generate_dataframe().head(5))
    print(ds_wikia_char.generate_dataframe().head(5))
    tot_counter = 0
    for item in ds_movie_char:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_char)
        if res_id is not None:
            print('[%03d]: [%s] ---%03.02f%%--- [%s]' % (tot_counter, item.id, res_conf * 100, res_id))
            SIM_CHARS__MOVIE_TO_WIKIA[item.id] = (res_id, res_conf)
    with open('SIM_CHARS__MOVIE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_CHARS__MOVIE_TO_WIKIA: ' + str(len(SIM_CHARS__MOVIE_TO_WIKIA)))
        json.dump(SIM_CHARS__MOVIE_TO_WIKIA, outfile, indent=2)
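
# NOTE: entity_links_stage_1/_4 above rely on a match_record_to_ds() helper and
# the SIM_* result dictionaries that are not shown in this snippet. The sketch
# below is a hypothetical reconstruction only; the name_string attribute, the
# strict flag, and the 0.8 threshold are assumptions, not the original code.
def match_record_to_ds(record, dataset, strict=True, threshold=0.8):
    # 'strict' is accepted only to mirror the call sites; its original meaning
    # is unknown. Return the id and confidence of the best candidate above the
    # threshold, or (None, 0.0) if nothing qualifies.
    best_id, best_conf = None, 0.0
    for candidate in dataset:
        conf = rltk.jaro_winkler_similarity(record.name_string,
                                            candidate.name_string)
        if conf >= threshold and conf > best_conf:
            best_id, best_conf = candidate.id, conf
    return best_id, best_conf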
Example #4
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['name'].split(' ')[1]


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.MemoryAdapter())

# for r in ds1:
#     print(r.id, r.first_name, r.last_name)
# for r in ds2:
#     print(r.id, r.first_name, r.last_name)

block_writer = rltk.BlockFileWriter('blocks.jl')
# block_writer = rltk.BlockArrayWriter()
block_writer.write('1', 'a')
block_writer.write('2', 'b')
block_writer.write('2', 'd')
block_writer.write('1', 'a')
block_writer.flush()  # flush / close must be called in order to read later
Example #5
    @rltk.cached_property
    def genre_set(self):
        return set(self.genre_string.split(','))

    @rltk.cached_property
    def year(self):
        match = re.search(r"(\d{4})", self.date_string)
        return match.group(0) if match else ''


imdb_file = 'imdb.jl'
afi_file = 'afi.jl'

ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                       record_class=IMDBRecord,
                       adapter=rltk.MemoryKeyValueAdapter())
ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                      record_class=AFIRecord,
                      adapter=rltk.MemoryKeyValueAdapter())


def name_similarity(r_imdb, r_afi):
    s1 = r_imdb.name_string
    s2 = r_afi.name_string
    return rltk.jaro_winkler_similarity(s1, s2)


def genre_similarity(r_imdb, r_afi):
    s1 = r_imdb.genre_set
Example #6
def create_dataset(input_file: str, rcrd_class: rltk.Record) -> rltk.Dataset:
    ''' Create rltk dataset from a given jl file '''
    assert Path(input_file).suffix == ".jl"
    return rltk.Dataset(reader=rltk.JsonLinesReader(input_file),
                        record_class=rcrd_class,
                        adapter=rltk.MemoryKeyValueAdapter())
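
# Hypothetical usage of the helper above (the file name and record class are
# taken from the IMDB example earlier on this page, not from this snippet):
# ds_imdb = create_dataset('imdb.jl', IMDBRecord)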
Example #7
    @rltk.cached_property
    def first_name(self):
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['name'].split(' ')[1]

    @property
    def full_name(self):
        return self.first_name + ' ' + self.last_name


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1)
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2)

ngram = rltk.NGramTokenizer()

bg = rltk.TokenBlockGenerator()
block1 = bg.block(ds1,
                  function_=lambda r: ngram.basic(r.first_name, 3),
                  block=rltk.Block(
                      rltk.LevelDbKeySetAdapter('block_store',
                                                'b1',
                                                clean=True)))
block2 = bg.block(ds2,
                  function_=lambda r: ngram.basic(r.first_name, 3),
                  block=rltk.Block(
                      rltk.LevelDbKeySetAdapter('block_store',
                                                'b2',
Example #8
        return self.raw_object['product_name']

    @rltk.cached_property
    def brand(self):
        return set(self.raw_object['brand'])

    @rltk.cached_property
    def ingredients(self):
        return set(self.raw_object['ingredients_ids'])


product_file = './output/sephora_skincare_product_ingredient_list.jl'
with open(product_file) as json_products:
    products = [json.loads(line) for line in json_products]

ds_products = rltk.Dataset(reader=rltk.JsonLinesReader(product_file),
                           record_class=Product,
                           adapter=rltk.MemoryKeyValueAdapter())
df_products = ds_products.generate_dataframe()


def name_token_similarity(prod1, prod2):
    '''Dice similarity between the two products' name token sets.'''
    set1 = prod1.name_tokens
    set2 = prod2.name_tokens
    return rltk.dice_similarity(set1, set2)


def name_string_similarity(prod1, prod2):
    # hybrid Jaccard operates on token sets, so compare the tokenized names
    # (the original body referenced undefined r_museum/r_ulan variables)
    s1 = prod1.name_tokens
    s2 = prod2.name_tokens
    return rltk.hybrid_jaccard_similarity(s1, s2, threshold=0.67)
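
# NOTE (hypothetical sketch, not from the original snippet): one way to combine
# the two similarity functions above into a single match decision; the equal
# weights and the 0.75 cut-off are illustrative assumptions only.
def product_match(prod1, prod2, threshold=0.75):
    # weighted blend of token-set and tokenized-name similarity
    score = (0.5 * name_token_similarity(prod1, prod2)
             + 0.5 * name_string_similarity(prod1, prod2))
    return score >= threshold, score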


if __name__ == '__main__':
    ulan_ds_adapter = rltk.RedisKeyValueAdapter('127.0.0.1',
                                                key_prefix='ulan_ds_')
    bg = rltk.TokenBlockGenerator()
    ulan_block = rltk.Block(
        rltk.RedisKeySetAdapter('127.0.0.1', key_prefix='ulan_block_'))

    # pre computing for ulan data
    if rltk.cli.confirm('Regenerate ULAN data caches?', default=False):
        ds_ulan = rltk.Dataset(
            reader=rltk.JsonLinesReader('../../datasets/museum/ulan.json'),
            record_class=RecordULAN,
            adapter=ulan_ds_adapter)
        b_ulan = bg.block(ds_ulan,
                          function_=block_on_name_prefix,
                          block=ulan_block)

    # load ulan
    ds_ulan = rltk.Dataset(adapter=ulan_ds_adapter)
    b_ulan = ulan_block

    # compare against museums' data
    museums = list(
        map(lambda x: os.path.splitext(os.path.basename(x))[0],
            glob.glob('../../datasets/museum/*.json')))
    museums.remove('ulan')