def get_datasets_from_csvs(csv_path_1, record_1, csv_path_2, record_2):
    dataset_1 = rltk.Dataset(reader=rltk.CSVReader(csv_path_1),
                             record_class=record_1,
                             adapter=rltk.MemoryAdapter())
    dataset_2 = rltk.Dataset(reader=rltk.CSVReader(csv_path_2),
                             record_class=record_2,
                             adapter=rltk.MemoryAdapter())

    return dataset_1, dataset_2
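
A minimal usage sketch for this helper; the file names and the PersonRecord class below are hypothetical stand-ins, not part of the original example:

import rltk

class PersonRecord(rltk.Record):
    # hypothetical record class; assumes both CSVs have an 'id' column
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

ds_a, ds_b = get_datasets_from_csvs('a.csv', PersonRecord, 'b.csv', PersonRecord)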
Example #2
def featurize(mode, output_filename=None):
    """
    Catch all method to featurize either train or test dataset and save to CSV

    Params:
        mode: (str) TRAIN or TEST
        output_filename: (str) Optional- name of the csv to save the data
    """
    MODE = mode
    if not os.path.exists('train/') or not os.path.exists('test/'):
        train_test_split()
        
    if not os.path.exists('block_files/'):
        os.mkdir('block_files/')

    BLOCK_FILE = 'block_files/'+MODE+'.jl'
    CORPUS_FREQ_FILE = MODE+'/corpus_freq.json'

    ds_amzn = rltk.Dataset(reader=rltk.CSVReader(open(MODE + '/Amazon.csv', encoding='latin-1')),
                           record_class=AmazonRecord, adapter=rltk.MemoryAdapter())

    ds_goog = rltk.Dataset(reader=rltk.CSVReader(open(MODE + '/GoogleProducts.csv', encoding='latin-1')),
                           record_class=GoogleRecord, adapter=rltk.MemoryAdapter())

    try:
        # reuse blocking results from a previous run if the block file already exists
        block_handler = open(BLOCK_FILE, 'r')
        print("Block file exists. Reading from disk...")
    except FileNotFoundError:
        # otherwise generate candidate blocks with an inverted index over the tokenized records
        block_handler = rltk.InvertedIndexBlockGenerator(
            ds_amzn, ds_goog, writer=rltk.BlockFileWriter(BLOCK_FILE), tokenizer=tokenizer).generate()

    features = ['id1', 'id2', 'price_difference',
                'desc_jaccard', 'desc_tf_idf', 'desc_trigram',
                'manufacturer_jaccard', 'manufacturer_jaro_winkler',
                'manufacturer_levenshtien', 'name_jaccard', 'name_jaro_winkler',
                'name_trigram', 'label']

    pairs = rltk.get_record_pairs(ds_amzn, ds_goog, rltk.BlockFileReader(block_handler))
    freq = get_document_frequency(CORPUS_FREQ_FILE, ds_amzn, ds_goog)

    if MODE == "train":
        print("Featurizing train")
        if not output_filename:
            output_filename = 'train/features_train.csv'
        featurize_all_records(pairs, features, output_filename, freq, TRAIN_DOC_SIZE)
    elif MODE == "test":
        print("Featurizing test")
        if not output_filename:
            output_filename = 'test/features_test.csv'
        featurize_all_records(pairs, features, output_filename, freq, TEST_DOC_SIZE)
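
A sketch of how featurize would be invoked, following its docstring; the output paths shown are the defaults set inside the function:

featurize('train')   # writes train/features_train.csv
featurize('test')    # writes test/features_test.csv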
Example #3
import rltk

# the class statement was truncated in this snippet; judging by the fields read
# here ('ident' and 'name', matching ds2.jl below), these methods belong to Record2
class Record2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['name'].split(' ')[1]


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.MemoryAdapter())

# for r in ds1:
#     print(r.id, r.first_name, r.last_name)
# for r in ds2:
#     print(r.id, r.first_name, r.last_name)

block_writer = rltk.BlockFileWriter('blocks.jl')
# block_writer = rltk.BlockArrayWriter()
# each write presumably records a candidate pair (ds1 record id, ds2 record id);
# note the repeated ('1', 'a') pair
block_writer.write('1', 'a')
block_writer.write('2', 'b')
block_writer.write('2', 'd')
block_writer.write('1', 'a')
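
The featurize example above suggests how such a block file is consumed once written: a rltk.BlockFileReader wraps the file handle and rltk.get_record_pairs yields only the candidate pairs that share a block. A sketch reusing ds1 and ds2 from above, assuming the writes have been flushed to blocks.jl:

pairs = rltk.get_record_pairs(ds1, ds2, rltk.BlockFileReader(open('blocks.jl')))
for r1, r2 in pairs:
    # compare only the blocked candidates, e.g. on first/last name
    print(r1.id, r2.id)
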
def entity_links_stage_4():
    # load Datasets
    ds_issue_location = rltk.Dataset(reader=rltk.JsonLinesReader('ISSUE_LOCATIONS_DICT.jl'),
                                     record_class=LocationRecord, adapter=rltk.MemoryAdapter())
    ds_wikia_location = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_LOCATIONS_DICT.jl'),
                                     record_class=LocationRecord, adapter=rltk.MemoryAdapter())
    # print some entries
    print(ds_issue_location.generate_dataframe().head(5))
    print(ds_wikia_location.generate_dataframe().head(5))
    tot_counter = 0
    for item in ds_issue_location:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_location, False)
        if res_id is not None:
            print('[%03d]: [%s] ---%03.02f%%--- [%s]' % (tot_counter, item.id, res_conf * 100, res_id))
            SIM_LOCATIONS__ISSUE_TO_WIKIA[item.id] = (res_id, res_conf)
    with open('SIM_LOCATIONS__ISSUE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_LOCATIONS__ISSUE_TO_WIKIA: ' + str(len(SIM_LOCATIONS__ISSUE_TO_WIKIA)))
        json.dump(SIM_LOCATIONS__ISSUE_TO_WIKIA, outfile, indent=2)
def entity_links_stage_1():
    # load Datasets
    ds_movie_char = rltk.Dataset(reader=rltk.JsonLinesReader('MOVIE_CHARS_DICT.jl'),
                                 record_class=MovieCharRecord, adapter=rltk.MemoryAdapter())
    ds_wikia_char = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_CHARS_DICT.jl'),
                                 record_class=WikiaCharRecord, adapter=rltk.MemoryAdapter())
    # print some entries
    print(ds_movie_char.generate_dataframe().head(5))
    print(ds_wikia_char.generate_dataframe().head(5))
    tot_counter = 0
    for item in ds_movie_char:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_char)
        if res_id is not None:
            print('[%03d]: [%s] ---%03.02f%%--- [%s]' % (tot_counter, item.id, res_conf * 100, res_id))
            SIM_CHARS__MOVIE_TO_WIKIA[item.id] = (res_id, res_conf)
    with open('SIM_CHARS__MOVIE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_CHARS__MOVIE_TO_WIKIA: ' + str(len(SIM_CHARS__MOVIE_TO_WIKIA)))
        json.dump(SIM_CHARS__MOVIE_TO_WIKIA, outfile, indent=2)
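
match_record_to_ds is not defined in these examples. A plausible sketch of its shape, assuming it scans the candidate dataset for the best-scoring match above a threshold; the .name field, the use_aliases flag name, the similarity metric, and the 0.8 cutoff are all illustrative assumptions, not the original code:

def match_record_to_ds(record, candidate_ds, use_aliases=True, threshold=0.8):
    # illustrative only: score each candidate by name similarity and keep the best;
    # the real helper may compare different fields or use a different metric
    best_id, best_conf = None, 0.0
    for cand in candidate_ds:
        conf = rltk.levenshtein_similarity(record.name, cand.name)
        if conf > best_conf:
            best_id, best_conf = cand.id, conf
    if best_conf >= threshold:
        return best_id, best_conf
    return None, 0.0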