def get_datasets_from_csvs(csv_path_1, record_1, csv_path_2, record_2):
    dataset_1 = rltk.Dataset(reader=rltk.CSVReader(csv_path_1),
                             record_class=record_1,
                             adapter=rltk.MemoryAdapter())
    dataset_2 = rltk.Dataset(reader=rltk.CSVReader(csv_path_2),
                             record_class=record_2,
                             adapter=rltk.MemoryAdapter())
    return dataset_1, dataset_2
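# A minimal usage sketch (not from the original code): the record class and the
# CSV paths below are hypothetical stand-ins for any rltk.Record subclass and
# any pair of input files.
import rltk

class PersonRecord(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

ds_a, ds_b = get_datasets_from_csvs('people_a.csv', PersonRecord,
                                    'people_b.csv', PersonRecord)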
def featurize(mode, output_filename=None):
    """
    Catch-all method to featurize either the train or the test dataset and save it to CSV.

    Params:
        mode: (str) 'train' or 'test'
        output_filename: (str) optional name of the CSV file to save the data to
    """
    MODE = mode
    if not os.path.exists('train/') or not os.path.exists('test/'):
        train_test_split()
    if not os.path.exists('block_files/'):
        os.mkdir('block_files/')
    BLOCK_FILE = 'block_files/' + MODE + '.jl'
    CORPUS_FREQ_FILE = MODE + '/corpus_freq.json'
    ds_amzn = rltk.Dataset(reader=rltk.CSVReader(open(MODE + '/Amazon.csv', encoding='latin-1')),
                           record_class=AmazonRecord,
                           adapter=rltk.MemoryAdapter())
    ds_goog = rltk.Dataset(reader=rltk.CSVReader(open(MODE + '/GoogleProducts.csv', encoding='latin-1')),
                           record_class=GoogleRecord,
                           adapter=rltk.MemoryAdapter())
    try:
        block_handler = open(BLOCK_FILE, 'r')
        print("Block file exists. Reading from disk...")
    except FileNotFoundError:
        block_handler = rltk.InvertedIndexBlockGenerator(
            ds_amzn, ds_goog,
            writer=rltk.BlockFileWriter(BLOCK_FILE),
            tokenizer=tokenizer).generate()
    features = ['id1', 'id2', 'price_difference', 'desc_jaccard', 'desc_tf_idf',
                'desc_trigram', 'manufacturer_jaccard', 'manufacturer_jaro_winkler',
                'manufacturer_levenshtien', 'name_jaccard', 'name_jaro_winkler',
                'name_trigram', 'label']
    pairs = rltk.get_record_pairs(ds_amzn, ds_goog, rltk.BlockFileReader(block_handler))
    freq = get_document_frequency(CORPUS_FREQ_FILE, ds_amzn, ds_goog)
    if MODE == "train":
        print("Featurizing train")
        if not output_filename:
            output_filename = 'train/features_train.csv'
        featurize_all_records(pairs, features, output_filename, freq, TRAIN_DOC_SIZE)
    elif MODE == "test":
        print("Featurizing test")
        if not output_filename:
            output_filename = 'test/features_test.csv'
        featurize_all_records(pairs, features, output_filename, freq, TEST_DOC_SIZE)
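# A hypothetical driver for featurize(); assumes the helpers it references
# (train_test_split, get_document_frequency, featurize_all_records, tokenizer,
# TRAIN_DOC_SIZE, TEST_DOC_SIZE, AmazonRecord, GoogleRecord) are defined
# elsewhere in this module.
if __name__ == '__main__':
    featurize('train')  # defaults to train/features_train.csv
    featurize('test')   # defaults to test/features_test.csv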
# Record properties (these live inside an rltk.Record subclass such as Record1):
@rltk.cached_property
def id(self):
    return self.raw_object['ident']

@rltk.cached_property
def first_name(self):
    return self.raw_object['name'].split(' ')[0]

@rltk.cached_property
def last_name(self):
    return self.raw_object['name'].split(' ')[1]


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.MemoryAdapter())

# for r in ds1:
#     print(r.id, r.first_name, r.last_name)
# for r in ds2:
#     print(r.id, r.first_name, r.last_name)

block_writer = rltk.BlockFileWriter('blocks.jl')
# block_writer = rltk.BlockArrayWriter()
block_writer.write('1', 'a')
block_writer.write('2', 'b')
block_writer.write('2', 'd')
block_writer.write('1', 'a')  # duplicate of the first pair
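# A small sketch (not from the original snippet): read the blocks back and
# iterate candidate record pairs, mirroring the BlockFileReader/get_record_pairs
# usage in featurize() above. Assumes blocks.jl has been fully written out.
block_reader = rltk.BlockFileReader(open('blocks.jl'))
for r1, r2 in rltk.get_record_pairs(ds1, ds2, block_reader):
    print(r1.id, r2.id)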
def entity_links_stage_4():
    # load datasets
    ds_issue_location = rltk.Dataset(reader=rltk.JsonLinesReader('ISSUE_LOCATIONS_DICT.jl'),
                                     record_class=LocationRecord,
                                     adapter=rltk.MemoryAdapter())
    ds_wikia_location = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_LOCATIONS_DICT.jl'),
                                     record_class=LocationRecord,
                                     adapter=rltk.MemoryAdapter())

    # print some entries
    print(ds_issue_location.generate_dataframe().head(5))
    print(ds_wikia_location.generate_dataframe().head(5))

    tot_counter = 0
    for item in ds_issue_location:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_location, False)
        if res_id is not None:
            print('[%03d]: [%s] ---%03.02f%%--- [%s]' % (tot_counter, item.id, res_conf * 100, res_id))
            SIM_LOCATIONS__ISSUE_TO_WIKIA[item.id] = (res_id, res_conf)

    with open('SIM_LOCATIONS__ISSUE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_LOCATIONS__ISSUE_TO_WIKIA: ' + str(len(SIM_LOCATIONS__ISSUE_TO_WIKIA)))
        json.dump(SIM_LOCATIONS__ISSUE_TO_WIKIA, outfile, indent=2)
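# Sketch: reload the saved linkage map. json.dump serializes the (res_id,
# res_conf) tuples as two-element JSON arrays, so values come back as lists.
import json

with open('SIM_LOCATIONS__ISSUE_TO_WIKIA.json') as infile:
    location_links = json.load(infile)  # {issue_location_id: [wikia_id, confidence]}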
def entity_links_stage_1():
    # load datasets
    ds_movie_char = rltk.Dataset(reader=rltk.JsonLinesReader('MOVIE_CHARS_DICT.jl'),
                                 record_class=MovieCharRecord,
                                 adapter=rltk.MemoryAdapter())
    ds_wikia_char = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_CHARS_DICT.jl'),
                                 record_class=WikiaCharRecord,
                                 adapter=rltk.MemoryAdapter())

    # print some entries
    print(ds_movie_char.generate_dataframe().head(5))
    print(ds_wikia_char.generate_dataframe().head(5))

    tot_counter = 0
    for item in ds_movie_char:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_char)
        if res_id is not None:
            print('[%03d]: [%s] ---%03.02f%%--- [%s]' % (tot_counter, item.id, res_conf * 100, res_id))
            SIM_CHARS__MOVIE_TO_WIKIA[item.id] = (res_id, res_conf)

    with open('SIM_CHARS__MOVIE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_CHARS__MOVIE_TO_WIKIA: ' + str(len(SIM_CHARS__MOVIE_TO_WIKIA)))
        json.dump(SIM_CHARS__MOVIE_TO_WIKIA, outfile, indent=2)
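# match_record_to_ds is not defined in these snippets. A plausible, purely
# illustrative reconstruction, assuming records expose a 'name' property, that
# the third argument toggles the similarity metric (as passed in
# entity_links_stage_4), and that 0.8 is an acceptable match threshold:
def match_record_to_ds(record, dataset, use_jaro_winkler=True, threshold=0.8):
    best_id, best_conf = None, 0.0
    for candidate in dataset:
        if use_jaro_winkler:
            score = rltk.jaro_winkler_similarity(record.name, candidate.name)
        else:
            score = rltk.levenshtein_similarity(record.name, candidate.name)
        if score > best_conf:
            best_id, best_conf = candidate.id, score
    if best_conf >= threshold:
        return best_id, best_conf
    return None, best_conf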