def get_datasets_from_csvs(csv_path_1, record_1, csv_path_2, record_2):
    """Load two CSV files into in-memory rltk Datasets.

    Params:
        csv_path_1: path to the first CSV file
        record_1:   rltk.Record subclass used to wrap rows of the first file
        csv_path_2: path to the second CSV file
        record_2:   rltk.Record subclass used to wrap rows of the second file

    Returns:
        Tuple of two rltk.Dataset objects, one per input CSV.
    """
    def _load(path, record_cls):
        # Both datasets are built identically: CSV reader + in-memory adapter.
        return rltk.Dataset(reader=rltk.CSVReader(path),
                            record_class=record_cls,
                            adapter=rltk.MemoryAdapter())

    return _load(csv_path_1, record_1), _load(csv_path_2, record_2)
def featurize(mode, output_filename=None):
    """Featurize either the train or the test dataset and save it to CSV.

    Params:
        mode: (str) 'train' or 'test' (case-insensitive)
        output_filename: (str) Optional - name of the csv to save the data

    Raises:
        ValueError: if mode is neither 'train' nor 'test'.
    """
    # Fix: the old code kept the mode verbatim and compared it against the
    # lowercase literals, so a caller passing 'TRAIN'/'TEST' (as the old
    # docstring suggested) silently produced nothing. Normalize and validate.
    MODE = mode.lower()
    if MODE not in ('train', 'test'):
        raise ValueError("mode must be 'train' or 'test', got %r" % mode)

    # Build the train/test split and the block-file directory on first use.
    if not os.path.exists('train/') or not os.path.exists('test/'):
        train_test_split()
    if not os.path.exists('block_files/'):
        os.mkdir('block_files/')

    BLOCK_FILE = 'block_files/' + MODE + '.jl'
    CORPUS_FREQ_FILE = MODE + '/corpus_freq.json'

    # NOTE(review): these file handles are handed to CSVReader and never
    # closed explicitly -- presumably rltk consumes them into the
    # MemoryAdapter; confirm before refactoring into `with` blocks.
    ds_amzn = rltk.Dataset(
        reader=rltk.CSVReader(open(MODE + '/Amazon.csv', encoding='latin-1')),
        record_class=AmazonRecord,
        adapter=rltk.MemoryAdapter())
    ds_goog = rltk.Dataset(
        reader=rltk.CSVReader(open(MODE + '/GoogleProducts.csv',
                                   encoding='latin-1')),
        record_class=GoogleRecord,
        adapter=rltk.MemoryAdapter())

    # Reuse a previously generated block file when present (EAFP); otherwise
    # generate the inverted-index blocks and persist them for next time.
    try:
        block_handler = open(BLOCK_FILE, 'r')
        print("Block file exists. Reading from disk...")
    except FileNotFoundError:
        block_handler = rltk.InvertedIndexBlockGenerator(
            ds_amzn, ds_goog,
            writer=rltk.BlockFileWriter(BLOCK_FILE),
            tokenizer=tokenizer).generate()

    features = ['id1', 'id2', 'price_difference', 'desc_jaccard',
                'desc_tf_idf', 'desc_trigram', 'manufacturer_jaccard',
                'manufacturer_jaro_winkler', 'manufacturer_levenshtien',
                'name_jaccard', 'name_jaro_winkler', 'name_trigram', 'label']

    pairs = rltk.get_record_pairs(ds_amzn, ds_goog,
                                  rltk.BlockFileReader(block_handler))
    freq = get_document_frequency(CORPUS_FREQ_FILE, ds_amzn, ds_goog)

    if MODE == "train":
        print("Featurizing train")
        if not output_filename:
            output_filename = 'train/features_train.csv'
        featurize_all_records(pairs, features, output_filename, freq,
                              TRAIN_DOC_SIZE)
    else:  # MODE == "test", guaranteed by the validation above
        print("Featurizing test")
        if not output_filename:
            output_filename = 'test/features_test.csv'
        featurize_all_records(pairs, features, output_filename, freq,
                              TEST_DOC_SIZE)
# Demo: a JSON-lines-backed record class plus a small blocking script.
# @rltk.remove_raw_object: presumably frees raw_object once the cached
# properties have been computed -- see rltk docs to confirm.
@rltk.remove_raw_object
class Record2(rltk.Record):
    @rltk.cached_property
    def id(self):
        # Primary key comes from the 'ident' field of each JSON line.
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        # First whitespace-separated token of the full name.
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        # Second token. NOTE(review): raises IndexError when the name has
        # no space (single-token names).
        return self.raw_object['name'].split(' ')[1]


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1, adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2, adapter=rltk.MemoryAdapter())

# for r in ds1:
#     print(r.id, r.first_name, r.last_name)
# for r in ds2:
#     print(r.id, r.first_name, r.last_name)

# Write two hand-picked block entries to blocks.jl (file-backed writer;
# the in-memory alternative is left commented out).
block_writer = rltk.BlockFileWriter('blocks.jl')
# block_writer = rltk.BlockArrayWriter()
block_writer.write('1', 'a')
block_writer.write('2', 'b')
    # NOTE(review): chunk starts mid-class -- these two methods belong to a
    # record class whose header (likely EvaluationRecord) is outside this
    # view; any decorator on name() is also cut off. Confirm against the
    # full file.
    def name(self):
        return self.raw_object['name']

    @rltk.cached_property
    def laptop(self):
        # First dataset stores the brand under the 'laptop_brand' column.
        return self.raw_object['laptop_brand']


@rltk.remove_raw_object
class EvaluationRecord2(rltk.Record):
    """Record wrapper for rows of data_2.csv (id / name / laptop columns)."""

    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def name(self):
        return self.raw_object['name']

    @rltk.cached_property
    def laptop(self):
        # Second dataset stores the brand under the plain 'laptop' column.
        return self.raw_object['laptop']


dataset_1_file_name = 'data_1.csv'
dataset_2_file_name = 'data_2.csv'

# No adapter argument is passed here (unlike the other chunks in this file),
# so these Datasets use rltk's default adapter.
ds1 = rltk.Dataset(reader=rltk.CSVReader(dataset_1_file_name),
                   record_class=EvaluationRecord)
ds2 = rltk.Dataset(reader=rltk.CSVReader(dataset_2_file_name),
                   record_class=EvaluationRecord2)
import rltk
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import numpy as np

from featurize import featurize, get_document_frequency, featurize_record_pair, TRAIN_DOC_SIZE
from utils import impute_df, DATASET_DIR
from amazon_record import AmazonRecord
from google_record import GoogleRecord

# Load both product catalogues fully into memory at import time.
# encoding='latin-1' -- presumably required by the raw CSV files; confirm.
ds_amzn = rltk.Dataset(reader=rltk.CSVReader(
    open(DATASET_DIR + 'Amazon.csv', encoding='latin-1')),
    record_class=AmazonRecord,
    adapter=rltk.MemoryAdapter())
ds_goog = rltk.Dataset(reader=rltk.CSVReader(
    open(DATASET_DIR + 'GoogleProducts.csv', encoding='latin-1')),
    record_class=GoogleRecord,
    adapter=rltk.MemoryAdapter())


# NOTE(review): this function (and its docstring) is truncated at the chunk
# boundary -- the closing quotes and the body continue outside this view.
def generate_features(gt_train):
    """
    Generate features from stratified ground truth DataFrames
    Params:
        gt_train: (DataFrame) Df containing stratified training data ids and labels
        # NOTE(review): chunk starts mid-method -- the 'def' for this return
        # (an address property, judging by the variable name) is outside
        # this view.
        return address.strip().lower()

    @rltk.cached_property
    def phone(self):
        # Normalize phone formatting: unify '/' separators to '-', drop
        # spaces, then cap the result at 15 characters.
        phone = self.raw_object['Phone'].replace('/', '-').replace(
            ' ', '')  #.replace('and','or').split('or')
        # print(phone.strip()[:15])
        return phone.strip()[:15]

    @rltk.cached_property
    def cuisine(self):
        # Empty string instead of a falsy/missing value so downstream
        # string operations are safe.
        cs = self.raw_object['Cuisine']
        return cs if cs else ''


ds_fod = rltk.Dataset(rltk.CSVReader(file_F),
                      record_class=DBFod,
                      adapter=rltk.MemoryKeyValueAdapter())
# dFod = [[k+1,dblp.id,dblp.cuisine,dblp.address] for k,dblp in enumerate(ds_fod)]
# print(dFod[506])
# for r_dblp in ds_fod:
#     print(r_dblp.name)

tokenizer = rltk.CrfTokenizer()

# Global counter bumped on every tokenize_id call (debug/progress aid).
i = 0


# NOTE(review): truncated at the chunk boundary -- the rest of the body
# (presumably returning `tokens`) is outside this view.
def tokenize_id(t):
    tokens = tokenizer.tokenize(t)
    global i
    i += 1
    # NOTE(review): chunk starts mid-class -- this method belongs to Record1
    # (used below), whose header is outside this view.
    def parent_id(self):
        # Toy parent link: record '1' points at record '4'; others have none.
        return '4' if self.id == '1' else None


class Record2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def value(self):
        # First element of the optional 'values' list, or the literal
        # string 'empty' when absent/empty.
        v = self.raw_object.get('values', list())
        return v[0] if len(v) > 0 else 'empty'


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'),
                   record_class=Record1, adapter=rltk.MemoryAdapter())
# DBMAdapter persists the record index in the file 'file_index'.
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2, adapter=rltk.DBMAdapter('file_index'))

# Pairs over ds1 x ds2; no block reader is passed here (contrast with the
# blocked variant elsewhere in this file).
pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in pairs:
    print('-------------')
    print(r1.id, r1.value, '\t', r2.id, r2.value)
    if r1.parent_id:
        # Resolve the parent record through the dataset by id.
        print('r1\'s parent', r1.parent_id, ds1.get_record(r1.parent_id).value)
    print('levenshtein_distance:',
          rltk.levenshtein_distance(r1.value, r2.value))
    print('levenshtein_similarity:',
          # NOTE(review): this call is truncated at the chunk boundary;
          # its arguments continue outside this view.
        # NOTE(review): chunk starts mid-method -- this is the tail of a
        # string-cleaning helper (likely BuyRecord._clean, called below)
        # whose 'def' is outside this view. It strips '-', '/', '&' and
        # lowercases.
        return s.lower().replace('-', '').replace('/', '').replace('&', '')

    @rltk.cached_property
    def brand_cleaned(self):
        # NOTE(review): name_tokens is evaluated but unused -- presumably to
        # force its cached computation before the brand is resolved; confirm.
        _ = self.name_tokens
        manufacturer = self.manufacturer
        # Prefer the manufacturer field; fall back to brand when it is empty,
        # then canonicalize through the brand-alias table.
        return process_brand_alias(
            manufacturer if manufacturer != '' else self.brand)

    @rltk.cached_property
    def model_cleaned(self):
        m = self.model
        return BuyRecord._clean(m)


ds_abt = rltk.Dataset(reader=rltk.CSVReader(
    open('../../datasets/Abt-Buy/Abt.csv', encoding='latin-1')),
    record_class=AbtRecord,
    adapter=rltk.MemoryKeyValueAdapter())
ds_buy = rltk.Dataset(reader=rltk.CSVReader(
    open('../../datasets/Abt-Buy/Buy.csv', encoding='latin-1')),
    record_class=BuyRecord,
    adapter=rltk.MemoryKeyValueAdapter())

# statistics
print_details = False
name_count = model_count = description_count = price_count = brand_count = 0
for r in ds_abt:
    name_count += 1
    # Conditional expression used as a statement: prints only when
    # print_details is True (the '' branch is a no-op).
    print('------\nname:', r.name) if print_details else ''
    # NOTE(review): loop body continues past this chunk boundary.
    if len(r.description) > 0: