Example No. 1
def get_datasets_from_csvs(csv_path_1, record_1, csv_path_2, record_2):
    dataset_1 = rltk.Dataset(reader=rltk.CSVReader(csv_path_1),
                             record_class=record_1,
                             adapter=rltk.MemoryAdapter())
    dataset_2 = rltk.Dataset(reader=rltk.CSVReader(csv_path_2),
                             record_class=record_2,
                             adapter=rltk.MemoryAdapter())

    return dataset_1, dataset_2
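
A minimal usage sketch for the helper above; the record classes (PersonRecord1, PersonRecord2) and CSV paths are placeholders, not part of the original example.

# Hypothetical call: both record classes are assumed to be rltk.Record subclasses.
ds_a, ds_b = get_datasets_from_csvs('people_a.csv', PersonRecord1,
                                    'people_b.csv', PersonRecord2)
# rltk.Dataset objects are iterable, so the loaded records can be inspected directly.
for r in ds_a:
    print(r.id)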
Example No. 2
def featurize(mode, output_filename=None):
    """
    Catch-all method to featurize either the train or the test dataset and save it to CSV.

    Params:
        mode: (str) 'train' or 'test'
        output_filename: (str) optional; name of the CSV file to save the data to
    """
    MODE = mode
    if not os.path.exists('train/') or not os.path.exists('test/'):
        train_test_split()
        
    if not os.path.exists('block_files/'):
        os.mkdir('block_files/')

    BLOCK_FILE = 'block_files/'+MODE+'.jl'
    CORPUS_FREQ_FILE = MODE+'/corpus_freq.json'

    ds_amzn = rltk.Dataset(reader=rltk.CSVReader(open(MODE + '/Amazon.csv', encoding='latin-1')),
                    record_class=AmazonRecord, adapter=rltk.MemoryAdapter())

    ds_goog = rltk.Dataset(reader=rltk.CSVReader(open(MODE + '/GoogleProducts.csv', encoding='latin-1')),
                    record_class=GoogleRecord, adapter=rltk.MemoryAdapter())

    # Reuse an existing block file if present; otherwise generate one with an inverted-index blocker.
    try:
        block_handler = open(BLOCK_FILE,'r')
        print("Block file exists. Reading from disk...")
    except FileNotFoundError:
        block_handler = rltk.InvertedIndexBlockGenerator(
            ds_amzn, ds_goog, writer=rltk.BlockFileWriter(BLOCK_FILE), tokenizer=tokenizer).generate()

    features = ['id1', 'id2', 'price_difference',
       'desc_jaccard', 'desc_tf_idf', 'desc_trigram',
       'manufacturer_jaccard', 'manufacturer_jaro_winkler',
       'manufacturer_levenshtien', 'name_jaccard', 'name_jaro_winkler',
       'name_trigram','label']

    pairs = rltk.get_record_pairs(ds_amzn, ds_goog, rltk.BlockFileReader(block_handler))
    freq = get_document_frequency(CORPUS_FREQ_FILE, ds_amzn, ds_goog)

    if MODE == "train":
        print("Featurizing train")
        if not output_filename:
            output_filename = 'train/features_train.csv'
        featurize_all_records(pairs, features, output_filename, freq, TRAIN_DOC_SIZE)
    elif MODE == "test":
        print("Featurizing test")
        if not output_filename:
            output_filename = 'test/features_test.csv'
        featurize_all_records(pairs, features, output_filename, freq, TEST_DOC_SIZE)
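
How featurize might be invoked, based on the defaults above; the custom output filename is only an illustration.

# Hypothetical invocation: output defaults to train/features_train.csv
# and test/features_test.csv when no filename is passed.
featurize('train')
featurize('test', output_filename='test/my_features_test.csv')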
Example No. 3
@rltk.remove_raw_object
class Record2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['name'].split(' ')[1]


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.MemoryAdapter())

# for r in ds1:
#     print(r.id, r.first_name, r.last_name)
# for r in ds2:
#     print(r.id, r.first_name, r.last_name)

block_writer = rltk.BlockFileWriter('blocks.jl')
# block_writer = rltk.BlockArrayWriter()
block_writer.write('1', 'a')
block_writer.write('2', 'b')
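
Record1 is referenced in this example but not shown; a plausible sketch, assuming ds1.csv has id, first_name and last_name columns (the column names are guesses).

@rltk.remove_raw_object
class Record1(rltk.Record):
    @rltk.cached_property
    def id(self):
        # Assumed CSV column name.
        return self.raw_object['id']

    @rltk.cached_property
    def first_name(self):
        return self.raw_object['first_name']

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['last_name']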
Example No. 4
    def name(self):
        return self.raw_object['name']

    @rltk.cached_property
    def laptop(self):
        return self.raw_object['laptop_brand']


@rltk.remove_raw_object
class EvaluationRecord2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def name(self):
        return self.raw_object['name']

    @rltk.cached_property
    def laptop(self):
        return self.raw_object['laptop']


dataset_1_file_name = 'data_1.csv'
dataset_2_file_name = 'data_2.csv'

ds1 = rltk.Dataset(reader=rltk.CSVReader(dataset_1_file_name),
                   record_class=EvaluationRecord)
ds2 = rltk.Dataset(reader=rltk.CSVReader(dataset_2_file_name),
                   record_class=EvaluationRecord2)
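
One way these two datasets might then be compared, using rltk.get_record_pairs and rltk.levenshtein_similarity as in the other examples; the field choice and threshold are assumptions.

# Sketch only: pair every record across the two datasets and score the name fields.
for r1, r2 in rltk.get_record_pairs(ds1, ds2):
    name_sim = rltk.levenshtein_similarity(r1.name, r2.name)
    if name_sim > 0.8:  # assumed threshold
        print(r1.id, r2.id, name_sim)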
Example No. 5
import rltk
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import numpy as np

from featurize import featurize, get_document_frequency, featurize_record_pair, TRAIN_DOC_SIZE
from utils import impute_df, DATASET_DIR
from amazon_record import AmazonRecord
from google_record import GoogleRecord

ds_amzn = rltk.Dataset(reader=rltk.CSVReader(
    open(DATASET_DIR + 'Amazon.csv', encoding='latin-1')),
                       record_class=AmazonRecord,
                       adapter=rltk.MemoryAdapter())

ds_goog = rltk.Dataset(reader=rltk.CSVReader(
    open(DATASET_DIR + 'GoogleProducts.csv', encoding='latin-1')),
                       record_class=GoogleRecord,
                       adapter=rltk.MemoryAdapter())


def generate_features(gt_train):
    """
    Generate features from stratified ground truth DataFrames

    Params:
        gt_train: (DataFrame) DataFrame containing stratified training data ids and labels
Example No. 6
        return address.strip().lower()

    @rltk.cached_property
    def phone(self):
        phone = self.raw_object['Phone'].replace('/', '-').replace(
            ' ', '')  #.replace('and','or').split('or')
        #         print(phone.strip()[:15])
        return phone.strip()[:15]

    @rltk.cached_property
    def cuisine(self):
        cs = self.raw_object['Cuisine']
        return cs if cs else ''


ds_fod = rltk.Dataset(rltk.CSVReader(file_F),
                      record_class=DBFod,
                      adapter=rltk.MemoryKeyValueAdapter())
# dFod = [[k+1,dblp.id,dblp.cuisine,dblp.address] for k,dblp in enumerate(ds_fod)]
# print(dFod[506])
# for r_dblp in ds_fod:
#     print(r_dblp.name)

tokenizer = rltk.CrfTokenizer()
i = 0


# Tokenize a value with the CRF tokenizer and count how many values have been processed.
def tokenize_id(t):
    tokens = tokenizer.tokenize(t)
    global i
    i += 1
Example No. 7
    def parent_id(self):
        return '4' if self.id == '1' else None


class Record2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def value(self):
        v = self.raw_object.get('values', list())
        return v[0] if len(v) > 0 else 'empty'


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.DBMAdapter('file_index'))

pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in pairs:
    print('-------------')
    print(r1.id, r1.value, '\t', r2.id, r2.value)
    if r1.parent_id:
        print('r1\'s parent', r1.parent_id, ds1.get_record(r1.parent_id).value)
    print('levenshtein_distance:',
          rltk.levenshtein_distance(r1.value, r2.value))
    print('levenshtein_similarity:',
          rltk.levenshtein_similarity(r1.value, r2.value))
Example No. 8
        return s.lower().replace('-', '').replace('/', '').replace('&', '')

    @rltk.cached_property
    def brand_cleaned(self):
        _ = self.name_tokens
        manufacturer = self.manufacturer
        return process_brand_alias(
            manufacturer if manufacturer != '' else self.brand)

    @rltk.cached_property
    def model_cleaned(self):
        m = self.model
        return BuyRecord._clean(m)


ds_abt = rltk.Dataset(reader=rltk.CSVReader(
    open('../../datasets/Abt-Buy/Abt.csv', encoding='latin-1')),
                      record_class=AbtRecord,
                      adapter=rltk.MemoryKeyValueAdapter())

ds_buy = rltk.Dataset(reader=rltk.CSVReader(
    open('../../datasets/Abt-Buy/Buy.csv', encoding='latin-1')),
                      record_class=BuyRecord,
                      adapter=rltk.MemoryKeyValueAdapter())

# statistics
print_details = False
name_count = model_count = description_count = price_count = brand_count = 0
for r in ds_abt:
    name_count += 1
    if print_details:
        print('------\nname:', r.name)
    if len(r.description) > 0: