def __init__(self, config_path, model_name):
     model_props = load_properties(config_path, model_name)
     self.non_categorical_features = model_props['non_categorical_features']
     self.target = model_props['target']
     self.test_size = float(model_props['test_size'])
     self.model_output_filepath = model_props['model_output_filepath']
     self.model_result_filepath = model_props['model_result_filepath']
     self.features_filename = model_props['features_filename']
     self.Features = Features(config_path, model_name)
     self.subscr_type = model_props['subscr_type']
     self.drop_columns = model_props['drop_columns']
     self.drop_rows = model_props['drop_rows']
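
The constructor only pulls string-valued properties out of a config file, which is why test_size is cast with float(). A minimal sketch of load_properties, assuming it simply wraps configparser and returns one section as a dict (the real implementation is not shown here):

import configparser

def load_properties(config_path, model_name):
    parser = configparser.ConfigParser()
    parser.read(config_path)
    # every value comes back as a string, hence float(model_props['test_size']) above
    return dict(parser[model_name])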
Example #2
 def test_model_function(self):
     tests = Features().get_tests().keys()
     model1 = Model(tests)
     model1.fit(x, y)  # x and y (training series) are assumed to be defined elsewhere in the test module
     fm = FinkMos(x, x, model1.tests, model1.tag_corpus)
     a = model1.model_function(1, 3, [2, 3], fm)
     print("model function result")
     print(a)
Example #3
 def __init__(self, x, y, tag_corpus):
     assert isinstance(x, pd.Series)
     self.tag_corpus = tag_corpus
     self.test_dict = Features().get_tests()
     self.test_vec = np.array([test['func'][1] for test in self.test_dict.values()])
     self.x = x
     self.y = y
     self.f_matrix_list = None
     self.linear_loss_done = None
     # self.word2number = {word: index for index, word in enumerate(x.value_counts().index)}
     # tc = tag_corpus.shape[0]
     self.fast_test = dict()
     self.fast_predict = dict()
     self.weight_mat = None
     self.tuple_5_list = None
     self.tup5_2index = dict()
     self.opt = None
     self.v = None
     self.f_v_train = None
     self.calc_from_mem = None
Example #4
def api(img=None):
    response = {
        'status': False,
        'msg': 'Unexpected argument. No image specified',
        'data': None
    }
    # the uploaded file is expected in the 'img' field of the multipart form data
    img = img or request.files.get('img')
    if img:
        if upload_file(img, img.filename, app.config['UPLOAD_FOLDER']):
            # preprocess image
            features = Features(data_dir=config.DATASET_PATH)
            img_class = predict(img.filename)
            # Response
            response['status'] = True
            response['msg'] = 'Upload successful.'
            response['data'] = {'img': img.filename, 'img_class': img_class}
        else:
            response['msg'] = 'Could not upload image'
    return jsonify(response)
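
A hypothetical call against this endpoint using Flask's test client (the /api route, field name, and filename are assumptions, since the snippet omits the route decorator):

with app.test_client() as client:
    with open('sample.jpg', 'rb') as f:
        resp = client.post('/api',
                           data={'img': (f, 'sample.jpg')},
                           content_type='multipart/form-data')
    print(resp.get_json())  # e.g. {'status': True, 'msg': 'Upload successful.', ...}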
Example #5
 def test_predict(self):
     # tests = pass
     # Load Data
     data = PreprocessTags(True).load_data(r'..\data\train.wtag')
     word_num = 30
     x = data.x[0:word_num]
     y = data.y[0:word_num]
     # generate tests - (comment out if file is updated)
     feat_generator = Features()
     feat_generator.generate_tuple_corpus(x, y)
     for template in feat.templates_dict.values():
         feat_generator.generate_lambdas(template['func'],
                                         template['tuples'])
     feat_generator.save_tests()
     model1 = Model()
     a = model1.fit(x, y)
     x_test = x
     y_hat = model1.predict(x_test)
     print(y_hat)
     cm = model1.confusion(y_hat=y_hat, y=y)
     cm.to_csv(r'../training/confusion_matrix.csv')
Example #6
    def test_create_tuples(self):
        data = PreprocessTags(True).load_data(r'..\data\train.wtag')
        word_num = 1_000
        tag_corp = pd.Series(data.y[0:word_num]).unique()
        # generate tests - (comment out if file is updated)
        feat_generator = Features()
        feat_generator.generate_tuple_corpus(data.x[0:word_num],
                                             data.y[0:word_num])
        for template in feat.templates_dict.values():
            feat_generator.generate_lambdas(template['func'],
                                            template['tuples'])
        feat_generator.save_tests()

        fm = FinkMos(data.x[0:word_num], data.y[0:word_num], tag_corp)
        fm.create_tuples()
        print("fm.weight_mat")
        print(fm.weight_mat)
        print("fm.tuple_5_list")
        print(fm.tuple_5_list)
        fm.create_feature_sparse_list_v2()
        # print(len(fm.f_matrix_list))
        print(fm.f_matrix_list[0].shape)
        fm.minimize_loss()
        fm.v.dump('values')
Example #7
 def test_feature_generator(self):
     data = PreprocessTags(True).load_data(
         r'..\data\toy_dataset.txt')
     feat_generator = Features()
     feat_generator.generate_tuple_corpus(data.x[0:10000], data.y[0:10000])
     try:
         # feat_generator.get_tests()  # loads last version saved
         pass
     except:
         pass
     for template in feat.templates_dict.values():
         feat_generator.generate_lambdas(template['func'], template['tuples'])
     # feat_generator.add_lambdas(feat.suffix_funcs_all)  # DONE
     # feat_generator.add_lambdas(feat.prefix_funcs_all)  # DONE
     result = feat_generator.lambdas
     print(len(result))
     with open(r"../training/report_lambdas_dict.p", 'wb') as stream:
         pickle.dump(result, stream)
Example #8
    # Set up logger
    logger = setup_logger()

    logger.info(args)

    # Load Dataset & Batch Loader
    logger.info("Loading the dataset ...")
    dataset = TrainingDataset(args.dataset)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers)

    # Load Model
    logger.info("Loading the model ...")
    model = Features()

    # Triplets Loss
    logger.info("Loading the triplets loss function ...")
    triplets_loss = TripletsLoss()

    # Enable GPU
    if use_gpu:
        model = model.cuda()
        triplets_loss = triplets_loss.cuda()

    # Set model in training mode
    logger.info("Setting up training mode ...")
    model.train()

    # Adam Optimizer
Example #9
import unittest

import numpy as np
import pandas as pd
from models.model import Model
import models.features as feat
from models.features import Features
from models.prerocesing import PreprocessTags
from models.sentence_processor import FinkMos
import os

os.chdir(r'C:\Users\amoscoso\Documents\Technion\nlp\nlp_hw\tests')
# %%
data = PreprocessTags(True).load_data(r'..\data\train.wtag')
word_num = 500
# generate tests - (comment out if file is updated)
feat_generator = Features()
feat_generator.generate_tuple_corpus(data.x[0:word_num], data.y[0:word_num])
for template in feat.templates_dict.values():
    feat_generator.generate_lambdas(template['func'], template['tuples'])
feat_generator.save_tests()
test_data = PreprocessTags(True).load_data(r'..\data\test.wtag')
# %%
word_num = 500
test_number = 50
model1 = Model()
model1.fit(data.x[0:word_num], data.y[0:word_num])

y_hat = model1.predict(test_data.x[:test_number])
model1.confusion(y_hat, test_data.y[:test_number])  # compare against the test labels, not the training labels
Example #10
# coding: utf-8

# import necessary dependencies and files
import os
import sys

import tensorflow as tf
import numpy as np

from models.config import DATASET_PATH, SAVED_FEATURES
from models.features import Features


# Load in the datasets
features = Features(data_dir=DATASET_PATH)
if os.path.isfile(SAVED_FEATURES):
    datasets = np.load(SAVED_FEATURES)
else:
    datasets = features.create(save_file=SAVED_FEATURES)

# Split into training and testing set
X_train, y_train, X_test, y_test = features.train_test_split(datasets)
print('Length of training set: {:,}'.format(len(y_train)))
print('Length of testing set:  {:,}'.format(len(y_test)))

# Define Hyperparameters
# Image & labels
image_size = features.image_size
image_channel = 3
image_shape = (image_size, image_size, image_channel)
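
image_shape would typically feed straight into the network definition. A minimal sketch, assuming a Keras-style classifier (the layers and sizes here are illustrative, not the project's actual model):

num_classes = len(np.unique(y_train))
inputs = tf.keras.Input(shape=image_shape)
x = tf.keras.layers.Conv2D(32, 3, activation='relu')(inputs)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
model = tf.keras.Model(inputs, outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')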
Example #11
class TrainPredictDuration:
    def __init__(self, config_path, model_name):
        model_props = load_properties(config_path, model_name)
        self.non_categorical_features = model_props['non_categorical_features']
        self.target = model_props['target']
        self.test_size = float(model_props['test_size'])
        self.model_output_filepath = model_props['model_output_filepath']
        self.model_result_filepath = model_props['model_result_filepath']
        self.features_filename = model_props['features_filename']
        self.Features = Features(config_path, model_name)
        self.subscr_type = model_props['subscr_type']
        self.drop_columns = model_props['drop_columns']
        self.drop_rows = model_props['drop_rows']

    def data_preparation(self):

        cursor = make_connection()

        df = load_df(
            cursor,
            """select aa.*, bb.municipal as start_municipal, bb.lat as start_lat, bb.lng as start_lng,\
                                cc.municipal as end_municipal, cc.lat as end_lat, cc.lng as end_lng from hubway_trips as aa \
                                left join hubway_stations as bb on aa.strt_statn = bb.id \
                                left join hubway_stations as cc on aa.end_statn = cc.id"""
        )

        weather_df = load_df(cursor, """select * from weather""")

        df[[
            'duration', 'birth_date', 'start_lat', 'start_lng', 'end_lat',
            'end_lng'
        ]] = df[[
            'duration', 'birth_date', 'start_lat', 'start_lng', 'end_lat',
            'end_lng'
        ]].apply(pd.to_numeric)
        weather_df['hpcp'] = pd.to_numeric(weather_df['hpcp'])

        df[['start_date', 'end_date']] = df[['start_date',
                                             'end_date']].apply(pd.to_datetime)
        weather_df['date_time'] = weather_df['date_time'].apply(pd.to_datetime)

        df = df[(df['subsc_type'] == self.subscr_type)]
        df = df[(df['duration'] < df['duration'].quantile(0.75))
                & (df['duration'] > 0)]
        df.dropna(subset=self.drop_rows.split(','), inplace=True)

        def weather_hpcp(df, weather_df, date):
            new_col_name = date.split("_")[0] + '_hpcp'
            tol = pd.Timedelta(days=3)
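            # merge_asof pairs each trip with the weather reading closest in
            # time (direction='nearest'), but only within the 3-day tolerance;
            # unmatched trips get NaN and are filled with the monthly mean below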
            df = pd.merge_asof(df.sort_values(by=date),
                               weather_df[['date_time', 'hpcp']].sort_values(
                                   by='date_time').set_index('date_time'),
                               right_index=True,
                               direction='nearest',
                               tolerance=tol,
                               left_on=date)
            df.rename(columns={'hpcp': new_col_name}, inplace=True)
            df[new_col_name] = df[new_col_name].groupby(
                [df[date].dt.month]).transform(lambda x: x.fillna(x.mean()))
            return df

        df = weather_hpcp(df, weather_df, "start_date")
        df = weather_hpcp(df, weather_df, "end_date")

        return df

    def feature_engineering(self, df):
        df['driver_age'] = df.apply(
            lambda x: self.Features.driver_age(x['birth_date']), axis=1)
        df['driver_age_cat'] = df.apply(
            lambda x: self.Features.driver_age_category(x['driver_age']),
            axis=1)
        df['travel_distance'] = df.apply(lambda x: self.Features.distance(
            x['start_lat'], x['start_lng'], x['end_lat'], x['end_lng']),
                                         axis=1)
        df['average_speed'] = df.apply(lambda x: self.Features.average_speed(
            x['travel_distance'], x['duration']),
                                       axis=1)
        df = self.Features.temporal_features(df)
        df = self.Features.one_hot_encoding(df)
        df['is_station_diff'] = self.Features.strt_end_diff(df)
        df = self.Features.station_flows(df)
        for col in df.columns:
            if df[col].isna().sum() != 0:
                print(col)
        df.to_csv(os.path.join("data", self.features_filename + '.csv'),
                  index=False)
        return df

    def feature_selection(self, df):
        non_impact_features = [
            'is_start_11Q4', 'is_adult', 'is_start_weekend', 'is_start_9',
            'is_start_12Q2', 'is_start_4', 'is_start_17', 'is_young_adult',
            'is_start_working_day', 'is_start_12Q3'
        ]
        non_categorical_features = self.non_categorical_features.split(",")
        X = df[non_categorical_features + list(df.filter(regex='is_').columns)]
        # X = X.drop(X[non_impact_features], axis=1)
        feature_names = list(X.columns)
        y = df[self.target]
        return X, y, feature_names

    def train_test_split(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=42)
        return X_train, X_test, y_train, y_test

    def grid_search_train(self, X, y, estimator, param):
        param_grid = {
            "rf_{}".format(self.subscr_type): {
                # "n_estimators": list(range(20, 81, 10)),
                "n_estimators": [80],
                "bootstrap": ['True'],
                "criterion": ['mse'],
                "max_features": ['auto', 'sqrt'],
                "min_samples_leaf": [5]
            },
            "lr_{}".format(self.subscr_type): {
                "normalize": ['True'],
                "alpha": [0.01, 0.02, 0.03, 0.04],
            },
            "gb_{}".format(self.subscr_type): {
                "n_estimators": list(range(20, 81, 10)),
                "learning_rate": [0.01, 0.02, 0.03, 0.04],
                "min_samples_split": [500],
                "min_samples_leaf": [50],
                "max_depth": [4, 6, 8, 10],
                "max_features": ['auto'],
                "subsample": [0.9, 0.5, 0.2, 0.1]
            }
        }
        print(param_grid[param])
        X_train, X_test, y_train, y_test = self.train_test_split(X, y)
        grid_search = GridSearchCV(estimator,
                                   param_grid[param],
                                   cv=5,
                                   n_jobs=-1)
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_
        predictions = model.predict(X_test)
        return model, predictions, y_test

    def save_model(self, model, model_name):
        full_path = os.path.join(self.model_output_filepath, model_name)
        pickle.dump(model, open(full_path, 'wb'))

    def save_output(self, y_pred_test, y_test, output_name):
        results = {
            "y_pred_test": y_pred_test.tolist(),
            "y_test": y_test.tolist()
        }
        with open(os.path.join(self.model_result_filepath, output_name),
                  'w') as ofile:
            json.dump(results, ofile)

    def post_processing(self):
        data_file_path = "data/" + self.features_filename + ".csv"
        ddl_file_path = "ddls/" + self.features_filename + "_table_create.sql"
        cursor = make_connection()
        with open(ddl_file_path, "r") as file_obj:
            sql_statement = file_obj.read()
        try:
            if SQLHandler().execute_ddl(cursor, sql_statement):
                print("Table Created")
            else:
                print("Skipping table creation")
        except IOError as f_ex:
            print("File {} not accessible, Error message : {}".format(
                ddl_file_path, f_ex))

        try:
            if SQLHandler().ingest_csv(cursor, data_file_path,
                                       self.features_filename):
                print("Data Ingested")
            else:
                print("Please Check CSV File")
        except Exception as ex:
            print("Unable to ingest : {}".format(data_file_path, ex))
        return
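
A hypothetical end-to-end driver for this class (the config path, model name, and estimator are assumptions; grid_search_train expects a parameter key of the form "rf_<subscr_type>" as defined above):

from sklearn.ensemble import RandomForestRegressor

tpd = TrainPredictDuration('config.ini', 'duration_model')
df = tpd.data_preparation()
df = tpd.feature_engineering(df)
X, y, feature_names = tpd.feature_selection(df)
model, y_pred_test, y_test = tpd.grid_search_train(
    X, y, RandomForestRegressor(), "rf_{}".format(tpd.subscr_type))
tpd.save_model(model, "rf_{}.pkl".format(tpd.subscr_type))
tpd.save_output(y_pred_test, y_test, "rf_{}_results.json".format(tpd.subscr_type))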
Example #12
class FinkMos:

    def __init__(self, x, y, tag_corpus):
        assert isinstance(x, pd.Series)
        self.tag_corpus = tag_corpus
        self.test_dict = Features().get_tests()
        self.test_vec = np.array([test['func'][1] for test in self.test_dict.values()])
        self.x = x
        self.y = y
        self.f_matrix_list = None
        self.linear_loss_done = None
        # self.word2number = {word: index for index, word in enumerate(x.value_counts().index)}
        # tc = tag_corpus.shape[0]
        self.fast_test = dict()
        self.fast_predict = dict()
        self.weight_mat = None
        self.tuple_5_list = None
        self.tup5_2index = dict()
        self.opt = None
        self.v = None
        self.f_v_train = None
        self.calc_from_mem = None

    def create_tuples(self):
        """
        tuple handling
        :Create: tuple_5_list (list of 5 tuple combinations)
            weight_mat (number of occurences of each tuple in dataset)
        :return:
        :rtype:
        """
        tx_0 = self.x.values
        ty_0 = self.y.values
        tx_1 = np.roll(tx_0, 1)
        tx_2 = np.roll(tx_1, 1)
        ty_1 = np.roll(ty_0, 1)
        ty_2 = np.roll(ty_1, 1)

        tuple_6_np = ty_0 + "_" + ty_1 + "_" + ty_2 + "_" + tx_0 + "_" + tx_1 + "_" + tx_2
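        # Toy illustration (hypothetical data): for x = ['the', 'dog', 'ran'] and
        # y = ['DT', 'NN', 'VB'], the rolls align position i with positions i-1
        # and i-2 (wrapping at the start), so tuple_6_np[2] == 'VB_NN_DT_ran_dog_the'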
        tuple_6_counts_series = pd.Series(tuple_6_np).value_counts()
        tuple_5_df = pd.DataFrame([ty_1, ty_2, tx_0, tx_1, tx_2]).T
        tuple_5_df.sort_values([0, 1, 2, 3, 4], inplace=True)  # sort the 5-tuples lexicographically by all five columns
        tuple_5_df.drop_duplicates(inplace=True, keep='first')  # remove duplicates
        self.tuple_5_list = list(map(lambda x: list(x[1]), tuple_5_df.iterrows()))  # make list of every row of the DF

        # create the weight mask (lil_matrix supports efficient incremental assignment)
        weight_mask = spar.lil_matrix((self.tag_corpus.shape[0], tuple_5_df.shape[0]), dtype=int)
        self.tup5_2index = {"_".join(x): num for num, x in enumerate(tuple_5_df.values)}
        for tup, count in tuple_6_counts_series.items():
            tup_0 = tup.split('_')[0]
            tup_5 = '_'.join(tup.split('_')[1:])
            ind_j = self.tup5_2index[tup_5]
            itemindex = np.where(self.tag_corpus == tup_0)
            ind_i = itemindex[0]
            weight_mask[ind_i, ind_j] = count
        self.weight_mat = weight_mask.tocsr()  # csr is efficient for the arithmetic below

    def create_feature_sparse_list_v2(self, training_fm=None):
        # return a list of sparse matrices, each matrix
        # tuple_5_list = self.tuple_5_list

        tuple_5_size = len(self.tuple_5_list)
        # tuple_0_list = self.tag_corpus  # [[elem1], [elem2], ...] ->
        tuple_0_size = self.tag_corpus.shape[0]
        num_test = len(self.test_dict)
        # a list of empty sparse matrices, one per tag (lil_matrix for cheap item assignment)
        result = [spar.lil_matrix((tuple_5_size, num_test), dtype=bool) for _ in range(tuple_0_size)]
        # iterate list of test names
        if self.y is None:  # inference mode
            calculated = spar.lil_matrix((tuple_5_size, tuple_0_size), dtype=int)
            for tup_5_ind, tup5 in enumerate(self.tuple_5_list):
                # if calculated before take value
                tup_5_str = ('_').join(tup5)
                if tup_5_str in training_fm.tup5_2index:  # TODO: _get instead of in
                    ind_in_train = training_fm.tup5_2index[tup_5_str]
                    calculated[tup_5_ind, :] = training_fm.f_v_train[:, ind_in_train]
                    continue
                for tup_0_ind, tup0 in enumerate(self.tag_corpus):
                    tup = (tup0,) + tuple(tup5)
                    result[tup_0_ind][tup_5_ind, :] = np.array([test(tup) for test in self.test_vec])
            self.calc_from_mem = calculated.tocsr()
        else:
            for test_ind, (key, val) in enumerate(self.test_dict.items()):
                # iterate list of tuples per test
                for tup in set(val['tup_list']):
                    tup_0_ind = np.where(tup[0] == self.tag_corpus)[0][0]
                    tup_5_ind = self.tup5_2index['_'.join(tup[1:])]
                    result[tup_0_ind][tup_5_ind, test_ind] = True
        self.f_matrix_list = [m.tocsr() for m in result]  # csr for the dot products in self.dot

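    # The loss below is the negative log-likelihood of a log-linear model:
    #   loss(v) = sum_t n(t) * log(sum_y exp(v . f(y, t))) - sum_{y,t} n(y, t) * (v . f(y, t))
    # where t ranges over 5-tuples, y over tags, and n(.) are the counts in weight_mat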
    def loss_function(self, v):
        f_v = self.dot(v)
        f_v_mask = self.weight_mat.multiply(f_v)
        l_fv = f_v_mask.sum()  # count-weighted linear term
        exp_ = np.exp(f_v)
        exp_sum = np.sum(exp_, axis=0)
        repetitions = np.array(self.weight_mat.sum(axis=0))  # dense from here on
        ln = np.log(exp_sum) * repetitions
        sum_ln = np.sum(ln)
        return sum_ln - l_fv  # + 0.1 * np.linalg.norm(v)

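    # Gradient of the loss above: model-expected feature counts minus the
    # empirical (count-weighted) feature counts, i.e. E_model[f] - E_data[f]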
    def loss_gradient(self, v):
        f_v = self.dot(v)  # dims: tup0 x tup5
        e_f_v = np.exp(f_v)  # dims: tup0 x tup5
        # normalize over tags (axis=0) to match the partition function in loss_function
        z = np.sum(e_f_v, axis=0) + 1e-11  # dims: tup5
        p = e_f_v / z  # dims: tup0 x tup5
        f_p_tup5_list = []  # model-expected feature sums, accumulated over tags
        f_v_tup_0_tests = []  # empirical feature sums, accumulated over tags
        for tup_0_ind, sparse_matrix in enumerate(self.f_matrix_list):
            spar_t = sparse_matrix.T  # dims: tests x tup5
            # Left: count-weighted (empirical) feature sums
            weight_vec = self.weight_mat[tup_0_ind, :]
            weighted_slice = spar_t.multiply(weight_vec)
            f_v_tests = weighted_slice.sum(axis=1)
            f_v_tup_0_tests.append(f_v_tests)

            # Right: probability-weighted (model-expected) feature sums
            f_p = spar_t.multiply(p[tup_0_ind, :])  # dims: tests x tup5
            f_p_tup5_list.append(f_p)
        sparse_sum = sum(f_p_tup5_list)
        sparse_sum_weighted = sparse_sum.multiply(self.weight_mat.sum(axis=0))
        right = np.squeeze(np.array(sparse_sum_weighted.sum(axis=1)))

        left = np.array(f_v_tup_0_tests)
        left_sum = np.squeeze(np.array(np.sum(left, axis=0)))  # dims: num_tests
        regularization = 0.2 * v
        result = left_sum - right  # - regularization
        return -result  # gradient of the minimized loss (see loss_function)

    def dot(self, v):
        results = []
        for sparse_matrix in self.f_matrix_list:
            results.append(sparse_matrix.dot(v))
        return np.array(results)

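    # Conjugate-gradient minimization using the analytic gradient above;
    # maxiter=15 trades full convergence for training speed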
    def minimize_loss(self):
        self.opt = minimize(self.loss_function,
                            np.ones(len(self.test_dict)),
                            jac=self.loss_gradient,
                            options=dict(disp=True,
                                         maxiter=15,
                                         # eps=1e-5,
                                         # gtol= 1e-6
                                         ),
                            method='CG',
                            callback=self.callback_func)
        self.v = self.opt.x
        self.f_v_train = self.dot(self.v)

    def callback_func(self, x):
        print(f'Current loss {self.loss_function(x)}')



    def prob_q2(self, v, y_token, training_fm):
        self.create_feature_sparse_list_v2(training_fm)  # creates f_matrix_list
        f_v = self.dot(v) + self.calc_from_mem.T  # dims tup0 x tup5
        y_nomin = np.array(f_v[y_token])  # dims tup5 x 1
        exp_ = np.array(np.exp(f_v)).squeeze()
        exp_sum = np.sum(exp_, axis=0)  # dims tup5 x 1
        prob = np.array(y_nomin / (exp_sum+1e-10))[0]  # dims tup5 x 1
        return prob