Example #1
    def add(self, data=None, row_ids=None, sync=False):
        '''Add data to the multiverso MatrixTable.

        If row_ids is None, all the data is added and `data` must cover
        the whole table.

        Otherwise only the rows listed in `row_ids` are added.

        `data` should be a two-dimensional numpy.ndarray.

        If sync is True, this call blocks on IO until the add finishes.
        Otherwise it returns immediately.
        '''
        assert(data is not None)
        data = convert_data(data)

        if row_ids is None:
            assert(data.size == self._size)
            if sync:
                mv_lib.MV_AddMatrixTableAll(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
            else:
                mv_lib.MV_AddAsyncMatrixTableAll(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
        else:
            row_ids_n = len(row_ids)
            assert(data.size == row_ids_n * self._num_col)
            int_array_type = ctypes.c_int * row_ids_n
            if sync:
                mv_lib.MV_AddMatrixTableByRows(self._handler, data.ctypes.data_as(C_FLOAT_P),
                                               row_ids_n * self._num_col,
                                               int_array_type(*row_ids), row_ids_n)
            else:
                mv_lib.MV_AddAsyncMatrixTableByRows(self._handler, data.ctypes.data_as(C_FLOAT_P),
                                               row_ids_n * self._num_col,
                                               int_array_type(*row_ids), row_ids_n)
Example #2
def run_models(model_ensemble, data_file):
    feature_names = model_ensemble[0].feature_names

    df = read_data(data_file)
    df = convert_data(df)

    # Drop extra columns
    extra_columns = set(df.columns) - set(feature_names)
    logging.info(f'Columns present in test, but not in training: {extra_columns}')
    df.drop(extra_columns, axis=1, inplace=True)

    # Add missing columns
    missing_columns = set(feature_names) - set(df.columns)
    logging.info(f'Columns present in training, but not in test: {missing_columns}')
    for col in missing_columns:
        df[col] = 0

    df = df[feature_names]

    preds = pd.DataFrame(model.predict(df) for model in model_ensemble)

    result = pd.DataFrame()
    result['ident'] = df.reset_index()['ident']
    result['probs'] = preds.mean() > 0.5

    return result
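The drop-extra / add-missing / reorder sequence above can be collapsed into a
single pandas call with the same effect (shown only as a sketch):

    # drops extra columns, adds missing ones filled with 0, and reorders
    df = df.reindex(columns=feature_names, fill_value=0)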
Example #3
    def add(self, data, sync=False):
        '''Add data to the multiverso ArrayTable.

        `data` should be a one-dimensional numpy.ndarray.

        If sync is True, this call blocks on IO until the add finishes.
        Otherwise it returns immediately.
        '''
        data = convert_data(data)
        assert(data.size == self._size)
        if sync:
            mv_lib.MV_AddArrayTable(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
        else:
            mv_lib.MV_AddAsyncArrayTable(self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
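A usage sketch, assuming an `ArrayTable` class exposing the `add` above (the
class name and constructor are hypothetical):

    import numpy as np

    table = ArrayTable(size=8)                             # hypothetical constructor
    table.add(np.arange(8, dtype=np.float32))              # async: returns immediately
    table.add(np.arange(8, dtype=np.float32), sync=True)   # blocks until the add finishes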
Example #4
    def __init__(self, size, init_value=None):
        '''Constructor for syncing array-like (one-dimensional) value.

        The `size` should be an int equal to the size of the value we want
        to sync.
        If init_value is None, zeros will be used to initialize the table;
        otherwise the table will be initialized to init_value.
        Notice: if init_value differs across processes, their average will
        be used.
        '''
        self._handler = ctypes.c_void_p()
        self._size = size
        mv_lib.MV_NewArrayTable(size, ctypes.byref(self._handler))
        if init_value is not None:
            init_value = convert_data(init_value)
            # sync add is used because we want to make sure that the initial
            # value has taken effect when the call returns.
            self.add(init_value / api.workers_num(), sync=True)
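Why the division by api.workers_num(): every worker adds init_value / N to the
shared table, so after all N sync adds the table holds the element-wise
average of the workers' initial values. A two-worker sketch of the arithmetic:

    import numpy as np

    w0 = np.array([2.0, 4.0])    # worker 0's init_value
    w1 = np.array([4.0, 8.0])    # worker 1's init_value
    table = w0 / 2 + w1 / 2      # what the two sync adds accumulate
    print(table)                 # [3. 6.] -- the element-wise average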
Example #5
    def __init__(self, size, init_value=None):
        '''Constructor for syncing array-like (one-dimensional) value.

        The `size` should be an int equal to the size of the value we want
        to sync.
        If init_value is None, zeros will be used to initialize the table;
        otherwise the table will be initialized to init_value.
        *Notice*: if init_value differs across processes, their average
        will be used.
        '''
        self._handler = ctypes.c_void_p()
        self._size = size
        mv_lib.MV_NewArrayTable(size, ctypes.byref(self._handler))
        if init_value is not None:
            init_value = convert_data(init_value)
            # sync add is used because we want to make sure that the initial
            # value has taken effect when the call returns.
            self.add(init_value / api.workers_num(), sync=True)
Example #6
    def add(self, data, sync=False):
        '''Add data to the multiverso ArrayTable.

        `data` should be a one-dimensional numpy.ndarray.

        If sync is True, this call blocks on IO until the add finishes.
        Otherwise it returns immediately.
        '''
        data = convert_data(data)
        assert (data.size == self._size)
        if sync:
            mv_lib.MV_AddArrayTable(self._handler,
                                    data.ctypes.data_as(C_FLOAT_P), self._size)
        else:
            mv_lib.MV_AddAsyncArrayTable(self._handler,
                                         data.ctypes.data_as(C_FLOAT_P),
                                         self._size)
Example #7
    def __init__(self, size, init_value=None):
        '''Constructor for syncing array-like (one-dimensional) value.

        The `size` should be an int equal to the size of the value we want
        to sync.
        If init_value is None, zeros will be used to initialize the table;
        otherwise the table will be initialized to init_value.
        *Notice*: Only the init_value from the master will be used!
        '''
        self._handler = ctypes.c_void_p()
        self._size = size
        mv_lib.MV_NewArrayTable(size, ctypes.byref(self._handler))
        if init_value is not None:
            init_value = convert_data(init_value)
            # A sync add is used so that the initial value has taken effect
            # when the call returns. Every worker, master or not, must call
            # add so that the synchronous add can complete.
            self.add(init_value if api.is_master_worker() else np.zeros(init_value.shape), sync=True)
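The effect of the master-only pattern above: non-master workers add zeros, so
only the master's values land in the table, but because every worker still
issues a blocking add, the call also acts as a synchronization point. A sketch
of what each process contributes:

    # master:     add(init_value, sync=True)  -> contributes the real values
    # non-master: add(zeros,      sync=True)  -> contributes nothing, but
    #                                            still blocks until done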
Example #8
    def __init__(self, size, init_value=None):
        '''Constructor for syncing array-like (one-dimensional) value.

        The `size` should be an int equal to the size of the value we want
        to sync.
        If init_value is None, zeros will be used to initialize the table;
        otherwise the table will be initialized to init_value.
        *Notice*: Only the init_value from the master will be used!
        '''
        self._handler = ctypes.c_void_p()
        self._size = size
        mv_lib.MV_NewArrayTable(size, ctypes.byref(self._handler))
        if init_value is not None:
            init_value = convert_data(init_value)
            # A sync add is used so that the initial value has taken effect
            # when the call returns. Every worker, master or not, must call
            # add so that the synchronous add can complete.
            self.add(init_value
                     if api.is_master_worker() else np.zeros(init_value.shape),
                     sync=True)
Example #9
def main(*data_files):
    if len(data_files) < 1:
        raise TypeError(
            f'No data files found! Usage: python {os.path.basename(__file__)} data_file [data_file]...'
        )

    start = datetime.now()
    logger.info(f'Starting {start}')

    df = read_data(*data_files)

    df = convert_data(df)

    model = train_model(df)

    save_model(model=model,
               filename=MODEL_PATH.joinpath(SESSION_NAME + '.pkl'))

    end = datetime.now()
    logger.info(f'All done {end}, elapsed: {end - start}')
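A plausible entry point for the `main` above, matching its usage string (the
argument handling is an assumption, not shown in the source):

    import sys

    if __name__ == '__main__':
        main(*sys.argv[1:])   # e.g. python train.py day1.csv day2.csv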
Example #10
    def __init__(self, num_row, num_col, init_value=None):
        '''Constructor for syncing matrix-like (two-dimensional) value.

        The `num_row` should be the number of rows and the `num_col` should be
        the number of columns.

        If init_value is None, zeros will be used to initialize the table;
        otherwise the table will be initialized to init_value.
        Notice: if init_value differs across processes, their average will
        be used.
        '''
        self._handler = ctypes.c_void_p()
        self._num_row = num_row
        self._num_col = num_col
        self._size = num_col * num_row
        mv_lib.MV_NewMatrixTable(num_row, num_col, ctypes.byref(self._handler))
        if init_value is not None:
            init_value = convert_data(init_value)
            # sync add is used because we want to make sure that the initial
            # value has taken effect when the call returns.
            self.add(init_value / api.workers_num(), sync=True)
Example #11
    def __init__(self, num_row, num_col, init_value=None):
        '''Constructor for syncing matrix-like (two-dimensional) value.

        The `num_row` should be the number of rows and the `num_col` should be
        the number of columns.

        If init_value is None, zeros will be used to initialize the table;
        otherwise the table will be initialized to init_value.
        Notice: if init_value differs across processes, their average will
        be used.
        '''
        self._handler = ctypes.c_void_p()
        self._num_row = num_row
        self._num_col = num_col
        self._size = num_col * num_row
        mv_lib.MV_NewMatrixTable(num_row, num_col, ctypes.byref(self._handler))
        if init_value is not None:
            init_value = convert_data(init_value)
            # sync add is used because we want to make sure that the initial
            # value has taken effect when the call returns.
            self.add(init_value / api.workers_num(), sync=True)
Example #12
    def add(self, data=None, row_ids=None, sync=False):
        '''Add data to the multiverso MatrixTable.

        If row_ids is None, all the data is added and `data` must cover
        the whole table.

        Otherwise only the rows listed in `row_ids` are added.

        `data` should be a two-dimensional numpy.ndarray.

        If sync is True, this call blocks on IO until the add finishes.
        Otherwise it returns immediately.
        '''
        assert (data is not None)
        data = convert_data(data)

        if row_ids is None:
            assert (data.size == self._size)
            if sync:
                mv_lib.MV_AddMatrixTableAll(self._handler,
                                            data.ctypes.data_as(C_FLOAT_P),
                                            self._size)
            else:
                mv_lib.MV_AddAsyncMatrixTableAll(
                    self._handler, data.ctypes.data_as(C_FLOAT_P), self._size)
        else:
            row_ids_n = len(row_ids)
            assert (data.size == row_ids_n * self._num_col)
            int_array_type = ctypes.c_int * row_ids_n
            if sync:
                mv_lib.MV_AddMatrixTableByRows(self._handler,
                                               data.ctypes.data_as(C_FLOAT_P),
                                               row_ids_n * self._num_col,
                                               int_array_type(*row_ids),
                                               row_ids_n)
            else:
                mv_lib.MV_AddAsyncMatrixTableByRows(
                    self._handler,
                    data.ctypes.data_as(C_FLOAT_P), row_ids_n * self._num_col,
                    int_array_type(*row_ids), row_ids_n)
Example #13
    def __init__(self, num_row, num_col, init_value=None):
        '''Constructor for syncing matrix-like (two-dimensional) value.

        The `num_row` should be the number of rows and the `num_col` should be
        the number of columns.

        If init_value is None, zeros will be used to initialize the table;
        otherwise the table will be initialized to init_value.
        *Notice*: Only the init_value from the master will be used!
        '''
        self._handler = ctypes.c_void_p()
        self._num_row = num_row
        self._num_col = num_col
        self._size = num_col * num_row
        mv_lib.MV_NewMatrixTable(num_row, num_col, ctypes.byref(self._handler))
        if init_value is not None:
            init_value = convert_data(init_value)
            # A sync add is used so that the initial value has taken effect
            # when the call returns. Every worker, master or not, must call
            # add so that the synchronous add can complete.
            self.add(init_value if api.is_master_worker() else np.zeros(init_value.shape), sync=True)
Example #14
    def de_parse(cls, prefix, json):
        '''De-parse a JSON document into a flat dict of HBase column values.'''
        result_dict = {}
        for key, value in json.iteritems():
            if key in {'_id'}:
                pass
            else:
                key_name = make_column_name(prefix, key)
                if key_name in USER_DATETIME_COLUMN_SET:
                    result_dict[key_name] = parse_datetime_into_hbase(value)
                elif key_name in USER_BOOLEAN_COLUMN_SET:
                    result_dict[key_name] = parse_boolean_into_hbase(value)
                elif key_name in USER_INT_COLUMN_SET:
                    result_dict[key_name] = parse_int_into_hbase(value)
                elif key_name in USER_LIST_COLUMN_SET:
                    # use a distinct name to avoid shadowing the `json` argument
                    simplejson = import_simplejson()
                    result_dict[key_name] = simplejson.dumps(value)
                else:
                    result_dict[key_name] = convert_data(value)

        return result_dict
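The parse_*_into_hbase helpers are not shown in the source; as an illustration,
a hypothetical sketch of the boolean converter (HBase stores raw bytes, so
values are usually serialized to strings):

    def parse_boolean_into_hbase(value):
        # normalize truthy / falsy inputs to the strings '1' / '0'
        return '1' if value in (True, 'true', 'True', 1) else '0'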
Example #15
    def __init__(self, num_row, num_col, init_value=None):
        '''Constructor for syncing matrix-like (two-dimensional) value.

        The `num_row` should be the number of rows and the `num_col` should be
        the number of columns.

        If init_value is None, zeros will be used to initialize the table;
        otherwise the table will be initialized to init_value.
        *Notice*: Only the init_value from the master will be used!
        '''
        self._handler = ctypes.c_void_p()
        self._num_row = num_row
        self._num_col = num_col
        self._size = num_col * num_row
        mv_lib.MV_NewMatrixTable(num_row, num_col, ctypes.byref(self._handler))
        if init_value is not None:
            init_value = convert_data(init_value)
            # A sync add is used so that the initial value has taken effect
            # when the call returns. Every worker, master or not, must call
            # add so that the synchronous add can complete.
            self.add(init_value
                     if api.is_master_worker() else np.zeros(init_value.shape),
                     sync=True)
Example #16
    name_model = "dt_gridsearchcv.joblib"
    model_path = os.path.join(save_path, name_model)
    joblib.dump(tree_clf, model_path)
    tree_score = cross_val_score(tree_clf, X_train, y_train, cv=5)
    print('DecisionTree Classifier Cross Validation Score',
          round(tree_score.mean() * 100, 2).astype(str) + '%')

    print("-" * 80)


if __name__ == "__main__":
    path = "/home/hoangnv68/BankFraudDetection/creditcard.csv"
    df = read_data(path)
    sub_df = Downsample_data(df)
    sub_df = Remove_ouliers(sub_df, ["V14", "V12", "V10"], 25, 75, 1.5)

    X = sub_df.drop("Class", axis=1)
    y = sub_df["Class"]

    X_train, X_test, y_train, y_test = convert_data(X, y)
    classifiers = {
        "LogisiticRegression": LogisticRegression(max_iter=1000),
        "KNearest": KNeighborsClassifier(),
        "Support Vector Classifier": SVC(),
        "DecisionTreeClassifier": DecisionTreeClassifier()
    }

    save_path = "/home/hoangnv68/BankFraudDetection/supervise_model/pretrained_model"
    train(X_train, X_test, y_train, y_test, save_path)
    gridsearchCV(X_train, X_test, y_train, y_test, save_path)
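Here `convert_data` evidently plays the role of a train/test split (its
signature is inferred from the call above). A minimal stand-in sketch:

    from sklearn.model_selection import train_test_split

    def convert_data(X, y):
        # hypothetical: split features/labels 80/20, fixed seed for repeatability
        return train_test_split(X, y, test_size=0.2, random_state=42)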
Example #17
def main(argv):

    print '\nSYSTEM START\n'
    print 'Emb Dim: %d\tHidden Dim: %d\tOptimization: %s\tLayer: %d\tEpoch: %d' %\
          (argv.emb, argv.hidden, argv.opt, argv.layer, argv.epoch)
    print 'Parameters to be saved: %s' % argv.save

    """data preprocessing"""
    print 'DATA Preprocessing...'
    corpus, vocab_word = utils.load_conll(argv.data)
    id_corpus = utils.convert_words_into_ids(corpus, vocab_word)
    train_samples = utils.convert_data(id_corpus)
    n_samples = len(id_corpus)
    print 'Samples: %d\tVocab: %d' % (n_samples, vocab_word.size())

    """symbol definition"""
    index = T.iscalar()
    w = T.ivector()
    d = T.ivector()
    n_hidden = argv.hidden
    n_words = argv.n_words
    batch_size = argv.batch

    """model setup"""
    print 'Compiling Theano Code...'
    model = lstm.LSTM(w=w, d=d, n_layers=argv.layer, vocab_size=vocab_word.size(), n_in=n_hidden, n_h=n_hidden,
                      n_words=n_words, batch_size=batch_size
                      )
    cost = model.nll
    opt = optimizers.main(name=argv.opt, cost=cost, params=model.params, emb=model.emb, x=model.x, w=model.w)

    """ train """
    def _train():
        train_model = theano.function(
            inputs=[index],
            outputs=[model.nll, model.errors],
            updates=opt,
            givens={
                w: train_samples[index * n_words * batch_size: (index+1) * n_words * batch_size],
                d: train_samples[index * n_words * batch_size + 1: (index+1) * n_words * batch_size + 1]
            },
            mode='FAST_RUN'
        )

        n_batch_samples = n_samples / n_words / batch_size
        print 'Vocabulary Size: %d\tBatch Sample Size: %d' % (vocab_word.size(), n_batch_samples)
        print '\nTrain START'

        for epoch in xrange(argv.epoch):
            print '\nEpoch: %d' % (epoch + 1)
            print '\tIndex: ',
            start = time.time()

            losses = []
            errors = []
            for b_index in xrange(n_batch_samples):
                if b_index % 100 == 0 and b_index != 0:
                    print b_index,
                    sys.stdout.flush()
                loss, error = train_model(b_index)
                losses.append(loss)
                errors.append(error)
            avg_loss = np.mean(losses)
            end = time.time()
            print '\tTime: %f seconds' % (end - start)
            print '\tAverage Negative Log Likelihood: %f' % avg_loss

            total = 0.0
            correct = 0
            for sent in errors:
                total += len(sent)
                for y_pred in sent:
                    if y_pred == 0:
                        correct += 1
            print '\tTrain Accuracy: %f' % (correct / total)
            if argv.save:
                model.save()

    _train()
Example #18
def s2s_data_generator(s2s_df=duplets,
                       all_catalog=catalog_images,
                       batch_size=None):
    """
    A data generator for generating triplets, i.e, (input_image, positive_image, negative_image) on the fly before training.
    Select a random input image from the training set and select an positive_image with 
    same product id and negative image with different product id and generate batch of triplets
    
    The function keeps yielding a batch of triplets until the whole training process is complete.
    """
    orig_index_list = s2s_df.index.tolist()
    all_shop_index_list = all_catalog.index.tolist()
    dummy = np.zeros((1, 3 * N))

    while True:

        q_list = list()
        p_list = list()
        n_list = list()
        dummy_list = list()

        index_list = copy.copy(orig_index_list)

        while len(index_list) > 0:

            index = random.choice(index_list)
            product_id = s2s_df.loc[index, 'product_id']

            q_temp = s2s_df.loc[index, 'street_images']
            q_img = os.path.join(Path, q_temp + '.jpeg')

            p_temp = s2s_df.loc[index, 'shop_images']
            p_img = os.path.join(Path, p_temp + '.jpeg')

            # keep sampling until a shop image with a different product id
            # is found to serve as the negative
            while True:
                idx = random.choice(all_shop_index_list)
                prod_idx = all_catalog.loc[idx, 'product_id']

                if prod_idx != product_id:
                    temp = random.choice(all_catalog.loc[idx,
                                                         'shop_images'])
                    n_img = os.path.join(Path, temp + '.jpeg')
                    break

            # bounding box for the street (query) image
            res = bbox_mappings[q_temp]

            left = res['left']
            top = res['top']
            right = left + res['width']
            bottom = top + res['height']

            query_img = Image.open(q_img)
            query_crop = query_img.crop((left, top, right, bottom))
            positive_img = Image.open(p_img)
            negative_img = Image.open(n_img)

            query = np.array(query_crop.resize((300, 300), Image.NEAREST))
            positive = np.array(positive_img.resize((300, 300), Image.NEAREST))
            negative = np.array(negative_img.resize((300, 300), Image.NEAREST))

            q_list.append(query)
            p_list.append(positive)
            n_list.append(negative)
            dummy_list.append(dummy)

            index_list.remove(index)

            if len(q_list) == batch_size or (len(index_list) == 0
                                             and len(q_list) > 0):
                yield convert_data(q_list, p_list, n_list, dummy_list)
                q_list = list()
                p_list = list()
                n_list = list()
                dummy_list = list()
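A minimal way to drive the generator above during training (the model and the
global duplets frame are assumed to exist, and convert_data is assumed to
return an (inputs, targets) pair consumable by Keras; fit parameters are
illustrative):

    gen = s2s_data_generator(batch_size=32)
    model.fit(gen, steps_per_epoch=len(duplets) // 32, epochs=10)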
Example #19
def main(argv):
    print '\nSYSTEM START'
    print '\nMODE: Training'
    print '\nRECURRENT HIDDEN UNIT: %s\n' % argv.unit

    print '\tTRAINING\t\tBatch: %d  Epoch: %d  Parameters Save: %s' % (
        argv.batch, argv.epoch, argv.save)
    print '\tINITIAL EMBEDDING\t %s' % argv.init_emb
    print '\tNETWORK STRUCTURE\tEmb Dim: %d  Hidden Dim: %d  Layers: %d' % (
        argv.emb, argv.hidden, argv.layer)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f %f  L2 Reg: %f' % (
        argv.opt, argv.lr1, argv.lr2, argv.reg)
    """ load corpus"""
    print '\n\tCorpus Preprocessing...'

    train_corpus = load_conll(argv.train_data, exclude=True)
    print '\tTrain Sentences: %d' % len(train_corpus)

    if argv.dev_data:
        dev_corpus = load_conll(argv.dev_data)
        print '\tDev   Sentences: %d' % len(dev_corpus)

    if argv.test_data:
        test_corpus = load_conll(argv.test_data)
        print '\tTest  Sentences: %d' % len(test_corpus)
    """ load initial embedding file """
    print '\n\tInitial Embedding Loading...'
    init_emb, vocab_word = load_init_emb(init_emb=argv.init_emb)
    print '\tVocabulary Size: %d' % vocab_word.size()
    """ convert words into ids """
    print '\n\tConverting Words into IDs...'

    tr_id_sents, tr_id_ctx, tr_marks, tr_prds, train_y, arg_dict = get_id_samples(
        train_corpus, vocab_word=vocab_word, sort=True)

    if argv.dev_data:
        dev_id_sents, dev_id_ctx, dev_marks, dev_prds, dev_y, dev_arg_dict =\
            get_id_samples(dev_corpus, vocab_word=vocab_word, a_dict=arg_dict)
    if argv.test_data:
        te_id_sents, te_id_ctx, te_marks, te_prds, test_y, test_arg_dict =\
            get_id_samples(test_corpus, vocab_word=vocab_word, a_dict=arg_dict)

    print '\tLabel size: %d' % arg_dict.size()
    dump_data(data=arg_dict,
              fn=argv.train_dir + 'arg_dict-%d' % (arg_dict.size()))
    """ convert formats for theano """
    print '\n\tCreating Training/Dev/Test Samples...'

    train_sample_x, train_sample_y = convert_data(tr_id_sents, tr_prds,
                                                  tr_id_ctx, tr_marks, train_y,
                                                  init_emb)
    print '\tTrain Samples: %d' % len(train_sample_x)

    if argv.dev_data:
        dev_sample_x, dev_sample_y = convert_data_test(dev_id_sents, dev_prds,
                                                       dev_id_ctx, dev_marks,
                                                       dev_y, init_emb)
        print '\tDev Samples: %d' % len(dev_sample_x)

    if argv.test_data:
        test_sample_x, test_sample_y = convert_data_test(
            te_id_sents, te_prds, te_id_ctx, te_marks, test_y, init_emb)
        print '\tTest Samples: %d' % len(test_sample_x)
    """symbol definition"""
    x = T.ftensor3()
    d = T.imatrix()

    n_in = init_emb.shape[1]
    n_h = argv.hidden
    n_y = arg_dict.size()
    reg = argv.reg
    batch = argv.batch
    """ Model Setup """
    print '\nTheano Code Compiling...'

    tagger = RNN(unit=argv.unit,
                 x=x,
                 d=d,
                 n_layers=argv.layer,
                 n_in=n_in,
                 n_h=n_h,
                 n_y=n_y,
                 reg=reg)

    train_model = theano.function(inputs=[x, d],
                                  outputs=[tagger.nll, tagger.errors],
                                  updates=tagger.updates,
                                  mode='FAST_RUN')

    test_model = theano.function(inputs=[x, d],
                                 outputs=[tagger.y_pred, tagger.errors],
                                 mode='FAST_RUN')
    """ Training """
    print '\nTRAIN START'

    best_dev_f = 0.0
    best_test_f = 0.0
    best_epoch = -1
    flag = False

    for epoch in xrange(argv.epoch):
        _train_sample_x, _train_sample_y = shuffle(train_sample_x,
                                                   train_sample_y)

        print '\nEpoch: %d' % (epoch + 1)
        print '\tIndex: ',
        start = time.time()

        losses = []
        errors = []

        sample_index = 0
        for index in xrange(len(train_sample_x)):
            batch_x = _train_sample_x[index]
            batch_y = _train_sample_y[index]

            for b_index in xrange(len(batch_x) / batch + 1):
                sample_index += 1
                if sample_index % 100 == 0:
                    print '%d' % sample_index,
                    sys.stdout.flush()

                sample_x = batch_x[b_index * batch:(b_index + 1) * batch]
                sample_y = batch_y[b_index * batch:(b_index + 1) * batch]

                if len(sample_x) == 0:
                    continue

                loss, error = train_model(sample_x, sample_y)

                losses.append(loss)
                errors.extend(error)

        end = time.time()
        avg_loss = np.mean(losses)
        total, correct = count_correct(errors)

        print '\tTime: %f seconds' % (end - start)
        print '\tAverage Negative Log Likelihood: %f' % avg_loss
        print '\tTrain Accuracy: %f' % (float(correct) / total)
        """ Check model performance """
        if argv.dev_data:
            dev_f, predicts = test(test_model, dev_sample_x, dev_sample_y,
                                   dev_arg_dict, 'Dev')
            if best_dev_f < dev_f:
                best_dev_f = dev_f
                best_epoch = epoch
                """ Save Parameters """
                if argv.save:
                    fn = 'Layer-%d_Dim-%d_Batch-%d_Hidden-%d_Reg-%f_Epoch-%d' % (
                        argv.layer, argv.hidden, argv.batch, argv.hidden,
                        argv.reg, epoch)
                    dump_data(data=tagger, fn=argv.train_dir + fn)
                """ Output Results """
                output_results(
                    dev_corpus, dev_prds, arg_dict, predicts, argv.train_dir +
                    'Dev-result.layer%d.batch%d.hidden%d.opt-%s.reg-%f.epoch%d.txt'
                    % (argv.layer, argv.batch, argv.hidden, argv.opt, argv.reg,
                       epoch))
                flag = True
            print '\t### Best Dev F Score: %f  Epoch: %d ###' % (
                best_dev_f, best_epoch + 1)

        if argv.test_data:
            test_f, predicts = test(test_model, test_sample_x, test_sample_y,
                                    test_arg_dict, 'Test')
            if flag:
                best_test_f = test_f
                flag = False
                output_results(
                    test_corpus, te_prds, arg_dict, predicts, argv.train_dir +
                    'Test-result.layer%d.batch%d.hidden%d.opt-%s.reg-%f.epoch%d.txt'
                    % (argv.layer, argv.batch, argv.hidden, argv.opt, argv.reg,
                       epoch))
            if argv.dev_data:
                print '\t### Best Test F Score: %f  Epoch: %d ###' % (
                    best_test_f, best_epoch + 1)
Example #20
    testing_data = readData("KBQA_data/sq_relations/test.replace_ne.withpool",
                            False)
    print "Start to read validation data"
    valid_data = readData("KBQA_data/sq_relations/valid.replace_ne.withpool",
                          False)
    print "\n"

    print "start to get word dictionary for questions and relations"
    question_words = wordStatForQuestion(training_data)
    relation_words = wordStatForRelation(relation_list_seg,
                                         relation_list_seg_all, training_data)
    print "\n"

    print "Start to convert data to vectors..."
    training_data_conv = convert_data(question_words, relation_words,
                                      relation_list_seg, relation_list_seg_all,
                                      training_data)
    print "\nThere are", len(training_data_conv), "in the training data"
    testing_data_conv = convert_data(question_words, relation_words,
                                     relation_list_seg, relation_list_seg_all,
                                     testing_data)
    print "\nThere are", len(testing_data_conv), "in the testing data"
    valid_data_conv = convert_data(question_words, relation_words,
                                   relation_list_seg, relation_list_seg_all,
                                   valid_data)
    print "\nThere are", len(valid_data_conv), "in the valid data"
    print "\n"

    print "Start to calculate the max length for sequence length..."
    max_length_dict = data_static(training_data_conv, testing_data_conv,
                                  valid_data_conv)
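Once the maximum sequence lengths are known, sequences are typically
zero-padded to a fixed length before batching. A hypothetical continuation
sketch (the padding helper is not part of the source):

    def pad_to(seq, max_len, pad_id=0):
        # truncate or right-pad a list of token ids to exactly max_len
        return (seq + [pad_id] * max_len)[:max_len]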