Example 1
def test(path, filter=True):
    # read dataset
    ds = dataset(path)
    entity_size = ds.entity_nums + 1  # add 1 to avoid out-of-dictionary indices
    relation_size = ds.relation_nums[0] + 1
    model_path = path + '/model/'

    e_emb, r_emb = np.load(model_path + '_TransE_ent.npy'), \
        np.load(model_path + '_TransE_rel.npy')

    # filter build
    def get_head_tail(pairs, filter_list=None):
        # avoid a shared mutable default argument
        if filter_list is None:
            filter_list = []
        for p in pairs:
            filter_list.append(p[0])
            filter_list.append(p[1])
        return list(set(filter_list))

    filter_l = get_head_tail(ds.train_pair)
    filter_l = get_head_tail(ds.test_pair, filter_l)
    filter_l = get_head_tail(ds.val_pair, filter_l)

    print("filter build done.")

    # eval
    eval_h, eval_t, index_h, index_t = [], [], [], []

    for test_p in ds.test_pair[:100]:
        h, t, r = e_emb[test_p[0]], e_emb[test_p[1]], r_emb[test_p[2]]
        index_h.append(test_p[0])
        index_t.append(test_p[1])

        if filter:
            head_predict_list = [l1_distance(e_emb[i], t, r) for i in filter_l]
            tail_predict_list = [l1_distance(h, e_emb[i], r) for i in filter_l]
            # map argsort positions back to entity ids so ranks are comparable downstream
            head_sorted_rank = np.array(filter_l)[np.argsort(head_predict_list)]
            tail_sorted_rank = np.array(filter_l)[np.argsort(tail_predict_list)]
        else:
            head_predict_list = [l1_distance(e_emb[i], t, r) for i in range(entity_size)]
            tail_predict_list = [l1_distance(h, e_emb[i], r) for i in range(entity_size)]
            head_sorted_rank = np.argsort(head_predict_list)
            tail_sorted_rank = np.argsort(tail_predict_list)

        eval_h.append(head_sorted_rank)
        eval_t.append(tail_sorted_rank)

    h_result = eval_ranking(rank_l=eval_h, index_l=index_h), eval_top_k(
            rank_l=eval_h, index_l=index_h)
    t_result = eval_ranking(rank_l=eval_t, index_l=index_t), eval_top_k(
            rank_l=eval_t, index_l=index_t)

    print("result of h predict is {0} (rank,top_10), t predict is {1}.".format(
            h_result, t_result))
    return h_result, t_result
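
This snippet relies on l1_distance, eval_ranking, and eval_top_k helpers defined elsewhere in the project. A minimal sketch of what they might look like, assuming the standard TransE L1 score (h + r ≈ t) and that each rank array lists candidate entity ids sorted from best to worst:

import numpy as np

# Hypothetical helpers; the project's own versions may differ.
def l1_distance(h, t, r):
    # TransE L1 score: lower means the triple (h, r, t) is more plausible.
    return np.sum(np.abs(h + r - t))

def eval_ranking(rank_l, index_l):
    # Mean rank of the true entity within each sorted candidate list.
    return np.mean([np.where(rank == idx)[0][0] + 1
                    for rank, idx in zip(rank_l, index_l)])

def eval_top_k(rank_l, index_l, k=10):
    # Hits@k: fraction of cases where the true entity appears in the top k.
    return np.mean([idx in rank[:k] for rank, idx in zip(rank_l, index_l)])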
Example 2
'''
Create dummy variables for categorical features
'''
from util import dataset
import pandas as pd

print('Loading data......')
train = dataset.load('categorical', 'train')
test = dataset.load('categorical', 'test')
cat_col = dataset.load('categorical', 'feature')

for col in cat_col:
    dummies = pd.get_dummies(train[col], prefix=col)
    train = pd.concat([train, dummies], axis=1)
    train.drop([col], axis=1, inplace=True)

    dummies = pd.get_dummies(test[col], prefix=col)
    test = pd.concat([test, dummies], axis=1)
    test.drop([col], axis=1, inplace=True)

print('Saving data......')
dataset(categorical_dummy=train).save('train')
dataset(categorical_dummy=test).save('test')

print('Done!')
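
Encoding train and test separately like this can leave the two frames with mismatched dummy columns whenever a category appears in only one split. A common follow-up step (an assumption, not part of the original) is to realign them afterwards:

# Align test to the train columns, filling dummies missing from test with 0.
train, test = train.align(test, join='left', axis=1, fill_value=0)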
Example 3
'''
Min-max scale continuous variables
'''
from util import dataset
from sklearn.preprocessing import MinMaxScaler

print('Loading data......')
train = dataset.load('numeric', 'train').astype(float)
test = dataset.load('numeric', 'test').astype(float)
num_col = dataset.load('numeric', 'feature')

scaler = MinMaxScaler()
for col in num_col:
    scaler.fit(train[col].values.reshape(-1, 1))
    train[col] = scaler.transform(train[col].values.reshape(-1, 1))
    test[col] = scaler.transform(test[col].values.reshape(-1, 1))

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_maxmin=train).save('train')
dataset(numeric_maxmin=test).save('test')

print('Done!')
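
Since MinMaxScaler scales each feature independently, the per-column loop can be collapsed into one call; an equivalent sketch, assuming num_col lists only numeric columns present in both frames:

# Fit on the training columns jointly, then apply the same scaling to test.
scaler = MinMaxScaler().fit(train[num_col])
train[num_col] = scaler.transform(train[num_col])
test[num_col] = scaler.transform(test[num_col])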
Example 4
#     if y == 0:
#         return 0
#     else:
#         return x / y

# df['AverageWorkingYears'] = df[['WorkingYearsBefore',
#                                 'NumCompaniesWorked']].apply(
#                                     average_years, axis=1)

X_train = df[df['source'] == 'train'].copy()
X_test = df[df['source'] == 'test'].copy()

X_train.drop(['source'], axis=1, inplace=True)
X_test.drop(['source'], axis=1, inplace=True)

dataset(new_feature=X_train).save('train')
dataset(new_feature=X_test).save('test')

# dataset(new_numeric=X_train).save('train')
# dataset(new_numeric=X_test).save('test')

# process = NumProcess(X_train, X_test)
# train, test = process.boxcox()
# dataset(new_numeric_boxcox=train).save('train')
# dataset(new_numeric_boxcox=test).save('test')

# train, test = process.log1p()
# dataset(new_numeric_log1p=train).save('train')
# dataset(new_numeric_log1p=test).save('test')

# train, test = process.maxmin()
Example 5
print('=' * 20)
print(test.isnull().sum())
print('=' * 20)

# Convert categorical variables to numeric labels
label_enc = LabelEncoder()
for x in [col for col in cat_col if train.dtypes[col] == 'object']:
    label_enc.fit(train[x])
    train[x] = label_enc.transform(train[x])
    test[x] = label_enc.transform(test[x])

num_col = [x for x in test.columns if x not in cat_col]

# Save the data as pickle files
print('Saving feature name')
dataset(numeric=num_col).save('feature')
dataset(categorical=cat_col).save('feature')

print('Saving train set')
dataset(train=train).save('all')

print('Saving test set')
dataset(test=test).save('all')

print('Saving categorical data')
dataset(categorical=train[cat_col]).save('train')
dataset(categorical=test[cat_col]).save('test')
np.save('cat.npy', train[cat_col])

print('Saving numeric data')
dataset(numeric=train[num_col]).save('train')
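
Note that the LabelEncoder loop above is fitted on train only, so it raises a ValueError if test contains a label never seen in train. One common workaround (an assumption, not part of the original; it presumes pandas is available as pd) is to fit each encoder on the union of both splits:

# Fit each encoder on the combined train and test values before transforming.
for x in [col for col in cat_col if train.dtypes[col] == 'object']:
    label_enc.fit(pd.concat([train[x], test[x]], axis=0))
    train[x] = label_enc.transform(train[x])
    test[x] = label_enc.transform(test[x])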
Example 6
'''
Standardize continuous variables (zero mean, unit variance)
'''
from util import dataset
from sklearn.preprocessing import StandardScaler

print('Loading data......')
train = dataset.load('numeric', 'train').astype(float)
test = dataset.load('numeric', 'test').astype(float)
num_col = dataset.load('numeric', 'feature')

scaler = StandardScaler()
for col in num_col:
    scaler.fit(train[col].values.reshape(-1, 1))
    train[col] = scaler.transform(train[col].values.reshape(-1, 1))
    test[col] = scaler.transform(test[col].values.reshape(-1, 1))

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_stdscale=train).save('train')
dataset(numeric_stdscale=test).save('test')

print('Done!')
Example 7
    'DistanceFromHome', 'Education', 'PerformanceRating',
    'RelationshipSatisfaction', 'TrainingTimesLastYear'
], axis=1, inplace=True)
test.drop([
    'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsWithCurrManager', 'JobRole', 'StockOptionLevel', 'Gender',
    'DistanceFromHome', 'Education', 'PerformanceRating',
    'RelationshipSatisfaction', 'TrainingTimesLastYear'
], axis=1, inplace=True)

print(train.head())

new_cat = [x for x in train.columns if x in cat_col]
new_num = [x for x in train.columns if x in num_col]
new_ord = [x for x in train.columns if x in ord_col]

print('Saving data...')
dataset(numeric=new_num, categorical=new_cat, order=new_ord).save('feature')

dataset(train=train, test=test).save('all')

dataset(
    categorical=train[new_cat],
    numeric=train[new_num],
    order=train[new_ord]).save('train')
dataset(
    categorical=test[new_cat], numeric=test[new_num],
    order=test[new_ord]).save('test')

print('Done!')
Example 8
def trans_e_model(path):

    # read dataset
    ds = dataset(path)
    entity_size = ds.entity_nums + 1  # add 1 to avoid out-of-dictionary indices
    relation_size = ds.relation_nums[0] + 1
    model_path = path + 'model/'

    # L1 energy of an (h, t, r) batch; TransE assumes h + r ≈ t
    def l1_energy(batch):
        # batch[:, 0] = h, batch[:, 1] = t, batch[:, 2] = r
        return tf.reduce_sum(
            tf.abs(batch[:, 1, :] - batch[:, 0, :] - batch[:, 2, :]), 1)

    with tf.device('/cpu:0'):
        e_embedding_table = tf.Variable(tf.truncated_normal(
            [entity_size, embedding_size],
            stddev=1.0 / math.sqrt(embedding_size)),
                                        name='e_embed')
        r_embedding_table = tf.Variable(tf.truncated_normal(
            [relation_size, embedding_size],
            stddev=1.0 / math.sqrt(embedding_size)),
                                        name='r_embed')

    postive_sample = tf.placeholder(tf.int32,
                                    shape=[batch_size, 3],
                                    name='p_sample')
    negtive_sample = tf.placeholder(tf.int32,
                                    shape=[batch_size, 3],
                                    name='n_sample')

    pos_embed_e = tf.nn.embedding_lookup(e_embedding_table,
                                         postive_sample[:, :2])
    pos_embed_r = tf.nn.embedding_lookup(r_embedding_table,
                                         postive_sample[:, -1:])
    pos_embed = tf.concat([pos_embed_e, pos_embed_r], axis=1)
    neg_embed_e = tf.nn.embedding_lookup(e_embedding_table,
                                         negtive_sample[:, :2])
    neg_embed_r = tf.nn.embedding_lookup(r_embedding_table,
                                         negtive_sample[:, -1:])
    neg_embed = tf.concat([neg_embed_e, neg_embed_r], axis=1)

    p_loss, n_loss = l1_energy(pos_embed), l1_energy(neg_embed)

    loss = tf.reduce_sum(tf.nn.relu(margin + p_loss - n_loss))  #loss of TransE
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)  #opt

    #session
    with tf.Session(config=tf_config) as sess:
        sess.run(tf.global_variables_initializer())  # initialize_all_variables is deprecated
        saver = tf.train.Saver(max_to_keep=None)
        #e_emb, r_emb = [],[]
        print("start training with total {0} epochs and each batch size is{1}".
              format(epochs, batch_size))
        for e in range(epochs):
            for step in range(len(ds.train_pair) // batch_size):
                p, n = ds.get_next_batch(batch_size=batch_size,
                                         corpus=ds.train_pair)
                feed_dict = {postive_sample: p, negtive_sample: n}
                loss_val, _, e_emb, r_emb = sess.run(
                    [loss, optimizer, e_embedding_table, r_embedding_table],
                    feed_dict=feed_dict)
            print(" loss_val {1} at epoch {2}".format(step, loss_val, e))
        saver.save(sess, save_path=model_path + '_TransE.model')
        np.save(model_path + "_TransE_ent.npy", e_emb)
        np.save(model_path + "_TransE_rel.npy", r_emb)
        print("Train Done!")
Example 9
def run_ctc():
    print("cur dir: ", os.path.curdir)
    ckpt = tf.train.get_checkpoint_state('./checkpoint_10/')
    checkpoint_file = ckpt.model_checkpoint_path
    config_file = str('./config_4.json')
    img_dir = str('./predictImg/')
    print("len arg: ", len(sys.argv))
    if len(sys.argv) == 1:
        print("Execution without arguments, default arguments")
        print("checkpoints_file=", checkpoint_file)
        print("config_file=", config_file)
        print("img_dir=", img_dir)
    elif len(sys.argv) == 2:
        print("Execution without some arguments, default arguments")
        print("checkpoints_file=", checkpoint_file)
        print("config_file=", config_file)
        img_dir = str(sys.argv[1])

    elif len(sys.argv) == 3:
        print("Execution without some arguments, default arguments")
        print("config_file=", config_file)
        print("img_dir=", img_dir)
        img_dir = str(sys.argv[1])
        checkpoint_file = str(sys.argv[2])

    elif len(sys.argv) == 4:
        img_dir = str(sys.argv[1])
        checkpoint_file = str(sys.argv[2])
        config_file = str(sys.argv[3])

    else:
        print()
        print("ERROR")
        print("Wrong number of arguments. Execute:")
        print(
            ">> python3 predict.py [checkpoint_file] [config_file] [img_dir]")
        print(
            "e.g. python predict.py ./checkpoints/model.ckpt_1000 config.json ./img_to_predict/"
        )
        exit(1)

    try:
        config = json.load(open(config_file))
    except FileNotFoundError:
        print()
        print("ERROR")
        print("No such config file : " + config_file)
        exit(1)

    BATCH_SIZE = 4
    std_height = 300
    std_width = 1024
    ctc_input_len = int(config['ctc_input_len'])
    word_len = int(config['word_len'])

    net = modelctc1.model(config)
    # unpack the graph and the tensors/ops returned by the model builder
    (graph, X, Y, keep_prob, seq_len,
     optimizer, cost, ler, decoded, wer) = net[:10]

    #result_test = pd.DataFrame()
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allocator_type = 'BFC'

    with tf.Session(graph=graph, config=sess_config) as session:
        #with tf.Session(graph=graph) as session:
        saver = tf.train.Saver()
        saver.restore(session, checkpoint_file)
        print("Loaded Model")

        predict_set = util.dataset(img_dir, BATCH_SIZE, ctc_input_len,
                                   word_len, 0, 0)
        cont = 1
        outputs = []  # collect predicted words across all batches
        while cont > 0:
            pre_inputs, pre_seq_len, img_list = predict_set.extract_predict_data_batch(
                std_height, std_width)
            #print("img list: ", img_list)
            if len(pre_inputs) > 0:
                predict_feed = {
                    X: pre_inputs,
                    keep_prob: 1,
                    seq_len: pre_seq_len
                }
                result = session.run(decoded[0], predict_feed)
                #print("result: ", result.values)
                #print("result.indices: ", result.indices)
                output = convert_word(result.indices, result.values,
                                      result.dense_shape)
                outputs.extend(output)
                for img_file, word in zip(img_list, output):
                    print("image: " + img_file + " predict: " + str(word))
            else:
                cont = 0
        print("outputs: ", outputs)

        return outputs
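
convert_word decodes the sparse tensor returned by the CTC decoder; its implementation and character set are not shown. A minimal sketch under those assumptions (the CHARSET mapping is purely hypothetical):

CHARSET = 'abcdefghijklmnopqrstuvwxyz'  # placeholder label-to-character map

def convert_word(indices, values, dense_shape):
    # indices holds (batch_row, position) pairs and values the label ids;
    # rebuild one string per batch row.
    words = ['' for _ in range(int(dense_shape[0]))]
    for (row, _), label in zip(indices, values):
        words[int(row)] += CHARSET[int(label)]
    return words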
Example 10
Box-Cox transform skewed continuous variables
'''
from util import dataset
from scipy import stats

print('Loading data......')
train = dataset.load('numeric', 'train').astype(float)
test = dataset.load('numeric', 'test').astype(float)
num_col = dataset.load('numeric', 'feature')

for col in num_col:
    if stats.skew(train[col]) > 0.25:
        values, lam = stats.boxcox(train[col].values + 1)
        train[col] = values
        print(col)

    if stats.skew(test[col]) > 0.25:
        values, lam = stats.boxcox(test[col].values + 1)
        test[col] = values

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_boxcox=train).save('train')
dataset(numeric_boxcox=test).save('test')

print('Done!')
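
As written, Box-Cox fits a separate lambda for train and test, so the two splits end up on different scales, and a column may be transformed in one split but not the other. A sketch that reuses the lambda fitted on train, assuming that is the intended behaviour:

# Decide the transform on train only, then apply the same lambda to test.
for col in num_col:
    if stats.skew(train[col]) > 0.25:
        train[col], lam = stats.boxcox(train[col].values + 1)
        test[col] = stats.boxcox(test[col].values + 1, lmbda=lam)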
Example 11
temp = pd.DataFrame({
    'Attrition':
    train.groupby('JobRole')['Attrition'].mean().sort_values(),
    'ranking':
    np.arange(1, 10)
})
train['JobRole'] = train['JobRole'].map(lambda x: temp.loc[x, 'ranking'])
test['JobRole'] = test['JobRole'].map(lambda x: temp.loc[x, 'ranking'])

temp = pd.DataFrame({
    'Attrition':
    train.groupby('MaritalStatus')['Attrition'].mean().sort_values(),
    'ranking':
    np.arange(1, 4)
})
train['MaritalStatus'] = train['MaritalStatus'].map(
    lambda x: temp.loc[x, 'ranking'])
test['MaritalStatus'] = test['MaritalStatus'].map(
    lambda x: temp.loc[x, 'ranking'])

train.drop(['Attrition'], axis=1, inplace=True)

print(train.head())

print('Saving data......')
dataset(custom_label=train).save('train')
dataset(custom_label=test).save('test')

print('Done!')
Example 12
target = dataset.load('target', 'train')

df = pd.concat([train, target], axis=1)

for col in num_col:
    _, interval_list = chimerge.ChiMerge(df, col, 'Attrition')
    train[col] = train[col].map(lambda x: meger(x, interval_list))
    test[col] = test[col].map(lambda x: meger(x, interval_list))

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_disc=train).save('train')
dataset(numeric_disc=test).save('test')

train['source'] = 'train'
test['source'] = 'test'

df = pd.concat([train, test], axis=0)

for col in num_col:
    dummies = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, dummies], axis=1)
    df.drop([col], axis=1, inplace=True)

train = df[df['source'] == 'train'].copy()
test = df[df['source'] == 'test'].copy()
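
meger, used above to discretize each numeric column, is a project helper that is not shown. A minimal sketch, assuming interval_list holds the ascending lower bounds produced by ChiMerge and the function returns the index of the interval a value falls into:

def meger(x, interval_list):
    # Return the index of the last interval whose lower bound is <= x.
    idx = 0
    for i, bound in enumerate(interval_list):
        if x >= bound:
            idx = i
    return idx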
Example 13
@author: miha
"""

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set()
import kmeans
from util import dataset

points = np.vstack(((np.random.randn(150, 2) * 0.75 + np.array([1, 0])),
                    (np.random.randn(50, 2) * 0.25 + np.array([-0.5, 0.5])),
                    (np.random.randn(50, 2) * 0.5 + np.array([-0.5, -0.5]))))

dataset = dataset(points)
dataset.reduce(5)

plt.scatter(points[:, 0], points[:, 1])
#ax = plt.gca()
#ax.add_artist(plt.Circle(np.array([1, 0]), 0.75/2, fill=False, lw=3))
#ax.add_artist(plt.Circle(np.array([-0.5, 0.5]), 0.25/2, fill=False, lw=3))
#ax.add_artist(plt.Circle(np.array([-0.5, -0.5]), 0.5/2, fill=False, lw=3))

#centroids = kmeans.cluster(points, 3)
centroids, closest = kmeans.cluster(dataset.reduced_data, 3)

arg1 = np.argwhere(closest == 0).ravel()  # flatten so points[arg1] keeps shape (n, 2)
cluster1 = points[arg1]
print(cluster1)
plt.scatter(cluster1[:, 0], cluster1[:, 1], c='g')
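
kmeans here is a local module rather than scikit-learn, and its source is not shown. A minimal sketch of what such a cluster helper usually does (Lloyd's algorithm), under the assumption that it returns the final centroids and each point's cluster index, and that no cluster goes empty:

import numpy as np

def cluster(points, k, n_iter=100):
    # Pick k distinct points as the initial centroids.
    centroids = points[np.random.choice(len(points), k, replace=False)]
    for _ in range(n_iter):
        # Assign every point to its nearest centroid, then move each centroid
        # to the mean of its assigned points.
        dists = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        closest = np.argmin(dists, axis=1)
        centroids = np.array([points[closest == i].mean(axis=0) for i in range(k)])
    return centroids, closest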
Example 14
@author: miha
"""

from util import files
#from util import kmeans
import kmeans
from util import dataset
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

data = files.read_numpy("data.csv")

dataset = dataset(data)
dataset.reduce(10)

row_count = dataset.row_count()
column_count = dataset.column_count()

populated = np.zeros((row_count, column_count))  # assumption: intended as a preallocated matrix, not a 2-element array

for col in range(0, column_count):
    removed = dataset.remove_column(col)
    y = removed[0]
    X = removed[1]
    k = kmeans.get_centroid_count(y)
    centroids, closest = kmeans.cluster(dataset.reduced_data, k)

x = dataset.remove_column(2)
Example 15
'''
Log-transform continuous variables
'''
from util import dataset
import pandas as pd
import numpy as np

print('Loading data......')
train = dataset.load('numeric', 'train')
test = dataset.load('numeric', 'test')
num_col = dataset.load('numeric', 'feature')

for col in num_col:
    train[col] = np.log1p(train[col])
    test[col] = np.log1p(test[col])

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_log1p=train).save('train')
dataset(numeric_log1p=test).save('test')

print('Done!')
Example 16
        train_score = cv_result['train_score'].mean()
        test_score = cv_result['test_score'].mean()

        result.loc[len(result)] = [
            name, '{} + {}:'.format(num_feature, cat_feature), train_score,
            test_score
        ]

        print('train score:{:.4f}, test score:{:.4f}'.format(
            train_score, test_score))

        clf.fit(X, y)
        y_pred = clf.predict(X_test)

        submission = pd.DataFrame({
            'Loan_Status': y_pred,
            'Loan_ID': X_test.index.tolist()
        })
        # submission['Loan_Status'] = submission['Loan_Status'].map({
        #     1: 'Y',
        #     0: 'N'
        # })
        filename = './result/{}_{}_{}.csv'.format(num_feature, cat_feature,
                                                  name)
        submission.to_csv(filename, index=False)

dataset(cv_result=result).save('all')

print('Done!')