# Esempio n. 1 (example header left over from a notebook/web export --
# kept as a comment so the file stays importable)
# 0
# In[3]:

# Sequence-length and vocabulary hyperparameters for the seq2seq model.
Encoder_max_len = 60  # max token count for encoder inputs (see max_len= at the transform_orig call)
Decoder_max_len = 30  # max token count for decoder outputs
min_count = 3  # words appearing fewer than this many times are dropped from the vocab -- TODO confirm against transform_orig

# In[4]:

# CSV locations, one pair of files per dataset split.
# Only the "length" split is used at the moment; add names here to include more.
dataset_names = ["length"]
train_path = ["data/{}/train.csv".format(name) for name in dataset_names]
test_path = ["data/{}/test.csv".format(name) for name in dataset_names]

# In[5]:

print("### Loading Train Data ###")
# data_manager is a project-local loader defined elsewhere in the file/repo;
# with train=True it presumably also parses target sentences (the
# .orig_data / .out_sen attributes are read below) -- confirm against its definition.
data_agent = data_manager(train_path, train=True)

# In[6]:

print("### Loading Test Data ###")
# train=False: load inputs only, no targets -- presumably; data_manager is
# defined outside this chunk.
test_agent = data_manager(test_path, train=False)

# ## Preprocessing and Padding

# In[7]:

# Tokenize, index-encode, and pad the raw sentence pairs (project-local
# transform_orig). Returns, in order: index-encoded input/output sentences,
# their padding masks, the input lengths, the vocab lookup tables
# (idx2word / word2idx), and the row indices that survived filtering
# (min_count / max_len pruning -- TODO confirm against transform_orig).
idx_in_sen, idx_out_sen, mask_in, mask_out, length_in, idx2word, word2idx, remain_idx = transform_orig(
    [data_agent.orig_data, data_agent.out_sen],
    min_count=min_count,
    max_len=[Encoder_max_len, Decoder_max_len],  # separate limits for encoder and decoder sides
    path="Attn_ver1/tmp/tokenizer.pkl")  # tokenizer tables are persisted here for later inference
# ## Loading data

# In[3]:

# Command-line interface:
#   argv[1]  path to the input data file (required)
#   argv[2]  path the output is written to (required)
#   argv[3]  model checkpoint index to load (optional; defaults to None)
input_file_name = sys.argv[1]
output_file_name = sys.argv[2]
try:
    model_index = sys.argv[3]
except IndexError:  # third arg is optional; bare `except:` would also mask real errors (KeyboardInterrupt, typos)
    model_index = None



print("### Loading Test Data ###")
# Inference path: read test data from the file named on the command line.
# data_manager is project-local; train=False presumably skips target parsing.
test_agent = data_manager(input_file_name , train=False)


# ## Preprocessing and Padding

# The vocab tables were written by the training run as
#   pickle.dump({"orig_word": [idx2word, word2idx]}, open(.../tokenizer.pkl, "wb"))
# -- here we read them back for inference.
# SECURITY: pickle.load executes arbitrary code from an untrusted file; this
# one is assumed to be produced by this project's own training script.
# NOTE(review): tmp_path is (re)assigned further down in this file as shown --
# verify it is actually defined before this line runs in the real script order.
with open(os.path.join(tmp_path, "tokenizer.pkl"), "rb") as f:  # `with` closes the handle (original leaked it)
    idx2word, word2idx = pickle.load(f)["orig_word"]

##################################################################################################
#######################################  Building Model  #########################################
##################################################################################################

def Encoder(inputs , dim , name , init_state=None , t_len=20 , reuse=False , stack_flag=False):
    """Build an LSTM encoder cell (TensorFlow 1.x contrib API).

    NOTE(review): as written this only constructs the cell and returns None;
    `inputs`, `init_state`, `t_len`, and `stack_flag` are never used. The body
    looks truncated (the dynamic_rnn/unrolling part appears to be missing) --
    confirm against the original notebook before relying on it.
    """
    cell = tf.contrib.rnn.LSTMCell(dim,name=name,reuse=reuse)
# Experiment directory layout: ALL_ver0/{model_para,tmp,log}.
# exist_or_mkdir (project-local) creates the directory if needed and returns
# its joined path, which we keep for later use.
exp_folder = exist_or_mkdir("./", "ALL_ver0")
model_path = exist_or_mkdir(exp_folder, "model_para")
tmp_path = exist_or_mkdir(exp_folder, "tmp")
log_path = exist_or_mkdir(exp_folder, "log")

# Sequence / vocabulary hyperparameters for this experiment.
max_len = 20
min_count = 3

print("\n### Loading Train Data ###")
# Load the combined "all" split via the project-local data_manager; train=True
# presumably also parses targets and grammar annotations (the .in_sen /
# .out_sen / .gramma attributes are read below) -- confirm against its definition.
data_agent = data_manager("data/all/train.csv", train=True)

print("\n### Loading Test Data ###")
test_agent = data_manager("data/all/test.csv", train=False)

## Preprocessing and Padding
print("\n### Preprocessing and Padding ###")
start_t = time.time()  # wall-clock start; elapsed time is presumably reported later (outside this chunk)
# transform_word (project-local): builds the vocab (dropping words with
# count < min_count), converts both sentence lists to padded index sequences,
# and returns the masks plus the surviving row indices
# -- TODO confirm the exact return contract against transform_word.
idx_in_sen, idx_out_sen, mask_in, mask_out, idx2word, word2idx, remain_idx = transform_word(
    [data_agent.in_sen, data_agent.out_sen],
    min_count=min_count,
    max_len=max_len)

# Encode the grammar annotations for the same surviving rows (remain_idx keeps
# the two transforms aligned).
idx_gramma, idx2gramma, gramma2idx = transform_gramma(data_agent.gramma,
                                                      remain_idx,
                                                      max_len=max_len)