import time
import conv_util 
import common_io 
import json
import statistic_vocabulary
import numpy as np

# Tab-separated list files for the train and validation splits.
text_file_in = '/home/ldap/lidongliang/lidongliang/data0825/train.final.lst'
text_file_in2 = '/home/ldap/lidongliang/lidongliang/data0825/val.final.lst'

# Path that main() dumps the vocabulary object to.
vocab_out = '/home/ldap/lidongliang/lidongliang/data0825/vocab.pkl'

# Output directory for nlpcaffe data (not referenced in the visible code).
nlpcaffe_data_out_dir = '/home/ldap/lidongliang/lidongliang/code/nlpcaffe/data/imagernn/'

# Load both splits with remove_space disabled.
label_list1 = common_io.read_txt_lines(text_file_in, remove_space=False)
label_list2 = common_io.read_txt_lines(text_file_in2, remove_space=False)

# Split name -> lines read from the corresponding list file.
data_set = {
    'image_text.train': label_list1,
    'image_text.val': label_list2,
}

# Collect the second tab-separated field of every line (train lines first,
# then val lines); a line without a tab contributes an empty string so the
# result stays aligned one-to-one with the input lines.
vocab_list = []
for vac in label_list1 + label_list2:
    tmp = vac.split('\t')
    vocab_list.append(tmp[1] if len(tmp) > 1 else "")
def main():
    """Build a vocabulary object from the training list and dump it to disk.

    Re-reads ``text_file_in`` (this time with ``remove_space=True``), passes
    the lines together with ``word_count_threshold`` to
    ``preProBuildWordVocab``, and writes the returned object to ``vocab_out``
    via ``common_io.dump_to_file``.

    NOTE(review): ``preProBuildWordVocab`` and ``word_count_threshold`` are
    not defined or imported in the visible part of this file -- confirm they
    are provided elsewhere (e.g. later in the file), otherwise this raises
    ``NameError``.
    """
    
    
    # Only the training list is used here; the validation list is not read.
    label_list = common_io.read_txt_lines( text_file_in ,remove_space = True)
    # presumably filters words by frequency using the threshold -- TODO confirm
    misc = preProBuildWordVocab(label_list, word_count_threshold)
    common_io.dump_to_file(vocab_out , misc)