import time
import conv_util
import common_io
import json
import statistic_vocabulary
import numpy as np

# Input label lists (train / validation) and output locations.
text_file_in = '/home/ldap/lidongliang/lidongliang/data0825/train.final.lst'
text_file_in2 = '/home/ldap/lidongliang/lidongliang/data0825/val.final.lst'
vocab_out = '/home/ldap/lidongliang/lidongliang/data0825/vocab.pkl'
nlpcaffe_data_out_dir = '/home/ldap/lidongliang/lidongliang/code/nlpcaffe/data/imagernn/'

# Load both lists with remove_space=False.
# NOTE(review): presumably each entry is one text line of the file -- confirm
# against common_io.read_txt_lines.
label_list1 = common_io.read_txt_lines(text_file_in, remove_space=False)
label_list2 = common_io.read_txt_lines(text_file_in2, remove_space=False)

# Loaded lists keyed by split name.
data_set = {
    'image_text.train': label_list1,
    'image_text.val': label_list2,
}

# Collect the second tab-separated field of every entry (train first, then
# val); entries that contain no tab contribute an empty string instead.
vocab_list = []
for vac in label_list1 + label_list2:
    tmp = vac.split('\t')
    vocab_list.append(tmp[1] if len(tmp) > 1 else "")
def main():
    """Build a word vocabulary from the training list and dump it to ``vocab_out``.

    Reads ``text_file_in`` through ``common_io.read_txt_lines`` with
    ``remove_space=True``, hands the result together with
    ``word_count_threshold`` to ``preProBuildWordVocab``, and writes whatever
    that returns with ``common_io.dump_to_file``.

    NOTE(review): ``preProBuildWordVocab`` and ``word_count_threshold`` are not
    defined in the visible part of this file -- confirm they are provided
    elsewhere before running.
    """
    train_lines = common_io.read_txt_lines(text_file_in, remove_space=True)
    vocab_misc = preProBuildWordVocab(train_lines, word_count_threshold)
    common_io.dump_to_file(vocab_out, vocab_misc)