import time

# conv_util (the project's Chinese word-segmentation helper) is assumed to be
# imported at module level elsewhere in this file.


def preProBuildWordVocab(sentence_iterator, word_count_threshold=5):
    # count up all word counts so that we can threshold;
    # this shouldn't be too expensive of an operation
    print 'preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, )
    t0 = time.time()
    total_sent_len = len(sentence_iterator)  # assumes a list, not a lazy iterator
    total_word_cnt = 0
    word_counts = {}
    for sent in sentence_iterator:
        line_seg = conv_util.segment_sentence_cn(sent)
        for w in line_seg:
            word_counts[w] = word_counts.get(w, 0) + 1
    for w in word_counts.keys():
        total_word_cnt += word_counts[w]
    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print 'filtered words from %d to %d in %.2fs' % (len(word_counts), len(vocab), time.time() - t0)

    # with K distinct words:
    # - there are K+1 possible inputs (START token and all the words)
    # - there are K+1 possible outputs (END token and all the words)
    # ixtoword maps predicted indices back to words for output visualization;
    # wordtoix maps raw words to their index in the word vector matrix
    ixtoword = {}
    # ixtoword[0] = '.'  # period at the end of the sentence; make first dimension be the end token
    wordtoix = {}
    # wordtoix['#START#'] = 0  # make first vector be the start token
    ix = 1
    for w in vocab:
        wordtoix[w] = ix
        ixtoword[ix] = w
        ix += 1

    misc = {}
    misc['wordtoix'] = wordtoix
    misc['ixtoword'] = ixtoword
    misc['total_sent_len'] = total_sent_len
    misc['total_word_cnt'] = total_word_cnt
    misc['avg_snt_len'] = float(total_word_cnt) / total_sent_len
    print (total_sent_len, total_word_cnt, misc['avg_snt_len'])
    print 'vocab size = %d' % len(misc['ixtoword'])
    return misc
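# A minimal, hypothetical usage sketch for preProBuildWordVocab. The real
# conv_util.segment_sentence_cn performs Chinese word segmentation; the
# stand-in below just splits on whitespace so the sketch runs without the
# real module. Run standalone only: it shadows the module-level conv_util.
class _FakeConvUtil(object):
    @staticmethod
    def segment_sentence_cn(sent):
        return sent.split()  # whitespace split as a stand-in segmenter

if __name__ == '__main__':
    conv_util = _FakeConvUtil()  # shadows the project helper for this demo
    sentences = ["the cat sat on the mat"] * 6  # repeated so every word passes the threshold
    misc = preProBuildWordVocab(sentences, word_count_threshold=5)
    # indices start at 1 (index 0 is reserved for the END/START token above),
    # e.g. {'the': 1, 'cat': 2, 'sat': 3, ...}
    print(misc['wordtoix'])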
# map each labelled sentence to a space-separated string of word indices
ix_label_out_file = nlpcaffe_data_out_dir + file_name
ix_label_out = []
for labels in label_list:
    label_split = labels.split('\t')
    if len(label_split) < 2:
        # keep an empty line so the output stays aligned with the input list
        ix_label_out.append("")
        continue
    each_label = label_split[1]
    token_label = conv_util.segment_sentence_cn(each_label)
    ix_token_label = []
    for each_token in token_label:
        # words below the count threshold are absent from wordtoix and are dropped
        if each_token in wordtoix:
            ix_token_label.append(str(wordtoix[each_token]))
    ix_label_out.append(" ".join(ix_token_label))
print "set %s size is %d" % (file_name, len(ix_label_out))
common_io.write_txt_lines(ix_label_out_file, ix_label_out)
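# common_io.write_txt_lines is a project helper whose source is not shown
# here. A minimal sketch of the behavior the code above relies on (one UTF-8
# line per list entry, with empty strings written as blank lines so sentence
# alignment is preserved) might look like the hypothetical function below:
import codecs

def _write_txt_lines_sketch(out_file, lines):
    # hypothetical stand-in for common_io.write_txt_lines
    with codecs.open(out_file, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')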