def translate_maxibatch(maxibatch, num_to_target, num_prev_translated,
                        mask=0):
    """Translates an individual maxibatch.

    Args:
        maxibatch: a list of sentences.
        num_to_target: dictionary mapping target vocabulary IDs to strings.
        num_prev_translated: the number of previously translated sentences.
        mask: number of trailing tokens to strip from each best hypothesis
            before writing.
    """

    # Sort the maxibatch by length and split into minibatches.
    try:
        minibatches, idxs = util.read_all_lines(config, maxibatch,
                                                minibatch_size)
    except exception.Error as x:
        logging.error(x.msg)
        sys.exit(1)

    # Translate the minibatches and store the resulting beam (i.e.
    # translations and scores) for each sentence.
    beams = []
    for x in minibatches:
        y_dummy = numpy.zeros(shape=(len(x), 1))
        x, x_mask, _, _ = util.prepare_data(x, y_dummy, config.factors,
                                            maxlen=None)
        sample = translate_batch(session, sampler, x, x_mask,
                                 max_translation_len, normalization_alpha)
        beams.extend(sample)
        num_translated = num_prev_translated + len(beams)
        logging.info('Translated {} sents'.format(num_translated))

    # Put beams into the same order as the input maxibatch.
    tmp = numpy.array(beams, dtype=object)
    ordered_beams = tmp[idxs.argsort()]

    # Write the translations to the output file.
    for i, beam in enumerate(ordered_beams):
        if nbest:
            num = num_prev_translated + i
            for sent, cost in beam:
                translation = util.seq2words(sent, num_to_target)
                line = "{} ||| {} ||| {}\n".format(num, translation,
                                                   str(cost))
                output_file.write(line)
        else:
            best_hypo, cost = beam[0]
            # Truncate at the end-of-sentence token (ID 0), if present.
            eos_idx = (list(best_hypo).index(0) if 0 in best_hypo
                       else len(best_hypo))
            best_hypo = best_hypo[:eos_idx]
            # Drop the last `mask` tokens, then re-append the EOS token.
            best_hypo = (best_hypo[:len(best_hypo) - mask]
                         if len(best_hypo) > mask else [])
            best_hypo = list(best_hypo) + [0]
            line = util.seq2words(best_hypo, num_to_target) + '\n'
            output_file.write(line)
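# Not part of the code above: a minimal sketch of the sort-by-length batching
# contract these translate_maxibatch variants assume of util.read_all_lines.
# Sentences are batched in length order, and `idxs` records each sentence's
# original position, so `idxs.argsort()` restores the input order afterwards.
# `sorted_minibatches` is a hypothetical stand-in name.
import numpy

def sorted_minibatches(sentences, minibatch_size):
    # Sort by length so each minibatch needs little padding, remembering
    # every sentence's original position.
    order = sorted(range(len(sentences)), key=lambda i: len(sentences[i]))
    batches = [[sentences[i] for i in order[j:j + minibatch_size]]
               for j in range(0, len(order), minibatch_size)]
    return batches, numpy.array(order)

# After translating in sorted order:
#   results = numpy.array(beams, dtype=object)
#   ordered = results[idxs.argsort()]   # back in the original input order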
def _send_jobs(self, input_, translation_settings):
    """Splits the input into batches and puts them on the input queue.

    Returns the number of batches sent, the source batches themselves, and
    the permutation indices needed to restore the original sentence order.
    """
    source_batches = []
    try:
        batches, idxs = util.read_all_lines(self._options[0], input_,
                                            self._batch_size)
    except exception.Error as x:
        logging.error(x.msg)
        for process in self._processes:
            process.terminate()
        sys.exit(1)
    for idx, batch in enumerate(batches):
        input_item = QueueItem(
            verbose=self._verbose,
            k=translation_settings.beam_size,
            normalization_alpha=translation_settings.normalization_alpha,
            nbest=translation_settings.n_best,
            batch=batch,
            idx=idx,
            request_id=translation_settings.request_id)
        self._input_queue.put(input_item)
        source_batches.append(batch)
    return idx + 1, source_batches, idxs
def EventCoreference_t():
    '''
    run event coreference over a single test topic
    '''
    PATH = u"/home/lzh/Documents/SinoCoreferencer/突发事件/社会安全/topic62"
    testLoc = "/home/lzh/Documents/SinoCoreferencer/test"
    for dirpath, dirpathnames, filenames in os.walk(PATH):
        directory = os.path.join(dirpath, 'directory.all')
        dir_lines = read_all_lines(directory)
        for line in dir_lines:
            # Write the absolute path of the current document to the test
            # file consumed by the coreference script.
            absolutePath = os.path.join(dirpath, line)
            print(absolutePath)
            with open(testLoc, 'w') as f_test:
                f_test.write(absolutePath + '\n')
            input_file = line + '.shtml.out'
            each_input = 'topic62_'
            input_dir = os.path.join('/home/lzh/Downloads/data/', each_input,
                                     input_file)
            FileGeneration(dirpath, input_dir, line)
            os.chdir("/home/lzh/Documents/SinoCoreferencer/")
            os.system("./run-coreference.sh test")
            Post_arg(dirpath, input_dir, line)
def InputProcess(inputFile):
    '''
    parse the '|'-separated attribute lines of a '.out' file into attr_set,
    merging lines that share the same trigger position
    '''
    input_lines = read_all_lines(inputFile)
    attr_set = []
    for line in input_lines:
        # Collect the trigger positions seen so far (field 2 of each entry).
        pos_set = [attr_set[i][2] for i in range(len(attr_set))]
        attr = line.strip().split('|')
        word = attr[0].split(',')
        attr[0] = word[0]
        if attr[2] in pos_set:
            # Same trigger position: merge into the existing entry.
            for i in range(len(attr_set)):
                if attr[2] == attr_set[i][2]:
                    attr_set[i].append(word[1])
        else:
            attr.append(word[1])
            attr_set.append(attr)
    return attr_set
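# The corpus and coreference scripts call a much simpler read_all_lines than
# the util.read_all_lines used by the translation functions above. A minimal
# sketch of what the call sites appear to assume (an inference, not the
# original helper):
def read_all_lines(path, encoding='utf-8'):
    # Return the stripped, non-empty lines of a text file.
    with open(path, encoding=encoding) as f:
        return [line.strip() for line in f if line.strip()]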
def translate_maxibatch(maxibatch, model_set, num_to_target,
                        num_prev_translated):
    """Translates an individual maxibatch.

    Args:
        maxibatch: a list of sentences.
        model_set: an InferenceModelSet object.
        num_to_target: dictionary mapping target vocabulary IDs to strings.
        num_prev_translated: the number of previously translated sentences.
    """

    # Sort the maxibatch by length and split into minibatches.
    try:
        pre_minibatches, minibatches, idxs = util.read_all_lines(
            configs[0], maxibatch, minibatch_size)
    except exception.Error as x:
        logging.error(x.msg)
        sys.exit(1)

    # Translate the minibatches and store the resulting beam (i.e.
    # translations and scores) for each sentence.
    beams = []
    for px, x in zip(pre_minibatches, minibatches):
        y_dummy = numpy.zeros(shape=(len(x), 1))
        px, x, x_mask, _, _ = util.prepare_data(x, y_dummy,
                                                configs[0].factors, px,
                                                maxlen=None)
        sample = model_set.decode(session=session,
                                  px=px,
                                  x=x,
                                  x_mask=x_mask,
                                  beam_size=beam_size,
                                  normalization_alpha=normalization_alpha)
        beams.extend(sample)
        num_translated = num_prev_translated + len(beams)
        logging.info('Translated {} sents'.format(num_translated))

    # Put beams into the same order as the input maxibatch.
    tmp = numpy.array(beams, dtype=object)
    ordered_beams = tmp[idxs.argsort()]

    # Write the translations to the output file.
    for i, beam in enumerate(ordered_beams):
        if nbest:
            num = num_prev_translated + i
            for sent, cost in beam:
                translation = util.seq2words(sent, num_to_target)
                line = "{} ||| {} ||| {}\n".format(num, translation,
                                                   str(cost))
                output_file.write(line)
        else:
            best_hypo, cost = beam[0]
            line = util.seq2words(best_hypo, num_to_target) + '\n'
            output_file.write(line)
def init_test_rel_dict():
    """
    Maps test-corpus sentence IDs to their relation types.
    """
    all_lines = read_all_lines('corpus/TEST_FILE_KEY.TXT')
    num2rel_dict = nltk.defaultdict(str)
    for line in all_lines:
        num, rel = line.split('\t')
        num2rel_dict[num] = rel
    return num2rel_dict
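# Hypothetical usage: because the mapping is an nltk.defaultdict(str),
# unknown sentence IDs yield '' instead of raising a KeyError.
num2rel_dict = init_test_rel_dict()
rel = num2rel_dict['8001']     # e.g. a label such as 'Member-Collection(e2,e1)'
missing = num2rel_dict['???']  # '' -- no KeyError for an unseen ID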
def translate_maxibatch(maxibatch, model_set, num_to_target,
                        num_prev_translated):
    """Translates an individual maxibatch.

    Args:
        maxibatch: a list of sentences.
        model_set: an InferenceModelSet object.
        num_to_target: dictionary mapping target vocabulary IDs to strings.
        num_prev_translated: the number of previously translated sentences.
    """

    # Sort the maxibatch by length and split into minibatches.
    try:
        minibatches, idxs = util.read_all_lines(configs[0], maxibatch,
                                                minibatch_size)
    except exception.Error as x:
        logging.error(x.msg)
        sys.exit(1)

    # Translate the minibatches and store the resulting beam (i.e.
    # translations and scores) for each sentence.
    beams = []
    for x in minibatches:
        y_dummy = numpy.zeros(shape=(len(x), 1))
        x, x_mask, _, _ = util.prepare_data(x, y_dummy, configs[0].factors,
                                            maxlen=None)
        sample = model_set.beam_search(
            session=session,
            x=x,
            x_mask=x_mask,
            beam_size=beam_size,
            normalization_alpha=normalization_alpha)
        beams.extend(sample)
        num_translated = num_prev_translated + len(beams)
        logging.info('Translated {} sents'.format(num_translated))

    # Put beams into the same order as the input maxibatch.
    tmp = numpy.array(beams, dtype=object)
    ordered_beams = tmp[idxs.argsort()]

    # Write the translations to the output file.
    for i, beam in enumerate(ordered_beams):
        if nbest:
            num = num_prev_translated + i
            for sent, cost in beam:
                translation = util.seq2words(sent, num_to_target)
                line = "{} ||| {} ||| {}\n".format(num, translation,
                                                   str(cost))
                output_file.write(line)
        else:
            best_hypo, cost = beam[0]
            line = util.seq2words(best_hypo, num_to_target) + '\n'
            output_file.write(line)
def Post_arg(destination, inputFile, file):
    '''
    post-process: regenerate the .arg file with normalized time expressions
    '''
    arg = os.path.join(destination, "%s.arg" % file)
    # All attributes extracted from the '.out' file.
    attr_set = InputProcess(inputFile)
    # Get the news publication time.
    destination_file = os.path.join(destination, file)
    news_time = ''
    child_body = read_all_lines(destination_file)
    for child_line in child_body:
        if child_line.strip().startswith("time:"):
            news_time = child_line[5:].strip()
    # Generate the .arg file.
    with open(arg, 'w') as f_arg:
        for attr in attr_set:
            try:
                f_arg.write("==================\n")
                # Trigger: offset span, event type, trigger word.
                length = len(attr[0])
                line = (attr[2] + ',' + str(int(attr[2]) + length - 1) +
                        ' ' + attr[1] + ' ' + attr[0] + '\n')
                f_arg.write(line)
                # Add the place feature (attr[9]: place string,
                # attr[10]: its start offset).
                length_place = len(attr[9])
                if attr[9] != 'NULL':
                    line = (attr[10] + ',' +
                            str(int(attr[10]) + length_place - 1) +
                            '\t' + 'place' + '\t' + attr[9] + '\n')
                    f_arg.write(line)
                # Add the time feature (attr[7]: time string,
                # attr[8]: its start offset), normalized against the
                # news publication time.
                length_time = len(attr[7])
                if attr[7] != 'NULL':
                    new_time = TimeRegularzation(news_time, attr[7])
                    line = (attr[8] + ',' +
                            str(int(attr[8]) + length_time - 1) +
                            '\t' + 'time' + '\t' + new_time + '\n')
                    f_arg.write(line)
            except Exception as e:
                print('Exception:', e)
                continue
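# A schematic .arg record as Post_arg writes it (every value below is
# illustrative, not taken from real data; the trigger line is
# space-separated, the place/time lines are tab-separated):
#
#   ==================
#   120,121 Attack 爆炸
#   130,131<TAB>place<TAB>北京
#   140,149<TAB>time<TAB>2016-11-28 19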
def CombineAllCoreference_t():
    '''
    process all files contained in the topic directory
    '''
    PATH = u"/home/lzh/Documents/SinoCoreferencer/突发事件/社会安全/topic62"
    for dirpath, dirpathnames, filenames in os.walk(PATH):
        directory = os.path.join(dirpath, 'directory.all')
        dir_lines = read_all_lines(directory)
        for line in dir_lines:
            print(dirpath)
            print(line)
            CombineCoreference(dirpath, line)
def CombineCoreference(Destination, file):
    '''
    generate the .coreference3 file, which contains detailed information
    about coreferent events
    '''
    coref_file = file + '.coref.events'
    coref_dir = os.path.join(Destination, coref_file)
    if os.path.exists(coref_dir):
        coref_lines = read_all_lines(coref_dir)
        arg_file = file + '.arg'
        arg_dir = os.path.join(Destination, arg_file)
        arg_lines = read_all_lines(arg_dir)
        coref_attr_file = file + '.coreference3'
        coref_attr_dir = os.path.join(Destination, coref_attr_file)
        # Create (or truncate) the output file before writing.
        open(coref_attr_dir, 'w').close()
        writeCoreference_v2(coref_lines, arg_lines, coref_attr_dir)
def build_test_corpus(cut=False):
    """
    Build the test corpus.
    """
    num2rel_dict = init_test_rel_dict()
    file = open('corpus_handle/test.txt', 'w', encoding='utf-8')
    all_lines = read_all_lines('corpus/TEST_FILE.txt')
    for line in all_lines:
        items = line.split('\t')
        # Strip the surrounding quotes and the final period from the sentence.
        num, sentence = items[0], items[1][1:-2]
        sentence = re.sub(r'\s+', ' ', sentence)
        sentence, index_1, index_2 = cut_sentence(sentence, cut=cut)  # truncate the sentence
        relation_type = num2rel_dict[num]
        file.write('%s|%d|%d|%s\n' % (relation_type, index_1, index_2,
                                      sentence))
    file.close()
def build_train_corpus(cut=False):
    """
    Build the training corpus.
    """
    file = open('corpus_handle/train.txt', 'w', encoding='utf-8')
    all_lines = read_all_lines('corpus/TRAIN_FILE.TXT')
    times = int(len(all_lines) / 3)
    for i in range(times):
        # Read three lines at a time.
        line_0, line_1, line_2 = all_lines[i * 3:(i + 1) * 3]
        # The sentence.
        sentence = line_0.split('\t')[1][1:-2]
        sentence = re.sub(r'\s+', ' ', sentence)
        sentence, index_1, index_2 = cut_sentence(sentence, cut=cut)
        # The relation type.
        relation_type = line_1.split('(')[0]
        file.write('%s|%d|%d|%s\n' % (relation_type, index_1, index_2,
                                      sentence))
    file.close()
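# A schematic SemEval-2010 Task 8 record, as the slicing above assumes
# (three non-blank lines per example; the concrete text is illustrative):
#
#   1	"The <e1>company</e1> fabricates plastic <e2>chairs</e2>."
#   Product-Producer(e2,e1)
#   Comment:
#
# line_0.split('\t')[1][1:-2] drops the numeric id, the surrounding quotes,
# and the trailing period; line_1.split('(')[0] keeps the relation name.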
def dataAnalysis():
    '''
    count how many time expressions are normalized successfully ("processed")
    and how many are not ("unprocessed")
    '''
    lines = read_all_lines("/home/lzh/Downloads/data/collection.time")
    new_lines = []
    process = 0
    unprocess = 0
    for line in lines:
        time_re = TimeRegularzation("2016-11-28 19:37:00", line)
        new_lines.append(time_re)
        # '2016-11-28 0' is the fallback produced when normalization fails.
        if time_re == '2016-11-28 0':
            unprocess += 1
        else:
            process += 1
    f_time = open("/home/lzh/Downloads/data/collection1.time", 'w')
    for l in new_lines:
        f_time.write(l + '\n')
    f_time.write("processed: %d" % process + '\n')
    f_time.write("unprocessed: %d" % unprocess + '\n')
    f_time.close()
def Multi():
    '''
    time attribute statistics
    '''
    f_time = open("/home/lzh/Downloads/data/collection.time", 'w')
    PATH = u"/home/lzh/Documents/SinoCoreferencer/突发事件/社会安全"
    for dirpath, dirpathnames, filenames in os.walk(PATH):
        for each in dirpathnames:
            childfile = os.path.join(dirpath, each)
            directory = os.path.join(childfile, 'directory.all')
            dir_lines = read_all_lines(directory)
            for line in dir_lines:
                input_file = line + '.shtml.out'
                each_input = each + '_'
                input_dir = os.path.join('/home/lzh/Downloads/data/',
                                         each_input, input_file)
                time_set = ExtractTiming(input_dir)
                for t in time_set:
                    f_time.write(t + '\n')
    f_time.close()
def FileGeneration(Destination, inputFile, file):
    '''
    create the auxiliary files the coreference pipeline expects
    '''
    arg = os.path.join(Destination, "%s.arg" % file)
    entities = os.path.join(Destination, '%s.coref.entities' % file)
    timeFile = os.path.join(Destination, '%s.time' % file)
    trigger = os.path.join(Destination, '%s.trigger' % file)
    typeFile = os.path.join(Destination, '%s.type' % file)
    valueFile = os.path.join(Destination, '%s.value' % file)
    xmlFile = os.path.join(Destination, '%s.xml' % file)

    # All attributes extracted from the '.out' file.
    attr_set = InputProcess(inputFile)

    # Get the news publication time.
    destination_file = os.path.join(Destination, file)
    news_time = ''
    child_body = read_all_lines(destination_file)
    for child_line in child_body:
        if child_line.strip().startswith("time:"):
            news_time = child_line[5:].strip()

    # Add event attributes: trigger span, place, and time.
    f_arg = open(arg, 'w')
    for attr in attr_set:
        try:
            f_arg.write("==================\n")
            length = len(attr[0])
            line = (attr[2] + ',' + str(int(attr[2]) + length - 1) +
                    ' ' + attr[1] + ' ' + attr[0] + '\n')
            f_arg.write(line)
            # Add the place feature.
            length_place = len(attr[9])
            if attr[9] != 'NULL':
                line = (attr[10] + ',' +
                        str(int(attr[10]) + length_place - 1) +
                        ' place ' + attr[9] + '\n')
                f_arg.write(line)
            # Add the time feature (without normalizing the time string).
            length_time = len(attr[7])
            if attr[7] != 'NULL':
                line = (attr[8] + ',' + str(int(attr[8]) + length_time - 1) +
                        ' time ' + attr[7] + '\n')
                f_arg.write(line)
        except Exception as e:
            print('Exception:', e)
            continue
    f_arg.close()

    # Create an empty entities file if it does not exist.
    if not os.path.exists(entities):
        open(entities, 'w').close()

    # Write the normalized time expressions.
    f_time = open(timeFile, 'w')
    for attr in attr_set:
        length = len(attr[7])
        if attr[7] != 'NULL':
            new_time = TimeRegularzation(news_time, attr[7])
            line = (attr[8] + ',' + str(int(attr[8]) + length - 1) +
                    ' time time ' + new_time + '\n')
            f_time.write(line)
    f_time.close()

    # Write the trigger spans.
    f_trigger = open(trigger, 'w')
    for attr in attr_set:
        length = len(attr[0])
        line = (attr[2] + ',' + str(int(attr[2]) + length - 1) +
                ' ' + attr[1] + ' ' + attr[0] + '\n')
        f_trigger.write(line)
    f_trigger.close()

    # Create empty type/value files if they do not exist.
    if not os.path.exists(typeFile):
        open(typeFile, 'w').close()
    if not os.path.exists(valueFile):
        open(valueFile, 'w').close()