class TestBPESegmentMethod(unittest.TestCase):

    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        orig, expected = test_case
        out = self.bpe.segment(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'w@@ o@@ r@@ d@@ like@@ w@@ o@@ r@@ d l@@ i@@ k@@ e@@ M@@ a@@ n@@ u@@ e@@ l@@ word'
        test_case = (orig, exp)
        self._run_test_case(test_case)
class TestBPESegmentMethod(unittest.TestCase):

    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    @mock.patch('apply_bpe.encode', side_effect=encode_mock)
    def _run_test_case(self, test_case, encode_function):
        orig, expected = test_case
        out = self.bpe.segment(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd'
        test_case = (orig, exp)
        self._run_test_case(test_case)
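# The patched test above relies on an `encode_mock` helper that is not shown in
# this excerpt. A minimal sketch of what it could look like, assuming the stub
# simply splits each non-glossary word into two-character pieces (which matches
# the expected segmentation 'wo@@ rd@@ like@@ wo@@ rd ...'); the real helper
# used by the test suite may differ.
def encode_mock(word, *args, **kwargs):
    # Return the word as a tuple of two-character chunks, mimicking the tuple
    # of subword units that apply_bpe.encode would normally produce.
    return tuple(word[i:i + 2] for i in range(0, len(word), 2))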
def main(unused_argv):
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "word-map"
    tagMapPath = "tag-map"
    pMapPath = "prefix-list"
    sMapPath = "suffix-list"
    pMap = readAffix(pMapPath)
    sMap = readAffix(sMapPath)
    wordMap = readMap(wordMapPath)
    tagMap = readMap(tagMapPath)
    wordMap, _ = bpe.segment(wordMap)
    wordMap = list(set(process_seg_sent(wordMap)))
    wordMap.insert(0, "-start-")
    wordMap.insert(0, "-end-")
    wordMap.insert(0, "-unknown-")
    pMap.insert(0, "-start-")
    pMap.insert(0, "-unknown-")
    sMap.insert(0, "-start-")
    sMap.insert(0, "-unknown-")
    # num of features for each feature group: capitalization, words, other,
    # prefix_2, suffix_2, previous_tags
    feature_sizes = [8, 8, 2, 4]
    domain_sizes = [3, len(wordMap) + 3, 3, len(tagMap) + 1]
    num_actions = 45
    embedding_dims = [8, 64, 8, 16]
    train_data_path = '/cs/natlang-user/vivian/wsj-conll/train.conllu'
    dev_data_path = '/cs/natlang-user/vivian/wsj-conll/dev.conllu'
    logging.info("loading data and precomputing features...")
    train_data = ConllData(train_data_path, wordMap, tagMap, pMap, sMap, bpe)
    dev_data = ConllData(dev_data_path, wordMap, tagMap, pMap, sMap, bpe)
    with tf.Session(FLAGS.tf_master) as sess:
        Train(sess, num_actions, feature_sizes, domain_sizes, embedding_dims,
              wordMap, tagMap, pMap, sMap, train_data, dev_data, bpe)
class E2C(object):

    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                       opt.seprator, None, None)
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)

    def tokenDoc(self, doc):
        sentenceList = sent_tokenize(doc.strip())
        print 'e2c sentenceList : ', sentenceList
        tokens = []
        for sent in sentenceList:
            sent = sent.lower()
            sent = self.detokenizer.unescape_xml(
                self.tokenizer.tokenize(sent, return_str=True))
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        #ipdb.set_trace()
        for idx in range(len(pred)):
            # Join the predicted subwords, strip the BPE separator, and remove
            # the remaining spaces (the target side is Chinese).
            rstr += ''.join(' '.join(pred[idx][0]).replace(
                self.sep, '').split()) + "\n\n"
        print 'e2c rstr : ', rstr.strip()
        return rstr.strip()
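# Hypothetical usage sketch for E2C (not part of the original source). The
# option names are the ones accessed above; onmt.Translator normally expects
# additional fields such as the model path and beam settings, so treat this
# strictly as illustrative.
import argparse
opt = argparse.Namespace(model='models/e2c.pt', bpe_codes='data/en.bpe.codes',
                         seprator='@@', cuda=False, gpu=0)
e2c = E2C(opt)
print e2c.translate("This is a test sentence. It should be translated to Chinese.")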
class VHRED(object):

    def __init__(self, config):
        self.config = config
        self.f_dict = config['vhred_dict']

        # Load the VHRED model.
        self.model, self.enc_fn, self.dec_fn = self._build_vhred_model()

        # Load in Twitter dictionaries for BPE conversion.
        f_bpe_dictionary = config['vhred_bpe_file']
        with open(f_bpe_dictionary, 'r') as handle:
            self.bpe = BPE(handle.readlines(), '@@')
        with open(self.f_dict, 'r') as handle:
            twitter_dict = cPickle.load(handle)
        self.str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in twitter_dict])
        self.idx_to_str = dict([(tok_id, tok) for tok, tok_id, _, _ in twitter_dict])

        self.MODELS = ['hred', 'human', 'tfidf', 'de']

    def _convert_text_to_bpe(self, contexts, gt_responses, model_responses,
                             ignore_models=False):
        # Files needed for BPE conversions.
        context_ids = self._strs_to_idxs(contexts)
        gt_response_ids = self._strs_to_idxs(gt_responses)
        longest = 0
        for res in gt_response_ids:
            if len(res) > longest:
                longest = len(res)
        print 'Longest Response:', longest
        if not ignore_models:
            model_response_ids = self._strs_to_idxs(model_responses)
        else:
            model_response_ids = None
        return context_ids, gt_response_ids, model_response_ids

    def _strs_to_idxs(self, data):
        out = []
        for row in data:
            bpe_segmented = self.bpe.segment(row.strip())
            out.append([
                self.str_to_idx[word] for word in bpe_segmented.split()
                if word in self.str_to_idx
            ])
        return out

    def _idxs_to_strs(self, data):
        out = []
        for row in data:
            s = ' '.join([self.idx_to_str[word] for word in row])
            out.append(s.replace('@@ ', ''))
        return out

    def _build_vhred_model(self):
        # Update the state dictionary.
        state = VHRED_prototype_state()
        model_prefix = self.config['vhred_prefix']
        state_path = model_prefix + "_state.pkl"
        model_path = model_prefix + "_model.npz"
        with open(state_path, 'rb') as handle:
            state.update(cPickle.load(handle))

        # Update the bs for the current data.
        state['bs'] = 100
        state['dictionary'] = self.f_dict

        # Create the model:
        model = VHRED_DialogEncoderDecoder(state)
        model.bs = 100

        enc_fn = model.build_encoder_function()
        dec_fn = model.build_decoder_encoding()

        return model, enc_fn, dec_fn

    def _extract_text(self, dataset, ignore_models=False):
        cs, gt_rs, m_rs = [], [], []
        for entry in dataset:
            cs.append(entry['c'])
            gt_rs.append(entry['r_gt'])
            # Extract in this order so we don't mix up which responses came
            # from which models.
            if not ignore_models:
                for m_name in self.MODELS:
                    m_rs.append(entry['r_models'][m_name][0])

        # Add </s> token to beginning of each.
        cs = ['</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip() for c in cs]
        gt_rs = ['</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip() for c in gt_rs]
        if not ignore_models:
            m_rs = ['</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip() for c in m_rs]

        return cs, gt_rs, m_rs

    # Compute model embeddings for contexts or responses.
    # Embedding type can be 'CONTEXT' or 'DECODER'.
    def _compute_embeddings(self, data):
        embeddings = []
        context_ids_batch = []
        batch_index = 0
        batch_total = int(math.ceil(float(len(data)) / float(self.model.bs)))
        counter = 0
        max_len = 0
        for context_ids in data:
            counter += 1
            context_ids_batch.append(context_ids)

            # If we have filled up a batch, or reached the end of our data:
            if len(context_ids_batch) == self.model.bs or counter == len(data):
                batch_index += 1
                length = len(context_ids_batch)
                if len(context_ids_batch) < self.model.bs:
                    # Pad the data to get a full batch.
                    while len(context_ids_batch) < self.model.bs:
                        context_ids_batch.append(context_ids_batch[0])

                print 'Computing embeddings for batch %d/%d' % (batch_index, batch_total)
                encs = VHRED_compute_encodings(context_ids_batch, self.model,
                                               self.enc_fn, self.dec_fn,
                                               self.config['embedding_type'])
                # Drop the padded entries before collecting the encodings.
                if length < self.model.bs:
                    encs = encs[:length]
                for i in range(len(encs)):
                    embeddings.append(encs[i, :].tolist())
                context_ids_batch = []

        return embeddings

    def _add_embeddings_to_dataset(self, dataset, c_embs, r_gt_embs,
                                   r_model_embs, ignore_models=False):
        for ix in xrange(len(dataset)):
            dataset[ix]['c_emb'] = c_embs[ix]
            dataset[ix]['r_gt_emb'] = r_gt_embs[ix]
            if not ignore_models:
                dataset[ix]['r_model_embs'] = {}
                for jx, m_name in enumerate(self.MODELS):
                    dataset[ix]['r_model_embs'][m_name] = r_model_embs[
                        ix * len(self.MODELS) + jx]
        return dataset

    def get_embeddings(self, dataset, new_models=None, ignore_models=False):
        '''
        Dataset should be a list of dictionaries. Each dictionary should have
        keys: c, r_gt, r_models = {'model_name': [r, score, length], ...}
        '''
        if new_models is not None:
            self.MODELS = new_models
        if 'r_models' not in dataset[0]:
            ignore_models = True

        contexts, gt_responses, model_responses = self._extract_text(
            dataset, ignore_models=ignore_models)
        context_ids, gt_response_ids, model_response_ids = self._convert_text_to_bpe(
            contexts, gt_responses, model_responses, ignore_models=ignore_models)

        print 'Computing context embeddings...'
        context_embs = self._compute_embeddings(context_ids)
        print 'Computing ground truth response embeddings...'
        gt_response_embs = self._compute_embeddings(gt_response_ids)
        if not ignore_models:
            print 'Computing model response embeddings...'
            model_response_embs = self._compute_embeddings(model_response_ids)
        else:
            model_response_embs = None

        # Update our dataset with each of the embeddings.
        dataset = self._add_embeddings_to_dataset(dataset, context_embs,
                                                  gt_response_embs,
                                                  model_response_embs,
                                                  ignore_models=ignore_models)
        return dataset

    def use_saved_embeddings(self):
        with open(self.config['vhred_embeddings_file'], 'rb') as handle:
            dataset = cPickle.load(handle)
        return dataset
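# Hypothetical usage sketch for the VHRED wrapper above (not part of the
# original source). The config keys mirror those read in __init__ and
# _build_vhred_model; the file paths and the example dataset entry are
# assumptions.
config = {
    'vhred_prefix': 'models/vhred_twitter',        # expects *_state.pkl / *_model.npz
    'vhred_dict': 'data/twitter_dict.pkl',
    'vhred_bpe_file': 'data/twitter_bpe_codes.txt',
    'vhred_embeddings_file': 'data/vhred_embeddings.pkl',
    'embedding_type': 'CONTEXT',                    # or 'DECODER'
}
vhred = VHRED(config)
# Each dataset entry needs at least 'c' and 'r_gt'; 'r_models' is optional.
dataset = [{'c': '</s> how are you ?', 'r_gt': 'i am fine , thanks .'}]
dataset = vhred.get_embeddings(dataset, ignore_models=True)
print dataset[0]['c_emb'][:5]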
def Eval(sess):
    """Builds and evaluates a network."""
    logging.set_verbosity(logging.INFO)
    bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "word-map"
    tagMapPath = "tag-map"
    pMapPath = "prefix-list"
    sMapPath = "suffix-list"
    pMap = readAffix(pMapPath)
    sMap = readAffix(sMapPath)
    wordMap = readMap(wordMapPath)
    tagMap = readMap(tagMapPath)
    wordMap, _ = bpe.segment(wordMap)
    wordMap = list(set(process_seg_sent(wordMap)))
    wordMap.insert(0, "-start-")
    wordMap.insert(0, "-end-")
    wordMap.insert(0, "-unknown-")
    pMap.insert(0, "-start-")
    pMap.insert(0, "-unknown-")
    sMap.insert(0, "-start-")
    sMap.insert(0, "-unknown-")
    # num of features for each feature group: capitalization, words, other,
    # prefix_2, suffix_2, previous_tags
    feature_sizes = [8, 8, 2, 4]
    domain_sizes = [3, len(wordMap) + 3, 3, len(tagMap) + 1]
    num_actions = 45
    embedding_dims = [8, 64, 8, 16]
    t = time.time()
    hidden_layer_sizes = map(int, FLAGS.hidden_layer_sizes.split(','))
    logging.info(
        'Building training network with parameters: feature_sizes: %s '
        'domain_sizes: %s', feature_sizes, domain_sizes)
    test_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
    logging.info("loading data and precomputing features...")
    test_data = ConllData(test_data_path, wordMap, tagMap, pMap, sMap, bpe)
    tagger = GreedyTagger(num_actions, feature_sizes, domain_sizes,
                          embedding_dims, hidden_layer_sizes,
                          gate_gradients=True)
    tagger.AddEvaluation(FLAGS.batch_size)
    tagger.AddSaver()
    sess.run(tagger.inits.values())
    tagger.saver.restore(sess, FLAGS.model_path)
    t = time.time()
    num_epochs = None
    num_tokens = 0
    num_correct = 0
    index = 0
    epochs = 0
    epochs, sent_batch = loadBatch(FLAGS.batch_size, epochs, test_data)
    while True:
        sent_batch, epochs, feature_endpoints, gold_tags, words = get_current_features(
            sent_batch, epochs, test_data, wordMap, tagMap, pMap, sMap)
        tf_eval_metrics = sess.run(
            tagger.evaluation['logits'],
            feed_dict={tagger.test_input: feature_endpoints})
        # Pick the highest-scoring of the 45 tag actions for each sentence.
        for i in range(FLAGS.batch_size):
            best_action = 0
            best_score = float("-inf")
            for j in range(45):
                if tf_eval_metrics[i][j] > best_score:
                    best_score = tf_eval_metrics[i][j]
                    best_action = j
            sent_batch[i].set_tag(tagMap[best_action])
        if num_epochs is None:
            num_epochs = epochs
        elif num_epochs < sent_batch[0].get_epoch():
            break
    test_data.reset_index()
    while test_data.has_next_sent():
        sent = test_data.get_next_sent()
        output_tags = sent.get_tag_output()
        gold_tags = sent.origin_tag_list
        word_list, output_tags = combine_seg(sent.seg_word_list, output_tags)
        for idx, tag in enumerate(gold_tags):
            num_tokens += 1
            if tag == output_tags[idx]:
                num_correct += 1
        sent.reset_state()
    eval_metric = 0 if num_tokens == 0 else (100.0 * num_correct / num_tokens)
    logging.info(
        'Number of Tokens: %d, Seconds elapsed in evaluation: %.2f, '
        'eval metric: %.2f%%', num_tokens, time.time() - t, eval_metric)
    logging.info('num correct tokens: %d', num_correct)
class C2E(object):
    """ """

    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(self.opt.bpe_codes, 'r', encoding="UTF-8"),
                       self.opt.seprator, None, None)
        self.translator = onmt.Translator(opt)
        self.nlp = BosonNLP("NGhNiav2.16134.DvyEDmGzYd2S")

    def seg(self, doc):
        res = ""
        try:
            print "using boson....."
            boson_res = self.nlp.tag(doc)
            res = boson_res[0]['word']
        except:
            # Fall back to jieba segmentation if the BosonNLP call fails.
            res = jieba.cut(doc, cut_all=False)
        return " ".join(res)

    def truecase(self, text):
        text = text.encode('utf-8')
        tagged_sent = nltk.pos_tag(
            [word.lower() for word in nltk.word_tokenize(text)])
        normalized_sent = [
            w.capitalize() if t in ['NN', 'NNS'] else w
            for (w, t) in tagged_sent
        ]
        normalized_sent[0] = normalized_sent[0].capitalize()
        pretty_string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
        return pretty_string

    def tokenDoc(self, doc):
        doc = doc.strip()
        sentenceList = re.split(PAT, doc.decode('utf-8'))
        assert len(sentenceList) >= 1
        if sentenceList[-1].strip() == "":
            sentenceList = sentenceList[:-1]
        punctuaList = re.findall(PAT, doc.decode('utf-8'))
        punctuaList += (len(sentenceList) - len(punctuaList)) * [' ']
        sents = [
            sent + punc for (sent, punc) in zip(sentenceList, punctuaList)
        ]
        sents = [sent.strip() for sent in sents]
        print 'c2e sentenceList : ', sentenceList
        tokens = []
        for sent in sents:
            sent = sent.lower()
            #sent = self.detokenizer.unescape_xml(self.tokenizer.tokenize(sent, return_str=True))
            sent = self.seg(sent)
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        print 'c2e tokens : ', tokens
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        for idx in range(len(pred)):
            pred_sent = ' '.join(pred[idx][0]).replace(' @-@ ', '-').replace(
                self.sep, '')
            #pred_sent = self.truecase(pred_sent)
            pred_sent = pred_sent.capitalize()
            rstr += pred_sent + "\n"
        print 'c2e rstr : ', rstr.strip()
        return rstr.strip()
vw_file = args.vw
file_name = os.path.basename(vw_file)
gazette_title = vw_file.replace(file_name, "")

with open(vw_file) as rectangles_to_check:
    for rectangle in rectangles_to_check.readlines():
        page = re.search(r"PAGE:\d\d?", rectangle).group(0).replace("PAGE:", "")
        x1 = re.search(r"X1:\d{1,4}", rectangle).group(0).replace("X1:", "")
        x2 = re.search(r"X2:\d{1,4}", rectangle).group(0).replace("X2:", "")
        y1 = re.search(r"Y1:\d{1,4}", rectangle).group(0).replace("Y1:", "")
        y2 = re.search(r"Y2:\d{1,4}", rectangle).group(0).replace("Y2:", "")
        xml_coord = gazette_title + "/page_" + page + ".xml_coord"
        with open(xml_coord) as xml_coord_file:
            xml_coords = xml_coord_file.readlines()

        # LM for rectangle -> corpus of necrologues
        rectangle_text = cut_xml(x1, y1, x2, y2, xml_coords)
        rectangle_text_normalized = " ".join(rectangle_text).replace("", " ")[1:-1].lower()
        necro_lm_score = necrologues_lm.score(rectangle_text_normalized)
        sys.stdout.write("LM_RECT_SCORE:" + str(necro_lm_score) + " ")

        # LM for page which contains rectangle -> corpus of pages with necrologies
        page_file = gazette_title + "/page_" + page + ".txt"
        with open(page_file) as page_txt:
            bpe_text = bpe.segment((" ".join(page_txt.readlines())).lower().strip())
        page_lm_score = pages_lm.score(bpe_text)
        sys.stdout.write("LM_PAGE_SCORE:" + str(page_lm_score))
        sys.stdout.write("\n")
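# The script above uses `bpe`, `necrologues_lm` and `pages_lm` without showing
# how they are built. A minimal sketch under the assumption that the language
# models are KenLM models and the BPE codes come from subword-nmt's apply_bpe;
# the file names are placeholders, not the original paths.
import codecs
import kenlm
from apply_bpe import BPE

bpe = BPE(codecs.open("bpe-codes.txt", encoding='utf-8'))
necrologues_lm = kenlm.Model("necrologues.arpa")
pages_lm = kenlm.Model("pages_bpe.arpa")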