Example #1
class TestBPESegmentMethod(unittest.TestCase):
    def setUp(self):

        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        orig, expected = test_case
        out = self.bpe.segment(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'w@@ o@@ r@@ d@@ like@@ w@@ o@@ r@@ d l@@ i@@ k@@ e@@ M@@ a@@ n@@ u@@ e@@ l@@ word'
        test_case = (orig, exp)
        self._run_test_case(test_case)
class TestBPESegmentMethod(unittest.TestCase):
    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    @mock.patch('apply_bpe.encode', side_effect=encode_mock)
    def _run_test_case(self, test_case, encode_function):
        orig, expected = test_case
        out = self.bpe.segment(orig)

        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd'
        test_case = (orig, exp)
        self._run_test_case(test_case)
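The mocked tests above reference an encode_mock helper that is not included in this listing. A minimal stand-in, assuming it only has to mimic two-character merges such as 'wo@@ rd' (the project's real helper may differ), could look like this:

# Hypothetical stand-in for the encode_mock passed as side_effect above.
# It ignores the extra arguments apply_bpe.encode receives and splits each
# word into two-character chunks, which BPE.segment then joins as 'wo@@ rd'.
def encode_mock(word, *args, **kwargs):
    return tuple(word[i:i + 2] for i in range(0, len(word), 2))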
def main(unused_argv):
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "word-map"
    tagMapPath = "tag-map"
    pMapPath = "prefix-list"
    sMapPath = "suffix-list"

    pMap = readAffix(pMapPath)
    sMap = readAffix(sMapPath)

    wordMap = readMap(wordMapPath)
    tagMap = readMap(tagMapPath)
    wordMap, _ = bpe.segment(wordMap)
    wordMap = list(set(process_seg_sent(wordMap)))

    wordMap.insert(0, "-start-")
    wordMap.insert(0, "-end-")
    wordMap.insert(0, "-unknown-")

    pMap.insert(0, "-start-")
    pMap.insert(0, "-unknown-")
    sMap.insert(0, "-start-")
    sMap.insert(0, "-unknown-")

    feature_sizes = [
        8, 8, 2, 4
    ]  #num of features for each feature group: capitalization, words, other, prefix_2, suffix_2, previous_tags
    domain_sizes = [3, len(wordMap) + 3, 3, len(tagMap) + 1]
    num_actions = 45
    embedding_dims = [8, 64, 8, 16]

    train_data_path = '/cs/natlang-user/vivian/wsj-conll/train.conllu'
    dev_data_path = '/cs/natlang-user/vivian/wsj-conll/dev.conllu'
    logging.info("loading data and precomputing features...")
    train_data = ConllData(train_data_path, wordMap, tagMap, pMap, sMap, bpe)
    dev_data = ConllData(dev_data_path, wordMap, tagMap, pMap, sMap, bpe)

    with tf.Session(FLAGS.tf_master) as sess:
        Train(sess, num_actions, feature_sizes, domain_sizes, embedding_dims,
              wordMap, tagMap, pMap, sMap, train_data, dev_data, bpe)
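Both main() above and Eval() below construct the same BPE object from a plain codes file before segmenting their word maps. A minimal, self-contained sketch of that shared setup, assuming the subword-nmt package layout ("codes.bpe" is a placeholder path, not a file from this listing):

# Sketch of the shared BPE setup (assumes subword-nmt; placeholder codes path).
import codecs
from subword_nmt.apply_bpe import BPE

with codecs.open("codes.bpe", encoding="utf-8") as codes:
    bpe = BPE(codes, separator="@@")

# segment() takes a whitespace-tokenized sentence and returns the subword string.
print(bpe.segment("lower newest"))  # e.g. "low@@ er new@@ est", depending on the learned codes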
Example #5
class E2C(object):
    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                       opt.seprator, None, None)

        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)

    def tokenDoc(self, doc):
        sentenceList = sent_tokenize(doc.strip())
        print 'e2c sentenceList : ', sentenceList
        tokens = []
        for sent in sentenceList:
            sent = sent.lower()
            sent = self.detokenizer.unescape_xml(
                self.tokenizer.tokenize(sent, return_str=True))
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        #ipdb.set_trace()
        for idx in range(len(pred)):
            rstr += ''.join(' '.join(pred[idx][0]).replace(
                self.sep, '').split()) + "\n\n"
        print 'e2c rstr : ', rstr.strip()
        return rstr.strip()
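How E2C is driven is not shown in this example; the call below is only a hypothetical sketch, assuming opt is the argparse namespace the constructor and onmt.Translator expect (bpe_codes, seprator, cuda, gpu, ...):

# Hypothetical usage of the E2C wrapper; `opt` is not defined in this listing.
e2c = E2C(opt)
print e2c.translate("This is a test document. It has two sentences.")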
Example #6
class VHRED(object):
    def __init__(self, config):
        self.config = config
        self.f_dict = config['vhred_dict']
        # Load the VHRED model.
        self.model, self.enc_fn, self.dec_fn = self._build_vhred_model()
        # Load in Twitter dictionaries for BPE conversion.
        f_bpe_dictionary = config['vhred_bpe_file']
        with open(f_bpe_dictionary, 'r') as handle:
            self.bpe = BPE(handle.readlines(), '@@')
        with open(self.f_dict, 'r') as handle:
            twitter_dict = cPickle.load(handle)
        self.str_to_idx = dict([(tok, tok_id)
                                for tok, tok_id, _, _ in twitter_dict])
        self.idx_to_str = dict([(tok_id, tok)
                                for tok, tok_id, _, _ in twitter_dict])
        self.MODELS = ['hred', 'human', 'tfidf', 'de']

    def _convert_text_to_bpe(self,
                             contexts,
                             gt_responses,
                             model_responses,
                             ignore_models=False):
        # Files needed for BPE conversions.
        context_ids = self._strs_to_idxs(contexts)
        gt_response_ids = self._strs_to_idxs(gt_responses)

        longest = 0
        for res in gt_response_ids:
            if len(res) > longest:
                longest = len(res)
        print 'Longest Response:', longest

        if not ignore_models:
            model_response_ids = self._strs_to_idxs(model_responses)
        else:
            model_response_ids = None
        return context_ids, gt_response_ids, model_response_ids

    def _strs_to_idxs(self, data):
        out = []
        for row in data:
            bpe_segmented = self.bpe.segment(row.strip())
            out.append([
                self.str_to_idx[word] for word in bpe_segmented.split()
                if word in self.str_to_idx
            ])
        return out

    def _idxs_to_strs(self, data):
        out = []
        for row in data:
            s = ' '.join([self.idx_to_str[word] for word in row])
            out.append(s.replace('@@ ', ''))
        return out

    def _build_vhred_model(self):
        # Update the state dictionary.
        state = VHRED_prototype_state()
        model_prefix = self.config['vhred_prefix']
        state_path = model_prefix + "_state.pkl"
        model_path = model_prefix + "_model.npz"
        with open(state_path, 'rb') as handle:
            state.update(cPickle.load(handle))
        # Update the bs for the current data.
        state['bs'] = 100
        state['dictionary'] = self.f_dict

        # Create the model:
        model = VHRED_DialogEncoderDecoder(state)
        model.bs = 100
        enc_fn = model.build_encoder_function()
        dec_fn = model.build_decoder_encoding()

        return model, enc_fn, dec_fn

    def _extract_text(self, dataset, ignore_models=False):
        cs, gt_rs, m_rs = [], [], []
        for entry in dataset:
            cs.append(entry['c'])
            gt_rs.append(entry['r_gt'])
            # Extract in this order so we don't mix up which responses came from which models.
            if not ignore_models:
                for m_name in self.MODELS:
                    m_rs.append(entry['r_models'][m_name][0])

        # Add </s> token to beginning of each.
        cs = [
            '</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip()
            for c in cs
        ]
        gt_rs = [
            '</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip()
            for c in gt_rs
        ]
        if not ignore_models:
            m_rs = [
                '</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip()
                for c in m_rs
            ]

        return cs, gt_rs, m_rs

    # Compute model embeddings for contexts or responses
    # Embedding type can be 'CONTEXT' or 'DECODER'
    def _compute_embeddings(self, data):
        embeddings = []
        context_ids_batch = []
        batch_index = 0
        batch_total = int(math.ceil(float(len(data)) / float(self.model.bs)))

        counter = 0
        max_len = 0
        for context_ids in data:
            counter += 1
            context_ids_batch.append(context_ids)

            # If we have filled up a batch, or reached the end of our data:
            if len(context_ids_batch) == self.model.bs or counter == len(data):
                batch_index += 1
                length = len(context_ids_batch)
                if len(context_ids_batch) < self.model.bs:
                    # Pad the data to get a full batch.
                    while len(context_ids_batch) < self.model.bs:
                        context_ids_batch.append(context_ids_batch[0])
                print 'Computing embeddings for batch %d/%d' % (batch_index,
                                                                batch_total)
                encs = VHRED_compute_encodings(context_ids_batch, self.model,
                                               self.enc_fn, self.dec_fn,
                                               self.config['embedding_type'])
                if length < self.model.bs:
                    encs = encs[:length]
                for i in range(len(encs)):
                    embeddings.append(encs[i, :].tolist())
                context_ids_batch = []

        return embeddings

    def _add_embeddings_to_dataset(self,
                                   dataset,
                                   c_embs,
                                   r_gt_embs,
                                   r_model_embs,
                                   ignore_models=False):
        for ix in xrange(len(dataset)):
            dataset[ix]['c_emb'] = c_embs[ix]
            dataset[ix]['r_gt_emb'] = r_gt_embs[ix]
            if not ignore_models:
                dataset[ix]['r_model_embs'] = {}
                for jx, m_name in enumerate(self.MODELS):
                    dataset[ix]['r_model_embs'][m_name] = r_model_embs[
                        ix * len(self.MODELS) + jx]
        return dataset

    def get_embeddings(self, dataset, new_models=None, ignore_models=False):
        ''' Dataset should be a list of dictionaries. Each dictionary should have
			keys: c, r_gt, r_models = {'model_name': [r, score, length], ...}
		'''
        if new_models is not None:
            self.MODELS = new_models
        if 'r_models' not in dataset[0]:
            ignore_models = True

        contexts, gt_responses, model_responses = self._extract_text(
            dataset, ignore_models=ignore_models)
        context_ids, gt_response_ids, model_response_ids = self._convert_text_to_bpe(
            contexts,
            gt_responses,
            model_responses,
            ignore_models=ignore_models)

        print 'Computing context embeddings...'
        context_embs = self._compute_embeddings(context_ids)
        print 'Computing ground truth response embeddings...'
        gt_response_embs = self._compute_embeddings(gt_response_ids)
        if not ignore_models:
            print 'Computing model response embeddings...'
            model_response_embs = self._compute_embeddings(model_response_ids)
        else:
            model_response_embs = None

        # Update our dataset with each of the embeddings.
        dataset = self._add_embeddings_to_dataset(dataset,
                                                  context_embs,
                                                  gt_response_embs,
                                                  model_response_embs,
                                                  ignore_models=ignore_models)

        return dataset

    def use_saved_embeddings(self):
        with open(self.config['vhred_embeddings_file'], 'rb') as handle:
            dataset = cPickle.load(handle)
        return dataset
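A hypothetical call pattern for the VHRED wrapper, assuming config supplies the keys read above (vhred_prefix, vhred_dict, vhred_bpe_file, embedding_type, vhred_embeddings_file) and that each dataset entry provides at least 'c' and 'r_gt':

# Hypothetical driver; `config` and the toy dataset are placeholders.
vhred = VHRED(config)
dataset = [{'c': 'hello how are you', 'r_gt': 'fine thanks'}]
dataset = vhred.get_embeddings(dataset)  # adds 'c_emb' and 'r_gt_emb' to each entry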
def Eval(sess):
    """Builds and evaluates a network."""
    logging.set_verbosity(logging.INFO)
    bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "word-map"
    tagMapPath = "tag-map"
    pMapPath = "prefix-list"
    sMapPath = "suffix-list"

    pMap = readAffix(pMapPath)
    sMap = readAffix(sMapPath)

    wordMap = readMap(wordMapPath)
    tagMap = readMap(tagMapPath)
    wordMap, _ = bpe.segment(wordMap)
    wordMap = list(set(process_seg_sent(wordMap)))

    wordMap.insert(0, "-start-")
    wordMap.insert(0, "-end-")
    wordMap.insert(0, "-unknown-")

    pMap.insert(0, "-start-")
    pMap.insert(0, "-unknown-")
    sMap.insert(0, "-start-")
    sMap.insert(0, "-unknown-")

    feature_sizes = [
        8, 8, 2, 4
    ]  #num of features for each feature group: capitalization, words, other, prefix_2, suffix_2, previous_tags
    domain_sizes = [3, len(wordMap) + 3, 3, len(tagMap) + 1]
    num_actions = 45
    embedding_dims = [8, 64, 8, 16]

    t = time.time()
    hidden_layer_sizes = map(int, FLAGS.hidden_layer_sizes.split(','))
    logging.info(
        'Building training network with parameters: feature_sizes: %s '
        'domain_sizes: %s', feature_sizes, domain_sizes)

    test_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
    logging.info("loading data and precomputing features...")
    test_data = ConllData(test_data_path, wordMap, tagMap, pMap, sMap, bpe)

    tagger = GreedyTagger(num_actions,
                          feature_sizes,
                          domain_sizes,
                          embedding_dims,
                          hidden_layer_sizes,
                          gate_gradients=True)

    tagger.AddEvaluation(FLAGS.batch_size)
    tagger.AddSaver()
    sess.run(tagger.inits.values())
    tagger.saver.restore(sess, FLAGS.model_path)

    t = time.time()
    num_epochs = None
    num_tokens = 0
    num_correct = 0
    index = 0
    epochs = 0

    epochs, sent_batch = loadBatch(FLAGS.batch_size, epochs, test_data)
    while True:
        sent_batch, epochs, feature_endpoints, gold_tags, words = get_current_features(
            sent_batch, epochs, test_data, wordMap, tagMap, pMap, sMap)
        tf_eval_metrics = sess.run(
            tagger.evaluation['logits'],
            feed_dict={tagger.test_input: feature_endpoints})
        for i in range(FLAGS.batch_size):
            best_action = 0
            best_score = float("-inf")
            for j in range(45):
                if tf_eval_metrics[i][j] > best_score:
                    best_score = tf_eval_metrics[i][j]
                    best_action = j
            sent_batch[i].set_tag(tagMap[best_action])
        if num_epochs is None:
            num_epochs = epochs
        elif num_epochs < sent_batch[0].get_epoch():
            break

    test_data.reset_index()
    while test_data.has_next_sent():
        sent = test_data.get_next_sent()
        output_tags = sent.get_tag_output()
        gold_tags = sent.origin_tag_list
        word_list, output_tags = combine_seg(sent.seg_word_list, output_tags)
        for idx, tag in enumerate(gold_tags):
            num_tokens += 1
            if tag == output_tags[idx]:
                num_correct += 1
        sent.reset_state()

    eval_metric = 0 if num_tokens == 0 else (100.0 * num_correct / num_tokens)

    logging.info(
        'Number of Tokens: %d, Seconds elapsed in evaluation: %.2f, '
        'eval metric: %.2f%%', num_tokens,
        time.time() - t, eval_metric)
    logging.info('num correct tokens: %d', num_correct)
Example #8
class C2E(object):
    """
    """
    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(self.opt.bpe_codes, 'r', encoding="UTF-8"),
                       self.opt.seprator, None, None)

        self.translator = onmt.Translator(opt)

        self.nlp = BosonNLP("NGhNiav2.16134.DvyEDmGzYd2S")

    def seg(self, doc):
        res = ""
        try:
            print "using boson....."
            boson_res = self.nlp.tag(doc)
            res = boson_res[0]['word']
        except:
            res = jieba.cut(doc, cut_all=False)
        return " ".join(res)

    def truecase(self, text):
        text = text.encode('utf-8')
        truecase_sents = []
        tagged_sent = nltk.pos_tag(
            [word.lower() for word in nltk.word_tokenize(text)])
        normalized_sent = [
            w.capitalize() if t in ['NN', 'NNS'] else w
            for (w, t) in tagged_sent
        ]
        normalized_sent[0] = normalized_sent[0].capitalize()
        pretty_string = re.sub(" (?=[\.,'!?:;])", "",
                               ' '.join(normalized_sent))
        return pretty_string

    def tokenDoc(self, doc):
        doc = doc.strip()
        sentenceList = re.split(PAT, doc.decode('utf-8'))
        assert len(sentenceList) >= 1
        if sentenceList[-1].strip() == "":
            sentenceList = sentenceList[:-1]
        punctuaList = re.findall(PAT, doc.decode('utf-8'))
        punctuaList += (len(sentenceList) - len(punctuaList)) * [' ']
        sents = [
            sent + punc for (sent, punc) in zip(sentenceList, punctuaList)
        ]
        sents = [sent.strip() for sent in sents]
        print 'c2e sentenceList : ', sentenceList
        tokens = []
        for sent in sents:
            sent = sent.lower()
            #sent = self.detokenizer.unescape_xml(self.tokenizer.tokenize(sent, return_str=True))
            sent = self.seg(sent)
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        print 'c2e tokens : ', tokens
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        for idx in range(len(pred)):
            pred_sent = ' '.join(pred[idx][0]).replace(' @-@ ', '-').replace(
                self.sep, '')
            #pred_sent = self.truecase(pred_sent)
            pred_sent = pred_sent.capitalize()
            rstr += pred_sent + "\n"
        print 'c2e rstr : ', rstr.strip()
        return rstr.strip()
Example #9
    vw_file = args.vw
    file_name = os.path.basename(vw_file)
    gazette_title = vw_file.replace(file_name, "")

    with open(vw_file) as rectangles_to_check:
        for rectangle in rectangles_to_check.readlines():
            page = re.search(r"PAGE:\d\d?", rectangle).group(0).replace("PAGE:","")
            x1 = re.search(r"X1:\d{1,4}", rectangle).group(0).replace("X1:","")
            x2 = re.search(r"X2:\d{1,4}", rectangle).group(0).replace("X2:","")
            y1 = re.search(r"Y1:\d{1,4}", rectangle).group(0).replace("Y1:","")
            y2 = re.search(r"Y2:\d{1,4}", rectangle).group(0).replace("Y2:","")

            xml_coord = gazette_title + "/page_" + page + ".xml_coord"

            with open(xml_coord) as xml_coord_file:
                xml_coords = xml_coord_file.readlines()
            #LM for rectangle -> corpus of necrologue
            rectangle_text = cut_xml(x1, y1, x2, y2, xml_coords)
            # replace("", " ") interleaves a space after every character; [1:-1] trims the boundary spaces it adds
            rectangle_text_normalized = " ".join(rectangle_text).replace(""," ")[1: -1].lower()
            necro_lm_score = necrologues_lm.score(rectangle_text_normalized)
            sys.stdout.write("LM_RECT_SCORE:" + str(necro_lm_score) + " ")

            #LM for page which contains rectangle -> corpus of pages with necrologies 
            page_file = gazette_title + "/page_" + page.replace("PAGE:","") + ".txt"
            with open(page_file) as page_txt:
                bpe_text = bpe.segment((" ".join(page_txt.readlines())).lower().strip())
                page_lm_score = pages_lm.score(bpe_text)
                sys.stdout.write("LM_PAGE_SCORE:" + str(page_lm_score))
            
            sys.stdout.write("\n")