Example #1
from nltk.tag import CRFTagger


class CRF:
    def __init__(self):
        # The tagger is built lazily in train(); keep a placeholder until then.
        self.__model = None

    def train(self, X_training_data):
        # X_training_data: list of sentences, each a list of (word, tag) tuples.
        self.__model = CRFTagger()
        self.__model.train(X_training_data, 'crf.model')

    def test(self, X_test_data):
        total = 0
        correct = 0
        for kalimat in X_test_data:  # kalimat: one gold-tagged sentence
            temp = [word[0] for word in kalimat]  # strip gold tags, keep tokens

            if len(temp) != 0:
                predicted_y = self.__model.tag(temp)
                for i in range(len(predicted_y)):
                    total += 1
                    if predicted_y[i][1] == kalimat[i][1]:
                        correct += 1

        print(correct, total)
        print(correct / total)  # token-level accuracy
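
A minimal driver for the wrapper above, assuming NLTK-style data in which every sentence is a list of (word, tag) pairs; the toy sentences are invented for illustration.

train_data = [
    [('the', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')],
    [('a', 'DET'), ('cat', 'NOUN'), ('sleeps', 'VERB')],
]
test_data = [
    [('the', 'DET'), ('cat', 'NOUN'), ('barks', 'VERB')],
]

crf = CRF()
crf.train(train_data)   # writes crf.model to the working directory
crf.test(test_data)     # prints the correct/total counts and the accuracy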
Example #2
class NamedEntityChunker(ChunkParserI):
  def __init__(self, train_sents, **kwargs):
    assert isinstance(train_sents, Iterable)
 
    self.feature_detector = features
    self.tagger = CRFTagger(
      feature_func=features
    )
    self.tagger.train(train_sents, 'model.crf.tagger')

    # self.tagger = ClassifierBasedTagger(
    #   train=train_sents,
    #   feature_detector=features,
    #   **kwargs)
 
  def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)
 
    # Transform the result from [((w1, t1), iob1), ...] 
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
    # iob_triplets = [(w, t, 'O') for ((w, t), c) in chunks]
 
    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)
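
The chunker above assumes a module-level features callable that is not shown. NLTK's CRFTagger calls feature_func(tokens, index) and expects a list of feature strings back; because the training data here is [((w, t), iob), ...], each token passed in is a (word, pos) pair. A minimal, hypothetical detector might look like this (the feature names are made up):

def features(tokens, index):
    # tokens: list of (word, pos) pairs; index: position to featurize
    word, pos = tokens[index]
    feats = [
        'word=' + word.lower(),
        'pos=' + pos,
        'istitle=' + str(word.istitle()),
    ]
    if index == 0:
        feats.append('BOS')
    else:
        feats.append('prev_pos=' + tokens[index - 1][1])
    return feats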
Example #3
 def tag_crf(self, untagged_string: str):
     """Tag POS with CRF tagger.
     :type untagged_string: str
     :param : An untagged, untokenized string of text.
     :rtype tagged_text: str
     """
     untagged_tokens = wordpunct_tokenize(untagged_string)
     pickle_path = self.available_taggers['crf']
     tagger = CRFTagger()
     tagger.set_model_file(pickle_path)
     tagged_text = tagger.tag(untagged_tokens)
     return tagged_text
Example #5
def crf_tag():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]
    ct = CRFTagger()
    ct.train(train_sents, 'model.crf.tagger')  # train() writes the model file and returns None
    test = ct.evaluate(test_sents)
    print test
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years".decode(
        'utf-8')
    sent_w = sent3.lower().split()
    print sent_w
    tag = ct.tag(sent_w)
    print "The Tag Is:", tag
Example #6
def train_and_save_tagger(language, model_type, feature, untagged_text, verbose=False):
    training_file = 'corpora/{0}/{0}_train.{1}'.format(language, feature)
    tagger, acc, _, _ = make_morpho_model(language, model_type, feature, training_file)

    model_file = 'taggers/{0}/{1}/{2}.pickle'.format(language, feature, model_type)
    if model_type != 'crf':  # annoying hack because the CRF model saves itself.
        with open(model_file, 'wb') as f:
            pickle.dump(tagger, f)

    # check the model file by reloading it and using it to tag sample text
    if model_type != 'crf':
        with open(model_file, 'rb') as f:
            tagger2 = pickle.load(f)
    else:
        tagger2 = CRFTagger()
        tagger2.set_model_file(model_file)

    tagged_text = tagger2.tag(untagged_text)

    if verbose:
        print("Model {0} for language {4}, feature {1} saved at {2}. Training accuracy = {3:.3f}".format(model_type, feature, model_file, acc, language))
        print("Sample tagging output: {0}".format(tagged_text[:10]))
Example #7
# In[26]:

TAGGER_PATH = "crfpostagger"   # pre-trained POS-tagger


# In[27]:

tagger = CRFTagger()  # initialize tagger
tagger.set_model_file(TAGGER_PATH)


# In[30]:

# try some sentences out - they must all be unicode strings; the model was trained on lower-cased text
print(tagger.tag([u"i", u"like", u"revision"]))
print(tagger.tag([u"i", u"like", u"natural", u"language", u"processing"]))


# In[31]:

# scaling up to whole sentences as you might get them in text - make sure they are unicode and lower case
sentences = ["I like revision",
            "I like Natural Language Processing"]
print(tagger.tag_sents([unicode(word.lower()) for word in s.split()] for s in sentences))
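
On Python 3 the unicode() calls are unnecessary and the same batch call can be written with a plain list comprehension (sketch, reusing the tagger loaded above):

sentences = ["I like revision",
             "I like Natural Language Processing"]
tokenized = [[w.lower() for w in s.split()] for s in sentences]
print(tagger.tag_sents(tokenized))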


Example #8
crf = CRFTagger()

'''
############# Train #############
crf.train(train, 'crf_brown.tagger')
print crf.evaluate(test) # 0.954383534534
'''

############# Test #############
crf.set_model_file('crf_brown.tagger')
# flatten the gold test sentences into one token stream (tags stripped)
tokens = []
for i in test:
    for j in i:
        tokens.append(j[0])

test_tagged = crf.tag(tokens)

'''
f = open("test_tagged_obj.pickle", 'w')
pickle.dump(test_tagged, f)
f.close()
'''

#print test_tagged
temp = ""
for i in test_tagged:
	temp += str(i[0])+'\t\t'+str(i[1])+'\n'

f = open('crf_brown_tagged.txt', 'w')
f.write(temp)
f.close()
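
Because test still holds the gold (word, tag) pairs in the same token order, a token accuracy can be read off test_tagged directly (sketch):

gold_tags = [tag for sent in test for (word, tag) in sent]
pred_tags = [tag for (word, tag) in test_tagged]
correct = sum(1 for g, p in zip(gold_tags, pred_tags) if g == p)
print(float(correct) / len(gold_tags))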
Example #9
class DataAdapter(object):
    def __init__(self, data=[]):
        # data: iterable of records, each with a 'text' (token list) and a
        # 'label' (per-token tag list) field.
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')

        # default to None so tag_sents()/evaluate() can detect missing data
        self.data_tagging = None
        self.data_testing = None
        if data.count(True) > 0:
            self.data_tagging, self.data_testing = self.for_tagging_testing(
                data)
            # print('TAGGING', self.data_tagging)
            # print('TESTING', self.data_testing)

    def tokenize_tag(self, text):
        text = text.replace('\r', ' | ').replace('\n', ' | ')
        tokens = word_tokenize(text, preserve_line=True)
        labels = []
        for label in self.tag(tokens):
            labels.append(label[1])
        return tokens, labels

    def for_tagging_testing(self, data):
        # self.data = data
        array_tagging = []
        array_testing = []
        for d in data:
            all_tags = []
            all_test = []
            for index, t in enumerate(d['text']):
                one_tag = [t, d['label'][index]]
                all_test.append(one_tag)
                all_tags.append(t)
            array_tagging.append(all_tags)
            array_testing.append(all_test)
            # print(all_tags)
        return array_tagging, array_testing

    def for_testing(self, data):
        # self.data = data
        array = []
        # print('TEST', data.count())
        for d in data:
            all_tags = []
            for index, t in enumerate(d['text']):
                # one_tag = [t, (d['label'][index] if is_ascii(d['label'][index]) else 'O')]
                one_tag = [t, d['label'][index]]
                all_tags.append(one_tag)
            array.append(all_tags)
            # print(all_tags)
        return array

    def for_tagging(self, data):
        # self.data = data
        array = []
        for d in data:
            all_tags = []
            for t in d['text']:
                all_tags.append(t)
            array.append(all_tags)
            # print(all_tags)
        return array

    def tag_sents(self):
        if self.data_tagging is not None:
            return self.tagger.tag_sents(self.data_tagging)
        else:
            return 'NoData'

    def tag(self, data):
        return self.tagger.tag(data)

    def evaluate(self):
        if self.data_testing is not None:
            return self.tagger.evaluate(self.data_testing)
        else:
            return 'NoData'

    def train(self, data):
        data = self.for_testing(data)
        self.tagger.train(data, 'model.crf.tagger')
        print('ACCURACY:', self.tagger.evaluate(data))
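
A hypothetical driver for the adapter above, assuming records shaped like the text/label fields it reads and an existing model.crf.tagger file; the example record is invented:

records = [
    {'text': ['John', 'lives', 'in', 'Oslo'],
     'label': ['B-PER', 'O', 'O', 'B-LOC']},
]
adapter = DataAdapter(records)          # loads model.crf.tagger
tokens, labels = adapter.tokenize_tag("John lives in Oslo")
print(list(zip(tokens, labels)))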
Example #10
class DeepDisfluencyTagger(IncrementalTagger):
    """A deep-learning driven incremental disfluency tagger
    (and optionally utterance-segmenter).

    Tags each word with the following:
    <f/> - a fluent word
    <e/> - an edit term word, not necessarily inside a repair structure
    <rms id="N"/> - reparandum start word for repair with ID number N
    <rm id="N"/> - mid-reparandum word for repair N
    <i id="N"/> - interregnum word for repair N
    <rps id="N"/> - repair onset word for repair N
    <rp id="N"/> - mid-repair word for repair N
    <rpn id="N"/> - repair end word for substitution or repetition repair N
    <rpnDel id="N"/> - repair end word for a delete repair N

    If in joint utterance segmentation mode
    according to the config file,
    the following utterance segmentation tags are used:

    <cc/> - a word which continues the current utterance and whose
            following word will continue it
    <ct/> - a word which continues the current utterance and is the
            last word of it
    <tc/> - a word which is the beginning of an utterance and whose following
            word will continue it
    <tt/> - a word constituting an entire utterance
    """
    def __init__(self,
                 config_file=None,
                 config_number=None,
                 saved_model_dir=None,
                 pos_tagger=None,
                 language_model=None,
                 pos_language_model=None,
                 edit_language_model=None,
                 timer=None,
                 timer_scaler=None,
                 use_timing_data=False):

        if not config_file:
            config_file = os.path.dirname(os.path.realpath(__file__)) +\
                "/../experiments/experiment_configs.csv"
            config_number = 35
            print "No config file, using default", config_file, config_number

        super(DeepDisfluencyTagger, self).__init__(config_file, config_number,
                                                   saved_model_dir)
        print "Processing args from config number {} ...".format(config_number)
        self.args = process_arguments(config_file,
                                      config_number,
                                      use_saved=False,
                                      hmm=True)
        #  separate manual setting
        setattr(self.args, "use_timing_data", use_timing_data)
        print "Intializing model from args..."
        self.model = self.init_model_from_config(self.args)

        # load a model from a folder if specified
        if saved_model_dir:
            print "Loading saved weights from", saved_model_dir
            self.load_model_params_from_folder(saved_model_dir,
                                               self.args.model_type)
        else:
            print "WARNING no saved model params, needs training."
            print "Loading original embeddings"
            self.load_embeddings(self.args.embeddings)

        if pos_tagger:
            print "Loading POS tagger..."
            self.pos_tagger = pos_tagger
        elif self.args.pos:
            print "No POS tagger specified,loading default CRF switchboard one"
            self.pos_tagger = CRFTagger()
            tagger_path = os.path.dirname(os.path.realpath(__file__)) +\
                "/../feature_extraction/crfpostagger"
            self.pos_tagger.set_model_file(tagger_path)

        if self.args.n_language_model_features > 0 or \
                'noisy_channel' in self.args.decoder_type:
            print "training language model..."
            self.init_language_models(language_model, pos_language_model,
                                      edit_language_model)

        if timer:
            print "loading timer..."
            self.timing_model = timer
            self.timing_model_scaler = timer_scaler
        else:
            # self.timing_model = None
            # self.timing_model_scaler = None
            print "No timer specified, using default switchboard one"
            timer_path = os.path.dirname(os.path.realpath(__file__)) +\
                '/../decoder/timing_models/' + \
                'LogReg_balanced_timing_classifier.pkl'
            with open(timer_path, 'rb') as fid:
                self.timing_model = cPickle.load(fid)
            timer_scaler_path = os.path.dirname(os.path.realpath(__file__)) +\
                '/../decoder/timing_models/' + \
                'LogReg_balanced_timing_scaler.pkl'
            with open(timer_scaler_path, 'rb') as fid:
                self.timing_model_scaler = cPickle.load(fid)
                # TODO a hack
                # self.timing_model_scaler.scale_ = \
                #    self.timing_model_scaler.std_.copy()

        print "Loading decoder..."
        hmm_dict = deepcopy(self.tag_to_index_map)
        # add the interregnum tag
        if "disf" in self.args.tags:
            intereg_ind = len(hmm_dict.keys())
            interreg_tag = \
                "<i/><cc/>" if "uttseg" in self.args.tags else "<i/>"
            hmm_dict[interreg_tag] = intereg_ind  # add the interregnum tag

        # decoder_file = os.path.dirname(os.path.realpath(__file__)) + \
        #     "/../decoder/model/{}_tags".format(self.args.tags)
        noisy_channel = None
        if 'noisy_channel' in self.args.decoder_type:
            noisy_channel = SourceModel(self.lm,
                                        self.pos_lm,
                                        uttseg=self.args.do_utt_segmentation)
        self.decoder = FirstOrderHMM(
            hmm_dict,
            markov_model_file=self.args.tags,
            timing_model=self.timing_model,
            timing_model_scaler=self.timing_model_scaler,
            constraint_only=True,
            noisy_channel=noisy_channel)

        # getting the states in the right shape
        self.state_history = []
        self.softmax_history = []
        # self.convert_to_output_tags = get_conversion_method(self.args.tags)
        self.reset()

    def init_language_models(self,
                             language_model=None,
                             pos_language_model=None,
                             edit_language_model=None):
        clean_model_dir = os.path.dirname(os.path.realpath(__file__)) +\
            "/../data/lm_corpora"
        if language_model:
            self.lm = language_model
        else:
            print "No language model specified, using default switchboard one"
            lm_corpus_file = open(clean_model_dir +
                                  "/swbd_disf_train_1_clean.text")
            lines = [
                line.strip("\n").split(",")[1] for line in lm_corpus_file
                if "POS," not in line and not line.strip("\n") == ""
            ]
            split = int(0.9 * len(lines))
            lm_corpus = "\n".join(lines[:split])
            heldout_lm_corpus = "\n".join(lines[split:])
            lm_corpus_file.close()
            self.lm = KneserNeySmoothingModel(
                order=3,
                discount=0.7,
                partial_words=self.args.partial_words,
                train_corpus=lm_corpus,
                heldout_corpus=heldout_lm_corpus,
                second_corpus=None)
        if pos_language_model:
            self.pos_lm = pos_language_model
        elif self.args.pos:
            print "No pos language model specified, \
            using default switchboard one"

            lm_corpus_file = open(clean_model_dir +
                                  "/swbd_disf_train_1_clean.text")
            lines = [
                line.strip("\n").split(",")[1] for line in lm_corpus_file
                if "POS," in line and not line.strip("\n") == ""
            ]
            split = int(0.9 * len(lines))
            lm_corpus = "\n".join(lines[:split])
            heldout_lm_corpus = "\n".join(lines[split:])
            lm_corpus_file.close()
            self.pos_lm = KneserNeySmoothingModel(
                order=3,
                discount=0.7,
                partial_words=self.args.partial_words,
                train_corpus=lm_corpus,
                heldout_corpus=heldout_lm_corpus,
                second_corpus=None)
        if edit_language_model:
            self.edit_lm = edit_language_model
        else:
            edit_lm_corpus_file = open(clean_model_dir +
                                       "/swbd_disf_train_1_edit.text")
            edit_lines = [
                line.strip("\n").split(",")[1] for line in edit_lm_corpus_file
                if "POS," not in line and not line.strip("\n") == ""
            ]
            edit_split = int(0.9 * len(edit_lines))
            edit_lm_corpus = "\n".join(edit_lines[:edit_split])
            heldout_edit_lm_corpus = "\n".join(edit_lines[edit_split:])
            edit_lm_corpus_file.close()
            self.edit_lm = KneserNeySmoothingModel(
                train_corpus=edit_lm_corpus,
                heldout_corpus=heldout_edit_lm_corpus,
                order=2,
                discount=0.7)
            # TODO an object for getting the lm features incrementally
            # in the language model

    def init_model_from_config(self, args):
        # for feat, val in args._get_kwargs():
        #     print feat, val, type(val)
        if not test_if_using_GPU():
            print "Warning: not using GPU, might be a bit slow"
            print "\tAdjust Theano config file ($HOME/.theanorc)"
        print "loading tag to index maps..."
        label_path = os.path.dirname(os.path.realpath(__file__)) +\
            "/../data/tag_representations/{}_tags.csv".format(args.tags)
        word_path = os.path.dirname(os.path.realpath(__file__)) +\
            "/../data/tag_representations/{}.csv".format(args.word_rep)
        pos_path = os.path.dirname(os.path.realpath(__file__)) +\
            "/../data/tag_representations/{}.csv".format(args.pos_rep)
        self.tag_to_index_map = load_tags(label_path)
        self.word_to_index_map = load_tags(word_path)
        self.pos_to_index_map = load_tags(pos_path)
        self.model_type = args.model_type
        vocab_size = len(self.word_to_index_map.keys())
        emb_dimension = args.emb_dimension
        n_hidden = args.n_hidden
        n_extra = args.n_language_model_features + args.n_acoustic_features
        n_classes = len(self.tag_to_index_map.keys())
        self.window_size = args.window
        n_pos = len(self.pos_to_index_map.keys())
        update_embeddings = args.update_embeddings
        lr = args.lr
        print "Initializing model of type", self.model_type, "..."
        if self.model_type == 'elman':
            model = Elman(ne=vocab_size,
                          de=emb_dimension,
                          nh=n_hidden,
                          na=n_extra,
                          n_out=n_classes,
                          cs=self.window_size,
                          npos=n_pos,
                          update_embeddings=update_embeddings)
            self.initial_h0_state = model.h0.get_value()
            self.initial_c0_state = None

        elif self.model_type == 'lstm':
            model = LSTM(ne=vocab_size,
                         de=emb_dimension,
                         n_lstm=n_hidden,
                         na=n_extra,
                         n_out=n_classes,
                         cs=self.window_size,
                         npos=n_pos,
                         lr=lr,
                         single_output=True,
                         cost_function='nll')
            self.initial_h0_state = model.h0.get_value()
            self.initial_c0_state = model.c0.get_value()
        else:
            raise NotImplementedError('No model init for {0}'.format(
                self.model_type))
        return model

    def load_model_params_from_folder(self, model_folder, model_type):
        if model_type in ["lstm", "elman"]:
            self.model.load_weights_from_folder(model_folder)
            self.initial_h0_state = self.model.h0.get_value()
            if model_type == "lstm":
                self.initial_c0_state = self.model.c0.get_value()
        else:
            raise NotImplementedError(
                'No weight loading for {0}'.format(model_type))

    def load_embeddings(self, embeddings_name):
        # load pre-trained embeddings
        embeddings_dir = os.path.dirname(os.path.realpath(__file__)) +\
                                "/../embeddings/"
        pretrained = gensim.models.Word2Vec.load(embeddings_dir +
                                                 embeddings_name)
        print "emb shape", pretrained[pretrained.index2word[0]].shape
        # print pretrained[0].shape
        # assign and fill in the gaps
        emb = populate_embeddings(self.args.emb_dimension,
                                  len(self.word_to_index_map.items()),
                                  self.word_to_index_map, pretrained)
        self.model.load_weights(emb=emb)

    def standardize_word_and_pos(
            self,
            word,
            pos=None,
            proper_name_pos_tags=["NNP", "NNPS", "CD", "LS", "SYM", "FW"]):
        word = word.lower()
        if not pos and self.pos_tagger:
            pos = self.pos_tagger.tag([])  # TODO
        if pos:
            pos = pos.upper()
            if pos in proper_name_pos_tags and "$unc$" not in word:
                word = "$unc$" + word
            if self.pos_to_index_map.get(pos) is None:
                # print "unknown pos", pos
                pos = "<unk>"
        if self.word_to_index_map.get(word) is None:
            # print "unknown word", word
            word = "<unk>"
        return word, pos

    def tag_new_word(self,
                     word,
                     pos=None,
                     timing=None,
                     extra=None,
                     diff_only=True,
                     rollback=0):
        """Tag new incoming word and update the word and tag graphs.

        :param word: the word to consume/tag
        :param pos: the POS tag to consume/tag (optional)
        :param timing: the duration of the word (optional)
        :param diff_only: whether to output only the diffed suffix,
        if False, outputs entire output tags
        :param rollback: the number of words to rollback
        in the case of changed word hypotheses from an ASR
        """
        self.rollback(rollback)
        if pos is None and self.args.pos:
            # if no pos tag provided but there is a pos-tagger, tag word
            test_words = [
                unicode(x) for x in get_last_n_features(
                    "words", self.word_graph, len(self.word_graph) - 1, n=4)
            ] + [unicode(word.lower())]
            pos = self.pos_tagger.tag(test_words)[-1][1]
            # print "tagging", word, "as", pos
        # 0. Add new word to word graph
        word, pos = self.standardize_word_and_pos(word, pos)
        # print "New word:", word, pos
        self.word_graph.append((word, pos, timing))
        # 1. load the saved internal rnn state
        # TODO: these nets aren't (necessarily) trained statefully.
        # During training, the internal state self.args.bs words back
        # is the initial one; here it is the actual state reached.
        if self.state_history == []:
            c0_state = self.initial_c0_state
            h0_state = self.initial_h0_state
        else:
            if self.model_type == "lstm":
                c0_state = self.state_history[-1][0][-1]
                h0_state = self.state_history[-1][1][-1]
            elif self.model_type == "elman":
                h0_state = self.state_history[-1][-1]

        if self.model_type == "lstm":
            self.model.load_weights(c0=c0_state, h0=h0_state)
        elif self.model_type == "elman":
            self.model.load_weights(h0=h0_state)
        else:
            raise NotImplementedError("no history loading for\
                             {0} model".format(self.model_type))

        # 2. do the softmax output with converted inputs
        word_window = [
            self.word_to_index_map[x]
            for x in get_last_n_features("words",
                                         self.word_graph,
                                         len(self.word_graph) - 1,
                                         n=self.window_size)
        ]
        pos_window = [
            self.pos_to_index_map[x]
            for x in get_last_n_features("POS",
                                         self.word_graph,
                                         len(self.word_graph) - 1,
                                         n=self.window_size)
        ]
        # print "word_window, pos_window", word_window, pos_window
        if self.model_type == "lstm":
            h_t, c_t, s_t = self.model.\
                soft_max_return_hidden_layer([word_window], [pos_window])
            self.softmax_history.append(s_t)
            if len(self.state_history) == 20:  # just saving history
                self.state_history.pop(0)  # pop first one
            self.state_history.append((c_t, h_t))
        elif self.model_type == "elman":
            h_t, s_t = self.model.soft_max_return_hidden_layer([word_window],
                                                               [pos_window])
            self.softmax_history.append(s_t)
            if len(self.state_history) == 20:
                self.state_history.pop(0)  # pop first one
            self.state_history.append(h_t)
        else:
            raise NotImplementedError("no softmax implemented for\
                                 {0} model".format(self.model_type))
        softmax = np.concatenate(self.softmax_history)

        # 3. do the decoding on the softmax
        if "disf" in self.args.tags:
            edit_tag = "<e/><cc>" if "uttseg" in self.args.tags else "<e/>"
            # print self.tag_to_index_map[edit_tag]
            adjustsoftmax = np.concatenate(
                (softmax, softmax[:, self.tag_to_index_map[edit_tag]].reshape(
                    softmax.shape[0], 1)), 1)
        else:
            adjustsoftmax = softmax
        last_n_timings = None if ((not self.args.use_timing_data) or
                                  not timing) \
            else get_last_n_features("timings", self.word_graph,
                                     len(self.word_graph)-1,
                                     n=3)
        new_tags = self.decoder.viterbi_incremental(
            adjustsoftmax,
            a_range=(len(adjustsoftmax) - 1, len(adjustsoftmax)),
            changed_suffix_only=True,
            timing_data=last_n_timings,
            words=[word])
        # print "new tags", new_tags
        prev_output_tags = deepcopy(self.output_tags)
        self.output_tags = self.output_tags[:len(self.output_tags) -
                                            (len(new_tags) - 1)] + new_tags

        # 4. convert to standardized output format
        if "simple" in self.args.tags:
            for p in range(
                    len(self.output_tags) - (len(new_tags) + 1),
                    len(self.output_tags)):
                rps = self.output_tags[p]
                self.output_tags[p] = rps.replace('rm-0',
                                                  'rps id="{}"'.format(p))
                if "<i" in self.output_tags[p]:
                    self.output_tags[p] = self.output_tags[p].\
                        replace("<e/>", "").replace("<i", "<e/><i")
        else:
            # new_words = [word]
            words = get_last_n_features("words",
                                        self.word_graph,
                                        len(self.word_graph) - 1,
                                        n=len(self.word_graph) -
                                        (self.window_size - 1))
            self.output_tags = convert_from_inc_disfluency_tags_to_eval_tags(
                self.output_tags,
                words,
                start=len(self.output_tags) - (len(new_tags)),
                representation=self.args.tags)
        if diff_only:
            for i, old_new in enumerate(zip(prev_output_tags,
                                            self.output_tags)):
                old, new = old_new
                if old != new:
                    return self.output_tags[i:]
            return self.output_tags[len(prev_output_tags):]
        return self.output_tags

    def tag_utterance(self, utterance):
        """Tags an entire utterance at once; only possible on models
        trained on pre-segmented (utterance-by-utterance) data.
        """
        if not self.args.utts_presegmented:
            raise NotImplementedError("Tagger trained on unsegmented data,\
            please call tag_prefix(words) instead.")
        # non segmenting
        self.reset()  # always starts in initial state
        if not self.args.pos:  # no pos tag model
            utterance = [(w, None, t) for w, p, t in utterance]
            # print "Warning: not using pos tags as not pos tag model"
        if not self.args.use_timing_data:
            utterance = [(w, p, None) for w, p, t in utterance]
            # print "Warning: not using timing durations as no timing model"
        for w, p, t in utterance:
            if self.args.pos:
                self.tag_new_word(w, pos=p, timing=t)
        return self.output_tags

    def rollback(self, backwards):
        super(DeepDisfluencyTagger, self).rollback(backwards)
        self.state_history = self.state_history[:len(self.state_history) -
                                                backwards]
        self.softmax_history = self.softmax_history[:len(self.softmax_history
                                                         ) - backwards]
        self.decoder.rollback(backwards)

    def init_deep_model_internal_state(self):
        if self.model_type == "lstm":
            self.model.load_weights(c0=self.initial_c0_state,
                                    h0=self.initial_h0_state)
        elif self.model_type == "elman":
            self.model.load_weights(h0=self.initial_h0_state)

    def reset(self):
        super(DeepDisfluencyTagger, self).reset()
        self.word_graph = [("<s>", "<s>", 0)] * \
            (self.window_size - 1)
        self.state_history = []
        self.softmax_history = []
        self.decoder.viterbi_init()
        self.init_deep_model_internal_state()

    def evaluate_fast_from_matrices(self, validation_matrices, tag_file,
                                    idx_to_label_dict):
        output = []
        true_y = []
        for v in validation_matrices:
            words_idx, pos_idx, extra, y, indices = v
            if extra:
                output.extend(
                    self.model.classify_by_index(words_idx, indices, pos_idx,
                                                 extra))
            else:
                output.extend(
                    self.model.classify_by_index(words_idx, indices, pos_idx))
            true_y.extend(y)
        p_r_f_tags = precision_recall_fscore_support(true_y,
                                                     output,
                                                     average='macro')
        tag_summary = classification_report(
            true_y,
            output,
            labels=[i for i in xrange(len(idx_to_label_dict.items()))],
            target_names=[
                idx_to_label_dict[i]
                for i in xrange(len(idx_to_label_dict.items()))
            ])
        print tag_summary
        results = {
            "f1_rmtto": p_r_f_tags[2],
            "f1_rm": p_r_f_tags[2],
            "f1_tto1": p_r_f_tags[2],
            "f1_tto2": p_r_f_tags[2]
        }

        results.update({'f1_tags': p_r_f_tags[2], 'tag_summary': tag_summary})
        return results

    def train_net(self,
                  train_dialogues_filepath=None,
                  validation_dialogues_filepath=None,
                  model_dir=None,
                  tag_accuracy_file_path=None):
        """Train the internal deep learning model
        from a list of dialogue matrices.
        """
        tag_accuracy_file = open(tag_accuracy_file_path, "a")
        print "Verifying files..."
        for filepath in [
                train_dialogues_filepath, validation_dialogues_filepath
        ]:
            if not verify_dialogue_data_matrices_from_folder(
                    filepath,
                    word_dict=self.word_to_index_map,
                    pos_dict=self.pos_to_index_map,
                    tag_dict=self.tag_to_index_map,
                    n_lm=self.args.n_language_model_features,
                    n_acoustic=self.args.n_acoustic_features):
                raise Exception("Dialogue vectors in wrong format!\
                See README.md.")
        lr = self.args.lr  # even if decay, start with specific lr
        n_extra = self.args.n_language_model_features + \
            self.args.n_acoustic_features
        # validation matrices filepath much smaller so can store these
        # and preprocess them all:
        validation_matrices = [
            np.load(validation_dialogues_filepath + "/" + fp)
            for fp in os.listdir(validation_dialogues_filepath)
        ]
        validation_matrices = [
            dialogue_data_and_indices_from_matrix(
                d_matrix,
                n_extra,
                pre_seg=self.args.utts_presegmented,
                window_size=self.window_size,
                bs=self.args.bs,
                tag_rep=self.args.tags,
                tag_to_idx_map=self.tag_to_index_map,
                in_utterances=self.args.utts_presegmented)
            for d_matrix in validation_matrices
        ]
        idx_2_label_dict = {v: k for k, v in self.tag_to_index_map.items()}
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)
        start = 1  # by default start from the first epoch
        best_score = 0
        best_epoch = 0
        print "Net training started..."
        for e in range(start, self.args.n_epochs + 1):
            tic = time.time()
            epoch_folder = model_dir + "/epoch_{}".format(e)
            if not os.path.exists(epoch_folder):
                os.mkdir(epoch_folder)
            train_loss = 0
            # TODO: IO is slow; where memory allows, load everything in one go
            load_separately = True
            test = False
            if load_separately:
                for i, dialogue_f in enumerate(
                        os.listdir(train_dialogues_filepath)):
                    if test and i > 3:
                        break
                    print dialogue_f
                    d_matrix = np.load(train_dialogues_filepath + "/" +
                                       dialogue_f)
                    word_idx, pos_idx, extra, y, indices = \
                        dialogue_data_and_indices_from_matrix(
                                          d_matrix,
                                          n_extra,
                                          window_size=self.window_size,
                                          bs=self.args.bs,
                                          pre_seg=self.args.utts_presegmented
                                                              )
                    # for i in range(len(indices)):
                    #     print i, word_idx[i], pos_idx[i], \
                    #     y[i], indices[i]
                    train_loss += self.model.fit(word_idx,
                                                 y,
                                                 lr,
                                                 indices,
                                                 pos_idx=pos_idx,
                                                 extra_features=extra)
                    print '[learning] file %i >>' % (i+1),\
                        'completed in %.2f (sec) <<\r' % (time.time() - tic)
            # save the initial states we've learned to override the random
            self.initial_h0_state = self.model.h0.get_value()
            if self.args.model_type == "lstm":
                self.initial_c0_state = self.model.c0.get_value()
            # reset and evaluate simply
            self.reset()
            results = self.evaluate_fast_from_matrices(
                validation_matrices,
                tag_accuracy_file,
                idx_to_label_dict=idx_2_label_dict)
            val_score = results['f1_tags']  #TODO get best score type
            print "epoch training loss", train_loss
            print '[learning] epoch %i >>' % (e),\
                'completed in %.2f (sec) <<\r' % (time.time() - tic)
            print "validation score", val_score
            tag_accuracy_file.write(
                str(e) + "\n" + results['tag_summary'] + "\n%%%%%%%%%%\n")
            tag_accuracy_file.flush()
            print "saving model..."
            self.model.save(epoch_folder)  # Epoch file dump
            # checking patience and decay, if applicable
            # stopping criterion
            if val_score > best_score:
                self.model.save(model_dir)
                best_score = val_score
                print 'NEW BEST raw labels at epoch ', e, 'best valid',\
                    best_score
                best_epoch = e
            # stopping criteria = if no improvement in 10 epochs
            if e - best_epoch >= 10:
                print "stopping, no improvement in 10 epochs"
                break
            if self.args.decay and (e - best_epoch) > 1:
                # just a steady decay if things aren't improving for 2 epochs
                # a hidden hyperparameter
                decay_rate = 0.85
                lr *= decay_rate
                print "learning rate decayed, now ", lr
            if lr < 1e-5:
                print "stopping, below learning rate threshold"
                break
            print '[learning and testing] epoch %i >>' % (e),\
                'completed in %.2f (sec) <<\r' % (time.time()-tic)

        print 'BEST RESULT: epoch', best_epoch, 'valid score', best_score
        tag_accuracy_file.close()
        return best_epoch

    def incremental_output_from_file(self,
                                     source_file_path,
                                     target_file_path=None,
                                     is_asr_results_file=False):
        """Return the incremental output in an increco style
        given the incoming words + POS. E.g.:

        Speaker: KB3_1

        Time: 1.50
        KB3_1:1    0.00    1.12    $unc$yes    NNP    <f/><tc/>

        Time: 2.10
        KB3_1:1    0.00    1.12    $unc$yes    NNP    <rms id="1"/><tc/>
        KB3_1:2    1.12    2.00     because    IN    <rps id="1"/><cc/>

        Time: 2.5
        KB3_1:2    1.12    2.00     because    IN    <rps id="1"/><rpndel id="1"/><cc/>

        from an ASR increco style input without the POStags:

        or a normal style disfluency detection ground truth corpus:

        Speaker: KB3_1
        KB3_1:1    0.00    1.12    $unc$yes    NNP    <rms id="1"/><tc/>
        KB3_1:2    1.12    2.00     $because    IN    <rps id="1"/><cc/>
        KB3_1:3    2.00    3.00    because    IN    <f/><cc/>
        KB3_1:4    3.00    4.00    theres    EXVBZ    <f/><cc/>
        KB3_1:6    4.00    5.00    a    DT    <f/><cc/>
        KB3_1:7    6.00    7.10    pause    NN    <f/><cc/>


        :param source_file_path: str, file path to the input file
        :param target_file_path: str, file path to output in the above format
        :param is_asr_results_file: bool, whether the input is increco style
        """
        if target_file_path:
            target_file = open(target_file_path, "w")
        if not self.args.do_utt_segmentation:
            print "not doing utt seg, using pre-segmented file"
        if is_asr_results_file:
            return NotImplementedError
        if 'timings' in source_file_path:
            print "input file has timings"
            if not is_asr_results_file:
                dialogues = []
                IDs, timings, words, pos_tags, labels = \
                    get_tag_data_from_corpus_file(source_file_path)
                for dialogue, a, b, c, d in zip(IDs, timings, words, pos_tags,
                                                labels):
                    dialogues.append((dialogue, (a, b, c, d)))
        else:
            print "no timings in input file, creating fake timings"
            raise NotImplementedError

        for speaker, speaker_data in dialogues:
            # if "4565" in speaker: quit()
            print speaker
            self.reset()  # reset at the beginning of each dialogue
            if target_file_path:
                target_file.write("Speaker: " + str(speaker) + "\n\n")
            timing_data, lex_data, pos_data, labels = speaker_data
            # iterate through the utterances
            # utt_idx = -1
            current_time = 0
            for i in range(0, len(timing_data)):
                # print i, timing_data[i]
                _, end = timing_data[i]
                if (not self.args.do_utt_segmentation) \
                        and "<t" in labels[i]:
                    self.reset()  # reset after each utt if non pre-seg
                # utt_idx = frames[i]
                timing = None
                if 'timings' in source_file_path and self.args.use_timing_data:
                    timing = end - current_time
                word = lex_data[i]
                pos = pos_data[i]
                diff = self.tag_new_word(word,
                                         pos,
                                         timing,
                                         diff_only=True,
                                         rollback=0)
                current_time = end
                if target_file_path:
                    target_file.write("Time: " + str(current_time) + "\n")
                    new_words = lex_data[i - (len(diff) - 1):i + 1]
                    new_pos = pos_data[i - (len(diff) - 1):i + 1]
                    new_timings = timing_data[i - (len(diff) - 1):i + 1]
                    for t, w, p, tag in zip(new_timings, new_words, new_pos,
                                            diff):
                        target_file.write("\t".join(
                            [str(t[0]), str(t[1]), w, p, tag]))
                        target_file.write("\n")
                    target_file.write("\n")
            target_file.write("\n")

    def train_decoder(self, tag_file):
        raise NotImplementedError

    def save_decoder_model(self, dir_path):
        raise NotImplementedError
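
A hypothetical incremental driver for the tagger above: the constructor falls back to the default config shown in __init__, and the saved_model_dir path here is an assumption rather than a path taken from the project.

disf_tagger = DeepDisfluencyTagger(saved_model_dir="experiments/035/epoch_6")
for w in ["john", "likes", "uh", "loves", "mary"]:
    diff = disf_tagger.tag_new_word(w)   # returns only the changed tag suffix
    print w, diff
print disf_tagger.output_tags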
Example #11
class SimpleSLU:
    def __init__(self):
        self.__semantic_instance_list = []
        self.__speech_act_instance_list = []

        self.__semantic_model = None
        self.__speech_act_model = None

        self.__speech_act_lb = None

    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'rb') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)

        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append((unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        sa_label_list = []
        for sa in speech_act:
            sa_labels = ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list += sa_labels

        sa_label_list = sorted(set(sa_label_list))

        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))

        return True

    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.MultiLabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])

        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)

    def pred(self, utter):
        tokenized = self.__tokenize(utter)
        word_feats = ' '.join([word.lower() for word, _ in tokenized])

        pred_act = self.__speech_act_lb.inverse_transform(self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag([word.lower() for word, _ in tokenized])

        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
Example #12
X_train_preprocessed = []
print('\n\nPreprocessing The Data .......\n\n')
for sent in X_train:

    sent_processed = []
    sent = emoji.demojize(sent)
    sent = sent.lower()
    sent = re.sub(r"http\S+", "", sent)
    sent = re.sub(r"@\S+", "", sent)
    sent = sent.replace('url', '')
    words = tokenizer.tokenize(sent)

    words = Convert_Short_Hands(words)

    for word, tag in POS_tagger.tag(words):

        if word in apost_Dict:
            word = apost_Dict[word]
        word = remove_PunctuationAndNum(word)
        word = word.lower()

        if word != "":
            word = lemmatizer.lemmatize(word, tag_map[tag[0]])
            sent_processed.append(word)

    X_train_preprocessed.append(sent_processed)

# print(X_train_preprocessed)

t = keras.preprocessing.text.Tokenizer()
ct = CRFTagger()

ct.set_model_file("model.crf.tagger")

brown_sents = brown.sents()
size = int(len(brown_sents) * 0.7)

test_sents = brown_sents[size:]

flat_list = []
for sublist in test_sents:
    for item in sublist:
        flat_list.append(item)

l = ct.tag(flat_list)
y_pred = []

for each in l:
    y_pred.append(each[1])

#print(y_pred[:10])

tagged_sents = brown.tagged_sents(tagset="universal")[size:]

y_true = []
for each in tagged_sents:
    for e in each:
        y_true.append(e[1])

#print(y_true[0:10])
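
The two flat lists above line up token for token, so a quick score can be taken with scikit-learn (sketch):

from sklearn.metrics import accuracy_score
print(accuracy_score(y_true, y_pred))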
Example #14
from nltk.tag import CRFTagger

crflan = CRFTagger()
crf = CRFTagger()

crflan.set_model_file('model.crf.tagger')
crf.set_model_file('model1.crf.tagger')

print "Give a sentence..."
# Test
test_sent = raw_input()
test_sent = test_sent.decode('utf-8').split(' ')
print test_sent
half_ans = crflan.tag(test_sent)
print half_ans

# print test_sent
print crf.tag(test_sent)
Example #15
def onsentencelist():
        ct = CRFTagger()

        """sentencelist contains nertaged sentences"""
        sentencelist = pickle.load(open('sentencelist.pickle','rb'))

        """training size as percentage"""
        trainingsize = 0.9

        """ calculate where to split data """
        limit = round(trainingsize*len(sentencelist))

        """wordsentencelist contains the same sentences not ner-tagged"""
        wordsentencelist = pickle.load(open("wordsentencelist.pickle","rb"))

        
        """train the data / choose one of the 2 blocks """
        #train_data = sentencelist[:limit]
        #ct.train(train_data,'model.crf.tagger')
        ct.set_model_file('tweetmodel.crf.tagger')
        

        """Test data and evaluate"""
        test_data = wordsentencelist[limit:]
        ct.tag_sents(test_data) # tagging sentences
        gold_sentences = sentencelist[limit:]
        print("\nAccuracy:", ct.evaluate(gold_sentences))


        """ TURN TRAINED TAGGED LIST AND TEST LIST INTO ONE LIST CONTAINING
        ONLY THE TRUE AND PREDTAGS"""
        pred_nerlist = []
        for sentence in wordsentencelist[:limit]:
                for (word,nertag) in ct.tag(sentence):
                        #pred_nerlist.append((word,nertag))
                        pred_nerlist.append(nertag.lower())
                        
        true_nerlist = []
        #ct_true = gold_sentences
        for sentence in sentencelist[:limit]:
                for (word,nertag) in sentence:
                        #true_nerlist.append((word,nertag))
                        true_nerlist.append(nertag.lower())
        
        """ Print baseline """
        #print("\nBaseline = 0.9048987094135446 (everything tagged O)")

        
        """"Print F-score and confusion matrix """
        #print(len(pred_nerlist))
        #print(len(true_nerlist))
        
        """"Print F-score and confusion matrix """        
        print("\nF-score (micro):", f1_score(true_nerlist, pred_nerlist, average='micro') )
        print("\nF-score (macro):", f1_score(true_nerlist, pred_nerlist, average='macro') )
        print("\nF-score (weigthed):", f1_score(true_nerlist, pred_nerlist, average='weighted') )
        print("\nF-score (None):", f1_score(true_nerlist, pred_nerlist, average=None, labels=["o","b-per","i-per","b-loc","i-loc","b-org","i-org","b-misc","i-misc"]))
        
        
        print("\nConfusion matrix:\n")
        for item in ["O","B-per","I-per","B-loc","I-loc","B-org","I-org","B-misc","I-misc"]: print("  ",item,end="")
        print("\n",confusion_matrix(true_nerlist, pred_nerlist,labels = ["o","b-per","i-per","b-loc","i-loc","b-org","i-org","b-misc","i-misc"]))
Example #16
class SimpleSLU:
    def __init__(self):
        self.__semantic_instance_list = []
        self.__speech_act_instance_list = []

        self.__semantic_model = None
        self.__speech_act_model = None

        self.__speech_act_lb = None

    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'rb') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)

        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append((unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        sa_label_list = []
        for sa in speech_act:
            sa_labels = ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list += sa_labels

        sa_label_list = sorted(set(sa_label_list))

        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))

        return True

    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.LabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])

        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)

    def pred(self, utter):
        tokenized = self.__tokenize(utter)
        word_feats = ' '.join([word.lower() for word, _ in tokenized])

        pred_act = self.__speech_act_lb.inverse_transform(self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag([word.lower() for word, _ in tokenized])

        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
Example #17
class NamedEntityChunker(ChunkParserI):
    def __init__(self,
                 train_sents=None,
                 tagger="ClassifierBasedTagger",
                 model=None,
                 model_name="../results/modelCRF_featured",
                 entities=None,
                 language="english",
                 **kwargs):

        self.all_entities = []
        self.acronyms = []
        self.language = language

        if not model:
            assert isinstance(train_sents, Iterable)

        if tagger == "ClassifierBasedTagger":
            self.feature_detector = iob_features
            self.tagger = ClassifierBasedTagger(train=train_sents,
                                                feature_detector=iob_features,
                                                **kwargs)

        elif tagger == "CRFTagger":
            self.set_entities(entities)
            if not model:

                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.train(
                    train_data=train_sents,
                    model_file="../results/{}".format(model_name))
            else:
                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.set_model_file(model)
        else:
            raise Exception('Unknown tagger')

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        return chunks

    def get_position(self, w):
        positions = []
        for e in self.all_entities:
            if w in e:
                positions.append(e.index(w))
        return positions

    def get_positions(self, tokens, index):
        w = tokens[index][0]
        prev = tokens[index - 1][0]
        next = tokens[index + 1][0]
        positions = []
        for e in self.all_entities:
            if w in e and prev in e and next in e:
                positions.append(e.index(w))
        return list(set(positions))

    def set_entities(self, entities):
        if entities:

            entities = [l.split() for l in entities]

            for l in entities:
                if len(l) == 1 and is_all_caps(l[0]):
                    self.acronyms.append(l[0].lower())
                else:
                    self.all_entities.append([w.lower() for w in l])

            self.all_entities = list(
                set([tuple(entity) for entity in self.all_entities]))
            self.acronyms = list(set(self.acronyms))

            with open('../data/entities_{}.txt'.format(self.language),
                      'w') as f:
                f.write("\n".join(
                    [" ".join(line) for line in self.all_entities]))

            with open('../data/acronyms_{}.txt'.format(self.language),
                      'w') as f:
                # acronyms are stored as plain lower-cased strings, one per line
                f.write("\n".join(self.acronyms))
        else:
            with open('../data/entities_{}.txt'.format(self.language),
                      'r') as f:
                for line in f:
                    self.all_entities.append(line.strip().split())

            with open('../data/acronyms_{}.txt'.format(self.language),
                      'r') as f:
                for line in f:
                    self.acronyms.append(line.strip())

        self.all_entities = list(
            set([tuple(entity) for entity in self.all_entities]))
        self.acronyms = list(set(self.acronyms))

    def crf_features(self, tokens, index):
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        """

        # init the stemmer
        stemmer = SnowballStemmer(self.language)

        # Pad the sequence with start/end placeholder tokens so the
        # context-window features below never run off either end
        num_of_previous = 3
        num_of_posterior = 2
        tk = []
        for i in range(0, num_of_previous):
            tk.append(('[START{}]'.format(num_of_previous - i),
                       '[START{}]'.format(num_of_previous - i)))

        tk = tk + list(tokens)
        for i in range(1, num_of_posterior + 1):
            tk.append(('[END{}]'.format(i), '[END{}]'.format(i)))

        tokens = tk

        index += num_of_previous

        word, pos = tokens[index]

        contains_dash = ('–' in word or '-' in word or '_' in word)
        contains_dot = '.' in word

        prev2_words = tokens[index - 2][0] + "_._" + tokens[index - 1][0]
        prev2_pos = tokens[index - 2][1] + "_._" + tokens[index - 1][1]

        prev1_words = tokens[index - 1][0] + "_._" + tokens[index][0]
        prev1_pos = tokens[index - 1][1] + "_._" + tokens[index][1]
        prev1_lemma = stemmer.stem(
            tokens[index - 1][0]) + "_._" + stemmer.stem(tokens[index][0])

        next1_words = tokens[index][0] + "_._" + tokens[index + 1][0]
        next1_pos = tokens[index][1] + "_._" + tokens[index + 1][1]

        next2_words = tokens[index + 1][0] + "_._" + tokens[index + 2][0]
        next2_pos = tokens[index + 1][1] + "_._" + tokens[index + 2][1]

        allcaps = is_all_caps(word)
        strange_cap = (word[0] not in string.ascii_uppercase
                       and word != word.lower())

        # each entity is stored as a tuple of lower-cased words, so membership
        # has to be checked entity by entity
        inside_ent = any(word.lower() in entity for entity in self.all_entities)
        is_acronym = word.lower() in self.acronyms
        features = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-caps': allcaps,
            'strange-cap': strange_cap,
            'prev2-pos': prev2_pos,
            'prev2-word': prev2_words,
            'next2-pos': next2_pos,
            'next2-word': next2_words,
            'prev1-pos': prev1_pos,
            'prev1-word': prev1_words,
            'prev1-lemma': prev1_lemma,
            'next1-pos': next1_pos,
            'next1-word': next1_words,
        }

        features['inside-entities'] = inside_ent
        if is_acronym:
            features['is-acronym'] = is_acronym

        positions = self.get_position(word.lower())
        for p in positions:
            features['position-{}'.format(p)] = True
        features['total-position-{}'.format(len(positions))] = True

        if contains_dash:
            features['contains-dash'] = contains_dash
        if contains_dot:
            features['contains-dot'] = contains_dot

        for i in range(1, num_of_previous + 1):
            word, pos = tokens[index - i]
            lemma = stemmer.stem(word)

            features['prev-{}-word'.format(i)] = word
            features['prev-{}-pos'.format(i)] = pos

            features['prev-{}-lemma'.format(i)] = lemma

        for i in range(1, num_of_posterior + 1):
            word, pos = tokens[index + i]
            inside_ent = any(word.lower() in entity
                             for entity in self.all_entities)

            features['next-{}-word'.format(i)] = word
            features['next-{}-pos'.format(i)] = pos
            features['next-{}-inside-ent'.format(i)] = inside_ent

        return features
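A minimal usage sketch for the chunker above, assuming a CRF model has already been trained at the default results path and that the entity/acronym gazetteer files for the chosen language already exist under ../data/ (both paths are assumptions taken from the snippet, and the sample sentence is invented):

# hypothetical usage: load an existing CRF model instead of retraining
MODEL_PATH = "../results/modelCRF_featured"  # assumed to exist already

chunker = NamedEntityChunker(tagger="CRFTagger",
                             model=MODEL_PATH,  # skips training, just loads the model file
                             entities=None,     # falls back to the saved gazetteer files
                             language="english")

sentence = [("Barack", "NNP"), ("Obama", "NNP"), ("visited", "VBD"), ("Paris", "NNP")]
print(chunker.parse(sentence))  # [((word, pos), label), ...] straight from CRFTagger.tag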
Example #18
0
test_data_new = []
test_data_tags = []
for i in range(len(test_set)):
    if len(test_set[i]) != 0:
        for j in range(len(test_set[i])):
            test_data_new.append(test_set[i][j][0])
            test_data_tags.append(test_set[i][j][1])
gold_sentences = test_data_new
# print ct.evaluate(gold_sentences)

# print test_data_new
pred_tags = []
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

pred = ct.tag(gold_sentences)
for i in range(len(pred)):
    pred_tags.append(pred[i][1])

for i in range(len(test_data_tags)):
    refsets[test_data_tags[i]].add(i)
    testsets[pred_tags[i]].add(i)

print "CRF language model"
print 'Accuracy:', accuracy(pred_tags, test_data_tags)
print "\n"
print 'Precision of en:', precision(refsets['en'], testsets['en'])
print 'Precision of hi:', precision(refsets['hi'], testsets['hi'])
print "\n"
print 'Recall of en:', recall(refsets['en'], testsets['en'])
print 'Recall of hi:', recall(refsets['hi'], testsets['hi'])
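The same refsets/testsets bookkeeping also yields a per-tag F-measure; a small optional extension (not part of the original snippet) using the same nltk.metrics module:

from nltk.metrics import f_measure  # same module that provides precision/recall

print('F-measure of en:', f_measure(refsets['en'], testsets['en']))
print('F-measure of hi:', f_measure(refsets['hi'], testsets['hi']))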
Example #19
0
 ct = CRFTagger()  # initialize tagger
 ct.set_model_file(TAGGER_PATH)
 dialogue_speakers = []
 for disf_file in DISFLUENCY_TEST_FILES:
     IDs, mappings, utts, pos_tags, labels = \
         load_data_from_disfluency_corpus_file(disf_file)
     dialogue_speakers.extend(sort_into_dialogue_speakers(IDs,
                                                          mappings,
                                                          utts,
                                                          pos_tags,
                                                          labels))
 word_pos_data = {}  # map from the file name to the data
 for data in dialogue_speakers:
     dialogue, a, b, c, d = data
     word_pos_data[dialogue] = (a, b, c, d)
 ct.tag([unicode(w) for w in "uh my name is john".split()])
 # either gather training data or test data
 training_data = []
 for speaker in word_pos_data.keys():
     # print speaker
     sp_data = []
     prefix = []
     predictions = []
     for word, pos in zip(word_pos_data[speaker][1],
                          word_pos_data[speaker][2]):
         prefix.append(unicode(word.replace("$unc$", "")
                               .encode("utf8")))
         prediction = ct.tag(prefix[-5:])[-1][1]
         sp_data.append((unicode(word.replace("$unc$", "")
                                 .encode("utf8")),
                         unicode(pos.encode("utf8"))))