Example #1
    def test_senna_ner_tagger(self):
        nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH)
        result_1 = nertagger.tag("Shakespeare theatre was in London .".split())
        expected_1 = [
            ("Shakespeare", "B-PER"),
            ("theatre", "O"),
            ("was", "O"),
            ("in", "O"),
            ("London", "B-LOC"),
            (".", "O"),
        ]

        result_2 = nertagger.tag("UN headquarters are in NY , USA .".split())
        expected_2 = [
            ("UN", "B-ORG"),
            ("headquarters", "O"),
            ("are", "O"),
            ("in", "O"),
            ("NY", "B-LOC"),
            (",", "O"),
            ("USA", "B-LOC"),
            (".", "O"),
        ]
        self.assertEqual(result_1, expected_1)
        self.assertEqual(result_2, expected_2)
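For anyone running this test locally, a minimal standalone sketch follows. It assumes nltk is installed and that SENNA_EXECUTABLE_PATH (a placeholder) points at an unpacked SENNA 3.0 directory containing the senna executable.

from nltk.tag import SennaNERTagger

SENNA_EXECUTABLE_PATH = '/path/to/senna'  # assumption: local SENNA install
nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH)
# tag() takes a pre-tokenized sentence and returns (token, IOB tag) pairs
print(nertagger.tag('Shakespeare theatre was in London .'.split()))
# per the test above: [('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'),
#                      ('in', 'O'), ('London', 'B-LOC'), ('.', 'O')]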
Example #4
from enchant.checker import SpellChecker
from nltk.tag import SennaNERTagger

def spell_checker(l):
    # Tag each misspelled word with SENNA and only offer corrections for
    # words that are not named entities ('O' = outside any entity span).
    ner = SennaNERTagger(r"path_to_senna")
    chkr = SpellChecker("en_US")
    chkr.set_text(l)
    for err in chkr:
        if ner.tag(err.word.split())[0][1] == "O":
            suggestions = err.suggest()
            print(err.word)
            if suggestions:
                print("Did you mean: ", suggestions[0])
Example #5
def senna_NER(text):
    nertagger = SennaNERTagger(
        'C:\\Users\\Clara2\\Downloads\\senna-v3.0\\senna-v3.0\\senna')
    text = text.split('\n')

    tagged_sentences = []
    for sentence in text:
        sentence = sentence.split()
        tagged_sentences.append(nertagger.tag(sentence))
    # flatten the per-sentence results into one list of (token, tag) pairs
    locations = [j for i in tagged_sentences for j in i]
    return locations
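Despite its name, senna_NER returns every (token, tag) pair, not just locations. A small follow-on filter, assuming the output format above, keeps only location tokens:

def extract_locations(tagged_pairs):
    # keep tokens SENNA labeled as beginning or inside a location span
    return [token for token, tag in tagged_pairs if tag in ('B-LOC', 'I-LOC')]

extract_locations(senna_NER('UN headquarters are in NY , USA .'))
# hypothetical result, consistent with Example #1's expected tags: ['NY', 'USA']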
Example #6
def answer(speech, user_prefix):
    #tokens = nltk.word_tokenize(speech)
    #tagged = nltk.pos_tag(tokens)
    topic_obj = TopicExtractor(speech)
    result = topic_obj.extract()
    try:
        print(("~Topic: " + result[0]))
    except Exception:
        pass
    speech = speech.upper()
    if any(k in speech for k in
           ("WHEN", "BIRTHDATE", "DATE ", " DATE", "BORN")):
        with nostdout():
            with nostderr():
                try:
                    wikipage = wikipedia.page(result[0])
                    # replace non-ASCII characters with spaces
                    wikicontent = "".join([
                        i if ord(i) < 128 else ' '
                        for i in wikipage.content
                    ])
                    # print(TimeDetector.tag(wikicontent))
                    count = Counter(TimeDetector.tag(wikicontent))
                    return count.most_common()[0][0]
                except Exception:
                    return noanswer(user_prefix)
    elif any(k in speech for k in
             ("WHERE", "LOCATION", "ADDRESS", "COUNTRY", "CITY", "STREET")):
        with nostdout():
            with nostderr():
                try:
                    wikipage = wikipedia.page(result[0])
                    wikicontent = "".join([
                        i if ord(i) < 128 else ' '
                        for i in wikipage.content
                    ])
                    # drop parenthesized asides before tagging
                    wikicontent = re.sub(r'\([^)]*\)', '', wikicontent)
                    nertagger = SennaNERTagger('/usr/share/senna')
                    tagged = nertagger.tag(wikicontent.split())
                    # return the first token tagged as starting a location
                    for tag in tagged:
                        if tag[1] == 'B-LOC':
                            return tag[0]
                    return noanswer(user_prefix)
                except Exception:
                    return noanswer(user_prefix)
    else:
        with nostdout():
            with nostderr():
                try:
                    summary = wikipedia.summary(result[0], sentences=1)
                    summary = "".join(
                        [i if ord(i) < 128 else ' ' for i in summary])
                    summary = re.sub(r'\([^)]*\)', '', summary)
                    return summary
                except Exception:
                    return noanswer(user_prefix)
Example #7
#This program extracts an author's name from this document
# NOTE: this snippet is Python 2 (print statements, str.decode)
from nltk.tag import SennaNERTagger
import os
import re

nertagger = SennaNERTagger('/home/bhardwaj/Documents/zipped folders/senna')

XMLfolder = '/home/bhardwaj/Documents/Conference Dataset/ICDAR Downloads Test/ICDAR2015/papers/XML/'
year = '2015'
for file in os.listdir(XMLfolder):
    name = []
    country = ''
    keywords = []

    abspath = os.path.join(XMLfolder, file)
    print abspath
    f = open(abspath, 'r')
    for line in f:
        if '<country>' in line:
            country = line[9:-11]
            print country
            break
        #print line
        # pad commas with spaces and drop the space before a period so
        # SENNA sees cleanly separated tokens
        newline = re.sub(r'[,]', ' , ', line)
        newline = re.sub(r' \.', '.', newline)
        # to remove all non-ascii characters from the line:
        #print re.sub(r'\W+',' ',line)
        s = nertagger.tag(newline.decode('utf8').split())
        #print s

Example #8
                arg = labels[i][1][2][2:]
                if 'A1' == arg:
                    EventStructures['Who'] = text
                elif 'A2' == arg:
                    EventStructures['Whom'] = text
                text = labels[i][1][0]
                Args.append(text)
        else:
            text += ' ' + labels[i][1][0]

    print(EventStructures)
    return Args


srltagger = SennaSRLTagger(path)
nertagger = SennaNERTagger(path)
chktagger = SennaChunkTagger(path)
tagger = SennaTagger(path)

#w = s.tag("Are you studying here?".split())
#w = s.tag("""A general interface to the SENNA pipeline that supports any of the operations specified in SUPPORTED OPERATIONS..""".split())

#print(tagger.tag(sents))
#print('\n___________________\n')
#print(chktagger.tag(sents))
#print('\n___________________\n')
#print(nertagger.tag(sents))
#print('\n___________________\n')
#print(srltagger.tag(sents))
#print('\n___________________\n')
#text = sent
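The commented-out lines above sketch how the taggers would be exercised. Below is a cleaned-up, runnable version for the three Senna taggers that ship with nltk; path and sents are placeholders, and SennaSRLTagger is omitted since it does not appear to be part of nltk (the fragment presumably imports it from elsewhere).

from nltk.tag import SennaTagger, SennaChunkTagger, SennaNERTagger

path = '/usr/share/senna'  # assumption: local SENNA install
sents = 'A general interface to the SENNA pipeline .'.split()

tagger = SennaTagger(path)          # part-of-speech tags
chktagger = SennaChunkTagger(path)  # IOB chunk tags
nertagger = SennaNERTagger(path)    # IOB named-entity tags

for t in (tagger, chktagger, nertagger):
    print(t.tag(sents))
    print('\n___________________\n')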
Example #10
class NerNetwork:
    def __init__(self, embedder, tag_vocab, ner_vocab, pos_vocab, sess=None):

        # check gpu
        if not check_gpu_existence():
            raise RuntimeError('Ontonotes NER model requires GPU with cuDNN!')

        n_hidden = (256, 256, 256)
        token_embeddings_dim = 100
        n_tags = len(tag_vocab)

        # Create placeholders
        x_word = tf.placeholder(dtype=tf.float32,
                                shape=[None, None, token_embeddings_dim],
                                name='x_word')
        x_char = tf.placeholder(dtype=tf.int32,
                                shape=[None, None, None],
                                name='x_char')

        # Features
        x_pos = tf.placeholder(dtype=tf.float32,
                               shape=[None, None, len(pos_vocab)],
                               name='x_pos')  # Senna
        x_ner = tf.placeholder(dtype=tf.float32,
                               shape=[None, None, len(ner_vocab)],
                               name='x_ner')  # Senna
        x_capi = tf.placeholder(dtype=tf.float32,
                                shape=[None, None],
                                name='x_capi')

        y_true = tf.placeholder(dtype=tf.int32,
                                shape=[None, None],
                                name='y_tag')
        mask = tf.placeholder(dtype=tf.float32,
                              shape=[None, None],
                              name='mask')
        sequence_lengths = tf.reduce_sum(mask, axis=1)

        # Concat features to embeddings
        emb = tf.concat(
            [x_word, tf.expand_dims(x_capi, 2), x_pos, x_ner], axis=2)

        # The network
        units = emb
        for n, n_h in enumerate(n_hidden):
            with tf.variable_scope('RNN_' + str(n)):
                units, _ = cudnn_bi_lstm(units, n_h,
                                         tf.to_int32(sequence_lengths))

        # Classifier
        with tf.variable_scope('Classifier'):
            units = tf.layers.dense(units,
                                    n_hidden[-1],
                                    kernel_initializer=xavier_initializer())
            logits = tf.layers.dense(units,
                                     n_tags,
                                     kernel_initializer=xavier_initializer())

        # CRF
        _, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_true, sequence_lengths)

        # Initialize session
        if sess is None:
            sess = tf.Session()

        self._ner_tagger = SennaNERTagger('download/senna/')
        self._pos_tagger = SennaChunkTagger('download/senna/')

        self._x_w = x_word
        self._x_c = x_char
        self._x_capi = x_capi
        self.x_pos = x_pos
        self.x_ner = x_ner
        self._y_true = y_true
        self._mask = mask
        self._sequence_lengths = sequence_lengths
        self._token_embeddings_dim = token_embeddings_dim

        self._pos_dict = pos_vocab
        self._ner_dict = ner_vocab
        self._tag_dict = tag_vocab

        self._logits = logits
        self._transition_params = transition_params

        self._sess = sess
        sess.run(tf.global_variables_initializer())
        self._embedder = embedder

    def load(self, model_file_path):
        saver = tf.train.Saver(tf.trainable_variables())
        saver.restore(self._sess, model_file_path)

    @staticmethod
    def to_one_hot(x, n):
        b = np.zeros([len(x), n], dtype=np.float32)
        for q, tok in enumerate(x):
            b[q, tok] = 1
        return b
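    # Note: to_one_hot turns a sequence of class indices into a one-hot
    # float32 matrix. As a @staticmethod it can be sanity-checked without
    # building the network, e.g. NerNetwork.to_one_hot([1, 0, 2], 4)
    # -> [[0,1,0,0], [1,0,0,0], [0,0,1,0]] (float32).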

    def tokens_batch_to_numpy_batch(self, batch_x):
        """ Convert a batch of tokens to numpy arrays of features"""
        x = dict()
        batch_size = len(batch_x)
        max_utt_len = max([len(utt) for utt in batch_x])

        # Embeddings
        x['emb'] = self._embedder(batch_x)

        # Capitalization
        x['capitalization'] = np.zeros([batch_size, max_utt_len],
                                       dtype=np.float32)
        for n, utt in enumerate(batch_x):
            x['capitalization'][n, :len(utt)] = [
                tok[0].isupper() for tok in utt
            ]

        # POS
        n_pos = len(self._pos_dict)
        x['pos'] = np.zeros([batch_size, max_utt_len, n_pos])
        for n, utt in enumerate(batch_x):
            token_tag_pairs = self._pos_tagger.tag(utt)
            pos_tags = list(zip(*token_tag_pairs))[1]
            pos = np.array([self._pos_dict[p] for p in pos_tags])
            pos = self.to_one_hot(pos, n_pos)
            x['pos'][n, :len(pos)] = pos

        # NER
        n_ner = len(self._ner_dict)
        x['ner'] = np.zeros([batch_size, max_utt_len, n_ner])
        for n, utt in enumerate(batch_x):
            token_tag_pairs = self._ner_tagger.tag(utt)
            ner_tags = list(zip(*token_tag_pairs))[1]
            ner = np.array([self._ner_dict[p] for p in ner_tags])
            ner = self.to_one_hot(ner, n_ner)
            x['ner'][n, :len(ner)] = ner

        # Mask for paddings
        x['mask'] = np.zeros([batch_size, max_utt_len], dtype=np.float32)
        for n in range(batch_size):
            x['mask'][n, :len(batch_x[n])] = 1

        return x

    def train_on_batch(self, x_word, x_char, y_tag):
        raise NotImplementedError

    def predict(self, x):
        feed_dict = self._fill_feed_dict(x)
        y_pred = []
        logits, trans_params, sequence_lengths = self._sess.run(
            [self._logits, self._transition_params, self._sequence_lengths],
            feed_dict=feed_dict)

        # iterate over the sentences because no batching in viterbi_decode
        for logit, sequence_length in zip(logits, sequence_lengths):
            logit = logit[:int(sequence_length)]  # keep only the valid steps
            viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
                logit, trans_params)
            y_pred += [viterbi_seq]

        pred = []
        batch_size = len(x['emb'])
        for n in range(batch_size):
            pred.append([self._tag_dict[tag] for tag in y_pred[n]])
        return pred

    def predict_on_batch(self, tokens_batch):
        batch_x = self.tokens_batch_to_numpy_batch(tokens_batch)
        # Prediction indices
        predictions_batch = self.predict(batch_x)
        predictions_batch_no_pad = list()
        for n, predicted_tags in enumerate(predictions_batch):
            predictions_batch_no_pad.append(
                predicted_tags[:len(tokens_batch[n])])
        return predictions_batch_no_pad

    def _fill_feed_dict(self, x):

        feed_dict = dict()
        feed_dict[self._x_w] = x['emb']
        feed_dict[self._mask] = x['mask']

        feed_dict[self.x_pos] = x['pos']
        feed_dict[self.x_ner] = x['ner']

        feed_dict[self._x_capi] = x['capitalization']
        return feed_dict
class DomainAdaptation():
    """ This will make it easy to compare different NER classifiers, domain,
    adaptation techniques and features. One can train the model on a corpus
    or use a pre-trained model, and test the method on a labeled dataset.

    """
    def __init__(self, verbose=False):
        self._verbose = verbose
        self.pretrained_model = 'None'  # This is changed to something else if using a pretrained model.
        self.model = {}

    def _set_parameter(self, paramname, classifier_name, defaultvalue):
        """ Raise ValueError if the wrong parameter name (paramname) is given and
        the dictionary self.parameters in not empty, and
        save the new parameters into self.parameters. If no parameters are given
        the defaultvalue is used.

        This is used by train method of DomainAdaptation to set default parameters.

        Parameters
        ----------
        paramname : str, name of parameter to set
        defaultvalue : the default value
        """
        if not self.parameters.has_key(paramname) and len(
                self.parameters.keys()) != 0:
            self.parameters = {}
            raise ValueError('Optional argument for ' + classifier_name +
                             ' must be ' + paramname)
        else:
            param = self.parameters.get(paramname, defaultvalue)
            self.parameters[paramname] = param

    def get_featurefunc(self, features_name, transfer_method, src_train,
                        tgt_train, tgt_test):
        if features_name == 'ZhangJohnson':
            features = ZhangJohnson
        elif features_name == 'word_embedding':
            if transfer_method == 'src': allsentences = src_train + tgt_test
            if transfer_method == 'tgt': allsentences = tgt_train + tgt_test
            if transfer_method == '_pred': allsentences = tgt_train + tgt_test
            if transfer_method in ['all', 'augment', 'pred']:
                allsentences = src_train + tgt_train + tgt_test
            allwords = get_unique_words(allsentences)
            print 'Obtaining word embedding information.'
            wordEmbeddings, word2Idx = get_word_embeddings(
                embeddingsPath, allwords)
            features = wordembeddings_as_features(wordEmbeddings, word2Idx)
            print 'Done obtaining word embeddings to use as features.'
        else:
            raise ValueError("features name is incorrect.")
        return features

    def train(self,
              transfer_method,
              classifier_name,
              src_train,
              tgt_train,
              tgt_test,
              features_name='ZhangJohnson',
              **kwargs):
        """ Train the model with a given classifier and with a given domain
        adaptation method (preprocessing or post-processing).

        Parameters
        ----------
        transfer_method : str
            The name of the transfer method to use. They can be:
                * src: Train the model with source training data
                * tgt: Train the model with target training data
                * all: Train the model with both source and target data
                * augment:
                    Train the model with both source and target data, but
                    enlarge the feature space using Daume's easyadapt
                    method [1]: if token i is in the source data, use
                    feature (x_i, x_i, 0) instead of x_i for each feature;
                    if the token is in the target data, use (x_i, 0, x_i).
                    The first entry of the tuple stands for 'general' features,
                    the second is 'source only', and the third is 'target only'.
                * pred: The 'PRED' method, described in Daume (#TODO put other
                    references in here).
                    Unlike the other methods, the train function both trains
                    and tests, and saves the result. Calling 'test' merely
                    prints the score.
                    This permits another optional keyword argument, 'no_prefix':
                    if True, it removes the 'I-' or 'B-' prefix from the PRED
                    feature.


        classifier_name : str
            The name of the classifier to use. Roughly in order of performance:
                * CRF: the CRFTagger from nltk, which calls external CRFSuite.
                  Optional keyword parameter: 'algorithm', which can be either
                  'l2sgd' or 'lbfgs'. If not given, 'lbfgs' is used.
                * averaged_perceptron: the averaged perceptron from nltk
                * megam: nltk's binding for Daume's external megam program
                * IIS: Improved Iterative Scaling, via nltk
                * GIS: Generalized Iterative Scaling, via nltk
                * naivebayes: Naive Bayes from nltk.

        features_name : str or list
            Which features to use. Can be:
                * 'ZhangJohnson': The features used in Zhang and Johnson (2003).
                * 'word_embedding': Word embedding only.
                * a list containing any combination of the above options.

        src_train, tgt_train, tgt_test : lists
            Each of these is a list of lists, with entries of the form:
                    ( (word, pos, domain), entity )
            For now tgt_test is needed as an argument in order to get the full
            vocabulary for word embeddings.

        **kwargs:
            if classifier_name is 'averaged_perceptron':
                'num_iterations', default: 5 (same as nltk's default)
            if classifier_name is 'megam', 'IIS', or 'GIS':
                'gauss_prior_sigma', default: 1.0 (same as nltk's default)

        References
        ----------
        [1] Daumé III, Hal. "Frustratingly easy domain adaptation." arXiv
            preprint arXiv:0907.1815 (2009).

        [2] L-BFGS: http://aria42.com/blog/2014/12/understanding-lbfgs
                    http://www.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf

        Remarks
        -------
        On speed and memory:
            * megam is slow and memory intensive, though using the optimized
              megam (megam opt) does help.
            * IIS and GIS are pure python and slower than megam.

        """

        #######################################################################
        ## Store model information
        #######################################################################

        self.model['entity_types_src'] = sentence_utils.get_tagset(
            src_train, with_prefix=False)
        self.model['entity_types_tgt'] = sentence_utils.get_tagset(
            tgt_test + tgt_train, with_prefix=False)

        #self.classifier = classifier_name #unused.
        self.transfer_method = transfer_method
        self.parameters = kwargs
        #print '... called train. These are the parameters: ', self.parameters
        #self.featurelist = features.keys()
        if self._verbose:
            print "Transfer Learning: ", transfer_method, "  Classifier: ", classifier_name
        print 'kwargs', kwargs
        print 'exclude_O', self.parameters.get('exclude_O')
        #######################################################################
        ## Determine which features to use
        #######################################################################
        #TODO make an option so can choose whether to augment the word-embeddings with the other features or not...
        if isinstance(features_name, str):
            features = self.get_featurefunc(features_name, transfer_method,
                                            src_train, tgt_train, tgt_test)

        if isinstance(features_name, list):
            featureslist = []
            for featname in features_name:
                f = self.get_featurefunc(featname, transfer_method, src_train,
                                         tgt_train, tgt_test)
                featureslist.append(f)
            print 'Combining features...'
            features = combine_several_featfunctions(featureslist)

    #######################################################################
    ## Transfer Learning Options (specify training data & preprocessing)
    #######################################################################

        if transfer_method in ['src', 'tgt', 'all']:
            features_used = features
            if transfer_method == 'src': train_data = src_train
            if transfer_method == 'tgt': train_data = tgt_train
            if transfer_method == 'all':
                train_data = src_train + tgt_train  #self.all_train

        elif transfer_method == 'augment':
            train_data = src_train + tgt_train  #self.all_train

            def augment_features(tokens, index, history):
                word, pos, domain = tokens[index]
                fts = features(tokens, index, history)
                for key in fts.keys():
                    if domain == 'src':
                        fts[domain + '-' + key] = fts[key]
                        fts['tgt' + '-' + key] = 0
                    else:
                        fts[domain + '-' + key] = fts[key]
                        fts['src' + '-' + key] = 0
                return fts

            features_used = augment_features
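            # For intuition: given a hypothetical source-domain token whose
            # base features are {'word': 'Paris'}, augment_features returns
            # {'word': 'Paris', 'src-word': 'Paris', 'tgt-word': 0}: the
            # 'general' copy plus the domain-specific copies of Daume [1].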

        elif transfer_method == '_pred':  # this is not to be called directly;
            # It is used by pred, to train the second classifier.
            train_data = tgt_train

            no_prefix = self.parameters.get('no_prefix')
            with_cca = self.parameters.get('with_cca')
            kdim = self.parameters.get('kdim')
            exclude_O = self.parameters.get('exclude_O')

            if with_cca:
                label2vec = cca(src_train + tgt_train,
                                no_prefix=no_prefix,
                                k=kdim,
                                exclude_O=exclude_O)

            def pred_features(tokens, index, history):
                PRED = tokens[index][3]
                fts = features(tokens, index, history)
                if with_cca:
                    for i in range(kdim):
                        fts['PRED-cca-' + str(i)] = label2vec[PRED][i]
                else:
                    fts['PRED'] = PRED

                return fts

            def pred_features_noprefix(tokens, index, history):
                PRED = tokens[index][3]
                fts = features(tokens, index, history)
                # remove prefix 'I-' or 'B-':
                if PRED != 'O':
                    PRED = PRED[2:]

                if with_cca:
                    for i in range(kdim):
                        fts['PRED-cca-' + str(i)] = label2vec[PRED][i]
                else:
                    fts['PRED'] = PRED

                return fts

            if no_prefix:
                features_used = pred_features_noprefix
            else:
                features_used = pred_features  # default

        elif transfer_method == 'pred':
            no_prefix = self.parameters.get('no_prefix')
            with_cca = self.parameters.get('with_cca')
            kdim = self.parameters.get('kdim')
            if kdim is None:
                kdim = 5
            exclude_O = self.parameters.get('exclude_O')

            # TODO test this (i.e., using two different classifiers)
            if isinstance(classifier_name,
                          list):  # names of the two classifiers, in order
                classifier_name1 = classifier_name[0]
                classifier_name2 = classifier_name[1]
            else:
                classifier_name1 = classifier_name2 = classifier_name

            print('Training first classifier.')
            self.train('src',
                       classifier_name1,
                       src_train,
                       tgt_train,
                       tgt_test,
                       features_name=features_name)

            # FIRST: Use classifier on both the tgt_test and tgt_train
            print('Tagging tgt test data.')
            test_input_sentences = [zip(*t)[0] for t in tgt_test]
            test_predsents = self.NER.tag_sents(test_input_sentences)
            # flatten them:
            test_augmented = [[
                tuple(list(f) + [zip(*p)[1][i]])
                for i, f in enumerate(zip(*p)[0])
            ] for p in test_predsents]
            tgt_test = [
                zip(x, [iob for (x, iob) in tgt_test[i]])
                for i, x in enumerate(test_augmented)
            ]
            # This is a list of lists of the form ((word, pos, dom, pred), iob)

            print('Tagging tgt train data.')
            train_input_sentences = [zip(*t)[0] for t in tgt_train]
            train_predsents = self.NER.tag_sents(train_input_sentences)
            train_augmented = [[
                tuple(list(f) + [zip(*p)[1][i]])
                for i, f in enumerate(zip(*p)[0])
            ] for p in train_predsents]
            tgt_train = [
                zip(x, [iob for (x, iob) in tgt_train[i]])
                for i, x in enumerate(train_augmented)
            ]

            # SECOND: train another classifier on the tgt_train data, with
            # the appended features from the first classifier.
            print('Training second classifier.\n')
            self.train('_pred',
                       classifier_name2,
                       src_train,
                       tgt_train,
                       tgt_test,
                       features_name=features_name,
                       kdim=kdim,
                       no_prefix=no_prefix,
                       with_cca=with_cca,
                       exclude_O=exclude_O)
            #features_used = features # the features.py takes care of it
            classifier_name = 'none'  # to prevent from continuing a second time.

            #self.predscore = self.test(tgt_test)
            self.predscore = self.evaluate(tgt_test)
            ##print self.predscore
            self.transfer_method = 'pred'  # because the recursion will have changed it.

        else:
            pass

    #######################################################################
    ## Classifier Options: specifies which classifier to use and train
    #######################################################################
    # With 'megam', 'IIS', 'GIS' and 'naivebayes', will use
    # ClassifierBasedTagger to train the model.
        if classifier_name in ['megam', 'IIS', 'GIS', 'naivebayes']:
            if classifier_name == 'naivebayes':
                print "Training the model now..."
                classifier = NaiveBayesClassifier.train
                # NOTE Naive bayes works poorly with augment (due to the
                # breaking down of the independence assumption). This is
                # described in:
                #      Sutton and McCallum, An Introduction to Conditional
                #      Random Fields, p.16.

            if classifier_name in ['megam', 'IIS', 'GIS']:
                print "Training the model now..."
                if classifier_name in ['IIS', 'GIS']:
                    print("megam is recommended instead of IIS or GIS.")
                # NOTE: Though GIS and IIS cases also take gaussian_prior_sigma,
                #       they don't use it.  It only applies to megam.
                self._set_parameter('gauss_prior_sigma', classifier_name, 1.0)
                gauss_prior_sigma = self.parameters['gauss_prior_sigma']
                classifier = lambda traindata: MaxentClassifier.train(
                    traindata,
                    algorithm=classifier_name,
                    gaussian_prior_sigma=gauss_prior_sigma,
                    trace=3 * self._verbose)

            self.NER = ClassifierBasedTagger(
                train=train_data,
                feature_detector=features_used,
                classifier_builder=classifier,
                verbose=self._verbose,
            )

        if classifier_name == 'averaged_perceptron':
            print "Training the model now..."
            self._set_parameter('num_iterations', classifier_name, 5)
            num_iter = self.parameters['num_iterations']

            self.NER = PerceptronNER(feature_detector=features_used,
                                     verbose=self._verbose)
            self.NER.train(train_data, num_iterations=num_iter)

        if classifier_name == 'CRF':
            crfalgorithm = self.parameters.get('algorithm')
            if crfalgorithm is None:
                crfalgorithm = 'lbfgs'  #'l2sgd'
                self.parameters['algorithm'] = crfalgorithm
            else:
                if crfalgorithm not in {'l2sgd', 'lbfgs'}:
                    raise ValueError("algorithm must be l2sgd' or 'lbfgs'.")

            print "Training the model now..."
            self.NER = CRFTagger(
                feature_detector=features_used,
                verbose=self._verbose,  # more training options possible.
                algorithm=crfalgorithm  #'lbfgs' #'l2sgd' # lbfgs
            )
            self.NER.train(train_data, 'model.crf.tagger')

        if classifier_name not in {
                'CRF', 'averaged_perceptron', 'megam', 'IIS', 'GIS',
                'naivebayes', 'none'
        }:
            raise ValueError("Wrong classifier name.")

    def load_pretrained_model(self,
                              modelname='pretrained-StanfordNER',
                              numclass=3):
        """ Loads a pre-trained model.

        Parameters
        ----------
        modelname : str
            The name of the pre-trained model to use. The options are:
                * 'pretrained-StanfordNER': Used a CRF and word embeddings.
                    See: https://nlp.stanford.edu/software/CRF-NER.shtml
                * 'pretrained-MITIE': Used Structural SVMs and word embeddings.
                    Uses Dhillon et al's "eigenwords" word embeddings.
                    See: https://github.com/mit-nlp/MITIE
                * 'pretrained-SENNA': Used multilayer perceptrons and the
                    50-dimensional CW (2008) word embeddings.
                    See: http://ml.nec-labs.com/senna/
                * 'pretrained-spacy': Used BILOU scheme; the algorithm is "a
                    pastiche of well-known methods...a greedy transition-based
                    parser guided by a linear model whose weights are learned
                    using the averaged perceptron loss, via the dynamic oracle
                    imitation strategy". See:
                    https://spacy.io/docs/usage/entity-recognition.
                    Using pre-trained model 'en_core_web_sm' here.
                    NOTE: could try 'en_depent_web_md' instead.
        numclass : int
            The number of classes for the pre-trained classifier; this is
            relevant only when modelname is 'pretrained-StanfordNER'.

        """
        self.pretrained_model = modelname
        self.transfer_method = 'none'
        if modelname == 'pretrained-StanfordNER':
            if numclass == 3:
                self.NER = StanfordNERTagger(
                    STANFORD_MODEL_DIR +
                    'english.all.3class.distsim.crf.ser.gz')  #,
                #STANFORD_CLASSPATH)
                self.model['entity_types'] = ['LOC', 'ORG', 'PER']
                self.model['training_corpus'] = [
                    'CONLL03 eng.train', 'MUC6 train', 'MUC7 train', 'ACE2002',
                    'in-house data'
                ]
            elif numclass == 4:
                self.NER = StanfordNERTagger(
                    STANFORD_MODEL_DIR +
                    'english.conll.4class.distsim.crf.ser.gz')  #,
                ##STANFORD_CLASSPATH)
                self.model['entity_types'] = ['LOC', 'PER', 'ORG', 'MISC']
                self.model['training_corpus'] = ['CONLL03 eng.train']
            elif numclass == 7:
                self.NER = StanfordNERTagger(
                    STANFORD_MODEL_DIR +
                    'english.muc.7class.distsim.crf.ser.gz')  #,
                ##STANFORD_CLASSPATH)
                self.model['entity_types'] = [
                    'LOC',
                    'ORG',
                    'PER',
                    'MISC',
                    'MON',  # MONEY
                    'PCT',  # PERCENT
                    'DAT',  # DATE
                    'TIM'
                ]  # TIME
                self.model['training_corpus'] = ['MUC6 train', 'MUC7 train']
            else:
                raise ValueError(
                    'When using StanfordNER, numclass must be 3, 4 or 7.')

        elif modelname == 'pretrained-MITIE':
            self.NER = mitie.named_entity_extractor(MITIE_MODEL_DIR)
            self.model['entity_types'] = ['PER', 'LOC', 'ORG', 'MISC']
            self.model['training_corpus'] = ['?']

        elif modelname == 'pretrained-SENNA':
            self.NER = SennaNERTagger(SENNA_DIR)
            self.model['entity_types'] = ['PER', 'LOC', 'ORG', 'MISC']
            self.model['training_corpus'] = ["?"]

        elif modelname == 'pretrained-spacy':
            self.NER = None
            self.model['entity_types'] = [
                'PER',  # PERSON
                'NOR',  # NORP
                'FAC',  # FACILITY
                'ORG',  # ORGANIZATION
                'GPE',  # GEO-POLITICAL
                'LOC',  # LOCATION
                'PRO',  # PRODUCT
                'EVE',  # EVENT
                'WOR',  # WORK OF ART
                'LAN',  # LANGUAGE
                'DAT',  # DATE
                'TIM',  # TIME
                'PCT',  # PERCENT
                'MON',  # MONEY
                'QUA',  # QUANTITY
                'ORD',  # ORDINAL
                'CAR'
            ]  # CARDINAL
            self.model['training_corpus'] = ["?"]
        else:
            raise ValueError("Wrong modelname; must be 'pretrained-spacy',\
                             'pretrained-SENNA', 'pretrained-MITIE',\
                             or 'pretrained-StanfordNER'.")

    def evaluate(self, tgt_test):
        """ Evaluate the model on data tgt_test.

        """
        if self.pretrained_model == 'pretrained-StanfordNER':
            sents = [[x[0] for (x, iob) in iobs] for iobs in tgt_test]
            predicted = self.NER.tag_sents(sents)
            NERchunks = [
                BIO_utils.stanfordNE2BIO(NERchunk) for NERchunk in predicted
            ]
            NERchunks = [[((x, x), iob) for (x, iob) in C] for C in NERchunks]
            #TODO test if tgt_test is OK, don't need tgt_test_mod.
            tgt_test_mod = [[((x[0], x[0]), iob) for (x, iob) in iobs]
                            for iobs in tgt_test]
            # Symbols like / are removed sometimes. Fix:
            for jj, chunk in enumerate(NERchunks):
                for ii, x in enumerate(chunk):
                    if x[0] == '':
                        chunk[ii] = (tgt_test_mod[jj][ii], x[1])
            tagset_src = self.model['entity_types']
            E = Evaluator(NERchunks, tgt_test_mod, tagset_src)

        elif self.pretrained_model == 'pretrained-MITIE':
            sents = [[x[0] for (x, iob) in iobs] for iobs in tgt_test]
            NERchunks = []
            for tagged_sent in sents:
                mitie_entities = self.NER.extract_entities(tagged_sent)
                iobtags = BIO_utils.MITIE_NER_to_iobtags(
                    tagged_sent, mitie_entities)
                NERchunk = BIO_utils.stanfordNE2BIO(iobtags)
                NERchunk = [((x, x), iob) for (x, iob) in NERchunk]
                NERchunks.append(NERchunk)
            tagset_src = self.model['entity_types']
            E = Evaluator(NERchunks, tgt_test, tagset_src)

        elif self.pretrained_model == 'pretrained-SENNA':
            sents = [[x[0] for (x, iob) in iobs] for iobs in tgt_test]
            NERchunks = self.NER.tag_sents(sents)
            NERchunks = [[((x, x), iob) for (x, iob) in C] for C in NERchunks]
            tagset_src = self.model['entity_types']
            E = Evaluator(NERchunks, tgt_test, tagset_src)

        elif self.pretrained_model == 'pretrained-spacy':
            sents = [[x[0] for (x, iob) in iobs] for iobs in tgt_test]
            NERchunks = []
            for tagged_sent in sents:
                NERchunk = BIO_utils.spacy_iob(tagged_sent)
                NERchunk = BIO_utils.stanfordNE2BIO(
                    NERchunk)  # Need BIO format
                NERchunk = [((x, x), iob) for (x, iob) in NERchunk]
                NERchunks.append(NERchunk)
            tagset_src = self.model['entity_types']
            E = Evaluator(NERchunks, tgt_test, tagset_src)
        else:
            if self.transfer_method == 'pred':
                print(
                    'There is no testing to be done here (it was done during training).'
                )
                E = self.predscore
            else:
                print('Tagging the test dataset.')
                test_input_sentences = [zip(*t)[0] for t in tgt_test]
                predicted = self.NER.tag_sents(test_input_sentences)
                tagset_src = self.model['entity_types_src']

                E = Evaluator(predicted, tgt_test, tagset_src)

        return E
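A hypothetical end-to-end driver, assuming the module's own dependencies (features, sentence_utils, the nltk classifiers) are importable and that src_train, tgt_train and tgt_test follow the documented ((word, pos, domain), entity) format:

da = DomainAdaptation(verbose=True)

# train a CRF with Daume's easyadapt feature augmentation
da.train('augment', 'CRF', src_train, tgt_train, tgt_test)
E = da.evaluate(tgt_test)

# or skip training and score a pre-trained tagger instead
da.load_pretrained_model('pretrained-SENNA')
E = da.evaluate(tgt_test)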