Example #1
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tag import UnigramTagger


class Tagger(object):
    def __init__(self, cess_name="cess_esp"):
        """
            Tagger object.
            Allows to specify a cess.
        """
        cess = getattr(nltk.corpus, cess_name)
        self.wnl = WordNetLemmatizer()
        self.ut = UnigramTagger(cess.tagged_sents())

    def pos_tag(self, tokens, lemmatize=False):
        def clean_tag(tag):
            def get_type(tag):
                if tag[1]:
                    return tag[1][0].upper()
                return "X"
            if lemmatize:
                return (self.wnl.lemmatize(tag[0]), get_type(tag))
            return (tag[0], get_type(tag))

        if isinstance(tokens, str):
            tokens = tokens.split()

        return [clean_tag(a) for a in self.ut.tag(tokens)]

    def get_main_words(self, tokens, lemmatize=True, type_w=False):
        def cond(t):
            if type_w:
                for type_w_ in type_w:
                    if t[1].lower().startswith(type_w_.lower()):
                        return True
                return False
            return True

        return filter(cond, self.pos_tag(tokens, lemmatize=lemmatize))
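
# Usage sketch (illustrative, not part of the original class): assumes the
# cess_esp and wordnet corpora have already been fetched via nltk.download().
tagger = Tagger()
print(tagger.pos_tag("el gato come pescado"))
print(list(tagger.get_main_words("el gato come pescado", type_w=["v", "n"])))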
Example #2
    def pos_tag(self):
        tokenize_obj = NLTKTokenize(self.options)
        res = tokenize_obj.tokenize()
        tokens = res['result']
        tags = []

        # Performs Bigram / Unigram / Regex Tagging
        if self.options.get('tagger') in ['unigram', 'bigram', 'regex']:
            trainer = (self.options['train']
                       if self.options.get('train') in TRAINERS
                       else DEFAULT_TRAIN)

            train = brown.tagged_sents(categories=trainer)

            # Create your custom regex tagging pattern here
            regex_tag = RegexpTagger([(r'^[-\:]?[0-9]+(\.[0-9]+)?$', 'CD'),
                                      (r'.*able$', 'JJ'),
                                      (r'^[A-Z].*$', 'NNP'), (r'.*ly$', 'RB'),
                                      (r'.*s$', 'NNS'), (r'.*ing$', 'VBG'),
                                      (r'.*ed$', 'VBD'), (r'.*', 'NN')])

            current = os.path.dirname(os.path.abspath(__file__))

            # Unigram tag training data load / dump pickle
            pkl_name = current + '/trained/unigram_' + trainer + '.pkl'
            if os.path.isfile(pkl_name):
                with open(pkl_name, 'rb') as pkl:
                    unigram_tag = load(pkl)
            else:
                unigram_tag = UnigramTagger(train, backoff=regex_tag)
                with open(pkl_name, 'wb') as pkl:
                    dump(unigram_tag, pkl, -1)

            # Bigram tag training data load / dump pickle
            if self.options['tagger'] == 'bigram':
                pkl_name = current + '/trained/bigram_' + trainer + '.pkl'
                if os.path.isfile(pkl_name):
                    with open(pkl_name, 'rb') as pkl:
                        bigram_tag = load(pkl)
                else:
                    bigram_tag = BigramTagger(train, backoff=unigram_tag)
                    with open(pkl_name, 'wb') as pkl:
                        dump(bigram_tag, pkl, -1)
                tags = bigram_tag.tag(tokens)  # Bigram tagging performed here
            elif self.options['tagger'] == 'unigram':
                tags = unigram_tag.tag(tokens)  # Unigram tagging performed here
            else:
                tags = regex_tag.tag(tokens)  # Regex tagging performed here

        # Performs default pos_tag
        elif self.options.get('tagger', DEFAULT_TAGGER) == 'pos':
            tags = pos_tag(tokens)

        return self._dump(tags)
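
# Minimal standalone sketch of the same backoff chain built above
# (regex -> unigram -> bigram), stripped of the class/options/pickle machinery;
# the 'news' category and the sample sentence are illustrative choices.
from nltk import BigramTagger, RegexpTagger, UnigramTagger
from nltk.corpus import brown

train = brown.tagged_sents(categories='news')
regex_backoff = RegexpTagger([(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'),
                              (r'.*', 'NN')])
unigram = UnigramTagger(train, backoff=regex_backoff)
bigram = BigramTagger(train, backoff=unigram)
print(bigram.tag("The cat is sitting on the mat".split()))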
Example #3
from nltk import ConditionalFreqDist, FreqDist, UnigramTagger, word_tokenize
from nltk.corpus import brown


def lookup_tag(num_sampling):
    raw = 'I am applying for AIT because I can be with my parents here and I am already granted a scholarship'
    #Get the frequency distribution of the words
    fd = FreqDist(brown.words(categories='news'))
    #Get the most frequent tag of each word in the corpus
    cfd = ConditionalFreqDist(brown.tagged_words(
        categories='news'))  #, backoff=nltk.DefaultTagger('NN'))
    #Get the num_sampling most common words
    most_freq_words = fd.most_common(num_sampling)
    #Create a dictionary of (word, most_likely_tag) pairs
    likely_tags = dict(
        (word, cfd[word].max()) for (word, _) in most_freq_words)
    #A lookup (unigram) tagger assigns each word its most frequent tag, with no context, just like a unigram in the n-gram sense

    lookup_tagger = UnigramTagger(model=likely_tags)
    tagged = lookup_tagger.tag(word_tokenize(raw))
    print(tagged)
    score = lookup_tagger.evaluate(brown.tagged_sents(categories='news'))
    print(score)
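
# Usage sketch (illustrative): build the lookup tagger from the 100 most
# frequent words of the news category and print its tags and its accuracy.
lookup_tag(100)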
Example #4
# A *UnigramTagger* object is created and trained on the Brown Corpus with the universal tagset:

# In[10]:


import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger

brown_tagged_sents = brown.tagged_sents(tagset='universal')
complete_tagger = UnigramTagger(train=brown_tagged_sents)


# The trained Unigram-Tagger is applied to tag a single sentence:

# In[11]:


mySent1 = "the cat is on the mat".split()
print(complete_tagger.tag(mySent1))


# Compare tags assigned by the Unigram-Tagger and the tags assigned by the current NLTK standard tagger on a single sentence:

# In[12]:


mySent2 = "This is major tom calling ground control from space".split()
print("Unigram Tagger: \n", complete_tagger.tag(mySent2))
print("\nCurrent Tagger applied for NLTK pos_tag(): \n", nltk.pos_tag(mySent2, tagset='universal'))


# The performance of the trained tagger is evaluated on the same corpus that was used for training. The performance measure is the fraction of words that have been tagged correctly.

# In[13]:
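
# Minimal evaluation sketch, assuming this cell reuses complete_tagger and
# brown_tagged_sents from above; evaluate() returns the fraction of words
# that received the correct tag.
print(complete_tagger.evaluate(brown_tagged_sents))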
Example #5
from nltk import DefaultTagger, UnigramTagger, BigramTagger
from nltk.corpus import treebank

train_set = treebank.tagged_sents()[:3000]
test_set = treebank.tagged_sents()[3000:]

unitagger = UnigramTagger(train_set)
print(unitagger.evaluate(test_set))  # how good is it, from 0 to 1?
# try it on a sentence nobody has seen before
print(unitagger.tag("I love Alessia too much her since years".split()))

# question: what if I want to use a custom train/test set instead of the treebank one?
# solution: I have to build it myself and, as usual, split it into train & test
# (a split sketch follows the list below)
custom_set = [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'),
               ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'),
               ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'),
               ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'),
               ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')],
              [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'),
               ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'),
               ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'),
               ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')],
              [('Rudolph', 'NNP'), ('Agnew', 'NNP'), (',', ','), ('55', 'CD'),
               ('years', 'NNS'), ('old', 'JJ'), ('and', 'CC'),
               ('former', 'JJ'), ('chairman', 'NN'), ('of', 'IN'),
               ('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP'),
               ('PLC', 'NNP'), (',', ','), ('was', 'VBD'), ('named', 'VBN'),
               ('*-1', '-NONE-'), ('a', 'DT'), ('nonexecutive', 'JJ'),
               ('director', 'NN'), ('of', 'IN'), ('this', 'DT'),
               ('British', 'JJ'), ('industrial', 'JJ'), ('conglomerate', 'NN'),
               ('.', '.')],
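
# Sketch of the split described above, assuming custom_set is completed as a
# closed list of tagged sentences; the 80/20 ratio is an arbitrary choice here.
split_point = int(len(custom_set) * 0.8)
custom_train, custom_test = custom_set[:split_point], custom_set[split_point:]
custom_tagger = UnigramTagger(custom_train)
print(custom_tagger.evaluate(custom_test))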
Example #6
import json
from collections import OrderedDict, defaultdict

from nltk.corpus import brown
from nltk.tag import UnigramTagger


class Zero_Shooter:
    def __init__(self, modelpath, candidates):
        self.modelpath = modelpath
        self.bus_counter = 0
        with open(modelpath + 'all_highest_probs_' + str(candidates) + '.json',
                  'r') as f:
            self.candidates = json.load(f)
        with open(
                modelpath +
                'inject_refcoco_refrnn_compositional_3_512_1/4eval_greedy.json',
                'r'
        ) as f:  # 'restoredmodel_refs_greedy.json') as f: restoredmodel_refs_greedy/4eval_greedy
            self.refs = json.load(f)
        self.words_that_are_names = list()
        with open("./noun_list_long.txt", 'r') as f:
            for row in f.readlines():
                self.words_that_are_names.append(row.strip())
        self.unigram_tagger = UnigramTagger(brown.tagged_sents())
        self.zero_shot_refs = defaultdict()
        self.non_noun_counter = 0
        self.baseline_top_1 = defaultdict()
        self.baseline_top_5 = defaultdict()
        self.baseline_top_10 = defaultdict()

    # read and sort the candidates for a position in the sequence
    def get_predictions(self, region_id):
        predictions = list()
        tmp_dict = self.candidates[region_id]
        # sort numerically: the position keys are strings ("1", "2", ..., "10", ...)
        sorted_tmp = OrderedDict(sorted(tmp_dict.items(), key=lambda t: int(t[0])))
        for entry in sorted_tmp:
            predictions.append(sorted_tmp[entry][-1][0])
        return predictions

    # parse a sequence with the noun-list method
    def parse_for_names(self, predicted_words, cat):
        for i, word in enumerate(predicted_words):
            if word == str(cat):
                self.bus_counter += 1
            if word in self.words_that_are_names:  ## always returns first instance ...
                return i
        return -1

    # parse a sequence with a POS-tagger
    def parse_pos(self, tokens, cat):
        tags = self.unigram_tagger.tag(tokens)
        nouns = [x for x in tags if x[1] == 'NN']
        if len(nouns) > 0:
            if nouns[0][0] == cat:
                self.bus_counter += 1
            # to keep it easy - if two nouns occur, this is a simplification!
            return tokens.index(nouns[0][0])
        else:
            # UnigramTagger returns None (not the string 'None') for unknown words
            unknown_nouns = [x for x in tags if x[1] is None]
            if len(unknown_nouns) > 0:
                return tokens.index(unknown_nouns[0][0])
            else:
                return -1

    # apply the parsing, word combination and exchange of a word to the test set,
    # count the hit@k accuracies and the frequency of predictions (baseline).
    # The parsing is currently done with the noun list, but can be changed to the POS tagger.
    def do_zero_shot(self, embeddings, category, use_reduced_vector_space):
        self.word_changed_counter = 0
        self.zero_shot_counter = 0
        self.word_counter = 0
        category = str(category)
        hit_at_1 = 0
        hit_at_2 = 0
        hit_at_5 = 0
        hit_at_10 = 0
        #with open("/mnt/Data/zero_shot_reg/src/eval/new_models/with_reduced_cats_all/vocab_list.txt", 'r') as f:
        # vocab = f.read().splitlines()

        for region_id in self.candidates:

            region_id = str(region_id)
            sentence = self.get_predictions(region_id)
            self.word_counter += len(sentence)

            ## use pos tagger
            #index = self.parse_pos(sentence, category)

            ## OR use name list
            index = self.parse_for_names(sentence, category)

            if index < 0:
                self.zero_shot_refs[region_id] = self.refs[region_id]
                continue

            candidate_words_and_probs = self.candidates[region_id][str(index + 1)]
            cand_words = [x[0] for x in candidate_words_and_probs]
            cand_probs = [float(x[1]) for x in candidate_words_and_probs]

            new_vec = embeddings.words2embedding_weighted(
                cand_words, cand_probs, use_reduced_vector_space)
            if new_vec is not None:
                new_words_10 = embeddings.get_words_for_vector(
                    new_vec, 10, use_reduced_vector_space)
                new_words_5 = embeddings.get_words_for_vector(
                    new_vec, 5, use_reduced_vector_space)
                new_words_2 = embeddings.get_words_for_vector(
                    new_vec, 2, use_reduced_vector_space)
                new_words_1 = embeddings.get_words_for_vector(
                    new_vec, 1, use_reduced_vector_space)

                ##### generate baselines for comparison with WAC  ####
                for x in new_words_1:
                    if x[0] in self.baseline_top_1:
                        self.baseline_top_1[x[0]] += 1
                    else:
                        self.baseline_top_1[x[0]] = 1

                for x in new_words_5:
                    if x[0] in self.baseline_top_5:
                        self.baseline_top_5[x[0]] += 1
                    else:
                        self.baseline_top_5[x[0]] = 1

                for x in new_words_10:
                    if x[0] in self.baseline_top_10:
                        self.baseline_top_10[x[0]] += 1
                    else:
                        self.baseline_top_10[x[0]] = 1
                ######################################################

                # for x in new_words_10:
                #     if not x[0] in vocab:
                #         print "**************", x
                # for x in new_words_1:
                #     if not x[0] in vocab:
                #         print "***********************", x  # code to test whether out-of-vocabulary words appear at all

                if category in [x[0] for x in new_words_10]:
                    hit_at_10 += 1
                if category in [x[0] for x in new_words_5]:
                    hit_at_5 += 1
                if category in [x[0] for x in new_words_1]:
                    hit_at_1 += 1
                if category in [x[0] for x in new_words_2]:
                    hit_at_2 += 1

                if new_words_1[0][0] not in self.words_that_are_names:
                    self.non_noun_counter += 1
                #print self.words_that_are_names
                #    print new_words_1[0][0]

                ref = self.refs[region_id][0].split()
                #print ref

                self.zero_shot_counter += 1
                if new_words_1[0][0] != ref[index]:
                    self.word_changed_counter += 1

                ref[index] = new_words_1[0][0]
                new_ref = ' '.join(ref)
                self.zero_shot_refs[region_id] = [new_ref]

        with open(self.modelpath + 'baseline_frequencies_top1.json', 'w') as f:
            json.dump(self.baseline_top_1, f)
        with open(self.modelpath + 'baseline_frequencies_top5.json', 'w') as f:
            json.dump(self.baseline_top_5, f)
        with open(self.modelpath + 'baseline_frequencies_top10.json',
                  'w') as f:
            json.dump(self.baseline_top_10, f)

        print "non-nouns: ", self.non_noun_counter, " of ", len(
            self.candidates), " -> ", round(
                self.non_noun_counter / float(len(self.candidates)) * 100, 2)
        return hit_at_1/ float(len(self.candidates)), hit_at_2/ float(len(self.candidates)), hit_at_5/ float(len(self.candidates)), \
               hit_at_10/ float(len(self.candidates)), len(self.candidates)

    # apply the method not to single nouns but to all words of an expression (without parsing);
    # hit@k accuracies do not apply here, because only one word in the sequence is supposed to be the target word
    def do_zero_shot_all_words(self, embeddings, category,
                               use_reduced_vector_space):

        self.word_counter = 0
        self.zero_shot_counter = 0
        self.word_changed_counter = 0

        category = str(category)
        for region_id in self.candidates:
            region_id = str(region_id)
            sentence = self.get_predictions(region_id)
            self.word_counter += len(sentence)

            for index, word in enumerate(sentence):
                candidate_words_and_probs = self.candidates[region_id][str(
                    index + 1)]
                cand_words = [x[0] for x in candidate_words_and_probs]
                cand_probs = [float(x[1]) for x in candidate_words_and_probs]
                new_vec = embeddings.words2embedding_weighted(
                    cand_words, cand_probs, use_reduced_vector_space)
                if new_vec is not None:
                    new_word = embeddings.get_words_for_vector(
                        new_vec, 1, use_reduced_vector_space)

                    ref = self.refs[region_id][0].split()
                    ref[index] = new_word[0][0]
                    new_ref = ' '.join(ref)
                    self.zero_shot_refs[region_id] = [new_ref]
                    self.zero_shot_counter += 1
                    if new_word[0][0] != word:
                        self.word_changed_counter += 1
                        print("____ ", region_id)
                        print("original: ", sentence)
                        print("after: ", new_ref)

        return [], [], [], [], len(self.candidates)
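
# Standalone, runnable sketch of the POS-based parse used in parse_pos() above:
# tag a token list with a Brown-trained UnigramTagger (mirroring the tagger
# built in __init__) and return the index of the first word tagged 'NN'.
# The sample tokens are illustrative only.
brown_tagger = UnigramTagger(brown.tagged_sents())
sample_tokens = "the red bus near the tree".split()
sample_tags = brown_tagger.tag(sample_tokens)
sample_nouns = [word for word, tag in sample_tags if tag == 'NN']
print(sample_tokens.index(sample_nouns[0]) if sample_nouns else -1)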