import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tag import UnigramTagger


class Tagger(object):

    def __init__(self, cess_name="cess_esp"):
        """Tagger object. Allows specifying a CESS corpus by name."""
        cess = getattr(nltk.corpus, cess_name)
        self.wnl = WordNetLemmatizer()
        self.ut = UnigramTagger(cess.tagged_sents())

    def pos_tag(self, tokens, lemmatize=False):
        def clean_tag(tag):
            def get_type(tag):
                # first letter of the corpus tag, or "X" for untagged words
                if tag[1]:
                    return tag[1][0].upper()
                return "X"

            if lemmatize:
                return (self.wnl.lemmatize(tag[0]), get_type(tag))
            return (tag[0], get_type(tag))

        if isinstance(tokens, str):
            tokens = tokens.split()
        return [clean_tag(a) for a in self.ut.tag(tokens)]

    def get_main_words(self, tokens, lemmatize=True, type_w=False):
        def cond(t):
            # keep only words whose tag starts with one of the requested types
            if type_w:
                for type_w_ in type_w:
                    if t[1].lower().startswith(type_w_.lower()):
                        return True
                return False
            return True

        return filter(cond, self.pos_tag(tokens, lemmatize=lemmatize))
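# A minimal usage sketch for the Tagger class above; the sentence is made up,
# and it assumes the cess_esp and wordnet data have been fetched via nltk.download():
tagger = Tagger()
print(tagger.pos_tag("el gato duerme sobre la alfombra"))
# get_main_words returns a lazy filter object in Python 3, so wrap it in list()
print(list(tagger.get_main_words("el gato duerme sobre la alfombra", type_w=["n", "v"])))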
import os
from pickle import load, dump

from nltk.corpus import brown
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger

# This is a method lifted from its class; NLTKTokenize, TRAINERS, DEFAULT_TRAIN,
# DEFAULT_TAGGER and the fallback pos_tag (nltk.pos_tag) come from the
# surrounding module, which is not shown here.
def pos_tag(self):
    tokenize_obj = NLTKTokenize(self.options)
    res = tokenize_obj.tokenize()
    tokens = res['result']
    tags = []

    # Performs Bigram / Unigram / Regex tagging
    if self.options.get('tagger') in ['unigram', 'bigram', 'regex']:
        trainer = self.options['train'] if self.options.get('train') in TRAINERS else DEFAULT_TRAIN
        train = brown.tagged_sents(categories=trainer)

        # Create your custom regex tagging pattern here
        # (note the escaped dot: the original r'(.[0-9]+)' matched any character)
        regex_tag = RegexpTagger([
            (r'^[-\:]?[0-9]+(\.[0-9]+)?$', 'CD'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*', 'NN'),
        ])

        current = os.path.dirname(os.path.abspath(__file__))

        # Unigram tagger: load trained model from pickle, or train and dump it
        pkl_name = current + '/trained/unigram_' + trainer + '.pkl'
        if os.path.isfile(pkl_name):
            with open(pkl_name, 'rb') as pkl:
                unigram_tag = load(pkl)
        else:
            unigram_tag = UnigramTagger(train, backoff=regex_tag)
            with open(pkl_name, 'wb') as pkl:
                dump(unigram_tag, pkl, -1)

        if self.options['tagger'] == 'bigram':
            # Bigram tagger: load trained model from pickle, or train and dump it
            pkl_name = current + '/trained/bigram_' + trainer + '.pkl'
            if os.path.isfile(pkl_name):
                with open(pkl_name, 'rb') as pkl:
                    bigram_tag = load(pkl)
            else:
                bigram_tag = BigramTagger(train, backoff=unigram_tag)
                with open(pkl_name, 'wb') as pkl:
                    dump(bigram_tag, pkl, -1)
            tags = bigram_tag.tag(tokens)  # Bigram tagging performed here
        elif self.options['tagger'] == 'unigram':
            tags = unigram_tag.tag(tokens)  # Unigram tagging performed here
        else:
            tags = regex_tag.tag(tokens)  # Regex tagging performed here

    # Performs default pos_tag
    elif self.options.get('tagger', DEFAULT_TAGGER) == 'pos':
        tags = pos_tag(tokens)

    return self._dump(tags)
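# A standalone sketch of the same backoff chain (regex -> unigram -> bigram)
# without the option handling and pickle caching; 'news' stands in for the
# trainer category:
from nltk.corpus import brown
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger

train = brown.tagged_sents(categories='news')
regex_backoff = RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), (r'.*', 'NN')])
unigram = UnigramTagger(train, backoff=regex_backoff)
bigram = BigramTagger(train, backoff=unigram)
print(bigram.tag("the stock rose 2.5 percent".split()))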
from nltk import FreqDist, ConditionalFreqDist, UnigramTagger, word_tokenize
from nltk.corpus import brown


def lookup_tag(num_sampling):
    raw = ('I am applying for AIT because I can be with my parents here '
           'and I am already granted a scholarship')
    # Get the frequency distribution of the words
    fd = FreqDist(brown.words(categories='news'))
    # Get the most frequent tag of each word in the corpus
    cfd = ConditionalFreqDist(brown.tagged_words(categories='news'))
    # Get the num_sampling most common words
    most_freq_words = fd.most_common(num_sampling)
    # Create a dictionary of (word, most_likely_tag) pairs
    likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
    # A lookup tagger tags each word with its most frequent tag (no context
    # needed), just like a unigram in the n-gram topic
    # (optionally: backoff=nltk.DefaultTagger('NN'))
    lookup_tagger = UnigramTagger(model=likely_tags)
    tagged = lookup_tagger.tag(word_tokenize(raw))
    print(tagged)
    brown_tagged_sents = brown.tagged_sents(categories='news')
    score = lookup_tagger.evaluate(brown_tagged_sents)
    print(score)
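# Calling the function above; coverage grows with num_sampling, since words
# outside the lookup model get the tag None and count as errors in the score:
lookup_tag(100)
lookup_tag(1000)  # a larger model covers more words and scores higher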
# A *UnigramTagger* object is generated and trained on the Brown Corpus with the universal tagset:

# In[10]:

complete_tagger = UnigramTagger(train=brown_tagged_sents)

# The trained Unigram-Tagger is applied to tag a single sentence:

# In[11]:

mySent1 = "the cat is on the mat".split()
print(complete_tagger.tag(mySent1))

# Compare the tags assigned by the Unigram-Tagger with the tags assigned by the current NLTK standard tagger on a single sentence:

# In[12]:

mySent2 = "This is major tom calling ground control from space".split()
print("Unigram Tagger: \n", complete_tagger.tag(mySent2))
print("\nCurrent Tagger applied for NLTK pos_tag(): \n", nltk.pos_tag(mySent2, tagset='universal'))

# The performance of the trained tagger is evaluated on the same corpus as was used for training. The performance measure is the rate of words that have been tagged correctly.

# In[13]:
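# (The body of this cell is not shown in the source; given the description, it
# is presumably the accuracy on the training corpus, along the lines of:)
print(complete_tagger.evaluate(brown_tagged_sents))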
from nltk import DefaultTagger, UnigramTagger, BigramTagger
from nltk.corpus import treebank

train_set = treebank.tagged_sents()[:3000]
test_set = treebank.tagged_sents()[3000:]

unitagger = UnigramTagger(train_set)
print(unitagger.evaluate(test_set))  # how good is it, from 0 to 1?
# Try it on a sentence it has never seen:
print(unitagger.tag("I love Alessia too much her since years".split()))

# Question: what if I want to use a custom train/test set instead of the treebank one?
# Solution: I have to build it myself and, as usual, split it into train & test.
custom_set = [
    [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'),
     ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'),
     ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'),
     ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')],
    [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'),
     ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'),
     ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')],
    [('Rudolph', 'NNP'), ('Agnew', 'NNP'), (',', ','), ('55', 'CD'), ('years', 'NNS'),
     ('old', 'JJ'), ('and', 'CC'), ('former', 'JJ'), ('chairman', 'NN'), ('of', 'IN'),
     ('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP'), ('PLC', 'NNP'),
     (',', ','), ('was', 'VBD'), ('named', 'VBN'), ('*-1', '-NONE-'), ('a', 'DT'),
     ('nonexecutive', 'JJ'), ('director', 'NN'), ('of', 'IN'), ('this', 'DT'),
     ('British', 'JJ'), ('industrial', 'JJ'), ('conglomerate', 'NN'), ('.', '.')],
]
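# Continuing the custom-set idea: train on a slice and evaluate on the rest
# (with only three sentences this just demonstrates the mechanics, not a
# meaningful score):
custom_train, custom_test = custom_set[:2], custom_set[2:]
custom_tagger = UnigramTagger(custom_train)
print(custom_tagger.evaluate(custom_test))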
import json
from collections import defaultdict, OrderedDict

from nltk.corpus import brown
from nltk.tag import UnigramTagger


class Zero_Shooter:

    def __init__(self, modelpath, candidates):
        self.modelpath = modelpath
        self.bus_counter = 0
        with open(modelpath + 'all_highest_probs_' + str(candidates) + '.json', 'r') as f:
            self.candidates = json.load(f)
        # alternatively: restoredmodel_refs_greedy/4eval_greedy
        with open(modelpath + 'inject_refcoco_refrnn_compositional_3_512_1/4eval_greedy.json', 'r') as f:
            self.refs = json.load(f)
        self.words_that_are_names = list()
        with open("./noun_list_long.txt", 'r') as f:
            for row in f.readlines():
                self.words_that_are_names.append(row.strip())
        self.unigram_tagger = UnigramTagger(brown.tagged_sents())
        self.zero_shot_refs = defaultdict()
        self.non_noun_counter = 0
        self.baseline_top_1 = defaultdict()
        self.baseline_top_5 = defaultdict()
        self.baseline_top_10 = defaultdict()

    # read and sort the candidates for a position in the sequence
    def get_predictions(self, region_id):
        predictions = list()
        tmp_dict = self.candidates[region_id]
        sorted_tmp = OrderedDict(sorted(tmp_dict.items(), key=lambda t: t[0]))
        for entry in sorted_tmp:
            predictions.append(sorted_tmp[entry][-1][0])
        return predictions

    # parse a sequence with the noun-list method
    def parse_for_names(self, predicted_words, cat):
        for i, word in enumerate(predicted_words):
            if word == str(cat):
                self.bus_counter += 1
            if word in self.words_that_are_names:
                # always returns the first instance
                return i
        return -1

    # parse a sequence with a POS-tagger
    def parse_pos(self, tokens, cat):
        tags = self.unigram_tagger.tag(tokens)
        nouns = [x for x in tags if x[1] == 'NN']
        if len(nouns) > 0:
            if nouns[0][0] == cat:
                self.bus_counter += 1
            # to keep it easy - if there are two nouns, this is a simplification!
            return tokens.index(nouns[0][0])
        else:
            # the tagger tags unknown words with None (not the string 'None')
            unknown_nouns = [x for x in tags if x[1] is None]
            if len(unknown_nouns) > 0:
                return tokens.index(unknown_nouns[0][0])
            else:
                return -1
    # apply the parsing, word combination and exchange of a word to the test set,
    # count the hit@k accuracies and the frequency of predictions (baseline).
    # The parsing is currently done with the noun-list, but can be changed to the POS-tagger.
    def do_zero_shot(self, embeddings, category, use_reduced_vector_space):
        self.word_changed_counter = 0
        self.zero_shot_counter = 0
        self.word_counter = 0
        category = str(category)
        hit_at_1 = 0
        hit_at_2 = 0
        hit_at_5 = 0
        hit_at_10 = 0

        # with open("/mnt/Data/zero_shot_reg/src/eval/new_models/with_reduced_cats_all/vocab_list.txt", 'r') as f:
        #     vocab = f.read().splitlines()

        for region_id in self.candidates:
            region_id = str(region_id)
            sentence = self.get_predictions(region_id)
            self.word_counter += len(sentence)

            # use the POS tagger:
            # index = self.parse_pos(sentence, category)
            # OR use the name list:
            index = self.parse_for_names(sentence, category)
            if index < 0:
                self.zero_shot_refs[region_id] = self.refs[region_id]
                continue

            candidate_words_and_probs = self.candidates[region_id][str(index + 1)]
            cand_words = [x[0] for x in candidate_words_and_probs]
            cand_probs = [float(x[1]) for x in candidate_words_and_probs]
            new_vec = embeddings.words2embedding_weighted(
                cand_words, cand_probs, use_reduced_vector_space)

            if new_vec is not None:
                new_words_10 = embeddings.get_words_for_vector(new_vec, 10, use_reduced_vector_space)
                new_words_5 = embeddings.get_words_for_vector(new_vec, 5, use_reduced_vector_space)
                new_words_2 = embeddings.get_words_for_vector(new_vec, 2, use_reduced_vector_space)
                new_words_1 = embeddings.get_words_for_vector(new_vec, 1, use_reduced_vector_space)

                # generate baselines for comparison with WAC
                for x in new_words_1:
                    if x[0] in self.baseline_top_1:
                        self.baseline_top_1[x[0]] += 1
                    else:
                        self.baseline_top_1[x[0]] = 1
                for x in new_words_5:
                    if x[0] in self.baseline_top_5:
                        self.baseline_top_5[x[0]] += 1
                    else:
                        self.baseline_top_5[x[0]] = 1
                for x in new_words_10:
                    if x[0] in self.baseline_top_10:
                        self.baseline_top_10[x[0]] += 1
                    else:
                        self.baseline_top_10[x[0]] = 1

                # code to test whether out-of-vocabulary words appear at all:
                # for x in new_words_10:
                #     if x[0] not in vocab:
                #         print("**************", x)
                # for x in new_words_1:
                #     if x[0] not in vocab:
                #         print("***********************", x)

                if category in [x[0] for x in new_words_10]:
                    hit_at_10 += 1
                if category in [x[0] for x in new_words_5]:
                    hit_at_5 += 1
                if category in [x[0] for x in new_words_1]:
                    hit_at_1 += 1
                if category in [x[0] for x in new_words_2]:
                    hit_at_2 += 1
                if not new_words_1[0][0] in self.words_that_are_names:
                    self.non_noun_counter += 1

                ref = self.refs[region_id][0].split()
                self.zero_shot_counter += 1
                if not new_words_1[0][0] == ref[index]:
                    self.word_changed_counter += 1
                ref[index] = new_words_1[0][0]
                new_ref = ' '.join(ref)
                self.zero_shot_refs[region_id] = [new_ref]

        with open(self.modelpath + 'baseline_frequencies_top1.json', 'w') as f:
            json.dump(self.baseline_top_1, f)
        with open(self.modelpath + 'baseline_frequencies_top5.json', 'w') as f:
            json.dump(self.baseline_top_5, f)
        with open(self.modelpath + 'baseline_frequencies_top10.json', 'w') as f:
            json.dump(self.baseline_top_10, f)

        print("non-nouns: ", self.non_noun_counter, " of ", len(self.candidates),
              " -> ", round(self.non_noun_counter / float(len(self.candidates)) * 100, 2))

        return (hit_at_1 / float(len(self.candidates)),
                hit_at_2 / float(len(self.candidates)),
                hit_at_5 / float(len(self.candidates)),
                hit_at_10 / float(len(self.candidates)),
                len(self.candidates))
    # apply the method not to single nouns, but to all words of an expression
    # (without parsing); hit@k accuracies do not work here, because not all words
    # are supposed to be the target word (only one per sequence)
    def do_zero_shot_all_words(self, embeddings, category, use_reduced_vector_space):
        self.word_counter = 0
        self.zero_shot_counter = 0
        self.word_changed_counter = 0
        category = str(category)
        for region_id in self.candidates:
            region_id = str(region_id)
            sentence = self.get_predictions(region_id)
            self.word_counter += len(sentence)
            for index, word in enumerate(sentence):
                candidate_words_and_probs = self.candidates[region_id][str(index + 1)]
                cand_words = [x[0] for x in candidate_words_and_probs]
                cand_probs = [float(x[1]) for x in candidate_words_and_probs]
                new_vec = embeddings.words2embedding_weighted(
                    cand_words, cand_probs, use_reduced_vector_space)
                if new_vec is not None:
                    new_word = embeddings.get_words_for_vector(new_vec, 1, use_reduced_vector_space)
                    ref = self.refs[region_id][0].split()
                    ref[index] = new_word[0][0]
                    new_ref = ' '.join(ref)
                    self.zero_shot_refs[region_id] = [new_ref]
                    self.zero_shot_counter += 1
                    if not new_word[0][0] == word:
                        self.word_changed_counter += 1
            print("____ ", region_id)
            print("original: ", sentence)
            print("after: ", new_ref)
        return [], [], [], [], len(self.candidates)
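# A minimal standalone check (not part of the class above) illustrating why
# parse_pos must compare tags with `is None`: NLTK's UnigramTagger assigns the
# value None, not the string 'None', to words unseen in training.
from nltk.corpus import brown
from nltk.tag import UnigramTagger

tagger = UnigramTagger(brown.tagged_sents()[:500])
print(tagger.tag("the frobnicator hums".split()))
# expected along the lines of: [('the', 'AT'), ('frobnicator', None), ('hums', None)]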