def ap(train_path, test_path):
    modelref = 'ap-' + md5(('ap///' + train_path).encode()).hexdigest() + '.pickle'
    test_sentences = list(gen_corpus(test_path))

    if not isfile(modelref):
        start = perf_counter()
        training_sentences = list(gen_corpus(train_path))
        ap_model = PerceptronTagger(load=False)
        ap_model.train(list(convert_sents_to_zipped(training_sentences)), save_loc=modelref)
        end = perf_counter()
        print('Training took {} ms.'.format(int((end - start) * 1000)))
    else:
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')

    # Evaluation
    start = perf_counter()
    y_pred, y_true = [], []
    for words, tags in test_sentences:
        y_pred.extend(y for x, y in ap_model.tag(words))
        y_true.extend(tags)
    end = perf_counter()
    print('Testing took {} ms.'.format(int((end - start) * 1000)))

    for l in classification_report(y_true, y_pred).split('\n'):
        print(l)
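# Hypothetical invocation sketch for the function above (the file paths are
# placeholders; gen_corpus, convert_sents_to_zipped and classification_report
# are assumed to be provided elsewhere in the same module):
#
#     ap('data/pos_train.txt', 'data/pos_test.txt')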
def _get_tagger(lang=None):
    if lang == "rus":
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger
def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    elif lang == 'eng':
        tagger = PerceptronTagger()
    else:
        tagger = PerceptronTagger()
    return tagger
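# Usage sketch for the helper above (hypothetical; assumes NLTK's pretrained
# English model is available via nltk.download('averaged_perceptron_tagger')):
#
#     tagger = _get_tagger()                      # default English tagger
#     tagger.tag(['A', 'quick', 'example', '.'])  # -> list of (token, tag) pairs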
def __init__(self, stop_words=None, sent_detector=None, documents=None):
    # Load a persisted list of stopwords
    # unless something else is specified
    if not stop_words:
        self._stop_words = stopwords.words('english')
        self._stop_words += ['minister', 'question', 'member', "member’s"]
    else:
        self._stop_words = stop_words

    # Load these to save time reading from
    # disk later
    if not sent_detector:
        self._sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    else:
        self._sent_detector = sent_detector

    self._tokenizor = TreebankWordTokenizer()
    self._lemmatizer = WordNetLemmatizer()
    self._tagger = PerceptronTagger()

    if documents:
        # Create a list of lists of tokens, all lowercased,
        # lemmatized and filtered
        self.documents = self._make_tokens(documents)
        self.dictionary = corpora.Dictionary(self.documents)
        self.corpus = self._bag_o_words(self.documents)
        self._tf_idf_model = models.TfidfModel(self.corpus)
        self.transformed_corpus = self._tf_idf_model[self.corpus]
    else:
        self.documents = None
def wordTagging(self, sentence):
    posWords = []
    # text = nltk.word_tokenize(sentence)
    tagset = None
    start_time = time.time()
    try:
        tokens = nltk.word_tokenize(sentence)
    except UnicodeDecodeError:
        tokens = nltk.word_tokenize(sentence.decode('utf-8'))
    tagger = PerceptronTagger()
    pos_arrs = nltk.tag._pos_tag(tokens, tagset, tagger)
    for pos_arr in pos_arrs:
        pos = []
        pos.append(pos_arr[0])
        pos.append(pos_arr[1])
        # print pos_arr
        if pos_arr[1] in self.posDictionary:
            ppt = self.posDictionary[pos_arr[1]]
        else:
            ppt = "symbol"
        pos.append(ppt)
        posWords.append(pos)
    return posWords
def Top_K_verbs(k, df, text_col, class_col, plot=False):
    vect = TfidfVectorizer()
    vect.fit(df[text_col])
    tfidf_df = pd.DataFrame(vect.transform(df[text_col]).toarray())
    tfidf_df.columns = vect.get_feature_names()
    tfidf_T = tfidf_df.transpose()
    tagger = PerceptronTagger()
    tfidf_T['pos'] = tagger.tag(tfidf_T.index)
    tfidf_T = tfidf_T[tfidf_T['pos'].apply(
        lambda tup: tup[1] in ['VB', 'VBD', 'VBG', 'VBN'])]
    tfidf_df = tfidf_T.drop(['pos'], axis=1).transpose()
    top_k_by_class = dict()
    for v in df[class_col].value_counts().index:
        freq_in_class = tfidf_df[df[class_col] == v].sum(axis=0).sort_values(
            ascending=False)
        frac_in_class = freq_in_class / freq_in_class.sum()
        top_k_by_class[v] = frac_in_class[:k].index
        if plot:
            print('the top {} frequent verbs for class {}:'.format(k, v))
            plt.figure(figsize=(5, 10))
            sns.barplot(y=frac_in_class[:k].index, x=frac_in_class[:k])
            plt.xlabel('fraction')
            plt.show()
    return top_k_by_class
def get_instances(self, label_file, xml_file):
    instances = []
    labels_final = set()
    tagger = PerceptronTagger()  # load nltk perceptron just once to speed up tagging
    labels_dict = {
        0: "anger",
        1: "disgust",
        2: "fear",
        3: "joy",
        4: "sadness",
        5: "surprise"
    }
    tree = ET.parse(xml_file)
    root = tree.getroot()
    with open(label_file) as f:
        for sent, line in zip(root, f):
            id_xml = list(sent.attrib.values())[0]
            id_labels = line.rstrip().split()
            id_file = id_labels[0]
            if id_xml == id_file:
                for i in sent.itertext():
                    text = i
                labels = id_labels[1:]
                label = labels.index(
                    str(max([int(label) for label in labels])))
                inst = Instance(text, labels_dict[label])
                inst_tokenized = word_tokenize(text)
                inst_tagged = tagger.tag(inst_tokenized)
                for tokentag in inst_tagged:
                    token = Token(tokentag[0], tokentag[1])
                    inst.add_token(token)
                instances.append(inst)
                labels_final.add(label)
    return instances, labels_final
def __init__(self, dbFilePath=None, revDbFilePath=None, pos=False):
    self.dbFilePath = dbFilePath
    self.revDbFilePath = revDbFilePath
    if pos:
        self.tagger = PerceptronTagger()
    self.pos = pos

    # try to open forward database
    if not dbFilePath:
        self.dbFilePath = os.path.join(os.path.dirname(__file__), "markovdb")
    try:
        with open(self.dbFilePath, 'rb') as dbfile:
            self.db = pickle.load(dbfile)
    except (IOError, ValueError):
        logging.warn('Database file corrupt or not found, using empty database')
        self.db = _db_factory()

    # try to open backwards database
    if not revDbFilePath:
        self.revDbFilePath = os.path.join(os.path.dirname(__file__), "revmarkovdb")
    try:
        with open(self.revDbFilePath, 'rb') as dbfile:
            self.rev_db = pickle.load(dbfile)
    except (IOError, ValueError):
        logging.warn('Database file corrupt or not found, using empty database')
        self.rev_db = _db_factory()
def get_instances(self, folder):
    # "hp" (happiness) is mapped onto the "joy" label
    labels_dict = {
        "hp": "joy",
        "sd": "sadness",
        "ag": "anger",
        "dg": "disgust",
        "sp": "surprise",
        "fr": "fear"
    }
    instances = []
    labels = set()
    tagger = PerceptronTagger()  # load nltk perceptron just once to speed up tagging
    with open(folder) as f:
        for line in f:
            label, id, text = line.strip().split(" ", 2)  # split by first two spaces only
            if label == "ne":  # ignore no emotion
                continue
            inst = Instance(text, labels_dict[label])
            inst_tokenized = word_tokenize(text)
            inst_tagged = tagger.tag(inst_tokenized)
            for tokentag in inst_tagged:
                token = Token(tokentag[0], tokentag[1])
                inst.add_token(token)
            instances.append(inst)
            labels.add(label)
    return instances, labels
def __init__(self):
    self.hmm_models = []
    self.n_hmm = 0
    self.hmm2idx = {}
    self.idx2hmm = {}
    self.tagger = PerceptronTagger()
    return
def pos_tagger(infile, outfile):
    # `n` (the n-gram order) and `csv_headers` are expected to be module-level globals
    start = time.time()
    tagged_ngrams_counter = Counter()
    tagger = PerceptronTagger()
    with io.open(infile, encoding='utf-8', mode='rt') as text_file:
        for i, line in enumerate(text_file):
            if i % 100000 == 0:
                print(f'{os.getpid()} process, {i} lines, {time.time()-start:.1f} time')
            if n == 1:
                tagged_ngrams = tagger.tag(line.rstrip().split(' '))
            else:
                tagged_ngrams = [
                    tuple(tagger.tag(ngram))
                    for ngram in get_ngrams(line.rstrip().split(' '), n)
                ]
            tagged_ngrams_counter.update(tagged_ngrams)

    with open(outfile, mode='wb') as counter_pickle, \
            open(outfile[:-6] + 'csv', 'w', encoding='utf-8') as counter_csv:
        pickle.dump(tagged_ngrams_counter, counter_pickle)
        counter_csv.write(csv_headers[n])
        if n == 1:
            for tagged_ngram, count in tagged_ngrams_counter.items():
                counter_csv.write('{} {} {}\n'.format(*tagged_ngram, count))
        else:
            for tagged_ngram, count in tagged_ngrams_counter.items():
                ngram, tags = zip(*tagged_ngram)
                counter_csv.write('{} {} {}\n'.format(
                    ' '.join(ngram), ' '.join(tags), count))
def get_all_terms_in_sent(reg_exp, sent):
    tokens = nltk.word_tokenize(sent)
    # tags = nltk.pos_tag(tokens)
    pretrain = PerceptronTagger()
    tags = pretrain.tag(tokens)
    tags = [[tag[0], tag[1]] for tag in tags]
    if not tags[0][1].startswith("NNP"):
        tags[0][0] = tags[0][0].lower()
    tag_string = "".join(get_tag_string(tags))
    p = re.compile(reg_exp)
    res = []
    retrieved_phrases = p.finditer(tag_string)
    for m in retrieved_phrases:
        np_lst = [
            tok for (tok, tag) in tags[m.start():m.start() + len(m.group())]
        ]
        tag_lst = [
            interpret_tag(tag)
            for (tok, tag) in tags[m.start():m.start() + len(m.group())]
        ]
        res.append(" ".join(np_lst))
        if "P" in tag_lst:
            idx = tag_lst.index("P")
            res.append(" ".join(np_lst[:idx]))
            res.append(" ".join(np_lst[idx + 1:]))
    return res
def __init__(self):
    self.tagger = PerceptronTagger()
    self.alarm_ai = AlarmResponse()
    self.converter_ai = ConversionResponse()
    self.summarize_ai = SummarizerResponse()
    self.news_ai = NewsResponse()
    self.maps_ai = MapsResponse()
    self.lemmatizer = WordNetLemmatizer()
    self.parser = spy.English()
    self.conversion = ConversionService()
    self.ai = {
        "alarm_api": self.alarm_ai,
        "unit_conversion": self.converter_ai,
        "summarization": self.summarize_ai,
        "news": self.news_ai,
        "maps_api": self.maps_ai
    }
    self.classifiers = []
    with open('nb_dumped_classifier.pkl', 'rb') as fid:
        self.classifiers.append(cPickle.load(fid))
    with open('sgd_dumped_classifier.pkl', 'rb') as fid:
        self.classifiers.append(cPickle.load(fid))
    with open('pla_dumped_classifier.pkl', 'rb') as fid:
        self.classifiers.append(cPickle.load(fid))
    self.previous_ents = {"PERSON": "", "GPE": ""}
    self.special_tags = {
        'PERSON': ['his', 'her', 'him', 'he', 'she'],
        'GPE': ['it', 'there']
    }
def __init__(self,
             vocab_file,
             do_lower_case=True,
             never_split=None,
             additional_special_tokens=[
                 "[JJR]", "[JJS]", "[NNS]", "[NNP]", "[NNPS]", "[RBR]",
                 "[RBS]", "[VBD]", "[VBG]", "[VBN]", "[VBP]", "[VBZ]"
             ],
             **kwargs):
    self.inflection_tokens = additional_special_tokens
    self.tagger = PerceptronTagger()
    super().__init__(vocab_file,
                     do_lower_case=do_lower_case,
                     never_split=never_split,
                     additional_special_tokens=additional_special_tokens,
                     **kwargs)
    self.have_inflections = {'NOUN', 'ADJ', 'VERB'}
    self.lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"}
    self.do_lower_case = do_lower_case
    if do_lower_case:
        self.cased_tokenizer = BasicTokenizer(do_lower_case=False,
                                              never_split=never_split)
    else:
        self.cased_tokenizer = self.basic_tokenizer
def __init__(self):
    import nltk
    nltk.download('averaged_perceptron_tagger')
    from nltk.tag.perceptron import PerceptronTagger
    self.inst = PerceptronTagger()
def pos_tag(tokens, tagset=None):
    """
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.

        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]

    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be used, e.g. universal, wsj, brown
    :type tagset: str
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    """
    tagger = PerceptronTagger()
    return _pos_tag(tokens, tagset, tagger)
def __init__(self, extended=False):
    self.__chunk_patterns = r"""  # helps us find noun phrase chunks
        NP: {<DT>?<JJ.*>*<NN.*>+}
            {<NN.*>+}
    """
    # create a chunk parser
    self.__np_chunker = nltk.RegexpParser(self.__chunk_patterns)

    # now define the Hearst patterns
    # format is <hearst-pattern>, <hypernym_location>
    # so, what this means is that if you apply the first pattern,
    # the hypernym is the first NP in the match
    self.__hearst_patterns = [
        (r"(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"),
        (r"(NP_such_\w+ as (NP_\w+ ? (, )?(and |or )?)+\w+)", "first"),
        (r"((NP_\w+ (, )?)+( )?(and |or )?NP_other_\w+)", "last"),
        (r"(NP_\w+ (, )?including (NP_\w+ (, )?(and |or )?)+\w+)", "first"),
        (r"(NP_\w+ (, )?especially (NP_\w+ (, )?(and |or )?)+\w+)", "first")
        # ''' IMPLEMENT ADDITIONAL HEARST PATTERNS HERE '''
    ]

    if extended:
        self.__hearst_patterns.extend([
            (r"(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"),
            # ''' IMPLEMENT ADDITIONAL PATTERNS HERE '''
        ])

    self.__pos_tagger = PerceptronTagger()
def tag_text_en(tokens, tokens_span):
    """Receive tokens and spans and return tuple list with tagged tokens"""
    tagger = PerceptronTagger()
    tags = []
    for i, tagged in enumerate(tagger.tag(tokens)):
        tags.append(tagged + (tokens_span[i], []))
    return tags
def __init__(self, hparams, dataset_type="relocar"):
    self.hparams = hparams
    self.dataset_type = dataset_type

    self._get_labels()
    self._get_word_dict()
    self._pad_idx = self.hparams.pad_idx

    # POS Tagger
    perceptron_tagger = PerceptronTagger()
    self.pos2id = {"<PAD>": 0}
    for pos_tag in list(perceptron_tagger.classes):
        self.pos2id[pos_tag] = len(self.pos2id)

    # Stanford NER
    self.ne2id = {
        "<PAD>": 0,
        "O": 1,
        "LOCATION": 2,
        "ORGANIZATION": 3,
        "PERSON": 4
    }

    if self.hparams.do_bert and not self.hparams.do_roberta and not self.hparams.do_xlnet:
        self._bert_tokenizer_init()
    elif self.hparams.do_bert and self.hparams.do_roberta:
        self._roberta_tokenizer_init()
    elif self.hparams.do_bert and self.hparams.do_xlnet:
        self._xlnet_tokenizer_init()
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()
    fout = 'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname)
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)
    print(nRows, ':', nCols)

    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)
    print(tagidx)

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train, test, dev, test15, smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen, 'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word, tag) in tags:
                if word in exp_wemb.keys() and tag in tagidx.keys():
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it % 10) == 0:
                print('Progress:', it)
            it += 1

    f = open(fout, 'w')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
def postag(txt):
    precptag = PerceptronTagger()
    input_list = txt.split()
    try:
        tagged = nltk.tag._pos_tag(tokens=input_list, tagset=None, tagger=precptag, lang='eng')
    except TypeError:
        # older NLTK versions of _pos_tag do not accept the `lang` keyword
        tagged = nltk.tag._pos_tag(tokens=input_list, tagset=None, tagger=precptag)
    return tagged
def _get_tagger():
    # TODO: Instead of manually downloading the dutch_tagger, download it from
    # an external source if it isn't installed at Data/
    try:
        os.chdir(r"Data")
        tagger = PerceptronTagger(load=False)
        tagger.load('model.perc.dutch_tagger_small.pickle')
        return tagger
    except (IndexError, FileNotFoundError):
        return None
def nltk_perceptron_pos_tagger(input_dict):
    name = 'PerceptronPosTagger'
    if not input_dict['training_corpus']:
        perceptron_tagger = PerceptronTagger()
        name += '-pretrained'
    else:
        perceptron_tagger = PerceptronTagger(load=False)
        chunk = input_dict['training_corpus']['chunk']
        corpus = input_dict['training_corpus']['corpus']
        training_corpus = corpus_reader(corpus, chunk)
        perceptron_tagger.train(list(training_corpus))
    return {
        'pos_tagger': {
            'function': 'tag_sents',
            'object': perceptron_tagger,
            'name': name
        }
    }
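# Minimal training sketch (hypothetical toy data): PerceptronTagger.train expects
# a list of tagged sentences, i.e. lists of (token, tag) tuples, as produced by
# the corpus_reader call above.
#
#     tagger = PerceptronTagger(load=False)
#     tagger.train([[('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]])
#     tagger.tag(['the', 'dog'])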
def filter_input(txt):
    precptag = PerceptronTagger()
    input_list = txt.lower().split()
    temp_list = nltk.tag._pos_tag(input_list, None, tagger=precptag)
    return_list = []
    for temp in temp_list:
        if temp[1] not in ['CC', 'WDT', 'WP', 'WRB']:
            return_list.append(temp[0])
    # return_list = remove_stopwords(return_list)
    return return_list
def test_perceptron_tagging(self):
    sentence = "This is a test sentence to test if the testing works."
    tokens = word_tokenize(sentence)

    pt = PerceptronTagger(load=True)
    tag_result1 = [x[1] for x in pt.tag(tokens)]

    pt2 = perctagger()
    pt2.load()
    tag_result2 = pt2.tag(tokens)

    self.assertListEqual(tag_result1, tag_result2)
def pos_sequence_from(keyphrase, tags):
    """Receive keyphrase dict and return list of tags"""
    pos_sequence = list(map(lambda i: tags[i][1], keyphrase["tokens-indices"]))
    # Special case when the tokenization doesn't match the annotation
    if pos_sequence == []:
        tagger = PerceptronTagger()
        keyphrase_tags = tagger.tag(keyphrase['keyphrase-text'].split())
        pos_sequence = list(map(lambda t: t[1], keyphrase_tags))
    return pos_sequence
def __init__(self, df_train, df_valid, df_test, args, unk=True):
    self.dictionary = Dictionary(max_vocab_size=args.max_vocab_size)
    self.unk = unk
    self.classification = args.classification
    self.transfer_learning = args.transfer_learning
    self.max_length = args.n_ctx
    self.pos = args.POS_tags
    self.pos_tags = set()
    self.bpe = args.bpe
    self.lang = args.lang

    if self.bpe:
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(args.bpe_model_path)
    if self.pos:
        self.tagger = PerceptronTagger()

    if self.transfer_learning:
        with open(args.dict_path, 'rb') as file:
            self.dictionary = pickle.load(file)
    else:
        self.tokenize_df(df_train)
        self.tokenize_df(df_valid)
        self.tokenize_df(df_test)
        self.dictionary.sort_words(unk=self.unk, pos_tags=self.pos_tags)
        if not self.transfer_learning:
            with open(args.dict_path, 'wb') as file:
                pickle.dump(self.dictionary, file)

    if not self.classification:
        if self.pos:
            self.train, self.train_pos = self.tokenize_(df_train)
            self.valid, self.valid_pos = self.tokenize_(df_valid)
            self.test, self.test_pos = self.tokenize_(df_test)
        else:
            self.train = self.tokenize_(df_train)
            self.valid = self.tokenize_(df_valid)
            self.test = self.tokenize_(df_test)
    else:
        if self.pos:
            self.train, self.train_pos, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                df_train, max_length=self.max_length)
            self.valid, self.valid_pos, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                df_valid, max_length=self.max_length, valid=False)
            self.test, self.test_pos, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                df_test, max_length=self.max_length)
        else:
            self.train, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                df_train, max_length=self.max_length)
            self.valid, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                df_valid, max_length=self.max_length, valid=False)
            self.test, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                df_test, max_length=self.max_length)
def __init__(self, model_type='english-bidirectional-distsim.tagger'):
    """
    Args:
        model: model available in $STANFORD_MODELS:
            english-bidirectional-distsim.tagger
            english-caseless-left3words-distsim.tagger
            english-left3words-distsim.tagger
    """
    # self.eng_tagger = StanfordPOSTagger(model_type, java_options='-mx16000m')
    self.eng_tagger = PerceptronTagger()
def pos_tokenizer(text):
    word_tokens = tokenize(text)

    # using pretrained model to tag all tokens
    pretrained_tagger = PerceptronTagger(load=True)
    results = pretrained_tagger.tag(word_tokens)

    # collecting pos from resulting tuples
    pos_tokens = []
    for word_pos in results:
        pos_tokens.append(word_pos[1])
    return pos_tokens
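# Hypothetical example for the function above (tokenize() is assumed to be this
# module's own word tokenizer); the result is a plain list of POS tags, roughly:
#
#     pos_tokenizer('The cat sat.')  # -> ['DT', 'NN', 'VBD', '.']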
def __init__(self):
    # Prepare levels
    self.levels["USAGE"] = 1
    self.levels["RESULT"] = 2
    self.levels["MODEL-FEATURE"] = 3
    self.levels["PART_WHOLE"] = 4
    self.levels["TOPIC"] = 5
    self.levels["COMPARE"] = 6

    self.pos_tagger = PerceptronTagger()
    self.stopwords = nltk.corpus.stopwords.words('english')