import platform

from nltk.tag.hunpos import HunposTagger

# one cached HunposTagger process per language, keyed by ISO 639-3 code
ht_cache = {}


def pos_tag_tokens(line, lang):
    """
    Do POS-tagging, returning (word, tag) tuples for each input word.

    :type line: list(str)
    :param line: An already tokenized line
    :type lang: str
    :param lang: The language (2-letter code)
    """
    iso3 = 'swe' if lang[:2] == 'sv' else 'eng'
    if iso3 in ht_cache:
        ht = ht_cache[iso3]
    else:
        if iso3 == 'eng':
            model = 'en_wsj.model'
            enc = 'utf-8'
        else:  # Swedish
            model = 'suc-suctags.model'
            enc = 'ISO-8859-1'
        # build the tagger
        if platform.system() == 'Windows':
            ht = HunposTagger(
                model,
                path_to_bin=r'.\thirdparty\hunpos-win\hunpos-tag.exe',
                encoding=enc)
        else:
            ht = HunposTagger(model, path_to_bin='./hunpos-tag', encoding=enc)
        # cache it so we only spawn one hunpos process per language
        ht_cache[iso3] = ht
    tuples = ht.tag(line)
    return tuples
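# Usage sketch (illustrative, not from the original source): assumes the
# models and the hunpos-tag binary referenced above are present in the
# working directory.
# print(pos_tag_tokens(['Det', 'regnar', '.'], 'sv'))  # Swedish
# print(pos_tag_tokens(['It', 'rains', '.'], 'en'))    # English; taggers are cached per language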
def _perform_analysis(self, tokenized_sents):
    """POS-tag the given sentences, caching the result on first use."""
    if len(self.precalced_data):
        return self.precalced_data
    res = []
    # build the tagger once, instead of spawning one hunpos process per sentence
    tagger = HunposTagger(self.model_path, self.hunpos_bin, 'utf-8')
    for tokens in tokenized_sents:
        res += tagger.tag(tokens)
    tagger.close()
    self.precalced_data = res
    return res
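# Minimal host-class sketch (an assumption, not from the original source):
# _perform_analysis presumably lives on a class carrying model_path,
# hunpos_bin and a precalced_data cache, roughly like this:
class _PosAnalysisStub(object):
    def __init__(self, model_path, hunpos_bin):
        self.model_path = model_path
        self.hunpos_bin = hunpos_bin
        self.precalced_data = []

    _perform_analysis = _perform_analysis  # bind the function above as a method

# stub = _PosAnalysisStub('en_wsj.model', './hunpos-tag')
# print(stub._perform_analysis([['This', 'is', 'a', 'test', '.']]))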
def filter_on_basis_pos_tag(self):
    hunpos_tagger = HunposTagger(HunPosModelPath, HunPosTagPath)
    filtered_list = list()

    def check_np(np):
        print "Actual NP %s\n" % np
        try:
            #__list = hunpos_tagger.tag(nltk.wordpunct_tokenize(np.encode("utf-8")))
            __list = hunpos_tagger.tag(nltk.wordpunct_tokenize(np))
            print __list
        except Exception as e:
            print "Ignoring untaggable string %s" % np
            return None
        # keep only noun phrases that actually contain a noun tag
        if not set.intersection(set(["NNP", "NN", "NNS"]),
                                set([__tag for (token, __tag) in __list])):
            return None
        # strip adverbs, cardinal numbers and foreign words
        result = [__token for (__token, __tag) in __list
                  if __tag not in ["RB", "CD", "FW"]]
        print "Stripped-down NP %s \n" % " ".join(result)
        return " ".join(result)

    for __e in self.result:
        filtered_list.append(check_np(__e))
    return filter(None, filtered_list)
def custom_ner(self):
    ner = list()
    # chunk grammar: a preposition followed by one or two noun-like tags
    regexp_grammar = r"NER: {<IN><NN.*><NN.*>?}"
    __parser = nltk.RegexpParser(regexp_grammar)
    hunpos_tagger = HunposTagger(HunPosModelPath, HunPosTagPath)
    for __sentence in self.sentences:
        try:
            tagged = hunpos_tagger.tag(
                nltk.word_tokenize(encoding_helper(__sentence)))
            tree = __parser.parse(tagged)
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NER'):
                l = " ".join([e[0] for e in subtree.leaves()
                              if e[1] in ('NNP', 'NNS', 'NN')])
                ner.append(l.lower())
        except Exception as e:
            pass
    result = sorted(Counter(ner).items(), reverse=True, key=lambda x: x[1])
    return result
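# A runnable illustration (not from the original source) of the chunk
# grammar above on a hand-tagged sentence, so no hunpos binary is needed:
_demo_parser = nltk.RegexpParser(r"NER: {<IN><NN.*><NN.*>?}")
_demo_tagged = [('pizza', 'NN'), ('in', 'IN'), ('south', 'NN'), ('delhi', 'NNP')]
# prints roughly: (S pizza/NN (NER in/IN south/NN delhi/NNP))
print _demo_parser.parse(_demo_tagged)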
def filter_on_basis_pos_tag(self):
    """
    POS-tagging of the noun phrases will be done, and if a noun phrase
    contains adverbs (RB), cardinal numbers (CD) or foreign words (FW),
    it will be removed from the total noun_phrases list.

    Any noun phrase that, when split, is present in self.list_to_exclude
    will not be included in the final result.
    For example:
            self.list_to_exclude = ["food", "i", "service", "cost",
                                    "ambience", "delhi", "Delhi",
                                    "place", "Place"]
            noun_phrase = "great place"
    """
    print "{0} These noun phrases will be removed from the noun phrases {1}".format(
        bcolors.OKBLUE, bcolors.RESET)
    print "{0} List To Exclude {1}".format(bcolors.OKBLUE, bcolors.RESET)
    print self.list_to_exclude
    print "\n"
    print "{0} Common named entities {1}".format(bcolors.OKBLUE, bcolors.RESET)
    print self.common_ners
    print "\n"
    hunpos_tagger = HunposTagger(HunPosModelPath, HunPosTagPath)
    filtered_list = list()
    for __e in self.result:
        __list = [pos_tag for (np, pos_tag) in hunpos_tagger.tag(
            nltk.wordpunct_tokenize(__e.get("name")))]
        if bool(set.intersection(set(__e.get("name").split(" ")),
                                 set(self.list_to_exclude))):
            print __e.get("name")
            pass
        #elif __e.get("name") in self.common_ners:
        #    pass
        elif __list[0] == "RB" or "CD" in __list or "FW" in __list:
            pass
        else:
            filtered_list.append(__e)
    return filtered_list
def work():
    global conversion
    pos_tagger = HunposTagger(path_to_model=config.path_list.hunpos_model,
                              path_to_bin=config.path_list.hunpos_bin)
    conversion = inflect.engine()
    with open(config.path_list.en_stopwords, 'r', encoding='utf-8') as f:
        stopwords = set(f.read().split('\n'))
    with open(config.path_list.en_kp_list, 'r', encoding='utf-8') as f:
        lists = f.read().split('\n')
    print('load kp_list done.')
    with open(config.path_list.input, 'r', encoding='utf-8') as f:
        data = []
        for line in f.read().split('\n'):
            if line == '':
                continue
            data.append(line.lower())
    res1 = segment(data, pos_tagger)
    res2 = pattern_filter(lists, res1, stopwords)
    json_gen(res2)
    print('prework done.')
    pos_tagger.close()
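# A minimal sketch (not from the original source) of the open/tag/close
# lifecycle that work() relies on; model and binary paths are placeholders.
def _tagger_lifecycle_demo(model_path, bin_path):
    tagger = HunposTagger(path_to_model=model_path, path_to_bin=bin_path)
    try:
        # tag() takes a list of tokens and returns (token, tag) tuples
        return tagger.tag(['The', 'quick', 'brown', 'fox'])
    finally:
        tagger.close()  # terminate the hunpos subprocess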
def filter_on_basis_pos_tag(self):
    """
    POS-tagging of the noun phrases will be done, and if a noun phrase
    contains foreign words (FW), cardinal numbers (CD) or list markers (LS),
    it will be removed from the total noun_phrases list.

    Any noun phrase that, when split, is present in self.list_to_exclude
    will not be included in the final result.
    For example:
            self.list_to_exclude = ["food", "i", "service", "cost",
                                    "ambience", "delhi", "Delhi",
                                    "place", "Place"]
            noun_phrase = "great place"
    """
    hunpos_tagger = HunposTagger(HunPosModelPath, HunPosTagPath)
    filtered_list = list()
    for __e in self.result:
        __list = [pos_tag for (np, pos_tag) in hunpos_tagger.tag(
            nltk.wordpunct_tokenize(__e.get("name").encode("ascii", "ignore")))]
        if set.intersection(set(__list), set(["FW", "CD", "LS"])):
            print "This will be dropped out of the total noun phrases %s" % __e.get("name")
        else:
            filtered_list.append(__e)
    return filtered_list
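# A toy illustration of the tag-set filtering above, using hand-written
# (word, tag) pairs so no tagger binary is needed; _drop_if_contains is a
# hypothetical helper mirroring the set-intersection test.
def _drop_if_contains(tagged_np, bad_tags=("FW", "CD", "LS")):
    """Return True when the tagged noun phrase contains an unwanted tag."""
    return bool(set(tag for _, tag in tagged_np) & set(bad_tags))

# _drop_if_contains([('table', 'NN'), ('42', 'CD')])     -> True (dropped)
# _drop_if_contains([('great', 'JJ'), ('place', 'NN')])  -> False (kept)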
class NNOTagger(BaseTagger, object):
    """
    TextBlob-compatible Norsk Nynorsk POS tagger class based on the
    NLTK HunPos wrapper.
    """

    def __init__(self, model_fn=None):
        self.tokenizer = NOTokenizer()
        # fall back to the default model when none is given
        self.tagger = HunposTagger(model_fn or NNO_TAGGER_DEFAULT_MODEL_FN,
                                   hunpos_tag_bin(),
                                   encoding='utf-8')

    def tag(self, text, tokenize=True):
        text = clean_input(text)
        if tokenize:
            text = self.tokenizer.tokenize(text)
        return self.tagger.tag(text)
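# Usage sketch (illustrative, not from the original source): assumes the
# package providing NOTokenizer, hunpos_tag_bin() and the default Nynorsk
# model is installed.
# tagger = NNOTagger()
# print(tagger.tag(u'Eg les ei bok.'))  # -> [(word, tag), ...]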
__author__ = 'pengzhang'
# New Version
# chunking the tuples into three arguments
import copy
import json
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag.hunpos import HunposTagger

###############
# load data
tuple_anno = json.load(open('../abstract/binary_extracted_train.json'))
###############

ht = HunposTagger('../en_wsj.model')

p_list = ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'UH']
s_list = ['NN', 'JJ', 'JJS', 'JJR', 'DT', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$']
s_jj_list = ['JJ', 'JJS', 'JJR', 'DT']
verb_list = ['VB', 'VBG', 'VBD', 'VBZ', 'VBN', 'VBP']
obj_list = json.load(open('../abstract/abstract_obj_list.json'))
location_list = ['bed', 'bar', 'pad', 'lilypad', 'shelf', 'house', 'coatrack',
                 'endtable', 'left', 'right', 'center', 'top', 'front',
                 'middle', 'back', 'ground', 'cartoon', 'monkeybar', 'petbed',
                 'rope', 'footstool', 'bat']

# collect the set of all object names (Python 2: dict.values() is a list)
all_obj_list = []
for i in range(len(obj_list)):
    all_obj_list += obj_list[i].values()[0]
all_obj_list = list(set(all_obj_list))
class NltkTools:
    _abbrevPattern = re.compile(r"([\w][\w]?[.]){2,}$", re.UNICODE)
    _datePattern = re.compile(r"(^|\s)(?:[\d]{2}){1,2}[.]$", re.UNICODE)
    _cleanerPattern = re.compile(r"(\w\w)([.?,:;!])(\w)(\w)", re.UNICODE)

    def __init__(self, tok=False, wtok=False, stok=False, pos=False,
                 stem=False, pos_model=None, abbrev_set=None):
        """@param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True
        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
            # also handles runs like "Bragantino,2006.In"
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
        if stok:
            try:
                self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            except LookupError:
                sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain it!\n")
                sys.stderr.write("WARNING: using an untrained sen_tokenizer\n")
                self.senTokenizer = PunktSentenceTokenizer()
        self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())
        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(
                    os.path.join(os.environ['HUNPOS'], 'english.model'),
                    encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()

    def tokenize(self, raw):
        """Runs sentence and then word tokenization. Does some abbreviation
        detection to fix false sentence endings."""
        sentences = self.sen_tokenize(raw)
        tokens = [self.word_tokenize(sen) for sen in sentences]
        for i in reversed(xrange(len(tokens) - 1)):
            if (self.is_abbrev(tokens[i][-1])
                    or NltkTools._abbrevPattern.match(tokens[i][-1]) is not None
                    and not NltkTools.starts_with_upper(tokens[i + 1][0])):
                tokens[i].extend(tokens[i + 1])
                tokens.pop(i + 1)
        return tokens

    def sen_tokenize(self, raw):
        """Tokenizes the raw text into sentences."""
        raw = NltkTools.cleanup_puncts(raw)
        return self.senTokenizer.tokenize(raw)

    def filter_long_sentences(self, raw, length=1024):
        """Filters "sentences" (non-whitespace character sequences longer
        than length) from the text."""
        # TODO: This looks nice but it is too generous with memory use
        return ' '.join(filter(lambda x: len(x) <= length, re.split(r"\s+", raw)))

    def sen_abbr_tokenize(self, raw):
        """Tokenizes the raw text into sentences, and tries to handle
        problems caused by abbreviations and such."""
        sentences = self.sen_tokenize(raw)
        for i in reversed(xrange(len(sentences) - 1)):
            if (NltkTools._abbrevPattern.search(sentences[i]) is not None
                    and not NltkTools.starts_with_upper(sentences[i + 1])):
                sentences[i] = ' '.join(sentences[i:i + 2])
                sentences.pop(i + 1)
        return sentences

    @staticmethod
    def starts_with_upper(text):
        """Checks if the sentence starts with an upper case letter."""
        t = text.lstrip()
        return len(t) > 0 and t[0].isupper()

    @staticmethod
    def cleanup_puncts(raw):
        pos = 0
        cleaner = NltkTools._cleanerPattern.search(raw[pos:])
        while cleaner:
            if cleaner.group(2) == "." and not cleaner.group(3)[0].isupper():
                pos = cleaner.end()
            elif cleaner.group(1)[-1].isdigit() and cleaner.group(3)[0].isdigit():
                pos = cleaner.end()
            else:
                changed_part_string = cleaner.expand(r"\1\2 \3\4")
                raw = raw[:cleaner.start()] + changed_part_string + raw[cleaner.end():]
                pos = cleaner.end()
            cleaner = NltkTools._cleanerPattern.search(raw, pos)
        return raw

    def is_abbrev(self, tok):
        return tok in self.abbrev_set

    def word_tokenize(self, sen):
        """Tokenizes the sentence to words and splits the sentence ending
        punctuation mark from the last word and adds it as the last token."""
        tokens = self.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []
        punktMatchObject = self.punktSplitter.match(tokens[-1])
        if punktMatchObject is not None and not self.is_abbrev(tokens[-1]):
            tokens = tokens[:-1] + list(punktMatchObject.groups())
        return tokens

    def pos_tag(self, sentokens):
        return self.posTagger.tag(sentokens)

    def stem(self, tokens):
        return ((tok, pos, self.stemmer.lemmatize(tok, penn_to_major_pos[pos]))
                for tok, pos in tokens)

    def tag_raw(self, raw_text):
        """Convenience method for tagging (a line of) raw text.

        The NltkTools instance must have been initialized with
        C{pos=True, stem=True, tok=True}. It is a generator: returns the
        attribute array of one word at a time. The attributes are the word,
        the pos tag and the stem."""
        sens = self.tokenize(raw_text)
        pos_tagged = list(self.pos_tag(sen) for sen in sens)
        stemmed = list(self.stem(pos_tagged_sen) for pos_tagged_sen in pos_tagged)
        for sen in stemmed:
            for tok in sen:
                yield tok
            yield []
        return
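# Usage sketch for NltkTools (illustrative, not from the original source;
# assumes a hunpos model reachable via the HUNPOS environment variable and
# the external penn_to_major_pos mapping used by stem()):
# nt = NltkTools(tok=True, pos=True, stem=True)
# for attrs in nt.tag_raw("The cats sat on the mat."):
#     print attrs   # e.g. ('cats', 'NNS', 'cat'); [] marks a sentence end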
def get_hunpos_tagger():
    hunmorph_dir = os.environ['HUNMORPH_DIR']
    hunpos_binary = os.path.join(hunmorph_dir, 'hunpos-tag')
    hunpos_model = os.path.join(hunmorph_dir, 'en_wsj.model')
    return HunposTagger(hunpos_model, hunpos_binary)
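# Usage sketch (illustrative): assumes HUNMORPH_DIR points at a directory
# containing the hunpos-tag binary and the en_wsj.model file.
# os.environ['HUNMORPH_DIR'] = '/opt/hunmorph'   # hypothetical path
# tagger = get_hunpos_tagger()
# print(tagger.tag(['This', 'is', 'a', 'test', '.']))
# tagger.close()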
root_tuple = []
root_imgid = []
root_answer = []
root_obj = []
root_tag = []
root_queid = []
nmod_key = []
nmod_question = []
nmod_tuple = []
nmod_imgid = []
nmod_answer = []
nmod_obj = []
nmod_tag = []
nmod_queid = []

ht = HunposTagger('../en_wsj.model')
for i in range(len(tuple_parsing)):
    question = tuple_question[i]['question']
    word_token = nltk.word_tokenize(question)
    word_tag = ht.tag(word_token)
    item = tuple_parsing[i]
    if "nsubj" in item.keys():
        ind = item.keys().index("nsubj")
        item_keys = item.keys()
        if (list(word_tag[ind])[-1] not in noun_list
                and list(word_tag[ind])[-1] not in other_list
                and list(word_tag[ind])[0] not in location_list):
            if ind + 1 < len(word_tag):
                if (list(word_tag[ind - 1])[-1] in noun_list
                        or list(word_tag[ind])[0] in location_list):
                    sent_tuple_key_tmp = item_keys[ind - 1:len(item_keys)]
                elif (list(word_tag[ind + 1])[-1] in noun_list
                        or list(word_tag[ind])[0] in location_list):
                    sent_tuple_key_tmp = item_keys[ind:len(item_keys)]
class NltkTools:
    _abbrevPattern = re.compile(r"([\w][\w]?[.]){2,}$", re.UNICODE)
    _datePattern = re.compile(r"(^|\s)(?:[\d]{2}){1,2}[.]$", re.UNICODE)
    _cleanerPattern = re.compile(r"(\w\w)([.?,:;!])(\w)(\w)", re.UNICODE)

    def __init__(self, tok=False, wtok=False, stok=False, pos=False,
                 stem=False, pos_model=None, abbrev_set=None, stok_model=None):
        """@param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True
        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
            # also handles runs like "Bragantino,2006.In"
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
        if stok:
            if stok_model is not None:
                try:
                    self.senTokenizer = stok_model
                except LookupError:
                    sys.stderr.write("WARNING: tokenizer cannot be loaded\n")
                    sys.stderr.write("WARNING: using an untrained sen_tokenizer\n")
                    self.senTokenizer = PunktSentenceTokenizer()
            else:
                try:
                    self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
                except LookupError:
                    sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain it!\n")
                    sys.stderr.write("WARNING: using an untrained sen_tokenizer\n")
                    self.senTokenizer = PunktSentenceTokenizer()
        self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())
        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(
                    os.path.join(os.environ['HUNPOS'], 'english.model'),
                    encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()

    def tokenize(self, raw):
        """Runs sentence and then word tokenization. Does some abbreviation
        detection to fix false sentence endings."""
        sentences = self.sen_tokenize(raw)
        tokens = [self.word_tokenize(sen) for sen in sentences]
        for i in reversed(xrange(len(tokens) - 1)):
            if (self.is_abbrev(tokens[i][-1])
                    or NltkTools._abbrevPattern.match(tokens[i][-1]) is not None
                    and not NltkTools.starts_with_upper(tokens[i + 1][0])):
                tokens[i].extend(tokens[i + 1])
                tokens.pop(i + 1)
        return tokens

    def sen_tokenize(self, raw):
        """Tokenizes the raw text into sentences."""
        raw = NltkTools.cleanup_puncts(raw)
        return self.senTokenizer.tokenize(raw)

    def filter_long_sentences(self, raw, length=1024):
        """Filters "sentences" (non-whitespace character sequences longer
        than length) from the text."""
        # TODO: This looks nice but it is too generous with memory use
        return ' '.join(filter(lambda x: len(x) <= length, re.split(r"\s+", raw)))

    def sen_abbr_tokenize(self, raw):
        """Tokenizes the raw text into sentences, and tries to handle
        problems caused by abbreviations and such."""
        sentences = self.sen_tokenize(raw)
        for i in reversed(xrange(len(sentences) - 1)):
            if (NltkTools._abbrevPattern.search(sentences[i]) is not None
                    and not NltkTools.starts_with_upper(sentences[i + 1])):
                sentences[i] = ' '.join(sentences[i:i + 2])
                sentences.pop(i + 1)
        return sentences

    @staticmethod
    def starts_with_upper(text):
        """Checks if the sentence starts with an upper case letter."""
        t = text.lstrip()
        return len(t) > 0 and t[0].isupper()

    @staticmethod
    def cleanup_puncts(raw):
        pos = 0
        cleaner = NltkTools._cleanerPattern.search(raw[pos:])
        while cleaner:
            if cleaner.group(2) == "." and not cleaner.group(3)[0].isupper():
                pos = cleaner.end()
            elif cleaner.group(1)[-1].isdigit() and cleaner.group(3)[0].isdigit():
                pos = cleaner.end()
            else:
                changed_part_string = cleaner.expand(r"\1\2 \3\4")
                raw = raw[:cleaner.start()] + changed_part_string + raw[cleaner.end():]
                pos = cleaner.end()
            cleaner = NltkTools._cleanerPattern.search(raw, pos)
        return raw

    def is_abbrev(self, tok):
        return tok in self.abbrev_set

    def word_tokenize(self, sen):
        """Tokenizes the sentence to words and splits the sentence ending
        punctuation mark from the last word and adds it as the last token."""
        tokens = self.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []
        punktMatchObject = self.punktSplitter.match(tokens[-1])
        if punktMatchObject is not None and not self.is_abbrev(tokens[-1]):
            tokens = tokens[:-1] + list(punktMatchObject.groups())
        return tokens

    def pos_tag(self, sentokens):
        return self.posTagger.tag(sentokens)

    def stem(self, tokens):
        return ((tok, pos, self.stemmer.lemmatize(tok, penn_to_major_pos[pos]))
                for tok, pos in tokens)

    def tag_raw(self, raw_text):
        """Convenience method for tagging (a line of) raw text.

        The NltkTools instance must have been initialized with
        C{pos=True, stem=True, tok=True}. It is a generator: returns the
        attribute array of one word at a time. The attributes are the word,
        the pos tag and the stem."""
        sens = self.tokenize(raw_text)
        pos_tagged = list(self.pos_tag(sen) for sen in sens)
        stemmed = list(self.stem(pos_tagged_sen) for pos_tagged_sen in pos_tagged)
        for sen in stemmed:
            for tok in sen:
                yield tok
            yield []
        return
from os import listdir
from os.path import isfile, join
import os
import sys

import nltk
from nltk.tag.hunpos import HunposTagger
from nltk.tokenize import word_tokenize

reload(sys)
#sys.setdefaultencoding('utf8')

mypath = "texts_burek/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
corpus = "zeleni ko polje, blagi ko oboje"  # Serbian test sentence

print os.environ['HUNPOS']
ht = HunposTagger('model.hunpos.mte5.defnpout')
tagged = ht.tag(word_tokenize(corpus))
print tagged

# look for similes of the form "<adjective/noun> kao/ko/k'o <adjective>* <noun>"
i = 0
for t in tagged:
    if (i > 0
            and ('A' in tagged[i - 1][1] or 'N' in tagged[i - 1][1])
            and tagged[i][0] in ('kao', 'ko', "k'o")):
        p = i + 1
        while 'A' in tagged[p][1]:
            p = p + 1
        if 'N' in tagged[p][1]:
            simile = tagged[i - 1][0].encode('utf8') + ' ' + tagged[i][0].encode('utf8')
            for k in range(i + 1, p + 1):
                simile = simile + ' ' + tagged[k][0].encode('utf8')
            if simile != '':
                print simile
    i = i + 1