Example #1
import platform

from nltk.tag.hunpos import HunposTagger

ht_cache = {}  # one cached tagger per language


def pos_tag_tokens(line, lang):
    """
    Do POS-tagging but return tuples for each input word
    :type line: list(str) An already tokenized line
    :type lang: str The language (2-letter code)
    """
    iso3 = ('swe' if lang[:2] == 'sv' else 'eng')  # ISO 639-3 code, also the cache key
    if iso3 in ht_cache:
        ht = ht_cache[iso3]
    else:
        if iso3 == 'eng':
            model = 'en_wsj.model'
            enc = 'utf-8'
        else:  # Swedish
            model = 'suc-suctags.model'
            enc = 'ISO-8859-1'
        # build the tagger
        if platform.system() == 'Windows':
            ht = HunposTagger(
                model,
                path_to_bin=r'.\thirdparty\hunpos-win\hunpos-tag.exe',
                encoding=enc)
        else:
            ht = HunposTagger(model, path_to_bin='./hunpos-tag', encoding=enc)
        # cache it
        ht_cache[iso3] = ht
    tuples = ht.tag(line)
    return tuples
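A minimal usage sketch for pos_tag_tokens, assuming the model files and the hunpos binary are laid out as the function expects; the input sentence and the tags shown are purely illustrative:

# Hypothetical call; the input must already be tokenized.
tokens = ['Det', 'här', 'är', 'ett', 'test', '.']
print(pos_tag_tokens(tokens, 'sv'))
# e.g. [('Det', 'DT'), ('här', 'AB'), ...] depending on the SUC tagset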
Example #2
    def __init__(self,
                 tok=False,
                 wtok=False,
                 stok=False,
                 pos=False,
                 stem=False,
                 pos_model=None,
                 abbrev_set=None,
                 stok_model=None):
        """@param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True

        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            # split a trailing punctuation mark off a token
            # TODO: handle cases like "Bragantino,2006.In"
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
        if stok:
            if stok_model is not None:
                # the caller supplied a ready-made sentence tokenizer
                self.senTokenizer = stok_model
            else:
                try:
                    self.senTokenizer = nltk.data.load(
                        'tokenizers/punkt/english.pickle')
                except LookupError:
                    sys.stderr.write(
                        "WARNING: English tokenizer cannot be loaded; "
                        "nltk.data does not contain it!\n")
                    sys.stderr.write(
                        "WARNING: using an untrained sen_tokenizer\n")
                    self.senTokenizer = PunktSentenceTokenizer()

        self.abbrev_set = (set(abbrev_set)
                           if abbrev_set is not None else set())

        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(
                    os.path.join(os.environ['HUNPOS'], 'english.model'),
                    encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()
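A usage sketch for this constructor. The enclosing class name (NlpTools below) is an assumption, as is a HUNPOS environment variable pointing at a directory that contains english.model:

import os
os.environ.setdefault('HUNPOS', '/opt/hunpos')  # assumed install location

tools = NlpTools(tok=True, pos=True, stem=True)  # class name assumed
words = tools.wordTokenizer.tokenize("The cat sat on the mat.")
print(tools.posTagger.tag(words))
print(tools.stemmer.lemmatize("cats"))  # -> 'cat'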
Example #3
    def filter_on_basis_pos_tag(self):
        hunpos_tagger = HunposTagger(HunPosModelPath, HunPosTagPath)
        filtered_list = list()

        def check_np(np):
            print "Actual NP %s\n" % np
            try:
                __list = hunpos_tagger.tag(nltk.wordpunct_tokenize(np))
                print(__list)
            except Exception:
                print("Ignoring string that could not be tagged: %s" % np)
                return None

            # keep the NP only if it contains at least one noun tag
            if not {"NNP", "NN", "NNS"} & {__tag for (token, __tag) in __list}:
                return None

            result = [
                __token for (__token, __tag) in __list
                if __tag not in ("RB", "CD", "FW")
            ]
            print "Stripped off NP %s \n" % " ".join(result)
            return " ".join(result)

        for __e in self.result:
            filtered_list.append(check_np(__e))
        return list(filter(None, filtered_list))
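HunPosModelPath and HunPosTagPath are module-level constants that the snippet does not show; a plausible definition, assuming an English WSJ model and a local hunpos build, would be:

# Assumed paths; adjust to your hunpos installation.
HunPosModelPath = "models/en_wsj.model"
HunPosTagPath = "/usr/local/bin/hunpos-tag"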
Example #4
    def custom_ner(self):
        ner = list()
        regexp_grammar = r"NER:{<IN><NN.*><NN.*>?}"
        __parser = nltk.RegexpParser(regexp_grammar)

        hunpos_tagger = HunposTagger(HunPosModelPath, HunPosTagPath)
        for __sentence in self.sentences:
            try:
                tagged = hunpos_tagger.tag(nltk.word_tokenize(__sentence))
            except Exception:
                # the hunpos subprocess can die mid-run; restart it and retry
                hunpos_tagger = HunposTagger(HunPosModelPath, HunPosTagPath)
                tagged = hunpos_tagger.tag(nltk.word_tokenize(__sentence))
            tree = __parser.parse(tagged)
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NER'):
                l = " ".join([
                    e[0] for e in subtree.leaves()
                    if e[1] == 'NNP' or e[1] == 'NNS' or e[1] == 'NN'
                ])
                ner.append(l.lower())

        result = sorted(Counter(ner).items(), reverse=True, key=lambda x: x[1])
        return result
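To see what the NER:{<IN><NN.*><NN.*>?} pattern captures, here is a small self-contained sketch; it substitutes a hand-tagged sentence for the hunpos output, purely for illustration:

import nltk

parser = nltk.RegexpParser(r"NER:{<IN><NN.*><NN.*>?}")
tagged = [('food', 'NN'), ('in', 'IN'), ('New', 'NNP'), ('Delhi', 'NNP')]
tree = parser.parse(tagged)
for subtree in tree.subtrees(filter=lambda t: t.label() == 'NER'):
    # keep only the noun tokens, as custom_ner() does
    print(" ".join(tok for tok, tag in subtree.leaves()
                   if tag in ('NNP', 'NNS', 'NN')).lower())
# prints: new delhi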
Example #5
    def filter_on_basis_pos_tag(self):
        """
        POS-tag each noun phrase; if a phrase contains an adjective,
        an RB, or an FW tag, remove it from the noun-phrase list.

        A noun phrase is also dropped if, when split into words, any of
        them appears in self.list_to_exclude. For example, with
        self.list_to_exclude = ["food", "i", "service", "cost",
        "ambience", "delhi", "Delhi", "place", "Place"]
        the phrase "great place" would be excluded, because "place" is
        in the list.
        """
        print "{0} These noun phrases will be removed from the noun phrases {1}".format(
            bcolors.OKBLUE, bcolors.RESET)
        print "{0} List To Exclude {1}".format(bcolors.OKBLUE, bcolors.RESET)
        print self.list_to_exclude
        print "\n"
        print "{0} Common name entities  {1}".format(bcolors.OKBLUE,
                                                     bcolors.RESET)
        print self.common_ners
        print "\n"
        hunpos_tagger = HunposTagger(HunPosModelPath, HunPosTagPath)
        filtered_list = list()
        for __e in self.result:
            __list = [
                pos_tag for (np, pos_tag) in hunpos_tagger.tag(
                    nltk.wordpunct_tokenize(__e.get("name")))
            ]

            if set(__e.get("name").split(" ")) & set(self.list_to_exclude):
                # the phrase contains an excluded word; report and drop it
                print(__e.get("name"))

            #elif __e.get("name") in self.common_ners:
            #        pass

            elif "RB" == __list[0] or "CD" in __list or "FW" in __list:
                pass
            else:
                filtered_list.append(__e)

        return filtered_list
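bcolors above is an assumed ANSI color helper that the snippet does not include; a minimal stand-in:

# Hypothetical stand-in; only the attributes used above are defined.
class bcolors:
    OKBLUE = '\033[94m'
    RESET = '\033[0m'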
Example #6
def work():
    global conversion
    pos_tagger = HunposTagger(path_to_model=config.path_list.hunpos_model,
                              path_to_bin=config.path_list.hunpos_bin)
    conversion = inflect.engine()
    with open(config.path_list.en_stopwords, 'r', encoding='utf-8') as f:
        stopwords = set(f.read().split('\n'))
    with open(config.path_list.en_kp_list, 'r', encoding='utf-8') as f:
        lists = f.read().split('\n')
        print('load kp_list done.')
    with open(config.path_list.input, 'r', encoding='utf-8') as f:
        data = []
        for line in f.read().split('\n'):
            if line == '':
                continue
            data.append(line.lower())
        res1 = segment(data, pos_tagger)
        res2 = pattern_filter(lists, res1, stopwords)
        json_gen(res2)
        print('prework done.')
    pos_tagger.close()
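work() reads everything from a project-local config module (and helpers segment, pattern_filter, json_gen) that the snippet does not include; a minimal stand-in with the attributes the function touches might look like this, where every name and path is an assumption:

# Hypothetical stand-in for the project's config module.
class _PathList:
    hunpos_model = 'models/en_wsj.model'
    hunpos_bin = 'bin/hunpos-tag'
    en_stopwords = 'data/en_stopwords.txt'
    en_kp_list = 'data/en_kp_list.txt'
    input = 'data/input.txt'

class config:
    path_list = _PathList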
Example #7
    def filter_on_basis_pos_tag(self):
        """
        POS-tag each noun phrase; if the phrase contains an FW, CD, or
        LS tag, remove it from the total noun-phrase list.
        """
        hunpos_tagger = HunposTagger(HunPosModelPath, HunPosTagPath)
        filtered_list = list()
        for __e in self.result:
            # strip non-ASCII characters before tokenizing
            name = __e.get("name").encode("ascii", "ignore").decode("ascii")
            __list = [pos_tag for (np, pos_tag)
                      in hunpos_tagger.tag(nltk.wordpunct_tokenize(name))]
            if set(__list) & {"FW", "CD", "LS"}:
                print("This will be dropped out of the total noun phrases: %s"
                      % __e.get("name"))
            else:
                filtered_list.append(__e)

        return filtered_list
Example #8
def get_hunpos_tagger():
    hunmorph_dir = os.environ['HUNMORPH_DIR']
    hunpos_binary = os.path.join(hunmorph_dir, 'hunpos-tag')
    hunpos_model = os.path.join(hunmorph_dir, 'en_wsj.model')
    return HunposTagger(hunpos_model, hunpos_binary)
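Usage sketch, assuming HUNMORPH_DIR points at a directory that contains both the hunpos-tag binary and en_wsj.model:

import os
os.environ.setdefault('HUNMORPH_DIR', '/opt/hunmorph')  # assumed location

tagger = get_hunpos_tagger()
print(tagger.tag(['This', 'is', 'a', 'test', '.']))
tagger.close()  # shut down the hunpos subprocess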
Example #9
__author__ = 'pengzhang'
# New Version
# chunking the tuples into three arguments
import copy
import json
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag.hunpos import HunposTagger

###############
# load data
tuple_anno = json.load(open('../abstract/binary_extracted_train.json'))
###############
ht = HunposTagger('../en_wsj.model')

p_list = ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'UH']
s_list = ['NN', 'JJ', 'JJS', 'JJR', 'DT', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$']
s_jj_list = ['JJ', 'JJS', 'JJR', 'DT']
verb_list = ['VB', 'VBG', 'VBD', 'VBZ', 'VBN', 'VBP']

obj_list = json.load(open('../abstract/abstract_obj_list.json'))
location_list = ['bed','bar','pad','lilypad','shelf','house','coatrack','endtable','left','right','center','top',\
                 'front','middle','back','ground','cartoon','monkeybar','petbed','rope','footstool','bat']

all_obj_list = []
for entry in obj_list:
    # take the object list stored as the dict's (first) value
    all_obj_list += list(entry.values())[0]

all_obj_list = list(set(all_obj_list))  # deduplicate
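A sketch of how the tagger and the tag lists above combine, e.g. to pull nouns and verbs out of a sentence; the sentence is illustrative, and note that some NLTK versions return hunpos tags as bytes:

sent = 'the monkey sits on the red bed'.split()
tagged = ht.tag(sent)
# normalize tags to str in case this NLTK version returns bytes
tagged = [(tok, tag.decode() if isinstance(tag, bytes) else tag)
          for tok, tag in tagged]
nouns = [tok for tok, tag in tagged if tag in p_list]
verbs = [tok for tok, tag in tagged if tag in verb_list]
print(nouns, verbs)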
Example #10
    def __init__(self, model_fn=None):
        self.tokenizer = NOTokenizer()
        # use the supplied model if given, else fall back to the default
        self.tagger = HunposTagger(model_fn or NNO_TAGGER_DEFAULT_MODEL_FN,
                                   hunpos_tag_bin(),
                                   encoding='utf-8')
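A usage sketch for this wrapper; the enclosing class name (NNOTagger below) and NOTokenizer's tokenize() method are both assumptions:

wrapper = NNOTagger()  # class name assumed
tokens = wrapper.tokenizer.tokenize('Dette er ein test .')  # method assumed
print(wrapper.tagger.tag(tokens))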