Example #1
    def pos_tag(self):
        tokenize_obj = NLTKTokenize(self.options)
        res = tokenize_obj.tokenize()
        tokens = res['result']
        tags = []

        # Performs Bigram / Unigram / Regex Tagging
        if self.options.get('tagger') in ['unigram', 'bigram', 'regex']:
            trainer = (self.options['train']
                       if self.options.get('train') in TRAINERS
                       else DEFAULT_TRAIN)

            train = brown.tagged_sents(categories=trainer)

            # Create your custom regex tagging pattern here
            regex_tag = RegexpTagger([(r'^[-\:]?[0-9]+(\.[0-9]+)?$', 'CD'),
                                      (r'.*able$', 'JJ'),
                                      (r'^[A-Z].*$', 'NNP'), (r'.*ly$', 'RB'),
                                      (r'.*s$', 'NNS'), (r'.*ing$', 'VBG'),
                                      (r'.*ed$', 'VBD'), (r'.*', 'NN')])

            current = os.path.dirname(os.path.abspath(__file__))

            # Unigram tag training data load / dump pickle
            pkl_name = current + '/trained/unigram_' + trainer + '.pkl'
            if os.path.isfile(pkl_name):
                with open(pkl_name, 'rb') as pkl:
                    unigram_tag = load(pkl)
            else:
                unigram_tag = UnigramTagger(train, backoff=regex_tag)
                with open(pkl_name, 'wb') as pkl:
                    dump(unigram_tag, pkl, -1)

            # Bigram tag training data load / dump pickle
            if self.options['tagger'] == 'bigram':
                pkl_name = current + '/trained/bigram_' + trainer + '.pkl'
                if os.path.isfile(pkl_name):
                    with open(pkl_name, 'rb') as pkl:
                        bigram_tag = load(pkl)
                else:
                    bigram_tag = BigramTagger(train, backoff=unigram_tag)
                    with open(pkl_name, 'wb') as pkl:
                        dump(bigram_tag, pkl, -1)
                tags = bigram_tag.tag(tokens)  # Bigram tagging performed here
            elif self.options['tagger'] == 'unigram':
                tags = unigram_tag.tag(tokens)  # Unigram tagging performed here
            else:
                tags = regex_tag.tag(tokens)  # Regex tagging performed here

        # Performs default pos_tag
        elif self.options.get('tagger', DEFAULT_TAGGER) == 'pos':
            tags = pos_tag(tokens)

        return self._dump(tags)
Example #2
def verifygrammar(label, codestring, varname):
    regexp_tagger = RegexpTagger([
        (r"^[0-9]+$", "decimal"),
        (r"^0x[0-9A-Fa-f]+$", "hexadecimal"),
    ])
    # VARIABLE LINE GENERATION
    # Assumption: complex-number data types are ignored for data-mining algorithms
    if label.tag == 'var':
        # Terminals in an NLTK CFG must be quoted; unquoted symbols are
        # treated as nonterminals.
        varGrammar = CFG.fromstring("""
            S -> VN "=" VV
            VN -> '""" + varname + """'
            VV -> I | D | ST | B
            B -> 'True' | 'False'
            I -> I N | N
            D -> I '.' F
            F -> F N | N
            ST -> "'" STI "'"
            STI -> STI N | STI C | N | C
            N -> '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7'|'8'|'9'
            C -> 'a'|'b'|'c'|'d'|'e'|'f'|'g'|'h'|'i'|'j'|'k'|'l'|'m'|'n'|'o'|'p'|'q'|'r'|'s'|'t'|'u'|'v'|'w'|'x'|'y'|'z'|'A'|'B'|'C'|'D'|'E'|'F'|'G'|'H'|'I'|'J'|'K'|'L'|'M'|'N'|'O'|'P'|'Q'|'R'|'S'|'T'|'U'|'V'|'W'|'X'|'Y'|'Z'
            """)
    elif label.tag == 'array':
        arrayGrammar = CFG.fromstring("""
            S -> AN "= [" AE "]"
            AN -> '""" + varname + """'
            AE -> VV AE | VV
            VV -> I | D | ST | B
            B -> 'True' | 'False'
            I -> I N | N
            D -> I '.' F
            F -> F N | N
            ST -> "'" STI "'"
            STI -> STI N | STI C | N | C
            N -> '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7'|'8'|'9'
            C -> 'a'|'b'|'c'|'d'|'e'|'f'|'g'|'h'|'i'|'j'|'k'|'l'|'m'|'n'|'o'|'p'|'q'|'r'|'s'|'t'|'u'|'v'|'w'|'x'|'y'|'z'|'A'|'B'|'C'|'D'|'E'|'F'|'G'|'H'|'I'|'J'|'K'|'L'|'M'|'N'|'O'|'P'|'Q'|'R'|'S'|'T'|'U'|'V'|'W'|'X'|'Y'|'Z'
            """)
Example #3
    def _model_definition(self) -> RegexpTagger:
        """Function to define and compile the model.

        Returns:
          Model object.
        """
        t0 = DefaultTagger('NOUN')
        return RegexpTagger(Model.RULES, backoff=t0)
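
Model.RULES is not shown in this snippet; a minimal sketch with a hypothetical two-rule set illustrates the same regex-tagger-with-default-backoff chain:

from nltk import DefaultTagger, RegexpTagger

# hypothetical stand-in for Model.RULES, which this snippet does not show
RULES = [
    (r'^-?[0-9]+$', 'NUM'),
    (r'.*ing$', 'VERB'),
]
tagger = RegexpTagger(RULES, backoff=DefaultTagger('NOUN'))
print(tagger.tag('42 dancing robots'.split()))
# [('42', 'NUM'), ('dancing', 'VERB'), ('robots', 'NOUN')]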
Example #4
def create_tagger(sents, patterns=PATTERNS, maxngram=4):
    '''Train a backoff-tagger chain on a corpus of tagged sentences'''

    train = sents
    def_tagger = DefaultTagger('NN')
    re_tagger = RegexpTagger(patterns, backoff=def_tagger)
    uni_tagger = UnigramTagger(train, backoff=re_tagger)
    bi_tagger = BigramTagger(train, backoff=uni_tagger)
    tri_tagger = TrigramTagger(train, backoff=bi_tagger)
    ngram_tagger = NgramTagger(maxngram, train, backoff=tri_tagger)
    return ngram_tagger
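
A possible invocation; PATTERNS is not defined in the snippet, so a small rule list is passed explicitly here:

# Usage sketch: train on a slice of the Brown news corpus.
from nltk.corpus import brown

tagger = create_tagger(brown.tagged_sents(categories='news')[:500],
                       patterns=[(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD')])
print(tagger.tag('the committee was studying the proposal'.split()))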
Example #5

def regex_tag():
    raw = 'I am applying for AIT because I can be with my parents here and I am already granted a scholarship'
    raw_incorrect = 'I love AIT because AIT is interesting and professors here give a lot of challenging assignment'
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    regexp_tagger = RegexpTagger(patterns)
    tagged = regexp_tagger.tag(word_tokenize(raw))
    tagged_incorrect = regexp_tagger.tag(word_tokenize(raw_incorrect))
    print(tagged)
    print(tagged_incorrect)
    score = regexp_tagger.evaluate(brown_tagged_sents)
    print(score)
Example #6
def generateTagger():
    default_tagger = DefaultTagger('V')
    patterns = [
        (r'.*o$', 'NMS'),  # noun masculine singular
        (r'.*os$', 'NMP'),  # noun masculine plural
        (r'.*a$', 'NFS'),  # noun feminine singular
        (r'.*as$', 'NFP')  # noun feminine plural
    ]
    regexp_tagger = RegexpTagger(patterns, backoff=default_tagger)
    #train nltk.UnigramTagger using tagged sentences from cess_esp
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)

    return combined_tagger
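
A quick usage sketch for the combined tagger above (assumes the cess_esp corpus is installed, e.g. via nltk.download('cess_esp')):

tagger = generateTagger()
print(tagger.tag('las casas blancas'.split()))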
Example #7

    def __init__(self, train=None, default=None, name=None):
        self.name = name
        # As found on page 199 of the nltk book
        regexps = [
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # simple past
            (r'.*es$', 'VBZ'),  # 3rd singular present
            (r'.*ould$', 'MD'),  # modals
            (r'.*\'s$', 'NN$'),  # possessive nouns
            (r'.*s$', 'NNS'),  # plural nouns
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        ]
        self.default = default
        self.regex = RegexpTagger(regexps, backoff=self.default)
        self.unigram = UnigramTagger(train=train, backoff=self.regex)
        self.bigram = BigramTagger(train=train, backoff=self.unigram)
Example #8

def train_and_save_unigram_tagger():
    train_text = brown.tagged_sents()
    regexp_tagger = RegexpTagger(
                [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                 (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                 (r'.*able$', 'JJ'),                # adjectives
                 (r'.*ness$', 'NN'),                # nouns formed from adjectives
                 (r'.*ly$', 'RB'),                  # adverbs
                 (r'.*s$', 'NNS'),                  # plural nouns
                 (r'.*ing$', 'VBG'),                # gerunds
                 (r'.*ed$', 'VBD'),                 # past tense verbs
                 (r'.*', 'NN')                      # nouns (default)
            ])

    unigram_tagger = UnigramTagger(train_text, backoff=regexp_tagger)

    with open('../taggers/unigram_tagger.pkl', 'wb') as output:
        dump(unigram_tagger, output, -1)
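
The pickled tagger can later be restored with pickle.load; a sketch assuming the same relative path as above:

from pickle import load

with open('../taggers/unigram_tagger.pkl', 'rb') as pkl:
    tagger = load(pkl)
print(tagger.tag(['The', 'cat', 'sat', 'on', '3', 'mats']))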
Example #9
def prepare_toolset():
    toolset = {}
    patterns = [(r'^[\.1-9]+$', 'NUM'), (r'^[^a-zA-Z]+$', '.'),
                (r'^[^a-zA-Z]*[a-zA-Z]+[-\'][a-zA-Z]+[^a-zA-Z]*$', 'NOUN'),
                (r'^.*[a-zA-Z]+[^-a-zA-Z]+[a-zA-Z]+.*$', '.')]
    train_set = brown.tagged_sents(
        categories='learned', tagset='universal') + brown.tagged_sents(
            categories='news', tagset='universal') + brown.tagged_sents(
                categories='reviews', tagset='universal')
    # the training data uses the universal tagset, so default to 'NOUN'
    utgr = UnigramTagger(train=train_set, backoff=DefaultTagger('NOUN'))
    btgr = BigramTagger(train=train_set, backoff=utgr)
    ttgr = TrigramTagger(train=train_set, backoff=btgr)
    toolset['tgr'] = RegexpTagger(regexps=patterns, backoff=ttgr)
    toolset['sw'] = stopwords.words('english')
    toolset['lr'] = WordNetLemmatizer()
    toolset['wntg'] = {
        'NOUN': wordnet.NOUN,
        'VERB': wordnet.VERB,
        'ADJ': wordnet.ADJ,
        'ADV': wordnet.ADV,
        'X': wordnet.NOUN
    }
    print('Tools Ready')
    return toolset
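
A usage sketch: tag a sentence with the combined tagger, then lemmatize each token using the 'wntg' POS mapping (tags missing from the mapping fall back to the plain lowercased token):

tools = prepare_toolset()
for word, tag in tools['tgr'].tag('The reviewers were praising the new engines'.split()):
    pos = tools['wntg'].get(tag)
    lemma = tools['lr'].lemmatize(word.lower(), pos) if pos else word.lower()
    print(word, tag, lemma)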
Example #10
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'the$', 'DT'),                  # determiner
    (r'in$', 'IN'),                   # preposition
    (r'.*', 'NN')                     # nouns (default)
]


# Create a *RegexpTagger* object with the defined rules.

# In[3]:


regexp_tagger = RegexpTagger(patterns)


# Apply the *RegexpTagger* to tag a single sentence:

# In[4]:


regexp_tagger.tag("5 friends have been singing in the rain".split())


# Apply the *RegexpTagger* to tag the first 3 sentences of the Brown corpus.

# In[5]:
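
# The cell body is missing from the export; a plausible reconstruction,
# assuming `brown` was imported from `nltk.corpus` earlier in the notebook:

regexp_tagger.tag_sents(brown.sents()[:3])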

Example #11
"""
@author: ibews
"""
import codecs
from nltk import RegexpTagger

with codecs.open("./staedte.txt", "r", encoding="utf8") as f:
    names = [line.strip() for line in f]

# https://de.wikipedia.org/wiki/Liste_der_St%C3%A4dte_in_Deutschland
# grab url
# parse html
# find all links of the form:
#   <dd><a href="/wiki/.*" title=".*">$1</a>.*</dd>
# combine all $1 into a list.

# look through the complete list of cities looking for similarities
# compile similarities into patterns
# apply

patterns = [
    (r'.+(Hall|bach|b[eu]rg|dorf|feld|hausen|heim|leben|r[ao]de|st[ae]dt|tah?l)$', 'city'),
    (r'^Bad .+', 'city'),
    (r'^Burg', 'city'),
    (r'^.*$', 'DEFAULT')
]

ret = RegexpTagger(patterns)
tagged = ret.tag(names)

print(sum(1 for _, tag in tagged if tag == 'city') * 100.0 / len(names))
Example #12
    def __init__(self):
        from nltk import RegexpTagger
        from nltk import RegexpParser
        self.tagger = RegexpTagger(patterns)
        self.chunker = RegexpParser(grammar, trace=COPYRIGHT_TRACE)
Example #13
class CopyrightDetector(object):
    """
    Class to detect copyrights and authorship.
    """
    def __init__(self):
        from nltk import RegexpTagger
        from nltk import RegexpParser
        self.tagger = RegexpTagger(patterns)
        self.chunker = RegexpParser(grammar, trace=COPYRIGHT_TRACE)

    @staticmethod
    def as_str(node):
        """
        Return a parse tree node as a space-normalized string.
        """
        node_string = ' '.join(k for k, _ in node.leaves())
        return u' '.join(node_string.split())

    def detect(self, numbered_lines):
        """
        Return a tuple (copyrights, authors, years, holders, start_line,
        end_line) detected in a sequence of numbered line tuples.
        """
        from nltk.tree import Tree
        numbered_lines = list(numbered_lines)
        numbers = [n for n, _l in numbered_lines]
        start_line = min(numbers)
        end_line = max(numbers)
        # logger.debug('CopyrightDetector:detect:lines numbers: %(start_line)d->%(end_line)d' % locals())
        tokens = self.get_tokens(numbered_lines)

        # we accumulate detected items in these synchronized lists
        # this could be a single list of namedtuples
        # or a list of dicts instead
        copyrights, authors, years, holders = [], [], [], []

        if not tokens:
            return copyrights, authors, years, holders, None, None

        # OPTIMIZED
        copyrights_append = copyrights.append
        authors_append = authors.append
        years_append = years.append
        holders_append = holders.append

        # first, POS tag each token using token regexes
        tagged_text = self.tagger.tag(tokens)
        logger.debug('CopyrightDetector:tagged_text: ' + str(tagged_text))

        # then build a parse tree based on tagged tokens
        tree = self.chunker.parse(tagged_text)
        logger.debug('CopyrightDetector:parse tree: ' + str(tree))

        CopyrightDetector_as_str = CopyrightDetector.as_str

        def collect_year_and_holder(detected_copyright):
            """
            Walk the parse sub-tree starting at the `detected_copyright`
            node, collecting all years and holders.
            """
            for copyr in detected_copyright:
                if isinstance(copyr, Tree):
                    # logger.debug('n: ' + str(copyr))
                    node_text = CopyrightDetector_as_str(copyr)
                    copyr_label = copyr.label()
                    if 'YR-RANGE' in copyr_label:
                        years_append(refine_date(node_text))
                    elif 'NAME' == copyr_label or 'COMPANY' in copyr_label:
                        # FIXME : this would wreck things like 23andme
                        # where a company name contains numbers
                        holders_append(refine_author(node_text))
                        # logger.debug('CopyrightDetector: node_text: ' + node_text)
                    else:
                        collect_year_and_holder(copyr)

        # then walk the parse tree, collecting copyrights, years and authors
        for tree_node in tree:
            if isinstance(tree_node, Tree):
                node_text = CopyrightDetector_as_str(tree_node)
                tree_node_label = tree_node.label()
                if 'COPYRIGHT' in tree_node_label:
                    if node_text and node_text.strip():
                        refined = refine_copyright(node_text)
                        if not is_junk(refined):
                            copyrights_append(refined)
                            collect_year_and_holder(tree_node)
                elif tree_node_label == 'AUTHOR':
                    authors_append(refine_author(node_text))

        return copyrights, authors, years, holders, start_line, end_line

    def get_tokens(self, numbered_lines):
        """
        Return an iterable of tokens from lines of text.
        """
        tokens = []
        tokens_append = tokens.append

        # simple tokenization: spaces and some punctuation
        splitter = re.compile('[\\t =;]+').split

        for _line_number, line in numbered_lines:
            line = line.strip()
            if line:
                line = prepare_text_line(line)
            if line:
                line = strip_markup(line)
            if line and line.strip():
                for tok in splitter(line):
                    # strip trailing quotes and ignore empties
                    tok = tok.strip("' ")
                    if not tok:
                        continue
                    # strip trailing colons: why?
                    tok = tok.rstrip(':').strip()
                    # strip leading @: : why?
                    tok = tok.lstrip('@').strip()
                    if tok and tok not in (':', ):
                        tokens_append(tok)
        logger.debug('CopyrightDetector:tokens: ' + repr(tokens))
        return tokens
Example #15
patterns = [
        (r'.*schaft$', 'NN'),
        (r'.*mus$', 'NN'),
        (r'.*er$', 'NN'),
        (r'.*chen$', 'NN'),
        (r'.*lein$', 'NN'),
        (r'.*lich$', 'ADJ'),
        (r'.*ig$', 'ADJ'),
        (r'.*isch$', 'ADJ'),
        (r'.*haft$', 'ADJ'),
        (r'.*bar$', 'ADJ'),
        (r'.*los$', 'ADJ'),
        (r'.*sam$', 'ADJ'),
        (r'.*', None)
    ]

ret = RegexpTagger(patterns)
tokens = nltk.word_tokenize(text)
tagged = ret.tag(tokens)

diff = [(w, lt, rt)
        for ((w, lt), (_, rt)) in zip(tagged, words)
        if lt != rt
        ]

print(diff)
# [(u'der', 'NN', None), (u'Brahmane', None, 'NN'), (u'stumm', None, 'ADJ'),
# (u'gekreuzten', None, 'ADJ'), (u'der', 'NN', None), (u'Vater.', None, 'NN'),
# (u'der', 'NN', None), (u'der', 'NN', None), (u'er', 'NN', None), (u'einer', 'NN', None),
# (u'Stunde', None, 'NN'), (u'Schlaf', None, 'NN'), (u'Augen', None, 'NN'),
# (u'der', 'NN', None), (u'Brahmane', None, 'NN'), (u'Schritte', None, 'NN'),
# (u'her', 'NN', None), (u'Haus', None, 'NN'), (u'Mond', None, 'NN'), (u'der', 'NN', None),
Example #16
if taggerTrained:
    with open('data/bestTagger.pkl', 'rb') as infile:
        tagger = load(infile)
else:
    train_set = make_train_set([brown, treebank])
    backoffTagger = backoff_tagger(
        train_set, [UnigramTagger, BigramTagger, TrigramTagger],
        backoff=DefaultTagger('NN'))
    brillTagger = brill_tagger_wrapper.train_brill_tagger(
        backoffTagger, train_set)
    with open('data/bestTagger.pkl', 'wb') as outfile:
        dump(brillTagger, outfile, -1)
    tagger = brillTagger
print(tagger.evaluate(test_set))
print(tagger.evaluate(brown.tagged_sents()))
print(tagger.evaluate(treebank.tagged_sents()))
# cpos = ClassifierBasedPOSTagger(train=train_set)
# print(cpos.evaluate(test_set))
##################
#  Regex tagger  #
##################
patterns = [(r'[0-9]+:[0-9]+|[0-9]+-[0-9]+', 'T'), (r'.*road', 'L')]
regexTagger = RegexpTagger(patterns)

# try out the regex tagger
x = nltk.word_tokenize("its at 9:00 till 5-00 on bigroad")
print(regexTagger.tag(x))
Example #17
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default)
    ]


additions = [
        (r'[.?;!:]', '.'),
        ('\\($', '('),
        (r'.*ly$', 'ADV'),
        ('n[o\']t$', '*'),
        (r'^,$', ',')
    ]

ret = RegexpTagger(patterns)
print(ret.evaluate(brown.tagged_sents(categories='news')))

for pattern in additions:
    patterns.insert(-1, pattern)
    print("added pattern {}".format(pattern))
    ret = RegexpTagger(patterns)
    print(ret.evaluate(brown.tagged_sents(categories='news')))
    
# 0.203263917895
# added pattern ('[.?;!:]', '.')
# 0.247538635957
# added pattern ('\\($', '(')
# 0.24901048193
# added pattern ('.*ly$', 'ADV')
# 0.248314338564
Example #18
from nltk import NgramTagger
from nltk import RegexpTagger
from nltk import AffixTagger, UnigramTagger
from nltk.corpus import brown

# In[5]:

# Prepare training
brown_news_tagged = brown.tagged_sents(categories='news', tagset='universal')
train = brown_news_tagged[100:]
test = brown_news_tagged[:100]

regexp_tagger = RegexpTagger([
    (r'^-?[0-9]+(\.[0-9]+)?$', 'NUM'),  # cardinal numbers
    (r'(The|the|A|a|An|an)$', 'DET'),  # articles
    (r'.*able$', 'ADJ'),  # adjectives
    (r'.*ness$', 'NOUN'),  # nouns formed from adjectives
    (r'.*ly$', 'ADV'),  # adverbs
    (r'.*s$', 'NOUN'),  # plural nouns
    (r'.*ing$', 'VERB'),  # gerunds
    (r'.*ed$', 'VERB'),  # past tense verbs
    (r'.*', 'NOUN')  # nouns (default)
])
#Affix tagger
at2 = AffixTagger(train, backoff=regexp_tagger)
#Unigram tagger
ut3 = UnigramTagger(train, backoff=at2)
ut3.evaluate(test)
# Ngram tagger
ct3 = NgramTagger(3, train, backoff=ut3)

google3.EnsureDir("tagged/")
for i in range(0, 12):
Example #19
defaultTChat90 = nltk.DefaultTagger(freqChat90)
#print(defaultTChat90.evaluate(chatT90))

#b)
#using regex from nltk.org/book/chp05.html, 4.2
patterns = [
    (r'.*ing$', 'VBG'),  #gerunds
    (r'.*ed$', 'VBD'),  #simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  #modal
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  #plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  #nouns (default)
]
regexp_tagger = RegexpTagger(patterns)
uniB = UnigramTagger(brownT90, backoff=defaultTB90)
biB = BigramTagger(brownT90, backoff=uniB)
triB = TrigramTagger(brownT90, backoff=biB)

uniC = UnigramTagger(chatT50, backoff=defaultTChat50)
biC = BigramTagger(chatT50, backoff=uniC)
triC = TrigramTagger(chatT50, backoff=biC)

print("Regextag50/50: ", regexp_tagger.evaluate(brownT50))
print("Default: ", defaultTB90.evaluate(brownT50))

print("Bigram Brown 50/50: ",
      BigramTagger(brownT50, backoff=defaultTB50).evaluate(brownT50))
print("Default: ", defaultTB50.evaluate(brownT50))
Example #20
from pythainlp.tokenize import word_tokenize
from nltk import RegexpTagger


def open_dict(name):
    with open(name + ".dict", "r") as f:
        return [i.strip() for i in f.readlines()]


dict_word = {
    "NUM": open_dict("NUM"),
    "PART": open_dict("part"),
    "DET": open_dict("det"),
    "PROPN": open_dict("PROPN"),
    "ADJ": open_dict("ADJ"),
    "NOUN": open_dict("NOUN"),
    "NOTKNOW": [".*"]
}

regexp_tagger = RegexpTagger([('(' + '|'.join(dict_word[a]) + ')$', a)
                              for a in dict_word])
while True:
    text = input("input : ")
    if text == "exit":
        break
    print(regexp_tagger.tag(word_tokenize(text)))
    print("\n")
"""
https://stackoverflow.com/questions/14802442/how-to-use-a-regex-backoff-tagger-in-python-nltk-to-override-nns
"""

#print('Regexp accuracy %4.1f%%' % (100.0 * regexp_tagger.evaluate(brown_test)))
Example #22
    def createModel(self):

        model_name = None
        try:
            unigrams = self.buildUnigrams()

            N = len(self.corpusSents)
            toTraining = round(self.training_portion * N)

            #logging.info("Sentencias totales:" + str(N))

            training = self.corpusSents[:toTraining]
            test = self.corpusSents[toTraining:]

            post_patterns = []

            for regex, post in self.regex_list:
                try:
                    regex = regex.decode('utf-8')
                except:
                    pass

                post_patterns.append((regex, post))

            # config values are already text in Python 3
            for regex, post in self.config.items('postaggers.regex'):
                post_patterns.append((regex, post))

            regexpTagger = RegexpTagger(post_patterns)
            unigramTagger = UnigramTagger(unigrams + training,
                                          backoff=regexpTagger)
            bigramTagger = BigramTagger(training, backoff=unigramTagger)
            trigramTagger = TrigramTagger(training, backoff=bigramTagger)
            NTagger = NgramTagger(self.max_ngrams,
                                  training,
                                  backoff=trigramTagger)

            print("Sentencias de entrenamiento para n-taggers:" +
                  str(len(training)))
            print("Sentencias de entrenamiento para unitaggers:" +
                  str(len(unigrams)))
            print(
                "Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:"
                + str(len(unigrams)))
            print("Sentencias para testing:" + str(len(test)))
            print("Expresiones regulares para el Tagger:")

            for post_regex in post_patterns:
                print post_regex

            if self.training_portion != 1:

                score_ut = unigramTagger.evaluate(test)
                score_bt = bigramTagger.evaluate(test) - 0.002
                score_tt = trigramTagger.evaluate(test)
                score_nt = NTagger.evaluate(test)

                scores = [score_ut, score_bt, score_tt, score_nt]
                tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
                taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]

                bestTagger_index = scores.index(max(scores))
                best_msg = max(scores), tagger_names[bestTagger_index]

            fname = self.taggers_path + tagger_names[bestTagger_index]
            if os.path.isfile(fname + self.tagger_extension_file):
                fname = fname + str(len(listdir(
                    self.taggers_path))) + self.tagger_extension_file
            else:
                fname = self.taggers_path + tagger_names[
                    bestTagger_index] + self.tagger_extension_file

            model = taggers[bestTagger_index]

            f = open(fname, 'wb')
            pickle.dump(model, f)
            f.close()

            print("Guardando el tagger :" + fname)
            #logging.info("Guardando el mejor tagger :" + fname)

            model_name = fname

        except Exception as e:
            print("ERROR IN POS TAGGER GENERATOR:", str(e))
            pdb.set_trace()
Example #23

dt = DefaultTagger('NN')

dt.evaluate(test_data)

patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  # nouns (default) ...
]

rt = RegexpTagger(patterns)

rt.evaluate(test_data)

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

ut.evaluate(test_data)


def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff
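
A usage sketch of combined_tagger, chaining unigram, bigram, and trigram taggers over the regex tagger defined above (train_data, test_data, and rt come from earlier in this example):

ct = combined_tagger(train_data,
                     [UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)
print(ct.evaluate(test_data))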