Example #1
class NltkToolsStemmer(LemmatizerWrapper):
    """
    Wraps the NltkTools stemmer. It currently uses WordnetLemmatizer,
    which is English only.

    @warning This is the original implementation as used in our English
             Wikipedia parser. No effort has been made to clean up the
             code, or to fix the hardwired indexing, etc. The data must
             be already POS tagged, and the POS field must be the last one.
    """
    def __init__(self, params):
        self.nt = NltkTools(stem=True)

    def lemmatize(self, tokens):
        # HACK: assumes the POS tag is the last field of each token and the
        # word form is the first one.
        for sen_i, sen in enumerate(tokens):
            # Stem each (word, POS) pair as it appears in the text.
            stemmed = self.nt.stem(((tok[0], tok[-1]) for tok in sen))
            # "Hard" stemming: lowercase the initial letter of capitalized
            # words (e.g. sentence-initial ones) before stemming.
            hard_stemmed = self.nt.stem(
                (((tok[0][0].lower() + tok[0][1:] if tok[0][0].isupper()
                   and tok[0][1:].islower() else tok[0]), tok[-1])
                 for tok in sen))
            # Append both lemma variants (field 2 of the stemmer output)
            # to the token in place.
            for tok_i, (tok_stemmed, tok_hard_stemmed) in enumerate(
                    zip(stemmed, hard_stemmed)):
                tokens[sen_i][tok_i].append(tok_stemmed[2])
                tokens[sen_i][tok_i].append(tok_hard_stemmed[2])
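
A minimal usage sketch for the wrapper above. The token structure (mutable lists with the word form first and the POS tag last) is what lemmatize assumes; the sample sentence, the tags and the empty params dict are illustrative assumptions, not part of the original code.

# Hypothetical input: one sentence, each token a [form, ..., POS] list.
tokens = [[[u'Dogs', u'NNS'], [u'bark', u'VBP'], [u'.', u'.']]]

stemmer = NltkToolsStemmer({})   # params are ignored by __init__
stemmer.lemmatize(tokens)

# Each token now carries two extra fields: the plain lemma and the "hard"
# lemma computed from the lowercased word form.
for tok in tokens[0]:
    print(tok)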
Example #2
 def _join_condition(self, sentences, current):
     """If this method returns @c True, _join_sentences joins the two current
     and the next sentence."""
     end_with_abbrev = self._end_in_abbrev(sentences[current])
     if end_with_abbrev is not None:
         if end_with_abbrev or not NltkTools.starts_with_upper(sentences[current + 1]):
             return True
         return False
     else:
         return (self._match_patterns(sentences[current]) and
                 NltkTools.starts_with_upper(sentences[current + 1]))
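
Read as a decision rule: when the abbreviation check is conclusive (returns something other than None), the two sentences are joined if the first one ends in an abbreviation or the second does not start with an uppercase letter; otherwise both the pattern check and the uppercase check must hold. A compact restatement of the same logic, assuming the same helper methods, could look like this:

def _join_condition(self, sentences, current):
    """Equivalent formulation of the join decision above."""
    ends_abbrev = self._end_in_abbrev(sentences[current])
    if ends_abbrev is not None:
        return (ends_abbrev or
                not NltkTools.starts_with_upper(sentences[current + 1]))
    return (self._match_patterns(sentences[current]) and
            NltkTools.starts_with_upper(sentences[current + 1]))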
Example #3
 def _join_condition(self, sentences, current):
     """If this method returns @c True, _join_sentences joins the two current
     and the next sentence."""
     end_with_abbrev = self._end_in_abbrev(sentences[current])
     if end_with_abbrev is not None:
         if end_with_abbrev or not NltkTools.starts_with_upper(
                 sentences[current + 1]):
             return True
         return False
     else:
         return (self._match_patterns(sentences[current])
                 and NltkTools.starts_with_upper(sentences[current + 1]))
Example #4
class HunposPosTagger(PosTaggerWrapper):
    """
    Wraps NltkTools, which wraps HunPos as a POS tagger :).
    
    In order for NLTK to find the hunpos executable, the $HUNPOS environment
    variable must point to the directory with the hunpos-tag executable in it.

    The following parameters are used:
    - hunpos_model: the hunpos model file. Default is $HUNPOS/english.model;
    - hunpos_encoding: the encoding used by the hunpos model file. Default is
      iso-8859-1.
    """
    def __init__(self, params):
        self.nt = NltkTools(pos=True, pos_model=params['hunpos_model'])
        self.encoding = params.get('hunpos_encoding', 'iso-8859-1')

    def pos_tag(self, tokens):
        for sen_i, sen in enumerate(tokens):
            # Encode the word forms with the model's encoding before tagging.
            tagged_sen = self.nt.pos_tag(
                [tok[0].encode(self.encoding) for tok in sen])
            for tok_i, tagged_tok in enumerate(tagged_sen):
                try:
                    tok, pos = [x.decode(self.encoding) for x in tagged_tok]
                except ValueError:
                    # No (token, tag) pair came back for this token; skip it.
                    continue
                # Append the POS tag to the token in place.
                tokens[sen_i][tok_i].append(pos)
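
A hedged usage sketch for the tagger wrapper above. The model path, the sample sentence and the token structure (word form as the first field of a mutable list) are assumptions for illustration only:

params = {'hunpos_model': '/path/to/english.model'}   # placeholder path
tagger = HunposPosTagger(params)   # encoding defaults to iso-8859-1

# One sentence; pos_tag appends the POS tag to each token list in place.
tokens = [[[u'The'], [u'dog'], [u'barks'], [u'.']]]
tagger.pos_tag(tokens)

for tok in tokens[0]:
    print(tok)   # [form, POS] after tagging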
Example #5
class HunposPosTagger(PosTaggerWrapper):
    """
    Wraps NltkTools, which wraps HunPos as a POS tagger :).
    
    In order for NLTK to find the hunpos executable, the $HUNPOS environment
    variable must point to the directory with the hunpos-tag executable in it.

    The following parameters are used:
    - hunpos_model: the hunpos model file. Default is $HUNPOS/english.model;
    - hunpos_encoding: the encoding used by the hunpos model file. Default is
      iso-8859-1.
    """
    def __init__(self, params):
        self.nt = NltkTools(pos=True, pos_model=params['hunpos_model'])
        self.encoding = params.get('hunpos_encoding', 'iso-8859-1')

    def pos_tag(self, tokens):
        for sen_i, sen in enumerate(tokens):
            tagged_sen = self.nt.pos_tag([tok[0].encode(self.encoding) for tok in sen])
            for tok_i, tagged_tok in enumerate(tagged_sen):
                try:
                    tok, pos = [x.decode(self.encoding) for x in tagged_tok]
                except ValueError:
                    continue
                tokens[sen_i][tok_i].append(pos)
Example #6
class NltkToolsStemmer(LemmatizerWrapper):
    """
    Wraps the NltkTools stemmer. It currently uses WordnetLemmatizer,
    which is English only.

    @warning This is the original implementation as used in our English
             Wikipedia parser. No effort has been made to clean up the
             code, or to fix the hardwired indexing, etc. The data must
             be already POS tagged, and the POS field must be the last one.
    """
    def __init__(self, params):
        self.nt = NltkTools(stem=True)

    def lemmatize(self, tokens):
        # HACK
        for sen_i, sen in enumerate(tokens):
            stemmed = self.nt.stem(((tok[0], tok[-1]) for tok in sen))
            hard_stemmed = self.nt.stem(
                (((tok[0][0].lower() + tok[0][1:] if tok[0][0].isupper()
                   and tok[0][1:].islower() else tok[0]), tok[-1])
                 for tok in sen))
            for tok_i, (tok_stemmed, tok_hard_stemmed) in enumerate(
                    zip(stemmed, hard_stemmed)):
                tokens[sen_i][tok_i].append(tok_stemmed[2])
                tokens[sen_i][tok_i].append(tok_hard_stemmed[2])
Example #7
class NltkToolsTokenizer(SentenceTokenizerWrapper, WordTokenizerWrapper):
    """
    Wraps the NltkTools sentence and word tokenizer.

    The only parameter used is
    - abbrevs: a file that lists abbreviations and other problematic tokens
               that, because they include punctuation marks, can be mistaken
               for a sentence ending. Optional.
    """
    def __init__(self, params):
        SentenceTokenizerWrapper.__init__(self, params)
        WordTokenizerWrapper.__init__(self, params)
        self.nt = NltkTools(tok=True, abbrev_set=self.abbrevs)

    def sen_tokenize(self, raw):
        """@note Does not use the abbrev_set."""
        sentences = self.nt.sen_tokenize(raw)
        self._join_sentences(sentences)
        return sentences

    def word_tokenize(self, sen):
        tokens = self.nt.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []

        # Punctuation handling: split quotes and wiki remnants off the tokens.
        tokens = list(
            chain.from_iterable(
                [w for w in remove_quot_and_wiki_crap_from_word(token)]
                for token in tokens))
        # Find the last "real" token, skipping trailing quotes and garbage.
        last_token, read_last = self.__get_last_token(tokens)
        punktMatchObject = self.nt.punktSplitter.match(tokens[last_token])
        if punktMatchObject is not None and not self._is_abbrev(
                tokens[last_token]):
            # Replace the last real token with the groups of the punkt
            # splitter match, then re-attach whatever followed it.
            tokens = tokens[:last_token] + list(
                punktMatchObject.groups()) + read_last
        return tokens

    def __get_last_token(self, tokens):
        """Returns the (negative) index of the last token that is not a
        quote or garbage, plus the list of tokens that follow it."""
        last_token = -1
        while len(tokens) > last_token * -1 and is_quote_or_garbage(
                tokens[last_token]):
            last_token -= 1
        read_last = tokens[last_token + 1:] if last_token != -1 else []
        return last_token, read_last
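
A sketch of how the tokenizer wrapper might be driven end to end, assuming the wrapper base classes accept a plain params dict and that the optional abbrevs entry may be omitted (as the docstring suggests); the input string is invented:

tokenizer = NltkToolsTokenizer({})   # no abbreviation list supplied
raw = u'First sentence. Second sentence.'

for sen in tokenizer.sen_tokenize(raw):
    # word_tokenize also splits off quotes, wiki remnants and the
    # sentence-final punctuation.
    print(tokenizer.word_tokenize(sen))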
Example #8
class NltkToolsTokenizer(SentenceTokenizerWrapper, WordTokenizerWrapper):
    """
    Wraps the NltkTools sentence and word tokenizer.

    The only parameter used is
    - abbrevs: a file that lists abbreviations and other problematic tokens
               that, because they include punctuation marks, can be mistaken
               for a sentence ending. Optional.
    """
    def __init__(self, params):
        SentenceTokenizerWrapper.__init__(self, params)
        WordTokenizerWrapper.__init__(self, params)
        self.nt = NltkTools(tok=True, abbrev_set=self.abbrevs)

    def sen_tokenize(self, raw):
        """@note Does not use the abbrev_set."""
        sentences = self.nt.sen_tokenize(raw)
        self._join_sentences(sentences)
        return sentences

    def word_tokenize(self, sen):
        tokens = self.nt.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []

        # Punctuation handling
        tokens = list(
            chain.from_iterable(
                [w for w in remove_quot_and_wiki_crap_from_word(token)]
                for token in tokens))
        last_token, read_last = self.__get_last_token(tokens)
        punktMatchObject = self.nt.punktSplitter.match(tokens[last_token])
        if punktMatchObject is not None and not self._is_abbrev(tokens[last_token]):
            tokens = tokens[:last_token] + list(punktMatchObject.groups()) + read_last
        return tokens

    def __get_last_token(self, tokens):
        last_token = -1
        while len(tokens) > last_token * -1 and is_quote_or_garbage(tokens[last_token]):
            last_token -= 1
        read_last = tokens[last_token + 1:] if last_token != -1 else []
        return last_token, read_last
Example #9
 def __init__(self, params):
     self.nt = NltkTools(stem=True)
Example #10
 def __init__(self, params):
     self.nt = NltkTools(pos=True, pos_model=params['hunpos_model'])
     self.encoding = params.get('hunpos_encoding', 'iso-8859-1')
Example #11
        )
        print(
            '           considered to be titles, and will be processed accordingly.'
        )
        print(
            '       -a: the output is appended to output_file, instead of overwriting it.'
        )
        sys.exit()

    if 'o' in params:
        output_mode = 'a' if 'a' in params else 'w'
        out = FileWriter(params['o'], output_mode).open()
    else:
        out = StreamWriter(sys.stdout)

    nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
    for infile in filter(os.path.isfile, [
            os.path.join(params['i'], infile)
            for infile in os.listdir(params['i'])
    ]):
        doc = FieldedDocument(infile)
        doc.fields = {}
        for field, raw_text in read_file(infile, True).iteritems():
            filtered = nt.filter_long_sentences(raw_text)
            diff = len(raw_text) - len(filtered)
            if diff > 0:
                sys.stderr.write("{0}: {1} bytes filtered.\n".format(
                    infile, diff))
            if len(filtered) > 0:
                doc.fields[field] = nt.tag_raw(filtered)
        if len(doc.fields) > 0:
Example #12
"""
This script reads normal parsed Wikipedia pages in Conll-like format
and transforms it to format needed by ndavid
"""

parser = OptionParser()
parser.add_option("-m", "--model", dest="model",
                  help="the hunpos model file. Default is $HUNPOS/english.model",
                  metavar="MODEL_FILE")
parser.add_option("-e", "--encoding", dest="encoding",
                  help="the encoding used by the hunpos model file. Default is utf-8",
                  default='utf-8')
options, args = parser.parse_args()

from langtools.nltk.nltktools import NltkTools
nt = NltkTools(tok=True, pos=True, stem=True, pos_model=options.model)

pageSep = "%%#PAGE"
actPage = None
starter = False
for line in sys.stdin:
    l = line.strip().decode("utf-8")
    if l.startswith(pageSep):
        if actPage is not None:
            print
        
        actPage = l.split(" ", 1)[1]
        starter = True
        print l.encode("utf-8").replace(" ", "\t", 1)
        print "%%#Field\tTitle"
        titleTokens = nt.word_tokenize(actPage)
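
For reference, a sketch of the page-delimiter convention the loop above relies on; the example line is invented, only the %%#PAGE marker and the split come from the code:

# A page starts with a "%%#PAGE <title>" line; the title is everything
# after the first space on that line.
line = u'%%#PAGE Budapest (city)'
if line.startswith(u'%%#PAGE'):
    title = line.split(u' ', 1)[1]   # -> u'Budapest (city)'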
Example #13
 def __init__(self, params):
     SentenceTokenizerWrapper.__init__(self, params)
     WordTokenizerWrapper.__init__(self, params)
     self.nt = NltkTools(tok=True, abbrev_set=self.abbrevs)
Example #14
            raise ValueError('Input must be a directory of files.')
    except ValueError as err:
        print('Error: {0}'.format(err))
        print(('Usage: {0} -i input_dir [-o output_file] -m [hunpos_model] ' +
            '[-a]').format(sys.argv[0]))
        print('       input_dir: the directory with the input text files.')
        print('       hunpos_model: the hunpos model file.')
        print('       output_file: the conll2 output file. If omitted, the result will')
        print('                    be written to stdout.')
        print('       -a: the output is appended to output_file, instead of overwriting it.')
        sys.exit()
    
    if 'o' in params:
        output_mode = 'a' if 'a' in params else 'w'
        out = FileWriter(params['o'], output_mode).open()
    else:
        out = StreamWriter(sys.stdout)
    
    nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
    for infile in (os.path.join(d, f) for d, _, fs in os.walk(params['i']) for f in fs):
        print "File " + infile
        doc = FieldedDocument(infile)
        doc.fields = {}
        for field, raw_text in read_file(infile).iteritems():
            doc.fields[field] = nt.tag_raw(raw_text)
        write_doc(doc, out)
    
    if 'o' in params:
        out.close()
Example #15
     print('       hunpos_model: the hunpos model file.')
     print('       output_file: the conll2 output file. If omitted, the result will')
     print('                    be written to stdout.')
     print('       -t: If specified, the first non-empty lines of the text files are')
     print('           considered to be titles, and will be processed accordingly.')
     print('       -a: the output is appended to output_file, instead of overwriting it.')
     sys.exit()
 
 if 'o' in params:
     output_mode = 'a' if 'a' in params else 'w'
     out = FileWriter(params['o'], output_mode).open()
 else:
     out = StreamWriter(sys.stdout)
 
 nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
 for infile in filter(os.path.isfile, [os.path.join(params['i'], infile)
                                       for infile in os.listdir(params['i'])]):
     doc = FieldedDocument(infile)
     doc.fields = {}
     for field, raw_text in read_file(infile, True).iteritems():
         filtered = nt.filter_long_sentences(raw_text)
         diff = len(raw_text) - len(filtered)
         if diff > 0:
             sys.stderr.write("{0}: {1} bytes filtered.\n".format(infile, diff))
         if len(filtered) > 0:
             doc.fields[field] = nt.tag_raw(filtered)
     if len(doc.fields) > 0:
         write_doc(doc, out)
 
 if 'o' in params: