Code example #1
0
 def __init__(self, params):
     """Initialize both tokenizer-wrapper bases, then build the NLTK helper.

     The base classes are initialized explicitly (old-style, not via
     super()) and in a fixed order; ``self.abbrevs`` is presumably set up
     by one of those base initializers -- TODO confirm against the
     wrapper classes, which are defined outside this view.
     """
     SentenceTokenizerWrapper.__init__(self, params)
     WordTokenizerWrapper.__init__(self, params)
     # Tokenizer-only NltkTools instance, seeded with the abbreviation set.
     self.nt = NltkTools(tok=True, abbrev_set=self.abbrevs)
Code example #2
0
 def __init__(self, params):
     """Create an NltkTools helper configured for stemming only.

     ``params`` is accepted for interface parity with the sibling
     wrapper classes but is not used here.
     """
     self.nt = NltkTools(stem=True)
Code example #3
0
File: txt_to_conll.py  Project: zseder/hunmisc
        )
        print(
            '           considered to be titles, and will be processed accordingly.'
        )
        print(
            '       -a: the output is appended to output_file, instead of overwriting it.'
        )
        sys.exit()

    # Choose the output sink: a file (append vs. overwrite depending on
    # the -a flag) when -o was given, stdout otherwise.
    if 'o' in params:
        output_mode = 'a' if 'a' in params else 'w'
        out = FileWriter(params['o'], output_mode).open()
    else:
        out = StreamWriter(sys.stdout)

    # One toolchain for the whole run: tokenization + POS tagging +
    # stemming. The hunpos model path comes from -m and may be None
    # (params.get), in which case NltkTools presumably falls back to a
    # default -- TODO confirm.
    nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
    # Process every regular file directly inside the -i input directory.
    for infile in filter(os.path.isfile, [
            os.path.join(params['i'], infile)
            for infile in os.listdir(params['i'])
    ]):
        doc = FieldedDocument(infile)
        doc.fields = {}
        # Python 2 dict iteration; read_file() appears to yield a
        # {field name: raw text} mapping for the file.
        for field, raw_text in read_file(infile, True).iteritems():
            # Drop overly long sentences and report (on stderr) how much
            # text that removed.
            filtered = nt.filter_long_sentences(raw_text)
            diff = len(raw_text) - len(filtered)
            if diff > 0:
                sys.stderr.write("{0}: {1} bytes filtered.\n".format(
                    infile, diff))
            if len(filtered) > 0:
                doc.fields[field] = nt.tag_raw(filtered)
        # NOTE(review): the body of this `if` is truncated in this view;
        # presumably the non-empty document is written to `out`.
        if len(doc.fields) > 0:
Code example #4
0
 def __init__(self, params):
     """Build a POS-tagging NltkTools helper from the given parameters.

     Reads the mandatory 'hunpos_model' key (path of the hunpos model)
     and the optional 'hunpos_encoding' key, which defaults to
     'iso-8859-1'.
     """
     model_path = params['hunpos_model']
     self.nt = NltkTools(pos=True, pos_model=model_path)
     self.encoding = params.get('hunpos_encoding', 'iso-8859-1')
Code example #5
0
"""
This script reads normal parsed Wikipedia pages in Conll-like format
and transforms it to format needed by ndavid
"""

parser = OptionParser()
parser.add_option("-m", "--model", dest="model",
                  help="the hunpos model file. Default is $HUNPOS/english.model",
                  metavar="MODEL_FILE")
parser.add_option("-e", "--encoding", dest="encoding",
                  help="the encoding used by the hunpos model file. Default is utf-8",
                  default='utf-8')
options, args = parser.parse_args()

from langtools.nltk.nltktools import NltkTools
nt = NltkTools(tok=True, pos=True, stem=True, pos_model=options.model)

pageSep = "%%#PAGE"
actPage = None
starter = False
for line in sys.stdin:
    l = line.strip().decode("utf-8")
    if l.startswith(pageSep):
        if actPage is not None:
            print
        
        actPage = l.split(" ", 1)[1]
        starter = True
        print l.encode("utf-8").replace(" ", "\t", 1)
        print "%%#Field\tTitle"
        titleTokens = nt.word_tokenize(actPage)