def __init__(self, params):
    """Initialize a combined sentence+word tokenizer wrapper.

    Delegates to both wrapper bases in order; the order matters because
    ``self.abbrevs`` (consumed below) is presumably populated by one of
    the base initializers — TODO confirm against the base classes, which
    are outside this view.
    """
    SentenceTokenizerWrapper.__init__(self, params)
    WordTokenizerWrapper.__init__(self, params)
    # Tokenization-only NltkTools, seeded with the abbreviation set so
    # sentence splitting does not break on known abbreviations.
    self.nt = NltkTools(tok=True, abbrev_set=self.abbrevs)
def __init__(self, params):
    """Set up an NltkTools instance configured for stemming only.

    ``params`` is accepted to match the common wrapper-initializer
    interface but is not consulted here.
    """
    stemming_tools = NltkTools(stem=True)
    self.nt = stemming_tools
# NOTE(review): incomplete fragment — the leading ")" closes a print()
# call opened before this view, and the final "if" has no visible body.
# Documented as-is; do not assume anything about the missing edges.
    ) # closes a usage/help print started outside this fragment
print( ' considered to be titles, and will be processed accordingly.' )
print( ' -a: the output is appended to output_file, instead of overwriting it.' )
sys.exit()
# Choose the output sink: a file given via -o (append mode when -a is
# also present, otherwise overwrite), or stdout.
if 'o' in params:
    output_mode = 'a' if 'a' in params else 'w'
    out = FileWriter(params['o'], output_mode).open()
else:
    out = StreamWriter(sys.stdout)
# Tools for POS tagging, stemming and tokenization; -m optionally names
# the POS model file.
nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
# Process every regular file directly inside the input directory (-i).
for infile in filter(os.path.isfile, [ os.path.join(params['i'], infile)
                                       for infile in os.listdir(params['i']) ]):
    doc = FieldedDocument(infile)
    doc.fields = {}
    # Python 2 (iteritems). read_file presumably maps field name ->
    # raw text for that field — TODO confirm against read_file.
    for field, raw_text in read_file(infile, True).iteritems():
        # Drop over-long sentences; report how many bytes were removed.
        filtered = nt.filter_long_sentences(raw_text)
        diff = len(raw_text) - len(filtered)
        if diff > 0:
            sys.stderr.write("{0}: {1} bytes filtered.\n".format(
                infile, diff))
        # Only keep fields with surviving text, tagged.
        if len(filtered) > 0:
            doc.fields[field] = nt.tag_raw(filtered)
    if len(doc.fields) > 0:
        # NOTE(review): body of this "if" lies outside the visible
        # fragment (presumably writes `doc` to `out`).
def __init__(self, params):
    """Configure a hunpos-backed POS tagger.

    ``params`` must contain 'hunpos_model' (path to the hunpos model
    file; a missing key raises KeyError) and may contain
    'hunpos_encoding', which falls back to 'iso-8859-1' when absent.
    """
    model_path = params['hunpos_model']
    tagger_encoding = params.get('hunpos_encoding', 'iso-8859-1')
    self.nt = NltkTools(pos=True, pos_model=model_path)
    self.encoding = tagger_encoding
""" This script reads normal parsed Wikipedia pages in Conll-like format and transforms it to format needed by ndavid """ parser = OptionParser() parser.add_option("-m", "--model", dest="model", help="the hunpos model file. Default is $HUNPOS/english.model", metavar="MODEL_FILE") parser.add_option("-e", "--encoding", dest="encoding", help="the encoding used by the hunpos model file. Default is utf-8", default='utf-8') options, args = parser.parse_args() from langtools.nltk.nltktools import NltkTools nt = NltkTools(tok=True, pos=True, stem=True, pos_model=options.model) pageSep = "%%#PAGE" actPage = None starter = False for line in sys.stdin: l = line.strip().decode("utf-8") if l.startswith(pageSep): if actPage is not None: print actPage = l.split(" ", 1)[1] starter = True print l.encode("utf-8").replace(" ", "\t", 1) print "%%#Field\tTitle" titleTokens = nt.word_tokenize(actPage)