def predict_with_parser(cls, options):
    """Load a trained parser and run prediction over the test input.

    Reads test data in one of several input formats, loads the model via
    ``cls.load``, then delegates to ``cls.predict_and_output`` to write the
    results.

    :param cls: parser class providing ``DataType``, ``load`` and
        ``predict_and_output``
    :param options: namespace with ``input_format``, ``conll_test``,
        ``model`` and ``out_file`` attributes
    :raises ValueError: if ``options.input_format`` is not recognized
    """
    if options.input_format == "standard":
        data_test = cls.DataType.from_file(options.conll_test, False)
    elif options.input_format == "space":
        # One sentence per line, tokens separated by single spaces.
        # POS tags are unknown here, so every token gets placeholder "X".
        with smart_open(options.conll_test) as f:
            data_test = [cls.DataType.from_words_and_postags(
                             [(word, "X") for word in line.strip().split(" ")])
                         for line in f]
    elif options.input_format == "english":
        # Raw English text: sentence-split and word-tokenize with NLTK.
        from nltk import download, sent_tokenize
        from nltk.tokenize import TreebankWordTokenizer
        download("punkt")  # make sure the sentence-splitter model is present
        with smart_open(options.conll_test) as f:
            raw_sents = sent_tokenize(f.read().strip())
        tokenized_sents = TreebankWordTokenizer().tokenize_sents(raw_sents)
        data_test = [cls.DataType.from_words_and_postags(
                         [(token, "X") for token in sent])
                     for sent in tokenized_sents]
    elif options.input_format == "tokenlist":
        import ast
        with smart_open(options.conll_test) as f:
            # SECURITY: this previously used eval(), which executes arbitrary
            # code from the input file. literal_eval parses the same Python
            # literal syntax (lists/tuples/strings/...) without executing code.
            items = ast.literal_eval(f.read())
        data_test = cls.DataType.from_words_and_postags(items)
    else:
        raise ValueError("invalid format option")

    logger.info('Initializing...')
    parser = cls.load(options.model, options)
    ts = time.time()
    cls.predict_and_output(parser, options, data_test, options.out_file)
    te = time.time()
    logger.info('Finished predicting and writing test. %.2f seconds.', te - ts)
def from_file(cls, file_name, use_edge=None, limit=float("inf"), processes=8):
    """Read trees from *file_name*, one tree per non-comment line.

    Lines beginning with "#" are treated as ``key: value`` metadata and
    attached to the next tree line; blank lines are skipped. Actual parsing
    of each line is delegated to ``cls.line_mapper`` in a worker pool.

    :param file_name: path readable by ``smart_open``
    :param use_edge: unused here; presumably kept for interface
        compatibility with other loaders — TODO confirm
    :param limit: stop after this many tree lines have been collected
    :param processes: worker-process count for the parsing pool
        (was hard-coded to 8; default preserves old behavior)
    :return: list of parsed trees in original file order
    """
    result = []
    with smart_open(file_name) as f:
        extra_info = {}
        count = 0
        for line in f:
            if count >= limit:
                break
            line_s = line.strip()
            if not line_s:
                continue
            if line_s.startswith("#"):
                # "# key: value" comments carry metadata for the next tree
                # line; bare comments without a colon value are ignored.
                key, _, value = line_s[1:].partition(":")
                if value:
                    extra_info[key.strip()] = value.strip()
                continue
            result.append((line_s, extra_info))
            extra_info = {}  # metadata applies to one tree only
            count += 1
    with Pool(processes=processes) as pool:
        # imap_unordered yields (index, tree) pairs in arbitrary order;
        # sorting afterwards restores the original file order.
        trees = list(
            pool.imap_unordered(cls.line_mapper,
                                list(enumerate(result)), chunksize=400))
    trees = [tree for idx, tree in sorted(trees)]
    return trees
def predict_with_parser(cls, options):
    """Load a trained parser, predict over the test input, and write results.

    Selects a data-format class from ``cls.get_data_formats()``, reads test
    data in one of several input formats, loads the model, streams
    predictions to ``options.out_file``, and optionally runs external
    evaluation.

    :param cls: parser class providing ``get_data_formats``, ``load`` and
        the returned parser's ``predict``
    :param options: namespace with ``data_format``, ``input_format``,
        ``conll_test``, ``model``, ``out_file`` and ``evaluate`` attributes
    :raises ValueError: if ``options.input_format`` is not recognized
    """
    DataFormatClass = cls.get_data_formats()[options.data_format]
    if options.input_format == "standard":
        data_test = DataFormatClass.from_file(options.conll_test, False)
    elif options.input_format == "space":
        # One sentence per line, space-separated tokens, placeholder "X" tags.
        with smart_open(options.conll_test) as f:
            data_test = [DataFormatClass.from_words_and_postags(
                             [(word, "X") for word in line.strip().split(" ")])
                         for line in f]
    elif options.input_format.startswith("english"):
        # Raw English text; "english-line" means one sentence per line,
        # plain "english" runs NLTK sentence splitting on each line.
        from nltk import download, sent_tokenize
        from nltk.tokenize import TreebankWordTokenizer
        download("punkt")  # make sure the sentence-splitter model is present
        with smart_open(options.conll_test) as f:
            raw_sents = []
            for line in f:
                if options.input_format == "english-line":
                    raw_sents.append(line.strip())
                else:
                    this_line_sents = sent_tokenize(line.strip())
                    raw_sents.extend(this_line_sents)
        tokenized_sents = TreebankWordTokenizer().tokenize_sents(raw_sents)
        data_test = [DataFormatClass.from_words_and_postags(
                         [(token, "X") for token in sent])
                     for sent in tokenized_sents]
    elif options.input_format == "tokenlist":
        import ast
        with smart_open(options.conll_test) as f:
            # SECURITY: this previously used eval(), which executes arbitrary
            # code from the input file. literal_eval parses the same Python
            # literal syntax (lists/tuples/strings/...) without executing code.
            items = ast.literal_eval(f.read())
        data_test = DataFormatClass.from_words_and_postags(items)
    else:
        raise ValueError("invalid format option")

    logger.info('Loading Model...')
    options.is_train = False  # ensure the model loads in inference mode
    parser = cls.load(options.model, options)
    logger.info('Model loaded')
    ts = time.time()
    with smart_open(options.out_file, "w") as f_output:
        # Some formats declare a fixed header line (e.g. column names).
        if hasattr(DataFormatClass, "file_header"):
            f_output.write(DataFormatClass.file_header + "\n")
        for i in parser.predict(data_test):
            f_output.write(i.to_string())
    te = time.time()
    logger.info('Finished predicting and writing test. %.2f seconds.', te - ts)
    if options.evaluate:
        DataFormatClass.evaluate_with_external_program(options.conll_test,
                                                       options.out_file)