Example #1
0
    def predict_with_parser(cls, options):
        """Load a trained parser and run prediction over the test input.

        The test file (``options.conll_test``) is read according to
        ``options.input_format``:

        * ``"standard"`` — native format parsed by ``cls.DataType.from_file``.
        * ``"space"``    — one sentence per line, tokens separated by spaces;
                           every token gets the dummy POS tag ``"X"``.
        * ``"english"``  — raw English text; sentence-split and tokenized
                           with NLTK (downloads the "punkt" model on demand).
        * ``"tokenlist"``— a Python literal of (word, postag) items.

        Results are written via ``cls.predict_and_output`` to
        ``options.out_file``.

        :param options: namespace with at least ``input_format``,
            ``conll_test``, ``model`` and ``out_file`` attributes.
        :raises ValueError: if ``options.input_format`` is unrecognized.
        """
        if options.input_format == "standard":
            data_test = cls.DataType.from_file(options.conll_test, False)
        elif options.input_format == "space":
            with smart_open(options.conll_test) as f:
                data_test = [cls.DataType.from_words_and_postags([(word, "X") for word in line.strip().split(" ")])
                             for line in f]
        elif options.input_format == "english":
            from nltk import download, sent_tokenize
            from nltk.tokenize import TreebankWordTokenizer
            # Ensure the sentence tokenizer model is available locally.
            download("punkt")
            with smart_open(options.conll_test) as f:
                raw_sents = sent_tokenize(f.read().strip())
                tokenized_sents = TreebankWordTokenizer().tokenize_sents(raw_sents)
                data_test = [cls.DataType.from_words_and_postags([(token, "X") for token in sent])
                             for sent in tokenized_sents]
        elif options.input_format == "tokenlist":
            import ast
            with smart_open(options.conll_test) as f:
                # ast.literal_eval instead of eval: only Python literals are
                # accepted, so code embedded in the input file cannot execute.
                items = ast.literal_eval(f.read())
            data_test = cls.DataType.from_words_and_postags(items)
        else:
            raise ValueError("invalid format option")

        logger.info('Initializing...')
        parser = cls.load(options.model, options)

        ts = time.time()
        cls.predict_and_output(parser, options, data_test, options.out_file)
        te = time.time()
        logger.info('Finished predicting and writing test. %.2f seconds.', te - ts)
Example #2
0
 def from_file(cls, file_name, use_edge=None, limit=float("inf")):
     """Read up to ``limit`` records from ``file_name`` and parse them in parallel.

     The file is scanned line by line.  Lines starting with ``#`` are
     ``key: value`` metadata attached (as ``extra_info``) to the next
     non-comment line; blank lines are skipped.  Each remaining line is
     paired with its accumulated metadata and handed to
     ``cls.line_mapper`` in a worker pool, then the parsed trees are
     returned in original file order.

     :param file_name: path accepted by ``smart_open``.
     :param use_edge: unused here; kept for interface compatibility.
     :param limit: maximum number of data lines to read (default: no limit).
     :return: list of parsed trees in file order.
     """
     result = []
     with smart_open(file_name) as f:
         extra_info = {}
         count = 0
         for line in f:
             if count >= limit:
                 break
             line_s = line.strip()
             if not line_s:
                 continue
             if line_s.startswith("#"):
                 # Metadata comment: "#key: value" annotates the NEXT record.
                 key, _, value = line_s[1:].partition(":")
                 if value:
                     extra_info[key.strip()] = value.strip()
                 continue
             result.append((line_s, extra_info))
             extra_info = {}
             count += 1
     with Pool(processes=8) as pool:
         # imap_unordered consumes any iterable; no need to materialize
         # enumerate() into a list first.  The index is carried along so
         # the unordered results can be re-sorted afterwards.
         trees = list(
             pool.imap_unordered(cls.line_mapper,
                                 enumerate(result),
                                 chunksize=400))
     trees = [tree for idx, tree in sorted(trees)]
     return trees
Example #3
0
    def predict_with_parser(cls, options):
        """Load a trained model, predict over the test input, and write output.

        The test file (``options.conll_test``) is read according to
        ``options.input_format``:

        * ``"standard"``     — native format via ``DataFormatClass.from_file``.
        * ``"space"``        — one sentence per line, space-separated tokens,
                               each tagged with the dummy POS ``"X"``.
        * ``"english..."``   — raw English text tokenized with NLTK;
                               ``"english-line"`` treats each line as one
                               sentence, otherwise lines are sentence-split.
        * ``"tokenlist"``    — a Python literal of (word, postag) items.

        Predictions are written to ``options.out_file`` (with an optional
        format header), and an external evaluator is invoked when
        ``options.evaluate`` is set.

        :param options: namespace with at least ``data_format``,
            ``input_format``, ``conll_test``, ``model``, ``out_file`` and
            ``evaluate`` attributes.  ``options.is_train`` is forced to False.
        :raises ValueError: if ``options.input_format`` is unrecognized.
        """
        DataFormatClass = cls.get_data_formats()[options.data_format]
        if options.input_format == "standard":
            data_test = DataFormatClass.from_file(options.conll_test, False)
        elif options.input_format == "space":
            with smart_open(options.conll_test) as f:
                data_test = [DataFormatClass.from_words_and_postags([(word, "X") for word in line.strip().split(" ")])
                             for line in f]
        elif options.input_format.startswith("english"):
            from nltk import download, sent_tokenize
            from nltk.tokenize import TreebankWordTokenizer
            # Ensure the sentence tokenizer model is available locally.
            download("punkt")
            with smart_open(options.conll_test) as f:
                raw_sents = []
                for line in f:
                    if options.input_format == "english-line":
                        # One pre-split sentence per input line.
                        raw_sents.append(line.strip())
                    else:
                        this_line_sents = sent_tokenize(line.strip())
                        raw_sents.extend(this_line_sents)
                tokenized_sents = TreebankWordTokenizer().tokenize_sents(raw_sents)
                data_test = [DataFormatClass.from_words_and_postags([(token, "X") for token in sent])
                             for sent in tokenized_sents]
        elif options.input_format == "tokenlist":
            import ast
            with smart_open(options.conll_test) as f:
                # ast.literal_eval instead of eval: only Python literals are
                # accepted, so code embedded in the input file cannot execute.
                items = ast.literal_eval(f.read())
            data_test = DataFormatClass.from_words_and_postags(items)
        else:
            raise ValueError("invalid format option")

        logger.info('Loading Model...')
        options.is_train = False
        parser = cls.load(options.model, options)
        logger.info('Model loaded')

        ts = time.time()
        with smart_open(options.out_file, "w") as f_output:
            # Some formats declare a fixed header row (e.g. column names).
            if hasattr(DataFormatClass, "file_header"):
                f_output.write(DataFormatClass.file_header + "\n")
            for i in parser.predict(data_test):
                f_output.write(i.to_string())
        te = time.time()
        logger.info('Finished predicting and writing test. %.2f seconds.', te - ts)

        if options.evaluate:
            DataFormatClass.evaluate_with_external_program(options.conll_test,
                                                           options.out_file)