Exemple #1
0
def load_docs(args):
    """Build the dataset wrapper selected by ``args.strategy``.

    Two strategies are recognised:

    * ``'holdout'`` - documents are loaded from ``args.train_dir`` and
      ``args.dev_dir`` and exposed as a single (train, dev) split.
    * ``'cross_validation'`` - documents are loaded from
      ``args.traindev_dir`` and split into ``args.folds`` folds via
      ``get_fold``.

    Both wrappers expose ``get_splits()``, the ``splits_number`` property
    and ``transformed_by(transformer)``.

    Returns:
        A ``(dataset, unlabeled)`` tuple. ``unlabeled`` is ``None`` when
        ``args.unlabeled`` is ``None``; otherwise it is a ``FuncIterable``
        wrapping a call to ``read_conllu_file(args.unlabeled)``.

    Raises:
        Exception: if ``args.strategy`` is neither ``'holdout'`` nor
            ``'cross_validation'``.
    """

    def apply_to_all(transformer, docs):
        # Run every document through the transformer, keeping the order.
        return [transformer.transform(item) for item in docs]

    class HoldoutDataset:
        """One fixed train/dev split."""

        def __init__(self, train_docs, dev_docs):
            self.__train_docs = train_docs
            self.__dev_docs = dev_docs

        @property
        def splits_number(self):
            return 1

        def get_splits(self):
            yield self.__train_docs, self.__dev_docs

        def transformed_by(self, transformer):
            new_train = apply_to_all(transformer, self.__train_docs)
            new_dev = apply_to_all(transformer, self.__dev_docs)
            return HoldoutDataset(new_train, new_dev)

    class CVDataset:
        """``args.folds`` splits produced by ``get_fold`` over one doc list."""

        def __init__(self, docs):
            self.__docs = docs

        @property
        def splits_number(self):
            return args.folds

        def get_splits(self):
            for fold_idx in range(args.folds):
                yield get_fold(self.__docs, args.folds, fold_idx)

        def transformed_by(self, transformer):
            return CVDataset(apply_to_all(transformer, self.__docs))

    # Reject unknown strategies before touching the file system.
    strategy = args.strategy
    if strategy not in ('holdout', 'cross_validation'):
        raise Exception(
            'Only holdout and cross_validation strategies are supported')

    if strategy == 'holdout':
        dataset = HoldoutDataset(load(args.train_dir), load(args.dev_dir))
    else:
        dataset = CVDataset(load(args.traindev_dir))

    if args.unlabeled is None:
        unlabeled = None
    else:
        # The lambda defers reading the file until the iterable is used.
        unlabeled = FuncIterable(lambda: read_conllu_file(args.unlabeled))

    return dataset, unlabeled
def main():
    """Preprocess every input collection and dump the processed documents.

    Command-line arguments are parsed with the parser returned by
    ``__create_argparser()``. The JSON file at ``args.props`` is passed,
    together with ``args.task_name``, to ``preprocessor_for``. For each
    ``(collection_name, input_directory)`` pair yielded by the input-dirs
    factory, documents are loaded, run through ``process_doc`` and dumped
    to ``args.output_directory`` (or to a sub-directory named after the
    collection when the name is not ``None``).
    """
    parser, input_dirs_factory = __create_argparser()
    args = parser.parse_args()

    with open(args.props, "r", encoding="utf-8") as props_file:
        props = json.load(props_file)

    preprocessor = preprocessor_for(args.task_name, props)

    for collection_name, input_directory in input_dirs_factory(args):
        # Named collections get their own sub-directory under the output root.
        if collection_name is None:
            target_dir = args.output_directory
        else:
            target_dir = os.path.join(args.output_directory, collection_name)

        processed_docs = [
            preprocessor.process_doc(doc) for doc in load(input_directory)
        ]
        dump(target_dir, processed_docs)
Exemple #3
0
def main():
    """Evaluate one model per split and print per-split and mean scores.

    Documents are loaded from ``args.docs_path`` and, when
    ``args.transformers_props_path`` is given, passed through the
    transformer built from that JSON file. With the ``"holdout"`` strategy
    a single model (``args.model_path``) is evaluated; otherwise one model
    per entry of ``args.splits_model_paths``. The test part of each split
    comes from ``get_fold``. When the evaluator returns a stats generator,
    one ``<doc.name>_stats.txt`` file per test document is written under
    ``<args.stats_path>/split_<idx>``.
    """
    args = create_argparser().parse_args()
    docs = load(args.docs_path)
    evaluator = evaluator_for(args.task_name)

    props_path = args.transformers_props_path
    if props_path is not None:
        with open(props_path, "r", encoding="utf-8") as props_file, \
                transformer_from_props(json.load(props_file)) as transformer:
            docs = [transformer.transform(doc) for doc in docs]

    if args.strategy == "holdout":
        models = [args.model_path]
        folds_num = 1
    else:
        models = args.splits_model_paths
        folds_num = len(models)

    main_scores = []
    for split_idx, model_path in enumerate(models):
        _, test_docs = get_fold(docs, folds_num, split_idx)

        with classifier_for(args.task_name)(model_path) as clf:
            # The third argument asks the evaluator for per-document stats.
            main_score, scores, stats_generator = evaluator(
                clf, test_docs, args.stats_path is not None)
            main_scores.append(main_score)

            print("Split {}, Main score={:.4f}".format(split_idx, main_score))
            print(
                f"Scores: \n{json.dumps(scores, indent=4, sort_keys=True)}\n")

            if stats_generator is None:
                continue

            split_stats_dir = join(args.stats_path, f"split_{split_idx}")
            makedirs(split_stats_dir, exist_ok=True)

            for doc_idx, doc in enumerate(test_docs):
                doc_stats_file = join(split_stats_dir, doc.name + '_stats.txt')
                with open(doc_stats_file, 'w', encoding='utf-8') as out:
                    out.write(stats_generator(doc_idx))

    mean_score = sum(main_scores) / len(main_scores)
    print("\nMean splits score={:.4f}".format(mean_score))
Exemple #4
0
    def train(self, props: dict, params: dict, working_dir: str):
        """Train on one fold and return the collected evaluation results.

        Args:
            props: Training properties. ``'fold_num'`` and ``'seed'`` are
                required; ``'classifier_path'`` and ``'sampling_strategy'``
                are optional. The whole dict is also passed to
                ``params_to_str`` and ``Trainer``.
            params: Run parameters. ``'data_path'`` and ``'n_folds'`` are
                required; ``'save_models'`` is optional (default ``False``).
            working_dir: Root directory; outputs go to
                ``<working_dir>/<fold_num>_fold/<seed>``.

        Returns:
            ``dict(evaluation_result)``, where ``evaluation_result`` is the
            second value returned by ``get_evaluating_hook``.
        """
        docs = load(params['data_path'])
        serializer = CoNLLSerializer()

        n_folds = params['n_folds']
        fold_num = props['fold_num']
        seed = props['seed']

        fold_dir_name = str(fold_num) + '_fold'
        out_path = os.path.join(working_dir, fold_dir_name, str(seed))

        print(params_to_str(props))

        os.makedirs(out_path, exist_ok=True)

        train_docs, dev_docs = get_fold(docs, n_folds, fold_num)
        print("Fold:", fold_num)

        gold_path = os.path.join(out_path, 'gold.conll')
        with open(gold_path, 'w', encoding="utf-8") as gold_file:
            serializer.serialize_docs(dev_docs, gold_file)

        # Defaults used when no pre-trained classifier is applied.
        known_rels = None
        strategies = ['easy_first']

        classifier_path = props.get('classifier_path')
        if classifier_path is not None:
            sampling_strategy = props.get('sampling_strategy', 'coref')
            uses_known_model = sampling_strategy in (
                'coref_pron',
                'coref_pron_cluster',
                'coref_pron_cluster_strict',
            )
            if uses_known_model:
                with Classifier(classifier_path) as clf:
                    print("Applying known model")
                    known_rels = get_known_rels(clf, dev_docs)
                strategies = ['pron_rank', 'pron_vote_rank']

        save_models = params.get('save_models', False)
        hook, evaluation_result = get_evaluating_hook(
            serializer, dev_docs, out_path, seed, fold_num,
            save_models, strategies, known_rels)

        with Trainer(props) as trainer:
            trainer.train(train_docs, hook)

        return dict(evaluation_result)
Exemple #5
0
def main():
    """Predict relations for a set of documents and write them out.

    Expected command line::

        <model-path> <test-path> <out-path> [<transformers-props-path>]

    The first three arguments are required. The fourth is optional: when
    present, it names a JSON file from which a transformer is built and
    applied to every loaded document before prediction.

    When fewer than three arguments are supplied, the usage line is printed
    and the function returns without doing any work.
    """
    if len(sys.argv) < 4:
        # Square brackets mark the last argument as optional, matching the
        # handling of sys.argv[4] below.
        print(
            "Usage: <model-path> <test-path> <out-path> "
            "[<transformers-props-path>]"
        )
        return

    model_path = sys.argv[1]
    docs = load(sys.argv[2])
    out_path = sys.argv[3]
    transformers_props_path = sys.argv[4] if len(sys.argv) > 4 else None

    if transformers_props_path is not None:
        with open(transformers_props_path, 'r', encoding='utf-8') as f, \
                transformer_from_props(json.load(f)) as transformer:
            docs = [transformer.transform(doc) for doc in docs]

    with RelExtClassifier(model_path) as classifier:
        rels = classifier.predict_docs(docs)

    write_relations(rels, out_path)