Beispiel #1
0
    def set_up(self):
        """Check the base directory, create its work folders and load the function lists.

        Exits through ``sys.exit`` when ``feature_list.txt`` or
        ``tree_list.txt`` is not a regular file directly inside
        ``self.basedir``. Otherwise creates whichever of the ``gold_files``,
        ``models`` and ``tagged_files`` subdirectories are not already
        present, then stores the results of ``feature_list_reader`` for the
        two list files on ``self.tree_funcs`` and ``self.feature_funcs``.
        """
        base = self.basedir

        # Sort every entry of the base directory into files and directories.
        file_names = set()
        dir_names = set()
        for entry in os.listdir(base):
            entry_path = join(base, entry)
            if os.path.isfile(entry_path):
                file_names.add(entry)
            elif os.path.isdir(entry_path):
                dir_names.add(entry)

        # Both list files are mandatory; nothing is created when one is missing.
        required_files = {"feature_list.txt", "tree_list.txt"}
        if not required_files.issubset(file_names):
            sys.exit("You need feature_list.txt and tree_list.txt in the base directory")

        for subdir in ("gold_files", "models", "tagged_files"):
            if subdir not in dir_names:
                os.mkdir(join(base, subdir))

        # Listed names are looked up against this module's global namespace.
        self.tree_funcs = feature_list_reader(join(base, "tree_list.txt"), globals())
        self.feature_funcs = feature_list_reader(join(base, "feature_list.txt"), globals())
    # NOTE(review): this run repeats the body of the `if __name__ == "__main__":`
    # block further down, but the guard line, the ArgumentParser construction
    # and the "input_file" argument are not present here, even though
    # `all_args.input_file` is read below. It looks like a truncated
    # duplicate -- confirm and remove.

    # Positional arguments, followed by the two boolean flags.
    parser.add_argument("output_dir")
    parser.add_argument("file_suffix")
    parser.add_argument("tree_list")
    parser.add_argument("feature_list")
    parser.add_argument("-a",
                        "--answers",
                        help="the input file has the answers",
                        action="store_true")
    parser.add_argument("-m",
                        "--mallet",
                        help="prepare mallet output",
                        action="store_true")

    all_args = parser.parse_args()

    # Load both function lists, handing feature_list_reader the local namespace.
    tree_funcs = feature_list_reader(all_args.tree_list, locals())
    feature_funcs = feature_list_reader(all_args.feature_list, locals())

    # For mallet output, relation_type goes ahead of the listed feature functions.
    if all_args.mallet:
        feature_funcs.insert(0, relation_type)

    data = get_original_data(all_args.input_file)
    # The last argument is True when the input does NOT carry answers.
    f = Featurizer(data, tree_funcs, feature_funcs, not all_args.answers)
    if all_args.mallet:
        f.build_mallet_features()
        f.write_no_tag(all_args.output_dir, all_args.file_suffix)
    else:
        f.build_features()
        # Output is written on this path only when --answers was given.
        if all_args.answers:
            f.build_relation_class_vectors()
            f.write_multiple_vectors(all_args.output_dir, all_args.file_suffix)
            # NOTE(review): `self` and `f_out` are not bound anywhere in this
            # run; these two lines appear to belong to a different method --
            # confirm.
            for row in self.new_features:
                f_out.write("{}\n".format(" ".join(row)))

if __name__ == "__main__":
    # Command-line entry point: build features for one input file.
    parser = argparse.ArgumentParser()
    # Required positionals, consumed from the command line in this order.
    for positional in (
        "input_file",
        "output_dir",
        "file_suffix",
        "tree_list",
        "feature_list",
    ):
        parser.add_argument(positional)
    parser.add_argument(
        "-a",
        "--answers",
        action="store_true",
        help="the input file has the answers",
    )
    parser.add_argument(
        "-m",
        "--mallet",
        action="store_true",
        help="prepare mallet output",
    )
    all_args = parser.parse_args()

    # Both list files are read with the module namespace (locals() at top level).
    tree_funcs = feature_list_reader(all_args.tree_list, locals())
    feature_funcs = feature_list_reader(all_args.feature_list, locals())
    if all_args.mallet:
        # Mallet output gets relation_type ahead of every listed feature function.
        feature_funcs.insert(0, relation_type)

    data = get_original_data(all_args.input_file)
    # The last argument is True when the input does NOT carry answers.
    featurizer = Featurizer(data, tree_funcs, feature_funcs, not all_args.answers)

    if not all_args.mallet:
        featurizer.build_features()
        # Without --answers the features are built but this block writes nothing.
        if all_args.answers:
            featurizer.build_relation_class_vectors()
            featurizer.write_multiple_vectors(all_args.output_dir, all_args.file_suffix)
    else:
        featurizer.build_mallet_features()
        featurizer.write_no_tag(all_args.output_dir, all_args.file_suffix)