Example #1
def main(params: Parameters):
    # create_cas_from_apf(TEST_APF_PATH, TEST_SGM_PATH, OUTPUT_DIR_PATH)
    corpus_paths = params.arbitrary_list("corpus_paths")
    output_xmi_dir_path = params.creatable_directory("cached_xmi_path")
    type_system_path = params.existing_file("type_system_path")
    cas_xmi_template_path = params.existing_file("cas_xmi_template_path")

    # Load Typesystem
    with type_system_path.open('rb') as file:
        typesystem = load_typesystem(file)

    # Load xmi_template
    with cas_xmi_template_path.open('rb') as cas_xmi_file:
        cas_template = load_cas_from_xmi(cas_xmi_file, typesystem=typesystem)

    for ace_corpus_path in corpus_paths:
        print('Processing apf files from: ' + ace_corpus_path)
        start_time = time.perf_counter()
        for filename in os.listdir(ace_corpus_path):
            if filename.endswith(".apf.xml"):
                print("Processing " + filename)
                # os.path.join avoids relying on corpus paths ending in a separator
                create_cas_from_apf(apf_filename=filename,
                                    apf_path=os.path.join(ace_corpus_path, filename),
                                    source_sgm_path=os.path.join(
                                        ace_corpus_path,
                                        filename.replace(".apf.xml", ".sgm")),
                                    output_dir_path=output_xmi_dir_path,
                                    typesystem=typesystem,
                                    cas_template=cas_template)
        elapsed_time = time.perf_counter() - start_time
        print(f"Processing Completed. Time elapsed: {elapsed_time:0.4f} seconds")
Example #2
    def from_parameters(params: Parameters) -> KeyValueSource[str, str]:
        """
        Construct a zip file key-value source from parameters.

        The "path" parameter should be the zip file to be opened.

        Currently we assume the zipfile contains a file with the IDs, which it will if it
        was created by the default CharSink.zip_character_sink(). Support for custom key
        functions will be added in the future.
        """
        return KeyValueSource.zip_character_source(params.existing_file("path"))
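A minimal usage sketch, assuming Parameters.from_mapping exists and KeyValueSource lives in vistautils.key_value (both assumptions based on the library these snippets appear to come from); the archive path is a hypothetical placeholder:

from vistautils.key_value import KeyValueSource
from vistautils.parameters import Parameters

# "path" must name a real zip file, since existing_file validates existence;
# /data/keys.zip stands in for an archive written by CharSink.zip_character_sink().
params = Parameters.from_mapping({"path": "/data/keys.zip"})
source = KeyValueSource.zip_character_source(params.existing_file("path"))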
Example #3
def main(params: Parameters):
    input_file_path = params.existing_file("input_file")
    output_file_path = params.creatable_file("output_file")
    logging.info("Reading from input file: %s",
                 str(input_file_path.absolute()))
    with input_file_path.open() as input_file:
        nums = [int(x.strip()) for x in input_file if x.strip() != ""]

    nums.sort()

    output_file_path.write_text("\n".join(immutableset([str(x)
                                                        for x in nums])))
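Note that immutableset, assumed here to be the immutablecollections type of that name, preserves insertion order and drops duplicates, so the file receives each sorted number exactly once:

from immutablecollections import immutableset

# Duplicates are dropped; first-occurrence (here: sorted) order is kept.
assert list(immutableset(["1", "2", "2", "3"])) == ["1", "2", "3"]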
Example #4
def main(params: Parameters):
    graph_def_file = params.existing_file("graph_def_file")
    checkpoint_glob = params.string("checkpoint_glob")
    vocab_file = params.existing_file("vocab_file")
    sentences_file = params.existing_file("sentences_file")
    output_file = params.creatable_file("output_file")
    do_profiling = params.optional_boolean_with_default("profile", False)

    # Use the do_profiling value read above instead of re-reading the parameter
    with tensorflow.contrib.tfprof.ProfileContext(
            os.getcwd(),
            trace_steps=range(2, 10),
            dump_steps=range(1, 10, 2),
            enabled=do_profiling):
        lm = LM1B.load(graph_def_file=graph_def_file,
                       checkpoint_file=checkpoint_glob,
                       vocab=vocab_file)

        start_time = None
        num_tokens_processed = 0

        with open(sentences_file, 'r', newline='') as inp:
            csv_input = csv.reader(inp, delimiter='\t')
            with open(output_file, 'w', newline='') as out:
                csv_output = csv.writer(out, delimiter='\t')
                for line in csv_input:
                    tokens = line[0].split(' ')
                    output_row = list(line)
                    output_row.insert(0, lm.log_probability_of_sentence(tokens))
                    csv_output.writerow(output_row)
                    # we delay till after the first sentence to avoid counting startup time
                    if num_tokens_processed == 0:
                        start_time = time.time()
                    num_tokens_processed += len(tokens)

    elapsed_time = time.time() - start_time
    print(f"Processed {num_tokens_processed - 1} sentences in {elapsed_time} "
          f"seconds, {num_tokens_processed / elapsed_time} tokens per second. First sentence not "
          f"included in time calculation.")
Example #5
def main(params: Parameters):
    input_file_path = params.existing_file("input_file")
    output_file_path = params.creatable_file("output_file")
    x = params.integer("x")
    logging.info("Reading from input file: %s",
                 str(input_file_path.absolute()))
    with input_file_path.open() as input_file:
        with output_file_path.open("w") as output_file:
            for num in input_file:
                output_file.write(f"{int(num)*x}\n")

    logging.info("Writing to output file: %s", str(input_file_path.absolute()))

    # Pause so that we can examine the job on the SAGA cluster
    time.sleep(30)
Example #6
def main(params: Parameters):
    # List of the six ACE corpus /adj/ folders (one for each type: bc, bn, cts, nw, un, wl)
    corpus_paths = params.arbitrary_list("corpus_paths")
    # Path to the project config file template (json file)
    json_template_path = params.existing_file("json_template_path")
    # Path to the cached_annotation_ser directory
    annotation_ser_path = params.existing_directory("annotation_ser_path")

    # Path to the cached_xmi directory
    cached_xmi_path = params.existing_directory("cached_xmi_path")

    # Path to target corpus (narrowed ACE-Corpus)
    cached_ace_data_path = params.creatable_directory("cached_ace_data_path")

    # List of users (strings)
    user_list = params.arbitrary_list("user_list")
    # List of event types (format: "EVENT_TYPE.SUBTYPE" strings)
    event_list = params.arbitrary_list("event_list")

    # Output Directory Path where configured projects are moved to (use an empty directory)
    output_dir_path = params.creatable_directory("output_dir_path")

    flatten_ace_data(corpus_paths, cached_ace_data_path)

    complete_map = get_complete_project_to_doc_mapping(cached_ace_data_path)

    for user in user_list:
        for event_type in event_list:
            # For All events to be printed
            if event_type == "All":
                for event in complete_map:
                    configure_and_generate_project(
                        json_template_path=json_template_path,
                        event_name=event,
                        user_name=user,
                        event_doc_map=complete_map,
                        cached_ser_path=annotation_ser_path,
                        cached_xmi_path=cached_xmi_path,
                        output_dir_path=output_dir_path)
            else:
                configure_and_generate_project(
                    json_template_path=json_template_path,
                    event_name=event_type,
                    user_name=user,
                    event_doc_map=complete_map,
                    cached_ser_path=annotation_ser_path,
                    cached_xmi_path=cached_xmi_path,
                    output_dir_path=output_dir_path)
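Because this entry point reads eight parameters, a sketch of a matching parameter mapping may help. Every path and value below is a hypothetical placeholder, and existing_file/existing_directory will fail unless the paths actually exist:

from vistautils.parameters import Parameters

params = Parameters.from_mapping({
    "corpus_paths": ["/ace/bc/adj/", "/ace/bn/adj/"],  # hypothetical paths throughout
    "json_template_path": "/configs/template.json",
    "annotation_ser_path": "/cache/annotation_ser/",
    "cached_xmi_path": "/cache/xmi/",
    "cached_ace_data_path": "/cache/ace_data/",
    "user_list": ["alice", "bob"],
    "event_list": ["Conflict.Attack", "All"],
    "output_dir_path": "/output/projects/",
})
main(params)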
Example #7
def main(params: Parameters):
    conda_script_generator = CondaJobScriptGenerator.from_parameters(params)
    entry_point = params.string("entry_point")
    work_dir = params.optional_creatable_directory(
        "working_directory") or Path(os.getcwd())
    stdout_file = params.string("log_file") or work_dir / "___stdout.log"
    shell_script = conda_script_generator.generate_shell_script(
        entry_point_name=entry_point,
        param_file=params.existing_file("job_param_file"),
        working_directory=work_dir,
        stdout_file=stdout_file,
    )

    params.creatable_file("conda_script_path").write_text(  # type: ignore
        shell_script, encoding="utf-8")

    if params.boolean("echo_template", default=False):
        print(shell_script)
Example #8
def configure_logging_from(params: Parameters, *, log_params=False) -> None:
    """
    Configures logging from parameters.

    This will examine the 'logging' namespace of the provided parameters. If that namespace
    has a 'config_file' parameter, logging will be configured based on the parameter file it
    points to.  Otherwise, if 'logging.root_level' is specified, the logging level of the root
    logger will be set to its value.  For reference, the standard values are CRITICAL, FATAL,
    ERROR, WARNING, INFO, and DEBUG.
    """
    if "logging.config_file" in params:
        logging.config.fileConfig(
            str(params.existing_file("logging.config_file")))
    else:
        _config_logging_from_params(params)

    if log_params:
        log.info(str(params))
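A minimal sketch of calling this with an in-memory mapping (assuming Parameters.from_mapping handles nested namespaces, as the dotted "logging.config_file" lookup suggests):

from vistautils.parameters import Parameters

# No "config_file" is given, so the root logger level is set from root_level.
configure_logging_from(Parameters.from_mapping({"logging": {"root_level": "INFO"}}))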
Example #9
def _doc_id_binary_source_from_params(params: Parameters) -> KeyValueSource[str, bytes]:
    return KeyValueSource.binary_from_doc_id_to_file_map(params.existing_file("path"))