Example #1
def _read_one_file(self, formalism: str, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    if self.fraction < 0.9999 and (not self.only_read_fraction_if_train_in_filename
                                   or "train" in file_path):
        with open(file_path, 'r') as amconll_file:
            logger.info("Reading a fraction of %s of the AM dependency trees from amconll dataset at: %s",
                        self.fraction, file_path)
            sents = list(parse_amconll(amconll_file))
            for i, am_sentence in enumerate(sents):
                if i <= len(sents) * self.fraction:
                    yield self.text_to_instance(formalism, i, am_sentence)
    else:
        with open(file_path, 'r') as amconll_file:
            logger.info("Reading AM dependency trees from amconll dataset at: %s", file_path)
            for i, am_sentence in enumerate(parse_amconll(amconll_file)):
                yield self.text_to_instance(formalism, i, am_sentence)
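The fraction logic above sits inside a dataset-reader class. For illustration, here is a minimal standalone sketch of the same idea; the helper name and its default fraction are made up for this sketch, and parse_amconll is assumed to be imported as in the example:

def read_fraction_of_amconll(file_path, fraction=0.5):
    # Illustrative helper, not part of the reader above: yield roughly the
    # first `fraction` of the AM dependency trees in an amconll file.
    with open(file_path, 'r') as amconll_file:
        sents = list(parse_amconll(amconll_file))
    for i, am_sentence in enumerate(sents):
        if i <= len(sents) * fraction:
            yield am_sentence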
Example #2
def get_amsents(filename, use_id_as_key: bool = True) -> dict:
    """
    Read file, get id/sentencestring to AMSentence map

    :param filename: amconll file
    :param use_id_as_key: switch sentence id or sentence string equality
    :return: dictionry with id/sentencestring -> AMSentence
    """
    # todo [enhancement] input validation? file exists?
    graphs = dict()  # id -> AMSentence
    with open(file=filename, mode="r", encoding="utf-8") as fileobj:
        for sent in amconll_tools.parse_amconll(fileobj, validate=False):
            # todo: what if a sentence has no id, but the id should be used as the key?
            # keystr = ''
            if not use_id_as_key:
                # sentence equality, instead of id equality
                # todo [enhancement] improve equality checks
                toks = sent.get_tokens(shadow_art_root=False)
                toks = normalize_toks(tokens=toks)
                keystr = ' '.join(toks)
                # if keystr.startswith("The total of"):
                #   print(keystr)  # about double hyphens
            else:
                keystr = sent.attributes["id"]
            graphs[keystr] = sent
    return graphs
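A possible use of get_amsents, e.g. to find which sentence ids two amconll files share (the file names are placeholders):

gold_sents = get_amsents("gold.amconll")      # id -> AMSentence
system_sents = get_amsents("system.amconll")  # id -> AMSentence
shared_ids = set(gold_sents) & set(system_sents)
print(f"{len(shared_ids)} sentences appear in both files")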
Example #3
        evaluator = StandardEvaluator.from_params(param_evaluator[1])

        if args.batch_size is not None:
            data_iterator = SameFormalismIterator(list(model.tasks.keys()),
                                                  args.batch_size)
            evaluator.predictor.data_iterator = data_iterator

        evaluator.predictor.set_model(model)
        filename = args.archive_file + "/test_" + prefix + ".amconll"
        local_metrics = evaluator.predictor.parse_and_eval(
            evaluator.formalism,
            evaluator.system_input,
            evaluator.gold_file,
            filename=filename)
        metrics.update({prefix + "_" + k: v for k, v in local_metrics.items()})
        #evaluator.predictor.parse_and_save(evaluator.formalism, evaluator.system_input, filename)
        cumulated_parse_time = 0.0
        with open(filename) as f:
            for am_sentence in parse_amconll(f, validate=False):
                cumulated_parse_time += (float(am_sentence.attributes["normalized_nn_time"])
                                         + float(am_sentence.attributes["parsing_time"])
                                         + float(am_sentence.attributes["normalized_prepare_ftd_time"]))
        metrics[prefix + "_time"] = cumulated_parse_time

with open(args.archive_file + "/test_metrics.json", "w") as f:
    f.write(json.dumps(metrics))

print(metrics)

logger.info("Finished parsing.")
Example #4
from typing import Dict
import argparse
import os
import sys

sys.path.append("..")  # Adds higher directory to python modules path.

import graph_dependency_parser.components.dataset_readers.amconll_tools as amconll_tools

optparser = argparse.ArgumentParser(
    add_help=True,
    description="reads an amconll file and produces a conllu file")
optparser.add_argument("file", type=str)
optparser.add_argument("direc", help="where to store conllu file", type=str)

opts = optparser.parse_args()

with open(opts.file) as f1:
    with open(
            os.path.join(opts.direc,
                         os.path.basename(opts.file) + ".conllu"), "w") as of1:
        for sent in amconll_tools.parse_amconll(f1, validate=False):
            for i, e in enumerate(sent):
                of1.write(
                    str(i + 1) + "\t" + "\t".join([
                        e.token, e.lexlabel, e.fragment, e.typ, "_",
                        str(e.head), e.label, "_", "_"
                    ]))
                of1.write("\n")
            of1.write("\n")
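Each output row above is a 10-column CoNLL-U line: index and token, then the lexical label, supertag fragment and type in the lemma/POS slots, followed by the head index and edge label (the remaining columns are "_"). As a quick sketch for inspecting just the token/head/edge-label triples of the first sentence (the input path is a placeholder):

with open("example.amconll") as f:
    first = next(amconll_tools.parse_amconll(f, validate=False))
for i, e in enumerate(first, 1):
    print(i, e.token, e.head, e.label)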
Example #5
embedding_sources: Dict[str, str] = (json.loads(args.embedding_sources_mapping)
                                     if args.embedding_sources_mapping else {})
if args.extend_vocab:
    logger.info("Vocabulary is being extended with test instances.")
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)
    model.vocab.extend_from_instances(Params({}), instances=instances)
    model.extend_embedder_vocab(embedding_sources)

formalism = args.formalism
# the amconll dataset_reader needs to be told the formalism
instances = dataset_reader.read([[formalism, args.input_file]])
model.train(False)
data_iterator = DataIterator.from_params(config.pop('iterator'))

with open(args.input_file) as f:
    conll_sentences = list(amconll_tools.parse_amconll(f))

predictions = dataset_reader.restore_order(forward_on_instances(model, instances, data_iterator))

i2edge_label = [model.vocab.get_token_from_index(i, namespace=formalism + "_head_tags") for i in
                range(model.vocab.get_vocab_size(formalism + "_head_tags"))]

i2supertag = [model.vocab.get_token_from_index(i, namespace=formalism + "_supertag_labels")
              for i in range(model.vocab.get_vocab_size(formalism + "_supertag_labels"))]

lexlabel2i = {model.vocab.get_token_from_index(i, namespace=formalism + "_lex_labels"): i
              for i in range(model.vocab.get_vocab_size(formalism + "_lex_labels"))}

def dump_tags(score, fragment, type):
    if type == "_": #\bot
        x = "NULL"
Example #6
optparser = argparse.ArgumentParser(
    add_help=True,
    description="reads an amconll file and produces the LaTeX and dot files for a "
    "specified AM dependency tree, by default the first one in the file.")
optparser.add_argument("file", type=str)
optparser.add_argument("direc", help="where to store the LaTeX file", type=str)
optparser.add_argument("--id",
                       help="id of sentence to visualize",
                       type=str,
                       default=None)
optparser.add_argument(
    "--i",
    help="1-based index of sentence to visualize, only used if no id given",
    type=int,
    default=1)

opts = optparser.parse_args()

found = False
with open(opts.file) as f:
    for i, sent in enumerate(amconll_tools.parse_amconll(f, validate=False), 1):
        if opts.id == sent.attributes["id"] or (opts.id is None and i == opts.i):
            sent.to_tex_svg(opts.direc)
            found = True
            break

if not found:
    print("Sorry, couldn't find your sentence.")
Example #7
optparser.add_argument("output_dir", help="output path", type=str)

args = optparser.parse_args()

if args.n < 1:
    print("Number of times subsets are taken must be at least 1")
    sys.exit()


files = []
formalisms = []
for f in args.corpora:
    normpath = os.path.normpath(f)
    formalisms.append(os.path.basename(normpath))
    with open(os.path.join(f,"train","train.amconll")) as fil:
        sents : Dict[str,amconll_tools.AMSentence] = { sent.attributes["id"] : sent for sent in amconll_tools.parse_amconll(fil,validate=False) }
        files.append(sents)

intersection_of_ids = set(files[0].keys())
for sents in files:
    intersection_of_ids = intersection_of_ids & set(sents.keys())
    
intersection_of_ids = sorted(intersection_of_ids)
random.seed(13)

for i in range(1,args.n+1):
    random.shuffle(intersection_of_ids)
    for formalism, sents, source_dir in zip(formalisms, files, args.corpora):
        for subset_size in [100, 1000, 10_000]:
            path = os.path.join(args.output_dir, str(subset_size) + "_" + str(i), formalism)
            # Copy everything from the source directory
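The loop is cut off here. As a sketch of how such a subset could be written back out in amconll format (the helper and its arguments are illustrative, not part of the original script; it assumes os is imported, and relies on AMSentence objects serializing to amconll rows via str(), as Example #8 below also uses):

def write_subset(sents, ids, subset_size, out_dir):
    # Write the first `subset_size` sentences (in the shuffled id order)
    # as a train.amconll file in `out_dir`.
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "train.amconll"), "w") as out:
        for sent_id in ids[:subset_size]:
            out.write(str(sents[sent_id]))
            out.write("\n\n")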
Example #8
async def handle_client(reader, writer):
    request = (await reader.read(4048)).decode(
        'utf8')  # read a maximum of 4048 bytes, that's more than enough
    print("Request", request)
    ret_val = {
        "errors": [],
        "times": {
            "amdep": 0.0,
            "svg": 0.0,
            "graph": 0.0,
            "amdep-svg": 0.0
        }
    }
    # times: amdep: parse time, svg: time to visualize the graph,
    # graph: evaluation time from amdep to graph, amdep-svg: visualization of the amdep tree.
    t1 = time.time()
    try:
        json_req = json.loads(request)
        print("-- as json", json_req)
        sentence = json_req["sentence"]
        if len(sentence) > 256:
            raise ValueError("Your input exceeded the maximal input length")

        formalisms = json_req["formats"]
        words = spacy_tokenize(sentence)

        with TemporaryDirectory() as direc:
            ret_val["sentence"] = sentence
            ret_val["parses"] = {f: {} for f in formalisms}

            for formalism in formalisms:
                if formalism not in model.tasks:
                    err = f"Model was not trained on '{formalism}' but on {list(model.tasks.keys())}"
                    print(err)
                    ret_val["errors"].append(err)
                    continue

                if formalism not in requires_art_root:
                    err = f"Server doesn't know how to handle '{formalism}' although the model was trained on it."
                    print(err)
                    ret_val["errors"].append(err)
                    continue

                t = time.time()
                # Create input and save to file:
                sentences = [
                    from_raw_text(sentence.rstrip("\n"), words,
                                  requires_art_root[formalism], dict(),
                                  requires_ne_merging[formalism])
                ]
                temp_path = direc + f"/sentences_{formalism}.amconll"
                output_filename = direc + "/parsed_" + formalism + ".amconll"

                with open(temp_path, "w") as f:
                    for s in sentences:
                        f.write(str(s))
                        f.write("\n\n")

                predictor.parse_and_save(formalism, temp_path, output_filename)

                # Read AM dependency tree
                with open(output_filename) as f:
                    ret_val["parses"][formalism]["amdep"] = f.read()
                ret_val["times"]["amdep"] += time.time() - t

                # ...and as svg:
                t = time.time()
                with open(output_filename) as f:
                    amdep = next(parse_amconll(f))
                    #with open(direc + "/amdep.dot", "w") as g:
                    #    g.write(amdep.to_dot())
                    #os.system(f"dot -Tsvg {direc}/amdep.dot -o {direc}/amdep.svg")
                    #with open(direc + "/amdep.svg") as g:
                    #    ret_val["parses"][formalism]["amdep-svg"] = g.read()
                    ret_val["parses"][formalism][
                        "amdep-svg"] = amdep.displacy_svg()
                ret_val["times"]["amdep-svg"] += time.time() - t

                # Evaluate to graph
                (raw_graph, graph_time), (svg, svg_time) = postprocess(
                    output_filename, direc, formalism)
                ret_val["parses"][formalism]["graph"] = raw_graph
                if svg:
                    ret_val["parses"][formalism]["svg"] = svg

                ret_val["times"]["graph"] += graph_time
                ret_val["times"]["svg"] += svg_time
    except BaseException as ex:
        err = "".join(
            traceback.TracebackException.from_exception(
                ex).format_exception_only())
        ret_val["errors"].append(err)
        print("Ignoring error:")
        print(err)

    writer.write(bytes(json.dumps(ret_val), "utf8"))
    await writer.drain()
    writer.close()
    t2 = time.time()
    print("Handling request took", t2 - t1)
    print("Breakdown:", ret_val["times"])