Example #1
def formatToAnnotation(folia_file: str):
    doc = folia.Document(file=folia_file)
    doc.declare(
        folia.EntitiesLayer,
        "https://raw.githubusercontent.com/BaptisteBlouin/ttl/master/namedentities.ontonotesset.ttl",
    )
    doc.save(folia_file)
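For context, a minimal usage sketch of what the declared entity set can then be used for: adding an entity span annotation to the document. The file name, word span, and entity class below are illustrative assumptions, not part of the original example.

import folia.main as folia

doc = folia.Document(file="example.folia.xml")  # hypothetical input document
doc.declare(
    folia.EntitiesLayer,
    "https://raw.githubusercontent.com/BaptisteBlouin/ttl/master/namedentities.ontonotesset.ttl",
)
sentence = next(iter(doc.sentences()))        # first sentence of the document
words = list(sentence.words())[:2]            # hypothetical two-word span
layer = sentence.append(folia.EntitiesLayer)  # entities are grouped in a span annotation layer
layer.append(folia.Entity, *words, cls="person")  # class assumed to exist in the declared set
doc.save("example.entities.folia.xml")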
Example #2
def process(inputfile, outputfile, metadatadir, oztfile):
    docid = os.path.basename(inputfile).split('.')[0]
    metadatafile = os.path.join(metadatadir, docid + '.json')
    hasmetadata = os.path.exists(metadatafile)
    doc = folia.Document(file=inputfile)
    doc.provenance.append(
        folia.Processor.create("PICCL/nederlab.nf/addmetadata.py"))

    if hasmetadata:
        with open(metadatafile, 'r') as f:
            data = json.load(f)
        for key, value in sorted(data.items()):
            doc.metadata[key] = value
        print("Added metadata from JSON file for " + docid, file=sys.stderr)

    if oztfile:
        addsubmetadata_ozt(doc, oztfile, metadatadir)

    assignids(doc)
    doc.provenance.processors[-1].append(
        folia.Processor.create(name="foliaid",
                               version=TOOLVERSION,
                               src="https://github.com/proycon/foliatools"))

    if outputfile == '-':
        print(doc.xmlstring())
    else:
        doc.save(outputfile)
Example #3
    def validator(self):
        try:
            import folia.main as folia  # soft dependency: not a big deal if it is missing, but then no metadata extraction

            # this loads the whole FoLiA document into memory, which is wasteful and a performance hog
            try:
                doc = folia.Document(file=str(self.file))
            except Exception as e:
                self['validation_error'] = str(e)
                self.valid = False
                return False
            self['version'] = doc.version
            for annotationtype, annotationset in doc.annotations:
                key = folia.annotationtype2str(
                    annotationtype).lower() + "-annotation"
                if annotationset is None: annotationset = "no-set"
                if key in self and annotationset not in [
                        x.strip() for x in self[key].split(',')
                ]:
                    self[key] += "," + annotationset
                else:
                    self[key] = annotationset
        except ImportError as e:
            self['validation_error'] = str(e)
            return False

        return True
Example #4
def compare(fs_i, f_o='data/comparison.csv'):
    """
    fs_i: list of input (FoLiA) files (full path, with extension) (str)
    f_o: output (CSV) file (full path, with extension) (str)
    ...
    """

    # https://pynlpl.readthedocs.io/en/latest/folia.html#reading-folia
    docs_i = list()
    speeches_i = list()    
    for f in fs_i:
        print(f)
        docs_i.append(folia.Document(file=f))

    for d in docs_i:
        # list of lists of utterances
        speeches_i.append([u for u in d[0]])

    with open(f_o, 'w', encoding='utf-8', newline='') as f:
        print(f_o)
        writer = csv.writer(f)
        empty_lines = [
            [],
            [],
        ]
        for u in zip(*speeches_i):
            if len(set(len(uu) for uu in u)) == 1:  # all utterances in u have the same number of tokens
                flag_diff = [True] * len(u[0])  # assume annotation contents differ for every token across the utterances in u

                morphos_list = list()
                for i in range(len(u)):
                    morphos = [fs_i[i]]
                    for w in u[i]:
                        morphos.append('\n'.join(get_annotation(w)))
                    morphos_list.append(morphos)
                for i in range(len(u[0])):
                    if len(set(m[i+1] for m in morphos_list))==1:
                        flag_diff[i] = False
                        for j in range(len(morphos_list)):
                            morphos_list[j][i+1] = str()
                if any(flag_diff):
                    writer.writerow([u[0].id, u[0].begintime, u[0].endtime])
                    # words = [''] + [w.text() for w in u[0]]
                    words = [''] + [w.text(hidden=True) for w in u[0]]
                    writer.writerow(words)
                    writer.writerows(morphos_list)
                    writer.writerows(empty_lines)

            else:
                for i in range(len(u)):
                    words = ['']
                    morphos = [fs_i[i]]
                    for w in u[i]:
                        # words.append(w.text())
                        words.append(w.text(hidden=True))
                        morphos.append('\n'.join(get_annotation(w)))
                    writer.writerows([words,morphos])
                    writer.writerows(empty_lines)
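The get_annotation() helper used by compare() here and in Example #8 is not shown; a hypothetical minimal version (an assumption, not the original helper) could collect a word's lemma and PoS annotations as printable strings:

def get_annotation(w, with_features=False):
    # Hypothetical sketch (assumption, not the original code):
    # return a list of printable annotation strings (lemma, PoS, optional features) for a word.
    lines = []
    for lemma in w.select(folia.LemmaAnnotation):
        lines.append('lemma: ' + lemma.cls)
    for pos in w.select(folia.PosAnnotation):
        lines.append('pos: ' + pos.cls)
        if with_features:
            for feat in pos.select(folia.Feature):
                lines.append('feature: ' + feat.subset + '=' + feat.cls)
    return lines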
Example #5
    def read(self, collected_file: CollectedFile) -> Iterable[Document]:
        try:
            doc = folia.Document(string=collected_file.content,
                                 autodeclare=True,
                                 loadsetdefinitions=False)
            self.tokenize(doc)
            doc_metadata = self.get_metadata_dict(doc.metadata.items())

            yield Document(collected_file,
                           list(self.get_utterances(doc, doc_metadata)),
                           doc_metadata)
        except Exception as e:
            raise Exception(collected_file.relpath + "/" +
                            collected_file.filename) from e
Example #6
def convert(f_i, f_o=None):
    """
    f_i: input (ELAN) files (full path, with extension) (str)
    f_o: output (FoLiA) file (full path, with extension) (str)
    ...
    """
    doc_i = Eaf(f_i)

    if not f_o:
        f_o = '.'.join([f_i.rpartition('.')[0], 'folia.xml'])

    # https://foliapy.readthedocs.io/en/latest/folia.html#editing-folia
    # https://foliapy.readthedocs.io/en/latest/folia.html#adding-structure
    # https://foliapy.readthedocs.io/en/latest/folia.html#structure-annotation-types
    # print(os.path.basename(f_o))
    id_doc_o = os.path.basename(f_o).partition('.')[0]
    print(id_doc_o)
    # doc_o = folia.Document(id=os.path.basename(f_o))
    doc_o = folia.Document(id=id_doc_o)
    # https://github.com/proycon/folia/blob/master/foliatools/conllu2folia.py
    # future:
    # https://foliapy.readthedocs.io/en/latest/folia.html#declarations
    # https://foliapy.readthedocs.io/en/latest/folia.html#provenance-information
    # doc_o.declare(folia.LemmaAnnotation, set=SET_LEMMA)
    # processor_mystem serves as the single processor for all annotations produced by this script
    processor_mystem = doc_o.declare(folia.LemmaAnnotation,
                                     set=SET_LEMMA,
                                     processor=folia.Processor(name="Mystem+"))
    # doc_o.declare(folia.PosAnnotation, set=SET_POS)
    doc_o.declare(folia.PosAnnotation, set=SET_POS, processor=processor_mystem)
    # doc_o.declare(folia.SyntacticUnit, set=SET_SU, annotator="BiRCh group")
    doc_o.declare(folia.Description, processor=processor_mystem)
    doc_o.declare(folia.Comment, processor=processor_mystem)
    doc_o.declare(folia.Utterance, processor=processor_mystem)
    doc_o.declare(folia.Word, processor=processor_mystem)
    doc_o.declare(folia.Hiddenword)

    # folia.Speech cannot be declared as an annotation type
    speech = doc_o.append(folia.Speech)
    for aa in create_conversation(get_aas(doc_i)):
        utterance = speech.append(folia.Utterance,
                                  id=aa[0],
                                  speaker=aa[1],
                                  begintime=aa[2],
                                  endtime=aa[3],
                                  processor=processor_mystem)

        # https://docs.python.org/3/library/string.html#formatspec
        utterance.append(folia.Word,
                         '{}:'.format(aa[1].upper()),
                         processor=processor_mystem)
        # aa[4]: utterance text
        tokens = get_tokens(aa[4])
        len_tokens = len(tokens)
        for i in range(len_tokens):
            t = tokens[i]
            # consider the previous token in morphological analysis
            # pre_t = None
            # if i:
            #     pre_t = tokens[i-1]
            pre_t = [None, None]
            if i > 1:
                pre_t = [tokens[i - 2], tokens[i - 1]]
            elif i == 1:
                pre_t[1] = tokens[i - 1]
            token = utterance.append(folia.Word, t, processor=processor_mystem)
            if i < (len_tokens - 1):
                t = ' '.join([t, tokens[i + 1]])
            # lemma, pos, features = analyze_morphology(t)
            lemma, pos, features = analyze_morphology(pre_t, t)
            if lemma:
                token.append(folia.LemmaAnnotation,
                             cls=lemma,
                             set=SET_LEMMA,
                             processor=processor_mystem
                             #  annotator='Mystem+'
                             )
            if pos:
                an_pos = token.append(folia.PosAnnotation,
                                      cls=pos,
                                      set=SET_POS,
                                      processor=processor_mystem
                                      #   annotator='Mystem+'
                                      )
            if pos and features:
                # features can only be attached when a PoS annotation (an_pos) exists
                # https://foliapy.readthedocs.io/en/latest/folia.html#features
                an_pos.append(folia.Description,
                              value=re.sub(r'=', r',', features),
                              processor=processor_mystem
                              #   annotator='Mystem+'
                              )
                an_pos.append(folia.Comment,
                              value=' '.join(['Mystem+ features:', features]),
                              processor=processor_mystem
                              #   annotator='Mystem+'
                              )

    doc_o.save(f_o)
Example #7
    clamdata = clam.common.data.getclamdata(datafile)

    #You now have access to all data. A few properties at your disposition now are:
    # clamdata.system_id , clamdata.project, clamdata.user, clamdata.status , clamdata.parameters, clamdata.inputformats, clamdata.outputformats , clamdata.input , clamdata.output

    clam.common.status.write(statusfile, "Starting...")

    count = defaultdict(int)
    for i, inputfile in enumerate(clamdata.input):
        #Update our status message to let CLAM know what we're doing
        clam.common.status.write(
            statusfile,
            "Processing " + os.path.basename(str(inputfile)) + "...",
            round((i / float(len(clamdata.input))) * 100))

        doc = folia.Document(file=str(inputfile))
        for pos in doc.select(folia.PosAnnotation):
            count[pos.cls] += 1

    #Write overall statistics output for this file
    f = io.open(outputdir + 'posfreqlist.tsv', 'w', encoding='utf-8')
    f.write(dicttotext(count))
    f.close()

    #A nice status message to indicate we're done
    clam.common.status.write(statusfile, "Done", 100)  # status update

    sys.exit(0)  # non-zero exit codes indicate an error and will be picked up by CLAM as such!
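The dicttotext() helper used above is defined elsewhere in this CLAM service; a purely illustrative sketch of what such a helper might look like (an assumption, not the actual implementation):

def dicttotext(count):
    # Hypothetical sketch (assumption, not CLAM's actual helper):
    # render a frequency dict as tab-separated "class<TAB>count" lines, most frequent first.
    lines = [cls + "\t" + str(freq) for cls, freq in sorted(count.items(), key=lambda x: -x[1])]
    return "\n".join(lines) + "\n"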
Example #8
def compare(fs_i, f_o='data/comparison.csv'):
    """
    fs_i: list of input (FoLiA) files (full path, with extension) (str)
    f_o: output (CSV) file (full path, with extension) (str)
    ...
    """

    # https://pynlpl.readthedocs.io/en/latest/folia.html#reading-folia
    docs_i = list()
    speeches_i = list()
    for f in fs_i:
        print(f)
        docs_i.append(folia.Document(file=f))

    for d in docs_i:
        # list of lists of utterances
        speeches_i.append([u for u in d[0] if isinstance(u, folia.Utterance)])

    ids_utterance = get_ids(*speeches_i)

    with open(f_o, 'w', encoding='utf-8', newline='') as f:
        print(f_o)
        writer = csv.writer(f)
        empty_lines = [
            [],
            [],
        ]
        for i in ids_utterance:
            if i in docs_i[0] and i in docs_i[1]:
                u1 = docs_i[0][i]
                u2 = docs_i[1][i]
                morphos_list = list()
                ids_token = get_ids(u1, u2)
                flag_diff = [True] * len(ids_token)
                for j in range(2):
                    # 1st element of morphos is the file name
                    morphos = [fs_i[j]]
                    for k in ids_token:
                        if k in docs_i[j]:
                            morphos.append('\n'.join(
                                get_annotation(docs_i[j][k], True)).strip())
                        else:
                            morphos.append('')
                    morphos_list.append(morphos)
                for j in range(len(ids_token)):
                    # j+1 because 1st element of morphos is the file name
                    if len(set(m[j + 1] for m in morphos_list)) == 1:
                        flag_diff[j] = False
                        # len(morphos_list) is 2 here
                        for k in range(len(morphos_list)):
                            morphos_list[k][j + 1] = str()
                if any(flag_diff):
                    writer.writerow(
                        [i, docs_i[0][i].begintime, docs_i[0][i].endtime])
                    writer.writerow([''] + ids_token)
                    writer.writerows(morphos_list)
                    writer.writerows(empty_lines)

            else:
                # determine which of the two documents contains this utterance
                j = 0 if i in docs_i[0] else 1
                u = docs_i[j][i]

                # w: either normal word or hidden word
                ids_token_plus = [''] + [w.id for w in u]
                # 1st element of morphos is the file name
                # (index by file position j, not by the utterance id i)
                morphos = [fs_i[j]]
                # w: either normal word or hidden word
                for w in u:
                    morphos.append('\n'.join(get_annotation(w, True)).strip())
                writer.writerows([ids_token_plus, morphos])
                writer.writerows(empty_lines)
Example #9
  <text xml:id="untitleddoc.text">
    <p xml:id="untitleddoc.p.1">
      <s xml:id="untitleddoc.p.1.s.1">
        <t>Hello world.</t>
      </s>
      <s xml:id="untitleddoc.p.1.s.2">
        <t>This is a test.</t>
      </s>
    </p>
    <p xml:id="untitleddoc.p.2">
      <s xml:id="untitleddoc.p.2.s.1">
        <t>The capital of The Netherlands is Amsterdam.</t>
      </s>
      <s xml:id="untitleddoc.p.2.s.2">
        <t>The government, however, always convenes in The Hague.</t>
      </s>
    </p>
  </text>
</FoLiA>
"""

import spacy
import folia.main as folia
from spacy2folia import spacy2folia

nlp = spacy.load("en_core_web_sm")
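# `text` is assumed to hold the complete FoLiA XML document shown (in part) above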
foliadoc = folia.Document(string=text)
foliadoc = spacy2folia.convert_folia(foliadoc, nlp)
print(foliadoc.xmlstring())

Example #10
def read_folia_xml(filename):
    doc = folia.Document(file=filename, encoding='utf-8')
    return doc
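A minimal usage sketch of the function above; the file name is an illustrative assumption and the traversal pattern follows the standard foliapy select() API.

doc = read_folia_xml("example.folia.xml")   # hypothetical file name
for word in doc.select(folia.Word):         # iterate over all words in the document
    print(word.id, word.text())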
Example #11
    def test(self):
        if debug: print("Testing " + filename, file=sys.stderr)
        if erroneous:
            self.assertRaises(Exception, folia.Document, file=filename, autodeclare=False, deepvalidation=deepvalidation)
        else:
            folia.Document(file=filename, deepvalidation=deepvalidation)
Example #12
def main():
    parser = argparse.ArgumentParser(
        description=
        "BabelEnte: Entity extractioN, Translation and Evaluation using BabelFy",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k',
                        '--apikey',
                        '--key',
                        type=str,
                        help="Babelnet API key",
                        action='store',
                        default="",
                        required=False)
    parser.add_argument('-s',
                        '--sourcelang',
                        type=str,
                        help="Source language code",
                        action='store',
                        default="EN",
                        required=False)
    parser.add_argument('-t',
                        '--targetlang',
                        type=str,
                        help="Target language code",
                        action='store',
                        default="",
                        required=False)
    parser.add_argument(
        '-S',
        '--source',
        type=str,
        help="Source sentences (plain text, one per line, utf-8)",
        action='store',
        default="",
        required=False)
    parser.add_argument(
        '-T',
        '--target',
        type=str,
        help="Target sentences (plain text, one per line, utf-8)",
        action='store',
        default="",
        required=False)
    parser.add_argument(
        '-r',
        '--recall',
        help=
        "Compute recall as well using Babel.net (results in many extra queries!)",
        action='store_true',
        required=False)
    parser.add_argument(
        '-o',
        '--outputdir',
        type=str,
        help=
        "Output directory when processing FoLiA documents (set to /dev/null to skip output alltogether)",
        action='store',
        default="./",
        required=False)
    parser.add_argument('-d',
                        '--debug',
                        help="Debug",
                        action='store_true',
                        required=False)
    parser.add_argument('--nodup',
                        help="Filter out duplicate entities in evaluation",
                        action='store_true',
                        required=False)
    parser.add_argument(
        '--evalfile',
        type=str,
        help="(Re)evaluate the supplied json file (output of babelente)",
        action='store',
        default="",
        required=False)
    parser.add_argument(
        '--anntype',
        type=str,
        help=
        "Annotation Type: Allows to restrict the disambiguated entries to only named entities (NAMED_ENTITIES), word senses (CONCEPTS) or both (ALL).",
        action='store',
        required=False)
    parser.add_argument(
        '--annres',
        type=str,
        help=
        "Annotation Resource: Allows to restrict the disambiguated entries to only WordNet (WN), Wikipedia (WIKI) or BabelNet (BN)",
        action='store',
        required=False)
    parser.add_argument('--th',
                        type=float,
                        help="Cutting Threshold (BabelFy)",
                        action='store',
                        required=False)
    parser.add_argument(
        '--match',
        type=str,
        help=
        "select the candidate extraction strategy, i.e., either only exact matching (EXACT_MATCHING) or both exact and partial matching (PARTIAL_MATCHING)",
        action='store',
        required=False)
    parser.add_argument(
        '--mcs',
        type=str,
        help=
        "Use this to enable or disable the most common sense backoff strategy for BabelFy (values: ON, OFF, ON_WITH_STOPWORDS)",
        action='store',
        required=False)
    parser.add_argument(
        '--dens',
        help=
        "Enable the densest subgraph heuristic during the disambiguation pipeline.",
        action='store_true',
        required=False)
    parser.add_argument(
        '--cands',
        type=str,
        help=
        "Use this parameter to obtain as a result of the disambiguation procedure a scored list of candidates (ALL) or only the top ranked one (TOP); if ALL is selected then --mcs and --th parameters will not be taken into account).",
        action='store',
        required=False)
    parser.add_argument(
        '--postag',
        type=str,
        help=
        "Use this parameter to change the tokenization and pos-tagging pipeline for your input text. Values: STANDARD, NOMINALIZE_ADJECTIVES, INPUT_FRAGMENTS_AS_NOUNS, CHAR_BASED_TOKENIZATION_ALL_NOUN",
        action='store',
        required=False)
    parser.add_argument(
        '--extaida',
        help=
        "Extend the candidates sets with the aida_means relations from YAGO.",
        action='store_true',
        required=False)
    parser.add_argument(
        '--overlap',
        type=str,
        help=
        "Resolve overlapping entities, can be set to allow (default), longest, score, globalscore, coherencescore",
        action='store',
        default='allow',
        required=False)
    parser.add_argument(
        '--cache',
        type=str,
        help=
        "Cache file, stores queries to prevent excessive querying of BabelFy (warning: not suitable for parallel usage!)",
        action='store',
        required=False)
    parser.add_argument('--dryrun',
                        help="Do not query",
                        action='store_true',
                        required=False)
    parser.add_argument(
        'inputfiles',
        nargs='*',
        help=
        'FoLiA input documents, use with -s to choose source language. For tramooc style usage: use -S/-T or --evalfile instead of this.'
    )
    #hidden power options:
    parser.add_argument(
        '--foliaset',
        type=str,
        help=argparse.SUPPRESS,
        action='store',
        default=
        "https://raw.githubusercontent.com/proycon/babelente/master/setdefinitions/babelente.babelnet.ttl",
        required=False)
    parser.add_argument(
        '--foliarelationset',
        type=str,
        help=argparse.SUPPRESS,
        action='store',
        default=
        "https://raw.githubusercontent.com/proycon/babelente/master/setdefinitions/babelente.relations.ttl",
        required=False)
    parser.add_argument(
        '--foliametricset',
        type=str,
        help=argparse.SUPPRESS,
        action='store',
        default=
        "https://raw.githubusercontent.com/proycon/babelente/master/setdefinitions/babelente.metrics.ttl",
        required=False)
    args = parser.parse_args()

    if not args.source and not args.target and not args.evalfile and not args.inputfiles:
        print(
            "ERROR: For Tramooc style usage, specify either --source/-S (with or without --target/-T, or --evalfile.",
            file=sys.stderr)
        print(
            "       For entity extraction & linking on FoLiA documents, just specify one or more FoLiA documents",
            file=sys.stderr)
        print("        along with --sourcelang to choose source language.",
              file=sys.stderr)
        print("       See babelente -h for full usage instructions.",
              file=sys.stderr)
        sys.exit(2)
    if args.target and not args.source:
        print(
            "ERROR: Specify --source/-S as well when --target/-T is used . See babelente -h for usage instructions.",
            file=sys.stderr)
        sys.exit(2)
    if (args.target or args.source or args.inputfiles) and not args.apikey:
        print(
            "ERROR: Specify an API key (--apikey). Get one on http://babelnet.org/",
            file=sys.stderr)
        sys.exit(2)
    if args.target and not args.targetlang:
        print("ERROR: Specify a target language (-t).", file=sys.stderr)
        sys.exit(2)

    if args.inputfiles:
        if not args.sourcelang:
            print("ERROR: Specify a source language (-s)", file=sys.stderr)
            sys.exit(2)
        first = True
        textdoc = False
        for filename in args.inputfiles:
            if not os.path.exists(filename):
                print("ERROR: No such file: " + filename)
                sys.exit(2)
            if filename[-3:].lower() == ".xml":
                #FoLiA based, extraction only
                print("Loading FoLiA document " + filename + " ...",
                      file=sys.stderr)
                doc = folia.Document(file=filename)
                if first:
                    print("[")
                    first = False
                processfolia(doc, args, None)

                if args.outputdir != '/dev/null':
                    outputname = os.path.basename(filename)
                    if outputname.endswith('.folia.xml'):
                        outputname = outputname.replace(
                            '.folia.xml', '.babelente.folia.xml')
                    elif outputname.endswith('.xml'):
                        outputname = outputname.replace(
                            '.xml', '.babelente.folia.xml')
                    else:
                        outputname = outputname + '.babelente.folia.xml'
                    doc.save(os.path.join(args.outputdir, outputname))
            else:
                #text-based
                textdoc = True
                print("Loading text document " + filename + " ...",
                      file=sys.stderr)
                args.source = filename
                break

        if not textdoc:
            print("{}")
            print("]")
            return True

    #Tramooc-style extraction, translation and evaluation
    with open(args.source, 'r', encoding='utf-8') as f:
        sourcelines = [stripmultispace(l) for l in f.readlines()]

    if args.target:
        with open(args.target, 'r', encoding='utf-8') as f:
            targetlines = [stripmultispace(l) for l in f.readlines()]

        if len(sourcelines) != len(targetlines):
            print(
                "ERROR: Expected the same number of line in source and target files, but got "
                + str(len(sourcelines)) + " vs " + str(len(targetlines)),
                file=sys.stderr)
            sys.exit(2)

    if args.cache:
        if os.path.exists(args.cache):
            print("Loading cache from " + args.cache, file=sys.stderr)
            with open(args.cache, 'rb') as f:
                cache = pickle.load(f)
        else:
            print("Creating new cache " + args.cache, file=sys.stderr)
            cache = {
                'source': {},
                'target': {},
                'synsets_source': {},
                'synsets_target': {}
            }
    else:
        cache = None

    evaluation = None
    if args.evalfile:
        with open(args.evalfile, 'rb') as f:
            data = json.load(f)
        sourceentities = data['sourceentities']
        targetentities = data['targetentities']

        print("Evaluating...", file=sys.stderr)
        evaluation = evaluate(
            sourceentities, targetentities, sourcelines, targetlines,
            args.recall, args.targetlang, args.apikey, args.nodup,
            None if cache is None else cache['synsets_source'], args.debug)
    else:
        print("Extracting source entities...", file=sys.stderr)
        sourceentities = [
            entity for entity in findentities(
                sourcelines, args.sourcelang, args,
                None if cache is None else cache['source'])
            if entity['isEntity'] and 'babelSynsetID' in entity
        ]  #with sanity check

        if args.target:
            print("Extracting target entities...", file=sys.stderr)
            targetentities = [
                entity for entity in findentities(
                    targetlines, args.targetlang, args,
                    None if cache is None else cache['target'])
                if entity['isEntity'] and 'babelSynsetID' in entity
            ]  #with sanity check

            print("Evaluating...", file=sys.stderr)
            evaluation = evaluate(
                sourceentities, targetentities, sourcelines, targetlines,
                args.recall, args.targetlang, args.apikey, args.nodup,
                None if cache is None else cache['synsets_target'], args.debug)
        else:
            print(
                json.dumps({'entities': sourceentities},
                           indent=4,
                           ensure_ascii=False))  #MAYBE TODO: add coverage?

    if evaluation is not None:
        print(
            json.dumps(
                {
                    'sourceentities': sourceentities,
                    'targetentities': targetentities,
                    'evaluation': evaluation
                },
                indent=4,
                ensure_ascii=False))
        #output summary to stderr (info is all in JSON stdout output as well)
        print("PRECISION(macro)=" + str(round(evaluation['precision'], 3)),
              "RECALL(macro)=" + str(round(evaluation['recall'], 3)),
              file=sys.stderr)
        print("PRECISION(micro)=" +
              str(round(evaluation['microprecision'], 3)),
              "RECALL(micro)=" + str(round(evaluation['microrecall'], 3)),
              file=sys.stderr)
        print("SOURCECOVERAGE=" + str(round(evaluation['sourcecoverage'], 3)),
              "TARGETCOVERAGE=" + str(round(evaluation['targetcoverage'], 3)),
              file=sys.stderr)
        print("SOURCEENTITIES=" + str(len(sourceentities)),
              "TARGETENTITIES=" + str(len(targetentities)))
        print("MATCHES=" + str(evaluation['matches']), file=sys.stderr)
        print("TRANSLATABLEENTITIES=" +
              str(evaluation['translatableentities']),
              file=sys.stderr)

    if cache is not None:
        with open(args.cache, 'wb') as f:
            pickle.dump(cache, f)
Example #13
def process(file, **kwargs):
    selectlang = kwargs.get('language', None)
    doc = folia.Document(file=file,
                         processor=folia.Processor.create("wikiente",
                                                          version=VERSION))
    if not doc.declared(folia.Sentence):
        print(
            "ERROR: Document contains no sentence annotation, but this is required for wikiente",
            file=sys.stderr)
        sys.exit(2)
    for sentence in doc.sentences():
        if kwargs.get('debug') and sentence.id:
            print("Processing sentence ", sentence.id, file=sys.stderr)
        if selectlang:
            foundlang = getlanguage(sentence)
            if foundlang is None:
                print("(no language information, skipping sentence ",
                      repr(sentence),
                      ")",
                      file=sys.stderr)
                continue
            elif foundlang.cls != selectlang:
                print("(skipping, language doesn't match, expected ",
                      selectlang,
                      " found ",
                      foundlang.cls,
                      file=sys.stderr)
                continue
        text = sentence.text(cls=kwargs.get('textclass', 'current'),
                             retaintokenisation=True)
        if kwargs.get('debug'):
            print("Processing: ", text, file=sys.stderr)
        try:
            entities = spotlight.annotate(os.path.join(kwargs.get('server'),
                                                       "annotate"),
                                          text,
                                          confidence=kwargs.get(
                                              'confidence', 0.5))
        except ConnectionError as e:
            print("WARNING: Connection Error", str(e), file=sys.stderr)
            if kwargs.get('ignore'):
                continue
            else:
                sys.exit(2)
        except spotlight.SpotlightException as e:
            print("WARNING: Spot exception", str(e), file=sys.stderr)
            continue
        except HTTPError as e:
            print("ERROR: HTTP exception", str(e), file=sys.stderr)
            if kwargs.get('ignore'):
                continue
            else:
                sys.exit(2)
        for rawentity in entities:
            if kwargs.get('debug'):
                print(rawentity, file=sys.stderr)
            wordspan = None
            try:
                wordspan = sentence.resolveoffsets(
                    rawentity['offset'],
                    rawentity['offset'] + len(rawentity['surfaceForm']),
                    cls=kwargs.get('textclass', 'current'))
            except folia.InconsistentText as e:
                print("WARNING: ", str(e), file=sys.stderr)
            if not wordspan:
                print("WARNING: Unable to resolve entity",
                      rawentity['surfaceForm'],
                      file=sys.stderr)
            else:
                mode = kwargs.get('mode', 1)
                if mode == 1:
                    cls = rawentity['URI']
                    entityset = ENTITYSET_MODE_1
                elif mode == 2:
                    cls = getclass(rawentity['types'].split(','))
                    if cls is None:
                        print(
                            "WARNING: Resolved entity does not specify any known types, skipping: ",
                            rawentity['surfaceForm'],
                            file=sys.stderr)
                        continue
                    entityset = ENTITYSET_MODE_2
                else:
                    raise ValueError("Invalid mode")
                entity = wordspan[0].add(folia.Entity,
                                         *wordspan,
                                         cls=cls,
                                         set=entityset)
                if kwargs.get('metrics'):
                    for key, value in rawentity.items():
                        if key not in ('URI', 'offset', 'surfaceForm'):
                            entity.append(folia.Metric,
                                          set=METRIC_SET,
                                          cls=key,
                                          value=str(value))
                if mode == 2:
                    entity.append(folia.Relation,
                                  cls="dbpedia",
                                  href=rawentity['URI'],
                                  set=RELATIONSET,
                                  format="application/rdf+xml")
    if kwargs.get('output', None) == '-':
        print(doc.xmlstring())
    else:
        doc.save(kwargs.get('output', None))
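The getlanguage() helper called above is not shown here; a hypothetical minimal version (an assumption, not wikiente's actual code) could simply return the sentence's first language annotation:

def getlanguage(sentence):
    # Hypothetical sketch (assumption): return the first LangAnnotation on the sentence, or None.
    for lang in sentence.select(folia.LangAnnotation):
        return lang
    return None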
Example #14
def process(filename, outputdir, metadata, oztmetadata, oztcount, ignore):
    assert os.path.exists(filename)
    doc = folia.Document(file=filename)
    doc.provenance.append(folia.Processor.create("dbnl_ozt_fix.py"))
    found = 0

    if doc.id not in metadata:
        if doc.id + '_01' in metadata:
            doc.id = doc.id + '_01'
            print(
                "WARNING: Document ID did not have _01 suffix and did not match with metadata, added it manually so it matches again",
                file=sys.stderr)
        elif ignore:
            print(
                "WARNING: Document not found in Nederlab metadata! Ignoring this and passing the document as-is!!!",
                file=sys.stderr)
            doc.save(
                os.path.join(
                    outputdir,
                    os.path.basename(doc.filename.replace(".xml.gz", ".xml"))))
            return
        else:
            raise Exception("Document not found in metadata")

    for key, value in metadata[doc.id].items():
        if key not in ('title', 'ingestTime', 'updateTime',
                       'processingMethod') and value:
            doc.metadata[key] = value

    if doc.id in oztcount:
        unmatched = 0
        assigned_ozt_divs = set()
        for div in doc.select(folia.Division, False):
            if div.cls in ("chapter", "act"):
                found += 1
                seq_id = str(found).zfill(4)
                ozt_id = doc.id + "_" + seq_id
                if ozt_id not in oztmetadata:
                    unmatched += 1  #false positive
                    if div.id in assigned_ozt_divs:
                        print(
                            f"ERROR: No metadata was found for {ozt_id}, we expected an independent title but this is not one! We can't skip this element ({div.id}) because it was already assigned to an earlier division and would cause a conflict! This may be indicative of a problem where the metadata is out of sync with the actual files! Please investigate.",
                            file=sys.stderr)
                        sys.exit(6)
                    else:
                        print(
                            f"WARNING: No metadata was found for {ozt_id}, we expected an independent title but this is not one! Skipping...",
                            file=sys.stderr)
                    div.metadata = None  #unassign any metadata
                    continue
                print(f"Found {ozt_id}, reassigning identifiers...",
                      file=sys.stderr)
                div.id = ozt_id + ".text"
                div.metadata = ozt_id + ".metadata"
                doc.submetadata[ozt_id + ".metadata"] = folia.NativeMetaData()
                doc.submetadatatype[ozt_id + ".metadata"] = "native"
                assigned_ozt_divs.add(div.id)
                for key, value in oztmetadata[ozt_id].items():
                    if key not in ('ingestTime', 'updateTime',
                                   'processingMethod') and value:
                        doc.submetadata[ozt_id + ".metadata"][key] = value
                reassignids(div)
            elif div.metadata:
                div.metadata = None

        expected = oztcount[doc.id]
        if found != expected + unmatched:
            raise Exception(
                f"Found {found}  OZT chapters for {doc.id}, expected {expected}  ( + {unmatched} unmatched)"
            )
    else:
        print(f"Document {doc.id} has no independent titles, skipping...",
              file=sys.stderr)

    obsolete_submetadata = [
        key for key, value in doc.submetadata.items() if not value
    ]
    for key in obsolete_submetadata:
        del doc.submetadata[key]
        del doc.submetadatatype[key]

    print("Saving document", file=sys.stderr)
    doc.save(
        os.path.join(outputdir,
                     os.path.basename(doc.filename.replace(".xml.gz",
                                                           ".xml"))))