Code example #1
def install_original(corpus: Corpus = Corpus(),
                     xmlfile: ExportInput = ExportInput("[xml_export.filename_compressed]"),
                     out: OutputCommonData = OutputCommonData("xml_export.install_export_pretty_marker"),
                     export_path: str = Config("xml_export.export_original_path"),
                     host: str = Config("xml_export.export_original_host")):
    """Copy compressed combined unscrambled XML to remote host."""
    xml_utils.install_compressed_xml(corpus, xmlfile, out, export_path, host)
Code example #2
def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
           source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
           header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
           remove_namespaces: bool = Config("export.remove_module_namespaces", False),
           sparv_namespace: str = Config("export.sparv_namespace"),
           source_namespace: str = Config("export.source_namespace"),
           include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to pretty XML in export_dir.

    Args:
        doc: Name of the original document.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        token: Annotation containing the token segments.
        word: Annotation containing the token strings.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include
            in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words and document ID
    word_annotation = list(word.read())
    docid_annotation = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 token_name=token_name,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, split_overlaps=True)
    xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name, word_annotation,
                                       docid_annotation, include_empty_attributes, sparv_namespace)

    # Write XML to file
    with open(out, mode="w") as outfile:
        outfile.write(xmlstr)
    log.info("Exported: %s", out)
Code example #3
def encode_scrambled(
        corpus: Corpus = Corpus(),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations",
                                                           is_input=False),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        docs: AllDocuments = AllDocuments(),
        words: AnnotationAllDocs = AnnotationAllDocs("[export.word]"),
        vrtfiles: ExportInput = ExportInput("vrt_scrambled/{doc}.vrt",
                                            all_docs=True),
        out: Export = Export("[cwb.corpus_registry]/[metadata.id]",
                             absolute_path=True),
        out_marker: Export = Export(
            "[cwb.cwb_datadir]/[metadata.id]/.scrambled_marker",
            absolute_path=True),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
        bin_path: Config = Config("cwb.bin_path"),
        encoding: str = Config("cwb.encoding"),
        datadir: str = Config("cwb.cwb_datadir"),
        registry: str = Config("cwb.corpus_registry"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        skip_compression: Optional[bool] = Config("cwb.skip_compression"),
        skip_validation: Optional[bool] = Config("cwb.skip_validation")):
    """Do cwb encoding with vrt files in scrambled order."""
    cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles,
               out, out_marker, token.name, bin_path, encoding, datadir,
               registry, remove_namespaces, sparv_namespace, source_namespace,
               skip_compression, skip_validation)
Code example #4
def vrt_scrambled(
        doc: Document = Document(),
        out: Export = Export("vrt_scrambled/{doc}.vrt"),
        chunk: Annotation = Annotation("[cwb.scramble_on]"),
        chunk_order: Annotation = Annotation(
            "[cwb.scramble_on]:misc.number_random"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt in scrambled order."""
    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token.name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    if chunk not in annotation_list:
        raise util.SparvErrorMessage(
            "The annotation used for scrambling ({}) needs to be included in the output."
            .format(chunk))
    span_positions, annotation_dict = util.gather_annotations(
        annotation_list, export_names, doc=doc, split_overlaps=True)

    # Read words and document ID
    word_annotation = list(word.read())
    chunk_order_data = list(chunk_order.read())

    # Reorder chunks and open/close tags in correct order
    new_span_positions = util.scramble_spans(span_positions, chunk.name,
                                             chunk_order_data)

    # Make vrt format
    vrt_data = create_vrt(new_span_positions, token.name, word_annotation,
                          token_attributes, annotation_dict, export_names)

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
Code example #5
def parse(doc: Document = Document(),
          source_dir: Source = Source(),
          elements: list = Config("xml_import.elements"),
          skip: list = Config("xml_import.skip"),
          header_elements: list = Config("xml_import.header_elements"),
          header_data: list = Config("xml_import.header_data"),
          prefix: str = Config("xml_import.prefix"),
          encoding: str = Config("xml_import.encoding"),
          keep_control_chars: bool = Config("xml_import.keep_control_chars"),
          normalize: str = Config("xml_import.normalize")):
    """Parse XML source file and create annotation files.

    Args:
        doc: Source document name.
        source_dir: Directory containing source documents.
        elements: List of elements and attributes in source document. Only needed for renaming, as everything is
            parsed whether listed or not.
        skip: Elements and attributes to skip. Use elementname:@contents to skip contents as well.
        header_elements: Elements containing header metadata. Contents will not be included in corpus text.
        header_data: List of header elements and attributes from which to extract metadata.
        prefix: Optional prefix to add to annotations.
        encoding: Encoding of source document. Defaults to UTF-8.
        keep_control_chars: Set to True to keep control characters in the text.
        normalize: Normalize input using any of the following forms: 'NFC', 'NFKC', 'NFD', and 'NFKD'.
            Defaults to 'NFC'.
    """
    parser = SparvXMLParser(elements, skip, header_elements, header_data,
                            encoding, source_dir, prefix, keep_control_chars,
                            normalize)
    parser.parse(doc)
    parser.save()
Code example #6
def install_json(jsonfile: ExportInput = ExportInput("[metadata.id].json"),
                 out: OutputCommonData = OutputCommonData(
                     "sbx_metadata.install_json_export_marker"),
                 export_path: str = Config("sbx_metadata.json_export_path"),
                 host: str = Config("sbx_metadata.json_export_host")):
    """Copy JSON metadata to remote host."""
    if not host:
        raise util.SparvErrorMessage(
            "'sbx_metadata.json_export_host' not set! JSON export not installed."
        )
    filename = Path(jsonfile).name
    remote_file_path = os.path.join(export_path, filename)
    util.install_file(host, jsonfile, remote_file_path)
    out.write("")
Code example #7
def install_metashare(
        xmlfile: ExportInput = ExportInput("sbx_metadata/[metadata.id].xml"),
        out: OutputCommonData = OutputCommonData(
            "sbx_metadata.install_metashare_marker"),
        export_path: str = Config("sbx_metadata.metashare_path"),
        host: str = Config("sbx_metadata.metashare_host")):
    """Copy META-SHARE file to remote host."""
    if not host:
        raise util.SparvErrorMessage(
            "'sbx_metadata.metashare_host' not set! META-SHARE export not installed."
        )
    filename = Path(xmlfile).name
    remote_file_path = os.path.join(export_path, filename)
    util.install_file(host, xmlfile, remote_file_path)
    out.write("")
Code example #8
File: info.py Project: heatherleaf/sparv-pipeline
def info(out: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.info",
                              absolute_path=True),
         sentences: AnnotationCommonData = AnnotationCommonData(
             "cwb.sentencecount"),
         firstdate: AnnotationCommonData = AnnotationCommonData(
             "cwb.datefirst"),
         lastdate: AnnotationCommonData = AnnotationCommonData("cwb.datelast"),
         resolution: AnnotationCommonData = AnnotationCommonData(
             "dateformat.resolution"),
         protected: bool = Config("korp.protected")):
    """Save information to the file specified by 'out'."""
    content = []
    protected_str = str(protected).lower()

    for key, value_obj in [("Sentences", sentences), ("FirstDate", firstdate),
                           ("LastDate", lastdate),
                           ("DateResolution", resolution),
                           ("Updated", time.strftime("%Y-%m-%d")),
                           ("Protected", protected_str)]:
        if isinstance(value_obj, AnnotationCommonData):
            value = value_obj.read()
        else:
            value = value_obj

        content.append("%s: %s\n" % (key, value))

    # Write .info file
    with open(out, "w") as o:
        o.writelines(content)

    log.info("Exported: %s", out)
Code example #9
def install_relations(
        sqlfile: ExportInput = ExportInput("korp_wordpicture/relations.sql"),
        out: OutputCommonData = OutputCommonData(
            "korp.install_relations_marker"),
        db_name: str = Config("korp.mysql_dbname"),
        host: str = Config("korp.remote_host")):
    """Install Korp's Word Picture SQL on remote host.

    Args:
        sqlfile (str, optional): SQL file to be installed. Defaults to ExportInput("korp_wordpicture/relations.sql").
        out (str, optional): Marker file to be written.
        db_name (str, optional): Name of the database. Defaults to Config("korp.mysql_dbname").
        host (str, optional): Remote host to install to. Defaults to Config("korp.remote_host").
    """
    util.install_mysql(host, db_name, sqlfile)
    out.write("")
Code example #10
def paragraph(
        text: Text = Text(),
        out: Output = Output("segment.paragraph",
                             cls="paragraph",
                             description="Paragraph segments"),
        chunk: Optional[Annotation] = Annotation("[segment.paragraph_chunk]"),
        segmenter: str = Config("segment.paragraph_segmenter"),
        existing_segments: Optional[str] = Config("segment.existing_paragraphs"),
        model: Optional[Model] = None):
    """Split text into paragraphs."""
    do_segmentation(text=text,
                    out=out,
                    chunk=chunk,
                    segmenter=segmenter,
                    existing_segments=existing_segments,
                    model=model)
Code example #11
def sentence(
        text: Text = Text(),
        out: Output = Output("segment.sentence",
                             cls="sentence",
                             description="Sentence segments"),
        chunk: Optional[Annotation] = Annotation("[segment.sentence_chunk]"),
        segmenter: str = Config("segment.sentence_segmenter"),
        existing_segments: Optional[str] = Config(
            "segment.existing_sentences"),
        model: Optional[Model] = Model("[segment.sentence_model]")):
    """Split text into sentences."""
    do_segmentation(text=text,
                    out=out,
                    chunk=chunk,
                    segmenter=segmenter,
                    existing_segments=existing_segments,
                    model=model)
Code example #12
def install_lemgrams(sqlfile: ExportInput = ExportInput(
    "korp_lemgram_index/lemgram_index.sql"),
                     marker: OutputCommonData = OutputCommonData(
                         "korp.install_lemgram_marker"),
                     db_name: str = Config("korp.mysql_dbname"),
                     host: str = Config("korp.remote_host")):
    """Install lemgram SQL on remote host.

    Args:
        sqlfile (str, optional): SQL file to be installed.
            Defaults to ExportInput("korp_lemgram_index/lemgram_index.sql").
        marker (str, optional): Marker file to be written.
            Defaults to OutputCommonData("korp.install_lemgram_marker").
        db_name (str, optional): Name of the database. Defaults to Config("korp.mysql_dbname").
        host (str, optional): Remote host to install to. Defaults to Config("korp.remote_host").
    """
    util.install_mysql(host, db_name, sqlfile)
    marker.write("")
Code example #13
def tokenize(
        text: Text = Text(),
        out: Output = Output("segment.token",
                             cls="token",
                             description="Token segments"),
        chunk: Annotation = Annotation("[segment.token_chunk]"),
        segmenter: str = Config("segment.token_segmenter"),
        existing_segments: Optional[str] = Config("segment.existing_tokens"),
        model: Optional[Model] = Model("[segment.tokenizer_config]"),
        token_list: Optional[Model] = Model("[segment.token_list]")):
    """Tokenize text."""
    do_segmentation(text=text,
                    out=out,
                    chunk=chunk,
                    segmenter=segmenter,
                    existing_segments=existing_segments,
                    model=model,
                    token_list=token_list)
Code example #14
def vrt(doc: Document = Document(),
        out: Export = Export("vrt/{doc}.vrt"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt.

    - annotations: list of elements:attributes (annotations) to include.
    - source_annotations: list of elements:attributes from the original document
      to be kept. If not specified, everything will be kept.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token.name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list,
                                                              export_names,
                                                              doc=doc)
    vrt_data = create_vrt(span_positions, token.name, word_annotation,
                          token_attributes, annotation_dict, export_names)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
Code example #15
def cwb_align(corpus,
              other,
              link,
              aligndir="annotations/align",
              bin_path="",
              encoding: str = Config("cwb.encoding", "utf8")):
    """Align 'corpus' with 'other' corpus, using the 'link' annotation for alignment."""
    os.makedirs(aligndir, exist_ok=True)
    alignfile = os.path.join(aligndir, corpus + ".align")
    log.info("Aligning %s <-> %s", corpus, other)

    try:
        [(link_name, [(link_attr, _path)])] = parse_structural_attributes(link)
    except ValueError:
        raise ValueError("You have to specify exactly one alignment link.")
    link_attr = link_name + "_" + link_attr

    # Align linked chunks
    args = ["-v", "-o", alignfile, "-V", link_attr, corpus, other, link_name]
    result, _ = util.system.call_binary(os.path.join(bin_path, "cwb-align"),
                                        args,
                                        encoding=encoding)
    with open(alignfile + ".result", "w") as F:
        print(result, file=F)
    _, lastline = result.rsplit("Alignment complete.", 1)
    log.info("%s", lastline.strip())
    if " 0 alignment" in lastline.strip():
        log.warning("No alignment regions created")
    log.info("Alignment file/result: %s/.result", alignfile)

    # Add alignment parameter to registry
    # cwb-regedit is not installed by default, so we skip it and modify the regfile directly instead:
    regfile = os.path.join(os.environ["CORPUS_REGISTRY"], corpus)
    with open(regfile) as F:
        skip_align = ("ALIGNED %s" % other) in F.read()

    if not skip_align:
        with open(regfile, "a") as F:
            print(file=F)
            print("# Added by cwb.py", file=F)
            print("ALIGNED", other, file=F)
        log.info("Added alignment to registry: %s", regfile)
    # args = [corpus, ":add", ":a", other]
    # result, _ = util.system.call_binary(os.path.join(bin_path, "cwb-regedit"), args)
    # log.info("%s", result.strip())

    # Encode the alignments into CWB
    args = ["-v", "-D", alignfile]
    result, _ = util.system.call_binary(os.path.join(bin_path,
                                                     "cwb-align-encode"),
                                        args,
                                        encoding=encoding)
    log.info("%s", result.strip())
Code example #16
def freq_list_simple(corpus: Corpus = Corpus(),
                     docs: AllDocuments = AllDocuments(),
                     word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
                     pos: AnnotationAllDocs = AnnotationAllDocs("<token:pos>"),
                     baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
                     out: Export = Export("frequency_list/stats_[metadata.id].csv"),
                     delimiter: str = Config("stats_export.delimiter"),
                     cutoff: int = Config("stats_export.cutoff")):
    """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations."""
    freq_dict = defaultdict(int)

    for doc in docs:
        simple_tokens = word.read_attributes(doc, [word, pos, baseform])

        # Add empty annotations for sense, lemgram and complemgram
        tokens = []
        for w, p, b in simple_tokens:
            tokens.append((w, p, b, "|", "|", "|"))
        update_freqs(tokens, freq_dict)

    write_csv(out, freq_dict, delimiter, cutoff)
Code example #17
def freq_list(corpus: Corpus = Corpus(),
              docs: AllDocuments = AllDocuments(),
              word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
              msd: AnnotationAllDocs = AnnotationAllDocs("<token:msd>"),
              baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
              sense: AnnotationAllDocs = AnnotationAllDocs("<token:sense>"),
              lemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.lemgram"),
              complemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.complemgram"),
              out: Export = Export("frequency_list/stats_[metadata.id].csv"),
              delimiter: str = Config("stats_export.delimiter"),
              cutoff: int = Config("stats_export.cutoff"),
              include_all_compounds: bool = Config("stats_export.include_all_compounds")):
    """Create a word frequency list for the entire corpus.

    Args:
        corpus (str, optional): The corpus ID. Defaults to Corpus.
        docs (list, optional): The documents belonging to this corpus. Defaults to AllDocuments.
        word (str, optional): Word annotations. Defaults to AnnotationAllDocs("<token:word>").
        msd (str, optional): MSD annotations. Defaults to AnnotationAllDocs("<token:msd>").
        baseform (str, optional): Baseform annotations. Defaults to AnnotationAllDocs("<token:baseform>").
        sense (str, optional): Sense annotations. Defaults to AnnotationAllDocs("<token:sense>").
        lemgram (str, optional): Lemgram annotations. Defaults to AnnotationAllDocs("<token>:saldo.lemgram").
        complemgram (str, optional): Compound lemgram annotations.
            Defaults to AnnotationAllDocs("<token>:saldo.complemgram").
        out (str, optional): The output word frequency file.
            Defaults to Export("frequency_list/stats_[metadata.id].csv").
        delimiter (str, optional): Column delimiter to use in the csv. Defaults to Config("stats_export.delimiter").
        cutoff (int, optional): The minimum frequency a word must have in order to be included in the result.
            Defaults to Config("stats_export.cutoff").
        include_all_compounds (bool, optional): Whether to include compound analyses for every word
            or just for the words that are lacking a sense annotation.
            Defaults to Config("stats_export.include_all_compounds").
    """
    freq_dict = defaultdict(int)

    for doc in docs:
        tokens = word.read_attributes(doc, [word, msd, baseform, sense, lemgram, complemgram])
        update_freqs(tokens, freq_dict, include_all_compounds)

    write_csv(out, freq_dict, delimiter, cutoff)
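As a minimal standalone sketch of the counting-and-cutoff pattern described in the docstring (this is not the actual update_freqs or write_csv helper; the function name and signature are made up): identical annotation tuples are counted across documents, and only rows whose frequency reaches the cutoff are written as delimited CSV rows.

import csv
from collections import defaultdict


def write_frequency_csv(token_rows, path, delimiter=",", cutoff=1):
    """Count identical annotation tuples and write those with freq >= cutoff."""
    freqs = defaultdict(int)
    for row in token_rows:
        freqs[tuple(row)] += 1
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=delimiter)
        # Most frequent rows first
        for row, freq in sorted(freqs.items(), key=lambda item: -item[1]):
            if freq >= cutoff:
                writer.writerow(list(row) + [freq])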
Code example #18
def dateformat(
        in_from: Annotation = Annotation("[dateformat.datetime_from]"),
        in_to: Optional[Annotation] = Annotation("[dateformat.datetime_to]"),
        out_from: Output = Output(
            "[dateformat.out_annotation]:dateformat.datefrom",
            description="From-dates"),
        out_to: Optional[Output] = Output(
            "[dateformat.out_annotation]:dateformat.dateto",
            description="To-dates"),
        informat: str = Config("dateformat.datetime_informat"),
        outformat: str = Config("dateformat.date_outformat"),
        splitter: Optional[str] = Config("dateformat.splitter", None),
        regex: Optional[str] = Config("dateformat.regex", None)):
    """Convert existing dates/times to specified date output format.

    http://docs.python.org/library/datetime.html#strftime-and-strptime-behavior

    Args:
        in_from (str, optional): Annotation containing from-dates (and times).
            Defaults to Annotation("[dateformat.datetime_from]").
        in_to (Optional[str], optional): Annotation containing to-dates.
            Defaults to Annotation("[dateformat.datetime_to]").
        out_from (str, optional): Annotation with from-times to be written.
            Defaults to Output("[dateformat.out_annotation]:dateformat.datefrom",description="From-dates").
        out_to (Optional[str], optional): Annotation with to-times to be written.
            Defaults to Output("[dateformat.out_annotation]:dateformat.dateto",description="To-dates").
        informat (str, optional): Format of the in_from and in_to dates/times.
            Several formats can be specified separated by |. They will be tried in order.
            Defaults to Config("dateformat.datetime_informat").
        outformat (str, optional): Desired format of the out_from and out_to dates.
            Several formats can be specified separated by |. They will be tied to their respective in-format.
            Defaults to Config("dateformat.date_outformat", "%Y%m%d").
        splitter (str, optional): One or more characters separating two dates in 'in_from',
            treating them as from-date and to-date. Defaults to Config("dateformat.splitter", None).
        regex (str, optional): Regular expression with a catching group whose content will be used in the parsing
            instead of the whole string. Defaults to Config("dateformat.regex", None).
    """
    _formatter(in_from, in_to, out_from, out_to, informat, outformat, splitter,
               regex)
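The informat/outformat behaviour described above can be pictured with a small standalone sketch. This is not Sparv's _formatter; the function name and the single-outformat simplification are assumptions. Each |-separated informat is tried in order, and the first one that parses is rendered with the output format.

from datetime import datetime


def reformat_date(value, informats="%Y-%m-%d|%Y%m%d", outformat="%Y%m%d"):
    """Try each |-separated informat in order; render the first match with outformat."""
    for fmt in informats.split("|"):
        try:
            return datetime.strptime(value, fmt).strftime(outformat)
        except ValueError:
            continue
    raise ValueError("No informat matched %r" % value)


# reformat_date("2021-03-01")  ->  "20210301"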
Code example #19
def build_tokenlist(
        saldo_model: Model = Model("saldo/saldo.pickle"),
        out: ModelOutput = ModelOutput(
            "segment/bettertokenizer.sv.saldo-tokens"),
        segmenter: str = Config("segment.token_wordlist_segmenter"),
        model: Model = Model("segment/bettertokenizer.sv")):
    """Build a list of words from a SALDO model, to help BetterWordTokenizer."""
    segmenter_args = []
    if model:
        if model.path.suffix in [".pickle", ".pkl"]:  # Path.suffix includes the leading dot
            with open(model, "rb") as m:
                model_arg = pickle.load(m)
        else:
            model_arg = model.path
        segmenter_args.append(model_arg)
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(
        sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(
        segmenter, "span_tokenize"
    ), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    wordforms = set()

    # Skip strings already handled by the tokenizer.
    # Also skip words ending in comma (used by some multi word expressions in SALDO).
    with open(saldo_model.path, "rb") as F:
        lexicon = pickle.load(F)
        for w in lexicon:
            w2 = list(map(split_triple, lexicon[w]))
            mwu_extras = [
                contw for w3 in w2 for cont in w3[2] for contw in cont
                if contw not in lexicon
            ]
            for wf in mwu_extras + [w]:
                spans = list(segmenter.span_tokenize(wf))
                if len(spans) > 1 and not wf.endswith(","):
                    wordforms.add(wf)

    out.write("\n".join(sorted(wordforms)))
Code example #20
File: misc.py Project: heatherleaf/sparv-pipeline
def text_spans(text: Text = Text(),
               chunk: Annotation = Annotation("<token>"),
               out: Output = Output("<token>:misc.word", cls="token:word"),
               keep_formatting_chars: Optional[bool] = Config(
                   "misc.keep_formatting_chars")):
    """Add the text content for each edge as a new annotation."""
    corpus_text = text.read()
    if isinstance(chunk, (str, Annotation)):
        chunk = chunk.read_spans()
    out_annotation = []
    for span in chunk:
        token = corpus_text[span[0]:span[1]]
        if not keep_formatting_chars:
            new_token = util.remove_formatting_characters(token)
            # If this token consists entirely of formatting characters, don't remove them. Empty tokens are bad!
            if new_token:
                token = new_token
        out_annotation.append(token)
    if out:
        out.write(out_annotation)
    else:
        return out_annotation
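The core step here is plain span slicing over the corpus text; a minimal illustration with made-up values:

corpus_text = "Hunden skäller."
spans = [(0, 6), (7, 14), (14, 15)]
tokens = [corpus_text[start:end] for start, end in spans]
# tokens == ["Hunden", "skäller", "."]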
Code example #21
def resolution(
        out_resolution: OutputCommonData = OutputCommonData(
            "dateformat.resolution"),
        informat: Optional[str] = Config("dateformat.datetime_informat")):
    """Get the datetime resolution from the informat defined in the corpus config.

    Args:
        out_resolution: Date format output.
        informat: Date in-format, used to calculate date resolution.
    """
    resolutions = []

    if informat:
        informats = informat.strip("|").split("|")
        for i in informats:
            res = []
            if any(s in i for s in ["%Y", "%y"]):
                res.append("Y")
            if any(s in i for s in ["%b", "%B", "%m"]):
                res.append("M")
            if any(s in i for s in ["%a", "%A", "%w", "%d"]):
                res.append("D")
            if any(s in i for s in ["%H", "%I"]):
                res.append("h")
            if "%M" in i:
                res.append("m")
            if "%S" in i:
                res.append("s")
            resolutions.append("".join(res))

        # Sort with more fine-grained resolutions first
        resolutions.sort(key=len, reverse=True)

    resolutions = "|".join(resolutions)

    # Write time resolution file
    out_resolution.write(resolutions)
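The directive-to-resolution mapping above can be restated as a standalone helper, handy for sanity-checking an informat by hand. The function name is invented; the mapping mirrors the branches in the code above.

def infer_resolution(informat: str) -> str:
    """Reduce a strptime-style informat to a resolution string such as 'YMDhm'."""
    mapping = [("Y", ("%Y", "%y")), ("M", ("%b", "%B", "%m")),
               ("D", ("%a", "%A", "%w", "%d")), ("h", ("%H", "%I")),
               ("m", ("%M",)), ("s", ("%S",))]
    return "".join(unit for unit, directives in mapping
                   if any(d in informat for d in directives))


# infer_resolution("%Y-%m-%d %H:%M")  ->  "YMDhm"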
Code example #22
File: malt.py Project: heatherleaf/sparv-pipeline
# The value of this constant is a bit arbitrary, and could probably be longer.
RESTART_THRESHOLD_LENGTH = 64000

SENT_SEP = "\n\n"
TOK_SEP = "\n"
TAG_SEP = "\t"
HEAD_COLUMN = 6
DEPREL_COLUMN = 7
UNDEF = "_"


@annotator("Dependency parsing using MALT Parser",
           language=["swe"],
           config=[
               Config("malt.jar",
                      default="maltparser-1.7.2/maltparser-1.7.2.jar",
                      description="Path name of the executable .jar file"),
               Config("malt.model",
                      default="malt/swemalt-1.7.2.mco",
                      description="Path to MALT model")
           ])
def annotate(
        maltjar: Binary = Binary("[malt.jar]"),
        model: Model = Model("[malt.model]"),
        out_dephead: Output = Output(
            "<token>:malt.dephead",
            cls="token:dephead",
            description="Positions of the dependency heads"),
        out_dephead_ref: Output = Output(
            "<token>:malt.dephead_ref",
            cls="token:dephead_ref",
Code example #23
"""Annotate words with lexical classes from Blingbring or SweFN."""

import logging
from typing import List

import sparv.util as util
from sparv import Annotation, Config, Model, Output, annotator

log = logging.getLogger(__name__)


@annotator("Annotate tokens with Blingbring classes", language=["swe"], config=[
    Config("lexical_classes.bb_word_model", default="lexical_classes/blingbring.pickle",
           description="Path to Blingbring model")
])
def blingbring_words(out: Output = Output("<token>:lexical_classes.blingbring",
                                          description="Lexical classes for tokens from Blingbring"),
                     model: Model = Model("[lexical_classes.bb_word_model]"),
                     saldoids: Annotation = Annotation("<token:sense>"),
                     pos: Annotation = Annotation("<token:pos>"),
                     pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                     class_set: str = "bring",
                     disambiguate: bool = True,
                     connect_ids: bool = False,
                     delimiter: str = util.DELIM,
                     affix: str = util.AFFIX,
                     scoresep: str = util.SCORESEP,
                     lexicon=None):
    """Blingbring specific wrapper for annotate_words. See annotate_words for more info."""
    # pos_limit="NN VB JJ AB" | None
Code example #24
"""Export annotated corpus data to pretty-printed xml."""

import logging
import os

import sparv.util as util
from sparv import (AllDocuments, Annotation, AnnotationData, Config, Corpus, Document, Export, ExportAnnotations,
                   ExportInput, OutputCommonData, SourceAnnotations, exporter, installer)
from . import xml_utils

log = logging.getLogger(__name__)


@exporter("XML export with one token element per line", config=[
    Config("xml_export.filename", default="{doc}_export.xml",
           description="Filename pattern for resulting XML files, with '{doc}' representing the source name."),
    Config("xml_export.annotations", description="Sparv annotations to include."),
    Config("xml_export.source_annotations",
           description="List of annotations and attributes from the source data to include. Everything will be "
                       "included by default."),
    Config("xml_export.header_annotations",
           description="List of headers from the source data to include. All headers will be included by default."),
    Config("xml_export.include_empty_attributes", False,
           description="Whether to include attributes even when they are empty.")
])
def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
Code example #25
File: saldo.py Project: heatherleaf/sparv-pipeline
def annotate(token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             reference: Annotation = Annotation(
                 "<token>:misc.number_rel_<sentence>"),
             out_sense: Output = Output("<token>:saldo.sense",
                                        cls="token:sense",
                                        description="SALDO identifier"),
             out_lemgram: Output = Output("<token>:saldo.lemgram",
                                          description="SALDO lemgram"),
             out_baseform: Output = Output("<token>:saldo.baseform",
                                           cls="token:baseform",
                                           description="Baseform from SALDO"),
             models: List[Model] = [Model("[saldo.model]")],
             msd: Optional[Annotation] = Annotation("<token:msd>"),
             delimiter: str = util.DELIM,
             affix: str = util.AFFIX,
             precision: str = Config("saldo.precision"),
             precision_filter: str = "max",
             min_precision: float = 0.66,
             skip_multiword: bool = False,
             allow_multiword_overlap: bool = False,
             word_separator: str = "",
             lexicons=None):
    """Use the Saldo lexicon model (and optionally other older lexicons) to annotate pos-tagged words.

    - token, word, msd, sentence, reference: existing annotations
    - out_baseform, out_lemgram, out_sense: resulting annotations to be written
    - models: a list of pickled lexica, typically the Saldo model (saldo.pickle)
      and optional lexicons for older Swedish.
    - delimiter: delimiter character to put between ambiguous results
    - affix: an optional character to put before and after results
    - precision: a format string for how to print the precision for each annotation, e.g. ":%.3f"
      (use empty string for no precision)
    - precision_filter: an optional filter, currently there are the following values:
        max: only use the annotations that are most probable
        first: only use the most probable annotation (or one of the most probable if more than one)
        none: use all annotations
    - min_precision: only use annotations with a probability score higher than this
    - skip_multiword: set to True to disable multi word annotations
    - allow_multiword_overlap: by default we do some cleanup among overlapping multi word annotations.
      By setting this to True, all overlaps will be allowed.
    - word_separator: an optional character used to split the values of "word" into several word variations
    - lexicons: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    # Allow use of multiple lexicons
    models_list = [(m.path.stem, m) for m in models]
    if not lexicons:
        lexicon_list = [(name, SaldoLexicon(lex.path))
                        for name, lex in models_list]
    # Use pre-loaded lexicons (from catapult)
    else:
        lexicon_list = []
        for name, _lex in models_list:
            assert lexicons.get(
                name, None) is not None, "Lexicon %s not found!" % name
            lexicon_list.append((name, lexicons[name]))

    # Maximum number of gaps in multi-word units.
    # TODO: Set to 0 for hist-mode? since many (most?) multi-word in the old lexicons are inseparable (half öre etc)
    max_gaps = 1

    # Combine annotation names in the SALDO lexicon with the output annotations
    annotations = []
    if out_baseform:
        annotations.append((out_baseform, "gf"))
    if out_lemgram:
        annotations.append((out_lemgram, "lem"))
    if out_sense:
        annotations.append((out_sense, "saldo"))

    if skip_multiword:
        log.info("Skipping multi word annotations")

    min_precision = float(min_precision)

    # If min_precision is 0, skip almost all part-of-speech checking (verb multi-word expressions still won't be
    # allowed to span over other verbs)
    skip_pos_check = (min_precision == 0.0)

    word_annotation = list(word.read())
    ref_annotation = list(reference.read())
    if msd:
        msd_annotation = list(msd.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    out_annotation = word.create_empty_attribute()

    for sent in sentences:
        incomplete_multis = []  # [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}]
        complete_multis = []  # ([ref], annotation)
        sentence_tokens = {}

        for token_index in sent:
            theword = word_annotation[token_index]
            ref = ref_annotation[token_index]
            msdtag = msd_annotation[token_index] if msd else ""

            annotation_info = {}
            sentence_tokens[ref] = {
                "token_index": token_index,
                "annotations": annotation_info
            }

            # Support for multiple values of word
            if word_separator:
                thewords = [w for w in theword.split(word_separator) if w]
            else:
                thewords = [theword]

            # First use MSD tags to find the most probable single word annotations
            ann_tags_words = find_single_word(thewords, lexicon_list, msdtag,
                                              precision, min_precision,
                                              precision_filter,
                                              annotation_info)

            # Find multi-word expressions
            if not skip_multiword:
                find_multiword_expressions(incomplete_multis, complete_multis,
                                           thewords, ref, msdtag, max_gaps,
                                           ann_tags_words, msd_annotation,
                                           sent, skip_pos_check)

            # Loop to next token

        if not allow_multiword_overlap:
            # Check that we don't have any unwanted overlaps
            remove_unwanted_overlaps(complete_multis)

        # Then save the rest of the multi word expressions in sentence_tokens
        save_multiwords(complete_multis, sentence_tokens)

        for tok in list(sentence_tokens.values()):
            out_annotation[tok["token_index"]] = _join_annotation(
                tok["annotations"], delimiter, affix)

        # Loop to next sentence

    for out_annotation_obj, annotation_name in annotations:
        out_annotation_obj.write(
            [v.get(annotation_name, delimiter) for v in out_annotation])
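The three precision_filter modes described in the docstring can be summarised with a small hypothetical helper (not part of this module; the name and the candidate representation are assumptions):

def filter_by_precision(candidates, mode="max"):
    """candidates: list of (precision, annotation) pairs.

    'none'  -> keep everything
    'max'   -> keep every candidate sharing the highest precision
    'first' -> keep a single candidate with the highest precision
    """
    if mode == "none" or not candidates:
        return list(candidates)
    best = max(precision for precision, _ in candidates)
    top = [(p, a) for p, a in candidates if p == best]
    return top[:1] if mode == "first" else top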
Code example #26
from sparv import Annotation, Config, Model, ModelOutput, Output, Text, annotator, modelbuilder
from sparv.modules.saldo.saldo_model import split_triple

try:
    from . import crf  # for CRF++ models
except ImportError:
    pass

log = logging.getLogger(__name__)


@annotator(
    "Automatic tokenization",
    config=[
        Config("segment.token_segmenter",
               default="better_word",
               description="Token segmenter to use"),
        Config("segment.token_chunk",
               default="<sentence>",
               description="Text chunk (annotation) to use as input when tokenizing"),
        Config("segment.existing_tokens",
               description="Optional existing token annotation"),
        Config("segment.tokenizer_config",
               default="segment/bettertokenizer.sv",
               description="Path to tokenizer config"),
        Config("segment.token_list",
               default="segment/bettertokenizer.sv.saldo-tokens",
               description="Path to optional token list file")
    ])
def tokenize(
Code example #27
import logging

import sparv.util as util
from sparv import Annotation, Binary, Config, Model, ModelOutput, Output, annotator, modelbuilder

log = logging.getLogger(__name__)

SENT_SEP = "$SENT$"


@annotator("Word sense disambiguation",
           language=["swe"],
           config=[
               Config("wsd.sense_model",
                      default="wsd/ALL_512_128_w10_A2_140403_ctx1.bin",
                      description="Path to sense model"),
               Config("wsd.context_model",
                      default="wsd/lem_cbow0_s512_w10_NEW2_ctx.bin",
                      description="Path to context model"),
               Config("wsd.default_prob",
                      -1.0,
                      description="Default value for unanalyzed senses"),
               Config("wsd.jar",
                      default="wsd/saldowsd.jar",
                      description="Path name of the executable .jar file"),
               Config("wsd.prob_format",
                      util.SCORESEP + "%.3f",
                      description="Format string for how to print the "
                      "sense probability")
           ])
Code example #28
    "eng": "Penn",
    "fra": "TreeTagger",
    "spa": "TreeTagger",
    "ita": "TreeTagger",
    "rus": "TreeTagger",
}


@annotator("Part-of-speech tags and baseforms from TreeTagger",
           language=[
               "bul", "est", "fin", "lat", "nld", "pol", "ron", "slk", "deu",
               "eng", "fra", "spa", "ita", "rus"
           ],
           config=[
               Config("treetagger.binary",
                      "tree-tagger",
                      description="TreeTagger executable"),
               Config("treetagger.model",
                      "treetagger/[metadata.language].par",
                      description="Path to TreeTagger model")
           ])
def annotate(
        lang: Language = Language(),
        model: Model = Model("[treetagger.model]"),
        tt_binary: Binary = Binary("[treetagger.binary]"),
        out_upos: Output = Output("<token>:treetagger.upos",
                                  cls="token:upos",
                                  description="Part-of-speeches in UD"),
        out_pos: Output = Output(
            "<token>:treetagger.pos",
            cls="token:pos",
Code example #29
def annotate(
        wsdjar: Binary = Binary("[wsd.jar]"),
        sense_model: Model = Model("[wsd.sense_model]"),
        context_model: Model = Model("[wsd.context_model]"),
        out: Output = Output(
            "<token>:wsd.sense",
            cls="token:sense",
            description="Sense disambiguated SALDO identifiers"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        saldo: Annotation = Annotation("<token>:saldo.sense"),
        pos: Annotation = Annotation("<token:pos>"),
        token: Annotation = Annotation("<token>"),
        prob_format: str = Config("wsd.prob_format"),
        default_prob: float = Config("wsd.default_prob"),
        encoding: str = util.UTF8):
    """Run the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation.

    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.
      - wsdjar is the name of the Java program to be used for the WSD
      - sense_model and context_model are the models to be used with wsdjar
      - out is the resulting annotation file
      - sentence is an existing annotation for sentences and their children (words)
      - word is an existing annotation for wordforms
      - ref is an existing annotation for word references
      - lemgram and saldo are existing annotations for inflection tables and meanings
      - pos is an existing annotation for part-of-speech
      - prob_format is a format string for how to print the sense probability
      - default_prob is the default value for unanalyzed senses
    """
    word_annotation = list(word.read())
    ref_annotation = list(ref.read())
    lemgram_annotation = list(lemgram.read())
    saldo_annotation = list(saldo.read())
    pos_annotation = list(pos.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    # Start WSD process
    process = wsd_start(wsdjar, sense_model.path, context_model.path, encoding)

    # Construct input and send to WSD
    stdin = build_input(sentences, word_annotation, ref_annotation,
                        lemgram_annotation, saldo_annotation, pos_annotation)

    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve hack line below!
    # Problem is that regular messages "Reading sense vectors.." are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(word, out, stdout, sentences, saldo_annotation, prob_format,
                   default_prob)

    # Kill running subprocess
    util.system.kill_process(process)
    return
Code example #30
File: saldo.py Project: heatherleaf/sparv-pipeline
import sparv.util as util
from sparv import Annotation, Config, Model, Output, annotator
from sparv.modules.saldo.saldo_model import SaldoLexicon

log = logging.getLogger(__name__)

# The minimum precision difference for two annotations to be considered equal
PRECISION_DIFF = 0.01


@annotator(
    "SALDO annotations",
    language=["swe"],
    config=[
        Config("saldo.model",
               default="saldo/saldo.pickle",
               description="Path to SALDO model"),
        Config(
            "saldo.precision",
            "",
            description="Format string for appending precision to each value")
    ])
def annotate(token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             reference: Annotation = Annotation(
                 "<token>:misc.number_rel_<sentence>"),
             out_sense: Output = Output("<token>:saldo.sense",
                                        cls="token:sense",
                                        description="SALDO identifier"),
             out_lemgram: Output = Output("<token>:saldo.lemgram",