def install_original(corpus: Corpus = Corpus(),
                     xmlfile: ExportInput = ExportInput("[xml_export.filename_compressed]"),
                     out: OutputCommonData = OutputCommonData("xml_export.install_export_pretty_marker"),
                     export_path: str = Config("xml_export.export_original_path"),
                     host: str = Config("xml_export.export_original_host")):
    """Copy compressed combined unscrambled XML to remote host."""
    xml_utils.install_compressed_xml(corpus, xmlfile, out, export_path, host)
def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
           source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
           header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
           remove_namespaces: bool = Config("export.remove_module_namespaces", False),
           sparv_namespace: str = Config("export.sparv_namespace"),
           source_namespace: str = Config("export.source_namespace"),
           include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to pretty XML in export_dir.

    Args:
        doc: Name of the original document.
        docid: Annotation with document IDs.
        out: Path and filename pattern for the resulting file.
        token: Annotation containing the token strings.
        word: Annotation containing the word strings.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document to be kept.
            If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include in the export.
            If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)
    token_name = token.name

    # Read words and document ID
    word_annotation = list(word.read())
    docid_annotation = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 token_name=token_name,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, split_overlaps=True)
    xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name, word_annotation,
                                       docid_annotation, include_empty_attributes, sparv_namespace)

    # Write XML to file
    with open(out, mode="w") as outfile:
        outfile.write(xmlstr)
    log.info("Exported: %s", out)
def encode_scrambled(corpus: Corpus = Corpus(),
                     annotations: ExportAnnotations = ExportAnnotations("cwb.annotations", is_input=False),
                     source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
                     docs: AllDocuments = AllDocuments(),
                     words: AnnotationAllDocs = AnnotationAllDocs("[export.word]"),
                     vrtfiles: ExportInput = ExportInput("vrt_scrambled/{doc}.vrt", all_docs=True),
                     out: Export = Export("[cwb.corpus_registry]/[metadata.id]", absolute_path=True),
                     out_marker: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.scrambled_marker",
                                                 absolute_path=True),
                     token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
                     bin_path: Config = Config("cwb.bin_path"),
                     encoding: str = Config("cwb.encoding"),
                     datadir: str = Config("cwb.cwb_datadir"),
                     registry: str = Config("cwb.corpus_registry"),
                     remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                     sparv_namespace: str = Config("export.sparv_namespace"),
                     source_namespace: str = Config("export.source_namespace"),
                     skip_compression: Optional[bool] = Config("cwb.skip_compression"),
                     skip_validation: Optional[bool] = Config("cwb.skip_validation")):
    """Do cwb encoding with vrt files in scrambled order."""
    cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles, out, out_marker, token.name,
               bin_path, encoding, datadir, registry, remove_namespaces, sparv_namespace, source_namespace,
               skip_compression, skip_validation)
def vrt_scrambled(doc: Document = Document(),
                  out: Export = Export("vrt_scrambled/{doc}.vrt"),
                  chunk: Annotation = Annotation("[cwb.scramble_on]"),
                  chunk_order: Annotation = Annotation("[cwb.scramble_on]:misc.number_random"),
                  token: Annotation = Annotation("<token>"),
                  word: Annotation = Annotation("[export.word]"),
                  annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
                  source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
                  remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                  sparv_namespace: str = Config("export.sparv_namespace"),
                  source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt in scrambled order."""
    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name, remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    if chunk not in annotation_list:
        raise util.SparvErrorMessage(
            "The annotation used for scrambling ({}) needs to be included in the output.".format(chunk))
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc,
                                                              split_overlaps=True)

    # Read words and chunk order
    word_annotation = list(word.read())
    chunk_order_data = list(chunk_order.read())

    # Reorder chunks and open/close tags in correct order
    new_span_positions = util.scramble_spans(span_positions, chunk.name, chunk_order_data)

    # Make vrt format
    vrt_data = create_vrt(new_span_positions, token.name, word_annotation, token_attributes, annotation_dict,
                          export_names)

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
def parse(doc: Document = Document(),
          source_dir: Source = Source(),
          elements: list = Config("xml_import.elements"),
          skip: list = Config("xml_import.skip"),
          header_elements: list = Config("xml_import.header_elements"),
          header_data: list = Config("xml_import.header_data"),
          prefix: str = Config("xml_import.prefix"),
          encoding: str = Config("xml_import.encoding"),
          keep_control_chars: bool = Config("xml_import.keep_control_chars"),
          normalize: str = Config("xml_import.normalize")):
    """Parse XML source file and create annotation files.

    Args:
        doc: Source document name.
        source_dir: Directory containing source documents.
        elements: List of elements and attributes in source document. Only needed for renaming, as everything is
            parsed whether listed or not.
        skip: Elements and attributes to skip. Use elementname:@contents to skip contents as well.
        header_elements: Elements containing header metadata. Contents will not be included in corpus text.
        header_data: List of header elements and attributes from which to extract metadata.
        prefix: Optional prefix to add to annotations.
        encoding: Encoding of source document. Defaults to UTF-8.
        keep_control_chars: Set to True to keep control characters in the text.
        normalize: Normalize input using any of the following forms: 'NFC', 'NFKC', 'NFD', and 'NFKD'.
            Defaults to 'NFC'.
    """
    parser = SparvXMLParser(elements, skip, header_elements, header_data, encoding, source_dir, prefix,
                            keep_control_chars, normalize)
    parser.parse(doc)
    parser.save()
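# Illustration only (not part of the module): following the docstring's "elementname:@contents" convention,
# hypothetical values for the `skip` setting could look like the list below. The element names are made up;
# a bare element name skips the element annotation, while the ":@contents" suffix also drops its text.
#
#   skip = ["note",                  # skip the <note> element annotation, keep its text in the corpus
#           "teiHeader:@contents"]   # skip <teiHeader> and exclude its text content as well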
def install_json(jsonfile: ExportInput = ExportInput("[metadata.id].json"),
                 out: OutputCommonData = OutputCommonData("sbx_metadata.install_json_export_marker"),
                 export_path: str = Config("sbx_metadata.json_export_path"),
                 host: str = Config("sbx_metadata.json_export_host")):
    """Copy JSON metadata to remote host."""
    if not host:
        raise util.SparvErrorMessage("'sbx_metadata.json_export_host' not set! JSON export not installed.")
    filename = Path(jsonfile).name
    remote_file_path = os.path.join(export_path, filename)
    util.install_file(host, jsonfile, remote_file_path)
    out.write("")
def install_metashare(xmlfile: ExportInput = ExportInput("sbx_metadata/[metadata.id].xml"),
                      out: OutputCommonData = OutputCommonData("sbx_metadata.install_metashare_marker"),
                      export_path: str = Config("sbx_metadata.metashare_path"),
                      host: str = Config("sbx_metadata.metashare_host")):
    """Copy META-SHARE file to remote host."""
    if not host:
        raise util.SparvErrorMessage("'sbx_metadata.metashare_host' not set! META-SHARE export not installed.")
    filename = Path(xmlfile).name
    remote_file_path = os.path.join(export_path, filename)
    util.install_file(host, xmlfile, remote_file_path)
    out.write("")
def info(out: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.info", absolute_path=True),
         sentences: AnnotationCommonData = AnnotationCommonData("cwb.sentencecount"),
         firstdate: AnnotationCommonData = AnnotationCommonData("cwb.datefirst"),
         lastdate: AnnotationCommonData = AnnotationCommonData("cwb.datelast"),
         resolution: AnnotationCommonData = AnnotationCommonData("dateformat.resolution"),
         protected: bool = Config("korp.protected")):
    """Save information to the file specified by 'out'."""
    content = []
    protected_str = str(protected).lower()

    for key, value_obj in [("Sentences", sentences),
                           ("FirstDate", firstdate),
                           ("LastDate", lastdate),
                           ("DateResolution", resolution),
                           ("Updated", time.strftime("%Y-%m-%d")),
                           ("Protected", protected_str)]:
        if isinstance(value_obj, AnnotationCommonData):
            value = value_obj.read()
        else:
            value = value_obj
        content.append("%s: %s\n" % (key, value))

    # Write .info file
    with open(out, "w") as o:
        o.writelines(content)

    log.info("Exported: %s", out)
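# Illustration only (not part of the module): the loop above produces one "Key: value" line per entry,
# so with hypothetical annotation values the resulting .info file would look roughly like this:
#
#   Sentences: 45328
#   FirstDate: 2011-01-01 00:00:00
#   LastDate: 2011-12-31 23:59:59
#   DateResolution: YMDhms
#   Updated: 2024-05-01
#   Protected: false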
def install_relations(sqlfile: ExportInput = ExportInput("korp_wordpicture/relations.sql"),
                      out: OutputCommonData = OutputCommonData("korp.install_relations_marker"),
                      db_name: str = Config("korp.mysql_dbname"),
                      host: str = Config("korp.remote_host")):
    """Install Korp's Word Picture SQL on remote host.

    Args:
        sqlfile (str, optional): SQL file to be installed. Defaults to ExportInput("korp_wordpicture/relations.sql").
        out (str, optional): Marker file to be written. Defaults to OutputCommonData("korp.install_relations_marker").
        db_name (str, optional): Name of the database. Defaults to Config("korp.mysql_dbname").
        host (str, optional): Remote host to install to. Defaults to Config("korp.remote_host").
    """
    util.install_mysql(host, db_name, sqlfile)
    out.write("")
def paragraph(text: Text = Text(),
              out: Output = Output("segment.paragraph", cls="paragraph", description="Paragraph segments"),
              chunk: Optional[Annotation] = Annotation("[segment.paragraph_chunk]"),
              segmenter: str = Config("segment.paragraph_segmenter"),
              existing_segments: Optional[str] = Config("segment.existing_paragraphs"),
              model: Optional[Model] = None):
    """Split text into paragraphs."""
    do_segmentation(text=text, out=out, chunk=chunk, segmenter=segmenter, existing_segments=existing_segments,
                    model=model)
def sentence(text: Text = Text(),
             out: Output = Output("segment.sentence", cls="sentence", description="Sentence segments"),
             chunk: Optional[Annotation] = Annotation("[segment.sentence_chunk]"),
             segmenter: str = Config("segment.sentence_segmenter"),
             existing_segments: Optional[str] = Config("segment.existing_sentences"),
             model: Optional[Model] = Model("[segment.sentence_model]")):
    """Split text into sentences."""
    do_segmentation(text=text, out=out, chunk=chunk, segmenter=segmenter, existing_segments=existing_segments,
                    model=model)
def install_lemgrams(sqlfile: ExportInput = ExportInput("korp_lemgram_index/lemgram_index.sql"),
                     marker: OutputCommonData = OutputCommonData("korp.install_lemgram_marker"),
                     db_name: str = Config("korp.mysql_dbname"),
                     host: str = Config("korp.remote_host")):
    """Install lemgram SQL on remote host.

    Args:
        sqlfile (str, optional): SQL file to be installed.
            Defaults to ExportInput("korp_lemgram_index/lemgram_index.sql").
        marker (str, optional): Marker file to be written.
            Defaults to OutputCommonData("korp.install_lemgram_marker").
        db_name (str, optional): Name of the database. Defaults to Config("korp.mysql_dbname").
        host (str, optional): Remote host to install to. Defaults to Config("korp.remote_host").
    """
    util.install_mysql(host, db_name, sqlfile)
    marker.write("")
def tokenize(text: Text = Text(),
             out: Output = Output("segment.token", cls="token", description="Token segments"),
             chunk: Annotation = Annotation("[segment.token_chunk]"),
             segmenter: str = Config("segment.token_segmenter"),
             existing_segments: Optional[str] = Config("segment.existing_tokens"),
             model: Optional[Model] = Model("[segment.tokenizer_config]"),
             token_list: Optional[Model] = Model("[segment.token_list]")):
    """Tokenize text."""
    do_segmentation(text=text, out=out, chunk=chunk, segmenter=segmenter, existing_segments=existing_segments,
                    model=model, token_list=token_list)
def vrt(doc: Document = Document(),
        out: Export = Export("vrt/{doc}.vrt"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt.

    - annotations: list of elements:attributes (annotations) to include.
    - source_annotations: list of elements:attributes from the original document to be kept.
      If not specified, everything will be kept.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name, remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)
    vrt_data = create_vrt(span_positions, token.name, word_annotation, token_attributes, annotation_dict,
                          export_names)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
def cwb_align(corpus, other, link, aligndir="annotations/align", bin_path="",
              encoding: str = Config("cwb.encoding", "utf8")):
    """Align 'corpus' with 'other' corpus, using the 'link' annotation for alignment."""
    os.makedirs(aligndir, exist_ok=True)
    alignfile = os.path.join(aligndir, corpus + ".align")
    log.info("Aligning %s <-> %s", corpus, other)

    try:
        [(link_name, [(link_attr, _path)])] = parse_structural_attributes(link)
    except ValueError:
        raise ValueError("You have to specify exactly one alignment link.")
    link_attr = link_name + "_" + link_attr

    # Align linked chunks
    args = ["-v", "-o", alignfile, "-V", link_attr, corpus, other, link_name]
    result, _ = util.system.call_binary(os.path.join(bin_path, "cwb-align"), args, encoding=encoding)
    with open(alignfile + ".result", "w") as F:
        print(result, file=F)
    _, lastline = result.rsplit("Alignment complete.", 1)
    log.info("%s", lastline.strip())
    if " 0 alignment" in lastline.strip():
        log.warning("No alignment regions created")
    log.info("Alignment file/result: %s/.result", alignfile)

    # Add alignment parameter to registry
    # cwb-regedit is not installed by default, so we skip it and modify the regfile directly instead:
    regfile = os.path.join(os.environ["CORPUS_REGISTRY"], corpus)
    with open(regfile) as F:
        skip_align = ("ALIGNED %s" % other) in F.read()

    if not skip_align:
        with open(regfile, "a") as F:
            print(file=F)
            print("# Added by cwb.py", file=F)
            print("ALIGNED", other, file=F)
        log.info("Added alignment to registry: %s", regfile)
    # args = [corpus, ":add", ":a", other]
    # result, _ = util.system.call_binary(os.path.join(bin_path, "cwb-regedit"), args)
    # log.info("%s", result.strip())

    # Encode the alignments into CWB
    args = ["-v", "-D", alignfile]
    result, _ = util.system.call_binary(os.path.join(bin_path, "cwb-align-encode"), args, encoding=encoding)
    log.info("%s", result.strip())
def freq_list_simple(corpus: Corpus = Corpus(),
                     docs: AllDocuments = AllDocuments(),
                     word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
                     pos: AnnotationAllDocs = AnnotationAllDocs("<token:pos>"),
                     baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
                     out: Export = Export("frequency_list/stats_[metadata.id].csv"),
                     delimiter: str = Config("stats_export.delimiter"),
                     cutoff: int = Config("stats_export.cutoff")):
    """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations."""
    freq_dict = defaultdict(int)

    for doc in docs:
        simple_tokens = word.read_attributes(doc, [word, pos, baseform])

        # Add empty annotations for sense, lemgram and complemgram
        tokens = []
        for w, p, b in simple_tokens:
            tokens.append((w, p, b, "|", "|", "|"))
        update_freqs(tokens, freq_dict)

    write_csv(out, freq_dict, delimiter, cutoff)
def freq_list(corpus: Corpus = Corpus(),
              docs: AllDocuments = AllDocuments(),
              word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
              msd: AnnotationAllDocs = AnnotationAllDocs("<token:msd>"),
              baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
              sense: AnnotationAllDocs = AnnotationAllDocs("<token:sense>"),
              lemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.lemgram"),
              complemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.complemgram"),
              out: Export = Export("frequency_list/stats_[metadata.id].csv"),
              delimiter: str = Config("stats_export.delimiter"),
              cutoff: int = Config("stats_export.cutoff"),
              include_all_compounds: bool = Config("stats_export.include_all_compounds")):
    """Create a word frequency list for the entire corpus.

    Args:
        corpus (str, optional): The corpus ID. Defaults to Corpus().
        docs (list, optional): The documents belonging to this corpus. Defaults to AllDocuments().
        word (str, optional): Word annotations. Defaults to AnnotationAllDocs("<token:word>").
        msd (str, optional): MSD annotations. Defaults to AnnotationAllDocs("<token:msd>").
        baseform (str, optional): Baseform annotations. Defaults to AnnotationAllDocs("<token:baseform>").
        sense (str, optional): Sense annotations. Defaults to AnnotationAllDocs("<token:sense>").
        lemgram (str, optional): Lemgram annotations. Defaults to AnnotationAllDocs("<token>:saldo.lemgram").
        complemgram (str, optional): Compound lemgram annotations.
            Defaults to AnnotationAllDocs("<token>:saldo.complemgram").
        out (str, optional): The output word frequency file.
            Defaults to Export("frequency_list/stats_[metadata.id].csv").
        delimiter (str, optional): Column delimiter to use in the csv. Defaults to Config("stats_export.delimiter").
        cutoff (int, optional): The minimum frequency a word must have in order to be included in the result.
            Defaults to Config("stats_export.cutoff").
        include_all_compounds (bool, optional): Whether to include compound analyses for every word or just for the
            words that are lacking a sense annotation. Defaults to Config("stats_export.include_all_compounds").
    """
    freq_dict = defaultdict(int)

    for doc in docs:
        tokens = word.read_attributes(doc, [word, msd, baseform, sense, lemgram, complemgram])
        update_freqs(tokens, freq_dict, include_all_compounds)

    write_csv(out, freq_dict, delimiter, cutoff)
def dateformat(in_from: Annotation = Annotation("[dateformat.datetime_from]"),
               in_to: Optional[Annotation] = Annotation("[dateformat.datetime_to]"),
               out_from: Output = Output("[dateformat.out_annotation]:dateformat.datefrom",
                                         description="From-dates"),
               out_to: Optional[Output] = Output("[dateformat.out_annotation]:dateformat.dateto",
                                                 description="To-dates"),
               informat: str = Config("dateformat.datetime_informat"),
               outformat: str = Config("dateformat.date_outformat"),
               splitter: Optional[str] = Config("dateformat.splitter", None),
               regex: Optional[str] = Config("dateformat.regex", None)):
    """Convert existing dates/times to the specified date output format.

    http://docs.python.org/library/datetime.html#strftime-and-strptime-behavior

    Args:
        in_from (str, optional): Annotation containing from-dates (and times).
            Defaults to Annotation("[dateformat.datetime_from]").
        in_to (Optional[str], optional): Annotation containing to-dates.
            Defaults to Annotation("[dateformat.datetime_to]").
        out_from (str, optional): Annotation with from-times to be written.
            Defaults to Output("[dateformat.out_annotation]:dateformat.datefrom", description="From-dates").
        out_to (Optional[str], optional): Annotation with to-times to be written.
            Defaults to Output("[dateformat.out_annotation]:dateformat.dateto", description="To-dates").
        informat (str, optional): Format of the in_from and in_to dates/times. Several formats can be specified
            separated by |. They will be tried in order. Defaults to Config("dateformat.datetime_informat").
        outformat (str, optional): Desired format of the out_from and out_to dates. Several formats can be specified
            separated by |. They will be tied to their respective in-format.
            Defaults to Config("dateformat.date_outformat").
        splitter (str, optional): One or more characters separating two dates in 'in_from', treating them as
            from-date and to-date. Defaults to Config("dateformat.splitter", None).
        regex (str, optional): Regular expression with a catching group whose content will be used in the parsing
            instead of the whole string. Defaults to Config("dateformat.regex", None).
    """
    _formatter(in_from, in_to, out_from, out_to, informat, outformat, splitter, regex)
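# Illustration only (not part of the module): a minimal sketch of the "|" semantics documented above,
# where each in-format is tried in order and, on success, the value is rendered with the out-format at the
# same position. The helper name `convert_date` is hypothetical, and reusing a single out-format for all
# in-formats is an assumption made for the sketch; the real work is done by _formatter.
from datetime import datetime


def convert_date(value: str, informat: str, outformat: str) -> str:
    informats = informat.strip("|").split("|")
    outformats = outformat.strip("|").split("|")
    if len(outformats) == 1:
        # Assumption: a single out-format applies to every in-format
        outformats = outformats * len(informats)
    for ifmt, ofmt in zip(informats, outformats):
        try:
            return datetime.strptime(value, ifmt).strftime(ofmt)
        except ValueError:
            continue
    raise ValueError("None of the in-formats matched %r" % value)


# convert_date("2011-05-02", "%Y-%m-%d|%Y", "%Y%m%d")  ->  "20110502"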
def build_tokenlist(saldo_model: Model = Model("saldo/saldo.pickle"),
                    out: ModelOutput = ModelOutput("segment/bettertokenizer.sv.saldo-tokens"),
                    segmenter: str = Config("segment.token_wordlist_segmenter"),
                    model: Model = Model("segment/bettertokenizer.sv")):
    """Build a list of words from a SALDO model, to help BetterWordTokenizer."""
    segmenter_args = []
    if model:
        # Note: Path.suffix includes the leading dot
        if model.path.suffix in [".pickle", ".pkl"]:
            with open(model, "rb") as m:
                model_arg = pickle.load(m)
        else:
            model_arg = model.path
        segmenter_args.append(model_arg)
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(segmenter, "span_tokenize"), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    wordforms = set()

    # Skip strings already handled by the tokenizer.
    # Also skip words ending in comma (used by some multi word expressions in SALDO).
    with open(saldo_model.path, "rb") as F:
        lexicon = pickle.load(F)

    for w in lexicon:
        w2 = list(map(split_triple, lexicon[w]))
        mwu_extras = [contw for w3 in w2 for cont in w3[2] for contw in cont if contw not in lexicon]
        for wf in mwu_extras + [w]:
            spans = list(segmenter.span_tokenize(wf))
            if len(spans) > 1 and not wf.endswith(","):
                wordforms.add(wf)

    out.write("\n".join(sorted(wordforms)))
def text_spans(text: Text = Text(),
               chunk: Annotation = Annotation("<token>"),
               out: Output = Output("<token>:misc.word", cls="token:word"),
               keep_formatting_chars: Optional[bool] = Config("misc.keep_formatting_chars")):
    """Add the text content for each edge as a new annotation."""
    corpus_text = text.read()
    if isinstance(chunk, (str, Annotation)):
        chunk = chunk.read_spans()
    out_annotation = []
    for span in chunk:
        token = corpus_text[span[0]:span[1]]
        if not keep_formatting_chars:
            new_token = util.remove_formatting_characters(token)
            # If this token consists entirely of formatting characters, don't remove them. Empty tokens are bad!
            if new_token:
                token = new_token
        out_annotation.append(token)
    if out:
        out.write(out_annotation)
    else:
        return out_annotation
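# Illustration only (not part of the module): each span is a (start, end) character offset pair into the
# document text, and the annotation value is simply the corresponding slice. The variables are hypothetical.
corpus_text_example = "Hello world"
spans_example = [(0, 5), (6, 11)]
tokens_example = [corpus_text_example[start:end] for start, end in spans_example]  # ["Hello", "world"]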
def resolution(out_resolution: OutputCommonData = OutputCommonData("dateformat.resolution"),
               informat: Optional[str] = Config("dateformat.datetime_informat")):
    """Get the datetime resolution from the informat defined in the corpus config.

    Args:
        out_resolution: Date format output.
        informat: Date in-format, used to calculate date resolution.
    """
    resolutions = []

    if informat:
        informats = informat.strip("|").split("|")
        for i in informats:
            res = []
            if any(s in i for s in ["%Y", "%y"]):
                res.append("Y")
            if any(s in i for s in ["%b", "%B", "%m"]):
                res.append("M")
            if any(s in i for s in ["%a", "%A", "%w", "%d"]):
                res.append("D")
            if any(s in i for s in ["%H", "%I"]):
                res.append("h")
            if "%M" in i:
                res.append("m")
            if "%S" in i:
                res.append("s")
            resolutions.append("".join(res))

        # Sort with more fine-grained resolutions first
        resolutions.sort(key=len, reverse=True)

    resolutions = "|".join(resolutions)

    # Write time resolution file
    out_resolution.write(resolutions)
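# Illustration only (not part of the module): with a hypothetical in-format configuration
#   dateformat.datetime_informat = "%Y-%m-%d %H:%M|%Y"
# the function above writes the resolution string
#   "YMDhm|Y"
# (year/month/day/hour/minute for the first format, year only for the second, most fine-grained first).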
# The value of this constant is a bit arbitrary, and could probably be longer.
RESTART_THRESHOLD_LENGTH = 64000

SENT_SEP = "\n\n"
TOK_SEP = "\n"
TAG_SEP = "\t"
HEAD_COLUMN = 6
DEPREL_COLUMN = 7
UNDEF = "_"


@annotator("Dependency parsing using MALT Parser", language=["swe"], config=[
    Config("malt.jar", default="maltparser-1.7.2/maltparser-1.7.2.jar",
           description="Path name of the executable .jar file"),
    Config("malt.model", default="malt/swemalt-1.7.2.mco", description="Path to MALT model")
])
def annotate(maltjar: Binary = Binary("[malt.jar]"),
             model: Model = Model("[malt.model]"),
             out_dephead: Output = Output("<token>:malt.dephead", cls="token:dephead",
                                          description="Positions of the dependency heads"),
             out_dephead_ref: Output = Output("<token>:malt.dephead_ref", cls="token:dephead_ref",
"""Annotate words with lexical classes from Blingbring or SweFN.""" import logging from typing import List import sparv.util as util from sparv import Annotation, Config, Model, Output, annotator log = logging.getLogger(__name__) @annotator("Annotate tokens with Blingbring classes", language=["swe"], config=[ Config("lexical_classes.bb_word_model", default="lexical_classes/blingbring.pickle", description="Path to Blingbring model") ]) def blingbring_words(out: Output = Output("<token>:lexical_classes.blingbring", description="Lexical classes for tokens from Blingbring"), model: Model = Model("[lexical_classes.bb_word_model]"), saldoids: Annotation = Annotation("<token:sense>"), pos: Annotation = Annotation("<token:pos>"), pos_limit: List[str] = ["NN", "VB", "JJ", "AB"], class_set: str = "bring", disambiguate: bool = True, connect_ids: bool = False, delimiter: str = util.DELIM, affix: str = util.AFFIX, scoresep: str = util.SCORESEP, lexicon=None): """Blingbring specific wrapper for annotate_words. See annotate_words for more info.""" # pos_limit="NN VB JJ AB" | None
"""Export annotated corpus data to pretty-printed xml.""" import logging import os import sparv.util as util from sparv import (AllDocuments, Annotation, AnnotationData, Config, Corpus, Document, Export, ExportAnnotations, ExportInput, OutputCommonData, SourceAnnotations, exporter, installer) from . import xml_utils log = logging.getLogger(__name__) @exporter("XML export with one token element per line", config=[ Config("xml_export.filename", default="{doc}_export.xml", description="Filename pattern for resulting XML files, with '{doc}' representing the source name."), Config("xml_export.annotations", description="Sparv annotations to include."), Config("xml_export.source_annotations", description="List of annotations and attributes from the source data to include. Everything will be " "included by default."), Config("xml_export.header_annotations", description="List of headers from the source data to include. All headers will be included by default."), Config("xml_export.include_empty_attributes", False, description="Whether to include attributes even when they are empty.") ]) def pretty(doc: Document = Document(), docid: AnnotationData = AnnotationData("<docid>"), out: Export = Export("xml_pretty/[xml_export.filename]"), token: Annotation = Annotation("<token>"), word: Annotation = Annotation("[export.word]"), annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
def annotate(token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             reference: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             out_sense: Output = Output("<token>:saldo.sense", cls="token:sense", description="SALDO identifier"),
             out_lemgram: Output = Output("<token>:saldo.lemgram", description="SALDO lemgram"),
             out_baseform: Output = Output("<token>:saldo.baseform", cls="token:baseform",
                                           description="Baseform from SALDO"),
             models: List[Model] = [Model("[saldo.model]")],
             msd: Optional[Annotation] = Annotation("<token:msd>"),
             delimiter: str = util.DELIM,
             affix: str = util.AFFIX,
             precision: str = Config("saldo.precision"),
             precision_filter: str = "max",
             min_precision: float = 0.66,
             skip_multiword: bool = False,
             allow_multiword_overlap: bool = False,
             word_separator: str = "",
             lexicons=None):
    """Use the Saldo lexicon model (and optionally other older lexicons) to annotate pos-tagged words.

    - token, word, msd, sentence, reference: existing annotations
    - out_baseform, out_lemgram, out_sense: resulting annotations to be written
    - models: a list of pickled lexica, typically the Saldo model (saldo.pickle)
      and optional lexicons for older Swedish.
    - delimiter: delimiter character to put between ambiguous results
    - affix: an optional character to put before and after results
    - precision: a format string for how to print the precision for each annotation, e.g. ":%.3f"
      (use empty string for no precision)
    - precision_filter: an optional filter, currently there are the following values:
        max: only use the annotations that are most probable
        first: only use the most probable annotation (or one of the most probable if more than one)
        none: use all annotations
    - min_precision: only use annotations with a probability score higher than this
    - skip_multiword: set to True to disable multi word annotations
    - allow_multiword_overlap: by default we do some cleanup among overlapping multi word annotations.
      By setting this to True, all overlaps will be allowed.
    - word_separator: an optional character used to split the values of "word" into several word variations
    - lexicons: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    # Allow use of multiple lexicons
    models_list = [(m.path.stem, m) for m in models]
    if not lexicons:
        lexicon_list = [(name, SaldoLexicon(lex.path)) for name, lex in models_list]
    # Use pre-loaded lexicons (from catapult)
    else:
        lexicon_list = []
        for name, _lex in models_list:
            assert lexicons.get(name, None) is not None, "Lexicon %s not found!" % name
            lexicon_list.append((name, lexicons[name]))

    # Maximum number of gaps in multi-word units.
    # TODO: Set to 0 for hist-mode? since many (most?) multi-word in the old lexicons are inseparable (half öre etc)
    max_gaps = 1

    # Combine annotation names in the SALDO lexicon with out annotations
    annotations = []
    if out_baseform:
        annotations.append((out_baseform, "gf"))
    if out_lemgram:
        annotations.append((out_lemgram, "lem"))
    if out_sense:
        annotations.append((out_sense, "saldo"))

    if skip_multiword:
        log.info("Skipping multi word annotations")

    min_precision = float(min_precision)

    # If min_precision is 0, skip almost all part-of-speech checking (verb multi-word expressions still won't be
    # allowed to span over other verbs)
    skip_pos_check = (min_precision == 0.0)

    word_annotation = list(word.read())
    ref_annotation = list(reference.read())
    if msd:
        msd_annotation = list(msd.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    out_annotation = word.create_empty_attribute()

    for sent in sentences:
        incomplete_multis = []  # [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}]
        complete_multis = []  # ([ref], annotation)
        sentence_tokens = {}

        for token_index in sent:
            theword = word_annotation[token_index]
            ref = ref_annotation[token_index]
            msdtag = msd_annotation[token_index] if msd else ""

            annotation_info = {}
            sentence_tokens[ref] = {"token_index": token_index, "annotations": annotation_info}

            # Support for multiple values of word
            if word_separator:
                thewords = [w for w in theword.split(word_separator) if w]
            else:
                thewords = [theword]

            # First use MSD tags to find the most probable single word annotations
            ann_tags_words = find_single_word(thewords, lexicon_list, msdtag, precision, min_precision,
                                              precision_filter, annotation_info)

            # Find multi-word expressions
            if not skip_multiword:
                find_multiword_expressions(incomplete_multis, complete_multis, thewords, ref, msdtag, max_gaps,
                                           ann_tags_words, msd_annotation, sent, skip_pos_check)

            # Loop to next token

        if not allow_multiword_overlap:
            # Check that we don't have any unwanted overlaps
            remove_unwanted_overlaps(complete_multis)

        # Then save the rest of the multi word expressions in sentence_tokens
        save_multiwords(complete_multis, sentence_tokens)

        for tok in list(sentence_tokens.values()):
            out_annotation[tok["token_index"]] = _join_annotation(tok["annotations"], delimiter, affix)

        # Loop to next sentence

    for out_annotation_obj, annotation_name in annotations:
        out_annotation_obj.write([v.get(annotation_name, delimiter) for v in out_annotation])
from sparv import Annotation, Config, Model, ModelOutput, Output, Text, annotator, modelbuilder
from sparv.modules.saldo.saldo_model import split_triple

try:
    from . import crf  # for CRF++ models
except ImportError:
    pass

log = logging.getLogger(__name__)


@annotator("Automatic tokenization", config=[
    Config("segment.token_segmenter", default="better_word", description="Token segmenter to use"),
    Config("segment.token_chunk", default="<sentence>",
           description="Text chunk (annotation) to use as input when tokenizing"),
    Config("segment.existing_tokens", description="Optional existing token annotation"),
    Config("segment.tokenizer_config", default="segment/bettertokenizer.sv", description="Path to tokenizer config"),
    Config("segment.token_list", default="segment/bettertokenizer.sv.saldo-tokens",
           description="Path to optional token list file")
])
def tokenize(
import logging

import sparv.util as util
from sparv import Annotation, Binary, Config, Model, ModelOutput, Output, annotator, modelbuilder

log = logging.getLogger(__name__)

SENT_SEP = "$SENT$"


@annotator("Word sense disambiguation", language=["swe"], config=[
    Config("wsd.sense_model", default="wsd/ALL_512_128_w10_A2_140403_ctx1.bin", description="Path to sense model"),
    Config("wsd.context_model", default="wsd/lem_cbow0_s512_w10_NEW2_ctx.bin", description="Path to context model"),
    Config("wsd.default_prob", -1.0, description="Default value for unanalyzed senses"),
    Config("wsd.jar", default="wsd/saldowsd.jar", description="Path name of the executable .jar file"),
    Config("wsd.prob_format", util.SCORESEP + "%.3f",
           description="Format string for how to print the sense probability")
])
"eng": "Penn", "fra": "TreeTagger", "spa": "TreeTagger", "ita": "TreeTagger", "rus": "TreeTagger", } @annotator("Part-of-speech tags and baseforms from TreeTagger", language=[ "bul", "est", "fin", "lat", "nld", "pol", "ron", "slk", "deu", "eng", "fra", "spa", "ita", "rus" ], config=[ Config("treetagger.binary", "tree-tagger", description="TreeTagger executable"), Config("treetagger.model", "treetagger/[metadata.language].par", description="Path to TreeTagger model") ]) def annotate( lang: Language = Language(), model: Model = Model("[treetagger.model]"), tt_binary: Binary = Binary("[treetagger.binary]"), out_upos: Output = Output("<token>:treetagger.upos", cls="token:upos", description="Part-of-speeches in UD"), out_pos: Output = Output( "<token>:treetagger.pos", cls="token:pos",
def annotate(wsdjar: Binary = Binary("[wsd.jar]"),
             sense_model: Model = Model("[wsd.sense_model]"),
             context_model: Model = Model("[wsd.context_model]"),
             out: Output = Output("<token>:wsd.sense", cls="token:sense",
                                  description="Sense disambiguated SALDO identifiers"),
             sentence: Annotation = Annotation("<sentence>"),
             word: Annotation = Annotation("<token:word>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
             saldo: Annotation = Annotation("<token>:saldo.sense"),
             pos: Annotation = Annotation("<token:pos>"),
             token: Annotation = Annotation("<token>"),
             prob_format: str = Config("wsd.prob_format"),
             default_prob: float = Config("wsd.default_prob"),
             encoding: str = util.UTF8):
    """Run the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation.

    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.

    - wsdjar is the name of the java program to be used for the wsd
    - sense_model and context_model are the models to be used with wsdjar
    - out is the resulting annotation file
    - sentence is an existing annotation for sentences and their children (words)
    - word is an existing annotation for wordforms
    - ref is an existing annotation for word references
    - lemgram and saldo are existing annotations for inflection tables and meanings
    - pos is an existing annotation for part-of-speech
    - prob_format is a format string for how to print the sense probability
    - default_prob is the default value for unanalyzed senses
    """
    word_annotation = list(word.read())
    ref_annotation = list(ref.read())
    lemgram_annotation = list(lemgram.read())
    saldo_annotation = list(saldo.read())
    pos_annotation = list(pos.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    # Start WSD process
    process = wsd_start(wsdjar, sense_model.path, context_model.path, encoding)

    # Construct input and send to WSD
    stdin = build_input(sentences, word_annotation, ref_annotation, lemgram_annotation, saldo_annotation,
                        pos_annotation)
    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve hack line below!
    # Problem is that regular messages "Reading sense vectors.." are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(word, out, stdout, sentences, saldo_annotation, prob_format, default_prob)

    # Kill running subprocess
    util.system.kill_process(process)
    return
import sparv.util as util
from sparv import Annotation, Config, Model, Output, annotator
from sparv.modules.saldo.saldo_model import SaldoLexicon

log = logging.getLogger(__name__)

# The minimum precision difference for two annotations to be considered equal
PRECISION_DIFF = 0.01


@annotator("SALDO annotations", language=["swe"], config=[
    Config("saldo.model", default="saldo/saldo.pickle", description="Path to SALDO model"),
    Config("saldo.precision", "", description="Format string for appending precision to each value")
])
def annotate(token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             reference: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             out_sense: Output = Output("<token>:saldo.sense", cls="token:sense", description="SALDO identifier"),
             out_lemgram: Output = Output("<token>:saldo.lemgram",