def override(chunk: Annotation, repl: Annotation, out: Output):
    """Replace values in 'chunk' with non-empty values from 'repl'."""
    def empty(val):
        if not val:
            return True
        return val == "|"

    repl = list(repl.read())
    out.write((repl[n] if not empty(repl[n]) else val for (n, val) in enumerate(chunk.read())))
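# Illustrative sketch (plain lists instead of Sparv annotations, hypothetical values):
# empty strings and bare "|" markers in `repl` keep the original value from `chunk`.
_chunk_vals = ["NN", "VB", "JJ"]
_repl_vals = ["", "AB", "|"]
assert [r if r and r != "|" else v for r, v in zip(_repl_vals, _chunk_vals)] == ["NN", "AB", "JJ"]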
def concat(out: Output, left: Annotation, right: Annotation, separator: str = "", merge_twins: bool = False):
    """Concatenate values from two annotations, with an optional separator.

    If merge_twins is set to True, no concatenation will be done on identical values.
    """
    b = list(right.read())
    out.write((f"{val_a}{separator}{b[n]}" if not (merge_twins and val_a == b[n]) else val_a
               for (n, val_a) in enumerate(left.read())))
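# Illustrative sketch (plain lists, hypothetical values): with merge_twins=True, identical
# value pairs are kept as a single value instead of being concatenated.
_left, _right, _sep, _merge_twins = ["a", "x"], ["b", "x"], "+", True
_merged = [va if _merge_twins and va == vb else f"{va}{_sep}{vb}" for va, vb in zip(_left, _right)]
assert _merged == ["a+b", "x"]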
def vrt_scrambled(doc: Document = Document(),
                  out: Export = Export("vrt_scrambled/{doc}.vrt"),
                  chunk: Annotation = Annotation("[cwb.scramble_on]"),
                  chunk_order: Annotation = Annotation("[cwb.scramble_on]:misc.number_random"),
                  token: Annotation = Annotation("<token>"),
                  word: Annotation = Annotation("[export.word]"),
                  annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
                  source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
                  remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                  sparv_namespace: str = Config("export.sparv_namespace"),
                  source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt in scrambled order."""
    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name,
        remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    if chunk not in annotation_list:
        raise util.SparvErrorMessage(
            "The annotation used for scrambling ({}) needs to be included in the output.".format(chunk))
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc,
                                                              split_overlaps=True)

    # Read words and scramble order
    word_annotation = list(word.read())
    chunk_order_data = list(chunk_order.read())

    # Reorder chunks and open/close tags in correct order
    new_span_positions = util.scramble_spans(span_positions, chunk.name, chunk_order_data)

    # Make vrt format
    vrt_data = create_vrt(new_span_positions, token.name, word_annotation, token_attributes, annotation_dict,
                          export_names)

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geographical places with coordinates"),
               chunk: Annotation = Annotation("{chunk}"),
               context: Annotation = Annotation("[geo.context_chunk]"),
               ne_type: Annotation = Annotation("swener.ne:swener.type"),
               ne_subtype: Annotation = Annotation("swener.ne:swener.subtype"),
               ne_name: Annotation = Annotation("swener.ne:swener.name"),
               model: Model = Model("[geo.model]"),
               method: str = "populous",
               language: list = []):
    """Annotate chunks with location data, based on locations contained within the text.

    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.
    """
    model = load_model(model, language=language)

    ne_type_annotation = list(ne_type.read())
    ne_subtype_annotation = list(ne_subtype.read())
    ne_name_annotation = list(ne_name.read())

    children_context_chunk, _orphans = context.get_children(chunk)
    children_chunk_ne, _orphans = chunk.get_children(ne_type)

    out_annotation = chunk.create_empty_attribute()

    for chunks in children_context_chunk:
        all_locations = []  # TODO: Maybe not needed for anything?
        context_locations = []
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]:
                    location_text = ne_name_annotation[n].replace("\n", " ").replace("  ", " ")
                    location_data = model.get(location_text.lower())
                    if location_data:
                        all_locations.append((location_text, list(location_data)))
                        context_locations.append((location_text, list(location_data)))
                        chunk_locations[ch].append((location_text, list(location_data)))
                    else:
                        pass
                        # log.info("No location found for %s" % ne_name_annotation[n].replace("%", "%%"))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
def ufeatstag(out: Output = Output("<token>:misc.ufeats", cls="token:ufeats",
                                   description="Universal morphological features"),
              pos: Annotation = Annotation("<token:pos>"),
              msd: Annotation = Annotation("<token:msd>")):
    """Convert SUC MSD tags to universal features."""
    pos_tags = pos.read()
    msd_tags = msd.read()
    out_annotation = []

    for pos_tag, msd_tag in zip(pos_tags, msd_tags):
        feats = util.tagsets.suc_to_feats(pos_tag, msd_tag)
        out_annotation.append(util.cwbset(feats))

    out.write(out_annotation)
def uppercase(word: Annotation = Annotation("<token:word>"),
              out: Output = Output("<token>:uppercase.upper"),
              # some_config_variable: str = Config("uppercase.some_setting")
              ):
    """Convert to uppercase."""
    out.write([val.upper() for val in word.read()])
def diapivot_annotate(out: Output = Output("<token>:hist.diapivot",
                                           description="SALDO IDs corresponding to lemgrams"),
                      lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
                      model: Model = Model("hist/diapivot.pickle")):
    """Annotate each lemgram with its corresponding saldo_id according to model.

    Args:
        out (str, optional): Resulting annotation file.
            Defaults to Output("<token>:hist.diapivot", description="SALDO IDs corresponding to lemgrams").
        lemgram (str, optional): Existing lemgram annotation. Defaults to Annotation("<token>:saldo.lemgram").
        model (str, optional): Crosslink model. Defaults to Model("hist/diapivot.pickle").
    """
    lexicon = PivotLexicon(model)
    lemgram_annotation = list(lemgram.read())

    out_annotation = []

    for lemgrams in lemgram_annotation:
        saldo_ids = []
        for lemgram in lemgrams.split(util.DELIM):
            s_i = lexicon.get_exactMatch(lemgram)
            if s_i:
                saldo_ids += [s_i]
        out_annotation.append(util.AFFIX + util.DELIM.join(set(saldo_ids)) + util.AFFIX if saldo_ids else util.AFFIX)

    out.write(out_annotation)
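# Illustrative sketch of the set format written above, assuming the conventional Sparv/CWB
# markers (util.DELIM == util.AFFIX == "|") and hypothetical SALDO IDs.
_delim = _affix = "|"
_saldo_ids = ["regn..1", "regna..1"]
assert (_affix + _delim.join(_saldo_ids) + _affix if _saldo_ids else _affix) == "|regn..1|regna..1|"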
def translate_tag(out: Output, tag: Annotation, mapping: dict = {}):
    """Convert part-of-speech tags, specified by the mapping.

    Example mappings: parole_to_suc, suc_to_simple, ...
    """
    if isinstance(mapping, str):
        mapping = util.tagsets.mappings[mapping]
    out.write((mapping.get(t, t) for t in tag.read()))
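# Illustrative sketch with a hypothetical mapping (not a real tagset): tags missing from
# the mapping fall through unchanged because of mapping.get(t, t).
_mapping = {"NN": "NOUN", "VB": "VERB"}
assert [_mapping.get(t, t) for t in ["NN", "VB", "XX"]] == ["NOUN", "VERB", "XX"]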
def find_replace_regex(chunk: Annotation, out: Output, find: str = "", sub: str = ""):
    """Do find and replace in values of annotation using a regular expression.

    N.B.: When writing regular expressions in YAML they should be enclosed in single quotes.
    """
    out.write((re.sub(find, sub, val) for val in chunk.read()))
def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
           source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
           header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
           remove_namespaces: bool = Config("export.remove_module_namespaces", False),
           sparv_namespace: str = Config("export.sparv_namespace"),
           source_namespace: str = Config("export.source_namespace"),
           include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to pretty XML in export_dir.

    Args:
        doc: Name of the original document.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        token: Annotation containing the token strings.
        word: Annotation containing the token strings.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document to be kept.
            If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include in the export.
            If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words and document ID
    word_annotation = list(word.read())
    docid_annotation = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 token_name=token_name,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, split_overlaps=True)
    xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name, word_annotation,
                                       docid_annotation, include_empty_attributes, sparv_namespace)

    # Write XML to file
    with open(out, mode="w") as outfile:
        outfile.write(xmlstr)
    log.info("Exported: %s", out)
def msdtag(out: Output = Output("<token>:hunpos.msd", cls="token:msd",
                                description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger."""
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.mappings[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []

    if patterns:
        with open(patterns.path, encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append((name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(replace_word(token_word[token_index]) for token_index in sent)
                          for sent in sentences)
    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])

    stdout, _ = util.system.call_binary(binary, args, stdin, encoding=encoding)

    out_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_index, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            out_annotation[token_index] = tag

    out.write(out_annotation)
def struct_to_token(attr: Annotation = Annotation("{struct}:{attr}"),
                    token: Annotation = Annotation("<token>"),
                    out: Output = Output("<token>:misc.from_struct_{struct}_{attr}")):
    """Convert an attribute on a structural annotation into a token attribute."""
    token_parents = token.get_parents(attr)
    attr_values = list(attr.read())
    out_values = [attr_values[p] if p is not None else "" for p in token_parents]
    out.write(out_values)
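# Illustrative sketch (plain lists, hypothetical values): tokens whose parent lookup is
# None (no covering structural span) get an empty string, others inherit the attribute.
_attr_values = ["sec1", "sec2"]
_token_parents = [0, 0, None, 1]
assert [_attr_values[p] if p is not None else "" for p in _token_parents] == ["sec1", "sec1", "", "sec2"]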
def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geographical places with coordinates"),
             chunk: Annotation = Annotation("{chunk}"),
             source: Annotation = Annotation("[geo.metadata_source]"),
             model: Model = Model("[geo.model]"),
             method: str = "populous",
             language: list = []):
    """Get location data based on metadata containing location names."""
    geomodel = load_model(model, language=language)

    same_target_source = chunk.split()[0] == source.split()[0]
    chunk_annotation = list(chunk.read())
    source_annotation = list(source.read())

    # If location source and target chunk are not the same, we need
    # to find the parent/child relations between them.
    if not same_target_source:
        target_source_parents = list(source.get_parents(chunk))

    chunk_locations = {}

    for i, _ in enumerate(chunk_annotation):
        if same_target_source:
            location_source = source_annotation[i]
        else:
            location_source = source_annotation[target_source_parents[i]] \
                if target_source_parents[i] is not None else None

        if location_source:
            location_data = geomodel.get(location_source.strip().lower())
            if location_data:
                chunk_locations[i] = [(location_source, list(location_data))]
        else:
            chunk_locations[i] = []

    chunk_locations = most_populous(chunk_locations)

    out_annotation = chunk.create_empty_attribute()
    for c in chunk_locations:
        out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
def upostag(out: Output = Output("<token>:misc.upos", cls="token:upos", description="Part-of-speeches in UD"),
            pos: Annotation = Annotation("<token:pos>")):
    """Convert SUC POS tags to UPOS."""
    pos_tags = pos.read()
    out_annotation = []

    for tag in pos_tags:
        out_annotation.append(util.tagsets.pos_to_upos(tag, "swe", "SUC"))

    out.write(out_annotation)
def select(out: Output, annotation: Annotation, index: Optional[int] = 0, separator: Optional[str] = " "):
    """Select a specific index from the values of an annotation.

    The given annotation values are separated by 'separator',
    by default whitespace, with at least index + 1 elements.
    """
    if isinstance(index, str):
        index = int(index)
    out.write(value.split(separator)[index] for value in annotation.read())
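# Illustrative sketch (plain lists, hypothetical values): what select() writes for each
# annotation value with index=1 and the default whitespace separator.
_values = ["alpha beta gamma", "one two three"]
assert [v.split(" ")[1] for v in _values] == ["beta", "two"]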
def annotate(lang: Language = Language(),
             model: Model = Model("[treetagger.model]"),
             tt_binary: Binary = Binary("[treetagger.binary]"),
             out_upos: Output = Output("<token>:treetagger.upos", cls="token:upos",
                                       description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:treetagger.pos", cls="token:pos",
                                      description="Part-of-speeches from TreeTagger"),
             out_baseform: Output = Output("<token>:treetagger.baseform", description="Baseforms from TreeTagger"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger."""
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(word_annotation[token_index] for token_index in sent)
                          for sent in sentences)
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary, args, stdin, encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_pos_annotation[token_id] = tag
            out_upos_annotation[token_id] = util.tagsets.pos_to_upos(tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            out_lemma_annotation[token_id] = lem
    out_baseform.write(out_lemma_annotation)
def annotate(sense: Annotation = Annotation("<token>:saldo.sense"),
             out_scores: Output = Output("<token>:sensaldo.sentiment_score", description="SenSALDO sentiment score"),
             out_labels: Output = Output("<token>:sensaldo.sentiment_label", description="SenSALDO sentiment label"),
             model: Model = Model("[sensaldo.model]"),
             lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.

    When more than one sense is possible, calculate a weighted mean.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation files.
    - model: pickled lexicon with saldoIDs as keys.
    - lexicon: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = sense.read()
    result_scores = []
    result_labels = []

    for token in sense:
        # Get set of senses for each token and sort them according to their probabilities
        token_senses = [tuple(s.rsplit(util.SCORESEP, 1)) if util.SCORESEP in s else (s, -1.0)
                        for s in token.split(util.DELIM) if s]
        token_senses.sort(key=lambda x: float(x[1]), reverse=True)

        # Lookup the sentiment score for the most probable sense and assign a sentiment label
        if token_senses:
            best_sense = token_senses[0][0]
            score = lexicon.lookup(best_sense, None)
        else:
            score = None

        if score:
            result_scores.append(score)
            result_labels.append(SENTIMENT_LABLES.get(int(score)))
        else:
            result_scores.append(None)
            result_labels.append(None)

    out_scores.write(result_scores)
    out_labels.write(result_labels)
def replace_list(chunk: Annotation, out: Output, find: str = "", sub: str = ""):
    """Find and replace annotations. Find string must match whole annotation.

    find and sub are whitespace separated lists of words to replace and their replacement.
    """
    find = find.split()
    sub = sub.split()
    if len(find) != len(sub):
        raise util.SparvErrorMessage("Find and sub must have the same number of words.")
    translate = dict((f, s) for (f, s) in zip(find, sub))
    out.write((translate.get(val, val) for val in chunk.read()))
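# Illustrative sketch with a hypothetical find/sub pair: the whole-value translation
# table built by replace_list(); values not listed in `find` pass through unchanged.
_find, _sub = "colour centre".split(), "color center".split()
_translate = dict(zip(_find, _sub))
assert [_translate.get(v, v) for v in ["colour", "centre", "color"]] == ["color", "center", "color"]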
def number_by_parent(out: Output = Output("{annotation}:misc.number_by_parent_{parent_annotation}__{parent_attribute}"),
                     chunk: Annotation = Annotation("{annotation}"),
                     parent_order: Annotation = Annotation("{parent_annotation}:{parent_attribute}"),
                     prefix: str = "",
                     zfill: bool = False,
                     start: int = START_DEFAULT):
    """Number chunks by (parent_order, chunk order)."""
    parent_children, _orphans = parent_order.get_children(chunk)

    child_order = {child_index: (parent_nr, child_index)
                   for parent_index, parent_nr in enumerate(parent_order.read())
                   for child_index in parent_children[parent_index]}

    def _order(index, _value):
        return child_order.get(index)

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)
def ids(doc: Document = Document(),
        annotation: Annotation = Annotation("{annotation}"),
        out: Output = Output("{annotation}:misc.id", description="Unique ID for {annotation}"),
        docid: AnnotationData = AnnotationData("<docid>"),
        prefix: str = ""):
    """Create unique IDs for every span of an existing annotation."""
    docid = docid.read()
    prefix = prefix + docid

    ann = list(annotation.read())
    out_annotation = []
    # Use doc name and annotation name as seed for the IDs
    _reset_id("{}/{}".format(doc, annotation), len(ann))
    for _ in ann:
        new_id = _make_id(prefix, out_annotation)
        out_annotation.append(new_id)
    out.write(out_annotation)
def nominal_ratio(text: Annotation = Annotation("<text>"),
                  pos: Annotation = Annotation("<token:pos>"),
                  out: Output = Output("<text>:readability.nk", description="Nominal ratios for text chunks"),
                  noun_pos: List[str] = ["NN", "PP", "PC"],
                  verb_pos: List[str] = ["PN", "AB", "VB"],
                  fmt: str = "%.2f"):
    """Create nominal ratio annotation for text."""
    text_children, _orphans = text.get_children(pos)
    pos_annotation = list(pos.read())

    # Calculate nominal ratio for every text element
    nk_annotation = []
    for text in text_children:
        in_pos = [pos_annotation[token_index] for token_index in text]
        nk_annotation.append(fmt % nominal_ratio_calc(in_pos, noun_pos, verb_pos))
    out.write(nk_annotation)
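# A minimal sketch (assumed, not taken from the source) of the quantity nominal_ratio_calc
# presumably computes with the defaults above: count(NN, PP, PC) / count(PN, AB, VB).
_pos_tags = ["NN", "PP", "VB", "PN", "NN"]
_nouns = sum(1 for t in _pos_tags if t in ["NN", "PP", "PC"])
_verbs = sum(1 for t in _pos_tags if t in ["PN", "AB", "VB"])
assert "%.2f" % (_nouns / _verbs) == "1.50"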
def vrt(doc: Document = Document(),
        out: Export = Export("vrt/{doc}.vrt"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt.

    - annotations: list of elements:attributes (annotations) to include.
    - source_annotations: list of elements:attributes from the original document
      to be kept. If not specified, everything will be kept.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name,
        remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)
    vrt_data = create_vrt(span_positions, token.name, word_annotation, token_attributes, annotation_dict,
                          export_names)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
def _read_chunks_and_write_new_ordering(out: Output, chunk: Annotation, order, prefix="", zfill=False,
                                        start=START_DEFAULT):
    """Common function called by other numbering functions."""
    new_order = defaultdict(list)

    in_annotation = list(chunk.read())

    for i, val in enumerate(in_annotation):
        val = order(i, val)
        new_order[val].append(i)

    out_annotation = chunk.create_empty_attribute()

    nr_digits = len(str(len(new_order) - 1 + start))
    for nr, key in enumerate(sorted(new_order), start):
        for index in new_order[key]:
            out_annotation[index] = "{prefix}{nr:0{length}d}".format(prefix=prefix,
                                                                     length=nr_digits if zfill else 0,
                                                                     nr=nr)

    out.write(out_annotation)
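# Illustrative sketch of the numbering format used above: with zfill=True the width is
# derived from the highest number that will be assigned (here 12 chunks, start=1).
_n_chunks, _start = 12, 1
_nr_digits = len(str(_n_chunks - 1 + _start))
assert "{prefix}{nr:0{length}d}".format(prefix="s", length=_nr_digits, nr=3) == "s03"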
def text_headtail(text: Text = Text(),
                  chunk: Annotation = Annotation("<token>"),
                  out_head: Output = Output("<token>:misc.head"),
                  out_tail: Output = Output("<token>:misc.tail")):
    """Extract "head" and "tail" whitespace characters for tokens."""
    def escape(t):
        """Escape whitespace characters."""
        return t.replace(" ", "\\s").replace("\n", "\\n").replace("\t", "\\t")

    out_head_annotation = chunk.create_empty_attribute()
    out_tail_annotation = chunk.create_empty_attribute()
    head_text = None

    corpus_text = text.read()
    chunk = list(chunk.read())

    for i, span in enumerate(chunk):
        if head_text:
            out_head_annotation[i] = escape(head_text)
            head_text = None

        if i < len(chunk) - 1:
            tail_start = span[1][0]
            tail_end = chunk[i + 1][0][0]
            tail_text = corpus_text[tail_start:tail_end]

            try:
                n_pos = tail_text.rindex("\n")
            except ValueError:
                n_pos = None
            if n_pos is not None and n_pos + 1 < len(tail_text):
                head_text = tail_text[n_pos + 1:]
                tail_text = tail_text[:n_pos + 1]

            if tail_text:
                out_tail_annotation[i] = escape(tail_text)

    out_head.write(out_head_annotation)
    out_tail.write(out_tail_annotation)
def annotate(maltjar: Binary = Binary("[malt.jar]"),
             model: Model = Model("[malt.model]"),
             out_dephead: Output = Output("<token>:malt.dephead", cls="token:dephead",
                                          description="Positions of the dependency heads"),
             out_dephead_ref: Output = Output("<token>:malt.dephead_ref", cls="token:dephead_ref",
                                              description="Sentence-relative positions of the dependency heads"),
             out_deprel: Output = Output("<token>:malt.deprel", cls="token:deprel",
                                         description="Dependency relations to the head"),
             word: Annotation = Annotation("<token:word>"),
             pos: Annotation = Annotation("<token:pos>"),
             msd: Annotation = Annotation("<token:msd>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             sentence: Annotation = Annotation("<sentence>"),
             token: Annotation = Annotation("<token>"),
             encoding: str = util.UTF8,
             process_dict=None):
    """Run the malt parser, in an already started process defined in process_dict,
    or start a new process (default).

    The process_dict argument should never be set from the command line.
    """
    if process_dict is None:
        process = maltstart(maltjar, model, encoding)
    else:
        process = process_dict["process"]
        # If the process seems dead, spawn a new one
        if process.stdin.closed or process.stdout.closed or process.poll():
            util.system.kill_process(process)
            process = maltstart(maltjar, model, encoding, send_empty_sentence=True)
            process_dict["process"] = process

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    word_annotation = list(word.read())
    pos_annotation = list(pos.read())
    msd_annotation = list(msd.read())
    ref_annotation = list(ref.read())

    def conll_token(nr, token_index):
        form = word_annotation[token_index]
        lemma = UNDEF
        pos = cpos = pos_annotation[token_index]
        feats = re.sub(r"[ ,.]", "|", msd_annotation[token_index]).replace("+", "/")
        return TAG_SEP.join((str(nr), form, lemma, cpos, pos, feats))

    stdin = SENT_SEP.join(TOK_SEP.join(conll_token(n + 1, token_index) for n, token_index in enumerate(sent))
                          for sent in sentences)

    if encoding:
        stdin = stdin.encode(encoding)

    keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    if process_dict is not None:
        process_dict["restart"] = not keep_process

    if keep_process:
        # Chatting with malt: send a SENT_SEP and read correct number of lines
        stdin_fd, stdout_fd = process.stdin, process.stdout
        stdin_fd.write(stdin + SENT_SEP.encode(util.UTF8))
        stdin_fd.flush()

        malt_sentences = []
        for sent in sentences:
            malt_sent = []
            for _ in sent:
                line = stdout_fd.readline()
                if encoding:
                    line = line.decode(encoding)
                malt_sent.append(line)
            line = stdout_fd.readline()
            assert line == b"\n"
            malt_sentences.append(malt_sent)
    else:
        # Otherwise use communicate which buffers properly
        stdout, _ = process.communicate(stdin)
        if encoding:
            stdout = stdout.decode(encoding)
        malt_sentences = (malt_sent.split(TOK_SEP) for malt_sent in stdout.split(SENT_SEP))

    out_dephead_annotation = word.create_empty_attribute()
    out_dephead_ref_annotation = out_dephead_annotation.copy()
    out_deprel_annotation = out_dephead_annotation.copy()
    for (sent, malt_sent) in zip(sentences, malt_sentences):
        for (token_index, malt_tok) in zip(sent, malt_sent):
            cols = [(None if col == UNDEF else col) for col in malt_tok.split(TAG_SEP)]
            out_deprel_annotation[token_index] = cols[DEPREL_COLUMN]
            head = int(cols[HEAD_COLUMN])
            out_dephead_annotation[token_index] = str(sent[head - 1]) if head else "-"
            out_dephead_ref_annotation[token_index] = str(ref_annotation[sent[head - 1]]) if head else ""

    out_dephead.write(out_dephead_annotation)
    out_dephead_ref.write(out_dephead_ref_annotation)
    out_deprel.write(out_deprel_annotation)
def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotation, annotate, pos_limit: List[str],
                   class_set=None, disambiguate=True, connect_ids=False, delimiter=util.DELIM, affix=util.AFFIX,
                   scoresep=util.SCORESEP, lexicon=None):
    """Annotate words with blingbring classes (rogetID).

    - out: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - saldoids, pos: existing annotation with saldoIDs/parts of speech.
    - annotate: annotation function, returns an iterable containing annotations for one token ID.
      (annotate_bring() or annotate_swefn())
    - pos_limit: parts of speech that will be annotated. Set to None to annotate all pos.
    - class_set: output Bring classes or Roget IDs ("bring", "roget_head", "roget_subsection",
      "roget_section" or "roget_class"). Set to None when not annotating blingbring.
    - disambiguate: use WSD and use only the most likely saldo ID.
    - connect_ids: for sweFN: paste saldo ID after each sweFN ID.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - lexicon: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = saldoids.read()
    token_pos = list(pos.read())
    out_annotation = pos.create_empty_attribute()

    # Check if the saldo IDs are ranked (= word senses have been disambiguated)
    wsd = saldoids.split()[1].split(".")[0] == "wsd"

    for token_index, token_sense in enumerate(sense):

        # Check if part of speech of this token is allowed
        if not pos_ok(token_pos, token_index, pos_limit):
            saldo_ids = None
            out_annotation[token_index] = affix
            continue

        if wsd and util.SCORESEP in token_sense:
            ranked_saldo = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None
            saldo_tuples = [(i.split(util.SCORESEP)[0], i.split(util.SCORESEP)[1]) for i in ranked_saldo]

            if not disambiguate:
                saldo_ids = [i[0] for i in saldo_tuples]

            # Only take the most likely analysis into account.
            # Handle wsd with equal probability for several words
            else:
                saldo_ids = [saldo_tuples[0]]
                del saldo_tuples[0]
                while saldo_tuples and (saldo_tuples[0][1] == saldo_ids[0][1]):
                    saldo_ids = [saldo_tuples[0]]
                    del saldo_tuples[0]
                saldo_ids = [i[0] for i in saldo_ids]

        else:  # No WSD
            saldo_ids = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None

        result = annotate(saldo_ids, lexicon, connect_ids, scoresep)
        out_annotation[token_index] = util.cwbset(result, delimiter, affix) if result else affix
    out.write(out_annotation)
def annotate(token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             reference: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             out_sense: Output = Output("<token>:saldo.sense", cls="token:sense", description="SALDO identifier"),
             out_lemgram: Output = Output("<token>:saldo.lemgram", description="SALDO lemgram"),
             out_baseform: Output = Output("<token>:saldo.baseform", cls="token:baseform",
                                           description="Baseform from SALDO"),
             models: List[Model] = [Model("[saldo.model]")],
             msd: Optional[Annotation] = Annotation("<token:msd>"),
             delimiter: str = util.DELIM,
             affix: str = util.AFFIX,
             precision: str = Config("saldo.precision"),
             precision_filter: str = "max",
             min_precision: float = 0.66,
             skip_multiword: bool = False,
             allow_multiword_overlap: bool = False,
             word_separator: str = "",
             lexicons=None):
    """Use the Saldo lexicon model (and optionally other older lexicons) to annotate pos-tagged words.

    - token, word, msd, sentence, reference: existing annotations
    - out_baseform, out_lemgram, out_sense: resulting annotations to be written
    - models: a list of pickled lexica, typically the Saldo model (saldo.pickle)
      and optional lexicons for older Swedish.
    - delimiter: delimiter character to put between ambiguous results
    - affix: an optional character to put before and after results
    - precision: a format string for how to print the precision for each annotation, e.g. ":%.3f"
      (use empty string for no precision)
    - precision_filter: an optional filter, currently there are the following values:
        max: only use the annotations that are most probable
        first: only use the most probable annotation (or one of the most probable if more than one)
        none: use all annotations
    - min_precision: only use annotations with a probability score higher than this
    - skip_multiword: set to True to disable multi word annotations
    - allow_multiword_overlap: by default we do some cleanup among overlapping multi word annotations.
      By setting this to True, all overlaps will be allowed.
    - word_separator: an optional character used to split the values of "word" into several word variations
    - lexicons: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    # Allow use of multiple lexicons
    models_list = [(m.path.stem, m) for m in models]
    if not lexicons:
        lexicon_list = [(name, SaldoLexicon(lex.path)) for name, lex in models_list]
    # Use pre-loaded lexicons (from catapult)
    else:
        lexicon_list = []
        for name, _lex in models_list:
            assert lexicons.get(name, None) is not None, "Lexicon %s not found!" % name
            lexicon_list.append((name, lexicons[name]))

    # Maximum number of gaps in multi-word units.
    # TODO: Set to 0 for hist-mode? since many (most?) multi-word units in the old lexicons
    # are inseparable (half öre etc)
    max_gaps = 1

    # Combine annotation names in the SALDO lexicon with the output annotations
    annotations = []
    if out_baseform:
        annotations.append((out_baseform, "gf"))
    if out_lemgram:
        annotations.append((out_lemgram, "lem"))
    if out_sense:
        annotations.append((out_sense, "saldo"))

    if skip_multiword:
        log.info("Skipping multi word annotations")

    min_precision = float(min_precision)

    # If min_precision is 0, skip almost all part-of-speech checking (verb multi-word expressions still won't be
    # allowed to span over other verbs)
    skip_pos_check = (min_precision == 0.0)

    word_annotation = list(word.read())
    ref_annotation = list(reference.read())
    if msd:
        msd_annotation = list(msd.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    out_annotation = word.create_empty_attribute()

    for sent in sentences:
        incomplete_multis = []  # [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}]
        complete_multis = []    # ([ref], annotation)
        sentence_tokens = {}

        for token_index in sent:
            theword = word_annotation[token_index]
            ref = ref_annotation[token_index]
            msdtag = msd_annotation[token_index] if msd else ""

            annotation_info = {}
            sentence_tokens[ref] = {"token_index": token_index, "annotations": annotation_info}

            # Support for multiple values of word
            if word_separator:
                thewords = [w for w in theword.split(word_separator) if w]
            else:
                thewords = [theword]

            # First use MSD tags to find the most probable single word annotations
            ann_tags_words = find_single_word(thewords, lexicon_list, msdtag, precision, min_precision,
                                              precision_filter, annotation_info)

            # Find multi-word expressions
            if not skip_multiword:
                find_multiword_expressions(incomplete_multis, complete_multis, thewords, ref, msdtag, max_gaps,
                                           ann_tags_words, msd_annotation, sent, skip_pos_check)

            # Loop to next token

        if not allow_multiword_overlap:
            # Check that we don't have any unwanted overlaps
            remove_unwanted_overlaps(complete_multis)

        # Then save the rest of the multi word expressions in sentence_tokens
        save_multiwords(complete_multis, sentence_tokens)

        for tok in list(sentence_tokens.values()):
            out_annotation[tok["token_index"]] = _join_annotation(tok["annotations"], delimiter, affix)

        # Loop to next sentence

    for out_annotation_obj, annotation_name in annotations:
        out_annotation_obj.write([v.get(annotation_name, delimiter) for v in out_annotation])
def annotate(wsdjar: Binary = Binary("[wsd.jar]"),
             sense_model: Model = Model("[wsd.sense_model]"),
             context_model: Model = Model("[wsd.context_model]"),
             out: Output = Output("<token>:wsd.sense", cls="token:sense",
                                  description="Sense disambiguated SALDO identifiers"),
             sentence: Annotation = Annotation("<sentence>"),
             word: Annotation = Annotation("<token:word>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
             saldo: Annotation = Annotation("<token>:saldo.sense"),
             pos: Annotation = Annotation("<token:pos>"),
             token: Annotation = Annotation("<token>"),
             prob_format: str = Config("wsd.prob_format"),
             default_prob: float = Config("wsd.default_prob"),
             encoding: str = util.UTF8):
    """Run the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation.

    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.
    - wsdjar is the name of the Java program to be used for the wsd
    - sense_model and context_model are the models to be used with wsdjar
    - out is the resulting annotation file
    - sentence is an existing annotation for sentences and their children (words)
    - word is an existing annotation for wordforms
    - ref is an existing annotation for word references
    - lemgram and saldo are existing annotations for inflection tables and meanings
    - pos is an existing annotation for part-of-speech
    - prob_format is a format string for how to print the sense probability
    - default_prob is the default value for unanalyzed senses
    """
    word_annotation = list(word.read())
    ref_annotation = list(ref.read())
    lemgram_annotation = list(lemgram.read())
    saldo_annotation = list(saldo.read())
    pos_annotation = list(pos.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    # Start WSD process
    process = wsd_start(wsdjar, sense_model.path, context_model.path, encoding)

    # Construct input and send to WSD
    stdin = build_input(sentences, word_annotation, ref_annotation, lemgram_annotation, saldo_annotation,
                        pos_annotation)
    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve hack line below!
    # Problem is that regular messages "Reading sense vectors.." are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(word, out, stdout, sentences, saldo_annotation, prob_format, default_prob)

    # Kill running subprocess
    util.system.kill_process(process)
    return
def _formatter(in_from: Annotation, in_to: Optional[Annotation], out_from: Output, out_to: Output,
               informat: str, outformat: str, splitter: str, regex: str):
    """Take existing dates/times and input formats and convert to specified output format."""
    def get_smallest_unit(informat):
        smallest_unit = 0  # No date

        if "%y" not in informat and "%Y" not in informat:
            pass
        elif "%b" not in informat and "%B" not in informat and "%m" not in informat:
            smallest_unit = 1  # year
        elif "%d" not in informat:
            smallest_unit = 2  # month
        elif "%H" not in informat and "%I" not in informat:
            smallest_unit = 3  # day
        elif "%M" not in informat:
            smallest_unit = 4  # hour
        elif "%S" not in informat:
            smallest_unit = 5  # minute
        else:
            smallest_unit = 6  # second

        return smallest_unit

    def get_date_length(informat):
        parts = informat.split("%")
        length = len(parts[0])  # First value is either blank or not part of date

        lengths = {"Y": 4, "3Y": 3, "y": 2, "m": 2, "b": None, "B": None, "d": 2, "H": None, "I": None,
                   "M": 2, "S": 2}

        for part in parts[1:]:
            add = lengths.get(part[0], None)
            if add:
                length += add + len(part[1:])
            else:
                return None

        return length

    if not in_to:
        in_to = in_from

    informat = informat.split("|")
    outformat = outformat.split("|")
    if splitter:
        splitter = splitter

    assert len(outformat) == 1 or (len(outformat) == len(informat)), "The number of out-formats must be equal to " \
                                                                     "one or the number of in-formats."

    ifrom = list(in_from.read())
    ofrom = in_from.create_empty_attribute()

    for index, val in enumerate(ifrom):
        val = val.strip()
        if not val:
            ofrom[index] = None
            continue

        tries = 0
        for inf in informat:
            if splitter and splitter in inf:
                values = re.findall("%[YybBmdHMS]", inf)
                if len(set(values)) < len(values):
                    vals = val.split(splitter)
                    inf = inf.split(splitter)
            else:
                vals = [val]
                inf = [inf]

            if regex:
                temp = []
                for v in vals:
                    matches = re.search(regex, v)
                    if matches:
                        temp.append([x for x in matches.groups() if x][0])
                if not temp:
                    # If the regex doesn't match, treat as no date
                    ofrom[index] = None
                    continue
                vals = temp

            tries += 1
            try:
                fromdates = []
                for i, v in enumerate(vals):
                    if "%3Y" in inf[i]:
                        datelen = get_date_length(inf[i])
                        if datelen and not datelen == len(v):
                            raise ValueError
                        inf[i] = inf[i].replace("%3Y", "%Y")
                        v = "0" + v
                    if "%0m" in inf[i] or "%0d" in inf[i]:
                        inf[i] = inf[i].replace("%0m", "%m").replace("%0d", "%d")
                        datelen = get_date_length(inf[i])
                        if datelen and not datelen == len(v):
                            raise ValueError
                    fromdates.append(datetime.datetime.strptime(v, inf[i]))
                if len(fromdates) == 1 or out_to:
                    ofrom[index] = fromdates[0].strftime(outformat[0] if len(outformat) == 1
                                                         else outformat[tries - 1])
                else:
                    outstrings = [fromdate.strftime(outformat[0] if len(outformat) == 1 else outformat[tries - 1])
                                  for fromdate in fromdates]
                    ofrom[index] = outstrings[0] + splitter + outstrings[1]
                break
            except ValueError:
                if tries == len(informat):
                    log.error("Could not parse: %s", str(vals))
                    raise
                continue

    out_from.write(ofrom)
    del ofrom

    if out_to:
        ito = list(in_to.read())
        oto = in_to.create_empty_attribute()

        for index, val in enumerate(ito):
            if not val:
                oto[index] = None
                continue

            tries = 0
            for inf in informat:
                if splitter and splitter in inf:
                    values = re.findall("%[YybBmdHMS]", inf)
                    if len(set(values)) < len(values):
                        vals = val.split(splitter)
                        inf = inf.split(splitter)
                else:
                    vals = [val]
                    inf = [inf]

                if regex:
                    temp = []
                    for v in vals:
                        matches = re.search(regex, v)
                        if matches:
                            temp.append([x for x in matches.groups() if x][0])
                    if not temp:
                        # If the regex doesn't match, treat as no date
                        oto[index] = None
                        continue
                    vals = temp

                tries += 1
                try:
                    todates = []
                    for i, v in enumerate(vals):
                        if "%3Y" in inf[i]:
                            datelen = get_date_length(inf[i])
                            if datelen and not datelen == len(v):
                                raise ValueError
                            inf[i] = inf[i].replace("%3Y", "%Y")
                            v = "0" + v
                        if "%0m" in inf[i] or "%0d" in inf[i]:
                            inf[i] = inf[i].replace("%0m", "%m").replace("%0d", "%d")
                            datelen = get_date_length(inf[i])
                            if datelen and not datelen == len(v):
                                raise ValueError
                        todates.append(datetime.datetime.strptime(v, inf[i]))
                    smallest_unit = get_smallest_unit(inf[0])
                    if smallest_unit == 1:
                        add = relativedelta(years=1)
                    elif smallest_unit == 2:
                        add = relativedelta(months=1)
                    elif smallest_unit == 3:
                        add = relativedelta(days=1)
                    elif smallest_unit == 4:
                        add = relativedelta(hours=1)
                    elif smallest_unit == 5:
                        add = relativedelta(minutes=1)
                    elif smallest_unit == 6:
                        add = relativedelta(seconds=1)

                    todates = [todate + add - relativedelta(seconds=1) for todate in todates]
                    oto[index] = todates[-1].strftime(outformat[0] if len(outformat) == 1 else outformat[tries - 1])
                    break
                except ValueError:
                    if tries == len(informat):
                        log.error("Could not parse: %s", str(vals))
                        raise
                    continue

        out_to.write(oto)
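# Standalone sketch (relying only on datetime and dateutil, which the function above already
# uses) of how one in-format value is parsed and re-serialized, and how a "to" date is padded
# to the end of its smallest given unit as in the second loop. Example date and formats are
# hypothetical.
_parsed = datetime.datetime.strptime("1789-07", "%Y-%m")
assert _parsed.strftime("%Y%m%d") == "17890701"
_padded_to = _parsed + relativedelta(months=1) - relativedelta(seconds=1)
assert _padded_to.strftime("%Y%m%d%H%M%S") == "17890731235959"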
def csv(doc: Document = Document(),
        out: Export = Export("csv/{doc}.csv"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        sentence: Annotation = Annotation("<sentence>"),
        annotations: ExportAnnotations = ExportAnnotations("csv_export.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations("csv_export.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        delimiter: str = Config("csv_export.delimiter")):
    """Export annotations to CSV format."""
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token_name,
        remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)

    # Make csv header
    csv_data = [_make_header(token_name, token_attributes, export_names, delimiter)]

    # Go through spans_dict and add to csv, line by line
    for _pos, instruction, span in span_positions:
        if instruction == "open":
            # Create token line
            if span.name == token_name:
                csv_data.append(_make_token_line(word_annotation[span.index], token_name, token_attributes,
                                                 annotation_dict, span.index, delimiter))
            # Create line with structural annotation
            else:
                attrs = _make_attrs(span.name, annotation_dict, export_names, span.index)
                for attr in attrs:
                    csv_data.append(f"# {attr}")
                if not attrs:
                    csv_data.append(f"# {span.export}")

        # Insert blank line after each closing sentence
        elif span.name == sentence.name and instruction == "close":
            csv_data.append("")

    # Write result to file
    with open(out, "w") as f:
        f.write("\n".join(csv_data))
    logger.info("Exported: %s", out)