def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geographical places with coordinates"),
             chunk: Annotation = Annotation("{chunk}"),
             source: Annotation = Annotation("[geo.metadata_source]"),
             model: Model = Model("[geo.model]"),
             method: str = "populous",
             language: list = []):
    """Annotate each chunk with location data found via a metadata annotation.

    For every item in ``chunk`` the corresponding value from ``source`` is
    normalised (stripped and lower-cased) and looked up in the geo model.
    The hits are passed through ``most_populous`` and the result is formatted
    with ``_format_location`` before being written to ``out``.

    - out: annotation that receives the formatted locations.
    - chunk: annotation whose items get a location value.
    - source: annotation holding the location names.
    - model: geo model, loaded with ``load_model``.
    - method: not read by this function's body.
    - language: forwarded unchanged to ``load_model``.
    """
    geomodel = load_model(model, language=language)

    shares_span = chunk.split()[0] == source.split()[0]
    chunk_values = list(chunk.read())
    source_values = list(source.read())

    # When the location names live on a different annotation than the target
    # chunk, each chunk item is mapped to its parent in the source annotation.
    parent_of = None if shares_span else list(source.get_parents(chunk))

    locations = {}
    for index in range(len(chunk_values)):
        if shares_span:
            place_name = source_values[index]
        else:
            parent = parent_of[index]
            place_name = None if parent is None else source_values[parent]

        if not place_name:
            # No usable location name for this chunk item
            locations[index] = []
            continue

        # Names without a hit in the model get no entry at all
        hits = geomodel.get(place_name.strip().lower())
        if hits:
            locations[index] = [(place_name, list(hits))]

    locations = most_populous(locations)

    result = chunk.create_empty_attribute()
    for index in locations:
        result[index] = _format_location(locations.get(index, ()))

    out.write(result)
def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotation, annotate, pos_limit: List[str],
                   class_set=None, disambiguate=True, connect_ids=False, delimiter=util.DELIM, affix=util.AFFIX,
                   scoresep=util.SCORESEP, lexicon=None):
    """
    Annotate words with lexical classes (e.g. blingbring/rogetID or sweFN).

    - out: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - saldoids, pos: existing annotation with saldoIDs/parts of speech.
    - annotate: annotation function, returns an iterable containing annotations
        for one token ID. (annotate_bring() or annotate_swefn())
        Called as annotate(saldo_ids, lexicon, connect_ids, scoresep).
    - pos_limit: parts of speech that will be annotated.
        Set to None to annotate all pos.
    - class_set: output Bring classes or Roget IDs ("bring", "roget_head",
        "roget_subsection", "roget_section" or "roget_class").
        Set to None when not annotating blingbring.
        Not read by this function's body.
    - disambiguate: use WSD and use only the most likely saldo ID. If several
        saldo IDs share the highest score, all of them are used.
    - connect_ids: for sweFN: paste saldo ID after each sweFN ID.
    - delimiter: delimiter character to put between ambiguous results
    - affix: optional character to put before and after results to mark a set.
    - scoresep: passed on to the annotation function.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = saldoids.read()
    token_pos = list(pos.read())
    out_annotation = pos.create_empty_attribute()

    # Check if the saldo IDs are ranked (= word senses have been disambiguated)
    wsd = saldoids.split()[1].split(".")[0] == "wsd"

    for token_index, token_sense in enumerate(sense):

        # Check if part of speech of this token is allowed
        if not pos_ok(token_pos, token_index, pos_limit):
            out_annotation[token_index] = affix
            continue

        if token_sense == util.AFFIX:
            # Empty set: there are no saldo IDs to look up
            saldo_ids = None
        else:
            candidates = token_sense.strip(util.AFFIX).split(util.DELIM)

            if wsd and util.SCORESEP in token_sense:
                # Each candidate has the form <saldo ID><SCORESEP><score>
                saldo_tuples = []
                for candidate in candidates:
                    parts = candidate.split(util.SCORESEP)
                    saldo_tuples.append((parts[0], parts[1]))

                if not disambiguate:
                    saldo_ids = [saldo_id for saldo_id, _ in saldo_tuples]
                else:
                    # Only take the most likely analysis into account.
                    # The candidates are ranked, so every sense in the leading
                    # run that shares the top score is equally likely: keep
                    # them all instead of only one of them.
                    best_score = saldo_tuples[0][1]
                    saldo_ids = []
                    for saldo_id, score in saldo_tuples:
                        if score != best_score:
                            break
                        saldo_ids.append(saldo_id)

            else:  # No WSD
                saldo_ids = candidates

        result = annotate(saldo_ids, lexicon, connect_ids, scoresep)
        out_annotation[token_index] = util.cwbset(result, delimiter, affix) if result else affix
    out.write(out_annotation)