def main():
    parser = argparse.ArgumentParser(
        description="Calculate the number of entities that have easy starts.")
    parser.add_argument("dataset")
    parser.add_argument("--datasets-index", "--datasets_index", default="configs/datasets.json")
    parser.add_argument("--cache", default="data")
    parser.add_argument("--surface-index", "--surface_index", default=0, type=int)
    parser.add_argument("--entity-index", "--entity_index", default=-1, type=int)
    parser.add_argument(
        "--span-type",
        "--span_type",
        default=SpanEncoding.IOBES,
        type=SpanEncoding.from_string,
    )
    parser.add_argument("--types", action="store_true")
    parser.add_argument("--delim")
    args = parser.parse_args()

    dataset = download_dataset(args.dataset, args.datasets_index, args.cache)
    emissions, _ = estimate_counts(
        list(read_conll(dataset["train_file"], args.delim)),
        args.surface_index,
        args.entity_index)
    transition_mask = make_transition_mask(dataset, args.span_type, args.entity_index, args.delim)
    unamb, domd, total = easy_start(
        dataset["valid_file"],
        emissions,
        transition_mask,
        args.surface_index,
        args.entity_index,
        args.span_type,
        args.delim,
        args.types,
    )
    print(f"There are {len(unamb)} entities that start with an unambiguous token.")
    print(f"There are {len(domd)} entities that have ambiguous starts but transitions dominate them.")
    print(f"There are {len(total)} entities in the whole dataset.")
    print(f"{len(unamb) / len(total) * 100}% of entities have unambiguous starts")
    print(f"{len(domd) / len(total) * 100}% of entities have dominated starts")
    print(f"{(len(unamb) + len(domd)) / len(total) * 100}% of entities have easy starts")
def read_all_tags(dataset, entity_idx, delim):
    tags = set()
    for ds in (dataset["train_file"], dataset["valid_file"], dataset["test_file"]):
        for sent in read_conll(ds, delim):
            tags.update(list(zip(*sent))[entity_idx])
    return tags
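# make_transition_mask is called by the driver scripts in this excerpt but is not shown
# here. Below is a minimal sketch of what it might look like, assuming read_all_tags
# (above), the TokenFunction GO/EOS sentinels, and standard IOBES legality rules; it
# ignores the span_type argument and only covers the IOBES case, so the real
# implementation likely differs.
def _iobes_prefix(tag):
    return tag.split("-", 1)[0] if "-" in tag else tag


def _iobes_type(tag):
    return tag.split("-", 1)[1] if "-" in tag else None


def _legal_iobes(prev, curr):
    # I- and E- tags can only continue an open entity of the same type.
    if _iobes_prefix(curr) in ("I", "E"):
        return _iobes_prefix(prev) in ("B", "I") and _iobes_type(prev) == _iobes_type(curr)
    # O, B-, S-, and EOS can only follow O, a closed entity, or the start of the sentence.
    return _iobes_prefix(prev) in ("O", "E", "S") or prev == TokenFunction.GO


def make_transition_mask(dataset, span_type, entity_idx, delim):
    # Build a {(prev_tag, tag): bool} lookup over every tag seen in the dataset.
    tags = set(read_all_tags(dataset, entity_idx, delim))
    tags.update((TokenFunction.GO, TokenFunction.EOS))
    return {(prev, curr): _legal_iobes(prev, curr) for prev in tags for curr in tags}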
def main():
    parser = argparse.ArgumentParser(
        description="Calculate the percentage of surface types that have ambiguous tags.")
    parser.add_argument("dataset")
    parser.add_argument("--datasets-index", "--datasets_index", default="configs/datasets.json")
    parser.add_argument("--cache", default="data")
    parser.add_argument("--surface-index", "--surface_index", default=0, type=int)
    parser.add_argument("--entity-index", "--entity_index", default=-1, type=int)
    parser.add_argument("--types", action="store_true")
    parser.add_argument("--delim")
    args = parser.parse_args()

    dataset = download_dataset(args.dataset, args.datasets_index, args.cache)
    emissions, _ = estimate_counts(
        list(read_conll(dataset["train_file"], args.delim)),
        args.surface_index,
        args.entity_index
    )
    ambiguous = 0
    for token, emission in emissions.items():
        if args.types:
            # Only count a token as ambiguous if it appears with more than one entity type.
            if len(set(extract_type(t) for t in emission)) > 1:
                ambiguous += 1
        else:
            if len(emission) > 1:
                ambiguous += 1
    print(f"{ambiguous / len(emissions) * 100}% of types are ambiguous.")
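# extract_type is referenced above but not defined in this excerpt; a plausible
# one-liner, assuming tags look like "B-PER"/"I-LOC" and that untyped tags such as
# "O" are returned unchanged:
def extract_type(tag):
    # "B-PER" -> "PER"; "O" -> "O"
    return tag.split("-", 1)[1] if "-" in tag else tag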
def extract_tags(file_name: str, index: int, delim: Optional[str] = None) -> List[List[str]]:
    tags = []
    for sentence in read_conll(file_name, delim):
        tags.append(
            list(chain([TokenFunction.GO], list(zip(*sentence))[index], [TokenFunction.EOS])))
    return tags
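# A small illustration of extract_tags: for a sentence whose tag column is
# ("B-PER", "I-PER", "O"), the padded sequence it collects is
# [TokenFunction.GO, "B-PER", "I-PER", "O", TokenFunction.EOS]
# (the concrete GO/EOS sentinel strings depend on TokenFunction and are not shown here).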
def read_entities(
    file_name: str,
    entity_index: int = -1,
    span_type: SpanEncoding = SpanEncoding.IOBES,
    delim: Optional[str] = None,
) -> List[Span]:
    entities = []
    for sentence in read_conll(file_name, delim):
        tags = list(zip(*sentence))[entity_index]
        for entity in parse_spans(tags, span_type):
            entities.append(entity)
    return entities
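# A possible usage sketch (assuming Span exposes at least .type, .start, and .end,
# which the functions below rely on): tally entity types in the validation file, where
# `dataset` is the dict returned by download_dataset in the main() drivers.
from collections import Counter


def count_entity_types(dataset, delim=None):
    entities = read_entities(dataset["valid_file"], entity_index=-1, delim=delim)
    return Counter(entity.type for entity in entities)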
def main():
    parser = argparse.ArgumentParser(
        description="Calculate the number of tokens that are strictly dominated.")
    parser.add_argument("dataset")
    parser.add_argument("--datasets-index", "--datasets_index", default="configs/datasets.json")
    parser.add_argument("--cache", default="data")
    parser.add_argument("--surface-index", "--surface_index", default=0, type=int)
    parser.add_argument("--entity-index", "--entity_index", default=-1, type=int)
    parser.add_argument(
        "--span-type",
        "--span_type",
        default=SpanEncoding.IOBES,
        type=SpanEncoding.from_string,
    )
    parser.add_argument("--types", action="store_true")
    parser.add_argument("--delim")
    args = parser.parse_args()

    dataset = download_dataset(args.dataset, args.datasets_index, args.cache)
    emissions, _ = estimate_counts(
        list(read_conll(dataset["train_file"], args.delim)),
        args.surface_index,
        args.entity_index)
    transition_mask = make_transition_mask(dataset, args.span_type, args.entity_index, args.delim)
    domd, ambig_domd, total, ambig_total = strictly_dominated(
        dataset["valid_file"],
        emissions,
        transition_mask,
        args.surface_index,
        args.entity_index,
        args.delim)
    print(f"There are {domd} tokens that are strictly dominated.")
    print(f"There are {ambig_domd} ambiguous tokens that are strictly dominated.")
    print(f"There are {total} tokens in the whole dataset.")
    print(f"{domd / total * 100}% of tokens are strictly dominated")
    print(f"{ambig_domd / ambig_total * 100}% of ambiguous tokens are strictly dominated")
def main():
    parser = argparse.ArgumentParser(
        description="Estimate transition probabilities from the training data.")
    parser.add_argument("dataset")
    parser.add_argument("--datasets-index", "--datasets_index", default="configs/datasets.json")
    parser.add_argument("--cache", default="data")
    parser.add_argument("--surface-index", "--surface_index", default=0, type=int)
    parser.add_argument("--entity-index", "--entity_index", default=-1, type=int)
    parser.add_argument("--delim")
    parser.add_argument("--output")
    args = parser.parse_args()

    dataset = download_dataset(args.dataset, args.datasets_index, args.cache)
    _, transitions = estimate_counts(
        list(read_conll(dataset["train_file"], args.delim)),
        args.surface_index,
        args.entity_index)
    transitions, _ = normalize_transitions(transitions)
    vocab, transitions = to_dense(transitions)

    args.output = args.dataset if args.output is None else args.output
    os.makedirs(args.output, exist_ok=True)
    with open(os.path.join(args.output, "vocab.json"), "w") as wf:
        json.dump(vocab, wf, indent=2)
    np.save(os.path.join(args.output, "transitions.npy"), transitions)
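# A small sketch of reading the saved artifacts back, assuming to_dense returns a
# tag -> index vocab and a square matrix where transitions[i, j] is the probability of
# moving from tag i to tag j; the output directory ("conll") and the tag names are
# only illustrative.
import json

import numpy as np

with open("conll/vocab.json") as rf:
    vocab = json.load(rf)
transitions = np.load("conll/transitions.npy")
print(transitions[vocab["B-PER"], vocab["I-PER"]])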
def easy_start(file_name, emissions, transition_mask, surface_idx, entity_idx, span_type, delim, types=False):
    unamb = []
    domd = []
    total = []
    type_surface = set()
    for sentence in read_conll(file_name, delim):
        cols = list(zip(*sentence))
        tags = cols[entity_idx]
        surfaces = cols[surface_idx]
        for entity in parse_spans(tags, span_type):
            start_token = surfaces[entity.start]
            # When --types is set, only count each (surface, entity type) pair once.
            if (start_token, entity.type) in type_surface and types:
                continue
            type_surface.add((start_token, entity.type))
            # Skip tokens never seen in training; we have no emission evidence for them.
            if not emissions[start_token]:
                continue
            if len(emissions[start_token]) == 1:
                # The start token was only ever seen with a single tag.
                unamb.append(entity)
            else:
                # The start token is ambiguous; check whether the transition from the
                # previous gold tag rules out all but one of its observed tags.
                prev = tags[entity.start - 1] if entity.start > 0 else TokenFunction.GO
                all_tags = list(emissions[start_token].keys())
                possible_tags = []
                for tag in all_tags:
                    if transition_mask[(prev, tag)]:
                        possible_tags.append(tag)
                if len(possible_tags) == 1:
                    domd.append(entity)
            total.append(entity)
    return unamb, domd, total
def strictly_dominated(file_name, emissions, transition_mask, surface_idx, entity_idx, delim):
    domd = 0
    ambig_domd = 0
    ambig_total = 0
    total = 0
    for sentence in read_conll(file_name, delim):
        cols = list(zip(*sentence))
        tags = cols[entity_idx]
        surfaces = cols[surface_idx]
        for i in range(len(surfaces)):
            # The gold tag of the previous token, or GO at the start of the sentence.
            if i == 0:
                prev = TokenFunction.GO
            else:
                prev = tags[i - 1]
            possible_tags = list(emissions[surfaces[i]])
            if dominated(possible_tags, prev, transition_mask):
                domd += 1
                if emissions[surfaces[i]] and len(emissions[surfaces[i]]) > 1:
                    ambig_domd += 1
            total += 1
            if emissions[surfaces[i]] and len(emissions[surfaces[i]]) > 1:
                ambig_total += 1
    return domd, ambig_domd, total, ambig_total
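# dominated is used above but not defined in this excerpt; a minimal sketch that
# mirrors the "easy start" logic in easy_start: a token is dominated when the
# transition mask from the previous gold tag leaves exactly one of its observed tags.
# The real implementation may differ.
def dominated(possible_tags, prev, transition_mask):
    if not possible_tags:
        # Unseen tokens have no emission evidence to be dominated.
        return False
    allowed = [tag for tag in possible_tags if transition_mask[(prev, tag)]]
    return len(allowed) == 1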
def easy_end(file_name, emissions, surface_idx, entity_idx, span_type, delim, types=False):
    easy = []
    total = []
    type_surface = set()
    for sentence in read_conll(file_name, delim):
        cols = list(zip(*sentence))
        tags = cols[entity_idx]
        surfaces = cols[surface_idx]
        for entity in parse_spans(tags, span_type):
            end_token = surfaces[entity.end - 1]
            if (end_token, entity.type) in type_surface and types:
                continue
            type_surface.add((end_token, entity.type))
            if span_type is SpanEncoding.IOBES:
                # If we have never seen this token before, there is no reason to think it
                # is an `I-`, so the `E-` should be easy to get; that also means we don't
                # need to check whether the emissions exist.
                if f"I-{entity.type}" not in emissions[end_token]:
                    easy.append(entity)
            elif span_type is SpanEncoding.BIO:
                if not emissions[end_token] or len(emissions[end_token]) == 1:
                    easy.append(entity)
            total.append(entity)
    return easy, total
def test_read_conll_metadata_comments_conflict():
    with pytest.raises(ValueError):
        next(read_conll(StringIO("a"), metadata=True, allow_comments=False))
def feature_index_mapping(features):
    if not features:
        return {}
    elif ':' in features[0]:
        return {feature.split(':')[0]: int(feature.split(':')[1]) for feature in features}
    else:
        return {feature: index for index, feature in enumerate(features)}


if os.path.exists(args.text) and os.path.isfile(args.text):
    texts = []
    if args.conll:
        feature_indices = feature_index_mapping(args.features)
        for sentence in read_conll(args.text):
            if feature_indices:
                texts.append([{k: line[v] for k, v in feature_indices.items()} for line in sentence])
            else:
                texts.append([line[0] for line in sentence])
    else:
        with open(args.text, 'r') as f:
            for line in f:
                text = line.strip().split()
                texts += [text]
else:
    texts = [args.text.split()]

m = bl.TaggerService.load(args.model,
                          backend=args.backend,
                          remote=args.remote,
                          name=args.name,
                          preproc=args.preproc,
                          device=args.device)

for sen in m.predict(texts, export_mapping=create_export_mapping(args.export_mapping)):
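# For reference, the two --features conventions feature_index_mapping accepts
# (feature names here are only illustrative):
#   ["text:0", "pos:2"] -> {"text": 0, "pos": 2}   (explicit "name:column" pairs)
#   ["text", "pos"]     -> {"text": 0, "pos": 1}   (positional)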