def main():
    parser = argparse.ArgumentParser(
        description="Calculate the number of entities that have easy starts.")
    parser.add_argument("dataset")
    parser.add_argument("--datasets-index",
                        "--datasets_index",
                        default="configs/datasets.json")
    parser.add_argument("--cache", default="data")
    parser.add_argument("--surface-index",
                        "--surface_index",
                        default=0,
                        type=int)
    parser.add_argument("--entity-index",
                        "--entity_index",
                        default=-1,
                        type=int)
    parser.add_argument(
        "--span-type",
        "--span_type",
        default=SpanEncoding.IOBES,
        type=SpanEncoding.from_string,
    )
    parser.add_argument("--types", action="store_true")
    parser.add_argument("--delim")
    args = parser.parse_args()

    dataset = download_dataset(args.dataset, args.datasets_index, args.cache)

    emissions, _ = estimate_counts(
        list(read_conll(dataset["train_file"], args.delim)),
        args.surface_index, args.entity_index)

    transition_mask = make_transition_mask(dataset, args.span_type,
                                           args.entity_index, args.delim)
    unamb, domd, total = easy_start(
        dataset["valid_file"],
        emissions,
        transition_mask,
        args.surface_index,
        args.entity_index,
        args.span_type,
        args.delim,
        args.types,
    )

    print(
        f"There are {len(unamb)} entities that start with an unambiguous token."
    )
    print(
        f"There are {len(domd)} entities that have ambiguous starts but are dominated by the transitions."
    )
    print(f"There are {len(total)} entities in the whole dataset.")

    print(
        f"{len(unamb) / len(total) * 100}% of entities have unambiguous starts"
    )
    print(f"{len(domd) / len(total) * 100}% of entities have dominated starts")
    print(
        f"{(len(unamb) + len(domd)) / len(total) * 100}% of entities have easy starts"
    )
Example #2
def read_all_tags(dataset, entity_idx, delim):
    tags = set()
    for ds in (dataset["train_file"], dataset["valid_file"],
               dataset["test_file"]):
        for sent in read_conll(ds, delim):
            tags.update(list(zip(*sent))[entity_idx])
    return tags
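Every example in this section pulls a column out of a CoNLL sentence with the same transpose idiom, list(zip(*sent))[index]; a self-contained illustration with made-up rows:

# A CoNLL sentence is a list of rows; zip(*sent) transposes it into columns,
# so columns[0] is the surface forms and columns[-1] the entity tags.
sent = [("John", "B-PER"), ("Smith", "I-PER"), ("runs", "O")]
columns = list(zip(*sent))
surfaces, tags = columns[0], columns[-1]
assert surfaces == ("John", "Smith", "runs")
assert tags == ("B-PER", "I-PER", "O")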
Example #3
def main():
    parser = argparse.ArgumentParser(description="Calculate the number of entities that have easy starts.")
    parser.add_argument("dataset")
    parser.add_argument("--datasets-index", "--datasets_index", default="configs/datasets.json")
    parser.add_argument("--cache", default="data")
    parser.add_argument("--surface-index", "--surface_index", default=0, type=int)
    parser.add_argument("--entity-index", "--entity_index", default=-1, type=int)
    parser.add_argument("--types", action="store_true")
    parser.add_argument("--delim")
    args = parser.parse_args()

    dataset = download_dataset(args.dataset, args.datasets_index, args.cache)

    emissions, _ = estimate_counts(
        list(read_conll(dataset["train_file"], args.delim)), args.surface_index, args.entity_index
    )

    ambiguous = 0
    for token, emission in emissions.items():
        if args.types:
            if len(set(extract_type(t) for t in emission)) > 1:
                ambiguous += 1
        else:
            if len(emission) > 1:
                ambiguous += 1

    print(f"{ambiguous / len(emissions) * 100}% of types are ambiguous.")
def extract_tags(file_name: str,
                 index: int,
                 delim: Optional[str] = None) -> List[List[str]]:
    tags = []
    for sentence in read_conll(file_name, delim):
        tags.append(
            list(
                chain([TokenFunction.GO],
                      list(zip(*sentence))[index], [TokenFunction.EOS])))
    return tags
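extract_tags pads every tag sequence with GO and EOS markers so that transitions into and out of a sentence can be counted later; a minimal sketch of the padding step, using string stand-ins for TokenFunction.GO and TokenFunction.EOS:

from itertools import chain

# "<GO>" and "<EOS>" stand in for TokenFunction.GO and TokenFunction.EOS,
# whose actual values are defined by the surrounding project.
tags = ("B-PER", "I-PER", "O")
padded = list(chain(["<GO>"], tags, ["<EOS>"]))
assert padded == ["<GO>", "B-PER", "I-PER", "O", "<EOS>"]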
Example #5
def read_entities(file_name: str,
                  entity_index: int = -1,
                  span_type: SpanEncoding = SpanEncoding.IOBES,
                  delim: Optional[str] = None) -> List[Span]:
    entities = []
    for sentence in read_conll(file_name, delim):
        tags = list(zip(*sentence))[entity_index]
        for entity in parse_spans(tags, span_type):
            entities.append(entity)
    return entities
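read_entities simply flattens the spans from every sentence of one file into a single list; judging from how easy_start and easy_end index them below, the Span objects carry at least a type, a start offset, and an end offset that sits one past the last token.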
Example #6
def main():
    parser = argparse.ArgumentParser(
        description="Calculate the number of entities that have easy starts.")
    parser.add_argument("dataset")
    parser.add_argument("--datasets-index",
                        "--datasets_index",
                        default="configs/datasets.json")
    parser.add_argument("--cache", default="data")
    parser.add_argument("--surface-index",
                        "--surface_index",
                        default=0,
                        type=int)
    parser.add_argument("--entity-index",
                        "--entity_index",
                        default=-1,
                        type=int)
    parser.add_argument(
        "--span-type",
        "--span_type",
        default=SpanEncoding.IOBES,
        type=SpanEncoding.from_string,
    )
    parser.add_argument("--types", action="store_true")
    parser.add_argument("--delim")
    args = parser.parse_args()

    dataset = download_dataset(args.dataset, args.datasets_index, args.cache)

    emissions, _ = estimate_counts(
        list(read_conll(dataset["train_file"], args.delim)),
        args.surface_index, args.entity_index)

    transition_mask = make_transition_mask(dataset, args.span_type,
                                           args.entity_index, args.delim)
    domd, ambig_domd, total, ambig_total = strictly_dominated(
        dataset["valid_file"], emissions, transition_mask, args.surface_index,
        args.entity_index, args.delim)

    print(f"There are {domd} tokens that are strictly dominated.")
    print(
        f"There are {ambig_domd} ambiguous tokens that are strictly dominated."
    )
    print(f"There are {total} tokens in the whole dataset.")

    print(f"{domd / total * 100}% of tokens have are strictly dominated")
    print(
        f"{ambig_domd / ambig_total * 100}% of ambiguous tokens are strictly dominated"
    )
def main():
    parser = argparse.ArgumentParser(
        description="Estimate transition probabilities from the training data."
    )
    parser.add_argument("dataset")
    parser.add_argument("--datasets-index",
                        "--datasets_index",
                        default="configs/datasets.json")
    parser.add_argument("--cache", default="data")
    parser.add_argument("--surface-index",
                        "--surface_index",
                        default=0,
                        type=int)
    parser.add_argument("--entity-index",
                        "--entity_index",
                        default=-1,
                        type=int)
    parser.add_argument("--delim")
    parser.add_argument("--output")
    args = parser.parse_args()

    dataset = download_dataset(args.dataset, args.datasets_index, args.cache)

    _, transitions = estimate_counts(
        list(read_conll(dataset["train_file"], args.delim)),
        args.surface_index, args.entity_index)

    transitions, _ = normalize_transitions(transitions)

    vocab, transitions = to_dense(transitions)

    args.output = args.dataset if args.output is None else args.output

    os.makedirs(args.output, exist_ok=True)

    with open(os.path.join(args.output, "vocab.json"), "w") as wf:
        json.dump(vocab, wf, indent=2)
    np.save(os.path.join(args.output, "transitions.npy"), transitions)
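The artifacts written above can be read back with the usual json/numpy calls; a minimal sketch, assuming the output directory is whatever was passed as --output (or the dataset name by default):

import json
import os

import numpy as np

out_dir = "conll"  # hypothetical --output value
with open(os.path.join(out_dir, "vocab.json")) as rf:
    vocab = json.load(rf)
transitions = np.load(os.path.join(out_dir, "transitions.npy"))
# transitions holds the normalized transition probabilities between the tags
# that vocab maps to row/column indices; the exact axis orientation depends on
# to_dense, which is not shown in these examples.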
def easy_start(file_name,
               emissions,
               transition_mask,
               surface_idx,
               entity_idx,
               span_type,
               delim,
               types=False):
    unamb = []
    domd = []
    total = []
    type_surface = set()
    for sentence in read_conll(file_name, delim):
        cols = list(zip(*sentence))
        tags = cols[entity_idx]
        surfaces = cols[surface_idx]
        for entity in parse_spans(tags, span_type):
            start_token = surfaces[entity.start]
            if (start_token, entity.type) in type_surface and types:
                continue
            type_surface.add((start_token, entity.type))
            if not emissions[start_token]:
                continue
            if len(emissions[start_token]) == 1:
                unamb.append(entity)
            else:
                prev = tags[entity.start -
                            1] if entity.start > 0 else TokenFunction.GO
                all_tags = list(emissions[start_token].keys())
                possible_tags = []
                for tag in all_tags:
                    if transition_mask[(prev, tag)]:
                        possible_tags.append(tag)
                if len(possible_tags) == 1:
                    domd.append(entity)
            total.append(entity)
    return unamb, domd, total
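The second branch is the interesting one: a start token whose emissions are ambiguous still counts as easy when the transition mask, conditioned on the previous gold tag, rules out all but one of its observed tags. A self-contained illustration of that pruning step with made-up counts and mask entries:

from collections import Counter

# Made-up emission counts for a start token and a transition mask keyed by
# (previous tag, current tag) pairs, mirroring how easy_start uses them.
emission = Counter({"B-PER": 3, "I-PER": 1})
transition_mask = {("O", "B-PER"): True, ("O", "I-PER"): False}
prev = "O"
possible_tags = [tag for tag in emission if transition_mask[(prev, tag)]]
assert possible_tags == ["B-PER"]  # ambiguous emissions, but a dominated start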
Example #9
def strictly_dominated(file_name, emissions, transition_mask, surface_idx,
                       entity_idx, delim):
    domd = 0
    ambig_domd = 0
    ambig_total = 0
    total = 0
    for sentence in read_conll(file_name, delim):
        cols = list(zip(*sentence))
        tags = cols[entity_idx]
        surfaces = cols[surface_idx]
        for i in range(len(surfaces)):
            if i == 0:
                prev = TokenFunction.GO
            else:
                prev = tags[i - 1]
            possible_tags = list(emissions[surfaces[i]])
            if dominated(possible_tags, prev, transition_mask):
                domd += 1
                if emissions[surfaces[i]] and len(emissions[surfaces[i]]) > 1:
                    ambig_domd += 1
            total += 1
            if emissions[surfaces[i]] and len(emissions[surfaces[i]]) > 1:
                ambig_total += 1
    return domd, ambig_domd, total, ambig_total
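dominated(possible_tags, prev, transition_mask) is not shown in these examples, but given how it is called it presumably reports whether the transition mask, conditioned on the previous gold tag, leaves at most one of the token's observed tags viable; the second pair of counters restricts the same statistic to tokens that were ambiguous in the training emissions.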
def easy_end(file_name, emissions, surface_idx, entity_idx, span_type, delim, types=False):
    easy = []
    total = []
    type_surface = set()
    for sentence in read_conll(file_name, delim):
        cols = list(zip(*sentence))
        tags = cols[entity_idx]
        surfaces = cols[surface_idx]
        for entity in parse_spans(tags, span_type):
            end_token = surfaces[entity.end - 1]
            if (end_token, entity.type) in type_surface and types:
                continue
            type_surface.add((end_token, entity.type))
            if span_type is SpanEncoding.IOBES:
                # If we have never seen this token before, we have no reason to
                # think it is an `I-` tag, so the `E-` tag should be easy to
                # get and we do not need to check whether the emissions exist.
                if f"I-{entity.type}" not in emissions[end_token]:
                    easy.append(entity)
            elif span_type is SpanEncoding.BIO:
                if not emissions[end_token] or len(emissions[end_token]) == 1:
                    easy.append(entity)
            total.append(entity)
    return easy, total
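The IOBES and BIO branches differ because only IOBES has a dedicated end tag: under IOBES the end is counted as easy whenever the token was never emitted as I- of the same type (which also covers tokens never seen at all), while under BIO there is no distinct end tag, so the check falls back to the token being unseen or unambiguous.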
Example #12
def test_read_conll_metadata_comments_conflict():
    with pytest.raises(ValueError):
        next(read_conll(StringIO("a"), metadata=True, allow_comments=False))
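The test pins down an API constraint rather than a parsing behavior: asking read_conll for metadata while forbidding comments is contradictory (metadata lines are presumably carried on comment lines), so the reader raises a ValueError as soon as the generator is advanced.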
Example #13

def feature_index_mapping(features):
    if not features:
        return {}
    elif ':' in features[0]:
        return {feature.split(':')[0]: int(feature.split(':')[1]) for feature in features}
    else:
        return {feature: index for index, feature in enumerate(features)}
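# feature_index_mapping accepts either explicit "name:index" pairs or bare
# names that are numbered in the order they appear, e.g. with made-up names:
#   feature_index_mapping(["word:0", "pos:1"]) -> {"word": 0, "pos": 1}
#   feature_index_mapping(["word", "pos"])     -> {"word": 0, "pos": 1}
#   feature_index_mapping([])                  -> {}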


if os.path.exists(args.text) and os.path.isfile(args.text):
    texts = []
    if args.conll:
        feature_indices = feature_index_mapping(args.features)
        for sentence in read_conll(args.text):
            if feature_indices:
                texts.append([{k: line[v] for k, v in feature_indices.items()} for line in sentence])
            else:
                texts.append([line[0] for line in sentence])
    else:
        with open(args.text, 'r') as f:
            for line in f:
                text = line.strip().split()
                texts += [text]
else:
    texts = [args.text.split()]

m = bl.TaggerService.load(args.model, backend=args.backend, remote=args.remote,
                          name=args.name, preproc=args.preproc, device=args.device)
for sen in m.predict(texts, export_mapping=create_export_mapping(args.export_mapping)):