Esempio n. 1 (Example 1)
def tagging(dictionaries,
            tokenizer,
            pos_tagger,
            ner_tagger,
            input_streams,
            sep="\t",
            **flags):
    """Print columnar output of [text UID,] token data and entity tags; one token per line.

    :param dictionaries: Dictionary instances to register with the worker
    :param tokenizer: tokenizer passed to TextAnalytics
    :param pos_tagger: PoS tagger passed to TextAnalytics
    :param ner_tagger: NER tagger registered on the worker
    :param input_streams: iterables of lines; each line is ``[UID<sep>...]text``
    :param sep: field separator for both input splitting and output columns
    :param flags: extra keyword options forwarded to TextAnalytics
    """
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)

    for d in dictionaries:
        worker.addDictionary(d)

    # `stream` instead of `input` to avoid shadowing the builtin
    for stream in input_streams:
        for line in stream:
            *uid, text = line.strip().split(sep)
            logging.debug('tagging %s: "%s"', '-'.join(uid), text)

            try:
                _, ner_tokens, dict_tags = worker.analyze(text)
            except RuntimeError:
                # log the failure, but keep processing remaining lines
                logging.exception('at UID %s', sep.join(uid))
                continue

            # enumerate over the base token column instead of range(len(...))
            for idx, token in enumerate(ner_tokens[0]):
                tags = [t[idx].entity for t in ner_tokens[1:]]
                tags.extend(d[idx] for d in dict_tags)
                print("{}{}{}{}{}".format(sep.join(uid), sep if uid else "",
                                          sep.join(token), sep if tags else "",
                                          sep.join(tags)))

            # blank line terminates each sentence/text block
            print("")
Esempio n. 2 (Example 2)
def normalize(dictionaries,
              tokenizer,
              pos_tagger,
              ner_tagger,
              input_streams,
              sep="\t",
              **flags):
    """Print only [text UIDs and] dictionary tags.

    :param dictionaries: Dictionary instances to register with the worker
    :param tokenizer: tokenizer passed to TextAnalytics
    :param pos_tagger: PoS tagger passed to TextAnalytics
    :param ner_tagger: NER tagger registered on the worker
    :param input_streams: iterables of lines; each line is ``[UID<sep>...]text``
    :param sep: field separator for both input splitting and output columns
    :param flags: extra keyword options forwarded to TextAnalytics
    """
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)

    for d in dictionaries:
        worker.addDictionary(d)

    # `stream` instead of `input` to avoid shadowing the builtin
    for stream in input_streams:
        for line in stream:
            *uid, text = line.strip().split(sep)
            logging.debug('normalizing %s: "%s"', '-'.join(uid), text)

            try:
                _, _, dict_tags = worker.analyze(text)
            except RuntimeError:
                # log the failure, but keep processing remaining lines
                logging.exception('at UID %s', sep.join(uid))
                continue

            for tags in dict_tags:
                # strip the IOB prefix (e.g. "B-", "I-") and deduplicate via a set
                for tag in {tag[2:] for tag in tags if tag != Dictionary.O}:
                    print("{}{}{}".format(sep.join(uid), sep if uid else "",
                                          tag))
Esempio n. 3 (Example 3)
def tagging(dictionaries, tokenizer, pos_tagger, ner_tagger, input_streams, sep="\t", **flags):
    """Print columnar output of [text UID,] token data and entity tags; one token per line.

    :param dictionaries: Dictionary instances to register with the worker
    :param tokenizer: tokenizer passed to TextAnalytics
    :param pos_tagger: PoS tagger passed to TextAnalytics
    :param ner_tagger: NER tagger registered on the worker
    :param input_streams: iterables of lines; each line is ``[UID<sep>...]text``
    :param sep: field separator for both input splitting and output columns
    :param flags: extra keyword options forwarded to TextAnalytics
    """
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)

    for d in dictionaries:
        worker.addDictionary(d)

    # `stream` instead of `input` to avoid shadowing the builtin
    for stream in input_streams:
        for line in stream:
            *uid, text = line.strip().split(sep)
            logging.debug('tagging %s: "%s"', '-'.join(uid), text)

            try:
                _, ner_tokens, dict_tags = worker.analyze(text)
            except RuntimeError:
                # log the failure, but keep processing remaining lines
                logging.exception('at UID %s', sep.join(uid))
                continue

            # enumerate over the base token column instead of range(len(...))
            for idx, token in enumerate(ner_tokens[0]):
                tags = [t[idx].entity for t in ner_tokens[1:]]
                tags.extend(d[idx] for d in dict_tags)
                print("{}{}{}{}{}".format(sep.join(uid), sep if uid else "", sep.join(token),
                                          sep if tags else "", sep.join(tags)))

            # blank line terminates each sentence/text block
            print("")
Esempio n. 4 (Example 4)
def align(dictionaries, tokenizer, pos_tagger, ner_tagger, input_streams, sep="", **flags):
    """Print the aligned dictionary tags below the tokens.

    :param dictionaries: Dictionary instances to register with the worker
    :param tokenizer: tokenizer passed to TextAnalytics
    :param pos_tagger: PoS tagger passed to TextAnalytics
    :param ner_tagger: NER tagger registered on the worker
    :param input_streams: iterables of text lines
    :param sep: if non-empty, split each line into ``UID<sep>...text`` fields
    :param flags: extra keyword options forwarded to TextAnalytics
    """
    uid = []
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)

    for d in dictionaries:
        worker.addDictionary(d)

    # `stream` instead of `input` to avoid shadowing the builtin
    for stream in input_streams:
        for text in stream:
            if sep:
                *uid, text = text.strip().split(sep)

            logging.debug('aligning %s "%s"', sep.join(uid), text)

            try:
                tokens, _, dict_tags = worker.analyze(text)
            except RuntimeError:
                # log the failure, but keep processing remaining lines
                logging.exception('at UID %s', sep.join(uid))
                continue

            # column width = widest of the token and all of its tags;
            # default=0 avoids a ValueError when no dictionaries were added
            lens = [max(len(tok), max((len(t) for t in tags), default=0))
                    for tok, *tags in zip(tokens, *dict_tags)]

            if sep and uid:
                print(sep.join(uid))

            # left-pad each column to its computed width (nested format spec)
            print(" ".join("{:<{}}".format(t, l) for l, t in zip(lens, tokens)))

            for tags in dict_tags:
                print(" ".join("{:<{}}".format(t, l) for l, t in zip(lens, tags)))

            print("--")
Esempio n. 5 (Example 5)
def align(dictionaries,
          tokenizer,
          pos_tagger,
          ner_tagger,
          input_streams,
          sep="",
          **flags):
    """Print the aligned dictionary tags below the tokens.

    :param dictionaries: Dictionary instances to register with the worker
    :param tokenizer: tokenizer passed to TextAnalytics
    :param pos_tagger: PoS tagger passed to TextAnalytics
    :param ner_tagger: NER tagger registered on the worker
    :param input_streams: iterables of text lines
    :param sep: if non-empty, split each line into ``UID<sep>...text`` fields
    :param flags: extra keyword options forwarded to TextAnalytics
    """
    uid = []
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)

    for d in dictionaries:
        worker.addDictionary(d)

    # `stream` instead of `input` to avoid shadowing the builtin
    for stream in input_streams:
        for text in stream:
            if sep:
                *uid, text = text.strip().split(sep)

            logging.debug('aligning %s "%s"', sep.join(uid), text)

            try:
                tokens, _, dict_tags = worker.analyze(text)
            except RuntimeError:
                # log the failure, but keep processing remaining lines
                logging.exception('at UID %s', sep.join(uid))
                continue

            # column width = widest of the token and all of its tags;
            # default=0 avoids a ValueError when no dictionaries were added
            lens = [
                max(len(tok), max((len(t) for t in tags), default=0))
                for tok, *tags in zip(tokens, *dict_tags)
            ]

            if sep and uid:
                print(sep.join(uid))

            # left-pad each column to its computed width (nested format spec)
            print(" ".join(
                "{:<{}}".format(t, l) for l, t in zip(lens, tokens)))

            for tags in dict_tags:
                print(" ".join(
                    "{:<{}}".format(t, l) for l, t in zip(lens, tags)))

            print("--")
Esempio n. 6 (Example 6)
def normalize(dictionaries, tokenizer, pos_tagger, ner_tagger, input_streams, sep="\t", **flags):
    """Print only [text UIDs and] dictionary tags.

    :param dictionaries: Dictionary instances to register with the worker
    :param tokenizer: tokenizer passed to TextAnalytics
    :param pos_tagger: PoS tagger passed to TextAnalytics
    :param ner_tagger: NER tagger registered on the worker
    :param input_streams: iterables of lines; each line is ``[UID<sep>...]text``
    :param sep: field separator for both input splitting and output columns
    :param flags: extra keyword options forwarded to TextAnalytics
    """
    worker = TextAnalytics(tokenizer, pos_tagger, **flags)
    worker.addNerTagger(ner_tagger)

    for d in dictionaries:
        worker.addDictionary(d)

    # `stream` instead of `input` to avoid shadowing the builtin
    for stream in input_streams:
        for line in stream:
            *uid, text = line.strip().split(sep)
            logging.debug('normalizing %s: "%s"', '-'.join(uid), text)

            try:
                _, _, dict_tags = worker.analyze(text)
            except RuntimeError:
                # log the failure, but keep processing remaining lines
                logging.exception('at UID %s', sep.join(uid))
                continue

            for tags in dict_tags:
                # strip the IOB prefix (e.g. "B-", "I-") and deduplicate via a set
                for tag in {tag[2:] for tag in tags if tag != Dictionary.O}:
                    print("{}{}{}".format(sep.join(uid), sep if uid else "", tag))