Code example #1
from nltk.tokenize import TreebankWordTokenizer


def tokenize(documents):
    """Tokenize each document and attach the covering annotation tag (or "O") to every token."""
    documents2 = []
    tbw = TreebankWordTokenizer()
    for doc in documents:
        real_tokens = []  # reset per document so tokens do not leak across documents
        text = doc["text"]
        file = doc["id"]
        # Normalize characters that would otherwise throw off span alignment.
        text = text.replace("\"", "'")
        #text = text.replace("/", " ")
        text = text.replace("-", " ")
        text = text.replace(".", " ")
        for start, end in tbw.span_tokenize(text):
            token_txt = text[start:end]
            token_tag = "O"
            token_tag_type = "O"
            # A token inherits the tag of any annotation that fully covers its span.
            for tag in doc["tags"]:
                if int(tag["start"]) <= start and int(tag["end"]) >= end:
                    token_tag = tag["tag"]
                    token_tag_type = tag["type"]
            real_tokens.append({"token": token_txt, "start": start, "end": end,
                                "tag": token_tag, "tag_type": token_tag_type})
        documents2.append({"id": file, "text": text, "tags": doc["tags"], "tokens": real_tokens})
    return documents2
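
A minimal usage sketch for the function above; the sample document, tag offsets, and label values are made up for illustration, but the field names match the ones the function reads:

docs = [{
    "id": "doc1",
    "text": "Alice visited Paris.",
    "tags": [{"start": 0, "end": 5, "tag": "B-PER", "type": "PERSON"}],
}]

for tok in tokenize(docs)[0]["tokens"]:
    print(tok["token"], tok["tag"], tok["tag_type"])
    # Alice B-PER PERSON / visited O O / Paris O O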
Code example #2
File: nltk_processors.py  Project: awoziji/forte
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import Token
from nltk.tokenize import TreebankWordTokenizer


class NLTKWordTokenizer(PackProcessor):
    r"""A wrapper of NLTK word tokenizer."""

    def __init__(self):
        super().__init__()
        self.tokenizer = TreebankWordTokenizer()

    def _process(self, input_pack: DataPack):
        # Create a Token annotation in the pack for every word span from NLTK.
        for begin, end in self.tokenizer.span_tokenize(input_pack.text):
            Token(input_pack, begin, end)
Code example #3
from nltk.tokenize import TreebankWordTokenizer


class DocumentTokenizer(object):
    """
    Used to split a document into sentences and tokens.
    Returns a list of sentences, where each sentence is a list of
    (token, (start, end)) two-tuples with document-level spans.
    """
    def __init__(self, sent_tokenizer=None, word_tokenizer=None):
        # Fall back to the defaults only when no tokenizer is supplied.
        self.sent_tokenizer = sent_tokenizer or DefaultSentenceTokenizer()
        self.word_tokenizer = word_tokenizer or TreebankWordTokenizer()

    def tokenize_doc(self, doc):
        """
        Takes a raw string. Returns a list of lists where each list is the
        sentence, and each sentence contains two-tuples of tokens and spans.
        """
        tokenized_sents_and_spans = []
        # tokenize_sents returns a list of (start, end) sentence spans
        sentence_spans = self.sent_tokenizer.tokenize_sents(doc)
        for start, end in sentence_spans:
            sentence = doc[start:end]
            tokenized_sents_and_spans.append(
                self.tokenize_sent(sentence, start))
        return tokenized_sents_and_spans

    def tokenize_sent(self, sentence, offset):
        try:
            tokens = self.word_tokenizer.tokenize(sentence)
        except Exception as e:
            print("Word tokenizing failed")
            print(sentence)
            raise e
        try:
            spans = self.word_tokenizer.span_tokenize(sentence)
        except Exception as e:
            print("Span tokenizing failed")
            print(sentence)
            raise e
        tokens_and_spans = []
        for token, (start, end) in zip(tokens, spans):
            # Shift sentence-relative spans so they index into the full document.
            true_start = start + offset
            true_end = end + offset
            tokens_and_spans.append((token, (true_start, true_end)))
        return tokens_and_spans
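
A brief usage sketch. `DefaultSentenceTokenizer` is not shown in the snippet, so this example passes in a small hypothetical adapter around NLTK's `PunktSentenceTokenizer` that provides the `tokenize_sents` interface the class expects:

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer


class PunktSentenceAdapter:
    """Hypothetical adapter: returns (start, end) sentence spans."""
    def tokenize_sents(self, doc):
        return list(PunktSentenceTokenizer().span_tokenize(doc))


tokenizer = DocumentTokenizer(sent_tokenizer=PunktSentenceAdapter(),
                              word_tokenizer=TreebankWordTokenizer())
for sent in tokenizer.tokenize_doc("The patient was seen today. No acute distress."):
    print(sent)  # [('The', (0, 3)), ('patient', (4, 11)), ...]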
Code example #4
from nltk.tokenize import TreebankWordTokenizer


def tokenize_en(text):
    """Receive a text string and return tokens and their (start, end) spans."""
    tokenizer = TreebankWordTokenizer()
    tokens = []
    tokens_span = []
    for start, end in tokenizer.span_tokenize(text):
        token = text[start:end]
        # Split a single trailing dot "." off the token (e.g. "Mr." -> "Mr", ".").
        if len(token) > 1 and token[-1] == "." and token.count(".") == 1:
            end_resize = end - 1
            tokens.append(text[start:end_resize])
            tokens_span.append((start, end_resize))
            tokens.append(text[end_resize:end])
            tokens_span.append((end_resize, end))
        else:
            tokens.append(token)
            tokens_span.append((start, end))
    return tokens, tokens_span
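
A quick illustration of the trailing-dot handling, using an arbitrary sentence:

tokens, spans = tokenize_en("Mr. Smith arrived")
print(tokens)  # ['Mr', '.', 'Smith', 'arrived']
print(spans)   # [(0, 2), (2, 3), (4, 9), (10, 17)]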
Code example #5
from typing import Dict, Set

from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import Token
from nltk.tokenize import TreebankWordTokenizer


class NLTKWordTokenizer(PackProcessor):
    r"""A wrapper of NLTK word tokenizer."""

    def __init__(self):
        super().__init__()
        self.tokenizer = TreebankWordTokenizer()

    def _process(self, input_pack: DataPack):
        for begin, end in self.tokenizer.span_tokenize(input_pack.text):
            Token(input_pack, begin, end)

    def record(self, record_meta: Dict[str, Set[str]]):
        r"""Method to add output type record of `NLTKWordTokenizer`, which is
        `ft.onto.base_ontology.Token`,
        to :attr:`forte.data.data_pack.Meta.record`.

        Args:
            record_meta: the field in the datapack for type record that need to
                fill in for consistency checking.
        """
        record_meta["ft.onto.base_ontology.Token"] = set()
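
The wrapper above is normally run inside a forte pipeline. A rough usage sketch, assuming forte's `Pipeline` and `StringReader` API (exact import paths can differ between forte versions):

from forte.data.readers import StringReader
from forte.pipeline import Pipeline

# Assumed wiring: a string reader feeds raw text, the processor adds Token spans.
pipeline = Pipeline[DataPack]()
pipeline.set_reader(StringReader())
pipeline.add(NLTKWordTokenizer())
pipeline.initialize()

pack = pipeline.process("NLTK splits this sentence into tokens.")
for token in pack.get(Token):
    print(token.text)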
Code example #6
import json
import os
from collections import defaultdict

from nltk.tokenize import TreebankWordTokenizer


class SrlDataReader:
    def __init__(self, vocab):
        self.vocab = vocab
        self.tokenizer = TreebankWordTokenizer()

    def read_data(self, data_dir):
        for root, dirs, files in os.walk(data_dir):
            for name in files:
                if not name.endswith(".json"):
                    continue

                full_path = os.path.join(root, name)

                with open(full_path) as fin:
                    doc = json.load(fin)
                    for data in self.parse_doc(doc):
                        yield data

    def parse_doc(self, doc):
        text = doc["text"]
        events = doc["events"]
        fillers = doc["fillers"]
        entities = doc["entities"]

        begin_map = defaultdict(list)
        end_map = defaultdict(list)

        for f in fillers:
            begin_map[f["begin"]].append((f["id"], f["type"]))
            end_map[f["end"]].append((f["id"], f["type"]))

        for entity in entities:
            for em in entity["mentions"]:
                begin_map[em["begin"]].append((em["id"], em["type"]))
                end_map[em["end"]].append((em["id"], em["type"]))

        for event in events:
            for evm in event["mentions"]:
                begin_map[evm["begin"]].append((evm["id"], evm["type"]))
                end_map[evm["end"]].append((evm["id"], evm["type"]))

        indexed_doc = []
        tags = []
        sent_offset = 0

        on_types = set()

        for sentence in text.split("\n"):
            word_spans = self.tokenizer.span_tokenize(sentence)

            tokens = []

            for b, e in word_spans:
                token_text = sentence[b:e]
                indexed_doc.append(self.vocab.get(token_text, 0))

                begin = sent_offset + b
                end = sent_offset + e

                token_tags = []

                for begin_obj in begin_map[begin]:
                    obj_id, obj_type = begin_obj
                    token_tags.append("B_" + obj_type)
                    on_types.add(obj_type)

                for end_obj in end_map[end]:
                    obj_id, obj_type = end_obj
                    token_tags.append("I_" + obj_type)
                    # discard() instead of remove() so a mention whose start did not
                    # align with a token boundary cannot raise a KeyError here
                    on_types.discard(obj_type)

                if on_types:
                    for t in on_types:
                        token_tags.append("I_" + t)
                else:
                    token_tags.append("O")

                tags.append(token_tags)
                # Fall back to index 0 for unknown tokens, matching indexed_doc above.
                tokens.append(self.vocab.get(token_text, 0))

            sent_offset += len(sentence) + 1

            yield tokens, tags
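
A small end-to-end sketch of `parse_doc` on an in-memory document (no JSON files needed); the vocabulary, text, and annotation offsets below are invented for illustration, but the field names match what `parse_doc` reads:

vocab = {"John": 1, "visited": 2, "Paris": 3}
reader = SrlDataReader(vocab)

doc = {
    "text": "John visited Paris",
    "events": [{"mentions": [{"id": "evm1", "type": "Movement", "begin": 5, "end": 12}]}],
    "fillers": [],
    "entities": [{"mentions": [{"id": "em1", "type": "PER", "begin": 0, "end": 4}]}],
}

for tokens, tags in reader.parse_doc(doc):
    print(tokens)  # [1, 2, 3]
    print(tags)    # [['B_PER', 'I_PER', 'O'], ['B_Movement', 'I_Movement', 'O'], ['O']]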