Example #1
    def test_enumerate_spans_enumerates_all_spans(self):
        tokenizer = SpacyWordSplitter(pos_tags=True)
        sentence = tokenizer.split_words(u"This is a sentence.")

        spans = span_utils.enumerate_spans(sentence)
        assert spans == [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (1, 1),
                         (1, 2), (1, 3), (1, 4), (2, 2), (2, 3), (2, 4),
                         (3, 3), (3, 4), (4, 4)]

        spans = span_utils.enumerate_spans(sentence,
                                           max_span_width=3,
                                           min_span_width=2)
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (2, 4),
                         (3, 4)]

        spans = span_utils.enumerate_spans(sentence,
                                           max_span_width=3,
                                           min_span_width=2,
                                           offset=20)
        assert spans == [(20, 21), (20, 22), (21, 22), (21, 23), (22, 23),
                         (22, 24), (23, 24)]

        def no_prefixed_punctuation(tokens):
            # Only include spans which don't start or end with punctuation.
            return tokens[0].pos_ != u"PUNCT" and tokens[-1].pos_ != u"PUNCT"

        spans = span_utils.enumerate_spans(
            sentence,
            max_span_width=3,
            min_span_width=2,
            filter_function=no_prefixed_punctuation)

        # No longer includes (2, 4) or (3, 4) as these include punctuation
        # as their last element.
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]
Example #2
    def test_enumerate_spans_enumerates_all_spans(self):
        tokenizer = SpacyWordSplitter(pos_tags=True)
        sentence = tokenizer.split_words("This is a sentence.")

        spans = span_utils.enumerate_spans(sentence)
        assert spans == [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2),
                         (1, 3), (1, 4), (2, 2), (2, 3), (2, 4), (3, 3), (3, 4), (4, 4)]

        spans = span_utils.enumerate_spans(sentence, max_span_width=3, min_span_width=2)
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (2, 4), (3, 4)]

        spans = span_utils.enumerate_spans(sentence, max_span_width=3, min_span_width=2, offset=20)
        assert spans == [(20, 21), (20, 22), (21, 22), (21, 23), (22, 23), (22, 24), (23, 24)]

        def no_prefixed_punctuation(tokens: List[Token]):
            # Only include spans which don't start or end with punctuation.
            return tokens[0].pos_ != "PUNCT" and tokens[-1].pos_ != "PUNCT"

        spans = span_utils.enumerate_spans(sentence,
                                           max_span_width=3,
                                           min_span_width=2,
                                           filter_function=no_prefixed_punctuation)

        # No longer includes (2, 4) or (3, 4) as these include punctuation
        # as their last element.
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]
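
Both tests above exercise AllenNLP's span_utils.enumerate_spans, which returns inclusive (start, end) token indices. As a standalone sketch of the same call outside a test class (the imports assume the AllenNLP 0.x module layout these examples use):

    from allennlp.data.dataset_readers.dataset_utils import span_utils
    from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

    tokenizer = SpacyWordSplitter(pos_tags=True)
    tokens = tokenizer.split_words("This is a sentence.")

    # Every span of at most two tokens, as inclusive (start, end) indices.
    print(span_utils.enumerate_spans(tokens, max_span_width=2))
    # [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2), (2, 3), (3, 3), (3, 4), (4, 4)]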
Example #3
    def __init__(
            self,
            source_tokenizer: Tokenizer = None,
            target_tokenizer: Tokenizer = None,
            source_token_indexers: Dict[str, TokenIndexer] = None,
            target_token_indexers: Dict[str, TokenIndexer] = None,
            source_add_start_token: bool = True,
            source_add_end_token: bool = True,
            delimiter: str = "\t",
            source_max_tokens: Optional[int] = None,
            target_max_tokens: Optional[int] = None,
            lazy: bool = False,
            offline: bool = True,
            training: bool = True,
            perfect_entity_linking: bool = True,
            constrained_vocab=None,
            ranking_mode: bool = False,  # must be consistent with the model
            use_constrained_vocab: bool = False,  # must be consistent with the model
    ) -> None:
        super().__init__(lazy)
        self._source_tokenizer = source_tokenizer or SpacyWordSplitter()
        self._target_tokenizer = target_tokenizer or (
            lambda x: x.replace('(', ' ( ').replace(')', ' ) ').split())
        self._source_token_indexers = source_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._target_token_indexers = target_token_indexers
        self._source_add_start_token = source_add_start_token
        self._source_add_end_token = source_add_end_token
        self._delimiter = delimiter
        self._source_max_tokens = source_max_tokens
        self._target_max_tokens = target_max_tokens
        self._source_max_exceeded = 0
        self._target_max_exceeded = 0
        self._training = training
        self._offline = offline
        self._perfect_el = perfect_entity_linking
        if not self._perfect_el:
            # Fall back to pre-computed entity linking results when gold links are unavailable.
            with open("entity_linking/grailqa_el.json") as f:
                self.el_results = json.load(f)
            self.extractor = GrailQA_Value_Extractor()
        self._constrained_vocab = constrained_vocab or '1_step'
        # possible choices: {1_step, 2_step, cheating, domain, mix}
        self._ranking_mode = ranking_mode
        self._use_constrained_vocab = use_constrained_vocab

        self._uncovered_count = defaultdict(lambda: 0)
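
When no target_tokenizer is passed, this reader falls back to the lambda above, which pads parentheses with spaces and splits on whitespace so that LISP-style logical forms break into individual tokens. A small illustration of the same logic (the logical form here is made up):

    def split_logical_form(text):
        # Same logic as the reader's default target tokenizer above.
        return text.replace('(', ' ( ').replace(')', ' ) ').split()

    print(split_logical_form("(AND (JOIN p.x m.y) base.type)"))
    # ['(', 'AND', '(', 'JOIN', 'p.x', 'm.y', ')', 'base.type', ')']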
Example #4
    def _read(self, file_path: str) -> Iterator[Instance]:

        # The text for each document is its title followed by its abstractText.
        splitter = SpacyWordSplitter('en_core_web_sm', pos_tags=True, parse=True, ner=True)
        tokenizer = WordTokenizer(word_splitter=splitter)
        with open(file_path, 'r') as f:
            json_docs = json.load(f)

        for article in json_docs['documents']:
            doc_name = article['pmid']
            title = article['title']
            abstract = article['abstractText']
            text = title + " " + abstract

            tokens = tokenizer.tokenize(text)

            yield self.text_to_instance(doc_name, tokens)
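
This _read only touches the top-level "documents" list and the pmid, title, and abstractText fields of each entry, concatenating title and abstract into one text before tokenization. A hypothetical minimal input with that shape (the file name is arbitrary):

    import json

    # Hypothetical minimal file contents matching the fields accessed above.
    example = {
        "documents": [
            {"pmid": "0000001",
             "title": "A short title.",
             "abstractText": "A short abstract."}
        ]
    }
    with open("example_docs.json", "w") as f:
        json.dump(example, f)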
Example #5
    def _read(self, file_path: str) -> Iterator[Instance]:
        splitter = SpacyWordSplitter('en_core_web_sm', pos_tags=True, parse=True, ner=True)
        tokenizer = WordTokenizer(word_splitter=splitter)
        root = ElementTree.parse(file_path).getroot()
        xml_sents = root.findall("./sentence")

        for xml_sent in tqdm(xml_sents):
            text = xml_sent.find("text").text
            annotations = xml_sent.find('aspectTerms')
            if annotations is not None:
                annotations = annotations.findall("aspectTerm")
            else:
                annotations = []

            # Sorts the annotations by start character
            annotations.sort(key=lambda x: int(x.get('from')))

            # Tokenizes the sentence
            tokens = tokenizer.tokenize(text)

            # Assigns tags based on annotations
            tags = []
            next_annotation = 0  # index of the next annotation to match
            current = None  # annotation covering the current token, if any
            for token in tokens:
                # Checks if the next annotation begins somewhere in this token
                start_entity = (
                    next_annotation < len(annotations)
                    and token.idx <= int(annotations[next_annotation].get('from'))
                    and token.idx + len(token.text) > int(annotations[next_annotation].get('from'))
                )

                if start_entity:
                    tags.append('I' if current is None else 'B')
                    current = annotations[next_annotation]
                    next_annotation += 1
                elif current is not None:
                    if token.idx < int(current.get('to')):
                        tags.append('I')
                    else:
                        tags.append('O')
                        current = None
                else:
                    tags.append('O')

            yield self.text_to_instance(xml_sent.get('id'), tokens, tags)
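
The reader expects sentence elements with a text child and optional aspectTerms/aspectTerm children carrying character offsets in their from/to attributes; tags are then assigned token by token from those offsets. A hypothetical minimal document with that structure (the element and attribute names are the ones accessed above; the sentence content is invented):

    from xml.etree import ElementTree

    xml = """
    <sentences>
      <sentence id="1">
        <text>The battery life is great.</text>
        <aspectTerms>
          <aspectTerm term="battery life" from="4" to="16"/>
        </aspectTerms>
      </sentence>
    </sentences>
    """
    root = ElementTree.fromstring(xml.strip())
    sentence = root.find("./sentence")
    term = sentence.find("aspectTerms").find("aspectTerm")
    print(sentence.find("text").text)        # The battery life is great.
    print(term.get("from"), term.get("to"))  # 4 16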
Example #6
    def __init__(self,
                 ablate_mode: str,
                 token_indexers: Optional[Dict[str, TokenIndexer]] = None,
                 tokenizer: Optional[Tokenizer] = None,
                 limit_number: Optional[int] = None,
                 normalize_outputs: Optional[Tuple[float, float]] = None,
                 lazy: bool = False) -> None:
        super().__init__(token_indexers, tokenizer, limit_number,
                         normalize_outputs, lazy)

        assert ablate_mode in ["years", "dates", "numbers"]
        self._ablate_mode = ablate_mode

        # Ensure the tokenizer produces the tags needed for filtering.
        # We may need spaCy's `like_num` attribute, which AllenNLP tokens do not
        # carry, so we keep the spaCy tokens directly.
        self._tokenizer = tokenizer or WordTokenizer(
            word_splitter=SpacyWordSplitter(
                pos_tags=True, ner=True, keep_spacy_tokens=True))
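
Passing keep_spacy_tokens=True makes split_words return spaCy Token objects rather than AllenNLP Tokens, so spaCy-only attributes such as like_num stay available for the year/date/number filtering this reader performs. A minimal sketch, assuming the same AllenNLP 0.x SpacyWordSplitter:

    from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

    splitter = SpacyWordSplitter(pos_tags=True, ner=True, keep_spacy_tokens=True)
    tokens = splitter.split_words("Apollo 11 landed in 1969.")
    # spaCy's `like_num` flags numeric-looking tokens such as "11" and "1969".
    print([(token.text, token.like_num) for token in tokens])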