Example #1
    def _make_instance_from_text(self, sent_tokens, pred_index, annotations = None, sent_id = None):
        instance_dict = {}

        if isinstance(sent_tokens, str):
            sent_tokens = sent_tokens.split()
        sent_tokens = cleanse_sentence_text(sent_tokens)
        text_field = TextField([Token(t) for t in sent_tokens], self._token_indexers)
        instance_dict['text'] = text_field
        instance_dict['predicate_indicator'] = SequenceLabelField([1 if i == pred_index else 0 for i in range(len(sent_tokens))], text_field)

        if annotations is not None:
            for i, slot_name in enumerate(self._slot_labels):
                span_slot = ListField([LabelField(ann.slots[i], label_namespace="slot_%s"%slot_name) for ann in annotations for span in ann.all_spans])
                instance_dict['span_slot_%s'%slot_name] = span_slot

            labeled_span_field = ListField([SpanField(span.start(), span.end(), text_field) for ann in annotations for span in ann.all_spans])
            instance_dict['labeled_spans'] = labeled_span_field

            if self._bio_labels:
                bio_labels = ["O"] * len(sent_tokens)

                bio_labels[pred_index] = "B-V"

                for span in self._resolve_spans(annotations, pred_index):
                    bio_labels[span.start()] = "B-ARG"
                    for i in range(span.start()+1, span.end()+1):
                        bio_labels[i] = "I-ARG"
                instance_dict["bio_label"] = SequenceLabelField(bio_labels, text_field, label_namespace="bio_labels")

            instance_dict['annotations'] = MetadataField({'annotations':annotations})

        metadata = {'pred_index' : pred_index, 'sent_text': " ".join(sent_tokens)}
        if sent_id is not None:
            metadata['sent_id'] = sent_id
        instance_dict['metadata'] = MetadataField(metadata)

        return Instance(instance_dict)
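A minimal end-to-end sketch of how fields like the ones above are indexed and turned into tensors, assuming an AllenNLP 0.9/1.x-style API; the sentence and the "tokens" indexer key are made up for illustration:

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import SequenceLabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# A text field plus a binary predicate indicator over the same sequence.
tokens = [Token(t) for t in "the cat sat".split()]
text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
indicator = SequenceLabelField([0, 0, 1], text_field)
instance = Instance({"text": text_field, "predicate_indicator": indicator})

# Build a vocabulary from the instance itself, index, and inspect the tensors.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
print(instance.as_tensor_dict())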
Example #2
def _fix_tokenization(tokenized_sent,
                      bert_embs,
                      old_det_to_new_ind,
                      obj_to_type,
                      token_indexers,
                      pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.
    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """

    new_tokenization_with_tags = []
    for tok in tokenized_sent:
        if isinstance(tok, list):
            for int_name in tok:
                obj_type = obj_to_type[int_name]
                new_ind = old_det_to_new_ind[int_name]
                if new_ind < 0:
                    raise ValueError(
                        "Oh no, the new index is negative! that means it's invalid. {} {}"
                        .format(tokenized_sent, old_det_to_new_ind))
                text_to_use = GENDER_NEUTRAL_NAMES[
                    new_ind % len(GENDER_NEUTRAL_NAMES
                                  )] if obj_type == 'person' else obj_type
                new_tokenization_with_tags.append((text_to_use, new_ind))
        else:
            new_tokenization_with_tags.append((tok, pad_ind))

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags],
                              text_field)
    return text_field, tags
Example #3
    def text_to_instance(self,  # type: ignore
                         formalism: str,
                         position_in_corpus : int,
                         am_sentence: AMSentence) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        formalism : str.
            The formalism of this instance (e.g. DM, PSD, ...)
        position_in_corpus : ``int``, required.
            The index of this sentence in the corpus.
        am_sentence : ``AMSentence``, required.
            The words in the sentence to be encoded.

        Returns
        -------
        An instance containing words, pos tags, dependency edge labels, head
        indices, supertags and lexical labels as fields.
        """
        fields: Dict[str, Field] = {}

        tokens = TextField([Token(w) for w in am_sentence.get_tokens(shadow_art_root=True)], self._token_indexers)
        fields["words"] = tokens
        fields["pos_tags"] = SequenceLabelField(am_sentence.get_pos(), tokens, label_namespace="pos")
        fields["ner_tags"] = SequenceLabelField(am_sentence.get_ner(), tokens, label_namespace="ner_labels")
        fields["lemmas"] = SequenceLabelField(am_sentence.get_lemmas(), tokens, label_namespace="lemmas")
        fields["supertags"] = SequenceLabelField(am_sentence.get_supertags(), tokens, label_namespace=formalism+"_supertag_labels")
        fields["lexlabels"] = SequenceLabelField(am_sentence.get_lexlabels(), tokens, label_namespace=formalism+"_lex_labels")
        fields["head_tags"] = SequenceLabelField(am_sentence.get_edge_labels(),tokens, label_namespace=formalism+"_head_tags") #edge labels
        fields["head_indices"] = SequenceLabelField(am_sentence.get_heads(),tokens,label_namespace="head_index_tags")

        fields["metadata"] = MetadataField({"words": am_sentence.words, "attributes": am_sentence.attributes,
                                            "formalism": formalism, "position_in_corpus" : position_in_corpus,
                                            "token_ranges" : am_sentence.get_ranges(),
                                            "is_annotated" : am_sentence.is_annotated()})
        return Instance(fields)
Example #4
    def text_to_instance(
        self,  # type: ignore
        sentence: str = None,
        tokens: List[Token] = None,
        targets: List[str] = None,
    ) -> Instance:

        """
        Parameters
        ----------
        sentence : ``str``, optional
            A sentence containing [MASK] tokens that should be filled in by the model.  This input
            is superseded and ignored if ``tokens`` is given.
        tokens : ``List[Token]``, optional
            An already-tokenized sentence containing some number of [MASK] tokens to be predicted.
        targets : ``List[str]``, optional
            Contains the target tokens to be predicted.  The length of this list should be the same
            as the number of [MASK] tokens in the input.
        """
        if not tokens:
            tokens = self._tokenizer.tokenize(sentence)
        input_field = TextField(tokens, self._token_indexers)
        mask_positions = []
        for i, token in enumerate(tokens):
            if token.text == "[MASK]":
                mask_positions.append(i)
        if not mask_positions:
            raise ValueError("No [MASK] tokens found!")
        if targets and len(targets) != len(mask_positions):
            raise ValueError(f"Found {len(mask_positions)} mask tokens and {len(targets)} targets")
        mask_position_field = ListField([IndexField(i, input_field) for i in mask_positions])
        # TODO(mattg): there's a problem if the targets get split into multiple word pieces...
        fields: Dict[str, Field] = {"tokens": input_field, "mask_positions": mask_position_field}
        if targets is not None:
            target_field = TextField([Token(target) for target in targets], self._token_indexers)
            fields["target_ids"] = target_field
        return Instance(fields)
Example #5
    def read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        with open(file_path, "r") as data_file:
            instances = []
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            for line in tqdm.tqdm(data_file):
                line = line.strip("\n")
                # skip blank lines
                if not line:
                    continue

                pred_id = int(line.split()[0])
                tokens_and_tags = line.split(maxsplit=1)[1].split(
                    self._word_tag_delimiter)
                tokens = [Token(token) for token in tokens_and_tags[0].split()]
                tags = [tag for tag in tokens_and_tags[1].split()]

                pred_tags = [
                    0 if i != pred_id else 1 for i in range(len(tokens))
                ]
                sequence = TextField(tokens, self._token_indexers)
                sequence_tags = SequenceLabelField(tags, sequence)
                sequence_pred_tags = SequenceLabelField(pred_tags, sequence)

                instances.append(
                    Instance({
                        'tokens': sequence,
                        'tags': sequence_tags,
                        'verb_indicator': sequence_pred_tags
                    }))
            if not instances:
                raise ConfigurationError(
                    "No instances were read from the given filepath {}. "
                    "Is the path correct?".format(file_path))
        return Dataset(instances)
Example #6
    def text_to_instance(self, words: List[str], upos_tags: List[str],
                         dependencies: List[Tuple[str, int]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        words : ``List[str]``, required.
            The words in the sentence to be encoded.
        upos_tags : ``List[str]``, required.
            The universal dependencies POS tags for each word.
        dependencies : ``List[Tuple[str, int]]``, optional (default = None)
            A list of  (head tag, head index) tuples. Indices are 1 indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.

        Returns
        -------
        An instance containing words, upos tags, dependency head tags and head
        indices as fields.
        """
        fields: Dict[str, Field] = {}

        tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields["words"] = tokens
        fields["pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace="pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                     tokens,
                                                     label_namespace=self._task_type + "_head_tags")
            fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                        tokens,
                                                        label_namespace="head_index_tags")

        fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
        return Instance(fields)
Example #7
    def _read(self, file_dir: str):
        # file_dir should point to the conllu.tar.gz file plus train, dev, or test
        # example: file_dir="data/en/conllu.tar.gz/train"
        file, split = os.path.split(file_dir)

        tar = tarfile.open(file, "r:gz")
        file_names = [tarinfo for tarinfo in tar.getmembers() if split in tarinfo.name and ".conllu" in tarinfo.name]

        if split == "train" and self._num_examples > -1:
            file_names = file_names[:self._num_examples]


        for fname in file_names:
            content = tar.extractfile(fname)
            language = content.readline().decode("utf8").rstrip("\n")[-2:]
            rating = content.readline().decode("utf8").rstrip("\n")[-1]
            doc_id = content.readline().decode("utf8").rstrip("\n").split()[-1]

            tokens = []
            num_sents = 0
            num_tokens = 0

            for line in content:
                line = line.decode("utf8")
                if line[0] == '#':
                    continue

                if not line.rstrip("\n"):
                    num_sents += 1
                    continue

                else:
                    tokens.append(Token(line.split("\t")[1]))
                    num_tokens += 1

            #content = tar.extractfile(fname).read()
            yield self.text_to_instance(tokens, doc_id, rating, num_sents, num_tokens)
Example #8
    def text_to_instance(self,  # type: ignore
                         query: List[str],
                         prelinked_entities: Dict[str, Dict[str, str]] = None,
                         sql: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokens = TextField([Token(t) for t in query], self._token_indexers)
        fields["tokens"] = tokens

        # Default to empty so the action fields below can still be built when no SQL is given.
        action_sequence, all_actions = [], []
        if sql is not None:
            try:
                action_sequence, all_actions = self._world.get_action_sequence_and_all_actions(sql,
                                                                                               prelinked_entities)
            except ParseError:
                return None

        index_fields: List[Field] = []
        production_rule_fields: List[Field] = []

        for production_rule in all_actions:
            nonterminal, _ = production_rule.split(' ->')
            production_rule = ' '.join(production_rule.split(' '))
            field = ProductionRuleField(production_rule, self._world.is_global_rule(nonterminal))
            production_rule_fields.append(field)

        valid_actions_field = ListField(production_rule_fields)
        fields["valid_actions"] = valid_actions_field

        action_map = {action.rule: i # type: ignore
                      for i, action in enumerate(valid_actions_field.field_list)}

        for production_rule in action_sequence:
            index_fields.append(IndexField(action_map[production_rule], valid_actions_field))

        action_sequence_field = ListField(index_fields)
        fields["action_sequence"] = action_sequence_field
        return Instance(fields)
Example #9
    def _read(self, file_path: str):

        for sentence in open(cached_path(file_path), "r"):
            tokens = sentence.strip().split(" ")
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)
            words = []
            for index, token in enumerate(tokens):
                # Coreference is annotated using [square brackets]
                # or (round brackets) around coreferent phrases.
                if "[" in token and "]" in token:
                    clusters[0].append((index, index))
                elif "[" in token:
                    clusters[0].append((index, index))
                elif "]" in token:
                    old_span = clusters[0][-1]
                    clusters[0][-1] = (old_span[0], index)

                if "(" in token and ")" in token:
                    clusters[1].append((index, index))
                elif "(" in token:
                    clusters[1].append((index, index))
                elif ")" in token:
                    old_span = clusters[1][-1]
                    clusters[1][-1] = (old_span[0], index)

                if token.endswith("."):
                    # Winobias is tokenised, but not for full stops.
                    # We'll just special case them here.
                    token = token[:-1]
                    words.append(token.strip("[]()"))
                    words.append(".")
                else:
                    words.append(token.strip("[]()"))

            yield self.text_to_instance([Token(x) for x in words],
                                        [x for x in clusters.values()])
Example #10
    def text_to_instance(self, data: Dict[str, Any]) -> Instance:  # pylint: disable=arguments-differ
        # Flatten and pad tokens
        tokens = data['tokens']
        tokens = [Token(x) for x in tokens]
        fields = {'tokens': TextField(tokens, self._token_indexers)}

        # If annotations are provided, process them into arrays.
        if 'annotations' in data:

            # Initialize arrays and book keeping data structures.
            seen_entities: Set[str] = set()
            entity_types = np.zeros(shape=(len(tokens), ))
            entity_ids = np.zeros(shape=(len(tokens), ))
            mention_lengths = np.ones(shape=(len(tokens), ))

            # Process annotations
            for annotation in data['annotations']:

                seen_entities.add(annotation['id'])
                start, end = annotation['span']
                length = end - start

                for i in range(*annotation['span']):
                    # Note: +1 offset to account for start token.
                    entity_types[i] = 1
                    entity_ids[i] = len(seen_entities)
                    mention_lengths[i] = length
                    length -= 1

            fields['entity_types'] = SequentialArrayField(entity_types,
                                                          dtype=np.uint8)
            fields['entity_ids'] = SequentialArrayField(entity_ids,
                                                        dtype=np.int64)
            fields['mention_lengths'] = SequentialArrayField(mention_lengths,
                                                             dtype=np.int64)

        return Instance(fields)
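A toy walk-through of the three annotation arrays built above, using a made-up sentence and annotation; it only illustrates how mention_lengths counts down inside an entity span:

import numpy as np

tokens = "barack obama was born in hawaii".split()
annotation = {"id": "Q76", "span": (0, 2)}   # hypothetical entity covering "barack obama"

entity_types = np.zeros(shape=(len(tokens),))
entity_ids = np.zeros(shape=(len(tokens),))
mention_lengths = np.ones(shape=(len(tokens),))

start, end = annotation["span"]
length = end - start
for i in range(start, end):
    entity_types[i] = 1
    entity_ids[i] = 1          # id of the first (and only) entity seen
    mention_lengths[i] = length
    length -= 1

print(entity_types)      # [1. 1. 0. 0. 0. 0.]
print(mention_lengths)   # [2. 1. 1. 1. 1. 1.]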
Example #11
    def text_to_instance(
        self,
        annotation_id: str,
        document: str,
        query: str = None,
        label: str = None,
        rationale: List[int] = None,
        tokens_existing: List[str] = None,
    ) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields = {}

        tokens = [Token(w) for w in tokens_existing]
        rationale_tokens = rationale

        # Compare on Token.text; a Token object never compares equal to a raw string.
        keep_tokens = [1 if t.text == '[SEP]' else 0 for t in tokens]

        fields["document"] = TextField(tokens, self._token_indexers)
        fields["rationale"] = SequenceLabelField(rationale_tokens, fields["document"], "rationale_labels")

        metadata = {
            "annotation_id": annotation_id,
            "tokens": tokens,
            "keep_tokens": keep_tokens,
            "token_rationale": rationale_tokens,
            "document": document,
            "query": query,
            "convert_tokens_to_instance": self.convert_tokens_to_instance,
            "label": label,
        }

        fields["metadata"] = MetadataField(metadata)

        if label is not None:
            fields["label"] = LabelField(label, label_namespace="labels")

        return Instance(fields)
Example #12
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Finalize predictions.

        This method overrides ``Model.decode``, which gets called after ``Model.forward``, at test
        time, to finalize predictions. The logic for the decoder part of the encoder-decoder lives
        within the ``forward`` method.

        This method trims the output predictions to the first end symbol, replaces indices with
        corresponding tokens, and adds a field called ``predicted_tokens`` to the ``output_dict``.
        """
        predicted_indices = output_dict["predictions"]
        if not isinstance(predicted_indices, numpy.ndarray):
            predicted_indices = predicted_indices.detach().cpu().numpy()
        all_predicted_tokens = []
        for indices in predicted_indices:
            # Beam search gives us the top k results for each source sentence in the batch
            # but we just want the single best.
            if len(indices.shape) > 1:
                indices = indices[0]
            indices = list(indices)

            if self._end_index in indices:
                indices = indices[:indices.index(self._end_index)]
            predicted_tokens = list()
            for x in indices:
                if x in [self._end_index, self._start_index, self._pad_index]:
                    continue
                if x >= self._num_classes:
                    index = x - self._num_classes
                    predicted_tokens.append(Token("@entity_%d" % index))
                else:
                    w = self.vocab.get_token_from_index(x, namespace=self._target_namespace)
                    predicted_tokens.append(w)
            all_predicted_tokens.append(predicted_tokens)
        output_dict["predicted_tokens"] = all_predicted_tokens
        return output_dict
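A pure-Python toy version of the trimming and index-to-token mapping above, with made-up special indices and vocabulary, to show how indices at or beyond num_classes become copied "@entity_k" tokens:

# Hypothetical stand-ins for the model's special symbols and target vocabulary.
num_classes, start_index, end_index, pad_index = 5, 0, 1, 2
vocab = {3: "the", 4: "cat"}

indices = [0, 3, 7, 4, 1, 2]                  # @start@, "the", entity 2, "cat", @end@, padding
indices = indices[:indices.index(end_index)]  # trim at the first end symbol
predicted_tokens = []
for x in indices:
    if x in (start_index, end_index, pad_index):
        continue
    predicted_tokens.append("@entity_%d" % (x - num_classes) if x >= num_classes else vocab[x])
print(predicted_tokens)                       # ['the', '@entity_2', 'cat']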
Example #13
    def text_to_instance(self,
                         tokens: List[str],
                         arc_indices: List[Tuple[int, int]] = None,
                         arc_tags: List[str] = None,
                         upos_tags: List[str] = None,
                         xpos_tags: List[str] = None) -> Instance:

        fields: Dict[str, Field] = {}

        if self.use_lowercase:
            tokens = list(map(str.lower, tokens))
        tokens = self.tokenizer.tokenize(' '.join(tokens)) \
            if self.tokenizer is not None else [Token(t) for t in tokens]

        text_field = TextField(tokens, self._token_indexers)
        fields['tokens'] = text_field

        if upos_tags is not None:
            fields['upos_tags'] = SequenceLabelField(upos_tags,
                                                     text_field,
                                                     label_namespace='upos')

        if xpos_tags is not None:
            fields['xpos_tags'] = SequenceLabelField(xpos_tags,
                                                     text_field,
                                                     label_namespace='xpos')

        if arc_indices is not None and arc_tags is not None:
            fields['adjacency_matrix'] = AdjacencyField(
                arc_indices,
                text_field,
                arc_tags,
                label_namespace='dependency',
                padding_value=-1)

        return Instance(fields)
Example #14
 def test_get_valid_actions_in_world_without_comparable_columns(self):
     question_tokens = [Token(x) for x in ['what', 'was', 'the', 'first', 'title', '?']]
     table_file = self.FIXTURES_ROOT / 'data' / 'corenlp_processed_tables' / 'TEST-1.table'
     table_context = TableQuestionContext.read_from_file(table_file, question_tokens)
     # The table does not have date or number columns.
     assert "date" not in table_context.column_types.values()
     assert "number" not in table_context.column_types.values()
     world = WikiTablesVariableFreeWorld(table_context)
     actions = world.get_valid_actions()
     assert set(actions.keys()) == {
             "<r,<g,s>>",
             "<r,<g,r>>",
             "<r,<t,<s,r>>>",
             "<n,<n,<n,d>>>",
             "<r,r>",
             "<r,n>",
             "d",
             "n",
             "s",
             "t",
             "r",
             "@start@",
             }
     assert set([str(type_) for type_ in world.get_basic_types()]) == {'n', 'd', 's', 'r', 't', 'g'}
Example #15
    def _read(self, file_path: str) -> Iterator[Instance]:

        all_letters = string.ascii_letters + " .,;'"
        n_letters = len(all_letters)
        names = []
        countries = []

        # Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
        def unicodeToAscii(s):
            return ''.join(
                c for c in unicodedata.normalize('NFD', s)
                if unicodedata.category(c) != 'Mn' and c in all_letters)

        # Read a file and split into lines
        def readLines(file_path):
            lines = open(file_path,
                         encoding='utf-8').read().strip().split('\n')
            return [unicodeToAscii(line) for line in lines]

        lines = readLines(file_path)

        for pair in lines:
            yield self.name_to_instance([Token(pair.strip().split()[0])],
                                        pair.strip().split()[1])
Example #16
def main():
    opts = options()
    # load zero pronoun detector
    with open(opts.tagger_param_file, mode='rb') as f:
        tagger_params = pickle.load(f)
    print(tagger_params)
    tagger_vocab = Vocabulary.from_files(opts.tagger_vocab_file)
    tagger_model = Tagger.build(tagger_params, tagger_vocab)
    tagger_model, tagger_indexer = load_model(tagger_model, tagger_params,
                                              opts.tagger_model_file,
                                              opts.gpuid)
    # prepare dataset readers
    tagger_reader = TaggerDatasetReader(
        token_indexers={"tokens": tagger_indexer})

    with codecs.open(opts.input_file, "r",
                     encoding="utf8") as f_in, codecs.open(
                         opts.output_file, "w", encoding="utf8") as f_out:
        for line in f_in:
            line = line.strip()
            toks = [Token(tok) for tok in line.split(" ")]
            tagger_instance = tagger_reader.text_to_instance(toks)
            output = tagger_model.forward_on_instance(tagger_instance)
            f_out.write(" ".join(output["tags"]) + "\n")
Example #17
    def _process_sentence(
            self, sentence_tokens: List[str], verbal_predicates: List[int],
            predicate_argument_labels: List[List[str]]) -> List[Instance]:
        """
        Parameters
        ----------
        sentence_tokens : ``List[str]``, required.
            The tokenised sentence.
        verbal_predicates : ``List[int]``, required.
            The indexes of the verbal predicates in the
            sentence which have an associated annotation.
        predicate_argument_labels : ``List[List[str]]``, required.
            A list of predicate argument labels, one for each verbal_predicate. The
            internal lists are of length: len(sentence).

        Returns
        -------
        A list of Instances.

        """
        tokens = [Token(t) for t in sentence_tokens]
        if not verbal_predicates:
            # Sentence contains no predicates.
            tags = ["O" for _ in sentence_tokens]
            verb_label = [0 for _ in sentence_tokens]
            return [self.text_to_instance(tokens, verb_label, tags)]
        else:
            instances = []
            for verb_index, annotation in zip(verbal_predicates,
                                              predicate_argument_labels):
                tags = annotation
                verb_label = [0 for _ in sentence_tokens]
                verb_label[verb_index] = 1
                instances.append(
                    self.text_to_instance(tokens, verb_label, tags))
            return instances
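A toy illustration of the per-predicate expansion above: each verbal predicate yields its own instance with a one-hot verb indicator (sentence and indices are made up):

sentence = ["the", "cat", "sat", "and", "slept"]
verbal_predicates = [2, 4]
for verb_index in verbal_predicates:
    verb_label = [1 if i == verb_index else 0 for i in range(len(sentence))]
    print(verb_index, verb_label)
# 2 [0, 0, 1, 0, 0]
# 4 [0, 0, 0, 0, 1]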
Example #18
def fill_token_indices(tokens, text, uncased):
    new_tokens = []
    text_idx = 0

    if uncased:
        text = text.lower()

    for token in tokens:
        first_char_idx = 2 if len(
            token.text) > 2 and token.text[:2] == "##" else 0

        while text[text_idx] == ' ' or text[text_idx] == '\xa0':
            text_idx += 1

        new_tokens.append(Token(text=token.text, idx=text_idx))

        token_len = len(token.text) - first_char_idx

        if token.text == '[UNK]':
            token_len = 1

        text_idx += token_len

    return new_tokens
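A quick check of the character offsets produced by fill_token_indices, assuming the function above and AllenNLP's Token class are in scope; the wordpieces are made up:

from allennlp.data.tokenizers import Token

pieces = [Token("play"), Token("##ing"), Token("cards")]
aligned = fill_token_indices(pieces, "playing cards", uncased=True)
print([(t.text, t.idx) for t in aligned])   # [('play', 0), ('##ing', 4), ('cards', 8)]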
Example #19
    def clean(self, passage, question, answer, passage_tagging,
              question_tagging):
        passage_tokens = [Token(w) for w in passage_tagging['words']]
        spans = DropReader.find_valid_spans(passage_tokens, answer['spans'])

        new_answer_texts = []

        cleaned = False

        for answer_text in answer['spans']:
            valid = True

            for span in spans:
                span_text = ' '.join(passage_tagging['words'][span[0]:span[1] +
                                                              1]).lower()

                if answer_text.lower() != span_text:
                    continue

                if any(tag != 'O'
                       for tag in passage_tagging['tags'][span[0]:span[1] +
                                                          1]):
                    valid = False
                    cleaned = True
                    break

            if valid:
                new_answer_texts.append(answer_text)

        if not cleaned:
            return None

        new_answer = answer.copy()
        new_answer['spans'] = new_answer_texts

        return {'answer': new_answer}
Example #20
    def _read(self, file_path: str) -> Iterator[Instance]:
        with open(file_path, 'r', encoding='utf-8') as f:
            columns = next(f).strip().split(
                "\t")  # the first line holds the column names
            tokens, labels = [], []
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    if tokens:
                        yield self.text_to_instance(tokens, labels)
                    tokens, labels = [], []
                    continue
                if len(columns) == 4:
                    # for train.csv
                    _, _, word, gram = line.split('\t')
                    pos, _ = gram.split('#')
                    labels.append(pos)
                else:
                    # for test.csv
                    _, _, word = line.split('\t')
                tokens.append(Token(word))

            if tokens:
                yield self.text_to_instance(tokens, labels)
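A hypothetical train.csv row in the 4-column format this reader expects, showing how the POS tag is split off the grammatical-feature string:

line = "1\t1\tмама\tNOUN#Case=Nom|Gender=Fem|Number=Sing"   # made-up row
_, _, word, gram = line.split('\t')
pos, _ = gram.split('#')
print(word, pos)   # мама NOUN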
Example #21
    def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        if file_path[-8:] == 'test.txt':
            data = snips_reader('test.txt',
                                valid_class=self.valid_class,
                                random_seed=self.random_seed,
                                drop_empty=self.drop_empty)
        elif file_path[-9:] == 'train.txt':
            data = snips_reader('train.txt',
                                valid_class=self.valid_class,
                                random_seed=self.random_seed,
                                drop_empty=self.drop_empty)
        else:
            data = snips_reader('valid.txt',
                                valid_class=self.valid_class,
                                random_seed=self.random_seed,
                                drop_empty=self.drop_empty)
        # if file_path[-9:] == 'train.txt':
        #     print(data[:10])

        for fields in data:
            # unzipping trick returns tuples, but our Fields need lists

            tokens, ner_tags = [list(field) for field in zip(*fields)]
            # TextField requires ``Token`` objects
            tokens = [Token(token) for token in tokens]
            sequence = TextField(tokens, self._token_indexers)

            instance_fields: Dict[str, Field] = {'tokens': sequence}
            # Add "feature labels" to instance
            if 'ner' in self.feature_labels:
                instance_fields['ner_tags'] = SequenceLabelField(
                    ner_tags, sequence, "ner_tags")
            # Add "tag label" to instance
            instance_fields['tags'] = SequenceLabelField(ner_tags, sequence)
            yield Instance(instance_fields)
Example #22
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s",
                    file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path,
                                               self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]

            ##########################
            result = self.dependency_tree_predictor.predict(
                sentence=" ".join(sentence.words))
            # print(result['words'])
            root_dict = result['hierplane_tree']['root']
            adj = {}
            self.traverse_tree(adj, root_dict['word'], root_dict)
            predicte_adj = {}
            #########################
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, adj, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [
                        1 if label[-2:] == "-V" else 0 for label in tags
                    ]
                    yield self.text_to_instance(tokens, verb_indicator, adj,
                                                tags)
Example #23
 def process_documents(self, content):
     # documents = {}
     documents_obj = {}
     curr_id = -1
     for is_divider, lines in tqdm.tqdm(
             itertools.groupby(content, _is_divider)):
         # Ignore the document divider chunks, so that `lines`
         # corresponds to a single sentence.
         for line in lines:
             line = line.rstrip('\n')
             tokens = line.split()
             if tokens[0] == 'ID' and is_divider:
                 curr_id = tokens[1]
                 if curr_id in documents_obj:
                     warnings.warn(f'duplicate {curr_id}')
                 else:
                     # documents[curr_id] = []
                     documents_obj[curr_id] = Doc(curr_id, [])
             elif not (line.strip() == ''):
                 tokens = [Token(self.vocab[int(idx)]) for idx in tokens]
                 # documents[curr_id].append(tokens)
                 sent = SentLabel(tokens, ['O'] * len(tokens))
                 documents_obj[curr_id].sentences.append(sent)
     return documents_obj
Example #24
 def _read(self, file_path):
     # if `file_path` is a URL, redirect to the cache
     # file_path = cached_path(file_path)
     for filename in os.listdir(file_path):
         filename_splitted = filename.split('_')
         task_name = filename_splitted[-3]
         domain_name = filename_splitted[-2]
         if task_name not in self._tasks or domain_name not in self._domains:
             continue
         with open(os.path.join(file_path, filename), "r") as data_file:
             logger.info("Reading instances from lines in file at: %s",
                         filename)
             for line in Tqdm.tqdm(data_file):
                 line = line.strip("\n")
                 # skip blank lines
                 if not line:
                     continue
                 tokens_and_tags = [
                     pair.rsplit(self._word_tag_delimiter, 1)
                     for pair in line.split(self._token_delimiter)
                 ]
                 tokens_and_tags = ([['<<' + task_name + '>>', 'O'],
                                     ['<<' + domain_name + '>>', 'O']] +
                                    tokens_and_tags)
                 tokens = [Token(token) for token, tag in tokens_and_tags]
                 tags = [tag for token, tag in tokens_and_tags]
                 task_field = LabelField(task_name,
                                         label_namespace="task_labels")
                 sequence = TextField(tokens, self._token_indexers)
                 sequence_tags = SequenceLabelField(
                     tags, sequence, label_namespace='labels')
                 yield Instance({
                     'task_token': task_field,
                     'tokens': sequence,
                     'tags': sequence_tags
                 })
Example #25
    def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    fields = [list(field) for field in zip(*fields)]
                    if self.ignore_ner_tags:
                        tokens_, pos_tags, chunk_tags = fields[:3]
                        ner_tags = None
                    else:
                        tokens_, pos_tags, chunk_tags, ner_tags = fields
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens_]

                    yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
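A sketch of the divider-based grouping and unzipping used above, with a minimal stand-in for _is_divider (the real CoNLL-2003 helper also treats "-DOCSTART-" lines as dividers) and two made-up sentences:

import itertools

def _is_divider(line: str) -> bool:
    return line.strip() == ""

lines = ["Paris NNP B-NP B-LOC\n", "is VBZ B-VP O\n", "\n", "nice JJ B-ADJP O\n"]
for is_divider, chunk in itertools.groupby(lines, _is_divider):
    if not is_divider:
        fields = [line.strip().split() for line in chunk]
        print([list(field) for field in zip(*fields)])
# [['Paris', 'is'], ['NNP', 'VBZ'], ['B-NP', 'B-VP'], ['B-LOC', 'O']]
# [['nice'], ['JJ'], ['B-ADJP'], ['O']]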
Example #26
    def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)

        with open(file_path, 'r') as conll_file:
            logger.info(
                "Reading Target CONLL instances from CONLL "
                "dataset at: %s", file_path)
            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(conll_file,
                                                       _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if is_divider:
                    continue
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                tokens_ = fields[0]
                tags = fields[1]

                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, tags)
Example #27
def make_instance(question: str, choices: List[str]) -> List[Instance]:
    """Given a question and a list of choices text, convert to BERT NSP instances.

    Parameters
    ----------
    question : str
        Question
    choices : List[str]
        List of five choices

    Returns
    -------
    List[Instance]
        List of Allennlp Instances
    """
    question_tokens = TOKENIZER.tokenize(question)
    instances = []
    for choice in choices:
        choice_tokens = TOKENIZER.tokenize(choice)
        tokens = question_tokens + [Token('[SEP]')] + choice_tokens
        instance = Instance(
            {"tokens": TextField(tokens, {"bert": WORD_INDEXER})})
        instances.append(instance)
    return instances
Example #28
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s", file_path)
        if self._domain_identifier is not None:
            logger.info(
                "Filtering to only include file paths containing the %s domain",
                self._domain_identifier,
            )

        for sentence in self._ontonotes_subset(
            ontonotes_reader, file_path, self._domain_identifier
        ):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                    yield self.text_to_instance(tokens, verb_indicator, tags)
Example #29
def _fix_visual_concept(visual_concept, visual_concept_num, h5fn, pad_ind):
    """
    Turn a detection list into what we want: some text, as well as some tags.
    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """
    bert_embs = np.zeros([len(visual_concept), 768])
    new_tokenization_with_tags = []
    for i, tok in enumerate(visual_concept):
        new_tokenization_with_tags.append((tok, pad_ind))
        with h5py.File(h5fn, 'r') as h5:
            grp_items = {
                k: np.array(v)
                for k, v in h5[str(visual_concept_num[i])].items()
            }
            bert_embs[i, :] = grp_items['word']
    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags],
                              text_field)
    return text_field, tags
Example #30
    def clean(self, passage, question, answer, passage_tagging, question_tagging):
        passage_tokens = [Token(w) for w in passage_tagging['words']]
        spans = find_valid_spans(passage_tokens, answer['spans'])

        new_answer_texts = []

        cleaned = False

        for answer_text in answer['spans']:
            if self.should_remove_answer(answer_text):
                continue
            
            valid = True

            for span in spans:
                span_text = ' '.join(passage_tagging['words'][span[0]:span[1]+1]).lower()
                span_text = span_text.replace(' - ', '-')

                if answer_text.lower() != span_text:
                    continue
                
                if self.should_remove_span(passage_tagging['tags'][span[0]:span[1]+1]):
                    valid = False
                    cleaned = True
                    break

            if valid:
                new_answer_texts.append(answer_text)

        if not cleaned:
            return None
        
        new_answer = answer.copy()
        new_answer['spans'] = new_answer_texts

        return {'answer': new_answer}