    def test_count_vocab_items_respects_casing(self):
        indexer = SingleIdTokenIndexer("words")
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["words"] == {"hello": 1, "Hello": 1}

        indexer = SingleIdTokenIndexer("words", lowercase_tokens=True)
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["words"] == {"hello": 2}
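
The counters that count_vocab_items fills are exactly what a Vocabulary consumes. Below is a minimal sketch of that hand-off, assuming the AllenNLP 0.x API used throughout these examples; the variable names are illustrative:

from collections import defaultdict

from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Count lowercased word types into the "words" namespace.
indexer = SingleIdTokenIndexer("words", lowercase_tokens=True)
counter = defaultdict(lambda: defaultdict(int))
for text in ["Hello", "hello", "world"]:
    indexer.count_vocab_items(Token(text), counter)

# Build a vocabulary directly from the counter and look tokens up.
vocab = Vocabulary(counter=counter)
print(vocab.get_token_index("hello", namespace="words"))  # 2: padding and OOV occupy indices 0 and 1
print(vocab.get_vocab_size("words"))                      # 4: padding, OOV, "hello", "world"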
    def test_as_array_produces_token_sequence(self):
        indexer = SingleIdTokenIndexer("words")
        padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
        assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
    def __init__(self, tokenizer: Callable[[str], List[str]] = lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__()
        self._tokenizer = tokenizer
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
Example #4
    def __init__(self,
                 lazy: bool = False,
                 paper_features_path: str = None,
                 word_splitter: WordSplitter = None,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 data_file: Optional[str] = None,
                 samples_per_query: int = 5,
                 margin_fraction: float = 0.5,
                 ratio_hard_negatives: float = 0.5,
                 predict_mode: bool = False,
                 max_num_authors: Optional[int] = 5,
                 ratio_training_samples: Optional[float] = None,
                 max_sequence_length: Optional[int] = -1,
                 cache_path: Optional[str] = None,
                 overwrite_cache: Optional[bool] = False,
                 use_cls_token: Optional[bool] = None,
                 concat_title_abstract: Optional[bool] = None,
                 coviews_file: Optional[str] = None,
                 included_text_fields: Optional[str] = None,
                 use_paper_feature_cache: bool = True) -> None:
        """
        Args:
            lazy: if false returns a list
            paper_features_path: path to the paper features json file (result of scripts.generate_paper_features.py
            candidates_path: path to the candidate papers
            tokenizer: tokenizer to be used for tokenizing strings
            token_indexers: token indexer for indexing vocab
            data_file: path to the data file (e.g, citations)
            samples_per_query: number of triplets to generate for each query
            margin_fraction: minimum margin of co-views between positive and negative samples
            ratio_hard_negatives: ratio of training data that is selected from hard negatives
                remaining is allocated to easy negatives. should be set to 1.0 in case of similar click data
            predict_mode: if `True` the model only considers the current paper and returns an embedding
                otherwise the model uses the triplet format to train the embedder
            author_id_embedder: Embedder for author ids
            s2_id_embedder: Embedder for respresenting s2 ids
            other_id_embedder: Embedder for representing other ids (e.g., id assigned by metadata)
            max_num_authors: maximum number of authors,
            ratio_training_samples: Limits training to proportion of all training instances
            max_sequence_length: Longer sequences would be truncated (if -1 then there would be no truncation)
            cache_path: Path to file to cache instances, if None, instances won't be cached.
                If specified, instances are cached after being created so next time they are not created
                again from scratch
            overwrite_cache: If true, it overwrites the cached files. Each file corresponds to
                all instances created from the train, dev or test set.
            use_cls_token: Like bert, use an additional CLS token in the begginning (for transoformer)
            concat_title_abstract: Whether to consider title and abstract as a single field.
            coviews_file: Only for backward compatibility to work with older models (renamed to 
                `data_file` in newer models), leave this empty as it won't have any effect
            included_text_fields: space delimited fields to concat to the title: e.g., `title abstract authors`
            use_paper_feature_cache: set to False to disable the in-memory cache of paper features
        """
        super().__init__(lazy)
        self._word_splitter = word_splitter or SimpleWordSplitter()
        self._tokenizer = tokenizer or WordTokenizer(self._word_splitter)
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._token_indexer_author_id = {
            "tokens": SingleIdTokenIndexer(namespace='author')
        }
        self._token_indexer_author_position = \
            {"tokens": SingleIdTokenIndexer(namespace='author_positions')}

        self._token_indexer_venue = {
            "tokens": SingleIdTokenIndexer(namespace='venue')
        }
        self._token_indexer_id = {
            "tokens": SingleIdTokenIndexer(namespace='id')
        }

        with open(paper_features_path) as f_in:
            self.papers = json.load(f_in)
        self.samples_per_query = samples_per_query
        self.margin_fraction = margin_fraction
        self.ratio_hard_negatives = ratio_hard_negatives

        self.predict_mode = predict_mode
        self.max_sequence_length = max_sequence_length
        self.use_cls_token = use_cls_token

        if data_file and not predict_mode:
            # logger.info(f'reading contents of the file at: {data_file}')
            with open(data_file) as f_in:
                self.dataset = json.load(f_in)
            # logger.info(f'reading complete. Total {len(self.dataset)} records found.')
            root_path, _ = os.path.splitext(data_file)
            # for multitask interleaving reader, track which dataset the instance is coming from
            self.data_source = root_path.split('/')[-1]
        else:
            self.dataset = None
            self.data_source = None

        self.max_num_authors = max_num_authors

        self.triplet_generator = TripletGenerator(
            paper_ids=list(self.papers.keys()),
            coviews=self.dataset,
            margin_fraction=margin_fraction,
            samples_per_query=samples_per_query,
            ratio_hard_negatives=ratio_hard_negatives)
        self.paper_feature_cache = {
        }  # paper_id -> paper features. Serves as a cache for the _get_paper_features function

        self.ratio_training_samples = float(
            ratio_training_samples) if ratio_training_samples else None

        self.cache_path = cache_path
        self.overwrite_cache = overwrite_cache
        self.data_file = data_file
        self.paper_features_path = paper_features_path

        self.concat_title_abstract = concat_title_abstract
        self.included_text_fields = set(included_text_fields.split()) if included_text_fields else set()
        self.use_paper_feature_cache = use_paper_feature_cache

        self.abstract_delimiter = [Token('[SEP]')]
        self.author_delimiter = [Token('[unused0]')]
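
For reference, here is a minimal sketch of instantiating a reader with the constructor above; the class name PaperTripletDatasetReader and the file paths are illustrative placeholders, not names taken from the original project:

# Hypothetical usage of the constructor above; the class name and the paths
# below are placeholders, not part of the original project.
reader = PaperTripletDatasetReader(
    paper_features_path='data/paper_features.json',
    data_file='data/citations.json',
    samples_per_query=5,
    margin_fraction=0.5,
    ratio_hard_negatives=0.5,
    max_sequence_length=512,
    included_text_fields='title abstract',
)
# token_indexers defaults to {"tokens": SingleIdTokenIndexer()}, as shown above.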
    def test_as_array_produces_token_sequence(self):
        indexer = SingleIdTokenIndexer("words")
        padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
        assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
Example #6
    def __init__(
        self,
        lazy: bool = False,
        sample: int = -1,
        lf_syntax: str = None,
        replace_world_entities: bool = False,
        align_world_extractions: bool = False,
        gold_world_extractions: bool = False,
        tagger_only: bool = False,
        denotation_only: bool = False,
        world_extraction_model: Optional[str] = None,
        skip_attributes_regex: Optional[str] = None,
        entity_bits_mode: Optional[str] = None,
        entity_types: Optional[List[str]] = None,
        lexical_cues: List[str] = None,
        tokenizer: Tokenizer = None,
        question_token_indexers: Dict[str, TokenIndexer] = None,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._question_token_indexers = question_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._entity_token_indexers = self._question_token_indexers
        self._sample = sample
        self._replace_world_entities = replace_world_entities
        self._lf_syntax = lf_syntax
        self._entity_bits_mode = entity_bits_mode
        self._align_world_extractions = align_world_extractions
        self._gold_world_extractions = gold_world_extractions
        self._entity_types = entity_types
        self._tagger_only = tagger_only
        self._denotation_only = denotation_only
        self._skip_attributes_regex = None
        if skip_attributes_regex is not None:
            self._skip_attributes_regex = re.compile(skip_attributes_regex)
        self._lexical_cues = lexical_cues

        # Recording of entities in categories relevant for tagging
        all_entities = {}
        all_entities["world"] = ["world1", "world2"]
        # TODO: Clarify this into an appropriate parameter
        self._collapse_tags = ["world"]

        self._all_entities = None
        if entity_types is not None:
            if self._entity_bits_mode == "collapsed":
                self._all_entities = entity_types
            else:
                self._all_entities = [e for t in entity_types for e in all_entities[t]]

        logger.info(f"all_entities = {self._all_entities}")

        # Base world, depending on LF syntax only
        self._knowledge_graph = KnowledgeGraph(
            entities={"placeholder"}, neighbors={}, entity_text={"placeholder": "placeholder"}
        )
        self._world = QuarelWorld(self._knowledge_graph, self._lf_syntax)

        # Decide dynamic entities, if any
        self._dynamic_entities: Dict[str, str] = dict()
        self._use_attr_entities = False
        if "_attr_entities" in lf_syntax:
            self._use_attr_entities = True
            qr_coeff_sets = self._world.qr_coeff_sets
            for qset in qr_coeff_sets:
                for attribute in qset:
                    if (
                        self._skip_attributes_regex is not None
                        and self._skip_attributes_regex.search(attribute)
                    ):
                        continue
                    # Get text associated with each entity, both from entity identifier and
                    # associated lexical cues, if any
                    entity_strings = [words_from_entity_string(attribute).lower()]
                    if self._lexical_cues is not None:
                        for key in self._lexical_cues:
                            if attribute in LEXICAL_CUES[key]:
                                entity_strings += LEXICAL_CUES[key][attribute]
                    self._dynamic_entities["a:" + attribute] = " ".join(entity_strings)

        # Update world to include dynamic entities
        if self._use_attr_entities:
            logger.info(f"dynamic_entities = {self._dynamic_entities}")
            neighbors: Dict[str, List[str]] = {key: [] for key in self._dynamic_entities}
            self._knowledge_graph = KnowledgeGraph(
                entities=set(self._dynamic_entities.keys()),
                neighbors=neighbors,
                entity_text=self._dynamic_entities,
            )
            self._world = QuarelWorld(self._knowledge_graph, self._lf_syntax)

        self._stemmer = PorterStemmer().stemmer

        self._world_tagger_extractor = None
        self._extract_worlds = False
        if world_extraction_model is not None:
            logger.info("Loading world tagger model...")
            self._extract_worlds = True
            self._world_tagger_extractor = WorldTaggerExtractor(world_extraction_model)
            logger.info("Done loading world tagger model!")

        # Convenience regex for recognizing attributes
        self._attr_regex = re.compile(r"""\((\w+) (high|low|higher|lower)""")
Example #7
    def test_from_params_valid_vocab_extension_thoroughly(self):
        '''
        Tests valid vocab extension thoroughly: vocab extension is valid
        when overlapping namespaces have the same padding behaviour (padded/non-padded).
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        '''

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple", namespace="tokens0") # index:2
        original_vocab.add_token_to_namespace("bat", namespace="tokens0")   # index:3
        original_vocab.add_token_to_namespace("cat", namespace="tokens0")   # index:4

        original_vocab.add_token_to_namespace("apple", namespace="tokens1") # index:0
        original_vocab.add_token_to_namespace("bat", namespace="tokens1")   # index:1
        original_vocab.add_token_to_namespace("cat", namespace="tokens1")   # index:2

        original_vocab.add_token_to_namespace("a", namespace="tokens2") # index:0
        original_vocab.add_token_to_namespace("b", namespace="tokens2") # index:1
        original_vocab.add_token_to_namespace("c", namespace="tokens2") # index:2

        original_vocab.add_token_to_namespace("p", namespace="tokens3") # index:0
        original_vocab.add_token_to_namespace("q", namespace="tokens3") # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens0": SingleIdTokenIndexer("tokens0")})
        text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                     "text4": text_field4, "text5": text_field5})])

        params = Params({"directory_path": vocab_dir,
                         "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens5"]})
        extended_vocab = Vocabulary.from_params(params, instances)

        # namespaces tokens0 and tokens1 are common; tokens2 and tokens3 appear only in
        # the original vocab; tokens4 and tokens5 only in the instances
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that _non_padded_namespaces is consistent after extension
        assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

        # original_vocab["tokens1"] has 3 tokens; the instances add 3 new ones (an, banana, atom)
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size("tokens0") == 8 # 3 original + 3 new + padding + OOV, because padded

        # namespaces tokens2 and tokens3 were only in original_vocab,
        # so their token counts should be unchanged in extended_vocab
        assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

        # namespaces tokens4 and tokens5 were only in instances,
        # so their sizes come entirely from the instances
        assert extended_vocab.get_vocab_size("tokens4") == 6 # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3 # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
                assert vocab_token == extended_vocab_token
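
The docstring above states the rule: extension is only valid when overlapping namespaces agree on padded vs. non-padded. For contrast, here is a minimal sketch of the invalid case, reusing the same AllenNLP 0.x API as the tests on this page; the names and tokens are illustrative:

import copy

import pytest

from allennlp.common.checks import ConfigurationError
from allennlp.common.params import Params
from allennlp.data import Batch, Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# "tokens0" is padded in the original vocabulary ...
original_vocab = Vocabulary(non_padded_namespaces=[])
original_vocab.add_token_to_namespace("apple", namespace="tokens0")

text_field = TextField([Token("banana")],
                       {"tokens0": SingleIdTokenIndexer("tokens0")})
instances = Batch([Instance({"text0": text_field})])

# ... but is declared non-padded for the extension, so the extension is rejected.
extended_vocab = copy.copy(original_vocab)
params = Params({"non_padded_namespaces": ["tokens0"]})
with pytest.raises(ConfigurationError):
    extended_vocab.extend_from_instances(params, instances)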
Example #8
    def __init__(self, tokenizer=None, token_indexers=None, lazy=False):
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
Example #9
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)

    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(
        start_tokens=['<s>'],
        end_tokens=['</s>'],
    )
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop(
        'train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(
        dataset=train_labeled_dataset, max_length=30)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(
            train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(
            dataset=train_unlabeled_dataset, max_length=30)
    else:
        train_unlabeled_dataset = []

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)

    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = SeparatedSNLIModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters())
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    trainer = SeparatedLVMTrainer(
        model=model,
        optimizer=optimizer,
        labeled_iterator=labeled_iterator,
        unlabeled_iterator=unlabeled_iterator,
        train_labeled_dataset=train_labeled_dataset,
        train_unlabeled_dataset=train_unlabeled_dataset,
        validation_dataset=valid_dataset,
        summary_writer=summary_writer,
        serialization_dir=save_dir,
        num_epochs=train_params.pop('num_epochs', 50),
        iters_per_epoch=len(train_labeled_dataset) // labeled_batch_size,
        write_summary_every=100,
        validate_every=2000,
        patience=2,
        clip_grad_max_norm=5,
        cuda_device=train_params.pop_int('cuda_device', 0))
    trainer.train()
    def __init__(self):
        self.tokenizer = WordTokenizer()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
Example #12
    def __init__(self, max_length: int):
        super().__init__()
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}
        self.max_length = max_length
Example #13
        return Instance(fields)

    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        df = pd.read_csv(file_path)
        if config.testing: df = df.head(1000)
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["comment_text"])],
                row["id"],
                row[label_cols].values,
            )


token_indexer = SingleIdTokenIndexer()


def tokenizer(x: str):
    return [
        w.text
        for w in SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).
        split_words(x)[:config.max_seq_len]
    ]


reader = JigsawDatasetReader(tokenizer=tokenizer,
                             token_indexers={"tokens": token_indexer})
train_ds, test_ds = (reader.read(DATA_ROOT / fname)
                     for fname in ["train.csv", "test_proced.csv"])
val_ds = None
Example #14
    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        print(field)
Example #15
def main():
    token_indexer = SingleIdTokenIndexer()
    reader = JigsawDatasetReader(
        tokenizer=custom_tokenizer(),
        token_indexers={"tokens": token_indexer},
    )

    # Kaggle's multi-label Toxic Comment Classification Challenge
    dataset_root = Path('../../data/jigsaw')
    train_dataset, dev_dataset = (reader.read(
        dataset_root / fname) for fname in ["train.csv", "test_proced.csv"])

    print(
        f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}"
    )

    # Build the vocabulary from the datasets
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    # Build the network: an LSTM encoder followed by a linear classifier
    embedding_dim = 300
    hidden_dim = 128
    token_embedding = Embedding(num_embeddings=vocab_dim,
                                embedding_dim=embedding_dim)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(embedding_dim,
                      hidden_dim,
                      bidirectional=True,
                      batch_first=True))
    model = MultiLabelClassifier(word_embeddings, 0.5, encoder, 0.2,
                                 len(label_cols), vocab)

    # allennlp does not currently seem to support single-machine multi-GPU training, or its support performs poorly
    gpu_id = 0 if torch.cuda.is_available() else -1
    if gpu_id > -1: model.cuda(gpu_id)

    # Build the iterator and index it with the vocab
    iterator = BucketIterator(batch_size=32,
                              sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # --------------------- forward demo ----------------------
    # generator = iter(iterator(train_dataset, shuffle=True))
    # for _ in range(5):
    #     batch = next(generator)
    #     print('---\nbatch ', batch.keys(), batch['tokens'].keys(), batch['tokens']['tokens'].shape, batch['label'].shape) # [batch, sentence_len, token_len]
    #     batch = move_to_device(batch, gpu_id)
    #     tokens = batch['tokens']
    #
    #     # option1. forward one step by one
    #     mask = get_text_field_mask(tokens)
    #     embeddings = model.word_embeddings(tokens)
    #     print("embeddings: ", embeddings.shape)
    #     state = model.encoder(embeddings, mask)
    #     class_logits = model.linear(state)
    #
    #     print("lstm state: ", state.shape, class_logits.shape)
    #
    #     # option2. do forward on the model
    #     y = model(**batch)
    #     metric = model.get_metrics()
    #     print("model out: ", y, '\n', metric)

    # --------------------- train ---------------------
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-4,
                                 weight_decay=1e-5)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=dev_dataset,
        # serialization_dir="./models/",
        cuda_device=gpu_id,
        patience=10,
        num_epochs=20)
    trainer.train()
Example #16
def build_tasks(args):
    '''Main logic for preparing tasks, doing so by
    1) creating / loading the tasks
    2) building / loading the vocabulary
    3) building / loading the word vectors
    4) indexing each task's data
    5) initializing lazy loaders (streaming iterators)
    '''

    # 1) create / load tasks
    tasks, train_task_names, eval_task_names = \
        get_tasks(parse_task_list_arg(args.train_tasks),
                  parse_task_list_arg(args.eval_tasks), args.max_seq_len,
                  path=args.data_dir, scratch_path=args.exp_dir,
                  load_pkl=bool(not args.reload_tasks),
                  nli_prob_probe_path=args['nli-prob'].probe_path,
                  max_targ_v_size=args.max_targ_word_v_size)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name,
                                               "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    # 2) build / load vocab and indexers
    indexers = {}
    if not args.word_embs == 'none':
        indexers["words"] = SingleIdTokenIndexer()
    if args.elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.openai_transformer:
        assert not indexers, ("OpenAI transformer is not supported alongside"
                              " other indexers due to tokenization!")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer(
            "openai_bpe")
        # Exit if any tasks are not compatible with this tokenization.
        for task in tasks:
            assert task.tokenizer_name == "OpenAI.BPE", \
                (f"Task '{task.name:s}' not compatible with OpenAI "
                  "Transformer model. For edge probing, use -openai versions.")

    vocab_path = os.path.join(args.exp_dir, 'vocab')
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size('tokens')
    args.max_char_v_size = vocab.get_vocab_size('chars')

    # 3) build / load word vectors
    word_embs = None
    if args.word_embs != 'none':
        emb_file = os.path.join(args.exp_dir, 'embs.pkl')
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            word_embs = pkl.load(open(emb_file, 'rb'))
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        "Flag reload_indexing was set, but no tasks are set to reindex (use -o \"args.reindex_tasks = \"task1,task2,...\"\")"
    )
    for task in tasks:
        force_reindex = (args.reload_indexing and task.name in reindex_tasks)
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(
                task.name, split, "preproc")
            cache_found = _find_cached_file(args.exp_dir,
                                            args.global_ro_exp_dir,
                                            relative_path,
                                            log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(
                    task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(task, split, indexers, vocab, record_file)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text as well?
        task.train_data = None
        task.val_data = None
        task.test_data = None
        log.info("\tTask '%s': cleared in-memory data.", task.name)

    log.info("\tFinished indexing tasks")

    # 5) Initialize tasks with data iterators.
    assert not (args.training_data_fraction < 1 and args.eval_data_fraction < 1), \
        "training_data_fraction and eval_data_fraction cannot both be < 1 at the same time"
    train_tasks = []
    eval_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir)
        task.test_data = _get_instance_generator(task.name, "test",
                                                 preproc_dir)
        # When using training_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if args.training_data_fraction < 1 and task.name in train_task_names:
            log.info("Creating trimmed pretraining-only version of " +
                     task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name,
                "train",
                preproc_dir,
                fraction=args.training_data_fraction)
            train_tasks.append(task)
            if task.name in eval_task_names:
                # Rebuild the iterator so we see the full dataset in the eval training
                # phase. It will create a deepcopy of the task object
                # and therefore there could be two tasks with the same name (task.name).
                log.info("Creating un-trimmed eval training version of " +
                         task.name + " train.")
                log.warn(
                    "When using the un-trimmed eval-training version of the train split, "
                    "a deepcopy of the task object is created, which is inefficient."
                )
                task = copy.deepcopy(task)
                task.train_data = _get_instance_generator(task.name,
                                                          "train",
                                                          preproc_dir,
                                                          fraction=1.0)
                eval_tasks.append(task)

        # When using eval_data_fraction, we need modified iterators
        # only for training datasets at train_for_eval time.
        elif args.eval_data_fraction < 1 and task.name in eval_task_names:
            log.info("Creating trimmed train-for-eval-only version of " +
                     task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name,
                "train",
                preproc_dir,
                fraction=args.eval_data_fraction)
            eval_tasks.append(task)
            if task.name in train_task_names:
                # Rebuild the iterator so we see the full dataset in the pretraining
                # phase. It will create a deepcopy of the task object
                # and therefore there could be two tasks with the same name (task.name).
                log.info("Creating un-trimmed pretraining version of " +
                         task.name + " train.")
                log.warn(
                    "When using the un-trimmed pretraining version of the train split, "
                    "a deepcopy of the task object is created, which is inefficient."
                )
                task = copy.deepcopy(task)
                task.train_data = _get_instance_generator(task.name,
                                                          "train",
                                                          preproc_dir,
                                                          fraction=1.0)
                train_tasks.append(task)
        # When neither eval_data_fraction nor training_data_fraction is specified
        # we use unmodified iterators.
        else:
            task.train_data = _get_instance_generator(task.name,
                                                      "train",
                                                      preproc_dir,
                                                      fraction=1.0)
            if task.name in train_task_names:
                train_tasks.append(task)
            if task.name in eval_task_names:
                eval_tasks.append(task)

        log.info("\tLazy-loading indexed data for task='%s' from %s",
                 task.name, preproc_dir)
    log.info("All tasks initialized with data iterators.")
    log.info('\t  Training on %s', ', '.join(train_task_names))
    log.info('\t  Evaluating on %s', ', '.join(eval_task_names))
    return train_tasks, eval_tasks, vocab, word_embs
Example #17
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._tokenizer = WordTokenizer()
Example #18
    def __init__(self, token_indexers=None, lazy=False):
        super(CcgBankDatasetReader, self).__init__(lazy=lazy)
        self._token_indexers = token_indexers or {
            u'tokens': SingleIdTokenIndexer()
        }
"""
brief

Authors: panxu([email protected])
Date:    2018/12/18 09:39:00
"""

from allennlp.data.fields import TextField
from allennlp.data import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data import Token

words = ["All", "the", "cool", "kids", "use", "character", "embeddings", "."]
sentence1 = TextField(tokens=[Token(w) for w in words],
                      token_indexers={
                          'tokens':
                          SingleIdTokenIndexer(namespace='token_ids'),
                          'characters':
                          TokenCharactersIndexer(namespace='token_characters')
                      })

words2 = ["I", "prefer", "word2vec", "though", "..."]
sentence2 = TextField(tokens=[Token(w) for w in words2],
                      token_indexers={
                          'tokens':
                          SingleIdTokenIndexer(namespace='token_ids'),
                          'characters':
                          TokenCharactersIndexer(namespace='token_characters')
                      })

instance1 = Instance({'sentence': sentence1})
instance2 = Instance({'sentence': sentence2})
from allennlp.data import Batch, Instance, Token, Vocabulary
from allennlp.data.dataset_readers.dataset_utils.span_utils import enumerate_spans
from allennlp.data.fields import TextField, ListField, SpanField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.modules.span_extractors import EndpointSpanExtractor
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

# Create an instance with multiple spans
tokens = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas', '.']
tokens = [Token(token) for token in tokens]
token_indexers = {'tokens': SingleIdTokenIndexer()}
text_field = TextField(tokens, token_indexers=token_indexers)

spans = [(2, 3), (5, 6)]  # ('an', 'elephant') and ('my', 'pajamas')
span_fields = ListField(
    [SpanField(start, end, text_field) for start, end in spans])

instance = Instance({'tokens': text_field, 'spans': span_fields})

# Alternatively, you can also enumerate all spans
spans = enumerate_spans(tokens, max_span_width=3)
print('all spans up to length 3:')
print(spans)


def filter_function(span_tokens):
    return not any(t == Token('.') for t in span_tokens)


# Spans can also be filtered, e.g. to drop any span containing the final period:
spans = enumerate_spans(tokens,
                        max_span_width=3,
                        filter_function=filter_function)
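
The imports above also bring in Embedding, BasicTextFieldEmbedder, and EndpointSpanExtractor. Here is a minimal sketch of wiring them to the instance built above; the embedding dimension and the use of randomly initialized embeddings are illustrative assumptions, not part of the original snippet:

# Sketch only: embed the tokens and pool each span by its endpoint representations.
vocab = Vocabulary()
for token in tokens:
    vocab.add_token_to_namespace(token.text, namespace='tokens')

embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=8)
embedder = BasicTextFieldEmbedder({'tokens': embedding})

batch = Batch([instance])
batch.index_instances(vocab)
tensors = batch.as_tensor_dict(batch.get_padding_lengths())

embedded = embedder(tensors['tokens'])               # shape: (1, num_tokens, 8)
span_extractor = EndpointSpanExtractor(input_dim=8)  # default combination is "x,y"
span_reps = span_extractor(embedded, tensors['spans'])
print(span_reps.shape)                               # (1, num_spans, 16)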
Example #21
    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()
Example #22
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            return Instance({
                'source_tokens': source_field,
                'target_tokens': target_field
            })

        else:
            return Instance({'source_tokens': source_field})

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        instance.fields[
            'source_tokens']._token_indexers = self.source_token_indexers
        instance.fields[
            'target_tokens']._token_indexers = self._target_token_indexers


if __name__ == '__main__':
    dataset_reader = Seq2SeqDatasetReader(
        source_token_indexers={
            "tokens": SingleIdTokenIndexer(namespace='source_tokens')
        },
        target_token_indexers={
            "tokens": SingleIdTokenIndexer(namespace='target_tokens')
        })

    instances = list(dataset_reader.read('./data/reverse/train.csv'))
    print(instances[0])
Example #23
    def setUp(self):
        super(TestSpanField, self).setUp()
        self.text = TextField([
            Token(t) for t in
            [u"here", u"is", u"a", u"sentence", u"for", u"spans", u"."]
        ], {u"words": SingleIdTokenIndexer(u"words")})
Example #24
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True) # word-level indexer with lowercasing
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    # train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt') # original source
    train_data = reader.read('train.txt') # local copy
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    # dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt') # original source
    dev_data = reader.read('dev.txt') # local copy
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        # embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip" #origin
        embedding_path = "crawl-300d-2M.vec.zip" #local

        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (it's been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda() # RNNs cannot do a backward pass in eval mode, so keep the model in train mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model) # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train() # RNNs cannot do a backward pass in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True), group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train() # RNNs cannot do a backward pass in eval mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
    def setUp(self):
        super(TestSequenceLabelField, self).setUp()
        self.text = TextField(
            [Token(t) for t in ["here", "are", "some", "words", "."]],
            {"words": SingleIdTokenIndexer("words")})
Example #26
    def setUp(self):
        super(BidirectionalAttentionFlowTest, self).setUp()

        constants.GLOVE_PATH = 'tests/fixtures/glove.6B.100d.sample.txt.gz'
        reader_params = Params({
            'token_indexers': {
                'tokens': {
                    'type': 'single_id'
                },
                'token_characters': {
                    'type': 'characters'
                }
            }
        })
        dataset = SquadReader.from_params(reader_params).read(
            'tests/fixtures/data/squad.json')
        vocab = Vocabulary.from_dataset(dataset)
        self.vocab = vocab
        dataset.index_instances(vocab)
        self.dataset = dataset
        self.token_indexers = {
            'tokens': SingleIdTokenIndexer(),
            'token_characters': TokenCharactersIndexer()
        }

        self.model = BidirectionalAttentionFlow.from_params(
            self.vocab, Params({}))

        small_params = Params({
            'text_field_embedder': {
                'tokens': {
                    'type': 'embedding',
                    'pretrained_file': constants.GLOVE_PATH,
                    'trainable': False,
                    'projection_dim': 4
                },
                'token_characters': {
                    'type': 'character_encoding',
                    'embedding': {
                        'embedding_dim': 8
                    },
                    'encoder': {
                        'type': 'cnn',
                        'embedding_dim': 8,
                        'num_filters': 4,
                        'ngram_filter_sizes': [5]
                    }
                }
            },
            'phrase_layer': {
                'type': 'lstm',
                'bidirectional': True,
                'input_size': 8,
                'hidden_size': 4,
                'num_layers': 1,
            },
            'similarity_function': {
                'type': 'linear',
                'combination': 'x,y,x*y',
                'tensor_1_dim': 8,
                'tensor_2_dim': 8
            },
            'modeling_layer': {
                'type': 'lstm',
                'bidirectional': True,
                'input_size': 32,
                'hidden_size': 4,
                'num_layers': 1,
            },
            'span_end_encoder': {
                'type': 'lstm',
                'bidirectional': True,
                'input_size': 56,
                'hidden_size': 4,
                'num_layers': 1,
            },
        })
        self.small_model = BidirectionalAttentionFlow.from_params(
            self.vocab, small_params)
def main():
    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience  (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--emb_size', type=int, default=256,
                        help='elmo embeddings size (default: 256)')
    parser.add_argument('--model_name', type=str, default='baseline',
                        help='model name (default: baseline)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    annotations = pd.read_csv('data/data/annotations_merged.csv')
    prompts = pd.read_csv('data/data/prompts_merged.csv')

    feature_dictionary = {}
    prompts_dictionary = {}

    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'], row['Comparator']]

    for index, row in annotations.iterrows():
        if row['PMCID'] not in feature_dictionary:
            feature_dictionary[row['PMCID']] = []
        feature_dictionary[row['PMCID']].append([row['Annotations'], row['Label']]
                                                + prompts_dictionary[row['PromptID']])

    train = []
    valid = []
    test = []

    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))

    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))

    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    elmo_token_indexer = {'elmo': ELMoTokenCharactersIndexer(), 'tokens': SingleIdTokenIndexer()}

    reader = EIDatasetReader(elmo_token_indexer, feature_dictionary)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    urls = [
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_options.json',
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_'
        '2xhighway_weights.hdf5'
    ]

    elmo_token_embedding = ElmoTokenEmbedder(urls[0], urls[1], dropout=args.dropout, requires_grad=args.tunable,
                                             projection_dim=args.emb_size)

    word_embeddings = BasicTextFieldEmbedder({'elmo': elmo_token_embedding}, allow_unmatched_keys=True)

    model = Baseline(word_embeddings, vocab)

    cuda_device = args.cuda_device

    if torch.cuda.is_available():
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('article', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
Example #28
def _get_default_indexer() -> SingleIdTokenIndexer:
    return SingleIdTokenIndexer(namespace='tokens',
                                start_tokens=[START_SYMBOL],
                                end_tokens=[END_SYMBOL])
Example #29
    def test_valid_vocab_extension(self):
        vocab_dir = self.TEST_DIR / 'vocab_save'
        extension_ways = ["from_params", "extend_from_instances"]
        # Test: padded/non-padded common namespaces are extending appropriately
        non_padded_namespaces_list = [[], ["tokens"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace("d", namespace="tokens")
            original_vocab.add_token_to_namespace("a", namespace="tokens")
            original_vocab.add_token_to_namespace("b", namespace="tokens")
            text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                                   {"tokens": SingleIdTokenIndexer("tokens")})
            instances = Batch([Instance({"text": text_field})])
            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params({"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({"directory_path": vocab_dir, "extend": True,
                                     "non_padded_namespaces": non_padded_namespaces})
                    extended_vocab = Vocabulary.from_params(params, instances)

                extra_count = 2 if extended_vocab.is_padded("tokens") else 0
                assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
                assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
                assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count

                assert extended_vocab.get_token_index("c", "tokens") # should be present
                assert extended_vocab.get_token_index("e", "tokens") # should be present

                assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

        # Test: padded/non-padded non-common namespaces are extended appropriately
        non_padded_namespaces_list = [[],
                                      ["tokens1"],
                                      ["tokens1", "tokens2"]]
        for non_padded_namespaces in non_padded_namespaces_list:
            original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
            original_vocab.add_token_to_namespace("a", namespace="tokens1")  # index 2 if "tokens1" is padded, else 0
            text_field = TextField([Token(t) for t in ["b"]],
                                   {"tokens2": SingleIdTokenIndexer("tokens2")})
            instances = Batch([Instance({"text": text_field})])

            for way in extension_ways:
                if way == "extend_from_instances":
                    extended_vocab = copy.copy(original_vocab)
                    params = Params({"non_padded_namespaces": non_padded_namespaces})
                    extended_vocab.extend_from_instances(params, instances)
                else:
                    shutil.rmtree(vocab_dir, ignore_errors=True)
                    original_vocab.save_to_files(vocab_dir)
                    params = Params({"directory_path": vocab_dir, "extend": True,
                                     "non_padded_namespaces": non_padded_namespaces})
                    extended_vocab = Vocabulary.from_params(params, instances)

                # Should have two namespaces
                assert len(extended_vocab._token_to_index) == 2

                extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
                assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count

                extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
                assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
Example #30
    def setUp(self):
        super(TestTagField, self).setUp()
        self.text = TextField([Token(t) for t in ["here", "are", "some", "words", "."]],
                              {"words": SingleIdTokenIndexer("words")})
Example #31
def train(train_path, validation_path, optimizer_name):
    batch_size = 32
    learning_rate = 0.01
    max_iterations = 100

    token_indexer = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }

    reader = Conll2003DatasetReader(token_indexer)

    train_dataset = reader.read(train_path)

    validation_dataset = reader.read(validation_path)

    # Once we've read in the datasets, we use them to create our Vocabulary
    # (that is, the mapping[s] from tokens / labels to ids).
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    # Build the model from the vocabulary

    model = get_model(vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    if optimizer_name == 'adahessian':
        optimizer = Adahessian(model.parameters(),
                               lr=learning_rate,
                               block_length=2)
    elif optimizer_name == 'ranger':
        optimizer = Ranger(model.parameters(), lr=learning_rate)
    else:
        raise ValueError(f"Unknown optimizer: {optimizer_name}")

    train_dataset.index_with(vocab)
    validation_dataset.index_with(vocab)

    scheduler = ReduceOnPlateauLearningRateScheduler(optimizer,
                                                     factor=0.5,
                                                     patience=4,
                                                     mode="min",
                                                     verbose=True)

    dl = PyTorchDataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )

    dl_validation = PyTorchDataLoader(validation_dataset,
                                      batch_size=batch_size,
                                      shuffle=False)

    trainer = AdaTrainer(
        model=model,
        optimizer=optimizer,
        grad_norm=10.0,
        data_loader=dl,
        validation_data_loader=dl_validation,
        learning_rate_scheduler=scheduler,
        patience=8,
        num_epochs=max_iterations,
        cuda_device=cuda_device,
    )
    train_metrics = trainer.train()
    print(train_metrics)
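For reference, a hypothetical invocation of the function above; the CoNLL-2003 file paths are placeholders, and get_model, Adahessian, Ranger and AdaTrainer are assumed to be defined elsewhere in the project.

if __name__ == '__main__':
    # Placeholder paths; point these at real CoNLL-2003 splits.
    train('data/conll2003/eng.train', 'data/conll2003/eng.testa', 'ranger')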
def build_tasks(args):
    '''Prepare tasks'''

    def parse_tasks(task_list):
        '''parse string of tasks'''
        if task_list == 'all':
            tasks = ALL_TASKS
        elif task_list == 'none':
            tasks = []
        else:
            tasks = task_list.split(',')
        return tasks

    train_task_names = parse_tasks(args.train_tasks)
    eval_task_names = parse_tasks(args.eval_tasks)
    all_task_names = list(set(train_task_names + eval_task_names))
    tasks = get_tasks(all_task_names, args.max_seq_len, args.load_tasks)

    max_v_sizes = {'word': args.max_word_v_size}
    token_indexer = {}
    if args.elmo:
        token_indexer["elmo"] = ELMoTokenCharactersIndexer("elmo")
        if not args.elmo_no_glove:
            token_indexer["words"] = SingleIdTokenIndexer()
    else:
        token_indexer["words"] = SingleIdTokenIndexer()

    vocab_path = os.path.join(args.exp_dir, 'vocab')
    preproc_file = os.path.join(args.exp_dir, args.preproc_file)
    if args.load_preproc and os.path.exists(preproc_file):
        preproc = pkl.load(open(preproc_file, 'rb'))
        vocab = Vocabulary.from_files(vocab_path)
        word_embs = preproc['word_embs']
        for task in tasks:
            train, val, test = preproc[task.name]
            task.train_data = train
            task.val_data = val
            task.test_data = test
        log.info("\tFinished building vocab. Using %d words",
                 vocab.get_vocab_size('tokens'))
        log.info("\tLoaded data from %s", preproc_file)
    else:
        log.info("\tProcessing tasks from scratch")
        word2freq = get_words(tasks)
        vocab = get_vocab(word2freq, max_v_sizes)
        word_embs = get_embeddings(vocab, args.word_embs_file, args.d_word)
        preproc = {'word_embs': word_embs}
        for task in tasks:
            train, val, test = process_task(task, token_indexer, vocab)
            task.train_data = train
            task.val_data = val
            task.test_data = test
            del_field_tokens(task)
            preproc[task.name] = (train, val, test)
        log.info("\tFinished indexing tasks")
        pkl.dump(preproc, open(preproc_file, 'wb'))
        vocab.save_to_files(vocab_path)
        log.info("\tSaved data to %s", preproc_file)
        del word2freq
    del preproc

    train_tasks = [task for task in tasks if task.name in train_task_names]
    eval_tasks = [task for task in tasks if task.name in eval_task_names]
    log.info('\t  Training on %s', ', '.join([task.name for task in train_tasks]))
    log.info('\t  Evaluating on %s', ', '.join([task.name for task in eval_tasks]))
    return train_tasks, eval_tasks, vocab, word_embs
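A hedged usage sketch for build_tasks: the Namespace fields below simply mirror the attributes the function reads (train_tasks, eval_tasks, max_seq_len, load_tasks, max_word_v_size, elmo, elmo_no_glove, exp_dir, preproc_file, load_preproc, word_embs_file, d_word), every value is an illustrative placeholder, and the helpers it calls (ALL_TASKS, get_tasks, get_words, get_vocab, get_embeddings, process_task, del_field_tokens) come from the surrounding project.

from argparse import Namespace

# Placeholder configuration; task names, paths and sizes are illustrative only.
args = Namespace(
    train_tasks='sst,mnli', eval_tasks='all', max_seq_len=40, load_tasks=True,
    max_word_v_size=30000, elmo=False, elmo_no_glove=False,
    exp_dir='exp/demo', preproc_file='preproc.pkl', load_preproc=False,
    word_embs_file='glove.840B.300d.txt', d_word=300,
)
train_tasks, eval_tasks, vocab, word_embs = build_tasks(args)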