from typing import Dict, Iterable, List

import numpy as np

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import ArrayField, LabelField, TextField
from allennlp.data.token_indexers import (
    ELMoTokenCharactersIndexer,
    SingleIdTokenIndexer,
    TokenCharactersIndexer,
    TokenIndexer,
)
from allennlp.data.tokenizers import SpacyTokenizer, Token, WhitespaceTokenizer


class IOBDatasetReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.tokenizer_space = WhitespaceTokenizer()
        self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                              pos_tags=True,
                                              split_on_spaces=True)
        # The utterance and the intent are indexed the same way: ELMo character
        # ids, a character-level view, and the spaCy POS tag of each token.
        self.token_indexers = {
            'elmo_tokens': ELMoTokenCharactersIndexer(),
            'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                       min_padding_length=2),
            'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                             default_value='NNP',
                                             feature_name='tag_'),
        }
        self.intent_indexers = {
            'elmo_tokens': ELMoTokenCharactersIndexer(),
            'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                       min_padding_length=2),
            'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                             default_value='NNP',
                                             feature_name='tag_'),
        }

    def text_to_instance(self,
                         tokens: List[Token],
                         intent: List[Token],
                         rmf: str = None,
                         label: str = None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        intent_field = TextField(intent, self.intent_indexers)
        fields = {"utterance": sentence_field, "intent": intent_field}
        if label:
            fields["label"] = LabelField(label)
        if rmf:
            rmf = np.fromstring(rmf, dtype=float, sep=' ')
            fields["rmf"] = ArrayField(rmf)
        return Instance(fields)

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as f:
            for line in f:
                sentence, intent, rmf, label = line.strip().split('\t')
                yield self.text_to_instance(self.tokenizer_space.tokenize(sentence),
                                            self.tokenizer_space.tokenize(intent),
                                            rmf,
                                            label)

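# A minimal usage sketch for the reader above. The file name and the example
# line are hypothetical; the tab-separated layout (utterance, intent,
# space-separated RMF vector, label) is taken from _read.
reader = IOBDatasetReader()
# Each line of the file looks like:
# "book a flight\tBookFlight\t0.1 0.3 0.2\tpositive"
for instance in reader.read('train.tsv'):
    print(instance)
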
class CCGReader(SequenceTaggingDatasetReader):
    def __init__(self, *args, **kwargs):
        super(CCGReader, self).__init__(*args, **kwargs)
        self.tokenizer = WhitespaceTokenizer()

    def text_to_instances(self, text: str) -> Iterable[Instance]:
        instances = []
        for line in text.split('\n'):
            instance = self.text_to_instance(line)
            if instance:
                instances.append(instance)
        return instances

    def text_to_instance(self, line: str) -> Optional[Instance]:
        tokens = []
        tags = []
        toks_tags = self.tokenizer.tokenize(line)
        if not toks_tags:
            return None
        for tok_tag in toks_tags:
            tok, *tag = tok_tag.text.split(self._word_tag_delimiter)
            tokens.append(Token(tok))
            # SequenceLabelField expects string labels, so append the tag
            # itself (or UNK when the token carries no tag) rather than the
            # list produced by the starred unpacking.
            tags.append(tag[0] if tag else UNK)
        inst = Instance({'tokens': TextField(tokens, {})})
        inst.add_field('tags', SequenceLabelField(tags, inst['tokens']))
        return inst

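# Hypothetical usage sketch for CCGReader. SequenceTaggingDatasetReader's
# default word/tag delimiter in AllenNLP is "###", and UNK is assumed to be an
# unknown-tag constant defined elsewhere in the original file.
reader = CCGReader()
instances = reader.text_to_instances("The###DT dog###NN barks###VBZ")
print(instances[0]['tags'].labels)  # ['DT', 'NN', 'VBZ']
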
def run_ELMo_RSA(stim_file, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    # Get tokenizer
    tokenizer = WhitespaceTokenizer()

    # Load model
    ## ELMo OG
    elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'
    # ELMo Small
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # ELMo Medium
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json'
    # ELMo OG (5.5B)
    # elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    # elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

    elmo_embedding = ElmoTokenEmbedder(options_file=elmo_options_file,
                                       weight_file=elmo_weight_file,
                                       dropout=0.0)
    embedder = BasicTextFieldEmbedder(token_embedders={'elmo_tokens': elmo_embedding})

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        # GET BASELINE
        token_indexer = ELMoTokenCharactersIndexer()
        vocab = Vocabulary()

        target_tokens = tokenizer.tokenize(target)
        target_text_field = TextField(target_tokens, {'elmo_tokens': token_indexer})
        target_text_field.index(vocab)
        target_token_tensor = target_text_field.as_tensor(target_text_field.get_padding_lengths())
        target_tensor_dict = target_text_field.batch_tensors([target_token_tensor])

        target_embedding = embedder(target_tensor_dict)[0]
        baseline = target_embedding[-1].data.cpu().squeeze()

        # GET SIMS
        sims = get_ELMo_sims(sentence, baseline, tokenizer, embedder)
        values = get_dummy_values(sentence)

        EXP.load_IT('elmo', x, values, False, sims)

    return EXP

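# The per-sentence embedding steps above can be read as a small helper. This is
# only a sketch (embed_sentence is not a name from the original code) showing
# how one whitespace-tokenized sentence is turned into per-token ELMo vectors.
def embed_sentence(sentence, tokenizer, embedder):
    token_indexer = ELMoTokenCharactersIndexer()
    vocab = Vocabulary()
    tokens = tokenizer.tokenize(sentence)
    text_field = TextField(tokens, {'elmo_tokens': token_indexer})
    text_field.index(vocab)
    token_tensor = text_field.as_tensor(text_field.get_padding_lengths())
    tensor_dict = text_field.batch_tensors([token_tensor])
    return embedder(tensor_dict)[0]  # (num_tokens, embedding_dim)
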
def tokenized_passage(self, passage):
    tokenizer = WhitespaceTokenizer()
    return tokenizer.tokenize(passage)

class TransactionsDatasetReader(DatasetReader):
    def __init__(
        self,
        discretizer_path: str,
        max_sequence_length: int = None,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self.discretizer = load_discretizer(discretizer_path)
        self._max_sequence_length = max_sequence_length or math.inf
        self._tokenizer = WhitespaceTokenizer()
        self._start_token = Token(START_TOKEN)
        self._end_token = Token(END_TOKEN)

    def _add_start_end_tokens(self, tokens: List[Token]) -> List[Token]:
        return [self._start_token] + tokens + [self._end_token]

    def text_to_instance(
        self,
        transactions: List[int],
        amounts: List[float],
        label: Optional[int] = None,
        client_id: Optional[int] = None,
    ) -> Instance:
        transactions = " ".join(map(str, transactions))
        amounts = transform_amounts(amounts, self.discretizer)
        amounts = " ".join(amounts)

        transactions = self._tokenizer.tokenize(transactions)
        amounts = self._tokenizer.tokenize(amounts)

        transactions = self._add_start_end_tokens(transactions)
        amounts = self._add_start_end_tokens(amounts)

        fields = {
            "transactions": TextField(transactions, {"tokens": SingleIdTokenIndexer("transactions")}),
            "amounts": TextField(amounts, {"tokens": SingleIdTokenIndexer("amounts")}),
        }

        if label is not None:
            fields["label"] = LabelField(label=str(label), skip_indexing=False)

        if client_id is not None:
            fields["client_id"] = LabelField(label=client_id, skip_indexing=True,
                                             label_namespace="client_id")

        return Instance(fields)

    def _read(self, file_path: str):
        logger.info("Loading data from %s", file_path)
        dropped_instances = 0

        with jsonlines.open(cached_path(file_path), "r") as reader:
            for items in reader:
                transactions = items["transactions"]
                amounts = items["amounts"]
                assert len(transactions) == len(amounts)

                instance = self.text_to_instance(
                    transactions=transactions,
                    amounts=amounts,
                    label=items.get("label"),
                    client_id=items.get("client_id"),
                )

                if instance.fields["transactions"].sequence_length() <= self._max_sequence_length:
                    yield instance
                else:
                    dropped_instances += 1

        if not dropped_instances:
            logger.info(f"No instances dropped from {file_path}.")
        else:
            logger.warning(f"Dropped {dropped_instances} instances from {file_path}.")

    @classmethod
    def from_archive(cls, archive_file: str) -> "TransactionsDatasetReader":
        config = load_archive(archive_file).config["dataset_reader"]
        assert config.pop("type") == "transactions_reader"
        return cls(**config)

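# Sketch of the jsonlines record format _read expects (field names taken from
# the code above; the values and the discretizer path are made up):
# {"transactions": [12, 7, 7, 981], "amounts": [10.5, 3.0, 3.0, 250.0],
#  "label": 1, "client_id": 42}
reader = TransactionsDatasetReader(discretizer_path="discretizer.pkl")
instance = reader.text_to_instance(transactions=[12, 7, 7, 981],
                                   amounts=[10.5, 3.0, 3.0, 250.0],
                                   label=1)
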
# Represents each token with both an id from a vocabulary and a sequence of
# characters.
token_indexers = {
    'tokens': SingleIdTokenIndexer(namespace='token_vocab'),
    'token_characters': TokenCharactersIndexer(namespace='character_vocab'),
}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'],
                              namespace='token_vocab')
vocab.add_tokens_to_namespace(
    ['T', 'h', 'i', 's', ' ', 'o', 'm', 'e', 't', 'x', '.'],
    namespace='character_vocab')

text = "This is some text ."
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# The setup here is the same as what we saw above.
text_field = TextField(tokens, token_indexers)
text_field.index(vocab)

padding_lengths = text_field.get_padding_lengths()
tensor_dict = text_field.as_tensor(padding_lengths)

# Note now that we have two entries in this output dictionary,
# one for each indexer that we specified.
print("Combined tensor dictionary:", tensor_dict)

# Now we split text into words with part-of-speech tags, using Spacy's POS tagger.
# This will result in the `tag_` variable being set on each `Token` object, which
# we will read in the indexer.
tokenizer = SpacyTokenizer(pos_tags=True)
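
# With pos_tags=True, the SpacyTokenizer runs spaCy's POS tagger and stores the
# tag on each Token's tag_ attribute; a SingleIdTokenIndexer with
# feature_name='tag_' (as in the readers above) can then index that tag instead
# of the word itself. A small illustrative check:
tokens = tokenizer.tokenize("This is some text.")
print([(token.text, token.tag_) for token in tokens])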