def test_read_from_file(self):
    word_tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
    paragraph_tokenizer = ParagraphWordTokenizer(word_splitter=JustSpacesWordSplitter())
    reader = AbstractiveClozeDatasetReader(document_tokenizer=paragraph_tokenizer,
                                           topic_tokenizer=word_tokenizer,
                                           max_document_length=10,
                                           max_context_length=7,
                                           max_cloze_length=5)
    instances = list(reader.read(f'{FIXTURES_ROOT}/data/cloze.jsonl'))

    instance0 = {
        'document': ['NEW', 'YORK', ',', 'Jan.', '8', ',', '2016', '/PRNewswire/', '--', 'Businessman'],
        'topics': [['Ken', 'Fields'], ['Politics']],
        'context': ['%', 'Renewable', 'Energy', 'in', '20', 'Years', '.'],
        'cloze': ['Picking', 'as', 'his', 'campaign', 'slogan']
    }

    assert len(instances) == 25
    fields = instances[0].fields
    assert [t.text for t in fields['document'].tokens] == instance0['document']
    assert len(fields['topics'].field_list) == len(instance0['topics'])
    for topic_field, topic in zip(fields['topics'].field_list, instance0['topics']):
        assert [t.text for t in topic_field.tokens] == topic
    assert [t.text for t in fields['context'].tokens] == instance0['context']
    assert [t.text for t in fields['cloze'].tokens] == instance0['cloze']

    metadata = fields['metadata']
    assert 'document' in metadata
    assert 'topics' in metadata
    assert 'context' in metadata
    assert 'cloze' in metadata
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    # Pass `lazy` through; the original hard-coded `lazy=False` and silently ignored the argument.
    super().__init__(lazy=lazy)
    self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._word_splitter = JustSpacesWordSplitter()
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    # A WordSplitter is not a Tokenizer; wrap it in a WordTokenizer so the
    # attribute matches its annotation (as the previously commented-out line intended).
    self._tokenizer = tokenizer or WordTokenizer(word_splitter=JustSpacesWordSplitter())
    self._token_indexers = token_indexers or {"sentence": SingleIdTokenIndexer()}
def __init__(self,
             is_pretrain,
             token_indexer: Dict[str, TokenIndexer] = None,
             char_indexer: Dict[str, TokenCharactersIndexer] = None,
             lazy: bool = False,
             tables_file: str = 'data\\tables.jsonl',
             test_sym_file: str = 'data\\test.sym',
             load_cache: bool = True,
             save_cache: bool = True,
             cache_dir: str = 'cache',
             loading_limit: int = -1):
    super().__init__(lazy=lazy)
    self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
    self._token_indexers = token_indexer or {"tokens": SingleIdTokenIndexer()}
    self._char_indexers = char_indexer
    self._is_pretrain = is_pretrain
    self._table_file = tables_file
    self._loading_limit = loading_limit

    self._load_cache = load_cache
    self._save_cache = save_cache
    self._cache_dir = cache_dir
    if self._load_cache or self._save_cache:
        if not os.path.exists(self._cache_dir):
            os.mkdir(self._cache_dir)

    self._test_sym_file = test_sym_file
def test_read_from_file(self):
    tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
    reader = ExtractiveClozeDatasetReader(tokenizer=tokenizer,
                                          max_num_sentences=5,
                                          max_sentence_length=6,
                                          max_context_length=4)
    instances = list(reader.read(f'{FIXTURES_ROOT}/data/cloze.jsonl'))

    instance1 = {
        'document': [
            ['Drew', 'Sheneman', 'has', 'been', 'the', 'editorial'],
            ['J.', ')'],
            ['since', '1998', '.'],
            ['With', 'exceptional', 'artistry', ',', 'his', 'cartoons'],
            ['Sheneman', 'began', 'cartooning', 'in', 'college', 'and']
        ],
        'topics': [['Drew', 'Sheneman']],
        'context': ['American', 'editorial', 'cartoonist', '.'],
        'labels': [1, 0, 1, 0, 1]
    }

    assert len(instances) == 25
    fields = instances[1].fields
    assert len(fields['document'].field_list) == 5
    for sentence, sentence_field in zip(instance1['document'], fields['document'].field_list):
        assert [t.text for t in sentence_field.tokens] == sentence
    assert len(fields['topics'].field_list) == 1
    for topic, topic_field in zip(instance1['topics'], fields['topics'].field_list):
        assert [t.text for t in topic_field.tokens] == topic
    assert [t.text for t in fields['context']] == instance1['context']
    assert np.array_equal(fields['labels'].array, instance1['labels'])

    metadata = fields['metadata']
    assert 'document' in metadata
    assert 'topics' in metadata
    assert 'context' in metadata
    assert 'cloze' in metadata
def __init__(self,
             lazy: bool = False,
             max_bag_size: int = 25,
             negative_examples_percentage: int = 100,
             with_direct_supervision: bool = True) -> None:
    """
    args:
        lazy: lazy reading of the dataset
        max_bag_size: maximum number of sentences per bag
        negative_examples_percentage: percentage of negative examples to keep
        with_direct_supervision: keep or ignore directly supervised examples
    """
    super().__init__(lazy=lazy)
    self.max_bag_size = max_bag_size
    self.negative_examples_percentage = negative_examples_percentage
    self.with_direct_supervision = with_direct_supervision

    self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
    self._token_indexers = {"tokens": SingleIdTokenIndexer()}

    # for logging and input validation
    self._inst_counts: Dict = defaultdict(int)  # count instances per relation type
    self._pairs: Set = set()  # keep track of pairs of entities
    self._bag_sizes: Dict = defaultdict(int)  # count relation types per bag
    self._relation_coocur: Dict = defaultdict(int)  # count co-occurring relation types per bag
    self._failed_mentions_count: int = 0  # count mentions with wrong formatting
    self._count_direct_supervised_inst: int = 0
    self._count_bag_labels: Dict = defaultdict(int)
def __init__(
    self,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    target_token_indexers: Dict[str, TokenIndexer] = None,
    source_add_start_token: bool = True,
    delimiter: str = "\t",
    source_max_tokens: Optional[int] = 510,
    target_max_tokens: Optional[int] = 64,
    lazy: bool = False,
) -> None:
    super().__init__(lazy)
    self._source_tokenizer = source_tokenizer or WordTokenizer(
        word_splitter=JustSpacesWordSplitter())
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
    self._delimiter = delimiter
    self._source_max_tokens = source_max_tokens
    self._target_max_tokens = target_max_tokens
    self._source_max_exceeded = 0
    self._target_max_exceeded = 0
    self.pre_sen = 10
def __init__(
    self,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    target_token_indexers: Dict[str, TokenIndexer] = None,
    source_add_start_token: bool = True,
    delimiter: str = "\t",
    source_max_tokens: Optional[int] = 256,
    target_max_tokens: Optional[int] = 32,
    lazy: bool = False,
) -> None:
    super().__init__(lazy)
    self._source_tokenizer = source_tokenizer or WordTokenizer(
        word_splitter=JustSpacesWordSplitter())
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
    self._delimiter = delimiter
    self._source_max_tokens = source_max_tokens
    self._target_max_tokens = target_max_tokens
    self._source_max_exceeded = 0
    self._target_max_exceeded = 0
    self.pre_sen = 10
    self.seg = pkuseg.pkuseg(model_name='medicine',
                             user_dict='../data/0510/mdg/user_dict.txt')
def test_read_from_file(self, lazy):
    reader = MRPCReader(tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
                        token_indexers={"bert": PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)},
                        lazy=lazy,
                        skip_label_indexing=False,
                        mode='merge')
    instances = reader.read(str(self.FIXTURES_ROOT / 'mrpc_dev.tsv'))
    instances = ensure_list(instances)

    instance1 = {"tokens": "He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .".split()
                           + ["[SEP]"]
                           + "\" The foodservice pie business does not fit our long-term growth strategy .".split(),
                 "label": '1'}
    instance2 = {"tokens": "Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .".split()
                           + ["[SEP]"]
                           + "His wife said he was \" 100 percent behind George Bush \" and looked forward to using his years of training in the war .".split(),
                 "label": '0'}
    instance3 = {"tokens": "The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .".split()
                           + ["[SEP]"]
                           + "The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .".split(),
                 "label": '0'}

    for instance, expected_instance in zip(instances, [instance1, instance2, instance3]):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == expected_instance["tokens"]
        assert fields["label"].label == expected_instance["label"]
def test_read_from_file(self):
    tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
    reader = ExtractiveDatasetReader(tokenizer=tokenizer,
                                     max_num_sentences=5,
                                     max_sentence_length=6)
    instances = list(reader.read(f'{FIXTURES_ROOT}/data/sds.jsonl'))

    instance0 = {
        'document': [['Editor', "'s", 'note', ':', 'In', 'our'],
                     ['An', 'inmate', 'housed', 'on', 'the', '``'],
                     ['MIAMI', ',', 'Florida', '(', 'CNN', ')'],
                     ['Most', 'often', ',', 'they', 'face', 'drug'],
                     ['So', ',', 'they', 'end', 'up', 'on']]
    }

    assert len(instances) == 25
    fields = instances[0].fields
    assert len(fields['document'].field_list) == 5
    for sentence, sentence_field in zip(instance0['document'], fields['document'].field_list):
        assert [t.text for t in sentence_field.tokens] == sentence
    assert np.array_equal(fields['labels'].array, [0, 0, 1, 1, 0])

    metadata = fields['metadata']
    assert 'document' in metadata
    assert len(metadata['document']) == 5
    assert 'summary' in metadata
    assert len(metadata['summary']) == 4
class SentenceTaggerPredictor(Predictor):
    """
    Wrapper for any model that takes in a sentence and returns a single set
    of tags for it. In particular, it can be used with the
    :class:`~allennlp.models.crf_tagger.CrfTagger` model and also the
    :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = JustSpacesWordSplitter()

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        instance = self._dataset_reader.text_to_instance(tokens)
        return_dict: JsonDict = {"words": [token.text for token in tokens]}
        return instance, return_dict
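# A minimal usage sketch for the predictor above (hedged: `model` and `dataset_reader`
# are placeholders for a trained tagging model and its matching reader, and the "tags"
# key assumes the underlying model emits per-token tags, as CrfTagger does).
# `predict_json` drives `_json_to_instance`, so the input should be pre-tokenized,
# space-separated text.
predictor = SentenceTaggerPredictor(model, dataset_reader)
output = predictor.predict_json({"sentence": "AllenNLP is a library ."})
print(output["words"])  # ['AllenNLP', 'is', 'a', 'library', '.']
print(output["tags"])   # one tag per whitespace-split token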
def __init__(self,
             reverse: bool = False,
             tokens_per_instance: int = None,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    tokenizer = tokenizer or WordTokenizer(word_splitter=JustSpacesWordSplitter())
    super().__init__(tokens_per_instance, tokenizer, token_indexers, True)
    self._reverse = reverse
class PairsDatasetReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__()
        self._tokenizer = JustSpacesWordSplitter()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file.readlines():
                line_json = json.loads(line)
                if not line_json:
                    continue
                query_paper = line_json["query_paper"]
                candidate_paper = line_json["candidate_paper"]
                relevance = line_json["relevance"]
                instance = self.text_to_instance(query_paper=query_paper,
                                                 candidate_paper=candidate_paper,
                                                 relevance=relevance)
                if instance is not None:
                    yield instance

    @overrides
    def text_to_instance(self,
                         query_paper: str,
                         candidate_paper: str,
                         relevance: str = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        query_tokens = self._tokenizer.split_words(query_paper)
        fields['query_paper'] = TextField(query_tokens, self._token_indexers)
        candidate_tokens = self._tokenizer.split_words(candidate_paper)
        fields['candidate_paper'] = TextField(candidate_tokens, self._token_indexers)
        if relevance is not None:
            fields['label'] = LabelField(relevance)
        return Instance(fields)
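# A quick hedged example of exercising the reader above without a file; the two paper
# strings are invented placeholders. `text_to_instance` whitespace-splits each string
# into a TextField and attaches the relevance string as a LabelField.
reader = PairsDatasetReader()
instance = reader.text_to_instance(
    query_paper="deep learning for text classification",
    candidate_paper="convolutional networks for sentence classification",
    relevance="relevant")
print(sorted(instance.fields.keys()))  # ['candidate_paper', 'label', 'query_paper']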
def __init__(self, bert_model):
    lower_case = "uncased" in bert_model
    self.bert_indexer, self.tokenizer = self.get_bert_indexer(bert_model, lower_case=lower_case)
    self.tokenizer_bert = MyBertWordSplitter(do_lower_case=lower_case)
    self.spacy_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
    self.just_space_tokenization = JustSpacesWordSplitter()
    self.simple_tokenization = SimpleWordSplitter()
def test_reader(self):
    tokenizer = WordTokenizer(JustSpacesWordSplitter())
    reader = SummDataReader(tokenizer=tokenizer, source_max_tokens=400, target_max_tokens=100)
    train_dataset = reader.read('../data/dev_bbc/train.dev.tsv.tagged')
    vocab = Vocabulary.from_instances(train_dataset)
    assert vocab.get_vocab_size('tokens') > 2
def __init__(
    self,
    tokenizer: Tokenizer = None,
    token_indexers: Dict[str, TokenIndexer] = None,
    lazy: bool = False,
) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(word_splitter=JustSpacesWordSplitter())
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
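# The `tokenizer or WordTokenizer(word_splitter=JustSpacesWordSplitter())` default above
# is the common pattern when the text was already tokenized upstream: the splitter only
# breaks on spaces, so reading stays fast and deterministic. A small illustration
# (both sentences are made-up examples):
splitter = JustSpacesWordSplitter()
print([t.text for t in splitter.split_words("Hello, world!")])
# ['Hello,', 'world!'] -- punctuation stays attached; no real tokenization happens
print([t.text for t in splitter.split_words("Hello , world !")])
# ['Hello', ',', 'world', '!'] -- pre-tokenized input round-trips cleanly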
def __init__(self,
             lazy: bool = False,
             window_size: int = 5,
             tokenizer: Tokenizer = None,
             indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self.window_size = window_size
    # Wrap the splitter in a WordTokenizer so the default matches the `Tokenizer` annotation.
    self._tokenizer = tokenizer or WordTokenizer(word_splitter=JustSpacesWordSplitter())
    self._indexers = indexers or {"tokens": SingleIdTokenIndexer()}
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             code_switching_lex_folder: Path = None,
             bivalency_lex_folder: Path = None):
    '''
    :param tokenizer: Defaults to whitespace tokenization if no other
                      tokenizer is given.
    :param token_indexers: Defaults to using single-id word tokens to
                           represent the input.
    :param code_switching_lex_folder: Folder that contains three lexicon
        lists of code switching words between different dialects and MSA:
        1. MSA_DIAL_EGY.txt, 2. MSA_DIAL_GLF.txt, 3. MSA_DIAL_LEV.txt.
        These lexicons will allow code-switching regularised attention.
    :param bivalency_lex_folder: Folder that contains three lexicon lists
        of bivalency words between the Egyptian, Levantine and Gulf dialects
        of Arabic: 1. EGY_GLF.txt, 2. EGY_LEV.txt, 3. GLF_LEV.txt. This will
        allow bivalency regularised attention.

    NOTE: All code switching and bivalency words are lower cased and then
    compared to the words within the text; during this comparison the words
    within the text are temporarily lower cased, for comparison purposes
    only. The words within the text do not remain lower cased unless you
    have specified this within the `token_indexers`.
    '''
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    self.code_switching_lex_folder = code_switching_lex_folder
    if code_switching_lex_folder is not None:
        self.msa_egy = self._lexicon_set(Path(code_switching_lex_folder, 'MSA_DIAL_EGY.txt'))
        self.msa_glf = self._lexicon_set(Path(code_switching_lex_folder, 'MSA_DIAL_GLF.txt'))
        self.msa_lev = self._lexicon_set(Path(code_switching_lex_folder, 'MSA_DIAL_LEV.txt'))

    self.bivalency_lex_folder = bivalency_lex_folder
    if bivalency_lex_folder is not None:
        egy_glf = self._lexicon_set(Path(bivalency_lex_folder, 'EGY_GLF.txt'))
        egy_lev = self._lexicon_set(Path(bivalency_lex_folder, 'EGY_LEV.txt'))
        glf_lev = self._lexicon_set(Path(bivalency_lex_folder, 'GLF_LEV.txt'))
        self.bivalency_egy = egy_glf.union(egy_lev)
        self.bivalency_lev = glf_lev.union(egy_lev)
        self.bivalency_glf = egy_glf.union(glf_lev)
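# `_lexicon_set` is called above but not defined in this snippet. A plausible minimal
# sketch, assuming one word or phrase per line in each lexicon file (the encoding
# argument is an assumption):
def _lexicon_set(self, lexicon_path: Path) -> Set[str]:
    # Read one entry per line, lower-case it to match the comparison convention
    # described in the docstring, and skip blank lines.
    with lexicon_path.open('r', encoding='utf-8') as lexicon_file:
        return {line.strip().lower() for line in lexicon_file if line.strip()}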
def from_params(cls, params: Params) -> 'SkipGramExamplesDatasetReader':
    tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
    phrase_token_indexer = TokenIndexer.dict_from_params(
        params.pop('pivot_phrase_token_indexers', {}))
    target_word_indexer = TokenIndexer.dict_from_params(
        params.pop('context_word_token_indexers', {}))
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer,
               pivot_phrase_token_indexers=phrase_token_indexer,
               context_word_token_indexers=target_word_indexer)
def __init__(self,
             dpd_directory: str,
             max_logical_forms: int = 500,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._dpd_directory = dpd_directory
    self._max_logical_forms = max_logical_forms
    self._utterance_token_indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens')}
    self._logical_form_token_indexers = {'lf_tokens': SingleIdTokenIndexer(namespace='lf_tokens')}
    self._tokenizer = WordTokenizer(JustSpacesWordSplitter())
def __init__(self) -> None:
    # we use a simple word tokenizer to split sentences into words
    self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
    # initialize indexers
    single_id_indexer = SingleIdTokenIndexer()
    elmo_indexer = ELMoTokenCharactersIndexer()
    self.indexers = {}
    self.indexers["tokens"] = single_id_indexer
    self.indexers["elmo_characters"] = elmo_indexer
def __init__(self, lowercase_tokens: bool = True) -> None:
    super().__init__(False)
    self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter(),
                                    start_tokens=[START_SYMBOL],
                                    end_tokens=[END_SYMBOL])
    self._token_indexers = {'tokens': SingleIdTokenIndexer(lowercase_tokens=lowercase_tokens)}
    self._nonterminals_indexers = {'tokens': SingleIdTokenIndexer(namespace='nonterminals')}
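# A sanity check of the start/end token behaviour configured above, assuming
# START_SYMBOL and END_SYMBOL come from allennlp.common.util (where they are
# '@start@' and '@end@'); the query string is a made-up example:
tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter(),
                          start_tokens=[START_SYMBOL], end_tokens=[END_SYMBOL])
print([t.text for t in tokenizer.tokenize("select city from table")])
# ['@start@', 'select', 'city', 'from', 'table', '@end@']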
def __init__(self,
             lazy: bool = False,
             shuffle_examples: bool = True,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self._shuffle_examples = shuffle_examples
    self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter(),
                              start_tokens=['<s>'],
                              end_tokens=['</s>'])
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = QuoraParaphraseDatasetReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))
    if not args.test_dataset:
        test_dataset_path = train_params.pop('test_dataset_path')
    else:
        test_dataset_path = args.test_dataset
    test_dataset = dataset_reader.read(test_dataset_path)
    if args.only_label:
        test_dataset = [d for d in test_dataset
                        if d.fields['label'].label == args.only_label]
    vocab = Vocabulary.from_files(vocab_dir)

    random.shuffle(valid_dataset)

    model_params['token_embedder']['pretrained_file'] = None
    model = SeparatedQuoraModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    model.to(args.cuda_device)
    model.eval()
    torch.set_grad_enabled(False)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)

    for dataset in (valid_dataset, test_dataset):
        generator = iterator(dataset, shuffle=False, num_epochs=1)
        model.get_metrics(reset=True)
        for batch in tqdm(generator):
            batch = move_to_device(batch, cuda_device=args.cuda_device)
            model(premise=batch['premise'],
                  hypothesis=batch['hypothesis'],
                  label=batch['label'])
        metrics = model.get_metrics()
        pprint(metrics)
def __init__(self,
             tokenizer_A: Tokenizer = None,
             tokenizer_B: Tokenizer = None,
             token_indexers_A: Dict[str, TokenIndexer] = None,
             token_indexers_B: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer_A = tokenizer_A or WordTokenizer(word_splitter=JustSpacesWordSplitter())
    self._tokenizer_B = tokenizer_B or self._tokenizer_A
    self._token_indexers_A = token_indexers_A or {"ids": SingleIdTokenIndexer(namespace="vocab_A")}
    self._token_indexers_B = token_indexers_B or {"ids": SingleIdTokenIndexer(namespace="vocab_B")}
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue,
                                 _wait_for_exit: mp.Event, _local_file,
                                 _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):
    # workflow: we tokenize the data files with the costly spacy tokenizer in a
    # preprocessing step before training (and concat the tokens with single
    # whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    if _config["preprocessed_tokenized"]:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])
    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                               _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])
    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _triple_loader = IrTripleDatasetReader(lazy=True,
                                           tokenizer=_tokenizer,
                                           token_indexers=_token_indexers,
                                           max_doc_length=_config["max_doc_length"],
                                           max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"),
                                             ("doc_neg_tokens", "num_tokens")])
    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
class LegalDatasetReader(DatasetReader):
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        # Pass `lazy` through; the original hard-coded `lazy=False` and ignored the argument.
        super().__init__(lazy=lazy)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._word_splitter = JustSpacesWordSplitter()

    @overrides
    def text_to_instance(self, graf_tokens: List[Token], labels: List[str] = None) -> Instance:
        graf_field = TextField(graf_tokens, self.token_indexers)
        metadata = MetadataField({"graf_words": graf_tokens})
        fields = {"graf": graf_field, "metadata": metadata}
        if labels is not None:
            fields["label"] = MultiLabelField(labels)
        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        Reads a file that has been created by json2lines.
        :param file_path: path to the tab-separated input file
        :return: an iterator over instances
        """
        counts = {"pos": 0, "neg": 0}
        with open(file_path) as f:
            for line in f:
                graf_str, label_str = line.strip().split("\t")
                if label_str == "unmatched":
                    counts["neg"] += 1
                else:
                    counts["pos"] += 1
                yield self.text_to_instance(self._word_splitter.split_words(graf_str),
                                            label_str.split(","))
        print(counts)
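# For reference, `_read` above expects one tab-separated pair per line, with a
# comma-separated label list on the right; a sketch of the inferred format
# (contents invented for illustration):
#
#   The agreement was executed on the first of May .<TAB>contract,date
#   This paragraph did not match any annotation .<TAB>unmatched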
def load_contextual_ner(self, path, ws_tokenizer=True):
    # logging.info('Loading Contextual NER ...')
    self.contextual_ner = Predictor.from_path(path, cuda_device=0)
    if ws_tokenizer:
        # switch off the tokenizer (expect pre-tokenized, space-separated strings)
        self.contextual_ner._tokenizer = JustSpacesWordSplitter()
    # load labels (to use logits, wip)
    self.contextual_ner_labels = []
    # os.path.join handles `path` with or without a trailing slash,
    # unlike the previous string concatenation
    with open(os.path.join(path, 'vocabulary', 'labels.txt'), 'r') as labels_f:
        for line in labels_f:
            self.contextual_ner_labels.append(line.strip())
def __init__(self,
             lazy: bool = False,
             en_tokenizer: Tokenizer = None,
             fr_tokenizer: Tokenizer = None,
             en_token_indexers: Dict[str, TokenIndexer] = None,
             fr_token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self._en_tokenizer = en_tokenizer or WordTokenizer(
        word_splitter=JustSpacesWordSplitter(),
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL])
    self._fr_tokenizer = fr_tokenizer or WordTokenizer(
        word_splitter=JustSpacesWordSplitter(),
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL])
    self._en_token_indexers = en_token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="source", lowercase_tokens=True)}
    self._fr_token_indexers = fr_token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="target", lowercase_tokens=True)}
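# A hedged sketch of how the two namespaces configured above keep the English and
# French vocabularies separate; `train_instances` is a placeholder for instances
# produced by this reader.
vocab = Vocabulary.from_instances(train_instances)
print(vocab.get_vocab_size('source'))  # lower-cased English tokens
print(vocab.get_vocab_size('target'))  # lower-cased French tokens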
def setUp(self):
    logging.basicConfig(level=logging.INFO)
    tokenizer = WordTokenizer(JustSpacesWordSplitter())
    reader = SummDataReader(tokenizer, source_max_tokens=400, lazy=False)
    self.train_dataset = reader.read('../data/dev_bbc/train.dev.tsv.tagged')
    self.val_dataset = reader.read('../data/dev_bbc/val.dev.tsv.tagged')
    vocab_path = 'data/cnndm/vocab'
    if os.path.exists(vocab_path):
        self.vocab = Vocabulary.from_files(vocab_path)
    else:
        self.vocab = Vocabulary.from_instances(self.train_dataset, max_vocab_size=80000)
        self.vocab.save_to_files(vocab_path)