Example no. 1
    def test_read_from_file(self):
        word_tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
        paragraph_tokenizer = ParagraphWordTokenizer(word_splitter=JustSpacesWordSplitter())
        reader = AbstractiveClozeDatasetReader(document_tokenizer=paragraph_tokenizer,
                                               topic_tokenizer=word_tokenizer,
                                               max_document_length=10,
                                               max_context_length=7,
                                               max_cloze_length=5)
        instances = list(reader.read(f'{FIXTURES_ROOT}/data/cloze.jsonl'))

        instance0 = {
            'document': ['NEW', 'YORK', ',', 'Jan.', '8', ',', '2016', '/PRNewswire/', '--', 'Businessman'],
            'topics': [['Ken', 'Fields'], ['Politics']],
            'context': ['%', 'Renewable', 'Energy', 'in', '20', 'Years', '.'],
            'cloze': ['Picking', 'as', 'his', 'campaign', 'slogan']
        }

        assert len(instances) == 25
        fields = instances[0].fields
        assert [t.text for t in fields['document'].tokens] == instance0['document']
        assert len(fields['topics'].field_list) == len(instance0['topics'])
        for topic_field, topic in zip(fields['topics'].field_list, instance0['topics']):
            assert [t.text for t in topic_field.tokens] == topic
        assert [t.text for t in fields['context'].tokens] == instance0['context']
        assert [t.text for t in fields['cloze'].tokens] == instance0['cloze']
        metadata = fields['metadata']
        assert 'document' in metadata
        assert 'topics' in metadata
        assert 'context' in metadata
        assert 'cloze' in metadata
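All of these snippets rely on AllenNLP's whitespace splitter, so as a quick reference here is a minimal sketch of its behaviour, assuming an AllenNLP 0.x installation where `JustSpacesWordSplitter` and `WordTokenizer` are still available:

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter

# JustSpacesWordSplitter simply splits the input string on whitespace, so it
# assumes the text has already been tokenized and joined with single spaces.
splitter = JustSpacesWordSplitter()
tokens = splitter.split_words("NEW YORK , Jan. 8 , 2016")
print([t.text for t in tokens])  # ['NEW', 'YORK', ',', 'Jan.', '8', ',', '2016']

# Wrapping it in a WordTokenizer, as most of these readers do, yields the same
# tokens through the Tokenizer interface.
tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
assert [t.text for t in tokenizer.tokenize("NEW YORK , Jan. 8 , 2016")] == [t.text for t in tokens]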
Example no. 2
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False) -> None:
     super().__init__(lazy=False)
     self.token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._word_splitter = JustSpacesWordSplitter()
     self.lazy = lazy
Example no. 3
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     # self._tokenizer = tokenizer or WordTokenizer()
     self._tokenizer = JustSpacesWordSplitter()
     self._token_indexers = token_indexers or {
         "sentence": SingleIdTokenIndexer()
     }
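Example no. 4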
    def __init__(self, is_pretrain,
                 token_indexer: Dict[str, TokenIndexer] = None,
                 char_indexer: Dict[str, TokenCharactersIndexer] = None,
                 lazy: bool = False,
                 tables_file: str = 'data\\tables.jsonl',
                 test_sym_file: str = 'data\\test.sym',
                 load_cache: bool = True,
                 save_cache: bool = True,
                 cache_dir: str = 'cache',
                 loading_limit: int = -1):
        super().__init__(lazy=lazy)
        self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
        self._token_indexers = token_indexer or {"tokens": SingleIdTokenIndexer()}
        self._char_indexers = char_indexer
        self._is_pretrain = is_pretrain

        self._table_file = tables_file
        self._loading_limit = loading_limit

        self._load_cache = load_cache
        self._save_cache = save_cache
        self._cache_dir = cache_dir

        if self._load_cache or self._save_cache:
            if not os.path.exists(self._cache_dir):
                os.mkdir(self._cache_dir)

        self._test_sym_file = test_sym_file
Example no. 5
    def test_read_from_file(self):
        tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
        reader = ExtractiveClozeDatasetReader(tokenizer=tokenizer, max_num_sentences=5,
                                              max_sentence_length=6, max_context_length=4)
        instances = list(reader.read(f'{FIXTURES_ROOT}/data/cloze.jsonl'))

        instance1 = {
            'document': [
                ['Drew', 'Sheneman', 'has', 'been', 'the', 'editorial'],
                ['J.', ')'],
                ['since', '1998', '.'],
                ['With', 'exceptional', 'artistry', ',', 'his', 'cartoons'],
                ['Sheneman', 'began', 'cartooning', 'in', 'college', 'and']
            ],
            'topics': [['Drew', 'Sheneman']],
            'context': ['American', 'editorial', 'cartoonist', '.'],
            'labels': [1, 0, 1, 0, 1]
        }

        assert len(instances) == 25
        fields = instances[1].fields
        assert len(fields['document'].field_list) == 5
        for sentence, sentence_field in zip(instance1['document'], fields['document'].field_list):
            assert [t.text for t in sentence_field.tokens] == sentence
        assert len(fields['topics'].field_list) == 1
        for topic, topic_field in zip(instance1['topics'], fields['topics'].field_list):
            assert [t.text for t in topic_field.tokens] == topic
        assert [t.text for t in fields['context']] == instance1['context']
        assert np.array_equal(fields['labels'].array, instance1['labels'])
        metadata = fields['metadata']
        assert 'document' in metadata
        assert 'topics' in metadata
        assert 'context' in metadata
        assert 'cloze' in metadata
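Example no. 6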
    def __init__(self, lazy: bool = False,
                 max_bag_size: int = 25,
                 negative_exampels_percentage: int = 100,
                 with_direct_supervision: bool = True) -> None:
        """
        args:
            lazy: lazy reading of the dataset
            max_bag_size: maximum number of sentences per bag
            negative_exampels_percentage: percentage of negative examples to keep
            with_direct_supervision: keep or ignore direct supervision examples
        """
        super().__init__(lazy=lazy)
        self.max_bag_size = max_bag_size
        self.negative_exampels_percentage = negative_exampels_percentage
        self.with_direct_supervision = with_direct_supervision

        self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}

        # for logging and input validation
        self._inst_counts: Dict = defaultdict(int)  # count instances per relation type
        self._pairs: Set = set()  # keep track of pairs of entities
        self._bag_sizes: Dict = defaultdict(int)  # count relation types per bag
        self._relation_coocur: Dict = defaultdict(int)  # count relation type co-occurrences
        self._failed_mentions_count: int = 0  # count mentions with wrong formatting
        self._count_direct_supervised_inst: int = 0
        self._count_bag_labels: Dict = defaultdict(int)
Example no. 7
 def __init__(
     self,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     target_token_indexers: Dict[str, TokenIndexer] = None,
     source_add_start_token: bool = True,
     delimiter: str = "\t",
     source_max_tokens: Optional[int] = 510,
     target_max_tokens: Optional[int] = 64,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy)
     self._source_tokenizer = source_tokenizer or WordTokenizer(
         word_splitter=JustSpacesWordSplitter())
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers
     self._target_token_indexers = target_token_indexers or self._source_token_indexers
     self._source_add_start_token = source_add_start_token
     self._delimiter = delimiter
     self._source_max_tokens = source_max_tokens
     self._target_max_tokens = target_max_tokens
     self._source_max_exceeded = 0
     self._target_max_exceeded = 0
     self.pre_sen = 10
Example no. 8
 def __init__(
     self,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     target_token_indexers: Dict[str, TokenIndexer] = None,
     source_add_start_token: bool = True,
     delimiter: str = "\t",
     source_max_tokens: Optional[int] = 256,
     target_max_tokens: Optional[int] = 32,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy)
     self._source_tokenizer = source_tokenizer or WordTokenizer(
         word_splitter=JustSpacesWordSplitter())
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers
     self._target_token_indexers = target_token_indexers or self._source_token_indexers
     self._source_add_start_token = source_add_start_token
     self._delimiter = delimiter
     self._source_max_tokens = source_max_tokens
     self._target_max_tokens = target_max_tokens
     self._source_max_exceeded = 0
     self._target_max_exceeded = 0
     self.pre_sen = 10
     self.seg = pkuseg.pkuseg(model_name='medicine',
                              user_dict='../data/0510/mdg/user_dict.txt')
Example no. 9
    def test_read_from_file(self, lazy):
        reader = MRPCReader(tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
                            token_indexers={"bert":
                                            PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)},
                            lazy=lazy,
                            skip_label_indexing=False,
                            mode='merge')
        instances = reader.read(
            str(self.FIXTURES_ROOT / 'mrpc_dev.tsv'))
        instances = ensure_list(instances)

        instance1 = {"tokens": "He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .".split() + ["[SEP]"] +
                     "\" The foodservice pie business does not fit our long-term growth strategy .".split(),
                     "label": '1'}

        instance2 = {"tokens": "Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .".split() + ["[SEP]"] +
                     "His wife said he was \" 100 percent behind George Bush \" and looked forward to using his years of training in the war .".split(),
                     "label": '0'}

        instance3 = {"tokens": "The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .".split() + ["[SEP]"] +
                     "The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .".split(),
                     "label": '0'}

        for instance, expected_instance in zip(instances, [instance1, instance2, instance3]):
            fields = instance.fields
            assert [
                t.text for t in fields["tokens"].tokens] == expected_instance["tokens"]
            assert fields["label"].label == expected_instance["label"]
Example no. 10
    def test_read_from_file(self):
        tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
        reader = ExtractiveDatasetReader(tokenizer=tokenizer,
                                         max_num_sentences=5,
                                         max_sentence_length=6)
        instances = list(reader.read(f'{FIXTURES_ROOT}/data/sds.jsonl'))

        instance0 = {
            'document': [['Editor', '\'s', 'note', ':', 'In', 'our'],
                         ['An', 'inmate', 'housed', 'on', 'the', '``'],
                         ['MIAMI', ',', 'Florida', '(', 'CNN', ')'],
                         ['Most', 'often', ',', 'they', 'face', 'drug'],
                         ['So', ',', 'they', 'end', 'up', 'on']]
        }

        assert len(instances) == 25
        fields = instances[0].fields
        assert len(fields['document'].field_list) == 5
        for sentence, sentence_field in zip(instance0['document'],
                                            fields['document'].field_list):
            assert [t.text for t in sentence_field.tokens] == sentence
        assert np.array_equal(fields['labels'].array, [0, 0, 1, 1, 0])
        metadata = fields['metadata']
        assert 'document' in metadata
        assert len(metadata['document']) == 5
        assert 'summary' in metadata
        assert len(metadata['summary']) == 4
Example no. 11
class SentenceTaggerPredictor(Predictor):
    """
    Wrapper for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = JustSpacesWordSplitter()

    @overrides
    def _json_to_instance(self,
                          json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        instance = self._dataset_reader.text_to_instance(tokens)

        return_dict: JsonDict = {"words": [token.text for token in tokens]}

        return instance, return_dict
Example no. 12
 def __init__(self,
              reverse: bool = False,
              tokens_per_instance: int = None,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None) -> None:
     tokenizer = tokenizer or WordTokenizer(word_splitter=JustSpacesWordSplitter())
     super().__init__(tokens_per_instance, tokenizer, token_indexers, True)
     self._reverse = reverse
Example no. 13
class PairsDatasetReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__()
        self._tokenizer = JustSpacesWordSplitter()
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file.readlines():
                line_json = json.loads(line)
                if not line_json:
                    continue

                query_paper = line_json["query_paper"]
                candidate_paper = line_json["candidate_paper"]
                relevance = line_json["relevance"]

                instance = self.text_to_instance(
                    query_paper=query_paper,
                    candidate_paper=candidate_paper,
                    relevance=relevance)
                if instance is not None:
                    yield instance

    @overrides
    def text_to_instance(self,
                         query_paper: str,
                         candidate_paper: str,
                         relevance: str = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        query_tokens = self._tokenizer.split_words(query_paper)
        fields['query_paper'] = TextField(query_tokens, self._token_indexers)

        candidate_tokens = self._tokenizer.split_words(candidate_paper)
        fields['candidate_paper'] = TextField(candidate_tokens,
                                              self._token_indexers)

        if relevance is not None:
            fields['label'] = LabelField(relevance)

        return Instance(fields)
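For reference, `_read` above expects one JSON object per line with exactly the keys it pulls out; a minimal sketch of such a line follows, with hypothetical paper text (the reader splits it on spaces, so the real data is presumably pre-tokenized and space-joined):

import json

# Hypothetical input line for PairsDatasetReader._read: one JSON object per
# line with the three keys the reader extracts.
line = json.dumps({
    "query_paper": "a neural model for citation recommendation .",
    "candidate_paper": "content-based citation recommendation with deep learning .",
    "relevance": "relevant",
})
record = json.loads(line)
print(record["query_paper"].split())  # what JustSpacesWordSplitter will see as tokens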
Example no. 14
 def __init__(self, bert_model):
     lower_case = True if "uncased" in bert_model else False
     self.bert_indexer, self.tokenizer = self.get_bert_indexer(
         bert_model, lower_case=lower_case)
     self.tokenizer_bert = MyBertWordSplitter(do_lower_case=lower_case)
     self.spacy_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
     self.just_space_tokenization = JustSpacesWordSplitter()
     self.simple_tokenization = SimpleWordSplitter()
Example no. 15
 def test_reader(self):
     tokenizer = WordTokenizer(JustSpacesWordSplitter())
     reader = SummDataReader(tokenizer=tokenizer,
                             source_max_tokens=400,
                             target_max_tokens=100)
     train_dataset = reader.read('../data/dev_bbc/train.dev.tsv.tagged')
     vocab = Vocabulary.from_instances(train_dataset)
     assert vocab.get_vocab_size('tokens') > 2
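Example no. 16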
 def __init__(
     self,
     tokenizer: Tokenizer = None,
     token_indexers: Dict[str, TokenIndexer] = None,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or WordTokenizer(word_splitter=JustSpacesWordSplitter())
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
Example no. 17
 def __init__(self,
              lazy: bool = False,
              window_size: int = 5,
              tokenizer: Tokenizer = None,
              indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     self.window_size = window_size
     self._tokenizer = tokenizer or JustSpacesWordSplitter()
     self._indexers = indexers or {"tokens": SingleIdTokenIndexer()}
Example no. 18
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 code_switching_lex_folder: Path = None,
                 bivalency_lex_folder: Path = None):
        '''
        :param tokenizer: Defaults to just Whitespace if no other Tokeniser 
                          is given.
        :param token_indexers: Defaults to just using word tokens to represent
                               the input.
        :param code_switching_lex_folder: Folder that contains three lexicon 
                                          lists of code switching words between 
                                          different dialects and MSA: 
                                          1. MSA_DIAL_EGY.txt, 
                                          2. MSA_DIAL_GLF.txt, 
                                          3. MSA_DIAL_LEV.txt. These lexicons 
                                          will allow code switching regularised 
                                          attention.
        :param bivalency_lex_folder: Folder that contains three lexicon lists 
                                     of bivalency words between Egyptian, 
                                     Levantine and Gulf dialects of Arabic:
                                     1. EGY_GLF.txt, 2. EGY_LEV.txt, 
                                     3. GLF_LEV.txt. This will allow bivalency 
                                     regularised attention.
        
        NOTE: All code switching and bivalency words are lower cased and then
        compared to the words within the text; during this comparison the words
        within the text are temporarily lower cased for comparison purposes
        only. The words within the text do not remain lower cased unless you
        have specified this within the `token_indexers`.
        '''
        super().__init__(lazy)

        self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
        self._token_indexers = token_indexers or \
                               {"tokens": SingleIdTokenIndexer()}

        self.code_switching_lex_folder = code_switching_lex_folder
        if code_switching_lex_folder is not None:
            self.msa_egy = self._lexicon_set(
                Path(code_switching_lex_folder, 'MSA_DIAL_EGY.txt'))
            self.msa_glf = self._lexicon_set(
                Path(code_switching_lex_folder, 'MSA_DIAL_GLF.txt'))
            self.msa_lev = self._lexicon_set(
                Path(code_switching_lex_folder, 'MSA_DIAL_LEV.txt'))
        self.bivalency_lex_folder = bivalency_lex_folder
        if bivalency_lex_folder is not None:
            egy_glf = self._lexicon_set(
                Path(bivalency_lex_folder, 'EGY_GLF.txt'))
            egy_lev = self._lexicon_set(
                Path(bivalency_lex_folder, 'EGY_LEV.txt'))
            glf_lev = self._lexicon_set(
                Path(bivalency_lex_folder, 'GLF_LEV.txt'))
            self.bivalency_egy = egy_glf.union(egy_lev)
            self.bivalency_lev = glf_lev.union(egy_lev)
            self.bivalency_glf = egy_glf.union(glf_lev)
Example no. 19
 def from_params(cls, params: Params) -> 'SkipGramExamplesDatasetReader':
     tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
     phrase_token_indexer = TokenIndexer.dict_from_params(
         params.pop('pivot_phrase_token_indexers', {}))
     target_word_indexer = TokenIndexer.dict_from_params(
         params.pop('context_word_token_indexers', {}))
     params.assert_empty(cls.__name__)
     return cls(tokenizer=tokenizer,
                pivot_phrase_token_indexers=phrase_token_indexer,
                context_word_token_indexers=target_word_indexer)
Example no. 20
 def __init__(self,
              dpd_directory: str,
              max_logical_forms: int = 500,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self._dpd_directory = dpd_directory
     self._max_logical_forms = max_logical_forms
     self._utterance_token_indexers = {'tokens': SingleIdTokenIndexer(namespace='tokens')}
     self._logical_form_token_indexers = {'lf_tokens': SingleIdTokenIndexer(namespace='lf_tokens')}
     self._tokenizer = WordTokenizer(JustSpacesWordSplitter())
Example no. 21
    def __init__(self) -> None:
        # we use a simple word tokenizer to split sentences into words
        self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

        #initialize indexers
        singleIdIndexer = SingleIdTokenIndexer()
        elmoIndexer = ELMoTokenCharactersIndexer()
        self.indexers = {}
        self.indexers["tokens"] = singleIdIndexer
        self.indexers["elmo_characters"] = elmoIndexer
Example no. 22
 def __init__(self, lowercase_tokens: bool = True) -> None:
     super().__init__(False)
     self._tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter(),
                                     start_tokens=[START_SYMBOL],
                                     end_tokens=[END_SYMBOL])
     self._token_indexers = {
         'tokens': SingleIdTokenIndexer(lowercase_tokens=lowercase_tokens)
     }
     self._nonterminals_indexers = {
         'tokens': SingleIdTokenIndexer(namespace='nonterminals')
     }
Example no. 23
 def __init__(self,
              lazy: bool = False,
              shuffle_examples: bool = True,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     self._shuffle_examples = shuffle_examples
     self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
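Example no. 24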
def main():
    args = parse_args()
    checkpoint_path = Path(args.checkpoint)
    checkpoint_dir = checkpoint_path.parent
    params_path = checkpoint_dir / 'params.json'
    vocab_dir = checkpoint_dir / 'vocab'

    params = Params.from_file(params_path)
    train_params, model_params = params.pop('train'), params.pop('model')

    tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter(),
                              start_tokens=['<s>'],
                              end_tokens=['</s>'])
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = QuoraParaphraseDatasetReader(
        tokenizer=tokenizer, token_indexers={'tokens': token_indexer})

    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))
    if not args.test_dataset:
        test_dataset_path = train_params.pop('test_dataset_path')
    else:
        test_dataset_path = args.test_dataset
    test_dataset = dataset_reader.read(test_dataset_path)
    if args.only_label:
        test_dataset = [
            d for d in test_dataset
            if d.fields['label'].label == args.only_label
        ]
    vocab = Vocabulary.from_files(vocab_dir)
    random.shuffle(valid_dataset)

    model_params['token_embedder']['pretrained_file'] = None
    model = SeparatedQuoraModel(params=model_params, vocab=vocab)
    model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
    model.to(args.cuda_device)
    model.eval()

    torch.set_grad_enabled(False)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)

    for dataset in (valid_dataset, test_dataset):
        generator = iterator(dataset, shuffle=False, num_epochs=1)
        model.get_metrics(reset=True)
        for batch in tqdm(generator):
            batch = move_to_device(batch, cuda_device=args.cuda_device)
            model(premise=batch['premise'],
                  hypothesis=batch['hypothesis'],
                  label=batch['label'])
        metrics = model.get_metrics()
        pprint(metrics)
Example no. 25
    def __init__(self,
                 tokenizer_A: Tokenizer = None,
                 tokenizer_B: Tokenizer = None,
                 token_indexers_A: Dict[str, TokenIndexer] = None,
                 token_indexers_B: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)

        self._tokenizer_A = tokenizer_A or WordTokenizer(word_splitter=JustSpacesWordSplitter())
        self._tokenizer_B = tokenizer_B or self._tokenizer_A

        self._token_indexers_A = token_indexers_A or {"ids": SingleIdTokenIndexer(namespace="vocab_A")}
        self._token_indexers_B = token_indexers_B or {"ids": SingleIdTokenIndexer(namespace="vocab_B")}
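Example no. 26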
def multiprocess_training_loader(process_number: int, _config,
                                 _queue: mp.Queue, _wait_for_exit: mp.Event,
                                 _local_file, _fasttext_vocab_cached_mapping,
                                 _fasttext_vocab_cached_data):

    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    if _config["preprocessed_tokenized"] == True:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {
            "tokens": SingleIdTokenIndexer(lowercase_tokens=True)
        }
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {
            "tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])
        }
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                               _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])

    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _triple_loader = IrTripleDatasetReader(
        lazy=True,
        tokenizer=_tokenizer,
        token_indexers=_token_indexers,
        max_doc_length=_config["max_doc_length"],
        max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"),
                                             ("doc_neg_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file),
                                    num_epochs=1):

        _queue.put(training_batch)  # this moves the tensors into shared memory

    _queue.close()  # indicate this local thread is done
    # keep this process alive until all the shared memory is used and not needed anymore
    _wait_for_exit.wait()
Example no. 27
class LegalDatasetReader(DatasetReader):
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._word_splitter = JustSpacesWordSplitter()
        self.lazy = lazy

    @overrides
    def text_to_instance(self,
                         graf_tokens: List[Token],
                         labels: List[str] = None) -> Instance:
        graf_field = TextField(graf_tokens, self.token_indexers)

        metadata = MetadataField(({"graf_words": graf_tokens}))

        fields = {"graf": graf_field, "metadata": metadata}

        if labels is not None:
            label_field = MultiLabelField(labels)
            fields["label"] = label_field

        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        Reads a file that has been created by json2lines.
        :param file_path:
        :return:
        """

        counts = {"pos": 0, "neg": 0}

        with open(file_path) as f:
            lines = f.readlines()

        for line in lines:
            graf_str, label_str = line.strip().split("\t")
            if "unmatched" == label_str:
                counts["neg"] += 1
            else:
                counts["pos"] += 1

            yield self.text_to_instance(
                self._word_splitter.split_words(graf_str),
                label_str.split(","))

        print(counts)
Example no. 28
    def load_contextual_ner(self, path, ws_tokenizer=True):
        #
        logging.info('Loading Contextual NER ...')
        self.contextual_ner = Predictor.from_path(path, cuda_device=0)

        if ws_tokenizer:
            # switch-off tokenizer (expect pretokenized, space-separated strings)
            self.contextual_ner._tokenizer = JustSpacesWordSplitter()

        # load labels (to use logits, wip)
        self.contextual_ner_labels = []
        with open(path + 'vocabulary/labels.txt', 'r') as labels_f:
            for line in labels_f:
                self.contextual_ner_labels.append(line.strip())
Example no. 29
 def __init__(self,
              lazy: bool = False,
              en_tokenizer: Tokenizer = None,
              fr_tokenizer: Tokenizer = None,
              en_token_indexers: Dict[str, TokenIndexer] = None,
              fr_token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     self._en_tokenizer = en_tokenizer or WordTokenizer(
             word_splitter=JustSpacesWordSplitter(),
             start_tokens=[START_SYMBOL],
             end_tokens=[END_SYMBOL]
     )
     self._fr_tokenizer = fr_tokenizer or WordTokenizer(
             word_splitter=JustSpacesWordSplitter(),
             start_tokens=[START_SYMBOL],
             end_tokens=[END_SYMBOL]
     )
     self._en_token_indexers = en_token_indexers or {
         "tokens": SingleIdTokenIndexer(namespace="source", lowercase_tokens=True)
     }
     self._fr_token_indexers = fr_token_indexers or {
         "tokens": SingleIdTokenIndexer(namespace="target", lowercase_tokens=True)
     }
Example no. 30
    def setUp(self):
        logging.basicConfig(level=logging.INFO)

        tokenizer = WordTokenizer(JustSpacesWordSplitter())
        reader = SummDataReader(tokenizer, source_max_tokens=400, lazy=False)
        self.train_dataset = reader.read(
            '../data/dev_bbc/train.dev.tsv.tagged')
        self.val_dataset = reader.read('../data/dev_bbc/val.dev.tsv.tagged')
        vocab_path = 'data/cnndm/vocab'
        if os.path.exists(vocab_path):
            self.vocab = Vocabulary.from_files(vocab_path)
        else:
            self.vocab = Vocabulary.from_instances(self.train_dataset,
                                                   max_vocab_size=80000)
            self.vocab.save_to_files(vocab_path)