Example no. 1
def _csv_iterator(data_path, ngrams, yield_cls=False, label=-1):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f, delimiter="\t")
        for row in reader:
            tokens = ' '.join([row[5]])
            #print(row[5])
            tokens = tokenizer(tokens)

            if yield_cls:
                yield row[7], ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
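A minimal sketch of how an iterator like this is typically consumed, assuming a tab-separated train.tsv whose text sits in column 5 and label in column 7 (the file name and column layout are placeholders, and the exact build_vocab_from_iterator signature depends on the torchtext version):

from torchtext.vocab import build_vocab_from_iterator

# Build a vocabulary over the bigram features of the whole file.
vocab = build_vocab_from_iterator(_csv_iterator("train.tsv", ngrams=2))
# With yield_cls=True the label column is yielded alongside the ngram stream.
for label, ngram_tokens in _csv_iterator("train.tsv", ngrams=2, yield_cls=True):
    ids = [vocab[token] for token in ngram_tokens]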
Example no. 2
def build_legacy_torchtext_vocab_pipeline(vocab_file):
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

    def token_iterator(vocab_file):
        # Iterating over a line yields individual characters, so this
        # builds a character-level vocabulary.
        with open(vocab_file, 'r') as f:
            for line in f:
                for token in line:
                    yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer, vocab_func(vocab))
    return pipeline, None, None
Example no. 3
def BookCorpus(vocab,
               tokenizer=get_tokenizer("basic_english"),
               data_select=('train', 'test', 'valid'),
               removed_tokens=[],
               min_sentence_len=None):

    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'test', 'valid'))):
        raise TypeError('data_select is not supported!')

    extracted_files = glob.glob('/datasets01/bookcorpus/021819/*/*.txt')
    random.seed(1000)
    random.shuffle(extracted_files)

    num_files = len(extracted_files)
    _path = {
        'train': extracted_files[:(num_files // 20 * 17)],
        'test': extracted_files[(num_files // 20 * 17):(num_files // 20 * 18)],
        'valid': extracted_files[(num_files // 20 * 18):]
    }

    data = {}
    for item in _path.keys():
        data[item] = []
        logging.info('Creating {} data'.format(item))
        tokens = []
        for txt_file in _path[item]:
            with open(txt_file, 'r', encoding="utf8", errors='ignore') as f:
                for line in f.readlines():
                    _tokens = tokenizer(line.strip())
                    if min_sentence_len:
                        if len(_tokens) >= min_sentence_len:
                            tokens.append(
                                [vocab.stoi[token] for token in _tokens])
                    else:
                        tokens += [vocab.stoi[token] for token in _tokens]
        data[item] = tokens

    for key in data_select:
        if data[key] == []:
            raise TypeError('Dataset {} is empty!'.format(key))
    if min_sentence_len:
        return tuple(
            LanguageModelingDataset(data[d], vocab, lambda x: x, False)
            for d in data_select)
    else:
        return tuple(
            LanguageModelingDataset(
                torch.tensor(data[d]).long(), vocab, lambda x: x, False)
            for d in data_select)
Example no. 4
    def __init__(self,
                 texts,
                 labels,
                 embed_dim,
                 ngrams=3,
                 num_epochs=5,
                 seed=0):

        # set seed
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

        self.texts = texts
        self.labels = labels
        self.embed_dim = embed_dim
        self.ngrams = ngrams

        # construct vocab
        print('Constructing vocabulary...')
        self.vocab = construct_vocab(texts, ngrams)
        self.vocab_size = len(self.vocab)

        # prepare dataset
        print('Preparing dataset...')
        self.train_dataset = make_torchdataset(self.vocab, texts, labels,
                                               ngrams)
        self.num_classes = len(self.train_dataset.get_labels())

        # prepare device ref and model
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = TextClassificationModel(self.vocab_size, self.embed_dim,
                                             self.num_classes).to(self.device)

        # loss function & optimization
        self.criterion = torch.nn.CrossEntropyLoss().to(self.device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=4.0)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                         1,
                                                         gamma=0.9)
        self.batch_size = 16

        self.tokenizer = get_tokenizer("basic_english")
        self.ngrams = ngrams

        if num_epochs > 0:
            print('Training model...')
            self.train(self.train_dataset, num_epochs)
Example no. 5
def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"),
                    root='.data', vocab=None, removed_tokens=[],
                    data_select=('train', 'test', 'valid')):

    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'test', 'valid'))):
        raise TypeError('data_select is not supported!')

    if dataset_name == 'PennTreebank':
        extracted_files = []
        select_to_index = {'train': 0, 'test': 1, 'valid': 2}
        extracted_files = [download_from_url(URLS['PennTreebank'][select_to_index[key]],
                                             root=root) for key in data_select]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files = [os.path.join(root, d) for d in extract_archive(dataset_tar)]

    _path = {}
    for item in data_select:
        _path[item] = _get_datafile_path(item, extracted_files)

    if vocab is None:
        if 'train' not in _path.keys():
            raise TypeError("Must pass a vocab if train is not selected.")
        logging.info('Building Vocab based on {}'.format(_path['train']))
        txt_iter = iter(tokenizer(row) for row in io.open(_path['train'],
                                                          encoding="utf8"))
        vocab = build_vocab_from_iterator(txt_iter)
        logging.info('Vocab has {} entries'.format(len(vocab)))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")

    data = {}
    for item in _path.keys():
        data[item] = []
        logging.info('Creating {} data'.format(item))
        txt_iter = iter(tokenizer(row) for row in io.open(_path[item],
                                                          encoding="utf8"))
        _iter = numericalize_tokens_from_iterator(
            vocab, txt_iter, removed_tokens)
        for tokens in _iter:
            data[item] += [token_id for token_id in tokens]

    for key in data_select:
        if data[key] == []:
            raise TypeError('Dataset {} is empty!'.format(key))

    return tuple(LanguageModelingDataset(torch.tensor(data[d]).long(), vocab)
                 for d in data_select)
Example no. 6
def _load_model():
    # First load into memory the variables that we will need to predict
    checkpoint_path = pathlib.Path(__file__).parent.absolute() / "state_dict.pt"
    checkpoint = torch.load(checkpoint_path)

    global VOCAB, MODEL, NGRAMS, TOKENIZER
    VOCAB = checkpoint["vocab"]
    # Load the model. The checkpoint is assumed to also store `embed_dim`,
    # `num_class` and the trained weights; the key names below and the
    # availability of TextClassificationModel in this module are assumptions.
    MODEL = TextClassificationModel(len(VOCAB), checkpoint["embed_dim"],
                                    checkpoint["num_class"])
    MODEL.load_state_dict(checkpoint["model_state_dict"])
    MODEL.eval()

    NGRAMS = checkpoint["ngrams"]
    TOKENIZER = get_tokenizer("basic_english")
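Once these globals are populated, inference would look roughly like the predict() helpers elsewhere on this page; a sketch with a placeholder input string, assuming torch and ngrams_iterator are imported in the same module:

_load_model()
text = "sample text to classify"
with torch.no_grad():
    ids = torch.tensor([VOCAB[token]
                        for token in ngrams_iterator(TOKENIZER(text), NGRAMS)])
    prediction = MODEL(ids, torch.tensor([0])).argmax(1).item()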
Example no. 7
def build_legacy_pytext_vocab_pipeline(vocab_file):
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")
    # Build a character-level counter (iterating a string yields characters).
    with open(vocab_file, 'r') as f:
        vocab_counter = Counter([token for line in f for token in line.rstrip()])
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")

    pipeline = sequential_transforms(tokenizer_func(tokenizer),
                                     PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")))
    return pipeline, None, None
Example no. 8
def prepare_data(device='cpu', train_batch_size=20, eval_batch_size=20):
    TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                                init_token='<sos>',
                                eos_token='<eos>',
                                lower=True)
    train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
    TEXT.build_vocab(train_txt)
    device = torch.device(device)

    train_data = batchify(train_txt, train_batch_size, TEXT, device)
    val_data = batchify(val_txt, eval_batch_size, TEXT, device)
    test_data = batchify(test_txt, eval_batch_size, TEXT, device)

    return train_data, val_data, test_data
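A sketch of calling this helper, assuming the legacy torchtext (<= 0.8) Field/WikiText2 API and the batchify() helper used above:

train_data, val_data, test_data = prepare_data(device='cuda',
                                               train_batch_size=20,
                                               eval_batch_size=10)
# Each split is expected to be a 2-D tensor of token ids with one column per batch element.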
Example no. 9
def get_accuracy(ps_rref, data_dir, test_batch_size, job_name, target_loss):
    logger = Logger(
        job_name=job_name,
        file_dir=f'./measurement/logs/{job_name}_tester.log').logger

    train_iter = WikiText2(root=data_dir, split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                      specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    bptt = 35

    train_iter, val_iter, test_iter = WikiText2(root=data_dir)
    val_data = data_process(val_iter, vocab, tokenizer)
    val_data = batchify(val_data, test_batch_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    t0 = time.time()
    logger.info("Start!")
    init = t0
    while True:
        t1 = time.time()
        if t1 - t0 > 20:
            t0 = t1
            m = ps_rref.rpc_sync().get_model().to(device)

            test_loss = 0.

            with torch.no_grad():
                hidden = m.init_hidden(test_batch_size)
                for batch_idx, i in enumerate(
                        range(0,
                              val_data.size(0) - 1, bptt)):
                    data, targets = get_batch(val_data, i, bptt)
                    data, targets = data.to(device), targets.to(device)
                    hidden = repackage_hidden(hidden)
                    output, hidden = m(data, hidden)
                    loss = criterion(output, targets)
                    test_loss += len(data) * loss.item()

            test_loss /= (len(val_data) - 1)

            logger.info("Test Loss: {:7.3f} | Time: {:7.2f} seconds".format(
                test_loss, (t1 - init)))

            if test_loss < target_loss:
                ps_rref.rpc_sync().stop()
                break
Example no. 10
def build_batch_torchtext_vocab(vocab_file):
    from torchtext.data.utils import get_tokenizer
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from functools import partial
    from transforms import TextClassificationPipeline
    # Note: TextDataPipeline is assumed to be imported or defined elsewhere
    # in this module.

    def token_iterator(vocab_file):
        # Each yielded item is a whole line of the vocab file.
        with open(vocab_file, 'r') as f:
            for token in f:
                yield token
    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    text_pipeline = TextDataPipeline(tokenizer, partial(map, vocab))
    label_pipeline = int
    return TextClassificationPipeline(label_pipeline, text_pipeline), None
Example no. 11
def _csv_iterator(data_path, ngrams, yield_cls=False):
    tokenizer = get_tokenizer("basic_english")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f)
        for row in reader:
            tokens = ' '.join(row[1:])
            # print(tokens)
            tokens = tokenizer(tokens)

            # print(tokens)
            if yield_cls:
                yield int(row[0]) - 1, ngrams_iterator(tokens, ngrams)
            else:
                yield ngrams_iterator(tokens, ngrams)
Example no. 12
def build_legacy_batch_torchtext_vocab_pipeline(vocab_file):
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from transforms import TextClassificationPipeline

    def token_iterator(vocab_file):
        # Iterating over a line yields individual characters, so this
        # builds a character-level vocabulary.
        with open(vocab_file, 'r') as f:
            for line in f:
                for token in line:
                    yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    text_pipeline = sequential_transforms(tokenizer, vocab_func(vocab))
    label_pipeline = totensor(dtype=torch.long)
    return TextClassificationPipeline(label_pipeline, text_pipeline), None, None
Example no. 13
def basic_english():
    """
    Basic  english tokenizer

    We use character level tokenizer in this experiment.
    You can switch by setting,

    ```
        'tokenizer': 'basic_english',
    ```

    as the configurations dictionary when starting the experiment.

    """
    return get_tokenizer('basic_english')
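For reference, get_tokenizer('basic_english') lowercases the input, normalizes common punctuation and splits on whitespace:

tokenizer = basic_english()
print(tokenizer("The quick, brown fox!"))
# ['the', 'quick', ',', 'brown', 'fox', '!']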
Example no. 14
def get_data(device):
    TEXT = torchtext.data.Field(
        tokenize=get_tokenizer("basic_english"), init_token="<sos>", eos_token="<eos>", lower=True
    )
    train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
    TEXT.build_vocab(train_txt)
    ntokens = len(TEXT.vocab.stoi)

    batch_size = 500
    eval_batch_size = 200
    train_data = batchify(train_txt, batch_size, TEXT, device)
    val_data = batchify(val_txt, eval_batch_size, TEXT, device)
    test_data = batchify(test_txt, eval_batch_size, TEXT, device)

    return ntokens, train_data, val_data, test_data
Example no. 15
def get_src_trg(tokenize=True):

    if tokenize == False:
        SRC = Field(sequential = False,
                    init_token = '<sos>',
                    eos_token = '<eos>',
                    lower = False)

        TRG = Field(sequential = False,
                    init_token = '<sos>',
                    eos_token = '<eos>',
                    lower = False)
    else:
        SRC = Field(tokenize = get_tokenizer("spacy"),
                    init_token = '<sos>',
                    eos_token = '<eos>',
                    lower = False)

        TRG = Field(tokenize = get_tokenizer("spacy"),
                    init_token = '<sos>',
                    eos_token = '<eos>',
                    lower = False)
    
    return SRC, TRG
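Note that get_tokenizer("spacy") requires spaCy and an English model to be installed (for example en_core_web_sm; the default model name depends on the torchtext version). A usage sketch:

SRC, TRG = get_src_trg(tokenize=True)
# The fields are later attached to a legacy torchtext dataset and their
# vocabularies built with SRC.build_vocab(train_data) / TRG.build_vocab(train_data).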
Example no. 16
def build_torchtext_vocab(vocab_file):
    from torchtext.data.utils import get_tokenizer
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator
    from torchtext.experimental.functional import totensor, vocab_func, sequential_transforms

    def token_iterator(vocab_file):
        # Each yielded item is a whole line of the vocab file.
        with open(vocab_file, 'r') as f:
            for token in f:
                yield token

    vocab = build_vocab_from_iterator(token_iterator(vocab_file))
    pipeline = sequential_transforms(tokenizer, vocab_func(vocab),
                                     totensor(dtype=torch.long))
    return pipeline, None, None
Example no. 17
    def predict(self, text: str):
        time_started = time()
        tokenizer = get_tokenizer("basic_english")
        with torch.no_grad():
            text_tensor = torch.tensor([
                self.vocab[token]
                for token in ngrams_iterator(tokenizer(text), NGRAMS)
            ])
            output_tensor = self.model(text_tensor, torch.tensor([0]))
            output = output_tensor.argmax(1).item()
            elapsed = time() - time_started
            logger.info(
                f"ModelWrapper.predict: [elapsed {elapsed:.2f}s]: "
                f"len(text)={len(text)}, len(tokens)={len(text_tensor)}, answer={output}"
            )
            return output
Example no. 18
    def __init__(self, train_batch_size=20, eval_batch_size=10, bptt=35):
        self.bptt = bptt
        train_iter = WikiText2(split='train')
        self.tokenizer = get_tokenizer('basic_english')
        counter = Counter()
        for line in train_iter:
            counter.update(self.tokenizer(line))
        self.vocab = Vocab(counter)
        train_iter, val_iter, test_iter = WikiText2()
        train_data = self.data_process(train_iter)
        val_data = self.data_process(val_iter)
        test_data = self.data_process(test_iter)

        self.train_data = self.batchify(train_data, train_batch_size)
        self.val_data = self.batchify(val_data, eval_batch_size)
        self.test_data = self.batchify(test_data, eval_batch_size)
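The data_process and batchify helpers are not shown above; a minimal sketch of data_process in the style of the WikiText-2 language-modeling tutorial (an assumption about how this class is meant to work):

    def data_process(self, raw_text_iter):
        # Numericalize each line and concatenate the non-empty tensors
        # into one flat sequence of token ids.
        data = [torch.tensor([self.vocab[token] for token in self.tokenizer(item)],
                             dtype=torch.long) for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))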
Example no. 19
def constructVocab(news_file_train, news_file_test, attrs, save_path):
    """
        Build field using torchtext for tokenization
    
    Returns:
        torchtext.vocabulary 
    """

    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(
        news_token_generator(news_file_train, news_file_test, tokenizer,
                             attrs))

    with open(save_path, 'wb') as output:
        pickle.dump(vocab, output)
Example no. 20
def build_vocab(data_path, data_name, jsons, threshold, lang='en'):
    """Build vocabulary"""
    counter = Counter()
    for path in jsons[data_name]:
        full_path = os.path.join(data_path, data_name, path)
        if data_name == 'f8k':
            captions = cap_from_flickr_json(full_path)
            tokenizer = get_tokenizer('spacy', language=lang)
            for i, caption in enumerate(captions):
                counter.update(tokenizer(caption))
                if i % 1000 == 0:
                    print("[%d/%d] tokenized the captions." % (i, len(captions)))

    # Create a vocabulary from words whose number of occurrences is at least the threshold
    vocab = Vocab(counter, min_freq=threshold, specials=('<unk>', '<pad>', '<start>', '<end>'))
    return vocab
Example no. 21
def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([
            vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams)
        ])
        output = model(text, torch.tensor([0]))
        ret = output > THRESHOLD
        # Collect the indices of all labels whose score exceeds the threshold.
        result = [idx for idx, r in enumerate(ret[0]) if r]
        return result
Example no. 22
    def _dataset_parser(self, inputdataset) -> Tuple[List, List]:

        """01. Token -> ID conversion pipelines"""

        def check_token(x):
            for token in x:
                if token not in self.vocabulary.keys():
                    self.vocabulary[token] = len(self.vocabulary)+1

            return self.vocabulary

        def sample_pipeline(x):
            
            vocab = check_token(x)
            return [vocab[token] for token in x]

            #return [self.vocabulary[token] for token in x]

        def label_pipeline(x):
            return int(x)-1

        tokenizer = get_tokenizer('basic_english')
        samples, labels = [], []

        for (label,line) in inputdataset:

            tokens = tokenizer(line)

            # Skip samples longer than 249 tokens, then pad every remaining
            # sequence to a fixed length of 250 tokens.
            if len(tokens) > 249:
                continue

            for pad in range(0, 250-len(tokens)):
                tokens.append('PAD')

            word_embedding = sample_pipeline(tokens)

            # for pad in range(0, 250-len(word_embedding)):
            #     word_embedding.append(0)
            
            current_sample = torch.tensor(word_embedding, dtype=torch.int64)
            samples.append(current_sample)
            
            label_embedding = label_pipeline(label)
            current_label = torch.tensor(label_embedding, dtype=torch.int64)
            labels.append(current_label)

        return samples, labels
Example no. 23
def _pd_iterator(data_to_parse: np.ndarray,
                 ngrams: int,
                 yield_cls: bool = False):
    """
    :param data_to_parse: array of two colums with label and text
    :param ngrams: amount of ngrams
    :param yield_cls: return text with label or without
    :return: generator needed in future parsing for torch
    """
    tokenizer = get_tokenizer(None)
    for row_id in range(len(data_to_parse)):
        tokens = data_to_parse[row_id][1]
        tokens = tokenizer(tokens)
        if yield_cls:
            yield data_to_parse[row_id][0], ngrams_iterator(tokens, ngrams)
        else:
            yield ngrams_iterator(tokens, ngrams)
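A sketch of feeding this generator, assuming the array comes from a two-column (label, text) DataFrame via to_numpy(); the sample rows below are made up:

import numpy as np

data = np.array([[1, "first sample text"],
                 [2, "another longer sample text"]], dtype=object)
for label, ngram_tokens in _pd_iterator(data, ngrams=2, yield_cls=True):
    print(label, list(ngram_tokens))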
Example no. 24
 def __init__(
     self,
     sequential=True,
     use_vocab=True,
     init_token=None,
     eos_token=None,
     fix_length=None,
     dtype=torch.long,
     preprocessing=None,
     postprocessing=None,
     lower=False,
     tokenize=None,
     tokenizer_language="en",
     include_lengths=False,
     batch_first=False,
     pad_token="<pad>",
     unk_token="<unk>",
     pad_first=False,
     truncate_first=False,
     stop_words=None,
     is_target=False,
 ):
     self.sequential = sequential
     self.use_vocab = use_vocab
     self.init_token = init_token
     self.eos_token = eos_token
     self.unk_token = unk_token
     self.fix_length = fix_length
     self.dtype = dtype
     self.preprocessing = preprocessing
     self.postprocessing = postprocessing
     self.lower = lower
     # store params to construct tokenizer for serialization
     # in case the tokenizer isn't picklable (e.g. spacy)
     self.tokenizer_args = (tokenize, tokenizer_language)
     self.tokenize = get_tokenizer(tokenize, tokenizer_language)
     self.include_lengths = include_lengths
     self.batch_first = batch_first
     self.pad_token = pad_token if self.sequential else None
     self.pad_first = pad_first
     self.truncate_first = truncate_first
     try:
         self.stop_words = set(stop_words) if stop_words is not None else None
     except TypeError:
         raise ValueError("Stop words must be convertible to a set")
     self.is_target = is_target
Example no. 25
def make_torch_vocab(torch_text_path, corpus_type, min_freq=None):
    """Leveraging torch text experimental functions.

    :param torch_text_path: string, file path to torchtext file
    :param corpus_type: string, Required. One of 'train', 'valid', 'test'
    :param min_freq: token counter min frequency threshold, if None -> 1.
    :return: vocabulary

    source: https://github.com/pytorch/text/blob/master/torchtext/experimental/vocab.py
    """
    logging.info('Starting make_torch_vocab()')

    if min_freq is None:
        min_freq = 1

    files = os.listdir(torch_text_path)
    if all([".tokens" in i for i in files]):
        logging.info(
            f'Found {corpus_type} .tokens files.\n'
            f'\tReturning corpora from disk instead of downloading them.\n'
            f'\tTo force new download, delete or rename these files:\n'
            f'\t{files}')

        tokenizer = get_tokenizer('basic_english')
        vocabulary = {}

        for file in files:
            file_path = os.sep.join([torch_text_path, file])
            counter = Counter()
            f = open(file_path, 'r')

            for line in f:
                counter.update(tokenizer(line))

            v = Vocab(counter, min_freq=min_freq)

            key = 'train' if '.train.' in file else 'test' if '.test.' in file else 'valid'
            vocabulary.update({key: v})
            f.close()

        logging.info(f'Completed parsing vocab for {corpus_type}.')
        for k, v in vocabulary.items():
            logging.info(f'Dataset {k}: with vocabulary of length: {len(v)}.')

        return vocabulary
Example no. 26
def get_datasets(args):
    download = hvd.local_rank() == 0
    if not download: hvd.allreduce(torch.tensor(1), name="barrier")
    args.dir = os.path.join(args.dir, args.dataset)
    os.makedirs(args.dir, exist_ok=True)
    tokenizer = get_tokenizer("basic_english")
    if args.dataset == 'wikitext2':
        WikiText = datasets.WikiText2
    elif args.dataset == 'wikitext103':
        WikiText = datasets.WikiText103
    train_data, val_data, test_data = WikiText(tokenizer=tokenizer, root=args.dir)
    if args.verbose: print("")
    if download: hvd.allreduce(torch.tensor(1), name="barrier")

    ntokens = len(train_data.get_vocab())
    batch_size = args.batch_size * (args.bptt + 1)

    def collate(data):
        data = torch.stack(data)
        source = data.view(args.batch_size, -1).contiguous()
        data = source[:, :-1]
        target = source[:, 1:].contiguous().view(-1)
        return data, target

    torch.set_num_threads(4)
    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}

    train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
            train_data, batch_size=batch_size, collate_fn=collate, drop_last=True,
            sampler=train_sampler, shuffle=False, **kwargs)
    val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data, num_replicas=hvd.size(), rank=hvd.rank())
    val_loader = torch.utils.data.DataLoader(
            val_data, batch_size=batch_size, collate_fn=collate, drop_last=True,
            sampler=val_sampler, shuffle=False, **kwargs)
    test_sampler = torch.utils.data.distributed.DistributedSampler(
            test_data, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(
            test_data, batch_size=batch_size, collate_fn=collate, drop_last=True,
            sampler=test_sampler, shuffle=False, **kwargs)

    return (train_sampler, train_loader), (val_sampler, val_loader), \
           (test_sampler, test_loader), ntokens
Example no. 27
    def iterator(start, num_lines):
        tokenizer = get_tokenizer("basic_english")
        with io.open(data_path, encoding="utf8") as f:
            reader = unicode_csv_reader(f)
            for i, row in enumerate(reader):
                if i == start:
                    break
            for _ in range(num_lines):
                tokens = ' '.join(row[1:])
                tokens = ngrams_iterator(tokenizer(tokens), ngrams)
                yield int(row[0]) - 1, torch.tensor(
                    [vocab[token] for token in tokens])
                try:
                    row = next(reader)
                except StopIteration:
                    f.seek(0)
                    reader = unicode_csv_reader(f)
                    row = next(reader)
Example no. 28
def _setup_datasets(dataset_name, tokenizer, root, vocab, data_select,
                    single_line, year, language):
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')

    data_select = check_default_set(data_select, ('train', 'test', 'valid'))

    if not single_line and dataset_name != 'WikiText103':
        raise TypeError('single_line must be True except for WikiText103')
    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        if dataset_name == 'WMTNewsCrawl':
            raw_train, = raw.DATASETS[dataset_name](root=root,
                                                    data_select=('train', ),
                                                    year=year,
                                                    language=language)
        else:
            raw_train, = raw.DATASETS[dataset_name](root=root,
                                                    data_select=('train', ))
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_train, tokenizer)
    logger_.info('Vocab has %d entries', len(vocab))

    def text_transform(line):
        return torch.tensor([vocab[token] for token in tokenizer(line)],
                            dtype=torch.long)

    if dataset_name == 'WMTNewsCrawl':
        raw_datasets = raw.DATASETS[dataset_name](root=root,
                                                  data_select=data_select,
                                                  year=year,
                                                  language=language)
    else:
        raw_datasets = raw.DATASETS[dataset_name](root=root,
                                                  data_select=data_select)
    raw_data = {
        name: list(map(text_transform, raw_dataset))
        for name, raw_dataset in zip(data_select, raw_datasets)
    }
    logger_.info('Building datasets for {}'.format(data_select))
    return tuple(
        LanguageModelingDataset(raw_data[item], vocab, text_transform,
                                single_line) for item in data_select)
Example no. 29
def _setup_datasets(dataset_name,
                    root='.data',
                    vocab=None,
                    tokenizer=None,
                    data_select=('train', 'dev')):
    text_transform = []
    if tokenizer is None:
        tokenizer = get_tokenizer('basic_english')
    text_transform = sequential_transforms(tokenizer)
    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'dev'))):
        raise TypeError(
            'Given data selection {} is not supported!'.format(data_select))
    train, dev = raw.DATASETS[dataset_name](root=root)
    raw_data = {
        'train': [item for item in train],
        'dev': [item for item in dev]
    }
    if vocab is None:
        if 'train' not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")

        def apply_transform(data):
            for (_context, _question, _answers, _ans_pos) in data:
                tok_ans = []
                for item in _answers:
                    tok_ans += text_transform(item)
                yield text_transform(_context) + text_transform(
                    _question) + tok_ans

        vocab = build_vocab_from_iterator(apply_transform(raw_data['train']),
                                          len(raw_data['train']))
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))
    transforms = {
        'context': text_transform,
        'question': text_transform,
        'answers': text_transform,
        'ans_pos': totensor(dtype=torch.long)
    }
    return tuple(
        QuestionAnswerDataset(raw_data[item], vocab, transforms)
        for item in data_select)
Example no. 30
def predict(text, model, dictionary, ngrams):
    r"""
    The predict() function here is used to test the model on a sample text.
    The input text is numericalized with the vocab and then sent to
    the model for inference.
    Args:
        text: a sample text string
        model: the trained model
        dictionary: a vocab object for the information of string-to-index
        ngrams: the number of ngrams.
    """
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([
            dictionary[token]
            for token in ngrams_iterator(tokenizer(text), ngrams)
        ])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1
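For context, this mirrors the torchtext text-classification tutorial, where the call looks roughly like the sketch below (the model, vocab, example text and AG_NEWS-style label mapping are placeholders):

ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}
ex_text_str = "The central bank raised interest rates again on Tuesday."
model = model.to("cpu")
print("This is a %s news" % ag_news_label[predict(ex_text_str, model, vocab, 2)])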