Example 1
def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.
    
    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT
                              model
        trg_vocab_size (int): Size of the target vocabulary in the NMT
                              model
        seq_len (int): Maximum length of any source or target sentence
    
    Returns:
        ExplicitNext. Alignment data stream which can be iterated
        explicitly
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})
    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))
    s = Batch(s, iteration_scheme=ConstantScheme(1))
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(masked_stream)
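
The _add_special_ids helper used to build the dummy vocabulary above is not shown in these snippets. A minimal sketch of what such a helper plausibly does, assuming SGNMT-style id constants in utils (only utils.EOS_ID and utils.UNK_ID actually appear in the snippets; the name utils.GO_ID for the <S> entry is an assumption):

def _add_special_ids(vocab):
    """Sketch: add the reserved special-token ids to a plain word->id dict.

    utils.GO_ID is an assumed name for the <S> id; utils.EOS_ID and
    utils.UNK_ID are the constants referenced elsewhere in these examples.
    """
    vocab['<S>'] = utils.GO_ID
    vocab['</S>'] = utils.EOS_ID
    vocab['<UNK>'] = utils.UNK_ID
    return vocab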
Example 2
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):

	s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')
	t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')

	# Merge 
	stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))
	# Filter -- TODO 
	stream = Filter(stream, predicate=_too_long(seq_len=configuration['seq_len']))

	# Map - no need 

	# Batch - Sort 
	stream = Batch(stream, 
		iteration_scheme=ConstantScheme(
			configuration['batch_size']*configuration['sort_k_batches']))
	stream = Mapping(stream, SortMapping(_length))
	stream = Unpack(stream)
	stream = Batch(
        stream, iteration_scheme=ConstantScheme(configuration['batch_size']))

	# Pad 
	# Note that </s>=0. Fuel only allows padding 0 by default 
	masked_stream = Padding(stream)

	return masked_stream
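
A Padding-wrapped stream like the one returned above adds a *_mask source right after each padded source, so iterating it yields (source, source_mask, target, target_mask) batches of numpy arrays. A minimal usage sketch, with configuration, file lists and vocabularies assumed to be set up as for the call above:

train_stream = get_train_stream(configuration, sfiles, tfiles,
                                svocab_dict, tvocab_dict)
print(train_stream.sources)  # ('source', 'source_mask', 'target', 'target_mask')
for source, source_mask, target, target_mask in train_stream.get_epoch_iterator():
    # source/target are (batch, time) integer arrays; the masks mark real
    # (non-padding) positions with ones.
    print(source.shape, target.shape)
    break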
Example 3
def get_dev_stream_with_topicalq(test_set=None,
                                 src_vocab=None,
                                 src_vocab_size=30000,
                                 topical_test_set=None,
                                 topical_vocab=None,
                                 topical_vocab_size=2000,
                                 unk_id=1,
                                 **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if test_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        print test_set, type(src_vocab)
        topical_vocab = cPickle.load(open(topical_vocab, 'rb'))
        #not ensure special token.
        topical_dataset = TextFile([topical_test_set], topical_vocab, None,
                                   None, '10')
        dev_dataset = TextFile([test_set], src_vocab, None)
        #dev_stream = DataStream(dev_dataset)
        # Merge them to get a source, target pair
        dev_stream = Merge([
            dev_dataset.get_example_stream(),
            topical_dataset.get_example_stream()
        ], ('source', 'source_topical'))
    return dev_stream
Example 4
def get_dev_stream(val_set=None,
                   valid_sent_dict=None,
                   src_vocab=None,
                   trg_vocab=None,
                   src_vocab_size=30000,
                   trg_vocab_size=30000,
                   unk_id=1,
                   **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries and ensure special tokens exist
        src_vocab = ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab)),
                                          bos_idx=0,
                                          eos_idx=src_vocab_size - 1,
                                          unk_idx=unk_id)

        trg_vocab = ensure_special_tokens(trg_vocab if isinstance(
            trg_vocab, dict) else cPickle.load(open(trg_vocab)),
                                          bos_idx=0,
                                          eos_idx=trg_vocab_size - 1,
                                          unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_dictset = TextFile([valid_sent_dict], trg_vocab, None)
        #dev_stream = DataStream(dev_dataset)
        # Merge them to get a source, target pair
        dev_stream = Merge([
            dev_dataset.get_example_stream(),
            dev_dictset.get_example_stream()
        ], ('source', 'valid_sent_trg_dict'))
    return dev_stream
Example 5
def get_devtest_stream(data_type='valid', input_file=None, **kwards):

    if data_type == 'valid':
        data_file = kwards.pop('valid_src')
        data_file_hist = kwards.pop('valid_src_hist')
    elif data_type == 'test':
        if input_file is None:
            data_file = kwards.pop('test_src')
        else:
            data_file = input_file
        # added by Longyue
        data_file_hist = kwards.pop('test_src_hist')
    else:
        logger.error('wrong datatype, which must be one of valid or test')

    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')
    vocab_src = kwards.pop('vocab_src')

    dataset = TextFile(files=[data_file],
                       dictionary=pkl.load(open(vocab_src, 'rb')),
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)

    dev_stream = DataStream(dataset)

    # added by Longyue
    hist_len = 3
    dev_stream_hist = []
    for idx in range(hist_len):
        dataset_hist = TextFile(files=[data_file_hist + str(idx)],
                                dictionary=pkl.load(open(vocab_src, 'rb')),
                                level='word',
                                unk_token=unk_token,
                                bos_token=None,
                                eos_token=eos_token)

        dev_stream_hist.append(DataStream(dataset_hist))

    dev_stream_hist_combine = []
    for d_s in dev_stream_hist:
        for item in d_s.get_epoch_iterator():
            dev_stream_hist_combine.append(item)

    item_len = len(dev_stream_hist_combine)
    dev_stream_hist_split = []
    # Regroup the flat history list into hist_len history sentences per
    # example: item i of history stream j sits at offset
    # i + item_len / hist_len * j in dev_stream_hist_combine.
    for i in range(item_len / hist_len):
        tmp = []
        for j in range(hist_len):
            tmp.append(dev_stream_hist_combine[i + item_len / hist_len * j])
        dev_stream_hist_split.append(tmp)

    dev_stream_hist_split = tuple(dev_stream_hist_split)
    return dev_stream, dev_stream_hist_split
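
The regrouping loop above collects, for each sentence i, its hist_len history sentences back into one list; the same reshaping can be written more directly as below (a side-note sketch relying on the same ordering assumption, namely that item i of history stream j sits at offset i + n_sentences * j in the combined list):

n_sentences = item_len // hist_len
dev_stream_hist_split = tuple(
    [dev_stream_hist_combine[i + n_sentences * j] for j in range(hist_len)]
    for i in range(n_sentences))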
Example 6
def get_tr_stream(config):

    cgs = config['cgs']
    enc_ids, dec_ids = get_enc_dec_ids(cgs)

    # Prepare source vocabs and files, make sure special tokens are there
    src_files = config['src_datas']
    src_vocabs = {
        k: cPickle.load(open(v))
        for k, v in config['src_vocabs'].iteritems()
    }
    for k in src_vocabs.keys():
        src_vocabs[k]['<S>'] = 0
        src_vocabs[k]['</S>'] = config['src_eos_idxs'][k]
        src_vocabs[k]['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_files = config['trg_datas']
    trg_vocabs = {
        k: cPickle.load(open(v))
        for k, v in config['trg_vocabs'].iteritems()
    }
    for k in trg_vocabs.keys():
        trg_vocabs[k]['<S>'] = 0
        trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k]
        trg_vocabs[k]['<UNK>'] = config['unk_id']

    # Create individual source streams
    src_datasets = {
        cg: TextFile([src_files[cg]], src_vocabs[p_(cg)[0]], None)
        for cg in cgs
    }

    # Create individual target streams
    trg_datasets = {
        cg: TextFile([trg_files[cg]], trg_vocabs[p_(cg)[1]], None)
        for cg in cgs
    }

    # Build the preprocessing pipeline for individual streams
    ind_streams = {}
    for cg in cgs:
        logger.info('Building training stream for cg:[{}]'.format(cg))
        masked_stream = get_src_trg_stream(cg, config, src_datasets,
                                           trg_datasets)
        ind_streams[cg] = masked_stream

    # Scheduler and meta-controller
    multi_enc_stream = MultiEncStream(ind_streams,
                                      schedule=config['schedule'],
                                      batch_sizes=config['batch_sizes'],
                                      transpose=True,
                                      start_after=config.get(
                                          'start_after', None))
    return multi_enc_stream
Example 7
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
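
The helpers _too_long, _oov_to_unk and _length are defined in the accompanying stream module and are not shown here. A minimal sketch of what they conventionally look like in the blocks machine_translation example (treat exact names and signatures as assumptions); they are small classes rather than closures, presumably so the stream stays picklable for checkpointing:

class _too_long(object):
    """Filter predicate: keep a pair only if both sides fit in seq_len."""
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all(len(sentence) <= self.seq_len for sentence in sentence_pair)


class _oov_to_unk(object):
    """Replace any id outside the vocabulary range with unk_id."""
    def __init__(self, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_pair):
        return ([w if w < self.src_vocab_size else self.unk_id
                 for w in sentence_pair[0]],
                [w if w < self.trg_vocab_size else self.unk_id
                 for w in sentence_pair[1]])


def _length(sentence_pair):
    """Sort key used with SortMapping: length of the target sentence."""
    return len(sentence_pair[-1])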
Example 8
def get_sgnmt_tr_stream(src_data,
                        trg_data,
                        src_vocab_size=30000,
                        trg_vocab_size=30000,
                        unk_id=1,
                        seq_len=50,
                        batch_size=80,
                        sort_k_batches=12,
                        **kwargs):
    """Prepares the unshuffled training data stream. This corresponds 
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Example 9
def get_test_stream_withContext_grdTruth(test_ctx_datas=None,
                                         test_set_source=None,
                                         test_set_target=None,
                                         src_vocab=None,
                                         src_vocab_size=30000,
                                         trg_vocab=None,
                                         trg_vocab_size=30000,
                                         batch_size=128,
                                         unk_id=1,
                                         ctx_num=3,
                                         **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
            trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_vocab_size - 1,
                                           unk_idx=unk_id)
        print test_set_source, type(src_vocab)
        # Get text files from both source and target
        ctx_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
        dev_dataset = TextFile([test_set_source], src_vocab, None)
        dev_target = TextFile([test_set_target], trg_vocab, None)
        dev_stream = Merge([i.get_example_stream() for i in ctx_datasets] + [
            dev_dataset.get_example_stream(),
            dev_target.get_example_stream()
        ],
                           tuple('context_' + str(i) for i in range(ctx_num)) +
                           ('source', 'target'))
        stream = Mapping(
            dev_stream,
            _oov_to_unk(ctx_num=ctx_num,
                        src_vocab_size=src_vocab_size,
                        trg_vocab_size=trg_vocab_size,
                        unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
        masked_stream = PaddingWithEOSContext(
            stream, [src_vocab_size - 1
                     for i in range(ctx_num + 1)] + [trg_vocab_size - 1])

    return masked_stream
Example 10
def get_dev_stream_withContext_withPosTag(test_ctx_datas=None,
                                          test_posTag_datas=None,
                                          test_set_source=None,
                                          src_vocab=None,
                                          src_vocab_size=30000,
                                          unk_id=1,
                                          ctx_num=3,
                                          **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        print test_set_source, type(src_vocab)
        # Get text files from both source and target
        ctx_datasets = []
        posTag_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
            posTag_datasets.append(
                TextFile([test_posTag_datas[i]], src_vocab, None))
        posTag_datasets.append(
            TextFile([test_posTag_datas[ctx_num]], src_vocab, None))
        src_dataset = TextFile([test_set_source], src_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge(
            [i.get_example_stream() for i in ctx_datasets] +
            [i.get_example_stream()
             for i in posTag_datasets] + [src_dataset.get_example_stream()],
            tuple('context_' + str(i) for i in range(ctx_num)) +
            tuple('context_posTag_' + str(i)
                  for i in range(ctx_num)) + ('source_posTag', 'source'))

        stream = Mapping(
            dev_stream,
            _oov_to_unk_posTag_dev(ctx_num=ctx_num,
                                   src_vocab_size=src_vocab_size,
                                   unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        masked_stream = PaddingWithEOSContext(
            stream, [src_vocab_size - 1 for i in range(2 * ctx_num + 2)])

    return masked_stream
Example 11
def get_dev_stream_with_grdTruth(val_set_source=None,
                                 val_set_target=None,
                                 src_vocab=None,
                                 src_vocab_size=30000,
                                 trg_vocab=None,
                                 trg_vocab_size=30000,
                                 batch_size=128,
                                 unk_id=1,
                                 seq_len=50,
                                 **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
            trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_vocab_size - 1,
                                           unk_idx=unk_id)

        print val_set_source, type(src_vocab)
        dev_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)
        # Merge them to get a source, target pair
        dev_stream = Merge([
            dev_dataset.get_example_stream(),
            trg_dataset.get_example_stream()
        ], ('dev_source', 'dev_target'))
        # Filter sequences that are too long
        stream = Filter(dev_stream, predicate=_too_long(seq_len=seq_len))

        # Replace out of vocabulary tokens with unk token
        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=src_vocab_size,
                        trg_vocab_size=trg_vocab_size,
                        unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        # Pad sequences that are short
        masked_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1])
    return masked_stream
Example 12
def get_dev_stream_with_context_features(val_context_features=None, val_set=None, src_vocab=None,
                                         src_vocab_size=30000, unk_id=1, **kwargs):
    """Setup development set stream if necessary."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']


    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)

        # now add the source with the image features
        # create the image datastream (iterate over a file line-by-line)
        con_features = _get_np_array(val_context_features)
        con_feature_dataset = IterableDataset(con_features)
        valid_image_stream = DataStream(con_feature_dataset)

        # dev_stream = DataStream(dev_dataset)
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            valid_image_stream], ('source', 'initial_context'))
    #         dev_stream = dev_stream.get_example_stream()

    return dev_stream
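
_get_np_array reads the default 'arr_0' entry of an .npz archive, so a compatible context-feature file can be written with a single positional numpy.savez call; a small sketch with illustrative shapes (one feature vector per validation sentence):

import numpy

features = numpy.random.rand(1000, 2048).astype('float32')  # illustrative shape
numpy.savez('val_context_features.npz', features)  # stored under 'arr_0'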
Example 13
def get_devtest_stream(data_type='valid', input_file=None, **kwards):

    if data_type == 'valid':
        data_file = kwards.pop('valid_src')
    elif data_type == 'test':
        if input_file is None:
            data_file = kwards.pop('test_src')
        else:
            data_file = input_file
    else:
        logger.error('wrong datatype, which must be one of valid or test')

    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')
    vocab_src = kwards.pop('vocab_src')

    dataset = TextFile(files=[data_file],
                       encoding='UTF-8',
                       preprocess=to_lower_case,
                       dictionary=pkl.load(open(vocab_src, 'rb')),
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)

    dev_stream = DataStream(dataset)

    return dev_stream
Example 14
def get_dev_stream(val_set=None,
                   src_vocab=None,
                   src_vocab_size=30000,
                   unk_id=1,
                   bos_token=None,
                   **kwargs):
    """Setup development set stream if necessary."""
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')

    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab)),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        dev_dataset = TextFile([val_set],
                               src_vocab,
                               bos_token=bos_token,
                               eos_token=u'</S>',
                               unk_token=u'<UNK>',
                               encoding='utf8')

        dev_stream = DataStream(dev_dataset)
    return dev_stream
Example 15
def get_log_prob_stream(cg, config):
    eid, did = p_(cg)
    dataset = config['log_prob_sets'][cg]

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocab = cPickle.load(open(config['src_vocabs'][eid]))
    src_vocab['<S>'] = 0
    src_vocab['</S>'] = config['src_eos_idxs'][eid]
    src_vocab['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocab = cPickle.load(open(config['trg_vocabs'][did]))
    trg_vocab['<S>'] = 0
    trg_vocab['</S>'] = config['trg_eos_idxs'][did]
    trg_vocab['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_dataset = TextFile([dataset[0]], src_vocab, None)
    trg_dataset = TextFile([dataset[1]], trg_vocab, None)
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                    trg_vocab_size=config['trg_vocab_sizes'][did],
                    unk_id=config['unk_id']))
    bs = 100
    if 'log_prob_bs' in config:
        if isinstance(config['log_prob_bs'], dict):
            bs = config['log_prob_bs'][cg]
        else:
            bs = config['log_prob_bs']
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(bs,
                                                   num_examples=get_num_lines(
                                                       dataset[0])))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))

    return masked_stream
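
Padding always pads with zeros, and in this setup index 0 is reserved for <S>, so _remapWordIdx rewrites the padded zeros of selected sources to the proper EOS index. A rough sketch of such a mapping; reading each triple as (source_index, old_value, new_value) is an assumption based on how it is called above:

class _remapWordIdx(object):
    """Sketch: rewrite value `old` to `new` in source number `idx`."""
    def __init__(self, mappings):
        # mappings: iterable of (source_index, old_value, new_value) triples
        self.mappings = mappings

    def __call__(self, batch):
        batch = list(batch)
        for idx, old, new in self.mappings:
            batch[idx][batch[idx] == old] = new  # numpy boolean assignment
        return tuple(batch)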
Example 16
def get_dev_stream_with_prefix_file(val_set=None, val_set_grndtruth=None, val_set_prefixes=None, val_set_suffixes=None,
                                    src_vocab=None, src_vocab_size=30000, trg_vocab=None, trg_vocab_size=30000, unk_id=1,
                                    return_vocab=False, **kwargs):
    """Setup development stream with user-provided source, target, prefixes, and suffixes"""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None and val_set_prefixes is not None and val_set_suffixes is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        # Note: user should have already provided the EOS token in the data representation for the suffix
        # Note: The reason that we need EOS tokens in the reference file is that IMT systems need to evaluate metrics
        # Note: which count prediction of the </S> token, and evaluation scripts are called on the files
        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_prefix_dataset = TextFile([val_set_prefixes], trg_vocab,
                                      bos_token='<S>',
                                      eos_token=None,
                                      unk_token='<UNK>')
        dev_suffix_dataset = TextFile([val_set_suffixes], trg_vocab,
                                      bos_token=None,
                                      eos_token=None,
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream(),
                            dev_prefix_dataset.get_example_stream(),
                            dev_suffix_dataset.get_example_stream()],
                           ('source', 'target','target_prefix','target_suffix'))

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
Example 17
def get_test_stream(sfiles, svocab_dict): 
	dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')
	stream = Merge([dataset.get_example_stream(),], ('source', ))
	stream = Batch(
        stream, iteration_scheme=ConstantScheme(10))
	stream = Padding(stream)
	return stream
Example 18
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None, src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000, unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream()],
                           ('source', 'target'))

        # now add prefix and suffixes to this stream
        dev_stream = Mapping(dev_stream, PrefixSuffixStreamTransformer(sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
                             add_sources=('target_prefix', 'target_suffix'))

        dev_stream = Mapping(dev_stream, CopySourceAndTargetToMatchPrefixes(dev_stream))

        # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
        dev_stream.produces_examples = False
        # flatten the stream back out into (source, target, target_prefix, target_suffix)
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
Example 19
def get_dev_tr_stream_with_topic_target(val_set_source=None, val_set_target=None,
                                        src_vocab=None, trg_vocab=None,
                                        src_vocab_size=30000, trg_vocab_size=30000,
                                        trg_topic_vocab_size=2000,
                                        source_topic_vocab_size=2000,
                                        topical_dev_set=None,
                                        topic_vocab_input=None,
                                        topic_vocab_output=None,
                                        topical_vocab_size=2000,
                                        unk_id=1, **kwargs):
    """Prepares the training data stream."""

    masked_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        topic_vocab_input = cPickle.load(open(topic_vocab_input, 'rb'))
        # topic_vocab_output already has <UNK> and </S> in it
        topic_vocab_output = cPickle.load(open(topic_vocab_output, 'rb'))
        topic_binary_vocab = {}
        for k, v in topic_vocab_output.items():
            if k == '<UNK>':
                topic_binary_vocab[k] = 0
            else:
                topic_binary_vocab[k] = 1
        # Get text files from both source and target
        src_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)
        src_topic_input = TextFile([topical_dev_set], topic_vocab_input,
                                   None, None, 'rt')
        trg_topic_dataset = TextFile([val_set_target], topic_vocab_output, None)
        trg_topic_binary_dataset = TextFile([val_set_target], topic_binary_vocab,
                                            None)

        # Merge them to get a source, target pair
        dev_stream = Merge([src_dataset.get_example_stream(),
                            trg_dataset.get_example_stream(),
                            src_topic_input.get_example_stream(),
                            trg_topic_dataset.get_example_stream(),
                            trg_topic_binary_dataset.get_example_stream()],
                           ('source', 'target', 'source_topical',
                            'target_topic', 'target_binary_topic'))
        stream = Batch(dev_stream, iteration_scheme=ConstantScheme(1))
        masked_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1,
                     source_topic_vocab_size - 1, trg_topic_vocab_size - 1,
                     trg_topic_vocab_size - 1])

    return masked_stream
Example 20
def get_stream(vocab, data, vocab_size, unk_id, eos_id, bos_id, noise=0):
    vocab = get_vocab(vocab, vocab_size, unk_id, eos_id, bos_id)

    # Maps words to their index in the vocabulary. OOV words are replaced by <UNK> index.
    # Also appends </S> index at the end. No <S> token (TODO: bos_id parameter useless).
    dataset = TextFile([data], vocab, None)

    stream = Mapping(dataset.get_example_stream(), _add_noise(noise))
    stream.dataset = dataset  # for backward-compatibility
    return stream
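
_add_noise is project-specific and not shown; purely as an illustration (not the original implementation), a word-dropout style mapping with the same call shape could look like this:

import random

class _add_noise(object):
    """Illustration only: drop each token with probability `noise`."""
    def __init__(self, noise):
        self.noise = noise

    def __call__(self, example):
        sentence, = example
        if self.noise > 0:
            sentence = [w for w in sentence if random.random() >= self.noise]
        return (sentence,)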
Example 21
def get_dev_stream(sfiles, tfiles, svocab_dict, tvocab_dict):

	s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')
	t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')

	# Merge 
	stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))
	# Batch - Sort 
	stream = Batch(stream, 
		iteration_scheme=ConstantScheme(1006))
	# Pad 
	# Note that </s>=0. Fuel only allows padding 0 by default 
	masked_stream = Padding(stream)

	return masked_stream
Example 22
def get_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
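
A dev stream built this way is a plain DataStream over a TextFile, so every example is a one-element tuple holding a list of word ids that ends in the EOS index (src_vocab_size - 1 after _ensure_special_tokens). A short usage sketch with placeholder paths:

dev_stream = get_dev_stream(val_set='data/newstest.tok.en',   # placeholder path
                            src_vocab='data/vocab.src.pkl',   # placeholder path
                            src_vocab_size=30000)
for (sentence,) in dev_stream.get_epoch_iterator():
    print(len(sentence), sentence[-1])  # sentence length and the EOS id
    break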
Example 23
def test_text():
    # Test word level and epochs.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        sentences1 = f.name
        f.write("This is a sentence\n")
        f.write("This another one")
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        sentences2 = f.name
        f.write("More sentences\n")
        f.write("The last one")
    dictionary = {'<UNK>': 0, '</S>': 1, 'this': 2, 'a': 3, 'one': 4}
    text_data = TextFile(files=[sentences1, sentences2],
                         dictionary=dictionary,
                         bos_token=None,
                         preprocess=lower)
    stream = DataStream(text_data)
    epoch = stream.get_epoch_iterator()
    assert len(list(epoch)) == 4
    epoch = stream.get_epoch_iterator()
    for sentence in zip(range(3), epoch):
        pass
    f = BytesIO()
    cPickle.dump(epoch, f)
    sentence = next(epoch)
    f.seek(0)
    epoch = cPickle.load(f)
    assert next(epoch) == sentence
    assert_raises(StopIteration, next, epoch)

    # Test character level.
    dictionary = dict([(chr(ord('a') + i), i)
                       for i in range(26)] + [(' ', 26)] + [('<S>', 27)] +
                      [('</S>', 28)] + [('<UNK>', 29)])
    text_data = TextFile(files=[sentences1, sentences2],
                         dictionary=dictionary,
                         preprocess=lower,
                         level="character")
    sentence = next(DataStream(text_data).get_epoch_iterator())[0]
    assert sentence[:3] == [27, 19, 7]
    assert sentence[-3:] == [2, 4, 28]
Example 24
def get_test_stream(test_set=None,
                    src_vocab=None,
                    trg_vocab=None,
                    src_vocab_size=200000,
                    trg_vocab_size=6540,
                    unk_id=1,
                    sort_k_batches=12):
    """Prepares the testing data stream."""
    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=src_vocab_size - 1,
                                       unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=trg_vocab_size - 1,
                                       unk_idx=unk_id)
    # Get text files from both source and target
    src_dataset = TextFile([test_set], src_vocab, None)
    trg_dataset = TextFile(['./data/test.zh'], trg_vocab, None)
    # Merge them to get a source, target pair
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream, _oov_to_unk())
    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream, iteration_scheme=ConstantScheme(sort_k_batches))
    # Convert it into a stream again
    stream = Unpack(stream)
    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(1))
    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream,
                                   [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
Example 25
def get_stream(input_file, vocab_file, **kwards):
    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')

    dataset = TextFile(files=[input_file],
                       dictionary=pkl.load(open(vocab_file, 'rb')),
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)

    stream = DataStream(dataset)

    return stream
Example 26
def _get_text_stream(src_data,
                     trg_data,
                     src_vocab_size=30000,
                     trg_vocab_size=30000,
                     **kwargs):
    """Creates a parallel data stream from two text files without 
    random access. This stream cannot be used with reshuffling.
    
    The arguments to this method are given by the configuration dict.
    """

    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    return Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
Example 27
def get_sgnmt_dev_stream(val_set=None,
                         src_vocab=None,
                         src_vocab_size=30000,
                         unk_id=1,
                         **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None:
        src_vocab = add_special_ids(
            {str(i): i
             for i in xrange(src_vocab_size)})
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
Example 28
def _get_sgnmt_dev_stream(val_set=None,
                          src_vocab=None,
                          src_vocab_size=30000,
                          **kwargs):
    """Setup development set stream if necessary.
    
    The arguments to this method are given by the configuration dict.
    """
    dev_stream = None
    if val_set is not None:
        src_vocab = _add_special_ids({str(i) : i 
                                        for i in xrange(src_vocab_size)})
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
Example 29
def get_test_stream(src_vocab,
                    trg_vocab,
                    src_data,
                    trg_data=None,
                    src_vocab_size=30000,
                    unk_id=1,
                    seq_len=50,
                    batch_size=80,
                    sort_k_batches=12,
                    **kwargs):
    """Prepares the test data stream (=no batches or gold labels)."""

    print('streaming...')
    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab)),
                                       bos_idx=0,
                                       eos_idx=2,
                                       unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else cPickle.load(open(trg_vocab)),
                                       bos_idx=0,
                                       eos_idx=2,
                                       unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, preprocess=get_unicode)
    print(src_data)
    #exit()
    trg_dataset = TextFile([trg_data], trg_vocab, preprocess=get_unicode)

    #stream=DataStream(src_dataset)
    stream = Merge([DataStream(src_dataset),
                    DataStream(trg_dataset)], ('source', 'target'))

    return stream
Example 30
def get_tst_stream(val_set=None,
                   src_vocab=None,
                   src_vocab_size=30000,
                   unk_id=1,
                   **kwargs):

    tst_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries and ensure special tokens exist
        src_vocab = ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab)),
                                          bos_idx=0,
                                          eos_idx=src_vocab_size - 1,
                                          unk_idx=unk_id)

        tst_dataset = TextFile([val_set], src_vocab, None)
        tst_stream = DataStream(tst_dataset)
    return tst_stream