Ejemplo n.º 1
0
def prepare_training_data(src_files, src_files_encoding, trg_files,
                          trg_files_encoding, src_output_file,
                          trg_output_file):
    """Shuffle a parallel corpus and save both sides as UTF-8.

    Every line of ``src_files`` (decoded with ``src_files_encoding``) and
    every line of ``trg_files`` (decoded with ``trg_files_encoding``) is read
    into memory.  The two sides must contain the same total number of lines;
    note that only the totals are compared, not the per-file counts.  One
    shared random permutation is then applied to both sides and the lines
    are written to ``src_output_file`` and ``trg_output_file``.

    An empty corpus is accepted and produces two empty output files.

    Raises:
        AssertionError: if the two sides differ in their total line count,
            or if the first source line does not end with a newline.
    """
    src = chain(
        *[iter_(open_(f, encoding=src_files_encoding)) for f in src_files])
    trg = chain(
        *[iter_(open_(f, encoding=trg_files_encoding)) for f in trg_files])

    # TODO: find a way not to load all sentences into memory
    logger.info("reading sentences from source files...")
    src_sentences = list(src)
    logger.info("reading sentences from target files...")
    trg_sentences = list(trg)

    assert len(src_sentences) == len(trg_sentences)
    logger.info("number of sentences:%d" % len(src_sentences))

    # Lines are written back verbatim below, so they must still carry their
    # trailing '\n'; otherwise shuffled sentences would be glued together.
    # Only checked when there is at least one sentence: indexing [0] on an
    # empty corpus would raise IndexError.
    if src_sentences:
        assert src_sentences[0].endswith('\n')

    # Apply the same permutation to both sides to keep the pairs aligned.
    ids = list(range(len(src_sentences)))
    random.shuffle(ids)

    with codecs.open(src_output_file, 'w', 'UTF-8') as f_src, \
            codecs.open(trg_output_file, 'w', 'UTF-8') as f_trg:
        for i in ids:
            f_src.write(src_sentences[i])
            f_trg.write(trg_sentences[i])
Ejemplo n.º 2
0
def test_dict_iterator():
    """Compare ``iter_`` with builtin ``iter`` on a dict and its views.

    After the direct comparisons, yields one ``verify_pickle`` call spec
    (callable followed by its arguments) per iterable.
    """
    d = {'a': 'b', 1: 2}
    for iterable in (d, d.items(), d.keys(), d.values()):
        assert list(iter_(iterable)) == list(iter(iterable))

    # The pickle checks run in a different order than the asserts above.
    for iterable in (d, d.items(), d.values(), d.keys()):
        yield verify_pickle, iter_, iter, 2, 1, iterable
Ejemplo n.º 3
0
def test_dict_iterator():
    """Check that ``iter_`` yields the same items as ``iter`` for dict inputs.

    Covers the dict itself plus its items/keys/values, then yields a
    ``verify_pickle`` call spec for each of those iterables.
    """
    sample = {'a': 'b', 1: 2}

    def same_items(iterable):
        # True when iter_ and the builtin iter agree element for element.
        return list(iter_(iterable)) == list(iter(iterable))

    assert same_items(sample)
    assert same_items(sample.items())
    assert same_items(sample.keys())
    assert same_items(sample.values())

    pickle_args = (iter_, iter, 2, 1)
    yield (verify_pickle,) + pickle_args + (sample,)
    yield (verify_pickle,) + pickle_args + (sample.items(),)
    yield (verify_pickle,) + pickle_args + (sample.values(),)
    yield (verify_pickle,) + pickle_args + (sample.keys(),)
Ejemplo n.º 4
0
 def get_request_iterator(self):
     """Return an iterator over the names of regular files in ``self.path``.

     The list of names is shuffled first when ``self.shuffle`` is truthy.
     """
     names = []
     for entry in os.listdir(self.path):
         # Keep plain files only; sub-directories and the like are skipped.
         if os.path.isfile(os.path.join(self.path, entry)):
             names.append(entry)
     if self.shuffle:
         random.shuffle(names)
     return iter_(names)
Ejemplo n.º 5
0
    def get_request_iterator(self):
        """Return an iterator over every (item batch, feature batch) pair.

        Item indices ``0 .. self.nitems - 1`` and feature indices
        ``0 .. self.nfeats - 1`` are optionally shuffled (``self.irandom`` /
        ``self.jrandom``), split with ``batch`` into chunks of
        ``self.ibatchsize`` / ``self.jbatchsize``, and every combination of
        an item chunk with a feature chunk is emitted, item chunks varying
        slowest.
        """
        # random.shuffle needs a mutable sequence.  On Python 3 range() is
        # immutable and shuffling it raises TypeError, so build real lists.
        i = list(range(self.nitems))
        j = list(range(self.nfeats))

        if self.irandom:
            random.shuffle(i)
        if self.jrandom:
            random.shuffle(j)

        ib = batch(i, self.ibatchsize)
        # NOTE(review): jb is re-iterated once per item batch below, so
        # ``batch`` must return a re-iterable object, not a one-shot
        # generator. Confirm against its definition.
        jb = batch(j, self.jbatchsize)

        return iter_([(ii, jj) for ii in ib for jj in jb])
Ejemplo n.º 6
0
    def get_request_iterator(self):
        """Return an iterator over every (item batch, feature batch) pair.

        Item indices ``0 .. self.nitems - 1`` and feature indices
        ``0 .. self.nfeats - 1`` are optionally shuffled (``self.irandom`` /
        ``self.jrandom``), split with ``batch`` into chunks of
        ``self.ibatchsize`` / ``self.jbatchsize``, and every combination of
        an item chunk with a feature chunk is emitted, item chunks varying
        slowest.
        """
        # random.shuffle needs a mutable sequence.  On Python 3 range() is
        # immutable and shuffling it raises TypeError, so build real lists.
        i = list(range(self.nitems))
        j = list(range(self.nfeats))

        if self.irandom:
            random.shuffle(i)
        if self.jrandom:
            random.shuffle(j)

        ib = batch(i, self.ibatchsize)
        # NOTE(review): jb is re-iterated once per item batch below, so
        # ``batch`` must return a re-iterable object, not a one-shot
        # generator. Confirm against its definition.
        jb = batch(j, self.jbatchsize)

        return iter_([(ii, jj) for ii in ib for jj in jb])
    def get_request_iterator(self):
        """Return an iterator over SQuAD requests mixed with sampled CNN ones.

        ``self.refs`` is rebuilt on every call.  It starts as all requests
        from ``self.squad_iterator``; then the CNN requests from
        ``self.cnn_iterator`` are shuffled and the first
        ``int(self.cnn_ratio * len(self.refs))`` of them are appended (capped
        at the number available, with a warning printed).  When
        ``self.shuffle`` is truthy the combined list is shuffled as well.
        """
        self.refs = list(self.squad_iterator.get_request_iterator())
        cnn_refs = list(self.cnn_iterator.get_request_iterator())
        random.shuffle(cnn_refs)
        cnn_to_add = int(self.cnn_ratio * len(self.refs))
        if cnn_to_add > len(cnn_refs):
            # print(...) with a single argument prints the same text under
            # the Python 2 print statement and the Python 3 print function.
            print("Too many CNN data points requested")
            cnn_to_add = len(cnn_refs)

        self.refs += cnn_refs[:cnn_to_add]

        if self.shuffle:
            print("Shuffling CNN and SQuAD (should occur every epoch)")
            random.shuffle(self.refs)
        return iter_(self.refs)
Ejemplo n.º 8
0
    def get_request_iterator(self):
        """Return an iterator over shuffled trip ids selected by time cuts.

        Uses ``self.use_cuts`` when it is not ``None``; otherwise draws
        ``self.num_cuts`` random cuts from ``[first_time, last_time)``.  For
        each cut, selects from the ``trip_times`` table of ``self.dbfile``
        the trips with ``cut - 40000 <= begin <= cut`` and ``end >= cut``.
        The ids collected over all cuts are shuffled together; a trip
        matching several cuts appears once per match.
        """
        cuts = self.use_cuts
        # Identity test: ``== None`` would dispatch to a custom __eq__ (for
        # example an element-wise comparison on array-like cuts).
        if cuts is None:
            cuts = [random.randrange(first_time, last_time)
                    for _ in range(self.num_cuts)]

        l = []
        # ``with sqlite3.connect(...)`` only scopes a transaction and leaves
        # the connection open, so close it explicitly instead.
        db = sqlite3.connect(self.dbfile)
        try:
            c = db.cursor()
            for cut in cuts:
                rows = c.execute(
                    'SELECT trip FROM trip_times WHERE begin >= ? AND begin <= ? AND end >= ?',
                    (cut - 40000, cut, cut))
                # extend() appends in place; ``l = l + part`` copied the
                # whole accumulated list on every cut.
                l.extend(trip for (trip,) in rows)
        finally:
            db.close()
        random.shuffle(l)

        return iter_(l)
Ejemplo n.º 9
0
 def get_request_iterator(self):
     """Return an iterator over shuffled trip ids selected by time cuts.

     Uses ``self.use_cuts`` when it is not ``None``; otherwise draws
     ``self.num_cuts`` random cuts from ``[first_time, last_time)``.  For
     each cut, selects from the ``trip_times`` table of ``self.dbfile`` the
     trips with ``cut - 40000 <= begin <= cut`` and ``end >= cut``.  The ids
     collected over all cuts are shuffled together; a trip matching several
     cuts appears once per match.
     """
     cuts = self.use_cuts
     # Identity test: ``== None`` would dispatch to a custom __eq__ (for
     # example an element-wise comparison on array-like cuts).
     if cuts is None:
         cuts = [
             random.randrange(first_time, last_time)
             for _ in range(self.num_cuts)
         ]
     l = []
     # ``with sqlite3.connect(...)`` only scopes a transaction and leaves
     # the connection open, so close it explicitly instead.
     db = sqlite3.connect(self.dbfile)
     try:
         c = db.cursor()
         for cut in cuts:
             rows = c.execute(
                 'SELECT trip FROM trip_times WHERE begin >= ? AND begin <= ? AND end >= ?',
                 (cut - 40000, cut, cut))
             # extend() appends in place; ``l = l + part`` copied the whole
             # accumulated list on every cut.
             l.extend(trip for (trip, ) in rows)
     finally:
         db.close()
     random.shuffle(l)
     return iter_(l)
 def open(self):
     """Return one chained iterator over all of ``self.files``.

     Each file is opened with ``codecs.open`` using the latin1 encoding and
     wrapped with ``iter_`` before chaining.
     """
     per_file = []
     for path in self.files:
         per_file.append(iter_(codecs.open(path, encoding="latin1")))
     return chain(*per_file)
Ejemplo n.º 11
0
 def get_request_iterator(self):
     """Return ``self.indices`` wrapped by ``iter_``."""
     indices = self.indices
     return iter_(indices)
Ejemplo n.º 12
0
Archivo: text.py Proyecto: Afrik/fuel
 def open(self):
     """Return one chained iterator over all of ``self.files``.

     Each file is opened with ``open_`` using ``self.encoding`` and wrapped
     with ``iter_`` before chaining.
     """
     per_file = []
     for path in self.files:
         per_file.append(iter_(open_(path, encoding=self.encoding)))
     return chain(*per_file)
Ejemplo n.º 13
0
 def open(self):
     """Open every path in ``self.files`` and chain their ``iter_`` iterators."""
     per_file = []
     for path in self.files:
         per_file.append(iter_(open(path)))
     return chain(*per_file)
Ejemplo n.º 14
0
def test_file_iterator_pickling():
    """A pickled and restored file iterator resumes after the last line read.

    Two lines are consumed, the iterator goes through a cPickle round trip,
    and the next line from the copy must be the integer successor of the
    second line consumed.
    """
    test_file = _create_test_file()
    lines = iter_(open(test_file.name))
    next(lines)
    last = next(lines)
    restored = cPickle.loads(cPickle.dumps(lines))
    first = next(restored)
    assert int(first) == int(last) + 1
Ejemplo n.º 15
0
 def open(self):
     """Chain latin1-decoded ``iter_`` iterators over each of ``self.files``."""
     wrapped = [
         iter_(codecs.open(path, encoding="latin1"))
         for path in self.files
     ]
     return chain(*wrapped)
Ejemplo n.º 16
0
def tee(iterable, n=2):
    """tee(iterable, n=2) --> tuple of n independent iterators.

    Wraps ``iterable`` with ``iter_`` and returns the result of calling
    ``iterators()`` on a ``tee_manager`` built from it.
    """
    source = iter_(iterable)
    manager = tee_manager(source, n=n)
    return manager.iterators()
Ejemplo n.º 17
0
 def open(self):
     """Return a single iterator chaining ``iter_(open(f))`` for each file."""
     file_iterators = [iter_(open(path)) for path in self.files]
     return chain(*file_iterators)
Ejemplo n.º 18
0
 def open(self):
     """Return a ``bAbIState`` built over the chained file iterators.

     The state is constructed from the chain of ``iter_(open(f))`` for each
     of ``self.files``, an empty list, and ``0``.
     """
     file_iterators = [iter_(open(path)) for path in self.files]
     lines = chain(*file_iterators)
     return bAbIState(lines, [], 0)
Ejemplo n.º 19
0
 def __init__(self, iterable, n=2):
     """Wrap ``iterable`` with ``iter_`` and create ``n`` empty deques."""
     self._iterable = iter_(iterable)
     buffers = [collections.deque() for _ in range(n)]
     self._deques = tuple(buffers)
Ejemplo n.º 20
0
 def get_request_iterator(self):
     """Return an ``iter_`` iterator over ``0 .. self.num_examples - 1``."""
     example_ids = xrange(self.num_examples)
     return iter_(example_ids)
Ejemplo n.º 21
0
 def get_request_iterator(self):
     """Return an iterator over ``self.num_seqs`` random requests.

     Each request is a pair ``(offset, self.seq_len)`` where ``offset`` is
     drawn with ``random.randrange`` from
     ``0 .. self.item_range - self.seq_len`` inclusive.
     """
     requests = []
     for _ in xrange(self.num_seqs):
         offset = random.randrange(0, self.item_range - self.seq_len + 1)
         requests.append((offset, self.seq_len))
     return iter_(requests)
Ejemplo n.º 22
0
 def open(self):
     """Open ``self.files`` in binary mode.

     Returns a pair: the chained ``iter_`` iterators over the open files,
     and the list of file handles themselves.
     """
     handlers = []
     for path in self.files:
         handlers.append(open(path, "rb"))
     wrapped = [iter_(handle) for handle in handlers]
     return chain(*wrapped), handlers
Ejemplo n.º 23
0
 def get_request_iterator(self):
     """Return an iterator over a shuffled copy of ``self.indices``.

     The copy is shuffled with ``self.rng``; ``self.indices`` itself is
     left untouched.
     """
     shuffled = list(self.indices)
     self.rng.shuffle(shuffled)
     return iter_(shuffled)
Ejemplo n.º 24
0
 def get_request_iterator(self):
     """Return an iterator of shuffled index batches.

     Indices ``0 .. self.num_examples - 1`` are shuffled with ``self.rng``
     and handed out as ``self.num_batches`` lists of at most
     ``self.batch_size`` indices each.
     """
     order = list(range(self.num_examples))
     self.rng.shuffle(order)
     # The same iterator object is repeated, so each islice continues
     # where the previous one stopped.
     shared = repeat(iter_(order), self.num_batches)
     sizes = repeat(self.batch_size, self.num_batches)
     slices = imap(islice, shared, sizes)
     return imap(list, slices)
 def get_request_iterator(self):
     """Return an iterator over ``self.reference``.

     When ``self.shuffle`` is truthy, ``self.reference`` is shuffled in
     place first.
     """
     if not self.shuffle:
         return iter_(self.reference)
     random.shuffle(self.reference)
     return iter_(self.reference)
Ejemplo n.º 26
0
 def open(self):
     """Zip together ``iter_`` iterators over each of ``self.iterables``."""
     wrapped = []
     for channel in self.iterables:
         wrapped.append(iter_(channel))
     return izip(*wrapped)
 def get_request_iterator(self):
     """Return an ``iter_`` iterator over the two requests ``0`` and ``1``."""
     requests = [0, 1]
     return iter_(requests)
Ejemplo n.º 28
0
 def get_request_iterator(self):
     """Return an iterator over the regular files found in ``self.path``.

     Directory entries that are not plain files are dropped; the remaining
     names are shuffled when ``self.shuffle`` is truthy.
     """
     file_names = []
     for name in os.listdir(self.path):
         full_path = os.path.join(self.path, name)
         if not os.path.isfile(full_path):
             continue
         file_names.append(name)
     if self.shuffle:
         random.shuffle(file_names)
     return iter_(file_names)
Ejemplo n.º 29
0
 def __init__(self, n_list, seq):
     """Store ``iter_`` iterators over ``n_list`` and ``seq``."""
     counts = iter_(n_list)
     self._n_list = counts
     items = iter_(seq)
     self._seq = items
Ejemplo n.º 30
0
 def open(self):
     """Return ``self.piano_rolls`` wrapped by ``iter_``."""
     rolls = self.piano_rolls
     return iter_(rolls)
Ejemplo n.º 31
0
 def open(self):
     """Return ``izip`` over one ``iter_`` iterator per ``self.iterables`` entry."""
     return izip(*[iter_(channel) for channel in self.iterables])
Ejemplo n.º 32
0
 def get_request_iterator(self):
     """Return an iterator over ``0 .. self.num_examples - 1`` in random order.

     The order is produced by ``self.rng.shuffle``.
     """
     order = list(range(self.num_examples))
     self.rng.shuffle(order)
     return iter_(order)
Ejemplo n.º 33
0
 def get_request_iterator(self):
     """Return an iterator over the first ``self.times`` wrapped requests.

     A fresh request iterator is taken from ``self.iteration_scheme`` and
     exactly ``self.times`` requests are pulled from it up front.
     ``StopIteration`` propagates if the wrapped scheme yields fewer.
     """
     it = self.iteration_scheme.get_request_iterator()
     # Builtin next() works on Python 2 and 3 iterators alike; the
     # ``.next()`` method exists only on Python 2.
     return iter_([next(it) for _ in range(self.times)])
Ejemplo n.º 34
0
def test_file_iterator_pickling():
    """Check that a file iterator keeps its position across pickling.

    After two lines are read, the iterator is dumped and loaded with
    cPickle; the copy's next line must equal the second line read plus one
    when both are parsed as integers.
    """
    handle = _create_test_file()
    reader = iter_(open(handle.name))
    consumed = [next(reader), next(reader)]
    last = consumed[1]
    clone = cPickle.loads(cPickle.dumps(reader))
    first = next(clone)
    assert int(first) == int(last) + 1
Ejemplo n.º 35
0
 def open(self):
     """Chain ``iter_`` iterators over ``self.files`` opened with ``self.encoding``."""
     wrapped = [
         iter_(open_(path, encoding=self.encoding))
         for path in self.files
     ]
     return chain(*wrapped)
Ejemplo n.º 36
0
 def open(self):
     """Return ``self.files`` wrapped by ``iter_``."""
     files = self.files
     return iter_(files)