def prepare_training_data(src_files, src_files_encoding, trg_files, trg_files_encoding, src_output_file, trg_output_file):
    '''For each pair of source/target files, check that they contain the same
    number of sentences, shuffle the sentence pairs jointly, and write them
    out with UTF-8 encoding.

    :param src_files: iterable of source-side file paths
    :param src_files_encoding: encoding used to read the source files
    :param trg_files: iterable of target-side file paths
    :param trg_files_encoding: encoding used to read the target files
    :param src_output_file: path for the shuffled UTF-8 source output
    :param trg_output_file: path for the shuffled UTF-8 target output
    '''
    src = chain(*[iter_(open_(f, encoding=src_files_encoding)) for f in src_files])
    trg = chain(*[iter_(open_(f, encoding=trg_files_encoding)) for f in trg_files])
    # TODO: find a way not to load all sentences into memory
    logger.info("reading sentences from source files...")
    src_sentences = list(src)
    logger.info("reading sentences from target files...")
    trg_sentences = list(trg)
    assert len(src_sentences) == len(trg_sentences)
    # lazy %-style args: formatting is skipped when INFO is disabled
    logger.info("number of sentences:%d", len(src_sentences))
    # sanity check: the trailing '\n' must still be present so that writing
    # the sentences back preserves the one-sentence-per-line layout
    # (the original comment claimed the opposite, which contradicted the assert)
    assert src_sentences[0].endswith('\n')
    # shuffle one index permutation and apply it to both sides,
    # so source/target pairs stay aligned
    ids = list(range(len(src_sentences)))
    random.shuffle(ids)
    with codecs.open(src_output_file, 'w', 'UTF-8') as f_src:
        with codecs.open(trg_output_file, 'w', 'UTF-8') as f_trg:
            for i in ids:
                f_src.write(src_sentences[i])
                f_trg.write(trg_sentences[i])
def test_dict_iterator():
    """iter_ must agree with the builtin iter over a dict and each of its
    views, and pickling is verified for all four via verify_pickle."""
    d = {'a': 'b', 1: 2}
    for view in (d, d.items(), d.keys(), d.values()):
        assert list(iter_(view)) == list(iter(view))
    yield verify_pickle, iter_, iter, 2, 1, d
    yield verify_pickle, iter_, iter, 2, 1, d.items()
    yield verify_pickle, iter_, iter, 2, 1, d.values()
    yield verify_pickle, iter_, iter, 2, 1, d.keys()
def get_request_iterator(self):
    """Return an iterator over the names of the regular files directly
    under ``self.path``, shuffled first when ``self.shuffle`` is set."""
    names = [name for name in os.listdir(self.path)
             if os.path.isfile(os.path.join(self.path, name))]
    if self.shuffle:
        random.shuffle(names)
    return iter_(names)
def get_request_iterator(self):
    """Return an iterator over (row-batch, column-batch) index pairs.

    Row indices (``nitems``) and column indices (``nfeats``) are optionally
    shuffled independently, grouped into batches of ``ibatchsize`` /
    ``jbatchsize``, and every row batch is paired with every column batch.
    """
    # materialize as lists: random.shuffle requires a mutable sequence
    # (shuffling a bare range() raises TypeError on Python 3)
    i = list(range(self.nitems))
    j = list(range(self.nfeats))
    if self.irandom:
        random.shuffle(i)
    if self.jrandom:
        random.shuffle(j)
    ib = batch(i, self.ibatchsize)
    jb = batch(j, self.jbatchsize)
    return iter_([(ii, jj) for ii in ib for jj in jb])
def get_request_iterator(self):
    """Merge SQuAD requests with a ratio-limited sample of CNN requests.

    The CNN refs are shuffled, at most ``cnn_ratio * len(squad refs)`` of
    them are appended to the SQuAD refs, and the combined list is
    optionally reshuffled before being returned as an iterator.
    """
    self.refs = list(self.squad_iterator.get_request_iterator())
    cnn_refs = list(self.cnn_iterator.get_request_iterator())
    random.shuffle(cnn_refs)
    cnn_to_add = int(self.cnn_ratio * len(self.refs))
    if cnn_to_add > len(cnn_refs):
        # fixed typo in the message ("To many" -> "Too many");
        # single-arg print(...) behaves the same on Python 2 and 3
        print("Too many CNN data points requested")
        cnn_to_add = len(cnn_refs)
    self.refs += cnn_refs[:cnn_to_add]
    if self.shuffle:
        print("Shuffling CNN and SQuAD (should occur every epoch)")
        random.shuffle(self.refs)
    return iter_(self.refs)
def get_request_iterator(self):
    """Return an iterator over trip ids selected around random time cuts.

    When ``self.use_cuts`` is None, ``num_cuts`` random cut times are
    drawn; for each cut, trips whose window satisfies
    begin in [cut - 40000, cut] and end >= cut are read from the
    ``trip_times`` table, and the combined list is shuffled.
    """
    cuts = self.use_cuts
    if cuts is None:  # identity check; '== None' is unidiomatic
        # NOTE(review): first_time/last_time appear to be module globals
        cuts = [random.randrange(first_time, last_time)
                for _ in range(self.num_cuts)]
    trips = []
    with sqlite3.connect(self.dbfile) as db:
        c = db.cursor()
        for cut in cuts:
            part = [i for (i,) in c.execute(
                'SELECT trip FROM trip_times WHERE begin >= ? AND begin <= ? AND end >= ?',
                (cut - 40000, cut, cut))]
            # extend in place instead of the quadratic `l = l + part` copy
            trips.extend(part)
    random.shuffle(trips)
    return iter_(trips)
def get_request_iterator(self):
    """Return an iterator over trip ids selected around random time cuts.

    When ``self.use_cuts`` is None, ``num_cuts`` random cut times are
    drawn; for each cut, trips whose window satisfies
    begin in [cut - 40000, cut] and end >= cut are read from the
    ``trip_times`` table, and the combined list is shuffled.
    """
    cuts = self.use_cuts
    if cuts is None:  # identity check; '== None' is unidiomatic
        # NOTE(review): first_time/last_time appear to be module globals
        cuts = [random.randrange(first_time, last_time)
                for _ in range(self.num_cuts)]
    trips = []
    with sqlite3.connect(self.dbfile) as db:
        c = db.cursor()
        for cut in cuts:
            part = [i for (i,) in c.execute(
                'SELECT trip FROM trip_times WHERE begin >= ? AND begin <= ? AND end >= ?',
                (cut - 40000, cut, cut))]
            # extend in place instead of the quadratic `l = l + part` copy
            trips.extend(part)
    random.shuffle(trips)
    return iter_(trips)
def open(self):
    """Open every file in ``self.files`` as latin1 text and return one
    chained iterator over all of their lines."""
    streams = [iter_(codecs.open(name, encoding="latin1"))
               for name in self.files]
    return chain(*streams)
def get_request_iterator(self):
    """Return a picklable iterator over the preset ``self.indices``."""
    requests = self.indices
    return iter_(requests)
def open(self):
    """Open every configured file with ``self.encoding`` and chain their
    line iterators into a single stream."""
    handles = [open_(name, encoding=self.encoding) for name in self.files]
    return chain(*[iter_(h) for h in handles])
def open(self):
    """Chain the line iterators of all files in ``self.files``."""
    streams = [iter_(open(name)) for name in self.files]
    return chain(*streams)
def test_file_iterator_pickling():
    """A pickled-and-unpickled file iterator resumes at the next line."""
    tmp = _create_test_file()
    file_iter = iter_(open(tmp.name))
    # consume two lines, remembering the last one read
    second = None
    for _ in range(2):
        second = next(file_iter)
    resumed = cPickle.loads(cPickle.dumps(file_iter))
    assert int(next(resumed)) == int(second) + 1
def open(self):
    """Return one chained iterator over the lines of every file,
    each opened as latin1 text."""
    per_file = []
    for name in self.files:
        per_file.append(iter_(codecs.open(name, encoding="latin1")))
    return chain(*per_file)
def tee(iterable, n=2):
    """tee(iterable, n=2) --> tuple of n independent iterators."""
    manager = tee_manager(iter_(iterable), n=n)
    return manager.iterators()
def open(self):
    """Chain all files into one line stream and wrap it in a fresh
    bAbIState (empty list, counter 0)."""
    streams = [iter_(open(name)) for name in self.files]
    return bAbIState(chain(*streams), [], 0)
def __init__(self, iterable, n=2):
    """Wrap *iterable* in a picklable iterator and create one buffer
    deque per consumer."""
    self._deques = tuple(collections.deque() for _ in range(n))
    self._iterable = iter_(iterable)
def get_request_iterator(self):
    """Iterate over example indices 0 .. num_examples - 1."""
    count = self.num_examples
    return iter_(xrange(count))
def get_request_iterator(self):
    """Produce ``num_seqs`` random (start, seq_len) windows that fit
    inside ``item_range``."""
    upper = self.item_range - self.seq_len + 1
    windows = [(random.randrange(0, upper), self.seq_len)
               for _ in xrange(self.num_seqs)]
    return iter_(windows)
def open(self):
    """Open every file in binary mode; return the chained line iterator
    together with the raw handles (so the caller can close them)."""
    handlers = []
    for name in self.files:
        handlers.append(open(name, "rb"))
    chained = chain(*[iter_(h) for h in handlers])
    return chained, handlers
def get_request_iterator(self):
    """Return an iterator over a freshly shuffled copy of ``self.indices``
    (the stored sequence itself is left untouched)."""
    shuffled = list(self.indices)
    self.rng.shuffle(shuffled)
    return iter_(shuffled)
def get_request_iterator(self):
    """Return an iterator of shuffled index batches.

    Yields ``num_batches`` lists of ``batch_size`` example indices each,
    drawn without replacement from a shuffled range of ``num_examples``.
    """
    indices = list(range(self.num_examples))
    self.rng.shuffle(indices)
    # A single shared iterator over `indices` is repeated num_batches times;
    # each islice(shared, batch_size) call consumes the next batch_size
    # elements from that one iterator, so consecutive batches are disjoint
    # slices of the shuffled list. The outer imap(list, ...) materializes
    # each islice into a list as it is requested.
    return imap(list, imap(
        islice, repeat(iter_(indices), self.num_batches),
        repeat(self.batch_size, self.num_batches)))
def get_request_iterator(self):
    """Return an iterator over ``self.reference``; when ``self.shuffle``
    is set, the stored list is reshuffled in place first, so repeated
    calls see a new order."""
    refs = self.reference
    if self.shuffle:
        random.shuffle(refs)
    return iter_(refs)
def open(self):
    """Zip the per-channel iterables into one iterator of tuples."""
    return izip(*(iter_(channel) for channel in self.iterables))
def get_request_iterator(self):
    """Return an iterator over the two fixed requests 0 and 1."""
    return iter_([0, 1])
def get_request_iterator(self):
    """List the regular files directly under ``self.path`` and iterate
    over their names, shuffled when requested."""
    names = [entry for entry in os.listdir(self.path)
             if os.path.isfile(os.path.join(self.path, entry))]
    if self.shuffle:
        random.shuffle(names)
    return iter_(names)
def __init__(self, n_list, seq):
    """Store picklable iterators over the chunk sizes and the sequence."""
    self._seq = iter_(seq)
    self._n_list = iter_(n_list)
def open(self):
    """Return a picklable iterator over the stored piano rolls."""
    rolls = self.piano_rolls
    return iter_(rolls)
def get_request_iterator(self):
    """Shuffle the full range of example indices with ``self.rng`` and
    return an iterator over it."""
    order = list(range(self.num_examples))
    self.rng.shuffle(order)
    return iter_(order)
def get_request_iterator(self):
    """Take the first ``self.times`` requests from the wrapped scheme's
    iterator and return an iterator over them.

    StopIteration propagates from the underlying iterator if the wrapped
    scheme yields fewer than ``times`` requests.
    """
    it = self.iteration_scheme.get_request_iterator()
    # next(it) instead of it.next(): works on both Python 2 and 3;
    # '_' marks the unused loop variable
    return iter_([next(it) for _ in range(self.times)])
def open(self):
    """Open each file with the configured encoding and chain the
    resulting line iterators."""
    streams = []
    for name in self.files:
        streams.append(iter_(open_(name, encoding=self.encoding)))
    return chain(*streams)
def open(self):
    """Return a picklable iterator over ``self.files``."""
    file_list = self.files
    return iter_(file_list)