def prepare_training_data(src_files, src_files_encoding,
                          trg_files, trg_files_encoding,
                          src_output_file, trg_output_file):
    """For each pair of source/target files, check that they contain the
    same number of sentences, then shuffle the sentence pairs and save them
    with UTF-8 encoding.
    """
    src = chain(*[iter_(open_(f, encoding=src_files_encoding))
                  for f in src_files])
    trg = chain(*[iter_(open_(f, encoding=trg_files_encoding))
                  for f in trg_files])
    # TODO: find a way not to load all sentences into memory
    logger.info("reading sentences from source files...")
    src_sentences = list(src)
    logger.info("reading sentences from target files...")
    trg_sentences = list(trg)
    assert len(src_sentences) == len(trg_sentences)
    logger.info("number of sentences: %d", len(src_sentences))
    # make sure the trailing '\n' has not been stripped from the sentences
    assert src_sentences[0].endswith('\n')
    # shuffle source and target with the same permutation to keep pairs aligned
    ids = list(range(len(src_sentences)))
    random.shuffle(ids)
    with codecs.open(src_output_file, 'w', 'UTF-8') as f_src:
        with codecs.open(trg_output_file, 'w', 'UTF-8') as f_trg:
            for i in ids:
                f_src.write(src_sentences[i])
                f_trg.write(trg_sentences[i])
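A minimal usage sketch for the function above; the corpus file names are hypothetical, and the helpers it relies on (open_, iter_, logger) are assumed to be defined in the surrounding module:

# Hypothetical parallel corpus split across two file pairs; the source side
# is latin1-encoded and the target side is UTF-8-encoded.
prepare_training_data(
    src_files=['corpus.part1.de', 'corpus.part2.de'],
    src_files_encoding='latin1',
    trg_files=['corpus.part1.en', 'corpus.part2.en'],
    trg_files_encoding='utf-8',
    src_output_file='train.shuf.de',
    trg_output_file='train.shuf.en')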
def get_request_iterator(self):
    # fixed number of requests, each asking for a full batch
    if self.times:
        return repeat(self.batch_size, self.times)
    # cover num_examples exactly: d full batches plus a final partial batch
    if self.num_examples:
        d, r = divmod(self.num_examples, self.batch_size)
        return chain(repeat(self.batch_size, d), [r] if r else [])
    # otherwise, request full batches indefinitely
    return repeat(self.batch_size)
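The divmod branch above emits d full batches followed by one partial batch when num_examples is not a multiple of batch_size. A self-contained sketch of that logic (the function name batch_sizes is made up for illustration):

from itertools import chain, repeat

def batch_sizes(num_examples, batch_size):
    # d full batches, plus a final partial batch of r examples if any remain
    d, r = divmod(num_examples, batch_size)
    return chain(repeat(batch_size, d), [r] if r else [])

print(list(batch_sizes(23, 5)))  # [5, 5, 5, 5, 3]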
def get_request_iterator(self):
    # build one index-batch iterator per size bucket, then chain them so
    # requests are emitted bucket by bucket
    iterator_list = []
    start = 0
    for size, examples in self.size_dict.items():
        iterator_list.append(
            partition_all(self.batch_size, xrange(start, start + examples)))
        start += examples
    return chain(*iterator_list)
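A quick demonstration of the chunking primitive used above; partition_all is assumed here to come from toolz (picklable_itertools ships an equivalent helper):

from toolz import partition_all

# partition_all yields tuples of at most batch_size consecutive items;
# only the last tuple may be shorter.
print(list(partition_all(3, range(8))))  # [(0, 1, 2), (3, 4, 5), (6, 7)]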
def __init__(self, data_streams, sources):
    self.data_streams = data_streams
    if len(list(chain(*[data_stream.sources
                        for data_stream in data_streams]))) != len(sources):
        raise ValueError("wrong number of sources given")
    self.sources = sources
def __init__(self, data_streams, sources, axis_labels=None):
    super(Merge, self).__init__(iteration_scheme=None,
                                axis_labels=axis_labels)
    if not all(data_stream.produces_examples ==
               data_streams[0].produces_examples
               for data_stream in data_streams):
        raise ValueError("all data streams must produce the same type of "
                         "output (batches or examples)")
    self.data_streams = data_streams
    self.produces_examples = self.data_streams[0].produces_examples
    if len(list(chain(*[data_stream.sources
                        for data_stream in data_streams]))) != len(sources):
        raise ValueError("wrong number of sources given")
    self.sources = sources
def wrapper_func(*args, **kwargs):
    mock_requests = args[-1]
    args = args[:-1]
    length = len(mock_content)
    mock_response = mock.Mock()
    # emulate requests' Response.iter_content: yield chunks of size s,
    # followed by whatever remains at the tail
    mock_response.iter_content = mock.Mock(side_effect=lambda s: chain(
        (mock_content[s * i:s * (i + 1)] for i in range(length // s)),
        (mock_content[(length // s) * s:],)))
    mock_response.headers = {}
    if content_length:
        mock_response.headers['content-length'] = length
    if content_disposition:
        cd = 'attachment; filename={}'.format(mock_filename)
        mock_response.headers['Content-Disposition'] = cd
    mock_requests.get.return_value = mock_response
    return func(*args, **kwargs)
def cross_validation(scheme_class, num_examples, num_folds, strict=True,
                     **kwargs):
    """Return pairs of schemes to be used for cross-validation.

    Parameters
    ----------
    scheme_class : subclass of :class:`IndexScheme` or :class:`BatchScheme`
        The type of the returned schemes. The constructor is called with an
        iterator and `**kwargs` as arguments.
    num_examples : int
        The number of examples in the datastream.
    num_folds : int
        The number of folds to return.
    strict : bool, optional
        If `True`, enforce that `num_examples` is divisible by `num_folds`,
        so that all validation sets have the same size. If `False`, the
        size of the validation set is returned along with the iteration
        schemes. Defaults to `True`.

    Yields
    ------
    fold : tuple
        The generator returns `num_folds` tuples. The first two elements of
        the tuple are the training and validation iteration schemes. If
        `strict` is set to `False`, the tuple has a third element
        corresponding to the size of the validation set.

    """
    if strict and num_examples % num_folds != 0:
        raise ValueError(("{} examples are not divisible in {} evenly-sized "
                          "folds. To allow this, have a look at the "
                          "`strict` argument.").format(num_examples,
                                                       num_folds))
    for i in xrange(num_folds):
        begin = num_examples * i // num_folds
        end = num_examples * (i + 1) // num_folds
        train = scheme_class(list(chain(xrange(0, begin),
                                        xrange(end, num_examples))),
                             **kwargs)
        valid = scheme_class(xrange(begin, end), **kwargs)
        if strict:
            yield (train, valid)
        else:
            yield (train, valid, end - begin)
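A hedged usage sketch, assuming this generator is used with Fuel-style schemes (ShuffledScheme is the assumed scheme_class here; its constructor takes the index iterator plus a batch_size keyword forwarded through **kwargs):

from fuel.schemes import ShuffledScheme

# 100 examples, 5 folds of 20 validation examples each
for train_scheme, valid_scheme in cross_validation(
        ShuffledScheme, num_examples=100, num_folds=5, batch_size=10):
    pass  # wrap each scheme in a data stream for training/validation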
def search(request):
    '''
    queries = request.GET['q'].split()
    queryset_list = Post.objects.all()  # .order_by('-timestamp')
    queryset_list.extend(UserProfile.objects.all())
    for query in queries:
        queryset_list = queryset_list.filter(
            Q(title__icontains=query) |
            Q(description__icontains=query) |
            Q(user__username__icontains=query) |
            Q(user__full_name__icontains=query)
        )
    '''
    queries = request.GET['q'].split()
    results = []
    for query in queries:
        user_results = UserProfile.objects.filter(
            Q(username__icontains=query) |
            Q(bio__icontains=query) |
            Q(full_name__icontains=query) |
            Q(location__icontains=query))
        post_results = Post.objects.filter(
            Q(title__icontains=query) |
            Q(description__icontains=query) |
            Q(medium__icontains=query))
        # accumulate matches from both models across all query terms
        results = list(chain(results, user_results, post_results))

    paginator = Paginator(results, 10)  # show 10 results per page
    page = request.GET.get('page')
    try:
        queryset = paginator.page(page)
    except PageNotAnInteger:
        queryset = paginator.page(1)
    except EmptyPage:
        queryset = paginator.page(paginator.num_pages)

    context = {
        'object_list': queryset,
        'title': 'Search Results',
        'query': query,  # note: this is the last term from the loop
    }
    return render(request, 'search_results.html', context)
def open(self):
    return chain(*[iter_(open_(f, encoding=self.encoding))
                   for f in self.files])
def open(self):
    return chain(*[iter_(codecs.open(f, encoding="latin1"))
                   for f in self.files])
def open(self):
    return chain(*[iter_(open(f)) for f in self.files])
def open(self):
    # keep the raw file handles around so the caller can close them later
    handlers = [open(f, "rb") for f in self.files]
    return chain(*[iter_(h) for h in handlers]), handlers
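Unlike the earlier variants, this open also returns the underlying handles so the caller can release them; a hedged sketch of the expected call pattern (the dataset instance is hypothetical):

stream, handlers = dataset.open()
try:
    for line in stream:
        pass  # process each raw line
finally:
    for h in handlers:  # close every file once iteration is done
        h.close()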
def get_request_iterator(self):
    # concatenate the request iterators of all sub-schemes, in order
    return chain(*[sch.get_request_iterator() for sch in self.schemes])
def open(self):
    return bAbIState(chain(*[iter_(open(f)) for f in self.files]), [], 0)