Example #1
def prepare_training_data(src_files, src_files_encoding, trg_files,
                          trg_files_encoding, src_output_file,
                          trg_output_file):
    '''
    For each pair of source/target files, check that they contain the same
    number of sentences, then shuffle the sentence pairs and save them with
    UTF-8 encoding.
    '''
    src = chain(
        *[iter_(open_(f, encoding=src_files_encoding)) for f in src_files])
    trg = chain(
        *[iter_(open_(f, encoding=trg_files_encoding)) for f in trg_files])

    # TODO: find a way not to load all sentences into memory
    logger.info("reading sentences from source files...")
    src_sentences = [sent for sent in src]
    logger.info("reading sentences from target files...")
    trg_sentences = [sent for sent in trg]

    assert len(src_sentences) == len(trg_sentences), \
        "source/target files must contain the same number of sentences"
    logger.info("number of sentences: %d", len(src_sentences))

    # sentences must still end with '\n' so they can be written back verbatim
    assert src_sentences[0].endswith('\n')
    # shuffle the pairs by shuffling a shared list of indices
    ids = list(range(len(src_sentences)))
    random.shuffle(ids)

    with codecs.open(src_output_file, 'w', 'UTF-8') as f_src:
        with codecs.open(trg_output_file, 'w', 'UTF-8') as f_trg:
            for i in ids:
                f_src.write(src_sentences[i])
                f_trg.write(trg_sentences[i])
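
A minimal way to drive this function is sketched below; `iter_`, `open_`, and `logger` are module-level names in the original project, so stand-ins are defined first, and the file names are hypothetical.

# Minimal usage sketch. `iter_` and `open_` are project-local helpers
# (fuel-style code takes them from picklable_itertools and a formats
# utility); plain built-ins stand in for them here, and the file names
# are hypothetical.
import codecs
import logging
import random
from itertools import chain

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

iter_ = iter         # stand-in for the picklable iterator helper
open_ = codecs.open  # stand-in that accepts an `encoding` keyword

prepare_training_data(
    src_files=['train.de.0', 'train.de.1'], src_files_encoding='utf-8',
    trg_files=['train.en.0', 'train.en.1'], trg_files_encoding='utf-8',
    src_output_file='train.de.shuf', trg_output_file='train.en.shuf')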
Example #2
 def get_request_iterator(self):
     if self.times:
         return repeat(self.batch_size, self.times)
     if self.num_examples:
         d, r = divmod(self.num_examples, self.batch_size)
         return chain(repeat(self.batch_size, d), [r] if r else [])
     return repeat(self.batch_size)
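
For illustration, here is what the `num_examples` branch produces, with the `self.` attributes replaced by plain variables (values are illustrative):

from itertools import chain, islice, repeat

batch_size, num_examples = 3, 10
d, r = divmod(num_examples, batch_size)       # 3 full batches, remainder 1
print(list(chain(repeat(batch_size, d), [r] if r else [])))  # [3, 3, 3, 1]

# With neither `times` nor `num_examples` set, the scheme repeats
# `batch_size` forever; islice caps it here for display.
print(list(islice(repeat(batch_size), 5)))    # [3, 3, 3, 3, 3]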
Example #3
 def get_request_iterator(self):
     iterator_list = []
     start = 0
     for size, examples in self.size_dict.items():
         iterator_list.append(
             partition_all(self.batch_size, xrange(start,
                                                   start + examples)))
         start += examples
     return chain(*iterator_list)
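
`partition_all` is presumably the `toolz`/`cytoolz` helper that chunks an iterable into tuples of at most the given size. A self-contained sketch of the same request stream (assumes an insertion-ordered dict, i.e. Python 3.7+; values are illustrative):

from itertools import chain
from toolz import partition_all  # assumption: partition_all comes from toolz

size_dict = {32: 5, 64: 4}  # illustrative {size: num_examples} mapping
batch_size = 2

iterator_list, start = [], 0
for size, examples in size_dict.items():
    iterator_list.append(
        partition_all(batch_size, range(start, start + examples)))
    start += examples
print(list(chain(*iterator_list)))
# [(0, 1), (2, 3), (4,), (5, 6), (7, 8)]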
Example #4
 def __init__(self, data_streams, sources):
     self.data_streams = data_streams
     if len(
             list(
                 chain(
                     *[data_stream.sources
                       for data_stream in data_streams]))) != len(sources):
         raise ValueError("wrong number of sources given")
     self.sources = sources
Example #5
    def __init__(self, data_streams, sources, axis_labels=None):
        super(Merge, self).__init__(iteration_scheme=None, axis_labels=axis_labels)
        if not all(data_stream.produces_examples == data_streams[0].produces_examples for data_stream in data_streams):
            raise ValueError("all data streams must produce the same type of " "output (batches or examples)")
        self.data_streams = data_streams
        self.produces_examples = self.data_streams[0].produces_examples

        if len(list(chain(*[data_stream.sources for data_stream in data_streams]))) != len(sources):
            raise ValueError("wrong number of sources given")
        self.sources = sources
Example #6
    def __init__(self, data_streams, sources, axis_labels=None):
        super(Merge, self).__init__(
            iteration_scheme=None, axis_labels=axis_labels)
        if not all(data_stream.produces_examples ==
                   data_streams[0].produces_examples
                   for data_stream in data_streams):
            raise ValueError('all data streams must produce the same type of '
                             'output (batches or examples)')
        self.data_streams = data_streams
        self.produces_examples = self.data_streams[0].produces_examples

        if len(list(chain(*[data_stream.sources for data_stream
                            in data_streams]))) != len(sources):
            raise ValueError("wrong number of sources given")
        self.sources = sources
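
The `chain` call just flattens the per-stream source tuples so their total count can be checked against the new names. The check in isolation, with illustrative tuples:

from itertools import chain

# Illustrative source tuples for two wrapped data streams.
stream_sources = [('features',), ('english', 'french')]
sources = ('features', 'english', 'french')

flattened = list(chain(*stream_sources))
if len(flattened) != len(sources):
    raise ValueError("wrong number of sources given")
print(flattened)  # ['features', 'english', 'french']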
Example #7
 def wrapper_func(*args, **kwargs):
     mock_requests = args[-1]
     args = args[:-1]
     length = len(mock_content)
     mock_response = mock.Mock()
     mock_response.iter_content = mock.Mock(side_effect=lambda s: chain(
         (mock_content[s * i:s * (i + 1)] for i in range(length // s)),
         (mock_content[(length // s) * s:], )))
     mock_response.headers = {}
     if content_length:
         mock_response.headers['content-length'] = length
     if content_disposition:
         cd = 'attachment; filename={}'.format(mock_filename)
         mock_response.headers['Content-Disposition'] = cd
     mock_requests.get.return_value = mock_response
     return func(*args, **kwargs)
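
The `side_effect` lambda mimics `requests.Response.iter_content`: it slices `mock_content` into `s`-byte chunks plus the remainder. A standalone check (note that when the length is an exact multiple of `s`, the trailing chunk is the empty byte string):

from itertools import chain

mock_content = b'abcdefghij'
s = 4  # illustrative chunk size
n_full = len(mock_content) // s

chunks = chain(
    (mock_content[s * i:s * (i + 1)] for i in range(n_full)),  # full chunks
    (mock_content[n_full * s:],))                              # remainder
print(list(chunks))  # [b'abcd', b'efgh', b'ij']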
Example #8
def cross_validation(scheme_class, num_examples, num_folds, strict=True,
                     **kwargs):
    """Return pairs of schemes to be used for cross-validation.

    Parameters
    ----------
    scheme_class : subclass of :class:`IndexScheme` or :class:`BatchScheme`
        The type of the returned schemes. The constructor is called with an
        iterator and `**kwargs` as arguments.
    num_examples : int
        The number of examples in the datastream.
    num_folds : int
        The number of folds to return.
    strict : bool, optional
        If `True`, enforce that `num_examples` is divisible by `num_folds`
        and so, that all validation sets have the same size. If `False`,
        the size of the validation set is returned along the iteration
        schemes. Defaults to `True`.

    Yields
    ------
    fold : tuple
        The generator returns `num_folds` tuples. The first two elements of
        the tuple are the training and validation iteration schemes. If
        `strict` is set to `False`, the tuple has a third element
        corresponding to the size of the validation set.

    """
    if strict and num_examples % num_folds != 0:
        raise ValueError(("{} examples are not divisible in {} evenly-sized " +
                          "folds. To allow this, have a look at the " +
                          "`strict` argument.").format(num_examples,
                                                       num_folds))

    for i in xrange(num_folds):
        begin = num_examples * i // num_folds
        end = num_examples * (i+1) // num_folds
        train = scheme_class(list(chain(xrange(0, begin),
                                        xrange(end, num_examples))),
                             **kwargs)
        valid = scheme_class(xrange(begin, end), **kwargs)

        if strict:
            yield (train, valid)
        else:
            yield (train, valid, end - begin)
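
A quick usage sketch; `identity_scheme` is a hypothetical stand-in for an `IndexScheme` subclass that simply records its indices, and the `xrange` fallback covers Python 3 (the snippet above targets Python 2):

from itertools import chain

try:
    xrange
except NameError:
    xrange = range        # Python 3 fallback

def identity_scheme(indices):
    # hypothetical stand-in for an IndexScheme subclass
    return list(indices)

for train, valid in cross_validation(identity_scheme,
                                     num_examples=6, num_folds=3):
    print(train, valid)
# [2, 3, 4, 5] [0, 1]
# [0, 1, 4, 5] [2, 3]
# [0, 1, 2, 3] [4, 5]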
Example #9
 def wrapper_func(*args, **kwargs):
     mock_requests = args[-1]
     args = args[:-1]
     length = len(mock_content)
     mock_response = mock.Mock()
     mock_response.iter_content = mock.Mock(
         side_effect=lambda s: chain(
             (mock_content[s * i: s * (i + 1)]
              for i in range(length // s)),
             (mock_content[(length // s) * s:],)))
     mock_response.headers = {}
     if content_length:
         mock_response.headers['content-length'] = length
     if content_disposition:
         cd = 'attachment; filename={}'.format(mock_filename)
         mock_response.headers['Content-Disposition'] = cd
     mock_requests.get.return_value = mock_response
     return func(*args, **kwargs)
Example #10
def search(request):
    '''
    queries = request.GET['q'].split()
    queryset_list = Post.objects.all()  # .order_by('-timestamp')
    queryset_list.extend(UserProfile.objects.all())
    for query in queries:
        queryset_list = queryset_list.filter(
            Q(title__icontains=query) |
            Q(description__icontains=query) |
            Q(user__username__icontains=query) |
            Q(user__full_name__icontains=query)
        )
    '''
    queries = request.GET['q'].split()
    results = []
    for query in queries:
        user_results = (UserProfile.objects.filter(
            Q(username__icontains=query) |
            Q(bio__icontains=query) |
            Q(full_name__icontains=query) |
            Q(location__icontains=query)))
        post_results = (Post.objects.filter(
            Q(title__icontains=query) |
            Q(description__icontains=query) |
            Q(medium__icontains=query)))
        results = list(chain(results, user_results, post_results))
    paginator = Paginator(results, 10)  # show 10 results per page
    page = request.GET.get('page')
    try:
        queryset = paginator.page(page)
    except PageNotAnInteger:
        queryset = paginator.page(1)
    except EmptyPage:
        queryset = paginator.page(paginator.num_pages)
    context = {
        'object_list': queryset,
        'title': 'Search Results',
        'query': query,
    }
    return render(request, 'search_results.html', context)
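
Here `chain` does the real work: querysets of different models cannot be combined with `|`, so each pass folds the previous hits plus the new `UserProfile` and `Post` matches into a plain list that `Paginator` can page. (An object matching several query words will appear once per match, since nothing deduplicates.) The pattern in isolation, with any iterables standing in for the querysets:

from itertools import chain

# Any iterables can stand in for the two querysets.
user_results = ['@alice', '@bob']
post_results = ['First post', 'Hello world']

results = []
results = list(chain(results, user_results, post_results))
print(results)  # ['@alice', '@bob', 'First post', 'Hello world']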
Example #11
def search(request):
    '''
    queries = request.GET['q'].split()
    queryset_list = Post.objects.all()  # .order_by('-timestamp')
    queryset_list.extend(UserProfile.objects.all())
    for query in queries:
        queryset_list = queryset_list.filter(
            Q(title__icontains=query) |
            Q(description__icontains=query) |
            Q(user__username__icontains=query) |
            Q(user__full_name__icontains=query)
        )
    '''
    queries = request.GET['q'].split()
    results = []
    for query in queries:
        user_results = (UserProfile.objects.filter(
            Q(username__icontains=query) | Q(bio__icontains=query)
            | Q(full_name__icontains=query) | Q(location__icontains=query)))
        post_results = (Post.objects.filter(
            Q(title__icontains=query) | Q(description__icontains=query)
            | Q(medium__icontains=query)))
        results = list(chain(results, user_results, post_results))
    paginator = Paginator(results, 10)  # show 10 results per page
    page = request.GET.get('page')
    try:
        queryset = paginator.page(page)
    except PageNotAnInteger:
        queryset = paginator.page(1)
    except EmptyPage:
        queryset = paginator.page(paginator.num_pages)
    context = {
        'object_list': queryset,
        'title': 'Search Results',
        'query': query,
    }
    return render(request, 'search_results.html', context)
Example #12
 def open(self):
     return chain(
         *[iter_(open_(f, encoding=self.encoding)) for f in self.files])
Example #13
 def open(self):
     return chain(
         *[iter_(codecs.open(f, encoding="latin1")) for f in self.files])
Example #14
 def __init__(self, data_streams, sources):
     self.data_streams = data_streams
     if len(list(chain(*[data_stream.sources for data_stream
                         in data_streams]))) != len(sources):
         raise ValueError("wrong number of sources given")
     self.sources = sources
Example #15
 def open(self):
     return chain(*[iter_(open_(f, encoding=self.encoding))
                    for f in self.files])
Example #16
 def open(self):
     return chain(*[iter_(open(f)) for f in self.files])
Example #17
 def open(self):
     handlers = [open(f, "rb") for f in self.files]
     return chain(*[iter_(h) for h in handlers]), handlers
Example #18
 def get_request_iterator(self):
     return chain(*[sch.get_request_iterator() for sch in self.schemes])
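
Concatenating schemes is the same flattening trick: each sub-scheme contributes its own request iterator. A sketch with a hypothetical constant-batch stand-in (a real library scheme class would have a richer signature):

from itertools import chain, repeat

class ConstantBatchScheme(object):
    # Hypothetical stand-in: requests `batch_size`, `times` times.
    def __init__(self, batch_size, times):
        self.batch_size, self.times = batch_size, times

    def get_request_iterator(self):
        return repeat(self.batch_size, self.times)

schemes = [ConstantBatchScheme(2, 2), ConstantBatchScheme(5, 3)]
print(list(chain(*[sch.get_request_iterator() for sch in schemes])))
# [2, 2, 5, 5, 5]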
Example #19
 def open(self):
     return chain(
         *[iter_(codecs.open(f, encoding="latin1")) for f in self.files])
Example #20
 def open(self):
     return bAbIState(chain(*[iter_(open(f)) for f in self.files]), [], 0)
Example #21
 def open(self):
     return chain(*[iter_(open(f)) for f in self.files])