Example #1
    def test_partition(self):
        def is_odd(x):
            return x % 2 == 1

        before, after = iters.partition(is_odd, iters.range(5))
        self.assertEqual([0, 2, 4], list(before))
        self.assertEqual([1, 3], list(after))
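
For reference, the expected split above matches the classic itertools
partition recipe: the first result holds the entries where the predicate is
false, the second those where it is true. A minimal Python 2 sketch of that
recipe, assuming iters.partition follows it (the assertions above suggest so):

from itertools import ifilter, ifilterfalse, tee

def partition(pred, iterable):
    # split one iterable into (false entries, true entries)
    t1, t2 = tee(iterable)
    return ifilterfalse(pred, t1), ifilter(pred, t2)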
Example #2
import logging
from collections import defaultdict
from itertools import ifilterfalse, izip, repeat

from fn.iters import partition  # assumption: same partition as in the test above

logger = logging.getLogger(__name__)

# new_url_info, _normalize_url, _get_outgoings, _is_fragment_ref and
# UrlInfo are helpers defined elsewhere in the same module.


def _collect(start_url, limit, do_head_fn, do_get_fn):
    ''' Recursively collect incoming and outgoing link information,
    starting from the specified URL.

    @note: If one of the pages cannot be reached, the limit doesn't decrease
    @note: The algorithm below is very similar to BFS

    @types: str, int, callable, callable -> dict[str, UrlInfo]

    @type do_head_fn: (iterable[str] -> iterable[requests.Response])
    @param do_head_fn: function that takes an iterable of URLs and returns
                       the responses to HEAD requests
    @type do_get_fn: (iterable[str] -> iterable[requests.Response])
    @param do_get_fn: function that takes an iterable of URLs and returns
                      the responses to GET requests
    '''
    logger.info("staring url with limit %s: %s" % (limit, start_url))
    info_by_url = defaultdict(new_url_info)
    parent_to_url_queue = [(None, _normalize_url(start_url))]
    while parent_to_url_queue and limit > 0:

        # dequeue up to `limit` (parent, url) pairs for this round
        urls_to_process = parent_to_url_queue[:limit]
        del parent_to_url_queue[:limit]

        urls = (url for _, url in urls_to_process)
        results = _get_outgoings(urls, do_head_fn, do_get_fn)
        results = izip(urls_to_process, results)

        for result in results:
            (parent_url, url), (is_page_reached, urls) = result
            if is_page_reached:
                # only successfully fetched pages count against the limit
                limit = limit - 1
                urls = ifilterfalse(_is_fragment_ref, urls)
                urls = [_normalize_url(url, u) for u in urls]

                info = info_by_url[url]
                info.outgoing.update(urls)

                if parent_url:
                    info.incomming.add(parent_url)

                # split outgoing links: URLs not yet known (falsy entries
                # come first) are crawl candidates, known ones are visited
                candidates, visited = partition(info_by_url.has_key, urls)

                for u in visited:
                    info_by_url[u].incomming.add(url)

                parent_to_url_queue.extend(izip(repeat(url), candidates))
                logging.debug("OK         %s <-- %s" % (url, parent_url))
            else:
                logging.debug("FAILED     %s <-- %s" % (url, parent_url))
    return info_by_url
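
As a small illustration of the candidates/visited split inside the loop
above, with hypothetical URLs (the dict values are irrelevant to the
has_key predicate):

info_by_url = {"http://a/": None, "http://b/": None}
urls = ["http://a/", "http://c/"]
candidates, visited = partition(info_by_url.has_key, urls)
# list(candidates) == ["http://c/"]  -> not seen yet, gets queued
# list(visited)    == ["http://a/"]  -> already known, only gains an incoming link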
Example #3
    def extract_args(name_space):
        def extract(nt, args):
            # read each namedtuple field off the namespace and rebuild it
            vals = (getattr(name_space, arg) for arg in args)
            args_dict = dict(zip(args, vals))
            return nt(**args_dict)

        # compose(is_namedtuple, get(1)) tests the type in each (name, type)
        # pair from the enclosing `types` mapping; partition yields the
        # non-namedtuple pairs first, the namedtuple pairs second
        non_nt_types, nts = partition(compose(is_namedtuple, get(1)),
                                      types.items())
        nt_names, nt_types = zip(*nts)
        nt_args = map(lambda x: x._fields, nt_types)
        args_dict = dict(zip(nt_names, map(extract, nt_types, nt_args)))
        non_nt_types = list(non_nt_types)
        # plain (non-namedtuple) attributes are copied over directly
        non_nts = dict(
            (attr, getattr(name_space, attr)) for (attr, _) in non_nt_types)
        args_dict.update(non_nts)
        return args_dict
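
For context, extract_args appears to regroup flat argparse-style attributes
back into namedtuple values keyed by argument name. A hypothetical usage
sketch (Point, NS and the `types` mapping below are illustrative only,
assuming is_namedtuple recognizes namedtuple classes):

from collections import namedtuple

Point = namedtuple("Point", ["x", "y"])

# enclosing-scope mapping of argument name -> expected type
types = {"origin": Point, "verbose": bool}

class NS(object):
    # stands in for an argparse.Namespace with flat attributes
    x = 1
    y = 2
    verbose = True

# extract_args(NS) would then return:
# {"origin": Point(x=1, y=2), "verbose": True}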