Example #1
0
def search_sparse(mk_url,
                  begin_date,
                  end_date,
                  word,
                  quote_key=True,
                  name='',
                  extract_entries=extract_entries):
    """Search for a keyword between two dates.

    Wires the query into the callbacks expected by ``crawl_search_engine``
    and runs the crawl.

    Args:
        mk_url: callable invoked as ``mk_url(word, begin_date, end_date,
            page)`` to build the URL of one result page.
        begin_date: start of the date range; forwarded to ``mk_url`` and
            interpolated into the progress message.
        end_date: end of the date range; used the same way as ``begin_date``.
        word: the keyword to search for.
        quote_key: accepted for interface compatibility; not read anywhere
            in this function.
        name: label interpolated into the progress messages.
        extract_entries: entry extractor handed to ``crawl_search_engine``
            unchanged.

    Returns:
        Whatever ``crawl_search_engine`` returns.
    """

    def _mk_url(page):
        # Bind the fixed query so only the page number is left to supply.
        return mk_url(word, begin_date, end_date, page)

    def _before_start():
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('Getting %s on %s from %s to %s.' % (name, word, begin_date,
                                                   end_date))

    def _uniquify(entries):
        # Keyed on each entry's 'title'; presumably `unique` drops entries
        # whose key repeats -- confirm against its definition.
        return unique(entries, lambda e: e['title'])

    def _after_end(entries):
        print('got %d %s entries.' % (len(entries), name))

    # Pause before fetching the next page (presumably seconds -- confirm
    # against crawl_search_engine).
    sleep_interval = 2
    return crawl_search_engine(_mk_url, _before_start, rq_dom, extract_entries,
                               has_next_page, sleep_interval, _uniquify,
                               _after_end)
Example #2
0
def test_crawl_search_engine():
    """Drive crawl_search_engine with stub callbacks that announce themselves.

    Every stub reports its own invocation through ``printf``. The next-page
    stub answers True on its first call and False from then on.
    """
    state = {'page': 1}

    def _mk_url(page):
        return printf('running mk_url')

    def _before_start():
        return printf('running before_start')

    def _rq_dom(u):
        # Fall back to an empty pair when printf yields a falsy value.
        return printf('runnning rq_dom') or (None, None)

    def _extract_entries(d):
        # Fall back to an empty list when printf yields a falsy value.
        return printf('running extract_entries') or []

    def _has_next_page(dom):
        printf('running has_next_page')
        if state['page'] < 2:
            state['page'] += 1
            return True
        return False

    def _uniquify(es):
        return es

    def _after_end(es):
        return printf('running after_end')

    generic_crawler.crawl_search_engine(
        _mk_url, _before_start, _rq_dom, _extract_entries, _has_next_page, 1,
        _uniquify, _after_end)
Example #3
0
def test_crawl_search_engine():
    """Run crawl_search_engine against stubs that only log their own calls.

    Each stub announces itself via ``printf``; the next-page stub returns
    True once (advancing the page counter to 2) and False afterwards.
    """
    counter = {"page": 1}

    def _mk_url(page):
        return printf("running mk_url")

    def _before_start():
        return printf("running before_start")

    def _rq_dom(u):
        # Use an empty pair if printf returns something falsy.
        return printf("runnning rq_dom") or (None, None)

    def _extract_entries(d):
        # Use an empty list if printf returns something falsy.
        return printf("running extract_entries") or []

    def _has_next_page(dom):
        printf("running has_next_page")
        more_pages = counter["page"] < 2
        if more_pages:
            counter["page"] += 1
        return more_pages

    def _uniquify(es):
        return es

    def _after_end(es):
        return printf("running after_end")

    generic_crawler.crawl_search_engine(
        _mk_url,
        _before_start,
        _rq_dom,
        _extract_entries,
        _has_next_page,
        1,
        _uniquify,
        _after_end,
    )
Example #4
0
def search_sparse(mk_url, begin_date, end_date, word, quote_key=True, name='',
                  extract_entries=extract_entries):
    """Search for a keyword between two dates.

    Builds the callbacks that ``crawl_search_engine`` takes from the query
    arguments and starts the crawl.

    Args:
        mk_url: callable invoked as ``mk_url(word, begin_date, end_date,
            page)`` to produce the URL of one result page.
        begin_date: start of the date range; passed to ``mk_url`` and shown
            in the progress message.
        end_date: end of the date range; used like ``begin_date``.
        word: the keyword to search for.
        quote_key: kept for interface compatibility; this function never
            reads it.
        name: label shown in the progress messages.
        extract_entries: entry extractor passed through to
            ``crawl_search_engine`` as-is.

    Returns:
        Whatever ``crawl_search_engine`` returns.
    """
    def _mk_url(page):
        # Only the page number varies between calls; the query is fixed.
        return mk_url(word, begin_date, end_date, page)
    def _before_start():
        # Parenthesised single argument: same output with the Python 2
        # print statement and the Python 3 print function.
        print('Getting %s on %s from %s to %s.' % (name, word, begin_date, end_date))
    def _uniquify(entries):
        # Keyed on 'title'; presumably `unique` removes entries with a
        # repeated key -- confirm against its definition.
        return unique(entries, lambda e: e['title'])
    def _after_end(entries):
        print('got %d %s entries.' % (len(entries), name))
    # Pause before fetching the next page (presumably seconds -- confirm
    # against crawl_search_engine).
    sleep_interval = 2
    return crawl_search_engine(_mk_url,
                               _before_start,
                               rq_dom,
                               extract_entries,
                               has_next_page,
                               sleep_interval,
                               _uniquify,
                               _after_end)