Beispiel #1
0
def export(stream=None):
    """Write every spider of type "vk" to ``stream`` as indented JSON.

    Output format: ``json_repr SEPARATOR json_repr SEPARATOR ...`` -- the
    ``to_dict()`` result of each spider is dumped with ``json.dump`` and is
    followed by ``settings.SPIDER_SEPARATOR`` (also after the last one).

    :param stream: writable text stream; when omitted (or ``None``) the
        ``sys.stdout`` that is current at call time is used.
    """
    if stream is None:
        # Resolve at call time: a ``stream=sys.stdout`` default is evaluated
        # once, when the function is defined, and would therefore ignore
        # any later redirection of ``sys.stdout``.
        stream = sys.stdout
    # All spiders are converted before anything is written to the stream.
    json_list = [s.to_dict() for s in spider_utils.find_spiders()
                 if s.type == "vk"]
    for s_repr in json_list:
        json.dump(s_repr, stream, separators=(',', ': '), indent=2)
        stream.write(settings.SPIDER_SEPARATOR)
Beispiel #2
0
def crawl_all(token=None):
    """Schedule every spider from ``spider_utils.find_spiders()`` and run them.

    If ``token`` is falsy, a token is fetched with
    ``utils.get_access_token()``. The token is assigned to each spider class
    as ``access_token``; the class is appended to ``RUNNING_CRAWLERS`` and
    handed to one shared ``CrawlerRunner``. ``send_mail()`` is attached to
    the deferred returned by ``runner.join()`` for both the success and the
    failure path, after which ``internet.reactor.run()`` is called.

    :param token: access token to inject into the spiders (optional).
    """
    if not token:
        LOG.warn("No token passed, "
                 "acquiring one using login data from settings")
        token = utils.get_access_token()
    LOG.info("Access token: %s" % token)

    def _notify(_result):
        # The deferred's result is ignored; only the mail is sent.
        return send_mail()

    runner = crawler.CrawlerRunner(project.get_project_settings())
    dispatcher.connect(on_close, signal=signals.spider_closed)

    for spider in spider_utils.find_spiders():
        # FIXME encapsulation violation:
        # the access token is injected straight into the spider class
        spider.access_token = token
        RUNNING_CRAWLERS.append(spider)
        runner.crawl(spider)

    all_crawls = runner.join()
    all_crawls.addBoth(_notify)

    internet.reactor.run()
Beispiel #3
0
def query_results():
    """Search Solr with the request's query and render ``show_items.html``.

    Reads ``q``, ``access_token`` and all ``source`` values from the request
    arguments. When sources are given they are added to the query as a
    ``source:(...)`` clause (joined with ``AND`` if the query is not blank).
    With no sources the query is sent to Solr unchanged. The ``date`` of
    every returned document is converted from ``settings.SOLR_DATE_FORMAT``
    to ``settings.DATE_FORMAT`` before rendering.
    """
    args = request.args
    query = args.get('q', '')
    token = args.get('access_token', '')
    sources = args.getlist('source')

    if sources:
        source_clause = 'source:(%s)' % ' '.join(sources)
        if query.strip():
            query = query + ' AND ' + source_clause
        else:
            # blank query: search by the source clause alone
            query = source_clause

    # FIXME some query preprocessing may be needed
    solr = pysolr.Solr(settings.SOLR_URL, timeout=settings.SOLR_TIMEOUT)
    response = solr.search(query, sort="date desc", rows=settings.QUERY_ROWS)

    docs = list(response.docs)
    for doc in docs:
        # replace the Solr date string with the user-facing format
        parsed = datetime.datetime.strptime(doc['date'],
                                            settings.SOLR_DATE_FORMAT)
        doc['date'] = parsed.strftime(settings.DATE_FORMAT)

    spider_names = [spider.name for spider in spider_utils.find_spiders()]
    return render_template('show_items.html', items=docs, query=query,
                           spiders=spider_names, access_token=token)