Example #1
0
    def _filter_by_query(self, spider):
        """Return those items from recently fetched that match the QUERY.

        Searches Solr for documents that match ``settings.QUERY``, whose
        ``source`` is ``spider.name`` and whose ``date`` lies between a
        lower bound and ``NOW``.  The lower bound is one second after
        ``spider.last_ts``, but never earlier than ``settings.POSTS_TTL``
        days before the current time.

        Make sure that items have been uploaded to Solr but the last crawl
        time has not been updated before calling this function.

        Side effect: when ``spider.last_ts`` is falsy it is set to the TTL
        cutoff computed here.

        Returns the object produced by ``self.solr.search``; each item's
        ``'date'`` value is re-parsed with ``settings.SOLR_DATE_FORMAT`` and
        overwritten in place with its ``settings.DATE_FORMAT`` rendering.
        """
        # Oldest date that may be shown at all: now minus the posts TTL.
        last_to_show = (datetime.datetime.now() -
                        datetime.timedelta(days=settings.POSTS_TTL))
        if not spider.last_ts:
            spider.last_ts = last_to_show
        # increment date by 1 second to hide last seen result
        # FIXME how can we do it with a solr query?
        inc_date = max(spider.last_ts + datetime.timedelta(seconds=1),
                       last_to_show)
        query_template = (u"%(query)s AND date:([%(date)s TO NOW]) "
                          "AND source: %(source)s")
        query = query_template % {
            'query': settings.QUERY,
            'date': utils.convert_date_to_solr_date(inc_date),
            'source': spider.name,
        }
        items = self.solr.search(query, sort="date desc",
                                 rows=settings.QUERY_ROWS)
        # convert dates to human-readable non-solr format
        for item in items:
            # FIXME move to utils
            dt = datetime.datetime.strptime(item['date'],
                                            settings.SOLR_DATE_FORMAT)
            item['date'] = dt.strftime(settings.DATE_FORMAT)
        return items
Example #2
0
def cleanup():
    """Remove data from Solr dated up to the post TTL cutoff.

    The cutoff is the current time minus ``settings.POSTS_TTL`` days.  A
    delete-by-query for ``date:[* TO <cutoff>]`` is sent to the Solr
    instance at ``settings.SOLR_URL``, and whatever ``delete`` returns is
    wrapped in a ``text/xml`` ``Response``.
    """
    client = pysolr.Solr(settings.SOLR_URL, timeout=settings.SOLR_TIMEOUT)
    ttl = datetime.timedelta(days=settings.POSTS_TTL)
    cutoff = datetime.datetime.now() - ttl
    solr_cutoff = utils.convert_date_to_solr_date(cutoff)
    # Open lower bound: everything up to the cutoff is deleted.
    delete_result = client.delete(q="date:[* TO %s]" % solr_cutoff)
    return Response(delete_result, mimetype="text/xml")