Example No. 1
0
class DuplicatesPipeline(object):
    """Drop items whose ``report_link`` already exists in Solr or in the
    in-memory cache; normalize ``report_revision_time_standard`` to UTC."""

    def __init__(self):
        servers = SOLR_SERVERS
        self.cnn = SolrConnection(servers)[SOLR_COLLECTION_DEFAULT]
        # Recently-seen links, kept to avoid a Solr round-trip per item.
        self.cache_list = []

    def process_item(self, item, spider):
        """Return the item (with revision time shifted to UTC), or raise
        DropItem when the link is already indexed or recently seen."""
        link = item["report_link"]
        # NOTE(review): on Python 3 .encode() yields bytes and "%s" renders
        # b'...'; kept as-is to match the original (Python-2-era) behaviour.
        found = self.cnn.search(
            {"q": "report_link:%s" % link.encode("utf-8")}
        ).result.response.numFound
        if found != 0 or self.cache_duplicate(link):
            raise DropItem("Duplicate item found: %s" % link)
        if "report_revision_time_standard" in item:
            # Convert the UTC+8 timestamp to UTC, rendered as ISO-8601 "Z".
            delta = datetime.timedelta(hours=8)
            utc_time = item["report_revision_time_standard"] - delta
            item["report_revision_time_standard"] = (
                utc_time.strftime("%Y-%m-%dT%H:%M:%S") + "Z"
            )
        return item

    def cache_duplicate(self, report_link):
        """Return True if report_link was seen recently; otherwise record it.

        Fixed: when the cache exceeds SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER it
        is reset, and the current link is now still appended (the original
        dropped it, so its very next occurrence bypassed the cache).
        """
        if report_link in self.cache_list:
            return True
        if len(self.cache_list) > SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER:
            self.cache_list = []
        self.cache_list.append(report_link)
        return False
Example No. 2
0
class DuplicatesPipeline(object):
    """Filter out items already indexed in Solr (or seen during this run)
    and convert ``report_revision_time_standard`` from UTC+8 to UTC."""

    def __init__(self):
        servers = SOLR_SERVERS
        self.cnn = SolrConnection(servers)[SOLR_COLLECTION_DEFAULT]
        # In-memory cache of recently processed links (saves Solr queries).
        self.cache_list = []

    def process_item(self, item, spider):
        """Raise DropItem for duplicates, otherwise return the item."""
        link = item['report_link']
        # NOTE(review): .encode() produces bytes on Python 3, so "%s" renders
        # b'...'; preserved to match the original (Python-2-era) behaviour.
        hits = self.cnn.search(
            {"q": 'report_link:%s' % link.encode('utf-8')}
        ).result.response.numFound
        if hits != 0 or self.cache_duplicate(link):
            raise DropItem("Duplicate item found: %s" % link)
        if "report_revision_time_standard" in item:
            # Shift from UTC+8 to UTC and format as ISO-8601 with a 'Z'.
            shifted = item["report_revision_time_standard"] - datetime.timedelta(hours=8)
            item["report_revision_time_standard"] = shifted.strftime('%Y-%m-%dT%H:%M:%S') + "Z"
        return item

    def cache_duplicate(self, report_link):
        """Return True for a cached link; otherwise cache it and return False.

        Fixed: on overflow the cache is cleared AND the current link is
        appended — the original discarded the link when clearing, letting
        its next occurrence through as a non-duplicate.
        """
        if report_link in self.cache_list:
            return True
        if len(self.cache_list) > SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER:
            self.cache_list = []
        self.cache_list.append(report_link)
        return False
Example No. 3
0
class SolrBackend(object):
    """Mirror a database table into a Solr core via web2py-style CRUD callbacks."""

    def __init__(self, table, core="collection1", url='localhost:8983'):
        """Connect to ``core`` on the Solr server at ``url``.

        ``url`` is now a parameter (default unchanged) instead of being
        hard-coded. Raises RuntimeError when the connection fails.
        """
        self.table = table
        self.core = core
        self.url = url
        try:
            self.interface = SolrConnection(self.url)[self.core]
        except Exception as e:
            logger.warning("Cannot connect to Solr: %s" % e)
            raise RuntimeError("Cannot connect to Solr: %s" % e)

    def get_ids(self, queryset):
        """Return the ids of the rows matched by ``queryset``."""
        return [r.id for r in queryset.select(self.table._id)]

    def indexes(self, *fieldnames):
        """Declare which table fields get copied into Solr documents."""
        self.fieldnames = fieldnames

    def after_insert(self, fields, id):
        """Index a newly inserted row (declared fields only)."""
        document = [{'id': id}]
        for name in self.fieldnames:
            if name in fields:
                document[0][name] = str(fields[name])
        self.interface.add(document)
        self.interface.commit()
        return True

    def after_update(self, queryset, fields):
        """Re-index updated rows.

        Caveat (from the original author): only correct when ALL indexed
        fields are updated at once.
        """
        ids = self.get_ids(queryset)
        documents = []
        for id in ids:
            self.interface.delete({'q': 'id:%i' % id})
            document = {'id': id}
            for name in self.fieldnames:
                if name in fields:
                    document[name] = str(fields[name])
            documents.append(document)
        self.interface.add(documents)
        self.interface.commit()
        return True

    def index_table(self, query, db):
        """Rebuild the index for every row matched by ``query``.

        The rows are selected once and reused for both the delete pass and
        the add pass (the original ran the identical SELECT twice).
        """
        rows = db(query).select()
        for row in rows:
            self.interface.delete({'q': 'id:%i' % row.id})
        self.interface.commit()

        documents = []
        for row in rows:
            document = {'id': row.id}
            for name in self.fieldnames:
                document[name] = str(row[name])
            documents.append(document)
        self.interface.add(documents)
        self.interface.commit()
        return True

    def update(self, query, fields, db, **core_fields):
        """Push selected rows into Solr, keeping only row keys that contain
        one of the ``core_fields`` names.

        NOTE(review): ``core_field in key`` is a substring test, not an
        equality test, and the stale-document delete fires only when the key
        is exactly 'id' — preserved as-is; verify against callers.
        """
        rows = db(query).select(*fields)
        documents = []
        for row in rows:
            document = {}
            for key in row.keys():
                for core_field in core_fields:
                    if core_field in key:
                        document[key] = str(row[key])
                        if key == 'id':
                            self.interface.delete({'q': 'id:%i' % row[key]})
            documents.append(document)
        self.interface.add(documents)
        self.interface.commit()
        return True

    def before_delete(self, queryset):
        """Capture the ids that are about to be deleted.

        Returns False so the database delete proceeds.
        """
        self.ids = self.get_ids(queryset)
        return False

    def after_delete(self, queryset):
        """Remove the Solr documents for rows just deleted from the table.

        Fixed: the original re-queried ``queryset`` here, but after the DB
        delete it matches nothing, so no Solr document was ever removed.
        We now consume the ids captured by before_delete() and only fall
        back to a fresh query when that hook did not run.
        """
        ids = self.__dict__.pop('ids', None)
        if ids is None:
            ids = self.get_ids(queryset)
        for id in ids:
            self.interface.delete({'q': 'id:%i' % id})
        self.interface.commit()
        return True

    def meta_search(self, limit, offset, mode, compact, sort, **fieldkeys):
        """Run a field:value search; ``mode`` joins multiple clauses.

        Returns a list of ids when ``compact`` is true, otherwise the raw
        Solr response object.
        """
        query = ''
        items = len(fieldkeys)
        count = 0
        # Convert to solrcloudpy search
        for fieldkey in fieldkeys:
            query += " %s:%s " % (fieldkey, fieldkeys[fieldkey])
            count += 1
            if items > 1 and count < items:
                query += mode

        se = SearchOptions()
        se.commonparams.q(query).rows(limit).sort(sort).start(offset)
        response = self.interface.search(se)
        if compact:
            return [r['id'] for r in response.result['response'].docs]
        return response.result['response']
Example No. 4
0
class SolrBackend(object):
    """Mirror a database table into a Solr core via web2py-style CRUD callbacks.

    Ported to Python 3: the ``unicode`` builtin became ``str`` and the bare
    ``print`` statement (a SyntaxError on Python 3) is now a debug log call.
    """

    def __init__(self, table, core="collection1"):
        """Connect to ``core`` on the local Solr server.

        Raises RuntimeError when the connection fails.
        """
        self.table = table
        self.core = core
        self.url = 'localhost:8983'
        try:
            self.interface = SolrConnection(self.url)[self.core]
        except Exception as e:
            logger.warning("Cannot connect to Solr: %s" % e)
            raise RuntimeError("Cannot connect to Solr: %s" % e)

    def get_ids(self, queryset):
        """Return the ids of the rows matched by ``queryset``."""
        return [r.id for r in queryset.select(self.table._id)]

    def indexes(self, *fieldnames):
        """Declare which table fields get copied into Solr documents."""
        self.fieldnames = fieldnames

    def after_insert(self, fields, id):
        """Index a newly inserted row (declared fields only)."""
        document = [{'id': id}]
        for name in self.fieldnames:
            if name in fields:
                # str() replaces the Python-2-only unicode() builtin.
                document[0][name] = str(fields[name])
        self.interface.add(document)
        self.interface.commit()
        return True

    def after_update(self, queryset, fields):
        """Re-index updated rows.

        Caveat (original author): only correct when ALL indexed fields are
        updated at once.
        """
        ids = self.get_ids(queryset)
        documents = []
        for id in ids:
            self.interface.delete(id)
            document = {'id': id}
            for name in self.fieldnames:
                if name in fields:
                    document[name] = str(fields[name])
            documents.append(document)
        self.interface.add(documents)
        self.interface.commit()
        return True

    def update(self, query, fields, db, **core_fields):
        """Push selected rows into Solr, mapping row keys to core fields.

        NOTE(review): ``core_field in row[key]`` tests membership in the
        VALUE and then indexes it with the core-field name, which only works
        when row values are mappings keyed by core-field name — logic
        preserved as-is; verify against callers.
        """
        rows = db(query).select(*fields)
        documents = []
        for row in rows:
            document = {}
            for key in row.keys():
                for core_field in core_fields:
                    if core_field in row[key]:
                        document[core_fields[core_field]] = str(row[key][core_field])
                        if core_field == 'id':
                            self.interface.delete(row[key][core_field])
            documents.append(document)
        self.interface.add(documents)
        self.interface.commit()
        return True

    def before_delete(self, queryset):
        """Capture the ids about to be deleted.

        Returns False so the database delete proceeds.
        """
        self.ids = self.get_ids(queryset)
        return False

    def after_delete(self):
        """Remove the Solr documents for the ids captured by before_delete()."""
        for id in self.ids:
            self.interface.delete(id=id)
        self.interface.commit()
        return True

    def meta_search(self, limit, offset, mode, compact, sort, **fieldkeys):
        """Run a field:value search; ``mode`` joins multiple clauses.

        Returns a list of ids when ``compact`` is true, otherwise the raw
        Solr response object.
        """
        query = ''
        items = len(fieldkeys)
        count = 0
        # Convert to solrcloudpy search
        for fieldkey in fieldkeys:
            query += " %s:%s " % (fieldkey, fieldkeys[fieldkey])
            count += 1
            if items > 1 and count < items:
                query += mode

        se = SearchOptions()
        se.commonparams.q(query).rows(limit).sort(sort).start(offset)
        logger.debug("Solr query options: %s", se)  # was: `print se` (Py2-only)
        response = self.interface.search(se)
        if compact:
            return [r['id'] for r in response.result['response'].docs]
        return response.result['response']