Ejemplo n.º 1
0
 def process_item(self, item, spider):
     """Buffer scraped movie items into the current MongoDB bulk batch.

     Items from the '66ys' spider are stored when they carry download or
     netdisk ('wangpan') info; items from 'dy2018' when they carry download
     info. Items without links are only logged (movie name + source URL).

     :param item: the scraped Scrapy item (mapping-like)
     :param spider: the spider that produced the item; its ``name`` selects
         the handling branch and its ``logger`` receives skipped items
     """
     if spider.name == '66ys':
         store_item = dict(item)
         # BUG FIX: dict.has_key() was removed in Python 3; use `in` instead.
         if 'download_info' in store_item or 'wangpan_info' in store_item:
             self._bulk_insert(store_item)
         else:
             spider.logger.info(store_item['movie_name'] + '\t' + store_item['ys66_url'])
     elif spider.name == 'dy2018':
         store_item = dict(item)
         if 'download_info' in store_item:
             self._bulk_insert(store_item)
         else:
             spider.logger.info(store_item['movie_name'] + '\t' + store_item['dy2018_url'])

 def _bulk_insert(self, store_item):
     """Queue one document for insert; execute and reset the bulk at the cap."""
     self.bulk.insert(store_item)
     self.bulk_ct += 1
     if self.bulk_ct >= self.bulk_max:
         self.bulk.execute()
         # Re-create a fresh unordered (parallel) bulk builder for the next batch.
         self.bulk = BulkOperationBuilder(self.collection, ordered=False)
         self.bulk_ct = 0
Ejemplo n.º 2
0
 def bulk_upsert_operation(self, table, SON, DOC):
     """Run a single bulk upsert (filter ``SON`` -> update ``DOC``) on *table*.

     :param table: collection name within ``self.db``
     :param SON: filter document selecting the target document(s)
     :param DOC: update document to apply
     :return: the bulk execution result dict, or None if anything raised
     """
     try:
         col = collection.Collection(self.db, table)
         bulkop = BulkOperationBuilder(col, ordered=False)
         # BUG FIX: without .upsert() this was a plain update and never inserted
         # missing documents, contradicting the method's name.
         bulkop.find(SON).upsert().update(DOC)
         return bulkop.execute()
     except Exception:
         # Best-effort: log the full traceback and signal failure via None.
         logger.error(traceback.format_exc())
         return None
Ejemplo n.º 3
0
 def bulk_upsert_operation(self, table, SON, DOC):
     """Run a single bulk upsert (filter ``SON`` -> update ``DOC``) on *table*.

     :param table: collection name within ``self.db``
     :param SON: filter document selecting the target document(s)
     :param DOC: update document to apply
     :return: the bulk execution result dict, or None if anything raised
     """
     try:
         col = collection.Collection(self.db, table)
         bulkop = BulkOperationBuilder(col, ordered=False)
         # BUG FIX: without .upsert() this was a plain update and never inserted
         # missing documents, contradicting the method's name.
         bulkop.find(SON).upsert().update(DOC)
         return bulkop.execute()
     except Exception:
         # Best-effort: log the full traceback and signal failure via None.
         logger.error(traceback.format_exc())
         return None
Ejemplo n.º 4
0
class MoviePipeline(object):
    """Scrapy pipeline that batches scraped movie items into MongoDB bulk inserts.

    Inserts are queued on an unordered ``BulkOperationBuilder`` and flushed
    every ``bulk_max`` operations and once more when the spider closes.
    """

    def __init__(self):
        server = settings['MONGODB_SERVER']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DB']
        self.client = MongoClient(server, port)
        self.db = self.client[dbName]
        self.bulk_max = 500         # flush the bulk buffer once this many ops queue up
        # self.ids_seen = set()     # could be used to filter duplicate items

    def open_spider(self, spider):
        """Bind the spider's target collection and start an empty bulk batch."""
        self.collection = self.db[spider.collection_name]
        # Unordered bulk: the server may apply queued operations in parallel.
        self.bulk = BulkOperationBuilder(self.collection, ordered=False)
        self.bulk_ct = 0    # operations queued so far; flushed at bulk_max

    def close_spider(self, spider):
        """Flush any remaining queued operations, then close the client."""
        try:
            self.bulk.execute()
        except BulkWriteError:
            # Best-effort final flush; bulk write errors are deliberately ignored.
            pass
        self.client.close()

    def _bulk_insert(self, store_item):
        """Queue one document for insert; execute and reset the bulk at the cap."""
        self.bulk.insert(store_item)
        self.bulk_ct += 1
        if self.bulk_ct >= self.bulk_max:
            self.bulk.execute()
            # Re-create a fresh unordered (parallel) bulk builder for the next batch.
            self.bulk = BulkOperationBuilder(self.collection, ordered=False)
            self.bulk_ct = 0

    def process_item(self, item, spider):
        """Buffer items that carry download links; log the rest by name and URL.

        NOTE(review): Scrapy pipelines conventionally return the item; the
        original returned None on every path, which is preserved here.
        """
        if spider.name == '66ys':
            store_item = dict(item)
            # BUG FIX: dict.has_key() was removed in Python 3; use `in` instead.
            if 'download_info' in store_item or 'wangpan_info' in store_item:
                self._bulk_insert(store_item)
            else:
                spider.logger.info(store_item['movie_name'] + '\t' + store_item['ys66_url'])
        elif spider.name == 'dy2018':
            store_item = dict(item)
            if 'download_info' in store_item:
                self._bulk_insert(store_item)
            else:
                spider.logger.info(store_item['movie_name'] + '\t' + store_item['dy2018_url'])
Ejemplo n.º 5
0
 def bulk(collection, ordered=True, passive_document_validation=False):
     """
     Create a bulk-execution builder bound to *collection*.

     :param collection: the collection to operate on
     :param ordered: if True (default), commands run sequentially and the
                     whole flow aborts on the first error; if False they may
                     run in any order (possibly in parallel) and all errors
                     are reported after every operation has completed.
     :param passive_document_validation: (optional) if True, writes may opt
                     out of document-level validation. Defaults to False.
     :return: a BulkOperationBuilder for *collection*
     """
     builder = BulkOperationBuilder(collection, ordered,
                                    passive_document_validation)
     return builder
Ejemplo n.º 6
0
 def __init__(self, collection, ordered, bypass_document_validation):
     """Wrap a synchronous BulkOperationBuilder for an async wrapper collection.

     :param collection: async wrapper collection; ``collection.delegate`` is
         the underlying synchronous collection being proxied
     :param ordered: passed through to BulkOperationBuilder
     :param bypass_document_validation: passed through to BulkOperationBuilder
     """
     self.io_loop = collection.get_io_loop()
     delegate = BulkOperationBuilder(collection.delegate, ordered,
                                     bypass_document_validation)
     # NOTE(review): super(self.__class__, ...) recurses infinitely if this
     # class is ever subclassed -- the explicit class name would be safer; confirm.
     super(self.__class__, self).__init__(delegate)
Ejemplo n.º 7
0
 def _init_builder(self):
     """(Re)create the underlying bulk-operation builder for this collection."""
     self._bob = BulkOperationBuilder(ordered=self.ordered, collection=self.collection)
Ejemplo n.º 8
0
class muBulkOps(object):
    """A wrapper around BulkOperationBuilder that provides some automation.

    .. versionadded:: 1.0.6

    :parameters:
        - ae_n: (int) auto execute every n operations (0, the default,
          disables count-based auto execution)
        - ae_s: (int) auto execute when this many seconds have elapsed since
          start or since the last execute; useful when collection data should
          stay relatively fresh. Set to 0 (the default) to disable
          time-based auto execution.
        - dwc: (dict) or None -- default write concern used on auto-execute.
          Pass a plain dict such as ``{'w': 1}``, NOT a WriteConcern object.
    """
    frmt_stats = "{:s}db:{:s} collection:{:s} cnt_operations_executed:{:16,d} cnt_operations_pending:{:6,d}"

    def __init__(self, collection, ordered=True, ae_n=0, ae_s=0, dwc=None):
        """Initialize a new BulkOperationBuilder instance."""
        self.collection = collection
        self.ordered = ordered
        self.dwc = dwc
        self.cnt_operations_pending = 0
        self.cnt_operations_executed = 0
        self.ae_n = ae_n    # (was redundantly assigned twice in the original)
        self.ae_s = ae_s
        if ae_s != 0:
            # Timestamp of start / last execute, for time-based auto execution.
            self.dt_last = datetime.now()
        self._init_builder()

    def _init_builder(self):
        """(Re)create the underlying BulkOperationBuilder."""
        self._bob = BulkOperationBuilder(collection=self.collection, ordered=self.ordered)

    def find(self, selector):
        """Delegate to the underlying builder's ``find``."""
        return self._bob.find(selector)

    def stats(self, message=''):
        """Return a one-line summary of executed and pending operation counts."""
        return self.frmt_stats.format(message, self.collection.database.name, self.collection.name,
                                      self.cnt_operations_executed, self.cnt_operations_pending)

    def stats_print(self):
        """Print the stats line to stdout."""
        print(self.stats())

    def insert(self, document):
        """Queue an insert; auto-execute when count/time thresholds are reached.

        :return: the builder's insert result, or the execute result when an
            auto-execute was triggered by this call
        """
        rt = self._bob.insert(document)
        self.cnt_operations_pending += 1
        if self.ae_s != 0:
            current_dt = datetime.now()
            if self.cnt_operations_pending == self.ae_n or ((current_dt - self.dt_last).seconds > self.ae_s):
                self.dt_last = current_dt
                rt = self.execute(write_concern=self.dwc, recreate=True)
        elif self.cnt_operations_pending == self.ae_n:
            rt = self.execute(write_concern=self.dwc, recreate=True)
        return rt

    def execute(self, write_concern=None, recreate=True):
        """Execute all pending operations; optionally recreate the builder.

        :param write_concern: write concern dict forwarded to the builder
        :param recreate: when True, a fresh builder replaces the spent one
            (a BulkOperationBuilder cannot be executed twice)
        """
        rt = self._bob.execute(write_concern=write_concern)
        self.cnt_operations_executed += self.cnt_operations_pending
        self.cnt_operations_pending = 0
        if recreate:
            self._init_builder()
        return rt

    def execute_if_pending(self, write_concern=None):
        """Execute if any pending operations still exist (e.g. on error/shutdown)."""
        if write_concern is None:
            write_concern = self.dwc
        if self.cnt_operations_pending > 0:
            # BUG FIX: the original passed self.dwc here, silently ignoring the
            # write_concern argument it had just resolved above.
            return self.execute(write_concern=write_concern, recreate=True)
Ejemplo n.º 9
0
 def __init__(self, collection, ordered):
     """Wrap a synchronous BulkOperationBuilder for an async wrapper collection.

     :param collection: async wrapper collection; ``collection.delegate`` is
         the underlying synchronous collection being proxied
     :param ordered: passed through to BulkOperationBuilder
     """
     self.io_loop = collection.get_io_loop()
     delegate = BulkOperationBuilder(collection.delegate, ordered)
     # NOTE(review): super(self.__class__, ...) recurses infinitely if this
     # class is ever subclassed -- the explicit class name would be safer; confirm.
     super(self.__class__, self).__init__(delegate)
Ejemplo n.º 10
0
    def _new_cache(self, *args, **kwargs):
        """Create a new bulk-operation builder over this object's table backend."""
        table_backend = self._get_backend(self.get_table())
        return BulkOperationBuilder(table_backend, self._cache_ordered)
Ejemplo n.º 11
0
 def open_spider(self, spider):
     """Bind the spider's target collection and start an empty bulk batch."""
     self.collection = self.db[spider.collection_name]
     # Build an unordered bulk object: the server may apply ops in parallel.
     self.bulk = BulkOperationBuilder(self.collection, ordered=False)
     # Count of queued operations; a batch is executed once this hits the cap.
     self.bulk_ct = 0