def parse_item(self, response):
    """Handle a leaf-page response and prepare an item loader for it.

    The source name is read from ``response.request.meta['domain']``. When
    ``self.sourcelinkprocessor_class`` is set, the request URL is passed
    through a fresh processor instance before being used as the item's
    ``source_link``.

    If ``self._process_response`` rejects the response, a FAIL status is
    reported for the first requested URL, the item is removed through
    ``career.remove_item`` and the method returns early. The method also
    returns early when no ``self.itemloader_class`` is configured.

    Returns None on every path visible in this block.
    """
    request = response.request
    meta = request.meta
    source = meta['domain']
    url = request.url
    if self.sourcelinkprocessor_class:
        processor = self.sourcelinkprocessor_class()
        url = processor.process(url)

    if not self._process_response(response, source, LinkType.LEAF):
        # 'redirect_urls' is only set in meta when the request was actually
        # redirected; fall back to the request URL instead of raising
        # KeyError for non-redirected requests.
        redirect_urls = meta.get('redirect_urls')
        first_url = redirect_urls[0] if redirect_urls else request.url
        # Report the link type that was just checked. The previous code
        # passed the builtin ``type`` here by mistake.
        service.report_status(
            [LinkStatus(first_url, source, Status.FAIL, LinkType.LEAF)])
        career.remove_item(url, source)
        return

    if not self.itemloader_class:
        return

    try:
        selector = HtmlXPathSelector(response)
        loader = self.itemloader_class(selector)
        loader.add_value('source', source)
        loader.add_value('source_link', url)
        # NOTE(review): the loader is populated but never loaded or
        # returned in this block -- confirm whether code is missing here.
    except Exception as e:
        service.report_status(
            [LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
        print("%s %s" % (e, url))
        log_error(url)
def process_item(self, item, spider):
    """Run the item through the adapter registered for its source, if any.

    The adapter is looked up with
    ``CareerItemAdapterFactory.get_itemadapter(item.get('source'))``. When
    the lookup yields a truthy adapter the item is replaced by
    ``adapter.adapt(item)``; otherwise it is passed through unchanged.

    Args:
        item: the scraped item; must support ``.get('source')``.
        spider: the spider that produced the item (unused here).

    Returns:
        The adapted item, or the original item when no adapter applies.

    Raises:
        DropItem: when the lookup or the adaptation raises; the original
            exception is logged through ``log_error`` first.
    """
    try:
        adapter = CareerItemAdapterFactory.get_itemadapter(item.get('source'))
        if adapter:
            item = adapter.adapt(item)
        return item
    except Exception as e:  # 'as' form is valid on Python 2.6+ and 3
        log_error(e)
        raise DropItem()
def process_item(self, item, spider):
    """Persist a valid item and drop an invalid one.

    The item is validated with ``spider.is_item_valid(item, 1)``. Valid
    items are stored through ``career.save_item(item, spider.name)`` and
    returned; invalid items are dropped.

    Args:
        item: the scraped item.
        spider: the producing spider; provides ``is_item_valid`` and
            ``name``.

    Returns:
        The item, when it is valid and was handed to ``career.save_item``.

    Raises:
        DropItem: with an "invalid item" message when validation fails, or
            without a message when validation or saving raised an
            unexpected exception. Both cases are logged via ``log_error``.
    """
    try:
        if spider.is_item_valid(item, 1):
            career.save_item(item, spider.name)
            return item
        raise DropItem("invalid item: %s" % item)
    except DropItem as e:
        # Keep the "invalid item" reason. Previously the generic handler
        # below swallowed it and re-raised a DropItem with no message.
        log_error(e)
        raise
    except Exception as e:
        log_error(e)
        raise DropItem()
def _thrift_call(func):
    """Open a Thrift connection, run ``func`` with a client, then close it.

    A buffered socket transport is created for ``SERVICE_CONFIG['host']``
    and ``SERVICE_CONFIG['port']``, wrapped in a binary protocol and handed
    to ``Links.Client``.

    Args:
        func: callable taking the ``Links.Client`` instance.

    Returns:
        Whatever ``func(client)`` returns, or None when a
        ``Thrift.TException`` was raised (it is logged via ``log_error``).
        Other exception types propagate to the caller.
    """
    transport = None
    try:
        raw_socket = TSocket.TSocket(SERVICE_CONFIG['host'],
                                     SERVICE_CONFIG['port'])
        transport = TTransport.TBufferedTransport(raw_socket)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = Links.Client(protocol)
        transport.open()
        return func(client)
    except Thrift.TException as tx:
        log_error(tx)
    finally:
        # Release the socket on every path. The previous code never closed
        # the transport, so each call left a connection behind.
        if transport is not None:
            transport.close()
def report_link(source, catetory, link, description=''):
    """Insert one row into the link monitor table and commit.

    Args:
        source: value for the ``source`` column.
        catetory: value for the ``category`` column (the misspelled
            parameter name is kept so keyword callers keep working).
        link: value for the ``link`` column.
        description: value for the ``description`` column.

    ``create_time`` is filled from ``get_epoch_datetime()``. A
    ``MySQLdb.Error`` is logged through ``log_error`` and not re-raised.
    """
    try:
        cursor = _conn.cursor()
        # Only the table name is interpolated (identifiers cannot be bound
        # as parameters). All values go through the driver so quotes in a
        # link or description can neither break nor alter the statement.
        insert_sql = (
            "INSERT INTO %s (source, category, link, description, create_time) "
            "VALUES (%%s, %%s, %%s, %%s, %%s)" % _link_monitor_table
        )
        # NOTE(review): assumes get_epoch_datetime() returns a plain value
        # (e.g. a number), not an SQL expression -- confirm.
        cursor.execute(
            insert_sql,
            (source, catetory, link, description, get_epoch_datetime()))
        _conn.commit()
    except MySQLdb.Error as e:
        log_error(e)
def save_item(item, name):
    """Timestamp ``item`` and upsert it, committing on success.

    Args:
        item: mutable mapping-like item; a falsy item is ignored. Its
            ``add_time`` and ``update_time`` keys are overwritten.
        name: not used by this function; kept for existing callers.

    Any exception raised while writing is logged through ``log_error`` and
    not re-raised.
    """
    if not item:
        return
    try:
        cursor = _conn.cursor()
        # Record the crawl time once so add_time and update_time always
        # agree. Two separate clock reads could differ.
        now = get_epoch_datetime()
        item['add_time'] = now
        item['update_time'] = now
        _upsert_item(cursor, item)
        _conn.commit()
    except Exception as e:  # 'as' form is valid on Python 2.6+ and 3
        log_error(e)
def process_item(self, item, spider):
    """Pass the item through unchanged.

    The image/icon URL queueing below is currently disabled (commented
    out), so this stage only returns the item it was given.

    Args:
        item: the scraped item.
        spider: the producing spider (unused here).

    Returns:
        The same ``item`` object.

    Raises:
        DropItem: if the body raises; the error is logged via ``log_error``
            first. With the queueing disabled nothing in the body raises.
    """
    try:
        # Disabled: queue the icon URL for download.
        # icon_dic = {}
        # icon_dic['url'] = item['icon_link']
        # icon_dic['source_link'] = item['source_link']
        # icon_dic['source'] = 'icon'
        # career.push_image_url(icon_dic)
        #
        # Disabled: queue the image URLs for download.
        # image_dic = {}
        # image_dic['url'] = item['images']
        # image_dic['source_link'] = item['source_link']
        # image_dic['source'] = 'image'
        # career.push_image_url(image_dic)
        return item
    except Exception as e:  # 'as' form is valid on Python 2.6+ and 3
        log_error(e)
        raise DropItem()