Example #1
0
    def parse_item(self, response):
        """Validate a leaf-page response and begin loading an item from it.

        Reads the source name from ``response.request.meta['domain']`` and
        normalises the request URL through ``self.sourcelinkprocessor_class``
        when one is configured.

        If ``self._process_response`` rejects the response, a FAIL status is
        reported, the item stored for the (normalised) URL is removed, and
        the method returns ``None``. It also returns ``None`` when no
        ``self.itemloader_class`` is configured.

        Any exception raised while building the loader is reported as a FAIL
        status of unknown link type, printed, and logged; it is not re-raised.
        """
        meta = response.request.meta
        source = meta['domain']
        url = response.request.url
        if self.sourcelinkprocessor_class:
            processor = self.sourcelinkprocessor_class()
            url = processor.process(url)

        if not self._process_response(response, source, LinkType.LEAF):
            # Report against the first URL of the redirect chain when there
            # is one. 'redirect_urls' may be absent from meta, in which case
            # the request URL itself is reported.
            redirect_urls = meta.get('redirect_urls')
            failed_url = redirect_urls[0] if redirect_urls else response.request.url
            # The link type was previously the builtin ``type`` (an undefined
            # local in spirit); the response was processed as a LEAF link
            # just above, so report it as such.
            service.report_status([LinkStatus(failed_url, source, Status.FAIL, LinkType.LEAF)])
            career.remove_item(url, source)
            return

        if not self.itemloader_class:
            return

        try:
            selector = HtmlXPathSelector(response)
            loader = self.itemloader_class(selector)
            loader.add_value('source', source)
            loader.add_value('source_link', url)
        except Exception as e:
            service.report_status([LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
            # Single pre-formatted argument so the output is the same whether
            # ``print`` is a statement (Python 2) or a function.
            print('%s %s' % (e, url))
            log_error(url)
Example #2
0
 def process_item(self, item, spider):
     """Run the item through the adapter registered for its source.

     Looks up an adapter by ``item.get('source')``; when one is found the
     adapted item is returned, otherwise the item is returned unchanged.
     Any exception is logged via ``log_error`` and turned into ``DropItem``.
     """
     try:
         source_adapter = CareerItemAdapterFactory.get_itemadapter(item.get('source'))
         return source_adapter.adapt(item) if source_adapter else item
     except Exception as err:
         log_error(err)
         raise DropItem()
Example #3
0
 def process_item(self, item, spider):
     """Save a valid item; drop anything invalid or that fails to save.

     The item is checked with ``spider.is_item_valid(item, 1)``. A valid
     item is stored through ``career.save_item(item, spider.name)`` and
     returned. An invalid item raises ``DropItem`` carrying an
     "invalid item" message.

     Unexpected exceptions from validation or saving are logged via
     ``log_error`` and converted into a message-less ``DropItem``.
     """
     try:
         is_valid = spider.is_item_valid(item, 1)
         if is_valid:
             career.save_item(item, spider.name)
     except Exception as e:
         log_error(e)
         raise DropItem()

     # Raised outside the try block on purpose: inside it, the broad handler
     # above would catch this DropItem, log it as an error and replace it
     # with one that has lost the "invalid item" message.
     if not is_valid:
         raise DropItem("invalid item: %s" % item)
     return item
Example #4
0
def _thrift_call(func):
    """Open a Thrift connection, run ``func(client)`` and return its result.

    A buffered binary-protocol ``Links.Client`` is connected to the host and
    port given in ``SERVICE_CONFIG`` and passed to ``func``.

    ``Thrift.TException`` is logged via ``log_error`` and swallowed, in which
    case ``None`` is returned. Other exceptions propagate. The transport is
    closed on every path.
    """
    transport = None
    try:
        sock = TSocket.TSocket(SERVICE_CONFIG['host'], SERVICE_CONFIG['port'])
        transport = TTransport.TBufferedTransport(sock)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = Links.Client(protocol)
        transport.open()
        return func(client)
    except Thrift.TException as tx:
        log_error(tx)
    finally:
        # Previously the connection was never closed, leaking one socket per
        # call. ``transport`` is still None if construction itself failed.
        if transport is not None:
            transport.close()
Example #5
0
def report_link(source, catetory, link, description=''):
    """Insert one row into the link-monitor table and commit.

    The row holds ``source``, ``catetory`` (stored in the ``category``
    column), ``link`` and ``description``, with ``create_time`` set from
    ``get_epoch_datetime()``.

    ``MySQLdb.Error`` is logged via ``log_error`` and not re-raised.

    The misspelt ``catetory`` parameter name is kept because callers may
    pass it by keyword.
    """
    # Only the table name is interpolated into the statement text, because
    # identifiers cannot be bound as parameters. Every value is bound by the
    # driver, so quotes in a link or description can neither break the
    # statement nor inject SQL.
    insert_sql = (
        "INSERT INTO %s (source, category, link, description, create_time) "
        "VALUES (%%s, %%s, %%s, %%s, %%s)" % _link_monitor_table
    )
    try:
        cursor = _conn.cursor()
        # NOTE(review): create_time used to be spliced in unquoted; this
        # assumes get_epoch_datetime() returns a plain value (e.g. a number),
        # not an SQL expression -- confirm.
        cursor.execute(
            insert_sql,
            (source, catetory, link, description, get_epoch_datetime()),
        )
        _conn.commit()
    except MySQLdb.Error as e:
        log_error(e)
Example #6
0
def save_item(item, name):
    """Stamp the item with the current time, upsert it and commit.

    A falsy ``item`` is ignored. Otherwise ``add_time`` and ``update_time``
    are set from ``get_epoch_datetime()`` before the item is handed to
    ``_upsert_item``. Any exception is logged via ``log_error`` and
    swallowed. ``name`` is accepted but not used here.
    """
    if item:
        try:
            db_cursor = _conn.cursor()

            # Record when this item was last crawled.
            item['add_time'] = get_epoch_datetime()
            item['update_time'] = get_epoch_datetime()

            _upsert_item(db_cursor, item)
            _conn.commit()
        except Exception as err:
            log_error(err)
Example #7
0
    def process_item(self, item, spider):
        """Return the item unchanged.

        This stage used to queue the item's icon and image URLs through
        ``career.push_image_url``; that code was commented out, so the stage
        is currently a pass-through. The former ``try``/``except`` wrapped
        only ``return item``, which cannot raise, and has been removed.
        """
        return item