Example 1
 def commit_transaction(self, connection, cursor):
     try:
         connection.commit()
     except Exception as e:
         log_exception(self.logger, e)
         if cursor is not None:
             self.logger.info("Executed Rollback")
             connection.rollback()
Example 2
 def start_transaction(self):
     connection = self.connect()
     try:
         return (connection, connection.cursor())
     except Exception as e:
         log_exception(self.logger, e)
         self.logger.info("Executed Rollback")
         connection.rollback()
Example 3
 def execute_transaction(self, connection, cursor, query, args=None):
     try:
         cursor.execute(query, args)
     except Exception as e:
         log_exception(self.logger, e)
         if cursor is not None:
             self.logger.info("Executed Rollback, last query: " \
                     + cursor._last_executed)
             connection.rollback()
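Taken together, the three helpers above imply a call pattern along these lines. A minimal sketch, assuming `db` is an instance of the manager class these methods belong to (the query and values are placeholders):

     # Hypothetical end-to-end use of start/execute/commit_transaction.
     connection, cursor = db.start_transaction()
     db.execute_transaction(connection, cursor,
                            "UPDATE feeds SET active = %s WHERE id = %s", (1, 42))
     db.commit_transaction(connection, cursor)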
Example 4
 def execute_to_fetch_list(self, query, args=None):
     connection = self.connect()
     cur = connection.cursor()
     try:
         cur.execute(query, args)
         data = cur.fetchall()
         data_list = [row[0] for row in data]
     except Exception as e:
         log_exception(self.logger, e)
         raise
     return data_list
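The method returns the first column of every row as a flat list; a hypothetical call, again assuming a `db` manager instance and a placeholder query:

     # Hypothetical usage: yields e.g. ['alice', 'bob'] for a one-column query.
     names = db.execute_to_fetch_list("SELECT name FROM users WHERE active = %s", (1,))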
Example 5
 def execute_select(self, query, args=None, batch_size=500):
     return_data = []
     connection = self.connect()
     cur = connection.cursor(MySQLdb.cursors.DictCursor)
     cur.arraysize = batch_size
     try:
         cur.execute(query, args)
     except Exception as e:
         log_exception(self.logger, e)
         raise
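The snippet stops after the execute; a plausible completion, assuming the method is meant to return `return_data`, batch-fetches with `fetchmany()` (which reads `cur.arraysize` rows per call by default):

     # Completion sketch (assumption): drain the cursor in batches of
     # `arraysize` rows and return the accumulated dict rows.
     while True:
         rows = cur.fetchmany()
         if not rows:
             break
         return_data.extend(rows)
     return return_data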
Example 6
    def process_feed(self, category, feed):
        temp_dir = self.get_temp_dir()
        latest_load = category['last_feed_date'] if category['last_feed_date'] \
                        else datetime.datetime.min
        if feed['date'] > latest_load:
            try:
                feed['filepath'] = self.download_feed(feed)
            except Exception as e:
                log_exception(self.logger, e)
                return

            for item in self.parse_feed(feed):
                yield item

            last_date = feed['date'].strftime('%Y-%m-%d %H:%M:%S')
            update_feed_category(self.mysql_manager, category['feed_name'],
                                 self.spider_conf['source_id'], last_date)
            abs_filepath = os.path.join(temp_dir, feed['filename'])
            os.unlink(abs_filepath)
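Because `process_feed` is a generator, the feed-date update and temp-file cleanup after the loop only run if the caller exhausts it. A consumption sketch, assuming a `spider` instance and a hypothetical downstream handler:

     # The post-loop bookkeeping (update_feed_category, os.unlink) executes
     # only after every yielded item has been consumed.
     for item in spider.process_feed(category, feed):
         handle_item(item)  # hypothetical handler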
Example 7
    def push_mq(self, fileset, spider):
        project_conf = spider.project_conf
        username = project_conf.get("LOAD", "username")
        password = project_conf.get("LOAD", "password")
        load_host = project_conf.get("LOAD", "host")
        load_virtual_host = project_conf.get("LOAD", "virtual_host")
        queue_name = project_conf.get("LOAD", "queue")
        exchange = project_conf.get("LOAD", "exchange")
        routing_key = project_conf.get("LOAD", "routing_key")

        # TODO: why does this try/except catch everything?
        try:
            credsecurity = pika.PlainCredentials(username, password)
            parameters = pika.ConnectionParameters(
                host=load_host,
                virtual_host=load_virtual_host,
                credentials=credsecurity)
            connection = pika.BlockingConnection(parameters)
            with contextlib.closing(connection.channel()) as channel:
                channel.queue_declare(queue=queue_name, durable=True)
                send_mq_from_dev = project_conf.getboolean(
                    "OUTPUT", 'send_mq_request_local')
                servername = socket.gethostname()
                if send_mq_from_dev:
                    servername = 'alascrapy901'
                message = json.dumps(
                    {"host": servername, "files": fileset},
                    sort_keys=True,
                    indent=4)
                channel.basic_publish(exchange=exchange,
                                      routing_key=routing_key,
                                      body=message)
            connection.close()

        except Exception as e:
            spider._logger.info('Push MQ failed')
            log_exception(spider._logger, e)
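For context, a counterpart consumer for the queue `push_mq` publishes to might look like the sketch below (pika 1.x API; the host and queue names are placeholders, not values from the actual config):

     # Hedged consumer sketch for the JSON messages produced by push_mq.
     import json
     import pika

     connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
     channel = connection.channel()
     channel.queue_declare(queue='load_queue', durable=True)  # placeholder name

     def on_message(ch, method, properties, body):
         payload = json.loads(body)
         print(payload['host'], payload['files'])
         ch.basic_ack(delivery_tag=method.delivery_tag)

     channel.basic_consume(queue='load_queue', on_message_callback=on_message)
     channel.start_consuming()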
    def process_item(self, item, spider):
        if not isinstance(spider, AmazonCSV):
            return item

        if isinstance(item, CategoryItem):
            return item
        elif isinstance(item, AmazonProduct):
            asin = item['asin']["ID_value"]
            salesrank = item.get('salesrank', {}).get('ID_value', None)
            if salesrank is not None and salesrank <= SALESRANK_LIMIT:
                spider._amazon_check_reviews[asin] = item
        else:
            raise DropItem("AmazonCSV yielded invalid item. Valid items"
                           "are CategoryItem and dict")

        if len(spider._amazon_check_reviews) >= 10:
            amazon_collection = AmazonCollection()
            amazon_collection['collection'] = {}

            asins = list(spider._amazon_check_reviews)[:10]
            try:
                have_reviews = spider.amazon_api.have_reviews(asins)
            except Exception as e:
                log_exception(spider.logger, e)
                raise DropItem("Could not verify reviews")

            errors = have_reviews['errors']
            if errors:
                raise DropItem(str(errors))

            have_reviews_asins = have_reviews['asins']
            return_dict = {}
            for asin in have_reviews['invalid_asins']:
                spider._amazon_check_reviews.pop(asin)

            for asin in have_reviews_asins:
                if have_reviews_asins[asin]['has_reviews']:
                    amazon_group = spider._amazon_check_reviews[asin].get(
                        'amazon_group', None)
                    parent_asin = have_reviews_asins[asin].get(
                        'parent_asin', None)
                    if parent_asin and not amazon_group:
                        amazon_group = spider.product_id(
                            spider._amazon_check_reviews[asin]['product'])
                        amazon_group['ID_kind'] = 'amazon_group_id'
                        amazon_group['ID_value'] = parent_asin
                        spider._amazon_check_reviews[asin][
                            'amazon_group'] = amazon_group

                    return_dict[asin] = deepcopy(
                        spider._amazon_check_reviews[asin])
                    sales_rank = int(spider._amazon_check_reviews[asin].get(
                        'salesrank', {}).get('ID_value', None))

                    _scrape_key = spider._amazon_check_reviews[asin].get(
                        'amazon_group', {}).get('ID_value', None)
                    if not _scrape_key:
                        _scrape_key = asin

                    spider._review_scrape[_scrape_key] = {
                        'sales_rank': sales_rank,
                        'asin': _scrape_key
                    }

                spider._amazon_check_reviews.pop(asin)
            amazon_collection['collection'] = return_dict
            return amazon_collection
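One consequence of the 10-item batching in `process_item` is that anything still buffered in `spider._amazon_check_reviews` when the spider finishes is never checked or emitted. A hedged sketch of a standard Scrapy `close_spider` hook that could flush the remainder (`_flush_pending_reviews` is a hypothetical helper reusing the same `have_reviews` batching logic):

     # Hedged sketch: flush leftover buffered items when the spider closes.
     def close_spider(self, spider):
         if getattr(spider, '_amazon_check_reviews', None):
             self._flush_pending_reviews(spider)  # hypothetical helper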