Ejemplo n.º 1
0
    def closed(self, reason):
        """Log the report summary, read the log file and open a DB connection.

        Args:
            reason: Close reason supplied by the caller; not used in the
                visible body of this method.
        """
        # Pretty-print the crawler statistics.
        # NOTE(review): 'stats' and 'file_content' are not consumed in the
        # visible part of this method -- presumably they are stored in the
        # DB further down; confirm.
        stats = self.crawler.stats.get_stats()
        stats = pprint.pformat(stats)
        # Join all the report summary strings together, one per line,
        # and log the result.
        s = '\n'.join(self.report_summary)
        logging.info(s)

        # Retrieve the content of the log file named by the LOG_FILE
        # setting. The context manager guarantees the handle is closed even
        # when reading fails.
        log_path = self.settings.get('LOG_FILE')
        with open('%s' % log_path, 'r') as log_file:
            file_content = log_file.read()

        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()
        except MySQLdb.Error as e:
            # 'except ... as' and the parenthesised print are valid on both
            # Python 2.6+ and Python 3.
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                # The error carried fewer than two args; fall back to str().
                print("MySQL Error: %s" % str(e))
    def __init__(self, source=0, test=0, limit=0, *args, **kwargs):
        """Store the run parameters, start the report summary and connect.

        Args:
            source: Source code to process; the process exits when it is
                left at the default of 0.
            test: Test flag, stored as-is and echoed in the report summary.
            limit: Limit value, stored as-is on the instance.
            *args: Forwarded to the parent class initialiser.
            **kwargs: Forwarded to the parent class initialiser.
        """
        super(NewProductSpider, self).__init__(*args, **kwargs)
        self.source = source
        self.test = test
        self.limit = limit
        # Store the source, test flag and UTC date in the report summary.
        self.report_summary = [
            "Source: %s" % source,
            "Test: %s" % test,
            "Date: %s" % (
                datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')),
        ]
        # Abort in case no source code was passed along.
        if self.source == 0:
            self.logger.critical("No source code was passed along!")
            sys.exit()

        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()
        except MySQLdb.Error as e:
            # 'except ... as' and the parenthesised print are valid on both
            # Python 2.6+ and Python 3.
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                # The error carried fewer than two args; fall back to str().
                print("MySQL Error: %s" % str(e))
Ejemplo n.º 3
0
    def count_new_thumbs(self):
        """Open a DB connection and cursor and keep them on the instance.

        On a MySQLdb error the message is printed and the method returns
        without setting (or updating) ``self.conn`` / ``self.cursor``.
        """
        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()
        except MySQLdb.Error as e:
            # 'except ... as' and the parenthesised print are valid on both
            # Python 2.6+ and Python 3.
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                # The error carried fewer than two args; fall back to str().
                print("MySQL Error: %s" % str(e))
Ejemplo n.º 4
0
    def item_completed(self, results, item, info):
        """Pick the target tables from ``item['test']`` and open a DB connection.

        Args:
            results: Not used in the visible body of this method.
            item: Mapping that provides the ``'test'`` flag (int or str).
            info: Not used in the visible body of this method.
        """
        # The flag may arrive either as an int or as its string form.
        # Any other value leaves the table attributes untouched.
        test_flag = item['test']
        if test_flag in (1, '1'):
            self.urls_table = 'product_urls'
            self.details_table = 'product_details'
        elif test_flag in (0, '0'):
            self.urls_table = 'product_urls_SCRAPY'
            self.details_table = 'product_details_SCRAPY'

        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()
        except MySQLdb.Error as e:
            # 'except ... as' and the parenthesised print are valid on both
            # Python 2.6+ and Python 3.
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                # The error carried fewer than two args; fall back to str().
                print("MySQL Error: %s" % str(e))
    def start_requests(self):
        """Derive the update window from ``self.group`` and open a DB connection.

        ``self.updatetime`` is set to "now minus the group's window", in
        seconds since the epoch. A group value other than 0, 1, 2 or 9
        leaves it unset unless the test override below applies.
        """
        # Products are divided in different groups:
        # In Stock, Out of Stock and Not Active (urls.status == 4).
        # NOTE(review): 'stock' and 'status' are not consumed in the visible
        # part of this method -- presumably used by a later query; confirm.
        group = self.group

        if group == 0 or group == '0':
            # In stock
            stock = 0  # 0 is in stock
            status = 1
            self.updatetime = time.time() - 172800  # seconds / 48 hours
        elif group == 1 or group == '1':
            # Out of stock
            stock = 1
            status = 1
            self.updatetime = time.time() - 604800  # seconds / 7 days
        elif group == 2 or group == '2':
            # Not active
            status = 4
            self.updatetime = time.time() - 2592000  # seconds / 30 days
        elif group == 9 or group == '9':
            # Faster update sequence
            stock = 0  # 0 is in stock
            status = 1
            self.updatetime = time.time() - 7200  # seconds / 2 hours

        # Overwrite the window to 1 second when self.test is 0.
        # NOTE(review): test == 0 apparently marks a test run -- confirm.
        if (self.test == 0) or (self.test == '0'):
            self.updatetime = time.time() - 1  # 1 second!

        # Starting the DB connection
        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()
        except MySQLdb.Error as e:
            # 'except ... as' and the parenthesised print are valid on both
            # Python 2.6+ and Python 3.
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                # The error carried fewer than two args; fall back to str().
                print("MySQL Error: %s" % str(e))
Ejemplo n.º 6
0
    def item_completed(self, results, item, info):
        """Log the test flag, pick the target tables and open a DB connection.

        Args:
            results: Not used in the visible body of this method.
            item: Mapping that provides the ``'test'`` flag (int or str).
            info: Not used in the visible body of this method.
        """
        # Setting the test values for the pipeline
        test_flag = item['test']
        logging.info("We are testing if 0 and real if 1: %s", test_flag)

        # The flag may arrive either as an int or as its string form.
        # Any other value leaves the table attributes untouched.
        if test_flag in (1, '1'):
            self.urls_table = 'product_urls'
            self.details_table = 'product_details'
        elif test_flag in (0, '0'):
            self.urls_table = 'product_urls_SCRAPY'
            self.details_table = 'product_details_SCRAPY'

        # Starting the DB connection
        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()
        except MySQLdb.Error as e:
            # 'except ... as' and the parenthesised print are valid on both
            # Python 2.6+ and Python 3.
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                # The error carried fewer than two args; fall back to str().
                print("MySQL Error: %s" % str(e))
    def closed(self, reason):
        """Finish the report summary, log it and store the run log in MySQL.

        Appends the page and product counters to the report summary, logs
        the summary, reads the file named by the LOG_FILE setting and
        inserts one row into the ``scrapy_logs`` table.

        Args:
            reason: Close reason supplied by the caller; not used here.
        """
        # Add the number of pages and products scraped to the summary.
        self.report_summary.append("# Pages scraped: %d" % self.page_counter)
        self.report_summary.append("# Products scraped: %d" %
                                   (self.product_counter))
        # Pretty-print the crawler statistics for storage.
        stats = self.crawler.stats.get_stats()
        stats = pprint.pformat(stats)
        # Join all the report summary strings together, one per line,
        # and log the result.
        s = '\n'.join(self.report_summary)
        self.logger.info(s)

        # Retrieve the content of the log file. The context manager
        # guarantees the handle is closed even when reading fails.
        log_path = self.settings.get('LOG_FILE')
        with open('%s' % log_path, 'r') as log_file:
            file_content = log_file.read()

        # Store the log to our mysql db.
        # Start the db connection through the custom module.
        conn = mysql_connection.setup_conn()
        try:
            cursor = conn.cursor()
            # Parameterised insert: the driver handles quoting of the values.
            cursor.execute(
                "INSERT INTO `scrapy_logs` "
                "(`spider`, `test`,`log_date`, `log_file`, `stats`, `short_msg`, `long_msg`) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s)",
                ('spider_new_urls', self.test, time.time(), log_path, stats, s,
                 file_content))
            conn.commit()
        finally:
            # Close the db connection when done, even if the insert failed.
            conn.close()
 def __init__(self, source=0, test=0, limit=0, *args, **kwargs):
     """Store the run parameters, start the report summary and connect.

     Args:
         source: Source code to process; ``CloseSpider`` is raised when it
             is left at the default of 0.
         test: Test flag, stored as-is and echoed in the report summary.
         limit: Limit value, stored as-is on the instance.
         *args: Forwarded to the parent class initialiser.
         **kwargs: Forwarded to the parent class initialiser.

     Raises:
         CloseSpider: If no source code was passed along.
     """
     super(NewURLsSpider, self).__init__(*args, **kwargs)
     self.source = source
     self.test = test
     self.limit = limit

     # Store the source, test flag and UTC date in the report summary.
     self.report_summary = [
         "Source: %s" % source,
         "Test: %s" % test,
         "Date: %s" % (
             datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')),
     ]
     # Page counter starts at 1, product counter at 0.
     self.page_counter = 1
     self.product_counter = 0
     # Abort in case no source code was passed along.
     if self.source == 0:
         self.logger.critical("No source code was passed along!")
         raise CloseSpider('No source.')
     # Set the xpath dictionary, which stores all the information per
     # source in the custom module called product_scrape_xpaths.
     self.xpath_dict = product_scrape_xpaths.get_dict()

     try:
         # Start the db connection through the custom module.
         self.conn = mysql_connection.setup_conn()
         self.cursor = self.conn.cursor()
     except MySQLdb.Error as e:
         # 'except ... as' and the parenthesised print are valid on both
         # Python 2.6+ and Python 3.
         try:
             print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
         except IndexError:
             # The error carried fewer than two args; fall back to str().
             print("MySQL Error: %s" % str(e))