def closed(self, reason):
    """Log the report summary, read the log file and open a MySQL connection.

    The entries of ``self.report_summary`` are joined with newlines and
    logged, the file named by the ``LOG_FILE`` setting is read, and a
    connection plus cursor are stored on ``self.conn`` / ``self.cursor``.

    ``reason`` is accepted but not used by the code in this block.

    A ``MySQLdb.Error`` raised while connecting is printed and swallowed;
    in that case ``self.conn`` / ``self.cursor`` are not (re)assigned here.
    """
    # Pretty-printed crawler statistics.
    # NOTE(review): `stats` and `file_content` are not consumed within this
    # block -- presumably they are stored to the database afterwards; confirm.
    stats = self.crawler.stats.get_stats()
    stats = pprint.pformat(stats)

    # Join all the report summary strings with a new line in between
    # and log the result.
    s = '\n'.join(self.report_summary)
    logging.info(s)

    # Retrieve the content of the log file. The context manager guarantees
    # the handle is released even when read() fails.
    log_path = self.settings.get('LOG_FILE')
    with open('%s' % log_path, 'r') as log_file:
        file_content = log_file.read()

    try:
        # Start the db connection through the custom module.
        self.conn = mysql_connection.setup_conn()
        self.cursor = self.conn.cursor()
    except MySQLdb.Error as e:
        # MySQLdb errors normally carry (code, message); fall back to the
        # plain string form when fewer than two args are present.
        if len(e.args) > 1:
            message = "MySQL Error [%d]: %s" % (e.args[0], e.args[1])
        else:
            message = "MySQL Error: %s" % str(e)
        print(message)
def __init__(self, source = 0, test = 0, limit = 0, *args, **kwargs):
    """Set up the spider arguments, the report summary and the db connection.

    ``source``, ``test`` and ``limit`` are stored on the instance as given;
    remaining positional/keyword arguments go to the parent initializer.

    When ``source`` equals 0 (the default) a critical message is logged and
    ``sys.exit()`` is called. A ``MySQLdb.Error`` raised while connecting is
    printed and swallowed.
    """
    super(NewProductSpider, self).__init__(*args, **kwargs)

    self.source, self.test, self.limit = source, test, limit

    # Report summary header: source, test flag and the current UTC time.
    self.report_summary = [
        "Source: %s" % source,
        "Test: %s" % test,
        "Date: %s" % (datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')),
    ]

    # A source code is mandatory; bail out when none was passed along.
    if self.source == 0:
        self.logger.critical("No source code was passed along!")
        sys.exit()

    try:
        # The connection is created by the custom mysql_connection module.
        self.conn = mysql_connection.setup_conn()
        self.cursor = self.conn.cursor()
    except MySQLdb.Error as e:
        # Prefer the "[code]: message" form when both args are available.
        if len(e.args) > 1:
            message = "MySQL Error [%d]: %s" % (e.args[0], e.args[1])
        else:
            message = "MySQL Error: %s" % str(e)
        print(message)
def count_new_thumbs(self):
    """Open a MySQL connection and cursor on ``self.conn`` / ``self.cursor``.

    A ``MySQLdb.Error`` raised while connecting is printed and swallowed.
    """
    try:
        # The connection is created by the custom mysql_connection module.
        self.conn = mysql_connection.setup_conn()
        self.cursor = self.conn.cursor()
    except MySQLdb.Error as e:
        # Prefer the "[code]: message" form when both args are available.
        if len(e.args) > 1:
            message = "MySQL Error [%d]: %s" % (e.args[0], e.args[1])
        else:
            message = "MySQL Error: %s" % str(e)
        print(message)
def item_completed(self, results, item, info):
    """Pick the target tables from ``item['test']`` and open the db connection.

    A test value of 1 (or '1') selects ``product_urls`` / ``product_details``;
    0 (or '0') selects the ``_SCRAPY`` variants. Any other value leaves the
    table attributes untouched. ``results`` and ``info`` are not used by the
    code in this block.

    A ``MySQLdb.Error`` raised while connecting is printed and swallowed.
    """
    test_flag = item['test']
    if test_flag in (1, '1'):
        self.urls_table, self.details_table = (
            'product_urls', 'product_details')
    if test_flag in (0, '0'):
        self.urls_table, self.details_table = (
            'product_urls_SCRAPY', 'product_details_SCRAPY')

    try:
        # The connection is created by the custom mysql_connection module.
        self.conn = mysql_connection.setup_conn()
        self.cursor = self.conn.cursor()
    except MySQLdb.Error as e:
        # Prefer the "[code]: message" form when both args are available.
        if len(e.args) > 1:
            message = "MySQL Error [%d]: %s" % (e.args[0], e.args[1])
        else:
            message = "MySQL Error: %s" % str(e)
        print(message)
def start_requests(self):
    """Derive the update threshold for the product group and connect to the db.

    ``self.group`` selects ``stock`` / ``status`` and how far back
    ``self.updatetime`` lies from now:

    * 0 -- in stock, status 1, 48 hours
    * 1 -- out of stock, status 1, 7 days
    * 2 -- not active (status 4), 30 days; ``stock`` is not set
    * 9 -- faster update sequence: in stock, status 1, 2 hours

    When ``self.test`` is 0 (or '0') the threshold is overwritten to one
    second ago. A ``MySQLdb.Error`` raised while connecting is printed and
    swallowed.
    """
    hour = 3600
    day = 24 * hour
    group = self.group

    # NOTE(review): `stock` and `status` are not consumed within this block
    # -- presumably a later query uses them; confirm.
    if group in (0, '0'):
        # In stock (stock value 0 means in stock).
        stock, status = 0, 1
        self.updatetime = time.time() - 2 * day
    if group in (1, '1'):
        # Out of stock.
        stock, status = 1, 1
        self.updatetime = time.time() - 7 * day
    if group in (2, '2'):
        # Not active.
        status = 4
        self.updatetime = time.time() - 30 * day
    if group in (9, '9'):
        # Faster update sequence for in-stock products.
        stock, status = 0, 1
        self.updatetime = time.time() - 2 * hour

    # Overwrite to 1 second for test purposes.
    if self.test in (0, '0'):
        self.updatetime = time.time() - 1

    try:
        # The connection is created by the custom mysql_connection module.
        self.conn = mysql_connection.setup_conn()
        self.cursor = self.conn.cursor()
    except MySQLdb.Error as e:
        # Prefer the "[code]: message" form when both args are available.
        if len(e.args) > 1:
            message = "MySQL Error [%d]: %s" % (e.args[0], e.args[1])
        else:
            message = "MySQL Error: %s" % str(e)
        print(message)
def item_completed(self, results, item, info):
    """Log the test flag, pick the target tables and open the db connection.

    A test value of 1 (or '1') selects ``product_urls`` / ``product_details``;
    0 (or '0') selects the ``_SCRAPY`` variants. Any other value leaves the
    table attributes untouched. ``results`` and ``info`` are not used by the
    code in this block.

    A ``MySQLdb.Error`` raised while connecting is printed and swallowed.
    """
    test_flag = item['test']
    logging.info("We are testing if 0 and real if 1: %s", test_flag)

    if test_flag in (1, '1'):
        self.urls_table, self.details_table = (
            'product_urls', 'product_details')
    if test_flag in (0, '0'):
        self.urls_table, self.details_table = (
            'product_urls_SCRAPY', 'product_details_SCRAPY')

    try:
        # The connection is created by the custom mysql_connection module.
        self.conn = mysql_connection.setup_conn()
        self.cursor = self.conn.cursor()
    except MySQLdb.Error as e:
        # Prefer the "[code]: message" form when both args are available.
        if len(e.args) > 1:
            message = "MySQL Error [%d]: %s" % (e.args[0], e.args[1])
        else:
            message = "MySQL Error: %s" % str(e)
        print(message)
def closed(self, reason):
    """Finish the report summary, log it and store the run in ``scrapy_logs``.

    Appends the page and product counters to ``self.report_summary``, logs
    the joined summary, reads the file named by the ``LOG_FILE`` setting and
    inserts one row (spider name, test flag, timestamp, log path, crawler
    stats, summary and full log content) into the ``scrapy_logs`` table.

    ``reason`` is accepted but not used by the code in this block.

    Errors from reading the log file or from the database are not caught
    here; the file handle and the db connection are released either way.
    """
    # Closing the Selenium Javascript driver is currently disabled:
    # self.driver.close()

    # Add the number of pages and products scraped to the summary.
    self.report_summary.append("# Pages scraped: %d" % self.page_counter)
    self.report_summary.append(
        "# Products scraped: %d" % (self.product_counter))

    # Pretty-printed crawler statistics for the `stats` column.
    stats = pprint.pformat(self.crawler.stats.get_stats())

    # Join all the report summary strings with a new line in between
    # and log the result.
    s = '\n'.join(self.report_summary)
    self.logger.info(s)

    # Retrieve the content of the log file. The context manager guarantees
    # the handle is released even when read() fails.
    log_path = self.settings.get('LOG_FILE')
    with open('%s' % log_path, 'r') as log_file:
        file_content = log_file.read()

    # Start the db connection through the custom module.
    conn = mysql_connection.setup_conn()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO `scrapy_logs` "
            "(`spider`, `test`,`log_date`, `log_file`, `stats`, `short_msg`, `long_msg`) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s)",
            ('spider_new_urls', self.test, time.time(), log_path,
             stats, s, file_content))
        conn.commit()
    finally:
        # Close the db connection when done, also when the insert failed.
        conn.close()
def __init__(self, source = 0, test = 0, limit = 0, *args, **kwargs):
    """Set up spider arguments, counters, xpath dictionary and db connection.

    ``source``, ``test`` and ``limit`` are stored on the instance as given;
    remaining positional/keyword arguments go to the parent initializer.
    The page counter starts at 1 and the product counter at 0.

    When ``source`` equals 0 (the default) a critical message is logged and
    ``CloseSpider('No source.')`` is raised. A ``MySQLdb.Error`` raised while
    connecting is printed and swallowed.
    """
    super(NewURLsSpider, self).__init__(*args, **kwargs)

    self.source, self.test, self.limit = source, test, limit

    # Report summary header: source, test flag and the current UTC time.
    self.report_summary = [
        "Source: %s" % source,
        "Test: %s" % test,
        "Date: %s" % (datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')),
    ]

    # Counters reported in the summary when the spider closes.
    self.page_counter = 1
    self.product_counter = 0

    # A source code is mandatory; stop when none was passed along.
    if self.source == 0:
        self.logger.critical("No source code was passed along!")
        #sys.exit()
        raise CloseSpider('No source.')

    # The xpath dictionary comes from the custom product_scrape_xpaths
    # module, which stores the information per source.
    self.xpath_dict = product_scrape_xpaths.get_dict()

    try:
        # The connection is created by the custom mysql_connection module.
        self.conn = mysql_connection.setup_conn()
        self.cursor = self.conn.cursor()
    except MySQLdb.Error as e:
        # Prefer the "[code]: message" form when both args are available.
        if len(e.args) > 1:
            message = "MySQL Error [%d]: %s" % (e.args[0], e.args[1])
        else:
            message = "MySQL Error: %s" % str(e)
        print(message)