def __init__(self, source=0, test=0, *args, **kwargs):
    super(NewURLsSpider, self).__init__(*args, **kwargs)
    self.source = source
    self.test = test

    # Store the source and date in a report summary variable.
    self.report_summary = []
    self.report_summary.append("Source: %s" % source)
    self.report_summary.append("Test: %s" % test)
    self.report_summary.append(
        "Date: %s" % (datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')))

    # Create a page counter variable.
    self.page_counter = 1
    # Create a product counter variable.
    self.product_counter = 0

    # Return an error in case no source code was passed along.
    if self.source == 0:
        self.logger.critical("No source code was passed along!")
        # sys.exit()
        raise CloseSpider('No source.')

    # Set the xpath dictionary, which stores all the information per source
    # in the custom module called product_scrape_xpaths.
    self.xpath_dict = product_scrape_xpaths.get_dict()
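# A minimal sketch of the structure product_scrape_xpaths.get_dict() is assumed
# to return, based on how it is used in these spiders: a dict keyed by source
# code (as a string), each entry holding per-source settings such as
# 'query_url' and 'na_url' plus the XPath expressions used when parsing.
# All keys and values below are illustrative assumptions, not the actual
# module contents.
def get_dict_sketch():
    return {
        '6': {
            'query_url': '?currency=USD',                        # appended to each product URL (assumed)
            'na_url': 'https://www.example.com/not-available',   # fallback "not available" page (assumed)
            'title': '//h1[@class="product-title"]/text()',      # example XPath (assumed)
            'price': '//span[@class="price"]/text()',            # example XPath (assumed)
        },
        # ... one entry per source code ...
    }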
def start_requests(self):
    # Fetch some URLs from the product_urls table.
    # Pick the table according to Test or Real mode.
    if (self.test == 1) or (self.test == '1'):
        query = ("SELECT `id`, `url` "
                 "FROM `product_urls` "
                 "WHERE `source` = %s AND `status` = 0 "
                 "LIMIT {0} ".format(self.limit))
        self.cursor.execute(query, (self.source,))
    else:
        query = ("SELECT `id`, `url` "
                 "FROM `product_urls_SCRAPY` "
                 "WHERE `source` = %s AND `status` = 0 "
                 "LIMIT {0} ".format(self.limit))
        self.cursor.execute(query, (self.source,))
    rows = self.cursor.fetchall()

    # First store the dictionary with source xpaths in a variable,
    # as obtained from the custom product_scrape_xpaths module.
    self.xpath_dict = product_scrape_xpaths.get_dict()

    # Because of the duplicate filter we need to disable filtering for some
    # sources, because of their "not available" page.
    sources_no_filtering = ['6', '7', '20', '25', '31', '21']
    if self.source in sources_no_filtering:
        no_filter = True
    else:
        no_filter = False

    # Loop through each of the rows, as row, and initiate a scrape for each
    # of them based on the URL. Also pass along the id of the row from the
    # product_urls table as meta data (for later reference), and handle it
    # through the parse function.
    for row in rows:
        # Edit the url with some query parameters, for setting USD or location.
        query = self.xpath_dict[self.source]['query_url']
        request_url = row[1] + query
        yield scrapy.Request(
            url=request_url,
            meta={'id': row[0], 'url': row[1]},
            callback=self.parse,
            dont_filter=no_filter)
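# How a run of this spider is assumed to be kicked off: the -a arguments map
# onto the __init__ keywords. The short-name "ScrapeNewURLs" is a hypothetical
# placeholder, since the `name` attribute of NewURLsSpider is not shown in this
# excerpt:
#
#   scrapy crawl ScrapeNewURLs -a source=6 -a test=1 -a limit=100
#
# The same crawl can be started from Python, assuming NewURLsSpider is
# importable from the project:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(NewURLsSpider, source='6', test='1', limit=100)
process.start()  # blocks until the crawl finishes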
def process_response(self, request, response, spider):
    if (request.meta.get('dont_redirect', False)
            or response.status in getattr(spider, 'handle_httpstatus_list', [])
            or response.status in request.meta.get('handle_httpstatus_list', [])
            or request.meta.get('handle_httpstatus_all', False)):
        return response

    allowed_status = (301, 302, 303, 307)
    # Source 30 (TVC-mall) is returning a 301 without a Location header.
    # This code is not compatible with the ScrapeThumbs and ScrapeOldThumbs
    # spiders, because the spider.source attribute is not available there.
    not_allowed_source = ('30',)
    if (('Location' not in response.headers
            and spider.source not in not_allowed_source)
            or response.status not in allowed_status):
        return response

    if spider.source in not_allowed_source:
        # Get the dict per source with the xpaths.
        self.xpath_dict = product_scrape_xpaths.get_dict()
        location = safe_url_string(self.xpath_dict[spider.source]['na_url'])
    else:
        location = safe_url_string(response.headers['location'])

    redirected_url = urljoin(request.url, location)

    # ADDED 10-11-2016 for 302 redirects to the mobile webpage!
    # if re.match('^http[s]?:\/\/m[\.][^\.]+[\.]com[\.]*', redirected_url) is not None:
    #     redirected_url = redirected_url.replace("//m.", "//www.")

    if response.status in (301, 307) or request.method == 'HEAD':
        redirected = request.replace(url=redirected_url)
        return self._redirect(redirected, request, spider, response.status)

    redirected = self._redirect_request_using_get(request, redirected_url)
    return self._redirect(redirected, request, spider, response.status)
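# For this process_response override to run, the middleware has to be enabled
# in settings.py in place of Scrapy's built-in RedirectMiddleware. A sketch of
# that registration: the module path 'scraper1.middlewares.CustomRedirectMiddleware'
# is a hypothetical placeholder (the real class path is not shown in this
# excerpt); 600 is the priority slot the stock RedirectMiddleware occupies.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,   # disable the built-in one
    'scraper1.middlewares.CustomRedirectMiddleware': 600,               # hypothetical path to this class
}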
def __init__(self, source=0, test=0, limit=0, *args, **kwargs):
    super(NewURLsSpider, self).__init__(*args, **kwargs)
    self.source = source
    self.test = test
    self.limit = limit

    # Store the source and date in a report summary variable.
    self.report_summary = []
    self.report_summary.append("Source: %s" % source)
    self.report_summary.append("Test: %s" % test)
    self.report_summary.append(
        "Date: %s" % (datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')))

    # Create a page counter variable.
    self.page_counter = 1
    # Create a product counter variable.
    self.product_counter = 0

    # Return an error in case no source code was passed along.
    if self.source == 0:
        self.logger.critical("No source code was passed along!")
        # sys.exit()
        raise CloseSpider('No source.')

    # Set the xpath dictionary, which stores all the information per source
    # in the custom module called product_scrape_xpaths.
    self.xpath_dict = product_scrape_xpaths.get_dict()

    try:
        # Start the db connection through the custom module.
        self.conn = mysql_connection.setup_conn()
        self.cursor = self.conn.cursor()
    except MySQLdb.Error as e:
        try:
            print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
        except IndexError:
            print("MySQL Error: %s" % str(e))
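# A minimal sketch of what the custom mysql_connection.setup_conn() helper is
# assumed to do, given the MySQLdb error handling around it: open a MySQLdb
# connection and return it. Host, credentials and database name below are
# placeholders, not the project's real settings.
import MySQLdb


def setup_conn_sketch():
    return MySQLdb.connect(
        host='localhost',   # placeholder
        user='scraper',     # placeholder
        passwd='secret',    # placeholder
        db='products',      # placeholder
        charset='utf8',
        use_unicode=True)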
class UpdateProductSpider(scrapy.Spider):
    # This spider is allowed to handle responses with a 404.
    handle_httpstatus_list = [404]

    custom_settings = {
        'ITEM_PIPELINES': {
            'scraper1.pipelines.pipelines.MySQLUpdateProduct': 100
        }
    }

    # Set the short-name for this spider:
    name = "ScrapeUpdateProduct"

    # Restrict the domain names which the spider can operate on.
    # Make sure to append the list for new sources.
    allowed_domains = [
        "dx.com", "banggood.com", "focalprice.com", "miniinthebox.com",
        "lightinthebox.com", "tmart.com", "gearbest.com", "tinydeal.com",
        "geekbuying.com", "dealsmachine.com", "newfrog.com", "tomtop.com",
        "fasttech.com", "chinavasion.com", "tvc-mall.com", "antelife.com",
        "cafago.com", "chinabuye.com", "dinodirect.com", "sunsky-online.com",
        "cndirect.com", "zapals.com"
    ]

    # Define the initializing function, used to catch the source number
    # passed along while running the spider as an argument (-a).
    def __init__(self, source=0, test=0, limit=0, group=0, cats=0,
                 images=0, descrp=0, *args, **kwargs):
        super(UpdateProductSpider, self).__init__(*args, **kwargs)
        self.source = source
        self.test = test
        self.limit = limit
        self.updateCategories = cats
        self.updateDescriptions = descrp
        self.updateImages = images
        self.group = group

        # Set the right database tables according to whether we are Real or Testing.
        if (self.test == 1) or (self.test == '1'):
            self.urls_table = 'product_urls'
            self.details_table = 'product_details'
        if (self.test == 0) or (self.test == '0'):
            self.urls_table = 'product_urls_SCRAPY'
            self.details_table = 'product_details_SCRAPY'

        # Store the source and date in a report summary variable.
        self.report_summary = []
        self.report_summary.append("Source: %s" % source)
        self.report_summary.append(
            "Date: %s" % (datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')))

        # Return an error in case no source code was passed along.
        if self.source == 0:
            logging.critical("No source code was passed along!")
            sys.exit()

    # Overwrite the default start_requests function, in order to be able to
    # pull (start) URLs from the database and run them concurrently.
    def start_requests(self):
        # Products are divided into different groups:
        # in stock, not in stock and not active (urls.status = 4).

        # In stock
        if (self.group == 0 or self.group == '0'):
            stock = 0  # 0 is in stock
            status = 1
            self.updatetime = time.time() - 172800  # seconds / 48 hours
        # Out of stock
        if (self.group == 1 or self.group == '1'):
            stock = 1
            status = 1
            self.updatetime = time.time() - 604800  # seconds / 7 days
        # Not active
        if (self.group == 2 or self.group == '2'):
            status = 4
            self.updatetime = time.time() - 2592000  # seconds / 30 days
        # For a faster update sequence
        if (self.group == 9 or self.group == '9'):
            stock = 0  # 0 is in stock
            status = 1
            self.updatetime = time.time() - 7200  # seconds / 2 hours
        # Overwrite to 1 second for test purposes.
        if (self.test == 0) or (self.test == '0'):
            self.updatetime = time.time() - 1  # 1 second!

        # Starting the DB connection.
        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()
        except MySQLdb.Error as e:
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                print("MySQL Error: %s" % str(e))

        if (self.updateImages == 1 or self.updateImages == '1'):
            # Different query if we want to scrape new images.
            # DELETE LATER: IMG_STATUS != 9
            query = (
                "SELECT urls.id, urls.url, urls.url_hash, urls.pid "
                "FROM `{0}` As urls "
                "INNER JOIN `{1}` As details "
                "ON urls.pid = details.id "
                "WHERE urls.processing = 0 AND urls.source = %s "
                "AND urls.status = 1 AND urls.img_status != 9 "
                "AND (details.thumbs_extra = 0 OR details.thumbs_extra = 1) "
                "LIMIT {2}".format(self.urls_table, self.details_table,
                                   self.limit))
            logging.debug("We got the Image Query")
            self.cursor.execute(query, (self.source, ))
        # The normal query for active products.
        elif (self.group == '0' or self.group == '1'):
            # For cleaning purposes also take `updated` IS NULL with the query.
            query = (
                "SELECT urls.id, urls.url, urls.url_hash, urls.pid "
                "FROM `{0}` As urls "
                "INNER JOIN `{1}` As details "
                "ON urls.pid = details.id "
                "WHERE urls.processing = 0 AND urls.source = %s "
                "AND urls.status = %s AND details.stock = %s "
                "AND (details.updated < {2} OR details.updated IS NULL) "
                "ORDER BY details.stock "
                "LIMIT {3} ".format(self.urls_table, self.details_table,
                                    self.updatetime, self.limit))
            self.cursor.execute(query, (self.source, status, stock, ))
        # The query for not-active products.
        # NEEDS TO BE UPDATED, DRY!
        else:
            # For cleaning purposes also take `updated` IS NULL with the query.
            query = (
                "SELECT urls.id, urls.url, urls.url_hash, urls.pid "
                "FROM `{0}` As urls "
                "INNER JOIN `{1}` As details "
                "ON urls.pid = details.id "
                "WHERE urls.processing = 0 AND urls.source = %s "
                "AND urls.status = %s "
                "AND (details.updated < {2} OR details.updated IS NULL) "
                "ORDER BY details.stock "
                "LIMIT {3} ".format(self.urls_table, self.details_table,
                                    self.updatetime, self.limit))
            self.cursor.execute(query, (self.source, status, ))

        rows = self.cursor.fetchall()

        if rows:
            # Get a list of the selected IDs. Because of the processing boolean
            # we are able to use multiple spiders at once. Update these IDs in
            # the urls table and SET processing to 1.
            id_list = []
            for row in rows:
                id_list.append(row[0])
            logging.debug("What is the id_list: %s", id_list)
            string_id_list = ','.join(map(str, id_list))
            self.id_list = string_id_list
            query = ("UPDATE `{0}` As urls "
                     "SET urls.processing = 1 "
                     "WHERE urls.source = %s AND urls.id IN ( {1} ) ".format(
                         self.urls_table, string_id_list))
            self.cursor.execute(query, (self.source, ))
            self.conn.commit()

        # Close the db connection when done.
        self.conn.close()

        # Get the dict per source with the xpaths.
        self.xpath_dict = product_scrape_xpaths.get_dict()

        # Because of the duplicate filter we need to disable filtering for some
        # sources, because of their "not available" page.
        # 08-11-2016 added GearBest (28); 27-07-2017 added TVC (30).
        sources_no_filtering = ['6', '7', '20', '22', '25', '28', '30', '31']
        if self.source in sources_no_filtering:
            no_filter = True
        else:
            no_filter = False

        # Loop through each of the rows, as row, and initiate a scrape for
        # each of them based on the URL. Also pass along the id of the row
        # from the product_urls table as meta data (for later reference),
        # and handle it through the parse function.
        for row in rows:
            # Edit the url with some query parameters, for setting USD or location.
            query = self.xpath_dict[self.source]['query_url']
            request_url = row[1] + query
            # 11-08-16 changed dont_redirect to False,
            # because of 301 and 302 "not allowed" issues.
            if re.match('^http[s]?:\/\/www[\.][^\.]+[\.]com[\.]*', row[1]) is not None:
                yield scrapy.Request(url=request_url,
                                     meta={
                                         'id': row[0],
                                         'url_hash': row[2],
                                         'pid': row[3],
                                         'dont_redirect': False
                                     },
                                     callback=self.parse,
                                     dont_filter=no_filter)
            else:
                logging.warning('Request Error on ID: %s for url %s',
                                row[0], request_url)
                continue
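
    # start_requests() marks the selected rows with processing = 1 and keeps
    # self.id_list around, so presumably the flag is reset when the run ends.
    # A sketch of how that could look using Scrapy's closed() hook; the exact
    # reset logic is an assumption and is not shown in this excerpt.
    def closed(self, reason):
        if not getattr(self, 'id_list', None):
            return
        conn = mysql_connection.setup_conn()
        cursor = conn.cursor()
        query = ("UPDATE `{0}` As urls "
                 "SET urls.processing = 0 "
                 "WHERE urls.source = %s AND urls.id IN ( {1} ) ".format(
                     self.urls_table, self.id_list))
        cursor.execute(query, (self.source, ))
        conn.commit()
        conn.close()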