def updateRegularProducts(self, productRegular):
    """Store a new price record for every regular product that is new or repriced.

    All rows of ``product_price`` are loaded and reduced (through
    ``Utils.deleteDuplicates``) so that the first row kept for each product
    id is its most recent one. A product from ``productRegular`` is inserted
    when its id is not found or when its price differs from that row.

    :param productRegular: iterable of dicts with the keys ``id``, ``date``,
        ``price``, ``category`` and ``url``.

    On a database error the message is printed and the process exits with
    status 1.
    """
    print('Updating prices for regular products...')
    con = None
    try:
        config = Utils.getConfig()
        con = mdb.connect(config['host'], config['user'], config['passwd'], config['dbname'])
        cur = con.cursor(mdb.cursors.DictCursor)

        # load the whole price history and time the query
        start = time.time()
        cur.execute("SELECT * FROM product_price")
        productRegularDB = cur.fetchall()
        end = time.time()
        print("Query time: {}".format(end - start))

        # Two stable sorts: newest first, then by product id, so the first
        # row of every id group is the most recent record of that product.
        productRegularDB = sorted(productRegularDB, key=lambda k: k['date'], reverse=True)
        productRegularDB = sorted(productRegularDB, key=lambda k: k['id'])
        # NOTE(review): presumably keeps the first row per id -- confirm in Utils.
        Utils.deleteDuplicates(productRegularDB)
        idArray = Utils.buildArray(productRegularDB)

        for product in productRegular:
            index = Utils.binarySearch(idArray, product['id'])
            # insert a new product record if it hasn't occurred before or the price has changed
            if index == -1 or productRegularDB[index]['price'] != product['price']:
                # Values come from scraped pages, so let the driver quote them
                # instead of formatting them into the SQL text.
                cur.execute(
                    "INSERT INTO product_price (id, date, price, category, url) "
                    "VALUES (%s, %s, %s, %s, %s)",
                    (product['id'], product['date'], product['price'],
                     product['category'], product['url']))

        # DB-API connections do not autocommit by default; without this the
        # inserts may be discarded when the connection goes away.
        con.commit()
    except mdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
    finally:
        if con is not None:
            con.close()
def getProducts(self):
    """Collect the products of all subcategories and write the results to disk.

    The subcategory JSON file is loaded and handed to ``getProductsByCat``.
    Afterwards ``self.items`` is ordered by product id, the subcategory data
    is written back to its file, duplicates are removed from ``self.items``
    and the item list is saved to the product file.
    """
    with open(Utils.getConfig()['subcatFile']) as subcatHandle:
        subcategories = json.load(subcatHandle)

    # NOTE(review): presumably fills self.items and may update the
    # subcategory data (it is saved back below) -- confirm in getProductsByCat.
    self.getProductsByCat(subcategories)

    orderedItems = sorted(self.items, key=lambda item: item['id'])
    self.items = orderedItems

    Utils.saveJsonFile(Utils.getConfig()['subcatFile'], subcategories)

    Utils.deleteDuplicates(self.items)
    Utils.saveJsonFile(Utils.getConfig()['productFile'], self.items)
def getProducts(self):
    """Crawl every subcategory page by page and save the collected products.

    The subcategory list is read from the configured ``subcatFile`` into
    ``self.jsonData``. For each category, listing pages
    ``<url>/I-Page<N>_40`` are parsed for N = 1, 2, ... until a page cannot
    be parsed. Finally ``self.items`` is sorted by id, de-duplicated and
    written to the configured ``productFile``.
    """
    with open(Utils.getConfig()['subcatFile']) as json_file:
        self.jsonData = json.load(json_file)

    for cat in self.jsonData:
        print(cat['url'])
        page = 0
        while True:
            page += 1
            url = cat['url'].encode('utf-8') + "/I-Page{}_40".format(page)
            print(url)
            if not self._parsePage(cat['subId'], url):
                break

    self.items = sorted(self.items, key=lambda k: k['id'])
    Utils.deleteDuplicates(self.items)
    Utils.saveJsonFile(Utils.getConfig()['productFile'], self.items)

def _parsePage(self, subId, url, attempts=2):
    """Parse one listing page, retrying on HTTP 5xx; return False to stop paging.

    :param subId: subcategory id passed through to ``self.parse``.
    :param url: address of the listing page.
    :param attempts: total number of tries for a page that answers with 5xx.
    :returns: True when the page was parsed, False when the category's
        pagination should end (non-5xx HTTP error, IndexError, or the
        retries were used up).
    """
    for _ in range(attempts):
        try:
            self.parse(subId, url)
            return True
        except urllib2.HTTPError as httpError:
            print(httpError)
            # Only server-side errors are worth another try; anything else
            # ends this category.
            if not 500 <= httpError.code < 600:
                return False
        except IndexError:
            # NOTE(review): presumably raised by parse() once a page holds no
            # products, i.e. the last page was passed -- confirm in parse().
            return False
    # The retry happens inside the loop, so a repeated failure ends the
    # category instead of escaping as an unhandled exception.
    return False
def preparePromoList(self):
    """Write the promoted products to the markdown file and collect them.

    Rows of ``product_promo`` whose operation is not
    ``ProcessData.PROD_WITHDRAW`` and whose ``last_date`` is later than
    ``self.dateTime`` are processed per subcategory of ``self.subcatData``.
    For each row the product page is downloaded to read the product name,
    stock state, available sizes and a target-group guess (M/F/J/U).
    Products without a name element or marked out of stock are skipped.

    For every remaining product one markdown line is printed to
    ``self.mdFile`` and a dict is appended to ``self.products`` with the
    keys ``sz`` (sizes), ``sx`` (target group), ``nm`` (HTML link), ``sc``
    (category path), ``pr``, ``op``, ``dc``, ``pp`` (price, old price,
    discount, previous price), ``or`` (operation description) and, when a
    lower price exists in ``product_price``, ``rp``/``rd`` (that price and
    its date).

    On a database error the message is printed and the process exits with
    status 1.
    """
    male = ["męsk", "mesk"]
    female = ["damsk", "kobie"]
    junior = ["junior", "dziec"]
    juniorYear = ["lat", "ans"]

    def containsAny(keywords, *texts):
        # True when any keyword is a substring of any of the given texts.
        return any(keyword in text for text in texts for keyword in keywords)

    con = None
    try:
        con = mdb.connect(
            Utils.getConfig()["host"],
            Utils.getConfig()["user"],
            Utils.getConfig()["passwd"],
            Utils.getConfig()["dbname"],
        )
        cur = con.cursor(mdb.cursors.DictCursor)

        # get all products from DB
        cur.execute("SELECT * FROM product_price")
        productPriceDB = cur.fetchall()
        productPriceDB = sorted(productPriceDB, key=lambda k: (k["date"], k["price"]))
        Utils.deleteDuplicates(productPriceDB)

        # get promoted products newer than self.dateTime but skip discontinued items
        cur.execute(
            'SELECT * FROM product_promo WHERE operation != {} AND last_date > "{}" ORDER BY '
            "operation ASC, discount DESC".format(ProcessData.PROD_WITHDRAW, self.dateTime)
        )
        productPromoDB = cur.fetchall()
        print(
            "#decapromolist lista promowanych produktów (delta {}-{}):".format(
                self.datePrevProcFormatted, self.dateFormatted
            ),
            file=self.mdFile,
        )

        siteURL = Utils.getConfig()["siteURL"]

        for cat in self.subcatData:
            # skip subcategories without any promoted product
            if not any(row["category"] == cat["subId"] for row in productPromoDB):
                continue
            categoryPath = cat["name"].encode("utf-8") + "->" + cat["subName"].encode("utf-8")
            # header line; printed lazily right before the first listed product
            catStr = "\nKategoria: " + categoryPath

            # process promoted items
            for row in productPromoDB:
                # when a product doesn't belong to considered subcategory skip to the next one
                if row["category"] != cat["subId"]:
                    continue
                product = {}

                url = siteURL + row["url"]
                page = urllib2.urlopen(url)
                try:
                    content = page.read()
                finally:
                    # release the HTTP connection as soon as the body is read
                    page.close()
                tree = html.fromstring(content)

                # text between the first "tc_vars" and the following "/*",
                # used as an extra source of target-group keywords
                nameCheck = ""
                namePosStart = content.find("tc_vars")
                if namePosStart != -1:
                    namePosEnd = content.find("/*", namePosStart)
                    nameCheck = content[namePosStart:namePosEnd].lower()

                # get the product name
                try:
                    name = tree.xpath('//span[@id="productName"]')[0].text
                    if name == name.upper():
                        # all-caps names are converted to title case
                        name = name.title().encode("utf-8")
                    else:
                        name = name.encode("utf-8")
                    print(name + " " + url.encode("utf-8"))
                except IndexError:
                    # row is a dict, so it has to be formatted rather than
                    # concatenated to the message
                    print("Invalid product: {}".format(row))
                    continue

                # when a product is out of stock then skip to the next one
                if tree.xpath('//link[@href="http://schema.org/OutOfStock"]'):
                    print("Out of stock")
                    continue

                # quite vague method to determine the sex
                # (in most cases it works just fine, i.e. when the description is correct)
                label = ""
                labelPosStart = content.find('tc_vars["product_breadcrumb_label"]')
                if labelPosStart != -1:
                    labelPosEnd = content.find('");', labelPosStart)
                    # NOTE(review): 49 presumably skips the variable name and
                    # the assignment prefix -- confirm against the page source.
                    label = content[labelPosStart + 49 : labelPosEnd].lower()
                nameLower = name.lower()
                if containsAny(male, label, nameLower, nameCheck):
                    sex = "M"
                elif containsAny(female, label, nameLower, nameCheck):
                    sex = "F"
                elif containsAny(junior, label, nameLower, nameCheck):
                    sex = "J"
                else:
                    sex = "U"

                # get list of available sizes
                sizes = [
                    size.xpath("a")[0].text.strip()
                    for size in tree.xpath('//li[@class=" available"]')
                ]
                sizeList = ", ".join(sizes)
                # sizes expressed in years mark a junior product
                if containsAny(juniorYear, sizeList.lower()):
                    sex = "J"

                operationDescr = self.operationToDescr(row["operation"], str(row["prev_price"]))
                # discounts of 60% and more are printed in bold
                highlight = row["discount"] >= 60

                text = "{} [{}]({}) ".format(sex, name, url)
                if highlight:
                    text += "**"
                text = text + "{}->{} ({}%) [{}]".format(
                    row["old_price"], row["price"], row["discount"], operationDescr
                )
                if highlight:
                    text += "**"
                if sizes:
                    try:
                        text = text + " [Rozmiary: {}]".format(sizeList)
                    except UnicodeEncodeError:
                        # best effort: leave the sizes out of the markdown
                        # line when they cannot be formatted
                        pass

                product["sz"] = " " + sizeList
                product["sx"] = sex
                product["nm"] = '<a href="' + url.encode("utf-8") + '">' + name + "</a>"
                product["sc"] = categoryPath
                product["pr"] = row["price"]
                product["op"] = row["old_price"]
                product["dc"] = row["discount"]
                product["pp"] = row["prev_price"]
                product["or"] = operationDescr

                # additional check to be sure that the current price is the lowest to this day
                # (checking "price history")
                prodLowestPrice = next(
                    (
                        entry
                        for entry in productPriceDB
                        if entry["id"] == row["id"] and entry["price"] < row["price"]
                    ),
                    None,
                )
                if prodLowestPrice is not None:
                    text = text + " [Regularna cena była niższa {} w dn. {}]".format(
                        prodLowestPrice["price"], prodLowestPrice["date"]
                    )
                    product["rp"] = prodLowestPrice["price"]
                    product["rd"] = prodLowestPrice["date"].strftime("%d.%m.%Y")

                if catStr != "":
                    print(catStr + self.SPACES, file=self.mdFile)
                    catStr = ""
                print(text + self.SPACES, file=self.mdFile)
                self.products.append(product)
    except mdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
    finally:
        if con is not None:
            con.close()