import os
import urllib
from datetime import datetime

# dealSelector, moonParser, sdParser, sdImageParser, walletParser and
# fetch_slick_images are defined elsewhere in this project.


def train_old_title(old_title_hash, new_deal_hash, filtered_deal_hash):
    """Re-run the deal selector over previously seen titles."""
    selector = dealSelector()
    for title in old_title_hash:
        if selector.checkDeal(title):
            # Keep the deal: its URL becomes the first list entry.
            new_deal_hash[title] = [old_title_hash[title]]
        else:
            # Remember which rule rejected the title.
            filtered_deal_hash[title] = selector.filter_rule
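# A minimal, hypothetical illustration of the dict shapes these functions
# share: titles map to a list whose first entry is the deal URL, and
# filtered titles map to the rule that rejected them. The title and URL
# below are made up for illustration only.
#
#   old_title_hash = {"50% off ACME widget": "http://example.com/deal/1"}
#   new_deal_hash, filtered_deal_hash = {}, {}
#   train_old_title(old_title_hash, new_deal_hash, filtered_deal_hash)
#   # kept:     new_deal_hash["50% off ACME widget"] == ["http://example.com/deal/1"]
#   # rejected: filtered_deal_hash[title] == the selector's filter_rule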
def fetch_new_title_moon(old_title_hash, new_deal_hash, new_title_hash, filtered_deal_hash):
    print "fetch_new_title_moon"
    selector = dealSelector()
    hasNewTitle = True
    index = 1

    # Timestamp of the form YYYYMMDDHH, used to group this run's raw pages.
    cur = datetime.now()
    timestamp = "%i" % (((cur.year * 100 + cur.month) * 100 + cur.day) * 100 + cur.hour)
    dirname = "../data/dealmoon/%s" % timestamp
    try:
        os.mkdir(dirname)
    except OSError:
        pass  # directory already exists

    # Walk the listing pages until a page yields no unseen titles.
    while hasNewTitle and index <= 30:
        print "\tprocessing %i" % index
        url = 'http://www.dealmoon.com/%i' % index
        if index == 1:
            url = 'http://www.dealmoon.com/'
        html = urllib.urlopen(url).read()

        # Archive the raw HTML so the page can be re-parsed offline.
        filename = "../data/dealmoon/%s/%i.html" % (timestamp, index)
        output = open(filename, "w")
        output.write(html)
        output.close()

        parser = moonParser()
        parser.feed(html)

        hasNewTitle = False
        for title in parser.promo_hash:
            if title not in old_title_hash:
                hasNewTitle = True
                url = parser.promo_hash[title][0]
                new_title_hash[title] = url
                old_title_hash[title] = url
                if selector.checkDeal(title):
                    # Entry layout: [deal_url, price, was_price, images...].
                    # Dealmoon pages carry no price fields, so pad with "".
                    new_deal_hash[title] = [url, "", ""]
                    new_deal_hash[title].extend(parser.promo_hash[title][1:])
                else:
                    filtered_deal_hash[title] = selector.filter_rule
        index += 1
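# The nested-multiplication timestamp above is just an arithmetic spelling
# of strftime("%Y%m%d%H"): for 2013-04-07 09:xx,
# (((2013 * 100 + 4) * 100 + 7) * 100 + 9) == 2013040709, the same digits
# strftime produces. An equivalent one-liner, shown as a sketch only:
#
#   timestamp = datetime.now().strftime("%Y%m%d%H")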
def fetch_new_title_slick(old_title_hash, new_deal_hash, new_title_hash, filtered_deal_hash):
    print "fetch_new_title_slick"
    selector = dealSelector()
    hasNewTitle = True
    index = 1

    cur = datetime.now()
    timestamp = "%i" % (((cur.year * 100 + cur.month) * 100 + cur.day) * 100 + cur.hour)
    dirname = "../data/slickdeal/%s" % timestamp
    try:
        os.mkdir(dirname)
    except OSError:
        pass  # directory already exists

    while hasNewTitle and index <= 30:
        print "\tprocessing %i" % index
        url = 'http://slickdeals.net/forums/forumdisplay.php?f=9&page={0}&order=desc&sort=lastpost'.format(index)
        web = urllib.urlopen(url)
        html = web.read()
        web.close()

        filename = "../data/slickdeal/%s/%i.html" % (timestamp, index)
        output = open(filename, "w")
        output.write(html)
        output.close()

        parser = sdParser()
        parser.feed(html)

        hasNewTitle = False
        for title in parser.promo_hash:
            if title not in old_title_hash:
                hasNewTitle = True
                # Record the thread link (first promo_hash entry), not the
                # listing-page URL, for both title hashes.
                url = parser.promo_hash[title][0]
                new_title_hash[title] = url
                old_title_hash[title] = url
                if selector.checkDeal(title):
                    # Thread links are relative; prefix the site root.
                    real_url = "http://slickdeals.net" + url
                    new_deal_hash[title] = [real_url, "", ""]
                    for image in fetch_slick_images(real_url):
                        new_deal_hash[title].append(image)
                else:
                    filtered_deal_hash[title] = selector.filter_rule
        index += 1
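# fetch_slick_images() is defined elsewhere in this project. A plausible
# minimal sketch is below, assuming sdImageParser collects <img> sources
# from a thread page into an image_list attribute (the attribute name and
# body are assumptions, not the project's actual implementation):
#
#   def fetch_slick_images(real_url):
#       html = urllib.urlopen(real_url).read()
#       parser = sdImageParser()
#       parser.feed(html)
#       return parser.image_list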
def fetch_new_title_wallet(old_title_hash, new_deal_hash, new_title_hash, filtered_deal_hash):
    print "fetch_new_title_wallet"
    selector = dealSelector()
    hasNewTitle = True
    index = 1

    cur = datetime.now()
    timestamp = "%i" % (((cur.year * 100 + cur.month) * 100 + cur.day) * 100 + cur.hour)
    dirname = "../data/wallet/%s" % timestamp
    try:
        os.mkdir(dirname)
    except OSError:
        pass  # directory already exists

    # FatWallet's grid view runs to far fewer pages than the forums above.
    while hasNewTitle and index <= 8:
        print "\tprocessing %i" % index
        url = 'http://www.fatwallet.com/?liststyle=grid&page=%i' % index
        html = urllib.urlopen(url).read()

        filename = "../data/wallet/%s/%i.html" % (timestamp, index)
        output = open(filename, "w")
        output.write(html)
        output.close()

        parser = walletParser()
        parser.feed(html)

        hasNewTitle = False
        for title in parser.promo_hash:
            if title not in old_title_hash:
                hasNewTitle = True
                url = parser.promo_hash[title][0]
                new_title_hash[title] = url
                old_title_hash[title] = url
                if selector.checkDeal(title):
                    # promo_hash layout: [url, price, was_price, images...]
                    new_deal_hash[title] = [url,
                                            parser.promo_hash[title][1],  # price
                                            parser.promo_hash[title][2]]  # was_price
                    new_deal_hash[title].extend(parser.promo_hash[title][3:])
                else:
                    filtered_deal_hash[title] = selector.filter_rule
        index += 1
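# After a FatWallet pass, each kept entry in new_deal_hash has the shape
# sketched below (all values illustrative, not real data):
#
#   new_deal_hash["ACME widget $9.99"] = [
#       "http://www.fatwallet.com/...",  # deal URL
#       "$9.99",                         # price
#       "$19.99",                        # was_price
#       "http://.../img1.jpg",           # zero or more image URLs
#   ]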
def train_old_web(run_dir, new_deal_hash, new_title_hash, filtered_deal_hash):
    """Re-run the selector over archived Slickdeals pages under run_dir."""
    selector = dealSelector()
    # os.walk already descends into every subdirectory, so a single walk
    # covers the whole tree; no explicit recursion is needed.
    for curdir, dirnames, filenames in os.walk(run_dir):
        for filename in filenames:
            fullname = "%s/%s" % (curdir, filename)
            print "processing %s" % fullname
            infile = open(fullname, "r")
            html = infile.read()
            infile.close()
            parser = sdParser()
            parser.feed(html)
            for title in parser.promo_hash:
                if selector.checkDeal(title):
                    new_deal_hash[title] = parser.promo_hash[title]
                else:
                    filtered_deal_hash[title] = selector.filter_rule
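# A minimal driver sketch showing how these functions compose. Loading and
# saving old_title_hash between runs is handled elsewhere in the project
# and omitted here.
if __name__ == "__main__":
    old_titles, new_deals, new_titles, filtered = {}, {}, {}, {}
    train_old_title(old_titles, new_deals, filtered)
    fetch_new_title_moon(old_titles, new_deals, new_titles, filtered)
    fetch_new_title_slick(old_titles, new_deals, new_titles, filtered)
    fetch_new_title_wallet(old_titles, new_deals, new_titles, filtered)
    print "kept %i deals, filtered %i titles" % (len(new_deals), len(filtered))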