Ejemplo n.º 1
0
def train_old_title(old_title_hash, new_deal_hash, filtered_deal_hash):
    """Sort already-known titles into accepted and filtered deals.

    Every title in ``old_title_hash`` is passed to ``dealSelector.checkDeal``.
    Accepted titles are written to ``new_deal_hash`` as a one-element list
    holding the URL stored for that title; rejected titles are written to
    ``filtered_deal_hash`` together with the selector's ``filter_rule`` as
    read right after the check.  Both output dicts are modified in place and
    nothing is returned.
    """
    deal_filter = dealSelector()
    for title in old_title_hash:
        if not deal_filter.checkDeal(title):
            # Rejected: remember which rule the selector reports.
            filtered_deal_hash[title] = deal_filter.filter_rule
            continue
        new_deal_hash[title] = [old_title_hash[title]]
Ejemplo n.º 2
0
def fetch_new_title_moon(old_title_hash, new_deal_hash, new_title_hash, filtered_deal_hash):
    """Crawl dealmoon.com listing pages and record titles not seen before.

    Pages 1..30 are fetched in order; crawling stops early as soon as a page
    contributes no title that is missing from ``old_title_hash``.  The raw
    HTML of every fetched page is saved under
    ``../data/dealmoon/<YYYYMMDDHH>/<page>.html`` (local time).

    All four dict arguments are updated in place:

    * ``old_title_hash`` / ``new_title_hash`` -- title -> first entry of the
      parser's ``promo_hash[title]`` list (used as the deal URL).
    * ``new_deal_hash`` -- for titles accepted by ``dealSelector.checkDeal``:
      ``[url, "", "", <remaining promo_hash entries>...]``.
    * ``filtered_deal_hash`` -- for rejected titles: the selector's
      ``filter_rule`` as read right after the check.
    """
    print("fetch_new_title_moon")
    selector = dealSelector()
    cur = datetime.now()
    timestamp = "%i" % (((cur.year * 100 + cur.month) * 100 + cur.day) * 100 + cur.hour)
    dirname = "../data/dealmoon/%s" % timestamp
    try:
        os.mkdir(dirname)
    except OSError:
        # Best effort: the directory usually exists already from an earlier
        # run in the same hour.  Any other problem surfaces when the first
        # page file is opened below.
        pass

    hasNewTitle = True
    index = 1
    while hasNewTitle and index <= 30:
        print("\tprocessing %i" % index)
        # The first page lives at the site root rather than at /1.
        if index == 1:
            page_url = 'http://www.dealmoon.com/'
        else:
            page_url = 'http://www.dealmoon.com/%i' % index
        web = urllib.urlopen(page_url)
        try:
            html = web.read()
        finally:
            web.close()
        filename = "%s/%i.html" % (dirname, index)
        with open(filename, "w") as output:
            output.write(html)

        parser = moonParser()
        parser.feed(html)
        hasNewTitle = False
        for title in parser.promo_hash:
            if title in old_title_hash:
                continue
            hasNewTitle = True
            promo = parser.promo_hash[title]
            url = promo[0]
            new_title_hash[title] = url
            old_title_hash[title] = url
            if selector.checkDeal(title):
                # Slots 1 and 2 are empty-string placeholders; everything
                # after the URL in the parser's list is appended unchanged.
                new_deal_hash[title] = [url, "", ""] + list(promo[1:])
            else:
                filtered_deal_hash[title] = selector.filter_rule
        index += 1
Ejemplo n.º 3
0
def fetch_new_title_slick(old_title_hash, new_deal_hash, new_title_hash, filtered_deal_hash):
    """Crawl the slickdeals.net deals forum and record titles not seen before.

    Forum pages 1..30 are fetched in order; crawling stops early as soon as a
    page contributes no title that is missing from ``old_title_hash``.  The
    raw HTML of every fetched page is saved under
    ``../data/slickdeal/<YYYYMMDDHH>/<page>.html`` (local time).

    All four dict arguments are updated in place:

    * ``old_title_hash`` / ``new_title_hash`` -- title -> first entry of the
      parser's ``promo_hash[title]`` list (the site-relative deal link).
    * ``new_deal_hash`` -- for titles accepted by ``dealSelector.checkDeal``:
      ``[absolute_url, "", "", <images from fetch_slick_images>...]``.
    * ``filtered_deal_hash`` -- for rejected titles: the selector's
      ``filter_rule`` as read right after the check.
    """
    # NOTE(review): this object is never used below; it is kept because the
    # constructor's side effects (if any) are not visible from this function.
    imageParser = sdImageParser()
    print("fetch_new_title_slick")
    selector = dealSelector()
    cur = datetime.now()
    timestamp = "%i" % (((cur.year * 100 + cur.month) * 100 + cur.day) * 100 + cur.hour)
    dirname = "../data/slickdeal/%s" % timestamp
    try:
        os.mkdir(dirname)
    except OSError:
        # Best effort: the directory usually exists already from an earlier
        # run in the same hour.  Any other problem surfaces when the first
        # page file is opened below.
        pass

    hasNewTitle = True
    index = 1
    while hasNewTitle and index <= 30:
        print("\tprocessing %i" % index)
        page_url = 'http://slickdeals.net/forums/forumdisplay.php?f=9&page={0}&order=desc&sort=lastpost'.format(index)
        web = urllib.urlopen(page_url)
        try:
            html = web.read()
        finally:
            web.close()
        filename = "%s/%i.html" % (dirname, index)
        with open(filename, "w") as output:
            output.write(html)

        parser = sdParser()
        parser.feed(html)
        hasNewTitle = False
        for title in parser.promo_hash:
            if title in old_title_hash:
                continue
            hasNewTitle = True
            deal_url = parser.promo_hash[title][0]
            # Record the deal's own link for the title.  Previously the
            # assignment to old_title_hash ran before the link was looked up,
            # so it stored the forum page URL (or the previous title's link).
            old_title_hash[title] = deal_url
            new_title_hash[title] = deal_url
            if selector.checkDeal(title):
                real_url = "http://slickdeals.net" + deal_url
                # Slots 1 and 2 are empty-string placeholders; image entries
                # follow from index 3 on.
                deal = [real_url, "", ""]
                deal.extend(fetch_slick_images(real_url))
                new_deal_hash[title] = deal
            else:
                filtered_deal_hash[title] = selector.filter_rule
        index += 1
Ejemplo n.º 4
0
def fetch_new_title_wallet(old_title_hash, new_deal_hash, new_title_hash, filtered_deal_hash):
    """Crawl fatwallet.com grid listing pages and record titles not seen before.

    Pages 1..8 are fetched in order; crawling stops early as soon as a page
    contributes no title that is missing from ``old_title_hash``.  The raw
    HTML of every fetched page is saved under
    ``../data/wallet/<YYYYMMDDHH>/<page>.html`` (local time).

    All four dict arguments are updated in place:

    * ``old_title_hash`` / ``new_title_hash`` -- title -> first entry of the
      parser's ``promo_hash[title]`` list (used as the deal URL).
    * ``new_deal_hash`` -- for titles accepted by ``dealSelector.checkDeal``:
      a list with the same entries as ``promo_hash[title]``.
    * ``filtered_deal_hash`` -- for rejected titles: the selector's
      ``filter_rule`` as read right after the check.

    Raises ``IndexError`` if an accepted title's ``promo_hash`` entry has
    fewer than three elements.
    """
    print("fetch_new_title_wallet")
    selector = dealSelector()
    cur = datetime.now()
    timestamp = "%i" % (((cur.year * 100 + cur.month) * 100 + cur.day) * 100 + cur.hour)
    dirname = "../data/wallet/%s" % timestamp
    try:
        os.mkdir(dirname)
    except OSError:
        # Best effort: the directory usually exists already from an earlier
        # run in the same hour.  Any other problem surfaces when the first
        # page file is opened below.
        pass

    hasNewTitle = True
    index = 1
    while hasNewTitle and index <= 8:
        print("\tprocessing %i" % index)
        page_url = 'http://www.fatwallet.com/?liststyle=grid&page=%i' % index
        web = urllib.urlopen(page_url)
        try:
            html = web.read()
        finally:
            web.close()
        filename = "%s/%i.html" % (dirname, index)
        with open(filename, "w") as output:
            output.write(html)

        parser = walletParser()
        parser.feed(html)
        hasNewTitle = False
        for title in parser.promo_hash:
            if title in old_title_hash:
                continue
            hasNewTitle = True
            promo = parser.promo_hash[title]
            url = promo[0]
            new_title_hash[title] = url
            old_title_hash[title] = url
            if selector.checkDeal(title):
                # Entries 1 and 2 are required (explicit indexing keeps the
                # IndexError for short entries); the rest is optional.
                # NOTE(review): entry 2 appears to be the "was" price, judging
                # by an old debug print -- confirm against walletParser.
                new_deal_hash[title] = [url, promo[1], promo[2]] + list(promo[3:])
            else:
                filtered_deal_hash[title] = selector.filter_rule
        index += 1
Ejemplo n.º 5
0
def train_old_web(run_dir, new_deal_hash, new_title_hash, filtered_deal_hash):
    """Re-run the deal selector over saved HTML pages below ``run_dir``.

    Every file in ``run_dir`` and all of its subdirectories is read, parsed
    with ``sdParser`` and each parsed title is passed to
    ``dealSelector.checkDeal``.  Accepted titles are stored in
    ``new_deal_hash`` with the parser's ``promo_hash[title]`` value (the same
    object, not a copy); rejected titles are stored in ``filtered_deal_hash``
    with the selector's ``filter_rule`` as read right after the check.

    ``new_title_hash`` is accepted for signature compatibility and is not
    modified.
    """
    selector = dealSelector()
    # os.walk already visits every nested directory, so no explicit recursion
    # is needed.  The former recursive call passed the bare directory name,
    # which was resolved against the working directory instead of the walked
    # directory.
    for curdir, dirnames, filenames in os.walk(run_dir):
        for filename in filenames:
            fullname = os.path.join(curdir, filename)
            print("processing %s" % fullname)
            with open(fullname, "r") as page:
                html = page.read()

            parser = sdParser()
            parser.feed(html)
            for title in parser.promo_hash:
                if selector.checkDeal(title):
                    new_deal_hash[title] = parser.promo_hash[title]
                else:
                    filtered_deal_hash[title] = selector.filter_rule