def get_cats_thread(self, cat):
    """Fill in ``cat["children"]`` with the sub-category data of ``cat``.

    The level-2 listing for ``cat["id"]`` is requested via
    ``self.get_sub_cats``.  When exactly one sub-category comes back it is
    expanded directly with ``self.get_sub_cats_thread``; otherwise every
    sub-category is submitted to a ``ThreadPool`` built with
    ``min(len(subcats), 5)`` and ``children`` becomes whatever
    ``killAllWorkers`` returns.

    Returns the same ``cat`` dict that was passed in.
    """
    level2_url = "http://top.taobao.com/level2.php?cat=%s" % cat["id"]
    subcats = self.get_sub_cats(level2_url, "cat", 2)

    if len(subcats) != 1:
        pool = ThreadPool(min(len(subcats), 5))
        for sub in subcats:
            pool.run(self.get_sub_cats_thread, callback=None, sc=sub)
        cat["children"] = pool.killAllWorkers(None)
    else:
        # A lone sub-category is expanded inline, no pool involved.
        cat["children"] = self.get_sub_cats_thread(subcats[0])
    return cat
def get_top_keywords(self, cats=None, parent=None, up=True):
    """Run ``self.cat_top_keywords_thread`` for every category.

    ``cats`` falls back to ``self.get_cats()`` when it is empty or not
    given; if that also yields nothing, an empty list is returned.
    Otherwise one job per category is submitted to a ``ThreadPool`` built
    with ``min(len(cats), 5)``, forwarding ``parent`` and ``up`` unchanged,
    and the value of ``killAllWorkers`` is returned.
    """
    cats = cats or self.get_cats()
    if not cats:
        return []

    pool = ThreadPool(min(len(cats), 5))
    for entry in cats:
        pool.run(self.cat_top_keywords_thread, callback=None, cat=entry, parent=parent, up=up)
    return pool.killAllWorkers(None)
def get_cats_thread(self, cat):
    '''Populate ``cat['children']`` from the level-2 page of ``cat``.

    Prints ``cat['id']`` first, then asks ``self.get_sub_cats`` for the
    level-2 listing.  One sub-category is expanded directly through
    ``self.get_sub_cats_thread``; any other count is fanned out over a
    ``ThreadPool`` built with ``min(len(subcats), 5)`` and ``children`` is
    set to the value returned by ``killAllWorkers``.

    Returns the ``cat`` dict that was passed in.
    '''
    cat_id = cat['id']
    # Single-argument parenthesised form prints the same text as the statement.
    print(cat_id)
    subcats = self.get_sub_cats('http://top.taobao.com/level2.php?cat=%s' % cat_id, 'cat', 2)

    if len(subcats) != 1:
        pool = ThreadPool(min(len(subcats), 5))
        for sub in subcats:
            pool.run(self.get_sub_cats_thread, callback=None, sc=sub)
        children = pool.killAllWorkers(None)
    else:
        children = self.get_sub_cats_thread(subcats[0])

    cat['children'] = children
    return cat
def cat_top_keywords(self, cat, level3="", up=True, offset=0, offsets=None):
    """Get top keywords in a specific category.

    Args:
        cat: Category id, placed in the ``cat`` query parameter via ``str``.
        level3: Level-3 id for the ``level3`` query parameter; empty by
            default.
        up: When true the ``up`` query parameter is ``"true"``, otherwise
            it is left empty.
        offset: Offset of the page to start from.
        offsets: Offsets of further pages already known to the caller.
            ``None`` (the default) or an empty sequence means "none yet";
            the recursive call passes the offsets it discovered.

    Returns:
        The list built from ``self.parse_cat_top_keywords`` results for
        every page that was fetched; an empty list if the first fetch
        fails.
    """

    def page_url(page_offset):
        # One place for the URL so the first page and the paged requests agree.
        return "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % (
            str(cat),
            str(level3),
            "true" if up else "",
            page_offset,
        )

    response = []
    if not offsets or offset == 0:
        rs = self.fetch(page_url(offset))
        if not rs:
            return response
        soup = BeautifulSoup(rs.content)
        response = self.parse_cat_top_keywords(soup, offset)
        if offset == 0:
            # Only the very first page is used to discover the page offsets.
            offsets = self.get_cat_top_keywords_pages(soup, offset)

    if not offsets:
        return response

    threadPool = ThreadPool(min(len(offsets), 5))
    last_idx = len(offsets) - 1
    for idx, page_offset in enumerate(offsets):
        # Only the last page of the batch is asked for further page offsets.
        next_page = "True" if idx == last_idx else "False"
        threadPool.run(
            self.fetch,
            callback=None,
            url=page_url(page_offset),
            config=dict(get_next=next_page, offset=page_offset),
        )
    pages = threadPool.killAllWorkers(None)

    for p in pages:
        if not p:
            continue
        soup2 = BeautifulSoup(p.content)
        # p.config is read back here with the keys passed to fetch above.
        offset2 = int(p.config["offset"])
        response += self.parse_cat_top_keywords(soup2, offset2)
        if p.config["get_next"] != "True":
            continue
        more_offsets = self.get_cat_top_keywords_pages(soup2, offset2)
        if more_offsets:
            # Continue with the next batch of pages found on the last page.
            response += self.cat_top_keywords(cat, level3, up, offset2, more_offsets)
    return response
def get_cats(self):
    '''Get top keywords categories.

    Fetches the index page and reads every ``<li>`` under
    ``<div id="nav">`` except the one whose id is ``index``.  Each entry
    becomes a dict with ``id`` (``TR_`` plus the upper-cased li id) and
    ``title`` (stripped anchor text), both UTF-8 encoded.  Every dict is
    then passed to ``self.get_cats_thread`` through a ``ThreadPool``.

    Returns:
        The value of ``ThreadPool.killAllWorkers`` for the submitted jobs;
        ``None`` when the page cannot be fetched or contains no nav div;
        an empty list when the nav div has no usable entries.
    '''
    start_url = 'http://top.taobao.com/index.php?from=tbsy'
    rs = self.fetch(start_url)
    if not rs:
        return None

    soup = BeautifulSoup(rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES,
                         markupMassage=hexentityMassage)
    nav = soup.find('div', id='nav')
    if nav is None:
        # No navigation block on the page: report it like a failed fetch
        # instead of raising AttributeError on None.
        return None

    cats = [{'id': 'TR_%s' % li['id'].encode('utf-8').upper(),
             'title': li.a.text.encode('utf-8').strip()}
            for li in nav.findAll('li') if li['id'] != 'index']
    if not cats:
        # Nothing to expand, so do not build a zero-sized pool.
        return cats

    threadPool = ThreadPool(min(len(cats), 5))
    for cat in cats:
        threadPool.run(self.get_cats_thread, callback=None, cat=cat)
    return threadPool.killAllWorkers(None)
def get_top_keywords(self, cats=None, parent=None, up=True):
    '''Dispatch ``self.cat_top_keywords_thread`` over all the categories.

    When ``cats`` is empty or omitted the categories come from
    ``self.get_cats()``; if none are available an empty list is returned.
    Each category is submitted, together with ``parent`` and ``up``, to a
    ``ThreadPool`` built with ``min(len(cats), 5)``.  The return value is
    whatever ``killAllWorkers`` hands back.
    '''
    if not cats:
        cats = self.get_cats()
        if not cats:
            return []

    workers = ThreadPool(min(len(cats), 5))
    for category in cats:
        workers.run(self.cat_top_keywords_thread, callback=None,
                    cat=category, parent=parent, up=up)
    results = workers.killAllWorkers(None)
    return results
def cat_top_keywords(self, session, cat, up=True, offset=0, offsets=None):
    '''Get top keywords in a specific category and store them.

    Scraping and persistence are kept apart: all pages are collected
    first (recursing into further page batches), then every keyword is
    written exactly once.  Persisting inside the recursion would store the
    keywords of later pages once per recursion level.

    Args:
        session: Database session; ``add``, ``commit``, ``rollback`` and
            ``query`` are called on it.
        cat: Category object; ``cat.parent.cid``, ``cat.level`` and
            ``cat.cid`` are read to build the request URL.
        up: When true the ``up`` query parameter is ``true``, otherwise
            it is left empty.
        offset: Offset of the page to start from.
        offsets: Offsets of further pages already known to the caller;
            ``None`` (the default) or empty means "none yet".

    Returns:
        The list built from ``self.parse_cat_top_keywords`` results for
        every page fetched; an empty list if the first fetch fails.

    Raises:
        IntegrityError: re-raised when a commit fails and no keyword with
            the same name exists, i.e. the failure was not a duplicate.
    '''

    def page_url(page_offset):
        # A level-2 category sends an empty level3 parameter.
        return 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d' % (
            cat.parent.cid, '' if cat.level == 2 else str(cat.cid),
            'true' if up else '', page_offset)

    def make_soup(markup):
        return BeautifulSoup(markup, convertEntities=BeautifulSoup.HTML_ENTITIES,
                             markupMassage=hexentityMassage)

    def scrape(offset, offsets):
        # Collect keywords starting at ``offset``; never touches the session.
        print('CAT:%s, level:%s' % (str(cat), str(cat.level)))
        print('OFFSET: %d' % offset)
        found = []
        if not offsets or offset == 0:
            url = page_url(offset)
            print(url)
            rs = self.fetch(url)
            if not rs:
                return found
            soup = make_soup(rs.content)
            found = self.parse_cat_top_keywords(soup, offset)
            if offset == 0:
                # Only the very first page is used to discover page offsets.
                offsets = self.get_cat_top_keywords_pages(soup, offset)
        print('OFFSETS: %s' % (offsets,))
        if not offsets:
            return found

        threadPool = ThreadPool(min(len(offsets), 5))
        last_idx = len(offsets) - 1
        for idx, page_offset in enumerate(offsets):
            # Only the last page of the batch is asked for further offsets.
            next_page = 'True' if idx == last_idx else 'False'
            threadPool.run(self.fetch, callback=None, url=page_url(page_offset),
                           config=dict(get_next=next_page, offset=page_offset))
        pages = threadPool.killAllWorkers(None)

        for p in pages:
            if not p:
                continue
            soup2 = make_soup(p.content)
            offset2 = int(p.config['offset'])
            found += self.parse_cat_top_keywords(soup2, offset2)
            print('GOT: %d' % offset2)
            if p.config['get_next'] != 'True':
                continue
            more_offsets = self.get_cat_top_keywords_pages(soup2, offset2)
            print(more_offsets)
            if more_offsets:
                found += scrape(offset2, more_offsets)
        return found

    response = scrape(offset, offsets)

    for k in response:
        # Decode once so creation and the duplicate lookup use the same value.
        name = k['name'].decode('utf-8')
        new_keyword = models.Keyword(name)
        new_keyword.categories.append(cat)
        session.add(new_keyword)
        try:
            session.commit()
        except IntegrityError:
            session.rollback()
            existing = session.query(models.Keyword).filter(models.Keyword.name == name).first()
            if existing is None:
                # Not a name clash after all; do not hide the real failure.
                raise
            if cat not in existing.categories:
                existing.categories.append(cat)
                session.commit()
            print('Duplicate %s' % existing)
    return response
def get_cats(self):
    """Get top keywords categories.

    Fetches the index page and reads every ``<li>`` under
    ``<div id="nav">`` except the one whose id is ``index``.  Each entry
    becomes a dict with ``id`` (``TR_`` plus the upper-cased li id) and
    ``title`` (stripped anchor text), both UTF-8 encoded.  Every dict is
    then passed to ``self.get_cats_thread`` through a ``ThreadPool``.

    Returns:
        The value of ``ThreadPool.killAllWorkers`` for the submitted jobs;
        ``None`` when the page cannot be fetched or contains no nav div;
        an empty list when the nav div has no usable entries.
    """
    start_url = "http://top.taobao.com/index.php?from=tbsy"
    rs = self.fetch(start_url)
    if not rs:
        return None

    soup = BeautifulSoup(
        rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage
    )
    nav = soup.find("div", id="nav")
    if nav is None:
        # No navigation block on the page: report it like a failed fetch
        # instead of raising AttributeError on None.
        return None

    cats = [
        {"id": "TR_%s" % li["id"].encode("utf-8").upper(), "title": li.a.text.encode("utf-8").strip()}
        for li in nav.findAll("li")
        if li["id"] != "index"
    ]
    if not cats:
        # Nothing to expand, so do not build a zero-sized pool.
        return cats

    threadPool = ThreadPool(min(len(cats), 5))
    for cat in cats:
        threadPool.run(self.get_cats_thread, callback=None, cat=cat)
    return threadPool.killAllWorkers(None)
def cat_top_keywords(self, cat, level3='', up=True, offset=0, offsets=None):
    '''Get top keywords in a specific category.

    Args:
        cat: Category id, placed in the ``cat`` query parameter via ``str``.
        level3: Level-3 id for the ``level3`` query parameter; empty by
            default.
        up: When true the ``up`` query parameter is ``true``, otherwise
            it is left empty.
        offset: Offset of the page to start from.
        offsets: Offsets of further pages already known to the caller.
            ``None`` (the default) or an empty sequence means "none yet";
            the recursive call passes the offsets it discovered.

    Returns:
        The list built from ``self.parse_cat_top_keywords`` results for
        every page that was fetched; an empty list if the first fetch
        fails.
    '''

    def page_url(page_offset):
        # One place for the URL so the first page and the paged requests agree.
        return 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d' % (
            str(cat), str(level3), 'true' if up else '', page_offset)

    response = []
    if not offsets or offset == 0:
        rs = self.fetch(page_url(offset))
        if not rs:
            return response
        soup = BeautifulSoup(rs.content)
        response = self.parse_cat_top_keywords(soup, offset)
        if offset == 0:
            # Only the very first page is used to discover the page offsets.
            offsets = self.get_cat_top_keywords_pages(soup, offset)

    if not offsets:
        return response

    threadPool = ThreadPool(min(len(offsets), 5))
    last_idx = len(offsets) - 1
    for idx, page_offset in enumerate(offsets):
        # Only the last page of the batch is asked for further page offsets.
        next_page = 'True' if idx == last_idx else 'False'
        threadPool.run(self.fetch, callback=None, url=page_url(page_offset),
                       config=dict(get_next=next_page, offset=page_offset))
    pages = threadPool.killAllWorkers(None)

    for p in pages:
        if not p:
            continue
        soup2 = BeautifulSoup(p.content)
        # p.config is read back here with the keys passed to fetch above.
        offset2 = int(p.config['offset'])
        response += self.parse_cat_top_keywords(soup2, offset2)
        if p.config['get_next'] != 'True':
            continue
        more_offsets = self.get_cat_top_keywords_pages(soup2, offset2)
        if more_offsets:
            # Continue with the next batch of pages found on the last page.
            response += self.cat_top_keywords(cat, level3, up, offset2, more_offsets)
    return response
def cat_top_keywords(self, session, cat, up=True, offset=0, offsets=None):
    """Get top keywords in a specific category and store them.

    Scraping and persistence are kept apart: all pages are collected
    first (recursing into further page batches), then every keyword is
    written exactly once.  Persisting inside the recursion would store the
    keywords of later pages once per recursion level.

    Args:
        session: Database session; ``add``, ``commit``, ``rollback`` and
            ``query`` are called on it.
        cat: Category object; ``cat.parent.cid``, ``cat.level`` and
            ``cat.cid`` are read to build the request URL.
        up: When true the ``up`` query parameter is ``true``, otherwise
            it is left empty.
        offset: Offset of the page to start from.
        offsets: Offsets of further pages already known to the caller;
            ``None`` (the default) or empty means "none yet".

    Returns:
        The list built from ``self.parse_cat_top_keywords`` results for
        every page fetched; an empty list if the first fetch fails.

    Raises:
        IntegrityError: re-raised when a commit fails and no keyword with
            the same name exists, i.e. the failure was not a duplicate.
    """

    def page_url(page_offset):
        # A level-2 category sends an empty level3 parameter.
        return "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % (
            cat.parent.cid,
            "" if cat.level == 2 else str(cat.cid),
            "true" if up else "",
            page_offset,
        )

    def make_soup(markup):
        return BeautifulSoup(
            markup, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage
        )

    def scrape(offset, offsets):
        # Collect keywords starting at ``offset``; never touches the session.
        print("CAT:%s, level:%s" % (str(cat), str(cat.level)))
        print("OFFSET: %d" % offset)
        found = []
        if not offsets or offset == 0:
            url = page_url(offset)
            print(url)
            rs = self.fetch(url)
            if not rs:
                return found
            soup = make_soup(rs.content)
            found = self.parse_cat_top_keywords(soup, offset)
            if offset == 0:
                # Only the very first page is used to discover page offsets.
                offsets = self.get_cat_top_keywords_pages(soup, offset)
        print("OFFSETS: %s" % (offsets,))
        if not offsets:
            return found

        threadPool = ThreadPool(min(len(offsets), 5))
        last_idx = len(offsets) - 1
        for idx, page_offset in enumerate(offsets):
            # Only the last page of the batch is asked for further offsets.
            next_page = "True" if idx == last_idx else "False"
            threadPool.run(
                self.fetch,
                callback=None,
                url=page_url(page_offset),
                config=dict(get_next=next_page, offset=page_offset),
            )
        pages = threadPool.killAllWorkers(None)

        for p in pages:
            if not p:
                continue
            soup2 = make_soup(p.content)
            offset2 = int(p.config["offset"])
            found += self.parse_cat_top_keywords(soup2, offset2)
            print("GOT: %d" % offset2)
            if p.config["get_next"] != "True":
                continue
            more_offsets = self.get_cat_top_keywords_pages(soup2, offset2)
            print(more_offsets)
            if more_offsets:
                found += scrape(offset2, more_offsets)
        return found

    response = scrape(offset, offsets)

    for k in response:
        # Decode once so creation and the duplicate lookup use the same value.
        name = k["name"].decode("utf-8")
        new_keyword = models.Keyword(name)
        new_keyword.categories.append(cat)
        session.add(new_keyword)
        try:
            session.commit()
        except IntegrityError:
            session.rollback()
            existing = session.query(models.Keyword).filter(models.Keyword.name == name).first()
            if existing is None:
                # Not a name clash after all; do not hide the real failure.
                raise
            if cat not in existing.categories:
                existing.categories.append(cat)
                session.commit()
            print("Duplicate %s" % existing)
    return response