def _categorize(self):
    """Promote wine-related links from WebLinkWineTemp into WebLinkWine.

    Scans up to 50 temp entities (capped to stay inside the datastore
    free quota), keeps only links whose URL matches known wine-product
    path patterns (sku / BuyWine/Item / bwe / wines/<id> / /wine/ /
    Apply/Vintage), and copies each previously-unseen link into
    WebLinkWine.
    """
    # Compile the wine-URL pattern once instead of per entity.
    wine_pattern = re.compile(
        r"BuyWine/Item/\d+|sku|skuIT-\d+|bwe\d+|wines/\d+|/wine/|Apply/Vintage/\d+",
        re.I)
    entities = WebLinkWineTemp.query().fetch(50)  # to avoid running datastore free quota limit
    for entity in entities:
        if not wine_pattern.search(entity.link):
            continue
        # Existence check: a keys-only get() is a single cheap fetch,
        # unlike count(), which scans matching entities.
        exists = WebLinkWine.query(
            WebLinkWine.link == entity.link).get(keys_only=True)
        if exists is None:
            new_wine_info = WebLinkWine()
            new_wine_info.link = entity.link
            new_wine_info.title = entity.title
            new_wine_info.put()
def get(self):
    """Crawl queued pages and collect candidate links from each page.

    Drains up to 15 WebLinkWineTemp entities as the seed list (deleting
    each one so it is crawled only once), falling back to
    dict_general.default_urls when the queue is empty. Fetches each
    seed page and gathers every anchor href — absolutized against the
    page's scheme+netloc when relative — together with its anchor text.
    """
    # Drain the temp queue into a plain list of {'title', 'link'} dicts.
    entities = WebLinkWineTemp.query().fetch(15)
    search_list = []
    if entities:
        for entity in entities:
            search_list.append({'title': entity.title, 'link': entity.link})
            entity.key.delete()
    else:
        search_list = dict_general.default_urls

    # crawl website
    list_found_link = []
    # Plain iteration instead of while/pop(0): pop(0) was O(n) per item,
    # and in the fallback branch search_list aliases
    # dict_general.default_urls, so the pop loop permanently emptied the
    # module-level default list after one request.
    for item in search_list:
        link = item['link']
        parsed_str = urlparse.urlsplit(link)
        link_base = "{url_scheme}://{url_netloc}".format(
            url_scheme=parsed_str.scheme, url_netloc=parsed_str.netloc)
        try:
            req = urllib2.Request(link)
            # need to add new mechanism to prevent fetch javascript
            response = urllib2.urlopen(req)
            searched_page = response.read()
            soup = BeautifulSoup(searched_page)
            for found_link in soup.find_all('a'):
                href = found_link.get('href')
                if not href:
                    continue
                # Absolutize relative hrefs against the page origin.
                if re.match("http", href, re.I):
                    full_href = href
                else:
                    full_href = "{href_link_base}{sub_href}".format(
                        href_link_base=link_base, sub_href=href)
                title = "NA"
                if found_link.contents and len(found_link.contents) > 0 \
                        and found_link.contents[0].string:
                    title = found_link.contents[0].string
                list_found_link.append({'title': title, 'link': full_href})
        except urllib2.URLError:
            # Best-effort crawl: skip unreachable pages. URLError is the
            # superclass of HTTPError (the only error caught before), so
            # DNS/connection failures no longer crash the handler.
            pass