def _categorize(self):
     """Copy wine-product links from WebLinkWineTemp into WebLinkWine.

     Examines up to 50 temp entities (capped to stay inside the
     datastore free quota), keeps only links whose URL matches one of
     the known wine-product URL patterns, and stores each link in
     WebLinkWine unless an entity with the same link is already there.
     """
     # Compile once, outside the loop: the same pattern is tested
     # against every entity link.  sku ; BuyWine/Item ; bwe
     wine_pattern = re.compile(
         r"BuyWine/Item/\d+|sku|skuIT-\d+|bwe\d+|wines/\d+|/wine/|Apply/Vintage/\d+",
         re.I)
     entities = WebLinkWineTemp.query().fetch(50) # to avoid running datastore free quota limit
     for entity in entities:
         # Only a yes/no answer is needed, so search() beats findall().
         if not wine_pattern.search(entity.link):
             continue
         query = WebLinkWine.query(WebLinkWine.link == entity.link)
         # A keys-only get() is a cheaper existence probe than count().
         if query.get(keys_only=True) is None:
             new_wine_info = WebLinkWine()
             new_wine_info.link = entity.link
             new_wine_info.title = entity.title
             new_wine_info.put()
 def get(self):
     """Handler: drain queued links and crawl each page for anchors.

     Pops up to 15 WebLinkWineTemp entities (deleting each one as it
     is consumed) or, when none are queued, falls back to
     dict_general.default_urls; then fetches every queued page and
     collects each anchor's title and absolutized href into
     list_found_link.
     """
     # fetch entities from db
     entities = WebLinkWineTemp.query().fetch(15)
     search_list = []

     if entities:
         for entity in entities:
             search_list.append({'title' : entity.title, 'link' : entity.link})
             # Consume the queue: each temp entity is deleted once read.
             entity.key.delete()
     else:
         # Nothing queued -- seed the crawl from the default URL list.
         search_list = dict_general.default_urls

     # crawl website
     list_found_link = []
     while len(search_list) > 0:
         link = search_list.pop(0)['link']
         parsed_str = urlparse.urlsplit(link)
         # scheme://host of the current page, used to absolutize relative hrefs
         link_base = "{url_scheme}://{url_netloc}".format(url_scheme = parsed_str.scheme, url_netloc = parsed_str.netloc)

         try:
             req = urllib2.Request(link)
             response = urllib2.urlopen(req) # need to add new mechanism to prevent fetch javascript
             searched_page = response.read()
             soup = BeautifulSoup(searched_page)

             for found_link in soup.find_all('a'):
                 if found_link.get('href'):
                     # hrefs starting with "http" (any case) are treated as
                     # absolute; everything else as site-relative
                     match_group = re.match("http", found_link.get('href'), re.I)
                     full_href = ""
                     title = "NA"

                     if not match_group:
                         # relative href: prefix with the page's scheme://host
                         full_href = "{href_link_base}{sub_href}".format(href_link_base = link_base, sub_href = found_link.get('href'))
                     else:
                         full_href = found_link.get('href')

                     # anchor text becomes the title when present, else "NA"
                     if found_link.contents and len(found_link.contents) > 0 and found_link.contents[0].string:
                         title = found_link.contents[0].string

                     list_found_link.append({'title' : title, 'link' : full_href})
         # NOTE(review): only HTTPError is silenced here; a urllib2.URLError
         # (DNS failure, connection refused) would propagate out of this
         # handler -- confirm that is intended.
         except urllib2.HTTPError, err:
             pass