def processLinkCategoriesFromJson(cls, categories, url):
    """Create or refresh one LinkCategory entity per category of a link.

    Args:
        cls: Not used by the body. NOTE(review): the name suggests this is
            bound as a classmethod; the decorator is not visible here.
        categories: String holding a Python literal whose iteration yields
            category names (for a dict, its keys; the values are ignored).
            ``None`` or an empty string means there is nothing to do.
        url: URL the categories belong to.

    Returns:
        None. For each category an existing (category, url) entity gets its
        ``updated`` timestamp refreshed, otherwise a new entity is stored.
    """
    if not categories:
        logging.info('missing categories. skipping')
        return

    # NOTE(security): eval() executes whatever expression the string holds.
    # If ``categories`` can ever carry externally supplied text, switch to
    # ast.literal_eval or a JSON parser.
    cat_dict = eval(categories)
    # Truthiness instead of len(): a payload that evaluates to None (for
    # example the stored string 'None') is skipped rather than raising.
    if not cat_dict:
        logging.info('no categories. skipping')
        return

    # Only the category names are needed, so iterate the keys directly.
    for cat in cat_dict:
        existing_category = LinkCategory.gql(
            'WHERE category = :1 and url = :2', cat, url).get()
        if existing_category is None:
            logging.info('new category %s , init url %s', cat, url)
            link_category = LinkCategory()
            link_category.category = cat
            link_category.url = url
            link_category.put()
        else:
            logging.info('updated time for category %s [ %s ]',
                         cat, existing_category.url)
            existing_category.updated = datetime.datetime.now()
            existing_category.put()
def post(self):
    """Categorise the posted URL through AlchemyAPI and store the result.

    Reads the ``url`` request parameter and queries the AlchemyAPI
    URLGetCategory endpoint. When the returned score is above 0.5 the
    category is split on ``_``, merged with the categories cached in
    memcache and those stored on the matching ``Links`` entity, and then:

    * up to four merged categories are cached under ``<url_hash>_category``;
    * one ``/category/stream`` task is enqueued per merged category;
    * a ``LinkCategory`` entity is created or refreshed per merged category.

    Finally the ``Links`` entity for the URL is saved with its language,
    url and url_hash.

    NOTE(review): this block was recovered from a whitespace-collapsed
    source. The nesting below (unconditional ``put()`` calls, and the final
    ``link`` update sitting outside the score check) is the most plausible
    reading; confirm against the original file.

    Returns:
        None.
    """
    url = self.request.get('url', None)
    # Validate before hashing so a missing/empty URL never reaches
    # LinkUtils.getUrlHash.
    if not url:
        logging.info('no link in request. skipping')
        return
    url_hash = LinkUtils.getUrlHash(url)

    category_api = (
        'http://access.alchemyapi.com/calls/url/URLGetCategory'
        '?apikey=%s&url=%s&outputMode=json'
        % (self.alchemy_key, urllib2.quote(url.encode('utf-8'))))
    logging.info('trying to fetch shared count info %s', category_api)

    # Find the stored link by hash first, then by raw URL.
    link = None
    try:
        link = Links.gql('WHERE url_hash = :1', url_hash).get()
        if link is None:
            link = Links.gql('WHERE url = :1', url).get()
    except BadValueError:
        logging.info('url property too long')
    if link is None:
        link = Links()
    else:
        link.date_updated = datetime.datetime.now().date()

    response = LinkUtils.getJsonFromApi(category_api)
    if response is None:
        logging.info('alchemy api returned no category.skipping')
        return

    try:
        language = response['language']
        category = response['category']
        score = Cast.toFloat(response['score'], 0)

        if score is not None and score > 0.5 and category is not None:
            logging.info('category %s score %s', category, score)

            memcache_key = url_hash + '_category'
            cached_categories = memcache.get(memcache_key)
            merged = []
            if cached_categories is not None:
                logging.info('merging with existing cats %s',
                             cached_categories)
                merged.extend(cached_categories)
            merged.extend(category.split('_'))

            model = None
            try:
                # The url_hash filter must be bound to the hash, not the URL.
                model = SessionModel.gql(
                    'WHERE url_hash = :1 order by date desc', url_hash).get()
                if model is None:
                    model = SessionModel.gql(
                        'WHERE url = :1 order by date desc', url).get()
            except BadValueError:
                logging.info('url too long ... %s', url)
            if model is None:
                logging.info('model not defined ... skipping')
                return

            link_detail = Links.gql('WHERE url_hash = :1', url_hash).get()
            if link_detail is None:
                link_detail = Links.gql('WHERE url = :1', url).get()
            if link_detail is not None and link_detail.categories is not None:
                logging.info('category found from link details %s',
                             link_detail.categories)
                # NOTE(security): eval() executes whatever expression the
                # stored string holds; prefer ast.literal_eval or JSON if
                # this field can carry externally supplied text.
                merged.extend(eval(link_detail.categories))

            merged = set(merged)
            logging.info('caching cats %s for url %s', merged, url)
            # Only four categories are cached; all of them are processed.
            memcache.set(memcache_key, list(merged)[:4])

            for c in merged:
                taskqueue.add(queue_name='message-broadcast-queue',
                              url='/category/stream',
                              params={'category': c, 'url': url_hash})

                existing = LinkCategory.gql(
                    'WHERE url_hash = :1 and category = :2',
                    url_hash, c).get()
                if existing is None:
                    existing = LinkCategory.gql(
                        'WHERE url = :1 and category = :2', url, c).get()

                if existing is not None:
                    existing.updated = datetime.datetime.now()
                    # Backfill the hash on entities stored without one.
                    if existing.url_hash is None:
                        existing.url_hash = url_hash
                    existing.put()
                    logging.info(
                        'updated exisitng url(%s) category(%s) update time %s',
                        url, c, existing.updated)
                else:
                    logging.info('new pair: url%s) category(%s) ', url, c)
                    link_category = LinkCategory()
                    link_category.url = url
                    link_category.url_hash = url_hash
                    link_category.category = c
                    # model is guaranteed non-None by the early return above.
                    link_category.model_details = model.key()
                    link_category.put()

        if language is not None:
            link.language = language
        link.url = url
        link.url_hash = url_hash
        link.put()
    except KeyError:
        # A field missing from the API response is logged, not raised.
        exc_type, exc_value = sys.exc_info()[:2]
        logging.info('key error [[%s, %s]] in %s',
                     exc_type, exc_value, response)