def memcache_indexer_keywords_distinct(self, term=None):
    """
    Get or set in the memcache the keywords indexed with count.

    If term is not set, it returns the full index, else returns the
    ancestorship of a term.

    :param term: optional term; when falsy the full keyword index is built
    :return: the cached value when present; otherwise, for the full index, a
        dict ``{"indexed": [{"keyword": ..., "count": ...}, ...],
        "n_indexed": ...}``, and for a term whatever
        ``TextSemantics.find_term_ancestorship(term)`` returns
    :raises ValueError: if the ancestorship lookup raises any exception
    """
    mkey = _MEMCACHE_SLUGS['INDEXER_DISTINCT'] + str(term)

    # Read the cache exactly once: a second get() could miss if the entry is
    # evicted in between, and `is None` lets empty results stay cached.
    cached = memcache.get(key=mkey)
    if cached is not None:
        return cached

    if not term:
        query = Indexer.query(projection=[Indexer.keyword], distinct=True)
        results = {
            "indexed": [
                {
                    "keyword": q.keyword,
                    # one count query per distinct keyword
                    "count": Indexer.query(Indexer.keyword == q.keyword).count()
                }
                for q in query
            ],
            "n_indexed": query.count()
        }
    else:
        try:
            results = TextSemantics.find_term_ancestorship(term)
        except Exception as e:
            # callers get a ValueError whatever the lookup raised
            raise ValueError(str(e))

    memcache.add(key=mkey, value=results)
    return results
def execute_task(self, *args):
    """
    Index an item: store one Indexer entity per concept found in its text.

    Items whose title and abstract are both empty strings are ignored, and
    so are items whose key already has Indexer entries.

    :param args: the object to index and its key, in this order
    :return: None
    """
    item, key = args

    if item.title == '' and item.abstract == '':
        # nothing to analyse: the item is a media or a link from Twitter
        return

    # imported only once there is text to analyse
    from flankers.textsemantics import find_related_concepts

    # Checked once, before the loop: a check made inside the loop would see
    # the first stored keyword and skip all the remaining ones (and would
    # cost one datastore query per label).
    if Indexer.query().filter(Indexer.webres == key).count() != 0:
        return

    # the item is either a feed or a tweet: prefer the abstract, else the title
    text = item.abstract if item.abstract else item.title
    for label in find_related_concepts(text):
        Indexer(keyword=label.strip(), webres=key).put()
        print("indexing stored: " + item.url + ">" + label)
def memcache_articles_by_keyword(kwd):
    """
    Get or set in the memcache articles related to a given keyword.

    :param kwd: a keyword (string; it is concatenated into the cache key)
    :return: the cached value for the keyword, or the result of
        ``Indexer.get_webresource(kwd)`` (presumably a list of articles)
    """
    mkey = "Keywords_" + kwd

    # Single cache read: avoids a second round trip and the race where the
    # entry disappears between the check and the read. `is None` also lets
    # an empty result be served from the cache.
    results = memcache.get(key=mkey)
    if results is None:
        results = Indexer.get_webresource(kwd)
        memcache.add(key=mkey, value=results)
    return results
def execute_task(self, *args):
    """
    Index an article. See Indexer class in models.

    The text analysed is the abstract when it is not empty, else the title,
    cut to 1799 characters when it is 1800 characters or longer. One Indexer
    entity is stored for every concept found by TextSemantics.

    :param args: single object to index and its key
    :return: None
    :raises Exception: if Indexer entries already exist for the key
    """
    resource, resource_key = args
    from flankers.textsemantics import TextSemantics

    if resource.title == '' and resource.abstract == '':
        # no text at all: a media or a link from Twitter, nothing to index
        return

    # otherwise it is either a feed or a tweet
    if len(resource.abstract) != 0:
        body = resource.abstract
    else:
        body = resource.title
    if len(body) >= 1800:
        body = body[:1799]

    indexed_entries = Indexer.query().filter(Indexer.webres == resource_key).count()
    if indexed_entries != 0:
        raise Exception("storeIndexer(): Resource already indexed")

    for concept in TextSemantics(body).find_related_concepts():
        entry = Indexer(keyword=concept.strip(), webres=resource_key)
        entry.put()
        print("indexing stored: " + resource.url + ">" + concept)
def memcache_articles_by_keyword(self, kwd):
    """
    Get or set in the memcache articles related to a given keyword.

    GET /articles/<version>/?keyword=<some keyword>

    :param kwd: a keyword (string; it is concatenated into the cache key)
    :return: the cached value for the keyword, or the result of
        ``Indexer.get_webresource(kwd)`` (presumably a list of articles)
    """
    mkey = _MEMCACHE_SLUGS['KWD_BY_ARTICLE'] + kwd

    # Single cache read: avoids a second round trip and the race where the
    # entry disappears between the check and the read. `is None` also lets
    # an empty result be served from the cache.
    results = memcache.get(key=mkey)
    if results is None:
        results = Indexer.get_webresource(kwd)
        memcache.add(key=mkey, value=results)
    return results
def get(self, name):
    """
    Handles WebResource, Indexer and concepts GET requests (JSON responses).

    The request must carry ``?token=`` equal to ``_CLIENT_TOKEN``. Then:

    * ``name`` in (webresource, indexer) with ``?retrieve=<id>``: dump of a
      single resource of that kind;
    * ``name`` in (webresource, indexer) with ``?index=...``: a page of 25
      WebResource ids, plus a ``next`` bookmark to pass back as
      ``?bookmark=`` for the following page (None on the last page);
    * ``name`` == concepts with ``?retrieve=<id>``: keywords indexed for
      that resource, with spaces replaced by ``+``.

    Anything else gets the 404 error body; a wrong token gets the 405 one.

    :param name: the kind of object requested, taken from the URL
    :return: None, the payload is written to ``self.response``
    """
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    self.response.headers['Content-Type'] = 'application/json'

    if self.request.get('token') != _CLIENT_TOKEN:
        # NOTE(review): 405 is kept because clients may rely on it, although
        # 401/403 would describe a bad token better - confirm before changing
        return self.response.write(
            self.json_error_handler(405, exception='Not authorized')
        )

    is_stored_kind = name in ('webresource', 'indexer')
    retrieve_id = self.request.get('retrieve')

    if is_stored_kind and retrieve_id:
        # respond with a single resource of the requested kind
        resource = self.retrieve_a_single_resource(retrieve_id, kind=name)
        dumped = resource.dump_to_json() if resource else None
        if not dumped:
            # the error body is written like on every other error path
            # (assumes json_error_handler returns the body - TODO confirm)
            return self.response.write(
                self.json_error_handler(404, '?retrieve=ID Wrong ID')
            )
        return self.response.write(json.dumps(dumped))

    if is_stored_kind and self.request.get('index'):
        # RETRIEVE an index of WebResource (list of all the keys present in
        # the datastore, paginated)
        from articlesjsonapi import memcache_webresource_query
        query = memcache_webresource_query()

        # Forked from https://github.com/GoogleCloudPlatform/appengine-paging-python
        page_size = 25
        bookmark = self.request.get('bookmark')
        # if a bookmark is given, serve the page that starts at that cursor
        cursor = ndb.Cursor.from_websafe_string(bookmark) if bookmark else None
        articles, next_cursor, more = query.fetch_page(page_size, start_cursor=cursor)

        # expose the key of the next page only when there is one
        next_bookmark = None
        if more and next_cursor:
            next_bookmark = next_cursor.to_websafe_string()

        listed = {
            'articles': [webres.key.id() for webres in articles],
            'next': next_bookmark or None
        }
        return self.response.write(json.dumps(listed))

    if name == 'concepts' and retrieve_id:
        # RETRIEVE keywords related to a WebResource
        from datastore.models import Indexer

        resource = self.retrieve_a_single_resource(retrieve_id)
        if not resource:
            # unknown id: answer 404 instead of failing on resource.key
            return self.response.write(
                self.json_error_handler(404, '?retrieve=ID Wrong ID')
            )

        # Find concepts related to a WebResource
        concepts = Indexer.query().filter(Indexer.webres == resource.key)
        listed = {
            'concepts': [concept.keyword.replace(" ", "+") for concept in concepts],
            'resource_id': resource.key.id()
        }
        return self.response.write(json.dumps(listed))

    return self.response.write(self.json_error_handler(404))