def store_feed(e):
    """
    Store a single entry from the feedparser.

    :param e: the entry
    :return: the stored key on success, else None
    """
    query = WebResource.query().filter(WebResource.url == e["link"])
    if query.count() == 0:
        print "STORING: " + e["link"]
        try:
            if 'summary' in e:
                # strip any HTML markup from summary and title
                s, t = BeautifulSoup(e['summary'], "lxml"), BeautifulSoup(e['title'], "lxml")
                e['summary'], e['title'] = s.get_text(), t.get_text()
            else:
                t = BeautifulSoup(e['title'], "lxml")
                e['summary'], e['title'] = None, t.get_text()
            k = WebResource.store_feed(e)
            print "STORED: " + str(k)
            return k
        except Exception as exc:  # renamed so the entry `e` is not shadowed
            print "Cannot Store: " + str(exc)
            return None
    else:
        print "Resource already stored"
        return None
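# A minimal sketch of how store_feed might be driven, assuming the feedparser
# library is available; store_all and the feed URL are illustrative, not part
# of the original code:

import feedparser

def store_all(feed_url):
    # parse the remote feed and try to store every entry;
    # store_feed itself skips entries whose URL is already in the datastore
    d = feedparser.parse(feed_url)
    return [store_feed(entry) for entry in d.entries]

# hypothetical example feed
keys = store_all("http://example.com/rss.xml")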
def memcache_webresource_query():
    """
    Get or set in the memcache the full query of WebResources.
    Updates every six hours (18000 secs).

    :return: a Query object or None
    """
    mkey = "WebResource_all"
    if not memcache.get(key=mkey):
        query = WebResource.query()
        memcache.add(key=mkey, value=query, time=18000)
    else:
        query = memcache.get(key=mkey)
    # For now we exclude media and link children resources (resources with an empty title)
    return query.filter(WebResource.title != "").order(WebResource.title).order(WebResource.key)
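# A short usage sketch; ndb queries are lazy and immutable, so the filter/order
# calls above return new Query objects without touching the cached one, and
# nothing hits the datastore until fetch() is called here:

resources = memcache_webresource_query().fetch(25)
for r in resources:
    print r.title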
def get(self):
    """
    Handler for the cronjob: /cron/indexing
    It stores the keyword indexes of the most recently stored WebResources.
    :return:
    """
    # create the Index entries
    from flankers.long_task import storeIndexer
    an_hour = datetime.datetime.now() - datetime.timedelta(hours=1)
    print an_hour
    query = WebResource.query().filter(WebResource.stored > an_hour)
    print query.count()
    for q in query:
        s = storeIndexer()
        s.execute_task(q, q.key)
        del s
def get(self):
    """
    Handler for the cronjob: /cron/indexing
    It stores the keyword indexes of the most recently stored WebResources,
    working through a memcache-backed list of keys, one key per run.
    :return:
    """
    # create the Index entries
    from flankers.long_task import storeIndexer
    from time import localtime  # needed to build the naive datetime below
    if not memcache.get(key=self.mkey):
        # no work list yet: collect the keys of resources stored in the last two hours
        an_hour = datetime.datetime(*localtime()[:6]) - datetime.timedelta(hours=2)
        print an_hour
        query = WebResource.query().filter(WebResource.stored > an_hour)
        if query.count() == 0:
            memcache.delete(key=self.mkey)
            return None
        print "queried: " + str(query.count())
        listed = []
        for k in query.iter(keys_only=True):
            listed.append(k)
        memcache.add(key=self.mkey, value=listed)
        to_index = listed
    else:
        to_index = memcache.get(key=self.mkey)
    print "To be indexed: " + str(len(to_index))
    if len(to_index) != 0:
        # index one resource per run, then save the shrunken work list back
        key = to_index.pop()
        print "popped", str(len(to_index))
        print "popping", str(key)
        try:
            s = storeIndexer()
            s.execute_task(key.get(), key)
            del s
        except Exception:
            print "resource already indexed"
            pass
        memcache.set(key=self.mkey, value=to_index)
    else:
        memcache.delete(key=self.mkey)
        print "nothing to index"
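# The handler above reads self.mkey, which must be defined on the handler
# class. A minimal sketch of how it might be wired up with webapp2; the class
# name and the memcache slug value are assumptions, only the /cron/indexing
# route comes from the docstring:

import webapp2

class CronIndexingHandler(webapp2.RequestHandler):
    # hypothetical memcache slug holding the list of keys still to index
    mkey = "WebResource_to_index"

    def get(self):
        # body as in the handler above
        pass

app = webapp2.WSGIApplication([
    ('/cron/indexing', CronIndexingHandler),
])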
def memcache_webresource_query(self):
    """
    Get or set in the memcache the full query of WebResources.
    It is used by all the endpoints to fetch all the data.
    Updates every six hours (18000 secs).

    :return: a Query object or None
    """
    mkey = _MEMCACHE_SLUGS['ALL']
    if not memcache.get(key=mkey):
        self._query = WebResource.query()
        memcache.add(key=mkey, value=self._query, time=18000)
    else:
        self._query = memcache.get(key=mkey)
    # Note for filtering: http://stackoverflow.com/a/28627068/2536357
    return self._query
def memcache_keywords(url):
    """
    Get or set in the memcache the resulting keywords for a given url.

    :param url: the url of the WebResource
    :return: a list of keywords (possibly empty) or None
    """
    from urlparse import urlparse
    parts = urlparse(url)
    if parts.scheme and parts.netloc:
        # only accept absolute URLs
        mkey = "Keyword_for_" + url
        if not memcache.get(key=mkey):
            q = WebResource.query().filter(WebResource.url == url).fetch(1)
            results = q[0].get_indexers() if len(q) == 1 else []
            memcache.add(key=mkey, value=results, time=15000)
        else:
            results = memcache.get(key=mkey)
        return results
    else:
        return None
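# A short usage sketch with a hypothetical URL; a relative URL fails the
# urlparse scheme/netloc check and yields None:

print memcache_keywords("http://example.com/article-1")  # cached list of keywords
print memcache_keywords("/article-1")                    # None: no scheme or netloc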
def post(self, perform):
    """
    Handle dumping and monitoring for the Triple Store.
    """
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    self.response.headers['Content-Type'] = 'text/html'
    if self.request.get('token') == _CLIENT_TOKEN:
        # authorized
        if perform == 'dump':
            # dump WebResources
            # 1. get a batch to be dumped
            _BATCH = int(self.request.get('batch'))
            query = WebResource.query(WebResource.in_graph == False).fetch(_BATCH)
            for q in query:
                triples = str()
                # 2. create triples representing the resource and its related concepts
                df, rl = self.build_triples(q)
                print df, rl
                triples += self.n_triplify(df)
                triples += " ".join([self.n_triplify(r) for r in rl])
                print triples
                # 3. store the triples
                _, cache_graph = store_triples(triples, _VOC_GRAPH_ID, format="n3")
                print "GRAPH STORED OK: {} triples".format(len(cache_graph))
                # 4. set the in_graph flag to True
                q.in_graph = True
                q.put()
            return self.response.write(
                "A batch of " + str(_BATCH) +
                " resources has been successfully stored in the triple store"
            )
        elif perform == 'monitor':
            # gather statistics
            pass
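# build_triples and n_triplify are not shown in this listing. A minimal sketch
# of the kind of serialization n_triplify might perform, assuming each triple
# is held as a (subject, predicate, object) tuple of URIs and the store accepts
# N-Triples-style statements; all URIs below are illustrative:

def n_triplify(triple):
    # hypothetical: serialize one (s, p, o) tuple into an N-Triples statement
    s, p, o = triple
    return "<{}> <{}> <{}> . ".format(s, p, o)

print n_triplify((
    "http://example.org/resource/42",
    "http://purl.org/dc/terms/subject",
    "http://example.org/concept/space",
))
# <http://example.org/resource/42> <http://purl.org/dc/terms/subject> <http://example.org/concept/space> .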
def get(self):
    from google.appengine.ext import ndb
    from datastore.models import WebResource
    # Forked from https://github.com/GoogleCloudPlatform/appengine-paging-python
    if self.request.get("url"):
        # serve keywords for a given article's url
        self.response.headers['Access-Control-Allow-Origin'] = '*'
        self.response.headers['Content-Type'] = 'application/json'
        # use one key for both lookup and storage (the original checked
        # "Keyword_" but stored "Keyword_for_", so the cache never hit)
        mkey = "Keyword_for_" + self.request.get("url")
        if not memcache.get(key=mkey):
            q = WebResource.query().filter(WebResource.url == self.request.get("url")).fetch(1)
            response = q[0].get_indexers() if len(q) == 1 else []
            memcache.add(key=mkey, value=response, time=15000)
        else:
            response = memcache.get(key=mkey)
        return self.response.out.write(
            json.dumps(response)
        )
    else:
        # serve articles
        if not memcache.get(key="WebResource_all"):
            query = WebResource.query()
            memcache.add(key="WebResource_all", value=query, time=18000)
        else:
            query = memcache.get(key="WebResource_all")
        page_size = 25
        cursor = None
        bookmark = self.request.get('bookmark')
        if bookmark:
            # if a bookmark is set, resume the cursor from it and serve the next page
            cursor = ndb.Cursor.from_websafe_string(bookmark)
        articles, next_cursor, more = query.fetch_page(page_size, start_cursor=cursor)
        next_bookmark = None
        if more:
            next_bookmark = next_cursor.to_websafe_string()
        print next_bookmark
        if next_bookmark:
            # serve the data with the link to the next bookmark
            mkey = "Articles_" + next_bookmark
            if not memcache.get(key=mkey):
                listed = {'articles': [webres.dump_to_json() for webres in articles],
                          'next': _SERVICE + '/visualize/articles/?api=true&bookmark=' + next_bookmark}
                memcache.add(key=mkey, value=listed, time=15000)
            else:
                listed = memcache.get(key=mkey)
        else:
            # last page: serve it with 'next' set to None
            listed = {'articles': [webres.dump_to_json() for webres in articles],
                      'next': None}
        if self.request.get("api"):
            # param 'api' is true, return JSON
            self.response.headers['Access-Control-Allow-Origin'] = '*'
            self.response.headers['Content-Type'] = 'application/json'
            return self.response.out.write(
                json.dumps(listed)
            )
        # param 'api' is not set or false, return the template
        path = os.path.join(_PATH, 'articles.html')
        return self.response.out.write(template.render(path, {'bookmark': next_bookmark, 'articles': listed}))
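# A minimal client-side sketch of walking the paginated API by following the
# 'next' links until they run out; the base URL stands in for _SERVICE and is
# an assumption, and urllib2 is the Python 2 standard-library HTTP client:

import json
import urllib2

def fetch_all_articles(base_url):
    # walk the bookmark-paginated endpoint until 'next' is None
    articles = []
    url = base_url + "/visualize/articles/?api=true"
    while url:
        page = json.load(urllib2.urlopen(url))
        articles.extend(page['articles'])
        url = page['next']
    return articles

# hypothetical service URL
print len(fetch_all_articles("http://example-service.appspot.com"))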