def store_feed(e):
    """
    Store a single entry from the feedparser.
    :param e: the entry
    :return: the stored key on success, else None
    """
    query = WebResource.query().filter(WebResource.url == e["link"])
    if query.count() == 0:
        print "STORING: " + e["link"]
        try:
            # strip HTML markup from summary and title before storing
            if 'summary' in e:
                s, t = BeautifulSoup(e['summary'], "lxml"), BeautifulSoup(e['title'], "lxml")
                e['summary'], e['title'] = s.get_text(), t.get_text()
            else:
                t = BeautifulSoup(e['title'], "lxml")
                e['summary'], e['title'] = None, t.get_text()
            k = WebResource.store_feed(e)
            print "STORED: " + str(k)
            return k
        except Exception as exc:  # renamed so the entry `e` is not shadowed
            print "Cannot Store: " + str(exc)
            return None
    else:
        print "Resource already stored"
        return None
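# Usage sketch (not from the source): driving store_feed() from a live feed,
# assuming feedparser is installed and WebResource is importable.
import feedparser

d = feedparser.parse("http://example.com/rss.xml")  # placeholder feed URL
for entry in d.entries:
    key = store_feed(entry)
    if key:
        print "new resource:", key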
# Nested helper: the enclosing method was truncated in this snippet; it closes
# over `self` and `alias` from the enclosing scope.
def get_wall_recursive(url):
    response = urllib.urlopen(url)
    response = json.loads(response.read())
    if 'error' not in response:
        for o in response['data']:
            WebResource.store_fb_post(alias, o)
    else:
        from flankers.errors import RESTerror
        raise RESTerror('get_wall_recursive(): FB API error')

    # stop when there is no next page or after ten pages
    # (.get() avoids a KeyError when 'paging' has no 'next' entry)
    if not response.get('paging', {}).get('next') or self.counter == 10:
        return None

    self.counter += 1
    return get_wall_recursive(response['paging']['next'])
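# Usage sketch (not from the source): kicking off the recursion from inside the
# enclosing method. The Graph API version, `alias`, and FB_ACCESS_TOKEN are placeholders.
self.counter = 0
wall_url = ("https://graph.facebook.com/v2.5/" + alias +
            "/feed?access_token=" + FB_ACCESS_TOKEN)
get_wall_recursive(wall_url)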
def execute_task(self, timeline, remain=list()):
    """
    Store the tweets of a timeline in the datastore.
    Storing method: see WebResource.store_tweet() in models.
    #TO-DO: make this recursive
    :param timeline: a timeline dict() fetched from the Twitter API
    :param remain: unused
    :return: None
    """
    # NOTE: remain=list() is a mutable default argument; harmless here
    # because remain is never used.
    for twt in timeline:
        if isinstance(twt, list):
            print "twt is a list"
            self.recurring(twt)
        else:
            WebResource.store_tweet(twt)
            self.i += 1
    print "Total tweets: " + str(self.i)
def memcache_webresource_query():
    """
    Get or set in the memcache the full query of WebResources.
    Updates every six hours (18000 secs).
    :return: Query object or None
    """
    mkey = "WebResource_all"
    if not memcache.get(key=mkey):
        query = WebResource.query()
        memcache.add(key=mkey, value=query, time=18000)
    else:
        query = memcache.get(key=mkey)

    # For now we exclude media and link child resources (resources with an empty title)
    return query.filter(WebResource.title != "").order(WebResource.title).order(WebResource.key)
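# Usage sketch (not from the source): paging through the cached, filtered query;
# the page size is arbitrary.
query = memcache_webresource_query()
articles, cursor, more = query.fetch_page(25)
for res in articles:
    print res.title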
def get(self):
    """
    Handler for the cronjob: /cron/indexing
    Stores the keyword indexes of the most recently stored WebResources.
    :return:
    """
    # create the Index entries
    from flankers.long_task import storeIndexer
    an_hour = datetime.datetime.now() - datetime.timedelta(hours=1)
    print an_hour
    query = WebResource.query().filter(WebResource.stored > an_hour)
    print query.count()
    for q in query:
        s = storeIndexer()
        s.execute_task(q, q.key)
        del s
    def memcache_webresource_query(self):
        """
        Get or set in the memcache the full query of WebResources.

        Used by all the endpoints to fetch all the data.
        Updates every six hours (18000 secs).
        :return: Query object or None
        """
        mkey = _MEMCACHE_SLUGS['ALL']
        if not memcache.get(key=mkey):
            self._query = WebResource.query()
            memcache.add(key=mkey, value=self._query, time=18000)
        else:
            self._query = memcache.get(key=mkey)

        # Note on filtering a cached query: http://stackoverflow.com/a/28627068/2536357
        return self._query
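# Usage sketch (not from the source): this variant returns the unfiltered query,
# so filtering happens at the call site, inside a handler method (see the
# Stack Overflow note above).
query = self.memcache_webresource_query()
articles = query.filter(WebResource.title != "").order(WebResource.title)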
def post(self):
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    if self.request.get('pwd') == _TEMP_SECRET and self.request.get('resource'):
        from datastore.models import WebResource
        try:
            oid = WebResource.dump_from_json(self.request.get('resource'))
        except Exception as e:  # ValueError is already covered by Exception
            self.response.status = 400
            return self.response.write(
                'The request could not be understood, wrong resource format or syntax: '
                + str(e))
        self.response.status = 200
        return self.response.write('Resource Stored: ' + str(oid))
    else:
        self.response.status = 401  # unauthorized
        return self.response.write('Not Authorized')
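# Client sketch (not from the source): posting a resource to this handler,
# assuming it is routed at a hypothetical /dump endpoint; the host, secret,
# and resource body are placeholders.
import json
import urllib
import urllib2

payload = urllib.urlencode({
    'pwd': 'the-temp-secret',  # placeholder for _TEMP_SECRET
    'resource': json.dumps({'url': 'http://example.com', 'title': 'Example'}),
})
print urllib2.urlopen('http://localhost:8080/dump', payload).read()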
    def get(self):
        """
        Handler for the cronjob: /cron/indexing
        Stores the keyword indexes of the most recently stored WebResources.
        :return:
        """
        # create the Index entries
        from time import localtime
        from flankers.long_task import storeIndexer

        if not memcache.get(key=self.mkey):
            # index everything stored in the last two hours
            window_start = datetime.datetime(*localtime()[:6]) - datetime.timedelta(hours=2)
            print window_start
            query = WebResource.query().filter(WebResource.stored > window_start)
            if query.count() == 0:
                memcache.delete(key=self.mkey)
                return None
            print "queried: " + str(query.count())

            # enqueue the keys to be indexed in the memcache
            listed = []
            for k in query.iter(keys_only=True):
                listed.append(k)

            memcache.add(key=self.mkey, value=listed)
            to_index = listed
        else:
            to_index = memcache.get(key=self.mkey)

        print "To be indexed: " + str(len(to_index))

        if len(to_index) != 0:
            # index one resource per run; push the remainder back to the memcache
            key = to_index.pop()
            print "popped", str(len(to_index))
            print "popping", str(key)

            try:
                s = storeIndexer()
                s.execute_task(key.get(), key)
                del s
            except Exception:
                print "resource already indexed"

            memcache.set(key=self.mkey, value=to_index)
        else:
            memcache.delete(key=self.mkey)
            print "nothing to index"
def memcache_keywords(url):
    """
    Get or set in the memcache the resulting keywords for a given url.
    :param url: the url of the WebResource
    :return: a list of keyword indexers, or None if the url is not valid
    """
    from urlparse import urlparse
    parts = urlparse(url)
    if parts.scheme and parts.netloc:
        mkey = "Keyword_for_" + url
        if not memcache.get(key=mkey):
            q = WebResource.query().filter(WebResource.url == url).fetch(1)
            results = q[0].get_indexers() if len(q) == 1 else []
            memcache.add(key=mkey, value=results, time=15000)
        else:
            results = memcache.get(key=mkey)
        return results
    else:
        return None
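# Usage sketch (not from the source): serving the cached keywords as JSON;
# the article URL is a placeholder.
import json

keywords = memcache_keywords("http://example.com/article")
if keywords is not None:
    print json.dumps(keywords)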
    def post(self, perform):
        """
        Handle dumping and monitoring for the Triple Store.
        """
        self.response.headers['Access-Control-Allow-Origin'] = '*'
        self.response.headers['Content-Type'] = 'text/html'

        if self.request.get('token') == _CLIENT_TOKEN:
            # authorized
            if perform == 'dump':
                # dump Webresource
                # 1. get a batch to be dumped
                _BATCH = int(self.request.get('batch'))
                query = WebResource.query(WebResource.in_graph == False).fetch(_BATCH)

                for q in query:
                    triples = str()
                    # 2. create triples representing the resource and its related concepts
                    df, rl = self.build_triples(q)
                    print df, rl
                    triples += self.n_triplify(df)
                    triples += " ".join([self.n_triplify(r) for r in rl])
                    print triples
                    # 3. store triples
                    _, cache_graph = store_triples(triples, _VOC_GRAPH_ID, format="n3")
                    print "GRAPH STORED OK: {} triples".format(len(cache_graph))
                    # 4. set in_graph flag to True
                    q.in_graph = True
                    q.put()

                return self.response.write(
                    "A batch of " + str(_BATCH) +
                    " resources has been successfully stored in the triple store"
                )
            elif perform == 'monitor':
                # gather statistics
                pass
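# Illustration (not from the source): store_triples() takes a serialized set of
# triples; a hand-written N-Triples string might look like the one below. The
# subject and predicate URIs are made up; in the handler above the triples come
# from build_triples()/n_triplify().
triples = ('<http://example.com/resource/1> '
           '<http://purl.org/dc/terms/title> "An example title" . ')
_, cache_graph = store_triples(triples, _VOC_GRAPH_ID, format="n3")
print "GRAPH STORED OK: {} triples".format(len(cache_graph))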
def store_video(obj):
    WebResource.store_youtube_video(obj)

def recurring(self, timeline):
    for twt in timeline:
        WebResource.store_tweet(twt)
        self.i += 1
    print "Total tweets in list: " + str(self.i)
    def get(self):
        from google.appengine.ext import ndb
        from datastore.models import WebResource

        # Forked from https://github.com/GoogleCloudPlatform/appengine-paging-python

        if self.request.get("url"):
            # serve keywords for a given article's url
            self.response.headers['Access-Control-Allow-Origin'] = '*'
            self.response.headers['Content-Type'] = 'application/json'
            if not memcache.get(key="Keyword_" + self.request.get("url")):
                q = WebResource.query().filter(WebResource.url == self.request.get("url")).fetch(1)
                response = q[0].get_indexers() if len(q) == 1 else []
                memcache.add(key="Keyword_for_" + self.request.get("url"), value=response, time=15000)
            else:
                response = memcache.get(key="Keyword_for_" + self.request.get("url"))

            return self.response.out.write(
                json.dumps(response)
            )
        else:
            # serve articles
            if not memcache.get(key="WebResource_all"):
                query = WebResource.query()
                memcache.add(key="WebResource_all", value=query, time=18000)
            else:
                query = memcache.get(key="WebResource_all")

            page_size = 25
            cursor = None
            bookmark = self.request.get('bookmark')
            if bookmark:
                # if a bookmark is set, resume the query cursor from it
                cursor = ndb.Cursor.from_websafe_string(bookmark)

            articles, next_cursor, more = query.fetch_page(page_size, start_cursor=cursor)

            next_bookmark = None
            if more:
                next_bookmark = next_cursor.to_websafe_string()
            print next_bookmark

            if next_bookmark:
                # serve the data with the link to the next bookmark
                mkey = "Articles_" + next_bookmark
                if not memcache.get(key=mkey):
                    listed = {'articles': [webres.dump_to_json()
                                           for webres in articles],
                              'next': _SERVICE + '/visualize/articles/?api=true&bookmark=' + next_bookmark}
                    memcache.add(key=mkey, value=listed, time=15000)
                else:
                    listed = memcache.get(key=mkey)
            else:
                # last page, serve the page and the next bookmark is None
                listed = {'articles': [webres.dump_to_json()
                                       for webres in articles],
                          'next': None
                          }

            if self.request.get("api"):
                # param 'api' is true, return JSON
                self.response.headers['Access-Control-Allow-Origin'] = '*'
                self.response.headers['Content-Type'] = 'application/json'
                return self.response.out.write(
                    json.dumps(listed)
                )
            # param 'api' is not set or false, return template
            path = os.path.join(_PATH, 'articles.html')
            return self.response.out.write(template.render(path, {'bookmark': next_bookmark,
                                                                  'articles': listed}))
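# Client sketch (not from the source): walking the paginated API by following
# the 'next' links; the host is a placeholder for _SERVICE, and a 'title' field
# is assumed to be part of dump_to_json() output.
import json
import urllib2

url = "http://localhost:8080/visualize/articles/?api=true"
while url:
    page = json.loads(urllib2.urlopen(url).read())
    for article in page['articles']:
        print article.get('title')
    url = page['next']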