Beispiel #1
0
 def processFeeds(self, feeds):
     """Process every active feed whose polling interval has elapsed.

     Sets ``stats.feeds_total`` to ``len(feeds)`` and increments
     ``stats.feeds_processed`` once for each feed that is processed.

     Args:
         feeds: Sized collection of feed dicts. The keys read here are
             ``id``, ``title``, ``source``, ``type``, ``interval``,
             ``active`` and ``retrieved``.
     """
     stats.feeds_total = len(feeds)
     for feed in feeds:
         try:
             last_run = dateutil.parser.parse(feed['retrieved'])
             # This method stores naive UTC strings (see the
             # setFeedRetrieved call below), so a value without tzinfo is
             # interpreted as UTC instead of local time.
             if last_run.tzinfo is None:
                 last_run = last_run.replace(tzinfo=datetime.timezone.utc)
             do_it_again = int(last_run.timestamp()) + int(feed['interval'])
         except Exception:
             # A missing or unparseable timestamp/interval makes the feed
             # due immediately. KeyboardInterrupt and SystemExit are no
             # longer swallowed here.
             do_it_again = 0
         now = datetime.datetime.now(datetime.timezone.utc).timestamp()
         if feed['active'] == 1 and do_it_again < now:
             ##
             ## Time update needs to happen at the beginning of the run to
             ## minimize the possibility of a race condition when multiple
             ## collector jobs are scheduled.
             ##
             self.setFeedRetrieved(feed['id'],
                                   str(datetime.datetime.utcnow()))
             log.info("processing feed %s" % (feed['title']))
             # Feed type 0 is handled as RSS, 1 as Reddit, anything else
             # falls through to the generic handler.
             if feed['type'] == 0:
                 self.processRSS(feed['id'], feed['source'])
             elif feed['type'] == 1:
                 self.processReddit(feed['id'], feed['source'])
             else:
                 self.processGeneric(feed['id'], feed['source'])
             stats.feeds_processed += 1
Beispiel #2
0
 def processRSS(self, feedid, url):
     """Parse the RSS feed at ``url`` and queue one ArticleData per entry.

     Args:
         feedid: Identifier of the feed, passed through to ArticleData.
         url: Location handed to ``feedparser.parse``.
     """
     parsed = feedparser.parse(url)
     for entry in parsed['items']:
         log.info('retreiving %s' % entry.title)
         record = ArticleData(feedid, entry.title, entry.link,
                              entry.description, entry.published)
         self.articles.append(record)
Beispiel #3
0
 def requestToken(self):
     """Request a JWT access token from the API ``/auth`` endpoint.

     Posts the configured API username and password as a JSON body to
     ``<API_BASE_URL>/auth``.

     Returns:
         The ``access_token`` value of the JSON response.

     Raises:
         Exception: If the request fails, the server answers with an HTTP
             error status, or the response cannot be decoded. The
             underlying error is chained as ``__cause__``.
     """
     headers = {
         'Content-Type': 'application/json',
         'user-agent': collector_config.USER_AGENT
     }
     url = "%s/auth" % (collector_config.API_BASE_URL)
     data = {
         'username': collector_config.API_USERNAME,
         'password': collector_config.API_PASSWORD
     }
     response = None
     log.info("Requesting JWT token from %s for %s" %
              (url, data['username']))
     try:
         response = requests.post(url,
                                  headers=headers,
                                  data=json.dumps(data))
         response.raise_for_status()
         newtoken = response.json()['access_token']
         log.info("Retreived JWT token from %s for %s" %
                  (url, data['username']))
         return newtoken
     except requests.exceptions.HTTPError as e:
         # The error body is not guaranteed to be JSON carrying these
         # keys; fall back to the HTTPError text so that building the
         # log message can never mask the real failure.
         try:
             body = response.json()
             detail = "%s %s (%s)" % (body['status_code'], body['error'],
                                      body['description'])
         except (ValueError, KeyError, TypeError):
             detail = str(e)
         log.error("Error requesting token from %s: %s" % (url, detail))
         raise Exception("Error requesting token from %s" % url) from e
     except Exception as e:
         log.exception(e)
         raise Exception("Error requesting token from %s" % url) from e
Beispiel #4
0
 def setFeedRetrieved(self, feedid, retrieved):
     """Store ``retrieved`` as the last-retrieved time of feed ``feedid``.

     Issues ``rest.doPut("feed", ...)`` with an ``id equals feedid`` filter.

     Args:
         feedid: Value matched against the feed's ``id`` field.
         retrieved: Value written to the feed's ``retrieved`` field.

     Returns:
         True when the response reports a non-zero ``num_modified``,
         False otherwise.

     Raises:
         Exception: If the PUT call raises; the original error is chained
             as ``__cause__``.
     """
     log.debug("Updating feed %s with last retrieved time of %s." %
               (feedid, retrieved))
     filters = [dict(name='id', op='equals', val=feedid)]
     data = dict(retrieved=retrieved)
     # Diagnostic output goes through the logger instead of stdout.
     log.debug("doput(feed, %s, %s)" % (filters, data))
     try:
         response = rest.doPut("feed", filters, data)
     except Exception as e:
         log.error(
             "Failed to updaste feed %s with last retrieved time of %s." %
             (feedid, retrieved))
         log.exception(e)
         raise Exception(
             "Failed to update feed %s with last retrieved time of %s." %
             (feedid, retrieved)) from e
     if response.json()['num_modified'] != 0:
         log.info("Updating feed %s with last retrieved time of %s." %
                  (feedid, retrieved))
         return True
     log.error(
         "Failed to updaste feed %s with last retrieved time of %s." %
         (feedid, retrieved))
     return False
Beispiel #5
0
 def processGeneric(self, feedid, url):
     """Queue an ArticleData for every article newspaper finds at ``url``.

     Each article is downloaded first; the queued record carries the
     article's title and URL, an empty description, and the current UTC
     time as a string.

     Args:
         feedid: Identifier of the feed, passed through to ArticleData.
         url: Site address handed to ``newspaper.build``.
     """
     source = newspaper.build(url)
     for article in source.articles:
         article.download()
         log.info('retreiving %s' % article.title)
         record = ArticleData(feedid, article.title, article.url, "",
                              str(datetime.datetime.utcnow()))
         self.articles.append(record)
Beispiel #6
0
 def getFeeds(this):
     """Fetch the list of feeds from the API server.

     Returns:
         The ``objects`` entry of the JSON response to
         ``rest.doGet("feed")``.

     Raises:
         Exception: If the request or the decoding of its response fails;
             the original error is chained as ``__cause__``.
     """
     log.info("Retreiving list of feeds from API server.")
     try:
         response = rest.doGet("feed")
         return response.json()['objects']
     except Exception as e:
         # ``except Exception`` (not a bare ``except``) so that
         # KeyboardInterrupt/SystemExit are not converted into a generic
         # failure.
         log.error("Error retreiving list of feeds from API server.")
         raise Exception(
             "Error retrieving list of feeds from API server.") from e
Beispiel #7
0
 def addArticle(self, article):
     """Post ``article`` to the API and link it to each of its keywords.

     Args:
         article: Object exposing ``title``, ``link``, ``keywords``,
             ``description``, ``content``, ``contenthash``, ``retrieved``
             and ``published`` attributes.

     Returns:
         False when ``articleExists`` does not report ``False`` for the
         article; None otherwise (both after a successful post and after
         a logged failure).
     """
     already_known = self.articleExists(article)
     # Only an explicit False from articleExists means "not stored yet".
     if not (already_known == False):  # noqa: E712
         return False

     log.info("adding article %s" % (article.title))
     data = {
         "title": article.title,
         "link": article.link,
         "keywords": article.keywords,
         "description": article.description,
         "content": article.content,
         "contenthash": article.contenthash,
         "retrieved": article.retrieved,
         "published": article.published,
     }
     try:
         response = rest.doPost("article", data)
         for word in article.keywords:
             keywordid = self.addKeyword(word)
             self.addArticleKeyword(response.json()['id'], keywordid)
     except Exception as err:
         log.exception(err)
         return None
Beispiel #8
0
 def getFeedRetrieved(self, feedid):
     """Look up the last-retrieved timestamp recorded for feed ``feedid``.

     Args:
         feedid: Value used in the ``id equals`` filter.

     Returns:
         The ``retrieved`` field of the last entry in the response's
         ``objects`` list, or False when ``num_results`` is 0.

     Raises:
         Exception: If the request or the handling of its response fails;
             the original error is chained as ``__cause__``.
     """
     log.debug("Checking when feed %s last retreived." % (feedid))
     filters = [dict(name='id', op='equals', val=feedid)]
     try:
         # NOTE(review): other calls in this file pass a resource name
         # first (e.g. rest.doGet("feed"), rest.doPut("feed", ...)); here
         # only the filters are passed -- confirm against rest.doGet.
         response = rest.doGet(filters)
         payload = response.json()
         if payload['num_results'] != 0:
             r = payload['objects'].pop()
             log.info("Feed %s last retreived %s" % (feedid, r['retrieved']))
             return r['retrieved']
         log.info("No retreived timestamp for %s" % (feedid))
         return False
     except Exception as e:
         log.exception(e)
         raise Exception(
             "Failed to look up last retrieved time of feed %s" %
             (feedid)) from e
Beispiel #9
0
 def getFeedRetrieved(self, feedid):
     """Return the stored last-retrieved timestamp of feed ``feedid``.

     Args:
         feedid: Value used in the ``id equals`` filter.

     Returns:
         The ``retrieved`` field of the last entry in the response's
         ``objects`` list, or False when ``num_results`` is 0.

     Raises:
         Exception: If the request or the handling of its response fails;
             the original error is chained as ``__cause__``.
     """
     log.debug("Checking when feed %s last retreived." % (feedid))
     filters = [dict(name='id', op='equals', val=feedid)]
     try:
         # NOTE(review): other calls in this file pass a resource name
         # first (e.g. rest.doGet("feed"), rest.doPut("feed", ...)); here
         # only the filters are passed -- confirm against rest.doGet.
         response = rest.doGet(filters)
         # Decode the body once and reuse it for both lookups.
         payload = response.json()
         if payload['num_results'] != 0:
             r = payload['objects'].pop()
             log.info("Feed %s last retreived %s" %
                      (feedid, r['retrieved']))
             return r['retrieved']
         log.info("No retreived timestamp for %s" % (feedid))
         return False
     except Exception as e:
         log.exception(e)
         raise Exception(
             "Failed to look up last retrieved time of feed %s" %
             (feedid)) from e
Beispiel #10
0
 def setFeedRetrieved(self, feedid, retrieved):
     """Write ``retrieved`` as the last-retrieved time of feed ``feedid``.

     Issues ``rest.doPut("feed", ...)`` with an ``id equals feedid`` filter.

     Args:
         feedid: Value matched against the feed's ``id`` field.
         retrieved: Value written to the feed's ``retrieved`` field.

     Returns:
         True when the response reports a non-zero ``num_modified``,
         False otherwise.

     Raises:
         Exception: If the PUT call raises; the original error is chained
             as ``__cause__``.
     """
     log.debug("Updating feed %s with last retrieved time of %s." % (feedid, retrieved))
     filters = [dict(name='id', op='equals', val=feedid)]
     data = dict(retrieved=retrieved)
     # Diagnostic output goes through the logger instead of stdout.
     log.debug("doput(feed, %s, %s)" % (filters, data))
     try:
         response = rest.doPut("feed", filters, data)
     except Exception as e:
         log.error("Failed to updaste feed %s with last retrieved time of %s." % (feedid, retrieved))
         log.exception(e)
         raise Exception(
             "Failed to update feed %s with last retrieved time of %s." % (feedid, retrieved)
         ) from e
     if response.json()['num_modified'] != 0:
         log.info("Updating feed %s with last retrieved time of %s." % (feedid, retrieved))
         return True
     log.error("Failed to updaste feed %s with last retrieved time of %s." % (feedid, retrieved))
     return False
Beispiel #11
0
 def requestToken(self):
     """Obtain a JWT access token from the API ``/auth`` endpoint.

     Posts the configured API username and password as a JSON body to
     ``<API_BASE_URL>/auth``.

     Returns:
         The ``access_token`` value of the JSON response.

     Raises:
         Exception: If the request fails, the server answers with an HTTP
             error status, or the response cannot be decoded. The
             underlying error is chained as ``__cause__``.
     """
     headers = {"Content-Type": "application/json", "user-agent": collector_config.USER_AGENT}
     url = "%s/auth" % (collector_config.API_BASE_URL)
     data = {"username": collector_config.API_USERNAME, "password": collector_config.API_PASSWORD}
     response = None
     log.info("Requesting JWT token from %s for %s" % (url, data["username"]))
     try:
         response = requests.post(url, headers=headers, data=json.dumps(data))
         response.raise_for_status()
         newtoken = response.json()["access_token"]
         log.info("Retreived JWT token from %s for %s" % (url, data["username"]))
         return newtoken
     except requests.exceptions.HTTPError as e:
         # The error body is not guaranteed to be JSON carrying these
         # keys; fall back to the HTTPError text so that building the
         # log message can never mask the real failure.
         try:
             body = response.json()
             detail = "%s %s (%s)" % (body["status_code"], body["error"], body["description"])
         except (ValueError, KeyError, TypeError):
             detail = str(e)
         log.error("Error requesting token from %s: %s" % (url, detail))
         raise Exception("Error requesting token from %s" % url) from e
     except Exception as e:
         log.exception(e)
         raise Exception("Error requesting token from %s" % url) from e
Beispiel #12
0
 def addArticle(self, article):
     """Store ``article`` through the API and attach its keywords.

     Args:
         article: Object exposing ``title``, ``link``, ``keywords``,
             ``description``, ``content``, ``contenthash``, ``retrieved``
             and ``published`` attributes.

     Returns:
         False when ``articleExists`` does not report ``False`` for the
         article; None otherwise (both after a successful post and after
         a logged failure).
     """
     # Only an explicit False from articleExists means "not stored yet".
     is_new = self.articleExists(article) == False  # noqa: E712
     if not is_new:
         return False

     log.info("adding article %s" % (article.title))
     # Payload keys mirror the attribute names on the article object.
     field_names = ("title", "link", "keywords", "description",
                    "content", "contenthash", "retrieved", "published")
     data = {field: getattr(article, field) for field in field_names}
     try:
         response = rest.doPost("article", data)
         for keyword in article.keywords:
             keyword_id = self.addKeyword(keyword)
             self.addArticleKeyword(response.json()['id'], keyword_id)
     except Exception as err:
         log.exception(err)
         return None
Beispiel #13
0
 def processFeeds(self, feeds):
     """Run the matching processor for each active feed that is due.

     Sets ``stats.feeds_total`` to ``len(feeds)`` and increments
     ``stats.feeds_processed`` once for each feed that is processed.

     Args:
         feeds: Sized collection of feed dicts. The keys read here are
             ``id``, ``title``, ``source``, ``type``, ``interval``,
             ``active`` and ``retrieved``.
     """
     stats.feeds_total = len(feeds)
     for feed in feeds:
         try:
             last_run = dateutil.parser.parse(feed['retrieved'])
             # This method stores naive UTC strings (see the
             # setFeedRetrieved call below), so a value without tzinfo is
             # interpreted as UTC instead of local time.
             if last_run.tzinfo is None:
                 last_run = last_run.replace(tzinfo=datetime.timezone.utc)
             do_it_again = int(last_run.timestamp()) + int(feed['interval'])
         except Exception:
             # A missing or unparseable timestamp/interval makes the feed
             # due immediately. KeyboardInterrupt and SystemExit are no
             # longer swallowed here.
             do_it_again = 0
         now = datetime.datetime.now(datetime.timezone.utc).timestamp()
         if feed['active'] == 1 and do_it_again < now:
             ##
             ## Time update needs to happen at the beginning of the run to
             ## minimize the possibility of a race condition when multiple
             ## collector jobs are scheduled.
             ##
             self.setFeedRetrieved(feed['id'], str(datetime.datetime.utcnow()))
             log.info("processing feed %s" % (feed['title']))
             # Feed type 0 is handled as RSS, 1 as Reddit, anything else
             # falls through to the generic handler.
             if feed['type'] == 0:
                 self.processRSS(feed['id'], feed['source'])
             elif feed['type'] == 1:
                 self.processReddit(feed['id'], feed['source'])
             else:
                 self.processGeneric(feed['id'], feed['source'])
             stats.feeds_processed += 1
Beispiel #14
0
    def processReddit(self, feedid, subreddit):
        """Queue an ArticleData for every post in a subreddit's listings.

        The ``get_new``, ``get_hot`` and ``get_top`` listings are walked in
        that order; a post appearing in more than one listing is queued
        once per appearance.

        Args:
            feedid: Identifier of the feed, passed through to ArticleData.
            subreddit: Name handed to ``get_subreddit``.
        """
        reddit = praw.Reddit(collector_config.USER_AGENT)
        sub = reddit.get_subreddit(subreddit)

        # Each listing is looked up and fetched only when its turn comes.
        for listing_name in ('get_new', 'get_hot', 'get_top'):
            fetch_listing = getattr(sub, listing_name)
            for post in fetch_listing():
                log.info('retreiving %s' % post.title)
                record = ArticleData(
                    feedid, post.title, post.url, post.selftext,
                    str(datetime.datetime.utcfromtimestamp(post.created)))
                self.articles.append(record)
Beispiel #15
0
    def processReddit(self, feedid, subreddit):
        """Collect the new, hot and top posts of ``subreddit`` as articles.

        The three listings are handled one after another in that order;
        every post yields one ArticleData appended to ``self.articles``.

        Args:
            feedid: Identifier of the feed, passed through to ArticleData.
            subreddit: Name handed to ``get_subreddit``.
        """
        client = praw.Reddit(collector_config.USER_AGENT)
        sr = client.get_subreddit(subreddit)

        def queue(submissions):
            # Log and enqueue every submission of one listing.
            for submission in submissions:
                log.info('retreiving %s' % submission.title)
                self.articles.append(
                    ArticleData(
                        feedid, submission.title, submission.url,
                        submission.selftext,
                        str(datetime.datetime.utcfromtimestamp(
                            submission.created))))

        queue(sr.get_new())
        queue(sr.get_hot())
        queue(sr.get_top())
Beispiel #16
0
 def initFeeds(this):
     """Register every feed listed in the configured feeds file.

     Reads ``collector_config.FEEDS_FILE`` as JSON and calls ``addFeed``
     once per entry with its ``title``, ``source``, an empty string,
     ``type``, ``interval`` and ``active`` values.
     """
     feeds_path = collector_config.FEEDS_FILE
     log.info('Reading feeds file %s' % (feeds_path))
     with open(feeds_path) as handle:
         feed_definitions = json.load(handle)
     for entry in feed_definitions:
         this.addFeed(entry['title'], entry['source'], "",
                      entry['type'], entry['interval'], entry['active'])
Beispiel #17
0
 def processGeneric(self, feedid, url):
     """Download each article newspaper discovers at ``url`` and queue it.

     The queued ArticleData carries the article's title and URL, an empty
     description, and the current UTC time rendered as a string.

     Args:
         feedid: Identifier of the feed, passed through to ArticleData.
         url: Site address handed to ``newspaper.build``.
     """
     for found in newspaper.build(url).articles:
         found.download()
         log.info('retreiving %s' % found.title)
         self.articles.append(
             ArticleData(
                 feedid,
                 found.title,
                 found.url,
                 "",
                 str(datetime.datetime.utcnow()),
             )
         )
Beispiel #18
0
 def processRSS(self, feedid, url):
     """Queue an ArticleData for every item of the RSS feed at ``url``.

     Args:
         feedid: Identifier of the feed, passed through to ArticleData.
         url: Location handed to ``feedparser.parse``.
     """
     for entry in feedparser.parse(url)['items']:
         log.info('retreiving %s' % entry.title)
         self.articles.append(
             ArticleData(
                 feedid,
                 entry.title,
                 entry.link,
                 entry.description,
                 entry.published,
             )
         )