class VespaWrite:
    """Item pipeline that writes scraped news articles to Vespa.

    The ``open_spider`` / ``process_item`` hooks follow the Scrapy item
    pipeline interface. Each item is upserted into the ``newsarticle``
    schema under a document id derived from the SHA-256 of its URL.
    """

    # (Vespa field name, item key, required?) in the order fields are copied.
    # Required keys raise KeyError when missing; optional keys are skipped.
    # Note that the item's ``summary`` is stored as Vespa's ``abstract``.
    _FIELD_MAP = (
        ("url", "url", True),
        ("bodytext", "bodytext", True),
        ("firstpubtime", "firstpubtime", True),
        ("modtime", "modtime", False),
        ("wordcount", "wordcount", True),
        ("headline", "headline", True),
        ("sentiment", "sentiment", True),
        ("abstract", "summary", False),
        ("keywords", "keywords", False),
        ("bylines", "bylines", False),
        ("section", "section", False),
        ("source", "source", True),
        ("twitter_retweet_count", "twitter_retweet_count", False),
        ("twitter_favourite_count", "twitter_favourite_count", False),
    )

    def open_spider(self, spider):
        """Create the Vespa client used for the lifetime of the spider."""
        self.vespa = Vespa(url="http://vespa-search", port=8080)

    def process_item(self, item, spider):
        """Upsert ``item`` into Vespa.

        Args:
            item: Mapping-like scraped article. Must contain every key marked
                as required in ``_FIELD_MAP``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item`` on success, or ``None`` when a required
            key is missing or the item has the wrong type (the failure is
            logged at debug level instead of being raised).
        """
        try:
            vespa_fields = {}
            for vespa_name, item_key, required in self._FIELD_MAP:
                if required or item_key in item:
                    vespa_fields[vespa_name] = item[item_key]

            # create=True turns the partial update into an upsert, so the
            # document is created if it does not exist yet.
            self.vespa.update_data(
                schema="newsarticle",
                data_id=hashlib.sha256(item['url'].encode()).hexdigest(),
                fields=vespa_fields,
                create=True,
            )
            return item
        except (KeyError, TypeError):
            # Lazy %-formatting: concatenating "error: " + item raised a
            # TypeError of its own for any non-string item.
            logger.debug("error: %s", item, exc_info=True)
            # NOTE(review): returning None hands None to any later pipeline
            # stage; consider raising scrapy's DropItem here instead.
            return None
class TwitterInserter:
    """Copy tweet engagement counts onto news articles stored in Vespa.

    ``run`` reads Twitter API credentials from the ``twitter-secrets``
    Kubernetes secret, walks recent tweets of a fixed list of news accounts
    and, for every tweet whose first link matches an indexed article, writes
    the favourite/retweet counts and the tweet link onto that article.
    """

    # Timelines scanned on every run.
    _ACCOUNTS = (
        'abcnews', 'GuardianAus', 'smh', 'iTnews_au', 'theage',
        'canberratimes', 'zdnetaustralia', 'newscomauHQ', 'westaustralian',
        'SBSNews', 'australian', 'crikey_news', '9NewsAUS', 'BBCNewsAus',
    )
    # Link shorteners that have to be followed to find the article URL.
    _SHORTENER_RE = re.compile(r'https?://zd\.net')
    _SHORTENER_PREFIXES = (
        "https://trib.al",
        "https://bit.ly",
        "https://bbc.in",
    )
    # Links pointing at these sites are skipped without an index lookup.
    _IGNORED_PREFIXES = ("https://twitter.com", "https://www.reddit.com")
    # Upper bound on chained shortener hops, so a redirect loop cannot
    # recurse or spin forever.
    _MAX_REDIRECT_HOPS = 10
    # Number of most recent tweets inspected per account.
    _TWEETS_PER_ACCOUNT = 60

    def run(self):
        """Scan all configured timelines once and update matching articles."""
        # Prefer in-cluster credentials; fall back to the local kubeconfig
        # when they cannot be loaded.
        try:
            config.load_incluster_config()
        except Exception:
            config.load_kube_config()

        v1 = client.CoreV1Api()
        twitter_secrets = v1.read_namespaced_secret(
            name='twitter-secrets', namespace='default').data
        # Values in the secret's data map are decoded from base64 before use.
        api_key = base64.b64decode(twitter_secrets["api-key"]).decode('utf-8')
        api_secret = base64.b64decode(
            twitter_secrets["api-secret"]).decode('utf-8')

        self.vespa = Vespa(url="http://vespa-search", port=8080)
        auth = tweepy.AppAuthHandler(api_key, api_secret)
        self.api = tweepy.API(auth)

        updated = 0
        for userid in self._ACCOUNTS:
            # An error while reading one timeline abandons the rest of that
            # timeline only; the remaining accounts are still processed.
            try:
                timeline = tweepy.Cursor(
                    self.api.user_timeline,
                    id=userid,
                    include_entities=True,
                    tweet_mode="extended",
                ).items(self._TWEETS_PER_ACCOUNT)
                for status in timeline:
                    urls = status.entities['urls']
                    if not urls:
                        continue
                    # Only the first link of a tweet is considered.
                    url = self.get_url(urls[0]['expanded_url'])
                    if url.startswith(self._IGNORED_PREFIXES):
                        continue
                    article = self.get_article(url)
                    # Tweets linking to articles that are not indexed are
                    # skipped.
                    if article:
                        self.update_document(article, status)
                        updated += 1
            except Exception as e:
                print("exception! {}".format(e))
        print("Completed run, updated {} tweets".format(updated))

    def get_url(self, url):
        """Return ``url`` with shorteners resolved and the query removed.

        Known shortener links are opened and replaced by the URL they
        redirect to, repeatedly, for at most ``_MAX_REDIRECT_HOPS`` hops.

        Args:
            url: Link taken from a tweet.

        Returns:
            The resolved URL with everything from the first ``?`` stripped.
        """
        for _ in range(self._MAX_REDIRECT_HOPS):
            if not self._is_short_link(url):
                break
            # Close the response explicitly; only the final URL is needed.
            with urlopen(url) as response:
                url = response.geturl()
        return url.split('?')[0]

    def _is_short_link(self, url):
        """Return True when ``url`` belongs to a known link shortener."""
        return bool(self._SHORTENER_RE.match(url)) or url.startswith(
            self._SHORTENER_PREFIXES)

    def update_document(self, article, status):
        """Write the tweet's engagement counts and link onto ``article``.

        Args:
            article: Vespa hit; ``article['fields']['url']`` identifies the
                document to update.
            status: Tweet providing ``favorite_count``, ``retweet_count``,
                ``id`` and ``user.screen_name``.
        """
        article_url = article['fields']['url']
        vespa_fields = {
            'twitter_favourite_count': status.favorite_count,
            'twitter_retweet_count': status.retweet_count,
            'twitter_link': 'https://twitter.com/{}/status/{}'.format(
                status.user.screen_name, status.id),
        }
        # The document id is the SHA-256 hex digest of the article URL.
        response = self.vespa.update_data(
            schema="newsarticle",
            data_id=hashlib.sha256(article_url.encode()).hexdigest(),
            fields=vespa_fields)
        print("Updated {} with {} {}: {}".format(article_url,
                                                 status.favorite_count,
                                                 status.retweet_count,
                                                 response))

    def get_article(self, url):
        """Look up the indexed article for ``url``.

        Returns:
            The first Vespa hit for the URL, or ``None`` when nothing matches.
        """
        body = {
            'yql': 'select url from sources newsarticle where userQuery();',
            'query': "url:{}".format(url),
            'hits': 1,
        }
        results = self.vespa.query(body=body)
        if len(results.hits) > 0:
            return results.hits[0]
        return None
class TwitterInserter:
    """Copy tweet engagement counts onto news articles stored in Vespa.

    ``run`` walks recent tweets of a fixed list of news accounts and, for
    every tweet whose first link matches an indexed article, writes the
    favourite and retweet counts onto that article.
    """

    # NOTE(review): these look like placeholders substituted at deploy time
    # -- confirm; real credentials should not be committed here.
    api_key = "TWITTER_API_KEY"
    api_secret = "TWITTER_API_SECRET"

    # Timelines scanned on every run.
    _ACCOUNTS = (
        'abcnews', 'GuardianAus', 'smh', 'iTnews_au', 'theage',
        'canberratimes', 'zdnetaustralia', 'newscomauHQ', 'westaustralian',
    )
    # Link shorteners that have to be followed to find the article URL.
    _SHORTENER_PREFIXES = ("https://zd.net", "https://bit.ly")
    # Ordered (article URL prefix, Twitter handle) pairs; first match wins.
    _SITE_HANDLES = (
        ("https://www.abc.net.au", "abcnews"),
        ("https://www.theguardian.com/", "GuardianAus"),
        ("https://www.smh.com.au", "smh"),
        ("https://www.itnews.com.au", "iTnews_au"),
        ("https://www.theage.com.au", "theage"),
        ("https://www.canberratimes.com.au", "canberratimes"),
        ("https://www.zdnet.com", "zdnetaustralia"),
        ("https://www.news.com.au", "newscomauHQ"),
        ("https://thewest.com.au", "westaustralian"),
    )

    def run(self):
        """Scan all configured timelines once and update matching articles."""
        self.vespa = Vespa(url="http://vespa-search", port=8080)
        auth = tweepy.AppAuthHandler(self.api_key, self.api_secret)
        self.api = tweepy.API(auth)

        updated = 0
        for userid in self._ACCOUNTS:
            # An error while reading one timeline abandons the rest of that
            # timeline only; the remaining accounts are still processed.
            try:
                timeline = tweepy.Cursor(self.api.user_timeline,
                                         id=userid,
                                         include_entities=True).items(60)
                for status in timeline:
                    urls = status.entities['urls']
                    if not urls:
                        continue
                    # Only the first link of a tweet is considered.
                    url = urls[0]['expanded_url'].split('?')[0]
                    if url.startswith("https://twitter.com"):
                        continue
                    url = self._expand_short_url(url)
                    article = self.get_article(url)
                    if article:
                        self.update_document(article, status)
                        updated += 1
            except Exception as e:
                logger.error(e)
        print("Completed run, updated {} tweets".format(updated))

    def _expand_short_url(self, url):
        """Follow a known shortener link and return the target URL.

        URLs that do not start with a known shortener prefix are returned
        unchanged. The query string of the redirect target is stripped so
        that tracking parameters added by the shortener do not prevent the
        URL from matching the indexed article.
        """
        if not url.startswith(self._SHORTENER_PREFIXES):
            return url
        # Close the response explicitly; only the final URL is needed.
        with urlopen(url) as response:
            return response.geturl().split('?')[0]

    def update_document(self, article, status):
        """Write the tweet's favourite and retweet counts onto ``article``.

        Args:
            article: Vespa hit; ``article['fields']['url']`` identifies the
                document to update.
            status: Tweet providing ``favorite_count`` and ``retweet_count``.
        """
        vespa_fields = {
            'twitter_favourite_count': status.favorite_count,
            'twitter_retweet_count': status.retweet_count,
        }
        # The document id is the SHA-256 hex digest of the article URL.
        self.vespa.update_data(
            schema="newsarticle",
            data_id=hashlib.sha256(
                article['fields']['url'].encode()).hexdigest(),
            fields=vespa_fields)

    def get_article(self, url):
        """Look up the indexed article for ``url``.

        Returns:
            The first Vespa hit for the URL, or ``None`` when nothing matches.
        """
        body = {
            'yql': 'select url from sources newsarticle where userQuery();',
            'query': "url:{}".format(url),
            'hits': 1,
        }
        results = self.vespa.query(body=body)
        if len(results.hits) > 0:
            return results.hits[0]
        return None

    def get_twitter_user(self, url):
        """Return the Twitter handle of the site that published ``url``.

        Returns:
            The handle whose site prefix ``url`` starts with, or ``None``
            when the URL belongs to none of the known sites.
        """
        for prefix, handle in self._SITE_HANDLES:
            if url.startswith(prefix):
                return handle
        return None