def load_config(config_file):
    try:
        with open(config_file, "r") as config_file_data:
            _config_file_settings = json.load(config_file_data)
            return _config_file_settings
    except Exception as e:
        get_logger().error("Error opening config file: %s" % e)
        return None

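# Illustrative only: load_config() simply json.load()s the file it is given.
# Elsewhere, load_model() reads _config_settings['news']['sources'] and
# fetch_news_sources() treats each value as a list of RSS feed URLs, so a
# sources file consistent with those accesses could look like the placeholder
# below. The source name and feed URLs are made up for illustration; the real
# file name comes from GlobalConfig().get().sources.
EXAMPLE_SOURCES_CONFIG = {
    "news": {
        "sources": {
            "example-wire": [
                "https://example.com/rss/top-stories.xml",
                "https://example.com/rss/world.xml",
            ]
        }
    }
}
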
def _get_cluster_terms(self, order_centroids, index, _used_terms):
    get_logger().info("Cluster %d words:" % index)
    _terms = ""
    for ind in order_centroids[index, :8]:
        _term = self.vocab_frame.ix[self.terms[ind].split(
            ' ')].values.tolist()[0][0].encode('utf-8', 'ignore')
        if _term not in self.stopwords:
            _terms += " " + _term
    _terms = list(
        set(filter(lambda term: is_keyword(term), _terms.split())))
    _terms = [t for t in _terms if t not in _used_terms][0:4]
    for t in _terms:
        _used_terms.add(t)
    get_logger().info("%s" % _terms)
    return _terms

def load_model():
    _config_settings = load_config(
        os.path.join("config", GlobalConfig().get().sources))
    check_config(_config_settings)
    _feeds = _config_settings['news']['sources']
    _news_sources, _keywords = fetch_news_sources(_feeds)
    safe_delete('news_sources.pkl')
    safe_delete('news_keywords.pkl')
    with open('news_sources.pkl', 'w') as outfile:
        pickle.dump(_news_sources, outfile)
    with open('news_keywords.pkl', 'w') as outfile:
        pickle.dump(_keywords, outfile)
    get_logger().info("Fetched %s news articles from %s sources..." %
                      (len(_news_sources.keys()), len(_feeds)))

def post_tweet(topic):
    _hash_tags = get_hashtags(topic['keywords'])
    _config = GlobalConfig().get()
    cfg = {
        "access_token": _config.twitter.access_token,
        "access_secret": _config.twitter.access_secret,
        "consumer_key": _config.twitter.consumer_key,
        "consumer_secret": _config.twitter.consumer_secret,
    }
    try:
        auth = tweepy.OAuthHandler(cfg['consumer_key'], cfg['consumer_secret'])
        auth.set_access_token(cfg['access_token'], cfg['access_secret'])
        api = tweepy.API(auth)
        api.update_status(status=topic['url'] + " " + _hash_tags)
    except Exception as e:
        get_logger().error("Error posting to twitter: %s" % e)

def k_means(self):
    for i in self.summaries:
        self.totalvocab_stemmed.extend(self.tokenize_and_stem(i))
        self.totalvocab_tokenized.extend(Tokenizable.tokenize_only(i))
    self.vocab_frame = pd.DataFrame({'words': self.totalvocab_tokenized},
                                    index=self.totalvocab_stemmed)
    get_logger().info("There are %s items in vocab_frame" %
                      str(self.vocab_frame.shape[0]))
    km = KMeans(n_clusters=self.num_clusters)
    km.fit(self.tfidf_maxtrix())
    joblib.dump(km, self.output_file)
    return {
        "clusters": self.num_clusters,
        "words": int(self.vocab_frame.shape[0]),
        "articles": len(self.titles),
        "last_update": int(time.time()),
    }

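def _example_reload_clusters(model_path='doc_cluster.pkl'):
    # Illustrative sketch, not called by the pipeline: k_means() persists the
    # fitted KMeans via joblib.dump(km, self.output_file), so the per-summary
    # cluster assignments can be read back later. The default path here is an
    # assumption based on the safe_delete('doc_cluster.pkl') call in
    # run_model(); the real location is whatever self.output_file is set to.
    import joblib  # may be sklearn.externals.joblib in older environments
    km = joblib.load(model_path)
    return km.labels_.tolist()  # one cluster id per summary, in input order
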
def post_message(topic):
    _hash_tags = get_hashtags(topic['keywords'])
    _config = GlobalConfig().get()
    cfg = {
        "page_id": _config.facebook.page_id,
        "access_token": _config.facebook.access_token
    }
    attachment = {
        "link": topic['url'],
        'caption': "http://www.speciousnews.com"
    }
    try:
        api = facebook.GraphAPI(cfg["access_token"])
        api.put_wall_post(message=topic['title'] + " " + _hash_tags,
                          attachment=attachment,
                          profile_id=cfg['page_id'])
    except Exception as e:
        get_logger().error("Error posting to facebook: %s" % e)

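# Purely illustrative: post_tweet() and post_message() only read the fields
# below from GlobalConfig().get(). The values are placeholders, not real
# credentials, and the real config object exposes them as attributes
# (e.g. config.twitter.access_token) rather than as a plain dict.
EXAMPLE_SOCIAL_CREDENTIALS = {
    "twitter": {
        "consumer_key": "<consumer key>",
        "consumer_secret": "<consumer secret>",
        "access_token": "<access token>",
        "access_secret": "<access secret>",
    },
    "facebook": {
        "page_id": "<page id>",
        "access_token": "<page access token>",
    },
}
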
def _get_cluster_titles(self, frame, index, _terms, max_articles=40):
    get_logger().info("Cluster %d titles:" % index)
    _articles = list()
    _titles = frame.ix[index]['title'].values.tolist()
    for title in _titles[0:max_articles]:
        get_logger().info(' %s,' % title)
        _source = self.sources.get(title)
        # Guard against titles that have no matching source entry.
        _article = {"title": title,
                    'polarity': _source.get('polarity', 0) if _source else 0}
        if _source:
            _article['html'] = _source.get('html', '')
        if _source and _source.get('image'):
            _article['image'] = _source.get('image')
        if _source and _source.get('summary'):
            _article['summary'] = _source.get("summary", '')
        if _source and _source.get('link'):
            _article['link'] = _source.get("link", '')
        # Only keep articles that have both rendered html and a summary.
        if _article.get('html') and _article.get('summary'):
            _articles.append(_article)
    return Clusterable.mk_new_cluster(_articles, _terms, len(_titles))

def tfidf_maxtrix(self):
    get_logger().info("Generating TF-IDF vector.")
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=2**20,
                                 min_df=0.005,
                                 stop_words='english',
                                 use_idf=True,
                                 tokenizer=self.tokenize_and_stem,
                                 ngram_range=(1, 3))
    get_logger().info("Fitting TF-IDF vector.")
    tfidf_matrix = vectorizer.fit_transform(self.summaries)
    get_logger().info("TF-IDF matrix shape: %s" % str(tfidf_matrix.shape))
    self.terms = vectorizer.get_feature_names()
    get_logger().info("Calculating similarity between documents")
    self.dist = 1 - cosine_similarity(tfidf_matrix)
    return tfidf_matrix

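def _example_tfidf_distance(docs=("apples and pears", "pears and plums",
                                  "markets fell sharply")):
    # Standalone toy sketch of what tfidf_maxtrix() computes: a document-term
    # TF-IDF matrix plus a square document-by-document distance matrix
    # (1 - cosine similarity), which is what self.dist holds. The sample
    # documents and bare-bones vectorizer settings are placeholders; the real
    # vectorizer above uses the stemming tokenizer, n-grams and df cutoffs.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = vectorizer.fit_transform(docs)
    dist = 1 - cosine_similarity(tfidf)  # shape: (len(docs), len(docs))
    return dist
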
def fetch_news_sources(news_sources, max_keywords=15, max_days_elapsed=10):
    news_items = dict()
    news_keywords = dict()
    for _source, _news_sources in news_sources.items():
        for _rss_news_source in _news_sources:
            try:
                get_logger().info("Fetching: %s" % _rss_news_source)
                _feed = feedparser.parse(_rss_news_source)
                for _news_item in _feed['items']:
                    days_elapsed = 0
                    if _news_item.get('published_parsed'):
                        published_time = datetime.fromtimestamp(
                            mktime(_news_item['published_parsed']))
                        days_elapsed = (datetime.now() - published_time).days
                    if days_elapsed <= max_days_elapsed:
                        parse_news_article(_news_item, _source, news_items,
                                           news_keywords)
            except Exception as e:
                get_logger().warn("Error reading: %s (%s)" %
                                  (_rss_news_source, e))
    get_logger().info("Found %s noun phrases." % len(news_keywords.keys()))
    return news_items, sorted(
        news_keywords.items(),
        key=lambda (k, v): (-1 * len(v), k))[0:max_keywords]

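def _example_rank_keywords(keywords=None, max_keywords=2):
    # Toy illustration of the keyword ranking at the end of
    # fetch_news_sources(): noun phrases are ordered by how many items mention
    # them (descending), ties broken alphabetically, then truncated. The sample
    # data is made up; the lambda below is the Python 2/3-portable equivalent
    # of the tuple-unpacking lambda used above.
    keywords = keywords or {
        "election": ["item-1", "item-2", "item-3"],
        "budget": ["item-2"],
        "climate": ["item-1", "item-4", "item-5"],
    }
    ranked = sorted(keywords.items(), key=lambda kv: (-len(kv[1]), kv[0]))
    return ranked[0:max_keywords]  # -> [('climate', [...]), ('election', [...])]
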
def run_model():
    with open("news_sources.pkl", "r") as news_sources_file:
        _news_sources = pickle.loads(news_sources_file.read())
    with open("news_keywords.pkl", "r") as news_keywords_file:
        _keywords = pickle.loads(news_keywords_file.read())
    safe_delete('doc_cluster.pkl')
    try:
        news = NewsAnalyzer(news_items=_news_sources)
        _cluster_info = news.k_means()
        _topics = news.analyze()
        get_logger().info("Storing results to mongo db...")
        NewsDB().add_topics(_topics)
        get_logger().info("Finished saving results to mongo db...")
        get_logger().info("Saving cluster graph file as html.")
        _cluster_info['html'] = news.view_cluster_html()
        NewsDB().update_cluster_info(_cluster_info)
        get_logger().info("Saved cluster graph to html.")
        get_logger().info("Saving news articles.")
        NewsDB().add_news_articles(_news_sources)
        get_logger().info("Analyzing and saving keywords.")
        _keywords = NewsAnalyzer.analyze_keywords(_keywords)
        NewsDB().add_news_keywords(_keywords)
        get_logger().info("Posting link(s) to social media.")
        post_message(find_one_random_topic(_topics + _keywords))
        post_tweet(find_one_random_topic(_topics + _keywords))
        get_logger().info("Done.")
    except Exception as e:
        get_logger().error("Error running model: %s" % str(e))

def check_config(config_settings):
    if not config_settings:
        get_logger().error("Invalid config file. Exiting...\n")
        sys.exit(0)

get_logger().info("Analyzing and saving keywords.") _keywords = NewsAnalyzer.analyze_keywords(_keywords) NewsDB().add_news_keywords(_keywords) get_logger().info("Posting link(s) to social media.") post_message(find_one_random_topic(_topics + _keywords)) post_tweet(find_one_random_topic(_topics + _keywords)) get_logger().info("Done.") except Exception as e: get_logger().error("Error running model: %s" % str(e)) def show_usage(): print("Usage: python news.py [--load | --run]\n") sys.exit(1) if __name__ == "__main__": if len(sys.argv) > 2: show_usage() get_logger().info("Starting news model...") if len(sys.argv) == 1: load_model() run_model() elif sys.argv[1].lower() == "--load": load_model() elif sys.argv[1].lower() == "--run": run_model() else: show_usage()