Example #1
def load_config(config_file):
    try:
        with open(config_file, "r") as config_file_data:
            _config_file_settings = json.load(config_file_data)
        return _config_file_settings
    except Exception as e:
        get_logger().error("Error opening config file: %s" % e)
        return None
Example #2
    def _get_cluster_terms(self, order_centroids, index, _used_terms):
        get_logger().info("Cluster %d words:" % index)
        _terms = ""
        # order_centroids ranks vocabulary indices per cluster; take the top 8.
        for ind in order_centroids[index, :8]:
            _term = self.vocab_frame.ix[self.terms[ind].split(
                ' ')].values.tolist()[0][0].encode('utf-8', 'ignore')
            if _term not in self.stopwords:
                _terms += " " + _term
        # Keep unique keyword terms not already claimed by an earlier cluster.
        _terms = list(
            set(filter(lambda term: is_keyword(term), _terms.split())))
        _terms = [t for t in _terms if t not in _used_terms][0:4]
        for t in _terms:
            _used_terms.add(t)
        get_logger().info("%s" % _terms)
        return _terms
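
The order_centroids argument is not built in this snippet. A minimal sketch, assuming the usual scikit-learn pattern (toy data; names other than order_centroids are placeholders): each centroid's term weights are argsorted in descending order, so order_centroids[index, :8] yields the indices of a cluster's eight strongest vocabulary terms.

import numpy as np
from sklearn.cluster import KMeans

# Toy stand-in for the TF-IDF matrix produced by tfidf_maxtrix() (Example #8).
tfidf_matrix = np.random.rand(20, 50)
km = KMeans(n_clusters=5).fit(tfidf_matrix)

# Rank vocabulary indices per cluster by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]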
Example #3
def load_model():
    _config_settings = load_config(
        os.path.join("config",
                     GlobalConfig().get().sources))
    check_config(_config_settings)
    _feeds = _config_settings['news']['sources']
    _news_sources, _keywords = fetch_news_sources(_feeds)
    safe_delete('news_sources.pkl')
    safe_delete('news_keywords.pkl')
    with open('news_sources.pkl', 'w') as outfile:
        pickle.dump(_news_sources, outfile)
    with open('news_keywords.pkl', 'w') as outfile:
        pickle.dump(_keywords, outfile)
    get_logger().info("Fetched %s news articles from %s sources..." %
                      (len(_news_sources.keys()), len(_feeds)))
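
load_model assumes the JSON file named by GlobalConfig().get().sources carries a news.sources mapping of source names to lists of RSS feed URLs, matching how fetch_news_sources (Example #9) iterates it. A hypothetical minimal config; every source name and URL below is a placeholder, not taken from the project:

import json

# Keys mirror the _config_settings['news']['sources'] lookup above.
sample_config = {
    "news": {
        "sources": {
            "example-wire": ["https://example.com/rss.xml"],
        }
    }
}
print(json.dumps(sample_config, indent=2))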
Example #4
def post_tweet(topic):
    _hash_tags = get_hashtags(topic['keywords'])
    _config = GlobalConfig().get()
    cfg = {
        "access_token": _config.twitter.access_token,
        "access_secret": _config.twitter.access_secret,
        "consumer_key": _config.twitter.consumer_key,
        "consumer_secret": _config.twitter.consumer_secret,
    }
    try:
        auth = tweepy.OAuthHandler(cfg['consumer_key'], cfg['consumer_secret'])
        auth.set_access_token(cfg['access_token'], cfg['access_secret'])
        api = tweepy.API(auth)
        api.update_status(status=topic['url'] + " " + _hash_tags)
    except Exception as e:
        get_logger().error("Error posting to twitter: %s" % e)
Example #5
    def k_means(self):
        # Build parallel stemmed and tokenized vocabularies from the summaries.
        for i in self.summaries:
            self.totalvocab_stemmed.extend(self.tokenize_and_stem(i))
            self.totalvocab_tokenized.extend(Tokenizable.tokenize_only(i))
        self.vocab_frame = pd.DataFrame({'words': self.totalvocab_tokenized},
                                        index=self.totalvocab_stemmed)
        get_logger().info("There are %s items in vocab_frame" %
                          str(self.vocab_frame.shape[0]))
        # Cluster the TF-IDF matrix and persist the fitted model for later runs.
        km = KMeans(n_clusters=self.num_clusters)
        km.fit(self.tfidf_maxtrix())
        joblib.dump(km, self.output_file)
        return {
            "clusters": self.num_clusters,
            "words": int(self.vocab_frame.shape[0]),
            "articles": len(self.titles),
            "last_update": int(time.time()),
        }
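
k_means persists the fitted model with joblib but does not show how the cluster assignments become the cluster-indexed frame that _get_cluster_titles (Example #7) reads. A plausible sketch under that assumption, using toy data and placeholder names:

import numpy as np
import pandas as pd
import joblib  # older code sometimes imports this as sklearn.externals.joblib
from sklearn.cluster import KMeans

# Toy fit standing in for the model written to self.output_file
# ('doc_cluster.pkl' according to run_model in Example #10).
km = KMeans(n_clusters=3).fit(np.random.rand(6, 10))
joblib.dump(km, 'doc_cluster.pkl')
km = joblib.load('doc_cluster.pkl')

# Hypothetical frame indexed by cluster id, the shape frame.ix[index]['title']
# in Example #7 appears to expect; the titles here are placeholders.
titles = ['title %d' % i for i in range(6)]
frame = pd.DataFrame({'title': titles, 'cluster': km.labels_.tolist()})
frame = frame.set_index('cluster')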
Example #6
def post_message(topic):
    _hash_tags = get_hashtags(topic['keywords'])
    _config = GlobalConfig().get()
    cfg = {
        "page_id": _config.facebook.page_id,
        "access_token": _config.facebook.access_token
    }
    attachment = {
        "link": topic['url'],
        'caption': "http://www.speciousnews.com"
    }
    try:
        api = facebook.GraphAPI(cfg["access_token"])
        api.put_wall_post(message=topic['title'] + " " + _hash_tags,
                          attachment=attachment,
                          profile_id=cfg['page_id'])
    except Exception as e:
        get_logger().error("Error posting to facebook: %s" % e)
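
post_tweet (Example #4) and post_message both consume a topic dict carrying at least 'title', 'url', and 'keywords'. A hypothetical minimal topic for exercising either function in isolation; the values are placeholders, and valid Twitter/Facebook credentials from GlobalConfig() are still required:

# Real topics come from NewsAnalyzer.analyze() / analyze_keywords() in run_model.
topic = {
    "title": "Example headline",
    "url": "https://example.com/article",
    "keywords": ["example", "headline"],
}
# post_message(topic) and post_tweet(topic) read these keys.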
Example #7
    def _get_cluster_titles(self, frame, index, _terms, max_articles=40):
        get_logger().info("Cluster %d titles:" % index)
        _articles = list()
        _titles = frame.ix[index]['title'].values.tolist()
        for title in _titles[0:max_articles]:
            get_logger().info(' %s,' % title)
            _source = self.sources.get(title)
            # Skip titles without a source entry; calling .get on None would raise.
            if not _source:
                continue
            _article = {"title": title, 'polarity': _source.get('polarity', 0)}
            _article['html'] = _source.get('html', '')
            if _source.get('image'):
                _article['image'] = _source.get('image')
            if _source.get('summary'):
                _article['summary'] = _source.get('summary', '')
            if _source.get('link'):
                _article['link'] = _source.get('link', '')
            if _article.get('html') and _article.get('summary'):
                _articles.append(_article)

        return Clusterable.mk_new_cluster(_articles, _terms, len(_titles))
Example #8
    def tfidf_maxtrix(self):
        get_logger().info("Generating TF-IDF vector.")
        vectorizer = TfidfVectorizer(max_df=0.5,
                                     max_features=2**20,
                                     min_df=0.005,
                                     stop_words='english',
                                     use_idf=True,
                                     tokenizer=self.tokenize_and_stem,
                                     ngram_range=(1, 3))
        get_logger().info("Fitting TF-IDF vector.")
        tfidf_matrix = vectorizer.fit_transform(self.summaries)
        get_logger().info(tfidf_matrix.shape)
        self.terms = vectorizer.get_feature_names()
        # Pairwise cosine distance between documents, kept for visualization.
        get_logger().info("Calculating similarity between documents")
        self.dist = 1 - cosine_similarity(tfidf_matrix)
        return tfidf_matrix
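
Besides returning the TF-IDF matrix, tfidf_maxtrix stores self.dist, a pairwise cosine-distance matrix. One common use for such a matrix, and a plausible (assumed, not shown in these examples) basis for the cluster HTML view mentioned in run_model (Example #10), is a 2-D MDS projection; this sketch uses toy data:

import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

# Toy stand-in for the fitted TF-IDF matrix.
tfidf_matrix = np.random.rand(10, 30)
dist = 1 - cosine_similarity(tfidf_matrix)

# Project the precomputed distances to 2-D, one (x, y) point per article.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)
xs, ys = pos[:, 0], pos[:, 1]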
Example #9
def fetch_news_sources(news_sources, max_keywords=15, max_days_elapsed=10):
    news_items = dict()
    news_keywords = dict()
    for _source, _news_sources in news_sources.items():
        for _rss_news_source in _news_sources:
            try:
                get_logger().info("Fetching: %s" % _rss_news_source)
                _feed = feedparser.parse(_rss_news_source)
                for _news_item in _feed['items']:
                    days_elapsed = 0
                    if _news_item.get('published_parsed'):
                        published_time = datetime.fromtimestamp(
                            mktime(_news_item['published_parsed']))
                        days_elapsed = (datetime.now() - published_time).days
                    if days_elapsed <= max_days_elapsed:
                        parse_news_article(_news_item, _source, news_items,
                                           news_keywords)
            except Exception as e:
                get_logger().warn("Error reading: %s (%s)" %
                                  (_rss_news_source, e))
    get_logger().info("Found %s noun phrases." % len(news_keywords.keys()))
    return news_items, sorted(news_keywords.items(),
                              key=lambda (k, v):
                              (-1 * len(v), k))[0:max_keywords]
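
The final sorted() call uses a tuple-unpacking lambda (lambda (k, v): ...), which is Python 2-only syntax. A Python 3-compatible equivalent would unpack the (keyword, items) pair explicitly; toy data below:

news_keywords = {"alpha": [1, 2], "beta": [1], "gamma": [1, 2, 3]}  # toy data
max_keywords = 2

# Rank keywords by descending number of associated items, then alphabetically.
top_keywords = sorted(news_keywords.items(),
                      key=lambda kv: (-len(kv[1]), kv[0]))[0:max_keywords]
# -> [('gamma', [1, 2, 3]), ('alpha', [1, 2])]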
Example #10
def run_model():
    with open("news_sources.pkl", "r") as news_sources_file:
        _news_sources = pickle.loads(news_sources_file.read())
    with open("news_keywords.pkl", "r") as news_keywords_file:
        _keywords = pickle.loads(news_keywords_file.read())
    safe_delete('doc_cluster.pkl')
    try:
        news = NewsAnalyzer(news_items=_news_sources)
        _cluster_info = news.k_means()
        _topics = news.analyze()
        get_logger().info("Storing results to mongo db...")
        NewsDB().add_topics(_topics)
        get_logger().info("Finished saving results to mongo db...")
        get_logger().info("Saving cluster graph file as html.")
        _cluster_info['html'] = news.view_cluster_html()
        NewsDB().update_cluster_info(_cluster_info)
        get_logger().info("Saved cluster graph to html.")
        get_logger().info("Saving news articles.")
        NewsDB().add_news_articles(_news_sources)
        get_logger().info("Analyzing and saving keywords.")
        _keywords = NewsAnalyzer.analyze_keywords(_keywords)
        NewsDB().add_news_keywords(_keywords)
        get_logger().info("Posting link(s) to social media.")
        post_message(find_one_random_topic(_topics + _keywords))
        post_tweet(find_one_random_topic(_topics + _keywords))
        get_logger().info("Done.")
    except Exception as e:
        get_logger().error("Error running model: %s" % str(e))
Example #11
def check_config(config_settings):
    if not config_settings:
        get_logger().error("Invalid config file.  Exiting...\n")
        sys.exit(0)
Example #12
def show_usage():
    print("Usage: python news.py [--load | --run]\n")
    sys.exit(1)


if __name__ == "__main__":
    if len(sys.argv) > 2:
        show_usage()
    get_logger().info("Starting news model...")

    if len(sys.argv) == 1:
        load_model()
        run_model()
    elif sys.argv[1].lower() == "--load":
        load_model()
    elif sys.argv[1].lower() == "--run":
        run_model()
    else:
        show_usage()