def dnn_organizer(self, collection="Product", key="BRTUSD"):
    """Pre-process raw news documents and persist them, joined with price
    snapshots, into the configured destination collection.

    For every news document matching the configured query, the title,
    summary and article text are pre-processed, and four price points are
    attached: the last price before the article plus the price one minute,
    one hour and one day after it.

    Args:
        collection: Name of the price collection used for lookups.
        key: Instrument key (e.g. "BRTUSD") whose prices are fetched.
    """
    db = Mongo()
    pre_processing = PreProcessing()
    news_collection = db.create_collection(
        self.config["database"]["collection"])
    news_filtered = db.create_collection(
        self.config["database"]["destination"],
        NewsOrganizer.get_index_models())
    for news in news_collection.find(self.config["database"]["query"]):
        date = news.get('date')
        # Price immediately before the article, plus three forward horizons.
        before = self.get_price_before_date(db, collection, key, date)
        minute = self.get_price_at_date(db, collection, key, date)
        hour = self.get_price_at_date(db, collection, key, date, minutes=60)
        day = self.get_price_at_date(db, collection, key, date, add_day=True)
        try:
            news_filtered.insert({
                "_id": news.get('_id'),
                "title": pre_processing.preprocess(news.get('title')),
                "summery": pre_processing.preprocess(news.get('summery')),
                "article": pre_processing.preprocess(news.get('article')),
                "url": news.get('url'),
                "category": news.get('category'),
                "price_after_minute": minute,
                "price_after_hour": hour,
                "price_after_day": day,
                "price_before": before,
                "date": date,
                # .get() (not news['authors']) so a document without an
                # 'authors' field stores None instead of raising KeyError
                # and dropping the whole record via the except below.
                "authors": news.get('authors')
            })
        except Exception as exception:
            # Log and continue: one bad document must not stop the batch.
            Logger().get_logger().error(type(exception).__name__,
                                        exc_info=True)
            traceback.print_exc()
class WikiRecorder(object):
    """Fetches Wikipedia pages for configured corporations and stores the
    raw and pre-processed title/summary plus full content in MongoDB."""

    def __init__(self, collection_name="Wiki"):
        # Destination collection with title/page_id indexes.
        self.col = Mongo().create_collection(collection_name,
                                             WikiRecorder.get_index_models())
        self.preprocessor = PreProcessing()
        self.config = WikiRecorder.get_config()
        self.total = 0

    def collect_all(self):
        """Collect every corporation page listed in the config file."""
        name_list = self.config["Wiki"]["Corporations"]
        for cor_name in name_list:
            self.collect(cor_name)

    def collect(self, title, page_id=None):
        """Fetch one Wikipedia page (by title or page id) and insert it.

        Stores both the original and pre-processed title/summary so that
        downstream consumers can choose either form.
        """
        page = Wikipedia.get_page(title, pageid=page_id)
        title = page.original_title
        title_p = self.preprocessor.preprocess(title)
        summary = page.summary
        summary_p = self.preprocessor.preprocess(summary)
        content = page.content
        page_id = page.pageid
        data = {
            'title': title,
            'title_p': title_p,
            'summary': summary,
            'summary_p': summary_p,
            'content': content,
            'page_id': page_id
        }
        print(data)
        try:
            self.col.insert(data)
        except Exception as exception:
            # Log and continue: a duplicate/bad page must not abort the run.
            Logger().get_logger().error(type(exception).__name__,
                                        exc_info=True)

    @staticmethod
    def get_index_models():
        """Index models for the destination collection."""
        return [
            IndexModel("title", name="index_title"),
            IndexModel("page_id", name="index_page_id")
        ]

    @staticmethod
    def get_config():
        """Load config.json located next to this module.

        Uses a context manager so the file handle is closed deterministically
        (the previous json.load(open(...)) leaked the handle).
        """
        pwd = os.path.dirname(os.path.abspath(__file__))
        with open(pwd + '/config.json', 'r') as config_file:
            return json.load(config_file)
def calculate_distance_for_tweet(info, input):
    """Score how many tweets in a paginated window are similar to a news title.

    Each tweet whose word-embedding cosine similarity to the news title
    exceeds 80% adds one to the count; tweets from verified users add one
    more (they count double).

    Args:
        info: Dict with "skip" (offset), "to" (page size), "date" (upper
            bound for tweet dates) and "news_title" (title to compare to).
        input: Unused here; kept for the caller's worker-function signature.
            NOTE: the name shadows the builtin `input`.

    Returns:
        int: The weighted similarity count for this window.
    """
    skip = info["skip"]
    get = info["to"]
    date = info["date"]
    title = info["news_title"]
    db = Mongo(test=2)
    pre = PreProcessing()
    tweets = WordEmbedding.get_tweets_before_date(
        db, date).skip(skip).limit(get)
    count = 0
    print(get)
    # Compute the title vector once; it is invariant across the tweet loop.
    vector = WordEmbedding.get_vector_list(title)
    for tweet in tweets:
        try:
            cosine = WordEmbedding.cosine_distance_word_embedding_with_vector(
                vector, pre.preprocess(tweet["tweet_text"]))
            percentage = round((1 - cosine) * 100, 2)
        except Exception:
            # Tweets that cannot be embedded are treated as not similar.
            print("Exception")
            percentage = 0
        if percentage > 80:
            count += 1
            # Verified accounts are weighted double.
            if tweet["tweet_user_verified"]:
                count += 1
    print("count" + str(count))
    return count
def dnn_organizer_with_wiki_tweets(self, collection="Product", key="BRTUSD",
                                   name="Brent Crude"):
    """Pre-process news and persist them enriched with price snapshots,
    Wikipedia relatedness and Twitter popularity signals.

    Iterates the configured news query with a skip-based cursor so that a
    lost server-side cursor (CursorNotFound) can be survived by reopening
    the query and skipping the documents already processed.

    Args:
        collection: Name of the price collection used for lookups.
        key: Instrument key (e.g. "BRTUSD") whose prices are fetched.
        name: Wikipedia page title used for the similarity score.
    """
    db = Mongo()
    pre_processing = PreProcessing()
    news_collection = db.create_collection(
        self.config["database"]["collection"])
    news_filtered = db.create_collection(
        self.config["database"]["destination"],
        NewsOrganizer.get_index_models())
    wiki_forecast = WikiForecast()
    twitter_forecast = TwitterForecast()
    if self.config["elasticSearch"]["enableTag"]:
        tags = twitter_forecast.get_pre_defined_tags()
    else:
        # Keep the shape {"tags": [...]} so the lookup below always works.
        tags = {"tags": []}
    count = 0
    processed = 0
    while True:
        try:
            cursor = news_collection.find(
                self.config["database"]["query"],
                no_cursor_timeout=True).skip(processed)
            for news in cursor:
                try:
                    summery = pre_processing.preprocess(
                        news.get('summery'))
                    summery_similarity = wiki_forecast.get_similarity(
                        summery, title=name)
                    date = news.get('date')
                    title = pre_processing.preprocess(news.get('title'))
                    # Last price before the article plus three forward
                    # horizons (1 minute, 1 hour, 1 day).
                    before = self.get_price_before_date(
                        db, collection, key, date)
                    minute = self.get_price_at_date(
                        db, collection, key, date)
                    hour = self.get_price_at_date(db, collection, key, date,
                                                  minutes=60)
                    day = self.get_price_at_date(db, collection, key, date,
                                                 add_day=True)
                    total, percentage = twitter_forecast.get_popularity_from_elastic_search(
                        date, title + tags["tags"], pre_processing,
                        maxsize=self.config["elasticSearch"]["maxSize"])
                    news_filtered.insert({
                        "_id": news.get('_id'),
                        "title": title,
                        # Reuse the already pre-processed summary instead of
                        # running preprocess() on it a second time.
                        "summery": summery,
                        "article": pre_processing.preprocess(
                            news.get('article')),
                        "url": news.get('url'),
                        "category": news.get('category'),
                        "price_after_minute": minute,
                        "price_after_hour": hour,
                        "price_after_day": day,
                        "price_before": before,
                        "wiki_relatedness": summery_similarity,
                        "tweet_count": total,
                        "tweet_percentage": percentage,
                        "date": date,
                        "authors": news['authors']
                    })
                except Exception as exception:
                    # Log and continue: one bad document must not stop the
                    # batch.
                    Logger().get_logger().error(type(exception).__name__,
                                                exc_info=True)
                    traceback.print_exc()
                count += 1
                if count % 500 == 0:
                    print(count)
                # Track progress so a cursor retry can skip past this doc.
                processed += 1
            cursor.close()
            break
        except CursorNotFound:
            # Skip the document the dead cursor was on and reopen the query.
            processed += 1
            print("Lost cursor. Retry with skip")