Example #1
0
    def dnn_organizer(self, collection="Product", key="BRTUSD"):
        """Copy news documents into the destination collection, enriched with
        pre-processed text fields and price snapshots around each article's date.

        For every news record matched by the configured query, the price
        before the article date and the prices one minute, one hour and one
        day after it are looked up and stored alongside the pre-processed
        title/summary/article text.

        :param collection: price collection name to read quotes from
        :param key: instrument key used for the price lookups (e.g. "BRTUSD")
        """
        mongo = Mongo()
        preproc = PreProcessing()
        source = mongo.create_collection(
            self.config["database"]["collection"])
        destination = mongo.create_collection(
            self.config["database"]["destination"],
            NewsOrganizer.get_index_models())

        for news in source.find(self.config["database"]["query"]):
            news_date = news.get('date')
            price_before = self.get_price_before_date(
                mongo, collection, key, news_date)
            price_minute = self.get_price_at_date(
                mongo, collection, key, news_date)
            price_hour = self.get_price_at_date(
                mongo, collection, key, news_date, minutes=60)
            price_day = self.get_price_at_date(
                mongo, collection, key, news_date, add_day=True)
            try:
                # Built inside the try: news['authors'] may raise KeyError,
                # which is handled the same way as insert failures.
                destination.insert({
                    "_id": news.get('_id'),
                    "title": preproc.preprocess(news.get('title')),
                    "summery": preproc.preprocess(news.get('summery')),
                    "article": preproc.preprocess(news.get('article')),
                    "url": news.get('url'),
                    "category": news.get('category'),
                    "price_after_minute": price_minute,
                    "price_after_hour": price_hour,
                    "price_after_day": price_day,
                    "price_before": price_before,
                    "date": news_date,
                    "authors": news['authors']
                })
            except Exception as exception:
                Logger().get_logger().error(type(exception).__name__,
                                            exc_info=True)
                traceback.print_exc()
Example #2
0
class WikiRecorder(object):
    """Fetches Wikipedia pages for the configured corporations and stores the
    raw and pre-processed title/summary plus full content in a Mongo
    collection (default: "Wiki")."""

    def __init__(self, collection_name="Wiki"):
        # Collection is created with the title/page_id indexes up front.
        self.col = Mongo().create_collection(collection_name,
                                             WikiRecorder.get_index_models())
        self.preprocessor = PreProcessing()
        self.config = WikiRecorder.get_config()
        self.total = 0

    def collect_all(self):
        """Collect every corporation page listed under Wiki/Corporations
        in the config file."""
        name_list = self.config["Wiki"]["Corporations"]
        for cor_name in name_list:
            self.collect(cor_name)

    def collect(self, title, page_id=None):
        """Fetch a single Wikipedia page (by title, optionally pinned to a
        page id) and insert it into the collection.

        Insert failures are logged and swallowed so a batch run continues.
        """
        page = Wikipedia.get_page(title, pageid=page_id)

        # Use the canonical title from the page itself, not the lookup title.
        title = page.original_title
        title_p = self.preprocessor.preprocess(title)
        summary = page.summary
        summary_p = self.preprocessor.preprocess(summary)
        content = page.content
        page_id = page.pageid
        data = {
            'title': title,
            'title_p': title_p,
            'summary': summary,
            'summary_p': summary_p,
            'content': content,
            'page_id': page_id
        }
        print(data)
        try:
            self.col.insert(data)
        except Exception as exception:
            Logger().get_logger().error(type(exception).__name__,
                                        exc_info=True)

    @staticmethod
    def get_index_models():
        """Secondary indexes for the Wiki collection (title and page id)."""
        return [
            IndexModel("title", name="index_title"),
            IndexModel("page_id", name="index_page_id")
        ]

    @staticmethod
    def get_config():
        """Load config.json that sits next to this module.

        Uses a context manager so the file handle is closed deterministically
        (the previous json.load(open(...)) leaked the handle), and
        os.path.join instead of string concatenation for the path.
        """
        pwd = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(pwd, 'config.json'), 'r') as config_file:
            return json.load(config_file)
Example #3
0
    def calculate_distance_for_tweet(info, input):
        """Score how many tweets in a window are similar to a news title.

        Fetches `info["to"]` tweets posted before `info["date"]` (skipping
        `info["skip"]`), computes word-embedding cosine similarity between
        each tweet's text and `info["news_title"]`, and counts tweets whose
        similarity exceeds 80% — verified-user tweets count double.

        :param info: dict with keys "skip", "to", "date", "news_title"
        :param input: unused (kept for interface compatibility; also shadows
                      the builtin — do not rely on it)
        :return: weighted count of sufficiently similar tweets
        """
        skip = info["skip"]
        get = info["to"]
        date = info["date"]
        title = info["news_title"]
        db = Mongo(test=2)
        pre = PreProcessing()
        tweets = WordEmbedding.get_tweets_before_date(
            db, date).skip(skip).limit(get)
        count = 0
        print(get)
        # Hoisted: the news-title vector is the same for every tweet.
        vector = WordEmbedding.get_vector_list(title)
        for tweet in tweets:
            try:
                cosine = WordEmbedding.cosine_distance_word_embedding_with_vector(
                    vector, pre.preprocess(tweet["tweet_text"]))
                percentage = round((1 - cosine) * 100, 2)
            except Exception:
                # Fixed typo in the log message ("Exeption"); a failed
                # similarity computation scores the tweet as 0%.
                print("Exception")
                percentage = 0

            if percentage > 80:
                count += 1
                # Verified accounts are weighted twice.
                if tweet["tweet_user_verified"]:
                    count += 1
        print("count" + str(count))
        return count
Example #4
0
 def dnn_organizer_with_wiki_tweets(self,
                                    collection="Product",
                                    key="BRTUSD",
                                    name="Brent Crude"):
     """Like dnn_organizer, but additionally enriches each news document with
     a Wikipedia-relatedness score for the summary and tweet popularity
     figures from Elasticsearch.

     The outer while/except loop restarts the Mongo cursor with a skip when
     the server drops it (CursorNotFound) during a long run.

     :param collection: price collection name to read quotes from
     :param key: instrument key used for the price lookups
     :param name: Wikipedia page title used for the relatedness score
     """
     db = Mongo()
     pre_processing = PreProcessing()
     news_collection = db.create_collection(
         self.config["database"]["collection"])
     news_filtered = db.create_collection(
         self.config["database"]["destination"],
         NewsOrganizer.get_index_models())
     wiki_forecast = WikiForecast()
     twitter_forecast = TwitterForecast()
     if self.config["elasticSearch"]["enableTag"]:
         tags = twitter_forecast.get_pre_defined_tags()
     else:
         tags = {"tags": []}
     count = 0
     processed = 0
     while True:
         try:
             cursor = news_collection.find(
                 self.config["database"]["query"],
                 no_cursor_timeout=True).skip(processed)
             for news in cursor:
                 try:
                     summery = pre_processing.preprocess(
                         news.get('summery'))
                     summery_similarity = wiki_forecast.get_similarity(
                         summery, title=name)
                     date = news.get('date')
                     title = pre_processing.preprocess(news.get('title'))
                     before = self.get_price_before_date(
                         db, collection, key, date)
                     minute = self.get_price_at_date(
                         db, collection, key, date)
                     hour = self.get_price_at_date(db,
                                                   collection,
                                                   key,
                                                   date,
                                                   minutes=60)
                     day = self.get_price_at_date(db,
                                                  collection,
                                                  key,
                                                  date,
                                                  add_day=True)
                     total, percentage = twitter_forecast.get_popularity_from_elastic_search(
                         date,
                         title + tags["tags"],
                         pre_processing,
                         maxsize=self.config["elasticSearch"]["maxSize"])
                     news_filtered.insert({
                         "_id":
                         news.get('_id'),
                         "title":
                         title,
                         # Reuse the summary preprocessed above instead of
                         # running preprocess() on it a second time.
                         "summery":
                         summery,
                         "article":
                         pre_processing.preprocess(news.get('article')),
                         "url":
                         news.get('url'),
                         "category":
                         news.get('category'),
                         "price_after_minute":
                         minute,
                         "price_after_hour":
                         hour,
                         "price_after_day":
                         day,
                         "price_before":
                         before,
                         "wiki_relatedness":
                         summery_similarity,
                         "tweet_count":
                         total,
                         "tweet_percentage":
                         percentage,
                         "date":
                         date,
                         "authors":
                         news['authors']
                     })
                 except Exception as exception:
                     Logger().get_logger().error(type(exception).__name__,
                                                 exc_info=True)
                     traceback.print_exc()
                 # Progress counters advance even when a record fails, so a
                 # cursor restart does not re-process handled documents.
                 count = count + 1
                 if count % 500 == 0:
                     print(count)
                 processed += 1
             cursor.close()
             break
         except CursorNotFound:
             # Skip past the already-processed records and retry.
             processed += 1
             print("Lost cursor. Retry with skip")