Example #1
 def save_to_db(self):
     mongo = Mongo()
     try:
         mongo.insert(self.row)
     except Exception:
         print(self.row)
         Logger().get_logger().error('Insert Error', exc_info=True)
Example #2
    def collect(self):
        socket.setdefaulttimeout(120)  # 120 seconds
        db = Mongo()

        start = datetime(self.START_YEAR, 1, 1, 0, 0, 0, 0)
        end = datetime(self.START_YEAR, 2, 1, 0, 0, 0, 0)  # one month after start
        collection = db.create_collection("FilteredNews")
        print("\t", end='\t')
        for category in self.categories:
            print(category, end='\t')
        print()
        while end.year < self.END_YEAR:
            count = collection.find({'RSS_Date': {'$gte': start, '$lt': end}}).count(False)
            # Get category counts (note: this matches on 'date' while the total above uses 'RSS_Date')
            result = collection.aggregate([{ '$match': { 'date': {'$gte': start, '$lt': end},} },
                                { "$group": { "_id": { "$toLower": "$category" }, "count": { "$sum": 1 } } },
                                { "$group": { "_id": None, "counts": { "$push": { "k": "$_id", "v": "$count" } } } },
                                { "$replaceRoot": { "newRoot": { "$arrayToObject": "$counts" } } } ])
            print(str(start.year) + "." + str(start.month) + " \t " + str(count), end='\t')
            list_result = list(result)
            for item in list_result:
                for category in self.categories:
                    if category in item:
                        print(item[category], end='\t')
                    else:
                        print('0', end='\t')
            print()
            start = Statistics.add_one_month(start)
            end = Statistics.add_one_month(end)
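The aggregation pipeline above pivots per-category counts into a single document keyed by lowercased category name. A minimal pure-Python sketch of the same pivot, over hypothetical documents:

from collections import Counter

# Hypothetical documents standing in for one month of "FilteredNews"
sample_docs = [{"category": "Business"}, {"category": "business"},
               {"category": "Economy"}]

# $toLower + $group + $arrayToObject collapse to: lowercased category -> count
counts = dict(Counter(doc["category"].lower() for doc in sample_docs))
print(counts)  # {'business': 2, 'economy': 1}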
 def collect(self):
     sites = self.read_website_collection()
     socket.setdefaulttimeout(120)  # 120 seconds
     db = Mongo()
     count = 0
     for info in sites:
         (site, category) = info.split(" ")
         siteHistory = archivecdx.Listing(
             site,
             fl=["original", "timestamp", "digest", "statuscode"],
             filter=["statuscode:200"])
         print("Size of List :" + str(len(siteHistory.listing)))
         for history in siteHistory:
             timestamp = datetime.strptime(history.timestamp,
                                           "%Y%m%d%H%M%S")
             link = 'http://web.archive.org/web/%sid_/%s' % (
                 history.timestamp, history.original)
             print('(%d) - Archive Link : %s - %s' %
                   (count, link, str(datetime.today())))
             #if site == "http://feeds.bbci.co.uk/news/business/rss.xml":
             #    if history.timestamp in self.Pass_List: #Control
             #        continue
             try:
                 d = feedparser.parse(link)
             except Exception as exception:
                 print("FeedParser Timeout ?")
                 Logger().get_logger().error(type(exception).__name__,
                                             exc_info=True)
                 continue  # d would be undefined below; skip this snapshot
             newslist = []
             for post in d.entries:
                 try:
                     count = count + 1
                     if db.already_exists(post.link):
                         continue
                     if getattr(post, 'published_parsed', None):
                         try:
                             dt = datetime.fromtimestamp(
                                 mktime(post.published_parsed))
                         except AttributeError:
                             dt = ''
                     else:
                         dt = ''
                     article = Article(post.link)
                     newslist.append(
                         News.RssNews(
                             title=post.title,
                             time=dt,
                             summery=post.summary,
                             category=category,
                             tags='',
                             url=post.link,
                             iaurl=('http://web.archive.org/web/%sid_/%s' %
                                    (history.timestamp, post.link)),
                             article=article))
                 except Exception as exception:
                     Logger().get_logger().error(type(exception).__name__,
                                                 exc_info=True)
             pool = NewsPool()
             pool.set(newslist)
             pool.join()
 def parse_currency(currency_key, directory, name):  # Type : 1 - Currency
     print("Currency")
     col = Mongo().create_collection("Currency", FDC.get_index_models())
     with open(directory) as csv_file:
         csv_reader = csv.reader(csv_file, delimiter=',')
         print(currency_key)
         hour = -1
         fd = None
         for row in csv_reader:
             if len(row) < 2:  # Check Data
                 continue
             add_value = 0
             if currency_key == "EURUSD":
                 date = DateHelper.str2date(row[0])
                 add_value = -1
             else:
                 date = DateHelper.str2date(row[0]+row[1])
             if hour != date.hour:
                 hour = date.hour
                 if fd is not None:
                     try:
                         col.insert(fd.get_currency())
                     except Exception:
                         Logger().get_logger().error('Insert Error', exc_info=True)
                 fd = FinancialData(name, currency_key, date,
                                    row[FDLocations.Currency_Open.value + add_value],
                                    row[FDLocations.Currency_High.value + add_value],
                                    row[FDLocations.Currency_Low.value + add_value],
                                    row[FDLocations.Currency_Close.value + add_value])
             else:
                 fd.add(row[FDLocations.Currency_High.value + add_value],
                        row[FDLocations.Currency_Low.value + add_value],
                        row[FDLocations.Currency_Close.value + add_value])
         # Flush the final hour's record, which the loop above never inserts
         if fd is not None:
             try:
                 col.insert(fd.get_currency())
             except Exception:
                 Logger().get_logger().error('Insert Error', exc_info=True)
 def parse_index_datetime(currency_key, directory, name, interval):  # Type : 4 - Index
     col = Mongo().create_collection("Index")
     with open(directory) as csv_file:
         csv_reader = csv.reader(csv_file, delimiter=',')
         line_count = 0
         print(currency_key)
         hour = -1
         hour_count = 0
         fd = None
         for row in csv_reader:
             if len(row) < 2:  # Check Data
                 continue
             date = DateHelper.str2date(row[0] + row[1])
             if hour != date.hour:
                 hour = date.hour
                 hour_count = 0
                 if fd is not None:
                     print(fd)
                     try:
                         col.insert(fd.get_index())
                     except Exception:
                         Logger().get_logger().error('Insert Error', exc_info=True)
                 fd = FinancialData(name, currency_key, date,
                                    row[FDLocations.IndexDateTime_Open.value],
                                    row[FDLocations.IndexDateTime_High.value],
                                    row[FDLocations.IndexDateTime_Low.value],
                                    row[FDLocations.IndexDateTime_Close.value])
             else:
                 fd.add(row[FDLocations.IndexDateTime_High.value],
                        row[FDLocations.IndexDateTime_Low.value],
                        row[FDLocations.IndexDateTime_Close.value])
                 hour_count += 1
             line_count += 1
         # Flush the final hour's record, which the loop above never inserts
         if fd is not None:
             try:
                 col.insert(fd.get_index())
             except Exception:
                 Logger().get_logger().error('Insert Error', exc_info=True)
         print(f'Processed {line_count} lines.')
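Both parsers above roll sub-hourly CSV rows into one record per hour: the first row of a new hour opens the bar (and the previous hour is inserted), while later rows in the same hour only update high/low/close via fd.add. A minimal sketch of that bucketing, assuming a plain (datetime, price) stream:

from datetime import datetime

ticks = [(datetime(2020, 1, 1, 9, 0), 10.0),   # hypothetical data
         (datetime(2020, 1, 1, 9, 30), 12.0),
         (datetime(2020, 1, 1, 10, 0), 11.0)]
bars = {}
for ts, price in ticks:
    hour = ts.replace(minute=0, second=0, microsecond=0)
    if hour not in bars:  # first row of a new hour opens the bar
        bars[hour] = {"open": price, "high": price, "low": price, "close": price}
    else:                 # later rows only update high/low/close
        bar = bars[hour]
        bar["high"] = max(bar["high"], price)
        bar["low"] = min(bar["low"], price)
        bar["close"] = price
print(bars)  # one 09:00 bar (open 10.0, close 12.0) and one 10:00 bar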
Example #6
 def evaluate(self):
     LoggerHelper.info("Evaluation Started...")
     nlp = pipeline('sentiment-analysis')
     self.load_model(self.config["evaluation"]["load"])
     self.model.eval()
     self.timer.start()
     db = Mongo()
     news_collection = db.create_collection(self.config["evaluation"]["collection"])
     news_filtered = db.create_collection(self.config["evaluation"]["destination"], NewsOrganizer.get_index_models())
     count = 0
     processed = 0
     while True:
         try:
             cursor = news_collection.find(self.config["evaluation"]["query"], no_cursor_timeout=True).skip(
                 processed)
             for news in cursor:
                 try:
                     summery = news.get('summery')
                     b_input_ids, b_input_mask = self.reader.get_one_news(summery)
                     b_input_ids, b_input_mask = b_input_ids.to(self.device), b_input_mask.to(self.device)
                     outputs = self.model(b_input_ids, token_type_ids=None,
                                          attention_mask=b_input_mask)
                     logits = outputs[0].detach().cpu().numpy()  # Move result to CPU
                     result = np.argmax(logits, axis=1).flatten()  # predicted class per input
                     sentiment = nlp(summery)
                     if result[0] == 1:
                         news_filtered.insert({
                             "_id": news.get('_id'),
                             "title": news.get('title'),
                             "summery": news.get('summery'),
                             "article": news.get('article'),
                             "url": news.get('url'),
                             "category": news.get('category'),
                             "price_after_minute": news.get('price_after_minute'),
                             "price_after_hour": news.get('price_after_hour'),
                             "price_after_day": news.get('price_after_day'),
                             "sentiment": sentiment,
                             "price_before": news.get('price_before'),
                             "wiki_relatedness": news.get('wiki_relatedness'),
                             "tweet_count": news.get('tweet_count'),
                             "tweet_percentage": news.get('tweet_percentage'),
                             "date": news.get('date'),
                             "authors": news.get('authors'),
                             "comment": news.get('comment'),
                             "price_effect": news.get('price_effect')
                         })
                 except Exception as exception:
                     Logger().get_logger().error(type(exception).__name__, exc_info=True)
                     traceback.print_exc()
                 count = count + 1
                 if count % 500 == 0:
                     print(count)
                 processed += 1
             cursor.close()
             break
         except CursorNotFound:
             processed += 1
             print("Lost cursor. Retry with skip")
     self.timer.stop(time_for="Evaluation")
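For reference, the sentiment field stored above holds whatever transformers' sentiment-analysis pipeline returns: a list with one dict per input, each carrying a label and a score. A minimal sketch (downloads a default model on first use):

from transformers import pipeline

nlp = pipeline('sentiment-analysis')
print(nlp("Oil prices surged today."))
# e.g. [{'label': 'POSITIVE', 'score': 0.998...}]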
Example #7
 def __init__(self, config, batch_size, sequence_length):
     self.db = Mongo()
     self.configs = config
     self.batch_size = batch_size
     self.sequence_length = sequence_length
     self.clear_data()
     self.__test_cursor = None
     self.__train_cursor = None
Example #8
    def dnn_organizer(self, collection="Product", key="BRTUSD"):
        db = Mongo()
        pre_processing = PreProcessing()
        news_collection = db.create_collection(
            self.config["database"]["collection"])
        news_filtered = db.create_collection(
            self.config["database"]["destination"],
            NewsOrganizer.get_index_models())

        for news in news_collection.find(self.config["database"]["query"]):
            date = news.get('date')
            before = self.get_price_before_date(db, collection, key, date)
            minute = self.get_price_at_date(db, collection, key, date)
            hour = self.get_price_at_date(db,
                                          collection,
                                          key,
                                          date,
                                          minutes=60)
            day = self.get_price_at_date(db,
                                         collection,
                                         key,
                                         date,
                                         add_day=True)
            try:
                news_filtered.insert({
                    "_id": news.get('_id'),
                    "title": pre_processing.preprocess(news.get('title')),
                    "summery": pre_processing.preprocess(news.get('summery')),
                    "article": pre_processing.preprocess(news.get('article')),
                    "url": news.get('url'),
                    "category": news.get('category'),
                    "price_after_minute": minute,
                    "price_after_hour": hour,
                    "price_after_day": day,
                    "price_before": before,
                    "date": date,
                    "authors": news['authors']
                })
            except Exception as exception:
                Logger().get_logger().error(type(exception).__name__,
                                            exc_info=True)
                traceback.print_exc()
Example #9
    def organize(self):
        db = Mongo()
        news_collection = db.create_collection("News")
        news_filtered = db.create_collection("FilteredNews",
                                             NewsOrganizer.get_index_models())

        for news in news_collection.find():
            article = NewsOrganizer.get_article(news)
            if article is None:
                FileHelper.append_to_file(self.config["log"]["Article_None"],
                                          news["_id"])
                continue
            if article == "":
                FileHelper.append_to_file(self.config["log"]["Article_Empty"],
                                          news["_id"])
                continue
            date = NewsOrganizer.get_date(news)
            if not date:
                FileHelper.append_to_file(self.config["Log"]["Date_None"],
                                          news["_id"])
                continue
            summery = NewsOrganizer.get_summery(news)
            if not summery:
                FileHelper.append_to_file(self.config["Log"]["Summery_None"],
                                          news["_id"])
                continue
            try:
                news_filtered.insert({
                    "title": NewsOrganizer.get_title(news),
                    "summery": summery,
                    "category": NewsOrganizer.get_category(news),
                    "date": date,
                    "article": article,
                    "url": news['URL'],
                    "canonical_link": news['Canonical_Link'],
                    "authors": news['Authors']
                })
            except Exception as exception:
                Logger().get_logger().error(type(exception).__name__,
                                            exc_info=True)
                traceback.print_exc()
Example #10
class WikiRecorder(object):
    def __init__(self, collection_name="Wiki"):
        self.col = Mongo().create_collection(collection_name,
                                             WikiRecorder.get_index_models())
        self.preprocessor = PreProcessing()
        self.config = WikiRecorder.get_config()
        self.total = 0

    def collect_all(self):
        name_list = self.config["Wiki"]["Corporations"]
        for cor_name in name_list:
            self.collect(cor_name)

    def collect(self, title, page_id=None):
        page = Wikipedia.get_page(title, pageid=page_id)

        title = page.original_title
        title_p = self.preprocessor.preprocess(title)
        summary = page.summary
        summary_p = self.preprocessor.preprocess(summary)
        content = page.content
        page_id = page.pageid
        data = {
            'title': title,
            'title_p': title_p,
            'summary': summary,
            'summary_p': summary_p,
            'content': content,
            'page_id': page_id
        }
        print(data)
        try:
            self.col.insert(data)
        except Exception as exception:
            Logger().get_logger().error(type(exception).__name__,
                                        exc_info=True)

    @staticmethod
    def get_index_models():
        return [
            IndexModel("title", name="index_title"),
            IndexModel("page_id", name="index_page_id")
        ]

    @staticmethod
    def get_config():
        pwd = os.path.dirname(os.path.abspath(__file__))
        with open(pwd + '/config.json', 'r') as f:
            return json.load(f)
Example #11
    def calculate_distance_for_tweet(info, input):
        skip = info["skip"]
        get = info["to"]
        date = info["date"]
        title = info["news_title"]
        db = Mongo(test=2)
        pre = PreProcessing()
        tweets = WordEmbedding.get_tweets_before_date(
            db, date).skip(skip).limit(get)
        tweetcount = 0
        count = 0
        print(get)
        vector = WordEmbedding.get_vector_list(title)
        for tweet in tweets:
            tweetcount += 1
            try:
                cosine = WordEmbedding.cosine_distance_word_embedding_with_vector(
                    vector, pre.preprocess(tweet["tweet_text"]))
                percentage = round((1 - cosine) * 100, 2)
            except Exception:
                print("Exception while computing cosine distance")
                percentage = 0

            if percentage > 80:
                count += 1
                if tweet["tweet_user_verified"]:
                    count += 1
        print("count" + str(count))
        return count
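The threshold above works on similarity expressed as a percentage: cosine distance is flipped with (1 - cosine) * 100, and only tweets above 80 are counted, twice for verified users. A worked example:

cosine = 0.15                              # hypothetical cosine distance
percentage = round((1 - cosine) * 100, 2)
print(percentage)                          # 85.0 -> counted, since 85.0 > 80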
 def collect(self):
     db = Mongo()
     conn = sqlite3.connect(self.SQL_LOCATION)
     c = conn.cursor()
     c.execute(
         'SELECT title, author, date, publication, category, digital, section, url FROM longform'
     )
     line_count = 0
     date_count = 0
     newslist = []
     for row in c:
         url = row[self.Url]
         date = DateHelper.str2date(row[self.Date])
         title = row[self.Title]
         if url == "" or url is None or date == "":  # Is There Url Or Date
             continue
         if db.is_title_url_exists(title, url):
             continue
         allUrls = FileCollector.extract_url_from_text(url)
         article = Article(allUrls[1])
         category = row[self.Category]
         section = row[self.Section]
         newslist.append(
             News.RssNews(title=title,
                          time=date,
                          summery='',
                          category=FileCollector.get_category(
                              category, section),
                          tags='',
                          url=allUrls[1],
                          iaurl=allUrls[0],
                          article=article))
         print(line_count)
         if len(newslist) == 20:
             pool = NewsPool()
             pool.set(newslist)
             pool.join()
             newslist = []
         line_count += 1
     print(f'\t{line_count}')
     print(f'\t{len(newslist)}')
     if newslist:  # download any articles left over from the last partial batch
         pool = NewsPool()
         pool.set(newslist)
         pool.join()
 def parse_stock(currency_key, directory, name, interval):  # Type : 3 - Stock
     print("Stock")
     col = Mongo().create_collection("Stock", FDC.get_index_models())
     with open(directory) as csv_file:
         csv_reader = csv.reader(csv_file, delimiter=',')
         print(currency_key)
         for row in csv_reader:
             if len(row) < 2:  # Check Data
                 continue
             date = DateHelper.str2date(row[0])
             if interval == 60:
                 fd = FinancialData(name, currency_key, date,
                                    row[FDLocations.Stock_Open.value],
                                    row[FDLocations.Stock_High.value],
                                    row[FDLocations.Stock_Low.value],
                                    row[FDLocations.Stock_Close.value],
                                    row[FDLocations.Stock_Volume.value],
                                    row[FDLocations.Stock_Trade.value],
                                    row[FDLocations.Stock_Avg.value])
                 col.insert(fd.get_stock())
             else:
                 print("Not Handled !!!")
Example #15
class TaDataReader(object):

    # LSTM applied on sequential data - it unrolls in the sequence dimension
    # Batch Size : number of (x, y) sequences yielded per batch
    # Sequence Length : steps to memorize (hidden and cell state)
    def __init__(self, config, batch_size, sequence_length):
        self.db = Mongo()
        self.configs = config
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.clear_data()
        self.__test_cursor = None
        self.__train_cursor = None

    def fetch_train_data(self):
        self.__train_cursor = self.db.get_data(
            self.configs['db'], self.configs['train_query'],
            self.configs['train_query_fields'])
        self.__train_cursor.batch_size(
            self.batch_size * self.sequence_length)  # DB To Local Length

    def fetch_test_data(self):
        self.__test_cursor = self.db.get_data(
            self.configs['db'], self.configs['test_query'],
            self.configs['test_query_fields'])
        self.__test_cursor.batch_size(
            self.batch_size * self.sequence_length)  # DB To Local Length

    def get_train_count(self):
        if self.__train_cursor is None:
            self.fetch_train_data()
        return self.__train_cursor.count()

    def get_train_data(self):
        self.__train_cursor.rewind()
        self.clear_data()
        batch_count = 0
        sequence_count = 0
        for row in self.__train_cursor:
            self.__x_sequence.append(
                np.asarray([row["Open"]], dtype=np.float32))
            self.__y_sequence.append(
                np.asarray([row["Open"]], dtype=np.float32))  # row["High"]
            sequence_count += 1
            if sequence_count % (self.sequence_length + 1) == 0:
                self.__x_sequence.pop()
                self.__y_sequence.pop(0)
                self.x.append(np.asarray(self.__x_sequence, dtype=np.float32))
                self.y.append(np.asarray(self.__y_sequence, dtype=np.float32))
                self.clear_sequence()
                batch_count += 1
                if batch_count % self.batch_size == 0:
                    yield np.asarray(self.x, dtype=np.float32), np.asarray(
                        self.y, dtype=np.float32)
                    self.clear_data()

    def get_test_count(self):
        if self.__test_cursor is None:
            self.fetch_test_data()
        return self.__test_cursor.count()

    def get_test_data(self):
        self.__test_cursor.rewind()
        self.clear_data()
        batch_count = 0
        sequence_count = 0
        for row in self.__test_cursor:
            self.__x_sequence.append(
                np.asarray([row["Open"]], dtype=np.float32))
            self.__y_sequence.append(
                np.asarray([row["Open"]], dtype=np.float32))  # row["High"]
            sequence_count += 1
            if sequence_count % (self.sequence_length + 1) == 0:
                self.__x_sequence.pop()
                self.__y_sequence.pop(0)
                self.x.append(np.asarray(self.__x_sequence, dtype=np.float32))
                self.y.append(np.asarray(self.__y_sequence, dtype=np.float32))
                self.clear_sequence()
                batch_count += 1
                if batch_count % self.batch_size == 0:
                    yield np.asarray(self.x, dtype=np.float32), np.asarray(
                        self.y, dtype=np.float32)
                    self.clear_data()

    def clear_data(self):
        self.x = []
        self.y = []
        self.clear_sequence()

    def clear_sequence(self):
        self.__x_sequence = []
        self.__y_sequence = []
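The two generators above build next-step prediction pairs: every sequence_length + 1 consecutive rows become one (x, y) pair in which x drops the last value (pop()) and y drops the first (pop(0)), so y is x shifted forward by one step. A minimal sketch with hypothetical values:

import numpy as np

sequence_length = 3
rows = [1.0, 2.0, 3.0, 4.0]  # sequence_length + 1 consecutive "Open" values
x = np.asarray(rows[:-1], dtype=np.float32)  # [1. 2. 3.] - last value dropped
y = np.asarray(rows[1:], dtype=np.float32)   # [2. 3. 4.] - first value dropped
print(x, y)  # y is x shifted one step forward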
Example #16
 def get_wiki(collection="Wiki", title="Brent Crude"):
     db = Mongo()
     query = {"title": title}
     fields = {"summary_p": 1, "_id": 0}
     return db.get_data_one(collection, query, fields)
Example #17
 def __init__(self):
     self.config = self.__get_config()
     self.db = Mongo()
class TweetRecorder(object):

    def __init__(self, directory="/Users/kaaneksen/Desktop/Master Project/Twitter/02", collection_name="Tweet"):
        self.directory = directory
        self.col = Mongo().create_collection(collection_name, TweetRecorder.get_index_models())
        self.total = 0

    def load_all_tweets_in_directory(self, directory=None):
        """Walk all files in directory and loads all tweets into a MongoDb"""
        files_processed = 0
        if directory is None:
            directory = self.directory
        for root, dirs, files in os.walk(directory):
            for file in files:
                filename = os.path.join(root, file)
                if not filename.endswith('.bz2'):
                    continue
                files_processed += 1  # count only the .bz2 files actually handled
                print('Starting work on file (' + str(files_processed) + '): ' + filename)
                self.handle_file(filename)
                if files_processed % 20 == 0:
                    print("Total Tweets Processed : {}".format(self.total))

    def handle_file(self, filename):
        """Takes a filename, loads all tweets into a MongoDb"""
        tweets = TweetRecorder.load_bz2_json(filename)
        tweet_dicts = []
        tweets_saved = 0
        for tweet in tweets:
            tweet_dict, tweets_saved = TweetRecorder.load_tweet(tweet, tweets_saved)  # Extracts proper items and places them in database
            if tweet_dict:
                tweet_dicts.append(tweet_dict)
        self.total = self.total + len(tweet_dicts)
        try:
            self.col.insert_many(tweet_dicts, ordered=False, bypass_document_validation=True)
        except Exception:
            Logger().get_logger().error('Insert Error - Twitter', exc_info=True)
        return True

    @staticmethod
    def load_bz2_json(filename):
        """ Takes a bz2 filename, returns the tweets as a list of tweet dictionaries"""
        with open(filename, "rb") as f:
            data = f.read()
        lines = bz2.decompress(data).decode("utf-8").split("\n")
        tweets = []
        for line in lines:
            try:
                if line == "":
                    continue
                tweets.append(json.loads(line))
            except Exception:  # lenient by design: with millions of tweets, most errors are encoding issues
                continue
        return tweets

    @staticmethod
    def load_tweet(tweet, tweets_saved):
        """Takes a tweet (dictionary) and convert to appropriate dictionary"""
        try:
            tweet_lang = tweet['lang']
            data = {
                '_id': tweet['id'],
                'tweet_text': tweet['text'],
                'tweet_location': tweet['coordinates'],
                'tweet_created_at': datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'),
                'tweet_entities': tweet['entities'],
                'tweet_replay_to_tweet': tweet['in_reply_to_status_id'],
                'tweet_replay_to_user': tweet['in_reply_to_user_id'],
                'tweet_user_id': tweet['user']['id'],
                'tweet_user_lang': tweet['user']['lang'],
                'tweet_user_name': tweet['user']['name'],
                'tweet_user_time_zone': tweet['user']['time_zone'],
                'tweet_user_followers_count': tweet['user']['followers_count'],
                'tweet_user_verified': tweet['user']['verified'],
                'tweet_user_all_tweet_count': tweet['user']['statuses_count']
            }
            if tweet_lang != "en":
                return {}, tweets_saved
            else:
                tweets_saved += 1
                return data, tweets_saved
        except KeyError:
            return {}, tweets_saved

    @staticmethod
    def get_index_models():
        return [IndexModel("tweet_created_at", name="index_date"),
                IndexModel("tweet_replay_to_tweet", name="index_replay_to"),
                IndexModel("tweet_user_id", name="index_user_id")]
Example #20
 def dnn_organizer_with_wiki_tweets(self,
                                    collection="Product",
                                    key="BRTUSD",
                                    name="Brent Crude"):
     db = Mongo()
     pre_processing = PreProcessing()
     news_collection = db.create_collection(
         self.config["database"]["collection"])
     news_filtered = db.create_collection(
         self.config["database"]["destination"],
         NewsOrganizer.get_index_models())
     wiki_forecast = WikiForecast()
     twitter_forecast = TwitterForecast()
     if self.config["elasticSearch"]["enableTag"]:
         tags = twitter_forecast.get_pre_defined_tags()
     else:
         tags = {"tags": []}
     count = 0
     processed = 0
     while True:
         try:
             cursor = news_collection.find(
                 self.config["database"]["query"],
                 no_cursor_timeout=True).skip(processed)
             for news in cursor:
                 try:
                     summery = pre_processing.preprocess(
                         news.get('summery'))
                     summery_similarity = wiki_forecast.get_similarity(
                         summery, title=name)
                     date = news.get('date')
                     title = pre_processing.preprocess(news.get('title'))
                     before = self.get_price_before_date(
                         db, collection, key, date)
                     minute = self.get_price_at_date(
                         db, collection, key, date)
                     hour = self.get_price_at_date(db,
                                                   collection,
                                                   key,
                                                   date,
                                                   minutes=60)
                     day = self.get_price_at_date(db,
                                                  collection,
                                                  key,
                                                  date,
                                                  add_day=True)
                     total, percentage = twitter_forecast.get_popularity_from_elastic_search(
                         date,
                         title + tags["tags"],
                         pre_processing,
                         maxsize=self.config["elasticSearch"]["maxSize"])
                     news_filtered.insert({
                         "_id": news.get('_id'),
                         "title": title,
                         "summery": summery,
                         "article": pre_processing.preprocess(news.get('article')),
                         "url": news.get('url'),
                         "category": news.get('category'),
                         "price_after_minute": minute,
                         "price_after_hour": hour,
                         "price_after_day": day,
                         "price_before": before,
                         "wiki_relatedness": summery_similarity,
                         "tweet_count": total,
                         "tweet_percentage": percentage,
                         "date": date,
                         "authors": news['authors']
                     })
                 except Exception as exception:
                     Logger().get_logger().error(type(exception).__name__,
                                                 exc_info=True)
                     traceback.print_exc()
                 count = count + 1
                 if count % 500 == 0:
                     print(count)
                 processed += 1
             cursor.close()
             break
         except CursorNotFound:
             processed += 1
             print("Lost cursor. Retry with skip")
Example #21
 def __init__(self, collection_name="Wiki"):
     self.col = Mongo().create_collection(collection_name,
                                          WikiRecorder.get_index_models())
     self.preprocessor = PreProcessing()
     self.config = WikiRecorder.get_config()
     self.total = 0
Example #22
 def dnn_organizer_for_dnn_filtered_news(self):
     db = Mongo()
     collection = self.config["dnnfiltered"]["text_collection"]
     news_collection = db.create_collection(
         self.config["dnnfiltered"]["collection"])
     news_filtered = db.create_collection(
         self.config["dnnfiltered"]["destination"],
         NewsOrganizer.get_index_models())
     count = 0
     processed = 0
     while True:
         try:
             cursor = news_collection.find(
                 self.config["dnnfiltered"]["query"],
                 no_cursor_timeout=True).skip(processed)
             for news in cursor:
                 try:
                     url = news.get('url')
                     date = news.get('date')
                     before = self.get_price_before_date(
                         db, "Product", "BRTUSD", date)
                     minute = self.get_price_at_date(
                         db, "Product", "BRTUSD", date)
                     hour = self.get_price_at_date(db,
                                                   "Product",
                                                   "BRTUSD",
                                                   date,
                                                   minutes=60)
                     day = self.get_price_at_date(db,
                                                  "Product",
                                                  "BRTUSD",
                                                  date,
                                                  add_day=True)
                     info = self.get_news_for_link(db,
                                                   collection,
                                                   url,
                                                   fields=None)
                     if info is None:
                         info = {}
                     news_filtered.insert({
                         "_id": news.get('_id'),
                         "title": news.get('title'),
                         "title_o": info.get('title'),
                         "summery": news.get('title'),
                         "summery_o": info.get('summery'),
                         "article": news.get('article'),
                         "article_o": info.get('article'),
                         "url": url,
                         "category": info.get('category'),
                         "price_after_minute": minute,
                         "price_after_hour": hour,
                         "price_after_day": day,
                         "price_before": before,
                         "wiki_relatedness": info.get('wiki_relatedness'),
                         "tweet_count": info.get('tweet_count'),
                         "tweet_percentage": info.get('tweet_percentage'),
                         "date": date,
                         "authors": info.get('authors'),
                         "comment": info.get('comment'),
                         "wiki_relatedness_nor": info.get('wiki_relatedness_nor'),
                         "tweet_count_nor": info.get('tweet_count_nor'),
                         "price_effect": info.get('price_effect')
                     })
                 except Exception as exception:
                     Logger().get_logger().error(type(exception).__name__,
                                                 exc_info=True)
                     traceback.print_exc()
                 count = count + 1
                 if count % 500 == 0:
                     print(count)
                 processed += 1
             cursor.close()
             break
         except CursorNotFound:
             processed += 1
             print("Lost cursor. Retry with skip")
class NewsDnnBaseDataReader(object):
    DictDataTerm = {'Train': 1, 'Validate': 2, 'Test': 3}

    DictDataType = {'News': 1, 'Wiki': 2, 'WikiAndTweet': 3}

    ArticleMinSize = 10

    # LSTM applied on sequential data - it unrolls in the sequence dimension
    # Batch Size : number of articles yielded per batch
    # Sequence Length : steps to memorize (hidden and cell state) -> article size
    def __init__(self,
                 config,
                 batch_size,
                 sequence_length,
                 word_emb_enabled=True):
        self.db = Mongo()
        self.configs = config
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.clear_data()
        if word_emb_enabled:
            self.word_embedding = WordEmbedding(
                path=self.configs["wordEmbedding"]["path"])
        self.__test_cursor = None
        self.test_count = 0
        self.__train_cursor = None
        self.train_count = 0
        self.__validate_cursor = None
        self.validate_count = 0
        self.max_min = None

    '''
        Data Fetch
    '''

    def fetch_data(self, fetch_type=1):
        if fetch_type == NewsDnnBaseDataReader.DictDataTerm["Train"]:
            self.__train_cursor = self.db.get_data(
                self.configs['database']['name'],
                self.configs['database']['train']['query'],
                self.configs['database']['fields'],
                notimeout=True)
            if self.configs['database']['sort'] is not None:
                self.__train_cursor = self.__train_cursor.sort(
                    ListHelper.convert_dict_list(
                        self.configs['database']['sort']))
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Validate"]:
            self.__validate_cursor = self.db.get_data(
                self.configs['database']['name'],
                self.configs['database']['validate']['query'],
                self.configs['database']['fields'],
                notimeout=True)
            if self.configs['database']['sort'] is not None:
                self.__validate_cursor = self.__validate_cursor.sort(
                    ListHelper.convert_dict_list(
                        self.configs['database']['sort']))
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Test"]:
            self.__test_cursor = self.db.get_data(
                self.configs['database']['name'],
                self.configs['database']['test']['query'],
                self.configs['database']['fields'],
                notimeout=True)
            if self.configs['database']['sort'] is not None:
                self.__test_cursor = self.__test_cursor.sort(
                    ListHelper.convert_dict_list(
                        self.configs['database']['sort']))
        else:
            LoggerHelper.critical('Unable To Fetch')

    '''
        Get Count
    '''

    def get_count(self, fetch_type=1):
        if fetch_type == NewsDnnBaseDataReader.DictDataTerm["Train"]:
            if self.__train_cursor is None:
                self.fetch_data(NewsDnnBaseDataReader.DictDataTerm["Train"])
            self.train_count = self.__train_cursor.count()
            return self.train_count
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Validate"]:
            if self.__validate_cursor is None:
                self.fetch_data(NewsDnnBaseDataReader.DictDataTerm["Validate"])
            self.validate_count = self.__validate_cursor.count()
            return self.validate_count
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Test"]:
            if self.__test_cursor is None:
                self.fetch_data(NewsDnnBaseDataReader.DictDataTerm["Test"])
            self.test_count = self.__test_cursor.count()
            return self.test_count
        else:
            LoggerHelper.critical('Unable To Fetch')

    '''
        Get Data
    '''

    def get_data(self, fetch_type=1, data_type=1):
        if fetch_type == NewsDnnBaseDataReader.DictDataTerm["Train"]:
            cursor = self.__train_cursor
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Validate"]:
            cursor = self.__validate_cursor
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Test"]:
            cursor = self.__test_cursor
        else:
            LoggerHelper.critical('Unable To Get Cursor (Check Fetch Type)')
            return None
        cursor.rewind()
        self.clear_data()
        if data_type == NewsDnnBaseDataReader.DictDataType["News"]:
            return self.get_data_news(cursor)
        elif data_type == NewsDnnBaseDataReader.DictDataType["Wiki"]:
            return self.get_data_wiki(cursor)
        elif data_type == NewsDnnBaseDataReader.DictDataType["WikiAndTweet"]:
            return self.get_data_wiki_and_tweet(cursor)
        else:
            LoggerHelper.critical('Unknown Data Type (data_type)')
            return None

    '''
        Get Max Min
    '''

    def get_max_min(self):
        data = {}
        for field in self.configs['database']['max_min']['fields']:
            fields = {field: 1, "_id": 0}
            min = self.db.get_data_one(
                self.configs['database']['name'],
                self.configs['database']['max_min']['query'],
                fields=fields,
                sort=[(field, +1)])
            max = self.db.get_data_one(
                self.configs['database']['name'],
                self.configs['database']['max_min']['query'],
                fields=fields,
                sort=[(field, -1)])
            data[field] = {"max": max, "min": min}
        self.max_min = data
        return data

    '''
        NEWS
    '''

    def get_data_news(self, cursor):
        batch_count = 0
        price_start = self.configs["database"]["price"]["start"]
        price_end = self.configs["database"]["price"]["end"]
        for row in cursor:
            embedded_article = self.word_embedding.get_weight_matrix(
                row["article"])
            if len(embedded_article) < NewsDnnBaseDataReader.ArticleMinSize:
                continue
            self.x.append(self.pad_embedded_article(embedded_article))
            self.y.append(
                NewsDnnBaseDataReader.get_classification(
                    row[price_start], row[price_end],
                    self.configs['database']['price']['buffer_percent']))
            batch_count = batch_count + 1
            if batch_count % self.batch_size == 0:
                yield np.asarray(self.x, dtype=np.float32), np.asarray(
                    self.y, dtype=np.float32)
                self.clear_data()

    '''
        WIKI
    '''

    def get_data_wiki(self, cursor):
        batch_count = 0
        price_start = self.configs["database"]["price"]["start"]
        price_end = self.configs["database"]["price"]["end"]
        wiki_column = self.configs['options']['wiki']['wiki_column']
        for row in cursor:
            embedded_article = self.word_embedding.\
                get_weight_matrix_all(article=row["article"],
                                      wiki=row[wiki_column],
                                      wiki_multiply_factors=self.configs['options']['wiki']['multiply_factors'])
            if len(embedded_article) < NewsDnnBaseDataReader.ArticleMinSize:
                continue
            self.x.append(self.pad_embedded_article(embedded_article))
            self.y.append(
                NewsDnnBaseDataReader.get_classification(
                    row[price_start], row[price_end],
                    self.configs['database']['price']['buffer_percent']))
            batch_count = batch_count + 1
            if batch_count % self.batch_size == 0:
                yield np.asarray(self.x, dtype=np.float32), np.asarray(
                    self.y, dtype=np.float32)
                self.clear_data()

    '''
        WIKI & TWEET
    '''

    def get_data_wiki_and_tweet(self, cursor):
        batch_count = 0
        price_start = self.configs["database"]["price"]["start"]
        price_end = self.configs["database"]["price"]["end"]
        wiki_column = self.configs['options']['wiki']['wiki_column']
        tweet_column = self.configs['options']['twitter']['tweet_column']
        wiki_multiply_factors = self.configs['options']['wiki'][
            'multiply_factors']
        tweet_multiply_factors = self.configs['options']['twitter'][
            'multiply_factors']
        for row in cursor:
            embedded_article = self.word_embedding. \
                get_weight_matrix_all(article=row["article"],
                                      wiki=row[wiki_column],
                                      wiki_multiply_factors=wiki_multiply_factors,
                                      tweet=row[tweet_column],
                                      tweet_multiply_factors=tweet_multiply_factors)
            if len(embedded_article) < NewsDnnBaseDataReader.ArticleMinSize:
                continue
            # Article
            self.x.append(self.pad_embedded_article(embedded_article))
            # Price
            self.y.append(
                NewsDnnBaseDataReader.get_classification(
                    row[price_start], row[price_end],
                    self.configs['database']['price']['buffer_percent']))
            batch_count = batch_count + 1
            if batch_count % self.batch_size == 0:
                yield np.asarray(self.x, dtype=np.float32), np.asarray(
                    self.y, dtype=np.float32)
                self.clear_data()

    '''
        HELPER METHODS
    '''

    def pad_embedded_article(self, embedded_article):
        # Calculate Difference
        padding_difference = (embedded_article.shape[0] - self.sequence_length)
        if padding_difference == 0:
            return embedded_article
        if padding_difference > 0:  # Truncate extra rows
            return embedded_article[:-padding_difference]
        else:  # Add Padding
            return np.pad(embedded_article,
                          ((abs(padding_difference), 0), (0, 0)), 'constant')
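    # Example (hypothetical shapes, sequence_length = 3, 300-d embeddings):
    #   a (5, 300) article is truncated to its first 3 rows -> (3, 300),
    #   a (2, 300) article is zero-padded at the front      -> (3, 300).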

    def clear_data(self):
        self.x = []
        self.y = []

    @staticmethod
    def get_classification(start, end, buffer_percent):
        diff = float(start["Open"]) - float(end["Open"])
        total = float(start["Open"]) + float(end["Open"]) / 2
        percentage = (diff / total) * 100
        if percentage > buffer_percent:
            return 2  # Increase
        elif percentage < -buffer_percent:
            return 1  # Decrease
        else:
            return 0  # Same Value
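A worked example of the thresholding above, with hypothetical rows and buffer_percent = 0.5 (note the code's sign convention: diff is start minus end):

start, end = {"Open": "100.0"}, {"Open": "102.0"}
print(NewsDnnBaseDataReader.get_classification(start, end, 0.5))
# diff = -2.0, total = 101.0, percentage ~ -1.98 < -0.5 -> returns 1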