def parse_currency(currency_key, directory, name):  # Type : 1 - Currency
     print("Currency")
     col = Mongo().create_collection("Currency", FDC.get_index_models())
     with open(directory) as csv_file:
         csv_reader = csv.reader(csv_file, delimiter=',')
         print(currency_key)
         hour = -1
         fd = None
         for row in csv_reader:
             if len(row) < 2:  # Check Data
                 continue
             add_value = 0
             if currency_key == "EURUSD":
                 date = DateHelper.str2date(row[0])
                 add_value = -1
             else:
                 date = DateHelper.str2date(row[0]+row[1])
             if hour != date.hour:
                 hour = date.hour
                 if fd is not None:
                     try:
                         col.insert(fd.get_currency())
                     except Exception:
                         Logger().get_logger().error('Insert Error', exc_info=True)
                 fd = FinancialData(name, currency_key, date,
                                    row[FDLocations.Currency_Open.value + add_value],
                                    row[FDLocations.Currency_High.value + add_value],
                                    row[FDLocations.Currency_Low.value + add_value],
                                    row[FDLocations.Currency_Close.value + add_value])
             else:
                 fd.add(row[FDLocations.Currency_High.value + add_value],
                        row[FDLocations.Currency_Low.value + add_value],
                        row[FDLocations.Currency_Close.value + add_value])
         if fd is not None:  # flush the final hourly bucket so the last hour is not dropped
             try:
                 col.insert(fd.get_currency())
             except Exception:
                 Logger().get_logger().error('Insert Error', exc_info=True)
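
The column offsets above come from an FDLocations enum that is not part of this snippet. A minimal sketch of what it presumably encodes, assuming the usual layout of date, time, open, high, low, close (EURUSD files carry a single combined datetime column, which is why add_value is set to -1 for them):

from enum import Enum

class FDLocations(Enum):  # hypothetical column map; the real enum ships with the project
    Currency_Open = 2   # date in column 0, time in column 1, then OHLC
    Currency_High = 3
    Currency_Low = 4
    Currency_Close = 5
    # the Stock_* and IndexDateTime_* members used in the later examples follow the same idea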
Example 2
 def sentiment_analysis_test(date=None, hashtags=None):
     nlp = pipeline('sentiment-analysis')
     if date is None:
         date = DateHelper.str2date("2015-05-12T16:07:40Z")
     if hashtags is None:
         hashtags = ["oil", "crude", "crude oil"]
     tweets = TwitterForecast.get_tweets_before_date_from_elastic_search(
         date, hashtags, days=5, maxsize=10000)
     total_tweets = tweets["hits"]["total"]["value"]
     if total_tweets == 0:
         print("No Tweet Found")
     else:
         for es_tweet in tweets["hits"]["hits"]:
             tweet = es_tweet["_source"]
             try:
                 text = tweet["tweet_text"].replace("\n", "")
                 username = tweet['tweet_user_name']
                 sentiment = nlp(text)[0]
                 if sentiment['score'] > 0.98:
                     if tweet["tweet_user_verified"]:
                         print('[%s-%s] - %s (%s)' %
                               (u"\U0001F44D", sentiment['label'], text,
                                username))
                     else:
                         print('[%s] - %s (%s)' %
                               (sentiment['label'], text, username))
             except Exception as exception:
                 print(exception)
                 traceback.print_exc()
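
The nlp(text)[0] call relies on the Hugging Face pipeline returning a list with one dict per input, each holding a label and a score. A small standalone check of that shape (the model is downloaded on first use):

from transformers import pipeline

nlp = pipeline('sentiment-analysis')
result = nlp("Crude oil prices rallied after the inventory report.")[0]
print(result['label'], round(result['score'], 4))  # e.g. POSITIVE 0.9987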
 def parse_index_datetime(currency_key, directory, name, interval):  # Type : 4 - Index
     col = Mongo().create_collection("Index")
     with open(directory) as csv_file:
         csv_reader = csv.reader(csv_file, delimiter=',')
         line_count = 0
         print(currency_key)
         hour = -1
         hour_count = 0
         fd = None
         for row in csv_reader:
             if len(row) < 2:  # Check Data
                 continue
             date = DateHelper.str2date(row[0] + row[1])
             if hour != date.hour:
                 hour = date.hour
                 hour_count = 0
                 if fd is not None:
                     print(fd)
                     try:
                         col.insert(fd.get_index())
                     except Exception:
                         Logger().get_logger().error('Insert Error', exc_info=True)
                 fd = FinancialData(name, currency_key, date,
                                    row[FDLocations.IndexDateTime_Open.value],
                                    row[FDLocations.IndexDateTime_High.value],
                                    row[FDLocations.IndexDateTime_Low.value],
                                    row[FDLocations.IndexDateTime_Close.value])
             else:
                 fd.add(row[FDLocations.IndexDateTime_High.value],
                        row[FDLocations.IndexDateTime_Low.value],
                        row[FDLocations.IndexDateTime_Close.value])
                 hour_count += 1
             line_count += 1
         if fd is not None:  # flush the final hourly bucket so the last hour is not dropped
             try:
                 col.insert(fd.get_index())
             except Exception:
                 Logger().get_logger().error('Insert Error', exc_info=True)
         print(f'Processed {line_count} lines.')
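
Every parser above funnels its date columns through DateHelper.str2date, which is not shown here. A plausible minimal version, assuming it simply delegates to dateutil (the project's real helper may add its own formats or time zones):

from dateutil import parser as date_parser

class DateHelper:  # hypothetical sketch of the helper used throughout these examples
    @staticmethod
    def str2date(text):
        # dateutil handles both compact "YYYYMMDDHHMMSS" strings and ISO-8601 timestamps
        return date_parser.parse(text)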
Example 4
 async def __price_handler(self, request):
     request = await request.json()
     date = DateHelper.str2date(request['news_date'])
     info = self.get_price_before_date(self.db, request['collection'],
                                       request['key'], date,
                                       request['range'])
     date_list = []
     open_list = []
     high_list = []
     low_list = []
     close_list = []
     volume_list = []
     for a in info:
         date_list.append(str(a.get('Date')))
         open_list.append(a.get('Open'))
         high_list.append(a.get('High'))
         low_list.append(a.get('Low'))
         close_list.append(a.get('Close'))
         volume_list.append(a.get('Volume'))
     res = {
         'Title': request['collection'] + " - " + request['key'],
         'PriceDate': date_list,
         'OpenPrice': open_list,
         'HighPrice': high_list,
         'LowPrice': low_list,
         'ClosePrice': close_list,
         'Volume': volume_list
     }
     res = JSONEncoder().encode(res)
     # res is already a JSON string, so return it as text to avoid double encoding
     return web.json_response(text=res)
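
__price_handler is an aiohttp coroutine; wiring a handler like it into an application looks roughly like the sketch below. The stub handler, the '/price' path and the port are placeholders for illustration only:

from aiohttp import web

async def price_handler(request):   # stand-in for the __price_handler coroutine above
    payload = await request.json()
    return web.json_response(text='{"Title": "%s"}' % payload.get('collection', ''))

app = web.Application()
app.router.add_post('/price', price_handler)   # '/price' and the port are assumptions
web.run_app(app, port=8080)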
Example 5
 def get_date(news):
     date = news['Date']
     rss_date = news['RSS_Date']
     selected_date = rss_date
     if date:
         if DateHelper.is_time_of_date_exist(date):
             try:
                 if date > rss_date:
                     selected_date = rss_date
                 else:
                     selected_date = date
             except:
                 selected_date = date
     elif rss_date:
         selected_date = rss_date
     else:
         try:
             metadata = news['Meta_Data'].get("pubdate")
             if metadata:
                 return DateHelper.str2date(metadata)
             else:
                 html = news['HTML']
                 sub_index = html.find('publishDate')
                 if sub_index > 0:
                     date = html[sub_index:(sub_index + 100)]
                     result = re.search('publishDate":"(.*?)",', date)
                     if result:
                         print(result.group(1))
                         selected_date = DateHelper.str2date(
                             result.group(1))
                     else:
                         return None
                 else:
                     return None
         except Exception:
             return None
     return selected_date
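
The regular expression in the HTML fallback targets a JSON fragment embedded in the page source; a quick illustration of the pattern against a made-up snippet:

import re

html = 'window.meta = {"publishDate":"2015-05-12T16:07:40Z","author":"..."}'  # fabricated example
match = re.search('publishDate":"(.*?)",', html)
if match:
    print(match.group(1))  # 2015-05-12T16:07:40Z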
 def collect(self):
     db = Mongo()
     conn = sqlite3.connect(self.SQL_LOCATION)
     c = conn.cursor()
     c.execute(
         'SELECT title, author, date, publication, category, digital, section, url FROM longform'
     )
     line_count = 0
     date_count = 0
     newslist = []
     for row in c:
         url = row[self.Url]
         date = DateHelper.str2date(row[self.Date])
         title = row[self.Title]
         if url == "" or url is None or date == "":  # Is There Url Or Date
             continue
         if db.is_title_url_exists(title, url):
             continue
         allUrls = FileCollector.extract_url_from_text(url)
         article = Article(allUrls[1])
         category = row[self.Category]
         section = row[self.Section]
         newslist.append(
             News.RssNews(title=title,
                          time=date,
                          summery='',
                          category=FileCollector.get_category(
                              category, section),
                          tags='',
                          url=allUrls[1],
                          iaurl=allUrls[0],
                          article=article))
         print(line_count)
         if len(newslist) == 20:
             pool = NewsPool()
             pool.set(newslist)
             pool.join()
             newslist = []
         line_count += 1
     if newslist:  # download any remaining articles that never filled a batch of 20
         pool = NewsPool()
         pool.set(newslist)
         pool.join()
     print(f'\t{line_count}')
     print(f'\t{len(newslist)}')
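
collect indexes each SQLite row with class-level constants (self.Title, self.Date, self.Url, ...). They are not shown, but presumably they just mirror the column order of the SELECT statement; an inferred sketch:

class FileCollector:  # only the assumed index constants are sketched here
    # SELECT title, author, date, publication, category, digital, section, url FROM longform
    Title, Author, Date, Publication, Category, Digital, Section, Url = range(8)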
Example 7
    def __init__(self,
                 config,
                 epochs=None,
                 batch_size=None,
                 seq_length=None,
                 use_gpu=None,
                 lr=None,
                 hidden_size=None):
        self.config = config
        if epochs is None:
            self.epochs = config["networkConfig"]["epochs"]
        else:
            self.epochs = epochs
        if batch_size is None:
            self.batch_size = config["networkConfig"]["batch_size"]
        else:
            self.batch_size = batch_size
        if seq_length is None:
            self.seq_length = self.config["networkConfig"]["sequence_length"]
        else:
            self.seq_length = seq_length
        if use_gpu is None:
            self.use_gpu = self.config["networkConfig"]["useGPU"]
        else:
            self.use_gpu = use_gpu
        if hidden_size is None:
            if self.config["networkConfig"]["hidden_size"] < 0:
                self.hidden_size = None
            else:
                self.hidden_size = self.config["networkConfig"]["hidden_size"]
        else:
            self.hidden_size = hidden_size
        if lr is None:
            self.lr = self.config["networkConfig"]["learning_rate"]
        else:
            self.lr = lr

        self.timer = Timer()
        self.current_date = DateHelper.get_current_date()
        self.criterion = self.load_criterion()
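
The constructor falls back to config["networkConfig"] for every hyperparameter it is not given explicitly. A representative config fragment with the keys it reads (values are illustrative only):

config = {
    "networkConfig": {
        "epochs": 10,
        "batch_size": 32,
        "sequence_length": 50,
        "useGPU": True,
        "hidden_size": -1,        # negative means "no explicit hidden size", as handled above
        "learning_rate": 0.001,
    }
}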
 def parse_stock(currency_key, directory, name, interval):  # Type : 3 - Stock
     print("Stock")
     col = Mongo().create_collection("Stock", FDC.get_index_models())
     with open(directory) as csv_file:
         csv_reader = csv.reader(csv_file, delimiter=',')
         print(currency_key)
         for row in csv_reader:
             if len(row) < 2:  # Check Data
                 continue
             date = DateHelper.str2date(row[0])
             if interval == 60:
                 fd = FinancialData(name, currency_key, date,
                                    row[FDLocations.Stock_Open.value],
                                    row[FDLocations.Stock_High.value],
                                    row[FDLocations.Stock_Low.value],
                                    row[FDLocations.Stock_Close.value],
                                    row[FDLocations.Stock_Volume.value],
                                    row[FDLocations.Stock_Trade.value],
                                    row[FDLocations.Stock_Avg.value])
                 col.insert(fd.get_stock())
             else:
                 print("Not Handled !!!")
Example 9
    def train(self, lr=0.001, clip=5, val_frac=0.1, print_every=10):
        """ Training a network

            Arguments
            ---------
            lr: learning rate
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: Number of steps for printing training and validation loss

        """
        df = pandas.DataFrame(
            columns=['Epoch', 'Step', 'Last Train Loss', 'Mean Test Loss'])
        self.timer.start()
        self.model.train()

        if self.model.train_on_gpu:
            self.model.cuda()

        counter = 0
        h = None
        for e in range(self.epochs):
            if h is None:  # initialize hidden state
                h = self.model.init_hidden(self.reader.batch_size)

            for x, y in self.reader.get_train_data():  # get_batches(data, batch_size, seq_length)
                counter += 1
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                if self.model.train_on_gpu:
                    inputs, targets = inputs.cuda(), targets.cuda()

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                h = tuple([each.data for each in h])

                # zero accumulated gradients
                self.model.zero_grad()
                # get the output from the model; input should be 3-dimensional: (seq_len, batch, input_size)
                output, h = self.model(inputs, h)
                # calculate the loss and perform back propagation
                loss = self.criterion(
                    output,
                    targets.view(self.reader.batch_size *
                                 self.reader.sequence_length))
                loss.backward()
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(self.model.parameters(), clip)
                self.optimizer.step()

                # loss stats
                if counter % print_every == 0:
                    # Get validation loss
                    val_h = self.model.init_hidden(self.reader.batch_size)
                    val_losses = []
                    self.model.eval()
                    for x, y in self.reader.get_test_data():  # get_batches(val_data, batch_size, seq_length)

                        x, y = torch.from_numpy(x), torch.from_numpy(y)

                        # Creating new variables for the hidden state, otherwise
                        # we'd backprop through the entire training history
                        val_h = tuple([each.data for each in val_h])

                        inputs, targets = x, y
                        if self.model.train_on_gpu:
                            inputs, targets = inputs.cuda(), targets.cuda()

                        output, val_h = self.model(inputs, val_h)
                        val_loss = self.criterion(
                            output,
                            targets.view(self.reader.batch_size *
                                         self.reader.sequence_length))

                        val_losses.append(val_loss.item())

                    self.model.train()  # reset to train mode after iterating through validation data
                    print("Epoch: {}/{}...".format(e + 1, self.epochs),
                          "Step: {}...".format(counter),
                          "Loss: {:.4f}...".format(loss.item()),
                          "Val Loss: {:.4f}".format(np.mean(val_losses)))
                    # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat instead
                    df = pandas.concat([df, pandas.DataFrame([{
                        'Epoch': "{}/{}".format(e + 1, self.epochs),
                        'Step': counter,
                        'Last Train Loss': loss.item(),
                        'Mean Test Loss': np.mean(val_losses)
                    }])], ignore_index=True)
        self.timer.stop()
        self.save_model()
        date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, date)
        Export.append_df_to_excel(self.get_info(), date)
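
The loop above rebuilds the hidden state with tuple([each.data for each in h]) so that backpropagation does not reach into earlier batches. In current PyTorch the same effect is usually written with detach(); a minimal equivalent helper:

def detach_hidden(h):
    # stop gradients from flowing into previous batches without copying the values
    if isinstance(h, tuple):          # LSTM hidden state: (h_n, c_n)
        return tuple(t.detach() for t in h)
    return h.detach()                 # GRU / vanilla RNN: a single tensor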
Example 10
    def train(self, clip=5, val_frac=0.1, print_every=20):
        """ Training a network

            Arguments
            ---------
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: Number of steps for printing training and validation loss

        """
        df = pandas.DataFrame(
            columns=['Epoch', 'Step', 'Last Train Loss', 'Mean Test Loss'])
        self.timer.start()
        self.model.train()

        if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
            self.model.cuda()

        counter = 0
        h = None
        for e in range(self.epochs):
            h = self.model.init_hidden(self.reader.batch_size)

            print(self.config["options"]["network_type"])
            print(NewsDnnGeneralDataReader.DictDataType[self.config["options"]
                                                        ["network_type"]])
            # Batch Loop
            for x, y in self.reader.get_data(
                    fetch_type=NewsDnnGeneralDataReader.DictDataTerm["Train"],
                    data_type=NewsDnnGeneralDataReader.DictDataType[
                        self.config["options"]["network_type"]]):
                counter += 1
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                if self.model.can_use_gpu and self.config["networkConfig"][
                        "useGPU"]:
                    inputs, targets = inputs.cuda(), targets.cuda()

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                h = tuple([each.data for each in h])

                # zero accumulated gradients
                self.model.zero_grad()

                # get the output from the model; input should be 3-dimensional: (seq_len, batch, input_size)
                output, h = self.model(inputs, h)

                # calculate the loss and perform back propagation
                loss = self.criterion(output.squeeze(), targets.long())
                loss.backward()

                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(self.model.parameters(), clip)
                self.optimizer.step()

                # Validate
                if counter % print_every == 0:
                    timer = Timer()
                    timer.start()
                    df = self.validate(df, e, counter, loss)
                    timer.stop(time_for="Validate")
                self.model.train()
        self.timer.stop(time_for="Train")
        self.save_model()
        self.current_date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, self.current_date)
        Export.append_df_to_excel(self.get_info(), self.current_date)
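
The loss call here, self.criterion(output.squeeze(), targets.long()), matches a classification criterion such as nn.CrossEntropyLoss, which expects raw logits of shape (batch, num_classes) and integer class indices of shape (batch,). load_criterion is not shown, so this is an assumption; a quick shape check under it, with three classes as an example:

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.randn(4, 3)               # (batch=4, num_classes=3) raw, unnormalised scores
targets = torch.tensor([0, 2, 1, 2])     # integer class indices, dtype long
print(criterion(logits, targets).item())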
Example 11
    def train(self, clip=5, val_frac=0.1, print_every=20):
        """ Training a network

            Arguments
            ---------
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            print_every: Number of steps for printing training and validation loss

        """
        df = pandas.DataFrame(columns=['Epoch', 'Step',
                                       'Train Mean Loss Cumulative', 'Train Accuracy',
                                       'Val Mean Loss', 'Val Accuracy'])
        self.timer.start()
        self.model.train()

        if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
            self.model.cuda()

        counter = 0
        for e in range(self.epochs):

            print(self.config["options"]["network_type"])
            print(NewsDnnBaseDataReader.DictDataType[self.config["options"]["network_type"]])
            train_accuracy = 0
            losses = []
            # Batch Loop
            for x, y in self.reader.get_data(fetch_type=NewsDnnBaseDataReader.DictDataTerm["Train"],
                                             data_type=NewsDnnBaseDataReader.DictDataType[self.config["options"]["network_type"]]):
                counter += 1
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
                    inputs, targets = inputs.cuda(), targets.cuda()

                # zero accumulated gradients
                self.optimizer.zero_grad()
                # self.model.zero_grad()

                # get the output from the model -
                output = self.model(inputs)  # Input Should Be 3-Dimensional: seq_len, batch, input_size

                # calculate the loss and perform back propagation
                loss = self.criterion(output, targets.long())
                loss.backward()
                losses.append(loss.item())
                train_accuracy += self.calculate_accuracy(output, targets)

                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(self.model.parameters(), clip)
                self.optimizer.step()

                # Validate In Steps
                if counter % print_every == 0:
                    timer = Timer()
                    timer.start()
                    df = self.validate(df, e, counter, losses, train_accuracy, print_every)
                    train_accuracy = 0  # Clear Train Accuracy
                    timer.stop(time_for="Validate")
                    self.model.train()
        self.timer.stop(time_for="Train")
        self.save_model()
        self.current_date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, self.current_date)
        Export.append_df_to_excel(self.get_info(), self.current_date)
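
calculate_accuracy is not shown either; for per-batch logits it is typically an argmax comparison against the labels. A sketch of what the method might do (an assumption, not the project's actual implementation):

def calculate_accuracy(output, targets):
    # fraction of samples in the batch whose highest-scoring class matches the label
    predictions = output.argmax(dim=1)
    return (predictions == targets.long()).float().mean().item()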
Example 12
    def train(self, print_every=20):
        df = pandas.DataFrame(columns=['Epoch', 'Step',
                                       'Train Mean Loss Cumulative', 'Train Accuracy',
                                       'Val Mean Loss', 'Val Accuracy'])
        self.timer.start()
        self.model.train()  # Set mode of model
        losses = []
        train_set = self.reader.get_data(fetch_type=NewsCateDataReader.DictDataTerm["Train"],
                                         data_type=NewsCateDataReader.DictDataType[
                                             self.config["options"]["network_type"]])
        for e in range(self.epochs):
            print(self.config["options"]["network_type"])
            print(NewsCateDataReader.DictDataType[
                      self.config["options"]["network_type"]])
            self.model.train()  # Set to Train Mode
            total_loss_for_epoch = 0

            epoch_timer = Timer()
            epoch_timer.start()
            for step, batch in enumerate(train_set):  # For each batch of training data...
                # Progress update every `print_every` batches.
                if step % print_every == 0:
                    # Report progress.
                    print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_set)))
                # Get Data
                b_input_ids = batch[0].to(self.device)
                b_input_mask = batch[1].to(self.device)
                b_labels = batch[2].to(self.device)

                # Clear any previously accumulated gradients before the backward pass
                # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
                self.model.zero_grad()

                # Perform a forward pass (evaluate the model on this training batch).
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                outputs = self.model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask,
                                     labels=b_labels)
                loss = outputs[0]
                total_loss_for_epoch += loss.item()

                # Perform a backward pass to calculate the gradients.
                loss.backward()

                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                # Update the parameters based on the computed gradients and the learning rate.
                self.optimizer.step()

                # Update the learning rate.
                self.scheduler.step()

            # Calculate the average loss over the training data.
            avg_train_loss = total_loss_for_epoch / len(train_set)

            # Store the loss value for plotting the learning curve.
            losses.append(avg_train_loss)
            LoggerHelper.info("  Average training loss: {0:.2f}".format(avg_train_loss))
            epoch_timer.stop(time_for="Epoch")

            timer = Timer(start=True)
            df = self.validate(df, e, losses)
            timer.stop(time_for="Validate")
            self.model.train()
        self.timer.stop(time_for="Train")
        self.save_model()
        self.current_date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, self.current_date)
        Export.append_df_to_excel(self.get_info(), self.current_date)
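
Example 12 steps both self.optimizer and self.scheduler, which are created elsewhere in the class. For BERT fine-tuning they are commonly built as in the hypothetical load_optimizer sketch below; the optimizer choice, learning rate and warmup settings are typical defaults, not necessarily the project's values:

import torch
from transformers import get_linear_schedule_with_warmup

def load_optimizer(model, num_batches, epochs):
    # typical BERT fine-tuning defaults; pass the model and the number of batches per epoch
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=num_batches * epochs)
    return optimizer, scheduler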