def makeImportVocab(cls,
                        basic_path=None,
                        keyword_csv_file=None,
                        important_vocab_csv_file=None):
        # Initialize the source file path and the output file path
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        if keyword_csv_file is None:
            return None
        if important_vocab_csv_file is None:
            return None
        input_data_path = os.path.join(basic_path, keyword_csv_file)
        output_data_path = os.path.join(basic_path, important_vocab_csv_file)
        VTool.makeDirs(files=[output_data_path])
        # Clear the output file and initialize its column names
        pd.DataFrame({
            "0": [],
            "1": []
        }).to_csv(output_data_path, index=False, encoding="utf-8")

        i = 0
        vocab = {}
        reader = pd.read_csv(input_data_path, chunksize=5000)
        for sentences in reader:
            for sentence in sentences['1']:
                i += 1
                print(i)
                if str(sentence) == 'nan':
                    continue
                words = sentence.split(" ")
                for word in words:
                    if word not in vocab:
                        vocab[word] = 1
                    else:
                        vocab[word] += 1
        sorted_vocab = sorted(vocab.items(), key=lambda v: v[1], reverse=True)

        data = []
        for word, num in sorted_vocab:
            data.append([word, num])
        if len(data) != 0:
            pd.DataFrame(data).to_csv(output_data_path,
                                      index=False,
                                      header=False,
                                      mode="a",
                                      encoding="utf-8")
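# The method above is essentially a chunked word-frequency count over column "1" of the
# keyword CSV. A minimal standalone sketch of the same idea is below; the helper name and
# the example file name are hypothetical, not part of the project.
import collections
import pandas as pd

def count_word_frequencies(csv_path, chunksize=5000):
    """Return [(word, count), ...] sorted by descending count, skipping NaN rows."""
    vocab = collections.Counter()
    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        for sentence in chunk['1'].dropna():
            vocab.update(str(sentence).split(" "))
    return vocab.most_common()

# Example (hypothetical file): count_word_frequencies("tfidf_keyword_cache.csv")[:10]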
    def train(self,
              basic_path=None,
              input_file=None,
              output_folder=None,
              embedding_dim=0,
              folder_extra='',
              filter_sizes=None,
              reduce_num=0,
              test_part_start=0.9,
              test_part_end=1,
              data_stand=False,
              times=10):
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        if input_file is None or output_folder is None:
            return None
        if filter_sizes is not None:
            self.filter_sizes = filter_sizes

        input_path = os.path.join(basic_path, input_file)
        output_path = os.path.join(basic_path, output_folder)
        VTool.makeDirs(folders=[output_path])
        self.embedding_dim = embedding_dim
        print("Writing to {}\n".format(output_path))

        tf.reset_default_graph()
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=self.allow_soft_placement,
                log_device_placement=self.log_device_placement)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                x_train, y_train, batch_index, _, _, _ = DataHelper.get_number_data(
                    file=input_path,
                    batch_size=self.batch_size,
                    reduce_num=reduce_num,
                    test_part_start=test_part_start,
                    test_part_end=test_part_end,
                    stand=data_stand)
                if len(x_train) <= 0:
                    print("CSV No Data!!!")
                    exit()
                print("x.shape = {}".format(x_train.shape))
                print("y.shape = {}".format(y_train.shape))

                cnn = CnnImage(sequence_length=x_train.shape[1],
                               num_classes=y_train.shape[1],
                               embedding_size=self.embedding_dim,
                               filter_sizes=self.filter_sizes,
                               num_filters=self.num_filters,
                               full_layer_filters=self.full_layer_filters,
                               l2_reg_lambda=self.l2_reg_lambda)

                # Define Training procedure
                global_step = tf.Variable(0,
                                          name="global_step",
                                          trainable=False)
                optimizer = tf.train.AdamOptimizer(self.learn_rate)
                grads_and_vars = optimizer.compute_gradients(cnn.loss)
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step=global_step)
                '''
                # Keep track of gradient values and sparsity (optional)
                grad_summaries = []
                for g, v in grads_and_vars:
                    if g is not None:
                        grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                        sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                        grad_summaries.append(grad_hist_summary)
                        grad_summaries.append(sparsity_summary)
                grad_summaries_merged = tf.summary.merge(grad_summaries)

                # Output directory for models and summaries
                print("Writing to {}\n".format(output_path))

                # Summaries for loss and accuracy
                loss_summary = tf.summary.scalar("loss", cnn.loss)
                acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

                # Train Summaries
                summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
                summary_dir = os.path.join(output_path, "train_summaries")
                summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)
                VTool.makeDirs(folders=[summary_dir])
                '''

                checkpoint_dir = os.path.abspath(
                    os.path.join(output_path, "checkpoints" + folder_extra))
                checkpoint_prefix = os.path.join(checkpoint_dir, "model")

                VTool.makeDirs(folders=[checkpoint_dir])
                saver = tf.train.Saver()
                sess.run(tf.global_variables_initializer())
                for i in range(times):
                    for step in range(len(batch_index) - 1):
                        feed_dict = {
                            cnn.input_x:
                            x_train[batch_index[step]:batch_index[step + 1]],
                            cnn.input_y:
                            y_train[batch_index[step]:batch_index[step + 1]],
                            cnn.dropout_keep_prob:
                            self.dropout_keep_prob
                        }

                        _, loss, accuracy, predictions, input_y_index = sess.run(
                            [
                                train_op, cnn.loss, cnn.accuracy,
                                cnn.predictions, cnn.input_y_index
                            ], feed_dict)
                        time_str = datetime.datetime.now().isoformat()
                        print("{}: step {}, loss {:g}, acc {:g}".format(
                            time_str, step, loss, accuracy))
                        all_accuracy = cnn.various_accuracy(
                            self.num_labels, input_y_index.tolist(),
                            predictions.tolist())
                        for a in all_accuracy:
                            print(
                                "input_nums: {:g}, pre_nums: {:g}, right_nums: {:g}, accuracy: {:g}"
                                .format(a[0], a[1], a[2], a[3]))
                        # summary_writer.add_summary(summaries, step)

                    if i % 5 == 0:
                        print("保存模型:", saver.save(sess, checkpoint_prefix))
                print("保存模型:", saver.save(sess, checkpoint_prefix))
                print("The train has finished")
    def calcuWordTrend(self,
                       cur=None,
                       choose_dates=None,
                       basic_path=None,
                       word_cache_file=None,
                       output_file=None):
        if cur is None or choose_dates is None or output_file is None or word_cache_file is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        word_cache_path = os.path.join(basic_path, word_cache_file)

        index = 0
        is_reload = False
        for choose_date in choose_dates:
            output_path = os.path.join(basic_path,
                                       output_file + '_%s.csv' % index)

            index += 1
            if os.path.exists(output_path) and not is_reload:
                continue

            VTool.makeDirs(files=[output_path])

            date_str_arr = []
            date_rate = {}
            for k in choose_date:
                for d in choose_date[k]:
                    date = d[1]
                    date_str_arr.append('"' + date + '"')
                    date_rate[date] = d[2]
            date_str = ",".join(date_str_arr)

            news = []
            if len(date_str_arr) > 0:
                cur.execute(
                    "SELECT id, time FROM news WHERE time in (%s) order by time, content"
                    % (date_str))
                news_temp = cur.fetchall()

                news_by_id = {}
                for n in news_temp:
                    news_by_id[n[0]] = {}
                    news_by_id[n[0]]['date'] = str(n[1])
                    news_by_id[n[0]]['words'] = ''
                del news_temp

                nid_len = len(news_by_id)
                reader = pd.read_csv(word_cache_path, chunksize=1000)
                for sentences in reader:
                    for k in sentences['1'].keys():
                        nid = sentences['0'][k]
                        if nid in news_by_id and news_by_id[nid]['words'] == '':
                            news_by_id[nid]['words'] = str(
                                sentences['1'][k]).split(" ")
                            nid_len -= 1
                    if nid_len <= 0:
                        break
                reader.close()
                del reader, sentences
            print(len(news_by_id))

            word_dict = {
                "words": {},
                "up_total_words": 0,
                "down_total_words": 0
            }
            i = 0
            for k in news_by_id:
                date = news_by_id[k]['date']
                if date not in date_rate:
                    continue
                if date_rate[date] >= 0:
                    ckey = "up"
                else:
                    ckey = "down"
                words = news_by_id[k]['words']
                for w in words:
                    if w not in word_dict["words"]:
                        word_dict["words"][w] = {"up": 0, "down": 0}
                    word_dict["words"][w][ckey] += 1
                    word_dict["%s_total_words" % ckey] += 1

                i += 1
                print(i)

            if word_dict["up_total_words"] != 0:
                for w in word_dict["words"]:
                    word_dict["words"][w]["up"] = word_dict["words"][w][
                        "up"] / word_dict["up_total_words"]
            if word_dict["down_total_words"] != 0:
                for w in word_dict["words"]:
                    word_dict["words"][w]["down"] = word_dict["words"][w][
                        "down"] / word_dict["down_total_words"]

            csv_data = []
            for w in word_dict["words"]:
                csv_data.append([
                    w, word_dict["words"][w]["up"],
                    word_dict["words"][w]["down"]
                ])
            csv_data.append([
                'total_words', word_dict["up_total_words"],
                word_dict["down_total_words"]
            ])

            pd.DataFrame({
                "0": [],
                "1": [],
                "2": []
            }).to_csv(output_path, index=False, encoding="utf-8")
            pd.DataFrame(csv_data).to_csv(output_path,
                                          index=False,
                                          header=False,
                                          mode="a",
                                          encoding="utf-8")
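# calcuWordTrend above boils down to class-conditional word frequencies: count how often each
# word appears in news published on "up" days versus "down" days, then normalize by the class
# totals. A tiny worked sketch with made-up inputs (the helper and its data are hypothetical):
def word_trend(news_words, day_rate):
    """news_words: {news_id: (date, [words])}; day_rate: {date: daily return}."""
    counts, totals = {}, {"up": 0, "down": 0}
    for _, (date, words) in news_words.items():
        if date not in day_rate:
            continue
        ckey = "up" if day_rate[date] >= 0 else "down"
        for w in words:
            counts.setdefault(w, {"up": 0, "down": 0})[ckey] += 1
            totals[ckey] += 1
    freqs = {w: {k: (c[k] / totals[k] if totals[k] else 0) for k in c} for w, c in counts.items()}
    return freqs, totals

freqs, totals = word_trend({1: ("2018-01-02", ["rally", "growth"]),
                            2: ("2018-01-03", ["loss", "growth"])},
                           {"2018-01-02": 1.2, "2018-01-03": -0.8})
# freqs["growth"] -> {"up": 0.5, "down": 0.5}; totals -> {"up": 2, "down": 2}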
    def makeTrendStockOriginCsv(cls,
                                cur=None,
                                start_date=None,
                                end_date=None,
                                day_num=3,
                                basic_path=None,
                                stock_id=None,
                                word_trend_file=None,
                                news_file=None,
                                output_file=None):
        # Initialize the source file paths and the output file path
        if cur is None or start_date is None or end_date is None or stock_id is None or output_file is None or word_trend_file is None or news_file is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))

        word_trend_path = os.path.join(basic_path, word_trend_file)
        news_path = os.path.join(basic_path, news_file)
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])
        # Clear the output file and initialize its column names
        pd.DataFrame({
            "0": [],
            "1": []
        }).to_csv(output_path, index=False, encoding="utf-8")

        word_trend = {}
        word_trend_temp = pd.read_csv(word_trend_path)
        for k in word_trend_temp["0"].keys():
            word_trend[word_trend_temp["0"][k]] = [
                word_trend_temp["1"][k], word_trend_temp["2"][k]
            ]
        p_up = word_trend['total_words'][0] / (word_trend['total_words'][0] +
                                               word_trend['total_words'][1])
        p_down = word_trend['total_words'][1] / (word_trend['total_words'][0] +
                                                 word_trend['total_words'][1])

        cur.execute(
            "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        count = cur.fetchall()
        count = count[0][0]
        deviation = 2
        skip = 100
        slimit = 0
        while slimit < count:
            cur.execute(
                "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d "
                %
                (stock_id, start_date, end_date,
                 0 if slimit - day_num - deviation < 0 else slimit - day_num -
                 deviation, skip if slimit - day_num - deviation < 0 else
                 skip + day_num + deviation))
            history_tt = cur.fetchall()
            history_t = []
            for h in history_tt:
                history_t.append([
                    int(h[0]),
                    float(h[1]),
                    float(h[2]),
                    float(h[3]),
                    float(h[4]),
                    float(h[5]),
                    float(h[6]),
                    float(h[7]),
                    float(h[8]),
                    str(h[9])
                ])
            del history_tt

            history_temp = []
            for h in zip(*history_t):
                history_temp.append(h)
            history = {
                'stock_id': history_temp[0],
                'opening': history_temp[1],
                'closing': history_temp[2],
                'difference': history_temp[3],
                'percentage_difference': history_temp[4],
                'lowest': history_temp[5],
                'highest': history_temp[6],
                'volume': history_temp[7],
                'amount': history_temp[8],
                'date': history_temp[9]
            }
            del history_t, history_temp
            history = DataFrame(history)
            g_history = history.groupby(by=['stock_id'])
            # 0.01 -> 1%; keep 2 decimal places
            history['rate'] = 100 * (g_history.shift(0)["closing"] /
                                     g_history.shift(1)["closing"] - 1)
            history.dropna(axis=0,
                           how='any',
                           thresh=None,
                           subset=None,
                           inplace=True)
            '''
            '''
            sdate = str(history['date'][history['date'].keys()[0]])
            edate = str(history['date'][history['date'].keys()[-1]])
            cur.execute(
                "SELECT GROUP_CONCAT(id  SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' group by time"
                % (sdate, edate))
            news_temp = cur.fetchall()
            news_by_date = {}
            news_by_id = {}
            for n in news_temp:
                news_by_date[str(n[1])] = n[0].split(",")
                for nid in news_by_date[str(n[1])]:
                    news_by_id[nid] = None
            del news_temp

            nid_len = len(news_by_id)
            reader = pd.read_csv(news_path, chunksize=1000)
            for sentences in reader:
                if nid_len > 0:
                    for k in sentences['1'].keys():
                        nid = str(sentences['0'][k])
                        if nid in news_by_id and news_by_id[nid] is None:
                            news_by_id[nid] = str(sentences['1'][k]).split(" ")
                            wp_up = p_up
                            wp_down = p_down
                            for w in news_by_id[nid]:
                                if w not in word_trend:
                                    wp_up *= (1 / word_trend['total_words'][0])
                                    wp_down *= (1 /
                                                word_trend['total_words'][1])
                                else:
                                    if word_trend[w][0] > 0:
                                        wp_up *= word_trend[w][0]
                                    else:
                                        wp_up *= (1 /
                                                  word_trend['total_words'][0])

                                    if word_trend[w][1] > 0:
                                        wp_down *= word_trend[w][1]
                                    else:
                                        wp_down *= (
                                            1 / word_trend['total_words'][1])
                                while True:
                                    if wp_up < 1 and wp_down < 1:
                                        wp_up *= 10
                                        wp_down *= 10
                                    else:
                                        break

                            news_by_id[nid] = [
                                wp_up / (wp_up + wp_down),
                                -1 * wp_down / (wp_up + wp_down)
                            ]
                            nid_len -= 1
                            if nid_len <= 0:
                                break
                else:
                    break
            reader.close()
            del reader, sentences

            for d in news_by_date:
                sumn = [0, 0]
                for nid in news_by_date[d]:
                    sumn[0] += news_by_id[nid][0]
                    sumn[1] += news_by_id[nid][1]
                le = len(news_by_date[d])
                if le > 0:
                    sumn[0] /= le
                    sumn[1] /= le
                news_by_date[d] = sumn
                print(d)

            history['news_pos_num'] = 0
            history['news_neg_num'] = 0
            for i in history.index:
                history.loc[i, 'rate'] = str(
                    np.round(float(history['rate'][i]), 2))
                if str(history['date'][i]) in news_by_date:
                    history.loc[i, 'news_pos_num'] = str(
                        np.round(
                            float(news_by_date[str(history['date'][i])][0]),
                            2))
                    history.loc[i, 'news_neg_num'] = str(
                        np.round(
                            float(news_by_date[str(history['date'][i])][1]),
                            2))
                else:
                    history.loc[i, 'news_pos_num'] = "0"
                    history.loc[i, 'news_neg_num'] = "0"

            # Convert the standardized data into the form accepted by the training and test sets
            def func_train_data(data_stock):
                if cls.groupby_skip == False:
                    cls.groupby_skip = True
                    return None
                print("正在处理的股票代码:%06s" % data_stock.name)

                data_temp_x = data_stock[[
                    "opening", "closing", "difference",
                    "percentage_difference", "lowest", "highest", "volume",
                    "amount", "news_pos_num", "news_neg_num"
                ]]
                data_temp_y = data_stock[["rate", "date", "stock_id"]]
                data_res = []
                for i in range(day_num - 1,
                               len(data_temp_x.index) - deviation):
                    data_res.append([
                        data_temp_x.iloc[i - day_num + 1:i + 1].values.reshape(
                            day_num, 10).tolist()
                    ] + data_temp_y.iloc[i + deviation].values.reshape(
                        1, 3).tolist())
                if len(data_res) != 0:
                    pd.DataFrame(data_res).to_csv(output_path,
                                                  index=False,
                                                  header=False,
                                                  mode="a")

            g_stock_num = history.groupby(by=["stock_id"])
            cls.groupby_skip = False
            g_stock_num.apply(func_train_data)
            slimit += skip
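# The scoring loop inside makeTrendStockOriginCsv multiplies the class priors (p_up, p_down)
# by per-word class frequencies, Naive-Bayes style, and rescales both running products by 10
# whenever they both drop below 1 so they do not underflow to zero. A condensed sketch of that
# scoring step; the word_trend layout mirrors the method above, the sample numbers are made up.
def score_news(words, word_trend, p_up, p_down):
    up_total, down_total = word_trend['total_words']
    wp_up, wp_down = p_up, p_down
    for w in words:
        up_f, down_f = word_trend.get(w, (0, 0))
        wp_up *= up_f if up_f > 0 else 1 / up_total
        wp_down *= down_f if down_f > 0 else 1 / down_total
        while wp_up < 1 and wp_down < 1:   # keep both scores away from underflow
            wp_up *= 10
            wp_down *= 10
    return wp_up / (wp_up + wp_down), -1 * wp_down / (wp_up + wp_down)

trend = {'total_words': (1000, 800), 'growth': (0.004, 0.001)}
print(score_news(['growth', 'unknown'], trend, 0.55, 0.45))   # (positive share, negative share)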
    def makeTextOriginCsv(cls,
                          cur=None,
                          start_date=None,
                          end_date=None,
                          day_num=1,
                          basic_path=None,
                          input_file=None,
                          output_file=None,
                          stock_id=None,
                          rewrite=True):
        # Initialize the source file path and the output file path
        if cur is None or start_date is None or end_date is None or input_file is None or output_file is None or stock_id is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        input_path = os.path.join(basic_path, input_file)
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])
        '''
        '''
        cur.execute(
            "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        count = cur.fetchall()
        count = count[0][0]
        if rewrite:
            pd.DataFrame({"0": [], "1": []}).to_csv(output_path, index=False)

        deviation = 2
        skip = 50
        slimit = 0
        while slimit < count:
            cur.execute(
                "SELECT stock_id,closing,date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d "
                %
                (stock_id, start_date, end_date,
                 0 if slimit - deviation - day_num < 0 else slimit -
                 deviation - day_num, skip if slimit - deviation - day_num < 0
                 else skip + deviation + day_num))
            history_t = cur.fetchall()

            sdate = str(history_t[0][2])
            edate = str(history_t[-1][2])

            history_temp = []
            for h in zip(*history_t):
                history_temp.append(h)
            history = {
                'stock_id': history_temp[0],
                'closing': history_temp[1],
                'date': history_temp[2]
            }
            del history_t, history_temp
            history = DataFrame(history)
            g_history = history.groupby(by=['stock_id'])
            # 0.01 -> 1%; keep 2 decimal places
            history['rate'] = 100 * (g_history.shift(0)["closing"] /
                                     g_history.shift(1)["closing"] - 1)
            history.dropna(axis=0,
                           how='any',
                           thresh=None,
                           subset=None,
                           inplace=True)
            '''
            '''
            cur.execute(
                "SELECT GROUP_CONCAT(id SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' GROUP BY time order by time "
                % (sdate, edate))
            news_temp = cur.fetchall()

            news_date = {}
            for k in history['date'].keys():
                if (k - deviation - day_num + 1) in history['date']:
                    news_date[str(history['date'][k])] = [
                        str(history['date'][k - deviation - day_num + 1]),
                        str(history['date'][k - deviation])
                    ]

            news_by_date = {}
            news_by_id = {}
            for n in news_temp:
                news_by_date[str(n[1])] = n[0].split(",")
                for nid in news_by_date[str(n[1])]:
                    news_by_id[nid] = ''
            del news_temp

            nid_len = len(news_by_id)
            reader = pd.read_csv(input_path, chunksize=1000)
            for sentences in reader:
                for k in sentences['1'].keys():
                    nid = str(sentences['0'][k])
                    if nid in news_by_id and news_by_id[nid] == '':
                        news_by_id[nid] = str(sentences['1'][k]).split(" ")
                        nid_len -= 1
                if nid_len <= 0:
                    break
            reader.close()
            del reader, sentences

            def func_train_data(date_stock):
                if cls.groupby_skip == False:
                    cls.groupby_skip = True
                    return None

                date = str(date_stock.name)
                if date not in news_date:
                    return
                sdate = datetime.datetime.strptime(news_date[date][0],
                                                   '%Y-%m-%d')
                edate = datetime.datetime.strptime(news_date[date][1],
                                                   '%Y-%m-%d')

                words = []
                while sdate <= edate:
                    cur_date = sdate.strftime('%Y-%m-%d')
                    sdate += datetime.timedelta(days=1)
                    if cur_date not in news_by_date:
                        print("%s error" % cur_date)
                        return None
                    for i in news_by_date[cur_date]:
                        words += news_by_id[i]

                data = []
                for k in date_stock['stock_id'].keys():
                    data.append([[" ".join(words)],
                                 [
                                     str(np.round(float(history['rate'][k]),
                                                  2)),
                                     str(date_stock['date'][k]),
                                     str(date_stock['stock_id'][k])
                                 ]])

                print("正在处理的日期:%s" % date_stock.name)
                pd.DataFrame(data).to_csv(output_path,
                                          index=False,
                                          header=False,
                                          mode="a",
                                          encoding="utf-8")

            g_stock = history.groupby(by=["date"])
            cls.groupby_skip = False
            g_stock.apply(func_train_data)
            slimit += skip
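# The "rate" column used throughout these builders is the per-stock day-over-day percentage
# change of the closing price, 100 * (close_t / close_{t-1} - 1), computed with a groupby shift
# so the first row of each stock becomes NaN and gets dropped. A self-contained sketch of that
# step with toy data:
import pandas as pd

df = pd.DataFrame({'stock_id': [1, 1, 1, 2, 2],
                   'closing': [10.0, 10.5, 10.2, 20.0, 19.0],
                   'date': ['d1', 'd2', 'd3', 'd1', 'd2']})
g = df.groupby(by=['stock_id'])
df['rate'] = 100 * (g.shift(0)['closing'] / g.shift(1)['closing'] - 1)
df = df.dropna(axis=0, how='any')
print(df[['stock_id', 'date', 'rate']].round(2))
# stock 1: d2 -> 5.0, d3 -> -2.86; stock 2: d2 -> -5.0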
    def makeBindexOriginCsv(cls,
                            cur=None,
                            words=None,
                            start_date=None,
                            end_date=None,
                            day_num=1,
                            basic_path=None,
                            output_file=None,
                            stock_id=None):
        # Initialize the source file path and the output file path
        if cur is None or words is None or start_date is None or end_date is None or stock_id is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        if output_file is None:
            output_file = "bindex_data.csv"
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        # Clear the output file and initialize its column names
        pd.DataFrame({
            "0": [],
            "1": []
        }).to_csv(output_path, index=False, encoding="utf-8")

        start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')
        end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')

        for i in range(len(words)):
            words[i] = "'" + words[i] + "'"
        words_str = ",".join(words)

        cur.execute(
            "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        count = cur.fetchall()
        count = count[0][0]

        deviation = 2
        skip = 100
        slimit = 0
        while slimit < count:
            cur.execute(
                "SELECT stock_id,closing,date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date, stock_id asc limit %d,%d "
                %
                (stock_id, start_date, end_date,
                 0 if slimit - deviation - day_num < 0 else slimit -
                 deviation - day_num, skip if slimit - deviation - day_num < 0
                 else skip + deviation + day_num))
            history_t = cur.fetchall()
            sdate = str(history_t[0][2])
            edate = str(history_t[-1][2])

            history_temp = []
            for h in zip(*history_t):
                history_temp.append(h)
            history = {
                'stock_id': history_temp[0],
                'closing': history_temp[1],
                'date': history_temp[2]
            }
            del history_t, history_temp
            history = DataFrame(history)
            g_history = history.groupby(by=['stock_id'])
            # 0.01 -> 1%; keep 2 decimal places
            history['rate'] = 100 * (g_history.shift(0)["closing"] /
                                     g_history.shift(1)["closing"] - 1)
            history.dropna(axis=0,
                           how='any',
                           thresh=None,
                           subset=None,
                           inplace=True)
            '''
            '''
            cur.execute(
                "SELECT b.vocab_id, b.bindex, b.date FROM vocab v left join baidu_index b on v.id = b.vocab_id WHERE v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc"
                % (words_str, sdate, edate))
            bindex = cur.fetchall()
            news_date = {}
            for k in history['date'].keys():
                if (k - deviation - day_num + 1) in history['date']:
                    news_date[str(history['date'][k])] = [
                        str(history['date'][k - deviation - day_num + 1]),
                        str(history['date'][k - deviation])
                    ]

            bindex_t = []
            bindex_vec = 0
            cur_date = None
            if len(bindex) > 0:
                cur_date = str(bindex[0][2])
            bix = []
            for bi in bindex:
                if str(bi[2]) != cur_date:
                    bindex_t.append([bix, cur_date])
                    cur_date = str(bi[2])
                    bix = []

                bix_temp = json.loads(bi[1])
                bix_temp = sorted(bix_temp.items(), key=lambda v: v[0])
                for k, b in bix_temp:
                    bix_list = sorted(b.items(), key=lambda v: v[0])
                    for kk, bb in bix_list:
                        bix.append(bb)
                if bindex_vec == 0:
                    bindex_vec = len(bix)
            bindex_t.append([bix, cur_date])
            del bindex

            bindex_by_date = {}
            for k in range(1, len(bindex_t)):
                b_t = []
                for kk in range(len(bindex_t[k][0])):
                    if int(bindex_t[k][0][kk]) != 0 and int(
                            bindex_t[k - 1][0][kk]) != 0:
                        b_t.append(
                            str(
                                np.round(
                                    float(100 *
                                          (int(bindex_t[k][0][kk]) /
                                           int(bindex_t[k - 1][0][kk]) - 1)),
                                    2)))
                    else:
                        b_t.append(str(0.00))
                bindex_by_date[bindex_t[k][1]] = b_t
            del bindex_t

            def func_train_data(date_stock):
                if cls.groupby_skip == False:
                    cls.groupby_skip = True
                    return None

                date = str(date_stock.name)
                if date not in news_date:
                    return
                sdate = datetime.datetime.strptime(news_date[date][0],
                                                   '%Y-%m-%d')
                edate = datetime.datetime.strptime(news_date[date][1],
                                                   '%Y-%m-%d')

                bindexs = []
                while sdate <= edate:
                    cur_date = sdate.strftime('%Y-%m-%d')
                    sdate += datetime.timedelta(days=1)
                    if cur_date not in bindex_by_date:
                        print("%s error" % cur_date)
                        exit()
                    else:
                        bindexs += bindex_by_date[cur_date]

                data = []
                for k in date_stock['stock_id'].keys():
                    data.append([(np.array(bindexs).reshape(
                        int(len(bindexs) / bindex_vec), bindex_vec)).tolist(),
                                 [
                                     str(np.round(float(history['rate'][k]),
                                                  2)),
                                     str(date_stock['date'][k]),
                                     str(date_stock['stock_id'][k])
                                 ]])
                print("正在处理的日期:%s" % date_stock.name)
                pd.DataFrame(data).to_csv(output_path,
                                          index=False,
                                          header=False,
                                          mode="a",
                                          encoding="utf-8")

            g_stock = history.groupby(by=["date"])
            cls.groupby_skip = False
            g_stock.apply(func_train_data)
            slimit += skip
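# Each baidu_index row stores a nested JSON dict; the loop above sorts both levels by key and
# flattens the values into one fixed-order vector, so that day-over-day ratios can be computed
# position by position. A compact sketch of that flattening; the JSON layout here is an
# assumption based on the parsing code above.
import json

def flatten_bindex(raw_json):
    parsed = json.loads(raw_json)
    values = []
    for _, inner in sorted(parsed.items(), key=lambda v: v[0]):
        for _, value in sorted(inner.items(), key=lambda v: v[0]):
            values.append(value)
    return values

print(flatten_bindex('{"all": {"0": 120, "1": 98}, "pc": {"0": 80, "1": 60}}'))   # [120, 98, 80, 60]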
    def makeNewsKeywordCacheCsv(cls,
                                cur=None,
                                start_date=None,
                                end_date=None,
                                basic_path=None,
                                analyse_type='tfidf',
                                rewrite=True):
        # Initialize the source file path and the output file path
        if cur is None or start_date is None or end_date is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        if analyse_type not in ['tfidf', 'textrank', 'all', 'title']:
            return None
        tfidf = analyse.extract_tags
        textrank = analyse.textrank

        origin_data_path = os.path.join(basic_path,
                                        "%s_keyword_cache.csv" % analyse_type)
        VTool.makeDirs(files=[origin_data_path])
        # Clear the output file and initialize its column names
        if rewrite:
            pd.DataFrame({
                "0": [],
                "1": []
            }).to_csv(origin_data_path, index=False, encoding="utf-8")

        skip = 30
        start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')
        start_date -= datetime.timedelta(days=1)
        end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        i = 1
        while start_date <= end_date:
            start_date += datetime.timedelta(days=1)
            cur_date = start_date.strftime('%Y-%m-%d')
            start_date += datetime.timedelta(days=skip)
            if start_date > end_date:
                cur_end_date = end_date.strftime('%Y-%m-%d')
            else:
                cur_end_date = start_date.strftime('%Y-%m-%d')

            if analyse_type == 'title':
                cur.execute(
                    "SELECT id, title FROM news WHERE time between '%s' and '%s' order by time, title"
                    % (cur_date, cur_end_date))
            else:
                cur.execute(
                    "SELECT id, content FROM news WHERE time between '%s' and '%s' order by time, content"
                    % (cur_date, cur_end_date))
            news = cur.fetchall()
            news_keyword = []
            for n in news:
                i += 1
                print(i)
                if analyse_type == 'tfidf':
                    kword = tfidf(
                        n[1],
                        allowPOS=['n', 'nr', 'ns', 'nt', 'nz', 'vn', 'v'])
                    kword = " ".join(kword)
                elif analyse_type == 'textrank':
                    kword = textrank(
                        n[1],
                        allowPOS=['n', 'nr', 'ns', 'nt', 'nz', 'vn', 'v'])
                    kword = " ".join(kword)
                elif analyse_type == 'all':
                    kword = cls.jiebafenci(n[1])
                elif analyse_type == 'title':
                    kword = cls.jiebafenci(n[1])
                else:
                    kword = ''
                keywords = [str(n[0]), kword.strip()]
                news_keyword.append(keywords)
            pd.DataFrame(news_keyword).to_csv(origin_data_path,
                                              index=False,
                                              header=False,
                                              mode="a",
                                              encoding="utf-8")
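# The tfidf / textrank branches above use jieba's built-in keyword extractors. A minimal usage
# sketch of those two calls (topK is left at jieba's default; the sample sentence is made up):
from jieba import analyse

text = "央行今日宣布降低存款准备金率,股市大幅上涨"
print(analyse.extract_tags(text, allowPOS=['n', 'nr', 'ns', 'nt', 'nz', 'vn', 'v']))   # TF-IDF ranking
print(analyse.textrank(text, allowPOS=['n', 'nr', 'ns', 'nt', 'nz', 'vn', 'v']))       # TextRank ranking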
    def stock_lstm_softmax(self, basic_path=None, train_file=None, model_file=None, log_folder=None, pre_file=None):        
        """
        使用LSTM处理股票数据
        分类预测
        """
        if train_file is None or model_file is None or log_folder is None or pre_file is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        train_path = os.path.join(basic_path, train_file)
        model_path = os.path.join(basic_path, model_file)
        log_path = os.path.join(basic_path, log_folder)
        pre_path = os.path.join(basic_path, pre_file)
        VTool.makeDirs(files=[model_path, pre_path], folders=[log_path])

        tf.reset_default_graph()
        # Assign a value to batch_size
        self.batch_size = 20
        test_part = 0.1
        self.train_size, self.test_size = VTool.initCsvTrainAndTest(basic_path=basic_path, input_file=train_file, batch_size=self.batch_size, test_part=test_part)
        # Learning rate
        learning_rate = 0.001
        # Number of rows of raw data fed to the LSTM at once, i.e. how many trading days the LSTM "sees" per sample
        origin_data_row = 3
        # Number of columns in the raw data, i.e. how many fields one daily record has
        origin_data_col = 8+20
        # Number of LSTM layers
        layer_num = 1
        # Number of units in each LSTM layer
        cell_num = 256
        # Output dimension, i.e. how many values to predict; for this classification task it is the number of classes
        output_num = 3
        # How many processed rows are fed to the LSTM per step. Tune this to your GPU and network size;
        # larger values process more data at once and use more compute resources
        batch_size = tf.placeholder(tf.int32, [])
        # Input-layer and output-layer weights and biases.
        # These two pairs of parameters let the LSTM layer match the input and output dimensions
        W = {
            'in':tf.Variable(tf.truncated_normal([origin_data_col, cell_num], stddev = 1), dtype = tf.float32),
            'out':tf.Variable(tf.truncated_normal([cell_num, output_num], stddev = 1), dtype = tf.float32)
        }
        bias = {
            'in':tf.Variable(tf.constant(0.1, shape=[cell_num,]), dtype = tf.float32),
            'out':tf.Variable(tf.constant(0.1, shape=[output_num,]), dtype = tf.float32)
        }
        # Tell the LSTM how many rows and columns the incoming data has
        # None means the row count is left open and TF infers it when data is fed
        # The actual row count is batch_size, but this slot only accepts a plain number, and batch_size is a
        # placeholder Tensor, so its concrete value is only supplied at feed time
        input_x = tf.placeholder(tf.float32, [None, origin_data_col * origin_data_row])
        input_y = tf.placeholder(tf.float32, [None, output_num])
        # Dropout, to mitigate overfitting. In the layer it is applied to, every neuron gets a "switch" that stays
        # open with probability keep_prob; when the switch is closed, that neuron's output is blocked. This balances
        # the contribution of individual neurons, prevents any single one from dominating, and is a well-established
        # way to reduce the risk of overfitting.
        keep_prob = tf.placeholder(tf.float32, [])

        # Reshape input_x to 2-D; -1 lets TF infer the number of rows, and the column count must be origin_data_col
        # 2-D is needed because the next step is a matrix multiplication, which operates on 2-D matrices
        input_x_after_reshape_2 = tf.reshape(input_x, [-1, origin_data_col])

        # This line is the input layer: a relu activation plus dropout with keep probability keep_prob
        # input_rnn is the input layer's output and also the input to the next layer, the LSTM layer
        input_rnn = tf.nn.dropout(tf.nn.relu_layer(input_x_after_reshape_2, W['in'], bias['in']), keep_prob)
        
        # Reshape input_rnn to 3-D
        # 3-D is required because the LSTM layer expects three dimensions: roughly, batch_size matrices of
        # origin_data_row rows and cell_num columns; the -1 here plays the same role as None did for input_x
        input_rnn = tf.reshape(input_rnn, [-1, origin_data_row, cell_num])

        # Define a single LSTM layer with dropout, usually called a cell
        def lstm_cell():
            cell = rnn.LSTMCell(cell_num, reuse=tf.get_variable_scope().reuse)
            return rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
        # This line builds the multi-layer LSTM network in TensorFlow
        lstm_layers = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(layer_num)], state_is_tuple = True)
        # Initialize the LSTM network state
        init_state = lstm_layers.zero_state(batch_size, dtype = tf.float32)

        # Use dynamic_rnn to run the multi-layer LSTM and obtain its outputs
        outputs, state = tf.nn.dynamic_rnn(lstm_layers, inputs = input_rnn, initial_state = init_state, time_major = False)
        h_state = state[-1][1]

        # This line is the output layer
        # The LSTM output is fed into the output layer (with a softmax activation), which yields a probability per class
        # With 3 classes, an example output is [0.001, 0.992, 0.007]: 0.1% for class 1, 99.2% for class 2, 0.7% for class 3
        y_pre = tf.nn.softmax(tf.matmul(h_state, W['out']) + bias['out'])

        # Loss function, used to guide the optimization
        # The loss is cross-entropy, the usual choice for a softmax output layer
        loss = -tf.reduce_mean(input_y * tf.log(y_pre))
        # Tell TF that its job is to make the loss as small as possible
        # learning_rate controls that minimization: the larger it is, the bigger each update step. Bigger steps are
        # not automatically better; a step can head in the wrong direction because of the training data, or overshoot
        # a minimum that was almost reached. There is no universally good value, so it has to be tuned for the
        # problem at hand; 0.001 is the default value TF uses for Adam
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        # Define a metric used to monitor training progress
        # It is: correct predictions / total predictions, e.g. 0.55 means 55% of predictions were correct
        correct_prediction = tf.equal(tf.argmax(y_pre, 1), tf.argmax(input_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

        # Saver for the model parameters (a later run can restore the previous result instead of starting over)
        saver = tf.train.Saver(tf.global_variables())
        
        # TF requires an init op like this to initialize all variables on a fresh run (i.e. when no model is saved)
        init = tf.global_variables_initializer()
        # Let TF allocate GPU memory on demand instead of grabbing everything available
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        sess = tf.Session(config = config)
        # Use "with" so the session and the file are closed properly when finished
        with sess and open (pre_path, "w") as f:
            try:
                # Restore the model parameters from the checkpoint saved at model_path
                saver.restore(sess, model_path)
                print("Model parameters loaded successfully")
            except:
                # First run (or checkpoint missing/deleted): initialize the variables via init
                print("No saved model parameters found; the file was deleted or this is the first run")
                sess.run(init)
            
            i = 0
            while True:
                train_x, train_y = self.get_train_softmax(file_path = train_path, time_step = origin_data_row, rtype="train")
                if train_x is None:
                    print ("训练集均已训练完毕")                    
                    saver.save(sess, model_path)
                    print("保存模型\n")
                    break

                if (i + 1) % 10 == 0:
                    train_accuracy = sess.run(accuracy, feed_dict={
                        input_x:train_x, input_y: train_y, keep_prob: 1.0, batch_size: self.batch_size})
                    print ("step: {0}, training_accuracy: {1}".format(i + 1, train_accuracy))
                    saver.save(sess, model_path)
                    print("保存模型\n")
                    #这部分代码作用为:每次保存模型,顺便将预测收益和真实收益输出保存至show_y_softmax.txt文件下。熟悉tf可视化,完全可以采用可视化替代
                    _y_pre_train = sess.run(y_pre, feed_dict={
                        input_x: train_x, input_y: train_y, keep_prob: 1.0, batch_size:  self.batch_size})
                    _loss = sess.run(loss, feed_dict={
                        input_x:train_x, input_y: train_y, keep_prob: 1.0, batch_size: self.batch_size})
                    a1 = np.array(train_y).reshape(self.batch_size, output_num)
                    b1 = np.array(_y_pre_train).reshape(self.batch_size, output_num)
                    f.write(str(a1.tolist()))
                    f.write("\n")
                    f.write(str(b1.tolist()))
                    f.write("\n")
                    f.write(str(_loss))
                    f.write("\n")
                i += 1
                # Run one training step of the LSTM with the given parameters
                sess.run(train_op, feed_dict={input_x: train_x, input_y: train_y, keep_prob: 0.6, batch_size: self.batch_size})

            # Compute the accuracy on the test data
            # Load the test set
            test_x, test_y = self.get_train_softmax(file_path = train_path, time_step = origin_data_row, rtype="test")
            print ("test accuracy {0}".format(sess.run(accuracy, feed_dict={
                input_x: test_x, input_y: test_y, keep_prob: 1.0, batch_size:self.test_size})))
            self.init()
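# The loss and accuracy defined in stock_lstm_softmax reduce to simple array expressions; a
# small worked example with 3 classes and 2 samples (toy numbers, only to illustrate the formulas):
import numpy as np

y_true = np.array([[0, 1, 0], [1, 0, 0]], dtype=float)           # one-hot labels
y_pre = np.array([[0.1, 0.8, 0.1], [0.3, 0.5, 0.2]])             # softmax outputs
loss = -np.mean(y_true * np.log(y_pre))                          # cross-entropy as coded above
accuracy = np.mean(np.argmax(y_pre, 1) == np.argmax(y_true, 1))  # fraction predicted correctly
print(round(loss, 4), accuracy)                                  # ~0.2379 0.5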
    def make_train_csv(cls, cur=None, start_date=None, end_date=None, basic_path=None, output_file=None, time_step=3, word_count=20, stock_id_str=None, ranking_type='tfidf'):
        if cur is None or start_date is None or end_date is None or output_file is None or stock_id_str is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        if time_step < 0:
            time_step = 3
        if word_count < 0:
            word_count = 20
        if ranking_type not in ["tfidf", "textrank"]:
            ranking_type = "tfidf"
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        pd.DataFrame({"0":[], "1":[]}).to_csv(output_path, index=False)
        words = cls.getImportVocab(cur, count=word_count, ranking_type=ranking_type)
        word_count = len(words)
        for i in range(len(words)):
            words[i] = "'" + words[i] + "'"
        words_str = ",".join(words)
        del words

        cur.execute("SELECT count(*) as count FROM history WHERE stock_id in (%s) and date between '%s' and '%s' " % (stock_id_str, start_date, end_date))
        count = cur.fetchall()
        count = count[0][0]	    
        stock_id_num = len(stock_id_str.split(","))
        skip = 50 * stock_id_num
        slimit = 0
        while slimit < count:
            cur.execute("SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id in (%s) and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d " % (stock_id_str, start_date, end_date, 0 if slimit-stock_id_num < 0 else slimit-stock_id_num, skip if slimit-stock_id_num < 0 else skip+stock_id_num))
            slimit += skip
            history_tt = cur.fetchall()
            history_t = []
            for h in history_tt:
                history_t.append([int(h[0]), float(h[1]), float(h[2]), float(h[3]), float(h[4]), float(h[5]), float(h[6]), float(h[7]), float(h[8]), str(h[9])])
            del history_tt
            
            sdate = str(history_t[0][9])
            edate = str(history_t[-1][9])
            sdate = datetime.datetime.strptime(sdate,'%Y-%m-%d')
            sdate = (sdate - datetime.timedelta( days=(time_step+1) )).strftime('%Y-%m-%d')            
            cur.execute("SELECT b.vocab_id, b.bindex, b.date FROM vocab v left join baidu_index b on v.id = b.vocab_id WHERE v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc" % (words_str, sdate, edate))
            bindex = cur.fetchall()
            bindex_t = []            
            bindex_vec = 0
            cur_date = None
            if len(bindex) > 0:
                cur_date = str(bindex[0][2])
            bix = []
            bix_item = [cur_date]
            if len(bindex) > 0:
                for bi in bindex:
                    if str(bi[2]) != cur_date:                    
                        cur_date = str(bi[2])
                        bix.append(bix_item)
                        bix_item = [cur_date]
                    bix_temp = json.loads(bi[1])
                    bix_item.append(bix_temp['all']['0'])
                bix.append(bix_item)
            del bindex

            bindex = {}
            for k in range(1,len(bix)):
                b_t = []
                for kk in range(1,len(bix[k])):
                    if int(bix[k][kk]) != 0 and int(bix[k-1][kk]) != 0:
                        b_t.append(str(np.round(float(100 * (int(bix[k][kk]) / int(bix[k-1][kk]) - 1)), 2)))
                    else:
                        b_t.append(str(0.00))
                bindex[bix[k][0]] = b_t
            del bix

            for i in range(len(history_t)):
                history_t[i] += bindex[history_t[i][9]]
            history_temp = []
            for h in zip(*history_t):
                history_temp.append(h)
            history = {'stock_id':history_temp[0], 'opening':history_temp[1], 'closing':history_temp[2], 'difference':history_temp[3], 'percentage_difference':history_temp[4], 'lowest':history_temp[5], 'highest':history_temp[6], 'volume':history_temp[7], 'amount':history_temp[8], 'date':history_temp[9]}
            for i in range(10, 10+word_count):
                history["word%s" % (i-9)] = history_temp[i]
            del history_t, history_temp        
            history = DataFrame(history)
            g_history = history.groupby(by = ['stock_id'])
            # 0.01 -> 1%; keep 2 decimal places
            history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1)
            history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
            for i in history.index:
                history.loc[i, 'rate'] = str(np.round(float(history['rate'][i]), 2))

            # Convert the standardized data into the form accepted by the training and test sets
            def func_train_data(data_stock, time_step):
                if cls.groupby_skip == False:
                    cls.groupby_skip = True
                    return None
                print ("正在处理的股票代码:%06s"%data_stock.name)

                word_key_list = []
                for i in range(1,word_count+1):
                    word_key_list.append("word%s" % i)
                x = word_key_list + ["opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount"]
                # Extract the input columns (train_x)
                data_temp_x = data_stock[x]
                # Extract the output columns (train_y)
                data_temp_y = data_stock[["rate", "date", "stock_id"]]
                data_res = []
                for i in range(time_step - 1, len(data_temp_x.index) - 1):               
                    data_res.append( data_temp_x.iloc[i - time_step + 1: i + 1].values.reshape(1, time_step * (8+word_count)).tolist() + data_temp_y.iloc[i + 1].values.reshape(1,3).tolist() )
                if len(data_res) != 0:
                    pd.DataFrame(data_res).to_csv(output_path, index=False, header=False, mode="a")
            
            g_stock = history.groupby(by = ["stock_id"])
            cls.groupby_skip = False
            g_stock.apply(func_train_data, time_step = time_step)
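# func_train_data above turns each stock's table into overlapping windows of time_step rows,
# flattened into one feature row, with the label taken one step after the window. A tiny sketch
# of that windowing on a plain array (hypothetical helper, toy shapes):
import numpy as np

def sliding_windows(features, labels, time_step):
    rows = []
    for i in range(time_step - 1, len(features) - 1):
        x = features[i - time_step + 1:i + 1].reshape(1, time_step * features.shape[1])
        y = labels[i + 1].reshape(1, -1)
        rows.append([x.tolist(), y.tolist()])
    return rows

feats = np.arange(12, dtype=float).reshape(6, 2)    # 6 days, 2 features per day
labs = np.arange(6, dtype=float).reshape(6, 1)      # one label per day
print(len(sliding_windows(feats, labs, time_step=3)))   # 3 windows, e.g. days 0-2 -> label of day 3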
            res_file = os.path.join(basic_path, choose_stock_folder,
                                    model_folder, "ten-fold-ten-times.csv")

            # Read in the stored results
            columns = []
            for i in range(len(test_part_array) - 1):
                columns.append(str(i))

            csv_res = {}
            if is_reload or not os.path.exists(res_file):
                for i in range(len(test_part_array) - 1):
                    csv_res[str(i)] = []
                    for j in range(test_part_times):
                        csv_res[str(i)].append([])
                VTool.makeDirs(files=[res_file])
                pd.DataFrame(csv_res).to_csv(res_file,
                                             index=False,
                                             columns=columns)
            csv_res = pd.read_csv(res_file)

            array_start = len(test_part_array) - 1
            times_start = test_part_times
            res = {}
            find = False
            for i in csv_res:
                res[i] = csv_res[i].apply(eval).values
                if not find:
                    for j in range(len(res[i])):
                        if len(res[i][j]) == 0:
                            array_start = int(i)
    def train_rate(self,
                   basic_path=None,
                   data_file=None,
                   model_folder=None,
                   folder_extra="",
                   input_type="origin",
                   word_count=0,
                   input_size=8,
                   batch_size=30,
                   time_step=10,
                   reduce_num=0,
                   test_part_start=0.9,
                   test_part_end=1,
                   times=50):
        if data_file is None or model_folder is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        data_path = os.path.join(basic_path, data_file)
        model_path = os.path.join(basic_path, model_folder)
        VTool.makeDirs(folders=[model_path])

        f = open(data_path)
        df = pd.read_csv(f)
        f.close()
        columns = self.getColumns(input_type=input_type,
                                  word_count=word_count,
                                  res_type="rate")
        data = df[columns].values

        rnn_unit = 8  # number of hidden units
        output_size = 2

        tf.reset_default_graph()
        X = tf.placeholder(tf.float32,
                           shape=[None, time_step, input_size + word_count])
        Y = tf.placeholder(tf.float32, shape=[None, time_step, output_size])
        keep_prob = tf.placeholder(tf.float32)

        x_train, y_train = self.get_train_data(
            data, model_path, input_size + word_count, batch_size, time_step,
            reduce_num, test_part_start, test_part_end, {
                "input_type": input_type,
                "word_count": word_count,
                "input_size": input_size,
                "time_step": time_step,
                "rnn_unit": rnn_unit,
                "output_size": output_size
            })

        pred, predictions, _ = self.lstm(X=X,
                                         keep_prob=keep_prob,
                                         rnn_unit=rnn_unit,
                                         input_size=input_size + word_count,
                                         output_size=output_size)

        global_step = tf.Variable(0)
        lr = 0.01
        #learning_rate = tf.train.exponential_decay(0.01, global_step, decay_steps=len(x_train), decay_rate=0.95, staircase=True)

        # Loss function
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred,
                                                    labels=tf.reshape(
                                                        Y, [-1, 2])))
        train_op = tf.train.AdamOptimizer(lr).minimize(loss,
                                                       global_step=global_step)
        #train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
        saver = tf.train.Saver()

        y_input = tf.argmax(tf.reshape(Y, [-1, 2]), 1)
        correct_predictions = tf.equal(predictions, y_input)
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))

        checkpoint_dir = os.path.abspath(
            os.path.join(model_path, "checkpoints" + folder_extra))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for i in range(times):
                j = 0
                while j < len(x_train):
                    j_end = min(j + batch_size, len(x_train))
                    _, _loss = sess.run([train_op, loss],
                                        feed_dict={
                                            X: x_train[j:j_end],
                                            Y: y_train[j:j_end],
                                            keep_prob: 0.8
                                        })
                    j = j_end
                print("Number of iterations:", i, " loss:", _loss)

                if i % 10 == 0:
                    print("保存模型:", saver.save(sess, checkpoint_prefix))

                    # _predictions = sess.run([predictions],feed_dict={X:x_test, keep_prob:1})
                    # _predictions = np.array(_predictions).reshape((-1, time_step)).tolist()
                    # y_predict = []
                    # for p in _predictions:
                    #     y_predict.append(p[-1])
                    # all_num, right_num, all_accuracy = self.various_accuracy(output_size, y_test, y_predict)
                    # print("All input_nums: {:g}, right_nums: {:g}, accuracy: {:g}".format(all_num, right_num, right_num/all_num))
                    # for a in all_accuracy:
                    #     print("input_nums: {:g}, pre_nums: {:g}, right_nums: {:g}, accuracy: {:g}".format(a[0], a[1], a[2], a[3]))

            print("保存模型:", saver.save(sess, checkpoint_prefix))
            print("The train has finished")
Example #12
    def make_train_csv(cls,
                       cur=None,
                       start_date=None,
                       end_date=None,
                       basic_path=None,
                       output_file=None,
                       time_step=10,
                       stock_id_str=None):
        """
        制作股票分类数据
        orgin_data_path:原始数据存放路径
        all_data_path:制作成可被算法接收的文件存放路径
        """
        # Initialize the source and output file paths
        if cur is None or start_date is None or end_date is None or output_file is None or stock_id_str is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        if time_step < 0:
            time_step = 10
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        data = cur.execute(
            "select id, stock_id, date, opening, closing, difference, percentage_difference, lowest, highest, volume, amount from history where stock_id in (%s) and date between '%s' and '%s' "
            % (stock_id_str, start_date, end_date))
        data = cur.fetchall()
        if len(data) == 0:
            return None

        res = []
        for d in data:
            res.append([
                int(d[0]),
                int(d[1]),
                str(d[2]),
                float(d[3]),
                float(d[4]),
                float(d[5]),
                float(d[6]),
                float(d[7]),
                float(d[8]),
                float(d[9]),
                float(d[10])
            ])
        new_data = []
        for d in zip(*res):
            new_data.append(d)
        origin_data = {
            'id': new_data[0],
            'stock_id': new_data[1],
            'date': new_data[2],
            'opening': new_data[3],
            'closing': new_data[4],
            'difference': new_data[5],
            'percentage_difference': new_data[6],
            'lowest': new_data[7],
            'highest': new_data[8],
            'volume': new_data[9],
            'amount': new_data[10]
        }

        # Load the raw data, keeping only the columns we need
        total_data = DataFrame(origin_data)
        # Sort by stock code, and by trading date within each stock.
        # inplace=True overwrites total_data instead of returning a sorted copy
        # Note: dates are compared as strings here; non-normalized date formats would sort incorrectly
        total_data.sort_values(by=['stock_id', 'date'], inplace=True)
        # Group by stock code
        g_stock_num = total_data.groupby(by=["stock_id"])
        # For each stock, compute the return "gate" as (today's closing / previous trading day's closing) - 1
        # Multiply by 100 to express it as a percentage (0.09 -> 9, i.e. 9%)
        # np.round keeps two decimals and drops the rest (9.8346474 -> 9.83)
        # e.g. closing 10.5 today vs 10.0 yesterday gives gate = 100 * (10.5 / 10.0 - 1) = 5.0
        total_data["gate"] = 100 * (g_stock_num.shift(0)["closing"] /
                                    g_stock_num.shift(1)["closing"] - 1)
        for i in total_data.index:
            total_data.loc[i, 'gate'] = str(
                np.round(float(total_data['gate'][i]), 2))
        # Reorder the columns to prepare for building the input/output samples below
        total_data = total_data[[
            "opening", "closing", "difference", "percentage_difference",
            "lowest", "highest", "volume", "amount", "gate", "date", "stock_id"
        ]]

        # Regroup the reordered data by stock code
        g_stock_num = total_data.groupby(by=["stock_id"])

        # Standardize each stock over windows of time_step trading days
        def func_stand(data_one_stock_num, time_step):
            # Data arrives here via groupby().apply(); the stock code is data_one_stock_num.name
            # and the value is a pd.DataFrame holding every row of that stock
            # DataFrame shape: (num, 11), where num is the number of rows for this stock
            for colu_name in data_one_stock_num.columns:
                if colu_name in ["gate", "date", "stock_id"]:
                    continue
                # Only the input columns are standardized: (value - mean) / standard deviation
                # Each loop iteration takes one column, standardizes it and writes it back
                # NOTE: the rolling standardization is commented out below, so this assignment is currently a no-op
                data_one_stock_num[colu_name] = data_one_stock_num[colu_name]
                #data_one_stock_num[colu_name] = ((data_one_stock_num[colu_name] - data_one_stock_num[colu_name].rolling(time_step).mean())/data_one_stock_num[colu_name].rolling(time_step).std())
            return data_one_stock_num

        # Turn the standardized data into the sample format accepted by the training and test sets
        def func_train_data(data_one_stock_num, time_step):
            # groupby().apply() may call the function an extra time on the first group; skip that first call
            if cls.groupby_skip == False:
                cls.groupby_skip = True
                return None
            print("Processing stock code: %06s" % data_one_stock_num.name)
            # Extract the input columns (train_x)
            data_temp_x = data_one_stock_num[[
                "opening", "closing", "difference", "percentage_difference",
                "lowest", "highest", "volume", "amount"
            ]]
            # Extract the output columns (train_y)
            data_temp_y = data_one_stock_num[["gate", "date", "stock_id"]]
            data_res = []
            # Start at index time_step - 1, because the first time_step - 1 rows cannot form a full window
            # e.g. with time_step = 60, a stock that traded only 50 days (after a suspension, say) yields no sample and is skipped
            for i in range(time_step - 1, len(data_temp_x.index) - 1):
                data_res.append(
                    data_temp_x.iloc[i - time_step + 1:i +
                                     1].values.reshape(1, time_step *
                                                       8).tolist() +
                    data_temp_y.iloc[i + 1].values.reshape(1, 3).tolist())
            if len(data_res) != 0:
                # Append each stock's samples to the output CSV
                # index adds a row-number column, header adds column names, mode="a" appends (like open()'s mode)
                pd.DataFrame(data_res).to_csv(output_path,
                                              index=False,
                                              header=False,
                                              mode="a")
            return data_one_stock_num

        # Standardize the data
        data_after_stand = g_stock_num.apply(func_stand, time_step=time_step)
        data_after_stand.dropna(inplace=True)
        # Convert the data into training-set samples
        g_stock_num = data_after_stand.groupby(by=["stock_id"])
        # Clear the output file and initialize the column names
        pd.DataFrame({"0": [], "1": []}).to_csv(output_path, index=False)
        cls.groupby_skip = False
        g_stock_num.apply(func_train_data, time_step=time_step)
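# --- Hedged sketch of the windowing performed by func_train_data (toy data, illustration only) ---
# Each training sample is time_step consecutive trading days of the 8 input columns,
# flattened into a single row of length time_step * 8, paired with the *next* day's
# [gate, date, stock_id]. All numbers below are made up.
import numpy as np
import pandas as pd

time_step = 3
features = pd.DataFrame(np.arange(5 * 8).reshape(5, 8))           # 5 trading days x 8 inputs
labels = pd.DataFrame({"gate": [0.1, 0.2, 0.3, 0.4, 0.5],
                       "date": ["d1", "d2", "d3", "d4", "d5"],
                       "stock_id": [600104] * 5})

samples = []
for i in range(time_step - 1, len(features.index) - 1):
    x = features.iloc[i - time_step + 1:i + 1].values.reshape(1, time_step * 8).tolist()
    y = labels.iloc[i + 1].values.reshape(1, 3).tolist()
    samples.append(x + y)

# Two samples result: days 1-3 predict day 4, days 2-4 predict day 5
print(len(samples), len(samples[0][0]))  # -> 2 24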
Example #13
    def makeEnsembleData(self, stock_folders=None):
        test_part_array = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        test_part_times = 10
        is_reload = False

        folder_600104 = "600104/" #3
        folder_601318 = "601318/" #5
        folder_002230 = "002230/" #7

        basic_path = self.basic_path

        stock_folders = stock_folders if stock_folders is not None else [folder_600104]
        for choose_stock_folder in stock_folders:
            for i in range(len(test_part_array)-1):
                for j in range(test_part_times):
                    
                    folder_extra = '_' + str(i) + '_' + str(j)
                    data_file = os.path.join(basic_path, choose_stock_folder, "ensemble/checkpoints" + folder_extra, "ensemble_data.csv")
                    if os.path.exists(data_file) and not is_reload:
                        continue
                    else:
                        VTool.makeDirs(files=[data_file])

                    data = []

                    # cnn-model-1 1
                    cst1 = CNNStockText()
                    accuracy, profit, origin_profit, predictions, others = cst1.predict(basic_path=basic_path, input_file=choose_stock_folder+"cnn/title_data.csv", output_folder=choose_stock_folder+"cnn/title_run", word2vec_model="news_title_word2vec", filter_sizes=[3, 4, 5], folder_extra=folder_extra, reduce_num=21, test_part_start=test_part_array[0], test_part_end=test_part_array[-1])
                    data.append([accuracy, profit, origin_profit, predictions, others])

                    cst2 = CNNStockText()
                    # cnn-model-2 2
                    accuracy, profit, origin_profit, predictions, others = cst2.predict(basic_path=basic_path, input_file=choose_stock_folder+"cnn/tfidf_text_data.csv", output_folder=choose_stock_folder+"cnn/text_run", word2vec_model="news_tfidf_word2vec", filter_sizes=[8, 9, 10], folder_extra=folder_extra, reduce_num=21, test_part_start=test_part_array[0], test_part_end=test_part_array[-1])
                    data.append([accuracy, profit, origin_profit, predictions, others])

                    # cnn-model-3 3
                    csn = CNNStockNumber()            
                    accuracy, profit, origin_profit, predictions, others = csn.predict(basic_path=basic_path, input_file=choose_stock_folder+"cnn/bindex_data.csv", output_folder=choose_stock_folder+"cnn/bindex_run", embedding_dim=3, filter_sizes=[2], folder_extra=folder_extra, reduce_num=21, test_part_start=test_part_array[0], test_part_end=test_part_array[-1])
                    data.append([accuracy, profit, origin_profit, predictions, others])

                    # cnn-model-4 4
                    accuracy, profit, origin_profit, predictions, others = csn.predict(basic_path=basic_path, input_file=choose_stock_folder+"cnn/news_stock_data_%s.csv" % i, output_folder=choose_stock_folder+"cnn/news_stock_run", embedding_dim=10, filter_sizes=[3, 4, 5], folder_extra=folder_extra, reduce_num=0, test_part_start=test_part_array[0], test_part_end=test_part_array[-1])
                    data.append([accuracy, profit, origin_profit, predictions, others])
                    
                    # lstm-model-1 5
                    so = LSTMStockOrigin()
                    accuracy, profit, origin_profit, predictions, others = so.predict_rate(basic_path=basic_path, data_file=choose_stock_folder+"lstm/stock_origin_data.csv", model_folder=choose_stock_folder+"lstm/origin_model", folder_extra=folder_extra, reduce_num=10, test_part_start=test_part_array[0], test_part_end=test_part_array[-1])
                    data.append([accuracy, profit, origin_profit, predictions, others])

                    # lstm-model-2 6
                    accuracy, profit, origin_profit, predictions, others = so.predict_rate(basic_path=basic_path, data_file=choose_stock_folder+"lstm/stock_bindex_data.csv", model_folder=choose_stock_folder+"lstm/bindex_model", folder_extra=folder_extra, reduce_num=10, test_part_start=test_part_array[0], test_part_end=test_part_array[-1])
                    data.append([accuracy, profit, origin_profit, predictions, others])

                    # lstm-model-3 7
                    accuracy, profit, origin_profit, predictions, others = so.predict_rate(basic_path=basic_path, data_file=choose_stock_folder+"lstm/stock_news_data_%s.csv" % i, model_folder=choose_stock_folder+"lstm/news_model", folder_extra=folder_extra, reduce_num=10, test_part_start=test_part_array[0], test_part_end=test_part_array[-1])
                    data.append([accuracy, profit, origin_profit, predictions, others])

                    ensemble_data = {}
                    for d in data:
                        for di in range(len(d[3])):
                            if d[4][di][1] not in ensemble_data:
                                ensemble_data[d[4][di][1]] = [d[4][di][1], d[4][di][0], []]
                            
                            ensemble_data[d[4][di][1]][2].append(d[3][di])
                    
                    data_len = len(data)
                    data = {}
                    for k in ensemble_data:
                        d = ensemble_data[k]
                        if len(d[2]) == data_len:
                            data[k] = d

                    e_data = sorted(data.items(), key=lambda x: time.mktime(time.strptime(x[0], '%Y-%m-%d')))
                    data = {"date": [], "rate": [], "predictions": []}
                    for d in e_data:
                        data["date"].append(d[1][0])
                        data["rate"].append(d[1][1])
                        data["predictions"].append(d[1][2])
                    pd.DataFrame(data).to_csv(data_file, index=False, columns=["date", "rate", "predictions"])
                    del cst1, cst2, e_data, data
                    gc.collect()
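# --- Hedged sketch of the date alignment performed above (toy data, illustration only) ---
# Each base model returns (accuracy, profit, origin_profit, predictions, others), where,
# as used above, others[di][0] is the day's rate and others[di][1] is the trading date.
# Predictions are grouped by date, and a date is kept only when every model predicted it.
models = [
    # hypothetical (predictions, others) pairs for two base models
    ([1, 0, 1], [[0.5, "2019-01-02"], [-0.3, "2019-01-03"], [1.2, "2019-01-04"]]),
    ([0, 0],    [[0.5, "2019-01-02"], [-0.3, "2019-01-03"]]),
]

ensemble = {}
for predictions, others in models:
    for di in range(len(predictions)):
        rate, date = others[di][0], others[di][1]
        ensemble.setdefault(date, [date, rate, []])[2].append(predictions[di])

complete = {d: v for d, v in ensemble.items() if len(v[2]) == len(models)}
print(sorted(complete))  # -> ['2019-01-02', '2019-01-03']; 2019-01-04 lacks a prediction from model 2 and is dropped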
Example #14
    def makeOriginDataCsv(cls,
                          cur=None,
                          start_date=None,
                          end_date=None,
                          basic_path=None,
                          output_file=None,
                          stock_id=None):
        # Initialize the source and output file paths
        if cur is None or start_date is None or end_date is None or output_file is None or stock_id is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        data = cur.execute(
            "select id, stock_id, date, opening, closing, difference, percentage_difference, lowest, highest, volume, amount from history where stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        data = cur.fetchall()
        if len(data) == 0:
            return None

        res = []
        for d in data:
            res.append([
                int(d[0]),
                int(d[1]),
                str(d[2]),
                float(d[3]),
                float(d[4]),
                float(d[5]),
                float(d[6]),
                float(d[7]),
                float(d[8]),
                float(d[9]),
                float(d[10])
            ])
        new_data = []
        for d in zip(*res):
            new_data.append(d)
        origin_data = {
            'id': new_data[0],
            'stock_id': new_data[1],
            'date': new_data[2],
            'opening': new_data[3],
            'closing': new_data[4],
            'difference': new_data[5],
            'percentage_difference': new_data[6],
            'lowest': new_data[7],
            'highest': new_data[8],
            'volume': new_data[9],
            'amount': new_data[10]
        }

        # Load the raw data, keeping only the columns we need
        total_data = DataFrame(origin_data)
        total_data.sort_values(by=['stock_id', 'date'], inplace=True)
        # Group by stock code
        g_stock_num = total_data.groupby(by=["stock_id"])
        total_data["rate"] = 100 * (g_stock_num.shift(0)["closing"] /
                                    g_stock_num.shift(1)["closing"] - 1)
        for i in total_data.index:
            total_data.loc[i, 'rate'] = str(
                np.round(float(total_data['rate'][i]), 2))
        # Reorder the columns for the input/output processing below
        columns = [
            "stock_id", "date", "opening", "closing", "difference",
            "percentage_difference", "lowest", "highest", "volume", "amount",
            "rate"
        ]
        total_data = total_data[columns]

        def func_train_data(data_one_stock_num):
            # groupby().apply() may call the function an extra time on the first group; skip that first call
            if cls.groupby_skip == False:
                cls.groupby_skip = True
                return None
            print("Processing stock code: %06s" % data_one_stock_num.name)
            data = {
                "stock_id": [],
                "date": [],
                "opening": [],
                "closing": [],
                "difference": [],
                "percentage_difference": [],
                "lowest": [],
                "highest": [],
                "volume": [],
                "amount": [],
                "rate": []
            }
            for i in range(len(data_one_stock_num.index) - 1):
                for k in data:
                    data[k].append(data_one_stock_num.iloc[i][k])
            pd.DataFrame(data).to_csv(output_path,
                                      index=False,
                                      columns=columns)

        total_data1 = total_data.dropna()
        total_data2 = total_data1.drop(
            total_data1[(total_data1.rate == 'nan')].index)
        g_stock_num = total_data2.groupby(by=["stock_id"])
        # Note: unlike make_train_csv, the output file is not pre-initialized here;
        # func_train_data's to_csv() call overwrites output_path directly (sufficient for a single stock_id)
        cls.groupby_skip = False
        g_stock_num.apply(func_train_data)
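# --- Hedged sketch of the cursor contract makeOriginDataCsv expects (illustration only) ---
# The method only needs a DB-API cursor whose fetchall() returns rows of
# (id, stock_id, date, opening, closing, difference, percentage_difference,
#  lowest, highest, volume, amount). The in-memory sqlite3 table below is an
# assumption used purely to demonstrate that contract; the real project queries
# its own `history` table.
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("""create table history (
    id integer, stock_id integer, date text, opening real, closing real,
    difference real, percentage_difference real, lowest real, highest real,
    volume real, amount real)""")
cur.execute("insert into history values (1, 600104, '2019-01-02', 10.0, 10.5, 0.5, 5.0, 9.9, 10.6, 1e6, 1e7)")
cur.execute("insert into history values (2, 600104, '2019-01-03', 10.5, 10.4, -0.1, -0.95, 10.3, 10.7, 9e5, 9e6)")
conn.commit()

cur.execute("select id, stock_id, date, opening, closing, difference, percentage_difference, "
            "lowest, highest, volume, amount from history where stock_id = '%s' "
            "and date between '%s' and '%s'" % (600104, '2019-01-01', '2019-12-31'))
print(cur.fetchall())  # rows in the exact shape the method converts into its origin_data dict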