Example 1
def optimal_parameters(train_signal, train_labels, test_signal, test_labels, window_size, paa_segments, vocab_size, gamma):
    S = np.zeros((len(window_size), len(paa_segments), len(vocab_size)))

    max_S = 0
    optimal_labels = np.zeros_like(test_labels)

    for w in range(len(window_size)):
        for p in range(len(paa_segments)):
            for v in range(len(vocab_size)):
                train_sax = sax_via_window(train_signal, window_size[w], paa_segments[p], vocab_size[v],
                                           nr_strategy='none', z_threshold=0.01)
                test_sax = sax_via_window(test_signal, window_size[w], paa_segments[p], vocab_size[v],
                                          nr_strategy='none', z_threshold=0.01)

                train_probs = extract_ngram_probs(train_sax)
                train_prof = extract_train_prof(train_probs, thres=1)
                alarm_regions = extract_alarm_regions(test_sax, train_prof)

                # print("Attacks detected: ", len(alarm_regions))
                if not alarm_regions:
                    continue

                predicted_labels = np.zeros_like(test_labels)
                predicted_labels[alarm_regions] = 1

                s = compute_s(test_labels, predicted_labels, gamma)
                if s > max_S:
                    max_S = s
                    optimal_labels = predicted_labels
                S[w,p,v] = s

    # return only after the full parameter grid has been explored
    return S, optimal_labels
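Every example below leans on the same contract: sax_via_window slides a window over the series and returns a dict that maps each SAX word to the list of window start positions where it occurs. A minimal sketch of that contract, assuming the saxpy package is installed (the series and parameter values here are arbitrary):

import numpy as np
from saxpy.sax import sax_via_window

rng = np.random.default_rng(0)
series = rng.standard_normal(200)

# word -> list of window start indices where that word occurs
sax = sax_via_window(series, win_size=30, paa_size=4, alphabet_size=4,
                     nr_strategy='none', z_threshold=0.01)

for word, positions in list(sax.items())[:3]:
    print(word, positions)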
Example 2
def discretization(ano_set, train_set, anomaly_set, win_size, paa_size,
                   alphabet_size, signal):
    print('Computing discrete model for ', signal, ' .......')

    # Find discrete sequences in the training data
    sax1 = sax_via_window(series=np.array(train_set[signal]),
                          win_size=win_size,
                          paa_size=paa_size,
                          alphabet_size=alphabet_size,
                          nr_strategy='exact',
                          z_threshold=0.01)

    # Find discrete sequences in the anomalous data
    sax2 = sax_via_window(series=np.array(ano_set[signal]),
                          win_size=win_size,
                          paa_size=paa_size,
                          alphabet_size=alphabet_size,
                          nr_strategy='exact',
                          z_threshold=0.01)

    # List discrete sequences that are in the anomaly set, but not in the training set
    attacklist = []
    for k, v in sax2.items():
        if k not in sax1:
            for e in v:
                attacklist.append(ano_set.index[e])

    # Sort the dates that are found and remove None values
    attacklist = [i for i in attacklist if i is not None]
    attacklist.sort()

    # Add the found anomalies to the output and return
    for i in attacklist:
        anomaly_set.at[i, 'anomaly'] = 1
    return anomaly_set
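A hypothetical driver for discretization above; the 'value' column, the hourly index, and the window settings are assumptions for illustration, not part of the original code:

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=300, freq='h')
train_set = pd.DataFrame({'value': np.random.randn(300)}, index=idx)
ano_set = pd.DataFrame({'value': np.random.randn(300)}, index=idx)
anomaly_set = pd.DataFrame({'anomaly': np.zeros(300, dtype=int)}, index=idx)

# sequences seen in ano_set but never in train_set get flagged
flagged = discretization(ano_set, train_set, anomaly_set,
                         win_size=24, paa_size=4, alphabet_size=4,
                         signal='value')
print(flagged['anomaly'].sum(), 'points flagged')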
Example 3
def test_series_to_wordbag():
    """Test TS to vector."""
    dat = np.array([
        0., 0., 0., 0., 0., -0.270340178359072, -0.367828308500142,
        0.666980581124872, 1.87088147328446, 2.14548907684624,
        -0.480859313143032, -0.72911654245842, -0.490308602315934,
        -0.66152028906509, -0.221049033806403, 0.367003418871239,
        0.631073992586373, 0.0487728723414486, 0.762655178750436,
        0.78574757843331, 0.338239686422963, 0.784206454089066,
        -2.14265084073625, 2.11325193044223, 0.186018356196443, 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.519132472499234, -2.604783141655,
        -0.244519550114012, -1.6570790528784, 3.34184602886343,
        2.10361226260999, 1.9796808733979, -0.822247322003058,
        1.06850578033292, -0.678811824405992, 0.804225748913681,
        0.57363964388698, 0.437113583759113, 0.437208643628268,
        0.989892093383503, 1.76545983424176, 0.119483882364649,
        -0.222311941138971, -0.74669456611669, -0.0663660879732063, 0., 0., 0.,
        0., 0.
    ])

    sax_none = sax_via_window(dat, 6, 3, 3, "none", 0.01)
    wordbag = series_to_wordbag(dat, 6, 3, 3, "none", 0.01)
    wordbag2 = manyseries_to_wordbag(np.matrix([dat, dat]), 6, 3, 3, "none",
                                     0.01)

    frequencies = {}
    for k, v in sax_none.items():
        frequencies[k] = len(v)

    for k, v in wordbag.items():
        assert v == frequencies[k]

    for k, v in wordbag2.items():
        assert v == frequencies[k] * 2
Example 4
def test_via_window():
    """Test SAX via window."""
    dat = np.array([
        0., 0., 0., 0., 0., -0.270340178359072, -0.367828308500142,
        0.666980581124872, 1.87088147328446, 2.14548907684624,
        -0.480859313143032, -0.72911654245842, -0.490308602315934,
        -0.66152028906509, -0.221049033806403, 0.367003418871239,
        0.631073992586373, 0.0487728723414486, 0.762655178750436,
        0.78574757843331, 0.338239686422963, 0.784206454089066,
        -2.14265084073625, 2.11325193044223, 0.186018356196443, 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.519132472499234, -2.604783141655,
        -0.244519550114012, -1.6570790528784, 3.34184602886343,
        2.10361226260999, 1.9796808733979, -0.822247322003058,
        1.06850578033292, -0.678811824405992, 0.804225748913681,
        0.57363964388698, 0.437113583759113, 0.437208643628268,
        0.989892093383503, 1.76545983424176, 0.119483882364649,
        -0.222311941138971, -0.74669456611669, -0.0663660879732063, 0., 0., 0.,
        0., 0.
    ])

    sax_none = sax_via_window(dat, 6, 3, 3, "none", 0.01)

    elements_num = 0
    for key in sax_none:
        elements_num += len(sax_none[key])
    assert len(dat) - 6 == elements_num

    cca = sax_none['cca']
    assert np.array_equal(np.array(cca), np.array([0, 1]))

    sax_exact = sax_via_window(dat, 6, 3, 3, "exact", 0.01)
    cca = sax_exact['cca']
    assert np.array_equal(np.array(cca), np.array([0]))

    sax_mindist = sax_via_window(dat, 6, 3, 3, "mindist", 0.01)
    cca = sax_mindist['cca']
    bbc = sax_mindist['bbc']
    assert np.array_equal(np.array(cca), np.array([0]))
    assert np.array_equal(np.array(bbc), np.array([2]))
Example 5
def find_discords_hotsax(series,
                         win_size=100,
                         num_discords=2,
                         alphabet_size=3,
                         paa_size=3,
                         znorm_threshold=0.01,
                         sax_type='unidim'):
    """HOT-SAX-driven discords discovery."""
    discords = list()

    global_registry = set()

    # Z-normalized versions for every subsequence.
    znorms = np.array([
        znorm(series[pos:pos + win_size], znorm_threshold)
        for pos in range(len(series) - win_size + 1)
    ])

    # SAX words for every subsequence.
    sax_data = sax_via_window(series,
                              win_size=win_size,
                              paa_size=paa_size,
                              alphabet_size=alphabet_size,
                              nr_strategy=None,
                              znorm_threshold=znorm_threshold,
                              sax_type=sax_type)
    """[2.0] build the 'magic' array"""
    magic_array = list()
    for k, v in sax_data.items():
        magic_array.append((k, len(v)))
    """[2.1] sort it ascending by the number of occurrences"""
    magic_array = sorted(magic_array, key=lambda tup: tup[1])

    while len(discords) < num_discords:

        best_discord = find_best_discord_hotsax(series, win_size,
                                                global_registry, sax_data,
                                                magic_array, znorms)

        if -1 == best_discord[0]:
            break

        discords.append(best_discord)

        mark_start = max(0, best_discord[0] - win_size + 1)
        mark_end = best_discord[0] + win_size

        for i in range(mark_start, mark_end):
            global_registry.add(i)

    return discords
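A usage sketch for the discord search above, assuming it is importable from saxpy.hotsax as in the saxpy package this code comes from; the planted spike and parameter values are invented:

import numpy as np
from saxpy.hotsax import find_discords_hotsax

rng = np.random.default_rng(1)
series = rng.standard_normal(1000)
series[400:420] += 5.0  # plant an obvious anomaly

# each discord is a (window start position, nearest-neighbor distance) tuple
discords = find_discords_hotsax(series, win_size=50, num_discords=2)
print(discords)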
Example 6
def series_to_wordbag(series,
                      win_size,
                      paa_size,
                      alphabet_size=3,
                      nr_strategy='exact',
                      z_threshold=0.01):
    """VSM implementation."""
    sax = sax_via_window(series, win_size, paa_size, alphabet_size,
                         nr_strategy, z_threshold)

    # convert the dict to a wordbag
    frequencies = {}
    for k, v in sax.items():
        frequencies[k] = len(v)
    return frequencies
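Calling the wordbag builder above on a toy series (assuming sax_via_window is imported as in the other examples); the returned dict maps each SAX word to its occurrence count, which is what VSM-style classifiers consume:

import numpy as np

rng = np.random.default_rng(2)
dat = rng.standard_normal(120)

bag = series_to_wordbag(dat, win_size=20, paa_size=3, alphabet_size=3,
                        nr_strategy='none', z_threshold=0.01)
# the five most frequent SAX words and their counts
print(sorted(bag.items(), key=lambda kv: -kv[1])[:5])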
Example 7
def sax_conversion(nx):
    from saxpy.sax import sax_via_window
    # discretize each (sample, channel) series of the 3-D array nx
    sax_words = {}
    for i in range(nx.shape[0]):
        for j in range(nx.shape[2]):
            sax_words[(i, j)] = sax_via_window(nx[i, :, j], 5, 5, 3,
                                               'none', 0.01)
    return sax_words
Example 8
x1 = data2.iloc[1:, 1].values.flatten()  # pick a random sample from class 0
x1 = x1.astype(float)  # np.float was removed in NumPy 1.24; use the builtin

plt.plot(x1)
plt.show()

window_size = 50
word_size = 3
alphabet_size = 3
nr_strategy = "exact"
z_threshold = 0.01

# sax_via_window(data, win_size, paa_size (word size), alphabet_size, nr_strategy='exact', z_threshold=0.01)

sax_window = sax_via_window(x1, window_size, word_size, alphabet_size,
                            nr_strategy, z_threshold)

sax_keys = list(sax_window.keys())
sax_values = list(sax_window.values())

i = 0
for n_val in sax_values:
    print(x1[n_val])
    print(n_val)
    print(sax_keys[i])
    x2 = list()

    for n1_val in n_val:
        #print(x1[n1_val], ",",x1[n1_val+1],",",x1[n1_val+2] )
        alpha_count = 0
        while (alpha_count < alphabet_size):
Example 9
def find_best_discord_hotsax(series, win_size, a_size, paa_size,
                             znorm_threshold, globalRegistry):  # noqa: C901
    """Find the best discord with hotsax."""
    """
    [1.0] get the sax data first
        convert the time series into a SAX dictionary
        (key: SAX word string, value: list of window start indices)
    """
    sax_none = sax_via_window(series, win_size, paa_size=paa_size,
                              alphabet_size=a_size, nr_strategy="none",
                              z_threshold=0.01)
    """
    [2.0] build the 'magic' array
        magic_array: a list of tuples
        (SAX word string, number of window indices)
    """
    magic_array = list()
    for k, v in sax_none.items():
        magic_array.append((k, len(v)))
    """
    [2.1] sort it ascending by the number of occurrences,
        so the rarest words come first
    """
    m_arr = sorted(magic_array, key=lambda tup: tup[1])
    """
    [3.0] define the key vars

    bestSoFarPosition
        start index of the window that attains bestSoFarDistance;
        that window is the discord (anomalous subsequence) of the series

    bestSoFarDistance
        max(min(distance))
        for every window we compute its minimum distance to the other
        windows, then take the maximum over all those minima
    """
    bestSoFarPosition = -1
    bestSoFarDistance = 0.

    distanceCalls = 0

    visit_array = np.zeros(len(series), dtype=int)
    """[4.0] and we are off iterating over the magic array entries"""
    for entry in m_arr:
        """[5.0] some moar of teh vars"""
        curr_word = entry[0]
        # occurrences: the window start indices of the current word
        occurrences = sax_none[curr_word]
        """[6.0] jumping around by the same word occurrences makes it easier to
        nail down the possibly small distance value
         -- so we can be efficient and all that..."""

        # curr_pos: start index of the current window
        for curr_pos in occurrences:

            # skip this position if it is already in globalRegistry
            if curr_pos in globalRegistry:
                continue
            """[7.0] we don't want an overlapping subsequence"""
            # avoid overlapping subsequences
            mark_start = curr_pos - win_size
            mark_end = curr_pos + win_size

            # we want the window most similar to (least distant from) the
            # current one; visit_set holds the window start indices we have
            # already looked at
            visit_set = set(range(mark_start, mark_end))
            """[8.0] here is our subsequence in question"""
            # cur_seq: the z-normalized subsequence
            cur_seq = znorm(series[curr_pos:(curr_pos + win_size)],
                            znorm_threshold)
            """[9.0] let's see what is NN distance"""
            # nn_dist: the minimum distance between the current window and
            # any other window (the two windows must not overlap, nor be
            # adjacent?)
            nn_dist = np.inf
            # flag: whether to fall back to the random search
            do_random_search = 1
            """[10.0] ordered by occurrences search first"""
            # jumping between occurrences of the same word makes it easier
            # to pin down a possibly small distance value
            for next_pos in occurrences:
                """[11.0] skip bad pos"""
                # avoid overlapping subsequences
                if next_pos in visit_set:
                    continue
                else:
                    visit_set.add(next_pos)
                """[12.0] distance we compute"""
                dist = euclidean(
                    cur_seq,
                    znorm(series[next_pos:(next_pos + win_size)],
                          znorm_threshold))
                distanceCalls += 1
                """[13.0] keep the books up-to-date"""
                # update nn_dist
                if dist < nn_dist:
                    nn_dist = dist
                if dist < bestSoFarDistance:
                    do_random_search = 0
                    break
            """[13.0] if not broken above,
            we shall proceed with random search"""
            # the loop above finished without breaking early, so we proceed
            # with the random search

            if do_random_search:
                """[14.0] build that random visit order array"""
                curr_idx = 0
                for i in range(0,
                               (len(series) -
                                win_size)):  # why not len(series) - win_size + 1?

                    # this window start index was not visited above
                    if not (i in visit_set):

                        # append it to visit_array
                        visit_array[curr_idx] = i
                        curr_idx += 1

                # curr_idx now equals the number of unvisited window start
                # indices

                # shuffle the visit order
                it_order = np.random.permutation(visit_array[0:curr_idx])
                curr_idx -= 1
                """[15.0] and go random"""

                while curr_idx >= 0:

                    # randomly pick the window start index it_order[curr_idx]
                    rand_pos = it_order[curr_idx]

                    curr_idx -= 1

                    dist = euclidean(
                        cur_seq,
                        znorm(series[rand_pos:(rand_pos + win_size)],
                              znorm_threshold))
                    distanceCalls += 1
                    """[16.0] keep the books up-to-date again"""

                    # update nn_dist
                    if dist < nn_dist:
                        nn_dist = dist

                    if dist < bestSoFarDistance:
                        nn_dist = dist
                        break
            """[17.0] and BIGGER books"""

            # update bestSoFarDistance and bestSoFarPosition
            if (nn_dist > bestSoFarDistance) and (nn_dist < np.inf):
                bestSoFarDistance = nn_dist
                bestSoFarPosition = curr_pos

    return (bestSoFarPosition, bestSoFarDistance)
Example 10
def prepare_data(window_len, word_len, alphabet_len, alpha_to_num):
    source = input("Source (1=CSV,2=Finance): ")
    if source == "1":
        source = input("CSV file (weather_JAN.csv): ")
        if source == "":
            source = "weather_JAN.csv"
        ts_data = pd.read_csv(source,
                              index_col="date",
                              parse_dates=["date"],
                              dtype=np.float32)
        sax_ret = sax_via_window(ts_data["x"].values,
                                 window_len,
                                 word_len,
                                 alphabet_size=alphabet_len,
                                 nr_strategy="none",
                                 z_threshold=0.01)
    else:
        source = input("Remote Source (yahoo): ")
        if source == "":
            source = "yahoo"

        contract = input("Contract (SPY): ")
        if contract == "":
            contract = "SPY"

        start_date = input("Start Date (2000-01-01): ")
        if start_date == "":
            start_date = "2000-01-01"

        end_date = input("End Date (now): ")
        if end_date == "":
            end_date = datetime.datetime.now()

        ts_data = get_asset_data(source, contract, start_date, end_date)
        if "Close" in ts_data:
            close_tag = "Close"
        elif "close" in ts_data:
            close_tag = "close"
        else:
            return {"Error": "Couldn't find Close data."}
        ts_data["x"] = ts_data[close_tag]
        sax_ret = sax_via_window(ts_data["x"].values,
                                 window_len,
                                 word_len,
                                 alphabet_size=alphabet_len,
                                 nr_strategy="none",
                                 z_threshold=0.01)
    result_x, result_y = [], []
    if sax_ret:
        my_sax = dict()
        for k, v in sax_ret.items():
            for i in v:
                my_sax[i] = k

        tmp_d = {"x": [], "y": []}
        for i in range(len(my_sax)):
            word = my_sax[i]
            if i < len(my_sax) - 1:
                pred = my_sax[i + 1][-1]
                num_list = [np.float32(alpha_to_num[char][1]) for char in word]
                increment_list = []
                for num in num_list:
                    increment_list.append(num)
                    tmp_d["x"].append(np.array(increment_list))
                    tmp_d["y"].append(
                        np.array([np.float32(alpha_to_num[pred][1])]))

        last_ts = ts_data["x"].values[-window_len:]
        last_sax = my_sax[len(my_sax) - 1]
        print("LAST WINDOW ITEMS: ", last_ts)
        print("MAX  WINDOW ITEM : ", max(last_ts))
        print("MIN  WINDOW ITEM : ", min(last_ts))
        print("       LAST SAX  : ", last_sax)

        # FORMAT:
        # result_x[0] = [1]         result_y[0] = 3
        # result_x[1] = [1,4]       result_y[1] = 3
        # result_x[2] = [1,4,2]     result_y[2] = 3
        # result_x[3] = [1,4,2,2]   result_y[3] = 3
        # result_x[4] = [1,4,2,2,4] result_y[4] = 3
        #####

        opt_val_test = input("Validate and Test: ")
        if opt_val_test == "y":
            train_percent = float(input("Train %: "))
            # Separate Dataset into train, val and test
            pos_train = int(len(tmp_d["x"]) * train_percent)
            pos_train = int(pos_train / window_len) * window_len

            pos_val = len(tmp_d["x"][pos_train:]) / 2
            pos_val = pos_train + int(pos_val / window_len) * window_len

            pos_test = pos_val

            result_x = dict()
            result_x["train"] = tmp_d["x"][:pos_train]
            result_x["val"] = tmp_d["x"][pos_train:pos_val]
            result_x["test"] = tmp_d["x"][pos_test:]

            result_y = dict()
            result_y["train"] = np.array(tmp_d["y"][:pos_train])
            result_y["val"] = np.array(tmp_d["y"][pos_train:pos_val])
            result_y["test"] = np.array(tmp_d["y"][pos_val:])
        else:
            result_x = {"train": tmp_d["x"]}
            result_y = {"train": np.array(tmp_d["y"])}
    else:
        print("Not enough data!")

    return result_x, result_y
Example 11
def find_best_discord_hotsax(series, win_size, a_size, paa_size,
                             znorm_threshold, globalRegistry):  # noqa: C901
    """Find the best discord with hotsax."""
    """[1.0] get the sax data first"""
    sax_none = sax_via_window(series, win_size, paa_size=paa_size,
                              alphabet_size=a_size, nr_strategy="none",
                              z_threshold=0.01)
    """[2.0] build the 'magic' array"""
    magic_array = list()
    for k, v in sax_none.items():
        magic_array.append((k, len(v)))
    """[2.1] sort it desc by the key"""
    m_arr = sorted(magic_array, key=lambda tup: tup[1])
    """[3.0] define the key vars"""
    bestSoFarPosition = -1
    bestSoFarDistance = 0.

    distanceCalls = 0

    visit_array = np.zeros(len(series), dtype=int)
    """[4.0] and we are off iterating over the magic array entries"""
    for entry in m_arr:
        """[5.0] some moar of teh vars"""
        curr_word = entry[0]
        occurrences = sax_none[curr_word]
        """[6.0] jumping around by the same word occurrences makes it easier to
        nail down the possibly small distance value -- so we can be efficient
        and all that..."""
        for curr_pos in occurrences:

            if curr_pos in globalRegistry:
                continue
            """[7.0] we don't want an overlapping subsequence"""
            mark_start = curr_pos - win_size
            mark_end = curr_pos + win_size
            visit_set = set(range(mark_start, mark_end))
            """[8.0] here is our subsequence in question"""
            cur_seq = znorm(series[curr_pos:(curr_pos + win_size)],
                            znorm_threshold)
            """[9.0] let's see what is NN distance"""
            nn_dist = np.inf
            do_random_search = 1
            """[10.0] ordered by occurrences search first"""
            for next_pos in occurrences:
                """[11.0] skip bad pos"""
                if next_pos in visit_set:
                    continue
                else:
                    visit_set.add(next_pos)
                """[12.0] distance we compute"""
                dist = euclidean(
                    cur_seq,
                    znorm(series[next_pos:(next_pos + win_size)],
                          znorm_threshold))
                distanceCalls += 1
                """[13.0] keep the books up-to-date"""
                if dist < nn_dist:
                    nn_dist = dist
                if dist < bestSoFarDistance:
                    do_random_search = 0
                    break
            """[13.0] if not broken above,
            we shall proceed with random search"""
            if do_random_search:
                """[14.0] build that random visit order array"""
                curr_idx = 0
                for i in range(0, (len(series) - win_size)):
                    if not (i in visit_set):
                        visit_array[curr_idx] = i
                        curr_idx += 1
                it_order = np.random.permutation(visit_array[0:curr_idx])
                curr_idx -= 1
                """[15.0] and go random"""
                while curr_idx >= 0:
                    rand_pos = it_order[curr_idx]
                    curr_idx -= 1

                    dist = euclidean(
                        cur_seq,
                        znorm(series[rand_pos:(rand_pos + win_size)],
                              znorm_threshold))
                    distanceCalls += 1
                    """[16.0] keep the books up-to-date again"""
                    if dist < nn_dist:
                        nn_dist = dist
                    if dist < bestSoFarDistance:
                        nn_dist = dist
                        break
            """[17.0] and BIGGER books"""
            if (nn_dist > bestSoFarDistance) and (nn_dist < np.inf):
                bestSoFarDistance = nn_dist
                bestSoFarPosition = curr_pos

    return (bestSoFarPosition, bestSoFarDistance)
Example 12
    def _prepare_data(self, alpha_to_num):

        result_x, result_y = dict(), dict()
        last_sax_word = ""
        sax_ret = dict()
        if self.source_type == "csv":
            if "http://" in self.source or "https://" in self.source:
                csv_file = "tmp.csv"
                urlretrieve(self.source, csv_file)
                ts_data = pd.read_csv(csv_file)
                if "input" in ts_data:
                    ts_data["input"] = pd.to_numeric(ts_data["input"],
                                                     downcast='float')
                    sax_ret = sax_via_window(ts_data["input"].values,
                                             self.window_len,
                                             self.word_len,
                                             alphabet_size=self.alphabet_size,
                                             nr_strategy="none",
                                             z_threshold=0.01)
                os.remove(csv_file)
            else:
                log.error("Error: Invalid Link.")
                return result_x, result_y, last_sax_word
        elif self.source_type == "financial":
            ts_data = self._get_asset_data()
            if "Close" in ts_data:
                close_tag = "Close"
            elif "close" in ts_data:
                close_tag = "close"
            else:
                log.error("Error: Couldn't find Close data.")
                return result_x, result_y, last_sax_word

            ts_data["input"] = ts_data[close_tag]
            sax_ret = sax_via_window(ts_data["input"].values,
                                     self.window_len,
                                     self.word_len,
                                     alphabet_size=self.alphabet_size,
                                     nr_strategy="none",
                                     z_threshold=0.01)
        else:
            log.error("Invalid 'source_type'!")
            return result_x, result_y, last_sax_word

        if sax_ret:
            my_sax = dict()
            for k, v in sax_ret.items():
                for i in v:
                    my_sax[i] = k

            tmp_d = {"x": [], "y": []}
            for i in range(len(my_sax)):
                word = my_sax[i]
                if i < len(my_sax) - 1:
                    pred = my_sax[i + 1][-1]
                    num_list = [
                        np.float32(alpha_to_num[char][1]) for char in word
                    ]
                    increment_list = []
                    for num in num_list:
                        increment_list.append(num)
                        tmp_d["x"].append(np.array(increment_list))
                        tmp_d["y"].append(
                            np.array([np.float32(alpha_to_num[pred][1])]))

            result_x = {"train": tmp_d["x"]}
            result_y = {"train": np.array(tmp_d["y"])}
            if my_sax:
                last_sax_word = my_sax[len(my_sax) - 1]
            else:
                log.error("Not enough SAX data!")
        else:
            log.error("Not enough data!")

        return result_x, result_y, last_sax_word
Example 13
def main():
    window_len = int(input("window_len: "))
    word_len = int(input("word_len: "))
    alphabet_len = int(input("alphabet_len: "))

    alpha_to_num_step = float(1 / alphabet_len)
    alpha_to_num_map = float(alpha_to_num_step / 2)

    source = "weather_JAN.csv"
    ts_data = pd.read_csv(source,
                          index_col="date",
                          parse_dates=["date"],
                          dtype=np.float32)
    sax_ret = sax_via_window(ts_data["temp"].values,
                             window_len,
                             word_len,
                             alphabet_size=alphabet_len,
                             nr_strategy="none",
                             z_threshold=0.01)

    my_sax = dict()
    for k, v in sax_ret.items():
        for i in v:
            my_sax[i] = k

    tmp_d = {"x": [], "y": []}
    for k, v in my_sax.items():
        num_list = [
            np.float32(((ord(char) - 96) * alpha_to_num_step) -
                       alpha_to_num_map) for char in v[:-1]
        ]
        increment_list = []
        for num in num_list:
            increment_list.append(num)
            tmp_d["x"].append(np.array(increment_list))
            tmp_d["y"].append(
                np.array([
                    np.float32("".join([
                        str(((ord(char) - 96) * alpha_to_num_step) -
                            alpha_to_num_map) for char in v[-1]
                    ]))
                ]))

    # FORMAT:
    # result_x[0] = [1]         result_y[0] = 3
    # result_x[1] = [1,4]       result_y[1] = 3
    # result_x[2] = [1,4,2]     result_y[2] = 3
    # result_x[3] = [1,4,2,2]   result_y[3] = 3
    # result_x[4] = [1,4,2,2,4] result_y[4] = 3
    #####

    result_x = dict()
    result_x["train"] = tmp_d["x"][:len(tmp_d["x"]) - 2000]
    result_x["test"] = tmp_d["x"][len(tmp_d["x"]) - 2000:len(tmp_d["x"]) -
                                  1000]
    result_x["val"] = tmp_d["x"][len(tmp_d["x"]) - 1000:len(tmp_d["x"])]

    result_y = dict()
    result_y["train"] = np.array(tmp_d["y"][:len(tmp_d["y"]) - 2000])
    result_y["test"] = np.array(tmp_d["y"][len(tmp_d["y"]) -
                                           2000:len(tmp_d["y"]) - 1000])
    result_y["val"] = np.array(tmp_d["y"][len(tmp_d["y"]) -
                                          1000:len(tmp_d["y"])])

    batch_size = window_len * (word_len - 1)
    h_dims = word_len

    epochs = input("Epochs: ")
    if not epochs == "":
        epochs = int(epochs)
    else:
        epochs = 100

    start_time = time.time()

    model_file = "{}_epochs.model".format(epochs)

    if not os.path.exists(model_file):
        x = C.sequence.input_variable(1)
        z = create_model(x, h_dims)
        var_l = C.input_variable(1, dynamic_axes=z.dynamic_axes, name="y")
        learning_rate = 0.005
        lr_schedule = C.learning_parameter_schedule(learning_rate)
        loss = C.squared_error(z, var_l)
        error = C.squared_error(z, var_l)
        momentum_schedule = C.momentum_schedule(0.9, minibatch_size=batch_size)
        learner = C.fsadagrad(z.parameters,
                              lr=lr_schedule,
                              momentum=momentum_schedule)
        trainer = C.Trainer(z, (loss, error), [learner])

        # training
        loss_summary = []

        start = time.time()
        for epoch in range(0, epochs):
            for x_batch, l_batch in next_batch(result_x, result_y, "train",
                                               batch_size):
                trainer.train_minibatch({x: x_batch, var_l: l_batch})

            if epoch % (epochs / 10) == 0:
                training_loss = trainer.previous_minibatch_loss_average
                loss_summary.append(training_loss)
                print("epoch: {}, loss: {:.4f}".format(epoch, training_loss))

        print("Training took {:.1f} sec".format(time.time() - start))

        # Print the train, validation and test errors
        for label_txt in ["train", "val", "test"]:
            print("mse for {}: {:.6f}".format(
                label_txt,
                get_mse(trainer, x, result_x, result_y, batch_size, var_l,
                        label_txt)))

        z.save(model_file)

    else:
        z = C.load_model(model_file)
        x = C.logging.find_all_with_name(z, "")[-1]

    # Print out all layers in the model
    print("Loading {} and printing all nodes:".format(model_file))
    node_outputs = C.logging.find_all_with_name(z, "")
    for n in node_outputs:
        print("  {}".format(n))

    results = []
    # predict
    # f, a = plt.subplots(2, 1, figsize=(12, 8))
    for j, ds in enumerate(["val", "test"]):
        fig = plt.figure()
        a = fig.add_subplot(2, 1, 1)
        results = []
        for x_batch, y_batch in next_batch(result_x, result_y, ds, batch_size):
            pred = z.eval({x: x_batch})
            results.extend(pred[:, 0])
        # if the input data had been normalized with a scale factor, the
        # predictions would need to be multiplied by it to recover real values
        a.plot((result_y[ds]).flatten(), label=ds + " raw")
        a.plot(np.array(results), label=ds + " pred")
        a.legend()

        fig.savefig("{}_chart_{}_epochs.jpg".format(ds, epochs))

    print("Delta: ", time.time() - start_time)

    return result_x, result_y, results
Example 14
def prepare_data(window_len, word_len, alphabet_len, alpha_to_num,
                 train_percent):
    source = input("Source (1=CSV,2=Finance): ")
    if source == "1":
        source = "weather_JAN.csv"
        ts_data = pd.read_csv(source,
                              index_col="date",
                              parse_dates=["date"],
                              dtype=np.float32)
        sax_ret = sax_via_window(ts_data["temp"].values,
                                 window_len,
                                 word_len,
                                 alphabet_size=alphabet_len,
                                 nr_strategy="none",
                                 z_threshold=0.01)
    else:
        source = input("Remote Source (yahoo): ")
        if source == "":
            source = "yahoo"

        contract = input("Contract (SPY): ")
        if contract == "":
            contract = "SPY"

        start_date = input("Start Date (2000-01-01): ")
        if start_date == "":
            start_date = "2000-01-01"

        end_date = input("End Date (now): ")
        if end_date == "":
            end_date = datetime.datetime.now()

        ts_data = get_asset_data(source, contract, start_date, end_date)
        if "Close" in ts_data:
            close_tag = "Close"
        elif "close" in ts_data:
            close_tag = "close"
        else:
            return {"Error": "Couldn't find Close data."}
        sax_ret = sax_via_window(ts_data[close_tag].values,
                                 window_len,
                                 word_len,
                                 alphabet_size=alphabet_len,
                                 nr_strategy="none",
                                 z_threshold=0.01)
    my_sax = dict()
    for k, v in sax_ret.items():
        for i in v:
            my_sax[i] = k

    tmp_d = {"x": [], "y": []}
    for k, v in my_sax.items():
        num_list = [np.float32(alpha_to_num[char][1]) for char in v[:-1]]
        increment_list = []
        for num in num_list:
            increment_list.append(num)
            tmp_d["x"].append(np.array(increment_list))
            tmp_d["y"].append(
                np.array([np.float32(alpha_to_num[char][1])
                          for char in v[-1]]))

    # FORMAT:
    # result_x[0] = [1]         result_y[0] = 3
    # result_x[1] = [1,4]       result_y[1] = 3
    # result_x[2] = [1,4,2]     result_y[2] = 3
    # result_x[3] = [1,4,2,2]   result_y[3] = 3
    # result_x[4] = [1,4,2,2,4] result_y[4] = 3
    #####

    # Separate Dataset into train (80%), val (10%) and test (10%)
    pos_train = int(len(tmp_d["x"]) * train_percent)
    pos_train = int(pos_train / window_len) * window_len

    pos_val = len(tmp_d["x"][pos_train:]) / 2
    pos_val = pos_train + int(pos_val / window_len) * window_len

    pos_test = pos_val

    result_x = dict()
    result_x["train"] = tmp_d["x"][:pos_train]
    result_x["val"] = tmp_d["x"][pos_train:pos_val]
    result_x["test"] = tmp_d["x"][pos_test:]

    result_y = dict()
    result_y["train"] = np.array(tmp_d["y"][:pos_train])
    result_y["val"] = np.array(tmp_d["y"][pos_train:pos_val])
    result_y["test"] = np.array(tmp_d["y"][pos_val:])

    return result_x, result_y
Example 15
def generate_candidates(self):
    candidates, cand_len = [], self.max_len
    # Fast shapelets (SAX Representation)
    if self.fast_shapelets:
        while cand_len >= self.min_len:
            sax_time_series = []
            mask_patterns = []
            mask_dict = []
            sax_list = []
            masked_words = {}
            sax_index = []
            sax_counts = []
            max_count = 0
            if (self.fast_shapelet_arg['n_masking'] <
                    self.fast_shapelet_arg['alpha_size']):
                counter = 0
                while counter < self.fast_shapelet_arg['n_masking']:
                    mask_index = random.sample(
                        range(self.fast_shapelet_arg['alpha_size']),
                        self.fast_shapelet_arg['mask_len'])
                    if mask_index in mask_patterns:
                        continue
                    mask_patterns.append(mask_index)
                    counter += 1
            counter = 0
            for time_serie, label in zip(self.time_series, self.labels):
                sax = sax_via_window(time_serie, cand_len, 5, 5, "none",
                                     0.01)
                sax_ = {}
                for key, indexes in sax.items():
                    for index in indexes:
                        sax_[index] = key
                sax = list(sax_.values())
                sax_time_series.append((sax, label))
                sax_list.extend(sax)
            for iteration in mask_patterns:
                masked_list = {}
                masked_words = []
                for sax_series, label in sax_time_series:
                    masked_sax = []
                    words = sax_series
                    for mask_index in iteration:
                        if mask_index == 0:
                            words = [word[1:] for word in words]
                        elif mask_index == self.fast_shapelet_arg[
                                'alpha_size'] - 1:
                            words = [
                                word[:self.
                                     fast_shapelet_arg['alpha_size'] - 1]
                                for word in words
                            ]
                        else:
                            words = [
                                (word[:mask_index] + word[mask_index + 1:])
                                for word in words
                            ]
                    masked_words.extend(words)
                    if label in masked_list:
                        masked_list[label] = masked_list[label] + words
                    else:
                        masked_list[label] = words
                if not sax_counts:
                    for word in masked_words:
                        sax_dict = {}
                        for label in masked_list:
                            sax_dict[label] = masked_list[label].count(
                                word)
                            max_count = max(max_count, sax_dict[label])
                        sax_counts.append(sax_dict)
                else:
                    for index, word in enumerate(masked_words):
                        sax_dict = sax_counts[index]
                        for label in masked_list:
                            sax_dict[label] = sax_dict[
                                label] + masked_list[label].count(word)
                            max_count = max(max_count, sax_dict[label])
                        sax_counts[index] = sax_dict
                        sax_complement = []
                        sax_dist_pow = []
                        for sax_count in sax_counts:
                            sax_dict = {}
                            sax_dist = {}
                            for label in sax_count:
                                sax_dict[
                                    label] = max_count - sax_count[label]
                            sax_dist = {
                                key: sax_dict[key] - sax_count.get(key, 0)
                                for key in sax_count.keys()
                            }
                            sax_complement.append(sax_dict)
                            sax_dist_pow.append(sum(sax_dist.values()))
                    max_dist_pow = sorted(
                        range(len(sax_dist_pow)),
                        key=lambda i: sax_dist_pow[i],
                        reverse=True)[:self.shapelet_count]
                    for max_index in max_dist_pow:
                        index = 0
                        for idx, time_serie in enumerate(sax_time_series):
                            index += len(time_serie[0])
                            if index >= max_index:
                                location = max_index - (index -
                                                        len(time_serie[0]))
                                candidates.append(
                                    (self.time_series[idx]
                                     [location:location + cand_len],
                                     time_serie[1]))
                                break
            if cand_len == self.min_len:
                break
            if self.shapelet_range:
                cand_len -= 1
            else:
                cand_len = self.min_len
    # Normal representation
    else:
        while cand_len >= self.min_len:
            for time_serie, label in zip(self.time_series, self.labels):
                for k in range(len(time_serie) - cand_len + 1):
                    candidates.append((time_serie[k:k + cand_len], label))
            if cand_len == self.min_len:
                break
            if self.shapelet_range:
                cand_len -= 1
            else:
                cand_len = self.min_len
    return pd.DataFrame(candidates)
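The masking loop in generate_candidates drops one character position per mask index from every SAX word, so near-identical subsequences collapse onto the same masked word. A stripped-down illustration of that step (word list and mask index invented):

words = ['abcde', 'abcda', 'bbcde']
mask_index = 2  # drop the third character of every word
masked = [w[:mask_index] + w[mask_index + 1:] for w in words]
print(masked)  # ['abde', 'abda', 'bbde']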