def mine_string_patterns(doc):
    """Mine frequent token sequences from the lines of one document.

    Args:
        doc: A ``(identifier, lines)`` pair; ``lines`` is an iterable of
            strings.

    Returns:
        A ``(identifier, results)`` tuple where ``results`` is a list of
        ``[pattern, frequency]`` pairs and ``pattern`` is the mined token
        sequence joined with single spaces.
    """
    doc_id, lines = doc

    # Tokenise every line: remove all digit runs, split on single spaces
    # and drop the empty strings produced by repeated spaces.
    token_rows = [
        [tok for tok in re.sub(r'\d+', '', raw).strip().split(' ') if tok]
        for raw in lines
    ]

    # Give each distinct token an integer id in first-seen order.
    wordmap = {}
    for row in token_rows:
        for tok in row:
            wordmap.setdefault(tok, len(wordmap))

    # Integer-encoded sequence database handed to PrefixSpan.
    db = [[wordmap[tok] for tok in row] for row in token_rows]
    ps = PrefixSpan(db)
    # NOTE(review): `invert` is defined elsewhere; it is used below as an
    # id -> token lookup.
    invwordmap = invert(wordmap)
    ps.minlen = 2
    ps.maxlen = 10

    # Minimum support 2, closed patterns only; no custom sort key, upper
    # bound or filter callback.
    mined = ps.frequent(
        2,
        closed=True,
        generator=False,
        key=None,
        bound=None,
        filter=None,
    )

    results = [
        [' '.join(invwordmap[i] for i in patt), freq]
        for freq, patt in mined
    ]
    return doc_id, results
Ejemplo n.º 2
0
def one_stacking_period():
    """Mine frequent symbol patterns, one symbol per elapsed period.

    Reads the CSV at ``FILE`` (first row skipped as a header) and builds one
    sequence of ``"a"`` / ``"c"`` symbols per value of column 2, which is
    treated as the user key. Rows with an empty user key are ignored.

    For each remaining row, column 3 is added to a running time counter.
    When column 6 is ``"0"`` the symbols ``"c"`` and ``"a"`` are emitted and
    the counter is reset. When the counter exceeds ``PERIOD * 1000`` a
    ``"c"`` is emitted and one period is subtracted from the counter.

    The sequences are printed in ``-1`` / ``-2`` delimited form, then mined
    with PrefixSpan (pattern length 3 to 8) and the top-20 plain, closed and
    generator patterns are printed.

    Returns:
        None. All results are written to stdout.
    """
    sequences = {}
    # newline="" is what the csv module asks for when opening its input.
    with open(FILE, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row
        curr_usr = None  # no user seen yet
        temp = []
        # NOTE(review): the time counter is carried across users, as in the
        # previous implementation - confirm this is intended.
        curr_time = 0
        aversion = "c"
        for row in reader:
            if curr_usr != row[2]:
                # Store the finished sequence under the user it was
                # collected for *before* switching to the new user.
                if curr_usr:
                    sequences[curr_usr] = temp
                curr_usr = row[2]
                temp = []
            if row[2] == "":
                continue

            curr_time += int(row[3])
            if row[6] == "0":
                # `aversion` is always "c" when a row starts, so this emits
                # the pending "c" followed by the "a" event.
                if aversion == "c":
                    temp.append(aversion)
                aversion = "a"
                temp.append(aversion)
                aversion = "c"
                curr_time = 0

            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = "c"

        # The last user never triggers a "user changed" event; flush it here.
        if curr_usr:
            sequences[curr_usr] = temp

    database = list(sequences.values())
    for sequence in database:
        print(" -1 ".join(sequence) + " -2")

    ps = PrefixSpan(database)
    print("one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    # Plain, closed and generator top-20 patterns, in that order.
    for options in ({}, {"closed": True}, {"generator": True}):
        for entry in ps.topk(20, **options):
            print(entry)
        print("\n")

    print("\n\n\n")
def find_clusters_names(labels, features):
    """Derive one name per cluster from its members' index strings.

    Args:
        labels: Cluster labels; clusters ``0 .. max(labels)`` are processed.
        features: Table with a ``'labels'`` column; the index entries of the
            rows in a cluster are the strings that get mined.
            # NOTE(review): looks like a pandas DataFrame - confirm.

    Returns:
        A list with one entry per cluster: ``create_str`` applied to the
        pattern of the first top-k result for that cluster.
    """
    def _tokenize(entry):
        # Split on "::" and then append the space-separated words of the
        # last "::" segment to the same token list.
        parts = entry.split("::")
        return parts + parts[-1].split(" ")

    cluster_count = max(labels) + 1

    # Phase 1: collect and tokenise the member names of every cluster.
    groups = []
    for cluster_id in range(cluster_count):
        members = features[features['labels'] == cluster_id].index.tolist()
        groups.append([_tokenize(member) for member in members])

    # Phase 2: mine length-4 patterns per cluster, keeping only those whose
    # diversity score is at least the pattern length.
    res = []
    for group in groups:
        miner = PrefixSpan(group)
        miner.maxlen = 4
        miner.minlen = 4
        best = miner.topk(
            5,
            filter=lambda patt, matches: diversity_score(patt) >= len(patt),
        )
        res.append(best)

    # Each top-k entry is indexed as (frequency, pattern); take the pattern
    # of the first entry.
    return [create_str(top[0][1]) for top in res]
Ejemplo n.º 4
0
def raw_data():
    """Mine frequent symbol patterns from the raw per-row symbols.

    Reads the CSV at ``FILE`` (first row skipped as a header) and builds one
    sequence per value of column 2, which is treated as the user key. Rows
    with an empty user key are ignored. Every remaining row contributes
    ``"a"`` when column 6 is ``"0"`` and ``"c"`` otherwise.

    The sequences are printed in ``-1`` / ``-2`` delimited form, then mined
    with PrefixSpan (pattern length 3 to 8) and the top-20 plain, closed and
    generator patterns are printed.

    Returns:
        None. All results are written to stdout.
    """
    sequences = {}
    # newline="" is what the csv module asks for when opening its input.
    with open(FILE, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row
        curr_usr = None  # no user seen yet
        temp = []
        for row in reader:
            if curr_usr != row[2]:
                # Store the finished sequence under the user it was
                # collected for *before* switching to the new user.
                if curr_usr:
                    sequences[curr_usr] = temp
                curr_usr = row[2]
                temp = []
            if row[2] == "":
                continue
            temp.append("a" if row[6] == "0" else "c")

        # The last user never triggers a "user changed" event; flush it here.
        if curr_usr:
            sequences[curr_usr] = temp

    database = list(sequences.values())
    for sequence in database:
        print(" -1 ".join(sequence) + " -2")

    ps = PrefixSpan(database)
    print("raw data \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    # Plain, closed and generator top-20 patterns, in that order.
    for options in ({}, {"closed": True}, {"generator": True}):
        for entry in ps.topk(20, **options):
            print(entry)
        print("\n")

    print("\n\n\n")
Ejemplo n.º 5
0
def _classify_aversion_directions(samples, reference_points):
    """Convert one user's buffered samples into direction symbols.

    Args:
        samples: ``[x, y, flag]`` entries. Entries whose flag is not ``"0"``
            become ``"c"``.
        reference_points: ``[x, y]`` pairs; their column-wise mean is the
            reference position the ``"0"`` samples are compared against.

    Returns:
        A list with one symbol per sample: ``"c"``, ``"f"``, ``"l"``,
        ``"r"``, ``"u"`` or ``"d"``.
        # NOTE(review): l/r/u/d presumably mean left/right/up/down; the
        # meaning of "f" is not visible here - confirm.
    """
    if not samples:
        # Nothing to classify; also avoids averaging an empty list.
        return []

    # NOTE(review): if `reference_points` is empty while `samples` holds a
    # "0" entry, the mean is a scalar NaN and the zip below raises - same
    # as before; confirm how such users should be handled.
    mean = np.average(reference_points, axis=0)
    symbols = []
    for sample in samples:
        symbol = "c"
        if sample[2] == "0":
            diff = [np.abs(a - b) for a, b in zip(sample[0:2], mean)]
            # Deviation from the mean, relative to the mean, per axis.
            ratio_x = (diff[0] + mean[0]) / mean[0]
            ratio_y = (diff[1] + mean[1]) / mean[1]
            if (np.abs(ratio_x - ratio_y) < treshold
                    or (ratio_x > treshold2 and ratio_y > treshold2)):
                symbol = "f"
            elif diff[0] - diff[1] > 0:
                # The x deviation dominates.
                if sample[0] < mean[0]:
                    symbol = "l"
                if sample[0] > mean[0]:
                    symbol = "r"
            else:
                # The y deviation dominates (or both are equal).
                if sample[1] < mean[1]:
                    symbol = "u"
                if sample[1] > mean[1]:
                    symbol = "d"
        symbols.append(symbol)
    return symbols


def aversion_direction_one_stacking_period():
    """Mine frequent direction-symbol patterns, one symbol per elapsed period.

    Reads the CSV at ``FILE`` (first row skipped as a header) and buffers
    ``[x, y, flag]`` samples per value of column 2, which is treated as the
    user key. Rows with an empty user key are ignored. Columns 4 and 5 are
    parsed as floats after replacing ``","`` with ``"."``. Rows whose column
    6 is ``"1"`` feed the per-user reference mean, and rows whose column 6
    is ``"0"`` are buffered as events (preceded by a placeholder sample).
    Whenever the running sum of column 3 exceeds ``PERIOD * 1000`` a
    placeholder sample is buffered and one period is subtracted.

    When a user's rows end, the buffered samples are converted to symbols
    by ``_classify_aversion_directions``. The sequences are printed in
    ``-1`` / ``-2`` delimited form, then mined with PrefixSpan (pattern
    length 3 to 8) and the top-20 plain, closed and generator patterns are
    printed.

    Returns:
        None. All results are written to stdout.
    """
    sequences = {}
    # newline="" is what the csv module asks for when opening its input.
    with open(FILE, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row
        curr_usr = None  # no user seen yet
        avg = []
        # NOTE(review): the time counter is carried across users, as in the
        # previous implementation - confirm this is intended.
        curr_time = 0
        aversion = [0.0, 0.0, "1"]
        temp = []
        for row in reader:
            if curr_usr != row[2]:
                # Classify and store the finished user before switching.
                if curr_usr:
                    sequences[curr_usr] = _classify_aversion_directions(
                        temp, avg)
                curr_usr = row[2]
                temp = []
                avg = []
            if row[2] == "":
                continue

            if row[6] == "1":
                avg.append([
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", "."))
                ])
            curr_time += int(row[3])
            if row[6] == "0":
                # `aversion` is always the placeholder when a row starts,
                # so the placeholder is buffered ahead of the event.
                if aversion[2] == "1":
                    temp.append(aversion)
                aversion = [
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", ".")), row[6]
                ]
                temp.append(aversion)
                aversion = [0.0, 0.0, "1"]
                curr_time = 0

            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = [0.0, 0.0, "1"]

        # The last user never triggers a "user changed" event; flush it here.
        if curr_usr:
            sequences[curr_usr] = _classify_aversion_directions(temp, avg)

    database = list(sequences.values())
    for sequence in database:
        print(" -1 ".join(sequence) + " -2")

    ps = PrefixSpan(database)
    print("aversion direction one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    # Plain, closed and generator top-20 patterns, in that order.
    for options in ({}, {"closed": True}, {"generator": True}):
        for entry in ps.topk(20, **options):
            print(entry)
        print("\n")

    print("\n\n\n")
Ejemplo n.º 6
0
    discrete_time = []
    with open(filename, "r", encoding="utf-8") as weights_file:
        print(f"Reading file {filename}")
        for i, weights_triple in enumerate(weights_file):
            current_weights = weights_triple.replace(",", ".").split("\t")
            weights.append(int(current_weights[1]))
            discrete_time_base = int(current_weights[0].strip())
            discrete_time.append(discrete_time_base)
            curr_frequency = int(current_weights[3].strip())
            frequency.append(curr_frequency)
            for k in range(0, curr_frequency):
                weights.append(int(current_weights[1]))
                discrete_time_base += 1
                discrete_time.append(discrete_time_base)
            if limit is not None and (i == limit or discrete_time_base >= limit):
                print("Limit reached")
                break
    return discrete_time, weights


if __name__ == '__main__':
    # Script entry point: load one result file, cut the weight series into
    # fixed-size windows and print the closed frequent patterns.
    basedir = "C:/Users/havar/Home/cache_simulation_results/"
    db_path = basedir + "scaled_w_01.csv"

    _t, _w = _read_db(db_path)

    # Each window of 1000 consecutive weights is one sequence for PrefixSpan.
    data = [window for window in chunks(_w, 1000)]

    ps = PrefixSpan(data)
    ps.minlen, ps.maxlen = 5, 100

    # Minimum support 5, closed patterns only.
    closed_patterns = ps.frequent(5, closed=True)
    print(closed_patterns)