Example No. 1
    def fit(self, X, y=None):

        import pandas as pd
        from prefixspan import PrefixSpan

        # Collect the patterns returned by PrefixSpan for every support
        # threshold from 2 up to self.__frequent.
        patterns = []
        ps = PrefixSpan(X)
        for i in range(2, self.__frequent + 1):
            patterns.extend(ps.frequent(i))

        # Build a frame of (sequence, relative support, length), keep only the
        # patterns at or above the minimum support, ordered by pattern length.
        df = pd.DataFrame(columns=["secuencia", "support", "tam"])
        for i, (count, seq) in enumerate(patterns):
            df.loc[i] = [seq, count / len(X), len(seq)]
        df = df.sort_values("tam", ascending=True)
        df.drop("tam", axis=1, inplace=True)
        df = df[df["support"] >= self.__minSupport]
        df = df.reset_index(drop=True)

        # Insert every frequent sequence into the prefix tree rooted at self.root.
        for _, row in df.iterrows():
            node = self.root
            for pos, item in enumerate(row["secuencia"]):
                if node.existChildren(item):
                    node = node.getChildren(item)
                    if pos == len(row["secuencia"]) - 1:
                        node.setSupport(row["support"])
                else:
                    child = nodo(se=item, su=row["support"])
                    node.addChild(item, child)
                    node = child

        return self
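The fit method above relies on a nodo trie-node class that is not shown in this example. Purely as an assumption, so the snippet can be read on its own, such a class might look like this:

class nodo:
    # Hypothetical sketch of the trie node assumed by fit(); attribute names are guesses.
    def __init__(self, se=None, su=0):
        self.sequence = se        # item stored at this node
        self.support = su         # support of the prefix ending at this node
        self.children = {}        # item -> nodo

    def existChildren(self, item):
        return item in self.children

    def getChildren(self, item):
        return self.children[item]

    def addChild(self, item, child):
        self.children[item] = child

    def setSupport(self, support):
        self.support = support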
Example No. 2
def sequence_mining(min_support, token):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(current_dir, 'dataset',
                             'upload_sequence_processed-{}.txt'.format(token))
    db = []
    with open(data_path, 'r') as f:
        file = reader(f, delimiter=' ', quotechar='\r')
        for row in file:
            db.append([int(item) for item in row])
    row_count = len(db)
    # Guarantee an absolute support of at least 2 sequences.
    if min_support * row_count < 2:
        if row_count != 0:
            min_support = 2 / row_count
    ps = PrefixSpan(db)
    all_sequence = ps.frequent(row_count * min_support)
    all_sequence_num = len(all_sequence)
    return all_sequence_num, all_sequence
Example No. 3
    def score_session(self, items: List[int], items_to_score: np.ndarray,
                      relevant_sessions_indices: Set[int]):
        # items_to_score must behave like a NumPy array: it is indexed with a
        # boolean mask further down.
        scores = self.items_to_scores.get(str(items))
        if scores is None:
            self.competence = []

            # Gaussian whose full width at half maximum spans the current
            # session (FWHM = 2*sqrt(2*ln 2) * sigma).
            self.last = len(items)
            self.bcs_sigma = (self.last - 1) / (2 * np.sqrt(2 * np.log(2)))
            if self.bcs_sigma == 0:
                self.bcs_sigma = 0.1
            self.bcs_a = 1 / (self.bcs_sigma * np.sqrt(2 * np.pi))
            total_bcs_weight = sum(
                [self.bcs_weight(i + 1) for i, x in enumerate(items)])

            relevant_sessions = [self.db[i] for i in relevant_sessions_indices]

            for session in relevant_sessions:
                lcs = get_longest_common_subsequence(session, items)
                lcs_indices = get_indices(items, lcs)

                bcs = sum([self.bcs_weight(x)
                           for x in lcs_indices]) / total_bcs_weight

                fes_last = len(session)
                self.lcs_last = get_indices(session, lcs)[-1]
                self.fes_sigma = (fes_last -
                                  self.lcs_last) / (2 * np.sqrt(2 * np.log(2)))
                if self.fes_sigma == 0:
                    self.fes_sigma = 0.1
                self.fes_a = 1 / (self.fes_sigma * np.sqrt(2 * np.pi))
                cni = session[self.lcs_last:]
                unique_cni = set(cni)
                fes = sum(
                    [self.fes_weight(cni.index(x) + 1)
                     for x in unique_cni]) / len(items)

                # Harmonic mean of BCS and FES (0 if either weight is 0).
                self.competence.append(
                    0 if bcs == 0 or fes == 0 else
                    (bcs * fes) / (0.5 * (bcs + fes)))

            # mine patterns
            self.total_weight = sum(self.competence)

            ps = PrefixSpan(relevant_sessions)

            patterns = ps.frequent(self.delta,
                                   key=self.pattern_key,
                                   bound=self.pattern_key)

            scores = self.score_items(patterns)

            self.items_to_scores.update({str(items): scores})
        predictions = np.zeros(len(items_to_score))
        mask = np.isin(items_to_score, list(scores.keys()))
        scored_items = items_to_score[mask]
        values = [scores[x] for x in scored_items]
        predictions[mask] = values
        return pd.Series(data=predictions, index=items_to_score)
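The bcs_weight and fes_weight helpers called above are not part of this excerpt. A plausible sketch, assuming they are Gaussian weights built from the bcs_a/bcs_sigma and fes_a/fes_sigma values computed in score_session (the centres chosen here are guesses, not the original implementation):

    def bcs_weight(self, position):
        # Assumed Gaussian weight, largest near the end of the current session.
        return self.bcs_a * np.exp(
            -((position - self.last) ** 2) / (2 * self.bcs_sigma ** 2))

    def fes_weight(self, position):
        # Assumed Gaussian weight, largest right after the matched subsequence.
        return self.fes_a * np.exp(
            -((position - 1) ** 2) / (2 * self.fes_sigma ** 2))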
Example No. 4
def nlp_FreqSubsequenceMining(classification,
                              MINSUP=3,
                              CLOSED=False,
                              GENERATOR=True):

    Sequences.clear()

    print("Analyzing %s..." % classfication)

    nlp = spacy.load("en_core_web_sm")

    # Read the raw data file and convert each description to a NodeID sequence.
    with open("./DATA-Train/DATA-%s" % classification) as file:
        for line in file:
            # Each line holds a conf-id and a description separated by a tab.
            try:
                conf_id, conf_desc_text_raw = line.split('\t')
            except ValueError:
                print(line)
                continue
            doc = nlp(conf_desc_text_raw.strip())

            doc = MergeWords(doc)

            Sequences.append([
                MapDict[(token.pos_, MyWordTags(token.text))] for token in doc
                if (token.pos_, MyWordTags(token.text)) != ('PUNCT', 'OTHER')
            ])

    size = len(Sequences)
    mean_len = np.mean([len(s) for s in Sequences])
    print("Config & Desc: %d\nMean length: %.1f" % (size, mean_len))

    # Mine frequent subsequences from the NodeID sequences.
    # PrefixSpan.frequent returns tuples of the form (sup, [seq]).
    ps = PrefixSpan(Sequences)
    freq_seqs = ps.frequent(int(MINSUP), closed=CLOSED, generator=GENERATOR)
    res = {tuple(seq): sup for sup, seq in freq_seqs}

    print("Frequent Sub-sequences: %d\n" % len(res))

    # Frequent subsequences with their support counts.
    return res
Example No. 5
def generate_rules(changes_sets, threshold):
    ps = PrefixSpan(changes_sets)
    print("Start rule generation")
    freq_seqs = ps.frequent(minsup=threshold, closed=True)

    # Keep only sequences that contain both an added ("+") and a removed ("-") token.
    freq_seqs = [
        x for x in freq_seqs
        if any(y.startswith("+") for y in x[1])
        and any(y.startswith("-") for y in x[1])
    ]

    # Sort by support, highest first.
    freq_seqs = sorted(freq_seqs, reverse=True)
    return freq_seqs
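As a rough illustration, generate_rules can be exercised on a hand-made list of token sequences (the tokens below are invented for the example):

toy_changes = [
    ["-foo", "+bar", "ctx"],
    ["-foo", "+bar"],
    ["ctx", "-foo", "+bar"],
]
rules = generate_rules(toy_changes, threshold=2)
# Each rule is a (support, sequence) tuple containing at least one "+" and one "-" token.
print(rules)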
Example No. 6
def sequence_mining(min_support):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(current_dir, 'dataset',
                             'upload_sequence_processed.txt')
    db = []
    with open(data_path, 'r') as f:
        file = reader(f, delimiter=' ', quotechar='\r')
        for row in file:
            db.append(row)
    row_count = len(db)
    ps = PrefixSpan(db)
    all_sequence = ps.frequent(row_count * min_support)
    all_sequence_len = len(all_sequence)
    return all_sequence_len, all_sequence
Example No. 7
    def find_patterns(self):
        print(self.sampling_type)
        db = self.data
        ps = PrefixSpan(db)
        n_items = len(db)
        result = None
        opts = {
            "closed": self.closed,
            # Somehow does not work
            #"generator": self.generator
        }
        from pprint import pprint
        pprint(opts)
        if self.sampling_type:
            result = ps.topk(self.k, **opts)
        else:
            print("Support value:", self.min_support)
            print("Size:", n_items)
            print("Support:", n_items * self.min_support / 100)
            result = ps.frequent((self.min_support * n_items / 100.0), **opts)

        self.table.model().clear()
        model = QStandardItemModel(self.table)
        model.clear()
        for col, label in enumerate(["Support", "Pattern"]):
            item = QStandardItem(label)
            model.setHorizontalHeaderItem(col, item)
        sequences = []
        for support, pattern in result:
            if len(pattern) < self.min_len:
                continue
            support /= n_items
            sequences.append((support, pattern))
            sitem = self.NumericItem(support)
            pitem = QStandardItem(str(pattern))
            model.appendRow([sitem, pitem])
        self.Outputs.object.send(sequences)
        self.table.setModel(model)
Example No. 8
    def compute_prefix_span(self):
        r'''
        Runs the PrefixSpan algorithm on a list of lists representing
        sequences, using a minimum support threshold.

        Attributes used
        ---------------
        self.database: (list of lists)
            The "database" (list) of sequences.
        self.min_support: (int)
            The minimum support count for PrefixSpan.

        Returns
        -------
        prefix_span: (list of tuples)
            Output of PrefixSpan.frequent. List of tuples of the
            form (frequency, sequence), where sequence is a list
            representing the sequence from the database.
        '''
        ps = PrefixSpan(self.database)
        prefix_span = ps.frequent(self.min_support)
        return prefix_span
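A minimal, hypothetical call of this method, assuming the surrounding object (called miner here, a made-up name) exposes the database and min_support attributes described in the docstring:

miner.database = [[1, 2, 3], [1, 3], [2, 3]]
miner.min_support = 2
print(miner.compute_prefix_span())
# -> a list of (frequency, sequence) tuples; item 3 appears in all three
#    sequences, so (3, [3]) is among them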
Example No. 9
def spatronesintax(libro, posv=True):
    # Join the 'Texto' column into one string, split it into sentences and POS-tag them.
    df = libro
    tes = df['Texto'].tolist()
    for i in range(len(tes)):
        tes[i] = tes[i] + '.\n'
    tes = ''.join(tes)
    o = re.sub('[“]+|[”]+|["]+', '', tes)
    listaprueba = sent_tokenize(o)
    listapos = []
    for i in listaprueba:
        i = i.strip()
        doc = nlp(i)
        listapos.append(doc)
    oye = separo(listapos, posv)
    listanum, pola = labeltonum(oye)
    # Mine frequent label sequences appearing in at least half of the sentences.
    ps = PrefixSpan(listanum)
    lista = ps.frequent(int(len(oye) * 0.5))
    # Keep only patterns longer than 5 labels.
    lista2 = []
    for i in lista:
        if len(i[1]) > 5:
            lista2.append(i)
    df2 = correr(lista2, listanum)
    # Map the numeric labels back to their original tags.
    listatrans = []
    for i in df2['indis']:
        listaux2 = []
        for j in i:
            listaux2.append(pola[j])
        listatrans.append(listaux2)
    df2['transformer'] = listatrans
    df2.to_excel('pospattern.xlsx', index=False)
Example No. 10
def _read_db(filename, limit=None):
    # Accumulators for the parsed columns.
    weights = []
    frequency = []
    discrete_time = []
    with open(filename, "r", encoding="utf-8") as weights_file:
        print(f"Reading file {filename}")
        for i, weights_triple in enumerate(weights_file):
            current_weights = weights_triple.replace(",", ".").split("\t")
            weights.append(int(current_weights[1]))
            discrete_time_base = int(current_weights[0].strip())
            discrete_time.append(discrete_time_base)
            curr_frequency = int(current_weights[3].strip())
            frequency.append(curr_frequency)
            for k in range(0, curr_frequency):
                weights.append(int(current_weights[1]))
                discrete_time_base += 1
                discrete_time.append(discrete_time_base)
            if limit is not None and (i == limit or discrete_time_base >= limit):
                print("Limit reached")
                break
    return discrete_time, weights


if __name__ == '__main__':
    basedir = "C:/Users/havar/Home/cache_simulation_results/"

    _t, _w = _read_db(basedir + "scaled_w_01.csv")
    data = list(chunks(_w, 1000))
    ps = PrefixSpan(data)
    ps.minlen = 5
    ps.maxlen = 100

    print(ps.frequent(5, closed=True))
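The chunks helper used above to split _w into fixed-size sequences is not shown; a common sketch of such a generator (an assumption, not the original definition):

def chunks(seq, size):
    # Yield consecutive slices of `seq` containing at most `size` elements each.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]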
Example No. 11
from prefixspan import PrefixSpan

db = [
    [0, 1, 2, 3, 4],
    [1, 1, 1, 3, 4],
    [2, 1, 2, 2, 0],
    [1, 1, 1, 2, 2],
]

ps = PrefixSpan(db)

print(ps.frequent(2))
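For reference, PrefixSpan.frequent returns (support, pattern) tuples; with this database the result includes, among others:

# (4, [1])     item 1 occurs in all four sequences
# (3, [1, 2])  the subsequence 1 -> 2 occurs in three of them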
Example No. 12
import pickle
from prefixspan import PrefixSpan

with open("../data/objects/paths", "rb") as f:
    paths = pickle.load(f)

ps = PrefixSpan(paths)
freqs = ps.frequent(2)

with open("../data/objects/freqs", "wb") as f:
    pickle.dump(freqs, f)
Example No. 13
class PrefixSpanManager:
    """
    Helper class around PrefixSpan.

    Parameters:
        * sax_engine: SaxEngine
            SAX preprocessing instance
        * export: Boolean
            Whether the data has already been exported to the right format

    Variables:
        * se_instance: SaxEngine
            The SAX engine instance
        * data: Array[]
            The data in SAX format
    """
    def __init__(self, sax_engine, export=True):
        self.se_instance = sax_engine
        self.data = sax_engine.sax_data
        self.process_data = []
        self.ps = None
        self.ploter = Plot(self)
        if export:
            self.export_format()

    def run(self):
        """
        Creates the PrefixSpan instance from the preprocessed data.
        """
        self.ps = PrefixSpan(self.process_data)

    def export_format(self):
        """
        Reshapes the data into the format expected by the PrefixSpan instance.
        """
        tmp = []
        for elmt in self.data:
            tmp.append(elmt.ravel())
        self.process_data = tmp

    def topk(self, n, c=True):
        """
        Returns the most frequent patterns (highest support), closed ones by default.

        Parameters:
            * n: int
                Number of patterns to return
        Returns:
            List of frequent patterns
        """
        return self.ps.topk(n, closed=c)

    def frequent(self, n):
        """
        Returns the patterns with support at least n.

        Parameters:
            * n: int
                Minimum support
        Returns:
            List of patterns with minimum support n
        """
        return self.ps.frequent(n)

    def plot(self, l):
        self.ploter.plot_prefixspan(l)
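A rough usage sketch, assuming an already-built SaxEngine whose sax_data holds NumPy arrays (SaxEngine and Plot belong to the surrounding project and are not shown; engine is a placeholder name):

manager = PrefixSpanManager(engine)   # export_format() runs in __init__
manager.run()                         # builds the PrefixSpan instance
print(manager.topk(10))               # ten most frequent closed patterns
print(manager.frequent(5))            # patterns with support of at least 5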
Example No. 14
OUTPUT_JSON_NAME = "data/rules/" + owner + "_" + repo + "_" + lang + ".json"

with open(INPUT_JSON_NAME, mode='r', encoding='utf-8') as f:
    changes_sets = load(f)

changes = [x["changes_set"] for x in changes_sets]

new_changes = []
for tokens in changes:
    new_tokens = [
        x for x in tokens if not x.endswith("\n") and not x.endswith(" ")
    ]
    if new_tokens != [] and new_tokens not in new_changes:
        new_changes.append(new_tokens)

print("Start rule generation")
ps = PrefixSpan(new_changes)
freq_seqs = ps.frequent(minsup=int(len(new_changes) * 0.1), closed=True)
# Keep only sequences that contain both an added ("+") and a removed ("-") token.
freq_seqs = [
    x for x in freq_seqs
    if any(y.startswith("+") for y in x[1])
    and any(y.startswith("-") for y in x[1])
]

freq_seqs = sorted(freq_seqs, reverse=True)

with open(OUTPUT_JSON_NAME, mode='w', encoding='utf-8') as f:
    dump(freq_seqs, f, indent=1)
Example No. 15
def apply(grouped_stream, all_labels, parameters=None):
    """
    Applies the prefix span algorithm

    Parameters
    -------------
    grouped_stream
        Grouped stream
    all_labels
        Indexed labels
    parameters
        All the parameters of the algorithm

    Returns
    --------------
    frequents
        List containing the frequent patterns as label indexes
    frequents_label
        List containing the frequent patterns as labels
    frequents_encodings
        List containing the frequent patterns as word encodings
    frequents_occurrences
        List containing, for each pattern, the sequences of events where it occurs
    """
    if parameters is None:
        parameters = {}

    final_label_idx = parameters[FINAL_LABEL_IDX] if FINAL_LABEL_IDX in parameters else DEFAULT_FINAL_LABEL_IDX

    m = parameters[M] if M in parameters else DEFAULT_M

    data = [[y[final_label_idx] for y in x] for x in grouped_stream]
    ps = PrefixSpan(data)

    frequents = [x[1] for x in ps.frequent(m)]
    frequents_label = [" ".join([all_labels[y] for y in x]) for x in frequents]

    F = tempfile.NamedTemporaryFile(suffix='.txt')
    F.close()
    F2 = open(F.name, "w")
    for label in frequents_label:
        F2.write(label+"\n")
    F2.close()

    model = fasttext.train_unsupervised(F.name)
    frequents_encodings = []
    for i in range(len(frequents)):
        phrase = [x for x in frequents_label[i].split() if x in model.words]
        v = None
        for w in phrase:
            if v is None:
                v = model.get_word_vector(w)
            else:
                v = v + model.get_word_vector(w)
        frequents_encodings.append(v)

    frequents_occurrences = []
    for f in frequents:
        frequents_occurrences.append([])
        for g in grouped_stream:
            d = [x[final_label_idx] for x in g]
            # Check every window of length len(f); the +1 keeps a match that
            # ends exactly at the last position from being skipped.
            for i in range(len(d) - len(f) + 1):
                if d[i] == f[0] and d[i + len(f) - 1] == f[-1]:
                    if d[i:i + len(f)] == f:
                        frequents_occurrences[-1].append(g[i:i + len(f)])

    return frequents, frequents_label, frequents_encodings, frequents_occurrences
Example No. 16
def preprocess_dataset(path_):

    # Data extraction ------------------------------------------------------------------------------

    path = path_
    date_column = 'Date-heure UTC (événement)'
    action_column = 'Pages'
    identity_col = 'Visiteurs uniques ID'

    dataset = pd.read_csv(path, sep=';', parse_dates=[date_column])

    dataset[date_column] = pd.to_datetime(dataset[date_column],
                                          errors='coerce')
    dataset = dataset.dropna(subset=[date_column])

    dataset = dataset[dataset[action_column] != '-']
    dataset.index = dataset[date_column]
    dataset.drop(columns=date_column, inplace=True)
    dataset.sort_index(ascending=True, inplace=True)

    # Pages excluded from the analysis.
    #'particulier::compte::compte-conseil univers de besoin'
    valeurs_interdites = [
        'particulier::acces-CR::acces-CR-store locator trouver ma CR 50',
        'particulier::particulier-accueil particuliers et BP'
    ]

    dataset_after = dataset[~dataset[action_column].isin(valeurs_interdites)]
    person_list = dataset_after[identity_col].unique()

    print("Number of people recorded: " + str(len(person_list)))

    # Comparison of the number of visitors before/after filtering: 77406 - 24682 = 52724 people;
    # those 52724 people only visit the excluded pages.

    List_actions = []

    # authorized_inactivity_time defines the maximal inactivity time before we consider
    # that the client opened two distinct sessions.
    authorized_inactivity_time = datetime.timedelta(minutes=30)

    for i in range(0, len(person_list)):

        personne = person_list[i]
        subdata = dataset_after[dataset_after[identity_col] == personne]
        start = 0

        for j in range(0, len(subdata.index) - 1):

            duree = subdata.index[j + 1] - subdata.index[j]

            if duree > authorized_inactivity_time:

                actions = subdata[action_column].iloc[start:j + 1].tolist()
                start = j + 1
                List_actions.append(actions)

        actions = subdata[action_column].iloc[start:len(subdata.index)].tolist()
        List_actions.append(actions)

    # First PrefixSpan pass to find the relevant journeys ---------------------------------------

    first_search = PrefixSpan(List_actions)
    # PrefixSpan bounds pattern length through its minlen/maxlen attributes.
    first_search.minlen = 2
    first_search.maxlen = 7
    results_search1 = first_search.frequent(
        15, filter=lambda patt, matches: diversity_score(patt) >= len(patt))

    results_search1.sort(key=lambda x: -x[0])

    # Second pass to get the list of length-2 transitions and their frequencies ---------------------

    second_search = PrefixSpan(List_actions)
    second_search.minlen = 2
    second_search.maxlen = 2
    filter_list = compute_transitions_list(results_search1)
    results_search2 = second_search.frequent(
        5, filter=lambda patt, matches: patt in filter_list)

    results_search2.sort(key=lambda x: -x[0])

    # Sankey diagram plotting -----------------------------------------------------------------------------

    liste_resfinal = results_search2

    labels = []
    sources = []
    targets = []
    values = []
    links = []

    #A link only appears in the graph if it constitutes more than rate% of the incoming/outgoing traffic of the
    #two nodes involved in the link
    rate = 0.11

    for match in liste_resfinal:

        if len(match[1]) == second_search.minlen:
            pattern = match[1]
        else:
            pattern = match[1][len(match[1]) - 2:len(match[1])]

        for label in pattern:
            renamed = rename(label)
            if renamed not in labels:
                labels.append(renamed)
            if match[1].index(label) < len(match[1]) - 1:
                targetted = rename(match[1][match[1].index(label) + 1])
                res_exit = 0
                res_entry = 0
                res_incoming = 0
                res_ongoing = 0
                if labels.index(renamed) in targets:

                    for i in range(0, len(targets)):
                        x = targets[i]
                        if x == labels.index(renamed):
                            res_exit += values[i]

                if labels.index(renamed) in sources:

                    for i in range(0, len(sources)):
                        if sources[i] == labels.index(renamed):
                            res_incoming += values[i]

                if targetted in labels:

                    if labels.index(targetted) in sources:

                        for i in range(0, len(sources)):
                            x = sources[i]
                            if x == labels.index(targetted):
                                res_entry += values[i]

                    if labels.index(targetted) in targets:

                        for i in range(0, len(targets)):
                            if targets[i] == labels.index(targetted):
                                res_ongoing += values[i]

                if (renamed, targetted) not in links:

                    if match[0] > rate * res_exit and match[
                            0] > rate * res_entry and match[
                                0] > rate * res_ongoing and match[
                                    0] > rate * res_incoming:

                        if ((targetted, renamed) in links):

                            if values[links.index(
                                (targetted, renamed))] <= match[0]:

                                links.append((renamed, targetted))
                                sources.append(labels.index(renamed))
                                if targetted in labels:
                                    targets.append(labels.index(targetted))
                                else:
                                    labels.append(targetted)
                                    targets.append(labels.index(targetted))
                                sources.pop(links.index((targetted, renamed)))
                                targets.pop(links.index((targetted, renamed)))
                                values.pop(links.index((targetted, renamed)))
                                links.pop(links.index((targetted, renamed)))
                                values.append(match[0])

                        else:

                            links.append((renamed, targetted))
                            sources.append(labels.index(renamed))
                            if targetted in labels:
                                targets.append(labels.index(targetted))
                            else:
                                labels.append(targetted)
                                targets.append(labels.index(targetted))
                            values.append(match[0])

                else:

                    values[links.index((renamed, targetted))] += match[0]

    global_matrix = generate_global_matrix(List_actions, labels)

    return [labels, links, values, global_matrix, liste_resfinal, List_actions]


Example No. 17
#! python3
# -*- coding:utf-8 -*-

__author__ = "yoyo"

from prefixspan import PrefixSpan as PS
import os.path as path

data_dir = "./dataset/vocabulary/"
filename = "GRE_pure.txt"

if __name__ == "__main__":
    filepath = path.join(data_dir, filename)
    with open(filepath) as f:
        vocabulary = f.read().split("\n")
    # Each word is a string, i.e. a sequence of characters, so PrefixSpan
    # mines frequent character subsequences across the vocabulary.
    ps = PS(vocabulary)
    for sequence in ps.frequent(3):
        if len(sequence[1]) >= 4:
            print(sequence)