def add_nodes_to_graph(seqs,last_k):
    """Register the trailing state of every sequence and count occurrences.

    For each sequence in ``seqs`` the last ``last_k`` items form a state,
    stored as a tuple. The first time a state is seen it is added as a path
    below the root of a ``SmartTree`` and as a node of an ``nx.DiGraph``.
    The last ``last_k + 1`` items are counted as well whenever they differ
    from the state itself, so that transition counts between states can be
    derived later; these longer tuples live only in the count dictionary.

    :param seqs: iterable of sliceable sequences
    :param last_k: number of trailing items that define a state
    :return: tuple ``(tree, counts, graph)``
    """
    tree = SmartTree()
    root = tree.set_root()
    graph = nx.DiGraph()
    counts = {}
    extended_len = last_k + 1

    for seq in seqs:
        state = tuple(seq[-last_k:])
        first_occurrence = state not in counts
        counts[state] = counts.get(state, 0) + 1
        if first_occurrence:
            # a new state goes into both the sequence tree and the graph
            tree.add_path(root, list(state))
            graph.add_node(state)

        # One extra leading item is needed to count transitions into the
        # state. Sequences too short to provide it yield the same tuple as
        # the state and must not be counted twice.
        extended = tuple(seq[-extended_len:])
        if extended != state:
            counts[extended] = counts.get(extended, 0) + 1

    return (tree, counts, graph)
Beispiel #2
0
    def fit(self, train_data=None):
        """
        Mine the frequent sequences and index them in a prefix tree.

        :param train_data: (optional) DataFrame whose "sequence" column holds the training sequences.
            When given, the patterns are mined with `pymining.seqmining` (slower).
            When None, SPMF is run over the sequence database stored in `self.db_path`.
        :raises ValueError: if called without data while `spmf_path` or `db_path` is unset,
            or if a mined pattern is empty.
        :raises NameError: if SPMF is used with `minsup` outside [0, 1].
        """
        if train_data is not None:
            # pure-python mining with pymining
            self.logger.info(
                'Using pymining.seqmining (python) for Frequent Sequence Mining'
            )
            sequences = train_data['sequence'].values
            if 0 <= self.minsup <= 1:
                # relative threshold: convert to an absolute count
                msup = int(self.minsup * len(sequences))
            else:
                msup = self.minsup
            self.logger.info(
                'Mining frequent sequences (minsup={})'.format(msup))
            self.freq_seqs = seqmining.freq_seq_enum(sequences, msup)
        else:
            if self.spmf_path is None or self.db_path is None:
                raise ValueError(
                    "You should set db_path and spfm_path before calling fit() without arguments."
                )

            self.logger.info('Using SPFM (Java) for Frequent Sequence Mining')
            if not 0 <= self.minsup <= 1:
                raise NameError("SPMF only accepts 0<=minsup<=1")
            percentage_min_sup = self.minsup * 100

            # run SPMF, then read its text output back in
            spmf_args = [
                self.spmf_algorithm,
                self.db_path,
                self.output_path,
                str(percentage_min_sup) + '%',
            ]
            command = ' '.join(spmf_args)
            callSPMF(self.spmf_path, command)
            self._parse_spfm_output()

        n_patterns = len(self.freq_seqs)
        self.logger.info('{} frequent sequences found'.format(n_patterns))
        self.logger.info('Building the prefix tree')
        self.tree = SmartTree()
        self.root_node = self.tree.set_root()
        for pattern, support in self.freq_seqs:
            pattern_len = len(pattern)
            if pattern_len == 0:
                raise ValueError('Frequent sequence of length 0')
            if pattern_len == 1:
                # single item: attach it directly below the root
                self.tree.create_node(pattern[0],
                                      parent=self.root_node,
                                      data={"support": support})
            else:
                # longer pattern: insert the whole path starting from the root
                self.tree.add_path(self.root_node, pattern, support)
        self.logger.info('Training completed')
    def fit(self, seqs):
        """Mine frequent sequences and index them in a prefix tree.

        When both ``spmf_path`` and ``db_path`` are set, SPMF (PrefixSpan) is
        run on the database file and ``seqs`` is not used. Otherwise the
        patterns are mined from ``seqs`` with ``seqmining.freq_seq_enum``.
        If neither source is available, an error is logged and the model is
        fitted with no patterns (an empty tree).

        :param seqs: list of sequences, each one a list of items
        :raises NameError: if SPMF is used with ``minsup`` outside [0, 1],
            or if a mined pattern has length 0
        """

        if self.spmf_path and self.db_path:
            self.logger.info("Using SPMF")
            # SPMF takes the support as a percentage string
            if 0 <= self.minsup <= 1:
                percentage_min_sup = self.minsup * 100
            else:
                raise NameError("SPMF only accepts 0<=minsup<=1")

            # call spmf
            algorithm = "PrefixSpan"
            command = ' '.join([
                algorithm, self.db_path, self.outputPath,
                str(percentage_min_sup) + '%'
            ])
            callSPMF(self.spmf_path, command)

            # parse back output from text file
            self._parse_SPMF_output()
        elif seqs:
            # minsup in [0, 1] is relative to the number of sequences
            msup = self.minsup * len(
                seqs) if 0 <= self.minsup <= 1 else self.minsup

            self.logger.debug('Mining frequent sequences')
            self.freq_seqs = seqmining.freq_seq_enum(seqs, msup)
        else:
            self.logger.error(
                "No sequence dabase path nor sequence list provided.")
            # Reset the patterns: otherwise the code below would fail with an
            # AttributeError on a first fit, or silently rebuild the tree
            # from the patterns left over by a previous fit.
            self.freq_seqs = []

        self.logger.info('{} frequent sequences found'.format(
            len(self.freq_seqs)))
        self.logger.debug('Building frequent sequence tree')
        self.tree = SmartTree()
        self.rootNode = self.tree.set_root()
        # each entry is a (pattern, support) pair
        for pattern, support in self.freq_seqs:
            if len(pattern) == 1:
                # add node to root
                self.tree.create_node(pattern[0],
                                      parent=self.rootNode,
                                      data={"support": support})
            elif len(pattern) > 1:
                # add entire path starting from root
                self.tree.add_path(self.rootNode, pattern, support)
            else:
                raise NameError('Frequent sequence of length 0')
        self.logger.debug('Tree completed')
Beispiel #4
0
    def build_smart_tree(self):
        """Return a ``SmartTree`` filled with a fixed set of paths.

        Every path is inserted from the root, in the order listed below,
        together with its support value.
        """
        default_support = 1
        # (path, support) pairs, in insertion order
        paths_with_support = (
            ([1, 2], default_support),
            ([1, 3, 1], 8),
            ([1, 3, 1, 0], 8),
            ([1, 3, 1, 6], 1),
            ([1, 3, 1, 4], 6),
            ([1, 3, 1, 4, 9], 3),
            ([1, 3, 1, 4, 2], 3),
            ([1, 3, 6], default_support),
            ([1, 3, 2], default_support),
            ([2, 3], default_support),
            ([2, 1], default_support),
            ([3, 4, 2, 1, 5, 1], default_support),
            ([3, 4, 2, 6], default_support),
            ([3, 4, 1, 6], default_support),
            ([3, 4, 1, 5, 5], default_support),
            ([3, 4, 1, 5, 4], default_support),
            ([3, 5], default_support),
            ([9], default_support),
            ([4, 2, 1, 5, 1, 3], 1),
            ([4, 2, 1, 5, 1], 8),
            ([4, 2, 1, 5, 1, 2], 7),
            ([4, 2, 1, 5, 1, 6], 8),
            ([4, 2, 1, 5, 1, 6, 9], 4),
            ([4, 2, 1, 5, 1, 6, 9, 3], 1),
            ([4, 2, 1, 5, 1, 6, 9, 0], 3),
            ([4, 2, 1, 5, 1, 6, 4], 4),
        )

        tree = SmartTree()
        root = tree.set_root()
        for path, support in paths_with_support:
            tree.add_path(root, path, support)

        return tree
class FSMRecommender(ISeqRecommender):
    """Sequence recommender backed by Frequent Sequence Mining (FSM)."""

    def __init__(self, minsup, minconf, max_context=1, min_context=1, spmf_path=None, db_path=None):
        """
        Store the mining thresholds and the SPMF configuration.

        :param minsup: minimum support threshold. Values in [0-1] are read as a relative count,
                anything else as an absolute count. NOTE: training with SPMF requires a relative count.
        :param minconf: minimum confidence threshold.
        :param max_context: (optional) largest number of trailing items of the user profile used
                to look up the frequent sequences.
        :param min_context: (optional) smallest number of trailing items of the user profile used
                to look up the frequent sequences.
        :param spmf_path: (optional) path to the SPMF jar file, used by `fit()` when it is called without data.
        :param db_path: (optional) path to the sequence database file read by SPMF.
        """
        super(FSMRecommender, self).__init__()
        # mining thresholds
        self.minsup = minsup
        self.minconf = minconf
        # bounds of the context window taken from the end of the profile
        self.max_context = max_context
        self.min_context = min_context
        self.recommendation_length = 1
        # SPMF configuration
        self.db_path = db_path
        self.spmf_path = spmf_path
        self.spmf_algorithm = "PrefixSpan"
        self.output_path = "tmp/tmp_output.txt"

    def __str__(self):
        template = ', '.join([
            'FreqSeqMiningRecommender: minsup={minsup}',
            'minconf={minconf}',
            'max_context={max_context}',
            'min_context={min_context}',
            'spmf_path={spmf_path}',
            'db_path={db_path}',
        ])
        return template.format(**vars(self))

    def fit(self, train_data=None):
        """
        Mine the frequent sequences and index them in a prefix tree.

        :param train_data: (optional) DataFrame whose "sequence" column holds the training sequences.
            When given, the patterns are mined with `pymining.seqmining` (slower).
            When None, SPMF is run over the sequence database stored in `self.db_path`.
        :raises ValueError: if called without data while `spmf_path` or `db_path` is unset,
            or if a mined pattern is empty.
        :raises NameError: if SPMF is used with `minsup` outside [0, 1].
        """
        if train_data is not None:
            # pure-python mining with pymining
            self.logger.info('Using pymining.seqmining (python) for Frequent Sequence Mining')
            sequences = train_data['sequence'].values
            if 0 <= self.minsup <= 1:
                # relative threshold: convert to an absolute count
                msup = int(self.minsup * len(sequences))
            else:
                msup = self.minsup
            self.logger.info('Mining frequent sequences (minsup={})'.format(msup))
            self.freq_seqs = seqmining.freq_seq_enum(sequences, msup)
        else:
            if self.spmf_path is None or self.db_path is None:
                raise ValueError("You should set db_path and spfm_path before calling fit() without arguments.")

            self.logger.info('Using SPFM (Java) for Frequent Sequence Mining')
            if not 0 <= self.minsup <= 1:
                raise NameError("SPMF only accepts 0<=minsup<=1")
            percentage_min_sup = self.minsup * 100

            # run SPMF, then read its text output back in
            spmf_args = [
                self.spmf_algorithm,
                self.db_path,
                self.output_path,
                str(percentage_min_sup) + '%',
            ]
            callSPMF(self.spmf_path, ' '.join(spmf_args))
            self._parse_spfm_output()

        n_patterns = len(self.freq_seqs)
        self.logger.info('{} frequent sequences found'.format(n_patterns))
        self.logger.info('Building the prefix tree')
        self.tree = SmartTree()
        self.root_node = self.tree.set_root()
        for pattern, support in self.freq_seqs:
            pattern_len = len(pattern)
            if pattern_len == 0:
                raise ValueError('Frequent sequence of length 0')
            if pattern_len == 1:
                # single item: attach it directly below the root
                self.tree.create_node(pattern[0], parent=self.root_node, data={"support": support})
            else:
                # longer pattern: insert the whole path starting from the root
                self.tree.add_path(self.root_node, pattern, support)
        self.logger.info('Training completed')

    def recommend(self, user_profile, user_id=None):
        """
        Look up the tail of `user_profile` in the prefix tree.

        The context starts at `max_context` items (or the whole profile when it is shorter)
        and shrinks one item at a time, down to `min_context`, until a lookup returns a
        non-empty result.

        :param user_profile: sequence of items
        :param user_id: not used by this method
        :return: the result of the first successful lookup, or an empty list
        """
        profile_len = len(user_profile)
        context_len = min(profile_len, self.max_context)
        recommendations = []
        while not recommendations and context_len >= self.min_context:
            context = user_profile[profile_len - context_len:profile_len]
            recommendations = self._find_match(context, self.recommendation_length)
            context_len -= 1
        return recommendations

    def _find_match(self, context, recommendation_length):
        """Return the confident continuations of `context`, or [] when there are none."""
        last_node = self.tree.find_path(self.root_node, context)
        if last_node == -1:
            # the tree lookup reported no match for this context
            return []

        matched = self.tree[last_node]
        context_support = matched.data['support']
        if not matched.fpointer:
            # the matched node has no children, so nothing can follow
            return []

        # every path of length recommendation_length starting at the match
        candidates = self.tree.find_n_length_paths(last_node, recommendation_length)
        return self._filter_confidence(context_support, candidates)

    def _filter_confidence(self, context_support, path_list):
        """Keep the paths whose confidence (end-node support / context support) reaches `minconf`."""
        denominator = float(context_support)
        kept = []
        for path in path_list:
            end_node = self.tree[path[-1]]
            confidence = end_node.data['support'] / denominator
            if confidence < self.minconf:
                continue
            kept.append((self.tree.get_nodes_tag(path), confidence))
        return kept

    def _set_tree_debug_only(self, tree):
        """Replace the prefix tree and its root node (debugging helper)."""
        self.tree = tree
        self.root_node = tree.get_root()

    def get_freq_seqs(self):
        """Return the frequent sequences stored by `fit()`."""
        return self.freq_seqs

    def get_sequence_tree(self):
        """Return the prefix tree stored by `fit()`."""
        return self.tree

    def show_tree(self):
        """Call `show()` on the prefix tree."""
        self.tree.show()

    def get_confidence_list(self, recommendation):
        """Return the second element (the confidence) of every recommendation entry."""
        return [entry[1] for entry in recommendation]

    def _parse_spfm_output(self):
        """Read `self.output_path` and store its (pattern, support) pairs in `self.freq_seqs`."""
        with open(self.output_path, 'r') as fin:
            self.freq_seqs = parsed = []
            for line in fin:
                # each line is split into the items and the support on '#SUP: '
                pieces = line.split('#SUP: ')
                support = int(pieces[1].strip())
                # drop empty tokens and the '-1' tokens between the items
                seq = tuple(tok for tok in pieces[0].split(' ') if tok not in ('', '-1'))
                parsed.append((seq, support))
class FreqSeqMiningRecommender(ISeqRecommender):
    """Frequent sequence mining recommender"""

    # file SPMF writes its patterns to; read back by _parse_SPMF_output
    outputPath = "tmp_output.txt"

    def __init__(self,
                 minsup,
                 minconf,
                 max_context,
                 min_context=1,
                 spmf_path=None,
                 db_path=None):
        """Store the mining thresholds and the SPMF configuration.

        :param minsup: minimum support; read as a fraction of the number of
            sequences if in [0-1], otherwise as an absolute count
        :param minconf: minimum confidence of a recommendation
        :param max_context: largest number of trailing profile items used
            for the lookup
        :param min_context: smallest number of trailing profile items used
            for the lookup
        :param spmf_path: path of the SPMF jar
        :param db_path: path of the sequence database in SPMF format; both
            paths must be set for SPMF to be used
        """

        super(FreqSeqMiningRecommender, self).__init__()
        self.minsup = minsup
        self.minconf = minconf
        self.max_context = max_context
        self.min_context = min_context
        self.recommendation_length = 1
        self.spmf_path = spmf_path
        self.db_path = db_path

    def fit(self, seqs):
        """Mine frequent sequences and index them in a prefix tree.

        When both ``spmf_path`` and ``db_path`` are set, SPMF (PrefixSpan) is
        run on the database file and ``seqs`` is not used. Otherwise the
        patterns are mined from ``seqs`` with ``seqmining.freq_seq_enum``.
        If neither source is available, an error is logged and the model is
        fitted with no patterns (an empty tree).

        :param seqs: list of sequences, each one a list of items
        :raises NameError: if SPMF is used with ``minsup`` outside [0, 1],
            or if a mined pattern has length 0
        """

        if self.spmf_path and self.db_path:
            self.logger.info("Using SPMF")
            # SPMF takes the support as a percentage string
            if 0 <= self.minsup <= 1:
                percentage_min_sup = self.minsup * 100
            else:
                raise NameError("SPMF only accepts 0<=minsup<=1")

            # call spmf
            algorithm = "PrefixSpan"
            command = ' '.join([
                algorithm, self.db_path, self.outputPath,
                str(percentage_min_sup) + '%'
            ])
            callSPMF(self.spmf_path, command)

            # parse back output from text file
            self._parse_SPMF_output()
        elif seqs:
            # minsup in [0, 1] is relative to the number of sequences
            msup = self.minsup * len(
                seqs) if 0 <= self.minsup <= 1 else self.minsup

            self.logger.debug('Mining frequent sequences')
            self.freq_seqs = seqmining.freq_seq_enum(seqs, msup)
        else:
            self.logger.error(
                "No sequence dabase path nor sequence list provided.")
            # Reset the patterns: otherwise the code below would fail with an
            # AttributeError on a first fit, or silently rebuild the tree
            # from the patterns left over by a previous fit.
            self.freq_seqs = []

        self.logger.info('{} frequent sequences found'.format(
            len(self.freq_seqs)))
        self.logger.debug('Building frequent sequence tree')
        self.tree = SmartTree()
        self.rootNode = self.tree.set_root()
        # each entry is a (pattern, support) pair
        for pattern, support in self.freq_seqs:
            if len(pattern) == 1:
                # add node to root
                self.tree.create_node(pattern[0],
                                      parent=self.rootNode,
                                      data={"support": support})
            elif len(pattern) > 1:
                # add entire path starting from root
                self.tree.add_path(self.rootNode, pattern, support)
            else:
                raise NameError('Frequent sequence of length 0')
        self.logger.debug('Tree completed')

    def recommend(self, user_profile):
        '''
        Given the user profile return a list of recommendations.

        The last ``max_context`` items of the profile (or the whole profile
        when it is shorter) are looked up first; the context then shrinks
        one item at a time, down to ``min_context``, until a lookup returns
        a non-empty result.

        :param user_profile: user profile (sequence of items)
        :return: list of recommendations e.g. [([2], 0.875), ([6], 1.0)]
        '''
        n = len(user_profile)
        c = min(n, self.max_context)
        match = []
        while not match and c >= self.min_context:
            q = user_profile[n - c:n]
            match = self._find_match(q, self.recommendation_length)
            c -= 1
        return match

    def _find_match(self, context, recommendation_length):
        """Return the confident continuations of ``context``, or [] if none."""
        # search context
        lastNode = self.tree.find_path(self.rootNode, context)

        if lastNode == -1:
            # the tree lookup reported no match for this context
            return []

        # context matched
        context_support = self.tree[lastNode].data['support']
        children = self.tree[lastNode].fpointer

        if not children:
            return []

        # find all paths of length recommendation_length from the match
        # NOTE(review): the method name is spelled `find_n_legth_paths` here;
        # confirm it matches the SmartTree API in use.
        paths = self.tree.find_n_legth_paths(lastNode,
                                             recommendation_length)
        return self._filter_confidence(context_support, paths)

    def _filter_confidence(self, context_support, pathsList):
        """Keep the paths whose confidence reaches ``minconf``.

        The confidence of a path is the support of its last node divided by
        ``context_support``.

        :return: list of ``(node tags, confidence)`` tuples
        """
        goodPaths = []
        for p in pathsList:
            confidence = self.tree[p[len(p) - 1]].data['support'] / float(
                context_support)
            if confidence >= self.minconf:
                goodPaths.append((self.tree.get_nodes_tag(p), confidence))
        return goodPaths

    def _set_tree_debug_only(self, tree):
        """Replace the prefix tree and its root node (debugging helper)."""
        self.tree = tree
        self.rootNode = tree.get_root()

    def get_freq_seqs(self):
        """Return the frequent sequences stored by ``fit``."""
        return self.freq_seqs

    def get_sequence_tree(self):
        """Return the prefix tree stored by ``fit``."""
        return self.tree

    def show_tree(self):
        """Call ``show()`` on the prefix tree."""
        self.tree.show()

    def get_confidence_list(self, recommendation):
        """Return the second element (the confidence) of every entry."""
        return list(map(lambda x: x[1], recommendation))

    def _parse_SPMF_output(self):
        """Read ``outputPath`` and store its (pattern, support) pairs in ``self.freq_seqs``."""
        with open(self.outputPath, 'r') as fin:
            self.freq_seqs = []
            for line in fin:
                # each line is split into the items and the support on '#SUP: '
                pieces = line.split('#SUP: ')
                support = pieces[1].strip()
                items = pieces[0].split(' ')
                # drop empty tokens and the '-1' tokens between the items
                seq = tuple(x for x in items if x != '' and x != '-1')
                self.freq_seqs.append((seq, int(support)))