Example #1
0
    def _convert_DataFrame_to_DiGraph(self, df):
        """Build ``self.G``, a directed parent -> child graph, from ``df``.

        Reads the 'Class ID' and 'Parents' columns of ``df``. Each 'Parents'
        value is stringified and split on '|'; every (parent, class) pair
        becomes one edge after both ends have been passed through
        ``get_short_concept_name`` together with ``self.name``. The graph
        attributes 'name', 'year' and 'submission_id' are copied from the
        instance.

        Errors are not raised to the caller: the exception and an error line
        are printed and the method returns None. ``self.G`` is only
        (re)assigned after the edge list has been built successfully.
        """
        columns = ['Class ID', 'Parents']
        try:
            tmp = df[columns]
            edges = [(get_short_concept_name(self.name, parent),
                      get_short_concept_name(self.name, row['Class ID']))
                     for index, row in tmp.iterrows()
                     for parent in str(row['Parents']).split('|')]
            self.G = nx.DiGraph()
            self.G.graph['name'] = self.name
            self.G.graph['year'] = self.year
            self.G.graph['submission_id'] = self.submission_id
            self.G.add_edges_from(edges)

            # Drop the placeholder node 'nan' (presumably produced by
            # str() of a missing 'Parents' cell -- confirm against the data).
            # remove_nodes_from() silently skips nodes that are not in the
            # graph, so no guarding try/except is needed here.
            self.G.remove_nodes_from(['nan'])

            printf('Convertion {}-{} DataFrame to DiGraph done!'.format(
                self.name, self.year))
        except Exception as ex:
            printf(ex)
            printf(
                'ERROR: {}-{} NOT converted from DataFrame to DiGraph!'.format(
                    self.name, self.year))
Example #2
0
def get_submissions(fn):
    """Load the submissions JSON file and return it with upper-cased keys.

    Args:
        fn: Path to a JSON file whose top-level value is a mapping. The
            summary printed after loading treats each value as a mapping
            keyed by year.

    Returns:
        A dict equal to the loaded mapping with every top-level key
        upper-cased, or None when the file could not be read or parsed
        (the error is printed, not raised).

    Raises:
        ValueError: if ``fn`` does not exist.
    """
    if not os.path.exists(fn):
        raise ValueError("submission fn does not exist!")

    # Stays None when loading fails, so the failure can be detected below
    # instead of crashing on an unbound local.
    obj = None
    try:
        with open(fn, 'r') as f:
            obj = json.load(f)
        printf('{} loaded!'.format(fn))
        printf('- {} ontologies'.format(len(obj.keys())))
        distinct_years = {
            year for by_year in obj.values() for year in by_year
        }
        printf('- {} years'.format(len(distinct_years)))
    except Exception as ex:
        printf(ex)
        printf('ERROR: {} NOT loaded!'.format(fn))

    if obj is None:
        return None

    return {k.upper(): v for k, v in obj.items()}
Example #3
0
def load_clickstream(path, year):
    """Read the clickstream file for ``year`` located under ``path``.

    The file name is ``CS_FN_SOURCE`` with its '<YEAR>' placeholder replaced
    by ``year`` (so ``year`` is expected to be a string, assuming
    ``CS_FN_SOURCE`` is one). The file is read with ``read_csv`` using the
    module-level ``COMPRESSION`` setting.

    Returns:
        Whatever ``read_csv`` returns, or None if anything fails; in that
        case the exception and an error line are printed instead of raised.
    """
    try:
        source_name = CS_FN_SOURCE.replace('<YEAR>', year)
        fn = os.path.join(path, source_name)
        clicks = read_csv(fn, index_col=None, compression=COMPRESSION)
        printf('{} loaded!'.format(fn))
    except Exception as ex:
        printf(ex)
        printf('ERROR: CS{} NOT loaded!'.format(year))
        return None
    return clicks
Example #4
0
    def _convert_DataFrame_to_DiGraph(self,
                                      df,
                                      nodes,
                                      min_session_length=MIN_SESSION_LENGTH):
        """Build ``self.H``, a weighted transition graph, from click rows.

        ``df`` is grouped by ('ip', '_sessionid'); groups with fewer than
        ``min_session_length`` rows are skipped. Within a group, rows are
        walked in order and each adjacent pair counts as one transition
        when all of the following hold:

        * the second row's ``_sequence`` equals the first row's plus one,
        * the two rows have different ``_concept`` values,
        * ``self.navitype`` is None or equals the second row's
          ``_navitype``.

        The counts become edge weights of a DiGraph, and ``self.H`` is set
        to a copy of the subgraph induced by ``nodes``.

        If an exception occurs while scanning the rows, it is printed along
        with an error line and the method returns None without touching
        ``self.H``.
        """
        transition_counts = defaultdict(int)

        try:
            for _, session in df.groupby(['ip', '_sessionid']):
                if len(session) < min_session_length:
                    continue

                prev_concept = None
                prev_seq = None

                for _, row in session.iterrows():
                    # A None concept doubles as the "no previous row yet"
                    # marker, so this row just (re)starts the walk.
                    if prev_concept is None:
                        prev_concept = row._concept
                        prev_seq = row._sequence
                        continue

                    cur_concept = row._concept
                    cur_seq = row._sequence

                    if cur_seq == (prev_seq + 1) and prev_concept != cur_concept:
                        if self.navitype is None or (self.navitype
                                                     == row._navitype):
                            transition_counts[(prev_concept, cur_concept)] += 1

                    # Slide the window: the current row becomes the previous.
                    prev_concept = cur_concept
                    prev_seq = cur_seq

        except Exception as ex:
            printf(ex)
            printf('ERROR converting dataframe to digraph')
            return

        full_graph = nx.DiGraph()
        weighted_edges = [(source, target, weight)
                          for (source, target), weight
                          in transition_counts.items()]
        full_graph.add_weighted_edges_from(weighted_edges)
        self.H = full_graph.subgraph(nodes).copy()
        del transition_counts
        del weighted_edges

        printf('{}-{}-{}: {} concepts found, but {} kept (cros-val)'.format(
            self.name, self.year, self.navitype, full_graph.number_of_nodes(),
            self.H.number_of_nodes()))
        del full_graph
Example #5
0
 def load_ontology(self):
     """Load the ontology file for ``self.name`` and build the graph.

     Looks in ``self._path`` for files whose name starts with ``self.name``
     and ends with ``ONTO_EXT`` and reads the first match with ``read_csv``.
     The resulting frame is handed to ``_convert_DataFrame_to_DiGraph`` and
     two attributes are then filled in:

     * ``self.sorted_nodes``: all nodes of ``self.G``, sorted.
     * ``self.lcc_sorted_nodes``: the nodes of the largest connected
       component of the undirected version of ``self.G``, sorted.

     If reading the file fails, the exception and an error line are printed
     and the method returns None without building the graph.

     Raises:
         ValueError: if no matching file exists in ``self._path``.
     """
     matches = [
         fn for fn in os.listdir(self._path)
         if fn.startswith(self.name) and fn.endswith(ONTO_EXT)
     ]
     if not matches:
         raise ValueError("Ontology file not found in {}".format(
             self._path))
     try:
         # NOTE(review): os.listdir() order is arbitrary, so with several
         # matching files the one picked here is not deterministic.
         fn = os.path.join(self._path, matches[0])
         df = read_csv(fn, index_col=False, compression=COMPRESSION)
         printf('{} loaded!'.format(fn))
     except Exception as ex:
         printf(ex)
         printf('ERROR: {}-{} NOT loaded!'.format(self.name, self.year))
         return
     self._convert_DataFrame_to_DiGraph(df)
     self.sorted_nodes = sorted(self.G.nodes())
     # connected_component_subgraphs() was removed in networkx 2.4;
     # connected_components() yields the same node sets and exists in both
     # old and new releases.
     largest_component = max(
         nx.connected_components(self.G.to_undirected()), key=len)
     self.lcc_sorted_nodes = sorted(largest_component)
Example #6
0
def main():
    """Print the label identifying this module ('class ontology')."""
    label = 'class ontology'
    printf(label)
Example #7
0
    def create_hops_matrices(self, path, maxk=5, lcc=False):
        """Iterate over the k-hop matrices of the undirected graph.

        Args:
            path: Passed to ``self.set_path_khop``.
            maxk: Upper bound on k, forwarded to
                ``get_khop_with_partial_results_load_previous``.
            lcc: Passed to ``self.set_lcc``; when truthy ``self.lcc_A`` is
                the adjacency matrix that must be loaded, otherwise
                ``self.A``.

        Returns:
            None if the required adjacency matrix is not loaded. Otherwise
            ``kdone``: it starts at 1 and is set to k after every hop whose
            sum is non-zero. When a hop sums to zero the loop stops, and
            ``kdone`` becomes k - 1 if that k is more than one step past the
            last recorded value.
        """
        self.set_lcc(lcc)
        self.set_path_khop(path)

        # Only the presence of the matrix is checked here; the hops are
        # computed from get_undirected_adjacency() below.
        adjacency = self.lcc_A if lcc else self.A
        if adjacency is None:
            printf('{}-{}-{}: Adjacency matrix is not loaded.'.format(
                self.name, self.year, self.submission_id))
            return

        uA = self.get_undirected_adjacency(lcc).tocsr().astype(
            np.int32, copy=False)  # undirected
        kdone = 1

        khops = get_khop_with_partial_results_load_previous(
            uA, maxk, self.get_khop)
        for k, hop in khops:
            # Computed once: it is needed for both the stop test and the log.
            hop_total = hop.sum()

            if hop_total == 0:
                printf('{}-{}-{}: {}-hop has reached zero!'.format(
                    self.name, self.year, self.submission_id, k))
                if (k - kdone) > 1:
                    kdone = k - 1
                break

            kdone = k

            printf('{}-{}-{}: {}-hop --> shape:{}, sum:{}!'.format(
                self.name, self.year, self.submission_id, k, hop.shape,
                hop_total))
            printf('{}-{}-{}: {}-hop saving...'.format(self.name, self.year,
                                                       self.submission_id, k))

            # NOTE(review): the save call below is disabled, so nothing is
            # written here even though the messages say "saving"/"done".
            # The file-name lookup is kept in case it has side effects.
            fn = self.get_khop_matrix_fn(k, lcc=lcc)
            #save_sparse_matrix(hop, path, fn)
            printf('{}-{}-{}: {}-hop done!'.format(self.name, self.year,
                                                   self.submission_id, k))
            printf('')

        return kdone
Example #8
0
 def __get_damping_factor__(self, alpha):
     if alpha is None:
         alpha = round(self.M.multiply(self.T).sum() / self.T.sum(), 2)
         printf('Empirical alpha (damping factor): {}'.format(alpha))
         return alpha
     return alpha
Example #9
0
def main():
    """Print the label identifying this module ('class clickstreams')."""
    label = 'class clickstreams'
    printf(label)