Example #1
0
    def run(self):
        """Assemble the sparse feature matrix from the inputs and emit it.

        Input 0 yields ``graphIndex`` and input 1 yields ``nodeIndex``; both
        are used as mappings from an identifier to an integer position (the
        row and the column of the matrix, respectively).  Every remaining
        input yields a mapping ``ID -> entry`` in which
        ``entry['kernel_bag']`` maps node labels to numeric values.  IDs that
        are missing from ``graphIndex`` and labels that are missing from
        ``nodeIndex`` are skipped.

        The collected values form a sparse matrix whose shape is one more
        than the largest row/column position actually seen.  If
        ``self.tfidf`` is truthy the matrix is passed through a sublinear
        TF-IDF transform.  The nonzero entries are then emitted as parallel
        ``rows`` / ``columns`` / ``data`` lists together with both indices
        and the matrix shape.
        """
        # Resolve the inputs once; only len() and integer indexing are
        # assumed of the returned container.
        inputs = self.input()

        with inputs[0] as source:
            graphIndex = source.query()

        with inputs[1] as source:
            nodeIndex = source.query()

        rows = []
        columns = []
        values = []
        has_float = False

        max_row = 0
        max_col = 0

        for position in range(2, len(inputs)):
            with inputs[position] as source:
                bags = source.query()
            for ID, entry in bags.items():
                if ID not in graphIndex:
                    continue
                g_pos = graphIndex[ID]
                # A known graph widens the matrix even if none of its
                # labels end up being stored.
                max_row = max(max_row, g_pos)
                for label, value in entry['kernel_bag'].items():
                    if label not in nodeIndex:
                        continue
                    n_pos = nodeIndex[label]
                    max_col = max(max_col, n_pos)
                    has_float = has_float or isinstance(value, float)
                    rows.append(g_pos)
                    columns.append(n_pos)
                    values.append(value)

        # Stay with unsigned integers unless at least one value is a float.
        dtype = np.float64 if has_float else np.uint64

        # Converting COO -> CSR sums values that landed on the same cell,
        # e.g. the same (graph, label) pair delivered by several inputs.
        phi = coo_matrix((values, (rows, columns)),
                         shape=(max_row + 1, max_col + 1),
                         dtype=dtype).tocsr()

        if self.tfidf:
            phi = TfidfTransformer(sublinear_tf=True).fit_transform(phi)

        # The COO view exposes (row, column, value) triplets directly; the
        # mask drops explicitly stored zeros so only true nonzeros are kept.
        triplets = phi.tocoo()
        keep = triplets.data != 0
        n_rows, n_cols = phi.shape

        out = {
            'graphIndex': graphIndex,
            'nodeIndex': nodeIndex,
            'rows': triplets.row[keep].tolist(),
            'columns': triplets.col[keep].tolist(),
            'data': triplets.data[keep].tolist(),
            'row_shape': n_rows,
            'column_shape': n_cols
        }

        with self.output() as sink:
            sink.emit(out)