Esempio n. 1
0
 def load_phlist(self, wordlist):
     """Load the words from an existing PyHuskyList.

     Builds a ``Word#load_phlist_py`` operation that depends on
     ``wordlist.pending_op``, runs it through the scheduler and then marks
     this instance as loaded.

     :param wordlist: a PyHuskyList; its pending operation becomes the
         dependency of the load operation.
     """
     assert self._loaded == False, "The words were already loaded"
     assert isinstance(wordlist, PyHuskyList)
     load_op = Operation(
         "Word#load_phlist_py",
         {OperationParam.list_str: self.list_name},
         [wordlist.pending_op])
     scheduler.compute(load_op)
     self._loaded = True
Esempio n. 2
0
 def load_edgelist_phlist(self, edgelist):
     """Load the graph from ``edgelist``, which must be a PyHuskyList.

     Runs a ``Graph#load_edgelist_phlist_py`` operation that depends on
     ``edgelist.pending_op`` and flags the graph as loaded afterwards.

     :param edgelist: a PyHuskyList whose pending operation is used as the
         dependency of the load operation.
     """
     assert self._loaded == False, "The graph was already loaded"
     assert isinstance(edgelist, PyHuskyList)
     list_param = {OperationParam.list_str: self.list_name}
     dependencies = [edgelist.pending_op]
     scheduler.compute(
         Operation("Graph#load_edgelist_phlist_py", list_param, dependencies))
     self._loaded = True
Esempio n. 3
0
 def wordcount(self):
     """Compute the word count using the C++ library.

     The words must already be loaded. Once the scheduler has run the
     ``Word#wordcount_py`` operation the instance is marked as computed.
     """
     assert self._loaded == True, "Words are not loaded"
     wordcount_op = Operation(
         "Word#wordcount_py",
         {OperationParam.list_str: self.list_name, "Type": "cpp"},
         [])
     scheduler.compute(wordcount_op)
     self._computed = True
Esempio n. 4
0
 def write_to_hdfs(self, url):
     """Run a ``Functional#write_to_hdfs_py`` operation for this list.

     The operation receives ``url`` and this list's name as parameters and
     depends on the list's current pending operation.

     :param url: destination passed to the operation under
         ``OperationParam.url_str``.
     :returns: None.
     """
     write_op = Operation(
         "Functional#write_to_hdfs_py",
         {
             OperationParam.url_str: url,
             OperationParam.list_str: self.list_name,
         },
         [self.pending_op])
     compute(write_op)
Esempio n. 5
0
 def uncache(self):
     """Drop the cached data of this list, if it is materialized.

     Nothing happens when ``pending_op.is_materialized`` is already
     ``False``. Otherwise a ``Functional#uncache_py`` operation is computed
     and the pending operation is flagged as not materialized.

     :returns: None.
     """
     if self.pending_op.is_materialized is not False:
         uncache_op = Operation(
             "Functional#uncache_py",
             {OperationParam.list_str: self.list_name},
             [])
         compute(uncache_op)
         self.pending_op.is_materialized = False
     return None
Esempio n. 6
0
 def cache(self):
     """Materialize this list unless it is already materialized.

     When ``pending_op.is_materialized`` is not ``True`` a
     ``Functional#cache_py`` operation depending on the pending operation
     is computed and the flag is set.

     :returns: this list, so calls can be chained.
     """
     if self.pending_op.is_materialized is not True:
         cache_op = Operation(
             "Functional#cache_py",
             {OperationParam.list_str: self.list_name},
             [self.pending_op])
         compute(cache_op)
         self.pending_op.is_materialized = True
     return self
Esempio n. 7
0
 def topk(self, k):
     """Collect the result of the ``Word#wordcount_topk_py`` operation.

     The word count must have been computed beforehand.

     :param k: passed to the operation as the string parameter ``"k"``.
     :returns: whatever ``scheduler.compute_collect`` returns for the
         operation.
     """
     assert self._computed == True, "You haven't computed wordcount"
     topk_op = Operation(
         "Word#wordcount_topk_py",
         {
             "k": str(k),
             OperationParam.list_str: self.list_name,
             "Type": "cpp",
         },
         [])
     return scheduler.compute_collect(topk_op)
Esempio n. 8
0
 def concat(self, other_list):
     """Return a new PyHuskyList for the concatenation with ``other_list``.

     The new list's pending operation is ``Functional#concat_py`` and
     depends on the pending operations of both input lists; nothing is
     computed here.

     :param other_list: the PyHuskyList to concatenate with this one.
     :returns: the new PyHuskyList, or ``NotImplemented`` when
         ``other_list`` is not a PyHuskyList.
     """
     if not isinstance(other_list, PyHuskyList):
         return NotImplemented
     result = PyHuskyList()
     result.pending_op = Operation(
         "Functional#concat_py",
         {OperationParam.list_str: result.list_name},
         [self.pending_op, other_list.pending_op])
     return result
Esempio n. 9
0
 def compute_pagerank(self, iter):
     """Run the ``Graph#pagerank_py`` operation on the loaded graph.

     The graph must already be loaded; afterwards the instance is marked
     as computed.

     :param iter: passed to the operation as the string parameter
         ``"iter"``.
     """
     assert self._loaded == True, "The graph is not loaded"
     pagerank_op = Operation(
         "Graph#pagerank_py",
         {
             "iter": str(iter),
             OperationParam.list_str: self.list_name,
             "Type": "cpp",
         },
         [])
     scheduler.compute(pagerank_op)
     self._computed = True
Esempio n. 10
0
 def topk_pagerank(self, k):
     """Collect the result of the ``Graph#pagerank_topk_py`` operation.

     Pagerank must have been computed beforehand.

     :param k: passed to the operation as the string parameter ``"k"``.
     :returns: the value returned by ``scheduler.compute_collect``.
     """
     assert self._computed == True, "You haven't computed Pagerank"
     request = Operation(
         "Graph#pagerank_topk_py",
         {
             "k": str(k),
             OperationParam.list_str: self.list_name,
             "Type": "cpp",
         },
         [])
     return scheduler.compute_collect(request)
Esempio n. 11
0
 def reduce(self, func):
     """Compute and collect a ``Functional#reduce_py`` operation.

     :param func: callable passed to the operation under
         ``OperationParam.lambda_str``.
     :returns: the value returned by ``compute_collect``, or
         ``NotImplemented`` when ``func`` has no ``__call__`` attribute.
     """
     if not hasattr(func, '__call__'):
         return NotImplemented
     reduce_op = Operation(
         "Functional#reduce_py",
         {
             OperationParam.lambda_str: func,
             OperationParam.list_str: self.list_name,
         },
         [self.pending_op])
     return compute_collect(reduce_op)
Esempio n. 12
0
 def load(self, path=None):
     """Set the pending operation to an HDFS ``Functional#load_py`` load.

     Nothing is computed here; the operation is only recorded on the
     instance with this object's host, port and list name.

     :param path: path to load; must not be ``None``.
     :returns: this object, so calls can be chained.
     """
     assert path is not None
     self.pending_op = Operation(
         "Functional#load_py",
         {
             "Protocol": "hdfs",
             "Host": self.host,
             "Port": self.port,
             "Path": path,
             OperationParam.list_str: self.list_name,
         },
         [])
     return self
Esempio n. 13
0
 def foreach(self, func):
     """Compute a ``Functional#foreach_py`` operation with ``func``.

     :param func: callable passed to the operation under
         ``OperationParam.lambda_str``.
     :returns: ``None`` after the operation was computed, or
         ``NotImplemented`` when ``func`` has no ``__call__`` attribute.
     """
     if not hasattr(func, '__call__'):
         return NotImplemented
     foreach_op = Operation(
         "Functional#foreach_py",
         {
             OperationParam.lambda_str: func,
             OperationParam.list_str: self.list_name,
         },
         [self.pending_op])
     compute(foreach_op)
Esempio n. 14
0
    def load_pyhlist(self, xy_list):
        """Load the model's data from a PyHuskyList.

        Stores a ``SVMModel#SVM_load_pyhlist_py`` operation (depending on
        ``xy_list.pending_op``) as the pending operation, computes it and
        marks the model as loaded.

        :param xy_list: PyHuskyList used as the dependency of the load.
        :returns: ``None`` on success, ``NotImplemented`` when ``xy_list``
            is not a PyHuskyList.
        """
        assert not self.loaded

        if not isinstance(xy_list, PyHuskyList):
            return NotImplemented
        self.pending_op = Operation(
            "SVMModel#SVM_load_pyhlist_py",
            {OperationParam.list_str: self.list_name},
            [xy_list.pending_op])
        scheduler.compute(self.pending_op)
        self.loaded = True
Esempio n. 15
0
 def load_hdfs(self, url):
     """Load the words through a ``Word#load_hdfs_py`` operation.

     The operation is computed immediately and the instance is then marked
     as loaded.

     :param url: string passed to the operation as the ``"url"`` parameter.
     :raises AssertionError: if the words were already loaded or ``url``
         is not a string.
     """
     assert self._loaded == False, "The words were already loaded"
     # isinstance (rather than an exact type check) also accepts str
     # subclasses and matches the validation used by the other loaders.
     assert isinstance(url, str)
     param = {
         OperationParam.list_str: self.list_name,
         "url": url,
         "Type": "cpp"
     }
     op = Operation("Word#load_hdfs_py", param, [])
     scheduler.compute(op)
     self._loaded = True
Esempio n. 16
0
 def load_adjlist_hdfs(self, url):
     """Load the graph through a ``Graph#load_adjlist_hdfs_py`` operation.

     The operation is computed immediately and the instance is then marked
     as loaded.

     :param url: string passed to the operation as the ``"url"`` parameter.
     :raises AssertionError: if ``url`` is not a string or the graph was
         already loaded.
     """
     # isinstance (rather than an exact type check) also accepts str
     # subclasses and matches the validation used by the other loaders.
     assert isinstance(url, str)
     assert self._loaded == False, "The graph was already loaded"
     param = {
         OperationParam.list_str: self.list_name,
         "url": url,
         "Type": "cpp"
     }
     op = Operation("Graph#load_adjlist_hdfs_py", param, [])
     scheduler.compute(op)
     self._loaded = True
Esempio n. 17
0
 def filter(self, func):
     """Return a new PyHuskyList with a pending ``Functional#filter_py`` op.

     Nothing is computed here; the new list's pending operation depends on
     this list's pending operation.

     :param func: callable passed to the operation under
         ``OperationParam.lambda_str``.
     :returns: the new PyHuskyList, or ``NotImplemented`` when ``func`` has
         no ``__call__`` attribute.
     """
     if not hasattr(func, '__call__'):
         return NotImplemented
     filtered = PyHuskyList()
     filtered.pending_op = Operation(
         "Functional#filter_py",
         {
             OperationParam.lambda_str: func,
             OperationParam.list_str: filtered.list_name,
         },
         [self.pending_op])
     return filtered
Esempio n. 18
0
    def load_hdfs(self, url):
        """Load the model's data via ``SVMModel#SVM_load_hdfs_py``.

        The operation is computed immediately; afterwards the model is
        marked as loaded.

        :param url: string passed to the operation as ``"url"``.
        """
        assert isinstance(url, str)
        assert not self.loaded

        load_op = Operation(
            "SVMModel#SVM_load_hdfs_py",
            {
                "url": url,
                OperationParam.list_str: self.list_name,
                "Type": "cpp",
            },
            [])
        scheduler.compute(load_op)
        self.loaded = True
Esempio n. 19
0
def parallelize(data):
    """Wrap a Python list in a new PyHuskyList.

    The list is pickled with ``cPickle`` and attached as the data parameter
    of a pending ``Functional#parallelize_py`` operation; nothing is
    computed here.

    :param data: the list to distribute.
    :returns: the new PyHuskyList, or ``NotImplemented`` when ``data`` is
        not a ``list``.
    """
    if not isinstance(data, list):
        return NotImplemented
    result = PyHuskyList()
    pickled = cPickle.dumps(data)
    result.pending_op = Operation(
        "Functional#parallelize_py",
        {
            OperationParam.data_str: pickled,
            OperationParam.list_str: result.list_name,
        },
        [])
    return result
Esempio n. 20
0
    def train(self, n_iter=10, alpha=0.1, is_sparse=1):
        """Train the model and store the collected parameters.

        A ``LogisticRegressionModel#LogisticR_train_py`` operation becomes
        the new pending operation. If the model was already trained it
        depends on a fresh ``LogisticR_init_py`` operation, otherwise on the
        previous pending operation. The collected result is split into
        ``self.param`` (all but the last value, as a numpy array) and
        ``self.intercept`` (the last value).

        :param n_iter: int, sent to the operation as the string ``"n_iter"``.
        :param alpha: float, sent to the operation as the string ``"alpha"``.
        :param is_sparse: sent to the operation as the string
            ``"is_sparse"``.
        :raises AssertionError: if the model is not loaded, ``n_iter`` is
            not an int or ``alpha`` is not a float.
        """
        assert self.loaded
        assert isinstance(n_iter, int)
        assert isinstance(alpha, float)

        if self.trained:
            deps = [Operation("LogisticRegressionModel#LogisticR_init_py",
                              {"Type": "cpp"})]
        else:
            deps = [self.pending_op]

        self.pending_op = Operation(
            "LogisticRegressionModel#LogisticR_train_py",
            {
                "n_iter": str(n_iter),
                "alpha": str(alpha),
                OperationParam.list_str: self.list_name,
                "is_sparse": str(is_sparse),
                "Type": "cpp"
            },
            deps
        )

        # Parenthesised form prints the same on Python 2 and is also valid
        # Python 3 syntax (the bare print statement is not).
        print(self.pending_op.op_deps)
        paramlist = scheduler.compute_collect(self.pending_op)
        self.param = np.array(paramlist[:-1])
        self.intercept = paramlist[-1]
        self.trained = True
Esempio n. 21
0
 def difference(self, other_list):
     """Return a new PyHuskyList for the difference with ``other_list``.

     Both input pending operations are tagged in place: a
     ``<new list name>_diffl`` entry is added to this list's ``op_param``
     and a ``<new list name>_diffr`` entry to the other list's. The new
     list's pending operation is ``Functional#difference_py`` and depends
     on both inputs; nothing is computed here.

     :param other_list: the PyHuskyList used as the right-hand operand.
     :returns: the new PyHuskyList, or ``NotImplemented`` when
         ``other_list`` is not a PyHuskyList.
     """
     if not isinstance(other_list, PyHuskyList):
         return NotImplemented
     result = PyHuskyList()
     list_param = {OperationParam.list_str: result.list_name}
     # Tag each dependency with the side it plays for this result list.
     self.pending_op.op_param[result.list_name + "_diffl"] = "dummy"
     other_list.pending_op.op_param[result.list_name + "_diffr"] = "dummy"
     result.pending_op = Operation(
         "Functional#difference_py",
         list_param,
         [self.pending_op, other_list.pending_op])
     return result
Esempio n. 22
0
 def load(self, database=None, collection=None):
     """Set the pending operation to a MongoDB ``Functional#load_py`` load.

     Nothing is computed here; the operation is only recorded on the
     instance, using this object's host, port, user, password and list
     name.

     :param database: database name; must not be ``None``.
     :param collection: collection name; must not be ``None``.
     :returns: this object, so calls can be chained.
     """
     assert database is not None and collection is not None
     server = '{}:{}'.format(self.host, self.port)
     self.pending_op = Operation(
         "Functional#load_py",
         {
             "Protocol": "mongodb",
             "Server": server,
             "Database": database,
             "Collection": collection,
             "Username": self.user,
             "Password": self.pwd,
             OperationParam.list_str: self.list_name,
         },
         [])
     return self
Esempio n. 23
0
    def load_op(self):
        """
        Load an operation without its dependencies.

        Reads the operation name, then a 64-bit parameter count, then that
        many (key, value) string pairs, and builds an Operation from them.
        """
        name = self.load_str()
        n_params = self.load_int64()
        params = {}
        for _ in xrange(n_params):
            # The key is read before the value, one pair per iteration.
            key = self.load_str()
            params[key] = self.load_str()

        return Operation(name, params)
Esempio n. 24
0
    def load_hdfs(self, url, is_sparse=0, fmat="tsv"):
        """Load the data via ``LinearRegressionModel#LinearR_load_hdfs_py``.

        The operation is computed immediately; afterwards the model is
        marked as loaded.

        :param url: string passed to the operation as ``"url"``.
        :param is_sparse: sent to the operation as the string
            ``"is_sparse"``.
        :param fmat: passed to the operation as ``"format"``.
        """
        assert isinstance(url, str)
        assert not self.loaded

        load_op = Operation(
            "LinearRegressionModel#LinearR_load_hdfs_py",
            {
                "url": url,
                OperationParam.list_str: self.list_name,
                "is_sparse": str(is_sparse),
                "format": fmat,
                "Type": "cpp",
            },
            [])
        scheduler.compute(load_op)
        self.loaded = True
Esempio n. 25
0
    def __init__(self, n_feature=-1):
        """Set up an untrained model and record its init operation.

        The init operation is only stored as ``pending_op``; it is not
        computed here.

        :param n_feature: int, sent to the operation as the string
            ``"n_feature"``.
        """
        assert isinstance(n_feature, int)
        super(LogisticRegressionModel, self).__init__()

        self.param = None
        self.intercept = None
        self.loaded = False
        self.trained = False
        # The suffix spelling is kept exactly as-is: it is part of the list
        # name that is sent with the operation.
        self.list_name += "LogisticRgression"

        self.pending_op = Operation(
            "LogisticRegressionModel#LogisticR_init_py",
            {
                "n_feature": str(n_feature),
                OperationParam.list_str: self.list_name,
                "Type": "cpp",
            },
            [])
Esempio n. 26
0
    def load_pyhlist(self, xy_list, is_sparse=1):
        """Load the model's data from a PyHuskyList.

        Stores a ``LogisticRegressionModel#LogisticR_load_pyhlist_py``
        operation (depending on ``xy_list.pending_op``) as the pending
        operation, computes it and marks the model as loaded.

        :param xy_list: PyHuskyList used as the dependency of the load.
        :param is_sparse: sent to the operation as the string
            ``"is_sparse"``.
        :returns: ``None`` on success, ``NotImplemented`` when ``xy_list``
            is not a PyHuskyList.
        """
        assert not self.loaded

        if not isinstance(xy_list, PyHuskyList):
            return NotImplemented
        self.pending_op = Operation(
            "LogisticRegressionModel#LogisticR_load_pyhlist_py",
            {
                OperationParam.list_str: self.list_name,
                "is_sparse": str(is_sparse),
            },
            [xy_list.pending_op])
        scheduler.compute(self.pending_op)
        self.loaded = True
Esempio n. 27
0
    def __load_pyhlist(self, xy_list, **kwargs):
        """Record a load-from-PyHuskyList operation as the pending op.

        The new ``LogisticRegressionModel#LogisticR_load_pyhlist_py``
        operation depends on the previous pending operation and is not
        computed here; the model is marked as loaded.

        :param xy_list: must be a PyHuskyList.
        :param kwargs: optional ``is_sparse`` (defaults to 1), sent to the
            operation as a string.
        :raises NotImplementedError: if ``xy_list`` is not a PyHuskyList.
        """
        assert not self.loaded

        sparse_flag = kwargs.get("is_sparse", 1)

        if not isinstance(xy_list, PyHuskyList):
            raise NotImplementedError
        # NOTE(review): xy_list is only type-checked; its pending_op is not
        # used as a dependency here - confirm this is intended.
        load_params = {
            OperationParam.list_str: self.list_name,
            "is_sparse": str(sparse_flag),
        }
        self.pending_op = Operation(
            "LogisticRegressionModel#LogisticR_load_pyhlist_py",
            load_params,
            [self.pending_op])
        self.loaded = True
Esempio n. 28
0
def load(path):
    """Create a PyHuskyList whose pending operation loads ``path``.

    The protocol is chosen from the path prefix: ``nfs:`` selects "nfs"
    and ``hdfs`` selects "hdfs". The path is validated before the list is
    constructed, so an unsupported path fails without creating a
    PyHuskyList. Nothing is computed here.

    :param path: location to load; must start with ``nfs:`` or ``hdfs``.
    :returns: a new PyHuskyList with a pending ``Functional#load_py``
        operation.
    :raises ValueError: if the protocol cannot be resolved from ``path``
        (still caught by callers handling ``Exception``).
    """
    if path.startswith("nfs:"):
        protocol = "nfs"
    elif path.startswith("hdfs"):
        protocol = "hdfs"
    else:
        raise ValueError("ERROR: Cannot resolve the protocol of the load path")

    # In this case the list represents a list of std::string
    hlist = PyHuskyList()
    param = {
        "Type": "cpp",
        "Path": path,
        OperationParam.list_str: hlist.list_name,
        "Protocol": protocol,
    }
    hlist.pending_op = Operation("Functional#load_py", param, [])
    return hlist
Esempio n. 29
0
    def train(self, n_iter=10, alpha=0.1):
        """Train the SVM model and store the collected parameters.

        Runs ``SVMModel#SVM_train_py`` through the scheduler. The collected
        list is split into ``self.param`` (all but the last value, as a
        numpy array) and ``self.intercept`` (the last value). Afterwards
        the model is flagged as trained and as no longer loaded.

        :param n_iter: int, sent to the operation as the string ``"n_iter"``.
        :param alpha: float, sent to the operation as the string ``"alpha"``.
        """
        assert self.loaded
        assert isinstance(n_iter, int)
        assert isinstance(alpha, float)

        train_op = Operation(
            "SVMModel#SVM_train_py",
            {
                "n_iter": str(n_iter),
                "alpha": str(alpha),
                OperationParam.list_str: self.list_name,
                "Type": "cpp",
            },
            [])
        collected = scheduler.compute_collect(train_op)
        self.param = np.array(collected[:-1])
        self.intercept = collected[-1]
        self.loaded = False
        self.trained = True
Esempio n. 30
0
    def __init__(self, n_feature=-1):
        """Set up an untrained SVM model and compute its init operation.

        Unlike a deferred operation, ``SVMModel#SVM_init_py`` is computed by
        the scheduler right away.

        :param n_feature: int, sent to the operation as the string
            ``"n_feature"``.
        """
        assert isinstance(n_feature, int)
        super(SVMModel, self).__init__()

        self.param = None
        self.intercept = None
        self.loaded = False
        self.trained = False
        self.list_name += "SVM"

        init_op = Operation(
            "SVMModel#SVM_init_py",
            {
                "n_feature": str(n_feature),
                OperationParam.list_str: self.list_name,
                "Type": "cpp",
            },
            [])
        scheduler.compute(init_op)