def load_phlist(self, wordlist):
    """Load this object's words from an existing PyHuskyList.

    Builds a ``Word#load_phlist_py`` operation that depends on
    ``wordlist.pending_op``, passes it to ``scheduler.compute`` and then
    marks this object as loaded.

    :param wordlist: ``PyHuskyList`` whose pending operation is the input.
    :raises AssertionError: if the words were already loaded, or if
        ``wordlist`` is not a ``PyHuskyList``.
    """
    assert self._loaded == False, "The words were already loaded"
    assert isinstance(wordlist, PyHuskyList)
    load_task = Operation(
        "Word#load_phlist_py",
        {OperationParam.list_str: self.list_name},
        [wordlist.pending_op],
    )
    scheduler.compute(load_task)
    self._loaded = True
def load_edgelist_phlist(self, edgelist):
    """Load this graph from an edge list held in a PyHuskyList.

    Builds a ``Graph#load_edgelist_phlist_py`` operation that depends on
    ``edgelist.pending_op``, passes it to ``scheduler.compute`` and then
    marks the graph as loaded.

    :param edgelist: ``PyHuskyList`` whose pending operation is the input.
    :raises AssertionError: if the graph was already loaded, or if
        ``edgelist`` is not a ``PyHuskyList``.
    """
    assert self._loaded == False, "The graph was already loaded"
    assert isinstance(edgelist, PyHuskyList)
    load_task = Operation(
        "Graph#load_edgelist_phlist_py",
        {OperationParam.list_str: self.list_name},
        [edgelist.pending_op],
    )
    scheduler.compute(load_task)
    self._loaded = True
def wordcount(self):
    """Run the word count on the loaded words using the C++ library.

    Sends a ``Word#wordcount_py`` operation (tagged ``"Type": "cpp"``) to
    ``scheduler.compute`` and then marks this object as computed.

    :raises AssertionError: if the words have not been loaded yet.
    """
    assert self._loaded == True, "Words are not loaded"
    count_task = Operation(
        "Word#wordcount_py",
        {OperationParam.list_str: self.list_name, "Type": "cpp"},
        [],
    )
    scheduler.compute(count_task)
    self._computed = True
def write_to_hdfs(self, url):
    """Request that this list be written to ``url`` on HDFS.

    Builds a ``Functional#write_to_hdfs_py`` operation on top of this
    list's pending operation and hands it to ``compute``.

    :param url: destination, sent under ``OperationParam.url_str``.
    :returns: ``None``.
    """
    settings = {}
    settings[OperationParam.url_str] = url
    settings[OperationParam.list_str] = self.list_name
    compute(Operation("Functional#write_to_hdfs_py", settings,
                      [self.pending_op]))
    return None
def uncache(self):
    """Release the materialized copy of this list, if there is one.

    Does nothing when ``pending_op.is_materialized`` is already ``False``.
    Otherwise a ``Functional#uncache_py`` operation is handed to
    ``compute`` and the flag is reset to ``False``.

    :returns: ``None`` in both cases.
    """
    if self.pending_op.is_materialized is not False:
        uncache_task = Operation(
            "Functional#uncache_py",
            {OperationParam.list_str: self.list_name},
            [],
        )
        compute(uncache_task)
        self.pending_op.is_materialized = False
    return None
def cache(self):
    """Materialize this list unless it is already materialized.

    When ``pending_op.is_materialized`` is not ``True``, a
    ``Functional#cache_py`` operation depending on the pending operation
    is handed to ``compute`` and the flag is set to ``True``.

    :returns: ``self``, so calls can be chained.
    """
    if self.pending_op.is_materialized is not True:
        cache_task = Operation(
            "Functional#cache_py",
            {OperationParam.list_str: self.list_name},
            [self.pending_op],
        )
        compute(cache_task)
        self.pending_op.is_materialized = True
    return self
def topk(self, k):
    """Collect the top ``k`` entries of a previously computed word count.

    :param k: number of entries requested; sent to the backend as ``str(k)``.
    :returns: whatever ``scheduler.compute_collect`` returns for the
        ``Word#wordcount_topk_py`` operation.
    :raises AssertionError: if the word count has not been computed yet.
    """
    assert self._computed == True, "You haven't computed wordcount"
    query = {
        "k": str(k),
        OperationParam.list_str: self.list_name,
        "Type": "cpp",
    }
    return scheduler.compute_collect(
        Operation("Word#wordcount_topk_py", query, []))
def concat(self, other_list):
    """Build a new list representing this list concatenated with another.

    Nothing is computed here: the returned list only carries a pending
    ``Functional#concat_py`` operation depending on both inputs.

    :param other_list: the ``PyHuskyList`` to concatenate with.
    :returns: a new ``PyHuskyList``, or ``NotImplemented`` when
        ``other_list`` is not a ``PyHuskyList``.
    """
    if not isinstance(other_list, PyHuskyList):
        return NotImplemented
    merged = PyHuskyList()
    merged.pending_op = Operation(
        "Functional#concat_py",
        {OperationParam.list_str: merged.list_name},
        [self.pending_op, other_list.pending_op],
    )
    return merged
def compute_pagerank(self, iter):
    """Run PageRank on the loaded graph through the C++ backend.

    Sends a ``Graph#pagerank_py`` operation to ``scheduler.compute`` and
    then marks the graph as computed.

    :param iter: iteration count; sent to the backend as ``str(iter)``.
    :raises AssertionError: if the graph has not been loaded.
    """
    assert self._loaded == True, "The graph is not loaded"
    settings = {
        "iter": str(iter),
        OperationParam.list_str: self.list_name,
        "Type": "cpp",
    }
    scheduler.compute(Operation("Graph#pagerank_py", settings, []))
    self._computed = True
def topk_pagerank(self, k):
    """Collect the top ``k`` entries of a previously computed PageRank.

    :param k: number of entries requested; sent to the backend as ``str(k)``.
    :returns: whatever ``scheduler.compute_collect`` returns for the
        ``Graph#pagerank_topk_py`` operation.
    :raises AssertionError: if PageRank has not been computed yet.
    """
    assert self._computed == True, "You haven't computed Pagerank"
    query = {
        "k": str(k),
        OperationParam.list_str: self.list_name,
        "Type": "cpp",
    }
    return scheduler.compute_collect(
        Operation("Graph#pagerank_topk_py", query, []))
def reduce(self, func):
    """Reduce this list with ``func`` and collect the result.

    :param func: a callable, sent under ``OperationParam.lambda_str``.
    :returns: the value returned by ``compute_collect`` for the
        ``Functional#reduce_py`` operation, or ``NotImplemented`` when
        ``func`` has no ``__call__`` attribute.
    """
    if not hasattr(func, '__call__'):
        return NotImplemented
    request = {
        OperationParam.lambda_str: func,
        OperationParam.list_str: self.list_name,
    }
    return compute_collect(
        Operation("Functional#reduce_py", request, [self.pending_op]))
def load(self, path=None):
    """Point this list at an HDFS path.

    Nothing is computed here: the list's pending operation is replaced by
    a ``Functional#load_py`` operation carrying the ``hdfs`` protocol,
    this object's host and port, and ``path``.

    :param path: path to load; must not be ``None``.
    :returns: ``self``, so calls can be chained.
    :raises AssertionError: if ``path`` is ``None``.
    """
    assert path is not None
    self.pending_op = Operation(
        "Functional#load_py",
        {
            "Protocol": "hdfs",
            "Host": self.host,
            "Port": self.port,
            "Path": path,
            OperationParam.list_str: self.list_name,
        },
        [],
    )
    return self
def foreach(self, func):
    """Apply ``func`` to this list via a ``Functional#foreach_py`` operation.

    :param func: a callable, sent under ``OperationParam.lambda_str``.
    :returns: ``None`` after the operation has been handed to ``compute``,
        or ``NotImplemented`` when ``func`` has no ``__call__`` attribute.
    """
    if not hasattr(func, '__call__'):
        return NotImplemented
    request = {
        OperationParam.lambda_str: func,
        OperationParam.list_str: self.list_name,
    }
    compute(Operation("Functional#foreach_py", request, [self.pending_op]))
def load_pyhlist(self, xy_list):
    """Load the SVM model's input from a PyHuskyList.

    Replaces this model's pending operation with an
    ``SVMModel#SVM_load_pyhlist_py`` operation depending on
    ``xy_list.pending_op``, passes it to ``scheduler.compute`` and marks
    the model as loaded.

    :param xy_list: ``PyHuskyList`` providing the input.
    :returns: ``None`` on success, or ``NotImplemented`` when ``xy_list``
        is not a ``PyHuskyList``.
    :raises AssertionError: if data was already loaded.
    """
    assert not self.loaded
    if not isinstance(xy_list, PyHuskyList):
        return NotImplemented
    self.pending_op = Operation(
        "SVMModel#SVM_load_pyhlist_py",
        {OperationParam.list_str: self.list_name},
        [xy_list.pending_op],
    )
    scheduler.compute(self.pending_op)
    self.loaded = True
def load_hdfs(self, url):
    """Load the words from ``url`` through the C++ backend.

    Sends a ``Word#load_hdfs_py`` operation (tagged ``"Type": "cpp"``) to
    ``scheduler.compute`` and then marks this object as loaded.

    :param url: input location as a string; ``str`` subclasses are accepted.
    :raises AssertionError: if the words were already loaded, or if
        ``url`` is not a string.
    """
    assert self._loaded == False, "The words were already loaded"
    # isinstance also admits str subclasses, which an exact type()
    # comparison would reject.
    assert isinstance(url, str)
    param = {
        OperationParam.list_str: self.list_name,
        "url": url,
        "Type": "cpp",
    }
    op = Operation("Word#load_hdfs_py", param, [])
    scheduler.compute(op)
    self._loaded = True
def load_adjlist_hdfs(self, url):
    """Load the graph from an adjacency list at ``url`` via the C++ backend.

    Sends a ``Graph#load_adjlist_hdfs_py`` operation (tagged
    ``"Type": "cpp"``) to ``scheduler.compute`` and then marks the graph
    as loaded.

    :param url: input location as a string; ``str`` subclasses are accepted.
    :raises AssertionError: if ``url`` is not a string, or if the graph
        was already loaded.
    """
    # isinstance also admits str subclasses, which an exact type()
    # comparison would reject.
    assert isinstance(url, str)
    assert self._loaded == False, "The graph was already loaded"
    param = {
        OperationParam.list_str: self.list_name,
        "url": url,
        "Type": "cpp",
    }
    op = Operation("Graph#load_adjlist_hdfs_py", param, [])
    scheduler.compute(op)
    self._loaded = True
def filter(self, func):
    """Build a new list holding a pending filter of this list by ``func``.

    Nothing is computed here: the returned list only carries a pending
    ``Functional#filter_py`` operation depending on this list.

    :param func: a callable, sent under ``OperationParam.lambda_str``.
    :returns: a new ``PyHuskyList``, or ``NotImplemented`` when ``func``
        has no ``__call__`` attribute.
    """
    if not hasattr(func, '__call__'):
        return NotImplemented
    filtered = PyHuskyList()
    filtered.pending_op = Operation(
        "Functional#filter_py",
        {
            OperationParam.lambda_str: func,
            OperationParam.list_str: filtered.list_name,
        },
        [self.pending_op],
    )
    return filtered
def load_hdfs(self, url):
    """Load the SVM model's input from ``url`` through the C++ backend.

    Sends an ``SVMModel#SVM_load_hdfs_py`` operation (tagged
    ``"Type": "cpp"``) to ``scheduler.compute`` and then marks the model
    as loaded.

    :param url: input location; must be a ``str``.
    :raises AssertionError: if ``url`` is not a ``str``, or if data was
        already loaded.
    """
    assert isinstance(url, str)
    assert not self.loaded
    load_task = Operation(
        "SVMModel#SVM_load_hdfs_py",
        {
            "url": url,
            OperationParam.list_str: self.list_name,
            "Type": "cpp",
        },
        [],
    )
    scheduler.compute(load_task)
    self.loaded = True
def parallelize(data):
    """Wrap an in-memory Python list in a new PyHuskyList.

    The list is pickled with ``cPickle.dumps`` and attached to a pending
    ``Functional#parallelize_py`` operation; nothing is computed here.

    :param data: the ``list`` to distribute.
    :returns: a new ``PyHuskyList``, or ``NotImplemented`` when ``data``
        is not a ``list``.
    """
    if not isinstance(data, list):
        return NotImplemented
    hlist = PyHuskyList()
    payload = cPickle.dumps(data)
    hlist.pending_op = Operation(
        "Functional#parallelize_py",
        {
            OperationParam.data_str: payload,
            OperationParam.list_str: hlist.list_name,
        },
        [],
    )
    return hlist
def train(self, n_iter=10, alpha=0.1, is_sparse=1):
    """Train the logistic regression model and store the fitted values.

    Replaces the pending operation with a
    ``LogisticRegressionModel#LogisticR_train_py`` operation, prints its
    dependencies, and collects the result via
    ``scheduler.compute_collect``. All collected values but the last are
    stored in ``self.param`` as a numpy array; the last one is stored in
    ``self.intercept``.

    :param n_iter: must be an ``int``; sent as ``str(n_iter)``.
    :param alpha: must be a ``float``; sent as ``str(alpha)``.
    :param is_sparse: sent to the backend as ``str(is_sparse)``.
    :raises AssertionError: if no data is loaded or an argument has the
        wrong type.
    """
    assert self.loaded
    assert isinstance(n_iter, int)
    assert isinstance(alpha, float)
    train_param = {
        "n_iter": str(n_iter),
        "alpha": str(alpha),
        OperationParam.list_str: self.list_name,
        "is_sparse": str(is_sparse),
        "Type": "cpp",
    }
    # A model that was already trained depends on a fresh init operation
    # instead of the previous pending operation.
    if self.trained:
        deps = [Operation("LogisticRegressionModel#LogisticR_init_py",
                          {"Type": "cpp"})]
    else:
        deps = [self.pending_op]
    self.pending_op = Operation(
        "LogisticRegressionModel#LogisticR_train_py", train_param, deps)
    print(self.pending_op.op_deps)
    fitted = scheduler.compute_collect(self.pending_op)
    self.param = np.array(fitted[:-1])
    self.intercept = fitted[-1]
    self.trained = True
def difference(self, other_list):
    """Build a new list holding a pending difference of two lists.

    Besides creating the ``Functional#difference_py`` operation, this tags
    both input operations: a ``<new list name>_diffl`` key is added to
    this list's ``op_param`` and a ``<new list name>_diffr`` key to the
    other list's, each with the value ``"dummy"``.

    :param other_list: the ``PyHuskyList`` on the right-hand side.
    :returns: a new ``PyHuskyList``, or ``NotImplemented`` when
        ``other_list`` is not a ``PyHuskyList``.
    """
    if not isinstance(other_list, PyHuskyList):
        return NotImplemented
    result = PyHuskyList()
    result_name = result.list_name
    diff_param = {OperationParam.list_str: result_name}
    # Tag the left and right inputs with the name of the result list.
    self.pending_op.op_param[result_name + "_diffl"] = "dummy"
    other_list.pending_op.op_param[result_name + "_diffr"] = "dummy"
    result.pending_op = Operation(
        "Functional#difference_py",
        diff_param,
        [self.pending_op, other_list.pending_op],
    )
    return result
def load(self, database=None, collection=None):
    """Point this list at a MongoDB collection.

    Nothing is computed here: the list's pending operation is replaced by
    a ``Functional#load_py`` operation carrying the ``mongodb`` protocol,
    a ``host:port`` server string, and this object's user and password.

    :param database: database name; must not be ``None``.
    :param collection: collection name; must not be ``None``.
    :returns: ``self``, so calls can be chained.
    :raises AssertionError: if either argument is ``None``.
    """
    assert database is not None and collection is not None
    server = '{}:{}'.format(self.host, self.port)
    self.pending_op = Operation(
        "Functional#load_py",
        {
            "Protocol": "mongodb",
            "Server": server,
            "Database": database,
            "Collection": collection,
            "Username": self.user,
            "Password": self.pwd,
            OperationParam.list_str: self.list_name,
        },
        [],
    )
    return self
def load_op(self):
    """Read one operation, without its dependencies, from this reader.

    Reads the operation name, then a parameter count, then that many
    key/value string pairs.

    :returns: an ``Operation`` built from the name and the parameter dict.
    """
    name = self.load_str()
    pair_count = self.load_int64()
    fields = {}
    for _ in xrange(pair_count):
        # Each key is read before its value.
        key = self.load_str()
        fields[key] = self.load_str()
    return Operation(name, fields)
def load_hdfs(self, url, is_sparse=0, fmat="tsv"):
    """Load the linear regression input from ``url`` via the C++ backend.

    Sends a ``LinearRegressionModel#LinearR_load_hdfs_py`` operation
    (tagged ``"Type": "cpp"``) to ``scheduler.compute`` and then marks the
    model as loaded.

    :param url: input location; must be a ``str``.
    :param is_sparse: sent to the backend as ``str(is_sparse)``.
    :param fmat: sent to the backend under the ``"format"`` key.
    :raises AssertionError: if ``url`` is not a ``str``, or if data was
        already loaded.
    """
    assert isinstance(url, str)
    assert not self.loaded
    load_task = Operation(
        "LinearRegressionModel#LinearR_load_hdfs_py",
        {
            "url": url,
            OperationParam.list_str: self.list_name,
            "is_sparse": str(is_sparse),
            "format": fmat,
            "Type": "cpp",
        },
        [],
    )
    scheduler.compute(load_task)
    self.loaded = True
def __init__(self, n_feature=-1):
    """Set up an untrained logistic regression model with no data loaded.

    Nothing is computed here: the
    ``LogisticRegressionModel#LogisticR_init_py`` operation is only stored
    as the pending operation.

    :param n_feature: must be an ``int``; sent as ``str(n_feature)``.
    :raises AssertionError: if ``n_feature`` is not an ``int``.
    """
    assert isinstance(n_feature, int)
    super(LogisticRegressionModel, self).__init__()
    # The suffix spelling is runtime data and is kept exactly as it was.
    self.list_name += "LogisticRgression"
    self.loaded = self.trained = False
    self.param = self.intercept = None
    self.pending_op = Operation(
        "LogisticRegressionModel#LogisticR_init_py",
        {
            "n_feature": str(n_feature),
            OperationParam.list_str: self.list_name,
            "Type": "cpp",
        },
        [],
    )
def load_pyhlist(self, xy_list, is_sparse=1):
    """Load the logistic regression input from a PyHuskyList.

    Replaces this model's pending operation with a
    ``LogisticRegressionModel#LogisticR_load_pyhlist_py`` operation
    depending on ``xy_list.pending_op``, passes it to
    ``scheduler.compute`` and marks the model as loaded.

    :param xy_list: ``PyHuskyList`` providing the input.
    :param is_sparse: sent to the backend as ``str(is_sparse)``.
    :returns: ``None`` on success, or ``NotImplemented`` when ``xy_list``
        is not a ``PyHuskyList``.
    :raises AssertionError: if data was already loaded.
    """
    assert not self.loaded
    if not isinstance(xy_list, PyHuskyList):
        return NotImplemented
    self.pending_op = Operation(
        "LogisticRegressionModel#LogisticR_load_pyhlist_py",
        {
            OperationParam.list_str: self.list_name,
            "is_sparse": str(is_sparse),
        },
        [xy_list.pending_op],
    )
    scheduler.compute(self.pending_op)
    self.loaded = True
def __load_pyhlist(self, xy_list, **kwargs):
    """Queue a load-from-PyHuskyList operation without computing it.

    Replaces the pending operation with a
    ``LogisticRegressionModel#LogisticR_load_pyhlist_py`` operation that
    depends on the previous pending operation, then marks the model as
    loaded.

    :param xy_list: must be a ``PyHuskyList``.
    :param kwargs: only ``is_sparse`` is read; it defaults to ``1`` and is
        sent as ``str(is_sparse)``.
    :raises AssertionError: if data was already loaded.
    :raises NotImplementedError: if ``xy_list`` is not a ``PyHuskyList``.
    """
    assert not self.loaded
    sparse_flag = kwargs.get("is_sparse", 1)
    if not isinstance(xy_list, PyHuskyList):
        raise NotImplementedError
    # NOTE(review): xy_list is only type-checked; the new operation depends
    # on self.pending_op, not on xy_list.pending_op -- confirm this is
    # intended.
    self.pending_op = Operation(
        "LogisticRegressionModel#LogisticR_load_pyhlist_py",
        {
            OperationParam.list_str: self.list_name,
            "is_sparse": str(sparse_flag),
        },
        [self.pending_op],
    )
    self.loaded = True
def load(path):
    """Create a PyHuskyList backed by a pending load from ``path``.

    The protocol is chosen from the path prefix: ``"nfs:"`` selects
    ``nfs`` and ``"hdfs"`` selects ``hdfs``. Nothing is computed here: the
    returned list only carries a pending ``Functional#load_py`` operation.

    :param path: location to load from.
    :returns: a new ``PyHuskyList``.
    :raises Exception: if ``path`` starts with neither prefix.
    """
    # An earlier HuskyListStr() variant (a list of std::string) was
    # disabled here in favour of PyHuskyList.
    hlist = PyHuskyList()
    load_param = {
        "Type": "cpp",
        "Path": path,
        OperationParam.list_str: hlist.list_name,
    }
    if path.startswith("nfs:"):
        protocol = "nfs"
    elif path.startswith("hdfs"):
        protocol = "hdfs"
    else:
        raise Exception("ERROR: Cannot resolve the protocol of the load path")
    load_param["Protocol"] = protocol
    hlist.pending_op = Operation("Functional#load_py", load_param, [])
    return hlist
def train(self, n_iter=10, alpha=0.1):
    """Train the SVM model and store the fitted values.

    Collects the result of an ``SVMModel#SVM_train_py`` operation via
    ``scheduler.compute_collect``. All collected values but the last are
    stored in ``self.param`` as a numpy array; the last one is stored in
    ``self.intercept``. Afterwards ``loaded`` is reset to ``False`` and
    ``trained`` is set to ``True``.

    :param n_iter: must be an ``int``; sent as ``str(n_iter)``.
    :param alpha: must be a ``float``; sent as ``str(alpha)``.
    :raises AssertionError: if no data is loaded or an argument has the
        wrong type.
    """
    assert self.loaded
    assert isinstance(n_iter, int)
    assert isinstance(alpha, float)
    train_task = Operation(
        "SVMModel#SVM_train_py",
        {
            "n_iter": str(n_iter),
            "alpha": str(alpha),
            OperationParam.list_str: self.list_name,
            "Type": "cpp",
        },
        [],
    )
    fitted = scheduler.compute_collect(train_task)
    self.param = np.array(fitted[:-1])
    self.intercept = fitted[-1]
    self.loaded = False
    self.trained = True
def __init__(self, n_feature=-1):
    """Set up an untrained SVM model and send its init operation.

    Unlike a purely pending setup, the ``SVMModel#SVM_init_py`` operation
    is handed to ``scheduler.compute`` from within the constructor.

    :param n_feature: must be an ``int``; sent as ``str(n_feature)``.
    :raises AssertionError: if ``n_feature`` is not an ``int``.
    """
    assert isinstance(n_feature, int)
    super(SVMModel, self).__init__()
    self.list_name += "SVM"
    self.loaded = self.trained = False
    self.param = self.intercept = None
    init_task = Operation(
        "SVMModel#SVM_init_py",
        {
            "n_feature": str(n_feature),
            OperationParam.list_str: self.list_name,
            "Type": "cpp",
        },
        [],
    )
    scheduler.compute(init_task)