def get_data_by_part_id(self, result, part_id):
    ParaLiteLog.debug("partition number : %s" % len(self.p_node))
    if self.dest != conf.DATA_TO_ANO_OP or (
        self.dest == conf.DATA_TO_ANO_OP and len(self.p_node) == 1):
        # part id is the job id
        rs = ""
        for dataid in self.result:
            for data in self.result[dataid]:
                if isinstance(data, str):
                    # data is stored in a file
                    f = open(data, "rb")
                    rs += f.read()
                    f.close()
                else:
                    # data is stored in a buffer
                    rs += data.getvalue()
        return rs
    rs = ""
    for part in result[part_id]:
        if isinstance(part, str):
            # data is stored in a file
            f = open(part, "rb")
            rs += f.read()
            f.close()
        else:
            # data is stored in a buffer
            rs += part.getvalue()
    return rs
def process_ck_info(self, message):
    # message --> DATA_PERSIST:CHECKPOINT[:REPLICA_NODE:REPLICA_PORT]
    is_ck = message[1]
    self.is_checkpoint = is_ck
    if is_ck == conf.CHECKPOINT:
        replica_addr = (message[2], string.atoi(message[3]))
        if self.result_type != conf.MULTI_FILE:
            # results are still in memory: persist each one and copy the file
            # to the replica node
            for dataid in self.result:
                ds = ""
                for data in self.result[dataid]:
                    ds += data.getvalue()
                self.write_data_to_disk(dataid, ds)
                f_name = self.get_file_name_by_data_id(gethostname(), dataid)
                cmd = "scp %s %s:%s" % (f_name, replica_addr[0], f_name)
                ParaLiteLog.debug("CMD: %s" % cmd)
                os.system(cmd)
        else:
            # results are already on disk: stream them to the replica node
            for dataid in self.result:
                f = open(self.result[dataid], "rb")
                while True:
                    ds = f.read(self.max_size)
                    if not ds:
                        break
                    msg = conf.SEP_IN_MSG.join(
                        [conf.DATA_REPLICA, gethostname(), str(dataid), ds])
                    self.send_data_to_node(msg, AF_INET, replica_addr)
                f.close()
def sql_proc(self):
    try:
        ParaLiteLog.debug("sql proc : START")
        # start a local socket server to listen for all connections
        ch = self.iom.create_server_socket(AF_INET, SOCK_STREAM, 100, ("", self.my_port))
        n, self.my_port = ch.ss.getsockname()
        ParaLiteLog.debug("listen on port : %s ..." % str(self.my_port))
        # register the local port to the master
        self.register_to_master(self.cqid, self.opid, gethostname(), self.my_port)
        ParaLiteLog.debug("reg to master: FINISH")
        while self.is_running:
            ev = self.next_event(None)
            if isinstance(ev, ioman_base.event_accept):
                self.handle_accept(ev)
            if isinstance(ev, ioman_base.event_read):
                if ev.data != "":
                    self.handle_read(ev)
        for thd in self.threads:
            thd.join()
        for proc in self.processes:
            proc.join()
        ParaLiteLog.info("--sql node %s on %s is finished--" % (self.opid, gethostname()))
        #self.notifier.join()
    except KeyboardInterrupt, e:
        self.report_error("ParaLite receives an interrupt signal and will close the process\n")
        ParaLiteLog.info("--sql node %s on %s is finished--" % (self.opid, gethostname()))
        sys.exit(1)
def distribute_data(self):
    whole_data = cStringIO.StringIO()
    for i in self.result:
        for csio in self.result[i]:
            d = string.strip(csio.getvalue())
            if len(d) == 0:
                continue
            whole_data.write(d)
            whole_data.write("\n")
            del csio

    if self.distinct or self.limit != -1:
        data_list = whole_data.getvalue().split(self.db_row_sep)
        del whole_data
        if self.distinct:
            # set() drops duplicates; convert back to a list so it can be sliced
            data_list = list(set(data_list))
        if self.limit != -1:
            data_list = data_list[:self.limit]
        data = cStringIO.StringIO()
        data.write(self.db_row_sep.join(str(s) for s in data_list))
        del data_list
    else:
        data = whole_data

    if self.dest == conf.DATA_TO_ONE_CLIENT:
        # send data to a random client
        random_num = random.randint(0, len(self.client_sock) - 1)
        addr = self.client_sock[random_num]
        sock = socket(AF_INET, SOCK_STREAM)
        sock.connect(addr)
        data_s = data.getvalue()
        sock.send("%10s%s" % (len(data_s), data_s))
        re = sock.recv(10)
        assert re == "OK"
        sock.close()
    elif self.dest == conf.DATA_TO_DB:
        self.data = data
        col_sep = self.db_col_sep
        row_sep = self.db_row_sep
        master = (self.master_name, self.master_port)
        ParaLiteLog.debug("Load data start:")
        # send the load request to the master
        t_size = len(data.getvalue())
        sep = conf.SEP_IN_MSG
        tag = conf.LOAD_FROM_API
        if row_sep is None or row_sep == "\n":
            temp_sep = "NULL"
        else:
            temp_sep = row_sep
        msg = sep.join(
            str(s) for s in [conf.REQ, self.cqid, gethostname(), self.my_port,
                             self.dest_db, self.dest_table, t_size, tag,
                             self.fashion, temp_sep, "0"])
        so_master = socket(AF_INET, SOCK_STREAM)
        so_master.connect(master)
        so_master.send("%10s%s" % (len(msg), msg))
        so_master.close()
def hash_data_file(self, data, key_pos, nodes, row_sep, col_sep, chunk_num, sub_dbs):
    db_buf = {}
    for db in sub_dbs:
        db_buf[db] = cStringIO.StringIO()
    if col_sep is None:
        SEP = self.db_col_sep
    else:
        SEP = col_sep
    records = data.split(row_sep)
    count = 1
    for line in records:
        if line == "":
            continue
        key = " ".join(line.strip().split(SEP)[kp] for kp in key_pos)
        pnum = abs(hash(key)) % (len(nodes) * chunk_num)
        if pnum == 0:
            count += 1
        db_name = sub_dbs[pnum]
        db_buf[db_name].write("%s%s" % (line.strip(), row_sep))
    ParaLiteLog.debug("count %s" % count)
    # log only the first sub-db buffer as a sample
    for db in db_buf:
        ParaLiteLog.debug("%s -- > %s" % (db, len(db_buf[db].getvalue())))
        break
    return db_buf
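# A minimal, standalone sketch of the partitioning rule used by
# hash_data_file() above: a row goes to sub-database
# abs(hash(key)) % (len(nodes) * chunk_num), where the key is built from the
# key_pos columns.  The names below (rows, key_pos, sub_dbs) are illustrative
# only and are not part of the original module.
def _hash_partition_demo():
    rows = ["1|alice|30", "2|bob|25", "3|carol|41"]   # '|' as column separator
    key_pos = [0]                                     # partition on the first column
    sub_dbs = ["db_node1_0", "db_node1_1"]            # len(nodes) * chunk_num sub-dbs
    buckets = dict((db, []) for db in sub_dbs)
    for line in rows:
        key = " ".join(line.split("|")[kp] for kp in key_pos)
        pnum = abs(hash(key)) % len(sub_dbs)
        buckets[sub_dbs[pnum]].append(line)
    return buckets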
def send_to_node(self, db, table, data, addr, row_sep, col_sep, is_replace):
    sep = conf.SEP_IN_MSG
    req_info = "%s%s%s%s%s%s%s%s%s%s%s%s%s" % (
        conf.INFO, sep, db, sep, table, sep, self.db_col_sep, sep,
        col_sep, sep, row_sep, sep, is_replace)
    ParaLiteLog.info("sending %s --> %s" % (req_info, addr[0]))
    self.really_send(addr, req_info)
    # the first 10 characters hold the length of the database name,
    # followed by the name itself and then the data
    self.really_send(addr, "%10s%s%s" % (len(db), db, data))
    ParaLiteLog.info("sending data : %s --> %s" % (len(data), repr(addr)))
def recv_bytes(self, so, n):
    A = []
    while n > 0:
        x = so.recv(n)
        if x == "":
            break
        A.append(x)
        ParaLiteLog.debug(len(x))
        n = n - len(x)
    return string.join(A, "")
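# A sketch (not part of the original protocol code) of the receiving side of
# the "%10s%s" framing used throughout these modules: a 10-character,
# space-padded decimal length field followed by the payload.  `sock` is
# assumed to be an already-connected stream socket.
def _recv_framed(sock):
    def read_exactly(n):
        chunks = []
        while n > 0:
            x = sock.recv(n)
            if x == "":
                break                      # peer closed the connection
            chunks.append(x)
            n -= len(x)
        return "".join(chunks)
    header = read_exactly(10)              # e.g. "        42"
    if len(header) < 10:
        return None
    return read_exactly(int(header.strip()))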
def step(self, value):
    try:
        newvalue = value
        if isinstance(value, unicode):
            newvalue = value.encode("ascii")
        if isinstance(newvalue, str):
            newvalue = string.atoi(newvalue)
        self.product *= newvalue
    except:
        ParaLiteLog.info(traceback.format_exc())
        raise Exception(traceback.format_exc())
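# For reference, a self-contained sketch of how a sqlite3 user-defined
# aggregate like the "mul" registered in proc_select() is structured and
# used.  The class and table names below are illustrative, not the module's.
import sqlite3

class _MulDemo(object):
    def __init__(self):
        self.product = 1
    def step(self, value):          # called once per input row
        self.product *= int(value)
    def finalize(self):             # called once at the end of the group
        return self.product

def _mul_demo():
    conn = sqlite3.connect(":memory:")
    conn.create_aggregate("mul", 1, _MulDemo)   # name, number of args, class
    c = conn.cursor()
    c.execute("create table t(x integer)")
    c.executemany("insert into t values (?)", [(2,), (3,), (4,)])
    return c.execute("select mul(x) from t").fetchone()[0]   # -> 24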
def proc_drop(self, exp, target_db):
    try:
        for db in target_db:
            conn = sqlite3.connect(db)
            c = conn.cursor()
            c.execute(exp)
            conn.commit()
            conn.close()
    except sqlite3.OperationalError, e:
        es("%s: %s" % (gethostname(), " ".join(e.args)))
        ParaLiteLog.info(traceback.format_exc())
def register_to_master(self, cqid, opid, node, port):
    sep = conf.SEP_IN_MSG
    msg = sep.join([conf.REG, conf.DATA_NODE, cqid, opid, gethostname(),
                    str(self.my_port), self.local_addr])
    ParaLiteLog.debug("MASTER_NODE: %s MASTER_PORT: %s" % (self.master_name, self.master_port))
    addr = (self.master_name, self.master_port)
    sock = socket(AF_INET, SOCK_STREAM)
    try:
        sock.connect(addr)
    except Exception, e:
        ParaLiteLog.error("Error in register_to_master: %s" % traceback.format_exc())
        if e.errno == 4:
            # EINTR: interrupted system call, retry once
            sock.connect(addr)
    # send the registration message using the common 10-byte length framing
    sock.send("%10s%s" % (len(msg), msg))
    sock.close()
def start(self):
    try:
        # start socket server to listen for all connections
        ch = self.iom.create_server_socket(AF_INET, SOCK_STREAM, 100, ("", self.my_port))
        n, self.my_port = ch.ss.getsockname()
        ParaLiteLog.debug("listen on port : %s ..." % str(self.my_port))
        # start socket server for local connections
        self.local_addr = "/tmp/paralite-local-addr-orderby-%s-%s-%s" % (
            gethostname(), self.cqid, self.opid)
        if os.path.exists(self.local_addr):
            os.remove(self.local_addr)
        self.iom.create_server_socket(AF_UNIX, SOCK_STREAM, 10, self.local_addr)
        # register the local port to the master
        self.register_to_master(self.cqid, self.opid, gethostname(), self.my_port)
        ParaLiteLog.debug("reg to master: FINISH")
        while self.is_running:
            ev = self.next_event(None)
            if isinstance(ev, ioman_base.event_accept):
                self.handle_accept(ev)
            if isinstance(ev, ioman_base.event_read):
                if ev.data != "":
                    self.handle_read(ev)
        ParaLiteLog.info("--orderby node %s on %s is finished--" % (self.opid, gethostname()))
    except KeyboardInterrupt, e:
        self.report_error("ParaLite receives an interrupt signal and will close the process\n")
        ParaLiteLog.info("--orderby node %s on %s is finished--" % (self.opid, gethostname()))
        sys.exit(1)
    except Exception, e1:
        ParaLiteLog.debug(traceback.format_exc())
        self.report_error(traceback.format_exc())
        sys.exit(1)
def proc_create(self, exp, target_db):
    try:
        # first of all, check whether the directory holding the database exists
        for db in target_db:
            parent = db[0:db.rfind(os.sep)]
            if not os.path.exists(parent):
                os.makedirs(parent)
            conn = sqlite3.connect(db)
            c = conn.cursor()
            c.execute(exp)
            conn.commit()
            conn.close()
    except sqlite3.OperationalError, e:
        ParaLiteLog.info(traceback.format_exc())
        raise Exception("ERROR: in proc_create: %s: %s" % (gethostname(), " ".join(e.args)))
def main():
    if len(sys.argv) != 7:
        sys.exit(1)
    proc = OrderbyOp()
    proc.master_name = sys.argv[1]
    proc.master_port = string.atoi(sys.argv[2])
    proc.cqid = sys.argv[3]
    proc.opid = sys.argv[4]
    proc.my_port = string.atoi(sys.argv[5])
    proc.log_dir = sys.argv[6]
    if not os.path.exists(proc.log_dir):
        os.makedirs(proc.log_dir)
    cur_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time()))
    ParaLiteLog.init("%s/orderby-%s-%s.log" % (proc.log_dir, gethostname(), cur_time),
                     logging.DEBUG)
    ParaLiteLog.info("--orderby node %s on %s is started" % (proc.opid, gethostname()))
    proc.start()
def main():
    if len(sys.argv) != 7:
        sys.exit(1)
    proc = SqlOp()
    proc.master_name = sys.argv[1]
    proc.master_port = string.atoi(sys.argv[2])
    proc.cqid = sys.argv[3]
    proc.opid = sys.argv[4]
    proc.my_port = string.atoi(sys.argv[5])
    proc.log_dir = sys.argv[6]
    if not os.path.exists(proc.log_dir):
        os.makedirs(proc.log_dir)
    ParaLiteLog.init("%s/sql-%s-%s-%s.log" % (
        proc.log_dir, gethostname(), proc.cqid, proc.opid), logging.DEBUG)
    ParaLiteLog.info("--sql node %s on %s is started" % (proc.opid, gethostname()))
    proc.sql_proc()
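# Both entry points above expect exactly six positional arguments.  A typical
# invocation (script names, hostnames, ports and paths are only examples)
# would look like:
#
#   python sql.py      <master_host> <master_port> <cqid> <opid> <my_port> <log_dir>
#   python order_by.py cloko000 12345 1 2 0 /tmp/paralite-log
#
# my_port may be 0; the operator then appears to bind an ephemeral port
# (getsockname() re-reads it) and reports the actual number back to the
# master via register_to_master().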
def distinct_data(self, data_list):
    try:
        csio = cStringIO.StringIO()
        whole_data = ""
        for data in data_list:
            if data.strip() == "":
                continue
            whole_data += data.strip() + self.db_row_sep
            del data
        if whole_data == "":
            return None, None, None
        whole_data = set(whole_data.strip().split(self.db_row_sep))
        csio.write(self.db_row_sep.join(whole_data))
        return conf.SINGLE_BUFFER, [csio], len(csio.getvalue())
    except Exception, e:
        ParaLiteLog.debug(traceback.format_exc())
        self.report_error("ERROR in order_by.py : %s" % traceback.format_exc())
        return None, None, None
def get_data_by_blocksize(self, jobid, bksize):
    if self.reader is None:
        return None
    data = self.reader.read(bksize)
    if not data:
        read_size = 0
    else:
        read_size = len(data)
    if read_size < bksize or (read_size == bksize and read_size == self.job_data[jobid]):
        # the current reader is exhausted; switch to the next one
        # while True:
        #     if self.reader is not None:
        #         self.reader.close()
        #         self.reader = None
        #     self.reader = self.get_next_reader()
        #     if self.reader is None:
        #         break
        #     new_data = self.reader.read(bksize - read_size)
        #     data += new_data
        #     read_size = len(data)
        #     if read_size >= bksize:
        #         break
        if self.reader is not None:
            self.reader.close()
            self.reader = None
        self.reader = self.get_next_reader()
        return data
    if self.db_row_sep == "\n":
        if not data.endswith("\n"):
            extra_data = self.reader.readline()
            if extra_data:
                data += extra_data
        return data
    else:
        if data:
            pos = data.rfind(self.db_row_sep)
            ParaLiteLog.info(pos)
            send_ds = self.left_ds + data[0:pos]
            self.left_ds = data[pos + len(self.db_row_sep):]
            return send_ds
        else:
            return None
def handle_read(self, ev):
    data = ev.data
    if data == conf.END_TAG:
        ParaLiteLog.info("receive: END_TAG")
        self.is_running = False
        self.queue.put(conf.END_TAG)
    elif data.startswith(conf.INFO):
        m = data.split(conf.SEP_IN_MSG)
        assert len(m) == 7
        if self.table is None:
            self.table = m[2]
        if self.db_col_sep is None:
            self.db_col_sep = m[3]
        if self.cmd_col_sep is None:
            self.cmd_col_sep = m[4]
        if self.cmd_row_sep is None:
            self.cmd_row_sep = m[5]
        if self.is_replace is None:
            self.is_replace = m[6]
        ParaLiteLog.info(
            "DB_COL_SEP = %s CMD_COL_SEP = %s CMD_ROW_SEP = %s is_replace = %s" % (
                self.db_col_sep, self.cmd_col_sep, self.cmd_row_sep, self.is_replace))
    else:
        # TODO: we can control the buffer size here.
        self.queue.put(data)
def scan_data_queue(self):
    while True:
        data = self.queue.get()
        if data == conf.END_TAG:
            ParaLiteLog.info("SCAN DATA QUEUE : END")
            break
        try:
            pos = 10 + string.atoi(data[0:10].strip())
            target_db = data[10:pos]
            data = data[pos:]
            """
            thd = threading.Thread(target=self.write_to_db, args=(data, len(data)))
            thd.setDaemon(True)
            thd.start()
            self.threads.append(thd)
            """
            self.write_to_db(target_db, data, len(data))
            del data
        except Exception, e:
            ParaLiteLog.info(traceback.format_exc())
            es("in write_to_db: %s" % (traceback.format_exc()))
            sys.exit(1)
def start(self, argument):
    self.parse(argument)
    cur_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    ParaLiteLog.init("%s/dload-server-%s-%s.log" % (self.log_dir, gethostname(), cur_time),
                     logging.DEBUG)
    ParaLiteLog.info("START")
    ParaLiteLog.info("parsed the arguments successfully")
    ss = time.time()
    scan_thd = threading.Thread(target=self.scan_data_queue)
    scan_thd.setDaemon(True)
    scan_thd.start()
    t = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    self.local_socket = "%s%s%s-%s-%s" % (self.log_dir, os.sep, gethostname(), t, "UNIX.d")
    self.iom.create_server_socket(AF_UNIX, SOCK_STREAM, 5, self.local_socket)
    ch = self.iom.create_server_socket(AF_INET, SOCK_STREAM, 5, ("", self.port))
    n, self.port = ch.ss.getsockname()
    ParaLiteLog.info("global socket addr = %s" % (repr(ch.ss.getsockname())))
    self.register_to_master()
    try:
        while self.is_running:
            ev = self.next_event(None)
            if isinstance(ev, ioman_base.event_accept):
                self.handle_accept(ev)
            elif isinstance(ev, ioman_base.event_read):
                if ev.data != "":
                    self.handle_read(ev)
    except Exception, e:
        es("in dload_server.py : %s" % traceback.format_exc())
        ParaLiteLog.info(traceback.format_exc())
        sys.exit(1)
def handle_read(self, event):
    message = event.data[10:]
    m = message.split(conf.SEP_IN_MSG)
    try:
        if m[0] == conf.JOB_ARGUMENT:
            self.parse_args(m[1])
            ParaLiteLog.info("parse arguments: FINISH")
        elif m[0] == conf.JOB:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            self.cur_jobid = m[1]
        elif m[0] == conf.DATA:
            data_id = string.strip(m[1][0:2])
            data = m[1][2:]
            self.source_data.append(data)
            # sort data once all children have delivered their data
            if not self.is_data_ready(self.source_data, self.num_of_children):
                return
            ParaLiteLog.debug("****SORT DATA****: start")
            s = 0
            for data in self.source_data:
                s += len(data)
            ParaLiteLog.debug("source data size: %s" % s)
            s_time = time.time()
            rs_type, rs, t_size = self.sort(self.source_data)
            del self.source_data
            ParaLiteLog.debug("****SORT DATA****: finish")
            if rs_type is None:
                self.send_status_to_master(self.cur_jobid, conf.PENDING)
                return
            self.total_size += t_size
            self.source_data = []
            # store the result of one job to the final result
            for i in range(len(rs)):
                if i not in self.result:
                    self.result[i] = [rs[i]]
                else:
                    self.result[i].append(rs[i])
            if rs_type != conf.MULTI_FILE:
                # check if the whole data exceeds the LIMITATION
                if self.total_size > self.MAX_SIZE:
                    self.write_data_to_disk()
                    self.result_type = conf.MULTI_FILE
            e_time = time.time()
            self.total_time += e_time - s_time
            self.send_status_to_master(self.cur_jobid, conf.PENDING)
        elif m[0] == conf.JOB_END:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            # all jobs are finished
            self.send_rs_info_to_master(self.total_size, self.total_time)
            # distribute data
            if self.dest == conf.DATA_TO_ONE_CLIENT:
                ParaLiteLog.debug("dest = %s" % self.dest)
                self.distribute_data()
                self.send_status_to_master(self.cur_jobid, conf.ACK)
                self.is_running = False
            elif self.dest == conf.DATA_TO_DB:
                self.distribute_data()
        elif m[0] == conf.DATA_PERSIST:
            # whether the data is required to be persisted or not
            if m[1] == conf.CHECKPOINT:
                self.write_data_to_disk()
        elif m[0] == conf.DLOAD_REPLY:
            sep = conf.SEP_IN_MSG
            reply = sep.join(m[1:])
            ParaLiteLog.info("receive the information from the master")
            ParaLiteLog.debug(reply)
            if len(self.data.getvalue()) != 0:
                dload_client.dload_client().load_internal_buffer(
                    reply, self.dest_table, self.data, self.fashion,
                    self.hash_key, self.hash_key_pos, self.db_col_sep,
                    self.db_row_sep, self.db_col_sep, False, "0", self.log_dir)
            # send END_TAG to the master
            client_id = "0"
            msg = sep.join([conf.REQ, conf.END_TAG, gethostname(), client_id])
            so_master = socket(AF_INET, SOCK_STREAM)
            so_master.connect((self.master_name, self.master_port))
            so_master.send("%10s%s" % (len(msg), msg))
            so_master.close()
            ParaLiteLog.debug("sending to master: %s" % (conf.END_TAG))
            ParaLiteLog.debug("----- dload client finish -------")
        elif message == conf.DLOAD_END_TAG:
            ParaLiteLog.debug("---------import finish---------")
            self.send_status_to_master(" ".join(self.cur_jobid), conf.ACK)
            self.is_running = False
        elif m[0] == conf.EXIT:
            self.is_running = False
        elif m[0] == conf.NODE_FAIL:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            # message --> NODE_FAIL:FAILED_NODE:REPLICA_NODE
            failed_node, replica_node = m[1:3]
            self.failed_node.append(failed_node)
            if replica_node != "" and replica_node == gethostname():
                # load replica data for the failed node
                self.recovery_data(self.replica_result, replica_node)
            ParaLiteLog.debug("finished handling the node failure message")
    except Exception, e:
        es(traceback.format_exc())
        ParaLiteLog.info(traceback.format_exc())
        self.is_running = False
        self.no_error = False
def process_job(self, jobid, exp, target_db, process_queue):
    s_time = time.time()
    rs_type, rs, t_size = self.proc_select(jobid, exp, target_db)
    e_time = time.time()
    process_queue.put((jobid, rs_type, rs, t_size, e_time - s_time))
    ParaLiteLog.debug("Job %s cost %s " % (jobid, e_time - s_time))
def sort(self, data_list):
    try:
        ParaLiteLog.debug("sort data START")
        csio = cStringIO.StringIO()
        whole_data = ""
        for data in data_list:
            if data.strip() == "":
                continue
            whole_data += data.strip() + self.db_row_sep
            del data
        if whole_data == "":
            return None, None, None
        whole_data = whole_data.strip().split(self.db_row_sep)
        ParaLiteLog.debug("order key %s" % (str(self.order_key)))
        ParaLiteLog.debug("order type %s" % (str(self.order_type)))
        # get the column positions to be sorted in the input
        key_pos = []  # (pos, string_or_int, reverse_or_not)
        for key in self.order_key:
            pos = self.input.index(key)
            if self.attrs.has_key(key) == False:
                key_type = conf.STRING
            else:
                key_type = self.attrs[key]
            if self.order_type[self.order_key.index(key)] == conf.DESC:
                key_pos.append((pos, key_type, True))
            else:
                key_pos.append((pos, key_type, False))
        # check whether all keys share the same ordering direction
        flag = 0
        t0 = self.order_type[0]
        for t in self.order_type:
            if t != t0:
                flag = 1
                break
        if flag == 1:
            # for mixed ordering directions, sort several times starting from
            # the least significant key; the stable sort preserves the order
            # established by the more significant keys
            i = len(key_pos) - 1
            while i >= 0:
                pos = key_pos[i]
                if pos[1] == conf.INT:
                    whole_data.sort(
                        key=lambda l: string.atoi(l.split(self.db_col_sep)[pos[0]]),
                        reverse=pos[2])
                elif pos[1] == conf.FLOAT or pos[1] == conf.REAL:
                    whole_data.sort(
                        key=lambda l: float(l.split(self.db_col_sep)[pos[0]]),
                        reverse=pos[2])
                else:
                    whole_data.sort(
                        key=lambda l: l.split(self.db_col_sep)[pos[0]],
                        reverse=pos[2])
                i -= 1
        else:
            # a single ordering direction: build one composite key expression
            sort_key = []
            col_sep = self.db_col_sep
            for pos in key_pos:
                if pos[1] == conf.INT:
                    sort_key.append("string.atoi(l.split('%s')[%s])" % (self.db_col_sep, pos[0]))
                elif pos[1] == conf.FLOAT or pos[1] == conf.REAL:
                    sort_key.append("float(l.split('%s')[%s])" % (self.db_col_sep, pos[0]))
                else:
                    sort_key.append("l.split('%s')[%s]" % (self.db_col_sep, pos[0]))
            whole_data.sort(key=lambda l: eval(",".join(sort_key)),
                            reverse=key_pos[0][2])
        csio.write(self.db_row_sep.join(whole_data))
        # if len(self.db_col_sep) == 1 and self.db_row_sep == "\n":
        #     ParaLiteLog.debug("sort data: shell")
        #     self.shell_sort(data_list, csio)
        # else:
        #     ParaLiteLog.debug("sort data: quick sort")
        #     self.quick_sort(data_list, csio)
        return conf.SINGLE_BUFFER, [csio], len(csio.getvalue())
    except Exception, e:
        ParaLiteLog.debug(traceback.format_exc())
        self.report_error("ERROR in order_by.py : %s" % traceback.format_exc())
        return None, None, None
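# A small, self-contained illustration of the multi-pass strategy used in
# sort() above for mixed ASC/DESC keys: Python's sort is stable, so sorting
# from the least significant key to the most significant one yields the same
# result as a single multi-key comparison.  The data below is illustrative.
def _mixed_order_sort_demo():
    rows = ["b|2", "a|1", "b|1", "a|2"]
    # target ordering: column 0 ascending, column 1 descending
    rows.sort(key=lambda l: int(l.split("|")[1]), reverse=True)   # minor key first
    rows.sort(key=lambda l: l.split("|")[0], reverse=False)       # major key last
    return rows   # -> ['a|2', 'a|1', 'b|2', 'b|1']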
def parse_func_2(self):
    """
    Parse each expression to get useful information; the input differs from
    the output:
        input      --> ***[count(*), sum(a), sum(a+b*c)]***
        output     --> ***[count(*), sum(a) + 1, avg(a+b*c)]***
        expression --> ['count(*)', 'sum(a)', 'avg(a+b*c)']
            ==> [ ['count', 1, 0, '*', 1, 0],
                  ['sum', 2, 0, 'a', 2, 1, [1], lambda _a: _a + 1],
                  ['sum', 3, 0, 'a+b*c', 3, 0] ]
    """
    # if there is an avg function, convert it to sum and count
    avg_pos = []
    for fun in self.function:
        if fun.find("avg(") != -1:
            avg_pos.append(self.function.index(fun))
    if avg_pos != []:
        if "count(*)" not in self.function:
            self.function.append("count(*)")
    # Sometimes a result column has more than one function, so the position in
    # self.expr and the position in self.function are not the same.
    # fun_counter counts the position of a function in self.expr and pos_map
    # records the mapping between these two kinds of positions.
    fun_counter = 0
    pos_map = {}
    for expr_num in range(len(self.function)):
        expr = self.function[expr_num]
        pos_map[expr_num] = fun_counter
        # parse "sum(a) + 1" --> ['sum(a)', '+', '1']
        if expr.find("count(*)") != -1:
            expr = expr.replace("count(*)", "count(a_a)")
        _expr = newparser.parse_column_expr(expr)
        # to describe the pos of elements in the outer arithmetic operation
        outer_ao_pos = []
        new_expr = expr
        new_args = []
        ParaLiteLog.debug(self.input)
        for ele in _expr:
            if re.match("(.*)\((.*)\)", ele):
                parsed_expr = []
                new_expr = new_expr.replace(ele, "_col%s" % str(fun_counter))
                new_args.append("_col%s" % str(fun_counter))
                func_name = ele[0:ele.find("(")]
                if func_name not in conf.GENERAL_FUNC:
                    return False, "ParaLite cannot support aggregation function %s" % func_name
                if func_name == "avg":
                    ele = ele.replace("avg", "sum")
                fun_counter += 1
                func_attr = ele[ele.find("(") + 1:ele.rfind(")")]
                parsed_expr.append(func_name)
                if func_attr == "a_a":
                    func_attr = "*"
                    ele = ele.replace("a_a", "*")
                    expr = expr.replace("count(a_a)", "count(*)")
                opexpr = [func_attr]
                pos_in_input = self.input.index(ele)
                parsed_expr.append(pos_in_input)
                parsed_expr.append(0)
                parsed_expr.append(ele)
                if expr in self.output:
                    parsed_expr.append(self.output.index(expr))
                else:
                    parsed_expr.append(-1)
                self.expr.append(parsed_expr)
                outer_ao_pos.append(fun_counter - 1)
            else:
                # other operator element: + - * / ^
                continue
        cur_pos = pos_map[expr_num]
        if cur_pos >= len(self.expr):
            # the exception that "select sum(a), count(*), avg(a) ..." does not
            # need to do anything for avg(a)
            continue
        if len(_expr) == 1:
            self.expr[cur_pos].append(0)
        else:
            self.expr[cur_pos].append(1)
        self.expr[cur_pos].append(outer_ao_pos)
        tempexpr = new_expr
        tempargs = ",".join(new_args)
        for eacharg in new_args:
            newarg = eacharg.replace(".", "_")
            tempexpr = tempexpr.replace(eacharg, newarg)
            tempargs = tempargs.replace(eacharg, newarg)
        self.expr[cur_pos].append(eval("lambda %s:%s" % (tempargs, tempexpr)))
    self.pos_map = pos_map
    return True, None
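# A hand-worked illustration (not called anywhere in the module) of the lambda
# that parse_func_2() builds for an output column such as "sum(a) + 1": the
# aggregate sub-expression is replaced by the placeholder _col0 and the
# remaining arithmetic is compiled with eval().
def _outer_lambda_demo():
    tempargs, tempexpr = "_col0", "_col0 + 1"      # derived from "sum(a) + 1"
    outer = eval("lambda %s:%s" % (tempargs, tempexpr))
    return outer(41)                               # -> 42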
def load_internal_buffer(self, reply, table, buf, fashion, key, key_pos,
                         db_col_sep, row_sep, col_sep, is_replace, client_id,
                         LOG_DIR):
    ParaLiteLog.info("load_internal: START")
    ParaLiteLog.info("row separator = %s col separator = %s" % (row_sep, col_sep))
    self.db_col_sep = db_col_sep
    total_size = len(buf.getvalue())
    try:
        """
        received message = nodes # sub_dbs # chunk_num # replica_info
        nodes should be:
            n1:p1:l1 , n2:p2:l2 , ...                 IF fashion = HASH_FASHION
            n1:p1:l1:s1:num , n2:p2:l2:s2:num , ...   IF fashion = ROUND_ROBIN
            TBD                                       IF fashion = RANGE_FASHION
        node_db_info: db_1_1 , db_1_2 , db_2_1 , ...
        replica_info: db_1_1 db_1_1_r_1 node1 , db_1_2 db_1_2_r_1 node2 , ...
        """
        mm = reply.split("#")
        ParaLiteLog.info("receive the information from the master %s" % mm)
        nodes = mm[0].split(",")
        sub_dbs = mm[1].split(",")
        chunk_num = string.atoi(mm[2])
        replica = mm[3]
        replica_info = {}  # {db_name : {replica_db_name:node}}
        if replica != "":
            for ll in replica.split(","):
                lll = ll.split(" ")
                if lll[0] not in replica_info:
                    replica_info[lll[0]] = {}
                replica_info[lll[0]][lll[1]] = lll[2]
        ParaLiteLog.info(nodes)
        node_addr = {}  # {node:addr}
        for node in nodes:
            m = node.split(conf.SEP_IN_MSG)
            if m[0] == gethostname():
                addr = m[2]
            else:
                addr = (m[0], string.atoi(m[1]))
            node_addr[m[0]] = addr
        ss1 = time.time()
        if nodes == []:
            ParaLiteLog.info("there is no data to load")
        elif fashion == conf.HASH_FASHION:
            ParaLiteLog.info(fashion)
            # get the data for each sub db: db_buf = {db_name : buffer_of_data}
            db_buf = self.hash_data_buffer(buf, key_pos, nodes, row_sep,
                                           col_sep, chunk_num, sub_dbs)
            for db in db_buf:
                data = db_buf[db].getvalue()
                node = db.split("_")[-3]
                self.send_to_node(db, table, data, node_addr[node],
                                  row_sep, col_sep, is_replace)
                if db in replica_info:
                    for rdb in replica_info[db]:
                        node = replica_info[db][rdb]
                        self.send_to_node(rdb, table, data, node_addr[node],
                                          row_sep, col_sep, is_replace)
            """
            buf_scanner = threading.Thread(
                target=self.scan_buf,
                args=(table, node_buf, node_addr, row_sep, col_sep, is_replace))
            buf_scanner.setDaemon(True)
            buf_scanner.start()
            buf_scanner.join()
            """
        elif fashion == conf.REPLICATE_FASHION:
            self.replicate_data(table, files, total_size, nodes)
        elif fashion == conf.RANGE_FASHION:
            self.range_data()
        else:
            # ROUND_ROBIN fashion
            thds = []
            num_of_db = len(nodes) * chunk_num
            if row_sep is not None and row_sep != "\n":
                whole_data = buf.getvalue()
                lines = whole_data.split(row_sep)
                if lines[len(lines) - 1] == "":
                    lines.pop(len(lines) - 1)
                l = len(lines)
                if l % num_of_db == 0:
                    num_each = l / num_of_db
                else:
                    num_each = l / num_of_db + 1
                i = 0
                while i < num_of_db:
                    db = sub_dbs[i]
                    node = db.split("_")[-3]
                    cur_num = i * num_each + num_each
                    if cur_num > l:
                        cur_num = l
                    ds = row_sep.join(lines[i * num_each:cur_num])
                    thd = threading.Thread(
                        target=self.send_to_node,
                        args=(db, table, ds, node_addr[node], row_sep,
                              col_sep, is_replace))
                    thd.setDaemon(True)
                    thd.start()
                    thds.append(thd)
                    if db in replica_info:
                        for rdb in replica_info[db]:
                            node = replica_info[db][rdb]
                            thd = threading.Thread(
                                target=self.send_to_node,
                                args=(rdb, table, ds, node_addr[node],
                                      row_sep, col_sep, is_replace))
                            thd.setDaemon(True)
                            thd.start()
                            thds.append(thd)
                    i += 1
            else:
                buf.seek(0)
                i = 0
                while i < num_of_db:
                    db = sub_dbs[i]
                    node = db.split("_")[-3]
                    node_id = i / chunk_num
                    size = string.atoi(nodes[node_id].split(conf.SEP_IN_MSG)[3]) / chunk_num
                    ParaLiteLog.info("start to get data as bk: %s" % (size))
                    ds = buf.read(size)
                    if not ds:
                        ParaLiteLog.info("really get data as bk: 0")
                        break
                    if not ds.endswith("\n"):
                        ds += buf.readline()
                    ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                    thd = threading.Thread(
                        target=self.send_to_node,
                        args=(db, table, ds, node_addr[node], row_sep,
                              col_sep, is_replace))
                    thd.setDaemon(True)
                    thd.start()
                    thds.append(thd)
                    if db in replica_info:
                        for rdb in replica_info[db]:
                            node = replica_info[db][rdb]
                            thd = threading.Thread(
                                target=self.send_to_node,
                                args=(rdb, table, ds, node_addr[node],
                                      row_sep, col_sep, is_replace))
                            thd.setDaemon(True)
                            thd.start()
                            thds.append(thd)
                    i += 1
            for thd in thds:
                thd.join()
    except Exception, e:
        raise e
def handle_read(self, event):
    message = event.data[10:]
    sep = conf.SEP_IN_MSG
    m = message.split(sep)
    try:
        if m[0] == conf.DATA_END:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            # all data is dispatched to the parent nodes
            self.send_status_to_master(" ".join(self.job_data), conf.ACK)
            ParaLiteLog.debug("notify ACK to master")
            self.is_running = False
        elif message == conf.END_TAG:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            self.send_status_to_master(" ".join(self.job_data), conf.ACK)
            self.is_running = False
        elif message == conf.DLOAD_END_TAG:
            ParaLiteLog.debug("---------import finish---------")
            self.send_status_to_master(" ".join(self.job_data), conf.ACK)
            self.is_running = False
        elif message == conf.EXIT:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            self.is_running = False
        elif m[0] == conf.JOB_ARGUMENT:
            self.parse_args(m[1])
            ParaLiteLog.info("parse arguments: FINISH")
            # init the persisted result data
            if self.is_checkpoint is not None and self.is_checkpoint == conf.CHECKPOINT:
                # this is a recovery operator
                ParaLiteLog.debug("recovery data: START")
                self.recovery_data(self.result, gethostname())
                ParaLiteLog.debug("recovery data: FINISH")
                self.send_rs_info_to_master(0, 0)
            else:
                # delete all temporary files for this operator
                os.system("rm -f %s/%s_%s" % (self.temp_dir, "sql", self.opid))
            ###############################
            # scanner = threading.Thread(target=self.scan_process_queue,
            #                            args=(self.process_queue, ))
            # scanner.setDaemon(True)
            # scanner.start()
            # self.threads.append(scanner)
            ##########################
        elif m[0] == conf.JOB:
            self.ex_s_time = time.time()
            self.ex_w_time = 0
            ParaLiteLog.debug("MESSAGE: %s" % message)
            s_time = time.time()
            jobid = m[1]
            target_db = m[2].split()
            exp = self.expression
            ParaLiteLog.debug("*****JOB %s******:start" % jobid)
            # FAULT TOLERANCE: if this is a failed job, first delete the old
            # result value
            if jobid in self.job_data:
                if self.dest == conf.DATA_TO_ANO_OP and self.partition_num > 1:
                    for partid in self.result:
                        pos = self.job_list.index(jobid)
                        self.result[partid][pos] = ""
                else:
                    self.result[jobid] = ""
            if exp.lower().startswith("select"):
                """
                selection task:
                (1) execute sql
                (2) notify the result to the master
                (3) wait for the DATA_PERSIST message from the master
                (4) persist data if so
                (5) notify ACK to the master
                """
                ParaLiteLog.info("proc_select: START")
                st_time = time.time()
                rs_type, rs, t_size = self.proc_select(jobid, exp, target_db)
                et_time = time.time()
                ParaLiteLog.debug("Job %s cost time %s second" % (jobid, (et_time - st_time)))
                # FAULT TOLERANCE: this is a failed job
                if jobid in self.job_data:
                    if self.dest == conf.DATA_TO_ANO_OP and self.partition_num > 1:
                        for partid in self.result:
                            pos = self.job_list.index(jobid)
                            self.result[partid][pos] = rs[partid]
                    else:
                        self.result[jobid] = rs
                    self.send_status_to_master(jobid, conf.PENDING)
                    return
                self.job_data[jobid] = t_size
                self.job_list.append(jobid)
                self.total_size += t_size
                # store the result of one job to the final result
                if len(rs) == 1:
                    if self.dest == conf.DATA_TO_ANO_OP:
                        # dest is an AGGR op or an ORDER BY op, use 0 as the key
                        if 0 not in self.result:
                            self.result[0] = rs
                        else:
                            self.result[0].append(rs[0])
                        if self.is_checkpoint == 1:
                            self.write_data_to_disk(0, rs[0].getvalue())
                    else:
                        # dest is a UDX op, use jobid as the key
                        self.result[string.atoi(jobid)] = rs
                        if self.is_checkpoint == 1:
                            self.write_data_to_disk(0, rs[0].getvalue())
                else:
                    # use partid as the key
                    for i in range(len(rs)):
                        if i not in self.result:
                            self.result[i] = [rs[i]]
                        else:
                            self.result[i].append(rs[i])
                    if self.is_checkpoint == 1:
                        for i in range(len(rs)):
                            self.write_data_to_disk(i, rs[i].getvalue())
                # check if the whole data exceeds the LIMITATION
                if rs_type != self.MULTI_FILE:
                    if (self.is_checkpoint is not None and
                        self.is_checkpoint == conf.CHECKPOINT) or self.total_size > self.MAX_SIZE:
                        for dataid in self.result:
                            data = ""
                            for d in self.result[dataid]:
                                data += d.getvalue()
                            self.write_data_to_disk(dataid, data)
                        self.result_type = self.MULTI_FILE
                e_time = time.time()
                if self.total_time == 0:
                    self.total_time = (e_time - s_time)
                self.send_status_to_master(jobid, conf.PENDING)
            elif exp.lower().startswith("create"):
                ParaLiteLog.info("proc_create: START")
                ParaLiteLog.info("SQL: %s" % exp)
                self.proc_create(exp, target_db)
                ParaLiteLog.info("proc_create: FINISH")
                self.send_status_to_master(jobid, conf.ACK)
                self.is_running = False
            elif exp.lower().startswith("drop"):
                ParaLiteLog.info("proc_drop: START")
                self.proc_drop(exp, target_db)
                self.send_status_to_master(jobid, conf.ACK)
                self.is_running = False
            ParaLiteLog.debug("*****JOB %s******:finish" % jobid)
            self.ex_w_time += (time.time() - self.ex_s_time)
            self.ex_s_time = 0
        elif m[0] == conf.JOB_END:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            # all jobs are finished
            # create a dictionary to store the status of each part of data
            data_status = {}  # {data_id:[(pos_in_result, status)]}
            for dataid in self.result:
                if dataid not in data_status:
                    data_status[dataid] = []
                for i in range(len(self.result[dataid])):
                    data_status[dataid].append((i, 1))
            self.data_status = data_status
            self.reader = self.get_next_reader()
            self.send_rs_info_to_master(self.total_size, self.total_time)
            # distribute data
            if self.dest == conf.DATA_TO_ONE_CLIENT:
                self.distribute_data()
                self.send_status_to_master(" ".join(self.job_data), conf.ACK)
                self.is_running = False
            elif self.dest == conf.DATA_TO_DB:
                self.distribute_data()
        elif m[0] == conf.DLOAD_REPLY:
            reply = sep.join(m[1:])
            ParaLiteLog.info("receive the information from the master")
            ParaLiteLog.debug(reply)
            if len(self.data.getvalue()) != 0:
                dload_client.dload_client().load_internal_buffer(
                    reply, self.dest_table, self.data, self.fashion,
                    self.hash_key, self.hash_key_pos, self.db_col_sep,
                    self.db_row_sep, self.db_col_sep, False, "0", self.log_dir)
            # send END_TAG to the master
            client_id = "0"
            msg = sep.join([conf.REQ, conf.END_TAG, gethostname(), client_id])
            so_master = socket(AF_INET, SOCK_STREAM)
            so_master.connect((self.master_name, self.master_port))
            so_master.send("%10s%s" % (len(msg), msg))
            so_master.close()
            ParaLiteLog.debug("sending to master: %s" % (conf.END_TAG))
            ParaLiteLog.debug("----- dload client finish -------")
        elif m[0] == conf.DATA_PERSIST:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            # whether the data is required to be persisted or not
            self.process_ck_info(m)
        elif m[0] == conf.DATA_DISTRIBUTE:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            # send a part of data to the next operator
            # DATA_DISTRIBUTE:partition_num:destnode
            part_id, destnode = m[1:]
            data = self.get_data_by_part_id(self.result, string.atoi(part_id))
            # DATA message includes: type:id+data
            # the first 2 chars represent the opid
            msg = sep.join([conf.DATA, "%2s%s" % (self.opid, data)])
            if destnode == gethostname():
                # use the local socket
                addr = self.p_node[destnode][1]
                t = AF_UNIX
            else:
                addr = (destnode, self.p_node[destnode][0])
                t = AF_INET
            self.send_data_to_node(msg, t, addr)
            ParaLiteLog.debug("send data successfully %s %s --> %s" % (
                self.opid, gethostname(), destnode))
        elif m[0] == conf.DATA_DISTRIBUTE_UDX:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            # send data to a udx client
            # m[1:] = worker.id:jobid:(node:port | addr):size
            if len(m) == 6:
                w_id, jobid = m[1:3]
                addr = (m[3], string.atoi(m[4]))
                t = AF_INET
                bk = string.atoi(m[5])
            elif len(m) == 5:
                w_id, jobid = m[1:3]
                addr = m[3]
                t = AF_UNIX
                bk = string.atoi(m[4])
            data = self.get_data_by_blocksize(jobid, bk)
            if not data:
                # if we don't send something here, udx will not send KAL
                # again, then it will not receive data again and the whole
                # process will be blocked forever
                msg = sep.join([conf.DATA, "EMPTY"])
            else:
                msg = sep.join([conf.DATA, data])
            self.send_data_to_node(msg, t, addr)
        elif m[0] == conf.DATA_REPLICA:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            # message --> DATA_REPLICA:DATANODE:DATAID:DATA
            datanode, dataid = m[1:3]
            f_name = self.get_file_name_by_data_id(gethostname(), dataid)
            fr = open(f_name, "a")
            fr.write(m[3])
            fr.close()
        elif m[0] == conf.NODE_FAIL:
            ParaLiteLog.debug("MESSAGE: %s" % message)
            # message --> NODE_FAIL:FAILED_NODE:REPLICA_NODE
            failed_node, replica_node = m[1:3]
            self.failed_node.append(failed_node)
            if replica_node == gethostname():
                # load replica data for the failed node
                self.recovery_data(self.replica_result, replica_node)
    except Exception, e:
        es("in sql_proc : %s" % traceback.format_exc())
        ParaLiteLog.info(traceback.format_exc())
        self.is_running = False
        self.no_error = False
def range_data(self):
    ParaLiteLog.info("Now RANGE FASHION is not supported...")
def proc_select(self, jobid, exp, target_db):
    assert len(target_db) == 1
    cur_db = target_db[0]
    try:
        conn = sqlite3.connect(cur_db)
        conn.text_factory = str
        # register the user-defined aggregate
        conn.create_aggregate("mul", 1, mul)
        c = conn.cursor()
        """
        if self.temp_store != 0:
            c.execute('pragma temp_store=%s' % (self.temp_store))
        if self.cache_size != -1:
            c.execute('pragma cache_size=%s' % (self.cache_size))
        """
        # for test
        c.execute('pragma temp_store=memory')
        c.execute('pragma cache_size=2073741824')
        ParaLiteLog.info("start to execute sql: %s" % exp)
        col_sep = self.db_col_sep
        row_sep = self.db_row_sep
        num_of_dest = self.partition_num
        if self.dest == conf.DATA_TO_ANO_OP and num_of_dest > 1:
            columns = self.output
            split_key = self.split_key
            assert split_key is not None
            # partition data in hash fashion
            pos = []
            for key in split_key:
                pos.append(columns.index(key))
            data_part_list = []
            for i in range(self.partition_num):
                data_part_list.append(cStringIO.StringIO())
            size = 0
            t_size = 0
            for row in c.execute(exp):
                part_id = abs(hash(self.db_col_sep.join(
                    str(row[p]) for p in pos))) % num_of_dest
                #part_id = abs(hash(row[pos[0]])) % num_of_dest
                data = col_sep.join(str(s) for s in row)
                """
                size += len(data)
                if size > self.MAX_SIZE:
                    for partid in data_part_list:
                        fs = self.write_data_to_disk(partid, data_part_list[partid])
                        # delete all data in csio
                        data_part_list[partid].truncate(0)
                    t_size += size
                    size = 0
                    self.result_type = self.MULTI_FILE
                """
                data_part_list[part_id].write(data)
                data_part_list[part_id].write(row_sep)
            for i in range(len(data_part_list)):
                t_size += len(data_part_list[i].getvalue())
            ParaLiteLog.debug("finish to retrieve the result: %s" % t_size)
            if self.result_type == self.MULTI_FILE:
                for partid in range(len(data_part_list)):
                    self.write_data_to_disk(partid, data_part_list[partid].getvalue())
                del data_part_list
                return self.MULTI_FILE, None, t_size
            else:
                ########################
                # new_list = []
                # for d in data_part_list:
                #     new_list.append(d.getvalue())
                # return self.MULTI_BUFFER, new_list, t_size
                ###################
                return self.MULTI_BUFFER, data_part_list, t_size
        else:
            csio = cStringIO.StringIO()
            t_size = 0
            size = 0       # the size of the current data
            data_pos = []  # the file names of data if persisted
            for row in c.execute(exp):
                # NOTE: For an aggregation SQL, e.g. "select max(col) from T ...",
                # if there is no record in T, (None,) will be returned
                if row[0] is None:
                    continue
                data = col_sep.join(str(s) for s in row)
                size += len(data)
                if size >= self.MAX_SIZE:
                    self.result_type = self.MULTI_FILE
                    self.write_data_to_disk(jobid, csio.getvalue())
                    # delete all data in csio
                    csio.truncate(0)
                    t_size += size
                    size = 0
                csio.write(data)
                csio.write(row_sep)
            t_size += len(csio.getvalue())
            ParaLiteLog.debug("finish to retrieve the result: %s" % t_size)
            if self.result_type == conf.MULTI_FILE:
                self.write_data_to_disk(jobid, csio.getvalue())
                del csio
                return conf.MULTI_FILE, None, t_size
            else:
                return self.SINGLE_BUFFER, [csio], t_size
    except sqlite3.OperationalError, e:
        ParaLiteLog.info(traceback.format_exc())
        raise Exception("%s: QueryExecutionError: %s" % (gethostname(), traceback.format_exc()))
def load_internal_file(self, reply, opt, db_col_sep, LOG_DIR):
    ParaLiteLog.info("load_internal_file: START")
    table = opt.table
    files = opt.files
    col_sep = opt.col_sep
    row_sep = opt.row_sep
    fashion = opt.fashion
    key = opt.key
    key_pos = opt.key_pos
    is_replace = opt.replace
    self.db_col_sep = db_col_sep
    for f in files:
        self.files[f] = 1
    self.file_reader = open(self.get_next_file(), "rb")
    try:
        """
        received message = nodes # sub_dbs # chunk_num # replica_info
        nodes should be (| is SEP_IN_MSG):
            n1 : p1|l1 , n2 : p2|l2 , ...                 IF fashion = HASH_FASHION
            n1 : p1|l1|s1|num , n2 : p2|l2|s2|num , ...   IF fashion = ROUND_ROBIN
            TBD                                           IF fashion = RANGE_FASHION
        node_db_info: node1:[db_1_1] , node2:[db_1_2] , node3:[db_2_1], ...
        replica_info: db_1_1 db_1_1_r_1 node1 , db_1_2 db_1_2_r_1 node2 , ...
        """
        mm = reply.split("#")
        nodes = mm[0].split(",")
        sub_dbs = mm[1].split(",")
        chunk_num = string.atoi(mm[2])
        replica = mm[3]
        replica_info = {}  # {db_name : {replica_db_name:node}}
        if replica != "":
            for whole_re in replica.split(","):
                lll = whole_re.split(" ")
                if lll[0] not in replica_info:
                    replica_info[lll[0]] = {}
                replica_info[lll[0]][lll[1]] = lll[2]
        node_addr = {}  # {node:addr}
        for node in nodes:
            m = node.split(conf.SEP_IN_MSG)
            if m[0] == gethostname():
                addr = m[2]
            else:
                addr = (m[0], string.atoi(m[1]))
            node_addr[m[0]] = addr
        thds = []
        if nodes == []:
            ParaLiteLog.info("there is no data to load")
        elif fashion == conf.HASH_FASHION:
            ParaLiteLog.info(fashion)
            if row_sep is not None and row_sep != "\n":
                left_ds = ""
                while True:
                    dst = self.get_data_as_bk(DATA_MAX_SIZE)
                    if dst is None:
                        ParaLiteLog.info("really get data as bk: 0")
                        break
                    ParaLiteLog.info("really get data as bk: %s" % (len(dst)))
                    pos = dst.rfind(row_sep)
                    ds = left_ds + dst[0:pos]
                    left_ds = dst[pos + len(row_sep):]
                    del dst
                    db_buf = self.hash_data_file(ds, key_pos, nodes, row_sep,
                                                 col_sep, chunk_num, sub_dbs)
                    ParaLiteLog.debug("hash data finish %s" % len(ds))
                    del ds
                    for db in db_buf:
                        data = db_buf[db].getvalue()
                        node = db.split("_")[-3]
                        thd = threading.Thread(
                            target=self.send_to_node,
                            args=(db, table, data, node_addr[node],
                                  row_sep, col_sep, is_replace))
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                node = replica_info[db][rdb]
                                self.send_to_node(rdb, table, data, node_addr[node],
                                                  row_sep, col_sep, is_replace)
            else:
                while True:
                    ds = self.get_data_as_bk(DATA_MAX_SIZE)
                    if ds is None:
                        ParaLiteLog.info("really get data as bk: 0")
                        break
                    ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                    db_buf = self.hash_data_file(ds, key_pos, nodes, "\n",
                                                 col_sep, chunk_num, sub_dbs)
                    # log only the first sub-db buffer as a sample
                    for db in db_buf:
                        ParaLiteLog.debug(
                            "%s -- > %s" % (db, len(db_buf[db].getvalue())))
                        break
                    for db in db_buf:
                        data = db_buf[db].getvalue()
                        node = db.split("_")[-3]
                        thd = threading.Thread(
                            target=self.send_to_node,
                            args=(db, table, data, node_addr[node],
                                  row_sep, col_sep, is_replace))
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                node = replica_info[db][rdb]
                                self.send_to_node(rdb, table, data, node_addr[node],
                                                  row_sep, col_sep, is_replace)
                    del db_buf
                    del ds
        elif fashion == conf.REPLICATE_FASHION:
            self.replicate_data(table, files, total_size, nodes)
        elif fashion == conf.RANGE_FASHION:
            self.range_data()
        else:
            # ROUND_ROBIN fashion
            num_of_db = len(nodes) * chunk_num
            if row_sep is not None and row_sep != "\n":
                i = 0
                left_ds = ""
                while True:
                    db = sub_dbs[i % num_of_db]
                    node = db.split("_")[-3]
                    # the size field comes from the corresponding node entry
                    m = nodes[(i % num_of_db) / chunk_num].split(conf.SEP_IN_MSG)
                    size = string.atoi(m[3]) / chunk_num + 1
                    if size > DATA_MAX_SIZE:
                        ParaLiteLog.info("start to get data as bk: %s" % (DATA_MAX_SIZE))
                        ds = self.get_data_as_bk(DATA_MAX_SIZE)
                    else:
                        ParaLiteLog.info("start to get data as bk: %s" % (size))
                        ds = self.get_data_as_bk(size)
                    if ds is None:
                        ParaLiteLog.info("really get data as bk: 0")
                        break
                    ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                    pos = ds.rfind(row_sep)
                    send_ds = left_ds + ds[0:pos]
                    left_ds = ds[pos + len(row_sep):]
                    thd = threading.Thread(
                        target=self.send_to_node,
                        args=(db, table, send_ds, node_addr[node], row_sep,
                              col_sep, is_replace))
                    thd.setDaemon(True)
                    thd.start()
                    thds.append(thd)
                    if db in replica_info:
                        for rdb in replica_info[db]:
                            node = replica_info[db][rdb]
                            thd = threading.Thread(
                                target=self.send_to_node,
                                args=(rdb, table, send_ds, node_addr[node],
                                      row_sep, col_sep, is_replace))
                            thd.setDaemon(True)
                            thd.start()
                            thds.append(thd)
                    i += 1
            else:
                i = 0
                while True:
                    db = sub_dbs[i % num_of_db]
                    node = db.split("_")[-3]
                    # the size field comes from the corresponding node entry
                    m = nodes[(i % num_of_db) / chunk_num].split(conf.SEP_IN_MSG)
                    size = string.atoi(m[3]) / chunk_num + 1
                    if size > DATA_MAX_SIZE:
                        ParaLiteLog.info("start to get data as bk: %s" % (DATA_MAX_SIZE))
                        ds = self.get_data_as_bk(DATA_MAX_SIZE)
                    else:
                        ParaLiteLog.info("start to get data as bk: %s" % (size))
                        ds = self.get_data_as_bk(size)
                    if ds is None:
                        ParaLiteLog.info("really get data as bk: 0")
                        break
                    ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                    thd = threading.Thread(
                        target=self.send_to_node,
                        args=(db, table, ds, node_addr[node], row_sep,
                              col_sep, is_replace))
                    thd.setDaemon(True)
                    thd.start()
                    thds.append(thd)
                    if db in replica_info:
                        for rdb in replica_info[db]:
                            ParaLiteLog.info(rdb)
                            node = replica_info[db][rdb]
                            thd = threading.Thread(
                                target=self.send_to_node,
                                args=(rdb, table, ds, node_addr[node],
                                      row_sep, col_sep, is_replace))
                            thd.setDaemon(True)
                            thd.start()
                            thds.append(thd)
                    i += 1
            del ds
        for thd in thds:
            thd.join()
    except Exception, e:
        ParaLiteLog.debug(traceback.format_exc())
        raise Exception(traceback.format_exc())
def distribute_data(self):
    # handle the limit condition: get the first N records
    # E.g. "select ... limit 10": the master first decides the limit number
    # for each process and sets the limit value of each process to the
    # post-limit
    whole_data = cStringIO.StringIO()
    for i in self.result:
        for csio in self.result[i]:
            d = string.strip(csio.getvalue())
            if len(d) == 0:
                continue
            whole_data.write(d)
            whole_data.write("\n")
            del csio

    if self.distinct or self.limit != -1:
        data_list = whole_data.getvalue().split(self.db_row_sep)
        del whole_data
        if self.distinct:
            # set() drops duplicates; convert back to a list so it can be sliced
            data_list = list(set(data_list))
        if self.limit != -1:
            data_list = data_list[:self.limit]
        data = cStringIO.StringIO()
        data.write(self.db_row_sep.join(str(s) for s in data_list))
        del data_list
    else:
        data = whole_data

    if self.dest == conf.DATA_TO_DB:
        self.data = data
        col_sep = self.db_col_sep
        row_sep = self.db_row_sep
        master = (self.master_name, self.master_port)
        ParaLiteLog.info("proc_select: load data start")
        # send the load request to the master
        t_size = len(data.getvalue())
        sep = conf.SEP_IN_MSG
        tag = conf.LOAD_FROM_API
        if row_sep is None or row_sep == "\n":
            temp_sep = "NULL"
        else:
            temp_sep = row_sep
        msg = sep.join(
            str(s) for s in [conf.REQ, self.cqid, gethostname(), self.my_port,
                             self.dest_db, self.dest_table, t_size, tag,
                             self.fashion, temp_sep, "0"])
        so_master = socket(AF_INET, SOCK_STREAM)
        so_master.connect(master)
        so_master.send("%10s%s" % (len(msg), msg))
        so_master.close()
        # dload_client.dload_client().load_internal_buffer(
        #     master, self.cqid, gethostname(), self.my_port, self.dest_db,
        #     self.dest_table, data, conf.LOAD_FROM_API, self.fashion,
        #     self.hash_key, self.hash_key_pos, self.db_col_sep, row_sep,
        #     col_sep, False, "0", self.log_dir)
    elif self.dest == conf.DATA_TO_ONE_CLIENT:
        # send data to a random client
        random_num = random.randint(0, len(self.client_sock) - 1)
        addr = self.client_sock[random_num]
        sock = socket(AF_INET, SOCK_STREAM)
        sock.connect(addr)
        data_s = data.getvalue()
        ParaLiteLog.info("DATA SIZE = %s" % len(data_s))
        sock.send("%10s%s" % (len(data_s), data_s))
        re = sock.recv(10)
        assert re == "OK"
        sock.close()