Example #1
 def get_data_by_part_id(self, result, part_id):
     ParaLiteLog.debug("partition number : %s" % len(self.p_node))
     if self.dest != conf.DATA_TO_ANO_OP or (self.dest == conf.DATA_TO_ANO_OP and len(self.p_node) == 1):
         # part id is the job id
         rs = ""
         for dataid in self.result:
             for data in self.result[dataid]:
                 if isinstance(data, str):
                     # data is stored in file
                     f = open(data, "rb")
                     rs += f.read()
                     f.close()
                 else:
                     # data is stored in buffer
                     rs += data.getvalue()
         return rs
     
     rs = ""
     for part in result[part_id]:
         if isinstance(part, str):
             # data is stored in file
             f = open(part, "rb")
             rs += f.read()
             f.close()
         else:
             # data is stored in buffer
             rs += part.getvalue()
     return rs
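
A minimal helper sketch (the name is hypothetical) of the convention Example #1 relies on: each entry in a result list is either a file path (str) when the data was spilled to disk, or a cStringIO buffer when it is still in memory, and reading has to unify the two cases.

    def read_result_entry(entry):
        if isinstance(entry, str):
            # data persisted to a file; entry is its path
            f = open(entry, "rb")
            try:
                return f.read()
            finally:
                f.close()
        # data still held in a cStringIO buffer
        return entry.getvalue()
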
Example #2
    def process_ck_info(self, message):
        # message --> DATA_PERSIST:CHECKPOINT[:REPLICA_NODE:REPLICA_PORT]
        is_ck = message[1]
        self.is_checkpoint = is_ck
        if is_ck == conf.CHECKPOINT:
            replica_addr = (message[2], string.atoi(message[3]))
            if self.result_type != conf.MULTI_FILE:
                for dataid in self.result:
                    ds = ""
                    for data in self.result[dataid]:
                        ds += data.getvalue()
                    self.write_data_to_disk(dataid, ds)

                    f_name = self.get_file_name_by_data_id(gethostname(), dataid)
                    cmd = "scp %s %s:%s" % (f_name, replica_addr[0], f_name)
                    ParaLiteLog.debug("CDM: %s" % cmd)
                    os.system(cmd)

            else:
                for dataid in self.result:
                    f = open(self.result[dataid], "rb")
                    while True:
                        ds = f.read(self.max_size)
                        if not ds:
                            break
                        msg = conf.SEP_IN_MSG.join(
                            [conf.DATA_REPLICA, gethostname(), str(dataid), ds])
                        self.send_data_to_node(msg, AF_INET, replica_addr)
Example #3
    def sql_proc(self):
        try:
            ParaLiteLog.debug("sql proc : START")
            # start local socket server to listen all connections
            ch = self.iom.create_server_socket(AF_INET,
                                               SOCK_STREAM, 100, ("", self.my_port)) 
            n, self.my_port = ch.ss.getsockname()
            ParaLiteLog.debug("listen on port : %s ..." % str(self.my_port))
            
            # register local port to the master
            self.register_to_master(self.cqid, self.opid, gethostname(), self.my_port)
            ParaLiteLog.debug("reg to master: FINISH")
            
            while self.is_running:
                s_time = time.time()
                ev = self.next_event(None)
                if isinstance(ev, ioman_base.event_accept):
                    self.handle_accept(ev)
                if isinstance(ev, ioman_base.event_read):
                    if ev.data != "":
                        e_time = time.time()
                        self.handle_read(ev)

            for thd in self.threads:
                thd.join()
            for proc in self.processes:
                proc.join()
            ParaLiteLog.info("--sql node %s on %s is finished--" % (self.opid,
                                                                    gethostname()))
            #self.notifier.join()
        except KeyboardInterrupt, e:
            self.report_error("ParaLite received an interrupt signal and will close the process\n")
            ParaLiteLog.info("--sql node %s on %s is finished--" % (self.opid,
                                                                    gethostname()))
            sys.exit(1)
Example #4
    def distribute_data(self):
        whole_data = cStringIO.StringIO()
        for i in self.result:
            for csio in self.result[i]:
                d = string.strip(csio.getvalue())
                if len(d) == 0:
                    continue
                whole_data.write(d)
                whole_data.write("\n")
                del csio
            
        if self.distinct or self.limit != -1:
            data_list = whole_data.getvalue().split(self.db_row_sep)
            del whole_data
        
            if self.distinct:
                data_list = list(set(data_list))  # a list, so the limit slice below still works
            if self.limit != -1:
                data_list = data_list[:self.limit]

            data = cStringIO.StringIO()
            data.write(self.db_row_sep.join(str(s) for s in data_list))
            del data_list
        else:
            data = whole_data

        if self.dest == conf.DATA_TO_ONE_CLIENT:
            # send data to a random client
            random_num = random.randint(0, len(self.client_sock) - 1)
            addr = self.client_sock[random_num]
            sock = socket(AF_INET, SOCK_STREAM)
            sock.connect(addr)
            data_s = data.getvalue()
            sock.send("%10s%s" % (len(data_s), data_s))
            re = sock.recv(10)
            assert re == "OK"
            sock.close()

        elif self.dest == conf.DATA_TO_DB:
            self.data = data
            col_sep = self.db_col_sep
            row_sep = self.db_row_sep
            master = (self.master_name, self.master_port)
            ParaLiteLog.debug("Load data start:")
            # send request to the master
            t_size = len(data.getvalue())
            sep = conf.SEP_IN_MSG
            tag = conf.LOAD_FROM_API
            if row_sep is None or row_sep == "\n":
                temp_sep = "NULL"
            else:
                temp_sep = row_sep
            msg = sep.join(
                str(s) for s in [conf.REQ, self.cqid, gethostname(), 
                                 self.my_port, self.dest_db, self.dest_table,
                                 t_size, tag, self.fashion, temp_sep, "0"])
            so_master = socket(AF_INET, SOCK_STREAM)
            so_master.connect(master)
            so_master.send("%10s%s" % (len(msg),msg))
            so_master.close()
Example #5
    def hash_data_file(self, data, key_pos, nodes, row_sep, col_sep, chunk_num, sub_dbs):
        sep = conf.SEP_IN_MSG
        db_buf = {}
        for db in sub_dbs:
            db_buf[db] = cStringIO.StringIO()
        if col_sep is None: SEP = self.db_col_sep
        else: SEP = col_sep
        records = data.split(row_sep)
        count = 1
        for line in records:
            if line == "":
                continue
            key = " ".join(line.strip().split(SEP)[kp] for kp in key_pos)
            pnum = abs(hash(key)) % (len(nodes)*chunk_num)
            if pnum == 0:
                count += 1
            db_name = sub_dbs[pnum]
            db_buf[db_name].write("%s%s" % (line.strip(), row_sep))
            
        ParaLiteLog.debug("count %s" % count)
        
        for db in db_buf:
            ParaLiteLog.debug("%s -- > %s" % (db, len(db_buf[db].getvalue())))
            break

        return db_buf
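
A small illustration (with made-up values) of the partitioning rule above: the key columns are joined with spaces, hashed, and mapped onto len(nodes) * chunk_num sub-database buckets.

    nodes = ["host1", "host2"]
    chunk_num = 2
    sub_dbs = ["db_0_1_1", "db_0_1_2", "db_0_2_1", "db_0_2_2"]
    line = "alice|42|tokyo"
    key_pos = [0, 2]
    key = " ".join(line.split("|")[kp] for kp in key_pos)  # "alice tokyo"
    pnum = abs(hash(key)) % (len(nodes) * chunk_num)       # bucket in [0, 4)
    print sub_dbs[pnum]   # the sub-database this record is buffered into
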
Example #6
 def send_to_node(self, db, table, data, addr, row_sep, col_sep, is_replace):
     sep = conf.SEP_IN_MSG
     req_info = "%s%s%s%s%s%s%s%s%s%s%s%s%s" % (conf.INFO, sep, db, sep, table, sep, self.db_col_sep, sep, col_sep, sep, row_sep, sep,is_replace)
     ParaLiteLog.info("sending %s  --> %s" % (req_info, addr[0]))
     self.really_send(addr, req_info)
     # the first 10 characters carry the length of the database name
     self.really_send(addr, "%10s%s%s" % (len(db), db, data))
     ParaLiteLog.info("sending data : %s --> %s" % (len(data), repr(addr)))
Example #7
 def recv_bytes(self, so, n):
     A = []
     while n > 0:
         x = so.recv(n)
         if x == "": break
         A.append(x)
         ParaLiteLog.debug(len(x))
         n = n - len(x)
     return string.join(A, "")
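
recv_bytes is one half of the 10-character length-prefix framing used throughout these examples (see the "%10s%s" sends in Examples #4, #20 and #26). A minimal sketch of both directions, assuming a connected socket so; the helper names are illustrative, not part of the source:

    import string

    def send_framed(self, so, msg):
        # left-pad the payload length into a fixed 10-character header
        so.send("%10s%s" % (len(msg), msg))

    def recv_framed(self, so):
        header = self.recv_bytes(so, 10)
        if header == "":
            return ""                    # peer closed the connection
        n = string.atoi(header.strip())  # payload length
        return self.recv_bytes(so, n)
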
Example #8
 def step(self, value):
     try:
         newvalue = value
         if isinstance(value, unicode):
             newvalue = value.encode("ascii")
         if isinstance(newvalue, str):
             newvalue = string.atoi(newvalue)
         self.product *= newvalue
     except:
         ParaLiteLog.info(traceback.format_exc())
         raise(Exception(traceback.format_exc()))
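
step above is the accumulator of a SQLite user-defined aggregate; Example #28 registers it with conn.create_aggregate("mul", 1, mul). A sketch of the full class, assuming the product starts at 1 and that finalize simply returns it (neither detail appears in the source):

    import string
    import traceback

    class mul:
        def __init__(self):
            self.product = 1   # assumed initial value

        def step(self, value):
            # coerce unicode/str cell values to int, then accumulate
            newvalue = value
            if isinstance(value, unicode):
                newvalue = value.encode("ascii")
            if isinstance(newvalue, str):
                newvalue = string.atoi(newvalue)
            self.product *= newvalue

        def finalize(self):
            # assumed: sqlite3 calls this once to obtain the aggregate result
            return self.product
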
Example #9
 def proc_drop(self, exp, target_db):
     try:
         for db in target_db:
             conn = sqlite3.connect(db)
             c = conn.cursor()
             c.execute(exp)
             conn.commit()
             conn.close()
     except sqlite3.OperationalError, e:
         es("%s: %s" % (gethostname(), " ".join(e.args)))
         ParaLiteLog.info(traceback.format_exc())
Example #10
 def register_to_master(self, cqid, opid, node, port):
     sep = conf.SEP_IN_MSG
     msg = sep.join([conf.REG, conf.DATA_NODE, cqid, opid, gethostname(), str(self.my_port), self.local_addr])
     ParaLiteLog.debug("MASTER_NODE: %s  MASTER_PORT: %s" % (self.master_name, self.master_port))
     addr = (self.master_name, self.master_port)
     sock = socket(AF_INET, SOCK_STREAM)
     try:
         sock.connect(addr)
     except Exception, e:
         ParaLiteLog.error("Error in register_to_master: %s" % traceback.format_exc())
         if e.errno == 4:
             # errno 4 is EINTR: the connect was interrupted, so retry once
             sock.connect(addr)
     sock.send("%10s%s" % (len(msg), msg))
     sock.close()
Example #11
    def start(self):
        try:
            # start socket server to listen all connections
            ch = self.iom.create_server_socket(AF_INET, SOCK_STREAM, 100, ("", self.my_port))
            n, self.my_port = ch.ss.getsockname()
            ParaLiteLog.debug("listen on port : %s ..." % str(self.my_port))

            # start socket server for local connections
            self.local_addr = "/tmp/paralite-local-addr-orderby-%s-%s-%s" % (gethostname(), self.cqid, self.opid)
            if os.path.exists(self.local_addr):
                os.remove(self.local_addr)
            self.iom.create_server_socket(AF_UNIX, SOCK_STREAM, 10, self.local_addr)

            # register local port to the master
            self.register_to_master(self.cqid, self.opid, gethostname(), self.my_port)
            ParaLiteLog.debug("reg to master: FINISH")

            while self.is_running:
                ev = self.next_event(None)
                if isinstance(ev, ioman_base.event_accept):
                    self.handle_accept(ev)
                if isinstance(ev, ioman_base.event_read):
                    if ev.data != "":
                        self.handle_read(ev)

            ParaLiteLog.info("--orderby node %s on %s is finished--" % (self.opid, gethostname()))

        except KeyboardInterrupt, e:
            self.report_error("ParaLite received an interrupt signal and will close the process\n")
            ParaLiteLog.info("--orderby node %s on %s is finished--" % (self.opid, gethostname()))
            sys.exit(1)
Example #12
 def proc_create(self, exp, target_db):
     try:
         # first of all, check whether the directory holding the database exists
         for db in target_db:
             parent = db[0:db.rfind(os.sep)]
             if not os.path.exists(parent):
                 os.makedirs(parent)
             conn = sqlite3.connect(db)
             c = conn.cursor()
             c.execute(exp)
             conn.commit()
             conn.close()
     except sqlite3.OperationalError, e:
         ParaLiteLog.info(traceback.format_exc())
         raise(Exception("ERROR: in proc_create: %s: %s" % (gethostname(),
                                                            " ".join(e.args))))
Example #13
def main():
    if len(sys.argv) != 7:
        sys.exit(1)
    proc = OrderbyOp()
    proc.master_name = sys.argv[1]
    proc.master_port = string.atoi(sys.argv[2])
    proc.cqid = sys.argv[3]
    proc.opid = sys.argv[4]
    proc.my_port = string.atoi(sys.argv[5])
    proc.log_dir = sys.argv[6]
    if not os.path.exists(proc.log_dir):
        os.makedirs(proc.log_dir)
    cur_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time()))
    ParaLiteLog.init("%s/orderby-%s-%s.log" % (proc.log_dir, gethostname(), cur_time), logging.DEBUG)
    ParaLiteLog.info("--orderby node %s on %s is started" % (proc.opid, gethostname()))
    proc.start()
Example #14
def main():
    if len(sys.argv) != 7:
        sys.exit(1)
    proc = SqlOp()
    proc.master_name = sys.argv[1]
    proc.master_port = string.atoi(sys.argv[2])
    proc.cqid = sys.argv[3]
    proc.opid = sys.argv[4]
    proc.my_port = string.atoi(sys.argv[5])
    proc.log_dir = sys.argv[6]
    
    if not os.path.exists(proc.log_dir): os.makedirs(proc.log_dir)
    ParaLiteLog.init("%s/sql-%s-%s-%s.log" % (
        proc.log_dir, gethostname(), proc.cqid, proc.opid),
                     logging.DEBUG)
    ParaLiteLog.info("--sql node %s on %s is started" % (proc.opid, gethostname()))
    proc.sql_proc()
Example #15
 def distinct_data(self, data_list):
     try:
         csio = cStringIO.StringIO()
         whole_data = ""
         for data in data_list:
             if data.strip() == "":
                 continue
             whole_data += data.strip() + self.db_row_sep
             del(data)
         if whole_data == "":
             return None, None, None
         whole_data = set(whole_data.strip().split(self.db_row_sep))
         csio.write(self.db_row_sep.join(whole_data))    
         return conf.SINGLE_BUFFER, [csio], len(csio.getvalue())
     except Exception, e:
         ParaLiteLog.debug(traceback.format_exc())
         self.report_error("ERROR in order_by.py : %s" % traceback.format_exc())
         return None, None, None
Example #16
    def get_data_by_blocksize(self, jobid, bksize):
        if self.reader is None:
            return None
        data = self.reader.read(bksize)
        if not data:
            read_size = 0
        else:
            read_size = len(data)

        if read_size < bksize or (read_size == bksize and read_size == self.job_data[jobid]):
            # while True:
            #     if self.reader is not None:
            #         self.reader.close()
            #     self.reader = None
            #     self.reader = self.get_next_reader()
            #     if self.reader is None:
            #         break
            #     new_data = self.reader.read(bksize - read_size)
            #     data += new_data
            #     read_size = len(data)
            #     if read_size >= bksize:
            #         break
            if self.reader is not None:
                self.reader.close()
            self.reader = None
            self.reader = self.get_next_reader()
            
            return data
            
        if self.db_row_sep == "\n":
            if not data.endswith("\n"):
                extra_data = self.reader.readline()
                if extra_data:
                    data += extra_data
            return data
        else:
            if data:
                pos = data.rfind(self.db_row_sep)
                ParaLiteLog.info(pos)
                send_ds =  self.left_ds + data[0:pos]
                self.left_ds = data[pos+len(self.db_row_sep):]
                return send_ds
            else:
                return None
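
A minimal sketch of the carry-over logic in the non-newline branch above: cut the block at the last complete row separator and keep the tail for the next read (the function name and variables are illustrative):

    def split_complete_rows(block, leftover, row_sep):
        pos = block.rfind(row_sep)
        if pos == -1:
            # no complete row yet; carry everything over
            return "", leftover + block
        complete = leftover + block[0:pos]
        new_leftover = block[pos + len(row_sep):]
        return complete, new_leftover
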
Example #17
 def handle_read(self, ev):
     data = ev.data
     if data == conf.END_TAG:
         ParaLiteLog.info("receive: END_TAG")
         self.is_running = False
         self.queue.put(conf.END_TAG)
     elif data.startswith(conf.INFO):
         m = data.split(conf.SEP_IN_MSG)
         assert len(m) == 7
         if self.table is None: self.table = m[2]
         if self.db_col_sep is None: self.db_col_sep = m[3]
         if self.cmd_col_sep is None: self.cmd_col_sep = m[4]
         if self.cmd_row_sep is None: self.cmd_row_sep = m[5]
         if self.is_replace is None:
             self.is_replace = m[6]
             ParaLiteLog.info("DB_COL_SEP = %s CMD_COL_SEP = %s  CMD_ROW_SEP = %s is_replace = %s" % (self.db_col_sep, self.cmd_col_sep, self.cmd_row_sep, self.is_replace))
     else:
         # TODO: we can control the buffer size here.
         self.queue.put(data)
Example #18
 def scan_data_queue(self):
     while True:
         data = self.queue.get()
         if data == conf.END_TAG:
             ParaLiteLog.info("SCAN DATA QUEUE : END")
             break
         try:
             pos = 10+string.atoi(data[0:10].strip())
             target_db = data[10:pos]
             data = data[pos:]
             """
             thd = threading.Thread(target=self.write_to_db, args=(data, len(data)))
             thd.setDaemon(True)
             thd.start()
             self.threads.append(thd)
             """
             self.write_to_db(target_db, data, len(data))
             del(data)
         except Exception, e:
             ParaLiteLog.info(traceback.format_exc())
             es("in write_to_db: %s" % (traceback.format_exc()))
             sys.exit(1)
Example #19
    def start(self, argument):
        self.parse(argument)
        
        cur_time = time.strftime('%Y-%m-%d-%H-%M-%S',time.localtime(time.time()))
        ParaLiteLog.init("%s/dload-server-%s-%s.log" % (self.log_dir,
                                                        gethostname(), cur_time),
                         logging.DEBUG)

        ParaLiteLog.info("START")
        ParaLiteLog.info("parse the argumens sucessfully")
        ss = time.time()
        scan_thd = threading.Thread(target=self.scan_data_queue)
        scan_thd.setDaemon(True)
        scan_thd.start()
        
        t = time.strftime('%Y-%m-%d-%H-%M-%S',time.localtime(time.time()))
        self.local_socket = "%s%s%s-%s-%s" % (self.log_dir, os.sep, gethostname(), t, "UNIX.d")
        self.iom.create_server_socket(AF_UNIX, SOCK_STREAM, 5, self.local_socket)
        ch = self.iom.create_server_socket(AF_INET, SOCK_STREAM, 5, ("", self.port))
        n, self.port = ch.ss.getsockname()
        ParaLiteLog.info("global socket addr = %s" % (repr(ch.ss.getsockname())))
        self.register_to_master()
        
        try:
            while self.is_running:
                ev = self.next_event(None)
                if isinstance(ev, ioman_base.event_accept):
                    self.handle_accept(ev)
                elif isinstance(ev, ioman_base.event_read):
                    if ev.data != "":
                        self.handle_read(ev)
                        
        except Exception, e:
            es("in dload_server.py : %s" % traceback.format_exc())
            ParaLiteLog.info(traceback.format_exc())
            sys.exit(1)
Example #20
    def handle_read(self, event):
        message = event.data[10:]

        m = message.split(conf.SEP_IN_MSG)
        try:
            if m[0] == conf.JOB_ARGUMENT:
                self.parse_args(m[1])
                ParaLiteLog.info("parse arguments: FINISH")

            elif m[0] == conf.JOB:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                self.cur_jobid = m[1]

            elif m[0] == conf.DATA:
                data_id = string.strip(m[1][0:2])
                data = m[1][2:]
                self.source_data.append(data)

                # sort data
                if not self.is_data_ready(self.source_data, self.num_of_children):
                    return

                ParaLiteLog.debug("****SORT DATA****: start")
                s = 0
                for data in self.source_data:
                    s += len(data)
                ParaLiteLog.debug("source data size: %s" % s)
                s_time = time.time()
                rs_type, rs, t_size = self.sort(self.source_data)
                del self.source_data
                ParaLiteLog.debug("****SORT DATA****: finish")

                if rs_type is None:
                    self.send_status_to_master(self.cur_jobid, conf.PENDING)
                    return

                self.total_size += t_size
                self.source_data = {}

                # store the result of one job to the final result
                for i in range(len(rs)):
                    if i not in self.result:
                        self.result[i] = [rs[i]]
                    else:
                        self.result[i].append(rs[i])

                if rs_type != conf.MULTI_FILE:
                    # check if the whole data exceeds the LIMITATION
                    if self.total_size > self.MAX_SIZE:
                        self.write_data_to_disk()
                        self.result_type = conf.MULTI_FILE

                e_time = time.time()
                self.total_time += e_time - s_time

                self.send_status_to_master(self.cur_jobid, conf.PENDING)

            elif m[0] == conf.JOB_END:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # all jobs are finished
                self.send_rs_info_to_master(self.total_size, self.total_time)

                # distribute data
                if self.dest == conf.DATA_TO_ONE_CLIENT:
                    ParaLiteLog.debug("dest = %s" % self.dest)
                    self.distribute_data()
                    self.send_status_to_master(self.cur_jobid, conf.ACK)
                    self.is_running = False
                elif self.dest == conf.DATA_TO_DB:
                    self.distribute_data()

            elif m[0] == conf.DATA_PERSIST:
                # check whether the data is required to be persisted
                if m[1] == conf.CHECKPOINT:
                    self.write_data_to_disk()

            elif m[0] == conf.DLOAD_REPLY:
                sep = conf.SEP_IN_MSG
                reply = sep.join(m[1:])
                ParaLiteLog.info("receive the information from the master")
                ParaLiteLog.debug(reply)

                if len(self.data.getvalue()) != 0:
                    dload_client.dload_client().load_internal_buffer(
                        reply,
                        self.dest_table,
                        self.data,
                        self.fashion,
                        self.hash_key,
                        self.hash_key_pos,
                        self.db_col_sep,
                        self.db_row_sep,
                        self.db_col_sep,
                        False,
                        "0",
                        self.log_dir,
                    )

                # send END_TAG to the master
                client_id = "0"
                msg = sep.join([conf.REQ, conf.END_TAG, gethostname(), client_id])
                so_master = socket(AF_INET, SOCK_STREAM)
                so_master.connect((self.master_name, self.master_port))
                so_master.send("%10s%s" % (len(msg), msg))
                so_master.close()
                ParaLiteLog.debug("sending to master: %s" % (conf.END_TAG))
                ParaLiteLog.debug("----- dload client finish -------")

            elif message == conf.DLOAD_END_TAG:
                ParaLiteLog.debug("---------import finish---------")
                self.send_status_to_master(" ".join(self.cur_jobid), conf.ACK)
                self.is_running = False

            elif m[0] == conf.EXIT:
                self.is_running = False

            elif m[0] == conf.NODE_FAIL:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # message --> NODE_FAIL:FAILED_NODE:REPLICA_NODE
                failed_node, replica_node = m[1:3]
                self.failed_node.append(failed_node)
                if replica_node != "" and replica_node == gethostname():
                    # load replica data for the failed node
                    self.recovery_data(self.replica_result, replica_node)
                ParaLiteLog.debug("Finish to handle node failure message")

        except Exception, e:
            es(traceback.format_exc())
            ParaLiteLog.info(traceback.format_exc())
            self.is_running = False
            self.no_error = False
Example #21
 def process_job(self, jobid, exp, target_db, process_queue):
     s_time = time.time()
     rs_type, rs, t_size = self.proc_select(jobid, exp, target_db)
     e_time = time.time()
     process_queue.put((jobid, rs_type, rs, t_size, e_time-s_time))
     ParaLiteLog.debug("Job %s cost %s " % (jobid, e_time-s_time))
Example #22
    def sort(self, data_list):
        try:
            ParaLiteLog.debug("sort data START")
            csio = cStringIO.StringIO()
            whole_data = ""
            for data in data_list:
                if data.strip() == "":
                    continue
                whole_data += data.strip() + self.db_row_sep
                del (data)
            if whole_data == "":
                return None, None, None

            whole_data = whole_data.strip().split(self.db_row_sep)
            ParaLiteLog.debug("order key %s" % (str(self.order_key)))
            ParaLiteLog.debug("order type %s" % (str(self.order_type)))

            # get the column positions to be sorted in the input
            key_pos = []  # (pos:string_or_int:reverse_or_not)
            for key in self.order_key:
                pos = self.input.index(key)
                if key not in self.attrs:
                    key_type = conf.STRING
                else:
                    key_type = self.attrs[key]
                if self.order_type[self.order_key.index(key)] == conf.DESC:
                    key_pos.append((pos, key_type, True))
                else:
                    key_pos.append((pos, key_type, False))

            # check if the ordering type is the same or not
            flag = 0
            t0 = self.order_type[0]
            for t in self.order_type:
                if t != t0:
                    flag = 1
                    break

            if flag == 1:
                # for different ordering type, we have to perform sort several times
                i = len(key_pos) - 1
                while i >= 0:
                    pos = key_pos[i]
                    if pos[1] == conf.INT:
                        whole_data.sort(key=lambda l: string.atoi(l.split(self.db_col_sep)[pos[0]]), reverse=pos[2])
                    elif pos[1] == conf.FLOAT or pos[1] == conf.REAL:
                        whole_data.sort(key=lambda l: float(l.split(self.db_col_sep)[pos[0]]), reverse=pos[2])
                    else:
                        whole_data.sort(key=lambda l: l.split(self.db_col_sep)[pos[0]], reverse=pos[2])
                    i -= 1
            else:
                sort_key = []
                col_sep = self.db_col_sep
                for pos in key_pos:

                    if pos[1] == conf.INT:
                        sort_key.append("string.atoi(l.split('%s')[%s])" % (self.db_col_sep, pos[0]))
                    elif pos[1] == conf.FLOAT or pos[1] == conf.REAL:
                        sort_key.append("float(l.split('%s')[%s])" % (self.db_col_sep, pos[0]))
                    else:
                        sort_key.append("l.split('%s')[%s]" % (self.db_col_sep, pos[0]))

                whole_data.sort(key=lambda l: eval(",".join(sort_key)), reverse=key_pos[0][2])

            csio.write(self.db_row_sep.join(whole_data))
            # if len(self.db_col_sep) == 1 and self.db_row_sep == "\n":
            #     ParaLiteLog.debug("sort data: shell")
            #     self.shell_sort(data_list, csio)
            # else:
            #     ParaLiteLog.debug("sort data: quick sort")
            #     self.quick_sort(data_list, csio)
            return conf.SINGLE_BUFFER, [csio], len(csio.getvalue())
        except Exception, e:
            ParaLiteLog.debug(traceback.format_exc())
            self.report_error("ERROR in order_by.py : %s" % traceback.format_exc())
            return None, None, None
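
The mixed-ordering branch above leans on the stability of Python's sort: applying one sort per key, from the least significant key to the most significant, yields a correct multi-key ordering even when ASC and DESC are mixed. A standalone illustration with made-up rows:

    rows = ["b|2", "a|1", "a|3", "b|1"]
    # order by column 0 ASC, then column 1 DESC
    rows.sort(key=lambda l: int(l.split("|")[1]), reverse=True)  # minor key first
    rows.sort(key=lambda l: l.split("|")[0])                     # major key last
    print rows   # ['a|3', 'a|1', 'b|2', 'b|1']
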
Example #23
            while self.is_running:
                ev = self.next_event(None)
                if isinstance(ev, ioman_base.event_accept):
                    self.handle_accept(ev)
                if isinstance(ev, ioman_base.event_read):
                    if ev.data != "":
                        self.handle_read(ev)

            ParaLiteLog.info("--orderby node %s on %s is finished--" % (self.opid, gethostname()))

        except KeyboardInterrupt, e:
            self.report_error("ParaLite received an interrupt signal and will close the process\n")
            ParaLiteLog.info("--orderby node %s on %s is finished--" % (self.opid, gethostname()))
            sys.exit(1)
        except Exception, e1:
            ParaLiteLog.debug(traceback.format_exc())
            self.report_error(traceback.format_exc())
            sys.exit(1)

    def handle_read(self, event):
        message = event.data[10:]

        m = message.split(conf.SEP_IN_MSG)
        try:
            if m[0] == conf.JOB_ARGUMENT:
                self.parse_args(m[1])
                ParaLiteLog.info("parse arguments: FINISH")

            elif m[0] == conf.JOB:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                self.cur_jobid = m[1]
Example #24
    def parse_func_2(self):
        """
        parse each expression to get useful information: input is different
        input -->      ***[count(*), sum(a), sum(a+b*c)]***
        output -->     ***[count(*), sum(a) + 1, avg(a+b*c)]***
        expression --> ['count(*)', 'sum(a)', 'avg(a+b*c)']
        ==> [ 
             ['count', 1,       0, '*',       1,  0],
             ['sum',   2,       0, 'a',       2,  1, [1], lambda _a:_a+1 ],
             ['sum',   3,       0, 'a+b*c',   3,  0]
            ]
        """
        # if there is avg function, convert it to sum and count
        avg_pos = []
        for fun in self.function:
            if fun.find("avg(") != -1:
                avg_pos.append(self.function.index(fun))
        if avg_pos != []:
            if "count(*)" not in self.function:
                self.function.append("count(*)")
        # sometimes a result column has more than one function, so the pos in
        # self.expr and in self.expression may differ; this counter tracks the
        # pos of a function in self.expr, and pos_map records the mapping
        # between the two kinds of positions.
        fun_counter = 0 
        pos_map = {}
        for expr_num in range(len(self.function)):
            expr = self.function[expr_num]
            pos_map[expr_num] = fun_counter

            # parse "sum(a) + 1" --> ['sum(a)', '+', '1']
            if expr.find("count(*)") != -1:
                expr = expr.replace("count(*)", "count(a_a)")
            _expr = newparser.parse_column_expr(expr)
            
            #  to describe the pos of elements in the outer arithmetic operation
            outer_ao_pos = [] 
            new_expr = expr
            new_args = []
            ParaLiteLog.debug(self.input)
            for ele in _expr:
                if re.match("(.*)\((.*)\)", ele):
                    parsed_expr = []                                
                    new_expr = new_expr.replace(ele, "_col%s" % str(fun_counter))
                    new_args.append("_col%s" % str(fun_counter))
                    func_name = ele[0:ele.find("(")]
                    if func_name not in conf.GENERAL_FUNC:
                        return False, "ParaLite cannot support aggregation function %s" % func_name
                    if func_name == "avg":
                        ele = ele.replace("avg", "sum")
                    fun_counter += 1
                    func_attr = ele[ele.find("(") + 1 : ele.rfind(")")]
                    parsed_expr.append(func_name)
                    if func_attr == "a_a": 
                        func_attr = "*"
                        ele = ele.replace("a_a", "*")
                        expr = expr.replace("count(a_a)", "count(*)")
                        opexpr = [func_attr]
                    pos_in_input = self.input.index(ele)
                    parsed_expr.append(pos_in_input)
                    parsed_expr.append(0)
                    parsed_expr.append(ele)
                    if expr in self.output: parsed_expr.append(self.output.index(expr))
                    else: parsed_expr.append(-1)
                    self.expr.append(parsed_expr)
                    outer_ao_pos.append(fun_counter - 1)
                else:
                    # other operator element: + - * / ^
                    continue
            cur_pos = pos_map[expr_num]
            if cur_pos >= len(self.expr):
                # the exception that select sum(a), count(*), avg(a) ... does not need
                # to do anything for avg(a)
                continue
            if len(_expr) == 1:
                self.expr[cur_pos].append(0)
            else:
                self.expr[cur_pos].append(1)
                self.expr[cur_pos].append(outer_ao_pos)

                tempexpr = new_expr
                tempargs = ",".join(new_args)
                for eacharg in new_args:
                    newarg = eacharg.replace(".", "_")
                    tempexpr = tempexpr.replace(eacharg, newarg)
                    tempargs = tempargs.replace(eacharg, newarg)
                    
                self.expr[cur_pos].append(
                    eval("lambda %s:%s" % (tempargs, tempexpr)))
        self.pos_map = pos_map
        return True, None
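
A small illustration (with a hypothetical expression) of the eval trick at the end of parse_func_2: the aggregate inside "sum(a) + 1" is replaced by a placeholder argument, and the remaining outer arithmetic is compiled into a callable that is later applied to the aggregated value.

    new_expr = "_col0 + 1"   # outer arithmetic of "sum(a) + 1"
    new_args = "_col0"
    post_fn = eval("lambda %s:%s" % (new_args, new_expr))
    print post_fn(10)        # 11: applied to the aggregated sum
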
Example #25
    def load_internal_buffer(self, reply, table, buf, fashion, key, key_pos, 
                             db_col_sep, row_sep, col_sep, is_replace, client_id, 
                             LOG_DIR):
        ParaLiteLog.info("load_internal: START")
        ParaLiteLog.info("row separator = %s col separator = %s" % (row_sep, col_sep) )
        self.db_col_sep = db_col_sep
        total_size = len(buf.getvalue())
        try:
            """
            received message = nodes # sub_dbs # chunk_num # replica_info 
            
            nodes should be:
            n1:p1:l1 , n2:p2:l2 , ...               IF fashion = HASH_FASHION 
            n1:p1:l1:s1:num , n2:p2:l2:s2:num , ... IF fashion = ROUND_ROBIN
            TBD                                     IF fashion = RANGE_FASHION

            node_db_info: db_1_1 , db_1_2 , db_2_1, ...
            replica_info: db_1_1 db_1_1_r_1 node1 , db_1_2 db_1_2_r_1 node2 , ...
            """
            mm = reply.split("#")
            ParaLiteLog.info("receive the information from the master %s" % mm)
            nodes = mm[0].split(",")
            sub_dbs = mm[1].split(",")
            chunk_num = string.atoi(mm[2])
            replica = mm[3]
            
            replica_info = {} # {db_name : {replica_db_name:node}}
            if replica != "":
                for ll in replica.split(","):
                    lll = ll.split(" ")
                    if lll[0] not in replica_info:
                        replica_info[lll[0]] = {}
                    replica_info[lll[0]][lll[1]] = lll[2]
            ParaLiteLog.info(nodes)
            node_addr = {} # {node:addr}
            for node in nodes:
                m = node.split(conf.SEP_IN_MSG)
                if m[0] == gethostname(): addr = m[2]
                else: addr = (m[0], string.atoi(m[1]))
                node_addr[m[0]] = addr

            ss1 = time.time()
            if nodes == []:
                ParaLiteLog.info("there is no data to load")
            elif fashion == conf.HASH_FASHION:
                ParaLiteLog.info(fashion)
                # get the data for each sub db
                # db_buf = {db_name, buffer_of_data}
                db_buf = self.hash_data_buffer(buf, key_pos, nodes, row_sep, col_sep, chunk_num, sub_dbs)
                for db in db_buf:
                    data = db_buf[db].getvalue()
                    node = db.split("_")[-3]
                    self.send_to_node(db, table, data, node_addr[node], row_sep, col_sep, is_replace)
                    if db in replica_info:
                        for rdb in replica_info[db]:
                            node = replica_info[db][rdb]                            
                            self.send_to_node(rdb, table, data, node_addr[node], row_sep, col_sep, is_replace)
                """
                buf_scanner = threading.Thread(target=self.scan_buf,
                args=(table, node_buf, node_addr, row_sep, col_sep, is_replace))
                buf_scanner.setDaemon(True)
                buf_scanner.start()
                buf_scanner.join()
                """
            elif fashion == conf.REPLICATE_FASHION:
                self.replicate_data(table, files, total_size, nodes)
            elif fashion == conf.RANGE_FASHION:
                self.range_data()
            else:
                thds = []
                num_of_db = len(nodes) * chunk_num
                if row_sep is not None and row_sep != "\n":
                    whole_data = buf.getvalue()
                    lines = whole_data.split(row_sep)
                    if lines[len(lines)-1] == "":
                        lines.pop(len(lines)-1)
                    l = len(lines)
                    if l % num_of_db == 0:
                        num_each = l / num_of_db
                    else:
                        num_each = l / num_of_db + 1
                    i = 0
                    while i < num_of_db:
                        db = sub_dbs[i]
                        node = db.split("_")[-3]
                        cur_num = i*num_each + num_each
                        if cur_num > l:
                            cur_num = l
                        ds = row_sep.join(lines[i*num_each:cur_num])
                        thd = threading.Thread(target=self.send_to_node,
                                               args=(db, table, ds, node_addr[node], row_sep,
                                                     col_sep, is_replace))
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                node = replica_info[db][rdb]
                                thd = threading.Thread(target=self.send_to_node,
                                                       args=(rdb, table, ds, node_addr[node],
                                                             row_sep, col_sep, is_replace))
                                thd.setDaemon(True)
                                thd.start()
                                thds.append(thd)
                        i += 1
                else:
                    buf.seek(0)
                    i = 0
                    while i < num_of_db:
                        db = sub_dbs[i]
                        node = db.split("_")[-3]
                        node_id = i / chunk_num
                        size = string.atoi(nodes[node_id].split(conf.SEP_IN_MSG)[3]) / chunk_num
                        ParaLiteLog.info("start to get data as bk: %s" % (size))
                        ds = buf.read(size)
                        if ds is None:
                            ParaLiteLog.info("really get data as bk: 0")
                            continue
                        if not ds.endswith("\n"):
                            ds += buf.readline()

                        ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                        thd = threading.Thread(target=self.send_to_node,
                                               args=(db, table, ds, node_addr[node],
                                                     row_sep, col_sep, is_replace))
                        
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                node = replica_info[db][rdb]
                                thd = threading.Thread(target=self.send_to_node,
                                                       args=(rdb, table, ds, node_addr[node],
                                                             row_sep, col_sep, is_replace))
                                thd.setDaemon(True)
                                thd.start()
                                thds.append(thd)
                        i += 1
                for thd in thds:
                    thd.join()
        except Exception, e:
            raise(e)
Example #26
    def handle_read(self, event):
        message = event.data[10:]

        sep = conf.SEP_IN_MSG
        m = message.split(sep)
        try:
            if m[0] == conf.DATA_END:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # all data has been dispatched to the parent nodes
                self.send_status_to_master(" ".join(self.job_data), conf.ACK)
                ParaLiteLog.debug("notify ACK to master")                    
                self.is_running = False
                
            elif message == conf.END_TAG:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                self.send_status_to_master(" ".join(self.job_data), conf.ACK)
                self.is_running = False

            elif message == conf.DLOAD_END_TAG:
                ParaLiteLog.debug("---------import finish---------")
                self.send_status_to_master(" ".join(self.job_data), conf.ACK)
                self.is_running = False

            elif message == conf.EXIT:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                self.is_running = False
            
            elif m[0] == conf.JOB_ARGUMENT:
                self.parse_args(m[1])
                ParaLiteLog.info("parse arguments: FINISH")
                # init the persisted result data
                if self.is_checkpoint is not None and self.is_checkpoint == conf.CHECKPOINT:
                    ParaLiteLog.debug("recovery data: START")
                    # this is a recovery operator
                    self.recovery_data(self.result, gethostname())
                    ParaLiteLog.debug("recovery data: FINISH")
                    self.send_rs_info_to_master(0, 0)
                else:
                    # delete all temporary files for this operator
                    os.system("rm -f %s/%s_%s" % (self.temp_dir, "sql", self.opid))

                ###############################
                # scanner = threading.Thread(target=self.scan_process_queue, args=(self.process_queue, ))
                # scanner.setDaemon(True)
                # scanner.start()
                # self.threads.append(scanner)
                ##########################
            elif m[0] == conf.JOB:
                self.ex_s_time = time.time()
                self.ex_w_time = 0
                ParaLiteLog.debug("MESSAGE: %s" % message)
                s_time = time.time()
                jobid = m[1]
                target_db = m[2].split()
                exp = self.expression
                ParaLiteLog.debug("*****JOB %s******:start" % jobid)
                
                # FAULT TOLERANCE:
                if jobid in self.job_data:
                    # this is a failed job, we should first delete the old result value
                    if self.dest == conf.DATA_TO_ANO_OP and self.partition_num > 1:
                        for partid in self.result:
                            pos = self.job_list.index(jobid)
                            self.result[partid][pos] = ""
                    else:
                        self.result[jobid] = ""
                
                if exp.lower().startswith("select"):
                    """
                    selection task: (1), execute sql (2), notify the result to
                    the master (3), wait for the DATA_PERSIST message from the
                    master (4), persist data if so (5), notify ACK to the master
                    """
                    ParaLiteLog.info("proc_select: START")
                    st_time = time.time()
                    
                    rs_type, rs, t_size = self.proc_select(jobid, exp, target_db)

                    et_time = time.time()
                    ParaLiteLog.debug("Job %s cost time %s second" % (jobid, (et_time - st_time)))
                    # FAULT TOLERANCE:
                    if jobid in self.job_data:
                        # this is a failed job
                        if self.dest == conf.DATA_TO_ANO_OP and self.partition_num > 1:
                            for partid in self.result:
                                pos = self.job_list.index(jobid)
                                self.result[partid][pos] = rs[partid]
                        else:
                            self.result[jobid] = rs
                        self.send_status_to_master(jobid, conf.PENDING)
                        return
                        
                    self.job_data[jobid] = t_size
                    self.job_list.append(jobid)
                    self.total_size += t_size
                    
                    # store the result of one job to the final result
                    if len(rs) == 1:
                        if self.dest == conf.DATA_TO_ANO_OP:
                            # dest is AGGR op or ORDER op, use 0 as the key
                            if 0 not in self.result:
                                self.result[0] = rs
                            else:
                                self.result[0].append(rs[0])

                            if self.is_checkpoint == 1:
                                self.write_data_to_disk(0, rs[0].getvalue())
                        else:
                            # dest is UDX op, use jobid as the key
                            self.result[string.atoi(jobid)] = rs
                            if self.is_checkpoint == 1:
                                self.write_data_to_disk(0, rs[0].getvalue())
                        
                    else:
                        # use partid as the key
                        for i in range(len(rs)):
                            if i not in self.result:
                                self.result[i] = [rs[i]]
                            else:
                                self.result[i].append(rs[i])
                        if self.is_checkpoint == 1:
                            for i in range(len(rs)):
                                self.write_data_to_disk(i, rs[i].getvalue())
                        
                    # check if the whole data exceeds the LIMITATION
                    if rs_type != self.MULTI_FILE:
                        if (self.is_checkpoint is not None and self.is_checkpoint == conf.CHECKPOINT) or self.total_size > self.MAX_SIZE:
                            for dataid in self.result:
                                data = ""
                                for d in self.result[dataid]:
                                    data += d.getvalue()
                                self.write_data_to_disk(dataid, data)
                            self.result_type = self.MULTI_FILE
                            
                    e_time = time.time()
                    if self.total_time == 0:
                        self.total_time = (e_time - s_time)
                    self.send_status_to_master(jobid, conf.PENDING)

                elif exp.lower().startswith("create"):
                    ParaLiteLog.info("proc_create: START")
                    ParaLiteLog.info("SQL: %s" % exp)                    
                    self.proc_create(exp, target_db)
                    ParaLiteLog.info("proc_create: START")
                    self.send_status_to_master(jobid, conf.ACK)
                    self.is_running = False
                elif exp.lower().startswith("drop"):
                    ParaLiteLog.info("proc_drop: START")            
                    self.proc_drop(exp, target_db)
                    self.send_status_to_master(jobid, conf.ACK)
                    self.is_running = False
                ParaLiteLog.debug("*****JOB %s******:finish" % jobid)
                self.ex_w_time += (time.time() - self.ex_s_time)
                self.ex_s_time = 0

            elif m[0] == conf.JOB_END:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # all jobs are finished
                # create a dictionary to store the status of each part of data
                data_status = {}  # {data_id:[(pos_in_result, status)]}
                for dataid in self.result:
                    if dataid not in data_status:
                        data_status[dataid] = []
                    for i in range(len(self.result[dataid])):
                        data_status[dataid].append((i, 1))
                self.data_status = data_status
                self.reader = self.get_next_reader()
                    
                self.send_rs_info_to_master(self.total_size, self.total_time)

                # distribute data
                if self.dest == conf.DATA_TO_ONE_CLIENT:
                    self.distribute_data()
                    self.send_status_to_master(" ".join(self.job_data), conf.ACK)
                    self.is_running = False
                elif self.dest == conf.DATA_TO_DB:
                    self.distribute_data()
 
            elif m[0] == conf.DLOAD_REPLY:
                reply = sep.join(m[1:])
                ParaLiteLog.info("receive the information from the master")
                ParaLiteLog.debug(reply)
                
                if len(self.data.getvalue()) != 0:
                    dload_client.dload_client().load_internal_buffer(
                        reply, self.dest_table, self.data, self.fashion, 
                        self.hash_key, self.hash_key_pos, self.db_col_sep, 
                        self.db_row_sep, self.db_col_sep, False, "0", self.log_dir)

                # send END_TAG to the master
                client_id = "0"
                msg = sep.join([conf.REQ, conf.END_TAG, gethostname(), client_id])
                so_master = socket(AF_INET, SOCK_STREAM)
                so_master.connect((self.master_name, self.master_port))
                so_master.send("%10s%s" % (len(msg), msg))
                so_master.close()
                ParaLiteLog.debug("sending to master: %s" % (conf.END_TAG))
                ParaLiteLog.debug("----- dload client finish -------")

            elif m[0] == conf.DATA_PERSIST:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # check whether the data is required to be persisted
                self.process_ck_info(m)
                
            elif m[0] == conf.DATA_DISTRIBUTE:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # send a part of data to the next operator
                # DATA_DISTRIBUTE:partition_num:destnode
                part_id, destnode = m[1:]
                data = self.get_data_by_part_id(self.result, string.atoi(part_id))
                
                # DATA message includes: type:id+data
                # the first 2 chars represents the opid
                msg = sep.join([conf.DATA, "%2s%s" % (self.opid, data)])
                if destnode == gethostname():
                    # use local socket
                    addr = self.p_node[destnode][1]
                    t = AF_UNIX
                else:
                    addr = (destnode, self.p_node[destnode][0])
                    t = AF_INET
                self.send_data_to_node(msg, t, addr)
                ParaLiteLog.debug("send data susscufully   %s %s --> %s" % (self.opid, gethostname(), destnode))

            elif m[0] == conf.DATA_DISTRIBUTE_UDX:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # send data to udx client
                # m[1:] = worker.id:jobid:(node:port | addr):size
                
                if len(m) == 6:
                    w_id, jobid = m[1:3]
                    addr = (m[3], string.atoi(m[4]))
                    t = AF_INET
                    bk = string.atoi(m[5])
                elif len(m) == 5:
                    w_id, jobid = m[1:3]
                    addr = m[3]
                    t = AF_UNIX
                    bk = string.atoi(m[4])
                data = self.get_data_by_blocksize(jobid, bk)
                if not data:
                    # if we don't send something here, the udx will not send
                    # KAL again, it will never receive more data, and the
                    # whole process will block forever
                    msg = sep.join([conf.DATA, "EMPTY"])
                else:
                    msg = sep.join([conf.DATA, data])
                self.send_data_to_node(msg, t, addr)
                
            elif m[0] == conf.DATA_REPLICA:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # message --> DATA_REPLICA:DATANODE:DATAID:DATA
                datanode, dataid = m[1:3]
                f_name = self.get_file_name_by_data_id(gethostname(), dataid)
                fr = open(f_name, "wa")
                fr.write(m[4])
                fr.close()

            elif m[0] == conf.NODE_FAIL:
                ParaLiteLog.debug("MESSAGE: %s" % message)
                # message --> NODE_FAIL:FAILED_NODE:REPLICA_NODE
                failed_node, replica_node = m[1:3]
                self.failed_node.append(failed_node)
                if replica_node == gethostname():
                    # load replica data for the failed node
                    self.recovery_data(self.replica_result, replica_node)
        except Exception, e:
            es("in sql_proc : %s" % traceback.format_exc())
            ParaLiteLog.info(traceback.format_exc())
            self.is_running = False
            self.no_error = False
Example #27
 def range_data(self):
     ParaLiteLog.info("Now RANGE FASHION is not supported...")
Example #28
    def proc_select(self, jobid, exp, target_db):
        assert len(target_db) == 1
        cur_db = target_db[0]
        try:
            conn = sqlite3.connect(cur_db)
            conn.text_factory = str
            
            # register the user-defined aggregate
            conn.create_aggregate("mul", 1, mul)

            c = conn.cursor()
            """
            if self.temp_store != 0:
                c.execute('pragma temp_store=%s' % (self.temp_store))
            if self.cache_size != -1:
                c.execute('pragma cache_size=%s' % (self.cache_size))
            """

            # for test
            c.execute('pragma temp_store=memory')
            c.execute('pragma cache_size=2073741824')

            ParaLiteLog.info("start to execute sql: %s" % exp)
            
            col_sep = self.db_col_sep
            row_sep = self.db_row_sep
            num_of_dest = self.partition_num

            if self.dest == conf.DATA_TO_ANO_OP and num_of_dest > 1:
                columns = self.output
                split_key = self.split_key
                assert split_key is not None
                
                # partition data in hash fashion
                pos = []
                for key in split_key:
                    pos.append(columns.index(key))
                data_part_list = []
                for i in range(self.partition_num):
                    data_part_list.append(cStringIO.StringIO())
                size = 0
                t_size = 0
                for row in c.execute(exp):
                    part_id = abs(hash(self.db_col_sep.join(str(row[p]) for p in pos))) % num_of_dest
                    #part_id = abs(hash(row[pos[0]])) % num_of_dest
                    data = col_sep.join(str(s) for s in row)
                    """
                    size += len(data)
                    if size > self.MAX_SIZE:
                        for partid in data_part_list:
                            fs = self.write_data_to_disk(
                                partid, data_part_list[partid])
                            # delete all data in csio
                            data_part_list[partid].truncate(0)
                        t_size += size
                        size = 0
                        self.result_type = self.MULTI_FILE
                    """
                    data_part_list[part_id].write(data)
                    data_part_list[part_id].write(row_sep)

                for i in range(len(data_part_list)):
                    t_size += len(data_part_list[i].getvalue())
                    
                ParaLiteLog.debug("finish to retrieve the result: %s" % t_size)
                
                if self.result_type == self.MULTI_FILE:
                    for partid in range(len(data_part_list)):
                        self.write_data_to_disk(
                            partid, data_part_list[partid].getvalue())
                    del data_part_list
                    return self.MULTI_FILE, None, t_size
                else:
                    ########################
                    # new_list = []
                    # for d in data_part_list:
                    #     new_list.append(d.getvalue())
                    # return self.MULTI_BUFFER, new_list, t_size
                    ###################
                    return self.MULTI_BUFFER, data_part_list, t_size
                
            else:
                csio = cStringIO.StringIO()
                t_size = 0
                size = 0 # record the size of current data
                data_pos = [] # the file name of data if persisted
                for row in c.execute(exp):
                    # NOTE:  For aggregation SQL, e.g. "select max(col) from T ..."
                    # if there is no record in T, (None,) will be returned
                    if row[0] is None:
                        continue
                    data = col_sep.join(str(s) for s in row)
                    size += len(data)
                    if size >= self.MAX_SIZE:
                        # part of the result is spilled to disk
                        self.result_type = conf.MULTI_FILE
                        self.write_data_to_disk(jobid, csio.getvalue())
                        # delete all data in csio
                        csio.truncate(0)
                        t_size += size
                        size = 0
                    csio.write(data)
                    csio.write(row_sep)

                t_size += len(csio.getvalue())
                ParaLiteLog.debug("finished retrieving the result: %s bytes" % t_size)

                if self.result_type == conf.MULTI_FILE:
                    self.write_data_to_disk(jobid, csio.getvalue())
                    del csio
                    return conf.MULTI_FILE, None, t_size
                else:
                    return self.SINGLE_BUFFER, [csio], t_size

        except sqlite3.OperationalError, e:
            ParaLiteLog.info(traceback.format_exc())
            raise Exception("%s: QueryExecutionError: %s" % (gethostname(),
                                                             traceback.format_exc()))
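
When the output of proc_select feeds another operator, each row is routed by hashing its split-key columns, exactly as in the loop above. A stripped-down sketch of that routing step (column positions and separators are placeholder parameters):

# Python 2 sketch: hash-partition rows into per-destination buffers.
import cStringIO

def hash_partition(rows, key_pos, num_of_dest, col_sep="|", row_sep="\n"):
    # one in-memory buffer per destination
    bufs = [cStringIO.StringIO() for _ in range(num_of_dest)]
    for row in rows:
        key = col_sep.join(str(row[p]) for p in key_pos)
        part_id = abs(hash(key)) % num_of_dest
        bufs[part_id].write(col_sep.join(str(s) for s in row))
        bufs[part_id].write(row_sep)
    return bufs

rows = [(1, "a"), (2, "b"), (3, "a")]
for i, buf in enumerate(hash_partition(rows, [1], 2)):
    print i, repr(buf.getvalue())

Rows sharing a key always land in the same buffer, which is what lets the receiving operator assume co-partitioned input.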
Example #29
    def load_internal_file(self, reply, opt, db_col_sep, LOG_DIR):
        ParaLiteLog.info("load_internal_file: START")
        table = opt.table
        files = opt.files
        col_sep = opt.col_sep
        row_sep = opt.row_sep
        fashion = opt.fashion
        key = opt.key
        key_pos = opt.key_pos
        is_replace = opt.replace

        self.db_col_sep = db_col_sep        
        for f in files:
            self.files[f] = 1
        self.file_reader = open(self.get_next_file(), "rb")

        try:
            """
            received message = nodes # sub_dbs # chunk_num # replica_info 
            
            nodes should be (| is SEP_IN_MSG):
            n1 : p1|l1 , n2 : p2|l2 , ...               IF fashion = HASH_FASHION 
            n1 : p1|l1|s1|num , n2 : p2|l2|s2|num , ... IF fashion = ROUND_ROBIN
            TBD                                         IF fashion = RANGE_FASHION

            node_db_info: node1:[db_1_1] , node2:[db_1_2] , node3:[db_2_1], ...
            replica_info: db_1_1 db_1_1_r_1 node1 , db_1_2 db_1_2_r_1 node2 , ...
            """
            mm = reply.split("#")

            nodes = mm[0].split(",")
            sub_dbs = mm[1].split(",")
            chunk_num = string.atoi(mm[2])
            replica = mm[3]
            
            replica_info = {} # {db_name : {replica_db_name:node}}
            if replica != "":
                for whole_re in replica.split(","):
                    lll = whole_re.split(" ")
                    if lll[0] not in replica_info:
                        replica_info[lll[0]] = {}
                    replica_info[lll[0]][lll[1]] = lll[2]

            node_addr = {} # {node:addr}
            for node in nodes:
                m = node.split(conf.SEP_IN_MSG)
                if m[0] == gethostname(): addr = m[2]
                else: addr = (m[0], string.atoi(m[1]))
                node_addr[m[0]] = addr
            
            thds = []
            if nodes == []:
                ParaLiteLog.info("there is no data to load")
            elif fashion == conf.HASH_FASHION:
                ParaLiteLog.info(fashion)
                if row_sep is not None and row_sep != "\n":
                    left_ds = ""  # carries the partial last row between blocks
                    while True:
                        dst = self.get_data_as_bk(DATA_MAX_SIZE)
                        if dst is None:
                            ParaLiteLog.info("really get data as bk: 0")
                            break
                        ParaLiteLog.info("really get data as bk: %s" % (len(dst)))
                        pos = dst.rfind(row_sep)

                        ds = left_ds + dst[0:pos]
                        left_ds = dst[pos+len(row_sep):]
                        del dst
                        db_buf = self.hash_data_file(ds, key_pos, nodes,
                                                       row_sep, col_sep,
                                                       chunk_num, sub_dbs)
                        ParaLiteLog.debug("hash data finished: %s bytes" % len(ds))
                        del ds                        
                        for db in db_buf:
                            data = db_buf[db].getvalue()
                            node = db.split("_")[-3]
                            thd = threading.Thread(target=self.send_to_node,
                                                   args=(db, table, data, node_addr[node],
                                                         row_sep,col_sep,
                                                         is_replace))
                            thd.setDaemon(True)
                            thd.start()
                            thds.append(thd)
                            if db in replica_info:
                                for rdb in replica_info[db]:
                                    node = replica_info[db][rdb]
                                    self.send_to_node(rdb, table, data,
                                                      node_addr[node],
                                                      row_sep, col_sep, is_replace)

                else:
                    while True:
                        ds = self.get_data_as_bk(DATA_MAX_SIZE)
                        if ds is None:
                            ParaLiteLog.info("really get data as bk: 0")
                            break
                        ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                        db_buf = self.hash_data_file(ds, key_pos, nodes,
                                                       "\n", col_sep, chunk_num, sub_dbs)
                        # log the size of one partition as a sample, then stop
                        for db in db_buf:
                            ParaLiteLog.debug(
                                "%s -- > %s" % (db, len(db_buf[db].getvalue())))
                            break
                        for db in db_buf:
                            data = db_buf[db].getvalue()
                            node = db.split("_")[-3]

                            thd = threading.Thread(target=self.send_to_node,
                                                   args=(db, table, data,
                                                         node_addr[node],
                                                         row_sep,col_sep,
                                                         is_replace))
                            thd.setDaemon(True)
                            thd.start()
                            thds.append(thd)
                            if db in replica_info:
                                for rdb in replica_info[db]:
                                    node = replica_info[db][rdb]
                                    self.send_to_node(rdb, table, data,
                                                      node_addr[node],
                                                      row_sep, col_sep, is_replace)
                        del db_buf        
                        del ds
                        
            elif fashion == conf.REPLICATE_FASHION:
                # NOTE: total_size is undefined in this scope; assuming it is
                # the combined size of the input files
                total_size = sum(os.path.getsize(f) for f in files)
                self.replicate_data(table, files, total_size, nodes)
            elif fashion == conf.RANGE_FASHION:
                self.range_data()
            else:
                num_of_db = len(nodes) * chunk_num                
                if row_sep is not None and row_sep != "\n":
                    i = 0
                    left_ds = ""
                    while True:
                        db = sub_dbs[i % num_of_db]
                        # m is needed below for the data size (m[3])
                        m = nodes[(i % num_of_db) / chunk_num].split(conf.SEP_IN_MSG)
                        node = db.split("_")[-3]

                        size = string.atoi(m[3]) / chunk_num + 1
                        if size > DATA_MAX_SIZE:
                            ParaLiteLog.info("start to get data as bk: %s" % (DATA_MAX_SIZE))  
                            ds = self.get_data_as_bk(DATA_MAX_SIZE)
                        else:
                            ParaLiteLog.info("start to get data as bk: %s" % (size))
                            ds = self.get_data_as_bk(size)
                        if ds is None:
                            ParaLiteLog.info("really get data as bk: 0")
                            break
                        ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                        pos = ds.rfind(row_sep)
                        send_ds = left_ds + ds[0:pos]
                        left_ds = ds[pos+len(row_sep):]
                        thd = threading.Thread(
                            target=self.send_to_node,
                            args=(db, table, send_ds, node_addr[node],
                                  row_sep, col_sep, is_replace))
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                node = replica_info[db][rdb]
                                thd = threading.Thread(
                                    target=self.send_to_node,
                                    args=(rdb, table, send_ds, node_addr[node],
                                          row_sep, col_sep, is_replace))
                                thd.setDaemon(True)
                                thd.start()
                                thds.append(thd)
                        i += 1
                else:
                    i = 0
                    while True:
                        db = sub_dbs[i % num_of_db]
                        # m is needed below for the data size (m[3])
                        m = nodes[(i % num_of_db) / chunk_num].split(conf.SEP_IN_MSG)
                        node = db.split("_")[-3]
                        size = string.atoi(m[3]) / chunk_num + 1
                        if size > DATA_MAX_SIZE:
                            ParaLiteLog.info(
                                "start to get data as bk: %s" % (DATA_MAX_SIZE))  
                            ds = self.get_data_as_bk(DATA_MAX_SIZE)
                        else:
                            ParaLiteLog.info("start to get data as bk: %s" % (size))
                            ds = self.get_data_as_bk(size)
                        if ds is None:
                            ParaLiteLog.info("really get data as bk: 0")
                            break
                        ParaLiteLog.info("really get data as bk: %s" % (len(ds)))
                        thd = threading.Thread(
                            target=self.send_to_node,
                            args=(db, table, ds, node_addr[node],
                                  row_sep,col_sep, is_replace))
                        thd.setDaemon(True)
                        thd.start()
                        thds.append(thd)
                        if db in replica_info:
                            for rdb in replica_info[db]:
                                ParaLiteLog.info(rdb)
                                node = replica_info[db][rdb]
                                thd = threading.Thread(
                                    target=self.send_to_node,
                                    args=(rdb, table, ds, node_addr[node],
                                          row_sep, col_sep, is_replace))
                                thd.setDaemon(True)
                                thd.start()
                                thds.append(thd)
                        i += 1
                        del ds
            for thd in thds:
                thd.join()
        except Exception, e:
            ParaLiteLog.debug(traceback.format_exc())
            raise Exception(traceback.format_exc())
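
The docstring in load_internal_file pins down the master's reply layout: four "#"-separated fields whose nested entries use "," and spaces. A hedged, standalone parser for just that layout (field meanings follow the docstring; the helper name is made up):

# Python 2 sketch: parse "nodes # sub_dbs # chunk_num # replica_info".
import string

def parse_load_reply(reply):
    mm = reply.split("#")
    nodes = [n.strip() for n in mm[0].split(",") if n.strip()]
    sub_dbs = [d.strip() for d in mm[1].split(",")]
    chunk_num = string.atoi(mm[2].strip())
    replica_info = {}  # {db_name: {replica_db_name: node}}
    if mm[3].strip():
        for whole_re in mm[3].split(","):
            db, rdb, node = whole_re.split()
            replica_info.setdefault(db, {})[rdb] = node
    return nodes, sub_dbs, chunk_num, replica_info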
Example #30
    def distribute_data(self):
        # handle the limit condition: get the first N records.
        # E.g. for "select ... limit 10", the master first decides the limit
        # number for each process and sets each process's limit value
        # to that post-limit

        whole_data = cStringIO.StringIO()
        for i in self.result:
            for csio in self.result[i]:
                d = string.strip(csio.getvalue())
                if len(d) == 0:
                    continue
                whole_data.write(d)
                # use the DB row separator so the split below stays consistent
                whole_data.write(self.db_row_sep)
                del csio

        if self.distinct or self.limit != -1:
            data_list = whole_data.getvalue().split(self.db_row_sep)
            del whole_data
        
            if self.distinct:
                # a set cannot be sliced; convert back to a list for LIMIT
                data_list = list(set(data_list))
            if self.limit != -1:
                data_list = data_list[:self.limit]

            data = cStringIO.StringIO()
            data.write(self.db_row_sep.join(str(s) for s in data_list))
            del data_list
        else:
            data = whole_data
        
        if self.dest == conf.DATA_TO_DB:
            self.data = data
            col_sep = self.db_col_sep
            row_sep = self.db_row_sep
            master = (self.master_name, self.master_port)
            
            ParaLiteLog.info("proc_select: load data start")
            # send request to the master
            t_size = len(data.getvalue())
            sep = conf.SEP_IN_MSG
            tag = conf.LOAD_FROM_API
            if row_sep is None or row_sep == "\n":
                temp_sep = "NULL"
            else:
                temp_sep = row_sep
            msg = sep.join(
                str(s) for s in [conf.REQ, self.cqid, gethostname(), 
                                 self.my_port, self.dest_db, self.dest_table,
                                 t_size, tag, self.fashion, temp_sep, "0"])
            so_master = socket(AF_INET, SOCK_STREAM)
            so_master.connect(master)
            so_master.sendall("%10s%s" % (len(msg), msg))
            so_master.close()

            # dload_client.dload_client().load_internal_buffer(
            #     master, self.cqid, gethostname(), self.my_port, self.dest_db,
            #     self.dest_table, data, conf.LOAD_FROM_API, self.fashion, 
            #     self.hash_key, self.hash_key_pos, self.db_col_sep, row_sep,
            #     col_sep, False, "0", self.log_dir)

        elif self.dest == conf.DATA_TO_ONE_CLIENT:
            random_num = random.randint(0, len(self.client_sock) - 1)
            addr = self.client_sock[random_num]
            sock = socket(AF_INET, SOCK_STREAM)
            sock.connect(addr)

            data_s = data.getvalue()
            ParaLiteLog.info("DATA SIZE = %s" % len(data_s))
            sock.sendall("%10s%s" % (len(data_s), data_s))
            reply = sock.recv(10)
            assert reply == "OK"
            sock.close()
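
Both destinations above use the same ad-hoc framing: a 10-character, space-padded decimal length followed by the payload ("%10s%s"). A minimal send/receive pair for that framing (generic sockets, nothing ParaLite-specific):

# Python 2 sketch: "%10s%s" length-prefixed socket framing.
def send_framed(sock, payload):
    # 10-byte right-aligned decimal length header, then the payload
    sock.sendall("%10s%s" % (len(payload), payload))

def recv_framed(sock):
    header = ""
    while len(header) < 10:
        chunk = sock.recv(10 - len(header))
        if not chunk:
            raise IOError("connection closed while reading the header")
        header += chunk
    n = int(header)
    body = ""
    while len(body) < n:
        chunk = sock.recv(min(65536, n - len(body)))
        if not chunk:
            raise IOError("connection closed while reading the payload")
        body += chunk
    return body

sendall (rather than bare send) matters here: send may write only part of a large buffer, which would desynchronize the stream.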