Example #1
0
 def __init__(self, f_descriptor, namespace, skiptables=None, forced_timestamp=""):
     """Parse ``f_descriptor`` right away and commit the result through HBaseIntake.

     Construction does all the work: it connects the intake, calls
     ``self.run(f_descriptor)``, then reports the row count, the MD5 hex
     digest and the elapsed parse time to the intake before committing.

     Args:
         f_descriptor: input handed unchanged to ``self.run``.
         namespace: passed to ``HBaseIntake`` and kept as ``self._namespace``.
         skiptables: stored as ``self._skip_tables``; ``None`` (the default)
             means an empty list.
         forced_timestamp: stored as ``self._forced_timestamp``.
     """
     # Build a fresh list per instance: a literal ``[]`` default would be one
     # list object shared by every instance created without ``skiptables``.
     self._skip_tables = [] if skiptables is None else skiptables
     self._forced_timestamp = forced_timestamp
     self._hbase = HBaseIntake(namespace)
     self._hbase.connect()
     self._md5 = hashlib.md5()
     self._namespace = namespace
     start_time = time.time()
     self.run(f_descriptor)
     # NOTE(review): self._row_counter is read here but never assigned in this
     # method; presumably run() maintains it (and feeds self._md5) -- confirm.
     self._hbase.set_row_count(self._row_counter)
     self._hbase.set_md5(self._md5.hexdigest())
     self._hbase.set_parse_time(time.time() - start_time)
     self._hbase.commit(self)
Example #2
0
class MySQLDump:
    """State machine that reads a mysqldump stream and forwards it to HBaseIntake.

    Instantiating the class performs the whole import: every input line is
    wrapped in a ``MySQLRow``, CREATE / VIEW / INSERT rows are handed to the
    intake (unless their table is listed in ``skiptables``), and the row
    count, MD5 digest and parse time are committed at the end.

    Setting the ``DEBUG`` environment variable (to any value) enables a
    trace on stderr.
    """

    _md5 = None  # md5 object fed with every line of the mysql input stream
    _row_counter = 0  # number of input lines read so far
    _forced_timestamp = ""  # optional "%Y-%m-%d" override of the dump timestamp
    _timestamp = None  # assigned at the end of run(), or from the forced value

    def __init__(self, f_descriptor, namespace, skiptables=None, forced_timestamp=""):
        """Parse ``f_descriptor`` right away and commit the result.

        Args:
            f_descriptor: iterable of dump lines, consumed by ``run``.
            namespace: passed to ``HBaseIntake`` and kept as ``self._namespace``.
            skiptables: table names whose CREATE and INSERT rows are not
                forwarded to the intake; ``None`` (the default) skips nothing.
            forced_timestamp: ``"%Y-%m-%d"`` date that overrides the timestamp
                extracted from the dump; ``""`` disables the override.
        """
        # Build a fresh list per instance: a literal ``[]`` default would be
        # one list object shared by every instance created without skiptables.
        self._skip_tables = [] if skiptables is None else skiptables
        self._forced_timestamp = forced_timestamp
        self._hbase = HBaseIntake(namespace)
        self._hbase.connect()
        self._md5 = hashlib.md5()
        self._namespace = namespace
        start_time = time.time()
        self.run(f_descriptor)
        # run() has updated self._row_counter and self._md5 by now.
        self._hbase.set_row_count(self._row_counter)
        self._hbase.set_md5(self._md5.hexdigest())
        self._hbase.set_parse_time(time.time() - start_time)
        self._hbase.commit(self)

    def __str__(self):
        """Return the MD5 hex digest and the row counter on two lines."""
        return "MD5> %s\nROWs> %s" % (self._md5.hexdigest(), self._row_counter)

    @staticmethod
    def _debug(*parts):
        """Write ``parts`` to stderr, space separated, followed by a newline."""
        sys.stderr.write(" ".join(str(part) for part in parts) + "\n")

    def timestamp(self):
        """Return the timestamp of the dump.

        When a forced timestamp was given it is converted to seconds since
        the epoch (local time, as a string) and replaces any stored value.

        Raises:
            RuntimeError: if no timestamp is available (``None`` or ``0``).
            ValueError: if the forced timestamp does not match ``%Y-%m-%d``.
        """
        if self._forced_timestamp != "":
            forced = datetime.strptime(self._forced_timestamp, "%Y-%m-%d")
            # time.mktime gives the same local-time epoch seconds as the
            # glibc-only strftime("%s") directive, but works on every platform.
            self._timestamp = str(int(time.mktime(forced.timetuple())))
        if self._timestamp is None or self._timestamp == 0:
            raise RuntimeError("we've not extracted the timestamp of the mysqldump")
        return self._timestamp

    def run(self, f_descriptor):
        """Consume ``f_descriptor`` line by line and feed the intake.

        Each line increments the row counter and updates the MD5 digest.
        Lines following a CREATE row that are neither useless, CREATE nor
        INSERT are appended to that CREATE row; the accumulated CREATE is
        sent once a useless row follows it. After the loop the timestamp is
        taken from the last row processed.
        """
        # Read the switch once; arguments of the trace calls are still only
        # evaluated when tracing is on, as row accessors may not be cheap.
        debug = os.environ.get("DEBUG") is not None

        past_row = MySQLRow("")  # placeholder row so the first line has a predecessor
        for line in f_descriptor:
            self._row_counter += 1
            curr_row = MySQLRow(line)

            if past_row.is_create():

                if curr_row.is_useless():  # most likely the CREATE is complete
                    if debug:
                        self._debug(past_row, past_row.stmt())
                    if past_row.tbl_name() not in self._skip_tables:
                        self._hbase.set_create_tbl(past_row.tbl_name(), past_row.payload())
                    elif debug:
                        self._debug("SKIP> CREATE", past_row.tbl_name())

                if not curr_row.is_useless() and not curr_row.is_create() and not curr_row.is_insert():
                    # continuation of the CREATE statement: keep accumulating
                    # into the same row object
                    past_row.append(line)
                    curr_row = past_row

            if curr_row.is_view():
                if debug:
                    self._debug(curr_row.raw())
                self._hbase.set_view(curr_row.tbl_name(), curr_row.payload())

            if curr_row.is_insert():
                if curr_row.tbl_name() not in self._skip_tables:
                    self._hbase.send(curr_row)
                elif debug:
                    # report the table of the row actually being skipped
                    self._debug("SKIP> INSERT", curr_row.tbl_name())

            if debug:
                self._debug(self._row_counter, curr_row, curr_row.tbl_name())
                sys.stderr.flush()

            self._md5.update(line)
            past_row = curr_row  # for next line's parsing

        # last parsed line, we have (hopefully) the timestamp in it
        self._timestamp = past_row.timestamp()