def processOpFields(self, fields, op, content):
    """Update field usage statistics and predicate classifications from
    one operation's query document.

    fields  -- the collection's field metadata dict (mutated in place)
    op      -- the workload operation record (its 'predicates' is mutated)
    content -- one query document from the operation
    """
    if self.debug:
        LOG.debug("Processing operation fields\n%s", pformat(content))
    for k, v in content.iteritems():
        # Skip anything that starts with our special char
        # Those are flag markers used by MongoDB's queries
        if k.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
            continue

        # We need to add the field to the collection if it doesn't
        # already exist. This will occur if this op was an aggregate,
        # which we ignore when recreating the schema
        if k not in fields:
            fields[k] = catalog.Collection.fieldFactory(
                k, catalog.fieldTypeToString(type(v)))
        fields[k]['query_use_count'] += 1

        # No predicate for insert operations
        # No projections for insert operations
        if op['type'] != constants.OP_TYPE_INSERT:
            # Update how this key was used with predicates
            if workload.isOpRegex(op, field=k):
                op['predicates'][k] = constants.PRED_TYPE_REGEX
            elif isinstance(v, dict):
                # A dict value here is an operator document (e.g. $gt/$lt)
                op['predicates'][k] = constants.PRED_TYPE_RANGE
            elif k not in op['predicates']:
                op['predicates'][k] = constants.PRED_TYPE_EQUALITY
    ## FOR
    ## TODO: Should we expect there to be field names with dot notation here?
    ##       Or should have all been cleaned out by the converters?
    return
def processOpFields(self, fields, op, content):
    """Record per-field query usage and predicate type information for a
    single operation's query document."""
    if self.debug:
        LOG.debug("Processing operation fields\n%s", pformat(content))
    is_insert = (op['type'] == constants.OP_TYPE_INSERT)
    for field_name, field_value in content.iteritems():
        # Keys beginning with our special prefix are MongoDB query flag
        # markers, not real document fields -- ignore them
        if field_name.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
            continue
        # Lazily register the field in the collection metadata; it can be
        # absent if this op was an aggregate, which is skipped when the
        # schema is recreated
        f_type = type(field_value)
        if not field_name in fields:
            type_str = catalog.fieldTypeToString(f_type)
            fields[field_name] = catalog.Collection.fieldFactory(field_name, type_str)
        fields[field_name]['query_use_count'] += 1
        # Inserts carry neither predicates nor projections
        if not is_insert:
            # Classify how this key was used as a predicate
            if workload.isOpRegex(op, field=field_name):
                op['predicates'][field_name] = constants.PRED_TYPE_REGEX
            elif isinstance(field_value, dict):
                op['predicates'][field_name] = constants.PRED_TYPE_RANGE
            elif not field_name in op['predicates']:
                op['predicates'][field_name] = constants.PRED_TYPE_EQUALITY
    ## FOR
    ## TODO: Should we expect there to be field names with dot notation here?
    ## Or should have all been cleaned out by the converters?
    return
def extractSchema(self):
    """Reverse-engineer the collection metadata from the MySQL database.

    Reads table and column definitions from information_schema, builds one
    metadata Collection document per table (saved at the end of each
    iteration), and optionally extracts the table's data afterwards.
    """
    c1 = self.mysql_conn.cursor()
    c1.execute("""
        SELECT TABLE_NAME
          FROM information_schema.TABLES
         WHERE TABLE_SCHEMA = %s AND TABLE_NAME != %s""",
        (self.dbName, MYSQL_LOG_TABLE_NAME))

    # Maps table name -> ordered list of its column names
    tbl_cols = {}

    LOG.info("Extracting table information from database '%s'", self.dbName)
    for row in c1:
        tbl_name = row[0]

        col_info = self.metadata_db.Collection()
        col_info['name'] = tbl_name
        tbl_cols[col_info['name']] = []
        if self.debug:
            LOG.debug("Created metadata object for collection '%s'", tbl_name)

        # Pull the column definitions for this table
        c2 = self.mysql_conn.cursor()
        c2.execute("""
            SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION
              FROM information_schema.COLUMNS
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME=%s
        """, (self.dbName, tbl_name))
        for col_row in c2:
            col_name = col_row[0]
            # Map the SQL type to its Python equivalent for the catalog
            col_type = catalog.sqlTypeToPython(col_row[1])
            col_position = col_row[2]
            tbl_cols[col_info['name']].append(col_name)
            col_type_str = catalog.fieldTypeToString(col_type)
            col_info["fields"][col_name] = catalog.Collection.fieldFactory(col_name, col_type_str)
            col_info["fields"][col_name]['ordinal_position'] = int(col_position)
            if self.debug:
                LOG.info("Created column information for '%s.%s'", tbl_name, col_name)
        ## FOR
        c2.close()

        # Get the index information from MySQL for this table
        # NOTE(review): identifiers cannot be parameterized in MySQL, so this
        # statement is string-built. dbName/tbl_name come from
        # information_schema here, but confirm they can never carry
        # untrusted input.
        sql = "SHOW INDEXES FROM " + self.dbName + "." + tbl_name
        c3 = self.mysql_conn.cursor()
        c3.execute(sql)
        index_name = None
        LOG.info("Extracting index information from table '%s'", tbl_name)
        # FIXME
        #for ind_row in c3:
            #if index_name <> ind_row[2]:
                #print pformat(ind_row)
                #col_info['indexes'][ind_row[2]] = []
                #index_name = ind_row[2]
            #col_info['indexes'][ind_row[2]].append(ind_row[4])
        ## FOR
        # BUGFIX: c3 was never closed, leaking a cursor per table
        c3.close()

        col_info.save()

        ## -----------------------------------------------------------
        ## EXTRACT DATA
        ## -----------------------------------------------------------
        if not self.no_mysql_dataset:
            self.extractData(tbl_name, tbl_cols[tbl_name])
    ## FOR
    # BUGFIX: c1 was never closed
    c1.close()
def extractSchema(self):
    """Reverse-engineer the collection metadata from the MySQL database.

    Reads table and column definitions from information_schema, builds one
    metadata Collection document per table (saved at the end of each
    iteration), and optionally extracts the table's data afterwards.
    """
    c1 = self.mysql_conn.cursor()
    c1.execute("""
        SELECT TABLE_NAME
          FROM information_schema.TABLES
         WHERE TABLE_SCHEMA = %s AND TABLE_NAME != %s""", \
        (self.dbName, MYSQL_LOG_TABLE_NAME))

    # Maps table name -> ordered list of its column names
    tbl_cols = {}

    LOG.info("Extracting table information from database '%s'", self.dbName)
    for row in c1:
        tbl_name = row[0]

        col_info = self.metadata_db.Collection()
        col_info['name'] = tbl_name
        tbl_cols[col_info['name']] = []
        if self.debug:
            LOG.debug("Created metadata object for collection '%s'", tbl_name)

        # Pull the column definitions for this table
        c2 = self.mysql_conn.cursor()
        c2.execute("""
            SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION
              FROM information_schema.COLUMNS
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME=%s
        """, (self.dbName, tbl_name))
        for col_row in c2:
            col_name = col_row[0]
            # Map the SQL type to its Python equivalent for the catalog
            col_type = catalog.sqlTypeToPython(col_row[1])
            col_position = col_row[2]
            tbl_cols[col_info['name']].append(col_name)
            col_type_str = catalog.fieldTypeToString(col_type)
            col_info["fields"][col_name] = catalog.Collection.fieldFactory(col_name, col_type_str)
            col_info["fields"][col_name]['ordinal_position'] = int(col_position)
            if self.debug:
                LOG.info("Created column information for '%s.%s'", tbl_name, col_name)
        ## FOR
        c2.close()

        # Get the index information from MySQL for this table
        # NOTE(review): SQL built by string concatenation (identifiers cannot
        # be parameterized in MySQL); dbName/tbl_name come from
        # information_schema here -- confirm they can never carry untrusted
        # input. Also note this cursor is never closed.
        sql = "SHOW INDEXES FROM " + self.dbName + "." + tbl_name
        c3 = self.mysql_conn.cursor()
        c3.execute(sql)
        index_name = None
        LOG.info("Extracting index information from table '%s'", tbl_name)
        # FIXME
        #for ind_row in c3:
            #if index_name <> ind_row[2]:
                #print pformat(ind_row)
                #col_info['indexes'][ind_row[2]] = []
                #index_name = ind_row[2]
            #col_info['indexes'][ind_row[2]].append(ind_row[4])
        ## FOR

        col_info.save()

        ## -----------------------------------------------------------
        ## EXTRACT DATA
        ## -----------------------------------------------------------
        if not self.no_mysql_dataset:
            self.extractData(tbl_name, tbl_cols[tbl_name])
def testFieldTypeSerialization(self):
    """Verify that Python types round-trip through the catalog's
    serialized (BSON string) representation."""
    for t in [int, str, unicode, float]:
        t_bson = catalog.fieldTypeToString(t)
        # was: assertFalse(t_bson == None) -- assertIsNotNone gives a
        # clearer failure message and tests identity, not equality
        self.assertIsNotNone(t_bson)
        t_python = catalog.fieldTypeToPython(t_bson)
        self.assertIsNotNone(t_python)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(t, t_python)
    ## FOR
def testFieldTypeSerialization(self):
    """Check that each supported Python type survives a round trip through
    the catalog's string type-name format."""
    types_to_check = (int, str, unicode, float)
    for py_type in types_to_check:
        serialized = catalog.fieldTypeToString(py_type)
        self.assertFalse(serialized == None)
        deserialized = catalog.fieldTypeToPython(serialized)
        self.assertFalse(deserialized == None)
        self.assertEquals(py_type, deserialized)
    ## FOR
def setUp(self): # Create a fake Collection catalog entry # WORKLOAD self.col_info = catalog.Collection() self.col_info['name'] = COLLECTION_NAME self.col_info['doc_count'] = NUM_DOCUMENTS self.col_info['workload_queries'] = 1000 self.col_info['workload_percent'] = 1.0 for f in xrange(NUM_FIELDS + 1): # We always need the _id field if not f: f_name = "_id" f_type = catalog.fieldTypeToString(int) f_size = catalog.getEstimatedSize(f_type, 10000) else: f_name = "field%02d" % f if f % 2 == 0: f_type = catalog.fieldTypeToString(long) f_size = catalog.getEstimatedSize(f_type, 10000000l) else: f_type = catalog.fieldTypeToString(str) f_size = 128 f = catalog.Collection.fieldFactory(f_name, f_type) f['avg_size'] = f_size f['query_use_count'] = self.col_info['workload_queries'] self.col_info['fields'][f_name] = f self.col_info['interesting'].append(f_name) self.col_info['avg_doc_size'] += f_size ## FOR (field) self.design = Design() self.design.addCollection(self.col_info['name']) self.design.addIndex(self.col_info['name'], ["_id"]) self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3]) self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE) self.buffer.initialize(self.design)
def setUp(self): # Create a fake Collection catalog entry # WORKLOAD self.col_info = catalog.Collection() self.col_info['name'] = COLLECTION_NAME self.col_info['doc_count'] = NUM_DOCUMENTS self.col_info['workload_queries'] = 1000 self.col_info['workload_percent'] = 1.0 for f in xrange(NUM_FIELDS+1): # We always need the _id field if not f: f_name = "_id" f_type = catalog.fieldTypeToString(int) f_size = catalog.getEstimatedSize(f_type, 10000) else: f_name = "field%02d" % f if f % 2 == 0: f_type = catalog.fieldTypeToString(long) f_size = catalog.getEstimatedSize(f_type, 10000000l) else: f_type = catalog.fieldTypeToString(str) f_size = 128 f = catalog.Collection.fieldFactory(f_name, f_type) f['avg_size'] = f_size f['query_use_count'] = self.col_info['workload_queries'] self.col_info['fields'][f_name] = f self.col_info['interesting'].append(f_name) self.col_info['avg_doc_size'] += f_size ## FOR (field) self.design = Design() self.design.addCollection(self.col_info['name']) self.design.addIndex(self.col_info['name'], ["_id"]) self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3]) self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE) self.buffer.initialize(self.design)
def processDataFields(self, col_info, fields, doc):
    """
        Recursively traverse a single document and extract out the field information

        col_info -- the collection's metadata entry (data_size/interesting mutated)
        fields   -- the field metadata dict being populated (mutated in place)
        doc      -- the document (or nested sub-document) to examine
    """
    if self.debug:
        LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

    # Check if the current doc has parent_col, but this will only apply to its fields
    parent_col = doc.get('parent_col', None)

    for k, v in doc.iteritems():
        # Skip if this is the _id field
        if constants.SKIP_MONGODB_ID_FIELD and k == '_id': continue
        if k == constants.FUNCTIONAL_FIELD: continue

        f_type = type(v)
        f_type_str = catalog.fieldTypeToString(f_type)

        if not k in fields:
            # This is only subset of what we will compute for each field
            # See catalog.Collection for more information
            if self.debug: LOG.debug("Creating new field entry for '%s'" % k)
            fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
        else:
            fields[k]['type'] = f_type_str
            # Sanity check
            # This won't work if the data is not uniform
            #if v != None:
                #assert fields[k]['type'] == f_type_str, \
                #"Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

        # We will store the distinct values for each field in a set
        # that is embedded in the field. We will delete it when
        # we call computeFieldStats()
        if not 'distinct_values' in fields[k]:
            fields[k]['distinct_values'] = set()
        if not "num_values" in fields[k]:
            fields[k]['num_values'] = 0
        # Likewise, we will also store a histogram for the different sizes
        # of each field. We will use this later on to compute the weighted average
        if not 'size_histogram' in fields[k]:
            fields[k]['size_histogram'] = Histogram()
        # Maintain a histogram of list lengths
        if not 'list_len' in fields[k]:
            fields[k]['list_len'] = Histogram()

        # A field that queries actually touch is "interesting" to the designer
        if fields[k]['query_use_count'] > 0 and not k in col_info['interesting']:
            col_info['interesting'].append(k)

        ## ----------------------------------------------
        ## NESTED FIELDS
        ## ----------------------------------------------
        if isinstance(v, dict):
            # Check for a special data field
            if len(v) == 1 and v.keys()[0].startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                v = v[v.keys()[0]]
                # HACK to handle lists (hopefully dict as well)from nested IN clauses...
                all_values = v if isinstance(v, list) else [v]
                for v in all_values:
                    if isinstance(v, dict):
                        v = v.values()[0]
                    fields[k]['type'] = catalog.fieldTypeToString(type(v))
                    try:
                        size = catalog.getEstimatedSize(fields[k]['type'], v)
                        self.total_field_ctr += 1
                    # BUGFIX: was a bare 'except:', which would also swallow
                    # KeyboardInterrupt/SystemExit
                    except Exception:
                        if self.debug:
                            LOG.error("Failed to estimate size for field '%s' in collection '%s'\n%s", \
                                      k, col_info['name'], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info("Total fields so far [%s], error fields [%s]",
                                 self.total_field_ctr, self.err_field_ctr)
                        continue
                    col_info['data_size'] += size
                    fields[k]['size_histogram'].put(size)
                    fields[k]['distinct_values'].add(v)
                    fields[k]['num_values'] += 1
                    if parent_col:
                        fields[k]['parent_col'] = parent_col
                ## FOR
            else:
                if self.debug: LOG.debug("Extracting keys in nested field for '%s'" % k)
                if not 'fields' in fields[k]: fields[k]['fields'] = {}
                self.processDataFields(col_info, fields[k]['fields'], doc[k])

        ## ----------------------------------------------
        ## LIST OF VALUES
        ## Could be either scalars or dicts. If it's a dict, then we'll just
        ## store the nested field information in the 'fields' value
        ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
        ## store the field information for the inner values.
        ## ----------------------------------------------
        elif isinstance(v, list):
            if self.debug: LOG.debug("Extracting keys in nested list for '%s'" % k)
            if not 'fields' in fields[k]: fields[k]['fields'] = {}
            list_len = len(doc[k])
            fields[k]['list_len'].put(list_len)
            for i in xrange(list_len):
                inner_type = type(doc[k][i])
                # More nested documents...
                if inner_type == dict:
                    if self.debug: LOG.debug("Extracting keys in nested field in list position %d for '%s'" % (i, k))
                    self.processDataFields(col_info, fields[k]['fields'], doc[k][i])
                else:
                    # TODO: We probably should store a list of types here in case
                    #       the list has different types of values
                    inner = fields[k]['fields'].get(constants.LIST_INNER_FIELD, {})
                    inner['type'] = catalog.fieldTypeToString(inner_type)
                    try:
                        inner_size = catalog.getEstimatedSize(inner['type'], doc[k][i])
                        self.total_field_ctr += 1
                    # BUGFIX: narrowed from a bare 'except:'
                    except Exception:
                        if self.debug:
                            LOG.error("Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",\
                                      i, k, col_info['name'], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info("Total fields so far [%s], error fields [%s]",
                                 self.total_field_ctr, self.err_field_ctr)
                        continue
                    fields[k]['fields'][constants.LIST_INNER_FIELD] = inner
                    fields[k]['size_histogram'].put(inner_size)
                    fields[k]['distinct_values'].add(doc[k][i])
                    fields[k]['num_values'] += 1
                    if parent_col:
                        fields[k]['parent_col'] = parent_col
            ## FOR (list)
        ## ----------------------------------------------
        ## SCALAR VALUES
        ## ----------------------------------------------
        else:
            try:
                size = catalog.getEstimatedSize(fields[k]['type'], v)
                self.total_field_ctr += 1
            # BUGFIX: narrowed from a bare 'except:'
            except Exception:
                LOG.error("Failed to estimate size for field %s in collection %s\n%s",\
                          k, col_info['name'], pformat(fields[k]))
                self.err_field_ctr += 1
                LOG.info("Total fields so far [%s], error fields [%s]",
                         self.total_field_ctr, self.err_field_ctr)
                continue
            col_info['data_size'] += size
            fields[k]['size_histogram'].put(size)
            fields[k]['distinct_values'].add(v)
            fields[k]['num_values'] += 1
            if parent_col:
                fields[k]['parent_col'] = parent_col
    ## FOR
def processDataFields(self, col_info, fields, doc):
    """
        Recursively traverse a single document and extract out the field information

        col_info -- the collection's metadata entry (data_size/interesting mutated)
        fields   -- the field metadata dict being populated (mutated in place)
        doc      -- the document (or nested sub-document) to examine
    """
    if self.debug:
        LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

    # Check if the current doc has parent_col, but this will only apply to its fields
    parent_col = doc.get("parent_col", None)

    for k, v in doc.iteritems():
        # Skip if this is the _id field
        if constants.SKIP_MONGODB_ID_FIELD and k == "_id": continue
        if k == constants.FUNCTIONAL_FIELD: continue

        f_type = type(v)
        f_type_str = catalog.fieldTypeToString(f_type)

        if not k in fields:
            # This is only subset of what we will compute for each field
            # See catalog.Collection for more information
            if self.debug: LOG.debug("Creating new field entry for '%s'" % k)
            fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
        else:
            fields[k]["type"] = f_type_str
            # Sanity check
            # This won't work if the data is not uniform
            # if v != None:
                # assert fields[k]['type'] == f_type_str, \
                # "Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

        # We will store the distinct values for each field in a set
        # that is embedded in the field. We will delete it when
        # we call computeFieldStats()
        if not "distinct_values" in fields[k]:
            fields[k]["distinct_values"] = set()
        if not "num_values" in fields[k]:
            fields[k]["num_values"] = 0
        # Likewise, we will also store a histogram for the different sizes
        # of each field. We will use this later on to compute the weighted average
        if not "size_histogram" in fields[k]:
            fields[k]["size_histogram"] = Histogram()
        # Maintain a histogram of list lengths
        if not "list_len" in fields[k]:
            fields[k]["list_len"] = Histogram()

        # Fields that queries actually touch are "interesting" to the designer
        if fields[k]["query_use_count"] > 0 and not k in col_info["interesting"]:
            col_info["interesting"].append(k)

        ## ----------------------------------------------
        ## NESTED FIELDS
        ## ----------------------------------------------
        if isinstance(v, dict):
            # Check for a special data field
            if len(v) == 1 and v.keys()[0].startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                v = v[v.keys()[0]]
                # HACK to handle lists (hopefully dict as well)from nested IN clauses...
                all_values = v if isinstance(v, list) else [v]
                for v in all_values:
                    if isinstance(v, dict):
                        v = v.values()[0]
                    fields[k]["type"] = catalog.fieldTypeToString(type(v))
                    try:
                        size = catalog.getEstimatedSize(fields[k]["type"], v)
                        self.total_field_ctr += 1
                    # NOTE(review): bare except also swallows
                    # KeyboardInterrupt/SystemExit -- consider narrowing
                    except:
                        if self.debug:
                            LOG.error(
                                "Failed to estimate size for field '%s' in collection '%s'\n%s",
                                k, col_info["name"], pformat(fields[k]),
                            )
                        self.err_field_ctr += 1
                        LOG.info(
                            "Total fields so far [%s], error fields [%s]",
                            self.total_field_ctr, self.err_field_ctr
                        )
                        continue
                    col_info["data_size"] += size
                    fields[k]["size_histogram"].put(size)
                    fields[k]["distinct_values"].add(v)
                    fields[k]["num_values"] += 1
                    if parent_col:
                        fields[k]["parent_col"] = parent_col
                ## FOR
            else:
                if self.debug: LOG.debug("Extracting keys in nested field for '%s'" % k)
                if not "fields" in fields[k]: fields[k]["fields"] = {}
                self.processDataFields(col_info, fields[k]["fields"], doc[k])

        ## ----------------------------------------------
        ## LIST OF VALUES
        ## Could be either scalars or dicts. If it's a dict, then we'll just
        ## store the nested field information in the 'fields' value
        ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
        ## store the field information for the inner values.
        ## ----------------------------------------------
        elif isinstance(v, list):
            if self.debug: LOG.debug("Extracting keys in nested list for '%s'" % k)
            if not "fields" in fields[k]: fields[k]["fields"] = {}
            list_len = len(doc[k])
            fields[k]["list_len"].put(list_len)
            for i in xrange(list_len):
                inner_type = type(doc[k][i])
                # More nested documents...
                if inner_type == dict:
                    if self.debug: LOG.debug("Extracting keys in nested field in list position %d for '%s'" % (i, k))
                    self.processDataFields(col_info, fields[k]["fields"], doc[k][i])
                else:
                    # TODO: We probably should store a list of types here in case
                    # the list has different types of values
                    inner = fields[k]["fields"].get(constants.LIST_INNER_FIELD, {})
                    inner["type"] = catalog.fieldTypeToString(inner_type)
                    try:
                        inner_size = catalog.getEstimatedSize(inner["type"], doc[k][i])
                        self.total_field_ctr += 1
                    # NOTE(review): bare except -- see note above in this method
                    except:
                        if self.debug:
                            LOG.error(
                                "Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",
                                i, k, col_info["name"], pformat(fields[k]),
                            )
                        self.err_field_ctr += 1
                        LOG.info(
                            "Total fields so far [%s], error fields [%s]",
                            self.total_field_ctr, self.err_field_ctr
                        )
                        continue
                    fields[k]["fields"][constants.LIST_INNER_FIELD] = inner
                    fields[k]["size_histogram"].put(inner_size)
                    fields[k]["distinct_values"].add(doc[k][i])
                    fields[k]["num_values"] += 1
                    if parent_col:
                        fields[k]["parent_col"] = parent_col
            ## FOR (list)
        ## ----------------------------------------------
        ## SCALAR VALUES
        ## ----------------------------------------------
        else:
            try:
                size = catalog.getEstimatedSize(fields[k]["type"], v)
                self.total_field_ctr += 1
            # NOTE(review): bare except -- see note above in this method
            except:
                LOG.error(
                    "Failed to estimate size for field %s in collection %s\n%s",
                    k, col_info["name"], pformat(fields[k]),
                )
                self.err_field_ctr += 1
                LOG.info("Total fields so far [%s], error fields [%s]", self.total_field_ctr, self.err_field_ctr)
                continue
            col_info["data_size"] += size
            fields[k]["size_histogram"].put(size)
            fields[k]["distinct_values"].add(v)
            fields[k]["num_values"] += 1
            if parent_col:
                fields[k]["parent_col"] = parent_col
    ## FOR