def nodeToExtSearch(self, node, schema):
    """Build the extended search index entry for *node*.

    Collects the values of the node's search fields, then either updates
    the existing ``searchmeta`` row for the node or inserts a new one.

    @param node: core.tree node whose search fields are indexed
    @param schema: String, name of the schema
    @return: True on success (also when the schema defines no search
        fields), False when the SQL statement failed
    """
    if len(node.getSearchFields()) == 0:
        # stop if schema has no searchfields
        return True
    self.nodeToSchemaDef(node, schema)  # save definition
    keyvalue = []
    for i, field in enumerate(node.getSearchFields(), start=1):
        key = "field%d" % i
        if field.getFieldtype() == "union":
            # union fields concatenate several attribute values
            value = ""
            for item in field.get("valuelist").split(";"):
                value += node.get(item) + '|'
        else:
            value = node.get(field.getName())
        keyvalue.append((key, modify_tex(u(protect(value)), 'strip')))
    # NOTE(review): SQL is assembled via string concatenation; values are
    # only sanitised by protect()/normalize_utf8().  The execute() wrapper
    # takes a raw statement string, so switching to parameterised queries
    # would require an interface change -- flagged, not changed here.
    sql0 = 'SELECT id FROM searchmeta where id=\'{}\''.format(node.id)
    sql1 = 'UPDATE searchmeta SET '
    sql2 = 'INSERT INTO searchmeta (id, type, schema, updatetime'
    for key, value in keyvalue:
        sql1 += key + "='" + normalize_utf8(value) + "', "
        sql2 += ", " + key
    sql1 += "type='" + node.getContentType() + "', schema='" + schema + \
        "', updatetime='" + node.get("updatetime") + "'"
    sql2 += ") VALUES("
    sql2 += '\'{}\', "{}", "{}", "{}"'.format(node.id, node.getContentType(),
                                              schema, node.get("updatetime"))
    for key, value in keyvalue:
        sql2 += ", '" + normalize_utf8(value) + "'"
    sql1 += " WHERE id='{}'".format(node.id)
    sql2 += ")"
    sql = ""
    try:
        sql = sql0
        if self.execute(sql0, schema, 'ext'):  # row already present?
            sql = sql1
            self.execute(sql1, schema, 'ext')  # do update
        else:
            sql = sql2
            self.execute(sql2, schema, 'ext')  # do insert
        return True
    except Exception:  # was a bare except; keep best-effort semantics
        logException('error in sqlite insert/update: ' + sql)
        return False
def nodeToExtSearch(self, node, schema):
    """Build the extended search index entry for *node*.

    Collects the values of the node's search fields, then either updates
    the existing ``searchmeta`` row for the node or inserts a new one.

    @param node: core.tree node whose search fields are indexed
    @param schema: String, name of the schema
    @return: True on success (also when the schema defines no search
        fields), False when the SQL statement failed
    """
    if len(node.getSearchFields()) == 0:
        # stop if schema has no searchfields
        return True
    self.nodeToSchemaDef(node, schema)  # save definition
    keyvalue = []
    for i, field in enumerate(node.getSearchFields(), start=1):
        key = "field%d" % i
        if field.getFieldtype() == "union":
            # union fields concatenate several attribute values
            value = ""
            for item in field.get("valuelist").split(";"):
                value += node.get(item) + '|'
        else:
            value = node.get(field.getName())
        keyvalue.append((key, modify_tex(u(protect(value)), 'strip')))
    # NOTE(review): SQL is assembled via string concatenation; values are
    # only sanitised by protect()/normalize_utf8().  The execute() wrapper
    # takes a raw statement string, so switching to parameterised queries
    # would require an interface change -- flagged, not changed here.
    sql0 = 'SELECT id FROM searchmeta where id=\'{}\''.format(node.id)
    sql1 = 'UPDATE searchmeta SET '
    sql2 = 'INSERT INTO searchmeta (id, type, schema, updatetime'
    for key, value in keyvalue:
        sql1 += key + "='" + normalize_utf8(value) + "', "
        sql2 += ", " + key
    sql1 += "type='" + node.getContentType() + "', schema='" + schema + \
        "', updatetime='" + node.get("updatetime") + "'"
    sql2 += ") VALUES("
    sql2 += '\'{}\', "{}", "{}", "{}"'.format(node.id, node.getContentType(),
                                              schema, node.get("updatetime"))
    for key, value in keyvalue:
        sql2 += ", '" + normalize_utf8(value) + "'"
    sql1 += " WHERE id='{}'".format(node.id)
    sql2 += ")"
    sql = ""
    try:
        sql = sql0
        if self.execute(sql0, schema, 'ext'):  # row already present?
            sql = sql1
            self.execute(sql1, schema, 'ext')  # do update
        else:
            sql = sql2
            self.execute(sql2, schema, 'ext')  # do insert
        return True
    except Exception:  # was a bare except; keep best-effort semantics
        logException('error in sqlite insert/update: ' + sql)
        return False
def getSQL(type, value, spc=None):  # deliver sql for given type
    """Return the SQL statement for the given search query *type*.

    @param type: String selecting the statement to build (the name
        shadows the ``type`` builtin but is kept for backward
        compatibility with keyword callers)
    @param value: search value; normalised/escaped before embedding
    @param spc: optional dict with special parameters ('op' operator,
        'pos' (position, schema) tuple); the former mutable default
        ``{}`` is replaced by None to avoid a shared instance
    @return: SQL string, or None for an unknown *type*
    """
    if spc is None:
        spc = {}
    value = normalize_utf8(protect(u(value)))
    if type == "full":  # all metadata
        return 'select distinct(id) from fullsearchmeta where fullsearchmeta match \'value:' + value + '\' and type <>\'directory\''
    elif type == "fulltext":  # fulltext
        return 'select distinct(id) from textsearchmeta where textsearchmeta match \'value:' + value + '\' and type<>\'directory\''
    elif type == "schema":  # schemadef
        return 'select distinct(id) from fullsearchmeta where schema="' + value.replace("'", "") + '"'
    elif type == "objtype":  # object type
        return 'select distinct(id) from fullsearchmeta where type="' + value.replace("'", "") + '"'
    elif type == "updatetime":  # update time with operator <|>|=
        if len(value) == 10:
            # date only -> expand to full ISO timestamp
            value += "T00:00:00"
        return 'select distinct(id) from searchmeta where updatetime ' + spc['op'] + ' "' + value.replace("t", "T") + '"'
    elif type == "field":
        return 'select position, name from searchmeta_def where attrname=\'' + value + '\''
    elif type == "spcompare":
        return 'select distinct(id) from searchmeta where schema="' + \
            str(spc['pos'][1]) + '" and field' + str(spc['pos'][0]) + ' ' + spc['op'] + ' "' + value + '"'
    elif type == "spfield":
        return 'select distinct(id) from searchmeta where field' + str(spc['pos'][0]) + '=""'
    elif type == "spmatch":
        return 'select distinct(id) from searchmeta where schema=\'' + \
            str(spc['pos'][1]) + '\' and field' + str(spc['pos'][0]) + ' match \'' + value + '\''
    elif type == "content_full":
        return 'select * from fullsearchmeta where id=\'' + value + '\''
    elif type == "content_text":
        return 'select * from textsearchmeta where id=\'' + value + '\''
    elif type == "content_ext":
        return 'select * from searchmeta where id=\'' + value + '\''
def getSQL(type, value, spc=None):  # deliver sql for given type
    """Return the SQL statement for the given search query *type*.

    @param type: String selecting the statement to build (the name
        shadows the ``type`` builtin but is kept for backward
        compatibility with keyword callers)
    @param value: search value; normalised/escaped before embedding
    @param spc: optional dict with special parameters ('op' operator,
        'pos' (position, schema) tuple); the former mutable default
        ``{}`` is replaced by None to avoid a shared instance
    @return: SQL string, or None for an unknown *type*
    """
    if spc is None:
        spc = {}
    value = normalize_utf8(protect(u(value)))
    if type == "full":  # all metadata
        return 'select distinct(id) from fullsearchmeta where fullsearchmeta match \'value:' + value + '\' and type <>\'directory\''
    elif type == "fulltext":  # fulltext
        return 'select distinct(id) from textsearchmeta where textsearchmeta match \'value:' + value + '\' and type<>\'directory\''
    elif type == "schema":  # schemadef
        return 'select distinct(id) from fullsearchmeta where schema="' + value.replace("'", "") + '"'
    elif type == "objtype":  # object type
        return 'select distinct(id) from fullsearchmeta where type="' + value.replace("'", "") + '"'
    elif type == "updatetime":  # update time with operator <|>|=
        if len(value) == 10:
            # date only -> expand to full ISO timestamp
            value += "T00:00:00"
        return 'select distinct(id) from searchmeta where updatetime ' + spc['op'] + ' "' + value.replace("t", "T") + '"'
    elif type == "field":
        return 'select position, name from searchmeta_def where attrname=\'' + value + '\''
    elif type == "spcompare":
        return 'select distinct(id) from searchmeta where schema="' + \
            str(spc['pos'][1]) + '" and field' + str(spc['pos'][0]) + ' ' + spc['op'] + ' "' + value + '"'
    elif type == "spfield":
        return 'select distinct(id) from searchmeta where field' + str(spc['pos'][0]) + '=""'
    elif type == "spmatch":
        return 'select distinct(id) from searchmeta where schema=\'' + \
            str(spc['pos'][1]) + '\' and field' + str(spc['pos'][0]) + ' match \'' + value + '\''
    elif type == "content_full":
        return 'select * from fullsearchmeta where id=\'' + value + '\''
    elif type == "content_text":
        return 'select * from textsearchmeta where id=\'' + value + '\''
    elif type == "content_ext":
        return 'select * from searchmeta where id=\'' + value + '\''
def nodeToExtSearch(self, node):
    """Build the extended search index entry for *node* (insert only).

    @param node: core.tree node
    @return: True on success (also when the schema defines no search
        fields), False when the insert failed
    """
    if len(node.getSearchFields()) == 0:
        # stop if schema has no searchfields
        return True
    v_list = {}
    for i, field in enumerate(node.getSearchFields(), start=1):
        v_list[str(i)] = node.get(field.getName())
    # save definition
    self.nodeToSchemaDef(node)
    try:
        # v_list is never empty here (guarded by the early return above),
        # so the former empty-dict branch was unreachable and is dropped.
        # Iterating the same unmutated dict twice yields the same order,
        # so columns and values stay paired.
        cols = ', '.join('field' + key for key in v_list)
        vals = ', '.join('"' + normalize_utf8(u(v_list[key])) + '"'
                         for key in v_list)
        sql = 'INSERT INTO searchmeta (id, type, schema, {}) VALUES("{}", "{}", "{}", {})'.format(
            cols, node.id, node.getContentType(), node.getSchema(), vals)
        self.db.execute(sql)
        return True
    except Exception:  # was a bare except; still best-effort, returns False
        return False
def nodeToSimpleSearch(self, node, schema, type=""):
    """Build the simple (full) search index entry for *node*.

    Concatenates the node name, all non-system attributes and file
    information into one value column, then updates the existing
    ``fullsearchmeta`` row or inserts a new one.

    @param node: core.tree node
    @param schema: String, name of the schema
    @param type: unused, kept for caller compatibility
    @return: True on success, False when the statement failed
    """
    sql_upd = "UPDATE fullsearchmeta SET type='{}', schema='{}', value='{}| ".format(
        node.getContentType(), node.getSchema(), node.name)
    sql_ins = "INSERT INTO fullsearchmeta (id, type, schema, value) VALUES('{}', '{}', '{}', '{}| ".format(
        node.id, node.getContentType(), node.getSchema(), node.name)
    # attributes
    val = ''
    for key, value in node.items():
        if key not in SYSTEMATTRS:  # ignore system attributes
            val += protect(u(value)) + '| '
    # additionally index the normalised spelling of words that differ
    # from their plain lowercased form (umlauts etc.); split() snapshots
    # the list first, so appending to val inside the loop is safe
    for v in val.split(" "):
        v = u(v)
        if normalize_utf8(v) != v.lower():
            val += ' ' + normalize_utf8(v)
    val = val.replace(chr(0), "") + ' '
    # remove tex markup
    val = modify_tex(val, 'strip')
    # files
    for nfile in node.getFiles():  # renamed from ``file`` (builtin shadow)
        val += protect(
            u(nfile.getName() + '| ' + nfile.getType() + '| ' + nfile.getMimeType()) + '| ')
    sql_upd += val + '\' WHERE id=\'{}\''.format(node.id)
    sql_ins += val + '\')'
    sql = ""
    try:
        sql = 'SELECT id from fullsearchmeta WHERE id=\'{}\''.format(node.id)
        if self.execute(sql, schema, 'full'):  # check existance
            sql = sql_upd  # do update
        else:
            sql = sql_ins  # do insert
        self.execute(sql, schema, 'full')
        return True
    except Exception:  # was a bare except; keep best-effort semantics
        logException('error in sqlite insert/update: ' + sql)
        return False
def nodeToSimpleSearch(self, node, schema, type=""):
    """Build the simple (full) search index entry for *node*.

    Concatenates the node name, all non-system attributes and file
    information into one value column, then updates the existing
    ``fullsearchmeta`` row or inserts a new one.

    @param node: core.tree node
    @param schema: String, name of the schema
    @param type: unused, kept for caller compatibility
    @return: True on success, False when the statement failed
    """
    sql_upd = "UPDATE fullsearchmeta SET type='{}', schema='{}', value='{}| ".format(
        node.getContentType(), node.getSchema(), node.name)
    sql_ins = "INSERT INTO fullsearchmeta (id, type, schema, value) VALUES('{}', '{}', '{}', '{}| ".format(
        node.id, node.getContentType(), node.getSchema(), node.name)
    # attributes
    val = ''
    for key, value in node.items():
        if key not in SYSTEMATTRS:  # ignore system attributes
            val += protect(u(value)) + '| '
    # additionally index the normalised spelling of words that differ
    # from their plain lowercased form (umlauts etc.); split() snapshots
    # the list first, so appending to val inside the loop is safe
    for v in val.split(" "):
        v = u(v)
        if normalize_utf8(v) != v.lower():
            val += ' ' + normalize_utf8(v)
    val = val.replace(chr(0), "") + ' '
    # remove tex markup
    val = modify_tex(val, 'strip')
    # files
    for nfile in node.getFiles():  # renamed from ``file`` (builtin shadow)
        val += protect(
            u(nfile.getName() + '| ' + nfile.getType() + '| ' + nfile.getMimeType()) + '| ')
    sql_upd += val + '\' WHERE id=\'{}\''.format(node.id)
    sql_ins += val + '\')'
    sql = ""
    try:
        sql = 'SELECT id from fullsearchmeta WHERE id=\'{}\''.format(node.id)
        if self.execute(sql, schema, 'full'):  # check existance
            sql = sql_upd  # do update
        else:
            sql = sql_ins  # do insert
        self.execute(sql, schema, 'full')
        return True
    except Exception:  # was a bare except; keep best-effort semantics
        logException('error in sqlite insert/update: ' + sql)
        return False
def get_extended_field_ratio(schema, node, db_content):
    """
    Compares the values in the ext search db and the values in the node
    instance and returns a ratio of likeness between the two values.
    @param schema: String, name of the schema
    @param node: Node, an core.tree node instance
    @param db_content: searchmeta row, indexable by field position
    @return: Float in [0.0, 1.0]
    """
    ratios = []
    field_names = get_zero_index_schema_fields(schema)
    for field in field_names:
        # node value gets the same normalisation pipeline used at
        # indexing time so both sides are comparable
        node_value = normalize_utf8(
            modify_tex(u(protect(node.get(field.name))), 'strip'))
        db_value = str(db_content[field.position])
        ratios.append(
            difflib.SequenceMatcher(None, db_value, node_value).ratio())
    if not ratios:
        # schema without search fields: nothing can differ, report full
        # likeness (previously raised ZeroDivisionError)
        return 1.0
    return sum(ratios) / len(ratios)
def get_extended_field_ratio(schema, node, db_content):
    """
    Compares the values in the ext search db and the values in the node
    instance and returns a ratio of likeness between the two values.
    @param schema: String, name of the schema
    @param node: Node, an core.tree node instance
    @param db_content: searchmeta row, indexable by field position
    @return: Float in [0.0, 1.0]
    """
    ratios = []
    field_names = get_zero_index_schema_fields(schema)
    for field in field_names:
        # node value gets the same normalisation pipeline used at
        # indexing time so both sides are comparable
        node_value = normalize_utf8(
            modify_tex(u(protect(node.get(field.name))), 'strip'))
        db_value = str(db_content[field.position])
        ratios.append(
            difflib.SequenceMatcher(None, db_value, node_value).ratio())
    if not ratios:
        # schema without search fields: nothing can differ, report full
        # likeness (previously raised ZeroDivisionError)
        return 1.0
    return sum(ratios) / len(ratios)
def nodeToExtSearch(self, node):
    """Build the extended search index entry for *node* (insert only).

    @param node: core.tree node
    @return: True on success (also when the schema defines no search
        fields), False when the insert failed
    """
    if len(node.getSearchFields()) == 0:
        # stop if schema has no searchfields
        return True
    v_list = {}
    for i, field in enumerate(node.getSearchFields(), start=1):
        v_list[str(i)] = node.get(field.getName())
    # save definition
    self.nodeToSchemaDef(node)
    try:
        # v_list is never empty here (guarded by the early return above),
        # so the former empty-dict branch was unreachable and is dropped.
        # Iterating the same unmutated dict twice yields the same order,
        # so columns and values stay paired.
        cols = ', '.join('field' + key for key in v_list)
        vals = ', '.join('"' + normalize_utf8(u(v_list[key])) + '"'
                         for key in v_list)
        sql = 'INSERT INTO searchmeta (id, type, schema, {}) VALUES("{}", "{}", "{}", {})'.format(
            cols, node.id, node.getContentType(), node.getSchema(), vals)
        self.db.execute(sql)
        return True
    except Exception:  # was a bare except; still best-effort, returns False
        return False
def nodeToSimpleSearch(self, node):
    """Build the simple search index entry for *node* (insert only).

    Concatenates the node name, all non-system attributes and file
    information into the ``fullsearchmeta`` value column.

    @param node: core.tree node
    @return: True on success, False when the insert failed
    """
    try:
        sql = 'INSERT INTO fullsearchmeta (id, type, schema, value) VALUES(\'{}\', \'{}\', \'{}\', \'{}| '.format(
            node.id, node.getContentType(), node.getSchema(), node.name)
        # attributes
        a = ''
        for key, value in node.items():
            if key not in SYSTEMATTRS:  # ignore system attributes
                a += protect(u(value)) + '| '
        a = normalize_utf8(a)
        sql += a
        # files
        for nfile in node.getFiles():  # renamed from ``file`` (builtin shadow)
            sql += protect(
                u(nfile.getName() + '| ' + nfile.getType() + '| ' + nfile.getMimeType()) + '| ')
        sql += '\')'
        self.db.execute(sql)
        return True
    except Exception:  # was a bare except; still best-effort, returns False
        return False
def nodeToSimpleSearch(self, node):
    """Build the simple search index entry for *node* (insert only).

    Concatenates the node name, all non-system attributes and file
    information into the ``fullsearchmeta`` value column.

    @param node: core.tree node
    @return: True on success, False when the insert failed
    """
    try:
        sql = 'INSERT INTO fullsearchmeta (id, type, schema, value) VALUES(\'{}\', \'{}\', \'{}\', \'{}| '.format(
            node.id, node.getContentType(), node.getSchema(), node.name)
        # attributes
        a = ''
        for key, value in node.items():
            if key not in SYSTEMATTRS:  # ignore system attributes
                a += protect(u(value)) + '| '
        a = normalize_utf8(a)
        sql += a
        # files
        for nfile in node.getFiles():  # renamed from ``file`` (builtin shadow)
            sql += protect(
                u(nfile.getName() + '| ' + nfile.getType() + '| ' + nfile.getMimeType()) + '| ')
        sql += '\')'
        self.db.execute(sql)
        return True
    except Exception:  # was a bare except; still best-effort, returns False
        return False
def nodeToFulltextSearch(self, node, schema):
    """Build the fulltext index entry for *node* from its fulltext files.

    Only nodes whose category is "document" are indexed.  Depending on
    FULLTEXT_INDEX_MODE the raw text (0), the distinct words (1) or the
    words with their counts (2) are stored in ``textsearchmeta`` in
    chunks of 500000 characters.

    @param node: core.tree node
    @param schema: String, name of the schema
    @return: True on success or when nothing had to be done, False when
        an insert failed
    """
    if not hasattr(node, "getCategoryName") or not node.getCategoryName() == "document":
        # only build fulltext of document nodes
        return True
    word_re = re.compile("[a-zA-Z0-9]+")
    if self.execute('SELECT id from textsearchmeta where id=\'{}\''.format(node.id), schema, 'text'):
        # FIXME: we should not delete the old textdata from this node, and insert
        # the new files. Only problem is, DELETE from a FTS3 table is prohibitively
        # slow.
        return True  # was a bare ``return`` (None); success paths return True
    for nodefile in node.getFiles():  # renamed from ``file`` (builtin shadow)
        if nodefile.getType() == "fulltext" and os.path.exists(nodefile.retrieveFile()):
            data = {}
            content = ''
            f = open(nodefile.retrieveFile())
            try:
                for line in f:
                    if FULLTEXT_INDEX_MODE == 0:
                        content += u(line)
                    else:
                        for w in word_re.findall(line):
                            # plain occurrence count; the former pre-seeding
                            # of unseen words made every count one too high
                            try:
                                data[w] += 1
                            except KeyError:
                                data[w] = 1
            finally:
                f.close()
            if FULLTEXT_INDEX_MODE == 1:
                for key in data.keys():
                    content += key + " "
            elif FULLTEXT_INDEX_MODE == 2:
                for key in data.keys():
                    content += key + " [" + str(data[key]) + "] "
            content = u(content.replace("'", "").replace('"', ""))
            if len(content) > 0:
                # store in chunks of 500000 characters; the former slice
                # ``(p + 1) * 500000 - 1`` silently dropped one character
                # per chunk, and ``while p in range(...)`` did an O(n)
                # membership test each pass
                for p in range(int(ceil(len(content) / 500000.0))):
                    sql = 'INSERT INTO textsearchmeta (id, type, schema, value) VALUES("{}", "{}", "{}", "{}")'.format(
                        node.id, node.getContentType(), schema,
                        normalize_utf8(content[p * 500000:(p + 1) * 500000]))
                    try:
                        self.execute(sql, schema, 'text')
                    except Exception:  # was a bare except
                        # version-agnostic form of the old Py2 print statement
                        print("\nerror in fulltext of node {}".format(node.id))
                        return False
                return True
    return True
def nodeToFulltextSearch(self, node, schema):
    """Build the fulltext index entry for *node* from its fulltext files.

    Only nodes whose category is "document" are indexed.  Depending on
    FULLTEXT_INDEX_MODE the raw text (0), the distinct words (1) or the
    words with their counts (2) are stored in ``textsearchmeta`` in
    chunks of 500000 characters.

    @param node: core.tree node
    @param schema: String, name of the schema
    @return: True on success or when nothing had to be done, False when
        an insert failed
    """
    if not hasattr(node, "getCategoryName") or not node.getCategoryName() == "document":
        # only build fulltext of document nodes
        return True
    word_re = re.compile("[a-zA-Z0-9]+")
    if self.execute('SELECT id from textsearchmeta where id=\'{}\''.format(node.id), schema, 'text'):
        # FIXME: we should not delete the old textdata from this node, and insert
        # the new files. Only problem is, DELETE from a FTS3 table is prohibitively
        # slow.
        return True  # was a bare ``return`` (None); success paths return True
    for nodefile in node.getFiles():  # renamed from ``file`` (builtin shadow)
        if nodefile.getType() == "fulltext" and os.path.exists(nodefile.retrieveFile()):
            data = {}
            content = ''
            f = open(nodefile.retrieveFile())
            try:
                for line in f:
                    if FULLTEXT_INDEX_MODE == 0:
                        content += u(line)
                    else:
                        for w in word_re.findall(line):
                            # plain occurrence count; the former pre-seeding
                            # of unseen words made every count one too high
                            try:
                                data[w] += 1
                            except KeyError:
                                data[w] = 1
            finally:
                f.close()
            if FULLTEXT_INDEX_MODE == 1:
                for key in data.keys():
                    content += key + " "
            elif FULLTEXT_INDEX_MODE == 2:
                for key in data.keys():
                    content += key + " [" + str(data[key]) + "] "
            content = u(content.replace("'", "").replace('"', ""))
            if len(content) > 0:
                # store in chunks of 500000 characters; the former slice
                # ``(p + 1) * 500000 - 1`` silently dropped one character
                # per chunk, and ``while p in range(...)`` did an O(n)
                # membership test each pass
                for p in range(int(ceil(len(content) / 500000.0))):
                    sql = 'INSERT INTO textsearchmeta (id, type, schema, value) VALUES("{}", "{}", "{}", "{}")'.format(
                        node.id, node.getContentType(), schema,
                        normalize_utf8(content[p * 500000:(p + 1) * 500000]))
                    try:
                        self.execute(sql, schema, 'text')
                    except Exception:  # was a bare except
                        # version-agnostic form of the old Py2 print statement
                        print("\nerror in fulltext of node {}".format(node.id))
                        return False
                return True
    return True