Example 1
    def query(self, criteria, limit=20, offset=0):
        """
        The most general way to query based on a set of criteria.
        """
        criteria = criteria.copy()
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
        for criterion in ['node', 'other', 'start', 'end']:
            if (criterion in criteria
                    and criteria[criterion] in TOO_BIG_PREFIXES):
                criteria['filter_' + criterion] = criteria[criterion] + '%'

        query_string = make_list_query(criteria)
        params = {
            key: remove_control_chars(value)
            for (key, value) in criteria.items()
        }
        params['limit'] = limit
        params['offset'] = offset

        cursor = self.connection.cursor()
        cursor.execute(query_string, params)
        results = [
            transform_for_linked_data(data) for uri, data in cursor.fetchall()
        ]
        return results
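For criteria whose values are overly common prefixes, the loop above adds a parallel 'filter_' key whose value ends in a SQL wildcard. A small replay of that transformation, with assumed contents for TOO_BIG_PREFIXES (its real values live in the project, not here):

TOO_BIG_PREFIXES = {'/c/en', '/c/fr'}   # assumed values, for illustration only

criteria = {'node': '/c/en', 'other': '/c/en/example'}
for criterion in ['node', 'other', 'start', 'end']:
    if criterion in criteria and criteria[criterion] in TOO_BIG_PREFIXES:
        criteria['filter_' + criterion] = criteria[criterion] + '%'
# criteria now also contains {'filter_node': '/c/en%'}; 'other' is left alone
# because '/c/en/example' is a specific term, not one of the too-big prefixes.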
Example 2
def test_control_chars():
    text = (
        "\ufeffSometimes, \ufffcbad ideas \x7f\ufffalike these characters\ufffb "
        "\u206aget standardized.\r\n"
    )
    fixed = "Sometimes, bad ideas like these characters get standardized.\r\n"
    assert remove_control_chars(text) == fixed
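This test pins down the characters remove_control_chars is expected to drop while leaving ordinary whitespace such as \r\n intact. A minimal sketch of that behaviour using a translation table; this is an approximation inferred from the test, not ftfy's actual implementation:

# Approximate removal set, inferred from the test above: ASCII controls other
# than useful whitespace, deprecated format characters, interlinear annotations,
# the object replacement character, and the byte order mark.
CONTROL_CHARS = dict.fromkeys(
    [c for c in range(0x00, 0x20) if chr(c) not in '\t\n\f\r']
    + [0x7F]                          # DELETE
    + list(range(0x206A, 0x2070))     # deprecated format chars (U+206A..U+206F)
    + list(range(0xFFF9, 0xFFFD))     # interlinear annotations and U+FFFC
    + [0xFEFF]                        # byte order mark
)

def remove_control_chars_sketch(text):
    # str.translate drops every character that the table maps to None.
    return text.translate(CONTROL_CHARS)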
Example 3
    def lookup_grouped_by_feature(self, uri, limit=20):
        uri = remove_control_chars(uri)
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)

        def extract_feature(row):
            return tuple(row[:2])

        def feature_data(row):
            direction, _, data = row

            # Hacky way to figure out what the 'other' node is, the one that
            # (in most cases) didn't match the URI. If both start with our
            # given URI, take the longer one, which is either a more specific
            # sense or a different, longer word.
            shorter, longer = sorted([data['start'], data['end']], key=len)
            if shorter.startswith(uri):
                data['other'] = longer
            else:
                data['other'] = shorter
            return data

        cursor = self.connection.cursor()
        cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit})
        results = {}
        for feature, rows in itertools.groupby(cursor.fetchall(),
                                               extract_feature):
            results[feature] = [
                transform_for_linked_data(feature_data(row)) for row in rows
            ]
        return results
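itertools.groupby only merges consecutive rows with equal keys, so the grouping above relies on NODE_TO_FEATURE_QUERY returning its rows already ordered by feature. A tiny standalone illustration of that property (the row shapes here are made up):

import itertools

rows = [('/r/IsA', 1), ('/r/IsA', 1), ('/r/PartOf', -1), ('/r/IsA', 1)]
grouped = [(key, len(list(group)))
           for key, group in itertools.groupby(rows, key=lambda row: row[0])]
# -> [('/r/IsA', 2), ('/r/PartOf', 1), ('/r/IsA', 1)]
# The trailing '/r/IsA' row starts a new group because it is not adjacent to
# the earlier ones.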
Example 4
def test_control_chars():
    text = (
        "\ufeffSometimes, \ufffcbad ideas \x7f\ufffalike these characters\ufffb "
        "\u206aget standardized\U000E0065\U000E006E.\r\n"
    )
    fixed = "Sometimes, bad ideas like these characters get standardized.\r\n"
    eq_(remove_control_chars(text), fixed)
Example 5
def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC'
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
Example 6
def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
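A hypothetical call to the function above, assuming it is importable as ftfy.fix_text_segment as in recent ftfy releases; any individual fixer can be switched off through its keyword argument:

import ftfy

# With the defaults, the BOM is stripped and the curly apostrophe is straightened.
ftfy.fix_text_segment('\ufeffdon’t')                        # -> "don't"
# Disabling one fixer leaves that aspect of the text untouched.
ftfy.fix_text_segment('\ufeffdon’t', uncurl_quotes=False)   # -> 'don’t'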
Example 7
    def lookup_assertion(self, uri):
        """
        Get a single assertion, given its URI starting with /a/.
        """
        # Sanitize URIs to remove control characters such as \x00. The postgres
        # driver would remove \x00 anyway, but this avoids reporting a server
        # error when that happens.
        uri = remove_control_chars(uri)
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
        cursor = self.connection.cursor()
        cursor.execute("SELECT data FROM edges WHERE uri=%(uri)s", {'uri': uri})
        results = [transform_for_linked_data(data) for (data,) in cursor.fetchall()]
        return results
Example 8
    def sample_dataset(self, uri, limit=50, offset=0):
        uri = remove_control_chars(uri)
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
        cursor = self.connection.cursor()
        dataset_json = json.dumps(uri)
        cursor.execute(DATASET_QUERY, {
            'dataset': dataset_json,
            'limit': limit,
            'offset': offset
        })
        results = [
            transform_for_linked_data(data) for uri, data in cursor.fetchall()
        ]
        return results
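Wrapping the URI in json.dumps turns it into a JSON string literal, double quotes included, which is presumably the form the dataset field takes inside DATASET_QUERY. A quick illustration:

import json

json.dumps('/d/wiktionary/en')   # -> '"/d/wiktionary/en"'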
Example 9
    def query(self, criteria, limit=20, offset=0):
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)
        params = {
            key: remove_control_chars(value)
            for (key, value) in criteria.items()
        }
        params['limit'] = limit
        params['offset'] = offset
        query_string = make_list_query(criteria)
        cursor = self.connection.cursor()
        cursor.execute(query_string, params)
        results = [
            transform_for_linked_data(data) for uri, data in cursor.fetchall()
        ]
        return results
Example 10
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text
Example 11
def clean_string(s):
    # Bail out on missing values before coercing to str; otherwise a NaN would
    # become the literal string "nan" and slip past the isnull check.
    if isnull(s):
        return None
    s = str(s)
    if re.search('[a-zA-Z]', s) is None:
        return None
    else:
        s = remove_bom(s)
        s = remove_control_chars(s)
        s = fix_encoding(s)
        s = fix_text(s)
        s = fix_partial_utf8_punct_in_1252(s)
        s = decode_escapes(s)
        s = fix_latin_ligatures(s)
        s = uncurl_quotes(s)
        s = s.replace("Äu0087", "ć")
        s = s.replace("Äu0090", "Đ")
        s = s.replace("Ãu0096", "Ö")
        s = s.replace("Åu008D", "ō")

        s = s.replace("\\", " ")
        s = s.replace("/", " ")
        s = s.replace("ö", "ö")

        # Split camelCase words apart; the replacement needs the leading space,
        # as in the linked answer, or the substitution is a no-op.
        p = re.compile(r"^\w+[A-Z]{1}\w*$")
        if p.search(s):
            # From: https://stackoverflow.com/a/37697078
            s = re.sub(r'(?!^)([A-Z][a-z]+)', r' \1', s)

        new_string = ""
        p = False
        for letter in s:
            if letter in "([":
                p = True
            elif letter in ")]":
                p = False
                continue
            if not p:
                new_string += letter
        return new_string.strip()
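A hypothetical round trip through clean_string, assuming it and the ftfy helpers it calls are in scope; it shows the ligature fix, the quote uncurling, and the bracketed-text removal at the end:

clean_string('ﬁnancial report (2019 ﬁgures) “draft”')
# -> roughly 'financial report  "draft"': the ligatures become "fi", the curly
#    quotes are straightened, and the parenthesised span is dropped entirely.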
Example 12
    def lookup_grouped_by_feature(self, uri, limit=20):
        """
        The query used by the browseable interface, which groups its results
        by what 'feature' they describe of the queried node.

        A feature is defined by the relation, the queried node, and the direction
        (incoming or outgoing).
        """
        uri = remove_control_chars(uri)
        if self.connection is None:
            self.connection = get_db_connection(self.dbname)

        def extract_feature(row):
            return tuple(row[:2])

        def feature_data(row):
            direction, _, data = row

            # Hacky way to figure out what the 'other' node is, the one that
            # (in most cases) didn't match the URI. If both start with our
            # given URI, take the longer one, which is either a more specific
            # sense or a different, longer word.
            shorter, longer = sorted([data['start'], data['end']], key=len)
            if shorter.startswith(uri):
                data['other'] = longer
            else:
                data['other'] = shorter
            return data

        cursor = self.connection.cursor()
        cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit})
        results = {}
        for feature, rows in itertools.groupby(cursor.fetchall(), extract_feature):
            results[feature] = [
                transform_for_linked_data(feature_data(row)) for row in rows
            ]
        return results
Example 13
    def lookup_grouped_by_feature(self, uri, limit=20):
        """
        The query used by the browseable interface, which groups its results
        by what 'feature' they describe of the queried node.

        A feature is defined by the relation, the queried node, and the direction
        (incoming or outgoing).
        """
        uri = remove_control_chars(uri)

        def extract_feature(row):
            return tuple(row[:2])

        def feature_data(row):
            direction, _, data = row

            # Hacky way to figure out what the 'other' node is, the one that
            # (in most cases) didn't match the URI. If both start with our
            # given URI, take the longer one, which is either a more specific
            # sense or a different, longer word.
            shorter, longer = sorted([data['start'], data['end']], key=len)
            if shorter.startswith(uri):
                data['other'] = longer
            else:
                data['other'] = shorter
            return data

        cursor = self.connection.cursor()
        cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit})
        results = {}
        for feature, rows in itertools.groupby(cursor.fetchall(),
                                               extract_feature):
            results[feature] = [
                transform_for_linked_data(feature_data(row)) for row in rows
            ]
        return results
Example 14
def test_control_chars():
    text = (
        "\ufeffSometimes, \ufffcbad ideas \x7f\ufffalike these characters\ufffb "
        "\u206aget standardized\U000E0065\U000E006E.\r\n")
    fixed = "Sometimes, bad ideas like these characters get standardized.\r\n"
    eq_(remove_control_chars(text), fixed)
Example 15
def render_safe(text):
    '''
    Make sure the given text is safe to pass to an external process.
    '''
    return remove_control_chars(remove_unsafe_private_use(text))
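A hypothetical use of render_safe before handing text to an external process, which is what its docstring describes; the argument-list form of subprocess is assumed here:

import subprocess

label = render_safe('node\x00name\ufeff')   # control characters are stripped: 'nodename'
subprocess.run(['echo', label], check=True)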
Example 16
def test_welsh_flag():
    # ftfy used to remove "tag characters", but they have been repurposed in the
    # "Flag of England", "Flag of Scotland", and "Flag of Wales" emoji sequences.
    text = "This flag has a dragon on it ­ЪЈ┤заЂДзаЂбзаЂизаЂгзаЂ│заЂ┐"
    assert remove_control_chars(text) == text