def query(self, criteria, limit=20, offset=0):
    """
    The most general way to query based on a set of criteria.
    """
    criteria = criteria.copy()
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    for criterion in ['node', 'other', 'start', 'end']:
        if criterion in criteria and criteria[criterion] in TOO_BIG_PREFIXES:
            criteria['filter_' + criterion] = criteria[criterion] + '%'
    query_string = make_list_query(criteria)
    params = {
        key: remove_control_chars(value)
        for (key, value) in criteria.items()
    }
    params['limit'] = limit
    params['offset'] = offset
    cursor = self.connection.cursor()
    cursor.execute(query_string, params)
    results = [
        transform_for_linked_data(data) for uri, data in cursor.fetchall()
    ]
    return results

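# Usage sketch (hypothetical): the enclosing class name, its constructor
# arguments, and the criteria keys are assumptions; the snippet above only
# shows the method itself.
finder = AssertionFinder(dbname='conceptnet')
for edge in finder.query({'node': '/c/en/example', 'rel': '/r/IsA'}, limit=10):
    print(edge)
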
def test_control_chars():
    text = (
        "\ufeffSometimes, \ufffcbad ideas \x7f\ufffalike these characters\ufffb "
        "\u206aget standardized.\r\n"
    )
    fixed = "Sometimes, bad ideas like these characters get standardized.\r\n"
    assert remove_control_chars(text) == fixed

def lookup_grouped_by_feature(self, uri, limit=20):
    uri = remove_control_chars(uri)
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)

    def extract_feature(row):
        return tuple(row[:2])

    def feature_data(row):
        direction, _, data = row
        # Hacky way to figure out what the 'other' node is, the one that
        # (in most cases) didn't match the URI. If both start with our
        # given URI, take the longer one, which is either a more specific
        # sense or a different, longer word.
        shorter, longer = sorted([data['start'], data['end']], key=len)
        if shorter.startswith(uri):
            data['other'] = longer
        else:
            data['other'] = shorter
        return data

    cursor = self.connection.cursor()
    cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit})
    results = {}
    for feature, rows in itertools.groupby(cursor.fetchall(), extract_feature):
        results[feature] = [
            transform_for_linked_data(feature_data(row)) for row in rows
        ]
    return results

from nose.tools import eq_  # eq_ is nose's assert-equal helper


def test_control_chars():
    text = (
        "\ufeffSometimes, \ufffcbad ideas \x7f\ufffalike these characters\ufffb "
        "\u206aget standardized\U000E0065\U000E006E.\r\n"
    )
    fixed = "Sometimes, bad ideas like these characters get standardized.\r\n"
    eq_(remove_control_chars(text), fixed)

def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC',
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text

def fix_text_segment(text,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     fix_latin_ligatures=True,
                     fix_character_width=True,
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text

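# Minimal usage sketch for the two variants above, assuming fix_text_segment
# is importable from ftfy as in released versions of the library (the
# snippets only show its definition). With the defaults, fix_encoding repairs
# the mojibake and uncurl_quotes then straightens the recovered apostrophe.
from ftfy import fix_text_segment

print(fix_text_segment("The Mona Lisa doesnÃ¢â‚¬â„¢t have eyebrows."))
# -> The Mona Lisa doesn't have eyebrows.
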
def lookup_assertion(self, uri):
    """
    Get a single assertion, given its URI starting with /a/.
    """
    # Sanitize URIs to remove control characters such as \x00. The postgres
    # driver would remove \x00 anyway, but this avoids reporting a server
    # error when that happens.
    uri = remove_control_chars(uri)
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    cursor = self.connection.cursor()
    cursor.execute("SELECT data FROM edges WHERE uri=%(uri)s", {'uri': uri})
    results = [transform_for_linked_data(data) for (data,) in cursor.fetchall()]
    return results

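# Hypothetical call; the class name and the assertion URI are illustrative
# only, chosen to match the /a/ format the docstring describes.
finder = AssertionFinder(dbname='conceptnet')
assertion = finder.lookup_assertion('/a/[/r/IsA/,/c/en/dog/,/c/en/animal/]')
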
def sample_dataset(self, uri, limit=50, offset=0):
    uri = remove_control_chars(uri)
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    cursor = self.connection.cursor()
    dataset_json = json.dumps(uri)
    cursor.execute(DATASET_QUERY, {
        'dataset': dataset_json,
        'limit': limit,
        'offset': offset
    })
    results = [
        transform_for_linked_data(data) for uri, data in cursor.fetchall()
    ]
    return results

def query(self, criteria, limit=20, offset=0):
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    params = {
        key: remove_control_chars(value)
        for (key, value) in criteria.items()
    }
    params['limit'] = limit
    params['offset'] = offset
    query_string = make_list_query(criteria)
    cursor = self.connection.cursor()
    cursor.execute(query_string, params)
    results = [
        transform_for_linked_data(data) for uri, data in cursor.fetchall()
    ]
    return results

def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text

import re

# Probable imports (assumption: the helpers come from pandas and ftfy):
from pandas import isnull
from ftfy import fix_text
from ftfy.fixes import (decode_escapes, fix_encoding, fix_latin_ligatures,
                        fix_partial_utf8_punct_in_1252, remove_bom,
                        remove_control_chars, uncurl_quotes)


def clean_string(s):
    # Check for missing values before coercing to str; str() would turn NaN
    # into the truthy string "nan" and defeat the isnull() test.
    if isnull(s):
        return None
    s = str(s)
    if re.search('[a-zA-Z]', s) is None:
        return None
    s = remove_bom(s)
    s = remove_control_chars(s)
    s = fix_encoding(s)
    s = fix_text(s)
    s = fix_partial_utf8_punct_in_1252(s)
    s = decode_escapes(s)
    s = fix_latin_ligatures(s)
    s = uncurl_quotes(s)
    # Repair leftover UTF-8-read-as-Latin-1 mojibake pairs (a Latin-1 lead
    # byte followed by a C1 control character).
    s = s.replace("Ä\u0087", "ć")
    s = s.replace("Ä\u0090", "Đ")
    s = s.replace("Ã\u0096", "Ö")
    s = s.replace("Å\u008D", "ō")
    s = s.replace("\\", " ")
    s = s.replace("/", " ")
    # Replace the decomposed form (o + combining diaeresis) with the
    # precomposed character (assumption: the original used the decomposed
    # form, which renders identically to "ö").
    s = s.replace("o\u0308", "ö")
    if re.search(r"^\w+[A-Z]\w*$", s):
        # Split camelCase by inserting a space before each interior
        # capitalized word. From: https://stackoverflow.com/a/37697078
        s = re.sub(r'(?!^)([A-Z][a-z]+)', r' \1', s)
    # Drop any text inside parentheses or square brackets, including the
    # brackets themselves.
    new_string = ""
    in_brackets = False
    for letter in s:
        if letter in "([":
            in_brackets = True
        elif letter in ")]":
            in_brackets = False
            continue
        if not in_brackets:
            new_string += letter
    return new_string.strip()

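# Illustrative calls; the expected outputs assume the corrected camelCase
# substitution and bracket stripping above.
clean_string("CamelCase")       # -> "Camel Case"
clean_string("name (dropped)")  # -> "name"
clean_string(None)              # -> None (missing value)
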
def lookup_grouped_by_feature(self, uri, limit=20):
    """
    The query used by the browseable interface, which groups its results
    by what 'feature' they describe of the queried node. A feature is
    defined by the relation, the queried node, and the direction (incoming
    or outgoing).
    """
    uri = remove_control_chars(uri)
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)

    def extract_feature(row):
        return tuple(row[:2])

    def feature_data(row):
        direction, _, data = row
        # Hacky way to figure out what the 'other' node is, the one that
        # (in most cases) didn't match the URI. If both start with our
        # given URI, take the longer one, which is either a more specific
        # sense or a different, longer word.
        shorter, longer = sorted([data['start'], data['end']], key=len)
        if shorter.startswith(uri):
            data['other'] = longer
        else:
            data['other'] = shorter
        return data

    cursor = self.connection.cursor()
    cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit})
    results = {}
    for feature, rows in itertools.groupby(cursor.fetchall(), extract_feature):
        results[feature] = [
            transform_for_linked_data(feature_data(row)) for row in rows
        ]
    return results

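# Sketch of the grouped result shape. The exact contents of each feature
# tuple depend on NODE_TO_FEATURE_QUERY, which is not shown here; based on
# feature_data's unpacking, a row looks like (direction, feature, data), so
# each key is the first two columns of a row. The class name is an assumption.
finder = AssertionFinder(dbname='conceptnet')
grouped = finder.lookup_grouped_by_feature('/c/en/dog')
for feature, edges in grouped.items():
    print(feature, len(edges))
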
def lookup_grouped_by_feature(self, uri, limit=20):
    """
    The query used by the browseable interface, which groups its results
    by what 'feature' they describe of the queried node. A feature is
    defined by the relation, the queried node, and the direction (incoming
    or outgoing).
    """
    uri = remove_control_chars(uri)

    def extract_feature(row):
        return tuple(row[:2])

    def feature_data(row):
        direction, _, data = row
        # Hacky way to figure out what the 'other' node is, the one that
        # (in most cases) didn't match the URI. If both start with our
        # given URI, take the longer one, which is either a more specific
        # sense or a different, longer word.
        shorter, longer = sorted([data['start'], data['end']], key=len)
        if shorter.startswith(uri):
            data['other'] = longer
        else:
            data['other'] = shorter
        return data

    cursor = self.connection.cursor()
    cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit})
    results = {}
    for feature, rows in itertools.groupby(cursor.fetchall(), extract_feature):
        results[feature] = [
            transform_for_linked_data(feature_data(row)) for row in rows
        ]
    return results

def render_safe(text):
    '''
    Make sure the given text is safe to pass to an external process.
    '''
    return remove_control_chars(remove_unsafe_private_use(text))

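# Usage sketch (illustrative): a raw ESC byte could let text inject terminal
# escape sequences into logs or a subprocess; remove_control_chars strips the
# ESC, leaving the now-inert remainder of the sequence.
print(render_safe("hello\x1b[2Jworld"))  # -> "hello[2Jworld"
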
def test_welsh_flag():
    # ftfy used to remove "tag characters", but they have been repurposed in the
    # "Flag of England", "Flag of Scotland", and "Flag of Wales" emoji sequences.
    text = (
        "This flag has a dragon on it "
        "\U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F"
    )
    assert remove_control_chars(text) == text