def twitterBOT(api, screen_name, min_RT_count, min_fav_count, hashtags, keywords):
    """Collect last-24h tweets per keyword, follow their authors, unfollow
    after 48 hours, and retweet/favorite sufficiently popular matches.

    Relies on module-level globals: ``c`` (DB cursor), ``conn`` (DB
    connection), plus ``time``, ``unidecode`` and ``in24Hours``.
    """
    alltweets = []  # all tweepy Tweets collected across keywords
    for keyword in keywords:
        firstTime = True
        while True:
            tweets_within_24_hours = []
            if firstTime:
                new_tweets = api.search(q=keyword)
            else:
                # NOTE(review): since_id returns tweets *newer* than `oldest`;
                # paging backwards through results normally uses max_id -- confirm.
                new_tweets = api.search(q=keyword, since_id=oldest)
            for t in new_tweets:
                if in24Hours(t.created_at):
                    tweets_within_24_hours.append(t)
                    print(unidecode(t.text))
            # save most recent tweets
            alltweets.extend(tweets_within_24_hours)
            if len(new_tweets) < 200:
                break
            firstTime = False
            oldest = new_tweets[-1].id

    for t in alltweets:
        t.user.follow()
        start = time.time()
        name = t.user.name
        userExists = 0
        try:
            # Parameterized query: the previous string-concatenated SQL was
            # injectable via the user's display name. Uses qmark paramstyle
            # (sqlite3) -- adjust the placeholder if the driver uses %s.
            c.execute(
                'select name, time_followed, following from twitter '
                'where name = ?', (str(name),))
            rows = c.fetchone()  # bug fix: was a bare fetchone() NameError
            if rows is not None:
                # Bug fix: only mark the user as known when a row actually
                # exists, so missing users fall through to the insert below.
                userExists = 1
                time_followed = float(rows[1])
                if start - time_followed > 172800:  # followed > 48h ago
                    t.user.unfollow()
        except Exception:
            # Best-effort: any lookup failure is treated as "user not recorded".
            pass
        if not userExists:
            c.execute(
                'insert into twitter (name, time_followed, following) '
                'values (?, ?, 1)', (unidecode(name), start))
            conn.commit()
        # Only engage with tweets mentioning one of the tracked hashtags.
        if any(word.lower() in t.text.lower() for word in hashtags):
            if int(t.retweet_count) > int(min_RT_count):
                print('int(t.retweet_count)', int(t.retweet_count))
                t.retweet()
            if int(t.favorite_count) > int(min_fav_count):
                print('int(t.favorite_count)', int(t.favorite_count))
                t.favorite()
def add_payment(self, payment):
    """Validate *payment* and add it to the SEPA document.

    @param payment: The payment dict
    @raise exception: when payment is invalid
    """
    # Validate the payment before touching the document.
    self.check_payment(payment)

    if self.clean:
        from text_unidecode import unidecode
        payment['name'] = unidecode(payment['name'])[:70]
        payment['description'] = unidecode(payment['description'])[:140]

    if not self._config['batch']:
        # Non-batch mode: build a dedicated payment-information node.
        PmtInf_nodes = self._create_PmtInf_node()
        PmtInf_nodes['PmtInfIdNode'].text = make_id(self._config['name'])
        PmtInf_nodes['PmtMtdNode'].text = "TRF"
        PmtInf_nodes['BtchBookgNode'].text = "false"
        PmtInf_nodes['NbOfTxsNode'].text = "1"
        PmtInf_nodes['CtrlSumNode'].text = int_to_decimal_str(payment['amount'])
        PmtInf_nodes['Cd_SvcLvl_Node'].text = "SEPA"

        if 'execution_date' in payment:
            PmtInf_nodes['ReqdExctnDtNode'].text = payment['execution_date']
        else:
            del PmtInf_nodes['ReqdExctnDtNode']

        PmtInf_nodes['Nm_Dbtr_Node'].text = self._config['name']
        PmtInf_nodes['IBAN_DbtrAcct_Node'].text = self._config['IBAN']
        if 'BIC' in self._config:
            PmtInf_nodes['BIC_DbtrAgt_Node'].text = self._config['BIC']
        PmtInf_nodes['ChrgBrNode'].text = "SLEV"

    # The creditor agent BIC is optional per payment.
    bic = 'BIC' in payment
    TX_nodes = self._create_TX_node(bic)
    TX_nodes['InstdAmtNode'].set("Ccy", self._config['currency'])
    TX_nodes['InstdAmtNode'].text = int_to_decimal_str(payment['amount'])
    TX_nodes['EndToEnd_PmtId_Node'].text = payment.get('endtoend_id', 'NOTPROVIDED')
    if bic:
        TX_nodes['BIC_CdtrAgt_Node'].text = payment['BIC']
    TX_nodes['Nm_Cdtr_Node'].text = payment['name']
    TX_nodes['IBAN_CdtrAcct_Node'].text = payment['IBAN']
    TX_nodes['UstrdNode'].text = payment['description']

    if self._config['batch']:
        self._add_batch(TX_nodes, payment)
    else:
        self._add_non_batch(TX_nodes, PmtInf_nodes)
def add_tags(self):
    """Rebuild this recipe's tag list from its name and ingredient names."""
    self.tags.clear()
    # ASCII-fold and lowercase every word of the recipe name...
    tag_names = {unidecode(word.lower()) for word in self.name.split()}
    # ...plus every ingredient name.
    tag_names.update(
        unidecode(quantity.ingredient.name.lower())
        for quantity in self.quantities)
    for tag_name in tag_names:
        # Reuse an existing Tag row when one exists, otherwise create it.
        existing = Tag.query.filter_by(name=tag_name).first()
        self.tags.append(existing if existing is not None else Tag(name=tag_name))
def verifica_estrutura_cabecalho(cls, cabecalho):
    """Check the uploaded header row against the expected model columns.

    Raises CargaAssociacaoException on the first mismatching column title;
    returns True when every column matches.
    """
    estrutura_correta = True
    for coluna, nome in cls.__CABECALHOS.items():
        # Compare ASCII-folded titles so accents don't cause false mismatches.
        if unidecode(cabecalho[coluna]) != unidecode(nome):
            msg_erro = (
                f'Título da coluna {coluna} errado. Encontrado "{cabecalho[coluna]}". '
                f'Deveria ser "{nome}". Confira o arquivo com o modelo.')
            raise CargaAssociacaoException(msg_erro)
    return estrutura_correta
def fingerprint(self):
    """Replace self.key with its generated fingerprint."""
    # Non-str keys are transliterated to plain ASCII before fingerprinting.
    source = self.key if isinstance(self.key, str) else unidecode(self.key)
    fp = fingerprints.generate(source)
    if TRACE_TEXT or TRACE_FP:
        logger_debug('Text.fingerprint:key: ', repr(self.key))
        logger_debug('Text.fingerprint:fp : ',
                     fingerprints.generate(unidecode(self.key)))
    self.key = fp
def getElement(elements, name):
    """Extract data from a publication page which has data in tables.

    Scans table rows for a <th> whose text matches *name* (both ASCII-folded)
    and returns the matching <td> value; for 'DOI:' the pdf link href is
    returned instead of the cell text. Returns None when nothing matches or
    a row is malformed.
    """
    try:
        target = unidecode(name).strip()  # hoisted: loop-invariant
        for row in elements:
            if unidecode(row.find('th').getText()).strip() == target:
                if name.strip() == 'DOI:':
                    # For DOI we need the pdf link, not the cell text.
                    return row.find('td').find('a')['href']
                # For all other elements the plain text is required.
                return row.find('td').getText().strip()
    except (AttributeError, KeyError, TypeError):
        # Bug fix: the previous bare `except:` swallowed *everything*,
        # including KeyboardInterrupt/SystemExit. Keep the best-effort
        # behaviour (malformed rows yield None) but only for lookup errors.
        pass
    return None
def cabecalho_correto(cls, cabecalho):
    """Validate the header row; log and record the first mismatch.

    Returns False (after appending the error to cls.logs) as soon as one
    column title is wrong, True when all titles match the model.
    """
    for coluna, nome in cls.__CABECALHOS.items():
        # ASCII-fold both sides so accent differences don't fail the check.
        if unidecode(cabecalho[coluna]) != unidecode(nome):
            msg_erro = (
                f'Título da coluna {coluna} errado. Encontrado "{cabecalho[coluna]}". '
                f'Deveria ser "{nome}". Confira o arquivo com o modelo.')
            logger.error(msg_erro)
            cls.logs = f"{cls.logs}\n{msg_erro}"
            return False
    return True
def test_recipe_add_tags(set_db):
    """Tags are derived from recipe-name words plus ingredient names."""
    # TODO warning here when we add "eau" for the second time,
    # when adding the quantity in add_ingredients. Try to fix that ?
    ingredients_eausel = [
        {"name": "eau", "quantity": 1, "unit": "L"},
        {"name": "sel", "quantity": 10, "unit": "g"},
    ]
    ingredients_eaupoivre = [
        {"name": "eau", "quantity": 2, "unit": "mL"},
        {"name": "poivre", "quantity": 10, "unit": "g"},
    ]

    # Create and persist both recipes with their ingredients and tags.
    recipe_eausel = Recipe(name="eau salée")
    recipe_eausel.add_ingredients(ingredients_eausel)
    recipe_eausel.add_tags()
    db.session.add(recipe_eausel)
    db.session.commit()

    recipe_eaupoivre = Recipe(name="eau poivrée")
    recipe_eaupoivre.add_ingredients(ingredients_eaupoivre)
    recipe_eaupoivre.add_tags()
    db.session.add(recipe_eaupoivre)
    db.session.commit()

    # Expected tags: every ingredient name plus every word of each recipe
    # name, ASCII-folded and lowercased.
    expected_tags = {
        unidecode(ingredient["name"]).lower()
        for ingredient in chain(ingredients_eausel, ingredients_eaupoivre)
    }
    for recipe in (recipe_eausel, recipe_eaupoivre):
        expected_tags.update(
            unidecode(word).lower() for word in recipe.name.split())

    assert expected_tags == {tag.name for tag in Tag.query}
    assert len(Tag.query.filter_by(name="eau").first().recipes) == 2
    assert len(Tag.query.filter_by(name="sel").first().recipes) == 1
    assert len(Tag.query.filter_by(name="salee").first().recipes) == 1
def parse(element_html, data):
    """Parse a submitted file upload, optionally ASCII-normalizing it,
    and append it to data['submitted_answers']['_files']."""
    element = lxml.html.fragment_fromstring(element_html)
    file_name = pl.get_string_attrib(element, 'file-name', '')
    answer_name = get_answer_name(file_name)
    normalize_to_ascii = pl.get_boolean_attrib(
        element, 'normalize-to-ascii', NORMALIZE_TO_ASCII_DEFAULT)

    # Get submitted answer or record a parse error if it does not exist.
    file_contents = data['submitted_answers'].get(answer_name, None)
    if not file_contents:
        add_format_error(data, 'No submitted answer for {0}'.format(file_name))
        return

    if normalize_to_ascii:
        try:
            decoded = base64.b64decode(file_contents).decode('utf-8')
            normalized = unidecode(decoded)
            file_contents = base64.b64encode(
                normalized.encode('UTF-8').strip()).decode()
            data['submitted_answers'][answer_name] = file_contents
        except UnicodeError:
            add_format_error(data, 'Submitted answer is not a valid UTF-8 string.')

    # Append to the shared _files list, creating it when absent.
    files = data['submitted_answers'].get('_files', None)
    entry = {'name': file_name, 'contents': file_contents}
    if files is None:
        data['submitted_answers']['_files'] = [entry]
    elif isinstance(files, list):
        files.append(entry)
    else:
        add_format_error(data, '_files was present but was not an array.')
def __init__(self, config, schema, clean=True):
    """Constructor: check the config, prepare the document, build the header.

    @param config: The config dict.
    @param schema: The SEPA schema to use.
    @param clean: When True, ASCII-fold and truncate the configured name.
    @raise exception: When the config file is invalid.
    """
    self._config = None            # Will contain the config file.
    self._xml = None               # Will contain the final XML file.
    self._batches = OrderedDict()  # Will contain the SEPA batches.
    # Total amount to debit per batch, for the checksum total.
    self._batch_totals = OrderedDict()
    self.schema = schema
    self.msg_id = make_msg_id()
    self.clean = clean

    if self.check_config(config):
        self._config = config
        if self.clean:
            from text_unidecode import unidecode
            self._config['name'] = unidecode(self._config['name'])[:70]
    self._prepare_document()
    self._create_header()
def save(self):
    """Create or update a Positions row from the serializer's validated data.

    Builds a slug from the Russian name, disambiguating with a timestamp on
    collision, and returns the saved model.
    """
    if self.position_id:
        try:
            position_model = Positions.objects.get(id=self.position_id)
        except Positions.DoesNotExist:
            raise serializers.ValidationError(
                {'category': ['position not found']})
    else:
        position_model = Positions()

    category_id = self.validated_data.get('category', position_model.category_id)
    name = self.validated_data.get('name', position_model.name)
    is_active = self.validated_data.get('is_active', position_model.is_active)

    # Slug from the ASCII-folded Russian name; add a timestamp on collision.
    slug = slugify(unidecode(name['ru']))
    if Positions.objects.filter(slug__iexact=slug).exists():
        slug = f"{slug}-{datetime.now().timestamp()}"

    position_model.category_id = category_id
    position_model.name = name
    position_model.is_active = is_active
    position_model.slug = slug
    position_model.save()
    try:
        position_model.clean()
    except Exception as e:
        print('error')
        print(str(e))
    return position_model
def save(self):
    """Create or update a Category row from the serializer's validated data.

    Builds a slug from the Russian name, disambiguating with a timestamp on
    collision, and returns the saved model.
    """
    if self.category_id:
        try:
            category_model = Category.objects.get(id=self.category_id)
        except Category.DoesNotExist:
            raise serializers.ValidationError(
                {'category': ['category found region']})
    else:
        category_model = Category()

    parent_id = self.validated_data.get('parent_id', category_model.parent_id)
    name = self.validated_data.get('name', category_model.name)
    is_active = self.validated_data.get('is_active', category_model.is_active)
    is_main = self.validated_data.get('is_main', category_model.is_main)
    sort_order = self.validated_data.get('sort_order', category_model.sort_order)

    # Slug from the ASCII-folded Russian name; add a timestamp on collision.
    slug = slugify(unidecode(name['ru']))
    if Category.objects.filter(slug__iexact=slug).exists():
        slug = f"{slug}-{datetime.now().timestamp()}"

    category_model.parent_id = parent_id
    category_model.name = name
    category_model.is_active = is_active
    category_model.is_main = is_main
    category_model.sort_order = sort_order
    category_model.slug = slug
    category_model.save()
    return category_model
def parse_namespace_repository(
    repository, library_namespace, include_tag=False, allow_library=True
):
    """Split an ASCII-folded 'namespace/repo[:tag]' reference.

    Returns (namespace, repository) or, when include_tag is True,
    (namespace, repository, tag) with 'latest' as the default tag.
    Raises ImplicitLibraryNamespaceNotAllowed when no namespace is given
    and falling back to the library namespace is disallowed.
    """
    repository = unidecode(repository)
    parts = repository.rstrip("/").split("/", 1)
    if len(parts) == 2:
        (namespace, repository) = parts
    else:
        # No explicit namespace: fall back to the library namespace.
        namespace = library_namespace
        repository = parts[0]
        if not allow_library:
            raise ImplicitLibraryNamespaceNotAllowed()

    tag = "latest"
    if include_tag:
        tag_parts = repository.split(":", 1)
        if len(tag_parts) == 2:
            (repository, tag) = tag_parts

    repository = urllib.parse.quote_plus(repository)
    if include_tag:
        return (namespace, repository, tag)
    return (namespace, repository)
def parse(element_html, data):
    """Parse a submitted string answer, optionally ASCII-normalizing it,
    and store it (or a format error) back into *data*."""
    element = lxml.html.fragment_fromstring(element_html)
    name = pl.get_string_attrib(element, 'answers-name')
    # Get allow-blank option
    allow_blank = pl.get_string_attrib(element, 'allow-blank', ALLOW_BLANK_DEFAULT)
    normalize_to_ascii = pl.get_boolean_attrib(
        element, 'normalize-to-ascii', NORMALIZE_TO_ASCII_DEFAULT)

    # Get submitted answer or record a parse error if it does not exist.
    submitted = data['submitted_answers'].get(name, None)
    if submitted is None:
        data['format_errors'][name] = 'No submitted answer.'
        data['submitted_answers'][name] = None
        return

    if normalize_to_ascii:
        submitted = unidecode(submitted)
        data['submitted_answers'][name] = submitted

    if not submitted and not allow_blank:
        data['format_errors'][name] = \
            'Invalid format. The submitted answer was left blank.'
        data['submitted_answers'][name] = None
    else:
        data['submitted_answers'][name] = pl.to_json(submitted)
def generate_valid_usernames(input_username):
    """Yield candidate usernames derived from *input_username*.

    The input is ASCII-folded, lowercased, stripped of invalid characters,
    squeezed and truncated to the allowed length; when the prefix is shorter
    than MIN_USERNAME_LENGTH, filler-suffixed variants of increasing length
    are yielded.

    @raise UnicodeDecodeError: when *input_username* is bytes that are not
        valid UTF-8.
    """
    if isinstance(input_username, bytes):
        try:
            input_username = input_username.decode("utf-8")
        except UnicodeDecodeError:
            # Bug fix: the previous code built a new UnicodeDecodeError with
            # the wrong constructor arguments (it requires encoding, object,
            # start, end, reason), which itself raised a TypeError and masked
            # the real problem. Re-raise the original error instead.
            raise

    normalized = unidecode(input_username).strip().lower()
    prefix = re.sub(INVALID_USERNAME_CHARACTERS, "_", normalized)[:MAX_USERNAME_LENGTH]
    # Collapse runs of underscores, then trim any leading/trailing one.
    prefix = re.sub(r"_{2,}", "_", prefix)
    prefix = prefix.strip("_")

    num_filler_chars = max(0, MIN_USERNAME_LENGTH - len(prefix))
    while num_filler_chars + len(prefix) <= MAX_USERNAME_LENGTH:
        for suffix in _gen_filler_chars(num_filler_chars):
            yield prefix + suffix
        num_filler_chars += 1
def apply_unidecode(self, to_convert: str):
    """Transliterate *to_convert* to ASCII, returning a TransductionGraph
    whose edges map input character positions to output positions."""
    if self.norm_form:
        to_convert = normalize(to_convert, self.norm_form)
    tg = TransductionGraph(to_convert)

    # Conversion is done character by character using unidecode.
    converted = [text_unidecode.unidecode(ch) for ch in to_convert]
    tg.output_string = "".join(converted)

    # Edges are calculated to follow the conversion step by step.
    if tg.output_string == "":
        # Some inputs get completely deleted by unidecode, in which case
        # there are no valid edges to output.
        tg.edges = []
        return tg

    edges = []
    in_pos, out_pos = 0, 0
    for chunk in converted:
        if chunk:
            # One input char may expand to several output chars.
            for _ in chunk:
                edges.append((in_pos, out_pos))
                out_pos += 1
        else:
            # Deleted character: anchor it to the previous output index.
            edges.append((in_pos, max(out_pos - 1, 0)))
        in_pos += 1
    tg.edges = edges
    return tg
def voc_corpus():
    """Construct the vocabulary based on most frequent opinion words in
    learning set.

    Parameters
    ----------

    Returns
    -------
    None
        The vocabulary is constructed.
    """
    from learning_class import learning
    learning()
    from learning_class import tokenized_learning_class

    # Flatten every tokenized document into one word list.
    words = []
    for tokens in tokenized_learning_class.values():
        words += tokens
    freq = nltk.FreqDist(words)

    # Words ordered by decreasing frequency.
    words_freq = [pair[0] for pair in freq.most_common()]
    for i in range(len(words_freq)):
        words_freq[i] = unidecode(words_freq[i])
        if pos_bool:
            list_senti_synsets = swn.senti_synsets(
                words_freq[i][:-2], words_freq[i][-1])
        else:
            list_senti_synsets = swn.senti_synsets(words_freq[i][:-2])
        if list_senti_synsets == []:
            continue
        if condition(list_senti_synsets):
            add_key(voc, words_freq[i])
            if voc_size == max_voc_size:
                break
def save(self, commit=True):
    """Slugify the ASCII-folded name, attach the parent if any, and save."""
    ascii_name = unidecode(self.instance.name)
    self.instance.slug = slugify(ascii_name)
    if self.parent_pk:
        self.instance.parent = get_object_or_404(Category, pk=self.parent_pk)
    super(CategoryForm, self).save(commit=commit)
    return self.instance
def get_absolute_url(self):
    """URL for this brand's product listing, with an ASCII slug path."""
    brand_slug = slugify(smart_text(unidecode(self.brand_name)))
    return reverse(
        'product:brand',
        kwargs={'path': brand_slug, 'brand_id': self.id},
    )
def get_crossword_string(topic):
    """Return *topic* upper-cased with '&' spelled out and word characters only."""
    ascii_topic = re.sub('&', 'AND', unidecode(topic)).upper()
    # [\W]+ drops everything except letters, digits and underscore.
    only_alphanumeric = re.compile('[\W]+')
    return only_alphanumeric.sub('', ascii_topic)
def sendSMS(request):
    """Send an SMS to one or more numbers via the user's gateway devices,
    logging each message, then render the send form."""
    form = SendMsgForm(request.POST or None)
    form2 = SaveMsgForm()
    gateway = SmsGateway()

    if form.is_valid():
        number = form.cleaned_data['phoneNumber']
        number_list = number.split(",")
        # ASCII-fold the message and replace apostrophes for the gateway.
        message3 = form.cleaned_data['message']
        message2 = unidecode(message3)
        message = message2.replace("'", " ")
        deviceID = request.POST.get('deviceID')
        for d_obj in device.objects.all():
            # Only this user's devices may send.
            if d_obj.user != request.user:
                continue
            for num in number_list:
                gateway.loginDetails(d_obj.accountEmail, d_obj.accountPassword)
                gateway.sendMessageToNumber(num, message, deviceID)
                # Record the sent message for this user.
                save_it = form2.save(commit=False)
                save_it.user = request.user
                save_it.sentTo = num
                save_it.msgText = message
                save_it.save()
        messages.success(request, 'Message Envoye')
        return redirect('/messages/0')

    # These locals are exposed to the template via locals() below.
    username = request.user.username
    device_obj = device.objects.all()
    contact_list = contacts.objects.filter(user=request.user).order_by('firstName')
    group_list = contactgroup.objects.filter(
        contact__user=request.user).distinct().order_by('groupName')
    template_list = msgTemplates.objects.filter(user=request.user).distinct()
    context = {"form": form}
    template = "sendsms.html"
    pg = ['active', '', '']
    return render_to_response(
        template, locals(), context_instance=RequestContext(request))
def norm_names(x):
    """ASCII-normalize a name; float values (e.g. NaN) pass through untouched."""
    if isinstance(x, float):
        return x
    ascii_text = unidecode.unidecode(x)
    # NFKD-decompose, then strip anything that still isn't ASCII.
    return unicodedata.normalize('NFKD', ascii_text).encode('ASCII', 'ignore').decode()
def slugify_text(text):
    """Slugify *text*: decode entities, transliterate to ASCII, lowercase,
    and collapse everything else to the default separator."""
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')
    # Replace quotes with the separator before transliterating.
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
    text = unidecode(text)
    # Resolve HTML character-entity references (&amp; etc.).
    text = CHAR_ENTITY_PATTERN.sub(
        lambda m: unichr(name2codepoint[m.group(1)]), text)
    # Resolve decimal and hexadecimal numeric references, best-effort.
    try:
        text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text)
    except Exception:
        pass
    try:
        text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text)
    except Exception:
        pass
    text = unicodedata.normalize('NFKD', text).lower()
    # Remove quotes generated above, replace disallowed chars, squeeze dashes.
    text = QUOTE_PATTERN.sub('', text)
    text = re.sub(ALLOWED_CHARS_PATTERN, DEFAULT_SEPARATOR, text)
    return DUPLICATE_DASH_PATTERN.sub(
        DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)
def _message_to_tag_value(message, allowed_chars=string.ascii_lowercase + string.digits + '_'):
    """
    Turn a long user-facing error message into a short slug that can be used as a datadog tag value

    passes through unidecode to get something ascii-compatible to work with,
    then uses the first four space-delimited words and filters out unwanted characters.

    >>> _message_to_tag_value('Sorry, an error occurred while processing that request.')
    'sorry_an_error_occurred'
    >>> _message_to_tag_value('Another process prevented us from servicing your request. Please try again later.')
    'another_process_prevented_us'
    >>> _message_to_tag_value('509 Unknown Status Code')
    '509_unknown_status_code'
    >>> _message_to_tag_value(
    ...     'EntityScreen EntityScreen [Detail=org.commcare.suite.model.Detail@1f984e3c, '
    ...     'selection=null] could not select case 8854f3583f6f46e69af59fddc9f9428d. '
    ...     'If this error persists please report a bug to CommCareHQ.')
    'entityscreen_entityscreen_detail_org'
    """
    ascii_message = unidecode(message).lower()
    # Map every disallowed character to a space so it splits words.
    filtered = ''.join(c if c in allowed_chars else ' ' for c in ascii_message)
    # Keep the first four words, joined with underscores, capped at 59 chars.
    tag = '_'.join(re.split(r' +', filtered)[:4])
    return tag[:59]
def _format_account_name(self, name):
    """Normalize an account name and map it through the configured aliases."""
    # Apply the cleanup regexes in order, then transliterate to ASCII.
    for pattern, replacement in (
        (self.NAME_FORBIDDEN_CHARS_REGEX, '-'),
        (self.DASH_COLON_DASH_REGEX, ':'),
        (self.DASH_EOL_REGEX, ''),
        (self.DASH_DASH_REGEX, '-'),
    ):
        name = pattern.sub(replacement, name)
    name = unidecode(name)
    # Allow explicit overrides from the configured accounts mapping.
    return config.BC_ACCOUNTS_DICT.get(name, name)
def build_form_multimedia_zip(
    domain,
    export_id,
    datespan,
    user_types,
    download_id,
    owner_id,
):
    """Build and expose a zip of all multimedia attachments for a form
    export, updating download progress along the way."""
    from corehq.apps.export.models import FormExportInstance

    export = FormExportInstance.get(export_id)
    form_ids = get_form_ids_having_multimedia(
        domain, export.app_id, export.xmlns, datespan, user_types
    )
    forms_info = _get_form_attachment_info(domain, form_ids, export)
    num_forms = len(forms_info)
    DownloadBase.set_progress(build_form_multimedia_zip, 0, num_forms)

    # Union of all referenced case ids (empty when there are no forms).
    if forms_info:
        all_case_ids = set.union(*(info['case_ids'] for info in forms_info))
    else:
        all_case_ids = set()
    case_id_to_name = _get_case_names(domain, all_case_ids)

    with TransientTempfile() as temp_path:
        with open(temp_path, 'wb') as f:
            _write_attachments_to_file(
                temp_path, num_forms, forms_info, case_id_to_name)
        with open(temp_path, 'rb') as f:
            zip_name = 'multimedia-{}'.format(unidecode(export.name))
            _save_and_expose_zip(f, zip_name, domain, download_id, owner_id)

    DownloadBase.set_progress(build_form_multimedia_zip, num_forms, num_forms)
def save(self):
    """Create or update a Schedules row from the serializer's validated data.

    The alias is slugified; on creation only, a colliding alias gets a
    timestamp suffix. Returns the saved model.
    """
    if self.schedule_id:
        try:
            schedule_model = Schedules.objects.get(id=self.schedule_id)
        except Schedules.DoesNotExist:
            raise serializers.ValidationError(
                {'schedule': ['schedule not found']})
    else:
        schedule_model = Schedules()

    name = self.validated_data.get('name', schedule_model.name)
    is_active = self.validated_data.get('is_active', schedule_model.is_active)
    slug = slugify(unidecode(
        self.validated_data.get("alias", schedule_model.alias)))

    # Only disambiguate on creation; updates keep their alias.
    if not self.schedule_id:
        if Schedules.objects.filter(alias__iexact=slug).exists():
            slug = f"{slug}-{datetime.now().timestamp()}"

    schedule_model.name = name
    schedule_model.alias = slug
    schedule_model.is_active = is_active
    schedule_model.save()
    try:
        schedule_model.clean()
    except Exception as e:
        print('error')
        print(str(e))
    return schedule_model
def toascii(s, translit=False):
    """
    Convert a Unicode or byte string to ASCII characters, including replacing
    accented characters with their non-accented equivalent.

    If `translit` is False use the Unicode NFKD equivalence. If `translit` is
    True, use a transliteration with the unidecode library.

    Non ISO-Latin and non ASCII characters are stripped from the output. When
    no transliteration is possible, the resulting character is replaced by an
    underscore "_".

    For Unicode NFKD equivalence, see
    http://en.wikipedia.org/wiki/Unicode_equivalence

    The conversion may NOT preserve the original string length and with NFKD
    some characters may be deleted.

    Inspired from: http://code.activestate.com/recipes/251871/#c10 by Aaron
    Bentley.
    """
    if not isinstance(s, compat.unicode):
        s = as_unicode(s)
    converted = unidecode(s) if translit else unicodedata.normalize('NFKD', s)
    # unidecode marks untransliterable characters as '[?]'.
    converted = converted.replace('[?]', '_')
    return converted.encode('ascii', 'ignore').decode('ascii')
def getSKUData(driver, prod, queue):
    """Scrape detail-table rows and the sales rank for one Amazon product.

    @param driver: selenium webdriver used to load the product page
    @param prod: dict with asin/image/title/price_new/num_reviews keys
    @param queue: result queue (unused in the visible portion of the code)
    """
    data = []
    asin = prod['asin']
    image = prod['image']
    title = prod['title']
    price = prod['price_new'].replace(',', '.')
    num_reviews = prod['num_reviews']
    url = 'http://www.amazon.com/dp/' + unidecode(asin)
    driver.get(url)

    # Amazon renders product details under several possible element ids;
    # collect rows from whichever sections are present on this page.
    try:
        product_details = driver.find_element_by_id(
            'productDetails_techSpec_section_1').find_elements_by_tag_name('tr')
    except Exception:
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt.
        product_details = []
    for section_id in ('productDetails_detailBullets_sections1',
                       'productDetails_techSpec_section_2',
                       'productDetails_feature_div'):
        try:
            product_details += driver.find_element_by_id(
                section_id).find_elements_by_tag_name('tr')
        except Exception:
            pass

    salesRank = 'NA'
    # Try to get the sales rank from the scraped table, else fall back to the
    # product API. Does not work sometimes; needs to be more robust.
    try:
        salesRankElem = getElement(product_details, 'Best Sellers Rank').strip()
    except Exception:
        # Fix: was the Python 2-only `except Exception, e:` syntax (the bound
        # exception was never used); now valid Python 3.
        try:
            product = amazon.lookup(ItemId=unidecode(asin))
            salesRank = product.sales_rank
        except Exception:
            salesRank = 'NA'
def save(self, commit=True):
    """Save with an ASCII slug; queue thumbnailing when the bg image changed."""
    self.instance.slug = slugify(unidecode(self.instance.name))
    saved = super().save(commit=commit)
    if saved.pk and 'background_image' in self.changed_data:
        # Thumbnail generation is offloaded to a background task.
        create_collection_background_image_thumbnails.delay(saved.pk)
    return saved
def main(args):
    """
    Usage: unidecode [<input_file> [<output_file>]]
    """
    source = read_file(args["<input_file>"])
    write_file(args["<output_file>"], text_unidecode.unidecode(source))
def clean_table_name(domain, readable_name):
    """
    Slugifies and truncates readable name to make a valid configurable
    report table name.
    """
    words = unidecode(readable_name).lower().split(' ')
    name_slug = '_'.join(words)
    # 63 = max postgres table name, 24 = table name prefix + hash overhead
    max_length = 63 - len(domain) - 24
    return name_slug[:max_length]
def _normalize_char(cls, char):
    """Expand *char* into Character objects, either ASCII-folded or
    NFC-normalized with separator characters mapped to spaces."""
    if cls._ASCIIZE:
        # Transliterate to ASCII; one input char may expand to several.
        return [character.Character(c, char.attr)
                for c in text_unidecode.unidecode(str(char))]
    result = []
    for c in unicodedata.normalize("NFC", str(char)):
        if unicodedata.category(c).startswith("Z"):
            # Any Unicode separator (category Z*) becomes a plain space.
            result.append(character.Character(' ', char.attr))
        else:
            result.append(character.Character(c, char.attr))
    return result
def make_list_better(list_item):
    """
    The unicode stuff makes me crazy. This fixes it
    """
    print(list_item[0][0])
    better = unidecode(list_item)
    print(list_item[0])
    print(better)
    return better
def save(self, commit=True):
    """Save the category with an ASCII slug, inheriting hidden state and
    propagating it to descendants."""
    self.instance.slug = slugify(unidecode(self.instance.name))
    if self.parent_pk:
        self.instance.parent = get_object_or_404(
            Category, pk=self.parent_pk)
        # A hidden parent forces this category to be hidden too.
        if self.instance.parent and self.instance.parent.hidden:
            self.instance.hidden = True
    super(CategoryForm, self).save(commit=commit)
    self.instance.set_hidden_descendants(self.cleaned_data['hidden'])
    return self.instance
def test_ascii_rank(self):
    """Ranks of ASCII-folded strings must sort the same way as the strings."""
    from text_unidecode import unidecode
    strings = [u"a", u"az", u"aaaa", u"azzz", u"zaaa", u"jazz", u"ball",
               u"a ball", u"łukąźć", u"ołówek", u"♧"]
    ranks = [get_ascii_string_rank(s) for s in strings]
    # Ordering the ranks should result in the same order as the strings.
    ranks_of_sorted = [get_ascii_string_rank(s)
                       for s in sorted([unidecode(s) for s in strings])]
    self.assertEqual(ranks_of_sorted, sorted(ranks))
def save(self, commit=True):
    """Save the category with an ASCII slug and optional parent; queue
    thumbnailing when the background image changed."""
    self.instance.slug = slugify(unidecode(self.instance.name))
    if self.parent_pk:
        self.instance.parent = get_object_or_404(Category, pk=self.parent_pk)
    saved = super().save(commit=commit)
    if saved.pk and "background_image" in self.changed_data:
        # Thumbnail generation is offloaded to a background task.
        create_category_background_image_thumbnails.delay(saved.pk)
    return saved
def get_ascii_string_rank(string, max_digits=9):
    """Convert a string into a number such that when the numbers are sorted
    they maintain the lexicographic sort order of the words they represent.

    The number of characters in the string for which lexicographic order will
    be maintained depends on max_digits. For the default of 9, the number of
    chars that the order is maintained for is 5.

    Unfortunately this basically means:

    >>> get_ascii_string_rank("Python") == get_ascii_string_rank("Pythonic")
    True

    when obviously it'd be better if the rank for "Pythonic" was > than the
    rank for "Python" since "Pythonic" is alphabetically after "Python".
    """
    # Smallest ordinal value we take into account
    smallest_ord = ord(u"A")
    # Ord value to use for punctuation - we define punctuation as ordering
    # after all letters in the alphabet
    punctuation_ord = smallest_ord - 1
    # Offset to normalize the actual ord value by. 11 is taken off because
    # otherwise the values for words starting with 'A' would start with '00'
    # which would be ignored when cast to an int
    offset = smallest_ord - 11

    def get_ord(c):
        # Normalized ordinal: letters keep their ord, anything else ranks
        # as punctuation.
        return (ord(c) if c.isalpha() else punctuation_ord) - offset

    # Padding for the string if it's shorter than `max_digits`
    padding = chr(punctuation_ord) * max_digits

    if HAS_UNIDECODE:
        # And parse it with unidecode to get rid of non-ascii characters
        string = unidecode(string)
    else:
        logging.warning(
            'text_unidecode package not found. If a string with non-ascii chars '
            'is used for a document rank it may result in unexpected ordering'
        )

    # Two-digit zero-filled ordinals, concatenated and capped at max_digits.
    joined = "".join(str(get_ord(c)).zfill(2) for c in (string + padding))
    return int(joined[:max_digits])
def slugify(text, delim=u'-'):
    """Generates an ASCII-only slug

    :param text: text to be translated into a slug
    :type text: unicode
    :param delim: delimiter that replace any punctuations and whitespaces
    :type delim: unicode
    :returns: an URL-safe, ASCII-only slug
    :rtype: unicode

    .. seealso:: http://flask.pocoo.org/snippets/5/
    """
    if not isinstance(text, unicode):
        raise TypeError('text should be an unicode, not {0}'.format(text))
    # Split on punctuation, transliterate each piece, and re-split on
    # whitespace so the delimiter joins individual words.
    pieces = []
    for word in _punct_re.split(text.lower()):
        pieces.extend(unidecode(word).split())
    return unicode(delim.join(pieces))
def convert_unicode_punctuation(self, word):
    """Replace unicode punctuation in *word* with its ASCII transliteration;
    any character that doesn't decode to pure punctuation is kept as-is."""
    out_chars = []
    for original in word:
        decoded = unidecode(original).lower()
        if len(decoded) == 0:
            # Cannot decode to anything reasonable: keep the original char.
            out_chars.append(original)
        else:
            # Only use the unidecoded version when it is entirely made of
            # allowed punctuation characters.
            allowed_punct = punct_word(
                decoded,
                punctuation=ALLOWED_CONVERTED_UNICODE_PUNCTUATION)
            out_chars.append(decoded if allowed_punct else original)
    return ''.join(out_chars)
def save(self, commit=True):
    """Slugify the ASCII-folded name before delegating to the parent save."""
    ascii_name = unidecode(self.instance.name)
    self.instance.slug = slugify(ascii_name)
    return super().save(commit=commit)
def test_7bit_text_purity():
    """unidecode must leave the full 7-bit ASCII range unchanged."""
    ascii_text = "".join(map(chr, range(128)))
    assert unidecode(ascii_text) == ascii_text
def test_7bit_purity(code):
    """A single 7-bit code point must transliterate to itself."""
    original = chr(code)
    assert unidecode(original) == original
def test_transliterate(text, result):
    """Parametrized check: unidecode(text) equals the expected transliteration."""
    transliterated = unidecode(text)
    assert transliterated == result
def save(self, commit=True):
    """Save the category with an ASCII slug and optional parent."""
    self.instance.slug = slugify(unidecode(self.instance.name))
    if self.parent_pk:
        self.instance.parent = get_object_or_404(
            Category, pk=self.parent_pk)
    return super().save(commit=commit)
def get_slug(self):
    """ASCII slug derived from the instance name."""
    ascii_name = smart_text(unidecode(self.name))
    return slugify(ascii_name)
def _to_ascii(self, string):
    """Apply the configured replacement pairs, then transliterate to ASCII."""
    for search, replace in self.replacements:
        string = string.replace(search, replace)
    return unidecode(string)
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0,
            word_boundary=False, separator=DEFAULT_SEPARATOR, save_order=False,
            stopwords=(), regex_pattern=None, lowercase=True, replacements=()):
    """
    Make a slug from the given text.
    :param text (str): initial text
    :param entities (bool):
    :param decimal (bool):
    :param hexadecimal (bool):
    :param max_length (int): output string length
    :param word_boundary (bool):
    :param save_order (bool): if parameter is True and max_length > 0 return
        whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for allowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules
        e.g. [['|', 'or'], ['%', 'percent']]
    :return (str):
    """
    # user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # ensure text is unicode
    if not isinstance(text, _unicode_type):
        text = _unicode(text, 'utf-8', 'ignore')

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # decode unicode
    text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, _unicode_type):
        text = _unicode(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(
            lambda m: unichr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        try:
            text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text)
        except Exception:
            pass

    # hexadecimal character reference
    if hexadecimal:
        try:
            text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text)
        except Exception:
            pass

    # translate
    text = unicodedata.normalize('NFKD', text)
    if sys.version_info < (3,):
        text = text.encode('ascii', 'ignore')

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if lowercase:
        pattern = regex_pattern or ALLOWED_CHARS_PATTERN
    else:
        pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE
    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant separators
    text = DUPLICATE_DASH_PATTERN.sub(
        DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        if lowercase:
            stopwords_lower = [s.lower() for s in stopwords]
            words = [w for w in text.split(DEFAULT_SEPARATOR)
                     if w not in stopwords_lower]
        else:
            words = [w for w in text.split(DEFAULT_SEPARATOR)
                     if w not in stopwords]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(
            text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text