def scrape_movie_data_to_dic(self, request):
    """Parse a movie page (a requests.get() response) into a dict.

    Extracts the title, every row of the ``movie_finances`` table and the
    "key: value" rows of the second summary table, ASCII-folding all text
    via NFKD normalization.

    :param request: response object whose ``.text`` is the page HTML
    :return: dict mapping section labels to extracted values
    """
    soup = BeautifulSoup(request.text)
    dic = {}
    # Movie title, folded to plain ASCII.
    value = soup.find("h1", {"itemprop": "name"}).text
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    dic['Movie Title'] = value
    # Finance table: each data cell's label sits two siblings back.
    finances = soup.find("table", {"id": "movie_finances"})
    for item in finances.findAll("td", {"class": "data"}):
        key = item.previousSibling.previousSibling.text
        key = unicodedata.normalize('NFKD', key).encode('ascii', 'ignore')
        # BUG FIX: the ASCII-folded value was computed but then discarded
        # (the raw item.text was stored instead). Store the folded value.
        value = unicodedata.normalize('NFKD', item.text).encode('ascii', 'ignore')
        dic[key] = value
    # Summary section: "key: value" rows in the second table.
    data = soup.find("div", {"id": "summary"}).findAll('table')[1]
    for item in data.findAll('tr'):
        # Fold to ASCII, drop newlines, then split into key and value.
        s = unicodedata.normalize('NFKD', item.text).encode('ascii', 'ignore')
        s = s.replace("\n", '').split(':')
        dic[s[0]] = [s[1]]
    return dic
def parseFileName(name):
    """Clean a raw file/release name into a lower-case ASCII string.

    Bracketed sections ([..], {..}, (..)) are dropped, stray punctuation
    trimmed, tabs replaced, and the text ASCII-folded via NFKD. Returns
    "unknown" when encoding fails entirely, and " " for an empty result.
    """
    # Drop any content enclosed in brackets/braces/parens (e.g. tags).
    nameString = dropInsideContent(name,"[","]" )
    nameString = dropInsideContent(nameString,"{","}" )
    nameString = dropInsideContent(nameString,"(",")" )
    # Trim leftover punctuation and surrounding blanks.
    nameString = nameString.strip('()_{}[]!@#$^&*+=|\\/"\'?<>~`')
    nameString = nameString.lstrip(' ')
    nameString = nameString.rstrip(' ')
    # NOTE(review): this second {} pass looks redundant after the first
    # dropInsideContent above — confirm whether it is intentional.
    nameString = dropInsideContent(nameString,"{","}" )
    nameString = nameString.lower()
    # Python 2 string-module calls: replace tabs / collapse spaces.
    nameString = string.replace(nameString,"\t"," ")
    nameString = string.replace(nameString," "," ")
    try:
        # ASCII-fold accents; the bare .encode() assumes the normalized
        # text is ASCII-safe and raises otherwise, triggering the fallback.
        nameString = unicodedata.normalize('NFKD',nameString).encode()
        nameString = nameString.encode()
    except:
        try:
            # Fallback: force latin-1, then retry the NFKD/ASCII fold.
            nameString = nameString.encode('latin-1', 'ignore')
            nameString = unicodedata.normalize('NFKD',nameString).encode("ascii")
            nameString = str(nameString)
        except:
            # Give up: the caller receives a sentinel name.
            nameString = "unknown"
    if len(nameString)==0:
        nameString=" "
    return nameString
def get_completions(self, document, complete_event):
    """Yield prompt_toolkit ``Completion`` objects from the IPython completer.

    Each match is NFC-normalized; when a completion starts with a
    zero-width (decomposed combining) character, it is re-composed with
    the character preceding the cursor so it renders correctly.
    """
    # Nothing to complete on a blank line.
    if not document.current_line.strip():
        return
    used, matches = self.ipy_completer.complete(
        line_buffer=document.current_line,
        cursor_pos=document.cursor_position_col
    )
    # Completions replace the text the completer reported as "used".
    start_pos = -len(used)
    for m in matches:
        m = unicodedata.normalize('NFC', m)
        # When the first character of the completion has a zero length,
        # then it's probably a decomposed unicode character. E.g. caused by
        # the "\dot" completion. Try to compose again with the previous
        # character.
        if wcwidth(m[0]) == 0:
            if document.cursor_position + start_pos > 0:
                char_before = document.text[document.cursor_position + start_pos - 1]
                m = unicodedata.normalize('NFC', char_before + m)
                # Yield the modified completion instead, if this worked.
                if wcwidth(m[0:1]) == 1:
                    yield Completion(m, start_position=start_pos - 1)
                    continue
        yield Completion(m, start_position=start_pos)
def get_cast_crew(self, url):
    """Scrape the cast/crew tables of a movie page into a dict.

    Returns a dict keyed by the section headings (initially u'Cast' and
    u'Production and Technical Credits', replaced by each section's own
    <h1> text when found). Each value is a JSON string mapping a position
    to the list of names holding it, or np.nan when the section is absent.
    Python 2 code (xrange).
    """
    request = get_file(url)
    soup = BeautifulSoup(request.text)
    main_dic = {}
    lst = [u'Cast', u'Production and Technical Credits']
    for i in xrange(len(lst)):
        # Default to NaN so a missing section still produces a key.
        main_dic[lst[i]] = np.nan
        dic = {}
        try:
            # Use the section's own heading as the output key.
            lst[i] = soup.findAll('div', {'id': 'cast'})[i].find('h1').text
            for row in soup.findAll('div', {'id': 'cast'})[i].findAll('tr'):
                # Each row is (position, spacer, name); ASCII-fold both ends.
                position, filler, name = row.findAll('td')
                position = unicodedata.normalize('NFKD', position.text).encode('ascii', 'ignore')
                name = unicodedata.normalize('NFKD', name.text).encode('ascii', 'ignore')
                # Accumulate every name seen for the same position.
                if position in dic:
                    dic[position] += [name]
                else:
                    dic[position] = [name]
            dic = json.dumps(dic)
        except:
            # NOTE(review): bare except hides all scraping errors — a
            # section silently becomes NaN on any failure.
            dic = np.nan
        main_dic[lst[i]] = dic
    return main_dic
def find_all_translations(soup):
    """Scrape Tagalog/English pairs from a word-list page, append them to
    translations.txt, then recurse into the next pagination page.

    Each entry is written as: the Tagalog word, then the list of English
    glosses, then a blank line (all ASCII-folded). Only single-word
    Tagalog entries whose gloss node differs are kept.
    """
    file_string = ''
    for word_data in soup.find_all("td", class_="list-title"):
        part_link = word_data.find("a")['href']
        full_link = domain + part_link
        soup2 = getSoup(full_link)
        translations = soup2.find("article", class_="item-page").find_all(style="text-align: center;")
        for translation in translations:
            tagalog = translation.find(['b', 'strong'])
            new_line = translation.find('br')
            # The English gloss follows the first <br>, when present.
            if new_line:
                english = new_line.next_sibling
            else:
                english = None
            if tagalog and english and tagalog.string and english.string is not None:
                # Keep single-word entries whose gloss is a distinct node.
                if ' ' not in tagalog.string.strip() and tagalog.string is not english.string:
                    file_string += unicodedata.normalize('NFD', tagalog.string.strip()).encode('ascii', 'ignore').decode("utf-8") + "\n"
                    file_string += unicodedata.normalize('NFD', str([word.strip() for word in english.string.strip().split(',')])).encode('ascii', 'ignore').decode("utf-8") + "\n"
                    file_string += "\n"
    # FIX: use a context manager so the handle is closed even if the
    # write raises (the original relied on an unprotected f.close()).
    with open('translations.txt', 'a') as f:
        f.write(file_string)
    next_page_link = soup.find('li', class_='pagination-next').find('a')['href']
    print('Parsing %s...' % (domain + next_page_link))
    # NOTE: recursion depth grows with the page count; an iterative loop
    # would avoid the interpreter recursion limit on very long lists.
    find_all_translations(getSoup(domain + next_page_link))
def getPerson(num, file):
    """Interactively look up a person in the semicolon-separated `file`.

    Prompts for a name until the accent-folded keywords match at least one
    line: every input word must appear in the line's name field, and the
    birth field must exist without a '?'. When several lines match (up to
    SEARCH_LIMIT), a curses selector lets the user pick one. Returns the
    selected line, stripped.
    """
    res1 = list()
    os.system('clear')
    while len(res1) == 0:
        x = input(str(num) + '. name : ')
        # NOTE(review): without file.seek(0, 0) a second loop iteration
        # reads from EOF and finds nothing — confirm the caller rewinds.
        # file.seek(0, 0)
        for line in file.readlines():
            # ASCII-fold both the query and the name field for matching.
            input_normalized = unicodedata.normalize('NFKD', x.strip().lower()).encode('ascii','ignore').decode('ascii')
            line_normalized = unicodedata.normalize('NFKD', line.split(';')[0].lower()).encode('ascii','ignore').decode('ascii')
            birth = line.split(';')[1]
            # Every word of the query must occur in the name field.
            full_match = True
            for input_word in input_normalized.split(' '):
                if input_word not in line_normalized:
                    full_match = False
            if full_match and input_normalized.strip() and birth and '?' not in birth:
                res1.append(line.strip())
        os.system('clear')
        if len(res1) > SEARCH_LIMIT:
            # limit number of found entries for easier item selection
            res1 = list()
            print('Please enter more specific keyword')
        elif len(res1) == 0:
            print('No matching entry found')
    sel = 0
    while len(res1) > 1:
        c = getsel(res1, sel)  # get user action - selection or cursor move
        if c == curses.KEY_DOWN and sel < (len(res1)-1):
            sel += 1
        elif c == curses.KEY_UP and sel > 0:
            sel -= 1
        elif c == curses.KEY_ENTER or c == 13:
            break
    return res1[sel]  # return selected item
def crawler():
    """Crawl IMDb from one seed title page, writing up to 200 movies rated
    in (6.5, 8.5] to data.csv and enqueuing each page's "rec-title"
    recommendation links as further pages to visit.
    """
    arr=["http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=07XG6QFJZEE6BBVY6J2Z&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1"]
    fp = open('data.csv', "w")
    a = csv.writer(fp, delimiter=',', quotechar="$")
    visited = []
    c = 0
    while c < 200:
        # NOTE(review): pop() raises IndexError if the frontier empties
        # before 200 qualifying movies are found — confirm acceptable.
        page = arr.pop()
        if page not in visited:
            r = requests.get(page)
            soup = bs4.BeautifulSoup(r.text)
            # ASCII-fold the rating text before parsing it as a float.
            rate = unicodedata.normalize('NFKD', soup.find("span", attrs={"itemprop": "ratingValue"}).string).encode('ascii', 'ignore')
            n = float(rate)
            if n > 6.5 and n <= 8.5:
                c = c+1
                name = unicodedata.normalize('NFKD', soup.find("h1", attrs={"itemprop": "name"}).text).encode('ascii', 'ignore')
                year = soup.find(attrs={"id": "titleYear"}).text
                # NOTE(review): the FIRST span[itemprop=name] on the page
                # may not be the director — verify this selector.
                director = unicodedata.normalize('NFKD', soup.find("span", attrs={"itemprop": "name"}).string).encode('ascii', 'ignore')
                print([c, name, year, director, n])
                a.writerow([c, name, year, director, n])
            # Enqueue recommendation links (absolutized, de-duplicated).
            divs = soup.find_all('div', attrs={"class": "rec-title"})
            links = [div.find('a')['href'] for div in divs]
            links = [urljoin(page, link) for link in links]
            arr = list(set(arr) | set(links))
            visited.append(page)
    fp.close()
def mnemonic_to_seed(self, mnemonic, passphrase):
    """Derive a 64-byte BIP-39 seed from a mnemonic and passphrase.

    Both inputs are NFKD-normalized and extra whitespace in the mnemonic
    is collapsed, then the seed is
    PBKDF2-HMAC-SHA512(mnemonic, "mnemonic" + passphrase, 2048 rounds).
    """
    # trezor uses bip39
    import pbkdf2, hashlib, hmac
    PBKDF2_ROUNDS = 2048
    # Collapse runs of whitespace, then NFKD-normalize, as BIP-39 requires.
    mnemonic = unicodedata.normalize('NFKD', ' '.join(mnemonic.split()))
    passphrase = unicodedata.normalize('NFKD', passphrase)
    # Salt is the literal string "mnemonic" plus the (normalized) passphrase.
    return pbkdf2.PBKDF2(mnemonic, 'mnemonic' + passphrase, iterations = PBKDF2_ROUNDS, macmodule = hmac, digestmodule = hashlib.sha512).read(64)
def test_greek_print_ipa(self):
    """Test the Word class's `_print_ipa` in Greek."""
    w = grc.Word("élipe", grc.GREEK["Attic"]["Probert"])
    # _print_ipa(True) and _print_ipa(False) differ in syllable separators.
    output = [w._print_ipa(True), w._print_ipa(False)]
    # Compare NFC-normalized so composed/decomposed accents match.
    target = [unicodedata.normalize('NFC', "é.li.pe"), unicodedata.normalize('NFC', "élipe")]
    self.assertEqual(output, target)
def output(index): i = 0 totalList = [] while i < index: totalList.append(addr[i].total) i += 1 totalList.sort() maxTotal = totalList[index-1] line = 0 i = 0 while i < index: if addr[i].total == maxTotal: line += 1 i += 1 i = 0 l = 0 print "\"", while i < index: if addr[i].total == maxTotal: if l < line-1: print u"\b%s, 最高成交價:%d, 最低成交價:%d" %(unicodedata.normalize('NFKD', addr[i].road), addr[i].maxP, addr[i].minP) l += 1 else: print u"%s, 最高成交價:%d, 最低成交價:%d\"" %(unicodedata.normalize('NFKD', addr[i].road), addr[i].maxP, addr[i].minP) i += 1
def ok_to_send(day_start, day_end):
    """Return True when the current time falls inside the daytime window.

    :param day_start: "HH:MM" window start (unicode, ASCII-folded here)
    :param day_end:   "HH:MM" window end
    check_time() classifies `now` against the window; sending is allowed
    exactly when the classification matched and the period is DAY.
    """
    now = datetime.datetime.now().time()
    # ASCII-fold the unicode "HH:MM" strings before splitting
    # (Python 2: encode() yields a str that str.split accepts).
    dstart = str.split(
        unicodedata.normalize('NFKD', day_start).encode('ascii', 'ignore'), ":")
    dend = str.split(
        unicodedata.normalize('NFKD', day_end).encode('ascii', 'ignore'), ":")
    on_time = datetime.time(int(dstart[0]), int(dstart[1]))
    off_time = datetime.time(int(dend[0]), int(dend[1]))
    when, matching = check_time(now, on_time, off_time)
    # FIX: dropped the dead `should_I_send` variable and collapsed the
    # nested conditionals — every path other than (matching and DAY)
    # returned False in the original.
    return bool(matching and when == DAY)
def test_names(self, data, time_locale):
    """Verify Timestamp.day_name()/month_name() honour `time_locale`,
    that the deprecated .weekday_name still works, and that NaT returns
    NaN from both accessors."""
    # GH 17354
    # Test .weekday_name, .day_name(), .month_name
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # .weekday_name is deprecated; expect the FutureWarning.
        assert data.weekday_name == 'Monday'
    if time_locale is None:
        expected_day = 'Monday'
        expected_month = 'August'
    else:
        # Read the locale's own names for Monday / August.
        with tm.set_locale(time_locale, locale.LC_TIME):
            expected_day = calendar.day_name[0].capitalize()
            expected_month = calendar.month_name[8].capitalize()
    result_day = data.day_name(time_locale)
    result_month = data.month_name(time_locale)
    # Work around https://github.com/pandas-dev/pandas/issues/22342
    # different normalizations
    if not PY2:
        expected_day = unicodedata.normalize("NFD", expected_day)
        expected_month = unicodedata.normalize("NFD", expected_month)
        result_day = unicodedata.normalize("NFD", result_day,)
        result_month = unicodedata.normalize("NFD", result_month)
    assert result_day == expected_day
    assert result_month == expected_month
    # Test NaT
    nan_ts = Timestamp(NaT)
    assert np.isnan(nan_ts.day_name(time_locale))
    assert np.isnan(nan_ts.month_name(time_locale))
def test_listdir2_returns_name_stat_pairs(self):
    """listdir2('.') must return one (utf-8 name, stat-like) pair for a
    file with a non-ASCII name, exposing st_mtime_sec/st_mtime_nsec
    instead of st_mtime."""
    funny_unicode = u'M\u00E4kel\u00E4'
    funny_utf8 = funny_unicode.encode('utf-8')
    self.fs.write_file(funny_utf8, 'data')
    pairs = self.fs.listdir2('.')
    self.assertEqual(len(pairs), 1)
    self.assertEqual(len(pairs[0]), 2)
    name_utf8, st = pairs[0]
    # Names come back as utf-8 byte strings.
    self.assertEqual(type(name_utf8), str)
    name_unicode = name_utf8.decode('utf-8')
    # See https://en.wikipedia.org/wiki/Unicode_equivalence for
    # background. The NFKD normalisation seems to be the best way
    # to ensure things work across Linux and Mac OS X both (their
    # default normalisation for filenames is different).
    self.assertEqual(
        unicodedata.normalize('NFKD', name_unicode),
        unicodedata.normalize('NFKD', funny_unicode))
    # The stat result exposes split-resolution mtime fields only.
    self.assertTrue(hasattr(st, 'st_mode'))
    self.assertFalse(hasattr(st, 'st_mtime'))
    self.assertTrue(hasattr(st, 'st_mtime_sec'))
    self.assertTrue(hasattr(st, 'st_mtime_nsec'))
def clean_song_data(self, artist, title):
    """Normalize an (artist, title) pair for lyrics lookup.

    Lower-cases both, strips accents (NFKD + combining-mark removal),
    applies the LYRICS_ARTIST_REPLACE / LYRICS_TITLE_REPLACE
    substitutions, removes LYRICS_TITLE_STRIP patterns (plus bracketed
    text when self.ignore_brackets is set) and trims whitespace.

    :return: the cleaned (artist, title) tuple
    """
    # convert to lowercase
    artist = artist.lower()
    title = title.lower()
    # remove accents: decompose, then drop the combining marks
    artist = unicodedata.normalize('NFKD', artist)
    artist = "".join([c for c in artist if not unicodedata.combining(c)])
    title = unicodedata.normalize('NFKD', title)
    title = "".join([c for c in title if not unicodedata.combining(c)])
    # BUG FIX: the original appended to the module-level
    # LYRICS_TITLE_STRIP list on EVERY call, so the bracket pattern
    # accumulated duplicates for the process lifetime. Work on a local
    # copy instead.
    title_strip = list(LYRICS_TITLE_STRIP)
    if self.ignore_brackets:
        title_strip.append("\(.*\)")
    # replace ampersands and the like
    for exp in LYRICS_ARTIST_REPLACE:
        artist = re.sub(exp[0], exp[1], artist)
    for exp in LYRICS_TITLE_REPLACE:
        title = re.sub(exp[0], exp[1], title)
    # strip things like "(live at Somewhere)", "(acoustic)", etc
    for exp in title_strip:
        title = re.sub(exp, '', title)
    # compress spaces
    title = title.strip()
    artist = artist.strip()
    return (artist, title)
def freeze(self):
    """Clean the destination and build all URLs from generators."""
    remove_extra = self.app.config['FREEZER_REMOVE_EXTRA_FILES']
    if not os.path.isdir(self.root):
        os.makedirs(self.root)
    if remove_extra:
        ignore = self.app.config['FREEZER_DESTINATION_IGNORE']
        # Snapshot what is already on disk so stale files can be removed
        # after the build.
        previous_files = set(
            # See https://github.com/SimonSapin/Frozen-Flask/issues/5
            normalize('NFC', os.path.join(self.root, *name.split('/')))
            for name in walk_directory(self.root, ignore=ignore))
    seen_urls = set()
    seen_endpoints = set()
    built_files = set()
    for url, endpoint in self._generate_all_urls():
        seen_endpoints.add(endpoint)
        if url in seen_urls:
            # Don't build the same URL more than once
            continue
        seen_urls.add(url)
        new_filename = self._build_one(url)
        # NFC-normalize paths so the comparison against previous_files is
        # stable across filesystems with different unicode normalization.
        built_files.add(normalize('NFC', new_filename))
    self._check_endpoints(seen_endpoints)
    if remove_extra:
        # Remove files from the previous build that are not here anymore.
        for extra_file in previous_files - built_files:
            os.remove(extra_file)
            parent = os.path.dirname(extra_file)
            if not os.listdir(parent):
                # The directory is now empty, remove it.
                os.removedirs(parent)
    return seen_urls
def test_list_notebooks(self):
    """Exercise the notebook list API across the root, spaced, unicode
    and nested directories, plus name-set and ordering checks."""
    nbs = notebooks_only(self.nb_api.list().json())
    self.assertEqual(len(nbs), 1)
    self.assertEqual(nbs[0]['name'], 'inroot.ipynb')
    # Directory whose name contains spaces.
    nbs = notebooks_only(
        self.nb_api.list('/Directory with spaces in/').json())
    self.assertEqual(len(nbs), 1)
    self.assertEqual(nbs[0]['name'], 'inspace.ipynb')
    # Directory whose name contains non-ASCII characters.
    nbs = notebooks_only(self.nb_api.list(u'/unicodé/').json())
    self.assertEqual(len(nbs), 1)
    self.assertEqual(nbs[0]['name'], 'innonascii.ipynb')
    self.assertEqual(nbs[0]['path'], u'unicodé')
    # Nested directory.
    nbs = notebooks_only(self.nb_api.list('/foo/bar/').json())
    self.assertEqual(len(nbs), 1)
    self.assertEqual(nbs[0]['name'], 'baz.ipynb')
    self.assertEqual(nbs[0]['path'], 'foo/bar')
    nbs = notebooks_only(self.nb_api.list('foo').json())
    self.assertEqual(len(nbs), 4)
    # Compare names NFC-normalized: the filesystem may return decomposed
    # unicode (e.g. on OS X) for the same logical name.
    nbnames = {normalize('NFC', n['name']) for n in nbs}
    expected = [u'a.ipynb', u'b.ipynb', u'name with spaces.ipynb', u'unicodé.ipynb']
    expected = {normalize('NFC', name) for name in expected}
    self.assertEqual(nbnames, expected)
    # NOTE(review): expected order appears to be case-insensitive
    # alphabetical — confirm the server's ordering contract.
    nbs = notebooks_only(self.nb_api.list('ordering').json())
    nbnames = [n['name'] for n in nbs]
    expected = ['A.ipynb', 'b.ipynb', 'C.ipynb']
    self.assertEqual(nbnames, expected)
def tokenizeComparison(self, given, correct):
    """Diff `given` against `correct` and return two aligned token lists.

    Each returned list contains (ok, fragment) tuples — one list for each
    input string — where ok is True for matching runs and False for
    mismatched runs. Characters missing from `given` are padded with '-'
    so the two sides stay visually aligned.
    """
    # compare in NFC form so accents appear correct
    given = ucd.normalize("NFC", given)
    correct = ucd.normalize("NFC", correct)
    s = difflib.SequenceMatcher(None, given, correct, autojunk=False)
    givenElems = []
    correctElems = []
    givenPoint = 0
    correctPoint = 0
    offby = 0

    # FIX: renamed the `str` parameter of both helpers, which shadowed
    # the builtin.
    def logBad(old, new, text, array):
        # Record a mismatched span [old:new), if non-empty.
        if old != new:
            array.append((False, text[old:new]))

    def logGood(start, cnt, text, array):
        # Record a matching span of length cnt, if non-empty.
        if cnt:
            array.append((True, text[start:start + cnt]))

    for x, y, cnt in s.get_matching_blocks():
        # if anything was missed in correct, pad given
        if cnt and y - offby > x:
            givenElems.append((False, "-" * (y - x - offby)))
            offby = y - x
        # log any proceeding bad elems
        logBad(givenPoint, x, given, givenElems)
        logBad(correctPoint, y, correct, correctElems)
        givenPoint = x + cnt
        correctPoint = y + cnt
        # log the match
        logGood(x, cnt, given, givenElems)
        logGood(y, cnt, correct, correctElems)
    return givenElems, correctElems
def create_fake_user():
    """Create an active User with a faker-generated name, an ASCII-folded
    email, and one Address set as both default billing and shipping.
    Returns the saved user."""
    first_name = fake.first_name()
    last_name = fake.last_name()
    # ASCII-fold the names for use inside the email address.
    _first = unicodedata.normalize('NFD', first_name).encode('ascii', 'ignore')
    _last = unicodedata.normalize('NFD', last_name).encode('ascii', 'ignore')
    # NOTE(review): the email template and password literals below appear
    # redacted ("*****") in this copy of the source — restore before use.
    email = u'*****@*****.**' % (_first.lower(), _last.lower())
    user = User.objects.create_user(email=email, password='******')
    address = Address.objects.create(
        first_name=first_name,
        last_name=last_name,
        street_address_1=fake.street_address(),
        city=fake.city(),
        postal_code=fake.postcode(),
        country=fake.country_code())
    user.addresses.add(address)
    user.default_billing_address = address
    user.default_shipping_address = address
    user.is_active = True
    user.save()
    return user
def HandleSqlite(SFile):
    """Dump every table of the SQLite file `SFile` as pipe-separated text.

    For each table: a header row built from PRAGMA table_info, then all
    rows, with unicode cells NFKD-folded to ASCII. Returns the dump
    string, or None on error. Python 2 only (print statement, `unicode`).
    """
    print "\n[INFO] SQLite DB Extraction"
    try:
        data = ''
        con = sq.connect(SFile)
        cur = con.cursor()
        cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cur.fetchall()
        for table in tables:
            data += "\nTABLE: " + str(table[0]).decode('utf8', 'ignore') + \
                " \n=====================================================\n"
            # NOTE(review): table names come from sqlite_master (not user
            # input), but the %-interpolated quoting below is fragile for
            # names containing quotes.
            cur.execute("PRAGMA table_info('%s')" % table)
            rows = cur.fetchall()
            head = ''
            for r in rows:
                # r[1] is the column name; ASCII-fold unicode names.
                z = r[1]
                if type(z) is unicode:
                    z = unicodedata.normalize(
                        'NFKD', z).encode('ascii', 'ignore')
                head += str(z).decode('utf8', 'ignore') + " | "
            data += head + " \n=====================================================================\n"
            # Body: every row, cells ASCII-folded and pipe-separated.
            cur.execute("SELECT * FROM '%s'" % table)
            rows = cur.fetchall()
            for r in rows:
                dat = ''
                for x in r:
                    if type(x) is unicode:
                        x = unicodedata.normalize(
                            'NFKD', x).encode('ascii', 'ignore')
                    dat += str(x).decode('utf8', 'ignore') + " | "
                data += dat + "\n"
        return data
    except:
        # Logs the failure and falls through, returning None implicitly.
        PrintException("[ERROR] SQLite DB Extraction")
        pass
def _getPDFText(self, filename, d): logger.debug(u"filename: %s" % filename) newparatextlist = list() try: pdfDoc = PdfFileReader(file(filename, u"rb")) pdfDict = pdfDoc.getDocumentInfo() for x in pdfDict.keys(): d.addConceptKeyType(x[1:], pdfDict[x]) # c.logConcepts() for page in pdfDoc.pages: text = page.extractText() if not isinstance(text, str): unicodedata.normalize(u'NFKD', text).encode(u'ascii', u'ignore') logger.debug(u"PDF : %s" % text) newparatextlist.append(text + u". ") return newparatextlist except Exception, msg: logger.error(u"%s" % msg)
def strings_equal(s1, s2):
    """
    Timing-attack resistant string comparison.

    Normal comparison using == will short-circuit on the first
    mismatching character. This avoids that by scanning the whole
    string, though we still reveal to a timing attack whether the
    strings are the same length.
    """
    s1 = unicodedata.normalize('NFKC', s1)
    s2 = unicodedata.normalize('NFKC', s2)
    try:
        # Python 3.3+ and 2.7.7+ ship a timing-attack-resistant
        # comparison; prefer it over the hand-rolled fallback below.
        from hmac import compare_digest
    except ImportError:
        pass
    else:
        return compare_digest(s1, s2)
    if len(s1) != len(s2):
        return False
    # Accumulate XOR differences across every character pair so the scan
    # never short-circuits on an early mismatch.
    diff_bits = 0
    for a, b in zip(s1, s2):
        diff_bits |= ord(a) ^ ord(b)
    return diff_bits == 0
def artist_search(results, media, lang, artist_name):
    """Search the local Vevo service for `artist_name` and append a
    SearchResult for every artist whose normalized name matches exactly,
    scoring downward from 100 in result order."""
    # Precompose.
    try:
        artist_name = unicodedata.normalize('NFKD', artist_name.decode('utf-8'))
    except UnicodeError:
        # Input was already unicode (or not UTF-8); normalize as-is.
        artist_name = unicodedata.normalize('NFKD', artist_name)
    # Strip diacritics.
    stripped = u''
    for i in range(len(artist_name)):
        point = artist_name[i]
        if not unicodedata.combining(point):
            stripped += point
    artist_name = stripped
    json_obj = JSON.ObjectFromURL('http://127.0.0.1:32400/services/vevo/search?q=%s&artistsLimit=6&videosLimit=1' % (String.Quote(artist_name)))
    score = 100
    # Normalize via the plexmusic agent so both sides use the same rules.
    normalized_artist_name = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist_name))
    for artist in json_obj['artists']:
        # Require a perfect match after normalization to avoid false positives.
        normalized_artist_result = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist['name']))
        Log('Sanity checking normalized artist: %s against Vevo result: %s' % (normalized_artist_name, normalized_artist_result))
        if normalized_artist_name == normalized_artist_result:
            results.add(SearchResult(
                id = artist['urlSafeName'],
                score = score
            ))
            # Later matches rank slightly lower.
            score = score - 1
def CrearPedidoCertificado(self, cuit="", empresa="", nombre="pyafipws", filename="empresa.csr"):
    "Create a certificate signing request (X509 CSR) and write it as PEM to `filename`"
    from M2Crypto import RSA, EVP, X509
    # create the certificate signing request (CSR):
    self.x509_req = X509.Request ()
    # normalize encoding (replace accents, enie, etc.) so the subject is ASCII
    if isinstance(empresa, unicode):
        empresa = unicodedata.normalize('NFKD', empresa).encode('ASCII', 'ignore')
    if isinstance(nombre, unicode):
        nombre = unicodedata.normalize('NFKD', nombre).encode('ASCII', 'ignore')
    # subject: C=AR/O=[empresa]/CN=[nombre]/serialNumber=CUIT [nro_cuit]
    x509name = X509.X509_Name ()
    # default OpenSSL parameters:
    kwargs = {"type": 0x1000 | 1, "len": -1, "loc": -1, "set": 0}
    x509name.add_entry_by_txt(field='C', entry='AR', **kwargs)
    x509name.add_entry_by_txt(field='O', entry=empresa, **kwargs)
    x509name.add_entry_by_txt(field='CN', entry=nombre, **kwargs)
    x509name.add_entry_by_txt(field='serialNumber', entry="CUIT %s" % str(cuit), **kwargs)
    self.x509_req.set_subject_name(x509name)
    # sign the request with the previously created key (CrearClavePrivada)
    self.x509_req.set_pubkey (pkey=self.pkey)
    self.x509_req.sign(pkey=self.pkey, md='sha256')
    # save the CSR result to a file:
    f = open(filename, "w")
    f.write(self.x509_req.as_pem())
    f.close()
    return True
def toRSSItem(self):
    """Render this commit as an RSSItem.

    Title = repo tagname plus a truncated commit message and keywords;
    description = the pprint dump wrapped in <pre>; link built from the
    repo's viewlink template; guid from the configured root URL.
    """
    title = self.repo.tagname
    # Append the commit message, truncated to 50 chars with an ellipsis.
    if self.message and len(self.message) > 50:
        title += " - " + self.message[:50] + "..."
    elif self.message:
        title += " - " + self.message
    if self.dbkeywords:
        title += " - " + ",".join(self.dbkeywords)
    description = "<pre>"
    description += self.getpprint()
    description += "</pre>"
    # ASCII-fold both fields (Python 2: str -> unicode -> ascii str).
    title = unicodedata.normalize('NFKD', unicode(title, 'utf-8')).encode('ascii', 'ignore')
    description = unicodedata.normalize('NFKD', unicode(description, 'utf-8')).encode('ascii', 'ignore')
    link = ''
    if self.repo.viewlink:
        # The template's %ID placeholder is replaced by the commit id.
        link = self.repo.viewlink.replace('%ID', self.uniqueid)
    item = RSSItem(
        title = title,
        link = link,
        description = description,
        guid = Config.rooturl + "/commit/" + self.repo.tagname + "/" + self.uniqueid,
        pubDate = unixToDatetime(self.date)
    )
    return item
def normalize_token(data):
    """Fold a token for comparison.

    Strips accents (NFD decompose + drop combining marks), lower-cases,
    re-composes to NFC, and removes straight and curly apostrophes.
    """
    # credit: http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    data = unicodedata.normalize(
        "NFC",
        "".join((c for c in unicodedata.normalize("NFD", data)
                 if unicodedata.category(c) != "Mn")).lower()
    )
    # FIX: the original used a ur"..." literal, which is a SyntaxError on
    # Python 3; a plain u"..." literal (no escapes in the pattern, so the
    # raw prefix was unnecessary) is equivalent and valid on both 2 and 3.
    data = re.sub(u"['’]", "", data)
    return data
def redirect_if_needed(self, i):
    """Normalize the query parameters in `i` (NFC, stripped values,
    plural keys renamed via `plurals`) and raise a redirect to the
    cleaned query string when anything actually changed."""
    params = {}
    need_redirect = False
    for k, v in i.items():
        if k in plurals:
            # Old plural parameter: blank it out in the redirect and
            # continue under the mapped (singular) name.
            params[k] = None
            k = plurals[k]
            need_redirect = True
        if isinstance(v, list):
            if v == []:
                continue
            clean = [normalize('NFC', b.strip()) for b in v]
            if clean != v:
                need_redirect = True
            # A lone empty string means "no value".
            if len(clean) == 1 and clean[0] == u'':
                clean = None
        else:
            clean = normalize('NFC', v.strip())
            if clean == '':
                need_redirect = True
                clean = None
            if clean != v:
                need_redirect = True
        params[k] = clean
    if need_redirect:
        # web.seeother is raised to abort processing with a 303 redirect.
        raise web.seeother(web.changequery(**params))
def __init__(self):
    """Locate RssFeeds.xml (via xbmc when available), parse it, and load
    the current feed list; offer to regenerate a minimal default file
    when parsing fails."""
    if xbmc:
        self.RssFeedsPath = xbmc.translatePath('special://userdata/RssFeeds.xml').decode("utf-8")
    else:
        # Fallback path for running outside XBMC.
        self.RssFeedsPath = r'C:\Documents and Settings\Xerox\Application Data\XBMC\userdata\RssFeeds.xml'
    sane = self.checkRssFeedPathSanity()
    if sane:
        try:
            self.feedsTree = parse(self.RssFeedsPath)
        except:
            # Parse failed: log the (ASCII-folded) path and ask whether
            # to regenerate a default file.
            log('[script] RSS Editor --> Failed to parse ' + unicodedata.normalize( 'NFKD', self.RssFeedsPath ).encode( 'ascii', 'ignore' ))
            regen = xbmcgui.Dialog().yesno(getLS(40), getLS(51), getLS(52), getLS(53))
            if regen:
                log('[script] RSS Editor --> Attempting to Regenerate RssFeeds.xml')
                xml = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<rssfeeds>\n\
<!-- RSS feeds. To have multiple feeds, just add a feed to the set. You can also have multiple sets. !-->\n\
<!-- To use different sets in your skin, each must be called from skin with a unique id. !-->\n\
<set id="1">\n <feed updateinterval="30">http://feeds.feedburner.com/xbmc</feed>\n </set>\n</rssfeeds>'
                f = open(self.RssFeedsPath, 'w')
                f.write(xml)
                f.close()
                # Re-run init now that a fresh file exists on disk.
                self.__init__()
            else:
                log('[script] RSS Editor --> User opted to not regenerate RssFeeds.xml. Script Exiting')
                self.feedsTree = False
        if self.feedsTree:
            self.feedsList = self.getCurrentRssFeeds()
    else:
        # Path sanity check failed: mark everything unavailable.
        self.feedsTree = False
        self.feedsList = False
        log('[SCRIPT] RSS Editor --> Could not open ' + unicodedata.normalize( 'NFKD', self.RssFeedsPath ).encode( 'ascii', 'ignore' ) +'. Either the file does not exist, or its size is zero.')
def add_other_bank_account(request): """ function to add a receiver of another bank to which user wants to transfer the money. It fills in all the details of the receiver and also validates them. """ try: cust_id=request.session.get('user_id') name=request.POST["name"] connected_acc_no1=request.POST["account_no"] confirm_acc_no=request.POST["account_no_2"] addressline1=request.POST["line1"] addressline2=request.POST["line2"] addressline3=request.POST["line3"] IFSC_code1=request.POST["IFSC"] limit1=request.POST["limit"] error1="Account Confirmation Failed" error2="Please Enter Valid numbers in fields" error3="Please Enter numeral entries in fields" error4="Sorry The account you wish to connect does not exist" error6="Account Already Added" error7="IFSC code does no exists" if(connected_acc_no1!=confirm_acc_no): return render_to_response("add_other_bank_account.html",{'error':error1,'STATIC_URL':"/static/"}) limit=unicodedata.normalize('NFKD', limit1).encode('ascii','ignore') connected_acc_no=unicodedata.normalize('NFKD', connected_acc_no1).encode('ascii','ignore') IFSC_code=unicodedata.normalize('NFKD', IFSC_code1).encode('ascii','ignore') try: i = float(limit) except ValueError, TypeError: return render_to_response("add_other_bank_account.html",{'error':error3,'STATIC_URL':"/static/"}) else:
def noDiacritics(s):
    """Removes any diacritics.

    Tries to decode the input as UTF-8 (Python 2 ``unicode``), drops the
    ordinal-indicator signs (º, ª), then NFKD-decomposes and ASCII-folds.
    Returns None for None input and the original value when both decode
    attempts fail.
    """
    # sanity check
    if s is None:
        return None
    # try the right way first
    try:
        strAux = unicode(s, 'utf-8')
        # remove some chars
        strAux = strAux.replace(unichr(0xba), "")  # 4o
        strAux = strAux.replace(unichr(0xaa), "")  # 4a
        # normalization
        ret = unicodedata.normalize('NFKD', strAux)
        ret = ret.encode('ascii', 'ignore')
    except:
        ret = None
    # try as a unicode encoded string
    if ret is None:
        try:
            # BUG FIX: the original called s.decode(s, 'utf-8'), passing
            # the string itself as the codec name, so this branch could
            # never succeed; decode takes the encoding as its first arg.
            strAux = s.decode('utf-8')
            # remove some chars
            strAux = strAux.replace(unichr(0xba), "")  # 4o
            strAux = strAux.replace(unichr(0xaa), "")  # 4a
            # normalization
            ret = unicodedata.normalize('NFKD', strAux)
            ret = ret.encode('ascii', 'ignore')
        except:
            ret = s
    # return as received
    return ret
def fromUser(self, screen_name, tweets_number=10, is_bot=False):
    """Fetch up to `tweets_number` recent tweets for `screen_name` and
    persist each one as a Tweet row (text/date ASCII-folded, mentioned
    screen names joined with commas), attached to the created user."""
    user = self.createUser(screen_name, is_bot)
    tweets = self.twitter_client.user_timeline(screen_name=screen_name, count=tweets_number)
    for i, status in enumerate(tweets):
        tweet = status._json
        text = tweet['text']
        date = tweet['created_at']
        entities = tweet['entities']
        user_mentions = entities['user_mentions']
        # Collect the screen names of every mentioned account.
        mentions_list = []
        if len(user_mentions) > 0:
            for mention in user_mentions:
                mentions_list.append(mention['screen_name'])
        # ASCII-fold text and date for storage.
        text_string = unicodedata.normalize('NFKD', text).encode('ascii','ignore')
        date_string = unicodedata.normalize('NFKD', date).encode('ascii','ignore')
        name_mentions_string = ",".join(mentions_list)
        Tweet.create(
            user = user,
            text = text_string,
            date = date_string,
            source = status.source,
            mentions = name_mentions_string
        )
def remove_accents(s: str) -> str:
    """Return *s* with all combining marks (accents) stripped."""
    decomposed = unicodedata.normalize("NFD", s)
    kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return "".join(kept)
def main(): os_v = os.uname()[2].split(".")[0] if os_v == "10": source1 = "/System/Library/Input Methods/CharacterPalette.app/Contents/Frameworks/CharacterPaletteFramework.framework/Resources/kanji.db" elif (os_v > "10" and os_v < "18"): source1 = "/System/Library/Input Methods/CharacterPalette.app/Contents/Resources/CharacterDB.sqlite3" else: source1 = "/System/Library/Components/CharacterPalette.component/Contents/SharedSupport/\ CharPaletteServer.app/Contents/Frameworks/CharacterPaletteFramework.framework/Versions/A/Resources/kanji.db" bundleLibPath = os.environ["TM_BUNDLE_SUPPORT"] + "/lib/" source2 = bundleLibPath + "allHanForRadical.txt.zip" def lastCharInUCSdec(s): isPaneB = False if s: if u"\udc00" <= s[-1] <= u"\udfff" and len(s) >= 2 and u"\ud800" <= s[-2] <= u"\udbff": isPaneB = True return (((ord(s[-2])&0x3ff)<<10 | (ord(s[-1])&0x3ff)) + 0x10000, isPaneB) return (ord(s[-1]), isPaneB) return (-1, isPaneB) if "TM_SELECTED_TEXT" in os.environ: sys.exit(200) if os.environ["DIALOG"][-1] == '2': dialog2 = True else: dialog2 = False outDict = SeqDict() if "TM_CURRENT_LINE" in os.environ and "TM_LINE_INDEX" in os.environ and int(os.environ["TM_LINE_INDEX"]): line, x = os.environ["TM_CURRENT_LINE"], int(os.environ["TM_LINE_INDEX"]) else: sys.exit(206) (lastCharDecCode, charIsPaneB) = lastCharInUCSdec(unicode(line[:x], "UTF-8")) char = wunichr(lastCharDecCode) lastCharUCShexCode = "%04X" % lastCharDecCode UnicodeData = os.popen("zgrep '^" + lastCharUCShexCode + ";' '" + bundleLibPath + "UnicodeData.txt.gz'").read().decode("utf-8") name = "" if not UnicodeData: name = getNameForRange(lastCharDecCode) else: (dummy1, name, category, combiningclass, bididir, decomposition, numtype1, numtype2, numtype3, bidimirror, oldname, comment, upcase, lowcase, titlecase) = UnicodeData.strip().split(';') if name[0] == '<': name = getNameForRange(lastCharDecCode) block = getBlockName(lastCharDecCode) outDict['Character'] = char outDict['Name'] = name outDict['Block'] = block # 
look for related chars frel = open(bundleLibPath + "relatedChars.txt", "rb") reldata = frel.read().decode("UTF-8") frel.close() for part in reldata.split('\n'): if char in part: break if part: outDict["Related to"] = part if "CJK" in name and ("IDEO" in name or "Ideo" in name): cmd = "zgrep -F '" + char + ",' '" + source2 + "'" gdata = os.popen(cmd.encode("UTF-8")).read().decode("UTF-8") if len(gdata) > 0: RadNum, RadStrokeCnt, RadName, Rad, ExtStrokeCnt, Dummy = gdata.split('\t') outDict['Radical (trad.)'] = [Rad, RadStrokeCnt, u"画", RadName, RadNum, ExtStrokeCnt] outDict['Strokes (trad.)'] = str(int(RadStrokeCnt) + int(ExtStrokeCnt)) # get all data from Apple's internal UniDict cmd = "sqlite3 '" + source1 + "' 'select * from unihan_dict where uchr=\"" + char + "\";' 2>/dev/null" udata = os.popen(cmd.encode("UTF-8")).read().decode("UTF-8") if udata: (uChar, a1, readings, hangul_name_sound, pinyin, zhWubiXing, zhWubiHua, zhBianhao, a2, zhCangjieCh, zhDayi, pinyin1, Bopomofo, jaKun, jaOn, pinyin, zhCangjie) = udata.split('|') zhCangjie = zhCangjie.strip() if readings: japDict = SeqDict() kunon = readings.split('/') if kunon[0]: japDict['Kun'] = kunon[0] if kunon[1]: japDict['On'] = kunon[1] outDict['Japanese'] = japDict # get Chinese simplified/traditional equivalent cmd = "egrep '^" + char + "' '" + bundleLibPath + "zhSimTradHanzi.txt'" simtrad = os.popen(cmd.encode("UTF-8")).read().decode("UTF-8") data = "" if simtrad: c1, st, data = simtrad.split('\t') if pinyin1 or Bopomofo or data or zhWubiXing or zhWubiHua or \ zhBianhao or zhCangjie or zhCangjieCh or zhDayi: zhDict = SeqDict() if data: if st == 'T': zhDict['Traditional'] = data.rstrip() elif st == 'S': zhDict['Simplified'] = data.rstrip() if pinyin1: zhDict['Pinyin'] = pinyin1 if Bopomofo: zhDict['Zhuyin'] = Bopomofo if zhWubiXing: zhDict['Wubi Xing'] = zhWubiXing if zhWubiHua: zhDict['Wubi Hua'] = zhWubiHua if zhBianhao: zhDict['Bishu Bianhao'] = zhBianhao if zhCangjie: zhDict['Cangjie'] = zhCangjie + " " + 
zhCangjieCh if zhDayi: zhDict['Dayi'] = zhDayi outDict['Chinese'] = zhDict if hangul_name_sound: korDict = SeqDict() korDict['Hangul'] = hangul_name_sound outDict['Korean'] = korDict else: if 'HANGUL' in name and not 'Jamo' in block: outDict['Decomposition'] = " ".join(unicodedata.normalize("NFKD", char)) if UnicodeData: if category: outDict['Category'] = expandUniCategories(category) if oldname: outDict['Old Name'] = oldname if bididir: outDict['Bidirectional'] = expandUniDirectionClass(bididir) if combiningclass: outDict['Combining Class'] = expandUniCombiningClass(combiningclass) if bidimirror: outDict['Mirrored'] = bidimirror if upcase: outDict['Upper Case'] = wunichr(int(upcase,16)) + " (U+" + upcase + ")" if lowcase: outDict['Lower Case'] = wunichr(int(lowcase,16)) + " (U+" + lowcase + ")" if titlecase: outDict['Title Case'] = wunichr(int(titlecase,16)) + " (U+" + titlecase + ")" if numtype1: outDict['Numeral Type'] = (numtype1 + " " + numtype2 + " " + numtype3).strip() if decomposition and not charIsPaneB: decompDict = SeqDict() if decomposition[0] == '<': dc = decomposition.split(' ') decompDict['Class'] = expandUniDecompositionClass(dc[0]) decomposition = " ".join(dc[1:]) decomp = decomposition def cDec(x): return unichr(int(x,16)) def rDec(x): return "U+%04X" % ord(x) clist = decomp.split(' ') decomp = " ".join(map(cDec, clist)) + " (U+" + " U+".join(clist) + ")" cflist = unicodedata.normalize("NFKD", char) if len(clist) != len(cflist): decompDict['into'] = decomp + "; " + " ".join(cflist) + "(" + " ".join(map(rDec, cflist)) + ")" else: decompDict['into'] = decomp outDict['Decomposition'] = decompDict cpDict = SeqDict() cpDict['UCS dec/hex'] = "%s / U+%s" % (str(lastCharDecCode), lastCharUCShexCode) cpDict['UTF-8'] = " ".join([hex(ord(c))[2:].upper() for c in char.encode("utf-8")]) utf16be = hexlify(char.encode("utf-16-be")).upper() if len(utf16be)>4: cpDict['UTF-16BE'] = utf16be[:4] + "+" + utf16be[4:] outDict['Codepoints'] = cpDict if dialog2: dlgout = 
"<table style=\"border-collapse:collapse;\">" plh = "" if outDict.has_key('Category') and "Nonspacing" in outDict['Category']: plh = u"o" dlgout += "<tr><td rowspan=2 style=\"border:1px dotted silver;font-size:20pt;text-align:center;\"><font color=#CCCCCC>%s</font>%s</td><td> </td><td style=\"color:grey;\">Name</td><td>%s</td></tr>" % (plh, outDict['Character'], outDict['Name']) dlgout += "<tr><td> </td><td style=\"color:grey;\">Block</td><td>%s</td></tr>" % outDict['Block'] dlgout += "</table><table style=\"border-collapse:collapse;width:200px;\">" del outDict['Character'] del outDict['Name'] del outDict['Block'] for k, v in outDict.items(): if "Radical" in k: dlgout += "<tr><td align=right style=\"color:grey;\">%s</td><td> </td><td style=\"white-space:nowrap;\">%s (%s%s - %s) %s.%s" % (k, v[0], v[1], v[2], v[3], v[4], v[5]) elif "Related" in k: # and len(v) > 60 dlgout += "<tr><td align=right style=\"color:grey;\">%s</td><td> </td><td>%s</td></tr>" % (k, v) else: try: v.items() dlgout += "<tr><td colspan=2 align=right style=\"color:grey;\"><b><i>%s</i></b></td></tr>" % k for ku, vu in v.items(): dlgout += "<tr><td align=right style=\"color:grey;white-space:nowrap;\">%s</td><td> </td><td style=\"white-space:nowrap;\">%s</td></tr>" % (ku, vu) except AttributeError: dlgout += "<tr><td align=right style=\"color:grey;white-space:nowrap;\">%s</td><td> </td><td style=\"white-space:nowrap;\">%s</td></tr>" % (k, v) cmd = "'%s' tooltip --html '%s'" % (os.environ["DIALOG"], dlgout.replace("'", u"'")) os.popen(cmd.encode("UTF-8")) sys.exit(206) else: sep = u"┊" for k, v in outDict.items(): if "Radical" in k: print "%-15s %s %s (%s%s - %s) %s.%s" % (k, sep, v[0], v[1], v[2], v[3], v[4], v[5]) else: try: v.items() print "%-15s" % k for ku, vu in v.items(): print "%15s %s %s" % (ku, sep, vu) except AttributeError: print "%-15s %s %s" % (k, sep, v) sys.exit(206)
file.write(fstrip) with open(srcfile, 'r') as file: for fileline in file: matchesv = re.match(r'\\\\v\*\*\\\\v \d+==[^#]', fileline) matchesmt = re.match(r'\\\\mt\*\*\\\\mt \d+==[^#]', fileline) #print ('#### verse or mt is ####: ',fileline) if matchesv or matchesmt: linesplit = fileline.split('==####') line2 = linesplit[0] line1 = line2.replace(' ', '##').replace('\\\\', '\\').replace('##-', '## ') #print ('39 #### working line sfm file ####: ',fileline) #print ('40 #### verse spaces removed ####: ',line1) line = unicodedata.normalize('NFC', line1) versetext1 = linesplit[1] versetext2 = versetext1.rstrip('\n') versetext = unicodedata.normalize('NFC', versetext2) #print () #print ('44 #### bare verse normalized ####: ',versetext) #sanitize variable - https://stackoverflow.com/questions/8237647/clear-variable-in-python cars = None #cars = dict(x.split('**') for x in line.split('==')) - old try, but makes values into str, needs to be list so I an append suffixes later #https://stackoverflow.com/questions/4627981/creating-a-dictionary-from-a-string - key thing here is the [v] which turns it into a list cars = dict( (k, [v]) for k, v in (e.split('**') for e in line.split('=='))) #only for debugging log #print ('51 #### annots as dict keys ####: ', cars.keys()) newlist = [] for key in cars:
def unicodeToAscii(s):
    """Strip diacritics from *s*: NFD-decompose, then drop combining marks (Mn)."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def TTYstr(ustr):
    """NFKD-normalize *ustr*, drop non-ASCII characters, and upper-case.

    Returns an ASCII-only bytes object suitable for a dumb terminal.
    """
    ascii_bytes = unicodedata.normalize('NFKD', ustr).encode('ascii', 'ignore')
    return ascii_bytes.upper()
def unicode_normalize_string(self, text):
    """Normalize *text* to an upper-case, accent-free ASCII string without hyphens.

    Fix: the original called the Python-2-only ``unicode(text, 'utf-8')``
    builtin, which raises NameError on Python 3.  Accept either str or
    UTF-8-encoded bytes, NFD-decompose, drop non-ASCII (the combining
    accent marks), upper-case, and remove '-' characters.

    :param text: str, or UTF-8 encoded bytes
    :return: upper-cased ASCII str
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    ascii_text = unicodedata.normalize('NFD', text).encode(
        'ascii', 'ignore').decode('ascii')
    return ascii_text.upper().replace("-", "")
def to_python(self, value):
    """Return the parent field's cleaned value, NFKC-normalized.

    NFKC folds visually-equivalent Unicode forms (e.g. full-width letters)
    so that equivalent usernames compare equal.
    """
    cleaned = super(UsernameField, self).to_python(value)
    return unicodedata.normalize('NFKC', cleaned)
def strip_accents(string):
    """Return *string* with all combining accent marks removed.

    Fix: the original called the Python-2-only ``unicode()`` builtin,
    which raises NameError on Python 3.  Coerce to str instead, then
    NFD-decompose and drop every combining (Mn) character.
    """
    return u''.join(c for c in unicodedata.normalize('NFD', str(string))
                    if unicodedata.category(c) != 'Mn')
def processAlgorithm(self, parameters, context, feedback):
    """Load one cadastre 'LIEUDIT' WMS layer per commune in the input layer.

    Reads the INSEE code and commune name fields from the source vector
    layer, de-duplicates and sorts the (insee, name) pairs, then builds
    one WMS raster layer per commune against inspire.cadastre.gouv.fr and
    schedules valid layers to be added to the project on completion.

    Fixes vs. the original:
    - the EPSG check compared against ' 32631' (leading space), so
      EPSG:32631 could never match; the 26-way ``or`` chain is replaced
      with a set-membership test and the typo corrected;
    - the unused ``output_layers`` local is removed.
    """
    source = self.parameterAsVectorLayer(parameters, self.INPUT, context)
    field_insee = self.parameterAsString(parameters, self.INSEE_CODE, context)
    field_commune = self.parameterAsString(parameters, self.COMMUNE_NAME, context)
    value_epsg = self.parameterAsString(parameters, self.EPSG_CODE, context)

    # Supported CRS codes (metropolitan + overseas France, plus common web CRS).
    valid_epsg = {
        '2154', '3942', '3943', '3944', '3945', '3946', '3947', '3948',
        '3949', '3950', '32630', '32631', '32632', '3857', '4326', '4258',
        '32620', '2970', '2972', '2973', '2975', '32622', '32740', '32738',
        '4471', '32621',
    }
    if value_epsg in valid_epsg:
        feedback.pushInfo('EPSG code' + value_epsg)
        tab = []
        for f in source.getFeatures():
            # (INSEE code, accent-stripped commune name) for each feature.
            commune_ascii = ''.join(
                c for c in unicodedata.normalize('NFD', f[field_commune])
                if unicodedata.category(c) != 'Mn')
            tab.append((f[field_insee], commune_ascii))
        # Drop duplicates and sort so layers load in a stable order.
        Lt = sorted(set(tab))
        print(Lt)
        for c_insee, n_couche in Lt:
            urlWithParams = ("url=http://inspire.cadastre.gouv.fr/scpc/" + c_insee +
                             ".wms?contextualWMSLegend=0&crs=EPSG:" + value_epsg +
                             "&dpiMode=7&featureCount=10&format=image/png"
                             "&layers=LIEUDIT&styles=&maxHeight=1024&maxWidth=1280")
            rlayer = QgsRasterLayer(urlWithParams,
                                    'Lieu_dit_' + n_couche + '_' + c_insee, 'wms')
            feedback.pushInfo('Category :' + n_couche + ' - ' + c_insee)
            feedback.pushInfo('Validity of WMS : %s' % rlayer.isValid())
            if not rlayer.isValid():
                print('Lieu_dit_' + n_couche + '_' + c_insee + ' failed to load!')
                feedback.pushInfo('WMS INVALID : \nCadastre_' + n_couche + '_' + c_insee)
            else:
                # Source: https://gis.stackexchange.com/questions/342802/loading-openstreetmap-in-pyqgis
                context.temporaryLayerStore().addMapLayer(rlayer)
                context.addLayerToLoadOnCompletion(
                    rlayer.id(),
                    QgsProcessingContext.LayerDetails(
                        'Lieu_dit_' + n_couche + '_' + c_insee,
                        context.project(),
                        self.OUTPUT_LAYERS))
    else:
        feedback.pushInfo('Error EPSG code')
    # No feature-sink output: layers are attached to the project via
    # addLayerToLoadOnCompletion, so the result dictionary is empty.
    return {}
def strip_accents(s):
    """Remove accents (and backticks/apostrophes) from a string."""
    # Drop backticks and apostrophes before normalizing.
    s = s.replace('`', '').replace("'", '')
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
def normalize(y):
    """NFKC-normalize *y* (e.g. full-width digits to half-width); pass missing values through."""
    return y if pd.isnull(y) else ud.normalize('NFKC', y)
def remove_words_accents(word: str):
    """Return *word* with accented characters reduced to their ASCII base form."""
    stripped = unicodedata.normalize('NFD', word).encode('ascii', 'ignore')
    return stripped.decode("utf-8")
def normalize(s):
    """Coerce *s* to str, NFKD-decompose it, and drop combining marks.

    Fix: the original called the Python-2-only ``unicode()`` builtin,
    which raises NameError on Python 3; ``str()`` is used instead.
    """
    return ''.join(c for c in unicodedata.normalize('NFKD', str(s))
                   if unicodedata.category(c) != 'Mn')
def parse_store(self, response):
    """Scrapy callback: scrape a single store page and yield a GeojsonPointItem.

    Pulls coordinates, address parts, phone and website from the page via
    XPath, then reconstructs the opening-hours string from the sidebar.
    """
    lat = response.xpath('//*[@id="location-lat"]/@value').extract_first()
    lon = response.xpath('//*[@id="location-lng"]/@value').extract_first()
    # NOTE(review): `name` is extracted but never used below.
    name = response.xpath(
        '//div[@class="title-wrap"]/h2/text()').extract_first()
    phone = response.xpath(
        '//div[@class="title-wrap"]/div/text()').extract_first()
    street = response.xpath(
        '//li[@itemprop="streetAddress"]/text()').extract_first().strip()
    city = response.xpath(
        '//span[@itemprop="addressLocality"]/text()').extract_first()
    state = response.xpath(
        '//span[@itemprop="addressRegion"]/text()').extract_first()
    postcode = response.xpath(
        '//span[@itemprop="postalCode"]/text()').extract_first()
    website = response.xpath(
        '//*[@id="my_location_url"]/@value').extract_first()
    # Assumes the street text already ends with a separator -- TODO confirm.
    address = "{}{} {} {}".format(street, city, state, postcode)
    # Some pages post notices such as "No longer accepting checks"
    # in the day/hours open section
    hour = response.xpath(
        '//*[@class="location-sidebar-item"][2]/descendant::*[contains('
        '., "am") or contains(., "pm") or contains('
        '., "Closed")]/text()').extract()
    day = self.convert_days(
        response.xpath(
            '//*[@class="location-sidebar-item"][2]/descendant::*[contains('
            '., "Sunday") or contains(., "Monday") or contains('
            '., "Tuesday") or contains(., "Wednesday") or contains('
            '., "Thursday") or contains(., "Friday") or contains('
            '., "Saturday")]/text()').extract())
    # Normalize each hours string so \xa0 (non-breaking space) strips cleanly.
    for i in range(len(hour)):
        hour[i] = unicodedata.normalize("NFKD", hour[i])  # handle \xa0
        hour[i] = hour[i].strip()
    # Drop entries that were only whitespace, then convert to a uniform format.
    hour = [x for x in hour if x]
    hour = self.convert_hours(hour)
    # Pair each day with its hours: "Monday : 9am - 5pm, ..."
    opening_hours = ', '.join('{} : {}'.format(*t) for t in zip(day, hour))
    yield GeojsonPointItem(
        lat=lat,
        lon=lon,
        addr_full=address,
        street=street,
        city=city,
        state=state,
        postcode=postcode,
        phone=phone,
        website=website,
        opening_hours=opening_hours,
        ref=response.url,
    )
def lat2asc(self, title):
    """Decode a latin-1 byte string and transliterate it to ASCII-only bytes."""
    decoded = title.decode('iso8859-1')
    normalized = unicodedata.normalize('NFKD', decoded)
    return normalized.encode('ascii', 'ignore')
def normalize_ascii(value):
    """NFKD-normalize *value* and return its ASCII-only byte representation."""
    normalized = unicodedata.normalize('NFKD', value)
    return normalized.encode('ascii', 'ignore')
def unicodeToAscii(series):
    """NFKC-normalize every element of a pandas Series (after coercing to str).

    Note: despite the name, NFKC does not strip accents -- it folds
    compatibility forms such as full-width characters.
    """
    def _nfkc(value):
        return unicodedata.normalize('NFKC', str(value))
    return series.apply(_nfkc)
def execute(self, commande):
    """Recognize and carry out the order typed by the user.

    The first word selects the verb: "prendre" (take) and "poser" (drop)
    move an object between the current location and the inventory;
    "aller" (go) changes the current location; "parler" (talk),
    "utiliser" (use) and "inventaire" (inventory) are also handled.
    """
    # Strip accents so matching is accent-insensitive, then split on spaces.
    commande = unicodedata.normalize('NFD', commande).encode(
        'ascii', 'ignore').decode('utf8')
    commande = commande.replace("'", " ")
    words = commande.strip(" ").split(" ")
    mots_reconnus = 0  # number of recognized words for the current verb
    if words[0] == "prendre":
        for mot in words[1:]:
            # NOTE(review): removes from the list being iterated; fine for a
            # single match, but elements after a removal may be skipped.
            for obj in self.lieu[self.lieu_actuel].contenu:
                if mot == obj.raccourci:
                    mots_reconnus += 1
                    self.personnage.inventaire.append(obj)
                    self.lieu[self.lieu_actuel].contenu.remove(obj)
                    print("Vous avez obtenu : " + obj.nom)
        if mots_reconnus == 0:
            print("Impossible de prendre cet objet.")
    # As it stands, the "poser" (drop) command causes problems.
    # elif words[0] == "poser":
    #     for mot in words[1:]:
    #         for obj in self.personnage.inventaire:
    #             if mot == obj.raccourci:
    #                 self.lieu[self.lieu_actuel].contenu.append(obj)
    #                 self.personnage.inventaire.remove(obj)
    elif words[0] == "aller":
        for mot in words[1:]:
            if mot in self.lieu[self.lieu_actuel].adjacence:
                # Follow the adjacency edge and flag a scene transition.
                self.lieu_actuel = self.lieu[
                    self.lieu_actuel].adjacence[mot]
                self.transition = 1
                mots_reconnus += 1
        if mots_reconnus == 0:
            print(
                "La destination n'a pas été reconnue, ou est inaccessible depuis ce lieu."
            )
        if mots_reconnus > 1:
            print(
                "Attention, plusieurs lieux ont été reconnus. \nVous arrivez dans le dernier possible"
            )
    elif words[0] == "parler":
        for mot in words[1:]:
            if mot in self.lieu[self.lieu_actuel].dialogues:
                mots_reconnus += 1
                # Bold the speaker's name with ANSI escapes.
                print("\n\033[1m" + mot.capitalize() + "\033[0m : " +
                      self.lieu[self.lieu_actuel].dialogues[mot])
        if mots_reconnus == 0:
            print("Impossible de parler à cette personne.")
    elif words[0] == "utiliser":
        for mot in words[1:]:
            # Only objects present in the inventory can be used here.
            if (mot in self.lieu[self.lieu_actuel].utilisation):
                for obj in self.personnage.inventaire:
                    if mot == obj.raccourci:
                        self.declencher(
                            self.lieu_actuel,
                            self.lieu[self.lieu_actuel].utilisation[mot])
                        mots_reconnus += 1
        if not mots_reconnus:
            print("Utilisation impossible.")
    elif words[0] == "inventaire":
        self.personnage.afficher_inventaire()
    else:
        print("Verbe non reconnu.")
def parse_hours(self, item):
    """Transliterate *item* to ASCII and strip markup/whitespace noise.

    NFKD-normalizes, drops non-ASCII characters, trims surrounding
    whitespace, then removes line breaks and a fixed set of HTML tags.

    Fix: the original left the value as bytes after ``.encode()``, so the
    subsequent str ``.replace()`` calls raised TypeError on Python 3;
    decode back to str before the replacements.
    """
    item = unicodedata.normalize('NFKD', item).encode(
        'ascii', 'ignore').decode('ascii').strip()
    symbols = ['\r', '\n', '<br>', '<br/>', '<p>', '<ul>', '</ul>',
               '</li>', '<h3>', '</h3>', '<li style="list-style: initial;">']
    for s in symbols:
        item = item.replace(s, '')
    return item
# NOTE(review): this reads like a REPL transcript (bare expressions such as
# `a`, `b`, `x` below print their value interactively).  `s` is assumed to
# be a str defined earlier in the session -- not visible here.
remap = {
    ord('\t') : ' ',   # tab -> single space
    ord('\f') : ' ',   # form feed -> single space
    ord('\r') : None   # Deleted
}
a = s.translate(remap)
a
import unicodedata
import sys
sys.maxunicode
# Map every combining code point to None so translate() deletes accents.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
                         if unicodedata.combining(chr(c)))
b = unicodedata.normalize('NFD', a)
b
b.translate(cmb_chrs)
# Map every Unicode decimal digit (category Nd) to its ASCII equivalent.
digitmap = { c: ord('0') + unicodedata.digit(chr(c))
             for c in range(sys.maxunicode)
             if unicodedata.category(chr(c)) == 'Nd'}
len(digitmap)
# Arabic digits
x = '\u0661\u0662\u0663'
x
x.translate(digitmap)
def interpreter(data_path, model_path):
    """Interactive REPL for talking to a trained seq2seq model.

    Type a sentence and the model replies; type "exit" to quit.

    :param data_path: path of the corpus the model was trained on
    :param model_path: path of the trained model weights (HDF5)
    :return: None (loops until the user types "exit")
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print('You gave wrong argument to this system. Check out your argument about languages.')
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # load word2vec model; sim_th = how many neighbours to try for OOV words
    sim_th = 50
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(W2V_MODEL_PATH, binary=False)

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end a talk, please type "exit". \n)')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        # Tokenize: lowercase + NFKC for English, morphological parse for Japanese.
        if args.lang == 'en':
            input_vocab = [unicodedata.normalize('NFKC', word.lower()) for word in word_tokenize(sentence)]
        elif args.lang == 'ja':
            input_vocab = parse_ja_text(sentence)
        # The model consumes the sentence reversed, prefixed with <eos>.
        input_vocab.reverse()
        input_vocab.insert(0, "<eos>")

        # convert word into ID
        input_sentence = []
        for word in input_vocab:
            if corpus.dic.token2id.get(word) is not None:
                input_sentence.append(corpus.dic.token2id.get(word))
            else:
                # Out-of-vocabulary: fall back to the nearest word2vec
                # neighbour that exists in the dictionary, else <unk>.
                try:
                    sim_words = w2v_model.most_similar(positive=[word], topn=sim_th)
                    for index, candidate_tuple in enumerate(sim_words):
                        if corpus.dic.token2id.get(candidate_tuple[0]) is not None:
                            input_sentence.append(corpus.dic.token2id.get(candidate_tuple[0]))
                            break
                        if index == sim_th - 1:
                            input_sentence.append(corpus.dic.token2id['<unk>'])
                except KeyError:
                    # word absent from the word2vec vocabulary
                    input_sentence.append(corpus.dic.token2id['<unk>'])

        # input a sentence into model
        model.initialize()  # initialize cell
        sentence = model.generate(input_sentence, sentence_limit=len(input_sentence) + 30,
                                  word2id=corpus.dic.token2id, id2word=corpus.dic)
        print("-> ", sentence)
        print('')
def ganadores_por_artista(req):
    """Dialogflow webhook handler: list the awards won by a given artist.

    Reads the 'artista' parameter from the request, turns it into a
    WordPress category slug, fetches the winning posts for that category
    and, for each post, its tags (award categories), and assembles a
    Spanish-language answer string.
    """
    try:
        artista = req.get('queryResult').get('parameters').get('artista')
    except AttributeError:
        return '¿Podrías especificar el nombre de un artista?'
    if str(artista)=='':
        return 'Estas seguro de que esa persona esta compitiendo?. Lucas acá me dice que no.'
    print(str(artista), file=sys.stderr)
    # Build the category slug: NFKD-normalized, lower-cased, spaces -> dashes.
    slug = str(unicodedata.normalize('NFKD', artista)).lower().replace(" ", "-")
    print(slug, file=sys.stderr)
    #tag = translate_tags[cat]
    data = {}
    response = requests.get(
        url_win+'categories?slug='+slug,
        params=data
    )
    rjson = response.json()
    print(str(rjson), file=sys.stderr)
    cat_id = rjson[0].get('id')  # need cat id to get posts/videos
    print(cat_id, file=sys.stderr)
    r = 'posts?categories='+str(cat_id)
    response = requests.get(
        url_win+r,
        params=data
    )
    rjson = response.json()
    answer = 'Este artista ganó '
    i=0
    for video in rjson:
        slug = video.get('slug')
        data = {}
        response = requests.get(
            url_win+'posts?slug='+slug,
            params=data
        )
        _rjson = response.json()[0]
        tags = _rjson.get('tags')
        # Fetch all of this post's tags in a single include= request;
        # str(list) formatting is stripped of brackets to build the CSV.
        r = 'tags?include='
        r = r + str(tags).replace('[','').replace(']', '')
        response = requests.get(
            url_win+r,
            params=data
        )
        _rjson = response.json()
        # An error payload is a dict, not a list -- skip this video.
        if type(_rjson)!=list:
            continue
        print('tags----' + str(_rjson), file=sys.stderr)
        video_cats = []
        answer = answer + 'por el video "'+ video.get('title').get('rendered') + '" en '
        for categoria in _rjson:
            #print(str(video[0]), file=sys.stderr)
            print(str(categoria), file=sys.stderr)
            video_cats.append(categoria.get('description'))
        answer = answer + and_last_comma(str(video_cats).replace('[','').replace(']', '').replace('\'','')) + ''
        if i != len(rjson) - 1:
            answer = answer + ', por '
        i = i+1
    # No video produced any award text -> artist won nothing.
    if answer=="Este artista ganó ":
        return "Este artista no obtuvo premios"
    return html.unescape(answer)
def normalize_unicode(text):
    """Return *text* in NFKD (compatibility-decomposed) Unicode form."""
    return unicodedata.normalize("NFKD", text)
def validate(self, item):
    """Clean *item* to ASCII: strip accents, whitespace, and semicolons.

    Returns '' when *item* cannot be processed (e.g. None or a non-str).

    Fix: the original left the value as bytes after ``.encode()``, so
    ``.replace(';','')`` with a str argument raised TypeError on Python 3
    and the bare ``except:`` silently turned EVERY input into '';
    decode back to str and catch only Exception (not KeyboardInterrupt).
    """
    try:
        return unicodedata.normalize('NFKD', item).encode(
            'ascii', 'ignore').decode('ascii').strip().replace(';', '')
    except Exception:
        # Best-effort contract: any failure yields the empty string.
        return ''
def getNormalizedFilesList(dir):
    """List the entries of *dir* with each filename NFD-normalized."""
    return [unicodedata.normalize("NFD", entry) for entry in os.listdir(dir)]
def remove_accents(input_str):
    """NFKD-decompose *input_str* and return the ASCII-only bytes (accents dropped)."""
    decomposed = unicodedata.normalize('NFKD', input_str)
    return decomposed.encode('ASCII', 'ignore')
def tag2sentences(tag):
    """NFKC-normalize the tag's text and wrap it in a single-item list."""
    normalized = unicodedata.normalize("NFKC", tag.text)
    return [normalized]
def getNormalizedString(fname):
    """Return *fname* in NFD (canonically decomposed) Unicode form."""
    return unicodedata.normalize("NFD", fname)
def remove_accents(input_str):
    # Borrowed from https://stackoverflow.com/a/517974/1509718
    """NFKD-decompose *input_str* and drop combining marks, returning an accent-free str."""
    decomposed = unicodedata.normalize('NFKD', input_str)
    return u"".join(ch for ch in decomposed if not unicodedata.combining(ch))
def to_python(self, value):
    """NFKC-normalize the value produced by the parent field's to_python()."""
    cleaned = super().to_python(value)
    return unicodedata.normalize('NFKC', cleaned)