def scrape_movie_data_to_dic(self,request):
     """
     Takes a requests.get() response and returns a dictionary with the desired values.
     """
     soup = BeautifulSoup(request.text)
     dic={}
     
     ##add the title
     value= soup.find("h1", { "itemprop" : "name" }).text
     value= unicodedata.normalize('NFKD', value).encode('ascii','ignore')
     dic['Movie Title']=value
     
     ##grab the data from each item in the finance section
     finances=soup.find("table", { "id" : "movie_finances" })
     for item in finances.findAll("td", { "class" : "data" }):
         ##get the key
         key=item.previousSibling.previousSibling.text
         key=unicodedata.normalize('NFKD', key).encode('ascii','ignore')
         ##get the value
         value=unicodedata.normalize('NFKD', item.text).encode('ascii','ignore')
         dic[key]= value
     
     ##grab the data from each row in the summary section
     data=soup.find("div", { "id" : "summary" }).findAll('table')[1]
     for item in data.findAll('tr'):
         ##change text to string and replace new line with '' and split by :
         s=unicodedata.normalize('NFKD', item.text).encode('ascii','ignore')
         s= s.replace("\n",'').split(':')
         dic[s[0]]=[s[1]]
     
     return dic
def parseFileName(name):
    nameString = dropInsideContent(name,"[","]" )
    nameString = dropInsideContent(nameString,"{","}" )
    nameString = dropInsideContent(nameString,"(",")" )    
    nameString = nameString.strip('()_{}[]!@#$^&*+=|\\/"\'?<>~`')
    nameString = nameString.lstrip(' ')
    nameString = nameString.rstrip(' ')
    nameString = dropInsideContent(nameString,"{","}" )
    nameString = nameString.lower()
    nameString = nameString.replace("\t", " ")
    nameString = nameString.replace("  ", " ")

    try:
        nameString = unicodedata.normalize('NFKD', nameString).encode('ascii', 'ignore')
    except:
        try:
            nameString = nameString.decode('latin-1', 'ignore')
            nameString = unicodedata.normalize('NFKD', nameString).encode('ascii', 'ignore')
        except:
            nameString = "unknown"
    if len(nameString)==0: nameString=" "
    
    return nameString
Example #3
    def get_completions(self, document, complete_event):
        if not document.current_line.strip():
            return

        used, matches = self.ipy_completer.complete(
                            line_buffer=document.current_line,
                            cursor_pos=document.cursor_position_col
        )
        start_pos = -len(used)
        for m in matches:
            m = unicodedata.normalize('NFC', m)

            # When the first character of the completion has a zero length,
            # then it's probably a decomposed unicode character. E.g. caused by
            # the "\dot" completion. Try to compose again with the previous
            # character.
            if wcwidth(m[0]) == 0:
                if document.cursor_position + start_pos > 0:
                    char_before = document.text[document.cursor_position + start_pos - 1]
                    m = unicodedata.normalize('NFC', char_before + m)

                    # Yield the modified completion instead, if this worked.
                    if wcwidth(m[0:1]) == 1:
                        yield Completion(m, start_position=start_pos - 1)
                        continue

            yield Completion(m, start_position=start_pos)
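# A standalone sketch (not part of the completer above) of the NFC composition
# step the comment describes: a bare combining mark folds into the preceding
# base character when a precomposed form exists.
import unicodedata
assert unicodedata.normalize('NFC', 'a' + '\u0307') == '\u0227'  # 'a' + COMBINING DOT ABOVE -> 'ȧ'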
    def get_cast_crew(self,url):
        request=get_file(url)
        soup = BeautifulSoup(request.text)
        main_dic={}

        lst=[u'Cast',u'Production and Technical Credits']
        for i in xrange(len(lst)):
            main_dic[lst[i]]=np.nan
            dic={}
            try:
                lst[i]=soup.findAll('div',{'id':'cast'})[i].find('h1').text
                for row in soup.findAll('div',{'id':'cast'})[i].findAll('tr'):
                    position, filler, name = row.findAll('td')
                    position= unicodedata.normalize('NFKD', position.text).encode('ascii','ignore')
                    name = unicodedata.normalize('NFKD', name.text).encode('ascii','ignore')
                    if position in dic:
                        dic[position]+=[name]
                    else:
                        dic[position]=[name]
                dic=json.dumps(dic)
            except:
                dic=np.nan

            main_dic[lst[i]]=dic
        return main_dic
def find_all_translations(soup):
	file_string = ''

	for word_data in soup.find_all("td", class_="list-title"):
		part_link = word_data.find("a")['href']
		full_link = domain + part_link

		soup2 = getSoup(full_link)

		translations = soup2.find("article", class_="item-page").find_all(style="text-align: center;")

		for translation in translations:
			tagalog = translation.find(['b', 'strong'])
			new_line = translation.find('br')

			if new_line:
				english = new_line.next_sibling
			else:
				english = None

			if tagalog and english and tagalog.string and english.string is not None:
				if ' ' not in tagalog.string.strip() and tagalog.string is not english.string:
					file_string += unicodedata.normalize('NFD', tagalog.string.strip()).encode('ascii', 'ignore').decode("utf-8") + "\n"
					file_string += unicodedata.normalize('NFD', str([word.strip() for word in english.string.strip().split(',')])).encode('ascii', 'ignore').decode("utf-8") + "\n"
					file_string += "\n"

	f = open('translations.txt', 'a')
	f.write(file_string)
	f.close()

	next_page_link = soup.find('li', class_='pagination-next').find('a')['href']

	print('Parsing %s...'%(domain + next_page_link))
	find_all_translations(getSoup(domain + next_page_link))
def getPerson(num, file):
        res1=list()
        os.system('clear')
        while len(res1)==0:
          x = input(str(num) + '. name : ')  #
          file.seek(0, 0)
          for line in file.readlines():
            input_normalized = unicodedata.normalize('NFKD', x.strip().lower()).encode('ascii','ignore').decode('ascii')
            line_normalized = unicodedata.normalize('NFKD', line.split(';')[0].lower()).encode('ascii','ignore').decode('ascii')
            birth=line.split(';')[1]
            full_match=True
            for input_word in input_normalized.split(' '):
                if input_word not in line_normalized:
                    full_match = False
            if full_match and input_normalized.strip() and birth and '?' not in birth:
                res1.append(line.strip())
          os.system('clear')   
          if len(res1)>SEARCH_LIMIT:  #limit number of found entries for easier item selection
             res1=list()
             print('Please enter more specific keyword')
          elif len(res1)==0:
             print('No matching entry found')   
          

        sel=0
        while len(res1)>1:
           c=getsel(res1, sel) #get user action - selection or cursor move
           if c==curses.KEY_DOWN and sel<(len(res1)-1):  
                  sel+=1
           elif c ==curses.KEY_UP and sel>0:
                  sel-=1
           elif c == curses.KEY_ENTER or c == 13:
               break
        return res1[sel]       #return selected item
def crawler():
        arr=["http://www.imdb.com/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2398042102&pf_rd_r=07XG6QFJZEE6BBVY6J2Z&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1"]
        fp=open('data.csv',"w")
        a=csv.writer(fp,delimiter=',',quotechar="$")
        visited=[]
        c=0
        while c<200:
            page=arr.pop()
            if page not in visited: 
                r=requests.get(page)
                soup=bs4.BeautifulSoup(r.text)
                rate=unicodedata.normalize('NFKD',soup.find("span",attrs={"itemprop":"ratingValue"}).string).encode('ascii','ignore')
                n=float(rate)
                if n>6.5 and n<=8.5:
                    c=c+1
                    name=unicodedata.normalize('NFKD',soup.find("h1",attrs={"itemprop":"name"}).text).encode('ascii','ignore')
                    year=soup.find(attrs={"id":"titleYear"}).text
                    director=unicodedata.normalize('NFKD',soup.find("span",attrs={"itemprop":"name"}).string).encode('ascii','ignore')
                    print([c,name,year,director,n])
                    a.writerow([c,name,year,director,n])
                divs=soup.find_all('div',attrs={"class":"rec-title"})
                links=[div.find('a')['href'] for div in divs]
                links=[urljoin(page,link) for link in links]
                arr=list(set(arr)|set(links))
                visited.append(page)
        fp.close()
Example #8
 def mnemonic_to_seed(self, mnemonic, passphrase):
     # trezor uses bip39
     import pbkdf2, hashlib, hmac
     PBKDF2_ROUNDS = 2048
     mnemonic = unicodedata.normalize('NFKD', ' '.join(mnemonic.split()))
     passphrase = unicodedata.normalize('NFKD', passphrase)
     return pbkdf2.PBKDF2(mnemonic, 'mnemonic' + passphrase, iterations = PBKDF2_ROUNDS, macmodule = hmac, digestmodule = hashlib.sha512).read(64)
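# A rough Python 3 equivalent of the derivation above, sketched with only the
# standard library (hashlib.pbkdf2_hmac instead of the pbkdf2 package); the
# function name is illustrative, not part of the original example.
import hashlib
import unicodedata

def bip39_seed(mnemonic, passphrase=''):
    # BIP-39 NFKD-normalizes both inputs before PBKDF2-HMAC-SHA512 key stretching
    mnemonic = unicodedata.normalize('NFKD', ' '.join(mnemonic.split()))
    passphrase = unicodedata.normalize('NFKD', passphrase)
    return hashlib.pbkdf2_hmac('sha512', mnemonic.encode('utf-8'),
                               ('mnemonic' + passphrase).encode('utf-8'),
                               2048, dklen=64)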
Example #9
 def test_greek_print_ipa(self):
     """Test the Word class's `_print_ipa` in Greek."""
     w = grc.Word("élipe", grc.GREEK["Attic"]["Probert"])
     output = [w._print_ipa(True), w._print_ipa(False)]
     target = [unicodedata.normalize('NFC', "é.li.pe"),
                 unicodedata.normalize('NFC', "élipe")]
     self.assertEqual(output, target)
Example #10
def output(index):
	i = 0
	totalList = []
	while i < index:
		totalList.append(addr[i].total)
		i += 1

	totalList.sort()
	maxTotal = totalList[index-1]
	line = 0
	i = 0
	while i < index:
		if addr[i].total == maxTotal:
			line += 1
		i += 1
	i = 0
	l = 0
	print "\"",
	while i < index:
		if addr[i].total == maxTotal:
			if l < line-1:
				print u"\b%s, 最高成交價:%d, 最低成交價:%d" %(unicodedata.normalize('NFKD', addr[i].road), addr[i].maxP, addr[i].minP)
				l += 1
			else:
				print u"%s, 最高成交價:%d, 最低成交價:%d\"" %(unicodedata.normalize('NFKD', addr[i].road), addr[i].maxP, addr[i].minP)
		i += 1
Example #11
def ok_to_send(day_start, day_end):
    now = datetime.datetime.now().time()
    dstart = str.split(
        unicodedata.normalize(
            'NFKD', day_start).encode(
                'ascii', 'ignore'), ":")

    dend = str.split(
        unicodedata.normalize(
            'NFKD', day_end).encode(
                'ascii', 'ignore'), ":")

    on_time = datetime.time(int(dstart[0]), int(dstart[1]))
    off_time = datetime.time(int(dend[0]), int(dend[1]))
    when, matching = check_time(now, on_time, off_time)
    should_I_send = False
    if matching:
        if when == DAY:
            return True
        elif when == NIGHT:
            return False
        else:
            return False
    else:
        return False
Example #12
    def test_names(self, data, time_locale):
        # GH 17354
        # Test .weekday_name, .day_name(), .month_name
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            assert data.weekday_name == 'Monday'
        if time_locale is None:
            expected_day = 'Monday'
            expected_month = 'August'
        else:
            with tm.set_locale(time_locale, locale.LC_TIME):
                expected_day = calendar.day_name[0].capitalize()
                expected_month = calendar.month_name[8].capitalize()

        result_day = data.day_name(time_locale)
        result_month = data.month_name(time_locale)

        # Work around https://github.com/pandas-dev/pandas/issues/22342
        # different normalizations

        if not PY2:
            expected_day = unicodedata.normalize("NFD", expected_day)
            expected_month = unicodedata.normalize("NFD", expected_month)

            result_day = unicodedata.normalize("NFD", result_day,)
            result_month = unicodedata.normalize("NFD", result_month)

        assert result_day == expected_day
        assert result_month == expected_month

        # Test NaT
        nan_ts = Timestamp(NaT)
        assert np.isnan(nan_ts.day_name(time_locale))
        assert np.isnan(nan_ts.month_name(time_locale))
Example #13
    def test_listdir2_returns_name_stat_pairs(self):
        funny_unicode = u'M\u00E4kel\u00E4'
        funny_utf8 = funny_unicode.encode('utf-8')

        self.fs.write_file(funny_utf8, 'data')
        pairs = self.fs.listdir2('.')
        self.assertEqual(len(pairs), 1)
        self.assertEqual(len(pairs[0]), 2)
        name_utf8, st = pairs[0]

        self.assertEqual(type(name_utf8), str)
        name_unicode = name_utf8.decode('utf-8')

        # See https://en.wikipedia.org/wiki/Unicode_equivalence for
        # background. The NFKD normalisation seems to be the best way
        # to ensure things work across Linux and Mac OS X both (their
        # default normalisation for filenames is different).
        self.assertEqual(
            unicodedata.normalize('NFKD', name_unicode),
            unicodedata.normalize('NFKD', funny_unicode))

        self.assertTrue(hasattr(st, 'st_mode'))
        self.assertFalse(hasattr(st, 'st_mtime'))
        self.assertTrue(hasattr(st, 'st_mtime_sec'))
        self.assertTrue(hasattr(st, 'st_mtime_nsec'))
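# A minimal standalone check of the equivalence the comment above relies on:
# macOS tends to store decomposed filenames while Linux keeps the bytes it was
# given, so canonically equivalent names only compare equal after normalisation.
import unicodedata

composed = u'M\u00E4kel\u00E4'                        # precomposed 'ä'
decomposed = unicodedata.normalize('NFD', composed)   # 'a' + combining diaeresis
assert composed != decomposed
assert (unicodedata.normalize('NFKD', composed) ==
        unicodedata.normalize('NFKD', decomposed))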
Example #14
 def clean_song_data(self, artist, title):
     # convert to lowercase
     artist = artist.lower()
     title = title.lower()
     
     # remove accents
     artist = unicodedata.normalize('NFKD', artist)
     artist = "".join([c for c in artist if not unicodedata.combining(c)])
     title = unicodedata.normalize('NFKD', title)
     title = "".join([c for c in title if not unicodedata.combining(c)])
     
     if self.ignore_brackets:
         LYRICS_TITLE_STRIP.append(r"\(.*\)")
 
     # replace ampersands and the like
     for exp in LYRICS_ARTIST_REPLACE:
         artist = re.sub(exp[0], exp[1], artist)
     for exp in LYRICS_TITLE_REPLACE:
         title = re.sub(exp[0], exp[1], title)
 
     # strip things like "(live at Somewhere)", "(acoustic)", etc
     for exp in LYRICS_TITLE_STRIP:
         title = re.sub (exp, '', title)
 
     # compress spaces
     title = title.strip()
     artist = artist.strip()
             
     return (artist, title)
Example #15
    def freeze(self):
        """Clean the destination and build all URLs from generators."""
        remove_extra = self.app.config['FREEZER_REMOVE_EXTRA_FILES']
        if not os.path.isdir(self.root):
            os.makedirs(self.root)
        if remove_extra:
            ignore = self.app.config['FREEZER_DESTINATION_IGNORE']
            previous_files = set(
                # See https://github.com/SimonSapin/Frozen-Flask/issues/5
                normalize('NFC', os.path.join(self.root, *name.split('/')))
                for name in walk_directory(self.root, ignore=ignore))
        seen_urls = set()
        seen_endpoints = set()
        built_files = set()

        for url, endpoint in self._generate_all_urls():
            seen_endpoints.add(endpoint)
            if url in seen_urls:
                # Don't build the same URL more than once
                continue
            seen_urls.add(url)
            new_filename = self._build_one(url)
            built_files.add(normalize('NFC', new_filename))

        self._check_endpoints(seen_endpoints)
        if remove_extra:
            # Remove files from the previous build that are not here anymore.
            for extra_file in previous_files - built_files:
                os.remove(extra_file)
                parent = os.path.dirname(extra_file)
                if not os.listdir(parent):
                    # The directory is now empty, remove it.
                    os.removedirs(parent)
        return seen_urls
Example #16
    def test_list_notebooks(self):
        nbs = notebooks_only(self.nb_api.list().json())
        self.assertEqual(len(nbs), 1)
        self.assertEqual(nbs[0]['name'], 'inroot.ipynb')

        nbs = notebooks_only(
            self.nb_api.list('/Directory with spaces in/').json())
        self.assertEqual(len(nbs), 1)
        self.assertEqual(nbs[0]['name'], 'inspace.ipynb')

        nbs = notebooks_only(self.nb_api.list(u'/unicodé/').json())
        self.assertEqual(len(nbs), 1)
        self.assertEqual(nbs[0]['name'], 'innonascii.ipynb')
        self.assertEqual(nbs[0]['path'], u'unicodé')

        nbs = notebooks_only(self.nb_api.list('/foo/bar/').json())
        self.assertEqual(len(nbs), 1)
        self.assertEqual(nbs[0]['name'], 'baz.ipynb')
        self.assertEqual(nbs[0]['path'], 'foo/bar')

        nbs = notebooks_only(self.nb_api.list('foo').json())
        self.assertEqual(len(nbs), 4)
        nbnames = {normalize('NFC', n['name']) for n in nbs}
        expected = [u'a.ipynb', u'b.ipynb',
                    u'name with spaces.ipynb', u'unicodé.ipynb']
        expected = {normalize('NFC', name) for name in expected}
        self.assertEqual(nbnames, expected)

        nbs = notebooks_only(self.nb_api.list('ordering').json())
        nbnames = [n['name'] for n in nbs]
        expected = ['A.ipynb', 'b.ipynb', 'C.ipynb']
        self.assertEqual(nbnames, expected)
Example #17
 def tokenizeComparison(self, given, correct):
     # compare in NFC form so accents appear correct
     given = ucd.normalize("NFC", given)
     correct = ucd.normalize("NFC", correct)
     s = difflib.SequenceMatcher(None, given, correct, autojunk=False)
     givenElems = []
     correctElems = []
     givenPoint = 0
     correctPoint = 0
     offby = 0
     def logBad(old, new, str, array):
         if old != new:
             array.append((False, str[old:new]))
     def logGood(start, cnt, str, array):
         if cnt:
             array.append((True, str[start:start+cnt]))
     for x, y, cnt in s.get_matching_blocks():
         # if anything was missed in correct, pad given
         if cnt and y-offby > x:
             givenElems.append((False, "-"*(y-x-offby)))
             offby = y-x
         # log any proceeding bad elems
         logBad(givenPoint, x, given, givenElems)
         logBad(correctPoint, y, correct, correctElems)
         givenPoint = x+cnt
         correctPoint = y+cnt
         # log the match
         logGood(x, cnt, given, givenElems)
         logGood(y, cnt, correct, correctElems)
     return givenElems, correctElems
Example #18
def create_fake_user():
    first_name = fake.first_name()
    last_name = fake.last_name()

    _first = unicodedata.normalize('NFD', first_name).encode('ascii', 'ignore')
    _last = unicodedata.normalize('NFD', last_name).encode('ascii', 'ignore')

    email = u'*****@*****.**' % (_first.lower(), _last.lower())

    user = User.objects.create_user(email=email, password='******')

    address = Address.objects.create(
        first_name=first_name,
        last_name=last_name,
        street_address_1=fake.street_address(),
        city=fake.city(),
        postal_code=fake.postcode(),
        country=fake.country_code())

    user.addresses.add(address)
    user.default_billing_address = address
    user.default_shipping_address = address
    user.is_active = True
    user.save()
    return user
def HandleSqlite(SFile):
    print "\n[INFO] SQLite DB Extraction"
    try:
        data = ''
        con = sq.connect(SFile)
        cur = con.cursor()
        cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cur.fetchall()
        for table in tables:
            data += "\nTABLE: " + str(table[0]).decode('utf8', 'ignore') + \
                " \n=====================================================\n"
            cur.execute("PRAGMA table_info('%s')" % table)
            rows = cur.fetchall()
            head = ''
            for r in rows:
                z = r[1]
                if type(z) is unicode:
                    z = unicodedata.normalize(
                        'NFKD', z).encode('ascii', 'ignore')
                head += str(z).decode('utf8', 'ignore') + " | "
            data += head + " \n=====================================================================\n"
            cur.execute("SELECT * FROM '%s'" % table)
            rows = cur.fetchall()
            for r in rows:
                dat = ''
                for x in r:
                    if type(x) is unicode:
                        x = unicodedata.normalize(
                            'NFKD', x).encode('ascii', 'ignore')
                    dat += str(x).decode('utf8', 'ignore') + " | "
                data += dat + "\n"
        return data
    except:
        PrintException("[ERROR] SQLite DB Extraction")
        pass
    def _getPDFText(self, filename, d):
        logger.debug(u"filename: %s" % filename)
        newparatextlist = list()

        try:
            pdfDoc = PdfFileReader(file(filename, u"rb"))

            pdfDict = pdfDoc.getDocumentInfo()

            for x in pdfDict.keys():
                d.addConceptKeyType(x[1:], pdfDict[x])

            # c.logConcepts()

            for page in pdfDoc.pages:
                text = page.extractText()
                if not isinstance(text, str):
                    text = unicodedata.normalize(u'NFKD', text).encode(u'ascii', u'ignore')

                logger.debug(u"PDF : %s" % text)

                newparatextlist.append(text + u". ")

            return newparatextlist

        except Exception, msg:
            logger.error(u"%s" % msg)
Example #21
def strings_equal(s1, s2):
    """
    Timing-attack resistant string comparison.

    Normal comparison using == will short-circuit on the first mismatching
    character. This avoids that by scanning the whole string, though we
    still reveal to a timing attack whether the strings are the same
    length.
    """
    s1 = unicodedata.normalize('NFKC', s1)
    s2 = unicodedata.normalize('NFKC', s2)
    try:
        # Python 3.3+ and 2.7.7+ include a timing-attack-resistant
        # comparison function, which is probably more reliable than ours.
        # Use it if available.
        from hmac import compare_digest

        return compare_digest(s1, s2)
    except ImportError:
        pass

    if len(s1) != len(s2):
        return False

    differences = 0
    for c1, c2 in zip(s1, s2):
        differences |= ord(c1) ^ ord(c2)
    return differences == 0
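# A small standalone check (not part of the function above) of why the NFKC
# step matters: the same password typed in composed and decomposed form only
# compares equal after normalization.
import unicodedata

composed = u'\u00e9'      # 'é' as a single code point
decomposed = u'e\u0301'   # 'e' + COMBINING ACUTE ACCENT
assert composed != decomposed
assert (unicodedata.normalize('NFKC', composed) ==
        unicodedata.normalize('NFKC', decomposed))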
def artist_search(results, media, lang, artist_name):

  # Precompose.
  try:
    artist_name = unicodedata.normalize('NFKD', artist_name.decode('utf-8'))
  except UnicodeError:
    artist_name = unicodedata.normalize('NFKD', artist_name)

  # Strip diacritics.
  stripped = u''
  for i in range(len(artist_name)):
    point = artist_name[i]
    if not unicodedata.combining(point):
      stripped += point
  artist_name = stripped


  json_obj = JSON.ObjectFromURL('http://127.0.0.1:32400/services/vevo/search?q=%s&artistsLimit=6&videosLimit=1' % (String.Quote(artist_name)))

  score = 100
  normalized_artist_name = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist_name))
  for artist in json_obj['artists']:

    # Require a perfect match after normalization to avoid false positives.
    normalized_artist_result = Core.messaging.call_external_function('com.plexapp.agents.plexmusic', 'MessageKit:NormalizeArtist', kwargs = dict(artist=artist['name']))
    Log('Sanity checking normalized artist: %s against Vevo result: %s' % (normalized_artist_name, normalized_artist_result))
    if normalized_artist_name == normalized_artist_result:        
      results.add(SearchResult(
        id = artist['urlSafeName'],
        score = score
      ))
      score = score - 1
Example #23
    def CrearPedidoCertificado(self, cuit="", empresa="", nombre="pyafipws",
                                     filename="empresa.csr"):
        "Crear un certificate signing request (X509 CSR)"
        from M2Crypto import RSA, EVP, X509

        # create the certificate signing request (CSR):
        self.x509_req = X509.Request ()

        # normalize the encoding (replace accents, ñ, etc.)
        if isinstance(empresa, unicode):
            empresa = unicodedata.normalize('NFKD', empresa).encode('ASCII', 'ignore')
        if isinstance(nombre, unicode):
            nombre = unicodedata.normalize('NFKD', nombre).encode('ASCII', 'ignore')

        # subject: C=AR/O=[empresa]/CN=[nombre]/serialNumber=CUIT [nro_cuit]
        x509name = X509.X509_Name ()
        # default OpenSSL parameters:
        kwargs = {"type": 0x1000 | 1, "len": -1, "loc": -1, "set": 0}
        x509name.add_entry_by_txt(field='C', entry='AR', **kwargs)
        x509name.add_entry_by_txt(field='O', entry=empresa, **kwargs)
        x509name.add_entry_by_txt(field='CN', entry=nombre, **kwargs)
        x509name.add_entry_by_txt(field='serialNumber', entry="CUIT %s" % str(cuit), **kwargs)     
        self.x509_req.set_subject_name(x509name)

        # sign the request with the previously created key (CrearClavePrivada)
        self.x509_req.set_pubkey (pkey=self.pkey)
        self.x509_req.sign(pkey=self.pkey, md='sha256')
        # save the CSR result to a file:
        f = open(filename, "w")
        f.write(self.x509_req.as_pem())
        f.close()
        return True
Example #24
	def toRSSItem(self):
		title = self.repo.tagname
		if self.message and len(self.message) > 50: title += " - " + self.message[:50] + "..."
		elif self.message: title += " - " + self.message
		if self.dbkeywords: title += " - " + ",".join(self.dbkeywords)
		
		description  = "<pre>"
		description += self.getpprint()
		description += "</pre>"
		
		title = unicodedata.normalize('NFKD', unicode(title, 'utf-8')).encode('ascii', 'ignore')
		description = unicodedata.normalize('NFKD', unicode(description, 'utf-8')).encode('ascii', 'ignore')

		link = ''
		if self.repo.viewlink:
			link = self.repo.viewlink.replace('%ID', self.uniqueid)

		item = RSSItem(
			title = title,
			link = link,
			description = description,
			guid = Config.rooturl + "/commit/" + self.repo.tagname + "/" + self.uniqueid,
			pubDate = unixToDatetime(self.date)
			)
		return item
Example #25
def normalize_token(data):
    # credit: http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    data = unicodedata.normalize(
        "NFC", "".join((c for c in unicodedata.normalize("NFD", data) if unicodedata.category(c) != "Mn")).lower()
    )
    data = re.sub(ur"['’]", "", data)
    return data
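# e.g. normalize_token(u"Renée’s") would return u"renees" (Python 2, given the
# ur"" literal above): the accent is stripped via NFD and the apostrophe removed.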
Example #26
 def redirect_if_needed(self, i):
     params = {}
     need_redirect = False
     for k, v in i.items():
         if k in plurals:
             params[k] = None
             k = plurals[k]
             need_redirect = True
         if isinstance(v, list):
             if v == []:
                 continue
             clean = [normalize('NFC', b.strip()) for b in v]
             if clean != v:
                 need_redirect = True
             if len(clean) == 1 and clean[0] == u'':
                 clean = None
         else:
             clean = normalize('NFC', v.strip())
             if clean == '':
                 need_redirect = True
                 clean = None
             if clean != v:
                 need_redirect = True
         params[k] = clean
     if need_redirect:
         raise web.seeother(web.changequery(**params))
Example #27
 def __init__(self):
     if xbmc:
         self.RssFeedsPath = xbmc.translatePath('special://userdata/RssFeeds.xml').decode("utf-8")
     else:
         self.RssFeedsPath = r'C:\Documents and Settings\Xerox\Application Data\XBMC\userdata\RssFeeds.xml'
     sane = self.checkRssFeedPathSanity()
     if sane:
         try:
             self.feedsTree = parse(self.RssFeedsPath)
         except:
             log('[script] RSS Editor --> Failed to parse ' + unicodedata.normalize( 'NFKD', self.RssFeedsPath ).encode( 'ascii', 'ignore' ))
             regen = xbmcgui.Dialog().yesno(getLS(40), getLS(51), getLS(52), getLS(53))
             if regen:
                 log('[script] RSS Editor --> Attempting to Regenerate RssFeeds.xml')
                 xml = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<rssfeeds>\n\
                 <!-- RSS feeds. To have multiple feeds, just add a feed to the set. You can also have multiple sets. 	!-->\n\
                 <!-- To use different sets in your skin, each must be called from skin with a unique id.             	!-->\n\
                 <set id="1">\n    <feed updateinterval="30">http://feeds.feedburner.com/xbmc</feed>\n  </set>\n</rssfeeds>'
                 f = open(self.RssFeedsPath, 'w')
                 f.write(xml)
                 f.close()
                 self.__init__()
             else:
                 log('[script] RSS Editor --> User opted to not regenerate RssFeeds.xml.  Script Exiting')
                 self.feedsTree = False
         if self.feedsTree:
             self.feedsList = self.getCurrentRssFeeds()
     else:
         self.feedsTree = False
         self.feedsList = False
         log('[SCRIPT] RSS Editor --> Could not open ' + unicodedata.normalize( 'NFKD', self.RssFeedsPath ).encode( 'ascii', 'ignore' ) +'. Either the file does not exist, or its size is zero.')
def add_other_bank_account(request):
  """
  Add a receiver at another bank to whom the user wants to transfer money.
  Fills in all of the receiver's details and validates them.
  """
  try:
    cust_id=request.session.get('user_id')
    name=request.POST["name"]
    connected_acc_no1=request.POST["account_no"]
    confirm_acc_no=request.POST["account_no_2"]
    addressline1=request.POST["line1"]
    addressline2=request.POST["line2"]
    addressline3=request.POST["line3"]
    IFSC_code1=request.POST["IFSC"]
    limit1=request.POST["limit"]

    error1="Account Confirmation Failed"
    error2="Please Enter Valid numbers in fields"
    error3="Please Enter numeral entries in fields"
    error4="Sorry The account you wish to connect does not exist"
    error6="Account Already Added"
    error7="IFSC code does no exists"
    if(connected_acc_no1!=confirm_acc_no):
        return render_to_response("add_other_bank_account.html",{'error':error1,'STATIC_URL':"/static/"})
    limit=unicodedata.normalize('NFKD', limit1).encode('ascii','ignore')
    connected_acc_no=unicodedata.normalize('NFKD', connected_acc_no1).encode('ascii','ignore')
    IFSC_code=unicodedata.normalize('NFKD', IFSC_code1).encode('ascii','ignore')
    try:
        i = float(limit)
    except (ValueError, TypeError):
        return render_to_response("add_other_bank_account.html",{'error':error3,'STATIC_URL':"/static/"})
    else:
Example #29
def noDiacritics(s):
    """Removes any diacritics"""

    # sanity check
    if s is None:
        return None

    # try the right way first
    try:
        strAux = unicode(s, 'utf-8')
        # remove some chars
        strAux = strAux.replace(unichr(0xba), "")     # 4o
        strAux = strAux.replace(unichr(0xaa), "")     # 4a
        # normalization
        ret = unicodedata.normalize('NFKD', strAux)
        ret = ret.encode('ascii', 'ignore')
    except:
        ret = None

    # try as a unicode encoded string
    if ret is None:
        try:
            strAux = s.decode('utf-8')
            # remove some chars
            strAux = strAux.replace(unichr(0xba), "")     # 4o
            strAux = strAux.replace(unichr(0xaa), "")     # 4a
            # normalization
            ret = unicodedata.normalize('NFKD', strAux)
            ret = ret.encode('ascii', 'ignore')
        except:
            ret = s     # return as received

    return ret
    def fromUser(self, screen_name, tweets_number=10, is_bot=False):
        user = self.createUser(screen_name, is_bot)

        tweets = self.twitter_client.user_timeline(screen_name=screen_name, count=tweets_number)
        for i, status in enumerate(tweets):
            tweet = status._json
            text = tweet['text']
            date = tweet['created_at']
            entities = tweet['entities']
            user_mentions = entities['user_mentions']
            mentions_list = []

            if len(user_mentions) > 0:
                for mention in user_mentions:
                    mentions_list.append(mention['screen_name'])

            text_string = unicodedata.normalize('NFKD', text).encode('ascii','ignore')
            date_string = unicodedata.normalize('NFKD', date).encode('ascii','ignore')
            name_mentions_string = ",".join(mentions_list)

            Tweet.create(
                    user = user,
                    text = text_string,
                    date = date_string,
                    source = status.source,
                    mentions = name_mentions_string
            )
Example #31
def remove_accents(s: str) -> str:
    return "".join(c for c in unicodedata.normalize("NFD", s)
                   if not unicodedata.combining(c))
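# A brief usage sketch for the helper above (the input string is illustrative):
assert remove_accents("Crème brûlée") == "Creme brulee"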
Example #32
def main():

    os_v = os.uname()[2].split(".")[0]

    if os_v == "10":
    	source1 = "/System/Library/Input Methods/CharacterPalette.app/Contents/Frameworks/CharacterPaletteFramework.framework/Resources/kanji.db"
    elif (os_v > "10" and os_v < "18"):
    	source1 = "/System/Library/Input Methods/CharacterPalette.app/Contents/Resources/CharacterDB.sqlite3"
    else:
    	source1 = "/System/Library/Components/CharacterPalette.component/Contents/SharedSupport/\
    	CharPaletteServer.app/Contents/Frameworks/CharacterPaletteFramework.framework/Versions/A/Resources/kanji.db"

    bundleLibPath = os.environ["TM_BUNDLE_SUPPORT"] + "/lib/"

    source2 = bundleLibPath + "allHanForRadical.txt.zip"

    def lastCharInUCSdec(s):
        isPaneB = False
        if s:
            if u"\udc00" <= s[-1] <= u"\udfff" and len(s) >= 2 and u"\ud800" <= s[-2] <= u"\udbff":
                isPaneB = True
                return (((ord(s[-2])&0x3ff)<<10 | (ord(s[-1])&0x3ff)) + 0x10000, isPaneB)
            return (ord(s[-1]), isPaneB)
        return (-1, isPaneB)


    if "TM_SELECTED_TEXT" in os.environ: sys.exit(200)

    if os.environ["DIALOG"][-1] == '2':
        dialog2 = True
    else:
        dialog2 = False

    outDict = SeqDict()

    if "TM_CURRENT_LINE" in os.environ and "TM_LINE_INDEX" in os.environ and int(os.environ["TM_LINE_INDEX"]):
        line, x = os.environ["TM_CURRENT_LINE"], int(os.environ["TM_LINE_INDEX"])
    else:
        sys.exit(206)


    (lastCharDecCode, charIsPaneB) = lastCharInUCSdec(unicode(line[:x], "UTF-8"))
    char = wunichr(lastCharDecCode)
    lastCharUCShexCode = "%04X" % lastCharDecCode

    UnicodeData = os.popen("zgrep '^" + lastCharUCShexCode + ";' '" + bundleLibPath + 
                            "UnicodeData.txt.gz'").read().decode("utf-8")

    name = ""

    if not UnicodeData:
        name = getNameForRange(lastCharDecCode)
    else:
        (dummy1, name, category, combiningclass, bididir, 
        decomposition, numtype1, numtype2, numtype3, bidimirror, 
        oldname, comment, upcase, lowcase, titlecase) = UnicodeData.strip().split(';')

    if name[0] == '<': name = getNameForRange(lastCharDecCode)
    block = getBlockName(lastCharDecCode)

    outDict['Character'] = char
    outDict['Name'] = name
    outDict['Block'] = block

    # look for related chars
    frel = open(bundleLibPath + "relatedChars.txt", "rb")
    reldata = frel.read().decode("UTF-8")
    frel.close()
    for part in reldata.split('\n'):
        if char in part: break
    if part: outDict["Related to"] = part

    if "CJK" in name and ("IDEO" in name or "Ideo" in name):
        cmd = "zgrep -F '" + char + ",' '" + source2 + "'"
        gdata = os.popen(cmd.encode("UTF-8")).read().decode("UTF-8")
        if len(gdata) > 0:
            RadNum, RadStrokeCnt, RadName, Rad, ExtStrokeCnt, Dummy = gdata.split('\t')
            outDict['Radical (trad.)'] = [Rad, RadStrokeCnt, u"画", RadName, RadNum, ExtStrokeCnt]
            outDict['Strokes (trad.)'] = str(int(RadStrokeCnt) + int(ExtStrokeCnt))

        # get all data from Apple's internal UniDict
        cmd = "sqlite3 '" + source1 + "' 'select * from unihan_dict where uchr=\"" + char + "\";' 2>/dev/null"
        udata = os.popen(cmd.encode("UTF-8")).read().decode("UTF-8")
        if udata:
            (uChar, a1, readings, hangul_name_sound, pinyin, zhWubiXing, 
            zhWubiHua, zhBianhao, a2, zhCangjieCh, zhDayi, pinyin1, 
            Bopomofo, jaKun, jaOn, pinyin, zhCangjie) = udata.split('|')
            zhCangjie = zhCangjie.strip()
            if readings:
                japDict = SeqDict()
                kunon = readings.split('/')
                if kunon[0]: japDict['Kun'] = kunon[0]
                if kunon[1]: japDict['On']  = kunon[1]
                outDict['Japanese'] = japDict

            # get Chinese simplified/traditional equivalent
            cmd = "egrep '^" + char + "' '" + bundleLibPath + "zhSimTradHanzi.txt'"
            simtrad = os.popen(cmd.encode("UTF-8")).read().decode("UTF-8")
            data = ""
            if simtrad: c1, st, data = simtrad.split('\t')
            if pinyin1 or Bopomofo or data or zhWubiXing or zhWubiHua or \
                zhBianhao or zhCangjie or zhCangjieCh or zhDayi:
                zhDict = SeqDict()
                if data:
                    if st == 'T': zhDict['Traditional'] = data.rstrip()
                    elif st == 'S': zhDict['Simplified'] = data.rstrip()
                if pinyin1: zhDict['Pinyin'] = pinyin1
                if Bopomofo: zhDict['Zhuyin'] = Bopomofo
                if zhWubiXing: zhDict['Wubi Xing'] = zhWubiXing
                if zhWubiHua: zhDict['Wubi Hua'] = zhWubiHua
                if zhBianhao: zhDict['Bishu Bianhao'] = zhBianhao
                if zhCangjie: zhDict['Cangjie'] =  zhCangjie + " " + zhCangjieCh
                if zhDayi: zhDict['Dayi'] = zhDayi
                outDict['Chinese'] = zhDict
            if hangul_name_sound:
                korDict = SeqDict()
                korDict['Hangul'] = hangul_name_sound
                outDict['Korean'] = korDict
    else:
        if 'HANGUL' in name and not 'Jamo' in block:
            outDict['Decomposition'] = " ".join(unicodedata.normalize("NFKD", char))

        if UnicodeData:
            if category:       outDict['Category'] = expandUniCategories(category)
            if oldname:        outDict['Old Name'] = oldname
            if bididir:        outDict['Bidirectional'] = expandUniDirectionClass(bididir)
            if combiningclass: outDict['Combining Class'] = expandUniCombiningClass(combiningclass)
            if bidimirror:     outDict['Mirrored'] = bidimirror
            if upcase:         outDict['Upper Case'] = wunichr(int(upcase,16)) + " (U+" + upcase + ")"
            if lowcase:        outDict['Lower Case'] = wunichr(int(lowcase,16)) + " (U+" + lowcase + ")"
            if titlecase:      outDict['Title Case'] = wunichr(int(titlecase,16)) + " (U+" + titlecase + ")"
            if numtype1:       outDict['Numeral Type'] = (numtype1 + " " + numtype2 + " " + numtype3).strip()

            if decomposition and not charIsPaneB:
                decompDict = SeqDict()
                if decomposition[0] == '<':
                    dc = decomposition.split(' ')
                    decompDict['Class'] = expandUniDecompositionClass(dc[0])
                    decomposition = " ".join(dc[1:])
                decomp = decomposition
                def cDec(x): return unichr(int(x,16))
                def rDec(x): return "U+%04X" % ord(x)
                clist = decomp.split(' ')
                decomp = " ".join(map(cDec, clist)) + " (U+" + " U+".join(clist) + ")"
                cflist = unicodedata.normalize("NFKD", char)
                if len(clist) != len(cflist):
                    decompDict['into'] = decomp + "; " + " ".join(cflist) + "(" + " ".join(map(rDec, cflist)) + ")"
                else:
                    decompDict['into'] = decomp
                outDict['Decomposition'] = decompDict


    cpDict = SeqDict()
    cpDict['UCS dec/hex'] = "%s / U+%s" % (str(lastCharDecCode), lastCharUCShexCode)
    cpDict['UTF-8'] = " ".join([hex(ord(c))[2:].upper() for c in char.encode("utf-8")])
    utf16be = hexlify(char.encode("utf-16-be")).upper()
    if len(utf16be)>4: cpDict['UTF-16BE'] = utf16be[:4] + "+" + utf16be[4:]
    outDict['Codepoints'] = cpDict

    if dialog2:
        dlgout = "<table style=\"border-collapse:collapse;\">"
        plh = ""
        if outDict.has_key('Category') and "Nonspacing" in outDict['Category']: plh = u"o"
        dlgout += "<tr><td rowspan=2 style=\"border:1px dotted silver;font-size:20pt;text-align:center;\"><font color=#CCCCCC>%s</font>%s</td><td>&nbsp;</td><td style=\"color:grey;\">Name</td><td>%s</td></tr>" % (plh, outDict['Character'], outDict['Name'])
        dlgout += "<tr><td>&nbsp;</td><td style=\"color:grey;\">Block</td><td>%s</td></tr>" % outDict['Block']
        dlgout += "</table><table style=\"border-collapse:collapse;width:200px;\">"
        del outDict['Character']
        del outDict['Name']
        del outDict['Block']
        for k, v in outDict.items():
            if "Radical" in k:
                dlgout += "<tr><td align=right style=\"color:grey;\">%s</td><td>&nbsp;</td><td style=\"white-space:nowrap;\">%s (%s%s - %s) %s.%s" % (k, v[0], v[1], v[2], v[3], v[4], v[5])
            elif "Related" in k:
                #  and len(v) > 60
                dlgout += "<tr><td align=right style=\"color:grey;\">%s</td><td>&nbsp;</td><td>%s</td></tr>" % (k, v)
            else:
                try:
                    v.items()
                    dlgout += "<tr><td colspan=2 align=right style=\"color:grey;\"><b><i>%s</i></b></td></tr>" % k
                    for ku, vu in v.items():
                        dlgout += "<tr><td align=right style=\"color:grey;white-space:nowrap;\">%s</td><td>&nbsp;</td><td style=\"white-space:nowrap;\">%s</td></tr>" % (ku, vu)
                except AttributeError:
                    dlgout += "<tr><td align=right style=\"color:grey;white-space:nowrap;\">%s</td><td>&nbsp;</td><td style=\"white-space:nowrap;\">%s</td></tr>" % (k, v)

        cmd = "'%s' tooltip --html '%s'" % (os.environ["DIALOG"], dlgout.replace("'", u"'"))
        os.popen(cmd.encode("UTF-8"))
        sys.exit(206)
    else:
        sep = u"┊"
        for k, v in outDict.items():
            if "Radical" in k:
                print "%-15s %s %s (%s%s - %s) %s.%s" % (k, sep, v[0], v[1], v[2], v[3], v[4], v[5])
            else:
                try:
                    v.items()
                    print "%-15s" % k
                    for ku, vu in v.items():
                        print "%15s %s %s" % (ku, sep, vu)
                except AttributeError:
                    print "%-15s %s %s" % (k, sep, v)
        sys.exit(206)
    file.write(fstrip)

with open(srcfile, 'r') as file:
    for fileline in file:
        matchesv = re.match(r'\\\\v\*\*\\\\v \d+==[^#]', fileline)
        matchesmt = re.match(r'\\\\mt\*\*\\\\mt \d+==[^#]', fileline)
        #print ('#### verse or mt is ####: ',fileline)
        if matchesv or matchesmt:
            linesplit = fileline.split('==####')
            line2 = linesplit[0]
            line1 = line2.replace(' ',
                                  '##').replace('\\\\',
                                                '\\').replace('##-', '## ')
            #print ('39 #### working line sfm file	####: ',fileline)
            #print ('40 #### verse spaces removed	####: ',line1)
            line = unicodedata.normalize('NFC', line1)
            versetext1 = linesplit[1]
            versetext2 = versetext1.rstrip('\n')
            versetext = unicodedata.normalize('NFC', versetext2)
            #print ()
            #print ('44 #### bare verse normalized	####: ',versetext)
            #sanitize variable - https://stackoverflow.com/questions/8237647/clear-variable-in-python
            cars = None
            #cars = dict(x.split('**') for x in line.split('==')) - old attempt, but it makes the values str; they need to be lists so I can append suffixes later
            #https://stackoverflow.com/questions/4627981/creating-a-dictionary-from-a-string - key thing here is the [v] which turns it into a list
            cars = dict(
                (k, [v]) for k, v in (e.split('**') for e in line.split('==')))
            #only for debugging log
            #print ('51 #### annots as dict keys	####: ', cars.keys())
            newlist = []
            for key in cars:
Example #34
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')
Example #35
def TTYstr(ustr):
    return unicodedata.normalize('NFKD', ustr).encode('ascii',
                                                      'ignore').upper()
 def unicode_normalize_string(self, text):
     return unicodedata.normalize('NFD', unicode(text, 'utf-8')).encode(
         'ascii', 'ignore').upper().replace("-", "")
Example #37
 def to_python(self, value):
     return unicodedata.normalize(
         'NFKC',
         super(UsernameField, self).to_python(value))
Example #38
def strip_accents(string):
    return u''.join(c for c in unicodedata.normalize('NFD', unicode(string))
                    if unicodedata.category(c) != 'Mn')
Example #39
    def processAlgorithm(self, parameters, context,  feedback):
        """
        Here is where the processing itself takes place.
        """

        source = self.parameterAsVectorLayer(parameters, self.INPUT, context)
        field_insee = self.parameterAsString(parameters,  self.INSEE_CODE, context)
        field_commune = self.parameterAsString(parameters,  self.COMMUNE_NAME, context)
        value_epsg = self.parameterAsString(parameters, self.EPSG_CODE, context)

        if value_epsg in ('2154', '3942', '3943', '3944', '3945', '3946', '3947', '3948', '3949', '3950', '32630', '32631', '32632', '3857', '4326', '4258', '32620', '2970', '2972', '2973', '2975', '32622', '32740', '32738', '4471', '32621'):

            feedback.pushInfo('EPSG code' + value_epsg)
            tab = []

            for f in source.getFeatures():

                col_select=f[field_insee],(''.join((c for c in unicodedata.normalize('NFD', f[field_commune]) if unicodedata.category(c) != 'Mn')))

                # Insert each CSV row into the list
                tab.append(col_select)

                # Remove duplicates and sort
                Lt=sorted(set(tab))

                print (Lt)

            for c_insee, n_couche in Lt  :

                urlWithParams ="url=http://inspire.cadastre.gouv.fr/scpc/"+c_insee+".wms?contextualWMSLegend=0&crs=EPSG:"+value_epsg+"&dpiMode=7&featureCount=10&format=image/png&layers=LIEUDIT&styles=&maxHeight=1024&maxWidth=1280"
                rlayer = QgsRasterLayer(urlWithParams,'Lieu_dit_'+n_couche+'_'+c_insee, 'wms')
                feedback.pushInfo('Category :'+ n_couche +' - '+c_insee)
                feedback.pushInfo('Validity of WMS : %s' % rlayer.isValid())
                if not rlayer.isValid():
                    print('Lieu_dit_'+n_couche+'_'+c_insee + ' failed to load!')
                    feedback.pushInfo('WMS INVALID : Cadastre_'+n_couche+'_'+c_insee)
                else:
                    #Source : https://gis.stackexchange.com/questions/342802/loading-openstreetmap-in-pyqgis
                    output_layers = []
                    output_layers.append(rlayer)
                    context.temporaryLayerStore().addMapLayer(rlayer)
                    context.addLayerToLoadOnCompletion(
                        rlayer.id(),
                        QgsProcessingContext.LayerDetails(
                            'Lieu_dit_'+n_couche+'_'+c_insee,
                            context.project(),
                            self.OUTPUT_LAYERS
                        )
                    )
        else :
            feedback.pushInfo('Error EPSG code')


        # Return the results of the algorithm. In this case our only result is
        # the feature sink which contains the processed features, but some
        # algorithms may return multiple feature sinks, calculated numeric
        # statistics, etc. These should all be included in the returned
        # dictionary, with keys matching the feature corresponding parameter
        # or output names.
        # At the end of the processAlgorithmn
        # Add the layer to the project
        return {}
Example #40
def strip_accents(s):
    # strips accents from strings
    s = s.replace('`', '').replace("'", '')
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')
Example #41
def normalize(y):
    if pd.isnull(y):
        return y  # missing value
    return ud.normalize('NFKC', y)  # full-width digits to half-width
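# For example (hypothetical input), normalize('１２３') would return '123':
# NFKC folds full-width compatibility characters into their ASCII forms.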
Example #42
def remove_words_accents(word: str):
    normalized = unicodedata.normalize('NFD', word)
    ascii_text = normalized.encode('ascii', 'ignore')

    return str(ascii_text.decode("utf-8"))
Example #43
def normalize(s):
    return ''.join((c for c in unicodedata.normalize('NFKD', unicode(s))
                    if unicodedata.category(c) != 'Mn'))
Example #44
    def parse_store(self, response):
        lat = response.xpath('//*[@id="location-lat"]/@value').extract_first()

        lon = response.xpath('//*[@id="location-lng"]/@value').extract_first()

        name = response.xpath(
            '//div[@class="title-wrap"]/h2/text()').extract_first()

        phone = response.xpath(
            '//div[@class="title-wrap"]/div/text()').extract_first()

        street = response.xpath(
            '//li[@itemprop="streetAddress"]/text()').extract_first().strip()

        city = response.xpath(
            '//span[@itemprop="addressLocality"]/text()').extract_first()

        state = response.xpath(
            '//span[@itemprop="addressRegion"]/text()').extract_first()

        postcode = response.xpath(
            '//span[@itemprop="postalCode"]/text()').extract_first()

        website = response.xpath(
            '//*[@id="my_location_url"]/@value').extract_first()

        address = "{}{} {} {}".format(street, city, state, postcode)

        # Some pages post notices such as "No longer accepting checks"
        # in the day/hours open section
        hour = response.xpath(
            '//*[@class="location-sidebar-item"][2]/descendant::*[contains('
            '., "am") or contains(., "pm") or contains('
            '., "Closed")]/text()').extract()

        day = self.convert_days(
            response.xpath(
                '//*[@class="location-sidebar-item"][2]/descendant::*[contains('
                '., "Sunday") or contains(., "Monday") or contains('
                '., "Tuesday") or contains(., "Wednesday") or contains('
                '., "Thursday") or contains(., "Friday") or contains('
                '., "Saturday")]/text()').extract())

        for i in range(len(hour)):
            hour[i] = unicodedata.normalize("NFKD", hour[i])  # handle \xa0
            hour[i] = hour[i].strip()
        hour = [x for x in hour if x]
        hour = self.convert_hours(hour)

        opening_hours = ', '.join('{} : {}'.format(*t) for t in zip(day, hour))

        yield GeojsonPointItem(
            lat=lat,
            lon=lon,
            addr_full=address,
            street=street,
            city=city,
            state=state,
            postcode=postcode,
            phone=phone,
            website=website,
            opening_hours=opening_hours,
            ref=response.url,
        )
 def lat2asc(self, title):
     title = title.decode('iso8859-1')
     return unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
Example #46
def normalize_ascii(value):
    return unicodedata.normalize('NFKD', value) \
        .encode('ascii', 'ignore')
def unicodeToAscii(series):
    return series.apply(lambda s: unicodedata.normalize('NFKC', str(s)))
Example #48
    def execute(self, commande):
        """
        Fonction permettant de reconnaître l'ordre donné par l'utilisateur.
        On vérifie d'abord la présence des verbes prendre et poser dans la chaine, puis celle d'aller.
        Pour prendre et poser, on change la position de l'objet. Pour aller, on modifie la position actuelle.

        On vérifie également la présence d'autres commandes comme l'inventaire.
        """

        commande = unicodedata.normalize('NFD', commande).encode(
            'ascii', 'ignore').decode('utf8')

        commande = commande.replace("'", " ")
        words = commande.strip(" ").split(" ")

        mots_reconnus = 0

        if words[0] == "prendre":
            for mot in words[1:]:
                for obj in self.lieu[self.lieu_actuel].contenu:
                    if mot == obj.raccourci:
                        mots_reconnus += 1
                        self.personnage.inventaire.append(obj)
                        self.lieu[self.lieu_actuel].contenu.remove(obj)
                        print("Vous avez obtenu : " + obj.nom)

            if mots_reconnus == 0:
                print("Impossible de prendre cet objet.")

# as things stand, the "poser" (drop) command causes problems
#        elif words[0] == "poser":
#            for mot in words[1:]:
#                for obj in self.personnage.inventaire:
#                    if mot == obj.raccourci:
#                        self.lieu[self.lieu_actuel].contenu.append(obj)
#                        self.personnage.inventaire.remove(obj)

        elif words[0] == "aller":
            for mot in words[1:]:
                if mot in self.lieu[self.lieu_actuel].adjacence:
                    self.lieu_actuel = self.lieu[
                        self.lieu_actuel].adjacence[mot]
                    self.transition = 1
                    mots_reconnus += 1

            if mots_reconnus == 0:
                print(
                    "La destination n'a pas été reconnue, ou est inaccessible depuis ce lieu."
                )
            if mots_reconnus > 1:
                print(
                    "Attention, plusieurs lieux ont été reconnus. Vous arrivez dans le dernier possible"
                )

        elif words[0] == "parler":
            for mot in words[1:]:
                if mot in self.lieu[self.lieu_actuel].dialogues:
                    mots_reconnus += 1
                    print("\n\033[1m" + mot.capitalize() + "\033[0m : " +
                          self.lieu[self.lieu_actuel].dialogues[mot])
            if mots_reconnus == 0:
                print("Impossible de parler à cette personne.")

        elif words[0] == "utiliser":
            for mot in words[1:]:
                if (mot in self.lieu[self.lieu_actuel].utilisation):
                    for obj in self.personnage.inventaire:
                        if mot == obj.raccourci:
                            self.declencher(
                                self.lieu_actuel,
                                self.lieu[self.lieu_actuel].utilisation[mot])
                            mots_reconnus += 1

            if not mots_reconnus:
                print("Utilisation impossible.")

        elif words[0] == "inventaire":
            self.personnage.afficher_inventaire()

        else:
            print("Verbe non reconnu.")
	def parse_hours(self, item):
		item = unicodedata.normalize('NFKD', item).encode('ascii','ignore').strip()
		symbols = ['\r', '\n', '<br>', '<br/>', '<p>', '<ul>', '</ul>', '</li>', '<h3>', '</h3>', '<li style="list-style: initial;">']
		for s in symbols:
			item = item.replace(s,'')
		return item
Example #50
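# (This snippet reads like an interactive session; `s` is assumed to be some
# previously defined Unicode text that the remap table below is cleaning up.)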
remap = {
    ord('\t') : ' ',
    ord('\f') : ' ',
    ord('\r') : None    # Deleted
}

a = s.translate(remap)
a


import unicodedata
import sys
sys.maxunicode
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c)))
b = unicodedata.normalize('NFD', a)
b
b.translate(cmb_chrs)


digitmap = { c: ord('0') + unicodedata.digit(chr(c))
            for c in range(sys.maxunicode)
            if unicodedata.category(chr(c)) == 'Nd'}

len(digitmap)
# Arabic digits
x = '\u0661\u0662\u0663'
x
x.translate(digitmap)
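# The three Arabic-Indic digits above translate to '123'.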

def interpreter(data_path, model_path):
    """
    Run this function if you want to talk to the seq2seq model.
    Type "exit" to end the conversation.
    :param data_path: the path of corpus you made model learn
    :param model_path: the path of model you made learn
    :return:
    """
    # call dictionary class
    if args.lang == 'en':
        corpus = ConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    elif args.lang == 'ja':
        corpus = JaConvCorpus(file_path=None)
        corpus.load(load_dir=data_path)
    else:
        print('You gave a wrong argument to this system. Check your language argument.')
        raise ValueError
    print('Vocabulary Size (number of words) :', len(corpus.dic.token2id))
    print('')

    # rebuild seq2seq model
    model = Seq2Seq(len(corpus.dic.token2id), feature_num=args.feature_num,
                    hidden_num=args.hidden_num, batch_size=1, gpu_flg=args.gpu)
    serializers.load_hdf5(model_path, model)

    # load word2vec model
    sim_th = 50
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(W2V_MODEL_PATH, binary=False)

    # run conversation system
    print('The system is ready to run, please talk to me!')
    print('( If you want to end a talk, please type "exit". )')
    print('')
    while True:
        print('>> ', end='')
        sentence = input()
        if sentence == 'exit':
            print('See you again!')
            break

        if args.lang == 'en':
            input_vocab = [unicodedata.normalize('NFKC', word.lower()) for word in word_tokenize(sentence)]
        elif args.lang == 'ja':
            input_vocab = parse_ja_text(sentence)
        input_vocab.reverse()
        input_vocab.insert(0, "<eos>")

        # convert word into ID
        input_sentence = []
        for word in input_vocab:
            if corpus.dic.token2id.get(word) is not None:
                input_sentence.append(corpus.dic.token2id.get(word))
            else:
                try:
                    sim_words = w2v_model.most_similar(positive=[word], topn=sim_th)
                    for index, candidate_tuple in enumerate(sim_words):
                        if corpus.dic.token2id.get(candidate_tuple[0]) is not None:
                            input_sentence.append(corpus.dic.token2id.get(candidate_tuple[0]))
                            break
                        if index == sim_th - 1:
                            input_sentence.append(corpus.dic.token2id['<unk>'])
                except KeyError:
                    input_sentence.append(corpus.dic.token2id['<unk>'])

        # input a sentence into model
        model.initialize()          # initialize cell
        sentence = model.generate(input_sentence, sentence_limit=len(input_sentence) + 30,
                                  word2id=corpus.dic.token2id, id2word=corpus.dic)
        print("-> ", sentence)
        print('')
Example #52
def ganadores_por_artista(req):

    try:
        artista = req.get('queryResult').get('parameters').get('artista')

    except AttributeError:
        return '¿Podrías especificar el nombre de un artista?'
    if str(artista)=='':
        return 'Estas seguro de que esa persona esta compitiendo?. Lucas acá me dice que no.'
    print(str(artista), file=sys.stderr)

    slug = str(unicodedata.normalize('NFKD', artista)).lower().replace(" ", "-")

    print(slug, file=sys.stderr)

    #tag = translate_tags[cat]
    data = {}
    response = requests.get(
        url_win+'categories?slug='+slug,
        params=data
    )

    rjson = response.json()
    print(str(rjson), file=sys.stderr)
    cat_id = rjson[0].get('id') # need cat id to get posts/videos
    print(cat_id, file=sys.stderr)
    r = 'posts?categories='+str(cat_id)

    response = requests.get(
        url_win+r,
        params=data
    )

    rjson = response.json()
    answer = 'Este artista ganó '
    i=0
    for video in rjson:

        slug = video.get('slug')
        data = {}
        response = requests.get(
            url_win+'posts?slug='+slug,
            params=data
        )

        _rjson = response.json()[0]

        tags = _rjson.get('tags')

        r = 'tags?include='
        r = r + str(tags).replace('[','').replace(']', '')

        response = requests.get(
            url_win+r,
            params=data
        )
        _rjson = response.json()
        if type(_rjson)!=list:
            continue
        print('tags----' + str(_rjson), file=sys.stderr)
        video_cats = []
        answer = answer + 'por el video "'+ video.get('title').get('rendered') + '" en '
        for categoria in _rjson:
            #print(str(video[0]), file=sys.stderr)
            print(str(categoria), file=sys.stderr)
            video_cats.append(categoria.get('description'))

        answer = answer + and_last_comma(str(video_cats).replace('[','').replace(']', '').replace('\'','')) + ''
        if i != len(rjson) - 1:
            answer = answer + ', por '
        i = i+1
    if answer=="Este artista ganó ":
        return "Este artista no obtuvo premios"
    return html.unescape(answer)
Example #53
def normalize_unicode(text):
    """
    unicode string normalization
    """
    return unicodedata.normalize("NFKD", text)
	def validate(self, item):
		try:
			return unicodedata.normalize('NFKD', item).encode('ascii','ignore').strip().replace(';','')
		except:
			return ''
Example #55
def getNormalizedFilesList(dir):
    return list(map(lambda x: unicodedata.normalize("NFD", x),
                    os.listdir(dir)))
Example #56
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii
Example #57
def tag2sentences(tag):
    sentence = unicodedata.normalize("NFKC", tag.text)
    return [sentence]
Example #58
def getNormalizedString(fname):
    return unicodedata.normalize("NFD", fname)
Example #59
def remove_accents(input_str):
    # Borrowed from https://stackoverflow.com/a/517974/1509718
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
Example #60
 def to_python(self, value):
     return unicodedata.normalize('NFKC', super().to_python(value))