def on_message(self, *args):
        message = args[1].split(':', 3)
        key = int(message[0])
        namespace = message[2]

        if len(message) >= 4:
           data = message[3]
        else:
            data = ''
        if key == 1 and args[1] == '1::':
            self.send_packet_helper(1)
        elif key == 1 and args[1] == '1::{}'.format(self.namespace):
            self.send_packet_helper(5, data={'name':'initialize'})
            data = {'name':'join', 'args':['{}'.format(self._streamer_name)]}
            self.send_packet_helper(5, data=data)
        elif key == 2:
            self.send_packet_helper(2)
        elif key  == 5:
            data = json.loads(data, )
            if data['name'] == 'message':
                message = data['args'][0]
                sender = html.unescape(message['sender'])
                message = html.unescape(message['text'])
                self._messager.recieve_chat_data(sender, 
                                                 message, 
                                                 gui.StatusBarSelector.WatchPeopleCode.value)
# --- Example #2 (scraped sample separator) ---
def parse_urban(refresh=False):
    """Load Urban Dictionary word definitions, scraping them if missing.

    Reads cached definitions from ``words.json``; when the cache is absent
    (or ``refresh`` is True) it scrapes the "popular" pages per letter,
    samples entries, fetches each definition, strips markup and rewrites
    the cache. Relies on module-level globals (``chars``, ``popular_link``,
    ``popular_regex``, ``items_to_sample``, ``base_link``, ``meaning_regex``
    and the tag regexes) defined elsewhere in this file — TODO confirm.

    Returns: dict mapping word -> cleaned definition text.
    """
    try:
        with open('words.json', 'r') as f:
            definitions = json.load(f)
        if refresh:
            # Force the scrape path below by faking a missing cache file.
            raise FileNotFoundError()
    except FileNotFoundError:
        definitions = {}
        links = []
        for char in chars:
            link = popular_link.format(char=char)
            data = urllib.request.urlopen(link).read().decode("utf8")
            # Sample (link, word) pairs from the per-letter popular page.
            found = map(lambda x: (x[0], unescape(x[1])), sample(re.findall(popular_regex, data), items_to_sample))
            links += found
            print("{char} done!".format(char=char))
        for link, word in links:
            data = urllib.request.urlopen(base_link + link).read().decode("utf8")
            definition = re.findall(meaning_regex, data)[0]
            # Strip unwanted fragments, then any remaining HTML tags.
            definition = re.sub(to_delete_regex, '', definition)
            definition = re.sub(open_tag_regex, '', definition)
            definition = re.sub(close_tag_regex, '', definition)
            definitions[word] = unescape(definition)
            print("{word} done!".format(word=word))
        print("Done!")

        with open('words.json', 'w+') as f:
            f.write(json.dumps(definitions, ensure_ascii=False))
    return definitions
# --- Example #3 (scraped sample separator) ---
def getepisodelist(html, url):
	"""Collect deviantArt gallery episodes, following 'Next' pages.

	Walks the paginated gallery starting at ``url``, building one
	``comiccrawler.Episode`` per thumbnail link, and returns them in
	oldest-first order. Raises when the page shows no logged-in session.
	Depends on ``safeprint``, ``comiccrawler`` and ``header`` from this
	module's scope.
	"""
	if '"loggedIn":true' not in html:
		raise Exception("you didn't log in!")
	base = re.search("(https?://[^/]+)", url).group(1)
	s = []
	while True:
		safeprint(url)
		# Only match thumbs after the resource stream anchor.
		startpos = html.index('id="gmi-ResourceStream"')
		ex = re.compile('<a class="thumb[^"]*?" href="({}/art/.+?)" title="(.+?)"'.format(base))
		r = ex.findall(html, startpos)
		for m in r:
			id = re.search("\d+$", m[0]).group()
			title = m[1].rpartition(" by ")[0]
			# deviantArt double-escapes titles, so unescape twice.
			title = unescape(unescape(title))
			
			e = comiccrawler.Episode()
			e.firstpageurl = m[0]
			e.title = "{} - {}".format(id, title)
			s.append(e)
			
		next = re.search('id="gmi-GPageButton"[^>]+?href="([^"]+?)"><span>Next</span>', html)
		if not next:
			break
		url = base + next.group(1).replace("&amp;", "&")
		html = comiccrawler.grabhtml(url, header)
		
	# Pages list newest first; reverse so episodes come oldest first.
	return s[::-1]
# --- Example #4 (scraped sample separator) ---
def get_title(html, url):
	"""Build a '[Gelbooru] ...' display title from a Gelbooru page."""
	if not is_pool(url):
		# Plain listing: the page <title> carries the name, suffixed with '| Gelbooru'.
		name = unescape(re.search("<title>([^<]+)\| Gelbooru", html).group(1).strip())
		return "[Gelbooru] {title}".format(title=name)
	# Pool page: name sits in the 'Now Viewing' header; the id comes from the URL.
	name = unescape(re.search("<h3>Now Viewing: ([^<]+)", html).group(1))
	pool_id = re.search("id=(\d+)", url).group(1)
	return "[Gelbooru] {title} ({pool_id})".format(title=name, pool_id=pool_id)
# --- Example #5 (scraped sample separator) ---
def get_lectures_urllist(url, login_cookie):
    """Return the list of absolute video-lesson URLs found on a course page.

    Supports the two PoliTo portals (didattica / elearning), which use
    different markup; any other domain aborts the program via
    ``new_domain_message``/``exit``. ``login_cookie`` is an authenticated
    requests cookie jar (see ``get_login_cookie``).
    """
    with requests.session() as s:
        s.cookies = login_cookie
        r = s.get(url)
    # Different html structure for videolessons on elearning.polito.it and
    # didattica.polito.it
    if "didattica.polito.it" in url:
        lectures_urllist = re.findall(
            'href="(sviluppo\.videolezioni\.vis.*lez=\w*)">', r.text)
        for i in range(len(lectures_urllist)):
            # Hrefs are relative and HTML-escaped; unescape and make absolute.
            lectures_urllist[i] = \
                'https://didattica.polito.it/pls/portal30/'+html.unescape(
                lectures_urllist[i])
    elif "elearning.polito.it" in url:
        lectures_urllist = re.findall(
            "href='(template_video\.php\?[^']*)", r.text)
        for i in range(len(lectures_urllist)):
            lectures_urllist[i] = \
                'https://elearning.polito.it/gadgets/video/'+html.unescape(
                lectures_urllist[i])
    else:
        # Still under developement
        new_domain_message()
        exit(1)
        lectures_urllist = ""
    return lectures_urllist
def main():
    """Splice today's Google Calendar events into a subreddit sidebar ticker.

    Authorizes the Calendar API, fetches the next 50 events, keeps those
    dated today (GMT), then rewrites the text between ``[](#StartMarker)``
    and ``[](#MarkerEnd)`` in the subreddit sidebar via praw.

    BUGFIX: the original body mixed tabs and spaces in its indentation,
    which raises TabError under Python 3; indentation is now all spaces.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('calendar', 'v3', http=http)

    now = datetime.datetime.utcnow().isoformat() + 'Z'  # 'Z' indicates UTC time
    print('Getting the upcoming 50 events')
    eventsResult = service.events().list(
        calendarId='*****@*****.**', timeMin=now, maxResults=50, singleEvents=True,
        orderBy='startTime').execute()
    events = eventsResult.get('items', [])

    username, password = "******", "BOT_PASSWORD"
    subreddit = "mscalstest"

    r = praw.Reddit("MSCals Ticker 0.1")

    print("Logging in")
    r.login(username, password)
    subreddit = r.get_subreddit(subreddit)

    # Build the ticker, starting with its heading.
    sl = list()
    sl.append("Today (GMT):")
    import time

    if not events:
        print('No upcoming events found.')
    for event in events:
        # All-day events only carry 'date'; timed events carry 'dateTime'.
        start = event['start'].get('dateTime', event['start'].get('date'))
        fullEv = start[11:16] + ' - ' + event['summary']
        if time.strftime("%Y-%m-%d") == start[0:10]:
            sl.append(fullEv)
        print(sl)  # NOTE(review): prints on every iteration — looks like leftover debug output

    # Account for no events existing on a given day
    if len(sl) == 1:
        sl.append("No events")

    # Very possibly redundant, from old code; kept because the wiki fetch
    # doubles as a "is reddit reachable?" probe.
    try:
        config = html.unescape(r.get_wiki_page(subreddit, "sidebar_bot_config").content_md)
    except requests.exceptions.HTTPError:
        print("Couldn't access format wiki page, reddit may be down.")
        raise

    sidebar_string = ' | '.join(sl)

    # Updating sidebar section
    print("Updating sidebar")
    sidebar = r.get_settings(subreddit)
    submit_text = html.unescape(sidebar["submit_text"])
    desc = html.unescape(sidebar['description'])
    startmarker = desc.index("[](#StartMarker)")
    endmarker = desc.index("[](#MarkerEnd)") + len("[](#MarkerEnd)")
    updated_desc = desc.replace(desc[startmarker:endmarker],
                                "[](#StartMarker)" + sidebar_string + "[](#MarkerEnd)")

    # Only push an update when the ticker actually changed.
    if updated_desc != desc:
        subreddit.update_settings(description=updated_desc.encode('utf8'), submit_text=submit_text)
# --- Example #7 (scraped sample separator) ---
 def write_unitn(cls, out_path, unitn_path, download_path, is_train):
     """Reconcile UNITN tweet files with freshly downloaded ones into JSON lines.

     Both inputs are tab-separated '<n> <doc_id> <label> <text>' rows. The
     downloaded text wins unless it is 'Not Available', in which case the
     cleaned UNITN text is used. One JSON object per tweet is appended to
     ``out_path`` with the label mapped through ``cls.class_map``.
     """
     row = re.compile(r'\d+\t(\d+)\t(negative|neutral|positive)\t(.+)')
     with open(unitn_path) as unitn_sr, \
             open(download_path) as download_sr, \
             open(out_path, 'a+') as out_sr:
         for line_u, line_d in zip(unitn_sr, download_sr):
             id_u, label_u, text_u = row.match(line_u).groups()
             id_d, label_d, text_d = row.match(line_d).groups()
             # UNITN text stores literal backslash escapes and smart quotes.
             text_u = text_u.encode().decode('unicode-escape')
             text_u = text_u.replace(r'’', '\'')
             if is_train:
                 # Training data is additionally HTML-escaped and double-quoted.
                 text_u = html.unescape(text_u)
                 text_u = text_u.replace('""', '"')
             # Downloaded text is HTML-escaped twice at the source.
             text_d = html.unescape(html.unescape(text_d))
             assert id_u == id_d
             assert label_u == label_d
             text = text_d if text_d != 'Not Available' else text_u
             out_sr.write(json.dumps({'id': id_u, 'text': text, 'label': cls.class_map[label_u]}) + '\n')
# --- Example #8 (scraped sample separator) ---
def get_login_cookie(user, passw):
    """Log into the PoliTo SSO (Shibboleth/SAML2) and return the session cookies.

    Returns a requests cookie jar on success, or "" on a failed login.
    NOTE(review): the next statement was mangled by the credential-masking
    of whatever scraped this code ('******' fragments) — the original
    password prompt, session setup and first request are missing and this
    block does not parse as-is; restore from upstream before use.
    """
    if user is None:
        user = input("Username: "******"Password: "******"https://idp.polito.it:443/idp/profile/SAML2/Redirect/SSO":
            # Login successful, we just need to follow some redirects
            relaystate = html.unescape(
                re.findall('name="RelayState".*value="(.*)"', r.text)[0])
            samlresponse = html.unescape(
                re.findall('name="SAMLResponse".*value="(.*)"', r.text)[0])
            r = s.post(
                'https://www.polito.it/Shibboleth.sso/SAML2/POST',
                data={'RelayState': relaystate, 'SAMLResponse': samlresponse})
            r = s.post('https://login.didattica.polito.it/secure/ShibLogin.php')
            # Second SAML hop against the didattica portal.
            relaystate = html.unescape(
                re.findall('name="RelayState".*value="(.*)"', r.text)[0])
            samlresponse = html.unescape(
                re.findall('name="SAMLResponse".*value="(.*)"', r.text)[0])
            r = s.post(
                'https://login.didattica.polito.it/Shibboleth.sso/SAML2/POST',
                data={'RelayState': relaystate, 'SAMLResponse': samlresponse}
                )
            login_cookie = s.cookies
        else:
            login_cookie = ""
    return login_cookie
# --- Example #9 (scraped sample separator) ---
    def __init__(self, uid, access_token, output_folder, album):
        """Fetch a VK user's audio list and index the tracks for downloading.

        Performs a network call to the VK ``audio.get`` API during
        construction. Creates ``output_folder`` if needed and records
        which track aids are already present there.
        """
        self.uid = uid
        self.access_token = access_token
        self.output_folder = output_folder
        self.album = album
        self.cpu_count = os.cpu_count()

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        # Files are named 'NNNN_<aid>.mp3'; slice out the <aid> part
        # (skip the 4-digit index + '_' prefix and the '.mp3' suffix).
        self.folder_aids = {x[5:-4] for x in os.listdir(self.output_folder)}

        url = (
            "https://api.vkontakte.ru/method/audio.get.json?"
            "uid={uid}&access_token={access_token}"
        ).format(uid=self.uid, access_token=self.access_token)
        response = urllib.request.urlopen(url)
        content = response.read()
        self._content = json.loads(content.decode('utf-8'))
        self.music_list = self._content['response']

        # Map aid -> track metadata; reversed so index 0 is the oldest track.
        self.tracks_map = {}
        for ind, track in enumerate(reversed(self.music_list)):
            self.tracks_map[str(track['aid'])] = {
                'index': ind,
                'artist': unescape(track['artist']),
                'title': unescape(track['title']),
                'url': track['url'],
                'output_path': os.path.join(output_folder, '{}_{}.mp3'.format(format(ind, '04'), track['aid'])),
            }
# --- Example #10 (scraped sample separator) ---
def spider_xitu_gold(proxy=None):
    """Scrape the latest Python entries from the Juejin (LeanCloud) API.

    Returns a list of normalized article dicts, or None when the API
    response has no results. Depends on module-level ``headers``,
    ``trequests``, ``logit``, ``getlist1``, ``jp`` and ``cleanid``.
    """
    source_name = '稀土掘金'
    time1 = int(time.time())
    level = 3
    # LeanCloud requires the app id / request signature headers below.
    headers1 = headers.copy()
    headers1.update({"x-avoscloud-request-sign": "dd36c74cb860e12f7e12ac1c9c14917f,2139632477696",
                     "X-avoscloud-Application-Id": "mhke0kuv33myn4t4ghuid4oq2hjj12li374hvcif202y5bm6"})
    r = trequests.get(
        'https://api.leancloud.cn/1.1/classes/Entry?include=user,user.installation&limit=15&order=-createdAt&where={"tags":{"__type":"Pointer","className":"Tag","objectId":"559a7227e4b08a686d25744f"}}', timeout=20, headers=headers1)
    # print(r.text)
    items = r.json().get('results', [])
    if not items:
        logit('%s 解析失败.' % source_name)
        return
    titles = [unescape(i.get('title', '').strip()) for i in items]
    if '' in titles:
        logit('%s 出现空Title字段。' % source_name)
    # Drop covers hosted on the CDN domain (presumably hot-link blocked — confirm).
    covers = [re.sub('.*user-gold-cdn.xitu.io.*','',getlist1(jp(i, '$.screenshot.url'))) for i in items]
    urls = [i.get('originalUrl', 'http://gold.xitu.io/#/tag/Python')
            for i in items]
    descriptions = [unescape(i.get('createdAt', '').split('T')[0])
                    for i in items]
    result = [{'title': i[0], '_id':cleanid(i[0]), 'level':level, 'cover':i[1], 'description':i[2], 'toptime':0, 'urls':{
        source_name: i[3]}, 'time':time1} for i in zip(titles, covers, descriptions, urls)]
    print('%s finished: %s gotten.' % (source_name, len(result)))
    assert len(result) > 0, '%s 抓取结果为 0。' % source_name
    return result
# --- Example #11 (scraped sample separator) ---
def spider_bole_article(proxy=None):
    """Scrape the article list from python.jobbole.com.

    Returns a list of normalized article dicts. Depends on module-level
    ``trequests``, ``default_args``, ``fromstring``, ``getlist1``,
    ``logit`` and ``cleanid``.
    """
    source_name = '伯乐文章'
    time1 = int(time.time())
    level = 3
    r = trequests.get(
        'http://python.jobbole.com/all-posts/', proxies=proxy, **default_args)
    items = fromstring(r.text).xpath(
        '//div[@id="archive"]/div[@class="post floated-thumb"]')
    titles = [unescape(getlist1(i.xpath(
        './div[@class="post-meta"]/p/a[@class="archive-title"]/text()')).strip()) for i in items]
    if '' in titles:
        logit('%s 出现空Title字段。' % source_name)
    # covers = [getlist1(i.xpath('./div[@class="post-thumb"]//img/@src')) for
    # i in items]
    covers = [''] * len(titles)
    # First 50 chars of the excerpt serve as the description base.
    descriptions1 = [unescape(getlist1(i.xpath(
        './div[@class="post-meta"]/span[@class="excerpt"]/p/text()'))[:50]) for i in items]
    urls = [getlist1(i.xpath('./div[@class="post-thumb"]/a/@href'))
            for i in items]
    times = [getlist1(
        i.xpath('./div[@class="post-meta"]/p[1]')).text_content() for i in items]
    # Pull the YYYY/MM/DD date out of the meta paragraph text.
    times = [getlist1(re.findall('20\d\d/\d\d/\d\d', i)) for i in times]
    descriptions = ['...'.join(i) for i in zip(descriptions1, times)]
    result = [{'title': i[0], '_id':cleanid(i[0]), 'level':level, 'cover':i[1], 'description':i[2], 'toptime':0, 'urls':{
        source_name: i[3]}, 'time':time1} for i in zip(titles, covers, descriptions, urls)]
    print('%s finished: %s gotten.' % (source_name, len(result)))
    assert len(result) > 0, '%s 抓取结果为 0。' % source_name
    return result
# --- Example #12 (scraped sample separator) ---
def getepisodelist(html, url):
	"""Collect deviantArt gallery episodes, following 'Next' pages.

	Newer variant of the crawler: uses ``finditer`` and the project's
	``Episode``/``grabhtml``/``PauseDownloadError`` helpers imported
	elsewhere in this module. Returns episodes oldest-first.
	"""
	if '"loggedIn":true' not in html:
		raise PauseDownloadError("you didn't log in!")
	base = search("(https?://[^/]+)", url).group(1)
	s = []
	while True:
		safeprint(url)
		# Only match thumbs after the resource stream anchor.
		startpos = html.index('id="gmi-ResourceStream"')
		ex = compile('<a class="thumb[^"]*?" href="({}/art/.+?)" title="(.+?)"'.format(base))

		for match in ex.finditer(html, startpos):
			id = search("\d+$", match.group(1)).group()
			title = match.group(2).rpartition(" by ")[0]
			# deviantArt double-escapes titles, so unescape twice.
			title = unescape(unescape(title))

			s.append(
				Episode(
					"{} - {}".format(id, title),
					match.group(1)
				)
			)

		next = search('id="gmi-GPageButton"[^>]+?href="([^"]+?)"><span>Next</span>', html)
		if not next:
			break
		url = base + unescape(next.group(1))
		html = grabhtml(url)

	# Pages list newest first; reverse so episodes come oldest first.
	return s[::-1]
# --- Example #13 (scraped sample separator) ---
    def print_decision_table_row(elem):
        """Pretty-print a DMN decision table as a fancy grid via ``tabulate``.

        The header row carries the hit policy initial, one column per table
        input (label, falling back to its expression), a spacer, and the
        output name; each body row is the 1-based rule number, the rule's
        input values, a spacer, and the rule output (all HTML-unescaped).
        """
        headers = [elem.decision_table.hit_policy[0]]
        for table_input in elem.decision_table.input_list:
            # Fall back to the raw expression when the input has no label.
            inp = table_input.label if table_input.label is not None else table_input.expression
            headers.append(inp)
        headers.append('')
        headers.append(elem.decision_table.output.name)

        table = []
        # enumerate() replaces list.index(rule), which was O(n^2) and wrong
        # when the same rule object appeared twice in the list.
        for rule_no, rule in enumerate(elem.decision_table.rule_list, start=1):
            table_row = [rule_no]
            for rule_input in rule.input_list:
                table_row.append(html.unescape(rule_input.value))
            table_row.append('')
            table_row.append(html.unescape(rule.output))
            table.append(table_row)

        print('\n\n ' + elem.name)
        print(tabulate(table, headers, tablefmt='fancy_grid'))


#     table_row = [elem.decision_table.output]
#     table_row.append(elem.decision_table.output)
#     for rule in elem.decision_table.rule_list:
#         table_row.append(elem.decision_table.input_list[0])
#     table = [table_row]
#     table_row = [elem.decision_table.output]
#     table_row.append(elem.decision_table.output)
#     for rule in elem.decision_table.rule_list:
#         table_row.append(html.unescape(rule.input_list[0].value))
#     table.append(table_row)
#     for rule in elem.decision_table.rule_list:
#         table_row = [elem.decision_table.input_list[1]]
#         table_row.append(html.unescape(rule.input_list[1].value))
#         table_row.append()
# --- Example #14 (scraped sample separator) — File: cli.py, Project: jwilk/dcs-cli ---
def print_results(options, items):
    """Print Debian Code Search results with context lines and highlighting.

    For each item: a 'path:line:' header, two lines of leading context,
    the matching line with query hits highlighted, two lines of trailing
    context, and the rank scores. Uses the project's ``colors`` printer
    and ``xsplit`` helper.
    """
    query_regexp = options.query_regexp
    try:
        query_regexp = re.compile('({0})'.format(query_regexp))
    except re.error:
        query_regexp = re.compile(r'\Zx')  # never match anything
    for item in items:
        colors.print('{path}:{line}:', pkg=item['package'], path=item['path'], line=item['line'])
        # Two lines of context before the match.
        for line in item['ctxp2'], item['ctxp1']:
            line = html.unescape(line)
            colors.print('{t.dim}|{t.off} {line}', line=line)
        line = html.unescape(item['context'])
        # Build a template that bolds every chunk and yellows the matched ones.
        template = '{t.dim}>{t.off} '
        chunkdict = {}
        for i, (chunk, matched) in enumerate(xsplit(query_regexp, line)):
            chunkdict['l{0}'.format(i)] = chunk
            template += '{t.bold}'
            if matched:
                template += '{t.yellow}'
            template += '{l' + str(i) + '}{t.off}'
        colors.print(template, **chunkdict)
        # Two lines of context after the match.
        for line in item['ctxn1'], item['ctxn2']:
            line = html.unescape(line)
            colors.print('{t.dim}|{t.off} {line}', line=line)
        colors.print('{t.dim}(pathrank {pathrank:.4f}, rank {rank:.4f}){t.off}',
            pathrank=item['pathrank'],
            rank=item['ranking'],
        )
        print()
        sys.stdout.flush()
# --- Example #15 (scraped sample separator) ---
def get_http_homepage(xmlfile:str, host_ip:str) -> list:
    """Tokenize the http-homepage capture for a host from an nmap XML file.

    Extracts the script output between the given markers, splits it into
    HTTP header and body, strips volatile headers, unescapes/decodes the
    body, and returns a flat list of cleaned tokens. Depends on the
    project helpers ``get_lines_between``, ``flatten_listoflist``,
    ``remove_uuid``, ``remove_html_tags`` and ``strip_punc``.
    """
    lines = get_lines_between(xmlfile,host_ip,
            '<script id="http-homepage"','</script>')
    if not lines:
        return []
    # Drop any embedded tags before splitting header from body.
    lines = [re.sub(r'<.+?>','', _) for _ in lines]

    lines = ''.join(_ for _ in lines)
    header, body, *_ = lines.split('\n\n')
    
    header = [_.split(':',1) for _ in header.splitlines() if ':' in _]
    # Ignore volatile headers that change per request/response.
    header = flatten_listoflist([k.strip(),v.strip()] for k,v in header if k not in 
            ['Date','Expires','Pragma','Connection','Content-Type','ETag', 
                'Last-Modified', 'Accept-Ranges','Content-Length', 'Cache-Control'])
    from html import unescape
    header = [unescape(_) for _ in header]
    header = [remove_uuid(_) for _ in header]
    header = flatten_listoflist(re.split(r""",|\ |=|"|;|:|\(|\)""", _) for _ in header)

    # Body is HTML-escaped twice in the capture; also decode \xNN escapes.
    body = unescape(unescape(body))
    body = re.sub(r'\\x([0-9A-Fa-f]{2})', lambda m:chr(int(m.group(1),16)), body)
    body = remove_uuid(body)
    body = remove_html_tags(body)
    body = [_ for _ in re.split(r"""</|/>|<|>|=|"|\ |\t|\n|\r|,|;""", body) if _]
    # strip_punc is applied twice so punctuation exposed by the first pass is removed too.
    body = [_ for _ in (strip_punc(_) for _ in body) if _]
    body = [_ for _ in (strip_punc(_) for _ in body) if _]
    return [_.strip() for _ in header+body if _.strip()]
def extract_bilibili_by_api(url):
    """Fetch the main title and per-page titles of a bilibili video via the API.

    Returns ``(titles, index)``: ``titles[0]`` is the cleaned main title
    ('' when missing), followed by one entry per sub-page; ``index`` is the
    number of sub-pages (0 for a single, unnamed page).
    """
    title, page_titles = get_title_by_url(url)

    titles = []
    if title:
        # unquote html entities
        title = unescape(title.strip())
        # replace / to _
        title = escape_seps(title)
    else:
        title = ""
    titles.append(title)

    # BUGFIX: 'index' was unbound (UnboundLocalError below) whenever
    # page_titles was empty or None; default it to 0 first.
    index = 0
    if page_titles:
        #debug(text_match.groups())
        index = len(page_titles)
        if index == 1 and page_titles[0] == '':
            index = 0
    debug(index)

    # have multi pages
    if index > 0:
        for page_title in page_titles:
            # unquote html entities
            page_title = unescape(page_title.strip())
            # replace / to _
            page_title = escape_seps(page_title)
            # delete bilibli page prefix
            titles.append(page_title)

    debug(titles)

    return titles, index
# --- Example #17 (scraped sample separator) ---
 def getTrackInfo(self, context=""):
     """Parse a tm-exchange track page fragment into a TrackInfo record."""
     def grab(pattern):
         # Each field sits behind its own marker; take the first capture group.
         return html.unescape(re.search(pattern, context).group(1))

     info = TrackInfo()
     info.TrackAuthor = grab(r"target=\"_blank\">(.*?)</a></TD>")
     info.TrackName = grab(r"id=\"ctl03_ShowTrackName\">(.*?)</span>")
     info.TrackLength = grab(r"id=\"ctl03_ShowLength\">(.*?)</span>")
     info.TrackStyle = grab(r"id=\"ctl03_ShowStyle\">(.*?)</span>")
     return info
# --- Example #18 (scraped sample separator) ---
    def getTmxRecordsData(self, data=""):
        """Parse the TMX records table HTML into a list of RecordInfo objects.

        Walks the markup by repeatedly locating the replay-download link,
        slicing the lap time out of the fixed-width cell before it, then
        the player name — consuming ``context`` as it goes. Stops when no
        further replay link is found. The offsets below depend on the exact
        tmnforever.tm-exchange.com table layout.
        """
        recordsData = []
        context = data
        while True:
            record = RecordInfo()
            record.RecordType = "Tmx"
            record.Server = "Offline"

            replay_link = re.search("get.aspx\?action=recordgbx&amp;id=(.*?)\"", context)
            if replay_link is not None:
                record.ReplayUrl = html.unescape("https://tmnforever.tm-exchange.com/get.aspx?action=recordgbx&amp;id=" + replay_link.group(1))
            else:
                # No more replay links: every record has been consumed.
                break
            # The lap time occupies the 7 characters ending 6 before the cell close.
            index = context.index("</a></td><td>")
            time = context[index - 13:index - 6]

            record.Time = time

            # Advance past the current cell before matching the player name.
            context = context[index + 7:]

            author = re.search("target=\"_blank\">(.*?)</a></td><td>", context).group(1)
            record.Player = html.unescape(author)
            index = context.index("</a></td><td>")
            context = context[index + 7:]

            recordsData.append(record)

        return recordsData
# --- Example #19 (scraped sample separator) ---
    def test_complete_task_input(self, section, area, goal_number, task_number):
        """Selenium test: completing a task stores its unicode comment intact.

        Generates a comment from the (section, area) generator matrix,
        completes the task with it, then opens the task history and checks
        the rendered comment (HTML-unescaped) round-trips unchanged.
        """
        generators = {
            'lower128': {
               'all': self.generate_all_lower_128_unicode_string,
            },
            'high': {
               'limit': self.generate_500_high_unicode_string,
            },
        }

        comment = generators[section][area]()

        self.webapp.click('goal-%s-task-%s_incomplete' % (
            goal_number,
            task_number,
        ))
        self.fill_form(
            task_comment=comment,
        )
        self.webapp.click('complete-task')
        element = self.driver.find_elements_by_id('goal-%s-task-%s_complete' % (goal_number, task_number))[0]

        self.webapp.click('task-%s_history' % task_number)
        # History entries are keyed by today's URL-quoted date.
        today = urllib.parse.quote_plus(datetime.datetime.strftime(
            datetime.datetime.today(),
            '%Y %b %d',
        ))

        comment_id = '{date}_comment'.format(date=today)

        comment_element = self.driver.find_elements_by_id(comment_id)[0]
        # Unescape both sides so entity-encoding differences don't fail the test.
        comment_text = html.unescape(comment_element.get_attribute('innerHTML'))
        comment_expected = html.unescape(comment)
        assert comment_text == comment_expected, '%s does not match comment %s' % (comment_text, comment_expected)
        self.webapp.click('home-link')
# --- Example #20 (scraped sample separator) ---
    def on_message(self, *args):
        """Dispatch a raw socket.io frame (``args[1]``) by its numeric opcode.

        Opcode 1 handles connect/namespace handshakes, 2 echoes heartbeats,
        and 5 carries JSON events ('message' frames are forwarded to the
        messaging layer; 'join' frames record our nick).
        """
        message = args[1].split(':', 3)
        key = int(message[0])
        # namespace = message[2]

        # The payload (4th segment) is optional on control frames.
        if len(message) >= 4:
            data = message[3]
        else:
            data = ''
        if key == 1 and args[1] == '1::':
            # Base connection acknowledged: confirm it.
            self.send_packet_helper(1)
        elif key == 1 and args[1] == '1::{}'.format(self.namespace):
            # Namespace connected: initialize, then join the streamer's channel.
            self.send_packet_helper(5, data={'name': 'initialize'})
            data = {'name': 'join',
                    'args': ['{}'.format(self._streamer_name)]}

            self.send_packet_helper(5, data=data)
            self.log.info('Connected to channel with socket io!')
            self.messaging.send_status('CONNECTED')
        elif key == 2:
            # Heartbeat: echo back to keep the connection alive.
            self.send_packet_helper(2)
        elif key == 5:
            # Event frame: payload is a JSON object with 'name'/'args'.
            data = json.loads(data, )
            if data['name'] == 'message':
                message = data['args'][0]
                sender = html.unescape(message['sender'])
                message = html.unescape(message['text'])
                self.messaging.send_message(author=sender, message=message)
            elif data['name'] == 'join':
                self.nick = data['args'][1]
# --- Example #21 (scraped sample separator) ---
def get_dlurl(lecture_url, login_cookie, dl_format='video'):
    """Resolve the direct download URL of a PoliTo lecture.

    ``dl_format`` selects the variant: 'video', 'iphone' or 'audio'.
    NOTE(review): an unrecognized dl_format leaves ``dlurl`` unbound and
    raises UnboundLocalError below — callers presumably only pass the
    three known values; confirm.
    """
    with requests.session() as s:
        s.cookies = login_cookie
        r = s.get(lecture_url)
        if "didattica.polito.it" in lecture_url:
            if dl_format == 'video':
                dlurl = re.findall('href="(.*)".*Video', r.text)[0]
            if dl_format == 'iphone':
                dlurl = re.findall('href="(.*)".*iPhone', r.text)[0]
            if dl_format == 'audio':
                dlurl = re.findall('href="(.*)".*Audio', r.text)[0]
            # The portal answers with a redirect whose Location is the real file.
            r = s.get(
                'https://didattica.polito.it'+html.unescape(dlurl),
                allow_redirects=False)
            dlurl = r.headers['location']
        elif "elearning.polito.it" in lecture_url:
            if dl_format == 'video':
                dlurl = re.findall(
                    'href="(download.php[^\"]*).*video1', r.text)[0]
            if dl_format == 'iphone':
                dlurl = re.findall(
                    'href="(download.php[^\"]*).*video2', r.text)[0]
            if dl_format == 'audio':
                dlurl = re.findall(
                    'href="(download.php[^\"]*).*video3', r.text)[0]
            r = s.get(
                'https://elearning.polito.it/gadgets/video/' +
                html.unescape(dlurl), allow_redirects=False)
            dlurl = r.headers['location']
        else:
            # Still under developement
            new_domain_message()
            exit(1)
            dlurl = ""
    return dlurl
# --- Example #22 (scraped sample separator) ---
def api_get_post(post_url):
    """Fetch a question/answer from the Stack Exchange API as a PostData.

    Returns None when the URL cannot be parsed, False when the API returns
    no items, otherwise a populated PostData. API access is serialized
    through ``GlobalVars.api_request_lock`` and API backoff directives are
    honored before and recorded after the request.

    BUGFIX: the lock is now released in a ``finally`` block — previously an
    exception from the HTTP request or JSON parsing left it held forever,
    deadlocking every later API call.
    """
    GlobalVars.api_request_lock.acquire()
    try:
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        d = parsing.fetch_post_id_and_site_from_url(post_url)
        if d is None:
            return None
        post_id, site, post_type = d
        # Filters select the response fields; answers and questions differ.
        if post_type == "answer":
            api_filter = r"!FdmhxNRjn0vYtGOu3FfS5xSwvL"
        else:
            assert post_type == "question"
            api_filter = r"!DEPw4-PqDduRmCwMBNAxrCdSZl81364qitC3TebCzqyF4-y*r2L"

        request_url = "https://api.stackexchange.com/2.2/{}s/{}".format(post_type, post_id)
        params = {
            'filter': api_filter,
            'key': 'IAkbitmze4B8KpacUfLqkw((',
            'site': site
        }
        response = requests.get(request_url, params=params).json()
        # Record the strictest backoff the API asked for.
        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]
        if 'items' not in response or len(response['items']) == 0:
            return False
    finally:
        GlobalVars.api_request_lock.release()

    item = response['items'][0]
    post_data = PostData()
    post_data.post_id = post_id
    post_data.post_url = parsing.url_to_shortlink(item['link'])
    post_data.post_type = post_type
    post_data.title = html.unescape(item['title'])
    if 'owner' in item and 'link' in item['owner']:
        post_data.owner_name = html.unescape(item['owner']['display_name'])
        post_data.owner_url = item['owner']['link']
        post_data.owner_rep = item['owner']['reputation']
    else:
        # Deleted/anonymous owner: fall back to neutral placeholder values.
        post_data.owner_name = ""
        post_data.owner_url = ""
        post_data.owner_rep = 1
    post_data.site = site
    post_data.body = item['body']
    post_data.score = item['score']
    post_data.up_vote_count = item['up_vote_count']
    post_data.down_vote_count = item['down_vote_count']
    post_data.creation_date = item['creation_date']
    # Key not present = the post was never edited.
    post_data.last_edit_date = item.get('last_edit_date', post_data.creation_date)
    if post_type == "answer":
        post_data.question_id = item['question_id']
    return post_data
# --- Example #23 (scraped sample separator) ---
def generate_user(config, user):
    """Build an RSS-style feed dict from a SoundCloud user's profile page.

    Scrapes the user's page (no API key needed), extracting the display
    name, description and every listed track with its duration and date.
    Returns the project's ("feed", feed_dict) tuple convention.
    """
    url = "https://www.soundcloud.com/" + user

    data = rssit.util.download(url)

    soup = bs4.BeautifulSoup(data, 'lxml')

    author = html.unescape(soup.find("meta", attrs={"property": "og:title"})["content"])

    # Optionally show the raw username instead of the display name.
    if config["author_username"]:
        author = user

    description = html.unescape(soup.find("p", attrs={"itemprop": "description"}).text).strip()
    if len(description) <= 0:
        description = "%s's soundcloud" % user

    feed = {
        "title": author,
        "description": description,
        "url": url,
        "author": user,
        "entries": []
    }

    tracks = soup.findAll("article", attrs={"itemprop": "track"})
    for track in tracks:
        tracka = track.find("a", attrs={"itemprop": "url"})
        trackname = tracka.text
        trackurl = urllib.parse.urljoin(url, tracka["href"])
        date = parse(track.find("time").text)

        title = trackname
        # Duration is an ISO-8601 period; render it as [HH:MM:SS].
        duration_delta = isodate.parse_duration(track.find("meta", attrs={"itemprop": "duration"})["content"])
        duration_seconds = duration_delta.total_seconds()
        duration_text = "[%s:%s:%s]" % (
            str(int(duration_seconds / 3600)).zfill(2),
            str(int((duration_seconds % 3600) / 60)).zfill(2),
            str(int(duration_seconds % 60)).zfill(2)
        )

        content = "<p>%s <a href='%s'>%s</a> by <a href='%s'>%s</a></p>" % (
            duration_text,

            trackurl,
            trackname,

            url,
            author
        )

        feed["entries"].append({
            "url": trackurl,
            "title": title,
            "content": content,
            "author": user,
            "date": date,
        })

    return ("feed", feed)
# --- Example #24 (scraped sample separator) ---
 def write_download(cls, out_path, download_path):
     """Convert a downloaded, tab-separated tweet file to JSON lines.

     Skips tweets whose text is 'Not Available'; every other row is
     appended to ``out_path`` as a JSON object, with the label mapped
     through ``cls.class_map``.
     """
     row = re.compile(r'(?:\d+\t)?(\d+)\t(negative|neutral|positive)\t(.+)')
     with open(download_path) as src, open(out_path, 'a+') as dst:
         for raw_line in src:
             doc_id, label, text = row.match(raw_line).groups()
             # Source text is HTML-escaped twice, so unescape twice.
             text = html.unescape(html.unescape(text))
             if text != 'Not Available':
                 dst.write(json.dumps({'id': doc_id, 'text': text, 'label': cls.class_map[label]}) + '\n')
# --- Example #25 (scraped sample separator) ---
 def write_test_2015(cls, out_path, input_path, label_path):
     """Join the 2015 test tweets with their gold labels into JSON lines.

     ``input_path`` rows look like 'NA <Tid> unknwn <text>' and
     ``label_path`` rows like '<n> <Tid> <label>' (tab-separated); the two
     files are aligned line by line and the ids are asserted to agree.
     """
     text_row = re.compile(r'NA\t(T\d+)\tunknwn\t(.+)')
     label_row = re.compile(r'\d+\t(T\d+)\t(negative|neutral|positive)')
     with open(input_path) as in_sr, \
             open(label_path) as labels_sr, \
             open(out_path, 'a+') as out_sr:
         for text_line, label_line in zip(in_sr, labels_sr):
             doc_id, text = text_row.match(text_line).groups()
             # Source text is HTML-escaped twice, so unescape twice.
             text = html.unescape(html.unescape(text))
             label_doc_id, label = label_row.match(label_line).groups()
             assert doc_id == label_doc_id
             out_sr.write(json.dumps({'id': doc_id, 'text': text, 'label': cls.class_map[label]}) + '\n')
# --- Example #26 (scraped sample separator) ---
 def _path2href( self, match ):
   """Regex callback: turn an output-file mention into an HTML link.

   Leaves the text unchanged when it names no known output file; links
   with class="plot" when the extension is whitelisted for inline display.
   ``match.group(1)`` is assumed to capture the extension — TODO confirm
   against the pattern this is registered with.
   """
   if match.group(0) not in core.listoutdir():
     return match.group(0)
   filename = html.unescape( match.group(0) )
   ext = html.unescape( match.group(1) )
   whitelist = ['.jpg','.png','.svg','.txt','.mp4','.webm'] + list(getattr(config, 'plot_extensions', []))
   fmt = '<a href="{href}"' + (' class="plot"' if ext in whitelist else '') + '>{name}</a>'
   return fmt.format( href=urllib.parse.quote( filename ), name=html.escape( filename ) )
# --- Example #27 (scraped sample separator) ---
 def insert_hyphenation(self, s):
     """Return ``s`` HTML-unescaped, hyphenated word-by-word when enabled.

     Hyphenation only applies when a hyphenator is configured, hyphenation
     is switched on, and we are not inside a header/subheader. Falsy input
     yields ''.
     """
     if not s:
         return ''
     hyphenation_on = (self.hyphenator and self.hyphenate
                       and not (self.header or self.subheader))
     if not hyphenation_on:
         return html.unescape(s)
     # Unescape each word before hyphenating so entities never get split.
     return ' '.join(self.hyphenate_word(html.unescape(w)) for w in s.split(' '))
# --- Example #28 (scraped sample separator) ---
 def checkArtistTypes(types):
   """Collect artist names from ``albumGroup`` for the given musicInfo keys.

   ``albumGroup`` comes from an enclosing scope (not visible here).
   NOTE(review): ``types`` is consumed destructively via pop(0), so the
   caller's list is mutated; exactly one name short-circuits, several
   names pull in every remaining type, and zero names recurses on the
   remaining types.
   """
   whatArtists = [unescape(x['name']) for x in albumGroup['group']['musicInfo'][types[0]]]
   types.pop(0)
   if len(whatArtists) == 1:
     return whatArtists
   elif len(whatArtists)>1:
     # Multiple primary artists: append the names from all remaining types too.
     return whatArtists+[unescape(x['name']) for y in types for x in albumGroup['group']['musicInfo'][y]]
   return checkArtistTypes(types) if len(types)>0 else []
def represent_question(question):
    """Flatten a Stack Exchange API question item into a small display dict.

    HTML entities in the title and owner name are decoded; the creation
    date, link and answered flag pass through unchanged.
    """
    owner_name = question['owner']['display_name']
    summary = {}
    summary['title'] = html.unescape(question['title'])
    summary['owner'] = html.unescape(owner_name)
    summary['date'] = question['creation_date']
    summary['link'] = question['link']
    summary['is_answered'] = question['is_answered']
    return summary
Example #30
0
 def _path2href( self, match ):
   """Replace a matched log-file path with an anchor tag when the file exists."""
   raw = match.group(0)
   # Leave the text untouched unless it names a real file under the log dir.
   if not os.path.exists( os.path.join( self._logdir, raw ) ):
     return raw
   name = html.unescape( raw )
   suffix = html.unescape( match.group(1) )
   # Only these extensions get the "plot" CSS class (inline rendering).
   plottable = ['.jpg','.png','.svg','.txt','.mp4','.webm'] + list( core.getprop( 'plot_extensions', [] ) )
   css = ' class="plot"' if suffix in plottable else ''
   template = '<a href="{href}"' + css + '>{name}</a>'
   return template.format( href=urllib.parse.quote( name ), name=html.escape( name ) )
    if 'Planetarium' in event['categoryCalendar']:
        print(f"{event['location']}: {event['title']}: {event['time_start']}: {event['categoryCalendar']}")
        break

# Challenge 5: Repeat challenge 4 also print Cost
print("\nChallenge 5: First event with categoryCalendar ending in 'Planetarium' including 'Cost'")
import html

# Find the first Planetarium event, print its summary line, then print the
# 'Cost' custom field both raw (with HTML entities) and decoded.
for event in events:
    if 'Planetarium' not in event['categoryCalendar']:
        continue
    print(f"{event['location']}: {event['title']}: {event['time_start']}: {event['categoryCalendar']}")
    for field in event['customFields']:
        if field['label'] != 'Cost':
            continue
        cost_with_entities = field['value']
        print(cost_with_entities)
        cost = html.unescape(cost_with_entities)
        print(cost)
        break
    break

# Challenge 6: Repeat challenge 4 using regular expression search on Planetarium.
print("\nChallenge 6: First event with categoryCalendar ending in 'Planetarium' including 'Cost' using regular expressions")
import re
# Same as Challenge 5, but the category match uses re.search instead of 'in'.
for event in events:
    if not re.search('Planetarium', event['categoryCalendar']):
        continue
    print(f"{event['location']}: {event['title']}: {event['time_start']}: {event['categoryCalendar']}")
    for field in event['customFields']:
        if field['label'] == 'Cost':
            # Decode HTML entities (e.g. &amp;) before printing the cost.
            print(html.unescape(field['value']))
            break
    break
Example #32
0
def html_to_plain_text(html):
    """Strip an HTML document down to plain text.

    Drops the <head> section, replaces opening anchor tags with the marker
    ' HYPERLINK ', removes every remaining tag, collapses runs of blank
    lines into one newline, and finally decodes HTML entities.
    """
    replacements = [
        ('<head.*?>.*?</head>', '', re.M | re.S | re.I),
        ('<a\s.*?>', ' HYPERLINK ', re.M | re.S | re.I),
        ('<.*?>', '', re.M | re.S),
        (r'(\s*\n)+', '\n', re.M | re.S),
    ]
    text = html
    for pattern, replacement, flags in replacements:
        text = re.sub(pattern, replacement, text, flags=flags)
    return unescape(text)
Example #33
0
def uni_norm(text):
    """Normalize curly quotes and non-breaking spaces to ASCII, then decode entities.

    Left/right single quotes (U+2018/U+2019) become ', left/right double
    quotes (U+201C/U+201D) become ", and U+00A0 becomes a plain space.
    """
    ascii_map = str.maketrans("\u2018\u2019\u201c\u201d\u00a0", "''\"\" ")
    return unescape(text.translate(ascii_map))
            "div",
            {"class": "card__content"})  # find all the cards on the page
        print(len(days), " number of cards found")
        for d in days:
            try:
                subhead = d.find_all(
                    "div", {"class": "date_day"})[0].decode_contents()
                event = d.find_all(
                    "h3",
                    {"class": "card__title heading"
                     })[0].find_all("a")[0].decode_contents(
                     )  # all h3 card__title headings have an a element inside
                if month in subhead and str(
                        int(day)
                ) in subhead:  # events related to other days appear on each day page
                    #print(subhead, event)
                    rows.append((int(m), int(day), unescape(event)))
                else:
                    pass
                    #print("wrong date")
            except:
                pass
        sleep(
            5
        )  # sleep for 5 seconds between requests to prevent spamming webserver
        # takes about 30 minutes to scrape a whole year

# Append every scraped (month, day, event) row as a tab-separated line.
with open("fun_holidays_4.txt", 'a', encoding="UTF-8") as out_file:
    out_file.writelines(
        '\t'.join(str(field) for field in row) + '\n' for row in rows
    )
Example #35
0
 def extract_title(entry):
     """Return the feed entry's HTML-decoded title, or None when absent/empty."""
     title = entry.get('title')
     return html.unescape(title) if title else None
Example #36
0
def messages(data, user, recipient, report_html, local, time_start, time_end, timeformat, operating_system):
    """Render the chat messages in *data* to console output or an HTML report.

    Iterates the rows of *data* (expects 'Date', 'Time', 'Author' and
    'Message' columns), keeps only the messages whose timestamp falls in
    [time_start, time_end], and formats each one either as HTML chat
    bubbles (when the module-level ``report_var`` is 'EN' or 'ES') or as
    colorama-colored console text (otherwise).

    NOTE(review): relies on module globals ``report_var``, ``color``,
    ``Fore``, ``getAttachediOS``/``getAttachedAndroid`` and ``report`` —
    none of them are parameters.
    """

    rep_med = ""  # Accumulates the complete chat (HTML report mode)
    rows = len(data.index)
    for i in data.index:
        try:
            report_msj = ""  # Body of the current message (HTML mode)
            report_name = ""  # Display name of the current sender (HTML mode)
            message = ""  # Console-formatted text of the current message
            # One-line progress indicator, overwritten in place via '\r'.
            sys.stdout.write("\rMessage {}/{}".format(str(i+1), str(rows)))
            sys.stdout.flush()
            # transform chat time in epoch local time
            time_parse = str(data['Date'][i]) + " " + str(data['Time'][i])
            utc_time = time.strptime(time_parse, timeformat)
            dt = time.mktime(utc_time)
            if time_start <= dt <= time_end:
                sender = str(data['Author'][i])
                # Resolve attachment placeholders per platform.
                if operating_system == "ios":
                    text = getAttachediOS(str(data['Message'][i]))
                else:
                    text = getAttachedAndroid(str(data['Message'][i]))

                # WhatsApp's end-to-end-encryption banner (Spanish or
                # English wording) is reclassified as a system message.
                if ("Los mensajes y las llamadas están cifrados de extremo a extremo. Nadie fuera de este chat, ni siquiera WhatsApp, puede leerlos ni escucharlos"
                    "" in text) or ("Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them") in text:
                    sender = "None"
                if sender == user:
                    # The device owner posted this message
                    if (report_var == 'EN') or (report_var == 'ES'):
                        report_name = user
                    else:
                        message = Fore.RED + "\n--------------------------------------------------------------------------------" + Fore.RESET + "\n"
                        message += Fore.GREEN + "From " + Fore.RESET + user + Fore.GREEN + " to " + Fore.RESET + recipient + "\n"

                elif sender == "None":
                    # The system posted this message
                    if report_var == 'EN':
                        report_name = "System Message"
                    elif report_var == 'ES':
                        report_name = "Mensaje de Sistema"
                    else:
                        message = Fore.RED + "\n--------------------------------------------------------------------------------" + Fore.RESET + "\n"
                        message += Fore.GREEN + "From " + Fore.RESET + "System\n"

                else:
                    # Another participant posted this message
                    if (report_var == 'EN') or (report_var == 'ES'):
                        report_name = "<font color='{}'> {} </font>".format(color.get(sender), sender)
                    else:
                        message = Fore.RED + "\n--------------------------------------------------------------------------------" + Fore.RESET + "\n"
                        message += Fore.GREEN + "From " + Fore.RESET + sender + Fore.GREEN + " to" + Fore.RESET + " Me\n"

                if (report_var == 'EN') or (report_var == 'ES'):
                    report_msj += text
                else:
                    # Console mode: decode HTML entities before printing.
                    message += Fore.GREEN + "Message: " + Fore.RESET + html.unescape(text) + "\n"

                report_time = "{} - {}".format(str(data['Date'][i]), str(data['Time'][i]))
                if (report_var == 'EN') or (report_var == 'ES'):
                    # HTML mode: pick the bubble template by sender kind.
                    if report_name == user:
                        rep_med += """
                <li>
                    <div class="bubble2">
                        <span class="personName2">""" + report_name + """</span><br>
                        <span class="personSay2">""" + report_msj + """</span><br>
                        <span class="time round2">""" + report_time + "&nbsp" + """</span><br>
                    </div>
                </li>"""
                    elif (report_name == "System Message") or (report_name == "Mensaje de Sistema"):
                        rep_med += """
                <li>
                    <div class="bubble-system"> 
                        <span class="time-system round">""" + report_time + "&nbsp" + """</span><br>
                        <span class="person-System">""" + report_msj + """</span><br>
                    </div>
                </li>"""
                    else:
                        rep_med += """
                <li>
                    <div class="bubble"> 
                        <span class="personName">""" + report_name + """</span><br>
                        <span class="personSay">""" + report_msj + """</span><br>
                        <span class="time round">""" + report_time + "&nbsp" + """</span><br>
                    </div>
                </li>"""
                elif report_var == 'None':
                    # Console mode: report_var holds the *string* "None".
                    message += Fore.GREEN + "Timestamp: " + Fore.RESET + report_time + "\n"
                    print(message)

        except Exception as e:
            # Broad per-row catch so one malformed row does not abort the run.
            print("\nError showing message details: {}, Message ID {}, Timestamp {}".format(e, str(i), data['Date'][i] + ", " + data['Time'][i]))

        # NOTE(review): report() runs on EVERY iteration with the chat
        # accumulated so far — confirm it overwrites rather than appends,
        # otherwise this belongs after the loop.
        if report_var != "None":
            report(rep_med, report_html, local)
Example #37
0
    def extract_link(self, parse_url, link):
        """Normalize *link* found on page *parse_url* and record discoveries.

        Decodes HTML entities, resolves the link to an absolute URL,
        filters it against the extension blacklist and the configured
        keywords, then registers (under ``self._value_lock``) any new root
        domain, subdomain or API endpoint and queues new URLs for crawling.

        Returns False when the link is rejected (javascript:, keyword
        mismatch, blacklisted extension); otherwise returns None after
        registration (callers do not use a success value).
        """
        # Decode HTML entities (&amp; etc.) before any parsing.
        link = unescape(link)
        # Extension of the link's file name, checked against the blacklist.
        filename = os.path.basename(link)
        file_extend = self.get_file_extend(filename)
        # Resolve the (possibly relative) link to an absolute URL.
        if link.startswith(
            ('http://',
             'https://')) and file_extend not in self.black_extend_list:
            full_url = link
        elif link.startswith('javascript:'):
            return False
        elif link.startswith('////') and len(link) > 4:
            # NOTE(review): '////host' yields 'http:////…' — looks odd but
            # preserved as-is from the original logic.
            full_url = 'http://' + link[2:]
        elif link.startswith('//') and len(link) > 2:
            full_url = 'http:' + link
        elif link.startswith('/'):
            full_url = parse_url.scheme + '://' + parse_url.netloc + link
        elif link.startswith('./'):
            full_url = parse_url.scheme + '://' + parse_url.netloc + parse_url.path + link[
                1:]
        else:
            full_url = parse_url.scheme + '://' + parse_url.netloc + parse_url.path + '/' + link
        # Registered (root) domain and full host of the crawled link.
        extract_full_url_domain = extract(full_url)
        root_domain = extract_full_url_domain.domain + '.' + extract_full_url_domain.suffix
        sub_domain = urlparse(full_url).netloc
        # Keep only links whose root domain matches one of our keywords.
        if not any(keyword in root_domain for keyword in self.keywords):
            return False
        # Register a newly-seen root domain and queue it for crawling.
        with self._value_lock:
            if root_domain not in self.root_domains:
                self.root_domains.append(root_domain)
                logger.info(
                    '[+]Find a new root domain ==> {}'.format(root_domain))
                if root_domain not in self.extract_urls:
                    self.extract_urls.append(root_domain)
                    self.queue.put('http://' + root_domain)
        # Register a newly-seen subdomain and queue it for crawling.
        with self._value_lock:
            if sub_domain not in self.sub_domains and sub_domain != root_domain:
                self.sub_domains.append(sub_domain)
                logger.info(
                    '[+]Find a new subdomain ==> {}'.format(sub_domain))
                if sub_domain not in self.extract_urls:
                    self.extract_urls.append(sub_domain)
                    self.queue.put('http://' + sub_domain)
        if file_extend in self.black_extend_list:
            return False
        # Anything that is not an html/js page is recorded as an API endpoint.
        with self._value_lock:
            if full_url not in self.apis and file_extend != 'html' and file_extend != 'js':
                self.apis.append(full_url)
                # logger.info('[+]Find a new api in {}'.format(parse_url.netloc))

        format_url = self.get_format_url(urlparse(full_url), filename,
                                         file_extend)

        # Queue the canonical form of the URL if it has not been seen yet.
        with self._value_lock:
            if format_url not in self.extract_urls:
                self.extract_urls.append(format_url)
                self.queue.put(full_url)
Example #38
0
    def invoice_validate(self):
        detalles = []
        subtotal = 0
        for factura in self:
            if factura.journal_id.usuario_fel and not factura.firma_fel:
                attr_qname = etree.QName(
                    "http://www.w3.org/2001/XMLSchema-instance",
                    "schemaLocation")

                NSMAP = {
                    "ds": "http://www.w3.org/2000/09/xmldsig#",
                    "dte": "http://www.sat.gob.gt/dte/fel/0.2.0",
                }

                NSMAP_REF = {
                    "cno":
                    "http://www.sat.gob.gt/face2/ComplementoReferenciaNota/0.1.0",
                }

                NSMAP_ABONO = {
                    "cfc": "http://www.sat.gob.gt/dte/fel/CompCambiaria/0.1.0",
                }

                NSMAP_EXP = {
                    "cex":
                    "http://www.sat.gob.gt/face2/ComplementoExportaciones/0.1.0",
                }

                NSMAP_FE = {
                    "cfe":
                    "http://www.sat.gob.gt/face2/ComplementoFacturaEspecial/0.1.0",
                }

                DTE_NS = "{http://www.sat.gob.gt/dte/fel/0.2.0}"
                DS_NS = "{http://www.w3.org/2000/09/xmldsig#}"
                CNO_NS = "{http://www.sat.gob.gt/face2/ComplementoReferenciaNota/0.1.0}"
                CFE_NS = "{http://www.sat.gob.gt/face2/ComplementoFacturaEspecial/0.1.0}"
                CEX_NS = "{http://www.sat.gob.gt/face2/ComplementoExportaciones/0.1.0}"
                CFC_NS = "{http://www.sat.gob.gt/dte/fel/CompCambiaria/0.1.0}"

                # GTDocumento = etree.Element(DTE_NS+"GTDocumento", {attr_qname: "http://www.sat.gob.gt/dte/fel/0.2.0"}, Version="0.4", nsmap=NSMAP)
                GTDocumento = etree.Element(DTE_NS + "GTDocumento", {},
                                            Version="0.1",
                                            nsmap=NSMAP)
                SAT = etree.SubElement(GTDocumento,
                                       DTE_NS + "SAT",
                                       ClaseDocumento="dte")
                DTE = etree.SubElement(SAT,
                                       DTE_NS + "DTE",
                                       ID="DatosCertificados")
                DatosEmision = etree.SubElement(DTE,
                                                DTE_NS + "DatosEmision",
                                                ID="DatosEmision")

                tipo_documento_fel = factura.journal_id.tipo_documento_fel
                if tipo_documento_fel in ['FACT', 'FACM'
                                          ] and factura.type == 'out_refund':
                    tipo_documento_fel = 'NCRE'

                moneda = "GTQ"
                if factura.currency_id.id != factura.company_id.currency_id.id:
                    moneda = "USD"

                fecha = fields.Date.from_string(
                    factura.date_invoice).strftime('%Y-%m-%d')
                hora = fields.Datetime.context_timestamp(
                    factura, timestamp=datetime.now()).strftime('%H:%M:%S')
                fecha_hora = fecha + 'T' + hora
                DatosGenerales = etree.SubElement(DatosEmision,
                                                  DTE_NS + "DatosGenerales",
                                                  CodigoMoneda=moneda,
                                                  FechaHoraEmision=fecha_hora,
                                                  Tipo=tipo_documento_fel)
                if factura.tipo_gasto == 'importacion':
                    DatosGenerales.attrib['Exp'] = "SI"

                Emisor = etree.SubElement(
                    DatosEmision,
                    DTE_NS + "Emisor",
                    AfiliacionIVA="GEN",
                    CodigoEstablecimiento=factura.journal_id.
                    codigo_establecimiento_fel,
                    CorreoEmisor="",
                    NITEmisor=factura.company_id.vat.replace('-', ''),
                    NombreComercial=factura.journal_id.direccion.name,
                    NombreEmisor=factura.company_id.name)
                DireccionEmisor = etree.SubElement(Emisor,
                                                   DTE_NS + "DireccionEmisor")
                Direccion = etree.SubElement(DireccionEmisor,
                                             DTE_NS + "Direccion")
                Direccion.text = factura.journal_id.direccion.street or 'Ciudad'
                CodigoPostal = etree.SubElement(DireccionEmisor,
                                                DTE_NS + "CodigoPostal")
                CodigoPostal.text = factura.journal_id.direccion.zip or '01001'
                Municipio = etree.SubElement(DireccionEmisor,
                                             DTE_NS + "Municipio")
                Municipio.text = factura.journal_id.direccion.city or 'Guatemala'
                Departamento = etree.SubElement(DireccionEmisor,
                                                DTE_NS + "Departamento")
                Departamento.text = factura.journal_id.direccion.state_id.name if factura.journal_id.direccion.state_id else ''
                Pais = etree.SubElement(DireccionEmisor, DTE_NS + "Pais")
                Pais.text = factura.journal_id.direccion.country_id.code or 'GT'

                nit_receptor = 'CF'
                if factura.partner_id.vat:
                    nit_receptor = factura.partner_id.vat.replace('-', '')
                if tipo_documento_fel == "FESP" and factura.partner_id.cui:
                    nit_receptor = factura.partner_id.cui
                Receptor = etree.SubElement(
                    DatosEmision,
                    DTE_NS + "Receptor",
                    IDReceptor=nit_receptor,
                    NombreReceptor=factura.partner_id.name)
                if factura.partner_id.nombre_facturacion_fel:
                    Receptor.attrib[
                        'NombreReceptor'] = factura.partner_id.nombre_facturacion_fel
                if factura.partner_id.email:
                    Receptor.attrib[
                        'CorreoReceptor'] = factura.partner_id.email
                if tipo_documento_fel == "FESP" and factura.partner_id.cui:
                    Receptor.attrib['TipoEspecial'] = "CUI"

                DireccionReceptor = etree.SubElement(
                    Receptor, DTE_NS + "DireccionReceptor")
                Direccion = etree.SubElement(DireccionReceptor,
                                             DTE_NS + "Direccion")
                Direccion.text = (factura.partner_id.street or '') + ' ' + (
                    factura.partner_id.street2 or '')
                # Direccion.text = " "
                CodigoPostal = etree.SubElement(DireccionReceptor,
                                                DTE_NS + "CodigoPostal")
                CodigoPostal.text = factura.partner_id.zip or '01001'
                Municipio = etree.SubElement(DireccionReceptor,
                                             DTE_NS + "Municipio")
                Municipio.text = factura.partner_id.city or 'Guatemala'
                Departamento = etree.SubElement(DireccionReceptor,
                                                DTE_NS + "Departamento")
                Departamento.text = factura.partner_id.state_id.name if factura.partner_id.state_id else ''
                Pais = etree.SubElement(DireccionReceptor, DTE_NS + "Pais")
                Pais.text = factura.partner_id.country_id.code or 'GT'

                if tipo_documento_fel not in [
                        'NDEB', 'NCRE', 'RECI', 'NABN', 'FESP'
                ]:
                    ElementoFrases = etree.fromstring(
                        factura.company_id.frases_fel)
                    if factura.tipo_gasto == 'importacion':
                        Frase = etree.SubElement(ElementoFrases,
                                                 DTE_NS + "Frase",
                                                 CodigoEscenario="1",
                                                 TipoFrase="4")
                    DatosEmision.append(ElementoFrases)

                Items = etree.SubElement(DatosEmision, DTE_NS + "Items")

                linea_num = 0
                gran_subtotal = 0
                gran_total = 0
                gran_total_impuestos = 0
                cantidad_impuestos = 0
                for linea in factura.invoice_line_ids:

                    if linea.quantity * linea.price_unit == 0:
                        continue

                    linea_num += 1

                    tipo_producto = "B"
                    if linea.product_id.type == 'service':
                        tipo_producto = "S"
                    precio_unitario = linea.price_unit * (100 -
                                                          linea.discount) / 100
                    precio_sin_descuento = linea.price_unit
                    descuento = precio_sin_descuento * linea.quantity - precio_unitario * linea.quantity
                    precio_unitario_base = linea.price_subtotal / linea.quantity
                    total_linea = precio_unitario * linea.quantity
                    total_linea_base = precio_unitario_base * linea.quantity
                    total_impuestos = total_linea - total_linea_base
                    cantidad_impuestos += len(linea.invoice_line_tax_ids)

                    Item = etree.SubElement(Items,
                                            DTE_NS + "Item",
                                            BienOServicio=tipo_producto,
                                            NumeroLinea=str(linea_num))
                    Cantidad = etree.SubElement(Item, DTE_NS + "Cantidad")
                    Cantidad.text = str(linea.quantity)
                    UnidadMedida = etree.SubElement(Item,
                                                    DTE_NS + "UnidadMedida")
                    UnidadMedida.text = "UNI"
                    Descripcion = etree.SubElement(Item,
                                                   DTE_NS + "Descripcion")
                    Descripcion.text = linea.name
                    PrecioUnitario = etree.SubElement(
                        Item, DTE_NS + "PrecioUnitario")
                    PrecioUnitario.text = '{:.6f}'.format(precio_sin_descuento)
                    Precio = etree.SubElement(Item, DTE_NS + "Precio")
                    Precio.text = '{:.6f}'.format(precio_sin_descuento *
                                                  linea.quantity)
                    Descuento = etree.SubElement(Item, DTE_NS + "Descuento")
                    Descuento.text = '{:.6f}'.format(descuento)
                    if len(linea.invoice_line_tax_ids) > 0:
                        Impuestos = etree.SubElement(Item,
                                                     DTE_NS + "Impuestos")
                        Impuesto = etree.SubElement(Impuestos,
                                                    DTE_NS + "Impuesto")
                        NombreCorto = etree.SubElement(Impuesto,
                                                       DTE_NS + "NombreCorto")
                        NombreCorto.text = "IVA"
                        CodigoUnidadGravable = etree.SubElement(
                            Impuesto, DTE_NS + "CodigoUnidadGravable")
                        CodigoUnidadGravable.text = "1"
                        if factura.tipo_gasto == 'importacion':
                            CodigoUnidadGravable.text = "2"
                        MontoGravable = etree.SubElement(
                            Impuesto, DTE_NS + "MontoGravable")
                        MontoGravable.text = '{:.2f}'.format(
                            factura.currency_id.round(total_linea_base))
                        MontoImpuesto = etree.SubElement(
                            Impuesto, DTE_NS + "MontoImpuesto")
                        MontoImpuesto.text = '{:.2f}'.format(
                            factura.currency_id.round(total_impuestos))
                    Total = etree.SubElement(Item, DTE_NS + "Total")
                    Total.text = '{:.2f}'.format(
                        factura.currency_id.round(total_linea))

                    gran_total += factura.currency_id.round(total_linea)
                    gran_subtotal += factura.currency_id.round(
                        total_linea_base)
                    gran_total_impuestos += factura.currency_id.round(
                        total_impuestos)

                Totales = etree.SubElement(DatosEmision, DTE_NS + "Totales")
                if cantidad_impuestos > 0:
                    TotalImpuestos = etree.SubElement(
                        Totales, DTE_NS + "TotalImpuestos")
                    TotalImpuesto = etree.SubElement(
                        TotalImpuestos,
                        DTE_NS + "TotalImpuesto",
                        NombreCorto="IVA",
                        TotalMontoImpuesto='{:.2f}'.format(
                            factura.currency_id.round(gran_total_impuestos)))
                GranTotal = etree.SubElement(Totales, DTE_NS + "GranTotal")
                GranTotal.text = '{:.2f}'.format(
                    factura.currency_id.round(gran_total))

                if factura.company_id.adenda_fel:
                    Adenda = etree.SubElement(SAT, DTE_NS + "Adenda")
                    exec(factura.company_id.adenda_fel, {
                        'etree': etree,
                        'Adenda': Adenda,
                        'factura': factura
                    })

                # En todos estos casos, es necesario enviar complementos
                if tipo_documento_fel in [
                        'NDEB', 'NCRE'
                ] or tipo_documento_fel in [
                        'FCAM'
                ] or (tipo_documento_fel in ['FACT', 'FCAM']
                      and factura.tipo_gasto
                      == 'importacion') or tipo_documento_fel in ['FESP']:
                    Complementos = etree.SubElement(DatosEmision,
                                                    DTE_NS + "Complementos")

                    if tipo_documento_fel in ['NDEB', 'NCRE']:
                        Complemento = etree.SubElement(
                            Complementos,
                            DTE_NS + "Complemento",
                            IDComplemento="ReferenciasNota",
                            NombreComplemento="Nota de Credito" if
                            tipo_documento_fel == 'NCRE' else "Nota de Debito",
                            URIComplemento="text")
                        if factura.factura_original_id.numero_fel:
                            ReferenciasNota = etree.SubElement(
                                Complemento,
                                CNO_NS + "ReferenciasNota",
                                FechaEmisionDocumentoOrigen=str(
                                    factura.factura_original_id.date_invoice),
                                MotivoAjuste="-",
                                NumeroAutorizacionDocumentoOrigen=factura.
                                factura_original_id.firma_fel,
                                NumeroDocumentoOrigen=factura.
                                factura_original_id.numero_fel,
                                SerieDocumentoOrigen=factura.
                                factura_original_id.serie_fel,
                                Version="0.0",
                                nsmap=NSMAP_REF)
                        else:
                            ReferenciasNota = etree.SubElement(
                                Complemento,
                                CNO_NS + "ReferenciasNota",
                                RegimenAntiguo="Antiguo",
                                FechaEmisionDocumentoOrigen=str(
                                    factura.factura_original_id.date_invoice),
                                MotivoAjuste="-",
                                NumeroAutorizacionDocumentoOrigen=factura.
                                factura_original_id.firma_fel,
                                NumeroDocumentoOrigen=factura.
                                factura_original_id.name.split("-")[1],
                                SerieDocumentoOrigen=factura.
                                factura_original_id.name.split("-")[0],
                                Version="0.0",
                                nsmap=NSMAP_REF)

                    if tipo_documento_fel in ['FCAM']:
                        Complemento = etree.SubElement(
                            Complementos,
                            DTE_NS + "Complemento",
                            IDComplemento="FCAM",
                            NombreComplemento="AbonosFacturaCambiaria",
                            URIComplemento="#AbonosFacturaCambiaria")
                        AbonosFacturaCambiaria = etree.SubElement(
                            Complemento,
                            CFC_NS + "AbonosFacturaCambiaria",
                            Version="1",
                            nsmap=NSMAP_ABONO)
                        Abono = etree.SubElement(AbonosFacturaCambiaria,
                                                 CFC_NS + "Abono")
                        NumeroAbono = etree.SubElement(Abono,
                                                       CFC_NS + "NumeroAbono")
                        NumeroAbono.text = "1"
                        FechaVencimiento = etree.SubElement(
                            Abono, CFC_NS + "FechaVencimiento")
                        FechaVencimiento.text = str(factura.date_due)
                        MontoAbono = etree.SubElement(Abono,
                                                      CFC_NS + "MontoAbono")
                        MontoAbono.text = '{:.2f}'.format(
                            factura.currency_id.round(gran_total))

                    if tipo_documento_fel in [
                            'FACT', 'FCAM'
                    ] and factura.tipo_gasto == 'importacion':
                        Complemento = etree.SubElement(
                            Complementos,
                            DTE_NS + "Complemento",
                            IDComplemento="text",
                            NombreComplemento="text",
                            URIComplemento="text")
                        Exportacion = etree.SubElement(Complemento,
                                                       CEX_NS + "Exportacion",
                                                       Version="1",
                                                       nsmap=NSMAP_EXP)
                        NombreConsignatarioODestinatario = etree.SubElement(
                            Exportacion,
                            CEX_NS + "NombreConsignatarioODestinatario")
                        NombreConsignatarioODestinatario.text = factura.consignatario_fel.name if factura.consignatario_fel else "-"
                        DireccionConsignatarioODestinatario = etree.SubElement(
                            Exportacion,
                            CEX_NS + "DireccionConsignatarioODestinatario")
                        DireccionConsignatarioODestinatario.text = factura.consignatario_fel.street or "-" if factura.consignatario_fel else "-"
                        NombreComprador = etree.SubElement(
                            Exportacion, CEX_NS + "NombreComprador")
                        NombreComprador.text = factura.comprador_fel.name if factura.comprador_fel else "-"
                        DireccionComprador = etree.SubElement(
                            Exportacion, CEX_NS + "DireccionComprador")
                        DireccionComprador.text = factura.comprador_fel.street or "-" if factura.comprador_fel else "-"
                        INCOTERM = etree.SubElement(Exportacion,
                                                    CEX_NS + "INCOTERM")
                        INCOTERM.text = factura.incoterm_fel or "-"
                        NombreExportador = etree.SubElement(
                            Exportacion, CEX_NS + "NombreExportador")
                        NombreExportador.text = factura.exportador_fel.name if factura.exportador_fel else "-"
                        CodigoExportador = etree.SubElement(
                            Exportacion, CEX_NS + "CodigoExportador")
                        CodigoExportador.text = factura.exportador_fel.ref or "-" if factura.exportador_fel else "-"

                    if tipo_documento_fel in ['FESP']:
                        total_isr = abs(factura.amount_tax)

                        total_iva_retencion = 0
                        for impuesto in factura.tax_line_ids:
                            if impuesto.amount > 0:
                                total_iva_retencion += impuesto.amount

                        Complemento = etree.SubElement(
                            Complementos,
                            DTE_NS + "Complemento",
                            IDComplemento="text",
                            NombreComplemento="text",
                            URIComplemento="text")
                        RetencionesFacturaEspecial = etree.SubElement(
                            Complemento,
                            CFE_NS + "RetencionesFacturaEspecial",
                            Version="1",
                            nsmap=NSMAP_FE)
                        RetencionISR = etree.SubElement(
                            RetencionesFacturaEspecial,
                            CFE_NS + "RetencionISR")
                        RetencionISR.text = str(total_isr)
                        RetencionIVA = etree.SubElement(
                            RetencionesFacturaEspecial,
                            CFE_NS + "RetencionIVA")
                        RetencionIVA.text = str(total_iva_retencion)
                        TotalMenosRetenciones = etree.SubElement(
                            RetencionesFacturaEspecial,
                            CFE_NS + "TotalMenosRetenciones")
                        TotalMenosRetenciones.text = str(factura.amount_total)

                xml_sin_firma = etree.tostring(
                    GTDocumento, encoding="UTF-8").decode("utf-8")
                logging.warn(xml_sin_firma)

                # signature = xmlsig.template.create(
                #     xmlsig.constants.TransformInclC14N,
                #     xmlsig.constants.TransformRsaSha256,
                #     "Signature"
                # )
                # signature_id = utils.get_unique_id()
                # ref_datos = xmlsig.template.add_reference(
                #     signature, xmlsig.constants.TransformSha256, uri="#DatosEmision"
                # )
                # xmlsig.template.add_transform(ref_datos, xmlsig.constants.TransformEnveloped)
                # ref_prop = xmlsig.template.add_reference(
                #     signature, xmlsig.constants.TransformSha256, uri_type="http://uri.etsi.org/01903#SignedProperties", uri="#" + signature_id
                # )
                # xmlsig.template.add_transform(ref_prop, xmlsig.constants.TransformInclC14N)
                # ki = xmlsig.template.ensure_key_info(signature)
                # data = xmlsig.template.add_x509_data(ki)
                # xmlsig.template.x509_data_add_certificate(data)
                # xmlsig.template.x509_data_add_subject_name(data)
                # serial = xmlsig.template.x509_data_add_issuer_serial(data)
                # xmlsig.template.x509_issuer_serial_add_issuer_name(serial)
                # xmlsig.template.x509_issuer_serial_add_serial_number(serial)
                # qualifying = template.create_qualifying_properties(
                #     signature, name=utils.get_unique_id()
                # )
                # props = template.create_signed_properties(
                #     qualifying, name=signature_id, datetime=fecha_hora
                # )
                #
                # GTDocumento.append(signature)
                # ctx = XAdESContext()
                # with open(path.join("/home/odoo/megaprint_leplan", "51043491-6747a80bb6a554ae.pfx"), "rb") as key_file:
                #     ctx.load_pkcs12(crypto.load_pkcs12(key_file.read(), "Planeta123$"))
                # ctx.sign(signature)
                # ctx.verify(signature)
                # DatosEmision.remove(SingatureTemp)

                # xml_con_firma = etree.tostring(GTDocumento, encoding="utf-8").decode("utf-8")

                request_url = "apiv2"
                request_path = ""
                if factura.company_id.pruebas_fel:
                    request_url = "dev2.api"
                    request_path = ""

                headers = {"Content-Type": "application/xml"}
                data = '<?xml version="1.0" encoding="UTF-8"?><SolicitaTokenRequest><usuario>{}</usuario><apikey>{}</apikey></SolicitaTokenRequest>'.format(
                    factura.journal_id.usuario_fel,
                    factura.journal_id.clave_fel)
                r = requests.post('https://' + request_url +
                                  '.ifacere-fel.com/' + request_path +
                                  'api/solicitarToken',
                                  data=data,
                                  headers=headers)
                resultadoXML = etree.XML(bytes(r.text, encoding='utf-8'))

                if len(resultadoXML.xpath("//token")) > 0:
                    token = resultadoXML.xpath("//token")[0].text
                    uuid_factura = str(
                        uuid.uuid5(uuid.NAMESPACE_OID,
                                   str(factura.id))).upper()

                    headers = {
                        "Content-Type": "application/xml",
                        "authorization": "Bearer " + token
                    }
                    data = '<?xml version="1.0" encoding="UTF-8"?><FirmaDocumentoRequest id="{}"><xml_dte><![CDATA[{}]]></xml_dte></FirmaDocumentoRequest>'.format(
                        uuid_factura, xml_sin_firma)
                    r = requests.post(
                        'https://api.soluciones-mega.com/api/solicitaFirma',
                        data=data.encode('utf-8'),
                        headers=headers)
                    logging.warn(r.text)
                    resultadoXML = etree.XML(bytes(r.text, encoding='utf-8'))
                    if len(resultadoXML.xpath("//xml_dte")) > 0:
                        xml_con_firma = html.unescape(
                            resultadoXML.xpath("//xml_dte")[0].text)

                        headers = {
                            "Content-Type": "application/xml",
                            "authorization": "Bearer " + token
                        }
                        data = '<?xml version="1.0" encoding="UTF-8"?><RegistraDocumentoXMLRequest id="{}"><xml_dte><![CDATA[{}]]></xml_dte></RegistraDocumentoXMLRequest>'.format(
                            uuid_factura, xml_con_firma)
                        logging.warn(data)
                        r = requests.post('https://' + request_url +
                                          '.ifacere-fel.com/' + request_path +
                                          'api/registrarDocumentoXML',
                                          data=data.encode('utf-8'),
                                          headers=headers)
                        resultadoXML = etree.XML(
                            bytes(r.text, encoding='utf-8'))

                        if len(resultadoXML.xpath("//listado_errores")) == 0:
                            xml_certificado = html.unescape(
                                resultadoXML.xpath("//xml_dte")[0].text)
                            xml_certificado_root = etree.XML(
                                bytes(xml_certificado, encoding='utf-8'))
                            numero_autorizacion = xml_certificado_root.find(
                                ".//{http://www.sat.gob.gt/dte/fel/0.2.0}NumeroAutorizacion"
                            )

                            factura.firma_fel = numero_autorizacion.text
                            factura.name = numero_autorizacion.get(
                                "Serie") + "-" + numero_autorizacion.get(
                                    "Numero")
                            factura.serie_fel = numero_autorizacion.get(
                                "Serie")
                            factura.numero_fel = numero_autorizacion.get(
                                "Numero")

                            headers = {
                                "Content-Type": "application/xml",
                                "authorization": "Bearer " + token
                            }
                            data = '<?xml version="1.0" encoding="UTF-8"?><RetornaPDFRequest><uuid>{}</uuid></RetornaPDFRequest>'.format(
                                factura.firma_fel)
                            r = requests.post('https://' + request_url +
                                              '.ifacere-fel.com/' +
                                              request_path + 'api/retornarPDF',
                                              data=data,
                                              headers=headers)
                            resultadoXML = etree.XML(
                                bytes(r.text, encoding='utf-8'))
                            if len(resultadoXML.xpath(
                                    "//listado_errores")) == 0:
                                pdf = resultadoXML.xpath("//pdf")[0].text
                                factura.pdf_fel = pdf
                        else:
                            raise UserError(r.text)
                    else:
                        raise UserError(r.text)
                else:
                    raise UserError(r.text)

        return super(AccountInvoice, self).invoice_validate()
# Example #39 (0)
def get_title(html, url):
    """Return the text of the first ``<h6>`` element in *html*, with HTML
    entities decoded and surrounding whitespace stripped.

    ``url`` is accepted for interface compatibility but is not used here.
    Raises AttributeError when no ``<h6>`` heading is present.
    """
    match = re.search(r"<h6[^>]*>([^<]+)", html)
    heading = unescape(match.group(1))
    return heading.strip()
# Example #40 (0)
# --- Test-data generation script ---
# Streams reviews from parse(path) (defined elsewhere) and writes
# "<mark>\t<lowercased review text>" lines to data/test.txt, skipping the
# first `startFrom` reviews and printing progress every 100 items.
print("Path: " + path)
print("Start From: " + str(startFrom))

print("Starting test data generation...")
print("------------")

# Raw string: '[\S]' is an invalid escape sequence in a plain literal on
# modern Python; the pattern matches any non-whitespace character.
p = re.compile(r'[\S]+')

i = 1
# Context manager guarantees the output file is closed even if parse() or
# a write raises (the original open()/close() pair was not exception-safe).
with open("data/test.txt", 'w+') as f:
    for mReview in parse(path):
        if i > startFrom:
            if i % 100 == 0:
                print(i)
            # Keep only reviews containing at least one non-space character.
            if p.search(str(mReview['reviewText'])):
                try:
                    f.write(
                        getMark(int(mReview['overall'])) + "\t" +
                        html.unescape(mReview['reviewText']).lower() + "\n")
                except UnicodeEncodeError:
                    # Fall back to the raw text when it cannot be encoded.
                    f.write(
                        getMark(int(mReview['overall'])) + "\t" +
                        mReview['reviewText'] + "\n")
        i += 1

print("------------")
print("End parsing test data!")
# Example #41 (0)
    course_credit_description = course_info_table.find_next("p")
    try:
        course_credit_type = course_credit_description.string.split(
            ") ")[0][1:]
    except AttributeError:
        course_credit_type = course_credit_description.contents[0].split(
            ") ")[0][1:]
    if len(course_credit_type) > 30:
        course_credit_type = "N.A"

    course_prereqs_raw = course_credit_description.find_next("p")
    course_prereqs = " ".join([
        tag.text if type(tag) == bs4.Tag else tag
        for tag in course_prereqs_raw.contents
    ])
    course_prereqs = html.unescape(course_prereqs)
    #if type(course_prereqs_raw.contents) == list:
    #    course_prereqs = " ".join([tag.text for tag in
    #                               course_prereqs_raw.contents])
    #elif type(course_prereqs_raw.contents) == bs4.Tag:
    #    course_prereqs = " ".join(course_prereqs_raw.text)
    #else:
    #    raise Exception

    course_details = course_info_table.find_next("tbody")
    course_id = course_details.find_next("td")
    course_section = course_id.find_next("td")
    course_credits = course_section.find_next("td")
    course_capacity = course_credits.find_next("td")
    course_enrolled = course_capacity.find_next("td")
    course_instructors = course_enrolled.find_next("td")
# Example #42 (0)
 def test_unescape_method(self):
     """Deprecated parser.unescape() must warn and agree with html.unescape()."""
     from html import unescape
     collector = self.get_collector()
     entities = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
     # Only the unescape() call itself is expected to emit the warning.
     with self.assertWarns(DeprecationWarning):
         self.assertEqual(collector.unescape(entities), unescape(entities))
# Example #43 (0)
 def generate_question(self):
     """Advance to the next question and return it formatted as 'Q.<n>: <text> '."""
     item = self.list[self.num]
     self.current = item
     self.num += 1
     decoded = html.unescape(item.text)
     return f'Q.{self.num}: {decoded} '
# Example #44 (0)
    def _preprocess_v2(self, text: str) -> str:
        """Normalize *text* for AraBERT-v2-style models.

        Each step is gated by a configuration flag set elsewhere on the
        instance: HTML-entity unescaping, tashkeel/tatweel stripping,
        URL/email/mention token replacement, HTML markup removal,
        Hindi-to-Arabic digit mapping, repeated-character reduction,
        whitespace insertion around non-Arabic characters, slash-to-dash
        replacement and rejected-character removal. For the
        "bert-*-arabertv2" model names the result is additionally passed
        through Farasa segmentation. Returns the normalized,
        single-spaced string.
        """
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # Replace all possible URLs with the [رابط] token
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # Replace emails with the [بريد] token
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # Replace mentions with the [مستخدم] token
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(hindi_to_arabic_map)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)",
                          r" \1 \2 ", text)
            text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)",
                          r" \1 \2 ", text)

        if self.replace_slash_with_dash:
            text = text.replace("/", "-")

        # remove unwanted characters (optionally preserving emoji)
        if self.keep_emojis:
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
            rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regex, " ", text)

        # remove extra spaces (and the U+FE0F emoji variation selector)
        text = " ".join(text.replace("\uFE0F", "").split())

        if (self.model_name == "bert-base-arabertv2"
                or self.model_name == "bert-large-arabertv2"):
            if self.keep_emojis:
                # Segment word-by-word so emoji tokens pass through unsegmented.
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa segmentation
        return text
# Example #45 (0)
def amend_html_symbols(string: str) -> str:
    """Decode HTML character entities in *string* (e.g. '&amp;' -> '&')."""
    decoded = html.unescape(string)
    return decoded
# Example #46 (0)
def init_oca(fname, message_stream=sys.stdout):
    """Import OCA license-holder records from the CSV file *fname*.

    Reads the header row to build a namedtuple per record, then upserts
    LicenseHolder rows and rebuilds TeamHint rows in batches of 3000
    inside atomic transactions. Progress and per-record summaries are
    written to *message_stream*.

    Fixes vs. previous revision: the bare ``except:`` around the optional
    'city' field is narrowed to ``AttributeError`` (the only error a
    missing namedtuple field raises), the unused ``html_parser`` local is
    removed, and the misspelled internal helper is renamed.
    """
    if fname == '_':
        fname = fnameDefault

    # Console streams may not accept non-ASCII; strip diacritics there.
    if message_stream == sys.stdout or message_stream == sys.stderr:
        def message_stream_write(s):
            message_stream.write(removeDiacritic(s))
    else:
        def message_stream_write(s):
            message_stream.write(u'{}'.format(s))

    tstart = datetime.datetime.now()

    fix_bad_license_codes()

    discipline_id = dict(
        (discipline, Discipline.objects.get(name=discipline))
        for discipline in ['Road', 'Track', 'Cyclocross', 'MTB', 'Para']
    )
    # Source columns that indicate membership in each discipline.
    discipline_cols = {
        'Road':        ['national_road', 'provincial_road'],
        'Cyclocross':  ['national_cyclocross', 'provincial_cyclocross'],
        'Track':       ['track'],
        'MTB':         ['cross_country', 'provincial_cross_country', 'downhill', 'fourx'],
        'Para':        ['para_cycling'],
    }

    effective_date = datetime.date.today()

    # Process the records in larger transactions for performance.
    @transaction.atomic
    def process_ur_records(ur_records):
        for i, ur in ur_records:
            try:
                date_of_birth = date_from_str(ur.dob)
            except Exception as e:
                # Log-and-skip boundary: a bad birthdate invalidates the row.
                message_stream_write(u'Line {}: Invalid birthdate "{}" ({}) {}\n'.format(i, ur.dob, ur, e))
                continue

            attributes = {
                'license_code':  ur.license,
                'last_name':     ur.last_name,
                'first_name':    ur.first_name,
                'gender':        gender_from_str(ur.sex),
                'date_of_birth': date_of_birth,
                'state_prov':    'Ontario',
                'nationality':   'Canada',
                'uci_code':      ur.uci_code,
            }
            # Only Canadian UCI codes imply Canadian nationality.
            if attributes['uci_code'][:3] != 'CAN':
                attributes['nationality'] = ''
            # 'city' is optional: a missing namedtuple field raises AttributeError.
            try:
                attributes['city'] = ur.city
            except AttributeError:
                pass

            try:
                lh = LicenseHolder.objects.get(license_code=ur.license)
                if set_attributes(lh, attributes):
                    lh.save()
            except LicenseHolder.DoesNotExist:
                lh = LicenseHolder(**attributes)
                lh.save()

            message_stream_write(u'{:>6}: {:>8} {:>9} {:>10} {}, {}, ({})\n'.format(
                    i, lh.license_code, lh.uci_code, lh.date_of_birth.strftime('%Y/%m/%d'), lh.last_name, lh.first_name, lh.city
                )
            )

            # Rebuild team hints from scratch for this license holder.
            team_name = ur.club or ur.trade_team
            TeamHint.objects.filter(license_holder=lh).delete()
            if team_name:
                team_names = [t.strip() for t in team_name.split(',') if t.strip()]
                for count, team_name in enumerate(team_names):
                    team = Team.objects.get_or_create(name=team_name)[0]
                    # Only the last-listed team generates discipline hints.
                    if count == len(team_names) - 1:
                        for discipline_name, discipline in discipline_id.items():
                            for col_name in discipline_cols[discipline_name]:
                                if getattr(ur, col_name, None):
                                    TeamHint(license_holder=lh, team=team, discipline=discipline, effective_date=effective_date).save()
                                    break

    ur_records = []
    with io.open(fname, 'r', encoding='utf-8', errors='replace') as fp:
        oca_reader = csv.reader(fp)
        for i, row in enumerate(oca_reader):
            if i == 0:
                # Get the header fields from the first row.
                fields = utils.getHeaderFields([unescape(v.strip()) for v in row])
                message_stream_write(u'Recognized Header Fields:\n')
                message_stream_write(u'----------------------------\n')
                message_stream_write(u'\n'.join(fields) + u'\n')
                message_stream_write(u'----------------------------\n')

                oca_record = namedtuple('oca_record', fields)
                continue

            ur = oca_record(*[unescape(v.strip()) for v in row])

            ur_records.append((i, ur))
            # Flush in batches of 3000 to bound transaction size.
            if len(ur_records) == 3000:
                process_ur_records(ur_records)
                ur_records = []

    process_ur_records(ur_records)

    message_stream_write('Initialization in: {}\n'.format(datetime.datetime.now() - tstart))
# Example #47 (0)
    async def run(self, bot: Bot, event: Event):
        """Dispatch *event* to the matching sub-command handler.

        Splits the event text (or the attached message's text) into a
        root call and its arguments; when the root call equals this
        command's prefixed name, resolves a handler from
        ``self.route_list`` (falling back to ``self.fallback``), parses
        its options/arguments and awaits it. Returns True to let other
        handlers keep running, False after reporting a parse error.
        """
        if not isinstance(event, Message):
            return True

        root_args = ''
        root_call = ''
        args = ''
        handler = None
        # Prefer the event's own text; fall back to the attached message.
        if event.text:
            try:
                root_call, root_args = SPACE_RE.split(event.text, 1)
            except ValueError:
                # No whitespace: the whole text is the call, no arguments.
                root_call = event.text
        elif event.message and event.message.text:
            try:
                root_call, root_args = SPACE_RE.split(event.message.text, 1)
            except ValueError:
                root_call = event.message.text

        if root_call == bot.config.PREFIX + self.name:
            for c in self.route_list:
                if c.subtype == event.subtype or c.subtype == '*':
                    # NOTE(review): root_args is initialized to '' above and is
                    # never None, so this branch looks unreachable; the intent
                    # was presumably "no arguments were given" — confirm.
                    if root_args is None:
                        if c.name is None:
                            handler = c.handler
                            break
                    else:
                        try:
                            call, args = SPACE_RE.split(root_args, 1)
                        except ValueError:
                            call = root_args

                        if c.name == call:
                            handler = c.handler
                            break
            else:
                # for-else: no route matched, use the fallback handler.
                handler = Handler(self.fallback)

        if handler:
            raw = html.unescape(args)
            func_params = handler.params
            try:
                chunks = split_chunks(raw, self.use_shlex)
            except ValueError:
                await bot.say(event.channel,
                              '*Error*: Can not parse this command')
                return False

            try:
                kw, remain_chunks = parse_option_and_arguments(
                    handler,
                    chunks,
                )
            except SyntaxError as e:
                await bot.say(event.channel, '*Error*\n{}'.format(e))
                return False
            with self.prepare_kwargs(
                    bot=bot,
                    event=event,
                    func_params=func_params,
                    **kw,
            ) as kwargs:
                return await handler(**kwargs)
        return True
# Example #48 (0)
def basic_clean(text):
    """Repair mojibake with ftfy, decode HTML entities twice (handles
    double-escaped input such as '&amp;amp;'), and strip surrounding
    whitespace."""
    fixed = ftfy.fix_text(text)
    unescaped = html.unescape(html.unescape(fixed))
    return unescaped.strip()
# Example #49 (0)
    def next_question(self):
        """Advance to the next quiz question and return it as 'Q.<n>: <text> '."""
        question = self.question_list[self.question_number]
        self.current_question = question
        self.question_number += 1
        text = html.unescape(question.text)
        return f"Q.{self.question_number}: {text} "
    def test_views_lti_development_post_bypass_lti_instructor_no_video(self):
        """When bypassing LTI, the "example.com" consumer site is automatically created."""
        # Instructor launch for a video that does not exist yet.
        data = {
            "resource_link_id": "example.com-123",
            "context_id": "course-v1:ufr+mathematics+00001",
            "roles": "instructor",
            "tool_consumer_instance_guid": "example.com",
            "user_id": "56255f3807599c377bf0e5bf072359fd",
        }
        response = self.client.post(
            "/lti/videos/{!s}".format(uuid.uuid4()),
            data,
            HTTP_REFERER="https://example.com",
        )
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, "<html>")
        content = response.content.decode("utf-8")
        # The frontend context is serialized into a data attribute of this div.
        match = re.search(
            '<div id="marsha-frontend-data" data-context="(.*)">', content
        )

        context = json.loads(unescape(match.group(1)))
        jwt_token = AccessToken(context.get("jwt"))
        # A single video must have been created on the fly by the launch.
        video = Video.objects.get()
        self.assertEqual(jwt_token.payload["resource_id"], str(video.id))
        self.assertEqual(jwt_token.payload["user_id"], data["user_id"])
        self.assertEqual(jwt_token.payload["context_id"], data["context_id"])
        self.assertEqual(jwt_token.payload["roles"], [data["roles"]])
        self.assertEqual(jwt_token.payload["locale"], "en_US")
        # Course attributes are parsed out of the context_id.
        self.assertDictEqual(
            jwt_token.payload["course"],
            {"school_name": "ufr", "course_name": "mathematics", "course_run": "00001"},
        )
        self.assertEqual(context.get("state"), "success")

        self.assertEqual(
            context.get("resource"),
            {
                "active_stamp": None,
                "is_ready_to_show": False,
                "show_download": True,
                "description": video.description,
                "id": str(video.id),
                "upload_state": "pending",
                "timed_text_tracks": [],
                "thumbnail": None,
                "title": video.title,
                "urls": None,
                "should_use_subtitle_as_transcript": False,
                "has_transcript": False,
                "playlist": {
                    "title": "course-v1:ufr+mathematics+00001",
                    "lti_id": "course-v1:ufr+mathematics+00001",
                },
                "live_state": None,
                "live_info": {},
            },
        )
        self.assertEqual(context.get("modelName"), "videos")
        # The consumer site was created with a name and a domain name
        ConsumerSite.objects.get(name="example.com", domain="example.com")
# Example #51 (0)
def get_article(article_url, article_i, total):
    """Download one blog article and scrape its title, date, images and videos.

    *article_i*/*total* only feed the "(i/total)" progress messages written
    to stderr. Returns a feed-style dict with a single entry, or None when
    the download fails or no known article container is found in the page.
    """
    article_url = normurl(article_url)

    myjson = {"entries": []}

    basetext = "(%i/%i) " % (article_i, total)
    article_i += 1

    sys.stderr.write(basetext + "Downloading %s... " % article_url)
    sys.stderr.flush()
    try:
        data = download(article_url)
    except Exception:
        # Best-effort scraper: report and skip articles that fail to download.
        sys.stderr.write("failed!\n")
        sys.stderr.flush()
        return

    sys.stderr.write("\r" + basetext + "Processing  %s... " % article_url)
    sys.stderr.flush()

    soup = bs4.BeautifulSoup(data, 'lxml')

    # Structured metadata (headline, publish date) lives in the ld+json blob.
    jsondata = soup.find(attrs={"type": "application/ld+json"}).text
    jsondecode = rssit.util.json_loads(jsondata)

    sitetitle = html.unescape(
        soup.find("meta", attrs={"property": "og:site_name"})["content"])
    myjson["title"] = sitetitle
    myjson["author"] = sitetitle
    author = sitetitle
    title = html.unescape(jsondecode["headline"])
    date = parse(jsondecode["datePublished"])
    # Album name format: "[YYMMDD] <title>".
    album = "[" + str(date.year)[-2:] + str(date.month).zfill(2) + str(
        date.day).zfill(2) + "] " + title

    # Known CSS selectors for the article body across blog themes;
    # the first one that matches wins.
    article_selectors = [
        ".entry .article", ".article_post", "#content", "#mArticle",
        "#article", "#entire > #contents > .post", "#main .article-desc",
        "section > article"
    ]

    for selector in article_selectors:
        articletag = soup.select(selector)
        if articletag and len(articletag) > 0:
            articletag = articletag[0]
            break

    if not articletag:
        sys.stderr.write("failed!\n")
        sys.stderr.flush()
        return

    images = []
    videos = []

    # Tistory CDN image/attachment URLs embedded anywhere in the raw markup.
    articlestr = str(articletag)
    re_images = re.findall(
        "(https?://cfile\d+\.uf\.tistory\.com/(image|attach|original)/\w+)",
        articlestr)

    for image in re_images:
        url = get_full_image(article_url, image[0])

        if url not in images:
            images.append(url)

    lightboxes = articletag.findAll(attrs={"data-lightbox": True})

    for lightbox in lightboxes:
        image = get_full_image(article_url, lightbox["data-url"])
        if image not in images:
            images.append(image)
        #images.append(re.sub("/image/", "/original/", lightbox["data-url"]))

    #imageblocks = articletag.select(".imageblock img")
    imageblocks = articletag.select("p img, .imageblock img")

    for image in imageblocks:
        # NOTE(review): on a bs4 Tag, `in` tests children, not attributes,
        # so this condition likely never fires as intended (image.attrs?).
        if "onclick" in image:
            url = re.sub("^open_img\(['\"](.*)['\"]\)$", "\\1",
                         image["onclick"])

        # NOTE(review): this unconditionally overwrites the onclick-derived
        # URL computed just above, making that branch dead code — confirm
        # whether "src" should only be used when "onclick" is absent.
        url = image["src"]

        url = get_full_image(article_url, url)

        if url not in images:
            images.append(url)

    videotags = articletag.select("video")

    for video in videotags:
        if video.has_attr("src"):
            url = video["src"]
        else:
            # Fall back to the first <source> child when <video> has no src.
            sources = video.select("source")
            if len(sources) > 0:
                url = sources[0]["src"]
            else:
                continue

        videos.append({
            "image": None,
            "video": get_full_image(article_url, url)
        })

    myjson["entries"].append({
        "caption": title,
        "url": article_url,
        "album": album,
        "date": date,
        "author": author,
        "images": images,
        "videos": videos
    })

    sys.stderr.write("done\n")
    sys.stderr.flush()

    return myjson
    def test_views_lti_development_post_bypass_lti_instructor(self):
        """In development, passport creation and LTI verif can be bypassed for an instructor."""
        # Pre-create the video so the launch resolves to an existing resource.
        video = VideoFactory(
            playlist__consumer_site__domain="example.com",
            playlist__title="foo bar",
            playlist__lti_id="course-v1:ufr+mathematics+00001",
        )
        data = {
            "resource_link_id": video.lti_id,
            "context_id": video.playlist.lti_id,
            "roles": "instructor",
            "tool_consumer_instance_guid": "example.com",
            "context_title": "mathematics",
            "tool_consumer_instance_name": "ufr",
            "user_id": "56255f3807599c377bf0e5bf072359fd",
        }
        response = self.client.post(
            "/lti/videos/{!s}".format(video.pk),
            data,
            HTTP_REFERER="https://example.com",
        )
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, "<html>")
        content = response.content.decode("utf-8")

        # The frontend context is serialized into a data attribute of this div.
        match = re.search(
            '<div id="marsha-frontend-data" data-context="(.*)">', content
        )

        context = json.loads(unescape(match.group(1)))
        jwt_token = AccessToken(context.get("jwt"))
        self.assertEqual(jwt_token.payload["resource_id"], str(video.id))
        self.assertEqual(jwt_token.payload["user_id"], data["user_id"])
        self.assertEqual(jwt_token.payload["context_id"], data["context_id"])
        self.assertEqual(jwt_token.payload["roles"], [data["roles"]])
        self.assertEqual(jwt_token.payload["locale"], "en_US")
        # Instructors get full dashboard/update permissions.
        self.assertEqual(
            jwt_token.payload["permissions"],
            {"can_access_dashboard": True, "can_update": True},
        )
        self.assertDictEqual(
            jwt_token.payload["course"],
            {"school_name": "ufr", "course_name": "mathematics", "course_run": "00001"},
        )
        self.assertEqual(context.get("state"), "success")
        self.assertEqual(
            context.get("resource"),
            {
                "active_stamp": None,
                "is_ready_to_show": False,
                "show_download": True,
                "description": video.description,
                "id": str(video.id),
                "upload_state": "pending",
                "timed_text_tracks": [],
                "thumbnail": None,
                "title": video.title,
                "urls": None,
                "should_use_subtitle_as_transcript": False,
                "has_transcript": False,
                "playlist": {
                    "title": "foo bar",
                    "lti_id": "course-v1:ufr+mathematics+00001",
                },
                "live_state": None,
                "live_info": {},
            },
        )
        self.assertEqual(context.get("modelName"), "videos")
# Example #53 (0)
                                       re.sub(r" *\n *", "\n", re.sub(r" +", " ", re.sub(r"\r", "", plaintext))))

                if len(plaintext) > 0:
                    # Guessing MIME of the file (checked on original content)
                    mime = magic.from_buffer(text, mime=True)
                    mimeFile.write(mime.encode() + b"\n")

                    urlFile.write(url.encode() + b"\n")
                    langFile.write(lang.encode() + b"\n")
                    encodingFile.write(orig_encoding.encode() + b"\n")

                    b64norm = base64.b64encode(cleantree.encode())
                    normHtmlFile.write(b64norm + b"\n")

                    if options.boilerpipe:
                        b64deboil = base64.b64encode(deboiled.encode())
                        deboilFile.write(b64deboil + b"\n")

                    b64text = base64.b64encode(html.unescape(plaintext).encode())
                    plainTextFile.write(b64text + b"\n")

# Release every per-field output stream opened for this run, in the same
# order they were written above.
for _handle in (urlFile, langFile, encodingFile, mimeFile,
                normHtmlFile, plainTextFile):
    _handle.close()
# Boilerpipe cleaning is optional
if options.boilerpipe:
    deboilFile.close()
    def sanitize(self):
        """Clean up and derive catalog fields after import.

        Runs the base-class sanitization, then:
        - adds an 'AT' alias for SN-named entries discovered in/after 2016,
        - prunes placeholder claimed types and prioritizes the rest,
        - resolves source bibcodes to author strings via the ADS service
          (cached in ``self.catalog.bibauthor_dict``),
        - orders redshift/velocity quantities by frame priority,
        - renumbers sources reverse-chronologically and rewrites the source
          aliases referenced by every quantity.
        """
        super(Cataclysmic, self).sanitize()

        # Calculate some columns based on imported data, sanitize some fields
        name = self[self._KEYS.NAME]
        aliases = self.get_aliases()

        # SN-named events from 2016 onward also get an 'AT' (astronomical
        # transient) alias when none is present yet.
        if ((name.startswith('SN') and is_number(name[2:6]) and
             self._KEYS.DISCOVER_DATE in self and
             int(self[self._KEYS.DISCOVER_DATE][0][QUANTITY.VALUE].split('/')[
                 0]) >= 2016 and not any(['AT' in x for x in aliases]))):
            source = self.add_self_source()
            self.add_quantity(self._KEYS.ALIAS, 'AT' + name[2:], source)

        if self._KEYS.CLAIMED_TYPE in self:
            # FIX: this is something that should be done completely internally
            #      i.e. add it to `clean` or something??
            self[self._KEYS.CLAIMED_TYPE] = self.ct_list_prioritized()
        if self._KEYS.CLAIMED_TYPE in self:
            # Drop placeholder types ('?' / '-'); remove the key entirely if
            # nothing survives.
            self[self._KEYS.CLAIMED_TYPE][:] = [
                ct for ct in self[self._KEYS.CLAIMED_TYPE]
                if (ct[QUANTITY.VALUE] != '?' and ct[QUANTITY.VALUE] != '-')
            ]
            if not len(self[self._KEYS.CLAIMED_TYPE]):
                del (self[self._KEYS.CLAIMED_TYPE])
        # 'AT' entries left without a claimed type default to 'Candidate'.
        if self._KEYS.CLAIMED_TYPE not in self and name.startswith('AT'):
            source = self.add_self_source()
            self.add_quantity(self._KEYS.CLAIMED_TYPE, 'Candidate', source)

        if self._KEYS.SOURCES in self:
            # Hoisted out of the per-source loop; importing the submodules
            # explicitly guarantees `urllib.parse`/`urllib.request` exist
            # (bare `import urllib` only worked if they were loaded earlier).
            import urllib.parse
            import urllib.request
            from html import unescape
            for source in self[self._KEYS.SOURCES]:
                if SOURCE.BIBCODE in source:
                    # First sanitize the bibcode
                    if len(source[SOURCE.BIBCODE]) != 19:
                        source[SOURCE.BIBCODE] = urllib.parse.unquote(
                            unescape(source[SOURCE.BIBCODE])).replace('A.A.',
                                                                      'A&A')
                    if source[SOURCE.BIBCODE] in self.catalog.biberror_dict:
                        source[SOURCE.BIBCODE] = \
                            self.catalog.biberror_dict[source[SOURCE.BIBCODE]]

                    if (source[SOURCE.BIBCODE] not in
                            self.catalog.bibauthor_dict):
                        bibcode = source[SOURCE.BIBCODE]
                        adsquery = (self.catalog.ADS_BIB_URL +
                                    urllib.parse.quote(bibcode) +
                                    '&data_type=Custom&format=%253m%20%25(y)')
                        bibcodeauthor = ''
                        try:
                            response = urllib.request.urlopen(adsquery)
                            html = response.read().decode('utf-8')
                            hsplit = html.split("\n")
                            if len(hsplit) > 5:
                                bibcodeauthor = hsplit[5]
                        except Exception:
                            # Best-effort lookup: network/decoding failures
                            # fall through to the empty-author warning below.
                            # (Was a bare `except:`, which also swallowed
                            # KeyboardInterrupt/SystemExit.)
                            pass

                        if not bibcodeauthor:
                            warnings.warn(
                                "Bibcode didn't return authors, not converting"
                                "this bibcode.")

                        # Cache the result (possibly empty) so ADS is queried
                        # at most once per bibcode.
                        self.catalog.bibauthor_dict[bibcode] = unescape(
                            bibcodeauthor).strip()

            # Second pass: apply cached author strings, and fall back to the
            # bibcode itself as the source name.
            for source in self[self._KEYS.SOURCES]:
                if (SOURCE.BIBCODE in source and
                        source[SOURCE.BIBCODE] in self.catalog.bibauthor_dict
                        and
                        self.catalog.bibauthor_dict[source[SOURCE.BIBCODE]]):
                    source[SOURCE.REFERENCE] = self.catalog.bibauthor_dict[
                        source[SOURCE.BIBCODE]]
                if (SOURCE.NAME not in source and SOURCE.BIBCODE in source and
                        source[SOURCE.BIBCODE]):
                    source[SOURCE.NAME] = source[SOURCE.BIBCODE]

        # Order frame-dependent quantities by preferred reference frame.
        if self._KEYS.REDSHIFT in self:
            self[self._KEYS.REDSHIFT] = list(
                sorted(
                    self[self._KEYS.REDSHIFT],
                    key=lambda q: frame_priority(q, self._KEYS.REDSHIFT)))

        if self._KEYS.VELOCITY in self:
            self[self._KEYS.VELOCITY] = list(
                sorted(
                    self[self._KEYS.VELOCITY],
                    key=lambda q: frame_priority(q, self._KEYS.VELOCITY)))

        if self._KEYS.CLAIMED_TYPE in self:
            self[self._KEYS.CLAIMED_TYPE] = self.ct_list_prioritized()

        # Renumber and reorder sources
        if self._KEYS.SOURCES in self:
            # Sort sources reverse-chronologically
            self[self._KEYS.SOURCES] = sorted(
                self[self._KEYS.SOURCES], key=lambda x: bib_priority(x))

            # Assign new aliases to match new order
            source_reps = OrderedDict(
                [[x[SOURCE.ALIAS], str(i + 1)]
                 for i, x in enumerate(self[self._KEYS.SOURCES])])
            for i, source in enumerate(self[self._KEYS.SOURCES]):
                self[self._KEYS.SOURCES][i][SOURCE.ALIAS] = source_reps[source[
                    SOURCE.ALIAS]]

            # Change sources to match new aliases
            for key in self.keys():
                if self._KEYS.get_key_by_name(key).no_source:
                    continue
                for item in self[key]:
                    aliases = [
                        str(y)
                        for y in sorted(
                            int(source_reps[x])
                            for x in item[item._KEYS.SOURCE].split(','))
                    ]
                    item[item._KEYS.SOURCE] = ','.join(aliases)
Example #55
0
def get_presentes(pk, response, materia):
    """Populate ``response`` with attendance/voting data for session ``pk``.

    ``materia``'s type selects which presence record set is used
    (order-of-the-day vs. whole-session); when ``materia`` is truthy its
    voting metadata is also merged in. Returns the updated ``response``.
    """
    # Exact type check kept from the original: only a plain OrdemDia uses
    # the per-order presence records.
    if type(materia) == OrdemDia:
        presentes = PresencaOrdemDia.objects.filter(sessao_plenaria_id=pk)
    else:
        presentes = SessaoPlenariaPresenca.objects.filter(
            sessao_plenaria_id=pk)

    sessao = SessaoPlenaria.objects.get(id=pk)
    num_presentes = len(presentes)
    data_sessao = sessao.data_inicio
    oradores = OradorExpediente.objects.filter(
        sessao_plenaria_id=pk).order_by('numero_ordem')

    oradores_list = [{
        'nome': o.parlamentar.nome_parlamentar,
        'numero': o.numero_ordem
    } for o in oradores]

    presentes_list = []
    for p in presentes:
        legislatura = sessao.legislatura
        # Fetch this parliamentarian's mandates for the session's legislature.
        mandatos = p.parlamentar.mandato_set.filter(legislatura=legislatura)

        if p.parlamentar.ativo and mandatos:
            filiacao = filiacao_data(p.parlamentar, data_sessao, data_sessao)
            partido = filiacao if filiacao else 'Sem Registro'

            presentes_list.append({
                'id': p.id,
                'parlamentar_id': p.parlamentar.id,
                'nome': p.parlamentar.nome_parlamentar,
                'partido': partido,
                'voto': ''
            })
        else:
            # Inactive or mandate-less members don't count toward quorum.
            # (The original `elif` condition was exactly the negation of the
            # `if`, so a plain `else` is equivalent.)
            num_presentes -= 1

    if materia:
        if materia.tipo_votacao == 1:
            tipo_votacao = 'Simbólica'
        elif materia.tipo_votacao == 2:
            tipo_votacao = 'Nominal'
        elif materia.tipo_votacao == 3:
            tipo_votacao = 'Secreta'
        else:
            # BUG FIX: tipo_votacao was previously unbound (NameError) for
            # any value outside 1-3; fall back to an empty string.
            tipo_votacao = ''

        response.update({
            'tipo_resultado': materia.resultado,
            'observacao_materia': html.unescape(materia.observacao),
            'tipo_votacao': tipo_votacao,
            'materia_legislativa_texto': str(materia.materia)
        })

    presentes_list = sort_lista_chave(presentes_list, 'nome')

    response.update({
        'presentes': presentes_list,
        'num_presentes': num_presentes,
        'oradores': oradores_list,
        'msg_painel': str(_('Votação aberta!')),
    })

    return response
Example #56
0
    else:
        data['uselang'] = 'zh-cn'
    r = requests.post('https://zh.wikipedia.org/w/api.php',
                      data=data,
                      headers=headers)
    try:
        result = r.json()
    except Exception as e:
        print(e)
        print(r.text)
        continue
    result = result['parse']['text']['*']
    matches = re.findall(r'<div id="text(\d+)">(.+?)</div>', result)
    for match in matches:
        idx = int(match[0])
        newtext = html.unescape(match[1]).replace('\\n', '\\\\n')
        # print(idx, newtext)
        if args.mode == 1:
            newregex = r'\g<1>\g<2>\g<3>{}\g<5>'.format(newtext)
        else:
            newregex = r'\g<1>{}\g<3>\g<4>\g<5>'.format(newtext)
        jstext = re.sub(
            r"(wgULS\(')({})(',[\s\n]*?')({})('\))".format(
                re.escape(messages[idx][0]), re.escape(messages[idx][1])),
            newregex,
            jstext,
        )

    jstext = re.sub(r"wgULS\('(.+?)',[\s\n]*?'\1'\)", r"'\1'", jstext)

    with open(full_filename, 'w', encoding='utf8') as f:
Example #57
0
    def print_info(self, req, req_body, res, res_body):
        """Log a human-readable dump of an HTTP request/response pair.

        Emits, via ``self.logger.trace``: the request line and headers,
        query parameters, cookies, decoded Basic-Auth credentials, the
        request body (pretty-printed when JSON), response headers,
        Set-Cookie values, the HTML page title, and a size-capped response
        body (hexdumped when not decodable). No-op unless
        ``options['debug']`` is set.
        """
        def _parse_qsl(s):
            # One "key                 value" pair per line for readability.
            return '\n'.join("%-20s %s" % (k, v)
                             for k, v in parse_qsl(s, keep_blank_values=True))

        if not options['debug']:
            return

        req_header_text = "%s %s %s\n%s" % (req.method, req.uri,
                                            self.request_version, req.headers)

        if res is not None:
            reshdrs = res.headers

            # Dict-like header containers are re-rendered manually so the
            # proxy's own metadata headers can be filtered out.
            if type(reshdrs) == dict or 'CaseInsensitiveDict' in str(
                    type(reshdrs)):
                reshdrs = ''
                for k, v in res.headers.items():
                    if k in plugins.IProxyPlugin.proxy2_metadata_headers.values(
                    ):
                        continue
                    if k.lower().startswith('x-proxy2-'): continue
                    reshdrs += '{}: {}\n'.format(k, v)

            res_header_text = "%s %d %s\n%s" % (
                res.response_version, res.status, res.reason, reshdrs)

        self.logger.trace("==== REQUEST ====\n%s" % req_header_text,
                          color=ProxyLogger.colors_map['yellow'])

        u = urlparse(req.uri)
        if u.query:
            query_text = _parse_qsl(u.query)
            self.logger.trace("==== QUERY PARAMETERS ====\n%s\n" % query_text,
                              color=ProxyLogger.colors_map['green'])

        cookie = req.headers.get('Cookie', '')
        if cookie:
            # Cookies share the key=value;... shape, so reuse the query
            # parser after swapping ';' separators for '&'.
            cookie = _parse_qsl(re.sub(r';\s*', '&', cookie))
            self.logger.trace("==== COOKIES ====\n%s\n" % cookie,
                              color=ProxyLogger.colors_map['green'])

        auth = req.headers.get('Authorization', '')
        if auth.lower().startswith('basic'):
            import base64
            # BUG FIX: `str.decode('base64')` is Python 2 only and raised
            # AttributeError under Python 3; decode the credential with the
            # base64 module instead (errors='replace' keeps this best-effort).
            token = base64.b64decode(auth.split()[1]).decode(
                'utf-8', errors='replace')
            self.logger.trace("==== BASIC AUTH ====\n%s\n" % token,
                              color=ProxyLogger.colors_map['red'])

        if req_body is not None:
            req_body_text = None
            content_type = req.headers.get('Content-Type', '')

            if content_type.startswith('application/x-www-form-urlencoded'):
                req_body_text = _parse_qsl(req_body)
            elif content_type.startswith('application/json'):
                try:
                    json_obj = json.loads(req_body)
                    json_str = json.dumps(json_obj, indent=2)
                    # Cap pretty-printed JSON at 50 lines.
                    if json_str.count('\n') < 50:
                        req_body_text = json_str
                    else:
                        lines = json_str.splitlines()
                        req_body_text = "%s\n(%d lines)" % ('\n'.join(
                            lines[:50]), len(lines))
                except ValueError:
                    # Not actually JSON despite the Content-Type; log raw.
                    req_body_text = req_body
            elif len(req_body) < 1024:
                req_body_text = req_body

            if req_body_text:
                self.logger.trace("==== REQUEST BODY ====\n%s\n" %
                                  req_body_text.strip(),
                                  color=ProxyLogger.colors_map['white'])

        if res is not None:
            self.logger.trace("\n==== RESPONSE ====\n%s" % res_header_text,
                              color=ProxyLogger.colors_map['cyan'])

            cookies = res.headers.get('Set-Cookie')
            if cookies:
                if type(cookies) == list or type(cookies) == tuple:
                    cookies = '\n'.join(cookies)

                self.logger.trace("==== SET-COOKIE ====\n%s\n" % cookies,
                                  color=ProxyLogger.colors_map['yellow'])

        if res_body is not None:
            res_body_text = res_body
            content_type = res.headers.get('Content-Type', '')

            if content_type.startswith('application/json'):
                try:
                    json_obj = json.loads(res_body)
                    json_str = json.dumps(json_obj, indent=2)
                    if json_str.count('\n') < 50:
                        res_body_text = json_str
                    else:
                        lines = json_str.splitlines()
                        res_body_text = "%s\n(%d lines)" % ('\n'.join(
                            lines[:50]), len(lines))
                except ValueError:
                    res_body_text = res_body
            elif content_type.startswith('text/html'):
                if type(res_body) == str: res_body = str.encode(res_body)
                m = re.search(r'<title[^>]*>\s*([^<]+?)\s*</title>',
                              res_body.decode(errors='ignore'), re.I)
                if m:
                    self.logger.trace("==== HTML TITLE ====\n%s\n" %
                                      html.unescape(m.group(1)),
                                      color=ProxyLogger.colors_map['cyan'])
            elif content_type.startswith('text/') and len(res_body) < 1024:
                res_body_text = res_body

            if res_body_text:
                res_body_text2 = ''
                maxchars = 4096
                halfmax = int(maxchars / 2)
                try:
                    if type(res_body_text) == bytes:
                        dec = res_body_text.decode()
                    else:
                        dec = res_body_text

                    # Show head and tail of oversized bodies.
                    if dec != None and len(dec) > maxchars:
                        res_body_text2 = dec[:halfmax] + ' <<< ... >>> ' + dec[
                            -halfmax:]
                    else:
                        res_body_text2 = dec

                except UnicodeDecodeError:
                    # Binary body: fall back to a (possibly truncated) hexdump.
                    if len(res_body_text) > maxchars:
                        res_body_text2 = hexdump(list(res_body_text[:halfmax]))
                        res_body_text2 += '\n\t................\n'
                        res_body_text2 += hexdump(
                            list(res_body_text[-halfmax:]))
                    else:
                        res_body_text2 = hexdump(list(res_body_text))

                self.logger.trace("==== RESPONSE BODY ====\n%s\n" %
                                  res_body_text2,
                                  color=ProxyLogger.colors_map['green'])
        self.close()

    def handle_starttag(self, tag, attrs):
        # Arm the capture flag when we hit the <script id="last-search-results">
        # element (tag and id compared case-insensitively).
        if tag.lower() != 'script':
            return
        attr_map = dict(attrs)
        if attr_map.get('id', '').lower() == 'last-search-results':
            self.json = True

    def handle_data(self, data):
        # Once the start-tag handler has armed the flag, the next text node
        # replaces the flag with the captured payload.
        if self.json is not True:
            return
        self.json = data


try:
    from html import unescape
except ImportError:
    # html.unescape unavailable (pre-3.4 interpreters): keep the parser's
    # inherited unescape behavior.
    pass
else:
    # Route entity unescaping through the stdlib implementation.
    PasskeyParser.unescape = lambda self, text: unescape(text)


def type_day(arg):
    """argparse type: a 'YYYY-MM-DD' string within the housing-block window.

    Returns the original string on success; raises ArgumentTypeError when
    the string is not a valid date or falls outside [firstDay, lastDay].
    """
    try:
        parsed = datetime.strptime(arg, '%Y-%m-%d')
    except ValueError:
        raise ArgumentTypeError("%s is not a date in the form YYYY-MM-DD" %
                                arg)
    if firstDay <= parsed <= lastDay:
        return arg
    raise ArgumentTypeError(
        "%s is outside the Gencon housing block window" % arg)
Example #59
0
        file_success = False

        with open(fname_in, 'r', encoding='utf8') as fin, open(fname_out,
                                                               'w') as fout:
            reader = csv.DictReader(fin)
            next(reader, None)  # skip the headers
            writer = csv.writer(fout)

            try:
                logger.info(
                    "File successfully pulled from Acalog remote service. Generate CSV file of outcomes."
                )
                for row in reader:
                    writer.writerow(
                        (row["Prefix"] +
                         html.unescape(row["Common Course Identifier"]),
                         row["Code"], row["Catalog Name"],
                         row["Course Outcomes"]))

                file_success = True
            except csv.Error as e:
                logger.exception("Error reading or writing courses file. " +
                                 str(e))

        # if we successfully generated the file, move it to its remote destination
        if file_success and os.path.getsize(fname_out) > 9000:
            logger.info(
                "Course outcomes file successfully generated so let's move it to the remote location."
            )
            put_file_smb(fname_out)
        else:
Example #60
0
def loadLocalsFromWebsite(url):
    """Scrape business/locale entries from the page at *url*.

    Downloads the page, walks every ``#content > div.grid-container``
    category, caches each entry's image under ``imagesFolder`` (skipping
    files already on disk), and returns a list of entry dicts with keys:
    image, title, link, sub-categories, districts, category, cleanTitle,
    description.

    Relies on module-level helpers/globals: ``pq`` (pyquery),
    ``imagesFolder`` and ``saveImage`` — TODO confirm their definitions
    outside this view.
    """

    print('Loading', url, '...')

    with urllib.request.urlopen(url) as response:
        content = response.read()
        # Parse the raw HTML with pyquery; `d` acts as a jQuery-like selector.
        d = pq(content)

        entries = []
        categories = d('#content > div.grid-container')
        availableDistricts = {}
        os.makedirs(imagesFolder, exist_ok=True)
        # Filenames already present locally — used to skip re-downloading.
        imagesCached = [
            f for f in os.listdir(imagesFolder)
            if os.path.isfile(os.path.join(imagesFolder, f))
        ]

        print('Found', len(categories), 'categories.\nAlready fetched',
              len(imagesCached), 'images.')

        for category in categories:

            elements = d('.cat-items .post.item', category)
            availableSubCategories = {}
            categoryTitle = d('h2:first', category).text()

            # Map each filter value to its human-readable sub-category label.
            for subCategoryItem in d('.options a.option.category', category):
                subCategoryItem = d(subCategoryItem)
                availableSubCategories[subCategoryItem.attr(
                    'data-filter-value')] = subCategoryItem.text()

            # District filters are accumulated across ALL categories.
            for tagFilterItem in d('.options a.option.tag', category):
                tagFilterItem = d(tagFilterItem)
                availableDistricts[tagFilterItem.attr(
                    'data-filter-value')] = tagFilterItem.text()

            print('Found', len(elements), 'elements in category',
                  categoryTitle, '\nSub.categories:\n -',
                  '\n - '.join(availableSubCategories.values()), '\n')

            for element in elements:

                # CSS classes encode the entry's sub-categories and districts;
                # collect them as a set for intersection below.
                classes = set()
                for cssClass in d(element).attr('class').split(' '):
                    cssClass = cssClass.strip()
                    if cssClass != '':
                        classes.add(cssClass)

                image = d('img', element)
                imagePath = image.attr('src')
                imageFilename = ntpath.basename(imagePath)

                # Fix for multi-char umlaut.
                imageFilename = imageFilename.replace('ö', 'oe')

                # Download only images not already cached on disk.
                if imageFilename not in imagesCached:
                    saveImage(imagePath, imageFilename)
                    imagesCached.append(imageFilename)

                link = d('a:last', element)

                # Resolve filter keys present on this element to labels.
                subCategories = []
                for subCategoryKey in classes & availableSubCategories.keys():
                    subCategories.append(
                        availableSubCategories[subCategoryKey])

                districts = []
                for districtKey in classes & availableDistricts.keys():
                    districts.append(availableDistricts[districtKey])

                entry = {
                    'image': imageFilename,
                    'title': link.text(),
                    'link': link.attr('href'),
                    'sub-categories': sorted(subCategories),
                    'districts': sorted(districts),
                    'category': categoryTitle,
                    # Title up to the first '/' — presumably strips a
                    # trailing address/qualifier; verify against site markup.
                    'cleanTitle': link.text().split('/ ')[0].strip(),
                    'description': html.unescape(d('p', element).text())
                }
                entries.append(entry)

        print('Parsed', len(entries), 'entries in', len(categories),
              'categories\nDistricts:\n -',
              '\n - '.join(availableDistricts.values()))
        return entries