def parse(self, response):
    # from_encoding tells BeautifulSoup how to decode the raw response bytes.
    soup = BeautifulSoup(response.body, 'lxml', from_encoding='utf-8')
    db_value = []
    for item in soup.select('#Columns')[0].select('li'):
        new_time = item.span.text
        link = item.a['href']
        title = item.a.text
        content = self.get_content(self.domain + link)
        doc = {
            'title': title,
            'content': content,
            'new_time': new_time,
            'origin': response.url,
        }
        db_value.append(doc)
    collection.insert_many(db_value)
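# Note: assigning to `soup.encoding` after parsing, as several snippets below do,
# is a no-op: BeautifulSoup decodes the markup at construction time, and setting
# an attribute on the finished soup changes nothing. A minimal sketch of the two
# supported knobs (the URL here is a placeholder, not from the source):
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://example.com')          # hypothetical URL
soup_a = BeautifulSoup(resp.content, 'lxml',
                       from_encoding='utf-8')       # decode the bytes explicitly
resp.encoding = resp.apparent_encoding              # or fix the Response first,
soup_b = BeautifulSoup(resp.text, 'lxml')           # so .text decodes correctly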
def request_category_urls(self, url):
    """Return all the book URLs from a category."""
    self.url = url
    response = requests.get(self.url)
    if response.ok:
        response.encoding = "utf-8"
        self.content = BeautifulSoup(response.text, features="html.parser")
        self.retrieve_urls_from_content(self.content)
        # Walk the "next page" links; requests.get raises MissingSchema once
        # load_next_page_url no longer yields a usable URL.
        try:
            page_content = self.content
            while True:
                next_page_html = requests.get(self.load_next_page_url(page_content))
                next_page_html.encoding = "utf-8"
                page_content = BeautifulSoup(next_page_html.text, features="html.parser")
                self.retrieve_urls_from_content(page_content)
                if not self.has_next_page(page_content):
                    break
        except MissingSchema:
            pass
    else:
        print("The request returned an error:")
        print(response.status_code)
def get_url(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv)
    # Parse the raw bytes; bs4 sniffs the encoding from the document itself.
    bf = BeautifulSoup(f.content, 'lxml')
    print(bf.text)
def data_(driver):
    time.sleep(0.5)  # give the page a moment to finish rendering
    html = driver.page_source
    data = str(pq(html))
    data = BeautifulSoup(data, "lxml")
    return data
def download_img(self):
    ls = []
    if self.start_page == self.end_page:
        html = requests.get(self.html, headers=self.headers)
        html_source = BeautifulSoup(html.content, 'lxml')
        meizi = html_source.select('.commentlist .view_img_link')
        for each_meizi in meizi:
            ls.append(each_meizi)
    else:
        for i in range(int(self.end_page) - int(self.start_page)):
            if self.start_page <= self.end_page:
                # NOTE: self.html is requested unchanged on every pass; the URL
                # would need to advance with i to actually walk the page range.
                html_source = requests.get(self.html, headers=self.headers)
                page_source = BeautifulSoup(html_source.content, 'lxml')
                meizi = page_source.select('.commentlist .view_img_link')
                for each_meizi in meizi:
                    ls.append(each_meizi)
            else:
                print('Start page is greater than the end page!')
                break
    count = 0
    meizi_all_img = len(ls)
    for meizi_picture in ls:
        count += 1
        meizi_picture = meizi_picture['href']
        download_img = requests.get('http:' + meizi_picture, headers=self.headers, stream=True)
        filename = meizi_picture.split('/')[-1]
        with open(filename, 'wb') as file:
            print('Downloading image {0}/{1}'.format(count, meizi_all_img))
            file.write(download_img.content)
def update_queue():
    global authenticated
    global queueSoup
    if not authenticated:
        if queueSoup:
            print(color.YELLOW + 'Error: Could not update queue. You are not authenticated' + color.END)
        else:
            print(color.RED + 'Warning: Could not load queue. You are not authenticated' + color.END)
        return
    if queueSoup:
        print_overridable('Updating queue...')
        resultStr = 'Queue updated'
    else:
        print_overridable('Loading queue...')
        resultStr = 'Queue loaded'
    data = {
        'session_id': cookies['sess_id'],
        'fields': 'last_watched_media,last_watched_media_playhead,most_likely_media,most_likely_media_playhead,media.media_id,media.series_id,media.name,media.episode_number,media.available_time,media.duration,media.collection_name,media.url,series,series.name'
    }
    queueSoup = BeautifulSoup(requests.get('http://api.crunchyroll.com/queue.0.xml', headers=api_headers, params=data, cookies=cookies).text, 'xml')
    if queueSoup.response.error.text == "true":
        if queueSoup.response.code.text == "bad_session":
            msg = "Your session has expired. You are no longer authenticated"
            unset_cache("session_id")
            authenticated = False
        else:
            msg = "{} ({})".format(queueSoup.response.message.text, queueSoup.response.code.text)
        print_overridable(color.RED + 'Error: Could not fetch queue. ' + msg + color.END, True)
    else:
        print_overridable(color.GREEN + resultStr + color.END, True)
def get_year_month_data(year, month, url):
    pageRequest = requests.get("https://stock.wearn.com/" + url + ".asp?Year=" + str(year) + "&month=" + str(month) + "&kind=2330")
    soup = BeautifulSoup(pageRequest.content, 'html.parser')
    return soup
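# A minimal variant of the fetch above (function name is hypothetical): requests
# can assemble and escape the query string itself via `params`, which avoids the
# manual str() concatenation.
import requests
from bs4 import BeautifulSoup

def get_year_month_data_params(year, month, url):
    pageRequest = requests.get("https://stock.wearn.com/" + url + ".asp",
                               params={'Year': year, 'month': month, 'kind': '2330'})
    return BeautifulSoup(pageRequest.content, 'html.parser')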
def get_url(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv)
    bf = BeautifulSoup(f.content, 'lxml')
    bcontent = bf.find_all('div', class_='content')
    for k in bcontent:
        print(k.text)
def get_paper(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv, verify=True)
    bf = BeautifulSoup(f.text, 'lxml')
    a1 = bf.find_all('div', class_='zm-invite-pager')
    # Re-parse the matched divs so find_all('span') only searches inside them.
    a_bf = BeautifulSoup(str(a1), 'lxml')
    a2 = a_bf.find_all('span')
    return int(a2[-2].text)
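# The str()-and-reparse round trip above works but is unnecessary: a bs4 Tag is
# itself searchable. A sketch of the same lookup done inside the original tree
# (function name is hypothetical):
import requests
from bs4 import BeautifulSoup

def get_paper_direct(url):
    f = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, verify=True)
    pager = BeautifulSoup(f.text, 'lxml').find('div', class_='zm-invite-pager')
    return int(pager.find_all('span')[-2].text)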
def commited_id(self):
    checklist_url = 'http://113.196.57.124/playerEmail/playerEmail_chklist.php'
    checklist_respone = requests.get(checklist_url, cookies=self.cookie)
    checklist_respone.encoding = 'utf-8'
    soup = BeautifulSoup(checklist_respone.text, "html.parser")
    # Second row of the table, third cell: the committed player name.
    names = soup.table.find_all('tr')[1].find_all('td')[2].text.strip()
    return names
def get_url(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv)
    bf = BeautifulSoup(f.content, 'lxml')
    no1 = bf.find_all('div', class_="list_box")
    bf2 = BeautifulSoup(str(no1), 'lxml')
    no2 = bf2.find_all('span', class_="fh_bt")
    for k in no2:
        print(k.text)
def get_price(link):
    html_doc = urllib.request.urlopen(link)
    soup = BeautifulSoup(html_doc, 'html.parser')
    parse_result = soup.find('a', class_='offers-description__link offers-description__link_subsidiary offers-description__link_nodecor')
    if parse_result is None:
        price = 'Net v nalichii'  # transliterated Russian: "not in stock"
    else:
        price = parse_result.text.strip()
    return price
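# Note on multi-class matching: find(..., class_='a b c') only matches when the
# class attribute is exactly that string in that order. A CSS selector matches
# the three classes regardless of order, which is usually what is meant.
# A sketch (function name is hypothetical):
import urllib.request
from bs4 import BeautifulSoup

def get_price_css(link):
    soup = BeautifulSoup(urllib.request.urlopen(link), 'html.parser')
    offer = soup.select_one('a.offers-description__link'
                            '.offers-description__link_subsidiary'
                            '.offers-description__link_nodecor')
    return 'Net v nalichii' if offer is None else offer.text.strip()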
def get_title(url):
    kv = {'user-agent': 'Mozilla/5.0'}
    f = requests.get(url, headers=kv, verify=True)
    bf = BeautifulSoup(f.text, 'lxml')
    h2 = bf.find_all('h2', class_='zm-item-title')
    a_bf = BeautifulSoup(str(h2), 'lxml')
    a = a_bf.find_all('a')
    for i in a:
        print(i.text)
        with open(cur_path + '\\spider_data.txt', 'a+') as out:
            out.write(i.text + '\n')
def get6vtext(url):
    respone = requests.get('http://www.6vhao.tv/')
    soup = BeautifulSoup(respone.text, 'html.parser')
    result = soup.select('body > div:nth-of-type(4) > div.tjlist > ul > li:nth-of-type(1) > a')
    infohtml = requests.get('http://www.6vhao.tv/dy6/2018-03-16/33676.html')
    infohtml.encoding = "gbk"  # set on the response, before .text decodes the body
    info_soup = BeautifulSoup(infohtml.text, 'html.parser')
    text = info_soup.find(id='text').find_all('a')
    for t in text:
        print(t['href'])
def contrib(username):
    """contributions"""
    result = []
    url = URL.format(author=username)
    if '.' in username and RE_IP.match(username):
        url = URL_IP.format(author=username)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    rows = soup.select('article table tbody tr')
    item = None
    hasdetail = False
    for i, row in enumerate(rows):
        if not item and not hasdetail:
            info = row.select('td')[0]
            document = info.select('a')[0].string
            try:
                qs = {k: ''.join(v)
                      for k, v in urllib.parse.parse_qs(
                          info.select('a')[2].attrs['href'].split('?')[-1]).items()}
            except IndexError:
                revision = 1
            else:
                try:
                    revision = int(qs['rev'])
                except KeyError:
                    revision = 1
            changes = int(info.select('span')[-1].string)
            when = row.select('td')[2].string.strip()
            item = NamuContrib(document=document, revision=revision, changes=changes, when=when)
            # Find reverts
            revert = info.select('a + i')
            if revert:
                revert = extint(revert[0].string)
                item.revert = revert
        elif item and hasdetail:
            desc = row.select('td')[0].string
            item.desc = desc
            hasdetail = False
        if 'no-line' in row.attrs.get('class', []):
            hasdetail = True
        if item and not hasdetail:
            result.append(item)
            item = None
    return result
def login():
    while True:
        response = session.get(config["URL"]["login"], headers=headers, verify=False)
        if response.status_code == requests.codes.ok:
            soup = BeautifulSoup(response.content, features='html.parser')
            token = soup.select(
                'body > div > div > div > div > div > div > form > input[type=hidden]'
            )[0].get('value')
            response = session.get(config["URL"]["captcha"], verify=False)
            with open('img.png', 'wb') as img:
                img.write(response.content)
            captcha = verifycode('img.png')
            data_login["__RequestVerificationToken"] = token
            data_login["UserName"] = config["stud_info"]["studentno"]
            data_login["Password"] = config["stud_info"]["password"]
            data_login["VerifyCode"] = captcha
            r_temp = session.post(config["URL"]["login"], data=data_login)
            if r_temp.status_code == requests.codes.ok:
                soup = BeautifulSoup(r_temp.content, features='html.parser')
                try:
                    if token != soup.select(
                        'body > div > div > div > div > div > div > form > input[type=hidden]'
                    )[0].get('value'):
                        print(datetime.datetime.now().strftime("%H:%M:%S") + " Wrong Captcha")
                except IndexError:
                    # No hidden token on the returned page: the login succeeded.
                    print("login complete")
                    break
def contrib(username):
    """contributions"""
    result = []
    url = URL.format(author=username)
    if '.' in username and RE_IP.match(username):
        url = URL_IP.format(author=username)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    rows = soup.select('article table tbody tr')
    item = None
    hasdetail = False
    for i, row in enumerate(rows):
        if not item and not hasdetail:
            info = row.select('td')[0]
            document = info.select('a')[0].string
            try:
                qs = {k: ''.join(v)
                      for k, v in urllib.parse.parse_qs(
                          info.select('a')[2].attrs['href'].split('?')[-1]).items()}
            except IndexError:
                revision = 1
            else:
                revision = int(qs['rev'])
            changes = int(info.select('span')[-1].string)
            when = row.select('td')[2].string.strip()
            item = NamuContrib(document=document, revision=revision, changes=changes, when=when)
            # Find reverts
            revert = info.select('a + i')
            if revert:
                revert = extint(revert[0].string)
                item.revert = revert
        elif item and hasdetail:
            desc = row.select('td')[0].string
            item.desc = desc
            hasdetail = False
        if 'no-line' in row.attrs.get('class', []):
            hasdetail = True
        if item and not hasdetail:
            result.append(item)
            item = None
    return result
def loadHaier(self, url):
    session = requests.Session()
    print("loadHaier url=" + url)
    haierMain = session.get(url)
    if haierMain.status_code == 200:
        soap = BeautifulSoup(haierMain.text, features="lxml")
        # The page body only returns three JS bundles
        # (polyfills.c38c86ad444630494a92.bundle.js, main.4b3d8dea306811e889d6.bundle.js,
        # http://cdkaz.rrs.com/inline.1557c7584b9dbbbbbcec.bundle.js),
        # so authenticate and fetch the menu instead of parsing this page.
        return self.authAndgetMenu(url)
        # haierUrl = soap.find('a', text=re.compile('服务处理'))['href']  # "service handling" link
        # orderMain = loadHaier(session, baseurl + haierUrl)
    else:
        return False
def crawl():
    today = date.today()
    d1 = today.strftime("%Y%m%d")  # YYYYMMDD date stamp
    URL = 'https://www.myprotein.tw/voucher-codes.list'
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.findAll("h2", {"class": "voucher-title"})
    day = soup.findAll("div", {"class": "voucher-end-date"})
    msg = soup.findAll("div", {"class": "voucher-message"})
    numbers = [d.string for d in title]
    if len(title) != len(day) or len(title) != len(msg) or len(day) != len(msg):
        return 1
    discounts = []
    for t in title:
        tmp = t.text.split(u' ')
        for i in range(len(tmp)):
            if tmp[i][0] == u'\u6298':  # '折' marks a "percent off" figure
                # '折' sometimes is not describing what percent off
                try:
                    if int(tmp[i - 1]) < 10:
                        discounts.append(100 - int(tmp[i - 1]) * 10)
                    else:
                        discounts.append(100 - int(tmp[i - 1]))
                except ValueError:
                    continue
    fields = [today, max(discounts)]
    with open('log.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow(fields)
    # saveEventsToHtml(title, day, msg, numbers, d1)
    return 0
def get_coordinate():
    with open('loupan_url.txt', 'r', encoding='utf-8') as fin:
        for line in fin.readlines():
            line = line.strip().split('\t')
            url = line[0]
            print(url)
            name = line[1]
            price = line[2]
            la = ''
            lo = ''
            try:
                data = requests.get(url, headers=headers)
                data.encoding = 'gb18030'
                data = BeautifulSoup(data.text, "lxml").body
                try:
                    map_src = data.find('div', {'class': 'mapbox'}).iframe.get('src').strip()
                    map_src = 'http:' + map_src
                    data = requests.get(map_src, headers=headers)
                    data.encoding = 'gb18030'
                    data = BeautifulSoup(data.text, "lxml").body.script.text
                    if 'coord' in data:
                        meta = data.strip().split(',')
                        for item in meta:
                            if 'coordx' in item or 'baidu_coord_x' in item:
                                la = item.strip().split(':')[1].replace('"', '')
                            if 'coordy' in item or 'baidu_coord_y' in item:
                                lo = item.strip().split(':')[1].replace('"', '')
                except Exception:
                    print('error\t', url)
                    with open('error_lalo.txt', 'a') as f:
                        f.write(url + '\n')
            except Exception:
                print('error\t', url)
                with open('error_lalo.txt', 'a') as f:
                    f.write(url + '\n')
            if la != '' or lo != '':
                with open('loupan_url_lalo.txt', 'a', encoding='utf-8') as f:
                    f.write(url + '\t' + name + '\t' + price + '\t' + la + ' ' + lo + '\n')
            time.sleep(3)
def crawl(self):
    ls = []
    super(Webspider, self).__init__()
    if self.start_page == self.end_page:
        html = requests.get(self.html, headers=self.headers)
        html_source = BeautifulSoup(html.content, 'lxml')
        meizi = html_source.select('.commentlist .view_img_link')
        ls.append(meizi)
    else:
        for i in range(int(self.end_page) - int(self.start_page)):
            if self.start_page <= self.end_page:
                # NOTE: self.html never changes here, so every pass re-fetches
                # the same page; the URL would need to advance with i.
                html_source = requests.get(self.html, headers=self.headers)
                page_source = BeautifulSoup(html_source.content, 'lxml')
                meizi = page_source.select('.commentlist .view_img_link')
                ls.append(meizi)
            else:
                print('Start page is greater than the end page!')
                break
    return ls
def get_heritageLink():
    heritageLinks = []
    for eachEntry in get_ancestryLink():
        res = requests.get("https://2e.aonprd.com/" + eachEntry)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')
        # Remove <br> tags so sibling traversal is not interrupted by line breaks.
        for linebreak in soup.find_all('br'):
            linebreak.extract()
        subNav = soup.find("span", {'id': 'ctl00_MainContent_SubNavigation'})
        try:
            heritageLink = subNav.find_all("a")
            heritageLinks.append(heritageLink[2])
        except (AttributeError, IndexError):
            # No sub-navigation (or too few links): no heritages on this page.
            pass
    return heritageLinks
def get_content(self, url):
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    body = response.read()
    soup = BeautifulSoup(body, 'lxml', from_encoding="utf-8")
    content = []
    for p in soup.select(".topic")[0].select('p'):
        content.append({'value': p.text})
    return content
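# urllib2 exists only on Python 2. A sketch of the same fetch under Python 3,
# where the module moved to urllib.request (function name is hypothetical; the
# parsing logic is unchanged):
import urllib.request
from bs4 import BeautifulSoup

def get_content_py3(url):
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib.request.Request(url, headers=headers)
    body = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(body, 'lxml', from_encoding="utf-8")
    return [{'value': p.text} for p in soup.select(".topic")[0].select('p')]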
def get_details():
    details = []
    for eachEntry in get_ancestryLink():
        detail = {}
        res = requests.get("https://2e.aonprd.com/" + eachEntry)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')
        # Remove <br> tags so next_sibling traversal skips line breaks.
        for linebreak in soup.find_all('br'):
            linebreak.extract()
        main = soup.find("div", {'id': 'main'})
        pfsLegal = bool(main.find("img", {'title': 'PFS Standard'}))
        # Temporary workaround for an error in the HTML source.
        for finder in main.find_all("a", {'href': eachEntry}):
            name = finder.text
        print('Start with: ', name)
        traitsarray = main.find_all("span", {"class": lambda L: L and L.startswith('trai')})
        traitHolder = [trait.text for trait in traitsarray]
        textHolder = []
        # First we grab the content from the meta tag, then split it into groups
        # of strings separated by < > to pull out any links, join every other
        # group (passing over the link groups), and strip whitespace.
        textRaw = soup.find("meta", {'name': 'description'})['content']
        textSplit = re.split('<(.*?)>', textRaw)
        textClean = (''.join(textSplit[::2])).strip()
        textHolder.append(encoder(textClean))
        description = []
        # The first <i> tags; what we want should be in descRaw[1].
        descRaw = main.find_all("i")
        descSplit = re.split('<(.*?)>', ''.join(descRaw[1]))
        descClean = ''.join(descSplit[::2])
        description.append(encoder(descClean))
        detail['name'] = name
        # The source is the text of the first <a class="external-link"> tag.
        detail['source'] = encoder(soup.find("a", {'class': 'external-link'}).text)
        detail['traits'] = traitHolder
        detail['description'] = description
        detail['text'] = " ".join(textHolder)
        physPoint = soup.find("h2", string='Physical Description').next_sibling
        detail['physical'] = cleanSplit(physPoint)  # cleanSplit cleans up the list
        socPoint = soup.find("h2", string='Society').next_sibling
        detail['society'] = cleanSplit(socPoint)
        alignPoint = soup.find("h2", string='Alignment and Religion').next_sibling
        detail['alignment'] = cleanSplit(alignPoint)
        namesTypePoint = soup.find("h2", string='Names').next_sibling
        detail['namesType'] = cleanSplit(namesTypePoint)
        detail['abilityBoosts'] = titleContent(soup.find("h2", string='Ability Boosts'))
        if soup.find("h2", string='Ability Flaw(s)'):  # not all ancestries have flaws
            detail['abilityFlaws'] = titleContent(soup.find("h2", string='Ability Flaw(s)'))
        detail['hp'] = str(soup.find("h2", string='Hit Points').next_sibling)
        detail['size'] = str(soup.find("h2", string='Size').next_sibling)
        detail['speed'] = str(soup.find("h2", string='Speed').next_sibling)
        langPoint = soup.find("h2", string='Languages').next_sibling
        detail['languages'] = cleanSplit(langPoint)
        # Split so anyone can draw out a single sample name; strip each entry.
        nameList = (encoder(str(soup.find("h3", string='Sample Names').next_sibling))).split(', ')
        nameList = [nameItem.strip() for nameItem in nameList]
        detail['nameList'] = nameList
        mightList = []
        mightRaw = soup.find("h2", string='You Might...').next_sibling
        for mightItem in mightRaw.find_all("li"):
            mightList.append(encoder(encoder(mightItem.text)))
        detail['might'] = mightList
        probablyList = []
        probablyRaw = soup.find("h2", string='Others Probably...').next_sibling
        for probablyItem in probablyRaw.find_all("li"):
            probablyList.append(encoder(encoder(probablyItem.text)))
        detail['probably'] = probablyList
        detail['pfsLegal'] = pfsLegal
        details.append(detail)
    return details
def get_details():
    details = []
    for eachEntry in get_heritageLink():
        nameAncestry = eachEntry.text.replace(' Heritages', '')
        linkAncestry = eachEntry.get('href')
        res = requests.get("https://2e.aonprd.com/" + linkAncestry)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')
        # Remove <br> and <hr> tags so next_sibling traversal is not interrupted.
        for linebreak in soup.find_all('br'):
            linebreak.extract()
        for linebreak2 in soup.find_all('hr'):
            linebreak2.extract()
        main = soup.find("div", {'id': 'main'})
        print('start with: ', nameAncestry)
        # H1 problem on the HTML page: both branches below pick up the element.
        for finder in main.find_all("a", {'href': lambda L: L and L.startswith('Heritages.aspx')}):
            detail = {}
            if not finder.text.endswith('eritages'):
                prevElement = finder.previous_sibling
                pfsLegal = bool(prevElement.find("img", {'title': 'PFS Standard'}))
                name = finder.text
                detail['heritageName'] = name
                th1 = finder.next_sibling
                if th1 is not None:
                    th2 = th1.next_sibling
                    sourceHref = th2.next_sibling
                else:
                    th1 = finder.parent.next_sibling
                    th2 = th1.next_sibling
                    sourceHref = th2.next_sibling
                detail['source'] = sourceHref.text
                detail['sourceLink'] = sourceHref.get('href')
                description = sourceHref.next_sibling
                detail['desc'] = encoder(description)
                ah1 = description.next_sibling
                if ah1 is not None and ah1.name == 'h3':
                    detailAbility = {}
                    detailAbility['abilityName'] = ah1.text
                    detailAbility['abilityActions'] = ah1.find("img").get('alt')
                    traitsFinder = ah1.next_sibling
                    if traitsFinder.get('class'):
                        traitsHolder = []
                        while traitsFinder.get('class'):
                            traitsFinder = traitsFinder.next_sibling
                            traitsHolder.append(traitsFinder.text)
                        detailAbility['abilityTraits'] = traitsHolder
                    ak = traitsFinder.next_sibling
                    abiSource = ak.next_sibling
                    detailAbility['abilitySource'] = abiSource.text
                    detailAbility['abilitySourceLink'] = abiSource.get('href')
                    # Trailing <b>label</b> + value pairs become extra dict entries.
                    extraEleAbi = abiSource.next_sibling
                    while extraEleAbi.name == 'b':
                        detailAbility[extraEleAbi.text] = encoder(extraEleAbi.next_sibling)
                        ar = extraEleAbi.next_sibling
                        extraEleAbi = ar.next_sibling
                    detail['ability'] = detailAbility
                detail['pfsLegal'] = pfsLegal
                detail['ancestryName'] = nameAncestry
                details.append(detail)
    return details
def run_media(pageurl):
    global queueSoup
    while True:
        mediaid = re.search(r'[^\d](\d{6})(?:[^\d]|$)', pageurl).group(1)
        data = {
            'req': 'RpcApiVideoPlayer_GetStandardConfig',
            'media_id': mediaid,
            'video_format': '108',
            'video_quality': '80',
            'current_page': pageurl
        }
        print_overridable('Fetching media information...')
        config = requests.get('http://www.crunchyroll.com/xml/', headers=rpc_headers, params=data, cookies=cookies)
        config.encoding = 'utf-8'
        print_overridable()
        if config.status_code != 200:
            print(color.RED + 'Error: ' + config.text + color.END)
            return
        # What is this even? Does it catch some specific media or 404 pages?
        if len(config.text) < 100:
            print(config.url)
            print(config.text)
            return
        config = BeautifulSoup(config.text, 'lxml-xml')
        # Check for errors
        error = config.find('error')
        if error:
            print(color.RED + 'Error: ' + error.msg.text + color.END)
            return
        # Check if media is unavailable
        error = config.find('upsell')
        if error:
            print(color.RED + 'Error: Media is only available for premium members' + color.END)
            return
        nextEpisode = config.find('nextUrl').text
        series = config.series_title.text
        epnum = config.episode_number.text
        episode = config.episode_title.text
        duration = config.duration.text
        print('{} - E{}'.format(series, epnum))
        print(episode)
        print('Duration: {}'.format(mmss(duration)))
        sub = config.find('subtitle', attrs={'link': None})
        if sub:
            print_overridable('Preparing subtitles...')
            _id = int(sub['id'])
            _iv = sub.iv.text
            _subdata = sub.data.text
            with open(SUBTITLE_TEMP_PATH, 'w') as subfile:
                subfile.write(convert(decode_subtitles(_id, _iv, _subdata).decode('utf-8')))
        print_overridable('Fetching stream information...')
        data['req'] = 'RpcApiVideoEncode_GetStreamInfo'
        streamconfig = BeautifulSoup(requests.post('http://www.crunchyroll.com/xml', headers=rpc_headers, data=data, cookies=cookies).text, 'lxml-xml')
        print_overridable('Starting stream...')
        playhead = 0
        if not streamconfig.host.text:
            url = streamconfig.file.text
            subprocess.call(['mpv', url])
        else:
            host = streamconfig.host.text
            file = streamconfig.file.text
            if re.search(r'fplive\.net', host):
                url1, = re.findall(r'.+/c[0-9]+', host)
                url2, = re.findall(r'c[0-9]+\?.+', host)
            else:
                url1, = re.findall('.+/ondemand/', host)
                url2, = re.findall('ondemand/.+', host)
            subarg = ""
            if sub:
                subarg = " --sub-file " + SUBTITLE_TEMP_PATH
            proc = subprocess.Popen(
                ["rtmpdump -a '" + url2 + "' --flashVer 'WIN 11,8,800,50' -m 15 --pageUrl '" + pageurl + "' --rtmp '" + url1 + "' --swfVfy http://www.crunchyroll.com/vendor/ChromelessPlayerApp-c0d121b.swf -y '" + file + "' | mpv --force-seekable=yes" + subarg + " -"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                bufsize=1,
                shell=True
            )
            # Pick up stderr for playhead information
            while True:
                line = proc.stderr.readline().decode("utf-8")
                if line == '' and proc.poll() is not None:
                    break
                timestamp = re.search('AV: ([0-9]{2}:[0-9]{2}:[0-9]{2}) / ([0-9]{2}:[0-9]{2}:[0-9]{2})', line)
                if timestamp:
                    current = [int(i) for i in timestamp.group(1).split(":")]
                    playhead = (current[0] * 60 + current[1]) * 60 + current[2]
                    print_overridable('Playhead: {}'.format(mmss(playhead)))
            print_under()
        if sub:
            os.remove(SUBTITLE_TEMP_PATH)
        if authenticated and input_yes('Do you want to update seen duration to {}/{}'.format(mmss(playhead), mmss(duration))):
            print_overridable('Updating seen duration...')
            data = {
                'req': 'RpcApiVideo_VideoView',
                'media_id': mediaid,
                'cbcallcount': 0,
                'cbelapsed': 30,
                'playhead': config.duration  # NOTE: sends the <duration> Tag, not the watched playhead
            }
            resp = requests.get('http://www.crunchyroll.com/xml/', headers=rpc_headers, params=data, cookies=cookies)
            if resp.status_code != 200:
                print_overridable(color.RED + 'Error: ' + resp.text + color.END, True)
            else:
                print_overridable(color.GREEN + 'Seen duration was saved' + color.END, True)
                update_queue()  # We update the queue after marking episode as seen!
        if nextEpisode != "":
            if input_yes('Another episode is available, do you want to watch it'):
                pageurl = nextEpisode
            else:
                break
        else:
            print(color.RED + 'No more episodes available' + color.END)
            break
)[0].font.font.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.text
num_pages = int(re.sub(r'\W', '', num_pages))

### Get the data
data_list = defaultdict(list)
for i in range(1, num_pages + 1):
    start_time = time.time()
    ### Get web page
    page = requests.get('http://www.baibai.com.tw/temple.asp?Page=' + str(i)
                        + '&name=&keyword=&morder=')
    page.encoding = 'big5'
    page = BeautifulSoup(page.text)
    num_items_per_page = int(
        len(page.tr.next_sibling.next_sibling.find_all(
            'a', {"href": re.compile(r'^view-temple\.asp\?com_ser=')})) / 3)
    step = int(
        len(page.tr.next_sibling.next_sibling.find_all(
            'a', {"href": re.compile(r'^view-temple\.asp\?com_ser=')})) / num_items_per_page)
    ### Get data
    ## Get name of the temple
from bs4 import BeautifulSoup
from urllib.request import urlopen
import sys

# Read the list of article paths to fetch.
f = open(r'content_page_list_tichong_tichu.txt', 'r')
a = []
for i in f.readlines():
    lst = i.strip('\n')
    a.append(lst)
f.close()

for x1 in a:
    try:
        html = urlopen("http://www.langji520.com" + x1)
        ctObj = BeautifulSoup(html, "html.parser")
        content = ctObj.find(name='div', attrs={'class': "article_con"}).find("p").get_text()
        print(content)
        # `with` guarantees the output file is closed even if the write fails.
        with open("langji520.txt", "a") as fo:
            fo.write(content + '\r\n')
        print("write Successful!" + str(x1) + "Still remaining:" + str(len(a) - a.index(x1)))
    except AttributeError as reason:
        print("AttributeError:" + str(reason) + str(x1))
    except UnicodeEncodeError as reason:
        print(str(reason) + str(x1))
def get_details():
    details = []
    for eachEntry in get_ancestryLink():
        detail = {}
        res = requests.get("https://2e.aonprd.com/" + eachEntry)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')
        # Remove <br> tags so sibling traversal skips line breaks.
        for linebreak in soup.find_all('br'):
            linebreak.extract()
        main = soup.find("div", {'id': 'main'})
        pfsLegal = bool(main.find("img", {'title': 'PFS Standard'}))
        # Temporary workaround for an error in the HTML source.
        for finder in main.find_all("a", {'href': eachEntry}):
            name = finder.text
        print('Start with: ', name)
        traitsarray = main.find_all("span", {"class": lambda L: L and L.startswith('trai')})
        traitHolder = [trait.text for trait in traitsarray]
        description = []
        children = main.contents
        detailHolder = []
        for child in children:
            stringContents = str(child)  # str, not bytes, so startswith('<') works
            if stringContents.startswith("<"):
                if child.name == "a":
                    try:
                        if child['class'][0] == "external-link":
                            detail['source'] = child.text
                    except (KeyError, TypeError):
                        pass
                if child.name == "b":
                    if child.text != "Source":
                        tagType = child.text.lower()
                if child.name == "a":
                    try:
                        if child['class'][0] == "external-link":
                            nextchild = child.next_sibling
                            if nextchild.name == "i":
                                child = nextchild
                                description.append(encoder(child.text))
                                description.extend(titleContent(child))
                    except (KeyError, TypeError):
                        pass
                if child.name == "h2" and child.text == "You Might...":
                    ul = child.next_sibling
                    liList = []
                    for li in ul.findAll('li'):
                        liList.append(encoder(encoder(li.text)))
                    detail['might'] = liList
                if child.name == "h2" and child.text == "Others Probably...":
                    ul = child.next_sibling
                    liList = []
                    for li in ul.findAll('li'):
                        liList.append(encoder(encoder(li.text)))
                    detail['probably'] = liList
                if child.name == "h2" and child.text == "Physical Description":
                    detail['physical'] = titleContent(child)
                if child.name == "h2" and child.text == "Society":
                    detail['society'] = titleContent(child)
                if child.name == "h2" and child.text == "Alignment and Religion":
                    detail['alignment'] = titleContent(child)
                if child.name == "h2" and child.text == "Names":
                    detail['namesType'] = titleContent(child)
                if child.name == "h3" and child.text == "Sample Names":
                    nameList = []
                    for word in child.next_sibling.split(', '):
                        nameList.append(encoder(word))
                    detail['nameList'] = nameList
                if child.name == "h2" and child.text == "Hit Points":
                    detail['hp'] = child.next_sibling
                if child.name == "h2" and child.text == "Size":
                    detail['size'] = child.next_sibling
                if child.name == "h2" and child.text == "Speed":
                    detail['speed'] = child.next_sibling
                if child.name == "h2" and child.text == "Ability Boosts":
                    detail['abilityBoosts'] = titleContent(child)
                if child.name == "h2" and child.text == "Ability Flaw(s)":
                    detail['abilityFlaws'] = titleContent(child)
                if child.name == "h2" and child.text == "Languages":
                    languagesHolder = []
                    nextChild = child.next_sibling
                    while nextChild is not None and nextChild.name != "h2" and nextChild.name != "h3":
                        if nextChild.name == "a":
                            languagesHolder.append(encoder(nextChild.text))
                            nextChild = nextChild.next_sibling
                        else:
                            languagesHolder.append(encoder(nextChild))
                            break
                    detail['languages'] = languagesHolder
                if child.name == "h2" and child.text == "Languages":
                    # Every heading after Languages goes into numbered 'other' entries.
                    t = 0
                    while child.find_next("h2"):
                        otherHolder = []
                        titolo = child.find_next("h2")
                        t += 1
                        otherHolder.append(titolo.text)
                        otherHolder.append(titolo.next_sibling)
                        detail['other' + str(t)] = otherHolder
                        child = child.find_next("h2")
            else:
                if not stringContents.isspace():
                    detailHolder.append(encoder(child.text))
        detail['pfsLegal'] = pfsLegal
        detail['name'] = name
        detail['traits'] = traitHolder
        detail['description'] = description
        detail['text'] = " ".join(detailHolder)
        details.append(detail)
    return details
def get_text(self, URL):
    # Each outlet keeps its article body under a different CSS class; the first
    # URL substring that matches selects the class to scrape.
    site_classes = [
        ('news.na', '_article_body_contents'),
        ('enter', 'end_body_wrp'),
        ('chosun', 'par'),
        ('daum', 'news_view'),
        ('joins', 'article_body fs1 mg'),
        ('hani', 'text'),
        ('kmib', 'tx'),
        ('hankookilbo', 'article-story'),
        ('seoul', 'user-snb-wrapper'),
        ('asiatoday', 'news_bm'),
    ]
    text = ''
    for key, css_class in site_classes:
        if key in URL:
            source_code_from_URL = urllib.request.urlopen(URL)
            soup = BeautifulSoup(source_code_from_URL, 'html.parser')
            for item in soup.find_all(class_=css_class):
                text = text + str(item.find_all(text=True))
            break
    # Strip whitespace entities, scripts, escaped \xa0, punctuation, and Latin
    # letters. Every substitution is idempotent, so a single pass is enough.
    text = re.sub(' | |\t|\r|\n', ' ', text)
    text = re.sub('<script.*?>.*?</script>', '', text)
    text = text.replace(r"\xa0", "")
    text = re.sub(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\\=\(\'\"]', "", text)
    text = re.sub('[a-zA-Z]', '', text)
    text = text.replace(" ", "")
    text = text.strip()
    return text
import requests
import time
from bs4 import BeautifulSoup
from flask import Flask, request, abort
from selenium import webdriver
import sys
import datetime
import json
import os

# Poll the daily pig-sale page every five minutes.
while True:
    pageRequest = requests.get('https://www.naif.org.tw/infoPigSellDaily.aspx')
    soup = BeautifulSoup(pageRequest.content, 'html.parser')
    pigpig = soup.find(attrs={"class": "ScrollForm"}).text
    print(pigpig)
    time.sleep(60 * 5)