def addSection(link, title): if not 'http' in link: page = urllib2.urlopen('http://www.paulgraham.com/'+link).read() soup = BeautifulSoup(page) soup.prettify() else: page = urllib2.urlopen(link).read() section = ez_epub.Section() try: section.title = title print section.title if not 'http' in link: font = str(soup.findAll('table', {'width':'435'})[0].findAll('font')[0]) if not 'Get funded by' in font and not 'Watch how this essay was' in font and not 'Like to build things?' in font and not len(font)<100: content = font else: content = '' for par in soup.findAll('table', {'width':'435'})[0].findAll('p'): content += str(par) for p in content.split("<br /><br />"): section.text.append(genshi.core.Markup(p)) #exception for Subject: Airbnb for pre in soup.findAll('pre'): section.text.append(genshi.core.Markup(pre)) else: for p in str(page).replace("\n","<br />").split("<br /><br />"): section.text.append(genshi.core.Markup(p)) except: pass return section
def convert_to_lca_style(filename):
    """Strip styling markup from <filename>.htm and write the cleaned
    document to <filename>.html.

    Removes <style>/<head> entirely, unwraps b/div/span/p (content kept),
    drops inline style attributes on <td>, and unwraps html/body.
    """
    with open(filename + '.htm', encoding='utf-8') as infile:
        soup = BeautifulSoup(infile)
    soup.prettify()
    # remove style and head tags entirely
    soup.style.decompose()
    soup.head.decompose()
    # unwrap b, div, span, p (content is kept).
    # BUG FIX: the original called soup.b.unwrap() etc. inside the loop,
    # which re-searches the tree each pass and raises AttributeError on None
    # if the counts drift; unwrap the tag the loop already holds.
    for tag in soup.find_all("b"):
        tag.unwrap()
    for tag in soup.find_all("div"):
        tag.unwrap()
    for tag in soup.find_all("span"):
        tag.unwrap()
    for tag in soup.find_all("p"):
        tag.unwrap()
    # td: drop inline style attributes
    for td in soup.find_all("td"):
        del td['style']
    # unwrap html and body (content is kept)
    soup.html.unwrap()
    soup.body.unwrap()
    # Files are now closed deterministically via context managers.
    with open(filename + '.html', mode='w', encoding='utf-8') as outfile:
        outfile.write(soup.prettify())
    return
def get_urls(url): data = get_page(url) soup = BeautifulSoup(data) soup.prettify() basic_url = "http://news.donga.com" contents = soup.findAll("p", { "class" : "title" }) for content in contents: if str(content) != '\n': data1 = str(content) soup1 = BeautifulSoup(data1) for link in soup1.findAll('p'): url = basic_url + str(link.contents[1]['href']) final_content = get_articles(url)[0] # 기사내용 final_content_length = get_articles(url)[1] # 기사길이 if len(link.contents[1].contents) != 0: # 기사제목이 없을때 에러방지 final_title = str(link.contents[1].contents[0]) else: final_title = "" for p in list(punctuation): final_title = final_title.replace(p,' ') final_title = final_title.replace("“",' ').replace("”",' ').replace("·",' ').replace("△",' ').replace("■",' ').replace("‘",' ').replace("’",' ').replace("…",' ').replace("▲",' ').replace("⊙",' ').replace("◇",' ').replace("▶",' ').replace("◆",' ') final_title = final_title.strip() final_title = re.sub(' ',' ',final_title) year = str(link.contents[3])[7:11] month = str(link.contents[3])[12:14] day = str(link.contents[3])[15:17] print year + "-" + month + "-" + day '''
def analyze(page): log("Analyzing, extracting ...") soup = BeautifulSoup(page) soup.prettify() word_item = MWordItem() # url url_tag = soup.find("meta", {"property": "og:url"}) word_item.source_url = url_tag["content"] # 单词 word_tag = soup.find("strong", class_="main_entry_word") word_item.word = word_tag.string # 词性 func_tag = soup.find("p", class_="word_function") word_item.func = func_tag.text # 释义 sense_tag_list = soup.find_all("span", class_="ssens") word_item.sense_list = [sense_tag.text[2:] for sense_tag in sense_tag_list] # example & do you know example_dyn_tag_list = soup.find_all("p", class_="word_example_didu") if len(example_dyn_tag_list) != 2: log('Should Contain both example & "do you know" parts, Please Check') else: word_item.example = example_dyn_tag_list[0].text story = example_dyn_tag_list[1].text word_item.story = re.sub(r"((Test Your Memory)|(Name That Synonym)).+$", r"", story) print word_item.story return word_item
def download_json_files():
    """Download upcoming-schedule JSON files listed on json.xmltv.se into
    /tmp/xmltv_convert/json, filtered by the global `channels` list.
    """
    if not os.path.exists('/tmp/xmltv_convert/json'):
        os.makedirs('/tmp/xmltv_convert/json')
    page = urllib2.urlopen('http://json.xmltv.se/')
    soup = BeautifulSoup(page)
    soup.prettify()
    for anchor in soup.findAll('a', href=True):
        if anchor['href'] != '../':
            try:
                anchor_list = anchor['href'].split("_")
                channel = anchor_list[0]
                filedate = datetime.datetime.strptime(anchor_list[1][0:10], "%Y-%m-%d").date()
            except IndexError:
                # Filenames without a date part (e.g. channels.js.gz) count as today.
                filedate = datetime.datetime.today().date()
            if filedate >= datetime.datetime.today().date():
                if len(channels) == 0 or channel in channels or channel == "channels.js.gz":
                    stdout.write("Downloading http://xmltv.tvtab.la/json/%s " % anchor['href'])
                    f = urllib2.urlopen('http://xmltv.tvtab.la/json/%s' % anchor['href'])
                    data = f.read()
                    # BUG FIX: mode was 'w+ ' (trailing space), which raises
                    # ValueError("invalid mode"); the payload is raw bytes,
                    # so open in binary write mode.
                    with open('/tmp/xmltv_convert/json/%s' % anchor['href'].replace('.gz', ''), 'wb') as outfile:
                        outfile.write(data)
                    stdout.write("Done!\n")
                    stdout.flush()
def getImg(requestUrl):
    """Scrape one comment page and download every image whose vote count
    exceeds the global `votelimit`.
    """
    theHtml = rs.get(requestUrl).text
    # BeautifulSoup to get the page html
    soup = BeautifulSoup(theHtml, 'lxml')
    soup.prettify()
    theCommentList = soup.find('ol', class_='commentlist')
    for li in theCommentList.find_all('li'):
        if li.get('id') == 'adsense':
            continue  # skip advertisement entries
        vote = int(li.find('div', class_='vote').find_all('span')[1].string)
        if vote > votelimit:
            try:
                originalLink = li.find('span', class_='righttext').find('a').get('href')
                dirName = getDirFrom(originalLink)
                name = getPreNameFrom(originalLink)
                imageUrl = li.find('a', class_='view_img_link').get('href')
                vote = li.find('div', class_='vote').find_all('span')[1].string
                prefixName = rename(name, vote)
                extraName = getFileExt(imageUrl)
                downloadImg('http:' + imageUrl, dirName, prefixName, extraName, originalLink)
                print(vote)
            except Exception:
                # BUG FIX: was a bare "except:" that also swallowed
                # SystemExit/KeyboardInterrupt; keep best-effort per item
                # but only for real errors. (Also removed an unused local
                # that shadowed the builtin `id`.)
                continue
def bsoup(): r = requests.get("https://answers.yahoo.com/question/index?qid=20080613085817AAqvcNW") soup = BeautifulSoup (open(r.content)) string = soup.findall("div", {"class":"group"}) print soup.finalall(re.compile("^[A-Z]")) print soup.prettify() print string
def hello_world(): ''' BeautifulSoup ''' html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie/chen/hong" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie/huan/jiang" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie/pang/guai" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ soup = BeautifulSoup(html_doc, 'html.parser') print soup.prettify() # 下面两种方式一样 print soup.find_all('a') links = soup('a') for link in links: print link.attrs print link.string print link.contents
def get_data_apple(self):
    """
    Grabs official releases from Apple site.
    The code uses BeautifulSoup to grab the first table. The code looks for
    a row with the right Software.
    In case the Apple's format has changed a KeyError is thrown.
    :return:
    """
    markup = urllib2.urlopen(self.data_dic[self.name]["link"]).read()
    soup = BeautifulSoup(markup)
    soup.prettify()
    for row in soup.find('table').find_all('tr'):
        # First cell holds the software name/version text to match against.
        match = re.search(self.data_dic[self.name]["format"], row.contents[0].get_text())
        if match:
            self.version = match.group(1)
            self.date = datetime.datetime.strptime(
                row.contents[4].get_text(),
                self.data_dic[self.name]["date_format"])
            self.generate_table()
            return
    raise KeyError('ERROR: No new releases in the Apple site, please change url or matching pattern')
def walmartSingleItemCrawler(link): soup = BeautifulSoup(urllib2.urlopen(link), 'html.parser') print soup.prettify() # name name = soup(class_ = re.compile("^js-product-heading"))[0].span print "1.Name:" + name.text price = soup(class_ = re.compile("^js-price-display"))[0].text print "2.Price" + price unitPrice = soup(class_ = re.compile("^unit-price-display"))[0].text print "3.unit price" + unitPrice img_src = soup(class_ = "product-image js-product-image js-product-primary-image")[0]['src'] print "4.img_src:" + img_src print "5.About this item ===========" itemInfo = soup(class_ ="product-description-disclaimer-mweb")[0].text print itemInfo liItemInfo = soup(class_= "about-item-preview-text js-about-item-preview-text")[0] # print liItemInfo for li in liItemInfo: print "*" + li.text print "6. Rank ======" itemRank = soup(class_="Grid-col item-ranks")[0] print itemRank starRated = soup(class_ = "Grid mweb-snippet-stars")[0].find_all("i", class_="star star-rated") print "7. star review:" print "star " + str(len(starRated)) + " out of 5"
def download_json_files():
    """Download schedule JSON files no older than one week from
    json.xmltv.se into /tmp/xmltv_convert/json, filtered by `channels`.
    """
    if not os.path.exists("/tmp/xmltv_convert/json"):
        os.makedirs("/tmp/xmltv_convert/json")
    page = urllib2.urlopen("http://json.xmltv.se/")
    soup = BeautifulSoup(page)
    soup.prettify()
    for anchor in soup.findAll("a", href=True):
        if anchor["href"] != "../":
            aweekago = datetime.datetime.now() - datetime.timedelta(days=7)
            try:
                anchor_list = anchor["href"].split("_")
                channel = anchor_list[0]
                filedate = datetime.datetime.strptime(anchor_list[1][0:10], "%Y-%m-%d").date()
            except IndexError:
                # Filenames without a date part (e.g. channels.js.gz)
                # are treated as exactly one week old, i.e. still wanted.
                filedate = aweekago.date()
            if filedate >= aweekago.date():
                if len(channels) == 0 or channel in channels or channel == "channels.js.gz":
                    stdout.write("Downloading http://json.xmltv.se/%s " % anchor["href"])
                    f = urllib2.urlopen("http://json.xmltv.se/%s" % anchor["href"])
                    data = f.read()
                    # BUG FIX: mode was "w+ " (trailing space), which raises
                    # ValueError("invalid mode"); the payload is raw bytes,
                    # so open in binary write mode.
                    with open("/tmp/xmltv_convert/json/%s" % anchor["href"].replace(".gz", ""), "wb") as outfile:
                        outfile.write(data)
                    stdout.write("Done!\n")
                    stdout.flush()
def get_parteredmenyek(content):
    """Extract party-list vote statistics from a Hungarian election page.

    Returns a list of dicts with statistics_code / statistics_name / value,
    starting with a (currently zeroed) non-voters entry.
    """
    soup = BeautifulSoup(content, from_encoding='utf-8')
    soup.prettify(formatter=lambda s: s.replace(u'\xa0', ' '))
    results = []
    # "Jegyzőkönyv" (protocol) section holds the voter-count table.
    voter_data = soup.find(text='Jegyzőkönyv').find_next('table').find_all('td')
    total = voter_data[0].text
    voters = list(voter_data[1])
    #non_voters = int(total) - int(voters[0].replace(' ', ''))
    non_voters = 0
    results.append({
        'statistics_code': 'non-voters',
        'statistics_name': 'Non voters',
        'value': non_voters,
    })
    # Per-party vote counts follow the "votes per party list" heading.
    jelolt_table = soup.find('p', text='A szavazatok száma pártlistánként').find_next('table')
    for row in jelolt_table.find_all('tr')[1:]:
        cells = row.find_all('td')
        results.append({
            'statistics_code': slugify(cells[1].text),
            'statistics_name': cells[1].text,
            'value': cells[2].text,
        })
    return results
def get_data_ios(self):
    """
    Grabs official releases Data from Wikipedia page regarding iOS releases.
    The code uses BeautifulSoup to grab the first table. The code looks for
    the entry which is colored in green (this color is defined in the XML
    file and can be easily changed without opening the code).
    In case the Wikipedia format is changed a KeyError is thrown.
    :return:
    """
    markup = urllib2.urlopen(self.link).read()
    soup = BeautifulSoup(markup)
    soup.prettify()
    for row in soup.find('table').find_all('tr'):
        # The current release row is the one highlighted with self.color.
        highlighted = row.find('td', {'style': 'background:' + self.color + ';'})
        if highlighted:
            cells = row.find_all('td')
            self.version = highlighted.get_text()
            # Release date is published in the microformat span.
            self.date = datetime.datetime.strptime(
                row.find('span', {'class': 'bday dtstart published updated'}).get_text(),
                '%Y-%m-%d')
            self.TableObj[self.soft_name][1] = self
            self.TableStr[self.soft_name][1] = self.return_date()
            return
    raise KeyError('ERROR: The Wikipedia format has probably changed.')
def get_data_selenium(self):
    """
    Grabs official releases from Selenium site.
    The code uses BeautifulSoup to grab the first table and looks for the
    "Python" row, storing its version and release date.
    In case the Selenium's format has changed a KeyError is thrown.
    :return:
    """
    try:
        self.link = self.data_dic[self.name]["link"]
        html = urllib2.urlopen(self.link).read()
        soup = BeautifulSoup(html)
        soup.prettify()
        for row in soup.find('table').find_all('tr'):
            if row.contents[1].get_text() == "Python":
                self.version = row.contents[3].get_text()
                self.date = datetime.datetime.strptime(
                    row.contents[5].get_text(),
                    self.data_dic[self.name]["date_format"])
                self.generate_table(self.link, self.data_dic[self.name]["link2"])
                return
        # No "Python" row found: fall through to the unified error below.
        raise KeyError()
    except Exception:
        # BUG FIX: was a bare "except:" which also swallowed
        # SystemExit/KeyboardInterrupt; only real errors should be
        # converted to the unified KeyError.
        raise KeyError('ERROR: Error reading version or date: ' + self.name)
def _get_out_links(article, doc):
    """Append every outbound link of the article HTML, made absolute
    against article.url, to article.out_links.

    NOTE(review): needs to focus on only relevant links (is it an actual
    article?) -- inherited TODO from the original.
    """
    soup = BS(article.html)
    soup.prettify()
    anchors = soup.findAll('a')
    for anchor in anchors:
        article.out_links.append(urljoin(article.url, anchor.get('href')))
def parse_members_info(pages):
    """Parse member rows out of a list of HTML pages.

    Returns a list of dicts with username, nickname, login_id, points,
    days, rate and whether the member has checked in today.
    """
    members = []
    for page in pages:
        soup = BeautifulSoup(page)
        soup.prettify()
        for row in soup.find_all('tr', {'class': 'member'}):
            cells = row.find_all('td')
            nickname_tag = cells[0].find_all('a', {'class': 'nickname'})[0]
            # A green "label-success" badge marks today's check-in.
            badge_classes = cells[4].find_all('span')[0].get('class')
            members.append({
                'username': cells[0].find_all('img')[0].get('alt'),
                'nickname': nickname_tag.get_text(),
                'login_id': _get_number_out(nickname_tag.get('href')),
                'points': cells[1].get_text(),
                'days': _get_number_out(cells[2].get_text()),
                'rate': cells[3].get_text(),
                'checked_today': 'label-success' in badge_classes,
            })
    return members
def __processFile(self, input_file, output_file, recursing):
    """Process a single file at path input_file and write processed file
    to path output_file. May be called recursively depending on template
    structure.
    """
    f_in = open(input_file, "r", encoding="utf-8")
    soup = BeautifulSoup(f_in)
    # find all comments in the document
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    # parse out the template filepath from each comment, recursively obtain
    # the content, and replace the comment with the content to be inserted
    for comment in comments:
        # TODO: fix regex to include 'file' keyword
        match = re.match('^#include.+?virtual=\"(.+?)\"', comment)
        if match:
            inc_path = match.group(1)
            if inc_path:
                inc_content = BeautifulSoup(self.__processFile(self.input_dir + '/' + inc_path, "", True))
                # BUG FIX: insert_before() detaches each tag from
                # inc_content, so iterating .contents directly mutates the
                # list mid-iteration and skips every other node; iterate a
                # snapshot instead.
                for tag in list(inc_content.contents):
                    comment.insert_before(tag)
                comment.extract()
    f_in.close()
    # if we are recursing, return our result as a string so it can be
    # included recursively
    if recursing:
        return soup.prettify(formatter=None)
    # if not, this must be the original SHTML file, so write our result to
    # output_file as an HTML document
    else:
        f_out = open(output_file, "w", encoding="utf-8")
        f_out.write(soup.prettify(formatter=None))
        f_out.close()
def parse_checkin(checkin_page):
    """Parse up to seven recent check-ins from a profile page.

    Returns a list of [date, counts] pairs where counts holds the word /
    article / sentence / listening totals for that day (missing -> 0).
    """
    soup = BeautifulSoup(checkin_page)
    soup.prettify()
    notes, dates = [], []
    for div in soup.find_all('div', {'class': 'checkin span8'}):
        notes.append(div.find_all('div', {'class': 'note'})[0].get_text().strip())
        dates.append(div.find_all('div', {'class': 'span4'})[0].get_text().strip())
    notes, dates = notes[:7], dates[:7]  # only get 7 items
    # Pull each counter out of the free-text note via lookahead regexes.
    words = [_regex_search(r'\d+(?= 个单词)', note) for note in notes]
    reads = [_regex_search(r'\d+(?= 篇文章)', note) for note in notes]
    sents = [_regex_search(r'\d+(?= 个句子)', note) for note in notes]
    lstns = [_regex_search(r'\d+(?= 句听力)', note) for note in notes]
    dates = [_parse_chinese_date(date) for date in dates]
    checkin_list = []
    for date, word, read, sent, lstn in zip(dates, words, reads, sents, lstns):
        counts = {
            'words': 0 if word == '' else int(word),
            'reads': 0 if read == '' else int(read),
            'sents': 0 if sent == '' else int(sent),
            'lstns': 0 if lstn == '' else int(lstn),
        }
        checkin_list.append([date, counts])
    return checkin_list
def parse_total_checkin(page):
    """Return the total check-in day count shown in the stacked sidebar nav."""
    soup = BeautifulSoup(page)
    soup.prettify()
    sidebar = soup.find_all('ul', {'class': 'nav-stacked'})[0]
    # The second link's text contains the number of check-in days.
    return _get_number_out(sidebar.find_all('a')[1].get_text())
def getPageItem(self, content): if not content: print '页面加载失败' #最大页码 page_pattern = re.compile('<a href=.*?"pageTo" action-data="page=([0-9].*?)">.*?</a>') pages = re.findall(page_pattern, content) # 更新最大页码 pages_int = [int(x) for x in pages] self.maxIdx = max(pages_int) # 两个链接,时间,微博内容 # 不使用正则表达式 # img_pattern = re.compile('<li>.*?<dl class="m_photoItem m_photoItem_a phtItem_hv">.*?<a href="(.*?)">.*?' + # '<img src="(.*?)".*?/>.*?</a>.*?<span node-type="time">(.*?)</span>.*?' + # '<p title="(.*?)" class.*?</p>.*?</dd>.*?</dl></li>',re.S) # items = re.findall(img_pattern, content) html_tree = BeautifulSoup(content) html_tree.prettify() photo_list = html_tree.find_all("dl", class_="m_photoItem m_photoItem_a phtItem_hv") items = [] for photo_item in photo_list: item = {} item["detail_page"] = photo_item.find('dt').find('a')["href"] item["small_img_link"] = photo_item.find('dt').find('a').find('img')["src"] item["time"] = photo_item.find('dd').find("span", attrs={"node-type":"time"}).string try: item["description"] = photo_item.find('dd').find_all("p")[-1].get("title") except: item["description"] = None items.append(item) return items
def extract_data(code):
    """Scrape NASDAQ historical quotes for a ticker symbol.

    Returns {"data": values, "date": dates} taken from the tables in the
    historical-quotes container (each table resets the lists, so the last
    table wins -- behavior preserved from the original).
    """
    markup = urllib2.urlopen('http://www.nasdaq.com/symbol/' + code + '/historical').read()
    page = BeautifulSoup(markup)
    page.prettify()
    container = page.find(id="historicalContainer")
    for table in container.find_all('table'):
        val = []
        date = []
        # Skip the first two rows (headers).
        for tr_tag in table.find_all('tr')[2:]:
            for col, td_tag in enumerate(tr_tag.find_all('td')):
                if col == 4:
                    # Fifth column: closing value.
                    val.append(str(td_tag.string.strip()))
                elif col == 0:
                    # First column: the date.
                    date.append(str(td_tag.string.strip()))
    return {"data": val, "date": date}
def test_complete_download_and_mobilization(self): mobilizer = InstapaperMobilizer() u = urllib.urlopen(mobilizer.url("http://m.onet.pl/wiadomosci/4986708,detal.html")) soup = BeautifulSoup(u.read()) self.assertTrue(mobilizer.is_correctly_mobilized(soup), "Correctly mobilized") soup = mobilizer.post_process_html(soup) print soup.prettify()
def from_html(file_path):
    """Read an HTML file and return (visible_text, title).

    Comments, <script> and <style> elements are removed; the remaining
    text is stripped line by line with blank chunks dropped.
    """
    with open(file_path, "r") as html_file:
        html_obj = BeautifulSoup(html_file.read(), 'lxml')
    html_title = html_obj.title.string.strip()
    # Remove all comment elements
    for comment in html_obj.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # print and reparse html or we get an error for some reason
    html_obj = BeautifulSoup(html_obj.prettify(), 'lxml')
    # remove all script and style elements
    for unwanted in html_obj(["script", "style"]):
        unwanted.extract()
    # print and reparse html or we get an error for some reason
    html_obj = BeautifulSoup(html_obj.prettify(), 'lxml')
    text = html_obj.get_text()
    # strip each line, split multi-headline lines, then drop blank chunks
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text, html_obj.title.string
def get_content(self): print 'fetching from {}'.format(self.url) req = urllib2.Request(self.url,headers=headers) page = urllib2.urlopen(req).read() soup = BeautifulSoup(page) soup.prettify() print soup.html
def scrapeTickers(self): # create list of tickers url2scrape = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" page = urllib2.urlopen(url2scrape).read() soup = BeautifulSoup(page) soup.prettify() table = soup.find("table", { "class" : "wikitable sortable" }) for row in table.findAll("tr")[1:]: self.__tickerList.append(row.a.string) for ticker in self.__tickerList: tableCreate = 'CREATE TABLE IF NOT EXISTS tickers (' \ 'ID int(11) PRIMARY KEY NOT NULL AUTO_INCREMENT, ' \ 'TICKER VARCHAR(5));' addTicker = "INSERT INTO tickers (ticker) VALUE ('{}');".format(ticker) try: #self.__cursor.execute(tableCreate) self.__cursor.execute(addTicker) self.__cnxn.commit() except Exception as e: self.tlog.warning(e) print e pass else: self.__cnxn.close
def getval():
    """Extract per-student marks from saved result pages into a CSV.

    Reads results/<usn>.html for every USN in the global `usnl` list and
    appends one comma-separated row per student to output<arg1><arg2>.csv.
    """
    import codecs
    fl = codecs.open('output' + sys.argv[1] + sys.argv[2] + '.csv', 'ab', encoding="Utf-8")
    try:
        for usn in usnl:
            with open("results/" + usn + ".html", 'rb') as page_html:
                soup = BeautifulSoup(page_html)
            soup.prettify()
            record = [cell.text for cell in soup.findAll('td', {"align": "center"})]
            del record[0:4]  # drop the leading header cells
            # BUG FIX: the original called record.remove(...) while iterating
            # `record`, which skips elements (and raised ValueError when a
            # cell merely contained the letter); filter the grade-letter
            # cells out in one pass instead.
            record = [cell for cell in record if cell not in ("P", "F", "A")]
            if len(record) > 24:
                del record[24:]  # cap at 24 mark columns
            if "Total" in record:
                del record[:]  # summary rows are not student rows
            if record:
                fl.write("\n" + usn + ",")
                for cell in record:
                    fl.write(cell)
                    fl.write(",")
    finally:
        # BUG FIX: the file is now closed even if a page fails to parse.
        fl.close()
def openURL(url): print ("yooooo") r = br.open(url) # open our browser object to the comic page page = urllib2.urlopen(url).read() soup = BeautifulSoup(page) soup.prettify() html = r.read() # manga is licensed if "has been licensed, it is not available" in html: print ("Sorry, the series has been licensed") reprompt() elif "searchform_name" in html: # deal with search br.select_form(nr=1) br.submit() # print(br.response().read()) else: # does not work for half chapters at the moment chapter_num = raw_input("Chapter number: ") zero_pad_num = chapter_num.zfill(3) for chapter in soup.find_all("a", {"class": "tips"}): chapterURL = chapter.get("href") print chapterURL query = "c" + zero_pad_num if query in chapterURL: print ("found query at " + chapterURL) getFiles(chapterURL, chapter_num) # rememeber to add name of title break
def find_champ():
    """Scrape champion.gg's index and append a champions() entry to the
    global `champs` list for every champion image div found.
    """
    url = "http://champion.gg"
    markup = opener.open(url).read()
    soup = BeautifulSoup(markup)
    soup.prettify()
    for div in soup.findAll('div', {"class": "champ-index-img"}):
        champs.append(champions(div))
def extract_text(filename):
    """Extract the text of common content tags from an HTML file and write
    it to example.properties as one "tag = text ..." line per tag type.
    """
    # data extraction from the html file
    with open(filename) as source:
        data = source.read()
    # represent the html document as a nested data structure
    soup = BeautifulSoup(data, "html.parser")
    soup.prettify()
    # extract the data between <p>, <div>, <b>, <i>, <td>, <h1>...<h5>,
    # and <span> tags
    tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'p', 'div', 'b', 'i', 'td', 'span']
    # BUG FIX: the original used str.strip('<'+tag+'>') / strip('</'),
    # which strips a *character set* from both ends and therefore corrupted
    # content beginning or ending with those letters; take the element text
    # directly instead.
    vari = [[elem.get_text() for elem in soup.find_all(tag)] for tag in tags]
    # write the data into 'example.properties' (created or truncated)
    with open('example.properties', 'w+') as f:
        for tag, values in zip(tags, vari):
            f.write("" + tag + " = ")
            for value in values:
                f.write("%s " % value)
            f.write("\n")
def main():
    """Entry point for this script.

    Prettifies the HTML file named on the command line, writing it back
    in place with --write-changes or printing it otherwise.
    Returns 0 on success, 1 on any I/O error.
    """
    args = parse_command_line_arguments()
    try:
        # BUG FIX: the original leaked the file handle from
        # open(args.file); close it deterministically.
        with open(args.file) as source:
            soup = BeautifulSoup(source)
    except IOError as exception:
        print("ERROR: File '%s' could not be parsed: %s" % (args.file, exception))
        return 1
    prettified = soup.prettify(encoding="utf8")
    if args.write_changes:
        try:
            with open(args.file, 'wb') as output:
                output.write(prettified)
        except IOError as exception:
            print("ERROR: File '%s' could not be written: %s" % (args.file, exception))
            return 1
    else:
        print(prettified)
    return 0
import requests
from bs4 import BeautifulSoup

# Fetch a simple demo page and show both the raw response object and body,
# then the pretty-printed parse tree.
page = requests.get(
    "https://dataquestio.github.io/web-scraping-pages/simple.html")
print(page)
print("--------------")
print(page.content)
soup1 = BeautifulSoup(page.content, 'html.parser')
print(soup1.prettify())
def make_new_html(self):
    """Rebuild a minimal standalone HTML page from the scraped thread source.

    Wraps the post body in a fixed frame (title linked back to self.url)
    and rewrites <img> tags to point at locally saved image filenames.
    Returns the new HTML string, or None for an unknown self.html_type.
    """
    # Fixed page skeleton; title/url/body are spliced in by string slicing
    # on the '</title>', '" style', '</h1>' and '</div>' markers below.
    html_frame = dedent('''\
        <html>
        <head>
        <title>
        </title>
        </head>
        <body bgcolor="#00FFFF">
        <div style="margin:40px ; text-align:center;">
        <a href="" style="color:red;text-decoration:none;">
        <h1>
        </h1>
        </a>
        </div>
        </body>
        </html>
        ''')
    if self.html_type == 1:
        soup = BeautifulSoup(self.html_source_code, 'lxml')
        # Post body lives in the first <td class="t_f">; drop <i> decorations.
        content = soup.find('td', class_='t_f')
        [s.extract() for s in content('i')]
        img_names = []
        wrong_tag = 0
        # Images saved locally carry a 'file' attribute; others (e.g. the ad
        # smiley described in the note below) are counted as wrong tags.
        for img_name in content.find_all('img'):
            if img_name.has_attr('file'):
                img_names.append(img_name['file'].split('/')[-1])
            else:
                wrong_tag += 1
        '''
        发现了一个毒瘤(790614):
        根据<img>提取出来的图片,还有这种情况,是广告图片,要加个判断
        <img src="static/image/smiley/default/handshake.gif" smilieid="23" border="0" alt="" />
        '''
        pre_img_formats = re.findall(r'<img.+?/>', str(content))
        # Strip the surrounding <td> wrapper, keeping its inner markup.
        self.new_html_source_code = str(content)
        self.new_html_source_code = re.sub(r'<td.+?>', '', self.new_html_source_code)
        self.new_html_source_code = re.sub(r'</td>', '', self.new_html_source_code)
        # Rewrite each original <img> tag to reference the local filename.
        for i in range(len(pre_img_formats) - wrong_tag):
            after_img_formot = r'<img src="' + img_names[i] + r'"/>'
            self.new_html_source_code = self.new_html_source_code.replace(
                pre_img_formats[i], after_img_formot)
        # Splice title, url, title again, and the body into the frame.
        self.new_html_source_code = html_frame[:html_frame.index('</title>')] + self.html_title + \
            html_frame[html_frame.index('</title>'):html_frame.index('" style')] + self.url + \
            html_frame[html_frame.index('" style'):html_frame.index('</h1>')] + self.html_title + \
            html_frame[html_frame.index('</h1>'):html_frame.index('</div>')] + self.new_html_source_code + \
            html_frame[html_frame.index('</div>'):]
        new_html_soup = BeautifulSoup(self.new_html_source_code, 'lxml')
        self.new_html_source_code = str(new_html_soup.prettify())
        return self.new_html_source_code
    elif self.html_type == 2:
        '''
        干扰项太多了,直接把<div class="pattl">和第一个<td class="t_f"...>
        标签的内容“粘贴”到新的html框架中,再将图片标签修改<img src="...(本地路径)...">
        这样网页源代码会很复杂、不简洁,但干扰项的一个个删除太繁琐了
        '''
        soup = BeautifulSoup(self.html_source_code, 'lxml')
        # NOTE(review): this first assignment is a dead store -- it is
        # immediately overwritten by the fuller splice below.
        self.new_html_source_code = html_frame[:html_frame.index(
            '</title>')] + self.html_title + html_frame[html_frame.
            index('</title>'):]
        self.new_html_source_code = html_frame[:html_frame.index('</title>')] + self.html_title + \
            html_frame[html_frame.index('</title>'):html_frame.index('" style')] + self.url + \
            html_frame[html_frame.index('" style'):html_frame.index('</h1>')] + self.html_title + \
            html_frame[html_frame.index('</h1>'):]
        # Paste the first post body before the closing </div>.
        first_part_soup = soup.find('td', class_='t_f')
        [s.extract() for s in first_part_soup('i')]
        self.new_html_source_code = self.new_html_source_code[:self.new_html_source_code.index(
            '</div>')] + str(first_part_soup) + self.new_html_source_code[
            self.new_html_source_code.index('</div>'):]
        # Paste the attachment block after it, same insertion point.
        other_part_soup = soup.find('div', class_='pattl')
        self.new_html_source_code = self.new_html_source_code[:self.new_html_source_code.index(
            '</div>')] + str(other_part_soup) + self.new_html_source_code[
            self.new_html_source_code.index('</div>'):]
        # Saved photos carry a 'zoomfile' attribute with the local name.
        img_list_soup = soup.select('div.mbn.savephotop img')
        img_names = []
        for img_name in img_list_soup:
            if img_name.has_attr('zoomfile'):
                img_names.append(img_name['zoomfile'].split('/')[-1])
        pre_img_formats = [str(img_name) for img_name in img_list_soup]
        for i in range(len(pre_img_formats)):
            after_img_formot = r'<img src="' + img_names[i] + r'"/>'
            self.new_html_source_code = self.new_html_source_code.replace(
                pre_img_formats[i], after_img_formot)
        new_html_soup = BeautifulSoup(self.new_html_source_code, 'lxml')
        self.new_html_source_code = str(new_html_soup.prettify())
        return self.new_html_source_code
    else:
        print('网页类型并非1或2,,对于本网页的任务中止\n')
        return None
file.close() return data # Read the file html_file = read_file() # For parsing html we can use lxml or html.parser soup = BeautifulSoup(html_file,'html.parser') # soup = BeautifulSoup(html_file,'lxml') # soup prettify - prints html file with correct indentation print(soup.prettify()) """ Let see google.com html structure """ ua = UserAgent() header = {'user-agent':ua.chrome} # Get the response google_page = requests.get('https://www.google.com',headers=header)
import requests
from bs4 import BeautifulSoup

# Log in to douguo.com inside one session, then fetch and dump an
# activity listing page.
with requests.session() as c:
    url = 'https://passport.douguo.com/login/?next=/'
    username = '******'
    password = ''
    c.get(url)
    login_data = dict(username=username, password=password, next='/')
    c.post(url, data=login_data)
    page = c.get('http://m.douguo.com/activity/fotilebraize/index/lists/507')
    print(page.content)
    soup = BeautifulSoup(page.text, 'html.parser')
    # BUG FIX: was print(soup.prettify()).encode('gb18030') -- print()
    # returns None, so .encode raised AttributeError. Encode the markup,
    # not print's return value.
    print(soup.prettify().encode('gb18030'))
def create_html():
    """Generate the FIXM 4.2.0 -> AIRM 1.0.0 developer mapping index page.

    Creates the per-class output directory, loads the FIXM mapping records,
    renders one table row per record into the concept-list template, and
    writes the index HTML file.
    """
    # Directory that the per-class mapping pages will live in.
    mapping_pages_directory = "docs/developers/fixm-4.2.0-to-airm-1.0.0"
    path = mapping_pages_directory
    try:
        os.mkdir(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)
    import fixm
    fixm = fixm.Fixm()
    fixm_mapping_dict = fixm.fixm_mapping_dataframe.to_dict('records')
    # Index page is built from the concept-list template.
    html = open("data/html/templates/concept-list-template.html").read()
    soup = BeautifulSoup(html, "lxml")
    soup.title.string = "FIXM 4.2.0 to AIRM 1.0.0 | AIRM.aero"
    # One table row per mapping record.
    for record in fixm_mapping_dict:
        tr = soup.new_tag("tr")
        td_ic_name = soup.new_tag("td")
        td_ic_name.string = str(record["Information Concept"])
        tr.insert(1, td_ic_name)
        if record["Data Concept"] != "":
            # Link the data concept to its anchor on the per-class page.
            td_dc_name = soup.new_tag("td")
            url = "fixm-4.2.0-to-airm-1.0.0/" + record[
                "Information Concept"] + ".html" + "#" + record["Data Concept"]
            text = record["Data Concept"]
            print(text)
            new_link = soup.new_tag("a")
            new_link['href'] = url
            new_link['target'] = "_blank"
            new_link.string = text
            td_dc_name.insert(1, new_link)
            tr.insert(2, td_dc_name)
        if record["Definition"] != "":
            td_def = soup.new_tag("td")
            td_def.string = str(record["Definition"])
            tr.insert(3, td_def)
        if record["Type"] != "":
            td_dc_type = soup.new_tag("td")
            # Strip any namespace prefix ("ns:Type" -> "Type").
            parts = str(record["Type"]).split(":")
            clean_type = parts[-1]
            # BUG FIX: removed a stale print(text) here -- it echoed the
            # Data Concept from the previous branch and raised NameError
            # whenever "Data Concept" was empty for the first record.
            td_dc_type.string = clean_type
            tr.insert(4, td_dc_type)
        soup.find('tbody').insert(1, tr)
    f = open("docs/developers/fixm-4.2.0-to-airm-1.0.0.html", "w+")
    f.write(soup.prettify())
    f.close()
def create_html_pages():
    """Generate one HTML detail page per FIXM information concept.

    For each concept returned by the project's ``fixm`` module (skipping the
    "missing data" placeholder), fills the concept-template with the class
    name/identifier/definition, a summary table of its data-concept traces,
    and a detail <div> per trace (semantic correspondences, additional
    traces, rationale, notes), then writes the page under
    ``docs/developers/fixm-4.2.0-to-airm-1.0.0/``.

    NOTE(review): relies on module-level helpers ``create_url``/``create_name``
    and on ``BeautifulSoup`` being imported at file level — not visible here.
    """
    import fixm
    import airm
    fixm = fixm.Fixm()
    airm = airm.Airm()
    fixm_info_concepts_dict = fixm.get_information_concepts()
    for info_concept in fixm_info_concepts_dict:
        if info_concept['Information Concept'] != "missing data":
            print(info_concept['Information Concept'])
            # Creates soup for the concept page using concept-template.html.
            html = open("data/html/templates/concept-template.html").read()
            soup = BeautifulSoup(html, "lxml")
            soup.title.string = str(
                info_concept['Information Concept']
            ) + " - FIXM 4.2.0 to AIRM 1.0.0 | AIRM.aero"
            # Replace the breadcrumb placeholder with the class name.
            soup.find(text="FIXM_CLASS_NAME_BC").replace_with(
                str(info_concept['Information Concept']))
            h2 = soup.new_tag("h2")
            h2.string = str(info_concept['Information Concept'])
            soup.find(id="INFO_CONCEPT_NAME").insert(0, h2)
            # Show the concept identifier (first two ':'-separated parts).
            code = soup.new_tag("code")
            datac_identifier = info_concept['Identifier']
            parts = datac_identifier.split(":")
            identifier = parts[0] + ":" + parts[1]
            code.string = identifier
            code["class"] = "text-secondary"
            soup.find(id="INFO_CONCEPT_NAME").insert(1, code)
            definition = fixm.get_fixm_class_definition(
                info_concept['Information Concept'])
            soup.find(text="FIXM_CLASS_DEFINITION").replace_with(
                str(definition))
            traces = fixm.get_traces_by_info_concept(
                info_concept['Information Concept'])
            # First pass: summary table (one row per trace).
            for trace in traces:
                print('\t' + trace['Data Concept'])
                tr = soup.new_tag("tr")
                if trace["Data Concept"] != "":
                    # Name cell links to the in-page anchor of the detail div.
                    td_dc_name = soup.new_tag("td")
                    url = "#" + trace["Data Concept"]
                    text = trace["Data Concept"]
                    new_link = soup.new_tag("a")
                    new_link['href'] = url
                    new_link.string = text
                    td_dc_name.insert(1, new_link)
                    tr.insert(1, td_dc_name)
                if trace["Definition"] != "":
                    td_def = soup.new_tag("td")
                    td_def.string = str(trace["Definition"])
                    tr.insert(2, td_def)
                if trace["Type"] != "":
                    td_type = soup.new_tag("td")
                    if trace["Type"] != "enum value":
                        # Non-enum types link to that type's own page.
                        parts = str(trace["Type"]).split(":")
                        clean_type = parts[-1]
                        url = clean_type + ".html"
                        text = clean_type
                        print(text)
                        new_link = soup.new_tag("a")
                        new_link['href'] = url
                        new_link['target'] = "_blank"
                        new_link.string = text
                        td_type.insert(1, new_link)
                    else:
                        td_type.string = str(trace["Type"])
                    tr.insert(3, td_type)
                soup.find(id="DATA_CONCEPTS_LIST").insert(1, tr)
            # Second pass: one boxed detail <div> per trace.
            for trace in traces:
                property_div = soup.new_tag("div")
                property_div[
                    "style"] = "border: 0.5px solid #b2b2b2;border-radius: 4px;box-shadow: 2px 2px #b2b2b2;padding: 15px;padding-bottom: 0px; margin-bottom: 30px"
                # Heading doubles as the anchor target for the summary table.
                h3 = soup.new_tag("h3")
                h3.string = str(trace["Data Concept"])
                h3["id"] = str(trace["Data Concept"])
                h3["style"] = "padding-top: 120px; margin-top: -120px;"
                property_div.insert(0, h3)
                code = soup.new_tag("code")
                identifier = trace['Identifier']
                code.string = identifier
                code["class"] = "text-secondary"
                property_div.insert(1, code)
                p = soup.new_tag("p")
                p.string = str(trace["Definition"])
                br = soup.new_tag("br")
                p.insert(2, br)
                property_div.insert(2, p)
                # "Type:" line — same link-vs-plain-text logic as the table.
                p = soup.new_tag("p")
                p.string = "Type: "
                span = soup.new_tag("span")
                if trace["Type"] != "enum value":
                    parts = str(trace["Type"]).split(":")
                    clean_type = parts[-1]
                    url = clean_type + ".html"
                    text = clean_type
                    print(text)
                    new_link = soup.new_tag("a")
                    new_link['href'] = url
                    new_link['target'] = "_blank"
                    new_link.string = text
                    span.insert(1, new_link)
                else:
                    span.string = str(trace["Type"])
                p.insert(2, span)
                property_div.insert(3, p)
                # Semantic Correspondence table.
                sc_h5 = soup.new_tag("h5")
                sc_h5.string = "Semantic Correspondence"
                sc_h5['style'] = "margin-top: 40px;"
                property_div.insert(4, sc_h5)
                sc_div = soup.new_tag("div")
                sc_div["class"] = "table-responsive"
                sc_table = soup.new_tag("table")
                sc_table["class"] = "table"
                sc_thead = soup.new_tag("thead")
                tr = soup.new_tag("tr")
                th = soup.new_tag("th")
                th.string = "AIRM Concept"
                tr.insert(1, th)
                th = soup.new_tag("th")
                th.string = "Definition"
                tr.insert(2, th)
                sc_thead.insert(1, tr)
                sc_table.insert(1, sc_thead)
                tbody = soup.new_tag("tbody")
                # One row per newline-separated correspondence URN.
                print('\t\tSemantic Corresponce:')
                sem_correspondences = str(
                    trace['Semantic Correspondence']).split('\n')
                for line in sem_correspondences:
                    print('\t\t\t' + line)
                    tr = soup.new_tag("tr")
                    td = soup.new_tag("td")
                    url = create_url(line)
                    text = create_name(line)
                    a = soup.new_tag("a")
                    a['href'] = url
                    a['target'] = "_blank"
                    a.string = text
                    a["data-toggle"] = "tooltip"
                    a["data-placement"] = "right"
                    a["title"] = line
                    td.insert(1, a)
                    tr.insert(1, td)
                    td = soup.new_tag("td")
                    airm_entry = airm.load_and_find_urn(line)
                    td.string = airm_entry["definition"]
                    tr.insert(2, td)
                    tbody.insert(1, tr)
                sc_table.insert(2, tbody)
                sc_div.insert(1, sc_table)
                property_div.insert(5, sc_div)
                # Optional "Additional Traces" table (same shape as above).
                add_correspondences = str(
                    trace['Additional Traces']).split('\n')
                if len(add_correspondences) > 0:
                    if add_correspondences[0] != "missing data":
                        h5 = soup.new_tag("h5")
                        h5.string = "Additional Traces"
                        property_div.insert(6, h5)
                        add_div = soup.new_tag("div")
                        add_div["class"] = "table-responsive"
                        add_table = soup.new_tag("table")
                        add_table["class"] = "table"
                        add_thead = soup.new_tag("thead")
                        tr = soup.new_tag("tr")
                        th = soup.new_tag("th")
                        th.string = "AIRM Concept"
                        tr.insert(1, th)
                        th = soup.new_tag("th")
                        th.string = "Definition"
                        tr.insert(2, th)
                        add_thead.insert(1, tr)
                        add_table.insert(1, add_thead)
                        tbody = soup.new_tag("tbody")
                        print('\t\tAdditional Traces:')
                        for line in add_correspondences:
                            print('\t\t\t' + line)
                            tr = soup.new_tag("tr")
                            td = soup.new_tag("td")
                            url = create_url(line)
                            text = create_name(line)
                            a = soup.new_tag("a")
                            a['href'] = url
                            a['target'] = "_blank"
                            a.string = text
                            a["data-toggle"] = "tooltip"
                            a["data-placement"] = "right"
                            a["title"] = line
                            td.insert(1, a)
                            tr.insert(1, td)
                            td = soup.new_tag("td")
                            airm_entry = airm.load_and_find_urn(line)
                            td.string = airm_entry["definition"]
                            tr.insert(2, td)
                            tbody.insert(1, tr)
                        add_table.insert(2, tbody)
                        add_div.insert(1, add_table)
                        property_div.insert(7, add_div)
                # Optional Rationale / Notes paragraphs.
                if str(trace["Rationale"]) != "missing data":
                    h5 = soup.new_tag("h5")
                    h5.string = "Rationale"
                    property_div.insert(8, h5)
                    p = soup.new_tag("p")
                    p.string = str(trace["Rationale"])
                    print('Rationale:' + str(trace["Rationale"]))
                    property_div.insert(9, p)
                if str(trace["Notes"]) != "missing data":
                    notes_h5 = soup.new_tag("h5")
                    notes_h5.string = "Notes"
                    property_div.insert(10, notes_h5)
                    p = soup.new_tag("p")
                    p.string = str(trace["Notes"])
                    print('NOTES:' + str(trace["Notes"]))
                    property_div.insert(11, p)
                # "Back to top" arrow, right-aligned at the bottom of the box.
                top_link_p = soup.new_tag("p")
                new_link = soup.new_tag("a")
                new_link['href'] = "#top"
                new_icon = soup.new_tag("i")
                new_icon['class'] = "fa fa-arrow-circle-up"
                new_icon["data-toggle"] = "tooltip"
                new_icon["data-placement"] = "left"
                new_icon["title"] = "Top of page"
                new_link.insert(1, new_icon)
                top_link_p.insert(1, new_link)
                top_link_p['class'] = "text-right"
                property_div.insert(12, top_link_p)
                soup.find(id="DATA_CONCEPTS_DETAIL").insert(1, property_div)
            # Write the finished page for this concept.
            f = open(
                "docs/developers/fixm-4.2.0-to-airm-1.0.0/" +
                str(info_concept['Information Concept']) + ".html", "w+")
            f.write(soup.prettify())
            f.close()
from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import time

# Render the AWS EC2 on-demand pricing page in a real browser and dump it,
# then print the text of every table <caption>.
url = 'https://aws.amazon.com/tw/ec2/pricing/on-demand/'
driver = webdriver.Chrome('chromedriver.exe')
driver.get(url)
pageSource = driver.page_source
# print(pageSource.encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding))
soup = BeautifulSoup(pageSource, 'lxml')
# NOTE(review): prettify(encoding='utf-8') returns bytes, so this prints the
# bytes repr (b'...'); presumably chosen to dodge a console UnicodeEncodeError
# on Windows (cf. the commented-out line above) — confirm before changing.
print(soup.prettify(encoding='utf-8'))
for caption in soup.find_all('caption'):
    print('caption', caption.get_text())
driver.quit()
stop_words[:10] #remove all stop_words from list words_ns = [] for word in words: if word not in stop_words: words_ns.append(word) page.status_code #anything satrting w/ a 2 is good page.content #loads HTML content from site soup = BeautifulSoup(page.content, 'html.parser') print(soup.prettify()) #formats everything in a semi readable format list(soup.children) [type(item) for item in list(soup.children)] #gets all text #h2 = heades, p = body text/equations, ol = bullets headers = soup.find_all('h2') bullets = soup.find_all('ol') text = soup.find_all('p') list(text.children) white_paper = pd.DataFrame(text) header_frame = pd.DataFrame(headers) text.dispersion_plot('bitcoin')
def test_soup():
    # Parse a local HTML file with the lxml parser and dump its pretty-printed
    # form. NOTE(review): Python 2 print statement; path is machine-specific.
    with open('D:\\work\\MyPYProject\\HTML\\1.html') as f:
        soup = BeautifulSoup(f, 'lxml')
        print soup.prettify()
# Formats raw html using soup.prettify() for better analysis.
# Reads each gradesheet, pretty-prints it, and writes it back in place.
from bs4 import BeautifulSoup

for i in range(1007):
    # Sheet ids run 190001..191000 for i < 1000, then 191075..191081 —
    # computed once here instead of the original's four duplicated branches.
    sheet_id = 190001 + i if i < 1000 else 190075 + i
    path = './Gradesheets/' + str(sheet_id) + '.html'
    try:
        # BUG FIX: the original open(...).read() / open(..., 'w') pairs were
        # never closed (leaked file handles); context managers close them.
        with open(path) as src:
            raw = src.read()
        soup = BeautifulSoup(raw, 'html.parser')
        with open(path, 'w') as dst:
            dst.write(soup.prettify())
        print(str(sheet_id) + ' successful!')
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt);
        # still best-effort so one bad sheet doesn't stop the batch.
        print(str(sheet_id) + ' failed!')
from bs4 import BeautifulSoup

# This text has already been decoded as UTF-8 — hence the encoding
# experiments below.
html = "<html><body><div>Olá Mundo</div></body></html>"
soup = BeautifulSoup(html, "html5lib")

# Show the encoding BeautifulSoup detected for the input.
print(soup.original_encoding)

# prettify() with no argument returns str.
print(soup.prettify())
print("\n")

# prettify() with an explicit encoding returns bytes.
print(soup.prettify("utf-8"))
print("\n")

# encode() on a single tag, with an explicit encoding.
print(soup.div.encode("utf-8"))
print("\n")
#soup = BeautifulSoup(data) #soup.prettify() #html=soup.get_text() urls = open('urls_extracted.txt','r') # db = sqlite3.connect("webScraping.db") # urls = ['https://www.ticketmaster.co.uk/member?tm_link=tm_homeA_header_name','http://www.ticketmaster.co.uk/'] y = '1' page_name='page' + y + '.txt' for Iurl in urls: try: r=requests.get(Iurl) data=r.content soup= BeautifulSoup(data) soup.prettify() #html=soup.get_text() words_list= [] for link in soup.find_all('p'): content = link.text words = content.lower().split() cleaned_words= re.sub("[^A-Za-z]+"," ",str(words)) words_list.append(cleaned_words) words_list.append(soup.find('title').text) # file = open ( "docs2\\\\" + page_name , 'w') file= open(join('docs',page_name),'w') file.write(str(words_list))
def authenticate(self):
    """Log in to 32P, scrape auth/user/pass keys, and collect notification feeds.

    Returns "disable" on a rejected login, a search result when
    self.searchterm is set, None on connection failure, or the collected
    feedinfo list (also stored in mylar.FEEDINFO_32P) on success.
    """
    feedinfo = []
    try:
        with requests.session() as s:
            # Honour the configured SSL-verification setting.
            if mylar.VERIFY_32P == 1 or mylar.VERIFY_32P == True:
                verify = True
            else:
                verify = False
            logger.fdebug('[32P] Verify SSL set to : ' + str(verify))
            if not verify:
                # 32P throws back an insecure warning because it can't validate
                # against the CA. The below suppresses the message just for 32P
                # instead of being displayed globally.
                from lib.requests.packages.urllib3.exceptions import InsecureRequestWarning
                requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

            # fetch the login page
            s.headers = self.headers
            try:
                s.get(self.url, verify=verify, timeout=30)
            except (requests.exceptions.SSLError, requests.exceptions.Timeout) as e:
                logger.error(self.module + ' Unable to establish connection to 32P: ' + str(e))
                return

            # post to the login form
            r = s.post(self.url, data=self.payload, verify=verify)
            # need a way to find response code (200=OK), but returns 200 for
            # everything even failed signons (returns a blank page)
            #logger.info('[32P] response: ' + str(r.content))
            soup = BeautifulSoup(r.content)
            soup.prettify()
            # check for invalid username/password and if it's invalid - disable
            # provider so we don't autoban (manual intervention is required after).
            chk_login = soup.find_all("form", {"id":"loginform"})
            for ck in chk_login:
                errorlog = ck.find("span", {"id":"formerror"})
                loginerror = " ".join(list(errorlog.stripped_strings)) #login_error.findNext(text=True)
                errornot = ck.find("span", {"class":"notice"})
                noticeerror = " ".join(list(errornot.stripped_strings)) #notice_error.findNext(text=True)
                logger.error(self.module + ' Error: ' + loginerror)
                if noticeerror:
                    logger.error(self.module + ' Warning: ' + noticeerror)
                logger.error(self.module + ' Disabling 32P provider until username/password can be corrected / verified.')
                return "disable"

            if not self.searchterm:
                logger.info('[32P] Successfully authenticated. Verifying authentication & passkeys for usage.')
            else:
                logger.info('[32P] Successfully authenticated. Initiating search for : ' + self.searchterm)
                return self.search32p(s)

            # Scan inline <script> blocks for the authkey/userid tokens: a
            # token word flags the state, the following non-'=' word is the value.
            all_script = soup.find_all("script", {"src": False})
            all_script2 = soup.find_all("link", {"rel": "alternate"})
            for ind_s in all_script:
                all_value = str(ind_s)
                all_items = all_value.split()
                auth_found = False
                user_found = False
                for al in all_items:
                    if al == 'authkey':
                        auth_found = True
                    elif auth_found == True and al != '=':
                        authkey = re.sub('["/;]', '', al).strip()
                        auth_found = False
                        logger.fdebug(self.module + ' Authkey found: ' + str(authkey))
                    if al == 'userid':
                        user_found = True
                    elif user_found == True and al != '=':
                        userid = re.sub('["/;]', '', al).strip()
                        user_found = False
                        logger.fdebug(self.module + ' Userid found: ' + str(userid))

            # Walk the alternate-feed <link>s: pull auth/passkey from the first
            # notify URL, then collect every named notification feed.
            authfound = False
            logger.info(self.module + ' Atttempting to integrate with all of your 32P Notification feeds.')
            for al in all_script2:
                alurl = al['href']
                if 'auth=' in alurl and 'torrents_notify' in alurl and not authfound:
                    f1 = alurl.find('auth=')
                    f2 = alurl.find('&', f1 + 1)
                    auth = alurl[f1 +5:f2]
                    logger.fdebug(self.module + ' Auth:' + str(auth))
                    authfound = True
                    p1 = alurl.find('passkey=')
                    p2 = alurl.find('&', p1 + 1)
                    passkey = alurl[p1 +8:p2]
                    logger.fdebug(self.module + ' Passkey:' + str(passkey))
                    if self.reauthenticate:
                        # Keys are all that's needed on a re-auth; skip feeds.
                        break
                if 'torrents_notify' in alurl and ('torrents_notify_' + str(passkey)) not in alurl:
                    notifyname_st = alurl.find('name=')
                    notifyname_en = alurl.find('&', notifyname_st +1)
                    if notifyname_en == -1:
                        notifyname_en = len(alurl)
                    notifyname = alurl[notifyname_st +5:notifyname_en]
                    notifynumber_st = alurl.find('torrents_notify_')
                    notifynumber_en = alurl.find('_', notifynumber_st +17)
                    notifynumber = alurl[notifynumber_st:notifynumber_en]
                    logger.fdebug(self.module + ' [NOTIFICATION: ' + str(notifyname) + '] Notification ID: ' + str(notifynumber))
                    #generate the rss-url here
                    feedinfo.append({'feed': notifynumber + '_' + str(passkey),
                                     'feedname': notifyname,
                                     'user': userid,
                                     'auth': auth,
                                     'passkey': passkey,
                                     'authkey': authkey})
    except (requests.exceptions.Timeout, EnvironmentError):
        logger.warn('Unable to retrieve information from 32Pages - either it is not responding/is down or something else is happening that is stopping me.')
        return

    # set the keys here that will be used to download.
    # NameError here means the scrape above never found the tokens.
    try:
        mylar.PASSKEY_32P = passkey
        mylar.AUTHKEY_32P = authkey  # probably not needed here.
        mylar.KEYS_32P = {}
        mylar.KEYS_32P = {"user": userid, "auth": auth, "passkey": passkey, "authkey": authkey}
    except NameError:
        logger.warn('Unable to retrieve information from 32Pages - either it is not responding/is down or something else is happening that is stopping me.')
        return

    if self.reauthenticate:
        return
    else:
        mylar.FEEDINFO_32P = feedinfo
        return feedinfo
def search(self, search_string, **kwargs):
    """
    Searches ehentai for the provided string or list of hashes,
    returns a dict with search_string:[list of title & url tuples] of
    hits found or empty dict if no hits are found.

    Keyword args: 'cookies' (session cookies dict), 'color' (if present,
    search by uploading the file at each given path instead of by hash).
    Returns the string 'error' when the HTTP response is rejected by
    self.handle_error.
    """
    assert isinstance(search_string, (str, list))
    if isinstance(search_string, str):
        search_string = [search_string]

    cookies = kwargs.pop('cookies', None)

    def no_hits_found_check(soup):
        "return true if hits are found"
        f_div = soup.body.find_all('div')
        for d in f_div:
            if 'No hits found' in d.text:
                return False
        return True

    found_galleries = {}
    log_i('Initiating hash search on ehentai')
    for h in search_string:
        log_d('Hash search: {}'.format(h))
        self.begin_lock()
        if 'color' in kwargs:
            # File-upload search: submit the file through the site's form
            # via the embedded browser.
            file_search = self.e_url_o + '?filesearch=1'
            if cookies:
                self.check_cookie(cookies)
                self._browser.session.cookies.update(self.COOKIES)
            self._browser.open(file_search)
            file_form = self._browser.get_forms()[1]
            f_obj = open(h, 'rb')
            file_form['sfile'].value = f_obj
            self._browser.submit_form(file_form)
            f_obj.close()
            soup = self._browser.parsed
        else:
            # Plain hash search over HTTP.
            hash_url = self.e_url_o + '?f_shash='
            hash_search = hash_url + h + '&fs_exp=1'  # to enable expunged.. maybe make this an option?
            if cookies:
                self.check_cookie(cookies)
                r = requests.get(hash_search, timeout=30, headers=self.HEADERS, cookies=self.COOKIES)
            else:
                r = requests.get(hash_search, timeout=30, headers=self.HEADERS)
            if not self.handle_error(r):
                return 'error'
            soup = BeautifulSoup(r.text, "html.parser")
        self.end_lock()

        if not no_hits_found_check(soup):
            log_e('No hits found with hash: {}'.format(h))
            continue

        log_i('Parsing html')
        try:
            if soup.body:
                found_galleries[h] = []
                # list view or grid view — the container tag decides which
                # per-gallery element class to collect.
                type = soup.find(attrs={'class': 'itg'}).name
                if type == 'div':
                    visible_galleries = soup.find_all(
                        'div', attrs={'class': 'id1'})
                elif type == 'table':
                    visible_galleries = soup.find_all(
                        'div', attrs={'class': 'it5'})
                log_i('Found {} visible galleries'.format(
                    len(visible_galleries)))
                for gallery in visible_galleries:
                    title = gallery.text
                    g_url = gallery.a.attrs['href']
                    found_galleries[h].append((title, g_url))
        except AttributeError:
            log.exception('Unparseable html')
            log_d("\n{}\n".format(soup.prettify()))
            continue

    if found_galleries:
        log_i('Found {} out of {} galleries'.format(
            len(found_galleries), len(search_string)))
        return found_galleries
    else:
        log_w('Could not find any galleries')
        return {}
# -*- coding:utf-8 -*- import urllib import re import xlwt from bs4 import BeautifulSoup import sys reload(sys) sys.setdefaultencoding('utf8') html = open('e:/test/index.html').read() soup = BeautifulSoup(html, "html.parser") text = soup.prettify() txt = open('e:/test/index.htmlll.txt', 'w') txt.write(text) txt.close() print 'ok'
# Course-notes script: each section demonstrates one requests/bs4 idiom.
import requests
url = 'https://wikipedia.org/'
r = requests.get(url)  # package, send, and catch in a single function
text = r.text

### Scraping the Web ###

# BeatifulSoup package
# parse and extract structured data from HTML
from bs4 import BeautifulSoup
import requests
url = 'https://www.crummy.com/software/BeautifulSoup/'
r = requests.get(url)
html_doc = r.text
# the prettified Soup is indented
soup = BeautifulSoup(html_doc)
pretty_soup = soup.prettify()
print(soup.title)
print(soup.get_text())
for link in soup.find_all('a'):
    print(link.get('href'))

# =============================================================================
# API
# =============================================================================
# Application Programming Interface: allows two software programs to communicate with each other
# OMDb: Open Movie Database API
# Tweepy: Twitter API
# JSONs: JavaScript Object Notation

### Import from local directory
import time
from matplotlib import pyplot as plt
from scipy.interpolate import griddata
import cv2
# from Adafruit_AMG88xx import Adafruit_AMG88xx
import requests
from bs4 import BeautifulSoup

# sensor = Adafruit_AMG88xx()
# num_requests = 20
# while(num_requests > 0):
# for i in range(1,20):

# Poll the device's embedded web server once and dump its XML payload.
r = requests.get("http://192.168.1.101/xml")
soup = BeautifulSoup(r.content)  # NOTE(review): no parser specified; bs4 warns and picks one
data_sensor = soup.prettify()
print("------------------------------------------")
print(data_sensor)
print("------------------------------------------")
# pass

# Access an instance of Configuration
# config = channel.config()

# Start sensor
active = True
while (1):
    if active == True:
        # Read pixels, convert them to values between 0 and 1, map them to an 8x8 grid
        # pixels = sensor.readPixels()
        # NOTE(review): the loop body beyond these comments is truncated in
        # this chunk — as written the `if` has no statements.
# Fragment: `headers`, `today`, and `url` are defined earlier in the original
# file (not visible in this chunk). Queries the Nasdaq Nordic trade feed for
# one Danish bond and keeps only primary/loan-payment OTC trades.
headers[
    'User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36"
data = {}
data['FromDate'] = "2020-09-01"
data['ToDate'] = today
data['SubSystem'] = "Prices"
data['Action'] = "GetTrades"
data['Exchange'] = "NMF"
data['ext_xslt'] = "/nordicV3/t_table_simple.xsl"
data['ext_xslt_lang'] = "en"
data['ext_xslt_tableId'] = "danish-bond-trade-history-table"
data['t__a'] = "1,2,4,6,7,8,18"
data[
    'Instrument'] = "XCSE0:5NDASDRO50"  # Need to change this param to change bonds
data['showall'] = "1"
data['app'] = "/bonds/denmark/microsite"
# Encode the query and fetch the XML response.
data = urllib.parse.urlencode(data)
full_url = url + "?" + data
response = requests.get(full_url, headers=headers)
soup = BeautifulSoup(response.content, 'xml')
# Let pandas parse the embedded HTML table out of the prettified markup.
raw_df = pd.read_html(soup.prettify())[0]
transactions = ["OTC-Primary Transaction", "OTC-Loan Payment"]
df = raw_df.loc[raw_df['Trade type'].isin(transactions)]
df.set_index('Time', inplace=True)
df = df[['Volume', 'Trade type']]
class Model():
    """Fetches one target page, saves a prettified copy, and extracts the
    configured content items into a list of dicts.

    ``target_contents`` is a list of {'selector', 'type', 'elem'} dicts;
    ``target_params`` is a list of single-key dicts merged into every
    extracted object.
    """

    def __init__(self, name, root_url, target_url, target_save, target_params, target_contents):
        self.name = name
        self.root_url = root_url
        self.target_url = target_url
        self.target_save = target_save
        self.target_params = target_params
        self.target_contents = target_contents
        self.soup = object
        self.content_list = []

    def run(self):
        """Scrape the target URL and return the extracted content list."""
        # Fetch the page over HTTP.
        res = requests.get(self.target_url)
        # Parse the document.
        self.soup = BeautifulSoup(res.content, 'lxml')
        # Save a prettified copy as an HTML file.
        with open('files/' + self.name + '.html', mode='w', encoding='utf-8') as fw:
            fw.write(self.soup.prettify())
        # Extract every configured selector.
        for target_content in self.target_contents:
            content = target_content['selector']
            elems = self.soup.select(content)
            # First (largest) match set: allocate one dict per element.
            if len(self.content_list) < len(elems):
                self.content_list = [{} for i in range(len(elems))]
            self.setContents(elems, target_content['type'], target_content['elem'])
        # Attach the static params to every extracted object.
        for i in range(len(self.target_params)):
            param = self.target_params[i]
            key = [key for key in param][0]
            for j in range(len(self.content_list)):
                self.content_list[j].update({key: param[key]})
        return self.content_list

    def setContents(self, elems, _type, prop):
        # Store each element's text (or a normalised attribute) under `prop`.
        for i in range(len(elems)):
            elem = elems[i].getText() if _type == "text" else self.replace(
                elems[i][_type])
            print(elem)
            self.content_list[i].update({prop: elem})

    def replace(self, item):
        """Return `item` as an absolute URL, prefixing root_url if relative."""
        # BUG FIX: the original pattern '^http(s)' only matched "https..."
        # URLs — '(s)' is a mandatory group, not optional — so absolute
        # "http://..." links were treated as relative and wrongly got
        # root_url prepended. 'https?' matches both schemes.
        pattern = '^https?'
        if not re.match(pattern, item):
            if not re.match('^/', item):
                item = '/' + item
            item = self.root_url + item
        return item
from selenium import webdriver
from bs4 import BeautifulSoup

# Render python.org with headless PhantomJS and pretty-print the DOM
# (Python 2 print statement; executable path is machine-specific).
driver = webdriver.PhantomJS(executable_path = r'C:\Users\deepti.waddin\Desktop\phython\phantomjs.exe')
driver.get('http://python.org')
html_doc = driver.page_source
soup = BeautifulSoup(html_doc, 'lxml')
print soup.prettify()
driver.quit()
# Replaces local pdf URL's with an external reference. import os import sys import csv from bs4 import BeautifulSoup count = 0 script_path = os.path.abspath(os.path.dirname(sys.argv[0])) infile_path = os.path.join(script_path, './URLReplaceTest.csv') with open(infile_path) as csvfile: reader = csv.reader(csvfile) for row in reader: html_file = row[0] orig_url = row[1] replace_url = row[2] html_file_path = os.path.abspath( os.path.join(script_path, *html_file.split('/'))) prettyHTML = None with open(html_file_path, mode='r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'html.parser') # Replacing URL target = soup.find('a', href=orig_url) if target: count += 1 print('replacing', count) replacement = target.replace_with(replace_url) prettyHTML = soup.prettify() if prettyHTML: with open(html_file_path, mode='w', encoding='utf-8') as of: of.write(prettyHTML)
from bs4 import BeautifulSoup
import urllib.request
import re

# Scrape post titles and links from the Brainfeeder front page.
url = 'http://www.brainfeedersite.com/'
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
html = response.read()
soup = BeautifulSoup(html, "lxml")
p = soup.prettify()

artist = []
title = []
url = []   # NOTE(review): rebinds `url` from the page address to a result list
pre = []
'''
for sl in soup.find_all(class_="slideshow"):
    for h2 in sl.find_all("h2"):
        pre.append(h2.text)
        for a in h2.find_all("a"):
            url.append(a['href'])
'''
# Collect every post title and href from the main loop container.
for pc in soup.find_all(class_='list clear', id='loop'):
    for h2 in pc.find_all('h2'):
        for a in h2.find_all('a'):
            title.append(a.text)
            url.append(a['href'])
from bs4 import BeautifulSoup
import urllib

# Fetch the AskReddit front page (a custom User-Agent avoids reddit blocking
# the default urllib agent) and walk the listing entries.
req = urllib.request.Request(url="https://www.reddit.com/r/AskReddit/",
                             headers={'User-agent': 'tester 0.2'})
page = urllib.request.urlopen(req).read()
soup = BeautifulSoup(page, "lxml")
conteudo = soup.prettify()
lista = soup.find("div", {"id": "siteTable"})
dados_reddit = {}
lista_dados = []
for l in lista:
    #print(l)
    score = l.find("div", {"class": "score likes"})
    # Entries without a score div are spacers/ads — skip them.
    if score is not None:
        subreddit = l["data-subreddit"]
        comments_link = l.find(
            "a", {"class": "bylink comments may-blank"})['data-href-url']
        thread_link = l.find("a", {"class": "bylink comments may-blank"})['href']
        title = l.a.text
        # '•' means the score is hidden; treat as zero.
        if score.text == '•':
            upvote = 0
        else:
            # NOTE(review): snippet is truncated here — the else branch body
            # (and the rest of the loop) is missing from this chunk.
from bs4 import BeautifulSoup
from urllib import request

# Fetch baidu.com and demonstrate basic soup tag accessors.
url = "http://www.baidu.com"
rsp = request.urlopen(url)
content = rsp.read()
soup = BeautifulSoup(content, 'lxml')  # bs decodes the response bytes automatically
content = soup.prettify()
print(content)
print("==" * 36)
print(soup.head)            # first <head> element
print("==" * 36)
print(soup.meta)            # first <meta> element
print("==" * 36)
print(soup.link)            # first <link> element
print("==" * 36)
print(soup.link.name)       # its tag name
print("==" * 36)
print(soup.link.attrs)      # its attribute dict
print("==" * 36)
print(soup.link.attrs['type'])
# Fragment: `html_files`, `messagebox`, and `OpenFile` are defined earlier in
# the original file (not visible in this chunk). Copies the #menu element of
# a chosen input file into every other .html file in the directory.
for file in os.listdir("."):
    if file.endswith(".html"):
        html_files.append(file)

# Find which is the input file:
messagebox.showinfo('File Input Helper', "Please select the file to copy from.")
in_file = OpenFile()

# Find the menu of input file
in_soup = BeautifulSoup(open(in_file), 'html.parser')
in_menu = in_soup.find(id="menu")

# Remove input file from the html_files thing -- working 3/25/19
# NOTE(review): removes by substring match while iterating the same list;
# safe only because at most one entry should match — confirm.
for file in html_files:
    if file in in_file:
        html_files.remove(file)

# Go through all files and update the menu
for out_file in html_files:
    # Open the file as soup
    out_soup = BeautifulSoup(open(out_file), 'html.parser')
    # Overwrite menu of the output file
    out_soup.find(id="menu").replace_with(in_menu)
    # Save the file
    with open(out_file, "w") as file:
        file.write(str(out_soup.prettify()))
def get_all_claims(criteria):
    """Crawl africacheck.org fact-check articles and return them as a DataFrame.

    Pages through the latest-reports listing collecting article URLs (bounded
    by criteria.maxClaims and filtered by criteria.avoid_url), then scrapes
    each article into a project ``Claim`` object.
    """
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # performing a search by each letter, and adding each article to a urls_ var.
    urls_ = {}
    last_page = []
    for page_number in range(1, 500):
        if 0 < criteria.maxClaims <= len(urls_):
            break
        url = "https://africacheck.org/latest-reports/page/" + str(
            page_number) + "/"
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
            links = soup.findAll("div", {"class": "article-content"})
            # Keep paging until the listing repeats or comes back empty.
            if (len(links) != 0) or (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', href=True)
                    ind_ = str(anchor['href'])
                    if ind_ not in list(urls_.keys()):
                        if 0 < criteria.maxClaims <= len(urls_):
                            break
                        if ind_ not in criteria.avoid_url:
                            urls_[ind_] = ind_
                            print("adding " + str(ind_))
                last_page = links
            else:
                print("break!")
                break
        except:
            # Best-effort crawl: a failed page is logged and skipped.
            print("error=>" + str(url))

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.items():
        print(
            str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " +
            str(url))
        index += 1
        url_complete = str(url)
        # print url_complete
        # try:
        page = requests.get(url_complete, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify("utf-8")
        claim_ = Claim()
        claim_.set_url(url_complete)
        claim_.set_source("africacheck")
        # title (strip any "| site name" suffix)
        title = soup.find("meta", {"property": "og:title"})
        title_content = title['content']
        if "|" in title_content:
            title_content = title_content.split("|")[-1]
        claim_.set_title(title_content)
        # date
        date_ = soup.find('time')
        # print date_["content"]
        if date_:
            date_str = search_dates(
                date_['datetime'].split(" ")[0])[0][1].strftime("%Y-%m-%d")
            # print date_str
            claim_.set_date(date_str)
            # print claim_.date
        # rating — later selectors override earlier ones when present.
        truth_rating = ""
        if soup.find("div", {"class": "verdict-stamp"}):
            truth_rating = soup.find("div", {
                "class": "verdict-stamp"
            }).get_text()
        if soup.find("div", {"class": "verdict"}):
            truth_rating = soup.find("div", {"class": "verdict"}).get_text()
        if soup.find("div", {"class": "indicator"}):
            truth_rating = soup.find("div", {"class": "indicator"}).get_text()
            if soup.find("div", {"class": "indicator"}).find('span'):
                truth_rating = soup.find("div", {
                    "class": "indicator"
                }).find('span').get_text()
        claim_.set_rating(
            str(re.sub('[^A-Za-z0-9 -]+', '', truth_rating)).lower().strip())
        # when there is no json
        date_ = soup.find("time", {"class": "datetime"})
        if date_:
            claim_.set_date(date_.get_text())
        # body
        body = soup.find("div", {"id": "main"})
        claim_.set_body(body.get_text())
        # author
        author = soup.find("div", {"class": "sharethefacts-speaker-name"})
        if author:
            claim_.set_author(author.get_text())
        # related links
        divTag = soup.find("div", {"id": "main"})
        related_links = []
        for link in divTag.findAll('a', href=True):
            related_links.append(link['href'])
        claim_.set_refered_links(related_links)
        # claim text — fall back to the article title when no report-claim div.
        if soup.find("div", {"class": "report-claim"}):
            claim_.set_claim(
                soup.find("div", {
                    "class": "report-claim"
                }).find("strong").get_text())
        else:
            claim_.set_claim(claim_.title)
        # tags
        tags = []
        for tag in soup.findAll('meta', {"property": "article:tag"}):
            tags.append(tag["content"])
        claim_.set_tags(", ".join(tags))
        claims.append(claim_.generate_dictionary())

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<b><!--Hey, buddy. Want to buy a used parser?--></b>
"""
# (The text above is the tail of the html_doc string literal opened earlier
# in the original file — it is data, not code.)
# Available parsers: html.parser, lxml, ["lxml", "xml"], xml, html5lib
soup = BeautifulSoup(html_doc, "html.parser")
# Output the document in the standard indented structure.
logging.info(soup.prettify())
'''几个浏览结构化数据的方法'''
logging.info(soup.title)
# INFO:root:<title>The Dormouse's story</title>
logging.info(soup.title.name)
# INFO:root:title
logging.info(soup.title.string)
logging.info(soup.title.text)
logging.info(soup.title.get_text())
# INFO:root:The Dormouse's story
logging.info(soup.title.parent.name)
# INFO:root:head
    return obj.__dict__  # NOTE(review): tail of a function whose def line is outside this chunk

# Disabled branch: re-fetch the listing page instead of using the cached copy.
if False:
    contents = do_request(
        "https://homezz.ro/anunturi_apartamente_de-vanzare_in-timisoara-tm.html"
    )
    with open('result_homezz.html', 'wb') as file:
        file.write(contents)

with open('result_homezz.html', 'r', encoding='utf-8') as file:
    contents = file.read()

soup = BeautifulSoup(contents, 'html.parser')
pretty_html = soup.prettify()
# Each listing is an <a class="main_items"> anchor.
offerNodes = soup.body.find_all('a', attrs={'class': 'main_items'})
i = 0
offers = []
for offerNode in offerNodes:
    price = offerNode.find('span', attrs={
        "class": "price"
    }).text.strip().replace("€", "EUR")
    titleNode = offerNode.find('span', attrs={'class': 'title'})
    title = titleNode.text.strip()
    url = offerNode['href']
    imgNode = offerNode.find('div', attrs={'class': 'overflow_image'})
    if imgNode.img:
        # NOTE(review): snippet is truncated here — the body of this if is
        # missing from this chunk.
- To view a specific part: "Inspect Element"
- Safari users: Safari menu, Preferences, Advanced, Show Develop menu in menu bar
- Let's inspect example.html
'''
# (The text above is the tail of a tutorial docstring opened earlier in the
# original file. The code below is Python 2 — note the print statements.)

# read the HTML code for a web page and save as a string
with open('example.html', 'rU') as f:
    html = f.read()

# convert HTML into a structured Soup object
from bs4 import BeautifulSoup
b = BeautifulSoup(html)

# print out the object
print b
print b.prettify()

# 'find' method returns the first matching Tag (and everything inside of it)
b.find(name='body')
b.find(name='h1')

# Tags allow you to access the 'inside text'
b.find(name='h1').text

# Tags also allow you to access their attributes
b.find(name='h1')['id']

# 'find_all' method is useful for finding all matching Tags
b.find(name='p')       # returns a Tag
b.find_all(name='p')   # returns a ResultSet (like a list of Tags)
import requests
from bs4 import BeautifulSoup

# Scrape the product title from a Bed Bath & Beyond product page.
URL = 'https://www.bedbathandbeyond.com/store/product/breville-reg-the-barista-express-trade-espresso-machine/3244573?opbthead=true&ta=typeahead&keyword=breville-espresso'
headers = {
    "User-Agent":
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
page = requests.get(URL, headers=headers)
soup1 = BeautifulSoup(page.content, "html.parser")
# Re-parse the prettified markup (normalises the raw page source).
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
title = soup2.find(id="productTitle")
converted_price = 500
# BUG FIX: `title` is a bs4 Tag (or None) — Tag has no .strip(), so the
# original print(title.strip()) raised AttributeError. Extract the text with
# get_text(strip=True), and guard against the element being absent.
if title is not None:
    print(title.get_text(strip=True))
else:
    print('productTitle element not found')