Example #1
import requests
import pandas as pd
from bs4 import BeautifulSoup


def get_price(code_number, headers):
    """
    Loop over the years and collect daily prices for one stock code.
    """
    dfs = []
    years = range(2000, 2021)
    for y in years:
        try:
            url = "https://kabuoji3.com/stock/{}/{}/".format(code_number, y)
            soup = BeautifulSoup(
                requests.get(url, headers=headers).content, "html.parser")
            tag_tr = soup.find_all("tr")
            # the first row holds the column headers, the rest hold the data
            head = [h.text for h in tag_tr[0].find_all("th")]
            data = []
            for i in range(1, len(tag_tr)):
                data.append([d.text for d in tag_tr[i].find_all("td")])
            df = pd.DataFrame(data, columns=head)

            # numeric columns: open, high, low, close, volume, adjusted close
            col = ["始値", "高値", "安値", "終値", "出来高", "終値調整"]
            for c in col:
                df[c] = df[c].astype(float)
            dfs.append(df)
        except IndexError:
            # years with no data render no table rows; skip them
            pass
    data = pd.concat(dfs, axis=0)
    data = data.reset_index(drop=True)

    return data
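A minimal usage sketch, assuming a plain browser-style User-Agent header is enough for the site; the stock code and header value below are illustrative:

# Hypothetical usage: fetch daily prices for one stock code.
headers = {"User-Agent": "Mozilla/5.0"}  # illustrative header
prices = get_price(7203, headers)        # illustrative code number
print(prices.head())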
Example #2
import re
import requests
from bs4 import BeautifulSoup


def book_spider(book_tag):
    global file_content

    url = "http://bbs.csdn.net/topics/310046216"
    source_code = requests.get(url)
    # just fetch the page, no headers or anything
    plain_text = source_code.text
    # BeautifulSoup objects can be searched easily
    soup = BeautifulSoup(plain_text, 'html.parser')
    title_divide = '\n' + '--' * 30 + '\n' + '--' * 30 + '\n'
    file_content += title_divide + '\t' * 4 + \
            book_tag + ':' + title_divide
    count = 1
    # the selectors below expect Douban-style book listing markup
    for book_info in soup.findAll('div', {'class': 'info'}):
        title = book_info.findAll('a', {
            'onclick': re.compile(r"\"moreurl(.+)")})[0].get('title')

        pub = book_info.findAll('div', {'class': 'pub'})[0].string.strip()
        rating = book_info.findAll('span', {
            'class': 'rating_nums'})[0].string.strip()
        people_num = book_info.findAll('span', {
            'class': 'pl'})[0].string.strip()
        # "评分" means "rating"
        file_content += "*%d\t《%s》\t评分:%s%s\n\t%s\n\n" % (
                count, title, rating, people_num, pub)
        count += 1
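The function appends to a module-level file_content string, so a caller has to initialize that global first. A small sketch, with an illustrative tag and output filename:

# Hypothetical usage: seed the global, run the spider, save the result.
file_content = ""
book_spider("Python")  # illustrative tag
with open("books.txt", "w", encoding="utf-8") as f:
    f.write(file_content)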
Example #3
import requests
from bs4 import BeautifulSoup


def get_single_book_data(book_url):
    source_code = requests.get(book_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    # print the numeric rating inside the rating block
    for rating in soup.findAll('p', {'class': 'rating_self clearfix'}):
        print(rating.strong.string)
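The class names parsed here look like Douban book-page markup; a hypothetical call (the URL is illustrative, not taken from the original):

# Hypothetical usage: print the average rating of one book page.
get_single_book_data("https://book.douban.com/subject/1084336/")  # illustrative URL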
Example #4
import re
import urllib.request
from bs4 import BeautifulSoup


# pattern, get_home_page, login, and execute_opener are defined elsewhere
def fetch_tv_info(username, password):
    match = re.search(pattern, get_home_page())
    challenge = ""
    if match is not None:
        # get the login challenge token from the home page
        challenge = match.group(2)
        print("challenge string: " + challenge)
        # now log in
        response = login(username, password, challenge)
        content = response.read()

        # open the TV page
        opener = urllib.request.build_opener()
        response = execute_opener(opener, "http://iptv.bg/watch")
        content = response.read()

        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        tvTags = soup.findAll(name='li', attrs={'class': 'listmode_tv'})

        tv_info = []

        for tag in tvTags:
            name = tag.find(name='div', attrs={'class': 'tv_info'}).find(name='b').getText()
            logo = tag.find(name='img').get('src', default='')
            url = tag.findAll(
                name='div',
                attrs={'class': 'noprint player_soft'})[-1].find(name='a').get('href', default='')
            info_tag = tag.find(name='div', attrs={'class': 'tv_info'})
            info = ''
            thumbnail = ''
            if info_tag is not None:
                thumbnail = info_tag.find(name='img').get('src', default='')
                detail_tag = info_tag.find(name='em').find(name='abbr')
                if detail_tag is not None:
                    info = detail_tag.get('title', default='Unknown')

            tv_info += [{'name': name, 'logo': logo, 'path': url, 'thumbnail': thumbnail, 'info': info}]

        return tv_info
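The function relies on several helpers (pattern, get_home_page, login, execute_opener) that are defined elsewhere and not shown. A minimal sketch of what execute_opener might look like, assuming it simply opens a URL through the given opener:

# Hypothetical helper: open a URL with a pre-built opener and return the response.
def execute_opener(opener, url):
    return opener.open(url)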
Example #5
    def geturl(self, webpage, key=None):
        global dlLinksNext

        try:
            # pages arrive as GBK-encoded bytes; decode before parsing
            webpage = webpage.decode('gbk')
            soup = BeautifulSoup(webpage, 'html.parser')
            tagA = soup.findAll('a')

            # queue every link, or only those whose markup contains the key
            for link in tagA:
                if not key:
                    dlLinksNext.put(link.get('href'))
                elif key in str(link):
                    dlLinksNext.put(link.get('href'))

        except UnicodeDecodeError:
            error = 'UnicodeDecodeError'
            self.loger.logInfo(error)
        except UnicodeEncodeError:
            error = 'UnicodeEncodeError'
            self.loger.logInfo(error)
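dlLinksNext is a module-level queue shared by the crawler; a minimal sketch of the setup the method assumes:

# Hypothetical setup: the global queue that geturl feeds with discovered links.
from queue import Queue

dlLinksNext = Queue()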
Example #6
import requests
from bs4 import BeautifulSoup

url = "https://www.yelp.com/sf"

yelp_r = requests.get(url)

print(yelp_r.status_code)  #should be 200

yelp_soup = BeautifulSoup(yelp_r.text, 'html.parser')

print(yelp_soup.prettify())

print(yelp_soup.findAll('a'))

for link in yelp_soup.findAll('a'):
    print(link)
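To pull just the link targets instead of whole tags, one option is to keep only anchors that actually carry an href; a small sketch:

# Print only the href attribute of anchors that have one.
for link in yelp_soup.find_all('a', href=True):
    print(link['href'])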
Example #7
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt

raw_html = requests.get('https://nl.wikipedia.org/wiki/Regering-Jambon').text
html = BeautifulSoup(raw_html, 'html.parser')

leden = []
views = []
for a in html.select('table.wikitable tr td:nth-child(2)'):
    # strip whitespace first, then join the name parts with underscores
    leden.append(a.text.strip().replace(' ', '_'))

for lid in leden:
    r = requests.get(
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/nl.wikipedia/all-access/all-agents/{}/daily/20191001/2019103100"
        .format(lid))
    data = r.json()
    count = 0
    for item in data['items']:
        count += item['views']
    views.append(count)

plt.style.use('seaborn-poster')
plt.title('Wikipedia page visits in Oct.')
plt.xlabel('Flemish minister')
plt.ylabel('Visits')

# one bar per minister, labeled with the scraped names
plt.bar([lid.replace('_', ' ') for lid in leden], views)
plt.show()
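The pageviews endpoint returns a JSON object with an 'items' list, which can be absent for articles without data; a slightly more defensive variant of the accumulation loop, as a sketch:

# Hypothetical helper: total views for one article, 0 if the API returns no items.
def monthly_views(lid):
    r = requests.get(
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/nl.wikipedia/all-access/all-agents/{}/daily/20191001/2019103100"
        .format(lid))
    return sum(item['views'] for item in r.json().get('items', []))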
Example #8
import csv
import requests
from bs4 import BeautifulSoup


class WebScrape:
    def __init__(self):
        print("WebScrape Imported")

    def lazada_scrape(self, head, category, url):
        list_of_rows = []

    url = "http://www.lazada.com.ph/" + url + "/"
    source_code = requests.get(url)
    txt = source_code.text
    soup = BeautifulSoup(txt, 'html.parser')
    max_page = int(soup.select("span.pages > a:nth-of-type(6)")[0].get_text())
    page = 1
    myfile = open(category + ".csv", 'w', newline='')
    writer = csv.DictWriter(myfile,
                            fieldnames=[
                                "url", "product_name", "product_header",
                                "product_category", "product_price",
                                "product_sale", "product_old", "installment",
                                "rating"
                            ],
                            delimiter=',')
    writer.writeheader()
    while page <= max_page:
        print(page)
        url = "http://www.lazada.com.ph/shop-mobiles/?page=" + str(page)
        source_code = requests.get(url)
        txt = source_code.text
        soup = BeautifulSoup(txt, 'html.parser')
        for div in soup.find_all("div", {"class": "product-card"}):
            mylist = []

            for link in div.find_all("a"):
                mylist.append(str(link.get("href")))
            for title in div.find_all("span", {"class": "product-card__name"}):
                mylist.append(
                    str(title.text).replace("\u200f", " ").replace(
                        "\uFF08", "(").replace("\uff09", ")"))
                mylist.append(head)
                mylist.append(category)
            for price in div.find_all("div", {"class": "product-card__price"}):
                mylist.append(str(price.text.replace("\u20B1", "Php ")))

            sale = div.find_all("div", {"class": "product-card__sale"})
            if not sale:
                mylist.append("0%")
            else:
                for sales in sale:
                    mylist.append(str(sales.text))

            old = div.find_all("div", {"class": "old-price-wrap"})
            if not old:
                mylist.append("Php 0.00")
            else:
                for olds in old:
                    mylist.append(
                        str(olds.text).replace("\u20B1",
                                               "Php ").replace("\n", ""))

            installment = div.find_all("span", {"class": "installment-part"})
            if not installment:
                mylist.append("Php 0.00")
            else:
                for installments in installment:
                    mylist.append(
                        str(installments.text).replace("\u20B1", "Php "))

            rating = div.find_all("span", {"class": "rating__number"})
            if not rating:
                mylist.append("(0 reviews)")
            else:
                for ratings in rating:
                    mylist.append(str(ratings.text))

            list_of_rows.append(mylist)
        page += 1
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(list_of_rows)
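A hypothetical invocation; the header label, category, and URL path segment are illustrative:

# Hypothetical usage: scrape one Lazada category into mobiles.csv.
scraper = WebScrape()
scraper.lazada_scrape("Mobiles", "mobiles", "shop-mobiles")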