Example #1
import requests
import pandas as pd
from bs4 import BeautifulSoup


def get_price(code_number, headers):
    """
    Loop over the years and collect one price table per year.
    """
    dfs = []
    years = range(2000, 2021)
    for y in years:
        try:
            url = "https://kabuoji3.com/stock/{}/{}/".format(code_number, y)
            soup = BeautifulSoup(
                requests.get(url, headers=headers).content, "html.parser")
            tag_tr = soup.find_all("tr")
            head = [h.text for h in tag_tr[0].find_all("th")]
            data = []
            for i in range(1, len(tag_tr)):
                data.append([d.text for d in tag_tr[i].find_all("td")])
            df = pd.DataFrame(data, columns=head)

            # Numeric columns: Open, High, Low, Close, Volume, Adjusted Close
            col = ["始値", "高値", "安値", "終値", "出来高", "終値調整"]
            for c in col:
                df[c] = df[c].astype(float)
            dfs.append(df)
        except IndexError:
            # No data table for this year (e.g. before listing); skip it
            pass
    data = pd.concat(dfs, axis=0)
    data = data.reset_index(drop=True)

    return data
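
A minimal usage sketch, assuming kabuoji3.com still serves yearly tables at this URL pattern; the securities code 7203 and the User-Agent value are placeholders:

if __name__ == "__main__":
    headers = {"User-Agent": "Mozilla/5.0"}  # placeholder; the site may reject requests without one
    prices = get_price(7203, headers)        # 7203: sample securities code
    print(prices.head())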
Example #2
    def geturl(self, webpage, key=None):
        global dlLinksNext

        try:
            # Decode the GBK-encoded page and re-encode it as UTF-8 (Python 2)
            webpage = unicode(webpage, 'gbk').encode('utf-8')
            soup = BeautifulSoup(webpage)
            tagA = soup.findAll('a')

            # Queue every link, or only links whose markup contains the key
            for link in tagA:
                if not key or key in str(link):
                    dlLinksNext.put(link.get('href'))

        except UnicodeDecodeError:
            self.loger.logInfo('UnicodeDecodeError')
        except UnicodeEncodeError:
            self.loger.logInfo('UnicodeEncodeError')
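
For reference, a Python 3 sketch of the same routine; dlLinksNext is assumed to be a queue.Queue shared by the crawler, and geturl_py3 is a hypothetical standalone version of the method:

from queue import Queue
from bs4 import BeautifulSoup

dlLinksNext = Queue()


def geturl_py3(webpage_bytes, key=None):
    # bytes.decode() replaces the Python 2 unicode()/encode() round-trip
    try:
        html = webpage_bytes.decode('gbk')
    except UnicodeDecodeError:
        return
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        if not key or key in str(link):
            dlLinksNext.put(link.get('href'))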
Example #3
import requests
from bs4 import BeautifulSoup

url = "https://www.yelp.com/sf"

yelp_r = requests.get(url)

print(yelp_r.status_code)  # 200 indicates success

yelp_soup = BeautifulSoup(yelp_r.text, 'html.parser')

print(yelp_soup.prettify())

print(yelp_soup.find_all('a'))

for link in yelp_soup.find_all('a'):
    print(link)
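
Printing whole <a> tags is noisy; a small follow-up sketch that pulls just the href attributes (anchors without one are skipped):

for link in yelp_soup.find_all('a'):
    href = link.get('href')
    if href:  # some anchors carry no href attribute
        print(href)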
Example #4
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt

raw_html = requests.get('https://nl.wikipedia.org/wiki/Regering-Jambon').text
html = BeautifulSoup(raw_html, 'html.parser')

leden = []
views = []
# The second cell of each wikitable row holds the member's name;
# underscores make the names usable as Wikipedia page titles
for a in html.select('table.wikitable tr td:nth-child(2)'):
    leden.append(a.text.strip().replace(' ', '_'))

for lid in leden:
    # Wikimedia pageviews REST API: daily view counts for October 2019
    r = requests.get(
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/nl.wikipedia/all-access/all-agents/{}/daily/20191001/2019103100"
        .format(lid))
    data = r.json()
    count = 0
    for item in data['items']:
        count += item['views']
    views.append(count)

plt.style.use('seaborn-poster')
plt.title('Wikipedia page visits in Oct.')
plt.xlabel('Flemish minister')
plt.ylabel('Visits')

plt.bar([
    'Jambon', 'Crevits', 'Somers', 'Weyts', 'Demir', 'Beke', 'Diependaele',
    'Peeters', 'Dalle'
], views)
plt.show()
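
Hardcoding the surnames is brittle; a sketch that derives the labels from the scraped leden list instead, assuming it lines up one-to-one with views:

labels = [lid.replace('_', ' ').split()[-1] for lid in leden]  # keep the surname only
plt.bar(labels, views)
plt.show()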
Example #5
import csv

import requests
from bs4 import BeautifulSoup


class WebScrape:
    def __init__(self):
        print("WebScrape Imported")

    def lazada_scrape(self, head, category, url):
        list_of_rows = []

        # Build the category URL once; individual pages are fetched below
        base_url = "http://www.lazada.com.ph/" + url + "/"
        source_code = requests.get(base_url)
        txt = source_code.text
        soup = BeautifulSoup(txt, 'html.parser')
        # The sixth pager link holds the last page number
        max_page = int(soup.select("span.pages > a:nth-of-type(6)")[0].get_text())
        page = 1
        myfile = open(category + ".csv", 'w', newline='')
        writer = csv.DictWriter(myfile,
                                fieldnames=[
                                    "url", "product_name", "product_header",
                                    "product_category", "product_price",
                                    "product_sale", "product_old", "installment",
                                    "rating"
                                ],
                                delimiter=',')
        writer.writeheader()
        while page <= max_page:
            print(page)
            page_url = base_url + "?page=" + str(page)
            source_code = requests.get(page_url)
            txt = source_code.text
            soup = BeautifulSoup(txt, 'html.parser')
            for div in soup.find_all("div", {"class": "product-card"}):
                mylist = []

                for link in div.find_all("a"):
                    mylist.append(str(link.get("href")))
                for title in div.find_all("span", {"class": "product-card__name"}):
                    mylist.append(
                        str(title.text).replace("\u200f", " ").replace(
                            "\uFF08", "(").replace("\uff09", ")"))
                    mylist.append(head)
                    mylist.append(category)
                for price in div.find_all("div", {"class": "product-card__price"}):
                    mylist.append(str(price.text.replace("\u20B1", "Php ")))

                # Fall back to sensible defaults when a card lacks a field
                sale = div.find_all("div", {"class": "product-card__sale"})
                if not sale:
                    mylist.append("0%")
                else:
                    for sales in sale:
                        mylist.append(str(sales.text))

                old = div.find_all("div", {"class": "old-price-wrap"})
                if not old:
                    mylist.append("Php 0.00")
                else:
                    for olds in old:
                        mylist.append(
                            str(olds.text).replace("\u20B1",
                                                   "Php ").replace("\n", ""))

                installment = div.find_all("span", {"class": "installment-part"})
                if not installment:
                    mylist.append("Php 0.00")
                else:
                    for installments in installment:
                        mylist.append(
                            str(installments.text).replace("\u20B1", "Php "))

                rating = div.find_all("span", {"class": "rating__number"})
                if not rating:
                    mylist.append("(0 reviews)")
                else:
                    for ratings in rating:
                        mylist.append(str(ratings.text))

                list_of_rows.append(mylist)
            page += 1
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(list_of_rows)
        myfile.close()
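
A minimal usage sketch; the arguments are placeholders, and the selectors above target Lazada's older server-rendered markup, so the modern JavaScript-rendered site will likely need a different approach:

if __name__ == "__main__":
    scraper = WebScrape()
    # head, category, and url path are illustrative values
    scraper.lazada_scrape("Mobiles", "mobiles", "shop-mobiles")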