def example1():
    '''Two basic ways to find tags:
    1. find
    2. findAll (findAll is the py2 name; under py3's new naming convention it was
       renamed to find_all, but the original findAll still works.
       The two methods behave exactly the same.)
    '''
    url = 'http://www.cvs.com/stores/cvs-pharmacy-locations'
    spider = Crawler()
    html = spider.html(url)
    if html:
        soup = BS4(html)
        # print(soup.find('li'))      # just find the first one
        # print(soup.find_all('li'))  # find a list of tags matching your search
        print(soup.find_all('li', limit=3))  # find the first 3 tags matching your search
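# A minimal, self-contained sketch of the find vs. find_all distinction described above,
# run against an inline HTML string instead of a live URL. The function name and the
# sample markup are illustrative assumptions, not part of the original code.
from bs4 import BeautifulSoup as BS4

def example1_offline():
    html = "<ul><li>a</li><li>b</li><li>c</li><li>d</li></ul>"
    soup = BS4(html, "html.parser")
    print(soup.find('li'))               # first match only: <li>a</li>
    print(soup.find_all('li'))           # every match, returned as a list
    print(soup.find_all('li', limit=3))  # at most the first 3 matches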
def parse_cost_from_html(html):
    """Get the average bill cost value from html."""
    try:
        soup = BS4(html)
        article = soup.find("article", class_="sub_details")
        h3 = article.find("h3")
        strong = h3.find("strong")
        text = strong.text.strip()
        text = text.replace("$", "")
        text = text[:-1]  # drop the trailing character after the amount
        return float(text)
    except Exception:  # any parsing failure (missing tag, non-numeric text) yields None
        return None
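# A hypothetical usage sketch for parse_cost_from_html. The markup below is an
# assumption about what the scraped page looks like (an <article class="sub_details">
# whose <h3><strong> holds something like "$123.45+"), purely for illustration.
sample_html = (
    '<article class="sub_details">'
    '<h3>Average bill: <strong>$123.45+</strong></h3>'
    '</article>'
)
# parse_cost_from_html(sample_html) would return 123.45 under that assumption.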
def parser_step_1() -> list:
    '''Initial parsing step: get the list of animals from the first page.'''
    req = requests.get(
        'https://ru.wikipedia.org/wiki/%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:'
        '%D0%96%D0%B8%D0%B2%D0%BE%D1%82%D0%BD%D1%8B%D0%B5_%D0%BF%D0%BE_'
        '%D0%B0%D0%BB%D1%84%D0%B0%D0%B2%D0%B8%D1%82%D1%83')
    html = req.text
    parser = BS4(html, "html.parser")
    elements = parser.select("ul")
    list_animals = elements[2].text.split('\n')
    while True:
        list_animals += parser_step_2(list_animals)
        if 'Ящурки' == list_animals[-1]:
            break
    return list_animals
def parser_step_2(list_animals) -> list:
    '''Secondary parsing step: fetch the remaining pages of animals.'''
    element = list_animals[-1].split()
    result = '+'.join(element)
    req = requests.get(
        f"https://ru.wikipedia.org/w/index.php?title=Категория:Животные_по_алфавиту&pagefrom={result}"
    )
    html = req.text
    parser = BS4(html, "html.parser")
    elements = parser.select("ul")
    tmp_list_animals = elements[2].text.split('\n')
    if 'ЖивотныеОрганизмы по алфавиту' != elements[3].text:
        return tmp_list_animals + elements[3].text.split('\n')
    return tmp_list_animals
def property_detail_by_street_and_zipcode(driver, street, zipcode):
    """Enter text in an input box and fetch the resulting property page."""
    driver.get("http://www.zillow.com/")                # open the page again
    elem = driver.find_element_by_name("citystatezip")  # locate the search box named "citystatezip"
    elem.send_keys("%s %s" % (street, zipcode))         # type the query into the text box
    elem.send_keys(Keys.RETURN)                         # submit it (equivalent to clicking "search")
    html = driver.page_source
    soup = BS4(html)
    a = soup.find("a", class_="routable mask hdp-link")
    url = "http://www.zillow.com" + a["href"]
    driver.get(url)
    html = driver.page_source
    return html
def parse(response):
    base_domain = response.url
    r = response.content
    soup = BS4(r, 'lxml')
    li_list = soup.select('#p_left ul')[0].find_all('li')
    for li in li_list:
        li_tag = li.a.text   # extracted but not used further in this snippet
        content = li.p.text  # extracted but not used further in this snippet
    a_list = soup.select('#p_left ul')[1].find_all('a')
    page_format = a_list[-1]['href'][:-5].split('_')
    page_last, page_sec = page_format[-1], page_format[1]
    for i in range(1, int(page_last) + 1):
        link = base_domain + 'list_' + page_sec + '_' + str(i) + '.html'
        link_set.add(link)  # link_set is assumed to be defined at module level
    for link in link_set:
        parse_set(link)
def lostfilm_tv_scanner(url):  # done 20190211
    html = gethtml(url.get(), True)
    soup = BS4(html, "html5lib")
    webname = soup.find("div", class_='title-ru').text or "parsing_error"
    webname += " / " + (soup.find("div", class_='title-en').text or "parsing_error")
    website = "lostfilm.tv"
    textready = soup.find("div", class_="details").text or "parsing_error"
    text = textready.split("серий: ", 1)[1].split(" ")[0]
    webready = safeint(text) or -1
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def crawel_wds_url():
    url = "https://wds.modian.com/ajax/comment_list"
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    payload = {'page': '1', 'post_id': '17190', 'pro_id': '8098'}
    r1 = requests.post(url, data=payload, headers=headers)
    raw_text = r1.text
    html_doc_1 = str(json.loads(raw_text)['data'])
    soup = BS4(html_doc_1, "html.parser")
    return soup
def green_tea_scanner(url):  # done 20190211
    html = gethtml2(url)
    # html = gethtml(url, True)
    soup = BS4(html, "html5lib")
    webname = soup.find("meta", itemprop="name")["content"] or "parsing_error"
    website = "green-teatv.com"
    webready = -1
    for div in soup.find_all("div", class_="info-label"):
        if div.text == "Длительность:":
            webready = safeint(div.findNext("div", class_="info-desc").text.split("из")[0]) or -1
            break
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def parse_overview_data(self):
    data = []
    soup = BS4(self.page_source, "html.parser")
    if soup.find('h1', {'class': ' strong tightAll'}):
        name = soup.find('h1', {'class': ' strong tightAll'}).text.strip()
        overview = soup.find('div', {'class': 'info flexbox row col-hh'})
        infos = overview.find_all('div', {'class': 'infoEntity'})
        website = ''
        hq = ''
        size = ''
        founded = ''
        type = ''  # initialized like the other fields so the dict below never sees an unbound name
        industry = ''
        revenue = ''
        for info in infos:
            if 'Website' in info.text:
                website = info.find('a', {'class': 'link'}).text.strip()
            elif 'Headquarters' in info.text:
                hq = info.find('span', {'class': 'value'}).text.strip()
            elif 'Size' in info.text:
                size = info.find('span', {'class': 'value'}).text.strip()
            elif 'Founded' in info.text:
                founded = info.find('span', {'class': 'value'}).text.strip()
            elif 'Type' in info.text:
                type = info.find('span', {'class': 'value'}).text.strip()
            elif 'Industry' in info.text:
                industry = info.find('span', {'class': 'value'}).text.strip()
            elif 'Revenue' in info.text:
                revenue = info.find('span', {'class': 'value'}).text.strip()
        # print("*** print info over")
        review = {
            'Name': name,
            'Website': website,
            'Headquarters': hq,
            'Size': size,
            'Founded': founded,
            'Type': type,
            'Industry': industry,
            'Revenue': revenue
        }
        data.append(review)
    return data
def __init__(self, registry: dict):
    """Virtually private constructor."""
    if ServiceNowSession.instance is not None:
        raise Exception("This is a singleton object")
    else:
        session = requests.session()
        request = session.get(ServiceNowSession.__snow_url)
        auth_payload = registry
        request_1 = session.post(ServiceNowSession.__login_url, data=auth_payload)
        soup = BS4(request_1.text, 'html.parser')
        if 'Establishing session' in soup.find_all("h1", class_="loading-message")[0].text:
            ServiceNowSession.instance = session
        else:
            ServiceNowSession.instance = None
def zillow_property_detail(address, zipcode):
    url = gen_url(address, zipcode)  # generate the query's http url
    spider = Crawler()
    html = spider.html(url)  # fetch html
    if html:  # if we got good html, analyze it
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:  # if anything goes wrong during analysis, raise ExtractorError
            raise ExtractorError(address, zipcode, url)
    else:  # if bad html, raise HttpError
        raise HttpError(address, zipcode, url)
def anilibria_tv_scanner(url):  # done 20190211
    html = gethtml2(url)
    # html = gethtml(url, True)
    soup = BS4(html, "html5lib")
    webname = soup.find("h1", class_='title-rus').text or "parsing_error"
    webname += " / " + (soup.find("h3", class_='title-original').text or "parsing_error")
    website = "anilibria.tv"
    textready = soup.find("div", class_='torrent-first-col').span.text or "parsing_error"
    text = textready.split(" ", 1)[1]
    text = text.split(" ")[0].split("[")[0]
    if "-" in text:
        text = text.split("-")[1]
    webready = safeint(text) or -1
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def shikimori_org_scanner(url):  # done 20190211
    html = gethtml(url.get())
    soup = BS4(html, "html5lib")
    webname = soup.find("header", class_="head").meta["content"] or "parsing_error"
    website = "shikimori.org"
    maxready = 0
    span_ep_num = soup.find_all('span', class_="episode-num")
    for span_ep in span_ep_num:
        nextSpan = span_ep.findNext('span', class_="episode-kinds").string
        epNum = safeint(span_ep.string.split("#")[1]) or -1
        if "озвучка" in nextSpan and epNum > maxready:
            maxready = epNum
    webready = maxready
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def clean_up(message_bodies, nlon, nlon_model):
    cleansed = list()
    words_number = 0
    words_limit = 10000
    for message_body in message_bodies:
        try:
            soup = BS4(message_body, 'html.parser')
            clean_message_body = soup.text
        except Exception as e:
            logger.error('Error with BS4 on text:\n\n%s\n\n' % message_body, str(e))
            clean_message_body = message_body.strip()
        clean_message_body = re.sub(r'^\s*>+( .*)?', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'^\s*\+', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'^\s*---\+', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'\n[\t\s]*\n+', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'({+|}+|\++|_+|=+|-+|\*|\\+|/+|@+|\[+|\]+|:+|<+|>+|\(+|\)+)', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'On\s(.[^\sw]*\s)*wrote', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'[\n+]Sent from', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'https?:\/\/\S*', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'[\w\.-]+ @ [\w\.-]+', '', clean_message_body, flags=re.MULTILINE)
        # clean_message_body = clean_message_body.encode('utf-8').strip()
        message_by_lines = clean_message_body.splitlines()
        list_length = len(message_by_lines)
        index = 0
        for count in range(0, list_length):
            text = robjects.StrVector([message_by_lines[index]])
            if nlon.NLoNPredict(nlon_model, text)[0] == 'Not':
                del message_by_lines[index]
            else:
                index = index + 1
        clean_message_body = '\n'.join(message_by_lines)
        split_message = clean_message_body.split()
        words_number += len(split_message)
        if words_number > words_limit:
            split_message = split_message[:(words_limit - words_number)]
            clean_message_body = ' '.join(split_message)
            cleansed.append(clean_message_body.strip())
            break
        cleansed.append(clean_message_body.strip())
    return cleansed
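# A tiny, self-contained illustration of the first substitution in clean_up (stripping
# quoted-reply lines). The sample message below is a made-up assumption for demonstration.
import re

sample = "Thanks!\n> quoted reply line\n>> deeper quote\nregards"
stripped = re.sub(r'^\s*>+( .*)?', '', sample, flags=re.MULTILINE)
print(stripped)  # the quoted lines are blanked out; a later substitution in clean_up collapses the leftover blank lines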
def anistar_me_scanner(url):  # done 20190211
    r = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'})
    # r = urllib.request.Request(url, data='cmd=date +%Y%m%d', headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'})
    html = gethtml2(r)
    if not html:  # retry with a different User-Agent if the first request failed
        r = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0'})
        # r = urllib.request.Request(url, data='cmd=date +%Y%m%d', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0'})
        html = gethtml2(r)
    soup = BS4(html, "html5lib")
    webname = soup.find("h1", itemprop="name").string or "parsing_error"
    website = "anistar.me"
    webready = soup.find("p", class_="reason").string
    webready = safeint(re.findall(r'\d+', webready)[0]) or -1
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def mytask(bot):  # real-time update of the fundraising (jz) info
    global base
    url1 = "https://wds.modian.com/show_weidashang_pro/5329#1"
    r1 = requests.get(url1, verify=False, headers=headers)
    html_doc_1 = r1.text
    soup1 = BS4(html_doc_1, "html.parser")
    gl = bot.List('group', '556592071')
    if gl is not None:
        for group in gl:
            result = return_top()
            print("首位" + result)
            if result != base:
                print("before:" + base)
                base = result
                print("after:" + base)
                bot.SendTo(group, return_ans(result))
            else:
                print("equal_before:" + base)
                print("equal_after:" + base)
def scrape_menu(meal_id, date):
    meal_id = str(meal_id)
    menu_dict = {}
    url = ('https://tamuk.campusdish.com/Commerce/Catalog/Menus.aspx?LocationId=6532&PeriodId='
           + meal_id + '&MenuDate=' + date)
    res = requests.get(url)
    res.raise_for_status()
    soup = BS4(res.text, 'html.parser')
    for group in soup.select('.menu-details-station'):
        category = group.find('h2').text
        food_list = []  # collect only this station's items
        food_items = group.select('.menu-details-station-item .menu-name a')
        for item in food_items:
            food_list.append(item.text)
        menu_dict[category] = food_list[:]
    return menu_dict
def level3(url, spider):
    base_url = "http://high-schools.com"
    html = spider.html(base_url + url)
    soup = BS4(html)
    table = soup.find(
        "table",
        class_="table table-striped table-hover table-condensed table-sortable"
    )
    tbody = table.find("tbody")
    for tr in tbody.find_all("tr"):
        url = tr.td.a["href"]
        info = {
            key: td.text.strip()
            for key, td in zip([
                "school_name", "type", "students", "student_to_teacher_ratio",
                "free_or_reduced_lunch", "school_distinct"
            ], tr.find_all("td"))
        }
        yield url, info
def return_ans(result):
    url1 = "https://wds.modian.com/show_weidashang_pro/5329#1"
    r1 = requests.get(url1, verify=False, headers=headers)
    html_doc_1 = r1.text
    soup1 = BS4(html_doc_1, "html.parser")
    nick_sup = []
    money_sup = []
    rg = soup1.find("div", class_="b").get_text()
    num = int(rg[0:3])
    print(num)
    people = num
    if num >= 20:
        num = 20
    print(people)
    fond = 14968.42
    res = soup1.find_all("div", class_="mon current")[0].find_all("span")[1].get_text()
    res = res[1:]
    res = float(res.replace(',', ""))
    sub = round(fond - res, 2)
    if sub <= 0:
        sub = 0
    for i in range(num):
        # soup2 is assumed to be defined elsewhere (e.g. the soup returned by crawel_wds_url)
        nick = soup2.find_all("span", "nickname")[i].get_text()
        nick_sup.append(nick)
    money = soup1.find_all("span", "nick_sup")[0].get_text()
    for i in range(num):
        # note: both branches return on the first iteration, so only nick_sup[0] is compared
        if result == nick_sup[i]:
            print(i)
            return ("刚刚" + result + "聚聚 " + money + "!"
                    + "在聚聚榜上排名第" + str(i + 1) + "位!" + "\n"
                    + "现在共有" + str(people) + "个人参与了活动。" + "\n"
                    + "距离今日集资目标【¥" + "一张大盘" + "】还差【¥" + str(sub) + "】\n"
                    + "在最后的这段日子里,我们再坚持一下!" + "\n"
                    + "wds链接:http://jli.li/I")
        else:
            return ("刚刚" + result + "聚聚 " + money + "!" + "\n"
                    + "现在共有" + str(people) + "个人参与了活动。" + "\n"
                    + "距离今日集资目标【" + "一张大盘" + "】还差【¥" + str(sub) + "】\n"
                    + "在最后的这段日子里,我们再坚持一下!" + "\n"
                    + "wds链接:http://jli.li/I")
def make_soup_request(self, url, *args, soup_parser=default_soup_parser, **kwargs):
    """Make a request to a given url and convert the response text to a
    `bs4.BeautifulSoup` instance.

    WARNING: request status codes are not automatically checked, so it is
    highly suggested that you explicitly pass the kwarg check_status_code
    with a truthy value to ensure they are checked.

    See also: help(self.make_request)

    Parameters
    ----------
    soup_parser
        explicit parser to be used by the `BeautifulSoup` instance
    """
    return BS4(self.make_request(url, *args, **kwargs).text, soup_parser)
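# A hypothetical usage sketch for make_soup_request; `client` stands in for whatever
# object exposes make_request (an assumption, not part of the original code).
# soup = client.make_soup_request(
#     "https://example.com/page",
#     check_status_code=True,     # per the WARNING above, opt in to status checking
#     soup_parser="html.parser",  # override the default parser if desired
# )
# print(soup.title)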
def property_info(address, zipcode):
    url = gen_url(address, zipcode)
    spider = Crawler()
    html = spider.html(url)
    if html:
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:
            log.write(
                "Failed to analyze address = %s, zipcode = %s" % (address, zipcode),
                "Failed Extraction")
            return None
    else:
        log.write("%s Failed to get http request" % url, "Http Error")
async def query_take_first_result(query):
    print('Requesting page to anibin...')
    async with aiohttp.ClientSession() as sesi:
        async with sesi.get('http://anibin.blogspot.com/search?q={}'.format(query)) as resp:
            response = await resp.text()

    # Let's fiddle with the data
    soup_data = BS4(response, 'html.parser')
    first_query = soup_data.find('div', attrs={'class': 'date-posts'})
    if not first_query:
        return None, None, None

    # Query results
    query_title = first_query.find('h3', attrs={'class': 'post-title entry-title'}).text.strip()
    if not query_title:
        return None, None, None

    content_data = str(first_query.find('div', attrs={'class': 'post-body entry-content'}))
    n_from = content_data.find('評価:')
    if n_from == -1:
        return False, False, False
    nat_res = content_data[n_from + 3:]
    nat_res = nat_res[:nat_res.find('<br/>')]

    n_from2 = content_data.find('制作:')
    if n_from2 == -1:
        return [query_title, nat_res, 'Unknown']
    studio = content_data[n_from2 + 3:]
    studio = studio[:studio.find('<br/>')]
    return [query_title, nat_res, studio]
def shikimori_one_scanner(url):  # done 20190625
    # time.sleep(3)  # looks like a 2-3 second pause is needed, or the server does not respond
    # dt = datetime.datetime.today().strftime('%Y-%m-%d')
    timeout = 5
    r = urllib.request.Request(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        })
    # r = urllib.request.Request(url, data='cmd=date +%Y%m%d', headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'})
    html = gethtml2(r)
    if not html:  # retry with a different User-Agent if the first request failed
        r = urllib.request.Request(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0'
            })
        # r = urllib.request.Request(url, data='cmd=date +%Y%m%d', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0'})
        html = gethtml2(r)
    soup = BS4(html, "html5lib")
    htmltext = str(soup)
    webready = -1  # error etc.
    print("Эпизоды:" in htmltext)
    episodes = htmltext.split("Эпизоды:", 1)[1]
    episodes = episodes.split("class=\"value\">", 1)[1]
    episodes = episodes.split("<", 1)[0].split("/")[0]
    print("episodes === ", episodes, flush=True)
    if episodes:
        webready = safenumber(episodes) or webready
    fullSeason = soup.find("span", {"data-text": "вышло"}) or False
    if fullSeason:
        webready = 0  # keep it for "full season released" etc., shikimori.one only
    print(webready)
    return 0, 0, 0, 0, webready
def details(self, response):
    details_html = Selector(response=response).extract()
    detail = BS4(details_html, 'lxml')
    details_data = self.make_details_data(detail)
    price = details_data.get('price')
    glod = details_data.get('glod')
    if price != 0:
        unit_price = glod / price
    else:
        unit_price = 0
    print(glod, price, response.url)
    item_obj = WowGlodPriceItem(
        price=price,
        glod=glod,
        unit_price=unit_price,
        area=details_data.get('area'),
        server=details_data.get('server'),
        camp=details_data.get('camp'),
        push_timestrap=details_data.get('push_timestrap'),
        order_id=details_data.get('order_id'),
        url=response.url)
    yield item_obj
def download_proxy(self, maximum_num_of_proxy=10):
    """Load the latest available proxies from www.us-proxy.org.

    There are 3 levels of proxies according to their anonymity.

    Level 1 - Elite Proxy / Highly Anonymous Proxy: the web server can't detect
        whether you are using a proxy.
    Level 2 - Anonymous Proxy: the web server can know you are using a proxy,
        but it can't know your real IP.
    Level 3 - Transparent Proxy: the web server can know you are using a proxy
        and it can also know your real IP.

    You can change the rules below the line marked
    "=== EDIT THE FOLLOWING RULES CAN FILTER THE PROXY YOU WANT"
    to pick the proxies you need. By default only Elite proxies are used.
    """
    ### get www.us-proxy.org homepage html
    spider = Crawler()
    html = spider.html("http://www.us-proxy.org/")

    ### analyze the html, save useful proxies
    ips = list()
    res = list()
    soup = BS4(html)
    table = soup.find("table", id="proxylisttable")
    for tr in table.tbody.find_all("tr"):
        ip, port, code, country, anonymity, google, https, last_check = [
            td.text for td in tr.find_all("td")
        ]
        ### === EDIT THE FOLLOWING RULES CAN FILTER THE PROXY YOU WANT
        if anonymity == "elite proxy":  # default: only use elite proxies
            ips.append("http://%s:%s" % (ip, port))
            res.append([0.0, 0.0, 1.0])
        if len(res) >= maximum_num_of_proxy:  # if we got enough usable proxies, stop
            break
    self.proxy = pd.DataFrame(res, index=ips, columns=["success", "tried", "health"])
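# A hedged sketch of how the filter rule above could be loosened to also accept
# Level 2 (anonymous) proxies, per the anonymity levels in the docstring. The
# "anonymous" column text is an illustrative assumption about the site's table,
# not something confirmed by the original code.
#     if anonymity in ("elite proxy", "anonymous"):
#         ips.append("http://%s:%s" % (ip, port))
#         res.append([0.0, 0.0, 1.0])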
def get_bulkcurrencies(self):
    """Get the supported currencies.

    Scraped from a JSON object embedded in a javascript tag on the html page.
    """
    start = r'YAHOO\.Finance\.CurrencyConverter\.addCurrencies\('
    _json = r'\[[^\]]*\]'
    try:
        resp = get(self.currencies_url)
        resp.raise_for_status()
    except exceptions.RequestException as e:
        self.warn("%s: Problem whilst contacting endpoint:\n%s" % (self._name, e))
    else:
        # Find the javascript that contains the json object
        soup = BS4(resp.text, 'html.parser')
        re_start = re.compile(start)
        jscript = soup.find('script', type='text/javascript', text=re_start).string

        # Separate the json object and cache it
        re_json = re.compile(_json)
        match = re_json.search(jscript)
        if match:
            json_str = match.group(0)
            with open(self._cached_currency_file, 'w') as fd:
                fd.write(json_str)  # the file is opened in text mode, so write the str directly

        # Parse the json file
        with open(self._cached_currency_file, 'r') as fd:
            j = json.load(fd)
        if not j:
            raise RuntimeError(
                "JSON not found at endpoint or as cached file:\n%s" % self._cached_currency_file)
        return j
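# A self-contained sketch of the extraction idea used in get_bulkcurrencies: find the
# <script> whose text matches the addCurrencies( prefix, then cut the JSON array out
# with the same bracket regex. The inline HTML below is a made-up assumption purely
# for illustration.
import json
import re
from bs4 import BeautifulSoup as BS4

demo_html = """
<script type="text/javascript">
YAHOO.Finance.CurrencyConverter.addCurrencies([{"shortname": "USD"}, {"shortname": "EUR"}]);
</script>
"""
demo_soup = BS4(demo_html, 'html.parser')
demo_script = demo_soup.find(
    'script', type='text/javascript',
    text=re.compile(r'YAHOO\.Finance\.CurrencyConverter\.addCurrencies\(')).string
demo_match = re.search(r'\[[^\]]*\]', demo_script)
if demo_match:
    print(json.loads(demo_match.group(0)))  # -> [{'shortname': 'USD'}, {'shortname': 'EUR'}]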
def find_dns_records(self, domain, record_type='A', dataframe=True):
    """Find all dns records of a given domain.

    :param domain: a typical domain name, e.g. "example.com"
    :returns: a pandas DataFrame of records, or an iterable of DNSRecord tuples
    """
    html = self.session.get(self.zonefile_url.format(domain=domain)).text
    if record_type == 'A':
        # Update the security token while we're at it.
        sec_pattern = r'nonce=\"([0-9A-Za-z]+)\"'
        self.sec = re.compile(sec_pattern).findall(html)[0]
        pattern = (
            "Undo{rt}Edit\\('tbl{rt}Records_([0-9]+)', '([^']+)', '([^']+)', "
            "'([^']+)', '([^']+)', '([^']+)', '([^']+)'\\)"
        ).format(rt=record_type)
        try:
            results = map(DNSRecord._make, re.compile(pattern).findall(html))
        except Exception:
            logger.exception('find domains broken, maybe godaddy has changed its web structure')
            return []
        return results
    else:
        # available records: ['A', 'CNAME', 'MX', 'TXT', 'SRV', 'AAAA', 'NS']
        try:
            assert ['CNAME'].count(record_type.upper())
        except Exception:
            logger.exception('package development incomplete. currently not handling "%s" records' % record_type)
            return []
        h = BS4(BS4(html).encode('ascii'))
        r_table = h.find_all('table', id=re.compile('tbl%sRecords' % record_type.upper()))[0]
        tbl_headers = r_table.tr.text
        tbl_headers = re.sub(r'[^\u0000-\u007F\s]+', '', tbl_headers)
        tbl_headers = re.sub(r'[\n]+', '\n', tbl_headers).strip().split('\n')
        tbl_headers = [it.strip() for it in tbl_headers if it.strip()]
        df = pd_dataframe(columns=tbl_headers)
        tbl_items = r_table.find_all('tr', attrs={'lstindex': True})
        for row in tbl_items:
            cols = row.find_all('input', attrs={"type": "hidden"})[1:]  # skipping checkmark col
            new_vals = [v.attrs['value'] for v in cols]
            df = df.append(dict(zip(tbl_headers, new_vals)), ignore_index=True)
        df.index += 1  # adjusting index to coincide with godaddy index
        if dataframe:
            return df
        else:
            DNSRecord = namedtuple(
                'DNSRecord',
                'index, hostname, value, ttl, host_td, points_to, rec_modified')
            pattern = (
                "\\('tbl{rt}Records_([0-9]+)', '([^']+)', '([^']+)', "
                "'([^']+)', '([^']+)', '([^']+)', '([^']+)'\\)"
            ).format(rt=record_type)
            results = map(DNSRecord._make, re.compile(pattern).findall(html))
            return results  # hand back the parsed records, mirroring the A-record branch
# (fragment) download every VOD lecture in the selected week, then return to the PLATO home page.
# The two prints tell the user the week's lectures are being downloaded and that the
# online attendance view count for each video will increase by one.
print('해당 주차 강의를 다운로드 합니다. 완료될 때까지 기다려주세요.')
print('- 온라인 출석부에서 해당 동영상 강의 열람 횟수가 1회 증가합니다.')
vod_list = week.find_all('li', {'class': 'activity vod modtype_vod'})
for i in range(0, len(vod_list)):
    vodLink = re.search(r'https://.*\d*', str(vod_list[i])).group()
    vodLink = vodLink[:vodLink.find('"')]
    vodLink = vodLink.replace('view', 'viewer')
    if vodLink.find('https://plato.pusan.ac.kr/mod/vod/viewer.php?') == -1:
        continue
    driver.get(vodLink)
    try:
        da = Alert(driver)
        da.dismiss()
    except:
        print("", end='')
    html = driver.page_source
    soup = BS4(html, 'html.parser')
    source = str(soup.find_all('source'))
    source = source[source.find('https'):source.find('m3u8') + 4]
    fileDownload(courseName + week.attrs['aria-label'] + '_' + str(i + 1), source)
driver.get('https://plato.pusan.ac.kr/')
driver.find_element_by_xpath('//*[@id="page-header"]/div[1]/div[2]/ul/li[2]/a').click()
driver.quit()
exit()
    # (fragment) body of an infinite-scroll loop: keep scrolling until the page height stops growing
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for the page to load
    time.sleep(0.5)
    # Calculate the new scroll height and compare it with the last scroll height
    new_height = browser.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# preparing the soup
search = browser.page_source
soup = BS4(search, "html.parser")

# search every span carrying the result-title class (the person's name)
people = soup.findAll(
    "span",
    {"class": "entity-result__title-line flex-shrink-1 entity-result__title-text--black"})
for name in people:
    name_list.append(name.text)
for name in people:
    title_list.append(company)

clean_list = []