Example #1
def get_url_set(driver, key_word):
    container_list = []  # temporary storage for result containers
    real_url_set = set()  # set of resolved URLs

    # Baidu search results
    baidu_url = "https://www.baidu.com/s?ie=UTF-8&wd=" + key_word
    driver.get(baidu_url)
    # collect the URL of every search result
    for page in range(1, MAX_PAGEs + 1):
        BF1 = BF(driver.page_source)
        #print(driver.page_source)
        page_container_list = BF1.findAll(
            "div", {"class": re.compile(".*c-container.*")})
        container_list.extend(page_container_list)
        driver.find_element_by_xpath("//*[text()='下一页>']").click()  # go to the next page

        time.sleep(2)

    # follow each wrapped URL once to recover the original address and add it to real_url_set
    for container in container_list:
        href = container.find('h3').find('a').get('href')
        try:
            baidu_url = requests.get(url=href,
                                     headers=headers,
                                     allow_redirects=False)
        except:
            continue
        real_url = baidu_url.headers.get('Location', '')  # the page's original address
        if real_url.startswith('http'):
            real_url_set.add(real_url + '\n')

    # Bing search results

    bing_url = "https://cn.bing.com/search?q=" + key_word + "&FORM=PORE"
    try:
        driver.get(bing_url)
    except:
        driver.refresh()
    # the page needs a refresh before it renders properly
    time.sleep(2)
    driver.refresh()
    time.sleep(5)
    for page in range(1, MAX_PAGEs + 1):
        BF1 = BF(driver.page_source)
        #print(driver.page_source)
        page_container_list = BF1.find("ol", {"id": "b_results"}).findAll("h2")
        for page_container in page_container_list:
            try:
                real_url_set.add(page_container.find("a").get('href'))
            except:
                break
        driver.find_element_by_xpath(".//*[@title='下一页']").click()  # go to the next page
        time.sleep(2)

    # Google is not implemented yet
    #google_url=""
    return real_url_set
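The Baidu loop above does not store the visible result links directly: each hit is wrapped in a Baidu redirect URL, so the code requests it with allow_redirects=False and reads the real target from the Location response header. A minimal sketch of that single step in isolation, assuming a placeholder headers dict rather than the one used by the example:

import requests

headers = {"User-Agent": "Mozilla/5.0"}  # placeholder, not the example's headers

def resolve_baidu_link(wrapped_url):
    # Request the redirect page but do not follow it; the real article URL
    # comes back in the Location header of the 302 response.
    resp = requests.get(wrapped_url, headers=headers, allow_redirects=False)
    real_url = resp.headers.get('Location', '')
    return real_url if real_url.startswith('http') else None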
Example #2
    def get_url_set(self, key_word):
        container_list = []  # temporary storage for result containers
        real_url_set = set()  # set of resolved URLs

        # CNN

        CNN_url = "https://edition.cnn.com/search?size=20&q=" + key_word
        self.driver.get(CNN_url)
        BF1 = BF(self.driver.page_source, 'lxml')
        container_list = BF1.findAll("div", {"class": "cnn-search__result-contents"})

        for container in container_list:
            try:
                href = container.find("h3").find("a").get("href")
            except:
                continue
            real_url_set.add(href)

        return real_url_set
Example #3
def get_url_set(driver, key_word):
    container_list = []  # temporary storage for result containers
    real_url_set = set()  # set of resolved URLs

    baidu_url_list = [
        "https://www.baidu.com/s?ie=UTF-8&wd=" + key_word,
        "https://www.baidu.com/s?ie=UTF-8&tn=news&wd=" + key_word
    ]
    for i in range(2):
        driver.get(baidu_url_list[i])
        # collect the URL of every search result
        for page in range(1, MAX_PAGEs + 1):
            BF1 = BF(driver.page_source, 'lxml')
            #print(driver.page_source)
            if i == 0:
                page_container_list = BF1.findAll(
                    "div", {"class": re.compile(".*c-container.*")})
            else:
                page_container_list = BF1.findAll(
                    "div", {"class": re.compile("result")})
            #print(page_container_list)
            container_list.extend(page_container_list)
            driver.find_element_by_xpath("//*[text()='下一页>']").click()  # go to the next page
            time.sleep(2)
        if i == 0:
            #print(container_list)
            for container in container_list:
                # print(container)
                try:
                    href = container.find("h3").find("a").get("href")
                    baidu_url = requests.get(url=href,
                                             headers=headers,
                                             allow_redirects=False)
                except:
                    continue
                real_url = baidu_url.headers.get('Location', '')  # the page's original address
                if real_url.startswith('http'):
                    real_url_set.add(real_url + '\n')
            container_list = []  # reset before the news-search pass
        else:
            for container in container_list:
                try:
                    href = container.find("h3").find("a").get("href")
                except:
                    continue
                if href and "baijiahao" not in href:
                    real_url_set.add(href)
    return real_url_set
Example #4
def main():
    # main function

    # Set the url link
    urlMovies = 'https://www.subs4free.club/'

    # Set my user agent. If you don't know your user agent, just google "my user agent" and it will appear as the first result.
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}

    # Get the whole page content
    pageMovies = requests.get(urlMovies, headers=headers)

    # Parse the page with html.parser
    soupMovies = BF(pageMovies.content, 'html.parser')
    
    # DIVs that contain the info for every movie
    elementListMovies = soupMovies.findAll("div", {"class": "movies-info"})

    createInfoMsgToSend(elementListMovies)
Example #5
def abcNewsGetUrlSet(self, key_word) -> set:
    container_list = []  # temporary storage for result containers
    real_url_set = set()  # set of resolved URLs

    # abcNews

    abcNews_url = "https://abcnews.go.com/search?r=week&searchtext=" + key_word
    self.driver.get(abcNews_url)
    BF1 = BF(self.driver.page_source, 'lxml')
    container_list = BF1.findAll("div", {"class": re.compile("result.*")})

    for container in container_list:
        try:
            href = container.find("a", {"class": "title"}).get("href")
            title = container.find("a", {"class": "title"}).get_text()
        except:
            continue
        real_url_set.add((href, title))

    return real_url_set
Example #6
def cnnGetUrlSet(self, key_word) -> set:
    container_list = []  # temporary storage for result containers
    real_url_set = set()  # set of resolved URLs

    # CNN

    cnn_url = "https://edition.cnn.com/search?size=20&q=" + key_word
    self.driver.get(cnn_url)
    bf1 = BF(self.driver.page_source, 'lxml')
    container_list = bf1.findAll("div",
                                 {"class": "cnn-search__result-contents"})

    for container in container_list:
        try:
            href = container.find("h3").find("a").get("href")
            title = container.find("h3").find("a").get_text()
        except:
            continue
        real_url_set.add((href, title))

    return real_url_set
Example #7
def tassGetUrlSet(self, key_word) -> set:
    container_list = []  # temporary storage for result containers
    real_url_set = set()  # set of resolved URLs

    # tass

    tass_url = "https://tass.com/search?sort=date&searchStr=" + key_word
    self.driver.get(tass_url)
    BF1 = BF(self.driver.page_source, 'lxml')
    container_list = BF1.findAll("div", {"class": "news-list__item ng-scope"})

    for container in container_list:
        try:
            href = "www.tass.con/" + container.find("a").get("href")
            title = container.find("span", {
                "class": "news-preview__title ng-binding"
            }).get_text()
        except:
            continue
        real_url_set.add((href, title))

    return real_url_set
Example #8
def find_jobs():
    html_text = requests.get(website).text
    soup = BF(html_text, 'lxml')
    jobs = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')

    for index, job in enumerate(jobs):
        published_date = job.find('span', class_='sim-posted').span.text

        if 'few' in published_date:
            company_name = job.find('h3', class_='joblist-comp-name').text
            skills = job.find('span', class_='srp-skills').text
            experience = job.li.text.replace('card_travel', '')
            more_info = job.header.h2.a['href']

            if unfamiliar_skill not in skills:
                with open(f'posts/{company_name.strip()}.txt', 'w') as f:
                    f.write(f"Company name: {company_name.strip()}\n")
                    f.write(f"Required Skills: {skills.strip()}\n")
                    f.write(f"Required Experience: {experience.strip()}\n")
                    f.write(f'More Info: {more_info}\n')

                print(
                    f'The file saved in the posts folder: {company_name.strip()}.txt '
                )
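find_jobs() relies on module-level website and unfamiliar_skill values and writes one text file per matching post. One way it could be driven is a simple timed loop that re-runs the scrape periodically; a sketch under those assumptions (the 10-minute interval is an arbitrary choice):

import time

if __name__ == '__main__':
    # Assumes `website` and `unfamiliar_skill` are defined at module level,
    # as find_jobs() above expects.
    while True:
        find_jobs()
        wait_minutes = 10
        print(f'Waiting {wait_minutes} minutes...')
        time.sleep(wait_minutes * 60)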
Example #9
from bs4 import BeautifulSoup as BF

with open('home.html', 'r') as html_file:
    content = html_file.read()

    soup = BF(content, 'lxml')
    course_cards = soup.find_all('div', class_='card')
    for course in course_cards:
        course_name = course.h5.text
        course_price = course.a.text.split()[-1]

        print(f"{course_name} costs {course_price}")
Example #10
def get_soup(corpus):  # get a BeautifulSoup object for the crawled XML file
    with open('%s.xml' % corpus, "r") as original_corpus:
        soup = BF(original_corpus, 'lxml')
    return soup
Example #11
def getDataFromSite():
    response = requests.get(currentTarget, headers=HEADERS)
    return BF(response.content, 'html.parser')
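getDataFromSite() reads the module-level currentTarget and HEADERS names. A self-contained sketch with hypothetical values for both, just to show the expected shape of that configuration:

import requests
from bs4 import BeautifulSoup as BF

currentTarget = "https://example.com/"   # hypothetical target URL
HEADERS = {"User-Agent": "Mozilla/5.0"}  # hypothetical user agent

def getDataFromSite():
    response = requests.get(currentTarget, headers=HEADERS)
    return BF(response.content, 'html.parser')

soup = getDataFromSite()
print(soup.title.text if soup.title else "no <title> found")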
Example #12
    def get_url_set(self, driver, key_word):
        container_list = []  # temporary storage for result containers
        real_url_set = set()  # set of resolved URLs

        # Baidu web search + Baidu news search results

        baidu_url_list = [
            "https://www.baidu.com/s?ie=UTF-8&wd=" + key_word,
            "https://www.baidu.com/s?ie=UTF-8&tn=news&wd=" + key_word
        ]
        for i in range(2):
            driver.get(baidu_url_list[i])
            # collect the URL of every search result
            for page in range(1, MAX_PAGEs + 1):
                BF1 = BF(driver.page_source, 'lxml')
                #print(driver.page_source)
                if i == 0:
                    page_container_list = BF1.findAll(
                        "div", {"class": re.compile(".*c-container.*")})
                else:
                    page_container_list = BF1.findAll(
                        "div", {"class": re.compile("result")})
                #print(page_container_list)
                container_list.extend(page_container_list)
                driver.find_element_by_xpath("//*[text()='下一页>']").click()  # go to the next page
                time.sleep(2)
            if i == 0:
                #print(container_list)
                for container in container_list:
                    #print(container)
                    href = container.find("h3").find("a").get("href")
                    try:
                        baidu_url = requests.get(url=href,
                                                 headers=headers,
                                                 allow_redirects=False)
                    except:
                        continue
                    real_url = baidu_url.headers['Location']  #得到网页原始地址
                    if real_url.startswith('http'):
                        real_url_set.add(real_url + '\n')
                    container_list = []
            else:
                for container in container_list:
                    try:
                        href = container.find("h3").find("a").get("href")
                    except:
                        continue
                    if href and "baijiahao" not in href:
                        real_url_set.add(href)

        # Bing search results

        bing_url = "https://cn.bing.com/search?q=" + key_word + "&FORM=PORE"
        try:
            driver.get(bing_url)
        except:
            driver.refresh()
        # the page needs a refresh before it renders properly
        time.sleep(2)
        driver.refresh()
        time.sleep(5)
        for page in range(1, MAX_PAGEs + 1):
            BF1 = BF(driver.page_source)
            #print(driver.page_source)
            page_container_list = BF1.find("ol", {
                "id": "b_results"
            }).findAll("h2")
            for page_container in page_container_list:
                try:
                    real_url_set.add(page_container.find("a").get('href'))
                except:
                    break
            try:
                driver.find_element_by_xpath(".//*[@title='下一页']").click()
            except:
                driver.find_element_by_xpath(
                    ".//*[@title='Next page']").click()
            time.sleep(2)
        # Google web search + Google News  # requires a VPN
        google_url_list = [
            "https://www.google.com.hk/search?q=" + key_word,
            "https://www.google.com/search?q={}&tbm=nws".format(key_word)
        ]
        for google_url in google_url_list:
            try:
                driver.get(google_url)
            except:
                driver.refresh()
            # the page needs a refresh before it renders properly
            time.sleep(2)
            driver.refresh()
            time.sleep(5)
            driver.get(google_url)
            for page in range(1, MAX_PAGEs + 1):
                BF1 = BF(driver.page_source)
                #print(driver.page_source)
                page_container_list = BF1.findAll("div", {"class": "g"})
                for page_container in page_container_list:
                    try:
                        real_url_set.add(page_container.find("a").get('href'))
                    except:
                        break
                driver.find_element_by_xpath("//*[text()='下一页']").click()  # go to the next page
                time.sleep(2)

        # Wikipedia (requires a VPN)
        Wikipedia_url = "https://zh.wikipedia.org/w/index.php?search=" + key_word + "&limit=100&ns0=1"
        driver.get(Wikipedia_url)
        BF1 = BF(driver.page_source)
        page_container_list = BF1.findAll(
            "div", {"class": "mw-search-result-heading"})
        for page_container in page_container_list:
            try:
                real_url_set.add("https://zh.wikipedia.org" +
                                 page_container.find("a").get('href'))
            except:
                break

        return real_url_set
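All of these examples call driver.find_element_by_xpath, which was deprecated in Selenium 4 and later removed. On current Selenium releases the same lookup is written with a By locator; a minimal sketch, assuming a Chrome driver is available on the machine:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://cn.bing.com/search?q=python")
# Selenium 4 style: find_element(By.XPATH, ...) replaces find_element_by_xpath(...)
driver.find_element(By.XPATH, ".//*[@title='下一页']").click()
driver.quit()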
Example #13
    'Accept-Encoding':
    'gzip, deflate, compress',
    'Accept-Language':
    'en-us;q=0.5,en;q=0.3',
    'Cache-Control':
    'max-age=0',
    'Connection':
    'keep-alive',
    'User-Agent':
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
}

real_url_list = []
container_list = []
for page in range(1, max_pages + 1):
    BF1 = BF(driver.page_source)
    #print(driver.page_source)
    page_container_list = BF1.findAll("div",
                                      {"class": re.compile(".*c-container.*")})
    container_list.extend(page_container_list)
    driver.find_element_by_xpath("//*[text()='下一页>']").click()  # go to the next page
    time.sleep(2)

#get all URLs

for container in container_list:
    href = container.find('h3').find('a').get('href')
    try:
        baidu_url = requests.get(url=href,
                                 headers=headers,
                                 allow_redirects=False)
Example #14
import os
import requests
from bs4 import BeautifulSoup as BF

if __name__ == "__main__":
    # main function

    # Set the url link
    urlGames = 'https://game20.gr/category/news/'
    urlMovies = 'https://www.subs4free.info/'
    urlBeta = 'https://www.allgamesdelta.net/'

    # Set my user agent. If you don't know your user agent, just google "my user agent" and it will appear as the first result.
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}

    # Get the whole page content        
    pageGames = requests.get(urlGames, headers=headers)
    pageMovies = requests.get(urlMovies, headers=headers)
    pageBeta = requests.get(urlBeta, headers=headers)
    
    soupGames = BF(pageGames.content, 'lxml')
    soupMovies = BF(pageMovies.content, 'lxml')
    soupBeta = BF(pageBeta.content, 'lxml')
    sel = " body > div.wrapper > div.container > section > div.container-section > div.list-info > div.movies-info > div.movie-cont-right > div.panel-heading-info > a"    
    # elementListMovies = soupMovies.select('a.headinglink')
    elementListMovies = soupMovies.findAll("div", {"class" : "movie-info"})
    elementListGames = soupGames.findAll('article')
    elementListBeta = soupBeta.findAll("div", {"class" : "post hentry"})    
Example #15
def get_url_set(self, key_word):
    container_list = []  # temporary storage for result containers
    real_url_set = set()  # set of resolved URLs

    # CNN

    CNN_url = "https://edition.cnn.com/search?size=20&q=" + key_word
    self.driver.get(CNN_url)
    BF1 = BF(self.driver.page_source, 'lxml')
    container_list = BF1.findAll("div", {"class": "cnn-search__result-contents"})

    for container in container_list:
        try:
            href=container.find("h3").find("a").get("href")
        except:
            continue
        real_url_set.add(container)

    for i in range(2):
        self.driver.get(baidu_url_list[i])
        # collect the URL of every search result
        for page in range(1, self.MAX_PAGEs + 1):
            BF1 = BF(self.driver.page_source, 'lxml')
            # print(driver.page_source)
            if i == 0:
                page_container_list = BF1.findAll("div", {"class": re.compile(".*c-container.*")})
            else:
                page_container_list = BF1.findAll("div", {"class": re.compile("result")})
            # print(page_container_list)
            container_list.extend(page_container_list)
            self.driver.find_element_by_xpath("//*[text()='下一页>']").click()  # go to the next page
            time.sleep(2)
        if i == 0:
            # print(container_list)
            for container in container_list:
                # print(container)
                try:
                    href = container.find("h3").find("a").get("href")
                    baidu_url = requests.get(url=href, headers=self.headers, allow_redirects=False)
                except:
                    continue
                real_url = baidu_url.headers.get('Location', '')  # the page's original address
                if real_url.startswith('http'):
                    real_url_set.add(real_url + '\n')
            container_list = []  # reset before the news-search pass
        else:
            for container in container_list:
                try:
                    href = container.find("h3").find("a").get("href")
                except:
                    continue
                if href and "baijiahao" not in href:
                    real_url_set.add(href)