def parse_requests(links):
    search_urls = requests.get(
        links).text  # source of the link from the overridden 'parse' method
    html_link_parent = bb(search_urls, 'lxml').findAll('h3')

    for link_title in html_link_parent:
        link_title = link_title.find_next()
        url_strip = link_title.get('href')
        if '/url?q=' in url_strip and 'stackoverflow.com/questions/tagged' in url_strip:  # skip links that point to a tag listing page rather than a question
            pass
        elif '/url?q=' in url_strip and 'stackoverflow.com/questions/' in url_strip:
            url_strip = url_strip[7:]

            if 'webcache' not in url_strip:  # skip Google web-cache copies; legitimate links share the same URL namespace
                req = requests.get(url_strip).text

                soup = bb(req, 'lxml').find(
                    'div', {'class': 'container'})  # section containing the question title and body
                for link_title in soup:
                    title = soup.find('a', {
                        'class': 'question-hyperlink'
                    }).text
                    title_body_dict = soup.find('div', {
                        'class': 'post-text'
                    }).text
                    time.sleep(3)  # wait before requesting the next site

                    return {title: title_body_dict}  # map the question title to its body text
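
A minimal usage sketch for parse_requests, assuming `bb` is BeautifulSoup and that the argument is a Google search-results URL whose hits are Stack Overflow questions; the query below is only an example.

# Imports assumed by the snippet above (not shown on this page): requests, time, and BeautifulSoup as bb.
import time

import requests
from bs4 import BeautifulSoup as bb

if __name__ == '__main__':
    search_url = 'https://www.google.com/search?q=site:stackoverflow.com+python+requests'  # example query only
    result = parse_requests(search_url)  # {question title: question body} or None if nothing matched
    if result:
        for title, body in result.items():
            print(title)
            print(body[:300])
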
Example no. 2
def getHref(cityList):
    global sum
    sum = 0
    header = [{
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.84 Safari/537.36'
    }, {
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }]
    global fw
    fw = open('../fo.json', 'w', encoding='utf-8')
    for provinceItem in cityList.keys():
        for cityItem in cityList[provinceItem].keys():
            cityCode = cityList[provinceItem][cityItem]  # code for this city
            try:
                href = urllib.request.Request(
                    "https://search.51job.com/list/"+cityCode+",000000,0000,00,9,99,%2520,2,1.html",headers=header[random.randint(0,1)])
                b = bb(urllib.request.urlopen(href).read(), 'lxml')
                total_page = b.find('div', {'class': 'p_in'}).find('span', {'class': 'td'}).text  # total page count for this city
                total_page=int(re.findall(r"\d+\.?\d*", total_page)[0])
                for i in range(total_page):
                    href = urllib.request.Request(
                        "https://search.51job.com/list/" + cityCode + ",000000,0000,00,9,99,%2520,2," + "%d.html" % (i + 1),  # page numbers start at 1
                        headers=header[random.randint(0, 1)])
                    b = bb(urllib.request.urlopen(href).read(), 'lxml')
                    items = b.findAll('p', {'class': 't1'})
                    for item in items:
                        href = urllib.request.Request(item.find('span').find('a')['href'], headers=header[random.randint(0,1)])
                        Thread(target=getInformation, args=(
                            bb(urllib.request.urlopen(href).read(), 'lxml'),provinceItem,cityItem,sum)).start()
                        sum += 1
                    sleep(2)    # sleep 2 seconds after each page is fetched
            except Exception as e:
                print("1:%s"%e)
                continue
    fw.close()
    dataFormat("../fo.json")
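
The `getInformation` worker and `dataFormat` post-processor used by the getHref functions are not shown on this page. A minimal, hypothetical sketch of the worker pattern, assuming it extracts a few fields from the detail-page soup and appends one JSON line to the shared `fw` handle (the field names and selector are placeholders, not the original implementation):

import json
from threading import Lock

write_lock = Lock()  # serialize writes from the worker threads (an assumption; not shown in the original)

def getInformation(soup, provinceItem, cityItem, index):
    # Hypothetical worker: pull the job title from the detail page and write one JSON line to fw.
    try:
        title_tag = soup.find('h1')
        record = {
            'index': index,
            'province': provinceItem,
            'city': cityItem,
            'title': title_tag.text.strip() if title_tag else '',
        }
        with write_lock:
            fw.write(json.dumps(record, ensure_ascii=False) + '\n')
    except Exception as e:
        print("worker:%s" % e)
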
def site2():
    s = 1
    q = ''
    e = {}
    j = ''
    t = 0
    test = 9
    backdir9 = os.getcwd()
    fine = backdir9 + '\\Data\\name.txt'
    fine1 = backdir9 + '\\Data\\count.txt'
    with open(fine, 'r+') as (f):
        f.truncate()
    with open(fine1, 'r+') as (f):
        f.truncate()
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    popup.lead2()
    with open(fine, 'r+') as (f):
        j = f.read()
        f.seek(0)
        f.truncate()
    book = j
    name = book.replace(' ', '+')
    web = 'http://www.ebook777.com'
    r = requests.get(web + '/?s=' + name, headers=headers)
    soup = bb(r.text, 'lxml')
    match = soup.findAll('div', class_='content')
    x = 1
    for a in match:
        g = a.find('a', {'class': 'title'}).text
        o = a.find('a', {'class': 'title'})['href']
        e[g] = o

    with open(fine, 'r+') as (f):
        for a in e.keys():
            m = str(x)
            f.write(m + '.  ' + a + '\n\n')
            x += 1

    os.startfile(fine)
    popup.lead3()
    with open(fine1, 'r+') as (f):
        t = int(f.readline())
        f.truncate()
    for d in e.keys():
        if s == t:
            q = d
            test = 10
        s += 1

    if test == 10:
        new = e[q]
        newr = requests.get(new, headers=headers)  # headers must be passed as a keyword argument
        soup1 = bb(newr.text, 'lxml')
        match1 = soup1.find('span', class_='download-links')
        match2 = match1.find('a')['href']
        wb.open_new(match2)
Example no. 4
def getHref(cityList):
    global sum
    sum = 0
    header = [{
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.84 Safari/537.36'
    }, {
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }]
    global fw
    fw = open('../leiPin.json', 'w', encoding='utf-8')
    for provinceItem in cityList.keys():  # each province
        for cityItem in cityList[provinceItem].keys():  # each city
            cityCode = cityList[provinceItem][cityItem]  # code for this city
            nextPage = "https://www.liepin.com/zhaopin/?dqs=%s&curPage=1" % (
                cityCode)
            while True:  # iterate over every results page
                try:
                    #          href = urllib.request.Request(("https://www.liepin.com/zhaopin/?dqs=%s&curPage=%s")%(cityCode,i+1),headers=header[random.randint(0,1)])  # pretend to be a browser
                    href = urllib.request.Request(
                        nextPage,
                        headers=header[random.randint(0, 1)])  # pretend to be a browser
                    b = bb(urllib.request.urlopen(href).read(), 'lxml')
                    items = b.find('ul', {
                        'class': 'sojob-list'
                    }).findAll('li')  # job listings on this page
                    for item in items:
                        href = urllib.request.Request(
                            item.find('div', {
                                'class': 'job-info'
                            }).find('h3').find('a')['href'],
                            headers=header[random.randint(0, 1)])
                        Thread(target=getInformation,
                               args=(bb(
                                   urllib.request.urlopen(href).read(),
                                   'lxml'), provinceItem, cityItem,
                                     sum)).start()
                        sum += 1
                    nextPage = "https://www.liepin.com/" + b.find(
                        'div', {
                            'class': 'pagerbar'
                        }).findAll('a')[-3]['href']
                except Exception as e:
                    if (str(e) == 'HTTP Error 404: Not Found'):
                        break
                    print("1:%s" % e)
                    continue
Example no. 5
def getHref(cityList):
    global sum, fw, proxy_list
    sum = 0
    header = [{
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.84 Safari/537.36'
    }, {
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }]
    fw = open('../fe.json', 'w', encoding='utf-8')
    for provinceItem in cityList.keys():
        for cityItem in cityList[provinceItem].keys():
            cityCode = cityList[provinceItem][cityItem]  # code for this city
            for careerItem in career.keys():  # append the code for each occupation
                try:
                    #     href = urllib.request.Request(("https://%s.58.com/%s/") % (cityCode,career[careerItem]),headers=header[random.randint(0,1)])
                    #     b = bb(urllib.request.urlopen(href).read(), 'lxml')
                    #     total_page = int(b.find('span', {'class': 'num_operate'}).find('i', {'class': 'total_page'}).text)  # total page count for this city
                    for i in range(1):
                        href = requests.get(
                            ("https://%s.58.com/%s/pn%s") %
                            (cityCode, career[careerItem], i),
                            headers=header[random.randint(0, 1)])
                        b = bb(href.text, 'lxml')
                        items = b.findAll('li', {'class': 'job_item clearfix'})
                        for item in items:
                            #    href = urllib.request.Request(item.find('div',{'class':'job_name clearfix'}).find('a')['href'], headers=header[random.randint(0,1)])
                            href = requests.get(
                                item.find('div', {
                                    'class': 'job_name clearfix'
                                }).find('a')['href'],
                                headers=header[random.randint(0, 1)])
                            #   Thread(target=getInformation, args=(
                            #       bb(urllib.request.urlopen(href).read(), 'lxml'),provinceItem,cityItem,sum)).start()
                            Thread(target=getInformation,
                                   args=(bb(href.text, 'lxml'), provinceItem,
                                         cityItem, sum)).start()
                            sum += 1
                        sleep(2)  # sleep 2 seconds after each page is fetched
                except Exception as e:
                    print("1:%s" % e)
                    continue
Example no. 6
def getHref(cityList):
    global sum
    sum = 0
    header = [{
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.84 Safari/537.36'
    }, {
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }]
    global fw
    fw = open('../boss.json', 'w', encoding='utf-8')
    for provinceItem in cityList.keys():  # each province
        for cityItem in cityList[provinceItem].keys():  # each city
            cityCode = cityList[provinceItem][cityItem]  # code for this city
            for i in range(5):  # iterate over each page
                try:
                    href = urllib.request.Request(
                        ("https://www.zhipin.com/c%s/?page=%s") %
                        (cityCode, i + 1),
                        headers=header[random.randint(0, 1)])  # pretend to be a browser
                    b = bb(urllib.request.urlopen(href).read(), 'lxml')
                    items = b.find('div', {
                        'class': 'job-list'
                    }).findAll('li')  # job listings on this page
                    for item in items:
                        href = urllib.request.Request(
                            "https://www.zhipin.com" +
                            item.find('div', {
                                'class': 'info-primary'
                            }).find('h3').find('a')['href'],
                            headers=header[random.randint(0, 1)])
                        Thread(target=getInformation,
                               args=(bb(
                                   urllib.request.urlopen(href).read(),
                                   'lxml'), provinceItem, cityItem,
                                     sum)).start()
                        sum += 1
                except Exception as e:
                    print("1:%s" % e)
                    continue
Example no. 7
def get_nav_links(index_url):
    # collect every album link found on this index page
    html = requests.get(index_url, headers=headers)
    html.encoding = 'gbk'
    soup = bb(html.text, 'lxml')
    neirong = soup.find_all('h2')
    print(neirong)
    with open(os.path.join(FIG_BASE, "page_urls.txt"), 'a') as f:
        for i in neirong:
            print(i.a.get("href"), i.text)
            link = f"{main_domain}/" + i.a.get("href")
            f.write(f"{link}  {i.text} \n")
            get_figs(link)
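
The get_nav_links and get_figs snippets rely on module-level names defined outside these snippets: a `headers` User-Agent dict, the site's `main_domain`, and an output directory `FIG_BASE`. A minimal sketch of that configuration, with placeholder values only:

import os

# Placeholder configuration; the real domain and output directory are defined outside these snippets.
main_domain = 'https://example.com'           # base URL of the site being scraped (placeholder)
FIG_BASE = os.path.join(os.getcwd(), 'figs')  # root directory for downloaded albums (placeholder)
os.makedirs(FIG_BASE, exist_ok=True)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
}
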
Example no. 8
def get_nav_links(index_url):
    # collect every album link found on this index page
    html = requests.get(index_url, headers=headers)
    html.encoding = 'gbk'
    soup = bb(html.text, 'lxml')
    neirong = soup.find_all('h2')
    print(neirong)
    with open("page_urls.txt", 'a') as f:
        for i in neirong:
            print(i.a.get("href"), i.text)
            link = "https://qqh225.com/" + i.a.get("href")
            f.write(f"{link}  {i.text} \n")
            get_figs(link)
Example no. 9
def get_figs(page_url):
    for page_i in range(1, 26):
        if page_i == 1:
            multi_page = page_url
        else:
            multi_page = "{}_{}.html".format(
                page_url.split(".html")[0], page_i)
        page = get_response(multi_page, useproxy=False, retry_counter=10)
        if page.url == f"{main_domain}/cip.asp":
            break
        page.encoding = 'gbk'
        soup = bb(page.text, 'lxml')
        dir_name_patten = re.search(r'([\S*\]*]\S+\s*\S+[\]])(_*)(\W*)',
                                    soup.title.text)  # extract the album name from the page title
        if dir_name_patten:
            dir_name = dir_name_patten.group(1)
        else:
            dir_name = soup.title.text
        dir_name = re.sub(r'[/:*?"<>|]', '-', dir_name)  # replace characters that are illegal in file names
        xiangce_dir = os.path.join(FIG_BASE, dir_name)
        if not os.path.exists(xiangce_dir):
            os.mkdir(xiangce_dir)
        neirong = soup.find_all('p')
        with open(os.path.join(xiangce_dir, "fig_urls.txt"), 'a') as f:
            for i in tqdm(neirong):
                # tqdm shows the download progress for the current page
                try:
                    # print(i.img.get("src"))
                    fig_url = i.img.get("src")
                    # skip the image if the local file already exists
                    fig_path = os.path.join(xiangce_dir,
                                            f'{fig_url.split("/")[-1]}')
                    if os.path.exists(fig_path):
                        continue
                    else:
                        with open(fig_path, 'wb') as fig_f:
                            fig_f.write(
                                get_response(fig_url,
                                             useproxy=False,
                                             retry_counter=3).content)
                        f.write(f"{fig_url} \n")
                except requests.Timeout:
                    continue
                except AttributeError:
                    # the first p tag holds the profile text rather than an image
                    with open(os.path.join(xiangce_dir, "profile.txt"),
                              "a",
                              encoding="utf-8") as profile_f:
                        profile_f.writelines(i.text)
                    continue
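
`get_response` is not defined on this page; judging by its call sites it wraps requests.get with a retry counter and an optional proxy. A hypothetical sketch under those assumptions (it reuses the module-level `headers` dict):

import time

import requests

def get_response(url, useproxy=False, retry_counter=3, proxies=None):
    # Hypothetical retry wrapper; the original implementation is not shown on this page.
    last_error = None
    for _ in range(retry_counter):
        try:
            return requests.get(url,
                                headers=headers,
                                proxies=proxies if useproxy else None,
                                timeout=30)
        except requests.RequestException as e:
            last_error = e
            time.sleep(1)
    raise last_error
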
Example no. 10
def get_figs(page_url):
    for page_i in range(1, 26):
        if page_i == 1:
            multi_page = page_url
        else:
            multi_page = "{}_{}.html".format(
                page_url.split(".html")[0], page_i)
        # page = requests.get(multi_page, headers=headers)
        page = get_response(multi_page, useproxy=False, retry_counter=10)
        page.encoding = 'gbk'
        soup = bb(page.text, 'lxml')
        dir_name = soup.title.text.split("P]")[0]
        dir_name = re.sub(r'[/:*?"<>|]', '-', dir_name)  # replace characters that are illegal in file names
        if page.url == f"{main_domain}/cip.asp":
            break
        xiangce_dir = os.path.join(FIG_BASE, dir_name)
        if not os.path.exists(xiangce_dir):
            os.mkdir(xiangce_dir)
        neirong = soup.find_all('p')
        # print(neirong)
        with open(os.path.join(xiangce_dir, "fig_urls.txt"), 'a') as f:
            for i in neirong:
                try:
                    print(i.img.get("src"))
                    fig_url = i.img.get("src")
                    # skip the image if the local file already exists
                    fig_path = os.path.join(xiangce_dir,
                                            f'{fig_url.split("/")[-1]}')
                    if os.path.exists(fig_path):
                        continue
                    else:
                        socket.setdefaulttimeout(300)
                        opener = urllib.request.build_opener()
                        opener.addheaders = [
                            ('User-Agent', UserAgent(verify_ssl=False).chrome)
                        ]
                        urllib.request.install_opener(opener)
                        urllib.request.urlretrieve(fig_url, fig_path, progress)
                        f.write(fig_url)
                        f.write("\r\n")
                except Exception:  # skip any page or image that fails to download
                    continue
Example no. 11
def get_figs(page_url):
    for page_i in range(1, 26):
        if page_i == 1:
            multi_page = page_url
        else:
            multi_page = "{}_{}.html".format(
                page_url.split(".html")[0], page_i)
        page = requests.get(multi_page, headers=headers)
        page.encoding = 'gbk'
        soup = bb(page.text, 'lxml')
        dir_name = soup.title.text.split("P]")[0]
        dir_name = re.sub(r'[/:*?"<>|]', '-', dir_name)  # replace characters that are illegal in file names
        if page.url == "https://qqi668.com/cip.asp":
            break
        xiangce_dir = os.path.join(os.getcwd(), dir_name)
        if not os.path.exists(xiangce_dir):
            os.mkdir(xiangce_dir)
        neirong = soup.find_all('p')
        # print(neirong)
        with open(os.path.join(xiangce_dir, "fig_urls.txt"), 'a') as f:
            for i in neirong:
                try:
                    print(i.img.get("src"))
                    fig_url = i.img.get("src")
                    # skip the image if the local file already exists
                    fig_path = os.path.join(xiangce_dir,
                                            f'{fig_url.split("/")[-1]}')
                    if os.path.exists(fig_path):
                        continue
                    else:
                        socket.setdefaulttimeout(300)
                        opener = urllib.request.build_opener()
                        opener.addheaders = [(
                            'User-Agent',
                            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36'
                        )]
                        urllib.request.install_opener(opener)
                        urllib.request.urlretrieve(fig_url, fig_path, progress)
                        f.write(fig_url)
                        f.write("\r\n")
                except Exception:  # skip any page or image that fails to download
                    continue
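
`progress` above is passed to urllib.request.urlretrieve as its reporthook, which urllib calls with the block count, the block size, and the total file size. The original hook is not shown; a minimal sketch:

import sys

def progress(block_num, block_size, total_size):
    # Minimal reporthook sketch; the original 'progress' implementation is not shown on this page.
    if total_size > 0:
        percent = min(100.0, block_num * block_size * 100.0 / total_size)
        sys.stdout.write('\rdownloading: %5.1f%%' % percent)
        sys.stdout.flush()
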
def site1():
    j = ''
    e = {}
    p = []
    t = 0
    s = 1
    q = ''
    backdir9 = os.getcwd()
    fine = backdir9 + '\\Data\\name.txt'
    fine1 = backdir9 + '\\Data\\count.txt'
    with open(fine, 'r+') as (f):
        f.truncate()
    with open(fine1, 'r+') as (f):
        f.truncate()
    popup.lead2()
    with open(fine, 'r+') as (f):
        j = f.read()
        f.seek(0)
        f.truncate()
    web = 'http://gen.lib.rus.ec/search.php?req='
    web1 = 'http://gen.lib.rus.ec/'
    web2 = 'http://93.174.95.29'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    book = j
    name = book.replace(' ', '+')
    r = requests.get((web + name), headers=headers)
    soup = bb(r.text, 'lxml')
    match = soup.findAll('td', width='500')
    x = 1
    for d in match:
        g = d.a.text
        o = d.a['href']
        e[g] = o

    with io.open(fine, 'r+', encoding='utf-8') as (f):
        for a in e.keys():
            m = str(x)
            f.write(m + '.  ' + a + '\n\n')
            x += 1

    os.startfile(fine)
    popup.lead3()
    with open(fine1, 'r+') as (f):
        t = int(f.readline())
        f.truncate()
    for d in e.keys():
        if s == t:
            q = d
        s += 1

    new = e[q]
    new2 = web1 + new
    new3 = requests.get(new2, headers=headers)
    soup1 = bb(new3.text, 'lxml')
    match1 = soup1.find('a', {'title': 'Gen.lib.rus.ec'})['href']
    new4 = requests.get(match1, headers=headers)
    soup2 = bb(new4.text, 'lxml')
    latest = web2 + soup2.h2.a['href']
    wb.open_new(latest)
Example no. 13
p = predict_image(cv2.imread(r"E:\system\Pictures\ml project\{}".format(lki)))
print(categories[p])

# In[131]:

from bs4 import BeautifulSoup as bb
import requests as rr

# In[132]:

res = rr.get("https://en.wikipedia.org/wiki/{}".format(categories[p]))

# In[133]:

bes = bb(res.text, 'lxml')

# In[134]:

lin = bes.find('table', class_='infobox biota')

# In[135]:

logo = lin.find("a", class_="image")
tree = logo.img.get('src')

# In[136]:

from IPython.display import Image
from IPython.core.display import HTML
print("The image of the tree is:")
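
A hypothetical follow-up cell, assuming the notebook goes on to render the scraped thumbnail; Wikipedia img src values are protocol-relative, so an 'https:' prefix is added before passing the URL to IPython.display.Image.

Image(url='https:' + tree)  # display the infobox image inline (hypothetical continuation)
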