Example #1
from bs4 import BeautifulSoup  # parser used throughout these examples


def get_content(html, page):
    # Output template: one record per post, separated by a dashed line.
    # my_print() and save_txt() are helpers defined elsewhere in the source.
    output = """page {}, author: {}, gender: {}, age: {}, thumb: {}, comment: {}\n{}\n-----\n"""
    soup = BeautifulSoup(html, 'html.parser')
    con = soup.find(name='div', attrs={'class': 'col1 old-style-col1'})

    con_list = con.find_all('div', class_='article')

    for i in con_list:
        my_print(i)
        author = i.find('h2').string
        content = i.find('div', class_='content').find('span').get_text()
        stats = i.find('div', class_='stats')
        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
        author_info = i.find('div', class_='articleGender')
        if author_info is not None:
            class_list = author_info['class']
            if 'womenIcon' in class_list:
                gender = 'woman'
            elif 'manIcon' in class_list:
                gender = 'man'
            else:
                gender = ''
            age = author_info.string
        else:
            gender = ''
            age = ''

        save_txt(
            output.format(page, author, gender, age, vote, comment, content))
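Judging by the CSS class names ('col1 old-style-col1', 'articleGender', 'stats-vote'), the target looks like qiushibaike's text channel. A minimal driver sketch for the parser above; the URL pattern and page count are my assumptions, not confirmed by the source:

import requests

def crawl(total_pages=3):
    # Hypothetical entry point: fetch each list page (assumed URL pattern)
    # and hand the raw HTML to get_content() above.
    for page in range(1, total_pages + 1):
        url = 'https://www.qiushibaike.com/text/page/{}/'.format(page)
        resp = requests.get(url, timeout=5)
        get_content(resp.text, page)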
Example #2
    def get_cookie(self):
        # Visit the ordinary job-list page once so Lagou's server sets the
        # session cookies its JSON endpoint requires.
        url = "https://www.lagou.com/jobs/list_{}?labelWords=&fromSearch=true&suginput=".format(self.job)
        s = requests.Session()
        bs_myprint.my_print(url)
        h = self.get_headers()
        bs_myprint.my_print(h)
        s.get(url, data=self.data, headers=h, timeout=5)
        return s.cookies
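The jar this returns is meant to feed straight into the JSON POST, which is exactly what get_json_page() in Example #6 does. A usage sketch, with `spider` as a hypothetical instance of the surrounding class:

import requests

# Prime the session cookies once, then reuse them for the API call.
cookies = spider.get_cookie()
resp = requests.post(spider.url, data=spider.data,
                     headers=spider.get_headers(),
                     cookies=cookies, timeout=5)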
Example #3
import time

from bs4 import BeautifulSoup


def get_pic_list(url):
    # Parse one list page, then crawl every photo-set page it links to.
    # download_page() and get_url_ref() are helpers defined elsewhere.
    html = download_page(url)
    soup = BeautifulSoup(html, 'html.parser')
    box_list = soup.find_all('div', class_='list4-box')
    ref = get_url_ref(url)[:-1]  # site root without the trailing '/'
    for box in box_list:
        a_tag = box.find('li', class_='title').find('a')
        link = a_tag.get('href')
        my_print(ref + link)
        get_pic(ref + link)
        time.sleep(1)  # throttle: one detail page per second
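download_page() and get_url_ref() are helpers defined elsewhere in the script. The bodies below are guesses inferred from how they are called; the [:-1] in the caller suggests get_url_ref() returns the site root with a trailing '/':

import requests
from urllib.parse import urlsplit

def download_page(url):
    # Presumably fetches a page and returns its HTML text.
    resp = requests.get(url, timeout=5)
    resp.raise_for_status()
    return resp.text

def get_url_ref(url):
    # Presumably returns the site root with a trailing '/', e.g.
    # 'https://www.51tietu.net/' for any page on that site.
    parts = urlsplit(url)
    return '{}://{}/'.format(parts.scheme, parts.netloc)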
Example #4
import threading
import time


def main():
    queue = list(range(1, 2))  # pages to crawl; range(1, 2) is just page 1
    threads = []
    while len(queue) > 0 or len(threads) > 0:
        # Reap finished workers; build a new list instead of removing
        # items while iterating, which skips elements.
        threads = [t for t in threads if t.is_alive()]

        # Keep at most one worker alive; raise the bound for real concurrency.
        while len(threads) < 1 and len(queue) > 0:
            cur_page = queue.pop(0)
            url = 'https://www.51tietu.net/xiezhen/{}'.format(cur_page)
            thread = threading.Thread(target=get_pic_list, args=(url,))
            thread.daemon = True  # setDaemon() is deprecated
            thread.start()
            my_print('download page {}'.format(cur_page))
            threads.append(thread)

        time.sleep(0.1)  # avoid busy-spinning while workers run
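The loop above hand-rolls a bounded worker pool, and with the len(threads) < 1 cap it is effectively single-threaded. The standard library expresses the same pattern more compactly; a sketch using ThreadPoolExecutor with get_pic_list() from Example #3:

from concurrent.futures import ThreadPoolExecutor

def main_with_pool():
    urls = ['https://www.51tietu.net/xiezhen/{}'.format(p)
            for p in range(1, 2)]
    # max_workers=1 mirrors the len(threads) < 1 bound above;
    # raise it for real concurrency.
    with ThreadPoolExecutor(max_workers=1) as pool:
        pool.map(get_pic_list, urls)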
Example #5
import os
import string
import time

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter


def get_pic(link):
    html = download_page(link)
    soup = BeautifulSoup(html, 'html.parser')
    pic_box = soup.find('div', class_='pic-box')

    # Strip punctuation from the title so it is safe as a directory name.
    title = soup.find('h1').string.strip()
    table = str.maketrans('', '', string.punctuation)
    title = title.translate(table)

    create_dir(RESULT_DIR + '{}'.format(title))
    my_print(RESULT_DIR + '{}'.format(title))

    pic_list = pic_box.find_all('img')
    i = 0

    headers = get_headers(link)

    # One session for the whole photo set, retrying failed connections.
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=3))

    for pic in pic_list:
        pic_link = pic.get('src')  # avoid shadowing the `link` argument
        pic_path = RESULT_DIR + '{}/'.format(title) + str(i) + '.jpg'

        # Skip images that were already downloaded on a previous run.
        if os.path.exists(pic_path):
            i = i + 1
            continue

        my_print(pic_link, DEBUG)

        try:
            r = session.get(pic_link, headers=headers, timeout=5)
            with open(pic_path, 'wb') as f:
                i = i + 1
                f.write(r.content)
                time.sleep(1)
        except requests.exceptions.ConnectionError:
            print('url failed: {}'.format(pic_link))
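Mounting HTTPAdapter(max_retries=3) retries failed connections, but not HTTP error responses. If retrying on server errors matters, urllib3's Retry gives finer control; a sketch of a drop-in replacement for the two session lines above (the backoff and status list are my choices):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry connection failures and the listed status codes, with
# exponential backoff between attempts.
retry = Retry(total=3, backoff_factor=1,
              status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retry))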
Example #6
    def get_json_page(self, page):
        self.page = page
        data = self.data  # e.g. {'first': 'true', 'pn': str(page), 'kd': self.job}
        resp = requests.post(self.url, data=data, headers=self.get_headers(),
                             cookies=self.get_cookie(), timeout=3)
        bs_myprint.my_print(resp.status_code)

        result_json = resp.json()

        # A non-empty 'msg' means Lagou's anti-crawler rejected the request.
        if result_json.get('msg'):
            bs_myprint.my_print(result_json)
            return None

        list_con = result_json['content']['positionResult']['result']
        list_hr = result_json['content']['hrInfoMap']  # dict keyed by position id
        info_list = []
        info_hr = []

        # '无' ("none") is the placeholder for missing fields.
        fields = ['companyShortName', 'companyFullName', 'industryField',
                  'financeStage', 'companySize', 'salary', 'city', 'education']
        for k in list_con:
            info = [k.get(f, '无') for f in fields]
            info_list.append(info)
            bs_myprint.my_print(info)

        for value in list_hr.values():
            hr = [value.get('realName', '无'), value.get('userId', '无')]
            portrait = value.get('portrait')
            if portrait:  # only build the avatar URL when one actually exists
                hr.append('https://www.lgstatic.com/' + portrait)
            info_hr.append(hr)
            bs_myprint.my_print(hr)

        return info_list, info_hr
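Callers must handle the None that get_json_page() returns when the API answers with an error message. A paging sketch, with `spider` as a hypothetical instance of the class and the sleep interval my own choice:

import time

page = 1
while True:
    result = spider.get_json_page(page)
    if result is None:
        break  # the API returned an error message; stop paging
    info_list, info_hr = result
    page += 1
    time.sleep(3)  # throttle to reduce the chance of being blocked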
Example #7
import requests


def get_json(url, page, lang_name):
    # Module-level variant of Example #6; headers, get_cookie() and
    # bs_myprint are defined elsewhere in the module.
    data = {'first': 'true', 'pn': str(page), 'kd': lang_name}

    resp = requests.post(url,
                         data=data,
                         headers=headers,
                         cookies=get_cookie(),
                         timeout=5)
    result_json = resp.json()

    # A non-empty 'msg' means Lagou's anti-crawler rejected the request.
    if result_json.get('msg'):
        bs_myprint.my_print(result_json)
        return None

    list_con = result_json['content']['positionResult']['result']
    list_hr = result_json['content']['hrInfoMap']  # dict keyed by position id
    info_list = []
    info_hr = []

    # '无' ("none") is the placeholder for missing fields.
    fields = ['companyShortName', 'companyFullName', 'industryField',
              'financeStage', 'companySize', 'salary', 'city', 'education']
    for k in list_con:
        info = [k.get(f, '无') for f in fields]
        info_list.append(info)
        bs_myprint.my_print(info)

    for value in list_hr.values():
        hr = [value.get('realName', '无'), value.get('userId', '无')]
        portrait = value.get('portrait')
        if portrait:  # only build the avatar URL when one actually exists
            hr.append('https://www.lgstatic.com/' + portrait)
        info_hr.append(hr)
        bs_myprint.my_print(hr)

    return info_list, info_hr
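Neither variant persists its rows. A sketch of writing info_list to CSV; the file name, the column labels (taken from the field names above), and the utf-8-sig encoding (so spreadsheet apps render the Chinese placeholders) are my choices:

import csv

def save_csv(info_list, path='lagou.csv'):
    header = ['companyShortName', 'companyFullName', 'industryField',
              'financeStage', 'companySize', 'salary', 'city', 'education']
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(info_list)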