Example #1
    def get_album_info(self):
        """获取专辑信息"""
        url = 'http://download.csdn.net/addalbum/%d' % self.album_id
        page = netx.get(url, cookies=self.cookies, need_print=False)
        soup = BeautifulSoup(page, "html.parser")
        key_list = [
            'title',
            ('discription', 'textarea'),  # 'discription' [sic] is the site's own field name
            'tag',
            ('categorys', 'select'),
            ('category', 'select'),
            'type',
            'imagesrc',
            'ids',
            'album',
        ]
        params = dict()
        for key in key_list:
            # Unpack the optional tag name; plain string keys default to <input>
            if isinstance(key, tuple):
                key, tag_name = key
            else:
                tag_name = 'input'

            # Locate the element
            if key == 'type':
                # Only the option that is currently checked
                key_tag = soup.find(tag_name,
                                    attrs={
                                        'name': key,
                                        'checked': 'checked'
                                    })
            else:
                key_tag = soup.find(tag_name, attrs={'name': key})
            if not key_tag:
                print('%s not found' % key)
                continue

            # Extract the value
            if key == 'imagesrc':
                value = None
            elif key == 'ids':
                value = list()
            elif 'value' in key_tag.attrs:
                value = key_tag['value']
            elif 'def' in key_tag.attrs:
                value = key_tag['def']
            else:
                value = key_tag.text
            params[key] = value

        # Read the attached file list
        ul = soup.select_one('ul.add_source_list.items-list')
        a_list = ul.select('a.item')
        for a in a_list:
            params['ids'].append(a['id'])
        # Sort in descending order
        params['ids'].sort(reverse=True)
        self.params = params
        print(params)
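
The form scraping above leans on BeautifulSoup's find with an attrs filter (netx and filex are the author's own helper modules). A minimal, self-contained demo of that lookup pattern, using made-up HTML:

    from bs4 import BeautifulSoup

    html = '<form><input name="title" value="My Album"/>' \
           '<textarea name="discription">some text</textarea></form>'
    soup = BeautifulSoup(html, 'html.parser')
    # <input> values live in the 'value' attribute ...
    print(soup.find('input', attrs={'name': 'title'})['value'])        # My Album
    # ... while <textarea> content is the tag text
    print(soup.find('textarea', attrs={'name': 'discription'}).text)   # some text
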
Example #2
    def run(self):
        """运行"""
        url = 'https://phoenix.ziroom.com/v7/room/detail.json'
        # Overridden just below by the full set of captured request parameters
        params = {
            'id': self.room_id,
            'city_code': self.city_code
        }
        params = netx.parse_params("""
house_id	60163300
sign	3819b75389894cc18418b81e882d560a
size	4
timestamp	1520072313
os	android:7.0
network	WIFI
sign_open	1
app_version	5.5.0
imei	868030026509339
id	61015398
ip	192.168.199.128
uid	0
city_code	110000
page	1
model	MI 5
        """)
        print(url)
        print(params)
        result = netx.get(url, params, result_type='json', need_print=False)
        if result['status'] == 'success':
            data = result['data']
            print(data)
        else:
            print(result)
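
netx.parse_params is part of the author's helper library; judging by the call above, it turns a tab-separated key/value dump (as copied from a capture tool) into a dict. A plausible sketch of such a helper, assuming that input format:

    def parse_params(text):
        """Hypothetical: parse 'key<TAB>value' lines into a params dict."""
        params = {}
        for line in text.strip().splitlines():
            parts = line.split(None, 1)  # split on the first run of whitespace
            if len(parts) == 2:
                params[parts[0]] = parts[1].strip()
        return params
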
Example #3
 def translate(en):
     js = Py4Js()
     tk = js.getTk(en)
     tk2 = get_google_tk(en)
     if tk != tk2:
         print('computed tk values differ')
         filex.write('data/error_tk.txt', en + '\n', 'a')
         return en
     url = "http://translate.google.cn/translate_a/single?client=t" \
           "&sl=en&tl=zh-CN&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca" \
           "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1" \
           "&srcrom=0&ssel=0&tsel=0&kc=2&tk=%s&q=%s" % (tk, urllib.parse.quote(en))
     result = netx.get(url, need_print=False)
     """
     [[["测试","test",null,null,2],[null,null,"Cèshì","test"]],...]
     """
     if result:
         result = json.loads(result)
         if result:
             # First result block
             first_result = result[0]
             # Leading entries are translations; the last one may carry pinyin
             cn = ''
             for translation in first_result:
                 if len(translation) == 5:
                     # The first element of each segment is the translated text
                     cn += translation[0]
             cn = GoogleTranslator.process_result(en, cn)
             return cn
     return None
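
The parsing above depends on the nested-list shape shown in the inline sample: translation segments carry five elements, while the trailing pinyin segment has four. A tiny worked demo of that filter on the sample data:

    import json

    sample = '[[["测试","test",null,null,2],[null,null,"Cèshì","test"]]]'
    data = json.loads(sample)
    # Keep only 5-element translation segments, skipping the pinyin entry
    cn = ''.join(segment[0] for segment in data[0] if len(segment) == 5)
    print(cn)  # 测试
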
Example #4
    def get_dior_details(source_file, result_file):
        """获取口红详情"""
        lipstick_list = filex.read_lines(source_file, ignore_line_separator=True)
        length = len(lipstick_list)
        for i in range(length):
            lipstick = Lipstick.from_string(lipstick_list[i])
            print('Fetching lipstick %d/%d' % (i + 1, length))

            url = ChooseLipstick.dior_host + urllib.parse.quote(lipstick.url)
            page = netx.get(url, need_print=False)
            soup = BeautifulSoup(page, "html.parser")
            cover_img_tag = soup.select_one('.png-bg.cover-bg')
            # all_image = cover_img['data-zoom-views']
            cover_img = cover_img_tag.select_one('.js-cover-img')['src']
            cover_img = ChooseLipstick.dior_host + cover_img

            # name = soup.select_one('.quickbuy-title').string
            # desc = soup.select_one('.quickbuy-subtitle').string
            price = soup.select_one('.details-price.js-order-value').string.strip()
            color_name = soup.select_one('.swatches-list').select_one('li.selected').select_one('a')['data-swatch-name']
            # color_span = soup.select_one('.swatch-name.js-swatch-name')
            # color = color_span.select_one('span').string
            # swatches_list = soup.select_one('.swatches-list.js-products-selector')
            # swatches = swatches_list.select_one('li.selected')
            lipstick.url = url
            lipstick.price = price
            lipstick.name = color_name
            lipstick.img = ','.join((lipstick.img, cover_img))
            filex.write_lines(result_file, [str(lipstick)], mode='a', add_line_separator=True)
Example #5
 def get_ysl_list(result_file):
     """读取口红列表"""
     # 官网的读不出来列表,反正也不多,手动加一下
     url_list = [
         'http://www.yslbeautycn.com/product/00030YSL.html',
         'http://www.yslbeautycn.com/product/00031YSL.html',
     ]
     result = list()
     i = 0
     for details_url in url_list:
         page = netx.get(details_url, need_print=False)
         soup = BeautifulSoup(page, "html.parser")
         category = soup.select_one('.pdp_top_content_wrapper').select_one('.product_subtitle').string
         category = category.replace('圣罗兰', '')  # strip the Chinese brand name
         # image = soup.select_one('.primary_image')['src']
         # color_2 = soup.select_one('.product_image.b-product_img')['src']
         color_list = soup.select_one('.swatches.js_swatches.color.contentcarousel_list')
         for color_li in color_list.select('li'):
             for color_div in color_li.select('div'):
                 url = color_div.select_one('a')['href']
                 color_image = color_div.select_one('img')['src']
                 name = color_div.select_one('span').string
                 name = name.replace('(', '(').replace(')', ')')  # normalize full-width parentheses
                 split_list = name.split('(', 1)
                 if len(split_list) > 1:
                     name = split_list[0].strip()
                     other = '(' + split_list[1].strip()
                 else:
                     other = ''
                 i += 1
                 lipstick = Lipstick('%03d' % i, category, name, url, '', other, color_image)
                 result.append(str(lipstick))
     filex.write_lines(result_file, result, add_line_separator=True)
Example #6
    def get_dior_list(result_file):
        """读取口红列表"""
        url = 'https://www.dior.cn/beauty/zh_cn/%E9%A6%99%E6%B0%9B%E4%B8%8E%E7%BE%8E%E5%AE%B9/%E5%BD%A9%E5%A6%86/%E5' \
              '%94%87%E9%83%A8/%E5%94%87%E8%86%8F/fr-lipsticks-%E5%94%87%E8%86%8F.html '
        page = netx.get(url, need_print=False)

        # 解析结果
        soup = BeautifulSoup(page, "html.parser")
        result = list()
        i = 0
        for category in soup.select('.category.js-category'):
            # Top-level category
            category_name = category.select_one('.category-title').string.replace('Dior迪奥', '')
            print('\nCategory: %s' % category_name)
            for column in category.select('.column.product'):
                # Each product line
                legend_name = column.select_one('.legend-name').string.replace('Dior迪奥', '')
                legend_desc = column.select_one('.legend-description').string.strip()
                print('Line name: ' + legend_name)
                legend_swatches_list = column.select_one('.legend-swatches-list')
                for legend_li in legend_swatches_list.select('li'):
                    a = legend_li.find('a')
                    url = a['href']
                    color = a.find('img')
                    image = ChooseLipstick.dior_host + color['src']
                    i += 1
                    lipstick = Lipstick('%03d' % i, category_name + '-' + legend_name, '', url, '', legend_desc, image)
                    result.append(str(lipstick) + '\n')
        filex.write_lines(result_file, result)
Example #7
    def test_browse(self):
        """
        测试访问
        通过学习加密我们知道,discuz 将相关内容保存在了 cookie 中
        解密时,没有过期时间,会一直解密成功,然后用密码去判断(该密码仅用于判断,与真实密码无关)
        cookie 是会失效的,但是如果我们将 cookie 保存下来,那么就可以一直使用了,永不失效(除非更改密码)。
        """
        url = 'http://localhost/'
        r = netx.get(url, need_print=False)
        if '退出' in r:  # the '退出' (Log out) link only appears when logged in
            print('Logged in')
        else:
            print('Not logged in')

        r = netx.get(url,
                     cookies=netx.parse_cookies_from_file(self.cookie_file),
                     need_print=False)
        if '退出' in r:
            print('Logged in')
        else:
            print('Not logged in')
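
netx.parse_cookies_from_file is the author's helper and its implementation isn't shown here. A plausible sketch, assuming the file holds a raw 'k1=v1; k2=v2' Cookie header dump saved from the browser:

    def parse_cookies_from_file(path):
        """Hypothetical: read a 'k1=v1; k2=v2' cookie dump into a dict."""
        with open(path, encoding='utf-8') as f:
            raw = f.read().strip()
        return {
            k.strip(): v
            for k, v in (pair.split('=', 1) for pair in raw.split(';') if '=' in pair)
        }
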
Example #8
 def get_in_thread(self, element, element_index, thread_id):
     url = self.url
     url += str(int(time.time() * 1000))
     print(url)
     result = netx.get(url, need_print=False)
     # Strip the JSONP wrapper; rindex guards against ')' inside the payload
     result = result[result.index('(') + 1:result.rindex(')')]
     result = json.loads(result)
     # Keep only 11-digit entries (valid mobile numbers)
     num_list = list(filter(lambda x: len(str(x)) == 11, result['numArray']))
     num_list = [str(x) for x in num_list]
     print(num_list)
     print(f'Fetched {len(num_list)} numbers')
     num_list = list(filter(lambda x: x not in self.numbers, num_list))
     print(f'{len(num_list)} numbers left after filtering')
     self.numbers.extend(num_list)
     print(f'{len(self.numbers)} numbers in total now')
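
Both this method and get_num in Example #10 unwrap a JSONP response by slicing between the parentheses. A regex-based sketch of the same unwrapping, shown on made-up callback data:

    import json
    import re

    def strip_jsonp(text):
        # Hypothetical helper: the greedy match pairs the first '(' with the
        # last ')', so parentheses inside the payload are handled too
        match = re.search(r'\((.*)\)', text, re.S)
        return json.loads(match.group(1)) if match else json.loads(text)

    print(strip_jsonp('cb({"numArray": [13800138000]})'))  # {'numArray': [13800138000]}
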
Example #9
 def get_ysl_details(source_file, result_file):
     lines = filex.read_lines(source_file, ignore_line_separator=True)
     length = len(lines)
     for i in range(length):
         print('Fetching %d/%d' % (i + 1, length))
         line = lines[i]
         lipstick = Lipstick.from_string(line)
         # Some colors are unspecified; opening those pages falls back to the default color
         page = netx.get(lipstick.url, need_print=False)
         soup = BeautifulSoup(page, "html.parser")
         cover_image = soup.select_one('.primary_image')['src']
         color_image2 = soup.select_one('.product_tab_shades_left').select_one('.product_image.b-product_img')['src']
         price = soup.select_one('.product_price.price_sale.b-product_price-sale').text.strip()
         lipstick.img = ','.join((lipstick.img, color_image2, cover_image))
         lipstick.price = price
         filex.write_lines(result_file, [str(lipstick)], mode='a', add_line_separator=True)
Example #10
 def get_num(self):
     url = self.url
     url += str(int(time.time() * 1000))
     print(url)
     result = netx.get(url)
     # Strip the JSONP wrapper; rindex guards against ')' inside the payload
     result = result[result.index('(') + 1:result.rindex(')')]
     result = json.loads(result)
     # Keep only 11-digit entries (valid mobile numbers)
     num_list = list(filter(lambda x: len(str(x)) == 11, result['numArray']))
     num_list = [str(x) for x in num_list]
     print(num_list)
     print(f'Fetched {len(num_list)} numbers')
     old_numbers = filex.read_lines(self.num_file_path, ignore_line_separator=True)
     print(f'{len(old_numbers)} numbers already on file')
     num_list = list(filter(lambda x: x not in old_numbers, num_list))
     print(f'{len(num_list)} numbers left after filtering')
     filex.write_lines(self.num_file_path, num_list, 'a', add_line_separator=True)
Example #11
    def get_floor_of_page(tid, page, result_file):
        """
        获取某一页的所有楼层
        :param tid: 贴子id
        :param page: 页数
        :param result_file:结果文件 
        :return: 
        """
        url = 'http://tieba.baidu.com/p/%s?pn=%d' % (tid, page)
        html = netx.get(url)
        print('Parsing started')
        # Parse the result
        soup = BeautifulSoup(html, "html.parser")
        floor = soup.select('.l_post.j_l_post.l_post_bright')
        result = []
        for div in floor:
            # Entries containing 'clearfix' are ads
            if 'clearfix' not in div['class']:
                data_field = json.loads(div['data-field'])
                # print(data_field)

                author = data_field['author']
                name = author['user_name']
                level_id = author['level_id']

                content = data_field['content']
                post_no = content['post_no']
                post_id = content['post_id']

                content = div.select_one('#post_content_' + str(post_id))
                if content is not None:
                    content = content.text
                else:
                    content = ''
                result.append(str(Floor(floor_no=post_no, name=name, level=level_id, content=content.lstrip())) + '\n')

        filex.write_lines(result_file, result, mode='a')
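
The per-floor metadata above comes from Tieba's data-field attribute, which embeds JSON in each post's wrapper div. A self-contained demo of that extraction, using made-up markup:

    import json
    from bs4 import BeautifulSoup

    html = """<div class="l_post" data-field='{"author": {"user_name": "u", "level_id": 3}, "content": {"post_no": 1, "post_id": 100}}'></div>"""
    div = BeautifulSoup(html, 'html.parser').div
    field = json.loads(div['data-field'])
    print(field['author']['user_name'], field['content']['post_no'])  # u 1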