Example #1
    def gooo(self):
        num = 1
        i = 50060
        # while True:
        url = "https://nba.udn.com/nba/index?gr=www"
        response = requests.get(url)
        time.sleep(5)
        # Check that the HTTP status code is 200 OK
        if response.status_code == 200:
            # Open a UTF-8 encoded text file
            f = codecs.open('nba.txt', 'w', encoding='utf-8')
            f.write(response.text)
            f.close()

            d = pyquery.PyQuery(response.text)
            # print(d)
            posts = d('div#mainbar>div#news>#news_body>dl>dt')
            for post in posts.items():
                # print(post)
                txt1 = post('h3').text()
                # print("####################################""""start""""######################################")
                href = post('a').attr('href')
                if href is None:
                    continue  # skip posts without a link instead of aborting the whole crawl
                fhref = "https://nba.udn.com" + href
                # grabPage(fhref,txt1,i)

                print("Detail page -------------------------------------------------------------------------")

                response = requests.get(fhref)
                time.sleep(3)
                # print(response)
                # Check that the HTTP status code is 200 OK
                if response.status_code == 200:
                    # Open a UTF-8 encoded text file
                    f = codecs.open('nda_det.txt', 'w', encoding='utf-8')
                    f.write(response.text)
                    f.close()


                    # Turn the downloaded source into a PyQuery document
                    e = pyquery.PyQuery(response.text)
                    # Read the page data
                    # print(e)
                    dt = e('div#story_body_content>span>p').text()
                    dtc = dt.split(' ', 1)
                    news_body = dtc[1] if len(dtc) > 1 else dt  # guard against a missing delimiter
                    # print(dtc[1])
                    print(txt1)
                    # print(fhref)

                    ####

                    # Insert only when this title is not already stored
                    with connection.cursor() as cursor:
                        cursor.execute(
                            "select Title from intime_news where Title = %s",
                            (txt1,))
                        data = cursor.fetchone()

                    if data is None:
                        with connection.cursor() as cursor:
                            sql = """insert into intime_news(Title,news,http)
                                values(%s,%s,%s)"""
                            try:
                                cursor.execute(sql, (txt1, news_body, fhref))
                            except Exception:
                                connection.rollback()
                    time.sleep(1)  # custom delay
                    i = i+1
        else:
            print('Search page did not return HTTP 200')
        num += 1
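
The method above opens a fresh TCP connection for every request and never sets a timeout. A minimal sketch of the same fetches through a shared requests.Session (the helper name and timeout value are illustrative, not from the original):

import requests

session = requests.Session()  # reuse connections across the index and detail fetches

def fetch(url):
    # a timeout keeps the crawler from hanging on a dead host;
    # raise_for_status() surfaces non-200 replies instead of parsing error pages
    response = session.get(url, timeout=10)
    response.raise_for_status()
    return response.text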
Example #2
    def parse(self, response):
        """
        目前发现两种情况的页面
        http://yongzhou.huangye88.com/xinxi/946_232725872.html
        http://shannan.huangye88.com/xinxi/9465_232727550.html


        移动站
        http://m.huangye88.com/yiqiyibiao/232631426.html      已被删除
        http://m.huangye88.com/d-zunyi/13451-232631425.html   正常
        http://m.huangye88.com/d-nanjing/946-232631428.html   正常2
        http://m.huangye88.com/jiancai/232631442.html         带图片的

        """
        cate_name_3 = ''
        cate_name_2 = ''
        price_unit = ''
        update_time = ''
        price = ''
        send_time = ''
        keywords = ''
        com_username = ''
        send_money = ''
        thumb = ''
        title = ''
        offer_num = ''
        detail = ''
        seller = ''
        min_price = ''
        cate_name_1 = ''
        to_area = ''
        fax = ''
        thumb_2 = ''
        brand = ''
        thumb_1 = ''
        attrs_kv = []
        min_amount = ''
        auth = ''
        telephone = ''
        ww = ''
        wechat = ''
        source_url = ''
        com_addr = ''
        qq = ''
        mobile = ''
        com_url = ''
        from_area = ''
        max_price = ''
        com_name = ''

        source_url = response.url
        # try:
        #     title = response.xpath('//div[@class="pro-text"]/h1/text()').extract()[0]
        # except:
        #     pass
        # if not title:
        #     try:
        #         title = response.xpath('//div[@class="topinfo"]/h1/text()').extract()[0]
        #     except:
        #         pass
        # try:
        #     price = response.xpath('//h3[@class="big"]/text()').extract()[0].replace(u'\xa0', '').replace(u'¥', '')
        # except:
        #     pass
        # if not price:
        #     try:
        #         price = response.xpath('//h3[@class="pirce"]/text()').extract()[0].replace(u'元', '')
        #     except:
        #         pass
        # if price:
        #     min_price = price
        #     max_price = price
        # attr_key = []
        # attr_value = []
        # try:
        #     attr_key = response.xpath('//td[@class="attribute"]/div/text()').extract()
        # except:
        #     pass
        #
        # try:
        #     attr_value = response.xpath('//td[@class="attribute-value"]/div/text()').extract()
        # except:
        #     pass
        #
        # try:
        #     for i in range(len(attr_key)):
        #         k = attr_key[i]
        #         v = attr_value[i]
        #         str = k + '|' + v
        #         attrs_kv.append(str)
        # except:
        #     pass
        # imgs = []
        # try:
        #     imgs = response.xpath('//div[@id="picsUrl"]/a/@big').extract()
        # except:
        #     pass
        # try:
        #     thumb = imgs[0]
        # except:
        #     pass
        # try:
        #     thumb_1 = imgs[1]
        # except:
        #     pass
        # try:
        #     thumb_1 = imgs[2]
        # except:
        #     pass

        # ------------------------------------ mobile site ---------------------------------------
        # use this element to decide whether the listing has been deleted
        if response.xpath('//section[@class="mianbaoxie"]/span'):
            # not deleted
            try:
                title = response.xpath(
                    '//div[@class="text-desc"]/div/h1/text()').extract()[0]
            except:
                pass
            try:
                price = response.xpath(
                    '//ul[@class="no-price"]/li/span/text()').extract()[0]
            except:
                pass
            if not price:
                try:
                    price = response.xpath(
                        '//span[@class="price left"]/text()').extract()[0]
                except:
                    pass
            try:
                for i in response.xpath('//div[@class="list-desc h"]/ul/li'):
                    k = i.xpath('label/text()').extract()[0]
                    v = i.xpath('span/text()').extract()[0]
                    pair = k + '|' + v  # avoid shadowing the builtin str
                    attrs_kv.append(pair)
            except:
                pass
            imgs = []
            try:
                imgs = response.xpath(
                    '//ul[@class="swiper-wrapper"]/li/img/@data-src').extract(
                    )
            except:
                pass
            try:
                thumb = imgs[0]
            except:
                pass
            try:
                thumb_1 = imgs[1]
            except:
                pass
            try:
                thumb_2 = imgs[2]
            except:
                pass
            try:
                # cate_name_1 = response.xpath('//section[@class="mianbaoxie"]/a[1]/text()').extract()[0]
                # cate_name_2 = response.xpath('//section[@class="mianbaoxie"]/a[2]/text()').extract()[0]
                # cate_name_3 = response.xpath('//section[@class="mianbaoxie"]/a[3]/text()').extract()[0]
                # query the (internal) classification API for the category path
                rsp = requests.post('http://192.168.14.1:8000/pre_api/',
                                    data={'title': title})
                rsp = json.loads(rsp.text)["data"]
                cate_name_1 = rsp[0]
                cate_name_2 = rsp[1]
                cate_name_3 = rsp[2]

            except:
                pass
            try:
                com_name = response.xpath(
                    '//li[@class="last"]/span/a/text()').extract()[0]
                com_url = response.xpath(
                    '//li[@class="last"]/span/a/@href').extract()[0]
            except:
                pass
            try:
                for i in response.xpath('//div[@class="list-desc"]/ul/li'):
                    if i.xpath('a'):
                        if u'地区' == i.xpath('a/label/text()').extract()[0]:
                            com_addr = i.xpath('a/span/text()').extract()[0]
                    if i.xpath('label') and u'联系' == i.xpath(
                            'label/text()').extract()[0]:
                        seller = i.xpath('span/text()').extract()[0].replace(
                            u'\xa0', '')
                        mobile = i.xpath('span/a/text()').extract()[0]
            except:
                pass
            doc = pyquery.PyQuery(response.text)
            detail_doc = ''
            try:
                detail_doc = doc('.limit-height')
                for i in detail_doc('img').items():
                    src = i.attr('src')
                    if not src:
                        i.remove()
                        continue  # nothing to re-host for an empty src
                    upyun_pic = shuffle_image_push(response.url, src)
                    i.attr('src', upyun_pic)
            except:
                pass
            detail = detail_doc.outer_html() if detail_doc else ''
            detail = detail + u'<p>%s</p><p>联系人:%s</p><p>企业地址:%s</p>' % (
                com_name, seller, com_addr)
            if thumb:
                try:
                    thumb = shuffle_image_push(response.url, thumb)
                except:
                    pass
            if thumb_1:
                try:
                    thumb_1 = shuffle_image_push(response.url, thumb_1)
                except:
                    pass
            if thumb_2:
                try:
                    thumb_2 = shuffle_image_push(response.url, thumb_2)
                except:
                    pass
            goods_data = {
                'source_url': source_url,
                'title': title,
                'price': price,
                'min_price': min_price,
                'max_price': max_price,
                'price_unit': price_unit,
                'min_amount': min_amount,
                'keywords': keywords,
                'brand': brand,
                'to_area': to_area,
                'from_area': from_area,
                'attrs_kv': attrs_kv,
                'cate_name_1': cate_name_1,
                'cate_name_2': cate_name_2,
                'cate_name_3': cate_name_3,
                'thumb': thumb,
                'thumb_1': thumb_1,
                'thumb_2': thumb_2,
                'detail': detail,
                'com_name': com_name,
                'com_addr': com_addr,
                'seller': seller,
                'telephone': telephone,
                'mobile': mobile,
                'qq': qq,
                'ww': ww,
                'wechat': wechat,
                'fax': fax,
                'com_url': com_url,
                'update_time': datetime.datetime.now().strftime('%Y-%m-%d'),
                'sendtime': '',
                'com_username': '',
                'send_money': '',
                'offer_num': '',
                'auth': ''
            }
            try:
                yield scrapy.Request(url=com_url,
                                     meta={"goods_data": goods_data},
                                     callback=self.parse2)
            except:
                pass
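
Most of the try/except blocks in this spider only guard against extract() returning an empty list. parsel, the selector library behind Scrapy responses, provides .get() with a default for exactly that case; a sketch of the title/price lookups rewritten this way (same XPaths as above, helper name is illustrative):

def extract_title_and_price(response):
    # .get() returns the first match or the default, replacing extract()[0] + try/except
    title = response.xpath('//div[@class="text-desc"]/div/h1/text()').get(default='')
    price = (response.xpath('//ul[@class="no-price"]/li/span/text()').get()
             or response.xpath('//span[@class="price left"]/text()').get(default=''))
    return title, price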
Example #3
def get_article():
    # fetch the page with requests
    r = requests.get(
        'https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA')
    # extract the article body with pyquery
    document = pyquery.PyQuery(r.text)
    return document('#js_content').text()  # body of the WeChat official-account article
Example #4
def test1():
    r = requests.get("https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA")
    document = pyquery.PyQuery(r.text)
    return document("#js_content").text()
Example #5
def get_artical(url):
    art = requests.get(url)
    doc = pyquery.PyQuery(art.text)
    return doc('#js_content').text()
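
Examples #3, #4, and #5 all assume the request succeeds and that the response really is the article page. A minimal hardened variant (the helper name and timeout value are illustrative, not from the originals):

import requests
import pyquery

def get_article_checked(url, timeout=10):
    # fail loudly on network errors and non-2xx responses instead of parsing an error page
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return pyquery.PyQuery(r.text)('#js_content').text()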
Example #6
def request_shiyanbin_schedule():
    response = requests.get(SHIYANBIN_SCHEDULE_LINK)
    if response.ok:
        return pyquery.PyQuery(response.content.decode('utf-8'))('table')
Example #7
def test_repo():
    resp = pyquery.PyQuery(server.repo('r1'))
    assert resp.find('h3:contains("provided packages")')
    assert resp.find('h3:contains("dependencies")')
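The :contains() text selector used here is jQuery syntax that pyquery implements on top of cssselect; a small standalone illustration:

import pyquery

doc = pyquery.PyQuery('<div><h3>provided packages</h3><h3>misc</h3></div>')
# :contains() matches elements whose text contains the given substring
assert doc.find('h3:contains("provided packages")')
assert not doc.find('h3:contains("dependencies")')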
Example #8
    def parse_company3(self, response):
        goods_data = response.meta["goods_data"]
        com_word = response.meta["com_word"]
        com_data = response.meta["com_data"]
        print(response.url)
        content_js = response.text
        doc = pyquery.PyQuery(content_js)
        aa = doc('article.intro-list ul')
        for i in aa('li').items():
            label = i('.c-left').text()
            value = i('.c-right').text()
            if label == u'主营产品或服务' and not com_data["product"]:
                com_data["product"] = value
            if label == u'主营行业' and not com_data["main_industry"]:
                com_data["main_industry"] = value
            if label == u'企业类型':
                com_data["comtype"] = value
            if label == u'经营模式' and not com_data["busmode"]:
                com_data["busmode"] = value
            if label == u'注册地址':
                com_data["com_reg_addr"] = value
            if label == u'经营地址' and not com_data["address"]:
                com_data["address"] = value
            if label == u'公司成立时间' and not com_data["regyear"]:
                com_data["regyear"] = value
            if label == u'法定代表人/负责人' and not com_data["ceo"]:
                com_data["ceo"] = value
            if label == u'员工人数':
                com_data["employ"] = value
            if label == u'年营业额':
                com_data["annulsale"] = value
            if label == u'经营品牌':
                com_data["brand_name"] = value
            if label == u'注册资本' and not com_data["regcapital"]:
                com_data["regcapital"] = value
            if label == u'主要客户群':
                com_data["customer"] = value
            if label == u'主要市场':
                com_data["main_addr"] = value
            if label == u'是否提供OEM服务':
                com_data["OEM"] = value
            if label == u'研发部门人数':
                com_data["rdnum"] = value
            if label == u'厂房面积':
                com_data["com_area"] = value
            if label == u'质量控制':
                com_data["qc"] = value
            if label == u'管理体系认证':
                com_data["management_system"] = value
            if label == u'认证信息' and not com_data["com_auth"]:
                com_data["com_auth"] = value
            if label == u'开户银行':
                com_data["bank_type"] = value
        if 'null' in com_data["regcapital"]:
            com_data["regcapital"] = u'无需验资'
        com_data["source_url"] = (
            'http://' + com_word + '.wx.hc360.com/shop/show.html')

        if goods_data["detail"]:
            Item = HuicongGoodsFenbuItem()
            Item["com_data"] = com_data
            Item["goods_data"] = goods_data
            yield Item
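
The same matching can also be written as a table-driven loop, which keeps label-to-field knowledge in one place. A sketch (only a few labels shown; the flag says whether an existing value may be overwritten, and the helper name is illustrative):

# label -> (com_data field, overwrite_existing); labels as they appear on the page
FIELD_MAP = {
    u'主营产品或服务': ('product', False),
    u'主营行业': ('main_industry', False),
    u'企业类型': ('comtype', True),
    u'注册资本': ('regcapital', False),
}

def fill_com_data(rows, com_data):
    # rows: pyquery items with .c-left (label) and .c-right (value) children
    for row in rows:
        entry = FIELD_MAP.get(row('.c-left').text())
        if entry:
            field, overwrite = entry
            if overwrite or not com_data.get(field):
                com_data[field] = row('.c-right').text()
    return com_data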
Example #9
        pass

if WEB_SCRAPING:
    try:
        with urllib.request.urlopen(STANDINGS_URL) as url:
            with open(os.path.join(DATA_PATH, 'standings.html'), 'wb') as f:
                f.write(url.read())
    except Exception:
        print('Failed web scraping ' + STANDINGS_URL)
        exit()

users = []
path = os.path.join(DATA_PATH, 'standings.html')
try:
    with open(path) as f:
        standings = pyquery.PyQuery(f.read())
except OSError:
    print('Not found ' + path)
    exit()
PM = urllib.parse.parse_qs(
    urllib.parse.urlparse(
        standings.find('.bodySubtitle a').attr('href')).query)['pm'][0]
for e in standings.find('.stat tr:nth-child(n+3)'):
    e = pyquery.PyQuery(e).find('td:nth-child(2)')
    user_name = e.text()
    user_id = urllib.parse.parse_qs(
        urllib.parse.urlparse(e.find('a').attr('href')).query)['cr'][0]
    users.append((user_name, user_id))

if WEB_SCRAPING:
    for user_name, user_id in users:
Example #10
def parseHTML(collect):
    with open('ptt.html', 'r', encoding='utf-8') as f:
        html = f.read()
        dom = pyquery.PyQuery(html)
        item = {}
        author = dom('span.article-meta-tag').filter(
            lambda i, this: pyquery.PyQuery(this).text() == '作者').siblings(
                'span.article-meta-value').text()
        authorID = author.split()[0]
        authorName = ''.join(author.split()[1:])
        authorName = authorName.replace(')', '').replace('(', '')
        item['authorID'] = authorID
        item['authorName'] = authorName
        #print(authorID,'\n', authorName)

        title = dom('span.article-meta-tag').filter(
            lambda i, this: pyquery.PyQuery(this).text() == '標題').siblings(
                'span.article-meta-value').text()
        title = title.split('] ')[-1]
        #print(title)
        item['title'] = title

        publish_time = dom('span.article-meta-tag').filter(
            lambda i, this: pyquery.PyQuery(this).text() == '時間').siblings(
                'span.article-meta-value').text()
        publish_time = publish_time.split('] ')[-1]
        #print(publish_time)
        item['publish_time'] = publish_time

        print('content---------------')
        # contents=dom('div#main-content:last-child')
        # print(contents.text())

        contents = dom('div#main-content').children().items()
        #print(contents.items())
        final_content = ''
        for content in contents:
            #print(content.children().text())

            if '時間' in content.children().text():
                #print('\n'.join(content.__str__().split('\n')[1:]))
                final_content = '\n'.join(content.__str__().split('\n')[1:])
                item['time'] = final_content
                break

        print('final content', final_content)

        pushes = dom('div.push').items()
        #print(type(pushes))

        for push in pushes:
            item2 = {}
            #print(push.children('span.push-userid').text())
            #print(push.children('span.push-content').text())
            #print(''.join(push.children('span.push-ipdatetime').text().split(' ')[1:]))
            item2['userID'] = push.children('span.push-userid').text()
            item2['userComment'] = push.children('span.push-content').text()
            item2['userTime'] = ''.join(
                push.children('span.push-ipdatetime').text().split(' ')[1:])
            #print('--------------------------')
            item2.update(item)
            #print('item2',item2)
            collect.append(item2)

        with open('crawler.pickle', 'wb') as f:
            pickle.dump(collect, f, protocol=pickle.HIGHEST_PROTOCOL)
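
The bare this in the original filter lambdas works because pyquery injects the current node into the callback, mirroring jQuery; passing it explicitly as the second argument (as rewritten above) is the documented form. A standalone illustration:

import pyquery

d = pyquery.PyQuery('<p>Hi</p><p>Bye</p>')
# the current node arrives as the second callback argument
hits = d('p').filter(lambda i, this: pyquery.PyQuery(this).text() == 'Hi')
assert hits.text() == 'Hi'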
Example #11
    def parse(self, response):
        print(response.url)
        title = ""
        price = ""
        offer_num = ""
        send_time = ""
        send_money = ""
        com_name = ""
        buy_sell_num = ""
        com_addr = ""
        auth = ""
        com_url = ""
        mobile = ""
        telephone = ""
        seller = ""
        attrs_kv = []
        detail = ""
        thumb_1 = ""
        thumb_2 = ""
        thumb = ""
        cate_name_1 = ""
        cate_name_2 = ""
        cate_name_3 = ""
        min_price = max_price = 0
        price_unit = ''
        content = data = ''
        if response.xpath('//h1[@class="proTitle"]/text()'):
            try:
                try:
                    title = response.xpath(
                        '//h1[@class="proTitle"]/text()').extract()[0]
                except:
                    pass
                try:
                    price = response.xpath(
                        '//div[@class="topPriceRig"]/text()').extract()[1]
                except:
                    pass
                if not price:
                    try:
                        price = response.xpath(
                            '//div[@class="topPriceRig"]/text()').extract()[0]
                        mprice = price.replace('\r', '').replace(
                            '\n', '').replace('\t', '').replace(' ',
                                                                '').split('-')
                        min_price = mprice[0].strip().replace(u'¥', '')
                        max_price = mprice[1].strip().replace(u'¥', '')
                    except:
                        pass
                if not price:
                    try:
                        price = response.xpath(
                            '//div[@class="topPriceRig telBra"]/text()'
                        ).extract()[0]
                    except:
                        pass
                try:
                    price = price.replace('\r', '').replace('\n', '').replace(
                        '\t', '').replace(' ', '')
                except:
                    pass
                try:
                    if u'¥' in price:
                        price = price.replace(u'¥', '')
                except:
                    pass
                try:
                    offer_num = response.xpath(
                        '//span[@class="supply-numb"]/text()').extract()[0]
                except:
                    pass
                try:
                    for i in response.xpath('//div[@class="item-row-w"]'):
                        row = i.xpath('string(.)')
                        if u'发货期限' in row[0].extract():
                            send_time = i.xpath('text()').extract()[1]
                    send_time = send_time.replace('\r', '').replace(
                        '\n', '').replace('\t', '').replace(' ', '')
                except:
                    pass
                try:
                    buy_sell_num = response.xpath(
                        '//li[@class="line-btm"]/div/a/text()').extract()[0]
                except:
                    pass
                try:
                    com_name = response.xpath(
                        '//div[@class="comply-name"]/p/a/text()').extract()[0]
                    for i in response.xpath(
                            '//div[@class="item-mmt-txt"]/ul/li'):
                        row = i.xpath('string(.)')
                        if u'所在地区' in row[0].extract():
                            com_addr = i.xpath('div/p/text()').extract()[0]
                        if u'认证信息' in row[0].extract():
                            try:
                                auth = i.xpath('div/a/text()').extract()[0]
                            except:
                                auth = i.xpath('div/text()').extract()[0]
                    com_url = response.xpath(
                        '//p[@class="cName"]/a/@href').extract()[0]
                except:
                    pass
                try:
                    mobile = response.xpath(
                        '//em[@class="c-red"]/text()').extract()[0][1:]
                    telephone = response.xpath(
                        '//div[@class="p tel1"]/em/text()').extract()[0]
                    telephone = telephone[1:].split(' ')[0]
                    if not seller:
                        seller = response.xpath(
                            '//div[@class="p name"]/em/text()').extract(
                            )[0][1:]
                except:
                    pass
                try:
                    for i in response.xpath(
                            '//div[@class="d-vopy  parameter "]/ul/li'):
                        key = i.xpath('span/text()').extract()[0].replace(
                            '\r', '').replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ',
                                                                      '')[:-1]
                        value = i.xpath('p/text()').extract()[0].replace(
                            '\r', '').replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                        pair = key + '|' + value  # avoid shadowing the builtin str
                        attrs_kv.append(pair)
                except:
                    pass
                try:
                    thumb = response.xpath(
                        '//ul[@id="thumblist"]/li[1]/div/a/@rel').extract()[0]
                    thumb = re.findall(r"largeimage: '(.*?)'", thumb)[0]
                    thumb_1 = response.xpath(
                        '//ul[@id="thumblist"]/li[2]/div/a/@rel').extract()[0]
                    thumb_1 = re.findall(r"largeimage: '(.*?)'", thumb_1)[0]
                    thumb_2 = response.xpath(
                        '//ul[@id="thumblist"]/li[3]/div/a/@rel').extract()[0]
                    thumb_2 = re.findall(r"largeimage: '(.*?)'", thumb_2)[0]
                except:
                    pass
                try:
                    json_data = re.findall(r'"supCatClass":(.*?),"supcatId"',
                                           response.text)[0]
                    json_data = json.loads(json_data)
                    cate_name_1 = json_data[0]["catName"]
                    cate_name_2 = json_data[1]["catName"]
                    cate_name_3 = json_data[2]["catName"]
                except:
                    pass
            except:
                pass

        ss = response.xpath('//script/text()').extract()
        update_time = ''
        keys = []
        for i in ss:
            text = i
            for j in text.split('var'):
                keys.append(j.strip())
        for i in keys:
            i = i.replace('null',
                          'None').replace('false',
                                          'False').replace('true', 'True')
            if i:
                try:
                    exec i in locals()
                except Exception as e:
                    pass
        try:
            com_username = company_username.decode('utf-8')
        except:
            com_username = ''
        try:
            keywords = productWord
        except:
            try:
                keywords = searchVal
            except:
                try:
                    keywords = urllib.unquote(keywordencode).decode('gbk')
                except:
                    keywords = ''
        try:
            keywords = keywords.decode('utf-8')
        except:
            pass
        try:
            update_time = supplyInfoJson['pubDate'].split(' ')[0]
        except:
            update_time = (datetime.datetime.now() -
                           datetime.timedelta(30)).strftime('%Y-%m-%d')
        try:
            brand = supplyInfoJson['brandName']
        except:
            brand = ''
        try:
            brand = brand.decode('utf-8')
        except:
            pass
        try:
            businAttList = supplyInfoJson['businAttList']
        except:
            businAttList = []
        from_area = ''
        if businAttList:
            for i in businAttList:
                if i['attname'] == '产地':
                    from_area = i['attvalue']
                if not brand:
                    if i['attname'] == '品牌':
                        brand = i['attvalue']
        try:
            from_area = from_area.decode('utf-8')
        except:
            pass

        try:
            seller = companyContactor
        except:
            try:
                seller = contactor
            except:
                pass
        try:
            fax = companyJson['fax']
        except:
            fax = ''
        to_area = qq = ww = wechat = ''
        try:
            detail = supplyInfoJson['introduce']
            detail = detail.decode("utf-8")
        except:
            pass
        if u'质量保证,欢迎咨询洽谈' in detail or not detail:
            my_doc = pyquery.PyQuery(response.text)
            my_doc = my_doc("#introduce")
            detail = my_doc.outer_html()
        if detail:
            try:
                doc = pyquery.PyQuery(detail)
            except:
                pass
            print "start up upyun detail"
            try:
                for i in doc('a').items():
                    if i.attr('href') and 'hc360' in i.attr('href'):
                        i.remove()
            except:
                pass
            try:
                for i in doc('img').items():
                    try:
                        if i.attr('data-ke-src'):
                            i.attr('data-ke-src', '')
                    except:
                        pass
                    src = i.attr('src')
                    try:
                        if 'hc360' not in src or 'no_pic' in src or 'nopic' in src:
                            i.remove()
                            continue
                    except:
                        pass
                    try:
                        if thumb and 'no_pic' in thumb:
                            thumb = src
                        if thumb and 'nopic' in thumb:
                            thumb = src
                    except:
                        pass
                    upyun_pic = ''
                    try:
                        upyun_pic = image_push(response.url, src)
                    except:
                        pass
                    if 'hc360' in upyun_pic:
                        i.remove()
                        continue
                    i.attr('src', upyun_pic)
            except:
                pass

            try:
                for i in doc('img').items():
                    if i.attr('src'):
                        src = i.attr('src')
                        if 'hc360' in src or '//' == src:
                            i.remove()
                        if i.attr('data-ke-src'):
                            i.remove_attr('data-ke-src')
                        if i.attr('data-mce-src'):
                            i.remove_attr('data-mce-src')
                        if i.attr('data-cke-saved-src'):
                            i.remove_attr('data-cke-saved-src')
            except:
                pass
            try:
                for i in doc('*').items():
                    if i.attr('src') and 'hc360' in i.attr('src'):
                        i.attr('src', '')
                    if i.attr('data-tfs-url'):
                        i.attr('data-tfs-url', '')
                    if i.attr('data-url'):
                        i.attr('data-url', '')
            except:
                pass
            detail = doc.outer_html()
            try:
                detail = detail.replace('<div style="overflow:hidden;">',
                                        '<div>')
            except:
                pass
        if detail and u'正在加载' in detail:
            detail = ''
        try:
            min_amount = int(
                response.xpath('//tr[@class="item-cur-tab"]/td/text()').
                extract()[0].split('-')[0].strip())
        except:
            min_amount = 1
        try:
            price = re.search(r'\d+\.?\d+', price).group()
        except:
            price = 0
        if not min_price:
            min_price = price
        if not max_price:
            max_price = price
        if offer_num:
            try:
                res = re.search(r'(\d+)(.+)', offer_num.replace(' ',
                                                                '')).groups()
                offer_num = res[0]
                if len(res) > 1:
                    price_unit = res[1]
            except:
                pass
        print "start up upyun thumb"
        if thumb:
            thumb = image_push(response.url, thumb)
        if 'hc360' in thumb:
            thumb = ''
        if thumb_1:
            thumb_1 = image_push(response.url, thumb_1)
            if 'hc360' in thumb_1:
                thumb_1 = ''
        if thumb_2:
            thumb_2 = image_push(response.url, thumb_2)
            if 'hc360' in thumb_2:
                thumb_2 = ''

        goods_data = {
            'source_url': response.url,
            'title': title,
            'price': price,
            'offer_num': offer_num,
            'send_time': send_time,
            'send_money': send_money,
            'com_name': com_name,
            'com_addr': com_addr,
            'auth': auth,
            'com_url': com_url,
            'mobile': mobile,
            'telephone': telephone,
            'seller': seller,
            'attrs_kv': attrs_kv,
            'detail': detail,
            'thumb_1': thumb_1,
            'thumb_2': thumb_2,
            'thumb': thumb,
            'cate_name_1': cate_name_1,
            'cate_name_2': cate_name_2,
            'cate_name_3': cate_name_3,
            'update_time': datetime.datetime.now().strftime('%Y-%m-%d'),
            'com_username': com_username,
            'keywords': keywords,
            'min_amount': min_amount,
            'min_price': min_price,
            'max_price': max_price,
            'price_unit': price_unit,
            'brand': brand,
            'to_area': to_area,
            'from_area': from_area,
            'qq': qq,
            'ww': ww,
            'fax': fax,
            'wechat': wechat,
        }

        # get the company URL to check whether the company has been crawled already
        com_url = ""
        try:
            com_url = response.xpath(
                '//p[@class="cName"]/a/@href').extract()[0]
        except:
            pass
        if not com_url:
            try:
                com_url = response.xpath(
                    '//div[@class="goods-tit goods-tit-blue"]/a/@href'
                ).extract()[0]
            except:
                pass
        # extract the company's keyword (its subdomain)
        reg = r'http://(.*?)\.b2b\.hc360\.com'
        com_word = re.findall(reg, com_url)[0]
        print(" ")
        test_com_url = 'http://' + com_word + '.wx.hc360.com/shop/show.html'
        conn = pymysql.connect(host='192.168.14.90',
                               port=3306,
                               user='******',
                               passwd='123456',
                               db='hc360',
                               charset='utf8')
        cursor = conn.cursor()
        cursor.execute(
            "select * from com_tmp where url = %s", (test_com_url,))
        conn.commit()
        result = cursor.fetchone()
        if not result:
            # the company has not been crawled yet
            try:
                cursor.execute(
                    "insert into com_tmp (url) values (%s)", (test_com_url,))
                conn.commit()
            except Exception:
                pass
            cursor.close()
            conn.close()
            # crawl the company's info, put it into the Item's com_data, and hand it
            # to the mongo pipeline together with goods_data
            url_1 = "http://detail.b2b.hc360.com/detail/turbine/template/moblie,vmoblie,getcontact_us.html?username=" + com_word
            try:
                yield scrapy.Request(url=url_1,
                                     meta={
                                         "goods_data": goods_data,
                                         "com_word": com_word
                                     },
                                     callback=self.parse_company)
            except Exception:
                pass
        else:
            cursor.close()
            conn.close()
            # the company has already been crawled
            if goods_data["detail"]:
                Item = HuicongGoodsFenbuItem()
                Item["goods_data"] = goods_data
                Item["com_data"] = ""
                yield Item
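
The select-then-insert pair above can race when several spider processes share the table. A sketch of an atomic variant, assuming a UNIQUE index on com_tmp.url (the index and the helper name are assumptions, not shown in the example):

import pymysql

def mark_company_seen(conn, url):
    # returns True the first time a URL is recorded; INSERT IGNORE relies on
    # the assumed UNIQUE index on com_tmp.url to reject duplicates atomically
    with conn.cursor() as cursor:
        affected = cursor.execute(
            "insert ignore into com_tmp (url) values (%s)", (url,))
    conn.commit()
    return affected == 1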
Example #12
def test_pkg_external():
    resp = pyquery.PyQuery(server.pkg('six'))
    assert resp.find('h2:contains("external package: six")')
Example #13
def test_pkg_disambituate():
    resp = pyquery.PyQuery(server.pkg('pkg2'))
    repo_links = resp.find('a[href^="/repo/"]')
    assert len(repo_links) == 2
    assert repo_links.eq(0).attr('href') == '/repo/r2.htm'
    assert repo_links.eq(1).attr('href') == '/repo/r5.htm'
Example #14
def test_pkg_redirect():
    resp = pyquery.PyQuery(server.pkg('pkg1'))
    meta = resp.find('meta[http-equiv="refresh"]')
    assert meta.attr('content') == '0;/repo/r1.htm'
Example #15
'''This program fetches a page over the network, segments the Chinese text
with a word-segmentation tool, counts word frequencies, and emails the
result to a given mailbox.'''
import requests
import pyquery
from mymodule import stats_word

# the article to fetch
article_url = "https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA"
# download the page
response = requests.get(article_url)
# extract the article body
document = pyquery.PyQuery(response.text)
content = document('#js_content').text()
statList = stats_word.stats_text_cn(content, 100)
statstring = ''.join(str(i) for i in statList)

import getpass
sender = input("Sender email address: ")
password = getpass.getpass("Sender email password (paste is fine): ")
recipients = input("Recipient email address: ")

import yagmail

yag = yagmail.SMTP(user=sender, password=password, host='smtp.163.com')

yag.send(recipients,
         "19100305 yxying1992 Subject: top-100 word frequencies in Zhang Xiaolong's WeChat Open Class speech",
         statstring)
Example #16
    def monitoring(self):
        """执行"""
        #是否有权限
        self.local.focuson = True
        self.local.sayhello = True
        self.local.addbuddy = True
        self.local.sendmsg = True

        #开始监控
        count = 0
        while True:
            if count > 0:
                myqueue.get()
                self.showMsg(self.local.url +
                             "\trun %s finished; sleeping until the next run\r" % count)
                time.sleep(int(self.monitoring_['apart']))
                myqueue.put(self.local.url)
            # collect the users to process
            page = 1
            userlist = []
            try:
                while True:
                    try:
                        url = self.local.url + "/home.php?mod=space&do=friend&view=online&type=member&page=" + str(
                            page)
                        html = self.local.conn.get(url, headers=self.headers)
                        html.encoding = self.local.encodeing  # the Response attribute is `encoding`
                        q = pyquery.PyQuery(html.text)
                        # formhashhref = q("a[href*='action=logout']").attr('href')  # verification token
                        # if formhashhref:
                        # 	self.local.formhash = formhashhref.split('=')[-1]
                        li = q('#friend_ul li')
                        for row in li:
                            rowq = pyquery.PyQuery(row)
                            liid = rowq.attr('id')
                            li_uid = liid.split('_')[1]  # the user's uid
                            usertext = rowq('a').eq(2).text()  # text of the <a> tag
                            if li_uid != '1' and usertext == '收听TA':
                                userlist.append(int(li_uid))
                        # find the page count
                        try:
                            pageCountstrs = q(
                                'input[name=custompage]').next().attr('title')
                            if pageCountstrs:
                                pageCount = re.match(".*?([0-9]+).*?",
                                                     pageCountstrs, re.S)
                                if pageCount:
                                    if page < int(pageCount[1]):
                                        page += 1
                                    else:
                                        break
                            else:
                                break
                        except Exception as e:
                            break

                    except Exception as e:
                        break

                for uid in userlist:
                    isAdmin = self.isAdmin(uid)
                    if not isAdmin:  # skip administrators
                        # follow
                        if self.local.focuson:
                            self.focusOn(uid)
                        # say hello
                        if self.local.sayhello:
                            self.sayHello(uid)

                        # send a message
                        if self.local.sendmsg:
                            self.sendMsg(uid)
                        # add as a friend
                        if self.local.addbuddy:
                            self.addBuddy(uid)

                        if not (self.local.focuson or self.local.sayhello
                                or self.local.sendmsg or self.local.addbuddy):
                            self.showErrorMsg(
                                error,
                                self.local.url + "\tthis site grants no permissions at all; shutting it down\r")
            except Exception as e:
                pass
                # print(e,11111111111111111111111111111111)
            count += 1
Example #17
def get_article():
    r = requests.get('https://mp.weixin.qq.com/s/pLmuGoc4bZrMNl7MSoWgiA')
    document = pyquery.PyQuery(r.text)
    return document('#js_content').text()
Example #18
    def login(self, url, loginnum=1):

        # initialization
        print(url, 'logging in...')
        self.local.url = url
        self.local.formhash = ''
        self.local.conn = requests.Session()
        filejson = self.path + 'cookies/' + self.local.url.replace(
            'http://', '').replace('https://', '') + '.json'
        if os.path.exists(filejson):
            listCookies = []  # default when the cookie file cannot be parsed
            try:
                with open(filejson, 'r', encoding='utf8') as f:
                    listCookies = json.loads(f.read())
            except Exception as e:
                self.showErrorMsg(cookielog,
                                  self.local.url + "\tmalformed cookie file\r")

            # set the cookies (this performs the login)
            try:
                for cookie in listCookies:
                    self.local.conn.cookies.set(cookie.get('name'),
                                                cookie.get('value'))
            except Exception as e:
                # print(e)
                self.showErrorMsg(
                    cookielog, self.local.url + "\tfailed to set cookies; please replace the cookie file\r")

            # visit the home page once
            try:
                getlogin = self.local.conn.get(self.local.url,
                                               headers=self.headers)
                time.sleep(1)
                # guard against domain redirects
                urllist = getlogin.url.split('/')
                self.local.url = '/'.join(urllist[:3])

                if getlogin.status_code != 200:

                    self.showErrorMsg(
                        loginerror, self.local.url + "\tsite not reachable; returned error: " +
                        str(getlogin.status_code) + "\r")

            except Exception as e:
                if loginnum > 3:
                    self.showErrorMsg(
                        loginerror,
                        self.local.url + "\tsite not reachable; returned error: " + str(e) + "\r")
                else:
                    self.login(self.local.url, loginnum + 1)

            # detect the encoding
            encodeing = 'utf8'
            try:
                encodeing = requests.utils.get_encodings_from_content(
                    getlogin.text)[0]
            except Exception as e:
                pass

            try:
                getlogin.encoding = encodeing  # the Response attribute is `encoding`
                q = pyquery.PyQuery(getlogin.text)
                self.local.formhash = q("input[name='formhash']").val()  # verification token
            except Exception as e:
                pass

            # getlogin.encodeing = encodeing
            self.local.encodeing = encodeing
            try:
                if encodeing == 'utf8':
                    self.local.message = self.monitoring_['msg']
                else:
                    self.local.message = self.monitoring_['msg'].encode('gbk')
            except Exception as e:
                self.local.message = self.monitoring_['msg']

            try:
                # q = pyquery.PyQuery(getlogin.text.encode().decode('utf8'))
                # self.local.log = open(self.path+'log/'+self.local.url.replace('http://','').replace('https://','')+'.log','a',encoding='utf8')
                # self.local.log.write("\r\r\r"+time.strftime('%Y-%m-%d %H:%M:%S')+'\t'+self.local.url+"\r")

                # usergroup = q('a[href$="home.php?mod=spacecp&ac=usergroup"]').text()
                # if usergroup:
                # 	self.showSuccessMsg(self.local.url+"\tlogin succeeded\r")
                # else:
                # 	self.showErrorMsg(loginerror,self.local.url+'\tlogin failed\r')
                # run
                # self.monitoring()
                pass
            except Exception as e:
                print(e)

            # fetch the admin groups
            try:
                adminGroupList = self.getAdminGroup()
                if not adminGroupList:

                    shutil.copyfile(
                        filejson,
                        self.path + 'cookie2/' + self.local.url.replace(
                            'http://', '').replace('https://', '') + '.json')
                    self.showErrorMsg(
                        loginerror,
                        self.local.url + '\tcould not fetch the admin groups; disabling all operations on this site for safety!\r')
                else:
                    self.local.log = open(
                        self.path + 'log/' + self.local.url.replace(
                            'http://', '').replace('https://', '') + '.log',
                        'a',
                        encoding='utf8')
                    self.local.log.write("\r\r\r" +
                                         time.strftime('%Y-%m-%d %H:%M:%S') +
                                         '\t' + self.local.url + "\r")
                    self.showSuccessMsg(self.local.url + '\tadmin groups: ' +
                                        str(adminGroupList) + '\r')
                    time.sleep(1)

            except Exception as e:
                print(e)
                myqueue.put(self.local.url)

            # start monitoring
            self.monitoring()
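
Much of the encoding juggling in this class can be delegated to requests itself; a minimal sketch (the function name is illustrative):

import requests
import pyquery

def fetch_doc(url):
    resp = requests.get(url, timeout=10)
    # apparent_encoding is the charset guessed from the raw body bytes
    resp.encoding = resp.apparent_encoding
    return pyquery.PyQuery(resp.text)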
Example #19
def parse_page(data: bytes, encoding: str = 'utf-8'):
    # `encoding` was a free variable in the original; a default parameter keeps the snippet self-contained
    text = data.decode(encoding)
    doc = pyquery.PyQuery(text)
    title = doc('strong font').text()
    content = doc('p').text()
    return (title, content)
Example #20
def test_index():
    resp = pyquery.PyQuery(server.index())
    assert resp.find('#repos')
    assert resp.find('#packages')
    assert resp.find('#external')