コード例 #1
0
class GetAllInfo:
    """Extract company information from one company-profile HTML page.

    The page is parsed once in ``__init__``; every ``get_*`` method runs
    XPath queries against the parsed tree and returns a plain ``dict``.
    ``get_detail`` merges all of those dicts into a single one.
    """

    # (label text found on the page, key used in the result dict)
    _BASE_LABELS = (
        ('电话', 'tel'),
        ('邮箱', 'email'),
        ('官网', 'website'),
        ('地址', 'address'),
    )

    def __init__(self, html):
        """Create the OSS client and parse the page.

        Args:
            html: Raw page bytes; decoded as UTF-8 with undecodable bytes
                ignored before parsing.
        """
        self.oss = StoreOSS(**config.EHCO_OSS)
        self.tree = fromstring(html.decode('utf-8', 'ignore'))

    def get_image_respone(self, url):
        """Download the binary content found at ``url``.

        Sleeps one second after every attempt, successful or not.

        Args:
            url: Address of the file to download.

        Returns:
            The response body, or ``None`` when the download failed.
        """
        headers = {
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        try:
            r = requests.get(url, timeout=20, stream=True, headers=headers)
            r.raise_for_status()
            body = r.content
        except Exception:
            # Best effort: report the failure and let the caller skip this
            # image. ``Exception`` (not a bare except) so Ctrl-C still works.
            print('图片下载失败!url: {}'.format(url))
            body = None
        time.sleep(1)
        return body

    def up_to_server(self, respone, filename):
        """Upload already-downloaded image content to the OSS bucket.

        Args:
            respone: Binary image content to store.
            filename: Object name; it is stored under ``comments/``.
        """
        # Target path inside the bucket.
        web_folder = "comments/" + filename
        try:
            status = self.oss.put(web_folder, respone).status
        except Exception as e:
            # Uploading stays best effort, but the failure is now reported
            # instead of being silently dropped.
            print('图片上传失败了: {!r}'.format(e))
            return
        if status != 200:
            print('图片上传失败了')

    def format_img_url(self):
        """Build a fresh OSS URL and object name for one image.

        Returns:
            A ``(aliyun_url, img_name)`` tuple; ``img_name`` is a UUID1
            based ``.jpg`` name and ``aliyun_url`` is its full URL.
        """
        img_head = 'http://website201710.oss-cn-shanghai.aliyuncs.com/comments/'
        img_name = '{}.jpg'.format(uuid.uuid1())
        aliyun_url = '{}{}'.format(img_head, img_name)
        return aliyun_url, img_name

    def get_company_image(self):
        """Download the company logo and map its URL to a new OSS URL.

        Returns:
            ``{'old_img': ..., 'new_img': ...}`` when a logo was found and
            downloaded, otherwise an empty dict.
        """
        img_dict = {}
        imgs = self.tree.xpath('//*[@class="logo"]/div//img')
        if not imgs:
            # No logo on the page; the original code raised IndexError here.
            return img_dict
        img_url = imgs[0].get('src')
        if not img_url:
            return img_dict
        response = self.get_image_respone(img_url)
        if response:
            aliyun_url, filename = self.format_img_url()
            # NOTE(review): the upload call (self.up_to_server) was already
            # disabled in the original code, so 'new_img' refers to an
            # object that this method does not upload. Confirm intent.
            img_dict['old_img'] = img_url
            img_dict['new_img'] = aliyun_url
        return img_dict

    def get_company_base(self):
        """Read name, phone, e-mail, website and address of the company.

        Each value is the text node that directly follows its label. Values
        are keyed by the label that precedes them, so a label missing from
        the page no longer shifts later values onto the wrong key.

        Returns:
            A dict with any of the keys ``name``, ``tel``, ``email``,
            ``website`` and ``address`` that were found.
        """
        base_dict = {}
        names = self.tree.xpath(
            '//*[@id="company-top"]/div/div[2]/div[1]/text()')
        if names:
            base_dict['name'] = names[0].strip()

        raw_texts = self.tree.xpath(
            '//*[@id="company-top"]/div/div[2]//span//text()')
        company_base = [t.strip() for t in raw_texts]
        company_base = [t for t in company_base if t]

        last_index = len(company_base) - 1
        for i, each in enumerate(company_base):
            if i == last_index:
                # A label in the final position has no value after it.
                break
            for label, key in self._BASE_LABELS:
                if label in each:
                    # Keep the first value seen for each label.
                    base_dict.setdefault(key, company_base[i + 1])
                    break
        return base_dict

    def get_company_detail(self):
        """Read the detailed registration table of the company.

        The non-empty text nodes of the table are taken, four entries just
        before the last one are dropped, and every second remaining entry
        (starting with the second) is paired with the field names below.

        Returns:
            A dict mapping field name to the text found for it.
        """
        all_name = [
            'register_capital', 'pay_capital', 'operate_state',
            'establish_date', 'register_number', 'organization_code',
            'taxpayer_number', 'social_code', 'company_type', 'industry',
            'approval_date', 'register_authority', 'affiliated_area',
            'english_name', 'used_name', 'operate_mode', 'personnel_scale',
            'business_term', 'scope_operation'
        ]

        raw_texts = self.tree.xpath(
            '//*[@id="Cominfo"]/table[2]//tr//text()')
        cells = [t.strip() for t in raw_texts]
        cells = [c for c in cells if c]
        del cells[-5:-1]

        return dict(zip(all_name, cells[1::2]))

    def get_shareholder_detail(self):
        """Read the shareholder table.

        The first five non-empty text nodes are skipped, the rest is cut
        into groups of six, and the second entry of each group is dropped.

        Returns:
            ``{'shareholder': [[...], ...]}`` with one list per group.
        """
        raw_texts = self.tree.xpath(
            '//*[@id="Sockinfo"]/table//tr//text()')
        cells = [t.strip() for t in raw_texts]
        cells = [c for c in cells if c][5:]

        all_shareholder = []
        for start in range(0, len(cells), 6):
            row = cells[start:start + 6]
            # A short trailing group may have no second entry to drop.
            if len(row) > 1:
                del row[1]
            all_shareholder.append(row)
        return {'shareholder': all_shareholder}

    def get_main_member(self):
        """Read the main-member table.

        The first three non-empty text nodes are skipped, every second
        remaining entry is kept, and the kept entries are paired up.

        Returns:
            ``{'member': [(first, second), ...]}``.
        """
        raw_texts = self.tree.xpath(
            '//*[@id="Mainmember"]/table//tr//text()')
        cells = [t.strip() for t in raw_texts]
        cells = [c for c in cells if c][3:][1::2]
        all_members = list(zip(cells[::2], cells[1::2]))
        return {'member': all_members}

    @property
    def get_detail(self):
        """Merge the results of all ``get_*`` extractors into one dict."""
        all_detail = {}
        parts = (
            self.get_company_image(),
            self.get_company_base(),
            self.get_company_detail(),
            self.get_shareholder_detail(),
            self.get_main_member(),
        )
        for part in parts:
            all_detail.update(part)
        return all_detail
コード例 #2
0
 def __init__(self, html):
     """Create the OSS client and parse the page.

     Args:
         html: Raw page bytes; decoded as UTF-8 with undecodable bytes
             ignored before parsing.
     """
     oss_settings = config.EHCO_OSS
     self.oss = StoreOSS(**oss_settings)
     page_text = html.decode('utf-8', 'ignore')
     self.tree = fromstring(page_text)
コード例 #3
0
 def __init__(self):
     """Create the MySQL pool, the OSS client and the work queue."""
     db_settings = config.CONN_DB
     self.db = StoreMysqlPool(**db_settings)
     oss_settings = config.EHCO_OSS
     self.oss = StoreOSS(**oss_settings)
     work_queue = Queue.Queue()
     self.q = work_queue
コード例 #4
0
 def __init__(self):
     """Initialise the base extractor, an HTML parser and the OSS client."""
     super(ToutiaoExtractor, self).__init__()
     parser = HTMLParser.HTMLParser()
     self.html_parser = parser
     oss_settings = config.EHCO_OSS
     self.oss = StoreOSS(**oss_settings)
コード例 #5
0
class BaiduImage:
    """Re-host the images referenced by ``comments.content`` rows.

    One producer thread pages through the ``comments`` table and queues
    ``(id, content)`` pairs; worker threads download every ``<img>`` of a
    row, upload it to OSS, rewrite the ``src`` attribute and write the
    base64-encoded content back to the row.
    """

    def __init__(self):
        """Create the MySQL pool, the OSS client and the work queue."""
        self.db = StoreMysqlPool(**config.CONN_DB)
        self.oss = StoreOSS(**config.EHCO_OSS)
        self.q = Queue.Queue()

    def get_image_respone(self, url):
        """Download the binary content found at ``url``.

        Sleeps one second after every attempt, successful or not.

        Args:
            url: Address of the file to download.

        Returns:
            The response body, or ``None`` when the download failed.
        """
        headers = {
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        try:
            r = requests.get(url, timeout=20, stream=True, headers=headers)
            r.raise_for_status()
            body = r.content
            print('图片下载成功!url: {}'.format(url))
        except Exception:
            # Best effort: the caller removes images that cannot be fetched.
            body = None
        time.sleep(1)
        return body

    def up_to_server(self, respone, filename):
        """Upload already-downloaded image content to the OSS bucket.

        Args:
            respone: Binary image content to store.
            filename: Object name; it is stored under ``comments/``.
        """
        # Target path inside the bucket.
        web_folder = "comments/" + filename
        try:
            status = self.oss.put(web_folder, respone).status
        except Exception as e:
            # Uploading stays best effort, but the failure is now reported
            # instead of being silently dropped.
            print('图片上传失败了: {!r}'.format(e))
            return
        if status != 200:
            print('图片上传失败了')

    def format_img_url(self):
        """Build a fresh OSS URL and object name for one image.

        Returns:
            A ``(aliyun_url, img_name)`` tuple; ``img_name`` is a UUID1
            based ``.jpg`` name and ``aliyun_url`` is its full URL.
        """
        img_head = 'http://website201710.oss-cn-shanghai.aliyuncs.com/comments/'
        img_name = '{}.jpg'.format(uuid.uuid1())
        aliyun_url = '{}{}'.format(img_head, img_name)
        return aliyun_url, img_name

    def strip_img(self, html):
        """Re-host every ``<img>`` of an HTML fragment.

        Images that download successfully get their ``src`` replaced by a
        new OSS URL and are uploaded; images that fail are removed.

        Args:
            html: UTF-8 encoded HTML bytes.

        Returns:
            The serialized HTML with its first 5 and last 6 characters cut
            off (presumably a wrapping ``<div>``/``</div>`` added by the
            parser - TODO confirm), or ``None`` when processing failed.
        """
        try:
            tree = fromstring(html.decode('utf-8'))
            for img in tree.xpath('.//img'):
                img_src = img.get('src')
                response = self.get_image_respone(img_src)
                if response:
                    aliyun_url, filename = self.format_img_url()
                    img.set('src', aliyun_url)
                    self.up_to_server(response, filename)
                else:
                    img.getparent().remove(img)
            content = etree.tostring(tree, encoding='utf-8',
                                     method='html').strip()
            return content[5:-6]
        except Exception as e:
            # Report instead of hiding the error; the caller treats ``None``
            # as "leave this row untouched".
            print('strip_img failed: {!r}'.format(e))
            return None

    def get_all_id_content(self, id_num=0):
        """Yield ``(id, content)`` for up to 500 ``comments`` rows.

        Sleeps five minutes when the query returns nothing.

        Args:
            id_num: Row offset passed to ``LIMIT``.
        """
        sql = """select id,content from comments limit {},500""".format(id_num)
        data = self.db.query(sql)
        if data:
            for row in data:
                yield (row[0], row[1])
        else:
            time.sleep(60 * 5)

    def get_tasks(self):
        """Producer loop: queue every ``comments`` row, page by page.

        The offset advances by the number of rows fetched. The original
        always re-read offset 0, so the same rows were queued - and their
        content base64-encoded again - on every pass. The five-minute pause
        after each pass is kept.
        """
        offset = 0
        while 1:
            print("get_tasks")
            fetched = 0
            for each in self.get_all_id_content(offset):
                self.q.put(each)
                fetched += 1
            offset += fetched
            time.sleep(60 * 5)

    @staticmethod
    def find_img(s):
        """Return a regex match for the first ``src="..."`` in ``s``, or None."""
        return re.search(r'src="(.*?)"', s)

    def _store_content(self, _id, content):
        """Write ``content`` base64-encoded into the row with id ``_id``."""
        update_sql = """update `comments` set content="{}" where id = {}""".format(
            MySQLdb.escape_string(base64.b64encode(str(content))),
            _id)
        self.db.do(update_sql)

    def _process_row(self, _id, html):
        """Re-host the images of one row and store the result.

        Returns:
            ``True`` when the row was updated, ``False`` when it was skipped
            because its images could not be processed.
        """
        if not self.find_img(html):
            self._store_content(_id, html)
            return True
        content = self.strip_img(html)
        if content is None:
            # Do not overwrite the row with the encoded text "None".
            print("skip: {}".format(_id))
            return False
        self._store_content(_id, content)
        print("insert: {}".format(_id))
        return True

    def deal_task(self):
        """Worker loop: take rows from the queue and process them forever."""
        time.sleep(2)
        while 1:
            try:
                _id, html = self.q.get()[:2]
                self._process_row(_id, html)
            except Exception as e:
                # Queue.get() blocks and never reports "empty"; whatever
                # lands here is a processing error, so say so. The
                # five-minute back-off of the original is kept.
                print('deal_task error: {!r}'.format(e))
                time.sleep(60 * 5)

    def start(self):
        """Start one producer thread and ten worker threads."""
        thread_list = [threading.Thread(target=self.get_tasks)]
        for _ in range(10):
            thread_list.append(threading.Thread(target=self.deal_task))

        for t in thread_list:
            t.start()
コード例 #6
0
 def __init__(self):
     """Create the OSS client from the ``EHCO_OSS`` configuration."""
     oss_settings = config.EHCO_OSS
     self.oss = StoreOSS(**oss_settings)
コード例 #7
0
class Image_to_server(object):
    """Download images and store them in the OSS bucket under ``p2p/``."""

    def __init__(self):
        """Create the OSS client from the ``EHCO_OSS`` configuration."""
        self.oss = StoreOSS(**config.EHCO_OSS)

    def get_image_respone(self, url, headers=None, data=None):
        """Download the binary content found at ``url``.

        Sleeps one second after every attempt, successful or not.

        Args:
            url: Address of the file to download.
            headers: Optional extra request headers. They are merged over
                the default browser ``user-agent`` header (the original
                accepted this argument but ignored it).
            data: Unused; kept for interface compatibility.

        Returns:
            The response body, or ``-1`` when the download failed.
        """
        request_headers = {
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        if headers:
            request_headers.update(headers)
        try:
            r = requests.get(url, timeout=30, stream=True,
                             headers=request_headers)
            r.raise_for_status()
            body = r.content
            print('图片下载成功!url: {}'.format(url))
        except Exception:
            print('图片下载失败!url: {}'.format(url))
            # -1 is the failure marker existing callers may rely on.
            body = -1
        time.sleep(1)
        return body

    @staticmethod
    def encoding(self, data=None):
        """Decode bytes by trying several encodings in turn.

        This is a static method whose first parameter is named ``self`` for
        historical reasons. Both ``encoding(data)`` and the legacy
        ``encoding(anything, data)`` call forms are accepted.

        Returns:
            The decoded text, or ``None`` when no encoding worked.
        """
        if data is None:
            # Single-argument call: the payload arrived in the first slot.
            data = self
        types = ['utf-8', 'gb2312', 'gbk', 'gb18030', 'iso-8859-1']
        for t in types:
            try:
                return data.decode(t)
            except Exception:
                continue
        return None

    @staticmethod
    def save_file(content):
        """Write ``content`` to ``text.jpg`` in the working directory."""
        with open('text.jpg', 'wb') as f:
            f.write(content)

    def get_new_url(self):
        """Build a fresh OSS URL for one image.

        Returns:
            The ``p2p/`` URL prefix followed by a UUID1 based ``.jpg`` name.
        """
        image_head = "http://website201710.oss-cn-shanghai.aliyuncs.com/p2p/"
        file_name = '{}.jpg'.format(uuid.uuid1())
        return image_head + file_name

    def up_to_server(self, url, filename):
        """Download the image at ``url`` and upload it to the OSS bucket.

        Args:
            url: Source address of the image.
            filename: Object name; it is stored under ``p2p/``.
        """
        # Target path inside the bucket.
        web_folder = "p2p/" + filename
        img_content = self.get_image_respone(url)
        # -1 marks a failed download; it is truthy, so it must be checked
        # explicitly or it would be uploaded as if it were image data.
        if img_content and img_content != -1:
            try:
                status = self.oss.put(web_folder, img_content).status
            except Exception as e:
                print('图片上传失败了: {!r}'.format(e))
            else:
                if status != 200:
                    print('图片上传失败了')
                else:
                    print('{} 上传成功'.format(filename))
        else:
            print("deal_response_image", url)
        time.sleep(1)
コード例 #8
0
class up2Aliyun:
    """Re-host product images in the OSS bucket under ``products/``.

    One producer thread queues ``{product_id: [image urls]}`` dicts for
    products whose ``product_extend.STATUS`` is 0 and sets their status to
    2; worker threads download and upload each image and store the
    ``{old_url: new_url}`` mapping in ``product_extend.source_thumb``.
    """

    def __init__(self):
        """Create the MySQL pool, the OSS client and the work queue."""
        self.db = StoreMysqlPool(**config.CONN_DB)
        self.oss = StoreOSS(**config.EHCO_OSS)
        self.q = Queue.Queue()

    def get_100_imgs(self):
        """Yield ``{id: [image urls]}`` for up to 100 pending products.

        The URLs come from the ``;``-separated ``thumb`` column and from
        the ``<img>`` tags of the ``detail`` column. Sleeps five minutes
        when the query returns nothing.
        """
        sql = """SELECT id,thumb,detail from products where id in (select id from(select id from product_extend where STATUS=0 limit 100)tmp)"""
        data = self.db.query(sql)
        if not data:
            # ``not data`` also covers a None result and, unlike the old
            # ``len(data) > 1`` check, does not skip a one-row batch.
            time.sleep(60 * 5)
            return
        for row in data:
            _id = int(row[0])
            # NULL columns are treated as empty text.
            thumb_urls = set((row[1] or '').split(';'))
            detail = (row[2] or '').replace('\\&quot;', '')
            img_urls = self.get_imgs_src_from_detail(detail)
            img_urls.extend(thumb_urls)
            yield {_id: [u for u in img_urls if u]}

    @staticmethod
    def get_imgs_src_from_detail(html_str):
        """Return the non-empty ``src`` values of all ``<img>`` tags.

        Args:
            html_str: HTML text; empty or blank text yields an empty list
                without being handed to the parser.
        """
        if not html_str or not html_str.strip():
            return []
        html = fromstring(html_str)
        img_urls = []
        for img in html.xpath('.//img'):
            img_src = img.get('src')
            if img_src:
                img_urls.append(img_src)
        return img_urls

    def get_image_respone(self, url, data=None):
        """Download the binary content found at ``url``.

        Sleeps one second after every attempt, successful or not.

        Args:
            url: Address of the file to download.
            data: Unused; kept for interface compatibility.

        Returns:
            The response body, or ``None`` when the download failed.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        try:
            r = requests.get(url, timeout=20, stream=True, headers=headers)
            r.raise_for_status()
            body = r.content
        except Exception:
            print('图片下载失败!url: {}'.format(url))
            body = None
        time.sleep(1)
        return body

    def up_to_server(self, respone, filename):
        """Upload already-downloaded image content to the OSS bucket.

        Args:
            respone: Binary image content to store.
            filename: Object name; it is stored under ``products/``.
        """
        # Target path inside the bucket.
        web_folder = "products/" + filename
        try:
            status = self.oss.put(web_folder, respone).status
        except Exception as e:
            # Uploading stays best effort, but the failure is now reported
            # instead of being silently dropped.
            print('图片上传失败了: {!r}'.format(e))
            return
        if status != 200:
            print('图片上传失败了')

    def format_img_url(self):
        """Build a fresh OSS URL and object name for one image.

        Returns:
            A ``(aliyun_url, img_name)`` tuple; ``img_name`` is a UUID1
            based ``.jpg`` name and ``aliyun_url`` is its full URL.
        """
        img_head = 'http://website201710.oss-cn-shanghai.aliyuncs.com/products/'
        img_name = '{}.jpg'.format(uuid.uuid1())
        aliyun_url = '{}{}'.format(img_head, img_name)
        return aliyun_url, img_name

    @staticmethod
    def _status_update_sql(id_list):
        """Build the statement that sets status 2 on the given ids.

        The id list is joined by hand because formatting a Python tuple
        yields ``(5,)`` for a single id, which is not valid SQL.
        """
        joined = ', '.join(str(i) for i in id_list)
        return """update product_extend set status=2 where id in ({})""".format(joined)

    def get_tasks(self):
        """Producer loop: refill the queue while it holds under 400 items."""
        while 1:
            if self.q.qsize() < 400:
                id_list = []
                for img_dict in self.get_100_imgs():
                    # Each dict carries exactly one product id as its key.
                    id_list.append(next(iter(img_dict)))
                    self.q.put(img_dict)
                if id_list:
                    # Nothing queued means nothing to mark; an empty
                    # ``in ()`` list would be a SQL error.
                    self.db.do(self._status_update_sql(id_list))
            else:
                time.sleep(10)

    def deal_task(self):
        """Worker loop: re-host the images of queued products forever."""
        time.sleep(3)
        while 1:
            try:
                img_id_dict = self.q.get()
                _id = next(iter(img_id_dict))
                img_urls = img_id_dict[_id]
                if img_urls:
                    print('deal_id: ', _id)
                    img_dict = {}
                    for img_url in img_urls:
                        respone = self.get_image_respone(img_url)
                        if respone:
                            aliyun_url, filename = self.format_img_url()
                            self.up_to_server(respone, filename)
                            img_dict[img_url] = aliyun_url
                    update_sql = """update product_extend set source_thumb="{}" where id = {}""".format(MySQLdb.escape_string(str(img_dict)), _id)
                    self.db.do(update_sql)

            except Exception as e:
                # Queue.get() blocks and never reports "empty"; whatever
                # lands here is a processing error, so say so. The
                # five-minute back-off of the original is kept.
                print('deal_task error: {!r}'.format(e))
                time.sleep(60 * 5)

    def start(self):
        """Start one producer thread and seven worker threads."""
        thread_list = [threading.Thread(target=self.get_tasks)]
        for _ in range(7):
            thread_list.append(threading.Thread(target=self.deal_task))

        for t in thread_list:
            t.start()
コード例 #9
0
class BaiduImage:
    """Collect Baidu image-search thumbnails for stored keywords.

    One producer thread queues keywords read from ``spider_keyword``;
    worker threads query Baidu image search for each keyword, re-host up to
    ten thumbnails in the OSS bucket under ``bimages/`` and store the new
    URLs in ``baidu_image``.
    """

    def __init__(self):
        """Create both MySQL pools, the OSS client and the work queue."""
        self.save_db = StoreMysqlPool(**config.EHCO_DB)
        self.db = StoreMysqlPool(**config.CONN_DB)
        self.oss = StoreOSS(**config.EHCO_OSS)
        self.q = Queue.Queue()

    def get_image_respone(self, url):
        """Download the binary content found at ``url``.

        Sleeps one second after every attempt, successful or not.

        Args:
            url: Address of the file to download.

        Returns:
            The response body, or ``None`` when the download failed.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        try:
            r = requests.get(url, timeout=20, stream=True, headers=headers)
            r.raise_for_status()
            body = r.content
        except Exception:
            print('图片下载失败!url: {}'.format(url))
            body = None
        time.sleep(1)
        return body

    def up_to_server(self, respone, filename):
        """Upload already-downloaded image content to the OSS bucket.

        Args:
            respone: Binary image content to store.
            filename: Object name; it is stored under ``bimages/``.
        """
        # Target path inside the bucket.
        web_folder = "bimages/" + filename
        try:
            status = self.oss.put(web_folder, respone).status
        except Exception as e:
            # Uploading stays best effort, but the failure is now reported
            # instead of being silently dropped.
            print('图片上传失败了: {!r}'.format(e))
            return
        if status != 200:
            print('图片上传失败了')

    def format_img_url(self):
        """Build a fresh OSS URL and object name for one image.

        Returns:
            A ``(aliyun_url, img_name)`` tuple; ``img_name`` is a UUID1
            based ``.jpg`` name and ``aliyun_url`` is its full URL.
        """
        img_head = 'http://website201710.oss-cn-shanghai.aliyuncs.com/bimages/'
        img_name = '{}.jpg'.format(uuid.uuid1())
        aliyun_url = '{}{}'.format(img_head, img_name)
        return aliyun_url, img_name

    def get_10_images(self, url):
        """Fetch a search-result page and return up to ten thumbnail URLs.

        Args:
            url: Search URL to request.

        Returns:
            A list with at most ten ``thumbURL`` values, or ``None`` when
            the request failed (after a ten-second pause).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
        }
        try:
            r = requests.get(url, timeout=15, headers=headers)
            all_images = re.findall(r'"thumbURL":"(.*?)",', r.content)
            return all_images[:10]
        except Exception:
            time.sleep(10)
            return None

    def get_keywords_from_db(self, id_num):
        """Yield up to 100 keywords starting at row offset ``id_num``.

        Sleeps five minutes when the query returns nothing.
        """
        sql = """select `keyword` from `spider_keyword` limit {},100""".format(id_num)
        data = self.db.query(sql)
        if not data:
            # ``not data`` also covers a None result and, unlike the old
            # ``len(data) > 1`` check, does not skip a one-row batch.
            time.sleep(60 * 5)
            return
        for row in data:
            yield row[0]

    def get_tasks(self):
        """Producer loop: refill the queue while it holds under 400 items.

        The offset advances by the number of keywords actually fetched, so
        an empty read does not skip rows that are added later.
        """
        id_num = 0
        while 1:
            if self.q.qsize() < 400:
                fetched = 0
                for keyword in self.get_keywords_from_db(id_num):
                    self.q.put(keyword)
                    fetched += 1
                id_num += fetched
            else:
                time.sleep(10)

    def deal_task(self):
        """Worker loop: search, re-host and store images per keyword."""
        time.sleep(2)
        while 1:
            try:
                keyword = self.q.get()
                if keyword:
                    aliyun_image_urls = []
                    print(keyword)
                    url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&fp=result&queryWord={}&oe=utf-8&word={}'.format(keyword, keyword)
                    image_urls = self.get_10_images(url)
                    if image_urls:
                        for image in image_urls:
                            response = self.get_image_respone(image)
                            if response:
                                aliyun_url, filename = self.format_img_url()
                                self.up_to_server(response, filename)
                                aliyun_image_urls.append(aliyun_url)
                        # The keyword is escaped as well: a quote inside it
                        # would otherwise break the statement.
                        insert_sql = """insert into `baidu_image` (`keyword`,`urls`) values ('{}','{}')""".format(
                            MySQLdb.escape_string(keyword),
                            MySQLdb.escape_string(str(aliyun_image_urls).replace("'", '"')))
                        self.save_db.do(insert_sql)
                    else:
                        print('not deal',keyword)

            except Exception as e:
                # Queue.get() blocks and never reports "empty"; whatever
                # lands here is a processing error, so say so. The
                # five-minute back-off of the original is kept.
                print('deal_task error: {!r}'.format(e))
                time.sleep(60 * 5)

    def start(self):
        """Start one producer thread and eight worker threads."""
        thread_list = [threading.Thread(target=self.get_tasks)]
        for _ in range(8):
            thread_list.append(threading.Thread(target=self.deal_task))

        for t in thread_list:
            t.start()