# -*- coding: utf-8 -*-
# Python 2 snippets (print statements, Queue, xrange, MySQLdb).
# Standard-library / third-party imports used across the classes below;
# StoreOSS, StoreMysqlPool and config are project-local modules whose import
# paths are not shown in these snippets.
import base64
import re
import threading
import time
import uuid
import Queue
import HTMLParser

import MySQLdb
import requests
from lxml import etree
from lxml.html import fromstring


class GetAllInfo:

    def __init__(self, html):
        self.oss = StoreOSS(**config.EHCO_OSS)
        self.tree = fromstring(html.decode('utf-8', 'ignore'))

    def get_image_respone(self, url):
        '''Download the binary file at the given url.'''
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        try:
            r = requests.get(url, timeout=20, stream=True, headers=headers)
            r.raise_for_status()
            # print 'Image downloaded. url: {}'.format(url)
            time.sleep(1)
            return r.content
        except Exception:
            print 'Image download failed! url: {}'.format(url)
            time.sleep(1)
            return None

    def up_to_server(self, respone, filename):
        '''Upload the downloaded image bytes to the Aliyun OSS server.

        Args:
            respone:  raw image bytes
            filename: image file name
        '''
        # Target directory inside the bucket
        web_folder = "comments/" + filename
        try:
            status = self.oss.put(web_folder, respone).status
            if status != 200:
                print 'Image upload failed'
            else:
                pass
                # print filename, 'uploaded'
        except Exception:
            pass

    def format_img_url(self):
        # Build a new OSS url with a uuid-based file name
        img_head = 'http://website201710.oss-cn-shanghai.aliyuncs.com/comments/'
        img_name = '{}.jpg'.format(uuid.uuid1())
        aliyun_url = '{}{}'.format(img_head, img_name)
        return aliyun_url, img_name

    # Fetch the company logo and upload it to Aliyun
    def get_company_image(self):
        img_dict = {}
        imgs = self.tree.xpath('//*[@class="logo"]/div//img')
        if imgs:  # xpath returns an empty list when there is no logo
            img_url = imgs[0].get('src')
            response = self.get_image_respone(img_url)
            if response:
                aliyun_url, filename = self.format_img_url()
                # self.up_to_server(response, filename)
                img_dict['old_img'] = img_url
                img_dict['new_img'] = aliyun_url
        return img_dict

    # Basic company information
    def get_company_base(self):
        base_dict, base_list = {}, []
        company_name = self.tree.xpath(
            '//*[@id="company-top"]/div/div[2]/div[1]/text()')[0].strip()
        base_list.append(company_name)
        company_base = self.tree.xpath(
            '//*[@id="company-top"]/div/div[2]//span//text()')
        company_base = list(
            filter(lambda x: x, map(lambda x: x.strip(), company_base)))
        for i, each in enumerate(company_base):
            # The literals are the Chinese page labels for phone/email/website/address
            if any(x in each for x in ['电话', '邮箱', '官网', '地址']):
                base_list.append(company_base[i + 1])
        for name, v in zip(['name', 'tel', 'email', 'website', 'address'],
                           base_list):
            base_dict[name] = v
        return base_dict

    # Detailed registration / business information
    def get_company_detail(self):
        detail_dict = {}
        all_name = [
            'register_capital', 'pay_capital', 'operate_state',
            'establish_date', 'register_number', 'organization_code',
            'taxpayer_number', 'social_code', 'company_type', 'industry',
            'approval_date', 'register_authority', 'affiliated_area',
            'english_name', 'used_name', 'operate_mode', 'personnel_scale',
            'business_term', 'scope_operation'
        ]
        all_company_detail = self.tree.xpath(
            '//*[@id="Cominfo"]/table[2]//tr//text()')
        all_company_detail = list(
            filter(lambda x: x, map(lambda x: x.strip(), all_company_detail)))
        del all_company_detail[-5:-1]
        for name, v in zip(all_name, all_company_detail[1::2]):
            detail_dict[name] = v
        return detail_dict

    # Shareholder information
    def get_shareholder_detail(self):
        all_shareholder, all_shareholder_dict = [], {}
        all_shareholder_detail = self.tree.xpath(
            '//*[@id="Sockinfo"]/table//tr//text()')
        all_shareholder_detail = list(
            filter(lambda x: x,
                   map(lambda x: x.strip(), all_shareholder_detail)))[5:]
        # Each shareholder row spans six text cells
        all_shareholder_detail = [
            all_shareholder_detail[i:i + 6]
            for i in xrange(0, len(all_shareholder_detail), 6)
        ]
        for each in all_shareholder_detail:
            del each[1]
            all_shareholder.append(each)
        all_shareholder_dict['shareholder'] = all_shareholder
        return all_shareholder_dict

    # Key personnel information
    def get_main_member(self):
        all_members, all_members_dict = [], {}
        all_main_member = self.tree.xpath(
            '//*[@id="Mainmember"]/table//tr//text()')
        all_main_member = list(
            filter(lambda x: x,
                   map(lambda x: x.strip(), all_main_member)))[3:][1::2]
        for name, v in zip(all_main_member[::2], all_main_member[1::2]):
            all_members.append((name, v))
        all_members_dict['member'] = all_members
        return all_members_dict

    @property
    def get_detail(self):
        # Merge all partial dicts into one result
        all_detail = {}
        all_detail.update(self.get_company_image())
        all_detail.update(self.get_company_base())
        all_detail.update(self.get_company_detail())
        all_detail.update(self.get_shareholder_detail())
        all_detail.update(self.get_main_member())
        return all_detail
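# A minimal usage sketch for GetAllInfo, assuming config.EHCO_OSS is configured
# and that page_url points at a company-detail page whose markup matches the
# XPath expressions above. The URL below is a placeholder and
# fetch_company_detail is a hypothetical helper, not part of the class.
def fetch_company_detail(page_url):
    headers = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(page_url, timeout=20, headers=headers)
    r.raise_for_status()
    info = GetAllInfo(r.content)   # expects raw bytes; decoded in __init__
    return info.get_detail         # property that merges all the partial dicts

detail = fetch_company_detail('http://example.com/company/12345')
for key, value in detail.items():
    print '{}: {}'.format(key, value)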
# Constructor fragment from ToutiaoExtractor (the rest of that class is not
# shown in these snippets)
    def __init__(self):
        super(ToutiaoExtractor, self).__init__()
        self.html_parser = HTMLParser.HTMLParser()
        self.oss = StoreOSS(**config.EHCO_OSS)
class BaiduImage:

    def __init__(self):
        self.db = StoreMysqlPool(**config.CONN_DB)
        self.oss = StoreOSS(**config.EHCO_OSS)
        self.q = Queue.Queue()

    def get_image_respone(self, url):
        '''Download the binary file at the given url.'''
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        try:
            r = requests.get(url, timeout=20, stream=True, headers=headers)
            r.raise_for_status()
            print 'Image downloaded. url: {}'.format(url)
            time.sleep(1)
            return r.content
        except Exception:
            # print 'Image download failed! url: {}'.format(url)
            time.sleep(1)
            return None

    def up_to_server(self, respone, filename):
        '''Upload the downloaded image bytes to the Aliyun OSS server.

        Args:
            respone:  raw image bytes
            filename: image file name
        '''
        # Target directory inside the bucket
        web_folder = "comments/" + filename
        try:
            status = self.oss.put(web_folder, respone).status
            if status != 200:
                print 'Image upload failed'
            else:
                pass
                # print filename, 'uploaded'
        except Exception:
            pass

    def format_img_url(self):
        img_head = 'http://website201710.oss-cn-shanghai.aliyuncs.com/comments/'
        img_name = '{}.jpg'.format(uuid.uuid1())
        aliyun_url = '{}{}'.format(img_head, img_name)
        return aliyun_url, img_name

    def strip_img(self, html):
        '''Re-point every <img> in a comment fragment at its OSS copy;
        drop images that cannot be downloaded.'''
        try:
            tree = fromstring(html.decode('utf-8'))
            imgs = tree.xpath('.//img')
            for img in imgs:
                img_src = img.get('src')
                response = self.get_image_respone(img_src)
                if response:
                    aliyun_url, filename = self.format_img_url()
                    img.set('src', aliyun_url)
                    self.up_to_server(response, filename)
                else:
                    img.getparent().remove(img)
            content = etree.tostring(tree, encoding='utf-8',
                                     method='html').strip()
            # Strip the leading '<div>' and trailing '</div>' that fromstring
            # adds around mixed fragments
            return content[5:-6]
        except Exception:
            pass

    def get_all_id_content(self, id_num=0):
        sql = """select id,content from comments limit {},500""".format(id_num)
        data = self.db.query(sql)
        if data:
            for row in data:
                _id = row[0]
                content = row[1]
                yield (_id, content)
        else:
            time.sleep(60 * 5)

    def get_tasks(self):
        # Producer: keep the queue topped up with (id, content) rows
        while 1:
            if self.q.qsize() < 400:
                print("get_tasks")
                for each in self.get_all_id_content():
                    self.q.put(each)
            else:
                time.sleep(60 * 5)

    @staticmethod
    def find_img(s):
        pattern = re.compile(r'src="(.*?)"')
        return re.search(pattern, s)

    def deal_task(self):
        # Consumer: rewrite image links and store the result base64-encoded
        time.sleep(2)
        while 1:
            try:
                id_content = self.q.get()
                _id = id_content[0]
                html = id_content[1]
                if self.find_img(html):
                    content = self.strip_img(html)
                    update_sql = """update `comments` set content="{}" where id = {}""".format(
                        MySQLdb.escape_string(base64.b64encode(str(content))),
                        _id)
                    self.db.do(update_sql)
                    print("insert: {}".format(_id))
                else:
                    update_sql = """update `comments` set content="{}" where id = {}""".format(
                        MySQLdb.escape_string(base64.b64encode(str(html))),
                        _id)
                    self.db.do(update_sql)
            except Exception:
                print('queue is empty!')
                time.sleep(60 * 5)

    def start(self):
        # One producer thread plus ten consumer threads
        thread_list = []
        thread_list.append(threading.Thread(target=self.get_tasks))
        for i in range(10):
            t = threading.Thread(target=self.deal_task)
            thread_list.append(t)
        for t in thread_list:
            t.start()
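# Hedged sketch of what strip_img does to a single comment fragment: every
# <img> whose source can be downloaded is re-pointed at its OSS copy, and the
# rest are removed. The fragment and its image URL below are made up; real
# input comes from the `comments` table via get_all_id_content.
worker = BaiduImage()   # assumes config.CONN_DB and config.EHCO_OSS are set up
fragment = '<div>hello <img src="http://example.com/a.jpg"> world</div>'
if BaiduImage.find_img(fragment):
    rewritten = worker.strip_img(fragment)
    print(rewritten)    # the <img src> now points at .../comments/<uuid>.jpg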
class Image_to_server(object):

    def __init__(self):
        self.oss = StoreOSS(**config.EHCO_OSS)

    def get_image_respone(self, url, headers=None, data=None):
        '''Download the binary file at the given url.'''
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        try:
            r = requests.get(url, timeout=30, stream=True, headers=headers)
            r.raise_for_status()
            print 'Image downloaded. url: {}'.format(url)
            time.sleep(1)
            return r.content
        except Exception:
            print 'Image download failed! url: {}'.format(url)
            time.sleep(1)
            return None  # was -1, which is truthy and defeated the caller's check

    @staticmethod
    def encoding(data):
        # Try common encodings and return the first successful decode
        types = ['utf-8', 'gb2312', 'gbk', 'gb18030', 'iso-8859-1']
        for t in types:
            try:
                return data.decode(t)
            except Exception:
                pass
        return None

    @staticmethod
    def save_file(content):
        with open('text.jpg', 'wb') as f:
            f.write(content)

    def get_new_url(self):
        '''Return the image's new address on OSS.'''
        image_head = "http://website201710.oss-cn-shanghai.aliyuncs.com/p2p/"
        file_name = '{}.jpg'.format(uuid.uuid1())
        new_url = image_head + file_name
        return new_url

    def up_to_server(self, url, filename):
        '''Download the original image and upload it to the Aliyun OSS server.

        Args:
            url:      source address of the image
            filename: image file name
        '''
        # Target directory inside the bucket
        web_folder = "p2p/" + filename
        # Raw bytes of the image
        img_content = self.get_image_respone(url)
        if img_content:
            try:
                status = self.oss.put(web_folder, img_content).status
                if status != 200:
                    print 'Image upload failed'
                else:
                    print filename, 'uploaded'
            except Exception:
                pass
        else:
            print("deal_response_image", url)
            time.sleep(1)
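# Hedged usage sketch for Image_to_server: mirror a single image under the
# p2p/ prefix. The source URL is a placeholder; the bucket host comes from the
# hard-coded image_head in get_new_url above. get_new_url only returns the
# url, so the file name is recovered by splitting it.
uploader = Image_to_server()               # assumes config.EHCO_OSS is set up
new_url = uploader.get_new_url()           # .../p2p/<uuid>.jpg
filename = new_url.rsplit('/', 1)[-1]      # keep only "<uuid>.jpg"
uploader.up_to_server('http://example.com/pic.jpg', filename)
print 'image mirrored to {}'.format(new_url)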
class up2Aliyun:

    def __init__(self):
        self.db = StoreMysqlPool(**config.CONN_DB)
        self.oss = StoreOSS(**config.EHCO_OSS)
        self.q = Queue.Queue()

    def get_100_imgs(self):
        # Fetch 100 unprocessed products and collect every image url from the
        # thumb field and the detail html
        sql = """SELECT id,thumb,detail from products where id in (select id from(select id from product_extend where STATUS=0 limit 100)tmp)"""
        data = self.db.query(sql)
        if len(data) > 1:
            for row in data:
                img_dict = {}
                _id = int(row[0])
                img_url = row[1]
                img_split_urls = set(img_url.split(';'))
                detail = row[2].replace('\\"', '')
                img_urls = self.get_imgs_src_from_detail(detail)
                img_urls.extend(img_split_urls)
                img_urls = list(filter(lambda x: x, img_urls))
                img_dict[_id] = img_urls
                yield img_dict
        else:
            time.sleep(60 * 5)

    @staticmethod
    def get_imgs_src_from_detail(html_str):
        img_urls = list()
        html = fromstring(html_str)
        imgs = html.xpath('.//img')
        if imgs:
            for img in imgs:
                img_src = img.get('src')
                # Skip empty src values that are not image links
                if img_src:
                    img_urls.append(img_src)
        return img_urls

    def get_image_respone(self, url, data=None):
        '''Download the binary file at the given url.'''
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        try:
            r = requests.get(url, timeout=20, stream=True, headers=headers)
            r.raise_for_status()
            time.sleep(1)
            return r.content
        except Exception:
            print 'Image download failed! url: {}'.format(url)
            time.sleep(1)
            return None

    def up_to_server(self, respone, filename):
        '''Upload the downloaded image bytes to the Aliyun OSS server.

        Args:
            respone:  raw image bytes
            filename: image file name
        '''
        # Target directory inside the bucket
        web_folder = "products/" + filename
        try:
            status = self.oss.put(web_folder, respone).status
            if status != 200:
                print 'Image upload failed'
            else:
                pass
                # print filename, 'uploaded'
        except Exception:
            pass

    def format_img_url(self):
        img_head = 'http://website201710.oss-cn-shanghai.aliyuncs.com/products/'
        img_name = '{}.jpg'.format(uuid.uuid1())
        aliyun_url = '{}{}'.format(img_head, img_name)
        return aliyun_url, img_name

    def get_tasks(self):
        # Producer: enqueue {id: [img_urls]} dicts and mark the rows as taken
        while 1:
            if self.q.qsize() < 400:
                id_list = []
                for img_dict in self.get_100_imgs():
                    id_list.append(img_dict.keys()[0])
                    self.q.put(img_dict)
                update_sql = """update product_extend set status=2 where id in {}""".format(
                    tuple(id_list))
                self.db.do(update_sql)
            else:
                time.sleep(10)

    def deal_task(self):
        # Another thread may grab the item between the check and q.get(),
        # which would raise and kill this thread, so wrap the loop body in
        # try/except
        time.sleep(3)
        while 1:
            try:
                img_id_dict = self.q.get()
                _id = img_id_dict.keys()[0]
                img_urls = img_id_dict[_id]
                if img_urls:
                    print('deal_id: ', _id)
                    img_dict = {}
                    for img_url in img_urls:
                        respone = self.get_image_respone(img_url)
                        if respone:
                            aliyun_url, filename = self.format_img_url()
                            self.up_to_server(respone, filename)
                            img_dict[img_url] = aliyun_url
                    update_sql = """update product_extend set source_thumb="{}" where id = {}""".format(
                        MySQLdb.escape_string(str(img_dict)), _id)
                    self.db.do(update_sql)
            except Exception:
                print('Queue is empty, waiting...')
                time.sleep(60 * 5)

    def start(self):
        # One producer thread plus seven consumer threads
        thread_list = []
        thread_list.append(threading.Thread(target=self.get_tasks))
        for i in range(7):
            t = threading.Thread(target=self.deal_task)
            thread_list.append(t)
        for t in thread_list:
            t.start()
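# Hedged sketch of the static helper in up2Aliyun: it pulls <img src> values
# out of a product detail blob before deal_task mirrors them. The HTML below
# is made up.
detail_html = '<p><img src="http://example.com/1.jpg"><img src="http://example.com/2.jpg"></p>'
srcs = up2Aliyun.get_imgs_src_from_detail(detail_html)
print(srcs)   # ['http://example.com/1.jpg', 'http://example.com/2.jpg']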
class BaiduImage:

    def __init__(self):
        self.save_db = StoreMysqlPool(**config.EHCO_DB)
        self.db = StoreMysqlPool(**config.CONN_DB)
        self.oss = StoreOSS(**config.EHCO_OSS)
        self.q = Queue.Queue()

    def get_image_respone(self, url):
        '''Download the binary file at the given url.'''
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        try:
            r = requests.get(url, timeout=20, stream=True, headers=headers)
            r.raise_for_status()
            time.sleep(1)
            return r.content
        except Exception:
            print 'Image download failed! url: {}'.format(url)
            time.sleep(1)
            return None

    def up_to_server(self, respone, filename):
        '''Upload the downloaded image bytes to the Aliyun OSS server.

        Args:
            respone:  raw image bytes
            filename: image file name
        '''
        # Target directory inside the bucket
        web_folder = "bimages/" + filename
        try:
            status = self.oss.put(web_folder, respone).status
            if status != 200:
                print 'Image upload failed'
            else:
                pass
                # print filename, 'uploaded'
        except Exception:
            pass

    def format_img_url(self):
        img_head = 'http://website201710.oss-cn-shanghai.aliyuncs.com/bimages/'
        img_name = '{}.jpg'.format(uuid.uuid1())
        aliyun_url = '{}{}'.format(img_head, img_name)
        return aliyun_url, img_name

    def get_10_images(self, url):
        # Scrape the first ten thumbnail urls from a Baidu image-search response
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
            }
            r = requests.get(url, timeout=15, headers=headers)
            content = r.content
            pattern = re.compile(r'"thumbURL":"(.*?)",')
            all_images = re.findall(pattern, content)
            return all_images[:10]
        except Exception:
            time.sleep(10)

    def get_keywords_from_db(self, id_num):
        sql = """select `keyword` from `spider_keyword` limit {},100""".format(id_num)
        data = self.db.query(sql)
        if len(data) > 1:
            for row in data:
                keyword = row[0]
                yield keyword
        else:
            time.sleep(60 * 5)

    def get_tasks(self):
        # Producer: page through spider_keyword 100 rows at a time
        id_num = 0
        while 1:
            if self.q.qsize() < 400:
                for keyword in self.get_keywords_from_db(id_num):
                    self.q.put(keyword)
                id_num += 100
            else:
                time.sleep(10)

    def deal_task(self):
        # Consumer: search Baidu images for the keyword, mirror the first ten
        # results to OSS and record the new urls
        time.sleep(2)
        while 1:
            try:
                keyword = self.q.get()
                if keyword:
                    aliyun_image_urls = []
                    print(keyword)
                    url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&fp=result&queryWord={}&oe=utf-8&word={}'.format(
                        keyword, keyword)
                    image_urls = self.get_10_images(url)
                    if image_urls:
                        for image in image_urls:
                            response = self.get_image_respone(image)
                            if response:
                                aliyun_url, filename = self.format_img_url()
                                self.up_to_server(response, filename)
                                aliyun_image_urls.append(aliyun_url)
                        insert_sql = """insert into `baidu_image` (`keyword`,`urls`) values ('{}','{}')""".format(
                            keyword,
                            MySQLdb.escape_string(
                                str(aliyun_image_urls).replace("'", '"')))
                        self.save_db.do(insert_sql)
                    else:
                        print('not deal', keyword)
            except Exception:
                print('queue is empty!')
                time.sleep(60 * 5)

    def start(self):
        # One producer thread plus eight consumer threads
        thread_list = []
        thread_list.append(threading.Thread(target=self.get_tasks))
        for i in range(8):
            t = threading.Thread(target=self.deal_task)
            thread_list.append(t)
        for t in thread_list:
            t.start()
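# Hedged entry-point sketch. The worker classes above share the same
# producer/consumer shape: start() launches one get_tasks thread that feeds
# the Queue and several deal_task threads that drain it. Assumes
# config.CONN_DB, config.EHCO_DB and config.EHCO_OSS are configured and the
# referenced tables exist.
if __name__ == '__main__':
    crawler = BaiduImage()   # the keyword-to-Baidu-images worker defined above
    crawler.start()          # 1 producer thread + 8 download/upload threads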