def download_pic(tel: str):
	"""Download the phone-number image for *tel* from 114best, binarize it,
	and save it as a black/white PNG.

	:param tel: phone number to query
	:return: path of the saved PNG, or None when the image is missing or
		any step (network, decode, filesystem) fails
	"""
	url = 'http://www.114best.com/dh/114.aspx?w=' + tel
	bsObj = spider_util.open_url_return_bsobj(url, 5, 20,
											  from_encoding='gbk')
	try:
		img_tag = bsObj.select_one('#span_gsd img')
		if img_tag is None:
			return None
		src = img_tag.get('src')
		gif_link = urljoin(url, src)
		local_filename = 'D:\\tmp\\wj\\temp.gif'
		spider_util.download(gif_link, local_filename, 3, 10)
		# Context manager guarantees the handle is released even when
		# Pillow raises mid-decode (the original leaked `fp` on error).
		with open(local_filename, 'rb') as fp:
			img = Image.open(fp)
			gray = img.convert('L')  # grayscale first, then threshold
			threshold = 200
			# Lookup table: pixels below the threshold map to 0 (black),
			# the rest to 1 (white).
			table = [0 if i < threshold else 1 for i in range(256)]
			# Binarize the image (was: 图片二值化)
			photo = gray.point(table, '1')
			uuid_str = str(uuid.uuid1())
			black_pic = 'D:\\tmp\\wj\\temppng\\' + uuid_str + '.png'
			photo.save(black_pic)
		os.remove(local_filename)  # delete the temporary gif
		return black_pic
	except Exception as e:
		# Best-effort scraper: report and signal failure with None.
		print('发生异常行为', repr(e))
		return None
コード例 #2
0
def doRequest(StockCode=''):
	"""Fetch the financial-analysis page for *StockCode* and feed each
	table on it to ``table_handle``.

	:param StockCode: stock code appended to the askci.com URL
	"""
	url = f'http://s.askci.com/stock/financialanalysis/{StockCode}'
	bsobj = spider_util.open_url_return_bsobj(url)
	title_tags = bsobj.select('.right_f_c_tltie')
	table_tags = bsobj.select('.right_f_d_table.mg_tone table')
	# zip() pairs titles with tables and stops at the shorter list; the
	# original indexed table_tags[i] and raised IndexError when the page
	# had fewer tables than titles. (The title text itself was extracted
	# but never used, so it is not computed here.)
	for i, (_title_tag, table_tag) in enumerate(zip(title_tags, table_tags)):
		table_handle(table_tag, i, StockCode)
コード例 #3
0
 def open(self,
          url: str,
          func,
          self_rotation=5,
          timeout=5,
          data=None,
          from_encoding="utf-8"):
     """Fetch *url* as a parsed page object and apply *func* to it.

     :param url: address to request
     :param func: callable applied to the parsed page
     :param self_rotation: retry/rotation value forwarded to spider_util
         (presumably a retry count — confirm against spider_util)
     :param timeout: request timeout in seconds
     :param data: optional POST payload
     :param from_encoding: encoding hint for the parser
     :return: whatever *func* returns
     """
     bsobj = spider_util.open_url_return_bsobj(url, self_rotation, timeout,
                                               data, from_encoding)
     return func(bsobj)
コード例 #4
0
def query_mobile_phone_location(tel: str):
    """Query zou114.com for the location and card type of mobile *tel*.

    :param tel: mobile phone number
    :return: location + card-type string, or '' when the page layout
        does not match
    """
    url = 'http://www.zou114.com/shouji/?mobile=' + tel
    bsObj = spider_util.open_url_return_bsobj(
        url, 5, 20, from_encoding='gbk'
    )  # 20s timeout; the page declares gb2312 but uses gbk characters, which
    # would make BeautifulSoup mis-detect the encoding as windows-1252
    div = bsObj.select_one('.nrbnr')
    if div is None:
        # select_one returns None on no match; the original would have
        # raised AttributeError here.
        return ''
    tags = div.find_all('font', {'color': 'red'})
    # find_all() returns a (possibly empty) list, never None, so the
    # original `is not None` check always passed and tags[0]/tags[1]
    # could raise IndexError on an unexpected layout.
    if len(tags) >= 2:
        return tags[0].get_text() + ' ;卡类型:' + tags[1].get_text()
    return ''
コード例 #5
0
def query_telphone_location_114best(tel: str):
    """Query 114best.com for the location of landline/phone number *tel*.

    :param tel: phone number to query
    :return: concatenated location text (leading-space separated), '' when
        nothing matched
    """
    url = 'http://www.114best.com/dh/114.aspx?w=' + tel
    bsObj = spider_util.open_url_return_bsobj(
        url, 5, 20, from_encoding='gbk'
    )  # 20s timeout; the page declares gb2312 but uses gbk characters, which
    # would make BeautifulSoup mis-detect the encoding as windows-1252
    tags = bsObj.find_all('font', {'size': 4})
    # BUG FIX: find_all() returns an empty list on no match, never None,
    # so the original `is None` test could never trigger the fallback
    # selector. Test truthiness instead.
    if not tags:
        tags = bsObj.find_all('font', {'color': '#008080'})
    text = ''
    for tag in tags:
        text = text + ' ' + tag.get_text().strip()
    return text
コード例 #6
0
ファイル: Producer.py プロジェクト: w341000/PythonTheWord
    def request_by_urllib(self,
                          url: str,
                          func,
                          self_rotation=5,
                          timeout=5,
                          data=None,
                          from_encoding="utf-8"):
        """Fetch *url* via urllib as a parsed page, process it with the
        caller-supplied *func*, and append the result to ``self.data``.

        :param url: address to request
        :param func: callable applied to the parsed page
        :param self_rotation: retry/rotation value forwarded to spider_util
            (presumably a retry count — confirm against spider_util)
        :param timeout: request timeout in seconds
        :param data: optional POST payload
        :param from_encoding: encoding hint for the parser
        :return: None — the processed value is stored in ``self.data``
        """
        bsobj = spider_util.open_url_return_bsobj(url, self_rotation,
                                                  timeout, data,
                                                  from_encoding)
        self.data.append(func(bsobj))
コード例 #7
0
def query_location2(df: DataFrame):
    """Fill the '号码归属地' column of *df* by querying 00cha.com for each
    row's '格式化电话' value. Modifies *df* in place.

    :param df: DataFrame with a '格式化电话' column
    """
    df['号码归属地'] = None
    length = len(df)
    for i in range(length):
        format_tel = df.at[i, '格式化电话']
        if format_tel is None or format_tel == '':
            continue
        url = 'https://www.00cha.com/114.asp?t=' + format_tel
        bsObj = spider_util.open_url_return_bsobj(
            url, 5, 20, from_encoding='gbk'
        )  # 20s timeout; the page declares gb2312 but uses gbk characters, which
        # would make BeautifulSoup mis-detect the encoding as windows-1252
        tags = bsObj.find_all('font', {'size': 4})
        # BUG FIX: find_all() returns an empty list, never None, so the
        # original `is None` test could never trigger the fallback selector.
        if not tags:
            tags = bsObj.find_all('font', {'color': '#008080'})
        # BUG FIX: the original initialized text = None and then computed
        # `None + ' ' + ...`, raising TypeError on the first matched tag.
        # Join the parts instead; stay None when nothing matched (as the
        # original's no-match path did).
        parts = [tag.get_text().strip() for tag in tags]
        text = ' ' + ' '.join(parts) if parts else None
        if format_tel.startswith('0769'):
            text = '广东 东莞'  # 0769 area code override
        df.at[i, '号码归属地'] = text
        spider_util.log_progress(i, length)
コード例 #8
0
def _parse_salary(salary: str):
	"""Split a 51job salary string such as '1-1.5万/月' into its parts.

	:param salary: raw salary text; parsed only when it contains '/'
	:return: (money_toplimit, money_lowerlimit, money_unit, time_unit),
		all None when *salary* has no '/' separator
	"""
	money_toplimit = None
	money_lowerlimit = None
	money_unit = None
	time_unit = None
	if '/' in salary:
		money_range = salary.split('/')[0]
		money_unit = money_range[-1]  # last char is the unit, e.g. '万'
		money_range = money_range[:-1]
		# Default both bounds to the whole range for single-value salaries.
		money_toplimit = money_range
		money_lowerlimit = money_range
		if '-' in money_range:  # split salary lower/upper bound
			money_lowerlimit = money_range.split('-')[0]
			money_toplimit = money_range.split('-')[1]
		time_unit = salary.split('/')[1]
	return money_toplimit, money_lowerlimit, money_unit, time_unit


def get_list():
	"""Crawl pages 1..1999 of 51job search results, dedupe job URLs via a
	Redis set, and return the newly-seen postings as a DataFrame.

	:return: DataFrame with one row per newly-seen job posting
	"""
	data = []
	r = RedisUtil().get_redis_instance()
	for i in range(1, 2000):
		url = 'https://search.51job.com/list/040000,000000,0000,00,9,99,%2520,2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format(
			page=i)
		# head = ['职位名', '公司名', '工作地点', '薪资', '发布时间', '职位详细URL']
		bsobj = spider_util.open_url_return_bsobj(url)
		div_tags = bsobj.select('#resultList .el')[1:]  # skip the header row
		for div in div_tags:
			job = div.select_one('a').get_text().strip()
			job_url = div.select_one('a').get('href')
			# sadd returns 0 when job_url is already in the set: seen before.
			redis_result = r.sadd(redis_job_set, job_url)
			if redis_result == 0:
				continue
			company = div.select_one('.t2 a').get_text().strip()
			address = div.select_one('.t3').get_text().strip()
			salary = div.select_one('.t4').get_text().strip()
			money_toplimit, money_lowerlimit, money_unit, time_unit = \
				_parse_salary(salary)
			push_time = div.select_one('.t5').get_text().strip()
			item = {'职位名': job, '公司名': company, '工作地点': address, '薪资': salary, '发布时间': push_time, '职位详细URL': job_url,
					'金额上限': money_toplimit, '金额下限': money_lowerlimit, '时间单位': time_unit, '金额单位': money_unit}
			data.append(item)
		spider_util.log_progress(i, 2000, start_from_zero=False)

	return DataFrame(data)