import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()
    for demand in demands:
        try:
            task_id = demand["data-taskid"]
            detail_url = "https://task.zbj.com/" + task_id
            detail_html = requests.get(detail_url)
            soup = BeautifulSoup(detail_html.text, 'lxml')
            content = soup.find("div", attrs={"class": "demand-content J-show-hide hide-more"})
            # Publisher and publish time
            user_and_publishtime = content.find("div", attrs={"class": "order-attr"}).get_text()
            # Budget, with the trailing "元" stripped
            amt = content.find("span", attrs={"class": "orange-color"}).get_text()
            amt = str(amt).replace('元', '')
            # Requirement details
            els = content.find_all("p", attrs={"class": "clearfix"})
            details = [el.find("span", attrs={"class": "description"}).get_text() for el in els]
            result = mysql_client.insert(detail_url, amt, user_and_publishtime, details,
                                         key_word, webtype, "zbj-" + task_id)
            if result == -1:
                return result
        except Exception as e:
            print(str(e))
            print("error:https://task.zbj.com/" + demand["data-taskid"])
    mysql_client.destory()
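# A minimal, hypothetical driver for the zbj.com parser above: it only shows how a
# `demands` list could be collected and handed to parse(). The listing URL argument
# and the way result tags are located are illustrative assumptions, not the
# project's actual entry point.
def crawl_zbj(listing_url, key_word, webtype="zbj"):
    listing_html = requests.get(listing_url, timeout=3)
    soup = BeautifulSoup(listing_html.text, 'lxml')
    # parse() only relies on each entry exposing a data-taskid attribute,
    # so any tag carrying that attribute can serve as a demand element.
    demands = soup.find_all(attrs={"data-taskid": True})
    parse(demands, key_word, webtype)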
import datetime
import re

import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()
    for demand in demands:
        try:
            key_word = demand.find("span", attrs={"class": "skill"}).get_text().strip()
            demand = demand.find("h4", attrs={"class": "media-heading"}).find("a")
            detail_url = demand['href']
            task_id = str(detail_url).replace("/projects/", "", 1)
            detail_url = "http://www.taskcity.com/projects/" + task_id
            detail_html = requests.get(detail_url)
            soup = BeautifulSoup(detail_html.text, 'lxml')
            con = soup.find("div", attrs={"class": "col-sm-7"})
            # Publisher (masked)
            publish_user = "******"
            left = con.find("div", attrs={"class": "pull-left space-right-30"}).find_all("div")
            # Budget
            amt = left[1].find("b").get_text().strip()
            # Task title
            title = soup.find(id="project_title_name").get_text().strip()
            # Task description: the block following the "描述" heading
            pattern = re.compile("描述")
            element = soup.find('h3', text=pattern).parent
            content = element.find_all("div", recursive=False)[3].get_text().strip()
            # Publish time: extract the date after the "发布日期" label
            pattern = re.compile("发布日期")
            user_info_action = soup.find('div', text=pattern).get_text()
            mat = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", user_info_action)
            publish_time = str(mat.group(0))
            details = [title, '0', content]
            nowTime_str = datetime.datetime.now().strftime('%H:%M')
            result = mysql_client.insert(detail_url, amt,
                                         publish_user + "发布于" + publish_time + " " + nowTime_str,
                                         details, key_word, webtype, "taskcity-" + task_id)
            if result == -1:
                return result
        except Exception as e:
            print(str(e))
            print("error:" + demand["href"])
    mysql_client.destory()
import datetime

import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()
    for demand in demands:
        try:
            detail_url = demand["href"]
            task_id = str(detail_url).replace("http://www.epwk.com/task/", "", 1).replace("/", "", 1)
            detail_html = requests.get(detail_url)
            soup = BeautifulSoup(detail_html.text, 'lxml')
            con = soup.find("div", attrs={"class": "tasktopdet"})
            # Publisher
            publish_user = con.find("div", attrs={"class": "task-user-header"}).find("span").get_text()
            # Budget
            amt = con.find("div", attrs={"class": "task_user_info"}).find(
                "span", attrs={"class": "nummoney f_l"}).find("span").get_text()
            # Task title
            title = con.find("div", attrs={"class": "task_user_info"}).find("h1").get_text()
            # Task description
            content = con.find("div", attrs={"class": "task-info-content"}).get_text()
            # Publish time
            user_info_action = con.find("div", attrs={"class": "task-user-info-action"}).find_all(
                "span", attrs={"class": "dib_vm"})
            publish_time = user_info_action[0].get_text()
            details = [title, '0', content]
            nowTime_str = datetime.datetime.now().strftime('%H:%M')
            result = mysql_client.insert(detail_url, amt,
                                         publish_user + "发布于" + publish_time + " " + nowTime_str,
                                         details, key_word, webtype, "epwk-" + task_id)
            if result == -1:
                return result
        except Exception as e:
            print(str(e))
            print("error:" + demand["href"])
    mysql_client.destory()
from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()
    for demand in demands:
        detail_url = "https://www.mayigeek.com/tab/projectDetail?id=" + str(demand["id"])
        try:
            # Publisher (masked)
            publish_user = "******"
            # Budget
            amt = str(demand['min_price'])
            # Task title
            title = demand['name']
            # Task description
            content = demand['desc']
            # Publish time
            publish_time = demand['create_date']
            key_word = ",".join(demand['typeStr'])
            details = [title, '0', content]
            result = mysql_client.insert(detail_url, amt, publish_user + "发布于" + publish_time,
                                         details, key_word, webtype, "mayigeek-" + str(demand["id"]))
            if result == -1:
                return result
        except Exception as e:
            print(str(e))
            print("error:" + detail_url)
    mysql_client.destory()
import datetime

import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()
    for demand in demands:
        try:
            demand = demand.find("a")
            detail_url = demand["href"]
            task_id = str(detail_url).replace("/project/show/", "", 1)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Host': 'www.sxsoft.com',
                'Referer': 'https://www.sxsoft.com/page/project',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
            }
            detail_url = "https://www.sxsoft.com" + detail_url
            detail_html = requests.get(detail_url, params=key_word, headers=headers, timeout=3)
            soup = BeautifulSoup(detail_html.text, 'lxml')
            con = soup.find("div", attrs={"class": "bg-color-f5 clearfix"}).find("div", attrs={"class": "container"})
            # Publisher
            publish_user = con.find("div", attrs={"class": "owner clearfix"}).find(
                "span", attrs={"class": "green"}).get_text().strip()
            project_msgs = con.find("div", attrs={"class": "project-msg clearfix"}).find_all(
                "div", attrs={"class": "col-sm-3"})
            # Project category
            key_word = project_msgs[0].find("p", attrs={"class": "cat-name"}).get_text().strip()
            # Budget
            amt = project_msgs[1].find("p", attrs={"class": "cat-name"}).get_text().strip()
            # Task title
            title = con.find("h1", attrs={"class": "project-title"})["title"]
            # Task description
            content = con.find("div", attrs={"class": "project-content clearfix"}).get_text().strip()
            # Publish time
            user_info_action = con.find("div", attrs={"class": "owner clearfix"}).find_all("span")
            publish_time = user_info_action[1].get_text().strip()
            details = [title, '0', content]
            nowTime_str = datetime.datetime.now().strftime('%H:%M')
            result = mysql_client.insert(detail_url, amt,
                                         publish_user + "发布于" + publish_time + " " + nowTime_str,
                                         details, key_word, webtype, "sxsoft-" + task_id)
            if result == -1:
                return result
        except Exception as e:
            print(str(e))
            print("error:" + demand["href"])
    mysql_client.destory()
import json

import jsonpath
import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()
    for demand in demands:
        # JSON detail endpoint, plus the user-facing page URL stored in the database
        detail_url = "https://zb.oschina.net/project/detail?id=" + str(demand["id"])
        urla = 'https://zb.oschina.net/project/detail.html?id=' + str(demand["id"])
        headers = {
            'Accept': 'application/json',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Host': 'zb.oschina.net',
            'Referer': 'https://zb.oschina.net/projects/list.html',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        strhtml = requests.get(detail_url, params=key_word, headers=headers, timeout=3)
        c = strhtml.content
        unicodestr = json.loads(c)
        res = jsonpath.jsonpath(unicodestr, '$.data')[0]
        try:
            # Publisher (masked)
            publish_user = "******"
            # Project number
            projectNo = res['projectNo']
            # Budget range; a max of 0 means the price is negotiable
            amt_min = res['budgetMinByYuan']
            amt_max = res['budgetMaxByYuan']
            if str(amt_max) != '0':
                amt = str(amt_min) + "-" + str(amt_max)
            else:
                amt = "价格面议"
            # Task title
            title = res['name']
            print(title)
            # Task description: strip the HTML markup from the prd field
            content = res['prd']
            soup = BeautifulSoup(str(content), 'lxml')
            content = soup.get_text().strip()
            # Publish time
            publish_time = res['publishTime']
            details = [title, '0', content]
            result = mysql_client.insert(urla, amt, publish_user + "发布于" + publish_time,
                                         details, key_word, webtype, "oschina-" + projectNo)
            if result == -1:
                print("过期数据不处理", urla)
                continue
        except Exception as e:
            print(str(e))
    mysql_client.destory()
# coding:utf-8
import datetime
import os

from xlwt import Workbook

from com.parttimejob.db.mysqlclient import MysqlClient

mysql = MysqlClient()


def create_yesterday_excle():
    yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    res = mysql.get_partjob_list(yesterday, today)
    w = Workbook()         # create a workbook
    ws = w.add_sheet('1')  # create a worksheet
    col = 0
    ws.write(col, 0, '任务编号')
    ws.write(col, 1, '平台')
    ws.write(col, 2, '工作类型')
    ws.write(col, 3, '任务名')
    ws.write(col, 4, '报酬')
    ws.write(col, 5, '发布时间')
    ws.write(col, 6, '任务链接')
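    # A minimal sketch of how this export could be finished, assuming
    # get_partjob_list() returns rows whose columns already match the header
    # order above (task id, platform, job type, title, pay, publish time, link);
    # the row loop and the output file name are illustrative assumptions.
    for row_idx, row in enumerate(res, start=1):
        for col_idx, value in enumerate(row[:7]):
            ws.write(row_idx, col_idx, value)
    w.save('partjob-' + yesterday + '.xls')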