def main(wb):
    """Scrape ten listing pages of 开源中国 (zb.oschina.net) into a new worksheet.

    Creates the sheet '开源中国' at index 1 of ``wb``, writes a header row and
    then one row per project whose detail page could be fetched.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).

    Notes:
        ``get_id`` and ``get_one_page`` are used as returning either a
        success value (list / str) or a tuple whose element ``[1]`` is an
        error description; any other return type is ignored.
    """
    print('开始爬取开源中国订单')
    sheet = wb.create_sheet('开源中国', 1)
    # Header: record no. / order description / link / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    count = 1
    # The page number is substituted into the trailing ``{}``; without the
    # placeholder every iteration would request the same URL.
    list_url = (
        'https://zb.oschina.net/project/contractor-browse-project-and-reward'
        '?applicationAreas=&moneyMinByYuan=&moneyMaxByYuan=&sortBy=30'
        '&currentTime=&pageSize=20&currentPage={}'
    )
    for page in range(1, 11):
        id_list = get_id(list_url.format(page))
        if isinstance(id_list, tuple):
            # Listing failure: report it and move on to the next page.
            message = '开源中国爬取出错:%s' % id_list[1]
            print(message)
            send_message(message)
            continue
        if not isinstance(id_list, list):
            continue
        for project_id in id_list:
            url = 'https://zb.oschina.net/project/detail?id=%s' % project_id
            desc = get_one_page(url)
            if isinstance(desc, str):
                desc = html2text.html2text(desc).strip()
                contact = get_contact(desc)
                sheet.append([count, desc, url, contact])
                count += 1
            elif isinstance(desc, tuple):
                # Detail failures are only printed, not forwarded.
                print('开源中国详情爬取出错:%s' % desc[1])
    print('结束爬取开源中国订单')
def main(wb, session, OrderModel, WebsiteModel):
    """Scrape 人人开发 (rrkf.com) pages 10 down to 1 into a sheet and the order table.

    For every listed request the detail page is fetched, the order is
    inserted or updated through ``session``, and currently valid orders are
    appended to the '人人开发' worksheet. The session is committed once per
    listing page.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).
        session: Database session used via ``query(...).get(...)``, ``add``
            and ``commit`` (presumably SQLAlchemy -- confirm).
        OrderModel: Mapped order class; looked up by primary key and
            constructed with id/desc/link/contact/category/pub_time/
            is_valid/is_delete keyword arguments.
        WebsiteModel: Mapped website class; row 3 is attached to new orders.

    Notes:
        ``get_info`` and ``get_detail`` are used as returning a list on
        success; on failure their elements ``[0]`` and ``[1]`` are formatted
        as a line number and an error description.
    """
    print('开始爬取人人开发订单')
    sheet = wb.create_sheet('人人开发', 2)
    # Header: record no. / description / link / publish time / contact / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    count = 1
    website = session.query(WebsiteModel).get(3)
    for page in range(10, 0, -1):
        url = 'http://www.rrkf.com/serv/request?&currentPage=%d' % page
        info_list = get_info(url)
        if isinstance(info_list, tuple):
            message = '人人开发爬取第%d行出错:%s' % (info_list[0], info_list[1])
            print(message)
            send_message(message)
            continue
        if not isinstance(info_list, list):
            continue
        for info in info_list:
            desc = info['desc']
            link = info['link']
            details = get_detail(link)
            if not isinstance(details, list):
                message = '人人开发详情爬取第%d行出错:%s' % (details[0], details[1])
                print(message)
                send_message(message)
                continue
            # The order key is derived from the value after '=' in the link.
            rid = 'rr-{}'.format(link.split('=')[1])
            contact = get_contact(desc)
            is_valid = '剩余' in details[0]
            pub_time = (
                datetime.strptime(details[1], "%Y-%m-%d %H:%M:%S")
                if details[1] else None
            )
            existing = session.query(OrderModel).get(rid)
            if existing:
                was_valid = existing.is_valid
                existing.is_valid = is_valid
                if is_valid:
                    sheet.append([count, desc, link, pub_time, contact, ''])
                    count += 1
                    # Explicit comparisons kept: a stored value of None
                    # satisfies neither ``== False`` nor ``== True``.
                    if was_valid == False:  # noqa: E712
                        existing.is_delete = False
                if was_valid == True and not is_valid:  # noqa: E712
                    existing.is_delete = True
            else:
                order = OrderModel(
                    id=rid,
                    desc=desc,
                    link=link,
                    contact=contact,
                    category='',
                    pub_time=pub_time,
                    is_valid=is_valid,
                    is_delete=not is_valid,
                )
                order.website = website
                session.add(order)
                if is_valid:
                    sheet.append([count, desc, link, pub_time, contact, ''])
                    count += 1
        session.commit()
    print('结束爬取人人开发订单')
def main(wb, session, OrderModel, WebsiteModel):
    """Scrape 码市 (codemart.com) API pages 10 down to 1 into a sheet and the order table.

    Renames the workbook's 'Sheet' worksheet to '码市', then for every
    project published at or after the module-level ``time_point`` inserts or
    updates the order through ``session`` and appends currently recruiting
    projects to the worksheet. The session is committed once per page.

    Args:
        wb: Workbook object indexable by sheet name (presumably an openpyxl
            ``Workbook`` -- confirm against caller).
        session: Database session used via ``query(...).get(...)``, ``add``
            and ``commit`` (presumably SQLAlchemy -- confirm).
        OrderModel: Mapped order class, looked up by primary key.
        WebsiteModel: Mapped website class; row 1 is attached to new orders.
    """
    print('开始爬取码市订单')
    sheet = wb['Sheet']
    sheet.title = '码市'
    # Header: record no. / description / link / publish time / contact / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    row_no = 1
    website = session.query(WebsiteModel).get(1)
    for page in range(10, 0, -1):
        page_url = 'https://codemart.com/api/project?page=%d' % page
        projects = get_one_page(page_url)
        if isinstance(projects, tuple):
            message = '码市爬取第%d行出错:%s' % (projects[0], projects[1])
            print(message)
            send_message(message)
            continue
        if not isinstance(projects, list):
            continue
        for project in projects:
            # 'pubtime' is divided by 1000 before being read as a timestamp.
            publish_time = datetime.fromtimestamp(int(project['pubtime']) / 1000)
            if publish_time < time_point:
                continue
            desc = ILLEGAL_CHARACTERS_RE.sub('', project['description'])
            cid = 'cm-{}'.format(project['id'])
            contact = get_contact(desc)
            link = 'https://codemart.com/project/{}'.format(project['id'])
            is_valid = project['status'] == '招募中'
            existing = session.query(OrderModel).get(cid)
            if not existing:
                order = OrderModel(
                    id=cid,
                    desc=desc,
                    link=link,
                    contact=contact,
                    category=project['cate'],
                    pub_time=publish_time,
                    is_valid=is_valid,
                    is_delete=not is_valid,
                )
                order.website = website
                session.add(order)
                if is_valid:
                    sheet.append([row_no, desc, link, publish_time, contact, ''])
                    row_no += 1
                continue
            was_valid = existing.is_valid
            existing.is_valid = is_valid
            if is_valid:
                sheet.append([row_no, desc, link, publish_time, contact, ''])
                row_no += 1
                # Explicit comparisons kept: a stored value of None satisfies
                # neither ``== False`` nor ``== True``.
                if was_valid == False:  # noqa: E712
                    existing.is_delete = False
            if was_valid == True and not is_valid:  # noqa: E712
                existing.is_delete = True
        session.commit()
    print('结束爬取码市订单')
def main(wb, session, OrderModel, WebsiteModel):
    """Scrape 实现 (shixian.com) pages 10 down to 1 into a sheet and the order table.

    For each listed job the category is fetched via ``get_category``; the
    order is then inserted or updated through ``session`` and, while
    ``datetime.now()`` is not past the parsed 'start_time', appended to the
    '实现' worksheet. A short random pause follows every job and the session
    is committed once per page.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).
        session: Database session used via ``query(...).get(...)``, ``add``
            and ``commit`` (presumably SQLAlchemy -- confirm).
        OrderModel: Mapped order class, looked up by primary key.
        WebsiteModel: Mapped website class; row 4 is attached to new orders.
    """
    print('开始爬取实现订单')
    sheet = wb.create_sheet('实现', 3)
    # Header: record no. / description / link / publish time / contact / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    row_no = 1
    website = session.query(WebsiteModel).get(4)
    for page in range(10, 0, -1):
        page_url = 'https://shixian.com/job/all?page=%d&sort_arrow=down' % page
        jobs = get_info(page_url)
        if isinstance(jobs, tuple):
            message = '实现爬取第%d行出错:%s' % (jobs[0], jobs[1])
            print(message)
            send_message(message)
            continue
        if not isinstance(jobs, list):
            continue
        for job in jobs:
            desc = job['desc']
            link = job['link']
            contact = get_contact(desc)
            deadline = datetime.strptime(job['start_time'], "%Y-%m-%d %H:%M:%S")
            is_valid = datetime.now() <= deadline
            # The order key is the last path segment of the link.
            sid = 'sx-' + link.split('/')[-1]
            cate = get_category(link)
            if not isinstance(cate, str):
                message = '实现详情爬取第%d行出错:%s' % (cate[0], cate[1])
                print(message)
                send_message(message)
            else:
                existing = session.query(OrderModel).get(sid)
                if existing:
                    was_valid = existing.is_valid
                    existing.is_valid = is_valid
                    if is_valid:
                        sheet.append([row_no, desc, link, '', contact, ''])
                        row_no += 1
                        # Explicit comparisons kept: a stored value of None
                        # satisfies neither ``== False`` nor ``== True``.
                        if was_valid == False:  # noqa: E712
                            existing.is_delete = False
                    if was_valid == True and not is_valid:  # noqa: E712
                        existing.is_delete = True
                else:
                    order = OrderModel(
                        id=sid,
                        desc=desc,
                        link=link,
                        contact=contact,
                        category=cate,
                        pub_time=None,
                        is_valid=is_valid,
                        is_delete=not is_valid,
                    )
                    order.website = website
                    session.add(order)
                    if is_valid:
                        sheet.append([row_no, desc, link, '', contact, ''])
                        row_no += 1
            # Pause for under 0.1 s between jobs.
            time.sleep(random.random()/10)
        session.commit()
    print('结束爬取实现订单')
def main(wb, session, OrderModel, WebsiteModel):
    """Scrape 猿急送 (yuanjisong.com) pages 10 down to 1 into a sheet and the order table.

    Each listed job is inserted or updated through ``session``; jobs whose
    status is '投递职位' are treated as valid and appended to the '猿急送'
    worksheet. The session is committed once per page.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).
        session: Database session used via ``query(...).get(...)``, ``add``
            and ``commit`` (presumably SQLAlchemy -- confirm).
        OrderModel: Mapped order class, looked up by primary key.
        WebsiteModel: Mapped website class; row 6 is attached to new orders.
    """
    print('开始爬取猿急送订单')
    sheet = wb.create_sheet('猿急送', 5)
    # Header: record no. / description / link / publish time / contact / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    row_no = 1
    website = session.query(WebsiteModel).get(6)
    for page in range(10, 0, -1):
        page_url = 'https://www.yuanjisong.com/job/allcity/page%d' % page
        jobs = get_info(page_url)
        if isinstance(jobs, tuple):
            message = '猿急送爬取第%d行出错:%s' % (jobs[0], jobs[1])
            print(message)
            send_message(message)
            continue
        if not isinstance(jobs, list):
            continue
        for job in jobs:
            desc = job['desc']
            link = job['link']
            contact = get_contact(desc)
            is_valid = job['status'] == '投递职位'
            # The last path segment must parse as an integer job number.
            yid = 'yj-{}'.format(int(link.split('/')[-1]))
            existing = session.query(OrderModel).get(yid)
            if not existing:
                order = OrderModel(
                    id=yid,
                    desc=desc,
                    link=link,
                    contact=contact,
                    category='',
                    pub_time=None,
                    is_valid=is_valid,
                    is_delete=not is_valid,
                )
                order.website = website
                session.add(order)
                if is_valid:
                    sheet.append([row_no, desc, link, '', contact, ''])
                    row_no += 1
                continue
            was_valid = existing.is_valid
            existing.is_valid = is_valid
            if is_valid:
                sheet.append([row_no, desc, link, '', contact, ''])
                row_no += 1
                # Explicit comparisons kept: a stored value of None satisfies
                # neither ``== False`` nor ``== True``.
                if was_valid == False:  # noqa: E712
                    existing.is_delete = False
            if was_valid == True and not is_valid:  # noqa: E712
                existing.is_delete = True
        session.commit()
    print('结束爬取猿急送订单')
def main(wb):
    """Scrape the first ten 51外包 listing pages into a new '51外包' worksheet.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).

    Notes:
        ``get_info`` is used as returning a list of dicts with 'desc' and
        'link' keys on success, or a tuple whose element ``[1]`` is an error
        description; any other return type is ignored.
    """
    print('开始爬取51外包订单')
    sheet = wb.create_sheet('51外包', 3)
    # Header: record no. / order description / link / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    row_no = 1
    for page in range(1, 11):
        page_url = 'http://www.51waibao.net/Project.html?page={}'.format(page)
        listing = get_info(page_url)
        if isinstance(listing, tuple):
            message = '51外包爬取出错:%s' % listing[1]
            print(message)
            send_message(message)
            continue
        if not isinstance(listing, list):
            continue
        for entry in listing:
            description = entry['desc']
            contact = get_contact(description)
            sheet.append([row_no, description, entry['link'], contact])
            row_no += 1
    print('结束爬取51外包订单')
def main(wb):
    """Scrape the first ten 猿急送 listing pages into a new '猿急送' worksheet.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).

    Notes:
        ``get_info`` is used as returning a list of dicts with 'desc' and
        'link' keys on success, or a tuple whose element ``[1]`` is an error
        description; any other return type is ignored.
    """
    print('开始爬取猿急送订单')
    sheet = wb.create_sheet('猿急送', 4)
    # Header: record no. / order description / link / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    row_no = 1
    for page in range(1, 11):
        page_url = 'https://www.yuanjisong.com/job/allcity/page{}'.format(page)
        listing = get_info(page_url)
        if isinstance(listing, tuple):
            message = '猿急送爬取出错:%s' % listing[1]
            print(message)
            send_message(message)
            continue
        if not isinstance(listing, list):
            continue
        for entry in listing:
            description = entry['desc']
            contact = get_contact(description)
            sheet.append([row_no, description, entry['link'], contact])
            row_no += 1
    print('结束爬取猿急送订单')
def main(wb):
    """Scrape the first ten 实现 listing pages into a new '实现' worksheet.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).

    Notes:
        ``get_info`` is used as returning a list of dicts with 'desc' and
        'link' keys on success, or a tuple whose element ``[1]`` is an error
        description; any other return type is ignored.
    """
    print('开始爬取实现订单')
    sheet = wb.create_sheet('实现', 5)
    # Header: record no. / order description / link / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    row_no = 1
    for page in range(1, 11):
        page_url = 'https://shixian.com/job/all?page={}&sort_arrow=down'.format(page)
        listing = get_info(page_url)
        if isinstance(listing, tuple):
            message = '实现爬取出错:%s' % listing[1]
            print(message)
            send_message(message)
            continue
        if not isinstance(listing, list):
            continue
        for entry in listing:
            description = entry['desc']
            contact = get_contact(description)
            sheet.append([row_no, description, entry['link'], contact])
            row_no += 1
    print('结束爬取实现订单')
def main(wb):
    """Scrape the first ten 人人开发 listing pages into a new '人人开发' worksheet.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).

    Notes:
        ``get_info`` is used as returning a list of dicts with 'desc' and
        'link' keys on success, or a tuple whose element ``[1]`` is an error
        description; any other return type is ignored.
    """
    print('开始爬取人人开发订单')
    sheet = wb.create_sheet('人人开发', 2)
    # Header: record no. / order description / link / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    count = 1
    for page in range(1, 11):
        # The paging parameter is spelled out as ``currentPage``; the garbled
        # form ``¤tPage`` is not a usable query parameter name.
        url = 'http://www.rrkf.com/serv/request?&currentPage={}'.format(page)
        info_list = get_info(url)
        if isinstance(info_list, tuple):
            message = '人人开发爬取出错:%s' % info_list[1]
            print(message)
            send_message(message)
            continue
        if not isinstance(info_list, list):
            continue
        for info in info_list:
            desc = info['desc']
            contact = get_contact(desc)
            sheet.append([count, desc, info['link'], contact])
            count += 1
    print('结束爬取人人开发订单')
def main(wb):
    """Scrape the first ten 码市 API pages into the workbook's default sheet.

    The worksheet named 'Sheet' is renamed to '码市' and receives a header
    row followed by one row per returned project.

    Args:
        wb: Workbook object indexable by sheet name (presumably an openpyxl
            ``Workbook`` -- confirm against caller).

    Notes:
        ``get_one_page`` receives the URL plus the time at which this run
        started, and is used as returning a list of project dicts on success
        or a tuple whose element ``[1]`` is an error description.
    """
    print('开始爬取码市订单')
    started_at = time.time()
    sheet = wb['Sheet']
    sheet.title = '码市'
    # Header: record no. / order description / link / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    row_no = 1
    for page in range(1, 11):
        page_url = 'https://codemart.com/api/project?page={}'.format(page)
        projects = get_one_page(page_url, started_at)
        if isinstance(projects, tuple):
            message = '码市爬取出错:%s' % projects[1]
            print(message)
            send_message(message)
            continue
        if not isinstance(projects, list):
            continue
        for project in projects:
            description = ILLEGAL_CHARACTERS_RE.sub('', project['description'])
            contact = get_contact(description)
            project_link = 'https://codemart.com/project/{}'.format(project['id'])
            sheet.append([row_no, description, project_link, contact])
            row_no += 1
    print('结束爬取码市订单')
def main(wb, session, OrdrModel, WebsiteModel):
    """Scrape 51外包 pages 10 down to 1 into a sheet and the order table.

    For every project link the detail page is fetched; projects published at
    or after the module-level ``time_point`` are inserted or updated through
    ``session`` and, unless marked expired, appended to the '51外包'
    worksheet. A short random pause follows every processed link and the
    session is committed once per listing page.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).
        session: Database session used via ``query(...).get(...)``, ``add``
            and ``commit`` (presumably SQLAlchemy -- confirm).
        OrdrModel: Mapped order class (parameter spelling kept as-is so
            keyword callers keep working).
        WebsiteModel: Mapped website class; row 5 is attached to new orders.

    Notes:
        ``get_detail`` is used as returning a list laid out as
        ``[id, category, status text, publish time string, description]`` on
        success; otherwise elements ``[0]`` and ``[1]`` are formatted as a
        line number and an error description.
    """
    print('开始爬取51外包订单')
    sheet = wb.create_sheet('51外包', 4)
    # Six headers so the header row lines up with the six-column data rows:
    # record no. / description / link / publish time / contact / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    count = 1
    website = session.query(WebsiteModel).get(5)
    for page in range(10, 0, -1):
        url = 'http://www.51waibao.net/Project.html?page=%d' % page
        link_list = get_links(url)
        if isinstance(link_list, tuple):
            message = '51外包爬取第%d行出错:%s' % (link_list[0], link_list[1])
            print(message)
            send_message(message)
            continue
        if not isinstance(link_list, list):
            continue
        for link in link_list:
            result = get_detail(link)
            if isinstance(result, list):
                publish_time = datetime.strptime(result[3], "%Y-%m-%d %H:%M:%S")
                if publish_time < time_point:
                    # Too old: skipped entirely, including the pause below.
                    continue
                desc = result[4]
                contact = get_contact(desc)
                wid = 'wy-' + result[0]
                is_valid = '项目已过期' not in result[2]
                existing = session.query(OrdrModel).get(wid)
                if existing:
                    was_valid = existing.is_valid
                    existing.is_valid = is_valid
                    if is_valid:
                        sheet.append(
                            [count, desc, link, publish_time, contact, ''])
                        count += 1
                        # Explicit comparisons kept: a stored value of None
                        # satisfies neither ``== False`` nor ``== True``.
                        if was_valid == False:  # noqa: E712
                            existing.is_delete = False
                    if was_valid == True and not is_valid:  # noqa: E712
                        existing.is_delete = True
                else:
                    order = OrdrModel(
                        id=wid,
                        desc=desc,
                        link=link,
                        contact=contact,
                        category=result[1],
                        pub_time=publish_time,
                        is_valid=is_valid,
                        is_delete=not is_valid,
                    )
                    order.website = website
                    session.add(order)
                    if is_valid:
                        # Write the order's data row (not a second header row).
                        sheet.append(
                            [count, desc, link, publish_time, contact, ''])
                        count += 1
            else:
                message = '51外包详情爬取第%d行出错:%s' % (result[0], result[1])
                print(message)
                send_message(message)
            # Pause for under 0.1 s between detail requests.
            time.sleep(random.random() / 10)
        session.commit()
    print('结束爬取51外包订单')
def main(wb, session, OrderModel, WebsiteModel):
    """Scrape 开源中国 (zb.oschina.net) pages 10 down to 1 into a sheet and the order table.

    Each listed (id, type) pair is resolved to a reward or project detail
    page. Entries published at or after the module-level ``time_point`` are
    inserted or updated through ``session``, and entries whose status code is
    3 are appended to the '开源中国' worksheet. The session is committed once
    per listing page.

    Args:
        wb: Workbook object exposing ``create_sheet(title, index)``
            (presumably an openpyxl ``Workbook`` -- confirm against caller).
        session: Database session used via ``query(...)``, ``add`` and
            ``commit`` (presumably SQLAlchemy -- confirm).
        OrderModel: Mapped order class; existing rows are matched on
            ``desc`` and ``pub_time`` rather than by primary key.
        WebsiteModel: Mapped website class; row 2 is attached to new orders.

    Notes:
        ``get_one_page`` is used as returning a list laid out as
        ``[description html, status code, category, publish time]`` on
        success, or a tuple of (line number, error description).
    """
    print('开始爬取开源中国订单')
    sheet = wb.create_sheet('开源中国', 1)
    # Header: record no. / description / link / publish time / contact / assignee.
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    count = 1
    website = session.query(WebsiteModel).get(2)
    # Query parameters are spelled out as ``currentTime`` / ``currentPage``;
    # the garbled ``¤t...`` forms are not usable parameter names.
    list_url = (
        'https://zb.oschina.net/project/contractor-browse-project-and-reward'
        '?applicationAreas=&moneyMinByYuan=&moneyMaxByYuan=&sortBy=30'
        '&currentTime=&pageSize=20&currentPage=%d'
    )
    for page in range(10, 0, -1):
        id_list = get_id(list_url % page)
        if isinstance(id_list, tuple):
            message = '开源中国爬取第%d行出错:%s' % (id_list[0], id_list[1])
            print(message)
            send_message(message)
            continue
        if not isinstance(id_list, list):
            continue
        for oid, otype in id_list:
            # Type 2 entries live under /reward/, everything else under /project/.
            if otype == 2:
                detail_url = 'https://zb.oschina.net/reward/detail?id=%d' % oid
                link = 'https://zb.oschina.net/reward/detail.html?id=%s' % oid
            else:
                detail_url = 'https://zb.oschina.net/project/detail?id=%s' % oid
                link = 'https://zb.oschina.net/project/detail.html?id=%s' % oid
            result = get_one_page(detail_url)
            if isinstance(result, tuple):
                message = '开源中国详情爬取第%d行出错:%s' % (result[0], result[1])
                print(message)
                send_message(message)
                continue
            if not isinstance(result, list):
                continue
            publish_time = result[3]
            if publish_time < time_point:
                continue
            desc = html2text.html2text(result[0]).strip()
            is_valid = result[1] == 3
            contact = get_contact(desc)
            # Stored key is the listing id floor-divided by 10, prefixed 'oc-'.
            order_id = 'oc-{}'.format(oid // 10)
            existing = session.query(OrderModel).filter_by(
                desc=desc, pub_time=publish_time).first()
            if existing:
                was_valid = existing.is_valid
                existing.is_valid = is_valid
                if is_valid:
                    sheet.append([count, desc, link, publish_time, contact, ''])
                    count += 1
                    # Explicit comparisons kept: a stored value of None
                    # satisfies neither ``== False`` nor ``== True``.
                    if was_valid == False:  # noqa: E712
                        existing.is_delete = False
                if was_valid == True and not is_valid:  # noqa: E712
                    existing.is_delete = True
            else:
                order = OrderModel(
                    id=order_id,
                    desc=desc,
                    link=link,
                    contact=contact,
                    category=result[2],
                    pub_time=publish_time,
                    is_valid=is_valid,
                    is_delete=not is_valid,
                )
                order.website = website
                session.add(order)
                if is_valid:
                    sheet.append([count, desc, link, publish_time, contact, ''])
                    count += 1
        session.commit()
    print('结束爬取开源中国订单')