Exemple #1
0
def main(wb):
    """Scrape OSChina project listings (pages 1-10) into a new worksheet.

    A sheet titled '开源中国' is created at index 1 of ``wb`` and filled with
    a header row followed by one row per project whose detail page could be
    fetched.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.

    Notes:
        ``get_id`` is expected to return either a list of project ids or a
        tuple whose second element is an error message; ``get_one_page`` is
        expected to return either the description HTML as ``str`` or such an
        error tuple.  Any other return type is silently ignored, as before.
    """
    print('开始爬取开源中国订单')
    sheet = wb.create_sheet('开源中国', 1)
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    # The template previously lacked a '{}' placeholder, so ``.format`` was a
    # no-op and the page number never reached the request URL.
    listing_url = 'https://zb.oschina.net/project/contractor-browse-project-and-reward?applicationAreas=&moneyMinByYuan=&moneyMaxByYuan=&sortBy=30&currentTime=&pageSize=20&currentPage={}'
    count = 1
    for page in range(1, 11):
        id_list = get_id(listing_url.format(page))
        if isinstance(id_list, list):
            for project_id in id_list:
                url = 'https://zb.oschina.net/project/detail?id=%s' % project_id
                desc = get_one_page(url)
                if isinstance(desc, str):
                    # Convert the HTML description to plain text before
                    # extracting contact details and writing the row.
                    desc = html2text.html2text(desc).strip()
                    contact = get_contact(desc)
                    sheet.append([count, desc, url, contact])
                    count += 1
                elif isinstance(desc, tuple):
                    # Detail failures are only logged, not pushed via
                    # send_message.
                    print('开源中国详情爬取出错:%s' % desc[1])
        elif isinstance(id_list, tuple):
            message = '开源中国爬取出错:%s' % id_list[1]
            print(message)
            send_message(message)
    print('结束爬取开源中国订单')
def main(wb, session, OrderModel, WebsiteModel):
    """Crawl rrkf.com (人人开发) requests into a worksheet and the order table.

    Listing pages are visited from 10 down to 1.  For every entry the detail
    page is fetched; the order is then either updated (when a row with the
    same ``rr-<id>`` key already exists) or inserted.  Orders that are
    currently valid are also written to the '人人开发' sheet.  The session
    is committed once per successfully fetched listing page.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.
        session: Database session exposing ``query``, ``add`` and ``commit``.
        OrderModel: Mapped order class; looked up by id and instantiated.
        WebsiteModel: Mapped website class; row 3 is attached to new orders.
    """
    print('开始爬取人人开发订单')
    sheet = wb.create_sheet('人人开发', 2)
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    row_no = 1
    website = session.query(WebsiteModel).get(3)
    for page in range(10, 0, -1):
        listing = get_info('http://www.rrkf.com/serv/request?&currentPage=%d' % page)
        if not isinstance(listing, list):
            # A tuple signals a failure as (line number, error text).
            if isinstance(listing, tuple):
                message = '人人开发爬取第%d行出错:%s' % (listing[0], listing[1])
                print(message)
                send_message(message)
            continue

        for entry in listing:
            desc = entry['desc']
            link = entry['link']
            details = get_detail(link)
            if not isinstance(details, list):
                message = '人人开发详情爬取第%d行出错:%s' % (details[0], details[1])
                print(message)
                send_message(message)
                continue

            rid = 'rr-{}'.format(link.split('=')[1])
            contact = get_contact(desc)
            is_valid = '剩余' in details[0]
            if details[1]:
                pub_time = datetime.strptime(details[1], "%Y-%m-%d %H:%M:%S")
            else:
                pub_time = None

            stored = session.query(OrderModel).get(rid)
            if stored:
                previously_valid = stored.is_valid
                stored.is_valid = is_valid
                if is_valid:
                    sheet.append([row_no, desc, link, pub_time, contact, ''])
                    row_no += 1
                    # Explicit comparisons are kept so that a stored flag of
                    # None matches neither transition.
                    if previously_valid == False:
                        stored.is_delete = False
                elif previously_valid == True:
                    stored.is_delete = True
            else:
                fresh = OrderModel(
                    id=rid,
                    desc=desc,
                    link=link,
                    contact=contact,
                    category='',
                    pub_time=pub_time,
                    is_valid=is_valid,
                    is_delete=not is_valid)
                fresh.website = website
                session.add(fresh)
                if is_valid:
                    sheet.append([row_no, desc, link, pub_time, contact, ''])
                    row_no += 1
        session.commit()
    print('结束爬取人人开发订单')
def main(wb, session, OrderModel, WebsiteModel):
    """Crawl codemart.com (码市) projects into a worksheet and the order table.

    The default 'Sheet' worksheet is renamed to '码市' and reused.  API pages
    are visited from 10 down to 1; projects published before the externally
    defined ``time_point`` are skipped.  Each remaining project is updated
    or inserted under the key ``cm-<id>``, and currently valid ones are
    written to the sheet.  The session is committed once per successfully
    fetched page.

    Args:
        wb: Workbook-like object that already contains a sheet named 'Sheet'.
        session: Database session exposing ``query``, ``add`` and ``commit``.
        OrderModel: Mapped order class; looked up by id and instantiated.
        WebsiteModel: Mapped website class; row 1 is attached to new orders.
    """
    print('开始爬取码市订单')
    sheet = wb['Sheet']
    sheet.title = '码市'
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    row_no = 1
    website = session.query(WebsiteModel).get(1)
    for page in range(10, 0, -1):
        result = get_one_page('https://codemart.com/api/project?page=%d' % page)
        if not isinstance(result, list):
            # A tuple signals a failure as (line number, error text).
            if isinstance(result, tuple):
                message = '码市爬取第%d行出错:%s' % (result[0], result[1])
                print(message)
                send_message(message)
            continue

        for project in result:
            # 'pubtime' is divided by 1000 before conversion, i.e. it is
            # handled as a millisecond timestamp.
            publish_time = datetime.fromtimestamp(int(project['pubtime']) / 1000)
            if publish_time < time_point:
                continue

            # Strip the characters matched by ILLEGAL_CHARACTERS_RE before
            # the text is written to the sheet.
            desc = ILLEGAL_CHARACTERS_RE.sub(r'', project['description'])
            project_id = project['id']
            cid = 'cm-{}'.format(project_id)
            contact = get_contact(desc)
            link = 'https://codemart.com/project/{}'.format(project_id)
            is_valid = project['status'] == '招募中'

            stored = session.query(OrderModel).get(cid)
            if stored:
                previously_valid = stored.is_valid
                stored.is_valid = is_valid
                if is_valid:
                    sheet.append([row_no, desc, link, publish_time, contact, ''])
                    row_no += 1
                    # Explicit comparisons are kept so that a stored flag of
                    # None matches neither transition.
                    if previously_valid == False:
                        stored.is_delete = False
                elif previously_valid == True:
                    stored.is_delete = True
            else:
                fresh = OrderModel(
                    id=cid,
                    desc=desc,
                    link=link,
                    contact=contact,
                    category=project['cate'],
                    pub_time=publish_time,
                    is_valid=is_valid,
                    is_delete=not is_valid,
                )
                fresh.website = website
                session.add(fresh)
                if is_valid:
                    sheet.append([row_no, desc, link, publish_time, contact, ''])
                    row_no += 1
        session.commit()
    print('结束爬取码市订单')
Exemple #4
0
def main(wb, session, OrderModel, WebsiteModel):
    """Crawl shixian.com (实现) jobs into a worksheet and the order table.

    Listing pages are visited from 10 down to 1.  A job counts as valid while
    the current time has not passed the parsed ``start_time`` field.  When
    the category lookup succeeds, the order is updated or inserted under the
    key ``sx-<last path segment of the link>`` and valid ones are written to
    the '实现' sheet.  A short random pause follows every entry, and the
    session is committed once per successfully fetched page.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.
        session: Database session exposing ``query``, ``add`` and ``commit``.
        OrderModel: Mapped order class; looked up by id and instantiated.
        WebsiteModel: Mapped website class; row 4 is attached to new orders.
    """
    print('开始爬取实现订单')
    sheet = wb.create_sheet('实现', 3)
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    row_no = 1
    website = session.query(WebsiteModel).get(4)
    for page in range(10, 0, -1):
        listing = get_info('https://shixian.com/job/all?page=%d&sort_arrow=down' % page)
        if not isinstance(listing, list):
            # A tuple signals a failure as (line number, error text).
            if isinstance(listing, tuple):
                message = '实现爬取第%d行出错:%s' % (listing[0], listing[1])
                print(message)
                send_message(message)
            continue

        for entry in listing:
            desc = entry['desc']
            link = entry['link']
            contact = get_contact(desc)
            deadline = datetime.strptime(entry['start_time'], "%Y-%m-%d %H:%M:%S")
            is_valid = datetime.now() <= deadline
            sid = 'sx-' + link.rsplit('/', 1)[-1]
            cate = get_category(link)
            if not isinstance(cate, str):
                # Anything other than a string is treated as an error pair.
                message = '实现详情爬取第%d行出错:%s' % (cate[0], cate[1])
                print(message)
                send_message(message)
            else:
                stored = session.query(OrderModel).get(sid)
                if stored:
                    previously_valid = stored.is_valid
                    stored.is_valid = is_valid
                    if is_valid:
                        sheet.append([row_no, desc, link, '', contact, ''])
                        row_no += 1
                        # Explicit comparisons are kept so that a stored flag
                        # of None matches neither transition.
                        if previously_valid == False:
                            stored.is_delete = False
                    elif previously_valid == True:
                        stored.is_delete = True
                else:
                    fresh = OrderModel(
                        id=sid,
                        desc=desc,
                        link=link,
                        contact=contact,
                        category=cate,
                        pub_time=None,
                        is_valid=is_valid,
                        is_delete=not is_valid,
                    )
                    fresh.website = website
                    session.add(fresh)
                    if is_valid:
                        sheet.append([row_no, desc, link, '', contact, ''])
                        row_no += 1
            # Pause for up to 0.1 s after every entry, success or failure.
            time.sleep(random.random() / 10)
        session.commit()
    print('结束爬取实现订单')
Exemple #5
0
def main(wb, session, OrderModel, WebsiteModel):
    """Crawl yuanjisong.com (猿急送) jobs into a worksheet and the order table.

    Listing pages are visited from 10 down to 1.  Each job is updated or
    inserted under the key ``yj-<numeric last path segment of the link>``;
    jobs whose status equals '投递职位' are treated as valid and written to
    the '猿急送' sheet.  The session is committed once per successfully
    fetched page.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.
        session: Database session exposing ``query``, ``add`` and ``commit``.
        OrderModel: Mapped order class; looked up by id and instantiated.
        WebsiteModel: Mapped website class; row 6 is attached to new orders.
    """
    print('开始爬取猿急送订单')
    sheet = wb.create_sheet('猿急送', 5)
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    row_no = 1
    website = session.query(WebsiteModel).get(6)
    for page in range(10, 0, -1):
        listing = get_info('https://www.yuanjisong.com/job/allcity/page%d' % page)
        if not isinstance(listing, list):
            # A tuple signals a failure as (line number, error text).
            if isinstance(listing, tuple):
                message = '猿急送爬取第%d行出错:%s' % (listing[0], listing[1])
                print(message)
                send_message(message)
            continue

        for entry in listing:
            desc = entry['desc']
            link = entry['link']
            contact = get_contact(desc)
            is_valid = entry['status'] == '投递职位'
            # int() normalises the trailing path segment and raises if it is
            # not numeric.
            yid = 'yj-{}'.format(int(link.split('/')[-1]))

            stored = session.query(OrderModel).get(yid)
            if stored:
                previously_valid = stored.is_valid
                stored.is_valid = is_valid
                if is_valid:
                    sheet.append([row_no, desc, link, '', contact, ''])
                    row_no += 1
                    # Explicit comparisons are kept so that a stored flag of
                    # None matches neither transition.
                    if previously_valid == False:
                        stored.is_delete = False
                elif previously_valid == True:
                    stored.is_delete = True
            else:
                fresh = OrderModel(
                    id=yid,
                    desc=desc,
                    link=link,
                    contact=contact,
                    category='',
                    pub_time=None,
                    is_valid=is_valid,
                    is_delete=not is_valid,
                )
                fresh.website = website
                session.add(fresh)
                if is_valid:
                    sheet.append([row_no, desc, link, '', contact, ''])
                    row_no += 1
        session.commit()
    print('结束爬取猿急送订单')
def main(wb):
    """Scrape 51waibao.net project listings (pages 1-10) into a worksheet.

    Creates a sheet titled '51外包' at index 3 of ``wb`` and appends one row
    per listed project: running number, description, link, contact.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.
    """
    print('开始爬取51外包订单')
    sheet = wb.create_sheet('51外包', 3)
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    row_no = 1
    for page in range(1, 11):
        listing = get_info('http://www.51waibao.net/Project.html?page={}'.format(page))
        if not isinstance(listing, list):
            # A tuple carries the error text in its second element.
            if isinstance(listing, tuple):
                message = '51外包爬取出错:%s' % listing[1]
                print(message)
                send_message(message)
            continue
        for entry in listing:
            desc = entry['desc']
            contact = get_contact(desc)
            sheet.append([row_no, desc, entry['link'], contact])
            row_no += 1
    print('结束爬取51外包订单')
Exemple #7
0
def main(wb):
    """Scrape yuanjisong.com job listings (pages 1-10) into a worksheet.

    Creates a sheet titled '猿急送' at index 4 of ``wb`` and appends one row
    per listed job: running number, description, link, contact.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.
    """
    print('开始爬取猿急送订单')
    sheet = wb.create_sheet('猿急送', 4)
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    row_no = 1
    for page in range(1, 11):
        listing = get_info('https://www.yuanjisong.com/job/allcity/page{}'.format(page))
        if not isinstance(listing, list):
            # A tuple carries the error text in its second element.
            if isinstance(listing, tuple):
                message = '猿急送爬取出错:%s' % listing[1]
                print(message)
                send_message(message)
            continue
        for entry in listing:
            desc = entry['desc']
            contact = get_contact(desc)
            sheet.append([row_no, desc, entry['link'], contact])
            row_no += 1
    print('结束爬取猿急送订单')
Exemple #8
0
def main(wb):
    """Scrape shixian.com job listings (pages 1-10) into a worksheet.

    Creates a sheet titled '实现' at index 5 of ``wb`` and appends one row
    per listed job: running number, description, link, contact.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.
    """
    print('开始爬取实现订单')
    sheet = wb.create_sheet('实现', 5)
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    row_no = 1
    for page in range(1, 11):
        listing = get_info('https://shixian.com/job/all?page={}&sort_arrow=down'.format(page))
        if not isinstance(listing, list):
            # A tuple carries the error text in its second element.
            if isinstance(listing, tuple):
                message = '实现爬取出错:%s' % listing[1]
                print(message)
                send_message(message)
            continue
        for entry in listing:
            desc = entry['desc']
            contact = get_contact(desc)
            sheet.append([row_no, desc, entry['link'], contact])
            row_no += 1
    print('结束爬取实现订单')
Exemple #9
0
def main(wb):
    """Scrape rrkf.com request listings (pages 1-10) into a worksheet.

    Creates a sheet titled '人人开发' at index 2 of ``wb`` and appends one
    row per listed request: running number, description, link, contact.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.
    """
    print('开始爬取人人开发订单')
    sheet = wb.create_sheet('人人开发', 2)
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    row_no = 1
    for page in range(1, 11):
        listing = get_info('http://www.rrkf.com/serv/request?&currentPage={}'.format(page))
        if not isinstance(listing, list):
            # A tuple carries the error text in its second element.
            if isinstance(listing, tuple):
                message = '人人开发爬取出错:%s' % listing[1]
                print(message)
                send_message(message)
            continue
        for entry in listing:
            desc = entry['desc']
            contact = get_contact(desc)
            sheet.append([row_no, desc, entry['link'], contact])
            row_no += 1
    print('结束爬取人人开发订单')
Exemple #10
0
def main(wb):
    """Scrape codemart.com API projects (pages 1-10) into the default sheet.

    The existing 'Sheet' worksheet is renamed to '码市' and receives one row
    per project: running number, cleaned description, project link, contact.
    The wall-clock time taken at entry is handed to every ``get_one_page``
    call.

    Args:
        wb: Workbook-like object that already contains a sheet named 'Sheet'.
    """
    print('开始爬取码市订单')
    start_time = time.time()
    sheet = wb['Sheet']
    sheet.title = '码市'
    sheet.append(['单据编号', '订单描述', '链接', '分配人员'])
    row_no = 1
    for page in range(1, 11):
        result = get_one_page(
            'https://codemart.com/api/project?page={}'.format(page), start_time)
        if not isinstance(result, list):
            # A tuple carries the error text in its second element.
            if isinstance(result, tuple):
                message = '码市爬取出错:%s' % result[1]
                print(message)
                send_message(message)
            continue
        for project in result:
            # Strip the characters matched by ILLEGAL_CHARACTERS_RE before
            # the text is written to the sheet.
            desc = ILLEGAL_CHARACTERS_RE.sub(r'', project['description'])
            contact = get_contact(desc)
            link = 'https://codemart.com/project/{}'.format(project['id'])
            sheet.append([row_no, desc, link, contact])
            row_no += 1
    print('结束爬取码市订单')
Exemple #11
0
def main(wb, session, OrdrModel, WebsiteModel):
    """Crawl 51waibao.net projects into a worksheet and the order table.

    Listing pages are visited from 10 down to 1.  Every linked detail page
    is fetched; projects published before the externally defined
    ``time_point`` are skipped.  Each remaining project is updated or
    inserted under the key ``wy-<id>``, and currently valid ones are written
    to the '51外包' sheet.  The session is committed once per successfully
    fetched listing page.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.
        session: Database session exposing ``query``, ``add`` and ``commit``.
        OrdrModel: Mapped order class (the misspelled parameter name is kept
            so keyword callers keep working).
        WebsiteModel: Mapped website class; row 5 is attached to new orders.

    Notes:
        A successful ``get_detail`` result is indexed as
        ``[id, category, status text, publish time string, description]``.
        Any non-list result is reported as a ``(line number, error text)``
        pair.
    """
    print('开始爬取51外包订单')
    sheet = wb.create_sheet('51外包', 4)
    # Header now matches the six-column data rows written below; it used to
    # list only four columns.
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    count = 1
    website = session.query(WebsiteModel).get(5)
    for i in range(10, 0, -1):
        url = 'http://www.51waibao.net/Project.html?page=%d' % i
        link_list = get_links(url)
        if isinstance(link_list, list):
            for link in link_list:
                result = get_detail(link)
                if isinstance(result, list):
                    publish_time = datetime.strptime(result[3],
                                                     "%Y-%m-%d %H:%M:%S")
                    if publish_time < time_point:
                        # Skipped entries also skip the pause below, as in
                        # the original control flow.
                        continue
                    desc = result[4]
                    contact = get_contact(desc)
                    wid = 'wy-' + result[0]
                    is_valid = '项目已过期' not in result[2]
                    order_query = session.query(OrdrModel).get(wid)
                    if order_query:
                        is_valided = order_query.is_valid
                        order_query.is_valid = is_valid
                        if is_valid:
                            sheet.append(
                                [count, desc, link, publish_time, contact, ''])
                            count += 1
                            # Explicit comparisons are kept so that a stored
                            # flag of None matches neither transition.
                            if is_valided == False:
                                order_query.is_delete = False
                        elif is_valided == True:
                            order_query.is_delete = True
                    else:
                        order = OrdrModel(
                            id=wid,
                            desc=desc,
                            link=link,
                            contact=contact,
                            category=result[1],
                            pub_time=publish_time,
                            is_valid=is_valid,
                            is_delete=not is_valid)
                        order.website = website
                        session.add(order)
                        if is_valid:
                            # Write the order's own data; the header literal
                            # was appended here by mistake before.
                            sheet.append(
                                [count, desc, link, publish_time, contact, ''])
                            count += 1
                else:
                    message = '51外包详情爬取第%d行出错:%s' % (result[0], result[1])
                    print(message)
                    send_message(message)
                # Pause for up to 0.1 s between detail requests.
                time.sleep(random.random() / 10)
            session.commit()
        elif isinstance(link_list, tuple):
            message = '51外包爬取第%d行出错:%s' % (link_list[0], link_list[1])
            print(message)
            send_message(message)
    print('结束爬取51外包订单')
Exemple #12
0
def main(wb, session, OrderModel, WebsiteModel):
    """Crawl OSChina projects and rewards into a worksheet and the order table.

    Listing pages are visited from 10 down to 1.  Each ``(id, type)`` pair
    is resolved to a detail page (type 2 uses the reward endpoints, any other
    type the project endpoints).  Items published before the externally
    defined ``time_point`` are skipped.  Existing orders are matched on
    description plus publish time rather than on the generated
    ``oc-<id // 10>`` key; unmatched items are inserted.  Currently valid
    items are written to the '开源中国' sheet, and the session is committed
    once per successfully fetched listing page.

    Args:
        wb: Workbook-like object exposing ``create_sheet``.
        session: Database session exposing ``query``, ``add`` and ``commit``.
        OrderModel: Mapped order class; filtered on and instantiated.
        WebsiteModel: Mapped website class; row 2 is attached to new orders.

    Notes:
        A successful ``get_one_page`` result is indexed as
        ``[description html, status code, category, publish time]``.
    """
    print('开始爬取开源中国订单')
    sheet = wb.create_sheet('开源中国', 1)
    sheet.append(['单据编号', '订单描述', '链接', '发布时间', '联系方式', '分配人员'])
    row_no = 1
    website = session.query(WebsiteModel).get(2)
    for page in range(10, 0, -1):
        id_list = get_id('https://zb.oschina.net/project/contractor-browse-project-and-reward?applicationAreas=&moneyMinByYuan=&moneyMaxByYuan=&sortBy=30&currentTime=&pageSize=20&currentPage=%d' % page)
        if not isinstance(id_list, list):
            # A tuple signals a failure as (line number, error text).
            if isinstance(id_list, tuple):
                message = '开源中国爬取第%d行出错:%s' % (id_list[0], id_list[1])
                print(message)
                send_message(message)
            continue

        for raw_id, item_type in id_list:
            if item_type == 2:
                detail_url = 'https://zb.oschina.net/reward/detail?id=%d' % raw_id
                link = 'https://zb.oschina.net/reward/detail.html?id=%s' % raw_id
            else:
                detail_url = 'https://zb.oschina.net/project/detail?id=%s' % raw_id
                link = 'https://zb.oschina.net/project/detail.html?id=%s' % raw_id

            result = get_one_page(detail_url)
            if isinstance(result, tuple):
                message = '开源中国详情爬取第%d行出错:%s' % (result[0], result[1])
                print(message)
                send_message(message)
                continue
            if not isinstance(result, list):
                # Any other result type is ignored without a report.
                continue

            publish_time = result[3]
            if publish_time < time_point:
                continue
            desc = html2text.html2text(result[0]).strip()
            is_valid = result[1] == 3
            contact = get_contact(desc)

            # NOTE(review): the stored key drops the last decimal digit of
            # the site id - confirm this is intended.
            order_id = 'oc-{}'.format(raw_id // 10)
            stored = session.query(OrderModel).filter_by(
                desc=desc, pub_time=publish_time).first()
            if stored:
                previously_valid = stored.is_valid
                stored.is_valid = is_valid
                if is_valid:
                    sheet.append([row_no, desc, link, publish_time, contact, ''])
                    row_no += 1
                    # Explicit comparisons are kept so that a stored flag of
                    # None matches neither transition.
                    if previously_valid == False:
                        stored.is_delete = False
                elif previously_valid == True:
                    stored.is_delete = True
            else:
                fresh = OrderModel(
                    id=order_id,
                    desc=desc,
                    link=link,
                    contact=contact,
                    category=result[2],
                    pub_time=publish_time,
                    is_valid=is_valid,
                    is_delete=not is_valid)
                fresh.website = website
                session.add(fresh)
                if is_valid:
                    sheet.append([row_no, desc, link, publish_time, contact, ''])
                    row_no += 1
        session.commit()
    print('结束爬取开源中国订单')