Example 1
import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()

    for demand in demands:
        try:
            task_id = demand["data-taskid"]
            detail_url = "https://task.zbj.com/" + task_id
            detail_html = requests.get(detail_url)
            soup = BeautifulSoup(detail_html.text, 'lxml')
            content = soup.find("div", attrs={"class": "demand-content J-show-hide hide-more"})
            # Publisher and publish time
            user_and_publishtime = content.find("div", attrs={"class": "order-attr"}).get_text()
            amt = content.find("span", attrs={"class": "orange-color"}).get_text()
            amt = str(amt).replace('元', '')
            els = content.find_all("p", attrs={"class": "clearfix"})
            details = [el.find("span", attrs={"class": "description"}).get_text() for el in els]
            result = mysql_client.insert(detail_url, amt, user_and_publishtime, details, key_word, webtype, "zbj-" + task_id)
            if result == -1:
                return result
        except Exception as e:
            print(str(e))
            print("error:https://task.zbj.com/" + demand["data-taskid"])

    mysql_client.destory()
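For context, `parse` here expects `demands` to be a list of BeautifulSoup elements that carry a `data-taskid` attribute. Below is a minimal sketch of how such a list might be collected from a zbj.com listing page; the listing URL and page layout are assumptions, not confirmed by the source:

import requests
from bs4 import BeautifulSoup

def fetch_demands():
    # Hypothetical listing URL; the real search endpoint may differ.
    listing_html = requests.get("https://task.zbj.com/", timeout=3)
    soup = BeautifulSoup(listing_html.text, 'lxml')
    # Any element exposing a data-taskid attribute matches what parse() expects.
    return soup.find_all(attrs={"data-taskid": True})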
Example 2
import datetime
import re

import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()

    for demand in demands:
        try:
            key_word = demand.find("span",attrs={"class":"skill"}).get_text().strip()
            demand = demand.find("h4",attrs={"class":"media-heading"}).find("a")
            detail_url = demand['href']
            task_id = str(detail_url).replace("/projects/", "", 1)


            detail_url= "http://www.taskcity.com/projects/"+task_id
            detail_html = requests.get(detail_url)
            soup = BeautifulSoup(detail_html.text, 'lxml')

            con = soup.find("div", attrs={"class": "col-sm-7"})

            # Publisher
            publish_user = "******"

            left = con.find("div", attrs={"class": "pull-left space-right-30"}).find_all("div")

            # Amount
            amt = left[1].find("b").get_text().strip()

            # Task title
            title = soup.find(id="project_title_name").get_text().strip()

            # Task details: locate the <h3> whose text contains "描述" ("description")
            pattern = re.compile("描述")
            element = soup.find('h3', text=pattern).parent

            content = element.find_all("div",recursive=False)[3].get_text().strip()

            # Task publish time: the <div> whose text contains "发布日期" ("publish date")
            pattern = re.compile("发布日期")
            user_info_action = soup.find('div', text=pattern).get_text()

            mat = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", user_info_action)
            publish_time = str(mat.group(0))
            details = [title, '0', content]

            nowTime_str = datetime.datetime.now().strftime('%H:%M')

            # "发布于" = "published at"; kept verbatim since it is stored in the DB
            result = mysql_client.insert(detail_url, amt, publish_user + "发布于" + publish_time + " " + nowTime_str,
                                         details,
                                         key_word, webtype, "taskcity-" + task_id)

            if result == -1:
                return result

        except Exception as e:
            print(str(e))
            print("error:" + demand["href"])

    mysql_client.destory()
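The publish date above is pulled out of free text with a regex; here is a self-contained check of that exact pattern:

import re

text = "发布日期: 2019-7-3"  # sample string shaped like the scraped field
mat = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", text)
print(mat.group(0))  # -> 2019-7-3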
Example 3
import datetime

import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()

    for demand in demands:
        try:
            detail_url = demand["href"]
            task_id = str(detail_url).replace("http://www.epwk.com/task/", "", 1).replace("/", "", 1)
            detail_html = requests.get(detail_url)
            soup = BeautifulSoup(detail_html.text, 'lxml')

            con = soup.find("div", attrs={"class": "tasktopdet"})

            # Publisher
            publish_user = con.find("div", attrs={"class": "task-user-header"}).find("span").get_text()

            # Amount
            amt = con.find("div", attrs={"class": "task_user_info"}).find("span", attrs={"class": "nummoney f_l"}).find(
                "span").get_text()

            # Task title
            title = con.find("div", attrs={"class": "task_user_info"}).find("h1").get_text()

            # Task details
            content = con.find("div", attrs={"class": "task-info-content"}).get_text()

            # Task publish time
            user_info_action = con.find("div", attrs={"class": "task-user-info-action"}).find_all("span", attrs={
                "class": "dib_vm"})
            publish_time = user_info_action[0].get_text()

            details = [title, '0', content]

            nowTime_str = datetime.datetime.now().strftime('%H:%M')

            result = mysql_client.insert(detail_url, amt, publish_user + "发布于" + publish_time + " " + nowTime_str,
                                         details,
                                         key_word, webtype, "epwk-" + task_id)

            if result == -1:
                return result

        except Exception as e:
            print(str(e))
            print("error:" + demand["href"])

    mysql_client.destory()
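The `task_id` in this example is carved out of the detail URL with two chained `replace` calls, each limited to one occurrence; a quick check of that logic on a sample URL (the URL shape is inferred from the prefix being stripped):

url = "http://www.epwk.com/task/123456/"  # assumed shape
task_id = url.replace("http://www.epwk.com/task/", "", 1).replace("/", "", 1)
print(task_id)  # -> 123456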
Example 4
from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()

    for demand in demands:
        detail_url = "https://www.mayigeek.com/tab/projectDetail?id="+str(demand["id"])
        try:
            # Publisher
            publish_user = "******"

            # Amount
            amt = str(demand['min_price'])

            # Task title
            title = demand['name']

            # Task details
            content = demand['desc']

            # Task publish time
            publish_time = demand['create_date']

            key_word= ",".join(demand['typeStr'])


            details = [title, '0', content]

            result = mysql_client.insert(detail_url, amt, publish_user + "发布于" + publish_time,
                                         details,
                                         key_word, webtype, "mayigeek-" + str(demand["id"]))

            if result == -1:
                return result

        except Exception as e:
            print(str(e))
            print("error:" + detail_url)

    mysql_client.destory()
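Unlike the HTML-scraping examples, this one consumes a JSON listing directly. Judging from the keys accessed above, each element of `demands` is a dict shaped roughly as follows (values are illustrative, not real data):

sample_demand = {
    "id": 12345,                    # used to build the detail URL and task ID
    "min_price": 1000,              # amount
    "name": "Sample project",       # task title
    "desc": "Project description",  # task details
    "create_date": "2019-07-01",    # publish time
    "typeStr": ["web", "python"],   # joined with "," into key_word
}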
Example 5
import datetime

import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()

    for demand in demands:
        try:
            demand = demand.find("a")
            detail_url = demand["href"]
            task_id = str(detail_url).replace("/project/show/", "", 1)

            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Host': 'www.sxsoft.com',
                'Referer': 'https://www.sxsoft.com/page/project',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
            }

            detail_url = "https://www.sxsoft.com" + detail_url
            detail_html = requests.get(detail_url,
                                       params=keyword,
                                       headers=headers,
                                       timeout=3)
            # detail_html = requests.get(url)
            soup = BeautifulSoup(detail_html.text, 'lxml')

            con = soup.find("div", attrs={
                "class": "bg-color-f5 clearfix"
            }).find("div", attrs={"class": "container"})

            # Publisher
            publish_user = con.find("div", attrs={
                "class": "owner clearfix"
            }).find("span", attrs={
                "class": "green"
            }).get_text().strip()

            project_msgs = con.find("div",
                                    attrs={
                                        "class": "project-msg clearfix"
                                    }).find_all("div",
                                                attrs={"class": "col-sm-3"})

            # Project type
            key_word = project_msgs[0].find("p", attrs={
                "class": "cat-name"
            }).get_text().strip()

            # Amount
            amt = project_msgs[1].find("p", attrs={
                "class": "cat-name"
            }).get_text().strip()

            # Task title
            title = con.find("h1", attrs={"class": "project-title"})["title"]

            # Task details
            content = con.find("div",
                               attrs={
                                   "class": "project-content clearfix"
                               }).get_text().strip()

            # Task publish time
            user_info_action = con.find("div",
                                        attrs={
                                            "class": "owner clearfix"
                                        }).find_all("span")

            publish_time = user_info_action[1].get_text().strip()

            details = [title, '0', content]

            nowTime_str = datetime.datetime.now().strftime('%H:%M')

            result = mysql_client.insert(
                detail_url, amt,
                publish_user + "发布于" + publish_time + " " + nowTime_str,
                details, key_word, webtype, "sxsoft-" + task_id)

            if result == -1:
                return result

        except Exception as e:
            print(str(e))
            print("error:" + demand["href"])

    mysql_client.destory()
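The headers dict above is rebuilt on every loop iteration; hoisting it into a small shared helper keeps the scrapers consistent. A minimal sketch using only the standard requests API (the helper name is ours, not the project's):

import requests

DEFAULT_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/75.0.3770.100 Safari/537.36'),
}

def fetch(url, **kwargs):
    # Thin wrapper so every scraper shares the same headers and timeout.
    kwargs.setdefault('headers', DEFAULT_HEADERS)
    kwargs.setdefault('timeout', 3)
    return requests.get(url, **kwargs)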
Example 6
import json

import jsonpath
import requests
from bs4 import BeautifulSoup

from com.parttimejob.db.mysqlclient import MysqlClient


def parse(demands, key_word, webtype):
    # Create the database client
    mysql_client = MysqlClient()

    for demand in demands:
        detail_url = "https://zb.oschina.net/project/detail?id=" + str(
            demand["id"])
        urla = 'https://zb.oschina.net/project/detail.html?id=' + str(
            demand["id"])

        headers = {
            'Accept': 'application/json',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Host': 'zb.oschina.net',
            'Referer': 'https://zb.oschina.net/projects/list.html',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

        # The original passed an undefined name `keyword` as params;
        # the key_word argument is assumed to be what was intended.
        strhtml = requests.get(detail_url,
                               params=key_word,
                               headers=headers,
                               timeout=3)

        c = strhtml.content
        unicodestr = json.loads(c)

        res = jsonpath.jsonpath(unicodestr, '$.data')[0]

        try:
            # Publisher
            publish_user = "******"
            # Project number
            projectNo = res['projectNo']

            # Amount
            amt_min = res['budgetMinByYuan']
            amt_max = res['budgetMaxByYuan']
            if str(amt_max) != '0':
                amt = str(amt_min) + "-" + str(amt_max)
            else:
                amt = "价格面议"  # "price negotiable"

            # Task title
            title = res['name']

            print(title)

            # Task details
            content = res['prd']

            soup = BeautifulSoup(str(content), 'lxml')
            content = soup.get_text().strip()

            # Task publish time
            publish_time = res['publishTime']

            details = [title, '0', content]

            result = mysql_client.insert(urla, amt,
                                         publish_user + "发布于" + publish_time,
                                         details, key_word, webtype,
                                         "oschina-" + projectNo)

            if result == -1:
                print("过期数据不处理", urla)
                continue

        except Exception as e:
            print(str(e))
    mysql_client.destory()
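This example reads the zb.oschina.net JSON API instead of HTML, and uses jsonpath to pull the `data` object out of the response. A self-contained illustration of that extraction; the payload shape mirrors the fields used above, with made-up values:

import json
import jsonpath

payload = json.loads('''{
    "data": {
        "projectNo": "P-0001",
        "budgetMinByYuan": "5000",
        "budgetMaxByYuan": "8000",
        "name": "Sample project",
        "prd": "<p>Requirements here</p>",
        "publishTime": "2019-07-01 10:00"
    }
}''')

res = jsonpath.jsonpath(payload, '$.data')[0]
print(res['name'])  # -> Sample project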
Example 7
# coding:utf-8
import datetime
import os

from xlwt import Workbook

from com.parttimejob.db.mysqlclient import MysqlClient

mysql = MysqlClient()


def create_yesterday_excle():
    yesterday = (datetime.datetime.now() -
                 datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    today = datetime.datetime.now().strftime('%Y-%m-%d')

    res = mysql.get_partjob_list(yesterday, today)

    w = Workbook()  # create a workbook
    ws = w.add_sheet('1')  # add a worksheet

    col = 0  # row index of the header row

    ws.write(col, 0, '任务编号')  # task ID
    ws.write(col, 1, '平台')  # platform
    ws.write(col, 2, '工作类型')  # job type
    ws.write(col, 3, '任务名')  # task title
    ws.write(col, 4, '报酬')  # pay
    ws.write(col, 5, '发布时间')  # publish time
    ws.write(col, 6, '任务链接')  # task link