def run(self):
    """Drain ``self.task_q`` and store every job posting found on each page.

    For each URL taken from the queue, the page is fetched with
    ``self.headers``, the posting fields are extracted from the response
    text with regular expressions, and one row per posting is inserted into
    the ``tenxun_thread`` table while holding ``self.lock``.

    Returns ``None`` once the queue has no more URLs.
    """
    # Local import: only needed for the queue.Empty exception below.
    import queue

    while True:
        # Checking empty() and then calling a blocking get() is racy when
        # several workers share the queue: another worker can take the last
        # URL in between, leaving this one blocked forever. A non-blocking
        # get closes that window.
        try:
            page_url = self.task_q.get_nowait()
        except queue.Empty:
            break
        print(page_url, "----------- q.get 一个url --------------")

        time.sleep(2)
        response = requests.get(page_url, headers=self.headers)
        body = response.text

        # Each field is collected into its own list; entries are paired up
        # by position further down.
        titles = re.findall(r'"RecruitPostName":"(.*?)",', body)
        locations = re.findall(r'"LocationName":"(.*?)",', body)
        contents = re.findall(r'"Responsibility":"(.*?)",', body)
        pub_times = re.findall(r'"LastUpdateTime":"(.*?)",', body)
        post_urls = re.findall(r'"PostURL":"(.*?)",', body)

        # The job type is the second "-"-separated segment of the title.
        # A title without "-" yields an empty type instead of an IndexError.
        job_types = []
        for title in titles:
            segments = title.split("-")
            job_types.append(segments[1] if len(segments) > 1 else "")

        from mysqlhelper import MysqlHelper

        sql = "insert into tenxun_thread(title,location,types,url,content,pub_time,crawl_time)value(%s,%s,%s,%s,%s,%s,%s)"

        # zip() stops at the shortest list, so a page where one field is
        # missing for some posting no longer raises IndexError mid-insert.
        rows = zip(titles, locations, job_types, post_urls, contents, pub_times)
        for row in rows:
            data = row + (time.localtime(),)

            # NOTE(review): a fresh helper per row is kept from the original;
            # whether one instance can be shared is not visible here.
            mysql = MysqlHelper()
            # Writes from multiple threads are serialized through the lock.
            with self.lock:
                mysql.execute_modify_sql(sql, data)
                time.sleep(0.01)
                print("---------插入成功 ------------")
Beispiel #2
0
def put_mysql():
    """Insert the module-level ``name`` and ``score`` into ``2048_test``."""
    db = MysqlHelper()
    # Table definition for reference:
    # CREATE TABLE 2048_test(id int primary key auto_increment,name varchar(50),  score int) DEFAULT CHARSET = UTF8mb4;

    statement = 'INSERT INTO 2048_test(name, score)VALUES(%s, %s)'
    db.execute_modify_sql(statement, (name, score))
Beispiel #3
0
class TestprojectPipeline(object):
    """Pipeline that stores each processed item as a row of ``lianjia_test_temp``."""

    def __init__(self):
        """Instantiate the MysqlHelper used for all inserts."""
        self.helper = MysqlHelper()

    def process_item(self, item, spider):
        """Insert the six listing fields of ``item`` and return ``item`` unchanged.

        ``spider`` is accepted but not used.
        """
        # Table definition for reference:
        # create table lianjia_test_temp(title varchar(50)) default charset='utf8mb4'
        columns = ('title', 'house', 'typ', 'area', 'orient', 'cost')
        row = tuple(item[column] for column in columns)
        statement = 'INSERT INTO lianjia_test_temp(title,house,typ,area,orient,cost) values (%s,%s,%s,%s,%s,%s)'
        # NOTE(review): method name is spelled "excuate" here; confirm it
        # matches the MysqlHelper this project ships.
        self.helper.excuate_modify_sql(statement, row)
        return item

    def close_spider(self, spider):
        """No cleanup is performed when the spider closes."""
Beispiel #4
0
def read_mysql():
    """Copy up to five top-scoring rows of ``2048_test`` into the global ``data``.

    Rows are read in descending ``score`` order; for each of the first five
    (or fewer, if the table is smaller) the second and third columns are
    written to ``data[row][0]`` and ``data[row][1]``.
    """
    db = MysqlHelper()
    db.cursor.execute('select * from 2048_test order by score desc')
    rows = db.cursor.fetchall()

    # At most five rows are copied; column 0 of each result row is skipped.
    for row_idx in range(min(len(rows), 5)):
        data[row_idx][0] = rows[row_idx][1]
        data[row_idx][1] = rows[row_idx][2]
Beispiel #5
0
from mysqlhelper import MysqlHelper
from hashlib import sha1
from getpass import getpass
# A bare `from string` with no `import` clause is a SyntaxError, so the
# module is imported as a whole instead.
import string

# Module-level database helper used by register(); constructed with 'db5'
# (presumably the database name - confirm against MysqlHelper).
mysql=MysqlHelper('db5')
# Registration function
def register():
    """Interactively register a new user in the ``user`` table.

    Prompts for a username. If no row with that username exists, prompts
    twice for a password (without echo) and, when both entries match, stores
    the username together with the SHA-1 hex digest of the password. The
    outcome is printed in every case; the function returns ``None``.
    """
    username = input("请输入注册的用户名")

    sel = 'select username from user where username=%s'
    # Parameters are passed as a list, matching the execute_sql call below.
    # A bare string is itself a sequence of characters, so a driver that
    # iterates its parameters could treat each character as a separate one.
    existing = mysql.get_all(sel, L=[username])
    if existing:
        print('用户名已存在')
        return

    # Username is free: ask for the password twice and require a match.
    pwd1 = getpass('请输入密码')
    pwd2 = getpass('请再次输入密码')
    if pwd1 != pwd2:
        print('密码不一致')
        return

    # NOTE(review): unsalted SHA-1 is weak for password storage. It is kept
    # because any existing rows / login check presumably rely on this exact
    # digest - confirm before migrating to a salted KDF.
    password = sha1(pwd1.encode()).hexdigest()

    ins = 'insert into user values(%s,%s)'
    mysql.execute_sql(ins, L=[username, password])
    print('注册成功')
Beispiel #6
0
import requests
import json
from w3lib.html import remove_tags
import login_weibo
from mysqlhelper import MysqlHelper
import time
import re

login_weibo.login()  # Log in to Weibo
helper = MysqlHelper()
# base_url = 'https://m.weibo.cn/api/container/getIndex?' \
#            'uid=3591355593&luicode=10000011&' \
#            'lfid=100103type%3D1%26t%3D10%26q%' \
#            '3D%23%E5%90%B4%E4%BA%A6%E5%87%A1%E5%A4' \
#            '%AA%E9%AB%98%E4%BA%86%E8%97%8F%E4%B8%8D%E4%BD%8F' \
#            '%23&containerid=1076033591355593&page={}'
# Weibo's URL changed and no longer shows a page parameter; the form below still works.
base_url = 'https://m.weibo.cn/api/container/getIndex?' \
           'type=uid&value=1887344341&containerid=1076031887344341&page={}'

for i in range(73):
    url = base_url.format(i + 1)
    print('正在爬取第' + str(i) + '页...')

    response = requests.get(url)
    time.sleep(3)
    res_dict = json.loads(re.sub(r'\\\\', '', response.text))
    cards = res_dict['data']['cards']

    for card in cards:
        if 'mblog' in card:
def get_page(num):
    """Fetch one page of Tencent job postings and insert them into ``tenxun``.

    Args:
        num: Page index substituted into the ``pageIndex`` query parameter.

    The response text is scanned with regular expressions for the posting
    fields, and one row per posting is inserted together with the current
    local time as ``crawl_time``. Returns ``None``.
    """
    print("当前进程是{},--进程号是{}".format(num, os.getpid()))
    page_url = "https://careers.tencent.com/tencentcareer/api/post/Query?categoryId=40001001,40001002,40001003,40001004,40001005,40001006&pageIndex={}&pageSize=10".format(
        str(num))
    headers = {
        'accept':
        'application/json, text/plain, */*',
        'cookie':
        '_ga=GA1.2.659377294.1547435949; pgv_pvi=3635187712; loading=agree',
        'referer':
        'https://careers.tencent.com/search.html?query=ot_40001001,ot_40001002,ot_40001003,ot_40001004,ot_40001005,ot_40001006&index=2',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    time.sleep(2)
    response = requests.get(page_url, headers=headers)
    body = response.text

    # Each field is collected into its own list; entries are paired up by
    # position further down.
    titles = re.findall(r'"RecruitPostName":"(.*?)",', body)
    locations = re.findall(r'"LocationName":"(.*?)",', body)
    contents = re.findall(r'"Responsibility":"(.*?)",', body)
    pub_times = re.findall(r'"LastUpdateTime":"(.*?)",', body)
    post_urls = re.findall(r'"PostURL":"(.*?)",', body)

    # The job type is the second "-"-separated segment of the title.
    # A title without "-" yields an empty type instead of an IndexError.
    job_types = []
    for title in titles:
        segments = title.split("-")
        job_types.append(segments[1] if len(segments) > 1 else "")

    from mysqlhelper import MysqlHelper

    sql = "insert into tenxun(title,location,types,url,content,pub_time,crawl_time)value(%s,%s,%s,%s,%s,%s,%s)"

    # zip() stops at the shortest list, so a page where one field is missing
    # for some posting no longer raises IndexError mid-insert.
    rows = zip(titles, locations, job_types, post_urls, contents, pub_times)
    for row in rows:
        data = row + (time.localtime(),)

        # NOTE(review): a fresh helper per row is kept from the original;
        # whether one instance can be shared is not visible here.
        mysql = MysqlHelper()
        mysql.execute_modify_sql(sql, data)
        time.sleep(0.01)
        print("---------插入成功 ------------")
Beispiel #8
0
 def __init__(self):
     """Instantiate MysqlHelper and keep it on ``self.helper``."""
     self.helper = MysqlHelper()