コード例 #1
0
# -*-coding:utf8-*-

import re
import sys
import time

import requests
from cookie_manager import *

from v2.database.mysql import MySQL

# Python 2-only workaround: reload(sys) re-exposes sys.setdefaultencoding
# (removed from the module at interpreter startup), which is then used to make
# UTF-8 the process-wide default codec for implicit str/unicode conversions.
reload(sys)

sys.setdefaultencoding('utf-8')

# Module-level client from v2.database.mysql.
MySQLClient = MySQL()
# presumably selects the table that later operations act on -- TODO confirm
# against MySQL.set_table.
MySQLClient.set_table("weibo")


def get_user_info(user_id):
    user_id = str(user_id)
    user_url = "https://weibo.cn/" + user_id
    print "正在获取%s的用户信息" % user_id
    try:
        html = requests.get(user_url, headers=get_head())
        content = str(html.content)

        if len(content) == 0:
            print "微博反爬虫,等待1分钟"
            return None
        try:
コード例 #2
0
ファイル: export.py プロジェクト: kingking888/spider-5
# -*-coding:utf8-*-

import math
import sys

from v2.database.mysql import MySQL

# Python 2-only workaround: reload(sys) re-exposes sys.setdefaultencoding
# (removed from the module at interpreter startup), which is then used to make
# UTF-8 the process-wide default codec for implicit str/unicode conversions.
reload(sys)

sys.setdefaultencoding('utf-8')

# Module-level client from v2.database.mysql; queried by the export loop.
MySQLClient = MySQL()
# Page index: the export loop multiplies it by 5000 to get the LIMIT offset.
count = 0
# Computed once from the initial count, i.e. "export-0.xls".
filename = "export-" + str(count) + ".xls"
# NOTE(review): `Workbook` is not among the imports visible in this excerpt --
# confirm it is imported somewhere before this line.
w = Workbook()
ws = w.add_sheet("sheet1")
while True:
    sql = "select * from JUGEMENT WHERE CONTENT != '" "' LIMIT " + str(
        count * 5000) + ",5000"
    info = MySQLClient.fetchmany(sql)
    row = -1
    for ii in info:
        try:
            row = row + 1
            ws.write(row, 0, ii[0])
            ws.write(row, 1, ii[1])
            data = ii[2].decode('utf8')
            size = math.ceil(len(data) / 2000)
            if size > 1:
                for j in range(int(size)):
                    ws.write(row, 2 + j, data[j * 2000:(j + 1) * 2000])
コード例 #3
0
# -*-coding:utf8-*-

import json
import sys
import time

import requests

from v2.database.mysql import MySQL

# Python 2-only workaround: reload(sys) re-exposes sys.setdefaultencoding
# (removed from the module at interpreter startup), which is then used to make
# UTF-8 the process-wide default codec for implicit str/unicode conversions.
reload(sys)

sys.setdefaultencoding('utf-8')

# Module-level client from v2.database.mysql.
MySQLClient = MySQL()
# presumably selects the table that later operations act on -- TODO confirm
# against MySQL.set_table.
MySQLClient.set_table("article")

head = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie':
    'wxuin=338385195; devicetype=Windows10; version=62060028; lang=zh_CN; pass_ticket=9v1YpDhPxTO7rsg7FARokFZmMqF8c9H6TSNNoRwwN03iXEir2F9yy6OzIRAGSKP3; wap_sid2=CKuyraEBElxNbktXRG5ETW41YlMyanhSd0RWM2ZnNmhkMVlxQl9OVUtsQUhoS0xqZloyb1FlaWNjeVFCMnZiTXdPOGtPLXpOMVRIaHk0V2hRalVuZWRGQlV2TGhhSzBEQUFBfjCFv6fTBTgNQJVO',
    'Pragma': 'no-cache',
    'Referer':
    'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NjA4NzUwNw==&scene=123&uin=MzM4Mzg1MTk1&key=a5093a3f4494c6b19f6e56165af5e5625076f5a6ac48e36474027d512210395610ea2b2e65ec43c246d905fba8054e628cd34e23819baa2b177e818db6d9a3bf4d0f791b67f964bc17d47e4763ec30e4&devicetype=Windows+10&version=62060028&lang=zh_CN&a8scene=1&pass_ticket=BzTu%2BTqcSTidBngcCpl%2FI1MyUnjkvwlJ9RXDEBD1b%2Bk2ijuomwg%2FkE%2Fs2Y%2BQUO5e&winzoom=1',
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400',
    'X-Requested-With': 'XMLHttpRequest',
コード例 #4
0
ファイル: guidestar.py プロジェクト: kingking888/spider-5
# -*-coding:utf8-*-

import sys

import requests
import threadpool

from v2.database.mysql import MySQL

# Python 2-only workaround: reload(sys) re-exposes sys.setdefaultencoding
# (removed from the module at interpreter startup), which is then used to make
# UTF-8 the process-wide default codec for implicit str/unicode conversions.
reload(sys)

sys.setdefaultencoding('utf-8')

# Module-level client from v2.database.mysql.
MySQLClient = MySQL()
# presumably selects the table that later operations act on -- TODO confirm
# against MySQL.set_table.
MySQLClient.set_table("guidestar")

head = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # 'Cookie': 'ASP.NET_SessionId=j1nnkwzpqaypnq5kcaznqyxw; _vwo_uuid_v2=CF82AEDFCF3A07291CAD602214800A38|3fae756324fbb19989d68cf9c76496d9; ki_r=; D_IID=94834CDF-C931-3064-B649-41961E3BA689; D_UID=23AF5DAB-95A9-3D5C-8EA2-51C863072D66; D_ZID=97A1EF18-AB6A-37DA-9565-8A7EE917CE19; D_ZUID=DA852CA1-6D8F-30F7-82BD-00D31C90B04F; D_HID=A4B35261-78D7-30A5-B57D-39044DA0BB43; D_SID=218.106.154.158:NrtZxzJf/+Hp+nLBPX4cr5f/6dyOSWUhBj7c1p1nr0Q; _bizo_bzid=35dec6bc-3cc3-4300-9a49-bbd77eb92db3; _bizo_cksm=90E4D32BE01356B3; __hssrc=1; hubspotutk=2685ce170e2683a593dfe0ec91c26ddf; messagesUtk=2685ce170e2683a593dfe0ec91c26ddf; _ga=GA1.2.1002349185.1515415595; _gid=GA1.2.581669412.1515415595; __hstc=126119634.2685ce170e2683a593dfe0ec91c26ddf.1515414630692.1515414630692.1515467274027.2; _bizo_np_stats=; __atuvc=4%7C2; today=1; .gifAuth=8C2BB863824BA5C8206368BB8263C0FAFCF364B1461CD14DA0CF3FE176FEB66AE432B0BEB917137B142C9E12F2487B827F791E3F6150AE2ADA1ED292B1BD1BD57CCA66D84639930A257CE3FAF33886FAC3CA3625734C9FEDFBBD4DDE77AB77D32E131C23E7003594131020E4; NOPCOMMERCE.AUTH=47BF8E5D34189E8BB19895D52BFA8D78202744A58463E28154477CEAB9422E9A4B4BEA866FDE9139D3456CFD17D783CB76BD5955355476D4EB31D2AAA161042B6E30485762D1235B34D1E9CAF3226F925C7A7FC01CE2C444073FE75271601D74392B17B341AC814EFC14ED6DF4BBD6624429BB977E2D49E934DC56E5; Nop.customer=5e5e1df6-de0b-45dd-a640-3e99e58e5aa3; ki_t=1515414415903%3B1515467252882%3B1515473026982%3B2%3B22; mp_5d9e4f46acaba87f5966b8c0d2e47e6e_mixpanel=%7B%22distinct_id%22%3A%20%22663973866%40qq.com%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%2C%22__mps%22%3A%20%7B%7D%2C%22__mpso%22%3A%20%7B%7D%2C%22__mpus%22%3A%20%7B%7D%2C%22__mpa%22%3A%20%7B%7D%2C%22__mpu%22%3A%20%7B%7D%2C%22__mpap%22%3A%20%5B%5D%7D; mp_mixpanel__c=0',
    'Cookie':
    'D_SID=218.106.154.158:NrtZxzJf/+Hp+nLBPX4cr5f/6dyOSWUhBj7c1p1nr0Q; ASP.NET_SessionId=amawt25xz05imognypl1gi53; _vwo_uuid_v2=4B725826543816CDEEAF23246A720113|4921158b8831650568cb284b74298e65; ki_r=; D_IID=94834CDF-C931-3064-B649-41961E3BA689; D_UID=23AF5DAB-95A9-3D5C-8EA2-51C863072D66; D_ZID=97A1EF18-AB6A-37DA-9565-8A7EE917CE19; D_ZUID=DA852CA1-6D8F-30F7-82BD-00D31C90B04F; D_HID=A4B35261-78D7-30A5-B57D-39044DA0BB43; today=1; .gifAuth=F0950F8EA7CE3FE231C301281D4E437291E08C4364BF4BD5EE38D3130ABD77A1C64670A58ED56E73FAD44E75FE0472FA8D744C3F4A835932023068E3D31E942F3ABB6CF4768DCCE411B16D66383C3BB6A0D67762A616320B0C41C48279B056E7DF1A7CB2597B7CFD8636197A; NOPCOMMERCE.AUTH=C328A0B2DCEDB48D2069CD8F050BDCB5CAC193557D204DCCE967E4680A09E9B94D021DBDEC416E89063C74ECB175E1272DC013120615C8BF9744C7976AE99A95E8D392D306253113E09D1C4E908ECC1BF6B094EDF932CDAE43CB1F30EC069B620906002394FBF3F67086525E30BC733452A73729B5AC5C1B998260D8; Nop.customer=f69798b7-ce73-4844-a891-1559c9647f0a; mp_5d9e4f46acaba87f5966b8c0d2e47e6e_mixpanel=%7B%22distinct_id%22%3A%20%221025711995%40qq.com%22%2C%22%24initial_referrer%22%3A%20%22https%3A%2F%2Fwww.guidestar.org%2F%22%2C%22%24initial_referring_domain%22%3A%20%22www.guidestar.org%22%2C%22__mps%22%3A%20%7B%7D%2C%22__mpso%22%3A%20%7B%7D%2C%22__mpus%22%3A%20%7B%7D%2C%22__mpa%22%3A%20%7B%7D%2C%22__mpu%22%3A%20%7B%7D%2C%22__mpap%22%3A%20%5B%5D%7D; ki_t=1515495454400%3B1515548393911%3B1515548431544%3B2%3B9; mp_mixpanel__c=0',
    'Host': 'www.guidestar.org',
    'Origin': 'https://www.guidestar.org',
    'Pragma': 'no-cache',
    'Referer': 'https://www.guidestar.org/search',
    'User-Agent':
コード例 #5
0
from v2.database.mysql import MySQL

# Python 2-only workaround: reload(sys) re-exposes sys.setdefaultencoding
# (removed from the module at interpreter startup), which is then used to make
# UTF-8 the process-wide default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')


def check_contain_chinese(check_str):
    """Return True if ``check_str`` contains at least one Chinese character.

    A character counts as Chinese when its code point lies in the inclusive
    range U+4E00..U+9FFF.

    Args:
        check_str: Text to inspect. Byte strings are decoded as UTF-8 first;
            text (unicode) strings are inspected as-is.

    Returns:
        bool: True if any character falls in the range, otherwise False
        (including for empty input).

    Raises:
        UnicodeDecodeError: If a byte string is not valid UTF-8.
    """
    # Decode only genuine byte strings. Calling .decode() on text merely
    # round-trips through the default codec on Python 2 (and so depends on the
    # setdefaultencoding override), and does not exist at all on Python 3.
    if isinstance(check_str, bytes):
        check_str = check_str.decode('utf-8')
    return any(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)


# Segmented words that are dropped even when they contain Chinese characters
# (checked with `word not in ignore_word` in the loop below).
ignore_word = ["成功"]

# Module-level client from v2.database.mysql; queried by the loop below.
MySQLClient = MySQL()
# Page index: multiplied by 5000 to form the LIMIT offset of each query.
count = 0
while True:
    resList = MySQLClient.fetchmany(
        "select * from JUGEMENT WHERE WORDS is null LIMIT " +
        str(count * 5000) + ",5000")
    if resList is None or len(resList) == 0:
        break
    for res in resList:
        seg_list = jieba.cut(res[2])
        new_list = []
        print res[0]
        for word in seg_list:
            if check_contain_chinese(word) and word not in ignore_word:
                new_list.append(word)
        sql = "UPDATE JUGEMENT SET WORDS='" + " ".join(
コード例 #6
0
#!/usr/bin/python
# -*- coding:utf-8 -*-

import sys

from v2.database.mysql import MySQL

# Module-level client from v2.database.mysql; queried by the main script.
MySQLClient = MySQL()

# Python 2-only workaround: reload(sys) re-exposes sys.setdefaultencoding
# (removed from the module at interpreter startup), which is then used to make
# UTF-8 the process-wide default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')

# Words intended to be ignored; not referenced in the visible part of the script.
ignore_words = ["我", "你", "他", "她", "的", "地", "月", "年", "在"]
if __name__ == "__main__":

    # Not populated in the visible part of the script; presumably maps
    # word -> frequency -- TODO confirm.
    word_dict = {}
    # Zero-based page index; each page covers 5000 rows.
    count = 0

    while True:
        # Words gathered from the current page only (reset on every iteration).
        word_lst = []
        print "正在获取分词数据..."
        # Fetch the next page of non-null WORDS values:
        # LIMIT <count * 5000>,5000.
        resList = MySQLClient.fetchmany(
            "select WORDS from JUGEMENT WHERE WORDS is not null LIMIT " +
            str(count * 5000) + ",5000")
        # A None or empty result ends the paging loop.
        if resList is None or len(resList) == 0:
            print "计算完成,正在整理输出词频结果..."
            break
        for res in resList:
            # WORDS is the only selected column; split it on single spaces.
            word_lst.extend(res[0].split(" "))
        count = count + 1
        # Reports count * 5000, i.e. the number of rows requested so far
        # (the last page may have returned fewer).
        print "已经成功获取" + str(count * 5000) + "条数据"
コード例 #7
0
ファイル: spider.py プロジェクト: kingking888/spider-5
# -*-coding:utf8-*-

import sys
import time

from request.request_manager import RequestManager

from v2.database.mysql import MySQL

# Python 2-only workaround: reload(sys) re-exposes sys.setdefaultencoding
# (removed from the module at interpreter startup), which is then used to make
# UTF-8 the process-wide default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')

# Module-level client from v2.database.mysql.
MySQLClient = MySQL()
# All JUGEMENT rows whose URL column is the empty string.
# NOTE(review): the result is iterated directly; if fetchmany can return None
# the loop over `ids` would raise TypeError -- confirm against MySQL.fetchmany.
ids = MySQLClient.fetchmany("select * from JUGEMENT WHERE URL = ''")
# Assemble one judgement-detail API URL per fetched row. The judgement id is
# the row's first column converted to str, with every newline removed.
_DETAIL_URL_HEAD = "https://www.itslaw.com/api/v1/detail?timestamp=1502224564992&judgementId="
_DETAIL_URL_TAIL = "&area=1&sortType=1&conditions=searchWord%B"

urls = []
for aid in ids:
    url = "".join(
        (_DETAIL_URL_HEAD, str(aid[0]).replace("\n", ""), _DETAIL_URL_TAIL))
    urls.append(url)

# Total number of URLs that were built.
count = len(urls)

head = {
    'Accept':
    'application/json, text/plain, */*',
    'Accept-Encoding':
    'gzip, deflate, sdch, br',
    'Accept-Language':
    'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control':
    'no-cache',
コード例 #8
0
ファイル: id_generateor.py プロジェクト: kingking888/spider-5
# -*-coding:utf8-*-

import re
import sys
import time

import requests

from v2.database.mysql import MySQL

# Python 2-only workaround: reload(sys) re-exposes sys.setdefaultencoding
# (removed from the module at interpreter startup), which is then used to make
# UTF-8 the process-wide default codec for implicit str/unicode conversions.
reload(sys)

sys.setdefaultencoding('utf-8')

# Module-level client from v2.database.mysql.
MySQLClient = MySQL()

# Pre-compute every case-file search URL: pages 1999 down to 5 (20 results per
# page), and for each page one URL per trial year in `years`.
urls = []
years = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017"]

# The query string is fixed apart from the start index and the trial year
# (the year appears twice in the trialYear condition).
_CASE_FILES_URL = (
    "https://www.itslaw.com/api/v1/caseFiles?startIndex={start}"
    "&countPerPage=20&sortType=1"
    "&conditions=region%2B1%2B1%2B%E5%8C%97%E4%BA%AC%E5%B8%82"
    "&conditions=caseType%2B2%2B10%2B%E5%88%91%E4%BA%8B"
    "&conditions=trialYear%2B{year}%2B7%2B{year}"
)

for i in reversed(range(5, 2000)):
    # The start index depends only on the page number, not on the year.
    startIndex = 20 * i
    for year in years:
        url = _CASE_FILES_URL.format(start=startIndex, year=year)
        urls.append(url)

head = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',