# -*-coding:utf8-*- import re import sys import time import requests from cookie_manager import * from v2.database.mysql import MySQL reload(sys) sys.setdefaultencoding('utf-8') MySQLClient = MySQL() MySQLClient.set_table("weibo") def get_user_info(user_id): user_id = str(user_id) user_url = "https://weibo.cn/" + user_id print "正在获取%s的用户信息" % user_id try: html = requests.get(user_url, headers=get_head()) content = str(html.content) if len(content) == 0: print "微博反爬虫,等待1分钟" return None try:
# -*-coding:utf8-*- import math import sys from v2.database.mysql import MySQL reload(sys) sys.setdefaultencoding('utf-8') MySQLClient = MySQL() count = 0 filename = "export-" + str(count) + ".xls" w = Workbook() ws = w.add_sheet("sheet1") while True: sql = "select * from JUGEMENT WHERE CONTENT != '" "' LIMIT " + str( count * 5000) + ",5000" info = MySQLClient.fetchmany(sql) row = -1 for ii in info: try: row = row + 1 ws.write(row, 0, ii[0]) ws.write(row, 1, ii[1]) data = ii[2].decode('utf8') size = math.ceil(len(data) / 2000) if size > 1: for j in range(int(size)): ws.write(row, 2 + j, data[j * 2000:(j + 1) * 2000])
# -*-coding:utf8-*- import json import sys import time import requests from v2.database.mysql import MySQL reload(sys) sys.setdefaultencoding('utf-8') MySQLClient = MySQL() MySQLClient.set_table("article") head = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, sdch, br', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'Cookie': 'wxuin=338385195; devicetype=Windows10; version=62060028; lang=zh_CN; pass_ticket=9v1YpDhPxTO7rsg7FARokFZmMqF8c9H6TSNNoRwwN03iXEir2F9yy6OzIRAGSKP3; wap_sid2=CKuyraEBElxNbktXRG5ETW41YlMyanhSd0RWM2ZnNmhkMVlxQl9OVUtsQUhoS0xqZloyb1FlaWNjeVFCMnZiTXdPOGtPLXpOMVRIaHk0V2hRalVuZWRGQlV2TGhhSzBEQUFBfjCFv6fTBTgNQJVO', 'Pragma': 'no-cache', 'Referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NjA4NzUwNw==&scene=123&uin=MzM4Mzg1MTk1&key=a5093a3f4494c6b19f6e56165af5e5625076f5a6ac48e36474027d512210395610ea2b2e65ec43c246d905fba8054e628cd34e23819baa2b177e818db6d9a3bf4d0f791b67f964bc17d47e4763ec30e4&devicetype=Windows+10&version=62060028&lang=zh_CN&a8scene=1&pass_ticket=BzTu%2BTqcSTidBngcCpl%2FI1MyUnjkvwlJ9RXDEBD1b%2Bk2ijuomwg%2FkE%2Fs2Y%2BQUO5e&winzoom=1', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400', 'X-Requested-With': 'XMLHttpRequest',
# -*-coding:utf8-*- import sys import requests import threadpool from v2.database.mysql import MySQL reload(sys) sys.setdefaultencoding('utf-8') MySQLClient = MySQL() MySQLClient.set_table("guidestar") head = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', # 'Cookie': 'ASP.NET_SessionId=j1nnkwzpqaypnq5kcaznqyxw; _vwo_uuid_v2=CF82AEDFCF3A07291CAD602214800A38|3fae756324fbb19989d68cf9c76496d9; ki_r=; D_IID=94834CDF-C931-3064-B649-41961E3BA689; D_UID=23AF5DAB-95A9-3D5C-8EA2-51C863072D66; D_ZID=97A1EF18-AB6A-37DA-9565-8A7EE917CE19; D_ZUID=DA852CA1-6D8F-30F7-82BD-00D31C90B04F; D_HID=A4B35261-78D7-30A5-B57D-39044DA0BB43; D_SID=218.106.154.158:NrtZxzJf/+Hp+nLBPX4cr5f/6dyOSWUhBj7c1p1nr0Q; _bizo_bzid=35dec6bc-3cc3-4300-9a49-bbd77eb92db3; _bizo_cksm=90E4D32BE01356B3; __hssrc=1; hubspotutk=2685ce170e2683a593dfe0ec91c26ddf; messagesUtk=2685ce170e2683a593dfe0ec91c26ddf; _ga=GA1.2.1002349185.1515415595; _gid=GA1.2.581669412.1515415595; __hstc=126119634.2685ce170e2683a593dfe0ec91c26ddf.1515414630692.1515414630692.1515467274027.2; _bizo_np_stats=; __atuvc=4%7C2; today=1; .gifAuth=8C2BB863824BA5C8206368BB8263C0FAFCF364B1461CD14DA0CF3FE176FEB66AE432B0BEB917137B142C9E12F2487B827F791E3F6150AE2ADA1ED292B1BD1BD57CCA66D84639930A257CE3FAF33886FAC3CA3625734C9FEDFBBD4DDE77AB77D32E131C23E7003594131020E4; NOPCOMMERCE.AUTH=47BF8E5D34189E8BB19895D52BFA8D78202744A58463E28154477CEAB9422E9A4B4BEA866FDE9139D3456CFD17D783CB76BD5955355476D4EB31D2AAA161042B6E30485762D1235B34D1E9CAF3226F925C7A7FC01CE2C444073FE75271601D74392B17B341AC814EFC14ED6DF4BBD6624429BB977E2D49E934DC56E5; Nop.customer=5e5e1df6-de0b-45dd-a640-3e99e58e5aa3; ki_t=1515414415903%3B1515467252882%3B1515473026982%3B2%3B22; mp_5d9e4f46acaba87f5966b8c0d2e47e6e_mixpanel=%7B%22distinct_id%22%3A%20%22663973866%40qq.com%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%2C%22__mps%22%3A%20%7B%7D%2C%22__mpso%22%3A%20%7B%7D%2C%22__mpus%22%3A%20%7B%7D%2C%22__mpa%22%3A%20%7B%7D%2C%22__mpu%22%3A%20%7B%7D%2C%22__mpap%22%3A%20%5B%5D%7D; mp_mixpanel__c=0', 'Cookie': 'D_SID=218.106.154.158:NrtZxzJf/+Hp+nLBPX4cr5f/6dyOSWUhBj7c1p1nr0Q; ASP.NET_SessionId=amawt25xz05imognypl1gi53; _vwo_uuid_v2=4B725826543816CDEEAF23246A720113|4921158b8831650568cb284b74298e65; ki_r=; D_IID=94834CDF-C931-3064-B649-41961E3BA689; D_UID=23AF5DAB-95A9-3D5C-8EA2-51C863072D66; D_ZID=97A1EF18-AB6A-37DA-9565-8A7EE917CE19; D_ZUID=DA852CA1-6D8F-30F7-82BD-00D31C90B04F; D_HID=A4B35261-78D7-30A5-B57D-39044DA0BB43; today=1; .gifAuth=F0950F8EA7CE3FE231C301281D4E437291E08C4364BF4BD5EE38D3130ABD77A1C64670A58ED56E73FAD44E75FE0472FA8D744C3F4A835932023068E3D31E942F3ABB6CF4768DCCE411B16D66383C3BB6A0D67762A616320B0C41C48279B056E7DF1A7CB2597B7CFD8636197A; NOPCOMMERCE.AUTH=C328A0B2DCEDB48D2069CD8F050BDCB5CAC193557D204DCCE967E4680A09E9B94D021DBDEC416E89063C74ECB175E1272DC013120615C8BF9744C7976AE99A95E8D392D306253113E09D1C4E908ECC1BF6B094EDF932CDAE43CB1F30EC069B620906002394FBF3F67086525E30BC733452A73729B5AC5C1B998260D8; Nop.customer=f69798b7-ce73-4844-a891-1559c9647f0a; mp_5d9e4f46acaba87f5966b8c0d2e47e6e_mixpanel=%7B%22distinct_id%22%3A%20%221025711995%40qq.com%22%2C%22%24initial_referrer%22%3A%20%22https%3A%2F%2Fwww.guidestar.org%2F%22%2C%22%24initial_referring_domain%22%3A%20%22www.guidestar.org%22%2C%22__mps%22%3A%20%7B%7D%2C%22__mpso%22%3A%20%7B%7D%2C%22__mpus%22%3A%20%7B%7D%2C%22__mpa%22%3A%20%7B%7D%2C%22__mpu%22%3A%20%7B%7D%2C%22__mpap%22%3A%20%5B%5D%7D; ki_t=1515495454400%3B1515548393911%3B1515548431544%3B2%3B9; mp_mixpanel__c=0', 'Host': 'www.guidestar.org', 'Origin': 'https://www.guidestar.org', 'Pragma': 'no-cache', 'Referer': 'https://www.guidestar.org/search', 'User-Agent':
from v2.database.mysql import MySQL reload(sys) sys.setdefaultencoding('utf-8') def check_contain_chinese(check_str): for ch in check_str.decode('utf-8'): if u'\u4e00' <= ch <= u'\u9fff': return True return False ignore_word = ["成功"] MySQLClient = MySQL() count = 0 while True: resList = MySQLClient.fetchmany( "select * from JUGEMENT WHERE WORDS is null LIMIT " + str(count * 5000) + ",5000") if resList is None or len(resList) == 0: break for res in resList: seg_list = jieba.cut(res[2]) new_list = [] print res[0] for word in seg_list: if check_contain_chinese(word) and word not in ignore_word: new_list.append(word) sql = "UPDATE JUGEMENT SET WORDS='" + " ".join(
#!/usr/bin/python # -*- coding:utf-8 -*- import sys from v2.database.mysql import MySQL MySQLClient = MySQL() reload(sys) sys.setdefaultencoding('utf-8') ignore_words = ["我", "你", "他", "她", "的", "地", "月", "年", "在"] if __name__ == "__main__": word_dict = {} count = 0 while True: word_lst = [] print "正在获取分词数据..." resList = MySQLClient.fetchmany( "select WORDS from JUGEMENT WHERE WORDS is not null LIMIT " + str(count * 5000) + ",5000") if resList is None or len(resList) == 0: print "计算完成,正在整理输出词频结果..." break for res in resList: word_lst.extend(res[0].split(" ")) count = count + 1 print "已经成功获取" + str(count * 5000) + "条数据"
# -*-coding:utf8-*- import sys import time from request.request_manager import RequestManager from v2.database.mysql import MySQL reload(sys) sys.setdefaultencoding('utf-8') MySQLClient = MySQL() ids = MySQLClient.fetchmany("select * from JUGEMENT WHERE URL = ''") urls = [] for aid in ids: url = "https://www.itslaw.com/api/v1/detail?timestamp=1502224564992&judgementId=" + str( aid[0]).replace("\n", "") + "&area=1&sortType=1&conditions=searchWord%B" urls.append(url) count = len(urls) head = { 'Accept': 'application/json, text/plain, */*', 'Accept-Encoding': 'gzip, deflate, sdch, br', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 'Cache-Control': 'no-cache',
# -*-coding:utf8-*- import re import sys import time import requests from v2.database.mysql import MySQL reload(sys) sys.setdefaultencoding('utf-8') MySQLClient = MySQL() urls = [] years = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017"] for i in range(5, 2000)[::-1]: for year in years: startIndex = 0 + 20 * i url = "https://www.itslaw.com/api/v1/caseFiles?startIndex=" + str( startIndex) + "&countPerPage=20&sortType=1&conditions=region%2B1%2B1%2B%E5%8C%97%E4%BA%AC%E5%B8%82" \ "&conditions=caseType%2B2%2B10%2B%E5%88%91%E4%BA%8B" \ "&conditions=trialYear%2B" + year + "%2B7%2B" + year urls.append(url) head = { 'Accept': 'application/json, text/plain, */*', 'Accept-Encoding': 'gzip, deflate, sdch, br', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',