def proxy_weixin(account):
    """Look up gzh info for *account* through a randomly chosen proxy.

    Uses authenticated access (module-level ``auth``) when the proxy
    record's ``random_int`` field equals 1.

    :param account: gzh (official account) identifier to resolve
    :return: whatever ``WechatSogouAPI.get_gzh_info`` returns
    """
    proxies = random_proxy()
    # NOTE(review): only an "http" proxy is configured; HTTPS traffic will
    # bypass the proxy — confirm this is intended.
    # (The original wrapped the value in a no-op "{ip}".format(...) call.)
    proxy_conf = {"http": proxies.get("http")}
    if proxies.get("random_int") == 1:
        ws_api = wechatsogou.WechatSogouAPI(
            captcha_break_time=3, auth=auth, proxies=proxy_conf, timeout=10)
    else:
        ws_api = wechatsogou.WechatSogouAPI(
            captcha_break_time=3, proxies=proxy_conf, timeout=5)
    return ws_api.get_gzh_info(account)
def GetgzhList(keyword):
    """Search Sogou for gzh accounts matching *keyword*, rotating proxies.

    Retries up to 20 rounds over freshly read proxy lists until one proxy
    yields a non-empty result.

    :param keyword: search keyword for the gzh lookup
    :return: ``[itemList, IP, keyword]`` — the normalised result list, the
        proxy dict that worked (empty when all attempts failed), and the
        original keyword.
    """
    isSucess = False
    mostTryCounts = 20  # maximum number of retry rounds
    count = 0
    itemList = []
    IP = {}
    while not isSucess and count < mostTryCounts:
        count += 1
        iplist = read_Proxies()  # fetch the proxy-IP list
        itemList = []
        IP = {}
        for ip in iplist:
            try:
                ws_api = wechatsogou.WechatSogouAPI(proxies=ip, timeout=20)
                # fetch and normalise the search result
                itemList = get_data(ws_api.search_gzh(keyword), 1)
                print("返回后列表长度:" + str(len(itemList)))
                if len(itemList) != 0:
                    IP = ip
                    isSucess = True
                    break
            except Exception:  # was a bare except; keep rotating proxies
                print("访问出错")
                continue
    if not isSucess:
        print("ERROR" + " 可能关键字不存在")
    else:
        print("SUCESS")
    return [itemList, IP, keyword]
def get_news(gzh):
    """Collect titles and content URLs from *gzh*'s history, then push them
    via send_news()."""
    history = wechatsogou.WechatSogouAPI().get_gzh_article_by_history(gzh)
    for entry in history['article']:
        article_title_list.append(entry['title'])
        article_content_list.append(entry['content_url'])
    send_news()
def getInfo(gzh):
    """Print title, abstract and content URL for each history article of *gzh*."""
    history = wechatsogou.WechatSogouAPI().get_gzh_article_by_history(gzh)
    for article in history['article']:
        print(article['title'], article['abstract'], article['content_url'])
def __init__(self):
    """Set up logging, DB-backed state, the proxy pool, the Sogou API client
    and the RuoKuai captcha client for this spider instance."""
    # Silence chatty third-party loggers; keep our own at DEBUG.
    logging.getLogger("wechatsogou").setLevel(logging.WARNING)
    logging.getLogger("peewee").setLevel(logging.WARNING)
    logging.getLogger("requests").setLevel(logging.WARNING)
    self.logger = logging.getLogger()
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        '%(asctime)s [%(threadName)s][%(levelname)s] %(message)s')
    handler.setFormatter(formatter)
    self.logger.addHandler(handler)
    self.logger.setLevel(logging.DEBUG)
    # Load previously recorded dead proxies and persisted crawl state.
    self.get_bad_proxies()
    self.WxTable = WechatInfo()
    self.get_conn()
    self.create_target()
    self.get_saved_data()
    # Proxy source filter — presumably anonymity level L4, no POST support,
    # max 3000ms latency; TODO confirm NewGenerationProxy semantics.
    self.proxies_list = NewGenerationProxy({
        'anony': 'L4',
        'post': 'false',
        'speed': 3000
    })
    proxyLine = self.proxies_list.getProxy()
    # One proxy line is used for both http and https traffic.
    self.wx_api = wechatsogou.WechatSogouAPI(timeout=8,
                                             proxies={
                                                 'http': proxyLine,
                                                 'https': proxyLine
                                             })
    SpiderConfig = Config.SpiderConfig
    self.headers = SpiderConfig.headers.json()
    self.weChat_table = WechatInfo()
    self.proxies_table = UnableProxies()
    self.crawled_table = CrawledData()
    # RuoKuai captcha-solving client.
    # NOTE(review): credentials are hard-coded here — consider moving to config.
    self.rk = RClient('ghost2017b', 'Ghost2017b', '107539',
                      'a8bd936aa1574ddb96d14564c1a0d022')
def startWeChatSpider(self, wechat_id_path, base_path, log_path):
    """Crawl the article history of every gzh id listed in the Excel sheet at
    *wechat_id_path*, writing per-account CSVs under *base_path* and a
    completion line to *log_path*."""
    ws_api = wechatsogou.WechatSogouAPI()
    ids = self.readAllLinesFromExcel(wechat_id_path, 'gongzhonghao')
    for id in ids:
        # Two CSVs per account: parsed articles, and the ids already done.
        finishedIdPath = base_path + '/' + id[0] + '_finished_id.csv'
        saveFilePath = base_path + '/' + id[0] + '_.csv'
        isFinishedIdFileExits = os.path.exists(finishedIdPath)
        isSaveFilePath = os.path.exists(saveFilePath)
        if isSaveFilePath is False:
            # Write the header row once, on first creation.
            self.writeToCSV(saveFilePath, [
                'title', 'abstract', 'author', 'content_url',
                'copyright_stat', 'cover', 'datetime', 'fileid', 'main',
                'send_id', 'source_url', 'type'
            ])
        if isFinishedIdFileExits is False:
            self.writeToCSV(finishedIdPath, ['finished_id'])
        history_list = ws_api.get_gzh_article_by_history(id[0])
        if len(history_list) > 0:
            self.parseData(id[0], history_list, finishedIdPath, saveFilePath)
        time.sleep(1)  # throttle between accounts to avoid anti-spider blocks
    wechatSpider.writeToTxt(
        log_path,
        str(wechatSpider.getCurrntTime() +
            ": finished get gongzhonghao..."))
def main():
    """Fetch base info for every account in the module-level ``gzh_name``
    list and write one row per account to dir_name/gzh_info.xls."""
    print("抓取公众号的基础信息...")
    gzh_info = []
    ws_api = wechatsogou.WechatSogouAPI()
    for name in gzh_name:
        print("正在抓取公众号:", name)
        gzh_info.append(ws_api.get_gzh_info(name))
    # exist_ok avoids the check-then-create race of the original
    # isdir() + mkdir() pair.
    os.makedirs(dir_name, exist_ok=True)
    file_name = os.path.join(dir_name, "gzh_info.xls")
    print("正在写入文件:", file_name)
    wb = Workbook()
    ws = wb.add_sheet('sheet1')
    # Header row from the first record's keys, then one row per account
    # in the same column order.
    colums = list(gzh_info[0].keys())
    for j, col in enumerate(colums):
        ws.write(0, j, col)
    for i, row in enumerate(gzh_info):
        for j, col in enumerate(colums):
            ws.write(i + 1, j, row[col])
    wb.save(file_name)
    print("写入文件完成:", file_name)
def get_article(gzh, titleList):
    """Crawl up to 10 search-result pages of articles for *gzh* through
    rotating proxies, pushing unseen articles (keyed on title + '/' + time)
    to Kafka.

    :param gzh: account keyword to search for
    :param titleList: known "title/time" keys; used for incremental dedup
    :return: list of "title/time" keys crawled, or False when blocked
    """
    articleList = []
    deltaList = []
    maxConut = 3  # retry rounds allowed after a full proxy sweep fails
    keyword = gzh
    count = 0
    isSuccess = False
    page = 1
    while (1):
        while (page <= 10):
            iplist = read_Proxies()
            print('read ip============================================')
            for ip in iplist:
                try:
                    # captcha_break_time: captcha retry count
                    ws_api = wechatsogou.WechatSogouAPI(proxies=ip,
                                                        timeout=10,
                                                        captcha_break_time=2)
                    itemList = []
                    print('scrapy====%s====article==========page %d' %
                          (gzh, page))
                    time.sleep(10)  # throttle to avoid anti-spider blocks
                    # fetch and normalise one result page
                    itemList = get_data(
                        ws_api.search_article(keyword, page=page), gzh)
                    if itemList == False:
                        continue
                    page = page + 1
                    print("\nreturn article list length:" +
                          str(len(itemList)))
                    for art in itemList:
                        print(art['title'])
                        unique = art['title'] + '/' + art['time']
                        articleList.append(unique)
                        if unique not in titleList:
                            # incremental: push new articles to the queue
                            print('kafka')
                            Kafka_fun(art)
                            deltaList.append(art['title'])
                    print("next article list")
                    # BUG FIX: the original had `isSuccess == True`, a no-op
                    # comparison, so success was never recorded.
                    isSuccess = True
                    break
                except Exception as e:
                    print("read article error,check ip is validable?")
                    print(e)
                    check_ip(ip)
                    continue
        if (isSuccess == False):
            count = count + 1
            if (count > maxConut):
                print("OK,article locked!")
                # blocked: give up
                return False
            else:
                get_ip()  # refresh the proxy-IP list
                continue
        else:
            break
    print("Finish")
    return articleList
def dynamic_config():
    """Showcase the configurable WechatSogouAPI constructor parameters."""
    # Direct connection, all defaults.
    ws_api = wechatsogou.WechatSogouAPI()
    # Retry captcha entry up to 3 times (default is 1).
    ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
    # Any requests-library kwarg is accepted here, e.g. proxies.
    # The proxy mapping must include at least one working HTTPS proxy.
    ws_api = wechatsogou.WechatSogouAPI(proxies={
        "http": "127.0.0.1:8888",
        "https": "127.0.0.1:8888",
    })
    # Or a request timeout (seconds).
    ws_api = wechatsogou.WechatSogouAPI(timeout=0.1)
def __init__(self, name):
    """Remember the target account *name* and prepare HTTP headers plus the
    Sogou-WeChat search client."""
    self.name = name
    # Filled in later with the resolved gzh data.
    self.get_gzh = ''
    # Desktop Chrome UA for plain requests calls.
    self.headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    # Sogou WeChat search client with a 1-second timeout.
    self.wechat_gzh = wechatsogou.WechatSogouAPI(timeout=1, )
def wechat_info_list(self, nickname):
    """
    Fetch a gzh's article history, solving captchas via the RuoKuai service.

    :param nickname: gzh (official account) name
    :return: result of ``WechatSogouAPI.get_gzh_article_by_history``
    """
    ocr_config = {
        'type': 'ruokuai',
        'dama_name': config.ruokuai_name,
        'dama_pswd': config.ruokuai_pswd,
        'dama_soft_id': config.ruokuai_soft_id,
        'dama_soft_key': config.ruokuai_soft_key
    }
    # BUG FIX: the original created an ocr_config client and then
    # immediately replaced it with a plain one, silently discarding the
    # RuoKuai configuration.  One client now carries both the OCR config
    # and the captcha retry count.
    ws_api = wechatsogou.WechatSogouAPI(ocr_config=ocr_config,
                                        captcha_break_time=3)
    return ws_api.get_gzh_article_by_history(
        keyword=nickname,
        identify_image_callback_sogou=identify_image_callback_ruokuai_sogou,
        identify_image_callback_weixin=identify_image_callback_ruokuai_weixin)
def get_article(gzhList, ip):
    """Fetch history articles for each gzh in *gzhList* via rotating proxies.

    NOTE(review): the *ip* parameter is never used — the proxy list is read
    from read_Proxies() instead; kept for interface compatibility.

    :param gzhList: iterable of gzh keywords to crawl
    :param ip: unused (see note above)
    :return: accumulated article list; the delta list is written to
        sougou_delta.json before returning.
    """
    articleList = []
    iplist = read_Proxies()
    deltaList = []
    maxConut = 1  # retries allowed after a whole proxy round fails
    titleList = read_file(
        "./baiduspiderProject_new/baiduspider/jsonfile/sougou.json")
    for gzh in gzhList:
        keyword = gzh
        count = 0
        isSuccess = False
        while (1):
            # loop variable renamed from `ip`: it shadowed the parameter
            for proxy in iplist:
                try:
                    ws_api = wechatsogou.WechatSogouAPI(proxies=proxy,
                                                        timeout=10)
                    # fetch and normalise the history data
                    itemList = get_data(
                        ws_api.get_gzh_article_by_history(keyword), 2)
                    print("返回后文章列表长度:" + str(len(itemList)))
                    for art in itemList:
                        print(art)
                        articleList.append(art)  # accumulate every article
                        if art['title'] not in titleList:
                            # incremental: push to the message queue here
                            # deltaList.append(art['title'])
                            pass  # queue push not implemented yet
                    print("下一组文章")
                    isSuccess = True
                    break
                except Exception:  # was a bare except; keep rotating proxies
                    print("文章访问出错")
                    continue
            if (isSuccess == False):
                count = count + 1
                if (count > maxConut):
                    print("尽力了,文章被封锁了!")
                    # blocked: persist the delta and return what was crawled
                    write_file(
                        "./baiduspiderProject_new/baiduspider/jsonfile/sougou_delta.json",
                        deltaList)
                    return articleList
                else:
                    get_ip()  # refresh the proxy-IP list
                    continue
            else:
                break
    write_file(
        "./baiduspiderProject_new/baiduspider/jsonfile/sougou_delta.json",
        deltaList)
    print("Finish")
    return articleList
def get_article_content(gzh):
    """
    Return the latest articles of gzh *gzh* plus the newest timestamp among them.

    On the first run (empty latest_time.txt) all ~10 most recent articles are
    taken; afterwards only articles newer than the stored timestamp.

    :param gzh: gzh (official account) identifier
    :return: {'latest_time': int, 'articles': list} where each article dict has
        the wechatsogou history fields plus an added 'article_content' key.

    History result structure (from wechatsogou):
        {
            'gzh': {
                'wechat_name': '',    # account name
                'wechat_id': '',      # wechat id
                'introduction': '',   # introduction
                'authentication': '', # verification status
                'headimage': ''       # avatar
            },
            'article': [
                {
                    'datetime': int,        # publish time, 10-digit timestamp
                    'title': '',            # article title
                    'abstract': '',         # abstract
                    'content_url': '',      # article link
                    'cover': '',            # cover image
                    'author': '',           # author
                    'copyright_stat': int,  # article type
                },
                ...
            ]
        }
    """
    ws_api = wechatsogou.WechatSogouAPI()
    result = ws_api.get_gzh_article_by_history(gzh)
    # print(result)
    articles = result.get('article')
    new_articles = []
    latest_time = 0
    latest_time_in_file = ''
    # NOTE(review): hard-coded absolute path; crashes if the file is missing.
    with open('/Users/mac/Desktop/Python-Scrapy/weixinarticle/latest_time.txt',
              'r') as f:
        latest_time_in_file = f.read()
    for article in articles:
        # Not the first run: keep only articles newer than the stored timestamp.
        if len(latest_time_in_file) > 0 and article['datetime'] > int(
                latest_time_in_file):
            article['article_content'] = request_article(article)
            new_articles.append(article)
        # First run: take the account's recent articles as-is.
        elif len(latest_time_in_file) == 0:
            article['article_content'] = request_article(article)
            new_articles.append(article)
        # Track the newest timestamp seen across ALL articles.
        if article['datetime'] > latest_time:
            latest_time = article['datetime']
    return {'latest_time': latest_time, 'articles': new_articles}
def content():
    """Flask view (Python 2): search WeChat articles for ?key=, persist an
    md5->url mapping per hit, derive a title/keyword string from Sogou's
    keyword suggestions, persist it, and render content.html."""
    keyword = request.args.get('key')
    vx_obj = wechatsogou.WechatSogouAPI()
    lists = []
    sugg_keywords = []
    md5_string = ''
    keywords = ''
    title = ''
    des = ''
    #try:
    if keyword.strip() != '':
        lists = vx_obj.search_article(keyword)
        for list in lists:  # NOTE(review): shadows the builtin `list`
            wx_url = list['article']['url']
            hash = hashlib.md5()  # NOTE(review): shadows the builtin `hash`
            hash.update(bytes(wx_url))
            md5_str = hash.hexdigest()
            #list['article'].append('wx_url_md5')
            list['article']['wx_url_md5'] = md5_str
            # Persist the digest->url row for later lookup.
            wx_urls = WXUrls(md5_str=md5_str, wx_url=wx_url)
            wx_urls.save()
        sugg_keywords = vx_obj.get_sugg(keyword)
    #except:
    #    print('value errot')
    key_count = len(sugg_keywords)
    # Build title/keywords from the suggestion count:
    # exactly one suggestion -> use it; several -> join them; none -> keyword.
    if key_count == 1:
        title = keywords = sugg_keywords[0]
    elif key_count > 1:
        title = keyword + '_' + sugg_keywords[0]
        for sugg_key in sugg_keywords:
            keywords = keywords + ',' + sugg_key
        keywords = keywords[1:]  # drop the leading comma
    else:
        title = keywords = keyword
    if title.strip() != '':
        # md5 is one-way but deterministic: equal titles always map to the
        # same digest (translated from the original note).
        hash = hashlib.md5()
        hash.update(bytes(title))  # digest the chosen title string
        md5_string = hash.hexdigest()
        keywrods_id = Keywords(md5_string=md5_string, title=keyword)
        keywrods_id.save()
    else:
        print '404.html'
    return render_template('content.html',
                           content_list=lists,
                           title=title,
                           keywords=keywords,
                           des=des,
                           sugg_keywords=sugg_keywords)
def parse_url(self, url):
    """Scrape one directory page: extract gzh names, resolve each through
    Sogou, buffer the info rows, flush them to MySQL, then recurse into the
    'next' page link."""
    print(url)
    try:
        response = requests.get(url)
        r = etree.HTML(response.text, etree.HTMLParser(encoding='utf-8'))
        for item in r.xpath(
                '/html/body/div[4]/div[1]/div[5]/ul/div[@class="newpicsmall_list"]'
        ):
            # NOTE(review): rebinds the `url` parameter to the entry link.
            url = item.xpath('a/@href')[0]
            name = item.xpath('a/li[@class="xiaobiaotizi"]/text()')
            if len(name) == 0:
                continue
            name = name[0]
            print(name, url)
            ws_api = wechatsogou.WechatSogouAPI()
            try:
                wechat_info = ws_api.get_gzh_info(
                    name,
                    identify_image_callback=self.identify_image_callback)
            except wechatsogou.exceptions.WechatSogouVcodeOcrException as e:
                # Captcha OCR failed: report the error to the captcha
                # service and skip this account.
                result = rc.rk_report_error(self.result_id)
                print("验证码错误,报错上传:", result["Result"])
                continue
            if wechat_info is not None:
                item = []
                item.append(wechat_info['wechat_id'])  # account id
                item.append(wechat_info['wechat_name'])  # account name
                item.append(wechat_info['introduction'])  # introduction
                item.append(wechat_info['authentication'])  # verification
                item.append(wechat_info['headimage'])  # avatar url
                item.append(wechat_info['open_id'])
                item.append(wechat_info['qrcode'])  # QR code
                item.append(int(wechat_info['post_perm']))  # posts, last month
                item.append(int(wechat_info['view_perm']))  # views, last month
                item.append(wechat_info['profile_url'])  # latest-10-posts page
                item.append(url)  # source url
                item.append(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                self.wechat_infos.append(item)
        # Flush everything buffered for this page.
        self.insert_to_mysql()
        nextpage = r.xpath(
            '//*[@id="content-pagenation"]/div/div/div/a[@class="next"]/@href'
        )[0]
        if len(nextpage) > 0:
            self.parse_url(nextpage)
    except Exception as e:
        print(traceback.format_exc())
def get_articles(original=True, timedel=1):
    """Return recent history articles for the hard-coded account list.

    :param original: when True keep only original articles
        (``copyright_stat == 100``)
    :param timedel: look-back window in days
    :return: list of reformatted article dicts
    """
    ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
    accounts = ['infoQ', '成都美食']
    articles = []
    for account in accounts:
        articles.extend(reformat(ws_api.get_gzh_article_by_history(account)))
    # Time filter: keep only articles newer than the cutoff timestamp.
    timestamp = int((datetime.now() - timedelta(days=timedel)).timestamp())
    articles = [
        article for article in articles if article['datetime'] > timestamp
    ]
    # Originality filter.  BUG FIX: the old code called articles.remove()
    # while iterating `articles`, which skips the element following each
    # removal; filter into a new list instead (matches the other
    # get_articles variant in this file).
    if original:
        articles = [
            article for article in articles
            if article['copyright_stat'] == 100
        ]
    return articles
def get_gzh_article_info(keyword):
    """Resolve the gzh matching *keyword* and return a single-item list
    holding its search name, display name and wechat id."""
    api = wechatsogou.WechatSogouAPI()
    info = api.get_gzh_info(keyword)
    record = {
        'name': keyword,
        'gzh_wechat_name': info['wechat_name'],
        'gzh_wechat_id': info['wechat_id'],
    }
    return [record]
def weixin_spider():
    """Walk the authorised WeChat accounts, fetch each account's article
    history, normalise every record and hand it to item_fileds() for storage.
    (Python 2 code: uses xrange / has_key / u-literals.)"""
    cout = 1
    wx_accounts = get_weixin_auth().get("data")
    for i in xrange(len(wx_accounts)):
        # NOTE(review): picks a random account each round instead of
        # iterating in order, so duplicates/omissions are possible — confirm
        # this is intentional.
        wx_account = random.choice(wx_accounts)
        if wx_account.has_key("account"):
            account = wx_account.get("account")
            author_none = proxy_weixin(account)
            if author_none == None:
                # Account unresolvable via the proxy lookup: remember it.
                with open("author_none.txt", "a+") as w:
                    w.write(account + "\r\n")
            else:
                ws_api = wechatsogou.WechatSogouAPI()
                doc = ws_api.get_gzh_article_by_history(keyword=account)
                name = doc['gzh']['wechat_name']
                for d in doc['article']:
                    try:
                        html = ws_api.get_article_content(
                            d['content_url'])['content_html']
                    except:
                        # best-effort: skip articles whose body fails to load
                        continue
                    dom = etree.HTML(html)
                    # Plain-text body: every text node, newlines and spaces
                    # stripped.
                    d['content'] = "".join(dom.xpath("//text()")).replace(
                        "\n", "").replace(" ", "")
                    d['url'] = d['content_url']
                    d['pubtime'] = parse_date(d['datetime'])
                    d['site_name'] = u"微信公众号"
                    d['author'] = account
                    d['keyword'] = name
                    # Dedup key: md5 of the article URL.
                    md5 = hashlib.md5()
                    md5.update(d['content_url'])
                    url_md5 = md5.hexdigest()
                    d['url_md5'] = url_md5
                    # Drop raw fields the storage schema does not keep.
                    d.pop("send_id")
                    d.pop("datetime")
                    d.pop("type")
                    d.pop("main")
                    d.pop("abstract")
                    d.pop("fileid")
                    d.pop("content_url")
                    d.pop("source_url")
                    d.pop("cover")
                    d.pop("copyright_stat")
                    item_fileds(d, "data_wemedia", False)
                    time.sleep(1)  # throttle per article
        time.sleep(10)  # throttle per account round
        cout += 1
def gzh_history(name):
    """Dump the history articles of account *name* into
    dir_name/gzh_hist_<name>.xls, one column per article field."""
    api = wechatsogou.WechatSogouAPI()
    print("正在抓取公众号:", name)
    history = api.get_gzh_article_by_history(name)
    out_path = os.path.join(dir_name, "gzh_hist_" + name + ".xls")
    print("正在写入文件:", out_path)
    rows = history['article']
    book = Workbook()
    sheet = book.add_sheet('sheet1')
    # Column headers come from the first record's keys.
    headers = list(rows[0].keys())
    for col_idx, header in enumerate(headers):
        sheet.write(0, col_idx, header)
    for row_idx, row in enumerate(rows, start=1):
        for col_idx, header in enumerate(headers):
            sheet.write(row_idx, col_idx, row[header])
    book.save(out_path)
    print("写入文件完成:", out_path)
def GetgzhList(keyword, page):
    """Fetch one page of gzh search results for *keyword*, rotating proxies
    and tracking whether the result marks the final page.

    :param keyword: gzh search keyword
    :param page: result page number to request
    :return: [itemList, isFinaly] — the normalised page data and the
        last-page flag extracted from the raw result.
    """
    isSucess = False
    mostTryCounts = 3  # maximum number of retry rounds
    count = 0
    while (isSucess == False and count < mostTryCounts):
        count = count + 1
        iplist = read_Proxies()  # fetch the proxy-IP list
        itemList = []
        IP = {}
        ss = 0  # successful fetches this round
        ff = 0  # failed fetches this round
        isFinaly = False
        for ip in iplist:
            try:
                ws_api = wechatsogou.WechatSogouAPI(proxies=ip, timeout=5)
                tempList = ws_api.search_gzh(keyword, page=page)
                itemList = get_data(tempList, 1)  # normalised data list
                if (tempList != []):
                    # First raw record carries the last-page marker —
                    # presumably set by get_data/search_gzh post-processing;
                    # TODO confirm.
                    label = tempList[0]
                    isFinaly = label['isFinaly']
                    print('!!!!!!!!!!!!!' + str(isFinaly))
                print("返回后公众号列表长度:" + str(len(itemList)))
                ss = ss + 1
                if (len(itemList) != 0 and isFinaly == True):
                    print("已经爬到最后一页")
                    isSucess = True
                    break
                if (len(itemList) != 0):
                    IP = ip
                    isSucess = True
                    break
            except Exception as e:
                print("公众号访问出错,检测ip是否失效")
                ff = ff + 1
                print(e)
                check_ip(ip)
                continue
        # More failures than successes and still no result: refresh proxies.
        if isSucess == False and ss <= ff:
            get_ip()
    if isSucess == False:
        print("ERROR" + " 可能关键字不存在或者已经爬到最后一页")
    else:
        print("SUCESS")
    return [itemList, isFinaly]
def get_article(gzh, titleList):
    """Fetch *gzh*'s history articles through rotating proxies; push unseen
    articles (keyed on title + '/' + time) to Kafka.

    :param gzh: gzh keyword to crawl
    :param titleList: known "title/time" keys used for incremental dedup
    :return: full article list, or False when every retry round fails.
    """
    articleList = []
    deltaList = []
    maxConut = 3  # retry rounds allowed after a full proxy sweep fails
    keyword = gzh
    count = 0
    isSuccess = False
    while (1):
        iplist = read_Proxies()
        print('读取ip============================================')
        for ip in iplist:
            try:
                ws_api = wechatsogou.WechatSogouAPI(proxies=ip, timeout=10)
                # fetch and normalise the history data
                itemList = get_data(
                    ws_api.get_gzh_article_by_history(keyword), 2)
                print("\n返回后文章列表长度:" + str(len(itemList)))
                for art in itemList:
                    print(art['title'])
                    articleList.append(art)  # accumulate every article
                    if art['title'] + "/" + art['time'] not in titleList:
                        # incremental: push new articles to the queue
                        Kafka_fun(art)
                        # deltaList.append(art['title'])
                print("下一组文章")
                isSuccess = True
                break
            except Exception as e:
                print("文章访问出错,检测ip是否失效")
                print(e)
                check_ip(ip)
                continue
        if (isSuccess == False):
            count = count + 1
            if (count > maxConut):
                print("尽力了,文章被封锁了!")
                # NOTE(review): the original comment said "return what was
                # crawled after being blocked", but the code returns False —
                # callers must handle both shapes; confirm intent.
                return False
            else:
                get_ip()  # refresh the proxy-IP list
                continue
        else:
            break
    print("Finish")
    return articleList
def spiderSogoWeixinSearch(self, identifyCodeTime, keyWord, pageSize):
    """Search Sogou-WeChat for *keyWord* across *pageSize* result pages and
    return the collected article URLs.

    Returns False for an empty keyword or a negative page count.
    """
    if keyWord == '' or pageSize < 0:
        return False
    # Clamp the captcha-retry count to at least 1.
    retries = identifyCodeTime if identifyCodeTime > 0 else 1
    ws_api = wechatsogou.WechatSogouAPI(retries)
    result = []
    for page in range(1, pageSize + 1):
        try:
            for hit in ws_api.search_article(keyWord, page):
                result.append(hit['article']['url'])
            time.sleep(3)  # throttle between pages
        except Exception as err:
            print(err)
    return result
def getAllPageWeixin():
    """Collect up to 9 pages of 'flyme' article search hits, drop the
    unrelated guitar-band namesake, and run arctical_filter on the rest."""
    collected = []
    ws_api = wechatsogou.WechatSogouAPI()
    for page in range(1, 10):
        # timesn=2 restricts results by time — presumably "within a week";
        # confirm against the wechatsogou docs.
        page_data = ws_api.search_article('flyme', page=page, timesn=2)
        if page_data:
            collected.extend(page_data)
        if len(page_data) < 10:  # a short page means no more results
            break
        time.sleep(1)
    # A guitar band shares the name "flyme"; filter it out.
    kept = [x for x in collected if '吉他' not in x['gzh']['wechat_name']]
    return arctical_filter(kept)
def test(inProxy=None):
    """Probe the Sogou article search through a proxy and pretty-print
    the result.

    :param inProxy: optional ``'host:port'`` proxy line; overrides the
        hard-coded candidates below.
    """
    api = wechatsogou.WechatSogouAPI(timeout=8)
    # Hard-coded candidates kept from manual testing; the last assignment
    # wins when no inProxy is supplied.
    proxyLine = '54.223.188.100:6666'
    proxyLine = '140.227.80.50:3128'  # antispider
    proxyLine = '114.110.21.146:53281'
    proxyLine = '45.6.216.79:80'
    proxyLine = '133.18.55.242:80'  # antispider
    proxyLine = '85.114.25.202:8080'  # OK
    proxyLine = '159.89.163.248:53281'  # antispider
    proxyLine = '110.164.181.164:8080'
    proxyLine = '200.87.134.30:53281'
    proxyLine = '115.203.219.81:33885'  # OK
    proxyLine = '180.122.147.226:24636'  # antispider
    if inProxy != None:
        proxyLine = inProxy
    if proxyLine != None:
        # BUG FIX: the original assigned `inProxy` here (None unless
        # supplied, and a bare string otherwise), so the selected proxy was
        # never actually used; requests expects a scheme->proxy mapping.
        api.requests_kwargs['proxies'] = {
            'http': proxyLine,
            'https': proxyLine,
        }
    print(proxyLine)
    result = api.search_article('Java')
    pprint(result)
def getArticleSummary(name):
    """Insert new history articles of gzh *name* into dgzx.news, skipping
    rows already present (matched on title/author/date), and fetch each
    inserted article's description.

    :param name: gzh account name (stored in the `ifrom` column)
    """
    ws_api = wechatsogou.WechatSogouAPI()
    gzh_article = ws_api.get_gzh_article_by_history(name)
    data_list = []
    j = gzh_article['article']
    print(len(gzh_article['article']))
    cur = conn.cursor()
    cur.execute("SELECT VERSION()")
    data = cur.fetchone()
    print("Database version : %s " % data)
    if j:
        for i in j:
            title = i['title']
            author = i['author']
            dt = timestamp_to_date(i['datetime'])
            # SECURITY FIX: the duplicate check interpolated values with %,
            # which breaks on quotes in titles and allows SQL injection;
            # use driver parameter binding like the INSERT below.
            selectsql = ("SELECT COUNT(`title`) AS count FROM `dgzx`.`news` "
                         "WHERE `title` = %s AND `author` = %s AND `date` = %s")
            cur.execute(selectsql, (title, author, dt))
            results = cur.fetchall()
            if results[0][0] < 1:
                sql = ("INSERT INTO `dgzx`.`news` "
                       "(`title`, `author`, `ifrom`, `date`) "
                       "VALUES (%s,%s,%s,%s)")
                cur.execute(sql, (title, author, name, dt))
                conn.commit()
                print('SQL Success: ' + title)
                try:
                    getArticleDescription(i['content_url'], i['cover'],
                                          title, author, dt)
                except Exception:
                    # best-effort: log and continue (was a bare except)
                    print('SQL Update Failed for ' + title)
                    print('Failed URL:' + i['content_url'])
            else:
                print("Existed: " + title + " " + author + " " + dt)
    cur.close()
def get_articles(headline=True, original=True, timedel=1, add_account=None):
    """Return recent articles for the accounts listed in gzh.txt.

    :param headline: keep only main (headline) articles when True
    :param original: keep only original (``copyright_stat == 100``) articles
    :param timedel: look-back window in days
    :param add_account: optional list of extra accounts, persisted back
        to gzh.txt
    :return: filtered list of reformatted article dicts
    """
    with open('gzh.txt', 'r') as f:
        accounts = [account.strip() for account in f.readlines()]
    # add_account must be a list or None.
    if add_account is not None:
        # BUG FIX: the original called isinstance(list, add_account) — the
        # arguments were reversed, raising TypeError for any list input.
        if isinstance(add_account, list):
            accounts.extend(add_account)
            with open('gzh.txt', 'w') as f:
                for account in accounts:
                    # BUG FIX: the stripped names were written back with no
                    # separator, corrupting the file; one account per line.
                    f.write(account + '\n')
        else:
            print('add_account should be a list')
    ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
    articles = []
    for account in accounts:
        articles.extend(reformat(ws_api.get_gzh_article_by_history(account)))
    # Time filter: keep only articles inside the look-back window.
    timestamp = int((datetime.now() - timedelta(days=timedel)).timestamp())
    articles = [
        article for article in articles if article['datetime'] > timestamp
    ]
    # Headline filter (default on).
    if headline:
        articles = [article for article in articles if article['main'] == 1]
    # Originality filter (default on).
    if original:
        articles = [
            article for article in articles
            if article['copyright_stat'] == 100
        ]
    return articles
# -*- coding: utf-8 -*-
import wechatsogou

ws_api = wechatsogou.WechatSogouAPI()

# The fixed set of WeChat official accounts to query.
wx_list = ["医美圈", "医美视界", "皮秒"]


def search_article():
    """Print account, title, abstract and link for each history article of
    every configured account."""
    for account in wx_list:
        history = ws_api.get_gzh_article_by_history(account)
        for item in history['article']:
            print('公众号:' + account)
            print('标题:' + item['title'])
            print('摘要:' + item['abstract'] + "...")
            print('文章链接:' + item['content_url'])
            if item['source_url'] == "":
                print('阅读原文链接:无')
            print('\n')


search_article()
def get_article(gzh, titleList):
    """Variant of get_article that also appends a progress/error log to
    wechatatricles_zhima.txt.  Pushes unseen articles (keyed on
    title + '/' + time) to Kafka.

    :param gzh: gzh keyword to crawl
    :param titleList: known "title/time" keys used for incremental dedup
    :return: full article list, or False after repeated failures.
    """
    articleList = []
    deltaList = []
    maxConut = 3  # retry rounds allowed after a full proxy sweep fails
    keyword = gzh
    count = 0
    isSuccess = False
    # Log the start of this account's crawl.
    with open('wechatatricles_zhima.txt', 'a+') as fp:
        fp.write('公众号:%s===============\n' % gzh)
        log_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        fp.write('time:%s\n' % log_time)
    while (1):
        iplist = read_Proxies()
        print('读取ip============================================')
        for ip in iplist:
            try:
                ws_api = wechatsogou.WechatSogouAPI(proxies=ip, timeout=10)
                # fetch and normalise the history data
                itemList = get_data(
                    ws_api.get_gzh_article_by_history(keyword), 2)
                print("\n返回后文章列表长度:" + str(len(itemList)))
                # Log how many articles this fetch produced.
                with open('wechatatricles_zhima.txt', 'a+') as fp:
                    fp.write('文章*************************%d\n' %
                             len(itemList))
                    log_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                    fp.write('time:%s\n' % log_time)
                for art in itemList:
                    print(art['title'])
                    articleList.append(art)  # accumulate every article
                    if art['title'] + "/" + art['time'] not in titleList:
                        # incremental: push new articles to the queue
                        Kafka_fun(art)
                        # deltaList.append(art['title'])
                print("下一组文章")
                isSuccess = True
                break
            except Exception as e:
                # Log the failure before rotating to the next proxy.
                with open('wechatatricles_zhima.txt', 'a+') as fp:
                    fp.write('00000000000000000000000000000000000000\n')
                    fp.write('文章访问出错\n')
                    fp.write('%s\n' % str(e))
                    log_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                    fp.write('time:%s\n' % log_time)
                print("文章访问出错,检测ip是否失效")
                print(e)
                check_ip(ip)
                continue
        if (isSuccess == False):
            count = count + 1
            if (count > maxConut):
                # Log the give-up event, then signal failure to the caller.
                with open('wechatatricles_zhima.txt', 'a+') as fp:
                    fp.write('11111111111111111111111111111111111111111\n')
                    fp.write('尽力了,文章被封锁了!\n')
                    log_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                    fp.write('time:%s\n' % log_time)
                print("尽力了,文章被封锁了!")
                return False
            else:
                get_ip()  # refresh the proxy-IP list
                continue
        else:
            break
    print("Finish")
    # Log normal completion.
    with open('wechatatricles_zhima.txt', 'a+') as fp:
        fp.write('*************************\n')
        fp.write('Finish\n')
        fp.write('\n\n')
    return articleList
def __init__(self):
    """Initialise the cleaner: parent constructor with placeholder args and
    a direct-connection Sogou API client."""
    # Parent ctor takes four required string arguments; 'no' placeholders —
    # TODO confirm their semantics in the base class.
    super(Clean_Wechat, self).__init__('no', 'no', 'no', 'no')
    self.ws_api = wechatsogou.WechatSogouAPI()
# coding=utf-8 # -*- coding utf-8 -*- # python 2.7 import wechatsogou import json # 直连 ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3) info = ws_api.search_gzh('商城') data = json.dumps(info, indent=4, ensure_ascii=False) print data