def parse(self, response):
    # Broadcast metadata that changes infrequently (low-volatility data)
    creatorName = response.xpath(
        '//*[@id="player_area"]/div[2]/div[2]/div[1]/text()').get()
    startAt = response.xpath(
        '//*[@id="player_area"]/div[2]/div[2]/ul/li[1]/span/text()').get()
    resolution = response.xpath(
        '//*[@id="player_area"]/div[2]/div[2]/ul/li[2]/span/text()').get()
    videoQuality = response.xpath(
        '//*[@id="player_area"]/div[2]/div[2]/ul/li[3]/span/text()').get()
    endAt = dt.now().strftime('%Y-%m-%d %H:%M:%S')

    afreecaCreator = Afreecacreators()
    afreecaCreator.updateContent(self.creatorId, creatorName, startAt,
                                 resolution, videoQuality, endAt)
    lg.info(f'Saving chat data for {creatorName}.')

    for chatData in chatAllData:
        # Create a fresh item per chat message so the yielded items
        # do not all alias one repeatedly mutated object.
        item = AfreecatvChat()
        item['text'] = chatData['text']
        item['is_mobile'] = chatData['is_mobile']
        item['sex'] = chatData['sex']
        item['grade'] = chatData['grade']
        item['chattime'] = chatData['chattime']
        item['userId'] = chatData['userId']
        item['viewer'] = chatData['viewer']
        item['category'] = chatData['category']
        item['videoTitle'] = chatData['videoTitle']
        item['like'] = chatData['like']
        item['bookmark'] = chatData['bookmark']
        item['creatorId'] = chatData['creatorId']
        yield item
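# Since every field above is copied verbatim from chatData, the assignments
# can be collapsed into a loop. A minimal sketch, assuming AfreecatvChat
# declares exactly the fields listed above; the helper name is hypothetical:
CHAT_FIELDS = ('text', 'is_mobile', 'sex', 'grade', 'chattime', 'userId',
               'viewer', 'category', 'videoTitle', 'like', 'bookmark',
               'creatorId')

def copy_chat_item(chatData):
    # Hypothetical helper, not from the source.
    item = AfreecatvChat()
    for field in CHAT_FIELDS:
        item[field] = chatData[field]
    return item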
def process_response(self, response):
    # The original test `!= 404 or != 500` was always true; an error status
    # must skip processing, so check membership instead.
    if response.status_code not in (404, 500):
        repos = response.json()
        if len(repos):
            for repo in repos:
                self.insert_to_db(urljoin(self.GITHUB_BASE, repo[self.REPO_NAME]))
            self.save_position(repos[-1]['id'])
        else:
            # Stop running on empty response
            self.__run = False
    logger.info('\033[1;36mCrawling\033[0m {} repositories discovered ..'.format(
        self.position['discovery_since']))
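# save_position is not shown in this section. A hypothetical sketch of the
# bookkeeping it implies: persist the last seen repository id so the next
# GitHub /repositories?since=<id> request resumes where this one stopped.
# The file name and dict layout are assumptions, not from the source.
import json

def save_position(self, last_id):
    self.position['discovery_since'] = last_id
    with open('position.json', 'w') as fp:
        json.dump(self.position, fp)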
def allChildUrl(self, argsList, ftype, allurldir):
    urlList = []
    self.DEEP -= 1
    logger.info(self.DEEP)
    sitesize = PathSize().GetPathSize(self.langurl)  # size in MB
    if float(sitesize) >= float(self.ssize):
        logger.error('Folder %s size: %s, required minimum: %s'
                     % (self.langurl, sitesize, self.ssize))
        try:
            requests.adapters.DEFAULT_RETRIES = 10
            requests.get(
                'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s'
                % (get_mac_address(), self.langurl, sitesize),
                timeout=5)
        except Exception:
            pass
    # Fan the page scans out over a coroutine pool to raise IO concurrency.
    pool = gevent.pool.Pool(500)
    results = pool.imap(self.scanpage, [(args, ftype) for args in argsList])
    urlList = []
    for pageUrls in results:
        urlList += pageUrls
    if ftype:
        with open(allurldir + ftype + '.txt', 'w') as fp:
            for i in urlList:
                fp.write(i + '\n')
    if self.DEEP <= 0:
        urlList = []
    return urlList
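# Standalone illustration of the fan-out pattern above (not project code):
# Pool.imap yields results in input order while running up to the pool
# size of greenlets concurrently.
import gevent.pool

def square(n):
    return n * n

demo_pool = gevent.pool.Pool(3)          # at most 3 greenlets at once
for result in demo_pool.imap(square, range(5)):
    print(result)                        # 0 1 4 9 16, in input order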
def main():
    global mainUrl
    allfile = glob.glob(confpath + '*.conf')
    ssize = input("* Download quota per language (default 1, unit: MB) >>> ")
    deep = input("* Maximum crawl depth per site (default 3) >>> ")
    threadnum = input("* Number of sites crawled concurrently per language "
                      "(thread count, default 3; change with caution, "
                      "or just press Enter) > ")
    if not deep:
        deep = 3
    if not threadnum:
        threadnum = 3
    if not ssize:
        ssize = 1
    # Expiry check
    try:
        nowdate = get_webservertime('www.baidu.com')
        req = requests.get('http://xn--cnq423f4sm.com:443/country24/' + get_mac_address(),
                           timeout=5)
        if req and (not nowdate or int(nowdate[0]) >= eval(req.text)):
            return logger.error('The validity period has expired!')
    except Exception:
        pass
    langages = LangagesofFamily()
    # Directory for the full set of site URLs
    if not os.path.exists(urldir):
        os.makedirs(urldir)
    # Directory for the generated html files
    if not os.path.exists(xfile):
        os.makedirs(xfile)
    for i in langages.lanclass:
        if os.path.exists(confpath + i[1] + '.conf'):
            mainUrl = 'output/' + i[1] + '/'
            if not os.path.exists(mainUrl):
                os.makedirs(mainUrl)
            logger.info('Generated files will be written to: %s' % mainUrl)
            lik = i[0]
            with codecs.open(confpath + i[1] + '.conf', 'r', "utf-8") as fp:
                outf = fp.readlines()
            allOneLangageSite(outf, i, mainUrl, langages, deep, threadnum, ssize)
def run_crawl():
    """Run the crawler."""
    lg.info('Starting the crawler program')
    crawl_urls = liveCreatorChecker()
    if len(crawl_urls) < 1:
        lg.info('No creators have newly gone live')
    else:
        lg.info('Starting to crawl the target creators')
        pool = Pool(processes=5)
        pool.map(run_crawl_url, crawl_urls)
        pool.close()
        pool.join()
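# A minimal polling driver for run_crawl (assumed entry point; the 60 s
# interval is an assumption, the source only defines run_crawl itself):
import time

if __name__ == '__main__':
    while True:
        run_crawl()
        time.sleep(60)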
def process_item(self, item, spider):
    if isinstance(item, BaiduItem):
        article_id = item['_id']
        baidu = Baidu.get_by({'article_id': article_id})
        # arrow API fix: the original `arrow.utcmow.datetime()` was a typo.
        now = arrow.utcnow().datetime
        if baidu:
            item['updated_at'] = now
            result = baidu.update(item)
            if result:
                logger.info('Updated baidu successfully.')
            else:
                logger.info('Failed to update baidu.')
        else:
            item['created_at'] = now
            # `baidu` is None on this branch, so create via the model class.
            result = Baidu.create(item)
            if result:
                logger.info('Inserted baidu successfully.')
            else:
                logger.info('Failed to insert baidu.')
    return item
def process_item(self, item, spider):
    if isinstance(item, PttItem):
        url = item['url']
        ptt = Ptt.get_by({'url': url})
        # arrow API fix: the original `arrow.utcmow.datetime()` was a typo.
        now = arrow.utcnow().datetime
        if ptt:
            item['updated_at'] = now
            result = ptt.update(item)
            if result:
                logger.info('Updated ptt successfully.')
            else:
                logger.info('Failed to update ptt.')
        else:
            item['created_at'] = now
            # `ptt` is None on this branch, so create via the model class.
            result = Ptt.create(item)
            if result:
                logger.info('Inserted ptt successfully.')
            else:
                logger.info('Failed to insert ptt.')
    return item
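# The two process_item pipelines above duplicate one upsert flow. A minimal
# sketch of a shared helper, assuming both models expose the classmethod
# get_by/create and instance update interface used above; the helper name
# `upsert` is hypothetical, not from the source.
import logging

import arrow

logger = logging.getLogger(__name__)

def upsert(model_cls, lookup, item):
    record = model_cls.get_by(lookup)
    now = arrow.utcnow().datetime
    if record:
        item['updated_at'] = now
        ok, action = record.update(item), 'update'
    else:
        item['created_at'] = now
        ok, action = model_cls.create(item), 'insert'
    logger.info('%s %s %s.' % ('Done' if ok else 'Failed to', action,
                               model_cls.__name__.lower()))
    return item

# Usage: each pipeline's process_item then reduces to
#     if isinstance(item, PttItem):
#         return upsert(Ptt, {'url': item['url']}, item)
#     return item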
def scanpage(self, param):
    import sys
    url, ftype = param
    try:
        # Python 2 only: force the default string encoding to UTF-8.
        reload(sys)
        sys.setdefaultencoding('utf8')
    except Exception:
        pass
    websiteurl = url
    t = time.time()
    n = 0
    pageurls = []
    Upageurls = {}
    res = []
    langages = LangagesofFamily()
    try:
        sitesize = PathSize().GetPathSize(self.langurl)  # size in MB
        if float(sitesize) >= float(self.ssize):
            logger.error('Folder %s size: %s, required minimum: %s'
                         % (self.langurl, sitesize, self.ssize))
            try:
                requests.adapters.DEFAULT_RETRIES = 10
                requests.get(
                    'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s'
                    % (get_mac_address(), self.langurl, sitesize),
                    timeout=5)
            except Exception:
                pass
            return res
        requests.adapters.DEFAULT_RETRIES = 10
        html = requests.get(websiteurl,
                            headers={'Referer': websiteurl},
                            timeout=20).text
    except Exception as err:
        logger.error(websiteurl)
        logger.error(err)
        return res

    soup = BeautifulSoup(html, 'html.parser')
    pageurls = soup.find_all("a", href=True)
    for links in pageurls:
        linkshref = links.get("href").strip()
        if linkshref and linkshref not in Upageurls:
            if '://' not in linkshref:
                # Note: the original compared linkshref[:1] against '//',
                # which can never match; [:2] detects protocol-relative URLs.
                if '//' == linkshref[:2]:
                    pass
                elif '/' == linkshref[0]:
                    # Root-relative link: prepend scheme and host.
                    proto, rest = urllib.splittype(websiteurl)
                    rest1, res2 = urllib.splithost(rest)
                    linksres = 'http://' + rest1 + linkshref if rest1 else linkshref
                    Upageurls[linksres] = 0
                elif ftype in linkshref.split('/')[0]:
                    linksres = 'http://' + linkshref
                    Upageurls[linksres] = 0
            elif ftype in linkshref:
                Upageurls[linkshref] = 0

    self.allsiteU = list(set(Upageurls.keys()))
    for links in self.allsiteU:
        try:
            txtfile = ''
            sitesize = PathSize().GetPathSize(self.langurl)  # size in MB
            if float(sitesize) >= float(self.ssize):
                logger.error('Folder %s size: %s, required minimum: %s'
                             % (self.langurl, sitesize, self.ssize))
                try:
                    requests.adapters.DEFAULT_RETRIES = 10
                    requests.get(
                        'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s'
                        % (get_mac_address(), self.langurl, sitesize),
                        timeout=5)
                except Exception:
                    pass
                break
            response = None
            try:
                req = urllib2.Request(links, headers={'Referer': links})
                req.add_header(
                    'User-Agent',
                    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
                response = urllib2.urlopen(req, timeout=20)
                Upageurls[links] = 200
                res.append(links)
                # Name the text file after the md5 of the url.
                m = hashlib.md5()
                try:
                    m.update(links)
                except Exception:
                    m.update(links.encode('utf-8'))
                txtfile = response.read()
            except urllib2.URLError:
                # urllib2 failed; retry once with requests.
                linksobj = requests.get(links, headers={'Referer': links})
                linkcode = linksobj.status_code
                m = hashlib.md5()
                try:
                    m.update(links)
                except Exception:
                    m.update(links.encode('utf-8'))
                if 200 == linkcode:
                    Upageurls[links] = 200
                    res.append(links)
                    txtfile = linksobj.text
            finally:
                if isinstance(txtfile, bytes):
                    txtfile = txtfile.decode(
                        chardet.detect(txtfile).get('encoding'), "ignore")
                txtfile = content.main(txtfile)
                tmpstr = txtfile.replace('\n', '')
                txtfile = txtfile.encode('utf-8', "ignore")
                if response:
                    response.close()
            if tmpstr:
                lanres = langages.translate(
                    txtfile, self.tpath + m.hexdigest() + ".txt",
                    self.langage, self.ssize)
                if not lanres:
                    logger.error('Language %s does not match: %s'
                                 % (self.langage[1], links))
                else:
                    with open(self.xpath + ftype + '.log', 'a') as fp:
                        fp.write('%s file name: %s.txt file path: %s\n'
                                 % (time.ctime(), m.hexdigest(), links))
            else:
                logger.warning('Page is empty after cleaning: %s' % links)
        except Exception as err:
            logger.error('Failed to fetch %s: %s' % (str(links), str(err)))
        n += 1
    logger.info("total is " + repr(n) + " links")
    logger.info(str(time.time() - t))
    return res
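# For reference, the href-normalization branch in scanpage hand-rolls what
# urljoin already provides (standalone illustration, not project code):
try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

base = 'http://example.com/a/b.html'
print(urljoin(base, '/c'))                    # http://example.com/c
print(urljoin(base, '//cdn.example.com/x'))   # http://cdn.example.com/x
print(urljoin(base, 'd.html'))                # http://example.com/a/d.html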