Example #1
    def parse(self, response):
        # Slow-changing (mostly static) stream metadata
        creatorName = response.xpath(
            '//*[@id="player_area"]/div[2]/div[2]/div[1]/text()').get()
        startAt = response.xpath(
            '//*[@id="player_area"]/div[2]/div[2]/ul/li[1]/span/text()').get()
        resolution = response.xpath(
            '//*[@id="player_area"]/div[2]/div[2]/ul/li[2]/span/text()').get()
        videoQuality = response.xpath(
            '//*[@id="player_area"]/div[2]/div[2]/ul/li[3]/span/text()').get()
        endAt = dt.now().strftime('%Y-%m-%d %H:%M:%S')
        afreecaCreator = Afreecacreators()
        afreecaCreator.updateContent(self.creatorId, creatorName, startAt,
                                     resolution, videoQuality, endAt)

        lg.info(f'Saving chat data for {creatorName}.')

        # chatAllData: the chat records collected elsewhere by the spider (not shown in this snippet)
        for chatData in chatAllData:
            # Create a fresh item per record so each yielded item carries its own data
            item = AfreecatvChat()
            item['text'] = chatData['text']
            item['is_mobile'] = chatData['is_mobile']
            item['sex'] = chatData['sex']
            item['grade'] = chatData['grade']
            item['chattime'] = chatData['chattime']
            item['userId'] = chatData['userId']
            item['viewer'] = chatData['viewer']
            item['category'] = chatData['category']
            item['videoTitle'] = chatData['videoTitle']
            item['like'] = chatData['like']
            item['bookmark'] = chatData['bookmark']
            item['creatorId'] = chatData['creatorId']
            yield item
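
The loop above copies each chat record into an AfreecatvChat item field by field. A minimal sketch of what that item definition might look like, assuming a standard Scrapy Item class (the project's actual definition may differ):

import scrapy

class AfreecatvChat(scrapy.Item):
    # Field names mirror the keys populated in parse() above.
    text = scrapy.Field()
    is_mobile = scrapy.Field()
    sex = scrapy.Field()
    grade = scrapy.Field()
    chattime = scrapy.Field()
    userId = scrapy.Field()
    viewer = scrapy.Field()
    category = scrapy.Field()
    videoTitle = scrapy.Field()
    like = scrapy.Field()
    bookmark = scrapy.Field()
    creatorId = scrapy.Field()
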
Example #2
    def process_response(self, response):
        if response.status_code not in (404, 500):
            repos = response.json()
            if len(repos):
                for repo in repos:
                    self.insert_to_db(urljoin(self.GITHUB_BASE, repo[self.REPO_NAME]))

                self.save_position(repos[-1]['id'])
            else:
                # Stop running on empty response
                self.__run = False

        logger.info('\033[1;36mCrawling\033[0m repositories discovered since {} ..'.format(self.position['discovery_since']))
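
This handler appears to walk GitHub's public-repository listing, which is cursor-paginated by repository id via a since parameter. A minimal sketch of fetching one page with the requests library; fetch_page and the endpoint usage here are illustrative, not the project's own code:

import requests
from urllib.parse import urljoin

GITHUB_BASE = 'https://github.com/'

def fetch_page(since):
    # GET /repositories is cursor-paginated: pass the last repository id seen so far.
    resp = requests.get('https://api.github.com/repositories',
                        params={'since': since}, timeout=10)
    resp.raise_for_status()
    return resp.json()

repos = fetch_page(0)
if repos:
    urls = [urljoin(GITHUB_BASE, repo['full_name']) for repo in repos]
    next_since = repos[-1]['id']  # resume point for the next request
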
Example #3
    def allChildUrl(self, argsList, ftype, allurldir):
        urlList = []
        self.DEEP -= 1
        logger.info(self.DEEP)
        sitesize = PathSize().GetPathSize(self.langurl)  # M
        if float(sitesize) >= float(self.ssize):
            logger.error('Folder %s size: %s, required minimum %s' %
                         (self.langurl, sitesize, self.ssize))
            try:
                requests.adapters.DEFAULT_RETRIES = 10
                requests.get(
                    'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s' %
                    (get_mac_address(), self.langurl, sitesize),
                    timeout=5)
            except:
                pass

        # Use a coroutine (greenlet) pool to raise IO concurrency
        pool = gevent.pool.Pool(500)
        argsList = pool.imap(self.scanpage,
                             [(args, ftype) for args in argsList])

        urlList = []
        for args in argsList:
            urlList += args

        if ftype:
            with open(allurldir + ftype + '.txt', 'w') as fp:
                for i in urlList:
                    fp.write(i + '\n')
        if self.DEEP <= 0:
            urlList = []
        return urlList
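
The fan-out above relies on gevent's coroutine pool: Pool.imap applies scanpage to each argument tuple with bounded concurrency and yields each call's return value in input order, so the per-page URL lists can simply be concatenated. A minimal standalone sketch of that pattern (fetch here is a stand-in for self.scanpage):

import gevent.pool

def fetch(arg):
    url, ftype = arg
    # Stand-in for scanpage(): return the child links found on one page.
    return [url + '/a', url + '/b']

pool = gevent.pool.Pool(10)   # cap the number of concurrent greenlets
batches = pool.imap(fetch, [(u, 'example.com') for u in ('http://x', 'http://y')])

urls = []
for batch in batches:         # results arrive in the order the arguments were given
    urls += batch
print(urls)
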
Example #4
def main():
    global mainUrl

    allfile = glob.glob(confpath + '*.conf')
    ssize = input("* Enter the download quota per language (default 1, unit: MB) >>>")
    deep = input("* Enter the maximum crawl depth per site (default 3) >>>")
    threadnum = input("* Enter how many sites per language are crawled at the same time (thread count), default 3 (change with care; press Enter to keep it) >")
    if not deep: deep = 3
    if not threadnum: threadnum = 3
    if not ssize: ssize = 1
    # Expiry check
    try:
        nowdate = get_webservertime('www.baidu.com')
        req = requests.get('http://xn--cnq423f4sm.com:443/country24/' +
                           get_mac_address(),
                           timeout=5)
        if req and (not nowdate or int(nowdate[0]) >= eval(req.text)):
            return logger.error('The validity period has expired!')
    except:
        pass

    langages = LangagesofFamily()
    # Full set of site URLs
    if not os.path.exists(urldir):
        os.makedirs(urldir)
    # Create the html file directory
    if not os.path.exists(xfile):
        os.makedirs(xfile)

    for i in langages.lanclass:
        if os.path.exists(confpath + i[1] + '.conf'):
            mainUrl = 'output/' + i[1] + '/'
            if not os.path.exists(mainUrl):
                os.makedirs(mainUrl)
            logger.info('Generated files will be written to this directory: %s' % mainUrl)
            lik = i[0]

            with codecs.open(confpath + i[1] + '.conf', 'r', "utf-8") as fp:
                outf = fp.readlines()
            allOneLangageSite(outf, i, mainUrl, langages, deep, threadnum,
                              ssize)
Example #5
def run_crawl():
    """크롤링 실행"""
    lg.info('크롤러 프로그램 실행')
    crawl_urls = liveCreatorChecker()

    if len(crawl_urls) < 1:
        lg.info('No creators have newly started broadcasting')
    else:
        lg.info('Starting the crawl for the target creators')
        pool = Pool(processes=5)
        pool.map(run_crawl_url, crawl_urls)
        pool.close()
        pool.join()
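
run_crawl fans the live URLs out over what appears to be a multiprocessing Pool, so when it is used as a script entry point it should sit behind the usual main-module guard (required on platforms that spawn worker processes, e.g. Windows and macOS). A minimal sketch, assuming the function above is importable as-is:

if __name__ == '__main__':
    # Prevent spawned child processes from re-importing the module and re-running the crawl.
    run_crawl()
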
Example #6
    def process_item(self, item, spider):
        if isinstance(item, BaiduItem):
            article_id = item['_id']
            baidu = Baidu.get_by({'article_id': article_id})
            now = arrow.utcnow().datetime  # arrow: .datetime is a property, not a method
            if baidu:
                item['updated_at'] = now
                result = baidu.update(item)
                if result:
                    logger.info('Updated baidu successfully.')
                else:
                    logger.info('Failed to update baidu.')
            else:
                item['created_at'] = now
                result = Baidu.create(item)  # no existing record, so create via the model class
                if result:
                    logger.info('Inserted baidu successfully.')
                else:
                    logger.info('Failed to insert baidu.')
        return item
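
For process_item to be called at all, the pipeline class has to be enabled in the Scrapy project's settings (the process_item(self, item, spider) signature suggests this is a Scrapy item pipeline). A minimal sketch; the dotted module path and priority below are placeholders, not the project's actual configuration:

# settings.py (sketch)
ITEM_PIPELINES = {
    'myproject.pipelines.BaiduPipeline': 300,   # hypothetical path to the pipeline class above
}
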
Example #7
    def process_item(self, item, spider):
        if isinstance(item, PttItem):
            url = item['url']
            ptt = Ptt.get_by({'url': url})
            now = arrow.utcnow().datetime  # arrow: .datetime is a property, not a method
            if ptt:
                item['updated_at'] = now
                result = ptt.update(item)
                if result:
                    logger.info('Updated ptt successfully.')
                else:
                    logger.info('Failed to update ptt.')
            else:
                item['created_at'] = now
                result = Ptt.create(item)  # no existing record, so create via the model class
                if result:
                    logger.info('Inserted ptt successfully.')
                else:
                    logger.info('Failed to insert ptt.')
        return item
Example #8
 def scanpage(self, param):
     import sys
     url, ftype = param
     try:
         # Python 2 only: reset the default string encoding (fails harmlessly on Python 3)
         reload(sys)
         sys.setdefaultencoding('utf8')
     except Exception:
         pass
     websiteurl = url
     t = time.time()
     n = 0
     pageurls = []
     Upageurls = {}
     res = []
     langages = LangagesofFamily()
     try:
         sitesize = PathSize().GetPathSize(self.langurl)  # M
         if float(sitesize) >= float(self.ssize):
             logger.error('Folder %s size: %s, required minimum %s' %
                          (self.langurl, sitesize, self.ssize))
             try:
                 requests.adapters.DEFAULT_RETRIES = 10
                 requests.get(
                     'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s' %
                     (get_mac_address(), self.langurl, sitesize),
                     timeout=5)
             except:
                 pass
             return res
         requests.adapters.DEFAULT_RETRIES = 10
         html = requests.get(websiteurl,
                             headers={
                                 'Referer': websiteurl
                             },
                             timeout=20).text
     except Exception as err:
         logger.error(websiteurl)
         logger.error(err)
         return res
     soup = BeautifulSoup(html, 'html.parser')
     pageurls = soup.find_all("a", href=True)
     for links in pageurls:
         linkshref = links.get("href").strip()
         # if websiteurl in links.get("href") and links.get("href") not in Upageurls and links.get("href") not in websiteurls:
         if linkshref and linkshref not in Upageurls:
             if '://' not in linkshref:
                 if '//' == linkshref[:2]:
                     pass
                 elif '/' == linkshref[0]:
                     proto, rest = urllib.splittype(websiteurl)
                     rest1, res2 = urllib.splithost(rest)
                     linksres = 'http://' + rest1 + linkshref if rest1 else linkshref
                     Upageurls[linksres] = 0
                 elif ftype in linkshref.split('/')[0]:
                     linksres = 'http://' + linkshref
                     Upageurls[linksres] = 0
             elif ftype in linkshref:
                 Upageurls[linkshref] = 0
     self.allsiteU = list(set(Upageurls.keys()))
     for links in self.allsiteU:
         try:
             txtfile = ''
             # if 'Kazakh' == self.langage[1]:
             #     logger.error('Folder: %s, language %s ids: %s' % (self.langurl, self.langage[1], ','.join(self.langage[0])))
             sitesize = PathSize().GetPathSize(self.langurl)  # M
             if float(sitesize) >= float(self.ssize):
                 logger.error('Folder %s size: %s, required minimum %s' %
                              (self.langurl, sitesize, self.ssize))
                 try:
                     requests.adapters.DEFAULT_RETRIES = 10
                     requests.get(
                         'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s'
                         % (get_mac_address(), self.langurl, sitesize),
                         timeout=5)
                 except:
                     pass
                 break
             # linksobj = requests.get(links,headers={'Referer': links})
             # linkcode = linksobj.status_code
             # linkcode = linksobj.code
             response = None
             try:
                 req = urllib2.Request(links, headers={'Referer': links})
                 req.add_header(
                     'User-Agent',
                     'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
                 )
                 response = urllib2.urlopen(req, timeout=20)
                 # t2=time.time()
                 Upageurls[links] = 200
                 #if 200 == linkcode:
                 res.append(links)
                 # Create the text file
                 m = hashlib.md5()
                 try:
                     m.update(links)
                 except Exception:
                     m.update(links.encode('utf-8'))
                 # txtfile = content.main(linksobj.text)
                 txtfile = response.read()
             except urllib2.URLError as e:
                 #if hasattr(e, 'code'):
                 #    logger.error("连接失败:返回编码%s" % e.code)
                 #elif hasattr(e, 'reason'):
                 #    logger.error("连接失败:原因 %s" % e.reason)
                 #logger.error("网址%s" % links)
                 linksobj = requests.get(links, headers={'Referer': links})
                 #if platform.python_version()[0] == '3':
                 #    linksobj = linksobj.encode(chardet.detect(linksobj).get('encoding'))
                 linkcode = linksobj.status_code
                 # Create the text file
                 m = hashlib.md5()
                 try:
                     m.update(links)
                 except Exception:
                     m.update(links.encode('utf-8'))
                 if 200 == linkcode:
                     Upageurls[links] = 200
                     res.append(links)
                     txtfile = linksobj.text
             finally:
                 if isinstance(txtfile, bytes):
                     txtfile = txtfile.decode(
                         chardet.detect(txtfile).get('encoding'), "ignore")
                 txtfile = content.main(txtfile)
                 tmpstr = txtfile.replace('\n', '')
                 txtfile = txtfile.encode('utf-8', "ignore")
                 if response:
                     response.close()
                 if tmpstr:
                     lanres = langages.translate(
                         txtfile, self.tpath + m.hexdigest() + ".txt",
                         self.langage, self.ssize)
                     if not lanres:
                         logger.error('Language type mismatch for %s: %s' %
                                      (self.langage[1], links))
                     else:
                         with open(self.xpath + ftype + '.log', 'a') as fp:
                             fp.write('%s file name: %s.txt file path: %s\n' %
                                      (time.ctime(), m.hexdigest(), links))
                 else:
                     logger.warning("url网页清洗后为空:%s" % links)
             # t1=time.time()
             # print t1-t2
         except Exception as err:
             logger.error("网址%s连接失败原因: %s" % (str(links), str(err)))
         n += 1
     logger.info("total is " + repr(n) + " links")
     logger.info(str(time.time() - t))
     return res
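
The relative-link handling in the middle of scanpage (splittype/splithost plus manual 'http://' prefixes) can be expressed more compactly with urllib.parse.urljoin, which resolves protocol-relative, root-relative, and plain relative hrefs against the page URL. A small Python 3 sketch shown only for comparison; resolve_link is a hypothetical helper that approximates the domain filter above, not the project's own code:

from urllib.parse import urljoin, urlparse

def resolve_link(page_url, href, ftype):
    # Resolve '//host/x', '/x', and relative hrefs against the page URL,
    # then keep only links whose host mentions ftype.
    absolute = urljoin(page_url, href.strip())
    return absolute if ftype in urlparse(absolute).netloc else None

print(resolve_link('http://example.com/docs/index.html', '/about', 'example.com'))
# -> http://example.com/about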