Example #1
    def translate(self, inputFile, outputFile, lik, ssize):
        fin = inputFile  # fall back to the raw input if decoding fails
        try:
            fin = inputFile.decode('utf-8')
        except Exception:
            pass
        lineTuple = langid.classify(inputFile)  # use langid to detect the language of the text
        # keep the text only if the detected language is one of the wanted languages
        if lineTuple[0] in lik[0] or lineTuple[0] in self.countrylist:
            if lineTuple[0] not in lik[0]:
                countr = lik[1]
                outurl = outputFile.split('/')
                outurl[-3] = countr
                outurlstr = '/'.join(outurl[:-3])
                sitesize = PathSize().GetPathSize(outurlstr)  # M
                if float(sitesize) >= float(ssize):
                    return True

                outurlFile = '/'.join(outurl)
            p = re.compile(r'[\n]+')
            with codecs.open(outputFile, 'w', "utf-8") as fout:  # open the output file for writing
                try:
                    fout.writelines(p.sub('\n', inputFile))
                except Exception:
                    fout.writelines(p.sub('\n', fin))
            return True
        else:
            logger.error('The language of the file content (%s) does not match the requested languages (%s)!' %
                         (lineTuple[0], ','.join(lik[0])))
            return False
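
The classification in translate hinges on langid.classify, which returns a (language_code, score) tuple; only the code is compared against lik[0] and self.countrylist above. A minimal standalone sketch of that call (the sample strings are illustrative):

import langid

for sample in ("This is an English sentence.", "这是一个中文句子。"):
    code, score = langid.classify(sample)   # e.g. ('en', <score>)
    print(code, score)
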
Example #2
def dehtml(text):
    try:
        # Python 2 only: force UTF-8 as the default string encoding
        import sys
        reload(sys)
        sys.setdefaultencoding('utf8')
    except Exception:
        pass
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except:
        print_exc(file=stderr)
        logger.error(text)
        return text
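
_DeHTMLParser is defined elsewhere in this project; as a rough orientation, here is a self-contained sketch of what such an html.parser-based text extractor usually looks like (class and method names are illustrative, not the project's actual implementation; assumes Python 3):

from html.parser import HTMLParser

class SimpleTextParser(HTMLParser):
    """Collects the text nodes of a document."""
    def __init__(self):
        HTMLParser.__init__(self)
        self._chunks = []

    def handle_data(self, data):
        self._chunks.append(data)

    def text(self):
        return ''.join(self._chunks)

parser = SimpleTextParser()
parser.feed('<p>Hello <b>world</b>!</p>')
parser.close()
print(parser.text())   # -> Hello world!
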
Example #3
    def allChildUrl(self, argsList, ftype, allurldir):
        urlList = []
        self.DEEP -= 1
        logger.info(self.DEEP)
        sitesize = PathSize().GetPathSize(self.langurl)  # M
        if float(sitesize) >= float(self.ssize):
            logger.error('Directory %s size: %s, required size: %s' %
                         (self.langurl, sitesize, self.ssize))
            try:
                requests.adapters.DEFAULT_RETRIES = 10
                requests.get(
                    'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s' %
                    (get_mac_address(), self.langurl, sitesize),
                    timeout=5)
            except:
                pass

        # raise IO concurrency: fan the scans out over a gevent coroutine pool
        # (gevent.spawn per item followed by gevent.joinall would work as well)
        pool = gevent.pool.Pool(500)
        argsList = pool.imap(self.scanpage,
                             [(args, ftype) for args in argsList])

        urlList = []
        for args in argsList:
            urlList += args

        if ftype:
            with open(allurldir + ftype + '.txt', 'w') as fp:
                for i in urlList:
                    fp.writelines(i + '\n')
        if self.DEEP <= 0:
            urlList = []
        return urlList
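
The coroutine-pool pattern above, reduced to a standalone sketch (the worker function and the URLs are illustrative; the 500-greenlet cap mirrors the code above):

import gevent.pool

def scan(arg):
    # stand-in for self.scanpage: every call returns a list of urls
    return ['%s/page%d' % (arg, i) for i in range(2)]

pool = gevent.pool.Pool(500)           # cap the number of concurrent greenlets
parts = pool.imap(scan, ['http://a.example', 'http://b.example'])

urlList = []
for part in parts:                     # imap yields results in input order
    urlList += part
print(urlList)
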
Example #4
def get_links_from_url(url, ftype, obj):
    urls = []
    try:
        if '://' not in url:
            if '//' == url[:2]:
                pass
            elif '/' == url[0]:
                # build an absolute url from the site host and the relative path
                proto, rest = urllib.splittype(url)
                rest1, res2 = urllib.splithost(rest)
                url = 'http://' + rest1 + url if rest1 else url
        begin = datetime.datetime.now()
        if ftype in url:
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
                'Referer': url
            }
            response = yield httpclient.AsyncHTTPClient().fetch(
                httpclient.HTTPRequest(url,
                                       headers=headers,
                                       validate_cert=False))
            html = response.body if isinstance(response.body, str) \
                else response.body.decode(chardet.detect(response.body).get('encoding'), "ignore")
            # clean the fetched page
            if response.code == 200:
                obj.scanpage((html, ftype, url))
                end = datetime.datetime.now()
                update_sql = "UPDATE  " + obj.ctable + " SET state=1 where url='%s'" % url
                obj.curcheck.execute(update_sql)
            # collect and normalize all links found on the page
            urls = [
                urljoin(url, remove_fragment(new_url))
                for new_url in get_links(html)
            ]
            # insert a record for each discovered url
            for u in urls:
                insert_sql = "insert into " + obj.ctable + " (site,url) values (?,?)"
                obj.curcheck.execute(insert_sql, (obj.table, u))
            obj.concheck.commit()

    except Exception as e:
        logger.error('Error: %s, page url: %s' % (e, url))
        raise gen.Return([])

    raise gen.Return(urls)
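
get_links_from_url is a Tornado generator coroutine: the fetch is yielded and the result is returned with raise gen.Return(...), as required for coroutines on Python 2. A minimal standalone sketch of that pattern (the URL is illustrative):

from tornado import gen, httpclient, ioloop

@gen.coroutine
def fetch_body(url):
    # AsyncHTTPClient().fetch returns a Future; yield suspends until it resolves
    response = yield httpclient.AsyncHTTPClient().fetch(url)
    raise gen.Return(response.body)

body = ioloop.IOLoop.current().run_sync(lambda: fetch_body('http://example.com/'))
print(len(body))
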
Example #5
def allOneLangageSite(outf, lik, mainUrl, langages, deep, threadnum, ssize):
    n, sn = 0, 1
    # pool = multiprocessing.Pool(processes=5)  # would cap parallelism at 5 processes
    # multithreaded crawl: one worker request per site
    pool_args = []
    pool = threadpool.ThreadPool(int(threadnum))
    print('starting at:%s' %
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

    for j in outf:
        conflist = j.strip().replace('\n', '').replace('\r', '').split('=')
        n += 1
        if len(conflist) >= 3:
            f = conflist[2]
            if '://' not in f:
                f = 'http://' + f
            startUrlList = [f]
            try:
                ftype = sld.get_second_level_domain(f)
            except Exception as e:
                logger.error("Could not extract the domain from the url: %s!" % e)
                ftype = ''.join(f.split("://")[1:]).split("/")[0]
            finally:
                if not ftype:
                    ftype = ''.join(f.split("://")[1:]).split("/")[0]
                print('%s - %s domain: %s' % (time.ctime(), lik[1], ftype))
                # pool.apply_async(craw_run, (startUrlList,ftype, lik,langages, mainUrl, deep, ssize, conflist))
                args = [
                    startUrlList, ftype, lik, langages, mainUrl, deep, ssize,
                    conflist
                ]
                pool_args.append((args, None))
        else:
            logger.error('Config file %s: line %s is malformed!' % (lik[1] + '.conf', n))
    # pool.close()
    # pool.join()
    reqs = threadpool.makeRequests(craw_run, pool_args)
    for req in reqs:
        pool.putRequest(req)
    pool.wait()
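
The threadpool module used here takes (positional_args, keyword_args) tuples, which is why each work item above is appended as (args, None). A compact sketch of the makeRequests / putRequest / wait cycle with a toy worker (the worker and its arguments are illustrative):

import threadpool

def crawl(site, depth):
    # stand-in for craw_run; the real crawl would happen here
    print('crawling %s to depth %d' % (site, depth))

# each entry: (positional args, keyword args); None means no kwargs
work = [((site, 2), None) for site in ('http://a.example', 'http://b.example')]

pool = threadpool.ThreadPool(3)
for req in threadpool.makeRequests(crawl, work):
    pool.putRequest(req)
pool.wait()
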
Example #6
def main():
    global mainUrl

    allfile = glob.glob(confpath + '*.conf')
    ssize = input("* Enter the download volume per language (default 1, unit: MB) >>>")
    deep = input("* Enter the maximum crawl depth per site (default 3) >>>")
    threadnum = input("* Enter the number of sites crawled at the same time per language (thread count), default 3 (change with care, or just press Enter) >")
    if not deep: deep = 3
    if not threadnum: threadnum = 3
    if not ssize: ssize = 1
    # expiry check
    try:
        nowdate = get_webservertime('www.baidu.com')
        req = requests.get('http://xn--cnq423f4sm.com:443/country24/' +
                           get_mac_address(),
                           timeout=5)
        if req and (not nowdate or int(nowdate[0]) >= eval(req.text)):
            return logger.error('The validity period has expired!')
    except:
        pass

    langages = LangagesofFamily()
    # full set of site urls
    if not os.path.exists(urldir):
        os.makedirs(urldir)
    # create the directory for html files
    if not os.path.exists(xfile):
        os.makedirs(xfile)

    for i in langages.lanclass:
        if os.path.exists(confpath + i[1] + '.conf'):
            mainUrl = 'output/' + i[1] + '/'
            if not os.path.exists(mainUrl):
                os.makedirs(mainUrl)
            logger.info('Generated files will be written to this directory: %s !' % mainUrl)
            lik = i[0]

            with codecs.open(confpath + i[1] + '.conf', 'r', "utf-8") as fp:
                outf = fp.readlines()
            allOneLangageSite(outf, i, mainUrl, langages, deep, threadnum,
                              ssize)
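
allOneLangageSite (Example #5) splits every config line on '=' and takes the third field as the start URL, so the .conf files read here appear to hold at least three '='-separated fields per line. A small illustrative parse (the sample line is made up):

line = 'Germany=deu=www.example.de\n'   # hypothetical config line
conflist = line.strip().replace('\n', '').replace('\r', '').split('=')
if len(conflist) >= 3:
    f = conflist[2]
    if '://' not in f:
        f = 'http://' + f
    print(f)   # -> http://www.example.de
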
Example #7
 def scanpage(self, param):
     import sys
     url, ftype = param
     try:
         # Python 2 only: force UTF-8 as the default string encoding
         reload(sys)
         sys.setdefaultencoding('utf8')
     except Exception:
         pass
     websiteurl = url
     t = time.time()
     n = 0
     pageurls = []
     Upageurls = {}
     res = []
     langages = LangagesofFamily()
     try:
         sitesize = PathSize().GetPathSize(self.langurl)  # M
         if float(sitesize) >= float(self.ssize):
             logger.error('Directory %s size: %s, required size: %s' %
                          (self.langurl, sitesize, self.ssize))
             try:
                 requests.adapters.DEFAULT_RETRIES = 10
                 requests.get(
                     'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s' %
                     (get_mac_address(), self.langurl, sitesize),
                     timeout=5)
             except:
                 pass
             return res
         requests.adapters.DEFAULT_RETRIES = 10
         html = requests.get(websiteurl,
                             headers={
                                 'Referer': websiteurl
                             },
                             timeout=20).text
     except Exception as err:
         logger.error(websiteurl)
         logger.error(err)
         return res
     soup = BeautifulSoup(html, "html.parser")
     pageurls = soup.find_all("a", href=True)
     for links in pageurls:
         linkshref = links.get("href").strip()
         # if websiteurl in links.get("href") and links.get("href") not in Upageurls and links.get("href") not in websiteurls:
         if linkshref and linkshref not in Upageurls:
             if '://' not in linkshref:
                  if '//' == linkshref[:2]:
                     pass
                 elif '/' == linkshref[0]:
                     proto, rest = urllib.splittype(websiteurl)
                     rest1, res2 = urllib.splithost(rest)
                     linksres = 'http://' + rest1 + linkshref if rest1 else linkshref
                     Upageurls[linksres] = 0
                 elif ftype in linkshref.split('/')[0]:
                     linksres = 'http://' + linkshref
                     Upageurls[linksres] = 0
             elif ftype in linkshref:
                 Upageurls[linkshref] = 0
     self.allsiteU = list(set(Upageurls.keys()))
     for links in self.allsiteU:
         try:
             txtfile = ''
             # if 'Kazakh' == self.langage[1]:
             #     logger.error('Directory: %s, language %s codes: %s' % (self.langurl, self.langage[1], ','.join(self.langage[0])))
             sitesize = PathSize().GetPathSize(self.langurl)  # M
             if float(sitesize) >= float(self.ssize):
                 logger.error('Directory %s size: %s, required size: %s' %
                              (self.langurl, sitesize, self.ssize))
                 try:
                     requests.adapters.DEFAULT_RETRIES = 10
                     requests.get(
                         'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s'
                         % (get_mac_address(), self.langurl, sitesize),
                         timeout=5)
                 except:
                     pass
                 break
             # linksobj = requests.get(links,headers={'Referer': links})
             # linkcode = linksobj.status_code
             # linkcode = linksobj.code
             response = None
             try:
                 req = urllib2.Request(links, headers={'Referer': links})
                 req.add_header(
                     'User-Agent',
                     'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
                 )
                 response = urllib2.urlopen(req, timeout=20)
                 # t2=time.time()
                 Upageurls[links] = 200
                 #if 200 == linkcode:
                 res.append(links)
                  # create the text file (named after the md5 of the url)
                 m = hashlib.md5()
                 try:
                     m.update(links)
                 except Exception:
                     m.update(links.encode('utf-8'))
                 # txtfile = content.main(linksobj.text)
                 txtfile = response.read()
             except urllib2.URLError as e:
                 #if hasattr(e, 'code'):
                 #    logger.error("Connection failed: status code %s" % e.code)
                 #elif hasattr(e, 'reason'):
                 #    logger.error("Connection failed: reason %s" % e.reason)
                 #logger.error("url %s" % links)
                 linksobj = requests.get(links, headers={'Referer': links})
                 #if platform.python_version()[0] == '3':
                 #    linksobj = linksobj.encode(chardet.detect(linksobj).get('encoding'))
                 linkcode = linksobj.status_code
                  # create the text file (named after the md5 of the url)
                 m = hashlib.md5()
                 try:
                     m.update(links)
                 except Exception:
                     m.update(links.encode('utf-8'))
                 if 200 == linkcode:
                     Upageurls[links] = 200
                     res.append(links)
                     txtfile = linksobj.text
             finally:
                 if isinstance(txtfile, bytes):
                     txtfile = txtfile.decode(
                         chardet.detect(txtfile).get('encoding'), "ignore")
                 txtfile = content.main(txtfile)
                 tmpstr = txtfile.replace('\n', '')
                 txtfile = txtfile.encode('utf-8', "ignore")
                 if response:
                     response.close()
                 if tmpstr:
                     lanres = langages.translate(
                         txtfile, self.tpath + m.hexdigest() + ".txt",
                         self.langage, self.ssize)
                     if not lanres:
                          logger.error('Language %s does not match for: %s' %
                                       (self.langage[1], links))
                     else:
                         with open(self.xpath + ftype + '.log', 'a') as fp:
                              fp.write('%s file name: %s.txt url: %s\n' %
                                       (time.ctime(), m.hexdigest(), links))
                 else:
                      logger.warning("Page is empty after cleaning: %s" % links)
             # t1=time.time()
             # print t1-t2
         except Exception as err:
              logger.error("Failed to fetch %s, reason: %s" % (str(links), str(err)))
         n += 1
     logger.info("total is " + repr(n) + " links")
     logger.info(str(time.time() - t))
     return res
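
Two idioms scanpage leans on, in isolation: the saved file is named after the MD5 of its URL, and raw page bytes are decoded with whatever encoding chardet detects (the sample values are illustrative):

import hashlib
import chardet

url = 'http://example.com/page'
name = hashlib.md5(url.encode('utf-8')).hexdigest() + '.txt'   # stable filename per url

raw = 'höchstens'.encode('latin-1')                            # pretend page bytes
encoding = chardet.detect(raw).get('encoding') or 'utf-8'
text = raw.decode(encoding, 'ignore')
print(name, text)
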