コード例 #1
0
 def __init__(self, name, setting=None):
     """Prepare the per-name SQLite file and make sure its table exists.

     Args:
         name: Stored as ``self._name`` and substituted into the
             ``self._dbPath`` template.
         setting: Optional mapping. A truthy ``'port'`` entry replaces
             ``self._port``.

     Errors raised while talking to SQLite are reported through
     ``printInfo`` and are not re-raised.
     """
     # ``None`` replaces the former ``{}`` default so that no mutable
     # object is shared between calls.
     if setting:
         port = setting.get('port')
         if port:
             self._port = port
     if not os.path.exists('db/'):
         os.mkdir('db/')
     self._name = name
     self._dbPath = self._dbPath.format(name)
     conn = None
     try:
         conn = sqlite3.connect(self._dbPath)
         c = conn.cursor()
         c.execute('''CREATE TABLE IF NOT EXISTS {}
         (id INTEGER PRIMARY KEY AUTOINCREMENT,
         name VARCHAR(50) NOT NULL,
         method VARCHAR(10) NOT NULL,
         url TEXT NOT NULL,
         tmSpan REAL NOT NULL,
         concurrency INT NOT NULL DEFAULT 0,
         countPer10s INT NOT NULL DEFAULT 0,
         state INT NOT NULL DEFAULT 0,
         size INT NOT NULL DEFAULT 0,
         tm CHAR(20));'''.format(self._tbName))
         conn.commit()
     except Exception as err:
         printInfo('error', err)
     finally:
         # Close on every exit path, including non-Exception interrupts.
         if conn is not None:
             conn.close()
コード例 #2
0
 def downloadThread2(self, *args):
     """Download one url, hand the result to the spider and record timing.

     Positional arguments:
         args[0]: url record (mapping that contains at least ``'url'``);
             ``'_startTm'`` and ``'_endTm'`` are written into it.
         args[1]: spider object providing ``saveHtml`` and
             ``downloadError``.

     Any exception raised during processing is passed to ``self.log``
     with ``logging.ERROR``.
     """
     url, ssp = args[0], args[1]
     try:
         self._concurrency -= 1
         printInfo(url['url'])
         self._downloadPageNum += 1
         began = time.time()
         url['_startTm'] = began
         html = execDownload(url, ssp)
         url['_endTm'] = time.time()
         elapsed = url['_endTm'] - began
         # execDownload signals failure with the literal "_error_".
         if html == "_error_":
             state = 0
             ssp.downloadError(url)
         else:
             state = 1
             data = ssp.saveHtml(url, html)
             # Only dict/list results are queued for extraction.
             if data and isinstance(data, (dict, list)):
                 self._extractQueue.put((ssp, data))
         if self.statistics:
             # Two alternating 10-second slots; report the other slot.
             slot = int(time.time() / 10) % 2
             self.statistics.addRecode(
                 ssp,
                 url,
                 elapsed,
                 state,
                 self._concurrency,
                 self._downloadPagePer10s[1 - slot],
                 len(html) if html and state else 0,
             )
     except Exception as err:
         self.log(err, logging.ERROR)
コード例 #3
0
 def __init__(self, name):
   """Open the three backing files and restore the in-memory state.

   ``name`` is substituted into the url/dic/index filename templates.
   The index file supplies ``self._i``, every line of the dic file is
   added to ``self._dic``, and url-file lines from position ``self._i``
   onward are JSON-decoded into ``self._urls``. Any exception is printed
   through ``printInfo`` and not re-raised.
   """
   try:
     self._urlFilename = self._urlFilename.format(name)
     self._dicFilename = self._dicFilename.format(name)
     self._indexFilename = self._indexFilename.format(name)
     if not os.path.exists('db/'):
       os.mkdir('db/')
     self._urlfile = io.open(self._urlFilename, "a+", encoding="utf-8")
     self._dicfile = io.open(self._dicFilename, "a+", encoding="utf-8")
     self._indexfile = io.open(self._indexFilename, "a+", encoding="utf-8")
     # Rewind each handle before reading it from the top.
     for handle in (self._urlfile, self._dicfile, self._indexfile):
       handle.seek(0)
     saved = self._indexfile.read()
     if saved:
       self._i = int(saved)
     # readline() yields '' at end of file, which ends the iteration.
     for entry in iter(self._dicfile.readline, ''):
       self._dic.add(entry[:-1])
     for lineNo, entry in enumerate(iter(self._urlfile.readline, '')):
       # Skip the first self._i lines of the url file.
       if lineNo < self._i:
         continue
       self._urls.append(json.loads(entry[:-1]))
   except Exception as err:
     printInfo(err)
コード例 #4
0
 def log(self, msg, level=logging.DEBUG):
     """Print ``msg`` and, for ERROR level only, send it to the root logger.

     A ``UnicodeEncodeError`` instance is only printed (with a label) and
     never forwarded to ``logging``.
     """
     if isinstance(msg, UnicodeEncodeError):
         printInfo('UnicodeEncodeError', msg)
         return
     printInfo(msg)
     if level != logging.ERROR:
         return
     adapter = logging.LoggerAdapter(logging.getLogger(), None)
     adapter.log(level, msg)
コード例 #5
0
 def clearRecode(self):
     """Delete every row of the ``request_tm`` table.

     Errors raised by the statement or the commit are printed through
     ``printInfo`` and not re-raised.
     """
     conn = sqlite3.connect(self._dbPath)
     try:
         cursor = conn.cursor()
         cursor.execute("delete from request_tm")
         conn.commit()
     except Exception as err:
         printInfo('error', err)
     finally:
         # Close on every exit path, including non-Exception interrupts.
         conn.close()
コード例 #6
0
def listImg(html, baseUrl=None, start=None, end=None, before=None):
    """Collect the ``<img>`` tags of an HTML text, de-duplicated by url.

    Args:
        html: HTML text to scan.
        baseUrl: When given, a ``src`` that does not start with
            ``http://`` or ``https://`` is resolved through
            ``absoluteUrl(baseUrl, src)``.
        start, end, before: Forwarded unchanged to ``getSection`` to
            narrow the part of ``html`` that is scanned.

    Returns:
        A list of ``Dict`` entries with ``'url'`` and ``'alt'`` (plus
        ``'relativeUrl'`` for resolved urls). The list is empty when
        nothing is found; this function never returns ``None``.
    """
    if not html or html.find("<img") < 0:
        return []
    section = getSection(html, start, end, before)
    s = section[0]
    e = section[1]

    # Every "nothing found" path returns [] (two of them used to return
    # None), so callers can always iterate the result.
    if s < 0 or e < s:
        return []
    html = html[s:e]
    if not html or html.find("<img") < 0:
        return []

    # Backslashes are doubled: "\s" in a non-raw literal is an invalid
    # escape sequence. The compiled patterns are unchanged.
    patternLst = _getRegex(u'<img[\\s]+[^>]*>')
    patternUrl = _getRegex(u'src[\\s]*=[\\s\'"]*(?P<url>.*?)[\'"\\s>]')
    patternTitle = _getRegex(u'alt[\\s]*=[\\s\'"]*(?P<title>.*?)[\'"\\s>]')

    dic = Dict()
    for tag in patternLst.findall(html):
        url = None
        title = None
        tmp = patternUrl.search(tag)
        if tmp:
            url = tmp.group("url")

        tmp = patternTitle.search(tag)
        if tmp:
            title = tmp.group("title")
        try:
            if url:
                # NOTE(review): the url is lower-cased before use as a key.
                url = url.strip().lower()
                if (baseUrl and url[:7] != "http://"
                        and url[:8] != "https://"):
                    absUrl = absoluteUrl(baseUrl, url)
                    # An url that absoluteUrl returns unchanged is dropped.
                    if absUrl != url:
                        d = dic[absUrl]
                        if d:
                            # Keep the longest alt text seen for this url.
                            if not d.alt or (title
                                             and len(d.alt) < len(title)):
                                d['alt'] = title
                        else:
                            dic[absUrl] = Dict({
                                'url': absUrl,
                                'alt': title,
                                'relativeUrl': url
                            })
                else:
                    d = dic[url]
                    if d:
                        if not d.alt or (title and len(d.alt) < len(title)):
                            d['alt'] = title
                    else:
                        dic[url] = Dict({'url': url, 'alt': title})
        except Exception as ex:
            printInfo(ex)

    return list(dic.values())
コード例 #7
0
def listA(html,baseUrl=None,start=None,end=None,before=None):
    """Collect the ``<a>...</a>`` links of an HTML text, de-duplicated by url.

    Args:
        html: HTML text to scan.
        baseUrl: When given, an ``href`` that does not start with
            ``http://`` or ``https://`` is resolved through
            ``absoluteUrl(baseUrl, href)``.
        start, end, before: Forwarded unchanged to ``getSection`` to
            narrow the part of ``html`` that is scanned.

    Returns:
        A list of ``Dict`` entries with ``'url'`` and ``'title'`` (plus
        ``'relativeUrl'`` for resolved urls); empty when nothing is found.
    """
    if not html:
        return []
    section = getSection(html, start, end, before)
    s = section[0]
    e = section[1]
    if s < 0 or e < s:
        return []
    html = html[s:e]
    if not html or html.find("<a") < 0:
        return []

    # Backslashes are doubled: "\s" and "\S" in a non-raw literal are
    # invalid escape sequences. The compiled patterns are unchanged.
    patternLst = _getRegex(u'<a[\\s]+[^>]*>[\\s\\S]*?</a>')
    patternUrl = _getRegex(u'href[\\s]*=[\\s\'"]*(?P<url>.*?)[\'"\\s>]')
    patternTitle1 = _getRegex(u'title[\\s]*=[\\s\'"]*(?P<title>.*?)[\'"\\s>]')
    patternTitle2 = _getRegex(u'<a[\\s]+[^>]*>(?P<title>.*?)</a>')
    # Loop-invariant, so looked up once instead of once per anchor.
    patternTag = _getRegex('<[^<>]+>')

    dic = Dict()
    for anchor in patternLst.findall(html):
        url = None
        title = None
        tmp = patternUrl.search(anchor)
        if tmp:
            url = tmp.group("url")

        tmp = patternTitle1.search(anchor)
        if tmp:
            title = tmp.group("title")
        if not title:
            # No title attribute: use the anchor's inner text, tags removed.
            tmp = patternTitle2.search(anchor)
            if tmp:
                title = patternTag.sub('', tmp.group("title"))

        try:
            if url:
                # NOTE(review): the url is lower-cased before use as a key.
                url = url.strip().lower()
                if baseUrl and url[:7] != "http://" and url[:8] != "https://":
                    absUrl = absoluteUrl(baseUrl, url)
                    # An url that absoluteUrl returns unchanged is dropped.
                    if absUrl != url:
                        d = dic[absUrl]
                        if d:
                            # Keep the longest title seen for this url.
                            if not d.title or (title and len(d.title) < len(title)):
                                d['title'] = title
                        else:
                            dic[absUrl] = Dict({'url': absUrl, 'title': title, 'relativeUrl': url})
                else:
                    # Append '/' when the last '/' sits before index 9.
                    if url.rfind('/') < 9:
                        url += '/'
                    d = dic[url]
                    if d:
                        if not d.title or (title and len(d.title) < len(title)):
                            d['title'] = title
                    else:
                        dic[url] = Dict({'url': url, 'title': title})
        except Exception as ex:
            printInfo(ex)

    return list(dic.values())
コード例 #8
0
 def _insertRT(self, datas):
     """Insert a batch of rows into ``request_tm`` in one transaction.

     Args:
         datas: Iterable of 9-item rows in the column order
             name, method, url, tmSpan, concurrency, countPer10s,
             state, size, tm.

     Errors raised by the insert or the commit are printed through
     ``printInfo`` and not re-raised.
     """
     conn = sqlite3.connect(self._dbPath)
     try:
         cursor = conn.cursor()
         cursor.executemany(
             "insert into request_tm(name,method,url,tmSpan,concurrency,countPer10s,state,size,tm) values(?,?,?,?,?,?,?,?,?)",
             tuple(datas))
         conn.commit()
     except Exception as err:
         printInfo('error', err)
     finally:
         # Close on every exit path, including non-Exception interrupts.
         conn.close()
コード例 #9
0
 def _select(self, sql):
     """Run ``sql`` and return its rows together with the column names.

     Args:
         sql: Statement executed as-is, without parameters.

     Returns:
         ``(rows, header)``: ``rows`` is a list of result tuples and
         ``header`` a list of column names. On error the failure is
         printed through ``printInfo`` and whatever was collected so far
         is returned.
     """
     conn = sqlite3.connect(self._dbPath)
     rows = []
     header = []
     try:
         cursor = conn.cursor().execute(sql)
         # The first item of each description entry is the column name.
         header.extend(column[0] for column in cursor.description)
         rows.extend(cursor)
     except Exception as err:
         printInfo('error', err)
     finally:
         # Close on every exit path, including non-Exception interrupts.
         conn.close()
     return (rows, header)
コード例 #10
0
 def _dealRequestTmThread(self):
     """Drain the request-timing queue into the database once per second.

     Runs while ``self._runflag`` is truthy. Each pass takes at most 1001
     queued items and hands them to ``self._insertRT`` as one batch.
     Exceptions are printed through ``printInfo`` and the loop continues.
     """
     maxBatch = 1001
     while self._runflag:
         try:
             batch = []
             for _ in range(maxBatch):
                 if self._requestTmq.empty():
                     break
                 batch.append(self._requestTmq.get_nowait())
             if batch:
                 self._insertRT(batch)
         except Exception as err:
             printInfo('error', err)
         time.sleep(1)
コード例 #11
0
    def extract(self, url, html, ssp):
        """Resolve the model names for ``url`` and delegate to ``ssp.extract``.

        Model names come from ``url['model']`` or, when that is missing or
        empty, from ``ssp.models``. Each name is looked up with
        ``ExtractModel.get``; unknown names are reported through
        ``printInfo`` and skipped.

        Returns:
            Whatever ``ssp.extract(Dict(url), html, models, mds)`` returns.
        """
        mds = url.get("model") or ssp.models
        models = []
        for modelName in mds or ():
            model = ExtractModel.get(modelName)
            if not model:
                printInfo('no model ' + modelName)
                continue
            models.append(model)

        return ssp.extract(Dict(url), html, models, mds)


# print (ExtractModel.auto_all)
コード例 #12
0
 def _iniModels(self, rootdir):
     """Load every file directly under ``rootdir`` into ``ExtractModel``.

     Each regular file is parsed as JSON and stored under its file name
     minus the last five characters.
     NOTE(review): that slice presumably strips a ".json" suffix - confirm
     that only such files live in the directory.

     Does nothing when ``rootdir`` is not a directory. Any exception is
     printed through ``printInfo`` and stops the loading.
     """
     try:
         if not os.path.isdir(rootdir):
             return
         for name in os.listdir(rootdir):
             path = os.path.join(rootdir, name)
             if not os.path.isfile(path):
                 continue
             # ``with`` closes the file even when reading fails.
             with open(path, 'r') as f:
                 text = f.read()
             if sys.version_info.major == 2:
                 # Python 2 read() returns bytes; decode before parsing.
                 text = text.decode('utf-8')
             ExtractModel[name[:-5]] = json.loads(text)
     except Exception as err:
         printInfo(err)
コード例 #13
0
 def resetUrls(self, urls):
   """Queue every url again, whether or not it is already known.

   Each item may be a dict containing ``'url'`` or a bare url value,
   which is wrapped as ``{'url': value}``. The url is appended to
   ``self._urls``, its md5 digest is added to ``self._dic`` and both are
   passed to ``self._writeFile``. The file is flushed once if at least
   one url was processed. The work is done while holding ``self._lock``;
   exceptions are printed through ``printInfo``.
   """
   self._lock.acquire()
   try:
     written = 0
     for item in urls:
       entry = item if isinstance(item, dict) else {'url': item}
       digest = md5(entry['url'])
       self._urls.append(entry)
       self._dic.add(digest)
       self._writeFile(entry, digest)
       written += 1
     if written:
       self._flushFile()
   except Exception as err:
     printInfo(err)
   finally:
     self._lock.release()
コード例 #14
0
 def __init__(self):
     """Open the cookie file and load its ``key,value`` lines.

     Each line is split at the first comma: the text before it becomes
     the key in ``self._cookies`` and the rest the value. Lines with no
     comma, or with a comma in first position, are skipped. Afterwards
     ``self._refreshCookieFile()`` is called. Any exception is printed
     through ``printInfo`` and not re-raised.
     """
     try:
         if not os.path.exists('db/'):
             os.mkdir('db/')
         self._cookiefile = io.open(self._cookieFilename,
                                    "a+",
                                    encoding="utf-8")
         self._cookiefile.seek(0)
         # readline() yields '' at end of file, which ends the iteration.
         for line in iter(self._cookiefile.readline, ''):
             # Strip only the newline, so an unterminated last line does
             # not lose its final character.
             line = line.rstrip('\n')
             # find() returns -1 for a line without a comma. index()
             # raised ValueError there, which aborted the whole load and
             # skipped _refreshCookieFile().
             start = line.find(',')
             if start > 0:
                 self._cookies[line[0:start]] = line[start + 1:]
         self._refreshCookieFile()
     except Exception as err:
         printInfo(err)
コード例 #15
0
 def saveUrl(self, urls,i=None):
   """Queue the urls that have not been seen before.

   Args:
     urls: Iterable of url records. An item that is not a dict is
       wrapped as ``{'url': item}``.
     i: Not used by this method.

   A url counts as seen when the md5 digest of its ``'url'`` value is in
   ``self._dic``. New urls are appended to ``self._urls``, their digest
   is remembered and written with ``self._writeFile``, and the file is
   flushed once if anything was added. The work is done while holding
   ``self._lock``; exceptions are printed through ``printInfo``.
   """
   self._lock.acquire()
   try:
     flag=False
     for url in urls:
       if(not isinstance(url,dict)):
         url={'url':url}
       # Compute the digest once and store the digest itself. The md5
       # function object used to be added and written here, so no url
       # was ever recorded as seen.
       key=md5(url['url'])
       if(key not in self._dic):
         self._urls.append(url)
         self._dic.add(key)
         self._writeFile(url,key)
         flag=True
     if(flag):
       self._flushFile()
   except Exception as err:
     printInfo(err)
   finally:
     self._lock.release()
コード例 #16
0
    def popUrl(self):
        """Take one pending url (``state`` 0) from a randomly chosen table.

        Table names are ``self._tbName`` for index 0 and ``self._tbName``
        followed by the index for 1..9. Tables flagged in
        ``self._tbCache`` are skipped. The first table actually queried
        decides the result: a document found there is marked ``state`` 1
        and returned; otherwise the table is flagged in ``self._tbCache``
        and ``None`` is returned.

        Returns:
            The document found, or ``None`` when the queried table has no
            pending url or all ten indexes have been tried.
        """
        db = self._connect()
        tried = []
        while True:
            # Stop once every index 0..9 has been drawn. The old test
            # ``lst.count == 10`` compared a bound method with an int and
            # was always False, so the loop could spin forever.
            if len(tried) == 10:
                return None
            tbName = self._tbName
            i = random.randint(0, 9)
            printInfo('popUrl', i)
            if i in tried:
                continue
            tried.append(i)
            if i:
                tbName = tbName + str(i)
            if self._tbCache.get(tbName):
                continue

            url = db[tbName].find_one({"state": 0})
            if url:
                db[tbName].update({"_id": url["_id"]}, {"$set": {"state": 1}})
                if i in self._totalCount:
                    self._totalCount[i] -= 1
            else:
                # Remember that this table had no pending url.
                self._tbCache[tbName] = True
            return url
コード例 #17
0
 def __init__(self, name):
     """Create the storage directories and the ``htmls`` table for ``name``.

     ``name`` is substituted into the ``_htmlPath`` and ``_dbPath``
     templates. The directories ``db/``, ``htmls/`` and the resolved html
     path are created when missing. Any exception is printed through
     ``printInfo`` and not re-raised.
     """
     conn = None
     try:
         self._htmlPath = self._htmlPath.format(name)
         for folder in ('db/', 'htmls/', self._htmlPath):
             if not os.path.exists(folder):
                 os.mkdir(folder)
         self._dbPath = self._dbPath.format(name)
         conn = sqlite3.connect(self._dbPath)
         c = conn.cursor()
         c.execute('''CREATE TABLE IF NOT EXISTS htmls
         (id INTEGER PRIMARY KEY autoincrement,
         json  TEXT  NOT NULL,
         state INT NOT NULL DEFAULT 0,
         tm  TEXT);''')
         conn.commit()
     except Exception as err:
         printInfo(err)
     finally:
         # The connection used to stay open when execute/commit failed.
         if conn is not None:
             conn.close()
コード例 #18
0
 def __init__(self):
     """Load the model definitions from ``models/``.

     Any exception raised by ``_iniModels`` is printed through
     ``printInfo`` and not re-raised.
     """
     modelDir = 'models/'
     try:
         self._iniModels(modelDir)
     except Exception as failure:
         printInfo(failure)
コード例 #19
0
def convert2Dic(html):
    """Parse the first tag found in ``html`` into a ``Dict`` of attributes.

    The text between the first ``<`` and the first ``>`` is normalised
    and split into a tag name (stored under ``'tag'``) and attribute
    name/value pairs. If that fails, the error is printed and an XML
    parse through ``ET.XML`` / ``XmlDictConfig`` is attempted instead.

    Returns:
        A ``Dict`` (primary path), an ``XmlDictConfig`` (fallback path),
        or ``None`` when both attempts fail.
    """
    try:
        start = html.find('<')
        end = html.find('>')
        html = html[start + 1:end].strip('/').strip()
        # Collapse whitespace / &nbsp; runs into one space.
        html = re.sub('(\\s|&nbsp;)+', ' ', html, 0)
        # Turn runs of single quotes into one double quote.
        html = re.sub('(\')+', '"', html, 0)
        # Remove whitespace between '=' and an opening quote. Raw string:
        # "\s" in a non-raw literal is an invalid escape sequence.
        html = re.sub(r'(=\s*")+', '="', html, 0)
        lstC = []
        N = len(html)
        i = 0
        first = False  # True while the scan is inside a quoted value
        flag = False   # True while an unquoted value is being wrapped
        while i < N:
            if html[i] == '"':
                lstC.append(html[i])
                first = not first
            elif not first and html[i] == '=' and html[i + 1] != '"':
                # Unquoted value: insert the opening quote after '='.
                lstC.append(html[i])
                lstC.append('"')
                flag = True
            elif not first and flag and html[i] == ' ':
                # End of the unquoted value: insert the closing quote.
                flag = False
                lstC.append('"')
                lstC.append(html[i])
            else:
                lstC.append(html[i])
            i += 1
        html = ''.join(lstC)
        # After splitting on '"', the first piece holds the tag name and
        # possibly the first attribute name. Later pieces are either a
        # value for the pending name or the next attribute name.
        paras = html.split('"')
        dic = Dict()
        lastP = None
        first = True
        for para in paras:
            if (first):
                first = False
                tmp = para.split()
                dic['tag'] = tmp[0]
                if (len(tmp) > 1):
                    lastP = tmp[1].strip().strip('=').strip()
                continue
            if (lastP):
                # A repeated attribute name gets its values joined by a space.
                if (not dic[lastP]):
                    dic[lastP] = para
                else:
                    dic[lastP] += ' '
                    dic[lastP] += para
                lastP = None
            elif para:
                if (para.find('=') > 0):
                    lastP = para.strip().strip('=').strip()
                else:
                    # A piece without '=' is stored with an empty value.
                    dic[para] = ''
        return dic
    except Exception as err:
        printInfo(err)
        # NOTE(review): ``html`` may already have been replaced by the
        # normalised inner text when control reaches this point - confirm
        # the fallback is meant to parse that value.
        try:
            tag = ''
            if (html.find('</') < 0 and html.find('/>') < 0):
                # No closing tag present: build one from the tag name.
                start = html.find('<')
                end = html.find(' ', start + 1)
                tag = '</' + html[start + 1:end] + '>'
            tree = ET.XML(html + tag)
            return XmlDictConfig(tree)
        except Exception as err:
            printInfo(err)
    return None
コード例 #20
0
 def __del__(self):
   """Close the url, dic and index files when the object is collected.

   Each handle is closed independently: an attribute that is missing or
   ``None`` is skipped, and a failing ``close()`` is printed through
   ``printInfo`` without preventing the remaining handles from closing.
   """
   for attr in ('_urlfile', '_dicfile', '_indexfile'):
     handle = getattr(self, attr, None)
     if handle is None:
       continue
     try:
       handle.close()
     except Exception as err:
       printInfo(err)
   printInfo('__del__')
コード例 #21
0
def log(err, data):
    """Print ``err`` and ``data``, then log each at ERROR on the root logger."""
    printInfo(err, data)
    adapter = logging.LoggerAdapter(logging.getLogger(), None)
    for item in (err, data):
        adapter.log(logging.ERROR, item)
コード例 #22
0
 def downloadError(self, url, err=None):
     """Report a failed download and set the url's state to 2 in the store.

     Args:
         url: Url record that failed.
         err: Optional error detail, only printed.
     """
     printInfo('error url:', url, err)
     store = self.url_store
     store.updateState(url, 2)
コード例 #23
0
 def renderUrl(self, url, callback):
     """Placeholder: only prints a reminder; ``url`` and ``callback`` are unused."""
     reminder = 'Need to implement method "renderUrl"'
     printInfo(reminder)
コード例 #24
0
 def customDown(self, url):
     """Placeholder: only prints a reminder; ``url`` is unused."""
     reminder = 'Need to implement method "customDown"'
     printInfo(reminder)
コード例 #25
0
 def log(self, msg, level=logging.ERROR):
     """Print ``msg`` and log it on the root logger at ``level``."""
     printInfo(msg)
     adapter = logging.LoggerAdapter(logging.getLogger(), None)
     adapter.log(level, msg)
コード例 #26
0
 def downloadRender(self, url, ssp):
     """Count the page and ask the spider to render ``url``.

     Prints ``url['url']``, increments ``self._downloadPageNum`` and calls
     ``ssp.renderUrl`` with ``self.down_callback`` as the callback.
     """
     address = url['url']
     printInfo(address)
     self._downloadPageNum = self._downloadPageNum + 1
     ssp.renderUrl(url, self.down_callback)