def updateState(self, url, state):
    """Set the ``state`` field of the stored record that belongs to ``url``.

    ``url`` is either a URL string or a dict holding the URL under the
    ``"url"`` key. The record is addressed by ``md5`` of that URL in the
    collection named ``self._tbName``.
    """
    database = self._connect()
    address = url["url"] if isinstance(url, dict) else url
    record_id = md5(address)
    collection = database[self._tbName]
    collection.update({"_id": record_id}, {"$set": {"state": state}})
def updateState(self, url, state):
    """Set the ``state`` column of the ``urls`` row that belongs to ``url``.

    Args:
        url: URL string, or a dict holding the URL under ``"url"``.
        state: New value for the ``state`` column.

    Errors raised while updating are passed to ``self.log`` instead of
    being propagated. The connection is always closed.
    """
    conn = sqlite3.connect(self._dbPath)
    try:
        address = url["url"] if isinstance(url, dict) else url
        # Bind the values as parameters instead of formatting them into the
        # SQL text, so quoting and escaping are handled by sqlite3.
        conn.cursor().execute(
            "update urls set state=? where id=?", (state, md5(address))
        )
        conn.commit()
    except Exception as err:
        self.log(err)
    finally:
        conn.close()
def resetUrls(self, urls):
    """Put every given URL back into state 0 in ``self._tbName``.

    Each entry is a URL string or a dict holding the URL under ``"url"``.
    Dict entries are modified in place (``_id`` and ``state`` are set).
    An entry that ``checkUrl`` reports as present gets its ``state`` reset
    to 0; any other entry is inserted.
    """
    database = self._connect()
    for entry in urls:
        if isinstance(entry, dict):
            entry['_id'] = md5(entry["url"])
            entry['state'] = 0
        else:
            entry = {'url': entry, '_id': md5(entry), 'state': 0}
        if self.checkUrl(entry["url"], None):
            database[self._tbName].update(
                {"_id": entry["_id"]}, {"$set": {"state": 0}}
            )
            continue
        database[self._tbName].insert(entry)
def resetUrls(self, urls):
    """Insert or replace a ``urls`` row with state 0 for every entry.

    Args:
        urls: Iterable of URL strings or dicts holding the URL under
            ``"url"``. Strings are wrapped as ``{'url': <string>}`` before
            being serialized into the ``json`` column.

    Nothing is opened when ``urls`` is empty. Database errors are passed to
    ``self.log`` instead of being propagated, and the connection is always
    closed.
    """
    rows = []
    for url in urls:
        if not isinstance(url, dict):
            url = {'url': url}
        rows.append((md5(url["url"]), json.dumps(url), getTimeNow()))
    if not rows:
        return
    conn = sqlite3.connect(self._dbPath)
    try:
        conn.cursor().executemany(
            "REPLACE into urls(id,json,state,tm) values(?,?,0,?)", rows
        )
        conn.commit()
    except Exception as err:
        self.log(err)
    finally:
        conn.close()
def checkUrl(self, url, i):
    """Look up the stored record for ``url``.

    Args:
        url: URL string; the record is addressed by ``md5(url)``.
        i: Table index. A truthy value selects the collection named
            ``self._tbName + str(i)``; otherwise ``self._tbName`` is used.

    Returns:
        ``False`` when ``self._duplicateRemoval`` is off, otherwise whatever
        ``find_one`` returns for that id (falsy when nothing is stored).
    """
    if not self._duplicateRemoval:
        return False
    db = self._connect()
    record_id = md5(url)
    table = self._tbName + str(i) if i else self._tbName
    # NOTE(review): a second lookup against ``self._tbName`` used to run only
    # when ``i`` was falsy, i.e. it repeated this exact query, so it has been
    # dropped. If indexed lookups are meant to fall back to the base
    # collection, that has to be added explicitly -- confirm with callers.
    return db[table].find_one({"_id": record_id})
def checkUrl(self, url):
    """Return whether a row for ``url`` exists in the ``urls`` table.

    Returns ``False`` without touching the database when
    ``self._duplicateRemoval`` is off. Errors raised during the lookup are
    passed to ``self.log`` and reported as "not found". The connection is
    always closed.
    """
    if not self._duplicateRemoval:
        return False
    conn = sqlite3.connect(self._dbPath)
    found = False
    try:
        # Bind the id as a parameter instead of formatting it into the SQL.
        cursor = conn.cursor().execute(
            "select id from urls where id=? limit 0,1", (md5(url),)
        )
        found = cursor.fetchone() is not None
    except Exception as err:
        self.log(err)
    finally:
        conn.close()
    return found
def saveUrl(self, urls, i=None):
    """Store every URL that ``checkUrl`` does not report as already present.

    Args:
        urls: Iterable of URL strings or dicts holding the URL under
            ``"url"``. Dict entries are modified in place (``_id`` and
            ``state`` are set).
        i: Pass ``0`` to store everything in the collection named
            ``self._tbName``. For any other value the table index is taken
            from ``self._getIndex`` for each URL, and a truthy index selects
            ``self._tbName + str(index)``.

    For every stored record the counter ``self._totalCount[index]`` is
    incremented, and ``self._tbCache`` is set to ``False`` for each
    collection that received a record.
    """
    db = self._connect()
    # Decide once, up front: writing the per-URL index back into ``i`` would
    # let a single index of 0 switch the lookup off for all remaining URLs.
    lookup_index = i != 0
    touched = set()
    for url in urls:
        if isinstance(url, dict):
            url['_id'] = md5(url["url"])
            url['state'] = 0
        else:
            url = {'url': url, '_id': md5(url), 'state': 0}
        index = self._getIndex(url["url"]) if lookup_index else i
        if self.checkUrl(url["url"], index):
            continue
        table = self._tbName + str(index) if index else self._tbName
        db[table].insert(url)
        if index in self._totalCount:
            self._totalCount[index] += 1
        else:
            self._totalCount[index] = 1
        touched.add(table)
    # Reset the cache flag of every collection written to, not just the last.
    for table in touched:
        self._tbCache[table] = False
def _saveHtml(self, url, html):
    """Write ``html`` as UTF-8 to a file under ``self._htmlPath``.

    The file name is ``md5(url)`` plus a suffix. The suffix defaults to
    ``.htm``; when the last ``/`` of the URL sits past index 10 and a ``.``
    follows it fewer than ten characters before the end of the URL, the text
    from that ``.`` onwards is used instead.

    ``self._htmlPath`` is created when it does not exist; it is joined to the
    file name by plain concatenation, so it is presumably expected to end
    with a path separator.

    Returns:
        The name of the written file (without the directory part).
    """
    suffix = '.htm'
    slash = url.rfind('/')
    if slash > 10:
        dot = url.rfind('.', slash)
        if dot > 0 and len(url) - dot < 10:
            suffix = url[dot:]
    filename = md5(url) + suffix
    if not os.path.exists(self._htmlPath):
        os.mkdir(self._htmlPath)
    # The context manager closes the file even when write() raises.
    with io.open(self._htmlPath + filename, "w", encoding="utf-8") as out:
        out.write(html)
    return filename
def resetUrls(self, urls):
    """Append every given URL to ``self._urls`` while holding ``self._lock``.

    Each entry is a URL string or a dict holding the URL under ``"url"``;
    strings are wrapped as ``{'url': <string>}``. For each entry the hash of
    its URL is added to ``self._dic`` and the entry is handed to
    ``self._writeFile``. ``self._flushFile`` is called once if at least one
    entry was processed. Exceptions are passed to ``printInfo``.
    """
    self._lock.acquire()
    try:
        appended = False
        for entry in urls:
            record = entry if isinstance(entry, dict) else {'url': entry}
            key = md5(record['url'])
            self._urls.append(record)
            self._dic.add(key)
            self._writeFile(record, key)
            appended = True
        if appended:
            self._flushFile()
    except Exception as err:
        printInfo(err)
    finally:
        self._lock.release()
def saveUrl(self, urls, i=None):
    """Queue every URL whose hash is not yet in ``self._dic``.

    Args:
        urls: Iterable of URL strings or dicts holding the URL under
            ``"url"``; strings are wrapped as ``{'url': <string>}``.
        i: Not used by this implementation.

    New entries are appended to ``self._urls``, their hash is added to
    ``self._dic`` and they are handed to ``self._writeFile``;
    ``self._flushFile`` is called once if anything was added. The whole
    operation runs while holding ``self._lock``, and exceptions are passed to
    ``printInfo``.
    """
    self._lock.acquire()
    try:
        flag = False
        for url in urls:
            if not isinstance(url, dict):
                url = {'url': url}
            key = md5(url['url'])
            if key in self._dic:
                continue
            self._urls.append(url)
            # Store the hash, not the md5 function object, so that the
            # membership test above can actually match on the next call.
            self._dic.add(key)
            self._writeFile(url, key)
            flag = True
        if flag:
            self._flushFile()
    except Exception as err:
        printInfo(err)
    finally:
        self._lock.release()
def _saveHtml(self, url, html):
    """Write ``html`` as UTF-8 to ``self._htmlPath + md5(url) + '.htm'``.

    The directory and file name are joined by plain concatenation, so
    ``self._htmlPath`` is presumably expected to end with a path separator.
    """
    filename = md5(url) + '.htm'
    # The context manager closes the file even when write() raises.
    with io.open(self._htmlPath + filename, "w", encoding="utf-8") as out:
        out.write(html)
def checkUrl(self, url):
    """Report whether the hash of ``url`` was already in the redis set.

    Returns ``False`` straight away when ``self._duplicateRemoval`` is off.
    Otherwise the hash is passed to ``sadd`` on the set ``self._setName`` --
    so the call itself records the URL -- and the result is ``True`` when
    ``sadd`` reports that nothing was added.
    """
    if self._duplicateRemoval:
        client = redis.Redis(connection_pool=self._pool)
        added = client.sadd(self._setName, md5(url))
        return not added
    return False
def checkUrl(self, url):
    """Return ``True`` when the hash of ``url`` is contained in ``self._dic``."""
    key = md5(url)
    return key in self._dic