def getDownloadPageUrl(self, detailsResponse):
    '''
    Build a TorrentResultClass from a details page by locating its
    download-page link.

    The page is decoded as GBK and parsed with lxml. Every <a> tag whose text
    passes self.isDownloadPageUrl() fills in the result object: category,
    crawl date, details page URL, the fetched download-page response and the
    cleaned page title. When several anchors match, later ones overwrite the
    values written by earlier ones.

    :param detailsResponse: response of the details page; it is only parsed
        when it is not None and its status_code is 200
    :return: the TorrentResultClass instance (left unpopulated when the
        response is unusable or no anchor matches), or None when any
        exception was raised and logged
    '''
    # NOTE(review): nesting of the final return was reconstructed - confirm
    # it is meant to run even when the response is unusable.
    try:
        torrentResultClass = TorrentResultClass()
        if detailsResponse is not None and detailsResponse.status_code == 200:
            detailsResponse.encoding = "gbk"
            soup = bs(detailsResponse.text, "lxml")
            title = soup.head.title.text
            aTagList = soup.find_all("a")
            for a in aTagList:
                # the anchor text (not its href) is treated as the candidate URL
                downloadPageUrl = a.string
                if downloadPageUrl:
                    if self.isDownloadPageUrl(downloadPageUrl):
                        torrentResultClass.setCategory(self.getCategoryFromDatailsPageUrl(str(detailsResponse.url)))
                        torrentResultClass.setCrawData(self.formatDate())
                        torrentResultClass.setDetailsPageUrl(str(detailsResponse.url))
                        # urlToResponse takes a list of URLs; only the first result is kept
                        torrentResultClass.setResponse(self.urlToResponse([downloadPageUrl.strip()])[0])
                        torrentResultClass.setTitle(self.clearTitle(title))
                else:
                    continue
        # return self.getTorrentDownloadUrl(torrentResultClass)
        return torrentResultClass
    except BaseException:
        # any failure is logged with its traceback and reported as None
        logger.error(traceback.format_exc())
        return None
def computeDetailsImageCount(self, detailsResponse):
    '''
    Count the <input> tags of a details page that carry a "data-src" attribute.

    :param detailsResponse: response of the details page, decoded as GBK
    :return: number of matching tags; 0 when the response is None, its status
        code is not 200, or any exception was raised (all of which are logged)
    '''
    try:
        if detailsResponse is None or detailsResponse.status_code != 200:
            logger.error("detalisResponse is None or status code not 200 in the computeDetailsImageCount function")
            return 0

        detailsResponse.encoding = "gbk"
        parsedPage = bs(detailsResponse.text, "lxml")
        total = 0
        for tag in parsedPage.find_all("input"):
            try:
                # the lookup is only a presence check; the value is not needed
                tag['data-src']
            except KeyError:
                continue
            total += 1
        return total
    except BaseException:
        logger.error(traceback.format_exc())
        return 0
def getCategoryFromDatailsPageUrl(cls, detailsPageUrl):
    '''
    Translate a details page URL into its category name.

    The fifth "/"-separated piece of the URL (index 4) is looked up in
    Category.categoryNumberToNameDict.

    :param detailsPageUrl: URL of the details page
    :return: the category name, or "unknown" when the lookup fails for any
        reason (the traceback is logged)
    '''
    try:
        urlPieces = detailsPageUrl.split("/")
        categoryNumber = str(urlPieces[4])
        return Category.categoryNumberToNameDict[categoryNumber]
    except BaseException:
        logger.error(traceback.format_exc())
        return "unknown"
def getDetailsPageUrl(self, responseList):
    '''
    Collect details page URLs from list pages, fetch them and queue the responses.

    Each usable list page (not None, status 200) is decoded as GBK. For every
    table row with exactly five cells, the first link of the second cell is
    taken; links that are excluded are skipped. The remaining URLs are
    prefixed with self.domains, fetched through self.urlToResponse and the
    responses are put on self._taskQueue. self.totalDownload is set to the
    number of fetched responses.

    :param responseList: responses of the list pages; an empty value is
        logged as an error but processing still continues
    :return: None; exceptions are logged, not raised
    '''
    if not responseList:
        logger.error("get response list is empty in details page function")
    detailsPageUrlList = []
    try:
        # step 1: harvest details page URLs from every list page
        for response in responseList:
            if response is None or response.status_code != 200:
                continue
            response.encoding = "gbk"
            listPage = bs(response.text, "lxml")
            for row in listPage.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) != 5:
                    continue
                anchors = cells[1].select('a[href]')
                if not anchors:
                    continue
                url = anchors[0]["href"]
                if url in self.excludeUrlList or self.urlInExclued(url):
                    continue
                detailsPageUrlList.append(self.domains + url.strip())

        # step 2: turn the URLs into responses
        logger.info("get details page response ...")
        detailsPageResponseList = self.urlToResponse(detailsPageUrlList)
        responseCount = len(detailsPageResponseList)
        self.totalDownload = responseCount
        logger.info("present craw count:{0}".format(str(responseCount)))

        # step 3: hand the responses over to the task queue
        for detailsPageResponse in detailsPageResponseList:
            self._taskQueue.put(detailsPageResponse)
    except BaseException:
        logger.error(traceback.format_exc())
def delete(self, tableName, position=None):
    '''
    Delete rows from a table, or empty the whole table.

    Condition values are handed to the driver as query parameters instead of
    being concatenated into the statement, so quotes inside a value cannot
    change the SQL and non-string values are accepted.

    :param tableName: table name; it is interpolated into the statement
        as-is, so it must come from trusted code
    :param position: dictionary of column -> value, all pairs are combined
        with AND. None truncates the table. An empty dictionary is rejected
        because it would produce a statement without a usable condition.
    :return: True when the statement was executed and committed, False when
        it was rejected or failed (failures are logged)
    '''
    conn = self.pool.connection()
    cursor = conn.cursor()
    sql = ""
    try:
        if position is None:
            sql = "TRUNCATE TABLE {0}".format(tableName)
            cursor.execute(sql)
        elif not position:
            # never fall through to an unconditional DELETE by accident
            logger.error("refuse to delete from {0} with an empty position".format(tableName))
            return False
        else:
            conditionList = [str(key) + " = %s" for key in position.keys()]
            sql = "DELETE FROM " + str(tableName) + " WHERE " + " AND ".join(conditionList)
            cursor.execute(sql, list(position.values()))
        conn.commit()
        return True
    except BaseException:
        logger.error(traceback.format_exc())
        logger.error("error sql ---> " + sql)
        return False
    finally:
        cursor.close()
        conn.close()
def insert(self, tableName, info):
    '''
    Insert one row into a table.

    :param tableName: name of the target table
    :param info: dictionary of column name -> value; values are passed to the
        driver as query parameters
    :return: True when the row was committed or an IntegrityError was raised
        (which is silently treated as success), False on any other failure
        (which is logged)
    '''
    connection = self.pool.connection()
    dbCursor = connection.cursor()
    sql = ""
    try:
        columnNames = list(info.keys())
        rowValues = list(info.values())
        placeholders = ", ".join(["%s"] * len(rowValues))
        sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(
            tableName, ", ".join(columnNames), placeholders)
        dbCursor.execute(sql, rowValues)
        connection.commit()
        return True
    except pymysql.err.IntegrityError:
        return True
    except BaseException:
        logger.error(traceback.format_exc())
        logger.error("error sql ---> " + sql)
        return False
    finally:
        dbCursor.close()
        connection.close()
def torrentResultClassPreprocessing(self, torrentResultClass):
    '''
    Flatten a torrent result object into the dictionary written to the database.

    :param torrentResultClass: object exposing the torrent getters
        (getCategory, getTitle, ... getDownloaded)
    :return: dictionary with one entry per column ("id" is always None), or
        None when the argument is None or a getter raised (both are logged)
    '''
    if torrentResultClass is None:
        logger.error("torrentResultClass is None in the torrentResultClassPreprocessign function")
        return None
    try:
        return {
            "id": None,
            "category": torrentResultClass.getCategory(),
            "title": torrentResultClass.getTitle(),
            "detailsPageUrl": torrentResultClass.getDetailsPageUrl(),
            "downloadPageUrl": torrentResultClass.getDownloadPageUrl(),
            "torrentDownloadUrl": torrentResultClass.getTorrentDownloadUrl(),
            "savePath": torrentResultClass.getSavePath(),
            "crawData": torrentResultClass.getCrawData(),
            "md5": torrentResultClass.getMd5(),
            "magnet": torrentResultClass.getMagnet(),
            "downloaded": torrentResultClass.getDownloaded(),
        }
    except BaseException:
        logger.error(traceback.format_exc())
        return None
def getTorrentDownloadUrl(self, torrentResultClass):
    '''
    Extract the torrent download URL from the download information page.

    The response stored on the result object is decoded as UTF-8 and passed
    to self.torrentDownloadUrl(); the download page URL and the extracted
    torrent URL are then stored back on the result object.

    :param torrentResultClass: result object holding the download page response
    :return: the same result object on success; None when the argument is
        None, the stored response is a string, is None, has a status code
        other than 200, or an exception was raised (exceptions are logged)
    '''
    if torrentResultClass is None:
        logger.error("get download page url failed, because torrentResultClass is None")
        return None
    try:
        pageResponse = torrentResultClass.getResponse()
        # a plain string sometimes arrives here instead of a response object
        if isinstance(pageResponse, str):
            return None
        if pageResponse is None or pageResponse.status_code != 200:
            return None

        pageResponse.encoding = "utf-8"
        extractedUrl = self.torrentDownloadUrl(pageResponse.text)
        torrentResultClass.setDownloadPageUrl(str(pageResponse.url))
        torrentResultClass.setTorrentDownloadUrl(extractedUrl)
        return torrentResultClass
    except BaseException:
        logger.error(traceback.format_exc())
        return None
def clearTitle(cls, title):
    '''
    Reduce a page title to its leading part.

    Everything from the first " - " separator on is dropped and every
    non-breaking space ("\\xa0") is removed.

    :param title: raw page title
    :return: the cleaned title; when cleaning raises, the traceback is logged
        and the title is returned as received
    '''
    try:
        return title.split(" - ")[0].replace("\xa0", "")
    except BaseException:
        logger.error(traceback.format_exc())
        return title
def filterTorrentName(cls, torrentName):
    '''
    Strip characters that are not allowed in file names from a torrent name.

    Two removal passes are applied one after the other; together they drop
    ? \\ * | < > : / the straight double quote and the left curly quote.

    :param torrentName: candidate file name
    :return: the filtered name; when filtering raises, the traceback is
        logged and the name is returned in whatever state it had reached
    '''
    removalPatterns = (r'[?\\*|“<>:/]', r'[\/\\\:\*\?\"\<\>\|]')
    try:
        for pattern in removalPatterns:
            torrentName = re.sub(pattern, '', torrentName)
        return torrentName
    except BaseException:
        logger.error(traceback.format_exc())
        return torrentName
def existInDatabase(cls, tableName, position):
    '''
    Tell whether at least one row matching the conditions exists.

    The lookup itself is delegated to cls._database.queryIsExist and is not
    guarded: an exception raised by it propagates to the caller.

    :param tableName: table to search
    :param position: dictionary of column -> value conditions
    :return: True when the lookup result is truthy, otherwise False
    '''
    lookupResult = cls._database.queryIsExist(tableName, position)
    try:
        return bool(lookupResult)
    except BaseException:
        logger.error(traceback.format_exc())
        return False
def update(self, tableName, info):
    '''
    Update rows of a table.

    The update data is a dictionary with two entries, for example:
        {
            "key_values": {"key1": "value1", "key2": "value2", ...},
            "postions": {"key1": "value1", "key2": "value2", ...}
        }
    "key_values" holds the columns to set; make sure the table really has
    those columns. "postions" holds the conditions selecting the rows; they
    are combined with AND only - the code has to be rewritten if OR is needed.

    Example:
        info = {
            "key_values": {"name": "naonao", "age": "23", "date": "940208", "sex": "man"},
            "postions": {"sex": "man", "date": "940208"}
        }

    :param tableName: name of the table to update
    :param info: update data as described above
    :return: True when the statement was committed, False on any failure
        (which is logged)
    '''
    connection = self.pool.connection()
    dbCursor = connection.cursor()
    sql = ""
    try:
        newValues = info['key_values']
        conditions = info['postions']
        assignmentList = [key + " = %s" for key in newValues.keys()]
        conditionList = [key + " = %s" for key in conditions.keys()]
        sql = "UPDATE {tableName} SET {infoKey} WHERE {positionKey}".format(
            tableName=tableName,
            infoKey=", ".join(assignmentList),
            positionKey=" AND ".join(conditionList))
        # parameters follow placeholder order: SET values first, then conditions
        parameters = list(newValues.values()) + list(conditions.values())
        dbCursor.execute(sql, parameters)
        connection.commit()
        return True
    except BaseException:
        logger.error(traceback.format_exc())
        logger.error("error sql ---> " + sql)
        return False
    finally:
        dbCursor.close()
        connection.close()
def executeCustomSQL(self, sql):
    '''
    Execute a caller-supplied SQL statement and commit it.

    The statement is run exactly as given, without parameters.

    :param sql: complete SQL statement
    :return: True when the statement was executed and committed, False on any
        failure (which is logged together with the statement)
    '''
    connection = self.pool.connection()
    dbCursor = connection.cursor()
    succeeded = False
    try:
        dbCursor.execute(sql)
        connection.commit()
        succeeded = True
    except BaseException:
        logger.error(traceback.format_exc())
        logger.error("error sql ---> " + sql)
    finally:
        dbCursor.close()
        connection.close()
    return succeeded
def downloadTorrentFile(self, torrentResultClass):
    '''
    Download a torrent file, store it on disk and record its metadata.

    The file name is the cleaned and filtered title plus ".torrent"; it is
    saved below SAVE_PATH/torrent/<category>/<year>/<month>/<day>/. Save path,
    crawl date, MD5, magnet link and a "downloaded" flag of 0 are written
    back to the result object.

    :param torrentResultClass: result object providing title, torrent
        download URL and details page URL
    :return: the same result object on success; None when the argument is
        None, one of the three required values is empty, or an exception was
        raised while downloading or saving (all of which are logged)
    '''
    if torrentResultClass is None:
        logger.error("torrentResultClass is None in the downloadTorrentFile function")
        return None

    torrentName = self.filterTorrentName(self.clearTitle(torrentResultClass.getTitle()))
    torrentDownloadUrl = torrentResultClass.getTorrentDownloadUrl()
    detailsPageUrl = torrentResultClass.getDetailsPageUrl()

    # every required value is checked in turn; the first empty one aborts
    requiredValues = (
        (torrentName, "get torrent name failed"),
        (torrentDownloadUrl, "get torrent download url failed"),
        (detailsPageUrl, "get details page url failed"),
    )
    for requiredValue, failureMessage in requiredValues:
        if not requiredValue:
            logger.error(failureMessage)
            return None

    try:
        categoryName = self.getCategoryFromDatailsPageUrl(detailsPageUrl)
        torrentResponse = requests.get(torrentDownloadUrl, headers=self.headers)
        torrentBytes = torrentResponse.content
        torrentMd5 = self.computeMD5ByFile(torrentBytes)

        # checkDirExist receives the full target path and returns the path to write to
        targetPath = os.path.join(SAVE_PATH, "torrent", categoryName, self.year(), self.month(), self.day(),
                                  torrentName + ".torrent")
        torrentPath = self.checkDirExist(targetPath)
        with open(torrentPath, "wb+") as torrentFile:
            torrentFile.write(torrentBytes)

        torrentResultClass.setSavePath(torrentPath)
        torrentResultClass.setCrawData(self.formatDate())
        torrentResultClass.setMd5(torrentMd5)
        torrentResultClass.setMagnet(self.torrentToMagnet(torrentBytes))
        torrentResultClass.setDownloaded(0)
        return torrentResultClass
    except BaseException:
        logger.error(traceback.format_exc())
        return None
def getImageDownloadUrlGenerator(self, detailsResponse):
    '''
    Yield one ImageResultClass per new image found on a details page.

    This is a generator function. The page is decoded as GBK; every <input>
    tag with a non-empty "data-src" attribute is a candidate. Candidates
    already stored in the "image" table, or rejected by
    self.isImageDownloadPageUrl(), are skipped. For the others the image is
    fetched and a populated result object is yielded.

    The image count of the page is computed at most once per call (on the
    first accepted image) instead of re-parsing the page for every image.

    :param detailsResponse: response of the details page; nothing is yielded
        when it is None or its status code is not 200
    :return: generator of ImageResultClass objects; on an exception the
        traceback is logged and the generator simply ends
    '''
    try:
        if detailsResponse is None or detailsResponse.status_code != 200:
            return None
        detailsResponse.encoding = "gbk"
        soup = bs(detailsResponse.text, "lxml")
        title = soup.head.title.text
        detailsPageUrl = str(detailsResponse.url)
        detailsPageImageCount = None  # filled lazily, identical for every image of the page
        for inputTag in soup.find_all("input"):
            try:
                imageDownloadUrl = inputTag['data-src']
            except KeyError:
                continue
            if not imageDownloadUrl:
                continue
            if self.existInDatabase("image", {"imageDownloadUrl": imageDownloadUrl}):
                continue
            if not self.isImageDownloadPageUrl(imageDownloadUrl):
                continue
            if detailsPageImageCount is None:
                detailsPageImageCount = str(self.computeDetailsImageCount(detailsResponse))
            imageResultClass = ImageResultClass()
            imageResultClass.setDetailsPageUrl(detailsPageUrl)
            imageResultClass.setDetailsPageImageCount(detailsPageImageCount)
            imageResultClass.setImageDownloadUrl(imageDownloadUrl)
            # urlToResponse takes a list of URLs; only the first result is kept
            imageResultClass.setResponse(self.urlToResponse([imageDownloadUrl])[0])
            imageResultClass.setTitle(self.clearTitle(title))
            imageResultClass.setCrawDate(self.formatDate())
            yield imageResultClass
    except GeneratorExit:
        # raised at the yield when the consumer closes or drops the generator;
        # that is a normal shutdown, not an error worth a traceback
        raise
    except BaseException:
        logger.error(traceback.format_exc())
        return None
def queryIsExist(self, tableName, position):
    '''
    Check whether at least one row matches the given conditions.

    Condition values are handed to the driver as query parameters instead of
    being concatenated into the statement, so quotes inside a value cannot
    change the SQL and non-string values (including None) are accepted.

    :param tableName: table name; it is interpolated into the statement
        as-is, so it must come from trusted code
    :param position: dictionary of column -> value, e.g. {"md5": "..."}; all
        pairs are combined with AND. An empty dictionary matches every row.
    :return: True when at least one row matches, False when none matches or
        the query failed (failures are logged)
    '''
    conn = self.pool.connection()
    cursor = conn.cursor()
    sql = ""
    try:
        conditionList = [str(key) + " = %s" for key in position.keys()]
        sql = "SELECT * FROM " + str(tableName)
        if conditionList:
            sql += " WHERE " + " AND ".join(conditionList)
        sql += " FOR UPDATE"
        # pass no parameter list at all when there is nothing to substitute
        cursor.execute(sql, list(position.values()) or None)
        # read the rows before the commit releases the FOR UPDATE locks
        resultList = cursor.fetchall()
        conn.commit()
        return len(resultList) != 0
    except BaseException:
        logger.error(traceback.format_exc())
        logger.error("error sql ---> " + sql)
        return False
    finally:
        cursor.close()
        conn.close()
def writeToDatabase(self, torrentResultDict):
    '''
    Store a torrent record unless a record with the same MD5 already exists.

    A progress line is logged in both cases; the record is inserted into the
    "torrent" table only when its MD5 is not found there.

    :param torrentResultDict: dictionary describing the torrent; must contain
        the keys "md5", "category" and "title"
    :return: always None; a None argument and exceptions are only logged
    '''
    if torrentResultDict is None:
        logger.error("torrentResultDict is None")
        return None
    try:
        alreadyStored = self._database.queryIsExist("torrent", {"md5": torrentResultDict["md5"]})
        progressBar = self.computeProgressBar(self.completed, self.totalDownload)
        if alreadyStored:
            logger.info("Torrent completed:{progressBar: <5} torrent already exist database.".format(
                progressBar=progressBar))
        else:
            messageParts = [
                "Torrent completed: {progressBar: <10}".format(progressBar=progressBar),
                "category: {category: <20}".format(category=torrentResultDict['category']),
                "Title:{title}".format(title=torrentResultDict['title']),
            ]
            logger.info("".join(messageParts), level="ALL")
            self._database.insert("torrent", torrentResultDict)
    except BaseException:
        logger.error(traceback.format_exc())
        logger.error("An error occurred in the function ---> wirteToDataBase")
        return None
def saveImageGenerator(self, imageResultClassGenerator):
    '''
    Save every downloaded image to disk and yield its result object.

    This is a generator function. Items that are plain strings, or whose
    response is None or not status 200, are skipped. For the others the MD5
    of the image bytes is stored on the item, the image is written below
    SAVE_PATH/image/<year>/<month>/<day>/<title>/<file name> and the item is
    yielded with its save path set.

    :param imageResultClassGenerator: iterable of image result objects; when
        it is None an error is logged and nothing is yielded
    :return: generator of the saved image result objects; the first exception
        is logged and ends the generator
    '''
    if imageResultClassGenerator is None:
        logger.error("imageResultClass generator is None")
        return None
    for imageResultClass in imageResultClassGenerator:
        try:
            if isinstance(imageResultClass, str):
                continue
            response = imageResultClass.getResponse()
            if response is not None and response.status_code == 200:
                imageByte = response.content
                imageMd5 = self.computeMD5ByFile(imageByte)
                imageResultClass.setMd5(imageMd5)
                # the file keeps the last path component of its download URL
                imageName = os.path.split(imageResultClass.getImageDownloadUrl())[1]
                imageDirName = imageResultClass.getTitle()
                # checkDirExist receives the full target path and returns the path to write to
                imageSavePath = self.checkDirExist(os.path.join(SAVE_PATH, "image", self.year(), self.month(),
                                                                self.day(), imageDirName, imageName))
                imageResultClass.setSavePath(imageSavePath)
                with open(imageSavePath, "wb+") as fo:
                    fo.write(imageByte)
                yield imageResultClass
        except GeneratorExit:
            # raised at the yield when the consumer closes or drops the
            # generator; that is a normal shutdown, not an error to log
            raise
        except BaseException:
            logger.error(traceback.format_exc())
            return None
def imageResultClassPreprocessing(self, imageResultClass):
    '''
    Flatten an image result object into the dictionary written to the database.

    :param imageResultClass: object exposing the image getters (getTitle,
        getDetailsPageUrl, ... getMd5)
    :return: dictionary with one entry per column ("id" is always None), or
        None when the argument is None or a getter raised (both are logged)
    '''
    if imageResultClass is None:
        logger.error("imageResultClass is None in the imageResultClassPreprocessign function")
        return None
    try:
        return {
            "id": None,
            "title": imageResultClass.getTitle(),
            "detailsPageUrl": imageResultClass.getDetailsPageUrl(),
            "detailsPageImageCount": imageResultClass.getDetailsPageImageCount(),
            "imageDownloadUrl": imageResultClass.getImageDownloadUrl(),
            "savePath": imageResultClass.getSavePath(),
            "crawData": imageResultClass.getCrawDate(),
            "md5": imageResultClass.getMd5(),
        }
    except BaseException:
        logger.error(traceback.format_exc())
        return None
def writeToDatabase(self, imageResultDict):
    '''
    Store an image record unless a record with the same MD5 already exists.

    When the MD5 is not found in the "image" table, a progress line is logged
    and the record is inserted; otherwise nothing is written or logged.

    :param imageResultDict: dictionary describing the image; must contain the
        keys "md5" and "title"
    :return: always None; a None argument and exceptions are only logged
    '''
    if imageResultDict is None:
        logger.error("imageResultDict is None")
        return None
    try:
        alreadyStored = self._database.queryIsExist("image", {"md5": imageResultDict["md5"]})
        progressBar = self.computeProgressBar(self.completed, self.totalDownload)
        if alreadyStored:
            return None
        messageParts = [
            "Image completed: {progressBar: <10}".format(progressBar=progressBar),
            "Title:{title}".format(title=imageResultDict['title']),
        ]
        logger.info("".join(messageParts), level="ALL")
        self._database.insert("image", imageResultDict)
    except BaseException:
        logger.error(traceback.format_exc())
        logger.error("An error occurred in the function ---> wirteToDataBase")
        return None
def query(self, tableName, position=None):
    '''
    Query rows of a table and return them as a list of dictionaries.

    * position is None or empty: every row is selected.
    * position contains the key "query_col": only that column expression is
      selected, for every row (other keys are ignored), e.g.
      {"query_col": "name"} returns [{"name": ...}, ...]. The dictionaries
      are keyed by the column names reported by the cursor.
    * otherwise every pair is a condition and all of them are combined with
      AND, e.g. {"name": "wen lyuzhao", "age": "24"}. Values are compared as
      strings and handed to the driver as query parameters, so quotes inside
      a value cannot change the SQL.

    Full rows are labelled positionally with the eleven torrent columns
    (id, category, title, ... downloaded); a row with fewer columns makes the
    call fail and return [].

    :param tableName: table name; it, like "query_col", is interpolated into
        the statement as-is and must come from trusted code
    :param position: optional dictionary as described above
    :return: list of row dictionaries, or [] when nothing matches or the
        query failed (failures are logged)
    '''
    conn = self.pool.connection()
    cursor = conn.cursor()
    sql = ""
    torrentColumnList = ("id", "category", "title", "detailsPageUrl", "downloadPageUrl",
                         "torrentDownloadUrl", "savePath", "crawData", "md5", "magnet", "downloaded")
    try:
        parameterList = []
        selectingColumn = position is not None and "query_col" in position
        if selectingColumn:
            sql = "SELECT " + str(position["query_col"]) + " FROM " + str(tableName)
        else:
            sql = "SELECT * FROM " + str(tableName)
            if position:
                conditionList = [str(key) + " = %s" for key in position.keys()]
                sql += " WHERE " + " AND ".join(conditionList)
                # str() keeps the comparison a string comparison, as before
                parameterList = [str(value) for value in position.values()]
        sql += " FOR UPDATE"
        # pass no parameter list at all when there is nothing to substitute
        cursor.execute(sql, parameterList or None)
        # read the rows before the commit releases the FOR UPDATE locks
        resultTupleList = cursor.fetchall()
        if selectingColumn:
            # label a column selection with the names the driver reports
            columnList = [column[0] for column in cursor.description]
        else:
            columnList = torrentColumnList
        conn.commit()
        return [
            {column: result[index] for index, column in enumerate(columnList)}
            for result in resultTupleList
        ]
    except BaseException:
        logger.error(traceback.format_exc())
        logger.error("error sql ---> " + sql)
        return []
    finally:
        cursor.close()
        conn.close()