Exemple #1
0
 def getDownloadPageUrl(self, detailsResponse):
     '''
     Build a TorrentResultClass from a details page response.

     The page is decoded as GBK and every <a> tag is inspected. When the text
     of a tag is accepted by self.isDownloadPageUrl, the result object is
     filled from that link; a later matching link overwrites an earlier one.
     :param detailsResponse: response of a details page; it is only parsed
                             when it is not None and its status code is 200
     :return: the TorrentResultClass instance (left unfilled when the page is
              not usable or no link matches), or None when an exception occurred
     '''
     try:
         result = TorrentResultClass()
         if detailsResponse is None or detailsResponse.status_code != 200:
             return result
         detailsResponse.encoding = "gbk"
         soup = bs(detailsResponse.text, "lxml")
         pageTitle = soup.head.title.text
         for anchor in soup.find_all("a"):
             linkText = anchor.string
             # skip anchors without text and anchors whose text is not a download page url
             if not linkText or not self.isDownloadPageUrl(linkText):
                 continue
             result.setCategory(self.getCategoryFromDatailsPageUrl(str(detailsResponse.url)))
             result.setCrawData(self.formatDate())
             result.setDetailsPageUrl(str(detailsResponse.url))
             # urlToResponse takes a list of urls; only the first response is kept
             result.setResponse(self.urlToResponse([linkText.strip()])[0])
             result.setTitle(self.clearTitle(pageTitle))
         return result
     except BaseException:
         logger.error(traceback.format_exc())
         return None
Exemple #2
0
 def computeDetailsImageCount(self, detailsResponse):
     '''
     Count the <input> tags of a details page that carry a data-src attribute.
     :param detailsResponse: response of a details page, decoded as GBK
     :return: number of <input> tags with a data-src attribute; 0 when the
              response is None, its status code is not 200, or an exception occurred
     '''
     try:
         if detailsResponse is None or detailsResponse.status_code != 200:
             logger.error("detalisResponse is None or status code not 200 in the computeDetailsImageCount function")
             return 0
         detailsResponse.encoding = "gbk"
         soup = bs(detailsResponse.text, "lxml")
         total = 0
         for tag in soup.find_all("input"):
             try:
                 # the lookup only serves to detect whether the attribute exists
                 tag['data-src']
             except KeyError:
                 continue
             total += 1
         return total
     except BaseException:
         logger.error(traceback.format_exc())
         return 0
Exemple #3
0
 def getCategoryFromDatailsPageUrl(cls, detailsPageUrl):
     '''
     Map a details page url to a category name.

     The url is split on "/" and the fifth piece (index 4) is looked up in
     Category.categoryNumberToNameDict.
     :param detailsPageUrl: url of a details page
     :return: the category name, or "unknown" when the lookup fails for any reason
     '''
     try:
         urlPieces = detailsPageUrl.split("/")
         categoryNumber = str(urlPieces[4])
         return Category.categoryNumberToNameDict[categoryNumber]
     except BaseException:
         logger.error(traceback.format_exc())
         return "unknown"
Exemple #4
0
 def getDetailsPageUrl(self, responseList):
     '''
     Collect details page urls from list page responses and queue their responses.

     Each usable list page (not None, status code 200) is decoded as GBK. For
     every table row with exactly five cells, the first link of the second
     cell is taken unless it is excluded. The collected urls are fetched with
     self.urlToResponse, self.totalDownload is set to the number of responses
     and every response is put into self._taskQueue.
     :param responseList: list page responses
     :return: None
     '''
     if not responseList:
         logger.error("get response list is empty in details page function")
     collectedUrls = []
     try:
         # gather details page urls from every list page response
         for listResponse in responseList:
             if listResponse is None or listResponse.status_code != 200:
                 continue
             listResponse.encoding = "gbk"
             soup = bs(listResponse.text, "lxml")
             for row in soup.find_all("tr"):
                 cells = row.find_all("td")
                 if len(cells) != 5:
                     continue
                 links = cells[1].select('a[href]')
                 if not links:
                     continue
                 href = links[0]["href"]
                 if href in self.excludeUrlList or self.urlInExclued(href):
                     continue
                 collectedUrls.append(self.domains + href.strip())
         # turn the urls into responses
         logger.info("get details page response ...")
         fetchedResponses = self.urlToResponse(collectedUrls)
         self.totalDownload = len(fetchedResponses)
         logger.info("present craw count:{0}".format(str(len(fetchedResponses))))
         # hand every response over to the task queue
         for fetched in fetchedResponses:
             self._taskQueue.put(fetched)
     except BaseException:
         logger.error(traceback.format_exc())
Exemple #5
0
 def delete(self, tableName, position=None):
     '''
     Delete rows from a table, or empty the whole table.

     Values are sent to the driver as query parameters (%s placeholders, the
     same way insert and update do), so they are escaped by the driver and may
     be of any type the driver accepts, not only str.
     :param tableName: table name; it is placed into the SQL text as-is and
                       therefore must come from trusted code
     :param position:  dictionary of column name -> required value, combined
                       with AND; column names are placed into the SQL text
                       as-is. None truncates the table. An empty dictionary
                       is rejected.
     :return: True when the statement was executed and committed, else False
     '''
     conn = self.pool.connection()
     cursor = conn.cursor()
     sql = ""
     try:
         if position is None:
             sql = "TRUNCATE TABLE {0}".format(tableName)
             params = None
         elif not position:
             # An empty filter used to produce malformed SQL and fail; keep
             # failing instead of silently deleting every row.
             logger.error("refuse to delete from {0} with an empty position".format(tableName))
             return False
         else:
             conditions = ["{0} = %s".format(column) for column in position]
             sql = "DELETE FROM {0} WHERE {1}".format(
                 tableName, " AND ".join(conditions))
             params = list(position.values())
         cursor.execute(sql, params)
         conn.commit()
         return True
     except Exception:
         # Exception (not BaseException) so KeyboardInterrupt/SystemExit still
         # propagate; the finally clause releases the connection either way.
         logger.error(traceback.format_exc())
         logger.error("error sql ---> " + sql)
         return False
     finally:
         cursor.close()
         conn.close()
Exemple #6
0
 def insert(self, tableName, info):
     '''
     Insert one row into a table.

     The column names come from the keys of info and are placed into the SQL
     text; the values are passed to the driver as %s query parameters.
     :param tableName: name of the table
     :param info: dictionary of column name -> value
     :return: True when the row was inserted and committed, and also when the
              driver raised IntegrityError (presumably a duplicate key -
              confirm against the table schema); False on any other error
     '''
     connection = self.pool.connection()
     cur = connection.cursor()
     sql = ""
     try:
         columns = list(info.keys())
         values = list(info.values())
         placeholders = ", ".join(["%s"] * len(values))
         sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(
             tableName, ", ".join(columns), placeholders)
         cur.execute(sql, values)
         connection.commit()
         return True
     except pymysql.err.IntegrityError:
         # an integrity violation is deliberately reported as success
         return True
     except BaseException:
         logger.error(traceback.format_exc())
         logger.error("error sql ---> " + sql)
         return False
     finally:
         cur.close()
         connection.close()
Exemple #7
0
 def torrentResultClassPreprocessing(self, torrentResultClass):
     '''
     Flatten a torrent result object into the dictionary written to the database.

     The "id" entry is always None; every other entry is read through the
     matching getter of torrentResultClass.
     :param torrentResultClass: object exposing the getters used below
     :return: the dictionary, or None when the argument is None or a getter raised
     '''
     if torrentResultClass is None:
         logger.error("torrentResultClass is None in the torrentResultClassPreprocessign function")
         return None
     try:
         return {
             "id": None,
             "category": torrentResultClass.getCategory(),
             "title": torrentResultClass.getTitle(),
             "detailsPageUrl": torrentResultClass.getDetailsPageUrl(),
             "downloadPageUrl": torrentResultClass.getDownloadPageUrl(),
             "torrentDownloadUrl": torrentResultClass.getTorrentDownloadUrl(),
             "savePath": torrentResultClass.getSavePath(),
             "crawData": torrentResultClass.getCrawData(),
             "md5": torrentResultClass.getMd5(),
             "magnet": torrentResultClass.getMagnet(),
             "downloaded": torrentResultClass.getDownloaded(),
         }
     except BaseException:
         logger.error(traceback.format_exc())
         return None
Exemple #8
0
 def getTorrentDownloadUrl(self, torrentResultClass):
     '''
     Extract the torrent download url from the stored download page response.

     The response held by torrentResultClass is decoded as UTF-8 and its text
     is passed to self.torrentDownloadUrl; the download page url and the
     extracted torrent url are stored back on the object.
     :param torrentResultClass: result object carrying the download page response
     :return: the same object after it was updated; None when the argument is
              None, the stored response is a string or None, its status code
              is not 200, or an exception occurred
     '''
     if torrentResultClass is None:
         logger.error("get download page url failed, because torrentResultClass is None")
         return None
     try:
         pageResponse = torrentResultClass.getResponse()
         # the stored response has been seen to be a plain string for unknown
         # reasons; such entries cannot be processed
         if isinstance(pageResponse, str):
             return None
         if pageResponse is None or pageResponse.status_code != 200:
             return None
         pageResponse.encoding = "utf-8"
         torrentUrl = self.torrentDownloadUrl(pageResponse.text)
         torrentResultClass.setDownloadPageUrl(str(pageResponse.url))
         torrentResultClass.setTorrentDownloadUrl(torrentUrl)
         return torrentResultClass
     except BaseException:
         logger.error(traceback.format_exc())
         return None
Exemple #9
0
 def clearTitle(cls, title):
     '''
     Reduce a page title to its leading part.

     Everything from the first " - " on is dropped and non-breaking spaces
     ("\\xa0") are removed from what is left.
     :param title: raw page title
     :return: the cleaned title, or the title as received when cleaning failed
     '''
     try:
         leadingPart = title.partition(" - ")[0]
         return leadingPart.replace("\xa0", "")
     except BaseException:
         logger.error(traceback.format_exc())
         return title
Exemple #10
0
 def filterTorrentName(cls, torrentName):
     '''
     Strip characters matched by the two patterns below from a torrent name.

     Together the patterns remove ? \\ * | < > : / as well as the left curly
     quote and the straight double quote.
     :param torrentName: raw torrent name
     :return: the name without those characters; when a substitution raises,
              the name as it was before that substitution
     '''
     forbiddenPatterns = (
         r'[?\\*|“<>:/]',
         r'[\/\\\:\*\?\"\<\>\|]',
     )
     try:
         for pattern in forbiddenPatterns:
             torrentName = re.sub(pattern, '', torrentName)
         return torrentName
     except BaseException:
         logger.error(traceback.format_exc())
         return torrentName
Exemple #11
0
 def existInDatabase(cls, tableName, position):
     '''
     Report whether cls._database.queryIsExist finds a matching row.

     The database call itself is made outside the try block, so an exception
     raised by it reaches the caller.
     :param tableName: name of the table to search
     :param position: conditions handed to queryIsExist unchanged
     :return: True when the query result is truthy, otherwise False
     '''
     found = cls._database.queryIsExist(tableName, position)
     try:
         return bool(found)
     except BaseException:
         logger.error(traceback.format_exc())
         return False
Exemple #12
0
    def update(self, tableName, info):
        '''
        Update the rows of a table that match the given conditions.

        info is a dictionary with two entries:
        {
            "key_values":{"key1":"value1","key2":"value2"...},
            "postions":{"key1":"value1","key2":"value2"...}
        }
        "key_values" holds the columns to set and their new values; the
        columns must exist in the table.
        "postions" holds the conditions; they are combined with AND only, the
        code has to be rewritten if OR is needed.

        Example:
            info = {
            "key_values":{
                            "name":"naonao",
                            "age":"23",
                            "date":"940208",
                            "sex":"man"
                        },
            "postions":{
                            "sex":"man",
                            "date":"940208"
                        }
            }
        :param tableName: name of the table
        :param info: dictionary described above
        :return: True when the statement was executed and committed, else False
        '''
        connection = self.pool.connection()
        cur = connection.cursor()
        sql = ""
        try:
            newValues = info['key_values']
            conditions = info['postions']
            # column names go into the SQL text, values are sent as parameters
            setClause = ", ".join(
                [column + " = %s" for column in newValues.keys()])
            whereClause = " AND ".join(
                [column + " = %s" for column in conditions.keys()])
            sql = "UPDATE {tableName} SET {infoKey} WHERE {positionKey}".format(
                tableName=tableName,
                infoKey=setClause,
                positionKey=whereClause)
            parameters = list(newValues.values()) + list(conditions.values())
            cur.execute(sql, parameters)
            connection.commit()
            return True
        except BaseException:
            logger.error(traceback.format_exc())
            logger.error("error sql ---> " + sql)
            return False
        finally:
            cur.close()
            connection.close()
Exemple #13
0
 def executeCustomSQL(self, sql):
     '''
     Execute a caller supplied SQL statement and commit it.

     The statement is handed to the cursor unchanged, without parameters.
     :param sql: complete SQL statement
     :return: True when the statement was executed and committed, else False
     '''
     connection = self.pool.connection()
     cur = connection.cursor()
     try:
         cur.execute(sql)
         connection.commit()
         return True
     except BaseException:
         logger.error(traceback.format_exc())
         logger.error("error sql ---> " + sql)
         return False
     finally:
         # always hand the cursor and the connection back
         cur.close()
         connection.close()
Exemple #14
0
 def downloadTorrentFile(self, torrentResultClass):
     '''
     Download a torrent file and record where and how it was stored.

     The file is saved as
     SAVE_PATH/torrent/<category>/<year>/<month>/<day>/<name>.torrent, where
     the name is the cleaned and filtered title. Save path, crawl date, md5
     and magnet are stored on the result object and its downloaded flag is
     set to 0.
     :param torrentResultClass: result object carrying title, torrent download
                                url and details page url
     :return: the updated object; None when the argument is None, one of the
              three values is empty, or an exception occurred
     '''
     if torrentResultClass is None:
         logger.error("torrentResultClass is None in the downloadTorrentFile function")
         return None
     torrentName = self.filterTorrentName(self.clearTitle(torrentResultClass.getTitle()))
     torrentDownloadUrl = torrentResultClass.getTorrentDownloadUrl()
     detailsPageUrl = torrentResultClass.getDetailsPageUrl()
     requiredValues = (
         (torrentName, "get torrent name failed"),
         (torrentDownloadUrl, "get torrent download url failed"),
         (detailsPageUrl, "get details page url failed"),
     )
     for value, failureMessage in requiredValues:
         if not value:
             logger.error(failureMessage)
             return None
     try:
         categoryName = self.getCategoryFromDatailsPageUrl(detailsPageUrl)
         # NOTE(review): the request is made without a timeout and the status
         # code of the answer is not inspected before its body is saved
         torrentResponse = requests.get(torrentDownloadUrl, headers=self.headers)
         torrentBytes = torrentResponse.content
         torrentMd5 = self.computeMD5ByFile(torrentBytes)
         # checkDirExist receives the full target path and returns the path to write to
         pathParts = [
             SAVE_PATH,
             "torrent",
             categoryName,
             self.year(),
             self.month(),
             self.day(),
             torrentName + ".torrent",
         ]
         torrentPath = self.checkDirExist(os.path.join(*pathParts))
         with open(torrentPath, "wb+") as torrentFile:
             torrentFile.write(torrentBytes)
         torrentResultClass.setSavePath(torrentPath)
         torrentResultClass.setCrawData(self.formatDate())
         torrentResultClass.setMd5(torrentMd5)
         torrentResultClass.setMagnet(self.torrentToMagnet(torrentBytes))
         torrentResultClass.setDownloaded(0)
         return torrentResultClass
     except BaseException:
         logger.error(traceback.format_exc())
         return None
Exemple #15
0
 def getImageDownloadUrlGenerator(self, detailsResponse):
     '''
     Yield one ImageResultClass per new image found on a details page.

     This is a generator function. The page is decoded as GBK and every
     <input> tag with a non-empty data-src attribute is considered. An image
     is skipped when its url is already stored in the "image" table or when
     self.isImageDownloadPageUrl rejects it.
     :param detailsResponse: response of a details page; nothing is yielded
                             when it is None or its status code is not 200
     :return: generator of ImageResultClass objects; the generator simply
              ends after logging when an exception occurs
     '''
     try:
         if detailsResponse is None or detailsResponse.status_code != 200:
             return
         detailsResponse.encoding = "gbk"
         soup = bs(detailsResponse.text, "lxml")
         title = soup.head.title.text
         # The image count of the page is the same for every image, so it is
         # computed at most once, on the first image that is actually yielded.
         imageCount = None
         for inputTag in soup.find_all("input"):
             try:
                 imageDownloadUrl = inputTag['data-src']
             except KeyError:
                 continue
             if not imageDownloadUrl:
                 continue
             if self.existInDatabase("image", {"imageDownloadUrl": imageDownloadUrl}):
                 continue
             if not self.isImageDownloadPageUrl(imageDownloadUrl):
                 continue
             if imageCount is None:
                 imageCount = str(self.computeDetailsImageCount(detailsResponse))
             imageResultClass = ImageResultClass()
             imageResultClass.setDetailsPageUrl(str(detailsResponse.url))
             imageResultClass.setDetailsPageImageCount(imageCount)
             imageResultClass.setImageDownloadUrl(imageDownloadUrl)
             # urlToResponse takes a list of urls; only the first response is kept
             imageResultClass.setResponse(self.urlToResponse([imageDownloadUrl])[0])
             imageResultClass.setTitle(self.clearTitle(title))
             imageResultClass.setCrawDate(self.formatDate())
             yield imageResultClass
     except Exception:
         # Exception rather than BaseException: GeneratorExit is raised at the
         # yield when the consumer stops early or the generator is garbage
         # collected, and must not be logged as an error.
         logger.error(traceback.format_exc())
         return
Exemple #16
0
 def queryIsExist(self, tableName, position):
     '''
     Check whether at least one row matches all given conditions.

     Values are sent to the driver as query parameters (%s placeholders, the
     same way insert and update do), so they are escaped by the driver and may
     be of any type the driver accepts, not only str. The statement is issued
     as SELECT ... FOR UPDATE and committed right after execution.
     :param tableName: table name; it is placed into the SQL text as-is and
                       therefore must come from trusted code
     :param position:  dictionary of column name -> required value, combined
                       with AND (for example {"md5": "..."}); column names
                       are placed into the SQL text as-is
     :return: True when a matching row exists; False when none exists, when
              position is empty or None, or when an error occurred
     '''
     conn = self.pool.connection()
     cursor = conn.cursor()
     sql = ""
     try:
         if not position:
             # without a condition the old code built malformed SQL and failed
             logger.error("queryIsExist needs at least one condition for table {0}".format(tableName))
             return False
         conditions = ["{0} = %s".format(column) for column in position]
         sql = "SELECT * FROM {0} WHERE {1} FOR UPDATE".format(
             tableName, " AND ".join(conditions))
         cursor.execute(sql, list(position.values()))
         conn.commit()
         resultList = cursor.fetchall()
         return len(resultList) != 0
     except Exception:
         # Exception (not BaseException) so KeyboardInterrupt/SystemExit still
         # propagate; the finally clause releases the connection either way.
         logger.error(traceback.format_exc())
         logger.error("error sql ---> " + sql)
         return False
     finally:
         cursor.close()
         conn.close()
Exemple #17
0
 def writeToDatabase(self, torrentResultDict):
     '''
     Insert a torrent record unless a record with the same md5 already exists.

     The progress text comes from
     self.computeProgressBar(self.completed, self.totalDownload) and is part
     of the message logged in either case.
     :param torrentResultDict: dictionary with at least "md5", "category" and "title"
     :return: None in every case
     '''
     if torrentResultDict is None:
         logger.error("torrentResultDict is None")
         return None
     try:
         alreadyStored = self._database.queryIsExist("torrent", {"md5": torrentResultDict["md5"]})
         progressBar = self.computeProgressBar(self.completed, self.totalDownload)
         if alreadyStored:
             logger.info("Torrent completed:{progressBar: <5} torrent already exist database.".format(
                 progressBar=progressBar))
         else:
             messageParts = [
                 "Torrent completed: {progressBar: <10}".format(progressBar=progressBar),
                 "category: {category: <20}".format(category=torrentResultDict['category']),
                 "Title:{title}".format(title=torrentResultDict['title']),
             ]
             logger.info("".join(messageParts), level="ALL")
             self._database.insert("torrent", torrentResultDict)
     except BaseException:
         logger.error(traceback.format_exc())
         logger.error("An error occurred in the function ---> wirteToDataBase")
         return None
Exemple #18
0
 def saveImageGenerator(self, imageResultClassGenerator):
     '''
     Save every downloaded image to disk and yield its result object.

     This is a generator function. Items that are strings, or whose response
     is None or does not have status code 200, are skipped. Each image is
     written to SAVE_PATH/image/<year>/<month>/<day>/<title>/<file name>,
     where the file name is the last path component of the image download
     url. Md5 and save path are stored on the result object before it is
     yielded.
     :param imageResultClassGenerator: iterable of image result objects
     :return: generator of the saved image result objects; it ends at the
              first image that raises an exception (after logging it)
     '''
     if imageResultClassGenerator is None:
         logger.error("imageResultClass generator is None")
         return None
     for imageResultClass in imageResultClassGenerator:
         try:
             if isinstance(imageResultClass, str):
                 continue
             response = imageResultClass.getResponse()
             if response is None or response.status_code != 200:
                 continue
             imageByte = response.content
             imageResultClass.setMd5(self.computeMD5ByFile(imageByte))
             imageName = os.path.split(imageResultClass.getImageDownloadUrl())[1]
             imageDirName = imageResultClass.getTitle()
             # checkDirExist receives the full target path and returns the path to write to
             imageSavePath = self.checkDirExist(os.path.join(SAVE_PATH,
                                                             "image",
                                                             self.year(),
                                                             self.month(),
                                                             self.day(),
                                                             imageDirName,
                                                             imageName))
             imageResultClass.setSavePath(imageSavePath)
             with open(imageSavePath, "wb+") as fo:
                 fo.write(imageByte)
             yield imageResultClass
         except Exception:
             # Exception rather than BaseException: GeneratorExit is raised at
             # the yield when the consumer stops early, and must not be logged
             # as an error.
             logger.error(traceback.format_exc())
             return None
Exemple #19
0
 def imageResultClassPreprocessing(self, imageResultClass):
     '''
     Flatten an image result object into the dictionary written to the database.

     The "id" entry is always None; every other entry is read through a
     getter of imageResultClass (the "crawData" entry comes from getCrawDate).
     :param imageResultClass: object exposing the getters used below
     :return: the dictionary, or None when the argument is None or a getter raised
     '''
     if imageResultClass is None:
         logger.error("imageResultClass is None in the imageResultClassPreprocessign function")
         return None
     try:
         return {
             "id": None,
             "title": imageResultClass.getTitle(),
             "detailsPageUrl": imageResultClass.getDetailsPageUrl(),
             "detailsPageImageCount": imageResultClass.getDetailsPageImageCount(),
             "imageDownloadUrl": imageResultClass.getImageDownloadUrl(),
             "savePath": imageResultClass.getSavePath(),
             "crawData": imageResultClass.getCrawDate(),
             "md5": imageResultClass.getMd5(),
         }
     except BaseException:
         logger.error(traceback.format_exc())
         return None
Exemple #20
0
 def writeToDatabase(self, imageResultDict):
     '''
     Insert an image record unless a record with the same md5 already exists.

     Nothing is logged or written when the md5 is already present in the
     "image" table.
     :param imageResultDict: dictionary with at least "md5" and "title"
     :return: None in every case
     '''
     if imageResultDict is None:
         logger.error("imageResultDict is None")
         return None
     try:
         alreadyStored = self._database.queryIsExist("image", {"md5": imageResultDict["md5"]})
         progressBar = self.computeProgressBar(self.completed, self.totalDownload)
         if alreadyStored:
             return None
         messageParts = [
             "Image completed: {progressBar: <10}".format(progressBar=progressBar),
             "Title:{title}".format(title=imageResultDict['title']),
         ]
         logger.info("".join(messageParts), level="ALL")
         self._database.insert("image", imageResultDict)
     except BaseException:
         logger.error(traceback.format_exc())
         logger.error("An error occurred in the function ---> wirteToDataBase")
         return None
Exemple #21
0
    def query(self, tableName, position=None):
        '''
        Query rows of a table and return them as a list of dictionaries.

        position selects what is queried:
        - None: all rows of the table.
        - a dictionary containing the key "query_col": the named column of
          all rows, for example
              position = {"query_col": "name"}
          requests the whole name column.
        - any other non-empty dictionary: the rows matching all entries,
          for example
              position = {"name": "wen lyuzhao", "age": "24"}
          requests the rows with name = wen lyuzhao AND age = 24.

        Condition values are sent to the driver as query parameters (%s
        placeholders, the same way insert and update do). The table name, the
        condition column names and the "query_col" value are placed into the
        SQL text as-is and therefore must come from trusted code. The
        statement is issued with FOR UPDATE and committed right after
        execution.

        :param tableName: name of the table
        :param position: see above
        :return: list with one dictionary per row. Rows with at least eleven
                 columns are keyed positionally with the torrent column names
                 listed below; narrower rows (for example a "query_col"
                 query) are keyed by the column names reported by the
                 cursor. An empty list is returned when nothing matches,
                 when position is an empty dictionary, or on error.
        '''
        torrentColumns = ("id", "category", "title", "detailsPageUrl",
                          "downloadPageUrl", "torrentDownloadUrl", "savePath",
                          "crawData", "md5", "magnet", "downloaded")
        conn = self.pool.connection()
        cursor = conn.cursor()
        sql = ""
        try:
            params = None
            if position is None:
                # get all
                sql = "SELECT * FROM " + str(tableName)
            elif "query_col" in position:
                # get one column of every row
                sql = "SELECT " + str(position["query_col"]) + " FROM " + str(tableName)
            elif position:
                conditions = ["{0} = %s".format(column) for column in position]
                sql = "SELECT * FROM {0} WHERE {1}".format(
                    tableName, " AND ".join(conditions))
                params = list(position.values())
            else:
                # an empty filter used to produce malformed SQL and an empty result
                logger.error("query got an empty position for table {0}".format(tableName))
                return []
            sql += " FOR UPDATE"
            cursor.execute(sql, params)
            conn.commit()
            rows = cursor.fetchall()
            if not rows:
                return []
            if len(rows[0]) >= len(torrentColumns):
                # positional torrent mapping; zip ignores any extra columns
                columnNames = torrentColumns
            else:
                # Narrower rows made the fixed mapping raise IndexError, so
                # such queries always came back empty; use the column names
                # reported by the driver instead.
                columnNames = [column[0] for column in cursor.description]
            return [dict(zip(columnNames, row)) for row in rows]
        except Exception:
            # Exception (not BaseException) so KeyboardInterrupt/SystemExit
            # still propagate; the finally clause releases the connection.
            logger.error(traceback.format_exc())
            logger.error("error sql ---> " + sql)
            return []
        finally:
            cursor.close()
            conn.close()