Example #1
class Recovery():
    def __init__(self):
        self.settings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.root = '/home/dev/Data/rsyncData/prd4/sites'
        self.dest = '/home/dev/Data/rsyncData/prd4/local'
        self.resume = '/home/dev/Repository/news/Tegenaria/tSpider/tSpider/dataRecovery/resume.txt'

    def start(self):
        sites = os.listdir(self.root)
        if os.path.exists(self.resume) is False:
            print 'resume file does not exist; creating a new one'
            self.file.writeToTxtCover(self.resume, '\n')
        finished = []
        items = self.file.readFromTxt(self.resume).strip().split('\n')
        for item in items:
            finished.append(item)
        for site in sites:
            p1 = '{0}/{1}/html'.format(self.root, site)
            if os.path.exists(p1) is False:
                print '{0} has no html.'.format(site)
                continue
            allTime = os.listdir(p1)
            for t in allTime:
                p2 = '{0}/{1}'.format(p1, t)
                files = os.listdir(p2)
                for file in files:
                    fromFile = '{0}/{1}'.format(p2, file)
                    if fromFile not in finished:
                        toFile = '{0}/{1}'.format(self.dest, file)
                        if self.doraemon.copyFile(fromFile, toFile):
                            self.file.writeToTxtAdd(self.resume, fromFile)
                            print '{0} is recovered.'.format(fromFile)
            print '{0} is finished.'.format(site)
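
A minimal usage sketch for the class above, assuming Settings, FileIOMiddleware and Doraemon are importable from the project and the hard-coded rsync paths exist:

if __name__ == '__main__':
    # walk every site's html folders and copy not-yet-recovered files to the local destination
    Recovery().start()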
Example #2
class ChuansongmeReceptor():

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/"
        self.mongo = "gongzhonghao_test"
        self.finished_ids = "gongzhonghao_test"
        self.log_path = "/home/dev/Data/rsyncData/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_item = html.xpath("./*[contains(@class, 'pagedlist_item')]")
        if len(href_item) == 0:
            print 'No data for: {0}'.format(key)
            return
        self.doraemon.hashSet(self.finished_ids, key, key)
        data = {
            'id': key,
            'url': current_url
        }
        print 'Start to store mongo {0}'.format(data['url'])
        self.doraemon.storeMongodb(self.mongo, data)
        print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/gongzhonghao_test.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            if key not in all_finished_id:
                tmp_url = "https://chuansongme.com/account/{0}".format(key)
                new_urls.append([tmp_url, key])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
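
A hedged usage sketch: the receptor is normally driven from a small entry point, and BrowserRequest then invokes parse as the callback for each rendered page:

if __name__ == '__main__':
    receptor = ChuansongmeReceptor()
    receptor.start_requests()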
Example #3
class XueqiuReceptor():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "xueqiu_test"
        self.finished_ids = "xueqiu_test"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_contens = html.xpath(
            ".//*[contains(@class, 'search__user__card__content')]")
        if len(href_contens) == 0:
            print 'No data for: {0}'.format(key)
            return
        for item in href_contens:
            href = item.xpath(".//*[contains(@class, 'user-name')]/@href")
            title_content = item.xpath(
                ".//*[contains(@class, 'user-name')]//span/text()")
            title = "".join(title_content).strip()
            if len(href) > 0 and title == key:
                url = "https://xueqiu.com/u{0}".format(href[0])
                self.doraemon.hashSet(self.finished_ids, url, url)
                data = {'id': key, 'url': url}
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/xueqiu.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            if key not in all_finished_id:
                name = key.strip()
                tmp_url = "https://xueqiu.com/k?q={0}".format(name)
                new_urls.append([tmp_url, name])

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls,
                             5,
                             self.log_path,
                             None,
                             callback=self.parse)
Example #4
class ProcessTimeoutHandler():
    def __init__(self):
        self.doraemon = Doraemon()
        self.file = FileIOMiddleware()
        self.settings = Settings()

        self.cache_file = self.settings.TIMEOUT_CACHE_FILE
        self.timeout = self.settings.PROCESS_TIMEOUT
        self.timeout_content = self.settings.PROCESS_TIMEOUT_CONTENT

    def updateTimeoutCacheFile(self, processes):
        if self.doraemon.isFileExists(self.cache_file):
            self.doraemon.deleteFile(self.cache_file)
        for process in processes:
            tmp = '{0}-{1}-{2}'.format(process.pid, process.pname,
                                       process.past)
            self.file.writeToTxtAdd(self.cache_file, tmp)

    def getTimeoutCache(self):
        result = []
        if self.doraemon.isFileExists(self.cache_file):
            data = self.file.readFromTxt(self.cache_file)
            pidTimeoutList = data.split('\n')
            for item in pidTimeoutList:
                if self.doraemon.isEmpty(item) is False:
                    tmp = item.split('-')
                    if len(tmp) == 3:
                        result.append(
                            ProcessTimeoutDto(int(tmp[0]), tmp[1],
                                              float(tmp[2]), False))
        return result

    def findTarget(self, pid, pids):
        result = None
        for p in pids:
            if p.pid == pid:
                result = p
        return result

    def filterTimeoutProcesses(self, curpids, prepids):
        result = []
        if self.doraemon.isEmpty(curpids):
            return result
        for p in curpids:
            pre = self.findTarget(p.pid, prepids)
            if pre is not None:
                if (pre.pname == 'chrome' and self.doraemon.isExceedTimeoutInterval(self.timeout, pre.past)) or \
                   (pre.pname == 'python' and self.doraemon.isExceedTimeoutInterval(self.timeout_content, pre.past)):
                    result.append(
                        ProcessTimeoutDto(pre.pid, pre.pname, pre.past, True))
            else:
                result.append(p)
        return result

    def getCurrentProcesses(self):
        result = []
        pids = psutil.pids()
        if len(pids) == 0:
            print 'No Chrome process.'
        for pid in pids:
            try:
                p = psutil.Process(pid)
                pname = p.name()
                if pname == 'chrome' or pname == 'python':
                    print 'Start to store process {0}.'.format(pid)
                    result.append(
                        ProcessTimeoutDto(pid, pname, p.create_time(), False))
            except Exception as e:
                print 'Exception {0} to find process: pid - {1}'.format(
                    e, p.pid)
        return result

    def processTimeoutProcesses(self, curpids, prepids):
        updatedPids = self.filterTimeoutProcesses(curpids, prepids)
        notTimeoutProcesses = []
        for p in updatedPids:
            if p.isTimeout:
                try:
                    print 'kill timeout process: pid - {0}'.format(p.pid)
                    os.kill(p.pid, signal.SIGKILL)
                except Exception as e:
                    print 'Exception {0} to kill process: pid - {1}'.format(
                        e, p.pid)
            else:
                notTimeoutProcesses.append(p)
        self.updateTimeoutCacheFile(notTimeoutProcesses)

    def start(self):
        print 'Start to process.'
        self.processTimeoutProcesses(self.getCurrentProcesses(),
                                     self.getTimeoutCache())
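
A minimal sketch of running the handler, assuming it is invoked periodically (for example from cron) so each run can compare the cached process list against the live one:

if __name__ == '__main__':
    ProcessTimeoutHandler().start()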
Example #5
class Doraemon():
    def __init__(self):
        settings = Settings()
        settings.CreateCommonSettings()
        self.file = FileIOMiddleware()
        self.rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT)
        self.bf_urls = BloomFilter(self.rconn, settings.BLOOMFILTER_URLS)
        self.bf_content = BloomFilter(self.rconn, settings.BLOOMFILTER_CONTENT)
        self.bf_authors = BloomFilter(self.rconn, settings.BLOOMFILTER_AUTHORS)
        self.disable_restart_interval = settings.DISABLE_RESTART_INTERVAL
        self.bf_weixin_url = BloomFilter(self.rconn,
                                         settings.FINISHED_WEIXIN_URL_ARTICLE)
        self.bf_weixin_content = BloomFilter(
            self.rconn, settings.FINISHED_WEIXIN_CONTENT_ARTICLE)
        self.bf_weixin_id = BloomFilter(self.rconn,
                                        settings.FINISHED_WEIXIN_URL_ID)
        self.bf_finished_image_id = BloomFilter(self.rconn,
                                                settings.FINISHED_IMAGE_ID)
        self.bf_finished_temp_weixin = BloomFilter(
            self.rconn, settings.FINISHED_TEMP_WEIXIN)
        self.md5 = hashlib.md5()
        self.max_concurrency = settings.MAX_CONCURRENCY
        self.concurrency_file = settings.CONCURRENCY_FILE
        self.concurrency_refresh_file = settings.CONCURRENCY_REFRESH_FILE
        self.refresh_concurrency_interval = settings.REFRESH_CONCURRENCY_INTERVAL
        self.max_concurrency_spider = settings.MAX_CONCURRENCY_SPIDER
        self.concurrency_file_spider = settings.CONCURRENCY_FILE_SPIDER
        self.concurrency_refresh_file_spider = settings.CONCURRENCY_REFRESH_FILE_SPIDER
        self.refresh_concurrency_interval_spider = settings.REFRESH_CONCURRENCY_INTERVAL_SPIDER
        self.bf_huxiu_nlp = BloomFilter(self.rconn,
                                        settings.FINISHED_HUXIU_NLP)
        self.sites_info = settings.SITES_INFO
        self.sites_debug = settings.SITES_DEBUG

    def sshUpload(self, address, port, username, password, fromFile, toFile):
        transport = paramiko.Transport((address, port))
        try:
            print 'Start to upload file: {0}'.format(fromFile)
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            sftp.put(fromFile, toFile)
            transport.close()
            print 'Finished to upload file: {0}'.format(fromFile)
            return True
        except Exception as e:
            print 'Exception {0} to upload file: {1}'.format(
                e.message, fromFile)
            return False

    def moveFile(self, fromfile=None, tofile=None):
        if fromfile is None or os.path.exists(fromfile) is False:
            print "Source file {0} is not exits".format(fromfile)
            return False
        try:
            retry = 1
            retryLimit = 60
            while not os.path.exists(tofile) and retry <= retryLimit:
                if retry > 1:
                    time.sleep(1)
                shutil.move(fromfile, tofile)
                retry += 1
            if retryLimit < retry:
                raise Exception('Move file retry limit time reached.')
            return True
        except Exception as e:
            raise Exception(
                "Exception {0} to move file {1} to file {2}.".format(
                    e.message, fromfile, tofile))

    def copyFile(self, fromfile=None, tofile=None):
        if fromfile is None or os.path.exists(fromfile) is False:
            print "Source file {0} is not exits".format(fromfile)
            return False
        try:
            retry = 1
            retryLimit = 60
            while not os.path.exists(tofile) and retry <= retryLimit:
                if retry > 1:
                    time.sleep(1)
                shutil.copy(fromfile, tofile)
                retry += 1
            if retryLimit < retry:
                raise Exception('Copy file retry limit time reached.')
            return True
        except Exception as e:
            raise Exception(
                "Exception {0} to copy file {1} to file {2}.".format(
                    e.message, fromfile, tofile))

    def createFilePath(self, path):
        isFilePathExists = os.path.exists(path)
        if isFilePathExists is False:
            os.makedirs(path)

    def isExceedRestartInterval(self, path, restart_interval):
        isRestartPathExists = os.path.exists(path)
        if isRestartPathExists is False:
            print 'restart file does not exist; creating a new one'
            self.file.writeToTxtCover(path, time.time())
            return True
        past = float(self.file.readFromTxt(path))
        now = time.time()
        isExceed = ((now - past) // 60 >= restart_interval) or (
            self.disable_restart_interval is True)
        if isExceed is True:
            print 'exceeds the restart interval and restart'
            self.file.writeToTxtCover(path, time.time())
        else:
            print 'does not exceed the restart interval and stop'
        return isExceed

    def isExceedTimeoutInterval(self, timeout, past):
        if self.isEmpty(timeout):
            print 'timeout is empty'
            return False
        now = time.time()
        isExceed = ((now - past) // 60 >= timeout)
        if isExceed is True:
            print 'exceeds the timeout.'
        else:
            print 'does not exceed the timeout.'
        return isExceed

    def isEmpty(self, obj=None):
        if isinstance(obj, unicode):
            obj = obj.encode('utf-8')
        if isinstance(obj, str):
            return len(obj.strip()) == 0
        elif isinstance(obj, int) or isinstance(obj, float):
            return False
        elif isinstance(obj, list) or isinstance(obj, dict) or isinstance(
                obj, tuple) or isinstance(obj, set):
            return len(obj) == 0
        else:
            return obj is None

    def isNumber(self, item):
        return isinstance(item, int) or isinstance(item, float)

    def isTitleEmpty(self, title, url):
        if self.isEmpty(title):
            print 'Empty title for: {0}'.format(url)
            return True
        return False

    def isUrlValid(self, url, good_keys, bad_keys, regx, valid):
        is_match = False
        for regx_item in regx:
            if regx_item.match(url) != None:
                is_match = True
        if is_match == False:
            print 'Invalid url for not match: {0}'.format(url)
            return False
        for good in good_keys:
            if valid == True:
                continue
            if good in url:
                print 'Match good key: {0}'.format(good)
                valid = True
        for bad in bad_keys:
            if valid == False:
                continue
            if bad in url:
                print 'Match bad key: {0}'.format(bad)
                valid = False
        return valid

    def getImageTypeFromUrl(self, url):
        if '.jpeg' in url or '.jpg' in url:
            return 'jpg'
        if '.png' in url or 'png' in url:
            return 'png'
        if '.gif' in url or 'gif' in url:
            return 'gif'
        else:
            print 'Other image type use default type'
            return 'png'

    def isDuplicated(self, filter, content):
        content_encode = str(content).encode("utf-8")
        if filter.isContains(content_encode):
            print 'Content {0} duplicated!'.format(content)
            return True
        else:
            filter.insert(content_encode)
            print 'Content {0} not duplicated!'.format(content)
            return False

    def isFinished(self, filter, content):
        content_encode = str(content).encode("utf-8")
        if filter.isContains(content_encode):
            print 'Content {0} exists!'.format(content)
            return True
        else:
            return False

    def storeFinished(self, filter, content):
        print 'Start to store content: {0}'.format(content)
        content_encode = str(content).encode("utf-8")
        filter.insert(content_encode)

    def storeMongodb(self, mongo_url, data):
        mongo = MongoMiddleware()
        mongo.insert(mongo_url, data)

    def storeTxt(self, id, content, finished_txt_path, name):
        try:
            self.createFilePath(finished_txt_path)
            print 'Start to store txt: {0}'.format(id)
            self.file.writeToTxtCover(
                '{0}/{1}_{2}.txt'.format(finished_txt_path, name, id), content)
            print 'End to store txt: {0}'.format(id)
        except Exception as e:
            print 'Exception {0} to store txt: {1}'.format(e.message, id)

    def storeTxtAdd(self, author_txt_path, author_name, settingName):
        try:
            self.createFilePath(author_txt_path)
            print 'Start to store txt: {0}'.format(author_name)
            self.file.writeToTxtAdd(
                '{0}/{1}_authors.txt'.format(author_txt_path, settingName),
                author_name)
        except Exception as e:
            print 'Exception to store txt: {0} , for {1}'.format(
                author_name, e.strerror)
        print 'End to store txt: {0}'.format(author_name)

    def storeHtml(self, id, content, finished_html_path):
        try:
            self.createFilePath(finished_html_path)
            print 'Start to store html: {0}'.format(id)
            self.file.writeToHtmlCover(
                '{0}/{1}.html'.format(finished_html_path, id), content)
            print 'End to store html: {0}'.format(id)
            return True
        except Exception as e:
            print 'Exception {0} to store html: {1}'.format(e.message, id)
            return False

    def filter(self, filter, url_titles):
        new_url_titles = []
        for url_title in url_titles:
            if self.isFinished(filter, url_title[1]) is False:
                new_url_titles.append(url_title)
        return new_url_titles

    def imageFilter(self, filter, ids):
        new_ids = []
        for id in ids:
            if self.isFinished(filter, id) is False:
                new_ids.append(id)
        return new_ids

    def readNewUrls(self, filter, url_path):
        print 'Start to read urls'
        isUrlPathExit = os.path.exists(url_path)
        new_url_titles = []
        if isUrlPathExit is True:
            url_titles = np.array(
                self.file.readColsFromCSV(url_path, ['url', 'title']))
            new_url_titles = self.filter(filter, url_titles)
        return new_url_titles

    def readNewImageIds(self, filter, content_path):
        print 'Start to read ids'
        isContentPathExit = os.path.exists(content_path)
        new_ids = []
        id_list = []
        if isContentPathExit is True:
            ids = np.array(self.file.readColsFromCSV(content_path, ['id']))
            for id in ids:
                id_list.append(id[0])
            new_ids = self.imageFilter(filter, id_list)
        return new_ids

    def downloadImage(self, image_url, store_path, image_name):
        try:
            self.createFilePath(store_path)
            print 'start to download image: {0}'.format(image_url)
            urllib.urlretrieve(image_url,
                               '{0}/{1}'.format(store_path, image_name))
            return True
        except Exception as e:
            print 'exception to download image: {0} for {1}'.format(
                image_url, e.message)
            return False

    def hashSet(self, name, key, value):
        self.rconn.hset(name, key, value)

    def getHashSet(self, name, key):
        return self.rconn.hget(name, key)

    def getAllHasSet(self, name):
        return self.rconn.hgetall(name)

    def delHashSet(self, name, key):
        return self.rconn.hdel(name, key)

    def delKey(self, key):
        return self.rconn.delete(key)

    def getKeyLen(self, key):
        return self.rconn.hlen(key)

    def getDateOfDaysBefore(self, days):
        return (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")

    def getCurrentYear(self):
        return time.strftime('%Y', time.localtime(time.time()))

    def getCurrentDate(self):
        return time.strftime('%Y-%m-%d', time.localtime(time.time()))

    def getCurrentLocalTime(self):
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def getDateTime(self, string, dateFormat, pattern, isMatchDate):
        try:
            match = re.search(dateFormat, string)
            strp = datetime.strptime(match.group(), pattern)
            print "Match date success: {0}".format(strp.date())
            return strp.date()
        except:
            print "Match date fail"
            return None

    def getDateFromChinese(self, string):
        year = self.getCurrentYear()
        try:
            if "今天" in string or \
               "秒前" in string or \
               "分钟前" in string or \
               "小时前" in string or \
               "Today" in string:
                return self.getDateOfDaysBefore(0)
            if "昨天" in string or \
               "1天前" in string or \
               "Yesterday" in string:
                return self.getDateOfDaysBefore(1)
            if "前天" in string or \
               "2天前" in string or \
               "2 days ago" in string:
                return self.getDateOfDaysBefore(2)
            if "3天前" in string or \
               "3 days ago" in string:
                return self.getDateOfDaysBefore(3)
            if "4天前" in string or \
               "4 days ago" in string:
                return self.getDateOfDaysBefore(4)
            if "5天前" in string or \
               "5 days ago" in string:
                return self.getDateOfDaysBefore(5)
            if "6天前" in string or \
               "6 days ago" in string:
                return self.getDateOfDaysBefore(6)
            if "1周前" in string or \
               "1 week ago" in string:
                return self.getDateOfDaysBefore(7)
            if "年" not in string and "月" in string and "日" in string:
                data = re.split(",", string.replace('月', ',').replace('日', ''))
                return "{0}-{1}-{2}".format(year,
                                            self.getNumberFromString(data[0]),
                                            self.getNumberFromString(data[1]))
            if "年" in string and "月" in string and "日" in string:
                data = re.split(
                    ",",
                    string.replace('年', ',').replace('月',
                                                     ',').replace('日', ','))
                return "{0}-{1}-{2}".format(self.getNumberFromString(data[0]),
                                            self.getNumberFromString(data[1]),
                                            self.getNumberFromString(data[2]))
        except:
            print("Fail to match date from Chinese.")
            return None

    def getNumberFromString(self, string):
        return ''.join(re.findall(r'\d+', string)).strip()

    def getFinalDate(self, year, month, day):
        return "{0}-{1}-{2}".format(year, month, day)

    def formateMonthDay(self, MD):
        return '{:02d}'.format(MD)

    def getDateFromString(self, string_date):
        _date_chinese = self.getDateFromChinese(string_date)
        if _date_chinese is not None:
            string_date = _date_chinese

        _date_year_month_day_crossing = self.getDateTime(
            string_date, r'\d{4}-\d{1,2}-\d{1,2}', '%Y-%m-%d', True)
        _date_year2_month_day_crossing = self.getDateTime(
            string_date, r'\d{2}-\d{1,2}-\d{1,2}', '%y-%m-%d', True)
        _date_month_day_crossing = self.getDateTime(string_date,
                                                    r'\d{1,2}-\d{1,2}',
                                                    '%m-%d', True)
        _date_year_month_day_dot = self.getDateTime(string_date,
                                                    r'\d{4}\.\d{1,2}\.\d{1,2}',
                                                    '%Y.%m.%d', True)
        _date_month_day_dot = self.getDateTime(string_date,
                                               r'\d{1,2}\.\d{1,2}', '%m.%d',
                                               True)
        _date_year_month_day_slash = self.getDateTime(
            string_date, r'\d{4}\/\d{1,2}\/\d{1,2}', '%Y/%m/%d', True)
        _date_month_day_slash = self.getDateTime(string_date,
                                                 r'\d{1,2}\/\d{1,2}', '%m/%d',
                                                 True)
        _time_hour_minute_second = self.getDateTime(
            string_date, r'\d{1,2}:\d{1,2}:\d{1,2}', '%H:%M:%S', False)
        _time_hour_minute = self.getDateTime(string_date, r'\d{1,2}:\d{1,2}',
                                             '%H:%M', False)

        year = self.getCurrentYear()

        if _date_year_month_day_crossing is not None:
            return self.getFinalDate(
                _date_year_month_day_crossing.year,
                self.formateMonthDay(_date_year_month_day_crossing.month),
                self.formateMonthDay(_date_year_month_day_crossing.day))
        if _date_year2_month_day_crossing is not None:
            return self.getFinalDate(
                _date_year2_month_day_crossing.year,
                self.formateMonthDay(_date_year2_month_day_crossing.month),
                self.formateMonthDay(_date_year2_month_day_crossing.day))
        if _date_year_month_day_crossing is None and _date_month_day_crossing is not None:
            return self.getFinalDate(
                year, self.formateMonthDay(_date_month_day_crossing.month),
                self.formateMonthDay(_date_month_day_crossing.day))

        if _date_year_month_day_dot is not None:
            return self.getFinalDate(
                _date_year_month_day_dot.year,
                self.formateMonthDay(_date_year_month_day_dot.month),
                self.formateMonthDay(_date_year_month_day_dot.day))
        if _date_year_month_day_dot is None and _date_month_day_dot is not None:
            return self.getFinalDate(
                year, self.formateMonthDay(_date_month_day_dot.month),
                self.formateMonthDay(_date_month_day_dot.day))

        if _date_year_month_day_slash is not None:
            return self.getFinalDate(
                _date_year_month_day_slash.year,
                self.formateMonthDay(_date_year_month_day_slash.month),
                self.formateMonthDay(_date_year_month_day_slash.day))
        if _date_year_month_day_slash is None and _date_month_day_slash is not None:
            return self.getFinalDate(
                year, self.formateMonthDay(_date_month_day_slash.month),
                self.formateMonthDay(_date_month_day_slash.day))

        if _time_hour_minute_second is not None or _time_hour_minute is not None:
            return self.getCurrentDate()

    def getMD5(self, content):
        md5 = hashlib.md5()
        md5.update(content.encode('utf-8'))
        return md5.hexdigest()

    def compressImage(self, origin_image_path, destination_image_path,
                      multiplier):
        try:
            sImg = Image.open(origin_image_path)
            w, h = sImg.size
            dImg = sImg.resize((int(w / multiplier), int(h / multiplier)),
                               Image.ANTIALIAS)
            os.remove(origin_image_path)
            dImg.save(destination_image_path)
            print "Compress picture {0} success!".format(
                destination_image_path)
        except Exception as e:
            print "Compress picture {0} failed for {1}".format(
                destination_image_path, e.message)

    def getSizeOfImage(self, image_path):
        try:
            img = Image.open(image_path)
            return img.size
        except Exception as e:
            print "Exception to open picture {0}, for {1}.".format(
                image_path, e.message)

    def getFileSize(self, file_path):
        try:
            fsize = os.path.getsize(file_path)
            fsize = fsize / float(1024)
            return round(fsize, 2)
        except Exception as e:
            print "Exception to get file size of {0}, for {1}.".format(
                file_path, e.message)

    def getFileList(self, directory):
        file_list = []
        isFilePathExists = os.path.exists(directory)
        if isFilePathExists is True:
            file_list = os.listdir(directory)
        return file_list

    def isFileExists(self, file_path):
        return os.path.exists(file_path)

    def deleteFile(self, file_path):
        try:
            print "Start to delete file: {0}".format(file_path)
            os.remove(file_path)
            print "Finished to delete file: {0}".format(file_path)
        except Exception as e:
            print "Exception to delete file: {0} for : {1}".format(
                file_path, e.message)

    def tar(self, directory):
        file_list = os.listdir(directory)
        if len(file_list) == 0:
            print "There is no file to compress for: {0}".format(directory)
            return
        try:
            print "Start to compress directory: {0}".format(directory)
            t = tarfile.open(directory + ".tar.gz", "w:gz")
            for root, dirs, files in os.walk(directory):
                for file in files:
                    fullpath = os.path.join(root, file)
                    t.add(fullpath)
            t.close()
            print "Finished to compress directory: {0}".format(directory)
        except Exception as e:
            print "Exception to compress directory: {0} for :{1}".format(
                directory, e.message)

    def tarList(self, directory):
        file_list = os.listdir(directory)
        if len(file_list) == 0:
            print "There is no file to compress for: {0}".format(directory)
            return
        try:
            print "Start to compress directory: {0}".format(directory)
            lst = []
            t = tarfile.open(directory + ".tar.gz", "w:gz")
            for root, dirs, files in os.walk(directory):
                for file in files:
                    fullpath = os.path.join(root, file)
                    t.add(fullpath)
                    lst.append(fullpath)
            t.close()
            print "Finished to compress directory: {0}".format(directory)
            return lst
        except Exception as e:
            print "Exception to compress directory: {0} for :{1}".format(
                directory, e.message)
            return []

    def isCamelReadyToRun(self, settings):
        if self.isWorkTime(settings.START_TIME, settings.END_TIME) is False:
            return False
        if self.isConcurrencyAllowToRun(self.concurrency_refresh_file,
                                        self.refresh_concurrency_interval,
                                        self.concurrency_file,
                                        self.max_concurrency) is False:
            return False
        if self.isExceedRestartInterval(settings.RESTART_PATH,
                                        settings.RESTART_INTERVAL) is False:
            self.recoveryConcurrency(self.concurrency_file,
                                     self.max_concurrency)
            return False
        return True

    def isSpiderReadyToRun(self):
        return self.isConcurrencyAllowToRun(
            self.concurrency_refresh_file_spider,
            self.refresh_concurrency_interval_spider,
            self.concurrency_file_spider, self.max_concurrency_spider)

    def isWorkTime(self, start_time, end_time):
        if self.isNumber(start_time) is False:
            print 'start time is not a number'
            return False
        if self.isNumber(end_time) is False:
            print 'end time is not a number'
            return False
        if self.isAfterHour(start_time) and self.isBeforeHour(end_time):
            print 'it is work time'
            return True
        else:
            print 'it is not work time before {0} or after {1}'.format(
                start_time, end_time)
            return False

    def isAfterHour(self, hour):
        if self.isNumber(hour) is False:
            print 'input hour is not a number.'
            return
        current_time = time.strftime('%H', time.localtime(time.time()))
        if int(hour) < int(current_time):
            return True
        else:
            return False

    def isBeforeHour(self, hour):
        if self.isNumber(hour) is False:
            print 'input hour is not a number.'
            return
        current_time = time.strftime('%H', time.localtime(time.time()))
        if int(hour) >= int(current_time):
            return True
        else:
            return False

    def readFile(self, file):
        waiting = 0
        data = self.file.readFromTxt(file).strip()
        while self.isEmpty(data):
            print 'file {0} is under update, waiting... {1} s'.format(
                file, waiting)
            time.sleep(1)
            waiting += 1
            data = self.file.readFromTxt(file).strip()
        return data

    def isConcurrencyAllowToRun(self, concurrency_refresh_file,
                                refresh_concurrency_interval, concurrency_file,
                                max_concurrency):
        self.updateConcurrencyFile(concurrency_refresh_file,
                                   refresh_concurrency_interval,
                                   concurrency_file, max_concurrency)
        isFilePathExists = os.path.exists(concurrency_file)
        if isFilePathExists is False:
            print 'concurrency file does not exist; creating a new one with max concurrency: {0}'.format(
                str(max_concurrency))
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))
        concurrency_available = int(self.readFile(concurrency_file))
        print 'concurrency file exists : {0}'.format(
            str(concurrency_available))
        if int(concurrency_available) > 0:
            print 'app is able to run.'
            new_concurrency_available = concurrency_available - 1
            print 'new concurrency is : {0}'.format(
                str(new_concurrency_available))
            self.file.writeToTxtCover(concurrency_file,
                                      str(new_concurrency_available))
            return True
        else:
            print 'app is not able to run for no available concurrency.'
            return False

    def recoveryConcurrency(self, concurrency_file, max_concurrency):
        isFilePathExists = os.path.exists(concurrency_file)
        if isFilePathExists is False:
            print 'concurrency file does not exist; creating a new one with max concurrency: {0}'.format(
                str(max_concurrency))
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))
            return
        concurrency_available = int(self.readFile(concurrency_file))
        print 'concurrency file exists and start to recovery: {0}'.format(
            str(concurrency_available))
        if int(concurrency_available) < max_concurrency:
            print 'start to recover concurrency.'
            new_concurrency_available = concurrency_available + 1
            print 'new concurrency is : {0}'.format(
                str(new_concurrency_available))
            self.file.writeToTxtCover(concurrency_file,
                                      str(new_concurrency_available))
        else:
            print 'concurrency is abnormal; writing max concurrency to it.'
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))

    def updateConcurrencyFile(self, concurrency_refresh_file,
                              refresh_concurrency_interval, concurrency_file,
                              max_concurrency):
        if self.isExceedRestartInterval(concurrency_refresh_file,
                                        refresh_concurrency_interval) is True:
            print 'refresh concurrency file: {0}'.format(str(max_concurrency))
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))

    def createCamelData(self, title, url, id, download_time, source):
        return camelDto(title, url, id, download_time, source)

    def createCamelMongoJson(self, camelDto):
        return {
            'title': camelDto.title,
            'url': camelDto.url,
            'id': camelDto.id,
            'download_time': camelDto.download_time,
            'source': camelDto.source
        }

    def createSpiderData(self, url, origin_url, public_time, author_name,
                         title, id, download_time, source, images,
                         is_open_cache, content):
        return spiderDto(url, origin_url, public_time, author_name, title, id,
                         download_time, source, images, is_open_cache, content)

    def createSpiderMongoJson(self, spiderDto):
        return {
            'url': spiderDto.url,
            'origin_url': spiderDto.origin_url,
            'public_time': spiderDto.public_time,
            'author_name': spiderDto.author_name,
            'title': spiderDto.title,
            'id': spiderDto.id,
            'download_time': spiderDto.download_time,
            'source': spiderDto.source,
            'images': spiderDto.images,
            'is_open_cache': spiderDto.is_open_cache
        }

    def updateImages(self, images, newImages):
        for image in newImages:
            data = image.strip()
            if self.isEmpty(data) is False and data not in images:
                images.append(data)

    def completeImageUrls(self, newImages, current_url):
        result = []
        if len(newImages) == 0:
            print 'No images urls to process'
            return result
        for url in newImages:
            entireUrl = urlparse.urljoin(current_url, url).strip()
            if re.match('https', entireUrl) is not None:
                result.append(entireUrl)
        return result

    def getSitesInfo(self, isdebug=False):
        if isdebug:
            site_info_path = self.sites_debug
        else:
            site_info_path = self.sites_info
        content = self.file.readFromTxt(site_info_path)
        if self.isEmpty(content):
            print 'sites info is empty'
            return None
        sitesInfo = content.split('[SITE]')
        results = []
        for site in sitesInfo:
            if self.isEmpty(site):
                continue
            results.append(self.extractSiteInfo(site))
        return results

    def extractSiteInfo(self, siteInfo):
        items = siteInfo.split('\n')
        result = siteInfoDto(domain=None,
                             name=None,
                             restart_interval=None,
                             url_parallel_number=None,
                             content_parallel_number=None,
                             is_open_cache=None,
                             work_time_start=None,
                             work_time_end=None,
                             good_keys=[],
                             bad_keys=[],
                             href_items=[],
                             href=[],
                             url_match=[],
                             url_title_match=[],
                             url_id_tag=[],
                             content_match=[],
                             content_child_match=[],
                             content_url_match=[],
                             content_id_tag=[],
                             article_match=[],
                             content_title_match=[],
                             content_image_match=[],
                             content_time_match=[],
                             need_self_image=None,
                             url_timeout=None,
                             content_timeout=None)
        for item in items:
            if self.isEmpty(item):
                continue
            content = item.split('==')
            key = ''.join(content[0]).strip()
            value = ''.join(content[1]).strip()
            if key == 'DOMAIN':
                result.domain = value
                continue
            if key == 'NAME':
                result.name = value
                continue
            if key == 'RESTARTINTERVAL':
                result.restart_interval = int(value)
                continue
            if key == 'URLPARALLELNUMBER':
                result.url_parallel_number = int(value)
                continue
            if key == 'CONTENTPARALLELNUMBER':
                result.content_parallel_number = int(value)
                continue
            if key == 'ISOPENCACHE':
                result.is_open_cache = value == 'True'
                continue
            if key == 'WORKTIMESTART':
                result.work_time_start = int(value)
                continue
            if key == 'WORKTIMEEND':
                result.work_time_end = int(value)
                continue
            if key == 'GOODKEYS':
                if self.isEmpty(value) is False:
                    result.good_keys.append(value)
                continue
            if key == 'BADKEYS':
                if self.isEmpty(value) is False:
                    result.bad_keys.append(value)
                continue
            if key == 'URLMATCH':
                if self.isEmpty(value) is False:
                    result.url_match.append(self.extractRegxRule(value))
                continue
            if key == 'URLTITLEMATCH':
                if self.isEmpty(value) is False:
                    result.url_title_match.append(self.extractHtmlTag(value))
                continue
            if key == 'URLIDTAG':
                if self.isEmpty(value) is False:
                    result.url_id_tag.append(self.extractHtmlTag(value))
                continue
            if key == 'CONTENTURLMATCH':
                if self.isEmpty(value) is False:
                    result.content_url_match.append(
                        self.extractRegxRule(value))
                continue
            if key == 'CONTENTIDTAG':
                if self.isEmpty(value) is False:
                    result.content_id_tag.append(self.extractHtmlTag(value))
                continue
            if key == 'HREFITEMS':
                if self.isEmpty(value) is False:
                    result.href_items.append(self.extractHtmlTag(value))
                continue
            if key == 'HREF':
                if self.isEmpty(value) is False:
                    result.href.append(self.extractHtmlTag(value))
                continue
            if key == 'ARTICLEMATCH':
                if self.isEmpty(value) is False:
                    result.article_match.append(self.extractHtmlTag(value))
                continue
            if key == 'CONTENTMATCH':
                if self.isEmpty(value) is False:
                    result.content_match.append(self.extractHtmlTag(value))
                continue
            if key == 'CONTENTCHILDMATCH':
                if self.isEmpty(value) is False:
                    result.content_child_match.append(
                        self.extractHtmlTag(value))
                continue
            if key == 'CONTENTTITLEMATCH':
                if self.isEmpty(value) is False:
                    result.content_title_match.append(
                        self.extractHtmlTag(value))
                continue
            if key == 'CONTENTIMAGEMATCH':
                if self.isEmpty(value) is False:
                    result.content_image_match.append(
                        self.extractHtmlTag(value))
                continue
            if key == 'CONTENTTIMEMATCH':
                if self.isEmpty(value) is False:
                    result.content_time_match.append(
                        self.extractHtmlTag(value))
                continue
            if key == 'NEEDSELFIMAGE':
                if self.isEmpty(value) is False:
                    result.need_self_image = value == 'True'
            if key == 'NEEDSELFHTML':
                if self.isEmpty(value) is False:
                    result.need_self_html = value == 'True'
            if key == 'URLTIMEOUT':
                if self.isEmpty(value) is False:
                    result.url_timeout = value
            if key == 'CONTENTTIMEOUT':
                if self.isEmpty(value) is False:
                    result.content_timeout = value
        return result

    def getUrlId(self, url, idTag):
        id = None
        for item in idTag:
            matchItem = item.regx
            if matchItem in url:
                index = url.index(item.regx) + item.index
                if len(url) <= index:
                    continue
                id = url[index]
                if id == None:
                    continue
        return id

    def extractRegxRule(self, regxMatch):
        return re.compile(regxMatch)

    def extractHtmlTag(self, regxMatch):
        items = regxMatch.split('|')
        id = ''.join(items[0]).strip()
        index = int(items[1])
        return regxMatchDto(id, index)

    def getMatchContent(self, content, regx):
        if regx.index == -1 or len(content) == 0:
            return content
        return content[regx.index]

    def uploadFileApi(self, url, fileName, fullPath):
        try:
            with open(fullPath, mode="rb") as f:
                file = {"file": (fileName, f.read())}

                encode_data = encode_multipart_formdata(file)

                file_data = encode_data[0]
                headers_from_data = {"Content-Type": encode_data[1]}
                response = requests.post(url=url,
                                         headers=headers_from_data,
                                         data=file_data).json()
                if response['code'] != 200:
                    print 'Fail to upload file {0} through api {1}'.format(
                        fileName, url)
                    return False
                print 'Success to upload file {0} through api {1}'.format(
                    fileName, url)
                return True
        except Exception as e:
            print 'Exception to upload file {0} through api {1} : {2}'.format(
                fileName, url, e.message)
            return False
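
A short sketch of the pure string/date helpers above. Constructing Doraemon requires a reachable Redis instance per Settings, so this assumes one is available:

if __name__ == '__main__':
    d = Doraemon()  # assumes the Redis host/port from Settings are reachable
    print d.getNumberFromString('2019年3月5日')  # '201935'
    print d.getDateFromString('2019年3月5日')    # '2019-03-05'
    print d.isEmpty('   ')                       # True
    print d.getMD5(u'deepinews')                 # 32-character hex digest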
Example #6
class WoshipmReceptor():

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "whoispm_receptor"
        self.finished_ids = "woshipm_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"
        self.regx = re.compile("/u/[0-9]{0,}")

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_contens = html.xpath("./a")
        if len(href_contens) == 0:
            print 'No data for: {0}'.format(key)
            return
        for item in href_contens:
            href = item.xpath("@href")
            title_content = item.xpath(".//text()")
            title = "".join(title_content).strip()
            if len(href) > 0 and title == key:
                isValidUrl = self.regx.match(href[0])
                if isValidUrl is None:
                    print 'Invalid url for not match: {0}'.format(href[0])
                    continue
                url = "http://www.woshipm.com{0}".format(href[0])
                self.doraemon.hashSet(self.finished_ids, url, url)
                data = {
                    'id': key,
                    'url': url
                }
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/woshipm_receptor.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            key = key.strip()
            if key not in all_finished_id:
                name = key.strip()
                tmp_url = "http://www.woshipm.com/search-posts?k={0}".format(name)
                new_urls.append([tmp_url, name])
            else:
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
Example #7
class FengReceptor():

    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "feng_receptor"
        self.finished_ids = "feng_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        key = response['request_title'].strip()
        body = response['response'].page_source.encode('utf-8')
        body = body[body.find('(') + 1:-21]
        body = body.replace('null', 'None')
        dics = eval(body)
        if len(dics['items']) == 0:
            print 'No data for: {0}'.format(key)
            self.doraemon.hashSet(self.finished_ids, key, key)
            return
        for item in dics['items']:
            name = item['name'].replace('&lt;','').replace('em&gt;','').replace('\\/','')
            id = item['id']
            if len(id) > 0 and name == key:
                url = "https://feng.ifeng.com/author/{0}".format(id)
                self.doraemon.hashSet(self.finished_ids, key, key)
                data = {
                    'id': key,
                    'url': url
                }
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/feng_receptor.txt'
        gonzhonghao = self.file.readFromTxt(txt_path)
        keys = gonzhonghao.split('\n')

        for key in keys:
            key = key.strip()
            if key not in all_finished_id:
                name = key.strip()
                tmp_url = "https://so.v.ifeng.com/websearch/ifeng-search-server/sub/websearch?k={0}&page=1&distinct=1&n=10&hl=1&os=ios&gv=6.2.5&uid=70b6a1d8f6c64618bf9dfa092fc4e34c&callback=getData".format(name)
                new_urls.append([tmp_url, name])
            else:
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)

        if len(new_urls) == 0:
            print 'No more urls.'
            return

        request = BrowserRequest()
        request.start_chrome(new_urls, 5, self.log_path, None, callback=self.parse)
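
The parse method above unwraps a JSONP response (getData({...})) by slicing the raw page source and eval'ing the payload. A hedged alternative sketch, assuming the text between the parentheses is plain JSON: extract it with a regular expression and decode it with json, which avoids eval on remote content (parse_jsonp is a hypothetical helper, not part of the original code):

import json
import re

def parse_jsonp(page_source):
    # pull the {...} body out of a wrapper such as getData({...});
    match = re.search(r'\((\{.*\})\)', page_source, re.S)
    if match is None:
        return None
    return json.loads(match.group(1))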
Example #8
class StoreFiles():
    def __init__(self,
                 htmlpath=None,
                 imagepath=None,
                 templatepath=None,
                 articleurl=None,
                 alidomain=None,
                 alidomaindeepinews=None,
                 alidomaindeepinewsimg=None,
                 ipwebserver0=None,
                 portwebserver0=None,
                 userrootwebserver0=None,
                 userrootpasswordwebserver0=None,
                 htmlwebserver0=None,
                 needselfimage=None,
                 needselfhtml=None,
                 localhtmlpath=None,
                 logpath=None):
        self.doraemon = Doraemon()
        self.file = FileIOMiddleware()
        self.image_count = 0
        self.htmlpath = htmlpath
        self.imagepath = imagepath
        self.templatepath = templatepath
        self.articleurl = articleurl
        self.alidomain = alidomain
        self.alidomaindeepinews = alidomaindeepinews
        self.alidomaindeepinewsimg = alidomaindeepinewsimg
        self.ipwebserver0 = ipwebserver0
        self.portwebserver0 = portwebserver0
        self.userrootwebserver0 = userrootwebserver0
        self.userrootpasswordwebserver0 = userrootpasswordwebserver0
        self.htmlwebserver0 = htmlwebserver0
        self.needselfimage = needselfimage
        self.needselfhtml = needselfhtml
        self.localhtmlpath = localhtmlpath
        self.logpath = logpath

    def parseContentRegxRule(self, content_regx_rule):
        result = matchRules(None, None, None)
        rules = [
            matchRules(r'[\.][\/][\/](.*?)[[]', r'[\@](.*?)[\=]',
                       r'[\'](.*?)[\']'),
            matchRules(r'[\.][\/][\/](.*?)[[]', r'[\@](.*?)[,]',
                       r'[\'](.*?)[\']'),
        ]
        for rule in rules:
            tag = re.findall(rule.tag, content_regx_rule)
            key = re.findall(rule.key, content_regx_rule)
            value = re.findall(rule.value, content_regx_rule)
            if self.doraemon.isEmpty(tag) is False and \
               self.doraemon.isEmpty(key) is False and \
               self.doraemon.isEmpty(value) is False:
                result.tag = tag[0]
                result.key = key[0]
                result.value = value[0]
                break
        return result

    def addHighlightTextInner(self, content):
        return '<strong class="article_paragraph_border">{0}</strong>'.format(content) + \
               '<p class="article_paragraph">' + \
                    '<br class="article_paragraph_border"/>' + \
               '</p>'

    def addHighlightTextOuter(self, node, content):
        return '{0}<p class="article_paragraph">'.format(node) + \
                      '<strong class="article_paragraph_border">{0}</strong>'.format(content) + \
                  '</p>' + \
                  '<p class="article_paragraph">' + \
                      '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def addImgNode(self, node, dataSrc, dataRef, width, dataRatio):
        if width is None:
            width = 1000
        return '{0}<p class="article_paragraph_imag">'.format(node) + \
                      '<img data-ratio="{0}"'.format(dataRatio) + \
                           ' data-src="{0}"'.format(dataSrc) + \
                           ' data-ref="{0}"'.format(dataRef) + \
                           ' data-type="jpeg"' + \
                           ' data-w="{0}"'.format(width) + \
                           ' class="article_paragraph_img"/>' + \
                  '</p>' + \
                  '<p class="article_paragraph">' + \
                     '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def addTextNodeOuter(self, node, content):
        if self.doraemon.isEmpty(content):
            return ''
        return '{0}<p class="article_paragraph">{1}'.format(node, content) + \
                  '</p>' + \
                  '<p class="article_paragraph">' + \
                      '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def addParagraphGapNode(self, node):
        return '{0}<p class="article_paragraph">'.format(node) + \
                     '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def addH1Node(self, node, content):
        return '{0}<p label="h1" class="article_paragraph_h1">'.format(node) + \
                    '<span class="article_paragraph_h1_1">' + \
                        '<span class="article_paragraph_h1_1_1">' + \
                            '<span class="article_paragraph_h1_1_1_1">{0}'.format(content) + \
                            '</span>' + \
                        '</span>' + \
                    '</span>' + \
                  '</p>' + \
                  '<p class="article_paragraph">' + \
                     '<br class="article_paragraph_border"/>' + \
                  '</p>'

    def extractImgSize(self, style, mode):
        size = re.findall(r'{0}:(.*?)px;'.format(mode), style)
        if len(size) == 1:
            return size[0].strip()
        return None
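    # For instance, extractImgSize("width:640px;height:360px;", 'width') returns
    # the string '640' (not an int), and None when the style has no such entry.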

    def extractImg(self, url, node):
        result = imgInfo(None, None, None)
        if isinstance(node, NavigableString):
            return None
        if node.name != 'img' and len(node.contents) == 0:
            return result
        if node.name == 'img':
            if node.attrs.has_key('src') and self.doraemon.isEmpty(result.src):
                if 'data:image/' not in node.attrs['src']:
                    result.src = node.attrs['src']
            if node.attrs.has_key('_src') and self.doraemon.isEmpty(
                    result.src):
                if 'data:image/' not in node.attrs['_src']:
                    result.src = node.attrs['_src']
            if node.attrs.has_key('data-original') and self.doraemon.isEmpty(
                    result.src):
                if 'data:image/' not in node.attrs['data-original']:
                    result.src = node.attrs['data-original']
            if node.attrs.has_key('data-src') and self.doraemon.isEmpty(
                    result.src):
                if 'data:image/' not in node.attrs['data-src']:
                    result.src = node.attrs['data-src']
            if node.attrs.has_key('data-lazy-src') and self.doraemon.isEmpty(
                    result.src):
                if 'data:image/' not in node.attrs['data-lazy-src']:
                    result.src = node.attrs['data-lazy-src']
            if node.attrs.has_key('width') and result.width == None:
                result.width = node.attrs['width']
            if node.attrs.has_key('height') and result.height == None:
                result.height = node.attrs['height']
            if node.attrs.has_key('data-w') and result.width == None:
                result.width = node.attrs['data-w']
            if node.attrs.has_key('data-h') and result.height == None:
                result.height = node.attrs['data-h']
            if node.attrs.has_key('data-backh') and result.height == None:
                result.height = node.attrs['data-backh']
            if node.attrs.has_key('data-backw') and result.width == None:
                result.width = node.attrs['data-backw']
            if node.attrs.has_key('data-wscnh') and result.height == None:
                result.height = node.attrs['data-wscnh']
            if node.attrs.has_key('data-wscnw') and result.width == None:
                result.width = node.attrs['data-wscnw']
            if node.attrs.has_key('style') and (result.width == None
                                                or result.height == None):
                # only fill in dimensions that are still missing, so a width or
                # height found in the attributes above is not overwritten
                if result.width == None:
                    result.width = self.extractImgSize(node.attrs['style'],
                                                       'width')
                if result.height == None:
                    result.height = self.extractImgSize(node.attrs['style'],
                                                        'height')
            if isinstance(result.width, int) and isinstance(
                    result.height, int):
                result.dataRatio = float(
                    float(result.height) / float(result.width))
            if result.src != None:
                result.src = urlparse.urljoin(url, result.src).strip()
            return result
        if len(node.contents) > 0:
            for n in node.contents:
                found = self.extractImg(url, n)
                # only stop the depth-first search once a real image src is found
                if found != None and found.src != None:
                    return found
        return result

    def nodeTraversal(self, url, node, newNode, articleId):
        if node.name == 'strong' and \
           node.parent.name == 'div' and \
           self.doraemon.isEmpty(node.string) is False:
            # addHighlightTextOuter already prepends newNode, so assign directly
            newNode = self.addHighlightTextOuter(newNode, node.string)
        if node.name in ('h1', 'h2', 'h3', 'h4') and \
           self.doraemon.isEmpty(node.string) is False:
            # addH1Node already prepends newNode, so assign directly
            newNode = self.addH1Node(newNode, node.string)
        if isinstance(node, NavigableString) or \
           node.name == 'a' or \
           node.name == 'p' or \
           node.name == 'span' or \
           node.name == 'section':
            if isinstance(node, NavigableString):
                newNode = self.addTextNodeOuter(newNode, str(node))
            else:
                if self.doraemon.isEmpty(node.text) == False:
                    newNode = self.addTextNodeOuter(newNode, node.text)
        img = self.extractImg(url, node)
        updatedNode = updateNode(False, newNode, None, None)
        if img != None and img.src != None:
            updatedNode.isImageNode = True
            updatedNode.imageOriginUrl = img.src
            updatedNode.imageNewUrl = img.src
            try:
                imageType = self.doraemon.getImageTypeFromUrl(
                    updatedNode.imageOriginUrl)
                imageId = '{0}_{1}'.format(articleId, self.image_count)
                newImageName = '{0}.{1}'.format(imageId, imageType)
                if self.doraemon.downloadImage(updatedNode.imageOriginUrl,
                                               self.imagepath, newImageName):
                    imageInfo = Image.open('{0}/{1}'.format(
                        self.imagepath, newImageName))
                    if self.doraemon.isEmpty(imageInfo.width) is False:
                        img.width = imageInfo.width
                    if self.doraemon.isEmpty(imageInfo.height) is False:
                        img.height = imageInfo.height
                    if isinstance(img.width, int) and isinstance(
                            img.height, int):
                        img.dataRatio = float(
                            float(img.height) / float(img.width))
                    if self.needselfimage:
                        updatedNode.imageNewUrl = 'https://{0}.{1}/{2}/{3}'.format(
                            self.alidomaindeepinews, self.alidomain,
                            self.alidomaindeepinewsimg, newImageName)
                        imageUpload = AliUpload(
                            '{0}'.format(self.imagepath), newImageName,
                            '{0}'.format(self.alidomaindeepinews),
                            '{0}'.format(self.alidomaindeepinewsimg))
                        if imageUpload.start():
                            # addImgNode already prepends newNode, so assign its
                            # result directly instead of formatting newNode in
                            # front of it again (which duplicated the content)
                            updatedNode.node = self.addImgNode(
                                newNode, updatedNode.imageNewUrl,
                                updatedNode.imageNewUrl, img.width,
                                img.dataRatio)
                            self.image_count += 1
                    else:
                        updatedNode.node = self.addImgNode(
                            newNode, img.src, img.src, img.width,
                            img.dataRatio)
                else:
                    updatedNode.node = self.addImgNode(
                        newNode, img.src, img.src, img.width, img.dataRatio)
            except Exception as e:
                updatedNode.node = self.addImgNode(
                    newNode, img.src, img.src, img.width, img.dataRatio)
                print 'Exception {0} to download image: {1}'.format(
                    e.message, updatedNode.imageOriginUrl)

        return updatedNode

    def updateTemplate(self, template, articleHeadDescription,
                       articleHeadAuthor, articleHeadTitle,
                       articleHeadOriginUrl, articleBodyTitle,
                       articleBodyAuthor, articleBodyPublishTime,
                       articleBodyParagraph, articleBodyOriginUrl):
        template = template.replace('ArticleHeadDescription',
                                    articleHeadDescription)
        template = template.replace('ArticleHeadAuthor', articleHeadAuthor)
        template = template.replace('ArticleHeadTitle', articleHeadTitle)
        template = template.replace('ArticleHeadOriginUrl',
                                    articleHeadOriginUrl)
        template = template.replace('ArticleBodyTitle', articleBodyTitle)
        template = template.replace('ArticleBodyAuthor', articleBodyAuthor)
        template = template.replace('ArticleBodyPublishTime',
                                    articleBodyPublishTime)
        template = template.replace('ArticleBodyParagraph',
                                    articleBodyParagraph)
        template = template.replace('ArticleBodyOriginUrl',
                                    articleBodyOriginUrl)
        return template
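    # Hedged sketch of the placeholder tokens updateTemplate expects to find in
    # the template file (the real template_1.html is not shown here), e.g.:
    #   <title>ArticleHeadTitle</title>
    #   <meta name="description" content="ArticleHeadDescription"/>
    #   <meta name="author" content="ArticleHeadAuthor"/>
    #   <h1>ArticleBodyTitle</h1> by ArticleBodyAuthor, ArticleBodyPublishTime
    #   ArticleBodyParagraph
    #   <a href="ArticleBodyOriginUrl">ArticleHeadOriginUrl</a>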

    def hasText(self, nodes):
        for node in nodes:
            if isinstance(node, NavigableString):
                continue
            if node.name == 'img' or \
               node.name == 'a' or \
               node.name == 'p' or \
               node.name == 'span' or \
               node.name == 'section':
                return True
        return False

    def goDeepToArticleBody(self, contents):
        if isinstance(contents, NavigableString):
            return contents
        if len(contents) == 0:
            return contents
        if self.hasText(contents):
            return contents
        for n in contents:
            if isinstance(n, NavigableString):
                continue
            return self.goDeepToArticleBody(n.contents)
        # fall back to the current level so the caller never iterates over None
        return contents

    def storeFiles(self, data, page_source, content_regx_rule):
        if self.needselfhtml == False:
            return data
        try:
            self.image_count = 0
            newData = copy.copy(data)
            newArticleId = self.doraemon.getMD5('{0}_{1}'.format(
                data.author_name, data.id))
            newData.url = '{0}{1}.html'.format(self.articleurl, newArticleId)
            template = self.file.readFromTxt(self.templatepath)
            match = self.parseContentRegxRule(content_regx_rule)
            if match.tag is None or \
               match.key is None or \
               match.value is None:
                print 'No match rule available for html'
                return data
            soup = BeautifulSoup(page_source, 'lxml')
            matchTags = soup.select('{0}[{1}="{2}"]'.format(
                match.tag, match.key, match.value))
            if len(matchTags) == 0:
                print 'No tag matched for html'
                return data
            nodes = self.goDeepToArticleBody(matchTags[0].contents)
            articleContent = ''
            for node in nodes:
                if isinstance(node, NavigableString):
                    continue
                if self.doraemon.isEmpty(node):
                    continue
                newNode = ''
                updateNodeInfo = self.nodeTraversal(data.url, node, newNode,
                                                    newArticleId)
                articleContent = '{0}{1}'.format(articleContent,
                                                 updateNodeInfo.node)
                if updateNodeInfo.isImageNode:
                    if updateNodeInfo.imageOriginUrl in newData.images:
                        for i in newData.images:
                            if updateNodeInfo.imageOriginUrl in i or \
                               updateNodeInfo.imageOriginUrl == i:
                                newData.images[newData.images.index(
                                    i)] = updateNodeInfo.imageNewUrl
                    else:
                        newData.images.append(updateNodeInfo.imageNewUrl)
            template = self.updateTemplate(template, newData.title,
                                           '深度资讯DeepINews', newData.title,
                                           newData.url, newData.title,
                                           newData.source, newData.public_time,
                                           articleContent, data.url)
            if self.doraemon.storeHtml(newArticleId, template, self.htmlpath):
                htmlName = '{0}.html'.format(newArticleId)
                fromFile = '{0}/{1}'.format(self.htmlpath, htmlName)
                toFile = '{0}/{1}'.format(self.localhtmlpath, htmlName)
                if self.doraemon.copyFile(fromFile, toFile):
                    print 'Copy file {0} done.'.format(fromFile)
                    return newData
                else:
                    message1 = 'Copy file {0} fail.'.format(fromFile)
                    print message1
                    self.file.logger(self.logpath, message1)
            return data
        except Exception as e:
            message2 = 'Exception {0} when update : {1}'.format(
                e.message, data.url)
            print message2
            self.file.logger(self.logpath, message2)
            return data
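# Hedged usage sketch (illustrative paths only, not the project's real settings):
# storeFiles() expects a data object exposing author_name, id, url, title,
# source, public_time and images, plus the page source and an XPath-like rule.
# store = StoreFiles(htmlpath='/tmp/html', imagepath='/tmp/img',
#                    templatepath='/tmp/template_1.html',
#                    articleurl='https://example.com/article/',
#                    needselfimage=False, needselfhtml=True,
#                    localhtmlpath='/tmp/local', logpath='/tmp/log')
# newData = store.storeFiles(data, page_source, ".//div[@class='article-content']")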
Ejemplo n.º 9
0
class SSHUpload():
    def __init__(self):
        self.settings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()

    def writeBack(self, fromFiles):
        print "Start to update retry file: {0}".format(
            self.settings.RETRY_FILE)
        writeBackContent = ''
        for file in fromFiles:
            writeBackContent = '{0}{1}\n'.format(writeBackContent, file)
        self.file.writeToTxtAdd(self.settings.RETRY_FILE, writeBackContent)
        print "Finished to update retry file: {0}".format(
            self.settings.RETRY_FILE)

    def updateRemoveFile(self, fromFiles):
        if self.doraemon.isEmpty(fromFiles):
            print "No need to update to remove retry file."
            return
        content = self.readFile()
        print "Start to delete retry file: {0}".format(
            self.settings.RETRY_FILE)
        os.remove(self.settings.RETRY_FILE)
        print "Finished to delete retry file: {0}".format(
            self.settings.RETRY_FILE)
        for file in fromFiles:
            if file in content:
                del content[content.index(file)]
            else:
                content.append(file)
        self.writeBack(content)

    def updateAddFile(self, fromFiles):
        if self.doraemon.isEmpty(fromFiles):
            print "No need to update to add retry file"
            return
        content = self.readFile()
        if self.doraemon.isEmpty(content):
            self.writeBack(fromFiles)
            return
        print "Start to delete retry file: {0}".format(
            self.settings.RETRY_FILE)
        os.remove(self.settings.RETRY_FILE)
        print "Finished to delete retry file: {0}".format(
            self.settings.RETRY_FILE)
        for file in content:
            if file not in fromFiles:
                fromFiles.append(file)
        self.writeBack(fromFiles)

    def readFile(self):
        files = []
        isRetryFileExists = os.path.exists(self.settings.RETRY_FILE)
        if isRetryFileExists == False:
            return files
        content = self.file.readFromTxt(self.settings.RETRY_FILE)
        if self.doraemon.isEmpty(content):
            return files
        items = content.split('\n')
        for item in items:
            if self.doraemon.isEmpty(item):
                continue
            files.append(item)
        return files

    def retry(self):
        while True:
            files = self.readFile()
            updateFiles = []
            try:
                for fromFile in files:
                    fileParts = re.split(r'[/]', fromFile)
                    fileName = fileParts[len(fileParts) - 1]
                    toFile = '{0}/{1}'.format(self.settings.HTML_WEBSERVER0,
                                              fileName)
                    if self.doraemon.sshUpload(
                            self.settings.IP_WEBSERVER0,
                            self.settings.PORT_WEBSERVER0,
                            self.settings.USER_ROOT_WEBSERVER0,
                            self.settings.USER_ROOT_PASSWORD_WEBSERVER0,
                            fromFile, toFile):
                        updateFiles.append(fromFile)
                        print 'Success to retry to upload: {0}'.format(
                            fromFile)
                self.updateRemoveFile(updateFiles)
            except Exception as e:
                self.updateRemoveFile(updateFiles)
                print 'Exception {0} to retry to upload: {1}'.format(
                    e.message, fromFile)

    def startUpload(self):
        fromFile = '{0}.tar.gz'.format(self.settings.LOCAL_HTML_PATH)
        if not os.listdir(self.settings.LOCAL_HTML_PATH) and os.path.exists(
                fromFile) is False:
            print 'no html file to tar'
            return
        uploadedList = []
        if os.path.exists(fromFile) is False:
            uploadedList = self.doraemon.tarList(self.settings.LOCAL_HTML_PATH)
        while os.path.exists(fromFile):
            try:
                if self.doraemon.uploadFileApi(self.settings.UPLOAD_HTML_API,
                                               'local.tar.gz', fromFile):
                    os.remove(fromFile)
                    for file in uploadedList:
                        self.doraemon.deleteFile(file)
                    print 'Success to upload html file: {0}'.format(fromFile)
            except Exception as e:
                print 'Exception {0} to upload html file: {1}'.format(
                    e.message, fromFile)
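# Hedged usage sketch: startUpload() tars LOCAL_HTML_PATH and posts the archive
# to UPLOAD_HTML_API, while retry() loops forever re-uploading the paths listed
# in RETRY_FILE over SSH, so it is normally run in a separate process.
# uploader = SSHUpload()
# uploader.startUpload()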
Ejemplo n.º 10
0
class UpdateMonitorFiles():
    def __init__(self, siteinfo=None):
        self.siteinfo = siteinfo
        self.globalSettings = Settings()
        self.doraemon = Doraemon()
        self.getSettings()
        self.file = FileIOMiddleware()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.work_path_prd4 = self.settings.WORK_PATH_PRD1
        self.work_path_prd3 = self.settings.WORK_PATH_PRD2
        self.content_backup_path = self.settings.FINISHED_BACKUP_PATH
        self.content_backup_post_path = self.settings.FINISHED_BACKUP_POST_PATH
        self.url_backup_path = self.settings.URL_BACKUP_PATH
        self.url_backup_post_path = self.settings.URL_BACKUP_POST_PATH
        self.monitor_site_template_path = self.globalSettings.MONITOR_SITE_TEMPLATE_PATH
        self.monitor_spiders_template_path = self.globalSettings.MONITOR_SPIDERS_TEMPLATE_PATH
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.monitor_site_webserver0 = self.globalSettings.MONITOR_SITE_HTML_WEBSERVER0
        self.monitor_site_url = self.globalSettings.MONITOR_SITE_URL
        self.monitor_upload_webserver0 = self.globalSettings.MONITOR_UPLOAD_PATH_WEBSERVER0

    def updateSpiders(self, siteName, ycount1, tcount1, turl1, diff1, ycount2,
                      tcount2, turl2, diff2):
        return '<tr>' + \
                    '<th align="center" valign="middle">{0}</th>'.format(siteName) + \
                    '<td align="center" valign="middle">{0}</td>'.format(ycount1) + \
                    '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl1, tcount1) + \
                    '<td align="center" valign="middle">{0}</td>'.format(diff1) + \
                    '<td align="center" valign="middle">{0}</td>'.format(ycount2) + \
                    '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl2, tcount2) + \
                    '<td align="center" valign="middle">{0}</td>'.format(diff2) + \
               '</tr>'

    def updateSite(self, number, title, url):
        return '<tr>' + \
                     '<td align="center" valign="middle">{0}</td>'.format(number) + \
                     '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(url, title) + \
               '</tr>'

    def uploadFile(self, fromFile, toFile):
        while os.path.exists(fromFile):
            try:
                if self.doraemon.sshUpload(
                        self.globalSettings.IP_WEBSERVER0,
                        self.globalSettings.PORT_WEBSERVER0,
                        self.globalSettings.USER_ROOT_WEBSERVER0,
                        self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0,
                        fromFile, toFile):
                    print 'Success to retry to upload monitor file: {0}'.format(
                        fromFile)
                    return True
            except Exception as e:
                print 'Exception {0} to upload monitor site file: {1}'.format(
                    e.message, fromFile)
                return False

    def updateSingleSite(self, preBackupPath, postBackupPath, siteName):
        singleSiteData = singleSiteDto(self.siteinfo.name, 0, 0, None, 0)
        isPreBackupFileExists = os.path.exists(preBackupPath)
        isPostBackupFileExists = os.path.exists(postBackupPath)
        preCsvContent = None
        if isPreBackupFileExists:
            print "Start to read url back up file: {0}".format(
                self.settings.NAME)
            preCsvContent = self.file.readColsFromCSV(preBackupPath,
                                                      ['title', 'url'])
            singleSiteData.tcount = len(preCsvContent.values)
        else:
            print "Url back up file not exits: {0}".format(self.settings.NAME)
            singleSiteData.tcount = 0

        if isPostBackupFileExists:
            print "Start to read post url back up file: {0}".format(
                self.settings.NAME)
            postCsvContent = self.file.readColsFromCSV(postBackupPath,
                                                       ['title', 'url'])
            singleSiteData.ycount = len(postCsvContent.values)
        else:
            print "Post url back up file not exits: {0}".format(
                self.settings.NAME)
            singleSiteData.ycount = 0
        singleSiteData.diff = singleSiteData.tcount - singleSiteData.ycount
        if preCsvContent is not None:
            if preCsvContent.empty:
                print "No new back up url: {0}".format(self.settings.NAME)
            else:
                template = self.file.readFromTxt(
                    self.monitor_site_template_path)
                finalContent = ''
                number = 1
                for item in preCsvContent.values:
                    finalContent = "{0}{1}".format(
                        finalContent, self.updateSite(number, item[1],
                                                      item[0]))
                    number += 1
                template = template.replace(
                    'UpdateTime', self.doraemon.getCurrentLocalTime())
                template = template.replace('ServerName', siteName)
                template = template.replace('SiteName', self.siteinfo.name)
                template = template.replace('MainContent', finalContent)
                turl = '{0}{1}_{2}.html'.format(self.monitor_site_url,
                                                self.settings.NAME, siteName)
                singleSiteData.turl = turl
                uploadLocalHtmlPath = '{0}/{1}_{2}.html'.format(
                    self.monitor_upload_local, self.settings.NAME, siteName)
                self.file.writeToHtmlCover(uploadLocalHtmlPath, template)
        return singleSiteData

    def processAllSites(self, allSitesData=None):
        template = self.file.readFromTxt(self.monitor_spiders_template_path)
        mainContent = ''
        t = totalDto(0, 0, 0, 0, 0, 0)
        for data in allSitesData:
            mainContent = '{0}{1}'.format(
                mainContent,
                self.updateSpiders(data.prd3.sitename, data.prd3.ycount,
                                   data.prd3.tcount, data.prd3.turl,
                                   data.prd3.diff, data.prd4.ycount,
                                   data.prd4.tcount, data.prd4.turl,
                                   data.prd4.diff))
            t.prd3ytotal += data.prd3.ycount
            t.prd3ttotal += data.prd3.tcount
            t.prd4ytotal += data.prd4.ycount
            t.prd4ttotal += data.prd4.tcount
        t.prd3difftotal = t.prd3ttotal - t.prd3ytotal
        t.prd4difftotal = t.prd4ttotal - t.prd4ytotal
        mainContent = '{0}{1}'.format(
            mainContent,
            self.updateSpiders('Summary', t.prd3ytotal, t.prd3ttotal, '',
                               t.prd3difftotal, t.prd4ytotal, t.prd4ttotal, '',
                               t.prd4difftotal))
        template = template.replace('UpdateTime',
                                    self.doraemon.getCurrentLocalTime())
        template = template.replace('MainContent', mainContent)
        localHtmlPath = '{0}/index.html'.format(self.monitor_upload_local)
        self.file.writeToHtmlCover(localHtmlPath, template)
        self.doraemon.tar(self.monitor_upload_local)
        fromFile = '{0}.tar.gz'.format(self.monitor_upload_local)
        self.uploadFile(
            fromFile,
            '{0}/monitor.tar.gz'.format(self.monitor_upload_webserver0))
        os.remove(fromFile)

    def processSingleSite(self):
        spidersContent = allSitesDto(None, None)
        spidersContent.prd3 = self.updateSingleSite(self.url_backup_path,
                                                    self.url_backup_post_path,
                                                    'prd3')
        spidersContent.prd4 = self.updateSingleSite(
            self.content_backup_path, self.content_backup_post_path, 'prd4')
        return spidersContent
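# Hedged usage sketch: run processSingleSite() once per monitored site, collect
# the returned allSitesDto objects, then call processAllSites() on any instance
# to render and upload the summary page (siteinfos is assumed to be the parsed
# sites_info list).
# allData = [UpdateMonitorFiles(siteinfo=si).processSingleSite() for si in siteinfos]
# UpdateMonitorFiles(siteinfo=siteinfos[0]).processAllSites(allSitesData=allData)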
Ejemplo n.º 11
0
class CamelBone():
    def __init__(self, siteinfo=None, callback=callable):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd2 = self.settings.WORK_PATH_PRD2
        self.mongo = self.settings.MONGO_URLS
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_URL
        self.urls = self.settings.URLS
        self.max_concurrency = self.globalSettings.MAX_CONCURRENCY
        self.concurrency_file = self.globalSettings.CONCURRENCY_FILE
        self.url_backup_folder_path = self.settings.URL_BACKUP_FOLDER_PATH
        self.url_timeout = self.settings.URL_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.url_backup_folder_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        results = self.callBack(current_url, html)
        if len(results) == 0:
            message1 = 'No url for page: {0}'.format(current_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in results:
            is_title_empty = self.doraemon.isEmpty(item.title)
            if (is_title_empty is False) and (self.doraemon.isDuplicated(
                    self.doraemon.bf_urls, item.title) is False):
                message2 = 'Start to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeMongodb(
                    self.mongo, self.doraemon.createCamelMongoJson(item))
                message3 = 'End to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message3)
                print message3
                self.file.logger(self.log_path,
                                 'Done for {0}'.format(item.url))
            else:
                if is_title_empty is True:
                    message4 = 'Empty title for {0}'.format(item.url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    print 'Finished title for {0}'.format(item.url)
        print 'End to parse {0}'.format(current_url)

        del current_url, results, html
        gc.collect()

    def start(self, isdebug=False):
        if self.doraemon.isCamelReadyToRun(
                self.settings) is False and isdebug is False:
            message5 = 'It is not ready to run for {0}'.format(self.name)
            print message5
            return
        message6 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6

        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')

        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])

        if len(new_urls) == 0:
            print 'No url.'
            return
        request = BrowserRequest()
        content = request.start_chrome(new_urls,
                                       self.url_timeout,
                                       self.max_pool_size,
                                       self.log_path,
                                       None,
                                       callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file,
                                          self.max_concurrency)
        message7 = 'End for {0} requests of {1}.'.format(
            str(len(content)), self.name)
        self.file.logger(self.log_path, message7)
        print message7

        del new_urls, content, url_list, request
        gc.collect()
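# Hedged usage sketch: the callback receives the current URL and the lxml tree
# of the page and must return a list of items exposing at least .title and .url
# (they are passed on to createCamelMongoJson); siteinfo is the same site
# descriptor handed to Settings.CreateSettings.
# def extract_urls(current_url, html):
#     ...  # build and return the item list for this listing page
# CamelBone(siteinfo=siteinfo, callback=extract_urls).start()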
Ejemplo n.º 12
0
class Settings():
    def __init__(self):
        self.file = FileIOMiddleware()
        self.RSYNC_PRD1 = "/home/dev/Data/rsyncData/prd4"
        self.RSYNC_PRD2 = "/home/dev/Data/rsyncData/prd3"
        self.CAMEL_FOOD = "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/food"
        self.SITES_INFO = "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/cobwebs/sites_info.txt"
        self.SITES_DEBUG = "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/cobwebs/sites_debug.txt"

        self.SELENIUM_TIMEOUT = 120  #second
        self.CHROMEDRIVER_PATH = "/usr/bin/chromedriver"

        #timeout handler
        self.PROCESS_TIMEOUT = 2  # minutes
        self.TIMEOUT_CACHE_FILE = "/home/dev/Data/rsyncData/timeout.cache"
        self.PROCESS_TIMEOUT_CONTENT = 60  # minutes

        #concurrency
        self.REFRESH_CONCURRENCY_INTERVAL = 30  #minute
        self.MAX_CONCURRENCY = 10
        self.CONCURRENCY_FILE = "{0}/max_concurrency.txt".format(
            self.RSYNC_PRD2)
        self.CONCURRENCY_REFRESH_FILE = "{0}/concurrency_refresh.txt".format(
            self.RSYNC_PRD2)

        self.REFRESH_CONCURRENCY_INTERVAL_SPIDER = 30  # minute
        self.MAX_CONCURRENCY_SPIDER = 10
        self.CONCURRENCY_FILE_SPIDER = "{0}/max_concurrency.txt".format(
            self.RSYNC_PRD1)
        self.CONCURRENCY_REFRESH_FILE_SPIDER = "{0}/concurrency_refresh.txt".format(
            self.RSYNC_PRD1)

        self.USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
        ]
        self.ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        self.ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9,en;q=0.8"
        self.ACCEPT_ENCODING = "gzip, deflate"
        self.CONNECTION = "keep-alive"
        self.CACHE_CONTROL = "max-age=0"
        self.PRAGMA = "no-cache"
        self.UPGRADE_INSECURE_REQUESTS = "1"

        self.LOG_PATH = "{0}/log".format(self.RSYNC_PRD1)
        self.LOG_PATH_PRD2 = "{0}/log".format(self.RSYNC_PRD2)

        self.MONGO_URI = 'mongodb://127.0.0.1:27017'
        self.MONGO_DEEPINEWS = 'DeepNewsDatabase'

        self.REDIS_HOST = '127.0.0.1'
        self.REDIS_PORT = 6379

        self.BLOOMFILTER_URLS = "tegenaria:urls"
        self.BLOOMFILTER_CONTENT = "tegenaria:content"
        self.BLOOMFILTER_AUTHORS = "tegenaria:authors"

        self.TODAY = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        self.YESTERDAY = (datetime.now() -
                          timedelta(days=1)).strftime("%Y-%m-%d")

        self.CHRONUS_SETTINGS = "{0}/log/chronus.csv".format(self.RSYNC_PRD1)

        self.DISABLE_RESTART_INTERVAL = False

        #sogo-sogo-weixin
        self.VALID_PROXY_POOL_SOGO_ACCOUNT = "valid_proxy_pool:sogo_account"
        self.INVALID_PROXY_POOL_SOGO_ACCOUNT = "invalid_proxy_pool:sogo_account"
        self.VALID_PROXY_POOL_SOGO_ARTICLE_LIST = "valid_proxy_pool:sogo_article_list"
        self.INVALID_PROXY_POOL_SOGO_ARTICLE_LIST = "invalid_proxy_pool:sogo_article_list"
        self.VALID_PROXY_POOL_WX = "valid_proxy_pool:wx"
        self.INVALID_PROXY_POOL_WX = "invalid_proxy_pool:wx"
        self.FINISHED_SOGO_ACCOUNT = "finished:sogo_account"
        self.FINISHED_SOGO_ARTICLE_LIST = "finished:sogo_article_list"
        self.FINISHED__WX = "finished:wx"

        # shenjian-weixin
        self.FINISHED_WEIXIN_URL_ID = "finished:weixin_url_id"
        self.FINISHED_WEIXIN_URL_ARTICLE = "finished:weixin_url_article"
        self.FINISHED_WEIXIN_CONTENT_ARTICLE = "finished:weixin_content_article"

        #sites
        self.URL_DEEPINEWS_10002_ARTICLE = "http://www.deepinews.com:10002/article/"
        self.URL_DEEPINEWS_10002_IMAGE = 'http://www.deepinews.com:10002/img/'
        # self.URL_DEEPINEWS_10002_ARTICLE = 'http://192.168.163.26:8081/article/'
        # self.URL_DEEPINEWS_10002_IMAGE = 'http://192.168.163.26:8081/img/'

        #images filter
        self.FINISHED_IMAGE_ID = "finished:image_id"

        #temp folder for html and img
        self.TEMP_FOLDER_HTML = '/home/dev/Data/Production/data4deepinews/html'
        self.TEMP_FOLDER_IMG = '/home/dev/Data/Production/data4deepinews/img'
        self.FINISHED_TEMP_WEIXIN = "finished:temp_weixin"

        #remove server information
        self.HOST_PASSWORD_FILE = '/home/dev/Repository/news/servers/webserver0.txt'
        self.HOST_INFO = self.getServerInfo(self.HOST_PASSWORD_FILE)
        self.HOST_NAME = self.HOST_INFO.ip
        self.USER_NAME = 'root'
        self.PASSWORD = self.HOST_INFO.password
        self.PORT = 22
        self.REMOTE_IMG_PATH = '/home/dev/Data/Production/img_tmp'
        self.REMOTE_HTML_PATH = '/home/dev/Data/Production/html_tmp'
        self.MAX_UPLOAD_PROCESS = 20

        #aliyun oss access token
        self.ALI_OSS_TOKEN_FILE = '/home/dev/Repository/news/servers/aliyun.txt'
        self.ALI_OSS_INFO = self.getServerInfo(self.ALI_OSS_TOKEN_FILE)

        #refresh the redis interval
        self.REFRESH_REDIS_INTERVAL = 1440

        #huxiu_nlp
        self.FINISHED_HUXIU_NLP = "finished:huxiu_nlp"

        #mongodb
        self.SPIDERDB = "SPIDERS"

        #article url
        self.ARTICLE_URL = "https://www.deepinews.com/article/"

        #aliyun
        self.ALI_DOMAIN = "oss-cn-beijing.aliyuncs.com"
        self.ALI_BUCKET_NAME_DEEPINEWS = "deepinews"
        self.ALI_BUCKET_NAME_DEEPINEWS_IMG = "img"

        # local html info
        self.LOCAL_HTML_PATH = "{0}/local".format(self.RSYNC_PRD1)

        # webserver0 html info
        self.WEBSERVER0_PASSWORD_FILE = '/home/dev/Repository/news/servers/webserver0.txt'
        self.WEBSERVER0_INFO = self.getServerInfo(
            self.WEBSERVER0_PASSWORD_FILE)
        self.IP_WEBSERVER0 = self.WEBSERVER0_INFO.ip
        self.PORT_WEBSERVER0 = 22
        self.USER_ROOT_WEBSERVER0 = "root"
        self.USER_ROOT_PASSWORD_WEBSERVER0 = self.WEBSERVER0_INFO.password
        self.HTML_WEBSERVER0 = "/home/dev/Data/Production/article"
        self.RETRY_FILE = "{0}/retry.txt".format(self.RSYNC_PRD1)
        self.UPLOAD_HTML_API = "https://www.deepinews.com/api/articles/uploadhtml"

        #webserver0 mongo data info
        self.LOCAL_MONGO_DATA_PATH = "/home/dev/Data/Production/data4deepinews/{0}.csv".format(
            self.TODAY)
        self.REMOTE_MONGO_DATA_PATH = "/home/dev/Data/Production/data4deepinews/{0}.csv".format(
            self.TODAY)

        #template
        self.TEMPLATE_PATH = "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/storeHtml/template_1.html"

        #monitor
        self.MONITOR_SPIDERS_URL = "https://www.deepinews.com/sites/index.html"
        self.MONITOR_SITE_URL = "https://www.deepinews.com/sites/"
        self.MONITOR_SPIDERS_TEMPLATE_PATH = \
            "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/spiderMonitor/index.html"
        self.MONITOR_SITE_TEMPLATE_PATH = \
            "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/spiderMonitor/site.html"
        self.MONITOR_UPLOAD_LOCAL = "{0}/monitor".format(self.RSYNC_PRD1)
        self.MONITOR_UPLOAD_PATH_WEBSERVER0 = "/home/dev/Data/Production"
        self.MONITOR_SITE_HTML_WEBSERVER0 = "/home/dev/Data/Production/statics/sites"

    def getServerInfo(self, file):
        contents = self.file.readFromTxt(file)
        machines = contents.split('\n')
        result = None
        for machine in machines:
            if machine == '':
                continue
            info = machine.split('==')
            result = machineDto(info[0].strip(), info[1].strip())
        return result
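    # Assumed credentials file format: one "ip==password" entry per line, e.g.
    # "192.0.2.10==secret"; only the last non-empty line is kept as the result.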

    def SettingsFormat(self, SETTINGS_NAME, SOURCE_NAME, RESTART_INTERVAL,
                       MAX_POOL_SIZE_URL, MAX_POOL_SIZE_CONTENT, IS_OPEN_CACHE,
                       START_TIME, END_TIME, URL_TIMEOUT, CONTENT_TIMEOUT):
        return settingsSpec(SETTINGS_NAME, SOURCE_NAME, RESTART_INTERVAL,
                            MAX_POOL_SIZE_URL, MAX_POOL_SIZE_CONTENT,
                            IS_OPEN_CACHE, START_TIME, END_TIME, URL_TIMEOUT,
                            CONTENT_TIMEOUT)

    def CreateSettings(self, siteinfo=None):
        print "Create setting for: {0}".format(siteinfo.domain)
        return self.SettingsFormat(
            siteinfo.domain, siteinfo.name, siteinfo.restart_interval,
            siteinfo.url_parallel_number, siteinfo.content_parallel_number,
            siteinfo.is_open_cache, siteinfo.work_time_start,
            siteinfo.work_time_end, siteinfo.url_timeout,
            siteinfo.content_timeout)

    def CreateCommonSettings(self):
        return self.SettingsFormat('0', '0', '0', '0', '0', '0', '0', '0', '0',
                                   '0')
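# Hedged usage sketch: CreateSettings expects a siteinfo object exposing the
# attributes referenced above (domain, name, restart_interval,
# url_parallel_number, content_parallel_number, is_open_cache, work_time_start,
# work_time_end, url_timeout, content_timeout); any object with those fields
# will do.
# site_settings = Settings().CreateSettings(siteinfo)
# print site_settings.SOURCE_NAME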