Example #1
0
class Reactor:
    def __init__(
        self, rule_file_path, config_file_path, site_config, in_folder_path, out_folder_path, shared_dir, log_file
    ):
        """Store the run configuration, load the rules and create the executor.

        Args:
            rule_file_path: Directory that contains ``reactor_rules.rrule``.
            config_file_path: Directory that contains ``reactor_config.config``.
            site_config: Path of the site configuration file; only its directory
                part is used, and it is handed to ``Executor``.
            in_folder_path: Root folder in which data files are looked up.
            out_folder_path: Folder whose files are walked and rewritten.
            shared_dir: Directory holding ``updatelist.dat`` and ``dir.xml``.
            log_file: Log file path passed to the external ``producer`` tool.
        """
        self.rule_file_path = rule_file_path + "/reactor_rules.rrule"
        self.config_file_path = config_file_path + "/reactor_config.config"
        # rpartition yields "" when site_config has no "/"; slicing with the
        # result of rfind() would use -1 there and chop off the last character.
        config_dir_path = site_config.rpartition("/")[0]
        self.in_folder_path = in_folder_path
        self.out_folder_path = out_folder_path
        self.shared_dir = shared_dir
        self.rule_list = []
        self.log_file = log_file
        self.buildRules()
        self.executor = Executor(config_dir_path)
        # Number of files processed so far (incremented by doWork).
        self.count = 0
        self.INVALID_TAGS = ["table", "tbody", "tr", "td"]
        # A full run is requested by setting REACTOR_UPDATE_ALL=YES in the environment.
        self.full_update = os.getenv("REACTOR_UPDATE_ALL") == "YES"
        # Filled by genUpdateList() for runs that are not full updates.
        self.update_set = None

    def __str__(self):
        return 'Reactoring files in folder "' + self.in_folder_path + '" to folder "' + self.out_folder_path

    def buildRules(self):
        """Parse the rule file and keep the resulting rules in ``self.rule_list``."""
        self.rule_list = RuleParser().parseFile(self.rule_file_path)

    def doReactorWork(self):
        if not self.ensureInputFolderExists():
            print "Error: Input folder does not exists."
            return

        self.ensureOutputFolderExists()

        if not self.full_update:
            self.genUpdateList()

        print "Generating navigation files..."
        self.genNavFiles()

        print "Begin processing..."
        self.processFilesRecursively(self.doWork)

        self.executor.finished()

        print "Done!"
        return

    def ensureInputFolderExists(self):
        return os.path.exists(self.in_folder_path)

        # Make sure the output directory exists; create it if it does not.

    def ensureOutputFolderExists(self):
        if not os.path.exists(self.out_folder_path):
            os.makedirs(self.out_folder_path)

    def genUpdateList(self):
        """Load ``updatelist.dat`` from the shared directory into ``self.update_set``.

        Every line of the file, stripped of surrounding whitespace, becomes
        one member of the set.

        Raises:
            IOError: If the update list file cannot be opened.
        """
        update_list_path = self.shared_dir + "/updatelist.dat"
        # The context manager closes the handle even if reading fails.
        with open(update_list_path) as update_file:
            update_list = [line.strip() for line in update_file]
        self.update_set = Set(update_list)

        # Generate index.xml and the XML files under the "l" and "c" directories.

    def genNavFiles(self):
        indexPath = self.shared_dir + "/dir.xml"

        if len(indexPath) > 0:
            homePath = os.getenv("HOME")
            command = "%s/labrador/butts/reactor/producer --index-file=%s --webroot-dir=%s --log-file=%s" % (
                homePath,
                indexPath,
                self.out_folder_path,
                self.log_file,
            )
            os.system(command)
        else:
            print "Index file not found!"

    def getDataFilePathForFileName(self, fileName):
        return self.in_folder_path + "/" + fileName[:2] + "/" + fileName

    def dataFileExists(self, fileName, filePath):
        return (filePath.find("/a/") != -1) and os.path.exists(self.getDataFilePathForFileName(fileName))

    def integrateParentWithData(self, fileName, parentFile):
        """Combine the data file for ``fileName`` with its parent page.

        The data file (located through ``getDataFilePathForFileName``) is
        read, unescaped, stripped of namespaced markup and parsed. The
        ``parentpageurl`` element of ``parentFile`` is then inserted as the
        first child of the data document's ``article`` element.

        Args:
            fileName: Name of the file whose data file should be loaded.
            parentFile: Path of the parent XML file.

        Returns:
            The parsed data document with the parent page URL inserted.

        Raises:
            AttributeError: If the data document has no ``article`` element.
        """
        # Context managers close the handles even when reading or decoding fails.
        with codecs.open(self.getDataFilePathForFileName(fileName), "r", "utf-8") as data:
            dataContent = data.read()

        # Unescape twice so that doubly escaped entities are fully resolved
        # (presumably "&amp;nbsp;" -> "&nbsp;" -> the character itself; this
        # assumes html.unescape_string decodes one level per call).
        dataContent = html.unescape_string(html.unescape_string(dataContent))
        dataContent = dataContent.replace("<o:p>", "<p>").replace("</o:p>", "</p>")

        # 20130327 fix#374: remove the opening and closing tags of every
        # namespace declared through an "xml:namespace prefix = X ns" marker,
        # then remove the declarations themselves.
        prefixes = re.findall(r"xml:namespace prefix = (.*?) ns", dataContent)
        for prefix in prefixes:
            # The prefix comes from the document, so it must be matched literally.
            literalPrefix = re.escape(prefix)
            dataContent = re.sub("<" + literalPrefix + ":.*?>", "", dataContent)
            dataContent = re.sub("</" + literalPrefix + ":.*?>", "", dataContent)
        dataContent = re.sub(r"<\?xml:namespace prefix.*?>", "", dataContent)

        with codecs.open(parentFile, "r", "utf-8") as parent:
            parentContent = html.unescape_string(parent.read())

        dataSoup = BeautifulSoup(dataContent, "lxml")
        parentSoup = BeautifulSoup(parentContent, "lxml")

        dataSoup.article.insert(0, parentSoup.parentpageurl)

        return dataSoup

        # Process the files recursively.

    def processFilesRecursively(self, processFunction):
        for root, dirs, files in os.walk(self.out_folder_path):
            for fileName in files:
                processFunction(root, fileName)

    def doWork(self, root, fileName):
        if not fileName.endswith(".xml"):
            return

        srcFile = root + "/" + fileName
        resultFilePath = srcFile

        if self.dataFileExists(fileName, srcFile):
            soup = self.integrateParentWithData(fileName, srcFile)
        else:
            xmlDataFile = codecs.open(srcFile, "r", "utf-8")
            xmlData = xmlDataFile.read()
            xmlData = html.unescape_string(xmlData)
            xmlDataFile.close()
            soup = BeautifulSoup(xmlData, "lxml")

        soup = self.semantify(soup, resultFilePath)

        # 最后做断句处理
        divider = Divider(soup, self.config_file_path)
        soup = divider.doWork()

        resultFile = codecs.open(resultFilePath, "w", "utf-8")
        resultFile.write(self.beautiful_soup_tag_to_unicode(soup))
        resultFile.close()

        self.count += 1
        print "Processed: %d" % self.count

        # try to resolve the maximum-recursion problem

    def beautiful_soup_tag_to_unicode(self, tag):
        """Serialize ``tag`` to unicode, degrading to plain text on very deep nesting.

        When the normal conversion fails with a "maximum recursion" runtime
        error (the original author notes this happens beyond roughly 480
        levels of nested tags), the non-empty stripped text fragments of the
        tag are returned instead, one per line, wrapped in ``<pre>``. Any
        other runtime error is re-raised.
        """
        try:
            return unicode(tag)
        except RuntimeError as error:
            message = str(error)
            if not message.startswith("maximum recursion"):
                raise

        # Only reached after the recursion limit was hit above.
        stripped = (piece.strip() for piece in tag.findAll(text=True))
        lines = [piece for piece in stripped if piece]
        return u"<pre>%s</pre>" % "\n".join(lines)

            # Semantic processing.

    def semantify(self, soup, resultFilePath):
        # 建立originUrl为key,[hash, absoluteUrl]为value的字典
        hashNodeRecords = {}
        try:
            dom = parseString(self.beautiful_soup_tag_to_unicode(soup).encode("utf-8"))
            hashNodes = dom.getElementsByTagName("hashnode")
            for hashNode in hashNodes:
                hashValue = (hashNode.getElementsByTagName("hash")[0]).toprettyxml()[7:-8].strip()
                absolute = (hashNode.getElementsByTagName("absoluteurl")[0]).toprettyxml()[13:-15].strip()
                origin = (hashNode.getElementsByTagName("originalurl")[0]).toprettyxml()[13:-15].strip()
                hashNodeRecords[origin] = [hashValue, absolute]
        except Exception as e:
            pass

            # 去掉注释
        comments = soup.find_all(text=(lambda text: isinstance(text, Comment)))
        [comment.extract() for comment in comments]

        # 将相对URL替换为绝对URL,并添加hash属性
        for img_element in soup.find_all("img"):
            if img_element.has_attr("src"):
                originUrl = img_element["src"]
                if hashNodeRecords.has_key(originUrl) and hashNodeRecords[originUrl]:
                    img_element["src"] = hashNodeRecords[originUrl][1]
                    img_element["hash"] = hashNodeRecords[originUrl][0]

                    # 利用反射机制,动态调用方法,所有方法的实现都在executor.Executor类中
        for rule in self.rule_list:
            for script_code in soup.find_all(rule.target.split(" ")[0]):
                # 默认所有对象都需要处理
                # 当指定条件的对象不能满足的时候,再跳过处理过程
                needToProcess = True
                for condition in rule.condition:
                    conMethod = getattr(Executor, condition[0])
                    if not conMethod(self.executor, script_code, condition[1:]):
                        needToProcess = False  # 条件不满足,跳过
                        break

                if needToProcess:
                    for act in rule.action:
                        actMethod = getattr(Executor, act[0])
                        if len(act) == 1:
                            actMethod(self.executor, script_code)
                        else:
                            args = [rule.target] + act[1:]
                            actMethod(self.executor, script_code, args)

                    if len(rule.logLevel.strip()) > 0 and len(rule.logMsg) > 0:
                        self.executor.doLog(rule.logLevel, resultFilePath, rule.logMsg)

        return soup