Ejemplo n.º 1
0
    def doWork(self, root, fileName):
        """Process one ``.xml`` file in place and bump the processed counter.

        Files whose name does not end in ``.xml`` are ignored. Otherwise the
        document is loaded (merged with its data file when
        ``self.dataFileExists`` reports one, else parsed directly from the
        source file), passed through ``self.semantify`` and then through
        ``Divider``, and the result is written back over the source file.

        Args:
            root: Directory containing the file; joined to ``fileName`` with
                a forward slash.
            fileName: Name of the file inside ``root``.

        Returns:
            None. The source file is overwritten, ``self.count`` is
            incremented and a progress line is printed.
        """
        if not fileName.endswith(".xml"):
            return

        srcFile = root + "/" + fileName
        # The result is written back over the source file.
        resultFilePath = srcFile

        if self.dataFileExists(fileName, srcFile):
            soup = self.integrateParentWithData(fileName, srcFile)
        else:
            # The context manager closes the handle even if read() raises.
            with codecs.open(srcFile, "r", "utf-8") as xmlDataFile:
                xmlData = xmlDataFile.read()
            xmlData = html.unescape_string(xmlData)
            soup = BeautifulSoup(xmlData, "lxml")

        soup = self.semantify(soup, resultFilePath)

        # Sentence splitting is done last (per the original comment).
        divider = Divider(soup, self.config_file_path)
        soup = divider.doWork()

        # Serialize BEFORE opening the target for writing: opening with "w"
        # truncates the file, and the target is the source file itself, so a
        # serialization error must not be able to wipe the input.
        resultText = self.beautiful_soup_tag_to_unicode(soup)
        with codecs.open(resultFilePath, "w", "utf-8") as resultFile:
            resultFile.write(resultText)

        self.count += 1
        # Parenthesized single argument: same output under Python 2's print
        # statement and Python 3's print function.
        print("Processed: %d" % self.count)
Ejemplo n.º 2
0
    def integrateParentWithData(self, fileName, parentFile):
        """Parse the data file for ``fileName`` and prepend the parent's URL tag.

        The data file (located via ``self.getDataFilePathForFileName``) is
        read, HTML-unescaped twice, cleaned of ``<o:p>`` tags and of every
        tag belonging to a namespace declared through an
        ``<?xml:namespace prefix = X ns ...>`` instruction, then parsed with
        the ``lxml`` parser. The parent file is read, unescaped once and
        parsed the same way; its ``parentpageurl`` element is inserted as the
        first child of the data document's ``article`` element.

        Args:
            fileName: Name used to look up the data file path.
            parentFile: Path of the parent XML file.

        Returns:
            The soup built from the cleaned data content, with the parent's
            ``parentpageurl`` element inserted at position 0 of ``article``.

        NOTE(review): assumes the data document contains an ``<article>``
        element and the parent a ``<parentpageurl>`` element; nothing here
        guards against either being absent - confirm against the inputs.
        """
        dataFile = self.getDataFilePathForFileName(fileName)
        # Context managers close the handles even when read() raises.
        with codecs.open(dataFile, "r", "utf-8") as data:
            dataContent = data.read()

        # Unescaped twice on purpose: presumably the data contains
        # double-escaped entities (e.g. "&amp;nbsp;" -> "&nbsp;" -> " ") -
        # TODO confirm against html.unescape_string.
        dataContent = html.unescape_string(dataContent)
        dataContent = html.unescape_string(dataContent)
        dataContent = dataContent.replace("<o:p>", "<p>").replace("</o:p>", "</p>")

        # 2013-03-27, fix #374: drop every tag that belongs to a declared
        # XML namespace, then the namespace declarations themselves.
        prefixes = re.findall(r"xml:namespace prefix = (.*?) ns", dataContent)
        for prefix in prefixes:
            # The prefix comes from the document, so escape it before using
            # it inside a pattern. Opening tags are removed before closing
            # tags, as in the original two-pass loop.
            escaped = re.escape(prefix)
            dataContent = re.sub("<" + escaped + ":.*?>", "", dataContent)
            dataContent = re.sub("</" + escaped + ":.*?>", "", dataContent)
        dataContent = re.sub(r"<\?xml:namespace prefix.*?>", "", dataContent)

        with codecs.open(parentFile, "r", "utf-8") as parent:
            parentContent = parent.read()
        parentContent = html.unescape_string(parentContent)

        dataSoup = BeautifulSoup(dataContent, "lxml")
        parentSoup = BeautifulSoup(parentContent, "lxml")

        dataSoup.article.insert(0, parentSoup.parentpageurl)

        return dataSoup