Exemple #1
0
    def getPageContent(self, filename, from_where='local'):
        """Return the stored content of a page.

        Args:
            filename: For ``from_where='local'``, either a plain file path
                (no ``::`` in it) or a three-part ``a::b::c`` locator. For a
                locator, ``b`` is handed to ``BinReader`` and ``int(c)`` to
                its ``readone_at``; the first part is not used here.
                For ``from_where='remote'`` the value is passed unchanged to
                ``self.bin_read_client.getHtml``.
            from_where: ``'local'`` (default) or ``'remote'``.

        Returns:
            The page content. Remote content that arrives as a text
            (unicode) string is encoded to UTF-8 before being returned.

        Raises:
            Exception: if local content is empty, if a local ``filename``
                has neither one nor three ``::``-separated parts, or if
                ``from_where`` is not a recognised value.
        """
        if from_where == 'remote':
            # TODO (carried over from the original): fetch the bin file
            # content from the remote side.
            content = self.bin_read_client.getHtml(filename)
            # ``unicode`` only exists on Python 2; fall back to ``str`` so
            # the same check also works on Python 3.
            try:
                text_type = unicode
            except NameError:
                text_type = str
            if isinstance(content, text_type):
                content = content.encode('utf-8')
            return content

        if from_where != 'local':
            raise Exception("unknown from_where")

        parts = filename.split("::")
        if len(parts) == 3:
            binReader = BinReader(parts[1])
            _, content = binReader.readone_at(int(parts[2]))
        elif len(parts) == 1:
            with open(filename) as f:
                content = f.read()
        else:
            # This case used to fall through and return None silently; fail
            # loudly like every other error path in this method.
            raise Exception(
                "file name:{} , unsupported format".format(filename))

        if not content:
            raise Exception("file name:{} , content error".format(filename))
        return content
Exemple #2
0
    def getPageContent(self, filename):
        """Return the stored content of a page.

        Args:
            filename: Either a plain file path (no ``::`` in it) or a
                three-part ``a::b::c`` locator. For a locator, ``b`` is
                handed to ``BinReader`` and ``int(c)`` to its
                ``readone_at``; the first part is not used here.

        Returns:
            The content that was read.

        Raises:
            Exception: if the content is empty, or if ``filename`` has
                neither one nor three ``::``-separated parts.
        """
        parts = filename.split("::")
        if len(parts) == 3:
            reader = BinReader(parts[1])
            _, content = reader.readone_at(int(parts[2]))
        elif len(parts) == 1:
            with open(filename) as f:
                content = f.read()
        else:
            # This case used to fall through and return None silently; fail
            # loudly like the empty-content case does.
            raise Exception(
                "file name:{} , unsupported format".format(filename))

        if not content:
            raise Exception("file name:{} , content error".format(filename))
        return content
Exemple #3
0
    def get_cv_html_page(jdid):
        """Fetch the stored page content for the given id.

        The part of ``jdid`` before ``'://'`` selects the
        ``page_store_<prefix>`` collection in the ``cv_crawler`` database of
        ``GetHtmlPage.CvClient``; the document whose ``indexUrl`` equals
        ``jdid`` is looked up there.

        Args:
            jdid: The id to look up; expected to support ``.split('://')``.

        Returns:
            Whatever ``BinReader.getPageContent`` returns for the document's
            ``pageContentPath``, or ``None`` when ``jdid`` cannot be split or
            no matching document exists.
        """
        cv_db = 'cv_crawler'

        try:
            coll = "page_store_%s" % jdid.split('://')[0]
        except Exception:
            # Values that cannot be split (e.g. None) are reported and
            # treated as "not found" rather than propagated.
            traceback.print_exc()
            return None

        doc = GetHtmlPage.CvClient[cv_db][coll].find_one({'indexUrl': jdid})
        if not doc:
            # Single-argument parenthesised form prints the same text on
            # both Python 2 and Python 3.
            print("cvid: %s not exists" % jdid)
            return None

        return BinReader.getPageContent(doc['pageContentPath'])