Example #1
    def parse(self):
        path = []
        for i in self.path.split('/'):
            path.append(filename(i))
        path[-1] = filename(path[-1], ".htm")
        self.path = "/".join(path)

        htm = self.content
        soup = BeautifulSoup(htm)
        main = soup.find('div', {'id': 'content'})
        title = main.find("h2").string

        try:
            print title
            print "\n"
        except:
            pass

        return {
            "keyword":
            soup.find('meta', {'name': "Keywords"})['content'],
            "title":
            title,
            "content":
            ''.join([
                unicode(i) for i in main.find("div", {
                    "id": "text"
                }).contents
            ]),
            "url":
            self.url
        }
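
All of these examples lean on a filename() helper from zspy.filesys that is never shown. Judging from the call sites (one argument to sanitize a path segment, an optional second argument to force an extension), it behaves roughly like the sketch below; this is an inferred stand-in, not the library's actual code.

import re

def filename(segment, ext=""):
    # Inferred sketch of zspy.filesys.filename, an assumption from the call
    # sites above, not the real implementation. Replace characters that are
    # illegal in file names on common filesystems, then optionally append a
    # forced extension such as ".htm".
    safe = re.sub(r'[\\/:*?"<>|]', '_', segment).strip()
    return safe + ext
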
Example #2
    def parse(self):
        path = []
        for i in self.path.split('/'):
            path.append(filename(i))
        path[-1] = filename(path[-1].split(' ', 2)[0] + ".txt")
        self.path = "/".join(path).decode('utf-8')
        htm = self.content
        soup = BeautifulSoup(htm)
        cn_title = soup.find('h1').find(
            "span", {"id": "ctl00_MasterContentPlaceHolder_TitleLabel"}).string
        en_title = soup.find('h2').find("span").string
        if not en_title:
            en_title = ""
        from zspy.html2txt import html2txt
        brief = soup.find(
            "span", {"id": "ctl00_MasterContentPlaceHolder_AbstractLabel"}).string
        if brief:
            brief = html2txt(brief)
        else:
            brief = ""

        page_author = soup.find(
            "span", {"id": "ctl00_MasterContentPlaceHolder_AuthorDataList"})
        if page_author:
            author = '\t'.join([i.string for i in page_author.findAll('a')])
        else:
            author = ''
        page_word = soup.find(
            "span", {"id": "ctl00_MasterContentPlaceHolder_KeywordDatalist"})
        if page_word:
            page_word = page_word.findAll('a')
        else:
            page_word = []
        keyword = '\t'.join([i.string for i in page_word])

        magezine = soup.find(
            "a", {"id": "ctl00_MasterContentPlaceHolder_PeriodicalLink"}).string
        time = soup.find(
            "a", {"id": "ctl00_MasterContentPlaceHolder_IssueLink"}).string
        kind = ' >>> '.join([
            soup.find("a", {"id": "ctl00_MasterContentPlaceHolder_topnavigation"}).string,
            soup.find("a", {"id": "ctl00_MasterContentPlaceHolder_subnavigation"}).string
        ])

        return {
            "url": self.url,
            "cn_title": cn_title,
            "en_title": en_title,
            "keyword": keyword,
            "author": author,
            "magezine": magezine,
            "time": time,
            "kind": kind,
            "brief": brief.replace('" class="highLight">', '')
        }
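
The ctl00_MasterContentPlaceHolder_* ids matched above are auto-generated ASP.NET WebForms control ids, which is why the parser can rely on exact id lookups. A minimal, self-contained demonstration of the same find pattern against inline HTML, using BeautifulSoup 3 as the BeautifulSoup(htm) call and the findAll spelling imply:

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, as the code above implies

htm = '<h1><span id="ctl00_MasterContentPlaceHolder_TitleLabel">A Title</span></h1>'
soup = BeautifulSoup(htm)
# find() with an attribute dict matches on the exact id value.
print soup.find("span", {"id": "ctl00_MasterContentPlaceHolder_TitleLabel"}).string
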
Example #3
def _local_saver(downer, meta):
    # Skip entries that have already been saved.
    if exist(meta.link):
        return
    print "saving", meta.link
    # Localize remote images and rewrite their URLs in the summary HTML.
    meta['summary'] = img_saver(downer, meta.link,
                                meta['summary_detail']['value'],
                                "../../img/")
    c = template_render(meta, template_name)
    file_saver(join(dirname, filename(meta.title) + '.htm'), c)
    db_saver(meta, c)
Example #4
def _local_saver(downer, meta):
    if exist(meta.link):
        return
    print "saving", meta.link
    meta['summary'] = img_saver(downer, meta.link,
                                meta['summary_detail']['value'],
                                "../../img/")
    # Render once and reuse the result for both the file and the database.
    c = template_render(meta, template_name)
    file_saver(join(dirname, filename(meta.title) + '.htm'), c)
    db_saver(meta, c)
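
meta supports both attribute access (meta.link, meta.title) and key access (meta['summary_detail']['value']), which is exactly the shape of a feedparser entry, so _local_saver was most likely driven by a feed loop. A hypothetical driver, assuming feedparser as the source; the feed URL is a placeholder and downer is assumed to come from the surrounding crawler:

import feedparser

# Hypothetical driver loop; the URL is a placeholder and downer is assumed
# to come from the surrounding crawler, as in the site() examples below.
feed = feedparser.parse("http://example.com/rss.xml")
for entry in feed.entries:
    _local_saver(downer, entry)
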
Example #5
    def parse(self):
        path = []
        for i in self.path.split('/'):
            path.append(filename(i))
        path[-1] = filename(path[-1].split(' ', 2)[0] + ".htm")
        self.path = "/".join(path)

        htm = self.content
        soup = BeautifulSoup(htm)
        title = soup.find('div', {'id': 'artibodyTitle'}).find("h1").string

        content = ''.join([
            unicode(i) for i in soup.find("div", {
                "id": "artibody"
            }).contents
        ])

        try:
            print title
            print "\n"
        except:
            pass
        return {"url": self.url, "title": title, "content": content}
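
The ''.join([unicode(i) for i in ...contents]) idiom serializes every child node of the article div back to markup, so inline tags survive in the extracted content. A small self-contained illustration, again in BeautifulSoup 3 / Python 2 style:

from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<div id="artibody"><p>one</p><p>two <b>three</b></p></div>')
body = soup.find("div", {"id": "artibody"})
# unicode() on a node renders it back to markup, so the join keeps the tags.
print ''.join([unicode(i) for i in body.contents])
# prints: <p>one</p><p>two <b>three</b></p>
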
Example #6
    def parse(self):
        path = []
        for i in self.path.split("/"):
            path.append(filename(i))
        path[-1] = filename(path[-1], ".htm")
        self.path = "/".join(path)

        htm = self.content
        soup = BeautifulSoup(htm)
        main = soup.find("div", {"id": "content"})
        title = main.find("h2").string

        try:
            print title
            print "\n"
        except:
            pass

        return {
            "keyword": soup.find("meta", {"name": "Keywords"})["content"],
            "title": title,
            "content": "".join([unicode(i) for i in main.find("div", {"id": "text"}).contents]),
            "url": self.url,
        }
Example #7
    def parse(self):
        path = []
        for i in self.path.split('/'):
            path.append(filename(i))
        path[-1] = filename(path[-1].split(' ', 2)[0] + ".htm")
        self.path = "/".join(path)

        htm = self.content
        soup = BeautifulSoup(htm)
        title = soup.find('div', {'id': 'artibodyTitle'}).find("h1").string

        content = ''.join([unicode(i) for i in soup.find("div", {"id": "artibody"}).contents])

        # Printing the title may raise UnicodeEncodeError on some consoles;
        # a progress message should not kill the crawl.
        try:
            print title
            print "\n"
        except:
            pass
        return {
            "url": self.url,
            "title": title,
            "content": content
        }
Example #8
def site(index_url, parser, once=True):

    from zspy.filesys import makedirs, filename

    site_name = filename(index_url.split("://", 2)[1])
    task = load_db(site_name)

    from down import Downer
    downer = Downer(task)

    if once and index_url in downer.history:
        print "Continue %s" % index_url
    else:
        print "New Start %s" % index_url
        downer.add(parser(index_url, "%s/%s" % (config.FETCH_TO, site_name)))
    return downer
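
site() binds a parser class to a per-site download queue loaded from the database and returns the Downer, resuming when the index URL has already been seen. A hypothetical call site; the parser stub and URL below are placeholders, and the Downer API beyond add() and history is not shown in these examples:

class StubParser(object):
    # Placeholder standing in for a real parser, like the parse() classes above.
    def __init__(self, url, fetch_path):
        self.url = url
        self.path = fetch_path

downer = site("http://news.example.com/", StubParser, once=True)
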
Example #9
def site(index_url, parser, once=True):

    from zspy.filesys import makedirs, filename

    site_name = filename(index_url.split("://", 2)[1])
    task = load_db(site_name)

    from down import Downer
    downer = Downer(task)

    if once and index_url in downer.history:
        print "Continue %s" % index_url
    else:
        print "New Start %s" % index_url
        downer.add(
            parser(
                index_url,
                "%s/%s" % (config.FETCH_TO, site_name)
            )
        )
    return downer
Example #10
    def parse(self):
        path = []
        for i in self.path.split('/'):
            path.append(filename(i))
        path[-1] = filename(path[-1].split(' ', 2)[0] + ".txt")
        self.path = "/".join(path).decode('utf-8')
        htm = self.content
        soup = BeautifulSoup(htm)
        cn_title = soup.find('h1').find(
            "span", {
                "id": "ctl00_MasterContentPlaceHolder_TitleLabel"
            }).string
        en_title = soup.find('h2').find("span").string
        if not en_title:
            en_title = ""
        from zspy.html2txt import html2txt
        brief = soup.find("span", {
            "id": "ctl00_MasterContentPlaceHolder_AbstractLabel"
        }).string
        if brief:
            brief = html2txt(brief)
        else:
            brief = ""

        page_author = soup.find(
            "span", {"id": "ctl00_MasterContentPlaceHolder_AuthorDataList"})
        if page_author:
            author = '\t'.join([i.string for i in page_author.findAll('a')])
        else:
            author = ''
        page_word = soup.find(
            "span", {"id": "ctl00_MasterContentPlaceHolder_KeywordDatalist"})
        if page_word:
            page_word = page_word.findAll('a')
        else:
            page_word = []
        keyword = '\t'.join([i.string for i in page_word])

        magezine = soup.find(
            "a", {
                "id": "ctl00_MasterContentPlaceHolder_PeriodicalLink"
            }).string
        time = soup.find("a", {
            "id": "ctl00_MasterContentPlaceHolder_IssueLink"
        }).string
        kind = ' >>> '.join([
            soup.find("a", {
                "id": "ctl00_MasterContentPlaceHolder_topnavigation"
            }).string,
            soup.find("a", {
                "id": "ctl00_MasterContentPlaceHolder_subnavigation"
            }).string
        ])

        return {
            "url": self.url,
            "cn_title": cn_title,
            "en_title": en_title,
            "keyword": keyword,
            "author": author,
            "magezine": magezine,
            "time": time,
            "kind": kind,
            "brief": brief.replace('" class="highLight">', '')
        }