Beispiel #1
0
    def getContent(self, text, folder):
        """Fetch every thread linked from a board listing page and save it under *folder*.

        ``text`` is the HTML of a listing page.  Each link matching
        ``<td class="td6"><a href=bbstcon?board=...>`` is requested with
        ``self.header``, decoded as GBK and parsed.  For each thread the
        title, a separator, the thread URL, another separator and the text of
        every ``<td class="border content2">`` cell are written through
        ``Toolkit.save2filecn`` to a file named after the title (with the
        site suffix stripped and passed through ``Toolkit.filename_filter``).

        Returns 0 when a title cannot be printed (the crawl stops there);
        otherwise returns None once all links have been processed.  A thread
        whose page has no title, or whose file cannot be written, is skipped
        and the crawl continues with the next link.
        """
        links_p = re.compile(r'<td class="td6"><a href=bbstcon\?board=(.*?)>')
        url_board = 'http://bbs.sysu.edu.cn/bbstcon?board='
        separator = '\n\n*******************\n\n'

        for board_query in links_p.findall(text):
            each_page_link = url_board + board_query
            print(each_page_link)
            response = requests.get(each_page_link, headers=self.header)
            # Force GBK decoding instead of whatever requests inferred.
            response.encoding = 'gbk'
            html = etree.HTML(response.text)

            # An empty or title-less page would otherwise raise on `[0]`.
            titles = html.xpath('//title/text()') if html is not None else []
            if not titles:
                print("No title found, go to next article")
                time.sleep(5)
                continue
            title = titles[0]

            try:
                print(title)
            except Exception:
                # Typically the console encoding cannot represent the title.
                print("Can't decode title, return")
                return 0

            filename = re.sub(u' - 逸仙时空BBS', '', title)
            filename = Toolkit.filename_filter(filename)
            f_fullpath = os.path.join(folder, filename)
            try:
                Toolkit.save2filecn(f_fullpath, title)
                Toolkit.save2filecn(f_fullpath, separator)
                Toolkit.save2filecn(f_fullpath, each_page_link)
                Toolkit.save2filecn(f_fullpath, separator)
            except Exception:
                print(each_page_link)
                print("Create file error, go to next article")
                # Do what the message says: skip this thread, keep crawling,
                # and still pause so the server is not hit back-to-back.
                time.sleep(5)
                continue

            for cell in html.xpath('//td[@class="border content2"]'):
                # string(.) flattens the cell, including nested tags, to text.
                Toolkit.save2filecn(f_fullpath, cell.xpath('string(.)'))

            # Pause between threads.
            time.sleep(5)
Beispiel #2
0
    # Download the favourites page and extract the JavaScript `favs` object
    # literal embedded in it, then save every favourite article to a file
    # named after its page title.
    collection=session.get(fav,headers=headers)
    fav_content= collection.text
    #print(fav_content)
    # re.S lets `.` span newlines so a multi-line object body is captured;
    # re.M has no effect here because the pattern uses neither ^ nor $.
    p=re.compile('var favs = {(.*?)};',re.S|re.M)
    # NOTE(review): `[0]` raises IndexError when the page has no
    # `var favs = {...};` assignment.
    result=p.findall(fav_content)[0].strip()

    # The capture group excludes the braces; put them back so the text can be
    # parsed as a JSON object.
    new_result='{'+result+'}'
    #print(type(new_result))
    #print(new_result)
    data=json.loads(new_result)
    use_data= data['list']
    host='https://xueqiu.com'
    for i in use_data:
        # Each entry's 'target' is appended to the host to form the article URL.
        url=host+ i['target']
        print(url)
        txt_content=session.get(url,headers=headers).text
        #print(txt_content.text)

        tree=etree.HTML(txt_content)
        # NOTE(review): `[0]` raises IndexError when the page has no <title>.
        title=tree.xpath('//title/text()')[0]

        # Replace characters that are not allowed in file names with '-'.
        # NOTE(review): non-raw pattern string; `\/` is an invalid escape
        # sequence that newer Python versions warn about.
        filename = re.sub('[\/:*?"<>|]', '-', title)
        print(filename)

        content=tree.xpath('//div[@class="detail"]')
        # NOTE(review): this inner loop rebinds `i`, shadowing the outer loop
        # variable (harmless here because the outer `i` is not used afterwards).
        for i in content:
            # string(.) flattens the element, including nested tags, to text.
            Toolkit.save2filecn(filename, i.xpath('string(.)'))
        #print(content)
        #Toolkit.save2file(filename,)
        # Pause between article requests.
        time.sleep(10)
Beispiel #3
0
    # Extract the JavaScript `favs` object literal from the already-fetched
    # favourites page (`collection` is assigned before this fragment), then
    # save every favourite article, prefixed by its link, to a file named
    # after its page title.
    fav_content= collection.text
    #print fav_content
    # re.S lets `.` span newlines so a multi-line object body is captured;
    # re.M has no effect here because the pattern uses neither ^ nor $.
    p=re.compile('var favs = {(.*?)};',re.S|re.M)
    # NOTE(review): `[0]` raises IndexError when the page has no
    # `var favs = {...};` assignment.
    result=p.findall(fav_content)[0].strip()

    # The capture group excludes the braces; put them back so the text can be
    # parsed as a JSON object.
    new_result='{'+result+'}'
    #print type(new_result)
    #print new_result
    data=json.loads(new_result)
    use_data= data['list']
    host='https://xueqiu.com'
    for i in use_data:
        # Each entry's 'target' is appended to the host to form the article URL.
        url=host+ i['target']
        print url
        txt_content=session.get(url,headers=headers).text
        #print txt_content.text

        tree=etree.HTML(txt_content)
        # NOTE(review): `[0]` raises IndexError when the page has no <title>.
        title=tree.xpath('//title/text()')[0]

        # Replace characters that are not allowed in file names with '-'.
        # NOTE(review): non-raw pattern string containing the unnecessary
        # escape `\/`.
        filename = re.sub('[\/:*?"<>|]', '-', title)
        print filename

        content=tree.xpath('//div[@class="detail"]')
        # Write the source link first, then the article body.
        Toolkit.save2filecn(filename,"Link: %s\n\n" %url)
        # NOTE(review): this inner loop rebinds `i`, shadowing the outer loop
        # variable (harmless here because the outer `i` is not used afterwards).
        for i in content:
            # string(.) flattens the element, including nested tags, to text.
            Toolkit.save2filecn(filename, i.xpath('string(.)'))
        #print content
        #Toolkit.save2file(filename,)
        # Pause between article requests.
        time.sleep(10)
Beispiel #4
0
            # Pause one second, then download the article page as UTF-8 bytes.
            time.sleep(1)
            try:
                at_content = session.get(at_addr,
                                         headers=headers).text.encode("UTF-8")
            except:
                # NOTE(review): the bare except only prints the URL; the code
                # below then runs with a stale or undefined `at_content`.
                print at_addr
        # Article title: the text inside <h1 class="article__bd__title">.
        at_name = re.findall(
            '<h1 class="article__bd__title">(.*)</h1><div class="article__bd__detail">',
            at_content)
        # NOTE(review): `at_name[0]` raises IndexError when the title pattern
        # does not match.  '|' is replaced presumably because it is the column
        # separator of the row built below -- confirm.
        at_name[0] = at_name[0].replace('|', '--')
        # Text between `>$` and `$<`; the parsing below expects it to look
        # like "name(code)".
        cp_content = re.findall(r'\>\$(.*?)\$\<', at_content)
        # Placeholders used when no `$...$` span is found.
        cp_code = ['cd_none']
        cp_name = ['nm_none']
        if len(cp_content) > 0:
            # Greedy matches: code is the text from the first '(' to the last
            # ')', name is everything before the last '('.
            # NOTE(review): if the span has no parentheses both lists are
            # empty and the `[0]` lookups below raise IndexError.
            cp_code = re.findall(r'\((.*)\)', cp_content[0])
            cp_name = re.findall(r'(.*)\(', cp_content[0])
        cp_addr = cp_host + cp_code[0]

        # The bare string below is a no-op expression recording the column
        # layout of each row.
        'index  | name | code | at'
        # Append one row:
        #   zero-padded index | name | [code](cp_addr) | [title](at_addr)
        save_data = save_data + '\r\n' + str(index).zfill(5) + ' | ' + cp_name[
            0] + ' | [' + cp_code[0] + '](' + cp_addr.encode(
                "UTF-8") + ')' + ' | [' + at_name[0] + '](' + at_addr.encode(
                    "UTF-8") + ')'
        #print save_data
        #save_data = save_data + '\r\n' + str(index).zfill(5) + ' | [' + at_name[0] + '](' + at_addr.encode("UTF-8") + ')'
        time.sleep(1)
#print save_data
# Python 2-only workaround: reloading `sys` makes `setdefaultencoding`
# available again so the process-wide default str/unicode codec can be set to
# UTF-8 before the accumulated rows are written out.
# NOTE(review): neither `reload` (as a builtin) nor `sys.setdefaultencoding`
# exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Write all accumulated rows to the file named 'data'.
Toolkit.save2filecn('data', save_data)