Beispiel #1
0
def FormatHtml(f, idx):
        #results = d.find(id='bodyMainResults')
        resultDiv = SoupStrainer('div', id='bodyMainResults')
        res = BeautifulSoup(f, parseOnlyThese=resultDiv)

        #tables = res.findChildren('table', attrs={'class':'resultRow'})
        #tables = res.contents[0]
        tables = res.findChildren('table', attrs={'cellspacing':'0','cellpadding':'10'})

        for tab in tables:
                a = tab.find('a')
                link = a['href']
                span = a.findChild('span')
                #print span.contents
                #article = span.contents[0]  
                article = ' '.join([s.string for s in span.contents if s.string])
                iList = tab.findAll('i')
                journal = iList[0].contents[0]
                volumn = iList[1].contents[0]
                pubDate = iList[2].contents[0]
                pages = iList[3].contents[0]
                tds = [td for td in tab.contents]
                item = tds[1].find('td', attrs={'align':'left','width':'95%','colspan':'2'})
                author = item.contents[10]
                #td1 = tds[1]
                #author = td1.contents[10]
                ie = PAMIE()
                ie.navigate(link)
                ie.linkClick('References')
                #ie.quit()
                idx += 1
                print "[", idx, "]", "\n\t", link, "\n\t", article, "\n\t", author, "\n\t", journal, "\n\t", volumn, "\n\t", pages, "\n"
                #print "[", idx, "]", "\n\t", article, "\n\t", journal, "\n\t", volumn, "\n\t", pages, "\n"
        print "FETCH page, to ", idx
        return idx