def FormatHtml(f, idx): #results = d.find(id='bodyMainResults') resultDiv = SoupStrainer('div', id='bodyMainResults') res = BeautifulSoup(f, parseOnlyThese=resultDiv) #tables = res.findChildren('table', attrs={'class':'resultRow'}) #tables = res.contents[0] tables = res.findChildren('table', attrs={'cellspacing':'0','cellpadding':'10'}) for tab in tables: a = tab.find('a') link = a['href'] span = a.findChild('span') #print span.contents #article = span.contents[0] article = ' '.join([s.string for s in span.contents if s.string]) iList = tab.findAll('i') journal = iList[0].contents[0] volumn = iList[1].contents[0] pubDate = iList[2].contents[0] pages = iList[3].contents[0] tds = [td for td in tab.contents] item = tds[1].find('td', attrs={'align':'left','width':'95%','colspan':'2'}) author = item.contents[10] #td1 = tds[1] #author = td1.contents[10] ie = PAMIE() ie.navigate(link) ie.linkClick('References') #ie.quit() idx += 1 print "[", idx, "]", "\n\t", link, "\n\t", article, "\n\t", author, "\n\t", journal, "\n\t", volumn, "\n\t", pages, "\n" #print "[", idx, "]", "\n\t", article, "\n\t", journal, "\n\t", volumn, "\n\t", pages, "\n" print "FETCH page, to ", idx return idx