def parse_citmap(doc):
    """Extract the citation map from a citation-map HTML page.

    The page is parsed with ``BeautifulSoup``.  Links are collected from the
    ``<div id="colFirst">`` element (reported under ``"citing"``) and from
    the ``<div id="colSecond">`` element (reported under ``"citedby"``).

    Only anchors whose ``href`` contains ``"articleDetails"`` are considered;
    each such ``href`` is converted with ``urltoarnumber`` (defined elsewhere
    in this module -- presumably yielding an article number; verify against
    that helper).

    :param doc: HTML markup, passed as-is to ``BeautifulSoup``.
    :returns: ``{"citing": [...], "citedby": [...]}``; each list holds the
        converted values without duplicates, in first-seen order.  A column
        that is absent from the page yields an empty list.
    """

    def get_docs(sp):
        """Return the de-duplicated converted links found under *sp*."""
        out = []
        if sp is None:
            # The column div is missing from this page: nothing to collect.
            return out
        for anchor in sp.findAll("a"):
            try:
                href = anchor["href"]
                if "articleDetails" not in href:
                    continue
                arnumber = urltoarnumber(href)
            except Exception:
                # Best-effort scrape: skip anchors without an href or with a
                # link the converter cannot handle.  Catching Exception
                # (rather than a bare except) lets KeyboardInterrupt and
                # SystemExit propagate instead of being silently swallowed.
                continue
            # List membership keeps first-seen order and makes no assumption
            # about the converted value being hashable.
            if arnumber not in out:
                out.append(arnumber)
        return out

    soup = BeautifulSoup(doc)
    citing = get_docs(soup.find("div", {'id': 'colFirst'}))
    citedby = get_docs(soup.find("div", {'id': 'colSecond'}))
    return dict(citing=citing, citedby=citedby)