def handle(self, **options):
    """Index "page" documents from the CouchDB changes feed, forever.

    Repeatedly asks ``settings.db`` for the changes after the last recorded
    sequence, fetches every changed document, and for documents whose
    ``type`` is ``"page"`` parses ``doc["content"]`` as HTML and passes the
    title, URL, meta description, rank and text content to the index writer.

    The loop never returns normally; it ends only when an exception
    (including ``KeyboardInterrupt``) propagates. The sequence number is
    persisted after each committed batch and once more on the way out.

    Args:
        **options: Accepted for interface compatibility; not used here.
    """
    # ``since`` only ever holds a sequence whose changes have been committed,
    # so an interruption in the middle of a batch makes that batch get
    # re-read on the next run instead of being silently skipped.
    since = get_last_change()
    writer = get_writer()
    try:
        while True:
            changes = settings.db.changes(since=since)
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    # The document can no longer be fetched (presumably it
                    # was deleted after the change was recorded).
                    continue
                if doc.get("type") != "page":
                    continue

                print("indexing %s" % doc["url"])
                soup = BeautifulSoup(doc["content"])
                if soup.body is None:
                    continue

                # Text nodes of <title>; empty when the tag is missing or
                # has no text. Computed once so that ``title`` and
                # ``content`` agree on how a missing title is handled.
                if soup.title is not None:
                    title_strings = soup.title(text=True)
                else:
                    title_strings = []
                if title_strings:
                    title = unicode(title_strings[0])
                else:
                    title = doc["url"]

                # A matching <meta> tag may lack a ``content`` attribute;
                # ``get`` returns None there instead of raising KeyError.
                desc = soup.findAll('meta', attrs={"name": desc_re})
                desc_content = desc[0].get("content") if desc else None
                if desc_content is not None:
                    description = unicode(desc_content)
                else:
                    description = u""

                # Searchable text: title (when present), URL, body text.
                content_parts = [doc["url"], "".join(soup.body(text=True))]
                if title_strings:
                    content_parts.insert(0, title_strings[0])

                writer.update_document(
                    title=title,
                    url=unicode(doc["url"]),
                    desc=description,
                    rank=doc["rank"],
                    content=unicode(u"\n".join(content_parts)),
                )

            writer.commit()
            # A fresh writer is obtained after every commit, as before.
            writer = get_writer()
            # Advance and persist the checkpoint only after the commit.
            since = changes["last_seq"]
            set_last_change(since)
    finally:
        set_last_change(since)
def handle(self, **options):
    """Index "page" documents from the CouchDB changes feed, forever.

    Repeatedly asks ``settings.db`` for the changes after the last recorded
    sequence, fetches every changed document, and for documents whose
    ``type`` is ``"page"`` parses ``doc["content"]`` as HTML and passes the
    title, URL, meta description, rank and text content to the index writer.

    The loop never returns normally; it ends only when an exception
    (including ``KeyboardInterrupt``) propagates. The sequence number is
    persisted after each committed batch and once more on the way out.

    Args:
        **options: Accepted for interface compatibility; not used here.
    """
    # ``since`` only ever holds a sequence whose changes have been committed,
    # so an interruption in the middle of a batch makes that batch get
    # re-read on the next run instead of being silently skipped.
    since = get_last_change()
    writer = get_writer()
    try:
        while True:
            changes = settings.db.changes(since=since)
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    # The document can no longer be fetched (presumably it
                    # was deleted after the change was recorded).
                    continue
                if doc.get("type") != "page":
                    continue

                print("indexing %s" % doc["url"])
                soup = BeautifulSoup(doc["content"])
                if soup.body is None:
                    continue

                # Text nodes of <title>; empty when the tag is missing or
                # has no text. Computed once so that ``title`` and
                # ``content`` agree on how a missing title is handled.
                if soup.title is not None:
                    title_strings = soup.title(text=True)
                else:
                    title_strings = []
                if title_strings:
                    title = unicode(title_strings[0])
                else:
                    title = doc["url"]

                # A matching <meta> tag may lack a ``content`` attribute;
                # ``get`` returns None there instead of raising KeyError.
                desc = soup.findAll('meta', attrs={"name": desc_re})
                desc_content = desc[0].get("content") if desc else None
                if desc_content is not None:
                    description = unicode(desc_content)
                else:
                    description = u""

                # Searchable text: title (when present), URL, body text.
                content_parts = [doc["url"], "".join(soup.body(text=True))]
                if title_strings:
                    content_parts.insert(0, title_strings[0])

                writer.update_document(
                    title=title,
                    url=unicode(doc["url"]),
                    desc=description,
                    rank=doc["rank"],
                    content=unicode(u"\n".join(content_parts)),
                )

            writer.commit()
            # A fresh writer is obtained after every commit, as before.
            writer = get_writer()
            # Advance and persist the checkpoint only after the commit.
            since = changes["last_seq"]
            set_last_change(since)
    finally:
        set_last_change(since)
Energy = [] Mu = [] Muen = [] table = soup.find('table') for row in table.findAll('tr'): col = row.findAll('td') if len(str(col).split()) == 3: Energy.append(col[0].find(text=True)) Mu.append(col[1].find(text=True)) Muen.append(col[2].find(text=True)) print col[1] plt.loglog(Energy, Mu, label='Mu') plt.loglog(Energy, Muen, label='Muen') plt.title(soup.title(text=True)) plt.legend() plt.show() URL = 'http://physics.nist.gov/PhysRefData/XrayMassCoef/ComTab/bone.html' response = urllib2.urlopen(URL) html = response.read() soup = BeautifulSoup(html) Energy = [] Mu = [] Muen = [] table = soup.find('table') for row in table.findAll('tr'): col = row.findAll('td') if len(str(col).split()) == 3: