def test_harvest(self):
    task = SampleHarvestChunk()
    luigi.build([task], local_scheduler=True)
    want_path = os.path.join(FIXTURES, 'sample_dnb_oai_response.xml')
    want = BeautifulSoup.BeautifulStoneSoup(open(want_path).read())
    got = BeautifulSoup.BeautifulStoneSoup(task.output().open().read())
    _, temp = tempfile.mkstemp()
    task.output().copy(temp)
    self.assertEquals(want.prettify(), got.prettify(),
                      msg='diff {} {}'.format(want_path, temp))
def get_rtmp_url(self, url_page, quality):
    page_soup = BS.BeautifulSoup(urllib2.urlopen(url_page).read())
    movie_object = page_soup.find(
        "object", classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000")
    movie = movie_object.find("param", {"name": "movie"})
    movie_url = "http" + self.unescape_xml(movie['value'].split("http")[-1])
    xml_soup = BS.BeautifulStoneSoup(urllib2.urlopen(movie_url).read())
    movie_url = xml_soup.find("video", {'lang': get_lang()})['ref']
    xml_soup = BS.BeautifulStoneSoup(urllib2.urlopen(movie_url).read())
    base_soup = xml_soup.find("urls")
    movie_url = base_soup.find("url", {"quality": quality}).string
    return movie_url
def testCategoryLinksInPosts(self):
    """Make sure category links in posts are correct"""
    main.main("init blog_unit_test")
    main.config.override_options = {
        "site.url": "http://www.yoursite.com",
        "blog.path": "/blog"
    }
    # Write a blog post with categories:
    src = """---
title: This is a test post
categories: Category 1, Category 2
date: 2009/08/16 00:00:00
---
This is a test post
"""
    f = open(os.path.join(self.build_path, "_posts", "01. Test post.html"), "w")
    f.write(src)
    f.close()
    main.main("build")
    # Open up one of the permapages:
    page = open(os.path.join(self.build_path, "_site", "blog", "2009",
                             "08", "16", "this-is-a-test-post",
                             "index.html")).read()
    soup = BeautifulSoup.BeautifulStoneSoup(page)
    print soup.findAll("a")
    assert soup.find("a", attrs={'href': '/blog/category/category-1'})
    assert soup.find("a", attrs={'href': '/blog/category/category-2'})
def getdate(thedate):
    url = "http://lishi.tianqi.com/heqing/" + thedate + ".html"
    print "getting the " + thedate
    r = requests.get(url)
    html_1 = urllib.urlopen(url).read()
    html = html_1.decode('gb2312', 'ignore').encode('utf-8')
    #besoup=BeautifulSoup(html.decode("utf-8","ignore") )
    soup = BeautifulSoup.BeautifulStoneSoup(html)
    name = thedate + ".txt"
    fw = open(name, "w")
    for ul in soup.find("div", {"class": "tqtongji2"}).findAll("ul"):
        # curday=[]
        for li in ul.findAll("li"):
            try:
                data = li.a.contents[0].decode("utf-8")
                #curday.append(data)
            except:
                data = li.contents[0].decode("utf-8")
                #curday.append(data)
            fw.write(data)
            fw.write("\t")
        # curmonth.append(curday)
        fw.write("\n")
    fw.close()
    print "finished the " + thedate
def main():
    import passcode
    key = passcode.code
    del passcode
    url = 'http://isbndb.com/api/books.xml?access_key=%s&index1=isbn&value1=%s'
    form = cgi.FieldStorage()
    try:
        isbn = form['isbn'].value
        value = True
    except KeyError:
        value = False
    if not value:
        printcontent(input_content)
        sys.exit()
    # use a name that does not shadow the builtin open()
    resp = urllib.urlopen(url % (key, isbn))
    text = resp.read()
    resp.close()
    template = cite_web(BS.BeautifulStoneSoup(text))
    content = """\
<h2>{{cite book}}</h2>
%s
<textarea>%s</textarea>
""" % (subpagelink, template)
    printcontent(content)
    del key
def _ids_received(self, message, callback, error_callback):
    if not message.status_code == Soup.KnownStatusCode.OK:
        error_callback('Pubmed replied with error code %d.' %
                       message.status_code)
    else:
        response_data = message.response_body.flatten().get_data()
        parsed_response = BeautifulSoup.BeautifulStoneSoup(response_data)
        # Check whether there were any hits at all
        if int(parsed_response.esearchresult.count.string) == 0:
            return  # Nothing to do anymore
        # Continue with a second request asking for the summaries
        web_env = parsed_response.esearchresult.webenv.string
        query_key = parsed_response.esearchresult.querykey.string
        log_debug('Continuing Pubmed query (downloading summaries)')
        query = BASE_URL + ESUMMARY_QUERY % (query_key, web_env)
        message = Soup.Message.new(method='GET', uri_string=query)

        def mycallback(session, message, user_data):
            self._summaries_received(message, callback, error_callback)

        soup_session.queue_message(message, mycallback, None)
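# A minimal, hedged sketch of the dotted-attribute navigation used in
# _ids_received above. BeautifulStoneSoup (BS3 builds on SGMLParser)
# lowercases tag names, which is why PubMed's <eSearchResult><Count> is
# reachable as soup.esearchresult.count. The response body below is a
# fabricated stub, not a real E-utilities payload.
import BeautifulSoup

def demo_esearch_navigation():
    stub = ('<eSearchResult><Count>2</Count><WebEnv>WE1</WebEnv>'
            '<QueryKey>1</QueryKey></eSearchResult>')
    soup = BeautifulSoup.BeautifulStoneSoup(stub)
    print int(soup.esearchresult.count.string)   # 2
    print soup.esearchresult.webenv.string       # WE1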
def main():
    urlList = open("seed.txt", "r").read().splitlines()
    allowDomainList = set(open("allowDomain.txt", "r").read().splitlines())
    readURL = set()
    while (urlList):
        url = urlList.pop(0)
        domain = urlparse.urlparse(url)[1]
        if not domain in allowDomainList:
            continue
        encodedURL = urllib.quote_plus(url)
        if encodedURL in readURL:
            continue
        readURL.add(encodedURL)
        # GET the URL
        try:
            urlpointer = urllib.urlopen(url)
            contentsType = urlpointer.headers["Content-Type"]
            if (contentsType.find("text/html") == -1 and
                    contentsType.find("text/xml") == -1):
                print "not html contents", contentsType
                continue
            data = urlpointer.read()
            filename = "./data/" + encodedURL
            fp = open(filename, "w")
            fp.write(data)
            fp.close()
            print url
        except:
            print "cantLoadContents"
            continue
        # Turn the document into soup
        try:
            soup = BeautifulSoup.BeautifulStoneSoup(
                unicode(data, "utf-8", "ignore"))
        except:
            print "cantCreateSoup"
            continue
        # Extract links
        for item in soup.findAll("a"):
            if item.has_key("href"):
                foundURL = urlparse.urljoin(url, item["href"])
                domain = urlparse.urlparse(foundURL)[1]
                if not domain in allowDomainList:
                    continue
                if urllib.quote_plus(foundURL) in readURL:
                    continue
                urlList.append(foundURL)
        time.sleep(1)
def getscale(xmlpath):
    soup = bs.BeautifulStoneSoup(open(xmlpath))
    xscale = float(soup.find('key', key="micronsPerPixel_XAxis")['value'])
    yscale = float(soup.find('key', key="micronsPerPixel_YAxis")['value'])
    return (xscale, yscale)
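# A hedged sketch of the find-by-attribute pattern getscale relies on: BS3's
# find() matches keyword arguments against tag attributes. The <key .../>
# markup below is an invented stand-in for the real instrument XML file.
import BeautifulSoup as bs

def demo_attribute_lookup():
    xml = ('<PVStateShard>'
           '<key key="micronsPerPixel_XAxis" value="0.5"/>'
           '<key key="micronsPerPixel_YAxis" value="0.5"/>'
           '</PVStateShard>')
    soup = bs.BeautifulStoneSoup(xml)
    print float(soup.find('key', key="micronsPerPixel_XAxis")['value'])  # 0.5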
def testFeedLinksAreURLs(self):
    """Make sure feed links are full URLs and not just paths"""
    main.main("init blog_unit_test")
    # Write a post to the _posts dir:
    permalink = "/blog/2009/08/16/test-post"
    src = """---
title: This is a test post
permalink: %(permalink)s
date: 2009/08/16 00:00:00
---
This is a test post
""" % {'permalink': permalink}
    f = open(os.path.join(self.build_path, "_posts", "01. Test post.html"), "w")
    f.write(src)
    f.close()
    main.config.override_options = {
        "site.url": "http://www.yoursite.com",
        "blog.path": "/blog",
        "blog.auto_permalink.enabled": True,
        "blog.auto_permalink.path": "/blog/:year/:month/:day/:title"
    }
    main.main("build")
    feed = open(os.path.join(self.build_path, "_site", "blog", "feed",
                             "index.xml")).read()
    soup = BeautifulSoup.BeautifulStoneSoup(feed)
    for link in soup.findAll("link"):
        assert(link.contents[0].startswith("http://"))
def scrapePdf(pdfdata, id=0):
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    soup = BeautifulSoup.BeautifulStoneSoup(pdfxml)
    soup = soup.findAll('text')
    left = {}
    left['nome'] = soup[0]['left']
    left['cargo'] = soup[1]['left']
    left['vinculo'] = soup[2]['left']
    left['cargo_comissao'] = soup[3]['left']
    left['jornada'] = soup[4]['left']
    left['sit_funcional'] = soup[5]['left']
    left['secretaria'] = soup[6]['left']
    left['un_orcamentaria'] = soup[7]['left']
    left['un_gestora'] = soup[8]['left']
    left['un_administrativa'] = soup[9]['left']
    left['municipio'] = soup[10]['left']
    for r in range(0, 11):
        print soup[r]
    for x in soup:
        if x['left'] == left['nome']:
            data = {}
            data['id'] = id
            data['nome'] = x.text
            data['cargo'] = ''
            data['vinculo'] = ''
            data['cargo_comissao'] = ''
            data['jornada'] = ''
            data['sit_funcional'] = ''
            data['secretaria'] = ''
            data['un_orcamentaria'] = ''
            data['un_gestora'] = ''
            data['un_administrativa'] = ''
        elif x['left'] == left['cargo'] and x.text:
            data['cargo'] = x.text
        elif x['left'] == left['vinculo'] and x.text:
            data['vinculo'] = x.text
        elif x['left'] == left['cargo_comissao'] and x.text:
            data['cargo_comissao'] = x.text
        elif x['left'] == left['jornada'] and x.text:
            data['jornada'] = x.text
        elif x['left'] == left['sit_funcional'] and x.text:
            data['sit_funcional'] = x.text
        # some columns get truncated in the test pdf and end up in this same <text>
        elif x['left'] == left['secretaria'] and x.text:
            s = x.text.split(' ')
            len_s = len(s)
            if len_s == 3:
                data['secretaria'] = s[0]
                data['un_orcamentaria'] = s[1]
                data['un_gestora'] = s[2]
            elif len_s == 2:
                data['secretaria'] = s[0]
                data['un_orcamentaria'] = s[1]
            else:
                data['secretaria'] = x.text
        elif x['left'] == left['un_orcamentaria'] and x.text:
            data['un_orcamentaria'] = x.text
        elif x['left'] == left['un_gestora'] and x.text:
            data['un_gestora'] = x.text
        elif x['left'] == left['un_administrativa'] and x.text:
            data['un_administrativa'] = x.text
        elif x['left'] == left['municipio'] and x.text:
            data['municipio'] = x.text
            scraperwiki.datastore.save(["id"], data)
            id = id + 1
        else:
            if x.text:
                print x['left'] + ' - Error: ' + x.text
    scraperwiki.sqlite.save_var('last_id', int(id))
def show_pitch(self):
    """Show the movie's summary.
    """
    dureeRE = re.compile('[^0-9]*([0-9]+)(mn|min)')
    idx = self.items.index(self.preview.selectedItems()[0])
    self.editor.clear()
    font = self.editor.font()
    if not self.videos[idx].pitch:
        try:
            datas = self.index[self.liststore[idx][1]]
        except KeyError:
            page = urllib2.urlopen(self.liststore[idx][2]).read()
            soup = BS.BeautifulSoup(page)
            base_node = soup.find('div', {"class": "recentTracksCont"})
            data_resume = u""
            for i in base_node.findAll('p'):
                if len(data_resume) != 0:
                    data_resume += "\n"
                try:
                    data_resume += BS.BeautifulStoneSoup(
                        i.string,
                        convertEntities=BS.BeautifulStoneSoup.HTML_ENTITIES
                    ).contents[0]
                    if i["class"] == "accroche":
                        data_resume += "\n"
                except:
                    pass
            try:
                time = dureeRE.search(page).group(1)
            except:
                time = "0"
            datas = (data_resume, time)
            self.index[self.liststore[idx][1]] = datas
        self.videos[idx].pitch = datas[0]
        self.videos[idx].time = datas[1]
    font.setPointSize(font.pointSize() + 1)
    font.setBold(True)
    self.editor.setCurrentFont(font)
    self.editor.append(self.videos[idx].title)
    font.setPointSize(font.pointSize() - 1)
    font.setBold(False)
    self.editor.setCurrentFont(font)
    t = "".join([
        self.videos[idx].date, u" durée : ", self.videos[idx].time, " min.\n"
    ])
    self.editor.append(t)
    self.editor.append(self.videos[idx].pitch)
    self.editor.verticalScrollBar().setValue(0)
    # Need to return False for drag and drop
    return False
def return_pwsid(self, sbmessage, stationid):
    res_xml = self.http_get_query(self.endpoints['WsCurrent'],
                                  {'ID': stationid})
    parsed_res = BeautifulSoup.BeautifulStoneSoup(res_xml)
    res = parsed_res.current_observation.location.full.string + ' '
    res = self.parse_wunderground_respone(parsed_res, res)
    sbmessage.respond(res.encode('latin-1'))
def get_day(self, channel_id, day):
    """Fetch the listings for a single day."""
    url = "http://hz.tvsou.com/jm/hw/hw8901.asp?id=%s&Date=%s" % (
        channel_id, day.strftime("%Y-%m-%d"))
    content = self.get_content(url)
    content = content.decode('gb18030').encode('utf-8')
    self.xml_content = BeautifulSoup.BeautifulStoneSoup(
        content, fromEncoding="utf-8")
    programs = []
    items = self.xml_content.findAll('c')
    for item in items:
        s_time = item.find('pt')
        program_title = item.find('pn')
        tvsou_tags = item.find('pp')
        fid2 = item.find('fid2').string
        fid = item.find('fid').string
        if fid2 and fid2 != '0':
            tvsou_wiki_id = fid2
        elif fid and fid != '0':
            tvsou_wiki_id = fid
        else:
            tvsou_wiki_id = '0'
        if program_title:
            s_time = s_time.string
            s_time = time.strptime(s_time, "%Y-%m-%d %H:%M:%S")
            program_title = program_title.string
            program_title = unescape(program_title)
            program_title = program_title.replace("(本节目表由搜视网提供)", "")
            program_title = program_title.strip()
            # Map TVSOU attribute codes to Chinese labels
            tvsou_tags = tvsou_tags.string
            tags = []
            if tvsou_tags:
                tvsou_tags = tvsou_tags.split(',[]],[')
                for tvsou_tag in tvsou_tags:
                    tvsou_tag = tvsou_tag.strip()
                    if tvsou_properties.has_key(tvsou_tag):
                        tags.append(tvsou_properties[tvsou_tag])
            wiki = {}
            if tvsou_wiki_id and tvsou_wiki_id != '0':
                wiki['tvsou_id'] = str(tvsou_wiki_id)
            program = {
                "stime": time.strftime("%H:%M", s_time),
                "title": program_title.strip(),
                "date": time.strftime("%Y-%m-%d", s_time),
                "tags": tags,
                "wiki": wiki
            }
            programs.append(program)
    return programs
def _chefjivevalleypig(self, irc, type, s):
    params = urlencode(dict(input=s, type=type))
    url = 'http://www.cs.utexas.edu/users/jbc/bork/bork.cgi?' + params
    resp = web.getUrl(url, headers=HEADERS)
    resp = re.sub('&(ampway|emp);', '&', resp)
    resp = BS.BeautifulStoneSoup(
        resp,
        convertEntities=BS.BeautifulStoneSoup.HTML_ENTITIES).contents[0]
    resp = re.sub('\n', ' ', resp)
    irc.reply(resp.encode('utf-8', 'ignore').strip())
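# A minimal sketch of the convertEntities trick used in _chefjivevalleypig:
# with HTML_ENTITIES, BeautifulStoneSoup decodes entities at parse time, so
# contents[0] is already clean unicode text. The sample string is made up.
import BeautifulSoup as BS

def demo_entity_decoding():
    raw = 'bork &amp; bork &quot;bork&quot;'
    soup = BS.BeautifulStoneSoup(
        raw, convertEntities=BS.BeautifulStoneSoup.HTML_ENTITIES)
    print soup.contents[0]  # u'bork & bork "bork"'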
def wunderground(self, sbmessage):
    res_xml = self.http_get_query(self.endpoints['GeoCurrent'],
                                  {'query': sbmessage.arguments})
    parsed_res = BeautifulSoup.BeautifulStoneSoup(res_xml)
    res = parsed_res.current_observation.display_location.full.string + ' '
    res = self.parse_wunderground_respone(parsed_res, res)
    sbmessage.respond(res.encode('latin-1'))
def _get_doc_from_xml(page):
    try:
        try:
            doc = minidom.parseString(page.strip().encode('utf-8'))
        except UnicodeDecodeError:
            doc = minidom.parseString(page.strip())
        lookup_function = _lookup_xml_from_dom
    except ExpatError, e:
        doc = BeautifulSoup.BeautifulStoneSoup(page)
        lookup_function = _lookup_xml_from_soup
    # The original snippet is truncated here; presumably it hands back the
    # parsed document together with the matching lookup helper:
    return doc, lookup_function
def update_restaurants(db, restaurants, restaurant_url):
    this_version = str(uuid4())
    print "This Version: " + this_version
    print "Restaurant Count: " + str(len(restaurants))
    for restaurant in restaurants:
        restaurant_id = restaurant['value']
        print str(restaurants.index(restaurant)) + "\t" + time.asctime(
            time.gmtime()) + "\t" + restaurant_id
        # get restaurant data
        restaurant_full_url = restaurant_url % restaurant_id
        restaurant_page = urllib2.urlopen(restaurant_full_url)
        restaurant_xml = BeautifulSoup.BeautifulStoneSoup(restaurant_page)
        restaurant_data = restaurant_xml.markers.marker
        doc = db.get(restaurant_id)
        if not doc:
            doc = {}
        # make document
        doc['name'] = restaurant.contents[0]
        if (float(restaurant_data['lng'] or False) and
                float(restaurant_data['lat'] or False)):
            doc['loc'] = [
                round(float(restaurant_data['lng']), 5),
                round(float(restaurant_data['lat']), 5)
            ]
        doc['telephone'] = restaurant_data['bookingtel']
        doc['url'] = restaurant_data['restaurantwebsite']
        doc['cuisine'] = restaurant_data['cuisine']
        doc['availability'] = availability(restaurant_data['availablefri'],
                                           restaurant_data['availablefriday'],
                                           restaurant_data['availablesat'],
                                           restaurant_data['availablesatday'],
                                           restaurant_data['availabledec'])
        doc['limitations'] = limitations(
            restaurant_data['restaurantphonebookings'],
            restaurant_data['restaurantcarduse'],
            restaurant_data['restaurantcardusetype'],
            restaurant_data['restaurantmaxpeople'],
        )
        doc['version'] = this_version
        db[restaurant_id] = doc  # create/update
        # attach image to completed document
        image = urllib2.urlopen(config['imageurl'] % restaurant_data['image'])
        file = str(image.read())
        db.put_attachment(db[restaurant_id], file, restaurant_data['image'])
        time.sleep(config['delay'])  # prevent server swamping
    if config['delete']:
        delete_old(db, this_version)
def wu_pws(self, sbmessage):
    stations = self.http_get_query(self.endpoints['GeoLookup'],
                                   {'query': sbmessage.arguments})
    stations_parsed = BeautifulSoup.BeautifulStoneSoup(stations)
    nearby_stations = stations_parsed.location.nearby_weather_stations
    station_id = nearby_stations.pws.station.id.string
    station_id = station_id.replace('<![CDATA[', '')
    station_id = station_id.replace(']]>', '')
    self.return_pwsid(sbmessage, station_id)
def get_config_info(xml):
    soup = bs.BeautifulStoneSoup(open(xml, 'rb'))
    info = {}
    info['xsize'] = int(soup.find('key', key='pixelsPerLine')['value'])
    info['ysize'] = int(soup.find('key', key='linesPerFrame')['value'])
    info['xmpp'] = float(
        soup.find('key', key='micronsPerPixel_XAxis')['value'])
    info['ympp'] = float(
        soup.find('key', key='micronsPerPixel_YAxis')['value'])
    widths = soup('key', key='positionCurrent_ZAxis')
    info['z_width'] = abs(
        float(widths[1]['value']) - float(widths[0]['value']))
    return info
def getlyrics(artistname, songname):
    # make names lowercase for folders and remove trailing newlines
    artistname = mReplace(artistname.strip(), {'\'': '', '(Live)': ''})
    songname = mReplace(songname.strip(), {'\'': '', '(Live)': ''})
    # set lyrics folder; the folder used by the default rhythmbox lyrics
    # plugin is ~/.lyrics
    artistfolder = os.path.join(lyricsfolder, ''.join(
        c for c in artistname[:128].lower() if c in validChars))
    # check if lyrics folder exists, if not then create it
    if not os.path.isdir(lyricsfolder):
        if verbose:
            print "Lyrics folder: %s doesn't exist. Creating it..." % lyricsfolder
        os.mkdir(lyricsfolder)
    lyricfile = os.path.join(artistfolder, ''.join(
        c for c in songname[:128].lower() if c in validChars) + '.lyric')
    # make the names ready for the intertubes urls
    # check if the lyric file already exists
    if os.path.isfile(lyricfile) == False:
        lyrics = fetchlyrics(artistname, songname)
        if lyrics:
            # remove html entities
            lyrics = str(BeautifulSoup.BeautifulStoneSoup(
                lyrics,
                convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES))
            # check if the artist folder exists, if not then create it
            if not os.path.isdir(artistfolder):
                if verbose:
                    print "Artist folder: %s doesn't exist. Creating it..." % artistfolder
                os.mkdir(artistfolder)
            # write the lyrics to their appropriate file
            f = file(lyricfile, 'w')
            f.write(lyrics)
            f.close()  # close the write handle so the file is flushed before rereading
            f = file(lyricfile, 'r')
            lyrics = mReplace(f.read(), wordDict).split('\n')
            if verbose:
                print "Found lyrics. Writing to %s" % (lyricfile)
            printlyrics(lyrics)
            f.close()
            return True
        else:
            # append the info to the unfound list
            f = file(lyricsfolder + "/missingsongs.txt", 'a')
            f.write(artistname + " : " + songname + "\n")
            f.close()
            if verbose:
                print "Failed to find lyrics for song: %s : %s" % (artistname, songname)
            return False
    else:
        if verbose:
            print "Lyrics file already exists for: %s : %s" % (artistname, songname)
        f = file(lyricfile, 'r')
        lyrics = mReplace(f.read(), wordDict).split('\n')
        printlyrics(lyrics)
        f.close()
        return True
def getXMLValue(app, xml_str, tag):
    rtn = ''
    try:
        soup = BeautifulSoup.BeautifulStoneSoup(xml_str)
    except:
        app.response.out.write('err: could not parse ' + xml_str)
        return rtn
    node = soup.find(tag)
    if not node:
        app.response.out.write('err: could not find ' + tag + '\n')
    else:
        rtn = node.string
    return rtn
def getEachPage(self, html):
    soup = BeautifulSoup.BeautifulStoneSoup(html)
    paimai = soup.findAll('img', {"class": "vipicbg"})
    titles = []
    hrefs = []
    for each in paimai:
        title = each.get('alt')
        href = each.get('src')
        titles.append(title)
        hrefs.append(href)
    timess = soup.findAll('div', {"class": "img"})
    times = []
    for each in timess:
        time = each.nextSibling.nextSibling
        time = time.string
        times.append(time)
    return titles, times, hrefs
def get_random_pmids(sample_size, email, query, seed=None):
    # Do an initial query to get the total number of hits
    url = url_template.format(random_index=1, query=query, email=email)
    r = requests.get(url)
    initial_response = r.text
    soup = BeautifulSoup.BeautifulStoneSoup(initial_response)
    translated_query = soup.querytranslation.string
    population_size = int(soup.esearchresult.count.string)
    print "Double-check PubMed's translation of your query: %s" % translated_query
    print "Number of PMIDs returned by this query: %i" % population_size
    print "Off to randomly sample %i of them!" % sample_size
    if seed:
        random.seed(seed)
        print "Seed has been set before sampling."
    # do this as an re because it is simple and fast
    pmid_pattern = re.compile("<Id>(?P<pmid>\d+)</Id>")
    if sample_size > population_size:
        print "sample size is bigger than population size, so using population size"
        sample_size = population_size
    random_indexes = random.sample(range(1, population_size), sample_size)
    pmids = []
    for random_index in random_indexes:
        r = get_nth_pmid(random_index, query, email)
        try:
            pmid = pmid_pattern.search(r.text).group("pmid")
        # hope this is transient, try the random number + 1
        except AttributeError:
            print "got an error extracting pmid, trying again with subsequent index"
            r = get_nth_pmid(random_index + 1, query, email)
            pmid = pmid_pattern.search(r.text).group("pmid")
        print "pmid:" + pmid
        pmids.append(pmid)
        # NCBI requests no more than three requests per second, see
        # http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen
        # note: 1 / 3 is integer division (0) in Python 2, so use a float
        time.sleep(1 / 3.0)
    return pmids
def find_lyrics(self, track):
    try:
        (artist, title) = track.get_tag_raw('artist')[0].encode("utf-8"), \
            track.get_tag_raw('title')[0].encode("utf-8")
    except TypeError:
        raise LyricsNotFoundException
    if not artist or not title:
        raise LyricsNotFoundException
    artist = urllib.quote(artist.replace(' ', '_'))
    title = urllib.quote(title.replace(' ', '_'))
    url = 'http://lyrics.wikia.com/wiki/%s:%s' % (artist, title)
    try:
        html = common.get_url_contents(url, self.user_agent)
    except:
        raise LyricsNotFoundException
    try:
        soup = BeautifulSoup.BeautifulSoup(html)
    except HTMLParser.HTMLParseError:
        raise LyricsNotFoundException
    lyrics = soup.findAll(attrs={"class": "lyricbox"})
    if lyrics:
        lyrics = re.sub(
            r' Send.*?Ringtone to your Cell ', '',
            '\n'.join(
                self.remove_div(lyrics[0].renderContents().replace(
                    '<br />', '\n')).replace('\n\n\n', '').split('\n')[0:-7]))
    else:
        raise LyricsNotFoundException
    lyrics = self.remove_script(lyrics)
    lyrics = self.remove_html_tags(
        unicode(
            BeautifulSoup.BeautifulStoneSoup(
                lyrics,
                convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES)))
    return (lyrics, self.name, url)
def get_config_info(xml):
    soup = bs.BeautifulStoneSoup(open(xml, 'rb'))
    info = {}
    info['xsize'] = int(soup.find('key', key='pixelsPerLine')['value'])
    info['ysize'] = int(soup.find('key', key='linesPerFrame')['value'])
    info['xmpp'] = float(
        soup.find('key', key='micronsPerPixel_XAxis')['value'])
    info['ympp'] = float(
        soup.find('key', key='micronsPerPixel_YAxis')['value'])
    if os.path.splitext(xml)[1] == '.xml':
        widths = soup('key', key='positionCurrent_ZAxis')
        info['z_width'] = abs(
            float(widths[1]['value']) - float(widths[0]['value']))
    # elif os.path.splitext(xml)[1] == '.cfg':
    #     width = float(soup('key', key='motorStepSize_ZAxis')[0]['value'])
    #     info['z_width'] = abs(width)
    else:
        info['z_width'] = 1
    return info
def run(self):
    stopover = tempfile.mkdtemp(prefix='gluish-')
    oai_harvest(url=self.url, begin=self.begin, end=self.end,
                prefix=self.prefix, directory=stopover,
                collection=self.collection, delay=self.delay)
    with self.output().open('w') as output:
        output.write("""<collection xmlns="http://www.openarchives.org/OAI/2.0/"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
""")
        for path in iterfiles(stopover):
            with open(path) as handle:
                soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
                for record in soup.findAll('record'):
                    output.write(str(record))  # or unicode?
        output.write('</collection>\n')
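# A hedged sketch of the record re-serialisation step in run() above:
# findAll('record') returns Tag objects, and str() on a Tag renders it back
# to markup, so writing the tags out one by one rebuilds the collection.
# The sample XML is invented, not real OAI output.
import BeautifulSoup

def demo_record_copy():
    xml = ('<collection><record><id>1</id></record>'
           '<record><id>2</id></record></collection>')
    soup = BeautifulSoup.BeautifulStoneSoup(xml)
    print ''.join(str(record) for record in soup.findAll('record'))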
def metrolyrics(artist, title):
    artist = urllib.quote(artist.lower().replace(' ', '-'))
    title = urllib.quote(title.lower().replace(' ', '-'))
    if verbose:
        print "Trying to fetch lyrics from metrolyrics.com"
    try:
        lyrics = urllib.urlopen("http://www.metrolyrics.com/%s-lyrics-%s.html"
                                % (title, artist))
    except:
        if verbose:
            print "Could not connect to metrolyrics.com. Exiting..."
        return
    text = lyrics.read()
    # beautifulsoup chokes on this particular tag so we have to get rid of it
    text = text.replace('</sc"+"ript>"', '')
    soup = BeautifulSoup.BeautifulSoup(text)
    lyrics = soup.findAll(attrs={"id": "lyrics"})
    if not lyrics:
        if verbose:
            print "Lyrics not found at metrolyrics.com"
        return
    else:
        # this removes formatting and converts from html entities
        return '\n'.join(map(lambda x: x.strip(), remove_html_tags(unicode(
            BeautifulSoup.BeautifulStoneSoup(
                lyrics[0].renderContents(),
                convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES)
        ))[2:].replace('\r', '\n').split('\n'))[:-2])
def _summaries_received(self, message, callback, error_callback):
    if not message.status_code == Soup.KnownStatusCode.OK:
        error_callback('Pubmed replied with error code %d.' %
                       message.status_code)
    else:
        response_data = message.response_body.flatten().get_data()
        parsed_response = BeautifulSoup.BeautifulStoneSoup(response_data)
        # get information for all documents
        documents = parsed_response.esummaryresult.findAll('docsum')
        papers = []
        for document in documents:
            info = {}
            # Extract information
            info['pubmed_id'] = str(document.id.string)
            # This is needed for retrieving the paper in
            # import_paper_after_search
            info['data'] = info['pubmed_id']
            doi = document.findAll('item', {'name': 'doi'})
            if doi:
                info['doi'] = doi[0].string
                info['import_url'] = 'http://dx.doi.org/' + info['doi']
            info['title'] = document.findAll('item', {'name': 'Title'})[0].string
            info['authors'] = [str(author.string) for author in
                               document.findAll('item', {'name': 'Author'})]
            info['journal'] = document.findAll(
                'item', {'name': 'FullJournalName'})[0].string
            pubdate = document.findAll('item', {'name': 'PubDate'})
            if pubdate and pubdate[0]:
                info['year'] = pubdate[0].string[:4]
            # TODO: Retrieve abstract
            papers.append(info)
        callback(papers)
def get_top_K_pages(phrase, K):
    """
    In which we coax a mighty search engine into giving us what we want.
    TODO:
    References:
    - http://en.wikibooks.org/wiki/Python_Programming/Internet
    - http://docs.python.org/library/urllib2.html
    """
    global W, T_to_be_visited
    # TODO: use urllib.quote instead of str.replace
    search_url = yahoo_url % (phrase.replace(' ', '+'), str(K))
    # Sleep for a few seconds, just in case we are calling the search
    # engine too frequently
    time.sleep(search_lag_time)
    search_results = urllib2.urlopen(urllib2.Request(search_url, None, headers))
    clickurls = BeautifulSoup.SoupStrainer('clickurl')
    results_soup = BeautifulSoup.BeautifulStoneSoup(search_results,
                                                    parseOnlyThese=clickurls)
    logging.debug('Search results: ' + results_soup.prettify())
    # order of W is not important at the moment
    W = set([link.string for link in results_soup.findAll('clickurl')])
    T_to_be_visited = list(W.copy())
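# A minimal sketch of the SoupStrainer/parseOnlyThese optimisation used in
# get_top_K_pages: the strainer tells the parser to keep only matching tags,
# which saves time and memory on large responses. The payload below is a
# fabricated stand-in for a real search-engine response.
import BeautifulSoup

def demo_soupstrainer():
    xml = ('<resultset>'
           '<result><clickurl>http://example.com/a</clickurl>'
           '<title>skipped</title></result>'
           '<result><clickurl>http://example.com/b</clickurl></result>'
           '</resultset>')
    clickurls = BeautifulSoup.SoupStrainer('clickurl')
    soup = BeautifulSoup.BeautifulStoneSoup(xml, parseOnlyThese=clickurls)
    print [link.string for link in soup.findAll('clickurl')]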
def lyricsmode(artist, title):
    artist = urllib.quote(artist.lower().replace(' ', '_'))
    title = urllib.quote(title.lower().replace(' ', '_'))
    if verbose:
        print "Trying to fetch lyrics from lyricsmode.com"
    try:
        lyrics = urllib.urlopen('http://www.lyricsmode.com/lyrics/%s/%s/%s.html'
                                % (artist[0], artist, title))
    except:
        if verbose:
            print "Could not connect to lyricsmode.com. Exiting..."
        return
    text = lyrics.read().decode('latin-1').replace(u'\xb7', '')
    soup = BeautifulSoup.BeautifulSoup(text)
    # lyricsmode places the lyrics in a span with an id of "lyrics"
    lyrics = soup.findAll(attrs={"id": "lyrics"})
    if not lyrics:
        if verbose:
            print "Lyrics not found at lyricsmode.com"
        return []
    else:
        # this function removes formatting and converts html entities into
        # ascii since lyricsmode obfuscates the lyrics
        return remove_html_tags(unicode(BeautifulSoup.BeautifulStoneSoup(
            lyrics[0].renderContents(),
            convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES
        )).replace('<br />', '\n').strip('\r\n\t\t'))