Example #1
    def test_harvest(self):
        task = SampleHarvestChunk()
        luigi.build([task], local_scheduler=True)
        want_path = os.path.join(FIXTURES, 'sample_dnb_oai_response.xml')
        want = BeautifulSoup.BeautifulStoneSoup(open(want_path).read())
        got = BeautifulSoup.BeautifulStoneSoup(task.output().open().read())
        _, temp = tempfile.mkstemp()
        task.output().copy(temp)
        self.assertEqual(want.prettify(),
                         got.prettify(),
                         msg='diff {} {}'.format(want_path, temp))
Example #2
    def get_rtmp_url(self, url_page, quality):
        page_soup = BS.BeautifulSoup(urllib2.urlopen(url_page).read())

        movie_object = page_soup.find("object", classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000")
        movie = movie_object.find("param", {"name": "movie"})
        movie_url = "http" + self.unescape_xml(movie['value'].split("http")[-1])

        # the flash player's XML playlist: pick the <video> matching the user's language
        xml_soup = BS.BeautifulStoneSoup(urllib2.urlopen(movie_url).read())
        movie_url = xml_soup.find("video", {'lang': get_lang()})['ref']

        # that video's XML description: pick the <url> matching the requested quality
        xml_soup = BS.BeautifulStoneSoup(urllib2.urlopen(movie_url).read())
        base_soup = xml_soup.find("urls")
        movie_url = base_soup.find("url", {"quality": quality}).string
        return movie_url
Example #3
    def testCategoryLinksInPosts(self):
        """Make sure category links in posts are correct"""
        main.main("init blog_unit_test")
        main.config.override_options = {
            "site.url":"http://www.yoursite.com",
            "blog.path":"/blog"
            }
        #Write a blog post with categories:
        src = """---
title: This is a test post
categories: Category 1, Category 2
date: 2009/08/16 00:00:00
---
This is a test post
"""
        f = open(os.path.join(self.build_path,"_posts","01. Test post.html"),"w")
        f.write(src)
        f.close()
        main.main("build")
        #Open up one of the permapages:
        page = open(os.path.join(self.build_path,"_site","blog","2009",
                                 "08","16","this-is-a-test-post","index.html")).read()
        soup = BeautifulSoup.BeautifulStoneSoup(page)
        print soup.findAll("a")
        assert soup.find("a",attrs={'href':'/blog/category/category-1'})
        assert soup.find("a",attrs={'href':'/blog/category/category-2'})
Example #4
def getdate(thedate):

    url = "http://lishi.tianqi.com/heqing/" + thedate + ".html"
    print "getting the " + thedate
    html_1 = urllib.urlopen(url).read()

    # the page is served as gb2312; re-encode it as utf-8 before parsing
    html = html_1.decode('gb2312', 'ignore').encode('utf-8')
    soup = BeautifulSoup.BeautifulStoneSoup(html)

    name = thedate + ".txt"
    fw = open(name, "w")
    for ul in soup.find("div", {"class": "tqtongji2"}).findAll("ul"):
        for li in ul.findAll("li"):
            try:
                # cells that link somewhere wrap their text in an <a> tag
                data = li.a.contents[0]
            except (AttributeError, IndexError):
                data = li.contents[0]
            fw.write(data.encode("utf-8"))
            fw.write("\t")
        fw.write("\n")

    fw.close()
    print "finished the " + thedate
Example #5
def main():
    import passcode
    key = passcode.code
    del passcode
    url = 'http://isbndb.com/api/books.xml?access_key=%s&index1=isbn&value1=%s'
    form = cgi.FieldStorage()
    try:
        isbn = form['isbn'].value
        value = True
    except KeyError:
        value = False
    if not value:
        printcontent(input_content)
        sys.exit()
    response = urllib.urlopen(url % (key, isbn))  # do not shadow the builtin open()
    text = response.read()
    response.close()
    template = cite_web(BS.BeautifulStoneSoup(text))
    content = """\
    <h2>{{cite book}}</h2>
    %s
    <textarea>%s</textarea>
    """ % (subpagelink, template)
    printcontent(content)
    del key
Example #6
    def _ids_received(self, message, callback, error_callback):

        if not message.status_code == Soup.KnownStatusCode.OK:
            error_callback('Pubmed replied with error code %d.' %
                           message.status_code)
        else:
            response_data = message.response_body.flatten().get_data()
            parsed_response = BeautifulSoup.BeautifulStoneSoup(response_data)

            # Check whether there were any hits at all
            if int(parsed_response.esearchresult.count.string) == 0:
                return  # Nothing to do anymore

            # Continue with a second request asking for the summaries
            web_env = parsed_response.esearchresult.webenv.string
            query_key = parsed_response.esearchresult.querykey.string
            log_debug('Continuing Pubmed query (downloading summaries)')
            query = BASE_URL + ESUMMARY_QUERY % (query_key, web_env)

            message = Soup.Message.new(method='GET', uri_string=query)

            def mycallback(session, message, user_data):
                self._summaries_received(message, callback, error_callback)

            soup_session.queue_message(message, mycallback, None)
Example #7
def main():
    urlList = open("seed.txt", "r").read().splitlines()
    allowDomainList = set(open("allowDomain.txt", "r").read().splitlines())
    readURL = set()

    while (urlList):
        url = urlList.pop(0)
        domain = urlparse.urlparse(url)[1]
        if not domain in allowDomainList:
            continue

        encodedURL = urllib.quote_plus(url)
        if encodedURL in readURL:
            continue

        readURL.add(encodedURL)

        # fetch the URL
        try:
            urlpointer = urllib.urlopen(url)
            contentsType = urlpointer.headers["Content-Type"]
            if (contentsType.find("text/html") == -1
                    and contentsType.find("text/xml") == -1):
                print "not html contents", contentsType
                continue

            data = urlpointer.read()
            filename = "./data/" + encodedURL

            fp = open(filename, "w")
            fp.write(data)
            fp.close()
            print url
        except:
            print "cantLoadContents"
            continue

        # parse into a soup object
        try:
            soup = BeautifulSoup.BeautifulStoneSoup(
                unicode(data, "utf-8", "ignore"))
        except:
            print "cantCreateSoup"
            continue

        #extract links
        for item in soup.findAll("a"):
            if item.has_key("href"):
                foundURL = urlparse.urljoin(url, item["href"])
                domain = urlparse.urlparse(foundURL)[1]

                if not domain in allowDomainList:
                    continue

                if urllib.quote_plus(foundURL) in readURL:
                    continue

                urlList.append(foundURL)

        time.sleep(1)
Example #8
def getscale(xmlpath):

    soup = bs.BeautifulStoneSoup(open(xmlpath))
    xscale = float(soup.find('key', key="micronsPerPixel_XAxis")['value'])
    yscale = float(soup.find('key', key="micronsPerPixel_YAxis")['value'])

    return (xscale, yscale)
Example #9
    def testFeedLinksAreURLs(self):
        """Make sure feed links are full URLs and not just paths"""
        main.main("init blog_unit_test")
        #Write a post to the _posts dir:
        permalink = "/blog/2009/08/16/test-post"
        src = """---
title: This is a test post
permalink: %(permalink)s
date: 2009/08/16 00:00:00
---
This is a test post
""" %{'permalink':permalink}
        f = open(os.path.join(self.build_path,"_posts","01. Test post.html"),"w")
        f.write(src)
        f.close()
        main.config.override_options = {
            "site.url":"http://www.yoursite.com",
            "blog.path":"/blog",
            "blog.auto_permalink.enabled": True,
            "blog.auto_permalink.path": "/blog/:year/:month/:day/:title" }
        main.main("build")
        feed = open(os.path.join(self.build_path,"_site","blog","feed",
                                 "index.xml")).read()
        soup = BeautifulSoup.BeautifulStoneSoup(feed)
        for link in soup.findAll("link"):
            assert(link.contents[0].startswith("http://"))
Example #10
def scrapePdf(pdfdata, id=0):
    pdfxml = scraperwiki.pdftoxml(pdfdata)
    soup = BeautifulSoup.BeautifulStoneSoup(pdfxml)
    soup = soup.findAll('text')
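    # the first eleven <text> nodes are the header row; remember each column's
    # 'left' pixel offset so later cells can be matched to columns by position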
    left = {}
    left['nome'] = soup[0]['left']
    left['cargo'] = soup[1]['left']
    left['vinculo'] = soup[2]['left']
    left['cargo_comissao'] = soup[3]['left']
    left['jornada'] = soup[4]['left']
    left['sit_funcional'] = soup[5]['left']
    left['secretaria'] = soup[6]['left']
    left['un_orcamentaria'] = soup[7]['left']
    left['un_gestora'] = soup[8]['left']
    left['un_administrativa'] = soup[9]['left']
    left['municipio'] = soup[10]['left']
    for r in range(0,11):
        print soup[r]
    for x in soup:
        if x['left'] == left['nome']:
            data = {}
            data['id'] = id
            data['nome'] = x.text
            data['cargo'] = ''
            data['vinculo'] = ''
            data['cargo_comissao'] = ''
            data['jornada'] = ''
            data['sit_funcional'] = ''
            data['secretaria'] = ''
            data['un_orcamentaria'] = ''
            data['un_gestora'] = ''
            data['un_administrativa'] = ''
        elif x['left'] == left['cargo'] and x.text: data['cargo'] = x.text
        elif x['left'] == left['vinculo'] and x.text: data['vinculo'] = x.text
        elif x['left'] == left['cargo_comissao'] and x.text: data['cargo_comissao'] = x.text
        elif x['left'] == left['jornada'] and x.text: data['jornada'] = x.text
        elif x['left'] == left['sit_funcional'] and x.text: data['sit_funcional'] = x.text
        #some columns are truncated in the test pdf and end up in this same <text> node
        elif x['left'] == left['secretaria'] and x.text:
            s = x.text.split('             ')
            len_s = len(s)
            if len_s == 3:
                data['secretaria'] = s[0]
                data['un_orcamentaria'] = s[1]
                data['un_gestora'] = s[2]
            elif len_s == 2:
                data['secretaria'] = s[0]
                data['un_orcamentaria'] = s[1]
            else: data['secretaria'] = x.text        
        elif x['left'] == left['un_orcamentaria'] and x.text: data['un_orcamentaria'] = x.text
        elif x['left'] == left['un_gestora'] and x.text: data['un_gestora'] = x.text
        elif x['left'] == left['un_administrativa'] and x.text: data['un_administrativa'] = x.text
        elif x['left'] == left['municipio'] and x.text:
            data['municipio'] = x.text
            scraperwiki.datastore.save(["id"], data)
            id = id + 1
        else:
            if x.text: print x['left'] + ' - Erro: ' + x.text
    scraperwiki.sqlite.save_var('last_id', int(id))
Example #11
    def show_pitch(self):
        """Show the movie's summary.

        """
        dureeRE = re.compile('[^0-9]*([0-9]+)(mn|min)')
        idx = self.items.index(self.preview.selectedItems()[0])
        self.editor.clear()
        font = self.editor.font()

        if not self.videos[idx].pitch:
            try:
                datas = self.index[self.liststore[idx][1]]
            except KeyError:
                page = urllib2.urlopen(self.liststore[idx][2]).read()
                soup = BS.BeautifulSoup(page)
                base_node = soup.find('div', {"class": "recentTracksCont"})
                data_resume = u""

                for i in base_node.findAll('p'):
                    if len(data_resume) != 0:
                        data_resume += "\n"
                    try:
                        data_resume += BS.BeautifulStoneSoup(
                            i.string,
                            convertEntities=BS.BeautifulStoneSoup.HTML_ENTITIES
                        ).contents[0]
                        if i["class"] == "accroche":
                            data_resume += "\n"
                    except:
                        pass
                try:
                    time = dureeRE.search(page).group(1)
                except:
                    time = "0"
                datas = (data_resume, time)
                self.index[self.liststore[idx][1]] = datas

            self.videos[idx].pitch = datas[0]
            self.videos[idx].time = datas[1]

        font.setPointSize(font.pointSize() + 1)
        font.setBold(True)
        self.editor.setCurrentFont(font)

        self.editor.append(self.videos[idx].title)

        font.setPointSize(font.pointSize() - 1)
        font.setBold(False)
        self.editor.setCurrentFont(font)
        t = "".join([
            self.videos[idx].date, u"   durée : ", self.videos[idx].time,
            " min.\n"
        ])
        self.editor.append(t)
        self.editor.append(self.videos[idx].pitch)
        self.editor.verticalScrollBar().setValue(0)

        # Need to return False for drag and drop
        return False
Example #12
    def return_pwsid(self, sbmessage, stationid):
        res_xml = self.http_get_query(self.endpoints['WsCurrent'],
                                      {'ID': stationid})

        parsed_res = BeautifulSoup.BeautifulStoneSoup(res_xml)
        res = parsed_res.current_observation.location.full.string + ' '
        res = self.parse_wunderground_respone(parsed_res, res)
        sbmessage.respond(res.encode('latin-1'))
Example #13
    def get_day(self, channel_id, day):
        """抓取单天记录"""
        url = "http://hz.tvsou.com/jm/hw/hw8901.asp?id=%s&Date=%s" % (
            channel_id, day.strftime("%Y-%m-%d"))

        content = self.get_content(url)
        content = content.decode('gb18030').encode('utf-8')

        self.xml_content = BeautifulSoup.BeautifulStoneSoup(
            content, fromEncoding="utf-8")

        programs = []
        items = self.xml_content.findAll('c')
        for item in items:
            s_time = item.find('pt')
            program_title = item.find('pn')
            tvsou_tags = item.find('pp')
            fid2 = item.find('fid2').string
            fid = item.find('fid').string
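            # prefer the alternate wiki id (fid2) when set, otherwise fall back to fid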
            if fid2 and fid2 != '0':
                tvsou_wiki_id = fid2
            elif fid and fid != '0':
                tvsou_wiki_id = fid
            else:
                tvsou_wiki_id = '0'

            if program_title:
                s_time = s_time.string
                s_time = time.strptime(s_time, "%Y-%m-%d %H:%M:%S")
                program_title = program_title.string
                program_title = unescape(program_title)
                program_title = program_title.replace("(本节目表由搜视网提供)", "")
                program_title = program_title.strip()

                # convert TVSOU property codes into Chinese-character tags
                tvsou_tags = tvsou_tags.string
                tags = []
                if tvsou_tags:
                    tvsou_tags = tvsou_tags.split(',[' ']],[')

                    for tvsou_tag in tvsou_tags:
                        tvsou_tag = tvsou_tag.strip()
                        if tvsou_tag in tvsou_properties:
                            tags.append(tvsou_properties[tvsou_tag])

                wiki = {}
                if tvsou_wiki_id and tvsou_wiki_id != '0':
                    wiki['tvsou_id'] = str(tvsou_wiki_id)

                program = {
                    "stime": time.strftime("%H:%M", s_time),
                    "title": program_title.strip(),
                    "date": time.strftime("%Y-%m-%d", s_time),
                    "tags": tags,
                    "wiki": wiki
                }
                programs.append(program)
        return programs
Example #14
    def _chefjivevalleypig(self, irc, type, s):
        params = urlencode(dict(input=s, type=type))
        url = 'http://www.cs.utexas.edu/users/jbc/bork/bork.cgi?' + params
        resp = web.getUrl(url, headers=HEADERS)
        resp = re.sub('&(ampway|emp);', '&amp;', resp)
        resp = BS.BeautifulStoneSoup(
            resp,
            convertEntities=BS.BeautifulStoneSoup.HTML_ENTITIES).contents[0]
        resp = re.sub('\n', ' ', resp)
        irc.reply(resp.encode('utf-8', 'ignore').strip())
Example #15
    def wunderground(self, sbmessage):
        res_xml = self.http_get_query(self.endpoints['GeoCurrent'],
                                      {'query': sbmessage.arguments})

        parsed_res = BeautifulSoup.BeautifulStoneSoup(res_xml)
        res = \
            parsed_res.current_observation.display_location.full.string \
            + ' '
        res = self.parse_wunderground_respone(parsed_res, res)
        sbmessage.respond(res.encode('latin-1'))
Example #16
def _get_doc_from_xml(page):
    try:
        try:
            doc = minidom.parseString(page.strip().encode('utf-8'))
        except UnicodeDecodeError:
            doc = minidom.parseString(page.strip())
        lookup_function = _lookup_xml_from_dom
    except ExpatError, e:
        doc = BeautifulSoup.BeautifulStoneSoup(page)
        lookup_function = _lookup_xml_from_soup
Example #17
def update_restaurants(db, restaurants, restaurant_url):
    this_version = str(uuid4())
    print "This Version: " + this_version
    print "Restaurant Count: " + str(len(restaurants))

    for restaurant in restaurants:
        restaurant_id = restaurant['value']
        print str(restaurants.index(restaurant)) + "\t" + time.asctime(
            time.gmtime()) + "\t" + restaurant_id

        # get restaurant data
        restaurant_full_url = restaurant_url % restaurant_id
        restaurant_page = urllib2.urlopen(restaurant_full_url)
        restaurant_xml = BeautifulSoup.BeautifulStoneSoup(restaurant_page)
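        # the response nests each restaurant's attributes in <markers><marker ...>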
        restaurant_data = restaurant_xml.markers.marker

        doc = db.get(restaurant_id)
        if not doc:
            doc = {}

        # make document
        doc['name'] = restaurant.contents[0]
        if (float(restaurant_data['lng'] or False)
                and float(restaurant_data['lat'] or False)):
            doc['loc'] = [
                round(float(restaurant_data['lng']), 5),
                round(float(restaurant_data['lat']), 5)
            ]
        doc['telephone'] = restaurant_data['bookingtel']
        doc['url'] = restaurant_data['restaurantwebsite']
        doc['cuisine'] = restaurant_data['cuisine']
        doc['availability'] = availability(restaurant_data['availablefri'],
                                           restaurant_data['availablefriday'],
                                           restaurant_data['availablesat'],
                                           restaurant_data['availablesatday'],
                                           restaurant_data['availabledec'])
        doc['limitations'] = limitations(
            restaurant_data['restaurantphonebookings'],
            restaurant_data['restaurantcarduse'],
            restaurant_data['restaurantcardusetype'],
            restaurant_data['restaurantmaxpeople'],
        )
        doc['version'] = this_version

        db[restaurant_id] = doc  # create/update

        # attach image to completed document
        image = urllib2.urlopen(config['imageurl'] % restaurant_data['image'])
        image_data = image.read()  # avoid shadowing the builtin file()
        db.put_attachment(db[restaurant_id], image_data, restaurant_data['image'])

        time.sleep(config['delay'])  # prevent server swamping

    if config['delete']:
        delete_old(db, this_version)
Example #18
    def wu_pws(self, sbmessage):
        stations = self.http_get_query(self.endpoints['GeoLookup'],
                                       {'query': sbmessage.arguments})

        stations_parsed = BeautifulSoup.BeautifulStoneSoup(stations)

        nearby_stations = stations_parsed.location.nearby_weather_stations
        station_id = nearby_stations.pws.station.id.string
        station_id = station_id.replace('<![CDATA[', '')
        station_id = station_id.replace(']]>', '')

        self.return_pwsid(sbmessage, station_id)
Example #19
def get_config_info(xml):
    soup = bs.BeautifulStoneSoup(open(xml, 'rb'))
    info = {}
    info['xsize'] = int(soup.find('key', key='pixelsPerLine')['value'])
    info['ysize'] = int(soup.find('key', key='linesPerFrame')['value'])
    info['xmpp'] = float(
        soup.find('key', key='micronsPerPixel_XAxis')['value'])
    info['ympp'] = float(
        soup.find('key', key='micronsPerPixel_YAxis')['value'])
    widths = soup('key', key='positionCurrent_ZAxis')
    info['z_width'] = abs(
        float(widths[1]['value']) - float(widths[0]['value']))

    return info
Example #20
def getlyrics(artistname, songname):
    #strip whitespace and drop apostrophes and "(Live)" suffixes from the names
    artistname = mReplace(artistname.strip(), {'\'': '', '(Live)': ''})
    songname = mReplace(songname.strip(), {'\'': '', '(Live)': ''})

    #the default rhythmbox lyrics plugin keeps lyrics under ~/.lyrics, one folder per artist
    artistfolder = os.path.join(lyricsfolder, ''.join(c for c in artistname[:128].lower() if c in validChars))
    #check if the lyrics folder exists, if not then create it
    if not os.path.isdir(lyricsfolder):
        if verbose: print "Lyrics folder: %s doesn't exist. Creating it..." % lyricsfolder
        os.mkdir(lyricsfolder)

    lyricfile = os.path.join(artistfolder, ''.join(c for c in songname[:128].lower() if c in validChars) + '.lyric')

    #check if the lyric file already exists
    if not os.path.isfile(lyricfile):
        lyrics = fetchlyrics(artistname, songname)
        if lyrics:
            #convert html entities into plain text
            lyrics = str(BeautifulSoup.BeautifulStoneSoup(lyrics, convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES))
            #check if the artist folder exists, if not then create it
            if not os.path.isdir(artistfolder):
                if verbose: print "Artist folder: %s doesn't exist. Creating it..." % artistfolder
                os.mkdir(artistfolder)
            #write the lyrics to their file, then read them back for display
            f = open(lyricfile, 'w')
            f.write(lyrics)
            f.close()
            f = open(lyricfile, 'r')
            lyrics = mReplace(f.read(), wordDict).split('\n')
            f.close()
            if verbose: print "Found lyrics. Writing to %s" % lyricfile
            printlyrics(lyrics)
            return True
        else:
            #append the info to the list of songs with no lyrics found
            f = open(os.path.join(lyricsfolder, "missingsongs.txt"), 'a')
            f.write(artistname + " : " + songname + "\n")
            f.close()
            if verbose: print "Failed to find lyrics for song: %s  :  %s" % (artistname, songname)
            return False
    else:
        if verbose: print "Lyrics file already exists for: %s  :  %s" % (artistname, songname)
        f = open(lyricfile, 'r')
        lyrics = mReplace(f.read(), wordDict).split('\n')
        f.close()
        printlyrics(lyrics)
        return True
Example #21
def getXMLValue(app, xml_str, tag):

    rtn = ''
    try:
        soup = BeautifulSoup.BeautifulStoneSoup(xml_str)
    except:
        app.response.out.write('err: could not parse ' + xml_str)
        return rtn

    node = soup.find(tag)
    if not node:
        app.response.out.write('err: could not find ' + tag + '\n')
    else:
        rtn = node.string  # reuse the node found above instead of searching again

    return rtn
Example #22
    def getEachPage(self, html):
        soup = BeautifulSoup.BeautifulStoneSoup(html)
        paimai = soup.findAll('img', {"class": "vipicbg"})
        titles = []
        hrefs = []
        for each in paimai:
            title = each.get('alt')
            href = each.get('src')
            titles.append(title)
            hrefs.append(href)
        timess = soup.findAll('div', {"class": "img"})
        times = []
        for each in timess:
            # the time string sits two siblings after the <div class="img"> node
            time = each.nextSibling.nextSibling
            time = time.string
            times.append(time)
        return titles, times, hrefs
Example #23
def get_random_pmids(sample_size, email, query, seed=None):
    # Do an initial query to get the total number of hits
    url = url_template.format(random_index=1, query=query, email=email)
    r = requests.get(url)
    initial_response = r.text
    soup = BeautifulSoup.BeautifulStoneSoup(initial_response)
    translated_query = soup.querytranslation.string
    population_size = int(soup.esearchresult.count.string)

    print "Double-check PubMed's translation of your query: %s" % translated_query
    print "Number of PMIDs returned by this query: %i" % population_size
    print "Off to randomly sample %i of them!" % sample_size

    if seed:
        random.seed(seed)
        print "Seed has been set before sampling."

    pmid_pattern = re.compile(
        "<Id>(?P<pmid>\d+)</Id>"
    )  # do this as an re because it is simple and fast

    if sample_size > population_size:
        print "sample size is bigger than population size, so using population size"
        sample_size = population_size
    random_indexes = random.sample(range(1, population_size), sample_size)

    pmids = []
    for random_index in random_indexes:
        r = get_nth_pmid(random_index, query, email)
        try:
            pmid = pmid_pattern.search(r.text).group("pmid")

        #hope this is transient, try the random number + 1
        except AttributeError:
            print "got an error extracting pmid, trying again with subsequent index"
            r = get_nth_pmid(random_index + 1, query, email)
            pmid = pmid_pattern.search(r.text).group("pmid")

        print "pmid:" + pmid
        pmids.append(pmid)
        #NCBI requests no more than three requests per second at http://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen
        time.sleep(1 / 3.0)  # 1 / 3 is integer division (zero) in Python 2, so use a float
    return pmids
Example #24
    def find_lyrics(self, track):
        try:
            (artist, title) = track.get_tag_raw('artist')[0].encode("utf-8"), \
                track.get_tag_raw('title')[0].encode("utf-8")
        except TypeError:
            raise LyricsNotFoundException

        if not artist or not title:
            raise LyricsNotFoundException

        artist = urllib.quote(artist.replace(' ', '_'))
        title = urllib.quote(title.replace(' ', '_'))

        url = 'http://lyrics.wikia.com/wiki/%s:%s' % (artist, title)

        try:
            html = common.get_url_contents(url, self.user_agent)
        except:
            raise LyricsNotFoundException

        try:
            soup = BeautifulSoup.BeautifulSoup(html)
        except HTMLParser.HTMLParseError:
            raise LyricsNotFoundException
        lyrics = soup.findAll(attrs={"class": "lyricbox"})
        if lyrics:
            lyrics = re.sub(
                r' Send.*?Ringtone to your Cell ', '', '\n'.join(
                    self.remove_div(lyrics[0].renderContents().replace(
                        '<br />', '\n')).replace('\n\n\n',
                                                 '').split('\n')[0:-7]))
        else:
            raise LyricsNotFoundException

        lyrics = self.remove_script(lyrics)
        lyrics = self.remove_html_tags(
            unicode(
                BeautifulSoup.BeautifulStoneSoup(
                    lyrics,
                    convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES)))

        return (lyrics, self.name, url)
Example #25
def get_config_info(xml):
    soup = bs.BeautifulStoneSoup(open(xml, 'rb'))
    info = {}
    info['xsize'] = int(soup.find('key', key='pixelsPerLine')['value'])
    info['ysize'] = int(soup.find('key', key='linesPerFrame')['value'])
    info['xmpp'] = float(
        soup.find('key', key='micronsPerPixel_XAxis')['value'])
    info['ympp'] = float(
        soup.find('key', key='micronsPerPixel_YAxis')['value'])
    if os.path.splitext(xml)[1] == '.xml':
        widths = soup('key', key='positionCurrent_ZAxis')
        info['z_width'] = abs(
            float(widths[1]['value']) - float(widths[0]['value']))
    # elif os.path.splitext(xml)[1] == '.cfg':
    #     width = float(soup('key', key='motorStepSize_ZAxis')[0]['value'])
    #     info['z_width'] = abs(width)
    else:
        info['z_width'] = 1

    return info
Example #26
    def run(self):
        stopover = tempfile.mkdtemp(prefix='gluish-')
        oai_harvest(url=self.url,
                    begin=self.begin,
                    end=self.end,
                    prefix=self.prefix,
                    directory=stopover,
                    collection=self.collection,
                    delay=self.delay)

        with self.output().open('w') as output:
            output.write("""<collection
                xmlns="http://www.openarchives.org/OAI/2.0/"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
            """)
            for path in iterfiles(stopover):
                with open(path) as handle:
                    soup = BeautifulSoup.BeautifulStoneSoup(handle.read())
                    for record in soup.findAll('record'):
                        output.write(str(record))  # or unicode?
            output.write('</collection>\n')
Example #27
def metrolyrics(artist,title):
    artist = urllib.quote(artist.lower().replace(' ','-'))
    title = urllib.quote(title.lower().replace(' ','-'))
    if verbose: print "Trying to fetch lyrics from metrolyrics.com"
    try:
        lyrics = urllib.urlopen("http://www.metrolyrics.com/%s-lyrics-%s.html" % (title,artist))
    except:
        if verbose: print "Could not connect to metrolyrics.com. Exiting..."
        return
    text = lyrics.read()
    text = text.replace('</sc"+"ript>"','') #beautifulsoup chokes on this particular tag so we have to get rid of it.
    soup = BeautifulSoup.BeautifulSoup(text)
    lyrics = soup.findAll(attrs= {"id" : "lyrics"})
    if not lyrics:
        if verbose: print "Lyrics not found at metrolyrics.com"
        return
    else:
        #this removes formatting and converts from html entities
        return '\n'.join(map(lambda x: x.strip(),remove_html_tags(unicode(BeautifulSoup.BeautifulStoneSoup(
                lyrics[0].renderContents(),convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES)
                ))[2:].replace('\r','\n').split('\n'))[:-2])
Example #28
    def _summaries_received(self, message, callback, error_callback):
        if not message.status_code == Soup.KnownStatusCode.OK:
            error_callback('Pubmed replied with error code %d.' %
                           message.status_code)
        else:
            response_data = message.response_body.flatten().get_data()
            parsed_response = BeautifulSoup.BeautifulStoneSoup(response_data)

            # get information for all documents
            documents = parsed_response.esummaryresult.findAll('docsum')
            papers = []
            for document in documents:
                info = {}
                # Extract information
                info['pubmed_id'] = str(document.id.string)
                # This is needed for retrieving the paper in
                # import_paper_after_search
                info['data'] = info['pubmed_id']
                doi = document.findAll('item', {'name': 'doi'})
                if doi:
                    info['doi'] = doi[0].string
                    info['import_url'] = 'http://dx.doi.org/' + info['doi']

                info['title'] = document.findAll('item',
                                                 {'name': 'Title'})[0].string
                info['authors'] = [str(author.string) for author in \
                                          document.findAll('item',
                                                           {'name': 'Author'})]
                info['journal'] = document.findAll(
                    'item', {'name': 'FullJournalName'})[0].string

                pubdate = document.findAll('item', {'name': 'PubDate'})
                if pubdate and pubdate[0]:
                    info['year'] = pubdate[0].string[:4]

                #TODO: Retrieve abstract

                papers.append(info)

            callback(papers)
Example #29
def get_top_K_pages(phrase, K):
    """
    In which we coax a mighty search engine into giving us what we want.
    TODO:
    References:
      - http://en.wikibooks.org/wiki/Python_Programming/Internet
      - http://docs.python.org/library/urllib2.html
  """
    global W, T_to_be_visited
    # TODO: use urllib.quote instead of str.replace
    search_url = yahoo_url % (phrase.replace(' ', '+'), str(K))
    # Sleep for a few seconds, just in case we are calling the search engine too frequently
    time.sleep(search_lag_time)
    search_results = urllib2.urlopen(urllib2.Request(search_url, None,
                                                     headers))
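    # only parse the <clickurl> elements out of the search-engine response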
    clickurls = BeautifulSoup.SoupStrainer('clickurl')
    results_soup = BeautifulSoup.BeautifulStoneSoup(search_results,
                                                    parseOnlyThese=clickurls)
    logging.debug('Search results: ' + results_soup.prettify())
    # order of W is not important at the moment
    W = set([link.string for link in results_soup.findAll('clickurl')])
    T_to_be_visited = list(W.copy())
Example #30
def lyricsmode(artist,title):
    artist = urllib.quote(artist.lower().replace(' ','_'))
    title = urllib.quote(title.lower().replace(' ','_'))
    if verbose: print "Trying to fetch lyrics from lyricsmode.com"
    try:
        lyrics = urllib.urlopen('http://www.lyricsmode.com/lyrics/%s/%s/%s.html' % (artist[0],artist, title))
    except:
        if verbose: print "Could not connect to lyricsmode.com. Exiting..."
        return
    text = lyrics.read().decode('latin-1').replace( u'\xb7','')
    soup = BeautifulSoup.BeautifulSoup(text)
    #lyricsmode places the lyrics in a span with an id of "lyrics"
    lyrics = soup.findAll(attrs= {"id" : "lyrics"})
    if not lyrics:
        if verbose: print "Lyrics not found at lyricsmode.com"
        return []
    else:
        #this function removes formatting and converts html entities into ascii since lyricsmode obfuscates the lyrics.
        return remove_html_tags(unicode(BeautifulSoup.BeautifulStoneSoup(
                                lyrics[0].renderContents(),
                                convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES
                                )).replace('<br />','\n').strip('\r\n\t\t'))