def get_good_page(self, url):
    """Return the number of 24-item result pages for *url*.

    Downloads the page, reads the total hit count from the
    ``<div id="resultCount">`` element and converts it to a page count
    (24 goods per page, plus one partial page).  Falls back to 1 when
    the page cannot be fetched or no count can be parsed.
    """
    d = dl(url)
    content = d.get()
    if content is not None:
        soup = bfs(content)
        nodes = soup.findAll('div', {'id': 'resultCount'})
        if nodes and nodes[0].contents:
            count = nodes[0].contents[0]
            # Counts are rendered with thousands separators ("1,234,567").
            # The original pattern '\d*,\d{3}' crashed (AttributeError on
            # None) for counts below 1,000 and truncated counts with more
            # than one comma group; this matches both forms in full.
            m = re.search(r'\d{1,3}(?:,\d{3})+|\d+', count)
            if m is not None:
                total = int(m.group().replace(',', ''))
                # floor-divide by page size, +1 for the partial page
                # (keeps the original arithmetic, extra page and all).
                return int(total / 24) + 1
    return 1
def get_good_list(self):
    """Crawl every result page of self.url and store each product id.

    Fetches page 1..N of the listing (N from get_good_page), extracts
    the product id of every ``<div class="result ... product">`` entry
    and inserts it into the ``joyo`` table.

    Returns True when the crawl loop completes.
    """
    # 1: get the page count
    url = self.baseurl + self.url
    page = self.get_good_page(url)
    # range(1, page + 1): get_good_page() returns the page COUNT, so the
    # last page must be included (range(1, page) would skip it).
    for i in range(1, page + 1):
        fetch_url = url + '&page=' + str(i)
        print(fetch_url)
        d = dl(fetch_url)
        content = d.get()
        if content is None:
            # download failed for this page; move on to the next one
            continue
        soup = bfs(content)
        # every good is wrapped in <div class="result ... product">
        products = soup.findAll('div', {'class': re.compile(r'result [\S ]*product')})
        conn = db.db()
        for product in products:
            # attrs[2][1]: BeautifulSoup 3 stores attributes as a list of
            # (name, value) pairs; the third attribute carries the id.
            product_id = product.attrs[2][1]
            # NOTE(review): product_id is interpolated straight into the
            # SQL string — injection-prone; switch to a parameterized
            # query if db.query supports placeholders.
            conn.query("insert into joyo value ('" + product_id + "')")
    return True
def search(query, **kwargs):
    """Query the NASA ADS abstract service.

    Parameters
    ----------
    query : str or dict
        A plain search string for a simple query, or a dictionary for
        the advanced search interface.

    Returns
    -------
    _Resultlist of the parsed result entries, or 0 when there are no
    hits.
    """
    # TODO : either access via Z39.50 or via urllib/mechanize etc
    # TODO : simple search / advanced search / browse
    import locale
    # this reads the environment and inits the right locale
    locale.setlocale(locale.LC_ALL, "")
    try:
        # the mechanize module exports urllib2 as well...
        import mechanize
        import urllib
    except ImportError:
        print('You need the "mechanize" and urllib module'
              ' for this script to work.')
    try:
        from BeautifulSoup import BeautifulSoup as bfs
    except ImportError:
        print('You need the BeautifulSoup module...')
    import scipy
    import sys

    # search URL:
    # http://adsabs.harvard.edu/cgi-bin/nph-basic_connect?qsearch=The+Search+String
    # urllib.quote(url, safe=":/") turns spaces into '+' etc.

    ############################################
    ######## GET THE FORM
    # TODO: ping the mirrors and fall back to the first one that answers;
    # until then always use the first entry.
    working_mirror = 0

    # A dict query selects the advanced search interface.  (This line was
    # commented out in the original, leaving `advanced` undefined and
    # raising NameError on every call.)
    advanced = isinstance(query, dict)

    if advanced:
        # ADVANCED QUERY
        #
        # Should I use http://adsabs.harvard.edu/abstract_service.html
        # or the full ADS Labs?
        response = mechanize.urlopen(mirrors[working_mirror] + advanced_q)
        forms = mechanize.ParseResponse(response, backwards_compat=False)
        response.close()
        form = forms[0]
        #~ form['qsearch'] = '^Persson 2012'
        ######## SUBMIT FORM
        #~ clicked_form = form.click()
        #~ result = mechanize.urlopen(clicked_form)
        pass
    else:
        # SIMPLE QUERY
        baseurl = (mirrors[working_mirror] +
                   'cgi-bin/nph-basic_connect?qsearch=')
        result = mechanize.urlopen(urllib.quote(baseurl + query, safe=":/=?^"))
        # test below
        data = urllib.urlencode({'qsearch': '^Persson'})
        baseurl = (mirrors[working_mirror] + 'cgi-bin/nph-basic_connect?')
        f = urllib.urlopen(baseurl, data)

    ############################################
    ######## PARSE RESULTS
    page = result.readlines()
    result.close()
    # start parsing the results
    t = bfs(' '.join(page))
    tables = t.findAll('table')
    # second table, first cell holds the "<strong>N</strong>" hit count
    r = tables[1].findAll('td')[0]
    y = r.findAll('strong')[0].contents[0]
    nres = int(y)
    if nres < 1:
        return 0
    # third table holds the actual result rows
    resulttable = tables[2]
    rows = resulttable.findAll('tr')
    # each entry spans 3 table rows; drop the first row of every triple
    entries = [rows[i:i + 3][1:] for i in scipy.arange(2, 57, 3)][:-1]
    ############################################
    ######## GET RESULTLIST
    # NOTE: the page is UNICODE — names like Jørgensen arrive as escaped
    # entities and still need decoding downstream.
    return _Resultlist(entries)
def search(query, **kwargs):
    """Query the NASA ADS abstract service.

    NOTE(review): this is a byte-identical duplicate of the `search`
    defined earlier in this file; being defined later, it shadows the
    first one.  One of the two copies should be deleted.

    Parameters
    ----------
    query : str or dict
        A plain search string for a simple query, or a dictionary for
        the advanced search interface.

    Returns
    -------
    _Resultlist of the parsed result entries, or 0 when there are no
    hits.
    """
    # TODO : either access via Z39.50 or via urllib/mechanize etc
    # TODO : simple search / advanced search / browse
    import locale
    # this reads the environment and inits the right locale
    locale.setlocale(locale.LC_ALL, "")
    try:
        # the mechanize module exports urllib2 as well...
        import mechanize
        import urllib
    except ImportError:
        print('You need the "mechanize" and urllib module'
              ' for this script to work.')
    try:
        from BeautifulSoup import BeautifulSoup as bfs
    except ImportError:
        print('You need the BeautifulSoup module...')
    import scipy
    import sys

    ############################################
    ######## GET THE FORM
    # TODO: ping the mirrors and fall back to the first one that answers;
    # until then always use the first entry.
    working_mirror = 0

    # A dict query selects the advanced search interface.  (This line was
    # commented out in the original, leaving `advanced` undefined and
    # raising NameError on every call.)
    advanced = isinstance(query, dict)

    if advanced:
        # ADVANCED QUERY
        #
        # Should I use http://adsabs.harvard.edu/abstract_service.html
        # or the full ADS Labs?
        response = mechanize.urlopen(mirrors[working_mirror] + advanced_q)
        forms = mechanize.ParseResponse(response, backwards_compat=False)
        response.close()
        form = forms[0]
        #~ form['qsearch'] = '^Persson 2012'
        ######## SUBMIT FORM
        #~ clicked_form = form.click()
        #~ result = mechanize.urlopen(clicked_form)
        pass
    else:
        # SIMPLE QUERY
        baseurl = (mirrors[working_mirror] +
                   'cgi-bin/nph-basic_connect?qsearch=')
        result = mechanize.urlopen(urllib.quote(baseurl + query, safe=":/=?^"))
        # test below
        data = urllib.urlencode({'qsearch': '^Persson'})
        baseurl = (mirrors[working_mirror] + 'cgi-bin/nph-basic_connect?')
        f = urllib.urlopen(baseurl, data)

    ############################################
    ######## PARSE RESULTS
    page = result.readlines()
    result.close()
    # start parsing the results
    t = bfs(' '.join(page))
    tables = t.findAll('table')
    # second table, first cell holds the "<strong>N</strong>" hit count
    r = tables[1].findAll('td')[0]
    y = r.findAll('strong')[0].contents[0]
    nres = int(y)
    if nres < 1:
        return 0
    # third table holds the actual result rows
    resulttable = tables[2]
    rows = resulttable.findAll('tr')
    # each entry spans 3 table rows; drop the first row of every triple
    entries = [rows[i:i + 3][1:] for i in scipy.arange(2, 57, 3)][:-1]
    ############################################
    ######## GET RESULTLIST
    # NOTE: the page is UNICODE — names like Jørgensen arrive as escaped
    # entities and still need decoding downstream.
    return _Resultlist(entries)