def get_good_page(self, url):
    """Return the number of 24-item result pages for *url*.

    Downloads the page, reads the total hit count from the
    ``<div id="resultCount">`` element and converts it to a page count
    (24 goods per page, plus one partial page).  Falls back to 1 when
    the page cannot be fetched or no count can be parsed.
    """
    d = dl(url)
    content = d.get()
    if content is not None:
        soup = bfs(content)
        nodes = soup.findAll('div', {'id': 'resultCount'})
        if nodes and nodes[0].contents:
            count = nodes[0].contents[0]
            # Counts are rendered with thousands separators ("1,234,567").
            # The original pattern '\d*,\d{3}' crashed (AttributeError on
            # None) for counts below 1,000 and truncated counts with more
            # than one comma group; this matches both forms in full.
            m = re.search(r'\d{1,3}(?:,\d{3})+|\d+', count)
            if m is not None:
                total = int(m.group().replace(',', ''))
                # floor-divide by page size, +1 for the partial page
                # (keeps the original arithmetic, extra page and all).
                return int(total / 24) + 1
    return 1
def get_good_list(self):
    """Crawl every result page of self.url and store each product id.

    Fetches page 1..N of the listing (N from get_good_page), extracts
    the product id of every ``<div class="result ... product">`` entry
    and inserts it into the ``joyo`` table.

    Returns True when the crawl loop completes.
    """
    # 1: get the page count
    url = self.baseurl + self.url
    page = self.get_good_page(url)
    # range(1, page + 1): get_good_page() returns the page COUNT, so the
    # last page must be included (range(1, page) would skip it).
    for i in range(1, page + 1):
        fetch_url = url + '&page=' + str(i)
        print(fetch_url)
        d = dl(fetch_url)
        content = d.get()
        if content is None:
            # download failed for this page; move on to the next one
            continue
        soup = bfs(content)
        # every good is wrapped in <div class="result ... product">
        products = soup.findAll('div', {'class': re.compile(r'result [\S ]*product')})
        conn = db.db()
        for product in products:
            # attrs[2][1]: BeautifulSoup 3 stores attributes as a list of
            # (name, value) pairs; the third attribute carries the id.
            product_id = product.attrs[2][1]
            # NOTE(review): product_id is interpolated straight into the
            # SQL string — injection-prone; switch to a parameterized
            # query if db.query supports placeholders.
            conn.query("insert into joyo value ('" + product_id + "')")
    return True
def search(query, **kwargs):
    """Query the NASA ADS abstract service.

    Parameters
    ----------
    query : str or dict
        A plain search string for a simple query, or a dictionary for
        the advanced search interface.

    Returns
    -------
    _Resultlist of the parsed result entries, or 0 when there are no
    hits.
    """
    # TODO : either access via Z39.50 or via urllib/mechanize etc
    # TODO : simple search / advanced search / browse
    import locale
    # this reads the environment and inits the right locale
    locale.setlocale(locale.LC_ALL, "")
    try:
        # the mechanize module exports urllib2 as well...
        import mechanize
        import urllib
    except ImportError:
        print('You need the "mechanize" and urllib module'
              ' for this script to work.')
    try:
        from BeautifulSoup import BeautifulSoup as bfs
    except ImportError:
        print('You need the BeautifulSoup module...')
    import scipy
    import sys

    # search URL:
    # http://adsabs.harvard.edu/cgi-bin/nph-basic_connect?qsearch=The+Search+String
    # urllib.quote(url, safe=":/") turns spaces into '+' etc.

    ############################################
    ######## GET THE FORM
    # TODO: ping the mirrors and fall back to the first one that answers;
    # until then always use the first entry.
    working_mirror = 0

    # A dict query selects the advanced search interface.  (This line was
    # commented out in the original, leaving `advanced` undefined and
    # raising NameError on every call.)
    advanced = isinstance(query, dict)

    if advanced:
        # ADVANCED QUERY
        #
        # Should I use http://adsabs.harvard.edu/abstract_service.html
        # or the full ADS Labs?
        response = mechanize.urlopen(mirrors[working_mirror] + advanced_q)
        forms = mechanize.ParseResponse(response, backwards_compat=False)
        response.close()
        form = forms[0]
        #~ form['qsearch'] = '^Persson 2012'
        ######## SUBMIT FORM
        #~ clicked_form = form.click()
        #~ result = mechanize.urlopen(clicked_form)
        pass
    else:
        # SIMPLE QUERY
        baseurl = (mirrors[working_mirror] +
                   'cgi-bin/nph-basic_connect?qsearch=')
        result = mechanize.urlopen(urllib.quote(baseurl + query, safe=":/=?^"))
        # test below
        data = urllib.urlencode({'qsearch': '^Persson'})
        baseurl = (mirrors[working_mirror] + 'cgi-bin/nph-basic_connect?')
        f = urllib.urlopen(baseurl, data)

    ############################################
    ######## PARSE RESULTS
    page = result.readlines()
    result.close()
    # start parsing the results
    t = bfs(' '.join(page))
    tables = t.findAll('table')
    # second table, first cell holds the "<strong>N</strong>" hit count
    r = tables[1].findAll('td')[0]
    y = r.findAll('strong')[0].contents[0]
    nres = int(y)
    if nres < 1:
        return 0
    # third table holds the actual result rows
    resulttable = tables[2]
    rows = resulttable.findAll('tr')
    # each entry spans 3 table rows; drop the first row of every triple
    entries = [rows[i:i + 3][1:] for i in scipy.arange(2, 57, 3)][:-1]
    ############################################
    ######## GET RESULTLIST
    # NOTE: the page is UNICODE — names like Jørgensen arrive as escaped
    # entities and still need decoding downstream.
    return _Resultlist(entries)
def search(query, **kwargs):
    """Query the NASA ADS abstract service.

    NOTE(review): this is a byte-identical duplicate of the `search`
    defined earlier in this file; being defined later, it shadows the
    first one.  One of the two copies should be deleted.

    Parameters
    ----------
    query : str or dict
        A plain search string for a simple query, or a dictionary for
        the advanced search interface.

    Returns
    -------
    _Resultlist of the parsed result entries, or 0 when there are no
    hits.
    """
    # TODO : either access via Z39.50 or via urllib/mechanize etc
    # TODO : simple search / advanced search / browse
    import locale
    # this reads the environment and inits the right locale
    locale.setlocale(locale.LC_ALL, "")
    try:
        # the mechanize module exports urllib2 as well...
        import mechanize
        import urllib
    except ImportError:
        print('You need the "mechanize" and urllib module'
              ' for this script to work.')
    try:
        from BeautifulSoup import BeautifulSoup as bfs
    except ImportError:
        print('You need the BeautifulSoup module...')
    import scipy
    import sys

    ############################################
    ######## GET THE FORM
    # TODO: ping the mirrors and fall back to the first one that answers;
    # until then always use the first entry.
    working_mirror = 0

    # A dict query selects the advanced search interface.  (This line was
    # commented out in the original, leaving `advanced` undefined and
    # raising NameError on every call.)
    advanced = isinstance(query, dict)

    if advanced:
        # ADVANCED QUERY
        #
        # Should I use http://adsabs.harvard.edu/abstract_service.html
        # or the full ADS Labs?
        response = mechanize.urlopen(mirrors[working_mirror] + advanced_q)
        forms = mechanize.ParseResponse(response, backwards_compat=False)
        response.close()
        form = forms[0]
        #~ form['qsearch'] = '^Persson 2012'
        ######## SUBMIT FORM
        #~ clicked_form = form.click()
        #~ result = mechanize.urlopen(clicked_form)
        pass
    else:
        # SIMPLE QUERY
        baseurl = (mirrors[working_mirror] +
                   'cgi-bin/nph-basic_connect?qsearch=')
        result = mechanize.urlopen(urllib.quote(baseurl + query, safe=":/=?^"))
        # test below
        data = urllib.urlencode({'qsearch': '^Persson'})
        baseurl = (mirrors[working_mirror] + 'cgi-bin/nph-basic_connect?')
        f = urllib.urlopen(baseurl, data)

    ############################################
    ######## PARSE RESULTS
    page = result.readlines()
    result.close()
    # start parsing the results
    t = bfs(' '.join(page))
    tables = t.findAll('table')
    # second table, first cell holds the "<strong>N</strong>" hit count
    r = tables[1].findAll('td')[0]
    y = r.findAll('strong')[0].contents[0]
    nres = int(y)
    if nres < 1:
        return 0
    # third table holds the actual result rows
    resulttable = tables[2]
    rows = resulttable.findAll('tr')
    # each entry spans 3 table rows; drop the first row of every triple
    entries = [rows[i:i + 3][1:] for i in scipy.arange(2, 57, 3)][:-1]
    ############################################
    ######## GET RESULTLIST
    # NOTE: the page is UNICODE — names like Jørgensen arrive as escaped
    # entities and still need decoding downstream.
    return _Resultlist(entries)