def get(self): url=self.request.get('url',"") if url: if "://" not in url: url = "http://"+url logging.info("ProxyHandler url: '%s' " %(url)) uthparams = openanything.fetch(siteName+'/urltohash?url=%s' %(urllib.quote(url))) logging.info("ProxyHandler urltohash: '%s' status: '%s' " %(uthparams.get('data','uh oh'),uthparams.get('status','?'))) hashinfo = json.loads(uthparams.get('data','[]')) finalurl='' if hashinfo: thehash = hashinfo[0].get('hash','') if thehash: h2uparams = openanything.fetch(siteName+'/hashtourl/%s' %(thehash)) logging.info("ProxyHandler hashtourl: '%s' status: '%s' " %(h2uparams.get('data','uh oh'),h2uparams.get('status','?'))) urlinfo = json.loads(h2uparams.get('data','[]')) if urlinfo: finalurl = urlinfo[0].get('url','').encode('utf8') if finalurl: self.redirect(finalurl) else: template = JINJA_ENVIRONMENT.get_template('errorpage.html') svgVals = { 'error':"Proxy can't find a hash for %s" % url } self.response.set_status(404) self.response.write(template.render(svgVals))
def get_data(self, strURL):
    try:
        oa = openanything.fetch(strURL)
        if oa['status'] == 200:
            return oa['data']
        return False  # non-200 response: report failure explicitly
    except Exception:
        return False
def fetch(self, url, agent="Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0"): """Fetch down a url using openanything. This function is wrapped for simplicity.""" result = openanything.fetch(url, None, None, agent) if not result['status'] == 200: raise StandardError('Tried to fetch ' + url + ' and got ' + str(result['status'])) return result['data']
def get(self): url=self.request.get('url',"") if url: if "://" not in url: url = "http://"+url filename = urlparse.urlsplit(url).path.split('/')[-1] bits= filename.split('.') key = bits[0] resource = int(newbase60.sxgtonum(urllib.unquote(key))) logging.info("ArchiveUrlToHashHandler url: '%s' " %(url)) uthparams = openanything.fetch('http://web.archive.org/cdx/search/cdx?url=%s' %(urllib.quote(url))) logging.info("ArchiveUrlToHashHandler urltohash: '%s' status: '%s' " %(uthparams.get('data','uh oh'),uthparams.get('status','?'))) #format is com,svgur)/i/au.svg 20160829212327 http://svgur.com/i/AU.svg image/svg+xml 200 LY7RXMB7SLQLKEB63LGFNYY7F3SYRCNQ 3079 output=[] for line in uthparams.get('data','').splitlines(): qpath,fetchdate,foundurl,mimetype,result,base32hash,length = line.split(' ') if result == '200': output.append({'url':foundurl,'hash':'sha1-%s' % (Base32toBase64(base32hash)), 'date':datetime.datetime.strptime(fetchdate,'%Y%m%d%H%M%S').isoformat()}) if output: self.response.headers['Content-Type'] = 'application/json' self.response.write(json.dumps(output)) else: template = JINJA_ENVIRONMENT.get_template('errorpage.html') svgVals = { 'error':"No url here like '%s'" % url } self.response.set_status(404) self.response.write(template.render(svgVals))
def f(paper):
    for a in soup.findAll('a'):
        for c in a.contents:
            if str(c).lower().find('bibtex') != -1:
                print thread.get_ident(), 'found bibtex link:', a
                params_bibtex = openanything.fetch('http://scholar.google.com' + a['href'])
                if params_bibtex['status'] == 200 or params_bibtex['status'] == 302:
                    paper = update_paper_from_bibtex_html(paper, params_bibtex['data'])
                return
def download_if_dne(href, filename):
    if os.path.isfile(filename):
        # print 'already downloaded:', href
        return False
    else:
        try:
            print 'downloading:', href
            oa = openanything.fetch(href)
            if oa['status'] == 200:
                file = open(filename, 'w')
                file.write(oa['data'])
                file.close()
                return True
        except KeyboardInterrupt:
            raise
        except:
            print '\tdownload failed -', sys.exc_info()[0]
    return False
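# Hypothetical usage of download_if_dne above (the url and path are
# illustrative): it returns True only when the file was newly fetched
# and written, and False if it already existed or the download failed.
if download_if_dne('http://example.com/logo.png', os.path.join('data', 'logo.png')):
    print 'new file saved'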
def get(self): url=self.request.get('url',"") if not url: url = "http://svgur.com/i/AU.svg" if url: if "://" not in url: url = "http://"+url thehash='' uthparams = openanything.fetch(siteName+'/urltohash?url=%s' %(urllib.quote(url))) if uthparams.get('status') == 200: hashinfo = json.loads(uthparams.get('data','[]')) if hashinfo: thehash = hashinfo[0].get('hash','') template = JINJA_ENVIRONMENT.get_template('dweb.html') vals = { 'url':url, 'proxyurl':'/proxy?url=%s' %(urllib.quote(url)), 'urltohash':'/urltohash?url=%s' %(urllib.quote(url)), 'iaurltohashraw':'http://web.archive.org/cdx/search/cdx?url=%s' %(urllib.quote(url)), 'iaurltohash':'/iaurltohash?url=%s' %(urllib.quote(url)), 'haurltohash':'https://hash-archive.org/history/%s' %(url), 'hashtourl':'/hashtourl/'+thehash, 'hasharchive':'https://hash-archive.org/sources/'+thehash, } self.response.write(template.render(vals))
def _import_unknown_citation_old(params, orig_url, paper=None):
    if params['data'].startswith('%PDF'):
        # we have a live one!
        try:
            filename = params['url'][params['url'].rfind('/') + 1:]
            data = params['data']
            print thread.get_ident(), 'importing paper =', filename
            if not paper:
                md5_hexdigest = get_md5_hexdigest_from_data(data)
                paper, created = get_or_create_paper_via(full_text_md5=md5_hexdigest)
                if created:
                    #paper.title = filename
                    paper.save_file(defaultfilters.slugify(filename.replace('.pdf', '')) + '.pdf', data)
                    paper.import_url = orig_url
                    paper.save()
                    print thread.get_ident(), 'imported paper =', filename
                else:
                    print thread.get_ident(), 'paper already exists: paper =', paper.id, paper.doi, paper.title, paper.get_authors_in_order()
            else:
                paper.save_file(defaultfilters.slugify(filename.replace('.pdf', '')) + '.pdf', data)
                paper.import_url = orig_url
                paper.save()
        except:
            traceback.print_exc()
            if paper:
                paper.delete()
                paper = None
    else:
        # see if the page links to a pdf
        try:
            web_dir_root = params['url'][:params['url'].find('/', 8)]
            web_dir_current = params['url'][:params['url'].rfind('/')]
            for a in p_html_a.findall(params['data']):
                try:
                    href = p_html_a_href.search(a).group(1)
                except:
                    print thread.get_ident(), 'couldn\'t figure out href from link:', a
                    continue
                # strip params
                if href.find('?') > 0:
                    href = href[:href.find('?')]
                # normalize to fully qualified name
                if not href.lower().startswith('http'):
                    if href.startswith('/'):
                        href = web_dir_root + href
                    else:
                        href = web_dir_current + '/' + href
                if href.lower().endswith('.pdf'):
                    print "href", href
                    paper = _import_unknown_citation(openanything.fetch(href), orig_url, paper=paper)
                    if paper:
                        update_paper_from_bibtex_html(paper, params['data'])
                        paper.save()
                        break
        except:
            traceback.print_exc()
            if paper:
                paper.delete()
                paper = None
    return paper
def find_and_attach_pdf(paper, urls, visited_urls=set()):
    # note: the mutable default visited_urls persists across top-level calls
    # search for a PDF linked directly
    for url in urls:
        if url.find('?') > 0:
            url = url[:url.find('?')]
        if url.lower().endswith('pdf'):
            print thread.get_ident(), 'found pdf link:', url
            visited_urls.add(url)
            params = openanything.fetch(url)
            if params['status'] == 200 or params['status'] == 302:
                if params['data'].startswith('%PDF'):
                    # we have a live one!
                    try:
                        filename = params['url'][params['url'].rfind('/') + 1:]
                        print thread.get_ident(), 'importing paper =', filename
                        paper.save_file(defaultfilters.slugify(filename.replace('.pdf', '')) + '.pdf', params['data'])
                        paper.save()
                        return True
                    except:
                        traceback.print_exc()
    for url in urls:
        visited_urls.add(url)
        params = openanything.fetch(url)
        if params['status'] == 200 or params['status'] == 302:
            if params['data'].startswith('%PDF'):
                # we have a live one!
                try:
                    filename = params['url'][params['url'].rfind('/') + 1:]
                    print thread.get_ident(), 'importing paper =', filename
                    paper.save_file(defaultfilters.slugify(filename.replace('.pdf', '')) + '.pdf', params['data'])
                    paper.save()
                    return True
                except:
                    traceback.print_exc()
            else:
                soup = BeautifulSoup.BeautifulSoup(params['data'])
                promising_links = set()
                for a in soup.findAll('a', href=True):
                    if len(a.contents) > 8:
                        continue
                    web_dir_root = params['url'][:params['url'].find('/', 8)]
                    web_dir_current = params['url'][:params['url'].rfind('/')]
                    href = a['href']
                    if not href.lower().startswith('http'):
                        if href.startswith('/'):
                            href = web_dir_root + href
                        else:
                            href = web_dir_current + '/' + href
                    x = href
                    if x.find('?') > 0:
                        x = x[:x.find('?')]
                    if x.lower().endswith('pdf'):
                        if href not in visited_urls:
                            print thread.get_ident(), 'found pdf link:', a
                            promising_links.add(href)
                        continue
                    for c in a.contents:
                        c = str(c).lower()
                        if c.find('pdf') != -1:
                            if href not in visited_urls:
                                print thread.get_ident(), 'found pdf link:', a
                                promising_links.add(href)
                            continue
                if promising_links:
                    print promising_links
                    if find_and_attach_pdf(paper, list(promising_links), visited_urls=visited_urls):
                        return True  # propagate success to the caller
def _import_ieee_citation(params, paper=None):
    print thread.get_ident(), 'downloading ieee citation:', params['url']
    try:
        print thread.get_ident(), 'parsing...'
        soup = BeautifulSoup.BeautifulSoup(params['data'].replace('<!-BMS End-->', '').replace('<in>', ''))
        print soup.find('span', attrs={'class': 'headNavBlueXLarge2'})
        p_arnumber = re.compile('<arnumber>[0-9]+</arnumber>', re.IGNORECASE)
        match = p_arnumber.search(params['data'])
        if match:
            arnumber = match.group(0)
            print 'arnumber', arnumber
            params_bibtex = openanything.fetch(
                'http://ieeexplore.ieee.org/xpls/citationAct',
                post_data={
                    'dlSelect': 'cite_abs',
                    'fileFormate': 'BibTex',
                    'arnumber': arnumber,
                    'Submit': 'Download'
                })
            print params_bibtex
            if params_bibtex['status'] == 200 or params_bibtex['status'] == 302:
                paper = update_paper_from_bibtex_html(paper, params_bibtex['data'])
        if not paper:
            paper, created = get_or_create_paper_via(
                title=html_strip(str(soup.find('title').string).replace('IEEEXplore#', '')),
                doi=re.search('Digital Object Identifier: ([a-zA-Z0-9./]*)', params['data']).group(1),
            )
            if created:
                paper.save()
            else:
                print thread.get_ident(), 'paper already imported'
                if not _should_we_reimport_paper(paper):
                    return
        # publisher, created = Publisher.objects.get_or_create(
        #     name=html_strip( BeautifulSoup.BeautifulSoup( re.search( 'This paper appears in: (.*)', params['data'] ).group(1) ).a.strong.string ),
        # )
        # print 'publisher', publisher
        # if created: publisher.save()
        source_string = html_strip(
            BeautifulSoup.BeautifulSoup(
                re.search('This paper appears in: (.*)', params['data']).group(1)).a.strong.string)
        try:
            location = html_strip(re.search('Location: (.*)', params['data']).group(1))
        except:
            location = ''
        source, created = Source.objects.get_or_create(
            name=source_string,
            issue=html_strip(''),
            location=location,
            publication_date=None,
            publisher=None,
        )
        paper.import_url = params['url']
        paper.source = source
        paper.source_session = ''
        #paper.source_pages = html_strip( re.search( 'On page(s):(.*)<BR>', params['data'], re.DOTALL ).group(1) ),
        paper.abstract = html_strip(soup.findAll('td', attrs={'class': 'bodyCopyBlackLargeSpaced'})[0].contents[-1])
        paper.save()
        for node in soup.findAll('a', attrs={'class': 'bodyCopy'}):
            if node.contents[0] == 'PDF':
                file_url = IEEE_BASE_URL + node['href']
                print thread.get_ident(), 'downloading paper from', file_url
                params = openanything.fetch(file_url)
                if params['status'] == 200 or params['status'] == 302:
                    if params['data'].startswith('%PDF'):
                        ext = params['url'][params['url'].rfind('.') + 1:]
                        if not ext or len(ext) > 5:
                            ext = 'pdf'
                        paper.save_file(
                            defaultfilters.slugify(paper.doi) + '_' +
                            defaultfilters.slugify(paper.title) + '.' +
                            defaultfilters.slugify(ext), params['data'])
                        paper.save()
                    else:
                        print thread.get_ident(), 'this isn\'t a pdf file:', params['url']
                    break
                else:
                    print thread.get_ident(), 'error downloading paper:', params
        print thread.get_ident(), 'imported paper =', paper.id, paper.doi, paper.title, paper.get_authors_in_order()
        return paper
    except:
        traceback.print_exc()
def _import_acm_citation(params, paper=None):
    print thread.get_ident(), 'downloading acm citation:', params['url']
    try:
        print thread.get_ident(), 'parsing...'
        soup = BeautifulSoup.BeautifulSoup(params['data'])
        title = []
        for node in soup.findAll('td', attrs={'class': 'medium-text'})[0].findAll('strong'):
            title.append(node.string)
        try:
            doi = str(soup.find('form', attrs={'name': 'popbinder'}).nextSibling.table.findAll('tr')[-1].findAll('td')[-1].a.string)
            if doi.startswith('http://doi.acm.org/'):
                doi = doi[len('http://doi.acm.org/'):]
        except:
            doi = ''
        full_text_data = None
        full_text_filename = None
        for node in soup.findAll('a', attrs={'name': 'FullText'}):
            if node.contents[1] == 'Pdf':
                file_url = ACM_BASE_URL + '/' + node['href']
                print thread.get_ident(), 'downloading paper from', file_url
                params_file = openanything.fetch(file_url)
                if params_file['status'] == 200 or params_file['status'] == 302:
                    try:
                        ext = params_file['url']
                        if ext.find('?') > -1:
                            ext = file_url[0:ext.find('?')]
                        ext = ext[ext.rfind('.') + 1:]
                    except:
                        ext = 'unknown'
                    if params_file['data'].startswith('%PDF'):
                        #paper.save_file( defaultfilters.slugify(paper.doi) +'_'+ defaultfilters.slugify(paper.title) +'.pdf', params_file['data'] )
                        full_text_filename = defaultfilters.slugify(doi) + '_' + defaultfilters.slugify(title) + '.' + defaultfilters.slugify(ext)
                        full_text_data = params_file['data']
                    elif params_file['data'].find('<!DOCTYPE') > -1 and params_file['data'].find('logfrm') > -1:
                        # it appears we have an ACM login page...
                        global ACM_USERNAME
                        global ACM_PASSWORD
                        if not ACM_USERNAME:
                            dialog = gtk.MessageDialog(
                                type=gtk.MESSAGE_QUESTION,
                                buttons=gtk.BUTTONS_OK_CANCEL,
                                flags=gtk.DIALOG_MODAL)
                            #dialog.connect('response', lambda x,y: dialog.destroy())
                            dialog.set_markup('<b>ACM Login</b>\n\nEnter your ACM username and password:'******
                            'username': ACM_USERNAME,
                            'password': ACM_PASSWORD,
                            'submit': 'Login'
                        }
                        params_login = openanything.fetch(
                            'https://portal.acm.org/poplogin.cfm?is=0&dl=ACM&coll=ACM&comp_id=1220288&want_href=delivery%2Ecfm%3Fid%3D1220288%26type%3Dpdf%26CFID%3D50512225%26CFTOKEN%3D24664038&CFID=50512225&CFTOKEN=24664038&td=1200684914991',
                            post_data=post_data,
                        )
                        print "params_login['url']", params_login['url']
                        cfid = re.search('CFID=([0-9]*)', params_login['data']).group(1)
                        cftoken = re.search('CFTOKEN=([0-9]*)', params_login['data']).group(1)
                        new_file_url = file_url[0:file_url.find('&CFID=')] + '&CFID=%s&CFTOKEN=%s' % (cfid, cftoken)
                        print 'new_file_url', new_file_url
                        params_file = openanything.fetch(new_file_url)
                        if params_file['status'] == 200 or params_file['status'] == 302:
                            if params_file['data'].startswith('%PDF'):
                                full_text_filename = defaultfilters.slugify(doi) + '_' + defaultfilters.slugify(title) + '.' + defaultfilters.slugify(ext)
                                full_text_data = params_file['data']
                            else:
                                print thread.get_ident(), 'error downloading paper - still not a pdf after login:'******'error downloading paper - after login:'******'this does not appear to be a pdf file...'
                        ext = params_file['url'][params_file['url'].rfind('.') + 1:]
                        if not ext or len(ext) > 5:
                            ext = 'unknown'
                        #paper.save_file( defaultfilters.slugify(paper.doi) +'_'+ defaultfilters.slugify(paper.title) +'.'+ defaultfilters.slugify(ext), params_file['data'] )
                        full_text_filename = defaultfilters.slugify(doi) + '_' + defaultfilters.slugify(title) + '.' + defaultfilters.slugify(ext)
                        full_text_data = params_file['data']
                    #paper.save()
                    break
                else:
                    print thread.get_ident(), 'error downloading paper:', params_file
        if not paper:
            if full_text_data:
                md5_hexdigest = get_md5_hexdigest_from_data(full_text_data)
            else:
                md5_hexdigest = None
            paper, created = get_or_create_paper_via(
                title=html_strip(''.join(title)),
                doi=doi,
                full_text_md5=md5_hexdigest,
            )
            if created:
                if full_text_filename and full_text_data:
                    paper.save_file(full_text_filename, full_text_data)
                paper.save()
            else:
                print thread.get_ident(), 'paper already imported'
                if not _should_we_reimport_paper(paper):
                    return
        else:
            paper.title = html_strip(''.join(title))
            paper.doi = doi
            paper.save()
            if full_text_filename and full_text_data:
                paper.save_file(full_text_filename, full_text_data)
        paper.import_url = params['url']
        try:
            paper.source_session = html_strip(re.search('SESSION:(.*)', params['data']).group(1))
        except:
            pass
        try:
            abstract_node = soup.find('p', attrs={'class': 'abstract'}).string
            if abstract_node:
                paper.abstract = html_strip(abstract_node)
            else:
                paper.abstract = ''
        except:
            pass
        paper.save()
        p_bibtex_link = re.compile("popBibTex.cfm[^']+")
        bibtex_link = p_bibtex_link.search(params['data'])
        if bibtex_link:
            params_bibtex = openanything.fetch('http://portal.acm.org/' + bibtex_link.group(0))
            if params_bibtex['status'] == 200 or params_bibtex['status'] == 302:
                update_paper_from_bibtex_html(paper, params_bibtex['data'])
        node = soup.find('div', attrs={'class': 'sponsors'})
        if node:
            for node in node.contents:
                if isinstance(node, BeautifulSoup.NavigableString):
                    sponsor_name = html_strip(node.replace(':', ''))
                    if sponsor_name:
                        sponsor, created = Sponsor.objects.get_or_create(name=sponsor_name)
                        if created:
                            sponsor.save()
                        paper.sponsors.add(sponsor)
        if soup.find('a', attrs={'name': 'references'}):
            for node in soup.find('a', attrs={'name': 'references'}).parent.findNextSibling('table').findAll('tr'):
                node = node.findAll('td')[2].div
                line = None
                doi = ''
                acm_referencing_url = ''
                for a in node.findAll('a'):
                    if a['href'].startswith('citation'):
                        line = html_strip(a.string)
                        acm_referencing_url = ACM_BASE_URL + '/' + a['href']
                    if a['href'].startswith('http://dx.doi.org'):
                        doi = html_strip(a.string)
                if not line:
                    line = html_strip(node.contents[0])
                reference, created = Reference.objects.get_or_create(
                    line_from_referencing_paper=line,
                    url_from_referencing_paper=acm_referencing_url,
                    doi_from_referencing_paper=doi,
                    referencing_paper=paper,
                )
                if created:
                    reference.save()
        if soup.find('a', attrs={'name': 'citings'}):
            for node in soup.find('a', attrs={'name': 'citings'}).parent.findNextSibling('table').findAll('tr'):
                node = node.findAll('td')[1].div
                if node.string:
                    reference, created = Reference.objects.get_or_create(
                        line_from_referenced_paper=html_strip(node.string),
                        referenced_paper=paper,
                    )
                    if created:
                        reference.save()
                else:
                    line = ''
                    doi = ''
                    for a in node.findAll('a'):
                        if a['href'].startswith('citation'):
                            line = html_strip(a.string)
                            url_from_referenced_paper = ACM_BASE_URL + '/' + a['href']
                        if a['href'].startswith('http://dx.doi.org'):
                            doi = html_strip(a.string)
                    reference, created = Reference.objects.get_or_create(
                        line_from_referenced_paper=line,
                        url_from_referenced_paper=url_from_referenced_paper,
                        doi_from_referenced_paper=doi,
                        referenced_paper=paper,
                    )
                    if created:
                        reference.save()
        paper.save()
        print thread.get_ident(), 'imported paper =', paper.doi, paper.title, paper.get_authors_in_order()
        return paper
    except:
        traceback.print_exc()
def import_citation(url, paper=None, callback=None):
    active_threads[thread.get_ident()] = 'importing: ' + url
    try:
        params = openanything.fetch(url)
        if params['status'] != 200 and params['status'] != 302:
            print thread.get_ident(), 'unable to download: %s (%i)' % (url, params['status'])
            # gtk.gdk.threads_enter()
            # error = gtk.MessageDialog( type=gtk.MESSAGE_ERROR, buttons=gtk.BUTTONS_OK, flags=gtk.DIALOG_MODAL )
            # error.set_markup('<b>Unable to Download Paper</b>\n\nThe following url:\n<i>%s</i>\n\nreturned the HTTP error code: %i' % ( url.replace('&', '&amp;'), params['status'] ))
            # error.run()
            # gtk.gdk.threads_leave()
            return
        if params['data'].startswith('%PDF'):
            # this is a pdf file
            filename = params['url'][params['url'].rfind('/') + 1:]
            # strip params
            if filename.find('?') > 0:
                filename = filename[:filename.find('?')]
            data = params['data']
            print thread.get_ident(), 'importing paper =', filename
            if not paper:
                md5_hexdigest = get_md5_hexdigest_from_data(data)
                paper, created = get_or_create_paper_via(full_text_md5=md5_hexdigest)
                if created:
                    #paper.title = filename
                    paper.save_file(defaultfilters.slugify(filename.replace('.pdf', '')) + '.pdf', data)
                    paper.import_url = url
                    paper.save()
                    print thread.get_ident(), 'imported paper =', filename
                else:
                    print thread.get_ident(), 'paper already exists: paper =', paper.id, paper.doi, paper.title, paper.get_authors_in_order()
            else:
                paper.save_file(defaultfilters.slugify(filename.replace('.pdf', '')) + '.pdf', data)
                paper.import_url = url
                paper.save()
            return paper
        if params['url'].startswith('http://portal.acm.org/citation'):
            paper = _import_acm_citation(params, paper=paper)
            if paper and callback:
                callback()
            return paper
        # if params['url'].startswith('http://dx.doi.org'):
        #     paper = import_unknown_citation(params)
        #     if paper and refresh_after: main_gui.refresh_middle_pane_search()
        #     return paper
        if params['url'].startswith('http://ieeexplore.ieee.org'):
            if params['url'].find('search/wrapper.jsp') > -1:
                paper = _import_ieee_citation(
                    openanything.fetch(params['url'].replace('search/wrapper.jsp', 'xpls/abs_all.jsp')),
                    paper=paper)
                if paper and callback:
                    callback()
            else:
                paper = _import_ieee_citation(params, paper=paper)
                if paper and callback:
                    callback()
            return paper
        if params['url'].startswith('http://scholar.google.com'):
            paper = _import_google_scholar_citation(params, paper=paper)
            if paper and callback:
                callback()
            return paper
        # let's see if there's a pdf somewhere in here...
        paper = _import_unknown_citation(params, params['url'], paper=paper)
        if paper and callback:
            callback()
        if paper:
            return paper
    except:
        traceback.print_exc()
        gtk.gdk.threads_enter()
        error = gtk.MessageDialog(type=gtk.MESSAGE_ERROR, buttons=gtk.BUTTONS_OK, flags=gtk.DIALOG_MODAL)
        error.connect('response', lambda x, y: error.destroy())
        error.set_markup('<b>Unknown Error</b>\n\nUnable to download this resource.')
        error.run()
        gtk.gdk.threads_leave()
    gtk.gdk.threads_enter()
    error = gtk.MessageDialog(type=gtk.MESSAGE_ERROR, buttons=gtk.BUTTONS_OK, flags=gtk.DIALOG_MODAL)
    error.connect('response', lambda x, y: error.destroy())
    error.set_markup('<b>No Paper Found</b>\n\nThe given URL does not appear to contain or link to any PDF files. (perhaps you have to buy it?) Try downloading the file and adding it using "File >> Import..."\n\n%s' % pango_escape(url))
    error.run()
    gtk.gdk.threads_leave()
    if active_threads.has_key(thread.get_ident()):
        del active_threads[thread.get_ident()]
def urlfix(url):
    return svgfix(openanything.fetch(url).get('data', ''))
def download(location=None, max_span=DEFAULT_MAX_SPAN):
    # init dirs
    for dir in ['data', os.path.join('data', 'tiles')]:
        if not os.path.isdir(dir):
            os.mkdir(dir)

    # download the base page
    if location:
        print 'downloading the following location:', location
        oa = openanything.fetch('http://maps.google.com/maps?q=' + urllib.quote_plus(location))
    else:
        print 'downloading the default world map'
        oa = openanything.fetch('http://maps.google.com')
    if oa['status'] != 200:
        print 'error connecting to http://maps.google.com - aborting'
        return
    html = oa['data']

    # find our loc,lat,lng
    p = re.compile('laddr:"([^"]+)"')
    m = p.search(html)
    if m:
        location = m.group(1)
        print '\tlocation =', location
    else:
        if location:
            print '\tlocation not found - aborting'
            return
    p = re.compile('center:{lat:([0-9.-]+),lng:([0-9.-]+)}')
    m = p.search(html)
    if m:
        lat, lng = float(m.group(1)), float(m.group(2))
    else:
        lat, lng = 37.0625, -95.677068
    print '\tlatitude, longitude = %f, %f' % (lat, lng)

    # find our zoom level
    p = re.compile('span:{lat:([0-9.]+),lng:([0-9.]+)}')
    m = p.search(html)
    if m:
        span_lat, span_lng = float(m.group(1)), float(m.group(2))
    else:
        span_lat, span_lng = 32, 64
    print '\tspan-latitude, span-longitude = %f, %f' % (span_lat, span_lng)

    mapfiles = 'http://www.google.com/intl/en_us/mapfiles/94/maps2'

    # perform some base transformations
    html = html.replace('\xa0', '')  # beautifulsoup doesn't like this char
    html = html.replace('window.document.title = vPage.title;', 'window.document.title = "Offline Google Maps - http://code.google.com/p/ogmaps/";')
    html = html.replace('http://mt0.google.com/mt?', 'data/tiles/mt?')
    html = html.replace('http://mt1.google.com/mt?', 'data/tiles/mt?')
    html = html.replace('http://mt2.google.com/mt?', 'data/tiles/mt?')
    html = html.replace('http://mt3.google.com/mt?', 'data/tiles/mt?')
    html = html.replace('body{margin-top: 3px;margin-bottom: 0;margin-left: 8px;}', 'body{margin:0px;}')
    html = html.replace('#map {left: 20em;margin-left: 8px;margin-right: 20em;', '#map {')
    html = html.replace('var height = getWindowHeight() - offsetTop - 10;', 'var height = getWindowHeight() - offsetTop;')

    # get our kitchen soup
    soup = BeautifulSoup(html)
    hide_if_found(soup.find('div', attrs={'id': 'header'}))
    hide_if_found(soup.find('div', attrs={'id': 'guser'}))
    hide_if_found(soup.find('div', attrs={'id': 'gbar'}))
    hide_if_found(soup.find('div', attrs={'id': 'gbh'}))
    hide_if_found(soup.find('div', attrs={'id': 'hp'}))
    hide_if_found(soup.find('div', attrs={'id': 'panel'}))
    hide_if_found(soup.find('a', attrs={'id': 'paneltoggle'}))
    o = soup.find('div', attrs={'id': 'actions'})
    if o:
        o['style'] = 'display:none;'

    # get main.js and transmogrify
    if not os.path.isfile(os.path.join(RUN_FROM_DIR, 'data', 'main.js')):
        print 'downloading:', mapfiles + '/main.js'
        oa = openanything.fetch(mapfiles + '/main.js')
        js = oa['data']
        js = js.replace('function rf(a,b){', 'function rf(a,b){b = b.replace("tiles/mt?","tiles/mt_");')
        js = js.replace('mb("/maps/gen_204?ev=failed_tile&cad="+f)', 'mb("data/transparent.png")')
        js = js.replace('document.body.style[Nk]=$(0);document.body.style[Fe]=$(8)', '')
        js = js.replace('this.u.id="hmtctl";', 'this.u.id="hmtctl";this.u.style.display="none";')
        file = open(os.path.join(RUN_FROM_DIR, 'data', 'main.js'), 'w')
        file.write(js)
        file.close()

    # get mod_cb.js and transmogrify
    if not os.path.isfile(os.path.join(RUN_FROM_DIR, 'data', 'mod_cb.js')):
        print 'downloading:', mapfiles + '/mod_cb.js'
        oa = openanything.fetch(mapfiles + '/mod_cb.js')
        js = oa['data']
        js = js.replace('/mapfiles/cb', 'data')
        js = js.replace('c.id="cbcontrol";', 'c.id="cbcontrol";c.style.display="none;";')
        file = open(os.path.join(RUN_FROM_DIR, 'data', 'mod_cb.js'), 'w')
        file.write(js)
        file.close()

    # get mod_traffic_app.js and transmogrify
    if not os.path.isfile(os.path.join(RUN_FROM_DIR, 'data', 'mod_traffic_app.js')):
        print 'downloading:', mapfiles + '/mod_traffic_app.js'
        oa = openanything.fetch(mapfiles + '/mod_traffic_app.js')
        js = oa['data']
        js = js.replace('/maps/tldata', 'data/tldata')
        file = open(os.path.join(RUN_FROM_DIR, 'data', 'mod_traffic_app.js'), 'w')
        file.write(js)
        file.close()

    # get mod_ms.js and transmogrify
    if not os.path.isfile(os.path.join(RUN_FROM_DIR, 'data', 'mod_ms.js')):
        print 'downloading:', mapfiles + '/mod_ms.js'
        oa = openanything.fetch(mapfiles + '/mod_ms.js')
        js = oa['data']
        js = js.replace('http://maps.google.com', 'data')
        js = js.replace('/mapfiles', '')
        file = open(os.path.join(RUN_FROM_DIR, 'data', 'mod_ms.js'), 'w')
        file.write(js)
        file.close()

    # get other scripts
    scripts = [
        'mod_mymaps.js',
        'mod_mpl_host.js',
        'mod_kml.js',
        'mod_le.js',
    ]
    for s in scripts:
        download_if_dne(mapfiles + '/' + s, os.path.join(RUN_FROM_DIR, 'data', s))

    # get linked scripts
    for tag in soup.findAll('link'):
        try:
            href = tag['href']
            filename = href.split('/')[-1]
            download_if_dne(href, os.path.join(RUN_FROM_DIR, 'data', filename))
            tag['href'] = 'data/' + filename
        except:
            print 'error:', tag

    # get all static images
    for tag in soup.findAll('img'):
        try:
            src = tag['src']
            filename = src.split('/')[-1]
            download_if_dne(src, os.path.join(RUN_FROM_DIR, 'data', filename))
            tag['src'] = 'data/' + filename
        except:
            # print 'error:', tag
            pass

    # get other misc files
    download_if_dne('http://www.google.com/mapfiles/cb/bounds_cippppt.txt', os.path.join(RUN_FROM_DIR, 'data', 'bounds_cippppt.txt'))
    download_if_dne('http://maps.google.com/maps/tldata?tldtype=1&hl=en&country=us&callback=_xdc_._1f9onnphn', os.path.join(RUN_FROM_DIR, 'data', 'tldata'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/arrow-white.png', os.path.join(RUN_FROM_DIR, 'data', 'arrow-white.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/arrow.png', os.path.join(RUN_FROM_DIR, 'data', 'arrow.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/lmc.png', os.path.join(RUN_FROM_DIR, 'data', 'lmc.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/lmc-bottom.png', os.path.join(RUN_FROM_DIR, 'data', 'lmc-bottom.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/slider.png', os.path.join(RUN_FROM_DIR, 'data', 'slider.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/scale.png', os.path.join(RUN_FROM_DIR, 'data', 'scale.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/arrowtransparent.png', os.path.join(RUN_FROM_DIR, 'data', 'arrowtransparent.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/overcontract.gif', os.path.join(RUN_FROM_DIR, 'data', 'overcontract.gif'))
    download_if_dne('http://maps.google.com/mapfiles/etna.jpg', os.path.join(RUN_FROM_DIR, 'data', 'etna.jpg'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/drag_cross_67_16.png', os.path.join(RUN_FROM_DIR, 'data', 'drag_cross_67_16.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/iws2.png', os.path.join(RUN_FROM_DIR, 'data', 'iws2.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/iw2.png', os.path.join(RUN_FROM_DIR, 'data', 'iw2.png'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/iw_close.gif', os.path.join(RUN_FROM_DIR, 'data', 'iw_close.gif'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/iw_plus.gif', os.path.join(RUN_FROM_DIR, 'data', 'iw_plus.gif'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/iw_fullscreen.gif', os.path.join(RUN_FROM_DIR, 'data', 'iw_fullscreen.gif'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/iw_minus.gif', os.path.join(RUN_FROM_DIR, 'data', 'iw_minus.gif'))
    download_if_dne('http://www.google.com/intl/en_us/mapfiles/transparent.gif', os.path.join(RUN_FROM_DIR, 'data', 'transparent.gif'))
    # download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/', os.path.join(RUN_FROM_DIR, 'data', '') )

    # some post transformations, then write to disk
    html = soup.prettify()
    html = html.replace(mapfiles, 'data')
    html = html.replace('http://www.google.com/intl/en_us/mapfiles', 'data')
    html = html + '<style>div.contextmenu {display:none;}</style>'
    file = open(os.path.join(RUN_FROM_DIR, 'ogmap.html'), 'w')
    file.write(html)
    file.close()
    #print html

    # get map data
    for zl in range(17, -3, -1):
        get_map_data(
            get_tile_coords(lat - span_lat, lng - span_lng, zl),
            get_tile_coords(lat + span_lat, lng + span_lng, zl),
            zl, max_span)

    print '\nyour offline google map is ready at:', RUN_FROM_DIR + 'ogmap.html'
def getFile(source):
    """Use openanything to open a file or url and return a dictionary of info about it."""
    file = openanything.fetch(source)
    return file
import openanything

useragent = "Python-chy"
url = "http://sports.163.com/special/00051K7F/rss_sportslq.xml"
params = openanything.fetch(url, agent=useragent)
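# A hedged follow-up sketch (not part of the original snippet):
# openanything.fetch() returns a dict with 'status', 'data', 'url',
# 'etag', and 'lastmodified' keys, so check the status before using the body.
if params['status'] == 200:
    print 'fetched %s (%d bytes)' % (params['url'], len(params['data']))
else:
    print 'fetch failed with HTTP status', params['status']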