Example #1
 def get(self):
   url=self.request.get('url',"")
   if url:
       if "://" not in url:
           url = "http://"+url
   logging.info("ProxyHandler url: '%s' " %(url))
   uthparams = openanything.fetch(siteName+'/urltohash?url=%s' %(urllib.quote(url)))
   logging.info("ProxyHandler urltohash: '%s' status: '%s' " %(uthparams.get('data','uh oh'),uthparams.get('status','?')))
   hashinfo = json.loads(uthparams.get('data','[]'))
   finalurl=''
   if hashinfo:
       thehash = hashinfo[0].get('hash','')
       if thehash:
           h2uparams = openanything.fetch(siteName+'/hashtourl/%s' %(thehash))
           logging.info("ProxyHandler hashtourl: '%s' status: '%s' " %(h2uparams.get('data','uh oh'),h2uparams.get('status','?')))
           urlinfo = json.loads(h2uparams.get('data','[]'))
           if urlinfo:
               finalurl = urlinfo[0].get('url','').encode('utf8')
   if finalurl:
       self.redirect(finalurl)
   else:
       template = JINJA_ENVIRONMENT.get_template('errorpage.html')
       svgVals = { 'error':"Proxy can't find a hash for %s" % url }
       self.response.set_status(404)
       self.response.write(template.render(svgVals))
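The handler above chains two JSON endpoints: /urltohash is expected to return a list of objects carrying a 'hash' key, and /hashtourl/<hash> a list of objects carrying a 'url' key. A minimal sketch of the payload shapes this code assumes, with purely illustrative values (the 'sha1-' prefix is borrowed from Example #4 below and the SVG URL from Example #7):

import json

# illustrative payloads only; the shapes are inferred from the .get() calls above
hashinfo = json.loads('[{"hash": "sha1-..."}]')                  # /urltohash response
urlinfo = json.loads('[{"url": "http://svgur.com/i/AU.svg"}]')   # /hashtourl response
print hashinfo[0].get('hash', '')   # -> sha1-...
print urlinfo[0].get('url', '')     # -> http://svgur.com/i/AU.svg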
Example #2
 def get_data(self, strURL):
     try:
         oa = openanything.fetch(strURL)
         if oa['status'] == 200:
             return oa['data']
     except Exception:
         return False
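Note that get_data returns the response body only on HTTP 200, returns False when openanything.fetch raises, and falls through to an implicit None for any other status, so callers should treat every falsy result as a failure. A minimal sketch of such a caller; 'client' is a hypothetical object providing get_data, and the feed URL is borrowed from Example #16 below:

# hypothetical caller; 'client' stands in for whatever object defines get_data
data = client.get_data('http://sports.163.com/special/00051K7F/rss_sportslq.xml')
if data:
    print 'fetched %d bytes' % len(data)
else:
    print 'fetch failed (exception or non-200 status)'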
Example #3
 def fetch(self, url, agent="Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0"):
     """Fetch down a url using openanything. This function is wrapped
     for simplicity."""
     result = openanything.fetch(url, None, None, agent)
     if result['status'] != 200:
         raise StandardError('Tried to fetch ' + url + ' and got ' + str(result['status']))
     return result['data']
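Since the wrapper raises StandardError (Python 2's base exception class) on any non-200 status, callers typically guard it with try/except. A minimal sketch, assuming the method lives on a hypothetical object named 'client' and reusing a URL that appears in Example #14 below:

# hypothetical caller; 'client' is whatever object defines the fetch() wrapper above
try:
    html = client.fetch('http://maps.google.com')
    print 'got %d bytes' % len(html)
except StandardError, e:
    print 'fetch failed:', e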
Example #4
 def get(self):
   url=self.request.get('url',"")
   if url:
       if "://" not in url:
           url = "http://"+url
   filename = urlparse.urlsplit(url).path.split('/')[-1]
   bits= filename.split('.')
   key = bits[0]
   resource = int(newbase60.sxgtonum(urllib.unquote(key)))
   logging.info("ArchiveUrlToHashHandler url: '%s' " %(url))
   uthparams = openanything.fetch('http://web.archive.org/cdx/search/cdx?url=%s' %(urllib.quote(url)))
   logging.info("ArchiveUrlToHashHandler urltohash: '%s' status: '%s' " %(uthparams.get('data','uh oh'),uthparams.get('status','?')))
   #format is com,svgur)/i/au.svg 20160829212327 http://svgur.com/i/AU.svg image/svg+xml 200 LY7RXMB7SLQLKEB63LGFNYY7F3SYRCNQ 3079
   output=[]
   for line in uthparams.get('data','').splitlines():
       qpath,fetchdate,foundurl,mimetype,result,base32hash,length = line.split(' ')
       if result == '200':
           output.append({'url':foundurl,'hash':'sha1-%s' % (Base32toBase64(base32hash)), 'date':datetime.datetime.strptime(fetchdate,'%Y%m%d%H%M%S').isoformat()})
   if output:
       self.response.headers['Content-Type'] = 'application/json'
       self.response.write(json.dumps(output))
   else:
       template = JINJA_ENVIRONMENT.get_template('errorpage.html')
       svgVals = { 'error':"No url here like '%s'" % url }
       self.response.set_status(404)
       self.response.write(template.render(svgVals))
Example #5
 def f(paper):
     for a in soup.findAll('a'):
         for c in a.contents:
             if str(c).lower().find('bibtex') != -1:
                 print thread.get_ident(), 'found bibtex link:', a
                 params_bibtex = openanything.fetch(
                     'http://scholar.google.com' + a['href'])
                 if params_bibtex['status'] == 200 or params_bibtex[
                         'status'] == 302:
                     paper = update_paper_from_bibtex_html(
                         paper, params_bibtex['data'])
                     return
Example #6
def download_if_dne(href, filename):
    if os.path.isfile(filename):
#        print 'already downloaded:', href
        return False
    else:
        try:
            print 'downloading:', href
            oa = openanything.fetch(href)
            if oa['status']==200:
                file = open( filename, 'w' )
                file.write( oa['data'] )
                file.close()
            return True
        except KeyboardInterrupt:
            raise
        except:
            print '\tdownload failed -', sys.exc_info()[0]
            return False
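Example #14 below drives this helper by deriving the local filename from the tail of the URL; a minimal sketch of that calling pattern, reusing a sample URL and the 'data' directory from Example #14:

# mirrors the calls made in Example #14
import os
href = 'http://www.google.com/intl/en_us/mapfiles/arrow.png'
download_if_dne(href, os.path.join('data', href.split('/')[-1]))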
Example #7
 def get(self):
   url=self.request.get('url',"")
   if not url:
       url = "http://svgur.com/i/AU.svg"
   if url:
       if "://" not in url:
           url = "http://"+url
   thehash=''
   uthparams = openanything.fetch(siteName+'/urltohash?url=%s' %(urllib.quote(url)))
   if uthparams.get('status') == 200:
       hashinfo = json.loads(uthparams.get('data','[]'))
       if hashinfo:
           thehash = hashinfo[0].get('hash','')
   template = JINJA_ENVIRONMENT.get_template('dweb.html')
   vals = { 'url':url, 'proxyurl':'/proxy?url=%s' %(urllib.quote(url)),
           'urltohash':'/urltohash?url=%s' %(urllib.quote(url)), 
           'iaurltohashraw':'http://web.archive.org/cdx/search/cdx?url=%s' %(urllib.quote(url)), 
           'iaurltohash':'/iaurltohash?url=%s' %(urllib.quote(url)), 
           'haurltohash':'https://hash-archive.org/history/%s' %(url), 
           'hashtourl':'/hashtourl/'+thehash,
           'hasharchive':'https://hash-archive.org/sources/'+thehash,
           }
   self.response.write(template.render(vals))
Example #8
def _import_unknown_citation_old(params, orig_url, paper=None):

    if params['data'].startswith('%PDF'):

        # we have a live one!
        try:
            filename = params['url'][params['url'].rfind('/') + 1:]
            data = params['data']
            print thread.get_ident(), 'importing paper =', filename

            if not paper:
                md5_hexdigest = get_md5_hexdigest_from_data(data)
                paper, created = get_or_create_paper_via(
                    full_text_md5=md5_hexdigest)
                if created:
                    #paper.title = filename
                    paper.save_file(
                        defaultfilters.slugify(filename.replace('.pdf', '')) +
                        '.pdf', data)
                    paper.import_url = orig_url
                    paper.save()
                    print thread.get_ident(), 'imported paper =', filename
                else:
                    print thread.get_ident(
                    ), 'paper already exists: paper =', paper.id, paper.doi, paper.title, paper.get_authors_in_order(
                    )
            else:
                paper.save_file(
                    defaultfilters.slugify(filename.replace('.pdf', '')) +
                    '.pdf', data)
                paper.import_url = orig_url
                paper.save()
        except:
            traceback.print_exc()
            if paper:
                paper.delete()
                paper = None

    else:

        # see
        try:
            web_dir_root = params['url'][:params['url'].find('/', 8)]
            web_dir_current = params['url'][:params['url'].rfind('/')]
            for a in p_html_a.findall(params['data']):
                try:
                    href = p_html_a_href.search(a).group(1)
                except:
                    print thread.get_ident(
                    ), 'couldn\'t figure out href from link:', a
                    continue
                # strip params
                if href.find('?') > 0:
                    href = href[:href.find('?')]
                # normalize to fully qualified name
                if not href.lower().startswith('http'):
                    if href.startswith('/'):
                        href = web_dir_root + href
                    else:
                        href = web_dir_current + '/' + href
                if href.lower().endswith('.pdf'):
                    print "href", href
                    paper = _import_unknown_citation(openanything.fetch(href),
                                                     orig_url,
                                                     paper=paper)
                    if paper:
                        update_paper_from_bibtex_html(paper, params['data'])
                        paper.save()
                        break
        except:
            traceback.print_exc()
            if paper:
                paper.delete()
                paper = None

    return paper
Example #9
def find_and_attach_pdf(paper, urls, visited_urls=set()):

    # search for a PDF linked directly
    for url in urls:
        if url.find('?') > 0: url = url[:url.find('?')]
        if url.lower().endswith('pdf'):
            print thread.get_ident(), 'found pdf link:', url
            visited_urls.add(url)
            params = openanything.fetch(url)
            if params['status'] == 200 or params['status'] == 302:
                if params['data'].startswith('%PDF'):
                    # we have a live one!
                    try:
                        filename = params['url'][params['url'].rfind('/') + 1:]
                        print thread.get_ident(), 'importing paper =', filename
                        paper.save_file(
                            defaultfilters.slugify(filename.replace(
                                '.pdf', '')) + '.pdf', params['data'])
                        paper.save()
                        return True
                    except:
                        traceback.print_exc()

    for url in urls:
        visited_urls.add(url)
        params = openanything.fetch(url)
        if params['status'] == 200 or params['status'] == 302:
            if params['data'].startswith('%PDF'):
                # we have a live one!
                try:
                    filename = params['url'][params['url'].rfind('/') + 1:]
                    print thread.get_ident(), 'importing paper =', filename
                    paper.save_file(
                        defaultfilters.slugify(filename.replace('.pdf', '')) +
                        '.pdf', params['data'])
                    paper.save()
                    return True
                except:
                    traceback.print_exc()
            else:
                soup = BeautifulSoup.BeautifulSoup(params['data'])
                promising_links = set()
                for a in soup.findAll('a', href=True):
                    if len(a.contents) > 8: continue
                    web_dir_root = params['url'][:params['url'].find('/', 8)]
                    web_dir_current = params['url'][:params['url'].rfind('/')]
                    href = a['href']
                    if not href.lower().startswith('http'):
                        if href.startswith('/'):
                            href = web_dir_root + href
                        else:
                            href = web_dir_current + '/' + href
                    x = href
                    if x.find('?') > 0: x = x[:x.find('?')]
                    if x.lower().endswith('pdf'):
                        if href not in visited_urls:
                            print thread.get_ident(), 'found pdf link:', a
                            promising_links.add(href)
                            continue
                    for c in a.contents:
                        c = str(c).lower()
                        if c.find('pdf') != -1:
                            if href not in visited_urls:
                                print thread.get_ident(), 'found pdf link:', a
                                promising_links.add(href)
                                continue
                if promising_links: print promising_links
                if find_and_attach_pdf(paper,
                                       list(promising_links),
                                       visited_urls=visited_urls):
                    return
Example #10
def _import_ieee_citation(params, paper=None):
    print thread.get_ident(), 'downloading ieee citation:', params['url']
    try:
        print thread.get_ident(), 'parsing...'
        soup = BeautifulSoup.BeautifulSoup(params['data'].replace(
            '<!-BMS End-->', '').replace('<in>', ''))

        print soup.find('span', attrs={'class': 'headNavBlueXLarge2'})

        p_arnumber = re.compile('<arnumber>[0-9]+</arnumber>', re.IGNORECASE)
        match = p_arnumber.search(params['data'])
        if match:
            arnumber = match.group(0)
            print 'arnumber', arnumber
            params_bibtex = openanything.fetch(
                'http://ieeexplore.ieee.org/xpls/citationAct',
                post_data={
                    'dlSelect': 'cite_abs',
                    'fileFormate': 'BibTex',
                    'arnumber': arnumber,
                    'Submit': 'Download'
                })
            print params_bibtex
            if params_bibtex['status'] == 200 or params_bibtex['status'] == 302:
                paper = update_paper_from_bibtex_html(paper,
                                                      params_bibtex['data'])

        if not paper:
            paper, created = get_or_create_paper_via(
                title=html_strip(
                    str(soup.find('title').string).replace('IEEEXplore#', '')),
                doi=re.search('Digital Object Identifier: ([a-zA-Z0-9./]*)',
                              params['data']).group(1),
            )
            if created: paper.save()
            else:
                print thread.get_ident(), 'paper already imported'
                if not _should_we_reimport_paper(paper):
                    return


#        publisher, created = Publisher.objects.get_or_create(
#            name=html_strip( BeautifulSoup.BeautifulSoup( re.search( 'This paper appears in: (.*)', params['data'] ).group(1) ).a.strong.string ),
#        )
#        print 'publisher', publisher
#        if created: publisher.save()

        source_string = html_strip(
            BeautifulSoup.BeautifulSoup(
                re.search('This paper appears in: (.*)',
                          params['data']).group(1)).a.strong.string)
        try:
            location = html_strip(
                re.search('Location: (.*)', params['data']).group(1))
        except:
            location = ''
        source, created = Source.objects.get_or_create(
            name=source_string,
            issue=html_strip(''),
            location=location,
            publication_date=None,
            publisher=None,
        )

        paper.import_url = params['url']
        paper.source = source
        paper.source_session = ''
        #paper.source_pages = html_strip( re.search( 'On page(s):(.*)<BR>', params['data'], re.DOTALL ).group(1) ),
        paper.abstract = html_strip(
            soup.findAll('td',
                         attrs={'class':
                                'bodyCopyBlackLargeSpaced'})[0].contents[-1])
        paper.save()

        for node in soup.findAll('a', attrs={'class': 'bodyCopy'}):
            if node.contents[0] == 'PDF':
                file_url = IEEE_BASE_URL + node['href']
                print thread.get_ident(), 'downloading paper from', file_url
                params = openanything.fetch(file_url)
                if params['status'] == 200 or params['status'] == 302:
                    if params['data'].startswith('%PDF'):
                        ext = params['url'][params['url'].rfind('.') + 1:]
                        if not ext or len(ext) > 5:
                            ext = 'pdf'
                        paper.save_file(
                            defaultfilters.slugify(paper.doi) + '_' +
                            defaultfilters.slugify(paper.title) + '.' +
                            defaultfilters.slugify(ext), params['data'])
                        paper.save()
                    else:
                        print thread.get_ident(
                        ), 'this isn\'t a pdf file:', params['url']
                    break
                else:
                    print thread.get_ident(
                    ), 'error downloading paper:', params

        print thread.get_ident(
        ), 'imported paper =', paper.id, paper.doi, paper.title, paper.get_authors_in_order(
        )
        return paper
    except:
        traceback.print_exc()
Example #11
def _import_acm_citation(params, paper=None):
    print thread.get_ident(), 'downloading acm citation:', params['url']
    try:
        print thread.get_ident(), 'parsing...'

        soup = BeautifulSoup.BeautifulSoup(params['data'])

        title = []
        for node in soup.findAll('td',
                                 attrs={'class':
                                        'medium-text'})[0].findAll('strong'):
            title.append(node.string)
        try:
            doi = str(
                soup.find('form', attrs={
                    'name': 'popbinder'
                }).nextSibling.table.findAll('tr')[-1].findAll('td')
                [-1].a.string)
            if doi.startswith('http://doi.acm.org/'):
                doi = doi[len('http://doi.acm.org/'):]
        except:
            doi = ''

        full_text_data = None
        full_text_filename = None
        for node in soup.findAll('a', attrs={'name': 'FullText'}):
            if node.contents[1] == 'Pdf':
                file_url = ACM_BASE_URL + '/' + node['href']
                print thread.get_ident(), 'downloading paper from', file_url
                params_file = openanything.fetch(file_url)
                if params_file['status'] == 200 or params_file['status'] == 302:
                    try:
                        ext = params_file['url']
                        if ext.find('?') > -1:
                            ext = file_url[0:ext.find('?')]
                        ext = ext[ext.rfind('.') + 1:]
                    except:
                        ext = 'unknown'

                    if params_file['data'].startswith('%PDF'):
                        #paper.save_file( defaultfilters.slugify(paper.doi) +'_'+ defaultfilters.slugify(paper.title) +'.pdf', params_file['data'] )
                        full_text_filename = defaultfilters.slugify(
                            doi) + '_' + defaultfilters.slugify(
                                title) + '.' + defaultfilters.slugify(ext)
                        full_text_data = params_file['data']
                    elif params_file['data'].find(
                            '<!DOCTYPE') > -1 and params_file['data'].find(
                                'logfrm') > -1:
                        # it appears we have an ACM login page...

                        global ACM_USERNAME
                        global ACM_PASSWORD
                        if not ACM_USERNAME:
                            dialog = gtk.MessageDialog(
                                type=gtk.MESSAGE_QUESTION,
                                buttons=gtk.BUTTONS_OK_CANCEL,
                                flags=gtk.DIALOG_MODAL)
                            #dialog.connect('response', lambda x,y: dialog.destroy())
                            dialog.set_markup(
                                '<b>ACM Login</b>\n\nEnter your ACM username and password:'******'username': ACM_USERNAME,
                            'password': ACM_PASSWORD,
                            'submit': 'Login'
                        }
                        params_login = openanything.fetch(
                            'https://portal.acm.org/poplogin.cfm?is=0&amp;dl=ACM&amp;coll=ACM&amp;comp_id=1220288&amp;want_href=delivery%2Ecfm%3Fid%3D1220288%26type%3Dpdf%26CFID%3D50512225%26CFTOKEN%3D24664038&amp;CFID=50512225&amp;CFTOKEN=24664038&amp;td=1200684914991',
                            post_data=post_data,
                        )
                        print "params_login['url']", params_login['url']
                        cfid = re.search('CFID=([0-9]*)',
                                         params_login['data']).group(1)
                        cftoken = re.search('CFTOKEN=([0-9]*)',
                                            params_login['data']).group(1)
                        new_file_url = file_url[0:file_url.find(
                            '&CFID=')] + '&CFID=%s&CFTOKEN=%s' % (cfid,
                                                                  cftoken)
                        print 'new_file_url', new_file_url
                        params_file = openanything.fetch(new_file_url)
                        if params_file['status'] == 200 or params_file[
                                'status'] == 302:
                            if params_file['data'].startswith('%PDF'):
                                full_text_filename = defaultfilters.slugify(
                                    doi) + '_' + defaultfilters.slugify(
                                        title) + '.' + defaultfilters.slugify(
                                            ext)
                                full_text_data = params_file['data']
                            else:
                                print thread.get_ident(
                                ), 'error downloading paper - still not a pdf after login:'******'error downloading paper - after login:'******'this does not appear to be a pdf file...'
                        ext = params_file['url'][params_file['url'].
                                                 rfind('.') + 1:]
                        if not ext or len(ext) > 5:
                            ext = 'unknown'
                        #paper.save_file( defaultfilters.slugify(paper.doi) +'_'+ defaultfilters.slugify(paper.title) +'.'+ defaultfilters.slugify(ext), params_file['data'] )
                        full_text_filename = defaultfilters.slugify(
                            doi) + '_' + defaultfilters.slugify(
                                title) + '.' + defaultfilters.slugify(ext)
                        full_text_data = params_file['data']
                    #paper.save()
                    break
                else:
                    print thread.get_ident(
                    ), 'error downloading paper:', params_file

        if not paper:
            if full_text_data:
                md5_hexdigest = get_md5_hexdigest_from_data(full_text_data)
            else:
                md5_hexdigest = None
            paper, created = get_or_create_paper_via(
                title=html_strip(''.join(title)),
                doi=doi,
                full_text_md5=md5_hexdigest,
            )
            if created:
                if full_text_filename and full_text_data:
                    paper.save_file(full_text_filename, full_text_data)
                paper.save()
            else:
                print thread.get_ident(), 'paper already imported'
                if not _should_we_reimport_paper(paper):
                    return
        else:
            paper.title = html_strip(''.join(title))
            paper.doi = doi
            paper.save()
            if full_text_filename and full_text_data:
                paper.save_file(full_text_filename, full_text_data)

        paper.import_url = params['url']

        try:
            paper.source_session = html_strip(
                re.search('SESSION:(.*)', params['data']).group(1))
        except:
            pass
        try:
            abstract_node = soup.find('p', attrs={'class': 'abstract'}).string
            if abstract_node:
                paper.abstract = html_strip(abstract_node)
            else:
                paper.abstract = ''
        except:
            pass
        paper.save()

        p_bibtex_link = re.compile("popBibTex.cfm[^']+")
        bibtex_link = p_bibtex_link.search(params['data'])
        if bibtex_link:
            params_bibtex = openanything.fetch('http://portal.acm.org/' +
                                               bibtex_link.group(0))
            if params_bibtex['status'] == 200 or params_bibtex['status'] == 302:
                update_paper_from_bibtex_html(paper, params_bibtex['data'])

        node = soup.find('div', attrs={'class': 'sponsors'})
        if node:
            for node in node.contents:
                if isinstance(node, BeautifulSoup.NavigableString):
                    sponsor_name = html_strip(node.replace(':', ''))
                    if sponsor_name:
                        sponsor, created = Sponsor.objects.get_or_create(
                            name=sponsor_name, )
                        if created: sponsor.save()
                        paper.sponsors.add(sponsor)

        if soup.find('a', attrs={'name': 'references'}):
            for node in soup.find('a', attrs={
                    'name': 'references'
            }).parent.findNextSibling('table').findAll('tr'):
                node = node.findAll('td')[2].div
                line = None
                doi = ''
                acm_referencing_url = ''
                for a in node.findAll('a'):
                    if a['href'].startswith('citation'):
                        line = html_strip(a.string)
                        acm_referencing_url = ACM_BASE_URL + '/' + a['href']
                    if a['href'].startswith('http://dx.doi.org'):
                        doi = html_strip(a.string)
                if not line: line = html_strip(node.contents[0])
                reference, created = Reference.objects.get_or_create(
                    line_from_referencing_paper=line,
                    url_from_referencing_paper=acm_referencing_url,
                    doi_from_referencing_paper=doi,
                    referencing_paper=paper,
                )
                if created: reference.save()

        if soup.find('a', attrs={'name': 'citings'}):
            for node in soup.find('a', attrs={
                    'name': 'citings'
            }).parent.findNextSibling('table').findAll('tr'):
                node = node.findAll('td')[1].div
                if node.string:
                    reference, created = Reference.objects.get_or_create(
                        line_from_referenced_paper=html_strip(node.string),
                        referenced_paper=paper,
                    )
                    if created: reference.save()
                else:
                    line = ''
                    doi = ''
                    for a in node.findAll('a'):
                        if a['href'].startswith('citation'):
                            line = html_strip(a.string)
                            url_from_referenced_paper = ACM_BASE_URL + '/' + a[
                                'href']
                        if a['href'].startswith('http://dx.doi.org'):
                            doi = html_strip(a.string)
                    reference, created = Reference.objects.get_or_create(
                        line_from_referenced_paper=line,
                        url_from_referenced_paper=url_from_referenced_paper,
                        doi_from_referenced_paper=doi,
                        referenced_paper=paper,
                    )
                    if created: reference.save()

        paper.save()
        print thread.get_ident(
        ), 'imported paper =', paper.doi, paper.title, paper.get_authors_in_order(
        )
        return paper
    except:
        traceback.print_exc()
Example #12
def import_citation(url, paper=None, callback=None):
    active_threads[thread.get_ident()] = 'importing: ' + url
    try:
        params = openanything.fetch(url)
        if params['status'] != 200 and params['status'] != 302:
            print thread.get_ident(), 'unable to download: %s  (%i)' % (
                url, params['status'])
            #            gtk.gdk.threads_enter()
            #            error = gtk.MessageDialog( type=gtk.MESSAGE_ERROR, buttons=gtk.BUTTONS_OK, flags=gtk.DIALOG_MODAL )
            #            error.set_markup('<b>Unable to Download Paper</b>\n\nThe following url:\n<i>%s</i>\n\nreturned the HTTP error code: %i' % ( url.replace('&', '&amp;'), params['status'] ))
            #            error.run()
            #            gtk.gdk.threads_leave()
            return

        if params['data'].startswith('%PDF'):
            # this is a pdf file
            filename = params['url'][params['url'].rfind('/') + 1:]
            # strip params
            if filename.find('?') > 0: filename = filename[:filename.find('?')]
            data = params['data']
            print thread.get_ident(), 'importing paper =', filename

            if not paper:
                md5_hexdigest = get_md5_hexdigest_from_data(data)
                paper, created = get_or_create_paper_via(
                    full_text_md5=md5_hexdigest)
                if created:
                    #paper.title = filename
                    paper.save_file(
                        defaultfilters.slugify(filename.replace('.pdf', '')) +
                        '.pdf', data)
                    paper.import_url = url
                    paper.save()
                    print thread.get_ident(), 'imported paper =', filename
                else:
                    print thread.get_ident(
                    ), 'paper already exists: paper =', paper.id, paper.doi, paper.title, paper.get_authors_in_order(
                    )
            else:
                paper.save_file(
                    defaultfilters.slugify(filename.replace('.pdf', '')) +
                    '.pdf', data)
                paper.import_url = url
                paper.save()
            return paper

        if params['url'].startswith('http://portal.acm.org/citation'):
            paper = _import_acm_citation(params, paper=paper)
            if paper and callback: callback()
            return paper


#        if params['url'].startswith('http://dx.doi.org'):
#            paper = import_unknown_citation(params)
#            if paper and refresh_after: main_gui.refresh_middle_pane_search()
#            return paper

        if params['url'].startswith('http://ieeexplore.ieee.org'):
            if params['url'].find('search/wrapper.jsp') > -1:
                paper = _import_ieee_citation(openanything.fetch(
                    params['url'].replace('search/wrapper.jsp',
                                          'xpls/abs_all.jsp')),
                                              paper=paper)
                if paper and callback: callback()
            else:
                paper = _import_ieee_citation(params, paper=paper)
                if paper and callback: callback()
            return paper

        if params['url'].startswith('http://scholar.google.com'):
            paper = _import_google_scholar_citation(params, paper=paper)
            if paper and callback: callback()
            return paper

        # let's see if there's a pdf somewhere in here...
        paper = _import_unknown_citation(params, params['url'], paper=paper)
        if paper and callback: callback()
        if paper: return paper

    except:
        traceback.print_exc()
        gtk.gdk.threads_enter()
        error = gtk.MessageDialog(type=gtk.MESSAGE_ERROR,
                                  buttons=gtk.BUTTONS_OK,
                                  flags=gtk.DIALOG_MODAL)
        error.connect('response', lambda x, y: error.destroy())
        error.set_markup(
            '<b>Unknown Error</b>\n\nUnable to download this resource.')
        error.run()
        gtk.gdk.threads_leave()

    gtk.gdk.threads_enter()
    error = gtk.MessageDialog(type=gtk.MESSAGE_ERROR,
                              buttons=gtk.BUTTONS_OK,
                              flags=gtk.DIALOG_MODAL)
    error.connect('response', lambda x, y: error.destroy())
    error.set_markup(
        '<b>No Paper Found</b>\n\nThe given URL does not appear to contain or link to any PDF files. (perhaps you have to buy it?) Try downloading the file and adding it using "File &gt;&gt; Import..."\n\n%s'
        % pango_escape(url))
    error.run()
    gtk.gdk.threads_leave()
    if active_threads.has_key(thread.get_ident()):
        del active_threads[thread.get_ident()]
Example #13
def urlfix(url):
    return svgfix(openanything.fetch(url).get('data', ''))
Example #14
def download(location=None, max_span=DEFAULT_MAX_SPAN):
    
    # init dirs
    for dir in [ 'data', os.path.join('data','tiles') ]:
        if not os.path.isdir(dir):
            os.mkdir(dir)

    # download the base page
    if location:
        print 'downloading the following location:', location
        oa = openanything.fetch( 'http://maps.google.com/maps?q='+urllib.quote_plus(location) )
    else:
        print 'downloading the default world map'
        oa = openanything.fetch('http://maps.google.com')
    if oa['status']!=200:
        print 'error connecting to http://maps.google.com - aborting'
        return
    html = oa['data']
    
    # find our loc,lat,lng
    p = re.compile('laddr:"([^"]+)"')
    m = p.search(html)
    if m:
        location = m.group(1)
        print '\tlocation =',location
    else:
        if location:
            print '\tlocation not found - aborting'
            return
    p = re.compile('center:{lat:([0-9.-]+),lng:([0-9.-]+)}')
    m = p.search(html)
    if m:
        lat, lng = float(m.group(1)), float(m.group(2))
    else:
        lat, lng = 37.0625,-95.677068
    print '\tlatitude, longitude = %f, %f' % (lat, lng)
    
    # find our zoom level
    p = re.compile('span:{lat:([0-9.]+),lng:([0-9.]+)}')
    m = p.search(html)
    if m:
        span_lat, span_lng = float(m.group(1)), float(m.group(2))
    else:
        span_lat, span_lng = 32, 64
    print '\tspan-latitude, span-longitude = %f, %f' % (span_lat, span_lng)
    
    mapfiles = 'http://www.google.com/intl/en_us/mapfiles/94/maps2'

    # perform some base transformations
    html = html.replace('&#160;', '') # beautifulsoup doesn't like this char
    html = html.replace('window.document.title = vPage.title;', 'window.document.title = "Offline Google Maps - http://code.google.com/p/ogmaps/";')
    html = html.replace('http://mt0.google.com/mt?', 'data/tiles/mt?')
    html = html.replace('http://mt1.google.com/mt?', 'data/tiles/mt?')
    html = html.replace('http://mt2.google.com/mt?', 'data/tiles/mt?')
    html = html.replace('http://mt3.google.com/mt?', 'data/tiles/mt?')
    html = html.replace('body{margin-top: 3px;margin-bottom: 0;margin-left: 8px;}', 'body{margin:0px;}')
    html = html.replace('#map {left: 20em;margin-left: 8px;margin-right: 20em;', '#map {')
    html = html.replace('var height = getWindowHeight() - offsetTop - 10;', 'var height = getWindowHeight() - offsetTop;')
    
    # get our kitchen
    soup = BeautifulSoup(html)
    
    hide_if_found( soup.find('div', attrs={'id':'header'}) )
    hide_if_found( soup.find('div', attrs={'id':'guser'}) )
    hide_if_found( soup.find('div', attrs={'id':'gbar'}) )
    hide_if_found( soup.find('div', attrs={'id':'gbh'}) )
    hide_if_found( soup.find('div', attrs={'id':'hp'}) )
    hide_if_found( soup.find('div', attrs={'id':'panel'}) )
    hide_if_found( soup.find('a', attrs={'id':'paneltoggle'}) )

    o = soup.find('div', attrs={'id':'actions'})
    if o:
        o['style'] = 'display:none;'

    # get main.js and transmogrify
    if not os.path.isfile(os.path.join(RUN_FROM_DIR, 'data', 'main.js')):
        print 'downloading:', mapfiles+'/main.js'
        oa = openanything.fetch(mapfiles+'/main.js')
        js = oa['data']
        js = js.replace('function rf(a,b){','function rf(a,b){b = b.replace("tiles/mt?","tiles/mt_");')
        js = js.replace('mb("/maps/gen_204?ev=failed_tile&cad="+f)','mb("data/transparent.png")')
        js = js.replace('document.body.style[Nk]=$(0);document.body.style[Fe]=$(8)','')
        js = js.replace('this.u.id="hmtctl";','this.u.id="hmtctl";this.u.style.display="none";')
        file = open( os.path.join(RUN_FROM_DIR, 'data', 'main.js'), 'w' )
        file.write( js )
        file.close()
    
    # get mod_cb.js and transmogrify
    if not os.path.isfile(os.path.join(RUN_FROM_DIR, 'data', 'mod_cb.js')):
        print 'downloading:', mapfiles+'/mod_cb.js'
        oa = openanything.fetch(mapfiles+'/mod_cb.js')
        js = oa['data']
        js = js.replace('/mapfiles/cb','data')
        js = js.replace('c.id="cbcontrol";','c.id="cbcontrol";c.style.display="none;";')
        file = open( os.path.join(RUN_FROM_DIR, 'data', 'mod_cb.js'), 'w' )
        file.write( js )
        file.close()
    
    # get mod_traffic_app.js and transmogrify
    if not os.path.isfile(os.path.join(RUN_FROM_DIR, 'data', 'mod_traffic_app.js')):
        print 'downloading:', mapfiles+'/mod_traffic_app.js'
        oa = openanything.fetch(mapfiles+'/mod_traffic_app.js')
        js = oa['data']
        js = js.replace('/maps/tldata','data/tldata')
        file = open( os.path.join(RUN_FROM_DIR, 'data', 'mod_traffic_app.js'), 'w' )
        file.write( js )
        file.close()
    
    # get mod_ms.js and transmogrify
    if not os.path.isfile(os.path.join(RUN_FROM_DIR, 'data', 'mod_ms.js')):
        print 'downloading:', mapfiles+'/mod_ms.js'
        oa = openanything.fetch(mapfiles+'/mod_ms.js')
        js = oa['data']
        js = js.replace('http://maps.google.com','data')
        js = js.replace('/mapfiles','')
        file = open( os.path.join(RUN_FROM_DIR, 'data', 'mod_ms.js'), 'w' )
        file.write( js )
        file.close()
    
    # get other scripts
    scripts = [ 'mod_mymaps.js', 'mod_mpl_host.js', 'mod_kml.js', 'mod_le.js', ]
    for s in scripts:
        download_if_dne( mapfiles+'/'+s, os.path.join(RUN_FROM_DIR, 'data', s) )
    
    # get linked scripts
    for tag in soup.findAll('link'):
        try:
            href = tag['href']
            filename = href.split('/')[-1]
            download_if_dne( href, os.path.join(RUN_FROM_DIR, 'data', filename) )
            tag['href'] = 'data/'+filename
        except:
            print 'error:', tag

    # get all static images
    for tag in soup.findAll('img'):
        try:
            src = tag['src']
            filename = src.split('/')[-1]
            download_if_dne( src, os.path.join(RUN_FROM_DIR, 'data', filename) )
            tag['src'] = 'data/'+filename
        except:
            # print 'error:', tag
            pass
    
    # get other misc files
    download_if_dne( 'http://www.google.com/mapfiles/cb/bounds_cippppt.txt', os.path.join(RUN_FROM_DIR, 'data', 'bounds_cippppt.txt') )
    download_if_dne( 'http://maps.google.com/maps/tldata?tldtype=1&hl=en&country=us&callback=_xdc_._1f9onnphn', os.path.join(RUN_FROM_DIR, 'data', 'tldata') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/arrow-white.png', os.path.join(RUN_FROM_DIR, 'data', 'arrow-white.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/arrow.png', os.path.join(RUN_FROM_DIR, 'data', 'arrow.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/lmc.png', os.path.join(RUN_FROM_DIR, 'data', 'lmc.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/lmc-bottom.png', os.path.join(RUN_FROM_DIR, 'data', 'lmc-bottom.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/slider.png', os.path.join(RUN_FROM_DIR, 'data', 'slider.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/scale.png', os.path.join(RUN_FROM_DIR, 'data', 'scale.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/arrowtransparent.png', os.path.join(RUN_FROM_DIR, 'data', 'arrowtransparent.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/overcontract.gif', os.path.join(RUN_FROM_DIR, 'data', 'overcontract.gif') )
    download_if_dne( 'http://maps.google.com/mapfiles/etna.jpg', os.path.join(RUN_FROM_DIR, 'data', 'etna.jpg') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/drag_cross_67_16.png', os.path.join(RUN_FROM_DIR, 'data', 'drag_cross_67_16.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/iws2.png', os.path.join(RUN_FROM_DIR, 'data', 'iws2.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/iw2.png', os.path.join(RUN_FROM_DIR, 'data', 'iw2.png') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/iw_close.gif', os.path.join(RUN_FROM_DIR, 'data', 'iw_close.gif') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/iw_plus.gif', os.path.join(RUN_FROM_DIR, 'data', 'iw_plus.gif') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/iw_fullscreen.gif', os.path.join(RUN_FROM_DIR, 'data', 'iw_fullscreen.gif') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/iw_minus.gif', os.path.join(RUN_FROM_DIR, 'data', 'iw_minus.gif') )
    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/transparent.gif', os.path.join(RUN_FROM_DIR, 'data', 'transparent.gif') )
#    download_if_dne( 'http://www.google.com/intl/en_us/mapfiles/', os.path.join(RUN_FROM_DIR, 'data', '') )
    
            
    # some post transformations, then write to disk
    html = soup.prettify()
    html = html.replace(mapfiles, 'data')
    html = html.replace('http://www.google.com/intl/en_us/mapfiles', 'data')
    html = html + '<style>div.contextmenu {display:none;}</style>'
    file = open( os.path.join(RUN_FROM_DIR, 'ogmap.html'), 'w')
    file.write( html )
    file.close()
    #print html

    # get map data
    for zl in range(17,-3,-1):
        get_map_data( get_tile_coords( lat-span_lat, lng-span_lng, zl ), get_tile_coords( lat+span_lat, lng+span_lng, zl ), zl, max_span )
    
    print '\nyour offline google map is ready at:', RUN_FROM_DIR+'ogmap.html'
Example #15
def getFile(source):
    """ Use openanything to open a file or url and return a dictionary of info about it """
    file = openanything.fetch(source)
    return file
Example #16
import openanything

useragent="Python-chy"
url="http://sports.163.com/special/00051K7F/rss_sportslq.xml"

params=openanything.fetch(url,agent=useragent)
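The other examples treat the value returned by openanything.fetch as a dictionary exposing at least 'status', 'data' and 'url' keys (see Examples #2, #3, #9 and #12 above). A minimal sketch continuing the snippet above under that assumption:

# continue the example above; the key names follow the usage in the earlier examples
if params.get('status') == 200:
    print 'fetched', params.get('url', url)
    print 'got %d bytes of feed data' % len(params.get('data', ''))
else:
    print 'fetch failed with status', params.get('status')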