def parse(self, response):
    # Create the contract
    """ This function parses a property page.

    @url http://web:3912/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """

    # Handy horizontal and vertical crawling URLs
    # Horizontal
    urls = response.xpath('//*[contains(@class,"next")]//@href').extract()
    absUrls = [urlparse.urljoin(response.url, i) for i in urls]
    # Vertical
    urls = response.xpath('//*[@itemprop="url"]/@href').extract()
    absUrls = [urlparse.urljoin(response.url, i) for i in urls]

    # Use the spider's built-in log() method to print everything listed
    # in the primary fields table
    self.log("title: %s" % response.xpath(
        '//*[@itemprop="name"][1]/text()').extract())
    self.log("price: %s" % response.xpath(
        '//*[@itemprop="price"][1]/text()').re('[.0-9]+'))
    self.log("description: %s" % response.xpath(
        '//*[@itemprop="description"][1]/text()').extract())
    self.log("address: %s" % response.xpath(
        '//*[@itemprop="http://schema.org/Place"][1]/text()').extract())
    self.log("image_urls: %s" % response.xpath(
        '//*[@itemprop="image"][1]/@src').extract())

    # # Populate an Item directly
    # item = PropertiesItem()
    # item['title'] = response.xpath('//*[@itemprop="name"][1]/text()').extract()
    # item['price'] = response.xpath('//*[@itemprop="price"][1]/text()').re('[.0-9]+')
    # item['description'] = response.xpath('//*[@itemprop="description"][1]/text()').extract()
    # item['address'] = response.xpath('//*[@itemprop="http://schema.org/Place"][1]/text()').extract()
    # item['image_urls'] = response.xpath('//*[@itemprop="image"][1]/@src').extract()
    # return item

    # Cleaner: an ItemLoader plus the housekeeping fields
    L = ItemLoader(item=PropertiesItem(), response=response)
    L.add_xpath('title', '//*[@itemprop="name"][1]/text()')
    L.add_xpath('price', '//*[@itemprop="price"][1]/text()', re='[.0-9]+')
    L.add_xpath('description', '//*[@itemprop="description"][1]/text()')
    L.add_xpath('address', '//*[@itemprop="http://schema.org/Place"][1]/text()')
    L.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src')

    # Processors post-process the XPath/CSS results.
    # Using a few of them in the spider shapes the output the way we want.
    L.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                MapCompose(unicode.strip, unicode.title))
    L.add_xpath('price', '//*[@itemprop="price"][1]/text()',
                MapCompose(lambda i: i.replace(',', ''), float), re='[.0-9]+')
    L.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                MapCompose(unicode.strip), Join())
    L.add_xpath('address', '//*[@itemprop="http://schema.org/Place"][1]/text()',
                MapCompose(unicode.strip))
    L.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                MapCompose(lambda i: urlparse.urljoin(response.url, i)))

    # Use add_value() to set the housekeeping fields
    L.add_value('url', response.url)
    L.add_value('project', self.settings.get('BOT_NAME'))
    L.add_value('spider', self.name)
    L.add_value('server', socket.gethostname())
    L.add_value('date', datetime.datetime.now())

    return L.load_item()
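# A minimal, standalone sketch of what the processors above do, assuming a
# Scrapy 1.x / Python 2 environment; the sample strings are illustrative only.
from scrapy.loader.processors import MapCompose, Join

mc = MapCompose(unicode.strip, unicode.title)
print(mc([u'  set unique family well  ']))  # [u'Set Unique Family Well']
print(Join()([u'plot 15', u'barbican']))    # u'plot 15 barbican'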
def get_repo_url(mead_tag, nexus_base_url, prefix="hudson-", suffix=""):
    """
    Creates repository Nexus group URL composed of:
        <nexus_base_url>/content/groups/<prefix><mead_tag><suffix>

    :param mead_tag: name of the MEAD tag used to create the proxy URL in settings.xml
    :param nexus_base_url: the base URL of a Nexus instance
    :param prefix: Nexus group name prefix, default is "hudson-"
    :param suffix: Nexus group name suffix, e.g. "-jboss-central" or "-reverse"
    :returns: the composed Nexus group URL, ending with a slash
    """
    result = urlparse.urljoin(nexus_base_url, "content/groups/")
    result = urlparse.urljoin(result, "%s%s%s/" % (prefix, mead_tag, suffix))
    return result
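# A hedged usage sketch of get_repo_url(); the Nexus base URL and tag below are
# made-up values for illustration only. Note that when the base URL carries a
# path, it should end with "/" so urljoin appends instead of replacing the last
# path segment.
print(get_repo_url("my-tag", "http://nexus.example.com/"))
# -> http://nexus.example.com/content/groups/hudson-my-tag/
print(get_repo_url("my-tag", "http://nexus.example.com/", suffix="-jboss-central"))
# -> http://nexus.example.com/content/groups/hudson-my-tag-jboss-central/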
def _get_directory(self, url, pattern, url_group=1, value_group=2, value_fn=None):
    response = self.request(url)

    if value_group is None:
        if value_fn is None:
            value_fn = lambda m, u: m.group(0)
    else:
        if value_fn is None:
            #value_fn = lambda v: v
            value_fn = int

    count = 0
    for match in pattern.finditer(response):
        count += 1
        match_url = urlparse.urljoin(url, match.group(url_group))
        if value_group is None:
            yield (match_url, value_fn(match, match_url))
        else:
            yield (match_url, value_fn(match.group(value_group)))
def __init__(self, name):
    """
    @param name: URL to be opened
    @keyword additional_headers: additional HTTP request headers to be added to the call
    """
    try:
        # Note the removal of the fragment ID. This is necessary, per the HTTP spec
        req = Request(url=name.split('#')[0])
        req.add_header('Accept', 'text/html, application/xhtml+xml')

        self.data = urlopen(req)
        self.headers = self.data.info()

        if URIOpener.CONTENT_LOCATION in self.headers:
            self.location = urlparse.urljoin(
                self.data.geturl(), self.headers[URIOpener.CONTENT_LOCATION])
        else:
            self.location = name
    except urllib_HTTPError:
        e = sys.exc_info()[1]
        from pyMicrodata import HTTPError
        msg = BaseHTTPRequestHandler.responses[e.code]
        raise HTTPError('%s' % msg[1], e.code)
    except Exception:
        e = sys.exc_info()[1]
        from pyMicrodata import MicrodataError
        raise MicrodataError('%s' % e)
def download_file(self, file_path, dest_path='.'):
    """
    Download a file from file_path to dest_path

    :param file_path: Path to the resource to download
    :type file_path: String
    :param dest_path: Path to where the downloaded file should be saved
    :type dest_path: String
    """
    resource_path = urlparse.urljoin(self.connection.path, file_path.strip('/'))
    resp, content = self.connection.send_get(resource_path)
    file_name = os.path.basename(file_path)
    write_to_path = os.path.join(dest_path, file_name)
    try:
        # A with-block closes the file even on error and avoids referencing an
        # unbound file handle when open() itself fails.
        with open(write_to_path, 'wb') as file_fd:
            file_fd.write(content)
    except IOError:
        raise
    return resp, content
def parse_editions_url(self, response):
    name = response.css(
        'div.mainContentFloat h1 a::text').extract_first().strip()
    editions = response.meta['editions']
    all_books_data = {}
    if name not in editions['name']:
        editions['name'] = name
        editions['urls'] = []
    for book_page in response.css('a.bookTitle').xpath('@href'):
        editions['urls'].append(
            urlparse.urljoin(response.url, book_page.extract()))
    next_page = response.xpath("//*[@rel='next']/@href").extract_first()
    if next_page is not None and len(editions['urls']) <= 200:
        yield response.follow(next_page,
                              callback=self.parse_editions_url,
                              meta={'editions': editions})
    else:
        print(editions['name'])
        print(len(editions['urls']))
        for link in editions['urls']:
            request = scrapy.Request(link,
                                     callback=self.parse_editions_data,
                                     meta={'all_books_data': all_books_data,
                                           'name': editions['name']})
            yield request
        #yield editions
        editions = {}
def download_file(request, file_id):
    """Lets the user download the file through the API"""
    gi = request.galaxy
    data = gi.datasets.show_dataset(dataset_id=file_id)
    name = "error"
    if isinstance(data, dict):
        dlurl = data.get('download_url')
        name = data.get('name')
        name = name.replace(" ", "_")
        name = name + "." + data.get('file_ext')
        if not name:
            name = "download"
        if dlurl:
            url = urlparse.urljoin(gi.base_url, dlurl)
            response = urllib.urlopen(url)
            stream_response = StreamingHttpResponse(response.read())
            stream_response['Content-Disposition'] = 'attachment; filename=' + name
        else:
            stream_response = StreamingHttpResponse(
                "No file download URL corresponds to the given dataset id " + file_id)
    else:
        stream_response = StreamingHttpResponse(data)
    return stream_response
def model_view(self, request, model_databrowse, url):
    # If the object ID wasn't provided, redirect to the model page,
    # which is one level up.
    if url is None:
        return http.HttpResponseRedirect(
            urlparse.urljoin(request.path, '../')
        )
    easy_model = EasyModel(
        model_databrowse.site, model_databrowse.model
    )
    try:
        obj = easy_model.object_by_pk(url)
    except ObjectDoesNotExist:
        raise http.Http404('Id not found')
    except ValueError:
        raise http.Http404('Invalid format key provided')
    return render(
        request,
        'databrowse/object_detail.html',
        {
            'object': obj,
            'root_url': model_databrowse.site.root_url
        }
    )
def install_project(info, basefolder, serverurl, updateMode=False):
    if not serverurl:
        roam.utils.warning("No server url set for update")
        raise ValueError("No server url given")

    roam.utils.info("Downloading project zip")
    if updateMode:
        filename = "{}.zip".format(info['name'])
    else:
        filename = "{}-Install.zip".format(info['name'])

    serverurl = add_slash(serverurl)
    url = urlparse.urljoin(serverurl, "projects/{}".format(filename))

    tempfolder = os.path.join(basefolder, "_updates")
    if not os.path.exists(tempfolder):
        os.mkdir(tempfolder)

    zippath = os.path.join(tempfolder, filename)

    for status in download_file(url, zippath):
        yield status

    yield "Installing"
    with zipfile.ZipFile(zippath, "r") as z:
        z.extractall(basefolder)

    project = roam.project.Project.from_folder(os.path.join(basefolder, info['name']))
    os.chdir(project.folder)

    yield "Running update scripts.."
    run_install_script(project.settings, "after_update")
    project.projectUpdated.emit(project)
class CAIssuersParser:
    '''Parses list of CA's from Mozilla, Chrome, Opera, iOS.'''
    # https://en.wikipedia.org/wiki/Certificate_authority#Providers
    CA_LISTS = {
        'mozilla': {
            'list': urlparse.urljoin(
                'https://hg.mozilla.org/releases/mozilla-beta/raw-file/',
                'tip/security/nss/lib/ckfw/builtins/certdata.txt',
            ),
            'pattern': '# Issuer ',
        }
    }
    ISSUERS = []

    # TODO: parse the other lists and store the CA's into a file
    def parse_issuers(self):
        resp = requests.get(self.CA_LISTS['mozilla']['list'])
        raw_list = resp.text
        pattern = self.CA_LISTS['mozilla']['pattern']
        for line in raw_list.split('\n'):
            if line.startswith(pattern):
                # Slice off the prefix; str.lstrip() strips any of the prefix's
                # characters, not the prefix itself.
                issuer = line[len(pattern):]
                if issuer not in self.ISSUERS:
                    print(issuer)
                    self.ISSUERS.append(issuer)
def fetch_data(self, rootfolder, filename, serverurl):
    """
    Download the update zip file for the project from the server
    """
    serverurl = add_slash(serverurl)

    tempfolder = os.path.join(rootfolder, "_updates")
    if not os.path.exists(tempfolder):
        os.mkdir(tempfolder)

    filename = "{}.zip".format(filename)
    url = urlparse.urljoin(serverurl, "projects/{}".format(filename))
    zippath = os.path.join(tempfolder, filename)

    if not self.check_url_found(url):
        yield "Skipping data download"
        yield "Done"
        return

    roam.utils.info("Downloading data zip from {}".format(url))
    try:
        for status in download_file(url, zippath):
            yield status
    except UpdateExpection as ex:
        roam.utils.exception("Error in update for project")
        yield "Error in downloading data"
        return

    yield "Extracting data.."
    with zipfile.ZipFile(zippath, "r") as z:
        members = z.infolist()
        for i, member in enumerate(members):
            z.extract(member, rootfolder)
            roam.utils.debug("Extracting: {}".format(member.filename))

    yield "Done"
def install_artifacts(artifacts, dirstruct, installdir, basestaticurl):
    """
    Install the artifacts.
    """
    assert basestaticurl.endswith("/"), "Basestaticurl should end with /"
    installed = []
    for reldir, artifactnames in list(dirstruct.items()):
        destdir = os.path.join(installdir, reldir)
        if not os.path.exists(destdir):
            log.warn("Making install directory %s" % destdir)
            os.makedirs(destdir)
        else:
            assert os.path.isdir(destdir)
        for artifactname in artifactnames:
            destpath = os.path.abspath(os.path.join(destdir, artifactname))
            if artifactname in list(artifacts.keys()):
                # The artifact must be loaded from jenkins
                theartifact = artifacts[artifactname]
            else:
                # It's probably a static file, we can get it from the static collection
                staticurl = urlparse.urljoin(basestaticurl, artifactname)
                theartifact = Artifact(artifactname, staticurl)
            theartifact.save(destpath)
            installed.append(destpath)
    return installed
def export_to_itol(request, file_id):
    # retrieve newick from galaxy server
    gi = request.galaxy
    data = gi.datasets.show_dataset(dataset_id=file_id)
    if isinstance(data, dict):
        dlurl = data.get('download_url')
        if dlurl:
            url = urlparse.urljoin(gi.base_url, dlurl)
            response = urllib.urlopen(url)
            tmpfile = tempfile.NamedTemporaryFile()
            tmpfile.write(response.read())
            tmpfile.flush()
            # send file to itol server
            url_itol = 'https://itol.embl.de/upload.cgi'
            payload = {
                'tname': "",
                'tfile': open(tmpfile.name, 'rb'),
            }
            r = requests.post(url_itol, files=payload)
            return redirect(r.url)
    return render(request, 'error.html', {
        'errortitle': 'Error querying galaxy',
        'errormessage': data,
    })
def parser(self, html):
    data = []
    if not html:
        return data
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.find_all("a", class_="a_title2"):
        origin = urlparse.urljoin(self.root_url, tag['href'])
        sub_html = self.downloader(origin)
        sub_soup = BeautifulSoup(sub_html, 'lxml')
        detail = sub_soup.find('div', class_='detail_xq w770')
        title = detail.find('h2').get_text().strip()
        lis = detail.find_all('li')
        cnnvd_id = lis[0].get_text().strip()[8:]
        cve_id = lis[2].find_all('a')[-1].get_text().strip()
        description = sub_soup.find('div', class_='d_ldjj').get_text().strip()
        keyword = self.get_keyword(description)
        created = lis[6].find_all('a')[-1].get_text().strip()
        data.append({
            "title": title,
            "cnnvd_id": cnnvd_id,
            "cve_id": cve_id,
            "description": description,
            "keyword": keyword,
            "created": created,
            "origin": origin,
        })
    return data
def download_pom(repo_url=None, artifact=None, pom_url=None, target_dir=None):
    """
    Downloads a pom file with given GAV (as array) or from given pom_url and
    saves it as pom.xml into target_dir.

    :param repo_url: repository URL from which the pom should be downloaded,
                     mandatory only if no pom_url provided
    :param artifact: MavenArtifact instance, mandatory only if no pom_url provided
    :param pom_url: URL of the pom to download, not mandatory
    :param target_dir: target directory path, where the pom should be saved, not mandatory
    :returns: path to the saved pom, useful if no target_dir provided
    """
    if not pom_url:
        pom_url = urlparse.urljoin(
            repo_url, "%s/" % string.replace(artifact.groupId, ".", "/"))
        pom_url = urlparse.urljoin(pom_url, "%s/" % artifact.artifactId)
        pom_url = urlparse.urljoin(pom_url, "%s/" % artifact.version)
        pom_url = urlparse.urljoin(
            pom_url, "%s-%s.pom" % (artifact.artifactId, artifact.version))

    handler = None
    try:
        handler = urlopen(pom_url)
    except HTTPError as err:
        logging.error("Failed to download POM %s. %s", pom_url, err)
        return None

    if not target_dir:
        num = 1
        while not target_dir or os.path.exists(target_dir):
            target_dir = "/tmp/maven-temp-path-%s" % num
            num += 1
    pom_path = os.path.join(target_dir, "pom.xml")

    if handler.getcode() == 200:
        pom = handler.read()
        handler.close()

        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        pom_file = None
        try:
            pom_file = open(pom_path, "w")
            pom_file.write(pom)
        finally:
            if pom_file:
                pom_file.close()

        return pom_path
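# A hedged, standalone sketch of the pom URL composition above, using a made-up
# repository URL and GAV; each intermediate piece ends in "/" so urljoin appends
# instead of replacing the last path segment.
import urlparse

repo_url = "http://repo.example.com/maven2/"
group_id, artifact_id, version = "org.foo.bar", "baz", "1.0"

url = urlparse.urljoin(repo_url, "%s/" % group_id.replace(".", "/"))
url = urlparse.urljoin(url, "%s/" % artifact_id)
url = urlparse.urljoin(url, "%s/" % version)
url = urlparse.urljoin(url, "%s-%s.pom" % (artifact_id, version))
print(url)  # http://repo.example.com/maven2/org/foo/bar/baz/1.0/baz-1.0.pom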
def abs_url(url, response):
    """Return absolute link"""
    base = response.xpath('//head/base/@href').extract()
    if base:
        base = base[0]
    else:
        base = response.url
    return urlparse.urljoin(base, url)
def _get_new_urls(self, page_url, soup):
    new_urls = set()  # collect results in a set
    links = soup.find_all('a', class_="e")  # find the matching nodes
    for link in links:
        new_url = link['href']
        new_full_url = urlparse.urljoin(page_url, new_url)  # join into an absolute URL
        new_urls.add(new_full_url)  # add it
    return new_urls  # return the set
def _get_new_urls(self, page_url, soup):
    new_urls = set()
    links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
    for link in links:
        new_url = link['href']
        new_full_url = urlparse.urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls
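# A hedged, standalone sketch of the join performed above, using a made-up page
# URL and hrefs (requires BeautifulSoup and Python 2's urlparse):
import re
import urlparse
from bs4 import BeautifulSoup

html = '<a href="/view/123.htm">a</a><a href="/view/456.htm">b</a>'
soup = BeautifulSoup(html, 'html.parser')
page_url = 'http://baike.example.com/view/1.htm'
for link in soup.find_all('a', href=re.compile(r"/view/\d+\.htm")):
    print(urlparse.urljoin(page_url, link['href']))
# -> http://baike.example.com/view/123.htm
# -> http://baike.example.com/view/456.htm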
def get_city_name(url_city):
    '''Get the list of all current cities.'''
    base_url = "http://www.tianqihoubao.com"
    city_url = urlparse.urljoin(base_url, url_city)
    # print city_url
    html = requests.get(city_url).content
    html_tree = etree.HTML(html)
    links = html_tree.xpath("//td//a/@href")
    return map(get_name, links)
def _get_new_urls(page_url, links):
    new_urls = set()
    for link in links:
        new_url = link
        new_full_url = urlparse.urljoin(page_url, new_url)
        parsed = urlparse.urlparse(new_full_url)
        if parsed.path.endswith(".action") or parsed.path.endswith(".do"):
            new_urls.add(new_full_url)
    return new_urls
def parse_link_rel(url, fn):
    """
    Read through html file ``fn`` downloaded from ``url``, looking for a link
    tag of the form:

        <link rel="alternate" type="application/sage" title="currently ignored"
              href=".../example.sws" />

    This function reads ``fn`` looking for such tags and returns a list of
    dictionaries of the form

        {'title': from title field in link, 'url': absolute URL to .sws file}

    for the corresponding ``.sws`` files. Naturally if there are no appropriate
    link tags found, the returned list is empty. If the HTML parser raises an
    HTMLParseError, we simply return an empty list.
    """
    from HTMLParser import HTMLParser

    class GetLinkRelWorksheets(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.worksheets = []

        def handle_starttag(self, tag, attrs):
            if (tag == 'link' and ('rel', 'alternate') in attrs and
                    ('type', 'application/sage') in attrs):
                self.worksheets.append({
                    'title': [_ for _ in attrs if _[0] == 'title'][0][1],
                    'url': [_ for _ in attrs if _[0] == 'href'][0][1]
                })

    parser = GetLinkRelWorksheets()
    with open(fn) as f:
        try:
            parser.feed(f.read())
        except HTMLParseError:
            return []

    ret = []
    for d in parser.worksheets:
        sws = d['url']
        # is that link a relative URL?
        if not urlparse.urlparse(sws).netloc:
            # unquote-then-quote to avoid turning %20 into %2520, etc
            ret.append({
                'url': urlparse.urljoin(url, urllib.quote(urllib.unquote(sws))),
                'title': d['title']
            })
        else:
            ret.append({'url': sws, 'title': d['title']})
    return ret
def _get_new_urls(self, page_url, soup):
    new_urls = set()
    # collect all matching anchor tags
    links = soup.find_all('a', href=re.compile(r'/pg/\d+\.htm'))
    for link in links:
        new_url = link['href']
        # build the full, absolute URL
        new_full_url = urlparse.urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls
def __query_implementation(self, identifier_name, value):
    action_name = 'Sale/Query/{0}={1}'.format(identifier_name, value)
    request_header = {
        "MerchantKey": str(self.merchant_key),
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    return requests.get(urlparse.urljoin(self.host_uri, action_name),
                        headers=request_header)
def _get_new_urls(self, page_url, links):
    # collect the newly crawled URLs
    new_urls = set()
    for link in links:
        new_url = link
        new_full_url = urlparse.urljoin(page_url, new_url)
        new_full_url = self.check_url(new_full_url)
        if self._judge(new_full_url):
            new_urls.add(new_full_url)
    return new_urls
def retry_with_request(self, retry_sale_request):
    request_header = {
        "MerchantKey": str(self.merchant_key),
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    action_name = self.resource_name + '/Retry'
    return requests.post(urlparse.urljoin(self.host_uri, action_name),
                         data=json.dumps(retry_sale_request, cls=uuid_serialize),
                         headers=request_header)
def _init_request(path, params, headers, creds):
    credentials = creds or Config()
    hdrs = {
        'Accept': 'application/vnd.pagerduty+json;version=2',
        'Authorization': f'Token token={credentials["pagerduty"].api_key}'
    }
    if headers:
        hdrs.update(headers)
    params = params or {}
    url = urlparse.urljoin(PAGERDUTY_API_URL, path)
    return url, params, hdrs
def getDownloadList(url='http://www.yahoo.co.jp'):
    dom = pq(url)
    result = set()
    for img in dom('img').items():
        img_url = img.attr['src']
        if img_url.startswith('http'):
            result.add(img_url)
        else:
            result.add(urlparse.urljoin(url, img_url))
    return result
def _load_from_uri(uri, timeout=None, headers={}, cookies=None):
    response = requests.get(uri, cookies=cookies, timeout=timeout, headers=headers)
    content = response.content.strip()
    parsed_url = urlparse.urlparse(uri)
    prefix = parsed_url.scheme + '://' + parsed_url.netloc
    base_path = os.path.normpath(parsed_url.path + '/..')
    base_uri = urlparse.urljoin(prefix, base_path)
    return M3U8(content, base_uri=base_uri), response.cookies
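# A hedged, standalone sketch of how the base URI is derived above, with a
# made-up playlist URL (Python 2 urlparse and os.path only):
import os
import urlparse

uri = 'http://cdn.example.com/streams/live/index.m3u8'
parsed_url = urlparse.urlparse(uri)
prefix = parsed_url.scheme + '://' + parsed_url.netloc   # http://cdn.example.com
base_path = os.path.normpath(parsed_url.path + '/..')    # /streams/live
print(urlparse.urljoin(prefix, base_path))               # http://cdn.example.com/streams/live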
def get_arguments():
    global base_url
    global team_name
    global access_token

    parser = OptionParser(usage='''usage: %prog [options]

Marks all the channels of all users of given team as read. Useful after using
the Mattermost bulk import, as otherwise all users have tons of unread messages
and the Mattermost client has a hard time loading.''')
    parser.add_option(
        "-b", "--base-url",
        dest="base_url", action="store", type="string",
        help="Base URL of Mattermost installation (mandatory), "
             "e.g. 'https://mattermost.mycompany.ch/'")
    parser.add_option(
        "-t", "--team",
        dest="team", action="store", type="string",
        help="Team name of which channels should be marked as read (mandatory)")
    parser.add_option(
        "-a", "--access-token",
        dest="token", action="store", type="string",
        help="A valid Mattermost API access token (optional, can be entered interactively)")

    (options, args) = parser.parse_args()

    if options.base_url is None:
        parser.print_help()
        parser.error("Base URL parameter is mandatory")
    if options.team is None:
        parser.error("Team parameter is mandatory")
    if options.token is None:
        access_token = getpass.getpass('Mattermost API access token:')
    else:
        access_token = options.token

    base_url = urlparse.urljoin(options.base_url, '/api/v4')
    team_name = options.team
    print("team_name", team_name)
    print("base_url = ", base_url)
    print("options.base_url", options.base_url)
def _get_months(self, year, type="snapshots"):
    url = urlparse.urljoin(self.base_url, "%s/%4d/" % (type, year))
    return self._get_directory(
        url,
        self.YEAR_RE,
        value_fn=lambda v: datetime.date(year, int(v), 1),
    )
def get_keep(request, article_id):
    logged_user = request.user
    article = Article.objects.get(id=article_id)
    articles = logged_user.article_set.all()
    if article not in articles:
        article.user.add(logged_user)  # for m2m linking, have tested by shell
        article.keep_num += 1
        article.save()
        return redirect('/focus/')
    else:
        url = urlparse.urljoin('/focus/', article_id)
        return redirect(url)
def redirect(request, prefix, tiny, converter=default_converter):
    """
    Redirect to a given object from a short URL.
    """
    # Resolve the prefix and encoded ID into a model object and decoded ID.
    # Many things here could go wrong -- bad prefix, bad value in
    # SHORTEN_MODELS, no such model, bad encoding -- so just return a 404 if
    # any of that stuff goes wrong.
    try:
        app_label, model_name = settings.SHORTEN_MODELS[prefix].split('.')
    except KeyError:
        raise Http404('Bad prefix.')
    try:
        model = models.get_model(app_label, model_name)
    except LookupError:
        model = False
    if not model:
        raise Http404('Bad model specified in SHORTEN_MODELS.')
    try:
        id = converter.to_decimal(tiny)
    except ValueError:
        raise Http404('Bad encoded ID.')

    # Try to look up the object. If it's not a valid object, or if it doesn't
    # have an absolute url, bail again.
    obj = get_object_or_404(model, pk=id)
    try:
        url = obj.get_absolute_url()
    except AttributeError:
        raise Http404("'%s' models don't have a get_absolute_url() method." %
                      model.__name__)

    # We might have to translate the URL -- the badly-named get_absolute_url
    # actually returns a domain-relative URL -- into a fully qualified one.

    # If we got a fully-qualified URL, sweet.
    if urlparse.urlsplit(url)[0]:
        return HttpResponsePermanentRedirect(url)

    # Otherwise, we need to make a full URL by prepending a base URL.
    # First, look for an explicit setting.
    if hasattr(settings, 'SHORTEN_FULL_BASE_URL') and settings.SHORTEN_FULL_BASE_URL:
        base = settings.SHORTEN_FULL_BASE_URL

    # Next, if the sites app is enabled, redirect to the current site.
    elif Site._meta.installed:
        base = 'http://%s/' % Site.objects.get_current().domain

    # Finally, fall back on the current request.
    else:
        base = 'http://%s/' % RequestSite(request).domain

    return HttpResponsePermanentRedirect(urlparse.urljoin(base, url))
def tree_visualization(request, file_id):
    gi = request.galaxy
    data = gi.datasets.show_dataset(dataset_id=file_id)
    if isinstance(data, dict):
        dlurl = data.get('download_url')
        historyid = data.get('history_id')
        if dlurl and historyid:
            url = urlparse.urljoin(gi.base_url, dlurl)
            response = urllib.urlopen(url)
            return render(request,
                          template_name='treeviz/tree.html',
                          context={'newick_tree': response.read(),
                                   'history_id': historyid})
    return render(request, 'error.html', {'errortitle': 'Error querying galaxy',
                                          'errormessage': data})
def second_pass(self, response):
    base_url = 'https://www.cityoflaurel.org'
    links_with_dupes = response.css('div#site-main')[0].css(
        'section#site-content')[0].css('a').xpath('@href').extract()
    links = list(set(links_with_dupes))
    for link in links:
        print("LINK: " + link + '\n')
        if link.endswith('.pdf'):
            link = urlparse.urljoin(base_url, link)
            yield Request(url=link, callback=self.save_pdf)
        if "http" not in str(link):
            yield Request(url=base_url + link + '/', callback=self.third_pass)
        elif "cityoflaurel" in link:
            yield Request(url=link, callback=self.third_pass)
        else:
            yield {"link": link}
def cleanup_url(self, value_url, source_url, mark):
    """
    Transform relative URLs into absolute URLs if possible.

    If the value_url is already absolute, or we don't know the source_url,
    then return the existing value. If the value_url is relative, and we know
    the source_url, then try to rewrite it.
    """
    value = urlparse.urlparse(value_url)
    if value.netloc or not source_url:
        url = value_url
    else:
        url = urlparse.urljoin(source_url, value_url)
    if url.startswith('//'):
        url = 'http:' + url  # MissingSchema fix
    if mark:
        url = url + mark
    return url
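# A hedged, standalone sketch of the calls cleanup_url() relies on, with
# made-up URLs (Python 2 urlparse):
import urlparse

# relative value + known source -> rewritten against the source URL
print(urlparse.urljoin('http://example.com/page.html', '/img/logo.png'))
# -> http://example.com/img/logo.png

# an absolute value has its own netloc, so it is returned unchanged
print(urlparse.urlparse('https://cdn.example.com/a.js').netloc)  # 'cdn.example.com'

# a protocol-relative value gets an explicit scheme (the "MissingSchema fix")
print('http:' + '//cdn.example.com/a.js')  # http://cdn.example.com/a.js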
def download_scan(self, ignore_files=False):
    """
    Download the scans.
    TODO: handle HTTP authentication
    Replace subprocess with the curl library directly
    """
    for scan in self.list_pages_by_chapters():
        print(">> Downloading chapter {}".format(scan[0]))
        chapter_dir = "%s_%s" % (DEFAULT_SCAN_CHAPTER_DIRNAME, scan[0])
        # Create the destination directory
        chapter_path = self.create_dir(
            os.path.join(self.scan_path, self.scan_name, chapter_dir))
        for p in scan[1]:
            img_found = False
            page = "0%s" % p if p < 10 else p
            url = urlparse.urljoin(
                DEFAULT_SCAN_URL,
                "%s/%s/%s" % (self.scan_name, scan[0], page))
            # Look for a valid URL by trying each image extension
            for ext in DEFAULT_IMG_EXT:
                dl_file = self.download_file(
                    "%s.%s" % (url, ext), chapter_path, ignore_files)
                if dl_file:
                    print(">> Downloading page {}".format(page))
                    img_found = True
                    break
            """TODO: log to a file
            If no image was found at all, skip this download
            """
            if not img_found:
                print("page {} was not found".format(page))
                continue
    return
def list_scan_chapters(self):
    """Parse the scan's page to find out how many chapters are available.

    TODO: handle bonus chapters as well

    Returns:
        List of chapter numbers found
    """
    chapters = []
    url = urlparse.urljoin(DEFAULT_CHAPTER_URL, "%s/" % self.scan_name)
    if self.test_url(url):
        html = str(urlopen(url).read())
        tabs = re.findall(
            r'(<td class="td">)([A-Za-z0-9\-\ \:]+)(chapitre)\ ([0-9]+)',
            html)
        for t in tabs:
            chapters.append(t[3])
            print("chapter {} found".format(t[3]))
    return chapters
def render(self, context):
    try:
        obj = self.obj.resolve(context)
    except template.VariableDoesNotExist:
        return ''
    try:
        prefix = self.get_prefix(obj)
    except (AttributeError, KeyError):
        return ''
    tinyid = converter.from_decimal(obj.pk)
    if hasattr(settings, 'SHORT_BASE_URL') and settings.SHORT_BASE_URL:
        return urlparse.urljoin(settings.SHORT_BASE_URL, prefix + tinyid)
    try:
        return urlresolvers.reverse('shorturls.views.redirect', kwargs={
            'prefix': prefix,
            'tiny': tinyid,
        })
    except urlresolvers.NoReverseMatch:
        return ''
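# A hedged sketch of the SHORT_BASE_URL branch above, with made-up values;
# the base URL needs a trailing slash so urljoin appends the short code.
import urlparse

SHORT_BASE_URL = 'http://exm.pl/'   # hypothetical setting value
prefix, tinyid = 'B', '3f7'         # hypothetical prefix and encoded pk
print(urlparse.urljoin(SHORT_BASE_URL, prefix + tinyid))  # http://exm.pl/B3f7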
def url(self, path):
    return urlparse.urljoin(settings.TEST_DOMAIN, path)