def preprocess_link(self, referrer, url):
    """Normalize a link found on *referrer* and filter out duplicates.

    The link is made absolute, stripped of its fragment and trailing
    slash, and rejected when it is empty, not http(s), or already present
    in ``self.url_set`` in either its http or https form.

    Returns the normalized URL string, or ``None`` when filtered out.
    """
    if not url:
        return None
    parts = urlsplit(urljoin(referrer, url))._asdict()  # absolute URL, split
    parts['path'] = re.sub(r'/$', '', parts['path'])    # drop trailing "/"
    parts['fragment'] = ''                              # in-page anchors are not pages
    split = SplitResult(**parts)
    if split.scheme not in ('http', 'https'):
        # Only web URLs are crawlable.
        return None
    candidate = split.geturl()
    if split.scheme == 'http':
        http_form = candidate
        https_form = candidate.replace('http:', 'https:', 1)
    else:
        https_form = candidate
        http_form = candidate.replace('https:', 'http:', 1)
    # Treat the http and https spellings of the same page as one URL.
    if http_form in self.url_set or https_form in self.url_set:
        return None
    return candidate
def build_url(url):
    """Assemble a URL string from a ``UrlParser``-keyed dictionary.

    Requires a non-empty scheme; the netloc is either taken verbatim
    (when it already embeds the username) or rebuilt from hostname, port
    and credentials. The path is the filename, optionally prefixed by a
    directory path with duplicate slashes collapsed.

    Raises Exception when *url* is empty or has no scheme.
    """
    url_result = {UrlParser.QUERY: "", UrlParser.FRAGMENT: ""}
    if not url or UrlParser.SCHEME not in url or not url[UrlParser.SCHEME]:
        raise Exception("UrlParser:build_url", "Url dictionary is empty or missing key values")
    url_result[UrlParser.SCHEME] = url[UrlParser.SCHEME]
    if UrlParser.NETLOC in url and url[UrlParser.NETLOC]:
        # Reuse the provided netloc verbatim only when it already carries
        # the username (i.e. credentials are embedded).
        if (
            UrlParser.USERNAME in url
            and url[UrlParser.USERNAME]
            and url[UrlParser.USERNAME] in url[UrlParser.NETLOC]
        ):
            url_result[UrlParser.NETLOC] = url[UrlParser.NETLOC]
    if UrlParser.NETLOC not in url_result:
        url_result[UrlParser.NETLOC] = url[UrlParser.HOSTNAME]
        if UrlParser.PORT in url and url[UrlParser.PORT]:
            # BUG FIX: the port must be separated from the host by ":";
            # previously it was concatenated directly (e.g. "host80").
            url_result[UrlParser.NETLOC] += ":" + str(url[UrlParser.PORT])
        if UrlParser.USERNAME in url and url[UrlParser.USERNAME]:
            credentials = "{}@".format(url[UrlParser.USERNAME])
            if UrlParser.PASSWORD in url and url[UrlParser.PASSWORD]:
                credentials = "{}:{}@".format(url[UrlParser.USERNAME], url[UrlParser.PASSWORD])
            url_result[UrlParser.NETLOC] = credentials + url_result[UrlParser.NETLOC]
    url_result[UrlParser.PATH] = url[UrlParser.FILENAME]
    if UrlParser.PATH in url and url[UrlParser.PATH]:
        url_result[UrlParser.PATH] = url[UrlParser.PATH] + "/" + url_result[UrlParser.PATH]
        url_result[UrlParser.PATH] = re.sub("//+", "/", url_result[UrlParser.PATH])
    if UrlParser.QUERY in url and url[UrlParser.QUERY]:
        url_result[UrlParser.QUERY] = url[UrlParser.QUERY]
    result = SplitResult(**url_result)
    return result.geturl()
def build_url(url):
    """Assemble a URL string from a ``UrlParser``-keyed dictionary.

    Requires a non-empty scheme; the netloc is either taken verbatim
    (when it already embeds the username) or rebuilt from hostname, port
    and credentials. The path is the filename, optionally prefixed by a
    directory path with duplicate slashes collapsed.

    Raises Exception when *url* is empty or has no scheme.
    """
    url_result = {UrlParser.QUERY: '', UrlParser.FRAGMENT: ''}
    if not url or UrlParser.SCHEME not in url or not url[UrlParser.SCHEME]:
        raise Exception('UrlParser:build_url',
                        'Url dictionary is empty or missing key values')
    url_result[UrlParser.SCHEME] = url[UrlParser.SCHEME]
    if UrlParser.NETLOC in url and url[UrlParser.NETLOC]:
        # Reuse the provided netloc verbatim only when it already carries
        # the username (i.e. credentials are embedded).
        if UrlParser.USERNAME in url \
                and url[UrlParser.USERNAME] \
                and url[UrlParser.USERNAME] in url[UrlParser.NETLOC]:
            url_result[UrlParser.NETLOC] = url[UrlParser.NETLOC]
    if UrlParser.NETLOC not in url_result:
        url_result[UrlParser.NETLOC] = url[UrlParser.HOSTNAME]
        if UrlParser.PORT in url and url[UrlParser.PORT]:
            # BUG FIX: the port must be separated from the host by ":";
            # previously it was concatenated directly (e.g. "host80").
            url_result[UrlParser.NETLOC] += ':' + str(url[UrlParser.PORT])
        if UrlParser.USERNAME in url and url[UrlParser.USERNAME]:
            credentials = '{}@'.format(url[UrlParser.USERNAME])
            if UrlParser.PASSWORD in url and url[UrlParser.PASSWORD]:
                credentials = '{}:{}@'.format(url[UrlParser.USERNAME],
                                              url[UrlParser.PASSWORD])
            url_result[UrlParser.NETLOC] = credentials + url_result[UrlParser.NETLOC]
    url_result[UrlParser.PATH] = url[UrlParser.FILENAME]
    if UrlParser.PATH in url and url[UrlParser.PATH]:
        url_result[UrlParser.PATH] = url[UrlParser.PATH] + '/' + url_result[UrlParser.PATH]
        url_result[UrlParser.PATH] = re.sub('//+', '/', url_result[UrlParser.PATH])
    if UrlParser.QUERY in url and url[UrlParser.QUERY]:
        url_result[UrlParser.QUERY] = url[UrlParser.QUERY]
    result = SplitResult(**url_result)
    return result.geturl()
def _parseurl(self, url): ret = urlsplit(url) self.username = ret.username self.password = ret.password if ret.port <> None: n = SplitResult(ret.scheme, ret.hostname + ":" + ret.port.__str__(), ret.path, ret.query, ret.fragment) else: n = SplitResult(ret.scheme, ret.hostname, ret.path, ret.query, ret.fragment) self.url = n.geturl()
def clean_link(url):
    """Return *url* with its fragment removed and UTM tracking parameters
    stripped from the query string.

    Returns ``None`` when the URL scheme is not in ALLOWED_URL_SCHEMES.
    """
    parts = urlsplit(url)
    if parts.scheme.lower() not in ALLOWED_URL_SCHEMES:
        return None
    # Rebuild without the fragment.
    parts = SplitResult(parts.scheme, parts.netloc, parts.path, parts.query, '')
    # Repeatedly strip UTM parameters until none remain in the query.
    while parts.query and __utm_matcher.search(parts.query):
        remaining = __utm_matcher.sub('', parts.query)
        parts = SplitResult(parts.scheme, parts.netloc, parts.path, remaining, '')
    return parts.geturl()
def urlsplit(url, scheme="", allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits (e.g. netloc
    is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the cache key so that str and bytes
    # inputs (and their results) are cached separately.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ""
    i = url.find(":")
    if i > 0:
        if url[:i] == "http":  # optimize the common case
            scheme = url[:i].lower()
            url = url[i + 1 :]
            if url[:2] == "//":
                netloc, url = _splitnetloc(url, 2)
                # A "[" or "]" without its partner means a malformed IPv6 host.
                if ("[" in netloc and "]" not in netloc) or (
                    "]" in netloc and "[" not in netloc
                ):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and "#" in url:
                url, fragment = url.split("#", 1)
            if "?" in url:
                url, query = url.split("?", 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i + 1 :]
            if not rest or any(c not in "0123456789" for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest
    # Generic (non-"http") path: same netloc/fragment/query handling.
    if url[:2] == "//":
        netloc, url = _splitnetloc(url, 2)
        if ("[" in netloc and "]" not in netloc) or (
            "]" in netloc and "[" not in netloc
        ):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and "#" in url:
        url, fragment = url.split("#", 1)
    if "?" in url:
        url, query = url.split("?", 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
def with_port(url_str):
    """Return *url_str* with ``settings.PORT`` appended to its netloc.

    The port is only added when the URL has a netloc but no explicit
    port already. Port 80, or a missing PORT setting, means "no port".
    """
    try:
        port = settings.PORT
    except AttributeError:
        port = None
    if port == 80:
        # The default HTTP port never needs to be spelled out.
        port = None
    parts = urlsplit(url_str)
    if port and parts.netloc and not parts.port:
        augmented = "%s:%s" % (parts.netloc, port)
        parts = SplitResult(parts.scheme, augmented, parts.path,
                            parts.query, parts.fragment)
    return parts.geturl()
def assertApiUrlEqual(self, *args, **kwargs):
    """
    Assert that the given URLs are equal once normalized for API version.

    Each positional URL gets '/api/vX' prepended to its path when it is
    not already there, where X is the `version` keyword argument or
    settings.API_CURRENT_VERSION.

    Example usage:
        url = '/api/v1/apps/app/bastacorp/'
        self.assertApiUrlEqual(url, '/apps/app/bastacorp1/')
    """
    path_index = 2  # position of the path component in urlsplit's tuple
    version = kwargs.get('version', settings.API_CURRENT_VERSION)
    prefix = '/api/v%d' % version
    normalized = []
    for url in args:
        parts = list(urlsplit(url))
        if not parts[path_index].startswith(prefix):
            parts[path_index] = prefix + parts[path_index]
        normalized.append(SplitResult(*parts))
    eq_(*normalized)
def _urlsplit(url, scheme="", allow_fragments=True):
    """
    Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a 5-tuple: (scheme, netloc, path, query, fragment). The
    components are not broken up further (netloc stays a single string)
    and % escapes are not expanded.
    """
    if _coerce_args:
        url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    netloc = query = fragment = ""
    colon = url.find(":")
    # Only accept the prefix as a scheme when every character is legal.
    if colon > 0 and all(ch in scheme_chars for ch in url[:colon]):
        scheme, url = url[:colon].lower(), url[colon + 1:]
    if url.startswith("//"):
        netloc, url = _splitnetloc(url, 2)
        # An unmatched bracket indicates a malformed IPv6 host.
        if ("[" in netloc) != ("]" in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and "#" in url:
        url, fragment = url.split("#", 1)
    if "?" in url:
        url, query = url.split("?", 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    return _coerce_result(result) if _coerce_args else result
def _request(scheme=None, netloc=None, path=None, query=None, fragment=None): split = SplitResult(scheme=scheme, netloc=netloc, path=path, query=query, fragment=None) return urlunsplit(split)
def _urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    if _coerce_args:
        # Normalize str/bytes arguments; _coerce_result converts back.
        url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # Every character before ":" is legal in a scheme: split it off.
            scheme, url = url[:i].lower(), url[i + 1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # An unmatched "[" or "]" indicates a malformed IPv6 host.
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    return _coerce_result(v) if _coerce_args else v
def wrapped_url(api_url):
    """Map an API page URL onto the web_page_wrapper view.

    The '.json' suffix and the 'api/' / 'pages/' prefixes are removed
    from the path before reversing; query and fragment are kept.
    Falsy input is returned unchanged.
    """
    if not api_url:
        return api_url
    parsed = urlsplit(api_url)
    slug = (parsed.path
            .replace('.json', '')
            .replace('api/', '')
            .replace('pages/', '')
            .strip('/'))
    wrapper_path = reverse("web_page_wrapper", args=[slug])
    return SplitResult('', '', wrapper_path, parsed.query, parsed.fragment).geturl()
def build(self):
    """Assemble the URL from self.scheme/netloc/path and self.params.

    BUG FIX: the query string was previously built from a *set*
    comprehension, which made the parameter order nondeterministic
    between runs; a generator over the dict's items keeps the dict's
    insertion order and is reproducible.
    """
    query = "&".join("{}={}".format(k, v) for k, v in self.params.items())
    return urlunsplit(SplitResult(
        scheme=self.scheme,
        netloc=self.netloc,
        path=self.path,
        query=query,
        fragment=None))
def _populate_urlsplit_tuple(self, scheme='', netloc='', path='', query='', fragment=''): return SplitResult(scheme, netloc, path, query, fragment)
def find_next_indexes(soup):
    ''' next page for an album index or an album page

    Scans the pagination links (anchors with class "pix-navi-page"),
    takes the highest page number found, and returns one URL per page
    1..max built from the first link's href with its "p" query argument
    rewritten. Returns an empty list when there are no pagination links.
    '''
    indexes = soup.findAll('a', 'pix-navi-page')
    urls = []
    if indexes:
        # Highest page number among the digit-only pagination labels.
        max_p = max([int(tag.string) for tag in indexes if tag.string.isdigit()])
        result = urlsplit(httplib.html_unescape(indexes[0]['href']))
        #i don't want patch urllib.unquote. bug description: http://bugs.python.org/issue1712522
        #quick fix is convert to ascii.
        query_dict = parse_qs(result.query.encode('ascii'))
        for p in range(1, max_p + 1):
            # Rewrite only the page number; keep the other query args.
            query_dict['p'] = p
            result = SplitResult(result.scheme, result.netloc, result.path,
                                 urlencode(query_dict, doseq=True),
                                 result.fragment)
            urls.append(result.geturl())
    return urls
def __init__(self, url): """ """ # urlsplit will parse what it can from the provided string. raw = urlsplit(url) if not raw.path: raise ValueError("Invalid argument for MIB source: %s" % url) scheme = raw.scheme if not scheme: scheme = "file" if not raw.netloc else "http" path = raw.path if scheme == "file" and not path.startswith("/"): path = os.path.abspath("./" + path) cooked = SplitResult(scheme, raw.netloc, path, raw.query, raw.fragment) self._url = cooked.geturl() self._scheme = scheme self._path = cooked.path self._filename = os.path.split(cooked.path)[-1]
def hide_thunder(url):
    """Return *url* with character 9 of its "pcmd" query argument forced
    to "0", which disables the Thunder display in the image settings."""
    parts = urlsplit(url)
    query_args = parse_qs(parts.query)
    pcmd_chars = list(query_args["pcmd"][0])
    pcmd_chars[9] = "0"  # position 9 of pcmd controls the Thunder display
    query_args["pcmd"] = "".join(pcmd_chars)
    rebuilt = SplitResult(scheme="", netloc="", path=parts.path,
                          query=urlencode(query_args, True), fragment="")
    return urlunsplit(rebuilt)
def get_scrape_url(tracker_url, info_hash):
    """Derive a tracker "scrape" URL from its announce URL.

    *info_hash* is a hex string; it is appended percent-encoded as raw
    bytes. When the announce URL already has a query string the hash is
    appended with "&", otherwise with "?".

    BUG FIX: ``info_hash.decode('hex')`` only works on Python 2 str;
    ``bytes.fromhex`` is the portable equivalent.
    """
    if 'announce' in tracker_url:
        v = urlsplit(tracker_url)
        sr = SplitResult(v.scheme, v.netloc,
                         v.path.replace('announce', 'scrape'),
                         v.query, v.fragment)
        result = urlunsplit(sr)
    else:
        log.debug(
            '`announce` not contained in tracker url, guessing scrape address.'
        )
        result = tracker_url + '/scrape'
    result += '&' if '?' in result else '?'
    result += 'info_hash=%s' % quote(bytes.fromhex(info_hash))
    return result
def _login(self): login_url = urljoin(self.base_url, '/manage_main') # would be the propper way to login, but is not supported by geckodriver/ chromedriver yet # self.driver.switch_to.alert.authenticate(self.login, self.password) # Disabled because it works only in firefox # if self.driver == 'Firefox': # self.driver.get(login_url) # self.driver.switch_to.alert.send_keys(self.login + Keys.TAB + self.password) # self.driver.switch_to.alert.accept() # else: components = urlsplit(login_url) credentials = '%s:%s@' % (self.login, self.password) components_with_auth = SplitResult(components[0], credentials + components[1], *components[2:]) self.driver.get(urlunsplit(components_with_auth))
def assertApiUrlEqual(self, *args, **kwargs):
    """
    Assert URL equality agnostic of API version.

    Each positional URL gets '/api/vX' prepended to its path when not
    already present, where X is the `version` keyword argument or
    settings.API_CURRENT_VERSION. Optional `scheme` and `netloc`
    keyword arguments fill in those components on URLs that lack them,
    allowing absolute URLs to be compared.

    Example usage:
        url = '/api/v1/apps/app/bastacorp/'
        self.assertApiUrlEqual(url, '/apps/app/bastacorp1/')

        # settings.API_CURRENT_VERSION = 2
        url = '/api/v1/apps/app/bastacorp/'
        self.assertApiUrlEqual(url, '/apps/app/bastacorp/', version=1)
    """
    # Positions of the components in urlsplit's tuple, for readability.
    SCHEME, NETLOC, PATH = 0, 1, 2
    version = kwargs.get('version', settings.API_CURRENT_VERSION)
    scheme = kwargs.get('scheme', None)
    netloc = kwargs.get('netloc', None)
    prefix = '/api/v%d' % version
    normalized = []
    for url in args:
        parts = list(urlsplit(url))
        if not parts[PATH].startswith(prefix):
            parts[PATH] = prefix + parts[PATH]
        if scheme and not parts[SCHEME]:
            parts[SCHEME] = scheme
        if netloc and not parts[NETLOC]:
            parts[NETLOC] = netloc
        normalized.append(SplitResult(*parts))
    eq_(*normalized)
def pdf_echo_loopback(self):
    '''
    Adjunct endpoint used with above PDF test echo page that proxies the
    generated PDF back to the test page
    '''
    # set default PDF server URI
    pdfServerUri = '%s://%s/services/pdfserver/renderpdf' % (
        splunk.getDefault('protocol'), cherrypy.config.get('mgmtHostPort'))
    # get alternate PDF server URI; values seem to be varied so we normalize
    alertSettings = en.getEntity('configs/conf-alert_actions', 'email',
                                 namespace='search')
    if alertSettings.get('reportServerURL') and alertSettings['reportServerURL'].strip():
        pdfServerUri = alertSettings['reportServerURL'].strip()
        url = urlsplit(pdfServerUri)
        # A configured value of just "protocol://host:port" (path shorter
        # than 2 chars) gets the default renderpdf path appended.
        if len(url.path) < 2:
            url = url._asdict()
            url['path'] = '/services/pdfserver/renderpdf'
            pdfServerUri = urlunsplit(SplitResult(**url))
    # determine the external address that is most likely accessible
    urlparts = urlparse.urlparse(pdfServerUri)
    ai = socket.getaddrinfo(urlparts.hostname, int(urlparts.port or 80),
                            socket.AF_UNSPEC, socket.SOCK_STREAM, 0,
                            socket.AI_PASSIVE)[0]
    af, socktype, proto, canonname, hostport = ai
    appserverHost = alertSettings.get('hostname') and alertSettings['hostname'].strip()
    if appserverHost:
        logger.info('using configured appserver hostname "%s"' % appserverHost)
    else:
        # No configured hostname: open a throwaway socket towards the PDF
        # server and take the local address the OS picks for that route.
        s = socket.socket(af, socktype, proto)
        s.connect(hostport)
        sockname = s.getsockname()
        logger.info('most promising interface looks like %s' % sockname[0])
        appserverHost = sockname[0]
    appserverProtocol = 'https' if splunk.util.normalizeBoolean(
        cherrypy.config.get('enableSplunkWebSSL', False)) else 'http'
    # create a fake sso-bypass session utilizing the user's current sessionKey
    active_session = cherrypy.serving.session
    session_args = ('timeout', 'clean_freq', 'storage_path', 'servers')
    args = dict([(arg_name, getattr(active_session, arg_name))
                 for arg_name in session_args
                 if hasattr(active_session, arg_name)])
    fake_session = cherrypy.serving.session.__class__(**args)
    fake_session['sessionKey'] = cherrypy.session['sessionKey']
    fake_session['SSO_DISABLE'] = 1
    fake_session.save()
    fake_session.release_lock()
    # set GET args
    args = {
        'target': '%s://%s:%s%s/debug/pdf_echo' % (
            appserverProtocol,
            # IPv6 hosts need the bracketed form inside a URL.
            appserverHost if af == socket.AF_INET else '[%s]' % appserverHost,
            cherrypy.config['httpport'],
            cherrypy.request.script_name
        ),
        'mode': 'default',
        'session': fake_session.id
    }
    # fetch the SSL certificate, if any
    cert = cherrypy.request.app.root.report.get_cert()
    if cert:
        args['cert'] = cert
    logger.info('Testing PDF server=%s on URI=%s' % (pdfServerUri, args['target']))
    # make a request to the registered PDF server for the echo page
    timeout = 20
    h = httplib2.Http(timeout=timeout, disable_ssl_certificate_validation=True)
    start = time.time()
    try:
        serverResponse, serverContent = h.request(pdfServerUri, method='POST',
                                                  body=urllib.urlencode(args))
    except:
        # Report a near-timeout politely; re-raise anything else.
        if time.time() - start > (timeout - 1):
            cherrypy.response.headers['content-type'] = 'text/plain'
            return "Timed out while waiting for a response"
        raise
    cherrypy.response.headers['content-type'] = 'application/pdf'
    return serverContent
def requestPDF(self, **kw): """ Expects a valid splunk session key to be passed in along with the url to be rendered to PDF Complete parameter list: session_key (required) request_path (required) paperSize - 'a4', 'letter', etc or dimensions in mm '200x400' - default 'letter' orientation - 'portrait' or 'landscape' - default 'portrait' title - Title of report - default 'Splunk Report' override_disposition owner """ request_path = kw.get('request_path') if not request_path: raise SimpleError(400, "Invalid request_path supplied") print_session_key = kw.get('session_key') if not print_session_key: if cherrypy.config.get('debug_report_server'): logger.warn('Using debug user for report server') print_session_key = splunk.auth.getSessionKey( 'admin', 'changeme', hostPath=self.splunkd_urlhost) else: raise SimpleError(400, "Invalid session key supplied") settings = en.getEntity(ALERT_ACTIONS_ENTITY, 'email', namespace='system', sessionKey=print_session_key, owner='nobody') enabled = splunk.util.normalizeBoolean( settings.get('reportServerEnabled')) if not enabled: raise SimpleError(400, 'PDF server is not enabled') report_server_url = settings.get('reportServerURL') if isinstance(report_server_url, basestring): report_server_url = report_server_url.strip() url = urlsplit(report_server_url) if url.netloc and len(url.path) < 2: # user has specified the protocol://host:port only url = url._asdict() url['path'] = DEFAULT_SERVICES_URL report_server_url = urlunsplit(SplitResult(**url)) elif report_server_url is None: report_server_url = DEFAULT_SERVICES_URL else: raise SimpleError(500, "reportServerURL is invalid") if not report_server_url: report_server_url = DEFAULT_SERVICES_URL papersize = kw.get('papersize') if not papersize: papersize = settings.get('reportPaperSize', 'letter') orientation = kw.get('orientation') if not orientation: orientation = settings.get('reportPaperOrientation', 'portrait') title = kw.get('title') if not title: title = settings.get('reportTitle', 
_('Splunk Report')) owner = kw.get('owner', 'nobody') print_session = self.build_session(owner, print_session_key) try: data = { 'session': print_session.id, 'target': request_path, 'papersize': papersize, 'orientation': orientation, 'title': title, 'footer_right': _('Generated by Splunk at %(time)s') % dict(time='&D'), 'mode': 'splunk' } # see if splunkweb is running in SSL mode; if so pass the certificate to the pdf server cert = self.get_cert() if cert: data['cert'] = cert try: logger.info("Appserver dispatching report request to '%s'" % report_server_url) server_response, server_content = splunk.rest.simpleRequest( report_server_url, postargs=data, rawResult=True) except Exception, e: logger.error( "Appserver failed to dispatch report request to %s: %s" % (report_server_url, e)) raise SimpleError( 500, "Appserver failed to dispatch report request to %s: %s" % (report_server_url, e)) if server_response.status == 404: logger.error( "Appserver got a 404 response while contacting the PDF server at %s - Check that the PDF Server app is installed and that reportServerURL is correct" % report_server_url) raise SimpleError( 500, "Appserver got a 404 response while contacting the PDF server at %s - Check that the PDF Server app is installed and that reportServerURL is correct" % report_server_url) elif server_response.status != 200: if server_content and server_content[0] == '>': logger.error( "Appserver received error from PDF server at %s: %s" % (report_server_url, server_content[1:])) raise SimpleError( server_response.status, "PDF server at %s returned error: %s" % (report_server_url, server_content[1:])) logger.error( "Appserver failed to dispatch report request to %s: %s - %s" % (report_server_url, server_response.status, server_response.reason)) raise SimpleError( 500, "Appserver failed to dispatch report request to %s: %s %s" % (report_server_url, server_response.status, server_response.reason)) # relay the response through to the requester 
cherrypy.response.headers['content-type'] = server_response[ 'content-type'] cherrypy.response.headers['content-length'] = server_response[ 'content-length'] if kw.get('override_disposition'): cherrypy.response.headers['content-disposition'] = kw[ 'override_disposition'] elif 'content-disposition' in server_response: cherrypy.response.headers[ 'content-disposition'] = server_response[ 'content-disposition'] cherrypy.response.body = server_content return cherrypy.response.body
from circonus.collectd.df import get_df_graph_data from circonus.collectd.graph import get_collectd_graph_data from circonus.collectd.memory import get_memory_graph_data from circonus.collectd.interface import get_interface_graph_data from circonus.tag import get_tags_with, get_telemetry_tag, is_taggable from requests import codes as status_codes from requests.exceptions import HTTPError import requests API_PROTOCOL = "https" API_LOCATION = "api.circonus.com" API_VERSION = 2 API_BASE_SPLIT = SplitResult(scheme=API_PROTOCOL, netloc=API_LOCATION, path="/v%d" % API_VERSION, query="", fragment="") API_BASE_URL = urlunsplit(API_BASE_SPLIT) log = logging.getLogger(__name__) def get_api_url(resource_type_or_cid): """Get a valid fully qualified Circonus API URL for the given resource type or ``cid``. :param str resource_type_or_cid: The resource type or ``cid`` representing a specific resource. :return: The API URL. :rtype: :py:class:`str` """
def handle_distrib(self, message):
    """React to a file dispatch message.

    *message* is "<pathname1> <pathname2>"; pathname1's file name is
    matched against several known naming schemes (HRPT .hmf, EOS PDS,
    NPP/JPSS RDR, Metop EPS) to build or update a swath record in
    self._received_passes. Returns the swath dict with a "uri" pointing
    at pathname2, or None when the file name is not recognised.
    """
    pathname1, pathname2 = message.split(" ")
    dummy, filename = os.path.split(pathname1)
    # TODO: Should not make any assumptions on filename formats, should
    # load a description of it from a config file instead.
    if pathname1.endswith(".hmf"):
        # HRPT: "<risetime>_<satellite>.hmf"
        risestr, satellite = filename[:-4].split("_", 1)
        risetime = datetime.strptime(risestr, "%Y%m%d%H%M%S")
        pname = pass_name(risetime, satellite)
        swath = self._received_passes.get(pname, {
            "satellite": satellite,
            "start_time": risetime
        })
        swath["type"] = "binary"
        if satellite == "FENGYUN_1D":
            swath["format"] = "CHRPT"
        else:
            swath["format"] = "HRPT"
        swath["instrument"] = ("avhrr/3", "mhs", "amsu")
        swath["level"] = "0"
    elif filename.startswith("P042") or filename.startswith("P154"):
        # EOS PDS: fixed-position fields encoded in the file name.
        pds = {}
        pds["format"] = filename[0]
        pds["apid1"] = filename[1:8]
        pds["apid2"] = filename[8:15]
        pds["apid3"] = filename[15:22]
        pds["time"] = datetime.strptime(filename[22:33], "%y%j%H%M%S")
        pds["nid"] = filename[33]
        pds["ufn"] = filename[34:36]
        pds["extension"] = filename[36:40]
        if pds["apid1"][:3] == "042":
            satellite = "TERRA"
        elif pds["apid1"][:3] == "154":
            satellite = "AQUA"
        else:
            raise ValueError("Unrecognized satellite ID: " +
                             pds["apid1"][:3])
        risetime = pds["time"]
        pname = pass_name(risetime, satellite)
        swath = self._received_passes.get(pname, {
            "satellite": satellite,
            "start_time": risetime
        })
        # APID suffix -> instrument name; unknown APIDs pass through as-is.
        instruments = {
            "0064": "modis",
            "0141": "ceres+y",
            "0157": "ceres-y",
            "0261": "amsu-a1",
            "0262": "amsu-a1",
            "0290": "amsu-a2",
            "0342": "hsb",
            "0402": "amsr-e",
            "0404": "airs",
            "0405": "airs",
            "0406": "airs",
            "0407": "airs",
            "0414": "airs",
            "0415": "airs",
            "0419": "airs",
            "0957": "gbad",
        }
        swath["instrument"] = instruments.get(pds["apid1"][3:],
                                              pds["apid1"][3:])
        swath["format"] = "PDS"
        swath["type"] = "binary"
        swath["level"] = "0"
        swath["number"] = int(pds["ufn"])
    # NPP RDRs
    elif filename.startswith("R") and filename.endswith(".h5"):
        # Occassionaly RT-STPS produce files with a nonstandard file
        # naming, lacking the 'RNSCA' field. We will try to deal with this
        # below (Adam - 2013-06-04):
        mda = {}
        idx_start = 0
        mda["format"] = filename[0]
        if filename.startswith("RATMS-RNSCA"):
            mda["instrument"] = "atms"
        elif filename.startswith("RCRIS-RNSCA"):
            mda["instrument"] = "cris"
        elif filename.startswith("RNSCA-RVIRS"):
            mda["instrument"] = "viirs"
        else:
            if filename.startswith("RATMS_npp"):
                mda["instrument"] = "atms"
            elif filename.startswith("RCRIS_npp"):
                mda["instrument"] = "cris"
            else:
                logger.warning("Seems to be a NPP/JPSS RDR " +
                               "file but name is not standard!")
                logger.warning("filename = " + filename)
                return None
            # Short names lack the 6-char 'RNSCA-' field: shift field offsets.
            idx_start = -6
        mda["start_time"] = datetime.strptime(
            filename[idx_start + 16:idx_start + 33], "d%Y%m%d_t%H%M%S")
        end_time = datetime.strptime(
            filename[idx_start + 16:idx_start + 25] + " " +
            filename[idx_start + 35:idx_start + 42], "d%Y%m%d e%H%M%S")
        mda["orbit"] = filename[idx_start + 45:idx_start + 50]
        # FIXME: swath start and end time is granule dependent.
        # Get the end time as well! - Adam 2013-06-03:
        satellite = "NPP"
        start_time = mda["start_time"]
        pname = pass_name(start_time, satellite)
        swath = self._received_passes.get(pname, {
            "satellite": satellite,
            "start_time": start_time
        })
        swath['end_time'] = end_time
        swath["instrument"] = mda["instrument"]
        swath["format"] = "RDR"
        swath["type"] = "HDF5"
        swath["level"] = "0"
    # metop
    elif filename[4:12] == "_HRP_00_":
        # NOTE(review): "ASCA" and "AVHR" appear twice in this literal;
        # the later entries ("ascat", "avhrr/3") win. Preserved as-is.
        instruments = {
            "AVHR": "avhrr",
            "ASCA": "ascat",
            "AMSA": "amsu-a",
            "ASCA": "ascat",
            "ATOV": "atovs",
            "AVHR": "avhrr/3",
            "GOME": "gome",
            "GRAS": "gras",
            "HIRS": "hirs/4",
            "IASI": "iasi",
            "MHSx": "mhs",
            "SEMx": "sem",
            "ADCS": "adcs",
            "SBUV": "sbuv",
            "HKTM": "vcdu34"
        }
        satellites = {"M02": "METOP-A", "M01": "METOP-B"}
        satellite = satellites[filename[12:15]]
        risetime = datetime.strptime(filename[16:31], "%Y%m%d%H%M%SZ")
        #falltime = datetime.strptime(filename[16:47], "%Y%m%d%H%M%SZ")
        pname = pass_name(risetime, satellite)
        swath = self._received_passes.get(pname, {
            "satellite": satellite,
            "start_time": risetime
        })
        swath["instrument"] = instruments[filename[:4]]
        swath["format"] = "EPS"
        swath["type"] = "binary"
        swath["level"] = "0"
    else:
        return None
    if pathname2.endswith(filename):
        uri = pathname2
    else:
        uri = os.path.join(pathname2, filename)
    # Rewrite local and ftp URIs as ssh URIs reachable from other hosts.
    url = urlsplit(uri)
    if url.scheme in ["", "file"]:
        scheme = "ssh"
        netloc = self._emitter
        uri = urlunsplit(
            SplitResult(scheme, netloc, url.path, url.query, url.fragment))
    elif url.scheme == "ftp":
        scheme = "ssh"
        netloc = url.hostname
        uri = urlunsplit(
            SplitResult(scheme, netloc, url.path, url.query, url.fragment))
    swath["filename"] = os.path.split(url.path)[1]
    swath["uri"] = uri
    return swath
def main(argv=None):
    """Load data into a Cheshire3 database based on parameters in argv.

    Returns 0-style success implicitly, 1 when the database cannot be
    identified from the working directory, 2 when the identified
    database does not exist.

    BUG FIXES: the result of ``myEnv.getRodsPort()`` was discarded,
    leaving ``port`` unbound whenever the accessor exists; and the
    working directory was assigned to a misspelled ``cqm`` variable,
    leaving ``cwd`` unbound. Both caused a NameError on the
    non-"irods://" data-argument path.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    if irods is None:
        raise MissingDependencyException('icheshire3-load script',
                                         'irods (PyRods)'
                                         )
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            server.log_critical(session, e.message)
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid))
    else:
        dbid = args.database
    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2
    else:
        # Allow for multiple data arguments
        docFac = db.get_object(session, 'defaultDocumentFactory')
        for dataArg in args.data:
            if dataArg.startswith('irods://'):
                parsed = urlsplit(dataArg)
            else:
                # Examine current environment
                status, myEnv = irods.getRodsEnv()
                # Host
                try:
                    host = myEnv.getRodsHost()
                except AttributeError:
                    host = myEnv.rodsHost
                # Port
                # FIX: the accessor's result was previously discarded,
                # leaving ``port`` unbound when getRodsPort() exists.
                try:
                    port = myEnv.getRodsPort()
                except AttributeError:
                    port = myEnv.rodsPort
                # User
                try:
                    username = myEnv.getRodsUserName()
                except AttributeError:
                    username = myEnv.rodsUserName
                netloc = '{0}@{1}:{2}'.format(username, host, port)
                # Working collection
                # FIX: was assigned to a misspelled ``cqm`` variable,
                # leaving ``cwd`` unbound for the path join below.
                try:
                    cwd = myEnv.getRodsCwd()
                except AttributeError:
                    cwd = myEnv.rodsCwd
                path = '/'.join([cwd, dataArg])
                parsed = SplitResult('irods', netloc, path, None, None)
                dataArg = urlunsplit(parsed)
            server.log_debug(session, dataArg)
            if args.format is None or not args.format.startswith('i'):
                fmt = 'irods'
            else:
                fmt = args.format
            server.log_debug(session, fmt)
            try:
                docFac.load(session, dataArg, args.cache, fmt,
                            args.tagname, args.codec)
            except MissingDependencyException as e:
                server.log_critical(session, e.reason)
                missingDependencies = e.dependencies
                raise MissingDependencyException('cheshire3-load script',
                                                 missingDependencies)
        wf = db.get_object(session, 'buildIndexWorkflow')
        wf.process(session, docFac)