def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    # NOTE(review): conversion happens BEFORE the type guard below, so the
    # guard can only fire for exotic text() results — confirm this is intended
    json_string = text(json_string)
    if not is_text(json_string) and json_string.__class__.__name__ != "FileString":
        Log.error("only unicode json accepted")
    try:
        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)
        if flexible:
            # hjson tolerates comments, trailing commas, etc.
            value = hjson2value(json_string)
        else:
            value = to_data(json_decoder(text(json_string)))
        if leaves:
            # EXPAND DOT-DELIMITED KEYS INTO NESTED STRUCTURE
            value = leaves_to_data(value)
        return value
    except Exception as e:
        e = Except.wrap(e)
        if not json_string.strip():
            Log.error("JSON string is only whitespace")
        # WALK THE CAUSAL CHAIN TO THE DEEPEST DECODER ERROR
        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause
        if "Expecting '" in c and "' delimiter: line" in c:
            # PARSE line/column OUT OF THE DECODER MESSAGE (1-indexed -> 0-indexed)
            line_index = int(strings.between(c.message, " line ", " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            # BUILD A SHORT SAMPLE WITH A ^ POINTER UNDER THE FAILING COLUMN
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = " " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"
            if len(sample) > 43:
                sample = sample[:43] + "..."
            Log.error(CAN_NOT_DECODE_JSON + " at:\n\t{{sample}}\n\t{{pointer}}\n", sample=sample, pointer=pointer)
        # FALLBACK: DUMP A HEX + CHARACTER VIEW OF THE FIRST ~1000 BYTES
        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            # TWO-CHAR CELLS SO CHARACTERS LINE UP UNDER THEIR HEX PAIRS
            char_str = " " + "  ".join((c.decode("latin1") if ord(c) >= 32 else ".") for c in base_str)
        except Exception:
            char_str = " "
        Log.error(CAN_NOT_DECODE_JSON + ":\n{{char_str}}\n{{hexx_str}}\n", char_str=char_str, hexx_str=hexx_str, cause=e)
def _open(self):
    """ DO NOT USE THIS UNLESS YOU close() FIRST"""
    if self.settings.host.startswith("mysql://"):
        # DECODE THE URI: mysql://username:password@host:optional_port/database_name
        up = strings.between(self.settings.host, "mysql://", "@")
        if ":" in up:
            # password may be percent-encoded in the URI
            self.settings.username, self.settings.password = unquote(up).split(":")
        else:
            self.settings.username = up
        url = strings.between(self.settings.host, "@", None)
        hp, self.settings.schema = url.split("/", 1)
        if ":" in hp:
            self.settings.host, self.settings.port = hp.split(":")
            self.settings.port = int(self.settings.port)
        else:
            self.settings.host = hp

    # SSL PEM
    # local connections skip TLS entirely; remote ones require a pem URL
    if self.settings.host in ("localhost", "mysql", '127.0.0.1'):
        ssl_context = None
    else:
        if self.settings.ssl and not self.settings.ssl.pem:
            Log.error("Expecting 'pem' property in ssl")
        # ssl_context = ssl.create_default_context(**get_ssl_pem_file(self.settings.ssl.pem))
        # DOWNLOAD THE PEM AND POINT THE DRIVER AT THE LOCAL COPY
        filename = File(".pem") / URL(self.settings.ssl.pem).host
        filename.write_bytes(http.get(self.settings.ssl.pem).content)
        ssl_context = {"ca": filename.abspath}

    try:
        self.db = connect(
            host=self.settings.host,
            port=self.settings.port,
            user=coalesce(self.settings.username, self.settings.user),
            passwd=coalesce(self.settings.password, self.settings.passwd),
            db=coalesce(self.settings.schema, self.settings.db),
            # leave 10s of margin under the global EXECUTE_TIMEOUT (ms); default 5 minutes
            read_timeout=coalesce(self.settings.read_timeout, (EXECUTE_TIMEOUT / 1000) - 10 if EXECUTE_TIMEOUT else None, 5 * 60),
            charset=u"utf8",
            use_unicode=True,
            ssl=ssl_context,
            # server-side (streaming) cursor
            cursorclass=cursors.SSCursor)
    except Exception as e:
        if self.settings.host.find("://") == -1:
            Log.error(u"Failure to connect to {{host}}:{{port}}", host=self.settings.host, port=self.settings.port, cause=e)
        else:
            Log.error(u"Failure to connect. PROTOCOL PREFIX IS PROBABLY BAD", e)
    # RESET CONNECTION STATE
    self.cursor = None
    self.partial_rollback = False
    self.transaction_level = 0
    self.backlog = []  # accumulate the write commands so they are sent at once
    if self.readonly:
        self.begin()
def _get_url(url, branch, **kwargs):
    """
    Fetch push data from `url` and decode it as JSON.
    Raises UNKNOWN_PUSH when hg reports an unknown revision.
    """
    with Explanation("get push from {{url}}", url=url, debug=DEBUG):
        raw = http.get(url, **kwargs)
        data = json2value(raw.content.decode("utf8"))
        # hg can signal failure as an object carrying an `error` field...
        if data.error.startswith("unknown revision"):
            bad_revision = strings.between(data.error, "'", "'")
            Log.error(UNKNOWN_PUSH, revision=bad_revision)
        # ...or as a bare text payload
        if is_text(data) and data.startswith("unknown revision"):
            bad_revision = strings.between(data, "'", "'")
            Log.error(UNKNOWN_PUSH, revision=bad_revision)
        # branch.url = _trim(url)  # RECORD THIS SUCCESS IN THE BRANCH
        return data
def fix(rownum, line, source, sample_only_filter, sample_size):
    """
    Prepare one raw JSON line from `source` for indexing.
    Returns (row, sampled) where row is {"id", "value"} or {"id", "json"} (or None
    to drop the line) and sampled=True means the caller should stop after this row.
    """
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}', '{"code": "bb"}').replace('{"id": "tc"}', '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                # REPLACE THE SUITE OBJECT WITH A SINGLE STRING VALUE
                suite = mo_json.json2value(suite_json)
                suite = convert.value2json(coalesce(suite.fullname, suite.name))
                line = line.replace(suite_json, suite)

    if source.name.startswith("active-data-codecoverage"):
        # KEEP ONLY RECORDS THAT COVER AT LEAST ONE LINE
        d = convert.json2value(line)
        if d.source.file.total_covered > 0:
            return {"id": d._id, "json": line}, False
        else:
            return None, False

    if rownum == 0:
        # FIRST ROW: ALWAYS DECODE, SO SAMPLING CAN BE EVALUATED
        value = mo_json.json2value(line)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(
                int(1.0 / coalesce(sample_size, 0.01))) != 0 and jx.filter(
                [value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        # OVERSIZED RECORD: DECODE SO IT CAN BE TRUNCATED
        value = mo_json.json2value(line)
        _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find('"resource_usage":') != -1:
        value = mo_json.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False
def _get_url(url, branch, **kwargs):
    """
    Download push info from `url`; error out when hg answers "unknown revision".
    On success, remember the working url on the branch.
    """
    with Explanation("get push from {{url}}", url=url, debug=DEBUG):
        payload = http.get(url, **kwargs).content
        data = json2value(payload.decode("utf8"))
        looks_like_error = isinstance(data, (text_type, str)) and data.startswith("unknown revision")
        if looks_like_error:
            Log.error("Unknown push {{revision}}", revision=strings.between(data, "'", "'"))
        branch.url = _trim(url)  # RECORD THIS SUCCESS IN THE BRANCH
        return data
def _get_url(url, branch, **kwargs):
    """
    Download push info from `url`; raise UNKNOWN_PUSH when hg answers
    "unknown revision". On success, remember the working url on the branch.
    """
    with Explanation("get push from {{url}}", url=url, debug=DEBUG):
        payload = http.get(url, **kwargs).content
        data = json2value(payload.decode("utf8"))
        looks_like_error = isinstance(data, (text_type, str)) and data.startswith("unknown revision")
        if looks_like_error:
            Log.error(UNKNOWN_PUSH, revision=strings.between(data, "'", "'"))
        branch.url = _trim(url)  # RECORD THIS SUCCESS IN THE BRANCH
        return data
def fix(rownum, line, source, sample_only_filter, sample_size):
    """
    Prepare one raw JSON line from `source` for indexing.
    Returns (row, sampled) where row is {"id", "value"} or {"id", "json"}
    and sampled=True means the caller should stop after this row.
    """
    # ES SCHEMA IS STRICTLY TYPED, USE "code" FOR TEXT IDS
    line = line.replace('{"id": "bb"}', '{"code": "bb"}').replace('{"id": "tc"}', '{"code": "tc"}')

    # ES SCHEMA IS STRICTLY TYPED, THE SUITE OBJECT CAN NOT BE HANDLED
    if source.name.startswith("active-data-test-result"):
        # "suite": {"flavor": "plain-chunked", "name": "mochitest"}
        found = strings.between(line, '"suite": {', '}')
        if found:
            suite_json = '{' + found + "}"
            if suite_json:
                # REPLACE THE SUITE OBJECT WITH A SINGLE STRING VALUE
                suite = mo_json.json2value(suite_json)
                suite = convert.value2json(coalesce(suite.fullname, suite.name))
                line = line.replace(suite_json, suite)

    if rownum == 0:
        # FIRST ROW: ALWAYS DECODE, SO SAMPLING CAN BE EVALUATED
        value = mo_json.json2value(line)
        if len(line) > MAX_RECORD_LENGTH:
            _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
        if sample_only_filter and Random.int(int(1.0/coalesce(sample_size, 0.01))) != 0 and jx.filter([value], sample_only_filter):
            # INDEX etl.id==0, BUT NO MORE
            if value.etl.id != 0:
                Log.error("Expecting etl.id==0")
            return row, True
    elif len(line) > MAX_RECORD_LENGTH:
        # OVERSIZED RECORD: DECODE SO IT CAN BE TRUNCATED
        value = mo_json.json2value(line)
        _shorten(value, source)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    elif line.find('"resource_usage":') != -1:
        value = mo_json.json2value(line)
        _id, value = _fix(value)
        row = {"id": _id, "value": value}
    else:
        # FAST
        _id = strings.between(line, "\"_id\": \"", "\"")  # AVOID DECODING JSON
        row = {"id": _id, "json": line}

    return row, False
def replace_vars(text, params=None):
    """
    REPLACE {{vars}} WITH ENVIRONMENTAL VALUES

    :param text: template containing {{var}} markers, where each var is a date expression
    :param params: unused (kept for interface compatibility)
    :return: text with every parsable {{var}} replaced by its unix timestamp
    """
    start = 0
    var = strings.between(text, "{{", "}}", start)
    while var:
        replace = "{{" + var + "}}"
        index = text.find(replace, 0)
        if index == -1:
            Log.error("could not find {{var}} (including quotes)", var=replace)
        end = index + len(replace)
        try:
            # INTERPRET var AS A DATE EXPRESSION; SUBSTITUTE ITS UNIX TIMESTAMP
            replacement = unicode(Date(var).unix)
            text = text[:index] + replacement + text[end:]
            start = index + len(replacement)
        except Exception:
            # NOT A DATE EXPRESSION: LEAVE MARKER IN PLACE, SCAN PAST IT
            start += 1
        var = strings.between(text, "{{", "}}", start)
    # BUG FIX: the substituted text was built but never returned
    return text
def _get_and_retry(self, url):
    """
    Fetch push JSON from `url`; on failure, rewrite known l10n/aurora/beta/
    release/autoland URL shapes to their canonical repo and retry recursively.
    """
    try:
        data = http.get_json(**set_default({"url": url}, self.hg))
        # hg can signal failure as an object carrying an `error` field...
        if data.error.startswith("unknown revision"):
            Log.error(UNKNOWN_PUSH, revision=strings.between(data.error, "'", "'"))
        # ...or as a bare text payload
        if is_text(data) and data.startswith("unknown revision"):
            Log.error(UNKNOWN_PUSH, revision=strings.between(data, "'", "'"))
        # branch.url = _trim(url)  # RECORD THIS SUCCESS IN THE BRANCH
        return data
    except Exception as e:
        path = url.split("/")
        if path[3] == "l10n-central":
            # FROM https://hg.mozilla.org/l10n-central/tr/json-pushes?full=1&changeset=a6eeb28458fd
            # TO   https://hg.mozilla.org/mozilla-central/json-pushes?full=1&changeset=a6eeb28458fd
            path = path[0:3] + ["mozilla-central"] + path[5:]
            return self._get_and_retry("/".join(path))
        elif len(path) > 5 and path[5] == "mozilla-aurora":
            # FROM https://hg.mozilla.org/releases/l10n/mozilla-aurora/pt-PT/json-pushes?full=1&changeset=b44a8c68fc60
            # TO   https://hg.mozilla.org/releases/mozilla-aurora/json-pushes?full=1&changeset=b44a8c68fc60
            path = path[0:4] + ["mozilla-aurora"] + path[7:]
            return self._get_and_retry("/".join(path))
        elif len(path) > 5 and path[5] == "mozilla-beta":
            # FROM https://hg.mozilla.org/releases/l10n/mozilla-beta/lt/json-pushes?full=1&changeset=03fbf7556c94
            # TO   https://hg.mozilla.org/releases/mozilla-beta/json-pushes?full=1&changeset=b44a8c68fc60
            path = path[0:4] + ["mozilla-beta"] + path[7:]
            return self._get_and_retry("/".join(path))
        elif len(path) > 7 and path[5] == "mozilla-release":
            # FROM https://hg.mozilla.org/releases/l10n/mozilla-release/en-GB/json-pushes?full=1&changeset=57f513ab03308adc7aa02cc2ea8d73fe56ae644b
            # TO   https://hg.mozilla.org/releases/mozilla-release/json-pushes?full=1&changeset=57f513ab03308adc7aa02cc2ea8d73fe56ae644b
            path = path[0:4] + ["mozilla-release"] + path[7:]
            return self._get_and_retry("/".join(path))
        elif len(path) > 5 and path[4] == "autoland":
            # FROM https://hg.mozilla.org/build/autoland/json-pushes?full=1&changeset=3ccccf8e5036179a3178437cabc154b5e04b333d
            # TO   https://hg.mozilla.org/integration/autoland/json-pushes?full=1&changeset=3ccccf8e5036179a3178437cabc154b5e04b333d
            # NOTE(review): inserts "try", but the comment above says
            # "integration/autoland" — confirm which repo is intended
            path = path[0:3] + ["try"] + path[5:]
            return self._get_and_retry("/".join(path))
        raise e
def last_deploy(self):
    """
    Return the date of the most recent PyPi deploy for this project, decoded
    from the version string in setup.py; today's date when the directory is
    not a pypi project.
    """
    setup_py = File.new_instance(self.directory, 'setup.py')
    if not setup_py.exists:
        Log.note("Not a pypi project: {{dir}}", dir=self.directory)
        return Date.today()
    content = setup_py.read()
    # last dotted component of version= encodes the deploy date as %y%j
    serial = json2value(strings.between(content, "version=", ",")).split(".")[-1]
    deployed = unicode2Date(serial, format="%y%j")
    Log.note("PyPi last deployed {{date|datetime}}", date=deployed, dir=self.directory)
    return deployed
def query(self, query):
    """
    Run `query` against the ES-backed query service.
    :return: None on success; on failure, the translated query extracted
             from the error details (between >>>> and <<<< markers)
    """
    try:
        with self.esq:
            self.esq.query(query)
        return None
    except Exception as e:
        f = Except(ERROR, text(e), trace=extract_tb(1))
        try:
            # THE FAILURE TEXT EMBEDS THE TRANSLATED QUERY BETWEEN >>>> <<<< MARKERS
            details = str(f)
            query = json2value(strings.between(details, ">>>>", "<<<<"))
            return query
        except Exception as g:
            # NOTE(review): `f` is passed positionally, and `g` is discarded;
            # Log.error elsewhere uses cause=... — confirm this is intended
            Log.error("problem", f)
def _restart_etl_supervisor(conn, please_stop, cpu_count):
    """
    Push a supervisor config (numprocs scaled to cpu_count) to the remote
    machine and (re)start supervisord.
    """
    # READ LOCAL CONFIG FILE, ALTER IT FOR THIS MACHINE RESOURCES, AND PUSH TO REMOTE
    content = File("./examples/config/etl_supervisor.conf").read_bytes()
    old_count = between(content, "numprocs=", "\n")
    content = content.replace("numprocs=" + old_count + "\n", "numprocs=" + str(cpu_count) + "\n")
    with TempFile() as local_copy:
        local_copy.write(content)
        conn.sudo("rm -f /etc/supervisor/conf.d/etl_supervisor.conf")
        conn.put(local_copy.abspath, "/etc/supervisord.conf", use_sudo=True)
        conn.run("mkdir -p /home/ec2-user/logs")
        # START DAEMON (OR THROW ERROR IF RUNNING ALREADY)
        conn.sudo("supervisord -c /etc/supervisord.conf", warn=True)
        conn.sudo("supervisorctl reread")
        conn.sudo("supervisorctl update")
def extract_job_settings():
    """
    Build settings for the extract-jobs tests: stub required environment
    variables, point DATABASE_URL at the test schema, and load the ETL config.
    """
    # These values not directly accessed during testing, but the code requires that they be present.
    os.environ["NEW_RELIC_APP_NAME"] = "testing"
    os.environ["BIGQUERY_PRIVATE_KEY_ID"] = "1"
    os.environ["BIGQUERY_PRIVATE_KEY"] = "1"

    # USE THE TEST SCHEMA
    original_url = os.environ["DATABASE_URL"]
    live_schema = strings.between(original_url, "/", None)
    os.environ["DATABASE_URL"] = original_url.replace(live_schema, DATABASES["default"]["TEST"]["NAME"])

    settings = startup.read_settings(filename=extract_jobs.CONFIG_FILE, complain=False)
    settings.source.database.ssl = None  # NOT REQUIRED FOR TEST DATABASE
    constants.set(settings.constants)
    Log.start(settings.debug)
    return settings
def _setup_etl_supervisor(self, cpu_count):
    """
    Install supervisor on the remote box and register the ETL workers,
    with numprocs scaled to cpu_count.
    """
    # INSTALL supervsor
    sudo("apt-get install -y supervisor")
    with fabric_settings(warn_only=True):
        sudo("service supervisor start")

    # READ LOCAL CONFIG FILE, ALTER IT FOR THIS MACHINE RESOURCES, AND PUSH TO REMOTE
    raw = File("./examples/config/etl_supervisor.conf").read_bytes()
    old_value = between(raw, "numprocs=", "\n")
    raw = raw.replace("numprocs=" + old_value + "\n", "numprocs=" + str(cpu_count) + "\n")
    File("./temp/etl_supervisor.conf.alt").write_bytes(raw)
    sudo("rm -f /etc/supervisor/conf.d/etl_supervisor.conf")
    put("./temp/etl_supervisor.conf.alt", '/etc/supervisor/conf.d/etl_supervisor.conf', use_sudo=True)
    run("mkdir -p /home/ubuntu/ActiveData-ETL/results/logs")

    # POKE supervisor TO NOTICE THE CHANGE
    sudo("supervisorctl reread")
    sudo("supervisorctl update")
def _synch(settings):
    """
    Mirror one or more git repositories into S3: clone/update a local cache,
    diff each file against the S3 listing by etag, and upload the differences.
    """
    cache = File(settings.local_cache)
    if not cache.exists:
        cache.create()
    settings.destination.directory = settings.destination.directory.trim("/")

    for repo in listwrap(coalesce(settings.repo, settings.repos)):
        Log.alert("Synch {{repo}}", repo=repo.description)
        if not strings.between(repo.source.url, "/", ".git"):
            Log.error("This is not a git reference: {{git_url}}", git_url=repo.source.url)
        name = coalesce(repo.source.name, strings.between(repo.source.url, "/", ".git"))
        if not repo.source.branch:
            Log.note("{{name}} has not branch property", name=name)

        # DO WE HAVE A LOCAL COPY?
        local_repo = File.new_instance(cache, name)
        local_dir = File.new_instance(local_repo, repo.source.directory)
        if not local_repo.exists:
            Process("clone repo", ["git", "clone", repo.source.url, name], cwd=cache, shell=True, debug=DEBUG).join(raise_on_error=True)
        # SWITCH TO BRANCH
        Process("checkout", ["git", "checkout", repo.source.branch], cwd=local_repo, shell=True, debug=DEBUG).join(raise_on_error=True)
        # UPDATE THE LOCAL COPY
        Process("update", ["git", "pull", "origin", repo.source.branch], cwd=local_repo, shell=True, debug=DEBUG).join(raise_on_error=True)

        # GET CURRENT LISTING OUT OF S3
        try:
            connection = connect_to_region(
                region_name=repo.destination.region,
                calling_format="boto.s3.connection.OrdinaryCallingFormat",
                aws_access_key_id=unwrap(repo.destination.aws_access_key_id),
                aws_secret_access_key=unwrap(repo.destination.aws_secret_access_key))
            bucket = connection.get_bucket(repo.destination.bucket)
        except Exception as e:
            Log.error("Problem connecting to {{bucket}}", bucket=repo.destination.bucket, cause=e)

        remote_prefix = repo.destination.directory.strip('/') + "/"
        listing = bucket.list(prefix=remote_prefix)
        # MAP relative-key -> (full key, etag) FOR QUICK DIFFING
        metas = {
            m.key[len(remote_prefix):]: Data(key=m.key, etag=m.etag)
            for m in listing
        }

        net_new = []
        Log.note("Look for differences")
        for local_file in local_dir.leaves:
            # NOTE(review): lstrip(b'/') mixes bytes with the str tests below —
            # confirm abspath type on this Python version
            local_rel_file = local_file.abspath[len(local_dir.abspath):].lstrip(b'/')
            if "/." in local_rel_file or local_rel_file.startswith("."):
                continue  # SKIP HIDDEN FILES/DIRECTORIES
            local_rel_file = local_rel_file.replace("qb/Qb", "qb/qb")
            remote_file = metas.get(local_rel_file)
            if not repo.force and remote_file:
                # UPLOAD ONLY WHEN CONTENT CHANGED
                if remote_file.etag != md5(local_file):
                    net_new.append(local_file)
            else:
                net_new.append(local_file)

        # SEND DIFFERENCES
        for n in net_new:
            remote_file = join_path(repo.destination.directory, n.abspath[len(local_dir.abspath):])
            remote_file = remote_file.replace("qb/Qb", "qb/qb")
            try:
                Log.note("upload {{file}} ({{type}})", file=remote_file, type=n.mime_type)
                storage = bucket.new_key(remote_file)
                storage.content_type = n.mime_type
                storage.set_contents_from_string(n.read_bytes())
                storage.set_acl('public-read')
            except Exception as e:
                # BEST EFFORT: A FAILED UPLOAD DOES NOT STOP THE SYNCH
                Log.warning("can not upload {{file}} ({{type}})", file=remote_file, type=n.mime_type, cause=e)
def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if isinstance(json_string, str):
        # Python 2: str is bytes; only unicode is allowed here
        Log.error("only unicode json accepted")
    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py# L58
            json_string = re.sub(r"\"\"\".*?\"\"\"", r"\n", json_string, flags=re.MULTILINE)
            json_string = "\n".join(remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)
        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)
        try:
            value = wrap(json_decoder(unicode(json_string)))
        except Exception as e:
            Log.error("can not decode\n{{content}}", content=json_string, cause=e)
        if leaves:
            # EXPAND DOT-DELIMITED KEYS INTO NESTED STRUCTURE
            value = wrap_leaves(value)
        return value
    except Exception as e:
        e = Except.wrap(e)
        if not json_string.strip():
            Log.error("JSON string is only whitespace")
        # WALK THE CAUSAL CHAIN TO THE DEEPEST DECODER ERROR
        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause
        if "Expecting '" in c and "' delimiter: line" in c:
            # PARSE line/column OUT OF THE DECODER MESSAGE (1-indexed -> 0-indexed)
            line_index = int(strings.between(c.message, " line ", " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            # BUILD A SHORT SAMPLE WITH A ^ POINTER UNDER THE FAILING COLUMN
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = " " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"
            if len(sample) > 43:
                sample = sample[:43] + "..."
            Log.error("Can not decode JSON at:\n\t" + sample + "\n\t" + pointer + "\n")
        # FALLBACK: DUMP A HEX + CHARACTER VIEW OF THE FIRST ~1000 BYTES
        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            # TWO-CHAR CELLS SO CHARACTERS LINE UP UNDER THEIR HEX PAIRS
            char_str = " " + "  ".join((c.decode("latin1") if ord(c) >= 32 else ".") for c in base_str)
        except Exception as e:
            char_str = " "
        Log.error("Can not decode JSON:\n" + char_str + "\n" + hexx_str + "\n", e)
def json2value(json_string, params=Null, flexible=False, leaves=False):
    """
    :param json_string: THE JSON
    :param params: STANDARD JSON PARAMS
    :param flexible: REMOVE COMMENTS
    :param leaves: ASSUME JSON KEYS ARE DOT-DELIMITED
    :return: Python value
    """
    if isinstance(json_string, str):
        # Python 2: str is bytes; only unicode is allowed here
        Log.error("only unicode json accepted")
    try:
        if flexible:
            # REMOVE """COMMENTS""", # COMMENTS, //COMMENTS, AND \n \r
            # DERIVED FROM https://github.com/jeads/datasource/blob/master/datasource/bases/BaseHub.py# L58
            json_string = re.sub(r"\"\"\".*?\"\"\"", r"\n", json_string, flags=re.MULTILINE)
            json_string = "\n".join(remove_line_comment(l) for l in json_string.split("\n"))
            # ALLOW DICTIONARY'S NAME:VALUE LIST TO END WITH COMMA
            json_string = re.sub(r",\s*\}", r"}", json_string)
            # ALLOW LISTS TO END WITH COMMA
            json_string = re.sub(r",\s*\]", r"]", json_string)
        if params:
            # LOOKUP REFERENCES
            json_string = expand_template(json_string, params)
        try:
            value = wrap(json_decoder(unicode(json_string)))
        except Exception as e:
            Log.error("can not decode\n{{content}}", content=json_string, cause=e)
        if leaves:
            # EXPAND DOT-DELIMITED KEYS INTO NESTED STRUCTURE
            value = wrap_leaves(value)
        return value
    except Exception as e:
        e = Except.wrap(e)
        if not json_string.strip():
            Log.error("JSON string is only whitespace")
        # WALK THE CAUSAL CHAIN TO THE DEEPEST DECODER ERROR
        c = e
        while "Expecting '" in c.cause and "' delimiter: line" in c.cause:
            c = c.cause
        if "Expecting '" in c and "' delimiter: line" in c:
            # PARSE line/column OUT OF THE DECODER MESSAGE (1-indexed -> 0-indexed)
            line_index = int(strings.between(c.message, " line ", " column ")) - 1
            column = int(strings.between(c.message, " column ", " ")) - 1
            line = json_string.split("\n")[line_index].replace("\t", " ")
            # BUILD A SHORT SAMPLE WITH A ^ POINTER UNDER THE FAILING COLUMN
            if column > 20:
                sample = "..." + line[column - 20:]
                pointer = " " + (" " * 20) + "^"
            else:
                sample = line
                pointer = (" " * column) + "^"
            if len(sample) > 43:
                sample = sample[:43] + "..."
            Log.error("Can not decode JSON at:\n\t" + sample + "\n\t" + pointer + "\n")
        # FALLBACK: DUMP A HEX + CHARACTER VIEW OF THE FIRST ~1000 BYTES
        base_str = strings.limit(json_string, 1000).encode('utf8')
        hexx_str = bytes2hex(base_str, " ")
        try:
            # TWO-CHAR CELLS SO CHARACTERS LINE UP UNDER THEIR HEX PAIRS
            char_str = " " + "  ".join((c.decode("latin1") if ord(c) >= 32 else ".") for c in base_str)
        except Exception as e:
            char_str = " "
        Log.error("Can not decode JSON:\n" + char_str + "\n" + hexx_str + "\n", e)