def build(paths, dbname, table):
    """
    Load markdown files into a SQLite database.

    Based on https://github.com/simonw/markdown-to-sqlite, modified to use
    markdown extensions.
    """
    # Database comes from sqlite-utils.
    db = Database(dbname)
    md = markdown.Markdown(
        extensions=["fenced_code", "codehilite"],
        extension_configs={"codehilite": {"guess_lang": "False"}},
    )
    docs = []
    for path in paths:
        with open(path) as fp:
            metadata, text = yamldown.load(fp)
        html = md.convert(text)
        doc = {
            "_id": hashlib.sha1(str(path).encode("utf8")).hexdigest(),
            "_path": str(path),
            "text": text,
            "html": html,
            **(metadata or {}),
        }
        docs.append(doc)
    db[table].upsert_all(docs, pk="_id")
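A minimal, hypothetical invocation of build; the docs/ directory, database filename, and table name below are illustrative assumptions, not from the original project.

from pathlib import Path

# Index every markdown file under docs/ into an "articles" table of til.db.
build(Path("docs").rglob("*.md"), "til.db", "articles")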
def get_yamldown_metadata(yamldown_dir, meta_id) -> dict:
    yamldown_md_path = os.path.join(yamldown_dir, "{}.md".format(meta_id))
    try:
        with open(yamldown_md_path, "r") as gorule_data:
            return yamldown.load(gorule_data)[0]
    except Exception as e:
        raise click.ClickException(
            "Could not find or read {}: {}".format(yamldown_md_path, e))
def read_md_file(self) -> Tuple[Dict, str]:
    yml = {}
    md = ""
    if os.path.exists(self.file_path):
        with codecs.open(self.file_path, "r", 'utf-8') as file:
            (yml, md) = yamldown.load(file)
            logging.info((yml, md))
    if yml is None:
        yml = {}
    return (yml, md)
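For reference, yamldown.load splits a front-matter document into a (metadata, markdown) pair; a minimal round trip (the sample document is an assumption):

import io

import yamldown

doc = io.StringIO("---\ntitle: Example\n---\nBody text.\n")
yml, md = yamldown.load(doc)
print(yml)  # {'title': 'Example'}
print(md)   # the remaining markdown body, roughly "Body text."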
def load_yamldown(path):
    """
    Loads a YAML file at path and returns it as a dictionary.
    """
    try:
        with open(path, "r") as f:
            return yamldown.load(f)[0]
    except Exception as e:
        raise click.ClickException(str(e))
def gorule_metadata(metadata, rule_id) -> dict:
    gorule_yamldown = os.path.join(metadata, "rules", "{}.md".format(rule_id))
    try:
        with open(gorule_yamldown, "r") as gorule_data:
            click.echo("Found {rule} at {path}".format(rule=rule_id,
                                                       path=gorule_yamldown))
            return yamldown.load(gorule_data)[0]
    except Exception as e:
        raise click.ClickException(
            "Could not find or read {}: {}".format(gorule_yamldown, str(e)))
def _read_yml_md_file(self) -> Tuple[Dict, str]:
    yml = {}
    md = ""
    if os.path.exists(self.file_path):
        with codecs.open(self.file_path, "r", 'utf-8') as file:
            if file.readline().strip() == "---":
                file.seek(0)
                (yml, md) = yamldown.load(file)
            else:
                file.seek(0)
                md = file.read()
    if yml is None:
        yml = {}
    return (yml, md)
def _read_yml_md_file(self) -> Tuple[Dict, str]:
    metadata = {}
    content = ""
    if os.path.exists(self.file_path):
        with codecs.open(self.file_path, "r", 'utf-8') as file:
            if file.readline().strip() == "---":
                file.seek(0)
                (metadata, content) = yamldown.load(file)
            else:
                file.seek(0)
                content = file.read()
    if metadata is None:
        metadata = {}
    return (metadata, content)
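The readline()/seek(0) sniff in the two snippets above exists because yamldown.load expects the document to begin with a "---" fence; files without front matter are read verbatim instead. A standalone sketch of the same guard (the helper name is hypothetical):

import io

import yamldown

def split_frontmatter(stream):
    """Return (metadata, body); metadata is {} when there is no '---' fence."""
    if stream.readline().strip() == "---":
        stream.seek(0)
        yml, md = yamldown.load(stream)
        return (yml or {}, md)
    stream.seek(0)
    return {}, stream.read()

print(split_frontmatter(io.StringIO("plain markdown, no front matter")))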
def load_yamldown(path):
    """
    Loads a YAML file at path and returns it as a dictionary.
    """
    try:
        with open(path, "r") as f:
            load = yamldown.load(f)[0]
            if load is None:
                raise click.ClickException("No rule present at {}".format(path))
            return load
    except Exception as e:
        raise click.ClickException(str(e))
def get_data_in_issue(issue):
    """ Get the YAML-structured data in an issue

    Args:
        issue (:obj:`dict`): properties of the GitHub issue for the submission

    Returns:
        :obj:`object`: YAML-structured data in an issue
    """
    body = io.StringIO(issue['body'].replace('\r', ''))

    # hack to make yamldown work with Python 3.9
    if not hasattr(yaml, 'FullLoader'):
        yaml.FullLoader = yaml.Loader

    data, _ = yamldown.load(body)
    return data
def get_simulator_submission_from_gh_issue_body(body):
    """ Get a simulator submission from the YAML-structured data in an issue

    Args:
        body (:obj:`str`): body of a GitHub issue for the submission of a simulator

    Returns:
        :obj:`SimulatorSubmission`: simulator submission
    """
    body_stream = io.StringIO(body.replace('\r', ''))

    # hack to make yamldown work with Python 3.9
    if not hasattr(yaml, 'FullLoader'):
        yaml.FullLoader = yaml.Loader

    data, _ = yamldown.load(body_stream)
    return get_simulator_submission_from_gh_issue_body_data(data)
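The yaml.FullLoader shim in the two snippets above works, presumably, because yamldown looks up yaml.FullLoader at call time; on PyYAML builds that predate FullLoader (added in PyYAML 5.1), aliasing it to the basic Loader restores compatibility. A minimal sketch under that assumption:

import io

import yaml
import yamldown

# Assumption: yamldown resolves yaml.FullLoader lazily, so this alias is
# enough on PyYAML builds that lack it.
if not hasattr(yaml, 'FullLoader'):
    yaml.FullLoader = yaml.Loader

data, _ = yamldown.load(io.StringIO("---\nid: example\n---\n"))
print(data)  # {'id': 'example'}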
def from_markdown_file(file_path, ignore_comments=True):
    sentences = []
    import yamldown
    import regex
    with codecs.open(file_path, "r", 'utf-8') as in_file_obj:
        (yml, md) = yamldown.load(in_file_obj)
        if "title" in yml:
            sentences.append(yml["title"])
        ## Treat headings as sentences.
        md = regex.sub(r"^#(.+)\s*\n", r"\1рее ", md)
        if ignore_comments:
            ## Ignore comments.
            md = regex.sub(r"\+\+\+\(.+?\)\+\+\+", "", md)
        # TODO: Process image alternate texts and captions?
        sentences.extend(from_plain_text(md))
    return sentences
def compile_file(jinja_env, filename, source_dir, destination_dir, path_list):
    path = '/'.join(path_list)
    name_extension = os.path.splitext(filename)
    if name_extension[1] == '.md':
        output_filename = f'{name_extension[0]}.html'
    else:
        output_filename = filename
    try:
        with open(os.path.join(source_dir, path, filename)) as stream:
            metadata, source_code = yamldown.load(stream)
    except UnicodeDecodeError:
        metadata = None
    if metadata:
        if name_extension[1] == '.md':
            source_code = md(source_code)
        # Two-pass render: stage 1 wraps the body in its layout, stage 2
        # resolves any template expressions left inside the body itself.
        stage1 = jinja_env.from_string(stage1_template).render(
            page=metadata,
            extends=metadata.get('template'),
            source_code=source_code)
        stage2 = jinja_env.from_string(stage1).render(page=metadata)
        with open(os.path.join(destination_dir, path, output_filename), 'w+') as wstream:
            wstream.write(stage2)
    else:
        # Not a text file with front matter: mirror the directory tree and
        # copy the file through as binary, 512 bytes at a time.
        path_so_far = destination_dir
        for part in path_list:
            path_so_far = os.path.join(path_so_far, part)
            if not os.path.exists(path_so_far):
                os.mkdir(path_so_far)
        with open(os.path.join(source_dir, path, filename), 'rb') as src_stream:
            with open(os.path.join(destination_dir, path, output_filename), 'wb+') as dest_stream:
                data = src_stream.read(512)
                while data != b'':
                    dest_stream.write(data)
                    data = src_stream.read(512)
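compile_file renders twice: stage 1 wraps the page body in a layout, and stage 2 re-renders the result so template expressions written inside the page body get resolved. A toy version of that two-pass pattern; stage1_template here is an assumption standing in for the script's real wrapper:

import jinja2

# Hypothetical wrapper: inherit from the layout named by the page metadata
# and drop the compiled body into its "body" block.
stage1_template = "{% extends layout %}{% block body %}{{ source_code }}{% endblock %}"

env = jinja2.Environment(loader=jinja2.DictLoader({
    "base.html": "<main>{% block body %}{% endblock %}</main>",
}))
page = {"template": "base.html", "title": "Demo"}

stage1 = env.from_string(stage1_template).render(
    layout=page["template"], source_code="<h1>{{ page.title }}</h1>")
# stage1 is "<main><h1>{{ page.title }}</h1></main>": expressions inside the
# body survive pass one as plain text, so pass two can resolve them.
print(env.from_string(stage1).render(page=page))  # <main><h1>Demo</h1></main>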
def main(report, template, date):
    report_json = json.load(report)
    header = sorted([{"id": dataset["id"]} for dataset in report_json],
                    key=lambda h: h["id"])

    rules_directory = os.path.normpath(
        os.path.join(os.path.dirname(this_script), "../metadata/rules"))
    # rules_descriptions maps rule ID -> rule title.
    rules_descriptions = dict()
    for rule_path in glob.glob(os.path.join(rules_directory, "gorule*.md")):
        with open(rule_path) as rule_file:
            rule = yamldown.load(rule_file)[0]
            rule_id = rule["id"].lower().replace(":", "-")
            rules_descriptions[rule_id] = rule["title"]

    # rule_by_dataset is keyed by rule ID, e.g.:
    # {
    #     "gorule-0000005": {"rule": "gorule-0000005", "level": "warning", "mgi": 20, "sgd": 11},
    #     "other": {"rule": "other", "level": "info", "mgi": 25, "sgd": 25}
    # }
    rule_by_dataset = dict()

    bootstrap_context_mapping = {
        "warning": "warning",
        "error": "danger",
        "info": "primary"
    }

    for dataset in report_json:
        for rule, messages in dataset["messages"].items():
            if rule not in rule_by_dataset:
                level = messages[0]["level"].lower() if len(messages) > 0 else "info"
                rule_by_dataset[rule] = {
                    dataset["id"]: len(messages),
                    "level": level,
                    "rule": rule
                }
            else:
                rule_by_dataset[rule][dataset["id"]] = len(messages)
                rule_by_dataset[rule]["level"] = (
                    messages[0]["level"].lower() if len(messages) > 0 else "info")

    # Add empty cells in as 0s
    for h in header:
        for rule, amounts in rule_by_dataset.items():
            if h["id"] not in amounts:
                amounts[h["id"]] = 0

    rows = sorted(rule_by_dataset.values(), key=lambda n: n["rule"])

    cells = []
    for row in rows:
        contents = []
        level = bootstrap_context_mapping[row["level"]]
        for key, val in row.items():
            if key in ("rule", "level"):
                continue
            contents.append({
                "dataset": key,
                "amount": val,
                "has-zero-messages": val == 0,
                "level": level if val > 0 else "primary"
            })
        contents = sorted(contents, key=lambda d: d["dataset"])
        cells.append({
            "rule": row["rule"],
            "title": rules_descriptions.get(row["rule"], ""),
            "messages": contents,
            "is-other": row["rule"] == "other"
        })

    rendered = pystache.render(template.read(),
                               {"header": header, "rules": cells, "date": date})
    print(rendered)
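The mustache template that pystache.render consumes above is read from an external file; a toy template matching the context keys (header, rules, date) looks roughly like this, with the markup and sample data being assumptions:

import pystache

template = (
    "Report for {{date}}\n"
    "{{#rules}}{{rule}} ({{title}}):"
    "{{#messages}} {{dataset}}={{amount}}{{/messages}}\n{{/rules}}"
)
context = {
    "date": "2024-01-01",
    "header": [{"id": "mgi"}],
    "rules": [{
        "rule": "gorule-0000005",
        "title": "Example rule",
        "messages": [{"dataset": "mgi", "amount": 3,
                      "has-zero-messages": False, "level": "warning"}],
        "is-other": False,
    }],
}
print(pystache.render(template, context))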
def main():
    """The main runner for our script."""

    ## Deal with incoming.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-d', '--directory',
                        help='The directory of the GO refs')
    parser.add_argument('-j', '--json',
                        help='JSON output file')
    parser.add_argument('-s', '--stanza',
                        help='Stanza-based output file')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='More verbose output')
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
        LOG.info('Verbose: on')

    ## Ensure directories and outputs.
    if not args.directory:
        die_screaming('need a directory argument')
    LOG.info('Will operate in: ' + args.directory)

    ## Ensure output file.
    if not args.json and not args.stanza:
        die_screaming('need an output file argument, --json or --stanza')
    if args.json:
        LOG.info('Will output JSON to: ' + args.json)
    if args.stanza:
        LOG.info('Will output stanza to: ' + args.stanza)

    ## Main data hold.
    reference_data = []

    ## Get files out of target directory, flipping the frontmatter
    ## into JSON.
    LOG.info('Globbing GO ref YAMLs in data directory: '
             + args.directory + '/go*-*.md')
    src_filenames = glob.glob(args.directory + '/go*-*.md')
    for src_filename in src_filenames:
        LOG.info('GO ref filename: ' + src_filename)
        with open(src_filename, "r") as f:
            yml, md = yamldown.load(f)

            ## Break the md into the title, abstract, and comments.
            mdj_text = pypandoc.convert_text(md, 'json', format='markdown')
            mdj = json.loads(mdj_text)
            title = 'n/a'
            abstract = 'n/a'
            comments = 'n/a'
            next_block_type = None

            ## A workaround for the change in JSON format in pandoc in
            ## 1.18; Ubuntu 16.04 uses 1.16.0.2 and 18.04 uses 1.19.2.4.
            blocks = None
            if isinstance(mdj, list):
                blocks = mdj[1]
            else:
                blocks = mdj['blocks']

            for block in blocks:
                ## If it is a header and has something there in the header.
                if block.get('t', False) == "Header":
                    if block.get('c', False) and len(block['c']) >= 2:
                        header_text = wtflist2str(block['c'][2])
                        if header_text.casefold() in ("comments", "comment"):
                            next_block_type = "comments"
                        else:
                            ## Otherwise, we're going to assume this
                            ## is an abstract.
                            title = header_text
                            next_block_type = "abstract"
                    else:
                        raise Exception("Unknown HEADER")
                elif block['t'] == "Para":
                    if block.get('c', False) and len(block['c']) > 0:
                        para_text = wtflist2str(block['c'])
                        if next_block_type == "comments":
                            comments = para_text
                        elif next_block_type == "abstract":
                            abstract = para_text
                    else:
                        raise Exception("Unknown PARA")
                else:
                    raise Exception("Unknown ENTITY")

            yml['abstract'] = abstract
            yml['comments'] = comments
            yml['title'] = title
            reference_data.append(yml)

    ## Sort by id.
    reference_data = sorted(reference_data, key=lambda k: k['id'])

    ## Final JSON writeout.
    if args.json:
        with open(args.json, 'w+') as fhandle:
            fhandle.write(json.dumps(reference_data, sort_keys=True, indent=4))

    ## Final stanza writeout.
    if args.stanza:
        with open(args.stanza, 'w+') as fhandle:
            file_cache = []
            for ref in reference_data:
                stanza_cache = []
                if ref.get('id', False):
                    stanza_cache.append('go_ref_id: ' + ref.get('id'))
                alt_ids = ref.get('alt_id', [])
                for alt_id in alt_ids:
                    stanza_cache.append('alt_id: ' + alt_id)
                if ref.get('title', False):
                    stanza_cache.append('title: ' + ref.get('title'))
                if ref.get('authors', False):
                    stanza_cache.append('authors: ' + ref.get('authors'))
                if ref.get('year', False):
                    stanza_cache.append('year: ' + str(ref.get('year')))
                external_accessions = ref.get('external_accession', [])
                for external_accession in external_accessions:
                    stanza_cache.append('external_accession: ' + external_accession)
                if ref.get('abstract', False):
                    stanza_cache.append('abstract: ' + ref.get('abstract'))
                if ref.get('comments', False):
                    stanza_cache.append('comment: ' + ref.get('comments'))
                file_cache.append("\n".join(stanza_cache))
            # `header` is the stanza-file preamble string defined at module
            # level in the original script.
            fhandle.write(header + "\n\n".join(file_cache))
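The AST walk above relies on pandoc's JSON output, where each block looks like {"t": "Header", "c": [...]}. A minimal peek at that structure (pypandoc needs the pandoc binary installed; the sample document is an assumption):

import json

import pypandoc

ast = json.loads(pypandoc.convert_text("# Title\n\nBody.", "json", format="markdown"))
# Pandoc before 1.18 returned a two-element list; later versions return a dict.
blocks = ast[1] if isinstance(ast, list) else ast["blocks"]
for block in blocks:
    print(block["t"])  # Header, then Para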
def main(report, template, date, suppress_rule_tag):
    report_json = json.load(report)
    header = sorted([{"id": dataset["id"]} for dataset in report_json],
                    key=lambda h: h["id"])

    rules_directory = os.path.normpath(
        os.path.join(os.path.dirname(this_script), "../metadata/rules"))
    # rules_descriptions maps rule ID -> {"title": rule title, "tags": list of rule tags}
    rules_descriptions = dict()
    for rule_path in glob.glob(os.path.join(rules_directory, "gorule*.md")):
        with open(rule_path) as rule_file:
            rule = yamldown.load(rule_file)[0]
            rule_id = rule["id"].lower().replace(":", "-")
            rules_descriptions[rule_id] = {
                "title": rule["title"],
                "tags": rule.get("tags", [])
            }

    # rule_by_dataset is keyed by rule ID, e.g.:
    # {
    #     "gorule-0000005": {"rule": "gorule-0000005", "level": "warning", "mgi": 20, "sgd": 11},
    #     "other": {"rule": "other", "level": "info", "mgi": 25, "sgd": 25}
    # }
    rule_by_dataset = dict()

    bootstrap_context_mapping = {
        "warning": "warning",
        "error": "danger",
        "info": "primary"
    }

    for dataset in report_json:
        for rule, messages in dataset["messages"].items():
            if any(tag in rules_descriptions.get(rule, {}).get("tags", [])
                   for tag in suppress_rule_tag):
                # Skip any rule carrying a tag we were asked to suppress.
                continue
            if rule not in rule_by_dataset:
                level = messages[0]["level"].lower() if len(messages) > 0 else "info"
                rule_by_dataset[rule] = {
                    dataset["id"]: len(messages),
                    "level": level,
                    "rule": rule
                }
            else:
                rule_by_dataset[rule][dataset["id"]] = len(messages)
                rule_by_dataset[rule]["level"] = (
                    messages[0]["level"].lower() if len(messages) > 0 else "info")

    # Add empty cells in as 0s
    for h in header:
        for rule, amounts in rule_by_dataset.items():
            if h["id"] not in amounts:
                amounts[h["id"]] = 0

    rows = sorted(rule_by_dataset.values(), key=lambda n: n["rule"])

    cells = []
    for row in rows:
        contents = []
        level = bootstrap_context_mapping[row["level"]]
        for key, val in row.items():
            if key in ("rule", "level"):
                continue
            contents.append({
                "dataset": key,
                "amount": val,
                "has-zero-messages": val == 0,
                "level": level if val > 0 else "primary"
            })
        contents = sorted(contents, key=lambda d: d["dataset"])
        cells.append({
            "rule": row["rule"],
            "title": rules_descriptions.get(row["rule"], {}).get("title", ""),
            "messages": contents,
            "is-other": row["rule"] == "other"
        })

    rendered = pystache.render(template.read(),
                               {"header": header, "rules": cells, "date": date})
    print(rendered)
def main(report, template, date, suppress_rule_tag):
    # Make the input json look more like the "combined report" from reports-page-gen.py
    report_json = json.load(report)
    report_json["id"] = "gocam"
    report_json = [report_json]

    # header: [{"id": "mgi"}, {"id": "goa_chicken"}, ...]
    header = sorted([{"id": dataset["id"]} for dataset in report_json],
                    key=lambda h: h["id"])

    rules_directory = os.path.normpath(
        os.path.join(os.path.dirname(this_script), "../metadata/rules"))
    # rules_descriptions maps rule ID -> {"title": rule title, "tags": list of rule tags}
    rules_descriptions = dict()
    for rule_path in glob.glob(os.path.join(rules_directory, "gorule*.md")):
        with open(rule_path) as rule_file:
            rule = yamldown.load(rule_file)[0]
            rule_id = rule["id"].lower().replace(":", "-")
            rules_descriptions[rule_id] = {
                "title": rule["title"],
                "tags": rule.get("tags", [])
            }

    # rule_by_dataset is keyed by rule ID, e.g.:
    # {
    #     "gorule-0000005": {"rule": "gorule-0000005", "level": "error", "mgi": 20, "sgd": 11, "wb": 300},
    #     "other": {"rule": "other", "level": "info", "mgi": 25, "sgd": 25, "wb": 33}
    # }
    rule_by_dataset = dict()

    bootstrap_context_mapping = {
        "warning": "warning",
        "error": "danger",
        "info": "primary"
    }

    for dataset in report_json:
        # rule: rule ID; messages: list of each message from parsing
        for rule, messages in dataset["messages"].items():
            if any(tag in rules_descriptions.get(rule, {}).get("tags", [])
                   for tag in suppress_rule_tag):
                # Skip any rule carrying a tag we were asked to suppress.
                continue
            # If we haven't seen the rule yet, record the message count,
            # level, and rule ID, keyed to the rule ID.
            if rule not in rule_by_dataset:
                level = messages[0]["level"].lower() if len(messages) > 0 else "info"
                rule_by_dataset[rule] = {
                    dataset["id"]: len(messages),
                    "level": level,
                    "rule": rule
                }
            else:
                rule_by_dataset[rule][dataset["id"]] = len(messages)
                # `level` can only escalate: info is replaced by warning or
                # error, and warning is replaced only by error.
                if (rule_by_dataset[rule]["level"] == "info" and len(messages) > 0
                        and messages[0]["level"].lower() in ["error", "warning"]):
                    rule_by_dataset[rule]["level"] = messages[0]["level"].lower()
                elif (rule_by_dataset[rule]["level"] == "warning" and len(messages) > 0
                        and messages[0]["level"].lower() == "error"):
                    rule_by_dataset[rule]["level"] = "error"

    # Add empty cells in as 0s
    for h in header:  # h: {"id": "mgi"}
        for rule, amounts in rule_by_dataset.items():
            # rule: "gorule-0000006"; amounts: {"mgi": 20, "sgd": 11, ...}
            # If the dataset name is not present in amounts, add it as 0.
            if h["id"] not in amounts:
                amounts[h["id"]] = 0

    # Sort the rule -> {dataset: message count} mapping by rule ID.
    rows = sorted(rule_by_dataset.values(), key=lambda n: n["rule"])

    # Each "cell" below is actually a row in the table.
    # Each `v` below is one cell's contents along the row.
    cells = []
    for row in rows:
        contents = []
        level = bootstrap_context_mapping[row["level"]]
        for key, val in row.items():
            if key in ("rule", "level"):
                continue
            v = {
                "dataset": key,
                "amount": val,
                "has-zero-messages": val == 0,
                "level": level if val > 0 else "primary"
            }
            contents.append(v)
        contents = sorted(contents, key=lambda d: d["dataset"])
        cell = {
            "rule": row["rule"],
            "title": rules_descriptions.get(row["rule"], {}).get("title", ""),
            "messages": contents,
            "is-other": row["rule"] == "other"
        }
        cells.append(cell)

    rendered = pystache.render(template.read(),
                               {"header": header, "rules": cells, "date": date})
    print(rendered)