def read_yaml(yaml_file):
    """Read a YAML file and return its contents as a dictionary.

    Args:
        yaml_file: Path to the YAML file to read.

    Returns:
        The parsed contents, or an empty dict if the file is empty.

    Raises:
        util.SparvErrorMessage: If the file cannot be found or parsed.
    """
    try:
        with open(yaml_file) as f:
            # FullLoader supports the full YAML language but does not construct
            # arbitrary Python objects (unlike yaml.UnsafeLoader).
            data = yaml.load(f, Loader=yaml.FullLoader)
    except yaml.YAMLError as e:
        # Catch the PyYAML base exception instead of only ScannerError, so that
        # parser errors (yaml.parser.ParserError etc.) also produce a friendly
        # message instead of an unhandled traceback.
        raise util.SparvErrorMessage(
            "An error occurred while reading the configuration file:\n" + str(e))
    except FileNotFoundError:
        raise util.SparvErrorMessage(
            f"Could not find the config file '{yaml_file}'")
    return data or {}
def validate_config(config_dict=None, structure=None, parent=""):
    """Make sure the corpus config doesn't contain invalid keys."""
    # On the initial call, fall back to the global config and declared structure.
    if not config_dict:
        config_dict = config
    if not structure:
        structure = config_structure

    for key in config_dict:
        path = f"{parent}.{key}" if parent else key

        if key in structure:
            # Known key: recurse into subsections unless this is a declared leaf.
            if not structure[key].get("_source"):
                validate_config(config_dict[key], structure[key], path)
            continue

        # Unknown key: report it, with a module-aware message when nested.
        if parent:
            module_name = parent.split(".", 1)[0]
            raise util.SparvErrorMessage(
                f"Unknown key in config file: '{path}'. The module '{module_name}' "
                f"doesn't have an option with that name.", module="sparv", function="config")
        raise util.SparvErrorMessage(
            f"Unknown key in config file: '{path}'. No module with that name found.",
            module="sparv", function="config")
def handle_document_annotation():
    """Copy document annotation to text class."""
    doc_elem = get("import.document_annotation")
    text_class = get("classes.text")

    # If both classes.text and import.document_annotation are set they must agree.
    if doc_elem and text_class and text_class != doc_elem:
        raise util.SparvErrorMessage(
            "The config keys 'classes.text' and 'import.document_annotation' can't have different values.",
            "sparv", "config")

    # Propagate the document annotation to classes.text when available.
    if doc_elem:
        set_default("classes.text", doc_elem)
def get_source_files(source_files) -> List[str]:
    """Get list of all available source files.

    If a non-empty list is passed in, it is returned unchanged; otherwise the
    source directory is scanned using the importer's file extension.
    """
    if source_files:
        return source_files

    importer_setting = sparv_config.get("import.importer")
    if not importer_setting:
        raise util.SparvErrorMessage(
            "The config variable 'import.importer' must not be empty.", "sparv")

    importer_module, _, importer_function = importer_setting.partition(":")
    try:
        file_extension = registry.modules[importer_module].functions[importer_function]["file_extension"]
    except KeyError:
        raise util.SparvErrorMessage(
            "Could not find the importer '{}'. Make sure the 'import.importer' config value refers to an "
            "existing importer.".format(sparv_config.get("import.importer")), "sparv")

    # Collect every file in the source dir matching the importer's extension.
    pattern = Path(get_source_path(), "{file}." + file_extension)
    return [match[1][0] for match in snakemake.utils.listfiles(pattern)]
def validate_module_config():
    """Make sure that modules don't try to access undeclared config keys."""
    for config_key, annotators in config_usage.items():
        try:
            _get(config_key, config_structure)
        except KeyError:
            # Build a grammatically correct message for one or several annotators.
            plural = len(annotators) > 1
            raise util.SparvErrorMessage(
                "The annotator{} {} {} trying to access the config key '{}' which isn't declared anywhere."
                .format("s" if plural else "", ", ".join(annotators),
                        "are" if plural else "is", config_key), "sparv", "config")
def vrt_scrambled( doc: Document = Document(), out: Export = Export("vrt_scrambled/{doc}.vrt"), chunk: Annotation = Annotation("[cwb.scramble_on]"), chunk_order: Annotation = Annotation( "[cwb.scramble_on]:misc.number_random"), token: Annotation = Annotation("<token>"), word: Annotation = Annotation("[export.word]"), annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"), source_annotations: SourceAnnotations = SourceAnnotations( "cwb.source_annotations"), remove_namespaces: bool = Config("export.remove_module_namespaces", False), sparv_namespace: str = Config("export.sparv_namespace"), source_namespace: str = Config("export.source_namespace")): """Export annotations to vrt in scrambled order.""" # Get annotation spans, annotations list etc. annotation_list, token_attributes, export_names = util.get_annotation_names( annotations, source_annotations, doc=doc, token_name=token.name, remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace) if chunk not in annotation_list: raise util.SparvErrorMessage( "The annotation used for scrambling ({}) needs to be included in the output." .format(chunk)) span_positions, annotation_dict = util.gather_annotations( annotation_list, export_names, doc=doc, split_overlaps=True) # Read words and document ID word_annotation = list(word.read()) chunk_order_data = list(chunk_order.read()) # Reorder chunks and open/close tags in correct order new_span_positions = util.scramble_spans(span_positions, chunk.name, chunk_order_data) # Make vrt format vrt_data = create_vrt(new_span_positions, token.name, word_annotation, token_attributes, annotation_dict, export_names) # Create export dir os.makedirs(os.path.dirname(out), exist_ok=True) # Write result to file with open(out, "w") as f: f.write(vrt_data) log.info("Exported: %s", out)
def install_json(jsonfile: ExportInput = ExportInput("[metadata.id].json"),
                 out: OutputCommonData = OutputCommonData("sbx_metadata.install_json_export_marker"),
                 export_path: str = Config("sbx_metadata.json_export_path"),
                 host: str = Config("sbx_metadata.json_export_host")):
    """Copy JSON metadata to remote host."""
    if not host:
        raise util.SparvErrorMessage(
            "'sbx_metadata.json_export_host' not set! JSON export not installed.")
    # Install under the same file name as the local export.
    target_path = os.path.join(export_path, Path(jsonfile).name)
    util.install_file(host, jsonfile, target_path)
    # Write an empty marker so the pipeline can track that installation succeeded.
    out.write("")
def install_metashare(xmlfile: ExportInput = ExportInput("sbx_metadata/[metadata.id].xml"),
                      out: OutputCommonData = OutputCommonData("sbx_metadata.install_metashare_marker"),
                      export_path: str = Config("sbx_metadata.metashare_path"),
                      host: str = Config("sbx_metadata.metashare_host")):
    """Copy META-SHARE file to remote host."""
    if not host:
        raise util.SparvErrorMessage(
            "'sbx_metadata.metashare_host' not set! META-SHARE export not installed.")
    # Install under the same file name as the local export.
    target_path = os.path.join(export_path, Path(xmlfile).name)
    util.install_file(host, xmlfile, target_path)
    # Write an empty marker so the pipeline can track that installation succeeded.
    out.write("")
def replace_list(chunk: Annotation, out: Output, find: str = "", sub: str = ""):
    """Find and replace annotations. Find string must match whole annotation.

    find and sub are whitespace separated lists of words to replace and their replacement.
    """
    find_words = find.split()
    sub_words = sub.split()
    if len(find_words) != len(sub_words):
        raise util.SparvErrorMessage(
            "Find and sub must have the same number of words.")
    # Pair each word to find with its replacement.
    mapping = dict(zip(find_words, sub_words))
    out.write(mapping.get(value, value) for value in chunk.read())
def make_standard_xml_export(xml_export, corpus_id: str):
    """Make license info object for standard XML export."""
    # Nothing to do when the XML export is disabled.
    if not xml_export:
        return None

    if xml_export not in ("scrambled", "original"):
        raise util.SparvErrorMessage(
            f"Invalid config value for sbx_metadata.xml_export: '{xml_export}'. "
            "Possible values: 'scrambled', 'original', False")

    item = {
        "licence": "CC-BY",
        "restriction": "attribution",
        "download": f"{MENINGSMANGDER_URL}/{corpus_id}.xml.bz2",
        "type": "corpus",
        "format": "XML",
    }
    if xml_export == "scrambled":
        item["info"] = "this file contains a scrambled version of the corpus"
    return item
def load_config(config_file: Optional[str], config_dict: Optional[dict] = None) -> None: """Load both default config and corpus config and merge into one config structure. Args: config_file: Path to corpus config file. If None, only the default config is read. config_dict: Get corpus config from dictionary instead of config file. """ # Read default config if DEFAULT_CONFIG.is_file(): default_config = read_yaml(DEFAULT_CONFIG) else: log.warning( "Default config file is missing: {}".format(DEFAULT_CONFIG)) default_config = {} default_classes = default_config.get("classes", {}) if config_file: # Read corpus config global config_user config_user = read_yaml(config_file) or {} def handle_parents(cfg, current_dir="."): """Combine parent configs recursively.""" combined_parents = {} if cfg.get(PARENT): parents = cfg[PARENT] if isinstance(parents, str): parents = [parents] for parent in parents: parent_path = Path(current_dir, parent) config_parent = read_yaml(parent_path) config_parent = handle_parents(config_parent, parent_path.parent) combined_parents = _merge_dicts(config_parent, combined_parents) cfg = _merge_dicts(cfg, combined_parents) return cfg # If parent configs are specified, inherit their contents config_user = handle_parents(config_user) elif config_dict: config_user = config_dict else: config_user = {} user_classes = config_user.get("classes", {}) # Merge default and corpus config and save to global config variable global config config = _merge_dicts(copy.deepcopy(config_user), default_config) # Set correct classes and annotations from presets apply_presets(user_classes, default_classes) if config_file: handle_document_annotation() # Make sure that the root level only contains dictionaries or lists to save us a lot of headache for key in config: if key == PARENT: continue if not isinstance(config[key], (dict, list)): raise util.SparvErrorMessage( f"The config section '{key}' could not be parsed.", module="sparv", function="config")
def preserved_format(doc: Document = Document(),
                     text: Text = Text(),
                     docid: AnnotationData = AnnotationData("<docid>"),
                     out: Export = Export("xml_preserved_format/[xml_export.filename_formatted]"),
                     annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
                     source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
                     header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
                     remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                     sparv_namespace: str = Config("export.sparv_namespace"),
                     source_namespace: str = Config("export.source_namespace"),
                     include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to XML in export_dir and keep whitespaces and indentation from original file.

    Args:
        doc: Name of the original document.
        text: The corpus text.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to
            include in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and
            attribute names. Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are
            empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read corpus text and document ID
    corpus_text = text.read()
    docid = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(
        annotation_list, export_names, h_annotations, doc=doc, flatten=False, split_overlaps=True)
    # Flatten the position -> spans mapping into a single ordered list of
    # (position, open/close instruction, span) triples.
    sorted_positions = [(pos, span[0], span[1]) for pos, spans in sorted(span_positions.items())
                        for span in spans]

    # Root tag sanity check
    if not xml_utils.valid_root(sorted_positions[0], sorted_positions[-1]):
        raise util.SparvErrorMessage(
            "Root tag is missing! If you have manually specified which elements to include, "
            "make sure to include an element that encloses all other included elements and "
            "text content.")

    # Create root node
    root_span = sorted_positions[0][2]
    root_span.set_node()
    node_stack = []
    last_pos = 0  # Keeps track of the position of the processed text

    for x, (_pos, instruction, span) in enumerate(sorted_positions):
        # Open node: Create child node under the top stack node
        if instruction == "open":
            # Set tail for previous node if necessary
            if last_pos < span.start:
                # Get last closing node in this position
                _, tail_span = [i for i in span_positions[last_pos] if i[0] == "close"][-1]
                tail_span.node.tail = corpus_text[last_pos:span.start]
                last_pos = span.start

            # Handle headers
            if span.is_header:
                header = annotation_dict[span.name][util.HEADER_CONTENTS][span.index]
                header_xml = etree.fromstring(header)
                header_xml.tag = span.export  # Rename element if needed
                span.node = header_xml
                node_stack[-1].node.append(header_xml)
            else:
                if node_stack:  # Don't create root node, it already exists
                    span.set_node(parent_node=node_stack[-1].node)

                xml_utils.add_attrs(span.node, span.name, annotation_dict, export_names,
                                    span.index, include_empty_attributes)
                if span.overlap_id:
                    # Mark split-up overlapping spans with a document-unique ID.
                    if sparv_namespace:
                        span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}",
                                      f"{docid}-{span.overlap_id}")
                    else:
                        span.node.set(f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}",
                                      f"{docid}-{span.overlap_id}")
                node_stack.append(span)

                # Set text if there should be any between this node and the next one
                next_item = sorted_positions[x + 1]
                if next_item[1] == "open" and next_item[2].start > span.start:
                    span.node.text = corpus_text[last_pos:next_item[2].start]
                    last_pos = next_item[2].start

        # Close node
        else:
            if span.is_header:
                continue
            if last_pos < span.end:
                # Set node text if necessary
                if span.start == last_pos:
                    span.node.text = corpus_text[last_pos:span.end]
                # Set tail for previous node if necessary
                else:
                    # Get last closing node in this position
                    _, tail_span = [i for i in span_positions[last_pos] if i[0] == "close"][-1]
                    tail_span.node.tail = corpus_text[last_pos:span.end]
                last_pos = span.end

            # Make sure closing node == top stack node
            assert span == node_stack[-1], "Overlapping elements found: {}".format(node_stack[-2:])
            # Pop stack and move on to next span
            node_stack.pop()

    # Write xml to file
    etree.ElementTree(root_span.node).write(out, encoding="unicode", method="xml",
                                            xml_declaration=True)
    log.info("Exported: %s", out)
def __init__(self, elements: list, skip: list, header_elements: list, headers: list,
             encoding: str = util.UTF8, source_dir: str = "src", prefix: str = "",
             keep_control_chars: bool = True, normalize: str = "NFC"):
    """Initialize XML parser.

    Args:
        elements: Elements (and attributes) to parse, possibly with rename targets.
        skip: Elements/attributes to skip; must be disjoint from `elements`.
        header_elements: Header elements to extract.
        headers: Header binding specifications of the form "source as target".
        encoding: Text encoding of the source files.
        source_dir: Directory containing the source files.
        prefix: Prefix to add to annotation names.
        keep_control_chars: Whether to keep control characters in the text.
        normalize: Unicode normalization form to apply to the text.
    """
    self.source_dir = source_dir
    self.encoding = encoding
    self.keep_control_chars = keep_control_chars
    self.normalize = normalize
    self.doc = None
    self.prefix = prefix
    self.header_elements = header_elements
    self.headers = {}
    self.pos = 0  # Current position in the text data
    self.subpos = 0  # Sub-position for tags with same position
    self.tagstack = []
    self.targets = {}  # Index of elements and attributes that will be renamed during import
    self.data = {}  # Metadata collected during parsing
    self.text = []  # Text data of the document collected during parsing

    # Parse elements argument
    def elsplit(elem):
        """Split element and attribute, honouring escaped colons (r'\\:')."""
        elem = elem.replace(r"\:", ";")
        tag, _, attr = elem.partition(":")
        tag = tag.replace(";", ":")
        attr = attr.replace(";", ":")
        return tag, attr

    all_elems = set()
    renames = {}
    # Element list needs to be sorted to handle plain elements before attributes
    for element, target in sorted(util.parse_annotation_list(elements)):
        element, attr = elsplit(element)
        all_elems.add((element, attr))

        if target:
            # Element and/or attribute should be renamed during import
            if not attr:
                renames[element] = target
                target_element = target
                target_attr = ""
            else:
                # Attribute rename: the element itself may already be renamed.
                target_element = renames.get(element, element)
                target_attr = target
            self.targets.setdefault(element, {"attrs": {}})
            self.targets[element]["target"] = target_element
            self.data.setdefault(target_element, {"attrs": set(), "elements": []})
            if target_attr:
                self.targets[element]["attrs"][attr] = target_attr
                self.data[target_element]["attrs"].add(target_attr)
        else:
            self.data.setdefault(element, {"attrs": set(), "elements": []})
            if attr:
                self.data[element]["attrs"].add(attr)

    # Parse the header binding specifications ("source as target").
    for header in headers:
        header_source, _, header_target = header.partition(" as ")
        if not header_target:
            raise util.SparvErrorMessage(
                "The header '{}' needs to be bound to a target element.".format(header))
        header_source, _, header_source_attrib = header_source.partition(":")
        header_source_root, _, header_source_rest = header_source.partition("/")
        self.headers.setdefault(header_source_root, {})
        self.headers[header_source_root].setdefault(header_source_rest, [])
        self.headers[header_source_root][header_source_rest].append({
            "source": header_source_attrib,
            "target": elsplit(header_target)
        })

    self.skipped_elems = set(elsplit(elem) for elem in skip)
    assert self.skipped_elems.isdisjoint(all_elems), "skip and elements must be disjoint"
def make_pretty_xml(span_positions, annotation_dict, export_names, token_name: str,
                    word_annotation, docid, include_empty_attributes: bool,
                    sparv_namespace: Optional[str] = None):
    """Create a pretty formatted XML string from span_positions.

    Used by pretty and sentence_scrambled.

    Args:
        span_positions: Ordered list of (position, open/close instruction, span) triples.
        annotation_dict: Mapping from annotation name to annotation data.
        export_names: Mapping from annotation names to export names.
        token_name: Name of the token annotation.
        word_annotation: Word (token text) values, indexed by token index.
        docid: Document ID (used for overlap attributes).
        include_empty_attributes: Whether to include attributes even when empty.
        sparv_namespace: Optional namespace for Sparv's overlap attribute.

    Returns:
        The XML document as a string, including an XML declaration.
    """
    # Root tag sanity check
    if not valid_root(span_positions[0], span_positions[-1]):
        raise util.SparvErrorMessage(
            "Root tag is missing! If you have manually specified which elements to include, "
            "make sure to include an element that encloses all other included elements and "
            "text content.")

    # Create root node
    root_span = span_positions[0][2]
    root_span.set_node()
    add_attrs(root_span.node, root_span.name, annotation_dict, export_names, 0,
              include_empty_attributes)
    node_stack = [root_span]

    # State for distributing token text over sub-token elements.
    last_start_pos = None
    last_end_pos = -1
    current_token_text = None
    last_node = None
    inside_token = False

    def handle_subtoken_text(position, last_start_position, last_end_position, node, token_text):
        """Handle text for subtoken elements.

        Assigns the consumed part of the token text to the given node (as text
        or tail) and returns the remaining, unconsumed token text.
        """
        if last_start_position < last_end_position < position:
            node.tail = token_text[:position - last_end_position]
            token_text = token_text[position - last_end_position:]
        elif position > last_start_position:
            node.text = token_text[:position - last_start_position]
            token_text = token_text[position - last_start_position:]
        return token_text

    # Go through span_positions and build xml tree
    for _pos, instruction, span in span_positions[1:]:
        # Handle headers
        if span.is_header:
            if instruction == "open":
                header = annotation_dict[span.name][util.HEADER_CONTENTS][span.index]
                # Replace any leading tabs with spaces
                header = re.sub(r"^\t+", lambda m: INDENTATION * len(m.group()), header,
                                flags=re.MULTILINE)
                header_xml = etree.fromstring(header)
                header_xml.tag = span.export  # Rename element if needed
                node_stack[-1].node.append(header_xml)
            continue

        # Create child node under the top stack node
        if instruction == "open":
            span.set_node(parent_node=node_stack[-1].node)
            node_stack.append(span)
            add_attrs(span.node, span.name, annotation_dict, export_names, span.index,
                      include_empty_attributes)
            if span.overlap_id:
                # Mark split-up overlapping spans with a document-unique ID.
                if sparv_namespace:
                    span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}",
                                  f"{docid}-{span.overlap_id}")
                else:
                    span.node.set(f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}",
                                  f"{docid}-{span.overlap_id}")

            # Add text if this node is a token
            if span.name == token_name:
                inside_token = True
                # Save text until later
                last_start_pos = span.start
                current_token_text = word_annotation[span.index]

            if inside_token and current_token_text:
                current_token_text = handle_subtoken_text(span.start, last_start_pos,
                                                          last_end_pos, last_node,
                                                          current_token_text)
                last_start_pos = span.start
                last_node = span.node

        # Close node
        else:
            if inside_token and current_token_text:
                current_token_text = handle_subtoken_text(span.end, last_start_pos,
                                                          last_end_pos, last_node,
                                                          current_token_text)
                last_end_pos = span.end
                last_node = span.node
            if span.name == token_name:
                inside_token = False

            # Make sure closing node == top stack node
            assert span == node_stack[-1], "Overlapping elements found: {}".format(node_stack[-2:])
            # Pop stack and move on to next span
            node_stack.pop()

    # Pretty formatting of XML tree
    indent(root_span.node)

    # We use write() instead of tostring() here to be able to get an XML declaration
    stream = io.StringIO()
    etree.ElementTree(root_span.node).write(stream, encoding="unicode", method="xml",
                                            xml_declaration=True)
    return stream.getvalue()
def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage,
                config_missing: bool = False,
                custom_rule_obj: Optional[dict] = None) -> bool:
    """Populate rule with Snakemake input, output and parameter list.

    Return True if a Snakemake rule should be created.

    Args:
        rule: Object containing snakemake rule parameters.
        config: Dictionary containing the corpus configuration.
        storage: Object for saving information for all rules.
        config_missing: True if there is no corpus config file.
        custom_rule_obj: Custom annotation dictionary from corpus config.
    """
    # Only create certain rules when config is missing
    if config_missing and not rule.modelbuilder:
        return False

    # Skip any annotator that is not available for the selected corpus language
    if rule.annotator_info["language"] and sparv_config.get("metadata.language") and \
            sparv_config.get("metadata.language") not in rule.annotator_info["language"]:
        return False

    # Get this function's parameters
    params = OrderedDict(inspect.signature(rule.annotator_info["function"]).parameters)
    param_dict = make_param_dict(params)

    if rule.importer:
        # Importers read source files matching their registered file extension.
        rule.inputs.append(Path(get_source_path(), "{doc}." + rule.file_extension))
        storage.all_importers.setdefault(rule.module_name, {}).setdefault(
            rule.f_name, {"description": rule.description, "params": param_dict})
        if rule.target_name == sparv_config.get("import.importer"):
            # The selected importer always generates the corpus text file
            rule.outputs.append(paths.work_dir / "{doc}" / io.TEXT_FILE)
            # If importer guarantees other outputs, add them to outputs list
            if rule.import_outputs:
                if isinstance(rule.import_outputs, Config):
                    rule.import_outputs = sparv_config.get(rule.import_outputs,
                                                           rule.import_outputs.default)
                annotations_ = set()
                renames = {}
                # Annotation list needs to be sorted to handle plain annotations before attributes
                for ann, target in sorted(util.parse_annotation_list(rule.import_outputs)):
                    # Handle annotations renamed during import
                    if target:
                        source_ann, source_attr = BaseAnnotation(ann).split()
                        if not source_attr:
                            renames[ann] = target
                            ann = target
                        else:
                            ann = io.join_annotation(renames.get(source_ann, source_ann), target)
                    annotations_.add(ann)

                for element in annotations_:
                    rule.outputs.append(paths.work_dir / get_annotation_path(element))

            # If import.document_annotation has been specified, add it to outputs if not already there
            if sparv_config.get("import.document_annotation"):
                doc_ann_file = paths.work_dir / get_annotation_path(
                    sparv_config.get("import.document_annotation"))
                if doc_ann_file not in rule.outputs:
                    rule.outputs.append(doc_ann_file)

    if rule.exporter:
        storage.all_exporters.setdefault(rule.module_name, {}).setdefault(
            rule.f_name, {"description": rule.description, "params": param_dict})
    elif rule.installer:
        storage.all_installers.setdefault(rule.module_name, {}).setdefault(
            rule.f_name, {"description": rule.description, "params": param_dict})

    output_dirs = set()  # Directories where export files are stored
    custom_params = set()
    if custom_rule_obj:
        if custom_rule_obj.get("params"):
            name_custom_rule(rule, storage)
            custom_params = set(custom_rule_obj.get("params").keys())
        else:
            # This rule has already been populated, so don't process it again
            return False

    # Go though function parameters and handle based on type
    for param_name, param in params.items():
        param_default_empty = param.default == inspect.Parameter.empty
        param_value: Any

        # Get parameter value, either from custom rule object or default value
        if custom_rule_obj:
            if param_name in custom_rule_obj["params"]:
                param_value = custom_rule_obj["params"][param_name]
                custom_params.remove(param_name)
            elif not param_default_empty:
                param_value = copy.deepcopy(param.default)
            else:
                raise util.SparvErrorMessage(
                    f"Parameter '{param_name}' in custom rule '{rule.full_name}' has no value!",
                    "sparv", "config")
        else:
            if param_default_empty:
                # This is probably an unused custom rule, so don't process it any further,
                # but save it in all_custom_annotators and all_annotators
                storage.all_custom_annotators.setdefault(rule.module_name, {}).setdefault(
                    rule.f_name, {"description": rule.description, "params": param_dict})
                storage.custom_targets.append((rule.target_name, rule.description))
                storage.all_annotators.setdefault(rule.module_name, {}).setdefault(
                    rule.f_name, {"description": rule.description, "annotations": [],
                                  "params": param_dict})
                return False
            else:
                param_value = copy.deepcopy(param.default)

        param_type, param_list, param_optional = registry.get_type_hint_type(param.annotation)

        # Output
        if issubclass(param_type, BaseOutput):
            if not isinstance(param_value, BaseOutput):
                if not param_value:
                    return False
                param_value = param_type(param_value)
            rule.configs.update(registry.find_config_variables(param_value.name))
            rule.classes.update(registry.find_classes(param_value.name))
            missing_configs = param_value.expand_variables(rule.full_name)
            rule.missing_config.update(missing_configs)
            ann_path = get_annotation_path(param_value, data=param_type.data,
                                           common=param_type.common)
            if param_type.all_docs:
                # One output per source document
                rule.outputs.extend(map(Path, expand(
                    escape_wildcards(paths.work_dir / ann_path),
                    doc=get_source_files(storage.source_files))))
            elif param_type.common:
                rule.outputs.append(paths.work_dir / ann_path)
                if rule.installer:
                    storage.install_outputs[rule.target_name].append(paths.work_dir / ann_path)
            else:
                rule.outputs.append(get_annotation_path(param_value, data=param_type.data))
            rule.parameters[param_name] = param_value
            if "{" in param_value:
                rule.wildcard_annotations.append(param_name)
            if rule.annotator:
                storage.all_annotators.setdefault(rule.module_name, {}).setdefault(
                    rule.f_name, {"description": rule.description, "annotations": [],
                                  "params": param_dict})
                storage.all_annotators[rule.module_name][rule.f_name]["annotations"].append(
                    (param_value, param_value.description))

        # ModelOutput
        elif param_type == ModelOutput:
            rule.configs.update(registry.find_config_variables(param_value.name))
            rule.classes.update(registry.find_classes(param_value.name))
            rule.missing_config.update(param_value.expand_variables(rule.full_name))
            model_path = param_value.path
            rule.outputs.append(model_path)
            rule.parameters[param_name] = ModelOutput(str(model_path))
            storage.model_outputs.append(model_path)

        # Annotation
        elif issubclass(param_type, BaseAnnotation):
            if not isinstance(param_value, BaseAnnotation):
                if not param_value:
                    return False
                param_value = param_type(param_value)
            rule.configs.update(registry.find_config_variables(param_value.name))
            rule.classes.update(registry.find_classes(param_value.name))
            missing_configs = param_value.expand_variables(rule.full_name)
            if (not param_value or missing_configs) and param_optional:
                # Optional annotation that couldn't be resolved: pass None instead.
                rule.parameters[param_name] = None
                continue
            rule.missing_config.update(missing_configs)
            ann_path = get_annotation_path(param_value, data=param_type.data,
                                           common=param_type.common)
            if param_type.all_docs:
                rule.inputs.extend(expand(escape_wildcards(paths.work_dir / ann_path),
                                          doc=get_source_files(storage.source_files)))
            elif rule.exporter or rule.installer or param_type.common:
                rule.inputs.append(paths.work_dir / ann_path)
            else:
                rule.inputs.append(ann_path)
            rule.parameters[param_name] = param_value
            if "{" in param_value:
                rule.wildcard_annotations.append(param_name)

        # ExportAnnotations
        elif param_type in (ExportAnnotations, ExportAnnotationsAllDocs):
            if not isinstance(param_value, param_type):
                param_value = param_type(param_value)
            rule.parameters[param_name] = param_value
            source = param.default.config_name
            annotations = sparv_config.get(f"{source}", [])
            if not annotations:
                rule.missing_config.add(f"{source}")
            export_annotations = util.parse_annotation_list(annotations,
                                                            add_plain_annotations=False)
            annotation_type = Annotation if param_type == ExportAnnotations else AnnotationAllDocs
            plain_annotations = set()
            possible_plain_annotations = []
            for i, (export_annotation_name, export_name) in enumerate(export_annotations):
                annotation = annotation_type(export_annotation_name)
                rule.configs.update(registry.find_config_variables(annotation.name))
                rule.classes.update(registry.find_classes(annotation.name))
                rule.missing_config.update(annotation.expand_variables(rule.full_name))
                export_annotations[i] = (annotation, export_name)
                plain_name, attr = annotation.split()
                if not attr:
                    plain_annotations.add(plain_name)
                else:
                    if plain_name not in possible_plain_annotations:
                        possible_plain_annotations.append(plain_name)
            # Add plain annotations where needed
            for a in possible_plain_annotations:
                if a not in plain_annotations:
                    export_annotations.append((annotation_type(a), None))

            for annotation, export_name in export_annotations:
                if param.default.is_input:
                    if param_type == ExportAnnotationsAllDocs:
                        rule.inputs.extend(expand(
                            escape_wildcards(paths.work_dir / get_annotation_path(annotation.name)),
                            doc=get_source_files(storage.source_files)))
                    else:
                        rule.inputs.append(paths.work_dir / get_annotation_path(annotation.name))
                rule.parameters[param_name].append((annotation, export_name))

        # SourceAnnotations
        elif param_type == SourceAnnotations:
            rule.parameters[param_name] = sparv_config.get(f"{param.default.config_name}", None)

        # Corpus
        elif param.annotation == Corpus:
            rule.parameters[param_name] = Corpus(sparv_config.get("metadata.id"))

        # Language
        elif param.annotation == Language:
            rule.parameters[param_name] = Language(sparv_config.get("metadata.language"))

        # Document
        elif param.annotation == Document:
            rule.docs.append(param_name)

        # AllDocuments (all source documents)
        elif param_type == AllDocuments:
            rule.parameters[param_name] = AllDocuments(get_source_files(storage.source_files))

        # Text
        elif param_type == Text:
            text_path = Path("{doc}") / io.TEXT_FILE
            if rule.exporter or rule.installer:
                rule.inputs.append(paths.work_dir / text_path)
            else:
                rule.inputs.append(text_path)
            rule.parameters[param_name] = param_value

        # Model
        elif param_type == Model:
            if param_value is not None:
                if param_list:
                    rule.parameters[param_name] = []
                    for model in param_value:
                        if not isinstance(model, Model):
                            # NOTE(review): Model(param_value) here looks suspicious —
                            # possibly should be Model(model); verify against upstream.
                            model = Model(param_value)
                        rule.configs.update(registry.find_config_variables(model.name))
                        rule.classes.update(registry.find_classes(model.name))
                        rule.missing_config.update(model.expand_variables(rule.full_name))
                        rule.inputs.append(model.path)
                        rule.parameters[param_name].append(Model(str(model.path)))
                else:
                    if not isinstance(param_value, Model):
                        param_value = Model(param_value)
                    rule.configs.update(registry.find_config_variables(param_value.name))
                    rule.classes.update(registry.find_classes(param_value.name))
                    rule.missing_config.update(param_value.expand_variables(rule.full_name))
                    rule.inputs.append(param_value.path)
                    rule.parameters[param_name] = Model(str(param_value.path))

        # Binary
        elif param.annotation in (Binary, BinaryDir):
            rule.configs.update(registry.find_config_variables(param.default))
            rule.classes.update(registry.find_classes(param.default))
            param_value, missing_configs = registry.expand_variables(param.default, rule.full_name)
            rule.missing_config.update(missing_configs)
            binary = util.find_binary(param_value, executable=False,
                                      allow_dir=param.annotation == BinaryDir)
            if not binary:
                rule.missing_binaries.add(param_value)
            # Fall back to the unexpanded value so the missing binary shows up in inputs.
            binary = Path(binary if binary else param_value)
            rule.inputs.append(binary)
            rule.parameters[param_name] = param.annotation(binary)

        # Source
        elif param.annotation == Source:
            rule.parameters[param_name] = Source(get_source_path())

        # Export
        elif param.annotation == Export:
            rule.configs.update(registry.find_config_variables(param.default))
            rule.classes.update(registry.find_classes(param.default))
            param_value, missing_configs = registry.expand_variables(param.default, rule.full_name)
            rule.missing_config.update(missing_configs)
            if param.default.absolute_path:
                export_path = Path(param_value)
            else:
                export_path = paths.export_dir / param_value
            output_dirs.add(export_path.parent)
            rule.outputs.append(export_path)
            rule.parameters[param_name] = Export(str(export_path))
            if "{doc}" in rule.parameters[param_name]:
                rule.doc_annotations.append(param_name)
            if "{" in param_value:
                rule.wildcard_annotations.append(param_name)

        # ExportInput
        elif param.annotation == ExportInput:
            rule.configs.update(registry.find_config_variables(param.default))
            rule.classes.update(registry.find_classes(param.default))
            param_value, missing_configs = registry.expand_variables(param.default, rule.full_name)
            rule.missing_config.update(missing_configs)
            if param.default.absolute_path:
                rule.parameters[param_name] = ExportInput(param_value)
            else:
                rule.parameters[param_name] = ExportInput(paths.export_dir / param_value)
            if param.default.all_docs:
                rule.inputs.extend(expand(escape_wildcards(rule.parameters[param_name]),
                                          doc=get_source_files(storage.source_files)))
            else:
                rule.inputs.append(Path(rule.parameters[param_name]))
            if "{" in rule.parameters[param_name]:
                rule.wildcard_annotations.append(param_name)

        # Config
        elif isinstance(param_value, Config):
            rule.configs.add(param_value.name)
            config_value = sparv_config.get(param_value, sparv_config.Unset)
            if config_value is sparv_config.Unset:
                # Fall back to the declared default, then to None if optional.
                if param_value.default is not None:
                    config_value = param_value.default
                elif param_optional:
                    config_value = None
                else:
                    rule.missing_config.add(param_value)
            rule.parameters[param_name] = config_value

        # Everything else
        else:
            rule.parameters[param_name] = param_value

    # For custom rules, warn the user of any unknown parameters
    if custom_params:
        print_sparv_warning(
            "The parameter{} '{}' used in one of your custom rules "
            "do{} not exist in {}.".format("s" if len(custom_params) > 1 else "",
                                           "', '".join(custom_params),
                                           "es" if len(custom_params) == 1 else "",
                                           rule.full_name))

    storage.all_rules.append(rule)

    # Add to rule lists in storage
    update_storage(storage, rule)

    # Add exporter dirs (used for informing user)
    if rule.exporter:
        if rule.abstract:
            output_dirs = set([p.parent for p in rule.inputs])
        rule.export_dirs = [str(p / "_")[:-1] for p in output_dirs]

    if rule.missing_config:
        # Class references are wrapped in angle brackets; split them out.
        log_handler.messages["missing_configs"][rule.full_name].update(
            [c for c in rule.missing_config if not c.startswith("<")])
        log_handler.messages["missing_classes"][rule.full_name].update(
            [c[1:-1] for c in rule.missing_config if c.startswith("<")])

    if rule.missing_binaries:
        log_handler.messages["missing_binaries"][rule.full_name].update(rule.missing_binaries)

    if config.get("debug"):
        print()
        console.print("[b]{}:[/b] {}".format(rule.module_name.upper(), rule.f_name))
        print()
        console.print(" [b]INPUTS[/b]")
        for i in rule.inputs:
            print(" {}".format(i))
        print()
        console.print(" [b]OUTPUTS[/b]")
        for o in rule.outputs:
            print(" {}".format(o))
        print()
        console.print(" [b]PARAMETERS[/b]")
        for p in rule.parameters:
            print(" {} = {!r}".format(p, rule.parameters[p]))
        print()
        print()

    return True