Example No. 1
def make_config(database_dir,
                threads,
                assembler,
                data_type='metagenome',
                interleaved_fastq=False,
                config='config.yaml'):
    """
    Reads template config file with comments from ./template_config.yaml
    updates it by the parameters provided.

    Args:
        config (str): output file path for yaml
        database_dir (str): location of downloaded databases
        threads (int): number of threads per node to utilize
        assembler (str): either spades or megahit
        data_type (str): this is either metagenome or metatranscriptome
    """

    from ruamel.yaml import YAML  # used for YAML reading that keeps the comments

    yaml = YAML()
    yaml.version = (1, 1)
    yaml.default_flow_style = False

    template_conf_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "template_config.yaml")

    with open(template_conf_file) as template_config:
        conf = yaml.load(template_config)

    conf["tmpdir"] = tempfile.gettempdir()
    conf["threads"] = multiprocessing.cpu_count() if not threads else threads
    conf["preprocess_adapters"] = os.path.join(database_dir, "adapters.fa")
    conf["contaminant_references"] = {
        "PhiX": os.path.join(database_dir, "phiX174_virus.fa")
    }

    if data_type == 'metatranscriptome':
        conf["contaminant_references"]["rRNA"] = os.path.join(
            database_dir, "silva_rfam_all_rRNAs.fa"),

    conf["data_type"] = data_type
    conf["interleaved_fastqs"] = interleaved_fastq

    conf["assembler"] = assembler
    conf["database_dir"] = database_dir
    #conf["refseq_namemap"] = os.path.join(database_dir, "refseq.db")
    #conf["refseq_tree"] = os.path.join(database_dir, "refseq.tree")
    #conf["diamond_db"] = os.path.join(database_dir, "refseq.dmnd")

    if os.path.exists(config):
        logging.warning(
            f"Config file {config} already exists; refusing to overwrite it. Continuing..."
        )
    else:

        with open(config, "w") as f:
            yaml.dump(conf, f)
        logging.info("Configuration file written to %s\n"
                     "You may want to edit it using any text editor." % config)
Example No. 2
    def yaml(self, yaml_version=None):
        from ruamel.yaml import YAML

        y = YAML()
        y.preserve_quotes = True
        if yaml_version:
            y.version = yaml_version
        return y
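The helper above just pre-configures a round-trip loader; a self-contained sketch of how such an instance behaves (the input document is invented):

import sys
from ruamel.yaml import YAML

# Stand-alone sketch: round-trip a document while keeping the original quoting.
yaml = YAML()
yaml.preserve_quotes = True
yaml.version = (1, 1)  # optional, as in the helper above

doc = yaml.load('name: "quoted value"\nitems:\n  - 1\n  - 2\n')
doc["items"].append(3)
yaml.dump(doc, sys.stdout)  # the quotes around "quoted value" survive the round trip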
Example No. 3
def convert_yaml(
    yaml_data: str, output: Any, array=True, inject_comments=False
) -> Any:
    yaml = YAML(typ="rt")
    yaml.version = "1.1"  # type: ignore  # yaml.version is mis-typed as None
    events = yaml.parse(yaml_data)
    output = JsonnetRenderer(events, output, array, inject_comments).render()
    return output
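JsonnetRenderer is project-specific, but the event stream it consumes comes straight from ruamel.yaml; a rough sketch of what yaml.parse() yields (the sample document is invented):

from ruamel.yaml import YAML

# Sketch: inspect the low-level event stream that drives the renderer above.
yaml = YAML(typ="rt")
yaml.version = "1.1"
document = "defaults: &d {a: 1}\nprod:\n  <<: *d\n  b: 2\n"
for event in yaml.parse(document):
    # Events include StreamStartEvent, DocumentStartEvent, MappingStartEvent,
    # ScalarEvent, AliasEvent, ... each carrying start_mark/end_mark positions.
    print(type(event).__name__, getattr(event, "value", ""))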
Example No. 4
def get_loader(*args, **kwargs) -> YAML:
    yaml = YAML()
    yaml.version = (1, 2)  # type: ignore

    yaml.Parser = Parser
    yaml._constructor = CustomConstructor(yaml, *args,
                                          **kwargs)  # type: ignore

    return yaml
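Parser and CustomConstructor come from the surrounding project and are not shown here. For comparison, a self-contained sketch of the documented ruamel.yaml way to teach a loader about a custom tag (the Color class and the !Color tag are invented for this example):

from ruamel.yaml import YAML, yaml_object

yaml = YAML()
yaml.version = (1, 2)

@yaml_object(yaml)  # registers both a representer and a constructor for !Color
class Color:
    yaml_tag = "!Color"

    def __init__(self, name):
        self.name = name

    @classmethod
    def from_yaml(cls, constructor, node):
        # Called by the loader whenever a !Color node is encountered.
        return cls(node.value)

    @classmethod
    def to_yaml(cls, representer, instance):
        return representer.represent_scalar(cls.yaml_tag, instance.name)

data = yaml.load("favourite: !Color red\n")
print(data["favourite"].name)  # -> red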
Example No. 5
def make_config(config, path, data_type, database_dir, threads, assembler):
    """
    Reads template config file with comments from ./template_config.yaml
    updates it by the parameters provided.
    Write the file `config` and complete the sample names and paths for all
    files in `path`.

    Args:
        config (str): output file path for yaml
        path (str): fastq/fasta data directory
        data_type (str): this is either metagenome or metatranscriptome
        database_dir (str): location of downloaded databases
        threads (int): number of threads per node to utilize
        assembler (str): either spades or megahit
    """
    config = os.path.realpath(os.path.expanduser(config))
    os.makedirs(os.path.dirname(config), exist_ok=True)

    path = os.path.realpath(os.path.expanduser(path))
    database_dir = os.path.realpath(os.path.expanduser(database_dir))

    yaml = YAML()
    yaml.version = (1, 1)
    yaml.default_flow_style = False

    template_conf_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "template_config.yaml")

    with open(template_conf_file) as template_config:
        conf = yaml.load(template_config)

    samples = get_sample_files(path, data_type)
    logging.info("Found %d samples under %s" % (len(samples), path))

    conf["samples"] = samples
    conf["tmpdir"] = tempfile.gettempdir()
    conf["threads"] = multiprocessing.cpu_count() if not threads else threads
    conf["preprocess_adapters"] = os.path.join(database_dir, "adapters.fa")
    conf["contaminant_references"] = {
        "rRNA": os.path.join(database_dir, "silva_rfam_all_rRNAs.fa"),
        "PhiX": os.path.join(database_dir, "phiX174_virus.fa")
    }

    conf["assembler"] = assembler

    conf["refseq_namemap"] = os.path.join(database_dir, "refseq.db")
    conf["refseq_tree"] = os.path.join(database_dir, "refseq.tree")
    conf["diamond_db"] = os.path.join(database_dir, "refseq.dmnd")

    with open(config, "w") as f:
        yaml.dump(conf, f)
    logging.info("Configuration file written to %s" % config)
Example No. 6
    def make_config(self):
        """
        Reads template config file with comments from ./template_config.yaml
        updates it by the parameters provided.
        Args:
            config (str): output file path for yaml
            database_dir (str): location of downloaded databases
            threads (int): number of threads per node to utilize
            assembler (str): either spades or megahit
            data_type (str): this is either metagenome or metatranscriptome
        """

        self.config = os.path.join(self.output, 'template_config.yaml')

        yaml = YAML()
        yaml.version = (1, 1)
        yaml.default_flow_style = False

        template_conf_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "template_config.yaml")

        with open(template_conf_file) as template_config:
            conf = yaml.load(template_config)

        if self.assembly != "none":
            self.assembly = [os.path.abspath(p) for p in self.assembly]
        if self.pe1 != "none":
            self.pe1 = [os.path.abspath(p) for p in self.pe1]
        if self.pe2 != "none":
            self.pe2 = [os.path.abspath(p) for p in self.pe2]
        if self.longreads != "none":
            self.longreads = [os.path.abspath(p) for p in self.longreads]

        conf["fasta"] = self.assembly
        conf["max_threads"] = self.threads
        conf["pplacer_threads"] = self.pplacer_threads
        conf["max_memory"] = self.max_memory

        conf["short_reads_1"] = self.pe1
        conf["short_reads_2"] = self.pe2
        conf["long_reads"] = self.longreads
        conf["long_read_type"] = self.longread_type
        conf["min_contig_size"] = self.min_contig_size
        conf["min_bin_size"] = self.min_bin_size

        conf["gtdbtk_folder"] = os.path.abspath(self.gtdbtk)

        with open(self.config, "w") as f:
            yaml.dump(conf, f)
        logging.info("Configuration file written to %s\n"
                     "You may want to edit it using any text editor." %
                     self.config)
Example No. 7
def get_loader(macros_root: str = None, context: ContextType = {}) -> YAML:
    yaml = YAML()
    yaml.version = (1, 2)  # type: ignore

    yaml.Parser = Parser
    yaml._constructor = CustomConstructor(  # type: ignore
        yaml,
        # macros_root=macros_root,
        macro_provider=MacroProvider(macros_root).get_macro,
        context=context
    )

    return yaml
Example No. 8
def get_yaml_instance(
    version: VersionType = (1, 2),
    indent: Any = {'mapping': 2, 'sequence': 4, 'offset': 2},
    **kwargs: Any
) -> YAML:
    yaml = YAML(**kwargs)

    yaml.version = version  # type: ignore
    yaml.Representer = CustomRepresenter

    yaml.indent(**indent)

    return yaml
Example No. 9
def get_yaml_instance(version=(1, 2),
                      indent={
                          'mapping': 2,
                          'sequence': 4,
                          'offset': 2
                      },
                      **kwargs):
    yaml = YAML(**kwargs)

    yaml.Representer = CustomRepresenter

    yaml.version = version
    yaml.indent(**indent)

    return yaml
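Examples 8 and 9 are the same factory with and without type hints; a short sketch of the effect of those indent/version settings, leaving out the project-specific CustomRepresenter (the data is invented):

import sys
from ruamel.yaml import YAML

# Sketch: nested sequences are indented four columns, with the dash offset by two.
yaml = YAML()
yaml.version = (1, 2)
yaml.indent(mapping=2, sequence=4, offset=2)

yaml.dump({"servers": [{"host": "a", "port": 1}, {"host": "b", "port": 2}]}, sys.stdout)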
Example No. 10
    def write_yaml(self, p_data, p_filename, addnew=False):
        """
        @param p_data: is the yaml data to be written.
        @param p_filename: is the name of the yaml file that was read in, e.g. 'rooms.yaml'
        @param addnew: defaults to False; adds '-new' to the saved filename.
        """
        l_now = datetime.datetime.now()
        l_node = self.m_pyhouse_obj._Config.YamlTree[p_filename]
        l_filename = l_node.YamlPath
        l_node.Yaml.insert(0, 'Skip', 'x', comment="Updated: " + str(l_now))
        if addnew:
            l_filename += '-new'
        l_yaml = YAML(typ='rt')
        l_yaml.indent(mapping=2, sequence=4, offset=2)
        l_yaml.version = (1, 2)
        with open(l_filename, 'w+') as l_file:
            l_yaml.dump(p_data, l_file)
        LOG.debug('Saved Yaml file "{}"'.format(p_filename))
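Stripped of the PyHouse bookkeeping (m_pyhouse_obj, YamlTree, LOG), the core pattern above is: load round-trip, stamp an "updated" comment, and dump with explicit indentation. A standalone sketch; the file names and the Skip key are illustrative:

import datetime
from ruamel.yaml import YAML

yaml = YAML(typ="rt")
yaml.indent(mapping=2, sequence=4, offset=2)
yaml.version = (1, 2)

with open("rooms.yaml") as handle:
    data = yaml.load(handle)

# CommentedMap.insert(pos, key, value, comment=...) adds a keyed entry with an
# end-of-line comment; here it serves as a simple "last updated" marker.
data.insert(0, "Skip", "x", comment="Updated: " + str(datetime.datetime.now()))

with open("rooms-new.yaml", "w") as handle:
    yaml.dump(data, handle)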
Example No. 11
def get_constructor():
    yaml = YAML()
    yaml._constructor = CustomConstructor(loader=yaml)
    yaml.version = (1, 2)

    return yaml.constructor
Example No. 12
def convert_yaml_to_jsonnet(file, initialize=settings["initialize"]):

    # Initialize variable
    all_key_df = []
    all_alias_df = []
    f = StringIO()

    # Set file name
    file_name = Path(file).stem

    read_file = open(file, "r").read().replace("infinity", repr("infinity"))
    read_file = clean_comments(read_file)
    yaml = YAML(typ="rt")
    yaml.version = "1.1"  # type: ignore  # yaml.version is mis-typed as None

    # Parse and get events from the yml file
    # These events are ScalarEvent, SequenceStart/EndEvent, MappingStart/EndEvent, etc.

    events = yaml.parse(read_file)
    events_df = (
        pd.DataFrame(events, columns=["event"])
        .reset_index()
        .rename({"index": "event_id"}, axis=1)
    )
    post_processed_read_file = copy.deepcopy(read_file).splitlines()

    output, keys, alias = y2j.convert_yaml(
        yaml_data=read_file, output=f, array=False, inject_comments=False
    )
    output = output[: int(len(output) / 2)]
    output[-1] = output[-1].replace(",\n", "")
    output = [item for item in output if item != "#insert_comment"]
    copy_output = copy.deepcopy(output)
    output = "".join(output).splitlines()

    # Get all keys from the YML file
    # e.g. { geo_K10_: "geo" }: the key is "geo" and it is the 10th in position

    all_key_df = pd.DataFrame(keys, columns=["raw_key_id", "key_event"])
    all_key_df["key_id"] = all_key_df.raw_key_id.apply(
        lambda x: x.replace("[", "").replace("]", "").replace("'", "")
    )
    all_key_df[
        ["key_value", "key_line_start", "key_col_start", "key_line_end", "key_col_end"]
    ] = pd.DataFrame(
        all_key_df.key_event.apply(
            lambda c: [
                c.value,
                c.start_mark.line,
                c.start_mark.column,
                c.end_mark.line,
                c.end_mark.column,
            ]
        ).to_list()
    )
    all_key_df["key_file_id"] = Path(file).stem
    all_key_df["line"] = all_key_df.key_line_start.apply(
        lambda x: post_processed_read_file[x]
    )

    # Get all aliases from the YML file
    # e.g. default: { <<: *geo } -> the alias is *geo

    all_alias_df = pd.DataFrame(alias, columns=["event"])
    all_alias_df[
        ["value", "line_start", "col_start", "line_end", "col_end"]
    ] = pd.DataFrame(
        all_alias_df.event.apply(
            lambda c: [
                c.anchor,
                c.start_mark.line,
                c.start_mark.column,
                c.end_mark.line,
                c.end_mark.column,
            ]
        ).to_list()
    )
    all_alias_df["file_id"] = Path(file).stem
    all_alias_df["line"] = all_alias_df.line_start.apply(
        lambda x: post_processed_read_file[x]
    )
    all_alias_df["key"] = all_alias_df.line.apply(lambda x: x.lstrip().split()[0])

    # Convert jsonnet to json and get the path for each key

    _jsonnet = importlib.import_module("_jsonnet")
    jsonnet_str = _jsonnet.evaluate_snippet("default", "\n".join(output))
    obj = json.loads(jsonnet_str)
    d = benedict(obj, keypath_separator="|")
    k = d.keypaths(indexes=True)

    # Associate a path with each key
    # e.g. default: { geo: &geo } (the path to geo is default.geo)
    path_df = pd.DataFrame(k, columns=["path"])
    path_df["key_id"] = path_df.path.apply(lambda x: x.split("|")[-1])

    # Create a dataframe that links the reference tag to the actual key
    # e.g. geo: &geo (where geo is the reference variable)

    refer_by = []
    for line_id, line in enumerate(post_processed_read_file):
        if len(re.findall(r": &\S*|- &\S*", line)) > 0:
            pad = len(line) - len(line.lstrip())
            refer_by.append([line, line_id, pad, re.findall(r": &\S*|- &\S*", line)])

    refer_by_df = pd.DataFrame(
        refer_by, columns=["line", "ref_line_start", "ref_col_start", "reference"]
    ).explode("reference")
    refer_by_df["reference"] = refer_by_df.reference.apply(
        lambda x: re.sub("[^._|A-Za-z0-9/-]+", "", x)
    )
    refer_by_df["ref_file_id"] = Path(file).stem
    refer_by_df["list_flag"] = refer_by_df.reference.apply(
        lambda x: True if x.lstrip().startswith("-") else False
    )
    refer_by_df["reference"] = refer_by_df.reference.apply(
        lambda x: x[1:] if x.lstrip().startswith("-") else x
    )

    original_read_file = read_file.splitlines()

    # Get all aliases which have to be merged into the associated key
    # e.g. default: { <<: *geo } -> geo needs to be merged into default

    indirect_ref = all_alias_df[all_alias_df.key.isin(["<<:"])]
    indirect_ref_associated_key_df = indirect_ref.apply(
        lambda x: get_associated_key(
            all_key_df,
            original_read_file[x.line_start],
            x.file_id,
            x.line_start,
            x.col_start,
        ),
        axis=1,
    )
    indirect_ref_key_reference_df = pd.concat(
        [indirect_ref, indirect_ref_associated_key_df], axis=1
    )

    for line_start in indirect_ref_key_reference_df.line_start.unique():
        post_processed_read_file[line_start] = ""

    f = StringIO()
    read_file = "\n".join(post_processed_read_file)

    processed_output, processed_keys, processed_alias = y2j.convert_yaml(
        yaml_data=read_file, output=f, array=False, inject_comments=False
    )
    processed_output = processed_output[: int(len(processed_output) / 2)]
    processed_output[-1] = processed_output[-1].replace(",\n", "")
    processed_output = [item for item in processed_output if item != "#insert_comment"]
    processed_output = "".join(processed_output)

    # Initialize the reference paths for each key

    if file_name == "default":
        if initialize:
            default_reference_key = pd.DataFrame()
        else:
            default_reference_key = pd.read_csv(settings["reference_file"])
    else:
        default_reference_key = pd.read_csv(settings["reference_file"])

    if len(refer_by_df) > 0:
        if refer_by_df.list_flag.any():
            reference_key_list = refer_by_df[refer_by_df.list_flag].apply(
                lambda x: get_associated_key(
                    all_key_df,
                    original_read_file[x.ref_line_start],
                    x.ref_file_id,
                    x.ref_line_start,
                    x.ref_col_start,
                ),
                axis=1,
            )
            reference_key_list = pd.concat(
                [
                    refer_by_df[refer_by_df.list_flag],
                    reference_key_list[
                        [
                            "key_id",
                            "key_line_start",
                            "key_col_start",
                            "key_col_end",
                            "line",
                        ]
                    ],
                ],
                axis=1,
            ).reset_index(drop=True)

            reference_key_list["key_id"] = reference_key_list.apply(
                lambda x: x.key_id + "[" + str(x.name) + "]", axis=1
            )

        reference_key_no_list = pd.merge(
            refer_by_df[~refer_by_df.list_flag],
            all_key_df[
                ["key_id", "key_line_start", "key_col_start", "key_col_end", "line"]
            ],
            on="line",
        )

        if refer_by_df.list_flag.any():
            reference_key = pd.concat(
                [
                    reference_key_no_list[
                        ["reference", "ref_file_id", "list_flag", "key_id"]
                    ],
                    reference_key_list[
                        ["reference", "ref_file_id", "list_flag", "key_id"]
                    ],
                ]
            ).reset_index(drop=True)
        else:
            reference_key = reference_key_no_list[
                ["reference", "ref_file_id", "list_flag", "key_id"]
            ]

        reference_key = pd.merge(reference_key, path_df, on="key_id")
        reference_key["reference_path"] = reference_key.apply(
            lambda x: (
                x.ref_file_id
                + "."
                + ".".join(
                    [
                        "[" + repr(p.replace("_h_", "-")) + "]" if "_h_" in p else p
                        for p in x.path.split("|")
                    ]
                )
            ).replace(".[", "["),
            axis=1,
        )
        default_reference_key = pd.concat(
            [reference_key[["reference", "reference_path"]], default_reference_key]
        )

        if file_name == "default":
            if initialize:
                default_reference_key.to_csv(settings["reference_file"])

    direct_df = all_alias_df[all_alias_df.key != "<<:"]
    indirect_df = all_alias_df[all_alias_df.key == "<<:"]

    indirect_df["key_id"] = indirect_df.apply(
        lambda x: get_associated_key(
            all_key_df,
            original_read_file[x.line_start],
            x.file_id,
            x.line_start,
            x.col_start,
        ),
        axis=1,
    )["key_id"]

    processed_output_list = processed_output.splitlines()
    for idx, val in indirect_df[["key_id", "value"]].iterrows():
        l_id = [
            line_id
            for line_id, e in enumerate(processed_output_list)
            if val.key_id in e
        ][0]
        ob = processed_output_list[l_id].split(":")
        if "*" in processed_output_list[l_id]:
            processed_output_list[l_id] = (
                (ob[0] + ": " + repr("*" + val.value) + " + " + "".join(ob[1:]))
                .replace("null", "")
                .replace(".[", "[")
            )
        else:
            processed_output_list[l_id] = (
                (ob[0] + ": " + repr("*" + val.value) + " " + "".join(ob[1:]))
                .replace("null", "")
                .replace(".[", "[")
            )

    # Associate each comment with its key

    comments_df = (
        pd.DataFrame(
            [e for e in flatten(events_df.event.apply(lambda x: x.comment)) if e],
            columns=["event"],
        )
        .reset_index()
        .rename({"index": "event_id"}, axis=1)
    )
    comments_df[
        ["comment", "line_start", "col_start", "line_end", "col_end"]
    ] = pd.DataFrame(
        comments_df.event.apply(
            lambda c: [
                c.value,
                c.start_mark.line,
                c.start_mark.column,
                c.end_mark.line,
                c.end_mark.column,
            ]
        ).to_list()
    )
    comments_df["file_id"] = file_name

    associated_comments_df = comments_df.apply(
        lambda x: get_associated_key_for_comment(
            all_key_df,
            original_read_file[x.line_start],
            x.file_id,
            x.line_start,
            x.col_start,
        ),
        axis=1,
    )

    comments_df = pd.concat([comments_df, associated_comments_df], axis=1)
    comments_df = comments_df[
        comments_df.comment.apply(lambda x: not bool(re.match("^\n+$", x)))
    ]

    for idx, val in comments_df[["comment", "raw_key_id"]].iterrows():
        l_id = [
            line_id
            for line_id, e in enumerate(processed_output_list)
            if val.raw_key_id in e
        ][0]
        comment_line = (
            "".join(
                [
                    "\n// " + line.lstrip()[1:].lstrip()
                    for line in val.comment.splitlines()
                    if len(line) > 0
                ]
            )
            + "\n"
        )
        processed_output_list[l_id] = comment_line + processed_output_list[l_id]

    final_op = "\n".join(processed_output_list)

    # Perform clean up operations

    for idx, val in default_reference_key.iterrows():
        final_op = final_op.replace(repr("*" + val.reference), val.reference_path)

    for f in re.findall("_K(.+?)_", final_op):
        final_op = final_op.replace("_K" + f + "_", "")
    for f in re.findall("_M(.+?)_", final_op):
        final_op = final_op.replace("_M" + f + "_", "")
    final_op = final_op.replace("_h_", "-").replace("// \n", "")
    final_op_lines = final_op.splitlines()

    for line_id, line in enumerate(final_op_lines):
        if "['<<']" in line:
            final_op_lines[line_id] = line.replace("['<<']:", "").rstrip()[:-1] + " + {"
            final_op_lines[line_id - 1] = ""

    # Create reference local variables

    if file_name != "default":
        final_op_lines = [
            "local default = import 'default.jsonnet';",
            "{",
            "local " + file_name + " = $ ,",
        ] + final_op_lines[1:]
    else:
        final_op_lines = ["{", "local " + file_name + " = $ ,"] + final_op_lines[1:]

    # Format lists within the jsonnet

    complete_jsonnet = ""
    for line_id, line in enumerate(final_op_lines):
        if (
            (":" not in line)
            & ("//" not in line)
            & ("{" not in line)
            & ("}" not in line)
            & (len(line) > 0)
            & ("=" not in line)
            & ("[" not in line)
            & ("]" not in line)
            & (".yml" not in line)
            & ("|||" not in line)
            & (line.rstrip().endswith(","))
        ):
            final_op_lines[line_id - 1] = final_op_lines[line_id - 1].replace("\n", "")

        else:
            final_op_lines[line_id] = final_op_lines[line_id] + "\n"

    complete_jsonnet = "".join(final_op_lines)
    complete_jsonnet = complete_jsonnet.replace("\n\n//", "\n//")
    complete_jsonnet = complete_jsonnet.replace("\n\n|||", "\n|||")

    # Write the output to jsonnet file

    with open(settings["output_path"] + file_name + ".jsonnet", "w") as f:
        f.write(complete_jsonnet)
    print(file_name + " has been processed!")