Example No. 1
def load_folder(folder: str, file_type: str = "", logger=app_logger) -> List[str]:
    """
    Load all files under the given folder, optionally filtered by file suffix
    Args:
        folder: path of the folder.
        file_type: file suffix used as a filter; the default "" applies no filter.
        logger: logger for logging.

    Returns:
        file_abs_path_lst: list of files under the given folder, as absolute paths

    """
    abs_path = get_abs_path(folder)
    file_lst = os.listdir(abs_path)
    file_abs_path_lst = [os.path.join(abs_path, x) for x in file_lst]
    if file_type:
        file_abs_path_lst = [
            f for f in file_abs_path_lst if f.lower().endswith(file_type.lower())
        ]
    file_abs_path_lst = [abs_f for abs_f in file_abs_path_lst if os.path.isfile(abs_f)]
    logger.debug(
        f"Fund {file_type} files:\nunder folder: {folder}\nfiles:\n {file_abs_path_lst}"
    )

    return file_abs_path_lst
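
A minimal usage sketch (assuming load_folder is importable from its defining module; the folder path is illustrative):

# Hypothetical usage: collect all .csv files under a folder.
csv_files = load_folder("lynx/temp", file_type=".csv")
for csv_file in csv_files:
    print(csv_file)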
Example No. 2
    def __init__(
        self,
        db_info: dict,
        schema: str = "lynx_db",
        output_rules: dict = default_output_rules,
        nomenclature: str = "LipidLynxX",
        logger=app_logger,
    ):
        self.logger = logger
        self.nomenclature = nomenclature
        self.export_rule = load_output_rule(output_rules, nomenclature)
        self.db_sites_rule = self.export_rule.get("DB_SITES", None)
        self.db_separators = self.export_rule.get("SEPARATORS", [])
        if not self.db_sites_rule:
            raise ValueError(
                f"Cannot find output rule for 'DB_SITES' from nomenclature: {nomenclature}."
            )
        self.db_info = db_info.get("DB_INFO", {}).get("0.0_DB", {})
        self.schema = schema
        self.type = "DB"
        self.db_level = str(db_info.get("DB_LEVEL", 0))
        if self.db_level == "0":
            self.db_level = "0.0"
        with open(get_abs_path(lynx_schema_cfg[self.schema]), "r") as s_obj:
            self.validator = Draft7Validator(
                json.load(s_obj),
                resolver=RefResolver(f"file://{core_schema_path}",
                                     referrer=core_schema),
            )

        self.db_count = self.db_info.get("DB_COUNT", 0)
        self.db_site = self.to_db_site_list()
        self.db_site_info = self.to_db_site_info_list()
        self.sum_db_info = self.to_sum_info()
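
A hedged construction sketch; the enclosing class name is not shown in this excerpt, so LynxDB below is purely illustrative, and the db_info keys are inferred from the reads above:

# Hypothetical db_info shape, inferred from the keys the constructor reads;
# the class name LynxDB is illustrative, not the real one.
db_info = {
    "DB_LEVEL": 0.2,
    "DB_INFO": {"0.0_DB": {"DB_COUNT": 1}},
}
db = LynxDB(db_info)  # default schema, output rules, and nomenclature
print(db.db_level, db.db_count)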
Example No. 3
def get_json(file: str) -> dict:
    file = get_abs_path(file)
    if file.lower().endswith(".json"):
        with open(file) as file_obj:
            js_obj = json.load(file_obj)
            return js_obj
    else:
        raise IOError(f"Input file: {file} is not a JSON file")
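
Usage is straightforward; the rule file path below is taken from the test examples later in this section:

# Load a JSON rule file; non-.json extensions raise IOError.
rules = get_json(r"lynx/configurations/rules/input/LipidLynxX.json")
print(sorted(rules.keys()))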
Example No. 4
def save_table(df: pd.DataFrame, file_name: str) -> Tuple[bool, str]:
    is_output = False
    abs_output_path = None
    if not df.empty:
        df.to_excel(file_name)
        is_output = True
        abs_output_path = get_abs_path(file_name)

    return is_output, abs_output_path
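
A minimal sketch (the DataFrame content is illustrative; note that an empty DataFrame yields (False, None)):

import pandas as pd

df = pd.DataFrame({"input": ["PC 34:1"], "converted": ["PC(34:1)"]})
is_output, abs_output_path = save_table(df, "converted.xlsx")
if is_output:
    print(f"Saved to {abs_output_path}")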
Example No. 5
def test_input_rule(test_file):
    app_logger.debug("SETUP TESTS...")
    app_logger.info(test_file)
    in_file = None
    if test_file:
        in_file = get_abs_path(test_file)
    if not in_file:
        in_file = get_abs_path(
            r"lynx/configurations/rules/input/LipidLynxX.json")
    app_logger.info(f"Test file {in_file}")
    rule = InputRules(in_file)  # use the resolved path, not the raw argument
    app_logger.debug(f"Got infile {in_file}")
    app_logger.debug(f"test input rule {rule.sources}")
    if rule.is_validated is False:
        raise Exception(f"FAILED: test input rule {rule.sources}")
    else:
        app_logger.info(f"PASSED: test input rule {rule.sources}")
    app_logger.info("test PASSED")
Example No. 6
def test_output_rule(test_file):
    app_logger.debug("SETUP TESTS...")
    app_logger.info(test_file)
    in_file = None
    if test_file:
        in_file = get_abs_path(test_file)
    if not in_file:
        in_file = get_abs_path(
            r"lynx/configurations/rules/output/LipidLynxX.json")
    app_logger.info(f"Test file {in_file}")
    rule = OutputRules(in_file)  # use the resolved path, not the raw argument
    app_logger.debug(f"Got Output infile {in_file}")
    app_logger.debug(f"test Output rule {rule.nomenclature}")
    if rule.is_structure_valid is False:
        raise Exception(f"FAILED: test Rule {rule.nomenclature}")
    else:
        app_logger.info(f"PASSED: test Rule {rule.nomenclature}")
        app_logger.info(
            f"Supported LMSD classes: {rule.supported_lmsd_classes}")
    app_logger.info(f"test PASSED")
Example No. 7
    def __init__(
        self,
        mod_info: dict,
        db: int = 0,
        num_o: int = 0,
        schema: str = "lynx_mod",
        output_rules: dict = default_output_rules,
        nomenclature: str = "LipidLynxX",
        logger=app_logger,
    ):
        self.logger = logger
        self.nomenclature = nomenclature
        self.export_rule = load_output_rule(output_rules, nomenclature)
        self.mod_rule = self.export_rule.get("MODS", None)
        if not self.mod_rule:
            raise ValueError(
                f"Cannot find output rule for 'MODS' from nomenclature: {nomenclature}."
            )
        self.mod_rule_orders = self.mod_rule.get("MOD", {}).get("ORDER", [])
        self.mod_separators = self.export_rule.get("SEPARATORS", [])
        self.mod_info = mod_info.get("MOD_INFO", {})
        self.schema = schema
        self.type = "Modification"
        self.mod_level = str(mod_info.get("MOD_LEVEL", 0))
        # additional oxygen(s) promote the modification level into the "2.x" range
        if num_o > 0:
            if self.mod_level in ["0", "1"]:
                self.mod_level = "2"
            elif self.mod_level in ["0.1", "1.1"]:
                self.mod_level = "2.1"
            elif self.mod_level in ["0.2", "1.2"]:
                self.mod_level = "2.2"
        with open(get_abs_path(lynx_schema_cfg[self.schema]), "r") as s_obj:
            self.validator = Draft7Validator(
                json.load(s_obj),
                resolver=RefResolver(f"file://{core_schema_path}",
                                     referrer=core_schema),
            )

        self.db_count = db
        self.additional_o_count = num_o
        self.sum_mod_info = self.to_sum_info()

        self.mod_id = self.sum_mod_info.get("id", "")
        self.mod_linked_ids = self.sum_mod_info.get("linked_ids", {})
        self.mod_list = self.sum_mod_info.get("info", {})
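
A hedged construction sketch; the enclosing class name is not shown in this excerpt (Mods below is illustrative), and the mod_info keys are inferred from the reads above:

# Hypothetical mod_info shape; the class name Mods is illustrative only.
mod_info = {"MOD_LEVEL": 1, "MOD_INFO": {}}
mods = Mods(mod_info, db=1, num_o=1)
print(mods.mod_level)  # "2": num_o > 0 promotes level "1" to "2"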
Example No. 8
    def __init__(self, lipid_code: str, logger=app_logger):

        self.logger = logger  # assign before __post_init__ so helpers can log
        self.lipid_code = lipid_code
        self.lynx_class_lv0 = ""
        self.schema = "lynx_core"
        with open(get_abs_path(lynx_schema_cfg[self.schema]), "r") as s_obj:
            self.validator = Draft7Validator(json.load(s_obj))

        self.level = "B0"
        self._lipid_level = "B"
        self._max_mod_level = 0
        self.is_modified = False
        self.sum_info = self.__post_init__()

        self.residues = self.sum_info.get("residues", [])
        self.level = self.sum_info.get("level", "")
        self.linked_ids = self.sum_info.get("linked_ids", {})
        self.logger.info(
            f"Level {self.level:4s} FattyAcid created from: {self.lipid_code}"
        )
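
The log message above suggests the enclosing class is FattyAcid; a hedged usage sketch with an illustrative LipidLynxX-style abbreviation:

# The lipid code is illustrative; any abbreviation the parser accepts works.
fa = FattyAcid("FA18:1(9Z)")
print(fa.level, fa.linked_ids)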
Example No. 9
    def __init__(
        self,
        input_data: Union[str, dict, InputDictData],
        level: Union[str, List[str]],
        rule: str = "LipidLynxX",
        input_rules: dict = default_input_rules,
        output_rules: dict = default_output_rules,
        logger=app_logger,
    ):

        if isinstance(input_data, str):
            abs_path = get_abs_path(input_data)
            if abs_path.lower().endswith(".xlsx"):
                df = pd.read_excel(abs_path)
            elif abs_path.lower().endswith(".csv"):
                df = pd.read_csv(abs_path)
            else:
                raise ValueError(f"Cannot read file {abs_path}")
            df.fillna("")
            self.data = df.to_dict(orient="list")
        elif isinstance(input_data, dict):
            self.data = input_data
        else:
            raise ValueError(f"Not supported input {type(input_data)}")
        if isinstance(level, str):
            self.levels = [level]
        else:
            self.levels = level
        self.encoder = Encoder(
            style=rule,
            input_rules=input_rules,
            output_rules=output_rules,
            logger=logger,
        )
        self.header_lst = self.data.keys()
        self.logger = logger
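
A hedged usage sketch; the enclosing class name is not shown in this excerpt (BatchConverter below is illustrative), and the input dict mirrors the {column: values} shape produced by the .xlsx/.csv branch:

# Hypothetical usage; the class name, level, and lipid names are illustrative.
data = {"lipids": ["PC 34:1", "PE 36:2"]}
converter = BatchConverter(data, level="B1")
print(converter.levels, list(converter.header_lst))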
Example No. 10
def load_cfg_info(cfg_path: str = None) -> Dict[str, str]:
    cfg_dct = {}
    default_fields = [
        "api_version",
        "app_log_level",
        "app_url",
        "app_port",
        "app_prefix",
        "cli_log_level",
        "controlled_vocabularies",
        "defined_alias",
        "input_rules",
        "output_rules",
        "resource_kegg",
        "resource_lion",
        "resource_lion",
        "temp_folder",
        "temp_max_days",
        "temp_max_files",
        "zmq_client_port",
        "zmq_worker_port",
        "zmq_worker_runner",
    ]
    config = configparser.ConfigParser()
    if cfg_path and isinstance(cfg_path, str):
        config_path = get_abs_path(cfg_path)
    else:
        try:
            config_path = get_abs_path("config.ini")
        except FileNotFoundError:
            config_path = get_abs_path("configure.ini")

    config.read(config_path)
    if config.has_section("settings"):
        user_cfg = "settings"
    elif config.has_section("default"):
        user_cfg = "default"
    else:
        raise ValueError(f"Cannot load settings from file {config_path}")

    options = config.options(user_cfg)
    for field in default_fields:
        if field in options and field in [
                "controlled_vocabularies",
                "defined_alias",
                "input_rules",
                "output_rules",
        ]:
            # path-like fields are resolved to absolute paths
            cfg_dct[field] = get_abs_path(config.get(user_cfg, field))
        else:
            try:
                cfg_dct[field] = config.get(user_cfg, field)
            except configparser.NoOptionError:
                pass

    if "app_url" not in cfg_dct:
        cfg_dct["app_url"] = "127.0.0.1"
    if "app_port" not in cfg_dct:
        cfg_dct["app_port"] = "1399"
    if "zmq_client_port" not in cfg_dct:
        cfg_dct["zmq_client_port"] = 5559
    if "zmq_worker_port" not in cfg_dct:
        cfg_dct["zmq_worker_port"] = 5560
    if "zmq_worker_runner" not in cfg_dct:
        cfg_dct["zmq_worker_runner"] = 5

    usr_app_prefix = cfg_dct.get("app_prefix", "").strip("/")

    if usr_app_prefix:
        if re.match(r"^\s*None\s*$", usr_app_prefix, re.IGNORECASE):
            usr_app_prefix = ""
        else:
            usr_app_prefix = f"/{usr_app_prefix}"
    cfg_dct["app_prefix"] = usr_app_prefix

    return cfg_dct
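
A minimal sketch: read an explicit config.ini (the fallback chain above also finds it automatically) and inspect the defaulted fields:

cfg = load_cfg_info("config.ini")
print(cfg["app_url"], cfg["app_port"], cfg["app_prefix"])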
Example No. 11
def create_converter_output(
    data: dict,
    output_name: Union[str, Path] = None,
    file_type: str = ".xlsx",
    converted_only: bool = False,
) -> Union[BytesIO, str]:
    file_info = None
    converted_df = pd.DataFrame()
    not_converted_df = pd.DataFrame()
    if data and not converted_only:
        not_converted_dct = {}
        df_lst = []
        for k in data:
            if isinstance(data[k], dict):
                k_pairs = data[k].get("converted", [])
                k_not_converted = data[k].get("skipped", [])
                if k_pairs and isinstance(k, str):
                    df_lst.append(pd.DataFrame(k_pairs, columns=[k, f"{k}_converted"]))

                if k_not_converted:
                    not_converted_dct[f"{k}_skipped"] = k_not_converted
            elif isinstance(data[k], ConvertedListData):
                k_pairs = data[k].converted
                if k_pairs and isinstance(k, str):
                    df_lst.append(pd.DataFrame(k_pairs, columns=[k, f"{k}_converted"]))
                k_not_converted = data[k].skipped
                if k_not_converted:
                    not_converted_dct[f"{k}_skipped"] = k_not_converted
            elif isinstance(data[k], list) and k == "converted":
                k_pairs = data.get("converted", [])
                if k_pairs:
                    df_lst.append(
                        pd.DataFrame(k_pairs, columns=["input", "converted"])
                    )
            elif isinstance(data[k], list) and k == "skipped":
                k_not_converted = data.get("skipped", [])
                if k_not_converted:
                    not_converted_dct[f"skipped"] = k_not_converted

        if df_lst:
            converted_df = pd.concat(df_lst, axis=1)

        if not_converted_dct:
            not_converted_df = pd.DataFrame.from_dict(
                not_converted_dct, orient="index"
            ).T
    elif data and converted_only:
        converted_df = pd.DataFrame(
            {key: pd.Series(value) for key, value in data.items()}
        )
    else:
        pass

    if not converted_df.empty:
        if output_name:
            try:
                err_msg = None
                if isinstance(output_name, Path):
                    output_name = output_name.as_posix()
                elif not isinstance(output_name, str):
                    err_msg = (
                        f"[Type error] Cannot create file: {output_name} as output."
                    )
                if err_msg:
                    file_info = err_msg
                else:
                    if output_name.lower().endswith("csv"):
                        converted_df.to_csv(output_name)
                    else:
                        converted_df.to_excel(
                            output_name, sheet_name="converted", index=False
                        )
                    file_info = get_abs_path(output_name)
            except IOError:
                file_info = f"[IO error] Cannot create file: {output_name} as output."
        else:
            file_info = BytesIO()
            if file_type.lower().endswith("csv"):
                file_info.write(converted_df.to_csv().encode("utf-8"))

            else:
                output_writer = pd.ExcelWriter(
                    file_info, engine="openpyxl"
                )  # write to BytesIO instead of file path
                converted_df.to_excel(
                    output_writer, sheet_name="converted", index=False
                )
                if not not_converted_df.empty:
                    not_converted_df.to_excel(
                        output_writer, sheet_name="skipped", index=False
                    )
                output_writer.close()  # ExcelWriter.save() was removed in pandas 2.0
            file_info.seek(0)

    return file_info
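
A minimal sketch using the flat {"converted": [...], "skipped": [...]} shape handled above; with output_name=None the function returns an in-memory BytesIO holding an .xlsx file:

# The lipid names are illustrative.
data = {
    "converted": [("PC 34:1", "PC(34:1)")],
    "skipped": ["not_a_lipid"],
}
buffer = create_converter_output(data)
with open("converted.xlsx", "wb") as f_obj:
    f_obj.write(buffer.read())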
Example No. 12
def create_linker_output(
    data: dict,
    output_name: Union[str, Path] = None,
    file_type: str = ".xlsx",
    export_url: bool = True,
) -> Union[BytesIO, str]:
    file_info = None
    file_linked_resources = {}
    if data:
        for sheet in data:
            sheet_linked_resources = {}
            sheet_data = data.get(sheet, {})
            sheet_export_data = sheet_data.get("export_file_data", {})
            idx = 1
            for lipid_name in sheet_export_data:
                lipid_resources = {}
                if isinstance(sheet_export_data[lipid_name], dict):
                    lipid_resources["Input_Name"] = sheet_export_data[lipid_name].get(
                        "lipid_name", ""
                    )
                    lipid_resources["Shorthand_Notation"] = sheet_export_data[
                        lipid_name
                    ].get("shorthand_name", "")
                    lipid_resources["LipidLynxX"] = sheet_export_data[lipid_name].get(
                        "lynx_name", ""
                    )
                    lipid_resources["BioPAN"] = sheet_export_data[lipid_name].get(
                        "biopan_name", ""
                    )
                    resource_data = sheet_export_data[lipid_name].get(
                        "resource_data", {}
                    )
                    for db_group in resource_data:
                        db_group_resources = resource_data[db_group]
                        for db in db_group_resources:
                            db_resources = db_group_resources.get(db)
                            if db_resources and isinstance(db_resources, dict):
                                if len(db_resources) < 2:
                                    lipid_resources[db] = ";".join(
                                        list(db_resources.keys())
                                    )
                                    lipid_resources[f"Link_{db}"] = ";".join(
                                        [db_resources.get(i) for i in db_resources]
                                    )
                                else:
                                    lipid_resources[db] = json.dumps(
                                        list(db_resources.keys())
                                    )
                                    lipid_resources[f"Link_{db}"] = json.dumps(
                                        [db_resources.get(i) for i in db_resources]
                                    )
                            else:
                                lipid_resources[db] = ""
                sheet_linked_resources[idx] = lipid_resources
                idx += 1
            file_linked_resources[sheet] = sheet_linked_resources

    default_col = ["Input_Name", "Shorthand_Notation", "LipidLynxX", "BioPAN"]
    file_linked_df_dct = {}
    if file_linked_resources:
        for sheet in file_linked_resources:
            sum_df = pd.DataFrame(data=file_linked_resources.get(sheet)).T
            sum_df_columns = sum_df.columns.tolist()
            link_cols = []
            # iterate over a copy: removing items while iterating skips elements
            for col in list(sum_df_columns):
                if col.startswith("Link_"):
                    sum_df_columns.remove(col)
                    if export_url:
                        link_cols.append(col)
                elif col in default_col:
                    sum_df_columns.remove(col)
            sum_df_columns = (
                default_col + natsorted(sum_df_columns) + natsorted(link_cols)
            )
            linked_df = pd.DataFrame(sum_df, columns=sum_df_columns)
            file_linked_df_dct[sheet] = linked_df

    if output_name:
        try:
            err_msg = None
            if isinstance(output_name, Path):
                output_name = output_name.as_posix()
            elif not isinstance(output_name, str):
                err_msg = f"[Type error] Cannot create file: {output_name} as output."
            if err_msg:
                file_info = err_msg
            elif output_name.lower().endswith("csv"):
                # CSV holds a single table: export the first sheet only
                for s in file_linked_df_dct:
                    s_df = file_linked_df_dct.get(s, pd.DataFrame())
                    s_df.to_csv(output_name)
                    break
                file_info = get_abs_path(output_name)
            else:
                output_writer = pd.ExcelWriter(output_name, engine="openpyxl")
                for s in file_linked_df_dct:
                    s_df = file_linked_df_dct.get(s, pd.DataFrame())
                    if not s_df.empty:
                        s_df.to_excel(output_writer, sheet_name=s)
                output_writer.close()  # ExcelWriter.save() was removed in pandas 2.0
                file_info = get_abs_path(output_name)
        except IOError:
            file_info = f"[IO error] Cannot create file: {output_name} as output."
    else:
        file_info = BytesIO()
        if file_type.lower().endswith("csv"):
            # CSV holds a single table: export the first sheet only
            for s in file_linked_df_dct:
                s_df = file_linked_df_dct.get(s, pd.DataFrame())
                file_info.write(s_df.to_csv().encode("utf-8"))
                break
        else:
            output_writer = pd.ExcelWriter(
                file_info, engine="openpyxl"
            )  # write to BytesIO instead of file path
            for s in file_linked_df_dct:
                s_df = file_linked_df_dct.get(s, pd.DataFrame())
                if not s_df.empty:
                    s_df.to_excel(output_writer, sheet_name=s)
            output_writer.close()  # ExcelWriter.save() was removed in pandas 2.0
        file_info.seek(0)

    return file_info
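
A hedged sketch of the nested input shape the first loop expects: one sheet per key, each with "export_file_data" mapping lipid names to per-lipid records (values illustrative):

data = {
    "Sheet1": {
        "export_file_data": {
            "PC 34:1": {
                "lipid_name": "PC 34:1",
                "shorthand_name": "PC 34:1",
                "lynx_name": "PC(34:1)",
                "biopan_name": "PC 34:1",
                "resource_data": {},
            }
        }
    }
}
buffer = create_linker_output(data)  # BytesIO: one worksheet per input sheet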
Example No. 13
import os

from lynx.utils.basics import get_abs_path
from lynx.utils.cfg_reader import app_cfg_info
from lynx.utils.params_loader import (
    build_mod_parser,
    build_input_rules,
    build_output_rules,
)
from lynx.utils.ports import check_port

# Define default values across LipidLynxX
# Load default values from the files defined in config.ini
# The parameters generated below are used as global values

default_input_rules = build_input_rules(app_cfg_info["input_rules"])
default_output_rules = build_output_rules(app_cfg_info["output_rules"])
default_cv_file = get_abs_path(app_cfg_info["controlled_vocabularies"])
default_alias_file = get_abs_path(app_cfg_info["defined_alias"])
default_kegg_file = get_abs_path(app_cfg_info["resource_kegg"])
default_lion_file = get_abs_path(app_cfg_info["resource_lion"])
default_temp_folder = app_cfg_info.get("temp_folder", r"lynx/temp")
default_temp_max_days = int(app_cfg_info.get("temp_max_days", "3"))
default_temp_max_files = int(app_cfg_info.get("temp_max_files", "99"))
default_zmq_worker_runner = int(app_cfg_info.get("zmq_worker_runner", 5))

if not os.path.isdir(default_temp_folder):
    os.mkdir(default_temp_folder)
default_temp_folder = get_abs_path(default_temp_folder)

with open(default_cv_file, "r") as cv_js: