Ejemplo n.º 1
0
def action_info_curation(endpoint: str) -> Tuple[bool, Union[str, dict]]:
    """
        Load the pickled statistics stored for a curation endpoint.

        :param endpoint: curation endpoint to collect statistics from

        :return bool: True when the statistics were loaded, False otherwise
        :return str: error message, returned when the first item is False
        :return stats_dict: unpickled content of 'statistics.pkl', returned
                            when the first item is True
    """

    # resolve the endpoint directory through utils.curation_tree_path
    endpoint_dir = pathlib.Path(utils.curation_tree_path(endpoint))
    if not endpoint_dir.is_dir():
        return False, 'Curation endpoint path does not exist.\n'

    # the statistics live in a fixed-name pickle file inside that directory
    stats_path = os.path.join(endpoint_dir, 'statistics.pkl')
    if os.path.isfile(stats_path) is False:
        return False, 'Statistics file does not exist.\n'

    with open(stats_path, "rb") as handle:
        loaded_stats = pickle.load(handle)

    return True, loaded_stats
Ejemplo n.º 2
0
def action_header_curation(endpoint: str) -> Tuple[bool, Union[str, dict]]:
    """
        Load the pickled header of the curation output of an endpoint.

        :param endpoint: curation endpoint

        :return bool: True when the header was loaded, False otherwise
        :return str: error message, returned when the first item is False
        :return head_: unpickled content of 'curated_data_head.pkl', returned
                       when the first item is True
    """

    # resolve the endpoint directory through utils.curation_tree_path
    endpoint_dir = pathlib.Path(utils.curation_tree_path(endpoint))
    if not endpoint_dir.is_dir():
        return False, 'Curation endpoint path does not exist.\n'

    # the header is stored in a fixed-name pickle file inside that directory
    header_path = os.path.join(endpoint_dir, 'curated_data_head.pkl')
    if os.path.isfile(header_path) is False:
        return False, 'Curation header file does not exist.\n'

    with open(header_path, "rb") as handle:
        header = pickle.load(handle)

    return True, header
Ejemplo n.º 3
0
def action_kill(curation_endpoint: str) -> Tuple[bool, str]:
    """
        Removes the endpoint tree described by the argument.

        :param curation_endpoint: path to curation endpoint in the repo.

        :return bool: True when the tree was removed, False otherwise
        :return str: message describing the outcome
    """

    if not curation_endpoint:
        return False, 'Empty endpoint name'

    ndir = utils.curation_tree_path(curation_endpoint)

    if not os.path.isdir(ndir):
        return False, "Model {} not found".format(curation_endpoint)

    # rmtree must be allowed to raise: with ignore_errors=True a failed
    # removal goes unnoticed and would be reported as a success
    try:
        shutil.rmtree(ndir)
    except OSError:
        return False, "Failed to remove model {}".format(curation_endpoint)

    sys.stderr.write("Model {} removed\n".format(curation_endpoint))

    return True, "Model {} removed".format(curation_endpoint)
Ejemplo n.º 4
0
    def loadYaml_curation(self, curation_path: str) -> Tuple[bool, str]:
        """
            Load a set of parameters from the 'curation_parameters.yaml'
            file present at the curation endpoint directory and store them
            in self.p.

            Adds the 'endpoint' and 'curation_path' keys identifying the
            curation endpoint.

            :param curation_path: curation endpoint name

            :return bool: True when the parameters were loaded, False otherwise
            :return str: 'OK' or a message describing the failure
        """

        # obtain the path and the default name of the curation parameters
        parameters_file_path = utils.curation_tree_path(curation_path)

        if not os.path.isdir(parameters_file_path):
            return False, 'Curation "{}" not found'.format(curation_path)

        parameters_file_name = os.path.join(parameters_file_path, 'curation_parameters.yaml')

        if not os.path.isfile(parameters_file_name):
            return False, 'Parameters file not found'

        # load the main class dictionary (p) from this yaml file
        try:
            with open(parameters_file_name, 'r') as pfile:
                loaded = yaml.safe_load(pfile)
        except Exception as e:
            # report the message as text, as declared in the signature
            return False, str(e)

        # an empty file is parsed as None and a scalar/list file is not a
        # mapping either; neither can receive the identification keys below
        if not isinstance(loaded, dict):
            return False, 'Parameters file does not contain a valid set of parameters'

        self.p = loaded

        # add keys identifying the curation endpoint
        self.p['endpoint'] = curation_path
        self.p['curation_path'] = parameters_file_path

        return True, 'OK'
Ejemplo n.º 5
0
def curation_cmd(commnad_dict: dict) -> Optional[bool]:
    """
        Instantiate a DataCuration object using commnad_dict from the
        argument parser, run the curation and write the output file.

        Every path of this function returns None; problems are reported
        on standard error.

        :param commnad_dict:
                - data_input: input file name to be processed
                - molecule_identifier: column name containing the molecule ID. Usually CAS is used
                - endpoint: curation endpoint name
                - structure_column: column name containing the SMILES string
                - metadata: comma separated column names for metadata processing (only for API)
                - separator: file separator if input file is a csv or a tsv
                - remove_problematic: boolean indicating the option of removing problematic structures or not
                - outfile_type: output file type: xlsx, csv, tsv, sdf or json
    """

    import curate.dataset_curation as datacur

    # safety check if curation endpoint exists
    output_dir = utils.curation_tree_path(commnad_dict['endpoint'])
    if not os.path.isdir(output_dir):
        sys.stderr.write("Endpoint name not found in model repository.\n")
        return None

    # check of metadata: the key may be missing, or present with a None
    # value when the option was not given; both mean "no metadata"
    raw_metadata = commnad_dict.get('metadata')
    if raw_metadata is None:
        metadata_ = None
    else:
        metadata_ = raw_metadata.split(',')
        reserved_columns = (commnad_dict['molecule_identifier'],
                            commnad_dict['structure_column'])
        if any(column in metadata_ for column in reserved_columns):
            sys.stderr.write(
                "datacur curate : metadata can't contain the ID nor the SMILES column names.\n"
            )
            return None

    # call of curation functions
    curating = datacur.DataCuration(
        data_input=commnad_dict['data_input'],
        molecule_identifier=commnad_dict['molecule_identifier'],
        structure_column=commnad_dict['structure_column'],
        output_dir=output_dir,
        endpoint=commnad_dict['endpoint'],
        metadata=metadata_,
        separator=commnad_dict['separator'],
        remove_problematic=commnad_dict['remove_problematic'],
        outfile_type=commnad_dict['outfile_type'])

    curating.curate_data()

    curating.get_output_file(smiles_column='structure_curated')
Ejemplo n.º 6
0
def action_curation_results(endpoint: str) -> Tuple[bool, str]:
    """
        Returns the path to the curation output file of an endpoint.

        The output file is the first entry (in alphabetical order) of the
        endpoint directory whose name starts with 'curated_data' and does
        not contain 'head'.

        :param endpoint: curation endpoint

        :return bool: True when an output file was found, False otherwise
        :return str: path to the output file on success, or an error message
                     when the endpoint directory does not exist
        :return dict: {'code': 0, 'message': ...} when the endpoint directory
                      exists but holds no output file
    """

    # get curation endpoint path
    endpoint_curation = pathlib.Path(utils.curation_tree_path(endpoint))
    if not endpoint_curation.is_dir():
        return False, 'Curation endpoint path does not exist.\n'

    # os.listdir returns entries in arbitrary order; sort them so the same
    # file is always selected when several candidates are present
    curation_files = sorted(
        f for f in os.listdir(endpoint_curation)
        if f.startswith('curated_data') and 'head' not in f
    )

    if not curation_files:
        return False, {
            'code': 0,
            'message': 'curations not found for {} directory'.format(endpoint)
        }

    return True, os.path.join(endpoint_curation, curation_files[0])
Ejemplo n.º 7
0
    def delta_curation(self, curation: str, parameters: str, iformat: str = 'YAML') -> Tuple[bool, str]:
        """
            Load the set of parameters from the configuration file present
            at the curation directory, apply to them the keys found in
            'parameters' and write the result back to the configuration file.

            :param curation: curation endpoint name
            :param parameters: path to a YAML file (iformat 'YAML'), path to
                               a JSON file (iformat 'JSON') or a JSON string
                               (iformat 'JSONS')
            :param iformat: one of 'YAML', 'JSON' or 'JSONS'

            :return bool: True when the parameters were updated, False otherwise
            :return str: 'OK' or a message describing the failure
        """

        # loadYaml_curation returns a (success, message) tuple; a tuple is
        # always truthy, so it must be unpacked to detect a failure
        success, message = self.loadYaml_curation(curation)
        if not success:
            return False, message

        # parse the new parameters according to the declared format
        if iformat == 'JSONS':
            try:
                newp = json.loads(parameters)
            except Exception as e:
                return False, str(e)
        elif iformat in ('YAML', 'JSON'):
            try:
                with open(parameters, 'r') as pfile:
                    if iformat == 'YAML':
                        newp = yaml.safe_load(pfile)
                    else:
                        newp = json.load(pfile)
            except Exception as e:
                return False, str(e)
        else:
            # without this guard newp would be left undefined
            return False, 'unknown parameters format "{}"'.format(iformat)

        self.applyDelta_curation(newp)

        # dump internal dict to the parameters file
        parameters_file_path = utils.curation_tree_path(curation)
        parameters_file_name = os.path.join(parameters_file_path, 'curation_parameters.yaml')

        try:
            with open(parameters_file_name, 'w') as pfile:
                yaml.dump(self.p, pfile)
        except Exception:
            return False, 'unable to write parameters'

        return True, 'OK'
Ejemplo n.º 8
0
def action_list(curation_dir: str) -> Tuple[bool, str]:
    """
        If no argument is provided lists all endpoints present at the
        repository, otherwise lists all files for the endpoint provided as
        argument ('.json' files are skipped). The listing is written to
        standard error.

        :param curation_dir: path to the endpoint in curation repo

        :return bool: True when the listing was produced, False otherwise
        :return str: summary of the listing or a message describing the failure
    """

    # if no name is provided, just list the different curation dirs
    if not curation_dir:
        rdir = utils.curation_repository_path()
        if not os.path.isdir(rdir):
            return False, 'the curation repository path does not exist. Please run "datacur -c config".\n'

        num_curs = 0
        sys.stderr.write('Curation endpoints found in repository:\n')
        for x in os.listdir(rdir):
            xpath = os.path.join(rdir, x)
            # discard if the item is not a directory
            if not os.path.isdir(xpath):
                continue
            num_curs += 1
            creation_date = get_creation_date(xpath)
            sys.stderr.write("\n{} {}\n".format(x, creation_date))

        sys.stderr.write(
            "\nRetrieved list of curation endpoints from {}\n".format(rdir))

        return True, "{} endpoints found".format(num_curs)

    # if a path name is provided, list files
    base_path = utils.curation_tree_path(curation_dir)

    # report an unknown endpoint instead of letting os.listdir raise
    if not os.path.isdir(base_path):
        return False, 'Curation endpoint {} not found'.format(curation_dir)

    num_files = 0
    sys.stderr.write(
        'Files found in curation endpoint {}:\n'.format(curation_dir))
    for x in os.listdir(base_path):
        if x.endswith('.json'):
            continue
        num_files += 1
        xpath = os.path.join(base_path, x)
        creation_date = get_creation_date(xpath)
        sys.stderr.write("\n{} {}\n".format(x, creation_date))

    return True, "Endpoint {} has {} files".format(curation_dir, num_files)
Ejemplo n.º 9
0
def action_new(curation_path: str) -> Tuple[bool, str]:
    """
        Create a new curation endpoint tree, using the given name, and copy
        the template 'curation_parameters.yaml' file into it.

        :param curation_path: curation endpoint in curation repository where output will be saved

        :return bool: True when everything has worked, otherwise False.
        :return str: strings that would be the equivalent to the standard error.
    """

    if not curation_path:
        return False, 'empty endpoint curation label\n'

    # importlib does not allow using 'test' and issues a mysterious error when we
    # try to use this name. This is a simple workaround to prevent creating paths
    # with this name
    if curation_path == 'test':
        return False, 'the name "test" is disallowed, please use any other name'

    # curation endpoint directory
    ndir = pathlib.Path(utils.curation_tree_path(curation_path))

    # check if there is already a tree for this endpoint
    if ndir.exists():
        return False, "Endpoint {} already exists\n".format(curation_path)

    try:
        ndir.mkdir(parents=True)
    except OSError:
        return False, "Unable to create path for {} endpoint".format(
            curation_path)
    sys.stderr.write("{} created\n".format(ndir))

    # the parameters template is stored next to this source file
    wkd = pathlib.Path(os.path.dirname(os.path.abspath(__file__)))
    params_path = wkd / 'children' / 'curation_parameters.yaml'

    try:
        shutil.copy(params_path, ndir)
    except OSError:
        # do not leave a half-created endpoint behind: it would make any
        # further attempt fail with "already exists"
        shutil.rmtree(ndir, ignore_errors=True)
        return False, "Unable to copy parameters file for {} endpoint".format(
            curation_path)

    sys.stderr.write("New endpoint {} created\n".format(curation_path))

    return True, "new endpoint {} created".format(curation_path)
Ejemplo n.º 10
0
def action_remove(curation_endpoint: str) -> Tuple[bool, str]:
    """
        Remove the curation endpoint directory indicated as
        argument

        :param curation_endpoint: curation endpoint to be removed

        :return bool: True when the directory was removed, False otherwise
        :return str: message describing the outcome
    """

    if not curation_endpoint:
        return False, 'Empty curation endpoint'

    rdir = utils.curation_tree_path(curation_endpoint)
    if not os.path.isdir(rdir):
        return False, '{} not found'.format(curation_endpoint)

    # best-effort removal: errors are ignored so that as much as possible
    # is deleted, and the outcome is verified afterwards
    shutil.rmtree(rdir, ignore_errors=True)
    if os.path.isdir(rdir):
        return False, 'Unable to remove curation endpoint dir {}'.format(
            curation_endpoint)

    sys.stderr.write("Curation endpoint dir {} has been removed\n".format(
        curation_endpoint))

    return True, "Curation endpoint dir {} has been removed".format(
        curation_endpoint)
Ejemplo n.º 11
0
    def calculate_data_stats(self, dataframe: pd.DataFrame):
        """
            Collect the statistics of a curated dataframe and store them as
            a pickled dictionary in the 'statistics.pkl' file of the endpoint.

            The dictionary holds the result of
            get_number_of_processed_vs_unprocessed under 'curation_stats' and
            the result of get_total_of_smiles_per_type_of_substance under
            'substance_types'.

            :param dataframe: curated data dataframe
        """

        # compute both summaries before touching the file system
        processed_counts = self.get_number_of_processed_vs_unprocessed(dataframe)
        substance_counts = self.get_total_of_smiles_per_type_of_substance(dataframe)

        summary = {
            'curation_stats': processed_counts,
            'substance_types': substance_counts,
        }

        # the file name is appended to the endpoint name before the path
        # is resolved by utils.curation_tree_path
        relative_name = '/'.join((self.endpoint, 'statistics.pkl'))
        target_file = utils.curation_tree_path(relative_name)

        with open(target_file, 'wb') as handle:
            pickle.dump(summary, handle)
Ejemplo n.º 12
0
    def update_file_curation(self, curation: str) -> Union[Tuple[bool,str], bool]:
        """
            Function to save current parameter values modified
            at the object level (i.e: From a interactive python shell)
            to the 'curation_parameters.yaml' file of the curation endpoint.

            :param curation: curation endpoint name

            :return: True when the file was written, otherwise a
                     (False, message) tuple describing the failure
        """

        p = self.p
        if not p:
            return False, 'No loaded parameters'

        parameters_file_path = utils.curation_tree_path(curation)
        parameters_file_name = os.path.join(parameters_file_path,
                                            'curation_parameters.yaml')

        try:
            with open(parameters_file_name, 'w') as pfile:
                yaml.dump(p, pfile)
        except Exception as e:
            # report the message as text, as declared in the signature
            return False, str(e)
        return True
Ejemplo n.º 13
0
def action_export(curation_endpoint: str) -> Tuple[bool, str]:
    """
        Exports the directories found in the curation endpoint tree indicated
        in the argument as a single tarball file with the same name, created
        in the current working directory.

        :param curation_endpoint: path to curation endpoint in the repo.

        :return bool: True when the tarball was created, False otherwise
        :return str: message describing the outcome
    """

    if not curation_endpoint:
        return False, 'Empty endpoint name'

    exportfile = os.path.join(os.getcwd(), curation_endpoint + '.tgz')

    base_path = utils.curation_tree_path(curation_endpoint)

    if not os.path.isdir(base_path):
        return False, 'Unable to export, endpoint directory not found'

    # the members are added by full path and stored under their bare name
    # (arcname); this yields the same archive layout as compressing from
    # inside base_path, without changing the process working directory
    with tarfile.open(exportfile, 'w:gz') as tar:
        for item in sorted(os.listdir(base_path)):
            item_path = os.path.join(base_path, item)
            # NOTE(review): only directories are exported, plain files at
            # the top of the endpoint are skipped - confirm this is intended
            if not os.path.isdir(item_path):
                continue
            tar.add(item_path, arcname=item)

    sys.stderr.write("Endpoint {} exported as {}.tgz\n".format(
        curation_endpoint, curation_endpoint))

    return True, "Endpoint {} exported as {}.tgz".format(
        curation_endpoint, curation_endpoint)