Example #1
    def _logically_verifies_cleaned_ontologies(self) -> None:
        """Logically verifies an ontology by running the ELK deductive logic reasoner. Before running the reasoner
        the instantiated RDFLib object is saved locally.

        Returns:
            None.
        """

        log_str = 'Logically Verifying Ontology'
        print(log_str)
        logger.info('PKT: ' + log_str)

        # save graph in order to run reasoner
        filename = self.temp_dir + '/' + self.ont_file_location
        self.ont_graph.serialize(destination=filename, format='xml')
        command = "{} {} --reasoner elk --run-reasoner --assert-implied -o {}"
        return_code = os.system(
            command.format(self.owltools_location, filename, filename))
        if return_code == 0:
            if isinstance(self.bucket, storage.bucket.Bucket):
                uploads_data_to_gcs_bucket(self.bucket, self.processed_data,
                                           self.temp_dir,
                                           self.ont_file_location)
        else:
            log_str = 'ERROR: Reasoner Finished with Errors - {}: {}'.format(
                filename, return_code)
            logger.error(log_str)
            raise Exception(log_str)

        return None

    def _run(self):
        """Method uploads any log files found in temp_directory from a local directory to a specific directory in a
        Google Cloud Storage Bucket every "n" minutes as specified by the input interval variable. This method runs
        the program it is called with finishes. There is also a back-up timer that will kill the program

        Args:
            bucket: A storage Bucket object specifying a Google Cloud Storage bucket.
            original_data: A string specifying the location of the original_data directory for a specific build.
            temp_directory: A local directory where preprocessed data is stored.
            interval: An integer specifying how often the data should be pushed up to the Google Cloud Storage Bucket.

        Returns:
            None.
        """

        # glob for log files in the log_directory and keep the file name of the first match
        log_file = glob.glob(self.log_directory + '/*.log')[0].split('/')[-1]
        runtime = 0

        while runtime < self.kill_time:
            uploads_data_to_gcs_bucket(self.bucket, self.gcs_bucket_location,
                                       self.log_directory, log_file)
            time.sleep(self.sleep)
            runtime += self.sleep

        return None
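A minimal usage sketch of how a loop like _run above is typically driven: the uploader object is started on a daemon thread so log files keep flowing to the bucket while the main build executes. The LogDaemon class, its constructor arguments, and the stand-in upload function below are illustrative assumptions, not part of the project.

import threading
import time


class LogDaemon:
    """Hypothetical stand-in with the same loop structure as _run above."""

    def __init__(self, upload_fn, sleep=5, kill_time=30):
        self.upload_fn = upload_fn   # callable that pushes the current log file to GCS
        self.sleep = sleep           # seconds between uploads
        self.kill_time = kill_time   # back-up timer that ends the loop

    def _run(self):
        runtime = 0
        while runtime < self.kill_time:
            self.upload_fn()
            time.sleep(self.sleep)
            runtime += self.sleep


uploader = LogDaemon(upload_fn=lambda: print('uploading log file'))
threading.Thread(target=uploader._run, daemon=True).start()
time.sleep(12)  # stands in for the main build; the daemon thread exits with the program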
Example #3
def main():

    start_time = datetime.now()

    # initialize Google Cloud Storage Bucket object and delete prior logs (if present) from current_build directory
    bucket = storage.Client().get_bucket('pheknowlator'); gcs_log_loc = 'temp_build_inprogress/'
    try: deletes_single_file(bucket, gcs_log_loc + '{}'.format(log))
    except NotFound: pass

    # start logger and configure Google Cloud Storage settings
    log_str = ' STARTING PHEKNOWLATOR KNOWLEDGE GRAPH BUILD '
    print('\n\n' + '*' * 10 + log_str + '*' * 10); logger.info('*' * 10 + log_str + '*' * 10)
    uploads_data_to_gcs_bucket(bucket, gcs_log_loc, log_dir, log)

    # run phase 1 of build
    log_str = 'BUILD PHASE 1: DOWNLOADING BUILD DATA'
    print('#' * 35 + '\n' + log_str + '\n' + '#' * 35); logger.info('#' * 5 + log_str + '#' * 5)
    try: run_phase_1()
    except Exception: logger.error('ERROR: Uncaught Exception: {}'.format(traceback.format_exc()))
    uploads_data_to_gcs_bucket(bucket, gcs_log_loc, log_dir, log)

    # run phase 2 build
    log_str = 'BUILD PHASE 2: DATA PRE-PROCESSING'
    print('#' * 35 + '\n' + log_str + '\n' + '#' * 35); logger.info('#' * 5 + log_str + '#' * 5)
    try: run_phase_2()
    except Exception: logger.error('ERROR: Uncaught Exception: {}'.format(traceback.format_exc()))
    uploads_data_to_gcs_bucket(bucket, gcs_log_loc, log_dir, log)

    # print build statistics and upload logging for data preprocessing and ontology cleaning
    runtime = round((datetime.now() - start_time).total_seconds() / 60, 3)
    log_str = ' COMPLETED BUILD PHASES 1-2: {} MINUTES '.format(runtime)
    print('\n\n' + '*' * 10 + log_str + '*' * 10); logger.info(log_str)  # don't delete needed for build monitoring
    logger.info('EXIT BUILD PHASES 1-2')  # don't delete needed for build monitoring
    uploads_data_to_gcs_bucket(bucket, gcs_log_loc, log_dir, log)
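For context, the names used in main() but not defined in the snippet most likely come from project-level imports and globals along these lines (a hedged reconstruction; deletes_single_file, uploads_data_to_gcs_bucket, run_phase_1, run_phase_2, logger, log_dir, and log are project helpers and module globals not shown here):

from datetime import datetime
import traceback

from google.cloud import storage              # provides storage.Client() and the Bucket type used above
from google.cloud.exceptions import NotFound  # raised when the blob being deleted does not exist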
Example #4
    def cleans_ontology_data(self) -> None:
        """Performs all needed ontology cleaning tasks by resolving different types of ontology cleaning steps at the
        individual ontology- and the merged ontology-level, each are described below:
            - Individual Ontologies: (1) Parsing Errors, (2) Identifier Errors, (3) Deprecated/Obsolete Errors, and (4)
              Punning Errors.
            - Merged Ontologies: (1) Identifier Errors, (2) Normalizes Duplicate and Existing Concepts, and (3) Punning
              Errors.

        NOTE. The OWL API, when running the ELK reasoner, seems to add back some of the errors that this script removes.
            Example 1: In the Vaccine Ontology, we fix prefix errors where "PR" is recorded as "PRO". If you save the
            ontology without running the reasoner and reload it, the fix remains.
            Example 2: When we create the human subset of the Protein Ontology, we verify that it contains only a
            single large connected component.
        For both examples, if you run the ELK reasoner, save the ontology with inferences, and re-load it, "PRO" will
        re-appear and the human Protein Ontology subset will contain 3 connected components. Luckily, the merged
        ontologies are not reasoned, so the version used to build knowledge graphs is free of these errors.

        Returns:
            None.
        """

        log_str = '*** CLEANING INDIVIDUAL ONTOLOGY DATA SOURCES ***'
        print(log_str)
        logger.info(log_str)

        for ont in self.ontology_info.keys():
            if ont != self.merged_ontology_filename:
                print('\nProcessing Ontology: {}'.format(ont.upper()))
                logger.info('\nProcessing Ontology: {}'.format(ont.upper()))
                self.ont_file_location, self.ont_graph = ont, self.reads_gcs_bucket_data_to_graph(
                    ont)
                self.updates_ontology_reporter()  # get starting statistics
                self.fixes_ontology_parsing_errors()
                self.fixes_identifier_errors()
                self.removes_deprecated_obsolete_entities()
                self.fixes_punning_errors()
                self._logically_verifies_cleaned_ontologies()
                # read in the cleaned, verified, and updated ontology (now containing inferences)
                log_str = 'Reading in Cleaned Ontology -- Needed to Calculate Final Statistics'
                print(log_str)
                logger.info(log_str)
                self.ont_graph = Graph().parse(self.temp_dir + '/' + ont)
                self.updates_ontology_reporter()  # get finishing statistics
                if self.bucket != '':
                    uploads_data_to_gcs_bucket(self.bucket, self.log_location,
                                               log_dir, log)

        log_str = '*** CLEANING MERGED ONTOLOGY DATA ***'
        print('\n\n' + log_str)
        logger.info(log_str)

        self.ont_file_location = self.merged_ontology_filename
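        # checks_for_downloaded_ontology_data() presumably returns the local file paths of the cleaned individual
        # ontologies, which merge_ontologies() then combines into a single file named ont_file_location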
        individual_ontologies = self.checks_for_downloaded_ontology_data()
        self.merge_ontologies(individual_ontologies, self.temp_dir + '/',
                              self.ont_file_location)
        if self.bucket != '':
            uploads_data_to_gcs_bucket(self.bucket, self.log_location, log_dir,
                                       log)
        log_str = 'Loading Merged Ontology'
        print('\n' + log_str)
        logger.info(log_str)
        self.ont_graph = Graph().parse(self.temp_dir + '/' +
                                       self.ont_file_location)
        self.updates_ontology_reporter()  # get starting statistics
        self.fixes_identifier_errors()
        self.normalizes_duplicate_classes()
        self.normalizes_existing_classes()
        self.fixes_punning_errors()
        self.updates_ontology_reporter()  # get finishing statistics
        if self.bucket != '':
            uploads_data_to_gcs_bucket(self.bucket, self.log_location, log_dir,
                                       log)
        # serializes final ontology graph and uploads graph data and ontology report to gcs
        self.ont_graph.serialize(destination=self.temp_dir + '/' +
                                 self.ont_file_location,
                                 format='xml')
        ontology_file_formatter(self.temp_dir, '/' + self.ont_file_location,
                                self.owltools_location)
        uploads_data_to_gcs_bucket(self.bucket, self.processed_data,
                                   self.temp_dir, self.ont_file_location)
        if self.bucket != '':
            uploads_data_to_gcs_bucket(self.bucket, self.log_location, log_dir,
                                       log)

        log_str = '*** GENERATING ONTOLOGY CLEANING REPORT ***'
        print('\n\n' + log_str)
        logger.info(log_str)
        self.generates_ontology_report()
        if self.bucket != '':
            uploads_data_to_gcs_bucket(self.bucket, self.log_location, log_dir,
                                       log)

        return None
Example #5
    def generates_ontology_report(self) -> None:
        """Parses the ontology_info dictionary in order to create a final ontology report summarizing the cleaning
        results performed on each ontology and the statistics on the final merged set of ontologies.

        Returns:
             None.
        """

        ontology_report_filename = 'ontology_cleaning_report.txt'
        ont_order = sorted([
            x for x in self.ontology_info.keys() if not x.startswith('Phe')
        ]) + [self.ont_file_location]
        with open(self.temp_dir + '/' + ontology_report_filename, 'w') as o:
            o.write('=' * 50 + '\n{}'.format('ONTOLOGY CLEANING REPORT'))
            o.write('\n{}\n'.format(
                str(datetime.datetime.utcnow().strftime('%a %b %d %X UTC %Y')))
                    + '=' * 50 + '\n\n')
            for key in ont_order:
                o.write('\nONTOLOGY: {}\n'.format(key))
                x = self.ontology_info[key]
                if 'Original GCS URL' in x.keys():
                    o.write('\t- Original GCS URL: {}\n'.format(
                        x['Original GCS URL']))
                if 'Processed GCS URL' in x:
                    o.write('\t- Processed GCS URL: {}\n'.format(
                        x['Processed GCS URL']))
                o.write('\t- Statistics:\n\t\t- Before Cleaning: {}\n'.format(
                    x['Starting Statistics']))
                if 'Final Statistics' in x.keys():
                    o.write('\t\t- After Cleaning: {}\n'.format(
                        x['Final Statistics']))
                if 'ValueErrors' in x.keys():
                    o.write('\t- Value Errors: {}\n'.format(x['ValueErrors']))
                if 'IdentifierErrors' in x.keys():
                    o.write('\t- Identifier Errors: {}\n'.format(
                        x['IdentifierErrors']))
                if 'PheKnowLator_MergedOntologies' not in key:
                    if x['Deprecated'] != 'None':
                        o.write('\t- Deprecated Classes:\n')
                        for i in x['Deprecated']:
                            o.write('\t\t- {}\n'.format(str(i)))
                    else:
                        o.write('\t\t\t- {}\n'.format(x['Deprecated']))
                    if x['Obsolete'] != 'None':
                        o.write('\t- Obsolete Classes:\n')
                        for i in x['Obsolete']:
                            o.write('\t\t- {}\n'.format(str(i)))
                    else:
                        o.write('\t\t\t- {}\n'.format(x['Obsolete']))
                o.write('\t- Punning Error:\n\t\t- Classes:\n')
                if x['PunningErrors - Classes'] != 'None':
                    for i in x['PunningErrors - Classes'].split(', '):
                        o.write('\t\t\t- {}\n'.format(i))
                else:
                    o.write('\t\t\t- {}\n'.format(
                        x['PunningErrors - Classes']))
                o.write('\t\t- ObjectProperties:\n')
                if x['PunningErrors - ObjectProperty'] != 'None':
                    for i in x['PunningErrors - ObjectProperty'].split(', '):
                        o.write('\t\t\t- {}\n'.format(i))
                else:
                    o.write('\t\t\t- {}\n'.format(
                        x['PunningErrors - ObjectProperty']))
                if 'Normalized - Duplicates' in x.keys():
                    o.write('\t- Entity Normalization:\n')
                    if x['Normalized - Duplicates'] != 'None':
                        for i in x['Normalized - Duplicates'].split(', '):
                            o.write('\t\t- {}\n'.format(i))
                    else:
                        o.write('\t\t- {}\n'.format(
                            x['Normalized - Duplicates']))
                    o.write(
                        '\t\t- Other Classes that May Need Normalization: {}\n'
                        .format(x['Normalized - NonOnt']))
                    o.write('\t\t- Normalized HGNC IDs: {}\n'.format(
                        x['Normalized - Gene IDs']))
                    o.write(
                        '\t- Deprecated Ontology HGNC Identifiers Needing Alignment:\n'
                    )
                    if x['Normalized - Dep'] != 'None':
                        for i in x['Normalized - Dep']:
                            o.write('\t\t- {}\n'.format(i))
                    else:
                        o.write('\t\t- {}\n'.format(x['Normalized - Dep']))

        uploads_data_to_gcs_bucket(self.bucket, self.processed_data,
                                   self.temp_dir, ontology_report_filename)

        return None
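For orientation, the write() calls above produce a plain-text report shaped roughly like the following; every value is a placeholder, not real output, and the Deprecated/Obsolete and normalization sections appear only where the corresponding keys exist:

==================================================
ONTOLOGY CLEANING REPORT
<Day Mon DD HH:MM:SS UTC YYYY>
==================================================

ONTOLOGY: <ontology file name>
	- Original GCS URL: <url>
	- Processed GCS URL: <url>
	- Statistics:
		- Before Cleaning: <starting statistics>
		- After Cleaning: <final statistics>
	- Punning Error:
		- Classes:
			- <class IRI or None>
		- ObjectProperties:
			- <property IRI or None>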