def __init__(self):
     self.cord_dir = os.environ.get('CORD_DIR')
     self.rosetta = Rosetta()
     self.writer = WriterDelegator(rosetta=self.rosetta)
     # line counts for reporting
     self.num_edges = self.count_lines_in_file('edges.txt')
     self.num_nodes = self.count_lines_in_file('nodes.txt')
Example #2
 def __init__(self, rosetta):
     self.is_cached_already = False
     self.genetics_normalizer = GeneticsNormalizer()
     self.rosetta = rosetta
     self.writer = WriterDelegator(rosetta)
     self.version = '2020/05/04'
     self.sequence_variant_export_labels = None
    def run(self, nodes_file_name, edges_file_name, provided_by, delimiter):
        self.rosetta = Rosetta()
        self.wdg = WriterDelegator(self.rosetta)
        self.wdg.normalized = True

        for node in self.get_nodes_from_file(nodes_file_name, delimiter):
            self.wdg.write_node(node, annotate=False)

        for edge in self.get_edges_from_file(edges_file_name,
                                             provided_by=provided_by,
                                             delimiter=delimiter):
            self.wdg.write_edge(edge)
        self.wdg.flush()
Example #4
    def __init__(self, plan, machine_question, rosetta, program_number):
        # Plan comes from typegraph and contains
        # transitions: a map from a node index to an (operation, output index) pair
        self.program_number = program_number
        self.machine_question = machine_question
        self.transitions = plan
        self.rosetta = rosetta
        self.prefix = hashlib.md5(
            (str(plan) + str(machine_question['nodes'])).encode()).hexdigest()
        self.cache = Cache(redis_host=os.environ['BUILD_CACHE_HOST'],
                           redis_port=os.environ['BUILD_CACHE_PORT'],
                           redis_db=os.environ['BUILD_CACHE_DB'],
                           prefix=self.prefix)

        self.cache.flush()
        self.log_program()
        #self.excluded_identifiers=set()
        """
        EXCLUSION CANDIDATES:
        UBERON:0000468 multi-cellular organism
        UBERON:0001062 anatomical entity
        UBERON:0000479 tissue
        UBERON:0000062 organ
        UBERON:0000064 organ part
        UBERON:0000467 anatomical system
        UBERON:0000465 material anatomical entity 
        UBERON:0000061 anatomical structure
        UBERON:0010000 multicellular anatomical structure
        UBERON:0011216 organ system subdivision
        UBERON:0000475 organism subdivision
        UBERON:0002405 immune system
        UBERON:0001016 nervous system
        UBERON:0001017 central nervous system
        UBERON:0001007 digestive system
        UBERON:0004535 cardiovascular system
        UBERON:0000949 endocrine system
        UBERON:0000079 male reproductive system
        UBERON:0001434 skeletal system
        UBERON:0000178 blood
        GO:0044267 cellular protein metabolic processes
        GO:0005515 protein binding
        CL:0000548 animal cell
        CL:0000003 native cell
        CL:0000255 eukaryotic cell
        """
        self.excluded_identifiers = self.rosetta.service_context.config.get(
            'bad_identifiers')
        # {'UBERON:0000064','UBERON:0000475','UBERON:0011216','UBERON:0000062','UBERON:0000465','UBERON:0010000','UBERON:0000061', 'UBERON:0000467','UBERON:0001062','UBERON:0000468', 'UBERON:0000479', 'GO:0044267', 'GO:0005515', 'CL:0000548', 'CL:0000003', 'CL:0000255'}

        self.writer_delegator = WriterDelegator(rosetta)
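# Illustrative sketch (not from the original source): one plausible shape for the
# `plan`/`transitions` argument, inferred from how the Program class walks it later
# (self.transitions[source_id][target_id] is a list of links, each carrying an 'op'
# name and an optional 'predicate' filter). The operation name below is hypothetical.
example_plan = {
    0: {
        1: [{'op': 'some_service.some_operation', 'predicate': None}]
    }
}
# program = Program(example_plan, machine_question, rosetta, program_number=0)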
Example #5
 def __init__(self):
     self.url = "https://stars-app.renci.org/uberongraph/sparql"
     self.triplestore = TripleStore(self.url)
     self.prefix_set = {
         node_types.DISEASE_OR_PHENOTYPIC_FEATURE: ['HP', 'MONDO'],
         node_types.CELLULAR_COMPONENT: ['CL'],
         node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY: ['GO'],
         node_types.ANATOMICAL_ENTITY: ['UBERON'],
         node_types.CHEMICAL_SUBSTANCE: ['CHEBI']
     }
     self.root_uris = {
         node_types.ANATOMICAL_ENTITY:
         "<http://purl.obolibrary.org/obo/UBERON_0001062>",
         node_types.DISEASE:
         "<http://purl.obolibrary.org/obo/MONDO_0000001>",
         node_types.MOLECULAR_ACTIVITY:
         "<http://purl.obolibrary.org/obo/GO_0003674>",
         node_types.BIOLOGICAL_PROCESS:
         "<http://purl.obolibrary.org/obo/GO_0008150>",
         node_types.CHEMICAL_SUBSTANCE:
         "<http://purl.obolibrary.org/obo/CHEBI_24431>",
         node_types.PHENOTYPIC_FEATURE:
         "<http://purl.obolibrary.org/obo/HP_0000118>",
         node_types.CELL:
         "<http://purl.obolibrary.org/obo/CL_0000000>",
         node_types.CELLULAR_COMPONENT:
         "<http://purl.orolibrary.org/obo/GO_0005575>"
     }
     obo_prefixes = '\n'.join([
         f'PREFIX {pref}: <http://purl.obolibrary.org/obo/{pref}_>'
         for pref in set(
             reduce(lambda x, y: x + y, self.prefix_set.values(), []))
     ])
     self.query = f"""
                 {obo_prefixes}
                 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>        
                 select  ?parent_id ?parent_label ?child_id ?child_label
                 where {{                        
                     ?parent_id rdfs:subClassOf $root_uri .
                     ?child_id rdfs:subClassOf ?parent_id.
                OPTIONAL {{
                 ?parent_id rdfs:label ?parent_label.
                 ?child_id rdfs:label ?child_label.
                 }}                      
                 }}
                     """
     rosetta = Rosetta()
     self.wdg = WriterDelegator(rosetta)
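# For reference, the obo_prefixes block built above expands to one PREFIX line per
# entry in prefix_set (the set() makes the order arbitrary), e.g.:
#   PREFIX HP: <http://purl.obolibrary.org/obo/HP_>
#   PREFIX MONDO: <http://purl.obolibrary.org/obo/MONDO_>
#   PREFIX CL: <http://purl.obolibrary.org/obo/CL_>
#   PREFIX GO: <http://purl.obolibrary.org/obo/GO_>
#   PREFIX UBERON: <http://purl.obolibrary.org/obo/UBERON_>
#   PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>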
Example #6
 def __init__(self, sv_neo4j_credentials, crawl_for_service, recreate_sv_node):
     self.rosetta = Rosetta()
     self.writerDelegator = WriterDelegator(rosetta=self.rosetta)
     self.sv_neo4j_credentials = sv_neo4j_credentials
     self.crawl_for_service = crawl_for_service
     self.genetics_services = GeneticsServices()
     self.recreate_sv_node = recreate_sv_node
     self.written_genes = set()
def check_queue(size):
    rosetta = Rosetta()
    wdg = WriterDelegator(rosetta, push_to_queue=True)
    import time
    # wait a bit before reading the queue

    time.sleep(1)
    res = wdg.channel.queue_declare(queue="neo4j", passive=True)
    return res.method.message_count == size
Example #8
    def process_variant_to_gene_relationships(self, variant_nodes: list, writer: WriterDelegator):
        all_results = self.genetics_services.get_variant_to_gene(self.crawl_for_service, variant_nodes)
        for source_node_id, results in all_results.items():
            # convert the simple edges and nodes to rags objects and write them to the graph
            for (edge, node) in results:
                gene_node = KNode(node.id, type=node.type, name=node.name, properties=node.properties)
                if self.recreate_sv_node:
                    variant_node = KNode(source_node_id, type= node_types.SEQUENCE_VARIANT)
                    variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                    writer.write_node(variant_node)
                if gene_node.id not in self.written_genes:
                    writer.write_node(gene_node)
                    self.written_genes.add(gene_node.id)

                predicate = LabeledID(identifier=edge.predicate_id, label=edge.predicate_label)
                gene_edge = KEdge(source_id=source_node_id,
                                  target_id=gene_node.id,
                                  provided_by=edge.provided_by,
                                  ctime=edge.ctime,
                                  original_predicate=predicate,
                                  # standard_predicate=predicate,
                                  input_id=edge.input_id,
                                  properties=edge.properties)
                writer.write_edge(gene_edge)
            logger.info(f'added {len(results)} variant relationships for {source_node_id}')
def process_queue(pool_id=0, errors={}):
    rosetta = Rosetta()
    wdg = WriterDelegator(rosetta, push_to_queue=True)
    print('starting consumer')
    # send a 'close' message to stop the consumer at the end, assuming this runs after all of the nodes and edges have been queued.
    wdg.flush()
    wdg.close()
    start_consuming(max_retries=-1)
Example #10
def load_gwas_knowledge(rosetta: object, limit: int = None):
    synonymizer = rosetta.synonymizer
    gwas_catalog_dict = rosetta.core.gwascatalog.prepopulate_cache()
    counter = 0
    with WriterDelegator(rosetta) as writer:
        for variant_node, relationships in gwas_catalog_dict.items():
            if relationships:
                writer.write_node(variant_node)
                for (gwas_edge, phenotype_node) in relationships:
                    # these phenotypes are probably already in the DB, but not necessarily
                    writer.write_node(phenotype_node)
                    writer.write_edge(gwas_edge)
            else:
                logger.error(f'GWASCatalog node {variant_node.id} had no phenotypes associated with it.')

            counter += 1
            if limit and counter == limit:
                break
    for r in range(0, num_consumers):
        p_q_partial = partial(process_queue, r, errors)
        error_call_back_partial = partial(write_error_to_file, r)
        success_call_back_partial = partial(write_termination, r)
        pp = pool.apply_async(p_q_partial, [],
                              callback=success_call_back_partial,
                              error_callback=error_call_back_partial)
        finished.append(pp)
    [x.wait() for x in finished]
    pool.close()
    pool.join()


if __name__ == '__main__':
    rosetta = Rosetta()
    wdg = WriterDelegator(rosetta, push_to_queue=True)
    wdg.flush()
    wdg.close()
    # # clear out the queue
    wdg.channel.queue_purge('neo4j')
    # # # # # source nodes len
    source_node_length = 100
    write_to_queue(source_node_length, wdg)
    # # # # expect node_length * 3 in queue
    assert check_queue(source_node_length * 3) == True
    errors = {}
    # start consumer(s)
    start_multiple_consumers(1, errors={})
    # process_queue(1, {})
    print('checking neo4j')
    check_neo4j(source_node_length)
Example #12
class GWASCatalog(Service):
    def __init__(self, rosetta):
        self.is_cached_already = False
        self.genetics_normalizer = GeneticsNormalizer()
        self.rosetta = rosetta
        self.writer = WriterDelegator(rosetta)
        self.version = '2020/05/04'
        self.sequence_variant_export_labels = None

    def process_gwas(self):
        # main entry point
        gwas_file = self.get_gwas_file()
        self.parse_gwas_file(gwas_catalog=gwas_file)

    def get_gwas_file(self):
        """
        Get the gwas file
        :return: Array of lines in the `gwas-catalog-associations_ontology-annotated.tsv` file
        """
        # adding a specific version instead of latest to help track things
        self.query_url = f'ftp.ebi.ac.uk/pub/databases/gwas/releases/{self.version}/' \
                         f'gwas-catalog-associations_ontology-annotated.tsv'
        ftpsite = 'ftp.ebi.ac.uk'
        ftpdir = f'/pub/databases/gwas/releases/{self.version}'
        ftpfile = 'gwas-catalog-associations_ontology-annotated.tsv'
        ftp = FTP(ftpsite)
        ftp.login()
        ftp.cwd(ftpdir)
        gwas_catalog = []
        ftp.retrlines(f'RETR {ftpfile}', gwas_catalog.append)
        ftp.quit()
        return gwas_catalog

    def parse_gwas_file(self, gwas_catalog):

        try:
            # get column headers
            file_headers = gwas_catalog[0].split('\t')
            pub_med_index = file_headers.index('PUBMEDID')
            p_value_index = file_headers.index('P-VALUE')
            snps_index = file_headers.index('SNPS')
            trait_ids_index = file_headers.index('MAPPED_TRAIT_URI')
        except (IndexError, ValueError) as e:
            logger.error(f'GWAS Catalog failed to prepopulate_cache ({e})')
            return []

        corrupted_lines = 0
        missing_variant_ids = 0
        missing_phenotype_ids = 0
        variant_to_pheno_cache = defaultdict(set)
        progress_counter = 0
        total_lines = len(gwas_catalog)
        trait_uri_pattern = re.compile(r'[^,\s]+')
        snp_pattern = re.compile(r'[^,;x*\s]+')
        for line in gwas_catalog[1:]:

            line = line.split('\t')
            try:
                # get pubmed id
                pubmed_id = line[pub_med_index]
                # get p-value
                p_value = float(line[p_value_index])
                if p_value == 0:
                    p_value = sys.float_info.min
                # get all traits (possible phenotypes)
                trait_uris = trait_uri_pattern.findall(line[trait_ids_index])
                # find all sequence variants
                snps = snp_pattern.findall(line[snps_index])
            except (IndexError, ValueError) as e:
                corrupted_lines += 1
                logger.warning(f'GWASCatalog corrupted line: {e}')
                continue

            if not (trait_uris and snps):
                corrupted_lines += 1
                logger.warning(f'GWASCatalog corrupted line: {line}')
                continue
            else:
                traits = []
                for trait_uri in trait_uris:
                    try:
                        trait_id = trait_uri.rsplit('/', 1)[1]
                        # ids show up like EFO_123, Orphanet_123, HP_123
                        if trait_id.startswith('EFO'):
                            curie_trait_id = f'EFO:{trait_id[4:]}'
                        elif trait_id.startswith('Orp'):
                            curie_trait_id = f'ORPHANET:{trait_id[9:]}'
                        elif trait_id.startswith('HP'):
                            curie_trait_id = f'HP:{trait_id[3:]}'
                        elif trait_id.startswith('NCIT'):
                            curie_trait_id = f'NCIT:{trait_id[5:]}'
                        elif trait_id.startswith('MONDO'):
                            curie_trait_id = f'MONDO:{trait_id[6:]}'
                        elif trait_id.startswith('GO'):
                            # Biological process or activity
                            # 5k+ of these
                            missing_phenotype_ids += 1
                            continue
                        else:
                            missing_phenotype_ids += 1
                            logger.warning(
                                f'{trait_uri} not a recognized trait format')
                            continue

                        traits.append(curie_trait_id)

                    except IndexError as e:
                        logger.warning(
                            f'trait uri index error:({trait_uri}) not splittable'
                        )

                variant_nodes = set()
                for n, snp in enumerate(snps):
                    if snp.startswith('rs'):
                        dbsnp_curie = f'DBSNP:{snp}'
                        variant_node = KNode(dbsnp_curie,
                                             type=node_types.SEQUENCE_VARIANT)
                        # add an export label; this ensures the node goes into the proper queue
                        # so we can do batch normalization in the writer.
                        variant_node.add_export_labels(
                            [node_types.SEQUENCE_VARIANT])
                        variant_nodes.add(variant_node)
                    else:
                        missing_variant_ids += 1
                        pass

                if traits and variant_nodes:
                    props = {'p_value': p_value}
                    for variant_node in variant_nodes:
                        self.writer.write_node(variant_node)
                        for trait_id in traits:
                            variant_to_pheno_edge, phenotype_node = self.create_variant_to_phenotype_components(
                                variant_node,
                                trait_id,
                                None,
                                pubmed_id=pubmed_id,
                                properties=props)
                            self.writer.write_node(phenotype_node)
                            self.writer.write_edge(variant_to_pheno_edge)
            progress_counter += 1
            if progress_counter % 1000 == 0:
                percent_complete = (progress_counter / total_lines) * 100
                logger.info(f'GWASCatalog progress: {int(percent_complete)}%')

    def create_variant_to_phenotype_components(self,
                                               variant_node,
                                               phenotype_id,
                                               phenotype_label,
                                               pubmed_id=None,
                                               properties={}):
        phenotype_node = KNode(phenotype_id,
                               name=phenotype_label,
                               type=node_types.DISEASE_OR_PHENOTYPIC_FEATURE)
        pubs = []
        if pubmed_id:
            pubs.append(f'PMID:{pubmed_id}')

        predicate = LabeledID(identifier='RO:0002200', label='has_phenotype')
        edge = self.create_edge(
            variant_node,
            phenotype_node,
            'gwascatalog.sequence_variant_to_disease_or_phenotypic_feature',
            variant_node.id,
            predicate,
            url=self.query_url,
            properties=properties,
            publications=pubs)
        return (edge, phenotype_node)
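    # Minimal usage sketch (assumed, not part of the original source): process_gwas()
    # above is the entry point; it pulls the release file over FTP and streams
    # variant->phenotype nodes and edges through the WriterDelegator.
    #   gwas = GWASCatalog(Rosetta())
    #   gwas.process_gwas()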
    def create_gtex_graph(self,
                          data_directory: str,
                          file_name: str,
                          namespace: str,
                          is_sqtl: bool = False) -> object:
        # init the return value
        ret_val: object = None

        # init a progress counter
        line_counter = 0

        try:
            # get the full path to the input file
            full_file_path = f'{data_directory}{file_name}'

            logger.info(
                f'Creating GTEx graph data elements from file: {full_file_path}'
            )

            # walk through the gtex data file and create/write nodes and edges to the graph
            with WriterDelegator(self.rosetta) as graph_writer:
                # init these outside of try catch block
                curie_hgvs = None
                curie_uberon = None
                curie_ensembl = None

                # open the file and start reading
                with open(full_file_path, 'r') as inFH:
                    # open up a csv reader
                    csv_reader = csv.reader(inFH)

                    # read the header
                    header_line = next(csv_reader)

                    # find relevant indices
                    tissue_name_index = header_line.index('tissue_name')
                    tissue_uberon_index = header_line.index('tissue_uberon')
                    hgvs_index = header_line.index('HGVS')
                    ensembl_id_index = header_line.index('gene_id')
                    pval_nominal_index = header_line.index('pval_nominal')
                    pval_slope_index = header_line.index('slope')

                    try:
                        # for the rest of the lines in the file
                        for line in csv_reader:
                            # increment the counter
                            line_counter += 1

                            # get the data elements
                            tissue_name = line[tissue_name_index]
                            uberon = line[tissue_uberon_index]
                            hgvs = line[hgvs_index]
                            ensembl = line[ensembl_id_index].split(".", 1)[0]
                            pval_nominal = line[pval_nominal_index]
                            slope = line[pval_slope_index]

                            # create curies for the various id values
                            curie_hgvs = f'HGVS:{hgvs}'
                            curie_uberon = f'UBERON:{uberon}'
                            curie_ensembl = f'ENSEMBL:{ensembl}'
                            # create variant, gene and GTEx nodes with the HGVS, ENSEMBL or UBERON expression as the id and name
                            variant_node = KNode(
                                curie_hgvs,
                                name=hgvs,
                                type=node_types.SEQUENCE_VARIANT)
                            variant_node.add_export_labels(
                                [node_types.SEQUENCE_VARIANT])
                            gene_node = KNode(curie_ensembl,
                                              name=ensembl,
                                              type=node_types.GENE)
                            gene_node.add_export_labels([node_types.GENE])
                            gtex_node = KNode(
                                curie_uberon,
                                name=tissue_name,
                                type=node_types.ANATOMICAL_ENTITY)

                            if is_sqtl:
                                # sqtl variant to gene always uses the same predicate
                                predicate = self.variant_gene_sqtl_predicate
                            else:
                                # for eqtl use the polarity of slope to get the direction of expression.
                                # positive value increases expression, negative decreases
                                try:
                                    if float(slope) > 0.0:
                                        predicate = self.increases_expression_predicate
                                    else:
                                        predicate = self.decreases_expression_predicate
                                except ValueError as e:
                                    logger.error(
                                        f"Error casting slope to a float on line {line_counter} (slope - {slope}) {e}"
                                    )
                                    continue

                            # get a MD5 hash int of the composite hyper edge ID
                            hyper_edge_id = self.gtu.get_hyper_edge_id(
                                uberon, ensembl, hgvs)

                            # set the properties for the edge
                            edge_properties = [
                                ensembl, pval_nominal, slope, namespace
                            ]

                            ##########################
                            # data details are ready. write all edges and nodes to the graph DB.
                            ##########################

                            # write out the sequence variant node
                            graph_writer.write_node(variant_node)

                            # write out the gene node
                            if gene_node.id not in self.written_genes:
                                graph_writer.write_node(gene_node)
                                self.written_genes.add(gene_node.id)

                            # write out the anatomical gtex node
                            if gtex_node.id not in self.written_anatomical_entities:
                                graph_writer.write_node(gtex_node)
                                self.written_anatomical_entities.add(
                                    gtex_node.id)

                            # associate the sequence variant node with an edge to the gtex anatomy node
                            self.gtu.write_new_association(
                                graph_writer, variant_node, gtex_node,
                                self.variant_anatomy_predicate, hyper_edge_id,
                                None, True)

                            # associate the gene node with an edge to the gtex anatomy node
                            self.gtu.write_new_association(
                                graph_writer, gene_node, gtex_node,
                                self.gene_anatomy_predicate, 0, None, False)

                            # associate the sequence variant node with an edge to the gene node. also include the GTEx properties
                            self.gtu.write_new_association(
                                graph_writer, variant_node, gene_node,
                                predicate, hyper_edge_id, edge_properties,
                                True)

                            # output some feedback for the user
                            if (line_counter % 250000) == 0:
                                logger.info(
                                    f'Processed {line_counter} variants.')

                            # periodically reset the written-node sets to bound memory use
                            if len(self.written_anatomical_entities) == self.max_nodes:
                                self.written_anatomical_entities = set()
                            if len(self.written_genes) == self.max_nodes:
                                self.written_genes = set()
                    except (KeyError, IndexError) as e:
                        logger.error(
                            f'Exception caught trying to process variant: {curie_hgvs}-{curie_uberon}-{curie_ensembl} at data line: {line_counter}. Exception: {e}, Line: {line}'
                        )

        except Exception as e:
            logger.error(f'Exception caught: Exception: {e}')
            ret_val = e

        # output some final feedback for the user
        logger.info(f'Building complete. Processed {line_counter} variants.')

        # return to the caller
        return ret_val
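    # Illustrative call (hypothetical directory, file name and namespace; the owning
    # class is not shown in this snippet, so `loader` stands in for an instance that
    # carries written_genes, written_anatomical_entities, gtu and the predicates used above):
    #   loader.create_gtex_graph('/data/gtex/', 'signif_variant_gene_pairs.csv',
    #                            namespace='GTEx', is_sqtl=False)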
Example #14
class Program:
    def __init__(self, plan, machine_question, rosetta, program_number):
        # Plan comes from typegraph and contains
        # transitions: a map from a node index to an (operation, output index) pair
        self.program_number = program_number
        self.machine_question = machine_question
        self.transitions = plan
        self.rosetta = rosetta
        self.prefix = hashlib.md5(
            (str(plan) + str(machine_question['nodes'])).encode()).hexdigest()
        self.cache = Cache(redis_host=os.environ['BUILD_CACHE_HOST'],
                           redis_port=os.environ['BUILD_CACHE_PORT'],
                           redis_db=os.environ['BUILD_CACHE_DB'],
                           prefix=self.prefix)

        self.cache.flush()
        self.log_program()
        #self.excluded_identifiers=set()
        """
        EXCLUSION CANDIDATES:
        UBERON:0000468 multi-cellular organism
        UBERON:0001062 anatomical entity
        UBERON:0000479 tissue
        UBERON:0000062 organ
        UBERON:0000064 organ part
        UBERON:0000467 anatomical system
        UBERON:0000465 material anatomical entity 
        UBERON:0000061 anatomical structure
        UBERON:0010000 multicellular anatomical structure
        UBERON:0011216 organ system subdivision
        UBERON:0000475 organism subdivision
        UBERON:0002405 immune system
        UBERON:0001016 nervous system
        UBERON:0001017 central nervous system
        UBERON:0001007 digestive system
        UBERON:0004535 cardiovascular system
        UBERON:0000949 endocrine system
        UBERON:0000079 male reproductive system
        UBERON:0001434 skeletal system
        UBERON:0000178 blood
        GO:0044267 cellular protein metabolic processes
        GO:0005515 protein binding
        CL:0000548 animal cell
        CL:0000003 native cell
        CL:0000255 eukaryotic cell
        """
        self.excluded_identifiers = self.rosetta.service_context.config.get(
            'bad_identifiers')
        # {'UBERON:0000064','UBERON:0000475','UBERON:0011216','UBERON:0000062','UBERON:0000465','UBERON:0010000','UBERON:0000061', 'UBERON:0000467','UBERON:0001062','UBERON:0000468', 'UBERON:0000479', 'GO:0044267', 'GO:0005515', 'CL:0000548', 'CL:0000003', 'CL:0000255'}

        self.writer_delegator = WriterDelegator(rosetta)

    def log_program(self):
        logstring = f'Program {self.program_number}\n'
        logstring += 'Nodes: \n'
        for i, cn in enumerate(self.machine_question['nodes']):
            logstring += f' {i}: {cn}\n'
        logstring += 'Transitions:\n'
        for k in self.transitions:
            logstring += f' {k}: {self.transitions[k]}\n'
        total_transitions = len(self.transitions.keys())
        #if  total_transitions < 20:
        #    logger.debug(logstring)
        logger.debug(logstring)
        logger.debug(f'total transitions : {total_transitions}')

    def initialize_instance_nodes(self):
        # No error checking here. You should have caught any malformed questions before this point.
        logger.debug("Initializing program {}".format(self.program_number))

        # Collect the curies in the question, split by sequence-variant type
        non_sv_curies = [
            n.curie for n in self.machine_question['nodes']
            if n.curie and n.type != node_types.SEQUENCE_VARIANT
        ]
        sv_curies = [
            n.curie for n in self.machine_question['nodes']
            if n.curie and n.type == node_types.SEQUENCE_VARIANT
        ]
        #  batch synonymize them all, getting back a dict
        # normalized_node = { <curie> : KNode() }
        normalized_nodes = Synonymizer.batch_normalize_nodes(non_sv_curies)
        normalized_nodes.update(
            Synonymizer.batch_normalize_sequence_variants(sv_curies))
        # go back to the question and start processing the nodes.
        # during processing we don't need to do synonymization at any point;
        # each service returns a KNode and the results are batch-synonymized
        # later in the buffered writer.
        for n in self.machine_question['nodes']:
            if n.curie:
                # if the node was not normalized by the synonymization service, promote it to a knowledge node.
                start_node = normalized_nodes.get(n.curie,
                                                  self.parse_QNode_to_KNode(n))
                self.process_node(start_node, [n.id])
        return

    def parse_QNode_to_KNode(self, qNode: QNode):
        """Incase of synonymization failure question Node is promoted to Knowledge node"""
        return KNode(qNode.curie, type=[qNode.type])

    def process_op(self, link, source_node, history):
        op_name = link['op']
        key = f"{op_name}({Text.upper_curie(source_node.id)})"
        maxtime = timedelta(minutes=2)
        try:
            try:
                results = self.rosetta.cache.get(key)
            except Exception as e:
                # logger.warning(e)
                results = None
            if results is not None:
                logger.debug(f"cache hit: {key} size:{len(results)}")
            else:
                logger.debug(f"exec op: {key}")
                op = self.rosetta.get_ops(op_name)
                start = dt.now()
                results = op(source_node)
                end = dt.now()
                logger.debug(f'Call {key} took {end-start}')
                if (end - start) > maxtime:
                    logger.warn(f"Call {key} exceeded {maxtime}")
                self.rosetta.cache.set(key, results)
                logger.debug(f"cache.set-> {key} length:{len(results)}")
                logger.debug(f"    {[node for _, node in results]}")
            results = list(
                filter(lambda x: x[1].id not in self.excluded_identifiers,
                       results))
            for edge, node in results:
                edge_label = Text.snakify(edge.original_predicate.label)
                if (link['predicate'] is None
                        or edge_label == link['predicate']
                        or (isinstance(link['predicate'], list)
                            and edge_label in link['predicate'])):
                    self.process_node(node, history, edge)

        except pika.exceptions.ChannelClosed:
            traceback.print_exc()
            raise
        except Exception as e:
            traceback.print_exc()
            log_text = f"  -- {key}"
            logger.warning(f"Error invoking> {log_text}")

    def process_node(self, node, history, edge=None):
        """
        We've got a new set of nodes (either initial nodes or from a query).  They are attached
        to a particular concept in our query plan. We make sure that they're synonymized and then
        queue up their children
        """
        logger.debug(f'process {node.id}')
        if edge is not None:
            is_source = node.id == edge.source_id
        # Our excluded ids are e.g. uberons, but we might have gotten something else like a CARO,
        # so we need to synonymize and then check for identifiers
        if node.id in self.excluded_identifiers:
            return
        try:
            result = annotate_shortcut(node, self.rosetta)
            if result is None:
                logger.debug(f'No annotator found for {node}')
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())
        if edge is not None:
            if is_source:
                edge.source_id = node.id
            else:
                edge.target_id = node.id

        # check the node cache, compare to the provided history
        # to determine which ops are valid
        key = node.id

        # print(node.dump())
        # if edge:
        #     print(edge.dump())
        #print("-"*len(history)+"History: ", history)

        # only add a node if it wasn't cached
        completed = self.cache.get(key)  # set of nodes we've been from here
        #print("-"*len(history)+"Completed: ", completed)
        if completed is None:
            completed = set()
            self.cache.set(key, completed)

        self.writer_delegator.write_node(node)
        #logger.debug(f"Sent node {node.id}")

        # make sure the edge is queued for creation AFTER the node
        if edge:
            self.writer_delegator.write_edge(edge)
            #logger.debug(f"Sent edge {edge.source_id}->{edge.target_id}")

        # quit if we've closed a loop
        if history[-1] in history[:-1]:
            #print("-"*len(history)+"Closed a loop!")
            return

        source_id = history[-1]

        # quit if there are no transitions from this node
        if source_id not in self.transitions:
            return

        destinations = self.transitions[source_id]
        completed = self.cache.get(key)
        for target_id in destinations:
            if not self.transitions[source_id][target_id]:
                continue
            # don't turn around
            if len(history) > 1 and target_id == history[-2]:
                continue
            # don't repeat things
            if target_id in completed:
                continue
            completed.add(target_id)
            self.cache.set(key, completed)
            links = self.transitions[source_id][target_id]
            #print("-"*len(history)+f"Destination: {target_id}")
            for link in links:
                print("-" * len(history) + "Executing: ", link['op'])
                self.process_op(link, node, history + [target_id])

    #CAN I SOMEHOW CAPTURE PATHS HERE>>>>

    def run_program(self):
        """Loop over unused nodes, send them to the appropriate operator, and collect the results.
        Keep going until there are no nodes left to process."""
        logger.debug(f"Running program {self.program_number}")
        self.initialize_instance_nodes()
        self.writer_delegator.flush()
        return

    def get_path_descriptor(self):
        """Return a description of valid paths at the concept level.  The point is to have a way to
        find paths in the final graph.  By starting at one end of this, you can get to the other end(s).
        So it assumes an acyclic graph, which may not be valid in the future.  What it should probably
        return in the future (if we still need it) is a cypher query to find all the paths this program
        might have made."""
        path = {}
        used = set()
        node_num = 0
        used.add(node_num)
        while len(used) != len(self.machine_question['nodes']):
            next = None
            if node_num in self.transitions:
                putative_next = self.transitions[node_num]['to']
                if putative_next not in used:
                    next = putative_next
                    dir = 1
            if next is None:
                for putative_next in self.transitions:
                    ts = self.transitions[putative_next]
                    if ts['to'] == node_num:
                        next = putative_next
                        dir = -1
            if next is None:
                logger.error("How can this be? No path across the data?")
                raise Exception()
            path[node_num] = (next, dir)
            node_num = next
            used.add(node_num)
        return path
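    # Worked example (illustrative): for a linear three-node question where
    # self.transitions[0]['to'] == 1 and self.transitions[1]['to'] == 2,
    # get_path_descriptor() returns {0: (1, 1), 1: (2, 1)} -- each entry maps a node
    # number to (next node, direction), with direction 1 for a forward transition
    # and -1 for a reversed one.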
Example #15
class OntologicalHeirarchy(Service):
    """
    Service that calls uberongraph to resolve subclass relationships between ontological terms
    """
    def __init__(self):
        self.url = "https://stars-app.renci.org/uberongraph/sparql"
        self.triplestore = TripleStore(self.url)
        self.prefix_set = {
            node_types.DISEASE_OR_PHENOTYPIC_FEATURE: ['HP', 'MONDO'],
            node_types.CELLULAR_COMPONENT: ['CL'],
            node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY: ['GO'],
            node_types.ANATOMICAL_ENTITY: ['UBERON'],
            node_types.CHEMICAL_SUBSTANCE: ['CHEBI']
        }
        self.root_uris = {
            node_types.ANATOMICAL_ENTITY:
            "<http://purl.obolibrary.org/obo/UBERON_0001062>",
            node_types.DISEASE:
            "<http://purl.obolibrary.org/obo/MONDO_0000001>",
            node_types.MOLECULAR_ACTIVITY:
            "<http://purl.obolibrary.org/obo/GO_0003674>",
            node_types.BIOLOGICAL_PROCESS:
            "<http://purl.obolibrary.org/obo/GO_0008150>",
            node_types.CHEMICAL_SUBSTANCE:
            "<http://purl.obolibrary.org/obo/CHEBI_24431>",
            node_types.PHENOTYPIC_FEATURE:
            "<http://purl.obolibrary.org/obo/HP_0000118>",
            node_types.CELL:
            "<http://purl.obolibrary.org/obo/CL_0000000>",
            node_types.CELLULAR_COMPONENT:
            "<http://purl.orolibrary.org/obo/GO_0005575>"
        }
        obo_prefixes = '\n'.join([
            f'PREFIX {pref}: <http://purl.obolibrary.org/obo/{pref}_>'
            for pref in set(
                reduce(lambda x, y: x + y, self.prefix_set.values(), []))
        ])
        self.query = f"""
                    {obo_prefixes}
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>        
                    select  ?parent_id ?parent_label ?child_id ?child_label
                    where {{                        
                        ?parent_id rdfs:subClassOf $root_uri .
                        ?child_id rdfs:subClassOf ?parent_id.
                   OPTIONAL {{
                    ?parent_id rdfs:label ?parent_label.
                    ?child_id rdfs:label ?child_label.
                    }}                      
                    }}
                        """
        rosetta = Rosetta()
        self.wdg = WriterDelegator(rosetta)

    def runner(self):
        for node_type, root_iri in self.root_uris.items():
            nodes, edges = self.term_get_ancestors(node_type, root_iri)
            for index, n in enumerate(nodes):
                self.wdg.write_node(n, annotate=False)
                if ((index / len(nodes)) * 100) % 10 == 0:
                    print((index / len(nodes)) * 100, '% complete')
            for index, e in enumerate(edges):
                self.wdg.write_edge(e)
                if index % 100 == 0:
                    self.wdg.flush()
                if ((index / len(edges)) * 100) % 10 == 0:
                    print((index / len(edges)) * 100, '% complete')
        return

    def term_get_ancestors(self, node_type, root_iri):
        results = self.triplestore.query_template(
            template_text=self.query,
            inputs={'root_uri': root_iri},
            outputs=['parent_id', 'parent_label', 'child_id', 'child_label'])
        print('found total ', len(results), ' results.')
        nodes = set()
        edges = set()
        for index, row in enumerate(results):
            # Output type would be same as input type?
            ancestor_node = KNode(Text.obo_to_curie(row['parent_id']),
                                  name=row['parent_label'],
                                  type=node_type)
            child_node = KNode(Text.obo_to_curie(row['child_id']),
                               name=row['child_label'],
                               type=node_type)
            if ancestor_node.id == child_node.id:
                # refrain from adding edge to the node itself
                continue
            predicate = LabeledID(identifier='rdfs:subClassOf',
                                  label='subclass of')
            edge = self.create_edge(
                source_node=child_node,
                target_node=ancestor_node,
                predicate=predicate,
                provided_by='uberongraph.term_get_ancestors',
                input_id=child_node.id)
            nodes.add(child_node)
            nodes.add(ancestor_node)
            edges.add(edge)
        return nodes, edges
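    # Minimal usage sketch (assumed): runner() walks root_uris, pulls the subclass
    # pairs from uberongraph for each root and writes them through the WriterDelegator.
    #   hierarchy_service = OntologicalHeirarchy()
    #   hierarchy_service.runner()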
class KGX_File_parser(Service):
    def __init__(self):
        pass

    def get_nodes_from_file(self, file_name, delimiter: str):
        if not file_name:
            return

        with open(file_name) as nodes_file:
            reader = csv.DictReader(nodes_file, delimiter=delimiter)
            for raw_node in reader:
                labels = list(
                    filter(lambda x: x, raw_node['category'].split('|')))
                if not len(labels):
                    labels = ['named_thing']
                id = raw_node['id']
                name = raw_node['name']
                node = KNode(id, type=labels[0], name=name)
                node.add_export_labels(labels)
                yield node

    def get_edges_from_file(self, file_name, provided_by, delimiter):
        """
        All of this is temporary until we get KGX to merge edges. For now we create
        a pattern that looks like a robokop service and let the writer handle it.
        :param file_name:
        :return:
        """
        if not file_name:
            return

        bl_resolver = BL_lookup()
        with open(file_name) as edge_file:
            reader = csv.DictReader(edge_file, delimiter=delimiter)
            for raw_edge in reader:
                edge_label = raw_edge['edge_label'].split(':')[-1]
                relation_predicate = raw_edge['relation']
                predicate = LabeledID(
                    identifier=relation_predicate,  # bl_resolver.resolve_curie(edge_label)
                    label=edge_label)
                source_node = KNode(raw_edge['subject'])
                target_node = KNode(raw_edge['object'])
                edge = self.create_edge(
                    source_node=source_node,
                    target_node=target_node,
                    input_id=source_node.id,
                    provided_by=provided_by,
                    predicate=predicate,
                )
                edge.standard_predicate = predicate
                yield edge

    def run(self, nodes_file_name, edges_file_name, provided_by, delimiter):
        self.rosetta = Rosetta()
        self.wdg = WriterDelegator(self.rosetta)
        self.wdg.normalized = True

        for node in self.get_nodes_from_file(nodes_file_name, delimiter):
            self.wdg.write_node(node, annotate=False)

        for edge in self.get_edges_from_file(edges_file_name,
                                             provided_by=provided_by,
                                             delimiter=delimiter):
            self.wdg.write_edge(edge)
        self.wdg.flush()
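    # Minimal usage sketch (file names and provided_by value are hypothetical):
    #   parser = KGX_File_parser()
    #   parser.run('nodes.tsv', 'edges.tsv', provided_by='my_kgx_source', delimiter='\t')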
class Cord19Service(Service):
    def __init__(self):
        self.cord_dir = os.environ.get('CORD_DIR')
        self.rosetta = Rosetta()
        self.writer = WriterDelegator(rosetta=self.rosetta)
        # line counts for reporting
        self.num_edges = self.count_lines_in_file('edges.txt')
        self.num_nodes = self.count_lines_in_file('nodes.txt')

    def count_lines_in_file(self, file_name):
        count = -1  # don't count headers
        with open(os.path.join(self.cord_dir, file_name)) as nodes_file:
            for line in nodes_file:
                count += 1
        return count

    def load_nodes_only(self):
        print('Writing nodes')
        for index, node in self.parse_nodes():
            index += 1
            self.writer.write_node(node)
            if index % 100 == 0:
                print(f'~~~~~~~~~{(index/self.num_nodes)* 100}% complete')

    def load(self, provided_by, limit=0):
        print('writing to graph')
        print('writing nodes')
        self.writer.normalized = True
        for index, node in self.parse_nodes():
            self.writer.write_node(node)
            if index % 1000 == 0:
                print(f'~~~~~~~~~{(index / self.num_nodes) * 100} % complete')
        for index, edge in self.parse_edges(provided_by=provided_by,
                                            limit=limit):
            source_node = KNode(edge.source_id)
            target_node = KNode(edge.target_id)
            self.writer.write_node(source_node)
            self.writer.write_node(target_node)
            self.writer.write_edge(edge)
            if index % 10000 == 0:
                print(f'~~~~~~~~~{(index/self.num_edges)* 100} % complete')
        self.writer.flush()
        print('done writing edges')

    def parse_nodes(self, limit=0):
        """
        Parse nodes.
        :param limit: for testing; reads only the first n nodes from the file
        :return: generator yielding (index, KNode) tuples
        """
        print('parsing nodes...')
        limit_counter = 0
        with open(os.path.join(self.cord_dir, 'nodes.txt')) as nodes_file:
            reader = csv.DictReader(nodes_file, delimiter='\t')
            for raw_node in reader:
                # transform headers to knode attributes
                labels = raw_node.get('semantic_type')
                labels = labels.replace(']', '').replace('[', '').replace(
                    '\\', '').replace("'", '')
                labels = labels.split(',')
                node = KNode({
                    'id': raw_node.get('normalized_curie'),
                    'type': labels[0],
                    'name': raw_node.get('name'),
                    'properties': {
                        'input_term': raw_node.get('input_term')
                    }
                })
                node.add_export_labels(labels)
                limit_counter += 1
                if limit and limit_counter > limit:
                    break
                yield limit_counter - 1, node

    def parse_edges(self, provided_by, limit=0):
        """ Construct KEdges"""
        if not provided_by:
            raise RuntimeError(
                "Error: edge property 'provided_by' is not specified")
        limit_counter = 0
        with open(os.path.join(self.cord_dir, 'edges.txt')) as edges_file:
            reader = csv.DictReader(edges_file, delimiter='\t')
            for edge_raw in reader:
                predicate = LabeledID(identifier='SEMMEDDB:ASSOCIATED_WITH',
                                      label='related_to')
                source_node = KNode(edge_raw['Term1'])
                target_node = KNode(edge_raw['Term2'])
                edge = self.create_edge(source_node=source_node,
                                        target_node=target_node,
                                        input_id=edge_raw['Term1'],
                                        provided_by=provided_by,
                                        predicate=predicate,
                                        publications=[],
                                        properties={
                                            'num_publications':
                                            float(edge_raw['Effective_Pubs']),
                                            'enrichment_p':
                                            float(edge_raw['Enrichment_p'])
                                        })
                edge.standard_predicate = predicate
                limit_counter += 1
                if limit and limit_counter > limit:
                    break
                yield limit_counter - 1, edge
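    # For reference (derived from the parsers above): both CORD_DIR files are
    # tab-delimited and only these columns are read --
    #   nodes.txt: normalized_curie, semantic_type, name, input_term
    #   edges.txt: Term1, Term2, Effective_Pubs, Enrichment_p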

    def parse_covid_pheno(self, phenotypes_file):
        items = []
        self.writer.normalized = True
        with open(phenotypes_file) as csv_file:
            data = csv.DictReader(csv_file, delimiter=',')
            for row in data:
                items.append(row)
        ids = []
        for n in items:
            if n['HP']:
                ids.append(n['HP'])
        import requests
        url = 'https://nodenormalization-sri.renci.org/get_normalized_nodes?'
        curies = '&'.join(list(map(lambda x: f'curie={x}', ids)))
        url += curies
        phenotypes = requests.get(url).json()
        knodes = []
        for n in phenotypes:
            node_data = phenotypes[n]
            i = node_data['id']
            knodes.append(KNode(i['identifier'], type=node_data['type'][0]))

        covid_node = requests.get(
            'https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=MONDO:0100096'
        ).json()
        covid_node = KNode(covid_node['MONDO:0100096']['id']['identifier'],
                           type=covid_node['MONDO:0100096']['type'][0])
        predicate = LabeledID(identifier='RO:0002200', label='has_phenotype')
        self.writer.write_node(covid_node)
        for node, edge_data in zip(knodes, items):
            self.writer.write_node(node)
            property = {}
            if 'Note' in edge_data:
                property = {'notes': edge_data['Note']}
            edge = self.create_edge(source_node=covid_node,
                                    target_node=node,
                                    input_id=covid_node.id,
                                    provided_by='covid_phenotypes_csv',
                                    predicate=predicate,
                                    publications=[],
                                    properties=property)
            edge.standard_predicate = predicate
            self.writer.write_edge(edge)
        self.writer.flush()

    def parse_drug_bank_items(self):
        import requests
        from contextlib import closing
        drug_bank_parsed_tsv = 'https://raw.githubusercontent.com/TranslatorIIPrototypes/CovidDrugBank/master/trials.txt'
        items = []
        tsv_file = requests.get(drug_bank_parsed_tsv, ).text.split('\n')
        reader = csv.DictReader(tsv_file, delimiter="\t")
        for row in reader:
            items.append(row)
        drug_ids = '&'.join([f"curie={item['source']}" for item in items])
        normalize_url = f"https://nodenormalization-sri.renci.org/get_normalized_nodes?{drug_ids}"
        response = requests.get(normalize_url).json()
        nodes = []
        export_labels_fallback = requests.get(
            'https://bl-lookup-sri.renci.org/bl/chemical_substance/ancestors?version=latest'
        ).json()
        export_labels_fallback.append('chemical_substance')
        for drug_id in response:
            node = None
            if response[drug_id] is None:
                node = KNode(drug_id, type='chemical_substance')
                node.add_export_labels(export_labels_fallback)
            else:
                # else use the synonymized id so edges are merged
                prefered_curie = response[drug_id]['id']['identifier']
                node = KNode(prefered_curie, type="chemical_substance")
            nodes.append(node)
            self.writer.write_node(node)
        self.writer.flush()
        # manually write in_clinical_trial_for edges
        query = lambda source_id, target_id, count: f"""
        MATCH (a:chemical_substance{{id: '{source_id}'}}) , (b:disease{{id:'{target_id}'}})
        Merge (a)-[e:in_clinical_trial_for{{id: apoc.util.md5([a.id, b.id, 'ROBOKOVID:in_clinical_trial_for']), predicate_id: 'ROBOKOVID:in_clinical_trial_for'}}]->(b)
        SET e.edge_source = "https://www.drugbank.ca/covid-19"
        SET e.relation_label = "in_clinical_trial_for"
        SET e.source_database = "drugbank"
        SET e.predicate_id = "ROBOKOVID:in_clinical_trial_for"
        SET e.relation = "in_clinical_trial_for"
        SET e.count = {count}
        """
        with self.rosetta.type_graph.driver.session() as session:
            for source_node, row in zip(nodes, items):
                q = query(source_node.id, row['object'],
                          row['count'])  # assuming  MONDO:0100096 is in there
                session.run(q)

    @staticmethod
    def convert_dict_to_neo4j_dict(d, exclude=[]):
        lines = []
        for k in d:
            if k in exclude:
                continue
            value = d[k]
            if isinstance(value, str):
                value = f"'{value}'"
            lines.append(f"{k}: {value}")
        lines.append('rectified: true')
        return f"{{{','.join(lines)}}}"

    @staticmethod
    def write_edge_copy(
        session,
        source_id,
        row,
        reverse,
    ):
        if reverse:
            target_id = source_id
            source_id = row['other_id']
        else:
            target_id = row['other_id']
        edge_type = row['edge_type']
        edge_properties = Cord19Service.convert_dict_to_neo4j_dict(
            row['e'], ['id'])
        edge = row['e']
        session.run(f"""
        MATCH (a:named_thing{{id:'{source_id}'}}), (b:named_thing{{id:'{target_id}'}})
        WHERE not (a)-[:{edge_type}]-(b)
        MERGE (a)-[e:{edge_type}{{id: apoc.util.md5([a.id, b.id, '{edge['predicate_id']}']), predicate_id: '{edge['predicate_id']}'}}]->(b)
         
        SET e += {edge_properties}        
                """)

    def rectify_relationships(self):
        """
        Gets edges for NCBITaxon:2697049 (Covid-19 virus) and links them to MONDO:0100096 (Covid-19 disease).
        :return:
        """
        disease_id = "MONDO:0100096"
        taxon_id = "NCBITaxon:2697049"
        as_source_query = lambda source_id, other_id: f"""        
        MATCH (a:named_thing{{id:'{source_id}'}})-[e]->(b)
        WHERE b.id <> '{other_id}'
        return e, b.id as other_id , type(e) as edge_type
        """
        as_target_query = lambda target_id, other_id: f"""        
        MATCH (a)-[e]->(b:named_thing{{id:'{target_id}'}})
        WHERE b.id <> '{other_id}'
        return e, a.id as other_id, type(e) as edge_type
        """
        driver = self.rosetta.type_graph.driver
        with self.rosetta.type_graph.driver.session() as session:
            disease_to_things = [
                dict(**row)
                for row in session.run(as_source_query(disease_id, taxon_id))
            ]
        with driver.session() as session:
            things_to_disease = [
                dict(**row)
                for row in session.run(as_target_query(disease_id, taxon_id))
            ]
        with driver.session() as session:
            taxon_to_things = [
                dict(**row)
                for row in session.run(as_source_query(taxon_id, disease_id))
            ]
        with driver.session() as session:
            things_to_taxon = [
                dict(**row)
                for row in session.run(as_target_query(taxon_id, disease_id))
            ]

        for row in disease_to_things:
            with driver.session() as session:
                session.write_transaction(Cord19Service.write_edge_copy,
                                          taxon_id, row, False)
        for row in things_to_disease:
            with driver.session() as session:
                session.write_transaction(Cord19Service.write_edge_copy,
                                          taxon_id, row, True)
        for row in taxon_to_things:
            with driver.session() as session:
                session.write_transaction(Cord19Service.write_edge_copy,
                                          disease_id, row, False)
        for row in things_to_taxon:
            with driver.session() as session:
                session.write_transaction(Cord19Service.write_edge_copy,
                                          disease_id, row, True)
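
# Minimal usage sketch (assumed; the provided_by tag is hypothetical and CORD_DIR
# must point at a directory containing nodes.txt and edges.txt):
#   service = Cord19Service()
#   service.load(provided_by='cord19_covid_connections')
#   service.rectify_relationships()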