Example #1
    def process_variant_to_gene_relationships(self, variant_nodes: list, writer: WriterDelegator):
        """Look up variant-to-gene relationships for a batch of variant nodes and write them to the graph."""
        all_results = self.genetics_services.get_variant_to_gene(self.crawl_for_service, variant_nodes)
        for source_node_id, results in all_results.items():
            # convert the simple edges and nodes to rags objects and write them to the graph
            for (edge, node) in results:
                gene_node = KNode(node.id, type=node.type, name=node.name, properties=node.properties)
                if self.recreate_sv_node:
                    variant_node = KNode(source_node_id, type=node_types.SEQUENCE_VARIANT)
                    variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                    writer.write_node(variant_node)
                if gene_node.id not in self.written_genes:
                    writer.write_node(gene_node)
                    self.written_genes.add(gene_node.id)

                predicate = LabeledID(identifier=edge.predicate_id, label=edge.predicate_label)
                gene_edge = KEdge(source_id=source_node_id,
                                  target_id=gene_node.id,
                                  provided_by=edge.provided_by,
                                  ctime=edge.ctime,
                                  original_predicate=predicate,
                                  # standard_predicate=predicate,
                                  input_id=edge.input_id,
                                  properties=edge.properties)
                writer.write_edge(gene_edge)
            logger.info(f'added {len(results)} variant relationships for {source_node_id}')
Example #2
class GWASCatalog(Service):
    def __init__(self, rosetta):
        self.is_cached_already = False
        self.genetics_normalizer = GeneticsNormalizer()
        self.rosetta = rosetta
        self.writer = WriterDelegator(rosetta)
        self.version = '2020/05/04'
        self.sequence_variant_export_labels = None

    def process_gwas(self):
        # main entry point
        gwas_file = self.get_gwas_file()
        self.parse_gwas_file(gwas_catalog=gwas_file)

    def get_gwas_file(self):
        """
        Get the gwas file
        :return: Array of lines in the `gwas-catalog-associations_ontology-annotated.tsv` file
        """
        # pin a specific release instead of 'latest' so downloads are reproducible and traceable
        self.query_url = f'ftp.ebi.ac.uk/pub/databases/gwas/releases/{self.version}/' \
                         f'gwas-catalog-associations_ontology-annotated.tsv'
        ftpsite = 'ftp.ebi.ac.uk'
        ftpdir = f'/pub/databases/gwas/releases/{self.version}'
        ftpfile = 'gwas-catalog-associations_ontology-annotated.tsv'
        ftp = FTP(ftpsite)
        ftp.login()
        ftp.cwd(ftpdir)
        gwas_catalog = []
        ftp.retrlines(f'RETR {ftpfile}', gwas_catalog.append)
        ftp.quit()
        return gwas_catalog

    def parse_gwas_file(self, gwas_catalog):

        try:
            # get column headers
            file_headers = gwas_catalog[0].split('\t')
            pub_med_index = file_headers.index('PUBMEDID')
            p_value_index = file_headers.index('P-VALUE')
            snps_index = file_headers.index('SNPS')
            trait_ids_index = file_headers.index('MAPPED_TRAIT_URI')
        except (IndexError, ValueError) as e:
            logger.error(f'GWAS Catalog failed to parse the file headers ({e})')
            return []

        corrupted_lines = 0
        missing_variant_ids = 0
        missing_phenotype_ids = 0
        variant_to_pheno_cache = defaultdict(set)
        progress_counter = 0
        total_lines = len(gwas_catalog)
        trait_uri_pattern = re.compile(r'[^,\s]+')
        snp_pattern = re.compile(r'[^,;x*\s]+')
        for line in gwas_catalog[1:]:

            line = line.split('\t')
            try:
                # get pubmed id
                pubmed_id = line[pub_med_index]
                # get p-value
                p_value = float(line[p_value_index])
                if p_value == 0:
                    p_value = sys.float_info.min
                # get all traits (possible phenotypes)
                trait_uris = trait_uri_pattern.findall(line[trait_ids_index])
                # find all sequence variants
                snps = snp_pattern.findall(line[snps_index])
            except (IndexError, ValueError) as e:
                corrupted_lines += 1
                logger.warning(f'GWASCatalog corrupted line: {e}')
                continue

            if not (trait_uris and snps):
                corrupted_lines += 1
                logger.warning(f'GWASCatalog corrupted line: {line}')
                continue
            else:
                traits = []
                for trait_uri in trait_uris:
                    try:
                        trait_id = trait_uri.rsplit('/', 1)[1]
                        # ids show up like EFO_123, Orphanet_123, HP_123
                        if trait_id.startswith('EFO'):
                            curie_trait_id = f'EFO:{trait_id[4:]}'
                        elif trait_id.startswith('Orp'):
                            curie_trait_id = f'ORPHANET:{trait_id[9:]}'
                        elif trait_id.startswith('HP'):
                            curie_trait_id = f'HP:{trait_id[3:]}'
                        elif trait_id.startswith('NCIT'):
                            curie_trait_id = f'NCIT:{trait_id[5:]}'
                        elif trait_id.startswith('MONDO'):
                            curie_trait_id = f'MONDO:{trait_id[6:]}'
                        elif trait_id.startswith('GO'):
                            # Biological process or activity
                            # 5k+ of these
                            missing_phenotype_ids += 1
                            continue
                        else:
                            missing_phenotype_ids += 1
                            logger.warning(
                                f'{trait_uri} not a recognized trait format')
                            continue

                        traits.append(curie_trait_id)

                    except IndexError as e:
                        logger.warning(
                            f'trait uri index error:({trait_uri}) not splittable'
                        )

                variant_nodes = set()
                for snp in snps:
                    if snp.startswith('rs'):
                        dbsnp_curie = f'DBSNP:{snp}'
                        variant_node = KNode(dbsnp_curie,
                                             type=node_types.SEQUENCE_VARIANT)
                        # add an export label so the node is routed to the proper queue
                        # and can be batch-normalized by the writer
                        variant_node.add_export_labels(
                            [node_types.SEQUENCE_VARIANT])
                        variant_nodes.add(variant_node)
                    else:
                        missing_variant_ids += 1

                if traits and variant_nodes:
                    props = {'p_value': p_value}
                    for variant_node in variant_nodes:
                        self.writer.write_node(variant_node)
                        for trait_id in traits:
                            variant_to_pheno_edge, phenotype_node = self.create_variant_to_phenotype_components(
                                variant_node,
                                trait_id,
                                None,
                                pubmed_id=pubmed_id,
                                properties=props)
                            self.writer.write_node(phenotype_node)
                            self.writer.write_edge(variant_to_pheno_edge)
            progress_counter += 1
            if progress_counter % 1000 == 0:
                percent_complete = (progress_counter / total_lines) * 100
                logger.info(f'GWASCatalog progress: {int(percent_complete)}%')

    def create_variant_to_phenotype_components(self,
                                               variant_node,
                                               phenotype_id,
                                               phenotype_label,
                                               pubmed_id=None,
                                               properties=None):
        # avoid a mutable default argument; fall back to an empty dict
        if properties is None:
            properties = {}
        phenotype_node = KNode(phenotype_id,
                               name=phenotype_label,
                               type=node_types.DISEASE_OR_PHENOTYPIC_FEATURE)
        pubs = []
        if pubmed_id:
            pubs.append(f'PMID:{pubmed_id}')

        predicate = LabeledID(identifier='RO:0002200', label='has_phenotype')
        edge = self.create_edge(
            variant_node,
            phenotype_node,
            'gwascatalog.sequence_variant_to_disease_or_phenotypic_feature',
            variant_node.id,
            predicate,
            url=self.query_url,
            properties=properties,
            publications=pubs)
        return (edge, phenotype_node)
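# Hypothetical usage sketch (not part of the original source): it assumes the caller has
# already built a configured `rosetta` object, which GWASCatalog hands to its WriterDelegator.
# `process_gwas` then downloads the pinned catalog release over FTP and streams
# variant->phenotype nodes and edges through the writer.
#
#     gwas = GWASCatalog(rosetta)
#     gwas.process_gwas()
#     gwas.writer.flush()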
Example #3
class Program:
    def __init__(self, plan, machine_question, rosetta, program_number):
        # Plan comes from typegraph and contains
        # transitions: a map from a node index to an (operation, output index) pair
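        # Hypothetical illustration of the expected shape, inferred from how process_node
        # consumes the plan below (the operation name is made up):
        #   plan = {0: {1: [{'op': 'some_service.some_op', 'predicate': None}]}}
        # i.e. source node index -> target node index -> list of operation links.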
        self.program_number = program_number
        self.machine_question = machine_question
        self.transitions = plan
        self.rosetta = rosetta
        self.prefix = hashlib.md5(
            (str(plan) + str(machine_question['nodes'])).encode()).hexdigest()
        self.cache = Cache(redis_host=os.environ['BUILD_CACHE_HOST'],
                           redis_port=os.environ['BUILD_CACHE_PORT'],
                           redis_db=os.environ['BUILD_CACHE_DB'],
                           prefix=self.prefix)

        self.cache.flush()
        self.log_program()
        #self.excluded_identifiers=set()
        """
        EXCLUSION CANDIDATES:
        UBERON:0000468 multi-cellular organism
        UBERON:0001062 anatomical entity
        UBERON:0000479 tissue
        UBERON:0000062 organ
        UBERON:0000064 organ part
        UBERON:0000467 anatomical system
        UBERON:0000465 material anatomical entity 
        UBERON:0000061 anatomical structure
        UBERON:0010000 multicellular anatomical structure
        UBERON:0011216 organ system subdivision
        UBERON:0000475 organism subdivision
        UBERON:0002405 immune system
        UBERON:0001016 nervous system
        UBERON:0001017 central nervous system
        UBERON:0001007 digestive system
        UBERON:0004535 cardiovascular system
        UBERON:0000949 endocrine system
        UBERON:0000079 male reproductive system
        UBERON:0001434 skeletal system
        UBERON:0000178 blood
        GO:0044267 cellular protein metabolic processes
        GO:0005515 protein binding
        CL:0000548 animal cell
        CL:0000003 native cell
        CL:0000255 eukaryotic cell
        """
        self.excluded_identifiers = self.rosetta.service_context.config.get(
            'bad_identifiers')
        # {'UBERON:0000064','UBERON:0000475','UBERON:0011216','UBERON:0000062','UBERON:0000465','UBERON:0010000','UBERON:0000061', 'UBERON:0000467','UBERON:0001062','UBERON:0000468', 'UBERON:0000479', 'GO:0044267', 'GO:0005515', 'CL:0000548', 'CL:0000003', 'CL:0000255'}

        self.writer_delegator = WriterDelegator(rosetta)

    def log_program(self):
        logstring = f'Program {self.program_number}\n'
        logstring += 'Nodes: \n'
        for i, cn in enumerate(self.machine_question['nodes']):
            logstring += f' {i}: {cn}\n'
        logstring += 'Transitions:\n'
        for k in self.transitions:
            logstring += f' {k}: {self.transitions[k]}\n'
        total_transitions = len(self.transitions.keys())
        #if  total_transitions < 20:
        #    logger.debug(logstring)
        logger.debug(logstring)
        logger.debug(f'total transitions : {total_transitions}')

    def initialize_instance_nodes(self):
        # No error checking here. You should have caught any malformed questions before this point.
        logger.debug("Initializing program {}".format(self.program_number))

        # Filter out the curies in the question
        non_sv_curies = [
            node.curie for node in self.machine_question['nodes']
            if node.curie and node.type != node_types.SEQUENCE_VARIANT
        ]
        sv_curies = [
            node.curie for node in self.machine_question['nodes']
            if node.curie and node.type == node_types.SEQUENCE_VARIANT
        ]
        #  batch synonymize them all, getting back a dict
        # normalized_node = { <curie> : KNode() }
        normalized_nodes = Synonymizer.batch_normalize_nodes(non_sv_curies)
        normalized_nodes.update(
            Synonymizer.batch_normalize_sequence_variants(sv_curies))
        # go back to the question and start processing its nodes.
        # during processing we don't need to synonymize at any point;
        # each service returns KNodes, and results are batch-synonymized
        # later in the buffered writer.
        for n in self.machine_question['nodes']:
            if n.curie:
                # if the node was not normalized by the synonymization service, promote the question node to a knowledge node
                start_node = normalized_nodes.get(n.curie,
                                                  self.parse_QNode_to_KNode(n))
                self.process_node(start_node, [n.id])
        return

    def parse_QNode_to_KNode(self, qNode: QNode):
        """Incase of synonymization failure question Node is promoted to Knowledge node"""
        return KNode(qNode.curie, type=[qNode.type])

    def process_op(self, link, source_node, history):
        op_name = link['op']
        key = f"{op_name}({Text.upper_curie(source_node.id)})"
        maxtime = timedelta(minutes=2)
        try:
            try:
                results = self.rosetta.cache.get(key)
            except Exception as e:
                # logger.warning(e)
                results = None
            if results is not None:
                logger.debug(f"cache hit: {key} size:{len(results)}")
            else:
                logger.debug(f"exec op: {key}")
                op = self.rosetta.get_ops(op_name)
                start = dt.now()
                results = op(source_node)
                end = dt.now()
                logger.debug(f'Call {key} took {end-start}')
                if (end - start) > maxtime:
                    logger.warning(f"Call {key} exceeded {maxtime}")
                self.rosetta.cache.set(key, results)
                logger.debug(f"cache.set-> {key} length:{len(results)}")
                logger.debug(f"    {[node for _, node in results]}")
            results = list(
                filter(lambda x: x[1].id not in self.excluded_identifiers,
                       results))
            for edge, node in results:
                edge_label = Text.snakify(edge.original_predicate.label)
                wanted_predicate = link['predicate']
                # follow the edge when the link doesn't constrain the predicate, or when the
                # edge label matches the single predicate or appears in the predicate list
                if wanted_predicate is None \
                        or edge_label == wanted_predicate \
                        or (isinstance(wanted_predicate, list) and edge_label in wanted_predicate):
                    self.process_node(node, history, edge)

        except pika.exceptions.ChannelClosed:
            traceback.print_exc()
            raise
        except Exception as e:
            traceback.print_exc()
            logger.warning(f"Error invoking> -- {key}: {e}")

    def process_node(self, node, history, edge=None):
        """
        We've got a new set of nodes (either initial nodes or from a query).  They are attached
        to a particular concept in our query plan. We make sure that they're synonymized and then
        queue up their children
        """
        logger.debug(f'process {node.id}')
        if edge is not None:
            is_source = node.id == edge.source_id
        # Our excluded ids are e.g. UBERONs, but we might have gotten something else like a CARO,
        # so we need to synonymize and then check the identifiers
        if node.id in self.excluded_identifiers:
            return
        try:
            result = annotate_shortcut(node, self.rosetta)
            if result is None:
                logger.debug(f'No annotator found for {node}')
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())
        if edge is not None:
            if is_source:
                edge.source_id = node.id
            else:
                edge.target_id = node.id

        # check the node cache, compare to the provided history
        # to determine which ops are valid
        key = node.id

        # print(node.dump())
        # if edge:
        #     print(edge.dump())
        #print("-"*len(history)+"History: ", history)

        # only add a node if it wasn't cached
        completed = self.cache.get(key)  # set of nodes we've been from here
        #print("-"*len(history)+"Completed: ", completed)
        if completed is None:
            completed = set()
            self.cache.set(key, completed)

        self.writer_delegator.write_node(node)
        #logger.debug(f"Sent node {node.id}")

        # make sure the edge is queued for creation AFTER the node
        if edge:
            self.writer_delegator.write_edge(edge)
            #logger.debug(f"Sent edge {edge.source_id}->{edge.target_id}")

        # quit if we've closed a loop
        if history[-1] in history[:-1]:
            #print("-"*len(history)+"Closed a loop!")
            return

        source_id = history[-1]

        # quit if there are no transitions from this node
        if source_id not in self.transitions:
            return

        destinations = self.transitions[source_id]
        completed = self.cache.get(key)
        for target_id in destinations:
            if not self.transitions[source_id][target_id]:
                continue
            # don't turn around
            if len(history) > 1 and target_id == history[-2]:
                continue
            # don't repeat things
            if target_id in completed:
                continue
            completed.add(target_id)
            self.cache.set(key, completed)
            links = self.transitions[source_id][target_id]
            #print("-"*len(history)+f"Destination: {target_id}")
            for link in links:
                print("-" * len(history) + "Executing: ", link['op'])
                self.process_op(link, node, history + [target_id])

    # TODO: can we somehow capture the paths generated here?

    def run_program(self):
        """Loop over unused nodes, send them to the appropriate operator, and collect the results.
        Keep going until there's no nodes left to process."""
        logger.debug(f"Running program {self.program_number}")
        self.initialize_instance_nodes()
        self.writer_delegator.flush()
        return

    def get_path_descriptor(self):
        """Return a description of valid paths at the concept level.  The point is to have a way to
        find paths in the final graph.  By starting at one end of this, you can get to the other end(s).
        So it assumes an acyclic graph, which may not be valid in the future.  What it should probably
        return in the future (if we still need it) is a cypher query to find all the paths this program
        might have made."""
        path = {}
        used = set()
        node_num = 0
        used.add(node_num)
        while len(used) != len(self.machine_question['nodes']):
            next_node = None
            direction = None
            if node_num in self.transitions:
                putative_next = self.transitions[node_num]['to']
                if putative_next not in used:
                    next_node = putative_next
                    direction = 1
            if next_node is None:
                for putative_next in self.transitions:
                    ts = self.transitions[putative_next]
                    if ts['to'] == node_num:
                        next_node = putative_next
                        direction = -1
            if next_node is None:
                logger.error("How can this be? No path across the data?")
                raise Exception("No path across the data")
            path[node_num] = (next_node, direction)
            node_num = next_node
            used.add(node_num)
        return path
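# Hypothetical usage sketch (not in the original source): `plan`, `machine_question`, and
# `rosetta` are assumed to be produced elsewhere (the planner/typegraph and the question
# parser); a Program then expands the question and flushes its writer.
#
#     program = Program(plan, machine_question, rosetta, program_number=0)
#     program.run_program()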
Example #4
class OntologicalHeirarchy(Service):
    """
    Service that calls uberongraph to resolve subclass relationships between ontological terms.
    """
    def __init__(self):
        self.url = "https://stars-app.renci.org/uberongraph/sparql"
        self.triplestore = TripleStore(self.url)
        self.prefix_set = {
            node_types.DISEASE_OR_PHENOTYPIC_FEATURE: ['HP', 'MONDO'],
            node_types.CELLULAR_COMPONENT: ['CL'],
            node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY: ['GO'],
            node_types.ANATOMICAL_ENTITY: ['UBERON'],
            node_types.CHEMICAL_SUBSTANCE: ['CHEBI']
        }
        self.root_uris = {
            node_types.ANATOMICAL_ENTITY:
            "<http://purl.obolibrary.org/obo/UBERON_0001062>",
            node_types.DISEASE:
            "<http://purl.obolibrary.org/obo/MONDO_0000001>",
            node_types.MOLECULAR_ACTIVITY:
            "<http://purl.obolibrary.org/obo/GO_0003674>",
            node_types.BIOLOGICAL_PROCESS:
            "<http://purl.obolibrary.org/obo/GO_0008150>",
            node_types.CHEMICAL_SUBSTANCE:
            "<http://purl.obolibrary.org/obo/CHEBI_24431>",
            node_types.PHENOTYPIC_FEATURE:
            "<http://purl.obolibrary.org/obo/HP_0000118>",
            node_types.CELL:
            "<http://purl.obolibrary.org/obo/CL_0000000>",
            node_types.CELLULAR_COMPONENT:
            "<http://purl.orolibrary.org/obo/GO_0005575>"
        }
        obo_prefixes = '\n'.join([
            f'PREFIX {pref}: <http://purl.obolibrary.org/obo/{pref}_>'
            for pref in set(
                reduce(lambda x, y: x + y, self.prefix_set.values(), []))
        ])
        self.query = f"""
                    {obo_prefixes}
                    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>        
                    select  ?parent_id ?parent_label ?child_id ?child_label
                    where {{                        
                        ?parent_id rdfs:subClassOf $root_uri .
                        ?child_id rdfs:subClassOf ?parent_id.
                   OPTIONAL {{
                    ?parent_id rdfs:label ?parent_label.
                    ?child_id rdfs:label ?child_label.
                    }}                      
                    }}
                        """
        rosetta = Rosetta()
        self.wdg = WriterDelegator(rosetta)

    def runner(self):
        for node_type, root_iri in self.root_uris.items():
            nodes, edges = self.term_get_ancestors(node_type, root_iri)
            for index, n in enumerate(nodes):
                self.wdg.write_node(n, annotate=False)
                if ((index / len(nodes)) * 100) % 10 == 0:
                    print((index / len(nodes)) * 100, '% complete')
            for index, e in enumerate(edges):
                self.wdg.write_edge(e)
                if index % 100 == 0:
                    self.wdg.flush()
                if ((index / len(edges)) * 100) % 10 == 0:
                    print((index / len(edges)) * 100, '% complete')
        return

    def term_get_ancestors(self, node_type, root_iri):
        results = self.triplestore.query_template(
            template_text=self.query,
            inputs={'root_uri': root_iri},
            outputs=['parent_id', 'parent_label', 'child_id', 'child_label'])
        print('found total ', len(results), ' results.')
        nodes = set()
        edges = set()
        for index, row in enumerate(results):
            # Output type would be same as input type?
            ancestor_node = KNode(Text.obo_to_curie(row['parent_id']),
                                  name=row['parent_label'],
                                  type=node_type)
            child_node = KNode(Text.obo_to_curie(row['child_id']),
                               name=row['child_label'],
                               type=node_type)
            if ancestor_node.id == child_node.id:
                # refrain from adding edge to the node itself
                continue
            predicate = LabeledID(identifier='rdfs:subClassOf',
                                  label='subclass of')
            edge = self.create_edge(
                source_node=child_node,
                target_node=ancestor_node,
                predicate=predicate,
                provided_by='uberongraph.term_get_ancestors',
                input_id=child_node.id)
            nodes.add(child_node)
            nodes.add(ancestor_node)
            edges.add(edge)
        return nodes, edges
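# Hypothetical usage sketch (not in the original source): OntologicalHeirarchy builds its
# own Rosetta and WriterDelegator in __init__, so running it only requires instantiating
# the service and calling runner(), which walks each configured root term.
#
#     OntologicalHeirarchy().runner()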
class KGX_File_parser(Service):
    def __init__(self):
        pass

    def get_nodes_from_file(self, file_name, delimiter: str):
        if not file_name:
            return

        with open(file_name) as nodes_file:
            reader = csv.DictReader(nodes_file, delimiter=delimiter)
            for raw_node in reader:
                labels = [label for label in raw_node['category'].split('|') if label]
                if not labels:
                    labels = ['named_thing']
                node_id = raw_node['id']
                name = raw_node['name']
                node = KNode(node_id, type=labels[0], name=name)
                node.add_export_labels(labels)
                yield node

    def get_edges_from_file(self, file_name, provided_by, delimiter):
        """
        All is stuff is till we get kgx to merge edges. For now creating
        a pattern looking like a robokopservice and let writer handle it.
        :param file_name:
        :return:
        """
        if not file_name:
            return

        bl_resolver = BL_lookup()
        with open(file_name) as edge_file:
            reader = csv.DictReader(edge_file, delimiter=delimiter)
            for raw_edge in reader:
                edge_label = raw_edge['edge_label'].split(':')[-1]
                relation_predicate = raw_edge['relation']
                predicate = LabeledID(
                    identifier=relation_predicate,  # alternatively: bl_resolver.resolve_curie(edge_label)
                    label=edge_label)
                source_node = KNode(raw_edge['subject'])
                target_node = KNode(raw_edge['object'])
                edge = self.create_edge(
                    source_node=source_node,
                    target_node=target_node,
                    input_id=source_node.id,
                    provided_by=provided_by,
                    predicate=predicate,
                )
                edge.standard_predicate = predicate
                yield edge

    def run(self, nodes_file_name, edges_file_name, provided_by, delimiter):
        self.rosetta = Rosetta()
        self.wdg = WriterDelegator(self.rosetta)
        self.wdg.normalized = True

        for node in self.get_nodes_from_file(nodes_file_name, delimiter):
            self.wdg.write_node(node, annotate=False)

        for edge in self.get_edges_from_file(edges_file_name,
                                             provided_by=provided_by,
                                             delimiter=delimiter):
            self.wdg.write_edge(edge)
        self.wdg.flush()
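# Hypothetical usage sketch (not in the original source; file names are made up): the parser
# streams KGX-style node and edge TSVs through the WriterDelegator.
#
#     parser = KGX_File_parser()
#     parser.run('nodes.tsv', 'edges.tsv', provided_by='example_kgx_source', delimiter='\t')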
class Cord19Service(Service):
    def __init__(self):
        self.cord_dir = os.environ.get('CORD_DIR')
        self.rosetta = Rosetta()
        self.writer = WriterDelegator(rosetta=self.rosetta)
        # line counts for reporting
        self.num_edges = self.count_lines_in_file('edges.txt')
        self.num_nodes = self.count_lines_in_file('nodes.txt')

    def count_lines_in_file(self, file_name):
        count = -1  # don't count headers
        with open(os.path.join(self.cord_dir, file_name)) as nodes_file:
            for line in nodes_file:
                count += 1
        return count

    def load_nodes_only(self):
        print('Writing nodes')
        for index, node in self.parse_nodes():
            index += 1
            self.writer.write_node(node)
            if index % 100 == 0:
                print(f'~~~~~~~~~{(index/self.num_nodes)* 100}% complete')

    def load(self, provided_by, limit=0):
        print('writing to graph')
        print('writing nodes')
        self.writer.normalized = True
        for index, node in self.parse_nodes():
            self.writer.write_node(node)
            if index % 1000 == 0:
                print(f'~~~~~~~~~{(index / self.num_nodes) * 100} % complete')
        for index, edge in self.parse_edges(provided_by=provided_by,
                                            limit=limit):
            source_node = KNode(edge.source_id)
            target_node = KNode(edge.target_id)
            self.writer.write_node(source_node)
            self.writer.write_node(target_node)
            self.writer.write_edge(edge)
            if index % 10000 == 0:
                print(f'~~~~~~~~~{(index/self.num_edges)* 100} % complete')
        self.writer.flush()
        print('done writing edges')

    def parse_nodes(self, limit=0):
        """
        Parse nodes.
        :param limit: for testing, read only the first n nodes from the file
        :return: generator yielding (index, KNode) tuples
        """
        print('parsing nodes...')
        limit_counter = 0
        with open(os.path.join(self.cord_dir, 'nodes.txt')) as nodes_file:
            reader = csv.DictReader(nodes_file, delimiter='\t')
            for raw_node in reader:
                # transform headers to KNode attributes
                labels = raw_node.get('semantic_type')
                labels = labels.replace(']', '').replace('[', '').replace(
                    '\\', '').replace("'", '')
                labels = labels.split(',')
                node = KNode(raw_node.get('normalized_curie'),
                             type=labels[0],
                             name=raw_node.get('name'),
                             properties={
                                 'input_term': raw_node.get('input_term')
                             })
                node.add_export_labels(labels)
                limit_counter += 1
                if limit and limit_counter > limit:
                    break
                yield limit_counter - 1, node

    def parse_edges(self, provided_by, limit=0):
        """ Construct KEdges"""
        if not provided_by:
            raise RuntimeError(
                'Error: edge property provided_by is not specified')
        limit_counter = 0
        with open(os.path.join(self.cord_dir, 'edges.txt')) as edges_file:
            reader = csv.DictReader(edges_file, delimiter='\t')
            for edge_raw in reader:
                predicate = LabeledID(identifier='SEMMEDDB:ASSOCIATED_WITH',
                                      label='related_to')
                source_node = KNode(edge_raw['Term1'])
                target_node = KNode(edge_raw['Term2'])
                edge = self.create_edge(source_node=source_node,
                                        target_node=target_node,
                                        input_id=edge_raw['Term1'],
                                        provided_by=provided_by,
                                        predicate=predicate,
                                        publications=[],
                                        properties={
                                            'num_publications':
                                            float(edge_raw['Effective_Pubs']),
                                            'enrichment_p':
                                            float(edge_raw['Enrichment_p'])
                                        })
                edge.standard_predicate = predicate
                limit_counter += 1
                if limit and limit_counter > limit:
                    break
                yield limit_counter - 1, edge

    def parse_covid_pheno(self, phenotypes_file):
        items = []
        self.writer.normalized = True
        with open(phenotypes_file) as csv_file:
            data = csv.DictReader(csv_file, delimiter=',')
            for row in data:
                items.append(row)
        ids = []
        for n in items:
            if n['HP']:
                ids.append(n['HP'])
        import requests
        url = 'https://nodenormalization-sri.renci.org/get_normalized_nodes?'
        curies = '&'.join(list(map(lambda x: f'curie={x}', ids)))
        url += curies
        phenotypes = requests.get(url).json()
        knodes = []
        for n in phenotypes:
            node_data = phenotypes[n]
            i = node_data['id']
            knodes.append(KNode(i['identifier'], type=node_data['type'][0]))

        covid_node = requests.get(
            'https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=MONDO:0100096'
        ).json()
        covid_node = KNode(covid_node['MONDO:0100096']['id']['identifier'],
                           type=covid_node['MONDO:0100096']['type'][0])
        predicate = LabeledID(identifier='RO:0002200', label='has_phenotype')
        self.writer.write_node(covid_node)
        for node, edge_data in zip(knodes, items):
            self.writer.write_node(node)
            props = {}
            if 'Note' in edge_data:
                props = {'notes': edge_data['Note']}
            edge = self.create_edge(source_node=covid_node,
                                    target_node=node,
                                    input_id=covid_node.id,
                                    provided_by='covid_phenotypes_csv',
                                    predicate=predicate,
                                    publications=[],
                                    properties=props)
            edge.standard_predicate = predicate
            self.writer.write_edge(edge)
        self.writer.flush()

    def parse_drug_bank_items(self):
        import requests
        from contextlib import closing
        drug_bank_parsed_tsv = 'https://raw.githubusercontent.com/TranslatorIIPrototypes/CovidDrugBank/master/trials.txt'
        items = []
        tsv_file = requests.get(drug_bank_parsed_tsv).text.split('\n')
        reader = csv.DictReader(tsv_file, delimiter="\t")
        for row in reader:
            items.append(row)
        drug_ids = '&'.join([f"curie={item['source']}" for item in items])
        normalize_url = f"https://nodenormalization-sri.renci.org/get_normalized_nodes?{drug_ids}"
        response = requests.get(normalize_url).json()
        nodes = []
        export_labels_fallback = requests.get(
            'https://bl-lookup-sri.renci.org/bl/chemical_substance/ancestors?version=latest'
        ).json()
        export_labels_fallback.append('chemical_substance')
        for drug_id in response:
            node = None
            if response[drug_id] is None:
                node = KNode(drug_id, type='chemical_substance')
                node.add_export_labels(export_labels_fallback)
            else:
                # else use the synonymized id so edges are merged
                preferred_curie = response[drug_id]['id']['identifier']
                node = KNode(preferred_curie, type="chemical_substance")
            nodes.append(node)
            self.writer.write_node(node)
        self.writer.flush()
        # manually write in_clinical_trial_for edges
        query = lambda source_id, target_id, count: f"""
        MATCH (a:chemical_substance{{id: '{source_id}'}}) , (b:disease{{id:'{target_id}'}})
        Merge (a)-[e:in_clinical_trial_for{{id: apoc.util.md5([a.id, b.id, 'ROBOKOVID:in_clinical_trial_for']), predicate_id: 'ROBOKOVID:in_clinical_trial_for'}}]->(b)
        SET e.edge_source = "https://www.drugbank.ca/covid-19"
        SET e.relation_label = "in_clinical_trial_for"
        SET e.source_database = "drugbank"
        SET e.predicate_id = "ROBOKOVID:in_clinical_trial_for"
        SET e.relation = "in_clinical_trial_for"
        SET e.count = {count}
        """
        with self.rosetta.type_graph.driver.session() as session:
            for source_node, row in zip(nodes, items):
                q = query(source_node.id, row['object'],
                          row['count'])  # assuming  MONDO:0100096 is in there
                session.run(q)

    @staticmethod
    def convert_dict_to_neo4j_dict(d, exclude=[]):
        lines = []
        for k in d:
            if k in exclude:
                continue
            value = d[k]
            if isinstance(value, str):
                value = f"'{value}'"
            lines.append(f"{k}: {value}")
        lines.append('rectified: true')
        return f"{{{','.join(lines)}}}"

    @staticmethod
    def write_edge_copy(
        session,
        source_id,
        row,
        reverse,
    ):
        if reverse:
            target_id = source_id
            source_id = row['other_id']
        else:
            target_id = row['other_id']
        edge_type = row['edge_type']
        edge_properties = Cord19Service.convert_dict_to_neo4j_dict(
            row['e'], ['id'])
        edge = row['e']
        session.run(f"""
        MATCH (a:named_thing{{id:'{source_id}'}}), (b:named_thing{{id:'{target_id}'}})
        WHERE not (a)-[:{edge_type}]-(b)
        MERGE (a)-[e:{edge_type}{{id: apoc.util.md5([a.id, b.id, '{edge['predicate_id']}']), predicate_id: '{edge['predicate_id']}'}}]->(b)
         
        SET e += {edge_properties}        
                """)

    def rectify_relationships(self):
        """
        Gets edges for NCBITaxon:2697049(Covid-19 virus) and links them to MONDO:0100096(Covid-19 disease
        :return:
        """
        disease_id = "MONDO:0100096"
        taxon_id = "NCBITaxon:2697049"
        as_source_query = lambda source_id, other_id: f"""        
        MATCH (a:named_thing{{id:'{source_id}'}})-[e]->(b)
        WHERE b.id <> '{other_id}'
        return e, b.id as other_id , type(e) as edge_type
        """
        as_target_query = lambda target_id, other_id: f"""        
        MATCH (a)-[e]->(b:named_thing{{id:'{target_id}'}})
        WHERE b.id <> '{other_id}'
        return e, a.id as other_id, type(e) as edge_type
        """
        driver = self.rosetta.type_graph.driver
        with self.rosetta.type_graph.driver.session() as session:
            disease_to_things = [
                dict(**row)
                for row in session.run(as_source_query(disease_id, taxon_id))
            ]
        with driver.session() as session:
            things_to_disease = [
                dict(**row)
                for row in session.run(as_target_query(disease_id, taxon_id))
            ]
        with driver.session() as session:
            taxon_to_things = [
                dict(**row)
                for row in session.run(as_source_query(taxon_id, disease_id))
            ]
        with driver.session() as session:
            things_to_taxon = [
                dict(**row)
                for row in session.run(as_target_query(taxon_id, disease_id))
            ]

        for row in disease_to_things:
            with driver.session() as session:
                session.write_transaction(Cord19Service.write_edge_copy,
                                          taxon_id, row, False)
        for row in things_to_disease:
            with driver.session() as session:
                session.write_transaction(Cord19Service.write_edge_copy,
                                          taxon_id, row, True)
        for row in taxon_to_things:
            with driver.session() as session:
                session.write_transaction(Cord19Service.write_edge_copy,
                                          disease_id, row, False)
        for row in things_to_taxon:
            with driver.session() as session:
                session.write_transaction(Cord19Service.write_edge_copy,
                                          disease_id, row, True)
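# Hypothetical usage sketch (not in the original source): Cord19Service reads `nodes.txt`
# and `edges.txt` from the directory named by the CORD_DIR environment variable, writes
# them through its WriterDelegator, and then rectifies virus/disease edges.
#
#     service = Cord19Service()
#     service.load(provided_by='cord19_scigraph')  # provided_by value is illustrative
#     service.rectify_relationships()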