def test_update_from_obograph(self):
     # Adding this to cope with odd issues with file_path when running python modules on different systems
     # Loads a small obograph JSON fixture, commits it, then checks that one
     # known Property and one known Class arrived with the expected labels.
     p = get_file_path("uk/ac/ebi/vfb/neo4j/test/resources/vfb_ext.json")
     print(p)
     self.ni.update_from_obograph(file_path=p)
     self.ni.commit()
     # RO_0002350 from the fixture should now exist as a :Property node.
     result = self.ni.nc.commit_list(["MATCH (p:Property) WHERE p.iri = 'http://purl.obolibrary.org/obo/RO_0002350' RETURN p.label as label"])
     dc = results_2_dict_list(result)
     assert dc[0]['label'] == 'member_of'
     
     # VFB_10000005 from the fixture should now exist as a :Class node.
     result = self.ni.nc.commit_list(["MATCH (p:Class) WHERE p.iri = 'http://purl.obolibrary.org/obo/fbbt/vfb/VFB_10000005' RETURN p.label as label"])
     dc = results_2_dict_list(result)
     assert dc[0]['label'] == 'cluster'
    def gen_neuron2channel(self):
        """Return a dict mapping neuron individual short_form -> channel IRI.

        Uses self.neuron_neuropil_overlaps (dicts with a 'shortFormID' key)
        to select neurons, then queries the graph in chunks of 500 for the
        channel individual that 'depicts' each neuron.

        Note: there is scope to modify the overlap cutoff upstream to use a
        proportion of domain voxel size rather than a simple cutoff.
        """
        neurons = [d['shortFormID'] for d in self.neuron_neuropil_overlaps]
        rdl = []
        for c in chunks(l=neurons, n=500):
            # Commit one statement per chunk.  (Previously statements were
            # appended to a list that was committed every iteration, so each
            # earlier chunk's query was re-run and its rows duplicated.)
            statement = "MATCH (neuron_ind:Individual)<-[r:Related]-(neuron_channel:Individual) " \
                        "WHERE r.short_form = 'depicts' AND neuron_ind.short_form IN %s " \
                        "RETURN  neuron_ind.short_form, neuron_channel.iri" % str(c)
            result = self.nc.commit_list([statement])
            # Check if query returns
            if result:
                rdl.extend(results_2_dict_list(result))
        # Neuron to channel lookup
        return {r['neuron_ind.short_form']: r['neuron_channel.iri']
                for r in rdl}
    def gen_BrainName_mapping(self):
        """Returns a dict mapping BrainName_abbv to channel IRI.

        Pulls BrainName abbreviation -> owl class rows from the SQL DB, then
        resolves each class to its JFRC2_template-registered channel in the
        graph.  Rows with zero or multiple channel matches are skipped with
        a warning.
        """
        self.cursor.execute("SELECT b2o.BrainName_abbv, oe.shortFormID, o.baseURI " \
                       "FROM BrainName_to_owl b2o " \
                       "JOIN owl_class oe ON (oe.id=b2o.owl_class_id) " \
                       "JOIN ontology o ON (o.id=oe.ontology_id)")

        channel_query = ("MATCH (neuropil:Class)<-[:INSTANCEOF]-(a:Individual) "
                         "<-[:Related { short_form : 'depicts'}]-(neuropil_channel:Individual)"
                         "-[:in_register_with]->(bc) "
                         "-[:Related { short_form : 'depicts'}]->(t:Individual)"
                         "WHERE neuropil.short_form = '%s' "
                         "AND t.label = 'JFRC2_template'"
                         "RETURN neuropil_channel.iri ")
        mapping = {}
        for row in self.cursor.fetchall():
            abbv = row["BrainName_abbv"]
            results = self.nc.commit_list([channel_query % row["shortFormID"]])
            hits = results_2_dict_list(results)
            if not hits:
                warnings.warn("No channel mapping for %s" % abbv)
            elif len(hits) > 1:
                warnings.warn("Multiple channel mappings for %s" % abbv)
            else:
                mapping[abbv] = hits[0]["neuropil_channel.iri"]
        return mapping
# Exemple #4
# 0
    def _generate_lookups(self, conf):
        """Generate :Class name:ID lookups from DB for loading by label.

        Lookups are defined by standard config that specifies a config:
        name:field:regex, e.g. {'part_of': {'short_form': 'FBbt_.+'}}
        name should match the kwarg for which it is to be used.
        """
        # This is just rolling a relation lookup...
        out = {}
        if not conf:
            return out
        # Add some type checking here?
        for name, specs in conf.items():
            entries = {}
            for spec in specs:
                cypher = "MATCH (c%s) where c.%s =~ '%s' RETURN c.label as label" \
                         ", c.short_form as short_form" % (spec.neo_label_string, spec.field, spec.regex)
                rows = results_2_dict_list(self.ew.nc.commit_list([cypher]))
                for row in rows:
                    entries[escape_string_for_neo(row['label'])] = row['short_form']
            out[name] = entries
        return out
def query(query):
    """Run a single Cypher statement via the module-level connection.

    Returns the result as a list of dicts, or False when the commit fails
    or yields no rows.
    """
    response = nc.commit_list([query])
    if not response:
        return False
    rows = results_2_dict_list(response)
    return rows if rows else False
def get_lookup(limit_by_prefix=None):
    """Build a label -> obo_id lookup from the VFB production DB.

    Covers :VFB:Class nodes (optionally restricted to obo_id prefixes
    listed in limit_by_prefix) plus all :Property nodes; property entries
    overwrite class entries on label clashes.
    """
    if limit_by_prefix:
        regex_string = ':.+|'.join(limit_by_prefix) + ':.+'
        where = " AND a.obo_id =~ '%s' " % regex_string
    else:
        where = ''
    nc = neo4j_connect("https://pdb.virtualflybrain.org", "neo4j", "neo4j")
    lookup = {}
    queries = (
        "MATCH (a:VFB:Class) WHERE exists (a.obo_id)" + where + " RETURN a.obo_id as id, a.label as name",
        "MATCH (p:Property) WHERE exists(p.obo_id) RETURN p.obo_id as id, p.label as name",
    )
    for cypher in queries:
        for row in results_2_dict_list(nc.commit_list([cypher])):
            lookup[row['name']] = row['id']
    return lookup
def query_ind_count(query):
    """Run a Cypher query expected to return an 'ind_count' column.

    Returns the value of 'ind_count' from the first row, or False when
    the commit fails, returns no rows, or lacks an 'ind_count' column
    (a warning is issued in the last case).
    """
    q = nc.commit_list([query])
    if not q:
        return False
    dc = results_2_dict_list(q)
    if not dc:
        return False
    # Idiomatic membership test (was: not ('ind_count' in dc[0].keys())).
    if 'ind_count' not in dc[0]:
        warnings.warn("Query has no ind_count")
        return False
    return dc[0]['ind_count']
    def testMoveNodeLabels(self):
        """Create a bare node in the target DB, run move_node_labels, and
        check the node gains the 'Individual' label."""
        self.ncm.To.commit_list(["CREATE (n { short_form : 'VFB_00000002' })"])
        self.ncm.move_node_labels(
            match="MATCH (n { short_form : 'VFB_00000002' })",
            node_key='short_form')
        labelled = self.ncm.To.commit_list([
            "MATCH (n { short_form : 'VFB_00000002' })"
            "RETURN labels(n) as nlab"
        ])
        rows = results_2_dict_list(labelled)
        assert 'Individual' in rows[0]['nlab']
    def test_commit_from_remote_csv(self):
        """Quite a minimal test: load a remote CSV via commit_csv, then
        query one known row back out."""
        csv_url = 'https://neo4j.com/docs/developer-manual/3.3/csv/artists-with-headers.csv'
        self.nc.commit_csv(
            url=csv_url,
            statement="MERGE (:Artist { name: line.Name, year: toInteger(line.Year)})")

        result = self.nc.commit_list(
            ["MATCH (a:Artist { name: 'ABBA'})"
             "RETURN a.year as y"])
        for row in results_2_dict_list(result):
            # Note - this required cast from string to int (toInteger above).
            assert row['y'] == 1992
def gen_simple_report(terms):
    """Return per-term dicts (short_form, label, description, synonyms,
    definition pubs, superclass) for the given list of class IRIs."""
    nc = neo4j_connect("https://pdb.virtualflybrain.org", "neo4j", "neo4j")
    cypher = """MATCH (n:Class) WHERE n.iri in %s WITH n 
                OPTIONAL MATCH  (n)-[r]->(p:pub) WHERE r.typ = 'syn' 
                WITH n, 
                COLLECT({ synonym: r.synonym, PMID: 'PMID:' + p.PMID, 
                    miniref: p.label}) AS syns 
                OPTIONAL MATCH (n)-[r]-(p:pub) WHERE r.typ = 'def' 
                with n, syns, 
                collect({ PMID: 'PMID:' + p.PMID, miniref: p.label}) as pubs
                OPTIONAL MATCH (n)-[:SUBCLASSOF]->(super:Class)
                RETURN n.short_form as short_form, n.label as label, 
                n.description as description, syns, pubs,
                super.label, super.short_form
                 """ % str(terms)
    response = nc.commit_list([cypher])
    return results_2_dict_list(response)
 def testMoveEdges(self):
     # Move an Individual and a Class to the target DB, then move the
     # edge between them and check it arrives with type INSTANCEOF.
     self.ncm.move_nodes(
         match="MATCH (n:Individual { short_form : 'VFB_00000001' })",
         key='iri')
     self.ncm.move_nodes(match="MATCH (n:Class { label : 'neuron' }) ",
                         key='iri')
     self.ncm.move_edges(
         match="MATCH (s:Individual { short_form : 'VFB_00000001'})"
         "-[r]-(o:Class { label : 'neuron' })",
         node_key='iri')
     # Query the target ('To') DB for the moved edge.
     query = self.ncm.To.commit_list([
         "MATCH (s:Individual { short_form : 'VFB_00000001' })"
         "-[r]-(o:Class { label : 'neuron' }) "
         "RETURN type(r) as rtype"
     ])
     query_results = results_2_dict_list(query)
     for q in query_results:
         assert q['rtype'] == 'INSTANCEOF'
def gen_report(server, query, report_name, column_order=None):
    """Generates a pandas dataframe with
    the results of a cypher query against the
    specified server.
    Args:
        server: server connection as [endpoint, usr, pwd]
        query: cypher query
        report_name: df.name
        column_order: optionally specify column order in df."""
    connection = neo4j_connect(*server)
    print(query)
    raw = connection.commit_list([query])
    records = results_2_dict_list(raw)
    df = pd.DataFrame.from_records(records)
    # Blank out NaNs so the report renders cleanly.
    df.replace(np.nan, '', regex=True, inplace=True)
    df.name = report_name
    if not column_order:
        return df
    ordered = df[column_order]
    ordered.name = report_name
    return ordered
# Exemple #13
# 0
def roll_cypher_add_syn_pub_link(sfid, s, pub_id_typ, pub_id):
    """Generates a Cypher statement that links an existing class
    to a pub node via a has_reference edge carrying synonym details.

    Args:
        sfid: short_form of the existing :Class.
        s: synonym dict with 'name', 'scope' and 'type' keys.
        pub_id_typ: raw pub ID type, normalised via clean_pub_id_typ.
        pub_id: publication identifier value.
    """
    # Escape characters that would break the double-quoted Cypher string.
    # The previous re.sub("'", "\'", ...) was a no-op: "\'" == "'" in
    # Python, so nothing was ever escaped.
    label = s['name'].replace('\\', '\\\\').replace('"', '\\"')
    return  "MATCH (a:Class { short_form : \"%s\" }) " \
            "MERGE (p:pub:Individual { %s : \"%s\" }) " \
            "MERGE (a)-[:has_reference { typ : \"syn\", scope: \"%s\", synonym : \"%s\", cat: \"%s\" }]->(p)" \
            "" % (sfid, clean_pub_id_typ(sfid, pub_id_typ), pub_id, s['scope'], label, s['type'])


nc.commit_list(["MERGE (:pub:Individual { FlyBase: 'Unattributed' })"])
q = nc.commit_list([
    "MATCH (c) where c:Class or c:Individual return c.short_form as short_form, c.obo_synonym as syns, c.obo_definition_citation as def"
])
dc = results_2_dict_list(q)
statements = []
for d in dc:
    if d['def']:
        for cit in d['def']:
            if cit:
                def_cit = json.loads(cit)
                for ref in def_cit['oboXrefs']:
                    if ref['id']:
                        statements.append(
                            roll_cypher_add_def_pub_link(
                                sfid=d['short_form'],
                                pub_id=ref['id'],
                                pub_id_typ=ref['database'],
                            ))
    elif d['syns']:
# Exemple #14
# 0
def make_catmaid_vfb_reports(cat_papers, cat_skids, dataset_name):
    """Make comparison with data in VFB for given sets of papers and skids in CATMAID.

    Outputs a file of numbers of SKIDs per paper and a file of CATMAID SKIDs that are not in VFB.

    Args:
        cat_papers: DataFrame of CATMAID papers, indexed by paper ID
            (the index is iterated per paper below).
        cat_skids: DataFrame of CATMAID skeletons with 'paper_id' and
            'skid' columns.
        dataset_name: prefix for the three output .tsv report files.
    """
    save_directory = "../VFB_reporting_results/CATMAID_SKID_reports/"
    comparison_outfile = save_directory + dataset_name + "_comparison.tsv"
    skids_outfile = save_directory + dataset_name + "_new_skids.tsv"
    neuron_skids_outfile = save_directory + dataset_name + "_neuron_only_skids.tsv"

    # Get table of names of catmaid datasets in VFB
    pub_query = "MATCH (api:API)<-[dsxref:hasDbXref]-(ds:DataSet) " \
                "WHERE api.short_form ends with '_catmaid_api' " \
                "RETURN toInteger(dsxref.accession) as CATMAID_ID, ds.short_form as VFB_name"
    q = nc.commit_list([pub_query])
    papers = results_2_dict_list(q)

    vfb_papers = pd.DataFrame.from_dict(papers)
    vfb_papers = vfb_papers.set_index("CATMAID_ID")

    # match up SKIDs per paper and output dict of lists of skids
    skids_by_paper = {}
    for paper_id in cat_papers.index:  # do everything per paper

        # get list of skids in CATMAID data as strings
        skids_in_paper_cat = [
            str(s)
            for s in cat_skids[cat_skids['paper_id'] == paper_id]['skid']
        ]

        # get skids from VFB KB and reformat to list of strings
        # (one row per skid x class annotation pair)
        query = "MATCH (api:API)<-[dsxref:hasDbXref]-(ds:DataSet)" \
                "<-[:has_source]-(i:Individual)" \
                "-[skid:hasDbXref]->(s:Site) " \
                "WHERE api.short_form ends with '_catmaid_api' " \
                "AND s.short_form starts with 'catmaid_' " \
                "AND dsxref.accession = '" + str(paper_id) +"' WITH i, skid " \
                "MATCH (i)-[:INSTANCEOF]-(c:Class) " \
                "RETURN distinct skid.accession AS `r.catmaid_skeleton_ids`, c.iri"

        q = nc.commit_list([query])
        skids_in_paper_vfb = results_2_dict_list(q)
        vfb_skid_classes_df = pd.DataFrame.from_dict(skids_in_paper_vfb)

        # count skids only annotated as 'neuron'
        try:
            unique_skids = list(
                set([x
                     for x in vfb_skid_classes_df['r.catmaid_skeleton_ids']]))
        except KeyError:  # if dataframe is empty make empty list of skids and empty df with named columns
            unique_skids = []
            vfb_skid_classes_df = pd.DataFrame(
                columns=['c.iri', 'r.catmaid_skeleton_ids'])

        # generic 'neuron' class IRIs (FBbt and CL)
        neuron_iris = [
            'http://purl.obolibrary.org/obo/FBbt_00005106',
            'http://purl.obolibrary.org/obo/CL_0000540'
        ]
        # a skid is 'neuron only' when every one of its class annotations
        # is in neuron_iris (row counts with and without the filter match)
        neuron_only_skids = []
        for skid in unique_skids:
            if len(vfb_skid_classes_df[vfb_skid_classes_df['r.catmaid_skeleton_ids'] == skid].index) == \
                    len(vfb_skid_classes_df[(vfb_skid_classes_df['r.catmaid_skeleton_ids'] == skid)
                                            & (vfb_skid_classes_df['c.iri'].isin(neuron_iris))].index):
                neuron_only_skids.append(skid)

        skids_in_paper_vfb = unique_skids  # take unique records only

        # comparison of lists of skids
        cat_not_vfb = [
            s for s in skids_in_paper_cat if s not in skids_in_paper_vfb
        ]
        vfb_not_cat = [
            s for s in skids_in_paper_vfb if s not in skids_in_paper_cat
        ]
        skids_by_paper[paper_id] = {
            'skids_in_paper_cat': skids_in_paper_cat,
            'skids_in_paper_vfb': skids_in_paper_vfb,
            'cat_not_vfb': cat_not_vfb,
            'vfb_not_cat': vfb_not_cat,
            'neuron_only': neuron_only_skids
        }

    # make dataframe of list lengths from skid_df
    skids_df = pd.DataFrame.from_dict(skids_by_paper,
                                      orient='index')  # df of lists
    skids_df_count = skids_df.applymap(lambda x: len(x))

    # make combined table with all info, tidy up and save as tsv
    all_papers = pd.merge(cat_papers,
                          vfb_papers,
                          left_index=True,
                          right_index=True,
                          how='left',
                          sort=True)
    all_papers = pd.concat([all_papers, skids_df_count],
                           join="outer",
                           axis=1,
                           sort=True)
    all_papers.rename(columns={
        'name': 'CATMAID_name',
        'VFB_name': 'VFB_name',
        'skids_in_paper_cat': 'CATMAID_SKIDs',
        'skids_in_paper_vfb': 'VFB_SKIDS',
        'cat_not_vfb': 'CATMAID_not_VFB',
        'vfb_not_cat': 'VFB_not_CATMAID',
        'neuron_only': 'neuron_only'
    },
                      inplace=True)
    all_papers.index.name = 'Paper_ID'
    all_papers.to_csv(comparison_outfile, sep="\t")

    # make unique set of skids in vfb
    vfb_skid_list = [
        skid for skidlist in skids_df['skids_in_paper_vfb']
        for skid in skidlist if skid is not None
    ]
    vfb_skid_list = list(set(vfb_skid_list))
    vfb_skid_list = [int(x) for x in vfb_skid_list]

    # filter cat_skids dataframe (df_skids from get_catmaid_papers) to remove rows where skid in VFB
    new_skids_output = cat_skids[~cat_skids['skid'].isin(vfb_skid_list)].sort_values('skid') \
        .reindex(columns=(cat_skids.columns.tolist() + ['FBbt_ID']))
    new_skids_output.to_csv(skids_outfile, sep="\t",
                            index=False)  # output file

    # make unique set of skids annotated only as neuron in VFB
    vfb_neuron_skid_list = [
        skid for skidlist in skids_df['neuron_only'] for skid in skidlist
        if skid is not None
    ]
    vfb_neuron_skid_list = list(set(vfb_neuron_skid_list))
    vfb_neuron_skid_list = [int(x) for x in vfb_neuron_skid_list]

    # output file with skids only annotated as neuron
    neuron_skids_output = cat_skids[cat_skids['skid'].isin(vfb_neuron_skid_list)].sort_values('paper_id') \
        .reindex(columns=(cat_skids.columns.tolist() + ['FBbt_ID']))
    neuron_skids_output.to_csv(neuron_skids_outfile, sep="\t", index=False)

    # TERMINAL OUTPUT
    new_papers = all_papers[all_papers.VFB_name.isnull()]
    print(
        str(len(new_papers.index)) +
        " new papers in CATMAID that are not in VFB")
    print(new_papers["CATMAID_name"])
    print("See " + comparison_outfile + " for differences in numbers of SKIDs")
    print("See " + skids_outfile + " for new SKIDs that are not yet in VFB")
# Exemple #15
# 0
            print("Result: True")
        return True
    else:
        print("Testing assertion:" + description)
        print(query2)
        print("Result: inds_in_datset: %d ; Compliant with pattern: %d" % (r1['ind_count'],  r2['ind_count']))
        # Should probably turn this into a report
        bad_inds = list(set(r1['ind_list']) - set(r2['ind_list']))
        file = open(dataset + ".report", 'w')
        file.write(json.dumps(bad_inds))
        file.close()
        return False


# Check every DataSet's Individuals: prefer the old :has_source linking
# schema, falling back to the newer :Annotation {short_form:'source'}
# schema when the old one yields no individuals.
datasets = nc.commit_list(["MATCH (ds:DataSet) RETURN ds.label"])
dc = results_2_dict_list(datasets)

return_state = True

for d in dc:
    ds = d['ds.label']
    dataset_status = True
    print("\n")
    print ("Testing: " + ds)
    # Shared tail: count distinct individuals and collect their short_forms.
    final_clauses = " WHERE ds.label = '%s' RETURN COUNT (DISTINCT i) as ind_count" \
                    ", COLLECT(i.short_form) as ind_list" % ds
    base_query = "MATCH (ds:DataSet)<-[:has_source]-(i:Individual)"
    new_base_query = "MATCH (ds:DataSet)<-[:Annotation { short_form: 'source'}]-(i:Individual)"
    if query_ind_count(base_query + final_clauses) == 0:
        if query_ind_count(new_base_query + final_clauses):
            base_query = new_base_query
# Exemple #16
# 0
    def get_ind_id(self, i: VfbInd, context):
        """Resolve a VfbInd to its VFB short_form ID.

        Tries, in order:
          1. xref lookup (i.xref_db + i.xref_acc) via the Site's hasDbXref edge;
          2. i.id, cross-checked against the KB label when i.label is given;
          3. i.id alone, with a warning that no crosscheck was possible.

        On failure: calls self.warn, sets self.stat = False and returns
        False (or falls through returning None on the xref path, as before).
        """
        if i.xref_db and i.xref_acc:
            # query to find id
            # FIX: the original concatenation was missing a space before
            # RETURN ("...'%s'RETURN..."), producing invalid Cypher.
            query = "MATCH (s:Site)<-[r:hasDbXref]-(i:Individual) " \
                    "WHERE s.short_form = '%s' " \
                    "AND r.accession = '%s' " \
                    "RETURN i.short_form as vfb_id" % (i.xref_db, i.xref_acc)
            q = self.ew.nc.commit_list([query])
            if not q:
                self.warn(context_name="%s individual" % context,
                          context=i,
                          message="VFB subject query fail"
                          )  # Better make exception ?
                self.stat = False  # Better try except here?
            else:
                r = results_2_dict_list(q)
                if len(r) == 1:
                    return r[0]['vfb_id']
                elif not r:
                    self.warn(context_name='%s_individual' % context,
                              context=i,
                              message="Unknown xref: %s:%s"
                              "" % (i.xref_db, i.xref_acc))
                    self.stat = False
                else:
                    self.warn(
                        context_name='%s_individual' % context,
                        context=i,
                        message='Multiple matches for xref: %s:%s - %s'
                        '' %
                        (i.xref_db, i.xref_acc, str([x['vfb_id'] for x in r])))
                    self.stat = False

        elif i.id:
            if i.label:
                # subject label used to double-check against DB when ID provided.
                # Is this the behavior we want?
                # TODO - Fix for new object schema
                query = """MATCH (i:Individual { short_form: '%s'})
                           RETURN i.label as name""" % i.id
                q = self.ew.nc.commit_list([query])
                if not q:
                    self.warn(context_name="%s individual" % context,
                              context=i,
                              message="VFB %s query fail" %
                              context)  # Better make exception ?
                    self.stat = False
                else:
                    r = results_2_dict_list(q)
                    if not r:
                        self.warn(context_name='%s_individual' % context,
                                  context=i,
                                  message="Unknown individual: %s"
                                  "" % i.id)
                        self.stat = False
                        return False
                    if len(r) > 1:
                        # FIX: the original left 'name' unbound on this path
                        # and fell through to the comparison below, raising
                        # NameError.  Warn and fail instead.
                        self.warn(context_name='%s_individual' % context,
                                  context=i,
                                  message="Multiple matches for ID: %s" % i.id)
                        self.stat = False
                        return False
                    name = r[0]['name']
                    if name == i.label:
                        return i.id
                    else:
                        self.warn(context_name="%s individual" % context,
                                  context=i,
                                  message="You provided label: (%s) and ID: (%s), " \
                                          "but in the KB this IS matches %s " \
                                          "" % (i.label, i.id, r[0]['name']))
                        self.stat = False
            else:
                self.warn(
                    context_name="%s individual" % context,
                    context=i,
                    message=
                    "No name provided, so adding relationship to individual without name/ID crosscheck"
                )
                return i.id