Esempio n. 1
0
def make_authorship_rdf(pub_uri, author_uri, rank, corresponding=False):
    """
    Build the assertions describing one authorship that ties an author to a
    publication.

    A new uri is requested for the authorship and six assertions are made
    about it: two rdf:type assertions (owl:Thing and vivo:Authorship), the
    linked author, the linked information resource, the author rank, and the
    corresponding-author flag (the lower-cased string form of `corresponding`).

    :param pub_uri: uri of the publication the authorship points to
    :param author_uri: uri of the author the authorship points to
    :param rank: value asserted as vivo:authorRank
    :param corresponding: value asserted (as 'true'/'false' for booleans) as
        vivo:isCorrespondingAuthor
    :return: two element list -- the concatenated output of the assert
        helpers, and the uri of the new authorship
    """
    from vivopump import new_uri
    authorship_uri = new_uri()

    # Each entry is the output of one assertion; they are concatenated in order
    fragments = [
        assert_resource_property(authorship_uri, "rdf:type",
                                 untag_predicate("owl:Thing")),
        assert_resource_property(authorship_uri, "rdf:type",
                                 untag_predicate("vivo:Authorship")),
        assert_resource_property(authorship_uri,
                                 "vivo:linkedAuthor", author_uri),
        assert_resource_property(authorship_uri,
                                 "vivo:linkedInformationResource", pub_uri),
        assert_data_property(authorship_uri,
                             "vivo:authorRank", rank),
        assert_data_property(authorship_uri,
                             "vivo:isCorrespondingAuthor",
                             str(corresponding).lower()),
    ]
    return ["".join(fragments), authorship_uri]
Esempio n. 2
0
def do_two_step_update(row, column_name, uri, uri_prefix, column_def, data_update, intra, enum, update_graph,
                       debug=False):
    """
    In a two step update, identify intermediate entity that might need to be created, and end path objects that might
    not yet exist or might need to be created.  Cases are:

                          Predicate Single   Predicate Multiple
    VIVO has 0 values     Add, do_the        Add intermediate, do_the
    VIVO has 1 value         do_the          Set compare through intermediate
    VIVO has >1 value     WARNING, do_the    Set compare through intermediate
    :return: alterations in update graph
    """
    from rdflib import RDF, RDFS, Literal, URIRef
    from vivopump import new_uri
    step_def = column_def[0]

    # Find all the intermediate entities in VIVO and then process cases related to count and defs

    step_uris = [o for s, p, o in get_step_triples(update_graph, uri, step_def, debug)]

    if len(step_uris) == 0:

        # VIVO has no values for intermediate, so add a new intermediate and do_the_update on the leaf

        step_uri = URIRef(new_uri(uri_prefix))
        update_graph.add((uri, step_def['predicate']['ref'], step_uri))
        update_graph.add((step_uri, RDF.type, step_def['object']['type']))
        if 'label' in step_def['object']:
            update_graph.add((step_uri, RDFS.label, Literal(step_def['object']['label'],
                                                            datatype=step_def['object'].get('datatype', None),
                                                            lang=step_def['object'].get('lang', None))))
        uri = step_uri
        step_def = column_def[1]
        vivo_objs = {unicode(o): o for s, p, o in get_step_triples(update_graph, uri, step_def)}
        column_values = prepare_column_values(data_update[column_name], intra, step_def, enum, row,
                                              column_name)
        do_the_update(row, column_name, uri, step_def, column_values, vivo_objs, update_graph,
                      debug=debug)

    elif step_def['predicate']['single']:

        # VIVO has 1 or more values, so we need to see if the predicate is expected to be single

        step_uri = step_uris[0]
        if len(step_uris) > 1:
            print "WARNING: Single predicate", column_name, "has", len(step_uris), "values: ", \
                step_uris, "using", step_uri
        uri = step_uri
        step_def = column_def[1]
        vivo_objs = {unicode(o): o for s, p, o in get_step_triples(update_graph, uri, step_def)}
        column_values = prepare_column_values(data_update[column_name], intra, step_def, enum, row,
                                              column_name)
        do_the_update(row, column_name, uri, step_def, column_values, vivo_objs, update_graph,
                      debug=debug)

    else:
        # TODO: Implement set compare through multiple intermediate case -- medium
        print "WARNING: Updating multi-valued multi-step predicates such as ", column_name, " not yet implemented"
    return None
Esempio n. 3
0
    def __do_three_step_update(self, row, column_name, uri, path, data_update):
        """
        Given the current state in the update, and a path length three column_def, add, change or delete intermediate
        and end objects as necessary to perform the requested update
        :param row: row number of the update.  For printing
        :param column_name: column_name of the update.  For printing
        :param uri: uri of the entity at the head of the path
        :param path: the column definition
        :param data_update: the data provided for the update
        :return: Changes in the update_graph
        """
        from rdflib import RDF, RDFS, Literal, URIRef
        from vivopump import new_uri, get_step_triples

        step_def = path[0]
        step_uris = [
            o for s, p, o in
            get_step_triples(self.update_graph, uri, column_name, step_def,
                             self.query_parms, self.verbose)
        ]

        if len(step_uris) == 0:

            # VIVO has no values for first intermediate, so add new intermediate and do a two step update on it

            step_uri = URIRef(new_uri(self.query_parms))
            self.update_graph.add(
                (uri, step_def['predicate']['ref'], step_uri))
            self.update_graph.add(
                (step_uri, RDF.type, step_def['object']['type']))
            if 'label' in step_def['object']:
                self.update_graph.add(
                    (step_uri, RDFS.label,
                     Literal(step_def['object']['label'],
                             datatype=step_def['object'].get('datatype', None),
                             lang=step_def['object'].get('lang', None))))
            self.__do_two_step_update(row, column_name, step_uri, path[1:],
                                      data_update)

        elif step_def['predicate']['single'] == True:

            #   VIVO has 1 or more values for first intermediate, so we need to see if the predicate
            #   is expected to be single

            step_uri = step_uris[0]
            if len(step_uris) > 1:
                print "WARNING: Single predicate", path[0]['object']['name'], "has", len(step_uris), "values: ", \
                    step_uris, "using", step_uri
            self.__do_two_step_update(row, column_name, step_uri, path[1:],
                                      data_update)
        return None
Esempio n. 4
0
def do_three_step_update(row, column_name, uri, uri_prefix, path, data_update, intra, enum, update_graph, debug=False):
    """
    Given the current state in the update, and a path length three column_def, ad, change or delete intermediate and
    end objects as necessary to perform the requested update
    :param row: row number of the update.  For printing
    :param column_name: column_name of the update.  For printing
    :param uri: uri of the entity at the head of the path
    :param path: the column definition
    :param data_update: the data provided for the update
    :param enum: the enumerations
    :param update_graph: the update graph
    :param debug: debug status. For printing.
    :return: Changes in the update_graph
    """
    from rdflib import RDF, RDFS, Literal, URIRef
    from vivopump import new_uri

    step_def = path[0]
    step_uris = [o for s, p, o in get_step_triples(update_graph, uri, step_def, debug)]

    if len(step_uris) == 0:

        # VIVO has no values for first intermediate, so add new intermediate and do a two step update on it

        step_uri = URIRef(new_uri(uri_prefix))
        update_graph.add((uri, step_def['predicate']['ref'], step_uri))
        update_graph.add((step_uri, RDF.type, step_def['object']['type']))
        if 'label' in step_def['object']:
            update_graph.add((step_uri, RDFS.label, Literal(step_def['object']['label'],
                                                            datatype=step_def['object'].get('datatype', None),
                                                            lang=step_def['object'].get('lang', None))))
        do_two_step_update(row, column_name, step_uri, uri_prefix, path[1:], data_update, intra, enum, update_graph,
                           debug=debug)

    elif step_def['predicate']['single']:

        # VIVO has 1 or more values for first intermediate, so we need to see if the predicate is expected to be single

        step_uri = step_uris[0]
        if len(step_uris) > 1:
            print "WARNING: Single predicate", path[0]['object']['name'], "has", len(step_uris), "values: ", \
                step_uris, "using", step_uri
        do_two_step_update(row, column_name, step_uri, uri_prefix, path[1:], data_update, intra, enum, update_graph,
                           debug=debug)
    return None
Esempio n. 5
0
    def test(self):
        """
        Produce a string report regarding testing the configuration of the pump

        The report lists the configured settings and then tries to obtain a new uri using the query
        parameters.  A successful call is reported as ready for use; a failure is reported with the text of the
        error that was raised.

        :return: the string test report
        :rtype: basestring
        """
        from vivopump import new_uri
        from SPARQLWrapper import SPARQLExceptions
        import urllib2

        # NOTE(review): the report includes the SPARQL API password in clear text -- confirm that is acceptable
        # wherever this report is displayed or stored
        result = str(datetime.now()) + " Test results" + "\n" + \
                 "Update definition\t" + self.json_def_filename + " read.\n" + \
                 "Source file name\t" + self.out_filename + ".\n" + \
                 "Enumerations read.\n" + \
                 "Filters\t" + str(self.filter) + "\n" + \
                 "Verbose\t" + str(self.verbose) + "\n" + \
                 "Intra field separator\t" + self.intra + "\n" + \
                 "Inter field separator\t" + self.inter + "\n" + \
                 "VIVO SPARQL API URI\t" + self.query_parms['queryuri'] + "\n" + \
                 "VIVO SPARQL API username\t" + self.query_parms['username'] + "\n" + \
                 "VIVO SPARQL API password\t" + self.query_parms['password'] + "\n" + \
                 "VIVO SPARQL API prefix\t" + self.query_parms['prefix'] + "\n" + \
                 "Prefix for RDF file names\t" + self.rdfprefix + "\n" + \
                 "Uriprefix for new uri\t" + self.query_parms['uriprefix'] + "\n"

        # HTTPError is a subclass of URLError, so it must be handled before URLError
        try:
            uri = new_uri(self.query_parms)
            result += "Sample new uri\t" + uri + "\n" + \
                "Simple VIVO is ready for use.\n"
        except urllib2.HTTPError as herror:
            result += "Connection to VIVO failed\t" + str(herror) + "\n" + \
                "Check your Simple VIVO configuration and your VIVO permissions.\n"
        except SPARQLExceptions.EndPointNotFound as notfound:
            result += "Connection to VIVO failed\t" + str(notfound) + "\n" + \
                "Check your Simple VIVO configuration and your VIVO API.\n"
        except urllib2.URLError as uerror:
            result += "Connection to VIVO failed\t" + str(uerror) + "\n" + \
                "Check your Simple VIVO configuration and your VIVO API.\n"
        except Exception as error:
            # Any other failure is still reported rather than raised, but SystemExit and KeyboardInterrupt
            # are no longer swallowed, and the error text is included so the cause can be diagnosed
            result += "Connection to VIVO failed\t" + str(error) + "\n" + \
                "Check your Simple VIVO configuration and your VIVO API.\n"

        result += str(datetime.now()) + " Test end"
        return result
Esempio n. 6
0
    def test(self):
        """
        Produce a string report regarding testing the configuration of the pump

        The report lists the configured settings and then tries to obtain a new uri using the query
        parameters.  A successful call is reported as ready for use; a failure is reported with the text of the
        error that was raised.

        :return: the string test report
        :rtype: basestring
        """
        from vivopump import new_uri
        from SPARQLWrapper import SPARQLExceptions
        import urllib2

        # NOTE(review): the report includes the SPARQL API password in clear text -- confirm that is acceptable
        # wherever this report is displayed or stored
        result = str(datetime.now()) + " Test results" + "\n" + \
                 "Update definition\t" + self.json_def_filename + " read.\n" + \
                 "Source file name\t" + self.out_filename + ".\n" + \
                 "Enumerations read.\n" + \
                 "Filters\t" + str(self.filter) + "\n" + \
                 "Verbose\t" + str(self.verbose) + "\n" + \
                 "Intra field separator\t" + self.intra + "\n" + \
                 "Inter field separator\t" + self.inter + "\n" + \
                 "VIVO SPARQL API URI\t" + self.query_parms['queryuri'] + "\n" + \
                 "VIVO SPARQL API username\t" + self.query_parms['username'] + "\n" + \
                 "VIVO SPARQL API password\t" + self.query_parms['password'] + "\n" + \
                 "VIVO SPARQL API prefix\t" + self.query_parms['prefix'] + "\n" + \
                 "Prefix for RDF file names\t" + self.rdfprefix + "\n" + \
                 "Uriprefix for new uri\t" + self.query_parms['uriprefix'] + "\n"

        # HTTPError is a subclass of URLError, so it must be handled before URLError
        try:
            uri = new_uri(self.query_parms)
            result += "Sample new uri\t" + uri + "\n" + \
                "Simple VIVO is ready for use.\n"
        except urllib2.HTTPError as herror:
            result += "Connection to VIVO failed\t" + str(herror) + "\n" + \
                "Check your Simple VIVO configuration and your VIVO permissions.\n"
        except SPARQLExceptions.EndPointNotFound as notfound:
            result += "Connection to VIVO failed\t" + str(notfound) + "\n" + \
                "Check your Simple VIVO configuration and your VIVO API.\n"
        except urllib2.URLError as uerror:
            result += "Connection to VIVO failed\t" + str(uerror) + "\n" + \
                "Check your Simple VIVO configuration and your VIVO API.\n"
        except Exception as error:
            # Any other failure is still reported rather than raised, but SystemExit and KeyboardInterrupt
            # are no longer swallowed, and the error text is included so the cause can be diagnosed
            result += "Connection to VIVO failed\t" + str(error) + "\n" + \
                "Check your Simple VIVO configuration and your VIVO API.\n"

        result += str(datetime.now()) + " Test end"
        return result
Esempio n. 7
0
    def __do_three_step_update(self, row, column_name, uri, path, data_update):
        """
        Handle the first step of a path of length three, then hand the remaining two steps to the two step
        update.

        If the update graph has no intermediate for the first step, one is created (typed, and labelled when
        the definition has a label) and linked to uri.  If intermediates exist and the predicate's 'single'
        entry equals True, the first one is used and a warning is logged when there are several.  If
        intermediates exist and the predicate is not single, nothing is done.

        :param row: row number of the update.  For logger messages
        :param column_name: column_name of the update.  For logger messages
        :param uri: uri of the entity at the head of the path
        :param path: the column definition, a sequence of three step definitions
        :param data_update: the data provided for the update
        :return: None.  Changes are made in self.update_graph
        """
        from rdflib import RDF, RDFS, Literal, URIRef
        from vivopump import new_uri, get_step_triples

        first_def = path[0]
        found = [obj for _, _, obj in get_step_triples(self.update_graph, uri, first_def, self.query_parms)]

        if found:

            #   VIVO has 1 or more values for first intermediate.  Only a single-valued predicate is handled.
            #   The comparison is an equality test against True, not a truthiness test.

            if first_def['predicate']['single'] == True:
                chosen = found[0]
                if len(found) > 1:
                    logger.warning(u"WARNING: Single predicate {} has {} values: {}. Using {}".
                                   format(path[0]['object']['name'], len(found), found, chosen))
                self.__do_two_step_update(row, column_name, chosen, path[1:], data_update)
            return None

        #   VIVO has no values for first intermediate, so add new intermediate and do a two step update on it

        created = URIRef(new_uri(self.query_parms))
        self.update_graph.add((uri, first_def['predicate']['ref'], created))
        self.update_graph.add((created, RDF.type, first_def['object']['type']))
        if 'label' in first_def['object']:
            label = Literal(first_def['object']['label'],
                            datatype=first_def['object'].get('datatype', None),
                            lang=first_def['object'].get('lang', None))
            self.update_graph.add((created, RDFS.label, label))
        self.__do_two_step_update(row, column_name, created, path[1:], data_update)
        return None
Esempio n. 8
0
 def test_parameters(self):
     # Calling new_uri with no arguments must yield a value of non-zero length
     generated = new_uri()
     has_content = len(generated) > 0
     self.assertTrue(has_content)
Esempio n. 9
0
def get_pubmed(pmid, author_uris=None):
    """
    Given a pubmid identifer, return a structure containing the elements
    of the publication of interest to VIVO. Optionally, provide a set of
    author_uris for use in disambiguation.  When find_author returns a
    set of size > 1, the author_uris will be examined for matches to
    assist with disambiguation.
    """
    from vivopump import new_uri
    ardf = ""
    record = get_entrez_record(pmid)
    if record is None:
        return ["", None]
    pub = document_from_pubmed(record)
    if pub['page_end'] == '':
        pub['page_end'] = pub['page_start']
    if pub['date']['month'] == '':
        pub['date']['month'] = '1'
    if pub['date']['day'] == '':
        pub['date']['day'] = '1'
    pub['pub_uri'] = new_uri()
    pub['date_harvested'] = str(datetime.now())
    pub['harvested_by'] = "Python PubMed Add " + __version__
    journal_uri = find_vivo_uri("bibo:issn", pub['issn'])
    if journal_uri is None:
        [add, journal_uri] = make_journal_rdf(pub['journal'], pub['issn'])
        ardf = ardf + add
    pub['journal_uri'] = journal_uri

    pub_date = datetime.strptime(pub['date']['month'] + '/' + pub['date']['day'] + \
                                 '/' + pub['date']['year'], "%m/%d/%Y")
    if pub_date in date_dictionary:
        pub['date_uri'] = date_dictionary[pub_date]
    else:
        [add, pub_date_uri] = make_datetime_rdf(pub_date.isoformat())
        date_dictionary[pub_date] = pub_date_uri
        pub['date_uri'] = pub_date_uri
        ardf = ardf + add

    # Turn each author into a URI reference to an authorship

    pub['authorship_uris'] = []
    for key, author in sorted(pub['authors'].items(), key=lambda x: x[0]):
        try:
            author_uri_set = find_author(author)
        except:
            print "No last name for author", author
            print "Pub\n", pub
            print "Record\n", record
            continue
        if len(author_uri_set) == 0:
            [add, author_uri] = make_author_rdf(author)
            ardf = ardf + add
            print pmid, "Add", author, "at", author_uri
        elif len(author_uri_set) == 1:
            author_uri = list(author_uri_set)[0]
            print pmid, "Found", author, author_uri
        else:
            if author_uris is None:
                author_uri = list(author_uri_set)[0]
                print pmid, "Disambiguate", author, "from", author_uri_set
            else:
                possible_uri_set = author_uri_set.intersection(author_uris)
                if len(possible_uri_set) == 1:
                    author_uri = list(possible_uri_set)[0]
                else:
                    author_uri = list(possible_uri_set)[0]
                    print pmid, "Disambiguate", author, "from", possible_uri_set
            print "Disambiguate:"
            print "  Possible authors in VIVO", author_uri_set
            print "  Possible authors in Source", author_uris
            print "  Selected author", author_uri

        [add, authorship_uri] = make_authorship_rdf(pub['pub_uri'], author_uri,
                                                    key, corresponding=False)
        pub['authorship_uris'].append(authorship_uri)
        ardf = ardf + add

    return [ardf, pub]
Esempio n. 10
0
    def do_update(self):
        """
        read updates from a spreadsheet filename.  Compare to data in VIVO.  Generate add and sub
        rdf as necessary to process requested changes
        """
        from rdflib import URIRef, RDF
        from vivopump import new_uri

        for row, data_update in self.update_data.items():
            uri = URIRef(data_update['uri'])

            if 'remove' in data_update.keys() and data_update['remove'].lower(
            ) == 'true':
                do_remove(row, uri, self.update_graph, self.verbose)
                continue

            if (uri, None, None) not in self.update_graph:

                # If the entity uri can not be found in the update graph, make a new URI ignoring the one in the
                # spreadsheet, if any, and add the URI to the update graph.  Remaining processing is unchanged.
                # Since the new uri does not have triples for the columns in the spreadsheet, each will be added

                uri_string = new_uri(self.uri_prefix)
                if self.verbose:
                    print "Adding an entity for row", row, ".  Will be added at", uri_string
                uri = URIRef(uri_string)
                self.update_graph.add(
                    (uri, RDF.type, self.update_def['entity_def']['type']))
            entity_uri = uri

            for column_name, column_def in self.update_def[
                    'column_defs'].items():
                if column_name not in data_update:
                    continue  # extra column names are allowed in the spreadsheet for annotation
                uri = entity_uri

                if data_update[column_name] == '':
                    continue

                if len(column_def) > 3:
                    raise PathLengthException(
                        "Path lengths > 3 not supported.  Path length for " +
                        column_name + " is " + str(len(column_def)))
                elif len(column_def) == 3:
                    do_three_step_update(row,
                                         column_name,
                                         uri,
                                         self.uri_prefix,
                                         column_def,
                                         data_update,
                                         self.intra,
                                         self.enum,
                                         self.update_graph,
                                         debug=False)
                elif len(column_def) == 2:
                    do_two_step_update(row,
                                       column_name,
                                       uri,
                                       self.uri_prefix,
                                       column_def,
                                       data_update,
                                       self.intra,
                                       self.enum,
                                       self.update_graph,
                                       debug=False)
                elif len(column_def) == 1:
                    step_def = column_def[0]
                    vivo_objs = {}
                    for s, p, o in self.update_graph.triples(
                        (uri, step_def['predicate']['ref'], None)):
                        vivo_objs[unicode(o)] = o
                    column_values = prepare_column_values(
                        data_update[column_name], self.intra, step_def,
                        self.enum, row, column_name)
                    if self.verbose:
                        print row, column_name, column_values, uri, vivo_objs
                    do_the_update(row,
                                  column_name,
                                  uri,
                                  step_def,
                                  column_values,
                                  vivo_objs,
                                  self.update_graph,
                                  debug=self.verbose)

        # Return the add and sub graphs representing the changes that need to be made to the original

        add = self.update_graph - self.original_graph  # Triples in update that are not in original
        if self.verbose:
            print "Triples to add"
            print add.serialize(format='nt')
        sub = self.original_graph - self.update_graph  # Triples in original that are not in update
        if self.verbose:
            print "Triples to sub"
            print sub.serialize(format='nt')
        return [add, sub]
Esempio n. 11
0
def do_two_step_update(row,
                       column_name,
                       uri,
                       uri_prefix,
                       column_def,
                       data_update,
                       intra,
                       enum,
                       update_graph,
                       debug=False):
    """
    In a two step update, identify intermediate entity that might need to be created, and end path objects that might
    not yet exist or might need to be created.  Cases are:

                          Predicate Single   Predicate Multiple
    VIVO has 0 values     Add, do_the        Add intermediate, do_the
    VIVO has 1 value         do_the          Set compare through intermediate
    VIVO has >1 value     WARNING, do_the    Set compare through intermediate
    :return: alterations in update graph
    """
    from rdflib import RDF, RDFS, Literal, URIRef
    from vivopump import new_uri
    step_def = column_def[0]

    # Find all the intermediate entities in VIVO and then process cases related to count and defs

    step_uris = [
        o for s, p, o in get_step_triples(update_graph, uri, step_def, debug)
    ]

    if len(step_uris) == 0:

        # VIVO has no values for intermediate, so add a new intermediate and do_the_update on the leaf

        step_uri = URIRef(new_uri(uri_prefix))
        update_graph.add((uri, step_def['predicate']['ref'], step_uri))
        update_graph.add((step_uri, RDF.type, step_def['object']['type']))
        if 'label' in step_def['object']:
            update_graph.add(
                (step_uri, RDFS.label,
                 Literal(step_def['object']['label'],
                         datatype=step_def['object'].get('datatype', None),
                         lang=step_def['object'].get('lang', None))))
        uri = step_uri
        step_def = column_def[1]
        vivo_objs = {
            unicode(o): o
            for s, p, o in get_step_triples(update_graph, uri, step_def)
        }
        column_values = prepare_column_values(data_update[column_name], intra,
                                              step_def, enum, row, column_name)
        do_the_update(row,
                      column_name,
                      uri,
                      step_def,
                      column_values,
                      vivo_objs,
                      update_graph,
                      debug=debug)

    elif step_def['predicate']['single']:

        # VIVO has 1 or more values, so we need to see if the predicate is expected to be single

        step_uri = step_uris[0]
        if len(step_uris) > 1:
            print "WARNING: Single predicate", column_name, "has", len(step_uris), "values: ", \
                step_uris, "using", step_uri
        uri = step_uri
        step_def = column_def[1]
        vivo_objs = {
            unicode(o): o
            for s, p, o in get_step_triples(update_graph, uri, step_def)
        }
        column_values = prepare_column_values(data_update[column_name], intra,
                                              step_def, enum, row, column_name)
        do_the_update(row,
                      column_name,
                      uri,
                      step_def,
                      column_values,
                      vivo_objs,
                      update_graph,
                      debug=debug)

    else:
        # TODO: Implement set compare through multiple intermediate case -- medium
        print "WARNING: Updating multi-valued multi-step predicates such as ", column_name, " not yet implemented"
    return None
Esempio n. 12
0
    def __do_update(self):
        """
        For each row, process each column.  Compare to data in VIVO.  Generate add and sub
        rdf as necessary to process requested add, change, delete

        Rows are read from self.update_data.  Changes are applied to self.update_graph; merges collected from
        the 'action' column are processed after all rows have been handled.

        :return: two element list [add, sub] -- the triples in the update graph that are not in the original
            graph, and the triples in the original graph that are not in the update graph
        :raises PathLengthException: when a column or closure definition has more than three steps
        """
        from rdflib import URIRef, RDF
        from vivopump import new_uri, prepare_column_values, get_step_triples, PathLengthException

        # Merge bookkeeping, keyed by merge name.  Each entry holds a 'primary' uri (or None) and a list of
        # 'secondary' uris.
        merges = {}

        for row, data_update in self.update_data.items():

            # Create a URI if empty
            logger.debug("data_update[uri] = {}".format(data_update['uri']))

            if data_update['uri'].strip() == '':
                dict_is_empty = True

                # The row counts as blank only if every value in it has length zero
                for item in data_update.values():
                    if len(item) != 0:
                        dict_is_empty = False

                if dict_is_empty:
                    # skip blank lines in the input file
                    continue

                #   If the source uri is empty, create one.  Remaining processing is unchanged.
                #   Since the new uri does not have triples for the columns in the spreadsheet, each will be added

                uri_string = new_uri(self.query_parms)
                logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, uri_string))
                uri = URIRef(uri_string)
                self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type']))

            #   Create a URI entity if not found.  Here the uri supplied in the row is kept; only a type
            #   assertion is added when the update graph has no triples with that uri as subject.

            else:
                uri = URIRef(data_update['uri'].strip())
                if (uri, None, None) not in self.update_graph:
                    logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, str(uri)))
                    self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type']))

            entity_uri = uri
            action = data_update.get('action', '').lower()

            #   Process remove action if any

            if action == 'remove':
                self.__do_remove(row, uri)
                continue

            #   Collect merge info if any.  Every non-empty action other than 'remove' is treated as merge
            #   bookkeeping.  When the action contains the character '1', the text before the first '1' is
            #   the merge key and this row's uri is recorded as a secondary.  Otherwise the whole action is
            #   the merge key and this row's uri is recorded as the primary.

            if action != '':
                k = action.find('1')
                if k > -1:
                    key = action[0:k]
                    if key not in merges:
                        merges[key] = {}
                        merges[key]['primary'] = None
                        merges[key]['secondary'] = [uri]
                    else:
                        merges[key]['secondary'].append(uri)
                else:
                    if action not in merges:
                        merges[action] = {}
                    merges[action]['primary'] = uri
                    if 'secondary' not in merges[action]:
                        merges[action]['secondary'] = []

            #   For this row, process all the column_defs and then process closure defs if any.  Closures allow
            #   columns to be "reused" providing additional paths from the row entity to entities in the paths.
            #   The two items() results are concatenated with +, which relies on items() returning lists.

            for column_name, column_def in self.update_def['column_defs'].items() + \
                    self.update_def.get('closure_defs', {}).items():
                if column_name not in data_update:
                    continue  # extra column names are allowed in the spreadsheet for annotation
                uri = entity_uri

                if data_update[column_name] == '':
                    logger.debug(u"Skipping blank value. row {} column {}".format(row, column_name))
                    continue

                column_def_len = len(column_def)
                logger.debug("column_def length is: {}".format(column_def_len))

                #   Dispatch on the number of steps in the path from the row entity to the column value

                if column_def_len > 3:
                    raise PathLengthException(
                        "ERROR: Path lengths > 3 not supported.  Path length for {} is {}"
                        .format(column_name, column_def_len))
                elif column_def_len == 3:
                    self.__do_three_step_update(row, column_name, uri, column_def, data_update)
                elif column_def_len == 2:
                    self.__do_two_step_update(row, column_name, uri, column_def, data_update)
                elif column_def_len == 1:
                    step_def = column_def[0]
                    #   Map the unicode form of each existing object to the object itself
                    vivo_objs = {unicode(o): o for s, p, o in
                                 get_step_triples(self.update_graph, uri, step_def, self.query_parms)}
                    column_values = prepare_column_values(data_update[column_name], self.intra, step_def, self.enum,
                                                          row, column_name)
                    logger.debug(u"{} {} {} {} {}".format(row, column_name, column_values, uri, vivo_objs))
                    self.__do_the_update(row, column_name, uri, step_def, column_values, vivo_objs)

        #   NOTE(review): any() over a dict tests the truthiness of its keys, so merges are processed only
        #   when at least one merge key is non-empty -- confirm that is the intent

        if any(merges):
            self.__do_merges(merges)

        #   Return the add and sub graphs representing the changes that need to be made to the original

        add = self.update_graph - self.original_graph  # Triples in update that are not in original
        logger.info(u"Triples to add\n{}".format(add.serialize(format='nt')))
        sub = self.original_graph - self.update_graph  # Triples in original that are not in update
        logger.info(u"Triples to sub\n{}".format(sub.serialize(format='nt')))
        return [add, sub]
Esempio n. 13
0
    def do_update(self):
        """
        read updates from a spreadsheet filename.  Compare to data in VIVO.  Generate add and sub
        rdf as necessary to process requested changes
        """
        from rdflib import URIRef, RDF
        from vivopump import new_uri

        for row, data_update in self.update_data.items():
            uri = URIRef(data_update['uri'])

            if 'remove' in data_update.keys() and data_update['remove'].lower() == 'true':
                do_remove(row, uri, self.update_graph, self.verbose)
                continue

            if (uri, None, None) not in self.update_graph:

                # If the entity uri can not be found in the update graph, make a new URI ignoring the one in the
                # spreadsheet, if any, and add the URI to the update graph.  Remaining processing is unchanged.
                # Since the new uri does not have triples for the columns in the spreadsheet, each will be added

                uri_string = new_uri(self.uri_prefix)
                if self.verbose:
                    print "Adding an entity for row", row, ".  Will be added at", uri_string
                uri = URIRef(uri_string)
                self.update_graph.add((uri, RDF.type, self.update_def['entity_def']['type']))
            entity_uri = uri

            for column_name, column_def in self.update_def['column_defs'].items():
                if column_name not in data_update:
                    continue  # extra column names are allowed in the spreadsheet for annotation
                uri = entity_uri

                if data_update[column_name] == '':
                    continue


                if len(column_def) > 3:
                    raise PathLengthException(
                        "Path lengths > 3 not supported.  Path length for " + column_name + " is " + str(
                            len(column_def)))
                elif len(column_def) == 3:
                    do_three_step_update(row, column_name, uri, self.uri_prefix, column_def, data_update, self.intra,
                                         self.enum, self.update_graph, debug=False)
                elif len(column_def) == 2:
                    do_two_step_update(row, column_name, uri, self.uri_prefix, column_def, data_update, self.intra,
                                       self.enum, self.update_graph,
                                       debug=False)
                elif len(column_def) == 1:
                    step_def = column_def[0]
                    vivo_objs = {}
                    for s, p, o in self.update_graph.triples((uri, step_def['predicate']['ref'], None)):
                        vivo_objs[unicode(o)] = o
                    column_values = prepare_column_values(data_update[column_name], self.intra, step_def, self.enum,
                                                          row, column_name)
                    if self.verbose:
                        print row, column_name, column_values, uri, vivo_objs
                    do_the_update(row, column_name, uri, step_def, column_values, vivo_objs, self.update_graph,
                                  debug=self.verbose)

        # Return the add and sub graphs representing the changes that need to be made to the original

        add = self.update_graph - self.original_graph  # Triples in update that are not in original
        if self.verbose:
            print "Triples to add"
            print add.serialize(format='nt')
        sub = self.original_graph - self.update_graph  # Triples in original that are not in update
        if self.verbose:
            print "Triples to sub"
            print sub.serialize(format='nt')
        return [add, sub]
Esempio n. 14
0
 def test_parameters(self):
     # new_uri called with no arguments must hand back something non-empty
     generated = new_uri()
     self.assertTrue(len(generated) > 0)
Esempio n. 15
0
 def test_new_uri_prefix(self):
     # The generated uri must begin with the host of the prefix that was supplied
     generated = new_uri(uri_prefix='http://my.vivo.edu/individual/')
     print(generated)
     self.assertTrue(generated.startswith('http://my.vivo.edu'))
Esempio n. 16
0
    def __do_update(self):
        """
        For each row, process each column.  Compare to data in VIVO.  Alter the update graph as necessary to
        process requested add, change, delete and merge, then report the difference from the original graph.

        Row handling:
            - a row with an empty uri is given a newly minted uri and a type assertion
            - a row whose uri has no triples in the update graph is given a type assertion
            - an action of 'remove' hands the row to __do_remove and skips its columns
            - any other non-empty action is recorded as merge information.  An action containing '1' names
              a secondary of the merge keyed by the text before the '1'; otherwise the row is the primary
              of the merge keyed by the action itself

        :return: list [add, sub].  add holds the triples in the update graph that are not in the original graph,
            sub holds the triples in the original graph that are not in the update graph
        :raises PathLengthException: if a column def has more than three steps
        """
        from rdflib import URIRef, RDF
        from vivopump import new_uri, prepare_column_values, PathLengthException

        merges = {}

        for row, data_update in self.update_data.items():

            source_uri = data_update['uri'].strip()
            entity_type = self.update_def['entity_def']['type']

            if source_uri == '':

                #   If the source uri is empty, create one.  Remaining processing is unchanged.
                #   Since the new uri does not have triples for the columns in the spreadsheet, each will be added

                uri_string = new_uri(self.query_parms)
                logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, uri_string))
                uri = URIRef(uri_string)
                self.update_graph.add((uri, RDF.type, entity_type))
            else:

                #   Create a URI entity if not found

                uri = URIRef(source_uri)
                if (uri, None, None) not in self.update_graph:
                    logger.debug(u"Adding an entity for row {}. Will be added at {}".format(row, str(uri)))
                    self.update_graph.add((uri, RDF.type, entity_type))

            self.entity_uri = uri
            action = data_update.get('action', '').lower()

            #   Process remove action if any

            if action == 'remove':
                self.__do_remove(row, uri)
                continue

            #   Collect merge info if any

            if action != '':
                k = action.find('1')
                if k > -1:
                    key = action[0:k]
                    if key not in merges:
                        merges[key] = {'primary': None, 'secondary': [uri]}
                    else:
                        merges[key]['secondary'].append(uri)
                else:
                    merge = merges.setdefault(action, {})
                    merge['primary'] = uri
                    merge.setdefault('secondary', [])

            #   For this row, process all the column_defs and then process closure defs if any.  Closures allow
            #   columns to be "reused" providing additional paths from the row entity to entities in the paths.

            path_defs = list(self.update_def['column_defs'].items()) + \
                list(self.update_def.get('closure_defs', {}).items())

            for column_name, column_def in path_defs:

                #   Skip any columns in the update_def that are not in the data

                if column_name not in data_update:
                    continue

                #   Skip the column if it is empty

                if data_update[column_name] == '':
                    logger.debug(u"Skipping blank value. row {} column {}".format(row, column_name))
                    continue

                #   Process the column values, returning a list of RDF elements

                last_def = column_def[-1]
                column_values = prepare_column_values(data_update[column_name], self.intra, last_def, self.enum, row,
                                                      column_name)

                #   Process the path depending on its length.  Some day we will refactor this to a recursion

                path_length = len(column_def)
                if path_length > 3:
                    raise PathLengthException(
                        "ERROR: Path lengths > 3 not supported.  Path length for " + column_name + " is " +
                        str(path_length))
                elif path_length == 3:
                    self.__do_three_step_update(row, column_name, self.entity_uri, column_def, data_update)
                elif path_length == 2:
                    self.__do_two_step_update(row, column_name, self.entity_uri, column_def, data_update)
                elif path_length == 1:
                    vivo_objs = {unicode(o): o for s, p, o in
                                 self._get_step_triples(self.entity_uri, last_def)}
                    logger.debug(u"{} {} {} {} {}".format(row, column_name, column_values, self.entity_uri, vivo_objs))
                    self.__do_the_update(row, column_name, self.entity_uri, last_def, column_values, vivo_objs)

        #   Test the dict itself.  any(merges) would test the truthiness of the merge keys and so would
        #   silently drop a merge group whose key is the empty string.

        if merges:
            self.__do_merges(merges)

        #   Return the add and sub graphs representing the changes that need to be made to the original

        add = self.update_graph - self.original_graph  # Triples in update that are not in original
        logger.info(u"Triples to add\n{}".format(add.serialize(format='nt')))
        sub = self.original_graph - self.update_graph  # Triples in original that are not in update
        logger.info(u"Triples to sub\n{}".format(sub.serialize(format='nt')))
        return [add, sub]
Esempio n. 17
0
    def __do_two_step_update(self, row, column_name, uri, column_def, data_update):
        """
        In a two step update, identify intermediate entity that might need to be created, and end path objects that
        might not yet exist or might need to be created.  Cases are:

                              Predicate Single   Predicate Multiple
        VIVO has 0 values     Add, do_the        Add intermediate, do_the
        VIVO has 1 value         do_the          Set compare through intermediate
        VIVO has >1 value     WARNING, do_the    Set compare through intermediate

        The leaf values found in the update graph through the intermediates are set compared with the column
        values.  Values found only in the column are added, values found only in the graph are removed.  If the
        first column value is "None" (any case), every leaf value found in the graph is removed.

        :param: row: current row in spreadsheet
        :param: column_name: name of current column in spreadsheet
        :param: uri: uri in VIVO of the current entity
        :param: column_def: the column def for the current column.  Only its first two steps are used
        :param: data_update: the data for the row.  The value for the column is data_update[column_name]
        :return: None.  The update graph is altered in place
        """
        from rdflib import RDF, RDFS, Literal, URIRef
        from vivopump import new_uri, get_step_triples, prepare_column_values

        step_def = column_def[0]
        leaf_def = column_def[1]

        #   The comparison is deliberately "== False" and not "not ...": only an explicit False (or 0) marks the
        #   first step as multiple valued

        step_is_multiple = step_def['predicate']['single'] == False  # noqa: E712

        def add_intermediate():
            """
            Mint an intermediate entity, link it from uri, give it the type and label of the step def
            (when the step def has them), and return its URIRef
            """
            new_step_uri = URIRef(new_uri(self.query_parms))
            self.update_graph.add((uri, step_def['predicate']['ref'], new_step_uri))
            step_object = step_def['object']
            if 'type' in step_object:
                self.update_graph.add((new_step_uri, RDF.type, step_object['type']))
            if 'label' in step_object:
                self.update_graph.add((new_step_uri, RDFS.label,
                                       Literal(step_object['label'],
                                               datatype=step_object.get('datatype', None),
                                               lang=step_object.get('lang', None))))
            return new_step_uri

        #   Find the intermediates of the entity, and index each leaf found through them by its unicode form.
        #   Each index entry is [leaf, intermediate the leaf was found on]

        step_uris = [o for s, p, o in
                     get_step_triples(self.update_graph, uri, step_def, self.query_parms)]
        vivo_objs = {}
        for step_uri in step_uris:
            for s, p, o in get_step_triples(self.update_graph, step_uri, leaf_def, self.query_parms):
                vivo_objs[unicode(o)] = [o, step_uri]

        #   Nasty hack below.  The predicate property "single" appears to have two meanings.  One has to do
        #   with the semantic graph and one has to do with the cardinality of the data column.  These are not
        #   the same.  When the first step is multiple and the second single, the "second single" is not the
        #   cardinality of the data column.  The cardinality of the data column is multiple if any of the
        #   predicates in the path are multiple.  Here we set the cardinality of the leaf to be used by
        #   prepare_column_values and then set it back.  Nasty.  Create a property for leaf cardinality.
        #   The try/finally makes sure the shared column def is restored even if prepare_column_values raises.

        leaf_predicate = leaf_def['predicate']
        predicate2_cardinality = leaf_predicate['single']
        if step_is_multiple:
            leaf_predicate['single'] = False
        try:
            column_values = prepare_column_values(data_update[column_name], self.intra, leaf_def, self.enum, row,
                                                  column_name)
        finally:
            leaf_predicate['single'] = predicate2_cardinality

        vivo_values = [entry[0] for entry in vivo_objs.values()]
        if unicode(column_values[0]).lower() == 'none':
            add_values = set()
            sub_values = set(vivo_values)
        else:
            add_values = set(column_values) - set(vivo_values)
            sub_values = set(vivo_values) - set(column_values)

            #   The two literals are joined before format is applied.  Joining them with "+" applied format to
            #   the second literal only, leaving the first four fields unfilled and the rest mislabeled.

            logger.debug((u"Two step SET COMPARE\n\tRow {}\n\tColumn {}\n\tSource values {}\n\tVIVO values {}"
                          u"\n\tAdd values {}\n\tSub values {}\n\tStep_uris {}").
                         format(row, column_name, column_values, vivo_values, add_values, sub_values, step_uris))

        #   Process the adds

        if len(add_values) > 0:
            if step_is_multiple:

                #   Multiple intermediaries, single valued-leaves.  Each new leaf gets its own intermediate

                for leaf_value in add_values:
                    step_uri = add_intermediate()
                    self.__do_the_update(row, column_name, step_uri, leaf_def, [leaf_value], {})
            else:

                #   Multiple values on the single leaf.  Use the first existing intermediate, or make one

                step_uri = step_uris[0] if len(step_uris) > 0 else add_intermediate()
                self.__do_the_update(row, column_name, step_uri, leaf_def, column_values, {})

        #   Process the subs

        if len(sub_values) > 0:
            if step_is_multiple:

                #   Handle multiple intermediaries, single leaves, by removing each intermediary and all its
                #   assertions

                for leaf_value in sub_values:
                    step_uri = vivo_objs[unicode(leaf_value)][1]
                    self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                    self.update_graph.remove((step_uri, None, None))
            else:

                #   Handle single intermediary, possibly multiple leaves, by removing each leaf from the intermediary
                #   Then check to see if the intermediary has any remaining leaf assertions and remove if empty

                step_uri = vivo_objs[unicode(next(iter(sub_values)))][1]
                for leaf_value in sub_values:
                    self.update_graph.remove((step_uri, None, leaf_value))

                #   triples() is consumed as an iterator, so emptiness is tested by asking for a first triple.
                #   Comparing its result with set() was never true and the intermediate was never removed.

                remaining = self.update_graph.triples((step_uri, leaf_def['predicate']['ref'], None))
                if next(iter(remaining), None) is None:
                    self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                    self.update_graph.remove((step_uri, None, None))

        return None
Esempio n. 18
0
    def __do_two_step_update(self, row, column_name, uri, column_def, data_update):
        """
        In a two step update, identify intermediate entity that might need to be created, and end path objects that
        might not yet exist or might need to be created.  Cases are:

                              Predicate Single   Predicate Multiple
        VIVO has 0 values     Add, do_the        Add intermediate, do_the
        VIVO has 1 value         do_the          Set compare through intermediate
        VIVO has >1 value     WARNING, do_the    Set compare through intermediate

        The leaf values found in the update graph through the intermediates are set compared with the column
        values.  Values found only in the column are added, values found only in the graph are removed.  If the
        first column value is "None" (any case), every leaf value found in the graph is removed.

        :param: row: current row in spreadsheet
        :param: column_name: name of current column in spreadsheet
        :param: uri: uri in VIVO of the current entity
        :param: column_def: the column def for the current column.  Only its first two steps are used
        :param: data_update: the data for the row.  The value for the column is data_update[column_name]
        :return: None.  The update graph is altered in place
        """
        from rdflib import RDF, RDFS, Literal, URIRef
        from vivopump import new_uri, prepare_column_values

        step_def = column_def[0]
        leaf_def = column_def[1]

        #   The comparison is deliberately "== False" and not "not ...": only an explicit False (or 0) marks the
        #   first step as multiple valued

        step_is_multiple = step_def['predicate']['single'] == False  # noqa: E712

        def add_intermediate():
            """
            Mint an intermediate entity, link it from uri, give it the type and label of the step def
            (when the step def has them), and return its URIRef
            """
            new_step_uri = URIRef(new_uri(self.query_parms))
            self.update_graph.add((uri, step_def['predicate']['ref'], new_step_uri))
            step_object = step_def['object']
            if 'type' in step_object:
                self.update_graph.add((new_step_uri, RDF.type, step_object['type']))
            if 'label' in step_object:
                self.update_graph.add((new_step_uri, RDFS.label,
                                       Literal(step_object['label'],
                                               datatype=step_object.get('datatype', None),
                                               lang=step_object.get('lang', None))))
            return new_step_uri

        #   Find the intermediates of the entity, and index each leaf found through them by its unicode form.
        #   Each index entry is [leaf, intermediate the leaf was found on]

        step_uris = [o for s, p, o in self._get_step_triples(uri, step_def)]
        vivo_objs = {}
        for step_uri in step_uris:
            for s, p, o in self._get_step_triples(step_uri, leaf_def):
                vivo_objs[unicode(o)] = [o, step_uri]

        #   Nasty hack below.  The predicate property "single" appears to have two meanings.  One has to do
        #   with the semantic graph and one has to do with the cardinality of the data column.  These are not
        #   the same.  When the first step is multiple and the second single, the "second single" is not the
        #   cardinality of the data column.  The cardinality of the data column is multiple if any of the
        #   predicates in the path are multiple.  Here we set the cardinality of the leaf to be used by
        #   prepare_column_values and then set it back.  Nasty.  Create a property for leaf cardinality.
        #   The try/finally makes sure the shared column def is restored even if prepare_column_values raises.

        leaf_predicate = leaf_def['predicate']
        predicate2_cardinality = leaf_predicate['single']
        if step_is_multiple:
            leaf_predicate['single'] = False
        try:
            column_values = prepare_column_values(data_update[column_name], self.intra, leaf_def, self.enum, row,
                                                  column_name)
        finally:
            leaf_predicate['single'] = predicate2_cardinality

        vivo_values = [entry[0] for entry in vivo_objs.values()]
        if unicode(column_values[0]).lower() == 'none':
            add_values = set()
            sub_values = set(vivo_values)
        else:
            add_values = set(column_values) - set(vivo_values)
            sub_values = set(vivo_values) - set(column_values)

            #   The two literals are joined before format is applied.  Joining them with "+" applied format to
            #   the second literal only, leaving the first four fields unfilled and the rest mislabeled.

            logger.debug((u"Two step SET COMPARE\n\tRow {}\n\tColumn {}\n\tSource values {}\n\tVIVO values {}"
                          u"\n\tAdd values {}\n\tSub values {}\n\tStep_uris {}").
                         format(row, column_name, column_values, vivo_values, add_values, sub_values, step_uris))

        #   Process the adds

        if len(add_values) > 0:
            if step_is_multiple:

                #   Multiple intermediaries, single valued-leaves.  Each new leaf gets its own intermediate

                for leaf_value in add_values:
                    step_uri = add_intermediate()
                    self.__do_the_update(row, column_name, step_uri, leaf_def, [leaf_value], {})
            else:

                #   Multiple values on the single leaf.  Use the first existing intermediate, or make one

                step_uri = step_uris[0] if len(step_uris) > 0 else add_intermediate()
                self.__do_the_update(row, column_name, step_uri, leaf_def, column_values, {})

        #   Process the subs

        if len(sub_values) > 0:
            if step_is_multiple:

                #   Handle multiple intermediaries, single leaves, by removing each intermediary and all its
                #   assertions

                for leaf_value in sub_values:
                    step_uri = vivo_objs[unicode(leaf_value)][1]
                    self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                    self.update_graph.remove((step_uri, None, None))
            else:

                #   Handle single intermediary, possibly multiple leaves, by removing each leaf from the intermediary
                #   Then check to see if the intermediary has any remaining leaf assertions and remove if empty

                step_uri = vivo_objs[unicode(next(iter(sub_values)))][1]
                for leaf_value in sub_values:
                    self.update_graph.remove((step_uri, None, leaf_value))

                #   triples() is consumed as an iterator, so emptiness is tested by asking for a first triple.
                #   Comparing its result with set() was never true and the intermediate was never removed.

                remaining = self.update_graph.triples((step_uri, leaf_def['predicate']['ref'], None))
                if next(iter(remaining), None) is None:
                    self.update_graph.remove((uri, step_def['predicate']['ref'], step_uri))
                    self.update_graph.remove((step_uri, None, None))

        return None
Esempio n. 19
0
 def test_new_uri_default(self):
     # With default arguments new_uri must hand back something non-empty; the value is echoed for inspection
     generated = new_uri()
     print(generated)
     self.assertTrue(len(generated) > 0)