Example #1
def align_ntriples(ntriples_file_to_align,
                   alignment_file,
                   predicates_to_align=[rdfs_label],
                   alignment_uri="http://example.org/aligned",
                   number_of_words=5):

    print("Generating file to align into memory")
    ff = open(ntriples_file_to_align)
    triples_to_fragment = pyTripleSimple.SimpleTripleStore()
    triples_to_fragment.load_ntriples(ff)

    ff.close()

    print("Generating fragments to align")

    fte_obj = FreeTextExpanderTripleStore(triples_to_fragment,
                                          predicates_to_align)
    fte_obj.generate(number_of_words)
    fragment_files = fte_obj.write_out_to_ntriples()

    print("Reading alignment file into memory")

    alignment_obj = pyTripleSimple.SimpleTripleStore()
    fa = open(alignment_file)
    alignment_obj.load_ntriples(fa)
    fa.close()

    aligned_obj = pyTripleSimple.SimpleTripleStore()

    for fragment_file in fragment_files:
        faf = open(fragment_file)
        fragments_obj = pyTripleSimple.SimpleTripleStore()
        print("Loading fragments to align into memory '%s'" % fragment_file)
        fragments_obj.load_ntriples(faf)
        faf.close()
        for fragment in fragments_obj.iterator_triples():
            result = alignment_obj.simple_pattern_match(
                [("s", "p", "o")], [("o", "in", [fragment.object])], ["s"])

            if result:
                aligned_obj.load_ntriples([
                    "<%s> <%s> %s ." %
                    (fragment.subject, alignment_uri, result[0][0][0])
                ])

    ntriples_aligned_file = ntriples_file_to_align + ".alignment.nt"
    print("Exporting aligned file")

    fo = open(ntriples_aligned_file, "w")
    aligned_obj.export_to_ntriples_file(fo)
    fo.close()
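
A minimal way to drive the function above, assuming `rdfs_label` is bound to the rdfs:label predicate string used elsewhere in these examples and that both N-Triples files exist; the file names here are hypothetical:

rdfs_label = "<http://www.w3.org/2000/01/rdf-schema#label>"  # assumed value, mirrors the other examples
align_ntriples("source_data.nt",          # hypothetical file to align
               "reference_data.nt",       # hypothetical reference file
               predicates_to_align=[rdfs_label],
               alignment_uri="http://example.org/aligned",
               number_of_words=5)
# The aligned triples are written to "source_data.nt.alignment.nt".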
Example #2
def main(ntriples_file_name, free_text_predicates=None):
    f = open(ntriples_file_name, "r")

    ts = pyTripleSimple.SimpleTripleStore()  # alternatively: pyTripleSimple.ShelveTripleEngine(ntriples_file_name)

    print('Loading "%s"' % os.path.abspath(ntriples_file_name))
    start_time = time.clock()
    ts.load_ntriples(f)
    end_time = time.clock()
    print("Finished loading ntriples file")
    #print("Number of triples %s loaded in %s seconds (%s triples/second)" % (number_of_triples, end_time - start_time,(number_of_triples * 1.0)/ (end_time - start_time)))

    if free_text_predicates is not None:
        ft = FreeTextSimpleTripleStore(
            ts, predicates_to_index=free_text_predicates)
    else:
        ft = FreeTextSimpleTripleStore(ts)

    ft.generate()
    file_names = ft.write_out_to_ntriples(ntriples_file_name + ".")

    print("Generated free text triples '%s'" % ntriples_file_name)
    for file_name in file_names:
        print("Wrote '%s'" % file_name)

    return file_names
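
A sketch of calling this entry point directly; the file name is hypothetical and the predicate string form is assumed from the other examples:

fragment_files = main("catalog.nt",  # hypothetical input file
                      free_text_predicates=["<http://www.w3.org/2000/01/rdf-schema#label>"])
for name in fragment_files:
    print("Free-text fragment file '%s'" % name)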
Example #3
    def test_find_triples(self):
        ts = pyTripleSimple.SimpleTripleStore()
        ts.load_ntriples(self.test_source)

        r1 = ts.find_triples(subjects="<http://example.org/resource999>")
        self.assertEquals(set([]),r1,"Should return an empty set")

        r2 = ts.find_triples(subjects="<http://example.org/resource9>")
        self.assertEquals(1,len(r2))

        r3 = ts.find_triples(predicates="http://example.org/property")
        self.assertEquals(30,len(r3))

        r4 = ts.find_triples(objects="<http://example.org/resource2>")
        self.assertEquals(7,len(r4))

        r5 = ts.find_triples(literals="chat")
        self.assertEquals(3,len(r5))

        r6 = ts.find_triples(subjects=['<http://example.org/resource26>','http://example.org/resource25'])
        self.assertEquals(3,len(r6))

        r7 = ts.find_triples(subjects=['<http://example.org/resource26>','http://example.org/resource25'], predicates="<http://example.org/property>")
        self.assertEquals(3,len(r7))

        r8 = ts.find_triples(['<http://example.org/resource26>','http://example.org/resource25'], predicates="<http://example.org/propertyX>")
        self.assertEquals(0,len(r8))

        r9 = ts.find_triples("<http://example.org/resource14>", "<http://example.org/property>", literals="x")
        self.assertEquals(1,len(r9))

        r10 = ts.find_triples("<http://example.org/resource14>", "<http://example.org/property>", objects="x")
        self.assertEquals(0,len(r10))
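
Outside the test harness, the same find_triples calls can be made against a store loaded from a list of N-Triples strings (as Example #5 does); a minimal sketch with made-up data:

ts = pyTripleSimple.SimpleTripleStore()
ts.load_ntriples(['<http://example.org/resource1> <http://example.org/property> "chat" .'])
# Criteria can be combined; anything that does not match yields an empty result.
r = ts.find_triples(subjects="<http://example.org/resource1>",
                    predicates="<http://example.org/property>",
                    literals="chat")
print(len(r))  # 1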
Example #4
    def test_simple_pattern_match(self):
        ts = pyTripleSimple.SimpleTripleStore()
        f = open("acme.nt","r")
        ts.load_ntriples(f)

        r1 = ts.simple_pattern_match([("a","p","b")],[("p","in",["<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"])],("b"))
        self.assertEquals(5,len(r1))

        r2 = ts.simple_pattern_match([("a","p","b")],[("p","in",["<http://example.org/predicateDoesNotExist>"])],("b"))
        self.assertEquals(0,len(r2))

        r3 = ts.simple_pattern_match([("a","p","b"),("a","r","ca"),("b","r","cb")],[("r","in",["<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"])],("p","ca","cb"))
        self.assertEquals(5,len(r3))

        r4 = ts.simple_pattern_match([("a","p","b"),("a","r","ca"),("b","r","cb")],[("p", "!=", "r"),("r","in",["<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"])],("p","ca","cb"))
        self.assertEquals(2,len(r4))

        r5 = ts.simple_pattern_match([('a','p','b')], [],['a','p','b'])
        self.assertEquals(57,len(r5))

        r6 = ts.simple_pattern_match([('a','p','b')], [('p','in',['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'])],['a','p','b'])
        self.assertEquals(14,len(r6))

        r7 = ts.simple_pattern_match([('a','p','b')], [('p','not in',['<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'])],['a','p','b'])
        self.assertEquals(57 - 14,len(r7))

        r8 = ts.simple_pattern_match([('a','p','b')], [('b','in',['Hollywood'])],['a','p','b'])
        self.assertEquals(2,len(r8))

        r9 = ts.simple_pattern_match([('a','p','b')], [('b','not in',['Hollywood'])],['a','p','b'])
        self.assertEquals(57 - 2,len(r9))

        r10 = ts.simple_pattern_match([('s','p','o')],[],[pyTripleSimple.is_literal('o')])
        self.assertEquals(2,len(r10))
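
Every call above follows the same shape: a list of triple patterns over free variables, a list of restrictions ("in", "not in", "!="), and the solution variables to project. A small sketch against made-up data:

ts = pyTripleSimple.SimpleTripleStore()
ts.load_ntriples([
    '<http://example.org/a> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Widget> .',
    '<http://example.org/b> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Widget> .'])
# Bind p to rdf:type and project the class variable; each solution carries a count.
results = ts.simple_pattern_match(
    [("a", "p", "c")],
    [("p", "in", ["<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"])],
    ["c"])
print(results)  # e.g. [(('<http://example.org/Widget>',), 2)]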
Example #5
    def test_EmbeddedQuote(self):
        ts = pyTripleSimple.SimpleTripleStore()
        ts.load_ntriples([r'<http://link.informatics.stonybrook.edu/umls/SUI/S7018982> <http://www.w3.org/2000/01/rdf-schema#label> "Instruct parent/caregiver of infant 0-3 months to limit water intake to Z\\\"x oz to 1 oz at a time, 4 oz daily" .',
                          r'<http://link.informatics.stonybrook.edu/umls/SUI/S11718919> <http://www.w3.org/2000/01/rdf-schema#label> "Vetscription Worm Away 7 Praziquantel 30 MG \\\" Pyrantel Pamoate 30 MG Chewable Tablet" .'])
        triples = list(ts.iterator_triples())
        self.assertEquals(triples[0].object,r"Instruct parent/caregiver of infant 0-3 months to limit water intake to Z\\\"x oz to 1 oz at a time, 4 oz daily")
        self.assertEquals(triples[1].object,r"Vetscription Worm Away 7 Praziquantel 30 MG \\\" Pyrantel Pamoate 30 MG Chewable Tablet")
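
The assertions above show that the parsed object keeps its escape sequences rather than unescaping them; a minimal sketch of the same round trip with a made-up triple:

ts = pyTripleSimple.SimpleTripleStore()
ts.load_ntriples([r'<http://example.org/s> <http://www.w3.org/2000/01/rdf-schema#label> "a \"quoted\" label" .'])
triple = list(ts.iterator_triples())[0]
print(triple.object)  # the literal comes back with its backslash escapes intact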
Example #6
    def test_generate_graphml(self):

        self.ts = pyTripleSimple.SimpleTripleStore()
        f = open("acme.nt")
        self.ts.load_ntriples(f)
        f.close()
        egfrsts_obj = pyTripleSimple.ExtractGraphFromSimpleTripleStore(self.ts)
        egfrsts_obj.register_label()
        egfrsts_obj.register_class()
        egfrsts_obj.add_pattern_for_links([['a','b','c']],[('b','in',['<http://acme.com/rdf#isLabeller>'])],("a","c"), "labeller")
        egfrsts_obj.register_node_predicate("<http://acme.com/rdf#ndc/date_issued>", "date", lambda x : x.upper())
        result_xml = egfrsts_obj.translate_into_graphml_file()

        from xml.etree.ElementTree import XML
        elements = XML(result_xml)
        xml_tags = []

        for element in elements:
            xml_tags.append(element.tag)
        self.assertTrue("{http://graphml.graphdrawing.org/xmlns}key" in xml_tags)
        self.assertTrue("{http://graphml.graphdrawing.org/xmlns}graph" in xml_tags)

        try:
            import networkx
            fo = open("acme.graphml","w")
            fo.write(result_xml)
            fo.close()
            networkx.read_graphml("acme.graphml")
        except ImportError:
            pass
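
A trimmed sketch of the same flow outside the test: build the extractor, register the decorations, and write the GraphML string out (the output file name here is hypothetical):

ts = pyTripleSimple.SimpleTripleStore()
with open("acme.nt") as f:
    ts.load_ntriples(f)
extractor = pyTripleSimple.ExtractGraphFromSimpleTripleStore(ts)
extractor.register_label()
extractor.register_class()
extractor.add_pattern_for_links([["a", "b", "c"]],
                                [("b", "in", ["<http://acme.com/rdf#isLabeller>"])],
                                ("a", "c"), "labeller")
result_xml = extractor.translate_into_graphml_file()
with open("acme_links.graphml", "w") as fo:  # hypothetical output name
    fo.write(result_xml)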
Example #7
    def test_TripleIterator(self):
        ts = pyTripleSimple.SimpleTripleStore()
        ts.load_ntriples(self.test_source)
        result1 = list(ts.iterator_triples())

        self.assertEquals(30,len(result1),"Wrong number of triples iterated")
        result2 = list(ts.iterator_ntriples())
        self.assertEquals(30,len(result2),"Wrong number of triples iterated")
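
A minimal sketch contrasting the two iterators; the exact item type yielded by iterator_ntriples() is assumed here to be a serialized N-Triples line:

ts = pyTripleSimple.SimpleTripleStore()
ts.load_ntriples(['<http://example.org/s> <http://example.org/p> <http://example.org/o> .'])
parsed = list(ts.iterator_triples())       # triple objects exposing .subject / .object
serialized = list(ts.iterator_ntriples())  # assumed: one N-Triples string per triple
print(len(parsed), len(serialized))        # 1 1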
Example #8
    def __init__(self, triple_simple_store, predicates_to_index=[rdfs_label]):
        self.predicate_for_word = "http://vivoweb.org/ontology/core#freetextKeyword"
        self.triple_simple_store = triple_simple_store
        self.predicates_to_index = predicates_to_index
        self.lexer = FreeTextLexer()
        self.predicates_triple_store = {}

        for predicate_to_index in self.predicates_to_index:
            self.predicates_triple_store[
                predicate_to_index] = pyTripleSimple.SimpleTripleStore()
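
Assuming this constructor belongs to the FreeTextSimpleTripleStore driven in Example #2, a sketch of building it around an existing store with a custom predicate list (file name hypothetical, predicate string form assumed from the other examples):

ts = pyTripleSimple.SimpleTripleStore()
with open("catalog.nt") as f:  # hypothetical input file
    ts.load_ntriples(f)
ft = FreeTextSimpleTripleStore(ts,
                               predicates_to_index=["<http://www.w3.org/2000/01/rdf-schema#label>"])
ft.generate()
fragment_files = ft.write_out_to_ntriples("catalog.nt.")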
Example #9
def main():
    parser = OptionParser(usage="usage: %prog [options]", version="%prog 1.0")
    parser.add_option("-f",
                      "--file",
                      action="store",
                      dest="file_name",
                      default=False,
                      help="ntriples file to read in")

    parser.add_option("-c",
                      "--command",
                      action="store",
                      dest="command",
                      default="statistics",
                      help="Supported commands are: 'statistics' and 'query'")

    parser.add_option("-q",
                      "--query",
                      action="store",
                      dest="query",
                      default=False,
                      help="Specify the pattern to match the solution for")

    parser.add_option("-r",
                      "--restrictions",
                      action="store",
                      dest="restrictions",
                      default=None,
                      help="Specify restrictions on the solution")

    parser.add_option("-v",
                      "--variables",
                      action="store",
                      dest="variables",
                      default=False,
                      help="Specify the variables for output")

    parser.add_option("-n",
                      "--limit",
                      action="store",
                      dest="display_n",
                      default="50",
                      help="Limit the number of results")

    parser.add_option(
        "-o",
        "--output-format",
        action="store",
        dest="output_format",
        default="stdout",
        help="Query output format: stdout, json, delimited, ntriples")

    parser.add_option("-w",
                      "--output-file",
                      action="store",
                      dest="output_file_name",
                      default=0,
                      help="Send results to named file")

    parser.add_option("--header",
                      action="store",
                      dest="header",
                      default=1,
                      help="For table output add a header row")

    parser.add_option("--delimiter",
                      action="store",
                      dest="delimiter",
                      default="\t",
                      help="Delimiter to use in table output")

    parser.add_option("--clean",
                      action="store",
                      dest="clean",
                      default=0,
                      help="Strips string and <> uri designations")

    (options, args) = parser.parse_args()

    ts = pyTripleSimple.SimpleTripleStore()  # alternatively: pyTripleSimple.ShelveTripleEngine(ntriples_file_name)

    if options.display_n == "All":
        display_n = None
    else:
        display_n = int(options.display_n)

    file_name = options.file_name
    if file_name:
        try:
            f = open(file_name, "r")
        except IOError:
            raise

        if options.output_file_name:
            try:
                fo = open(options.output_file_name, 'w')
            except IOError:
                raise

        else:
            fo = StringIO.StringIO()

        if options.command == "statistics":

            fo.write('Loading "%s"\n' % os.path.abspath(file_name))
            start_time = time.clock()
            ts.load_ntriples(f)
            end_time = time.clock()
            fo.write("Finished loading ntriples file\n")

            number_of_triples = ts.n_triples()
            Nt = number_of_triples

            fo.write(
                "Number of triples %s loaded in %s seconds (%s triples/second)\n"
                % (number_of_triples, end_time - start_time,
                   (number_of_triples * 1.0) / (end_time - start_time)))

            object_breakdown = ts.simple_pattern_match(
                [("s", "p", "o")], [], [pyTripleSimple.is_literal("o")])

            number_of_literals = 0
            for result in object_breakdown:
                if result[0][0] == '"1"':
                    number_of_literals = result[1]
            Nl = number_of_literals

            number_of_objects = number_of_triples - number_of_literals
            No = number_of_objects

            rdf_type_breakdown = ts.simple_pattern_match(
                [("a", "r", "c")],
                [("r", "in",
                  ["<" + pyTripleSimple.common_prefixes["rdf"] + "type>"])],
                ["r"])

            if len(rdf_type_breakdown):
                number_of_instances = rdf_type_breakdown[0][1]
            else:
                number_of_instances = 0

            Ni = number_of_instances

            number_of_symbols = ts.n_symbols()
            Ns = number_of_symbols

            number_of_distinct_literals = ts.n_literals()
            Ndl = number_of_distinct_literals

            classes_results = ts.simple_pattern_match(
                [("a", "r", "c")],
                [("r", "in",
                  ["<" + pyTripleSimple.common_prefixes["rdf"] + "type>"])],
                ["c"])
            number_of_distinct_classes = len(classes_results)
            Ndc = number_of_distinct_classes

            number_of_distinct_objects = ts.n_objects(
            ) - number_of_distinct_literals
            Ndo = number_of_distinct_objects

            number_of_distinct_subjects = ts.n_subjects()
            Nds = number_of_distinct_subjects

            number_of_distinct_predicates = ts.n_predicates()
            Ndp = number_of_distinct_predicates

            subject_uris = ts.simple_pattern_match([("s", "p", "o")], [],
                                                   ["s"])
            object_uris = ts.simple_pattern_match([("s", "p", "o")], [], ["o"])

            subject_objects_literals_uris = ts.union_pattern_match_result_set(
                subject_uris, object_uris)

            subject_objects_uris = [
                uresult for uresult in subject_objects_literals_uris
                if uresult[0][0][0] != '"' and uresult[0][0][-1] != '"'
            ]

            number_of_distinct_uris = len(subject_objects_uris)
            Nu = number_of_distinct_uris

            class_coverage = [(class_result[1] * 1.0) / Ni
                              for class_result in classes_results]

            fo.write("\n")
            fo.write("Number of triples (Nt): %s\n" % number_of_triples)
            fo.write("Number of literals (Nl): %s\n" % number_of_literals)
            fo.write("Number of objects (No): %s\n" % number_of_objects)
            fo.write("Number of typed instances (Ni): %s\n" %
                     number_of_instances)

            fo.write("Number of URIs excluding predicates (Nu): %s\n" %
                     number_of_distinct_uris)
            fo.write("Number of distinct classes (Nc): %s\n" %
                     number_of_distinct_classes)

            fo.write("Number of distinct subjects (Nds): %s\n" %
                     number_of_distinct_subjects)
            fo.write("Number of distinct predicates (Ndp): %s\n" %
                     number_of_distinct_predicates)

            fo.write("Number of distinct objects (Ndo): %s\n" %
                     number_of_distinct_objects)

            fo.write("Number of distinct literals (Ndl): %s\n" %
                     number_of_distinct_literals)
            fo.write("Number of distinct lexical symbols (Ndls): %s\n" %
                     number_of_symbols)

            fo.write("\n")
            fo.write("Literalness (Nl/Nt): %s\n" % ((Nl * 1.0) / Nt))
            if Nl > 0:
                fo.write("Literal uniqueness (Ndl/Nl): %s\n" %
                         ((Ndl * 1.0) / Nl))
            else:
                fo.write("Literal uniqueness (Ndl/Nl): undefined")
            fo.write("Object uniqueness (Ndo/No): %s\n" % ((Ndo * 1.0) / No))
            fo.write("Interconnectedness (1 - (Nl+Ni)/Nt): %s\n" %
                     (1.0 - (Nl + Ni) / (Nt * 1.0)))
            fo.write("Subject coverage (Nds/Nu): %s\n" % ((1.0 * Nds) / Nu))
            fo.write("Object coverage (Ndo/Nu): %s\n" % ((1.0 * Ndo) / Nu))
            fo.write("Class coverage: %s\n" % class_coverage)

            #fo.write("Fraction of objects that are literals: %s\n" % ((number_of_distinct_literals * 1.0) / number_of_distinct_objects))
            fo.write("\n")
            fo.write("Top subjects are:\n")
            pprint.pprint(ts.top_subjects(display_n), fo)
            fo.write("\n")
            fo.write("Top objects are:\n")
            pprint.pprint(ts.top_objects(display_n), fo)
            fo.write("\n")
            fo.write("Top predicates are:\n")
            pprint.pprint(ts.top_predicates(None), fo)
            fo.write("\n")
            fo.write("Top classes are:\n")

            pprint.pprint(classes_results, fo)

        elif options.command == "query":
            ts.load_ntriples(f)
            query = eval(options.query)
            if options.restrictions:
                restrictions = eval(options.restrictions)
            else:
                restrictions = []
            if options.variables:
                solution_variables = eval(options.variables)
            else:
                solution_variables = None

            result_set = ts.simple_pattern_match(query, restrictions,
                                                 solution_variables)

            if display_n == "All":
                pass
            else:
                result_set = result_set[:display_n]

            if options.output_format == "stdout":
                pprint.pprint(result_set, fo)
                fo.write("Query returned %s results" % len(result_set))
            elif options.output_format == "ntriples":
                for result in result_set:
                    i = 1
                    for solution in result[0]:
                        if i % 3 == 1:
                            fo.write(solution + " ")
                        elif i % 3 == 2:
                            fo.write(solution + " ")
                        elif i % 3 == 0:
                            fo.write(solution + " .\n")
                        i += 1

            elif options.output_format == "json":
                import json
                json.dump(result_set, fo)
            elif options.output_format == "delimited":
                header = options.header
                delimiter = options.delimiter
                string_tab = ""
                if header:
                    if len(result_set):
                        for solution_variable in solution_variables:
                            fo.write("%s%s" % (solution_variable, delimiter))
                        fo.write("count\n")
                else:
                    pass

                for result in result_set:
                    for solution in result[0]:
                        if options.clean:
                            if len(solution):
                                solution = solution[1:-1]
                        fo.write("%s%s" % (solution, delimiter))
                    fo.write("%s\n" % result[1])

        if options.output_file_name:
            pass
        else:
            print(fo.getvalue())

        fo.close()
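
In query mode the -q, -r and -v strings are passed through eval() into the three simple_pattern_match arguments, so a command-line query is equivalent to the programmatic sketch below (input file name hypothetical):

ts = pyTripleSimple.SimpleTripleStore()
with open("data.nt") as f:  # what -f/--file would name
    ts.load_ntriples(f)
# -q, -r and -v map onto these three arguments, respectively.
result_set = ts.simple_pattern_match(
    [("s", "p", "o")],
    [("p", "in", ["<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"])],
    ["s", "p", "o"])
print(result_set[:50])  # the default -n/--limit of 50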
Example #10
    def test_PyTripleSimpleStore(self):
        ts = pyTripleSimple.SimpleTripleStore()
        ts.load_ntriples(self.test_source)

        self.assertEquals(30,ts.n_triples(),"Wrong number of triples extracted")

def main(ntriples_file_name):
    ts = pyt.SimpleTripleStore()  #Create a triple store object

    try:
        f = open(ntriples_file_name)
    except IOError:
        print("File '%s' could not be read" %
              os.path.abspath(ntriple_file_name))
        raise

    ts.load_ntriples(f)

    rdf_type = "<" + pyt.common_prefixes["rdf"] + 'type>'

    # Get all classes defined with counts
    classes_result = ts.simple_pattern_match([('a', 't', 'c')],
                                             [('t', 'in', [rdf_type])], ['c'])
    class_count = len(classes_result)
    class_sizes = [class_result[1] for class_result in classes_result]
    class_mean = (sum(class_sizes) * 1.0) / class_count
    class_count_normalized = [
        class_size / class_mean for class_size in class_sizes
    ]  # normalize the count by the mean

    # Get all definitions from typed objects
    property_class_results = ts.simple_pattern_match([('a', 'p', 'b'),
                                                      ('a', 't', 'ca'),
                                                      ('b', 't', 'cb')],
                                                     [('t', 'in', [rdf_type]),
                                                      ('p', '!=', 't')],
                                                     ['p', 'ca', 'cb'])
    property_class_relations_count = len(property_class_results)
    property_class_relations_sizes = [
        property_class_result[1]
        for property_class_result in property_class_results
    ]
    property_class_mean = (1.0 * sum(property_class_relations_sizes)
                           ) / property_class_relations_count
    property_class_count_normalized = [
        property_class_size / property_class_mean
        for property_class_size in property_class_relations_sizes
    ]

    gexf_string = ""
    gexf = GephiGexf()
    gexf_string += gexf.xml_header()
    gexf_string += gexf.metadata()
    gexf_string += gexf.open_graph()
    gexf_string += gexf.open_nodes()

    class_dict = {}
    for i in range(class_count):  # Create nodes
        class_name = classes_result[i][0][0][1:-1]
        class_dict[class_name] = i
        gexf_string += gexf.open_node(i,
                                      classes_result[i][0][0][1:-1],
                                      size=class_count_normalized[i])
        gexf_string += gexf.close_node()
    gexf_string += gexf.close_nodes()

    property_dict_normalized = {}
    # Define edges - Gephi does not support parallel edges
    for i in range(property_class_relations_count):
        subject_class, object_class = (property_class_results[i][0][1][1:-1],
                                       property_class_results[i][0][2][1:-1])
        subject_id = class_dict[subject_class]
        object_id = class_dict[object_class]
        relation_pair = (subject_id, object_id)
        if relation_pair in property_dict_normalized:
            # Accumulate the weights of parallel edges
            property_dict_normalized[relation_pair] += property_class_count_normalized[i]
        else:
            property_dict_normalized[relation_pair] = property_class_count_normalized[i]

    gexf_string += gexf.open_edges()
    i = 0
    for relation_pair in property_dict_normalized.keys():  # Output edges
        gexf_string += gexf.open_edge(i, relation_pair[0], relation_pair[1],
                                      property_dict_normalized[relation_pair])
        gexf_string += gexf.close_edge()
        i += 1

    gexf_string += gexf.close_edges()
    gexf_string += gexf.close_graph()
    gexf_string += gexf.close_xml()

    # Write out Gephi file
    try:
        gexf_file_name = ntriples_file_name + ".gexf"
        fg = open(gexf_file_name, "w")
    except IOError:
        print("File %s'' could not be written" %
              os.path.abspath(gexf_file_name))
        raise
    fg.write(gexf_string)
    fg.close()

    #Write out predicate counts to standard output
    print("count\tclass1\tpredicate\tclass2\tclass1Count\tclass2Count")
    for property_class_result in property_class_results:
        count = property_class_result[1]
        property_class_pair = property_class_result[0]
        class_1 = property_class_pair[1][1:-1]
        class_1i = class_dict[class_1]
        predicate = property_class_pair[0][1:-1]
        class_2 = property_class_pair[2][1:-1]
        class_2i = class_dict[class_2]
        class_1n = class_sizes[class_1i]
        class_2n = class_sizes[class_2i]
        print(
            str(count) + '\t' + class_1 + '\t' + predicate + '\t' + class_2 +
            '\t' + str(class_1n) + '\t' + str(class_2n))
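
A sketch of wiring the script above to the command line; the script and file names are hypothetical:

if __name__ == "__main__":
    import sys
    # e.g. python ntriples_to_gexf.py instances.nt
    main(sys.argv[1])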