Example #1
    def __sample_file(self, dir, file):
        """Creates a local from a specific file in a given directory.

        :param dir: directory (relative to path_to_dbpedia)
        :param file:
        """
        t = Triple()
        p = NTriplesParser(t)
        infile = os.path.join(self.path_to_dbpedia, dir, file)
        outfile = os.path.join(self.output_dir, dir, file)
        print("Processing file " + file + " ...")
        i = 0
        with FileUtils.open_file_by_type(infile) as fin:
            fout = FileUtils.open_file_by_type(
                outfile,
                mode="w")  # output file will be of the same type as the input
            for line in fin:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue
                subj = self.prefix.get_prefixed(
                    t.subject())  # prefixing subject
                if subj in self.sample_entities:
                    fout.write(line)
                i += 1
                if i % 100000 == 0:
                    print(str(i // 1000) + "K lines processed")
            fout.close()

    def __load_entity_abstracts(self, filename):
        prefix = URIPrefix()
        t = Triple()
        p = NTriplesParser(t)
        lines_counter = 0
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            # basic line parsing
            line = line.decode("utf-8") if isinstance(line, bytes) else line
            try:
                p.parsestring(line)
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # Subject and object identification
            subj = prefix.get_prefixed(t.subject())
            obj = ""
            if type(t.object()) is URIRef:
                # PLOGGER.error("Error: it is URIRef the parsed obj")
                pass
            else:
                obj = t.object().encode("utf-8")
                if len(obj) == 0:
                    continue  # skip empty objects
            self.__entity_abstracts[subj] = obj

            lines_counter += 1
            if lines_counter % 10000 == 0:
                PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))

        PLOGGER.info("\n### Loading entity abstracts... Done.")
Example #3
    def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
        """Adds contents from an NTriples file to MongoDB.

        :param filename: NTriples file.
        :param reverse_triple: if set True, the subject and object values are swapped.
        :param predicate_prefix: prefix to be added to predicates.
        """
        print("Processing " + filename + "...")

        t = Triple()
        p = NTriplesParser(t)
        self.__m_id = None  # document id for MongoDB -- subj
        self.__m_contents = None  # document contents for MongoDB -- pred, obj
        i = 0

        with FileUtils.open_file_by_type(filename) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # subject prefixing
                subj = self.__prefix.get_prefixed(t.subject())

                # predicate prefixing
                pred = self.__prefix.get_prefixed(t.predicate())
                if predicate_prefix is not None:
                    pred = predicate_prefix + pred

                # Object prefixing
                if type(t.object()) is URIRef:
                    obj = self.__prefix.get_prefixed(t.object())
                else:
                    obj = t.object()
                    if len(obj) == 0:
                        continue  # skip empty objects

                # write or append
                if reverse_triple:  # reverse subj and obj
                    self._next_triple(obj, pred, subj)
                else:  # normal mode
                    self._next_triple(subj, pred, obj)

                i += 1
                if i % 100000 == 0:
                    print(
                        str(i // 1000) + "K lines processed from " + filename)

        # process last triple
        self._write_to_mongo()
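The _next_triple and _write_to_mongo helpers used above are not part of the snippet. A hypothetical sketch of the buffering they imply (one subject accumulated at a time, flushed when the subject changes and once more after the loop); the MongoTripleBuffer name and the pymongo calls are assumptions, not the original code:

from collections import defaultdict

class MongoTripleBuffer:
    def __init__(self, collection):
        self._collection = collection          # e.g. a pymongo collection
        self._m_id = None                      # current subject
        self._m_contents = defaultdict(list)   # predicate -> list of objects

    def _next_triple(self, subj, pred, obj):
        if self._m_id is not None and subj != self._m_id:
            self._write_to_mongo()             # flush the previous subject
        self._m_id = subj
        self._m_contents[pred].append(obj)

    def _write_to_mongo(self):
        if self._m_id is not None and self._m_contents:
            self._collection.update_one({"_id": self._m_id},
                                        {"$set": dict(self._m_contents)},
                                        upsert=True)
        self._m_id = None
        self._m_contents = defaultdict(list)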
Example #4
def axioms(filename):
    stream = Stream()
    parser = NTriplesParser(stream)

    with open(filename, "rb") as data:
        parser.parse(data)

    graph = stream.graph()
    propertise = set()

    for triple in graph:
        propertise.add(triple[1])

    print(propertise)

    dic_fun = functionality(graph, propertise)
    print('1')

    dic_ref = reflexivity(graph, propertise)
    print('2')
    dic_sym = symmetry(graph, propertise)
    print('3')
    dic_inv = inverse(graph, propertise)
    print('4')
    dic_tra = transivity(graph, propertise)
    print('5')

    csvname = filename + '.csv'
    out = open(csvname, 'a', newline='')
    csv_writer = csv.writer(out, dialect='excel')

    for pi in propertise:
        l1 = [pi]
        if (dic_fun[pi] > 0):
            l1.append('functionality')
            l1.append(dic_fun[pi])

        if (dic_ref[pi] == 1):
            l1.append('reflexivity')
        if (dic_sym[pi] == 1):
            l1.append('symmetry')
        if (len(dic_inv[pi]) != 0):
            l1.append('inverse')

        if (dic_tra[pi] == 1):
            l1.append('transivity')

        print(l1)
        csv_writer.writerow(l1)

    out.close()
    print('over')
Example #5
    def parse(self, source, sink, **kwargs):
        '''
        Parse the NT format

        :type source: `rdflib.parser.InputSource`
        :param source: the source of NT-formatted data
        :type sink: `rdflib.graph.Graph`
        :param sink: where to send parsed triples
        :param kwargs: Additional arguments to pass to `.NTriplesParser.parse`
        '''
        f = source.getByteStream()  # TODO getCharacterStream?
        parser = NTriplesParser(NTSink(sink))
        parser.parse(f, **kwargs)
        f.close()
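Such a Parser subclass is normally wired into rdflib through its plugin registry, after which it can be used via Graph.parse. A hedged usage sketch (the plugin name, module path, and class name below are placeholders, not from the snippet):

from rdflib import Graph, plugin
from rdflib.parser import Parser

plugin.register("nt-custom", Parser, "mypackage.mymodule", "MyNTParser")

g = Graph()
g.parse("data.nt", format="nt-custom")  # dispatches to MyNTParser.parse(source, sink=g)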
Example #6
 def __init__(self, config):
     self.__elastic = None
     self.__config = config
     self.__index_name = config["index_name"]
     self.__dbpedia_path = config["dbpedia_files_path"]
     # For triple parsing
     self.__prefix = URIPrefix()
     self.__triple = Triple()
     self.__ntparser = NTriplesParser(self.__triple)
     # Entity abstract and type assignments kept in memory
     self.__entity_abstracts = {}
     self.__load_entity_abstracts()
     self.__types_entities = defaultdict(list)
     self.__load_entity_types()
Example #7
 def get_triples(self):
     self.mode = 'parse'
     parser = NTriplesParser_()
     parser.sink = self.Sink(self)
     self.triple = None
     while True:
         parser.line = self._stream.readline().strip().decode('utf-8')
         if not parser.line:
             break
         try:
             parser.parseline()
         except ParseError:
             raise ParseError("Invalid line: %r" % parser.line)
         if self.triple:
             yield self.triple
             self.triple = None
Example #8
    def parse(self,
              filename: str = None,
              input_format: str = None,
              provided_by: str = None,
              predicates: Set[URIRef] = None) -> None:
        """
        Parse a n-triple file into networkx.MultiDiGraph

        The file must be a *.nt formatted file.

        Parameters
        ----------
        filename : str
            File to read from.
        input_format : str
            The input file format. Must be one of ``['nt', 'nt.gz']``
        provided_by : str
            Define the source providing the input file.

        """
        p = NTriplesParser(self)
        self.start = current_time_in_millis()
        if input_format == INPUT_FORMATS[0]:
            p.parse(open(filename, 'rb'))
        elif input_format == INPUT_FORMATS[1]:
            p.parse(gzip.open(filename, 'rb'))
        else:
            raise NameError(
                f"input_format: {input_format} not supported. Must be one of {INPUT_FORMATS}"
            )
        print("Done parsing NT file")
        self.dereify(self.assocs)
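The INPUT_FORMATS constant referenced above is not shown; based on the docstring it is presumably a module-level list along the lines of:

INPUT_FORMATS = ['nt', 'nt.gz']  # plain and gzip-compressed N-Triples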
Example #9
def process_file(infile, sink):
    bad_lines = defaultdict(int)
    for line in infile:
        s = BytesIO()
        s.write(line)
        s.seek(0)
        parser = NTriplesParser(sink)
        try:
            parser.parse(s)
        except (ParseError, ElementStrError) as e:
            bad_lines[line] += 1

    print('read {} lines from {}'.format(sink.nlines, infile.name))
    print('bad lines and their frequencies:')
    for line, count in bad_lines.items():
        print('  {:>10} : {}'.format(count, line))

    def load(cls, filepath):
        """Return array of FAST dict. Main method."""
        if zipfile.is_zipfile(filepath):
            with ZipFile(filepath) as zf:
                nt_filename = next(
                    (n for n in zf.namelist() if n.endswith('.nt')))
                # defaults to equivalent of 'rb'
                nt_file = zf.open(nt_filename)
        else:
            nt_file = open(filepath, 'rb')

        instance = cls()
        parser = NTriplesParser(instance)
        parser.parse(nt_file)

        nt_file.close()

        return instance.terms
Example #11
    def triples(self, triple_pattern, context=None):
        """Generator over the triple store

        Returns triples that match the given triple pattern. If triple pattern
        does not provide a context, all contexts will be searched.
        """
        (s, p, o) = triple_pattern
        url = self._statement_encode((s, p, o), context)
        req = Request(url)
        req.add_header('Accept', 'text/plain')
        # N-Triples is best for generator (one line per triple)
        log.debug("Request: %s" % req.get_full_url())
        dumper = DumpSink()
        parser = NTriplesParser(dumper)

        for l in urlopen(req):
            log.debug('line: %s' % l)
            parser.parsestring(l)
            yield dumper.get_triple()
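DumpSink is not included in the snippet; a minimal sketch of the sink this generator needs (it buffers one triple at a time so each parsed line can be yielded immediately). The implementation below is an assumption based on the calls above, not the original class:

class DumpSink(object):
    def __init__(self):
        self._triple = None

    def triple(self, s, p, o):
        # keep only the most recently parsed triple
        self._triple = (s, p, o)

    def get_triple(self):
        return self._triple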
Example #13
def transform_part(
    input_path,
    global_id_marker,
    part_name,
    left,
    right,
    prefixer=None,
):
    print(f'starting {part_name}: {left} -- {right}')
    with open(input_path, 'rb') as in_file:
        in_file.seek(left)
        part_bytes = in_file.read(right - left)
        part_str = part_bytes.decode('utf8')  # wasteful
        with PropertyGraphSink(global_id_marker, part_name, prefixer) as sink:
            ntp = NTriplesParser(sink=sink)
            ntp.parsestring(part_str)

    triple_count = sum(sink.predicate_count.values())
    print(f'finished {part_name}: {triple_count} triples')
    return part_name, dict(sink.predicate_count)
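PropertyGraphSink is defined elsewhere; for the snippet above it only needs to be a context manager with a triple() callback and a predicate_count mapping. A hypothetical skeleton under those assumptions:

from collections import Counter

class PropertyGraphSink:
    def __init__(self, global_id_marker, part_name, prefixer=None):
        self.global_id_marker = global_id_marker
        self.part_name = part_name
        self.prefixer = prefixer
        self.predicate_count = Counter()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        return False  # do not suppress exceptions from the parsing block

    def triple(self, s, p, o):
        self.predicate_count[p] += 1
        # ... write s, p, o to the property-graph output for this part here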
Example #14
    def parse_file(self, filename, triplehandler):
        """Parses file and calls callback function with the parsed triple"""
        PLOGGER.info("Processing " + filename + "...")

        prefix = URIPrefix()
        t = Triple(prefix)
        p = NTriplesParser(t)
        i = 0

        with open(filename) as f:
            for line in f:
                p.parsestring(line)
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # call the handler object with the parsed triple
                triplehandler.triple_parsed(t)

                i += 1
                if i % 10000 == 0:
                    PLOGGER.info(str(i // 1000) + "K lines processed")
Example #15
    def read_fb2dbp_file(self, is_39=False):
        """Reads the file and generates an initial mapping of Freebase to DBpedia IDs.
        Only proper DBpedia entities are considered; i.e. redirect and disambiguation pages are ignored.
        """
        fb2dbp_file = self.__fb2dbp_file_39 if is_39 else self.__fb2dbp_file
        print("Processing " + fb2dbp_file + "...")

        t = Triple()
        p = NTriplesParser(t)
        i = 0
        fb2dbp_mapping = defaultdict(set)
        with FileUtils.open_file_by_type(fb2dbp_file) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # prefixing
                dbp_id = self.__prefix.get_prefixed(t.subject())
                fb_id = self.__prefix.get_prefixed(t.object())

                # if reading 3.9 file, converts ID to 2015-10 version
                if is_39:
                    dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                    fb2dbp_mapping[fb_id].add(dbp_id)

                # if reading 2015-10 file, keeps only the proper DBpedia entities
                else:
                    entity_utils = EntityUtils(
                        self.__mongo_dbpedia.find_by_id(dbp_id))
                    if entity_utils.is_entity():
                        fb2dbp_mapping[fb_id].add(dbp_id)
                i += 1
                if i % 1000 == 0:
                    print(str(i // 1000) + "K lines processed")

        return fb2dbp_mapping
Example #16
def classPart(filename):
	
	stream = Stream()
	parser = NTriplesParser(stream) 	
	with open(filename,"rb") as data:
		parser.parse(data)
	graph = stream.graph()	# graph (set) holds the dataset: <s1,p1,o1> <s2,p2,o2>...
	print('successfully loaded')
	
	classes = {}
	c = set()
	p = set()
	for triple in graph:
		c.add(triple[2])
		p.add(triple[0])
		if triple[2] not in classes:
			classes[triple[2]] = set()
		classes[triple[2]].add(triple[0])
	print('the number of classes: ',end='')
	print(len(c))
	print('the number of instances: ',end='')
	print(len(p))
	
	return classes
Example #17
            else:
                log.error(e) 
        return result
        
    def triples(self, (s, p, o), context=None):
        """Generator over the triple store

        Returns triples that match the given triple pattern. If triple pattern
        does not provide a context, all contexts will be searched.
        """
        url = self._statement_encode((s, p, o), context)
        req = Request(url)
        req.add_header('Accept','text/plain') # N-Triples is best for generator (one line per triple)
        log.debug("Request: %s" % req.get_full_url())
        dumper=DumpSink()
        parser=NTriplesParser(dumper)
        
        for l in urlopen(req):
            log.debug('line: %s'%l)
            parser.parsestring(l)
            yield dumper.get_triple() 

    def __len__(self):
        """Returns the number of triples in the graph
        calls http://{self.url}/size  very fast
        """
        return int(urlopen(self.url+"/size").read())

    def set(self, (subject, predicate, object)):
        """Convenience method to update the value of object
        if data:
            data = set(json.loads(data))

        for w in v:
            data.add(w)

        if data:
            self.db.put(k, json.dumps(list(data)))
            logging.info('categories: {0} => {1}'.format(
                unquote(k),
                list(data)[:5]))


if __name__ == '__main__':

    labels = NTriplesParser(sink=Label('./labels'))
    categories = NTriplesParser(
        sink=Category('./categories', labels=labels.sink.db))

    def process_labels(line):
        labels.parsestring(line)

    for filename in [
            './labels_en.nt', './labels_en_uris_id.nt',
            './category_labels_en.nt', './category_labels_en_uris_id.nt'
    ]:
        logging.info('labels: processing: {0}'.format(filename))
        Parallel(n_jobs=N_JOBS)(delayed(process_labels)(line)
                                for line in open(filename))

    def process_categories(line):
Example #19
            index_o = self.res[o]
            self.filedict.get(p).write("{} {}\n".format(index_s, index_o))
            print "{} {} {} --> {} {}".format(s, p, o, index_s, index_o)
        except UnicodeEncodeError:
            print "Unicode error, skipping triple..."
            self.i += 1


# set logging to basic
logging.basicConfig()

pathToFile = sys.argv[1]
targetDir = "graph"

csk = CountSink()
ntp = NTriplesParser(csk)
with open(pathToFile, "r") as anons:
    print "Counting into {}...".format(pathToFile)
    ntp.parse(anons)

f = open(targetDir + '/resources.tsv', 'w')
for r in csk.res:
    f.write(re.sub(r"\n", " ", re.sub(r"\r", " ", r.n3().encode('utf8')[1:-1])) + "\n")

sk = RDFToTensorSink()
sk.set_filedict(csk.filedict)
sk.tensor_size = len(csk.res)
n = NTriplesParser(sk)
with open(pathToFile, "r") as anons:
    print "Extracting relationships from {}...".format(pathToFile)
    n.parse(anons)
Example #20
class IndexerDBpediaTypes(object):
    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        "id": Elastic.notanalyzed_field(),
        "content": Elastic.analyzed_field(),
    }

    def __init__(self, config):
        self.__elastic = None
        self.__config = config
        self.__index_name = config["index_name"]
        self.__dbpedia_path = config["dbpedia_files_path"]
        # For triple parsing
        self.__prefix = URIPrefix()
        self.__triple = Triple()
        self.__ntparser = NTriplesParser(self.__triple)
        # Entity abstract and type assignments kept in memory
        self.__entity_abstracts = {}
        self.__load_entity_abstracts()
        self.__types_entities = defaultdict(list)
        self.__load_entity_types()

    @property
    def name(self):
        return self.__index_name

    def __parse_line(self, line):
        """Parses a line from a ttl file and returns subject and object pair.

        It is used for parsing DBpedia abstracts and entity types.
        The subject is always prefixed.
        For object URIs, it is returned prefixed if from DBpedia otherwise
        None (i.e., types); literal objects are always returned (i.e.,
        abstracts).
        """
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            self.__ntparser.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            return None, None
        if self.__triple.subject() is None:  # only if parsed as a triple
            return None, None

        subj = self.__prefix.get_prefixed(self.__triple.subject())
        obj = None
        if type(self.__triple.object()) is URIRef:
            if self.__triple.object().startswith(
                    "http://dbpedia.org/ontology"):
                obj = self.__prefix.get_prefixed(self.__triple.object())
        else:
            obj = self.__triple.object().encode("utf-8")

        return subj, obj

    def __load_entity_abstracts(self):
        num_lines = 0
        filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            entity, abstract = self.__parse_line(line)
            if abstract and len(abstract) > 0:  # skip empty objects
                self.__entity_abstracts[entity] = abstract

            num_lines += 1
            if num_lines % 10000 == 0:
                PLOGGER.info("  {}K lines processed".format(num_lines // 1000))

        PLOGGER.info("  Done.")

    def __load_entity_types(self):
        num_lines = 0
        for types_file in ENTITY_TYPES_FILES:
            filename = os.sep.join([self.__dbpedia_path, types_file])
            PLOGGER.info("Loading entity types from {}".format(filename))
            for line in FileUtils.read_file_as_list(filename):
                entity, entity_type = self.__parse_line(line)
                if type(entity_type) != str:  # Likely result of parsing error
                    continue
                if not entity_type.startswith("<dbo:"):
                    PLOGGER.info("  Non-DBpedia type: {}".format(entity_type))
                    continue
                if not entity.startswith("<dbpedia:"):
                    PLOGGER.info("  Invalid entity: {}".format(entity))
                    continue
                self.__types_entities[entity_type].append(entity)

                num_lines += 1
                if num_lines % 10000 == 0:
                    PLOGGER.info("  {}K lines processed".format(num_lines //
                                                                1000))
            PLOGGER.info("  Done.")

    def __make_type_doc(self, type_name):
        """Gets the document representation of a type to be indexed, from its
        entity short abstracts."""
        content = "\n".join([
            self.__entity_abstracts.get(e, b"").decode("utf-8")
            for e in self.__types_entities[type_name]
        ])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(
                type_name, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            num_entities = len(self.__types_entities[type_name])
            amount_abstracts_to_sample = min(
                floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN),
                num_entities)
            entities_sample = [
                self.__types_entities[type_name][i] for i in sample(
                    range(num_entities), amount_abstracts_to_sample)
            ]
            content = ""  # reset content
            for entity in entities_sample:
                new_content_candidate = "\n".join([
                    content,
                    self.__entity_abstracts.get(entity, b"").decode("utf-8")
                ])
                # we add an abstract only if by doing so it will not exceed
                # MAX_BULKING_DOC_SIZE
                if len(new_content_candidate) > MAX_BULKING_DOC_SIZE:
                    break
                content = new_content_candidate

        return {"content": content}

    def build_index(self, force=False):
        """Builds the index.

        Note: since DBpedia only has a few hundred types, no bulk indexing is
        needed.

        :param force: if True, the index is overwritten if it already exists;
        False by default.
        :type force: bool
        :return:
        """
        PLOGGER.info("Building type index {}".format(self.__index_name))
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)

        for type_name in self.__types_entities:
            PLOGGER.info("  Adding {} ...".format(type_name))
            contents = self.__make_type_doc(type_name)
            self.__elastic.add_doc(type_name, contents)

        PLOGGER.info("  Done.")

        try:
            c.execute("INSERT INTO entities (entity) VALUES (?)", [(s)])
            s_id = c.lastrowid
        except sqlite3.IntegrityError:
            c.execute("SELECT id FROM entities WHERE entity = ?", [(s)])
            s_id = c.fetchone()[0]
        try:
            c.execute("INSERT INTO properties (property) VALUES (?)", [(p)])
            p_id = c.lastrowid
        except sqlite3.IntegrityError:
            c.execute("SELECT id FROM properties WHERE property = ?", [(p)])
            p_id = c.fetchone()[0]
        try:
            c.execute("INSERT INTO entities (entity) VALUES (?)", [(o)])
            o_id = c.lastrowid
        except sqlite3.IntegrityError:
            c.execute("SELECT id FROM entities WHERE entity = ?", [(o)])
            o_id = c.fetchone()[0]

        # print "{} {} {}".format(s_id, p_id, o_id)


if __name__ == "__main__":
    # Create a new parser and try to parse the NT file.
    sk = StreamSink()
    n = NTriplesParser(sk)
    with open(sys.argv[1], "r") as anons:
        n.parse(anons)
    conn.commit()
    conn.close()
    print "triples = {}".format(sk.length)
Example #22
            dic_dis[triple[2]].add(triple[0])

    return dic_dis


# input the filepath
datasets = Filelist('E:/python/ttldata')

print(datasets)

dic1 = {}

for filename in datasets:
    stream = Stream()
    parser = NTriplesParser(stream)

    with open(filename, "rb") as data:
        parser.parse(data)
    graph = stream.graph()
    ChoiceType(graph, dic1)

print(dic1)

for i in dic1:
    for j in dic1:
        if i == j:
            continue
        if dic1[i] & dic1[j]:
            print(i, ' and ', j, ' are not disjoint')
        else:

from rdflib.plugins.parsers.ntriples import NTriplesParser, Sink
import sys

reload(sys)
sys.setdefaultencoding("utf-8")

class StreamSink(Sink):
    
    def triple(self, s, p, o):
        self.length += 1
        print "Stream of triples s={s}, p={p}, o={o}".format(s=s, p=p, o=o).encode('utf8')
            

if __name__ == "__main__":
    # Create a new parser and try to parse the NT file.
    sk = StreamSink()
    n = NTriplesParser(sk)
    with open(sys.argv[1], "r") as anons:
        n.parse(anons)
    print "triples = {}".format(sk.length)
Example #24
def load(
    dump_file: 'url of the Freebase RDF dump', mid_textid_file:
    'url of the part of the Freebase RDF dump containing type.object.id relations'
):
    engine = create_engine(get_db_url(), pool_recycle=3600)
    Base.metadata.create_all(engine)

    def execute_select(statement, **args):
        db = engine.connect()
        try:
            for row in db.execute(statement, **args):
                yield row
        except OperationalError:
            db.close()
            db = engine.connect()
            for row in db.execute(statement, **args):
                yield row
        finally:
            db.close()

    def execute_edit(statement, **args):
        db = engine.connect()
        try:
            return db.execute(statement, **args)
        except OperationalError:
            db.close()
            db = engine.connect()
            return db.execute(statement, **args)
        finally:
            db.close()

    @lru_cache(maxsize=4096)
    def get_topic_id_from_url(url: str) -> Optional[int]:
        input_id = url.replace('http://rdf.freebase.com/ns',
                               '').replace('.', '/')
        if input_id.startswith('/m/') or input_id.startswith('/g/'):
            for topic in execute_select(
                    Topic.__table__.select(Topic.mid == input_id)):
                return topic[0]
            return execute_edit(insert_query(Topic),
                                mid=input_id).inserted_primary_key[0]
        else:
            if len(input_id) > MAX_VARCHAR_SIZE:
                return None
            for topic in execute_select(
                    Topic.__table__.select(Topic.textid == input_id)):
                return topic[0]
            try:
                return execute_edit(insert_query(Topic),
                                    textid=input_id).inserted_primary_key[0]
            except IntegrityError as e:
                logger.error(e)
                return None

    def add_to_language_column(table, s, label, max_size):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning(
                'Not able to get mid for label subject {}'.format(s))
            return
        if len(label) >= max_size:
            logger.error('Not able to add too long label: {}'.format(label))
            return
        try:
            execute_edit(insert_query(table),
                         topic_id=s_topic_id,
                         language=label.language,
                         value=label.value)
        except IntegrityError:
            pass  # We do not care about duplicates

    def add_type(s, o, notable):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for type subject {}'.format(s))
            return
        o_topic_id = get_topic_id_from_url(o)
        if o_topic_id is None:
            logger.warning('Not able to get mid for type object {}'.format(o))
            return
        try:
            execute_edit(insert_query(Type),
                         topic_id=s_topic_id,
                         type_id=o_topic_id,
                         notable=notable)
        except IntegrityError:
            if notable:
                # We add notability
                execute_edit(Type.__table__.update().where(
                    Type.topic_id == s_topic_id).where(
                        Type.type_id == o_topic_id).values(notable=notable))

    def add_key(s, key):
        if not is_interesting_key(key):
            return False
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        key = decode_key(key)
        if len(key) >= MAX_VARCHAR_SIZE:
            logger.error('Not able to add too long key: {}'.format(key))
            return
        try:
            execute_edit(insert_query(Key),
                         topic_id=s_topic_id,
                         key=decode_key(key))
        except IntegrityError:
            pass

    def add_property_topic_id_field(field_name, s, o):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        o_topic_id = get_topic_id_from_url(o)
        if o_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        try:
            execute_edit(insert_query(Property),
                         topic_id=s_topic_id,
                         **{field_name: o_topic_id})
        except IntegrityError:
            execute_edit(
                update_query(Property).values(**{
                    field_name: o_topic_id
                }).where(Property.topic_id == s_topic_id))

    def add_unique(s, o):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        try:
            execute_edit(insert_query(Property),
                         unique=to_bool(o),
                         topic_id=s_topic_id)
        except IntegrityError:
            execute_edit(
                update_query(Property).values(unique=to_bool(o)).where(
                    Property.topic_id == s_topic_id))

    def add_edge(s, p, o):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        p_topic_id = get_topic_id_from_url(p)
        if p_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(p))
            return
        o_topic_id = get_topic_id_from_url(o)
        if o_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(o))
            return
        try:
            execute_edit(insert_query(Edge),
                         subject_id=s_topic_id,
                         predicate_id=p_topic_id,
                         object_id=o_topic_id)
        except IntegrityError:
            pass

    def to_bool(s):
        s = str(s)
        if s == 'true':
            return True
        elif s == 'false':
            return False
        else:
            raise ValueError("Unexpected value: '{}'".format(s))

    class TextIdSink:
        def triple(self, s, p, o):
            if p == type_object_id:
                s = s.replace('http://rdf.freebase.com/ns',
                              '').replace('.', '/')
                o = o.replace('http://rdf.freebase.com/ns',
                              '').replace('.', '/')
                try:
                    execute_edit(insert_query(Topic), mid=s, textid=o)
                except IntegrityError:
                    pass
            else:
                logger.info('Unexpected triple: {} {} {}'.format(s, p, o))

    class TripleSink:
        def __init__(self, start_cursor=0):
            self.cursor = start_cursor

        def triple(self, s, p, o):
            self.cursor += 1
            if self.cursor % 1000000 == 0:
                print(self.cursor)
                with open('progress.txt', 'wt') as pfp:
                    pfp.write(str(self.cursor))

            try:
                """
                if p == type_object_name:
                    add_to_language_column(Label, s, o, MAX_VARCHAR_SIZE)
                elif p == common_topic_description:
                    add_to_language_column(Description, s, o, sys.maxsize)
                elif p == common_topic_alias:
                    add_to_language_column(Alias, s, o, MAX_VARCHAR_SIZE)
                elif p == type_object_type:
                    add_type(s, o, False)
                elif p == common_topic_notable_types:
                    add_type(s, o, True)
                elif p == type_object_key:
                    add_key(s, o.value)
                if p == type_property_schema:
                    add_property_topic_id_field('schema_id', s, o)
                elif p == type_property_expected_type:
                    add_property_topic_id_field('expected_type_id', s, o)
                elif p == type_property_unique:
                    add_unique(s, o)
                elif p == type_property_master_property:
                    add_property_topic_id_field('master_id', s, o)
                elif p == type_property_reverse_property:
                    add_property_topic_id_field('reverse_id', s, o)
                elif p == type_property_unit:
                    add_property_topic_id_field('unit_id', s, o)
                elif p == type_property_delegated:
                    add_property_topic_id_field('delegated_id', s, o)
                """
                if isinstance(o, URIRef) and o.startswith(
                        'http://rdf.freebase.com/') and p.startswith(
                            'http://rdf.freebase.com/ns/') and not any(
                                b in p for b in edge_blacklist):
                    add_edge(s, p, o)
            except ValueError:
                pass

    with gzip.open(mid_textid_file) as fp:
        NTriplesParser(sink=TextIdSink()).parse(fp)

    with gzip.open(dump_file) as fp:
        cursor = 0
        progress = Path('progress.txt')
        if progress.is_file():
            with progress.open('rt') as fpc:
                cursor = int(fpc.read().strip())
            logger.info('Skipping the first {} lines'.format(cursor))
        for _ in range(cursor):
            fp.readline()
        NTriplesParser(sink=TripleSink(cursor)).parse(fp)
Example #25
		try:    
			p_id = props[p][0]
		except KeyError:
			props[p] = (i, open("matrices/" + str(i), "w+"))
			props[p][1].write("%%MatrixMarket matrix coordinate integer general\n%\nnum_ents num_ents num_nonZeros\n")
			p_id = i
			i += 1
		c.execute("SELECT id FROM entities WHERE entity = ?", [(s)])
		s_id = c.fetchone()[0]
		c.execute("SELECT id FROM entities WHERE entity = ?", [(o)])
		o_id = c.fetchone()[0]
		props[p][1].write(("{} {} 1" + "\n").format(s_id, o_id))
		

TSink = TensorSink()
g = NTriplesParser(TSink)
f = open("test.ttl", 'rb')
g.parse(f)
f.close()
conn.commit()
c.execute("SELECT count(*) FROM entities")
num_ents = c.fetchone()[0]

#close all writers and the database connection
for key, value in props.items():	
	value[1].close()
conn.close()

#create .mtx for all properties with proper head fields
for key, value in props.items():
	id_p = str(value[0])
Example #26
 def parse(self, source, sink, baseURI=None):
     f = source.getByteStream()  # TODO getCharacterStream?
     parser = NTriplesParser(NTSink(sink))
     parser.parse(f)
     f.close()
Example #27
        if not self.labels:
            raise Exception('Labels not set.')

    def triple(self, s, p, o):
        k = o.encode('utf-8')
        s = s.encode('utf-8')
        for v in self.labels.get(s, set()):
            self.store[k].add(v)
        logging.info('categories: {0} => {1}'.format(
            unquote(k),
            list(self.store.get(k))[:5]))


if __name__ == '__main__':

    labels = NTriplesParser(sink=Label())

    def process_labels(line):
        labels.parsestring(line)

    for filename in [
            './labels_en.nt', './labels_en_uris_id.nt',
            './category_labels_en.nt', './category_labels_en_uris_id.nt'
    ]:
        logging.info('labels: processing: {0}'.format(filename))
        Parallel(n_jobs=N_JOBS)(delayed(process_labels)(line)
                                for line in open(filename))

    pickle.dump(labels.sink.store, open('labels.p', 'wb'))

    categories = NTriplesParser(sink=Category(labels=labels.sink.store))
Example #29
    def triples(self, (s, p, o), context=None):
        """Generator over the triple store

        Returns triples that match the given triple pattern. If triple pattern
        does not provide a context, all contexts will be searched.
        """
        ctx = context or self.context
        url = self._statement_encode((s, p, o), ctx)
        req = Request(url)
        req.add_header(
            'Accept', 'text/plain'
        )  # N-Triples is best for generator (one line per triple)
        log.debug("Request: %s" % req.get_full_url())
        dumper = DumpSink()
        parser = NTriplesParser(dumper)

        for l in urlopen(req):
            #log.debug('line: %s'%l)
            parser.parsestring(l)
            yield dumper.get_triple()

    def __len__(self):
        """Returns the number of triples in the graph
        calls http://{self.url}/size  very fast
        """
        return int(urlopen(self.url + "/size").read())

    def set(self, (subject, predicate, object)):
        """Convenience method to update the value of object
Example #30
 def _parse(s):
     n = NTriplesParser()
     n.line = s
     return n.object()
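A usage note, assuming the behaviour of the rdflib internals this helper relies on (not stated in the snippet): object() parses a single term from parser.line, so for example:

# _parse('<http://example.org/s>')   ->  URIRef('http://example.org/s')
# _parse('"hello"@en')               ->  Literal('hello', lang='en')
# _parse('no valid N-Triples term')  ->  raises ParseError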