Ejemplo n.º 1
0
    def build(self, callback_get_doc_content, bulk_size=1000):
        """Builds the index from the documents of the mongo collection.

        To speed up indexing, documents are added to the index in bulks.
        There is an optimum value for the bulk size; try to figure it out.

        :param callback_get_doc_content: a function that gets a (key-unescaped)
            document from mongo and returns the content for indexing, or None
            if the document is to be skipped
        :param bulk_size: number of documents to be added to the index as a bulk
        """
        PLOGGER.info("Building " + self.__index_name + " ...")
        elastic = Elastic(self.__index_name)
        elastic.create_index(self.__mappings, model=self.__model, force=True)

        i = 0  # number of documents accepted for indexing so far
        docs = dict()
        for mdoc in self.__mongo.find_all(no_timeout=True):
            docid = Mongo.unescape(mdoc[Mongo.ID_FIELD])

            # get back document from mongo with keys and _id field unescaped
            doc = callback_get_doc_content(Mongo.unescape_doc(mdoc))
            if doc is None:
                continue
            docs[docid] = doc

            i += 1
            if i % bulk_size == 0:
                elastic.add_docs_bulk(docs)
                docs = dict()
                # whole thousands are reported as "3K" rather than "3.0K";
                # fractions only show up for bulk sizes that are not multiples of 1000
                thousands = i // 1000 if i % 1000 == 0 else i / 1000
                PLOGGER.info(str(thousands) + "K documents indexed")
        # indexing the last bulk of documents (nothing to send if the total
        # is an exact multiple of the bulk size)
        if docs:
            elastic.add_docs_bulk(docs)
        PLOGGER.info("Finished indexing (" + str(i) + " documents in total)")
Ejemplo n.º 2
0
class FACCToMongo(object):
    """Loads FACC surface forms from tsv files into a Mongo collection."""

    def __init__(self, config):
        """Inserts FACC surface forms to Mongo.

        :param config: configuration dict; it is validated (and the lowercase
            flag is defaulted) by __check_config before any value is read
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__path = config[KEY_PATH]
        self.__predicate = config[KEY_PREDICATE]
        self.__lowercase = config[KEY_LOWERCASE]
        self.__mongo = None  # set by build()

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values.

        Logs the problem and terminates the process with exit code 1 if a
        mandatory key is missing.

        :param config: configuration dict (the lowercase flag is added to it
            with value True when absent)
        """
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_PATH not in config:
                raise Exception(KEY_PATH + " is missing")
            if KEY_PREDICATE not in config:
                raise Exception(KEY_PREDICATE + " is missing")
            if KEY_LOWERCASE not in config:
                config[KEY_LOWERCASE] = True
        except Exception as e:
            # the exception is passed as a formatting argument, so the message
            # needs a placeholder for it
            PLOGGER.error("Error in config file: %s", e)
            sys.exit(1)

    def __add_surface_form(self, surface_form, freebase_uri, count):
        """Adds a surface form.

        :param surface_form: surface form (lowercased if configured so)
        :param freebase_uri: Freebase URI, e.g. /m/047b9p0
        :param count: amount the count of the (surface form, entity) pair is
            increased by
        """
        if self.__lowercase:
            surface_form = surface_form.lower()
        # Increases count; if the id is not associated with the surface form yet, it adds it with count.
        freebase_id = self.__convert_to_fb_id(freebase_uri)
        self.__mongo.inc_in_dict(surface_form, self.__predicate, freebase_id,
                                 count)

    def __convert_to_fb_id(self, fb_uri):
        """Converts /m/047b9p0 to <fb:m.047b9p0>"""
        fb_id = fb_uri.replace("/", ".")
        # the leading "/" became a leading "."; drop it
        return "<fb:" + fb_id[1:] + ">"

    def __add_file(self, tsv_filename):
        """Adds name variants from an FACC tsv file.

        Each line is read as: surface form, Freebase URI, count (tab-separated).

        :param tsv_filename: path of the tsv file
        """
        PLOGGER.info("Adding name variants from '" + tsv_filename + "'...")
        # the context manager closes the file even if a line fails to parse
        with open(tsv_filename, "r") as infile:
            for line in infile:
                f = line.rstrip().split("\t")
                self.__add_surface_form(f[0], f[1], int(f[2]))

    def build(self):
        """Builds surface form collection from FACC annotations.

        The target collection is dropped first; then every ".tsv" file found
        under the configured path is added.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        for path, dirs, files in os.walk(self.__path):
            for fn in files:
                if fn.endswith(".tsv"):
                    self.__add_file(os.path.join(path, fn))
        PLOGGER.info("Collection " + self.__collection + " is built.")
Ejemplo n.º 3
0
 def __init__(self, config):
     """Sets up the importer of DBpedia surface forms.

     :param config: configuration dict; it is passed through __check_config
         before the collection name and the lowercase flag are read from it
     """
     self.__check_config(config)
     self.__collection, self.__lowercase = (config[KEY_COLLECTION],
                                            config[KEY_LOWERCASE])
     # connection to the DBpedia collection is made right away
     dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)
     self.__mongo_dbpedia = dbpedia
     # the second Mongo handle is left unset at construction time
     self.__mongo = None
Ejemplo n.º 4
0
    def build(self):
        """Rebuilds the surface form collection from FACC annotations.

        The collection is dropped first; afterwards every file whose name
        ends in ".tsv" under the configured path is added to it.
        """
        collection = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo = collection
        collection.drop()

        for root, _, filenames in os.walk(self.__path):
            tsv_names = [name for name in filenames if name.endswith(".tsv")]
            for name in tsv_names:
                self.__add_file(os.path.join(root, name))
Ejemplo n.º 5
0
 def __init__(self, config):
     """Stores the checked configuration and connects to the DBpedia collection.

     :param config: configuration dict; it is passed through __check_config
         before the collection name and the mapping file entries are read
     """
     self.__check_config(config)
     self.__collection = config[KEY_COLLECTION]
     # regular mapping file plus the 3.9 one (the latter is used for removing duplicates)
     self.__fb2dbp_file, self.__fb2dbp_file_39 = (config[KEY_MAPPING_FILE],
                                                  config[KEY_MAPPING_FILE_39])
     self.__prefix = URIPrefix()
     dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)
     self.__mongo_dbpedia = dbpedia
Ejemplo n.º 6
0
class Entity(object):
    """Lookups over the entity, surface form and Freebase-to-DBpedia collections.

    Each Mongo collection is connected to on first use only.
    """

    def __init__(self):
        # none of the collections is connected until it is needed
        self.__coll_dbpedia = None
        self.__coll_sf_facc = None
        self.__coll_sf_dbpedia = None
        self.__coll_fb2dbp = None

    def __init_coll_dbpedia(self):
        """Connects to the entity (DBpedia) collection unless already done."""
        if self.__coll_dbpedia is not None:
            return
        self.__coll_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                    MONGO_COLLECTION_DBPEDIA)

    def __init_coll_sf_facc(self):
        """Connects to the FACC surface form collection unless already done."""
        if self.__coll_sf_facc is not None:
            return
        self.__coll_sf_facc = Mongo(MONGO_HOST, MONGO_DB,
                                    MONGO_COLLECTION_SF_FACC)

    def __init_coll_sf_dbpedia(self):
        """Connects to the DBpedia surface form collection unless already done."""
        if self.__coll_sf_dbpedia is not None:
            return
        self.__coll_sf_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                       MONGO_COLLECTION_SF_DBPEDIA)

    def __init_coll_fb2dbp(self):
        """Connects to the Freebase2DBpedia collection unless already done."""
        if self.__coll_fb2dbp is not None:
            return
        self.__coll_fb2dbp = Mongo(MONGO_HOST, MONGO_DB,
                                   MONGO_COLLECTION_FREEBASE2DBPEDIA)

    def lookup_en(self, entity_id):
        """Looks up an entity by its identifier.

        :param entity_id: entity identifier ("<dbpedia:Audi_A4>")
        :return: whatever the collection's find_by_id gives for the identifier
        """
        self.__init_coll_dbpedia()
        collection = self.__coll_dbpedia
        return collection.find_by_id(entity_id)

    def lookup_name_facc(self, name):
        """Looks up a name in the FACC surface form collection.

        :param name: surface form to look up
        :return: the stored document, or an empty dict if the lookup result
            is empty/missing
        """
        self.__init_coll_sf_facc()
        found = self.__coll_sf_facc.find_by_id(name)
        if not found:
            return {}
        return found

    def lookup_name_dbpedia(self, name):
        """Looks up a name in the DBpedia surface form collection.

        :param name: surface form to look up
        :return: the stored document, or an empty dict if the lookup result
            is empty/missing
        """
        self.__init_coll_sf_dbpedia()
        found = self.__coll_sf_dbpedia.find_by_id(name)
        if not found:
            return {}
        return found

    def fb_to_dbp(self, fb_id):
        """Converts a Freebase id to DBpedia.

        :param fb_id: Freebase identifier
        :return: the value stored under "!<owl:sameAs>" for the identifier,
            or None if the lookup result is empty/missing
        """
        self.__init_coll_fb2dbp()
        found = self.__coll_fb2dbp.find_by_id(fb_id)
        if not found:
            return None
        return found["!<owl:sameAs>"]
0
    def build(self):
        """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus.

        The target collection is dropped first; then one document is added
        per input line, keyed by the term, with the parsed vector stored
        under 'vector'.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        i = 0
        # the context manager closes the input even if parsing or inserting fails
        with FileUtils.open_file_by_type(self.__w2v_fname) as infile:
            for line in infile:
                term, vector = self.__parse_line(line)
                self.__mongo.add(term, {'vector': vector})
                i += 1
                if i % 1000 == 0:
                    # integer division: reports "3K" rather than "3.0K"
                    PLOGGER.info(str(i // 1000) + "K lines are loaded.")
Ejemplo n.º 8
0
class Word2VecToMongo(object):
    """Loads plain-text word2vec vectors into a Mongo collection."""

    def __init__(self, config):
        """Stores the checked configuration.

        :param config: configuration dict; it is validated by __check_config
            before the collection name and the vector file name are read
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__w2v_fname = config[KEY_MAPPING_FILE]
        self.__mongo = None  # set by build()

    @staticmethod
    def __check_config(config):
        """Checks params and set default values.

        Prints the problem and terminates the process with exit code 1 if a
        mandatory key is missing or the mapping file does not exist.

        :param config: configuration dict
        :return: the configuration dict
        """
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_MAPPING_FILE not in config:
                raise Exception(KEY_MAPPING_FILE + " is missing")
            if not op.exists(config[KEY_MAPPING_FILE]):
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            print("Error in config file: ", e)
            exit(1)
        return config

    def __parse_line(self, line):
        """
        Parses a line of the plain-text GoogleNews 300-dim pre-trained corpus.

        :param line: a whitespace-separated line: the word followed by the
            vector components
        :type line: string
        :return: a (word, vector) tuple, the vector being a list of floats.
        """
        # split once only: everything after the word is the vector
        word, vec_str = line.rstrip().split(maxsplit=1)
        vector = [float(x) for x in vec_str.split()]

        return word, vector

    def build(self):
        """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus.

        The target collection is dropped first; then one document is added
        per input line, keyed by the term, with the parsed vector stored
        under 'vector'.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        i = 0
        # the context manager closes the input even if parsing or inserting fails
        with FileUtils.open_file_by_type(self.__w2v_fname) as infile:
            for line in infile:
                term, vector = self.__parse_line(line)
                self.__mongo.add(term, {'vector': vector})
                i += 1
                if i % 1000 == 0:
                    # integer division: reports "3K" rather than "3.0K"
                    print(str(i // 1000) + "K lines are loaded.")
Ejemplo n.º 9
0
    def build_collection(self):
        """Rebuilds the surface form collection from all DBpedia entities.

        The target collection is dropped, then every named entity of the
        DBpedia collection contributes its name variants.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        processed = 0  # named entities handled so far (nameless ones are not counted)
        for raw_doc in self.__mongo_dbpedia.find_all():
            entity = EntityUtils(Mongo.unescape_doc(raw_doc))

            # nothing to add for an entity without a name
            if not entity.has_name():
                continue

            name = entity.get_name()

            # redirect page: the name is filed under the first value of the redirect predicate
            if entity.is_redirect():
                target_id = entity.get_predicate(
                    EntityUtils.PREDICATE_REDIRECT)[0]
                self.__add_surface_form(name, EntityUtils.PREDICATE_REDIRECT,
                                        target_id)

            # disambiguation page: the name is filed under every listed entity
            if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
                candidate_ids = entity.get_predicate(
                    EntityUtils.PREDICATE_DISAMBIGUATE)
                for candidate_id in candidate_ids:
                    self.__add_surface_form(
                        name, EntityUtils.PREDICATE_DISAMBIGUATE, candidate_id)

            # proper entity (as decided by EntityUtils.is_entity)
            if entity.is_entity():
                own_id = entity.get_id()
                # the entity name itself
                self.__add_surface_form(name, EntityUtils.PREDICATE_NAME,
                                        own_id)
                # plus any further names given by <foaf:name>
                foaf_name_predicate = "<foaf:name>"
                if entity.has_predicate(foaf_name_predicate):
                    for alt_name in entity.get_predicate(foaf_name_predicate):
                        self.__add_surface_form(alt_name, foaf_name_predicate,
                                                own_id)

            processed += 1
            if processed % 1000 == 0:
                print(str(processed // 1000) + "K entities processed")
Ejemplo n.º 10
0
def field_counts2json(out_file):
    """Counts field frequencies over the DBpedia Mongo collection and writes them to a JSON file.

    Every document of the collection is read; for each field (other than
    the ID field) the number of documents containing it is counted.

    :param out_file: path of the JSON file the field counts are written to
    """
    print("Counting fields ...")
    dbpedia_coll = Mongo(MONGO_HOST, MONGO_DB,
                         MONGO_COLLECTION_DBPEDIA).find_all()
    i = 0
    field_counts = dict()
    for entity in dbpedia_coll:
        for field in entity:
            if field == Mongo.ID_FIELD:
                continue
            field_counts[field] = field_counts.get(field, 0) + 1
        i += 1
        if i % 1000000 == 0:
            print("\t", str(int(i / 1000000)), "M entity is processed!")

    # the context manager closes (and thereby flushes) the output file
    with open(out_file, "w") as json_file:
        json.dump(field_counts, json_file, indent=4, sort_keys=True)
    print("\tField count file:", out_file)
Ejemplo n.º 11
0
    def create_sample_file(self):
        """Creates a sample file from the context of index.

        The index content of a few example DBpedia documents is computed with
        get_doc_content and written to "output/example_docs.json".
        """
        mongo = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)

        example_docs = [
            "<dbpedia:Texhoma,_Oklahoma>", "<dbpedia:Karen_Spärck_Jones>",
            "<dbpedia:Audi_A4>", "<dbpedia:Barack_Obama>"
        ]
        doc_contents = {}
        for docid in example_docs:
            doc_contents[docid] = self.get_doc_content(mongo.find_by_id(docid))

        # ensure_ascii=False writes non-ASCII characters as they are, so the
        # file encoding is fixed to UTF-8 instead of depending on the locale;
        # the context manager closes (and flushes) the file
        with open("output/example_docs.json", "w",
                  encoding="utf-8") as json_file:
            json.dump(doc_contents,
                      json_file,
                      indent=4,
                      sort_keys=True,
                      ensure_ascii=False)
Ejemplo n.º 12
0
    def get_doc_content(self, doc):
        """Creates the index content for a given mongo document.

        Both the FSDM fields and the individual (top) fields are kept for
        each document.

        :param doc: a Mongo document
        :return: a document ready for indexing, or None if the document is
            to be ignored
        """
        # Only documents whose ID starts with "<dbpedia:" are considered (just to speed up)
        if not Mongo.unescape(doc[Mongo.ID_FIELD]).startswith("<dbpedia:"):
            return None

        # A document lacking any of the must-have fields is ignored as well
        if any(required not in doc for required in self._config["must_have"]):
            return None

        self._doc_content = defaultdict(list)

        for name in doc:
            # Content for the FSDM fields; the first matching group wins
            if name.lower() in self._config["names"]:
                self._doc_content["names"] += self.__get_field_value(doc[name])
            elif name in self._config["categories"]:
                self._doc_content["categories"] += self.__get_field_value(
                    doc[name])
            elif name in self._config["similar_entity_names"]:
                self._doc_content[
                    "similar_entity_names"] += self.__get_field_value(
                        doc[name])
            elif name not in self._config["blacklist"]:
                # the first value decides between entity links and plain attributes
                if doc[name][0].startswith("<dbpedia:"):
                    target = "related_entity_names"
                else:
                    target = "attributes"
                self._doc_content[target] += self.__get_field_value(
                    doc[name], name)

            # Content for the individual field, independently of the above
            if name in self.__top_fields:
                self._doc_content[name] += self.__get_field_value(doc[name])

        # Each FSDM field keeps only its unique phrases, which are then also
        # added to the catch-all field
        for fsdm_field in self._fsdm_fields:
            unique_phrases = list(set(self._doc_content[fsdm_field]))
            self._doc_content[fsdm_field] = unique_phrases
            self._doc_content[Elastic.FIELD_CATCHALL] += unique_phrases

        return self._doc_content
Ejemplo n.º 13
0
    def build_collection(self, mappings):
        """Rebuilds the Mongo collection from the given mappings.

        The collection is dropped first; then every mapped id is appended
        under the "!<owl:sameAs>" predicate of its key.

        :param mappings: dict-like object mapping an id to an iterable of ids
        """
        collection = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        collection.drop()

        same_as = "!<owl:sameAs>"
        for done, (fb_id, dbp_ids) in enumerate(mappings.items(), start=1):
            for dbp_id in dbp_ids:
                collection.append_set(fb_id, same_as, [dbp_id])
            # progress report every thousand keys
            if done % 1000 == 0:
                print(str(done // 1000) + "K entities are added!")
Ejemplo n.º 14
0
def main(args):
    """Runs small demonstrations of the word2vec functionalities.

    :param args: parsed arguments; `args.word` (if set) is looked up as a
        single word vector, `args.centroid` (if set) is passed to
        get_centroid_vector
    """
    # word2vec main __instances
    w2v_mongo = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_WORD2VEC)
    w2v = Word2Vec(w2v_mongo)
    print(
        "\t\t*** word2vec functionalities, with word vectors from GoogleNews 300-dim pre-trained corpus. ***\n"
    )

    # Testing some functionalities
    if args.word:
        word = args.word.strip()
        vector = w2v.get_vector(word)

        print("word = {}\nvector = {}\nvector dimension = {}\n".format(
            word, vector, vector.shape[0]))

    if args.centroid:
        # named so that the builtin `str` is not shadowed
        expression = args.centroid.strip()
        centroid_v = w2v.get_centroid_vector(expression)
        print("expression = {}\ncentroid vector = {}\n".format(
            expression, centroid_v))
Ejemplo n.º 15
0
def compute_field_counts():
    """Counts field frequencies over all documents of the DBpedia Mongo collection.

    For each field (other than the ID field) the number of documents
    containing it is counted.

    :return: a dictionary of fields and their frequency
    """
    PLOGGER.info("Counting fields ...")
    documents = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA).find_all()
    counts = {}
    for seen, entity in enumerate(documents, start=1):
        for field in entity:
            if field != Mongo.ID_FIELD:
                counts[field] = counts.get(field, 0) + 1
        # progress report every million documents
        if seen % 1000000 == 0:
            PLOGGER.info("\t" + str(int(seen / 1000000)) + "M entity is processed!")
    return counts
Ejemplo n.º 16
0
 def __init_coll_sf_facc(self):
     """Connects to the FACC surface form collection unless already done."""
     if self.__coll_sf_facc is not None:
         return
     self.__coll_sf_facc = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_SF_FACC)
Ejemplo n.º 17
0
 def __init__(self, host, db, collection):
     """Connects to the given Mongo collection and resets the triple buffer.

     :param host: Mongo host
     :param db: Mongo database
     :param collection: Mongo collection
     """
     self.__mongo = Mongo(host, db, collection)
     self.__prefix = URIPrefix()
     # no document is buffered initially (neither id nor contents)
     self.__m_id, self.__m_contents = None, None
     # only errors are let through, i.e. no warnings from the rdf parser
     logging.basicConfig(level="ERROR")
Ejemplo n.º 18
0
class NTriplesToMongoDB(object):
    """Loads the contents of NTriples files into a MongoDB collection.

    Consecutive triples with the same subject are buffered and written
    together.
    """

    def __init__(self, host, db, collection):
        """Connects to the given Mongo collection and resets the triple buffer.

        :param host: Mongo host
        :param db: Mongo database
        :param collection: Mongo collection
        """
        self.__mongo = Mongo(host, db, collection)
        self.__prefix = URIPrefix()
        # no document is buffered initially (neither id nor contents)
        self.__m_id = None
        self.__m_contents = None
        logging.basicConfig(level="ERROR")  # no warnings from the rdf parser

    def _next_triple(self, subj, pred, obj):
        """Processes a triple.

          - Same subject as the buffered document: the object is appended
            to the values of the predicate.
          - Otherwise the buffered document is written out and a new one is
            started with this triple.
        """
        same_subject = (self.__m_id is not None) and (self.__m_id == subj)
        if not same_subject:
            self._write_to_mongo()
            self.__m_id = subj
            self.__m_contents = {pred: [obj]}
            return
        self.__m_contents.setdefault(pred, []).append(obj)

    def _write_to_mongo(self):
        """Writes the buffered document (if any) to the collection and clears the buffer."""
        if self.__m_id is None:
            return
        for field, value in self.__m_contents.items():
            self.__mongo.append_set(self.__m_id, field, value)
        self.__m_id = None
        self.__m_contents = None

    def drop(self):
        """Deletes the collection."""
        self.__mongo.drop()

    def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
        """Adds contents from an NTriples file to MongoDB.

        Lines that cannot be parsed, lines that do not yield a triple and
        triples with an empty non-URI object are skipped.

        :param filename: NTriples file.
        :param reverse_triple: if set True, the subject and object values are swapped.
        :param predicate_prefix: prefix to be added to predicates.
        """
        print("Processing " + filename + "...")

        triple = Triple()
        parser = NTriplesParser(triple)
        # start with an empty buffer: id is the subject, contents are pred -> objects
        self.__m_id = None
        self.__m_contents = None
        processed = 0

        with FileUtils.open_file_by_type(filename) as infile:
            for line in infile:
                try:
                    parser.parsestring(line.decode("utf-8"))
                except ParseError:
                    # unparsable line
                    continue
                if triple.subject() is None:
                    # the line was not parsed as a triple
                    continue

                subj = self.__prefix.get_prefixed(triple.subject())

                pred = self.__prefix.get_prefixed(triple.predicate())
                if predicate_prefix is not None:
                    pred = predicate_prefix + pred

                # only URI objects are prefixed; other objects are kept as they are
                if type(triple.object()) is URIRef:
                    obj = self.__prefix.get_prefixed(triple.object())
                else:
                    obj = triple.object()
                    if len(obj) == 0:
                        continue

                # in reverse mode the object plays the role of the subject
                if reverse_triple:
                    subj, obj = obj, subj
                self._next_triple(subj, pred, obj)

                processed += 1
                if processed % 100000 == 0:
                    print(
                        str(processed // 1000) + "K lines processed from " +
                        filename)

        # the last buffered document has not been written yet
        self._write_to_mongo()
Ejemplo n.º 19
0
 def word2vec(self):
     """Returns the Word2Vec instance, creating it on first access."""
     if self.__word2vec is not None:
         return self.__word2vec
     self.__word2vec = Word2Vec(
         Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_WORD2VEC))
     return self.__word2vec
Ejemplo n.º 20
0
 def __init_coll_sf_dbpedia(self):
     """Connects to the DBpedia surface form collection unless already done."""
     if self.__coll_sf_dbpedia is not None:
         return
     self.__coll_sf_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                    MONGO_COLLECTION_SF_DBPEDIA)
Ejemplo n.º 21
0
class Freebase2DBpedia2Mongo(object):
    """Builds a Freebase-to-DBpedia id mapping and stores it in a Mongo collection."""

    def __init__(self, config):
        """Stores the checked configuration and connects to the DBpedia collection.

        :param config: configuration dict; it is passed through
            __check_config before any value is read from it
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__fb2dbp_file = config[KEY_MAPPING_FILE]
        # the 3.9 mapping is used for removing duplicates
        self.__fb2dbp_file_39 = config[KEY_MAPPING_FILE_39]
        self.__prefix = URIPrefix()
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                     MONGO_COLLECTION_DBPEDIA)

    @staticmethod
    def __check_config(config):
        """Checks that the mandatory parameters are given and the mapping files exist.

        Prints the problem and terminates the process with exit code 1
        otherwise.

        :param config: configuration dict
        :return: the configuration dict
        """
        try:
            for key in (KEY_COLLECTION, KEY_MAPPING_FILE, KEY_MAPPING_FILE_39):
                if key not in config:
                    raise Exception(key + " is missing")
            both_exist = (os.path.exists(config[KEY_MAPPING_FILE])
                          and os.path.exists(config[KEY_MAPPING_FILE_39]))
            if not both_exist:
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)
        return config

    def read_fb2dbp_file(self, is_39=False):
        """Reads a mapping file and generates an initial mapping of Freebase to DBpedia IDs.

        For the 3.9 file every DBpedia ID is converted to its 2015-10 form
        and kept. For the other file an ID is kept only if
        EntityUtils.is_entity holds for its DBpedia document.

        :param is_39: if True, the 3.9 mapping file is read
        :return: defaultdict mapping each Freebase ID to a set of DBpedia IDs
        """
        path = self.__fb2dbp_file_39 if is_39 else self.__fb2dbp_file
        print("Processing " + path + "...")

        triple = Triple()
        parser = NTriplesParser(triple)
        mapping = defaultdict(set)
        line_count = 0
        with FileUtils.open_file_by_type(path) as infile:
            for line in infile:
                try:
                    parser.parsestring(line.decode("utf-8"))
                except ParseError:
                    # unparsable line
                    continue
                if triple.subject() is None:
                    # the line was not parsed as a triple
                    continue

                # the subject is the DBpedia side, the object the Freebase side
                dbp_id = self.__prefix.get_prefixed(triple.subject())
                fb_id = self.__prefix.get_prefixed(triple.object())

                if is_39:
                    # 3.9 IDs are brought to the 2015-10 version
                    dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                    mapping[fb_id].add(dbp_id)
                else:
                    # 2015-10 IDs are kept only if they pass is_entity
                    entity = EntityUtils(
                        self.__mongo_dbpedia.find_by_id(dbp_id))
                    if entity.is_entity():
                        mapping[fb_id].add(dbp_id)

                line_count += 1
                if line_count % 1000 == 0:
                    print(str(line_count // 1000) + "K lines are processed!")

        return mapping

    def load_fb2dbp_mapping(self):
        """Resolves Freebase IDs that are mapped to more than one DBpedia ID.

        For such an ID, one DBpedia ID taken from the 3.9 mapping is kept if
        it is among the candidates; otherwise all candidates are kept and
        the case is printed. IDs with a single mapping are kept as they are.

        :return: defaultdict mapping each Freebase ID to a list of DBpedia IDs
        """
        mappings = defaultdict(list)
        fb2dbp_39 = self.read_fb2dbp_file(is_39=True)
        fb2dbp = self.read_fb2dbp_file()

        for fb_id, dbp_ids in fb2dbp.items():
            if len(dbp_ids) <= 1:
                mappings[fb_id] = list(dbp_ids)
                continue

            candidates_39 = fb2dbp_39.get(fb_id, None)
            # an arbitrary element of the 3.9 candidates (removed from that set)
            dbp_id_39 = candidates_39.pop() if candidates_39 else None
            if dbp_id_39 in dbp_ids:
                mappings[fb_id].append(dbp_id_39)
            else:
                mappings[fb_id] = list(dbp_ids)
                print(fb_id, "3.9", dbp_id_39, "2015", dbp_ids)

        print(len(mappings))
        return mappings

    def build_collection(self, mappings):
        """Rebuilds the Mongo collection from the given mappings.

        The collection is dropped first; then every DBpedia ID is appended
        under the "!<owl:sameAs>" predicate of its Freebase ID.

        :param mappings: dict-like object mapping a Freebase ID to an
            iterable of DBpedia IDs
        """
        collection = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        collection.drop()

        same_as = "!<owl:sameAs>"
        for done, (fb_id, dbp_ids) in enumerate(mappings.items(), start=1):
            for dbp_id in dbp_ids:
                collection.append_set(fb_id, same_as, [dbp_id])
            if done % 1000 == 0:
                print(str(done // 1000) + "K entities are added!")
Ejemplo n.º 22
0
 def __init_coll_fb2dbp(self):
     """Connects to the Freebase2DBpedia collection unless already done."""
     if self.__coll_fb2dbp is not None:
         return
     self.__coll_fb2dbp = Mongo(MONGO_HOST, MONGO_DB,
                                MONGO_COLLECTION_FREEBASE2DBPEDIA)
Ejemplo n.º 23
0
 def __init_coll_dbpedia(self):
     """Connects to the entity (DBpedia) collection unless already done."""
     if self.__coll_dbpedia is not None:
         return
     self.__coll_dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)
Ejemplo n.º 24
0
 def __init__(self, index_name, mappings, collection, model=Elastic.BM25):
     """Stores the index settings and connects to the Mongo collection.

     :param index_name: name of the index
     :param mappings: mappings of the index
     :param collection: name of the Mongo collection
     :param model: model setting kept for the index (Elastic.BM25 by default)
     """
     self.__index_name, self.__mappings = index_name, mappings
     self.__mongo = Mongo(MONGO_HOST, MONGO_DB, collection)
     self.__model = model
Ejemplo n.º 25
0
 def __init__(self):
     """Creates the Word2Vec instance on top of the word2vec Mongo collection."""
     self.__word2vec = Word2Vec(
         Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_WORD2VEC))
Ejemplo n.º 26
0
class DBpediaSurfaceforms2Mongo(object):
    """Collects the name variants of DBpedia entities into a Mongo collection."""

    def __init__(self, config):
        """Sets up the importer of DBpedia surface forms.

        :param config: configuration dict; it is passed through
            __check_config before any value is read from it
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__lowercase = config[KEY_LOWERCASE]
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                     MONGO_COLLECTION_DBPEDIA)
        self.__mongo = None  # set by build_collection()

    @staticmethod
    def __check_config(config):
        """Checks the mandatory config parameter and defaults the lowercase flag to True.

        Prints the problem and terminates the process with exit code 1 if
        the collection name is missing.

        :param config: configuration dict (updated in place with the default)
        """
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            config.setdefault(KEY_LOWERCASE, True)
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def __add_surface_form(self, surface_form, predicate, entity_id):
        """Adds a surface form (without its "(disambiguation)" part, if any).

        :param surface_form: surface form for entity
        :param predicate: predicate that entity is extracted from e.g. <rdfs:label>
        :param entity_id: entity ID
        """
        # Overly large surface forms are dropped (Mongo key limit).
        # NOTE(review): sys.getsizeof is the size of the Python object, not
        # of the encoded key -- confirm this is the intended measure
        if sys.getsizeof(surface_form) >= 1024:
            return
        cleaned = surface_form.replace("(disambiguation)", "").strip()
        if self.__lowercase:
            cleaned = cleaned.lower()
        self.__mongo.inc_in_dict(cleaned, predicate, entity_id, 1)

    def build_collection(self):
        """Rebuilds the surface form collection from all DBpedia entities.

        The target collection is dropped, then every named entity of the
        DBpedia collection contributes its name variants.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        processed = 0  # named entities handled so far (nameless ones are not counted)
        for raw_doc in self.__mongo_dbpedia.find_all():
            entity = EntityUtils(Mongo.unescape_doc(raw_doc))

            # nothing to add for an entity without a name
            if not entity.has_name():
                continue

            name = entity.get_name()

            # redirect page: the name is filed under the first value of the redirect predicate
            if entity.is_redirect():
                target_id = entity.get_predicate(
                    EntityUtils.PREDICATE_REDIRECT)[0]
                self.__add_surface_form(name, EntityUtils.PREDICATE_REDIRECT,
                                        target_id)

            # disambiguation page: the name is filed under every listed entity
            if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
                candidate_ids = entity.get_predicate(
                    EntityUtils.PREDICATE_DISAMBIGUATE)
                for candidate_id in candidate_ids:
                    self.__add_surface_form(
                        name, EntityUtils.PREDICATE_DISAMBIGUATE, candidate_id)

            # proper entity (as decided by EntityUtils.is_entity)
            if entity.is_entity():
                own_id = entity.get_id()
                # the entity name itself
                self.__add_surface_form(name, EntityUtils.PREDICATE_NAME,
                                        own_id)
                # plus any further names given by <foaf:name>
                foaf_name_predicate = "<foaf:name>"
                if entity.has_predicate(foaf_name_predicate):
                    for alt_name in entity.get_predicate(foaf_name_predicate):
                        self.__add_surface_form(alt_name, foaf_name_predicate,
                                                own_id)

            processed += 1
            if processed % 1000 == 0:
                print(str(processed // 1000) + "K entities processed")