Beispiel #1
0
class FACCToMongo(object):
    """Loads FACC surface forms from TSV files into a Mongo collection.

    Every TSV line is read as ``surface form<TAB>Freebase URI<TAB>count``.
    """

    def __init__(self, config):
        """Inserts FACC surface forms to Mongo.

        :param config: dict with KEY_COLLECTION, KEY_PATH and KEY_PREDICATE;
            KEY_LOWERCASE is optional and is set to True in place when absent.
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__path = config[KEY_PATH]
        self.__predicate = config[KEY_PREDICATE]
        self.__lowercase = config[KEY_LOWERCASE]
        self.__mongo = None  # created lazily in build()

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values.

        Logs the problem and terminates the process with exit status 1 when
        a mandatory key is missing (or any other error occurs while checking).

        :param config: configuration dict; modified in place.
        """
        try:
            for key in (KEY_COLLECTION, KEY_PATH, KEY_PREDICATE):
                if key not in config:
                    raise Exception(key + " is missing")
            config.setdefault(KEY_LOWERCASE, True)
        except Exception as e:
            # Build the message up front: passing `e` as an extra positional
            # argument without a placeholder would make a standard logger
            # fail to format the record and lose the message.
            PLOGGER.error("Error in config file: " + str(e))
            sys.exit(1)

    def __add_surface_form(self, surface_form, freebase_uri, count):
        """Adds a surface form.

        :param surface_form: surface form text (lowercased if configured so).
        :param freebase_uri: Freebase URI, e.g. /m/047b9p0.
        :param count: amount by which the stored count is increased.
        """
        if self.__lowercase:
            surface_form = surface_form.lower()
        # Increases count; if the id is not associated with the surface form
        # yet, it adds it with count.
        freebase_id = self.__convert_to_fb_id(freebase_uri)
        self.__mongo.inc_in_dict(surface_form, self.__predicate, freebase_id,
                                 count)

    def __convert_to_fb_id(self, fb_uri):
        """Converts /m/047b9p0 to <fb:m.047b9p0>"""
        fb_id = fb_uri.replace("/", ".")
        # fb_id[1:] drops the leading "." that came from the leading "/".
        return "<fb:" + fb_id[1:] + ">"

    def __add_file(self, tsv_filename):
        """Adds name variants from an FACC tsv file.

        :param tsv_filename: path of the TSV file to load.
        """
        PLOGGER.info("Adding name variants from '" + tsv_filename + "'...")
        # The context manager closes the file even if a line fails to parse.
        with open(tsv_filename, "r") as infile:
            for line in infile:
                if not line.strip():
                    continue  # blank lines carry no record
                fields = line.rstrip().split("\t")
                self.__add_surface_form(fields[0], fields[1], int(fields[2]))

    def build(self):
        """Builds surface form collection from FACC annotations.

        Drops the target collection first, then loads every ``.tsv`` file
        found under the configured path (searched recursively).
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        for path, _dirs, files in os.walk(self.__path):
            for fn in files:
                if fn.endswith(".tsv"):
                    self.__add_file(os.path.join(path, fn))
        PLOGGER.info("Collection " + self.__collection + " is built.")
Beispiel #2
0
    def build_collection(self, mappings):
        """Rebuilds the Mongo collection from the given mappings.

        The collection is dropped first; afterwards every value of every
        mapping entry is appended under the ``!<owl:sameAs>`` predicate.
        Progress is printed after each 1000 processed keys.

        :param mappings: mapping from a key to an iterable of values
            (presumably Freebase id -> DBpedia ids; confirm with the caller).
        """
        same_as = "!<owl:sameAs>"
        collection = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        collection.drop()

        for n_done, (fb_id, dbp_ids) in enumerate(mappings.items(), start=1):
            for dbp_id in dbp_ids:
                collection.append_set(fb_id, same_as, [dbp_id])
            if n_done % 1000 == 0:
                print(str(n_done // 1000) + "K entities are added!")
Beispiel #3
0
class Word2VecToMongo(object):
    """Loads plain-text word2vec vectors into a Mongo collection."""

    def __init__(self, config):
        """Validates the config and stores the settings.

        :param config: dict with KEY_COLLECTION (target collection name) and
            KEY_MAPPING_FILE (path of the word2vec text file).
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__w2v_fname = config[KEY_MAPPING_FILE]
        self.__mongo = None  # created lazily in build()

    @staticmethod
    def __check_config(config):
        """Checks params and set default values.

        Prints the problem and terminates the process with exit status 1
        when a key is missing or the mapping file does not exist.

        :param config: configuration dict.
        :return: the same config object.
        """
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_MAPPING_FILE not in config:
                raise Exception(KEY_MAPPING_FILE + " is missing")
            if not op.exists(config[KEY_MAPPING_FILE]):
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            print("Error in config file: ", e)
            # The bare exit() helper is injected by the `site` module and is
            # not guaranteed to exist; SystemExit always is.
            raise SystemExit(1)
        return config

    def __parse_line(self, line):
        """
        Parses a line of the plain-text GoogleNews 300-dim pre-trained corpus.

        :param line: a word followed by whitespace-separated numbers.
        :type line: string
        :return: a (word, vector) tuple; vector is a list of floats.
        """
        word, vec_str = line.rstrip().split(maxsplit=1)
        vector = [float(x) for x in vec_str.split()]

        return word, vector

    def build(self):
        """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus.

        Drops the target collection first, then stores one document per
        input line under the ``vector`` field. Progress is printed after
        every 1000 loaded lines.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        loaded = 0
        # The context manager closes the input even if a line fails to parse.
        with FileUtils.open_file_by_type(self.__w2v_fname) as infile:
            for line in infile:
                if not line.strip():
                    continue  # blank lines carry no vector
                term, vector = self.__parse_line(line)
                self.__mongo.add(term, {'vector': vector})
                loaded += 1
                if loaded % 1000 == 0:
                    # Integer division: report "1K", not "1.0K".
                    print(str(loaded // 1000) + "K lines are loaded.")
Beispiel #4
0
class NTriplesToMongoDB(object):
    """Loads NTriples files into a MongoDB collection.

    Consecutive triples that share a subject are buffered and written
    together once a triple with a different subject arrives.
    """

    def __init__(self, host, db, collection):
        """Opens the target collection and starts with an empty buffer.

        :param host: Mongo host.
        :param db: Mongo database name.
        :param collection: Mongo collection name.
        """
        self.__mongo = Mongo(host, db, collection)
        self.__prefix = URIPrefix()
        # Buffered document: id (subject) and its {predicate: [objects]}.
        self.__m_contents = None
        self.__m_id = None
        logging.basicConfig(level="ERROR")  # no warnings from the rdf parser

    def _next_triple(self, subj, pred, obj):
        """Feeds one triple into the buffer.

        A triple with the buffered subject is merged into the buffer; any
        other subject flushes the buffer and starts a new one.

        :param subj: subject; used as the document id.
        :param pred: predicate; used as the field name.
        :param obj: object; appended to the field's list of values.
        """
        same_subject = (self.__m_id is not None) and (self.__m_id == subj)
        if not same_subject:
            # New subject: store what was collected so far and start over.
            self._write_to_mongo()
            self.__m_id = subj
            self.__m_contents = {pred: [obj]}
            return
        self.__m_contents.setdefault(pred, []).append(obj)

    def _write_to_mongo(self):
        """Flushes the buffered document to the collection and clears it.

        Does nothing when no document is buffered.
        """
        if self.__m_id is None:
            return
        for predicate, objects in self.__m_contents.items():
            self.__mongo.append_set(self.__m_id, predicate, objects)
        self.__m_id = None
        self.__m_contents = None

    def drop(self):
        """Drops the underlying collection."""
        self.__mongo.drop()

    def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
        """Loads the contents of an NTriples file into MongoDB.

        Lines that fail to parse, lines that do not yield a triple and
        triples with an empty non-URI object are skipped.

        :param filename: NTriples file.
        :param reverse_triple: if True, subject and object are swapped.
        :param predicate_prefix: prefix to be added to predicates.
        """
        print("Processing " + filename + "...")

        sink = Triple()
        parser = NTriplesParser(sink)
        # Start with an empty buffer for this file.
        self.__m_id = None
        self.__m_contents = None
        n_triples = 0

        with FileUtils.open_file_by_type(filename) as ntriples:
            for raw_line in ntriples:
                try:
                    parser.parsestring(raw_line.decode("utf-8"))
                except ParseError:
                    # Unparseable line: ignore it.
                    continue
                if sink.subject() is None:
                    # The line did not produce a triple.
                    continue

                subj = self.__prefix.get_prefixed(sink.subject())

                pred = self.__prefix.get_prefixed(sink.predicate())
                if predicate_prefix is not None:
                    pred = predicate_prefix + pred

                # Only URI objects are prefixed; other objects are kept
                # as they are unless they are empty.
                obj = sink.object()
                if type(obj) is URIRef:
                    obj = self.__prefix.get_prefixed(obj)
                elif len(obj) == 0:
                    continue

                if reverse_triple:
                    subj, obj = obj, subj
                self._next_triple(subj, pred, obj)

                n_triples += 1
                if n_triples % 100000 == 0:
                    print(
                        str(n_triples // 1000) + "K lines processed from "
                        + filename)

        # The last buffered document has not been written yet.
        self._write_to_mongo()
class DBpediaSurfaceforms2Mongo(object):
    """Builds a surface form collection from the DBpedia Mongo collection."""

    # Surface forms are used as Mongo keys; forms whose UTF-8 encoding is
    # this many bytes or more are skipped.
    _MAX_KEY_BYTES = 1024

    def __init__(self, config):
        """Inserts DBpedia surface forms to Mongo.

        :param config: dict with KEY_COLLECTION; KEY_LOWERCASE is optional
            and is set to True in place when absent.
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__lowercase = config[KEY_LOWERCASE]
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                     MONGO_COLLECTION_DBPEDIA)
        self.__mongo = None  # created lazily in build_collection()

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values.

        Prints the problem and terminates the process with exit status 1
        when the collection key is missing.

        :param config: configuration dict; modified in place.
        """
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_LOWERCASE not in config:
                config[KEY_LOWERCASE] = True
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def __add_surface_form(self, surface_form, predicate, entity_id):
        """Adds a surface form (removes the disambiguation part from the surface form, if exists).

        Surface forms that are too long for a Mongo key, or that are empty
        after removing the disambiguation part, are skipped.

        :param surface_form: surface form for entity
        :param predicate: predicate that entity is extracted from e.g. <rdfs:label>
        :param entity_id: entity ID
        """
        # Measure the encoded length: sys.getsizeof() reports the size of the
        # in-memory Python object, which says little about stored bytes
        # (non-ASCII text may need up to 4 UTF-8 bytes per character).
        encoded = surface_form.encode("utf-8", errors="replace")
        if len(encoded) >= self._MAX_KEY_BYTES:  # Mongo key limit
            return
        surface_form = surface_form.replace("(disambiguation)", "").strip()
        if not surface_form:
            return  # nothing left to use as a key
        if self.__lowercase:
            surface_form = surface_form.lower()
        self.__mongo.inc_in_dict(surface_form, predicate, entity_id, 1)

    def build_collection(self):
        """Adds all name variants from DBpedia.

        Drops the target collection first, then walks over every document
        of the DBpedia collection. Progress is printed after every 1000
        processed entities (entities without a name are not counted).
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        # iterate through all DBpedia entities
        i = 0
        for mdoc in self.__mongo_dbpedia.find_all():
            entity = EntityUtils(Mongo.unescape_doc(mdoc))

            # skips entities without names
            if not entity.has_name():
                continue

            surface_form = entity.get_name()

            # the entity is redirect page
            if entity.is_redirect():
                entity_id = entity.get_predicate(
                    EntityUtils.PREDICATE_REDIRECT)[0]
                self.__add_surface_form(surface_form,
                                        EntityUtils.PREDICATE_REDIRECT,
                                        entity_id)

            # the entity is disambiguation page
            if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
                entity_ids = entity.get_predicate(
                    EntityUtils.PREDICATE_DISAMBIGUATE)
                for entity_id in entity_ids:
                    self.__add_surface_form(surface_form,
                                            EntityUtils.PREDICATE_DISAMBIGUATE,
                                            entity_id)

            # entity is not a redirect/disambiguation page and has name and abstract
            if entity.is_entity():
                entity_id = entity.get_id()
                # adds entity name
                self.__add_surface_form(surface_form,
                                        EntityUtils.PREDICATE_NAME, entity_id)
                # adds other entity names
                foaf_name_predicate = "<foaf:name>"
                if entity.has_predicate(foaf_name_predicate):
                    # A separate loop name keeps the entity's own name intact.
                    for other_name in entity.get_predicate(
                            foaf_name_predicate):
                        self.__add_surface_form(other_name,
                                                foaf_name_predicate, entity_id)
            i += 1
            if i % 1000 == 0:
                print(str(i // 1000) + "K entities processed")