Code example #1
    def __entity_centric(self, query):
        """Entity-centric TTI.

        :param query: query string
        :type query: str
        """
        types = dict()  # to be returned

        # Set the configurations
        model = self.__config.get("model", TTI_MODEL_BM25)
        ec_cutoff = self.__config.get("ec_cutoff", DEFAULT_TTI_EC_K_CUTOFF)
        self.__ec_retr_config = dict()
        for param in ["smoothing_method", "smoothing_param"]:
            if self.__config.get(param, None) is not None:
                self.__ec_retr_config[param] = self.__config.get(param)

        # Perform EC TTI using late fusion support
        late_fusion_scorer = LateFusionScorer(
            self.__config["index"],
            model,
            self.__ec_retr_config,
            num_docs=ec_cutoff,
            field="catchall",
            run_id=self.__config["run_id"],
            num_objs=self.__config["num_docs"])
        ret_res = late_fusion_scorer.score_query(
            query, assoc_fun=self.__entity_centric_mapper)

        for doc_id, score in ret_res.get_scores_sorted():
            types[doc_id] = {"score": score}
        PLOGGER.info("done")

        return types
Code example #2
File: instances.py Project: zxlzr/nordlys
    def to_str(self, file_name=None):
        """ Converts instances to string and write them to the given file.
        :param file_name
        :return: String format of instances
        """
        out_file = None
        if file_name is not None:
            open(file_name, "w").close()  # cleans previous contents
            out_file = open(file_name, "a")

        counter = 0
        out = ""
        for ins in self.get_all():
            out += ins.to_str() + "\n"
            counter += 1
            # append instances to the file
            if (counter % 1000) == 0:
                # print "Converting is done until instance " + str(ins.id)
                if out_file is not None:
                    out_file.write(out)
                    out = ""
        if out_file is not None:
            out_file.write(out)
            PLOGGER.info("String output:\t" + file_name)
            return None
        return out
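A minimal usage sketch for to_str (file names are hypothetical; loading via from_json is shown in Code example #23 below):

# Hedged sketch; file names are illustrative.
inss = Instances.from_json("instances.json")
inss.to_str("instances.txt")      # flushes to the file in batches of 1000 instances, returns None
as_string = inss.to_str()         # no file given: returns the concatenated string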
Code example #3
File: el.py Project: theVoogie/nordlys
    def batch_linking(self):
        """Scores queries in a batch and outputs results."""
        results = {}

        if self.__config["step"] == "linking":
            queries = json.load(open(self.__query_file))
            for qid in sorted(queries):
                results[qid] = self.link(queries[qid], qid)
            json.dump(results,
                      open(self.__output_file, "w"),
                      indent=4,
                      sort_keys=True)

        # only ranking step
        if self.__config["step"] == "ranking":
            queries = json.load(open(self.__query_file))
            for qid in sorted(queries):
                linker = self.__get_linker(Query(queries[qid], qid))
                results[qid] = linker.rank_ens()
            ranked_inss = Instances(
                sum([inss.get_all() for inss in results.values()], []))
            ranked_inss.to_json(self.__output_file)

        # only disambiguation step
        if self.__config["step"] == "disambiguation":
            inss = Instances.from_json(self.__config["test_set"])
            inss_by_query = inss.group_by_property("qid")
            for qid, q_inss in sorted(inss_by_query.items()):
                linker = self.__get_linker("")
                results[qid] = linker.disambiguate(Instances(q_inss))
            to_elq_eval(results, self.__output_file)

        PLOGGER.info("Output file: " + self.__output_file)
Code example #4
    def __type_centric(self, query):
        """Type-centric TTI.

        :param query: query string
        :type query: str
        """
        types = dict()
        model = self.__config.get("model", TTI_MODEL_BM25)
        elastic = ElasticCache(
            self.__tc_config.get("index", DEFAULT_TTI_TC_INDEX))

        if model == TTI_MODEL_BM25:
            PLOGGER.info("TTI, TC, BM25")
            self.__tc_config["model"] = "bm25"
            # scorer = Scorer.get_scorer(elastic, query, self.__tc_config)
            types = Retrieval(self.__tc_config).retrieve(query)

        elif model == TTI_MODEL_LM:
            PLOGGER.debug("TTI, TC, LM")
            self.__tc_config["model"] = "lm"  # Needed for 2nd-pass
            self.__tc_config["field"] = "content"  # Needed for 2nd-pass
            self.__tc_config["second_pass"] = {"field": "content"}
            for param in ["smoothing_method", "smoothing_param"]:
                if self.__config.get(param, None) is not None:
                    self.__tc_config["second_pass"][param] = self.__config.get(
                        param)

            scorer = Scorer.get_scorer(elastic, query, self.__tc_config)
            types = Retrieval(self.__tc_config).retrieve(query, scorer)

            PLOGGER.info(types)

        return types
Code example #5
    def __make_type_doc(self, type_name):
        """Gets the document representation of a type to be indexed, from its
        entity short abstracts."""
        content = "\n".join([
            self.__entity_abstracts.get(e, b"").decode("utf-8")
            for e in self.__types_entities[type_name]
        ])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(
                type_name, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            num_entities = len(self.__types_entities[type_name])
            amount_abstracts_to_sample = min(
                floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN),
                num_entities)
            entities_sample = [
                self.__types_entities[type_name][i] for i in sample(
                    range(num_entities), amount_abstracts_to_sample)
            ]
            content = ""  # reset content
            for entity in entities_sample:
                new_content_candidate = "\n".join([
                    content,
                    self.__entity_abstracts.get(entity, b"").decode("utf-8")
                ])
                # we add an abstract only if by doing so it will not exceed
                # MAX_BULKING_DOC_SIZE
                if len(new_content_candidate) > MAX_BULKING_DOC_SIZE:
                    break
                content = new_content_candidate

        return {"content": content}
Code example #6
    def batch_identification(self):
        """Annotates, in a batch, queries with identified target types, and outputs results."""
        queries = json.load(FileUtils.open_file_by_type(self.__query_file))

        f_trec_out = None
        if "trec_output_file" in self.__config:  # for TREC-formatted outputting
            f_trec_out = FileUtils.open_file_by_type(
                self.__config["trec_output_file"], mode="w")

        results = dict()
        for query_id in sorted(queries):
            PLOGGER.info("Identifying target types for [{}] {}".format(
                query_id, queries[query_id]))
            results[query_id] = self.identify(queries[query_id])

            # Output resulting scores in TREC format if required
            if f_trec_out:
                type_to_score = dict()
                for d in results.get(query_id, {}).get("results", {}).values():
                    type_to_score[d["type"]] = d["score"]
                ret_res = RetrievalResults(type_to_score)
                ret_res.write_trec_format(query_id,
                                          self.__config["run_id"],
                                          f_trec_out,
                                          max_rank=self.__config["num_docs"])

        json.dump(results,
                  FileUtils.open_file_by_type(self.__output_file, mode="w"),
                  indent=4,
                  sort_keys=True)
        PLOGGER.info("Output file: {}".format(self.__output_file))

        if f_trec_out:
            f_trec_out.close()
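The snippet reads its settings from self.__config; a hedged illustration of such a configuration follows. Only trec_output_file, run_id, and num_docs appear above; the query_file and output_file keys are assumptions about how the private attributes are filled.

# Illustrative configuration only; key names other than trec_output_file,
# run_id, and num_docs are assumptions.
config = {
    "query_file": "data/queries.json",          # assumed source of self.__query_file
    "output_file": "output/tti_results.json",   # assumed source of self.__output_file
    "trec_output_file": "output/tti_run.trec",  # optional; enables TREC-formatted output
    "run_id": "tti_bm25",
    "num_docs": 10,
}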
Code example #7
File: create_sample.py Project: zxlzr/nordlys
    def __sample_file(self, dir, file):
        """Creates a local from a specific file in a given directory.

        :param dir: directory (relative to path_to_dbpedia)
        :param file:
        """
        t = Triple()
        p = NTriplesParser(t)
        infile = os.path.join(self.path_to_dbpedia, dir, file)
        outfile = os.path.join(self.output_dir, dir, file)
        PLOGGER.info("Processing file " + file + " ...")
        i = 0
        with FileUtils.open_file_by_type(infile) as fin:
            fout = FileUtils.open_file_by_type(outfile, mode="w")  # output file will be of the same type as the input
            for line in fin:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # skip lines not parsed as a triple
                    continue
                subj = self.prefix.get_prefixed(t.subject())  # prefixing subject
                if subj in self.sample_entities:
                    fout.write(line)
                i += 1
                if i % 100000 == 0:
                    PLOGGER.info(str(i // 1000) + "K lines processed")
            fout.close()
Code example #8
File: instances.py Project: zxlzr/nordlys
    def to_treceval(self, file_name, qid_prop="qid", docid_prop="en_id"):
        """
        Generates a TREC style run file
        - If there is an entity ranked more than once for the same query, the one with higher score is kept.

        :param file_name: File to write TREC file
        :param qid_prop: Name of instance property to be used as query ID (1st column)
        :param docid_prop: Name of instance property to be used as document ID (3rd column)
        """
        unique_entries = defaultdict(dict)
        # sort and rank entities
        for ins in self.get_all():
            if ins.score is not None:
                qid, doc_id = ins.get_property(qid_prop), ins.get_property(
                    docid_prop)
                score = unique_entries.get(qid, {}).get(doc_id, None)
                if (score is None) or (score < ins.score):
                    unique_entries[qid][doc_id] = ins.score

        out_str = ""
        for qid, docs in sorted(unique_entries.items()):
            rank = 1
            for doc_id, score in sorted(docs.items(),
                                        key=lambda x: x[1],
                                        reverse=True):
                out_str += qid + "\tQ0\t" + doc_id + "\t" + str(
                    rank) + "\t" + "{0:.5f}".format(score) + "\tnordlys\n"
                rank += 1
        open(file_name, "w").write(out_str)
        PLOGGER.info("Trec-eval output:\t" + file_name)
Code example #9
    def __add_file(self, tsv_filename):
        """Adds name variants from an FACC tsv file."""
        PLOGGER.info("Adding name variants from '" + tsv_filename + "'...")
        infile = open(tsv_filename, "r")
        for line in infile:
            f = line.rstrip().split("\t")
            self.__add_surface_form(f[0], f[1], int(f[2]))
        infile.close()
Code example #10
File: el_utils.py Project: zxlzr/nordlys
def load_kb_snapshot(kb_file):
    """Loads DBpedia Snapshot of proper name entities (used for entity linking)."""
    if config.KB_SNAPSHOT is None:
        PLOGGER.info("Loading KB snapshot of proper named entities ...")
        kb_snapshot = set()
        with open(kb_file, "r") as f:
            for line in f:
                kb_snapshot.add(line.strip())
        config.KB_SNAPSHOT = kb_snapshot
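A minimal sketch of loading and using the snapshot; the file name and entity ID are illustrative, and the snapshot file is assumed to contain one entity ID per line. Because the set is cached in config.KB_SNAPSHOT, repeated calls are no-ops.

# Illustrative usage; file name and entity ID are made up.
load_kb_snapshot("data/kb_snapshot.txt")
if "<dbpedia:Audi_A4>" in config.KB_SNAPSHOT:
    PLOGGER.info("Entity is in the KB snapshot")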
Code example #11
    def build(self):
        """Builds surface form collection from FACC annotations."""
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        for path, dirs, files in os.walk(self.__path):
            for fn in files:
                if fn.endswith(".tsv"):
                    self.__add_file(os.path.join(path, fn))
        PLOGGER.info("Collection " + self.__collection + " is built.")
Code example #12
def main(args):
    run = TrecRun(args.run_file)

    if args.operation == "stat":
        run.print_stat()
    elif args.operation == "filter":
        if len(args.doc_ids_file) == 0 or len(args.output_file) == 0:
            PLOGGER.info("doc_ids_file or output_file missing")
        else:
            run.filter(args.doc_ids_file, args.output_file)
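The args object could, for instance, be produced with argparse; the flag layout below is an assumption, and only the attribute names (run_file, operation, doc_ids_file, output_file) and the operation values come from the snippet.

# Hedged CLI sketch; the actual nordlys command-line options may differ.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("run_file", help="TREC run file")
    parser.add_argument("operation", choices=["stat", "filter"])
    parser.add_argument("--doc_ids_file", default="", help="file with document IDs to keep")
    parser.add_argument("--output_file", default="", help="filtered run output file")
    main(parser.parse_args())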
Code example #13
def main(args):
    config = FileUtils.load_config(args.config)

    type2entity_file = os.path.expanduser(config.get("type2entity_file", ""))
    entity_abstracts_file = os.path.expanduser(config.get("entity_abstracts_file", ""))
    if (not os.path.isfile(type2entity_file)) or (not os.path.isfile(entity_abstracts_file)):
        exit(1)

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
    PLOGGER.info("Index build: <{}>".format(indexer.name))
Code example #14
File: el_utils.py Project: theVoogie/nordlys
def to_elq_eval(annotations, output_file):
    """Write entity annotations to ELQ evaluation format.

    :param annotations: {qid: [{"mention": xx, "entity": yy, "score": zz}, ..], ..}
    :param output_file: output file name
    """
    out_str = ""
    for qid, q_annots in sorted(annotations.items()):
        for annot in q_annots:
            out_str += qid + "\t1\t" + annot["entity"] + "\n"
    open(output_file, "w").write(out_str)
    PLOGGER.info("ELQ evaluation file: " + output_file)
Code example #15
    def get_top_term(self, en, n):
        """Returns top-n fields with highest document frequency for the given entity ID."""
        doc_freq = {}
        if self.DEBUG:
            PLOGGER.info("Entity:[" + en + "]")
        for field in self.fields:
            df = self.elastic.doc_freq(en, field)
            if df > 0:
                doc_freq[field] = df
        top_fields = self.__get_top_n(doc_freq, n)
        return top_fields
Code example #16
File: ml.py Project: zxlzr/nordlys
    def output(self, instances):
        """Writes results to output file.

        :param instances: Instances object
        """
        with open(self.__config["output_file"], "w") as f:
            f.write("id\tscore\n")  # output to file
            PLOGGER.info("id\ttarget\tscore\n")
            for ins in instances.get_all():
                f.write(ins.id + "\t" + "{0:.5f}".format(ins.score) +
                        "\n")  # output to file
        PLOGGER.info("Output saved in: " + self.__config["output_file"])
Code example #17
File: word2vec2mongo.py Project: zxlzr/nordlys
    def build(self):
        """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus."""
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        infile = FileUtils.open_file_by_type(self.__w2v_fname)
        i = 0
        for line in infile:
            term, vector = self.__parse_line(line)
            self.__mongo.add(term, {'vector': vector})
            i += 1
            if i % 1000 == 0:
                PLOGGER.info(str(i // 1000) + "K lines are loaded.")
Code example #18
    def build_collection(self, mappings):
        """Builds Mongo collection"""
        mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        mongo.drop()

        predicate = "!<owl:sameAs>"
        i = 0
        for fb_id, dbp_ids in mappings.items():
            for dbp_id in dbp_ids:
                mongo.append_set(fb_id, predicate, [dbp_id])
            i += 1
            if i % 1000 == 0:
                PLOGGER.info(str(i // 1000) + "K entities are added!")
Code example #19
def main(args):
    config = FileUtils.load_config(args.config)
    dbpedia_path = config.get("dbpedia_files_path", "")
    # Check DBpedia files
    PLOGGER.info("Checking needed DBpedia files under {}".format(dbpedia_path))
    for fname in [ENTITY_ABSTRACTS_FILE] + ENTITY_TYPES_FILES:
        if os.path.isfile(os.sep.join([dbpedia_path, fname])):
            PLOGGER.info("  - {}: OK".format(fname))
        else:
            PLOGGER.error("  - {}: Missing".format(fname))
            exit(1)

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
Code example #20
    def build_collection(self):
        """Adds all name variants from DBpedia."""
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        # iterate through all DBpedia entities
        i = 0
        for mdoc in self.__mongo_dbpedia.find_all():
            entity = EntityUtils(Mongo.unescape_doc(mdoc))

            # skips entities without names
            if not entity.has_name():
                continue

            surface_form = entity.get_name()

            # the entity is a redirect page
            if entity.is_redirect():
                entity_id = entity.get_predicate(
                    EntityUtils.PREDICATE_REDIRECT)[0]
                self.__add_surface_form(surface_form,
                                        EntityUtils.PREDICATE_REDIRECT,
                                        entity_id)

            # the entity is a disambiguation page
            if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
                entity_ids = entity.get_predicate(
                    EntityUtils.PREDICATE_DISAMBIGUATE)
                for entity_id in entity_ids:
                    self.__add_surface_form(surface_form,
                                            EntityUtils.PREDICATE_DISAMBIGUATE,
                                            entity_id)

            # the entity is not a redirect/disambiguation page and has a name and abstract
            if entity.is_entity():
                entity_id = entity.get_id()
                # adds entity name
                self.__add_surface_form(surface_form,
                                        EntityUtils.PREDICATE_NAME, entity_id)
                # adds other entity names
                foaf_name_predicate = "<foaf:name>"
                if entity.has_predicate(foaf_name_predicate):
                    for surface_form in entity.get_predicate(
                            foaf_name_predicate):
                        self.__add_surface_form(surface_form,
                                                foaf_name_predicate, entity_id)
            i += 1
            if i % 1000 == 0:
                PLOGGER.info(str(i // 1000) + "K entities processed")
Code example #21
File: instances.py Project: zxlzr/nordlys
    def to_json(self, json_file=None):
        """ Converts all instances to JSON and writes it to the file

        :param json_file: (string)
        :return: JSON dump of all instances.
        """
        inss_json = {}
        for ins in self.get_all():
            inss_json.update(ins.to_json())
        if json_file is not None:
            # print "Writing JSON format of instances ..."
            out = open(json_file, "w")
            json.dump(inss_json, out, indent=4, sort_keys=True)
            PLOGGER.info("JSON output:\t" + json_file)
        return inss_json
Code example #22
File: indexer_dbpedia_uri.py Project: zxlzr/nordlys
def main(args):
    config = FileUtils.load_config(args.config)
    if "_uri" not in config["index_name"]:
        PLOGGER.error("index name might not be correct, please check again!")
        exit(0)

    if "fields_file" not in config:
        fields_count = compute_field_counts()
    else:
        fields_count = json.load(open(config["fields_file"]))

    indexer = IndexerDBpediaURI(config, fields_count)

    indexer.build()
    PLOGGER.info("Index build: " + config["index_name"])
Code example #23
File: instances.py Project: zxlzr/nordlys
    def from_json(cls, json_file):
        """Loads instances from a JSON file.

        :param json_file: (string)
        :return: Instances object
        """
        PLOGGER.info("Reading JSON file " + json_file + " ...")
        json_data = open(json_file)
        data = json.load(json_data)
        instance_list = []
        # read instances
        for ins_id, fields in data.items():
            instance = Instance.from_json(ins_id, fields)
            instance_list.append(instance)
        return cls(instance_list)
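A round-trip sketch combining this with to_json from Code example #21; the file names are illustrative.

# Hedged sketch; file names are hypothetical.
inss = Instances.from_json("instances.json")
inss_json = inss.to_json("instances_copy.json")  # writes the file and returns the JSON dict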
Code example #24
File: el_utils.py Project: zxlzr/nordlys
def to_elq_eval(annotations, output_file):
    """Write entity annotations to ELQ evaluation format.

    :param annotations: {qid: {"results": [{"mention": xx, "entity": yy, "score": zz}, ..]}, ..}
    :param output_file: output file name
    """
    uniq_annots = set()
    out_str = ""
    for qid, q_annots in sorted(annotations.items()):
        for annot in q_annots["results"]:
            if (qid, annot["entity"]) not in uniq_annots:
                out_str += qid + "\t" + str(
                    annot["score"]) + "\t" + annot["entity"] + "\n"
                uniq_annots.add((qid, annot["entity"]))
    open(output_file, "w").write(out_str)
    PLOGGER.info("ELQ evaluation file: " + output_file)
Code example #25
File: retrieval.py Project: zxlzr/nordlys
    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        queries = json.load(open(self.__query_file))

        # init output file
        out = open(self.__output_file, "w")

        # retrieves documents
        for query_id in sorted(queries):
            PLOGGER.info("scoring [" + query_id + "] " + queries[query_id])
            results = self.retrieve(queries[query_id])
            out.write(self.trec_format(results, query_id, self.__num_docs))
        out.close()
        PLOGGER.info("Output file:" + self.__output_file)
Code example #26
File: trec_qrels.py Project: zxlzr/nordlys
def main(args):
    qrels = TrecQrels(args.qrels_file)

    if args.operation == CHOICE_STAT:
        qrels.print_stat()
    elif args.operation == CHOICE_FILTER_DOCS:
        if len(args.doc_ids_file) == 0 or len(args.output_file) == 0:
            PLOGGER.info("doc_ids_file or output_file missing")
        else:
            qrels.filter_by_doc_ids(args.doc_ids_file, args.output_file)
    elif args.operation == CHOICE_FILTER_QS:
        if len(args.query_ids_file) == 0 or len(args.output_file) == 0:
            PLOGGER.info("query_ids_file or output_file missing")
        else:
            qrels.filter_by_query_ids(args.query_ids_file, args.output_file)
Code example #27
File: create_sample.py Project: zxlzr/nordlys
    def __sample_dir(self, dir, ext):
        """Creates a local from a specific directory.

        :param dir: directory (relative to path_to_dbpedia)
        :param ext: file extensions considered
        """
        PLOGGER.info("Processing directory " + dir + " ...")
        # make sure the dir exists under the output directory
        outdir = os.path.join(self.output_dir, dir)
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        # make a local sample of each file from that directory with the given extension
        for root, dirs, files in os.walk(os.path.join(self.path_to_dbpedia, dir)):
            PLOGGER.info(root)
            for file in files:
                if file.endswith(ext):
                    self.__sample_file(dir, file)
Code example #28
File: instances.py Project: zxlzr/nordlys
def main(args):
    inss = Instances()
    # we assume that the 1st column is always the ins_id (unique)
    # the list specifies which property or feature the column value should be loaded to; columns with None are ignored
    # one file with properties
    inss.add_properties_from_tsv(args[0], ["sequence"])
    # one or more files with features
    inss.add_features_from_tsv(args[1], [
        "sentence_length", "article_length", "sentence_order",
        "predicate_tense"
    ])
    # inss.add_features_from_tsv(feat_file_2, ["feature4"])
    # inss.add_features_from_tsv(feat_file_3, ["feature5", "feature6"])
    # one with target value
    inss.add_target_from_tsv(args[2])
    PLOGGER.info(inss.to_str())
    inss.to_json("data/maff.json")
Code example #29
File: mongo.py Project: zxlzr/nordlys
    def print_doc(doc):
        PLOGGER.info("_id: " + doc[Mongo.ID_FIELD])
        for key, value in doc.items():
            if key == Mongo.ID_FIELD:
                continue  # ignore the id key
            if type(value) is list:
                PLOGGER.info(key + ":")
                for v in value:
                    PLOGGER.info("\t" + str(v))
            else:
                PLOGGER.info(key + ": " + str(value))
Code example #30
File: el.py Project: theVoogie/nordlys
    def link(self, query, qid=""):
        """Performs entity linking for the query.

        :param query: query string
        :param qid: query ID (optional)
        :return: annotated query
        """
        PLOGGER.info("Linking query " + qid + " [" + query + "] ")
        q = Query(query, qid)
        linker = self.__get_linker(q)
        if self.__config["step"] == "ranking":
            res = linker.rank_ens()
        else:
            linked_ens = linker.link()
            res = {
                "query": q.raw_query,
                "processed_query": q.query,
                "results": linked_ens
            }
        return res