Example #1
def get_summary(topic_path,
                summary_size=100,
                oracle="accept_reject",
                summarizer="sume",
                parser=None,
                language="english",
                rouge_dir="rouge/RELEASE-1.5.5/"):

    # relativize the topic path
    if topic_path.startswith("/"):
        relative_path = re.search('^(/)(.*)$', topic_path).group(2)
    else:
        relative_path = topic_path

    resolved_topic_path = path.normpath(
        path.join(path.expanduser("~"), ".ukpsummarizer",
                  path.normpath(relative_path)))
    topic = Topic(resolved_topic_path)
    docs = topic.get_docs()
    models = topic.get_models()

    if summarizer == "sume":
        sw = SumeWrap(language)
        summary = sw(docs, summary_size)
        return summary
    elif summarizer == "custom_weights":
        # incomplete branch: the SumeWrap instance is created but never used,
        # so execution falls through to the error message below
        sw = SumeWrap(language)
    return "no summary for summarizer type %s" % summarizer
Example #2
class Summary(object):
    def __init__(self, summary_file):
        p, f = path.split(summary_file)
        self.topic = Topic(path.normpath(path.join(p, "..")))

        self.idx = None

        # find the index of the model summary whose file path matches summary_file
        for i, (fn, t) in enumerate(self.topic.get_models()):
            if fn.startswith(summary_file):
                self.idx = i

        print(f, self.idx)

    def get_index(self):
        return self.idx

    def get_topic(self):
        return self.topic
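A hedged usage sketch for the Summary class above; the model file path is hypothetical and only needs to sit one directory below a topic directory so that path.join(p, "..") in __init__ resolves to the topic.

# Hypothetical path: <topic>/models/<model file>.
s = Summary("datasets/DUC2004/d30001t/models/D30001.M.100.T.A")
print(s.get_index())   # index of the matching model summary, or None
print(s.get_topic())   # the enclosing Topic instance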
Example #3
    def run(self,
            topic_path,
            size=None,
            summarizer="SUME",
            summary_idx=None,
            parser=None,
            oracle="accept",
            feedback_log=None,
            propagation=False,
            max_iteration_count=10,
            preload_embeddings=None,
            feedbackstore=None,
            override_results_files=False,
            num_clusters=8):
        log = logging.getLogger("SingleTopicRunner")

        sf = None  # just for the sake of being able to run without simulated feedback...
        self.tlog.debug("SingleTopicRunner started")
        # relativize the topic path!
        if isinstance(topic_path, Topic):
            topic = topic_path
        else:
            if topic_path.startswith("/"):
                relative_path = re.search('^(/)(.*)$', topic_path).group(2)
            else:
                relative_path = topic_path

            topic = Topic(
                path.join(self.iobasedir, path.normpath(relative_path)))
        language = topic.get_language()
        docs = topic.get_docs()
        summaries = topic.get_models()

        flightrecorder = get_flightrecorder_from_file(feedback_log)
        # number of iterations already covered by the provided feedback_log
        preceding_size = len(flightrecorder.records)

        embeddings = None
        """
        if preload_embeddings:
            embeddings_path = path.normpath(path.join(self.iobasedir, "embeddings"))
            embeddings = load_w2v_embeddings(embeddings_path, language, 'active_learning')
        else:
            embeddings = preload_embeddings
        """

        if summary_idx is not None:
            summaries = [summaries[summary_idx]]

        if size is None:
            use_size = topic.get_summary_size()
        else:
            use_size = size

        clusters_path = path.join(self.iobasedir, 'clustering',
                                  '{}'.format(num_clusters))
        #print(clusters_path)
        #clusters = get_clusters(clusters_path, topic.docs_dir)

        if summarizer == "SUME":
            sw = SumeWrap(language)
            summary = sw(docs, use_size)
            outputfilecontents = {
                "summary": summary,
                "type": summarizer,
                "info_data": []
            }

            json_content = json.dumps(outputfilecontents)
            if self.out is not None:
                log.info("writing output to %s" % (self.out))
                write_to_file(json_content, self.out)
            write_to_file(
                json_content,
                path.normpath(
                    path.expanduser(
                        path.join(self.iobasedir, "tmp", "tmp.json"))))
        elif summarizer == "UPPER_BOUND":
            ub_summary = load_ub_summary(language,
                                         docs,
                                         summaries,
                                         use_size,
                                         base_dir=self.iobasedir)
            summary = '\n'.join(ub_summary)

            outputfilecontents = {
                "summary": summary,
                "type": summarizer,
                "info_data": []
            }

            json_content = json.dumps(outputfilecontents)
            if self.out is not None:
                log.info("writing output to %s" % (self.out))
                write_to_file(json_content, self.out)
            write_to_file(
                json_content,
                path.normpath(
                    path.expanduser(
                        path.join(self.iobasedir, "tmp", "tmp.json"))))
        elif summarizer == "PROPAGATION":
            #UB considering all the summaries
            ub_summary = load_ub_summary(language,
                                         docs,
                                         summaries,
                                         use_size,
                                         base_dir=self.iobasedir)
            summary = '\n'.join(ub_summary)
            ub_scores = self.rouge(summary, summaries, use_size)

            log.debug(
                "UB scores: R1:%s R2:%s SU4:%s" %
                (str(ub_scores[0]), str(ub_scores[1]), str(ub_scores[2])))

            ref_summ = random.choice(summaries)

            parse_info = []
            #parse_info = topic.get_parse_info(summaries.index(ref_summ))

            # initialize the Algorithm.
            run_config = dict()
            run_config['rank_subset'] = True
            run_config['relative_k'] = True
            run_config['dynamic_k'] = False
            for flag in ['adaptive_sampling', 'strategy']:
                run_config[flag] = False

            r = 0
            clusters = None
            log.info("recording k_size in summarize %f", self.k)
            # NOTE: all model summaries are passed, not just a single one
            sf = SimulatedFeedback(
                language,
                self.rouge,
                embeddings=None,  #TODO: embeddings
                docs=docs,
                models=summaries,
                summary_length=use_size,
                oracle_type=oracle,
                ub_score=ub_scores,
                ub_summary=ub_summary,
                parser_type=parser,
                flightrecorder=flightrecorder,
                feedbackstore=feedbackstore,
                parse_info=parse_info,
                run_config=run_config,
                k=self.k,
                adaptive_window_size=r,
                clusters=clusters)

            if sf.embeddings is None or sf.embeddings == {}:
                embe_var = "none"
            elif sf.embeddings.embedding_variant is None:
                embe_var = "none"
            else:
                embe_var = sf.embeddings.embedding_variant
            if feedbackstore is None:
                cfg = {"type": "Unconfigured default"}
            else:
                cfg = feedbackstore.get_config()

            rs = []
            for p, t in [ref_summ]:
                rs.append({"name": os.path.split(p)[1], "text": t})

            run_id_string = "%s-%s-%s-%s-%s-%s-%s-%s" % (
                oracle, summarizer, parser, embe_var, topic.get_dataset(),
                topic.get_name(), [item["name"]
                                   for item in rs], json.dumps(cfg))

            run_id = hashlib.sha224(run_id_string.encode("utf-8")).hexdigest()
            filename = path.join(self.scores_storage_path,
                                 "result-%s.json" % (run_id))

            if (os.path.exists(filename) and self.out is None
                    and self.override_results_switch is False):
                log.info(
                    "Skipping run_id '%s' because the result file already exists. config: %s"
                    % (run_id, run_id_string))
                return
            else:
                log.info("Doing %s iterations for run_id '%s'\n %s" %
                         (max_iteration_count, run_id, run_id_string))
                write_to_file("", filename)

            summary, confirmatory_summary, exploratory_summary = sf.run_full_simulation(
                max_iteration_count=max_iteration_count)

            recommendations, recom_sentences = sf.get_recommendations()

            derived_records = []
            # construct table-like array of feedbacks per iteration.
            for i, record in enumerate(sf.flight_recorder.records):
                for accept in record.accept:
                    derived_records.append({
                        "iteration": i,
                        "concept": accept,
                        "value": "accept"
                    })
                for reject in record.reject:
                    derived_records.append({
                        "iteration": i,
                        "concept": reject,
                        "value": "reject"
                    })
                for implicit_reject in record.implicit_reject:
                    derived_records.append({
                        "iteration": i,
                        "concept": implicit_reject,
                        "value": "implicit_reject"
                    })

            for item in recommendations:
                derived_records.append({
                    "iteration": -1,
                    "concept": item,
                    "value": "recommendation",
                    "weight": sf.summarizer.weights.get(item, 0.0),
                    "uncertainity": sf.svm_uncertainity.get(item, -1.0)
                })

            result = {
                "config_run_id": run_id,
                "config_oracle_type": oracle,
                "config_summarizer_type": summarizer,
                "config_parse_type": str(parser),
                #"config_wordembeddings": emb_var,
                "config_feedbackstore": sf.feedbackstore.get_config(),
                "config_feedback_interpretation": {},
                "config_concept_recommendation": {},
                "dataset": topic.get_dataset(),
                "topic": topic.get_name(),
                "models": rs,
                "model_rougescores": {
                    "iteration": -1,
                    "ROUGE-1 R score": ub_scores[0],
                    "ROUGE-2 R score": ub_scores[1],
                    "ROUGE-SU* R score": ub_scores[2],
                    "accepted": [],
                    "accept_count": 0,
                    "rejected": [],
                    "reject_count": 0,
                    "summary": ub_summary
                },
                "result_summary": summary,
                "result_rougescores": sf.log_sir_info_data,
                "log_feedbacks": derived_records
            }

            r2 = [{
                "iteration": i,
                "summary": sf.log_info_data[i]
            } for i in range(len(sf.flight_recorder.records))]
            log.debug(
                "records: %s, infos %s, diff: %s" %
                (len(sf.flight_recorder.records), len(sf.log_info_data),
                 len(sf.flight_recorder.records) - len(sf.log_info_data)))

            write_to_file(json.dumps(result), filename)
            log.info("Writing results to %s" % (filename))

            df = pd.DataFrame(derived_records)
            filename = path.join(self.scores_storage_path,
                                 "flightrecorder-%s.csv" % (run_id))
            log.info("saving flightrecorder to %s with run_id %s" %
                     (filename, run_id))
            df.to_csv(filename, encoding="UTF-8")

            weights_history_json = path.join(
                self.scores_storage_path,
                "weightshistory-%s-%s-%s-%s.json" %
                (topic.get_dataset(), topic.get_name(), summarizer, run_id))
            log.info("Writing weights history to %s" % (weights_history_json))
            write_to_file(json.dumps(sf.new_debug_weights_history),
                          weights_history_json)

            weights_hist = pd.DataFrame(sf.new_debug_weights_history)
            filename = path.join(self.scores_storage_path,
                                 "weightshistory-%s.csv" % (run_id))
            weights_hist.to_csv(filename, encoding="UTF-8")

            log.debug("----------------------------------------------")
            log.debug(summary)
            log.debug(sf.log_info_data[-1])
            log.debug("----------------------------------------------")
            if self.pickle_store is not None:
                # Pickle dictionary using protocol 0.
                print('Pickle in file %s' % self.pickle_store)
                self.pickle_write(sf, self.pickle_store, log)

            json_content = self.write_summarize_output_json(
                sf, confirmatory_summary, derived_records, log,
                recom_sentences, result, run_id, summarizer, summary,
                self.pickle_store)
            # write_to_file(json_content, path.normpath(path.expanduser(path.join(self.iobasedir, "tmp", "tmp.json"))))
        else:
            raise BaseException("You should tell which summarizer to use")

        if sf is not None:
            write_details_file([sf.log_info_data],
                               path.join(self.iobasedir, "tmp", "tmp.csv"))
        self.tlog.debug("SingleTopicRunner finished")
Example #4
            d = DataSet(f)
            # unroll to get topics
            for t in d.get_topics():
                for (mf, mt) in t.get_models():
                    mf = path.normpath(mf)
                    pref = path.commonprefix([mf, iobasedir])
                    tn = mf[len(pref) + 1:]
                    print("shortened:", tn)
                    queue.append(mf)

                    # topics.append([t.get_name for t in d.get_topics()])

        elif path.exists(path.join(f, "task.json")):
            # is topic
            t = Topic(f)
            for (mf, mt) in t.get_models():
                mf = path.normpath(mf)
                pref = path.commonprefix([mf, iobasedir])
                tn = mf[len(pref) + 1:]
                print("shortened:", tn)
                queue.append(mf)
        elif path.exists(path.join(f, "..", "..", "task.json")) \
                and path.exists(f):
            # should be model
            queue.append(f)
        else:
            raise BaseException("Invalid file given.", f, " is neither a dataset nor a topic nor a model.")

        if args.max_models:
            queue = queue[:args.max_models]
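The branching above classifies an input path by where a task.json marker sits; below is a condensed, hypothetical helper that mirrors only the topic and model checks visible in the excerpt (the dataset branch is not shown and is therefore omitted).

from os import path

def classify_input(f):
    # Hypothetical helper; mirrors only the checks visible above.
    if path.exists(path.join(f, "task.json")):
        return "topic"   # a topic directory carries its own task.json
    if path.exists(f) and path.exists(path.join(f, "..", "..", "task.json")):
        return "model"   # a model summary sits two levels below a task.json
    raise ValueError("%s is neither a dataset nor a topic nor a model." % f)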