def summerize(file):
    """Run the full pipeline on *file*: preprocess, extract features, label, train.

    The file is turned into paragraphs by ``preprocess_file``; those are
    refined by ``preprocess_paragraphs``, which also returns a second value
    that is handed to ``create_yes_no_column``. The features produced by
    ``feature_extraction`` get an extra ``'present'`` column, a preview of
    the table and its column index are printed, and the table is finally
    passed to ``train_and_plot_results``.

    Nothing is returned.
    """
    raw_paragraphs = preprocess_file(file)
    cleaned_paragraphs, sentence_lookup = preprocess_paragraphs(raw_paragraphs)
    features = feature_extraction(cleaned_paragraphs)

    # NOTE(review): `outputfile` is not defined in this function; presumably
    # a module-level name -- confirm it exists before calling.
    features['present'] = create_yes_no_column(sentence_lookup, outputfile)

    # Quick visual sanity check of the assembled table.
    print(features.head())
    print(features.columns)

    train_and_plot_results(features)
Exemple #2
0
def plot_most_quoted_countries(data, nb_country):
    '''
    Plot a bar chart of the number of occurrences of the most-quoted countries.

    Parameters
        - data       : DataFrame sorted by the number of occurrences; it must
                       provide an 'Occurrences' column, and its index supplies
                       the x-axis labels
        - nb_country : number of leading rows of `data` to plot

    The figure is displayed; nothing is returned.
    '''
    # `sns.plt` was never public seaborn API and is absent from current
    # releases, so pyplot is imported directly instead.
    import matplotlib.pyplot as plt

    top_countries = data.head(nb_country)
    countries_plot = sns.barplot(x=top_countries.index,
                                 y='Occurrences',
                                 data=top_countries,
                                 color='hotpink')
    # Country names are long; rotate them so they do not overlap.
    for label in countries_plot.get_xticklabels():
        label.set_rotation(90)
    countries_plot.set(ylabel='Occurrences')
    countries_plot.set_title('Number of occurrences of ' + str(nb_country) +
                             ' most-quoted countries')
    plt.show()
Exemple #3
0
    def predict(self,
                input_dir,
                output_dir,
                rw_type,
                input_format,
                chunk_len=100,
                test_scores=False,
                output_confidence=False,
                special_model_path=None):
        """
        Tag each file in the input directory (txt or tsv files) and write the results
        to output_dir. Also adds a folder "result_stats" with runtime information to the
        output_dir.

        tsv files must have at least the column 'tok'; files without it are skipped.
        A missing 'sentstart' column is added with the value 'no' for every row.

        If the model file cannot be found, a warning is logged and the method returns
        without processing any file.

        :param input_dir: string value: path to input directory
        :param output_dir: string value: path to output directory
        :param rw_type: string value: direct, indirect, freeIndirect or reported
        :param input_format: string value: "txt" is converted with
            convert_txtfile_to_dateframe; any other value is read as a tab-separated file
        :param chunk_len: passed as ``maxlen`` to create_sentlist_from_file_batchmax
        :param test_scores: if True, scores are calculated per file (for files that
            contain a column named like rw_type) and in total, and written to
            result_stats/<rw_type>_test_scores.tsv
        :param output_confidence: if True, a column <rw_type>_conf is added to the output
        :param special_model_path: optional sub-directory of the "models" folder to use
            instead of rw_type
        :return: None
        """
        # time the prediction
        start_time = datetime.datetime.now().replace(microsecond=0)
        # create a subdir for testing and overview information in the outputdir
        result_subdir = "result_stats"
        os.makedirs(os.path.join(output_dir, result_subdir), exist_ok=True)

        # the model is looked up relative to the directory of this script
        curr_path = os.path.dirname(os.path.abspath(__file__))
        model_dir = rw_type if special_model_path is None else special_model_path
        model_path = os.path.join(curr_path, "models", model_dir,
                                  "final-model.pt")
        if not os.path.exists(model_path):
            self.logger.warning(
                "Predicting {} aborted. Model not found at path '{}'. Please download a model and put it into "
                "the appropriate directory. The model file must be named final-model.pt."
                .format(rw_type, model_path))
            return

        self.logger.info("loading model {}".format(model_path))
        model = SequenceTagger.load(model_path)
        self.logger.info("model loaded")

        pred_col = rw_type + "_pred"
        conf_col = rw_type + "_conf"

        # if test mode, collect score data (initialize in any case)
        score_dict = {"file": [], "f1": [], "precision": [], "recall": []}
        # scored per-file frames; concatenated once after the loop
        scored_frames = []

        for file in os.listdir(input_dir):
            # everything from the first dot onwards is replaced by ".tsv"
            resfile_name = re.sub(r"\..+$", ".tsv", file)
            self.logger.info("predicting {}".format(file))
            # read the file and convert to dataframe
            if input_format == "txt":
                data = self.convert_txtfile_to_dateframe(
                    os.path.join(input_dir, file))
            else:
                data = pd.read_csv(os.path.join(input_dir, file),
                                   sep="\t",
                                   quoting=3,
                                   encoding="utf-8",
                                   na_values=[])

            # check for tok column:
            if "tok" not in data.columns:
                self.logger.warning(
                    "Column 'tok' is missing in file {}. File will be skipped."
                    .format(file))
                continue

            if "sentstart" not in data.columns:
                self.logger.warning(
                    "Column 'sentstart' is missing in file {}. Will be added with default values (all 'no')."
                    .format(file))
                data["sentstart"] = ["no"] * len(data)

            self.logger.debug("TEST: data head:\n {}".format(data.head(10)))
            # create sentlist (based on max chunk length)
            sent_list = self.create_sentlist_from_file_batchmax(
                data, maxlen=chunk_len, compare_column="NaN")

            # predict
            tokens, predictions, confidences = [], [], []
            for sent in sent_list:
                model.predict(sent)
                # serialise the sentence once and reuse it for all three columns
                entities = sent.to_dict(tag_type="cat")["entities"]
                top_labels = [
                    entity["labels"][0].to_dict() for entity in entities
                ]
                tokens.extend(entity["text"] for entity in entities)
                predictions.extend(label["value"] for label in top_labels)
                confidences.extend(
                    label["confidence"] for label in top_labels)
            pred_df = pd.DataFrame({
                "tok": tokens,
                pred_col: predictions,
                conf_col: confidences
            })

            # create output
            # if there is a mismatch in file length after prediction, still save the results
            if len(data) != len(pred_df):
                self.logger.warning(
                    "File length changed when predicting for file {} (before: {}, after: {})\n"
                    "Result file will be saved with prefix 'warn_'; additional columns are lost."
                    .format(file, len(data), len(pred_df)))
                pred_df.to_csv(os.path.join(output_dir,
                                            "warn_" + resfile_name),
                               index=False,
                               sep="\t")
                continue

            # if everything is okay, add the new column(s) to the original data and save
            if output_confidence:
                data[conf_col] = pred_df[conf_col]
            data[pred_col] = pred_df[pred_col]
            data.to_csv(os.path.join(output_dir, resfile_name),
                        index=False,
                        sep="\t",
                        encoding="utf-8")

            # calculate the testscores:
            if test_scores:
                self.logger.info("Calculate scores for {}".format(file))
                if rw_type in data.columns and pred_col in data.columns:
                    data, f1, prec, rec = self.calculate_scores(data, rw_type)
                    score_dict["file"].append(file)
                    score_dict["f1"].append(f1)
                    score_dict["precision"].append(prec)
                    score_dict["recall"].append(rec)
                    scored_frames.append(data)
                else:
                    self.logger.warning(
                        "Skipping test scores for file {}: Missing column {} and/or {}"
                        .format(file, rw_type, pred_col))

        end_time = datetime.datetime.now().replace(microsecond=0)

        # write an overview file when the process is finished
        res_text = "RW Tagger (predict): Model {}\n" \
                   "Predict time:\nstart: {}\nend:{}\ntotal: {}" \
            .format(model_path, start_time, end_time, end_time - start_time)
        # if in test mode, calculate the final scores (for all the data) and save the test score df
        if test_scores:
            self.logger.info("Calculate total scores")
            # DataFrame.append no longer exists in pandas 2.x; concat keeps the
            # original row indices just like append did.
            if scored_frames:
                all_predictions_df = pd.concat(scored_frames)
            else:
                all_predictions_df = pd.DataFrame()
            if len(all_predictions_df) > 0:
                self.logger.debug("all_predictions_len: {}".format(
                    len(all_predictions_df)))
                all_predictions_df, f1, prec, rec = self.calculate_scores(
                    all_predictions_df, rw_type)
                score_dict["file"].append("total")
                score_dict["f1"].append(f1)
                score_dict["precision"].append(prec)
                score_dict["recall"].append(rec)
                score_df = pd.DataFrame(score_dict)
                score_df.to_csv(os.path.join(output_dir, result_subdir,
                                             rw_type + "_test_scores.tsv"),
                                index=False,
                                sep="\t",
                                encoding="utf-8")
                res_text += "\nTotal test scores (for detailed scores see {}_test_scores.tsv):\n" \
                            "f1: {}, precision: {}, recall: {}".format(rw_type, f1, prec, rec)
                self.logger.info(
                    "Total scores for {}: f1: {}, precision: {}, recall: {}"
                    .format(rw_type, f1, prec, rec))
        with open(os.path.join(output_dir, result_subdir,
                               rw_type + "_overview.txt"),
                  "w",
                  encoding="utf-8") as f:
            f.write(res_text)