Beispiel #1
0
    def get_similarity(text_1, text_2):
        """Score the agreement between two span-annotated XML texts.

        Both inputs mark segment boundaries with <span> tags, e.g.
        BOSTON, MA ... <span class="highlighted" id="634541">Steven L.
        Davis pled guilty ...</span>.

        Returns a (segmentation_similarity, 1 - Pk, 1 - WindowDiff) tuple,
        or an error string when an input is empty or the underlying source
        texts differ.
        """
        # Guard: refuse empty inputs before any parsing work.
        if text_1 == '' or text_2 == '':
            return 'Error Text Input Is Empty'

        # Parse both documents and strip the markup down to the span layout.
        soup_a = remove_html_tags(BeautifulSoup(text_1))
        soup_b = remove_html_tags(BeautifulSoup(text_2))

        # Extract the per-position segment vectors from each document.
        positions_a = get_segements(soup_a)
        positions_b = get_segements(soup_b)

        # Both annotations must cover the same underlying source text.
        if not check_segment_length(positions_a, positions_b):
            return 'Error Source Text Was Different'

        # segeval works on mass vectors, not position vectors.
        masses_a = segeval.convert_positions_to_masses(positions_a)
        masses_b = segeval.convert_positions_to_masses(positions_b)

        # Pk and WindowDiff are penalties; report them as similarities.
        similarity = float(segeval.segmentation_similarity(masses_a, masses_b))
        pk_score = 1 - float(segeval.pk(masses_a, masses_b))
        window_score = 1 - float(segeval.window_diff(masses_a, masses_b))

        return similarity, pk_score, window_score
Beispiel #2
0
def __initialization(reference, hypothesis):
    """Validate and convert two position sequences for SegEval.

    Both sequences must have the same length; on a mismatch an error is
    printed and the program exits (SystemExit).

    Returns the (reference, hypothesis) pair converted to boundary-mass
    vectors, ready to feed into SegEval metrics.
    """
    # Sequences of different lengths cannot describe the same text.
    if len(reference) != len(hypothesis):
        print(
            "Error! The length of hypothesis doesn't match the length of reference!"
        )
        raise SystemExit
    # Convert token-position vectors into SegEval's mass representation.
    reference_boundary = segeval.convert_positions_to_masses(reference)
    hypothesis_boundary = segeval.convert_positions_to_masses(hypothesis)
    return reference_boundary, hypothesis_boundary
def seg_eval(algo_group_vec, real_group_vec, rev=True):
    """
    A function computing the Pk and win_diff value for 2 segmentations. Also give random baselines
    :param algo_group_vec: The algorithm result in the form a token group memberships
    :type algo_group_vec: Union[list, numpy.ndarray]
    :param real_group_vec: The real group memberships of tokens
    :type real_group_vec: Union[list, numpy.ndarray]
    :param rev: if True, pass the reference segmentation as the first
        argument to pk/window_diff; if False, pass the hypothesis first
    :type rev: bool
    :return: Pk value, Win_diff value, Pk random value, Win_diff random value
    :rtype: (float, float, float, float)
    """

    # Transform into segmentation (mass) vectors
    real_segm_vec = convert_positions_to_masses(real_group_vec)
    algo_segm_vec = convert_positions_to_masses(algo_group_vec)

    # Make a shuffled group vec as a random baseline
    rdm_group_vec = real_group_vec.copy()
    rdm.shuffle(rdm_group_vec)
    rdm_segm_vec = convert_positions_to_masses(rdm_group_vec)

    def _pk_win_diff(first_vec, second_vec):
        # Compute (pk, window_diff) for one ordered pair; window_diff can
        # fail on degenerate segmentations, in which case the worst score
        # (1) is reported.  Catch Exception, not a bare except, so
        # KeyboardInterrupt/SystemExit still propagate.
        pk_val = pk(first_vec, second_vec)
        try:
            wd_val = window_diff(first_vec, second_vec)
        except Exception:
            wd_val = 1
        return pk_val, wd_val

    # The rev flag only swaps the argument order passed to the metrics.
    if rev:
        pk_res, win_diff = _pk_win_diff(real_segm_vec, algo_segm_vec)
        pk_rdm, win_diff_rdm = _pk_win_diff(real_segm_vec, rdm_segm_vec)
    else:
        pk_res, win_diff = _pk_win_diff(algo_segm_vec, real_segm_vec)
        pk_rdm, win_diff_rdm = _pk_win_diff(rdm_segm_vec, real_segm_vec)

    # Return
    return pk_res, win_diff, pk_rdm, win_diff_rdm
Beispiel #4
0
    def test_convert_positions_to_masses(self):
        '''
        Test convert_positions_to_masses.
        '''

        # assertEquals is a deprecated alias removed in Python 3.12;
        # assertEqual is the supported spelling.
        self.assertEqual(
            (4, 2), convert_positions_to_masses([1, 1, 1, 1, 2, 2]))
Beispiel #5
0
import segeval

# Ground-truth group files to summarise.
ground_truth_file_list = [
    "mix_word1_groups.txt", "mix_word5_groups.txt", "mix_sent1_groups.txt",
    "mix_sent5_groups.txt", "61320_199211_pp_groups.txt",
    "61320_200411_pp_groups.txt", "61320_201211_pp_groups.txt",
    "61320_201611_pp_groups.txt", "61620_200411_pp_groups.txt",
    "61620_200811_pp_groups.txt", "61620_201211_pp_groups.txt",
    "61620_201611_pp_groups.txt"
]

# Report the number of groups and mean segment length per file.
for ground_truth_file in ground_truth_file_list:
    # Derive the project base path from the current working directory
    # (must run the script from a folder inside the "SemSim_AutoCor" folder).
    working_path = os.getcwd()
    base_path = working_path.split("SemSim_AutoCor")[0] + "SemSim_AutoCor/"
    # Path of the raw text file
    ground_truth_path = f"{base_path}corpora/{ground_truth_file}"

    # Load the ground truth: one comma-separated group id per token.
    with open(ground_truth_path) as ground_truth:
        raw_groups = ground_truth.read()
    real_group_vec = np.array([int(item) for item in raw_groups.split(",")])

    # Group ids start at 0, so the group count is max id + 1.
    n_group = max(real_group_vec) + 1
    real_segm_vec = segeval.convert_positions_to_masses(real_group_vec)
    print(
        f"Groupfile {ground_truth_file} has {n_group} groups and mean segment length of {np.mean(real_segm_vec)}"
    )
Beispiel #6
0
import codecs
import sys

from nltk.metrics.segmentation import pk, windowdiff
import segeval as se
import horae as ho


if __name__ == '__main__':

    # Command-line arguments: test-set name, classifier id, reference
    # format type, and segmentation level.
    # NOTE: this script uses sys.argv but the visible snippet never
    # imported sys — the import is added at the top of the file.
    test = sys.argv[1]
    classifier = sys.argv[2]
    type_ = sys.argv[3]
    level = sys.argv[4]

    # Paths to the predicted and reference segmentation files.
    path_pred = "../data/test/seg/" + test + "_" + level + ".pred_" +\
                classifier
    path_ref = "../data/test/choiformat/" + type_ + "/" + test + "_" +\
               level + ".ref"

    # Load both segmentations; the third return value holds the
    # position vectors used below.
    ref, nbref1, refs = ho.load_text(path_ref)
    pred, nbpred1, preds = ho.load_text(path_pred)

    d = {"stargazer": {"1": refs, "2": preds}}

    seg1 = d['stargazer']['1']
    seg2 = d['stargazer']['2']

    # Convert position vectors to masses and report Pk / WindowDiff
    # (hypothesis passed first, reference second).
    segs1 = se.convert_positions_to_masses(seg1)
    segs2 = se.convert_positions_to_masses(seg2)
    print("pk\tWindowdiff: \n")
    print(str(round(se.pk(segs2, segs1), 4)) + "\t" +
          str(round(se.window_diff(segs2, segs1), 4)))
Beispiel #7
0
# Restrict to tokens whose label was not given in advance — presumably so
# evaluation covers only labels the algorithm had to infer (TODO confirm
# against the surrounding script, which is outside this chunk).
rstr_best_real_group_vec = np.delete(best_real_group_vec,
                                     indices_for_known_label)

# Compute nmi score between the real and algorithm group assignments.
nmi = normalized_mutual_info_score(rstr_real_group_vec, rstr_algo_group_vec)
# Compute Map: mean of one-vs-rest average precision over group ids.
# NOTE(review): range starts at 1, so group id 0 (if present) is excluded
# from the MAP computation — confirm that is intended.
ap_vector = [
    average_precision_score(rstr_best_real_group_vec == group_id,
                            rstr_algo_group_vec == group_id)
    for group_id in range(1,
                          max(rstr_real_group_vec) + 1)
]
# NOTE(review): 'map' shadows the builtin map() — consider renaming.
map = np.mean(ap_vector)

# Segmentation evaluation: algorithm vs. real segmentation, plus a
# shuffled-real baseline, scored with Pk and WindowDiff.
real_segm_vec = convert_positions_to_masses(rstr_real_group_vec)
algo_segm_vec = convert_positions_to_masses(rstr_algo_group_vec)
rdm_group_vec = rstr_real_group_vec.copy()
rdm.shuffle(rdm_group_vec)
rdm_segm_vec = convert_positions_to_masses(rdm_group_vec)
pk_res = pk(algo_segm_vec, real_segm_vec)
win_diff = window_diff(algo_segm_vec, real_segm_vec)
pk_rdm = pk(rdm_segm_vec, real_segm_vec)
win_diff_rdm = window_diff(rdm_segm_vec, real_segm_vec)

# Compute the aggregate labels: average the result rows per token type.
df_results = pd.DataFrame(result_matrix)
df_results["Token"] = token_list
type_results = df_results.groupby("Token").mean()
type_list = list(type_results.index)
type_values = type_results.to_numpy()
        # Read the raw sentence lines (the enclosing `with open(...) as
        # txt_f:` header is outside this chunk).
        sent_list = txt_f.readlines()
    # Make the whole text
    text_string = " ".join(sent_list)
    # Split by tokens
    token_list = nltk.word_tokenize(text_string)
    # Vocabulary of text
    vocab_text = set(token_list)

    # Get the groups: one comma-separated group id per token.
    with open(f"{input_text_folder}/{group_file_list[i]}", "r") as grp_f:
        token_group_vec = grp_f.read()
        token_group_vec = np.array(
            [int(element) for element in token_group_vec.split(",")])

    n_groups = len(set(token_group_vec))
    # Token-level segmentation masses for SegEval.
    token_segm_vec = segeval.convert_positions_to_masses(token_group_vec)

    # Make groups by sentences: each sentence receives the majority group
    # of its tokens, walking the token vector with a running offset.
    sent_group_vec = []
    ind_1 = 0
    for sent in sent_list:
        sent_token = nltk.word_tokenize(sent)
        token_group = list(token_group_vec[ind_1:(ind_1 + len(sent_token))])
        # Majority vote. NOTE(review): ties between equally frequent groups
        # are resolved by set iteration order, i.e. arbitrarily.
        sent_group_vec.append(int(max(set(token_group),
                                      key=token_group.count)))
        ind_1 = ind_1 + len(sent_token)
    sent_group_vec = np.array(sent_group_vec)
    # Sentence-level segmentation masses.
    sent_segm_vec = segeval.convert_positions_to_masses(sent_group_vec)

    # Write results (appended; the with-body continues past this chunk)
    with open(output_file, "a") as output: