import numpy as np
import spacy


def divide(sent_path, score_path):
    sents = read_file(sent_path, preprocess=lambda x: x.split("\002")[0])
    scores = read_file(score_path, preprocess=lambda x: x.split("\t"))
    scores = [[float(ele) for ele in score_list] for score_list in scores]
    score_array = np.array(scores)
    en_model = spacy.load("en_core_web_sm")
    index = []
    reverse_index = []
    for idx, ele in enumerate(sents):
        res = en_model(ele)
        for token in res:
            # a sentence whose complement clause precedes its head verb goes
            # into the "reverse" group; everything else goes into the default group
            if token.dep_ == "ccomp" and token.idx < token.head.idx:
                reverse_index.append(idx)
                break
        else:
            index.append(idx)

    first = score_array[index]
    res_first = first[:, 0] > first[:, 2]

    second = score_array[reverse_index]
    res_second = second[:, 0] > second[:, 2]

    print(first.mean(axis=0))
    print(res_first.sum() / len(res_first), len(res_first))

    print(second.mean(axis=0))
    print(res_second.sum() / len(res_second), len(res_second))
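All of the examples below call a shared read_file helper that is not included in this listing. The following is only a minimal sketch inferred from the call sites (a path plus an optional per-line preprocess callable); the repository's actual implementation may differ.

from typing import Callable, Iterator, Optional


def read_file(path: str,
              preprocess: Optional[Callable[[str], object]] = None) -> Iterator:
    # Assumed behaviour: yield every line of the file, transformed by
    # `preprocess` when one is given. Not the repository's real helper.
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            yield preprocess(line) if preprocess is not None else line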
Example #2
def generate_typos():
    a = read_file("/Users/zxj/Google 云端硬盘/similar_typos.txt",
                  preprocess=lambda x: x.strip().split("\t"))
    typo_dict = {arr[0]: arr[1].split(" ") for arr in a}
    num = 3
    sents = read_file("/Users/zxj/Dropbox/data/similar_structure.txt",
                      preprocess=lambda x: x.strip().split("\t"))
    for arr in sents:
        words = arr[0].split(" ")
        first = [1 if word in typo_dict else 0 for word in words]
        if np.sum(first) >= num:
            typo_map = random_typo(words, typo_dict, 1)
            for key, value in typo_map.items():
                new_sent = re.sub(key, value, arr[0])
                arr.append(new_sent)
                print("\t".join(arr))
Example #3
def generate_verb_phases():
    language_model = spacy.load('en_core_web_sm')
    sents = read_file(
        "test2.txt", preprocess=lambda x: extrac_verb_phase(language_model, x))
    sents = (ele for ele in sents if len(ele.split(" ")) > 1)
    for ele in sents:
        print(ele.strip())
Example #4
def filter_noun_adj(snli_path, output_path):
    snli_list = read_file(snli_path,
                          preprocess=lambda x: x.strip().split("\t"))
    snli_list = filter(
        lambda x: not x[1].strip() in {"little", "small"} or not x[2].strip()
        in {"boy", "girl", "child", "children", "baby"}, snli_list)
    new_snli_list = filter(lambda x: not x[1] in {"green", "other", "musical"},
                           snli_list)
    new_quad_list = []
    antonym_dict = {
        "same": "different",
        "different": "same",
        "long": "short",
        "short": "long",
        "older": "young",
        "younger": "old"
    }
    for quad in new_snli_list:
        # substitute the antonym column when the adjective has a listed antonym
        if quad[1] in antonym_dict:
            quad[3] = antonym_dict[quad[1]]

        new_quad_list.append(quad)

    output_list_to_file(new_quad_list,
                        output_path,
                        process=lambda x: "\t".join(x))
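output_list_to_file is likewise an external helper; a plausible sketch consistent with how it is called here (one processed item per output line), not necessarily the real implementation:

def output_list_to_file(items, output_path, process=lambda x: x):
    # Assumed helper: write one processed item per line of `output_path`.
    with open(output_path, "w", encoding="utf-8") as handle:
        for item in items:
            handle.write(process(item) + "\n")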
Example #5
def generate_negative_samples():
    file_path = "/Users/zxj/PycharmProjects/sentence_evaluation/dataset/negative_no_unique.txt"
    file_generator = read_file(file_path, preprocess=lambda x: x.split("\t"))
    file_generator = ([tup[0], tup[1].strip(),
                       negate_verb(tup[0])] for tup in file_generator)
    for ele in file_generator:
        print("\001".join(ele).strip())
def test():
    file_path = "/Users/zxj/Google 云端硬盘/experiment-results/SICK/contradiction-sentences.txt"
    a = read_file(file_path, preprocess=lambda x: x.split("\t")[:2])
    a = filter(lambda x: "n't" in x[0] or "n't" in x[1], a)
    for ele in a:
        result = ele[1] + "\001" + ele[0] + "\001" + ele[1] if "n't" in ele[0] \
            else ele[0] + "\001" + ele[1] + "\001" + ele[0]
        print(result)
def calculate_score(file_path):
    scores = read_file(file_path, preprocess=lambda x: x.split("\t"))
    scores = [[float(ele) for ele in score_list] for score_list in scores]
    score_array = np.array(scores)
    res1 = score_array[:, 0] < score_array[:, 1]
    res2 = score_array[:, 2] < score_array[:, 1]
    res3 = np.logical_and(res1, res2)
    print(np.sum(res3) / len(res3))
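calculate_score prints the fraction of rows whose middle score is strictly larger than the two scores around it. A small self-contained check of the same computation on made-up numbers:

import numpy as np

rows = np.array([[0.1, 0.9, 0.3],   # middle score is the largest -> counts
                 [0.5, 0.4, 0.2]])  # middle score is not the largest
hits = np.logical_and(rows[:, 0] < rows[:, 1], rows[:, 2] < rows[:, 1])
print(hits.sum() / len(hits))  # prints 0.5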
Example #8
def filter_mrpc(file_path):
    msrp_iter = read_file(file_path,
                          preprocess=lambda x: x.strip().split("\t"))
    # keep only well-formed rows labelled as paraphrases (quality column == "1")
    msrp_iter = filter(lambda x: len(x) == 5 and x[0] == "1", msrp_iter)
    msrp_iter = map(lambda x: (x[3], x[4]), msrp_iter)
    for first, second in msrp_iter:
        # emit the sentence that contains a clause first
        if is_sentence_with_clause(first, nlp):
            yield first + "\t" + second
        elif is_sentence_with_clause(second, nlp):
            yield second + "\t" + first
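filter_mrpc is a generator and reads the spaCy pipeline from the module-level name nlp (bound in the __main__ block at the end of this listing). A hypothetical driver, assuming it runs in the same module; the input file name is only a placeholder:

import spacy

nlp = spacy.load("en_core_web_sm")  # must be bound before the generator is consumed
for pair in filter_mrpc("msr_paraphrase_train.txt"):  # placeholder MRPC-style TSV
    print(pair)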
Example #9
def generate_random(file_path, get_reordered):
    """Append a reordered copy of the first sentence to each line and print the result.

    :param file_path: path of a tab-separated input file
    :param get_reordered: callable that maps a list of words to a reordered sentence string
    """
    sent_tuple = read_file(file_path, lambda x: x.strip().split("\t"))
    for arr in sent_tuple:
        first_sent = arr[0]
        first_arr = first_sent.split(" ")
        reversed_first = get_reordered(first_arr)
        arr.append(reversed_first)
        print("\t".join(arr).strip())
Example #10
def extrac_caluse_from(language_model):
    path = "/Users/zxj/Downloads/dataset-sts/data/para/msr/msr-para-train.tsv"
    file = read_file(path, preprocess=lambda x: x.split("\t"))
    file = [(ele[3], ele[4].strip()) for ele in file if ele[0] == "1"]
    say_regex = re.compile("said|says?|saying|tell|told|thinks")
    for a, b in file:
        if say_regex.search(a):
            clause = extract_clause(a, language_model)
            if clause:
                print(a + "\t" + clause)
        elif say_regex.search(b):
            clause = extract_clause(b, language_model)
            if clause:
                print(b + "\t" + clause)
Example #11
def calculate_word_frequency():
    file_path = "/Users/zxj/Desktop/snli_1.0/possible_contradiction"
    sent_tuple = read_file(file_path, lambda x: x.split("\t"))
    results = {}
    for arr in sent_tuple:
        arr_tuple = zip(arr[0].split(" "), arr[1].split(" "))
        # keep only the word positions where the two sentences differ
        arr_tuple = [ele for ele in arr_tuple if ele[0] != ele[1]]
        if not arr_tuple:
            continue
        arr_str = (arr_tuple[0][0] + "\t" + arr_tuple[0][1]).strip().lower()
        if arr_str not in results:
            results[arr_str] = 1
        else:
            results[arr_str] += 1

    for key, value in results.items():
        if value == 1:
            print(key, value)
Example #12
def factual_test():
    sents = read_file(
        "/Users/zxj/Google 云端硬盘/experiment-results/Clause Relatedness/clause_relatededness_samples.txt",
        preprocess=lambda x: x.strip().split("\002")[:-1])
    replace_dict = {
        "say": "deny",
        "says": "denies",
        "said": "denied",
        "think": "doubt",
        "thinks": "doubts",
        "thought": "doubted"
    }

    for arr in sents:
        for key, value in replace_dict.items():
            # match the verb as a whole word so that "say" does not also hit "says"
            pattern = re.compile(r" {0}\b".format(key))
            if not pattern.search(arr[0]):
                continue

            new_sent = pattern.sub(" {0}".format(value), arr[0])
            arr.append(new_sent)
            print("\t".join(arr))
def calculate_mean(file_path):
    scores = read_file(file_path, preprocess=lambda x: x.split("\t"))
    scores = [[float(ele) for ele in score_list] for score_list in scores]
    mean_scores = np.mean(np.array(scores), axis=0)
    print(mean_scores)
Example #14
def load_sick2(sick_path="/Users/zxj/Downloads/SICK/SICK.txt"):
    file_list = read_file(sick_path)
    # skip the header row and keep columns 1-4 of each tab-separated line
    file_list = (ele.split("\t")[1:5] for ele in file_list
                 if not ele.startswith("pair_ID"))
    return file_list
Example #15
def adjectives_precede_nouns(sentence, parser):
    # NOTE: the original snippet starts mid-function; this name and signature
    # are assumptions inferred from the body below.
    for ele in parser(sentence):
        if ele.pos_ == "ADJ" and ele.dep_ == "amod" and ele.head.pos_ == "NOUN" and ele.idx > ele.head.idx:
            return False

    return True


def fix_bugs_in_sentence(sent, parser):
    for token in parser(sent):
        if token.dep_ == "relcl" and token.text == "is":
            if token.head.tag_ in {"NNS", "NNPS"}:
                return sent[:token.idx] + "are" + sent[token.idx +
                                                       len(token.text):]
    return sent
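A hypothetical way to exercise fix_bugs_in_sentence (assumes en_core_web_sm is installed; whether the correction fires depends on the parse spaCy produces for the sentence):

import spacy

nlp_model = spacy.load("en_core_web_sm")
print(fix_bugs_in_sentence("There are two men that is sitting on the bench.", nlp_model))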


if __name__ == '__main__':
    nlp = spacy.load('en_core_web_sm')
    msrp_dir = "/home/zxj/Downloads/new_corpus"
    file_path = os.path.join(msrp_dir,
                             "filtered_opinion_negation_triplets.txt")
    # skip sentences that already contain a negation; the corpus appears to be
    # tokenised so that "don't" becomes "do n't"
    negation_pattern = re.compile(r" 't")
    msrp_iter = read_file(file_path,
                          preprocess=lambda x: x.strip().split("\t"))
    for ele in msrp_iter:
        first = ele[0]
        if negation_pattern.search(first):
            continue
        second = negate_word_msr(first, nlp)
        print(first + "\t" + ele[1] + "\t" + second)