import os
import re

import numpy as np
import spacy


def divide(sent_path, score_path):
    # Split sentences into two groups depending on whether the parse
    # contains a clausal complement (ccomp) that precedes its head,
    # then compare model scores for the two groups.
    sents = read_file(sent_path, preprocess=lambda x: x.split("\002")[0])
    scores = read_file(score_path, preprocess=lambda x: x.split("\t"))
    scores = [[float(ele) for ele in score_list] for score_list in scores]
    score_array = np.array(scores)
    en_model = spacy.load("en_core_web_sm")
    index = []
    reverse_index = []
    for idx, ele in enumerate(sents):
        res = en_model(ele)
        for token in res:
            if token.dep_ == "ccomp" and token.idx < token.head.idx:
                reverse_index.append(idx)
                break
        else:
            # No reversed clausal complement found in this sentence.
            index.append(idx)
    first = score_array[index]
    res_first = first[:, 0] > first[:, 2]
    second = score_array[reverse_index]
    res_second = second[:, 0] > second[:, 2]
    print(first.mean(axis=0))
    print(res_first.sum() / len(res_first), len(res_first))
    print(second.mean(axis=0))
    print(res_second.sum() / len(res_second), len(res_second))
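
# `read_file` is defined elsewhere in the project and is not shown in this
# file. The sketch below is an assumption inferred purely from the call sites
# in this module: it takes a path plus an optional per-line `preprocess`
# callable and yields one processed item per line.
def read_file(path, preprocess=lambda x: x):
    """Yield `preprocess(line)` for each line of the file at `path`."""
    with open(path, encoding="utf-8") as file_in:
        for line in file_in:
            yield preprocess(line)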
def generate_typos():
    typo_lines = read_file("/Users/zxj/Google 云端硬盘/similar_typos.txt",
                           preprocess=lambda x: x.strip().split("\t"))
    typo_dict = {arr[0]: arr[1].split(" ") for arr in typo_lines}
    num = 3
    sents = read_file("/Users/zxj/Dropbox/data/similar_structure.txt",
                      preprocess=lambda x: x.strip().split("\t"))
    for arr in sents:
        words = arr[0].split(" ")
        # Count how many words in the sentence have known typo variants.
        candidates = [1 if word in typo_dict else 0 for word in words]
        if np.sum(candidates) >= num:
            typo_map = random_typo(words, typo_dict, 1)
            for key, value in typo_map.items():
                # Escape the word so regex metacharacters match literally.
                new_sent = re.sub(re.escape(key), value, arr[0])
                arr.append(new_sent)
            print("\t".join(arr))
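
# `random_typo` is likewise defined elsewhere. A plausible sketch, inferred
# only from the call above: choose `count` words that have typo variants in
# `typo_dict` and map each to one randomly picked variant. The real helper
# may select words differently.
import random


def random_typo(words, typo_dict, count):
    """Map `count` randomly chosen words to a random typo variant."""
    candidates = [word for word in words if word in typo_dict]
    chosen = random.sample(candidates, min(count, len(candidates)))
    return {word: random.choice(typo_dict[word]) for word in chosen}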
def generate_verb_phrases():
    language_model = spacy.load('en_core_web_sm')
    # `extrac_verb_phase` is an external helper; its name is kept as-is.
    sents = read_file(
        "test2.txt",
        preprocess=lambda x: extrac_verb_phase(language_model, x))
    # Keep only multi-word phrases.
    sents = (ele for ele in sents if len(ele.split(" ")) > 1)
    for ele in sents:
        print(ele.strip())
def filter_noun_adj(snli_path, output_path):
    snli_list = read_file(snli_path,
                          preprocess=lambda x: x.strip().split("\t"))
    # Drop pairs where a size adjective modifies a child-related noun.
    snli_list = filter(
        lambda x: not (x[1].strip() in {"little", "small"}
                       and x[2].strip() in {"boy", "girl", "child",
                                            "children", "baby"}),
        snli_list)
    # Drop adjectives that lack a clear antonym.
    snli_list = filter(lambda x: x[1] not in {"green", "other", "musical"},
                       snli_list)
    antonym_dict = {
        "same": "different",
        "different": "same",
        "long": "short",
        "short": "long",
        "older": "young",
        "younger": "old"
    }
    new_quad_list = []
    for quad in snli_list:
        # Replace the fourth field with the antonym of the adjective.
        if quad[1] in antonym_dict:
            quad[3] = antonym_dict[quad[1]]
        new_quad_list.append(quad)
    output_list_to_file(new_quad_list, output_path,
                        process=lambda x: "\t".join(x))
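
# `output_list_to_file` is another helper defined outside this file. A
# minimal sketch consistent with the call above: apply `process` to each
# element and write one element per line.
def output_list_to_file(elements, output_path, process=lambda x: x):
    """Write `process(element)` to `output_path`, one element per line."""
    with open(output_path, "w", encoding="utf-8") as file_out:
        for element in elements:
            file_out.write(process(element) + "\n")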
def generate_negative_samples():
    file_path = ("/Users/zxj/PycharmProjects/sentence_evaluation/dataset/"
                 "negative_no_unique.txt")
    file_generator = read_file(file_path, preprocess=lambda x: x.split("\t"))
    # Triple: original sentence, its pair, and the negated original.
    file_generator = ([tup[0], tup[1].strip(), negate_verb(tup[0])]
                      for tup in file_generator)
    for ele in file_generator:
        print("\001".join(ele).strip())
def test():
    file_path = ("/Users/zxj/Google 云端硬盘/experiment-results/SICK/"
                 "contradiction-sentences.txt")
    pairs = read_file(file_path, preprocess=lambda x: x.split("\t")[:2])
    pairs = filter(lambda x: "n't" in x[0] or "n't" in x[1], pairs)
    for ele in pairs:
        # Put the sentence without the contraction first.
        result = ele[1] + "\001" + ele[0] + "\001" + ele[1] \
            if "n't" in ele[0] \
            else ele[0] + "\001" + ele[1] + "\001" + ele[0]
        print(result)
def calculate_score(file_path):
    scores = read_file(file_path, preprocess=lambda x: x.split("\t"))
    scores = [[float(ele) for ele in score_list] for score_list in scores]
    score_array = np.array(scores)
    # A row counts as correct only when column 1 exceeds both column 0
    # and column 2.
    res1 = score_array[:, 0] < score_array[:, 1]
    res2 = score_array[:, 2] < score_array[:, 1]
    res3 = np.logical_and(res1, res2)
    print(np.sum(res3) / len(res3))
def filter_mrpc(file_path):
    # Relies on the module-level `nlp` model loaded in __main__.
    msrp_iter = read_file(file_path,
                          preprocess=lambda x: x.strip().split("\t"))
    msrp_iter = filter(lambda x: len(x) == 5 and x[0] == "1", msrp_iter)
    msrp_iter = map(lambda x: (x[3], x[4]), msrp_iter)
    for first, second in msrp_iter:
        # Yield the clause-bearing sentence first.
        if is_sentence_with_clause(first, nlp):
            yield first + "\t" + second
        elif is_sentence_with_clause(second, nlp):
            yield second + "\t" + first
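
# `is_sentence_with_clause` also lives elsewhere. A rough sketch of one
# plausible implementation: report whether the dependency parse contains a
# clause-introducing relation. The real helper may use different criteria.
def is_sentence_with_clause(sentence, parser):
    """Return True if `sentence` contains a clausal dependency."""
    return any(token.dep_ in {"ccomp", "relcl", "advcl"}
               for token in parser(sentence))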
def generate_random(file_path, get_reordered):
    """
    :param file_path: path of a tab-separated sentence file
    :param get_reordered: callable mapping a list of words to a reordered
        sentence string
    """
    sent_tuple = read_file(file_path, lambda x: x.strip().split("\t"))
    for arr in sent_tuple:
        first_sent = arr[0]
        first_arr = first_sent.split(" ")
        reversed_first = get_reordered(first_arr)
        arr.append(reversed_first)
        print("\t".join(arr).strip())
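
# Example of a `get_reordered` argument for `generate_random`: a plain
# random shuffle of the word list. This is only one possible reordering
# strategy; real callers may pass something more targeted. The file name in
# the usage line is a placeholder.
def shuffle_words(word_list):
    """Return the words joined in a random order."""
    shuffled = list(word_list)
    random.shuffle(shuffled)
    return " ".join(shuffled)
# Usage: generate_random("sentences.txt", shuffle_words)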
def extract_clause_from(language_model):
    path = "/Users/zxj/Downloads/dataset-sts/data/para/msr/msr-para-train.tsv"
    file = read_file(path, preprocess=lambda x: x.split("\t"))
    # Keep only sentence pairs labelled as paraphrases.
    file = [(ele[3], ele[4].strip()) for ele in file if ele[0] == "1"]
    say_regex = re.compile("said|says?|saying|tell|told|thinks")
    for a, b in file:
        if say_regex.search(a):
            clause = extract_clause(a, language_model)
            if clause:
                print(a + "\t" + clause)
        if not say_regex.search(a) and say_regex.search(b):
            clause = extract_clause(b, language_model)
            if clause:
                print(b + "\t" + clause)
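
# `extract_clause` is another external helper. A rough sketch of one
# plausible implementation, assuming the clause of interest is the subtree
# of a clausal-complement (ccomp) token; the real helper may differ.
def extract_clause(sentence, language_model):
    """Return the text of the first ccomp subtree, or None."""
    for token in language_model(sentence):
        if token.dep_ == "ccomp":
            return "".join(t.text_with_ws for t in token.subtree).strip()
    return None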
def calculate_word_frequency():
    file_path = "/Users/zxj/Desktop/snli_1.0/possible_contradiction"
    sent_tuple = read_file(file_path, lambda x: x.split("\t"))
    results = {}
    for arr in sent_tuple:
        arr_tuple = zip(arr[0].split(" "), arr[1].split(" "))
        # Keep only the word positions where the two sentences differ.
        arr_tuple = [ele for ele in arr_tuple if ele[0] != ele[1]]
        if not arr_tuple:
            continue
        arr_str = (arr_tuple[0][0] + "\t" + arr_tuple[0][1]).strip().lower()
        results[arr_str] = results.get(arr_str, 0) + 1
    # Print the word pairs that occur exactly once.
    for key, value in results.items():
        if value == 1:
            print(key, value)
def factual_test():
    sents = read_file(
        "/Users/zxj/Google 云端硬盘/experiment-results/Clause Relatedness/"
        "clause_relatededness_samples.txt",
        preprocess=lambda x: x.strip().split("\002")[:-1])
    # Map reporting verbs to their denial/doubt counterparts.
    replace_dict = {
        "say": "deny",
        "says": "denies",
        "said": "denied",
        "think": "doubt",
        "thinks": "doubts",
        "thought": "doubted"
    }
    for arr in sents:
        for key, value in replace_dict.items():
            # Match the verb only when preceded by a space.
            pattern = re.compile(" {0}".format(key))
            if not pattern.search(arr[0]):
                continue
            new_sent = pattern.sub(" {0}".format(value), arr[0])
            arr.append(new_sent)
        print("\t".join(arr))
def calculate_mean(file_path):
    scores = read_file(file_path, preprocess=lambda x: x.split("\t"))
    scores = [[float(ele) for ele in score_list] for score_list in scores]
    # Column-wise mean over all rows.
    score_array = np.mean(scores, axis=0)
    print(score_array)
def load_sick2(sick_path="/Users/zxj/Downloads/SICK/SICK.txt"):
    file_list = read_file(sick_path)
    # Skip the header row and keep columns 1-4
    # (sentence A, sentence B, entailment label, relatedness score).
    file_list = (ele.split("\t")[1:5] for ele in file_list
                 if not ele.startswith("pair_ID"))
    return file_list
def no_postnominal_adjective(sentence, parser):
    # NOTE: the original definition line was lost here; this name and
    # signature are reconstructed from the surviving body. Returns False
    # when an adjectival modifier follows the noun it modifies.
    for ele in parser(sentence):
        if (ele.pos_ == "ADJ" and ele.dep_ == "amod"
                and ele.head.pos_ == "NOUN" and ele.idx > ele.head.idx):
            return False
    return True


def fix_bugs_in_sentence(sent, parser):
    # Correct subject-verb agreement: "is" inside a relative clause whose
    # head noun is plural becomes "are".
    for token in parser(sent):
        if token.dep_ == "relcl" and token.text == "is":
            if token.head.tag_ in {"NNS", "NNPS"}:
                return (sent[:token.idx] + "are"
                        + sent[token.idx + len(token.text):])
    return sent


if __name__ == '__main__':
    nlp = spacy.load('en_core_web_sm')
    msrp_dir = "/home/zxj/Downloads/new_corpus"
    file_path = os.path.join(msrp_dir,
                             "filtered_opinion_negation_triplets.txt")
    negation_pattern = re.compile(r" 't")
    msrp_iter = read_file(file_path,
                          preprocess=lambda x: x.strip().split("\t"))
    for ele in msrp_iter:
        first = ele[0]
        # Skip sentences that already contain a negative contraction.
        if negation_pattern.search(first):
            continue
        second = negate_word_msr(first, nlp)
        print(first + "\t" + ele[1] + "\t" + second)