import asyncio
import json
import os
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import spacy

# Helpers such as read_file, output_iterator, output_list_to_file, chunks,
# get_url_link_list, get_summary and get_chapter_summary_test are project
# utilities defined elsewhere in the repository.


def generate_datasets(root_dir, category_name, generation_algorithm):
    templates_dir = os.path.join(root_dir, "templates")
    dict_dir = os.path.join(root_dir, "dict")
    template_path = os.path.join(templates_dir,
                                 "{0}_templates.txt".format(category_name))
    dict_path = os.path.join(dict_dir, "{0}_words.txt".format(category_name))
    word_dict = {
        key: value
        for key, value in read_file(dict_path,
                                    preprocess=lambda x: x.strip().split("\t"))
    }
    templates = list(read_file(template_path, preprocess=lambda x: x.strip()))
    return generation_algorithm(templates, word_dict)
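# read_file is used throughout these snippets but never defined in this
# excerpt. A minimal sketch of the assumed contract -- lazily yield each line
# of a text file after applying an optional `preprocess` callable -- is given
# below; the project's real helper may differ.
def read_file_sketch(file_path, preprocess=lambda x: x):
    """Yield preprocess(line) for every line in file_path."""
    with open(file_path, encoding="utf-8") as handle:
        for line in handle:
            yield preprocess(line)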
def read_template_dict(template_name, category_name):
    root_dir = "/home/zxj/Data/multinli_1.0"
    template_path = os.path.join(root_dir, template_name)
    dict_path = os.path.join(root_dir, "word-pairs-per-category.json")
    with open(dict_path) as dict_file:
        pair_dict = json.load(dict_file)[category_name]
    templates = read_file(template_path, preprocess=lambda x: x.strip())
    return templates, pair_dict
def extract_dict(file_path):
    """
    :type file_path: str
    :param file_path: path of input file
    :return: Dict[str, str]
    """
    file_iter = read_file(file_path,
                          preprocess=lambda x: x.strip().split("\t")[:2])
    return {key.lower(): value.lower() for key, value in file_iter}
def main():
    input_path = "/home/zxj/Data/SparkNotes/url_parts/url_part_{0}.txt"
    root_url = "https://www.sparknotes.com"
    output_path = "/home/zxj/Data/SparkNotes/chapters_url_parts/chapters_url_part_{0}.txt"
    loop = asyncio.get_event_loop()
    input_list = list(
        read_file("/home/zxj/Data/SparkNotes/missing_urls.txt",
                  preprocess=lambda x: json.loads(x.strip())))
    for ele in input_list:
        print(ele)
    loop.run_until_complete(get_url_link_list(input_list))
    output_iterator("/home/zxj/Data/SparkNotes/missing_urls_new.txt",
                    input_list, process=lambda x: json.dumps(x))
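# output_iterator is another project helper that is only called, never
# defined, in this excerpt. A hypothetical sketch of the assumed behaviour
# (write one processed item per line) follows; the real helper may differ.
def output_iterator_sketch(file_path, items, process=lambda x: x):
    """Write process(item) for every item, one per line."""
    with open(file_path, mode="w", encoding="utf-8") as handle:
        for item in items:
            handle.write(process(item) + "\n")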
def summary_scrapping():
    input_path = "/Users/zxj/Google 云端硬盘/SparkNotes/book_summaries_unfinished.txt"
    input_list = list(
        read_file(input_path, preprocess=lambda x: json.loads(x.strip())))
    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(
        get_url_link_list(input_list, get_link=get_summary))
    finished_path = "/Users/zxj/Google 云端硬盘/SparkNotes/book_summaries_finished_new.txt"
    unfinished_path = "/Users/zxj/Google 云端硬盘/SparkNotes/book_summaries_unfinished_new.txt"
    finished = [ele for ele in input_list if 'summary' in ele]
    unfinished = [ele for ele in input_list if 'summary' not in ele]
    output_iterator(finished_path, finished, process=lambda x: json.dumps(x))
    output_iterator(unfinished_path, unfinished, process=lambda x: json.dumps(x))
def get_chapter_summaries():
    input_path = "/Users/zxj/Google 云端硬盘/SparkNotes/parts/book_chapter_summaries_1.txt"
    input_list = read_file(input_path, preprocess=lambda x: json.loads(x.strip()))
    input_list = (ele for ele in input_list if ele['chapters_url'])
    input_list = list(input_list)
    filtered_list = [ele for summary in input_list
                     for ele in summary['chapters_url']
                     if 'summary' not in ele]
    loop = asyncio.get_event_loop()
    done, pending = loop.run_until_complete(
        get_chapter_summary_test(filtered_list))
    for ele in filtered_list:
        print(ele)
def get_document_summary(self, tokenize=False):
    """Read every *.story file under target_dir, split each into a
    (document, summary) pair and optionally tokenize both sides."""
    root_dir = self.target_dir
    file_path_list = [
        file_name for file_name in os.listdir(root_dir)
        if file_name.endswith(".story")
    ]
    result_list = (read_file(os.path.join(root_dir, path))
                   for path in file_path_list)
    result_list = [self.__process_story(doc) for doc in result_list]
    result_list = [tup for tup in result_list if tup[0] and tup[1]]
    if tokenize:
        self.tokenized = tokenize
        result_list = [(tokenize_list(doc, self.tokenizer),
                        tokenize_list(summary, self.tokenizer))
                       for doc, summary in result_list]
    return result_list
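# tokenize_list is assumed to apply the stored tokenizer to every sentence in
# a document. A hypothetical sketch of that contract (the real helper and its
# return format may differ):
def tokenize_list_sketch(sentences, tokenizer):
    """Return the tokenizer's output for each sentence."""
    return [tokenizer(sentence) for sentence in sentences]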
def calculate_overlap(dict_dir, dict_name, vocabulary):
    dict_path = os.path.join(dict_dir, dict_name)
    category_vocab = list(
        read_file(dict_path, preprocess=lambda x: x.strip().split("\t")))
    new_vocab = []
    oov_pairs = []
    for first, second in category_vocab:
        '''
        if dict_name != "family_words.txt":
            first = string.capwords(first)
        if dict_name not in {"family_words.txt", "currency_words.txt"}:
            second = string.capwords(second)
        '''
        if first in vocabulary and second in vocabulary:
            new_vocab.append(first + "\t" + second)
        else:
            oov_pairs.append(first + "\t" + second)
    overlap_rate = float(len(new_vocab)) / float(len(category_vocab))
    return new_vocab, oov_pairs, overlap_rate
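# Hypothetical usage of calculate_overlap; `vocabulary` is assumed to be a set
# of known words (for example the keys of an embedding table), which is not
# shown in this excerpt:
#     vocabulary = set(embedding_index.keys())
#     in_vocab, oov, rate = calculate_overlap(dict_dir, "family_words.txt", vocabulary)
#     print("coverage: {0:.2%}".format(rate))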
def get_sentences_with_certain_words(file_path, dict_path, category_name,
                                     output_path, capitalize=False):
    sentence_iterator = read_file(
        file_path, preprocess=lambda x: x.strip().split("\t"))
    sentence_set = set(sent for arr in sentence_iterator for sent in arr)
    with open(dict_path, "r") as category_dict:
        category_list = json.load(category_dict)[category_name]
    key_value_list = []
    for key, value in category_list.items():
        if capitalize:
            key = string.capwords(key)
            value = string.capwords(value)
        key_value_list.append(" {0} ".format(key))
        if value.lower() != "real":
            key_value_list.append(" {0} ".format(value))
    pattern_str = "|".join(key_value_list)
    pattern = re.compile(pattern_str)
    new_sentence_set = [sent for sent in sentence_set if pattern.search(sent)]
    output_list_to_file(output_path, new_sentence_set)
def main():
    mnli_path = "/home/zxj/Data/multinli_1.0"
    input_path = os.path.join(mnli_path, "multinli_1.0_train_sents.txt")
    sent_list = set(
        sent for sent_tuple in read_file(
            input_path, preprocess=lambda x: x.strip().split("\t"))
        for sent in sent_tuple)
    nlp = spacy.load("en_core_web_sm")
    dict_path = os.path.join(mnli_path, "word-pairs-per-category.json")
    plural_verb_dict = json.load(
        open(dict_path, encoding="utf-8"))[": gram9-plural-verbs"]
    # For every sentence whose proper-noun subject governs a bare-verb ROOT,
    # drop the auxiliary (and any negation) and swap the verb for its
    # counterpart from the gram9-plural-verbs pairs.
    for sentence in sent_list:
        doc = nlp(sentence)
        for token in doc:
            if token.dep_ == "nsubj" and token.head.dep_ == "ROOT" and token.tag_ == "NNP":
                root_node = token.head
                root_text = root_node.text
                if root_node.tag_ == "VB" and root_text in plural_verb_dict:
                    child_aux = [child for child in root_node.children
                                 if child.dep_ == "aux"]
                    if child_aux:
                        child_negation = [child for child in root_node.children
                                          if child.dep_ == "neg"]
                        if not child_negation:
                            words_to_delete = child_aux[0].text
                        elif child_negation[0].text == "not":
                            words_to_delete = child_aux[0].text + " " + child_negation[0].text
                        else:
                            words_to_delete = child_aux[0].text + child_negation[0].text
                        plural_verb = plural_verb_dict[root_text]
                        # Escape the deleted words so they are matched literally.
                        new_sentence = re.sub(re.escape(words_to_delete), "", sentence)
                        new_sentence = re.sub(root_text, plural_verb, new_sentence)
                        new_sentence = re.sub(r"\s+", " ", new_sentence)
                        print(root_text + "\t" + plural_verb + "\t" +
                              sentence + "\t" + new_sentence)
                break
def analyze_document(input_path):
    number_lists = read_file(
        input_path, preprocess=lambda x: [int(ele) for ele in x.strip().split()])
    counter = Counter()
    for ele in number_lists:
        counter.update(ele)
    values = list(counter.values())
    total = sum(values)
    partial = sum(values[:30])
    print(partial / total)
    new_counter = sorted(counter.items())[:30]
    keys, values = zip(*new_counter)
    probs = [ele * 100.0 / total for ele in values]
    cdf = [0 for _ in range(len(probs))]
    for idx, ele in enumerate(probs):
        if idx > 0:
            cdf[idx] = cdf[idx - 1] + ele
        else:
            cdf[idx] = ele
    fig1, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 8))
    ax1.plot(keys, probs)
    ax1.set(xlabel='index of sentences', ylabel='probability',
            title='Fig 1 Probability distribution of the sentence that is '
                  'closest to sentences in the summary')
    ax2.plot(keys, cdf)
    ax2.set(xlabel='index of sentences', ylabel='probability',
            title='Fig 2 Cumulative distribution of the sentence that is '
                  'closest to sentences in the summary')
    plt.show()
    for idx, ele in enumerate(doc_list):
        if len(ele) < num_partitions:
            new_sentence_list.append(" ".join(ele))
            partition_map[counter] = idx
            counter += 1
            continue
        for part in chunks(ele, num_partitions):
            new_sentence_list.append(" ".join(part))
            partition_map[counter] = idx
            counter += 1
    return new_sentence_list, partition_map


def merge_partition(partition_map, input_iter):
    max_length = max(partition_map.values()) + 1
    new_result_list = ["" for _ in range(max_length)]
    for idx, ele in enumerate(input_iter):
        new_result_list[partition_map[str(idx)]] += ele
        new_result_list[partition_map[str(idx)]] += " "
    return new_result_list


if __name__ == '__main__':
    input_dir = "/home/zxj/Documents/github/PacSum/extracted_parts/extracted_contents_all.txt"
    output_template = "/home/zxj/Documents/github/PacSum/extracted_parts/content_part_{0}.txt"
    doc_list = list(read_file(input_dir, preprocess=lambda x: x.strip()))
    idx = 0
    for ele in chunks(doc_list, 7):
        output_iterator(output_template.format(idx), ele)
        idx += 1
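# chunks is used above to split a sequence into smaller pieces but is not
# defined in this excerpt. A hypothetical sketch of one common reading of the
# interface (fixed-size slices); the project's helper may instead interpret
# the second argument as the number of chunks:
def chunks_sketch(items, size):
    """Yield successive size-sized slices of items."""
    for start in range(0, len(items), size):
        yield items[start:start + size]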
    transformation_list = [
        random_deletion, random_masking, span_corrupting, word_reordering
    ]
    with open(output_path, mode="w+") as out_file:
        for ele in triplets_list:
            hypo, premise, neg_cand = ele
            output_dict = {
                "hypothesis": hypo,
                "premise": premise,
                "negative_candidates": [neg_cand]
            }
            tokenized_premise = [ele for ele in tokenizer(premise)]
            for corrupt_func in transformation_list:
                result = [
                    ele.text if not isinstance(ele, str) else ele
                    for ele in corrupt_func(tokenized_premise)
                ]
                output_dict["negative_candidates"].append(" ".join(result))
            out_file.write(json.dumps(output_dict) + "\n")


if __name__ == '__main__':
    input_dir = "/home/zxj/Data/relation_based_analogy/input"
    input_path = os.path.join(input_dir, "adjective_compositionality.txt")
    input_iter = read_file(input_path,
                           preprocess=lambda x: x.strip().split("\t"))
    outpath = os.path.join(input_dir, "adjective_analogy.txt")
    nlp = spacy.load("en_core_web_sm")
    generate_negative_candidates(input_iter, nlp, outpath)
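# The corruption functions (random_deletion, random_masking, span_corrupting,
# word_reordering) are not defined in this excerpt; they are assumed to take a
# list of tokens and return a corrupted list of tokens. A hypothetical sketch
# of random_deletion under that assumption (the real implementation may differ):
import random


def random_deletion_sketch(tokens, drop_prob=0.1):
    """Drop each token with probability drop_prob, keeping at least one."""
    kept = [tok for tok in tokens if random.random() > drop_prob]
    return kept if kept else [random.choice(list(tokens))]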
def filter_topic(file_path):
    # Print the category header lines (those starting with ": ") of an
    # analogy-style file.
    category_pattern = re.compile(r"^: ")
    contents = read_file(file_path, preprocess=lambda x: x.strip())
    for ele in contents:
        if category_pattern.search(ele):
            print(ele)