Example #1
def generate_datasets(root_dir, category_name, generation_algorithm):
    templates_dir = os.path.join(root_dir, "templates")
    dict_dir = os.path.join(root_dir, "dict")
    template_path = os.path.join(templates_dir,
                                 "{0}_templates.txt".format(category_name))
    dict_path = os.path.join(dict_dir, "{0}_words.txt".format(category_name))
    word_dict = {
        key: value
        for key, value in read_file(dict_path,
                                    preprocess=lambda x: x.strip().split("\t"))
    }
    templates = list(read_file(template_path, preprocess=lambda x: x.strip()))
    return generation_algorithm(templates, word_dict)
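All of the examples on this page call a read_file helper that is not shown here. Judging from the call sites (a path plus an optional preprocess callable applied to every line), it is presumably a small generator along the following lines; the signature and defaults are assumptions, not the repository's actual code.

def read_file(file_path, preprocess=lambda x: x, encoding="utf-8"):
    # Assumed helper: yield each line of the file after applying preprocess.
    with open(file_path, encoding=encoding) as input_file:
        for line in input_file:
            yield preprocess(line)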
Example #2
def read_template_dict(template_name, category_name):
    root_dir = "/home/zxj/Data/multinli_1.0"
    template_path = os.path.join(root_dir, template_name)
    dict_path = os.path.join(root_dir, "word-pairs-per-category.json")
    # Load the word-pair dictionary for the requested category.
    with open(dict_path, encoding="utf-8") as dict_file:
        pair_dict = json.load(dict_file)[category_name]
    templates = read_file(template_path, preprocess=lambda x: x.strip())
    return templates, pair_dict
Example #3
def extract_dict(file_path):
    """
    :type file_path: str
    :param file_path: path of input file
    :return: Dict[str, str]
    """
    file_iter = read_file(file_path,
                          preprocess=lambda x: x.strip().split("\t")[:2])
    return {key.lower(): value.lower() for key, value in file_iter}
def main():
    input_path = "/home/zxj/Data/SparkNotes/url_parts/url_part_{0}.txt"
    root_url = "https://www.sparknotes.com"
    output_path = "/home/zxj/Data/SparkNotes/chapters_url_parts/chapters_url_part_{0}.txt"
    loop = asyncio.get_event_loop()
    input_list = list(
        read_file("/home/zxj/Data/SparkNotes/missing_urls.txt",
                  preprocess=lambda x: json.loads(x.strip())))
    for ele in input_list:
        print(ele)
    loop.run_until_complete(get_url_link_list(input_list))

    output_iterator("/home/zxj/Data/SparkNotes/missing_urls_new.txt",
                    input_list, process=lambda x: json.dumps(x))
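main() above and summary_scrapping() below also rely on an output_iterator helper that this page never defines. Given how it is called (a path, an iterable, and an optional process callable), a minimal sketch could look like this; treat the exact signature as an assumption.

def output_iterator(file_path, iterable, process=lambda x: x):
    # Assumed helper: write one processed element per line to the given file.
    with open(file_path, mode="w", encoding="utf-8") as output_file:
        for element in iterable:
            output_file.write(process(element) + "\n")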
def summary_scrapping():
    input_path = "/Users/zxj/Google 云端硬盘/SparkNotes/book_summaries_unfinished.txt"
    input_list = list(
        read_file(input_path, preprocess=lambda x: json.loads(x.strip())))
    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(get_url_link_list(input_list,
                                                        get_link=get_summary))
    finished_path = "/Users/zxj/Google 云端硬盘/SparkNotes/book_summaries_finished_new.txt"
    unfinished_path = "/Users/zxj/Google 云端硬盘/SparkNotes/book_summaries_unfinished_new.txt"

    finished = [ele for ele in input_list if 'summary' in ele]
    unfinished = [ele for ele in input_list if 'summary' not in ele]
    output_iterator(finished_path, finished, process=lambda x: json.dumps(x))
    output_iterator(unfinished_path, unfinished,
                    process=lambda x: json.dumps(x))
def get_chapter_summaries():
    input_path = "/Users/zxj/Google 云端硬盘/SparkNotes/parts/book_chapter_summaries_1.txt"
    input_list = read_file(input_path,
                           preprocess=lambda x: json.loads(x.strip()))
    input_list = (ele for ele in input_list if ele['chapters_url'])
    input_list = list(input_list)
    filtered_list = [ele for summary in input_list for ele in
                     summary['chapters_url'] if 'summary' not in ele]

    loop = asyncio.get_event_loop()
    done, pending = loop.run_until_complete(
        get_chapter_summary_test(filtered_list))

    for ele in filtered_list:
        print(ele)

    # Fragment from another example: get_document_summary below is a method of
    # some class (note the self parameter); the surrounding class definition and
    # the opening of its example were lost when this page was extracted.
    def get_document_summary(self, tokenize=False):
        root_dir = self.target_dir
        file_path_list = [
            file_name for file_name in os.listdir(root_dir)
            if file_name[-6:] == ".story"
        ]
        result_list = (read_file(os.path.join(root_dir, path))
                       for path in file_path_list)
        result_list = [self.__process_story(doc) for doc in result_list]
        result_list = [tup for tup in result_list if tup[0] and tup[1]]

        if tokenize:
            self.tokenized = tokenize
            result_list = [(tokenize_list(doc, self.tokenizer),
                            tokenize_list(summary, self.tokenizer))
                           for doc, summary in result_list]
        return result_list
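get_document_summary depends on a tokenize_list helper (and a tokenizer whose tokens expose .text, as in the negative-candidate example further down). Its implementation is not shown; the sketch below is a plausible guess under the assumption that it tokenizes each sentence in a list and rejoins the tokens with spaces.

def tokenize_list(sentence_list, tokenizer):
    # Assumed helper: tokenize every sentence and rejoin the tokens with spaces.
    return [" ".join(token.text for token in tokenizer(sentence))
            for sentence in sentence_list]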
Example #8
def calcualte_overlap(dict_dir, dict_name, vocabulary):
    dict_path = os.path.join(dict_dir, dict_name)
    category_vocab = list(
        read_file(dict_path, preprocess=lambda x: x.strip().split("\t")))
    new_vocab = []
    oov_pairs = []
    for first, second in category_vocab:
        '''
        if dict_name != "family_words.txt":
            first = string.capwords(first)
        if dict_name not in {"family_words.txt", "currency_words.txt"}:
            second = string.capwords(second)
        '''
        if first in vocabulary and second in vocabulary:
            new_vocab.append(first + "\t" + second)
        else:
            oov_pairs.append(first + "\t" + second)

    overlap_rate = float(len(new_vocab)) / float(len(category_vocab))
    return new_vocab, oov_pairs, overlap_rate
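A possible driver for this function, shown only for illustration: the report_overlap name and the vocabulary file layout (one token per line) are assumptions, not part of the original code.

def report_overlap(dict_dir, vocab_path):
    # Hypothetical driver: load a vocabulary into a set and print the
    # in-vocabulary rate of every category dictionary in dict_dir.
    vocabulary = set(read_file(vocab_path, preprocess=lambda x: x.strip()))
    for dict_name in os.listdir(dict_dir):
        _, oov_pairs, overlap_rate = calcualte_overlap(dict_dir, dict_name, vocabulary)
        print("{0}\t{1:.2%}\t{2} OOV pairs".format(dict_name, overlap_rate, len(oov_pairs)))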
Example #9
def get_sentences_with_certain_words(file_path, dict_path, category_name, output_path, capitalize=False):
    sentence_iterator = read_file(
        file_path, preprocess=lambda x: x.strip().split("\t"))
    sentence_set = set([sent for arr in sentence_iterator for sent in arr])

    with open(dict_path, "r") as category_dict:
        category_list = json.load(category_dict)[category_name]
    key_value_list = []
    for key, value in category_list.items():
        if capitalize:
            key = string.capwords(key)
            value = string.capwords(value)

        key_value_list.append(" {0} ".format(key))
        if value.lower() != "real":
            key_value_list.append(" {0} ".format(value))

    pattern_str = "|".join(key_value_list)
    pattern = re.compile(pattern_str)
    new_sentence_set = [sent for sent in sentence_set if pattern.search(sent)]
    output_list_to_file(output_path, new_sentence_set)
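output_list_to_file is another helper the page never shows; from the call above it presumably just writes each string on its own line. A minimal sketch under that assumption:

def output_list_to_file(file_path, string_list):
    # Assumed helper: write each string in the list to the file, one per line.
    with open(file_path, mode="w", encoding="utf-8") as output_file:
        for line in string_list:
            output_file.write(line + "\n")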
Example #10
def main():
    mnli_path = "/home/zxj/Data/multinli_1.0"
    input_path = os.path.join(mnli_path, "multinli_1.0_train_sents.txt")
    sent_list = set(sent
                    for sent_tuple in read_file(
                        input_path, preprocess=lambda x: x.strip().split("\t"))
                    for sent in sent_tuple)

    nlp = spacy.load("en_core_web_sm")
    dict_path = os.path.join(mnli_path, "word-pairs-per-category.json")

    with open(dict_path, encoding="utf-8") as dict_file:
        plural_verb_dict = json.load(dict_file)[": gram9-plural-verbs"]
    for sentence in sent_list:
        doc = nlp(sentence)
        for token in doc:
            if token.dep_ == "nsubj" and token.head.dep_ == "ROOT" and token.tag_ == "NNP":
                root_node = token.head
                root_text = root_node.text
                if root_node.tag_ == "VB" and root_text in plural_verb_dict:
                    child_aux = [
                        child for child in root_node.children if child.dep_ == "aux"]
                    if child_aux:
                        child_negation = [
                            child for child in root_node.children if child.dep_ == "neg"]
                        if not child_negation:
                            words_to_delete = child_aux[0].text
                        elif child_negation[0].text == "not":
                            words_to_delete = child_aux[0].text + \
                                              " " + child_negation[0].text

                        else:
                            words_to_delete = child_aux[0].text + \
                                              child_negation[0].text
                        plural_verb = plural_verb_dict[root_text]
                        # Escape the extracted words so they are matched
                        # literally rather than as regular expressions.
                        new_sentence = re.sub(re.escape(words_to_delete), "", sentence)
                        new_sentence = re.sub(
                            re.escape(root_text), plural_verb, new_sentence)
                        new_sentence = re.sub(r"\s+", " ", new_sentence)
                        print(root_text + "\t" + plural_verb +
                              "\t" + sentence + "\t" + new_sentence)
                        break
Example #11
def analyze_document(input_path):
    number_lists = read_file(
        input_path,
        preprocess=lambda x: [int(ele) for ele in x.strip().split()])
    counter = Counter()
    for ele in number_lists:
        counter.update(ele)
    values = list(counter.values())
    total = sum(values)
    partial = sum(values[:30])
    print(partial / total)
    new_counter = sorted(counter.items())[:30]
    keys, values = zip(*new_counter)
    probs = [ele * 100.0 / total for ele in values]
    cdf = [0 for _ in range(len(probs))]
    for idx, ele in enumerate(probs):
        if idx > 0:
            cdf[idx] = cdf[idx - 1] + ele
        else:
            cdf[idx] = ele

    fig1, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 8))
    ax1.plot(keys, probs)
    ax1.set(
        xlabel='index of sentences',
        ylabel='probability',
        title=
        'Fig 1 Probability Distribution of A sentence that is closest to sentences in summary'
    )
    ax2.plot(keys, cdf)
    ax2.set(
        xlabel='index of sentences',
        ylabel='probability',
        title=
        'Fig 2 Cumulative Distribution of A sentence that is closest to sentences in summary'
    )

    plt.show()


def partition_documents(doc_list, num_partitions):
    # NOTE: the original header of this function was lost when the example was
    # extracted; the name, signature, and the three initialisations below are
    # reconstructions inferred from the body and may differ from the
    # repository's actual code.
    new_sentence_list = []
    partition_map = {}
    counter = 0
    for idx, ele in enumerate(doc_list):
        if len(ele) < num_partitions:
            new_sentence_list.append(" ".join(ele))
            partition_map[counter] = idx
            counter += 1
            continue

        for part in chunks(ele, num_partitions):
            new_sentence_list.append(" ".join(part))
            partition_map[counter] = idx
            counter += 1
    return new_sentence_list, partition_map


def merge_partition(partition_map, input_iter):
    max_length = max(partition_map.values()) + 1
    new_result_list = ["" for _ in range(max_length)]
    for idx, ele in enumerate(input_iter):
        # partition_map is indexed with string keys here, which suggests the map
        # was serialised (e.g. to JSON) between partitioning and merging.
        new_result_list[partition_map[str(idx)]] += ele + " "
    return new_result_list
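chunks is used both in the partition code above and in the __main__ block below but is never defined on this page. From its call sites it appears to split a sequence into a fixed number of contiguous pieces; the sketch below is an assumption, and the real helper might instead yield fixed-size chunks.

def chunks(sequence, num_chunks):
    # Assumed helper: split a sequence into num_chunks roughly equal,
    # contiguous pieces.
    base_size, remainder = divmod(len(sequence), num_chunks)
    start = 0
    for index in range(num_chunks):
        end = start + base_size + (1 if index < remainder else 0)
        yield sequence[start:end]
        start = end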


if __name__ == '__main__':
    input_dir = "/home/zxj/Documents/github/PacSum/extracted_parts/extracted_contents_all.txt"
    output_template = "/home/zxj/Documents/github/PacSum/extracted_parts/content_part_{0}.txt"
    doc_list = list(read_file(input_dir, preprocess=lambda x: x.strip()))
    idx = 0
    for ele in chunks(doc_list, 7):
        output_iterator(output_template.format(idx), ele)
        idx += 1
    # Fragment from another example: the lines from here to the end of the
    # with-block appear to belong to generate_negative_candidates (called in the
    # second __main__ block below); its header and the definitions of
    # output_path, triplets_list and tokenizer were lost during extraction.
    transformation_list = [
        random_deletion, random_masking, span_corrupting, word_reordering
    ]
    with open(output_path, mode="w+") as out_file:
        for ele in triplets_list:
            hypo, premise, neg_cand = ele
            output_dict = {
                "hypothesis": hypo,
                "premise": premise,
                "negative_candidates": [neg_cand]
            }

            tokenized_premise = [ele for ele in tokenizer(premise)]
            for corrupt_func in transformation_list:
                result = [
                    ele.text if not isinstance(ele, str) else ele
                    for ele in corrupt_func(tokenized_premise)
                ]
                output_dict["negative_candidates"].append(" ".join(result))
            out_file.write(json.dumps(output_dict) + "\n")


if __name__ == '__main__':
    input_dir = "/home/zxj/Data/relation_based_analogy/input"
    input_path = os.path.join(input_dir, "adjective_compositionality.txt")
    input_iter = read_file(input_path,
                           preprocess=lambda x: x.strip().split("\t"))
    outpath = os.path.join(input_dir, "adjective_analogy.txt")
    nlp = spacy.load("en_core_web_sm")
    generate_negative_candidates(input_iter, nlp, outpath)
Example #14
def filter_topic(file_path):
    category_pattern = re.compile(r"^: ")
    contents = read_file(file_path, preprocess=lambda x: x.strip())
    for ele in contents:
        if category_pattern.search(ele):
            print(ele)