Example #1
import asyncio
import json

# read_file, output_iterator and get_url_link_list are helpers from the
# surrounding project and are assumed to be importable.


def main():
    # path templates kept from the original script (not used below)
    input_path = "/home/zxj/Data/SparkNotes/url_parts/url_part_{0}.txt"
    root_url = "https://www.sparknotes.com"
    output_path = "/home/zxj/Data/SparkNotes/chapters_url_parts/chapters_url_part_{0}.txt"
    loop = asyncio.get_event_loop()
    # load the URLs that still need to be scraped, one JSON object per line
    input_list = list(
        read_file("/home/zxj/Data/SparkNotes/missing_urls.txt",
                  preprocess=lambda x: json.loads(x.strip())))
    for ele in input_list:
        print(ele)
    # run the concurrent scraping coroutine over the loaded entries
    loop.run_until_complete(get_url_link_list(input_list))

    # write the entries back out, one JSON object per line
    output_iterator("/home/zxj/Data/SparkNotes/missing_urls_new.txt",
                    input_list, process=lambda x: json.dumps(x))
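
The read_file and output_iterator helpers that every example on this page relies on are not shown. A minimal sketch of what they might look like, assuming read_file lazily yields one preprocessed line per input line and output_iterator writes one processed item per line (the real helpers may differ):

def read_file(path, preprocess=lambda x: x):
    # lazily yield one preprocessed line at a time (hypothetical sketch)
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            yield preprocess(line)


def output_iterator(path, iterable, process=lambda x: x):
    # write one processed item per line (hypothetical sketch)
    with open(path, "w", encoding="utf-8") as handle:
        for item in iterable:
            handle.write(process(item) + "\n")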
Example #2
import asyncio
import json


def summary_scrapping():
    input_path = "/Users/zxj/Google 云端硬盘/SparkNotes/book_summaries_unfinished.txt"
    # books whose summaries have not been scraped yet, one JSON object per line
    input_list = list(
        read_file(input_path, preprocess=lambda x: json.loads(x.strip())))
    loop = asyncio.get_event_loop()
    # scrape concurrently; successfully fetched entries end up with a 'summary' field
    results = loop.run_until_complete(get_url_link_list(input_list,
                                                        get_link=get_summary))
    finished_path = "/Users/zxj/Google 云端硬盘/SparkNotes/book_summaries_finished_new.txt"
    unfinished_path = "/Users/zxj/Google 云端硬盘/SparkNotes/book_summaries_unfinished_new.txt"

    # split entries by whether a summary was obtained and write each group to its own file
    finished = [ele for ele in input_list if 'summary' in ele]
    unfinished = [ele for ele in input_list if 'summary' not in ele]
    output_iterator(finished_path, finished, process=lambda x: json.dumps(x))
    output_iterator(unfinished_path, unfinished,
                    process=lambda x: json.dumps(x))
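
The get_url_link_list coroutine and the get_summary callback come from the scraping code and are not shown here. A rough sketch of how such a coroutine could be structured with aiohttp, assuming each entry carries a 'url' field and the callback extracts the wanted data from the fetched page; the names, fields and the fetch_one helper are illustrative, not the project's actual API:

import asyncio

import aiohttp


async def get_url_link_list(input_list, get_link=None):
    # fetch every entry's page concurrently and let get_link update the entry
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_one(session, entry, get_link) for entry in input_list]
        await asyncio.gather(*tasks)


async def fetch_one(session, entry, get_link):
    # hypothetical helper: download one page and hand it to the callback
    async with session.get(entry["url"]) as response:
        html = await response.text()
    if get_link is not None:
        get_link(entry, html)  # e.g. store entry['summary']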
Example #3
import re

from spacy.lang.en import English


def preprocess_file(test_dir, summary_path):
    # Preprocessor and output_iterator are helpers from the surrounding project
    cnn_preprocessor = Preprocessor(test_dir, tokenizer=None)
    tokenize = False
    content_list = cnn_preprocessor.get_document_summary(tokenize)
    doc_list, summary_list = zip(*content_list)
    # blank English pipeline with a rule-based sentencizer (spaCy 2.x-style API)
    nlp = English()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    cnn_pattern = re.compile(r"\(CNN\)")
    #words = ([ele.text for sent in doc for ele in nlp(cnn_pattern.sub("", sent)).sents] for doc in doc_list)
    # tokenize every summary sentence and re-join the tokens with single spaces
    summaries = [[
        " ".join([ele.text for ele in tokenizer(sent)]) for sent in summary
    ] for summary in summary_list]
    #output_iterator(os.path.join(output_path, "cnn_dm_input.txt"), words, process=lambda x: "\001".join(x))
    # one summary per line, sentences separated by the \001 control character
    output_iterator(summary_path, summaries, process=lambda x: "\001".join(x))
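
For reference, the sentencizer added above splits text on sentence-final punctuation without loading a statistical model. A small usage sketch with the same spaCy 2.x-style API as the snippet; the input text is made up:

from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))  # rule-based sentence boundaries

doc = nlp("(CNN) The first sentence. The second sentence follows.")
print([sent.text for sent in doc.sents])
# -> ['(CNN) The first sentence.', 'The second sentence follows.']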
Example #4
    # wrap the model for multi-GPU inference when requested
    if args.use_multiple_gpu:
        gpt_model = DataParallel(gpt_model)

    # generate a continuation for every batch in the CNN/DM dataloader
    sample_id_list = []
    for ele in cnn_dataloader:
        input_ids, attention_mask, output_ids = ele
        if use_cuda:
            input_ids = input_ids.cuda()
            attention_mask = attention_mask.cuda()
        with torch.no_grad():
            # sample up to 100 tokens with nucleus sampling (top_p=0.9),
            # temperature 0.9 and a repetition penalty of 1.2
            result = sample_sequence(gpt_model,
                                     100,
                                     input_ids,
                                     attention_mask,
                                     args.method,
                                     repetition_penalty=1.2,
                                     top_p=0.9,
                                     temperature=0.9,
                                     eos_idx=gpt_tokenizer.eos_token_id)
            sample_id_list.append(result)

    # move the sampled ids back to the CPU, decode them to text and collapse
    # newlines so each generated summary fits on a single output line
    sample_id_list = [ele.cpu().numpy() for ele in sample_id_list]
    sample_list = decode_id_array(sample_id_list)
    sample_list = [re.sub(r"\n+", " ", ele) for ele in sample_list]

    output_iterator(os.path.join(args.output_dir, "generated_summaries.txt"),
                    sample_list)

    output_iterator(os.path.join(args.output_dir, "actual_summaries.txt"),
                    summary_list)
    # split long documents: a document shorter than num_partitions sentences is
    # kept whole, longer ones are broken up with the chunks helper; partition_map
    # records which original document each new piece came from
    # (new_sentence_list, partition_map and counter are initialised in code not shown here)
    for idx, ele in enumerate(doc_list):
        if len(ele) < num_partitions:
            new_sentence_list.append(" ".join(ele))
            partition_map[counter] = idx
            counter += 1
            continue

        for part in chunks(ele, num_partitions):
            new_sentence_list.append(" ".join(part))
            partition_map[counter] = idx
            counter += 1
    return new_sentence_list, partition_map


def merge_partition(partition_map, input_iter):
    # stitch the pieces back together per original document; partition_map keys
    # are looked up as strings (e.g. after a round-trip through JSON)
    max_length = max(partition_map.values()) + 1
    new_result_list = ["" for _ in range(max_length)]
    for idx, ele in enumerate(input_iter):
        new_result_list[partition_map[str(idx)]] += ele
        new_result_list[partition_map[str(idx)]] += " "
    return new_result_list


if __name__ == '__main__':
    input_dir = "/home/zxj/Documents/github/PacSum/extracted_parts/extracted_contents_all.txt"
    output_template = "/home/zxj/Documents/github/PacSum/extracted_parts/content_part_{0}.txt"
    doc_list = list(read_file(input_dir, preprocess=lambda x: x.strip()))
    # split the documents with the chunks helper and write each part to its own file
    for idx, ele in enumerate(chunks(doc_list, 7)):
        output_iterator(output_template.format(idx), ele)
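
The chunks helper used in Example #4 is not shown. A minimal sketch, assuming it splits a sequence into n roughly equal consecutive parts; the project's actual helper might instead yield fixed-size pieces:

def chunks(items, n):
    # yield n consecutive slices of items, as evenly sized as possible
    # (assumed semantics; not taken from the original project)
    size, remainder = divmod(len(items), n)
    start = 0
    for i in range(n):
        end = start + size + (1 if i < remainder else 0)
        yield items[start:end]
        start = end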