Code Example #1
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):

    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n\n", example[0])[1].split(' . ')
        title = re.split("\n\n", example[0])[0]
        abstract = re.split("\n\n", example[0])[2]

        #remove too short sentences
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue

        preprocessed_sentences = []
        for raw_sent in raw_sentences:
            preprocessed_sent = preprocess_raw_sent(raw_sent)
            preprocessed_sentences.append(preprocessed_sent)

        preprocessed_abs_sentences_list = []
        raw_abs_sent_list = abstract.split(' . ')
        for abs_sent in raw_abs_sent_list:
            preprocessed_abs_sent = preprocess_raw_sent(abs_sent)
            preprocessed_abs_sentences_list.append(preprocessed_abs_sent)
        preprocessed_abs_sentences = " ".join(preprocessed_abs_sentences_list)

        if len(preprocessed_sentences) < 7 or len(
                preprocessed_abs_sentences_list) < 3:
            continue

        rougeforsentences = evaluate_rouge(raw_sentences, abstract)

        print("Done preprocessing!")

        print('time for processing', time.time() - start_time)
        # pick at most 4 sentences (fewer only for very short articles);
        # note: the count must come from the list, not the last loop variable
        if len(preprocessed_sentences) < 4:
            NUM_PICKED_SENTS = len(preprocessed_sentences)
        else:
            NUM_PICKED_SENTS = 4
        # DONE!
        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, rougeforsentences, abstract,
                            order_params)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])

        print(file_name)
        if best_individual is None:
            solution_for_exception(rougeforsentences, raw_sentences, file_name)
        else:
            print(best_individual)
            Solver.show(best_individual, file_name)
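
The three-way re.split("\n\n", ...) above assumes each story packs title, body, and abstract into blank-line-separated blocks, with sentences inside a block delimited by ' . '. A minimal, self-contained sketch of that assumed layout (the sample text is hypothetical, not taken from the real corpus):

import re

# (text, output filename) pair, mirroring how example[0] / example[1] are used above
example = ("A Title\n\nfirst body sentence . second body sentence\n\nshort abstract",
           "story_001.txt")
blocks = re.split("\n\n", example[0])
title, raw_sents, abstract = blocks[0], blocks[1].split(' . '), blocks[2]
print(title)      # 'A Title'
print(raw_sents)  # ['first body sentence', 'second body sentence']
print(abstract)   # 'short abstract'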
Code Example #2
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):

    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n\n", example[0])[1].split(' . ')
        title = re.split("\n\n", example[0])[0]
        abstract = re.split("\n\n", example[0])[2]

        #remove too short sentences
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue

        preprocessed_sentences = []
        for raw_sent in raw_sentences:
            preprocessed_sent = preprocess_raw_sent(raw_sent)
            preprocessed_sentences.append(preprocessed_sent)

        preprocessed_abs_sentences_list = []
        raw_abs_sent_list = abstract.split(' . ')
        for abs_sent in raw_abs_sent_list:
            preprocessed_abs_sent = preprocess_raw_sent(abs_sent)
            preprocessed_abs_sentences_list.append(preprocessed_abs_sent)
        preprocessed_abs_sentences = " ".join(preprocessed_abs_sentences_list)

        if len(preprocessed_sentences) < 7 or len(
                preprocessed_abs_sentences_list) < 3:
            continue

        rougeforsentences = evaluate_rouge(raw_sentences, abstract)
        rank_rougeforsentences = sorted(rougeforsentences,
                                        key=lambda x: x[1],
                                        reverse=True)
        length_of_summary = int(0.2 * len(raw_sentences))
        rank_rouge = rank_rougeforsentences[:length_of_summary]
        rank_rouge = sorted(rank_rouge, key=lambda x: x[2], reverse=False)
        print("Done preprocessing!")

        print('time for processing', time.time() - start_time)

        file_name = os.path.join(save_path, example[1])
        with open(file_name, 'w', encoding='utf-8') as f:
            for sent in rank_rouge:
                f.write(sent[0] + ' ')
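
The ranking step above reads cleanly in isolation: keep the top 20% of sentences by ROUGE score, then restore their original article order. A sketch with dummy values, assuming evaluate_rouge returns (sentence, score, position) triples (inferred from the x[1] and x[2] sort keys; the real return type is not shown here):

scored = [("s0", 0.10, 0), ("s1", 0.90, 1), ("s2", 0.50, 2),
          ("s3", 0.70, 3), ("s4", 0.20, 4), ("s5", 0.60, 5)]
k = int(0.2 * len(scored))                                  # 20% of the article
top = sorted(scored, key=lambda x: x[1], reverse=True)[:k]  # best scores first
summary = " ".join(s[0] for s in sorted(top, key=lambda x: x[2]))  # article order
print(summary)  # 's1'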
Code Example #3
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE, sub_stories, save_path, order_params):
    for example in sub_stories:
        file_name = os.path.join(save_path, example[1])
        start_time = time.time()
        raw_sents = re.split(" . ", example[0])
        #remove too short sentences
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) < 5:
            continue

        preprocessed_sentences = []
        for raw_sent in raw_sentences:
            preprocessed_sent = preprocess_raw_sent(raw_sent)
            preprocessed_sentences.append(preprocessed_sent)


        if len(preprocessed_sentences) < 10:
            solution_for_exception(raw_sentences, file_name)
            continue  # write the fallback summary and skip the GA run
        title = preprocessed_sentences[0]

        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        # get_feature_names() was removed in scikit-learn 1.2
        feature_names = vectorizer.get_feature_names_out()
        dense = vectors.todense()
        list_sentences_frequencies = dense.tolist()
        # df_tfidf = pd.DataFrame(list_sentences_frequencies, columns=feature_names)
        title_vector = list_sentences_frequencies[0]

        #tfidf for document and abstract
        document = [(" ").join(preprocessed_sentences)]
        vector_doc = vectorizer.fit_transform(document)
        dense_doc = vector_doc.todense()
        document_vector = dense_doc.tolist()[0]

        number_of_nouns = count_noun(raw_sentences, option=True)
        simWithTitle = sim_with_title(list_sentences_frequencies, title_vector)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)
        NUM_PICKED_SENTS = 4
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

            
        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params)
        best_individual = Solver.solve()

        print(file_name)
        if best_individual is None:
            solution_for_exception(raw_sentences, file_name)     
        else:
            print(best_individual)
            Solver.show(best_individual, file_name)
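
The helpers sim_with_title, sim_2_sent, and sim_with_doc are project code that is not shown in these snippets. Given that they receive TF-IDF row vectors, a plausible stand-in for sim_with_title is cosine similarity of each sentence vector against the title vector; treat this as an assumption, not the project's actual implementation:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def sim_with_title(sentence_vectors, title_vector):
    # hypothetical re-implementation: cosine similarity between each
    # sentence's TF-IDF row and the title's TF-IDF row
    t = np.asarray(title_vector).reshape(1, -1)
    return [float(cosine_similarity(np.asarray(v).reshape(1, -1), t)[0, 0])
            for v in sentence_vectors]

print(sim_with_title([[1.0, 0.0], [0.5, 0.5]], [1.0, 0.0]))  # [1.0, 0.707...]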
Code Example #4
def sim_with_title_of_paragraph(document):
    paragraphs = document.split('\n')
    sim_header = []
    raw_sents = []
    preprocessed_sents = []
    for para in paragraphs:
        raw = para.split(' . ')
        df = pd.DataFrame(raw, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')].copy()
        # derive the new column from the filtered frame, not from df,
        # to avoid SettingWithCopyWarning and skip filtered-out rows
        newdf['preprocessed_raw'] = newdf['preprocess_raw'].apply(
            lambda x: preprocess_raw_sent(x))

        raw_sentences = newdf['preprocess_raw'].values.tolist()
        preprocessed_sentences = newdf['preprocessed_raw'].values.tolist()

        # preprocessed_sentences = []
        # for raw_sent in raw_sentences:
        #     preprocessed_sent = preprocess_raw_sent(raw_sent)
        #     preprocessed_sentences.append(preprocessed_sent)

        raw_sents.extend(raw_sentences)
        preprocessed_sents.extend(preprocessed_sentences)

        #similar with header of paragraph
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        # get_feature_names() was removed in scikit-learn 1.2
        feature_names = vectorizer.get_feature_names_out()
        dense = vectors.todense()
        denselist = dense.tolist()
        df_tfidf = pd.DataFrame(denselist, columns=feature_names)
        simWithTitle = sim_with_title(denselist, denselist[0])
        sim_header.extend(simWithTitle)
        del df
        del newdf

    return raw_sents, preprocessed_sents, sim_header
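
A side note on the newdf column assignment inside the loop: deriving the new column from the filtered frame itself (after an explicit .copy()) avoids pandas' SettingWithCopyWarning and skips preprocessing rows that were already filtered out. A minimal demonstration with stand-in data:

import pandas as pd

df = pd.DataFrame({"preprocess_raw": ["Keep Me", "None", "Also Keep"]})
newdf = df.loc[df["preprocess_raw"] != "None"].copy()        # explicit copy
newdf["preprocessed_raw"] = newdf["preprocess_raw"].apply(str.lower)
print(newdf)  # two rows, with a lowercased 'preprocessed_raw' column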
Code Example #5
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):

    for example in sub_stories:
        start_time = time.time()
        raw_doc = re.split("\n\n", example[0])[1]
        title = re.split("\n\n", example[0])[0]
        abstract = re.split("\n\n", example[0])[2]

        #remove too short sentences
        # df = pd.DataFrame(raw_sents, columns =['raw'])
        # df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        # newdf = df.loc[(df['preprocess_raw'] != 'None')]
        # raw_sentences = newdf['preprocess_raw'].values.tolist()

        # preprocessed_sentences = []
        # for raw_sent in raw_sentences:
        #     preprocessed_sent = preprocess_raw_sent(raw_sent)
        #     preprocessed_sentences.append(preprocessed_sent)

        raw_sentences, preprocessed_sentences, simWithTitle = sim_with_title_of_paragraph(
            raw_doc)
        if len(raw_sentences) == 0:
            continue

        preprocessed_abs_sentences_list = []
        raw_abs_sent_list = abstract.split(' . ')
        for abs_sent in raw_abs_sent_list:
            preprocessed_abs_sent = preprocess_raw_sent(abs_sent)
            preprocessed_abs_sentences_list.append(preprocessed_abs_sent)

        if len(preprocessed_abs_sentences_list) < 4 or len(
                preprocessed_sentences) < 7:
            continue
        preprocessed_abs_sentences = " ".join(preprocessed_abs_sentences_list)

        #tfidf for sentences
        bodyandtitle = preprocessed_sentences.copy()
        bodyandtitle.append(preprocess_raw_sent(title.lower()))

        full_text = preprocessed_sentences.copy()
        full_text.append(preprocessed_abs_sentences)
        full_text.append(preprocess_raw_sent(title.lower()))

        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(full_text)
        # get_feature_names() was removed in scikit-learn 1.2
        feature_names = vectorizer.get_feature_names_out()
        dense = vectors.todense()
        denselist = dense.tolist()
        df_tfidf = pd.DataFrame(denselist, columns=feature_names)
        title_vector = denselist[-1]

        #tfidf for document and abstract
        document = [(" ").join(bodyandtitle), preprocessed_abs_sentences]
        vector_doc = vectorizer.fit_transform(document)
        dense_doc = vector_doc.todense()
        document_vector = dense_doc.tolist()[0]
        abstract_vector = dense_doc.tolist()[1]

        list_sentences_frequencies = denselist[:-2]

        # number_of_nouns = count_noun(preprocessed_sentences, option = True)
        number_of_nouns = 0

        # simWithTitle = sim_with_title(list_sentences_frequencies, title_vector)

        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)
        simWithAbs = sim_with_doc(list_sentences_frequencies, abstract_vector)
        rougeforsentences = evaluate_rouge(raw_sentences, abstract)
        print("Done preprocessing!")

        print('time for processing', time.time() - start_time)
        if len(preprocessed_sentences) < 4:
            NUM_PICKED_SENTS = len(preprocessed_sentences)
        else:
            NUM_PICKED_SENTS = 4
        # DONE!
        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, simWithAbs,
                            rougeforsentences, order_params)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
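
The slice denselist[:-2] above depends on the construction order of full_text: body sentences first, then the abstract, then the title, so the last two TF-IDF rows belong to the abstract and the title (which is why title_vector = denselist[-1]). A compact check of that indexing with toy strings:

from sklearn.feature_extraction.text import TfidfVectorizer

full_text = ["body sentence one", "body sentence two",
             "the abstract text", "the title text"]
dense = TfidfVectorizer().fit_transform(full_text).todense().tolist()
body_rows, title_vector = dense[:-2], dense[-1]
assert len(body_rows) == 2  # only the two body sentences remain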
Code Example #6
File: GA.py Project: dangtrunganh/psoga191
def main():
    # Setting Variables
    POPU_SIZE = 30
    MAX_GEN = 20
    CROSS_RATE = 0.8
    MUTATE_RATE = 0.4
    NUM_PICKED_SENTS = 4

    directory = 'stories'
    save_path = 'hyp'

    print("Setting: ")
    print("POPULATION SIZE: {}".format(POPU_SIZE))
    print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
    print("CROSSING RATE: {}".format(CROSS_RATE))
    print("MUTATION SIZE: {}".format(MUTATE_RATE))

    # list of documents
    stories = load_docs(directory)
    start_time = time.time()
    for example in stories:
        try:
            raw_sents = example[0].split(" . ")
            print("Preprocessing ", example[1])
            sentences = []
            sentences_for_NNP = []

            df = pd.DataFrame(raw_sents, columns=['raw'])
            df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
            newdf = df.loc[(df['preprocess_raw'] != 'None')]
            raw_sentences = newdf['preprocess_raw'].values.tolist()

            for raw_sent in raw_sentences:
                sent = preprocess_raw_sent(raw_sent)
                sent_tmp = preprocess_numberOfNNP(raw_sent)
                # print(f'time-preprocess_numberOfNNP = {time.time() - time_2} s')
                sentences.append(sent)
                sentences_for_NNP.append(sent_tmp)

            title_raw = raw_sentences[0]
            title = preprocess_raw_sent(title_raw)
            number_of_nouns = count_noun(sentences_for_NNP)

            simWithTitle = sim_with_title(sentences, title)
            sim2sents = sim_2_sent(sentences)
            simWithDoc = []
            for sent in sentences:
                simWithDoc.append(sim_with_doc(sent, sentences))

            print("Done preprocessing!")
            # DONE!

            Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                                MAX_GEN, CROSS_RATE, MUTATE_RATE,
                                NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                                sim2sents, number_of_nouns)
            best_individual = Solver.PSO()
            file_name = os.path.join(save_path, example[1])

            if best_individual is None:
                print('No solution.')
            else:
                print(file_name)
                print(best_individual)
                Solver.show(best_individual, file_name)
        except Exception as e:
            print(example[1])
            print("type error: " + str(e))

    print("--- %s mins ---" % ((time.time() - start_time) /
                               (60.0 * len(stories))))
Code Example #7
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params, scheme):
    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n", example[0])

        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue

        title_raw = raw_sentences[0]

        sentences = []
        sentences_for_NNP = []
        # iterate over a copy: calling remove() on the list being
        # iterated would silently skip the next element
        for raw_sent in raw_sentences[:]:
            sent = preprocess_raw_sent(raw_sent)
            # sent_tmp = preprocess_numberOfNNP(raw_sent)

            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                raw_sentences.remove(raw_sent)
            else:
                sentences.append(sent)
                sentences_for_NNP.append(sent_tmp)
        title = preprocess_raw_sent(title_raw)
        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = []
        # for sent in sentences:
        for i in range(len(sentences)):
            simWithDoc.append(
                sim_with_doc(list_sentences_frequencies, index_sentence=i))

        # POPU_SIZE = 40
        if len(sentences) < 20:
            MAX_GEN = 20
        elif len(sentences) < 50:
            MAX_GEN = 50
        else:
            MAX_GEN = 80

        print("POPULATION SIZE: {}".format(POPU_SIZE))
        print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))

        print("Done preprocessing!")
        # DONE!
        print('time for processing', time.time() - start_time)
        if len(sentences) < 4:
            NUM_PICKED_SENTS = len(sentences)
        else:
            NUM_PICKED_SENTS = 4

        MinLT = 1
        MaxLT = 7

        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE, NUM_PICKED_SENTS,
                            simWithTitle, simWithDoc, sim2sents,
                            number_of_nouns, order_params, MinLT, MaxLT,
                            scheme)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
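
The filtering loop above iterates over a copy (raw_sentences[:]) because calling remove() on the list being iterated silently skips the element that follows each removal. A short demonstration of the original bug:

xs = ["a", "", "", "b"]
for x in xs:          # buggy: mutates the list it is iterating over
    if not x:
        xs.remove(x)
print(xs)             # ['a', '', 'b'] -- the second empty string survives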
Code Example #8
def start_run(processID, sub_stories, save_path, word_embeddings):

    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n\n", example[0])[1].split(' . ')
        title = re.split("\n\n", example[0])[0]
        abstract = re.split("\n\n", example[0])[2]

        #remove too short sentences
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue

        preprocessed_sentences = []
        for raw_sent in raw_sentences:
            preprocessed_sent = preprocess_raw_sent(raw_sent)
            preprocessed_sentences.append(preprocessed_sent)

        preprocessed_abs_sentences_list = []
        raw_abs_sent_list = abstract.split(' . ')
        for abs_sent in raw_abs_sent_list:
            preprocessed_abs_sent = preprocess_raw_sent(abs_sent)
            preprocessed_abs_sentences_list.append(preprocessed_abs_sent)
        preprocessed_abs_sentences = " ".join(preprocessed_abs_sentences_list)

        if len(preprocessed_sentences) < 7 or len(
                preprocessed_abs_sentences_list) < 3:
            continue

        sentences = preprocessed_sentences.copy()

        sentence_vectors = []
        for i in sentences:
            if len(i) != 0:
                v = sum([
                    word_embeddings.get(w, np.zeros((50, )))
                    for w in i.split()
                ]) / (len(i.split()) + 0.001)
            else:
                v = np.zeros((50, ))
            sentence_vectors.append(v)

        sim_mat = np.zeros([len(sentences), len(sentences)])
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    sim_mat[i][j] = cosine_similarity(
                        sentence_vectors[i].reshape(1, 50),
                        sentence_vectors[j].reshape(1, 50))[0, 0]

        nx_graph = nx.from_numpy_array(sim_mat)
        try:
            scores = nx.pagerank(nx_graph)  # score of all sentences in article
        except Exception:
            continue
        scores_with_sentences = []
        for i in range(len(raw_sentences)):
            tmp = (raw_sentences[i], scores[i], i)
            scores_with_sentences.append(tmp)

        rank_scores_with_sentences = sorted(scores_with_sentences,
                                            key=lambda x: x[1],
                                            reverse=True)
        length_of_summary = int(0.2 * len(raw_sentences))
        rank_text = rank_scores_with_sentences[:length_of_summary]
        rank_text = sorted(rank_text, key=lambda x: x[2], reverse=False)

        print("Done preprocessing!")

        print('time for processing', time.time() - start_time)

        file_name = os.path.join(save_path, example[1])
        with open(file_name, 'w', encoding='utf-8') as f:
            for sent in rank_text:
                f.write(sent[0] + ' ')
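
The PageRank step is the one part of this variant that can fail (the power iteration may not converge), hence the try/except. nx.pagerank returns a dict keyed by node index, which is why scores[i] lines up with sentence i. A minimal self-contained check:

import networkx as nx
import numpy as np

sim = np.array([[0.0, 0.5], [0.5, 0.0]])
g = nx.from_numpy_array(sim)
print(nx.pagerank(g))  # {0: 0.5, 1: 0.5} -- dict keyed by node index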
Code Example #9
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    # def start_run(POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE, sub_stories, save_path, order_params):

    for example in sub_stories:

        start_time = time.time()
        # raw_sentences = re.split("\n\s+", example[0])
        raw_sents = re.split("\n", example[0])
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue

        # print('raw', len(raw_sentences), stories.index(example))
        title_raw = raw_sentences[0]
        # Preprocessing
        # print("Preprocessing...")
        sentences = []
        sentences_for_NNP = []
        # iterate over a copy: calling remove() on the list being
        # iterated would silently skip the next element
        for raw_sent in raw_sentences[:]:
            sent = preprocess_raw_sent(raw_sent)
            # sent_tmp = preprocess_numberOfNNP(raw_sent)

            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                raw_sentences.remove(raw_sent)
            else:
                sentences.append(sent)
                sentences_for_NNP.append(sent_tmp)
        title = preprocess_raw_sent(title_raw)
        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = []
        # for sent in sentences:
        for i in range(len(sentences)):
            simWithDoc.append(
                sim_with_doc(list_sentences_frequencies, index_sentence=i))

        print("Done preprocessing!")
        # DONE!
        print('time for processing', time.time() - start_time)
        if len(sentences) < 4:
            NUM_PICKED_SENTS = len(sentences)
        else:
            # NUM_PICKED_SENTS = x
            # NUM_PICKED_SENTS = int(len(sentences) * 0.2)
            NUM_PICKED_SENTS = 4

        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE, NUM_PICKED_SENTS,
                            simWithTitle, simWithDoc, sim2sents,
                            number_of_nouns, order_params)
        best_individual = Solver.PSO()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)