import os
import re
import time

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Project-local helpers (clean_text, preprocess_raw_sent, count_noun,
# sim_with_title, sim_2_sent, sim_with_doc, sim_with_title_of_paragraph,
# word_frequencies, evaluate_rouge, solution_for_exception, load_docs,
# Summerizer, ...) are assumed to come from the repo's own modules.


def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    for example in sub_stories:
        file_name = os.path.join(save_path, example[1])
        start_time = time.time()
        # Split on the literal " . " delimiter; the dot must be escaped,
        # otherwise "." matches any character.
        raw_sents = re.split(r" \. ", example[0])

        # Remove too-short sentences.
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[df['preprocess_raw'] != 'None']
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) < 5:
            continue

        preprocessed_sentences = []
        for raw_sent in raw_sentences:
            preprocessed_sentences.append(preprocess_raw_sent(raw_sent))
        if len(preprocessed_sentences) < 10:
            # Too few sentences for the solver: write the fallback summary
            # and skip this example.
            solution_for_exception(raw_sentences, file_name)
            continue

        title = preprocessed_sentences[0]

        # TF-IDF vectors for the individual sentences.
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        list_sentences_frequencies = vectors.todense().tolist()
        title_vector = list_sentences_frequencies[0]

        # TF-IDF for the whole document. Use transform (not fit_transform)
        # so the document vector shares the sentence vectors' feature space.
        document = [" ".join(preprocessed_sentences)]
        vector_doc = vectorizer.transform(document)
        document_vector = vector_doc.todense().tolist()[0]

        number_of_nouns = count_noun(raw_sentences, option=True)
        simWithTitle = sim_with_title(list_sentences_frequencies, title_vector)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)

        NUM_PICKED_SENTS = 4

        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params)
        best_individual = Solver.solve()
        print(file_name)

        if best_individual is None:
            solution_for_exception(raw_sentences, file_name)
        else:
            print(best_individual)
            Solver.show(best_individual, file_name)
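
# A minimal sketch of what the similarity helpers called above might look
# like, assuming plain cosine similarity over the dense TF-IDF row vectors.
# The signatures match this TF-IDF variant's call sites; the repo's real
# implementations may differ.
import math


def _cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm = math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v))
    return dot / norm if norm else 0.0


def sim_with_title(freq_vectors, title_vector):
    # One score per sentence: similarity to the title vector.
    return [_cosine(vec, title_vector) for vec in freq_vectors]


def sim_with_doc(freq_vectors, document_vector):
    # One score per sentence: similarity to the whole-document vector.
    return [_cosine(vec, document_vector) for vec in freq_vectors]


def sim_2_sent(freq_vectors):
    # Pairwise sentence-to-sentence similarity matrix.
    n = len(freq_vectors)
    return [[_cosine(freq_vectors[i], freq_vectors[j]) for j in range(n)]
            for i in range(n)]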
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    for example in sub_stories:
        start_time = time.time()
        # Each example is "title\n\nbody\n\nabstract".
        parts = re.split("\n\n", example[0])
        title = parts[0]
        raw_doc = parts[1]
        abstract = parts[2]

        # Sentence splitting, preprocessing, and title similarity are
        # computed paragraph-wise by this helper.
        raw_sentences, preprocessed_sentences, simWithTitle = \
            sim_with_title_of_paragraph(raw_doc)
        if len(raw_sentences) == 0:
            continue

        preprocessed_abs_sentences_list = []
        raw_abs_sent_list = abstract.split(' . ')
        for abs_sent in raw_abs_sent_list:
            preprocessed_abs_sentences_list.append(preprocess_raw_sent(abs_sent))
        if (len(preprocessed_abs_sentences_list) < 4
                or len(preprocessed_sentences) < 7):
            continue
        preprocessed_abs_sentences = " ".join(preprocessed_abs_sentences_list)

        # TF-IDF over body sentences plus the abstract and the title.
        bodyandtitle = preprocessed_sentences.copy()
        bodyandtitle.append(preprocess_raw_sent(title.lower()))
        full_text = preprocessed_sentences.copy()
        full_text.append(preprocessed_abs_sentences)
        full_text.append(preprocess_raw_sent(title.lower()))

        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(full_text)
        # On scikit-learn >= 1.0 this is get_feature_names_out().
        feature_names = vectorizer.get_feature_names()
        denselist = vectors.todense().tolist()
        df_tfidf = pd.DataFrame(denselist, columns=feature_names)  # inspection only
        title_vector = denselist[-1]

        # TF-IDF for the whole document and the abstract. Use transform so
        # both vectors share the sentence vectors' feature space.
        document = [" ".join(bodyandtitle), preprocessed_abs_sentences]
        dense_doc = vectorizer.transform(document).todense().tolist()
        document_vector = dense_doc[0]
        abstract_vector = dense_doc[1]
        # Everything but the last two rows (abstract, title) is a sentence.
        list_sentences_frequencies = denselist[:-2]

        number_of_nouns = 0  # noun counting disabled in this variant
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)
        simWithAbs = sim_with_doc(list_sentences_frequencies, abstract_vector)
        rougeforsentences = evaluate_rouge(raw_sentences, abstract)

        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

        NUM_PICKED_SENTS = min(len(preprocessed_sentences), 4)

        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, simWithAbs,
                            rougeforsentences, order_params)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
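
# Hypothetical sketch of the evaluate_rouge helper used above, assuming it
# scores each sentence against the abstract with the `rouge` package and
# returns one ROUGE-L F1 value per sentence. The project's real helper may
# compute a different ROUGE variant or aggregation.
from rouge import Rouge


def evaluate_rouge(raw_sentences, abstract):
    scorer = Rouge()
    scores = []
    for sent in raw_sentences:
        try:
            result = scorer.get_scores(sent, abstract)[0]
            scores.append(result['rouge-l']['f'])
        except ValueError:
            # Rouge raises on empty hypotheses; treat those as score 0.
            scores.append(0.0)
    return scores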
def main():
    # Settings
    POPU_SIZE = 30
    MAX_GEN = 20
    CROSS_RATE = 0.8
    MUTATE_RATE = 0.4
    NUM_PICKED_SENTS = 4
    directory = 'stories'
    save_path = 'hyp'

    print("Setting: ")
    print("POPULATION SIZE: {}".format(POPU_SIZE))
    print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
    print("CROSSOVER RATE: {}".format(CROSS_RATE))
    print("MUTATION RATE: {}".format(MUTATE_RATE))

    # List of documents.
    stories = load_docs(directory)
    start_time = time.time()

    for example in stories:
        try:
            raw_sents = example[0].split(" . ")
            print("Preprocessing ", example[1])

            sentences = []
            sentences_for_NNP = []
            df = pd.DataFrame(raw_sents, columns=['raw'])
            df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
            newdf = df.loc[df['preprocess_raw'] != 'None']
            raw_sentences = newdf['preprocess_raw'].values.tolist()
            for raw_sent in raw_sentences:
                sentences.append(preprocess_raw_sent(raw_sent))
                sentences_for_NNP.append(preprocess_numberOfNNP(raw_sent))

            title_raw = raw_sentences[0]
            title = preprocess_raw_sent(title_raw)
            number_of_nouns = count_noun(sentences_for_NNP)
            simWithTitle = sim_with_title(sentences, title)
            sim2sents = sim_2_sent(sentences)
            simWithDoc = [sim_with_doc(sent, sentences) for sent in sentences]
            print("Done preprocessing!")

            Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                                MAX_GEN, CROSS_RATE, MUTATE_RATE,
                                NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                                sim2sents, number_of_nouns)
            best_individual = Solver.PSO()
            file_name = os.path.join(save_path, example[1])

            if best_individual is None:
                print('No solution.')
            else:
                print(file_name)
                print(best_individual)
                Solver.show(best_individual, file_name)
        except Exception as e:
            print(example[1])
            print("type error: " + str(e))

    # Average minutes per story, not total runtime.
    print("--- %s mins per story ---"
          % ((time.time() - start_time) / (60.0 * len(stories))))
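
# Standard entry-point guard (an assumption; the original module may invoke
# main() differently, e.g. from a separate runner script).
if __name__ == "__main__":
    main()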
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params, scheme):
    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n", example[0])

        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[df['preprocess_raw'] != 'None']
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        title_raw = raw_sentences[0]

        sentences = []
        sentences_for_NNP = []
        # Iterate over a copy: removing items from the list being iterated
        # would silently skip the element after each removal.
        for raw_sent in list(raw_sentences):
            sent = preprocess_raw_sent(raw_sent)
            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                raw_sentences.remove(raw_sent)
            else:
                sentences.append(sent)
                sentences_for_NNP.append(sent_tmp)
        title = preprocess_raw_sent(title_raw)

        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = [sim_with_doc(list_sentences_frequencies, index_sentence=i)
                      for i in range(len(sentences))]

        # Scale the generation budget with document length.
        if len(sentences) < 20:
            MAX_GEN = 20
        elif len(sentences) < 50:
            MAX_GEN = 50
        else:
            MAX_GEN = 80
        print("POPULATION SIZE: {}".format(POPU_SIZE))
        print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

        NUM_PICKED_SENTS = min(len(sentences), 4)
        MinLT = 1
        MaxLT = 7

        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params,
                            MinLT, MaxLT, scheme)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
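
# Hypothetical sketch of the count-based helpers this variant calls,
# assuming word_frequencies returns one bag-of-words vector per sentence
# with the title's vector as the final row, so that sim_with_title needs no
# second argument and sim_with_doc addresses sentences by index. The repo's
# real helpers may differ.
from collections import Counter


def word_frequencies(sentences, title):
    texts = sentences + [title]
    vocab = sorted({w for t in texts for w in t.split()})
    index = {w: i for i, w in enumerate(vocab)}
    rows = []
    for t in texts:
        row = [0] * len(vocab)
        for w, c in Counter(t.split()).items():
            row[index[w]] = c
        rows.append(row)
    return rows


def sim_with_title(freqs):
    # Last row is the title vector; score every sentence row against it.
    return [_cosine(row, freqs[-1]) for row in freqs[:-1]]


def sim_with_doc(freqs, index_sentence):
    # Compare one sentence against the summed whole-document vector.
    doc = [sum(col) for col in zip(*freqs[:-1])]
    return _cosine(freqs[index_sentence], doc)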
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n", example[0])

        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[df['preprocess_raw'] != 'None']
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        title_raw = raw_sentences[0]

        # Preprocessing.
        sentences = []
        sentences_for_NNP = []
        # Iterate over a copy so removals don't skip the next element.
        for raw_sent in list(raw_sentences):
            sent = preprocess_raw_sent(raw_sent)
            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                raw_sentences.remove(raw_sent)
            else:
                sentences.append(sent)
                sentences_for_NNP.append(sent_tmp)
        title = preprocess_raw_sent(title_raw)

        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = [sim_with_doc(list_sentences_frequencies, index_sentence=i)
                      for i in range(len(sentences))]
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

        NUM_PICKED_SENTS = min(len(sentences), 4)

        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params)
        best_individual = Solver.PSO()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
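
# Hypothetical driver showing how start_run could be fanned out over worker
# processes, which is what the unused processID parameter and the
# sub_stories chunks suggest. The chunking scheme, process count, and the
# GA settings (mirroring main()'s defaults) are assumptions, not confirmed
# by the original source.
import multiprocessing


def run_parallel(stories, save_path, order_params, n_procs=4):
    chunk_size = (len(stories) + n_procs - 1) // n_procs
    procs = []
    for pid in range(n_procs):
        sub = stories[pid * chunk_size:(pid + 1) * chunk_size]
        p = multiprocessing.Process(
            target=start_run,
            args=(pid, 30, 20, 0.8, 0.4, sub, save_path, order_params))
        procs.append(p)
        p.start()
    for p in procs:
        p.join()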