import os
import re
import time

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Project-local helpers (clean_text, preprocess_raw_sent, count_noun,
# sim_with_title, sim_2_sent, sim_with_doc, sim_with_title_of_paragraph,
# word_frequencies, evaluate_rouge, solution_for_exception, load_docs,
# Summerizer, ...) are assumed to come from the repo's own modules.


def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    for example in sub_stories:
        file_name = os.path.join(save_path, example[1])
        start_time = time.time()
        # Split on the literal " . " delimiter; the dot must be escaped,
        # otherwise "." matches any character.
        raw_sents = re.split(r" \. ", example[0])

        # Remove too-short sentences.
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[df['preprocess_raw'] != 'None']
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) < 5:
            continue

        preprocessed_sentences = []
        for raw_sent in raw_sentences:
            preprocessed_sentences.append(preprocess_raw_sent(raw_sent))
        if len(preprocessed_sentences) < 10:
            # Too few sentences for the solver: write the fallback summary
            # and skip this example.
            solution_for_exception(raw_sentences, file_name)
            continue

        title = preprocessed_sentences[0]

        # TF-IDF vectors for the individual sentences.
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        list_sentences_frequencies = vectors.todense().tolist()
        title_vector = list_sentences_frequencies[0]

        # TF-IDF for the whole document. Use transform (not fit_transform)
        # so the document vector shares the sentence vectors' feature space.
        document = [" ".join(preprocessed_sentences)]
        vector_doc = vectorizer.transform(document)
        document_vector = vector_doc.todense().tolist()[0]

        number_of_nouns = count_noun(raw_sentences, option=True)
        simWithTitle = sim_with_title(list_sentences_frequencies, title_vector)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)

        NUM_PICKED_SENTS = 4

        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params)
        best_individual = Solver.solve()
        print(file_name)

        if best_individual is None:
            solution_for_exception(raw_sentences, file_name)
        else:
            print(best_individual)
            Solver.show(best_individual, file_name)
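
# A minimal sketch of what the similarity helpers called above might look
# like, assuming plain cosine similarity over the dense TF-IDF row vectors.
# The signatures match this TF-IDF variant's call sites; the repo's real
# implementations may differ.
import math


def _cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm = math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v))
    return dot / norm if norm else 0.0


def sim_with_title(freq_vectors, title_vector):
    # One score per sentence: similarity to the title vector.
    return [_cosine(vec, title_vector) for vec in freq_vectors]


def sim_with_doc(freq_vectors, document_vector):
    # One score per sentence: similarity to the whole-document vector.
    return [_cosine(vec, document_vector) for vec in freq_vectors]


def sim_2_sent(freq_vectors):
    # Pairwise sentence-to-sentence similarity matrix.
    n = len(freq_vectors)
    return [[_cosine(freq_vectors[i], freq_vectors[j]) for j in range(n)]
            for i in range(n)]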
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    for example in sub_stories:
        start_time = time.time()
        # Each example is "title\n\nbody\n\nabstract".
        parts = re.split("\n\n", example[0])
        title = parts[0]
        raw_doc = parts[1]
        abstract = parts[2]

        # Sentence splitting, preprocessing, and title similarity are
        # computed paragraph-wise by this helper.
        raw_sentences, preprocessed_sentences, simWithTitle = \
            sim_with_title_of_paragraph(raw_doc)
        if len(raw_sentences) == 0:
            continue

        preprocessed_abs_sentences_list = []
        raw_abs_sent_list = abstract.split(' . ')
        for abs_sent in raw_abs_sent_list:
            preprocessed_abs_sentences_list.append(preprocess_raw_sent(abs_sent))
        if (len(preprocessed_abs_sentences_list) < 4
                or len(preprocessed_sentences) < 7):
            continue
        preprocessed_abs_sentences = " ".join(preprocessed_abs_sentences_list)

        # TF-IDF over body sentences plus the abstract and the title.
        bodyandtitle = preprocessed_sentences.copy()
        bodyandtitle.append(preprocess_raw_sent(title.lower()))
        full_text = preprocessed_sentences.copy()
        full_text.append(preprocessed_abs_sentences)
        full_text.append(preprocess_raw_sent(title.lower()))

        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(full_text)
        # On scikit-learn >= 1.0 this is get_feature_names_out().
        feature_names = vectorizer.get_feature_names()
        denselist = vectors.todense().tolist()
        df_tfidf = pd.DataFrame(denselist, columns=feature_names)  # inspection only
        title_vector = denselist[-1]

        # TF-IDF for the whole document and the abstract. Use transform so
        # both vectors share the sentence vectors' feature space.
        document = [" ".join(bodyandtitle), preprocessed_abs_sentences]
        dense_doc = vectorizer.transform(document).todense().tolist()
        document_vector = dense_doc[0]
        abstract_vector = dense_doc[1]
        # Everything but the last two rows (abstract, title) is a sentence.
        list_sentences_frequencies = denselist[:-2]

        number_of_nouns = 0  # noun counting disabled in this variant
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)
        simWithAbs = sim_with_doc(list_sentences_frequencies, abstract_vector)
        rougeforsentences = evaluate_rouge(raw_sentences, abstract)

        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

        NUM_PICKED_SENTS = min(len(preprocessed_sentences), 4)

        Solver = Summerizer(title, preprocessed_sentences, raw_sentences,
                            POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, simWithAbs,
                            rougeforsentences, order_params)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
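
# Hypothetical sketch of the evaluate_rouge helper used above, assuming it
# scores each sentence against the abstract with the `rouge` package and
# returns one ROUGE-L F1 value per sentence. The project's real helper may
# compute a different ROUGE variant or aggregation.
from rouge import Rouge


def evaluate_rouge(raw_sentences, abstract):
    scorer = Rouge()
    scores = []
    for sent in raw_sentences:
        try:
            result = scorer.get_scores(sent, abstract)[0]
            scores.append(result['rouge-l']['f'])
        except ValueError:
            # Rouge raises on empty hypotheses; treat those as score 0.
            scores.append(0.0)
    return scores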
def main():
    # Settings
    POPU_SIZE = 30
    MAX_GEN = 20
    CROSS_RATE = 0.8
    MUTATE_RATE = 0.4
    NUM_PICKED_SENTS = 4
    directory = 'stories'
    save_path = 'hyp'

    print("Setting: ")
    print("POPULATION SIZE: {}".format(POPU_SIZE))
    print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
    print("CROSSOVER RATE: {}".format(CROSS_RATE))
    print("MUTATION RATE: {}".format(MUTATE_RATE))

    # List of documents.
    stories = load_docs(directory)
    start_time = time.time()

    for example in stories:
        try:
            raw_sents = example[0].split(" . ")
            print("Preprocessing ", example[1])

            sentences = []
            sentences_for_NNP = []
            df = pd.DataFrame(raw_sents, columns=['raw'])
            df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
            newdf = df.loc[df['preprocess_raw'] != 'None']
            raw_sentences = newdf['preprocess_raw'].values.tolist()
            for raw_sent in raw_sentences:
                sentences.append(preprocess_raw_sent(raw_sent))
                sentences_for_NNP.append(preprocess_numberOfNNP(raw_sent))

            title_raw = raw_sentences[0]
            title = preprocess_raw_sent(title_raw)
            number_of_nouns = count_noun(sentences_for_NNP)
            simWithTitle = sim_with_title(sentences, title)
            sim2sents = sim_2_sent(sentences)
            simWithDoc = [sim_with_doc(sent, sentences) for sent in sentences]
            print("Done preprocessing!")

            Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                                MAX_GEN, CROSS_RATE, MUTATE_RATE,
                                NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                                sim2sents, number_of_nouns)
            best_individual = Solver.PSO()
            file_name = os.path.join(save_path, example[1])

            if best_individual is None:
                print('No solution.')
            else:
                print(file_name)
                print(best_individual)
                Solver.show(best_individual, file_name)
        except Exception as e:
            print(example[1])
            print("type error: " + str(e))

    # Average minutes per story, not total runtime.
    print("--- %s mins per story ---"
          % ((time.time() - start_time) / (60.0 * len(stories))))
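
# Standard entry-point guard (an assumption; the original module may invoke
# main() differently, e.g. from a separate runner script).
if __name__ == "__main__":
    main()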
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params, scheme):
    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n", example[0])

        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[df['preprocess_raw'] != 'None']
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        title_raw = raw_sentences[0]

        sentences = []
        sentences_for_NNP = []
        # Iterate over a copy: removing items from the list being iterated
        # would silently skip the element after each removal.
        for raw_sent in list(raw_sentences):
            sent = preprocess_raw_sent(raw_sent)
            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                raw_sentences.remove(raw_sent)
            else:
                sentences.append(sent)
                sentences_for_NNP.append(sent_tmp)
        title = preprocess_raw_sent(title_raw)

        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = [sim_with_doc(list_sentences_frequencies, index_sentence=i)
                      for i in range(len(sentences))]

        # Scale the generation budget with document length.
        if len(sentences) < 20:
            MAX_GEN = 20
        elif len(sentences) < 50:
            MAX_GEN = 50
        else:
            MAX_GEN = 80
        print("POPULATION SIZE: {}".format(POPU_SIZE))
        print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

        NUM_PICKED_SENTS = min(len(sentences), 4)
        MinLT = 1
        MaxLT = 7

        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params,
                            MinLT, MaxLT, scheme)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
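
# Hypothetical sketch of the count-based helpers this variant calls,
# assuming word_frequencies returns one bag-of-words vector per sentence
# with the title's vector as the final row, so that sim_with_title needs no
# second argument and sim_with_doc addresses sentences by index. The repo's
# real helpers may differ.
from collections import Counter


def word_frequencies(sentences, title):
    texts = sentences + [title]
    vocab = sorted({w for t in texts for w in t.split()})
    index = {w: i for i, w in enumerate(vocab)}
    rows = []
    for t in texts:
        row = [0] * len(vocab)
        for w, c in Counter(t.split()).items():
            row[index[w]] = c
        rows.append(row)
    return rows


def sim_with_title(freqs):
    # Last row is the title vector; score every sentence row against it.
    return [_cosine(row, freqs[-1]) for row in freqs[:-1]]


def sim_with_doc(freqs, index_sentence):
    # Compare one sentence against the summed whole-document vector.
    doc = [sum(col) for col in zip(*freqs[:-1])]
    return _cosine(freqs[index_sentence], doc)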
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    for example in sub_stories:
        start_time = time.time()
        raw_sents = re.split("\n", example[0])

        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[df['preprocess_raw'] != 'None']
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue
        title_raw = raw_sentences[0]

        # Preprocessing.
        sentences = []
        sentences_for_NNP = []
        # Iterate over a copy so removals don't skip the next element.
        for raw_sent in list(raw_sentences):
            sent = preprocess_raw_sent(raw_sent)
            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                raw_sentences.remove(raw_sent)
            else:
                sentences.append(sent)
                sentences_for_NNP.append(sent_tmp)
        title = preprocess_raw_sent(title_raw)

        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = [sim_with_doc(list_sentences_frequencies, index_sentence=i)
                      for i in range(len(sentences))]
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

        NUM_PICKED_SENTS = min(len(sentences), 4)

        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE,
                            NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                            sim2sents, number_of_nouns, order_params)
        best_individual = Solver.PSO()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
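
# Hypothetical driver showing how start_run could be fanned out over worker
# processes, which is what the unused processID parameter and the
# sub_stories chunks suggest. The chunking scheme, process count, and the
# GA settings (mirroring main()'s defaults) are assumptions, not confirmed
# by the original source.
import multiprocessing


def run_parallel(stories, save_path, order_params, n_procs=4):
    chunk_size = (len(stories) + n_procs - 1) // n_procs
    procs = []
    for pid in range(n_procs):
        sub = stories[pid * chunk_size:(pid + 1) * chunk_size]
        p = multiprocessing.Process(
            target=start_run,
            args=(pid, 30, 20, 0.8, 0.4, sub, save_path, order_params))
        procs.append(p)
        p.start()
    for p in procs:
        p.join()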