def analyze_text(self, book_folder, out_folder):
    """Run the name-extraction and de-aliasing pipeline on ``self.input_file``.

    A ``Novel`` is built from ``book_folder + self.input_file`` and the
    intermediate and final results are written into a per-book folder,
    ``out_folder + <name> + "/"``, where ``<name>`` is the part of
    ``self.input_file`` before its first ``'.'``. The processed ``Novel``
    is kept on ``self.novel``.

    Args:
        book_folder: Path prefix concatenated directly with
            ``self.input_file``; it must already end with a path
            separator.
        out_folder: Path prefix concatenated directly with the book name;
            it must already end with a path separator.

    Returns:
        None.
    """
    # NOTE(review): everything after the first '.' is dropped, so
    # "my.book.txt" yields "my". Kept as is because the output names
    # derive from it.
    filename = self.input_file.split('.')[0]
    result_book_folder = out_folder + filename + "/"

    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test: the call succeeds if the directory is already
    # there, including when another process creates it concurrently.
    os.makedirs(os.path.dirname(result_book_folder), exist_ok=True)

    novel = Novel(book_folder + self.input_file)
    novel.read()
    novel.parse_persons()
    novel.store(filename=result_book_folder + self.all_names,
                data=novel.persons)

    # if you do not remove single occurrences, eps behaviour will be unstable
    occurrence_limit = 2
    novel.remove_less_than(occurrences=occurrence_limit)
    novel.store(
        filename=(result_book_folder + filename + "_names_more_than_"
                  + str(occurrence_limit) + ".csv"),
        data=novel.persons,
    )

    novel.cluster_aliases()
    novel.associate_single_names()
    novel.store(filename=result_book_folder + self.clusters,
                data=novel.cluster_repetitions)

    novel.dealiases()
    novel.store(filename=result_book_folder + self.output_file,
                data=novel.dealiased_text, type='txt')

    self.novel = novel
def analyze_text(self, book_folder, out_folder):
    """Run the full name-extraction, de-aliasing and coreference pipeline.

    A ``Novel`` is built from ``book_folder + self.input_file`` and the
    intermediate and final results are written into a per-book folder,
    ``out_folder + <name> + "/"``, where ``<name>`` is the part of
    ``self.input_file`` before its first ``'.'``. The processed ``Novel``
    is kept on ``self.novel``.

    Files written into the per-book folder, in order:
        * ``self.all_names``: ``novel.persons`` after parsing and title
          detection.
        * ``<name>_names_more_than_2.csv``: ``novel.persons`` after
          ``remove_less_than``.
        * ``self.clusters``: ``novel.cluster_repetitions``.
        * ``<name>.pkl``: pickled ``novel.cluster_repetitions_df``.
        * ``<name>_dealiased.txt``: ``novel.dealiased_text`` after
          ``dealiases()``.
        * ``self.output_file``: ``novel.dealiased_text`` after
          ``coreference()``.

    Args:
        book_folder: Path prefix concatenated directly with
            ``self.input_file``; it must already end with a path
            separator.
        out_folder: Path prefix concatenated directly with the book name;
            it must already end with a path separator.

    Returns:
        ``novel.cluster_repetitions_df``, as built by
        ``novel.create_cluster_repetitions_df()``.
    """
    # NOTE(review): everything after the first '.' is dropped, so
    # "my.book.txt" yields "my". Kept as is because the output names
    # derive from it.
    filename = self.input_file.split('.')[0]
    result_book_folder = out_folder + filename + "/"

    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test: the call succeeds if the directory is already
    # there, including when another process creates it concurrently.
    os.makedirs(os.path.dirname(result_book_folder), exist_ok=True)

    novel = Novel(book_folder + self.input_file)
    novel.read()
    novel.parse_persons()
    novel.find_persons_title()
    novel.store(filename=result_book_folder + self.all_names,
                data=novel.persons)

    # if you do not remove single occurrences, eps behaviour will be unstable
    occurrence_limit = 2
    novel.remove_less_than(occurrences=occurrence_limit)
    novel.store(
        filename=(result_book_folder + filename + "_names_more_than_"
                  + str(occurrence_limit) + ".csv"),
        data=novel.persons,
    )

    novel.cluster_aliases()
    novel.associate_simple_single_names()
    novel.associate_single_names()
    novel.store(filename=result_book_folder + self.clusters,
                data=novel.cluster_repetitions)

    novel.create_cluster_repetitions_df()
    novel.cluster_repetitions_df.to_pickle(
        result_book_folder + filename + '.pkl')

    novel.dealiases()
    novel.store(filename=result_book_folder + filename + "_dealiased.txt",
                data=novel.dealiased_text, type='txt')

    # Run the coreference step after de-aliasing: coreference sometimes
    # writes a name immediately after a separator, which would otherwise
    # leave names glued together (e.g. "Potter,Hermione").
    novel.coreference()
    novel.store(filename=result_book_folder + self.output_file,
                data=novel.dealiased_text, type='txt')

    self.novel = novel
    return novel.cluster_repetitions_df