def read_definite_non_summary_section_titles():
    """
    Reads the list of section titles from which summary statements very rarely come.
    :return: the set of such section titles.
    """
    return set(Reader().open_file(
        BASE_DIR + "/Data/Utility_Data/definite_non_summary_titles.txt"))
def read_stopwords():
    """
    Reads the list of stop words, such as "and" and "or", that should not be included in the
    scoring of each sentence.
    :return: the set of stop words and punctuation symbols.
    """
    common_words = Reader().open_file(STOPWORD_SOURCE) + list(string.punctuation)
    return set(common_words)
def read_section_titles():
    """
    Reads the section titles permitted in the paper.
    :return: a set of the permitted titles.
    """
    return set(Reader().open_file(BASE_DIR + "/Data/Utility_Data/permitted_titles.txt"))
def read_in_paper(filename, sentences_as_lists=False, preserve_order=False):
    """
    Reads in a paper and returns it as a dictionary.
    :param filename: the name of the paper file to read, not the full path to the paper.
    :param sentences_as_lists: if true, returns the sentences of the paper as lists of words
                               rather than as strings.
    :param preserve_order: if true, keeps track of the order in which sections occurred in the paper.
    :return: a dictionary of the form (section: list of sentences in that section).
    """
    paper_to_open = PAPER_SOURCE + filename
    paper_text = Reader().open_file_single_string(paper_to_open)

    # Strip any non-ASCII characters from the paper text
    udata = paper_text.decode("utf-8")
    paper = udata.encode("ascii", "ignore")

    return paper_tokenize(paper, sentences_as_lists=sentences_as_lists, preserve_order=preserve_order)
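# A minimal usage sketch, not part of the original pipeline: it shows how read_in_paper might be
# called. "example_paper.txt" is a hypothetical filename assumed to exist under PAPER_SOURCE, and
# the keys of the returned dictionary depend on which permitted section titles occur in that paper.
def _example_read_in_paper():
    """Illustrative only: prints each section title and the number of sentences under it."""
    paper = read_in_paper("example_paper.txt", sentences_as_lists=True)
    for section, sentences in paper.items():
        print(section, "->", len(sentences), "sentences")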
def read_in_files():
    """
    Reads in all of the scientific paper data and parses it into a list of lists, where each item
    in the list is a sentence, in the form of a list of words. This is the form needed for the
    Word2Vec model.
    :return: a tuple of the number of files processed and the list of tokenised sentences.
    """
    num_files = len(
        [name for name in os.listdir(DATA_SOURCE) if name.endswith(".txt")])
    loading_section_size = num_files / 30
    count = 0

    sentences_as_lists = []
    for filename in os.listdir(DATA_SOURCE):
        if filename.endswith(".txt"):

            # Pretty loading bar
            print("Processing Files: [", end="")
            for i in range(31, -1, -1):
                if count > i * loading_section_size:
                    for j in range(0, i):
                        print("-", end="")
                        sys.stdout.flush()
                    for j in range(i, 30):
                        print(" ", end="")
                        sys.stdout.flush()
                    break
            if count == num_files:
                print("] ", count, end="\n")
            else:
                print("] ", count, end="\r")
            sys.stdout.flush()

            # Open the paper and strip any non-ASCII characters
            paper_to_open = DATA_SOURCE + filename
            paper = Reader().open_file_single_string(paper_to_open)
            udata = paper.decode("utf-8")
            paper = udata.encode("ascii", "ignore")

            # Split the data into a list of sentences, where each sentence is a list of words
            sentences = sent_tokenize(paper)
            for sentence in sentences:
                words = word_tokenize(sentence)
                sentences_as_lists.append(words)

            if DEBUG:
                print(sentences_as_lists)

            count += 1

    return count, sentences_as_lists
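# A sketch, not part of the original code, of how the tokenised sentences returned by
# read_in_files could be fed to gensim's Word2Vec. gensim is assumed to be installed; the
# hyperparameters below only mirror the numbers in the model filename
# ("100features_5minwords_20context") and are otherwise assumptions.
def _example_train_word2vec():
    """Illustrative only: trains and saves a Word2Vec model on the tokenised sentences."""
    from gensim.models import Word2Vec
    _, sentences = read_in_files()
    # "vector_size" is the gensim 4.x keyword; older gensim 3.x releases call it "size".
    model = Word2Vec(sentences, vector_size=100, min_count=5, window=20)
    model.save(MODEL_SOURCE)
    return model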
def paper_tokenize(text, sentences_as_lists=False, preserve_order=False):
    """
    Takes a paper with the sections delineated by '@&#' and splits them into a dictionary where
    the key is the section and the value is the text under that section. This could probably be
    a bit more efficient but it works well enough.
    :param text: the text of the paper to split.
    :param sentences_as_lists: if true, returns the text of each sentence as a list of words.
    :param preserve_order: if true, tracks the order in which the paper sections occurred.
    :return: a dictionary of the form (section: section_text).
    """
    permitted_titles = set(Reader().open_file(PERMITTED_TITLES_SOURCE))

    # Split the text into sections
    if preserve_order:
        split_text_1 = re.split("@&#", text)
        split_text = zip(split_text_1, range(len(split_text_1)))
    else:
        split_text = re.split("@&#", text)

    # Abbreviations whose trailing full stops confuse the NLTK sentence tokeniser; each one is
    # rewritten without the full stop so it is not mistaken for a sentence boundary.
    abbreviations = [
        ("etal.", "etal"), ("et al.", "etal"), ("Fig.", "Fig"), ("fig.", "fig"),
        ("Eq.", "Eq"), ("eq.", "eq"), ("pp.", "pp"), ("i.e.", "ie"), ("e.g.", "eg"),
        ("ref.", "ref"), ("Ref.", "Ref"), ("etc.", "etc"), ("Figs.", "Figs"),
        ("figs.", "figs"), ("No.", "No"), ("eqs.", "eqs"),
    ]

    # The key value. This value is changed if a permitted section title is encountered in the list.
    state = ""

    # After the for loop, this dictionary will have keys relating to each permitted section, and
    # values corresponding to the text of that section.
    sentences_with_states = defaultdict(str)
    section_counts = defaultdict(int)

    if preserve_order:
        for text, pos in split_text:
            # Hack for proper sentence tokenisation because the NLTK tokeniser doesn't work
            # properly when tokenising papers.
            for abbreviation, replacement in abbreviations:
                text = text.replace(abbreviation, replacement)

            # Checks if text is a section title
            if text.lower() in permitted_titles:
                state = text
                section_counts[state] += 1
            else:
                if section_counts[state] > 1:
                    state = state + "_" + str(section_counts[state])
                if sentences_as_lists:
                    sentences_with_states[state] = (
                        [preprocess_sentence(x) for x in sent_tokenize(text)], pos)
                else:
                    sentences_with_states[state] = (text, pos)
    else:
        for text in split_text:
            # Hack for proper sentence tokenisation because the NLTK tokeniser doesn't work
            # properly when tokenising papers.
            for abbreviation, replacement in abbreviations:
                text = text.replace(abbreviation, replacement)

            # Checks if text is a section title
            if text.lower() in permitted_titles:
                state = text
                section_counts[state] += 1
            else:
                if section_counts[state] > 1:
                    state = state + "_" + str(section_counts[state])
                if sentences_as_lists:
                    sentences_with_states[state] = [preprocess_sentence(x) for x in sent_tokenize(text)]
                else:
                    sentences_with_states[state] = text

    return sentences_with_states
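# An illustrative sketch, not part of the original code, of the input format paper_tokenize
# expects: section titles and section bodies alternate, each preceded by the '@&#' delimiter.
# The text below is made up, and the resulting keys depend on which titles are listed in the
# file at PERMITTED_TITLES_SOURCE (here "abstract" and "conclusion" are assumed to be permitted).
def _example_paper_tokenize():
    """Illustrative only: tokenises a tiny hand-written paper string."""
    raw = "@&#ABSTRACT@&#We propose a method. It works well.@&#CONCLUSION@&#The method is effective."
    sections = paper_tokenize(raw)
    # Expected shape: sections["ABSTRACT"] holds the abstract text and
    # sections["CONCLUSION"] holds the conclusion text.
    return sections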
# Location of the file with stopwords in it
STOPWORD_SOURCE = BASE_DIR + "/Data/Utility_Data/common_words.txt"

# Location of the Word2Vec model
MODEL_SOURCE = BASE_DIR + "/Word2Vec/Word2Vec_Models/summarisation_100features_5minwords_20context"

# Location to write the global bag-of-words counts for each paper and the total word count
GLOBAL_WORDCOUNT_WRITE_LOC = BASE_DIR + "/Data/Utility_Data/Global_Counts/"

# These may not be needed anymore
DD_CLASSIFICATION_MODEL_LOC_VECS = BASE_DIR + "/Trained_Models/DoubleDataset/lr_c2_5_vector.pkl"
DD_CLASSIFICATION_MODEL_LOC_FEATS = BASE_DIR + "/Trained_Models/DoubleDataset/lr_c2_5_feature.pkl"
W2V_CLASSIFICATION_MODEL_LOC = BASE_DIR + "/Trained_Models/Word2Vec_Classifier/lr_highlights_only_c3_66.pkl"

# Reads the stopwords as a set
STOPWORDS = set(Reader().open_file(STOPWORD_SOURCE) + list(string.punctuation))

# Counts how many papers there are in the paper directory
NUMBER_OF_PAPERS = len([name for name in os.listdir(PAPER_SOURCE) if name.endswith(".txt")])

# Integers defining the possible locations of sentences in the paper
HIGHLIGHT = 0
ABSTRACT = 1
INTRODUCTION = 2
RESULT_DISCUSSION = 3
METHOD = 4
CONCLUSION = 5
OTHER = 6

# An object which can compare two sentences and tell how similar they are / if they are the same
SENTENCE_COMPARATOR_OBJ = SentenceComparator()