def read_in_paper(filename, sentences_as_lists=False, preserve_order=False):
    """
    Reads in a paper and returns it as a dictionary.
    :param filename: the name of the paper file to read (the file name only, not the full path).
    :param sentences_as_lists: if true, returns the sentences of the paper as lists of words rather than strings.
    :param preserve_order: if true, keeps track of the order in which sections occurred in the paper.
    :return: a dictionary of the form {section: list of sentences in that section}
    """
    paper_to_open = PAPER_SOURCE + filename
    paper_text = Reader().open_file_single_string(paper_to_open)

    # Strip any non-ASCII characters from the raw text. The final decode is
    # needed so that downstream tokenisation receives a string, not bytes.
    udata = paper_text.decode("utf-8")
    paper = udata.encode("ascii", "ignore").decode("ascii")

    return paper_tokenize(paper, sentences_as_lists=sentences_as_lists, preserve_order=preserve_order)
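# Usage sketch for read_in_paper(). The filename "example_paper.txt" is a
# hypothetical placeholder, and the section names depend entirely on what
# paper_tokenize() produces for the paper in question.
def _read_paper_sketch():
    """Print how many sentences appear in each section of one paper."""
    paper = read_in_paper("example_paper.txt", sentences_as_lists=True)
    for section, sentences in paper.items():
        print(section, "->", len(sentences), "sentences")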
def read_in_files():
    """
    Reads in all of the scientific paper data and parses it into a list of
    lists, where each inner list is a sentence in the form of a list of words.
    This is the form needed for the Word2Vec model.
    :return: the number of files processed, and the list of tokenised sentences.
    """
    num_files = len([name for name in os.listdir(DATA_SOURCE) if name.endswith(".txt")])
    loading_section_size = num_files / 30
    count = 0
    sentences_as_lists = []

    for filename in os.listdir(DATA_SOURCE):
        if filename.endswith(".txt"):

            # Render a simple text progress bar: one "-" per completed
            # thirtieth of the file count, padded with spaces.
            print("Processing Files: [", end="")
            for i in range(31, -1, -1):
                if count > i * loading_section_size:
                    for j in range(0, i):
                        print("-", end="")
                        sys.stdout.flush()
                    for j in range(i, 30):
                        print(" ", end="")
                        sys.stdout.flush()
                    break
            if count == num_files:
                print("] ", count, end="\n")
            else:
                print("] ", count, end="\r")
            sys.stdout.flush()

            # Open the paper and strip any non-ASCII characters, decoding
            # back to a string so the tokenisers receive text, not bytes.
            paper_to_open = DATA_SOURCE + filename
            paper = Reader().open_file_single_string(paper_to_open)
            udata = paper.decode("utf-8")
            paper = udata.encode("ascii", "ignore").decode("ascii")

            # Split the paper into sentences, and each sentence into a list of words.
            sentences = sent_tokenize(paper)
            for sentence in sentences:
                words = word_tokenize(sentence)
                sentences_as_lists.append(words)

            if DEBUG:
                print(sentences_as_lists)

            count += 1

    return count, sentences_as_lists
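# Sketch: training a Word2Vec model on the tokenised sentences returned by
# read_in_files(), which the docstring above identifies as the expected input
# format. This assumes gensim is installed; the hyperparameter values below
# are illustrative defaults, not tuned settings. (In gensim versions before
# 4.0 the "vector_size" argument was called "size".)
def _train_word2vec_sketch():
    """Train a Word2Vec model on the corpus sentences and return it."""
    from gensim.models import Word2Vec

    _, sentences = read_in_files()
    model = Word2Vec(
        sentences,        # list of sentences, each a list of word tokens
        vector_size=100,  # dimensionality of the word vectors (assumed value)
        window=5,         # context window size (assumed value)
        min_count=5,      # ignore words rarer than this (assumed value)
        workers=4,        # parallel training threads (assumed value)
    )
    return model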