def create_supplemental_sentence_structures(supp_file_path):
    """
    Create SentenceStructures from supplemental documents.

    Every entry in ``supp_file_path`` is opened as a text file. Each line of
    the file becomes one SentenceStructure (built from the raw line and the
    document name) whose ``modified_sentence`` is the line at the same index
    of the preprocessed document text.

    :param supp_file_path: Path to directory where supplemental documents are located
    :return: Dictionary of lists of SentenceStructure objects keyed on document name stripped of extension
    :raises IndexError: If preprocessing yields fewer lines than the document contains
    """
    # Create a dictionary of documents
    doc_dictionary = {}

    # cd into the supplemental directory; the finally clause guarantees the
    # caller's working directory is restored even if a document fails.
    cwd = os.getcwd()
    os.chdir(supp_file_path)
    try:
        # Iterate over documents in the supp_file_path directory
        for document in os.listdir():
            # Read the document once; the raw lines keep their line endings,
            # so joining them reproduces the full text for preprocessing.
            with open(document, "r") as doc:
                sentences = doc.readlines()

            doc_text_processed_split = preprocess("".join(sentences)).splitlines()

            # Each raw line needs a preprocessed counterpart at the same
            # index; fail early with a message naming the document.
            if len(doc_text_processed_split) < len(sentences):
                raise IndexError(
                    "Preprocessed text of " + str(document) + " has "
                    + str(len(doc_text_processed_split)) + " lines but the document has "
                    + str(len(sentences))
                )

            # Strip the extension from the file to get the document name
            doc_name = os.path.splitext(document)[0]

            # One SentenceStructure per sentence (line) in the document.
            # Surplus preprocessed lines, if any, are ignored.
            doc_sentence_structure_list = []
            for sentence, processed_sentence in zip(sentences, doc_text_processed_split):
                ss = SentenceStructure(sentence, doc_name)
                ss.modified_sentence = processed_sentence
                doc_sentence_structure_list.append(ss)

            # Add the SentenceStructure list to the dictionary
            doc_dictionary[doc_name] = doc_sentence_structure_list
    finally:
        # Return to original path
        os.chdir(cwd)

    return doc_dictionary
def create_sentence_structures(raw_file_path):
    """
    Iterates through all documents in the directory specified in the params
    and creates a SentenceStructure object for each sentence (line).

    Each SentenceStructure's ``modified_sentence`` is currently the lowercased
    raw line. The preprocessed text is only used to check that preprocessing
    kept the number of lines unchanged.

    If building the SentenceStructures for a document fails, "ERR. <document>"
    is printed and the process exits with a non-zero status.

    :param raw_file_path: Path to directory where raw documents are located
    :return: Tuple of (dictionary of lists of SentenceStructure objects keyed
        on document name stripped of extension, largest
        ``len(original_sentence_array)`` seen across all sentences)
    :raises AssertionError: If the number of SentenceStructures differs from
        the number of preprocessed lines
    """
    # Create a dictionary of documents
    doc_dictionary = {}
    max_sentence_length = 0

    # cd into the raw file directory; the finally clause guarantees the
    # caller's working directory is restored even if a document fails.
    cwd = os.getcwd()
    os.chdir(raw_file_path)
    try:
        # Iterate over documents in the raw_file_path directory
        for document in os.listdir():
            # Read the document once; the raw lines keep their line endings,
            # so joining them reproduces the full text for preprocessing.
            with open(document, "r") as doc:
                sentences = doc.readlines()

            doc_text_processed_split = preprocess("".join(sentences)).splitlines()

            # One SentenceStructure per sentence (line) in the document
            doc_sentence_structure_list = []
            try:
                for sentence in sentences:
                    ss = SentenceStructure(sentence)
                    # TODO(Jeff) Readd Preprocessed text (take the line at the
                    # same index of doc_text_processed_split instead).
                    ss.modified_sentence = sentence.lower()
                    max_sentence_length = max(
                        max_sentence_length, len(ss.original_sentence_array)
                    )
                    doc_sentence_structure_list.append(ss)
            except Exception:
                # Only real errors are trapped (not KeyboardInterrupt or
                # SystemExit), and the failure is reported to the parent
                # process through a non-zero exit status.
                print("ERR. " + str(document))
                sys.exit(1)

            assert len(doc_sentence_structure_list) == len(doc_text_processed_split), (
                "Assertion Failed, array lengths don't match. \n"
                + str(len(doc_sentence_structure_list))
                + " "
                + str(len(doc_text_processed_split))
            )

            # Strip the extension from the file to get the document name
            doc_name = os.path.splitext(document)[0]

            # Add the SentenceStructure list to the dictionary
            doc_dictionary[doc_name] = doc_sentence_structure_list
    finally:
        # Return to original path
        os.chdir(cwd)

    return doc_dictionary, max_sentence_length