def CreateSupplementalSentenceStructures(supp_file_path):
    """
    Create SentenceStructures from supplemental documents.

    Every entry that os.listdir() reports for supp_file_path is opened as a text file.
    Each raw line of a document becomes one SentenceStructure, constructed with the line
    and the document name, and its modifiedSentence attribute is set to the line found at
    the same index of the preprocessed document text.

    The working directory is switched to supp_file_path while the documents are read and
    is restored before the function exits, including when an exception is raised.

    :param supp_file_path: Path to directory where supplemental documents are located
    :return: Dictionary of lists of SentenceStructure objects keyed on document name stripped of extension
    :raises IndexError: If the preprocessed text has fewer lines than the raw document
    """
    docDictionary = {}

    # Documents are opened by bare file name, so work from inside the directory.
    cwd = os.getcwd()
    os.chdir(supp_file_path)
    try:
        for document in os.listdir():
            # Read the raw lines once; joining them reproduces the full text for preprocessing.
            with open(document, "r") as doc:
                rawSentences = doc.readlines()

            # NOTE: str.splitlines() breaks on more separators than readlines() does
            # (e.g. form feed), so the two line counts are not guaranteed to match.
            docTextProcessedSplit = preprocess("".join(rawSentences)).splitlines()

            # Strip the extension from the file to get the document name
            docName = os.path.splitext(document)[0]

            # One SentenceStructure per raw line, paired by index with its preprocessed line
            docSentenceStructureList = []
            for counter, sentence in enumerate(rawSentences):
                ss = SentenceStructure(sentence, docName)
                ss.modifiedSentence = docTextProcessedSplit[counter]
                docSentenceStructureList.append(ss)

            docDictionary[docName] = docSentenceStructureList
    finally:
        # Always return to the original path, even if a document failed to load.
        os.chdir(cwd)

    return docDictionary
def CreateSentenceStructures(raw_file_path):
    """
    Create SentenceStructures from raw documents.

    Every entry that os.listdir() reports for raw_file_path is opened as a text file.
    Each raw line of a document becomes one SentenceStructure, and its modifiedSentence
    attribute is set to the line found at the same index of the preprocessed document text.

    The working directory is switched to raw_file_path while the documents are read and
    is restored before the function exits, including when an exception is raised.

    :param raw_file_path: Path to directory where raw documents are located
    :return: Dictionary of lists of SentenceStructure objects keyed on document name stripped of extension
    :raises SystemExit: With status 1, after printing "ERR. <document>", if building the
        SentenceStructures for a document fails
    :raises AssertionError: If the number of SentenceStructures differs from the number of
        preprocessed lines
    """
    docDictionary = {}

    # Documents are opened by bare file name, so work from inside the directory.
    cwd = os.getcwd()
    os.chdir(raw_file_path)
    try:
        for document in os.listdir():
            # Read the raw lines once; joining them reproduces the full text for preprocessing.
            with open(document, "r") as doc:
                rawSentences = doc.readlines()

            # NOTE: str.splitlines() breaks on more separators than readlines() does
            # (e.g. form feed), so the two line counts are not guaranteed to match;
            # the length check below guards against that.
            docTextProcessedSplit = preprocess("".join(rawSentences)).splitlines()

            # One SentenceStructure per raw line, paired by index with its preprocessed line
            docSentenceStructureList = []
            try:
                for counter, sentence in enumerate(rawSentences):
                    ss = SentenceStructure(sentence)
                    ss.modifiedSentence = docTextProcessedSplit[counter]
                    docSentenceStructureList.append(ss)
            except Exception:
                # Only real errors are reported here; KeyboardInterrupt and SystemExit
                # propagate untouched. Exit non-zero so the failure is visible to the shell.
                print("ERR. " + str(document))
                sys.exit(1)

            # Raised explicitly rather than via `assert` so the check survives `python -O`.
            if len(docSentenceStructureList) != len(docTextProcessedSplit):
                raise AssertionError(
                    "Assertion Failed, array lengths don't match. "
                    + str(len(docSentenceStructureList))
                    + " "
                    + str(len(docTextProcessedSplit))
                )

            # Strip the extension from the file to get the document name
            docName = os.path.splitext(document)[0]
            docDictionary[docName] = docSentenceStructureList
    finally:
        # Always return to the original path, even if a document failed to load.
        os.chdir(cwd)

    return docDictionary