Beispiel #1
0
def CreateSupplementalSentenceStructures(supp_file_path):
    """
    Create SentenceStructures from supplemental documents.

    Every regular file in ``supp_file_path`` is read line by line. Each raw
    line becomes a SentenceStructure (built from the line and the document
    name) whose ``modifiedSentence`` is set to the line with the same index in
    the preprocessed document text.

    The process working directory is changed to ``supp_file_path`` while the
    documents are read and is always restored afterwards, even on error.

    :param supp_file_path: Path to directory where supplemental documents are located
    :return: Dictionary of lists of SentenceStructure objects keyed on document name stripped of extension
    :raises IndexError: If the preprocessed text has fewer lines than the raw document
    """

    #Create a dictionary of documents
    docDictionary = {}

    #Documents are opened by bare file name, so work from inside the directory
    cwd = os.getcwd()
    os.chdir(supp_file_path)
    try:
        #Iterate over documents in the supp_file_path directory
        for document in os.listdir():

            #Sub-directories cannot be opened as documents; skip them
            if not os.path.isfile(document):
                continue

            #Read the document once; the context manager closes it even on error
            with open(document, "r") as doc:
                sentences = doc.readlines()

            #Joining the raw lines reproduces the full document text
            docTextProcessedSplit = preprocess("".join(sentences)).splitlines()

            #Strip the extension from the file to get the document name
            docName = os.path.splitext(document)[0]

            #Build one SentenceStructure per raw line, paired with its preprocessed line
            docSentenceStructureList = []
            for index, sentence in enumerate(sentences):
                ss = SentenceStructure(sentence, docName)
                ss.modifiedSentence = docTextProcessedSplit[index]
                docSentenceStructureList.append(ss)

            #Add the SentenceStructureList to the dictionary
            docDictionary[docName] = docSentenceStructureList
    finally:
        #Return to original path no matter how the loop ended
        os.chdir(cwd)

    #Return the dictionary
    return docDictionary
Beispiel #2
0
def CreateSentenceStructures(raw_file_path):
    """
    Create SentenceStructures from raw documents.

    Every regular file in ``raw_file_path`` is read line by line and each line
    is wrapped in a SentenceStructure.

    The process working directory is changed to ``raw_file_path`` while the
    documents are read and is always restored afterwards, even on error.

    :param raw_file_path: Path to directory where raw documents are located
    :return: Dictionary of lists of SentenceStructure objects keyed on document name stripped of extension
    """

    #Create a dictionary of documents
    docDictionary = {}

    #Documents are opened by bare file name, so work from inside the directory
    cwd = os.getcwd()
    os.chdir(raw_file_path)
    try:
        #Iterate over documents in the raw_file_path directory
        for document in os.listdir():

            #Sub-directories cannot be opened as documents; skip them
            if not os.path.isfile(document):
                continue

            #One SentenceStructure per line; the context manager closes the file even on error
            with open(document, "r") as doc:
                docSentenceStructureList = [SentenceStructure(sentence) for sentence in doc]

            #Strip the extension from the file to get the document name
            docName = os.path.splitext(document)[0]

            #Add the SentenceStructureList to the dictionary
            docDictionary[docName] = docSentenceStructureList
    finally:
        #Return to original path no matter how the loop ended
        os.chdir(cwd)

    #Return the dictionary
    return docDictionary
Beispiel #3
0
def create_sentence_structures(raw_file_path):
    """
    Iterates through all documents in the directory specified in the params and creates a SentenceStructure object for each sentence.

    Each line of a document becomes a SentenceStructure whose
    ``modified_sentence`` is the lower-cased line. The preprocessed text is
    currently only used to verify that it has as many lines as the raw
    document. The length of the longest ``original_sentence_array`` seen
    across all documents is tracked as well.

    The process working directory is changed to ``raw_file_path`` while the
    documents are read and is always restored afterwards, even on error.

    :param raw_file_path: Path to directory where raw documents are located
    :return: Tuple of (dictionary of lists of SentenceStructure objects keyed on document name stripped of extension, maximum sentence length)
    :raises AssertionError: If the raw and preprocessed line counts differ
    :raises SystemExit: With a non-zero status if building a SentenceStructure fails
    """
    #Create a dictionary of documents
    doc_dictionary = {}
    max_sentence_length = 0

    #Documents are opened by bare file name, so work from inside the directory
    cwd = os.getcwd()
    os.chdir(raw_file_path)
    try:
        #Iterate over documents in the raw_file_path directory
        for document in os.listdir():

            #Sub-directories cannot be opened as documents; skip them
            if not os.path.isfile(document):
                continue

            #Read the document once; the context manager closes it even on error
            with open(document, "r") as doc:
                sentences = doc.readlines()

            #Joining the raw lines reproduces the full document text
            doc_text_processed_split = preprocess("".join(sentences)).splitlines()

            #Instantiate a list to hold a SentenceStructure for each sentence(line) in the document
            doc_sentence_structure_list = []
            try:
                for sentence in sentences:
                    ss = SentenceStructure(sentence)
                    #TODO(Jeff) Readd Preprocessed text.
                    ss.modified_sentence = sentence.lower()

                    max_sentence_length = max(max_sentence_length, len(ss.original_sentence_array))

                    doc_sentence_structure_list.append(ss)
            except Exception as err:
                #Report which document failed and why, then exit with a failure status
                print("ERR. " + str(document) + " " + repr(err))
                sys.exit(1)

            #Raised explicitly so the check still runs under python -O
            if len(doc_sentence_structure_list) != len(doc_text_processed_split):
                raise AssertionError("Assertion Failed, array lengths don't match. " + str(len(doc_sentence_structure_list)) + " " + str(len(doc_text_processed_split)))

            #Strip the extension from the file to get the document name
            doc_name = os.path.splitext(document)[0]

            #Add the SentenceStructureList to the dictionary
            doc_dictionary[doc_name] = doc_sentence_structure_list
    finally:
        #Return to original path no matter how the loop ended
        os.chdir(cwd)

    #Return the dictionary and the longest sentence length
    return doc_dictionary, max_sentence_length
Beispiel #4
0
def CreateSentenceStructures(raw_file_path):
    """
    Create SentenceStructures from raw documents.

    Every regular file in ``raw_file_path`` is read line by line. Each raw
    line becomes a SentenceStructure whose ``modifiedSentence`` is set to the
    line with the same index in the preprocessed document text.

    The process working directory is changed to ``raw_file_path`` while the
    documents are read and is always restored afterwards, even on error.

    :param raw_file_path: Path to directory where raw documents are located
    :return: Dictionary of lists of SentenceStructure objects keyed on document name stripped of extension
    :raises AssertionError: If the raw and preprocessed line counts differ
    :raises SystemExit: With a non-zero status if building a SentenceStructure fails
    """

    #Create a dictionary of documents
    docDictionary = {}

    #Documents are opened by bare file name, so work from inside the directory
    cwd = os.getcwd()
    os.chdir(raw_file_path)
    try:
        #Iterate over documents in the raw_file_path directory
        for document in os.listdir():

            #Sub-directories cannot be opened as documents; skip them
            if not os.path.isfile(document):
                continue

            #Read the document once; the context manager closes it even on error
            with open(document, "r") as doc:
                sentences = doc.readlines()

            #Joining the raw lines reproduces the full document text
            docTextProcessedSplit = preprocess("".join(sentences)).splitlines()

            #Instantiate a list to hold a SentenceStructure for each sentence(line) in the document
            docSentenceStructureList = []
            try:
                for index, sentence in enumerate(sentences):
                    ss = SentenceStructure(sentence)
                    ss.modifiedSentence = docTextProcessedSplit[index]
                    docSentenceStructureList.append(ss)
            except Exception as err:
                #Report which document failed and why, then exit with a failure status
                print("ERR. " + str(document) + " " + repr(err))
                sys.exit(1)

            #Raised explicitly so the check still runs under python -O
            if len(docSentenceStructureList) != len(docTextProcessedSplit):
                raise AssertionError("Assertion Failed, array lengths don't match. " + str(len(docSentenceStructureList)) + " " + str(len(docTextProcessedSplit)))

            #Strip the extension from the file to get the document name
            docName = os.path.splitext(document)[0]

            #Add the SentenceStructureList to the dictionary
            docDictionary[docName] = docSentenceStructureList
    finally:
        #Return to original path no matter how the loop ended
        os.chdir(cwd)

    #Return the dictionary
    return docDictionary