예제 #1
0
파일: run.py 프로젝트: NLPatVCU/NER-OUS
def create_supplemental_sentence_structures(supp_file_path):
    """
    Create SentenceStructures from supplemental documents.

    Every regular file in ``supp_file_path`` is read line by line. Each line becomes a
    SentenceStructure (built from the raw line and the document name) whose
    ``modified_sentence`` is the line at the same position in the preprocessed document text.

    The process working directory is switched to ``supp_file_path`` while the documents are
    handled and is always restored before returning, even when an error is raised.

    :param supp_file_path: Path to directory where supplemental documents are located
    :return: Dictionary of lists of SentenceStructure objects keyed on document name stripped of extension
    :raises IndexError: If preprocessing a document yields fewer lines than the raw document has
    """
    #Create a dictionary of documents
    doc_dictionary = {}

    # cd into the supplemental directory; the finally clause guarantees we cd back
    cwd = os.getcwd()
    os.chdir(supp_file_path)
    try:
        #Iterate over documents in the supp_file_path directory
        for document in os.listdir():

            #Anything that is not a regular file (e.g. a sub-directory) cannot be read as a document
            if not os.path.isfile(document):
                continue

            #Read the document once; the context manager closes it even if reading fails
            with open(document, "r") as doc:
                raw_lines = doc.readlines()

            #Joining the lines reproduces the full text, so the file need not be read twice
            processed_lines = preprocess("".join(raw_lines)).splitlines()

            #Fail up front with a message naming the document instead of an anonymous
            #IndexError halfway through the pairing loop below
            if len(processed_lines) < len(raw_lines):
                raise IndexError(
                    "Preprocessed text of " + str(document) + " has "
                    + str(len(processed_lines)) + " lines but the raw document has "
                    + str(len(raw_lines))
                )

            #Strip the extension from the file to get the document name
            doc_name = os.path.splitext(document)[0]

            #Pair every raw sentence(line) with its preprocessed counterpart
            doc_sentence_structure_list = []
            for sentence, processed_sentence in zip(raw_lines, processed_lines):
                ss = SentenceStructure(sentence, doc_name)
                ss.modified_sentence = processed_sentence
                doc_sentence_structure_list.append(ss)

            #Add the SentenceStructureList to the dictionary
            doc_dictionary[doc_name] = doc_sentence_structure_list
    finally:
        #Return to original path
        os.chdir(cwd)

    #Return the dictionary
    return doc_dictionary
예제 #2
0
파일: run.py 프로젝트: NLPatVCU/NER-OUS
def create_sentence_structures(raw_file_path):
    """
    Iterates through all documents in the directory specified in the params and creates a SentenceStructure object for each sentence.

    Every regular file in ``raw_file_path`` is read line by line. Each line becomes a
    SentenceStructure whose ``modified_sentence`` is the lower-cased raw line. The document text
    is also run through ``preprocess``; its line count is only used to verify that it matches
    the number of sentences.

    The process working directory is switched to ``raw_file_path`` while the documents are
    handled and is always restored before returning or exiting.

    :param raw_file_path: Path to directory where raw documents are located
    :return: Tuple of (dictionary of lists of SentenceStructure objects keyed on document name
        stripped of extension, largest ``len(original_sentence_array)`` seen over all sentences)
    :raises AssertionError: If the number of sentences differs from the number of preprocessed lines
    :raises SystemExit: With status 1 if building the SentenceStructure objects of a document fails
    """
    #Create a dictionary of documents
    doc_dictionary = {}
    max_sentence_length = 0

    # cd into the raw file directory; the finally clause guarantees we cd back
    cwd = os.getcwd()
    os.chdir(raw_file_path)
    try:
        #Iterate over documents in the raw_file_path directory
        for document in os.listdir():

            #Anything that is not a regular file (e.g. a sub-directory) cannot be read as a document
            if not os.path.isfile(document):
                continue

            #Read the document once; the context manager closes it even if reading fails
            with open(document, "r") as doc:
                raw_lines = doc.readlines()

            #Joining the lines reproduces the full text, so the file need not be read twice
            doc_text_processed_split = preprocess("".join(raw_lines)).splitlines()

            #Instantiate a list to hold a SentenceStructure for each sentence(line) in the document
            doc_sentence_structure_list = []
            try:
                for sentence in raw_lines:
                    #Create a SentenceStructure obj
                    ss = SentenceStructure(sentence)
                    #TODO(Jeff) Readd Preprocessed text (use the matching entry of
                    #doc_text_processed_split instead of the lower-cased raw line).
                    ss.modified_sentence = sentence.lower()

                    max_sentence_length = max(max_sentence_length, len(ss.original_sentence_array))

                    #Add SentenceStructure obj to the list
                    doc_sentence_structure_list.append(ss)
            except Exception as err:
                #Only real errors are handled here, so Ctrl-C still interrupts the run.
                #Report the cause and exit non-zero so callers can detect the failure.
                print("ERR. " + str(document))
                print("{}: {}".format(type(err).__name__, err), file=sys.stderr)
                sys.exit(1)

            #Raised explicitly (rather than with an assert statement) so the check
            #still runs when Python is started with -O
            if len(doc_sentence_structure_list) != len(doc_text_processed_split):
                raise AssertionError(
                    "Assertion Failed, array lengths don't match. "
                    + str(len(doc_sentence_structure_list)) + " " + str(len(doc_text_processed_split))
                )

            #Strip the extension from the file to get the document name
            doc_name = os.path.splitext(document)[0]

            #Add the SentenceStructureList to the dictionary
            doc_dictionary[doc_name] = doc_sentence_structure_list
    finally:
        #Return to original path
        os.chdir(cwd)

    #Return the dictionary
    return doc_dictionary, max_sentence_length