def test_estimate_text_abusing_headers(self):
     """
     Estimate the quality of the 'text_abusing_headers.txt' resource and
     assert that its ``extra_line_breaks_prob`` is below 50.
     """
     cleaned = pre_process_document(load_resource_document(
         'lexnlp/utils/parsing/text_abusing_headers.txt', 'utf-8'))
     quality = ParsedTextQualityEstimator().estimate_text(cleaned)
     self.assertLess(quality.extra_line_breaks_prob, 50)
    def test_estimate_fishy_header(self):
        text = """
Notwithstanding anything in this Section (B) of Article IV to the contrary, in the event any such disruption to Tenant's operations and use of the demised premises is attributable to Landlord's negligence, or that of its agents, contractors, servants or employees, or is attributable to a breach by Landlord of its obligations under this lease, and if such disruption shall materially impair Tenant's use of the demised premises for a period in excess of five (5) business days in duration, then a just proportion of the Rent, according to the nature and extent of the impairment to Tenant's operation and use of the demised premises shall abate for any such period of time from the date of disruption which is in excess of said five (5) business days in duration.



ARTICLE V


RENT"""
        text = pre_process_document(text)
        corrector = ParsedTextCorrector()
        corr = corrector.correct_line_breaks(text)
        self.assertLess(len(corr), len(text))
    def process_text_files_in_folder(self, src_folder, dst_folder):
        """
        Run the text corrector over every file in ``src_folder`` and save the
        files that were changed into ``dst_folder`` under the same file name.

        Only regular files located directly in ``src_folder`` are processed.
        Each file is read as UTF-8, passed through ``pre_process_document``
        and then through ``ParsedTextCorrector.correct_if_corrupted``.
        A file whose corrected text has the same length as the pre-processed
        text is treated as unchanged and nothing is written for it.

        :param src_folder: folder containing the source text files
        :param dst_folder: folder the corrected files are written to
            (it is not created by this method)
        """
        corrector = ParsedTextCorrector()

        for file_name in listdir(src_folder):
            # Build both paths with join() so that the folders work with or
            # without a trailing path separator.
            src_path = join(src_folder, file_name)
            if not isfile(src_path):
                continue

            with codecs.open(src_path, encoding='utf-8', mode='r') as src_file:
                text = src_file.read()
            text = pre_process_document(text)

            corr = corrector.correct_if_corrupted(text)
            # Equal length is used as the "nothing was corrected" signal.
            if len(text) == len(corr):
                continue

            dst_path = join(dst_folder, file_name)
            with codecs.open(dst_path, encoding='utf-8', mode='w') as dst_file:
                dst_file.write(corr)
Ejemplo n.º 4
0
import sys
import lexnlp.nlp.en.segments.sentences as lex_sentences
import lexnlp.extract.en.dates as lex_dates
import lexnlp.extract.en.entities.nltk_maxent as lex_entities

# Location of the brief to analyse.
direct_path = "/Users/brandon/Documents/Northwestern Courses/Winter 2019/CS+Law Innovation Lab/Orrick, Harrington, & Sutcliffe/Documents/Dish_Sample.txt"

with open(direct_path, 'r') as file:
    brief = file.read()

# Pre-process the raw text, then split it into sentences.
processed_brief = lex_sentences.pre_process_document(brief)
sentences_brief = lex_sentences.get_sentence_list(processed_brief)

# One (person, sentence) pair for every person found in every sentence.
facts = []
for sentence in sentences_brief:
    facts.extend(
        (entity, sentence) for entity in lex_entities.get_persons(sentence))

# Print each pair as a question/answer card followed by a separator.
for entity, sentence in facts:
    print("Question:\nWhy is {} relevant?\n\nAnswer:\n{}".format(
        entity, sentence))
    print("\n---------------\n")
'''
Question:
Why is Farmers Branch relevant?

Answer:
In 2009, DISH began a pilot program to test QPC, a new incentive-based system at several locations, including two of its eight offices in the North Texas region: Farmers Branch and North Richland Hills.
Ejemplo n.º 5
0
 def init_preprocess(self, text=None):
     """
     Return the result of ``lex_sentences.pre_process_document`` for ``text``.

     ``text`` is forwarded as given, including the default ``None``.
     """
     preprocessed = lex_sentences.pre_process_document(text)
     return preprocessed