def main():
    """Read the sample office-action text, clean it, and segment it.

    The file contents are loaded through ``file_reader.getStringFromTxt``,
    passed to ``clean_OA``, and the cleaned text is handed to
    ``sentence_seg``. Nothing is returned.
    """
    source_path = '2018-09-21 15694060 nonfinal rejection.txt'
    raw_text = file_reader.getStringFromTxt(source_path)
    # clean_OA is unpacked into two values; only the first is used here.
    # NOTE(review): the second value is presumably a substitution count
    # (the original named it ``numsubs``) -- confirm against clean_OA.
    cleaned_text, _numsubs = clean_OA(raw_text)
    sentence_seg(cleaned_text)
def main_old():
    """Legacy interactive driver: load office-action text and print rejections.

    Flow:
      1. Ask on stdin whether to convert a PDF. On "y" the user is prompted
         for a filename, which is passed to ``convert_pdf_to_txt``; its
         return value is used as the text path. On "n" the path
         "output/test.txt" is used. Any other answer re-prompts.
      2. Read the text via ``file_reader.getStringFromTxt`` and split it
         into lines on '\\n'.
      3. Every line matching the rejection pattern starts a ``Rejection``.
         Its text is the matching line plus following lines, up to five
         lines in total, stopping early after the first line that ends
         with '.'.
      4. If a "Claim... is/are" phrase is found in that text, it is stored
         on the rejection (``claims`` and as a key of ``claims_refs``).
      5. Print the collected rejections.

    The five-line window is taken with a slice, so a match near the end of
    the file uses whatever lines remain. The previous index-based loop
    raised ``IndexError`` in that case.

    Returns:
        None. All results are printed to stdout.
    """
    # Keep asking until the user gives an answer we understand.
    while True:
        answer = input("Convert PDF (Y/N)?").lower()
        if answer == "y":
            filename = input("Provide PDF filename:")
            output_path = convert_pdf_to_txt(filename)
            break
        if answer == "n":
            output_path = "output/test.txt"
            break
        print("did not understand")

    data = file_reader.getStringFromTxt(output_path)
    clean_data = data.replace('\n\n', '\n')
    # NOTE(review): the lines are split from the raw text, not from
    # clean_data, exactly as the original did -- confirm this is intended.
    data_split_space = data.split('\n')
    print(clean_data)
    print(data_split_space)
    print(len(data_split_space))

    oa1 = OfficeAction()
    # Compiled once, outside the per-line loop.
    rejection_start = re.compile(r'.+Claim.+rejected\sunder.+')
    claims_phrase = re.compile(r'(Claim.+)(?:\s)(?:is|are)')

    for i, line in enumerate(data_split_space):
        if not rejection_start.match(line):
            continue

        rej = Rejection()
        temp = ''
        # Slicing clamps at the end of the list, so a rejection that starts
        # within the last four lines no longer indexes out of range.
        for candidate in data_split_space[i:i + 5]:
            temp += ' ' + candidate
            if candidate.strip().endswith('.'):
                # A trailing period is treated as the end of the statement.
                break
        rej.rejectionText = temp

        matchObj = claims_phrase.search(temp)
        print(type(matchObj))
        print(rej.claims_refs)
        if matchObj:
            print(matchObj.groups())
            print(matchObj.group(1))
            rej.claims_refs[matchObj.group(1)] = None
            rej.claims = matchObj.group(1)
        oa1.rejections.append(rej)

    for r in oa1.rejections:
        print(r.rejectionText)
        print(r.claims_refs)
        print(r.claims)
def tokenize_test():
    """Load the sample office-action text and word-tokenize it with NLTK.

    The token list is bound to a local name only; nothing is returned or
    printed by the code visible here.
    """
    sample_path = '2018-09-21 15694060 nonfinal rejection.txt'
    text = file_reader.getStringFromTxt(sample_path)
    tokens = nltk.tokenize.word_tokenize(text)
20. The method of claim 16, wherein each of the plurality of memory devices is configured to determine a resistance and a target output high level voltage of an output driver by performing the impedance calibration operation, and wherein the output driver is configured to output data externally to each of the plurality of memory devices. """ #print(test) #matchObj = re.match(r'[0-9]\.',test) #print(matchObj.group()) #list = re.findall(r'[0-9]{1,2}\..*\n\n',test,re.M|re.S) claims = {} filename = '8836S-1189 claims.txt' claimsString = file_reader.getStringFromTxt(filename) list = re.findall(r'[0-9]{1,2}\..*$', claimsString, re.M) print(list) print(len(list)) for i in list: claimNo = re.match(r'[0-9]{1,2}', i).group(0) print(claimNo) #print(type(claimNo)) #matchObj = re.search(r'claim [0-9]{1,2}',i,re.I) claims[claimNo] = [] depClaimRegex = r'(claim) (\d+)'