def s3PdfAccess(url, target_path):
    try:
        url = url.strip()
        s3_client = boto3.client('s3')
        # "s3://bucket/key/..." -> drop the scheme, then split bucket from key
        # (the original split('/', 0) was a no-op, leaving "s3:" as the bucket)
        bucket, key = url.split('//', 1)[-1].split('/', 1)
        fname = key.rsplit('/', 1)[-1]
        s3_client.download_file(bucket, key, target_path + '/' + fname)
        return fname
    except Exception as e:
        logger.info("-Not able to download file from s3-" + str(e) + str(url))
        error_handler = Error_Handler.Error_Handler()
        error_handler.mysql_insert_error("S3_download", myconfig.error_code3,
                                         str(url) + " " + str(e))
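# Usage sketch for s3PdfAccess (bucket, key and local path below are
# hypothetical; assumes boto3 credentials are already configured):
#
#   fname = s3PdfAccess("s3://my-bucket/sop/531104590_FC_UT.pdf", "/tmp/downloads")
#   # bucket -> "my-bucket", key -> "sop/531104590_FC_UT.pdf",
#   # fname -> "531104590_FC_UT.pdf", saved to /tmp/downloads/531104590_FC_UT.pdf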
def pdf_reader(self, path):
    page_pos_dict = []
    pdf = pdfquery.PDFQuery(path)
    i = 0
    try:
        # Read at most the first eight pages, skipping transmittal cover pages
        while i <= 7:
            pdf.load(i)
            jquery = pdf.pq('LTPage')
            if jquery.text().find('Service of Process Transmittal') >= 0:
                i += 1
                continue
            for j in jquery("LTTextBoxHorizontal"):
                page_pos_dict.append(jquery(j).text())
            i += 1
    except Exception as e:
        logger.info(str(e))
    return page_pos_dict
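# Usage sketch for pdf_reader (path hypothetical; `reader` stands for an
# instance of the enclosing class). It returns the text of every
# LTTextBoxHorizontal element on the first eight pages, minus the
# "Service of Process Transmittal" cover pages:
#
#   boxes = reader.pdf_reader("/tmp/downloads/531104590_FC_UT.pdf")
#   for box in boxes:
#       print box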
def page_extractor(self, directory_files, pages=None):
    all_page_set = []
    global front_page
    front_page = []
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    try:
        path = directory_files
        manager = PDFResourceManager()
        fd = open(path, 'rb')
        page_no = 0
        f_page = ""
        all_pages = []
        for page in PDFPage.get_pages(fd, pagenums):
            output = StringIO()
            converter = TextConverter(manager, output, laparams=LAParams())
            interpreter = PDFPageInterpreter(manager, converter)
            interpreter.process_page(page)
            doc = output.getvalue()
            if doc.startswith("Service of Process"):
                # Transmittal pages accumulate separately into f_page
                f_page = f_page + doc
            else:
                page_no = page_no + 1
                if page_no > 5:
                    front_page.append(f_page)
                    break
                all_pages.append(doc)
            converter.close()
            output.close()  # was "output.close" without parentheses, a no-op
        all_page_set.append(' '.join(all_pages))
        fd.close()
        return all_page_set  # the original built this list but never returned it
    except Exception as e:
        logger.info("exception from jurisdiction " + str(self.esop_id) + str(e))
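# page_extractor leans on pdfminer; a minimal sketch of the imports it
# assumes are present at module level (names follow the pdfminer package
# layout, StringIO per Python 2):
#
#   from StringIO import StringIO
#   from pdfminer.pdfpage import PDFPage
#   from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
#   from pdfminer.converter import TextConverter
#   from pdfminer.layout import LAParams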
def pdf_process(path, esop_id=1):
    final_result = {}
    try:
        logger.info("Downloading PDF for esop " + str(esop_id))
        pdf_read = PDFReader(path)
        if pdf_read[0] == 0:
            if path.endswith(".pdf"):
                logger.info("starting OCR PDF extraction for esop--" + str(esop_id))
                attachment = Attachment.Attachment(esop_id, pdf_read[4])
                jurisdiction = Jurisdiction.Jurisdiction(esop_id, pdf_read[6],
                                                         pdf_read[5])
                lawsuit = Lawsuit.Lawsuit(esop_id, pdf_read[4])
                caseno = casenumber.CaseNumber(esop_id, pdf_read[1], pdf_read[2],
                                               pdf_read[3])
                # Run the four extractors in parallel worker processes
                pool = multiprocessing.Pool(4)
                results = pool.map(run_mdules_pool,
                                   [attachment, jurisdiction, lawsuit, caseno])
                for result in results:
                    final_result.update(result)
                pool.close()
                pool.join()
                attach = final_result['attachment'].split("#")
                attachment_pltf = []
                for i in attach:
                    attachment_pltf.append(
                        unicodedata.normalize('NFKD', i).encode('ascii', 'ignore'))
                law = final_result['lawsuit']
                pltfdftd = PlaintiffDefendant.PltfDftd(esop_id, path, pdf_read[7])
                final_result.update(json.loads(pltfdftd.predict(law, attachment_pltf)))
                print final_result
                return final_result
        else:
            # PDF read failed: report just the id so downstream gets null values
            final_result.update({'esop_id': esop_id})
            return final_result  # the original fell through and returned None here
    except Exception as e:
        logger.info("--error while processing pool--" + str(e) + str(esop_id))
        logger.info("Exception--inserted null values for esop--" + str(esop_id) +
                    "--" + str(path))
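# pool.map above fans the four extractor objects out to worker processes.
# run_mdules_pool is defined elsewhere; a minimal sketch of what it is
# assumed to do, given that each predict() returns a JSON string while
# final_result.update(result) expects a dict:
#
#   def run_mdules_pool(module):
#       return json.loads(module.predict())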
def predict(self): try: logger.info("starting lawsuit prediction for esop "+str(self.esop_id)) data = self.clean_text(self.pdf_read) with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) x = tfidf.transform([data]) lawsuit = classifier.predict(x)[0].strip() cs = max(max(classifier.predict_proba(x))) data = {"esop_id": self.esop_id, "lawsuit": lawsuit, "lawsuit_cs": cs} data_final = json.dumps(data) logger.info("sent result from lawsuit "+ str(self.esop_id)) return data_final except Exception,e: logger.info("exception from Lawsuit " +str(e)+ str(self.esop_id)) error_handler = Error_Handler.Error_Handler() error_handler.mysql_insert_error("Lawsuit",myconfig.error_code2,str(self.esop_id)+" "+str(e))
def predict(self): try: logger.info("starting attachment prediction for esop " + str(self.esop_id)) document_pages = self.pdf_read cleaned_pages = [] for page in document_pages: cleaned_pages.append(self.clean_text(page)) x_test_vector = vectorizer.transform(cleaned_pages) predicted_result = model.predict(x_test_vector) c = model.predict_proba(x_test_vector) sum1 = 0 count = 0 for i in c: sum1 = sum1 + max(i) count = count + 1 cs = sum1 / count result = list(set(predicted_result)) attachment = '#'.join(result) result = { "esop_id": self.esop_id, "attachment": attachment, "attachment_cs": cs } result_final = json.dumps(result) logger.info("sent result from attachment " + str(self.esop_id)) return result_final except Exception, e: logger.info("exception from attachment " + str(e) + str(self.esop_id)) error_handler = Error_Handler.Error_Handler() error_handler.mysql_insert_error("Attachment", myconfig.error_code2, str(self.esop_id) + " " + str(e))
def predict(self): try: logger.info("starting case number prediction " + str(self.esop_id)) cs1 = 0 case_number = '######' flag = False pdf1 = self.pdf_readLine pdf2 = self.pdf_readBox case1 = "" scr = "" data_line = [] for each in pdf1: line = self.pre_process(each) data_line.append(line) if len(data_line) > 0: Xtest = vect.transform(data_line) result_line = Model.predict(Xtest) scr_line = Model.predict_proba(Xtest) c = 0 while c < len(data_line): if result_line[c] == 1: case = self.extract_case_number(data_line[c]) if case != "": case1 += case + " " scr += str(max(scr_line[c])) + " " c += 1 if case1.strip() == "": data = [] for each in pdf2: line = self.pre_process(each) data.append(line) if len(data) > 0: Xtest = vect.transform(data) result = Model.predict(Xtest) scr_box = Model.predict_proba(Xtest) c = 0 while c < len(data): if result[c] == 1: if data[c].find("dfs-sop") >= 0: data[c] = data[c - 1] case = self.extract_case_number(data[c]) if case != "": case1 += case + " " scr += str(max(scr_box[c])) + " " c += 1 if case1.strip() == "": c = 0 while c < len(data_line): if result_line[c] == 1: case = self.extract_case_number(' '.join( data_line[c - 2:c + 2])) if case != "": case1 += case + " " scr += str(max(scr_line[c])) + " " c += 1 if case1.strip() == "": PagePosDict = self.pdf_readCord fd_train = self.clean_text(PagePosDict) Y = [] X = [] result = [] nearby_txt = [] doc_contents = [] check = [] regex = re.compile('[0-9]+') for txt in fd_train: contents = txt.split(',') temp = [] try: text = contents[0] text = text.replace("(cid:9)", "") net_content = '"' + contents[1] + '",' doc_contents.append(net_content) data = regex.sub('<num>', text) if (data != " "): nearby_txt.append(data) except Exception: pass if (len(fd_train) > 0): X_Text = text_vectorizer.transform( nearby_txt).toarray() result1 = (clf2.predict(X_Text).reshape(1, -1)) result_prob = clf2.predict_proba(X_Text) c = 0 while c < len(result1[0]): if result1[0][c] == 1: case = self.extract_case_number(data[c]) if case != "": case1 += case + " " #print str(max(result_prob[c])),"confidence for model" scr += str(max(result_prob[c])) + " " c += 1
scr += str(max(scr_line[c])) + " " c += 1 if len(case1.split()) == 0: case1 = "NONE" scr = "0 " result_numbers = list(set(case1.split())) scr_set = list(set(scr.split())) case_number = str(','.join(result_numbers)) cs1 = float(max(scr_set)) data = { "esop_id": self.esop_id, "casenumber": case_number, "casenumber_cs": cs1 } data_final = json.dumps(data) logger.info("sent result from casenumber " + str(self.esop_id)) return data_final else: data = { "esop_id": self.esop_id, "casenumber": None, "casenumber_cs": 0.0 } data_final = json.dumps(data) logger.info("sent result from casenumber(else) " + str(self.esop_id)) return data_final except Exception, e: logger.info("exception from case number " + str(e) + str(self.esop_id))
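# The case-number search above is a four-stage cascade; each stage runs only
# if the previous one extracted nothing:
#   1. classify individual lines (pdf_readLine) and extract from the hits;
#   2. classify text boxes (pdf_readBox), substituting the preceding box when
#      a "dfs-sop" watermark is hit;
#   3. re-scan the stage-1 hits with a window of surrounding lines
#      (data_line[c-2:c+2]);
#   4. classify coordinate-tagged text (pdf_readCord) with digit runs masked
#      as <num> before vectorizing.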
def predict(self, law, attach):
    try:
        logger.info("started from pltfdftd " + str(self.esop_id))
        litigation = law
        attachments = attach
        pageOrder = orderPages(litigation, attachments)
        final_plaintiff = None
        final_defendant = None
        involuntary_plaintiff = []
        pltfdftdVS = []
        pltfdftdPartyTitle = []
        cs_score = []
        cs_pltf = 0
        cs_dftd = 0
        for page in pageOrder:
            if final_plaintiff is not None and final_defendant is not None:
                break
            X = []
            party_title_details = []
            pltf_aftr_filtering = []
            dftd_aftr_filtering = []
            plaintiff = []
            defendant = []
            pltf_index_level1 = []
            dftd_index_level1 = []
            original_txt, original_top, original_below, X_txt, X_top, X_below, \
                X_cordinate, X_tags = self.read_cordinates(self.PageDict[page])
            # Concatenate the plaintiff, defendant and "zero" feature spaces
            # (text, line-above and line-below vectors plus layout tags) into
            # one feature row per text element
            for xp1, xp2, xp3, x4, xd1, xd2, xd3, xz1, xz2, xz3 in \
                    zip(text_vectorizer_pltf.transform(X_txt).toarray(),
                        top_vectorizer_pltf.transform(X_top).toarray(),
                        below_vectorizer_pltf.transform(X_below).toarray(),
                        X_tags,
                        text_vectorizer_dftd.transform(X_txt).toarray(),
                        top_vectorizer_dftd.transform(X_top).toarray(),
                        below_vectorizer_dftd.transform(X_below).toarray(),
                        text_vectorizer_zero.transform(X_txt).toarray(),
                        top_vectorizer_zero.transform(X_top).toarray(),
                        below_vectorizer_zero.transform(X_below).toarray()):
                tmp = []
                for part in (xp1, xp2, xp3, x4, xd1, xd2, xd3, xz1, xz2, xz3):
                    tmp.extend(part)
                X.append(tmp)
            if len(X) > 0:
                result_level1 = clf.predict(X)
                cs_score = clf.predict_proba(X)
            else:
                continue
            # Level-1 model: label 1 = plaintiff candidate, 2 = defendant candidate
            pltf_index_level1 = [a for a in range(len(result_level1))
                                 if result_level1[a] == 1]
            dftd_index_level1 = [a for a in range(len(result_level1))
                                 if result_level1[a] == 2]
            if final_plaintiff is None and len(pltf_index_level1) > 0:
                for index in pltf_index_level1:
                    txt = X_txt[index]
                    top = X_top[index]
                    below = X_below[index]
                    pltf = original_txt[index]
                    plaintiff.append(pltf)
                    out = pltf + "#" + top + "#" + below + "#" + "\n"
                    party_title_details.append(out)
                    if PLTF(txt, top, below) == 1:
                        pltf_aftr_filtering.append(pltf)
                    temp = replace_v(pltf)
                    if temp.find('versus') > 5:
                        pltfdftdVS.append(pltf)
                        pltfdftdPartyTitle.append(out)
                if len(pltf_aftr_filtering) > 0:
                    final_plaintiff = extractor_plaintiff(pltf_aftr_filtering,
                                                          party_title_details,
                                                          involuntary_plaintiff)
                    # confidence from the last candidate row examined
                    cs_pltf = max(cs_score[index])
            logger.info("before final_defendant == None" + str(self.esop_id))
            if final_defendant is None and len(dftd_index_level1) > 0:
                totalArea = []
                for index in dftd_index_level1:
                    txt = X_txt[index]
                    top = X_top[index]
                    below = X_below[index]
                    tags = X_tags[index]
                    dftd = original_txt[index]
                    defendant.append(dftd)
                    if DFTD(txt, top, below, tags) == 1:
                        # was append(defendant), which appended the whole list
                        dftd_aftr_filtering.append(dftd)
                        out = dftd + '#' + top + '#' + below + '#'
                        party_title_details.append(out)
                        temp = replace_v(dftd)
                        if temp.find('versus') > 5:
                            pltfdftdVS.append(dftd)
                            pltfdftdPartyTitle.append(out)
                        totalArea.append(X_cordinate[index])
                c = 0
                if len(totalArea) != 0:
                    dftd_aftr_filtering = []
                    for dft in read_line(self.path, totalArea):
                        c += 1
                        # Skip near-matches of boilerplate phrases; the original
                        # inner "continue" only continued the phrase loop, so
                        # the filter never took effect
                        if any(difflib.SequenceMatcher(None, dft, phrase).ratio() * 100 >= 90
                               for phrase in ['AFFIDAVIT FOR GARNISHMENT',
                                              'AGAINST BANK ACCOUNT']):
                            continue
                        if len(dft.strip()) <= 3:
                            continue
                        dftd_aftr_filtering.append(dft.replace('(cid:9)', ''))
                if len(dftd_aftr_filtering) > 0:
                    final_defendant = extractor_defendant(
                        dftd_aftr_filtering, party_title_details).strip() + ' // To:'
                    cs_dftd = max(cs_score[index])
        # Fallbacks once every page has been processed
        if final_plaintiff is None and litigation.lower().find('lev') >= 0:
            final_plaintiff, cs_pltf = plaintiff_levies(self.path)
            cs_pltf = 0.86  # fixed confidence for the levies path
        elif final_plaintiff is None and len(pltfdftdVS) > 0:
            final_plaintiff = extractor_plaintiff(pltfdftdVS, pltfdftdPartyTitle,
                                                  involuntary_plaintiff)
            cs_pltf = 0.76
        elif final_plaintiff is None and len(pltf_index_level1) != 0:
            final_plaintiff = extractor_plaintiff(
                [original_txt[pltf_index_level1[0]]], [''], involuntary_plaintiff)
            cs_pltf = 0.64
        if final_defendant is None and len(pltfdftdVS) != 0:
            final_defendant = extractor_defendant(pltfdftdVS, pltfdftdPartyTitle)
            cs_dftd = 0.84
        elif final_defendant is None and len(dftd_index_level1) != 0:
            totalArea = [X_cordinate[dftd_index_level1[0]]]
            dftd_aftr_filtering = []
            for dft in read_line(self.path, totalArea):
                c += 1
                if any(difflib.SequenceMatcher(None, dft, phrase).ratio() * 100 >= 90
                       for phrase in ['AFFIDAVIT FOR GARNISHMENT',
                                      'AGAINST BANK ACCOUNT']):
                    continue
                if len(dft.strip()) <= 3:
                    continue
                dftd_aftr_filtering.append(dft.replace('(cid:9)', ''))
            if len(dftd_aftr_filtering) > 0:
                final_defendant = extractor_defendant(dftd_aftr_filtering, [''])
        if final_plaintiff is None:
            final_plaintiff = 'None'
        if final_defendant is None:
            final_defendant = 'None'
        result = {
            "esop_id": self.esop_id,
            "pltf": final_plaintiff,
            "pltf_cs": cs_pltf,
            "dftd": final_defendant,
            "dftd_cs": cs_dftd
        }
        result_final = json.dumps(result)
        logger.info("sent result from pltfdftd " + str(self.esop_id))
        return result_final
    except Exception as e:
        logger.info("exception from pltfdftd " + str(e) + str(self.esop_id))
        error_handler = Error_Handler.Error_Handler()
        error_handler.mysql_insert_error("pltfdftd", myconfig.error_code2,
                                         str(self.esop_id) + " " + str(e))
def predict(self): try: logger.info("starting jurisdiction prediction for esop " + str(self.esop_id)) global fd global data data = self.pdf_readLine #print self.path for x in data: block = self.pre_process(x) xd = tf.transform([block]) result = clf.predict(xd) if result[0] == 1: cs = clf.predict_proba(xd) cs1 = max(max(cs)) text = u''.join(x).encode('utf-8').strip() tokens = nltk.word_tokenize(x) for word in reversed(tokens): if word in myconfig.state_code_dictionary.keys(): self.J.append(myconfig.state_code_dictionary[word]) self.B.append(text) if myconfig.state_code_dictionary[ word] == myconfig.connecticut: docus = self.pdf_readPage jurisdiction = self.jurisdiction_suggestions( docus[0]) data = { "esop_id": self.esop_id, "jurisdiction": jurisdiction, "jurisdiction_cs": cs1 } data_final = json.dumps(data) logger.info( "sent result from jurisdiction(case 1)" + str(self.esop_id)) return data_final else: data = { "esop_id": self.esop_id, "jurisdiction": myconfig.state_code_dictionary[word], "jurisdiction_cs": cs1 } data_final = json.dumps(data) logger.info( "sent result from jurisdiction(case 2)" + str(self.esop_id)) return data_final docus = self.pdf_readPage jurisdiction = self.jurisdiction_suggestions(docus[0]) if jurisdiction != "": data = { "esop_id": self.esop_id, "jurisdiction": jurisdiction, "jurisdiction_cs": myconfig.cs_juri_specialcase } data_final = json.dumps(data) else: data = { "esop_id": self.esop_id, "jurisdiction": jurisdiction, "jurisdiction_cs": 0 } data_final = json.dumps(data) logger.info("sent result from jurisdiction(case 3) " + str(self.esop_id)) return data_final except Exception, e: logger.info("exception from Jurisdiction " + str(e) + str(self.esop_id)) error_handler = Error_Handler.Error_Handler() msg = str(self.esop_id) + " " + str(e) error_handler.mysql_insert_error("Jurisdiction", myconfig.error_code2, msg)