def s3PdfAccess(url, target_path):
    """Download a PDF from S3 given a "bucket/key/.../name.pdf" style URL.

    Returns the bare file name on success.  On failure the error is logged
    and recorded via Error_Handler and None is returned.
    """
    try:
        url = url.strip()
        s3_client = boto3.client('s3')
        # Drop an optional "s3://" (or other scheme) prefix, then split the
        # remainder into bucket and object key.  The original
        # url.split('/', 0) was a no-op (maxsplit=0 performs no split) and
        # mis-parsed scheme-prefixed URLs into bucket="s3:".
        path = url.split('://', 1)[-1]
        bucket, key = path.split('/', 1)
        # [-1] also handles keys with no '/' separator, where the original
        # rsplit(...)[1] raised IndexError.
        fname = key.rsplit('/', 1)[-1]
        s3_client.download_file(bucket, key, target_path + '/' + fname)
        return fname
    except Exception as e:
        logger.info("-Not able to download file from s3-" + str(e) + str(url))
        error_handler = Error_Handler.Error_Handler()
        error_handler.mysql_insert_error("S3_download", myconfig.error_code3,
                                         str(url) + " " + str(e))
# --- Example #2 ---
 def pdf_reader(self, path):
     """Collect the text of every LTTextBoxHorizontal element from up to
     the first 8 pages of the PDF at *path*.

     Pages containing 'Service of Process Transmittal' are skipped.
     Iterating past the last page makes pdfquery's load() raise, which is
     the normal exit for short documents.  Returns the list of text-box
     strings (the original only returned it from the except branch, so a
     clean 8-page run returned None).
     """
     page_pos_dict = []
     pdf = pdfquery.PDFQuery(path)
     i = 0
     try:
         while i <= 7:
             pdf.load(i)
             jquery = pdf.pq('LTPage')
             # Skip transmittal cover pages entirely.
             if jquery.text().find('Service of Process Transmittal') >= 0:
                 i += 1
                 continue
             for j in jquery("LTTextBoxHorizontal"):
                 page_pos_dict.append(jquery(j).text())
             i += 1
     except Exception as e:
         logger.info(str(e))
     # Always return what was collected, whether or not load() raised.
     return page_pos_dict
# --- Example #3 ---
 def page_extractor(self, directory_files, pages=None):
     """Extract text from the PDF at *directory_files* with pdfminer.

     Collects up to 5 non-transmittal pages and stores any accumulated
     'Service of Process' transmittal text in the module-level
     *front_page* list (reset on every call).  *pages* optionally limits
     extraction to the given page numbers.

     NOTE(review): *all_page_set* is built but never returned -- the
     original appears to be missing a return statement; confirm against
     the caller before relying on a result from this method.
     """
     all_page_set = []
     global front_page
     front_page = []
     if not pages:
         pagenums = set()
     else:
         pagenums = set(pages)
     try:
         path = directory_files
         manager = PDFResourceManager()
         page_no = 0
         f_page = ""
         all_pages = []
         # 'with' guarantees the handle is released even when pdfminer
         # raises; the original leaked it on the exception path.
         with open(path, 'rb') as fd:
             for page in PDFPage.get_pages(fd, pagenums):
                 output = StringIO()
                 converter = TextConverter(manager, output,
                                           laparams=LAParams())
                 interpreter = PDFPageInterpreter(manager, converter)
                 interpreter.process_page(page)
                 doc = output.getvalue()
                 if doc[:18] == "Service of Process":
                     # Transmittal pages are accumulated separately.
                     f_page = f_page + doc
                 else:
                     page_no = page_no + 1
                     if page_no > 5:
                         # Stop after 5 content pages; release resources
                         # before breaking (the original leaked both here).
                         converter.close()
                         output.close()
                         front_page.append(f_page)
                         break
                     all_pages.append(doc)
                 converter.close()
                 # Fix: 'output.close' was a bare attribute access (no
                 # call), so the StringIO was never closed.
                 output.close()
         all_page_set.append(' '.join(all_pages))
     except Exception as e:
         logger.info("exception from jurisdiction " + str(self.esop_id) +
                     str(e))
def pdf_process(path, esop_id=1):
    """Run the full extraction pipeline over the PDF at *path*.

    Reads the PDF once, runs the attachment / jurisdiction / lawsuit /
    case-number predictors in a 4-worker process pool, merges their
    results, then adds the plaintiff/defendant prediction.

    Returns the merged result dict on success.  Returns None when the
    read failed (after noting {'esop_id': esop_id}), when the file is not
    a .pdf, or when any exception occurred (logged).
    """
    final_result = {}
    try:
        logger.info("Downloading PDF for esop" + str(esop_id) + "for PDF")
        pdf_read = PDFReader(path)
        # pdf_read[0] == 0 signals a successful read; the remaining slots
        # carry the per-module page extracts consumed below.
        if pdf_read[0] == 0:
            if path.endswith(".pdf"):
                logger.info("starting OCR PDF extraction for esop--" +
                            str(esop_id))
                attachment = Attachment.Attachment(esop_id, pdf_read[4])
                jurisdiction = Jurisdiction.Jurisdiction(
                    esop_id, pdf_read[6], pdf_read[5])
                lawsuit = Lawsuit.Lawsuit(esop_id, pdf_read[4])
                caseno = casenumber.CaseNumber(esop_id, pdf_read[1],
                                               pdf_read[2], pdf_read[3])
                # The four predictors are independent, so fan them out one
                # per worker.
                pool = multiprocessing.Pool(4)
                results = pool.map(run_mdules_pool,
                                   [attachment, jurisdiction, lawsuit, caseno])
                for result in results:
                    final_result.update(result)
                pool.close()
                pool.join()
                attach = final_result['attachment'].split("#")
                # Strip non-ASCII (accents etc.) from each attachment name
                # before feeding the pltf/dftd model.
                attachment_pltf = []
                for i in attach:
                    attachment_pltf.append(
                        unicodedata.normalize('NFKD',
                                              i).encode('ascii', 'ignore'))
                law = final_result['lawsuit']
                pltfdftd = PlaintiffDefendant.PltfDftd(esop_id, path,
                                                       pdf_read[7])
                final_result.update(
                    json.loads(pltfdftd.predict(law, attachment_pltf)))
                # print-as-function works for a single argument on both
                # Python 2 and 3 (the original used the Py2 statement).
                print(final_result)
                return final_result
        else:
            final_result.update({'esop_id': esop_id})
    except Exception as e:
        logger.info("--error while processing pool--" + str(e) + str(esop_id))
        logger.info("Exception--inserted null values for esop--" +
                    str(esop_id) + "--" + str(path))
# --- Example #5 ---
 def predict(self):
   try:
     logger.info("starting lawsuit prediction for esop "+str(self.esop_id))
     data = self.clean_text(self.pdf_read)
     with warnings.catch_warnings():
       warnings.simplefilter("ignore", category=UserWarning)
       x = tfidf.transform([data])
       lawsuit = classifier.predict(x)[0].strip()
       cs = max(max(classifier.predict_proba(x)))
       data = {"esop_id": self.esop_id, "lawsuit": lawsuit, "lawsuit_cs": cs}
       data_final = json.dumps(data)
       logger.info("sent result from lawsuit "+ str(self.esop_id))
       return data_final
   except Exception,e:
     logger.info("exception from Lawsuit " +str(e)+ str(self.esop_id))
     error_handler = Error_Handler.Error_Handler()
     error_handler.mysql_insert_error("Lawsuit",myconfig.error_code2,str(self.esop_id)+" "+str(e))
# --- Example #6 ---
    def predict(self):
        """Predict the attachment types present in this document.

        Classifies every page of self.pdf_read, joins the distinct labels
        with '#', and returns a JSON string
        {esop_id, attachment, attachment_cs}, where attachment_cs is the
        mean top-class probability across pages.  On error the failure is
        logged/recorded and None is returned.
        """
        try:
            logger.info("starting attachment prediction for esop " +
                        str(self.esop_id))
            document_pages = self.pdf_read
            cleaned_pages = []
            for page in document_pages:
                cleaned_pages.append(self.clean_text(page))

            x_test_vector = vectorizer.transform(cleaned_pages)

            predicted_result = model.predict(x_test_vector)
            probabilities = model.predict_proba(x_test_vector)
            # Mean of each page's best-class probability; replaces the
            # manual sum/count loop.  (Still raises ZeroDivisionError on an
            # empty document, exactly like the original.)
            cs = sum(max(row) for row in probabilities) / len(probabilities)

            # De-duplicate the per-page labels before joining.
            result = list(set(predicted_result))
            attachment = '#'.join(result)
            result = {
                "esop_id": self.esop_id,
                "attachment": attachment,
                "attachment_cs": cs
            }
            result_final = json.dumps(result)
            logger.info("sent result from attachment " + str(self.esop_id))
            return result_final
        except Exception as e:
            logger.info("exception from attachment " + str(e) +
                        str(self.esop_id))
            error_handler = Error_Handler.Error_Handler()
            error_handler.mysql_insert_error("Attachment",
                                             myconfig.error_code2,
                                             str(self.esop_id) + " " + str(e))
    def predict(self):
        """Predict the case number for this document.

        Tries four sources in order until one yields a candidate:
          1. line-level text (self.pdf_readLine) via vect/Model,
          2. box-level text (self.pdf_readBox) via the same model,
          3. a +/-2-line window around positive line hits,
          4. a coordinate-based model (clf2) over self.pdf_readCord.
        Returns a JSON string {esop_id, casenumber, casenumber_cs}; when
        no line text is available at all, casenumber is None with cs 0.0.
        On error the exception is logged and None is returned.
        """
        try:
            logger.info("starting case number prediction " + str(self.esop_id))
            cs1 = 0
            case_number = '######'

            flag = False
            pdf1 = self.pdf_readLine
            pdf2 = self.pdf_readBox

            case1 = ""
            scr = ""
            data_line = []
            for each in pdf1:
                line = self.pre_process(each)
                data_line.append(line)
            if len(data_line) > 0:
                # --- Tier 1: classify each pre-processed line. ---
                Xtest = vect.transform(data_line)
                result_line = Model.predict(Xtest)
                scr_line = Model.predict_proba(Xtest)
                c = 0
                while c < len(data_line):
                    if result_line[c] == 1:
                        case = self.extract_case_number(data_line[c])
                        if case != "":
                            case1 += case + " "
                            scr += str(max(scr_line[c])) + " "
                    c += 1
                if case1.strip() == "":
                    # --- Tier 2: fall back to text-box level input. ---
                    data = []
                    for each in pdf2:
                        line = self.pre_process(each)
                        data.append(line)
                    if len(data) > 0:
                        Xtest = vect.transform(data)
                        result = Model.predict(Xtest)
                        scr_box = Model.predict_proba(Xtest)
                        c = 0
                        while c < len(data):
                            if result[c] == 1:

                                if data[c].find("dfs-sop") >= 0:
                                    # Watermark box; use the preceding box
                                    # instead.
                                    data[c] = data[c - 1]
                                case = self.extract_case_number(data[c])
                                if case != "":
                                    case1 += case + " "
                                    scr += str(max(scr_box[c])) + " "
                            c += 1
                if case1.strip() == "":
                    # --- Tier 3: widen each line hit to a +/-2 window. ---
                    c = 0
                    while c < len(data_line):
                        if result_line[c] == 1:
                            case = self.extract_case_number(' '.join(
                                data_line[c - 2:c + 2]))
                            if case != "":
                                case1 += case + " "
                                scr += str(max(scr_line[c])) + " "
                        c += 1

                if case1.strip() == "":
                    # --- Tier 4: coordinate-based model. ---
                    PagePosDict = self.pdf_readCord
                    fd_train = self.clean_text(PagePosDict)
                    Y = []
                    X = []
                    result = []
                    nearby_txt = []
                    doc_contents = []
                    check = []

                    regex = re.compile('[0-9]+')
                    for txt in fd_train:
                        contents = txt.split(',')
                        temp = []
                        try:
                            text = contents[0]
                            text = text.replace("(cid:9)", "")
                            net_content = '"' + contents[1] + '",'
                            doc_contents.append(net_content)
                            # Mask digit runs so the vectorizer sees shape,
                            # not specific numbers.
                            data = regex.sub('<num>', text)

                            if (data != " "):
                                nearby_txt.append(data)
                        except Exception:
                            # Rows without a ',' separator are skipped.
                            pass
                    if (len(fd_train) > 0):
                        X_Text = text_vectorizer.transform(
                            nearby_txt).toarray()
                        result1 = (clf2.predict(X_Text).reshape(1, -1))
                        result_prob = clf2.predict_proba(X_Text)
                        c = 0
                        while c < len(result1[0]):
                            if result1[0][c] == 1:
                                # NOTE(review): 'data' here is the last
                                # string left over from the masking loop,
                                # so data[c] is a single character -- this
                                # almost certainly meant nearby_txt[c].
                                # Confirm before changing model behaviour.
                                case = self.extract_case_number(data[c])
                                if case != "":
                                    case1 += case + " "
                                    scr += str(max(result_prob[c])) + " "
                            c += 1
                        # A stray duplicated, mis-indented
                        # "scr += ... / c += 1" pair from the original
                        # paste was removed here; it could never parse.
                if len(case1.split()) == 0:
                    case1 = "NONE"
                    scr = "0 "
                result_numbers = list(set(case1.split()))
                scr_set = list(set(scr.split()))
                case_number = str(','.join(result_numbers))
                # Compare confidences numerically; the original took the
                # lexicographic max of the score *strings*.
                cs1 = max(float(s) for s in scr_set)
                data = {
                    "esop_id": self.esop_id,
                    "casenumber": case_number,
                    "casenumber_cs": cs1
                }
                data_final = json.dumps(data)
                logger.info("sent result from casenumber " + str(self.esop_id))
                return data_final

            else:
                data = {
                    "esop_id": self.esop_id,
                    "casenumber": None,
                    "casenumber_cs": 0.0
                }
                data_final = json.dumps(data)
                logger.info("sent result from casenumber(else) " +
                            str(self.esop_id))
                return data_final
        except Exception as e:
            logger.info("exception from case number " + str(e) +
                        str(self.esop_id))
# --- Example #9 ---
    def predict(self, law, attach):
        """Predict the plaintiff and defendant strings for this document.

        *law* is the predicted litigation type and *attach* the list of
        attachment names; both steer the page visiting order and the levy
        fallback.  Each page's text boxes are classified by the level-1
        model (clf: 1 = plaintiff candidate, 2 = defendant candidate),
        then a cascade of heuristics fills in whichever side is still
        missing.  Returns a JSON string
        {esop_id, pltf, pltf_cs, dftd, dftd_cs}; on error the failure is
        logged/recorded and None is returned.
        """
        try:
            logger.info("started from pltfdftd " + str(self.esop_id))
            litigation = law
            attachments = attach
            # Visit the most promising pages first.
            pageOrder = orderPages(litigation, attachments)
            final_plaintiff = None
            final_defendant = None
            involuntary_plaintiff = []
            pltfdftdVS = []
            pltfdftdPartyTitle = []
            cs_score = []
            cs_pltf = 0
            cs_dftd = 0
            for page in pageOrder:
                if final_plaintiff is not None and final_defendant is not None:
                    break
                X = []
                party_title_details = []
                pltf_aftr_filtering = []
                dftd_aftr_filtering = []
                plaintiff = []
                defendant = []
                pltf_index_level1 = []
                dftd_index_level1 = []
                original_txt, original_top, original_below, X_txt, X_top, X_below, X_cordinate, X_tags = self.read_cordinates(
                    self.PageDict[page])
                # Concatenate the plaintiff / defendant / zero vectorizer
                # features plus the raw tag features into one row per box.
                for xp1, xp2, xp3, x4, xd1, xd2, xd3, xz1, xz2, xz3 in \
                        zip(text_vectorizer_pltf.transform(X_txt).toarray(), \
                            top_vectorizer_pltf.transform(X_top).toarray(), \
                            below_vectorizer_pltf.transform(X_below).toarray(), X_tags, \
                            text_vectorizer_dftd.transform(X_txt).toarray(), \
                            top_vectorizer_dftd.transform(X_top).toarray(),
                            below_vectorizer_dftd.transform(X_below).toarray(), \
                            text_vectorizer_zero.transform(X_txt).toarray(), \
                            top_vectorizer_zero.transform(X_top).toarray(),
                            below_vectorizer_zero.transform(X_below).toarray(), ):
                    tmp = []
                    for x in xp1:
                        tmp.append(x)
                    for x in xp2:
                        tmp.append(x)
                    for x in xp3:
                        tmp.append(x)
                    for x in x4:
                        tmp.append(x)
                    for x in xd1:
                        tmp.append(x)
                    for x in xd2:
                        tmp.append(x)
                    for x in xd3:
                        tmp.append(x)
                    for x in xz1:
                        tmp.append(x)
                    for x in xz2:
                        tmp.append(x)
                    for x in xz3:
                        tmp.append(x)
                    X.append(tmp)
                if len(X) > 0:
                    result_level1 = clf.predict(X)
                    cs_score = clf.predict_proba(X)
                else:
                    continue
                pltf_index_level1 = [
                    a for a in range(len(result_level1))
                    if result_level1[a] == 1
                ]
                dftd_index_level1 = [
                    a for a in range(len(result_level1))
                    if result_level1[a] == 2
                ]
                if final_plaintiff is None and len(pltf_index_level1) > 0:
                    for index in pltf_index_level1:
                        txt = X_txt[index]
                        top = X_top[index]
                        below = X_below[index]
                        pltf = original_txt[index]
                        plaintiff.append(pltf)
                        out = pltf + "#" + top + "#" + below + "#" + "\n"
                        party_title_details.append(out)
                        if PLTF(txt, top, below) == 1:
                            pltf_aftr_filtering.append(pltf)
                            temp = replace_v(pltf)
                            if temp.find('versus') > 5:
                                pltfdftdVS.append(pltf)
                                pltfdftdPartyTitle.append(out)
                    if len(pltf_aftr_filtering) > 0:
                        final_plaintiff = extractor_plaintiff(
                            pltf_aftr_filtering, party_title_details,
                            involuntary_plaintiff)
                        # NOTE(review): 'index' is the loop's *last* value,
                        # not necessarily the chosen plaintiff's row --
                        # confirm which confidence was intended.
                        cs_pltf = max(cs_score[index])
                logger.info("before final_defendant == None" +
                            str(self.esop_id))
                if final_defendant is None and len(dftd_index_level1) > 0:
                    totalArea = []
                    for index in dftd_index_level1:
                        txt = X_txt[index]
                        top = X_top[index]
                        below = X_below[index]
                        tags = X_tags[index]
                        dftd = original_txt[index]
                        defendant.append(dftd)
                        if DFTD(txt, top, below, tags) == 1:
                            dftd_aftr_filtering.append(defendant)
                            out = dftd + '#' + top + '#' + below + '#'
                            party_title_details.append(out)
                            temp = replace_v(dftd)
                            if temp.find('versus') > 5:
                                pltfdftdVS.append(dftd)
                                pltfdftdPartyTitle.append(out)
                            totalArea.append(X_cordinate[index])
                    c = 0
                    logger.info("before final_defendant == None" +
                                str(self.esop_id))
                    if len(totalArea) != 0:
                        dftd_aftr_filtering = []
                        for dft in read_line(self.path, totalArea):
                            c += 1
                            for phrases in [
                                    'AFFIDAVIT FOR GARNISHMENT',
                                    'AGAINST BANK ACCOUNT'
                            ]:
                                # NOTE(review): this 'continue' only skips
                                # the inner phrases loop, so boilerplate
                                # lines are NOT actually filtered out --
                                # the intent was almost certainly to skip
                                # 'dft'; confirm before fixing.
                                if difflib.SequenceMatcher(
                                        None, dft,
                                        phrases).ratio() * 100 >= 90:
                                    continue
                            if len(dft.strip()) <= 3:
                                continue
                            dftd_aftr_filtering.append(
                                dft.replace('(cid:9)', ''))
                    if len(dftd_aftr_filtering) > 0:
                        final_defendant = extractor_defendant(
                            dftd_aftr_filtering,
                            party_title_details).strip() + '  //  To:'
                        # NOTE(review): same stale-'index' concern as the
                        # plaintiff confidence above.
                        cs_dftd = max(cs_score[index])
            # Fallback cascade for whichever side is still unresolved;
            # each tier carries a hard-coded confidence.
            if final_plaintiff is None and litigation.lower().find('lev') >= 0:
                final_plaintiff, cs_pltf = plaintiff_levies(self.path)
                cs_pltf = 0.86
            elif final_plaintiff is None and len(pltfdftdVS) > 0:
                final_plaintiff = extractor_plaintiff(pltfdftdVS,
                                                      pltfdftdPartyTitle,
                                                      involuntary_plaintiff)
                cs_pltf = 0.76
            elif final_plaintiff is None and len(pltf_index_level1) != 0:
                final_plaintiff = extractor_plaintiff(
                    [original_txt[pltf_index_level1[0]]], [''],
                    involuntary_plaintiff)
                cs_pltf = 0.64
            if final_defendant is None and len(pltfdftdVS) != 0:
                final_defendant = extractor_defendant(pltfdftdVS,
                                                      pltfdftdPartyTitle)
                cs_dftd = 0.84
            elif final_defendant is None and len(dftd_index_level1) != 0:
                totalArea = [X_cordinate[dftd_index_level1[0]]]
                if len(totalArea) != 0:
                    dftd_aftr_filtering = []
                    for dft in read_line(self.path, totalArea):
                        c += 1
                        for phrases in [
                                'AFFIDAVIT FOR GARNISHMENT',
                                'AGAINST BANK ACCOUNT'
                        ]:
                            # NOTE(review): no-op 'continue' -- see the
                            # identical pattern above.
                            if difflib.SequenceMatcher(
                                    None, dft, phrases).ratio() * 100 >= 90:
                                continue
                        if len(dft.strip()) <= 3:
                            continue
                        dftd_aftr_filtering.append(dft.replace('(cid:9)', ''))
                    if len(dftd_aftr_filtering) > 0:
                        final_defendant = extractor_defendant(
                            dftd_aftr_filtering, [''])
            if final_plaintiff is None:
                final_plaintiff = 'None'
            if final_defendant is None:
                final_defendant = 'None'

            result = {
                "esop_id": self.esop_id,
                "pltf": final_plaintiff,
                "pltf_cs": cs_pltf,
                "dftd": final_defendant,
                "dftd_cs": cs_dftd
            }
            result_final = json.dumps(result)
            logger.info("sent result from pltfdftd " + str(self.esop_id))
            return result_final
        except Exception as e:
            logger.info("exception from pltfdftd " + str(e) +
                        str(self.esop_id))
            error_handler = Error_Handler.Error_Handler()
            error_handler.mysql_insert_error("pltfdftd", myconfig.error_code2,
                                             str(self.esop_id) + " " + str(e))
# --- Example #10 ---
    def predict(self):
        """Predict the jurisdiction (US state) for this document.

        Each line of self.pdf_readLine is classified with tf/clf; on the
        first positive line the tokens are scanned right-to-left for a
        state code from myconfig.state_code_dictionary (Connecticut gets a
        suggestion-based refinement).  If no line yields a state code,
        falls back to jurisdiction_suggestions() on the first page.
        Returns a JSON string {esop_id, jurisdiction, jurisdiction_cs};
        on error the failure is logged/recorded and None is returned.
        """
        try:
            logger.info("starting jurisdiction prediction for esop " +
                        str(self.esop_id))
            # NOTE(review): 'fd' is declared global but never used in this
            # method; 'data' is rebound below and leaks to module scope.
            global fd
            global data
            data = self.pdf_readLine

            for x in data:
                block = self.pre_process(x)
                xd = tf.transform([block])
                result = clf.predict(xd)

                if result[0] == 1:
                    cs = clf.predict_proba(xd)
                    cs1 = max(max(cs))
                    text = u''.join(x).encode('utf-8').strip()
                    tokens = nltk.word_tokenize(x)
                    # Scan from the end of the line: the state code tends
                    # to trail the court name.
                    for word in reversed(tokens):
                        # Membership test directly on the dict (the
                        # original built .keys() for every token).
                        if word in myconfig.state_code_dictionary:
                            self.J.append(myconfig.state_code_dictionary[word])
                            self.B.append(text)
                            if myconfig.state_code_dictionary[
                                    word] == myconfig.connecticut:
                                # Connecticut needs page-level refinement.
                                docus = self.pdf_readPage
                                jurisdiction = self.jurisdiction_suggestions(
                                    docus[0])
                                data = {
                                    "esop_id": self.esop_id,
                                    "jurisdiction": jurisdiction,
                                    "jurisdiction_cs": cs1
                                }
                                data_final = json.dumps(data)
                                logger.info(
                                    "sent result from jurisdiction(case 1)" +
                                    str(self.esop_id))
                                return data_final
                            else:
                                data = {
                                    "esop_id":
                                    self.esop_id,
                                    "jurisdiction":
                                    myconfig.state_code_dictionary[word],
                                    "jurisdiction_cs":
                                    cs1
                                }
                                data_final = json.dumps(data)
                                logger.info(
                                    "sent result from jurisdiction(case 2)" +
                                    str(self.esop_id))
                                return data_final

            # No classified line produced a state code: fall back to the
            # suggestion heuristic on the first page.
            docus = self.pdf_readPage
            jurisdiction = self.jurisdiction_suggestions(docus[0])
            if jurisdiction != "":
                data = {
                    "esop_id": self.esop_id,
                    "jurisdiction": jurisdiction,
                    "jurisdiction_cs": myconfig.cs_juri_specialcase
                }
                data_final = json.dumps(data)
            else:
                data = {
                    "esop_id": self.esop_id,
                    "jurisdiction": jurisdiction,
                    "jurisdiction_cs": 0
                }
                data_final = json.dumps(data)
            logger.info("sent result from jurisdiction(case 3) " +
                        str(self.esop_id))
            return data_final
        except Exception as e:
            logger.info("exception from Jurisdiction " + str(e) +
                        str(self.esop_id))
            error_handler = Error_Handler.Error_Handler()
            msg = str(self.esop_id) + " " + str(e)
            error_handler.mysql_insert_error("Jurisdiction",
                                             myconfig.error_code2, msg)