def updateProData_DEGOV():
    """Extract text from any Delaware-gov PDFs that do not yet have a .detx file."""
    pdfList = getFileList(rawDataDir_DEGOV, ext=".pdf", recursive=True)
    txtList = getFileList(proDataDir_DEGOV, ext=".detx", recursive=True)
    # Compare basenames (extensions stripped) to find PDFs with no processed counterpart
    numList = [f.split('/')[-1][:-5] for f in txtList]
    newList = [f for f in pdfList if f.split('/')[-1][:-4] not in numList]
    print("Extracting text from", len(newList), "new PDFs.")
    for f in newList:
        savePDFtext_DEGOV(f)
    print("Done.")
    return True


def processJSONfolder_CL(sourcePath, outputPath, recursive=False):
    """Sort CourtListener JSON opinions into lawbox / withCitations / plainText files."""
    fileList = getFileList(sourcePath, ext='.json', recursive=recursive)
    failCounter = 0
    lawboxCounter = 0
    htmlWCcounter = 0
    plainTextCounter = 0
    for filePath in fileList:
        data = loadData(filePath)
        fileName = filePath.split('/')[-1]
        # Prefer lawbox HTML, then HTML with citations, then plain text.
        if data['html_lawbox']:
            if len(data['html_lawbox']) > 10:
                outDir = (outputPath + '/' + filePath.split('/')[-3] + '/'
                          + filePath.split('/')[-2] + "/lawbox/")
                newFilePath = outDir + fileName[:-4] + "cllb"
                if not os.path.exists(outDir):
                    os.makedirs(outDir)
                with open(newFilePath, 'wb') as outFile:  # open as binary
                    outFile.write(data['html_lawbox'].encode('utf8'))
                lawboxCounter += 1
                continue
        elif data['html_with_citations']:
            if len(data['html_with_citations']) > 10:
                outDir = (outputPath + '/' + filePath.split('/')[-3] + '/'
                          + filePath.split('/')[-2] + "/withCitations/")
                newFilePath = outDir + fileName[:-4] + "clwc"
                if not os.path.exists(outDir):
                    os.makedirs(outDir)
                with open(newFilePath, 'wb') as outFile:  # open as binary
                    outFile.write(data['html_with_citations'].encode('utf8'))
                htmlWCcounter += 1
                continue
        elif data['plain_text']:
            if len(data['plain_text']) > 10:
                outDir = (outputPath + '/' + filePath.split('/')[-3] + '/'
                          + filePath.split('/')[-2] + "/plainText/")
                newFilePath = outDir + fileName[:-4] + 'cltx'
                if not os.path.exists(outDir):
                    os.makedirs(outDir)
                with open(newFilePath, 'wb') as outFile:  # open as binary
                    outFile.write(data['plain_text'].encode('utf8'))
                plainTextCounter += 1
                continue
        else:
            print("Failed to load file:", filePath)
            failCounter += 1
    print("\nTotal lawbox:", lawboxCounter)
    print("Total HTMLwithCitations:", htmlWCcounter)
    print("Total plainText:", plainTextCounter)
    print("failed to load:", failCounter)


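# Hypothetical usage sketch for processJSONfolder_CL (kept commented out): the court
# folder "Delaware/del/2015/" below is an illustrative assumption, not a path taken
# from this project. It shows the expected call shape: a source folder of .json files
# and an output folder that will receive the lawbox/withCitations/plainText subfolders.
# processJSONfolder_CL(rawDataDir_CL + "Delaware/del/2015/",
#                      proDataDir_CL + "Delaware/del/2015/",
#                      recursive=False)

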
def getData_CL(jurisdiction):
    # Download step currently disabled:
    # print("Downloading data . . . . .")
    # downloadData_CL(jurisdiction)
    print("Extracting data . . . . .")
    extractData_CL(jurisdiction)
    print("Processing data . . . . .")
    if jurisdiction == "Delaware":
        sourcepath = rawDataDir_CL + "Delaware/"
    elif jurisdiction == "Pennsylvania":
        sourcepath = rawDataDir_CL + "Pennsylvania/"
    elif jurisdiction == "Federal":
        sourcepath = rawDataDir_CL + "Federal/"
    else:
        print("Invalid jurisdiction. (getData_CL())")
        return False
    flist = getFileList(sourcepath, ext="", recursive=True)
    for f in flist:
        # Preserve the parent court folder structure under the processed-data directory
        courtFolder = f.split('/')[-3] + "/" + f.split('/')[-2] + "/"
        savepath = proDataDir_CL + courtFolder
        processJSON_CL(f, savepath, recursive=False)
    print("Done.")
    return True


def parseAndSaveOP_all(jx='DE'):
    """Process all downloaded files and save them into the database."""
    if jx == 'DE':
        flist1 = getFileList("/home/dan/Data/CourtListener/Processed/Delaware",
                             ".clwc", True)
        flist2 = getFileList("/home/dan/Data/CourtListener/Processed/Delaware",
                             ".cltx", True)
        flist3 = getFileList("/home/dan/Data/DelawareGov/Processed/",
                             ".detx", True)
        flist_tx = flist1 + flist2 + flist3
        flist_lb = getFileList("/home/dan/Data/CourtListener/Processed/Delaware",
                               ".cllb", True)
        for f in flist_tx:
            parseAndSaveOP_text(f)
        for f in flist_lb:
            parseAndSaveOP_LB(f)
    if jx == 'PA':
        flist1 = getFileList("/home/dan/Data/CourtListener/Processed/Pennsylvania",
                             ".clwc", True)
        flist2 = getFileList("/home/dan/Data/CourtListener/Processed/Pennsylvania",
                             ".cltx", True)
        flist_tx = flist1 + flist2
        flist_lb = getFileList("/home/dan/Data/CourtListener/Processed/Pennsylvania",
                               ".cllb", True)
        for f in flist_tx:
            parseAndSaveOP_text(f)
        for f in flist_lb:
            parseAndSaveOP_LB(f)
    # ONLY DOES LB OPINIONS
    # EXPAND TO INCLUDE SECOND AND THIRD CIRCUIT WC FILES
    if jx == 'US':
        flist_lb = getFileList("/home/dan/Data/CourtListener/Processed/Federal",
                               ".cllb", True)
        print("Parsing", len(flist_lb), "US opinion files. Completed: ", end='')
        for i, f in enumerate(flist_lb):
            if i % 1000 == 0:
                print(i, " . . . ", end='')
            parseAndSaveOP_LB(f)
    print("Done.")
    return True


def analyzeJSON_CL(path, recursive=False):
    """Report which content fields are populated across a folder of CourtListener JSON files."""
    fileList = getFileList(path, ext='.json', recursive=recursive)
    OCRcounter = 0
    lawboxCounter = 0
    notLawboxCounter = 0
    withCitesCounter = 0
    htmlCounter = 0
    plainTextCounter = 0
    for file in fileList:
        data = loadData(file)
        if data['extracted_by_ocr'] is not None:
            if data['extracted_by_ocr']:  # JSON true loads as Python True, not the string 'true'
                OCRcounter += 1
        if data['html_lawbox'] is not None:
            if len(data['html_lawbox']) > 10:
                lawboxCounter += 1
            else:
                notLawboxCounter += 1
        else:
            notLawboxCounter += 1
        if data['html_with_citations'] is not None:
            if len(data['html_with_citations']) > 10:
                withCitesCounter += 1
        if data['html'] is not None:
            if len(data['html']) > 10:
                htmlCounter += 1
        if data['plain_text'] is not None:
            if len(data['plain_text']) > 10:
                plainTextCounter += 1
    fileCounter = len(fileList)
    print('\n', "Total Number of JSON Files:", fileCounter)
    print("Number OCR Used:", OCRcounter,
          "({0:.2f}%)".format(OCRcounter / fileCounter * 100))
    print("lawbox:", lawboxCounter,
          "({0:.2f}%)".format(lawboxCounter / fileCounter * 100))
    print("notLawbox:", notLawboxCounter,
          "({0:.2f}%)".format(notLawboxCounter / fileCounter * 100))
    print("html_with_cites:", withCitesCounter,
          "({0:.2f}%)".format(withCitesCounter / fileCounter * 100))
    print("html:", htmlCounter,
          "({0:.2f}%)".format(htmlCounter / fileCounter * 100))
    print("plain_text:", plainTextCounter,
          "({0:.2f}%)".format(plainTextCounter / fileCounter * 100))
    # keys from the last file read, printed as a reference
    print('\n', data.keys(), '\n')
    return fileCounter, OCRcounter, lawboxCounter, withCitesCounter


def saveAllPDFtext_DEGOV(path=rawDataDir_DEGOV):
    flist = getFileList(path, ext=".pdf", recursive=True)
    for f in flist:
        savePDFtext_DEGOV(f)
    return True
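

# Minimal driver sketch, assuming this module is run directly. The order of calls
# mirrors the functions above, but the choice of Delaware as the jurisdiction and
# the decision to run the analysis step are illustrative assumptions only.
if __name__ == '__main__':
    getData_CL("Delaware")          # extract and process CourtListener JSON
    updateProData_DEGOV()           # extract text from any new Delaware-gov PDFs
    analyzeJSON_CL(rawDataDir_CL + "Delaware/", recursive=True)  # optional field report
    parseAndSaveOP_all(jx='DE')     # parse processed files into the database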