Exemple #1
0
def updateProData_DEGOV():
    """Extract text from any raw DEGOV PDFs not yet processed.

    Compares PDF files under ``rawDataDir_DEGOV`` against the ``.detx``
    files already present in ``proDataDir_DEGOV`` (matched by file stem)
    and runs ``savePDFtext_DEGOV`` on every PDF with no matching text file.

    Returns:
        True on completion.
    """
    pdfList = getFileList(rawDataDir_DEGOV, ext=".pdf", recursive=True)
    txtList = getFileList(proDataDir_DEGOV, ext=".detx", recursive=True)

    # Build a set of already-processed stems once, so each membership test
    # below is O(1) instead of an O(n) list scan (was O(n^2) overall).
    doneStems = {f.split('/')[-1][:-5] for f in txtList}  # [:-5] strips ".detx"

    # [:-4] strips ".pdf" to get the comparable stem.
    newList = [f for f in pdfList if f.split('/')[-1][:-4] not in doneStems]
    print("Extracting text from", len(newList), "new PDFs.")

    for f in newList:
        savePDFtext_DEGOV(f)
    print("Done.")

    return True
Exemple #2
0
def _saveOpinionSection_CL(srcPath, outputPath, subdir, ext, content):
    """Write one opinion's *content* as UTF-8 under outputPath/<court>/<year>/<subdir>/.

    The last two directory components of *srcPath* are preserved in the
    output path (presumably court/year folders — confirm against caller).
    """
    parts = srcPath.split('/')
    outDir = outputPath + '/' + parts[-3] + '/' + parts[-2] + '/' + subdir + '/'
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    # Source names end in ".json"; [:-4] keeps the trailing dot, so the new
    # extension yields e.g. "12345.cllb".
    newFilePath = outDir + parts[-1][:-4] + ext
    with open(newFilePath, 'wb') as fh:  # binary so we control the encoding
        fh.write(content.encode('utf8'))


def processJSONfolder_CL(sourcePath, outputPath, recursive=False):
    """Convert CourtListener opinion JSON files into per-format text files.

    For each ``.json`` file under *sourcePath*, the first usable content
    field — checked in priority order ``html_lawbox``,
    ``html_with_citations``, ``plain_text``; "usable" means non-empty and
    longer than 10 characters — is written as UTF-8 under *outputPath*:

        html_lawbox         -> .../lawbox/<name>.cllb
        html_with_citations -> .../withCitations/<name>.clwc
        plain_text          -> .../plainText/<name>.cltx

    Files with no usable field are counted as failures.  Summary counts
    are printed at the end.
    """
    fileList = getFileList(sourcePath, ext='.json', recursive=recursive)

    # (JSON field, output subfolder, output extension) in priority order.
    sections = [
        ('html_lawbox', 'lawbox', 'cllb'),
        ('html_with_citations', 'withCitations', 'clwc'),
        ('plain_text', 'plainText', 'cltx'),
    ]
    counters = {field: 0 for field, _, _ in sections}
    failCounter = 0

    for srcPath in fileList:
        data = loadData(srcPath)
        for field, subdir, ext in sections:
            content = data[field]
            # Fix: the original fell through silently (neither saved nor
            # counted as failed) when a field was truthy but <= 10 chars;
            # now we try the next field and otherwise count a failure.
            if content and len(content) > 10:
                _saveOpinionSection_CL(srcPath, outputPath, subdir, ext,
                                       content)
                counters[field] += 1
                break
        else:
            # Fix: srcPath is already a full path; the original printed
            # sourcePath + file, producing a bogus doubled path.
            print("Failed to load file: ", srcPath)
            failCounter += 1

    print("\nTotal lawbox:", counters['html_lawbox'])
    print("Total HTMLwithCitations:", counters['html_with_citations'])
    print("Total plainText:", counters['plain_text'])
    print("failed to load:", failCounter)
Exemple #3
0
def getData_CL(jurisdiction):
    """Extract and process raw CourtListener data for one jurisdiction.

    Args:
        jurisdiction: "Delaware", "Pennsylvania", or "Federal".

    Returns:
        True on success, False for an unknown jurisdiction.
    """
    # TODO(review): the download step had been disabled by wrapping it in a
    # string literal (which also masqueraded as the docstring); re-enable
    # when downloads are wanted again:
    # print("Downloading data  . . . . .")
    # downloadData_CL(jurisdiction)

    print("Extracting data . . . . .")
    extractData_CL(jurisdiction)

    print("Processing data . . . . .")
    if jurisdiction not in ("Delaware", "Pennsylvania", "Federal"):
        print("Invalid jurisdiction.  (getData_CL())")
        return False
    sourcepath = rawDataDir_CL + jurisdiction + "/"

    flist = getFileList(sourcepath, ext="", recursive=True)
    for f in flist:
        # Preserve the last two directory levels of the source file in the
        # save path (presumably court/year folders — confirm upstream).
        parts = f.split('/')
        savepath = proDataDir_CL + parts[-3] + "/" + parts[-2] + "/"
        processJSON_CL(f, savepath, recursive=False)

    print("Done.")
    return True
Exemple #4
0
def parseAndSaveOP_all(jx='DE'):
    """Process all downloaded opinion files and save them into the database.

    Args:
        jx: Jurisdiction code — 'DE' (Delaware), 'PA' (Pennsylvania), or
            'US' (Federal; lawbox opinions only).

    Returns:
        True — even for an unrecognized *jx*, matching the original behavior.
    """
    # Hoisted so the long base path is written once.
    clBase = "/home/dan/Data/CourtListener/Processed/"

    if jx == 'DE':
        flist_tx = (getFileList(clBase + "Delaware", ".clwc", True) +
                    getFileList(clBase + "Delaware", ".cltx", True) +
                    getFileList("/home/dan/Data/DelawareGov/Processed/",
                                ".detx", True))
        flist_lb = getFileList(clBase + "Delaware", ".cllb", True)
        for f in flist_tx:
            parseAndSaveOP_text(f)
        for f in flist_lb:
            parseAndSaveOP_LB(f)

    # elif: the branches are mutually exclusive, so don't re-test.
    elif jx == 'PA':
        flist_tx = (getFileList(clBase + "Pennsylvania", ".clwc", True) +
                    getFileList(clBase + "Pennsylvania", ".cltx", True))
        flist_lb = getFileList(clBase + "Pennsylvania", ".cllb", True)
        for f in flist_tx:
            parseAndSaveOP_text(f)
        for f in flist_lb:
            parseAndSaveOP_LB(f)

    # ONLY DOES LB OPINIONS
    # EXPAND TO INCLUDE SECOND AND THIRD CIRCUIT WC FILES
    elif jx == 'US':
        flist_lb = getFileList(clBase + "Federal", ".cllb", True)
        print("Parsing",
              len(flist_lb),
              " US opinion files. Completed: ",
              end='')
        for i, f in enumerate(flist_lb):
            # Progress marker every 1000 files.
            if i % 1000 == 0:
                print(i, " . . . ", end='')
            parseAndSaveOP_LB(f)
        print("Done.")

    return True
Exemple #5
0
def analyzeJSON_CL(path, recursive=False):
    """Tally which content fields are populated across CourtListener JSON files.

    A field counts as populated when it is non-None and longer than 10
    characters.  Prints a summary with percentages plus the key set of the
    last file examined.

    Args:
        path: Folder to scan for .json files.
        recursive: Passed through to getFileList.

    Returns:
        (fileCounter, OCRcounter, lawboxCounter, withCitesCounter)
    """
    fileList = getFileList(path, ext='.json', recursive=recursive)

    fileCounter = len(fileList)
    if fileCounter == 0:
        # Fix: the original crashed on an empty folder (ZeroDivisionError
        # in the percentage prints and an unbound `data` after the loop).
        print("No JSON files found in", path)
        return 0, 0, 0, 0

    OCRcounter = 0
    lawboxCounter = 0
    notLawboxCounter = 0
    withCitesCounter = 0
    htmlCounter = 0
    plainTextCounter = 0
    for file in fileList:
        data = loadData(file)
        # NOTE(review): compares against the *string* 'true' — if loadData
        # uses json.load this field would be a bool and never match; confirm.
        # (The None pre-check was redundant: None == 'true' is False anyway.)
        if data['extracted_by_ocr'] == 'true':
            OCRcounter += 1
        lawbox = data['html_lawbox']
        if lawbox is not None and len(lawbox) > 10:
            lawboxCounter += 1
        else:
            # Missing or too-short lawbox HTML both count as "not lawbox"
            # (merges the three overlapping checks in the original).
            notLawboxCounter += 1
        withCites = data['html_with_citations']
        if withCites is not None and len(withCites) > 10:
            withCitesCounter += 1
        html = data['html']
        if html is not None and len(html) > 10:
            htmlCounter += 1
        plainText = data['plain_text']
        if plainText is not None and len(plainText) > 10:
            plainTextCounter += 1

    #Testing
    print('\n', "Total Number of JSON Files:", fileCounter)
    print("Number OCR Used:", OCRcounter,
          "({0:.2f}%)".format(OCRcounter / fileCounter * 100))
    print("lawbox:", lawboxCounter,
          "({0:.2f}%)".format(lawboxCounter / fileCounter * 100))
    print("notLawbox:", notLawboxCounter,
          "({0:.2f}%)".format(notLawboxCounter / fileCounter * 100))
    print("html_with_cites:", withCitesCounter,
          "({0:.2f}%)".format(withCitesCounter / fileCounter * 100))
    print("html:", htmlCounter,
          "({0:.2f}%)".format(htmlCounter / fileCounter * 100))
    print("plain_text:", plainTextCounter,
          "({0:.2f}%)".format(plainTextCounter / fileCounter * 100))
    # `data` is the last file's dict; safe because fileCounter > 0 here.
    print('\n', data.keys(), '\n')

    return fileCounter, OCRcounter, lawboxCounter, withCitesCounter
Exemple #6
0
def saveAllPDFtext_DEGOV(path=rawDataDir_DEGOV):
    """Run savePDFtext_DEGOV on every PDF found under *path*, recursively.

    Returns:
        True on completion.
    """
    for pdfPath in getFileList(path, ext=".pdf", recursive=True):
        savePDFtext_DEGOV(pdfPath)
    return True