Example #1
def createDirStructure(results, MOD_PDF_PATH):
    # Create the download directory for each result row before the PDFs are fetched
    for result in results:
        dirStructure = getDirStructurePath(result[3], MOD_PDF_PATH)
        akmutility.setup_download_dir(dirStructure)
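
The helpers referenced here (getDirStructurePath, akmutility.setup_download_dir) are project-specific and not shown. A minimal, hypothetical sketch of getDirStructurePath, assuming result[3] holds a per-judgement subfolder name and that os is imported:

def getDirStructurePath(subFolder, basePath):
    # Hypothetical sketch only, not the project's actual helper:
    # join the per-result subfolder onto the base PDF path
    return os.path.join(basePath, subFolder)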


if __name__ == "__main__":
    benchCourtList = akmutility.getBenchList()
    judicialBodiesList = akmutility.getJudicialBodiesList()
    # current date and time
    now = datetime.now()
    CURRENT_DATE_DIR = now.strftime("%Y%m%d")
    for SOURCE in judicialBodiesList:
        for BENCH_COURT in benchCourtList:
            logger.info(akmutility.getStringWithThis("*"))
            logger.info(
                f'JUDGEMENTS DATA DOWNLOAD from {SOURCE} for Bench {BENCH_COURT} STARTED!'
            )
            START_TIME = time.time()
            PDF_PATH = akmutility.getjudgementoutputpath(SOURCE)
            MOD_PDF_PATH = PDF_PATH + "/" + BENCH_COURT + "/"
            try:
                results = getlinksandfilenamesforjudgements(
                    SOURCE, BENCH_COURT)
                if results:
                    createDirStructure(results, MOD_PDF_PATH)
                with ThreadPoolExecutor() as executor:
                    # Create a new partially applied function that stores the directory argument.
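                    # NOTE: the original example is truncated at this point; what
                    # follows is a minimal sketch of how the block plausibly
                    # continues. It assumes a hypothetical
                    # downloadfile(directory, result) helper and
                    # "from functools import partial".
                    download_fn = partial(downloadfile, MOD_PDF_PATH)
                    executor.map(download_fn, results)
            except Exception as err:
                logger.error(f'Download from {SOURCE} for {BENCH_COURT} failed: {err}')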

Example #2

def start(inputVar):
    # Valid sources: PA_CIRP, PA_VLP, PA_LP, IRP
    START_TIME = time.time()

    [SOURCE, MAIN_URL_PATH] = akmutility.getURLforsource(inputVar)
    # print("Main Path "+MAIN_URL_PATH)
    # Get SOUP
    SOUP = getsoup(1, MAIN_URL_PATH, flag=False)

    # Get the last page number for pagination
    LAST_PAGE = getlastpage(SOUP, SOURCE)

    # Get the last updated record, if any, so only the delta records are loaded
    result = dbpersistence.getlastupdatedrecord(SOURCE, CURSOR)
    updateflag = False
    dbupdate = False
    if result is not None:
        logger.info("Latest records will be updated.")
        logger.info("Updating........................")
        # Strip the tuple formatting from the DB row
        resultstr = str(result).replace(",", "").replace("'", "").replace(
            "(", "").replace(")", "")
        logger.info("Last Updated Record Details - " + resultstr)
        for i in range(DELTA_PAGE, 0, -1):
            logger.info("Scanning page......")
            SOUP = getsoup(i, MAIN_URL_PATH, flag=True)
            # Count the data rows (row 0 is the header)
            no_rows = len(SOUP.find_all("tr")) - 1
            # Walk the data rows bottom-up so older records are inserted first
            for row in SOUP.find_all("tr")[no_rows:0:-1]:
                # Get all cells inside the row
                basic_data_cells = row.find_all("td")
                if updateflag:
                    dbInsert(basic_data_cells, SOURCE)
                    dbupdate = True
                elif resultstr == getkeyvalues(basic_data_cells, SOURCE):
                    # Found the last record already in the DB; everything
                    # after this point is new and will be inserted
                    updateflag = True
    else:
        # No previous record, so load everything, last page first
        for i in range(LAST_PAGE, 0, -1):
            SOUP = getsoup(i, MAIN_URL_PATH, flag=True)
            # Count the data rows (row 0 is the header)
            no_rows = len(SOUP.find_all("tr")) - 1
            for row in SOUP.find_all("tr")[no_rows:0:-1]:
                # Get all cells inside the row
                basic_data_cells = row.find_all("td")
                dbInsert(basic_data_cells, SOURCE)

    if not dbupdate:
        logger.info(SOURCE + " database is up to date!")

    CURSOR.close()
    MY_DB.close()

    END_TIME = time.time()

    logger.info(akmutility.getStringWithThis("*"))
    logger.info(
        f"Time taken to scrape the IBBI information from {inputVar}: {END_TIME - START_TIME:.2f} seconds"
    )
    logger.info(akmutility.getStringWithThis("*"))
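
A hedged usage sketch: the valid source keys noted above suggest the entry point would be driven like this (assuming the module-level MY_DB connection and CURSOR are initialised before start() runs):

if __name__ == "__main__":
    start('PA_CIRP')  # any of PA_CIRP, PA_VLP, PA_LP, IRP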
Example #3
def start(SOURCE, benchCourtList, fromDate):
    toDate = constructDate(date.today())
    for bench in benchCourtList:
        logger.info('Scraping -----> ' + bench)
        # Note: the fromDate argument is recomputed per bench from the DB
        fromDate = getJudgementFromDate(bench, toDate, SOURCE)
        logger.info(akmutility.getStringWithThis("*"))
        logger.info(
            f"Looking for Judgements for {bench} from {fromDate} to {toDate}")
        courtId = parser.get('bench', bench)
        MAIN_URL_PATH = (
            "https://nclt.gov.in/judgement-date-wise?field_bench_target_id=" + courtId
            + "&field_search_date_value%5Bmin%5D%5Bdate%5D=" + fromDate
            + "&field_search_date_value%5Bmax%5D%5Bdate%5D=" + toDate
            + "&page="
        )
        # Fetch the first results page
        SOUP = getsoup(0, MAIN_URL_PATH, flag=True)
        no_rows = len(SOUP.find_all("tr")) - 1
        if no_rows <= 0:
            logger.error(
                MAIN_URL_PATH +
                ' - Data not available at the moment. Try again after some time!'
            )
            continue
        # Get the last page number for pagination
        LAST_PAGE = getlastpage(SOUP)
        # Get the last updated record, if any, so only the delta records are loaded
        resultSet = dbpersistence.getlastupdatedjudgementrecord(
            bench, SOURCE, CURSOR)
        # Strip the tuple formatting from the DB row, e.g. "('X123',)" -> "X123"
        resultstr = str(resultSet)[2:-3]
        updateflag = False
        dbupdate = False
        if resultSet is not None:
            logger.info("Latest records will be updated.")
            logger.info("Updating........................")
            logger.info("Last Updated Record Details - " + resultstr)
            # Use LAST_PAGE rather than DELTA_PAGE: the scan has to traverse
            # the whole date range
            for i in range(LAST_PAGE, -1, -1):
                logger.info("Scanning page......")
                SOUP = getsoup(i, MAIN_URL_PATH, flag=True)
                # Get all data associated with this class
                no_rows = len(SOUP.find_all("tr")) - 1
                if no_rows <= 0:
                    logger.error(
                        MAIN_URL_PATH +
                        ' - Data Not Available at the moment. Try again after sometime!'
                    )
                    break
                for row in SOUP.find_all("tr")[no_rows:0:-1]:
                    # Get all cells inside the row
                    basic_data_cells = row.find_all("td")
                    # Skip rows that are not IBC cases
                    if not checkIBCCases(basic_data_cells):
                        continue
                    if updateflag:
                        dbInsert(basic_data_cells, bench, SOURCE)
                        dbupdate = True
                    elif resultstr == getkeyvalues(basic_data_cells):
                        # Found the last record already in the DB; everything
                        # after this point is new and will be inserted
                        updateflag = True
        else:
            # No previous record, so load everything, last page first
            for i in range(LAST_PAGE, -1, -1):
                SOUP = getsoup(i, MAIN_URL_PATH, flag=True)
                # Count the data rows (row 0 is the header)
                no_rows = len(SOUP.find_all("tr")) - 1
                for row in SOUP.find_all("tr")[no_rows:0:-1]:
                    # Get all cells inside the row
                    basic_data_cells = row.find_all("td")
                    if not checkIBCCases(basic_data_cells):
                        continue
                    dbInsert(basic_data_cells, bench, SOURCE)

        if not dbupdate:
            logger.info(SOURCE + " database is up to date!")

        logger.info(
            f"Judgements scraping for {bench} from {fromDate} to {toDate} - Completed!!"
        )

    CURSOR.close()
    MY_DB.close()
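
A hedged usage sketch for this example, assuming the bench list comes from akmutility.getBenchList() as in Example #1, and that 'NCLT' is a valid SOURCE key (the scraped site is nclt.gov.in):

if __name__ == "__main__":
    benchCourtList = akmutility.getBenchList()
    # fromDate is only a fallback; start() recomputes it per bench from the DB
    start('NCLT', benchCourtList, constructDate(date.today()))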