def createDirStructure(results, MOD_PDF_PATH):
    for result in results:
        dirStructure = getDirStructurePath(result[3], MOD_PDF_PATH)
        akmutility.setup_download_dir(dirStructure)


if __name__ == "__main__":
    benchCourtList = akmutility.getBenchList()
    judicialBodiesList = akmutility.getJudicialBodiesList()
    # current date and time
    now = datetime.now()
    CURRENT_DATE_DIR = now.strftime("%Y%m%d")
    for SOURCE in judicialBodiesList:
        for BENCH_COURT in benchCourtList:
            # BENCH_COURT = BENCH.replace('Bench', '').replace(' ', '')
            logger.info(akmutility.getStringWithThis("*"))
            logger.info(
                f'JUDGEMENTS DATA DOWNLOAD from {SOURCE} for Bench {BENCH_COURT} STARTED !'
            )
            START_TIME = time.time()
            PDF_PATH = akmutility.getjudgementoutputpath(SOURCE)
            MOD_PDF_PATH = PDF_PATH + "/" + BENCH_COURT + "/"
            try:
                results = getlinksandfilenamesforjudgements(SOURCE, BENCH_COURT)
                # print(results)
                if len(results) > 0:
                    createDirStructure(results, MOD_PDF_PATH)
                    # fileNames = getFileNamesFromDB(source)
                    with ThreadPoolExecutor() as executor:
                        # Create a new partially applied function that stores the directory argument.
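                        # The remainder of this block is truncated in this section. A
                        # minimal sketch of the pattern the comment above describes,
                        # assuming functools.partial is imported and a per-file helper
                        # downloadJudgement(result, pdfDirPath) exists (both hypothetical,
                        # not shown in the source):
                        #
                        #     downloadFn = partial(downloadJudgement, pdfDirPath=MOD_PDF_PATH)
                        #     executor.map(downloadFn, results)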
def start(inputVar):
    START_TIME = time.time()
    # inputVar = str(
    #     input('Provide the source from where the information has to be scraped [PA_CIRP, PA_VLP, PA_LP, IRP] : '))
    # [SOURCE, MAIN_URL_PATH] = akmutility.checkinputargumentandgeturl(inputVar)
    # logger.info(
    #     "*********************************************************************************************************")
    # logger.info(f'IBBI WEB MASTER DATA scrapping - {inputVar} .....')
    [SOURCE, MAIN_URL_PATH] = akmutility.getURLforsource(inputVar)
    # print("Main Path "+MAIN_URL_PATH)

    # Get SOUP
    SOUP = getsoup(1, MAIN_URL_PATH, flag=False)
    # Get Last Page
    LAST_PAGE = getlastpage(SOUP, SOURCE)
    # print(LAST_PAGE)
    # LAST_PAGE = 1

    # Get Last Updated Record if any - to load the delta records
    result = dbpersistence.getlastupdatedrecord(SOURCE, CURSOR)
    # print(result)
    updateflag = False
    dbupdate = False
    if result is not None:
        logger.info("Latest records will be updated.")
        logger.info("Updating........................")
        resultstr = str(result).replace(",", "").replace("'", "").replace("(", "").replace(")", "")
        logger.info("Last Updated Record Details - " + resultstr)
        for i in range(DELTA_PAGE, 0, -1):
            logger.info("Scanning page......")
            SOUP = getsoup(i, MAIN_URL_PATH, flag=True)
            # Get all data associated with this class
            no_rows = len(SOUP.find_all("tr")) - 1
            # sys.exit()
            for row in SOUP.find_all("tr")[no_rows:0:-1]:
                # Get all cells inside the row
                basic_data_cells = row.findAll("td")
                if updateflag:
                    dbInsert(basic_data_cells, SOURCE)
                    dbupdate = True
                else:
                    # print("update = false")
                    # print(getkeyvalues(basic_data_cells))
                    if resultstr == getkeyvalues(basic_data_cells, SOURCE):
                        # print("Result same as key values " + getkeyvalues(basic_data_cells))
                        updateflag = True
                        continue
                    else:
                        continue
    else:
        for i in range(LAST_PAGE, 0, -1):
            SOUP = getsoup(i, MAIN_URL_PATH, flag=True)
            # Get all data associated with this class
            no_rows = len(SOUP.find_all("tr")) - 1
            # sys.exit()
            for row in SOUP.find_all("tr")[no_rows:0:-1]:
                # Get all cells inside the row
                basic_data_cells = row.findAll("td")
                dbInsert(basic_data_cells, SOURCE)
    if not dbupdate:
        logger.info(SOURCE + " Database is up-to-date!")
    MY_DB.close()
    CURSOR.close()
    END_TIME = time.time()
    # logger.info("******************************************************************************************************************")
    logger.info(akmutility.getStringWithThis("*"))
    logger.info(
        f"Time taken to scrape the IBBI information from {inputVar} is {END_TIME - START_TIME} seconds"
    )
    logger.info(akmutility.getStringWithThis("*"))
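# Usage sketch (assumption, not shown in this section): judging by the commented-out
# input prompt above, start() is driven with one of the IBBI source codes, e.g.:
#
#     for src in ["PA_CIRP", "PA_VLP", "PA_LP", "IRP"]:
#         start(src)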
def start(SOURCE, benchCourtList, fromDate):
    toDate = constructDate(date.today())
    for bench in benchCourtList:
        logger.info('Scraping -----> ' + bench)
        fromDate = getJudgementFromDate(bench, toDate, SOURCE)
        logger.info(akmutility.getStringWithThis("*"))
        logger.info(
            f"Looking for Judgements for {bench} from {fromDate} to {toDate}")
        courtId = parser.get('bench', bench)
        MAIN_URL_PATH = ("https://nclt.gov.in/judgement-date-wise?field_bench_target_id=" + courtId
                         + "&field_search_date_value%5Bmin%5D%5Bdate%5D=" + fromDate
                         + "&field_search_date_value%5Bmax%5D%5Bdate%5D=" + toDate
                         + "&page=")
        # Get SOUP
        SOUP = getsoup(0, MAIN_URL_PATH, flag=True)
        no_rows = len(SOUP.find_all("tr")) - 1
        if no_rows <= 0:
            logger.error(
                MAIN_URL_PATH +
                ' - Data Not Available at the moment. Try again after some time!'
            )
            continue
        # Get Last Page
        LAST_PAGE = getlastpage(SOUP)
        # Get Last Updated Record if any - to load the delta records
        resultSet = dbpersistence.getlastupdatedjudgementrecord(bench, SOURCE, CURSOR)
        result = str(resultSet)
        # Strip the surrounding ('...',) from the tuple's string form to get the key value.
        resultstr = result[2:(len(result) - 3)]
        # resultstr = str(result).replace(",", "").replace("'", "").replace("(", "").replace(")", "")
        # print(result)
        updateflag = False
        dbupdate = False
        if resultSet is not None:
            logger.info("Latest records will be updated.")
            logger.info("Updating........................")
            logger.info("Last Updated Record Details - " + resultstr)
            # Use LAST_PAGE instead of DELTA_PAGE, as the scan has to traverse between the two date ranges.
            for i in range(LAST_PAGE, -1, -1):
                logger.info("Scanning page......")
                SOUP = getsoup(i, MAIN_URL_PATH, flag=True)
                # Get all data associated with this class
                no_rows = len(SOUP.find_all("tr")) - 1
                if no_rows <= 0:
                    logger.error(
                        MAIN_URL_PATH +
                        ' - Data Not Available at the moment. Try again after some time!'
                    )
                    break
                # sys.exit()
                for row in SOUP.find_all("tr")[no_rows:0:-1]:
                    # Get all cells inside the row
                    basic_data_cells = row.findAll("td")
                    if not checkIBCCases(basic_data_cells):
                        continue
                    if updateflag:
                        dbInsert(basic_data_cells, bench, SOURCE)
                        dbupdate = True
                    else:
                        # print("update = false")
                        # print(getkeyvalues(basic_data_cells))
                        if resultstr == getkeyvalues(basic_data_cells):
                            # print("Result same as key values " + getkeyvalues(basic_data_cells))
                            updateflag = True
                            continue
                        else:
                            continue
        else:
            for i in range(LAST_PAGE, -1, -1):
                SOUP = getsoup(i, MAIN_URL_PATH, flag=True)
                # Get all data associated with this class
                no_rows = len(SOUP.find_all("tr")) - 1
                # sys.exit()
                for row in SOUP.find_all("tr")[no_rows:0:-1]:
                    # Get all cells inside the row
                    basic_data_cells = row.findAll("td")
                    if not checkIBCCases(basic_data_cells):
                        continue
                    dbInsert(basic_data_cells, bench, SOURCE)
        if not dbupdate:
            logger.info(SOURCE + " Database is up-to-date!")
        logger.info(
            f"Judgements Scraping for {bench} from {fromDate} to {toDate} - Completed!!"
        )
    MY_DB.close()
    CURSOR.close()