def run(folder, logger=None, minDelay=10):
    listOfFiles = getFiles(folderPath=folder)
    message = f"Got {len(listOfFiles)} files to parse."
    logger.info(message) if logger is not None else print(message)
    if len(listOfFiles) > 0:
        index = 0
        counter = 0
        skipCounter = 0
        for fileName in listOfFiles:
            try:
                index += 1
                message = f"Starting to parse file {index} of {len(listOfFiles)}... "
                logger.info(message) if logger is not None else print(message, end='')
                doc = readData('', fileName)  # fileName includes the path, so os.sep is not needed
                if len(doc) < 1:  # edge case - we got an empty file
                    message = "Skipped because the file is empty"
                    logger.info(message) if logger is not None else print(message)
                    skipCounter += 1
                    moveFile(doc, fileName, folder, unhandledFolder)
                    continue
                elif 'פני:' not in str(doc['Doc Details']):  # old type of case
                    message = "Skipped because of a missing key"
                    logger.info(message) if logger is not None else print(message)
                    skipCounter += 1
                    moveFile(doc, fileName, folder, unhandledFolder)
                    continue
                doc['Doc Details'], succeed = parser(doc['Doc Details'])  # dict if succeed, else text
                writeFolder = handledFolder if succeed else unhandledFolder
                if succeed:
                    # insert the info data into the doc details and remove the old duplicate
                    for key in doc['Doc Info']:
                        doc['Doc Details'][key] = doc['Doc Info'][key] if key != 'עמודים' \
                            else [int(s) for s in doc['Doc Info'][key].split() if s.isdigit()][0]
                    doc.pop('Doc Info', None)
                    counter += 1
                    logger.info(f"File {index} succeeded") if logger is not None else print('Succeeded')
                else:
                    logger.info(f"File {index} failed") if logger is not None else print('Failed')
                moveFile(doc, fileName, folder, writeFolder)
            except Exception:  # narrowed from a bare except; skip files that fail unexpectedly
                pass
        message = (f"{counter} files succeeded, {skipCounter} files skipped, "
                   f"{len(listOfFiles) - counter - skipCounter} files failed, "
                   f"total {len(listOfFiles)} files")
        logger.info(message) if logger is not None else print(message)
    else:
        message = 'Parser finished its job.'
        logger.info(message) if logger is not None else print(message)
    callSleep(logger=logger, minutes=minDelay)  # after finishing with all the files, wait a bit
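# The logger-or-print conditional in run() repeats for every message. A minimal
# sketch of a helper that could collapse it, assuming the same fallback
# semantics (the name `log` is hypothetical, not part of the original module):
def log(message, logger=None, end='\n'):
    """Route a message to the logger when one exists, otherwise print it."""
    if logger is not None:
        logger.info(message)
    else:
        print(message, end=end)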
def start_crawler(self, index):
    # crawlers start at different times to ensure they don't take the same page
    callSleep(seconds=index * 5)
    crawler = None
    canIGO = self.getSettings('crawler Run')
    while canIGO:
        crawler = Crawler(index=index, delay=2, site=self.site) if crawler is None else crawler
        try:
            date, link, first, last, caseList = self.get_link()
            if first <= last or last == -1:
                message = f'Starting to scrape date: {date}'
                self.logger.info(message)
                t1 = time()
                self.get_Cases_Data(crawler, date, link, first, last, caseList)
                message = f'Finished crawling the page with the date: {date}, it took {(time() - t1) / 60} minutes'
            else:
                message = 'Nothing to crawl here'
            self.logger.info(message)
        except WebDriverException:
            message = f'browser closed or crashed - restart value is {canIGO}'
            self.logger.exception(message)
        except Exception:
            message = 'unknown error'
            self.logger.exception(message)
        finally:
            canIGO = self.getSettings('crawler Run')
    if crawler is not None:
        crawler.close()
    callSleep(minutes=10)
    self.start_crawler(index=index)  # restart and poll the settings again
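# start_crawler restarts itself by tail recursion, so every 10-minute poll adds
# a stack frame until Python's recursion limit is reached. A minimal sketch of
# an iterative wrapper with the same polling behavior, assuming start_crawler
# is refactored to return instead of calling itself (the name `crawler_loop`
# is hypothetical):
def crawler_loop(self, index):
    while True:
        self.start_crawler(index=index)  # assumed to return once 'crawler Run' is off
        callSleep(minutes=10)            # wait before polling the settings again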
def downloadSync(loop=True):
    _logger = Logger('downloadSync.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        total = 0
        db = DB().getDB('SupremeCourt')
        for folder in downloadFolders:
            counter = 0
            connection = db.get_collection(getFolderName(folder))
            cursor = list(connection.find({}))
            # strip the folder prefix to get bare file names
            fileList = [file.replace(folder, '') for file in getFiles(folderPath=folder)]
            for file in cursor:
                if file['name'] not in fileList:
                    saveData(file['data'], file['name'], folder)
                    counter += 1
            total += counter
            _logger.info(f"Total {counter} files were downloaded into {folder}")
        _logger.info(f"Total {total} files were downloaded")
        if loop is False:
            break
        callSleep(logger=_logger, hours=1)
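# A minimal usage sketch: run a single sync pass instead of the hourly loop,
# e.g. under an external scheduler (assumes this function lives in its own
# runnable script):
if __name__ == '__main__':
    downloadSync(loop=False)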
def main():
    _logger = Logger('elasticsearch.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        Elastic_5_5_3(_logger).start_index()  # index the products into the elastic DB
        callSleep(logger=_logger, minutes=10)  # after finishing with all the files, wait a bit
def uploadSync(loop=True):
    _logger = Logger('uploadSync.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        total = 0
        uCounter = 0
        sCounter = 0
        db = DB().getDB('SupremeCourt')
        for folder in uploadFolders.keys():
            connection = db.get_collection(getFolderName(folder))
            cursor = list(connection.find({}))
            backupFileList = [file['name'] for file in cursor]
            listOfFiles = getFiles(folderPath=folder)
            total += len(listOfFiles)
            _logger.info(f"Got {len(listOfFiles)} files to upload in folder {folder}...")
            if len(listOfFiles) > 0:
                index = 0
                for fileName in listOfFiles:
                    index += 1
                    _logger.info(f"Starting to upload file {index} of {len(listOfFiles)}... ")
                    data = readData(fileName, '')
                    fixData(fileName, data)
                    fullFilePath = fileName
                    fileName = fileName.replace(folder, '')  # strip the folder prefix to get the bare file name
                    if fileName not in backupFileList:
                        try:
                            connection.insert_one({"name": fileName, "data": data})
                            uCounter += 1
                            _logger.info(f"Succeeded to upload file {fileName}")
                            if folder != uploadFolders[folder]:  # move the file if the folders are different
                                changeDir(fullFilePath, uploadFolders[folder],
                                          deleteSourceIfDestinationFileExist=True)
                        except Exception as e:  # TODO better Exception
                            _logger.info(f"Failed to upload file {fullFilePath}")
                            _logger.info(e)
                    else:
                        _logger.info("Skipped")
                        sCounter += 1
        _logger.info(f"{uCounter} files uploaded...\n{sCounter} files skipped...\n"
                     f"{total - uCounter - sCounter} failed...\nTotal {total} files")
        if loop is False:
            break
        callSleep(logger=_logger, minutes=10)
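# As with downloadSync, a single pass can be run for testing or under an
# external scheduler (a usage sketch, assuming this runs as its own script):
if __name__ == '__main__':
    uploadSync(loop=False)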
def getColumnText(self, crawler, index):
    string = self.get_string_by_index('no info column', index)
    elem = crawler.find_elem('xpath', string, raise_error=False)
    update = crawler.read_elem_text(elem)
    if update is True:  # no info in this column
        return None
    string = self.get_string_by_index('inside column', index)
    elem = crawler.find_elem('xpath', string)
    callSleep(seconds=1)
    if index == 1:
        func = self.getGeneralDetails
    else:
        func = self.getOtherCaseDetails
    if elem is not None:
        return func(crawler, index)
    else:
        return None
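# getColumnText picks its parser with an if/else on the column index. If more
# column types are ever added, a dispatch table keeps the mapping in one place.
# A minimal sketch, assuming the same two handler methods (the helper name
# `getColumnHandler` is hypothetical):
def getColumnHandler(self, index):
    """Map a column index to its parser method; column 1 holds general details."""
    handlers = {1: self.getGeneralDetails}
    return handlers.get(index, self.getOtherCaseDetails)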