def run(folder, logger=None, minDelay=10):
    log = logger.info if logger is not None else print  # fall back to stdout when no logger is given
    listOfFiles = getFiles(folderPath=folder)
    log(f"Got {len(listOfFiles)} files to parse.")
    if len(listOfFiles) > 0:
        counter = 0
        skipCounter = 0
        for index, fileName in enumerate(listOfFiles, start=1):
            try:
                log(f"Starting to parse file {index} of {len(listOfFiles)}...")
                doc = readData('', fileName)  # fileName already includes the path, so os.sep is not needed
                if len(doc) < 1:  # edge case: the file is empty
                    log("Skipped: empty file")
                    skipCounter += 1
                    moveFile(doc, fileName, folder, unhandledFolder)
                    continue
                elif 'פני:' not in str(doc['Doc Details']):  # 'פני:' ("before:") is missing in the old case format
                    log("Skipped: missing key")
                    skipCounter += 1
                    moveFile(doc, fileName, folder, unhandledFolder)
                    continue
                # parser returns a dict on success, the raw text otherwise
                doc['Doc Details'], succeed = parser(doc['Doc Details'])
                writeFolder = handledFolder if succeed else unhandledFolder

                if succeed:
                    # merge the info data into the doc details and drop the old duplicate;
                    # 'עמודים' ("pages") is reduced to its first numeric value
                    for key in doc['Doc Info']:
                        doc['Doc Details'][key] = doc['Doc Info'][key] if key != 'עמודים' \
                            else [int(s) for s in doc['Doc Info'][key].split() if s.isdigit()][0]
                    doc.pop('Doc Info', None)
                    counter += 1
                    log(f"File {index} succeeded")
                else:
                    log(f"File {index} failed")
                moveFile(doc, fileName, folder, writeFolder)
            except Exception:  # keep the batch going on unexpected errors
                if logger is not None:
                    logger.exception(f"Unexpected error while parsing file {index}")
        log(f"{counter} files succeeded, {skipCounter} files skipped, "
            f"{len(listOfFiles) - counter - skipCounter} files failed, total {len(listOfFiles)} files")
    else:
        log('Parser finished its job.')
        # nothing to do - wait a bit before the next scan
        callSleep(logger=logger, minutes=minDelay)
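
For context, the later examples in this listing wrap such workers in a Logger plus an endless loop. A minimal driver for run, sketched with the same Logger/getPath/sep helpers used in Examples #3-#5 (parserSync and parseFolder are illustrative names, not from the source):

def parserSync():
    _logger = Logger('parser.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        # run() sleeps on its own (minDelay) whenever the folder turns up empty
        run(parseFolder, logger=_logger, minDelay=10)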
Example #2
    def start_crawler(self, index):
        callSleep(seconds=index * 5)  # stagger the crawlers' start times so they don't grab the same page
        crawler = None
        canIGO = self.getSettings('crawler Run')

        while canIGO:
            crawler = Crawler(index=index, delay=2, site=self.site) if crawler is None else crawler
            try:
                date, link, first, last, caseList = self.get_link()
                if first <= last or last == -1:
                    message = f'Starting to scrape date: {date}'
                    self.logger.info(message)
                    t1 = time()
                    self.get_Cases_Data(crawler, date, link, first, last, caseList)
                    message = f'Finished crawling page with the date: {date}, it took {(time() - t1) / 60:.2f} minutes'
                else:
                    message = 'Nothing to crawl here'
                self.logger.info(message)

            except WebDriverException:
                message = f'Browser closed or crashed - restart value is {canIGO}'
                self.logger.exception(message)
            except Exception:
                message = 'Unknown error'
                self.logger.exception(message)
            finally:
                canIGO = self.getSettings('crawler Run')  # re-check the kill switch after every page

        if crawler is not None:
            crawler.close()
        callSleep(minutes=10)
        self.start_crawler(index=index)  # restart after the pause; note this recurses rather than loops
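
The index-based stagger at the top of start_crawler implies several crawler workers running side by side. One plausible way to launch them, assuming start_crawler is a method on some scraper object (scraper and numCrawlers are illustrative names, not from the source):

from threading import Thread

numCrawlers = 3
for i in range(numCrawlers):
    # each thread sleeps index * 5 seconds before touching the site, so no two start on the same page
    Thread(target=scraper.start_crawler, kwargs={'index': i}, daemon=True).start()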
Example #3
def downloadSync(loop=True):
    _logger = Logger('downloadSync.log',
                     getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        total = 0
        db = DB().getDB('SupremeCourt')
        for folder in downloadFolders:
            counter = 0
            connection = db.get_collection(getFolderName(folder))
            cursor = list(connection.find({}))
            fileList = [
                file.replace(folder, '')
                for file in getFiles(folderPath=folder)
            ]  # extract file name
            for file in cursor:
                if file['name'] not in fileList:
                    saveData(file['data'], file['name'], folder)
                    counter += 1
            total += counter
            _logger.info(
                f"Total {counter} files were downloaded into {folder}")
        _logger.info(f"Total {total} files were downloaded")
        if loop is False:
            break
        callSleep(logger=_logger, hours=1)
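
A note on the membership test above: fileList is a plain list, so `file['name'] not in fileList` rescans it for every document in the cursor. For large folders the same check runs in constant time against a set; a minimal sketch using the same helpers:

existing = {file.replace(folder, '') for file in getFiles(folderPath=folder)}
for file in cursor:
    if file['name'] not in existing:  # set lookup instead of a list scan
        saveData(file['data'], file['name'], folder)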
Example #4
def main():
    _logger = Logger('elasticsearch.log',
                     getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        Elastic_5_5_3(
            _logger).start_index()  # start indexing into the Elastic DB
        callSleep(
            logger=_logger, minutes=10
        )  # after finishing with all the files, wait a bit before the next pass
Example #5
def uploadSync(loop=True):
    _logger = Logger('uploadSync.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        total = 0
        uCounter = 0
        sCounter = 0
        db = DB().getDB('SupremeCourt')

        for folder in uploadFolders.keys():
            connection = db.get_collection(getFolderName(folder))
            cursor = list(connection.find({}))
            backupFileList = [file['name'] for file in cursor]
            listOfFiles = getFiles(folderPath=folder)
            total += len(listOfFiles)
            _logger.info(
                f"Got {len(listOfFiles)} files to upload in folder {folder}..."
            )
            if len(listOfFiles) > 0:
                index = 0
                for fileName in listOfFiles:
                    index += 1
                    _logger.info(
                        f"Starting to upload file {index} of {len(listOfFiles)}... "
                    )
                    data = readData(fileName, '')
                    fixData(fileName, data)
                    fullFilePath = fileName
                    fileName = fileName.replace(folder, '')  # strip the folder prefix to keep the bare file name
                    if fileName not in backupFileList:
                        try:
                            connection.insert_one({
                                "name": fileName,
                                "data": data
                            })
                            uCounter += 1
                            _logger.info(f"Succeed to upload file {fileName}")
                            if folder != uploadFolders[
                                    folder]:  # move file if folders are different
                                changeDir(
                                    fullFilePath,
                                    uploadFolders[folder],
                                    deleteSourceIfDestinationFileExist=True)
                        except Exception as e:  # TODO: catch a more specific exception type
                            _logger.info(
                                f"Failed to upload file {fullFilePath}")
                            _logger.info(e)
                    else:
                        _logger.info("Skipped")
                        sCounter += 1

        _logger.info(
            f"{uCounter} files Uploaded...\n{sCounter} files Skipped...\n{total - uCounter - sCounter} Failed...\nTotal {total} files"
        )
        if loop is False:
            break
        callSleep(logger=_logger, minutes=10)
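
uploadSync avoids duplicates by pre-loading every stored name into backupFileList and checking it in Python. If connection is a pymongo collection, as the insert_one call suggests, the same intent can be expressed as an upsert keyed on the file name, which also removes the gap between the check and the insert; a sketch:

connection.update_one({"name": fileName},
                      {"$set": {"data": data}},
                      upsert=True)  # insert when the name is new, overwrite otherwise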
Example #6
    def getColumnText(self, crawler, index):
        string = self.get_string_by_index('no info column', index)
        elem = crawler.find_elem('xpath', string, raise_error=False)
        update = crawler.read_elem_text(elem)
        if update is True:  # the column explicitly reports that there is no info
            return None

        string = self.get_string_by_index('inside column', index)
        elem = crawler.find_elem('xpath', string)
        callSleep(seconds=1)

        # column 1 holds the general details; the remaining columns hold other case details
        if index == 1:
            func = self.getGeneralDetails
        else:
            func = self.getOtherCaseDetails

        if elem is not None:
            return func(crawler, index)
        else:
            return None
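
getColumnText returns None both when the column advertises 'no info' and when the inner element is missing, so a caller can simply probe the columns and keep whatever comes back. A hypothetical caller sketch (numColumns is illustrative, and the per-column parsers are assumed to return dicts):

details = {}
for i in range(1, numColumns + 1):
    column = self.getColumnText(crawler, i)
    if column is not None:
        details.update(column)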