def __init__(self, index=1, browser='chrome', delay=1, url=None, site=None):
    logPath = getPath(N=0) + f'logs{sep}{site}{sep}' if site is not None else getPath(N=0) + f'logs{sep}'
    self._logger = Logger(f'crawler_{index}.log', logPath).getLogger()
    self._driver = self.getBrowser(browser)
    self._driver.maximize_window()  # maximize the browser window (fullscreen_window() is an alternative)
    self.update_delay(delay)  # update delay
    self.update_page(url)  # open url
    self._logger.info('Finished crawler initialization')
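For illustration only, constructing the crawler might look like the snippet below; the class name Crawler and the url value are hypothetical placeholders, while the keyword arguments match the signature above.

crawler = Crawler(index=1, browser='firefox', delay=2,          # Crawler is a hypothetical class name
                  url='https://example.org/search',              # hypothetical URL
                  site='SupremeCourt')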
Example #2
def __init__(self, num_of_crawlers=0, site=None):
    logPath = getPath(N=0) + f'logs{sep}{site}{sep}' if site is not None else getPath(N=0) + f'logs{sep}'
    self.logger = Logger(f'{site}_Scraper.log', logPath).getLogger()
    self.db = DB(logger=self.logger).getDB(site)
    self.num_of_crawlers = min(cpu_count(), 4) if num_of_crawlers == 0 else num_of_crawlers  # 0 => up to 4 crawlers (bounded by cpu_count), else the given number
    self.productsFolder = getPath(N=0) + f'products{sep}json_products{sep}'  # product path
    self.backupFolder = getPath(N=0) + f'products{sep}backup_json_products{sep}'
    createDir(self.productsFolder)
def main():
    _logger = Logger('parser.log', getPath(N=0) + f'logs{sep}').getLogger()
    _logger.info("Parser is Starting")
    run(unhandledFolder, _logger, minDelay=0)  # first pass: retry previously unhandled files without waiting
    while True:
        _logger.info("Parser is Starting")
        run(readFolder, _logger)  # keep parsing newly scraped files
def downloadSync(loop=True):
    _logger = Logger('downloadSync.log',
                     getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        total = 0
        db = DB().getDB('SupremeCourt')
        for folder in downloadFolders:
            counter = 0
            connection = db.get_collection(getFolderName(folder))
            cursor = list(connection.find({}))
            fileList = [
                file.replace(folder, '')
                for file in getFiles(folderPath=folder)
            ]  # extract file name
            for file in cursor:
                if file['name'] not in fileList:
                    saveData(file['data'], file['name'], folder)
                    counter += 1
            total += counter
            _logger.info(
                f"Total {counter} files were downloaded into {folder}")
        _logger.info(f"Total {total} files were downloaded")
        if loop is False:
            break
        callSleep(logger=_logger, hours=1)
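For a single synchronization pass instead of the hourly loop, the loop flag can be turned off; a minimal sketch:

if __name__ == '__main__':
    downloadSync(loop=False)  # run one download pass and exit instead of sleeping and repeating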
Example #5
def main():
    _logger = Logger('elasticsearch.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        Elastic_5_5_3(_logger).start_index()  # index products into the Elastic DB
        callSleep(logger=_logger, minutes=10)  # after finishing with all the files, wait a bit - hours * minutes * seconds
def getBrowser(self, browser='chrome'):  # self added so the self.getBrowser(browser) call above binds correctly
    path = f'ILCourtScraper{sep}WebDrivers{sep}'
    if browser == 'chrome':
        return webdriver.Chrome(ChromeDriverManager().install())
    elif browser == 'firefox':
        return webdriver.Firefox(GeckoDriverManager().install())
    elif browser == 'edge':
        if system() == 'Windows':
            return webdriver.Edge(executable_path=getPath(N=0) + path + 'msedgedriver.exe')
def uploadSync(loop=True):
    _logger = Logger('uploadSync.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        total = 0
        uCounter = 0
        sCounter = 0
        db = DB().getDB('SupremeCourt')

        for folder in uploadFolders.keys():
            connection = db.get_collection(getFolderName(folder))
            cursor = list(connection.find({}))
            backupFileList = [file['name'] for file in cursor]
            listOfFiles = getFiles(folderPath=folder)
            total += len(listOfFiles)
            _logger.info(
                f"Got {len(listOfFiles)} files to upload in folder {folder}..."
            )
            if len(listOfFiles) > 0:
                index = 0
                for fileName in listOfFiles:
                    index += 1
                    _logger.info(
                        f"Starting to upload file {index} of {len(listOfFiles)}... "
                    )
                    data = readData(fileName, '')
                    fixData(fileName, data)
                    fullFilePath = fileName
                    fileName = fileName.replace(folder, '')  # extract the file name from the full path
                    if fileName not in backupFileList:
                        try:
                            connection.insert_one({
                                "name": fileName,
                                "data": data
                            })
                            uCounter += 1
                            _logger.info(f"Succeed to upload file {fileName}")
                            if folder != uploadFolders[folder]:  # move the file if source and destination differ
                                changeDir(fullFilePath, uploadFolders[folder],
                                          deleteSourceIfDestinationFileExist=True)
                        except Exception as e:  # TODO better Exception
                            _logger.info(
                                f"Failed to upload file {fullFilePath}")
                            _logger.info(e)
                    else:
                        _logger.info("Skipped")
                        sCounter += 1

        _logger.info(
            f"{uCounter} files Uploaded...\n{sCounter} files Skipped...\n{total - uCounter - sCounter} Failed...\nTotal {total} files"
        )
        if loop is False:
            break
        callSleep(logger=_logger, minutes=10)
def readData(fileName, filePath=None):
    try:
        filePath = getPath() if filePath is None else filePath
        with open(filePath + fileName, encoding='utf8') as json_file:
            data = json.load(json_file)
        return data
    except JSONDecodeError as e:
        print(f'Error in decoding this file: {fileName}')
        print(e)
        return ''
from ILCourtScraper.Extra.logger import Logger
from ILCourtScraper.Extra.time import callSleep
from ILCourtScraper.Extra.json import readData, saveData
from ILCourtScraper.Extra.path import getPath, sep, createDir, getFiles, remove

readFolder = getPath(N=0) + f'products{sep}json_products{sep}'
handledFolder = getPath(N=0) + f'products{sep}handled_json_products{sep}'
unhandledFolder = getPath(N=0) + f'products{sep}unhandled_json_products{sep}'

for f in [readFolder, handledFolder, unhandledFolder]:
    createDir(f)


def clean_spaces(text):
    if type(text) is str:  # if text is a string
        if '\n' in text:  # if there is more than one line
            return clean_spaces(text.splitlines())  # resend it as a list
    else:  # if text is a list
        for index in range(len(text)):  # for each line in the list
            text[index] = clean_spaces(text[index])  # resend one line
        return text

    temp_list = list()  # the returned list of characters
    space = ' '
    for index in range(len(text)):
        if text[index] == space:  # if this is a space
            if index != 0:  # if we are not on the first index
                if text[index - 1] == space:  # if the previous character was a space, skip this one
                    continue
        temp_list.append(text[index])  # keep every other character
    return ''.join(temp_list)  # rebuild the single-line string without duplicate spaces
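A quick sanity check of clean_spaces; the inputs are hypothetical and the expected results assume the reconstructed tail of the function above.

print(clean_spaces('a  b   c'))                 # -> 'a b c'
print(clean_spaces('one  line\ntwo   lines'))   # -> ['one line', 'two lines']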
Example #10
def saveData(data, fileName=None, filePath=None):
    fileName = f"dataFromScraper_{currTime()}" if fileName is None else fileName
    filePath = getPath() if filePath is None else filePath
    with open(filePath + fileName, 'w') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=True)
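A minimal round trip through saveData and readData, assuming the readFolder path defined in the parser module above; the record and file name are hypothetical.

record = {'case': '1234/20', 'status': 'closed'}  # hypothetical product record
saveData(record, fileName='example.json', filePath=readFolder)
assert readData('example.json', filePath=readFolder) == record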
Example #11
from ILCourtScraper.Extra.db import DB
from ILCourtScraper.Extra.logger import Logger
from ILCourtScraper.Extra.time import callSleep
from ILCourtScraper.Extra.json import readData, saveData
from ILCourtScraper.Extra.path import getPath, sep, getFiles, createDir, changeDir

handledFolder = getPath(N=0) + f'products{sep}handled_json_products{sep}'
unhandledFolder = getPath(N=0) + f'products{sep}unhandled_json_products{sep}'
backupFolder = getPath(N=0) + f'products{sep}backup_json_products{sep}'
unBackupFolder = getPath(N=0) + f'products{sep}unBackup_json_products{sep}'
elasticFolder = getPath(N=0) + f'products{sep}upload_json_to_elastic{sep}'

# key = source, value = destination
uploadFolders = {
    handledFolder: handledFolder,
    unhandledFolder: unhandledFolder,
    backupFolder: backupFolder,
    unBackupFolder: backupFolder,
    elasticFolder: elasticFolder
}
downloadFolders = [handledFolder, backupFolder]

for f in [
        handledFolder, unhandledFolder, backupFolder, unBackupFolder,
        elasticFolder
]:
    createDir(f)


def getFolderName(folder):
    return folder.split(sep)[-2]
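Because every folder path above ends with sep, splitting on sep leaves an empty last element, so the folder name is the second-to-last piece; for example:

print(getFolderName(handledFolder))  # -> 'handled_json_products'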