def detectAndApplyChanges(pdfUrlList):
    """Check every result-sheet URL and submit the sheets whose checksum changed.

    For each URL the subject code is derived from the file name (URL-decoded,
    extension and spaces removed). The checksum of a sheet is the SHA-256 of
    its ``Last-Modified`` response header. A subject whose stored checksum is
    ``None`` is only initialised; a subject whose stored checksum differs is
    converted (PDF -> XML -> JSON) and submitted to the result center.

    Args:
        pdfUrlList: iterable of result-sheet PDF URLs.

    Returns:
        ``tmpmap`` - a dict that this function creates but never populates.
    """
    tmpmap = {}
    connection.ping(True)
    for pdfUrl in pdfUrlList:
        subjectCode = os.path.basename(unquote(pdfUrl)).split(".")[0].replace(" ", "")
        logger.info("Checking subject: " + subjectCode)
        resultcenter.ping("Scanning " + subjectCode)

        # Skip unknown subjects *before* touching the network: the checksum
        # is never used for them, so downloading the PDF would be wasted work.
        if subjectCode not in subjectCheckSums:
            logger.warn("Unknown subject: " + subjectCode)
            continue

        pdfhead = requests.get(pdfUrl)
        lastModified = str(pdfhead.headers['Last-Modified'])
        checksum = hashlib.sha256(lastModified.encode('utf-8')).hexdigest()

        if subjectCheckSums[subjectCode] is None:
            # First sighting of this subject: record the checksum only.
            subjectCheckSums[subjectCode] = checksum
            updateSubjectCheckSumRemort(subjectCode, checksum, "None")

        if subjectCheckSums[subjectCode] != checksum:
            logger.info("Changes detected for " + subjectCode)
            try:
                xmlData = downloader.getXML(pdfUrl)
                jsonData = converter.jsonGenerator(xmlData, pdfUrl)
                if resultcenter.submitDataSet(jsonData):
                    updateSubjectCheckSumRemort(subjectCode, checksum, "Update")
                # NOTE(review): the source indentation was lost; this local
                # update is assumed to follow the submit check rather than
                # sit inside it - confirm.
                subjectCheckSums[subjectCode] = checksum
            except Exception as error:
                # One failing sheet must not stop the scan of the others.
                print("ERROR" + str(error))
                traceback.print_exc()
    return tmpmap
def getXML(url, manualMode=False):
    """Convert the PDF at *url* to XML via freefileconvert.com and return it.

    The function loads the site's landing page to read the CSRF token, posts
    a URL-conversion request, polls the status endpoint once per second
    until the response contains "Success", then downloads the converted file.

    Args:
        url: address of the PDF to convert.
        manualMode: when true, no progress pings are sent to the result
            center.

    Returns:
        The stripped, UTF-8 decoded body of the converted XML download.

    Raises:
        Exception: if the CSRF token is not found on the landing page, or
            the conversion request does not report status "success".
    """

    def reportProgress():
        # Progress pings are suppressed in manual mode.
        if not manualMode:
            resultcenter.ping("Processing result sheet")

    reportProgress()
    logger.info("Resolving: " + url)

    session = requests.Session()
    landingPage = session.get('https://www.freefileconvert.com')
    tokenMatch = re.search(
        r'( <meta name="csrf-token" content=")(\w{2,})(">)',
        str(landingPage.content))
    if not tokenMatch:
        logger.crit("Unable to fetch the access token.")
        raise Exception('Unable to fetch the access token.')
    # Group 2 is the token itself, between the fixed prefix and suffix.
    accessToken = tokenMatch.group(2)
    logger.info("Using token: " + accessToken)

    headers = {
        "Origin": "https://www.freefileconvert.com",
        "Accept-Encoding": "gzip, deflate, br",
        "X-CSRF-TOKEN": accessToken,
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://www.freefileconvert.com/",
        "X-Requested-With": "XMLHttpRequest",
        "Connection": "keep-alive"
    }

    progressKey = id_generator()
    logger.info("Using progress key: " + progressKey)
    payload = {
        "_token": accessToken,
        "url": url,
        "output_format": "xml",
        "progress_key": progressKey,
    }

    reportProgress()
    conversionResp = session.post('https://www.freefileconvert.com/file/url',
                                  data=payload, headers=headers)
    parsedJSON = json.loads("" + conversionResp.content.strip().decode('utf-8'))

    if parsedJSON['status'] != "success":
        logger.crit("Error occured: " + parsedJSON['error'])
        raise Exception("Error occured: " + parsedJSON['error'])

    fileURL = 'https://www.freefileconvert.com/file/' + parsedJSON['id'] + '/download'
    logger.info("Reading XML: " + fileURL)
    logger.info("Waiting for the PDF -> XML conversion to finish")

    # Poll until the status body mentions "Success"; there is no upper bound
    # on the number of attempts.
    conversionDone = False
    while not conversionDone:
        reportProgress()
        statusResp = session.get(
            "https://www.freefileconvert.com/file/" + parsedJSON['id'] + "/status",
            headers=headers)
        statusText = statusResp.content.strip().decode('utf-8')
        if "Success" in statusText:
            conversionDone = True
        else:
            time.sleep(1)

    logger.info("Fetching XML translation")
    download = session.get(fileURL, headers=headers)
    return download.content.strip().decode('utf-8')
def detectAndApplyChanges(pdfUrlList):
    """Check every result-sheet URL and submit the sheets whose checksum changed.

    For each URL a HEAD request is issued. Two digests are derived from the
    response headers with ``GetHashCheckSum``: one from ``Last-Modified``
    (compared with the stored subject checksum) and one from
    ``content-length`` plus the subject code (used to recognise files that
    were already processed). Changed, not-yet-processed sheets are converted
    (PDF -> XML -> JSON) and submitted to the result center.

    Module globals written: ``zeroSheets`` (whether the last call saw an
    empty list) and ``unknownSubjects`` (codes already reported as unknown).

    Args:
        pdfUrlList: list of result-sheet PDF URLs.

    Returns:
        ``None`` when ``pdfUrlList`` is empty; otherwise ``tmpmap``, a dict
        that this function creates but never populates.
    """
    global unknownSubjects, zeroSheets
    tmpmap = {}
    connection.ping(True)

    if not pdfUrlList:
        # The second argument is true only the first time the list is empty.
        logger.warn("There are no result sheets in the UGVLE", zeroSheets == False)
        zeroSheets = True
        return
    zeroSheets = False

    for pdfUrl in pdfUrlList:
        subjectCode = os.path.basename(unquote(pdfUrl)).split(".")[0].replace(" ", "")
        logger.info("Checking subject: " + subjectCode)
        resultcenter.ping("Scanning " + subjectCode)

        try:
            pdfhead = requests.head(pdfUrl)
        except Exception:
            logger.warn("Unable to fetch results sheet using URL: " + pdfUrl, True)
            # Without a response there is nothing to inspect. Falling through
            # would dereference None and abort the scan of every other sheet.
            continue

        if pdfhead.status_code != 200:
            report = pdfhead.status_code not in [503, 404]
            logger.warn("Request to fetch " + pdfUrl + " returned: " + str(pdfhead.status_code), report)
            continue

        checksum = GetHashCheckSum(pdfhead.headers['Last-Modified'])
        fileHash = GetHashCheckSum(pdfhead.headers['content-length'] + '_' + subjectCode)

        if subjectCode not in subjectCheckSums:
            if subjectCode not in unknownSubjects:
                # Report each unknown subject only once.
                logger.warn("Unknown subject detected: " + subjectCode, True)
                unknownSubjects.append(subjectCode)
            else:
                logger.info("Skipping unknown subject: " + subjectCode)
            continue

        if subjectCheckSums[subjectCode] is None:
            # First sighting of this subject: record the checksum only.
            subjectCheckSums[subjectCode] = checksum
            updateSubjectCheckSumRemort(subjectCode, checksum, "None")
            # NOTE(review): the source repeats this assignment and its
            # indentation was lost. It is kept inside this branch, because at
            # loop level it would make the change check below always false.
            subjectCheckSums[subjectCode] = checksum

        if subjectCheckSums[subjectCode] != checksum:
            if IsPreviouslyProcessed(subjectCode, fileHash):
                updateSubjectCheckSumRemort(subjectCode, checksum, "Checksum Update")
                continue
            logger.info("Changes detected for " + subjectCode)
            try:
                xmlData = downloader.getXML(pdfUrl)
                jsonData = converter.jsonGenerator(xmlData, pdfUrl)
                dataSetId = resultcenter.submitDataSet(jsonData)
                if dataSetId:
                    updateSubjectCheckSumRemort(subjectCode, checksum, "Update")
                else:
                    logger.crit("Failed to submit dataset: " + subjectCode, True)
                # NOTE(review): assumed to run after either outcome (source
                # indentation was lost) - confirm.
                subjectCheckSums[subjectCode] = checksum
                AddAsProcessedFile(subjectCode, fileHash, dataSetId)
            except Exception as error:
                # One failing sheet must not stop the scan of the others.
                logger.crit(subjectCode + ": " + str(error))
                traceback.print_exc()
    return tmpmap
def fetchFromDB(map): logger.info("Fecthing previous checksums from Database") with connection.cursor() as cursor: sql = "SELECT code, checksum FROM subject;" cursor.execute(sql) for result in cursor.fetchall(): map[result['code']] = result['checksum'] logger.info("Starting Monitoring Client") manualMode = False zeroSheets = False stabilizeMode = False if (len(sys.argv) == 1): logger.announceLogFile(True) resultcenter.ping("Starting") else: logger.announceLogFile(False) if sys.argv[1] == 'stabilize': stabilizeMode = True else: logger.info('Starting in manual mode') manualMode = True connection = pymysql.connect(host=os.environ['AWS_RDB_HOST'], user=os.environ['AWS_RDB_USERNAME'], password=os.environ['AWS_RDB_PASSWORD'], db=os.environ['AWS_RDB_DATABASE'], charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) if connection.open == False:
traceback.print_exc() return tmpmap def fetchFromDB(map): logger.info("Fecthing previous checksums from Database") with connection.cursor() as cursor: sql = "SELECT code, checksum FROM results.subject;" cursor.execute(sql) for result in cursor.fetchall(): map[result['code']] = result['checksum'] logger.info("Initializing loadup") resultcenter.ping("Starting") connection = pymysql.connect(host=os.environ['AWS_RDB_HOST'], user=os.environ['AWS_RDB_USERNAME'], password=os.environ['AWS_RDB_PASSWORD'], db=os.environ['AWS_RDB_DATABASE'], charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) if connection.open == False: logger.crit("Failed to connect to the database") exit(1) logger.info("Connected to database") subjectCheckSums = {} fetchFromDB(subjectCheckSums) waitTime = os.environ['MONIT_WAIT_TIME'] logger.info("Wait time is: " + waitTime) itterationNumber = 1