def getExaminationYear(txt):
    # Locate the four-digit year that follows a font attribute in the XML text.
    lineRegEx = re.search('font="[0-9]"(.*)>[0-9]{4}<', txt)
    if lineRegEx:
        lineRegEx = re.search('>[0-9]{4}<', lineRegEx.group(0))
    if lineRegEx:
        # Strip the surrounding '>' and '<' to get the bare year.
        year = lineRegEx.group(0)[1:-1]
        if 2015 < int(year) < 2050:
            logger.info("Examination year detected as: " + str(year))
            return year
    logger.crit("Failed to detect examination year")
    return "####"
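# A minimal usage sketch (hypothetical input; the real sheets embed the year in a
# <text font="N">YYYY</text> element of the converted XML):
# getExaminationYear('<text font="1" height="17">2019</text>')  ->  "2019"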
def recalculate(pattern):
    headers = {
        'accessToken': os.environ['RESCENT_ACCESS_TOKEN'],
        'Content-Type': 'application/json'
    }
    resp = requests.post(domain + '/admin/calculate/pattern/' + pattern,
                         headers=headers)
    if resp.status_code == 200:
        logger.info("Recalculation for pattern " + pattern + " completed. " + resp.text)
        return True
    logger.crit("Recalculation for pattern " + pattern + " failed. " + resp.text)
    return False
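# Hypothetical usage sketch: after a scan, each affected index pattern collected
# by the converter could be recalculated (the scan loop later in this section
# logs exactly these patterns as requiring recalculation):
# for pattern in converter.affectedIndexes:
#     recalculate(pattern)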
def submitDataSet(datasetJSON):
    headers = {
        'accessToken': os.environ['RESCENT_ACCESS_TOKEN'],
        'Content-Type': 'application/json'
    }
    # return True  # DEBUG SKIP
    resp = requests.post(domain + '/admin/result/dataset',
                         data=datasetJSON,
                         headers=headers)
    if resp.status_code == 200:
        logger.info("Dataset submitted. " + resp.text)
        return True
    logger.crit("Dataset submission failed: " + resp.text)
    return False
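# The dataset payload is the JSON produced by jsonGenerator below; its shape is:
# {
#     "data": {"141XXX": "A", ...},   # index number -> grade
#     "total": 1,                     # number of entries in "data"
#     "affected": ["1415"],           # distinct 4-digit index prefixes
#     "year": "2019",
#     "subject": "CS1101"             # hypothetical subject code
# }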
def jsonGenerator(xmlData, url, forcedSubjectCode=None):
    indexStack = []
    resultStack = []
    affected = []
    logger.info('XML -> JSON conversion started')
    root = xml.etree.ElementTree.fromstring(xmlData)
    xmlString = xml.etree.ElementTree.tostring(
        root, encoding='utf8', method='xml').strip().decode('utf-8')
    fontIndex = getFontResultIndex(xmlString)
    # Collect index numbers and grades from every text node that uses the
    # result font; brain() classifies each node's text.
    for text in root.findall(".//text[@font='" + fontIndex + "']"):
        result = brain(text.text)
        if 'index' in result:
            indexStack.append(result['index'])
        elif 'grade' in result:
            resultStack.append(result['grade'])
    if len(indexStack) == len(resultStack):
        output = {}
        logger.info('Stack lengths matched')
        output['data'] = {}
        for i in range(len(indexStack)):
            # Only index numbers whose two-digit batch prefix is above 13 are kept.
            if int(str(indexStack[i])[:2]) > 13:
                output['data'][indexStack[i]] = resultStack[i]
                if str(indexStack[i])[:4] not in affected:
                    affected.append(str(indexStack[i])[:4])
                    affectedIndexes.append(str(indexStack[i])[:4])
        output['total'] = len(output['data'])
        output['affected'] = affected
        output['year'] = getExaminationYear(xmlString)
        filePath = os.path.basename(unquote(url)).split(".")[0].replace(" ", "")
        output['subject'] = filePath
        if forcedSubjectCode is not None:
            output['subject'] = str(forcedSubjectCode)
        jsonOutput = json.dumps(output,
                                sort_keys=False,
                                indent=4,
                                separators=(',', ':'))
        writeToFile(filePath, jsonOutput)
        logger.info('Affected index patterns: ' + str(affected))
        logger.info('XML -> JSON conversion success')
        return jsonOutput
    else:
        logger.crit('Task Failed. Length mismatch')
        raise Exception("Error occurred: Task Failed. Length mismatch")
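# brain() is defined elsewhere; from its use above, its contract appears to be
# the following (an inferred sketch, not the actual implementation):
# def brain(text):
#     if looks_like_index_number(text):   # hypothetical predicate
#         return {'index': int(text)}
#     if looks_like_grade(text):          # hypothetical predicate
#         return {'grade': text.strip()}
#     return {}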
def getXML(url, manualMode=False):
    if not manualMode:
        resultcenter.ping("Processing result sheet")
    logger.info("Resolving: " + url)
    session = requests.Session()
    # freefileconvert.com is used as the PDF -> XML conversion service;
    # its CSRF token is scraped from the landing page.
    pageLoad = session.get('https://www.freefileconvert.com')
    searchObj = re.search(r'( <meta name="csrf-token" content=")(\w{2,})(">)',
                          str(pageLoad.content))
    accessToken = ""
    if searchObj:
        accessToken = searchObj.group(2)
    else:
        logger.crit("Unable to fetch the access token.")
        raise Exception('Unable to fetch the access token.')
    logger.info("Using token: " + accessToken)
    headers = {
        "Origin": "https://www.freefileconvert.com",
        "Accept-Encoding": "gzip, deflate, br",
        "X-CSRF-TOKEN": accessToken,
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://www.freefileconvert.com/",
        "X-Requested-With": "XMLHttpRequest",
        "Connection": "keep-alive"
    }
    progressKey = id_generator()
    logger.info("Using progress key: " + progressKey)
    payload = {
        "_token": accessToken,
        "url": url,
        "output_format": "xml",
        "progress_key": progressKey,
    }
    if not manualMode:
        resultcenter.ping("Processing result sheet")
    xmlRequest = session.post('https://www.freefileconvert.com/file/url',
                              data=payload,
                              headers=headers)
    parsedJSON = json.loads(xmlRequest.content.strip().decode('utf-8'))
    if parsedJSON['status'] == "success":
        fileURL = ('https://www.freefileconvert.com/file/' + parsedJSON['id'] +
                   '/download')
        logger.info("Reading XML: " + fileURL)
        logger.info("Waiting for the PDF -> XML conversion to finish")
        # Poll the status endpoint until the conversion job reports success.
        while True:
            if not manualMode:
                resultcenter.ping("Processing result sheet")
            statusResp = session.get("https://www.freefileconvert.com/file/" +
                                     parsedJSON['id'] + "/status",
                                     headers=headers)
            if "Success" in statusResp.content.strip().decode('utf-8'):
                break
            time.sleep(1)
        logger.info("Fetching XML translation")
        xmlResponse = session.get(fileURL, headers=headers)
        return xmlResponse.content.strip().decode('utf-8')
    else:
        logger.crit("Error occurred: " + parsedJSON['error'])
        raise Exception("Error occurred: " + parsedJSON['error'])
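# id_generator() is referenced above but defined elsewhere; a minimal sketch of a
# plausible implementation (hypothetical, assuming a short random alphanumeric key):
# import random, string
# def id_generator(size=10, chars=string.ascii_uppercase + string.digits):
#     return ''.join(random.choice(chars) for _ in range(size))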
def detectAndApplyChanges(pdfUrlList):
    global unknownSubjects, zeroSheets
    tmpmap = {}
    connection.ping(True)
    if len(pdfUrlList) == 0:
        # Report only on the first empty scan; stay quiet on repeats.
        logger.warn("There are no result sheets in the UGVLE", not zeroSheets)
        zeroSheets = True
        return
    zeroSheets = False
    for pdfUrl in pdfUrlList:
        subjectCode = os.path.basename(unquote(pdfUrl)).split(".")[0].replace(" ", "")
        logger.info("Checking subject: " + subjectCode)
        resultcenter.ping("Scanning " + subjectCode)
        try:
            pdfhead = requests.head(pdfUrl)
        except Exception:
            logger.warn("Unable to fetch results sheet using URL: " + pdfUrl, True)
            continue
        if pdfhead.status_code != 200:
            report = pdfhead.status_code not in [503, 404]
            logger.warn("Request to fetch " + pdfUrl + " returned: " +
                        str(pdfhead.status_code), report)
            continue
        hash = GetHashCheckSum(pdfhead.headers['Last-Modified'])
        fileHash = GetHashCheckSum(pdfhead.headers['content-length'] + '_' + subjectCode)
        if subjectCode not in subjectCheckSums:
            if subjectCode not in unknownSubjects:
                logger.warn("Unknown subject detected: " + subjectCode, True)
                unknownSubjects.append(subjectCode)
            else:
                logger.info("Skipping unknown subject: " + subjectCode)
            continue
        if subjectCheckSums[subjectCode] is None:
            subjectCheckSums[subjectCode] = hash
            updateSubjectCheckSumRemort(subjectCode, hash, "None")
        if subjectCheckSums[subjectCode] != hash:
            if IsPreviouslyProcessed(subjectCode, fileHash):
                updateSubjectCheckSumRemort(subjectCode, hash, "Checksum Update")
                continue
            logger.info("Changes detected for " + subjectCode)
            try:
                xmlData = downloader.getXML(pdfUrl)
                jsonData = converter.jsonGenerator(xmlData, pdfUrl)
                dataSetId = resultcenter.submitDataSet(jsonData)
                if dataSetId:
                    updateSubjectCheckSumRemort(subjectCode, hash, "Update")
                else:
                    logger.crit("Failed to submit dataset: " + subjectCode, True)
                subjectCheckSums[subjectCode] = hash
                AddAsProcessedFile(subjectCode, fileHash, dataSetId)
            except Exception as error:
                logger.crit(subjectCode + ": " + str(error))
                traceback.print_exc()
    return tmpmap
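# GetHashCheckSum() is referenced above but defined elsewhere; a minimal sketch
# under the assumption that it simply digests the given header string (hypothetical):
# import hashlib
# def GetHashCheckSum(value):
#     return hashlib.md5(value.encode('utf-8')).hexdigest()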
else:
    logger.announceLogFile(False)
    if sys.argv[1] == 'stabilize':
        stabilizeMode = True
    else:
        logger.info('Starting in manual mode')
        manualMode = True

connection = pymysql.connect(host=os.environ['AWS_RDB_HOST'],
                             user=os.environ['AWS_RDB_USERNAME'],
                             password=os.environ['AWS_RDB_PASSWORD'],
                             db=os.environ['AWS_RDB_DATABASE'],
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
if not connection.open:
    logger.crit("Failed to connect to the database", True)
    exit(1)
logger.info("Connected to database")
subjectCheckSums = {}
fetchFromDB(subjectCheckSums)
waitTime = os.environ['MONIT_WAIT_TIME']
if not manualMode and not stabilizeMode:
    logger.info("Wait time is: " + waitTime)
unknownSubjects = []
iterationNumber = 1

# Manual Mode
if manualMode:
    manualParse.manualRun(logger, subjectCheckSums, sys.argv[1], connection)
    exit(0)
    cursor.execute(sql)
    for result in cursor.fetchall():
        map[result['code']] = result['checksum']


logger.info("Initializing loadup")
resultcenter.ping("Starting")
connection = pymysql.connect(host=os.environ['AWS_RDB_HOST'],
                             user=os.environ['AWS_RDB_USERNAME'],
                             password=os.environ['AWS_RDB_PASSWORD'],
                             db=os.environ['AWS_RDB_DATABASE'],
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
if not connection.open:
    logger.crit("Failed to connect to the database")
    exit(1)
logger.info("Connected to database")
subjectCheckSums = {}
fetchFromDB(subjectCheckSums)
waitTime = os.environ['MONIT_WAIT_TIME']
logger.info("Wait time is: " + waitTime)
iterationNumber = 1
while True:
    converter.clearAffectedIndexes()
    resultcenter.ping("Initializing Scan")
    logger.info("Scanning for changes. Iteration number: #" + str(iterationNumber))
    detectAndApplyChanges(getPDFList())
    logger.info("Scan completed.")
    if len(converter.affectedIndexes) > 0:
        logger.info("The following indexes require recalculation: " +
                    str(converter.affectedIndexes))