import json
import os
import re
import sys
import time
import traceback
import xml.etree.ElementTree
from urllib.parse import unquote

import pymysql
import requests

# Project-local helpers (logger, resultcenter, converter, downloader,
# manualParse, brain, id_generator, domain, etc.) are imported elsewhere
# in the original module.


def getExaminationYear(txt):
    # The year appears as a bare four-digit text node on a numbered-font line.
    match = re.search('font="[0-9]".*?>([0-9]{4})<', txt)
    if match:
        year = match.group(1)
        if 2015 < int(year) < 2050:
            logger.info("Examination year detected as: " + year)
            return year
    logger.crit("Failed to detect examination year")
    return "####"
def recalculate(pattern):
    headers = {
        'accessToken': os.environ['RESCENT_ACCESS_TOKEN'],
        'Content-Type': 'application/json'
    }
    resp = requests.post(domain + '/admin/calculate/pattern/' + pattern, headers=headers)
    if resp.status_code == 200:
        logger.info("Recalculation for pattern " + pattern + " completed. " + resp.text)
        return True
    logger.crit("Recalculation for pattern " + pattern + " failed. " + resp.text)
    return False
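# Usage sketch: `pattern` is a four-character index prefix, as collected into
# `affected` by jsonGenerator below (the value here is hypothetical):
#   recalculate("1601")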
def submitDataSet(datasetJSON):
    headers = {
        'accessToken': os.environ['RESCENT_ACCESS_TOKEN'],
        'Content-Type': 'application/json'
    }
    # return True # DEBUG SKIP
    resp = requests.post(domain + '/admin/result/dataset', data=datasetJSON, headers=headers)
    if resp.status_code == 200:
        logger.info("Dataset submitted. " + resp.text)
        return True
    logger.crit("Dataset submission failed: " + resp.text)
    return False
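# Usage sketch: submitDataSet expects the JSON string that jsonGenerator
# (below) produces, e.g.:
#   submitDataSet(jsonGenerator(xmlData, url))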
def jsonGenerator(xmlData, url, forcedSubjectCode=None):
    indexStack = []
    resultStack = []
    affected = []

    logger.info('XML -> JSON conversion started')

    root = xml.etree.ElementTree.fromstring(xmlData)
    xmlString = xml.etree.ElementTree.tostring(
        root, encoding='utf8', method='xml').strip().decode('utf-8')
    fontIndex = getFontResultIndex(xmlString)
    # brain() classifies each text node as either an index number or a grade.
    for text in root.findall(".//text[@font='" + fontIndex + "']"):
        result = brain(text.text)
        if 'index' in result:
            indexStack.append(result['index'])
        elif 'grade' in result:
            resultStack.append(result['grade'])

    if len(indexStack) == len(resultStack):
        output = {}
        logger.info('Stack lengths matched')
        output['data'] = {}
        for i in range(len(indexStack)):
            # Keep only indexes whose first two digits exceed 13 (presumably the intake year).
            if int(str(indexStack[i])[:2]) > 13:
                output['data'][indexStack[i]] = resultStack[i]
                if str(indexStack[i])[:4] not in affected:
                    affected.append(str(indexStack[i])[:4])
                    # affectedIndexes is a module-level list consumed later for recalculation.
                    affectedIndexes.append(str(indexStack[i])[:4])
        output['total'] = len(output['data'])
        output['affected'] = affected
        output['year'] = getExaminationYear(xmlString)
        filePath = os.path.basename(unquote(url)).split(".")[0].replace(" ", "")
        output['subject'] = filePath
        if forcedSubjectCode is not None:
            output['subject'] = str(forcedSubjectCode)
        jsonOutput = json.dumps(output,
                                sort_keys=False,
                                indent=4,
                                separators=(',', ':'))
        writeToFile(filePath, jsonOutput)
        logger.info('Affected index patterns: ' + str(affected))
        logger.info('XML -> JSON conversion success')
        return jsonOutput
    else:
        logger.crit('Task Failed. Length mismatch')
        raise Exception("Error occurred: Task Failed. Length mismatch")
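# Shape of the JSON produced above:
# {
#     "data": {"<index number>": "<grade>", ...},
#     "total": <number of entries in "data">,
#     "affected": ["<four-digit index prefix>", ...],
#     "year": "<examination year, or '####' if undetected>",
#     "subject": "<subject code from the PDF file name, unless forced>"
# }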
def getXML(url, manualMode=False):
    if not manualMode:
        resultcenter.ping("Processing result sheet")
    logger.info("Resolving: " + url)
    session = requests.Session()

    pageLoad = session.get('https://www.freefileconvert.com')
    searchObj = re.search(r'( <meta name="csrf-token" content=")(\w{2,})(">)',
                          str(pageLoad.content))

    accessToken = ""
    if searchObj:
        accessToken = searchObj.group(2)
    else:
        logger.crit("Unable to fetch the access token.")
        raise Exception('Unable to fetch the access token.')

    logger.info("Using token: " + accessToken)
    headers = {
        "Origin": "https://www.freefileconvert.com",
        "Accept-Encoding": "gzip, deflate, br",
        "X-CSRF-TOKEN": accessToken,
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://www.freefileconvert.com/",
        "X-Requested-With": "XMLHttpRequest",
        "Connection": "keep-alive"
    }
    progressKey = id_generator()
    logger.info("Using progress key: " + progressKey)
    payload = {
        "_token": accessToken,
        "url": url,
        "output_format": "xml",
        "progress_key": progressKey,
    }
    if not manualMode:
        resultcenter.ping("Processing result sheet")
    xmlRequest = session.post('https://www.freefileconvert.com/file/url',
                              data=payload,
                              headers=headers)
    parsedJSON = json.loads(xmlRequest.content.strip().decode('utf-8'))
    if (parsedJSON['status'] == "success"):
        fileURL = 'https://www.freefileconvert.com/file/' + parsedJSON[
            'id'] + '/download'
        logger.info("Reading XML: " + fileURL)
        logger.info("Waiting for the PDF -> XML conversion to finish")
        while True:
            if not manualMode:
                resultcenter.ping("Processing result sheet")
            statusResp = session.get("https://www.freefileconvert.com/file/" +
                                     parsedJSON['id'] + "/status",
                                     headers=headers)
            if "Success" in statusResp.content.strip().decode('utf-8'):
                break
            time.sleep(1)
        logger.info("Fetching XML translation")
        xmlResp = session.get(fileURL, headers=headers)
        return xmlResp.content.strip().decode('utf-8')
    else:
        logger.crit("Error occured: " + parsedJSON['error'])
        raise Exception("Error occured: " + parsedJSON['error'])
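# Usage sketch: getXML uploads the result-sheet URL to freefileconvert.com,
# polls the conversion status once per second, and returns the converted XML
# as a string (the URL below is hypothetical):
#   xmlData = getXML("https://example.org/sheets/EN1010.pdf", manualMode=True)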
def detectAndApplyChanges(pdfUrlList):
    global unknownSubjects, zeroSheets
    tmpmap = {}
    connection.ping(True)
    if len(pdfUrlList) == 0:
        logger.warn("There are no result sheets in the UGVLE", zeroSheets == False)
        zeroSheets = True
        return

    zeroSheets = False
    for pdfUrl in pdfUrlList:
        subjectCode = os.path.basename(unquote(pdfUrl)).split(".")[0].replace(" ", "")
        logger.info("Checking subject: " + subjectCode)
        resultcenter.ping("Scanning " + subjectCode)
        try:
            pdfhead = requests.head(pdfUrl)
        except requests.RequestException:
            logger.warn("Unable to fetch results sheet using URL: " + pdfUrl, True)
            continue

        if pdfhead.status_code != 200:
            # 503 and 404 are routine; only report unexpected status codes.
            report = pdfhead.status_code not in [503, 404]
            logger.warn("Request to fetch " + pdfUrl + " returned: " + str(pdfhead.status_code), report)
            continue

        hash = GetHashCheckSum(pdfhead.headers['Last-Modified'])
        fileHash = GetHashCheckSum(pdfhead.headers['content-length'] + '_' + subjectCode)

        if subjectCode not in subjectCheckSums:
            if subjectCode not in unknownSubjects:
                logger.warn("Unknown subject detected: " + subjectCode, True)
                unknownSubjects.append(subjectCode)
            else:
                logger.info("Skipping unknown subject: " + subjectCode)
            continue

        if subjectCheckSums[subjectCode] is None:
            subjectCheckSums[subjectCode] = hash
            updateSubjectCheckSumRemort(subjectCode, hash, "None")

        if subjectCheckSums[subjectCode] != hash:
            if IsPreviouslyProcessed(subjectCode, fileHash):
                updateSubjectCheckSumRemort(subjectCode, hash, "Checksum Update")
                continue

            logger.info("Changes detected for " + subjectCode)
            try:
                xmlData = downloader.getXML(pdfUrl)
                jsonData = converter.jsonGenerator(xmlData, pdfUrl)
                dataSetId = resultcenter.submitDataSet(jsonData)
                if dataSetId:
                    updateSubjectCheckSumRemort(subjectCode, hash, "Update")
                else:
                    logger.crit("Failed to submit dataset: " + subjectCode, True)
                subjectCheckSums[subjectCode] = hash
                AddAsProcessedFile(subjectCode, fileHash, dataSetId)
            except Exception as error:
                logger.crit(subjectCode + ": " + str(error))
                traceback.print_exc()
    return tmpmap
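# Change detection, as implemented above: `hash` fingerprints the sheet's
# Last-Modified header and is stored per subject, while `fileHash` fingerprints
# content-length plus subject code; a new `hash` whose `fileHash` was already
# processed is recorded as a checksum-only update and skipped.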
# Mode flags default to False; a command-line argument selects stabilize or manual mode.
manualMode = False
stabilizeMode = False
if len(sys.argv) > 1:
    logger.announceLogFile(False)
    if sys.argv[1] == 'stabilize':
        stabilizeMode = True
    else:
        logger.info('Starting in manual mode')
        manualMode = True

connection = pymysql.connect(host=os.environ['AWS_RDB_HOST'],
                             user=os.environ['AWS_RDB_USERNAME'],
                             password=os.environ['AWS_RDB_PASSWORD'],
                             db=os.environ['AWS_RDB_DATABASE'],
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
if not connection.open:
    logger.crit("Failed to connect to the database", True)
    exit(1)
logger.info("Connected to database")
subjectCheckSums = {}
fetchFromDB(subjectCheckSums)
waitTime = os.environ['MONIT_WAIT_TIME']
if not manualMode and not stabilizeMode:
    logger.info("Wait time is: " + waitTime)

unknownSubjects = []
iterationNumber = 1

# Manual Mode
if manualMode:
    manualParse.manualRun(logger, subjectCheckSums, sys.argv[1], connection)
    exit(0)
# Sketch of the enclosing function, reconstructed from its call site above;
# the cursor setup and the SQL statement itself are not shown in the source.
def fetchFromDB(map):
    with connection.cursor() as cursor:
        cursor.execute(sql)  # `sql` is defined in the truncated portion
        for result in cursor.fetchall():
            map[result['code']] = result['checksum']

logger.info("Initializing loadup")
resultcenter.ping("Starting")
while True:
    converter.clearAffectedIndexes()
    resultcenter.ping("Initializing Scan")
    logger.info("Scanning for changes. Itteration number: #" + str(itterationNumber))
    detectAndApplyChanges(getPDFList())
    logger.info("Scan completed.")
    if converter.affectedIndexes:
        logger.info("Following indexes require recalculation: " + str(converter.affectedIndexes))