Code example #1
def sendFileToParser(contentType, baseUrl, redirectedTo, od, _encoding,
                     localFilename, pageread):
    if ("excel" in contentType):
        try:
            dirToSaveDownloads = comm.downloadsDir + baseUrl
            if not os.path.isdir(dirToSaveDownloads):
                os.makedirs(dirToSaveDownloads)
            fileparser.spreadURLsByContentType(redirectedTo,
                                               None,
                                               contentType,
                                               od,
                                               _encoding,
                                               filePath=(dirToSaveDownloads +
                                                         "/" + localFilename))
        except:
            comm.printException(
                comm.pathToSaveProgrammingErrors,
                "create_dir_for_excel_and_send_file_to_parser")
            pass
    else:
        try:
            fileparser.spreadURLsByContentType(redirectedTo, pageread,
                                               contentType, od, _encoding)
        except:
            comm.printException(comm.pathToSaveProgrammingErrors,
                                "send_file_to_parser")
            pass
Code example #2
File: auth.py Project: Mailis/EstNer
def importRDFfiles():
    global WORKER_INSTANCES
    inst_str = "number of log rows: " + str(nr_of_log_rows) + "\n"
    inst_str += "number of processed rows: " + str(line_counter) + "\n"
    inst_str += "length of POST list: " + str(postListSize) + "\n"

    list_len = len(WORKER_INSTANCES)
    inst_str += "number of worker instaces: " + str(list_len) + "\n" 
    #print("WORKERS " + str(WORKER_INSTANCES))
    if list_len > 0:
        for instance in WORKER_INSTANCES:
            wwwname = instance['name']
            inst_str += "\nworkername: " + wwwname + "\nmachineType: " + instance['machineType'] + "\n"
            wwwip = instance['networkInterfaces'][0]['accessConfigs'][0]['natIP']
            www_data = dict()
            www_data["ip"] = wwwip
            www_data["name"] = wwwname
            www_data["statfile"] = mylargefile
            p = subprocess.Popen(["python3", "download_rdf_files.py", json.dumps(www_data)])
            #Wait for process to terminate.
            out, err = p.communicate()
        #add info about instances
        comm.saveStatistics(mylargefile, inst_str + "\n\n")
    else:
        comm.printException(comm.pathToSaveDownloadErrors, errString='No instances to list.')
Code example #3
File: read_eksel.py Project: peepkungas/EstNer
def readExcel(filePath, url, ontologyData):
    try:
        urr(url, filePath)
        try:
            workbook = xlrd.open_workbook(filePath)
            worksheets = workbook.sheet_names()
            for worksheet_name in worksheets:
                worksheet = workbook.sheet_by_name(worksheet_name)
                num_rows = worksheet.nrows - 1
                num_cells = worksheet.ncols - 1
                curr_row = -1
                while curr_row < num_rows:
                    curr_row += 1
                    # row = worksheet.row(curr_row)
                    # print ('Row:', curr_row)
                    curr_cell = -1
                    while curr_cell < num_cells:
                        curr_cell += 1
                        # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                        cell_type = worksheet.cell_type(curr_row, curr_cell)
                        cell_value = worksheet.cell_value(curr_row, curr_cell)
                        if cell_type == 1:
                            sentences = comm.replaceToPunkts(cell_value)
                            for sentence in sentences:
                                getEntities.getEntities(url, sentence, ontologyData)

        except:
            comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
            pass
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
        pass
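A minimal usage sketch for the readExcel function above; the local path, URL, and ontologyData handle are hypothetical placeholders, and urr is assumed to behave like urllib's urlretrieve (download url into filePath before xlrd opens it):

readExcel("/tmp/downloads/report.xls",               # hypothetical local path the workbook is saved to
          "http://www.example.ee/files/report.xls",  # source URL, forwarded to getEntities with each sentence
          ontologyData)                               # opaque ontology/graph handle used by getEntities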
Code example #4
File: auth.py Project: Mailis/EstNer
def processObject(client, itemname):
    try:
        #try to access json object in Google Compute Storage
        # Get Payload Data
        req = client.objects().get_media(
                    bucket = comm.jsonsDir,
                    object=itemname)
        #store info whether a json-object exists in the bucket or not
        fileExists = True
        try:
            resp = req.execute()
        except:
            fileExists = False
            pass
            
        #continue only when the object exists
        if (fileExists):
            # The BytesIO object may be replaced with any io.Base instance.
            fh = io.BytesIO()
            #prepare for reading a json-object
            downloader = MediaIoBaseDownload(fh, req, chunksize=1024*1024)
            done = False
            while not done:
                status, done = downloader.next_chunk()
            #load accessed json-object into dictionary
            jsonToDict = json.loads(fh.getvalue())#json.loads(fh.getvalue())#return value
            #print ("RETURNED VALUE: " + jsonToDict)
            doDownloadJob(jsonToDict, itemname)
    #store error message into respective errors bucket
    except oauth2_client.AccessTokenRefreshError:
        comm.printException(comm.pathToSaveDownloadErrors, errString="False credentials")
        pass
Code example #5
File: read_eksel.py Project: peepkungas/EstNer
def readExcel(filePath, url, ontologyData):
    try:
        urr(url, filePath)
        try:
            workbook = xlrd.open_workbook(filePath)
            worksheets = workbook.sheet_names()
            for worksheet_name in worksheets:
                worksheet = workbook.sheet_by_name(worksheet_name)
                num_rows = worksheet.nrows - 1
                num_cells = worksheet.ncols - 1
                curr_row = -1
                while curr_row < num_rows:
                    curr_row += 1
                    #row = worksheet.row(curr_row)
                    #print ('Row:', curr_row)
                    curr_cell = -1
                    while curr_cell < num_cells:
                        curr_cell += 1
                        # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                        cell_type = worksheet.cell_type(curr_row, curr_cell)
                        cell_value = worksheet.cell_value(curr_row, curr_cell)
                        if (cell_type == 1):
                            sentences = comm.replaceToPunkts(cell_value)
                            for sentence in sentences:
                                getEntities.getEntities(url, sentence, ontologyData)
        
        except:
            comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
            pass
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
        pass
Code example #6
File: postToWorker.py Project: Mailis/EstNer
def deleteRDFsInWorker(workerip):
    #delete RDF-files and download excel-files in a worker
    worker_address = "http://" + workerip + "/delete_rdf_files.php"
    try:
        requests.post(worker_address)
    except:
        comm.printException(comm.pathToConnectionErrors, errString='ConnectionError_to_worker_' + workerip)
Code example #7
def importRDFfiles():
    global WORKER_INSTANCES
    list_len = len(WORKER_INSTANCES)
    #print("WORKERS " + str(WORKER_INSTANCES))
    try:
        if list_len > 0:
            for instance in WORKER_INSTANCES:
                wwwname = instance['name']
                wwwip = instance['networkInterfaces'][0]['accessConfigs'][0][
                    'natIP']
                www_data = dict()
                www_data["ip"] = wwwip
                www_data["name"] = wwwname
                www_data["statfile"] = ""
                subprocess.Popen([
                    "python3",
                    comm.parentDir + "upload_logfile/download_rdf_files.py",
                    json.dumps(www_data)
                ])
        else:
            comm.printException(comm.updateErrorsFilePath,
                                errString='No instances to list.')
    except:
        comm.printException(comm.updateErrorsFilePath,
                            errString='Importing_RDFs.')
Code example #8
def importRDFfiles():
    global WORKER_INSTANCES
    inst_str = "number of log rows: " + str(nr_of_log_rows) + "\n"
    inst_str += "number of processed rows: " + str(line_counter) + "\n"
    inst_str += "length of POST list: " + str(postListSize) + "\n"

    list_len = len(WORKER_INSTANCES)
    inst_str += "number of worker instaces: " + str(list_len) + "\n"
    #print("WORKERS " + str(WORKER_INSTANCES))
    if list_len > 0:
        for instance in WORKER_INSTANCES:
            wwwname = instance['name']
            inst_str += "\nworkername: " + wwwname + "\nmachineType: " + instance[
                'machineType'] + "\n"
            wwwip = instance['networkInterfaces'][0]['accessConfigs'][0][
                'natIP']
            www_data = dict()
            www_data["ip"] = wwwip
            www_data["name"] = wwwname
            www_data["statfile"] = mylargefile
            p = subprocess.Popen(
                ["python3", "download_rdf_files.py",
                 json.dumps(www_data)])
            #Wait for process to terminate.
            out, err = p.communicate()
        #add info about instances
        comm.saveStatistics(mylargefile, inst_str + "\n\n")
    else:
        comm.printException(comm.pathToSaveDownloadErrors,
                            errString='No instances to list.')
Code example #9
def processObject(client, itemname):
    try:
        #try to access json object in Google Compute Storage
        # Get Payload Data
        req = client.objects().get_media(bucket=comm.jsonsDir, object=itemname)
        #store info whether a json-object exists in the bucket or not
        fileExists = True
        try:
            resp = req.execute()
        except:
            fileExists = False
            pass

        #continue only when the object exists
        if (fileExists):
            # The BytesIO object may be replaced with any io.Base instance.
            fh = io.BytesIO()
            #prepare for reading a json-object
            downloader = MediaIoBaseDownload(fh, req, chunksize=1024 * 1024)
            done = False
            while not done:
                status, done = downloader.next_chunk()
            #load accessed json-object into dictionary
            jsonToDict = json.loads(
                fh.getvalue())  #json.loads(fh.getvalue())#return value
            #print ("RETURNED VALUE: " + jsonToDict)
            doDownloadJob(jsonToDict, itemname)
    #store error message into respective errors bucket
    except oauth2_client.AccessTokenRefreshError:
        comm.printException(comm.pathToSaveDownloadErrors,
                            errString="False credentials")
        pass
Code example #10
def deleteRDFsInWorker(workerip):
    #delete RDF-files and download excel-files in a worker
    worker_address = "http://" + workerip + "/delete_rdf_files.php"
    try:
        requests.post(worker_address)
    except:
        comm.printException(comm.pathToConnectionErrors,
                            errString='ConnectionError_to_worker_' + workerip)
Code example #11
File: read_plaintext.py Project: Mailis/EstNer
def readPlainText(htmlurl, plaintext, ontologyData):
    try:
        punc = (plaintext).strip() 
        sentences = comm.replaceToPunkts(punc)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_plaintext.py " + htmlurl)
Code example #12
File: downloadFilesFromLog.py Project: Mailis/EstNer
def insertNewSubFileToDict(localFilename, page_redirected, page_info, page_sha254, page_status):
    try:
        newDict = dict()
        insertValuesToDict(newDict, localFilename, page_redirected, page_info, page_sha254, page_status)
        return newDict
    except:
        comm.printException(comm.pathToSaveJsonErrors)
        pass
Code example #13
File: read_plaintext.py Project: peepkungas/EstNer
def readPlainText(htmlurl, plaintext, ontologyData):
    try:
        punc = (plaintext).strip()
        sentences = comm.replaceToPunkts(punc)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_plaintext.py " + htmlurl)
Code example #14
File: downloadDocs.py Project: Mailis/EstNer
def downloadDocuments(ajadir):
    start = datetime.datetime.now()
    currTime = time.strftime("%H:%M:%S")
    nrOfDownloads = dowloadFromJsons(ajadir)
    end = datetime.datetime.now() 
    try:
        saveUpdateStatistics(currTime, start, end, action="Download_datasets:_nr_of_downloads:_"+str(nrOfDownloads), isLast = True)
    except:
        comm.printException(comm.updateErrorsFilePath, errString="download")
        pass
Code example #15
def insertNewSubFileToDict(localFilename, page_redirected, page_info,
                           page_sha254, page_status):
    try:
        newDict = dict()
        insertValuesToDict(newDict, localFilename, page_redirected, page_info,
                           page_sha254, page_status)
        return newDict
    except:
        comm.printException(comm.pathToSaveJsonErrors)
        pass
Code example #16
File: postToWorker.py Project: Mailis/EstNer
def postListToWorker(wip, urlsList):
    pikkus = len(urlsList)
    dik = dict(zip(range(pikkus),urlsList))
    address = "http://"+wip
    try:
        requests.post(address, data={'data':json.dumps(dik), 'chunksize':json.dumps(comm.chunksize)})
        return 1
    except:
        comm.printException(comm.pathToConnectionErrors, errString='ConnectionError_to_worker_' + wip)
        return 0
        pass
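A hedged usage sketch for postListToWorker; the worker IP and URL list are hypothetical. The function returns 1 on a successful POST and 0 when the connection fails (the trailing pass after return 0 is never reached):

ok = postListToWorker("10.240.0.7",                        # hypothetical worker IP
                      ["http://www.example.ee/page1.html",
                       "http://www.example.ee/page2.html"])
if ok == 0:
    print("worker unreachable")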
Code example #17
def saveUpdateStatistics(currtime, start, end, action="", isLast=False):
    span = end - start
    try:
        jf = open(comm.monthly_updates_path, 'a')
        if (isLast):
            jf.write(action + " " + currtime + " " + str(span) + "\n")
        else:
            jf.write(action + " " + currtime + " " + str(span) + " ")
        jf.close()
    except:
        comm.printException(comm.updateErrorsFilePath, errString="update")
        pass
Code example #18
File: downloadDocs.py Project: Mailis/EstNer
def saveUpdateStatistics(currtime, start, end, action="", isLast = False):
    span = end-start
    try:
        jf = open(comm.monthly_updates_path, 'a')
        if(isLast):
            jf.write(action + " " + currtime + " " + str(span) + "\n")
        else:
            jf.write(action + " " + currtime + " " + str(span) + " ")
        jf.close()
    except:
        comm.printException(comm.updateErrorsFilePath, errString="update")
        pass
Code example #19
File: commonMethods.py Project: Mailis/EstNer
def measureDownloadsTime(path_to_stat_file, ajadir):
    start = datetime.datetime.now()
    currTime = time.strftime("%H:%M:%S")
    nrOfDownloads = dowloadFromJsons(ajadir)
    end = datetime.datetime.now() 
    span = end-start
    try:
        jf = open(path_to_stat_file, 'a', encoding='utf-8')
        jf.write(currTime + " " + str(nrOfDownloads) + " " + str(span) + " " + "\n")
        jf.close()
    except:
        comm.printException(comm.pathToSaveDownloadErrors, errString="download")
        pass
Code example #20
File: importRemoteFiles.py Project: Mailis/EstNer
def importExcelFiles(WORKER_INSTANCES):
    list_len = len(WORKER_INSTANCES)
    if list_len > 0:
        for instance in WORKER_INSTANCES:
            wwwip = instance['networkInterfaces'][0]['accessConfigs'][0]['natIP']#download excels            
            try:
                #excels_url ="http://146.148.115.150/downloaded_files/"
                excels_url = "http://" + wwwip + "/downloaded_files/"
                dest = comm.downloadsDir_for_excels
                os.system('wget  -r --no-parent -nH --cut-dirs=1 --reject "index.html*" ' + excels_url + " -P " + dest)
            except:
                comm.printException(comm.pathToSaveDownloadErrors, errString="collecting_Excels_and_PDFs")
                pass
Code example #21
def readJson(jsonurl, readedPage, od):
    '''#if httpResponse is filepath
    jsonfile = (httpResponse.read()).decode('utf-8')
    '''
    #if httpResponse is saved into a string already
    try:
        jsonfile = (readedPage).strip()
        dictnry = json.loads(jsonfile)
        readDictValues(jsonurl, dictnry, set(), od)

    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_json.py " + jsonurl)
        pass
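Note that readedPage is expected to hold the already-fetched JSON text (not a file path), since it is passed straight to json.loads. A minimal sketch with hypothetical arguments:

readJson("http://www.example.ee/api/data.json",   # source URL, used for error reporting and entity provenance
         '{"title": "Tallinna linnavalitsus"}',   # response body that was read earlier
         od)                                      # ontology/graph handle forwarded to readDictValues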
Code example #22
File: downloadFilesFromLog.py Project: Mailis/EstNer
def insertNewContentToDict(localFilename, page_redirected, page_info, page_sha254, page_status):
    try:
        newDict = dict()
        newDict[page_sha254] = page_info
        newDict[page_sha254]["localFilename"] = localFilename
        newDict[page_sha254]["file_url"] = page_redirected
        newDict[page_sha254]["sha224"] = page_sha254
        newDict[page_sha254]["status"] = page_status
        newDict[page_sha254]["timeDir"] = comm.timeDir
        return newDict
    except:
        comm.printException(comm.pathToSaveJsonErrors)
        return False
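For orientation, a sketch of the mapping this helper returns, with hypothetical values (page_info is assumed to be a metadata dictionary such as parsed HTTP headers, and the extra keys are merged into it):

info = {"Content-Type": "text/html"}   # hypothetical metadata dict
d = insertNewContentToDict("a3f0e1", "http://www.example.ee/page", info, "9c2b77", 200)
# d == {"9c2b77": {"Content-Type": "text/html",
#                  "localFilename": "a3f0e1",
#                  "file_url": "http://www.example.ee/page",
#                  "sha224": "9c2b77",
#                  "status": 200,
#                  "timeDir": comm.timeDir}}
# Note: page_info is mutated in place, so info and d["9c2b77"] are the same dict object.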
Code example #23
File: downloadFilesFromLog.py Project: Mailis/EstNer
def insertValuesToDict(dictnry, localFilename, page_redirected, page_info, page_sha254, page_status):
    try:
        dictnry[localFilename] = dict()
        dictnry[localFilename][page_sha254] = page_info
        dictnry[localFilename][page_sha254]["localFilename"] = localFilename
        dictnry[localFilename][page_sha254]["file_url"] = page_redirected
        dictnry[localFilename][page_sha254]["sha224"] = page_sha254
        dictnry[localFilename][page_sha254]["status"] = page_status
        dictnry[localFilename][page_sha254]["timeDir"] = comm.timeDir
        return dictnry
    except:
        comm.printException(comm.pathToSaveJsonErrors)
        pass
Code example #24
def insertValuesToDict(dictnry, localFilename, page_redirected, page_info,
                       page_sha254, page_status):
    try:
        dictnry[localFilename] = dict()
        dictnry[localFilename][page_sha254] = page_info
        dictnry[localFilename][page_sha254]["localFilename"] = localFilename
        dictnry[localFilename][page_sha254]["file_url"] = page_redirected
        dictnry[localFilename][page_sha254]["sha224"] = page_sha254
        dictnry[localFilename][page_sha254]["status"] = page_status
        dictnry[localFilename][page_sha254]["timeDir"] = comm.timeDir
        return dictnry
    except:
        comm.printException(comm.pathToSaveJsonErrors)
        pass
Code example #25
def insertNewContentToDict(localFilename, page_redirected, page_info,
                           page_sha254, page_status):
    try:
        newDict = dict()
        newDict[page_sha254] = page_info
        newDict[page_sha254]["localFilename"] = localFilename
        newDict[page_sha254]["file_url"] = page_redirected
        newDict[page_sha254]["sha224"] = page_sha254
        newDict[page_sha254]["status"] = page_status
        newDict[page_sha254]["timeDir"] = comm.timeDir
        return newDict
    except:
        comm.printException(comm.pathToSaveJsonErrors)
        return False
Code example #26
def downloadDocuments(ajadir):
    start = datetime.datetime.now()
    currTime = time.strftime("%H:%M:%S")
    nrOfDownloads = dowloadFromJsons(ajadir)
    end = datetime.datetime.now()
    try:
        saveUpdateStatistics(currTime,
                             start,
                             end,
                             action="Download_datasets:_nr_of_downloads:_" +
                             str(nrOfDownloads),
                             isLast=True)
    except:
        comm.printException(comm.updateErrorsFilePath, errString="download")
        pass
Code example #27
File: commonMethods.py Project: peepkungas/EstNer
def measureDownloadsTime(path_to_stat_file, ajadir):
    start = datetime.datetime.now()
    currTime = time.strftime("%H:%M:%S")
    nrOfDownloads = dowloadFromJsons(ajadir)
    end = datetime.datetime.now()
    span = end - start
    try:
        jf = open(path_to_stat_file, 'a', encoding='utf-8')
        jf.write(currTime + " " + str(nrOfDownloads) + " " + str(span) + " " +
                 "\n")
        jf.close()
    except:
        comm.printException(comm.pathToSaveDownloadErrors,
                            errString="download")
        pass
Code example #28
File: downloadFilesFromLog.py Project: Mailis/EstNer
def sendFileToParser(contentType, baseUrl, redirectedTo, od, _encoding, localFilename, pageread):
    if("excel" in contentType):
        try:
            dirToSaveDownloads = comm.downloadsDir + baseUrl
            if not os.path.isdir(dirToSaveDownloads):
                os.makedirs(dirToSaveDownloads)
            fileparser.spreadURLsByContentType(redirectedTo, None, contentType, od, _encoding, filePath = (dirToSaveDownloads + "/" + localFilename))
        except:
            comm.printException(comm.pathToSaveProgrammingErrors, "create_dir_for_excel_and_send_file_to_parser")
            pass
    else:
        try:
            fileparser.spreadURLsByContentType(redirectedTo, pageread, contentType, od, _encoding)
        except:
            comm.printException(comm.pathToSaveProgrammingErrors, "send_file_to_parser")
            pass
Code example #29
File: authenticate_gce.py Project: Mailis/EstNer
def getMyAuthService(service_name = 'bigquery', service_version = 'v2'):
    http = httplib2.Http()
    token_uri = '%s/%s/token' % (METADATA_SERVER, SERVICE_ACCOUNT)
    resp, content = http.request(token_uri, method='GET',
                                 body=None,
                                 headers={'Metadata-Flavor': 'Google'})
    if resp.status == 200:
        d = json.loads(content)
        access_token = d['access_token']  # Save the access token
        credentials = oauth2_client.AccessTokenCredentials(access_token, 'my-user-agent/1.0')
        AUTH_HTTP = credentials.authorize(http)
        return build(service_name, service_version, http=AUTH_HTTP)
        
    else:
        comm.printException(comm.pathToSaveauthErrors, errString="AUTHENTICATION RESPONSE STATUS: " + str(resp.status))
        pass
Code example #30
def postListToWorker(wip, urlsList):
    pikkus = len(urlsList)
    dik = dict(zip(range(pikkus), urlsList))
    address = "http://" + wip
    try:
        requests.post(address,
                      data={
                          'data': json.dumps(dik),
                          'chunksize': json.dumps(comm.chunksize)
                      })
        return 1
    except:
        comm.printException(comm.pathToConnectionErrors,
                            errString='ConnectionError_to_worker_' + wip)
        return 0
        pass
Code example #31
def importExcelFiles(WORKER_INSTANCES):
    list_len = len(WORKER_INSTANCES)
    if list_len > 0:
        for instance in WORKER_INSTANCES:
            wwwip = instance['networkInterfaces'][0]['accessConfigs'][0][
                'natIP']  #download excels
            try:
                #excels_url ="http://146.148.115.150/downloaded_files/"
                excels_url = "http://" + wwwip + "/downloaded_files/"
                dest = comm.downloadsDir_for_excels
                os.system(
                    'wget  -r --no-parent -nH --cut-dirs=1 --reject "index.html*" '
                    + excels_url + " -P " + dest)
            except:
                comm.printException(comm.pathToSaveDownloadErrors,
                                    errString="collecting_Excels_and_PDFs")
                pass
Code example #32
File: read_pdf.py Project: Mailis/EstNer
def readPdf(url, readdedpdf, od):
    b = BytesIO(readdedpdf)
    pdfFile = PdfFileReader(b, "rb")
    pdfFile.strict = False
    #pdfFile = PdfFileReader("pdf-sample.pdf", "rb")
    
    #print(pdfFile)
    try:
        for i in range(pdfFile.numPages):
            #print(i)
            pageObject = pdfFile.getPage(i)#ContentStream(pdfFile.getPage(i)["/Contents"])
            text = (pageObject.extractText())
            sentences = comm.replaceToPunkts(text)
            for sentence in sentences:
                getEntities.getEntities(url, sentence, od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
        pass
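A hedged usage sketch: readdedpdf must be the raw PDF bytes, since they are wrapped in BytesIO before being handed to PyPDF2's PdfFileReader; the path and URL below are hypothetical:

with open("/tmp/downloads/sample.pdf", "rb") as f:
    pdf_bytes = f.read()
readPdf("http://www.example.ee/files/sample.pdf", pdf_bytes, od)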
Code example #33
def importRDFfiles():
    global WORKER_INSTANCES
    list_len = len(WORKER_INSTANCES)
    #print("WORKERS " + str(WORKER_INSTANCES))
    try:
        if list_len > 0:
            for instance in WORKER_INSTANCES:
                wwwname = instance['name']
                wwwip = instance['networkInterfaces'][0]['accessConfigs'][0]['natIP']
                www_data = dict()
                www_data["ip"] = wwwip
                www_data["name"] = wwwname
                www_data["statfile"] = ""
                subprocess.Popen(["python3", comm.parentDir + "upload_logfile/download_rdf_files.py", json.dumps(www_data)])
        else:
            comm.printException(comm.updateErrorsFilePath, errString='No instances to list.')
    except:
        comm.printException(comm.updateErrorsFilePath, errString='Importing_RDFs.')
Code example #34
File: read_pdf.py Project: peepkungas/EstNer
def readPdf(url, readdedpdf, od):
    b = BytesIO(readdedpdf)
    pdfFile = PdfFileReader(b, "rb")
    pdfFile.strict = False
    #pdfFile = PdfFileReader("pdf-sample.pdf", "rb")

    #print(pdfFile)
    try:
        for i in range(pdfFile.numPages):
            #print(i)
            pageObject = pdfFile.getPage(
                i)  #ContentStream(pdfFile.getPage(i)["/Contents"])
            text = (pageObject.extractText())
            sentences = comm.replaceToPunkts(text)
            for sentence in sentences:
                getEntities.getEntities(url, sentence, od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
        pass
Code example #35
File: authenticate_gce.py Project: peepkungas/EstNer
def getMyAuthService(service_name='bigquery', service_version='v2'):
    http = httplib2.Http()
    token_uri = '%s/%s/token' % (METADATA_SERVER, SERVICE_ACCOUNT)
    resp, content = http.request(token_uri,
                                 method='GET',
                                 body=None,
                                 headers={'Metadata-Flavor': 'Google'})
    if resp.status == 200:
        d = json.loads(content)
        access_token = d['access_token']  # Save the access token
        credentials = oauth2_client.AccessTokenCredentials(
            access_token, 'my-user-agent/1.0')
        AUTH_HTTP = credentials.authorize(http)
        return build(service_name, service_version, http=AUTH_HTTP)

    else:
        comm.printException(comm.pathToSaveauthErrors,
                            errString="AUTHENTICATION RESPONSE STATUS: " +
                            resp.status)
        pass
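A hedged usage sketch: because the other examples call client.objects() (the Cloud Storage JSON API), the service name and version below are assumptions that deviate from the bigquery/v2 defaults:

client = getMyAuthService(service_name='storage', service_version='v1')  # assumed service/version
if client is not None:          # the function falls through (returns None) when the metadata request fails
    listOfJsonObjects(client)   # defined in the auth.py examples below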
Code example #36
File: auth.py Project: Mailis/EstNer
def listOfJsonObjects(client):
    try:
        # Get Metadata
        req = client.objects().list(bucket=comm.jsonsDir)            
        # If you have too many items to list in one request, list_next() will
        # automatically handle paging with the pageToken.
        while req is not None:
            resp = req.execute()
            #print json.dumps(resp, indent=2)
            #print ("----------------------------------")
            if resp and 'items' in resp:
                for item in (resp["items"]):
                    #print(item["name"])
                    itemname = (item["name"])
                    processObject(client, itemname)
                
                req = client.objects().list_next(req, resp)
    except oauth2_client.AccessTokenRefreshError:
        comm.printException(comm.pathToSaveDownloadErrors, errString="False credentials")
        pass
Code example #37
def listOfJsonObjects(client):
    try:
        # Get Metadata
        req = client.objects().list(bucket=comm.jsonsDir)
        # If you have too many items to list in one request, list_next() will
        # automatically handle paging with the pageToken.
        while req is not None:
            resp = req.execute()
            #print json.dumps(resp, indent=2)
            #print ("----------------------------------")
            if resp and 'items' in resp:
                for item in (resp["items"]):
                    #print(item["name"])
                    itemname = (item["name"])
                    processObject(client, itemname)

                req = client.objects().list_next(req, resp)
    except oauth2_client.AccessTokenRefreshError:
        comm.printException(comm.pathToSaveDownloadErrors,
                            errString="False credentials")
        pass
Code example #38
File: initRdf.py Project: Mailis/EstNer
 def addTriples(self, chunkedList, addLemmas = True):
     try:
         newDataExists = False
         g = self.getPerRdfGraph()
         g_new = Graph()
         #define specific namespace prefix
         self.bindNamespaces(g)
         for andmed in chunkedList:
             for webpage in andmed:
                 gName = andmed[webpage]["gName"]
                 fName = andmed[webpage]["fName"]
                 name = andmed[webpage]["name"]
                 lemmaList = andmed[webpage]["lemmaSet"]
                 #print (lemmaList)
                 try:
                     #make triples
                     newPerson = URIRef(self.perStr + name.replace (">", "").replace ("<", "").replace ("|", "").replace (" ", "_").lower())
                     newGivenName = Literal(gName)
                     newFamilyName = Literal(fName)
                     newPerName = Literal(name)
                     newWebpage = URIRef(webpage);
                     
                     #add triples
                     #check if graph contains bob already
                     if ( newPerson, RDF.type, URIRef(self.person)) not in g:
                         newDataExists = True
                         g_new.add( (newPerson, RDF.type, URIRef(self.person)) )
                         if(newGivenName != Literal("")):
                             g_new.add( (newPerson, self.givenName, newGivenName) )
                         if(newFamilyName != Literal("")):
                             g_new.add( (newPerson, self.familyName, newFamilyName) )
                         g_new.add( (newPerson, self.perName, newPerName) )
                       
                     #check if graph contains bob already
                     if ( newPerson, self.mentionedAtSite, newWebpage) not in g:
                         newDataExists = True
                         g_new.add( (newPerson, self.mentionedAtSite, newWebpage) )
                     #add lemmas also
                     if(addLemmas):
                         for newLemma in lemmaList:
                             #check if graph contains bob already
                             if ( newPerson, self.lemma, Literal(newLemma)) not in g:
                                 newDataExists = True
                                 g_new.add( (newPerson, self.lemma, Literal(newLemma)) )
                 except:
                     comm.printException(comm.initRdfErrorsFilePath, "build_per_graph")
                     pass
         #print(str(newDataExists)) 
         #write rdf into file
         if (newDataExists):
             try:
                 gg = g+g_new
                 (gg).serialize(self.perRdf, format='pretty-xml', encoding='utf-8')
             except:
                 comm.printException(comm.initRdfErrorsFilePath, "RDF People Manager serialization error: ")
                 pass
     except:
         comm.printException(comm.initRdfErrorsFilePath, "RDF People Manager (addTriples) error: ")
         pass
Code example #39
File: auth.py Project: Mailis/EstNer
def doDownloadJob(jsonToDict, itemname):
    global nrOfDownloads
    try:
        base_url = jsonToDict['base_url']#becomes folderName
        for fname_key in jsonToDict.keys():
            #At first level, there are two sorts of keys in json-file:
            #1. base-url
            #2. sha(s) of filename(s), (sha of file full url, including "/"-slashes)
            ##As the file content may change over time, every sha(filename)-element contains
            ##1. sha(s) of a content(s)
            ###Every sha of a content contains
            ###1. metadata of a file/content
            if (fname_key != 'base_url'):#fname_key(sha of file url) becomes local filename
                #'jsonToDict[fname_key].keys()' is a list of sha(content) of a current sha(filename)
                #loop over every sha(content) of a sha(filename)
                #here, csha is the sha(filecontent)
                for csha in jsonToDict[fname_key].keys():
                    contentKeyExists=False
                    """check if metadata contains key 'Content-Type'"""
                    try:
                        if ('Content-Type' in jsonToDict[fname_key][csha]):
                            contentKeyExists=True
                    except:
                        contentKeyExists=False
                        pass
                    #Get the time the json-file was made
                    timeDir = jsonToDict[fname_key][csha]['timeDir']
                    #download only changes that are no older than
                    #the date of start of current process!
                    process_start_date = comm.makeDateObj(ajadir)
                    json_model_date = comm.makeDateObj(timeDir)
                    #continue only if the date in the model is
                    #younger than or equal to the date of the process start
                    if(contentKeyExists) & (json_model_date >= process_start_date):
                        #excel type is already downloaded
                        if("excel" not in jsonToDict[fname_key][csha]['Content-Type']):
                            #full URL of a file
                            file_url = jsonToDict[fname_key][csha]['file_url']
                            dirPath = comm.downloadsDir + base_url + "/"
                            try:
                                #create folder for this 'date/base_url' if does not exist
                                if (not os.path.isdir(dirPath)) & (not os.path.exists(dirPath)):
                                    os.makedirs(dirPath)
                                try:
                                    #download the file into that folder
                                    #fname_key is the sha(filename)
                                    #resulting path of a file will become 'date/base_url/sha(filename)'
                                    urr(file_url, dirPath + fname_key)
                                    nrOfDownloads += 1
                                except:
                                    comm.printException(comm.pathToSaveDownloadErrors, itemname)
                                    pass
                            except:
                                comm.printException(comm.pathToSaveDownloadErrors, itemname)
                                pass
    except:
        comm.printException(comm.pathToSaveDownloadErrors, itemname)
        pass   
Code example #40
def readHtmlPage(htmlurl, readedPage, ontologyData):
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        #if the root is null, the html is incorrectly formed
        if(root is not None):
            for element in root.iter("head"):
                element.drop_tree()
            for element in root.iter("script"):
                element.drop_tree()
            for element in root.iter("style"):
                element.drop_tree()
            for element in root.iter("noscript"):
                element.drop_tree()
            for element in root.iter("input"):
                element.drop_tree()
            for element in root.iter("form"):
                element.drop_tree()
            for element in root.iter("title"):
                element.drop_tree()
            for element in root.iter("img"):
                element.drop_tree()
            
            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except:
                    pass
            if(len(sentences) > 0): 
                lsent = list(sentences)
                for lau in lsent:
                    if(lau != ""):
                        laused = comm.replaceToPunkts(lau)
                        for s6ne in laused:
                            getEntities.getEntities(htmlurl, s6ne.strip(), ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_html.py " + htmlurl)
        pass
Code example #41
File: read_xml.py Project: peepkungas/EstNer
def readXml(xmlurl, pathToFile, ontologyData):

    # https://docs.python.org/3.4/library/functions.html#setattr
    """
    #https://docs.python.org/3.4/library/xml.etree.elementtree.html?highlight=elementtree#parsing-xml
    if httpResponse is path to xml file:
      tree = ET.parse(httpResponse)
      root = tree.getroot()
    
    Or directly from a string: (xml is read already):
    """
    try:
        root = ET.fromstring(pathToFile)
        if root is not None:
            for data in root.iter():
                if data.text is not None:
                    stripped = data.text.strip()
                    if (stripped is not None) & (len(stripped) > 2):
                        sentences = comm.replaceToPunkts(stripped)
                        for sentence in sentences:
                            getEntities.getEntities(xmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_xml.py " + xmlurl)
        pass
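Despite its name, pathToFile is fed directly to ET.fromstring, so callers pass the XML document content rather than a path. A minimal sketch with a hypothetical URL and payload:

xml_payload = "<root><item>Tartu Ülikool asub Tartus.</item></root>"
readXml("http://www.example.ee/feed.xml", xml_payload, ontologyData)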
Code example #42
def readXml(xmlurl, pathToFile, ontologyData):
    
    #https://docs.python.org/3.4/library/functions.html#setattr
    '''
    #https://docs.python.org/3.4/library/xml.etree.elementtree.html?highlight=elementtree#parsing-xml
    if httpResponse is path to xml file:
      tree = ET.parse(httpResponse)
      root = tree.getroot()
    
    Or directly from a string: (xml is read already):
    '''
    try:
        root = ET.fromstring(pathToFile)
        if(root is  not None):
            for data in root.iter():
                if(data.text is not None):
                    stripped = data.text.strip()
                    if(stripped is not None) & (len(stripped)>2):
                        sentences = comm.replaceToPunkts(stripped)
                        for sentence in sentences:
                            getEntities.getEntities(xmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_xml.py " + xmlurl)
        pass
Code example #43
File: initRdf.py Project: Mailis/EstNer
 def addTriples(self, chunkedList, addLemmas = True):
     try:
         newDataExists = False
         g = self.getLocRdfGraph()
         g_new = Graph()
         #define specific namespace prefix
         self.bindNamespaces(g)
         
         for andmed in chunkedList:
             for webpage in andmed:
                 for objName in andmed[webpage]:
                     lemmaList = andmed[webpage][objName]
                     #print (lemmaList)
                     try:
                         #make triples
                         newLocation = URIRef(self.locStr + objName.replace (">", "").replace ("<", "").replace ("|", "").replace (" ", "_").lower())
                         newLocationName = Literal(objName)
                         newWebpage = URIRef(webpage);
                         
                         #add triples
                         #check if graph contains bob already
                         if ( newLocation, RDF.type, URIRef(self.location)) not in g:
                             newDataExists = True
                             g_new .add( (newLocation, RDF.type, URIRef(self.location)) )
                             g_new .add( (newLocation, self.locationName, newLocationName) )
                          
                         #g_new .add( (newLocation, od.mentionedAtSite, newWebpage) )   
                         #check if graph contains bob already
                         if ( newLocation, self.mentionedAtSite, newWebpage) not in g:
                             newDataExists = True
                             g_new .add( (newLocation, self.mentionedAtSite, newWebpage) )
                         #add lemmas also
                         if(addLemmas):
                             for newLemma in lemmaList:
                                 #check if graph contains bob already
                                 if ( newLocation, self.lemma, Literal(newLemma)) not in g:
                                     newDataExists = True
                                     g_new .add( (newLocation, self.lemma, Literal(newLemma)) )
                     except:
                         comm.printException(comm.initRdfErrorsFilePath, "build_loc_graph")
                         pass
         #write rdf into file
         if (newDataExists):
             try:
                 gg = g+g_new
                 (gg).serialize(self.locRdf, format='pretty-xml', encoding='utf-8')
             except:
                 comm.printException(comm.initRdfErrorsFilePath, "RDF Location Manager serialization error: ")
                 pass
     except:
         comm.printException(comm.initRdfErrorsFilePath, "RDF Location Manager (addTriples) error: ")
         pass
Code example #44
    def addTriples(self, chunkedList, addLemmas=True):
        try:
            newDataExists = False
            g = self.getLocRdfGraph()
            g_new = Graph()
            #define specific namespace prefix
            self.bindNamespaces(g)

            for andmed in chunkedList:
                for webpage in andmed:
                    for objName in andmed[webpage]:
                        lemmaList = andmed[webpage][objName]
                        #print (lemmaList)
                        try:
                            #make triples
                            newLocation = URIRef(
                                self.locStr +
                                objName.replace(">", "").replace("<", "").
                                replace("|", "").replace(" ", "_").lower())
                            newLocationName = Literal(objName)
                            newWebpage = URIRef(webpage)

                            #add triples
                            #check if graph contains bob already
                            if (newLocation, RDF.type,
                                    URIRef(self.location)) not in g:
                                newDataExists = True
                                g_new.add((newLocation, RDF.type,
                                           URIRef(self.location)))
                                g_new.add((newLocation, self.locationName,
                                           newLocationName))

                            #g_new .add( (newLocation, od.mentionedAtSite, newWebpage) )
                            #check if graph contains bob already
                            if (newLocation, self.mentionedAtSite,
                                    newWebpage) not in g:
                                newDataExists = True
                                g_new.add((newLocation, self.mentionedAtSite,
                                           newWebpage))
                            #add lemmas also
                            if (addLemmas):
                                for newLemma in lemmaList:
                                    #check if graph contains bob already
                                    if (newLocation, self.lemma,
                                            Literal(newLemma)) not in g:
                                        newDataExists = True
                                        g_new.add((newLocation, self.lemma,
                                                   Literal(newLemma)))
                        except:
                            comm.printException(comm.initRdfErrorsFilePath,
                                                "build_loc_graph")
                            pass
            #write rdf into file
            if (newDataExists):
                try:
                    gg = g + g_new
                    (gg).serialize(self.locRdf,
                                   format='pretty-xml',
                                   encoding='utf-8')
                except:
                    comm.printException(
                        comm.initRdfErrorsFilePath,
                        "RDF Location Manager serialization error: ")
                    pass
        except:
            comm.printException(comm.initRdfErrorsFilePath,
                                "RDF Location Manager (addTriples) error: ")
            pass
Code example #45
File: commonMethods.py Project: peepkungas/EstNer
def dowloadFromJsons(ajadir):
    nrOfDownloads = 0
    jsons = comm.jsonsDir
    """loop over every file in jsons-folder"""
    for filePath in listdir(jsons):
        if (filePath != "errors.txt"):
            #open json file:
            #'jsons' is a folder where json-files are saved
            #'filePath' is a filename in this folder
            ##'jsons'-folder lives in the folder "datadownload"
            ##'downloaded_files'-folder lives also in the folder "datadownload"
            try:
                """load json-file into directory-type"""
                jsonToDict = json.load(open(jsons + filePath))
            except:
                continue
            #'base_url' is the hostname, before "/"-slashes, in json-file
            #'base_url' is the json-file name ('filePath'), followed by an extension '.json'
            #'base_url' is also a directory name in 'downloaded_files'-folder
            base_url = jsonToDict['base_url']  #becomes folderName
            for fname_key in jsonToDict.keys():
                #At first level, there are two sorts of keys in json-file:
                #1. base-url
                #2. sha(s) of filename(s), (sha of file full url, including "/"-slashes)
                ##As the file content may change over time, every sha(filename)-element contains
                ##1. sha(s) of a content(s)
                ###Every sha of a content contains
                ###1. metadata of a file/content
                if (fname_key != 'base_url'
                    ):  #fname_key(sha of file url) becomes local filename
                    #'jsonToDict[fname_key].keys()' is a list of sha(content) of a current sha(filename)
                    #loop over every sha(content) of a sha(filename)
                    #here, csha is the sha(filecontent)
                    for csha in jsonToDict[fname_key].keys():
                        contentKeyExists = False
                        """check if metadata contains key 'Content-Type'"""
                        try:
                            if ('Content-Type' in jsonToDict[fname_key][csha]):
                                contentKeyExists = True
                        except:
                            contentKeyExists = False
                            pass
                        """Get the time the json-file was made"""
                        timeDir = jsonToDict[fname_key][csha]['timeDir']
                        #download only today's changes!
                        if (contentKeyExists) & (ajadir == timeDir):
                            #excel type is already downloaded
                            if ("excel" not in jsonToDict[fname_key][csha]
                                ['Content-Type']):
                                """Full URL of a file"""
                                file_url = jsonToDict[fname_key][csha][
                                    'file_url']
                                """'dirPath' is the path of a folder of a file currently wants to be downloaded"""
                                dirPath = comm.downloadsDir + base_url + "/"
                                try:
                                    """create folder for this 'date/base_url' if does not exist"""
                                    if (not os.path.isdir(dirPath)) & (
                                            not os.path.exists(dirPath)):
                                        os.makedirs(dirPath)
                                    try:
                                        #download the file into that folder
                                        #fname_key is the sha(filename)
                                        #resulting path of a file will become 'date/base_url/sha(filename)'
                                        urr(file_url, dirPath + fname_key)
                                        nrOfDownloads += 1
                                        #print(timeDir, base_url, , file_url)

                                    except:
                                        comm.printException(
                                            comm.pathToSaveDownloadErrors,
                                            filePath)
                                        pass
                                except:
                                    comm.printException(
                                        comm.pathToSaveDownloadErrors,
                                        filePath)
                                    pass
    return nrOfDownloads
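For orientation, a sketch of the JSON layout this loop expects, reconstructed from the key accesses above (all names and values are hypothetical placeholders):

example_json = {
    "base_url": "www.example.ee",              # hostname; also the target folder name
    "<sha224 of file URL>": {                  # becomes the local file name
        "<sha224 of file content>": {
            "Content-Type": "text/html",
            "file_url": "http://www.example.ee/page.html",
            "timeDir": "2015-06-01"            # date the json-file was made; compared to ajadir
        }
    }
}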
Code example #46
File: commonMethods.py Project: Mailis/EstNer
def dowloadFromJsons(ajadir):
    nrOfDownloads = 0
    jsons=comm.jsonsDir
    """loop over every file in jsons-folder"""
    for filePath in listdir(jsons):
            if(filePath != "errors.txt"):
                #open json file:
                #'jsons' is a folder where json-files are saved
                #'filePath' is a filename in this folder
                ##'jsons'-folder lives in the folder "datadownload"
                ##'downloaded_files'-folder lives also in the folder "datadownload"
                try:
                    """load json-file into directory-type"""
                    jsonToDict = json.load(open(jsons+filePath));
                except:
                    continue
                #'base_url' is the hostname, before "/"-slashes, in json-file
                #'base_url' is the json-file name ('filePath'), followed by an extension '.json'
                #'base_url' is also a directory name in 'downloaded_files'-folder
                base_url = jsonToDict['base_url']#becomes folderName
                for fname_key in jsonToDict.keys():
                    #At first level, there are two sorts of keys in json-file:
                    #1. base-url
                    #2. sha(s) of filename(s), (sha of file full url, including "/"-slashes)
                    ##As the file content may change over time, every sha(filename)-element contains
                    ##1. sha(s) of a content(s)
                    ###Every sha of a content contains
                    ###1. metadata of a file/content
                    if (fname_key != 'base_url'):#fname_key(sha of file url) becomes local filename
                        #'jsonToDict[fname_key].keys()' is a list of sha(content) of a current sha(filename)
                        #loop over every sha(content) of a sha(filename)
                        #here, csha is the sha(filecontent)
                        for csha in jsonToDict[fname_key].keys():
                            contentKeyExists=False
                            """check if metadata contains key 'Content-Type'"""
                            try:
                               if ('Content-Type' in jsonToDict[fname_key][csha]):
                                   contentKeyExists=True
                            except:
                                contentKeyExists=False
                                pass
                            """Get the time the json-file was made"""
                            timeDir = jsonToDict[fname_key][csha]['timeDir']
                            #download only today's changes!
                            if(contentKeyExists) & (ajadir == timeDir):
                                #excel type is already downloaded
                                if("excel" not in jsonToDict[fname_key][csha]['Content-Type']):
                                    """Full URL of a file"""
                                    file_url = jsonToDict[fname_key][csha]['file_url']
                                    """'dirPath' is the path of a folder of a file currently wants to be downloaded"""
                                    dirPath = comm.downloadsDir + base_url + "/"
                                    try:
                                        """create folder for this 'date/base_url' if does not exist"""
                                        if (not os.path.isdir(dirPath)) & (not os.path.exists(dirPath)):
                                            os.makedirs(dirPath)
                                        try:
                                            #download the file into that folder
                                            #fname_key is the sha(filename)
                                            #resulting path of a file will become 'date/base_url/sha(filename)'
                                            urr(file_url, dirPath + fname_key)
                                            nrOfDownloads += 1
                                            #print(timeDir, base_url, , file_url)
                                            
                                        except:
                                            comm.printException(comm.pathToSaveDownloadErrors, filePath)
                                            pass
                                    except:
                                        comm.printException(comm.pathToSaveDownloadErrors, filePath)
                                        pass
    return nrOfDownloads
Code example #47
def mergeRDFfiles():

    try:
        for dname in rdfFnames:
            g_old_path = comm.pathToRDFdir + dname + ".rdf"  # file: /var/www/html/master/rdf_files/ORG.rdf
            # temporary dirs ORG, LOC, PER
            rdf_file_dir = comm.pathToRDFdir + dname  # dir: /var/www/html/master/rdf_files/ORG
            g_copy_path = comm.rdf_copypath + dname + ".rdf"  # file: /var/www/html/master/rdf_copy/ORG.rdf

            g_new = Graph()
            g_new_for_copy = Graph()
            for rdf_file in listdir(rdf_file_dir):
                tmp_path = rdf_file_dir + "/" + rdf_file  # /var/www/html/master/rdf_files/ORG/<worker-1>_<date_dir>.rdf
                try:
                    g_new.parse(tmp_path)  # load temporary file into graph
                    g_new_for_copy.parse(tmp_path)  # load temporary file into graph
                except:
                    comm.printException(
                        comm.pathToSaveDownloadErrors, errString="cannot load temporary file into graph"
                    )
                    pass
            if os.path.exists(g_old_path):  # there is an existing path to rdf-file
                try:
                    g_new.parse(g_old_path)  # load old file into graph, adding it to new graph
                    g_new.serialize(g_old_path, format="pretty-xml", encoding="utf-8")
                except:
                    comm.printException(comm.pathToSaveDownloadErrors, errString="cannot merge RDF")
                    pass
                # delete used files
                os.system("sudo rm -r " + rdf_file_dir + "/*")
            else:  # no existing path to RDF files yet
                try:
                    g_new.serialize(g_old_path, format="pretty-xml", encoding="utf-8")
                except:
                    comm.printException(comm.pathToSaveDownloadErrors, errString="cannot create RDF")
                    pass

            ##BACKUP RDF files
            if os.path.exists(g_copy_path):  # there is an existing path to rdf-file
                try:
                    g_new_for_copy.parse(g_copy_path)  # load old file into graph, adding it to new graph
                    g_new_for_copy.serialize(g_copy_path, format="pretty-xml", encoding="utf-8")  # save into file
                except:
                    comm.printException(comm.pathToSaveDownloadErrors, errString="cannot merge backup  RDF")
                    pass
            else:  # no existing path to RDF files yet
                try:
                    g_new_for_copy.serialize(g_copy_path, format="pretty-xml", encoding="utf-8")  # save into file
                except:
                    comm.printException(comm.pathToSaveDownloadErrors, errString="cannot create backup RDF")
                    pass
    except:
        comm.printException(comm.pathToSaveDownloadErrors, errString="merge RDF")
        pass
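mergeRDFfiles loads every temporary per-worker file of one entity type into an rdflib Graph, merges in the existing ORG/LOC/PER file when there is one, and writes the union back as pretty-xml. A stripped-down sketch of that merge, with hypothetical file names:

# Sketch of the merge step, using rdflib; the file names are hypothetical.
from rdflib import Graph

g_new = Graph()
g_new.parse("worker-1_2016-01-01.rdf")  # temporary per-worker file
g_new.parse("worker-2_2016-01-01.rdf")  # parsing into the same Graph accumulates the triples
g_new.parse("ORG.rdf")                  # previously merged file, if it already exists
g_new.serialize("ORG.rdf", format="pretty-xml", encoding="utf-8")  # write back the union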
コード例 #48
0
ファイル: auth.py プロジェクト: Mailis/EstNer
     filedic["statfile"] = mylargefile
     subprocess.Popen(["python3", "amountOfTriples.py", json.dumps(filedic)])
     
     #end of statistics!
     comm.saveStatistics(mylargefile, "\n--------------------------- ")
     
     
     #delete RDF-files and download excel-files in each worker
     postToWorker.deleteRDFsInWorkers(ipList)
     ###
     ###
     try:
         # Download json-objects from cloud storage
         downloadJsons.getJsonObjects(client)
     except:
         comm.printException(comm.pathToSaveDownloadErrors, errString="downloadJsons")
         pass
     
     ###
     ###
     try:
         ###
         ### Download generated error-objects from cloud storage
         downloadErrors.getErrorObjects(client)
     except:
         comm.printException(comm.pathToSaveDownloadErrors, errString="downloadErrors")
         pass
     
     
 else:
     print("no worker instances")
コード例 #49
0
start = datetime.datetime.now()
currTime = time.strftime("%d/%m/%Y_%H:%M:%S")
#dir of json-files
jsonFiles = comm.jsonsDir_local
try:
    p = subprocess.Popen(["python2", comm.parentDir + "upload_logfile/getworkers.py"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    #print((out.decode()))
    WORKER_INSTANCES = json.loads(out.decode())
    #print(WORKER_INSTANCES)
    #tuple of (list of worker IPs, list of (IP, number of CPUs) tuples)
    ipList_and_tuple = getListOfWorkerIPs(WORKER_INSTANCES)
    ipList = ipList_and_tuple[0]
    ipList_tuple = ipList_and_tuple[1]
except Exception as e:
    comm.printException(comm.updateErrorsFilePath, errString="subprocess_to_getworkers.py" + str(e))
    pass
#if there is at least 1 IP in the list
lenIpList = len(ipList)


if (lenIpList > 0):
    #the list of doc URLs to be sent to a worker
    listOfUrls = []
    #loop over all json-files
    for (dirpath, dirnames, filenames) in walk(jsonFiles):
        for fname in filenames:
            #search for changed content by comparing the content hash
            #saved in the json-file with the content of the web document
            #at the current moment
            #print(os.path.join(dirpath, fname))
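The snippet above launches a small helper script in a subprocess and reads the worker list from its standard output as JSON. A self-contained sketch of that pattern follows; the script name and the shape of its output are assumptions here.

# Sketch of the subprocess-plus-JSON pattern; "getworkers.py" is a hypothetical path.
import json
import subprocess

p = subprocess.Popen(["python2", "getworkers.py"],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()                   # wait for the helper and collect its output
worker_instances = json.loads(out.decode())  # the helper is expected to print a JSON list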
コード例 #50
0
ファイル: monthlyUpdate.py プロジェクト: Mailis/EstNer
            dirpath = dirpath0
            jsonsList.extend(filenames) #collect all json-files into list
            break
        for filename in jsonsList:
            jobs.append(dirpath+filename)
            nrOfJobs += 1
             
        #pool of processes
        pool = Pool(processes=os.cpu_count())
        #search for changes
        nrOfChanges_ = pool.map(addChangedContent, jobs)
        
        pool.close()
        pool.join()
    except:
        comm.printException(comm.updateErrorsFilePath, errString="update")
        pass
    
    
#import getpass
#print("USER: "******" " + str(nrOfJobs) + " " + str(span) + " " + str(comm.chunksize) + " " + str(sum(nrOfChanges_)) + " ")
    jf.close()
except:
    comm.printException(comm.updateErrorsFilePath, errString="update")
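The update step fans change detection out over a process pool: one job per json-file, and the per-file change counts returned by the workers are summed afterwards. A small runnable sketch of that pattern with a stand-in worker function:

# Sketch of the Pool.map pattern; count_changes is a hypothetical stand-in for addChangedContent.
import os
from multiprocessing import Pool

def count_changes(json_path):
    # a real worker would compare the stored content hashes with the current web content
    return 0

if __name__ == "__main__":
    jobs = ["/tmp/jsons/a.json", "/tmp/jsons/b.json"]  # hypothetical job list
    with Pool(processes=os.cpu_count()) as pool:
        results = pool.map(count_changes, jobs)        # one result per json-file
    print(sum(results))                                # total number of detected changes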
コード例 #51
0
def saveMetadata(url, od):
    canOpen = True
    try:
        redirectedTo = requests.get(url).url
    except:
        canOpen = False
        errStr = (url + " Cannot_open_web-source \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
        pass
    if canOpen is True:
        if not os.path.isdir(comm.jsonsDir):
            os.makedirs(comm.jsonsDir)

        try:
            page = requests.get(redirectedTo)
            statusCode = page.status_code
            pageread = page.text
            pageInfo = dict(page.headers)
            #page.close()
            localFilename = comm.getUrlSHA(redirectedTo)
            contentType = page.headers['content-type']
            _encoding = page.encoding
            '''
            if(pageencoding is None) or (len(pageencoding) == 0):
                _encoding = commethods.getDocumentEncoding(contentType)
            else:
                _encoding = pageencoding.upper()
            '''
            #base_url also becomes the json file name (pathToSaveMetadata)
            baseUrl = (urlparse(redirectedTo)).netloc
            sha224_ = (hashlib.sha224(pageread.encode('utf-8')).hexdigest())

            pathToSaveMetadata = comm.jsonsDir + baseUrl + ".json"
            isDesiredType = comm.isDesiredContent(contentType)
            #print("isDesiredType ", isDesiredType)#

            if (isDesiredType):
                #if this file does not exist yet in local storage, create it
                #if this base-url path does not exist:
                if not os.path.isfile(pathToSaveMetadata):
                    #print("excel in contentType ", ("excel" in contentType))#
                    #create directory for the downloads from this base_url and save file into it
                    #downloadedFilePath = downLoadFile(pageread, localFilename, baseUrl)
                    #print("downloadedFilePath ", downloadedFilePath)
                    #if(downloadedFilePath):
                    #print()#
                    infoDict_tmp = dict()
                    infoDict_tmp["base_url"] = baseUrl
                    infoDict = insertValuesToDict(infoDict_tmp, localFilename,
                                                  redirectedTo, pageInfo,
                                                  sha224_, statusCode)
                    saveJsonToFile(pathToSaveMetadata, infoDict)
                    #send the file to parser and then to estner for extracting entities:
                    sendFileToParser(contentType, baseUrl, redirectedTo, od,
                                     _encoding, localFilename, pageread)

                #if this file already exists, update it with the new version
                else:
                    #print("ELSE excel in contentType ", ("excel" in contentType))#
                    someNewData = False
                    # Open the json file (<baseUrl>.json) for reading
                    # and compare the sha224 stored in the json with the sha224 of the fetched content
                    in_file = open(pathToSaveMetadata, "r")
                    # Load the contents from the file, which creates a new dictionary

                    isValidJsonFile = True
                    try:
                        #open json file
                        existingFileDict = json.load(in_file)
                    except:
                        isValidJsonFile = False
                        pass
                    # Close the file... we don't need it anymore
                    in_file.close()
                    if (isValidJsonFile):
                        #(existingFileDict_tmp['base_url']) e.g. www.temtec.ee
                        if (existingFileDict['base_url'] in redirectedTo
                            ):  #same file resource was requested
                            #print("BASE URL ", existingFileDict['base_url'])
                            #dict has top-level keys: 'base_url' plus the sha224 of each file name
                            fNameKey = [
                                k for k in existingFileDict.keys()
                                if k != 'base_url'
                            ]
                            #print("  fNameKey", fNameKey)
                            #print("  localFilename", localFilename)
                            #print("  localFilename in fNameKey", (localFilename in fNameKey))
                            #this list may contain 'localFilename'
                            if (localFilename in fNameKey):
                                #if none of the earlier saved content shas equals the current sha,
                                #the contents of the file have changed.
                                #The saved shas are the keys under this filename entry:
                                shaKeys = existingFileDict[localFilename].keys(
                                )
                                if (sha224_ not in shaKeys):  #file has changed
                                    #search by date: if it is the same day, update the existing sha key
                                    replaceSha = ""
                                    #if some sha key holds the same timeDir, take that sha and replace it
                                    for sk in shaKeys:
                                        savedDate = existingFileDict[
                                            localFilename][sk]["timeDir"]
                                        if (savedDate == comm.timeDir):
                                            replaceSha = sk
                                            break
                                    if (
                                            replaceSha != ""
                                    ):  #delete that sha: the same-day entry is being replaced
                                        del existingFileDict[localFilename][
                                            replaceSha]
                                        #print("REPLACING!")
                                    #add new value with new content_sha-key under filename_sha-key
                                    #filename-url is same, but content is changed
                                    #so add new content-sha
                                    newDataDict = insertNewContentToDict(
                                        localFilename, redirectedTo, pageInfo,
                                        sha224_, statusCode)
                                    if (newDataDict):
                                        existingFileDict[localFilename].update(
                                            newDataDict)
                                        someNewData = True

                            else:  #new file (resource) from same domain (or 'base_url') requested
                                #add new value with new filename_sha-key for that base-resource
                                newDataDict = insertNewSubFileToDict(
                                    localFilename, redirectedTo, pageInfo,
                                    sha224_, statusCode)
                                if (newDataDict):
                                    existingFileDict.update(newDataDict)
                                    someNewData = True
                    if someNewData:
                        #save metadata of file
                        saveJsonToFile(pathToSaveMetadata, existingFileDict)
                        #send the file to parser, then to estner entity extractor:
                        sendFileToParser(contentType, baseUrl, redirectedTo,
                                         od, _encoding, localFilename,
                                         pageread)
        #save errors
        except urr.HTTPError as e:
            errStr = (redirectedTo + " HTTPError " + str(e.code) + " " +
                      str(e.reason) + " \n")
            comm.printException(comm.pathToSaveJsonErrors, errStr)
            pass
        except urr.URLError as e:
            errStr = (redirectedTo + " URLError " + str(e.reason) + " \n")
            comm.printException(comm.pathToSaveJsonErrors, errStr)
            pass
        except IOError as e:
            errStr = (redirectedTo + " " +
                      str("I/O_erRror({0}):_{1}".format(e.errno, e.strerror)) +
                      "\n")
            comm.printException(comm.pathToSaveJsonErrors, errStr)
            pass
        except ValueError:
            errStr = (redirectedTo +
                      " ValueError_Could_not_convert_data_to_an_integer.\n")
            comm.printException(comm.pathToSaveJsonErrors, errStr)
            pass
        except TypeError:
            errStr = (redirectedTo + " TypeError\n")
            comm.printException(comm.pathToSaveJsonErrors, errStr)
            pass
        except:
            errStr = (redirectedTo + " Unexpected_error:_" +
                      (str(sys.exc_info()[0])) + "\n")
            comm.printException(comm.pathToSaveJsonErrors, errStr)
            pass
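For orientation, this is roughly the shape of the <baseUrl>.json file that saveMetadata creates and extends. The exact field set comes from insertValuesToDict, which is not shown here, so the entries below are illustrative assumptions based on the keys the surrounding code reads (file_url, Content-Type, timeDir):

# Hypothetical illustration of the <baseUrl>.json layout maintained by saveMetadata.
example_metadata = {
    "base_url": "www.example.ee",                      # netloc of the redirected URL
    "<sha224 of the file URL>": {                      # the localFilename key, one per document
        "<sha224 of the file content>": {              # one entry per observed content version
            "file_url": "http://www.example.ee/doc",   # human-readable URL, used by detectChanges
            "Content-Type": "text/html",               # copied from the response headers
            "timeDir": "<date the version was seen>",  # used for the same-day replacement check
        },
    },
}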
コード例 #52
0
    def addTriples(self, chunkedList, addLemmas=True):
        try:
            newDataExists = False
            g = self.getPerRdfGraph()
            g_new = Graph()
            #define specific namespace prefix
            self.bindNamespaces(g)
            for andmed in chunkedList:
                for webpage in andmed:
                    gName = andmed[webpage]["gName"]
                    fName = andmed[webpage]["fName"]
                    name = andmed[webpage]["name"]
                    lemmaList = andmed[webpage]["lemmaSet"]
                    #print (lemmaList)
                    try:
                        #make triples
                        newPerson = URIRef(
                            self.perStr +
                            name.replace(">", "").replace("<", "").replace(
                                "|", "").replace(" ", "_").lower())
                        newGivenName = Literal(gName)
                        newFamilyName = Literal(fName)
                        newPerName = Literal(name)
                        newWebpage = URIRef(webpage)

                        #add triples
                        #check if the graph already contains this person
                        if (newPerson, RDF.type, URIRef(self.person)) not in g:
                            newDataExists = True
                            g_new.add(
                                (newPerson, RDF.type, URIRef(self.person)))
                            if (newGivenName != Literal("")):
                                g_new.add(
                                    (newPerson, self.givenName, newGivenName))
                            if (newFamilyName != Literal("")):
                                g_new.add((newPerson, self.familyName,
                                           newFamilyName))
                            g_new.add((newPerson, self.perName, newPerName))

                        #check if the graph already contains this mention
                        if (newPerson, self.mentionedAtSite,
                                newWebpage) not in g:
                            newDataExists = True
                            g_new.add(
                                (newPerson, self.mentionedAtSite, newWebpage))
                        #add lemmas also
                        if (addLemmas):
                            for newLemma in lemmaList:
                                #check if the graph already contains this lemma
                                if (newPerson, self.lemma,
                                        Literal(newLemma)) not in g:
                                    newDataExists = True
                                    g_new.add((newPerson, self.lemma,
                                               Literal(newLemma)))
                    except:
                        comm.printException(comm.initRdfErrorsFilePath,
                                            "build_per_graph")
                        pass
            #print(str(newDataExists))
            #write rdf into file
            if (newDataExists):
                try:
                    gg = g + g_new
                    (gg).serialize(self.perRdf,
                                   format='pretty-xml',
                                   encoding='utf-8')
                except:
                    comm.printException(
                        comm.initRdfErrorsFilePath,
                        "RDF People Manager serialization error: ")
                    pass
        except:
            comm.printException(comm.initRdfErrorsFilePath,
                                "RDF People Manager (addTriples) error: ")
            pass
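addTriples collects new statements in a separate graph and rewrites the file as g + g_new only when at least one triple is not already present. A compact sketch of that add-if-absent pattern with a hypothetical namespace and person:

# Sketch of the add-if-absent pattern; the namespace, names and output file are hypothetical.
from rdflib import Graph, Literal, Namespace, RDF, URIRef

ex = Namespace("http://example.org/per/")
g = Graph()        # existing graph (the real code loads it from the PER rdf file)
g_new = Graph()    # new triples collected in this round

person = URIRef(ex + "mari_maasikas")
if (person, RDF.type, URIRef(ex + "Person")) not in g:      # skip people already in the graph
    g_new.add((person, RDF.type, URIRef(ex + "Person")))
    g_new.add((person, ex.perName, Literal("Mari Maasikas")))

if len(g_new):     # rewrite the file only when something new was found
    (g + g_new).serialize("PER.rdf", format="pretty-xml", encoding="utf-8")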
コード例 #53
0
        subprocess.Popen(
            ["python3", "amountOfTriples.py",
             json.dumps(filedic)])

        #end of statistics!
        comm.saveStatistics(mylargefile, "\n--------------------------- ")

        #delete RDF-files and downloaded excel-files in each worker
        postToWorker.deleteRDFsInWorkers(ipList)
        ###
        ###
        try:
            # Download json-objects from cloud storage
            downloadJsons.getJsonObjects(client)
        except:
            comm.printException(comm.pathToSaveDownloadErrors,
                                errString="downloadJsons")
            pass

        ###
        ###
        try:
            ###
            ### Download generated error-objects from cloud storage
            downloadErrors.getErrorObjects(client)
        except:
            comm.printException(comm.pathToSaveDownloadErrors,
                                errString="downloadErrors")
            pass

    else:
        print("no worker instances")
コード例 #54
0
ファイル: getEntities.py プロジェクト: peepkungas/EstNer
def getEntities(url,
                text,
                ontologyData,
                orgWords=[
                    'kogu', 'selts', 'ansambel', 'keskus', 'as', 'klubi',
                    'asutus', 'fond', 'cup'
                ],
                locWords=['vabarii', 'maakond']):
    ntwl = list()
    try:
        ner_tagged = tagger(analyzer(tokenizer(text)))
    except:
        comm.printException(
            comm.pathToSaveParsingErrors,
            "getEntities.py-def_getEntities_:_ner_tagged " + text[0:100] +
            " URL_" + url)
        pass
    try:
        ntwl = ner_tagged.named_entities
    except:
        comm.printException(
            comm.pathToSaveParsingErrors,
            "getEntities.py-def_getEntities_:_ntwl" + str(len(ntwl)) + " " +
            text[0:100] + " URL_" + url)
        pass
    try:
        if (len(ntwl) > 0):

            andmed = dict()
            #andmed[url]=dict()
            #print(text)
            #print(ner_tagged)
            #print(ntwl)
            for i in ntwl:
                label = i.label
                freqLemma = i.lemma.replace(';', '').replace(':', '').replace(
                    ',', '').replace('.', '').replace('?', '').replace(
                        '!', '').replace('"', '').replace("'", '').replace(
                            ' | ', '').replace('|', '').lower()

                #correct some ner labels
                for ow in orgWords:
                    if (ow.lower() in freqLemma.lower()):
                        label = "ORG"
                for lw in locWords:
                    if (lw.lower() in freqLemma.lower()):
                        label = "LOC"

                #process values by labels
                if label == "PER":
                    entitySet = set()
                    if (freqLemma != ""):
                        name = freqLemma.title()
                        names = name.split(' ')
                        gName = ""
                        fName = ""
                        try:
                            if len(names) > 1:
                                if len(names) > 2:
                                    gName = names[0] + " " + names[1]
                                    fName = names[2]
                                elif len(names) == 2:
                                    gName = names[0]
                                    fName = names[1]
                        except:
                            comm.printException(
                                comm.pathToSaveParsingErrors,
                                "getEntities.py-def_getEntities_gname-fname")
                            pass
                        entitySet.add(freqLemma)
                        #added temporarily to filter out duplicates; removed again below
                        entitySet.add(name)
                        entitySet.add(gName)
                        entitySet.add(fName)
                        wConcat = (' '.join(w.text for w in i.words)).replace(
                            ';', '').replace(':', '').replace(',', '').replace(
                                '.', '').replace('?', '').replace('!', '')
                        entitySet.add(wConcat)
                        lemmalist = list()
                        for w in i.words:
                            lemmalist.append(w.lemmas)
                        produkt = itertools.product(*lemmalist)
                        for j in produkt:
                            entitySet.add(" ".join(
                                str(u) for u in (list(j))
                                if ((u.lower() != name.lower()) & (u != "")
                                    & (u.title() in names))))
                        #now remove the duplicate values
                        if name in entitySet:
                            entitySet.remove(name)
                        if gName in entitySet:
                            entitySet.remove(gName)
                        if fName in entitySet:
                            entitySet.remove(fName)
                        if "" in entitySet:
                            entitySet.remove("")

                        andmed = {
                            url: {
                                "gName": gName,
                                "fName": fName,
                                "name": name,
                                "lemmaSet": entitySet
                            }
                        }
                        if not (ontologyData.sharedList_per._callmethod(
                                '__contains__', (andmed, ))):
                            ontologyData.sharedList_per._callmethod(
                                'append', (andmed, ))

                        if ((ontologyData.sharedList_per
                             )._callmethod('__len__') > comm.chunksize):
                            try:
                                chunkedList = ontologyData.sharedList_per[:]  #makes a copy, not a reference
                                del ontologyData.sharedList_per[:]
                                perManager = initRdf.PeopleManager(
                                    ontologyData)
                                perManager.addTriples(chunkedList)
                            except:
                                comm.printException(comm.initRdfErrorsFilePath,
                                                    "get_PER_entities")
                                pass
                else:
                    objName = freqLemma.title()
                    entitySet = set()
                    entitySet.add(freqLemma)
                    wConcat = (' '.join(w.text for w in i.words)).replace(
                        ';', '').replace(':', '').replace(',', '').replace(
                            '.', '').replace('?', '').replace('!', '')
                    entitySet.add(wConcat)
                    lemmalist = list()
                    for w in i.words:
                        lemmalist.append(w.lemmas)
                    produkt = itertools.product(*lemmalist)
                    for j in produkt:
                        entitySet.add(" ".join(
                            str(u) for u in (list(j))
                            if ((u.lower() != objName.lower()) & (u != ""))))
                    if "" in entitySet:
                        entitySet.remove("")

                    andmed = {
                        url: {
                            objName: entitySet
                        }
                    }

                    if (label == "ORG"):
                        if not (ontologyData.sharedList_org._callmethod(
                                '__contains__', (andmed, ))):
                            ontologyData.sharedList_org._callmethod(
                                'append', (andmed, ))
                    elif (label == "LOC"):
                        if not (ontologyData.sharedList_loc._callmethod(
                                '__contains__', (andmed, ))):
                            ontologyData.sharedList_loc._callmethod(
                                'append', (andmed, ))

                    if ((ontologyData.sharedList_org)._callmethod('__len__') >
                            comm.chunksize):
                        try:
                            chunkedList = ontologyData.sharedList_org[:]  #makes a copy, not a reference
                            del ontologyData.sharedList_org[:]
                            #tests
                            #jf = open("tEst.txt", 'a', encoding='utf-8')
                            #jf.write(str(len(chunkedList)) + "\n")
                            #jf.close()
                            orgManager = initRdf.OrganizationManager(
                                ontologyData)
                            orgManager.addTriples(chunkedList)
                        except:
                            comm.printException(comm.initRdfErrorsFilePath,
                                                "get_ORG_entities")
                            pass
                    if ((ontologyData.sharedList_loc)._callmethod('__len__') >
                            comm.chunksize):
                        try:
                            chunkedList = ontologyData.sharedList_loc[:]  #makes a copy, not a reference
                            del ontologyData.sharedList_loc[:]
                            locManager = initRdf.LocationManager(ontologyData)
                            locManager.addTriples(chunkedList)
                        except:
                            comm.printException(comm.initRdfErrorsFilePath,
                                                "get_LOC_entities")
                            pass
    except:
        comm.printException(comm.initRdfErrorsFilePath, "getEntities.py")
        pass
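Before the entities are grouped, the NER label is corrected with two word lists: a lemma containing one of the orgWords is re-labelled ORG, and one containing a locWord is re-labelled LOC (the LOC check runs last, so it wins when both match). A tiny sketch of just that rule, with shortened example word lists:

# Sketch of the label-correction rule; the word lists and lemma below are example values.
def correct_label(label, freq_lemma,
                  org_words=('selts', 'klubi', 'fond'),
                  loc_words=('vabarii', 'maakond')):
    for ow in org_words:
        if ow.lower() in freq_lemma.lower():
            label = "ORG"
    for lw in loc_words:
        if lw.lower() in freq_lemma.lower():
            label = "LOC"
    return label

print(correct_label("PER", "eesti vabariik"))  # -> "LOC"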
コード例 #55
0
def detectChanges(jsonFilePath, listOfUrls):
    global nrOfChanges
    global worker_counter
    #print(jsonFilePath)
    #print("--------------------")
    #print(str(len(listOfUrls)))
    #print("--------------------")
    jsonDict = dict()
    #json-files may be incorrectly formed,
    #in that case one cannot load it into dictionary
    isValidJsonFile=True
    try:
        #open json file
        #load file into dictionary-type:
        jsonDict = json.load(open(jsonFilePath))
    except:
        isValidJsonFile=False
        pass
    #print(str(isValidJsonFile))
    #get URL from the current dict
    if(isValidJsonFile):
        #hash is sha224
        for key in jsonDict.keys():
            #if key is currently not a base_url, it is filename
            #under filename-key, there can be among other metadata
            #one or more  content SHAs
            #URL to that file (document) in web
            if(key != 'base_url'):#key is sha of file's URL, elsewhere saved into variable localFilename 
                #structure:
                #sha of filename
                ###sha of file content
                ######metadata of file + (filename , sha(file content), 
                ###### human-readable file url (under key 'file_url'), accessed date) 
                ###sha of file another (updated) content
                ######metadata...
                fileSHAs = list(jsonDict[key].keys())#list of SHA's of file content at time of accessing this file
                arbitrFileSha = fileSHAs[0]#this is only for getting file URL
                fileUrl = jsonDict[key][arbitrFileSha]["file_url"]
                redirectedTo=0
                try:
                    redirectedTo = requests.get(fileUrl).url
                except:
                    comm.printException(comm.updateErrorsFilePath, errString="open_url")
                    continue #continue with next URL in loop
                if(redirectedTo!=0):
                    #print(str(redirectedTo))
                    #read the doc's content at current moment
                    try:
                        pageread = (requests.get(redirectedTo)).text
                    except:
                        comm.printException(comm.updateErrorsFilePath, errString="pageread1")
                        try:
                            pageread = ((requests.get(redirectedTo)).text.encode('utf-8').strip())
                        except Exception as e:
                            comm.printException(comm.updateErrorsFilePath, errString="pageread2")
                            print(e)
                            continue
                    #get hash of this doc
                    fileContentSha224 = (hashlib.sha224(pageread.encode('utf-8')).hexdigest())
                    #check if content is changed meanwhile
                    if(fileContentSha224 not in fileSHAs):#data has changed!!!
                        #collect number of changes
                        nrOfChanges += 1
                        #as the content of this doc has changed, send its URL to the worker
                        #for extracting entities
                        #fill the list of URLs
                        listOfUrls.append(fileUrl)
                        ''''''
                        postListSize = postToWorker.defPostListSize(worker_counter, ipList_tuple)
                        #send certain amount of URLs to each worker, then empty the list of URLS
                        if(len(listOfUrls) == postListSize):
                            #send list of urls to worker
                            worker_counter = postToWorker.detectConnection(ipList, worker_counter, listOfUrls)
                            #empty list of object names
                            #to prepare it for the next worker
                            del listOfUrls[:] 
                            #prepare next worker
                            worker_counter += 1
                            if (worker_counter > (len(ipList)-1)):#last worker in the workers list
                                #start over from first worker in the workers list
                                worker_counter = 0
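detectChanges decides whether a document changed by hashing the freshly fetched page text with sha224 and looking the digest up among the content hashes stored in the json metadata. A minimal sketch of that comparison with placeholder values:

# Sketch of the change check; the stored hash list and page text are placeholders.
import hashlib

stored_shas = ["<sha224 of an earlier version>"]  # keys under the sha(filename) entry
pageread = "<current page text>"                  # what requests.get(url).text would return

current_sha = hashlib.sha224(pageread.encode("utf-8")).hexdigest()
if current_sha not in stored_shas:
    print("content changed; queue this URL for a worker")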
コード例 #56
0
        try:
            p = subprocess.Popen(
                ["python2", comm.parentDir + "upload_logfile/getworkers.py"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            out, err = p.communicate()
            #print((out.decode()))
            WORKER_INSTANCES = json.loads(out.decode())
            #print(WORKER_INSTANCES)
            #tuple of ip list and list of tuple of ip and amount of cpus
            ipList_and_tuple = getListOfWorkerIPs(WORKER_INSTANCES)
            ipList = ipList_and_tuple[0]
            ipList_tuple = ipList_and_tuple[1]
        except Exception as e:
            comm.printException(comm.updateErrorsFilePath,
                                errString="subprocess_to_getworkers.py" +
                                str(e))
            pass
        #if there is at least 1 IP in the list
        lenIpList = len(ipList)

        if (lenIpList > 0):
            #the list of doc URLs to be sent to a worker
            listOfUrls = []
            #loop over all json-files
            for (dirpath, dirnames, filenames) in walk(jsonFiles):
                for fname in filenames:
                    #search for changed content by comparing the content hash
                    #saved in the json-file with the content of the web document
                    #at the current moment
                    #print(os.path.join(dirpath, fname))
コード例 #57
0
def detectChanges(jsonFilePath, listOfUrls):
    global nrOfChanges
    global worker_counter
    #print(jsonFilePath)
    #print("--------------------")
    #print(str(len(listOfUrls)))
    #print("--------------------")
    jsonDict = dict()
    #json-files may be incorrectly formed,
    #in that case one cannot load it into dictionary
    isValidJsonFile = True
    try:
        #open json file
        #load file into dictionary-type:
        jsonDict = json.load(open(jsonFilePath))
    except:
        isValidJsonFile = False
        pass
    #print(str(isValidJsonFile))
    #get URL from the current dict
    if (isValidJsonFile):
        #hash is sha224
        for key in jsonDict.keys():
            #if key is currently not a base_url, it is filename
            #under filename-key, there can be among other metadata
            #one or more  content SHAs
            #URL to that file (document) in web
            if (
                    key != 'base_url'
            ):  #key is sha of file's URL, elsewhere saved into variable localFilename
                #structure:
                #sha of filename
                ###sha of file content
                ######metadata of file + (filename , sha(file content),
                ###### human-readable file url (under key 'file_url'), accessed date)
                ###sha of file another (updated) content
                ######metadata...
                fileSHAs = list(
                    jsonDict[key].keys()
                )  #list of SHA's of file content at time of accessing this file
                arbitrFileSha = fileSHAs[0]  #this is only for getting file URL
                fileUrl = jsonDict[key][arbitrFileSha]["file_url"]
                redirectedTo = 0
                try:
                    redirectedTo = requests.get(fileUrl).url
                except:
                    comm.printException(comm.updateErrorsFilePath,
                                        errString="open_url")
                    continue  #continue with next URL in loop
                if (redirectedTo != 0):
                    #print(str(redirectedTo))
                    #read the doc's content at current moment
                    try:
                        pageread = (requests.get(redirectedTo)).text
                    except:
                        comm.printException(comm.updateErrorsFilePath,
                                            errString="pageread1")
                        try:
                            pageread = ((requests.get(redirectedTo)
                                         ).text.encode('utf-8').strip())
                        except Exception as e:
                            comm.printException(comm.updateErrorsFilePath,
                                                errString="pageread2")
                            print(e)
                            continue
                    #get hash of this doc
                    fileContentSha224 = (hashlib.sha224(
                        pageread.encode('utf-8')).hexdigest())
                    #check if content is changed meanwhile
                    if (fileContentSha224
                            not in fileSHAs):  #data has changed!!!
                        #collect number of changes
                        nrOfChanges += 1
                        #as the content of this doc has changed, send its URL to the worker
                        #for extracting entities
                        #fill the list of URLs
                        listOfUrls.append(fileUrl)
                        ''''''
                        postListSize = postToWorker.defPostListSize(
                            worker_counter, ipList_tuple)
                        #send certain amount of URLs to each worker, then empty the list of URLS
                        if (len(listOfUrls) == postListSize):
                            #send list of urls to worker
                            worker_counter = postToWorker.detectConnection(
                                ipList, worker_counter, listOfUrls)
                            #empty list of object names
                            #to prepare it for the next worker
                            del listOfUrls[:]
                            #prepare next worker
                            worker_counter += 1
                            if (worker_counter >
                                (len(ipList) -
                                 1)):  #last worker in the workers list
                                #start over from first worker in the workers list
                                worker_counter = 0
コード例 #58
0
def doDownloadJob(jsonToDict, itemname):
    global nrOfDownloads
    try:
        base_url = jsonToDict['base_url']  #becomes folderName
        for fname_key in jsonToDict.keys():
            #At first level, there are two sorts of keys in json-file:
            #1. base-url
            #2. sha(s) of filename(s), (sha of file full url, including "/"-slashes)
            ##As the file content may change over time, every sha(filename)-element contains
            ##1. sha(s) of a content(s)
            ###Every sha of a content contains
            ###1. metadata of a file/content
            if (fname_key != 'base_url'
                ):  #fname_key(sha of file url) becomes local filename
                #'jsonToDict[fname_key].keys()' is a list of sha(content) of a current sha(filename)
                #loop over every sha(content) of a sha(filename)
                #here, csha is the sha(filecontent)
                for csha in jsonToDict[fname_key].keys():
                    contentKeyExists = False
                    """check if metadata contains key 'Content-Type'"""
                    try:
                        if ('Content-Type' in jsonToDict[fname_key][csha]):
                            contentKeyExists = True
                    except:
                        contentKeyExists = False
                        pass
                    #Get the time the json-file was made
                    timeDir = jsonToDict[fname_key][csha]['timeDir']
                    #download only changes that are no older than
                    #the start date of the current process!
                    process_start_date = comm.makeDateObj(ajadir)
                    json_model_date = comm.makeDateObj(timeDir)
                    #continue only if the date in the model is the same as
                    #or newer than the process start date
                    if (contentKeyExists) & (json_model_date >=
                                             process_start_date):
                        #excel files were already downloaded earlier, so skip them here
                        if ("excel" not in jsonToDict[fname_key][csha]
                            ['Content-Type']):
                            #full URL of a file
                            file_url = jsonToDict[fname_key][csha]['file_url']
                            dirPath = comm.downloadsDir + base_url + "/"
                            try:
                                #create the folder for this 'date/base_url' if it does not exist
                                if (not os.path.isdir(dirPath)) & (
                                        not os.path.exists(dirPath)):
                                    os.makedirs(dirPath)
                                try:
                                    #download the file into that folder
                                    #fname_key is the sha(filename)
                                    #resulting path of a file will become 'date/base_url/sha(filename)'
                                    urr(file_url, dirPath + fname_key)
                                    nrOfDownloads += 1
                                except:
                                    comm.printException(
                                        comm.pathToSaveDownloadErrors,
                                        itemname)
                                    pass
                            except:
                                comm.printException(
                                    comm.pathToSaveDownloadErrors, itemname)
                                pass
    except:
        comm.printException(comm.pathToSaveDownloadErrors, itemname)
        pass
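doDownloadJob only fetches versions whose timeDir is on or after the date the current run started (ajadir). A small sketch of that date filter; comm.makeDateObj is not shown in the source, so datetime.strptime with an assumed date format stands in for it here:

# Sketch of the date filter; make_date_obj and the date format are assumptions.
from datetime import datetime

def make_date_obj(time_dir):
    return datetime.strptime(time_dir, "%Y-%m-%d")  # the real format used by comm.makeDateObj is not shown

process_start_date = make_date_obj("2016-01-01")    # ajadir: when the current run started
json_model_date = make_date_obj("2016-01-02")       # timeDir stored with this content version

if json_model_date >= process_start_date:
    print("this version is new enough; download it")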