Example #1
def readJson(jsonurl, readedPage, od, _encoding):
    '''#if httpResponse is filepath
    jsonfile = (httpResponse.read()).decode('utf-8')
    '''
    #if the HTTP response has already been read into a byte string
    try:
        if(_encoding != None):
            if("utf" in _encoding.lower()):
                _encoding = _encoding.upper()
            try:
                jsonfile = (readedPage.decode(_encoding)).strip()
            except:
                try:
                    jsonfile = (readedPage.decode(sys.stdout.encoding)).strip()
                except:
                    jsonfile = (readedPage.decode('latin-1')).strip() 
                    pass
        else:
            try:
                jsonfile = (readedPage.decode(sys.stdout.encoding)).strip()
            except:
                jsonfile = (readedPage.decode('latin-1')).strip()
                pass
        
        dictnry = json.loads(jsonfile)
        readDictValues(jsonurl, dictnry, set(), od)

    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_json.py " + str(_encoding) + " " + jsonurl)
        pass
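The nested try/except blocks above implement a decode-with-fallback strategy: try the declared encoding, then the console encoding, then latin-1. A minimal sketch of the same idea as a standalone helper (the helper name and signature are illustrative, not part of the original code):

import sys

def decode_with_fallback(raw_bytes, preferred_encoding=None):
    #try the candidate encodings in order of preference
    candidates = []
    if preferred_encoding:
        candidates.append(preferred_encoding)
    if sys.stdout.encoding:
        candidates.append(sys.stdout.encoding)
    for enc in candidates:
        try:
            return raw_bytes.decode(enc).strip()
        except (LookupError, UnicodeDecodeError):
            continue
    #latin-1 maps every byte to a character, so this final step always succeeds
    return raw_bytes.decode('latin-1').strip()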
Example #2
def readJson(jsonurl, readedPage, od, _encoding):
    '''#if httpResponse is filepath
    jsonfile = (httpResponse.read()).decode('utf-8')
    '''
    #if the HTTP response has already been read into a byte string
    try:
        if (_encoding != None):
            if ("utf" in _encoding.lower()):
                _encoding = _encoding.upper()
            try:
                jsonfile = (readedPage.decode(_encoding)).strip()
            except:
                try:
                    jsonfile = (readedPage.decode(sys.stdout.encoding)).strip()
                except:
                    jsonfile = (readedPage.decode('latin-1')).strip()
                    pass
        else:
            try:
                jsonfile = (readedPage.decode(sys.stdout.encoding)).strip()
            except:
                jsonfile = (readedPage.decode('latin-1')).strip()
                pass

        dictnry = json.loads(jsonfile)
        readDictValues(jsonurl, dictnry, set(), od)

    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_json.py " + str(_encoding) + " " + jsonurl)
        pass
Example #3
def readPlainText(htmlurl, plaintext, ontologyData, _encoding):
    try:
        try:
            punc = (plaintext.decode(_encoding)).strip()
        except:
            try:
                punc = (plaintext.decode(sys.stdout.encoding)).strip()
            except:
                try:
                    punc = (plaintext.decode('UTF-8')).strip()
                except:
                    try:
                        punc = (plaintext.decode('latin-1')).strip()
                    except:
                        try:
                            punc = (plaintext.decode('ISO-8859-1')).strip()
                        except:
                            try:
                                punc = (plaintext.decode()).strip()
                            except:
                                punc = plaintext
                                pass

        sentences = comm.replaceToPunkts(punc)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_plaintext.py " + str(_encoding) + " " + htmlurl)
Example #4
def readPlainText(htmlurl, plaintext, ontologyData, _encoding):
    try:
        try:
            punc = (plaintext.decode(_encoding)).strip()
        except:
            try:
                punc = (plaintext.decode(sys.stdout.encoding)).strip()
            except:
                try:
                    punc = (plaintext.decode('UTF-8')).strip()
                except:
                    try:
                        punc = (plaintext.decode('latin-1')).strip() 
                    except:
                        try:
                            punc = (plaintext.decode('ISO-8859-1')).strip()  
                        except:
                            try:
                                punc = (plaintext.decode()).strip()  
                            except:
                                punc = plaintext
                                pass 
                        
        sentences = comm.replaceToPunkts(punc)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_plaintext.py " + str(_encoding) + " " + htmlurl)
Example #5
def sendFileToParser(contentType, baseUrl, redirectedTo, od, _encoding,
                     localFilename, pageread):
    contentType = contentType.lower()
    #Excel and PDF documents can only be parsed after downloading them to a local file
    if (("excel" in contentType) or ("pdf" in contentType)):
        try:
            dirToSaveDownloads = comm.downloadsDir + baseUrl
            if not os.path.isdir(dirToSaveDownloads):
                os.makedirs(dirToSaveDownloads)
            fileparser.spreadURLsByContentType(
                redirectedTo,
                None,
                contentType,
                od,
                _encoding,
                filePath=(dirToSaveDownloads + "/" + localFilename))
        except:
            comm.printException(
                comm.pathToSaveProgrammingErrors,
                "create_dir_for_downloads_and_send_file_to_parser_" +
                str(baseUrl))
            pass
    else:
        try:
            fileparser.spreadURLsByContentType(redirectedTo, pageread,
                                               contentType, od, _encoding)
        except:
            comm.printException(comm.pathToSaveProgrammingErrors,
                                "send_file_to_parser")
            pass
Example #6
def readExcel(filePath, url, ontologyData):
    try:
        urr(url, filePath)
        try:
            workbook = xlrd.open_workbook(filePath)
            worksheets = workbook.sheet_names()
            for worksheet_name in worksheets:
                worksheet = workbook.sheet_by_name(worksheet_name)
                num_rows = worksheet.nrows - 1
                num_cells = worksheet.ncols - 1
                curr_row = -1
                while curr_row < num_rows:
                    curr_row += 1
                    #row = worksheet.row(curr_row)
                    #print ('Row:', curr_row)
                    curr_cell = -1
                    while curr_cell < num_cells:
                        curr_cell += 1
                        # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                        cell_type = worksheet.cell_type(curr_row, curr_cell)
                        cell_value = worksheet.cell_value(curr_row, curr_cell)
                        if (cell_type == 1):
                            sentences = comm.replaceToPunkts(cell_value)
                            for sentence in sentences:
                                getEntities.getEntities(url, sentence, ontologyData)
        
        except:
            comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
            pass
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
        pass
Example #7
def insertValuesToDict(dictnry, localFilename, page_redirected, page_info, page_sha224, page_status):
    try:
        dictnry[localFilename] = dict()
        dictnry[localFilename][page_sha224] = page_info
        dictnry[localFilename][page_sha224]["localFilename"] = localFilename
        dictnry[localFilename][page_sha224]["file_url"] = page_redirected
        dictnry[localFilename][page_sha224]["sha224"] = page_sha224
        dictnry[localFilename][page_sha224]["status"] = page_status
        dictnry[localFilename][page_sha224]["timeDir"] = comm.timeDir
        return dictnry
    except:
        comm.printException(comm.pathToSaveJsonErrors, "insertValuesToDict")
        pass
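A small usage sketch to show the shape of the metadata dictionary that insertValuesToDict builds; all argument values below are made up for illustration:

#hypothetical call, only to illustrate the resulting structure
meta = insertValuesToDict(
    {"base_url": "example.org"},      #dictnry, as prepared in saveMetadata
    "3f2a",                           #localFilename: hash of the document URL
    "http://example.org/page",        #page_redirected
    {"content-type": "text/html"},    #page_info: the response headers
    "9c56",                           #page_sha224: SHA-224 of the page content
    200)                              #page_status
#meta["3f2a"]["9c56"] now holds the response headers plus the added
#"localFilename", "file_url", "sha224", "status" and "timeDir" entries.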
Example #8
def readPdf(filePath, url, od):
    urldownl(url, filePath)
    try:
        #keep the file handle open while the pages are read
        with open(filePath, "rb") as pdfFile:
            pdf = PdfFileReader(pdfFile)
            pdf.strict = True
            for page in pdf.pages:
                text = page.extractText()
                sentences = comm.replaceToPunkts(text)
                for sentence in sentences:
                    getEntities.getEntities(url, sentence, od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
        pass
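The readPdf examples use the legacy PyPDF2 names PdfFileReader and extractText. A rough equivalent with the newer pypdf package, shown only as a sketch under the assumption that pypdf is installed and that urldownl, comm and getEntities behave as in the surrounding examples:

from pypdf import PdfReader  #maintained successor of PyPDF2

def readPdfWithPypdf(filePath, url, od):
    urldownl(url, filePath)  #download the PDF to filePath, as in the original
    try:
        reader = PdfReader(filePath)
        for page in reader.pages:
            text = page.extract_text() or ""
            for sentence in comm.replaceToPunkts(text):
                getEntities.getEntities(url, sentence, od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)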
Example #9
def readPdf(filePath, url, od):
    urldownl(url, filePath)
    try:
        #keep the file handle open while the pages are read
        with open(filePath, "rb") as pdfFile:
            pdf = PdfFileReader(pdfFile)
            pdf.strict = True
            for page in pdf.pages:
                text = page.extractText()
                sentences = comm.replaceToPunkts(text)
                for sentence in sentences:
                    getEntities.getEntities(url, sentence, od)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
        pass
Example #10
def sendFileToParser(contentType, baseUrl, redirectedTo, od, _encoding, localFilename, pageread):
    contentType = contentType.lower()
    #Excel and PDF documents can only be parsed after downloading them to a local file
    if(("excel" in contentType) or ("pdf" in contentType)):
        try:
            dirToSaveDownloads = comm.downloadsDir + baseUrl
            if not os.path.isdir(dirToSaveDownloads):
                os.makedirs(dirToSaveDownloads)
            fileparser.spreadURLsByContentType(redirectedTo, None, contentType, od, _encoding, filePath = (dirToSaveDownloads + "/" + localFilename))
        except:
            comm.printException(comm.pathToSaveProgrammingErrors, "create_dir_for_downloads_and_send_file_to_parser_" + str(baseUrl))
            pass
    else:
        try:
            fileparser.spreadURLsByContentType(redirectedTo, pageread, contentType, od, _encoding)
        except:
            comm.printException(comm.pathToSaveProgrammingErrors, "send_file_to_parser")
            pass
Example #11
def addTriples(self, chunkedList, addLemmas=True):
    try:
        newDataExists = False
        g = self.getPerRdfGraph()
        g_new = Graph()
        #bind the specific namespace prefixes
        self.bindNamespaces(g)
        for andmed in chunkedList:
            for webpage in andmed:
                gName = andmed[webpage]["gName"]
                fName = andmed[webpage]["fName"]
                name = andmed[webpage]["name"]
                lemmaList = andmed[webpage]["lemmaSet"]
                #print (lemmaList)
                try:
                    #make triples
                    newPerson = URIRef(self.perStr + name.replace(">", "").replace("<", "").replace("|", "").replace(" ", "_").lower())
                    newGivenName = Literal(gName)
                    newFamilyName = Literal(fName)
                    newPerName = Literal(name)
                    newWebpage = URIRef(webpage)

                    #add triples
                    #check whether the graph already contains this person
                    if (newPerson, RDF.type, URIRef(self.person)) not in g:
                        newDataExists = True
                        g_new.add((newPerson, RDF.type, URIRef(self.person)))
                        if (newGivenName != Literal("")):
                            g_new.add((newPerson, self.givenName, newGivenName))
                        if (newFamilyName != Literal("")):
                            g_new.add((newPerson, self.familyName, newFamilyName))
                        g_new.add((newPerson, self.perName, newPerName))

                    #check whether the graph already links this person to the web page
                    if (newPerson, self.mentionedAtSite, newWebpage) not in g:
                        newDataExists = True
                        g_new.add((newPerson, self.mentionedAtSite, newWebpage))
                    #add lemmas also
                    if (addLemmas):
                        for newLemma in lemmaList:
                            #check whether the graph already contains this lemma
                            if (newPerson, self.lemma, Literal(newLemma)) not in g:
                                newDataExists = True
                                g_new.add((newPerson, self.lemma, Literal(newLemma)))
                except:
                    comm.printException(comm.initRdfErrorsFilePath, "build_per_graph")
                    pass
        #print(str(newDataExists))
        #write the RDF graph into a file
        if (newDataExists):
            try:
                gg = g + g_new
                gg.serialize(self.perRdf, format='pretty-xml', encoding='utf-8')
            except:
                comm.printException(comm.initRdfErrorsFilePath, "RDF People Manager serialization error: ")
                pass
    except:
        comm.printException(comm.initRdfErrorsFilePath, "RDF People Manager (addTriples) error: ")
        pass
Example #12
def readHtmlPage(htmlurl, readedPage, ontologyData, _encoding):
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        if (root is not None):
            for element in root.iter("head"):
                element.drop_tree()
            for element in root.iter("script"):
                element.drop_tree()
            for element in root.iter("style"):
                element.drop_tree()
            for element in root.iter("noscript"):
                element.drop_tree()
            for element in root.iter("input"):
                element.drop_tree()
            for element in root.iter("form"):
                element.drop_tree()
            for element in root.iter("title"):
                element.drop_tree()
            for element in root.iter("img"):
                element.drop_tree()

            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except:
                    pass
            if (len(sentences) > 0):
                lsent = list(sentences)
                for lau in lsent:
                    if (lau != ""):
                        laused = comm.replaceToPunkts(lau)
                        for s6ne in laused:
                            getEntities.getEntities(htmlurl, s6ne.strip(),
                                                    ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_html.py " + str(_encoding) + " " + htmlurl)
        pass
Example #13
def readHtmlPage(htmlurl, readedPage, ontologyData, _encoding):
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        if (root is not None):
            for element in root.iter("head"):
                element.drop_tree()
            for element in root.iter("script"):
                element.drop_tree()
            for element in root.iter("style"):
                element.drop_tree()
            for element in root.iter("noscript"):
                element.drop_tree()
            for element in root.iter("input"):
                element.drop_tree()
            for element in root.iter("form"):
                element.drop_tree()
            for element in root.iter("title"):
                element.drop_tree()
            for element in root.iter("img"):
                element.drop_tree()
                
            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except:
                    pass
            if(len(sentences) > 0): 
                lsent = list(sentences)
                for lau in lsent:
                    if(lau != ""):
                        laused = comm.replaceToPunkts(lau)
                        for s6ne in laused:
                            getEntities.getEntities(htmlurl, s6ne.strip(), ontologyData)
    except:
        comm.printException(comm.pathToSaveParsingErrors, "read_html.py " + str(_encoding) + " " + htmlurl)
        pass
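The two readHtmlPage examples strip non-content elements one tag at a time with drop_tree(). lxml also ships a Cleaner utility that covers much of the same ground; a hedged alternative sketch (assuming lxml.html.clean is importable, and that parse is the same lxml.html.parse used above):

from lxml.html.clean import Cleaner  #in recent lxml releases this lives in the separate lxml_html_clean package

#one Cleaner instance can replace the chain of drop_tree() loops above
cleaner = Cleaner(scripts=True, javascript=True, style=True, forms=True,
                  page_structure=False,
                  kill_tags=['head', 'noscript', 'title', 'img', 'input'])

def extractBodyText(htmlurl):
    root = parse(htmlurl).getroot()
    if root is None:
        return []
    cleaned = cleaner.clean_html(root)
    return [body.text_content() for body in cleaned.iter("body")]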
Example #14
def addTriples(self, chunkedList, addLemmas=True):
    try:
        newDataExists = False
        g = self.getLocRdfGraph()
        g_new = Graph()
        #bind the specific namespace prefixes
        self.bindNamespaces(g)

        for andmed in chunkedList:
            for webpage in andmed:
                for objName in andmed[webpage]:
                    lemmaList = andmed[webpage][objName]
                    #print (lemmaList)
                    try:
                        #make triples
                        newLocation = URIRef(self.locStr + objName.replace(">", "").replace("<", "").replace("|", "").replace(" ", "_").lower())
                        newLocationName = Literal(objName)
                        newWebpage = URIRef(webpage)

                        #add triples
                        #check whether the graph already contains this location
                        if (newLocation, RDF.type, URIRef(self.location)) not in g:
                            newDataExists = True
                            g_new.add((newLocation, RDF.type, URIRef(self.location)))
                            g_new.add((newLocation, self.locationName, newLocationName))

                        #g_new.add((newLocation, od.mentionedAtSite, newWebpage))
                        #check whether the graph already links this location to the web page
                        if (newLocation, self.mentionedAtSite, newWebpage) not in g:
                            newDataExists = True
                            g_new.add((newLocation, self.mentionedAtSite, newWebpage))
                        #add lemmas also
                        if (addLemmas):
                            for newLemma in lemmaList:
                                #check whether the graph already contains this lemma
                                if (newLocation, self.lemma, Literal(newLemma)) not in g:
                                    newDataExists = True
                                    g_new.add((newLocation, self.lemma, Literal(newLemma)))
                    except:
                        comm.printException(comm.initRdfErrorsFilePath, "build_loc_graph")
                        pass
        #write the RDF graph into a file
        if (newDataExists):
            try:
                gg = g + g_new
                gg.serialize(self.locRdf, format='pretty-xml', encoding='utf-8')
            except:
                comm.printException(comm.initRdfErrorsFilePath, "RDF Location Manager serialization error: ")
                pass
    except:
        comm.printException(comm.initRdfErrorsFilePath, "RDF Location Manager (addTriples) error: ")
        pass
Example #15
def saveMetadata(url, od):
    
    #save the result of trying to open the page in the variable 'canOpen'
    canOpen = True
    try:
        #try to open document at URL
        redirectedTo = requests.get(url).url
    except ConnectionError:
        #it was not possible to open this web document
        canOpen = False
        #save exception (error of getting web document)
        errStr = (url + " Cannot_open_web-source \n" )
        comm.printException(comm.pathToConnectionErrors, errStr)
        #continue without terminating a program
        pass
    except:
        #it was not possible to open this web document
        canOpen = False
        #save exception (error of getting web document)
        errStr = (url + " Cannot_open_web-source \n" )
        comm.printException(comm.pathToSaveJsonErrors, errStr)
        #continue without terminating a program
        pass
    #continue only if 'canOpen' is still true
    if canOpen is True:
        #continue only if url is valid
        isValidUrl = valide.isNeededUrl(redirectedTo)
        if(isValidUrl):
            #print("can open: " + str(canOpen))
            if not os.path.isdir(comm.jsonsDir):
                os.makedirs(comm.jsonsDir)
            try:
                #from here on, use only the URL to which the request was redirected (if it was)
                page = requests.get(redirectedTo)
                statusCode = page.status_code
                #textual content of a doc
                pageread = page.text
                #get doc's metadata
                pageInfo = dict(page.headers)
                #generate filename for local storage:
                #it will be the hash of doc URL
                localFilename = comm.getUrlSHA(redirectedTo)
                #important metadata: content type
                contentType = page.headers['content-type']
                isValid_contentType = valide.isValideType(contentType)
                #base_url denotes the host name;
                #all documents from the same host are saved into the same json-file,
                #and base_url also becomes the json-file name (pathToSaveMetadata)
                baseUrl = None
                if(isValid_contentType):
                    baseUrl = (urlparse(redirectedTo)).netloc
                    
                if(baseUrl is not None):
                    #generate a hash of the doc's content.
                    #this hash is used later to detect whether the doc's content has changed.
                    #this change-detection happens in two cases:
                    #1. the monthly update
                    #2. the current method, when it turns out that this URL has been processed earlier
                    sha224_ = (hashlib.sha224(pageread.encode('utf-8')).hexdigest())
                    #important data for parsers: encoding
                    _encoding = page.encoding
                    #_encoding = comm.getDocumentEncoding(contentType, pageread)
                    #print("-----------------------------------------------------")
                    
                    #exclude doc types in which no textual content can be found: e.g. images, videos
                    isDesiredType = comm.isDesiredContent(contentType, od)
                    #continue only with desired types
                    if(isDesiredType):
                        #jsonsDir is actually a so called 'bucket' name in Google Cloud Storage
                        jsonsDir = comm.jsonsDir
                        print(jsonsDir)
                        #jsonsFile becomes a so called 'object' inside a bucket
                        #object's name is URL's host name and extension is '.json'
                        jsonsFile = baseUrl + ".json"
                        #build dictionary of address of object of this meta data
                        jsonsPath = dict()
                        jsonsPath["object"] = jsonsFile#'hostname.json'#
                        jsonsPath["bucket"] = jsonsDir#e.g. 'datadownload_json'#
                        pathToSaveMetadata_ = json.dumps(jsonsPath)
                        #save meta data into dictionary structure
                        infoDict_tmp = dict()
                        infoDict_tmp["base_url"] = baseUrl
                        infoDict = insertValuesToDict(infoDict_tmp, localFilename, redirectedTo, pageInfo, sha224_, statusCode )
                        #convert dictionary into json-string
                        jsondata = json.dumps(infoDict, indent=4)
                        #dict for sending collected data to 'updateObj.py'
                        insertJson = dict()
                        insertJson["jsondata"] = jsondata
                        insertJson["localFilename"] = localFilename
                        insertJson["redirectedTo"] = redirectedTo
                        insertJson["pageInfo"] = pageInfo
                        insertJson["sha224_"] = sha224_
                        insertJson["statusCode"] = statusCode
                        insertJson["timeDir"] = comm.timeDir
                        insertJson["address"] = pathToSaveMetadata_
            
                        #variable 'someNewData' records whether this doc at this URL
                        #has been processed earlier:
                        #1. if yes and the doc's content has changed, 'someNewData' becomes True; otherwise it stays False
                        #2. if no, 'someNewData' becomes True
                        someNewData = False
                        #string for saving a unique error message
                        errr=""
                        try:
                            #convert dictionary into json-string for sending argument to 'updateObj.py'
                            jd = json.dumps(insertJson)
                            #get info back about whether here is some new data
                            #'p' is a returned boolean value of 'someNewData'
                            #communication with google-python-api-client is done using older version, python2.7
                            p = subprocess.Popen(["python2", "storage/updateObj.py", jd], stdout=subprocess.PIPE)
                            out, err = p.communicate()
                            someNewData  = out.decode()
                            errr = str(err).lower()
                            print("\nsomeNewData " + str(someNewData))
                            print("\nerrr " + str(errr))
                        except:
                            errstr = errr if ((errr != "") & (errr != "none")) else "storage-updateObj.py-ERROR"
                            comm.printException(comm.pathToSaveJsonErrors, errstr)
                            pass
                        
                        #continue with parsing of doc only when new data was detected
                        if someNewData: 
                            sendFileToParser(contentType, baseUrl, redirectedTo, od, _encoding, localFilename, pageread)
            #record errors                    
            except urr.HTTPError as e:
                errStr = (redirectedTo + " HTTPError " + str(e.code) + " " + str(e.reason) + " \n" )
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except urr.URLError as e:
                errStr = (redirectedTo + " URLError " + str(e.reason) + " \n" )
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except IOError as e:
                errStr = (redirectedTo + " " + str("I/O_error({0}):_{1}".format(e.errno, e.strerror)) + "\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except ValueError:
                errStr = (redirectedTo + " ValueError_Could_not_convert_data_to_an_integer.\n") 
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except TypeError:
                errStr = (redirectedTo + " TypeError\n")  
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except:
                errStr = (redirectedTo + " Unexpected_error:_" + (str(sys.exc_info()[0])) + "\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
Example #16
def saveMetadata(url, od):

    #save the result of trying to open the page in the variable 'canOpen'
    canOpen = True
    try:
        #try to open document at URL
        redirectedTo = requests.get(url).url
    except ConnectionError:
        #it was not possible to open this web document
        canOpen = False
        #save exception (error of getting web document)
        errStr = (url + " Cannot_open_web-source \n")
        comm.printException(comm.pathToConnectionErrors, errStr)
        #continue without terminating a program
        pass
    except:
        #it was not possible to open this web document
        canOpen = False
        #save exception (error of getting web document)
        errStr = (url + " Cannot_open_web-source \n")
        comm.printException(comm.pathToSaveJsonErrors, errStr)
        #continue without terminating a program
        pass
    #continue only if 'canOpen' is still true
    if canOpen is True:
        #continue only if url is valid
        isValidUrl = valide.isNeededUrl(redirectedTo)
        if (isValidUrl):
            #print("can open: " + str(canOpen))
            if not os.path.isdir(comm.jsonsDir):
                os.makedirs(comm.jsonsDir)
            try:
                #from here on, use only the URL to which the request was redirected (if it was)
                page = requests.get(redirectedTo)
                statusCode = page.status_code
                #textual content of a doc
                pageread = page.text
                #get doc's metadata
                pageInfo = dict(page.headers)
                #generate filename for local storage:
                #it will be the hash of doc URL
                localFilename = comm.getUrlSHA(redirectedTo)
                #important metadata: content type
                contentType = page.headers['content-type']
                isValid_contentType = valide.isValideType(contentType)
                #base_url denotes the host name;
                #all documents from the same host are saved into the same json-file,
                #and base_url also becomes the json-file name (pathToSaveMetadata)
                baseUrl = None
                if (isValid_contentType):
                    baseUrl = (urlparse(redirectedTo)).netloc

                if (baseUrl is not None):
                    #generate a hash of the doc's content.
                    #this hash is used later to detect whether the doc's content has changed.
                    #this change-detection happens in two cases:
                    #1. the monthly update
                    #2. the current method, when it turns out that this URL has been processed earlier
                    sha224_ = (hashlib.sha224(
                        pageread.encode('utf-8')).hexdigest())
                    #important data for parsers: encoding
                    _encoding = page.encoding
                    #_encoding = comm.getDocumentEncoding(contentType, pageread)
                    #print("-----------------------------------------------------")

                    #exclude doc types in which no textual content can be found: e.g. images, videos
                    isDesiredType = comm.isDesiredContent(contentType, od)
                    #continue only with desired types
                    if (isDesiredType):
                        #jsonsDir is actually a so called 'bucket' name in Google Cloud Storage
                        jsonsDir = comm.jsonsDir
                        print(jsonsDir)
                        #jsonsFile becomes a so called 'object' inside a bucket
                        #object's name is URL's host name and extension is '.json'
                        jsonsFile = baseUrl + ".json"
                        #build dictionary of address of object of this meta data
                        jsonsPath = dict()
                        jsonsPath["object"] = jsonsFile  #'hostname.json'#
                        jsonsPath[
                            "bucket"] = jsonsDir  #e.g. 'datadownload_json'#
                        pathToSaveMetadata_ = json.dumps(jsonsPath)
                        #save meta data into dictionary structure
                        infoDict_tmp = dict()
                        infoDict_tmp["base_url"] = baseUrl
                        infoDict = insertValuesToDict(
                            infoDict_tmp, localFilename, redirectedTo,
                            pageInfo, sha224_, statusCode)
                        #convert dictionary into json-string
                        jsondata = json.dumps(infoDict, indent=4)
                        #dict for sending collected data to 'updateObj.py'
                        insertJson = dict()
                        insertJson["jsondata"] = jsondata
                        insertJson["localFilename"] = localFilename
                        insertJson["redirectedTo"] = redirectedTo
                        insertJson["pageInfo"] = pageInfo
                        insertJson["sha224_"] = sha224_
                        insertJson["statusCode"] = statusCode
                        insertJson["timeDir"] = comm.timeDir
                        insertJson["address"] = pathToSaveMetadata_

                        #variable 'someNewData' records whether this doc at this URL
                        #has been processed earlier:
                        #1. if yes and the doc's content has changed, 'someNewData' becomes True; otherwise it stays False
                        #2. if no, 'someNewData' becomes True
                        someNewData = False
                        #string for saving a unique error message
                        errr = ""
                        try:
                            #convert dictionary into json-string for sending argument to 'updateObj.py'
                            jd = json.dumps(insertJson)
                            #get info back about whether here is some new data
                            #'p' is a returned boolean value of 'someNewData'
                            #communication with google-python-api-client is done using older version, python2.7
                            p = subprocess.Popen(
                                ["python2", "storage/updateObj.py", jd],
                                stdout=subprocess.PIPE)
                            out, err = p.communicate()
                            someNewData = out.decode()
                            errr = str(err).lower()
                            print("\nsomeNewData " + str(someNewData))
                            print("\nerrr " + str(errr))
                        except:
                            errstr = errr if (
                                (errr != "") & (errr != "none")
                            ) else "storage-updateObj.py-ERROR"
                            comm.printException(comm.pathToSaveJsonErrors,
                                                errstr)
                            pass

                        #continue with parsing of doc only when new data was detected
                        if someNewData:
                            sendFileToParser(contentType, baseUrl,
                                             redirectedTo, od, _encoding,
                                             localFilename, pageread)
            #record errors
            except urr.HTTPError as e:
                errStr = (redirectedTo + " HTTPError " + str(e.code) + " " +
                          str(e.reason) + " \n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except urr.URLError as e:
                errStr = (redirectedTo + " URLError " + str(e.reason) + " \n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except IOError as e:
                errStr = (redirectedTo + " " + str(
                    "I/O_error({0}):_{1}".format(e.errno, e.strerror)) + "\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except ValueError:
                errStr = (
                    redirectedTo +
                    " ValueError_Could_not_convert_data_to_an_integer.\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except TypeError:
                errStr = (redirectedTo + " TypeError\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
            except:
                errStr = (redirectedTo + " Unexpected_error:_" + (str(
                    sys.exc_info()[0])) + "\n")
                comm.printException(comm.pathToSaveJsonErrors, errStr)
                pass
Example #17
    def addTriples(self, chunkedList, addLemmas=True):
        try:
            newDataExists = False
            g = self.getLocRdfGraph()
            g_new = Graph()
            #define specific namespace prefix
            self.bindNamespaces(g)

            for andmed in chunkedList:
                for webpage in andmed:
                    for objName in andmed[webpage]:
                        lemmaList = andmed[webpage][objName]
                        #print (lemmaList)
                        try:
                            #make triples
                            newLocation = URIRef(
                                self.locStr +
                                objName.replace(">", "").replace("<", "").
                                replace("|", "").replace(" ", "_").lower())
                            newLocationName = Literal(objName)
                            newWebpage = URIRef(webpage)

                            #add triples
                            #check whether the graph already contains this location
                            if (newLocation, RDF.type,
                                    URIRef(self.location)) not in g:
                                newDataExists = True
                                g_new.add((newLocation, RDF.type,
                                           URIRef(self.location)))
                                g_new.add((newLocation, self.locationName,
                                           newLocationName))

                            #g_new .add( (newLocation, od.mentionedAtSite, newWebpage) )
                            #check whether the graph already links this location to the web page
                            if (newLocation, self.mentionedAtSite,
                                    newWebpage) not in g:
                                newDataExists = True
                                g_new.add((newLocation, self.mentionedAtSite,
                                           newWebpage))
                            #add lemmas also
                            if (addLemmas):
                                for newLemma in lemmaList:
                                    #check whether the graph already contains this lemma
                                    if (newLocation, self.lemma,
                                            Literal(newLemma)) not in g:
                                        newDataExists = True
                                        g_new.add((newLocation, self.lemma,
                                                   Literal(newLemma)))
                        except:
                            comm.printException(comm.initRdfErrorsFilePath,
                                                "build_loc_graph")
                            pass
            #write rdf into file
            if (newDataExists):
                try:
                    gg = g + g_new
                    (gg).serialize(self.locRdf,
                                   format='pretty-xml',
                                   encoding='utf-8')
                except:
                    comm.printException(
                        comm.initRdfErrorsFilePath,
                        "RDF Location Manager serialization error: ")
                    pass
        except:
            comm.printException(comm.initRdfErrorsFilePath,
                                "RDF Location Manager (addTriples) error: ")
            pass
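The addTriples variants above all follow the same rdflib pattern: test whether a triple is already in the loaded graph, collect only the new triples in g_new, and serialize g + g_new when anything changed. A minimal self-contained sketch of that pattern (the namespace, resource names and file name are placeholders):

from rdflib import Graph, Namespace, RDF, URIRef

EX = Namespace("http://example.org/")  #placeholder namespace

def addIfNew(g, g_new, triple):
    #add the triple to g_new only when g does not already contain it
    if triple not in g:
        g_new.add(triple)
        return True
    return False

g = Graph()      #previously serialized graph, normally loaded from disk
g_new = Graph()  #holds only the newly discovered triples
changed = addIfNew(g, g_new, (EX.alice, RDF.type, EX.Person))
changed = addIfNew(g, g_new, (EX.alice, EX.mentionedAtSite, URIRef("http://example.org/page"))) or changed
if changed:
    (g + g_new).serialize("people.rdf", format='pretty-xml', encoding='utf-8')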
Example #18
def spreadURLsByContentType(url, httpResponse, tyyp, od, _encoding, filePath = None):
    doctext = httpResponse
    '''#parse excel file'''
    if("excel" in tyyp):
        try:
            '''#parse web page excel'''
            read_eksel.readExcel(filePath, url, od)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_excel")
            pass
    
    elif("pdf" in tyyp):
        try:
            '''#parse pdf'''
            read_pdf.readPdf(filePath, url, od)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_pdf")
            pass
            
    elif("xml" in tyyp):
        try:
            '''#parse web page xml'''
            doctext = detectEncoding(_encoding, httpResponse)
            read_xml.readXml(url, doctext, od)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_xml")
            pass
    elif("html" in tyyp) :
        try:
            '''#parse web page html/txt'''
            doctext = detectEncoding(_encoding, httpResponse)
            read_html.readHtmlPage(url, doctext, od, _encoding)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_html")
            pass
    elif("json" in tyyp):
        try:
            '''#parse json app/json'''
            doctext = detectEncoding(_encoding, httpResponse)
            read_json.readJson(url, doctext, od, _encoding)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_json")
            pass
    elif("plain" in tyyp) or ("text" in tyyp):
        try:
            doctext = detectEncoding(_encoding, httpResponse)
            '''#assumes incoming is plain text try to parse text lines'''
            read_plaintext.readPlainText(url, doctext, od, _encoding)
        except:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_plainText")
            pass
    else:
        comm.printException(comm.pathToSaveParsingErrors, "The_parser_for_the_type_" + tyyp + "_is_not_implemented\n")
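spreadURLsByContentType is the dispatch point: content types containing 'excel' or 'pdf' arrive with a local filePath (see sendFileToParser above), everything else arrives as the raw HTTP response body. A hedged usage sketch, where the URLs and content types are made up and od and pageread stand for the ontology data object and the downloaded page text from the surrounding examples:

#hypothetical calls, mirroring how sendFileToParser invokes the dispatcher
spreadURLsByContentType("http://example.org/page", pageread,
                        "text/html; charset=utf-8", od, "utf-8")
spreadURLsByContentType("http://example.org/report.pdf", None,
                        "application/pdf", od, None,
                        filePath="/tmp/downloads/example.org/report.pdf")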
Example #19
def getEntities(
    url,
    text,
    ontologyData,
    orgWords=[
        "kogu",
        "selts",
        "ansambel",
        "keskus",
        "ühendus",
        "ühing",
        "mtü",
        "oü",
        "as",
        "klubi",
        "asutus",
        "keskus",
        "fond",
        "cup",
    ],
    locWords=["vabarii", "maakond"],
):
    # print("GETENTITIES ", url)
    # printIncomingText(text)
    if "^" not in text:  # invalid for RDFlibURI
        ntwl = list()
        ner_tagged = None
        try:
            ner_tagged = tagger(analyzer(tokenizer(text)))
        except:
            comm.printException(
                comm.initRdfErrorsFilePath, "\n" + url + "\n_getEntities.py-def_getEntities_:_ner_tagged " + str(text)
            )
            pass
        if ner_tagged is not None:
            try:
                ntwl = ner_tagged.named_entities
            except:
                comm.printException(
                    comm.initRdfErrorsFilePath,
                    "\n" + url + "\n_getEntities.py-def_getEntities_:_ntwl" + str(len(ntwl)) + " " + str(text),
                )
                pass
        try:
            if len(ntwl) > 0:

                andmed = dict()
                # get label for entity
                for i in ntwl:
                    label = i.label

                    freqLemma = comm.replaceWith(" ", i.lemma)  # replace some chars with space

                    frlower = freqLemma.lower()
                    # correct some ner labels
                    for ow in orgWords:
                        if ow.lower() in frlower:
                            label = "ORG"
                    for lw in locWords:
                        if lw.lower() in frlower:
                            label = "LOC"

                    # process values by labels
                    if label == "PER":
                        entitySet = set()
                        if freqLemma != "":
                            name = freqLemma.title()
                            names = name.split(" ")
                            gName = ""
                            fName = ""
                            try:
                                if len(names) > 1:
                                    if len(names) > 2:  # more than 1 given name, assuming 2 of them
                                        gName = names[0] + " " + names[1]
                                        fName = names[2]  # family name
                                    elif len(names) == 2:
                                        gName = names[0]  # one given name
                                        fName = names[1]  # family name
                            except:
                                comm.printException(
                                    comm.initRdfErrorsFilePath,
                                    "\n" + url + "\n_getEntities.py-def_getEntities_gname-fname",
                                )
                                pass
                            entitySet.add(freqLemma)
                            # added temporarily; removed further down to avoid duplicate values
                            entitySet.add(name)
                            entitySet.add(gName)
                            entitySet.add(fName)
                            wConcat = comm.replaceWith("", (" ".join(w.text for w in i.words)))
                            entitySet.add(wConcat)
                            lemmalist = list()
                            for w in i.words:
                                lemmalist.append(w.lemmas)
                            produkt = itertools.product(*lemmalist)
                            for j in produkt:
                                entitySet.add(
                                    " ".join(
                                        str(u)
                                        for u in (list(j))
                                        if ((u.lower() != name.lower()) & (u != "") & (u.title() in names))
                                    )
                                )
                            # now remove double values
                            if name in entitySet:
                                entitySet.remove(name)
                            if gName in entitySet:
                                entitySet.remove(gName)
                            if fName in entitySet:
                                entitySet.remove(fName)
                            if "" in entitySet:
                                entitySet.remove("")

                            andmed = {url: {"gName": gName, "fName": fName, "name": name, "lemmaSet": entitySet}}
                            if not (ontologyData.sharedList_per._callmethod("__contains__", (andmed,))):
                                ontologyData.sharedList_per._callmethod("append", (andmed,))

                            if (ontologyData.sharedList_per)._callmethod("__len__") > comm.chunksize:
                                try:
                                    chunkedList = ontologyData.sharedList_per[:]  # makes a copy, not a reference
                                    del ontologyData.sharedList_per[:]
                                    perManager = init_rdf.PeopleManager(ontologyData)
                                    perManager.addTriples(chunkedList)
                                except:
                                    comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_get_PER_entities")
                                    pass
                    else:
                        objName = freqLemma.title()
                        entitySet = set()
                        entitySet.add(freqLemma)
                        wConcat = comm.replaceWith("", (" ".join(w.text for w in i.words)))
                        entitySet.add(wConcat)
                        lemmalist = list()
                        for w in i.words:
                            lemmalist.append(w.lemmas)
                        produkt = itertools.product(*lemmalist)
                        for j in produkt:
                            entitySet.add(
                                " ".join(str(u) for u in (list(j)) if ((u.lower() != objName.lower()) & (u != "")))
                            )
                        if "" in entitySet:
                            entitySet.remove("")

                        andmed = {url: {objName: entitySet}}

                        if label == "ORG":
                            if not (ontologyData.sharedList_org._callmethod("__contains__", (andmed,))):
                                ontologyData.sharedList_org._callmethod("append", (andmed,))
                        elif label == "LOC":
                            if not (ontologyData.sharedList_loc._callmethod("__contains__", (andmed,))):
                                ontologyData.sharedList_loc._callmethod("append", (andmed,))

                        if (ontologyData.sharedList_org)._callmethod("__len__") > comm.chunksize:
                            try:
                                chunkedList = ontologyData.sharedList_org[:]  # makes a copy, not a reference
                                del ontologyData.sharedList_org[:]
                                # tests
                                # jf = open("tEst.txt", 'a', encoding='utf-8')
                                # jf.write(str(len(chunkedList)) + "\n")
                                # jf.close()
                                orgManager = init_rdf.OrganizationManager(ontologyData)
                                orgManager.addTriples(chunkedList)
                            except:
                                comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_get_ORG_entities")
                                pass
                        if (ontologyData.sharedList_loc)._callmethod("__len__") > comm.chunksize:
                            try:
                                chunkedList = ontologyData.sharedList_loc[:]  # makes a copy, not a reference
                                del ontologyData.sharedList_loc[:]
                                locManager = init_rdf.LocationManager(ontologyData)
                                locManager.addTriples(chunkedList)
                            except:
                                comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_get_LOC_entities")
                                pass
        except:
            comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_getEntities.py")
            pass
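getEntities appends each result to a shared list and flushes that list to an RDF manager once it grows past comm.chunksize. A minimal sketch of the chunk-and-flush pattern in isolation (the names are illustrative; the shared lists in the examples are multiprocessing proxies, hence the _callmethod calls in the original):

def flushIfFull(shared_list, manager, chunksize):
    #copy the accumulated items, clear the shared list, and hand the copy
    #to the manager that writes the triples
    if len(shared_list) > chunksize:
        chunked = shared_list[:]   #a copy, not a reference
        del shared_list[:]
        manager.addTriples(chunked)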
Example #20
    '''
    data0 = None
    data = None
    #
    try:
        data =  json.loads((  json.loads(sys.argv[1])  )["data"])
        data0 =  json.loads(sys.argv[1]) 
        #data =  json.loads((data0)["data"]) 
        comm.chunksize =  int(json.loads((data0)["chunksize"])) 
        
        #jf = open("/var/www/html/ch.txt", 'a')
        #jf.write(str(datetime.datetime.now()) + "\nCHUNKSIZE: " +  str(comm.chunksize) + "\n\n")
        #jf.close()
        
    except:
        comm.printException(comm.pathToSaveProgrammingErrors, "load_DATA_in_connector")
        pass

    if(data is not None):
        jobs = list(data.values())
        nrOfJobs=len(jobs)
        pool = Pool(processes=os.cpu_count())
        pool.map(sendUrl, jobs)
        pool.close()
        pool.join()
    
    

#FINALLY, add triples from the items left over in the lists.
#In the file 'getEntities',
#when chunking shared lists, it starts to create RDF-s, when list size exceeds chunksize (e.g. 25 items),
Example #21
def getEntities(url, text, ontologyData, orgWords=['kogu', 'selts', 'ansambel', 'keskus', 'ühendus', 'ühing', 'mtü', 'oü', 'as', 'klubi', 'asutus', 'keskus', 'fond', 'cup'], locWords=['vabarii', 'maakond']):
    #print("GETENTITIES ", url)
    #printIncomingText(text)
    if("^" not in text):#invalid for RDFlibURI
        ntwl = list()
        ner_tagged = None
        try:
            ner_tagged = tagger(analyzer(tokenizer(text)))
        except:
            comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_getEntities.py-def_getEntities_:_ner_tagged " + str(text))
            pass
        if (ner_tagged is not None):
            try:
                ntwl = ner_tagged.named_entities
            except:
                comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_getEntities.py-def_getEntities_:_ntwl" + str(len(ntwl)) + " " + str(text))
                pass
        try:
            if(len(ntwl) > 0):
                
                andmed = dict()
                #get label for entity
                for i in ntwl:
                    label = i.label
                    
                    freqLemma = comm.replaceWith(' ', i.lemma)#replace some chars with space
                    
                    frlower = freqLemma.lower()
                    #correct some ner labels
                    for ow in orgWords:
                        if(ow.lower() in frlower ):
                            label = "ORG"
                    for lw in locWords:
                        if(lw.lower() in frlower ):
                            label = "LOC"
                            
                    #process values by labels
                    if label == "PER":
                        entitySet = set()
                        if(freqLemma != ""):
                            name = freqLemma.title()
                            names = name.split(' ')
                            gName = ""
                            fName = ""
                            try:
                                if len(names) > 1:
                                    if len(names) > 2:#more than 1 given name, assuming 2 of them
                                        gName = names[0] + " " + names[1]
                                        fName = names[2]#family name
                                    elif len(names) == 2:
                                        gName = names[0]#one given name
                                        fName = names[1]#family name
                            except:
                                comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_getEntities.py-def_getEntities_gname-fname")
                                pass
                            entitySet.add(freqLemma)
                            #added temporarily; removed further down to avoid duplicate values
                            entitySet.add(name)
                            entitySet.add(gName)
                            entitySet.add(fName)
                            wConcat = comm.replaceWith('', (' '.join(w.text for w in i.words)))
                            entitySet.add(wConcat)
                            lemmalist = list()
                            for w in i.words:
                                lemmalist.append(w.lemmas)
                            produkt = itertools.product(*lemmalist)
                            for j in produkt:
                                entitySet.add( " ".join(str(u) for u in(list(j)) if ((u.lower() != name.lower()) & (u != "") & (u.title() in names))  ) )
                            #now remove double values
                            if name in entitySet:
                                entitySet.remove(name)
                            if gName in entitySet:
                                entitySet.remove(gName)
                            if fName in entitySet:
                                entitySet.remove(fName)
                            if "" in entitySet:
                                entitySet.remove("")
                                
                            andmed={url: {"gName": gName, "fName": fName, "name": name, "lemmaSet": entitySet}};
                            if not(ontologyData.sharedList_per._callmethod('__contains__', (andmed,))):
                                ontologyData.sharedList_per._callmethod('append', (andmed,))    
                            
                            if ((ontologyData.sharedList_per)._callmethod('__len__') > comm.chunksize):
                                try:
                                    chunkedList = ontologyData.sharedList_per[:]#makes a copy, not a reference
                                    del ontologyData.sharedList_per[:]
                                    perManager = init_rdf.PeopleManager(ontologyData)
                                    perManager.addTriples(chunkedList)
                                except:
                                    comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_get_PER_entities") 
                                    pass
                    else:
                        objName = freqLemma.title()
                        entitySet = set();
                        entitySet.add(freqLemma);
                        wConcat = comm.replaceWith('', (' '.join(w.text for w in i.words)))
                        entitySet.add(wConcat)
                        lemmalist = list()
                        for w in i.words:
                            lemmalist.append(w.lemmas)
                        produkt = itertools.product(*lemmalist)
                        for j in produkt:
                            entitySet.add( " ".join(str(u) for u in(list(j)) if ((u.lower() != objName.lower()) & (u != ""))  ) )
                        if "" in entitySet:
                            entitySet.remove("")
                        
                        andmed = {url: {objName: entitySet}}
                            
                        if(label == "ORG"):
                            if not(ontologyData.sharedList_org._callmethod('__contains__', (andmed,))):
                                ontologyData.sharedList_org._callmethod('append', (andmed,))
                        elif(label == "LOC"):
                            if not(ontologyData.sharedList_loc._callmethod('__contains__', (andmed,))):
                                ontologyData.sharedList_loc._callmethod('append', (andmed,))
                        
                        if ((ontologyData.sharedList_org)._callmethod('__len__') > comm.chunksize):
                            try:
                                chunkedList = ontologyData.sharedList_org[:]  # makes a copy, not a reference
                                del ontologyData.sharedList_org[:]
                                orgManager = init_rdf.OrganizationManager(ontologyData)
                                orgManager.addTriples(chunkedList)
                            except:
                                comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_get_ORG_entities")
                                pass
                        if ((ontologyData.sharedList_loc)._callmethod('__len__') > comm.chunksize):
                            try:
                                chunkedList = ontologyData.sharedList_loc[:]  # makes a copy, not a reference
                                del ontologyData.sharedList_loc[:]
                                locManager = init_rdf.LocationManager(ontologyData)
                                locManager.addTriples(chunkedList)
                            except:
                                comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_get_LOC_entities")
                                pass
        except:
            comm.printException(comm.initRdfErrorsFilePath, "\n" + url + "\n_getEntities.py")
            pass
Ejemplo n.º 22
0
    '''
    data0 = None
    data = None
    #
    try:
        #parse the JSON payload from the command line once, then pull out the individual fields
        data0 = json.loads(sys.argv[1])
        data = json.loads(data0["data"])
        comm.chunksize = int(json.loads(data0["chunksize"]))

    except:
        comm.printException(comm.pathToSaveProgrammingErrors,
                            "load_DATA_in_connector")
        pass

    if (data is not None):
        jobs = list(data.values())
        nrOfJobs = len(jobs)
        pool = Pool(processes=os.cpu_count())
        pool.map(sendUrl, jobs)
        pool.close()
        pool.join()

#FINALLY, add triples from the lists that are left over.
#In the file 'getEntities', the shared lists are flushed to RDF only once their size exceeds
#comm.chunksize (e.g. 25 items), so items still sitting in a list below that threshold have
#not been turned into triples yet; they are handled here, as sketched below.
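The listing is cut off at this point; the sketch below shows what that final flush could look like. It reuses the shared lists, the init_rdf manager classes and the comm error-logging helper seen elsewhere in these examples, and it assumes an ontologyData object is in scope here; the error-label string is only illustrative.

try:
    #flush whatever is still sitting in the shared lists, even if below comm.chunksize
    leftoverPer = ontologyData.sharedList_per[:]
    del ontologyData.sharedList_per[:]
    if len(leftoverPer) > 0:
        init_rdf.PeopleManager(ontologyData).addTriples(leftoverPer)

    leftoverOrg = ontologyData.sharedList_org[:]
    del ontologyData.sharedList_org[:]
    if len(leftoverOrg) > 0:
        init_rdf.OrganizationManager(ontologyData).addTriples(leftoverOrg)

    leftoverLoc = ontologyData.sharedList_loc[:]
    del ontologyData.sharedList_loc[:]
    if len(leftoverLoc) > 0:
        init_rdf.LocationManager(ontologyData).addTriples(leftoverLoc)
except:
    #same logging convention as the rest of the code; the label string is a placeholder
    comm.printException(comm.initRdfErrorsFilePath, "final_flush_of_leftover_lists")
    pass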
Ejemplo n.º 23
0
    def addTriples(self, chunkedList, addLemmas=True):
        try:
            newDataExists = False
            g = self.getPerRdfGraph()
            g_new = Graph()
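            #g is the person graph loaded from the existing RDF file; g_new collects only the
            #triples added in this run, so it is easy to tell whether anything new was found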
            #bind the specific namespace prefixes to the graph
            self.bindNamespaces(g)
            for andmed in chunkedList:
                for webpage in andmed:
                    gName = andmed[webpage]["gName"]
                    fName = andmed[webpage]["fName"]
                    name = andmed[webpage]["name"]
                    lemmaList = andmed[webpage]["lemmaSet"]
                    try:
                        #make triples
                        newPerson = URIRef(
                            self.perStr +
                            name.replace(">", "").replace("<", "").replace(
                                "|", "").replace(" ", "_").lower())
                        newGivenName = Literal(gName)
                        newFamilyName = Literal(fName)
                        newPerName = Literal(name)
                        newWebpage = URIRef(webpage)

                        #add triples
                        #check whether the graph already contains this person
                        if (newPerson, RDF.type, URIRef(self.person)) not in g:
                            newDataExists = True
                            g_new.add(
                                (newPerson, RDF.type, URIRef(self.person)))
                            if (newGivenName != Literal("")):
                                g_new.add(
                                    (newPerson, self.givenName, newGivenName))
                            if (newFamilyName != Literal("")):
                                g_new.add((newPerson, self.familyName,
                                           newFamilyName))
                            g_new.add((newPerson, self.perName, newPerName))

                        #check whether the graph already links this person to the webpage
                        if (newPerson, self.mentionedAtSite,
                                newWebpage) not in g:
                            newDataExists = True
                            g_new.add(
                                (newPerson, self.mentionedAtSite, newWebpage))
                        #add lemmas also
                        if (addLemmas):
                            for newLemma in lemmaList:
                                #check whether the graph already contains this lemma for the person
                                if (newPerson, self.lemma,
                                        Literal(newLemma)) not in g:
                                    newDataExists = True
                                    g_new.add((newPerson, self.lemma,
                                               Literal(newLemma)))
                    except:
                        comm.printException(comm.initRdfErrorsFilePath,
                                            "build_per_graph")
                        pass
            #write rdf into file
            if (newDataExists):
                try:
                    gg = g + g_new
                    (gg).serialize(self.perRdf,
                                   format='pretty-xml',
                                   encoding='utf-8')
                except:
                    comm.printException(
                        comm.initRdfErrorsFilePath,
                        "RDF People Manager serialization error: ")
                    pass
        except:
            comm.printException(comm.initRdfErrorsFilePath,
                                "RDF People Manager (addTriples) error: ")
            pass