コード例 #1
0
def processOntodict(ontodict, ontopath, mtype):
    """Merge extracted classes and concepts into the ontology at ``ontopath``.

    The ontology is loaded from ``ontopath``, optionally extended with new
    classes, populated with linked document/concept individuals and then
    written back over the same file as RDF/XML. The loaded ontology is
    expected to already expose ``Concept``, ``Document`` and the properties
    ``documentHasConcept`` / ``conceptInDocument``, since they are accessed
    as attributes of the ontology below.

    Args:
        ontodict: Mapping with the keys ``'clases'`` (class name -> iterable
            of individual names) and ``'concepts'`` (iterable of
            ``(concept_name, document_id)`` pairs).
        ontopath: Path of the ontology file; used for loading (prefixed with
            ``file://``) and as the save destination.
        mtype: When equal to ``1`` the entries of ``ontodict['clases']`` are
            created as subclasses of ``onto.Concept``; any other value skips
            class generation.

    Returns:
        ``True`` when the ontology was saved, ``False`` when opening or
        writing the output file failed.
    """
    log("Call to processOntodict() with ontopath: " + ontopath)
    import owlready2
    onto = owlready2.get_ontology("file://" + ontopath)
    onto.load()

    clases = ontodict['clases']
    concepts = ontodict['concepts']
    if mtype == 1:
        log("Generating clases...")
        for classkey, classelem in clases.items():
            ontoclass = classkey.title()
            with onto:
                NewClass = owlready2.types.new_class(ontoclass,
                                                     (onto.Concept, ),
                                                     kwds={})
                # Each element becomes a lower-cased individual of the class.
                for elem in classelem:
                    NewClass(elem.lower())

    for concept in concepts:
        conceptname = concept[0]
        docid = concept[1]
        with onto:
            # Reuse the document individual if its IRI already exists.
            currentDocumentSearch = onto.search(iri=onto.base_iri + str(docid))
            if len(currentDocumentSearch) > 0:
                currentDocument = currentDocumentSearch[0]
            else:
                currentDocument = onto.Document(docid)
            # Same lookup-or-create step for the lower-cased concept name.
            currentConceptSearch = onto.search(iri=onto.base_iri +
                                               conceptname.lower())
            if len(currentConceptSearch):
                currentConcept = currentConceptSearch[0]
            else:
                currentConcept = onto.Concept(conceptname.lower())
            currentDocument.documentHasConcept.append(currentConcept)
            currentConcept.conceptInDocument.append(currentDocument)

    try:
        # The context manager closes the handle even when save() raises.
        # NOTE: 'wb+' truncates the existing file before anything is written.
        with open(ontopath, 'wb+') as onto_file:
            onto.save(file=onto_file, format="rdfxml")
        log("Saved file to path " + ontopath)
        log("Call to destroy ontolgy: " + ontopath)
        saved = True
    except Exception:
        log("Error saving ontology, destroying object: " + ontopath)
        saved = False
    # Single cleanup path shared by success and failure.
    onto.destroy()
    del onto
    return saved
コード例 #2
0
def getVersion():
    """Query and log the database server version.

    Opens a connection through ``connect()``, runs ``SELECT VERSION()`` and
    logs the fetched row.

    Returns:
        The row returned by ``cursor.fetchone()`` for the version query.
    """
    db = connect()
    try:
        cursor = db.cursor()
        cursor.execute("SELECT VERSION()")
        data = cursor.fetchone()
        api_log.log("Database version : %s " % data)
        return data
    finally:
        # Release the connection even if the query or the logging fails.
        db.close()
コード例 #3
0
def _getText(filename):
    """Extract the textual content of a document, best effort.

    Args:
        filename: Path of the document handed to ``textract.process``.

    Returns:
        The extracted content decoded as UTF-8, or an empty string when
        extraction or decoding fails (the failure is logged, not raised).
    """
    try:
        return textract.process(filename).decode("utf-8")
    except Exception:
        # Deliberately best-effort, but no longer swallowing
        # KeyboardInterrupt / SystemExit as the bare ``except`` did.
        log("Could not read document content: " + filename)
        return ""
コード例 #4
0
def annotateDocumentsInPath(path, ontopath):
    """Annotate every PDF found directly inside ``path`` into an ontology.

    Each regular file in ``path`` whose name ends in ``.pdf`` / ``.PDF`` is
    registered through ``coruja_database.insertDocument`` and passed to
    ``processDocument``, which accumulates its findings in a shared
    dictionary. The top 10% of the accumulated entries (ranked by their
    ``"count"``) become candidate classes; for each one, the first half of
    its neighbors in sorted key order is examined and those whose stored
    value is greater than 1 are kept. The result is handed to
    ``processOntodict`` with ``mtype`` 1.

    Args:
        path: Directory to scan (joined to file names with ``"/"``).
        ontopath: Path of the ontology file to update.

    Returns:
        dict mapping every processed file path to ``1``.
    """
    log("Call to annotateDocumentsInPath()")
    from nltk.tag import StanfordPOSTagger
    import coruja_database

    moduleDir = os.path.dirname(__file__)
    os.environ["STANFORD_MODELS"] = os.path.join(
        moduleDir, 'scpDocs/stanford-postagger-full-2017-06-09/models')

    # Resources are resolved relative to this module's directory.
    lemmaDict = pd.read_pickle(
        os.path.join(moduleDir, 'lemmatization-es.pkl'))
    lemmaDict.columns = ["lemma", "token"]
    taggerJar = os.path.join(
        moduleDir,
        'scpDocs/stanford-postagger-full-2017-06-09/stanford-postagger.jar')
    spanish_postagger = StanfordPOSTagger('spanish.tagger', taggerJar)
    posTagDescDf = pd.read_csv(
        os.path.join(moduleDir, "Stanford_POS_Tags.csv"))
    maxWordDistance = 2

    pdfNames = [
        name for name in os.listdir(path)
        if os.path.isfile(path + "/" + name)
        and name.endswith(('.pdf', '.PDF'))
    ]

    status = {}
    ontoDict = {}
    for name in pdfNames:
        filepath = path + "/" + name
        log("Procesing " + filepath)
        docid = coruja_database.insertDocument(path + "/", name)
        processDocument(docid, filepath, ontoDict, maxWordDistance,
                        posTagDescDf, spanish_postagger, lemmaDict)
        status[filepath] = 1

    # Rank entries by (count, key), highest first, and keep the top tenth.
    ranked = sorted(
        ((entry["count"], term) for term, entry in ontoDict.items()),
        reverse=True)
    topCount = math.trunc(0.1 * len(ontoDict))
    ontoDictFinal = {
        "clases": {},
        "concepts": {(term, entry["docid"])
                     for term, entry in ontoDict.items()},
    }

    for _, term in ranked[:topCount]:
        neighbors = ontoDict[term]["neighbors"]
        halfCount = math.trunc(0.5 * len(neighbors))
        kept = [
            other for other in sorted(neighbors)[:halfCount]
            if other != term and neighbors[other] > 1
        ]
        if kept:
            ontoDictFinal["clases"][term] = set(kept)

    processOntodict(ontoDictFinal, ontopath, 1)
    return status
コード例 #5
0
def getActiveOntologies():
    """Return the file paths of all ontologies flagged as active.

    Selects the rows of the ``ontologia`` table with ``Bo_OntEstado = 1``
    and builds each path by concatenating column 3 and column 2 of the row
    (used below as directory and file name respectively).

    Returns:
        list of str with one entry per fetched row. If the query raises
        ``MySQLError`` the error is logged and whatever was collected so far
        (possibly nothing) is returned.
    """
    ontofiles = []
    db = connect()
    try:
        cursor = db.cursor()
        try:
            cursor.execute("SELECT * FROM `ontologia` WHERE Bo_OntEstado = 1")
            for row in cursor.fetchall():
                ontofile = row[3] + row[2]
                ontofiles.append(ontofile)
                api_log.log("Active ontology: " + ontofile)
        except MySQLError:
            api_log.log("Error: unable to fetch data")
    finally:
        # Close the connection on every path, including unexpected errors.
        db.close()
    return ontofiles
コード例 #6
0
def createBaseOntology(filename, filepath):
    """Create an empty base ontology, write it to disk and register it.

    A new ontology with IRI ``config.OntologyNamespace + filename + ".owl"``
    is given the classes ``Document`` and ``Concept`` plus the object
    properties ``documentHasConcept`` and ``conceptInDocument`` (declared as
    inverse of each other). It is saved as RDF/XML to
    ``filepath + filename + ".owl"`` and recorded through
    ``coruja_database.insertOntology``.

    Args:
        filename: Base name of the ontology, without the ``.owl`` extension.
        filepath: Directory prefix; it is concatenated directly with the
            file name, so it presumably must end with a path separator.

    Returns:
        Tuple ``(saved file path, file name with extension, namespace URI)``.
    """
    import coruja_database
    import owlready2
    ontofileName = config.OntologyNamespace + filename + ".owl"
    onto = owlready2.get_ontology(ontofileName)
    log("Creating ontology " + ontofileName)

    # The class statements below are not dead code: setting ``namespace``
    # attaches each definition to ``onto`` when the class is created.
    class Document(owlready2.Thing):
        namespace = onto

    class Concept(owlready2.Thing):
        namespace = onto

    class documentHasConcept(owlready2.ObjectProperty):
        namespace = onto
        domain = [Document]
        range = [Concept]

    class conceptInDocument(owlready2.ObjectProperty):
        namespace = onto
        domain = [Concept]
        range = [Document]
        inverse_property = documentHasConcept

    # NOTE: a functional ``hasId`` data property (Document -> int) was
    # sketched here previously and is intentionally not defined.

    filenameOwl = filename + ".owl"
    uri = config.OntologyNamespace
    # Close (and thereby flush) the file before the ontology is registered;
    # the handle was previously left open for the rest of the process.
    with open(filepath + filenameOwl, 'wb+') as onto_file:
        onto.save(file=onto_file, format="rdfxml")
        savedPath = onto_file.name
    coruja_database.insertOntology(uri, filenameOwl, filepath)
    try:
        log("Trying to destroy: " + onto.base_iri)
        onto.destroy()
    except Exception:
        log("Can't destroy ontology")
    return savedPath, filenameOwl, uri
コード例 #7
0
def getOntology(ontoId):
    """Look up the file path of an ontology by its identifier.

    Runs a parameterized query on the ``ontologia`` table filtering by
    ``id_ontologia`` and builds the path by concatenating column 3 and
    column 2 of the fetched row.

    Args:
        ontoId: Identifier bound to the query; may be a string or a number.

    Returns:
        The ontology path, or ``""`` when no row matches or the query
        raises ``MySQLError`` (both cases are logged).
    """
    ontofile = ""
    db = connect()
    try:
        cursor = db.cursor()
        try:
            sql = "SELECT * FROM `ontologia` WHERE id_ontologia = %s"
            cursor.execute(sql, (ontoId, ))
            result = cursor.fetchone()
            if result is not None:
                ontofile = result[3] + result[2]
                api_log.log("Got ontology: " + ontofile)
            else:
                # str() keeps the log call from raising TypeError when the
                # caller passes a numeric id.
                api_log.log("Wrong ontology id: " + str(ontoId))
        except MySQLError:
            api_log.log("Error: unable to fetch data")
    finally:
        # Runs on every path; the not-found branch used to return early
        # without closing the connection.
        db.close()
    return ontofile
コード例 #8
0
ファイル: __init__.py プロジェクト: jpalanco/alienvault-ossim
 def __init__(self, http_code=500, log=None, log_level='ERROR', message=''):
     """Initialise the exception and optionally emit a log entry.

     Args:
         http_code: Value stored on the instance as ``_http_code``.
         log: Text to log; nothing is logged when it is ``None``.
         log_level: Level passed to ``api_log.log`` together with ``log``.
         message: Message forwarded to the parent class initialiser.
     """
     super(APIException, self).__init__(message)
     self._http_code = http_code
     if log is None:
         return
     api_log.log(message=log, level=log_level)
コード例 #9
0
def updateConcepts(docId, ontoId, concepts):
    """Restrict a document's concepts in an ontology to a given selection.

    The ontology registered under ``ontoId`` is loaded, the individual whose
    IRI is ``base_iri + str(docId)`` is looked up, and its
    ``documentHasConcept`` list is replaced by the subset whose ``name`` is
    contained in ``concepts``. The ontology is then saved back to its file.

    Args:
        docId: Document identifier appended to the ontology base IRI.
        ontoId: Ontology identifier resolved via ``coruja_database``.
        concepts: Container of concept names to keep (tested with ``in``).

    Returns:
        ``1`` when the document was updated and the ontology saved; ``0``
        when loading failed, the document was not found, the document had
        no concepts, or saving failed.
    """
    log("Call to updateConcepts()")
    import coruja_database
    import owlready2
    ontopath = coruja_database.getOntology(str(ontoId))
    # Drop ontologies left registered in the default world by earlier calls.
    owlready2.default_world.ontologies.clear()
    onto = owlready2.get_ontology("file://" + ontopath)
    log("updateConcepts(): Onto world debug after load: " +
        str(onto.world.ontologies))
    try:
        onto.load()
        log("updateConcepts(): Load ontology " + ontopath)
    except Exception:
        log("updateConcepts(): Error loading ontology " + ontopath)
        return 0
    log("updateConcepts(): Onto world debug: " + str(onto.world.ontologies))
    log("updateConcepts(): Searching for " + onto.base_iri + str(docId))

    status = 1
    result = onto.search(iri=onto.base_iri + str(docId))
    if len(result) > 0:
        document = result[0]
        ontoConcepts = document.documentHasConcept
        if len(ontoConcepts) > 0:
            keepConcepts = []
            for concept in ontoConcepts:
                if concept.name in concepts:
                    keepConcepts.append(concept)
                else:
                    log("updateConcepts(): Deleting concept: " + concept.name)
            document.documentHasConcept = keepConcepts
        else:
            log("updateConcepts(): No concepts found")
            status = 0
    else:
        status = 0
        log("updateConcepts(): Document not found id=" + str(docId))

    if status == 1:
        try:
            # The context manager flushes and closes the handle even when
            # save() raises. NOTE: 'wb+' truncates the file up front.
            with open(ontopath, 'wb+') as onto_file:
                onto.save(file=onto_file, format="rdfxml")
            log("updateConcepts(): Saved ontology to " + ontopath)
        except Exception:
            log("updateConcepts(): Error saving ontology " + ontopath)
            status = 0
    else:
        log("updateConcepts(): Empty ontology loaded " + ontopath)

    try:
        log("updateConcepts(): Destroying ontology " + onto.base_iri)
        onto.destroy()
    except Exception:
        log("updateConcepts(): Error destroying ontology:" + onto.base_iri)
    # Drop the local reference before forcing a collection pass.
    del onto
    gc.collect()
    return status
コード例 #10
0
def annotateDocumentsInList(docList, ontoId, mtype):
    """Annotate a list of documents into the ontology identified by ``ontoId``.

    Every document is passed to ``processDocument``, which accumulates its
    findings in a shared dictionary. The top 10% of the accumulated entries
    (ranked by their ``"count"``) become candidate classes; for each one,
    the first half of its neighbors in sorted key order is examined and
    those whose stored value is greater than 1 are kept. The result is
    handed to ``processOntodict`` together with ``mtype``.

    Args:
        docList: Iterable of dicts providing at least ``'id'`` and ``'path'``.
        ontoId: Ontology identifier resolved via ``coruja_database``.
        mtype: Forwarded unchanged to ``processOntodict``.

    Returns:
        list of ``{"id": ..., "status": ...}`` dicts, one per document.
        ``status`` is ``0`` for every document when the ontology path could
        not be resolved, otherwise ``1``.
    """
    # The entry log used to name annotateDocumentsInPath (copy/paste slip).
    log("Call to annotateDocumentsInList() type = " + str(mtype))
    import coruja_database
    # str() matches how the other callers in this file invoke getOntology.
    ontopath = coruja_database.getOntology(str(ontoId))
    if ontopath == "":
        return [{"id": doc['id'], "status": 0} for doc in docList]

    from nltk.tag import StanfordPOSTagger
    moduleDir = os.path.dirname(__file__)
    os.environ["STANFORD_MODELS"] = os.path.join(
        moduleDir, 'scpDocs/stanford-postagger-full-2017-06-09/models')
    # Resources are resolved relative to this module's directory.
    lemmaDict = pd.read_pickle(
        os.path.join(moduleDir, 'lemmatization-es.pkl'))
    lemmaDict.columns = ["lemma", "token"]
    maxWordDistance = 2
    spanish_postagger = StanfordPOSTagger(
        'spanish.tagger',
        os.path.join(
            moduleDir,
            'scpDocs/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
        ))
    posTagDescDf = pd.read_csv(
        os.path.join(moduleDir, "Stanford_POS_Tags.csv"))

    status = []
    ontoDict = {}
    for doc in docList:
        filepath = doc['path']
        docid = doc['id']
        log("Procesing " + filepath)
        processDocument(docid, filepath, ontoDict, maxWordDistance,
                        posTagDescDf, spanish_postagger, lemmaDict)
        status.append({"id": docid, "status": 1})

    tenPCount = math.trunc(0.1 * len(ontoDict))
    countList = []
    ontoDictFinal = {"clases": {}, "concepts": set()}
    for key, element in ontoDict.items():
        countList.append((element["count"], key))
        ontoDictFinal["concepts"].add((key, element["docid"]))

    # Highest counts first; ties are ordered by key, also descending.
    mainConcepts = sorted(countList, reverse=True)[:tenPCount]

    for _, conceptKey in mainConcepts:
        neighbors = ontoDict[conceptKey]["neighbors"]
        halfCount = math.trunc(0.5 * len(neighbors))
        validNeighbor = [
            neighbor for neighbor in sorted(neighbors)[:halfCount]
            if neighbor != conceptKey and neighbors[neighbor] > 1
        ]
        if validNeighbor:
            ontoDictFinal["clases"][conceptKey] = set(validNeighbor)

    processOntodict(ontoDictFinal, ontopath, mtype)
    log("Returning annotation status - finished annotation process")
    return status
コード例 #11
0
def getConcepts(documentId, ontoId):
    """Return the names of the concepts linked to a document.

    The ontology file registered under ``ontoId`` is copied to a temporary
    file inside ``config.OntologyDir``; the copy is loaded, queried for the
    individual with IRI ``base_iri + str(documentId)``, and removed again.

    Args:
        documentId: Document identifier appended to the ontology base IRI.
        ontoId: Ontology identifier resolved via ``coruja_database``.

    Returns:
        list of unique concept names in the order they are first seen;
        empty when the ontology cannot be loaded or the document is not
        found.
    """
    import coruja_database
    import time
    import owlready2
    log("Call to getConcepts(): docid = " + str(documentId) + " - ontid = " +
        str(ontoId))
    result = []
    ontopath = coruja_database.getOntology(str(ontoId))
    # Work on a temporary copy of the ontology file.
    # NOTE(review): the name only has one-second resolution, so two calls
    # within the same second would share a temp file — confirm whether
    # concurrent calls are possible.
    tmpFilename = config.OntologyDir + str(int(time.time())) + "_tmp.owl"
    copyfile(ontopath, tmpFilename)
    log("getConcepts(): Copied ontology from " + ontopath + " to " +
        tmpFilename)
    owlready2.default_world.ontologies.clear()
    getConceptsOnto = owlready2.get_ontology("file://" + tmpFilename)
    log("getConcepts(): Onto world debug before load: " +
        str(getConceptsOnto.world.ontologies))
    try:
        getConceptsOnto.load()
        log("getConcepts(): Load ontology :" + tmpFilename)
    except Exception:
        log("getConcepts(): Error loading ontology " + tmpFilename)
        # Do not leave the temporary copy behind on the failure path.
        try:
            os.remove(tmpFilename)
        except OSError:
            log("getConcepts(): Could not delete temp file " + tmpFilename)
        return result

    documents = getConceptsOnto.search(iri=getConceptsOnto.base_iri +
                                       str(documentId))
    log("getConcepts(): Onto world debug: " +
        str(getConceptsOnto.world.ontologies))
    if len(documents) > 0:
        document = documents[0]
        log("getConcepts(): Document found " + document.iri)
        concepts = document.documentHasConcept
        for concept in concepts:
            # Compare by name: ``result`` holds names, not concept objects,
            # so testing the object itself never filtered duplicates.
            if concept.name not in result:
                result.append(concept.name)
        if len(concepts) > 0:
            log("getConcepts(): Concepts found: " + str(len(concepts)))
        else:
            log("getConcepts(): No concepts found.")
    else:
        log("No document found. docid = " + str(documentId))

    os.remove(tmpFilename)
    log("getConcepts(): Deleting temp file " + tmpFilename)
    log("getConcepts(): Trying to destroy " + getConceptsOnto.base_iri)
    try:
        getConceptsOnto.destroy()
        log("getConcepts(): Ontology destroyed")
    except Exception:
        log("getConcepts(): Can not destroy ontology loaded")
    # Drop the local reference before forcing a collection pass.
    del getConceptsOnto
    gc.collect()
    return result
コード例 #12
0
def getDocumentsFromOntology(concepts, ontopath, resultDocuments):
    """Collect the documents linked to concepts that approximately match.

    Every individual of ``onto.Concept`` in the ontology at ``ontopath`` is
    compared with each query token using ``editdistance.eval``; a concept
    matches when its smallest distance to any token is below 3. The names
    of the documents in ``conceptInDocument`` of each matching concept are
    appended to ``resultDocuments`` if not already present.

    Args:
        concepts: Iterable of query tokens. When it is empty nothing
            matches (previously the unset score ``-1`` passed the threshold
            and every concept matched).
        ontopath: Path of the ontology file to load.
        resultDocuments: List extended in place with document names.

    Returns:
        None. Results are delivered through ``resultDocuments``; nothing is
        added when the ontology cannot be loaded.
    """
    log("Call to getDocumentsFromOntology(): " + ontopath)
    import owlready2
    owlready2.default_world.ontologies.clear()
    onto = owlready2.get_ontology("file://" + ontopath)
    try:
        onto.load()
        log("getDocumentsFromOntology(): Loaded ontology " + ontopath)
    except Exception:
        log("getDocumentsFromOntology(): Error loading ontology " + ontopath)
        return
    log("getDocumentsFromOntology(): Onto world debug: " +
        str(onto.world.ontologies))

    # Materialize once so a one-shot iterable is usable for every concept.
    tokens = list(concepts)
    validConcepts = []
    if tokens:
        for concept in onto.search(is_a=onto.Concept):
            closest = min(
                editdistance.eval(concept.name, token) for token in tokens)
            if closest < 3:
                validConcepts.append(concept)

    # Expand the matching concepts into their documents.
    for concept in validConcepts:
        for document in concept.conceptInDocument:
            if document.name not in resultDocuments:
                resultDocuments.append(document.name)

    try:
        log("getDocumentsFromOntology(): Call to destroy() ontology " +
            ontopath)
        onto.destroy()
    except Exception:
        log("getDocumentsFromOntology(): Can't destroy ontology " + ontopath)
    # Drop the local reference before forcing a collection pass.
    del onto
    gc.collect()
コード例 #13
0
 def __init__(self, http_code=500, log=None, log_level='ERROR', message=''):
     """Initialise the exception and optionally emit a log entry.

     Args:
         http_code: Value stored on the instance as ``_http_code``.
         log: Text to log; nothing is logged when it is ``None``.
         log_level: Level passed to ``api_log.log`` together with ``log``.
         message: Message forwarded to the parent class initialiser.
     """
     super(APIException, self).__init__(message)
     self._http_code = http_code
     if log is None:
         return
     api_log.log(message=log, level=log_level)