def getWordAggregationFromFile(fileName,
                               word,
                               entityIndex=None,
                               folder="Texts"):
    """
    Look up the weight of a word in a frequency file.

    Without an entity index the value of the "tf" column of the first row
    whose "word" column equals ``word`` is returned. With an entity index the
    row must also match on the "entity" column, and the value of the LAST
    column of the first matching row is returned (assumed to hold the tf-idf
    weight -- TODO confirm against the layout of the frequency file).

    Parameters:
    :param fileName: name of the frequency file, loaded with readDataFile
    :param word: word to look up; it is expected to come from the vocabulary
    :param entityIndex: entity to restrict the lookup to, or None to ignore
                        the "entity" column
    :param folder: folder where to get the file (fileName)
    :return: the weight as a float, or 0 when no row matches
    """
    fileFrame = readDataFile(fileName, folder)
    wordMask = fileFrame["word"] == word

    if entityIndex is None:
        rowInfo = fileFrame.loc[wordMask]
        if rowInfo.empty:
            return 0
        return float(rowInfo["tf"].iloc[0])

    rowInfo = fileFrame.loc[wordMask & (fileFrame["entity"] == entityIndex)]
    if rowInfo.empty:
        return 0
    # ``rowInfo[-1]`` would be a *column* lookup on a DataFrame and raise
    # KeyError; positional access to the last cell of the row is what works.
    return float(rowInfo.iloc[0, -1])
Example #2
0
def numberOfImpEntity(wordImpCSV, wordImpCSVFolder):
    """
    Count the distinct values of the "entity" column of a data file.

    Parameters:
    :param wordImpCSV: name of the file, loaded with readDataFile
    :param wordImpCSVFolder: folder containing the file
    :return: the number of distinct entities as an int (also printed)
    """
    entityColumn = readDataFile(wordImpCSV, wordImpCSVFolder)['entity']
    distinctEntities = entityColumn.nunique()
    for line in ("### entity groups", distinctEntities, "###"):
        print(line)
    return int(distinctEntities)
Example #3
0
def analysisValues(csvKB, csvKBFolder):
    """
    Report the size of a knowledge-base file and how many of its cells are
    empty strings.

    Parameters:
    :param csvKB: name of the file, loaded with readDataFile
    :param csvKBFolder: folder containing the file
    :return: (number of rows, number of cells equal to '')
    """
    knowledgeBase = readDataFile(csvKB, csvKBFolder)

    print("### number of entities ")
    entityCount, _ = knowledgeBase.shape
    print(entityCount)
    print("###")

    print("### missing values ")
    # A cell counts as missing only when it is exactly the empty string.
    emptyCells = knowledgeBase == ''
    emptyPerRow = emptyCells.sum(axis=1)
    missingCount = emptyPerRow.sum(axis=0)
    print(missingCount)
    print("###")

    return entityCount, missingCount
def createStopListFromFile(fileName,
                           columnName=None,
                           by="column",
                           folder="Texts"):
    """
    Build a set of stop words from one or several columns of a data file.

    Parameters:
    :param fileName: name of the csv file, loaded with readDataFile
    :param columnName: a column label, or a list of column labels, whose
                       values are the stop words. It must be provided: the
                       default None is not a valid column selection.
    :param by: kept for backward compatibility; it is not used
    :param folder: folder where to get the file (fileName)
    :return: the set of distinct values found in the selected column(s)
    """
    df = readDataFile(fileName, folder)
    # Selecting several columns yields a 2-D array whose ``tolist()`` is a
    # list of (unhashable) rows; flattening first makes ``set`` work for both
    # the single-column and the multi-column case.
    words = df[columnName].values.ravel().tolist()
    return set(words)
def checkIfEntityInDataset(entityURI,
                           entityAttributeName,
                           datasetFile,
                           datasetFileFolder="Outputs"):
    """
    Tell whether an entity appears in a given column of a dataset file.

    Both the searched value and the column values are compared through their
    string representation.

    Parameters:
    :param entityURI: value to look for
    :param entityAttributeName: label of the column to search in
    :param datasetFile: name of the dataset file, loaded with readDataFile
    :param datasetFileFolder: folder containing the dataset file
    :return: True when the value is found, False otherwise (including when
             the column is empty)
    """
    column = readDataFile(datasetFile, datasetFileFolder)[entityAttributeName]
    wanted = str(entityURI)
    return any(str(value) == wanted for value in column)
def _wordVectorsFromCell(model, modelVocabulary, cellText):
    """Map every in-vocabulary word extracted from ``cellText`` to its vector."""
    words, _ = createVocabulary(stoplist, cellText)
    vectors = {}
    for word in words:
        if word in modelVocabulary:
            print("# ", word)
            # ``model.wv[word]`` is the supported accessor; indexing the
            # Word2Vec object directly is deprecated in gensim.
            vectors[word] = model.wv[word]
    return vectors


def getAttributeVector(myModel,
                       dataBaseFile,
                       entity,
                       entityProperty=None,
                       dataBaseFolder="Texts"):
    """
    Collect the embedding vectors of the words found in an entity's
    property/properties.

    The database file is loaded, every cell is converted to ``str`` and the
    first row whose "entity" column equals ``str(entity)`` is used. The text
    of each requested property is tokenised with createVocabulary (using the
    module-level ``stoplist``) and every resulting word that belongs to the
    model vocabulary is mapped to its vector.

    Parameters:
    :param myModel: file name of the Word2Vec model, resolved inside MODEL
    :param dataBaseFile: the database file name, loaded with readDataFile
    :param entity: the URI of the entity we are interested in
    :param entityProperty: a column label (str) or a list of column labels
    :param dataBaseFolder: folder containing the database file
    :return: a tuple ``(vectorSize, vectors)`` in every case.
             For a str property ``vectors`` is ``{property: {word: vector}}``
             (or ``{}`` when the cell is empty or the column is unknown).
             For a list it is a list with one such dict per property that is
             a column of the file (``{}`` for an empty cell).
             ``{}`` is returned when the entity is not found or when
             ``entityProperty`` is neither a str nor a list; the empty
             container of the requested shape is returned when extracting
             the vectors fails.
    """
    df = readDataFile(dataBaseFile, dataBaseFolder)
    listOfColumns = list(df.columns)
    model = Word2Vec.load(os.path.join(MODEL, myModel))
    vectorSize = model.vector_size
    # Keep the vocabulary mapping itself: membership tests on it are O(1),
    # whereas a list copy of its keys costs O(vocabulary) per lookup.
    modelVocabulary = model.wv.vocab
    print("Shape of data frame: ", df.shape)

    # search for the entity in the dataBase file (string comparison)
    df = df.applymap(str)
    rowInfo = df.loc[df["entity"] == str(entity)]
    if rowInfo.empty:
        return vectorSize, {}
    # Take the row from the filtered frame: using its index *label* as a
    # position with ``df.iloc`` is only right for a default RangeIndex.
    listRow = rowInfo.iloc[0]

    if isinstance(entityProperty, str):
        attributeVector = {}
        if entityProperty not in listOfColumns:
            print("PROPERTY : ", entityProperty, "NOT IN DATABASE")
            return vectorSize, attributeVector
        colIndex = listOfColumns.index(entityProperty)
        print("Property index", colIndex)
        cellText = listRow.iloc[colIndex]
        if not cellText:
            return vectorSize, attributeVector
        try:
            attributeVector[entityProperty] = _wordVectorsFromCell(
                model, modelVocabulary, cellText)
        except Exception as error:
            # Best effort, as before, but the caller still gets a tuple and
            # the actual failure is reported.
            print("PROPERTY : ", entityProperty, "NOT IN DATABASE")
            print(error)
            return vectorSize, {}
        print("Attribute:", attributeVector)
        return vectorSize, attributeVector

    if isinstance(entityProperty, list):
        listOfAttributesVectors = []
        try:
            for propertyInList in entityProperty:
                if propertyInList not in listOfColumns:
                    continue
                colIndex = listOfColumns.index(propertyInList)
                print("Property index ", colIndex)
                attributeVector = {}
                cellText = listRow.iloc[colIndex]
                if cellText:
                    attributeVector[propertyInList] = _wordVectorsFromCell(
                        model, modelVocabulary, cellText)
                listOfAttributesVectors.append(attributeVector)
        except Exception as error:
            print("PROPERTY : ", entityProperty, "NOT IN DATABASE")
            print(error)
            return vectorSize, []
        print(listOfAttributesVectors)
        return vectorSize, listOfAttributesVectors

    # No usable property selection was given.
    return vectorSize, {}
def _usableEntityVectors(dataFrame, corpusEmbeddedModel, dataBaseFile,
                         frequencyModelFile, model, listOfAttributs,
                         dataBaseFolder, frequencyFolder):
    """Return (entity, vector) pairs for the rows whose vector exists and is non-zero."""
    usable = []
    for entity in dataFrame["entity"]:
        entity = str(entity)
        vectorSize, attributeVector = getAttributeVector(
            corpusEmbeddedModel, dataBaseFile, entity, listOfAttributs,
            dataBaseFolder)
        entityVector = usableAttributeVector(frequencyModelFile, model,
                                             entity, attributeVector,
                                             vectorSize, frequencyFolder)
        # usableAttributeVector returns None when it has nothing to sum;
        # such entities cannot be compared, exactly like all-zero vectors.
        if entityVector is None:
            continue
        if np.array_equal(entityVector,
                          np.zeros(vectorSize, dtype="float64")):
            continue
        usable.append((entity, entityVector))
    return usable


def completeSimilarityOfDatasets(corpusEmbeddedModel,
                                 model,
                                 dataBaseFileOne,
                                 frequencyModelFileOne,
                                 dataBaseFileTwo,
                                 frequencyModelFileTwo,
                                 properties=None,
                                 modelFolder="Models",
                                 dataBaseFolder="Texts",
                                 frequencyFolder="Outputs"):
    """
    This function takes two datasets (csv format) and writes a tab-separated
    file containing the cross similarity of all their entities.

    Each entity's vector is computed once. Every pair (entity of file one,
    entity of file two) whose two vectors exist and are non-zero gets one
    line "entityOne, entityTwo, euclidean, cosine" in the output file, which
    is created inside OUTPUT.

    Parameters:
    :param corpusEmbeddedModel: file name of the trained corpus model. It is
        split on "_" and parts 1, 4 and 6 are used to name the output file,
        so the name must contain at least seven "_"-separated parts.
    :param model: is the model being used (tf/idf/tfidf).
    :param dataBaseFileOne: is the first database CSV file.
    :param frequencyModelFileOne: frequency model file of the first database.
    :param dataBaseFileTwo: is the second database CSV file.
    :param frequencyModelFileTwo: frequency model file of the second database.
    :param properties: a property name, a list of property names, or None to
        use LISTOFPROPERTIES.
    :param modelFolder: kept for backward compatibility; it is not used.
    :param dataBaseFolder: folder containing both database files.
    :param frequencyFolder: folder containing both frequency model files.
    :return: the tuple ("Outputs", output file name). The folder name is a
        fixed string, while the file itself is written under OUTPUT.
    """
    cEModel = corpusEmbeddedModel.split("_")
    dfOne = readDataFile(dataBaseFileOne, dataBaseFolder)
    dfTwo = readDataFile(dataBaseFileTwo, dataBaseFolder)

    if properties is None:
        listOfAttributs = LISTOFPROPERTIES
    elif isinstance(properties, str):
        listOfAttributs = [properties]
    else:
        listOfAttributs = properties

    fileOne = dataBaseFileOne.split(".csv")[0]
    fileTwo = dataBaseFileTwo.split(".csv")[0]

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    outputCombineFile = ("distancesCrossSimilarity_" + "CorpusModel_" +
                         cEModel[1] + "_win_" + cEModel[4] + "_vec_" +
                         cEModel[6] + "_attribute_" +
                         "-".join(listOfAttributs) + "_weight_" + model +
                         "_" + timestamp + ".csv")

    outputPath = os.path.join(OUTPUT, outputCombineFile)
    with open(outputPath, "a+") as characteristicCombineFile:
        characteristicCombineFile.write(
            "\t".join([fileOne, fileTwo, "euclidean", "cosine"]) + "\n")

        # Vectors depend on a single entity only, so they are built once per
        # entity instead of once per pair (each build loads the model and
        # re-reads the data files).
        vectorsOne = _usableEntityVectors(dfOne, corpusEmbeddedModel,
                                          dataBaseFileOne,
                                          frequencyModelFileOne, model,
                                          listOfAttributs, dataBaseFolder,
                                          frequencyFolder)
        vectorsTwo = []
        if vectorsOne:
            vectorsTwo = _usableEntityVectors(dfTwo, corpusEmbeddedModel,
                                              dataBaseFileTwo,
                                              frequencyModelFileTwo, model,
                                              listOfAttributs,
                                              dataBaseFolder,
                                              frequencyFolder)

        for entityOne, entityVectorOne in vectorsOne:
            for entityTwo, entityVectorTwo in vectorsTwo:
                euclideanDistance, cosineDistance = computeSimilarity(
                    entityVectorOne, entityVectorTwo)
                print(entityOne, " - ", entityTwo, " == ",
                      euclideanDistance, cosineDistance)
                characteristicCombineFile.write("\t".join([
                    entityOne,
                    entityTwo,
                    str(euclideanDistance),
                    str(cosineDistance),
                ]) + "\n")
            # Make finished rows visible while a long run is in progress.
            characteristicCombineFile.flush()
    return "Outputs", outputCombineFile
def _weightedVectorSum(entityDataFrame, attributeVector, vectorSize):
    """Sum the word vectors of ``attributeVector`` scaled by their weight, or
    return None when ``attributeVector`` is empty or of an unsupported type."""
    if not attributeVector:
        return None

    if isinstance(attributeVector, dict):
        sumVector = np.zeros(vectorSize, dtype="float64")
        wordColumn = entityDataFrame.loc[:, "word"]
        for word in wordColumn.values:
            coef = None
            for attribute in attributeVector:
                wordVectors = attributeVector[attribute]
                if word not in wordVectors:
                    continue
                if coef is None:
                    # The weight is read from the third column of the first
                    # row describing this word.
                    coef = entityDataFrame.loc[wordColumn == word].values[0,
                                                                         2]
                sumVector += np.array(wordVectors[word],
                                      dtype="float64") * coef
        return sumVector

    if isinstance(attributeVector, list):
        finalVector = np.zeros(vectorSize, dtype="float64")
        for attribute in attributeVector:
            partialVector = _weightedVectorSum(entityDataFrame, attribute,
                                               vectorSize)
            # Empty entries (e.g. a property with an empty cell) contribute
            # nothing instead of breaking the sum with None.
            if partialVector is not None:
                finalVector += partialVector
        return finalVector

    return None


def usableAttributeVector(frequencyModelFile,
                          model,
                          entity,
                          attributeVector,
                          vectorSize,
                          frequencyModelFolder="Outputs"):
    """
    This function returns a usable vector of an entity from a given
    frequency file.

    For the tf / tf-idf models, the rows of the frequency file whose
    "entity" column equals ``entity`` (compared as strings) are selected.
    Every word of those rows that is present in ``attributeVector`` has its
    vector multiplied by the value found in the third column of the word's
    row, and all weighted vectors are summed.

    Parameters:
    :param frequencyModelFile: is the csv file containing words and their
        frequencies (tf/idf/tfidf), loaded with readDataFile
    :param model: is the model being used (tf/idf/tfidf)
    :param entity: is the URI of the entity we are looking for
    :param attributeVector: a dictionary {attribute: {word: vector}} or a
        list of such dictionaries, as returned by getAttributeVector
    :param vectorSize: is the size of word vectors from the embedding model
    :param frequencyModelFolder: folder containing the frequency file
    :return: a float64 array of length ``vectorSize``, or None when
        ``attributeVector`` is empty or of another type, when the model is
        "idf"/"IDF", or when the model name is not recognised.
    """
    print("### attributeVector")
    print(attributeVector)
    print("###")
    frequencyDataFrame = readDataFile(frequencyModelFile, frequencyModelFolder)
    if model == "idf" or model == "IDF":
        # NOTE(review): this branch only reads two names that are not defined
        # in this function and produces no vector -- the idf weighting looks
        # unfinished; confirm before relying on it.
        modelValue = vocabCount.idf_
        modelVocabulary = countMatrix.get_feature_names()
        return None

    if model in ["TF-IDF", "tf-idf", "TFIDF", "tfidf", "TF", "tf"]:
        # Select the rows of this entity. The previous chained assignment
        # overwrote the whole "entity" column with ``entity``, which made
        # every row of the file match.
        entityMask = frequencyDataFrame.loc[:, "entity"].astype(str) == str(
            entity)
        entityDataFrame = frequencyDataFrame.loc[entityMask, :]
        print("### entity frame")
        print(entityDataFrame)
        print("###")
        print("### list of words")
        print(entityDataFrame.loc[:, "word"].values)
        print("###")
        return _weightedVectorSum(entityDataFrame, attributeVector,
                                  vectorSize)

    return None
Example #9
0
def evaluation(groundFile,
               groundColumnName,
               resultFile,
               resultColumnName,
               threshold=None,
               distance=None,
               groundFileFolder="Outputs",
               resultFileFolder="Outputs",
               plot=False):
    """
    This function takes the ground file and the result file, counts the
    ground-truth couples that are confirmed by the result file and writes a
    small report (recall and precision) in a new text file under OUTPUT.

    Parameters:
    :param groundFile: is the ground truth file name containing the matches
        of entities from both knowledge based files
    :param groundColumnName: the two column names corresponding to the matches.
    :param resultFile: is the result file from cross calculations of
        distances. Its name is split on "_" and parts 2, 4, 6, 8 and 10 are
        copied into the report, so it must contain at least eleven
        "_"-separated parts.
    :param resultColumnName: columns of interest from the result file; only
        its first item is used, to find the rows of the first entity.
    :param threshold: the value that the distances should satisfy.
        A number -> evaluate for that threshold.
        A list of numbers (with plot=True) -> evaluate each one and plot.
        default -> None
    :param distance: is the type of distance being used; the distance is read
        from column ``distance + 1`` of the result row, so it is required
        when threshold is a number.
        default -> None
        1 -> euclidean
        2 -> cosine
    :param groundFileFolder: folder containing the ground file
    :param resultFileFolder: folder containing the result file
    :param plot: states if the threshold-precision graph should be plotted
        True -> plot graph
        False -> do not plot graph
    :return: (precision, recall) when threshold is a number, None otherwise.
        recall is matches / ground rows and precision is matches / result
        rows; each is 0.0 when its denominator is 0.
    """
    groundFrame = readDataFile(groundFile, groundFileFolder)
    groundRows = groundFrame.shape[0]
    resultFrame = readDataFile(resultFile, resultFileFolder)
    resultRows = resultFrame.shape[0]
    extractedGround = groundFrame[groundColumnName]
    distanceInfo = resultFile.split("_")
    countMatch = 0
    outputevaluationFile = "evaluation" + datetime.now().strftime(
        "%Y%m%d%H%M%S") + ".txt"
    evaluationPath = os.path.join(OUTPUT, outputevaluationFile)

    # Each value gets its own line so it no longer runs into the next label.
    reportHeader = [
        "Ground file ", groundFile,
        "Result file ", resultFile,
        "Corpus Model ", distanceInfo[2],
        "Corpus Model window size ", distanceInfo[4],
        "Corpus Model vector dimension ", distanceInfo[6],
        "Corpus Model attribute ", " ".join(distanceInfo[8].split("-")),
        "Weight coef ", distanceInfo[10],
    ]
    with open(evaluationPath, "a+") as f:
        f.write("\n".join(reportHeader) + "\n")

    if isinstance(threshold, (int, float)):
        for index, row in extractedGround.iterrows():
            couple = [row[groundColumnName[0]], row[groundColumnName[1]]]
            print("### groud couple")
            print(couple)
            print("###")
            matchFrame = resultFrame[resultFrame[resultColumnName[0]] ==
                                     couple[0]]
            if matchFrame.empty:
                continue
            # NOTE(review): only the first result row of the first entity is
            # compared with the ground couple -- confirm this is intended
            # when the result file holds several rows per entity.
            firstMatch = matchFrame.values[0]
            if (firstMatch[1] and firstMatch[1] == couple[1]
                    and firstMatch[distance + 1] >= threshold):
                countMatch += 1
                print("### countMatch")
                print(countMatch)
                print("###")
                print("### matchFrame")
                print(matchFrame.values)
                print("###")

        recall = countMatch / groundRows if groundRows else 0.0
        precision = countMatch / resultRows if resultRows else 0.0
        with open(evaluationPath, "a+") as f:
            f.write("Recall: \n")
            f.write(str(recall) + "\n")
            f.write("Precision: \n")
            f.write(str(precision) + "\n")
        return precision, recall

    elif isinstance(threshold, list) and plot:
        listOfPrecision = []
        print("### list of threshold")
        print(threshold)
        print("###")
        for th in threshold:
            print("### th in threshold")
            print(th)
            print("###")
            print()
            prec, rec = evaluation(groundFile, groundColumnName, resultFile,
                                   resultColumnName, th, distance,
                                   groundFileFolder, resultFileFolder, False)
            listOfPrecision.append(prec)

        print("### listOfPrecision")
        print(listOfPrecision)
        print("###")
        fig = plt.figure()
        plt.plot(threshold, listOfPrecision, 'ro')
        plt.axis([0, max(threshold), 0, 1])
        fig.savefig(
            os.path.join(
                OUTPUT, "evaluation" + "_plot_" +
                datetime.now().strftime("%Y%m%d%H%M%S") + ".png"))
        # Figures stay registered in pyplot until closed explicitly.
        plt.close(fig)