def test_semanticSimilarity(self):
    """
    Check LanguageBuilder.semanticSimilarity against the 0.5 threshold.

    "papa" / "patata" must score above 0.5; an empty string and the
    word "hormigon" must both score below 0.5 against "patata".
    """
    # assertGreater / assertLess report both operands on failure, which
    # a bare assertTrue(a > b) does not.
    self.assertGreater(
        LanguageBuilder().semanticSimilarity("papa", "patata"), 0.5)
    self.assertLess(
        LanguageBuilder().semanticSimilarity("", "patata"), 0.5)
    self.assertLess(
        LanguageBuilder().semanticSimilarity("hormigon", "patata"), 0.5)
    def extractData(self,
                    personalData: PersonalData = PersonalData.all) -> tuple:
        """
        Extract personal data from the tokens produced for ``self.soup``.

        Tokens outside a table are searched directly. Table tokens go
        through a DataPickerInTables: a HEAD token selects the columns whose
        header is semantically close to a word in ``listOfVectorWords``, and
        ROW tokens feed those columns to the picker while the remaining
        cells are scanned for id cards.

        :param personalData: PersonalData selecting names, id cards or both
        :return: tuple(listNames, idCards)
        """
        wantsNames = personalData != PersonalData.idCards
        wantsIdCards = personalData != PersonalData.names

        listNames = []
        idCards = []
        picker = DataPickerInTables()

        def cleanPicker():
            # Move whatever the picker has collected into listNames and
            # reset it for the next table.
            if wantsNames and not picker.isEmpty():
                listNames.extend(
                    picker.getAllNames(self.dataSearch.checkNamesInDB,
                                       MEASURE_FOR_TEXTS_WITHOUT_CONTEXTS))
                picker.clear()

        def refersToNames(header):
            # True when the header is close enough to any reference word.
            return any(
                LanguageBuilder().semanticSimilarity(header, word) >
                MEASURE_TO_COLUMN_KEY_REFERS_TO_NAMES
                for word in listOfVectorWords)

        for token in TokenizerHtml(self.soup).getToken():
            if token.isTable == TableToken.NONE:
                text = token.text[0]
                if LanguageBuilder().hasContex(text):
                    names, ids = self.dataSearch.searchPersonalData(
                        text, personalData)
                    listNames.extend(names)
                    idCards.extend(ids)
                elif wantsNames and self.dataSearch.isName(text):
                    listNames.append(text)
                cleanPicker()
            elif token.isTable == TableToken.HEAD and wantsNames:
                cleanPicker()
                # Work by position: looking the header up again with
                # list.index() would map repeated header texts onto the
                # first matching column only.
                for index, header in enumerate(token.text):
                    if refersToNames(header):
                        picker.addIndexColumn(index)
            elif token.isTable == TableToken.ROW:
                for index in picker.getIndexesColumn():
                    try:
                        picker.addName(index, token.text[index])
                    except IndexError:
                        # Row is shorter than the header; skip that column.
                        continue
                if wantsIdCards:
                    nameColumns = set(picker.getIndexesColumn())
                    for index, text in enumerate(token.text):
                        if index not in nameColumns:
                            idCards.extend(self.dataSearch.giveIdCards(text))

        # A table that ends the token stream is never followed by a token
        # that triggers a flush, so flush once more here (no-op when the
        # picker is already empty).
        cleanPicker()
        return listNames, idCards
Ejemplo n.º 3
0
    def getPersonalDataInTexts(self, text: Text, listNames: list,
                               idCards: list, personalData: PersonalData):
        """
        Get personal data in a pdf text.

        Results are appended to ``listNames`` and ``idCards`` in place;
        nothing is returned.

        :param text: text, split on newlines (assumed to be a str)
        :param listNames: list of string, extended with the names found
        :param idCards: list of string, extended with the id cards found
        :param personalData: PersonalData
        """
        lines = text.split('\n')

        if personalData != PersonalData.idCards:
            # Partition the lines in one pass instead of re-testing
            # membership in a list for every line.
            linesWithContext = []
            linesWithoutContext = []
            for line in lines:
                if LanguageBuilder().hasContex(line):
                    linesWithContext.append(line)
                else:
                    linesWithoutContext.append(line)

            # Lines without context are accepted whole when they are a name.
            listNames.extend([
                line for line in linesWithoutContext
                if self.dataSearch.isName(line)
            ])
            names, _ = self.dataSearch.searchPersonalData(
                ' '.join(linesWithContext), PersonalData.names)
            listNames.extend(names)

        if personalData != PersonalData.names:
            # Join the lines, not the raw string: ' '.join(text) on a str
            # would put a space between every single character.
            _, ids = self.dataSearch.searchPersonalData(
                ' '.join(lines), PersonalData.idCards)
            idCards.extend(ids)
Ejemplo n.º 4
0
 def __init__(self):
     """
     Initialise the counters, the language object and the error lists.
     """
     # Counters all start at zero.
     self.hits = self.falsePositives = self.falseNegatives = 0
     # Language object supplied by LanguageBuilder.
     builder = LanguageBuilder()
     self.nlp = builder.getlanguage()
     # Separate lists so the two kinds of error are tracked independently.
     self.listOfFalseNegatives = list()
     self.listOfFalsePositives = list()
Ejemplo n.º 5
0
    def getPersonalDataInTables(self, tables: list, listNames: list,
                                idCards: list, lastKey: list,
                                personalData: PersonalData) -> list:
        """
        Get personal data in a pdf table.

        The first row of each table is treated as a header: columns whose
        cell is semantically close to a word in ``listOfVectorWords`` become
        name columns. When no header cell matches, the columns in
        ``lastKey`` are reused and the first row is processed as data.

        :param tables: list of tables, each a list of rows of cells
        :param listNames: list of string, extended with the names found
        :param idCards: list of string, extended with the id cards found
        :param lastKey: list of column indexes used by the previous table
        :param personalData: PersonalData
        :return: the column indexes last used as name columns
        """
        wantsNames = personalData != PersonalData.idCards
        wantsIdCards = personalData != PersonalData.names

        def refersToNames(cell):
            # True when the cell is close enough to any reference word.
            return any(
                LanguageBuilder().semanticSimilarity(cell, word) >
                MEASURE_TO_COLUMN_KEY_REFERS_TO_NAMES
                for word in listOfVectorWords)

        for table in tables:
            namePicker = DataPickerInTables()
            for index, row in enumerate(table):
                if wantsNames and index == 0:
                    # Collect positions directly; row.index(cell) would map
                    # repeated cell texts onto the first occurrence only.
                    key = [
                        column for column, cell in enumerate(row)
                        if refersToNames(cell)
                    ]
                    if key:
                        lastKey = key
                        namePicker.addIndexesColumn(key)
                        continue  # a real header row carries no data
                    # No header found: reuse the previous columns and fall
                    # through so this row is handled as data.
                    namePicker.addIndexesColumn(lastKey)

                nameColumns = {
                    column for column in range(len(row))
                    if namePicker.isColumnName(column)
                }
                if wantsNames:
                    for column, cell in enumerate(row):
                        if column in nameColumns:
                            namePicker.addName(column, cell)

                if wantsIdCards:
                    # Exclude name cells by position, not by value, so a
                    # cell that repeats a name's text is still scanned.
                    for column, cell in enumerate(row):
                        if column not in nameColumns:
                            idCards.extend(self.dataSearch.giveIdCards(cell))

            if wantsNames:
                # Collect once per table, after every row has been added;
                # doing it per row re-appends the names gathered so far.
                listNames.extend(
                    namePicker.getAllNames(self.dataSearch.checkNamesInDB,
                                           MEASURE_FOR_TEXTS_WITHOUT_CONTEXTS))
        return lastKey
Ejemplo n.º 6
0
    def getPossibleColumnsNames(self, df: pd.DataFrame) -> typeOfColumn:
        """
        You get the possible columns containing any name or surname.

        This is a generator: it yields one typeOfColumn per object-typed
        column, whose second field is True when the column key is
        semantically close to a word in ``listOfVectorWords`` and False
        otherwise. Columns of any other dtype are skipped.

        :param df: pandas DataFrame
        :return: generator of typeOfColumn
        """
        for key, typeColumn in zip(df.keys(), df.dtypes):
            if typeColumn == object:
                refersToNames = any(
                    LanguageBuilder().semanticSimilarity(key, word) >
                    MEASURE_TO_COLUMN_KEY_REFERS_TO_NAMES
                    for word in listOfVectorWords)
                # Exactly one result per column: previously a matching
                # column was yielded as True and then again as False.
                yield typeOfColumn(key, refersToNames)
Ejemplo n.º 7
0
    def extractData(self,
                    personalData: PersonalData = PersonalData.all) -> tuple:
        """
        Extracts personal data from a document.

        The file at ``self.path`` is read line by line as UTF-8. Lines with
        context are passed to ``searchPersonalData``; other lines are kept
        whole when they are a name and names were requested.

        :param personalData: PersonalData
        :return: tuple(names, DNIs)
        """
        wantsNames = personalData != PersonalData.idCards

        listNames = []
        idCards = []
        with open(self.path, 'r', encoding='utf8') as file:
            for rawLine in file:
                # Remove only the line terminator. Slicing off the last
                # character unconditionally would truncate a final line
                # that has no trailing newline.
                line = rawLine.rstrip('\n')
                if LanguageBuilder().hasContex(line):
                    names, ids = self.dataSearch.searchPersonalData(
                        line, personalData)
                    listNames.extend(names)
                    idCards.extend(ids)
                elif wantsNames and self.dataSearch.isName(line):
                    listNames.append(line)
        return listNames, idCards
Ejemplo n.º 8
0
 def __init__(self):
     """
     Run the parent initialiser, then attach the language object.
     """
     super().__init__()
     # Language object supplied by LanguageBuilder.
     builder = LanguageBuilder()
     self.nlp = builder.getlanguage()
Ejemplo n.º 9
0
 def __init__(self):
     """
     Run the parent initialiser, then attach the rule label and the
     rule-based language object.
     """
     super().__init__()
     # Each value comes from its own LanguageBuilder() call, label first.
     labelName = LanguageBuilder().getLabelNameOfRules()
     rulesLanguage = LanguageBuilder().getlanguageByRules()
     self.label = labelName
     self.nlp = rulesLanguage
 def test_hasContex(self):
     """
     A full sentence must be reported as having context; bare upper-case
     names and the empty string must not.
     """
     self.assertTrue(LanguageBuilder().hasContex("Hola, que tal?"))
     for text in ("MIGUEL", "MIGUEL ÁNGEL", ""):
         self.assertFalse(LanguageBuilder().hasContex(text))
Ejemplo n.º 11
0
    def extractData(self,
                    personalData: PersonalData = PersonalData.all) -> tuple:
        """
        Extract personal data from the blocks of ``self.document``.

        Paragraphs are searched directly. For tables, the first row is
        treated as a header: columns whose cell is semantically close to a
        word in ``listOfVectorWords`` become name columns. When no header
        cell matches, the name columns of the previous table are reused and
        the first row is processed as data. Blocks that are neither a
        Paragraph nor a Table are ignored.

        :param personalData: PersonalData selecting names, id cards or both
        :return: tuple(listNames, listIdCard)
        """
        wantsNames = personalData != PersonalData.idCards
        wantsIdCards = personalData != PersonalData.names

        def refersToNames(cellText):
            # True when the cell is close enough to any reference word.
            return any(
                LanguageBuilder().semanticSimilarity(cellText, word) >
                MEASURE_TO_COLUMN_KEY_REFERS_TO_NAMES
                for word in listOfVectorWords)

        lastKey = []
        listNames = []
        listIdCard = []
        for block in itemIterator(self.document):
            if isinstance(block, Paragraph):
                text = block.text
                if LanguageBuilder().hasContex(text):
                    names, ids = self.dataSearch.searchPersonalData(
                        text, personalData)
                    listNames.extend(names)
                    listIdCard.extend(ids)
                elif wantsNames and self.dataSearch.isName(text):
                    listNames.append(text.strip())
                elif wantsIdCards:
                    _, ids = self.dataSearch.searchPersonalData(
                        text, PersonalData.idCards)
                    listIdCard.extend(ids)
            elif isinstance(block, Table):
                namePicker = DataPickerInTables()
                for index, row in enumerate(block.rows):
                    rowText = [cell.text for cell in row.cells]
                    if index == 0:
                        # Collect positions directly; rowText.index(cell)
                        # would map repeated cell texts onto the first
                        # occurrence only.
                        key = [
                            column for column, cellText in enumerate(rowText)
                            if refersToNames(cellText)
                        ]
                        if key:
                            lastKey = key
                            namePicker.addIndexesColumn(key)
                            continue  # a real header row carries no data
                        # No header found: reuse the previous columns and
                        # fall through so this row is handled as data.
                        namePicker.addIndexesColumn(lastKey)

                    nameColumns = {
                        column for column in range(len(rowText))
                        if namePicker.isColumnName(column)
                    }
                    if wantsNames:
                        for column, cellText in enumerate(rowText):
                            if column in nameColumns:
                                namePicker.addName(column, cellText)

                    if wantsIdCards:
                        # Exclude name cells by position, not by value, so
                        # a cell that repeats a name's text is still scanned.
                        for column, cellText in enumerate(rowText):
                            if column not in nameColumns:
                                listIdCard.extend(
                                    self.dataSearch.giveIdCards(cellText))

                if wantsNames:
                    listNames.extend(
                        namePicker.getAllNames(
                            self.dataSearch.checkNamesInDB,
                            MEASURE_FOR_TEXTS_WITHOUT_CONTEXTS))
        return listNames, listIdCard