Example #1
    def preprocessing(self):
        self.df = pd.read_csv('static/models/resampled_comments_1.csv')
        self.comments = self.df[['comment', 'rating', 'sentiment']].copy()
        self.comments['comment'] = self.comments['comment'].map(
            lambda x: x.lower())

        tokenizer = RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')
        token = self.comments.apply(
            lambda row: tokenizer.tokenize(row['comment']), axis=1)

        stop_words = set(stopwords.words('french'))
        stop_token = token.apply(
            lambda x: [item for item in x if item not in stop_words])

        stemmer = SnowballStemmer(language='french')
        stemm = stop_token.apply(lambda x: [stemmer.stem(y) for y in x])

        lemmatizer = FrenchLefffLemmatizer()
        lemm = stemm.apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

        for i in range(len(lemm)):
            lemm[i] = ' '.join(lemm[i])

        self.comments['lemmatiser_com'] = lemm
        data = self.comments[['comment', 'lemmatiser_com', 'sentiment']]

        self.df = pd.DataFrame(data)
        return self.df
Example #2
def lemma(liste):
    """Lemmatize each token in the list with the LEFFF lemmatizer."""
    lemmatizer = FrenchLefffLemmatizer()
    listepleine = []
    for u in liste:
        v = lemmatizer.lemmatize(u)
        listepleine.append(v)
    return listepleine
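A minimal usage sketch for the helper above; the sample tokens are hypothetical, and the expected lemma for 'voitures' matches the library's test suite shown further down this page.

from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

tokens = ['voitures', 'abbayes', 'bliblis']   # hypothetical input tokens
print(lemma(tokens))                          # e.g. ['voiture', 'abbaye', 'bliblis']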
Example #3
 def lemmatize_verbs(self, words):
     """Lemmatize verbs in list of tokenized words"""
     if self._lang == 'fr':
         lemmatizer = FrenchLefffLemmatizer()
     else:
         lemmatizer = WordNetLemmatizer()
     lemmas = []
     for word in words:
         lemma = lemmatizer.lemmatize(word, pos='v')
         lemmas.append(lemma)
     return lemmas
Example #4
def french_processing_text(df, text_field, POS_JAR, POS_MODEL):
    tokenizer = RegexpTokenizer(r'\w+')
    pos_tagger = StanfordPOSTagger(POS_MODEL, POS_JAR, encoding='utf8')
    lemmatizer = FrenchLefffLemmatizer()
    stop = stopwords.words('french')

    # Store processed original dataset
    new_text = []

    # Store normalized words (1-gram) and its POS
    words = []
    pos = []

    for text in df[text_field]:  # a text may have multiple sentences
        new_sentences = ''
        # Process one sentence at a time, because POS tagging works per sentence
        for sentence in text.split('.'):
            processed_sentence = french_processing_sentence(
                sentence, tokenizer, pos_tagger, lemmatizer, stop)
            new_sentences += ' '.join(processed_sentence) + ' '
        new_text.append(new_sentences)

    df['processed_text'] = new_text

    return df
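A minimal call sketch for the function above, assuming a local Stanford POS tagger install; the jar and model paths and the sample DataFrame are hypothetical, and french_processing_sentence is assumed to be defined in the same module.

import pandas as pd

POS_JAR = 'stanford-postagger.jar'      # hypothetical path to the tagger jar
POS_MODEL = 'models/french.tagger'      # hypothetical path to the French model file

df = pd.DataFrame({'review': ["Les voitures sont belles. Le service était lent."]})
df = french_processing_text(df, 'review', POS_JAR, POS_MODEL)
print(df['processed_text'].iloc[0])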
Example #5
class FrenchLemmatizer(object):
    name = 'lefff'

    SPACY_WORDNET_DIC = {'ADJ': 'a', 'ADV': 'r', 'NOUN': 'n', 'VERB': 'v'}

    def __init__(self):
        # register your new attribute token._.lefff_lemma
        Token.set_extension('lefff_lemma', default=None)

        self.lemmatizer = FrenchLefffLemmatizer()

    def __call__(self, doc):
        for token in doc:
            wn_pos = self.SPACY_WORDNET_DIC.get(token.pos_, token.pos_)
            lemma = self.lemmatizer.lemmatize(token.text, wn_pos)

            # TODO: return only ONE lemma
            if not isinstance(lemma, list):
                token._.lefff_lemma = lemma
            elif len(lemma) == 1:
                token._.lefff_lemma = lemma[0][0]
            else:
                # not lemmatized word or None or empty?
                token._.lefff_lemma = ''
        return doc
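A minimal pipeline sketch for the component above, assuming spaCy 2.x (where component instances are passed directly to add_pipe) and that a French model such as fr_core_news_sm is installed; both are assumptions, not part of the original snippet.

import spacy

nlp = spacy.load('fr_core_news_sm')                # assumed French model
nlp.add_pipe(FrenchLemmatizer(), after='tagger')   # spaCy 2.x-style component registration
doc = nlp("Les voitures roulent vite")
print([(token.text, token._.lefff_lemma) for token in doc])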
Example #6
 def test_french_lefff_lemmatizer_when_lefff_files_are_empty_expect_empty_table(
         self):
     lemmatizer = FrenchLefffLemmatizer(
         lefff_file_path=self.path_file,
         lefff_additional_file_path=self.path_file)
     self.assertEqual(
         2,  # Corresponds to the triplets to add (errors)
         len(lemmatizer.LEFFF_TABLE))
Example #7
class TestIsWordnetApi(unittest.TestCase):
    def setUp(self):
        # To make the test faster, we load an empty file.
        self.lemmatizer = FrenchLefffLemmatizer(open('file.txt', 'w').close())

    def tearDown(self):
        if os.path.exists('file.txt'):
            os.remove('file.txt')

    def test_is_wordnet_api_when_pos_expect_true(self):
        self.assertTrue(self.lemmatizer.is_wordnet_pos('a'))
        self.assertTrue(self.lemmatizer.is_wordnet_pos('n'))
        self.assertTrue(self.lemmatizer.is_wordnet_pos('r'))
        self.assertTrue(self.lemmatizer.is_wordnet_pos('v'))

    def test_is_wordnet_api_when_not_pos_expect_false(self):
        self.assertFalse(self.lemmatizer.is_wordnet_pos('adj'))
Example #8
def simpleFilter(sentence):
    filtered_sent = []
    
    lemmatizer = None
    if targetLang == "french":
        lemmatizer = FrenchLefffLemmatizer()
    else:
        lemmatizer = WordNetLemmatizer()
    
    
    stop_words = set(stopwords.words(targetLang))
    words = word_tokenize(sentence)

    for w in words:
        if w not in stop_words:
            filtered_sent.append(lemmatizer.lemmatize(w))

    return filtered_sent
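A minimal usage sketch; targetLang is the module-level variable read by the function, and the snippet assumes the NLTK 'punkt' and 'stopwords' data have been downloaded.

targetLang = "french"
print(simpleFilter("Les voitures ne roulent pas vite."))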
Example #9
class TestLemmatize(unittest.TestCase):
    lemmatizer = FrenchLefffLemmatizer()

    def test_lemmatize_when_pos_default_expect_correct_lemmatized_word(self):
        self.assertEqual('voiture', self.lemmatizer.lemmatize('voitures'))

    def test_lemmatize_when_pos_n_expect_correct_lemmatized_word(self):
        self.assertEqual('abbaye', self.lemmatizer.lemmatize('abbayes',
                                                             pos='n'))

    def test_lemmatize_when_pos_np_expect_correct_couple(self):
        self.assertEqual([('Nantes', 'np')],
                         self.lemmatizer.lemmatize('Nantes', pos='np'))

    def test_lemmatize_when_pos_does_not_correspond_expect_raw_word(self):
        self.assertEqual('Nantes', self.lemmatizer.lemmatize('Nantes',
                                                             pos='n'))

    def test_lemmatize_when_pos_does_not_exist_expect_correct_couple(self):
        self.assertEqual(('Nantes', 'np'),
                         self.lemmatizer.lemmatize('Nantes', pos='x'))

    def test_lemmatize_when_word_does_not_exists_expect_raw_word(self):
        self.assertEqual('bliblis', self.lemmatizer.lemmatize('bliblis'))

    def test_lemmatize_when_word_and_pos_do_not_exist_expect_empty_list(self):
        self.assertEqual(list(), self.lemmatizer.lemmatize('bliblis', pos='x'))

    def test_lemmatize_when_pos_a_expect_correct_lemmatized_word(self):
        self.assertEqual('court', self.lemmatizer.lemmatize('courtes',
                                                            pos='a'))

    def test_lemmatize_when_pos_r_expect_correct_lemmatized_word(self):
        self.assertEqual('dernièrement',
                         self.lemmatizer.lemmatize('dernièrement', pos='r'))

    def test_lemmatize_when_pos_v_expect_correct_lemmatized_word(self):
        self.assertEqual('manger', self.lemmatizer.lemmatize('manges',
                                                             pos='v'))

    def test_lemmatize_when_uncorrect_pos_expect_list_of_couples(self):
        expected_result = set(x
                              for x in [('voiture', 'nc'), ('voiturer', 'v')])
        self.assertEqual(expected_result,
                         set(self.lemmatizer.lemmatize('voitures', pos='x')))

    def test_lemmatize_when_pos_all_expect_all_lemmas(self):
        expected_result = {('avion', 'nc'), ('avoir', 'auxAvoir'),
                           ('avoir', 'v')}
        self.assertEqual(expected_result,
                         set(self.lemmatizer.lemmatize('avions', pos='all')))
Example #10
class FilterLefffPos(unittest.TestCase):
    def setUp(self):
        # To make the test faster, we load an empty file.
        self.lemmatizer = FrenchLefffLemmatizer(open('file.txt', 'w').close())

    def tearDown(self):
        if os.path.exists('file.txt'):
            os.remove('file.txt')

    def test_filter_lefff_pos_when_list_expect_filtered_list(self):
        self.assertSetEqual({'adj', 'v'},
                            self.lemmatizer.filter_lefff_pos(
                                ['a', 'v', 'auxAvoir', 'wrong_pos']))

    def test_filter_lefff_pos_when_only_wrong_pos_expect_none(self):
        self.assertIsNone(self.lemmatizer.filter_lefff_pos(['wrong_pos']))

    def test_filter_lefff_pos_when_list_correct_pos_expect_correct_pos(self):
        self.assertSetEqual({'nc'}, self.lemmatizer.filter_lefff_pos(['n']))

    def test_filter_lefff_pos_when_not_list_expect_pos_tags(self):
        self.assertSetEqual({'nc', 'adj'},
                            self.lemmatizer.filter_lefff_pos('na'))
Example #11
        def process_input(self,
                          input_description,
                          stem=False,
                          undesired_words=[]):
            '''
            Description:

                Apply the processing steps to an input text.

            Parameters:

                - input_description (String): the text to process.
                - stem (Boolean): whether the model should use stemmed descriptions or not.
                - undesired_words (List[String]): list of words to filter out in the processing step.
            '''

            tokenizer_ = RegexpTokenizer(r'[^\d\W]+')
            stemmer_ = FrenchStemmer()
            lemmatizer_ = FrenchLefffLemmatizer()
            stop_words_ = stopwords.words('french')

            tokens = list(
                map(lambda x: x.lower(),
                    tokenizer_.tokenize(input_description)))
            tokens = list(
                filter(
                    lambda x: x not in stop_words_ and len(x) >= 3 and x not in
                    undesired_words, tokens))
            tokens = [
                unidecode(lemmatizer_.lemmatize(token)) for token in tokens
            ]

            if stem:
                return ' '.join(list(map(lambda x: stemmer_.stem(x), tokens)))

            else:
                return ' '.join(tokens)
Example #12
class ClusteringArticles(object):
    def __init__(self, articles):
        self.articles = articles
        self.stemmer = FrenchStemmer()
        self.lemmetizer = FrenchLefffLemmatizer()
        self.liste_french = [
            "demain",
            "iii",
            "ii",
            "reuters",
            "lundi",
            "mardi",
            "mercredi",
            "jeudi",
            "vendredi",
            "samedi",
            "dimanche",
            #                            "janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre",
            "fin",
            "afp",
            "déjà",
            "ok",
            "ca",
            "cas",
            "a",
            "abord",
            "absolument",
            "afin",
            "ah",
            "ai",
            "aie",
            "aient",
            "aies",
            "ailleurs",
            "ainsi",
            "ait",
            "allaient",
            "allo",
            "allons",
            "allô",
            "alors",
            "anterieur",
            "anterieure",
            "anterieures",
            "apres",
            "après",
            "as",
            "assez",
            "attendu",
            "au",
            "aucun",
            "aucune",
            "aucuns",
            "aujourd",
            "aujourd'hui",
            "aupres",
            "auquel",
            "aura",
            "aurai",
            "auraient",
            "aurais",
            "aurait",
            "auras",
            "aurez",
            "auriez",
            "aurions",
            "aurons",
            "auront",
            "aussi",
            "autre",
            "autrefois",
            "autrement",
            "autres",
            "autrui",
            "aux",
            "auxquelles",
            "auxquels",
            "avaient",
            "avais",
            "avait",
            "avant",
            "avec",
            "avez",
            "aviez",
            "avions",
            "avoir",
            "avons",
            "ayant",
            "ayez",
            "ayons",
            "b",
            "bah",
            "bas",
            "basee",
            "bat",
            "beau",
            "beaucoup",
            "bien",
            "bigre",
            "bon",
            "boum",
            "bravo",
            "brrr",
            "c",
            "car",
            "ce",
            "ceci",
            "cela",
            "celle",
            "celle-ci",
            "celle-là",
            "celles",
            "celles-ci",
            "celles-là",
            "celui",
            "celui-ci",
            "celui-là",
            "celà",
            "cent",
            "cependant",
            "certain",
            "certaine",
            "certaines",
            "certains",
            "certes",
            "ces",
            "cet",
            "cette",
            "ceux",
            "ceux-ci",
            "ceux-là",
            "chacun",
            "chacune",
            "chaque",
            "cher",
            "chers",
            "chez",
            "chiche",
            "chut",
            "chère",
            "chères",
            "ci",
            "cinq",
            "cinquantaine",
            "cinquante",
            "cinquantième",
            "cinquième",
            "clac",
            "clic",
            "combien",
            "comme",
            "comment",
            "comparable",
            "comparables",
            "compris",
            "concernant",
            "contre",
            "couic",
            "crac",
            "d",
            "da",
            "dans",
            "de",
            "debout",
            "dedans",
            "dehors",
            "deja",
            "delà",
            "depuis",
            "dernier",
            "derniere",
            "derriere",
            "derrière",
            "des",
            "desormais",
            "desquelles",
            "desquels",
            "dessous",
            "dessus",
            "deux",
            "deuxième",
            "deuxièmement",
            "devant",
            "devers",
            "devra",
            "devrait",
            "different",
            "differentes",
            "differents",
            "différent",
            "différente",
            "différentes",
            "différents",
            "dire",
            "directe",
            "directement",
            "dit",
            "dite",
            "dits",
            "divers",
            "diverse",
            "diverses",
            "dix",
            "dix-huit",
            "dix-neuf",
            "dix-sept",
            "dixième",
            "doit",
            "doivent",
            "donc",
            "dont",
            "dos",
            "douze",
            "douzième",
            "dring",
            "droite",
            "du",
            "duquel",
            "durant",
            "dès",
            "début",
            "désormais",
            "e",
            "effet",
            "egale",
            "egalement",
            "egales",
            "eh",
            "elle",
            "elle-même",
            "elles",
            "elles-mêmes",
            "en",
            "encore",
            "enfin",
            "entre",
            "envers",
            "environ",
            "es",
            "essai",
            "est",
            "et",
            "etant",
            "etc",
            "etre",
            "eu",
            "eue",
            "eues",
            "euh",
            "eurent",
            "eus",
            "eusse",
            "eussent",
            "eusses",
            "eussiez",
            "eussions",
            "eut",
            "eux",
            "eux-mêmes",
            "exactement",
            "excepté",
            "extenso",
            "exterieur",
            "eûmes",
            "eût",
            "eûtes",
            "f",
            "fais",
            "faisaient",
            "faisant",
            "fait",
            "faites",
            "façon",
            "feront",
            "fi",
            "flac",
            "floc",
            "fois",
            "font",
            "force",
            "furent",
            "fus",
            "fusse",
            "fussent",
            "fusses",
            "fussiez",
            "fussions",
            "fut",
            "fûmes",
            "fût",
            "fûtes",
            "g",
            "gens",
            "h",
            "ha",
            "haut",
            "hein",
            "hem",
            "hep",
            "hi",
            "ho",
            "holà",
            "hop",
            "hormis",
            "hors",
            "hou",
            "houp",
            "hue",
            "hui",
            "huit",
            "huitième",
            "hum",
            "hurrah",
            "hé",
            "hélas",
            "i",
            "ici",
            "il",
            "ils",
            "importe",
            "j",
            "je",
            "jusqu",
            "jusque",
            "juste",
            "k",
            "l",
            "la",
            "laisser",
            "laquelle",
            "las",
            "le",
            "lequel",
            "les",
            "lesquelles",
            "lesquels",
            "leur",
            "leurs",
            "longtemps",
            "lors",
            "lorsque",
            "lui",
            "lui-meme",
            "lui-même",
            "là",
            "lès",
            "m",
            "ma",
            "maint",
            "maintenant",
            "mais",
            "malgre",
            "malgré",
            "maximale",
            "me",
            "meme",
            "memes",
            "merci",
            "mes",
            "mien",
            "mienne",
            "miennes",
            "miens",
            "mille",
            "mince",
            "mine",
            "minimale",
            "moi",
            "moi-meme",
            "moi-même",
            "moindres",
            "moins",
            "mon",
            "mot",
            "moyennant",
            "multiple",
            "multiples",
            "même",
            "mêmes",
            "n",
            "na",
            "naturel",
            "naturelle",
            "naturelles",
            "ne",
            "neanmoins",
            "necessaire",
            "necessairement",
            "neuf",
            "neuvième",
            "ni",
            "nombreuses",
            "nombreux",
            "nommés",
            "non",
            "nos",
            "notamment",
            "notre",
            "nous",
            "nous-mêmes",
            "nouveau",
            "nouveaux",
            "nul",
            "néanmoins",
            "nôtre",
            "nôtres",
            "o",
            "oh",
            "ohé",
            "ollé",
            "olé",
            "on",
            "ont",
            "onze",
            "onzième",
            "ore",
            "ou",
            "ouf",
            "ouias",
            "oust",
            "ouste",
            "outre",
            "ouvert",
            "ouverte",
            "ouverts",
            "o|",
            "où",
            "p",
            "paf",
            "pan",
            "par",
            "parce",
            "parfois",
            "parle",
            "parlent",
            "parler",
            "parmi",
            "parole",
            "parseme",
            "partant",
            "particulier",
            "particulière",
            "particulièrement",
            "pas",
            "passé",
            "pendant",
            "pense",
            "permet",
            "personne",
            "personnes",
            "peu",
            "peut",
            "peuvent",
            "peux",
            "pff",
            "pfft",
            "pfut",
            "pif",
            "pire",
            "pièce",
            "plein",
            "plouf",
            "plupart",
            "plus",
            "plusieurs",
            "plutôt",
            "possessif",
            "possessifs",
            "possible",
            "possibles",
            "pouah",
            "pour",
            "pourquoi",
            "pourrais",
            "pourrait",
            "pouvait",
            "prealable",
            "precisement",
            "premier",
            "première",
            "premièrement",
            "pres",
            "probable",
            "probante",
            "procedant",
            "proche",
            "près",
            "psitt",
            "pu",
            "puis",
            "puisque",
            "pur",
            "pure",
            "q",
            "qu",
            "quand",
            "quant",
            "quant-à-soi",
            "quanta",
            "quarante",
            "quatorze",
            "quatre",
            "quatre-vingt",
            "quatrième",
            "quatrièmement",
            "que",
            "quel",
            "quelconque",
            "quelle",
            "quelles",
            "quelqu'un",
            "quelque",
            "quelques",
            "quels",
            "qui",
            "quiconque",
            "quinze",
            "quoi",
            "quoique",
            "r",
            "rare",
            "rarement",
            "rares",
            "relative",
            "relativement",
            "remarquable",
            "rend",
            "rendre",
            "restant",
            "reste",
            "restent",
            "restrictif",
            "retour",
            "revoici",
            "revoilà",
            "rien",
            "s",
            "sa",
            "sacrebleu",
            "sait",
            "sans",
            "sapristi",
            "sauf",
            "se",
            "sein",
            "seize",
            "selon",
            "semblable",
            "semblaient",
            "semble",
            "semblent",
            "sent",
            "sept",
            "septième",
            "sera",
            "serai",
            "seraient",
            "serais",
            "serait",
            "seras",
            "serez",
            "seriez",
            "serions",
            "serons",
            "seront",
            "ses",
            "seul",
            "seule",
            "seulement",
            "si",
            "sien",
            "sienne",
            "siennes",
            "siens",
            "sinon",
            "six",
            "sixième",
            "soi",
            "soi-même",
            "soient",
            "sois",
            "soit",
            "soixante",
            "sommes",
            "son",
            "sont",
            "sous",
            "souvent",
            "soyez",
            "soyons",
            "specifique",
            "specifiques",
            "speculatif",
            "stop",
            "strictement",
            "subtiles",
            "suffisant",
            "suffisante",
            "suffit",
            "suis",
            "suit",
            "suivant",
            "suivante",
            "suivantes",
            "suivants",
            "suivre",
            "sujet",
            "superpose",
            "sur",
            "surtout",
            "t",
            "ta",
            "tac",
            "tandis",
            "tant",
            "tardive",
            "te",
            "tel",
            "telle",
            "tellement",
            "telles",
            "tels",
            "tenant",
            "tend",
            "tenir",
            "tente",
            "tes",
            "tic",
            "tien",
            "tienne",
            "tiennes",
            "tiens",
            "toc",
            "toi",
            "toi-même",
            "ton",
            "touchant",
            "toujours",
            "tous",
            "tout",
            "toute",
            "toutefois",
            "toutes",
            "treize",
            "trente",
            "tres",
            "trois",
            "troisième",
            "troisièmement",
            "trop",
            "très",
            "tsoin",
            "tsouin",
            "tu",
            "té",
            "u",
            "un",
            "une",
            "unes",
            "uniformement",
            "unique",
            "uniques",
            "uns",
            "v",
            "va",
            "vais",
            "valeur",
            "vas",
            "vers",
            "via",
            "vif",
            "vifs",
            "vingt",
            "vivat",
            "vive",
            "vives",
            "vlan",
            "voici",
            "voie",
            "voient",
            "voilà",
            "vont",
            "vos",
            "votre",
            "vous",
            "vous-mêmes",
            "vu",
            "vé",
            "vôtre",
            "vôtres",
            "w",
            "x",
            "y",
            "z",
            "zut",
            "à",
            "â",
            "ça",
            "ès",
            "étaient",
            "étais",
            "était",
            "étant",
            "état",
            "étiez",
            "étions",
            "été",
            "étée",
            "étées",
            "étés",
            "êtes",
            "être",
            "ô"
        ]

    def main_article_clustering(self):
        self.clean_articles()
        cluster_words = self.clustering_Tf_Itf()
        #        self.match_general_cluster(cluster_words)
        return self.articles

    def clean_articles(self):
        def clean_articles2(x):
            liste_para = x[0].split("\r\n")
            end = re.sub("'\([^)]*\)", "",
                         str(liste_para[-1]).replace(str(x[1]), "")).strip()
            article = "\r\n".join([x for x in liste_para[:-1] if x != ''] +
                                  [end])
            return article

        self.articles = self.articles.loc[~pd.isnull(self.articles["article"])]
        self.articles = self.articles.loc[
            ~pd.isnull(self.articles["restricted"])]
        self.articles = self.articles.loc[
            self.articles["article"].apply(lambda x: len(x)) > 750]
        keep_index = self.articles["article"].apply(
            lambda x: False if "L'essentiel de l'actu " in x else True)
        self.articles = self.articles.loc[keep_index]
        self.articles["article"] = self.articles[["article", "auteur"]].apply(
            lambda x: clean_articles2(x), axis=1)

    def tokenize(self, text):
        text = text.replace("'", " ")
        text = re.sub(r'\S*@\S*\s?', '', text.strip(),
                      flags=re.MULTILINE)  # remove email
        text = re.sub(r'\@\S*\s?', '', text.strip(),
                      flags=re.MULTILINE)  # remove Twitter handles
        text = re.sub(r'http\S+', '', text,
                      flags=re.MULTILINE)  # remove web addresses
        text = re.sub(
            r'\(Crédits :.+\)\r\n', ' ', text,
            flags=re.MULTILINE)  # remove credits from start of article
        text = re.sub(
            r'\r\n.+\r\nVoir les réactions', '', text,
            flags=re.MULTILINE)  # remove the "Voir les réactions" block
        text = text.replace("/ REUTERS", "").replace("/ AFP", "")
        s = text.split("(Reuters) - ", 1)
        if len(s) > 1:
            text = s[1]
        s = text.split(" - ", 1)
        if len(s[0]) < 35:
            text = s[1]
        text = re.sub(r'\r\npar .+\r\n', '', text, flags=re.MULTILINE)
        text = re.sub(r'\r\n\(.+\)', ' ', text, flags=re.MULTILINE)
        text = re.sub(r'\r\nÀ LIRE AUSSI\r\n.+\r\n',
                      ' ',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\nLIRE AUSSI\r\n.+\r\n',
                      ' ',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\nLIRE AUSSI >>.+\r\n',
                      ' ',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\n» LIRE AUSSI -.+\r\n',
                      ' ',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\nLE FIGARO. -.+ - ', ' ', text, flags=re.MULTILINE)
        text = re.sub(r'www.\S+', '', text,
                      flags=re.MULTILINE)  # remove web addresses
        text = re.sub(r'\r\n» Vous pouvez également suivre.+.',
                      '',
                      text,
                      flags=re.MULTILINE)
        text = re.sub(r'\r\nLIRE NOTRE DOSSIER COMPLET\r\n.+\r\n',
                      '',
                      text,
                      flags=re.MULTILINE)

        text = text.replace("\r\nLIRE AUSSI :\r\n»", "")
        text = text.replace("(Reuters)",
                            "").replace("Article réservé aux abonnés", " ")
        text = text.translate({ord(ch): None for ch in '0123456789'})  # drop digits
        text = text.translate({
            ord(ch): " "
            for ch in '-•“’!"#$%&()*+,./:;<=>?@[\\]^_`{|}~«»–…‘'
        })  # replace punctuation with spaces
        text = re.sub(r' +', ' ', text,
                      flags=re.MULTILINE)  # collapse repeated spaces
        text = re.sub(r' \b[a-zA-Z]\b ', ' ', text,
                      flags=re.MULTILINE)  # remove single-letter words
        tokens = nltk.word_tokenize(text.lower(), language='french')

        tokens = " ".join([
            self.lemmetizer.lemmatize(self.lemmetizer.lemmatize(word, "v"),
                                      "n") for word in tokens
        ])
        return tokens

    def clustering_Tf_Itf(self, thresh=0.6, nwords=100):
        '''
        Home-made clustering method:
            - get the nwords most important words per document (after TF-IDF)
            - group articles sharing at least thresh of their weights (% of importance in common between articles)
            - clusters containing a single article are assigned -1
        '''

        articles = self.articles.copy()

        # =============================================================================
        #         #### 1) cluster articles close in words
        # =============================================================================
        clusters, tfs = self.intersect_cluster(articles,
                                               nwords=70,
                                               thresh=0.55)
        index_cluster = []
        for key, value in clusters.items():
            for index_art in value:
                index_cluster.append([index_art, key])

        index_cluster = pd.DataFrame(index_cluster).sort_values(0)
        articles["article_cluster"] = index_cluster[1].tolist()
        cluster_unique = articles["article_cluster"].value_counts()[
            articles["article_cluster"].value_counts() == 1].index

        idx = articles["article_cluster"].isin(cluster_unique)
        m = metrics.silhouette_score(tfs.toarray()[~idx],
                                     articles.loc[~idx]["article_cluster"],
                                     metric='cosine')
        print("first step clustering {0}, {1}".format(m, len(cluster_unique)))

        # =============================================================================
        #         #### 2) cluster clusters
        # =============================================================================
        articles["article"] = articles["titre"] + " " + articles[
            "titre"] + " " + articles["titre"] + " " + articles["article"]
        article_cluster = {}
        for cluster in articles["article_cluster"].value_counts().sort_index(
        ).index:
            sub_articles = articles.loc[articles["article_cluster"] == cluster,
                                        "article"].tolist()
            a = ""
            for art in sub_articles:
                a += " " + art
            article_cluster[cluster] = a
        article_cluster = pd.DataFrame.from_dict(article_cluster,
                                                 orient="index").sort_index()
        article_cluster.columns = ["article"]

        clusters2, tfs2 = self.intersect_cluster(article_cluster, 70, 0.37)
        articles["cluster"] = 0
        for key, value in clusters2.items():
            value_cluster = article_cluster.iloc[value].index.tolist()
            articles.loc[articles["article_cluster"].isin(value_cluster),
                         "cluster"] = articles.loc[
                             articles["article_cluster"].isin(value_cluster),
                             "article_cluster"].iloc[0]

        # =============================================================================
        #         #### 3) get main words from cluster to merge with past clusters
        # =============================================================================
        article_cluster = {}
        liste_cluster = []
        for cluster in articles["cluster"].value_counts().sort_index().index:
            sub_articles = articles.loc[articles["cluster"] == cluster,
                                        "article"].tolist()
            a = ""
            for art in sub_articles:
                a += " " + art
            liste_cluster.append(cluster)
            article_cluster[cluster] = a
        article_cluster = pd.DataFrame.from_dict(article_cluster,
                                                 orient="index").sort_index()
        article_cluster.columns = ["article"]

        liste_words, article_words, tfs2 = self.weight_words(article_cluster,
                                                             nwords=70)
        cluster_words = {}
        for i, words in enumerate(article_words):
            cluster_words[liste_cluster[i]] = words

        with open(
                os.environ["DIR_PATH"] +
                "/data/continuous_run/clusters/dayly_cluster/{0}.json".format(
                    datetime.now().strftime("%Y-%m-%d")), "w") as f:
            json.dump(cluster_words, f, ensure_ascii=False, indent=2)

        # =============================================================================
        #         ### 4) finish : lonely cluster into -1
        # =============================================================================
        cluster_unique = articles["cluster"].value_counts()[
            articles["cluster"].value_counts() == 1].index
        articles["cluster"] = np.where(
            articles["cluster"].isin(cluster_unique), -1, articles["cluster"])

        idx = articles["cluster"] != -1
        m = metrics.silhouette_score(tfs.toarray()[idx],
                                     articles.loc[idx]["cluster"],
                                     metric='cosine')
        minus = articles["cluster"].value_counts().iloc[0]
        print("clustering {0} , {1}".format(m, minus))

        self.articles["cluster"] = articles["cluster"].tolist()
        self.articles["granular_cluster"] = articles["article_cluster"].tolist(
        )

        return cluster_words

    def match_general_cluster(self, cluster_words, thresh=0.37):

        if not os.path.isfile(
                os.environ["DIR_PATH"] +
                "/data/continuous_run/clusters/general_cluster_words.json"):
            with open(
                    os.environ["DIR_PATH"] +
                    "/data/continuous_run/general_cluster_words.json",
                    "w") as f:
                json.dump(cluster_words, f, ensure_ascii=False, indent=2)
            return 0

        with open(
                os.environ["DIR_PATH"] +
                "/data/continuous_run/clusters/general_cluster_words.json",
                "r") as read_file:
            general_cluster_words = json.load(read_file)

        max_cluster = max([int(x) for x in general_cluster_words.keys()])

        rematch_cluster = {}
        additionnal_dico = {}
        for new_cluster, new_words in tqdm.tqdm(cluster_words.items()):
            max_score = 0
            for cluster, words in general_cluster_words.items():
                intersect_words = list(
                    set(new_words.keys()).intersection(set(words.keys())))
                score = sum([new_words[x] + words[x]
                             for x in intersect_words]) * 2 / (
                                 sum(new_words.values()) + sum(words.values()))
                if score >= thresh:
                    max_score = score
                    rematch_cluster[new_cluster] = cluster

            ### if below the threshold, create a new cluster
            if max_score < thresh:
                additionnal_dico[str(max_cluster + 1)] = new_words
                rematch_cluster[new_cluster] = str(max_cluster + 1)
                max_cluster += 1

        general_cluster_words.update(additionnal_dico)
        with open(
                os.environ["DIR_PATH"] +
                "/data/continuous_run/clusters/general_cluster_words.json",
                "w") as f:
            json.dump(general_cluster_words, f, ensure_ascii=False, indent=2)

        for key, value in rematch_cluster.items():
            self.articles.loc[self.articles["cluster"] == key,
                              "cluster"] = value

    def intersect_cluster(self, articles, nwords, thresh):

        liste_words, article_words, tfs = self.weight_words(articles, nwords)

        clusters = {}
        index_articles = list(range(len(article_words)))
        cluster = 0
        while len(index_articles) > 1:
            j = index_articles[0]
            clusters[str(cluster)] = []
            for k in index_articles:
                intersect_words = list(
                    set(article_words[k].keys()).intersection(
                        set(article_words[j].keys())))
                score = sum([
                    article_words[j][x] + article_words[k][x]
                    for x in intersect_words
                ]) * 2 / (sum(article_words[j].values()) +
                          sum(article_words[k].values()))
                if score >= thresh:
                    clusters[str(cluster)].append(k)
                    index_articles.remove(k)
            cluster += 1

        if len(index_articles) == 1:
            clusters[str(cluster)] = [index_articles[0]]

        return clusters, tfs

    def sort_coo(self, coo_matrix):
        tuples = zip(coo_matrix.col, coo_matrix.data)
        return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

    def extract_topn_from_vector(self, feature_names, sorted_items, topn=10):
        """get the feature names and tf-idf score of top n items"""

        #use only topn items from vector
        sorted_items = sorted_items[:topn]

        score_vals = []
        feature_vals = []

        # word index and corresponding tf-idf score
        for idx, score in sorted_items:

            #keep track of feature name and its corresponding score
            score_vals.append(round(score, 3))
            feature_vals.append(feature_names[idx])

        #create a tuples of feature,score
        #results = zip(feature_vals,score_vals)
        results = {}
        for idx in range(len(feature_vals)):
            results[feature_vals[idx]] = score_vals[idx]

        return results

    def weight_words(self, articles, nwords=40):

        tfidf = TfidfVectorizer(stop_words=self.liste_french,
                                preprocessor=self.tokenize,
                                min_df=2,
                                max_df=0.35,
                                ngram_range=(1, 2))
        tfs = tfidf.fit_transform(articles["article"].tolist())

        liste_words = []
        article_words = []
        for i, art in enumerate(articles["article"].tolist()):
            sorted_items = self.sort_coo(tfs[i].tocoo())
            article_words.append(
                self.extract_topn_from_vector(tfidf.get_feature_names(),
                                              sorted_items, nwords))
            liste_words += article_words[i].keys()

        return liste_words, article_words, tfs

    def db_scan_on_top_k(self, articles):

        liste_words, article_words, tfs = self.weight_words(articles, 100)
        liste_words = list(set(liste_words))

        index_col_keep = []
        for word in liste_words:
            index_col_keep.append(liste_words.index(word))

        tfs = tfs[:, index_col_keep]
        clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
        cluster_labels = clusterer.fit_predict(tfs.toarray())
        articles["article_cluster"] = cluster_labels

        idx = articles["article_cluster"] != -1
        m = metrics.silhouette_score(tfs.toarray()[idx],
                                     articles.loc[idx]["article_cluster"],
                                     metric='cosine')
        print("KPIS: silhouette {0}, shape {1}, -1 {2}".format(
            m, articles["article_cluster"].value_counts().shape[0],
            articles["article_cluster"].value_counts().iloc[0]))

        return articles, tfs

    def clustering_Tf_ItfV1(self):
        tfidf = TfidfVectorizer(preprocessor=self.tokenize,
                                stop_words=self.liste_french,
                                ngram_range=(1, 2),
                                max_features=2000,
                                max_df=0.25,
                                min_df=3)

        tfs = tfidf.fit_transform(self.articles["article"].tolist())
        # HDBSCAN / -1 = no cluster
        clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
        cluster_labels = clusterer.fit_predict(tfs.toarray())
        self.articles["cluster"] = cluster_labels

        ### get most used words per cluster detected
        self.articles["main_words"] = ""
        for i in self.articles["cluster"].value_counts().iloc[1:].index:
            try:
                keeping_words, d = self.generate_text(tfidf, tfs, i,
                                                      self.articles)
                self.articles.loc[self.articles["cluster"] == i,
                                  "main_words"] = keeping_words
            except Exception:
                self.articles.loc[self.articles["cluster"] == i,
                                  "main_words"] = ""
                pass

        ### silhouette
        idx = self.articles["cluster"] != -1
        m = metrics.silhouette_score(tfs.toarray()[idx],
                                     self.articles.loc[idx]["cluster"],
                                     metric='cosine')
        print("KPIS: silhouette {0}, shape {1}, -1 {2}".format(
            m, self.articles["cluster"].value_counts().shape[0],
            self.articles["cluster"].value_counts().iloc[0]))
Example #13
from sklearn.model_selection import train_test_split

#------------------------------------------------------------------------------------------------------------------------------

PATH_IMPUT_FILE = 'data.xlsx'
PATH_OUTPUT = 'wordcloud'
PATH_SETUP_FILE = 'config\\setup.json'


#------------------------------------------------------------------------------------------------------------------------------


french_stop_words = list(set(stopwords.words('french')) - set(['pas', 'non']))

lemmatizer = FrenchLefffLemmatizer(load_only_pos=['a', 'n'])

# load the config file
# string -> dictionary
def load_setup(path_setup_file):

    with open(path_setup_file) as json_file:
        data = json.load(json_file)
        
    return data

# clean the df rows according to the config parameters
def setup_clean_line(line, setup_data):
    x = line.split()
    
    for i in range(len(x)):
Example #14
    def build(self, folder_path, num_cluster):
        start_time = time.perf_counter()
        if not isinstance(self.language_regonizer, FunctionType):
            print("ERROR: language_regonizer never set or wrong type")
            return 0
        if not isinstance(self.sentence_tokenizer, FunctionType):
            print("ERROR: sentence_tokenizer never set")
            return 0
        if not isinstance(self.theme_cluster, FunctionType):
            print("ERROR: theme_cluster never set")
            return 0
        if not isinstance(self.indexer, FunctionType):
            print("ERROR: indexer never set")
            return 0
        if not isinstance(self.phrase_extractor, FunctionType):
            print("ERROR: phrase_extractor never set")
            return 0
        if not isinstance(self.quantizator, FunctionType):
            print("ERROR: quantizator never set")
            return 0
        if not isinstance(self.phrase_quantizator, FunctionType):
            print("ERROR: phrase_quantizator never set")
            return 0

        sentence_index = -1
        sentences_store = []
        inversed_index = {}
        inversed_index_fr = {}
        punctuations = set(string.punctuation)
        porter_stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        fr_lemmatizer = FrenchLefffLemmatizer()
        translator = Translator()

        for file_name in os.listdir(folder_path):
            if not file_name.startswith('.'):   #avoid hidden file
                doc = read_txt(folder_path + '/' + file_name)
                print("[INFO] reading " + file_name + " to build model")
                # split doc into paragraphs, assuming a single language per paragraph
                paragraphs = doc.split('\n')
                for parag in paragraphs:
                    language = self.language_regonizer(parag)
                    # sentence tokenize
                    lst_sentence = self.sentence_tokenizer(parag, language)
                    # save original sentence to memory
                    sentences_store.append(lst_sentence)

                    for sentence in lst_sentence:
                        sentence_index += 1
                        # case fold
                        sentence = sentence.lower()
                        # extract phrases
                        phrases = self.phrase_extractor(sentence)
                        # remove these phrases from sentence
                        for p in phrases:
                            sentence = sentence.replace(p, '')

                        # remove all punctuation
                        removed_punc = ''.join(s for s in sentence if s not in punctuations)
                        # remove all digits
                        removed_digit = re.sub(r'\d+', '', removed_punc)
                        # tokenize
                        word_tokens = nltk.word_tokenize(removed_digit)
                        # filter stop words
                        removed_stopwords = [t for t in word_tokens if t not in stopwords.words(language)]
                        # stemming or lemmatization
                        for t in removed_stopwords:
                            if language == 'english':
                                # token = porter_stemmer.stem(t)
                                pos_token = pos_word(t)
                                if pos_token != 0:
                                    token = lemmatizer.lemmatize(t, pos=pos_token)
                                    self.indexer(token, sentence_index, inversed_index)
                            elif language == 'french':
                                token = fr_lemmatizer.lemmatize(t)
                                # index for words
                                self.indexer(token, sentence_index, inversed_index_fr)
                        # index for phrase
                        for t in phrases:
                            if language == 'english':
                                self.indexer(t, sentence_index, inversed_index)
                            elif language == 'french':
                                self.indexer(t, sentence_index, inversed_index_fr)


        # persist the index
        save_pickle(inversed_index, 'out/index.pickle')
        save_txt(str_of(inversed_index), 'out/index.txt')

        save_pickle(inversed_index_fr, 'out/index_fr.pickle')
        save_txt(str_of(inversed_index_fr), 'out/index_fr.txt')
        index = 0
        for i in range(0,len(sentences_store)):
            input_sentence={"_id":str(index)}
            index = index + 1
            input_sentence["content"] = sentences_store[i]
            sentences_store[i] = input_sentence
            insert("corpus",sentences_store[i])

        # get all tokens
        lst_tokens = list(inversed_index.keys())
        lst_tokens_fr = list(inversed_index_fr.keys())
        # a hashmap to quick match between word and vec
        self.word_vec_map = {}
        # list of vector(list)
        matrix = []
        for token in lst_tokens:
            if ' '  not in token:   # token is a word
                if IF_DEBUG:
                    print("[INFO] " + token + "is a word")

                temp_vector = self.quantizator(token)
                if len(temp_vector) > 0:
                    matrix.append(temp_vector)
                    self.word_vec_map[token] = temp_vector

            else:   # token is a phrase
                if IF_DEBUG:
                    print("[INFO] " + token + "is a phrase")

                temp_vector = self.phrase_quantizator(token, self.quantizator)
                if temp_vector != 0:
                    matrix.append(temp_vector)
                    self.word_vec_map[token] = temp_vector

        for token in lst_tokens_fr:
            if ' ' not in token:
                after_trans = translator.translate(token, dest='en')
                # print(after_trans.text)
                # translate to English before quantizing
                temp_vector = self.quantizator(after_trans.text)
                if len(temp_vector) > 0:
                    matrix.append(temp_vector)
                    self.word_vec_map[token] = temp_vector
            else:
                # translate to English
                after_trans = translator.translate(token, dest='en')
                temp_vector = self.phrase_quantizator(after_trans.text, self.quantizator)
                if temp_vector != 0:
                    matrix.append(temp_vector)
                    self.word_vec_map[token] = temp_vector

        # theme cluster
        self.theme_clustered = self.theme_cluster(num_cluster, matrix, list(self.word_vec_map.keys()), self.word_vec_map)
        elapsed = (time.perf_counter() - start_time)
        print("========== END ========")
        print(self.theme_clustered['clusters'])
        print("\n")

        print(self.theme_clustered['centers'])
        print('\n')

        print(self.theme_clustered['representative'])
        print('\n')
        print("RUNNING TIME:" + str(elapsed) + " sec" )
Example #15
 def test_french_lefff_lemmatizer_when_load_only_pos_is_empty_list_expect_all(
         self):
     lemmatizer = FrenchLefffLemmatizer(load_only_pos=[])
     self.assertEqual(455789, len(lemmatizer.LEFFF_TABLE))
Example #16
    'PUNCT':'all',
    'SCONJ':'prel',
    'SYM':'exclude',
    'X':'all',
    'TOP':'exclude' }

pronouns_tags = ['cln',   # subject personal pronouns
                 'cla',   # direct-object personal pronouns
                 'cld',   # indirect-object personal pronouns (or 'disjoint form')
                 'clg',   # personal pronouns 'en' and 'y'
                 # 'cll', # seems equivalent, but misbehaves
                 'prel',  # relative pronouns
                 'pro',
                 'clr']   # other pronouns

lemmatizer = FrenchLefffLemmatizer()


class Dependencies():

    def __init__(self, sentence, port=9004, insecables=[], fr_lemmatize = True):
        self.sentence = sentence.rstrip('.')
        self.sentence = re.sub(r'(.?)([\.,;:\?!()\[\]\{\}«»\'\"\—\/’&])', '\\1 \\2 ', self.sentence)

        self.insecables= insecables
        self.fr_lemmatize = fr_lemmatize
        self.corenlpparser = CoreNLPDependencyParser(url='http://localhost:'+ str(port))

        try :
            temp_parse = list(self.corenlpparser.raw_parse(self.sentence))[0]
            self.parse = {i:temp_parse.nodes[i] for i in range(0, len(temp_parse.nodes))}
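A minimal usage sketch for the class above; it assumes a CoreNLP server with the French models is already running on localhost:9004, the default port in the constructor shown.

deps = Dependencies("Les voitures que j'ai achetées roulent vite.")
print(deps.parse)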
Example #17
 def test_french_lefff_lemmatizer_lexicon_data_length(self):
     lemmatizer = FrenchLefffLemmatizer()
     self.assertEqual(455789, len(lemmatizer.LEFFF_TABLE))
Example #18
 def setUp(self):
     # To make the test faster, we load an empty file.
     self.lemmatizer = FrenchLefffLemmatizer(open('file.txt', 'w').close())
Example #19
 def test_french_lefff_lemmatizer_when_load_only_pos_is_v_expect_only_verbs(
         self):
     lemmatizer = FrenchLefffLemmatizer(load_only_pos=['v'])
     self.assertEqual(301208, len(lemmatizer.LEFFF_TABLE))
Example #20
 def test_french_lefff_lemmatizer_when_load_only_pos_is_a_expect_only_adjectives(
         self):
     lemmatizer = FrenchLefffLemmatizer(load_only_pos=['a'])
     self.assertEqual(56823, len(lemmatizer.LEFFF_TABLE))
Example #21
 def test_french_lefff_lemmatizer_when_load_only_pos_is_n_expect_only_nouns(
         self):
     lemmatizer = FrenchLefffLemmatizer(load_only_pos=['n'])
     self.assertEqual(86235, len(lemmatizer.LEFFF_TABLE))
         "mon",
         "mot",
         "moyennant",
         "multiple",
         "multiples",
         "même",
         "mêmes",
         "n",
         "na",
         "naturel",
         "naturelle",
         "naturelles",
         "ne",
         "neanmoins",
         "necessaire",
         "necessairement",
         "neuf",
         "neuvième",
         "ni",
         "nombreuses",
         "nombreux",
         "nommés",
         "non",
         "nos",
         "notamment",
         "notre",
         "nous",
         "nous-mêmes",
         "nouveau",
         "nouveaux",
         "nul",
         "néanmoins",
         "nôtre",
         "nôtres",
         "o",
         "oh",
         "ohé",
         "ollé",
         "olé",
         "on",
         "ont",
         "onze",
         "onzième",
         "ore",
         "ou",
         "ouf",
         "ouias",
         "oust",
         "ouste",
         "outre",
         "ouvert",
         "ouverte",
         "ouverts",
         "o|",
         "où",
         "p",
         "paf",
         "pan",
         "par",
         "parce",
         "parfois",
         "parle",
         "parlent",
         "parler",
         "parmi",
         "parole",
         "parseme",
         "partant",
         "particulier",
         "particulière",
         "particulièrement",
         "pas",
         "passé",
         "pendant",
         "pense",
         "permet",
         "personne",
         "personnes",
         "peu",
         "peut",
         "peuvent",
         "peux",
         "pff",
         "pfft",
         "pfut",
         "pif",
         "pire",
         "pièce",
         "plein",
         "plouf",
         "plupart",
         "plus",
         "plusieurs",
         "plutôt",
         "possessif",
         "possessifs",
         "possible",
         "possibles",
         "pouah",
         "pour",
         "pourquoi",
         "pourrais",
         "pourrait",
         "pouvait",
         "prealable",
         "precisement",
         "premier",
         "première",
         "premièrement",
         "pres",
         "probable",
         "probante",
         "procedant",
         "proche",
         "près",
         "psitt",
         "pu",
         "puis",
         "puisque",
         "pur",
         "pure",
         "q",
         "qu",
         "quand",
         "quant",
         "quant-à-soi",
         "quanta",
         "quarante",
         "quatorze",
         "quatre",
         "quatre-vingt",
         "quatrième",
         "quatrièmement",
         "que",
         "quel",
         "quelconque",
         "quelle",
         "quelles",
         "quelqu'un",
         "quelque",
         "quelques",
         "quels",
         "qui",
         "quiconque",
         "quinze",
         "quoi",
         "quoique",
         "r",
         "rare",
         "rarement",
         "rares",
         "relative",
         "relativement",
         "remarquable",
         "rend",
         "rendre",
         "restant",
         "reste",
         "restent",
         "restrictif",
         "retour",
         "revoici",
         "revoilà",
         "rien",
         "s",
         "sa",
         "sacrebleu",
         "sait",
         "sans",
         "sapristi",
         "sauf",
         "se",
         "sein",
         "seize",
         "selon",
         "semblable",
         "semblaient",
         "semble",
         "semblent",
         "sent",
         "sept",
         "septième",
         "sera",
         "serai",
         "seraient",
         "serais",
         "serait",
         "seras",
         "serez",
         "seriez",
         "serions",
         "serons",
         "seront",
         "ses",
         "seul",
         "seule",
         "seulement",
         "si",
         "sien",
         "sienne",
         "siennes",
         "siens",
         "sinon",
         "six",
         "sixième",
         "soi",
         "soi-même",
         "soient",
         "sois",
         "soit",
         "soixante",
         "sommes",
         "son",
         "sont",
         "sous",
         "souvent",
         "soyez",
         "soyons",
         "specifique",
         "specifiques",
         "speculatif",
         "stop",
         "strictement",
         "subtiles",
         "suffisant",
         "suffisante",
         "suffit",
         "suis",
         "suit",
         "suivant",
         "suivante",
         "suivantes",
         "suivants",
         "suivre",
         "sujet",
         "superpose",
         "sur",
         "surtout",
         "t",
         "ta",
         "tac",
         "tandis",
         "tant",
         "tardive",
         "te",
         "tel",
         "telle",
         "tellement",
         "telles",
         "tels",
         "tenant",
         "tend",
         "tenir",
         "tente",
         "tes",
         "tic",
         "tien",
         "tienne",
         "tiennes",
         "tiens",
         "toc",
         "toi",
         "toi-même",
         "ton",
         "touchant",
         "toujours",
         "tous",
         "tout",
         "toute",
         "toutefois",
         "toutes",
         "treize",
         "trente",
         "tres",
         "trois",
         "troisième",
         "troisièmement",
         "trop",
         "très",
         "tsoin",
         "tsouin",
         "tu",
         "té",
         "u",
         "un",
         "une",
         "unes",
         "uniformement",
         "unique",
         "uniques",
         "uns",
         "v",
         "va",
         "vais",
         "valeur",
         "vas",
         "vers",
         "via",
         "vif",
         "vifs",
         "vingt",
         "vivat",
         "vive",
         "vives",
         "vlan",
         "voici",
         "voie",
         "voient",
         "voilà",
         "vont",
         "vos",
         "votre",
         "vous",
         "vous-mêmes",
         "vu",
         "vé",
         "vôtre",
         "vôtres",
         "w",
         "x",
         "y",
         "z",
         "zut",
         "à",
         "â",
         "ça",
         "ès",
         "étaient",
         "étais",
         "était",
         "étant",
         "état",
         "étiez",
         "étions",
         "été",
         "étée",
         "étées",
         "étés",
         "êtes",
         "être",
         "ô"
     ]
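The rest of this class is not shown; below is a hedged sketch of how a hypothetical clean_article method could combine the stop list, the stemmer and the lemmatizer initialised above (the method name, the whitespace tokenisation and the choice to lemmatize rather than stem are assumptions, not part of the original example):

 def clean_article(self, text):
     # hypothetical helper: lower-case, split on whitespace, drop noise/stop words
     tokens = [t for t in text.lower().split() if t not in self.liste_french]
     # lemmatize the remaining tokens (self.stemmer.stem(t) could be used instead)
     return [self.lemmatizer.lemmatize(t) for t in tokens]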
 def test_french_lefff_lemmatizer_when_lefff_additional_file_path_is_not_none_and_empty_expect_only_lefff_file(
         self):
     lemmatizer = FrenchLefffLemmatizer(
         lefff_additional_file_path=self.path_file)
     self.assertEqual(455787, len(lemmatizer.LEFFF_TABLE))  # 455785 + 2
Ejemplo n.º 24
0
    def __init__(self):
        # register your new attribute token._.lefff_lemma
        Token.set_extension('lefff_lemma', default=None)

        self.lemmatizer = FrenchLefffLemmatizer()
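A possible way to plug such a component into a pipeline, assuming spaCy 2.x (where component instances can be added to the pipeline directly), that the surrounding class is named FrenchLemmatizer and also defines __call__ to fill token._.lefff_lemma, and that the French model fr_core_news_sm is installed:

import spacy

nlp = spacy.load('fr_core_news_sm')
# run after the tagger so POS tags are available when the component is called
nlp.add_pipe(FrenchLemmatizer(), after='tagger')

doc = nlp('Les avions volaient')
for token in doc:
    print(token.text, token.pos_, token._.lefff_lemma)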
 def test_french_lefff_lemmatizer_when_load_only_lefff_additional(self):
     lemmatizer = FrenchLefffLemmatizer(lefff_file_path=self.path_file)
     self.assertEqual(56843, len(lemmatizer.LEFFF_TABLE))  # 56841 + 2 "errors" added
Ejemplo n.º 26
0
def clean_sequences(sequences, language='fr', annotate=['anatomy'], annotate_meds=True,
                    lemmatizing=False, stemming=False, remove_accent=False,
                    correcting=True, remove_stopwords=True):
    '''
    Clean a list of tokenized sentences with the following steps:
        - remove stopwords
        - replace medication names and annotated terms (e.g. anatomy) with generic labels
        - stem or lemmatize the remaining tokens
        - optionally strip accents
    Note: the spell-correction step ('correcting') is currently commented out below and has no effect.
    '''

    med_list = get_med_list()
    annotations = get_QUEARO_annorations()

    if remove_stopwords:
        if language == 'en':
            stpwds = stopwords.words('english')
        else:
            stpwds = stopwords.words('french')
            new_stpwds = ['a', 'les']
            stpwds += new_stpwds
        for i in range(len(sequences)):
            sequences[i] = [el for el in sequences[i] if el not in stpwds]

    if annotate_meds:
        for sent_ind, sent in enumerate(sequences):
            for w_ind, w in enumerate(sent):
                w = unicodedata.normalize('NFD', w).encode('ascii', 'ignore').decode('UTF-8')
                if w in med_list:
                    sequences[sent_ind][w_ind] = 'médicament'

    if len(annotate) > 0:
        for category in annotate:
            ann = annotations[category[:4]]
            if len(ann) > 0:
                for sent_ind, sent in enumerate(sequences):
                    for w_ind, w in enumerate(sent):
                        w = unicodedata.normalize('NFD', w).encode('ascii', 'ignore').decode('UTF-8')
                        if w in ann:
                            sequences[sent_ind][w_ind] = 'anatomie'

    if stemming:
        if language == 'en':
            stemmer = SnowballStemmer("english")
        else:
            stemmer = SnowballStemmer("french")
        for i in range(len(sequences)):
            sequences[i] = [stemmer.stem(el) for el in sequences[i]]

    if lemmatizing:
        if language == 'en':
            lemmatizer = WordNetLemmatizer()
            no_lemmatizing = []
        else:
            lemmatizer = FrenchLefffLemmatizer()
            # tokens to leave untouched rather than lemmatize
            no_lemmatizing = ['soucis', 'sous']
        for i in range(len(sequences)):
            sequences[i] = [el if el in no_lemmatizing else lemmatizer.lemmatize(el)
                            for el in sequences[i]]

    # if correcting :
    #     if language == 'en':
    #         d = enchant.DictWithPWL("en")
    #         for i in range(len(sequences)) :
    #             sequences[i]=[d.suggest(el)[0] for el in sequences[i] if not d.check(el) and len(d.suggest(el))>0]
    #             sequences[i]=[el for el in sequences[i] if el not in stpwds]
    #     else :
    #         spell = SpellChecker(language='fr')
    #         for i in range(len(sequences)) :
    #             print(i)
    #             correct = []
    #             for w in sequences[i]:
    #                 if spell[w]:
    #                     correct.append(w)
    #                 else :
    #                     correct.append(spell.correction(w))
    #                     print(w)
    #                     print(spell.correction(w))
    #             sequences[i]=[el for el in correct if el not in stpwds]

    if remove_accent and language == 'fr':
        for i in range(len(sequences)):
            sequences[i] = [unicodedata.normalize('NFD', el).encode('ascii', 'ignore').decode('UTF-8') for el in sequences[i]]

    print("--- seq cleaned ---")
    return sequences
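A small usage sketch of clean_sequences (illustrative input only; it assumes the helper functions get_med_list and get_QUEARO_annorations referenced above are available in the same module, and it does not claim any particular output):

# sequences are expected to be pre-tokenized: one list of tokens per sentence
sentences = [['le', 'patient', 'a', 'des', 'douleurs', 'au', 'genou'],
             ['il', 'prend', 'du', 'doliprane', 'depuis', 'lundi']]

cleaned = clean_sequences(sentences,
                          language='fr',
                          annotate=['anatomy'],
                          annotate_meds=True,
                          lemmatizing=True,
                          remove_stopwords=True)
print(cleaned)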
 def test_french_lefff_lemmatizer_with_additional_file_is_false(self):
     lemmatizer = FrenchLefffLemmatizer(with_additional_file=False)
     self.assertEqual(455787, len(lemmatizer.LEFFF_TABLE))
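For completeness, a hedged interactive check of the same flag outside the test suite (the exact table sizes depend on the LeFFF data shipped with the installed version, so no specific numbers are asserted here):

base = FrenchLefffLemmatizer(with_additional_file=False)  # LeFFF entries only
full = FrenchLefffLemmatizer()                            # default constructor
print(len(base.LEFFF_TABLE), len(full.LEFFF_TABLE))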