Exemple #1
0
def get_gazetteer(language: str = 'ENG'):
    """
    Build the gazetteer word lists available for a language.

    For 'ENG' the lists come from the NLTK ``gazetteers`` and ``names``
    corpora, each passed through ``no_caps`` (defined elsewhere). For 'FR'
    they are the first column of the ``gazLOC`` and ``gazPER`` CSV assets.
    Any other value matches neither branch and the function returns None.

    :param language: language for which the gazetteers are requested
    :type language: str
    :return: dictionary keyed by entity type, e.g. {cst.LOC: [...], ...};
        the 'FR' result has no cst.MISC entry
    :rtype: dict
    """
    if language == 'FR':
        asset_root = get_asset_root()
        first_columns = []
        # Same order as the returned keys: locations first, then persons.
        for asset_name in ('gazLOC', 'gazPER'):
            csv_path = get_file_content(asset_root, asset_name)
            first_columns.append(pd.read_csv(csv_path).iloc[:, 0].tolist())
        locations, persons = first_columns
        return {cst.LOC: locations, cst.PER: persons}

    if language == 'ENG':
        location_files = [
            'countries.txt', 'uscities.txt', 'usstates.txt',
            'usstateabbrev.txt', 'mexstates.txt', 'caprovinces.txt'
        ]
        locations = no_caps(
            nltk.corpus.gazetteers.words(fileids=location_files))
        persons = no_caps(
            nltk.corpus.names.words(fileids=['male.txt', 'female.txt']))
        miscellaneous = no_caps(
            nltk.corpus.gazetteers.words(fileids=['nationalities.txt']))
        return {
            cst.LOC: locations,
            cst.PER: persons,
            cst.MISC: miscellaneous
        }
Exemple #2
0
 def get_already_trained(
         cls,
         name: str,
         language: str = cst.NO_LANGUAGE,
         entity: list = None) -> Union[object, None]:
     """
     Load an already trained dataset and wrap it in a Training_database.

     The CSV is read with its first column as index, missing values are
     replaced by 0, and the distinct non-zero values of the 'NEtag' column
     are stored as ``categories``.

     :param name: asset name of the dataset, resolved with get_file_content
     :type name: str
     :param language: language of the dataset
     :type language: str
     :param entity: entity labels handled by the dataset; defaults to
         ["ORG", "LOC", "PER"]
     :type entity: list
     :return: the Training_database object, or the string
         "Error when trying to read the dataframe" if loading fails
     :rtype: Union[object, None]
     """
     self = Training_database()
     self.df_name = name
     self.language = language
     # Build the default per call so instances never share one list object.
     self.entity = ["ORG", "LOC", "PER"] if entity is None else entity
     cfg = get_asset_root()
     file = get_file_content(cfg, name)
     try:
         self.df = pd.read_csv(filepath_or_buffer=file, index_col=0)
         self.df = self.df.fillna(0)
         # 0 is only the fill value for missing tags, not a category.
         # Filtering (instead of list.remove(0)) also works when the column
         # has no missing value, where remove() raised ValueError and a
         # valid dataset was reported as unreadable.
         self.categories = [
             tag for tag in self.df['NEtag'].unique() if tag != 0
         ]
     except Exception:
         # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
         # propagate; callers still get the historical error string.
         return "Error when trying to read the dataframe"
     return self
Exemple #3
0
 def train_with_function(self, model, name_entity: list, features: list,
                         name_to_export: "str | None" = None):
     """
     Train the model on the given named entity columns.

     Each row of ``self.train_model`` gets one class label stored in a new
     'multi' column: ``j + 1`` when the column ``name_entity[j]`` equals 1
     (first match wins), and 0 when no entity column is set. The model is
     then fitted on ``features`` against that label.

     :param model: estimator exposing fit/score, e.g. a random forest
     :type model: sklearn-style estimator
     :param name_entity: names of the entity indicator columns in the dataset
     :type name_entity: list
     :param features: names of the feature columns used for training
     :type features: list
     :param name_to_export: if not None, the fitted model is dumped to
         ``<pkl_root>/<name_to_export>.pkl``
     :type name_to_export: str or None
     :return: the value returned by ``model.fit``
     """
     train_model = self.train_model
     multi = []
     for i in range(len(train_model['Word'])):
         # Exactly one label per row keeps `multi` aligned with the frame.
         label = 0
         for j, column in enumerate(name_entity):
             if train_model.loc[i, column] == 1:
                 # Offset by one so that 0 stays reserved for "no entity".
                 label = j + 1
                 break
         multi.append(label)
     train_model['multi'] = multi
     X = train_model[features]
     y = train_model['multi']
     params_sk = model.fit(X, y)
     if name_to_export is not None:
         g = get_asset_root()
         joblib.dump(params_sk, f'{g["pkl_root"]}/{name_to_export}.pkl', compress=9)
     LOGGER.info(model.score(X, y))
     return params_sk
    def try_trained_model(txt_to_test: str,
                          model: str = "svm_all_features",
                          entity: list = ['ORG', 'LOC', 'MISC', 'PER'],
                          name_entity: list = [
                              'Organisations', 'Locations', 'Miscellaneaous',
                              'Persons'
                          ]):
        """
        Run a saved model on a text and group its words by predicted entity.

        The text is split on whitespace, features are computed with
        ``Training_database.do_feature_dataset``, and the model loaded from
        the asset named ``model`` predicts one class per word from the
        ``cst.list_features_en`` columns.

        :param txt_to_test: text to analyse
        :type txt_to_test: str
        :param model: asset name of the saved model
        :type model: str
        :param entity: entity labels; only its length is used, class
            ``j + 1`` being reported under ``name_entity[j]``
        :type entity: list
        :param name_entity: keys of the returned dictionary
        :type name_entity: list
        :return: {name: [words predicted as that entity]}; words whose
            prediction matches no class in 1..len(entity) are left out
        :rtype: dict
        """
        tokens = pd.DataFrame()
        tokens['Word'] = txt_to_test.split()
        tokens = Training_database.do_feature_dataset(tokens)

        asset_root = get_asset_root()
        model_path = get_file_content(asset_root, model)
        classifier = joblib.load(model_path)

        predictions = classifier.predict(tokens[cst.list_features_en])
        LOGGER.info(predictions)

        words_by_entity = {label: [] for label in name_entity}
        class_count = len(entity)
        for position, predicted in enumerate(predictions):
            for offset in range(class_count):
                if predicted == offset + 1:
                    words_by_entity[name_entity[offset]].append(
                        tokens['Word'][position])

        return words_by_entity
Exemple #5
0
 def train_with_function(self,
                         model,
                         df: "pd.DataFrame | None",
                         features: list,
                         name_to_export: "str | None" = None,
                         entity: list = None):
     """
     Train the model on the entity columns of the dataset.

     Each row gets one class label stored in a new 'multi' column:
     ``j + 1`` when the column ``entity[j]`` equals 1 (first match wins),
     and 0 when no entity column is set. The model is fitted on ``features``
     against that label, and its score and confusion matrix on the training
     data are logged.

     :param model: estimator exposing fit/score/predict, e.g. random forest
     :type model: sklearn-style estimator
     :param df: dataset to train on; when None the current ``self.df`` is
         used, otherwise it replaces ``self.df``
     :type df: pandas.DataFrame or None
     :param features: names of the feature columns used for training
     :type features: list
     :param name_to_export: if not None, the fitted model is dumped to
         ``<pkl_root>/<name_to_export>.pkl``
     :type name_to_export: str or None
     :param entity: names of the entity indicator columns; defaults to
         ['ORG', 'LOC', 'PER']. Stored in ``self.entity``
     :type entity: list
     :return: the value returned by ``model.fit``, also stored in
         ``self.trained_model``
     """
     # Build the default per call so instances never share one list object.
     self.entity = ['ORG', 'LOC', 'PER'] if entity is None else entity
     if df is not None:
         self.df = df
     train_model = self.df
     multi = []
     for i in range(len(train_model['Word'])):
         # Exactly one label per row: a row with several entity columns set
         # previously appended several labels and misaligned `multi`.
         label = 0
         for j, column in enumerate(self.entity):
             if train_model.loc[i, column] == 1:
                 # Offset by one so that 0 stays reserved for "no entity".
                 label = j + 1
                 break
         multi.append(label)
     # Description -> length; the reverse mapping collapsed to a single
     # entry whenever both lengths were equal.
     LOGGER.info({
         "len trained model": len(train_model['Word']),
         "len multi": len(multi)
     })
     train_model['multi'] = multi
     X = train_model[features]
     y = train_model['multi']
     params_sk = model.fit(X, y)
     if name_to_export is not None:
         g = get_asset_root()
         joblib.dump(params_sk,
                     f'{g["pkl_root"]}/{name_to_export}.pkl',
                     compress=9)
     LOGGER.info(model.score(X, y))
     y_pred = model.predict(X)
     LOGGER.info(sklearn.metrics.confusion_matrix(y, y_pred))
     self.trained_model = params_sk
     return params_sk
    def gazetteer(df: pd.DataFrame, language: str = 'ENG'):
        """
        Add one 0/1 column per gazetteer type to the DataFrame.

        For every type returned by ``get_type_of_gazetteers`` a column named
        after the type is added; a row is 1 when its ``cst.LOWERCASE`` value
        appears in the ``cst.LOWERCASE`` column of any CSV file of that
        gazetteer type, else 0.

        :param df: DataFrame holding the ``cst.LOWERCASE`` column
        :type df: pd.DataFrame
        :param language: NOTE(review): currently unused, the English
            gazetteers ('en' / 'gazetteer_en') are always loaded
        :type language: str
        :return: the same DataFrame, modified in place
        :rtype: pd.DataFrame
        """
        cfg = get_asset_root()
        # Read the words positionally once: the flags are positional too, so
        # this also works when the index is not 0..n-1.
        words = list(df[cst.LOWERCASE])

        list_gaz = get_type_of_gazetteers(cfg, 'en')
        for gaz_type in list_gaz:
            flags = [0] * len(words)
            list_files = get_file_content(cfg, 'gazetteer_en', gaztype=gaz_type)
            for gaz_file in list_files:
                # A set gives constant-time membership instead of scanning
                # the whole gazetteer for every word.
                entries = set(pd.read_csv(gaz_file)[cst.LOWERCASE])
                for position, word in enumerate(words):
                    if word in entries:
                        flags[position] = 1
            df[gaz_type] = flags
        return df
 def preuni_factory(df: pd.DataFrame, directory:str="pre_freq_CONLL2003"):
     """
     Add the 'preuni*' 0/1 columns based on the word preceding each row.

     For each of preuniORG / preuniLOC / preuniPER / preuniMISC the JSON
     asset ``<directory>/<name>`` is loaded and its ``<name>`` entry is used
     as the reference collection. A row is 1 when the ``cst.LOWERCASE``
     value of the previous row is in that collection; the first row, which
     has no predecessor, is always 0.

     :param df: DataFrame holding the ``cst.LOWERCASE`` column
     :type df: pd.DataFrame
     :param directory: asset directory containing the JSON files
     :type directory: str
     :return: the same DataFrame, modified in place
     :rtype: pd.DataFrame
     """
     preuni = {'preuniORG':  f'{directory}/preuniORG',
               'preuniLOC':  f'{directory}/preuniLOC', 'preuniPER': f'{directory}/preuniPER',
               'preuniMISC': f'{directory}/preuniMISC'}
     cfg = get_asset_root()
     # Extract the column once instead of building a row Series per word.
     lowercase_words = df[cst.LOWERCASE].tolist()
     for key, value in preuni.items():
         file_name = get_file_content(cfg, value)
         with open(file_name) as json_file:
             data = json.load(json_file)
         frequentname = data[key]
         LOGGER.info(frequentname)
         # Flag of row i describes word i - 1, hence the leading 0 and the
         # dropped last word.
         previous_is_frequent = [
             1 if word in frequentname else 0
             for word in lowercase_words[:-1]
         ]
         # The slice keeps an empty DataFrame valid: the lone leading 0
         # used to make the column one element too long.
         df[key] = ([0] + previous_is_frequent)[:len(lowercase_words)]
     return df
 def frequency_factory(df: pd.DataFrame, directory:str="freq_names_CONLL2003"):
     """
     Add the 'Freq*' 0/1 columns to the DataFrame.

     For each of FreqNAMES / FreqORG / FreqLOC / FreqPER / FreqMISC the
     matching JSON asset under ``directory`` is loaded and the entry stored
     under the column name is used as the reference collection. A row is 1
     when its ``cst.LOWERCASE`` value is in that collection, else 0.

     :param df: DataFrame holding the ``cst.LOWERCASE`` column
     :type df: pd.DataFrame
     :param directory: asset directory containing the JSON files
     :type directory: str
     :return: the same DataFrame, modified in place
     :rtype: pd.DataFrame
     """
     sources = (
         ('FreqNAMES', f'{directory}/freqNAMES'),
         ('FreqORG', f'{directory}/freqORG'),
         ('FreqLOC', f'{directory}/freqLOC'),
         ('FreqPER', f'{directory}/freqPER'),
         ('FreqMISC', f'{directory}/freqMISC'),
     )
     asset_root = get_asset_root()
     for column, asset_name in sources:
         json_path = get_file_content(asset_root, asset_name)
         with open(json_path) as handle:
             known_words = json.load(handle)[column]
         LOGGER.info(known_words)
         df[column] = [
             1 if getattr(row, cst.LOWERCASE) in known_words else 0
             for row in df.itertuples(index=True, name='Pandas')
         ]
     return df
Exemple #9
0
 def clean_and_setup_training(
         cls,
         name: str,
         language: str = cst.NO_LANGUAGE) -> Union[object, None]:
     """
     Load a raw dataset into a new Training_database.

     The CSV is read with its first column as index and stored in ``df``.
     No feature column is computed here.

     :param name: asset name of the raw dataset, resolved with
         get_file_content
     :type name: str
     :param language: language of the dataset
     :type language: str
     :return: the new Training_database object, or the string
         "Error when trying to read the dataframe" if the CSV cannot be read
     :rtype: Union[object, None]
     """
     self = Training_database()
     self.df_name = name
     self.language = language
     cfg = get_asset_root()
     file = get_file_content(cfg, name)
     try:
         self.df = pd.read_csv(filepath_or_buffer=file, index_col=0)
     except Exception:
         # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
         # propagate; callers still get the historical error string.
         return "Error when trying to read the dataframe"
     return self
Exemple #10
0
        model_clone = joblib.load(directory)
        df_test = df_to_do[list_features]
        result = model_clone.predict(df_test)
        LOGGER.info(result)
        dict_entity = {}
        for j in name_entity:
            dict_entity[j] = []
        for i in range(0, len(result)):
            for j in range(0, len(entity)):
                if result[i] == j + 1:
                    dict_entity[name_entity[j]].append(df_to_do['Word'][i])

        return dict_entity


if __name__ == "__main__":
    # Manual smoke run: tag a French sentence with a saved random forest
    # model and log the words found for each entity.
    cfg = get_asset_root()

    # Training example kept for reference (disabled):
    # directory = get_file_content(cfg, "French_own_data/frenchreuters_trained")
    # df = pd.read_csv(directory)
    # g = Training_database()
    # rdm_forest = RandomForestClassifier(n_estimators=20, verbose=True)
    # svm_linear = svm.LinearSVC()
    # svm_multi = svm.SVC(kernel='rbf', C=1)
    # h = g.train_with_function(rdm_forest, df, features=cst.list_features_fr_no_caps, name_to_export="rdm_forest_with_debut_fr_no_caps")
    # h = g.train_with_function(svm_linear, df, features=cst.list_features_fr_no_caps, name_to_export="svm_linear_easy_fr_no_caps")

    sample_sentence = 'Air France a décidé de garder les dividendes de Lagardère qui s apprete à recevoir un prix Nobel'
    detected_entities = Training_database.try_trained_model(
        sample_sentence,
        "rdm_forest_with_debut_fr",
        list_features=cst.list_features_fr)
    LOGGER.info(detected_entities)