def get_gazetteer(language: str = 'ENG'):
    """
    Build the gazetteer word lists for one language, keyed by entity type.

    For ``'ENG'`` the lists are read from the NLTK ``gazetteers`` and
    ``names`` corpora and passed through ``no_caps``. For ``'FR'`` the first
    column of the ``gazLOC`` and ``gazPER`` CSV assets is returned as-is.
    Any other value matches no branch, so the function returns ``None``.

    :param language: language code, ``'ENG'`` or ``'FR'`` (case-sensitive)
    :type language: str
    :return: dictionary such as ``{cst.LOC: [...], cst.PER: [...]}`` (plus
        ``cst.MISC`` for English), or ``None`` for an unknown language
    :rtype: dict
    """
    if language == 'ENG':
        location_files = [
            'countries.txt',
            'uscities.txt',
            'usstates.txt',
            'usstateabbrev.txt',
            'mexstates.txt',
            'caprovinces.txt',
        ]
        locations = no_caps(
            nltk.corpus.gazetteers.words(fileids=location_files))
        persons = no_caps(
            nltk.corpus.names.words(fileids=['male.txt', 'female.txt']))
        miscellaneous = no_caps(
            nltk.corpus.gazetteers.words(fileids=['nationalities.txt']))
        return {
            cst.LOC: locations,
            cst.PER: persons,
            cst.MISC: miscellaneous,
        }

    if language == 'FR':
        cfg = get_asset_root()
        first_columns = []
        # Locations first, then persons: same asset order as before.
        for asset_name in ('gazLOC', 'gazPER'):
            csv_path = get_file_content(cfg, asset_name)
            first_columns.append(pd.read_csv(csv_path).iloc[:, 0].tolist())
        locations, persons = first_columns
        return {cst.LOC: locations, cst.PER: persons}

    # Unknown language: no gazetteer available.
    return None
def get_already_trained(
        cls,
        name: str,
        language: str = cst.NO_LANGUAGE,
        entity: list = None) -> Union[object, None]:
    """
    Load an already trained dataset into a new ``Training_database``.

    The CSV asset registered under ``name`` is read with its first column as
    index, missing values are replaced by ``0`` and the distinct ``NEtag``
    values other than that ``0`` placeholder are stored in ``categories``.

    :param name: asset name of the dataset, also stored as ``df_name``
    :type name: str
    :param language: language of the dataset
    :type language: str
    :param entity: entity types stored on the object; defaults to
        ``["ORG", "LOC", "PER"]``
    :type entity: list
    :return: the populated Training_database object, or the string
        ``"Error when trying to read the dataframe"`` when loading fails
    :rtype: Union[object, None]
    """
    self = Training_database()
    self.df_name = name
    self.language = language
    # Give every instance its own list so that mutating ``self.entity`` can
    # never leak into other instances or into the default value.
    self.entity = ["ORG", "LOC", "PER"] if entity is None else list(entity)
    cfg = get_asset_root()
    csv_path = get_file_content(cfg, name)
    try:
        self.df = pd.read_csv(filepath_or_buffer=csv_path, index_col=0)
        self.df = self.df.fillna(0)
        # Filter instead of ``list.remove(0)``: a dataset without missing
        # tags has no 0 to remove, and the resulting ValueError used to be
        # reported as a read error.
        self.categories = [
            tag for tag in self.df['NEtag'].unique() if tag != 0
        ]
    except Exception:
        # Same error contract as before, but KeyboardInterrupt / SystemExit
        # are no longer swallowed.
        return "Error when trying to read the dataframe"
    return self
def train_with_function(self, model, name_entity: list, features: list, name_to_export: None):
    """
    Train ``model`` to predict the named-entity class of every row.

    A ``multi`` target column is added to ``self.train_model``: ``0`` when
    none of the ``name_entity`` columns equals 1 for the row, otherwise
    ``position + 1`` of the first matching column. The 1-based numbering
    keeps class ``0`` reserved for "no entity" and matches the
    ``result == j + 1`` decoding used when predictions are read back.

    :param model: estimator exposing ``fit`` and ``score`` (e.g. a
        scikit-learn classifier)
    :type model: sklearn stuff
    :param name_entity: names of the 0/1 entity columns in the dataset
    :type name_entity: list
    :param features: names of the feature columns used for training
    :type features: list
    :param name_to_export: if not ``None``, the fitted model is dumped to
        ``<pkl_root>/<name_to_export>.pkl``
    :return: the value returned by ``model.fit`` (for scikit-learn
        estimators, the fitted estimator)
    """
    train_model = self.train_model

    # One label per row, computed positionally so the target always has
    # exactly as many entries as the dataframe has rows.
    entity_flags = train_model[list(name_entity)].eq(1).values
    multi = [
        next((position + 1 for position, hit in enumerate(row) if hit), 0)
        for row in entity_flags
    ]

    train_model['multi'] = multi
    X = train_model[features]
    y = train_model['multi']
    params_sk = model.fit(X, y)
    if name_to_export is not None:
        g = get_asset_root()
        joblib.dump(params_sk,
                    f'{g["pkl_root"]}/{name_to_export}.pkl',
                    compress=9)
    # Accuracy on the training data itself, logged for information only.
    LOGGER.info(model.score(X, y))
    return params_sk
def try_trained_model(txt_to_test: str,
                      model: str = "svm_all_features",
                      entity: list = None,
                      name_entity: list = None,
                      list_features: list = None):
    """
    Tag the words of a text with a previously exported model.

    The text is split on whitespace, the feature columns are computed with
    ``Training_database.do_feature_dataset`` and the model stored under the
    asset name ``model`` predicts one class per word. Class ``k`` (1-based)
    is reported under ``name_entity[k - 1]``; any other class is ignored.

    :param txt_to_test: raw text to analyse
    :type txt_to_test: str
    :param model: asset name of the exported model to load
    :type model: str
    :param entity: entity types known to the model; only its length is
        used. Defaults to ``['ORG', 'LOC', 'MISC', 'PER']``
    :type entity: list
    :param name_entity: output keys, in the same order as ``entity``.
        Defaults to ``['Organisations', 'Locations', 'Miscellaneaous',
        'Persons']``
    :type name_entity: list
    :param list_features: feature columns fed to the model; defaults to
        ``cst.list_features_en``
    :type list_features: list
    :return: dictionary ``{name: [words predicted as that entity]}``
    :rtype: dict
    """
    if entity is None:
        entity = ['ORG', 'LOC', 'MISC', 'PER']
    if name_entity is None:
        name_entity = [
            'Organisations', 'Locations', 'Miscellaneaous', 'Persons'
        ]
    if list_features is None:
        list_features = cst.list_features_en

    df_to_do = pd.DataFrame()
    df_to_do['Word'] = txt_to_test.split()
    df_to_do = Training_database.do_feature_dataset(df_to_do)

    cfg = get_asset_root()
    model_path = get_file_content(cfg, model)
    # NOTE: joblib.load unpickles the file; only load trusted model assets.
    model_clone = joblib.load(model_path)
    result = model_clone.predict(df_to_do[list_features])
    LOGGER.info(result)

    dict_entity = {label: [] for label in name_entity}
    # Predictions are positional, so pair them with the words positionally.
    words = df_to_do['Word'].tolist()
    # Only classes that have an output name can be reported.
    class_labels = name_entity[:len(entity)]
    for word, predicted in zip(words, result):
        for class_id, label in enumerate(class_labels, start=1):
            if predicted == class_id:
                dict_entity[label].append(word)
                break
    return dict_entity
def train_with_function(self,
                        model,
                        df: None,
                        features: list,
                        name_to_export: None,
                        entity: list = None):
    """
    Train ``model`` to predict the named-entity class of every row.

    A ``multi`` target column is added to the dataset: ``0`` when none of
    the ``entity`` columns equals 1 for the row, otherwise ``position + 1``
    of the first matching column. A row flagged for several entities
    therefore still gets exactly one label.

    :param model: estimator exposing ``fit``, ``score`` and ``predict``
        (e.g. a scikit-learn classifier)
    :type model: sklearn stuff
    :param df: dataset to train on; when ``None`` the dataset already
        stored in ``self.df`` is used, otherwise it replaces ``self.df``
    :param features: names of the feature columns used for training
    :type features: list
    :param name_to_export: if not ``None``, the fitted model is dumped to
        ``<pkl_root>/<name_to_export>.pkl``
    :param entity: names of the 0/1 entity columns, stored in
        ``self.entity``; defaults to ``['ORG', 'LOC', 'PER']``
    :type entity: list
    :return: the value returned by ``model.fit``, also stored in
        ``self.trained_model``
    """
    # Own copy per call: the stored list must not alias a shared default
    # or the caller's list.
    self.entity = ['ORG', 'LOC', 'PER'] if entity is None else list(entity)
    if df is not None:
        self.df = df
    train_model = self.df

    # One label per row, computed positionally so the target always has
    # exactly as many entries as the dataframe has rows.
    entity_flags = train_model[self.entity].eq(1).values
    multi = [
        next((position + 1 for position, hit in enumerate(row) if hit), 0)
        for row in entity_flags
    ]
    # Descriptions are the keys: with lengths as keys, two equal lengths
    # collapsed into a single entry.
    LOGGER.info({
        "len trained model": len(train_model),
        "len multi": len(multi)
    })

    train_model['multi'] = multi
    X = train_model[features]
    y = train_model['multi']
    params_sk = model.fit(X, y)
    if name_to_export is not None:
        g = get_asset_root()
        joblib.dump(params_sk,
                    f'{g["pkl_root"]}/{name_to_export}.pkl',
                    compress=9)
    # Both metrics are computed on the training data itself.
    LOGGER.info(model.score(X, y))
    y_pred = model.predict(X)
    LOGGER.info(sklearn.metrics.confusion_matrix(y, y_pred))
    self.trained_model = params_sk
    return params_sk
def gazetteer(df: pd.DataFrame, language: str = 'ENG'):
    """
    Add one 0/1 column per gazetteer type to ``df``.

    For every type returned by ``get_type_of_gazetteers(cfg, 'en')`` the
    ``cst.LOWERCASE`` column of each matching gazetteer CSV is collected,
    and a row gets ``1`` in the column named after the type when its own
    ``cst.LOWERCASE`` value appears in one of those files.

    :param df: dataset holding a ``cst.LOWERCASE`` column; modified in place
    :type df: pd.DataFrame
    :param language: currently unused, the English gazetteers are always
        loaded
    :type language: str
    :return: the same dataframe, with the gazetteer columns added
    :rtype: pd.DataFrame
    """
    cfg = get_asset_root()
    list_gaz = get_type_of_gazetteers(cfg, 'en')
    for gaz_type in list_gaz:
        # Merge all files of this type into one set: a single constant-time
        # membership test per word instead of scanning every list.
        entries = set()
        for csv_path in get_file_content(cfg, 'gazetteer_en',
                                         gaztype=gaz_type):
            entries.update(pd.read_csv(csv_path)[cst.LOWERCASE])
        # Iterate the column positionally so a non-default index is fine.
        df[gaz_type] = [
            1 if word in entries else 0 for word in df[cst.LOWERCASE]
        ]
    return df
def preuni_factory(df: pd.DataFrame, directory: str = "pre_freq_CONLL2003"):
    """
    Add the ``preuniORG/LOC/PER/MISC`` 0/1 columns to ``df``.

    For each column, the JSON asset ``<directory>/<column>`` is loaded and
    its entry stored under the column name is used as lookup. A row gets
    ``1`` when the ``cst.LOWERCASE`` value of the *previous* row is found in
    that lookup; the first row has no predecessor and always gets ``0``.

    :param df: dataset holding a ``cst.LOWERCASE`` column; modified in place
    :type df: pd.DataFrame
    :param directory: asset directory containing the ``preuni*`` files
    :type directory: str
    :return: the same dataframe, with the four columns added
    :rtype: pd.DataFrame
    """
    preuni = {
        f'preuni{tag}': f'{directory}/preuni{tag}'
        for tag in ('ORG', 'LOC', 'PER', 'MISC')
    }
    cfg = get_asset_root()

    # Read the words once instead of materialising a full row per lookup.
    # An empty dataframe yields empty columns rather than a length-1 list
    # that cannot be assigned.
    row_count = len(df)
    previous_words = df[cst.LOWERCASE].tolist()[:-1] if row_count else []
    first_row = [0] if row_count else []

    for key, value in preuni.items():
        file_name = get_file_content(cfg, value)
        with open(file_name) as json_file:
            data = json.load(json_file)
        frequentname = data[key]
        LOGGER.info(frequentname)
        df[key] = first_row + [
            1 if word in frequentname else 0 for word in previous_words
        ]
    return df
def frequency_factory(df: pd.DataFrame, directory: str = "freq_names_CONLL2003"):
    """
    Add the ``FreqNAMES/ORG/LOC/PER/MISC`` 0/1 columns to ``df``.

    For each column, the JSON asset ``<directory>/freq<TAG>`` is loaded and
    its entry stored under the column name is used as lookup. A row gets
    ``1`` when its ``cst.LOWERCASE`` value is found in that lookup, ``0``
    otherwise.

    :param df: dataset holding a ``cst.LOWERCASE`` column; modified in place
    :type df: pd.DataFrame
    :param directory: asset directory containing the ``freq*`` files
    :type directory: str
    :return: the same dataframe, with the five columns added
    :rtype: pd.DataFrame
    """
    asset_by_column = {
        f'Freq{tag}': f'{directory}/freq{tag}'
        for tag in ('NAMES', 'ORG', 'LOC', 'PER', 'MISC')
    }
    cfg = get_asset_root()
    for column, asset in asset_by_column.items():
        json_path = get_file_content(cfg, asset)
        with open(json_path) as handle:
            frequent_words = json.load(handle)[column]
        LOGGER.info(frequent_words)
        df[column] = [
            1 if getattr(row, cst.LOWERCASE) in frequent_words else 0
            for row in df.itertuples(index=True, name='Pandas')
        ]
    return df
def clean_and_setup_training(
        cls,
        name: str,
        language: str = cst.NO_LANGUAGE) -> Union[object, None]:
    """
    Load a raw dataset into a new ``Training_database``.

    The CSV asset registered under ``name`` is read with its first column
    as index and stored in ``df``; no further processing is done here.

    :param name: asset name of the raw dataset, also stored as ``df_name``
    :type name: str
    :param language: language of the dataset
    :type language: str
    :return: the new Training_database object, or the string
        ``"Error when trying to read the dataframe"`` when the CSV cannot
        be read
    :rtype: Union[object, None]
    """
    self = Training_database()
    self.df_name = name
    self.language = language
    cfg = get_asset_root()
    csv_path = get_file_content(cfg, name)
    try:
        self.df = pd.read_csv(filepath_or_buffer=csv_path, index_col=0)
    except Exception:
        # Same error contract as before, but KeyboardInterrupt / SystemExit
        # are no longer swallowed.
        return "Error when trying to read the dataframe"
    return self
model_clone = joblib.load(directory) df_test = df_to_do[list_features] result = model_clone.predict(df_test) LOGGER.info(result) dict_entity = {} for j in name_entity: dict_entity[j] = [] for i in range(0, len(result)): for j in range(0, len(entity)): if result[i] == j + 1: dict_entity[name_entity[j]].append(df_to_do['Word'][i]) return dict_entity if __name__ == "__main__": cfg = get_asset_root() # directory = get_file_content(cfg, "French_own_data/frenchreuters_trained") # df = pd.read_csv(directory) # g = Training_database() # rdm_forest = RandomForestClassifier(n_estimators=20, verbose=True) # svm_linear = svm.LinearSVC() # svm_multi = svm.SVC(kernel='rbf', C=1) # h = g.train_with_function(rdm_forest, df, features=cst.list_features_fr_no_caps, name_to_export="rdm_forest_with_debut_fr_no_caps") # h = g.train_with_function(svm_linear, df, features=cst.list_features_fr_no_caps, name_to_export="svm_linear_easy_fr_no_caps") text = 'Air France a décidé de garder les dividendes de Lagardère qui s apprete à recevoir un prix Nobel' g = Training_database.try_trained_model(text, "rdm_forest_with_debut_fr", list_features=cst.list_features_fr) LOGGER.info(g)