def __init__(self, Xdata=None, Ydata=None, csv=None, xlsx=None, x_col='X', y_col='Y', models='all', test_frac=0.1, train_frac=0.9):
    """Prepare train/test text-classification datasets for the chosen models.

    Data may come from in-memory sequences (``Xdata``/``Ydata``), a CSV
    file, or an XLSX file; if no data source is supplied, the IMDB
    dataset is downloaded as a default.

    Args:
        Xdata: Sequence of input texts (list or pandas Series), or None.
        Ydata: Sequence of labels (ints or strings), or None.
        csv: Path to a CSV file with columns ``x_col`` and ``y_col``, or None.
        xlsx: Path to an Excel file with columns ``x_col`` and ``y_col``, or None.
        x_col: Column name holding the texts when reading csv/xlsx.
        y_col: Column name holding the labels when reading csv/xlsx.
        models: 'all', 'count-vectorizer', or 'transformers'.
        test_frac: Fraction of data for the test split.
        train_frac: Fraction of data for the train split.

    Side effects:
        Sets ``self.model_list``, ``self.num_labels``,
        ``self.train_dataset_raw``, ``self.test_dataset_raw`` and
        ``self.all_metrics``; prints progress/diagnostic messages.
        Returns early (leaving the instance partially initialized) on
        invalid ``models`` or inconsistent data arguments, matching the
        original print-and-return error style.
    """
    if models == 'all':
        self.model_list = [
            'bert-base-uncased',
            'albert-base-v2',
            'roberta-base',
            'linear_SVM',
            'multinomial_naive_bayesian',
        ]
    elif models == 'count-vectorizer':
        self.model_list = [
            'linear_SVM',
            'multinomial_naive_bayesian',
        ]
    elif models == 'transformers':
        self.model_list = [
            'bert-base-uncased',
            'albert-base-v2',
            'roberta-base',
        ]
    else:
        print('Models not recognized, the available options are currently "all", "count-vectorizer", and "transformers"')
        return

    # Reject over-specified input: at most one data source may be given.
    if csv is not None and xlsx is not None and Xdata is not None:
        print("You have provided too much data, give just x and y data, or a csv or xlsx file!")
        return

    if csv is not None:
        csv_data = pd.read_csv(csv)
        Xdata = csv_data[x_col]
        Ydata = csv_data[y_col]
    if xlsx is not None:
        xlsx_data = pd.read_excel(xlsx)
        Xdata = xlsx_data[x_col]
        Ydata = xlsx_data[y_col]

    # Normalize pandas Series to plain lists so downstream len()/min()/
    # identity checks behave uniformly.
    if isinstance(Xdata, pd.Series):
        print('converting pandas series to list')
        Xdata = list(Xdata)
    if isinstance(Ydata, pd.Series):
        print('converting pandas series to list')
        Ydata = list(Ydata)

    # Simplification of the original three-way None test: fall back to
    # the default dataset whenever either side of the data is missing.
    if Xdata is None or Ydata is None:
        print('Either you have not put in your own data, or you have only put in X or Y data, loading default dataset...')
        self.train_dataset_raw, self.test_dataset_raw = load_dataset('imdb', split=['train', 'test'])
        X = self.train_dataset_raw['text'] + self.test_dataset_raw['text']
        Y = self.train_dataset_raw['label'] + self.test_dataset_raw['label']
        keys = set(Y)
    else:
        # BUG FIX: validate length agreement up front, on the user-data
        # branch only. The original ran len(Xdata) checks unconditionally,
        # which raises TypeError (len(None)) on the default-dataset path,
        # and only checked the mismatch after choosing split fractions.
        if len(Xdata) != len(Ydata):
            print('ERROR: X data and Y data lengths are not the same size, they need to be!')
            return
        X = Xdata
        Y = Ydata
        if all(isinstance(n, int) for n in Y):
            keys = set(Y)
        else:
            Y, keys = string_labels_to_int(Y)
        # Shift labels so the minimum label is 0 (min() hoisted out of
        # the comprehension; the original recomputed it per element).
        offset = min(Y)
        if offset >= 1:
            Y = [y - offset for y in Y]
        if len(X) < 20:
            print('dataset is really small, using default test/train split (0.25)')
            # None lets train_test_split fall back to its default split.
            test_frac = None
            train_frac = None
        if len(X) < 8:
            print('dataset is really too small, using default test/train split (0.5)')
            test_frac = 0.5
            train_frac = 0.5

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, test_size=test_frac, train_size=train_frac)
    self.num_labels = len(keys)
    print('X_train length: ' + str(len(X_train)))
    print('X_test length: ' + str(len(X_test)))
    print('Y_train length: ' + str(len(Y_train)))
    print('Y_test length: ' + str(len(Y_test)))
    self.train_dataset_raw = Dataset.from_pandas(pd.DataFrame({'text': X_train, 'labels': Y_train}))
    self.test_dataset_raw = Dataset.from_pandas(pd.DataFrame({'text': X_test, 'labels': Y_test}))
    self.all_metrics = {}
"SentiNews_sentence_test.tsv", sep="\t") df_document_sl_hr_train = pd.read_csv(sentinews_location + "HRSLSentiNews_document_train.tsv", sep="\t") df_document_sl_hr_valid = pd.read_csv(sentinews_location + "HRSLSentiNews_document_valid.tsv", sep="\t") # NO test hr mixed as HR test will be used as final test # gather everyone if you want to have a single DatasetDict document = DatasetDict({ "train": Dataset.from_pandas(df_document_sl_hr_train), "valid": Dataset.from_pandas(df_document_sl_hr_valid), "test": Dataset.from_pandas(df_document_croatian_test) }) # document.save_to_disk("sentinews-document") # gather everyone if you want to have a single DatasetDict paragraph = DatasetDict({ "train": Dataset.from_pandas(df_paragraph_train), "valid": Dataset.from_pandas(df_paragraph_valid), "test": Dataset.from_pandas(df_paragraph_test), }) # paragraph.save_to_disk("sentinews-paragraph") # gather everyone if you want to have a single DatasetDict sentence = DatasetDict({