def loadData():
    """Load the WOS-5736 subset: level-1 labels for the top classifier and, for each
    level-1 class, the documents and level-2 labels used by its child classifier."""
    WOS.download_and_extract()
    fname = os.path.join(path_WOS, "WebOfScience/WOS5736/X.txt")
    fnamek = os.path.join(path_WOS, "WebOfScience/WOS5736/YL1.txt")
    fnameL2 = os.path.join(path_WOS, "WebOfScience/WOS5736/YL2.txt")

    with open(fname) as f:
        content = f.readlines()
        content = [text_cleaner(x) for x in content]
    with open(fnamek) as fk:
        contentk = fk.readlines()
    contentk = [x.strip() for x in contentk]
    with open(fnameL2) as fk:
        contentL2 = fk.readlines()
    contentL2 = [x.strip() for x in contentL2]

    Label = np.matrix(contentk, dtype=int)
    Label = np.transpose(Label)
    number_of_classes_L1 = np.max(Label) + 1  # number of classes in Level 1

    Label_L2 = np.matrix(contentL2, dtype=int)
    Label_L2 = np.transpose(Label_L2)
    np.random.seed(7)
    print(Label.shape)
    print(Label_L2.shape)

    # Each sample now carries (level-1 label, level-2 label).
    Label = np.column_stack((Label, Label_L2))
    number_of_classes_L2 = np.zeros(number_of_classes_L1, dtype=int)

    X_train, X_test, y_train, y_test = train_test_split(content, Label,
                                                         test_size=0.2, random_state=0)

    # Bag-of-words features, fitted on the training split only.
    vectorizer_x = CountVectorizer()
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()

    # One bucket per level-1 class for the level-2 labels and documents.
    L2_Train = []
    L2_Test = []
    content_L2_Train = []
    content_L2_Test = []
    for i in range(0, number_of_classes_L1):
        L2_Train.append([])
        L2_Test.append([])
        content_L2_Train.append([])
        content_L2_Test.append([])

    for i in range(0, X_train.shape[0]):
        L2_Train[y_train[i, 0]].append(y_train[i, 1])
        number_of_classes_L2[y_train[i, 0]] = max(number_of_classes_L2[y_train[i, 0]],
                                                  y_train[i, 1] + 1)
        content_L2_Train[y_train[i, 0]].append(X_train[i])

    for i in range(0, X_test.shape[0]):
        L2_Test[y_test[i, 0]].append(y_test[i, 1])
        content_L2_Test[y_test[i, 0]].append(X_test[i])

    for i in range(0, number_of_classes_L1):
        L2_Train[i] = np.array(L2_Train[i])
        L2_Test[i] = np.array(L2_Test[i])
        content_L2_Train[i] = np.array(content_L2_Train[i])
        content_L2_Test[i] = np.array(content_L2_Test[i])

    return (X_train, y_train, X_test, y_test,
            content_L2_Train, L2_Train,
            content_L2_Test, L2_Test,
            number_of_classes_L2)
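# Hypothetical example (not part of the original code): loadData() groups the level-2
# labels and their bag-of-words vectors by level-1 class, so one child model can be
# trained per parent. The sketch below uses scikit-learn's LogisticRegression as a
# stand-in for whatever level-2 model is actually used; the name train_level2_models
# is an assumption, and any per-parent classifier could be substituted.
def train_level2_models(content_L2_Train, L2_Train):
    from sklearn.linear_model import LogisticRegression
    models = []
    for parent in range(len(L2_Train)):
        clf = LogisticRegression(max_iter=1000)
        # Each child classifier only ever sees documents whose level-1 label is `parent`.
        clf.fit(content_L2_Train[parent], L2_Train[parent])
        models.append(clf)
    return models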
'''
* Comments and Error: email: [email protected]
'''
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import WOS_input as WOS
import Download_Glove as GloVe
import numpy as np
import os

''' Location of the dataset '''
path_WOS = WOS.download_and_extract()
GLOVE_DIR = GloVe.download_and_extract()
print(GLOVE_DIR)


def clean_str(string):
    """
    Tokenization/string cleaning for the dataset.
    Strips backslashes and quote characters and lower-cases the text.
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()


def text_cleaner(text):
    # Remove periods, then apply the generic string cleaning above.
    # (The trailing return is assumed: loadData() expects a cleaned string back.)
    text = text.replace(".", "")
    return clean_str(text)
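# Illustrative usage sketch (an assumption, not part of the original script): load the
# Web of Science data and report the shape of the two-level hierarchy. The __main__
# guard keeps the example from running when this module is imported.
if __name__ == "__main__":
    (X_train, y_train, X_test, y_test,
     content_L2_Train, L2_Train,
     content_L2_Test, L2_Test,
     number_of_classes_L2) = loadData()
    print("Level-1 training matrix:", X_train.shape)
    for parent, n_child in enumerate(number_of_classes_L2):
        print("Level-1 class %d: %d training documents, %d level-2 classes"
              % (parent, len(L2_Train[parent]), n_child))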