def set_database():
    """ This function sets up the database for the corpus documents """
    corpus = "/home/mudit/git_repo/News/Corpus"
    ch = raw_input("1.Set corpus path\n2.Use default path\n")
    if ch == "1":  # raw_input returns a string, so compare against "1", not the int 1
        corpus = raw_input("Enter complete path of corpus:")
    cwd = os.getcwd()
    db = createdb.database()
    doc_list = BOW(db, corpus, cwd)
    N = db.get_no_of_doc()
    Total_words = db.get_total_words()
    print "Number of documents in corpus:" + str(N)
    print "Total number of words:" + str(Total_words)
    fil = open("query_setup.txt", "w")
    fil.write(str(N) + "\n")
    fil.write(str(Total_words) + "\n")
    fil.close()
    fil2 = open("doc_map.txt", "w")
    for k, v in doc_list.items():
        fil2.write(str(k) + " " + str(v) + "\n")
    fil2.close()
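# The BOW(db, corpus, cwd) call above is project-specific and its implementation is not
# shown here. As a rough, hypothetical sketch (assuming it walks the corpus directory,
# tokenizes each file, and returns a {doc_id: filename} map alongside per-document
# term counts), it might look like this:
import os
from collections import Counter

def bow_sketch(corpus_dir):
    """Hypothetical bag-of-words pass over a corpus directory.

    Returns ({doc_id: filename}, {doc_id: Counter of term frequencies}).
    """
    doc_map = {}
    term_counts = {}
    for doc_id, fname in enumerate(sorted(os.listdir(corpus_dir))):
        path = os.path.join(corpus_dir, fname)
        with open(path) as f:
            tokens = f.read().lower().split()
        doc_map[doc_id] = fname
        term_counts[doc_id] = Counter(tokens)
    return doc_map, term_counts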
def load_data(self, file, validate=True, label=True):
    fname = file[:-5]

    ### THIS BLOCK USED TO PREPROCESS TRAINING DATA ###
    all_avail = pandas.read_json(open(file, 'r'))
    all_avail['text'] = all_avail['text'].apply(func=self.preprocess)
    pickle.dump(all_avail, open('pickles/' + fname + '.pickle', 'wb'))
    ### UNCOMMENT TO LOAD SAVED PREPROCESSED FILES
    #all_avail = pickle.load(open('pickles/' + fname + '.pickle', 'rb'))
    print('Finished preprocessing')

    # 80-20 split for training and development
    if validate:
        train_set = all_avail.sample(frac=0.8, random_state=1)
        dev_set = all_avail.drop(train_set.index)
    else:
        train_set = all_avail

    # Instantiate bag-of-words object
    bow = BOW(train_set)
    # Create vocabulary
    bow.create_bigram_vocabulary(500)
    bow.create_vocabulary(500)
    print('Finished BOW')

    self.feats = FeatureExtractor(bow)
    train_X, train_label = self.feats.df_to_feats_skl(train_set, label)
    joblib.dump(self.feats, 'pickles/feats.pickle')
    if validate:
        dev_X, dev_label = self.feats.df_to_feats_skl(dev_set, label)
        return train_X, train_label, dev_X, dev_label
    else:
        return train_X, train_label
sen_len = 40
fix_embedding = True  # fix embedding during training
batch_size = 128
epoch = 30
lr = 0.0001
# model_dir = os.path.join(path_prefix, 'model/')  # model directory for checkpoint model
model_dir = path_prefix  # model directory for checkpoint model

print("loading data ...")
# Read in 'training_label.txt' and 'training_nolabel.txt'
train_x, y = utils.load_training_data(train_with_label)
train_x_no_label = utils.load_training_data(train_no_label)
test_x = utils.load_testing_data(testing_data)

# Preprocess the inputs and labels
max_len = 1200
b = BOW(max_len=max_len)
b.bow(train_x, test_x)
train_x = b['train']
y = [int(label) for label in y]
y = torch.LongTensor(y)

# Create a model instance
model = D_Net(embedding_dim=max_len, num_layers=1)
# device is "cuda": the model trains on the GPU (the inputs fed in must also be cuda tensors)
model = model.to(device)

# Split the data into training and validation sets
# (part of the training data is held out as validation data)
X_train, X_val, y_train, y_val = train_x[:190000], train_x[190000:], y[:190000], y[190000:]
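# The BOW class above (constructed with max_len, filled via b.bow(train_x, test_x), and
# indexed with b['train']) is project-specific. A minimal sketch of what it might look
# like, assuming it caps the vocabulary at max_len features and exposes dense count
# vectors for the train and test splits:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

class BOWCountsSketch(object):
    def __init__(self, max_len=1200):
        self.vectorizer = CountVectorizer(max_features=max_len)
        self.data = {}

    def bow(self, train_sentences, test_sentences):
        # Sentences are assumed to be lists of tokens; join them back into strings.
        train_docs = [" ".join(s) for s in train_sentences]
        test_docs = [" ".join(s) for s in test_sentences]
        self.data['train'] = self.vectorizer.fit_transform(train_docs).toarray().astype(np.float32)
        self.data['test'] = self.vectorizer.transform(test_docs).toarray().astype(np.float32)

    def __getitem__(self, key):
        return self.data[key]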
correctedData = correctedData.values

print("-------------CREATING TEST AND TRAIN INDICES------------------")
#indxTest, indxTrain, indxVal = utils.separate_dataset(correctedData, cantidad_preg)
#cant_test = len(indxTest)
#cant_train = len(indxTrain)
#cant_val = len(indxVal)
#print("number of test patterns: ", cant_test)
#print("number of training patterns: ", cant_train)
#print("number of validation patterns: ", cant_val)
print("-------------FINISHED CREATING TEST AND TRAIN INDICES------------------\n")

stoplist = stopwords.words('spanish')
text = correctedData[:, 1]
bow_unigram = BOW(text, 'ascii', stoplist, weighting=True)

print("-------------CREATING Ytest and Ytrain (ground truth for each subset), Xtext_test and Xtext_train-------------------")
Y = np.zeros((cantidad_preg), dtype=np.int64)
for i in range(cantidad_preg):
    Y[i] = correctedData[i, 0]
Y = torch.from_numpy(Y)
print(Y)
"""
#assert cant_test + cant_train == cantidad_preg
#Ytrain = np.zeros((cant_train, 1), dtype=np.int64)
Ytrain = np.zeros((cant_train), dtype=np.int64)  # classes of the patterns in the train subset
Xtrain = np.zeros((cant_train, bow_unigram.X.shape[1]), dtype=np.float)  # option to improve performance: initialize each X matrix as sparse
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import torch
from sklearn.metrics import balanced_accuracy_score
from mpl_toolkits import mplot3d
import scipy.interpolate as interp
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import GradientBoostingClassifier

# Comment out this line if the previous (commented) read line is uncommented instead
correctedData = pn.read_csv(
    "C:/Users/lucy/chatbot/preprocessedQuestions_lem_completadas.csv",
    delimiter=',',
    header=None)
cantidad_preg = correctedData.shape[0]
correctedData = correctedData.values

stoplist = stopwords.words('spanish')
bow_unigram = BOW(correctedData[:, 1], 'ascii', stoplist, weighting=False)

Y = np.zeros((cantidad_preg), dtype=np.int64)
for i in range(cantidad_preg):
    Y[i] = correctedData[i, 0]
Y = torch.from_numpy(Y)

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in the tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
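# The lists above are hyperparameter candidates for a random-forest search. A minimal
# sketch of how they could be fed to RandomizedSearchCV follows; the remaining grid
# entries and the search settings here are illustrative assumptions, not taken from the
# original code:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': [2, 5, 10],  # illustrative values
}
rf_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=random_grid,
    n_iter=20,
    cv=3,
    random_state=42,
    n_jobs=-1)
# rf_search.fit(bow_unigram.X, Y.numpy())  # fit on the bag-of-words features and labels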
#trainX_RF = pn.read_csv("trainX_RF.csv", header=None, delimiter=',').values
#trainY_RF = pn.read_csv("trainY_RF.csv", header=None, delimiter=',').values
#testX_RF = pn.read_csv("testX_RF.csv", header=None, delimiter=',').values
#testY_RF = pn.read_csv("testY_RF.csv", header=None, delimiter=',').values

# Comment out this line if the previous (commented) read lines are uncommented instead
correctedData = pn.read_csv("C:/Users/lucy/chatbot/preprocessedQuestions_lem.csv", delimiter=',')
cantidad_preg = correctedData.shape[0]
correctedData = correctedData.values
print(type(correctedData))
print(correctedData.dtype)

Xtrain_text, trainY_RF, Xtest_text, testY_RF, _, _ = utils.separate_dataset(correctedData, cantidad_preg, validation=False)
# print('Everything OK up to here')

# Instantiate the bag-of-words model
stoplist = stopwords.words('spanish')
print(Xtrain_text.shape)
bow_unigram = BOW(Xtrain_text.ravel(), 'ascii', stoplist, weighting=False)
trainX_RF = bow_unigram.X
testX_RF = bow_unigram.vectorizer.transform(Xtest_text.ravel())

print('Training Features Shape:', trainX_RF.shape)  # num_patterns x num_features
print('Training Labels Shape:', trainY_RF.shape)    # column vector
print('Testing Features Shape:', testX_RF.shape)
print('Testing Labels Shape:', testY_RF.shape)      # column vector

# Import matplotlib for plotting and use magic command for Jupyter Notebooks
# # Set the style
# plt.style.use('fivethirtyeight')
print('Instantiated the model...')
# rf = RandomForestRegressor(n_estimators=10000, random_state=42)
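# The BOW wrapper used in the two snippets above (BOW(text, 'ascii', stoplist, weighting=...),
# exposing .X and .vectorizer) is project-specific. A plausible sketch, assuming that
# weighting=True means TF-IDF weights and weighting=False means raw term counts:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class BOWVectorizerSketch(object):
    def __init__(self, texts, encoding, stoplist, weighting=False):
        if weighting:
            self.vectorizer = TfidfVectorizer(encoding=encoding, stop_words=stoplist)
        else:
            self.vectorizer = CountVectorizer(encoding=encoding, stop_words=stoplist)
        # Sparse document-term matrix, one row per input text.
        self.X = self.vectorizer.fit_transform(texts)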
    # (continuation of an if-branch that loads the raw JSON; the condition is above this excerpt)
    train_set = json.load(open(train_file, 'r'))
    print('Loaded Json')
    train_set_preprocessed = preprocess(train_set)
else:
    train_set_preprocessed = pickle.load(open('train_set_stop.pickle', 'rb'))
    print('Loaded from Pickle')

# 80-20 split for train and dev
train_percent = 80
train_value = math.floor(.8 * len(train_set_preprocessed))
train_set = train_set_preprocessed[:train_value]
dev_set = train_set_preprocessed[train_value:]

bow = BOW(train_set_preprocessed)
bow.calculate_independent_words()
pickle.dump(bow, open('bow.pickle', 'wb'))
print('Finished BOW')

train = [(feature_extractor.extract_all(x), int(x['stars'])) for x in train_set]
dev = [(feature_extractor.extract_all(x), int(x['stars'])) for x in dev_set]
print(train[:5])

model = NBModel(train, dev)
print(model.validate())
print(model.informative_features())
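# NBModel above is project-specific; given the (feature_dict, label) pairs it receives,
# it likely wraps nltk.NaiveBayesClassifier. A hypothetical sketch along those lines:
import nltk

class NBModelSketch(object):
    def __init__(self, train, dev):
        self.dev = dev
        self.classifier = nltk.NaiveBayesClassifier.train(train)

    def validate(self):
        # Accuracy on the held-out development set.
        return nltk.classify.accuracy(self.classifier, self.dev)

    def informative_features(self, n=10):
        return self.classifier.most_informative_features(n)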
args = parser.parse_args()
for arg in vars(args):
    print(arg, getattr(args, arg))

if args.task == "tagging":  # ("tagging") is not a tuple, so compare the string directly
    data = TaggingData(args)
    model = MajorityVote()
    model.train(data)
    print("Storing model and data")
    with open(args.model, "wb") as F:
        pickle.dump(model, F)
    with open(args.data_file, "wb") as F:
        pickle.dump(data, F)
elif args.task == "classification":
    data = ClassifyData(args)
    model = BOW()
    model.train(data)
    print("Storing model and data")
    with open(args.model, "wb") as F:
        pickle.dump(model, F)
    with open(args.data_file, "wb") as F:
        pickle.dump(data, F)
# define dataset & dataloader
##########
train_dataset = MSADataset(dataset['train'][0], dataset['train'][1], corpus)
valid_dataset = MSADataset(dataset['valid'][0], dataset['valid'][1], corpus)
print("training data = {}".format(len(train_dataset)))
print("validation data = {}".format(len(valid_dataset)))

MSADataLoader = partial(DataLoader, collate_fn=collate2)
train_dataloader = MSADataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = MSADataLoader(valid_dataset, batch_size=batch_size)
##########

# define model
##########
model = BOW(len(vocab), embed_size, len(corpus))
model = torch.load("model3.p")  # overwrites the fresh model with a saved checkpoint
cuda = torch.cuda.is_available()
if cuda:
    model.cuda()
opt = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

def hasnan(x):
    n = (x.data != x.data).sum()
    return n != 0

def clip_grad(grad, clip=20):
    thres = torch.ones(grad.data.size()) * clip
    if cuda:
        thres = thres.cuda()
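# The BOW model above (constructed as BOW(len(vocab), embed_size, len(corpus))) is
# project-specific. A minimal sketch of a bag-of-words classifier with that signature,
# assuming it averages token embeddings and projects to one score per label:
import torch
import torch.nn as nn

class BOWClassifierSketch(nn.Module):
    def __init__(self, vocab_size, embed_size, num_labels):
        super(BOWClassifierSketch, self).__init__()
        self.embed = nn.EmbeddingBag(vocab_size, embed_size, mode='mean')
        self.out = nn.Linear(embed_size, num_labels)

    def forward(self, token_ids):
        # token_ids: LongTensor of shape (batch, seq_len) holding vocabulary indices.
        return self.out(self.embed(token_ids))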