def load_data(file):
    mapping = load_embeddings('glove.6B.50d.txt')
    preprocess = Preprocess()
    file = open(file, "r")
    data_x = []
    data_y = []
    sentence = []
    categories = []
    for line in file:
        if len(line.split()) == 0:
            # Blank line marks the end of a sentence: pad and store it
            sentence, categories = add_padding(sentence, categories, 20)
            data_x.append(sentence)
            data_y.append(categories)
            sentence = []
            categories = []
        else:
            if (line.split()[0]).lower() in mapping:
                word_embedding = mapping[(line.split()[0]).lower()]
                word_category = one_hot_encoding(line.split()[2])
                sentence.append(word_embedding)
                categories.append(word_category)
            else:
                # Out-of-vocabulary word: use a zero vector
                sentence.append(np.zeros(50))
                categories.append(one_hot_encoding(line.split()[2]))
    data_x = np.array(data_x)
    data_y = np.array(data_y)
    return data_x, data_y
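# Hedged sketch (not from the original project): minimal stand-ins for the helpers
# the loader above assumes: load_embeddings, one_hot_encoding and add_padding.
# The tag inventory TAGS and the zero-vector padding are illustrative assumptions,
# not the author's definitions.
import numpy as np

def load_embeddings(path):
    # Map each word to its GloVe vector (50-d for glove.6B.50d.txt).
    mapping = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            mapping[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return mapping

TAGS = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]  # assumed label set

def one_hot_encoding(tag):
    # One-hot vector over the assumed tag set.
    vec = np.zeros(len(TAGS))
    vec[TAGS.index(tag)] = 1.0
    return vec

def add_padding(sentence, categories, max_len):
    # Truncate or zero-pad both sequences to exactly max_len entries.
    pad_x = [np.zeros(50)] * max(0, max_len - len(sentence))
    pad_y = [np.zeros(len(TAGS))] * max(0, max_len - len(categories))
    return sentence[:max_len] + pad_x, categories[:max_len] + pad_y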
def main(args):
    pp = Preprocess()

    # ---- Run Spotlight ----
    # graphs = pp.parse_tcp_dump(args.tcp_folder, args.csv_file)
    # tcp = ad.read_csv_file(args.csv_file)
    # graphs = tcp.iloc[:, [0, 1, 3]]
    # graphs.columns = ['source', 'destination', 'hours_past']
    # run_spotlight(args, np.array(graphs))

    # ---- Run Shingle Sketch -----
    # graph_utils.create_graphs(args.csv_file, args.graph_folder)
    is_gexf = False
    graphs = pp.preprocess_gfiles(args.graph_folder)

    # --- For Muta or Chemical Data ----
    # graphs = pp.preprocess_gexf(args.gexffile)
    # is_gexf = True

    # --- For DOS Attack Data ---
    # graphs = pp.preprocess_single_gfile("data/dos.g")

    run_shingle(args, graphs, is_gexf)

    ad = AnomalyDetection()
    skvector = ad.read_sketch(args.sketch_vector)
    print(skvector.shape)
    ad.anomaly_detection(skvector, args)
def algorithm(df, params):
    """
    wrapper function to put each individual algorithm inside
    :param df: dataframe that contains all the input dataset
    :param params: algorithm specific parameters
    :return: a dictionary of { outputname: output content in memory }
    """
    output = {}

    # algorithm specific code
    # construct sentiment analysis
    PP = Preprocess(df, params['column'])

    output['phrases'] = PP.get_phrases()
    output['filtered'] = filtered_tokens = PP.get_words()
    output['processed'] = processed_tokens = PP.stem_lematize(
        params['process'], filtered_tokens)
    output['tagged'] = PP.tagging(params['tagger'], processed_tokens)
    filtered_most_common, processed_most_common = PP.most_frequent(
        filtered_tokens, processed_tokens)
    output['most_common'] = processed_most_common

    # plot
    index = []
    counts = []
    for common in processed_most_common[1:51]:
        index.append(common[0])
        counts.append(common[1])
    title = 'Top 50 frequent words (' + params['process'] + ')'
    output['div'] = plot.plot_bar_chart(index, counts, title)

    return output
def __init__(self, image=False, images=[], GTPath=""):
    if images:
        try:
            self.homogenized = numpy.array(Image.open(images[0]))
            self.vesselEnhanced = numpy.array(Image.open(images[1]))
            self.images = images
        except IndexError:
            print("""`images` parameter must include the homogenized image
                  at `images[0]` and vessel enhanced image at `images[1]`""")
            raise
    else:
        self.preprocess = Preprocess(image)
        self.homogenized = self.preprocess.process(enhance=False).image_array
        self.vesselEnhanced = self.preprocess.process(onlyEnhance=True).image_array
        self.mask = self.preprocess.mask
        self.source = image
        self.image = Image.open(image)
        self.loaded = self.image.load()
    if len(GTPath):
        self.gt = True
        self.groundtruth = Image.open(GTPath)
    else:
        self.gt = False
    self.feature_array = numpy.empty(0)
def load_data(file):
    mapping = load_embeddings('glove.6B.50d.txt')
    preprocess = Preprocess()
    data = pd.read_csv(file, encoding='latin-1',
                       names=['sentiment', 'id', 'date', 'q', 'nick', 'tweet'])
    data = data.sample(frac=1)
    data = data[:100000]
    data_x = []
    data_y = []
    for index in data.index:
        row = data.loc[index, :]
        if row['sentiment'] != 2:
            row['tweet'] = preprocess.preprocess(row['tweet'])
            tweet = []
            for word in row['tweet'].split():
                if word in mapping:
                    word_embedding = mapping[word]
                    tweet.append(word_embedding)
                else:
                    tweet.append(np.zeros(50))
            tweet = add_padding(tweet, 20)
            data_x.append(tweet)
            data_y.append(one_hot_encoding(row['sentiment']))
    data_x = np.array(data_x)
    data_y = np.array(data_y)
    return data_x, data_y
def __init__(self):
    classifier_path1 = "stanford/english.muc.7class.distsim.crf.ser.gz"
    # scenario 1
    # classifier_path2 = "stanford/id-ner-model-half.ser.gz"
    # scenario 2
    # classifier_path2 = "stanford/id-ner-model-id.ser.gz"
    # scenario 3
    # classifier_path2 = "stanford/id-ner-model-2.ser.gz"
    ner_jar_path = "stanford/stanford-ner.jar"

    # for handling error nltk internals
    nltk.internals.config_java(options='-xmx5g')

    self.pre = Preprocess()
    self.scp = StanfordParser(
        './stanford/stanford-parser.jar',
        './stanford/stanford-parser-3.9.1-models.jar',
        encoding='utf8')
    self.ner_tagger = StanfordNERTagger(classifier_path1, ner_jar_path, encoding='utf8')
    # for scenario 3
    self.pos_tagger = StanfordPOSTagger(
        './stanford/english-bidirectional-distsim.tagger',
        './stanford/stanford-postagger.jar',
        encoding='utf8')
    # combining classifier from Stanford with custom classifier
    # self.com_tagger = NERComboTagger(classifier_path1, ner_jar_path, stanford_ner_models=classifier_path1 + "," + classifier_path2)  # for scenario 1 and 2
    self.core_nlp = StanfordCoreNLP('http://localhost', port=9000)
def parse(self, response):
    '''
    This method is called repeatedly to process documents from the URL frontier.
    Scrapy handles compliance of politeness policies.
    '''
    url = response.request.url

    # Remove html tags from the document
    raw_text = GetText(response.body)

    # Preprocess the document's content
    tokens = Preprocess(raw_text)

    # Add document to be stored in local storage
    if self.count < LIMIT:
        self.dstore.add_document(tokens, response.body, url)

    # Extract url references and add them to the url frontier
    for a in response.css('a'):
        if 'href' in a.attrib:
            yield response.follow(a, callback=self.parse)

    # Limit of pages to crawl
    if self.count > LIMIT:
        raise CloseSpider(reason='reached_limit')  # Force spider to close

    print(str(self.count) + '\n\n')  # IGNORE/COMMENT THIS
    self.count += 1
def __init__(self, feature_list, **kwargs):
    """create a policy object that preprocesses according to feature_list and
    uses a neural network specified by keyword arguments (see create_network())
    """
    self.preprocessor = Preprocess(feature_list)
    kwargs["input_dim"] = self.preprocessor.output_dim
    self.model = CNNPolicy.create_network(**kwargs)
    self.forward = self._model_forward()
def test_feature_match(self, tresh, retest, img1, imgref):
    print("Test_feature_match")
    process = Preprocess("NA", "NA")
    fonte = cv2.FONT_HERSHEY_SIMPLEX
    try:
        x_detect, y_detect, score = process.feature_match(img1, imgref)
        print(str("Score of Feature Match") + str(score))
    except:
        score = 0
        print("except aqui")
    finally:
        url, CustomerName, Division, SerialNumber, AssemblyNumber, TesterName, ProcessStep, Operator = get_data_to_test()
        print("Teste de Serial:" + str(SerialNumber))
        now = datetime.now()
        dt_string = now.strftime("%d_%m_%Y_%H%M%S")
        if SerialNumber == "":
            SerialNumber = str("No_Serial" + str(dt_string))
        print(SerialNumber)

        # ----- Record the Serial Number -----
        self.Set_Serial_TestTime_List(SerialNumber)

        # RESULT OF TEST
        if score > int(tresh):
            cv2.putText(img1, "PASS - LABEL DETECTED", (50, 400), fonte, 3, (0, 255, 0), 3, cv2.LINE_AA)
            cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125, 255, 255), 1, cv2.LINE_AA)
            send_test_result("P")
            # cv2.imwrite("./logs/" + str(SerialNumber) + "_pass.jpg", img1)
        elif (score < int(tresh)) and (score >= 0):
            cv2.putText(img1, "FAIL- NO LABEL", (50, 400), fonte, 3, (0, 0, 255), 3, cv2.LINE_AA)
            cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125, 255, 255), 1, cv2.LINE_AA)
            # send_test_result("F")
            print("retest numbers:")
            print(str(self.Count_Serial_TestTime_Occurence(SerialNumber)))
            print(str(self.Get_Retest_Times_Before_Fail()))
            if self.Count_Serial_TestTime_Occurence(SerialNumber) > self.Get_Retest_Times_Before_Fail():
                send_test_result("F")
                # send_test_result_parser(ResultMes="F", Fail_Description=str("FAIL FIRMWARE VERSION " + str(string)))
                cv2.putText(img1, "MES REJECTION" + str(self.Count_Serial_TestTime_Occurence(SerialNumber)),
                            (50, 680), fonte, 1.5, (0, 0, 255), 2, cv2.LINE_AA)
            else:
                cv2.putText(img1, "RETEST NUMBER:" + str(self.Count_Serial_TestTime_Occurence(SerialNumber)),
                            (50, 680), fonte, 1.5, (0, 0, 255), 2, cv2.LINE_AA)
            # cv2.imwrite("./logs/" + str(SerialNumber) + "_fail.jpg", img1)
            # cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125, 255, 255), 1, cv2.LINE_AA)
    return score
def main(argv):
    pre = Preprocess(argv[0], argv[1])
    pre.build_vectors()
    dataset = ToxicityDataset(pre.vectors, pre.targets)

    # Without sentiment
    # gru = GRU(360).double()

    # With sentiment
    gru = GRU(373).double()

    if use_GPU:
        gru.cuda()

    training.train(gru, dataset, 2, 4, 0.1, use_gpu=use_GPU)
def preprocess(self, new, vector, chi2):
    self.prepro = Preprocess(self.data_set, new)
    if new == 'True':
        if vector == 'hashing':
            self.prepro.hashVector()
        if vector == 'tfidf':
            self.prepro.tfidfVector()
        # print self.preprocess.y_train
    else:
        self.prepro.vectorize(vector)
    if chi2:
        self.prepro.chisquare()
def get_data():
    files = os.listdir('./MealNoMealData')
    meal_data_files = []
    no_meal_data_files = []
    for file in files:
        if 'Nomeal' in file:
            no_meal_data_files.append(os.path.join('./MealNoMealData', file))
        else:
            meal_data_files.append(os.path.join('./MealNoMealData', file))

    data = []
    labels = []
    for meal_data_file, no_meal_data_file in zip(meal_data_files, no_meal_data_files):
        preprocess_obj = Preprocess(meal_data_file)
        meal_df = preprocess_obj.get_dataframe()
        meal_features = Features(meal_df)
        meal_features.compute_features()
        # temp_meal_features = meal_features.pca_decomposition().tolist()
        temp_meal_features = meal_features.get_features()
        labels += [1] * len(temp_meal_features)

        preprocess_obj_ = Preprocess(no_meal_data_file)
        no_meal_df = preprocess_obj_.get_dataframe()
        no_meal_features = Features(no_meal_df)
        no_meal_features.compute_features()
        no_meal_features_ = no_meal_features.get_features()
        # no_meal_final_features = meal_features.pca.transform(no_meal_features_).tolist()
        no_meal_final_features = no_meal_features_
        labels += [0] * len(no_meal_features_)

        for no_meal_feature in no_meal_final_features:
            temp_meal_features.append(no_meal_feature)
        for meal_no_meal_feature in temp_meal_features:
            data.append(meal_no_meal_feature)

    return data, labels
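# Hedged usage sketch: one way to feed get_data() into a scikit-learn classifier.
# The RandomForest choice mirrors the pickled 'RForest_model.pkl' referenced in the
# other meal/no-meal scripts, but the split and hyperparameters here are assumptions.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = get_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print("holdout accuracy:", accuracy_score(y_test, clf.predict(X_test)))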
def define_model(self, name):
    if self.is_trained == False:
        if name == 'preprocInc':
            # self.mod = MultinomialNB()
            self.mod = Pipeline([
                ('what', Preprocess()),
                ('a pain', MultinomialNB(alpha=0.05, fit_prior=False, class_prior=None))])
        else:
            print('Error selecting the model, choose by default Gaussian NB')
            self.mod = MultinomialNB()
    else:
        print("Model already load")
def queryProcess(self):
    preprocess = Preprocess()
    self.query = self.query.lower()
    self.query = preprocess.preprocess(self.query)
    tokenizer = RegexpTokenizer(r"[\d-]+\w+|[A-Z][.A-Z]+\b\.*|[\w-]+|'.*'")
    self.query_tokens = tokenizer.tokenize(self.query)
    if self.query_tokens[0] in wh_qstn_words:
        self.query_type = 1
    elif self.query_tokens[0] in ab_qstn_words:
        self.query_type = 2
    elif self.query_tokens[0] in desc_qstn_words:
        self.query_type = 3
    else:
        self.query_type = 4
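# Hedged sketch: example contents for the question-word lists queryProcess() checks
# against; the exact lists used by the original code are not shown and may differ.
wh_qstn_words = ["who", "what", "when", "where", "which", "whom", "whose"]
ab_qstn_words = ["is", "are", "was", "were", "do", "does", "did", "can", "could", "will"]
desc_qstn_words = ["how", "why", "describe", "define", "explain"]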
def lambda_handler(event, context):
    # TODO implement
    json_data = json.loads(event['body'])
    preprocess = Preprocess(json_data=json_data)
    preprocess.scale_points(calculate_scale=False)
    pose_objects = preprocess.new_pose_objects

    features = []
    features_obj = Features(pose_objects=pose_objects)
    features_obj.compute_features()
    features = features_obj.get_features()
    # pca_model = pickle.load(open('pca.pkl', 'rb'))
    # reduced_feature_matrix = pca_model.transform(features)

    s3 = boto3.resource('s3')
    svm_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("SVM_model.pkl").get()['Body'].read())
    logreg_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LogReg_model.pkl").get()['Body'].read())
    lda_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LDA_model.pkl").get()['Body'].read())
    random_forest_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("RForest_model.pkl").get()['Body'].read())

    prediction_rf = random_forest_classifier.predict(features)
    prediction_svm = svm_classifier.predict(features)
    prediction_lda = lda_classifier.predict(features)
    prediction_logreg = logreg_classifier.predict(features)

    data = {
        "1": prediction_svm[0],
        "2": prediction_logreg[0],
        "3": prediction_lda[0],
        "4": prediction_rf[0]
    }

    return {'statusCode': 200, 'body': json.dumps(data)}
import torch
from preprocessing import Preprocess
import json
from dataset import QADataset
from transformers import BertModel
import os
from model import Answer
from solver import Solver
import sys

arg = sys.argv

ctx_max_len = 475
question_max = 30
pre = Preprocess(ctx_max_len=ctx_max_len, question_max=question_max)
data = {}

if not os.path.isdir('processed_data'):
    os.mkdir('processed_data')
if not os.path.isdir('ckpt'):
    os.mkdir('ckpt')

# if arg[1] == 'train':
for name in ['dev', 'train']:
    if not os.path.isfile(f'processed_data/{name}.pkl'):
        print(f"Start {name}......")
        with open(f"data/{name}.json") as f:
            file = json.load(f)
        file = [data for data in file['data']]
        pre_data = pre.preprocess_data(file, train=not (name == 'test'),
from preprocessing import Preprocess
from activity import Activity
from threshold import Threshold
import pandas as pd
import os

# Read raw data from file
raw_data_frame = Preprocess("raw_data/girlbosskaty_tweets.csv", header=0)

# Select the Time and username column from raw data
data_time_uid = raw_data_frame.get_columns(["Screen_Name", "Time"])
# print(data_time_uid)

# Calculate Activity
act = Activity(data_time_uid)
dic_act = act.export_times()
# print(dic_act)

myThresh = Threshold(dic_act)
# print(myThresh.apply_clock_threshold(start_time="01:00:00", stop_time="05:00:00"),
#       "tweets between %s and %s" % (myThresh.start_time, myThresh.stop_time))
# print(myThresh.ckeck_day_tweets())

WeekDay_counter = myThresh.ckeck_day_tweets()
night_tweet_counter = myThresh.apply_clock_threshold(start_time="01:00:00", stop_time="05:00:00")
print(WeekDay_counter)
# # print(len(X), len(Y))
# clf_rforest = Classification('RForest', X, Y)
# clf_rforest.get_classifier_object()
# clf_rforest.get_metrics()
# pickle.dump(clf_rforest.get_classifier(), open('RForest_model.pkl', 'wb'))
# print()

files = os.listdir('./data/gift')
for file in files:
    file_path = os.path.join('./data/gift', file)
    with open(file_path, encoding="utf-8") as data:
        json_data = json.load(data)
        # print(json_data)
        preprocess = Preprocess(json_data=json_data)
        preprocess.scale_points(calculate_scale=False)
        pose_objects = preprocess.new_pose_objects

        features = []
        features_obj = Features(pose_objects=pose_objects)
        features_obj.compute_features()
        features = features_obj.get_features()

        pca_model = pickle.load(open('pca.pkl', 'rb'))
        # reduced_feature_matrix = pca_model.transform(features)

        random_forest_classifier = pickle.load(open('RForest_model.pkl', 'rb'))
        # prediction = random_forest_classifier.predict(reduced_feature_matrix)
from preprocessing import Preprocess

if __name__ == '__main__':
    # os.system('sudo raspivid -br 80')
    cam = Camera(1280, 1080, dispositivo=1, camera_type='WEBCAM')
    cam.set_focus(25)
    cam.set_exposure(100)
    cam.set_exposure_auto(3)

    # Initialize the test plan
    testplan = Testplan(produto='solo', posto=1)
    imReference = testplan.get_imgRef()

    # Initialize the preprocessing model
    preprocess = Preprocess(produto='solo', posto=1)

    while True:
        ret, frame1 = cam.camera_read()
        frame1 = cv2.resize(frame1, (640, 480), interpolation=cv2.INTER_CUBIC)
        preprocess.executa_preprocessamento(imgFrame=frame1, imgRef=imReference)
        # preprocess.segmentation(frame1)
        imReg, frame2, Result = preprocess.custom_processing(imReference, frame1)
        if Result == True:
            testplan.executa_teste(imReg)
def train(self, train_data, time_info):
    self.new_model_history_label = []
    self.lgb_predict_list = []
    self.linear_predict_list = []
    self.new_model_n_predict = 0
    self.data = train_data
    gc.collect()

    self.data['changed_y'] = self.data[self.label].copy()

    self.preprocess = Preprocess()
    self.preprocess.train_preprocess(self)

    if self.n_predict == 0:
        tt, interval, na_num = time_interval(self.data[self.primary_timestamp])
        with time_limit("featParamsad"):
            self.featParamsad = FeatParams(copy.deepcopy(self), tt, interval, na_num)
            self.featParamsad.fit_transform()
    gc.collect()

    self.feat_engine = Feat_engine(self.featParamsad)
    self.feat_engine.same_feat_train(self)
    self.feat_engine.history_feat_train(self)

    if self.use_sample_weight:
        TransExponentialDecay(self.primary_timestamp, init=1.0, finish=0.75, offset=0).fit(train_data)
    gc.collect()

    col = self.data.any()
    col = col[col].index
    self.data = self.data[col]
    gc.collect()

    X = self.data
    categorical_feature = []
    self.last_drop_col.append(self.primary_timestamp)

    if self.n_predict == 0:
        y = self.data.pop(self.label)
        y1 = self.data['changed_y']
        X_train, y_train, X_eval, y_eval = time_train_test_split(
            X, y, self.primary_timestamp, shuffle=False)
        if self.time_seg:
            seg_num = len(X_train) // self.time_seg
            X_train['time_seg'] = [
                (((i // seg_num) + 1) if ((i // seg_num) + 1) <= self.time_seg else self.time_seg)
                for i in range(len(X_train))
            ]
            X_eval['time_seg'] = self.time_seg

        self.lgb_model.param_opt_new(X_train, y_train, X_eval, y_eval,
                                     categorical_feature, self.primary_id,
                                     self.primary_agg, self.primary_timestamp)
        X_train.drop(self.last_drop_col, axis=1, inplace=True)

        _, sc1 = self.lgb_model.valid_fit(X_train, y_train, X_eval, y_eval,
                                          categorical_feature, self.use_sample_weight,
                                          round=100)
        if (y != y1).any():
            y_train = y1[:len(y_train)]
            mod1 = self.lgb_model.model
            self.lgb_model.model = None
            _, sc2 = self.lgb_model.valid_fit(X_train, y_train, X_eval, y_eval,
                                              categorical_feature, self.use_sample_weight,
                                              round=100)
            if sc2 < sc1:
                gc.collect()
                self.use_exp_y = False
                y = y1
            else:
                y_train = y[:len(y_train)]
                self.lgb_model.model = mod1

        lgb_preds, _ = self.lgb_model.valid_fit(X_train, y_train, X_eval, y_eval,
                                                categorical_feature, self.use_sample_weight)

        col = X_train.any()
        col = col[col].index
        X_train = X_train[col]
        X_eval = X_eval[col]
        gc.collect()

        linear_preds = self.linear_model.valid_fit(X_train, y_train, X_eval, y_eval,
                                                   self.use_sample_weight)
        gc.collect()

        if self.tmpControlType == 1:
            self.linear_weight, self.lgb_weight = 1, 0
        elif self.tmpControlType == 2:
            self.linear_weight, self.lgb_weight = 0, 1
        else:
            self.linear_weight, self.lgb_weight = serch_best_fusion_proportion(
                linear_preds, lgb_preds, y_eval)
    else:
        if not self.use_exp_y:
            self.data[self.label] = self.data['changed_y'].copy()
        y = self.data.pop(self.label)

    self.data.pop('changed_y')
    X.drop(self.last_drop_col, axis=1, inplace=True)

    if self.time_seg:
        seg_num = len(X) // self.time_seg
        X['time_seg'] = [
            (((i // seg_num) + 1) if ((i // seg_num) + 1) <= self.time_seg else self.time_seg)
            for i in range(len(X))
        ]

    with time_limit("linear_fit"):
        self.linear_model.fit(X, y, self.use_sample_weight)
    with time_limit("fit"):
        self.lgb_model.fit(X, y, categorical_feature, self.use_sample_weight)

    next_step = 'predict'
    return next_step
def preprocess(args):
    prep = Preprocess(path_to_links=args.path_to_links,
                      path_to_movies=args.path_to_movies)
    prep.preprocess(path_show_data=args.path_to_overall,
                    path_alg_data=args.path_to_alg)
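# Hedged sketch: one way the args namespace consumed by preprocess() could be built
# with argparse. Flag names mirror the attributes referenced above; this is an
# assumption, not the project's documented CLI.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Preprocess movie data")
    parser.add_argument("--path_to_links", required=True)
    parser.add_argument("--path_to_movies", required=True)
    parser.add_argument("--path_to_overall", required=True)
    parser.add_argument("--path_to_alg", required=True)
    return parser.parse_args()

if __name__ == "__main__":
    preprocess(parse_args())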
# %%
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

directory = '0105_clothing_ncf_it6_lr1e05'
trainingEpoch = 20

trainOption = True
validationOption = not True
testOption = True

# %%
"""
Setup for preprocessing
"""
pre_work = Preprocess()
num_of_reviews = 5
batch_size = 16
num_of_rating = 1
num_of_validate = 3

# %%
selectTable = 'clothing_'
res, itemObj, userObj = pre_work.loadData(
    havingCount=6,
    sqlfile="HNAE/SQL/[email protected]",
    LIMIT=1000,
    testing=False,
    table='clothing_')  # for clothing.
# res, itemObj, userObj = loadData(havingCount=20, LIMIT=2000, testing=False, table='elec_')  # for elec.
# res, itemObj, userObj = pre_work.loadData(havingCount=15, LIMIT=1000, testing=False, table='toys_')  # for toys
# coding:utf-8
"""
building the AHTM models.
"""
import nltk
import codecs
import sys
import pandas
import numpy
import gensim
from preprocessing import Preprocess
from gensim import corpora, models, similarities
import pyLDAvis

p = Preprocess()
print "\n"
pm = p.multi_process()
train_set = pm[0]   # [ [],
                    #   [], ...
                    # ]
before_word_num = pm[1]
after_word_num = pm[2]
# print before_word_num
# print after_word_num

for i in train_set:
    for j in i:
        if len(j) <= 1:
            print "Found a token of length 1."

dic = corpora.Dictionary(train_set)
import pickle
from preprocessing import Preprocess
from features import Features
import numpy as np
import pandas as pd

test_file_name = input("Please enter the test file name: ")

preprocess_obj = Preprocess(test_file_name)
test_file_dataframe = preprocess_obj.get_dataframe()

test_file_features_obj = Features(test_file_dataframe)
test_file_features_obj.compute_features()
test_file_features = test_file_features_obj.get_features()
# print(len(test_file_features))

# Random Forest
random_forest_clf = pickle.load(open('RForest_model.pkl', 'rb'))
y_pred = random_forest_clf.predict(test_file_features)
print('Saving the output of RandomForest classifier prediction')
rforest_dataframe = pd.DataFrame(y_pred, columns=['Meal/NoMeal'])
rforest_dataframe.to_csv('RForest_output.csv')

# AdaBoost
adaboost_clf = pickle.load(open('Adaboost_model.pkl', 'rb'))
y_pred = adaboost_clf.predict(test_file_features)
print('Saving the output of AdaBoost classifier prediction')
adaboost_dataframe = pd.DataFrame(y_pred, columns=['Meal/NoMeal'])
adaboost_dataframe.to_csv('Adaboost_output.csv')

# XGBoost
XGBoost_clf = pickle.load(open('XGBoost_model.pkl', 'rb'))
def __init__(self):
    self.pre = Preprocess()
    self.nlp = NLPHelper()
    self.fex = FeatureExtractor()
    self.ut = Utility()
    self.mt = ModelTrainer()
from preprocessing import Preprocess
from classification import Classification
import pickle
from features import Features
import json
import os

preprocess = Preprocess()
preprocess.scale_points()
pose_objects = preprocess.new_pose_objects

features = []
features_obj = Features(pose_objects=pose_objects)
features_obj.compute_features()
# reduced_feature_matrix = features_obj.compute_pca()
# print(reduced_feature_matrix)
# print(len(reduced_feature_matrix), len(reduced_feature_matrix[0]))

# X = reduced_feature_matrix
X = features_obj.get_features()
Y = [obj.label for obj in pose_objects]
print(len(X), len(Y))

clf_rforest = Classification('RForest', X, Y)
clf_rforest.get_classifier_object()
clf_rforest.get_metrics()
pickle.dump(clf_rforest.get_classifier(), open('RForest_model.pkl', 'wb'))
print()
def main():
    opt = parse()
    model_path = "RESULT/" + opt.save + "/model"
    vocab_path = "RESULT/" + opt.save + "/vocab"
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(vocab_path, exist_ok=True)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    device = torch.device("cuda:0")

    opt.log = "RESULT/" + opt.save + "/log"
    opt.save_model = model_path

    # write a setting
    with open(opt.log, "a") as f:
        f.write("-----setting-----\n")
        f.write("MAX ITERATION : %d \
                \nCHECK INTERVAL : %d \
                \nBATCH SIZE : %d \
                \nACCUMULATION STEPS : %d \
                \nWORD CUT : %d \
                \nD_MODEL : %d \
                \nN_LAYERS : %d \
                \nN_HEAD : %d \
                \nDROPOUT : %.1f \
                \nMODE : %s \
                \nSAVE_MODEL : %s \
                \nLOG_PATH : %s \
                \nGPU NAME: %s \
                \nGPU NUM %s \
                \nDATASET : \n%s\n%s\n%s\n%s\n%s\n%s"
                % (opt.max_steps,
                   opt.check_interval,
                   opt.batch_size,
                   opt.accumulation_steps,
                   opt.word_cut,
                   opt.d_model,
                   opt.n_layers,
                   opt.n_head,
                   opt.dropout,
                   opt.mode,
                   opt.save,
                   opt.log,
                   torch.cuda.get_device_name(),
                   opt.gpu,
                   opt.train_src,
                   opt.train_trg,
                   opt.valid_src,
                   opt.valid_trg,
                   opt.test_src,
                   opt.test_trg))

    # gradient accumulation
    opt.batch_size = int(opt.batch_size / opt.accumulation_steps)
    opt.batch_max_token = int(opt.batch_max_token / opt.accumulation_steps)
    opt.check_interval = int(opt.check_interval * opt.accumulation_steps)
    opt.max_steps = int(opt.max_steps * opt.accumulation_steps)

    # preprocessing
    source_vocab_path = "RESULT/" + opt.save + "/vocab/source_vocab"
    target_vocab_path = "RESULT/" + opt.save + "/vocab/target_vocab"
    SRC = Preprocess()
    TRG = Preprocess()
    train_source, valid_source, test_source = \
        SRC.load(train=opt.train_src, valid=opt.valid_src, test=opt.test_src,
                 mode=1, vocab_file=source_vocab_path)
    train_target, valid_target, test_target = \
        TRG.load(train=opt.train_trg, valid=opt.valid_trg, test=opt.test_trg,
                 mode=1, vocab_file=target_vocab_path)

    # SrcDict = SRC.reverse_dict
    TrgDict = TRG.reverse_dict
    src_size = len(SRC.dict)
    trg_size = len(TRG.dict)
    pad_idx = SRC.dict["<pad>"]
    trg_sos_idx = TRG.dict["<sos>"]
    trg_eos_idx = TRG.dict["<eos>"]

    # create batch sampler with the number of sentences
    train_batch_sampler = create_sentence_batch_sampler(train_source, train_target, opt.batch_size)
    valid_batch_sampler = create_sentence_batch_sampler(valid_source, valid_target, opt.valid_batch_size)

    # create batch sampler with the number of tokens
    # train_batch_sampler = create_token_batch_sampler(train_source, train_target, opt.batch_max_token)
    # valid_batch_sampler = create_sentence_batch_sampler(valid_source, valid_target, opt.valid_batch_size)

    # create dataset and dataloader
    train_data_set = MyDataset(train_source, train_target)
    valid_data_set = MyDataset(valid_source, valid_target)
    valid_data_loader = DataLoader(valid_data_set,
                                   batch_sampler=valid_batch_sampler,
                                   collate_fn=valid_data_set.collater)
    test_data_set = MyDataset(test_source, test_target)
    test_data_loader = DataLoader(test_data_set, batch_size=1,
                                  collate_fn=test_data_set.collater, shuffle=False)

    # train
    if opt.mode == "full" or opt.mode == "train":
        model = Transformer(src_size, trg_size, opt.d_model, opt.n_layers,
                            opt.n_head, opt.dropout).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=1, betas=(0.9, 0.98), eps=1e-9)
        scheduler = LambdaLR(optimizer, lr_lambda=lr_schedule)
        model, optimizer = amp.initialize(model, optimizer, opt_level=opt.level)
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            train_data_set=train_data_set,
            train_batch_sampler=train_batch_sampler,
            valid_data_loader=valid_data_loader,
            lr_scheduler=scheduler,
            device=device,
            TrgDict=TrgDict,
            pad_idx=pad_idx)
        trainer.train(opt.epoch, opt)

    # test
    if opt.mode == "full" or opt.mode == "test":
        load_point = opt.max_steps // opt.check_interval
        model = average_model(load_point, opt, src_size, trg_size, device)
        torch.cuda.empty_cache()
        beam_size = 4
        max_seq_len = 410
        translator = Translator(
            model=model,
            test_data_loader=test_data_loader,
            TrgDict=TrgDict,
            device=device,
            beam_size=beam_size,
            max_seq_len=max_seq_len,
            src_pad_idx=pad_idx,
            trg_pad_idx=pad_idx,
            trg_bos_idx=trg_sos_idx,
            trg_eos_idx=trg_eos_idx)
        translator.test(opt.save)
# read in the training and testing files as dataframes
train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

# converting the columns into lists with UTF-8 encoding
train_text = train_data['Review Text'].values.astype('U').tolist()
train_title = train_data['Review Title'].values.astype('U').tolist()
train_rating = train_data['Star Rating'].values.astype('U').tolist()
test_text = test_data['Review Text'].values.astype('U').tolist()

# converting the ratings into integers
for i in range(len(train_rating)):
    train_rating[i] = int(train_rating[i])

# instantiating the Preprocessor, Vectorizer, DatasetBalancer and Classifier
pre = Preprocess()
v = Vectors()
balance = DatasetBalance()

# preprocess the train data
x = []
for i in train_text:
    x.append(" ".join(pre.preprocessing(i)))
print("training data ready")
print("now making count vectors")

# making count vectors
x_cv = v.count_vectors(x, train=True)
y = train_rating
"""
rht = Removing HTML tags
rurls = Removing URLs
rn = Removing Numbers
ntw = Convert numbers to words
sc = Spelling Correction
ata = Convert accented characters to ASCII
sto = short_to_original
ec = Expanding Contractions
ps = Stemming (Porter Stemmer)
l = Lemmatization
re = Removing Emojis
ret = Removing Emoticons
ew = Convert Emojis to words
etw = Convert Emoticons to words
rp = Removing Punctuations
rs = Removing Stopwords
rfw = Removing Frequent Words
rrw = Removing Rare Words
rsc = Removing Single characters
res = Removing Extra Spaces
"""

print(f"******** Before preprocessing techniques ********")
for sent in sentences[:5]:
    print(sent)

preprocessing = Preprocess()
preprocessed_text = preprocessing.preprocessing(sentences, techniques)

print(f"******** After preprocessing ********")
for sent in preprocessed_text[:5]:
    print(sent)
def get_tweet_name_time(input_file):
    name_time = Preprocess(input_file, header=0).get_columns(["Screen_Name", "Time"])
    return name_time