def __init__(self):
    Preprocess.__init__(self)
    self.equal_regex = re.compile(r'=+[^=]+=+')
    self.pars_regex = re.compile(r'([^()]+)|\([^\(\)]+\)')
    self.chartype = Chartype()
def __init__(self):
    classifier_path1 = "stanford/english.muc.7class.distsim.crf.ser.gz"
    # scenario 1
    # classifier_path2 = "stanford/id-ner-model-half.ser.gz"
    # scenario 2
    # classifier_path2 = "stanford/id-ner-model-id.ser.gz"
    # scenario 3
    # classifier_path2 = "stanford/id-ner-model-2.ser.gz"
    ner_jar_path = "stanford/stanford-ner.jar"
    # raise the JVM heap limit to avoid errors in nltk internals
    # (the Java flag is case-sensitive: -Xmx, not -xmx)
    nltk.internals.config_java(options='-Xmx5g')
    self.pre = Preprocess()
    self.scp = StanfordParser(
        './stanford/stanford-parser.jar',
        './stanford/stanford-parser-3.9.1-models.jar',
        encoding='utf8')
    self.ner_tagger = StanfordNERTagger(classifier_path1, ner_jar_path, encoding='utf8')
    # for scenario 3
    self.pos_tagger = StanfordPOSTagger(
        './stanford/english-bidirectional-distsim.tagger',
        './stanford/stanford-postagger.jar',
        encoding='utf8')
    # combining the Stanford classifier with a custom classifier
    # self.com_tagger = NERComboTagger(classifier_path1, ner_jar_path,
    #     stanford_ner_models=classifier_path1 + "," + classifier_path2)  # for scenarios 1 and 2
    self.core_nlp = StanfordCoreNLP('http://localhost', port=9000)
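# A minimal standalone sketch (an assumption, not part of the original file):
# StanfordNERTagger.tag() takes a token list and returns (token, tag) pairs.
# It assumes the jar/model paths above exist and Java is on the PATH; the
# sample sentence is made up.
from nltk.tag import StanfordNERTagger

tagger = StanfordNERTagger("stanford/english.muc.7class.distsim.crf.ser.gz",
                           "stanford/stanford-ner.jar", encoding='utf8')
print(tagger.tag("Barack Obama visited Jakarta .".split()))
# e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ..., ('Jakarta', 'LOCATION'), ('.', 'O')]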
def load_data(file):
    mapping = load_embeddings('glove.6B.50d.txt')
    preprocess = Preprocess()
    data = pd.read_csv(file, encoding='latin-1',
                       names=['sentiment', 'id', 'date', 'q', 'nick', 'tweet'])
    data = data.sample(frac=1)
    data = data[:100000]
    data_x = []
    data_y = []
    for index in data.index:
        row = data.loc[index, :]
        if row['sentiment'] != 2:
            row['tweet'] = preprocess.preprocess(row['tweet'])
            tweet = []
            for word in row['tweet'].split():
                if word in mapping:
                    word_embedding = mapping[word]
                    tweet.append(word_embedding)
                else:
                    tweet.append(np.zeros(50))
            tweet = add_padding(tweet, 20)
            data_x.append(tweet)
            data_y.append(one_hot_encoding(row['sentiment']))
    data_x = np.array(data_x)
    data_y = np.array(data_y)
    return data_x, data_y
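# Hypothetical call site for load_data above; the Sentiment140-style file name
# is an assumption, not taken from the original project. Given the padding and
# embedding sizes used above, each tweet becomes a 20 x 50 matrix.
data_x, data_y = load_data('training.1600000.processed.noemoticon.csv')
print(data_x.shape)  # expected (n_tweets, 20, 50)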
def main(args):
    pp = Preprocess()

    # ---- Run Spotlight ----
    # graphs = pp.parse_tcp_dump(args.tcp_folder, args.csv_file)
    # tcp = ad.read_csv_file(args.csv_file)
    # graphs = tcp.iloc[:, [0, 1, 3]]
    # graphs.columns = ['source', 'destination', 'hours_past']
    # run_spotlight(args, np.array(graphs))

    # ---- Run Shingle Sketch ----
    # graph_utils.create_graphs(args.csv_file, args.graph_folder)
    is_gexf = False
    graphs = pp.preprocess_gfiles(args.graph_folder)

    # ---- For Muta or Chemical Data ----
    # graphs = pp.preprocess_gexf(args.gexffile)
    # is_gexf = True

    # ---- For DOS Attack Data ----
    # graphs = pp.preprocess_single_gfile("data/dos.g")

    run_shingle(args, graphs, is_gexf)

    ad = AnomalyDetection()
    skvector = ad.read_sketch(args.sketch_vector)
    print(skvector.shape)
    ad.anomaly_detection(skvector, args)
def test_feature_match(self, tresh, retest, img1, imgref):
    print("Test_feature_match")
    process = Preprocess("NA", "NA")
    fonte = cv2.FONT_HERSHEY_SIMPLEX
    try:
        x_detect, y_detect, score = process.feature_match(img1, imgref)
        print("Score of Feature Match: " + str(score))
    except Exception:
        score = 0
        print("exception in feature_match")
    finally:
        url, CustomerName, Division, SerialNumber, AssemblyNumber, TesterName, ProcessStep, Operator = get_data_to_test()
        print("Serial under test: " + str(SerialNumber))
        now = datetime.now()
        dt_string = now.strftime("%d_%m_%Y_%H%M%S")
        if SerialNumber == "":
            SerialNumber = str("No_Serial" + str(dt_string))
        print(SerialNumber)
        # ----- Record the Serial Number in the test-time list -----
        self.Set_Serial_TestTime_List(SerialNumber)
        # RESULT OF TEST
        if score > int(tresh):
            cv2.putText(img1, "PASS - LABEL DETECTED", (50, 400), fonte, 3, (0, 255, 0), 3, cv2.LINE_AA)
            cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125, 255, 255), 1, cv2.LINE_AA)
            send_test_result("P")
            # cv2.imwrite("./logs/" + str(SerialNumber) + "_pass.jpg", img1)
        elif score < int(tresh) and score >= 0:
            cv2.putText(img1, "FAIL - NO LABEL", (50, 400), fonte, 3, (0, 0, 255), 3, cv2.LINE_AA)
            cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125, 255, 255), 1, cv2.LINE_AA)
            # send_test_result("F")
            print("retest numbers:")
            print(str(self.Count_Serial_TestTime_Occurence(SerialNumber)))
            print(str(self.Get_Retest_Times_Before_Fail()))
            if self.Count_Serial_TestTime_Occurence(SerialNumber) > self.Get_Retest_Times_Before_Fail():
                send_test_result("F")
                # send_test_result_parser(ResultMes="F", Fail_Description=str("FAIL FIRMWARE VERSION " + str(string)))
                cv2.putText(img1, "MES REJECTION" + str(self.Count_Serial_TestTime_Occurence(SerialNumber)),
                            (50, 680), fonte, 1.5, (0, 0, 255), 2, cv2.LINE_AA)
            else:
                cv2.putText(img1, "RETEST NUMBER:" + str(self.Count_Serial_TestTime_Occurence(SerialNumber)),
                            (50, 680), fonte, 1.5, (0, 0, 255), 2, cv2.LINE_AA)
            # cv2.imwrite("./logs/" + str(SerialNumber) + "_fail.jpg", img1)
            # cv2.putText(img1, "Score:" + str(score), (50, 430), fonte, 1, (125, 255, 255), 1, cv2.LINE_AA)
    return score
def main(argv):
    pre = Preprocess(argv[0], argv[1])
    pre.build_vectors()
    dataset = ToxicityDataset(pre.vectors, pre.targets)
    # Without sentiment
    # gru = GRU(360).double()
    # With sentiment
    gru = GRU(373).double()
    if use_GPU:
        gru.cuda()
    training.train(gru, dataset, 2, 4, 0.1, use_gpu=use_GPU)
class Project:
    def __init__(self, num_rows, wrt_feature):
        self.db = DB(db='major_2')
        data = Data(self.db)
        self.data_set = data.getData(num_rows, wrt_feature)

    def preprocess(self, new, vector, chi2):
        self.prepro = Preprocess(self.data_set, new)
        if new == 'True':
            if vector == 'hashing':
                self.prepro.hashVector()
            if vector == 'tfidf':
                self.prepro.tfidfVector()
            # print self.preprocess.y_train
        else:
            self.prepro.vectorize(vector)
        if chi2:
            self.prepro.chisquare()

    def run_classifier(self, method, classifier):
        if method == 'classifier':
            self.classifier = Classifier(
                self.prepro.severity.keys(),
                self.prepro.X_train, self.prepro.y_train,
                self.prepro.X_test, self.prepro.y_test,
                self.prepro.train_size, self.prepro.test_size)
            self.classifier.classify(classifier)
        if method == 'pipeline':
            self.classifier = PipeLineClassifier(
                self.prepro.severity.keys(),
                self.prepro.train_corpus, self.prepro.y_train,
                self.prepro.X_test, self.prepro.y_test,
                self.prepro.train_size, self.prepro.test_size)
            self.classifier.setVariables(classifier)
            self.classifier.benchmark()
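# Hypothetical driver for the Project class above; the row count, feature
# name, and classifier name are illustrative assumptions, not values from
# the original project.
project = Project(num_rows=10000, wrt_feature='severity')
project.preprocess(new='True', vector='tfidf', chi2=True)
project.run_classifier(method='classifier', classifier='svm')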
def load_data(file):
    mapping = load_embeddings('glove.6B.50d.txt')
    preprocess = Preprocess()
    file = open(file, "r")
    data_x = []
    data_y = []
    sentence = []
    categories = []
    for line in file:
        if len(line.split()) == 0:
            sentence, categories = add_padding(sentence, categories, 20)
            data_x.append(sentence)
            data_y.append(categories)
            sentence = []
            categories = []
        else:
            if (line.split()[0]).lower() in mapping:
                word_embedding = mapping[(line.split()[0]).lower()]
                word_category = one_hot_encoding(line.split()[2])
                sentence.append(word_embedding)
                categories.append(word_category)
            else:
                sentence.append(np.zeros(50))
                categories.append(one_hot_encoding(line.split()[2]))
    data_x = np.array(data_x)
    data_y = np.array(data_y)
    return data_x, data_y
def queryProcess(self):
    preprocess = Preprocess()
    self.query = self.query.lower()
    self.query = preprocess.preprocess(self.query)
    tokenizer = RegexpTokenizer(r"[\d-]+\w+|[A-Z][.A-Z]+\b\.*|[\w-]+|'.*'")
    self.query_tokens = tokenizer.tokenize(self.query)
    if self.query_tokens[0] in wh_qstn_words:
        self.query_type = 1
    elif self.query_tokens[0] in ab_qstn_words:
        self.query_type = 2
    elif self.query_tokens[0] in desc_qstn_words:
        self.query_type = 3
    else:
        self.query_type = 4
def parse(self, response):
    '''
    This method is called repeatedly to process documents from the URL
    frontier. Scrapy handles compliance with politeness policies.
    '''
    url = response.request.url
    # Remove html tags from the document
    raw_text = GetText(response.body)
    # Preprocess the document's content
    tokens = Preprocess(raw_text)
    # Add document to be stored in local storage
    if self.count < LIMIT:
        self.dstore.add_document(tokens, response.body, url)
    # Extract url references and add them to the url frontier
    for a in response.css('a'):
        if 'href' in a.attrib:
            yield response.follow(a, callback=self.parse)
    # Limit of pages to crawl
    if self.count > LIMIT:
        raise CloseSpider(reason='reached_limit')  # Force spider to close
    print(str(self.count) + '\n\n')  # IGNORE/COMMENT THIS
    self.count += 1
def lambda_handler(event, context):
    json_data = json.loads(event['body'])
    preprocess = Preprocess(json_data=json_data)
    preprocess.scale_points(calculate_scale=False)
    pose_objects = preprocess.new_pose_objects

    features = []
    features_obj = Features(pose_objects=pose_objects)
    features_obj.compute_features()
    features = features_obj.get_features()
    # pca_model = pickle.load(open('pca.pkl', 'rb'))
    # reduced_feature_matrix = pca_model.transform(features)

    s3 = boto3.resource('s3')
    svm_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("SVM_model.pkl").get()['Body'].read())
    logreg_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LogReg_model.pkl").get()['Body'].read())
    lda_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LDA_model.pkl").get()['Body'].read())
    random_forest_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("RForest_model.pkl").get()['Body'].read())

    prediction_rf = random_forest_classifier.predict(features)
    prediction_svm = svm_classifier.predict(features)
    prediction_lda = lda_classifier.predict(features)
    prediction_logreg = logreg_classifier.predict(features)

    data = {
        "1": prediction_svm[0],
        "2": prediction_logreg[0],
        "3": prediction_lda[0],
        "4": prediction_rf[0]
    }
    return {'statusCode': 200, 'body': json.dumps(data)}
def pipeline(imgpath):
    img = io.imread(imgpath)
    try:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    except cv2.error:
        print("Image already in Grayscale")
        gray = img
    processed = Preprocess.pre_process_image(gray)
    corners = Preprocess.find_corners_of_largest_polygon(processed)
    cropped = Preprocess.crop_and_warp(img, corners)
    resized = Preprocess.resize(cropped)
    inverted = Preprocess.invert(resized)
    # cv2.imshow('Inverted', inverted)
    # Press q on keyboard to exit
    # cv2.waitKey(25) & 0xFF == ord('q')
    cells = Preprocess.boxes(inverted)
    new_cells = []
    for cell in cells:
        new_cell = Preprocess.process_cells(cell)
        new_cells.append(new_cell)
    return inverted, new_cells
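# Illustrative use of the grid-extraction pipeline above; 'sudoku.jpg' is an
# assumed file name, not a path from the original project.
inverted, cells = pipeline('sudoku.jpg')
print(len(cells))  # one processed cell image per box found by Preprocess.boxes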
def algorithm(df, params):
    """
    wrapper function to put each individual algorithm inside
    :param df: dataframe that contains all the input dataset
    :param params: algorithm specific parameters
    :return: a dictionary of { outputname: output content in memory }
    """
    output = {}

    # algorithm specific code
    # construct sentiment analysis
    PP = Preprocess(df, params['column'])

    output['phrases'] = PP.get_phrases()
    output['filtered'] = filtered_tokens = PP.get_words()
    output['processed'] = processed_tokens = PP.stem_lematize(
        params['process'], filtered_tokens)
    output['tagged'] = PP.tagging(params['tagger'], processed_tokens)
    filtered_most_common, processed_most_common = PP.most_frequent(
        filtered_tokens, processed_tokens)
    output['most_common'] = processed_most_common

    # plot the top 50 most frequent words
    index = []
    counts = []
    for common in processed_most_common[1:51]:
        index.append(common[0])
        counts.append(common[1])
    title = 'Top 50 frequent words (' + params['process'] + ')'
    output['div'] = plot.plot_bar_chart(index, counts, title)

    return output
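# Illustrative call to algorithm(); the params keys follow the usages above,
# but these particular values ('text' column, 'stem' process, 'nltk' tagger)
# and the toy dataframe are assumptions, not the original project's inputs.
import pandas as pd

df = pd.DataFrame({'text': ["good movie", "bad plot", "great acting"]})
result = algorithm(df, {'column': 'text', 'process': 'stem', 'tagger': 'nltk'})
print(result['most_common'][:10])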
class CombineNews(object):
    def __init__(self):
        self.pre = Preprocess()

    def cleansingText(self, text):
        text = self.pre.eliminatePunctuation(text)
        return self.pre.normalizePunctuation(text)

    def combineToCsvFromFolder(self):
        path_to_json = "datasets/"
        data = pd.DataFrame()
        for filename in os.listdir(path_to_json):
            if filename.endswith('.json'):
                with open(os.path.join(path_to_json, filename)) as json_data:
                    json_result = json.load(json_data)
                    # items() under Python 3; the original used iteritems()
                    for key, value in json_result['fiveWoneH'].items():
                        json_result[key] = value
                    del json_result['fiveWoneH']
                    data = data.append(json_result, ignore_index=True)
        data['text'] = data['text'].apply(lambda x: self.cleansingText(x))
        data['title'] = data['title'].apply(lambda x: self.cleansingText(x))
        data = data.dropna(axis=1, how="any")
        data.to_csv('golden_data.csv', sep=';', index=False, encoding='utf-8')

    def combineToCsvFromFile(self):
        filename = "news_crawler/page_contents.json"
        data = pd.DataFrame()
        with open(filename) as json_data:
            json_result = json.load(json_data)
            temp = pd.DataFrame()
            temp = data.append(json_result, ignore_index=True)
            temp['body'] = temp['body'].apply(lambda x: self.cleansingText(x))
            temp['title'] = temp['title'].apply(lambda x: self.cleansingText(x))
            temp = temp.dropna(axis=0, how="any")
            data = temp
        data.to_csv('test.csv', sep=';', index=False, encoding='utf-8')
def define_model(self, name):
    if self.is_trained == False:
        if name == 'preprocInc':
            # self.mod = MultinomialNB()
            self.mod = Pipeline([
                ('what', Preprocess()),
                ('a pain', MultinomialNB(alpha=0.05, fit_prior=False, class_prior=None))
            ])
        else:
            print('Unknown model name, falling back to MultinomialNB')
            self.mod = MultinomialNB()
    else:
        print("Model already loaded")
def get_data():
    files = os.listdir('./MealNoMealData')
    meal_data_files = []
    no_meal_data_files = []
    for file in files:
        if 'Nomeal' in file:
            no_meal_data_files.append(os.path.join('./MealNoMealData', file))
        else:
            meal_data_files.append(os.path.join('./MealNoMealData', file))

    data = []
    labels = []
    for meal_data_file, no_meal_data_file in zip(meal_data_files, no_meal_data_files):
        preprocess_obj = Preprocess(meal_data_file)
        meal_df = preprocess_obj.get_dataframe()
        meal_features = Features(meal_df)
        meal_features.compute_features()
        # temp_meal_features = meal_features.pca_decomposition().tolist()
        temp_meal_features = meal_features.get_features()
        labels += [1] * len(temp_meal_features)

        preprocess_obj_ = Preprocess(no_meal_data_file)
        no_meal_df = preprocess_obj_.get_dataframe()
        no_meal_features = Features(no_meal_df)
        no_meal_features.compute_features()
        no_meal_features_ = no_meal_features.get_features()
        # no_meal_final_features = meal_features.pca.transform(no_meal_features_).tolist()
        no_meal_final_features = no_meal_features_
        labels += [0] * len(no_meal_features_)

        for no_meal_feature in no_meal_final_features:
            temp_meal_features.append(no_meal_feature)
        for meal_no_meal_feature in temp_meal_features:
            data.append(meal_no_meal_feature)
    return data, labels
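# Hypothetical continuation: the feature rows and labels returned by
# get_data() feed directly into any scikit-learn classifier. SVC with 5-fold
# cross-validation is an illustrative choice, not the original project's.
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

data, labels = get_data()
print(cross_val_score(SVC(), data, labels, cv=5))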
    rht = Removing HTML tags
    rurls = Removing URLs
    rn = Removing Numbers
    ntw = convert numbers to words
    sc = Spelling Correction
    ata = convert accented to ASCII code
    sto = short_to_original
    ec = Expanding Contractions
    ps = Stemming (Porter Stemming)
    l = Lemmatization
    re = Removing Emojis
    ret = Removing Emoticons
    ew = Convert Emojis to words
    etw = Convert Emoticons to words
    rp = Removing Punctuations
    rs = Removing Stopwords
    rfw = Removing Frequent Words
    rrw = Removing Rare Words
    rsc = Removing Single characters
    res = Removing Extra Spaces
"""

print(f"******** Before preprocessing ********")
for sent in sentences[:5]:
    print(sent)

preprocessing = Preprocess()
preprocessed_text = preprocessing.preprocessing(sentences, techniques)

print(f"******** After preprocessing ********")
for sent in preprocessed_text[:5]:
    print(sent)
class FeatureExtraction:
    """
    @param image  {string} Source of raw image
    @param images {list}   Source of preprocessed images, where images[0] is
        the homogenized image and images[1] is the vessel enhanced image.
    @param GTPath {string} Whether the image has ground truth. If the image
        has a ground truth image with correct labels then GTPath is a path to
        the groundtruth image.
    """
    def __init__(self, image=False, images=[], GTPath=""):
        if images:
            try:
                self.homogenized = numpy.array(Image.open(images[0]))
                self.vesselEnhanced = numpy.array(Image.open(images[1]))
                self.images = images
            except IndexError:
                print("""`images` parameter must include the homogenized image
                at `images[0]` and vessel enhanced image at `images[1]`""")
                raise
        else:
            self.preprocess = Preprocess(image)
            self.homogenized = self.preprocess.process(enhance=False).image_array
            self.vesselEnhanced = self.preprocess.process(onlyEnhance=True).image_array
            self.mask = self.preprocess.mask
            self.source = image
            self.image = Image.open(image)
            self.loaded = self.image.load()
        if len(GTPath):
            self.gt = True
            self.groundtruth = Image.open(GTPath)
        else:
            self.gt = False
        self.feature_array = numpy.empty(0)

    def __getHomogenized(self, forceNew=False):
        raise NotImplementedError

    """
    `exportCSV` exports `self.feature_array` to `filename` unless the `array`
    parameter is set. If `balanced` then the exported features will have an
    equal amount of class 0 and class 1. The parameter `delim` can be used to
    change the separator from commas to some other character.

    @method exportCSV
    @param filename {string} Name of file, including its path, where features
        will be exported to
    @param array {numpy array} The feature array to export
    @param delim {string} The delimiter @default ","
    @param balanced {bool} Whether to export the full feature array or a
        balanced version with equal class representation @default False
    """
    def exportCSV(self, filename="", array=numpy.empty(0), delim=",", balanced=False):
        if not array.any():
            array = self.feature_array
        if balanced:
            zeros = array[numpy.less(array[:, 0], 1)]
            ones = array[numpy.greater(array[:, 0], 0)]
            if len(zeros) > len(ones):
                indices = numpy.random.choice(len(zeros), size=len(ones), replace=False)
                array = numpy.concatenate((ones, zeros[indices]), axis=0)
            if len(ones) > len(zeros):
                indices = numpy.random.choice(len(ones), size=len(zeros), replace=False)
                array = numpy.concatenate((zeros, ones[indices]), axis=0)
        if not len(filename):
            if hasattr(self, "source"):
                filename = "extracted_" + self.source
            else:
                filename = "extracted_" + self.images[1]
        if self.gt:
            formatting = ['%d', '%.0f', '%.0f', '%f', '%f', '%.0f', '%f', '%f']
            header = """label,\tfeat. 1,\tfeat. 2,\tfeat. 3,\tfeat. 4,\tfeat. 5,
            \tHu mom. 1,\tHu mom. 2"""
        else:
            formatting = ['%.0f', '%.0f', '%f', '%f', '%.0f', '%f', '%f']
            header = """feat. 1,\tfeat. 2,\tfeat. 3,\tfeat. 4,\tfeat. 5,
            \tHu mom. 1,\tHu mom. 2"""
        numpy.savetxt(
            filename,
            array,
            fmt=formatting,        # formatting
            delimiter=',\t',       # column delimiter
            newline='\n',          # new line character
            footer='end of file',  # file footer
            comments='# ',         # character to use for comments
            header=header)         # file header

    """
    `normalize` is used to normalize the feature_array. If comp_only (compute
    only) is set to `True` then only `self.std_vector` and `self.mean_vector`
    will be set, but the value of `self.feature_array` will not be changed.
    This can be useful when computing an accumulated mean and standard
    deviation and then using the `mean` and `std` parameters later to
    normalize with the accumulated mean and average standard deviation
    vectors.

    @method normalize
    @param array {numpy array} The feature array; if not set then
        `self.feature_array`.
    @param mean {numpy array} The mean to use in the normalization. If not
        set then it will be computed over the inside-FOV pixels of the
        `array` using `self.mask`.
    @param std {numpy array} The standard deviation to be used in
        normalization.
    @param comp_only {bool} If true then mean, sample variance and standard
        deviation will be computed and saved to `self.var_vector`,
        `self.std_vector` and `self.mean_vector` respectively, but they won't
        be used to normalize the feature array. @default False
    """
    def normalize(self, array=numpy.empty(0), mean=numpy.empty(0),
                  std=numpy.empty(0), comp_only=False):
        if not array.any():
            array = self.feature_array
        # compute mean and std excluding out of FOV pixels
        indices = numpy.greater(self.mask.flatten(), 0)
        FOV = array[indices]
        # Since the mean should only be computed on the training set, the
        # assumption of ignoring the first column is made, since this is the
        # label column.
        if not mean.any():
            mean = FOV.mean(axis=0)[1:]
        if not std.any():
            std = FOV.std(axis=0)[1:]
        var = FOV.var(axis=0)[1:]
        if comp_only:
            self.var_vector = var
            self.std_vector = std
            self.mean_vector = mean
        else:
            if self.gt:
                # preserve the label column: since there is a groundtruth,
                # the first column is the label and the rest are the actual
                # features.
                labels = array[:, 0]
                array[:, 1:] = (array[:, 1:] - mean) / std
                array[:, 0] = labels
            else:
                array = (array - mean) / std
            self.feature_array = array
        return self

    def computeFeatures(self, forceNew=False):
        if forceNew:
            return self._extract()
        elif self.feature_array.any():
            return self
        else:
            return self._extract()

    """
    `_extract` is responsible for extracting the feature array for every
    pixel in the preprocessed image. If the optional parameters
    `homogenized_array` and `ve_array` are not provided then the arrays from
    preprocessing are used.

    @method _extract
    @param homogenized_array {numpy array} The homogenized image from
        preprocessing
    @param ve_array {numpy array} The vessel enhanced image from
        preprocessing
    """
    def _extract(self, homogenized_array=numpy.empty(0), ve_array=numpy.empty(0)):
        if not homogenized_array.any():
            homogenized_array = self.homogenized
        if not ve_array.any():
            ve_array = self.vesselEnhanced
        # erode image using an eroded mask
        mask = binary_erosion(self.mask, square(10))
        homogenized_array = homogenized_array * mask

        print("Extracting features ", end=" ")
        print("\t\t[", end="")
        self.feature_array = []
        for x in range(len(homogenized_array)):
            for y in range(len(homogenized_array[0])):
                if self.mask[x, y] or True:  # mask check disabled for now
                    # 17x17 neighbourhood for the Hu moments
                    xstart = x - 8 if x - 8 >= 0 else 0
                    ystart = y - 8 if y - 8 >= 0 else 0
                    xend = x + 8 if x + 8 < len(ve_array) else len(ve_array) - 1
                    yend = y + 8 if y + 8 < len(ve_array[0]) else len(ve_array[0]) - 1
                    # 1 is added to the right and bottom boundary because of
                    # python's way of indexing
                    xend += 1
                    yend += 1
                    subarea = ve_array[xstart:xend, ystart:yend]
                    if subarea.max() != 0:
                        Hu0, Hu1 = self.__moments(subarea)
                    # 9x9 neighbourhood for the grey-level features
                    xstart = x - 4 if x - 4 >= 0 else 0
                    ystart = y - 4 if y - 4 >= 0 else 0
                    xend = (x + 4 if x + 4 < len(homogenized_array)
                            else len(homogenized_array) - 1)
                    yend = (y + 4 if y + 4 < len(homogenized_array[0])
                            else len(homogenized_array[0]) - 1)
                    # 1 is added to the right and bottom boundary because of
                    # python's way of indexing
                    xend += 1
                    yend += 1
                    subarea = homogenized_array[xstart:xend, ystart:yend]
                    FOV = numpy.greater(subarea, 0)
                    subarea = (subarea[FOV]
                               if FOV.any() and homogenized_array[x, y] > 0
                               else numpy.array([0]))
                    # equation 5 from Marin et al.
                    f1 = homogenized_array[x, y] - subarea.min()
                    # equation 6 from Marin et al.
                    f2 = subarea.max() - homogenized_array[x, y]
                    # equation 7 from Marin et al.
                    f3 = homogenized_array[x, y] - subarea.mean()
                    # equation 8 from Marin et al.
                    f4 = subarea.std()
                    # equation 9 from Marin et al.
                    # inverting the background, so setting zero to 255
                    f5 = homogenized_array[x, y]
                    if self.gt:
                        # values in groundtruth are either 255 or 0
                        gtval = self.groundtruth.getpixel((x, y))
                        label = gtval if gtval == 0 else 1
                        features = [label, f1, f2, f3, f4, f5, Hu0, Hu1]
                    else:
                        features = [f1, f2, f3, f4, f5, Hu0, Hu1]
                elif not self.gt:
                    features = [0, 0, 0.0, 0.0, 0, 0.0, 0.0]
                else:
                    # values in groundtruth are either 255 or 0
                    gtval = self.groundtruth.getpixel((x, y))
                    label = gtval if gtval == 0 else 1
                    features = [label, 0, 0, 0.0, 0.0, 0, 0.0, 0.0]
                self.feature_array.append(features)
            if x % (len(homogenized_array) * 0.05) < 1:
                print("#", end="")
        self.feature_array = numpy.array(self.feature_array)
        print("]")
        return self

    """
    `__moments` computes the first two Hu moments over some array given by
    the parameter `subarray`.

    @private
    @method __moments
    @param subarray {numpy array} The area over which the Hu moments are
        computed.
    """
    def __moments(self, subarray):
        """
        I_HU(x,y) = subarray(x,y) * gaussian_matrix(x,y)
        returns absolute value of the log of the first two Hu moments
        """
        I_HU = self.__gausMatrix(subarray)
        h1, h2 = cv2.HuMoments(cv2.moments(I_HU))[0:2]
        h1 = numpy.log(h1) if h1 != 0 else h1
        h2 = numpy.log(h2) if h2 != 0 else h2
        return numpy.absolute([h1[0], h2[0]])

    def __gausMatrix(self, array, mu=0.0, sigma=1.7):
        # note: `mu` and the array shape are currently unused; the filter is
        # applied with the given `sigma` (previously hard-coded to 1.7)
        x, y = array.shape
        return scipy.ndimage.filters.gaussian_filter(array, sigma)
class CNNPolicy(object):
    """uses a convolutional neural network to evaluate the state of the game
    and compute a probability distribution over the next action
    """

    def __init__(self, feature_list, **kwargs):
        """create a policy object that preprocesses according to feature_list and
        uses a neural network specified by keyword arguments (see create_network())
        """
        self.preprocessor = Preprocess(feature_list)
        kwargs["input_dim"] = self.preprocessor.output_dim
        self.model = CNNPolicy.create_network(**kwargs)
        self.forward = self._model_forward()

    def _model_forward(self):
        """Construct a function using the current keras backend that, when given
        a batch of inputs, simply processes them forward and returns the output

        This is as opposed to model.compile(), which takes a loss function and
        training method.

        c.f. https://github.com/fchollet/keras/issues/1426
        """
        model_input = self.model.get_input(train=False)
        model_output = self.model.get_output(train=False)
        forward_function = K.function([model_input], [model_output])
        # the forward_function returns a list of tensors
        # the first [0] gets the front tensor.
        # this tensor, however, has dimensions (1, width, height)
        # and we just want (width, height) hence the second [0]
        return lambda inpt: forward_function(inpt)[0][0]

    def batch_eval_state(self, state_gen, batch=16):
        """Given a stream of states in state_gen, evaluates them in batches
        to make best use of GPU resources.

        Returns: TBD (stream of results? that would break zip().
        streaming pairs of pre-zipped (state, result)?)
        """
        raise NotImplementedError()

    def eval_state(self, state):
        """Given a GameState object, returns a list of (action, probability) pairs
        according to the network outputs
        """
        tensor = self.preprocessor.state_to_tensor(state)
        # run the tensor through the network
        network_output = self.forward([tensor])
        # get network activations at legal move locations
        # note: may not be a proper distribution by ignoring illegal moves
        return [((x, y), network_output[x, y]) for (x, y) in state.get_legal_moves()]

    @staticmethod
    def create_network(**kwargs):
        """construct a convolutional neural network.

        Keyword Arguments:
        - input_dim:         depth of features to be processed by first layer (no default)
        - board:             width of the go board to be processed (default 19)
        - filters_per_layer: number of filters used on every layer (default 128)
        - layers:            number of convolutional steps (default 12)
        - filter_width_K:    (where K is between 1 and <layers>) width of filter on
                             layer K (default 3 except 1st layer which defaults to 5).
                             Must be odd.
        """
        defaults = {
            "board": 19,
            "filters_per_layer": 128,
            "layers": 12,
            "filter_width_1": 5
        }
        # copy defaults, but override with anything in kwargs
        params = defaults
        params.update(kwargs)

        # create the network:
        # a series of zero-paddings followed by convolutions
        # such that the output dimensions are also board x board
        network = Sequential()

        # create first layer
        network.add(convolutional.Convolution2D(
            input_shape=(params["input_dim"], params["board"], params["board"]),
            nb_filter=params["filters_per_layer"],
            nb_row=params["filter_width_1"],
            nb_col=params["filter_width_1"],
            init='uniform',
            activation='relu',
            border_mode='same'))

        # create all other layers
        for i in range(2, params["layers"] + 1):
            # use filter_width_K if it is there, otherwise use 3
            filter_key = "filter_width_%d" % i
            filter_width = params.get(filter_key, 3)
            network.add(convolutional.Convolution2D(
                nb_filter=params["filters_per_layer"],
                nb_row=filter_width,
                nb_col=filter_width,
                init='uniform',
                activation='relu',
                border_mode='same'))

        # the last layer maps each <filters_per_layer> feature to a number
        network.add(convolutional.Convolution2D(
            nb_filter=1,
            nb_row=1,
            nb_col=1,
            init='uniform',
            border_mode='same'))
        # reshape output to be board x board
        network.add(Reshape((params["board"], params["board"])))
        # softmax makes it into a probability distribution
        network.add(Activation('softmax'))

        return network

    def load_model(self, json_file):
        """load the architecture specified in json_file into 'self'
        """
        raise NotImplementedError()

    def save_model(self, json_file):
        """write the network model and preprocessing features to the specified file
        """
        raise NotImplementedError()

    def load_params(self, h5_file):
        """load model parameters (weights) in the specified file
        """
        raise NotImplementedError()

    def save_params(self, h5_file):
        """save model parameters (weights) to the specified file
        """
        raise NotImplementedError()
from preprocessing import Preprocess
from classification import Classification
import pickle
from features import Features
import json
import os

preprocess = Preprocess()
preprocess.scale_points()
pose_objects = preprocess.new_pose_objects

features = []
features_obj = Features(pose_objects=pose_objects)
features_obj.compute_features()
# reduced_feature_matrix = features_obj.compute_pca()
# print(reduced_feature_matrix)
# print(len(reduced_feature_matrix), len(reduced_feature_matrix[0]))
# X = reduced_feature_matrix
X = features_obj.get_features()
Y = [obj.label for obj in pose_objects]
print(len(X), len(Y))

clf_rforest = Classification('RForest', X, Y)
clf_rforest.get_classifier_object()
clf_rforest.get_metrics()
pickle.dump(clf_rforest.get_classifier(), open('RForest_model.pkl', 'wb'))
print()
def main():
    opt = parse()
    model_path = "RESULT/" + opt.save + "/model"
    vocab_path = "RESULT/" + opt.save + "/vocab"
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(vocab_path, exist_ok=True)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    device = torch.device("cuda:0")

    opt.log = "RESULT/" + opt.save + "/log"
    opt.save_model = model_path

    # write the settings to the log
    with open(opt.log, "a") as f:
        f.write("-----setting-----\n")
        f.write("MAX ITERATION : %d \
            \nCHECK INTERVAL : %d \
            \nBATCH SIZE : %d \
            \nACCUMULATION STEPS : %d \
            \nWORD CUT : %d \
            \nD_MODEL : %d \
            \nN_LAYERS : %d \
            \nN_HEAD : %d \
            \nDROPOUT : %.1f \
            \nMODE : %s \
            \nSAVE_MODEL : %s \
            \nLOG_PATH : %s \
            \nGPU NAME: %s \
            \nGPU NUM %s \
            \nDATASET : \n%s\n%s\n%s\n%s\n%s\n%s"
            % (opt.max_steps,
               opt.check_interval,
               opt.batch_size,
               opt.accumulation_steps,
               opt.word_cut,
               opt.d_model,
               opt.n_layers,
               opt.n_head,
               opt.dropout,
               opt.mode,
               opt.save,
               opt.log,
               torch.cuda.get_device_name(),
               opt.gpu,
               opt.train_src,
               opt.train_trg,
               opt.valid_src,
               opt.valid_trg,
               opt.test_src,
               opt.test_trg))

    # gradient accumulation
    opt.batch_size = int(opt.batch_size / opt.accumulation_steps)
    opt.batch_max_token = int(opt.batch_max_token / opt.accumulation_steps)
    opt.check_interval = int(opt.check_interval * opt.accumulation_steps)
    opt.max_steps = int(opt.max_steps * opt.accumulation_steps)

    # preprocessing
    source_vocab_path = "RESULT/" + opt.save + "/vocab/source_vocab"
    target_vocab_path = "RESULT/" + opt.save + "/vocab/target_vocab"
    SRC = Preprocess()
    TRG = Preprocess()
    train_source, valid_source, test_source = \
        SRC.load(train=opt.train_src, valid=opt.valid_src, test=opt.test_src,
                 mode=1, vocab_file=source_vocab_path)
    train_target, valid_target, test_target = \
        TRG.load(train=opt.train_trg, valid=opt.valid_trg, test=opt.test_trg,
                 mode=1, vocab_file=target_vocab_path)

    # SrcDict = SRC.reverse_dict
    TrgDict = TRG.reverse_dict
    src_size = len(SRC.dict)
    trg_size = len(TRG.dict)
    pad_idx = SRC.dict["<pad>"]
    trg_sos_idx = TRG.dict["<sos>"]
    trg_eos_idx = TRG.dict["<eos>"]

    # create batch samplers sized by number of sentences
    train_batch_sampler = create_sentence_batch_sampler(train_source, train_target,
                                                        opt.batch_size)
    valid_batch_sampler = create_sentence_batch_sampler(valid_source, valid_target,
                                                        opt.valid_batch_size)

    # create batch samplers sized by number of tokens
    # train_batch_sampler = create_token_batch_sampler(train_source, train_target,
    #                                                  opt.batch_max_token)
    # valid_batch_sampler = create_sentence_batch_sampler(valid_source, valid_target,
    #                                                     opt.valid_batch_size)

    # create datasets and dataloaders
    train_data_set = MyDataset(train_source, train_target)
    valid_data_set = MyDataset(valid_source, valid_target)
    valid_data_loader = DataLoader(valid_data_set,
                                   batch_sampler=valid_batch_sampler,
                                   collate_fn=valid_data_set.collater)
    test_data_set = MyDataset(test_source, test_target)
    test_data_loader = DataLoader(test_data_set, batch_size=1,
                                  collate_fn=test_data_set.collater,
                                  shuffle=False)

    # train
    if opt.mode == "full" or opt.mode == "train":
        model = Transformer(src_size, trg_size, opt.d_model,
                            opt.n_layers, opt.n_head, opt.dropout).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=1,
                                     betas=(0.9, 0.98), eps=1e-9)
        scheduler = LambdaLR(optimizer, lr_lambda=lr_schedule)
        model, optimizer = amp.initialize(model, optimizer, opt_level=opt.level)
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            train_data_set=train_data_set,
            train_batch_sampler=train_batch_sampler,
            valid_data_loader=valid_data_loader,
            lr_scheduler=scheduler,
            device=device,
            TrgDict=TrgDict,
            pad_idx=pad_idx)
        trainer.train(opt.epoch, opt)

    # test
    if opt.mode == "full" or opt.mode == "test":
        load_point = opt.max_steps // opt.check_interval
        model = average_model(load_point, opt, src_size, trg_size, device)
        torch.cuda.empty_cache()
        beam_size = 4
        max_seq_len = 410
        translator = Translator(
            model=model,
            test_data_loader=test_data_loader,
            TrgDict=TrgDict,
            device=device,
            beam_size=beam_size,
            max_seq_len=max_seq_len,
            src_pad_idx=pad_idx,
            trg_pad_idx=pad_idx,
            trg_bos_idx=trg_sos_idx,
            trg_eos_idx=trg_eos_idx)
        translator.test(opt.save)
from preprocessing import Preprocess
from activity import Activity
from threshold import Threshold
import pandas as pd
import os

# Read raw data from file
raw_data_frame = Preprocess("raw_data/girlbosskaty_tweets.csv", header=0)

# Select the Time and username columns from the raw data
data_time_uid = raw_data_frame.get_columns(["Screen_Name", "Time"])
# print(data_time_uid)

# Calculate activity
act = Activity(data_time_uid)
dic_act = act.export_times()
# print(dic_act)

myThresh = Threshold(dic_act)
# print(myThresh.apply_clock_threshold(start_time="01:00:00", stop_time="05:00:00"),
#       "tweets between %s and %s" % (myThresh.start_time, myThresh.stop_time))
# print(myThresh.ckeck_day_tweets())
WeekDay_counter = myThresh.ckeck_day_tweets()
night_tweet_counter = myThresh.apply_clock_threshold(start_time="01:00:00",
                                                     stop_time="05:00:00")
print(WeekDay_counter)
import torch
from preprocessing import Preprocess
import json
from dataset import QADataset
from transformers import BertModel
import os
from model import Answer
from solver import Solver
import sys

arg = sys.argv
ctx_max_len = 475
question_max = 30
pre = Preprocess(ctx_max_len=ctx_max_len, question_max=question_max)
data = {}
if not os.path.isdir('processed_data'):
    os.mkdir('processed_data')
if not os.path.isdir('ckpt'):
    os.mkdir('ckpt')
# if arg[1] == 'train':
for name in ['dev', 'train']:
    if not os.path.isfile(f'processed_data/{name}.pkl'):
        print(f"Start {name}......")
        with open(f"data/{name}.json") as f:
            file = json.load(f)
        file = [data for data in file['data']]
        pre_data = pre.preprocess_data(file, train=not (name == 'test'),
# print(len(X), len(Y))
# clf_rforest = Classification('RForest', X, Y)
# clf_rforest.get_classifier_object()
# clf_rforest.get_metrics()
# pickle.dump(clf_rforest.get_classifier(), open('RForest_model.pkl', 'wb'))
# print()

files = os.listdir('./data/gift')
for file in files:
    file_path = os.path.join('./data/gift', file)
    with open(file_path, encoding="utf-8") as data:
        json_data = json.load(data)
        # print(json_data)
        preprocess = Preprocess(json_data=json_data)
        preprocess.scale_points(calculate_scale=False)
        pose_objects = preprocess.new_pose_objects

        features = []
        features_obj = Features(pose_objects=pose_objects)
        features_obj.compute_features()
        features = features_obj.get_features()

        pca_model = pickle.load(open('pca.pkl', 'rb'))
        # reduced_feature_matrix = pca_model.transform(features)
        random_forest_classifier = pickle.load(open('RForest_model.pkl', 'rb'))
        # prediction = random_forest_classifier.predict(reduced_feature_matrix)
parser.add_argument('--chd_hcmp', type=str, default='chd',
                    help='chd or hcmp')
parser.add_argument('--epochs', type=int, default=1000,
                    help='Number of epochs for training')
parser.add_argument('--init_learning_rate', type=float, default=0.02,
                    help='Initial learning rate')

FLAGS, unparsed = parser.parse_known_args()

if FLAGS.chd_hcmp != "chd" and FLAGS.chd_hcmp != 'hcmp':
    # the error message previously formatted FLAGS.roi_seg, which is not the
    # flag being validated here
    raise NotImplementedError('choose "chd" or "hcmp" model, got {}'.format(
        FLAGS.chd_hcmp))

process = Preprocess()

print('data directory :', FLAGS.data_dir)
print('checkpoint directory :', FLAGS.checkpoint_dir)

train_data_dir = FLAGS.data_dir
image_filelist = []
for file in os.listdir(train_data_dir + '/image/'):
    if FLAGS.image_filename in file:
        image_filelist.append(os.path.join(train_data_dir, 'image', file))
print(image_filelist)

num_labels = 1
num_channels = 1
input_shape = (None, None, None, num_channels)
output_shape = (None, None, None, num_labels)
class Model:
    def __init__(self, info, test_timestamp, pred_timestamp):
        self.info = info
        self.primary_timestamp = info['primary_timestamp']
        self.primary_id = info['primary_id']
        self.primary_agg = None
        self.label = info['label']
        self.schema = info['schema']
        self.schema.pop(self.label)
        self.origin_feat = list(self.schema.keys())
        print(f"\ninfo: {self.info}")
        self.dtype_cols = {
            'cat': [col for col, types in self.schema.items() if types == 'str'],
            'num': [col for col, types in self.schema.items() if types == 'num']
        }
        self.test_timestamp = test_timestamp
        self.pred_timestamp = pred_timestamp
        self.n_test_timestamp = len(pred_timestamp)
        self.split_num = 5
        self.update_interval = int(self.n_test_timestamp / self.split_num)
        self.lgb_model = LGBMRegressor()
        self.linear_model = LinearRegressor()
        self.use_Linear = True
        self.use_sample_weight = False
        self.use_exp_y = True
        self.tmpControlType = 4
        self.time_seg = 0
        self.linear_weight = 0
        self.lgb_weight = 0
        self.n_predict = 0
        self.isfirst_predict = True
        self.last_drop_col = []
        self.history = pd.DataFrame()
        self.new_model_n_predict = 0
        self.new_model_history_label = []
        self.lgb_predict_list = []
        self.linear_predict_list = []
        self.train_time_num = 0
        self.preprocess = None
        self.featParamsad = None
        self.feat_engine = None
        self.data = pd.DataFrame()
        self.train_time = 0

    def update_data(self, df):
        self.data = df

    def train(self, train_data, time_info):
        self.new_model_history_label = []
        self.lgb_predict_list = []
        self.linear_predict_list = []
        self.new_model_n_predict = 0
        self.data = train_data
        gc.collect()
        self.data['changed_y'] = self.data[self.label].copy()
        self.preprocess = Preprocess()
        self.preprocess.train_preprocess(self)
        if self.n_predict == 0:
            tt, interval, na_num = time_interval(self.data[self.primary_timestamp])
            with time_limit("featParamsad"):
                self.featParamsad = FeatParams(copy.deepcopy(self), tt,
                                               interval, na_num)
                self.featParamsad.fit_transform()
            gc.collect()
        self.feat_engine = Feat_engine(self.featParamsad)
        self.feat_engine.same_feat_train(self)
        self.feat_engine.history_feat_train(self)
        if self.use_sample_weight:
            TransExponentialDecay(self.primary_timestamp, init=1.0,
                                  finish=0.75, offset=0).fit(train_data)
        gc.collect()
        col = self.data.any()
        col = col[col].index
        self.data = self.data[col]
        gc.collect()
        X = self.data
        categorical_feature = []
        self.last_drop_col.append(self.primary_timestamp)
        if self.n_predict == 0:
            y = self.data.pop(self.label)
            y1 = self.data['changed_y']
            X_train, y_train, X_eval, y_eval = time_train_test_split(
                X, y, self.primary_timestamp, shuffle=False)
            if self.time_seg:
                seg_num = len(X_train) // self.time_seg
                X_train['time_seg'] = [
                    (((i // seg_num) + 1)
                     if ((i // seg_num) + 1) <= self.time_seg
                     else self.time_seg)
                    for i in range(len(X_train))
                ]
                X_eval['time_seg'] = self.time_seg
            self.lgb_model.param_opt_new(X_train, y_train, X_eval, y_eval,
                                         categorical_feature, self.primary_id,
                                         self.primary_agg, self.primary_timestamp)
            X_train.drop(self.last_drop_col, axis=1, inplace=True)
            _, sc1 = self.lgb_model.valid_fit(X_train, y_train, X_eval, y_eval,
                                              categorical_feature,
                                              self.use_sample_weight, round=100)
            if (y != y1).any():
                y_train = y1[:len(y_train)]
                mod1 = self.lgb_model.model
                self.lgb_model.model = None
                _, sc2 = self.lgb_model.valid_fit(X_train, y_train, X_eval, y_eval,
                                                  categorical_feature,
                                                  self.use_sample_weight, round=100)
                if sc2 < sc1:
                    gc.collect()
                    self.use_exp_y = False
                    y = y1
                else:
                    y_train = y[:len(y_train)]
                    self.lgb_model.model = mod1
            lgb_preds, _ = self.lgb_model.valid_fit(X_train, y_train, X_eval, y_eval,
                                                    categorical_feature,
                                                    self.use_sample_weight)
            col = X_train.any()
            col = col[col].index
            X_train = X_train[col]
            X_eval = X_eval[col]
            gc.collect()
            linear_preds = self.linear_model.valid_fit(X_train, y_train,
                                                       X_eval, y_eval,
                                                       self.use_sample_weight)
            gc.collect()
            if self.tmpControlType == 1:
                self.linear_weight, self.lgb_weight = 1, 0
            elif self.tmpControlType == 2:
                self.linear_weight, self.lgb_weight = 0, 1
            else:
                self.linear_weight, self.lgb_weight = serch_best_fusion_proportion(
                    linear_preds, lgb_preds, y_eval)
        else:
            if not self.use_exp_y:
                self.data[self.label] = self.data['changed_y'].copy()
            y = self.data.pop(self.label)
        self.data.pop('changed_y')
        X.drop(self.last_drop_col, axis=1, inplace=True)
        if self.time_seg:
            seg_num = len(X) // self.time_seg
            X['time_seg'] = [
                (((i // seg_num) + 1)
                 if ((i // seg_num) + 1) <= self.time_seg
                 else self.time_seg)
                for i in range(len(X))
            ]
        with time_limit("linear_fit"):
            self.linear_model.fit(X, y, self.use_sample_weight)
        with time_limit("fit"):
            self.lgb_model.fit(X, y, categorical_feature, self.use_sample_weight)
        next_step = 'predict'
        return next_step

    def after_train(self):
        pass

    def predict(self, new_history, pred_record, time_info):
        if (time_info['predict'] < 5) and not new_history.empty:
            if self.primary_id:
                lab_list = pred_record.join(
                    new_history.set_index(self.primary_id)[self.label],
                    how='left', on=self.primary_id)
                lab_list = lab_list[self.label].fillna(
                    new_history[self.label].mean())
            else:
                lab_list = pred_record.shape[0] * list(new_history[self.label])[-1:]
            return list(lab_list), 'predict'
        self.data = pred_record
        if not new_history.empty:
            y = new_history[self.label]
            self.history[self.label] = y
            if len(self.linear_predict_list):
                self.new_model_history_label.extend(list(new_history[self.label]))
                if self.tmpControlType == 4:
                    if (((self.new_model_n_predict >= 50)
                         and ((self.new_model_n_predict % 50) == 0))
                            or (self.new_model_n_predict == 15)):
                        linear_weight, lgb_weight = serch_best_fusion_proportion(
                            pd.Series(self.linear_predict_list),
                            pd.Series(self.lgb_predict_list),
                            pd.Series(self.new_model_history_label))
                        self.linear_weight = self.linear_weight * 0.5 + linear_weight * 0.5
                        self.lgb_weight = self.lgb_weight * 0.5 + lgb_weight * 0.5
                self.new_model_n_predict += 1
        # preprocess
        self.preprocess.test_preprocess(self)
        # feat_engine
        self.feat_engine.same_feat_test(self)
        hh = self.data.copy()
        self.feat_engine.history_feat_test(self)
        self.history = hh
        self.n_predict += 1
        self.data.drop(self.last_drop_col, axis=1, inplace=True)
        if self.time_seg:
            self.data['time_seg'] = self.time_seg
        linear_preds = self.linear_model.predict(self.data)
        lgb_preds = self.lgb_model.predict(self.data)
        predictions = self.linear_weight * linear_preds + self.lgb_weight * lgb_preds
        self.lgb_predict_list.extend(list(lgb_preds))
        self.linear_predict_list.extend(list(linear_preds))
        if ((self.n_predict % self.update_interval == 0)
                and (self.n_predict < self.split_num * self.update_interval)
                and (time_info['update'] > self.train_time * 1.25)):
            next_step = 'update'
            self.feat_engine = None
            self.preprocess = None
            self.history = pd.DataFrame()
            self.isfirst_predict = True
            self.new_model_history_label = None
            self.lgb_predict_list = None
            self.linear_predict_list = None
            gc.collect()
        else:
            self.isfirst_predict = False
            next_step = 'predict'
            if self.n_predict == self.n_test_timestamp:
                self.feat_engine = None
                self.preprocess = None
                self.history = pd.DataFrame()
                self.isfirst_predict = True
                self.new_model_history_label = None
                self.lgb_predict_list = None
                self.linear_predict_list = None
                gc.collect()
        return list(predictions), next_step
    def update(self, train_data, test_history_data, time_info):
        t1 = time.time()
        print(f"\nUpdate time budget: {time_info['update']}s")
        total_data = pd.concat([train_data, test_history_data])
        total_data.drop_duplicates(subset=[self.primary_timestamp] + self.primary_id,
                                   inplace=True)
        total_data.reset_index(drop=True, inplace=True)
        self.train(total_data, time_info)
        print("Finish update\n")
        self.train_time = time.time() - t1
        next_step = 'predict'
        return next_step

    def save(self, model_dir, time_info):
        print(f"\nSave time budget: {time_info['save']}s")
        self.data = pd.DataFrame()
        gc.collect()
        pkl_list = []
        for attr in dir(self):
            if attr.startswith('__') or attr in [
                    'train', 'predict', 'update', 'save', 'load']:
                continue
            pkl_list.append(attr)
            pickle.dump(getattr(self, attr),
                        open(os.path.join(model_dir, f'{attr}.pkl'), 'wb'))
        pickle.dump(pkl_list,
                    open(os.path.join(model_dir, 'pkl_list.pkl'), 'wb'))
        print("Finish save\n")

    def load(self, model_dir, time_info):
        print(f"\nLoad time budget: {time_info['load']}s")
        pkl_list = pickle.load(open(os.path.join(model_dir, 'pkl_list.pkl'), 'rb'))
        for attr in pkl_list:
            setattr(self, attr,
                    pickle.load(open(os.path.join(model_dir, f'{attr}.pkl'), 'rb')))
        print("Finish load\n")
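# A rough sketch of the lifecycle the Model class above is written for
# (train -> predict -> ... -> update -> ... -> save/load). The driver names
# and time_info values here are assumptions for illustration; the real
# harness that calls these hooks is not part of this file.
# next_step = model.train(train_data, time_info)
# while next_step == 'predict':
#     preds, next_step = model.predict(new_history, pred_record, time_info)
# model.save(model_dir, time_info)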
from preprocessing import Preprocess

if __name__ == '__main__':
    # os.system('sudo raspivid -br 80')
    cam = Camera(1280, 1080, dispositivo=1, camera_type='WEBCAM')
    cam.set_focus(25)
    cam.set_exposure(100)
    cam.set_exposure_auto(3)

    # Initialize the test plan
    testplan = Testplan(produto='solo', posto=1)
    imReference = testplan.get_imgRef()

    # Initialize the preprocessing model
    preprocess = Preprocess(produto='solo', posto=1)

    while True:
        ret, frame1 = cam.camera_read()
        frame1 = cv2.resize(frame1, (640, 480), interpolation=cv2.INTER_CUBIC)
        preprocess.executa_preprocessamento(imgFrame=frame1, imgRef=imReference)
        # preprocess.segmentation(frame1)
        imReg, frame2, Result = preprocess.custom_processing(imReference, frame1)
        if Result == True:
            testplan.executa_teste(imReg)
import pickle
import math
import bisect
# the loop below also relies on these modules, which were missing from the
# original imports
import unidecode
import preprocessing

with open('collection.pickle', 'rb') as f:
    collection = pickle.load(f)
with open('max_tf.pickle', 'rb') as f:
    max_tf = pickle.load(f)
with open('documentRoot.pickle', 'rb') as f:
    documentRoot = pickle.load(f)
with open('objs.pickle', 'rb') as f:
    documentLength, subset, get_index, getReference = pickle.load(f)

while True:
    query = input("Enter a query: ")
    final_query = preprocessing.replace_dates(query)
    final_query = preprocessing.lemma_stop(final_query)
    for i in range(len(final_query)):
        final_query[i] = unidecode.unidecode(final_query[i])
        final_query[i] = final_query[i].lower()
    print(final_query)
    tf_query = {}
    for w in final_query:
        if w not in tf_query:
            tf_query[w] = 1
        else:
            tf_query[w] += 1
    scores = {}
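    # A hedged sketch of how the scoring loop might continue from `scores = {}`
    # above, assuming `collection` maps term -> {doc_id: tf} postings and
    # `documentLength` maps doc_id -> document length. This is a generic
    # tf-idf accumulation, not the original author's exact code.
    # for w, qtf in tf_query.items():
    #     if w in collection:
    #         idf = math.log(len(documentLength) / len(collection[w]))
    #         for doc, tf in collection[w].items():
    #             scores[doc] = scores.get(doc, 0.0) + (qtf * idf) * (tf * idf)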