def analyszie_folder(wiki_folder, xlsx_folder, isGraphseg, use_xlsx_sub_folders=False):
    """Score annotated segmentations of a folder against the gold segmentation.

    For every input file returned by ``get_files(wiki_folder)`` the matching
    annotation file is looked up in ``xlsx_folder`` (or in each of its direct
    sub-folders) and compared with the gold segments of the input file.

    Args:
        wiki_folder: Folder holding the gold-segmented input files.
        xlsx_folder: Folder holding the annotation files, or the parent of
            several annotator folders when ``use_xlsx_sub_folders`` is set.
        isGraphseg: When true, annotation files carry the same name as the
            input file and are read with ``get_graphseg_segments``; otherwise
            ``".xlsx"`` is appended and ``get_xlsx_segments`` is used.
        use_xlsx_sub_folders: Look for annotations in every direct
            sub-directory of ``xlsx_folder`` instead of the folder itself.

    Returns:
        ``(pk, windiff)`` as returned by ``Accuracy.calc_accuracy()``, or the
        sentinel ``(1000, 1000)`` as soon as an annotation has a different
        number of segments than the gold segmentation.
    """
    acc = accuracy.Accuracy()
    input_files = get_files(wiki_folder)

    if use_xlsx_sub_folders:
        # os.path.join (not "+") so the folder works with or without a
        # trailing path separator.
        candidates = (os.path.join(xlsx_folder, entry) for entry in os.listdir(xlsx_folder))
        annotated_files_folders = [path for path in candidates if os.path.isdir(path)]
    else:
        annotated_files_folders = [xlsx_folder]

    for input_file in input_files:
        file_id = os.path.basename(input_file)
        file_name = file_id if isGraphseg else file_id + ".xlsx"
        xlsx_file_paths = [os.path.join(folder, file_name) for folder in annotated_files_folders]
        # Single-argument print() behaves identically under Python 2 and 3.
        print(str(xlsx_file_paths))
        print(str(input_file))

        for xlsx_file_path in xlsx_file_paths:
            # Nothing to compare when this annotator did not handle the file.
            if not os.path.isfile(xlsx_file_path):
                continue

            if isGraphseg:
                tested_segments = get_graphseg_segments(xlsx_file_path)
            else:
                tested_segments = get_xlsx_segments(xlsx_file_path)
            if tested_segments is None:
                continue

            gold_segments = get_gold_segments(input_file)
            if len(tested_segments) != len(gold_segments):
                print("(len(tested_segments) != len(gold_segments))")
                print("stop run")
                return 1000, 1000

            acc.update(tested_segments, gold_segments)

    # Print results:
    calculated_pk, calculated_windiff = acc.calc_accuracy()

    print('Finished testing.')
    print('Pk: {:.4}.'.format(calculated_pk))
    print('')

    return calculated_pk, calculated_windiff
def test(model, args, epoch, dataset, logger, threshold):
    """Evaluate ``model`` on ``dataset`` and return the resulting Pk.

    Relies on the module-level ``preds_stats`` accumulator, which is updated
    with the arg-max predictions of every batch and reset before returning.

    Args:
        model: Network to evaluate; called as ``model(data)``.
        args: Run options; ``stop_after`` and ``cuda`` are read here.
        epoch: Zero-based epoch index, only used in the log message.
        dataset: Iterable of ``(data, target, paths)`` batches.
        logger: Logger receiving the summary line.
        threshold: Probability above which a sentence is a segment boundary.

    Returns:
        The Pk value computed by ``Accuracy.calc_accuracy()``.
    """
    model.eval()
    with tqdm(desc='Testing', total=len(dataset)) as pbar:
        acc = accuracy.Accuracy()
        for i, (data, target, paths) in enumerate(dataset):
            if i == args.stop_after:
                break

            pbar.update()
            output = model(data)
            output_softmax = F.softmax(output, 1)
            targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False)

            output_seg = output.data.cpu().numpy().argmax(axis=1)
            target_seg = targets_var.data.cpu().numpy()
            preds_stats.add(output_seg, target_seg)

            # Move the class-1 probabilities to the host once per batch
            # instead of once per document.
            boundary_probs = output_softmax.data.cpu().numpy()[:, 1]

            current_idx = 0
            for t in target:
                to_idx = int(current_idx + len(t))
                predicted = boundary_probs[current_idx:to_idx] > threshold
                # Both sequences are extended with a trailing 1 before scoring.
                h = np.append(predicted, [1])
                tt = np.append(t, [1])
                acc.update(h, tt)
                current_idx = to_idx

        epoch_pk, epoch_windiff = acc.calc_accuracy()

        logger.debug(
            'Testing Epoch: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . '
            .format(epoch + 1, preds_stats.get_accuracy(), epoch_pk, epoch_windiff,
                    preds_stats.get_f1()))
        preds_stats.reset()

        return epoch_pk
def main(args):
    """Run 5-fold cross validation of a trained model on the flat Choi dataset.

    For each fold the remaining folders are used to pick a threshold via
    ``validate`` and the held-out folder is scored with ``test``; all folds
    feed one shared ``Accuracy`` accumulator from which the final Pk is read.

    Args:
        args: Parsed command-line options. Attributes read here: ``config``,
            ``expname``, ``test``, ``flat_choi``, ``load_from``, ``test_bs``
            and ``num_workers``.
    """
    sys.path.append(str(Path(__file__).parent))

    logger = utils.setup_logger(__name__, 'cross_validate_choi.log')

    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)
    logger.debug('Running with config %s', utils.config)

    configure(os.path.join('runs', args.expname))

    if not args.test:
        word2vec = gensim.models.KeyedVectors.load_word2vec_format(utils.config['word2vecfile'],
                                                                   binary=True)
    else:
        word2vec = None

    dataset_path = Path(args.flat_choi)

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    with open(args.load_from, 'rb') as f:
        model = torch.load(f)
    model.eval()
    model = maybe_cuda(model)

    test_accuracy = accuracy.Accuracy()

    fold_count = 5
    for j in range(fold_count):
        # Built with a comprehension: range() has no .remove() on Python 3.
        validate_folder_names = [dataset_path.joinpath(str(num))
                                 for num in range(fold_count) if num != j]
        dev_dataset = ChoiDataset(dataset_path, word2vec, folder=True,
                                  folders_paths=validate_folder_names)
        test_dataset = ChoiDataset(dataset_path, word2vec, folder=True,
                                   folders_paths=[dataset_path.joinpath(str(j))])

        dev_dl = DataLoader(dev_dataset, batch_size=args.test_bs, collate_fn=collate_fn,
                            shuffle=False, num_workers=args.num_workers)
        test_dl = DataLoader(test_dataset, batch_size=args.test_bs, collate_fn=collate_fn,
                             shuffle=False, num_workers=args.num_workers)

        _, threshold = validate(model, args, j, dev_dl, logger)
        test_pk = test(model, args, j, test_dl, logger, threshold, test_accuracy)
        logger.debug(colored(
            'Cross validation section {} with p_k {} and threshold {}'.format(j, test_pk, threshold),
            'green'))

    cross_validation_pk, _ = test_accuracy.calc_accuracy()
    print('Final cross validaiton Pk is: ' + str(cross_validation_pk))
    logger.debug(
        colored('Final cross validaiton Pk is: {}'.format(cross_validation_pk), 'green'))
def main(args):
    """Score pre-segmented files (graphseg output) against their gold markers.

    Every file in ``args.folder`` is read line by line. A line equal to the
    module-level ``truth`` marker closes a gold segment, a line equal to
    ``graphseg_delimeter`` closes a hypothesised segment; every other line is
    a sentence. Pk and WindowDiff over all files are printed at the end.

    Args:
        args: Parsed command-line options; ``config`` and ``folder`` are read.
    """
    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)

    algo_delimeter = graphseg_delimeter

    files = get_files(args.folder)
    acc = accuracy.Accuracy()

    for file_path in files:
        # Context manager so the handle is closed even if reading fails.
        with open(str(file_path), "r") as source:
            raw_content = source.read()

        # str.decode: this script targets Python 2 byte strings.
        sentences = [s for s in raw_content.decode('utf-8').strip().split("\n") if s]

        sentences_length = []
        h = []
        t = []
        is_first_sentence = True
        for sentence in sentences:
            if sentence == truth:
                if not is_first_sentence:
                    t[-1] = 1
                continue
            if sentence == algo_delimeter:
                if not is_first_sentence:
                    h[-1] = 1
                continue
            words = extract_sentence_words(sentence)
            # NOTE(review): sentences_length is collected but never consumed here.
            sentences_length.append(len(words))
            t.append(0)
            h.append(0)
            is_first_sentence = False

        # A file holding only markers (or nothing) has no sentence to score;
        # indexing [-1] below would raise IndexError.
        if not t:
            continue

        t[-1] = 1  # end of last segment
        h[-1] = 1  # they already segment it correctly.

        acc.update(h, t)

    calculated_pk, calculated_windiff = acc.calc_accuracy()
    print('Pk: {:.4}.'.format(calculated_pk))
    print('Win_diff: {:.4}.'.format(calculated_windiff))
def main(args):
    """Evaluate a saved segmentation model on the Wikipedia or Choi test data.

    Loads the model from ``args.model``, runs it over every dataset folder,
    thresholds the class-1 probability with ``args.seg_threshold`` and logs
    loss, accuracy, Pk and F1 for each folder.

    Args:
        args: Parsed command-line options. Attributes read here: ``config``,
            ``seg_threshold``, ``test``, ``wiki``, ``wiki_folder``,
            ``bySegLength``, ``model``, ``naive``, ``bs``, ``stop_after``,
            ``cuda`` and ``calc_word``.
    """
    start = timer()

    sys.path.append(str(Path(__file__).parent))

    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)

    logger.debug('Running with config %s', utils.config)
    print('Running with threshold: ' + str(args.seg_threshold))
    preds_stats = utils.predictions_analysis()

    if not args.test:
        word2vec = gensim.models.KeyedVectors.load_word2vec_format(
            utils.config['word2vecfile'], binary=True)
    else:
        word2vec = None

    word2vec_done = timer()
    print('Loading word2vec ellapsed: ' + str(word2vec_done - start) + ' seconds')
    dirname = 'test'

    if args.wiki:
        dataset_folders = [Path(utils.config['wikidataset']) / dirname]
        if args.wiki_folder:
            dataset_folders = [args.wiki_folder]
        print('running on wikipedia')
    else:
        if args.bySegLength:
            dataset_folders = getSegmentsFolders(utils.config['choidataset'])
            print('run on choi by segments length')
        else:
            dataset_folders = [utils.config['choidataset']]
            print('running on Choi')

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    with open(args.model, 'rb') as f:
        model = torch.load(f)

    model = maybe_cuda(model)
    model.eval()

    if args.naive:
        model = naive.create()

    for dataset_path in dataset_folders:

        if args.bySegLength:
            # Joined by hand so the output matches the former multi-item
            # print statement on both Python 2 and 3.
            print(' '.join(('Segment is ', os.path.basename(dataset_path), " :")))

        if args.wiki:
            if args.wiki_folder:
                dataset = WikipediaDataSet(dataset_path, word2vec, folder=True,
                                           high_granularity=False)
            else:
                dataset = WikipediaDataSet(dataset_path, word2vec, high_granularity=False)
        else:
            dataset = ChoiDataset(dataset_path, word2vec)

        dl = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn, shuffle=False)

        with tqdm(desc='Testing', total=len(dl)) as pbar:
            total_accurate = 0
            total_count = 0
            total_loss = 0
            acc = accuracy.Accuracy()

            for i, (data, targets, paths) in enumerate(dl):
                if i == args.stop_after:
                    break

                pbar.update()
                output = model(data)
                targets_var = Variable(maybe_cuda(torch.cat(targets, 0), args.cuda),
                                       requires_grad=False)
                # No loss is computed in this script; kept so the log format is stable.
                batch_loss = 0
                output_prob = softmax(output.data.cpu().numpy())
                output_seg = output_prob[:, 1] > args.seg_threshold
                target_seg = targets_var.data.cpu().numpy()
                batch_accurate = (output_seg == target_seg).sum()
                total_accurate += batch_accurate
                total_count += len(target_seg)
                total_loss += batch_loss
                preds_stats.add(output_seg, target_seg)

                current_target_idx = 0
                for k, t in enumerate(targets):
                    document_sentence_count = len(t)
                    sentences_length = [s.size()[0] for s in data[k]] if args.calc_word else None
                    to_idx = int(current_target_idx + document_sentence_count)
                    h = output_seg[current_target_idx:to_idx]

                    # hypothesis and targets are missing classification of last
                    # sentence, and therefore we will add 1 for both
                    h = np.append(h, [1])
                    t = np.append(t.cpu().numpy(), [1])

                    acc.update(h, t, sentences_length=sentences_length)

                    current_target_idx = to_idx

                # float(): "/" on two integer counts floors to 0 under Python 2.
                batch_accuracy = float(batch_accurate) / len(target_seg)
                logger.debug('Batch %s - error %7.4f, Accuracy: %7.4f', i, batch_loss,
                             batch_accuracy)
                pbar.set_description('Testing, Accuracy={:.4}'.format(batch_accuracy))

        # Guard the averages: no batch may have been processed
        # (empty loader or stop_after == 0).
        batch_total = len(dl)
        average_loss = float(total_loss) / batch_total if batch_total else 0.0
        average_accuracy = float(total_accurate) / total_count if total_count else 0.0
        calculated_pk, _ = acc.calc_accuracy()

        logger.info('Finished testing.')
        logger.info('Average loss: %s', average_loss)
        logger.info('Average accuracy: %s', average_accuracy)
        logger.info('Pk: {:.4}.'.format(calculated_pk))
        logger.info('F1: {:.4}.'.format(preds_stats.get_f1()))

        end = timer()
        print('Seconds to execute to whole flow: ' + str(end - start))
def __init__(self):
    """Create one ``Accuracy`` accumulator per candidate threshold.

    The candidates are ``np.arange(0, 1, 0.05)``; ``self.accuracies`` maps
    each of those values to its own accumulator.
    """
    self.thresholds = np.arange(0, 1, 0.05)

    per_threshold = {}
    for candidate in self.thresholds:
        per_threshold[candidate] = accuracy.Accuracy()
    self.accuracies = per_threshold
# Main window used to record pointing-gesture dataset items from the depth
# sensor: it shows the live depth image, overlays the tracked skeleton and
# offers several ways (direct, countdown, distance-triggered, heat map) to
# save the current capture.
class DatasetGui(QtWidgets.QWidget):

    utils = Utils()
    featureExtractor = FeatureExtractor()
    bpn = BPNHandler(True)
    accuracy = accuracy.Accuracy()

    # Resolution of the frames reshaped from the sensor buffers
    FRAME_HEIGHT = 480
    FRAME_WIDTH = 640
    # Accepted gap between the hand depth and the wished distance
    DEPTH_TOLERANCE = 10
    # Initial value of the countdown before recording
    COUNTDOWN_SECONDS = 10

    # Constructor of the DatasetGui class
    #
    # @param    None
    # @return   None
    def __init__(self):
        super(DatasetGui, self).__init__()
        self.setWindowTitle("Pointing Gesture Recognition - Dataset recording")

        # Retrieve all settings
        self.settings = Settings()

        # Load sounds
        resourceFolder = self.settings.getResourceFolder()
        self.countdownSound = QtMultimedia.QSound(resourceFolder + "countdown.wav")
        self.countdownEndedSound = QtMultimedia.QSound(
            self.settings.getResourceFolder() + "countdown-ended.wav")

        # Get the context and initialise it
        self.context = Context()
        self.context.init()

        # Create the depth generator to get the depth map of the scene
        self.depth = DepthGenerator()
        self.depth.create(self.context)
        self.depth.set_resolution_preset(RES_VGA)
        self.depth.fps = 30

        # Create the image generator to get an RGB image of the scene
        self.image = ImageGenerator()
        self.image.create(self.context)
        self.image.set_resolution_preset(RES_VGA)
        self.image.fps = 30

        # Create the user generator to detect skeletons
        self.user = UserGenerator()
        self.user.create(self.context)

        # Initialise the skeleton tracking
        skeleton.init(self.user)

        # Start generating
        self.context.start_generating_all()
        print("Starting to detect users..")

        # Create a new dataset item
        self.data = Dataset()

        # Create a timer for an eventual countdown before recording the data
        self.countdownTimer = QtCore.QTimer()
        self.countdownRemaining = self.COUNTDOWN_SECONDS
        self.countdownTimer.setInterval(1000)
        self.countdownTimer.setSingleShot(True)
        self.countdownTimer.timeout.connect(self.recordCountdown)

        # Create a timer to eventually record data for a heat map
        self.heatmapRunning = False
        self.heatmapTimer = QtCore.QTimer()
        self.heatmapTimer.setInterval(10)
        self.heatmapTimer.setSingleShot(True)
        self.heatmapTimer.timeout.connect(self.recordHeatmap)

        # Points collected for a heat map; defined up front so the heat-map
        # methods never hit a missing attribute
        self.heatmap = []

        # Create the global layout
        self.layout = QtWidgets.QVBoxLayout(self)

        # Create custom widgets to hold sensor's images
        self.depthImage = SensorWidget()
        self.depthImage.setGeometry(10, 10, self.FRAME_WIDTH, self.FRAME_HEIGHT)

        # Add these custom widgets to the global layout
        self.layout.addWidget(self.depthImage)

        # Hold the label indicating the number of dataset taken
        self.numberLabel = QtWidgets.QLabel()
        self.updateDatasetNumberLabel()

        # Create the acquisition form elements
        self.createAcquisitionForm()

        # Register a dialog window to prompt the target position
        self.dialogWindow = DatasetDialog(self)

        # Allow to save the data when the right distance is reached
        self.recordIfReady = False

        # Create and launch a timer to update the images
        self.timerScreen = QtCore.QTimer()
        self.timerScreen.setInterval(30)
        self.timerScreen.setSingleShot(True)
        self.timerScreen.timeout.connect(self.updateImage)
        self.timerScreen.start()

    # Build an RGB frame from the current raw 8-bit depth map
    #
    # @param    None
    # @return   numpy.array   Depth frame converted to RGB
    def _getDepthFrame(self):
        frame = np.fromstring(self.depth.get_raw_depth_map_8(), np.uint8).reshape(
            self.FRAME_HEIGHT, self.FRAME_WIDTH)
        return cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)

    # Update the depth image displayed within the main window
    #
    # @param    None
    # @return   None
    def updateImage(self):
        # Update to next frame
        self.context.wait_and_update_all()

        # Extract informations of each tracked user
        self.data = skeleton.track(self.user, self.depth, self.data)

        # Get the whole depth map
        self.data.depth_map = np.asarray(self.depth.get_tuple_depth_map()).reshape(
            self.FRAME_HEIGHT, self.FRAME_WIDTH)

        # Create the frame from the raw depth map string and convert it to RGB
        frame = self._getDepthFrame()

        # Get the RGB image of the scene
        self.data.image = np.fromstring(self.image.get_raw_image_map_bgr(), dtype=np.uint8).reshape(
            self.FRAME_HEIGHT, self.FRAME_WIDTH, 3)

        # Will be used to specify the depth of the current hand wished
        currentDepth, showCurrentDepth = 0, ""

        if len(self.user.users) > 0 and len(self.data.skeleton["head"]) > 0:
            # Highlight the head
            ui.drawPoint(frame, self.data.skeleton["head"][0], self.data.skeleton["head"][1], 5)

            # Display lines from elbows to the respective hands
            ui.drawElbowLine(frame, self.data.skeleton["elbow"]["left"],
                             self.data.skeleton["hand"]["left"])
            ui.drawElbowLine(frame, self.data.skeleton["elbow"]["right"],
                             self.data.skeleton["hand"]["right"])

            # Get the pixel's depth from the coordinates of the hands
            leftPixel = self.utils.getDepthFromMap(self.data.depth_map,
                                                   self.data.skeleton["hand"]["left"])
            rightPixel = self.utils.getDepthFromMap(self.data.depth_map,
                                                    self.data.skeleton["hand"]["right"])

            if self.data.hand == self.settings.LEFT_HAND:
                currentDepth = leftPixel
            elif self.data.hand == self.settings.RIGHT_HAND:
                currentDepth = rightPixel

            # Get the shift of the boundaries around both hands
            leftShift = self.utils.getHandBoundShift(leftPixel)
            rightShift = self.utils.getHandBoundShift(rightPixel)

            # Display a rectangle around both hands
            ui.drawHandBoundaries(frame, self.data.skeleton["hand"]["left"], leftShift,
                                  (50, 100, 255))
            ui.drawHandBoundaries(frame, self.data.skeleton["hand"]["right"], rightShift,
                                  (200, 70, 30))

        # Record the current data if the user is ready
        if self.recordIfReady:
            wished = self.data.getWishedDistance()
            cv2.putText(frame, str(wished), (470, 60), cv2.FONT_HERSHEY_SIMPLEX, 2,
                        (252, 63, 253), 5)

            depth = int(currentDepth)
            if depth - self.DEPTH_TOLERANCE <= wished <= depth + self.DEPTH_TOLERANCE:
                self.record([])
                self.recordIfReady = False
            elif depth < wished:
                # Suffix tells whether the hand depth is below or above the wished distance
                showCurrentDepth = str(currentDepth) + " +"
            else:
                showCurrentDepth = str(currentDepth) + " -"
        else:
            showCurrentDepth = str(currentDepth)

        cv2.putText(frame, showCurrentDepth, (5, 60), cv2.FONT_HERSHEY_SIMPLEX, 2,
                    (50, 100, 255), 5)

        # Update the frame
        self.depthImage.setPixmap(ui.convertOpenCVFrameToQPixmap(frame))

        # Single-shot timer: re-arm it for the next frame
        self.timerScreen.start()

    # Update the label indicating the number of dataset elements saved so far for the current type
    #
    # @param    None
    # @return   None
    def updateDatasetNumberLabel(self):
        if self.data.type == Dataset.TYPE_POSITIVE:
            folder = self.settings.getPositiveFolder()
        elif self.data.type == Dataset.TYPE_NEGATIVE:
            folder = self.settings.getNegativeFolder()
        elif self.data.type == Dataset.TYPE_ACCURACY:
            folder = self.settings.getAccuracyFolder()
        else:
            folder = self.settings.getDatasetFolder()

        self.numberLabel.setText("Dataset #%d" % (self.utils.getFileNumberInFolder(folder)))

    # Record the actual informations
    #
    # @param    obj     Initiator of the event
    # @return   None
    def record(self, obj):
        # If the user collects data to check accuracy, prompts additional informations
        if self.data.type == Dataset.TYPE_ACCURACY:
            self.saveForTarget()
        # If the user collects data for a heat map, let's do it
        elif self.data.type == Dataset.TYPE_HEATMAP:
            # The same button will be used to stop recording
            if not self.heatmapRunning:
                self.startRecordHeatmap()
            else:
                self.stopRecordHeatmap()
        else:
            # Directly save the dataset and update the label number
            self.data.save()
            self.countdownEndedSound.play()
            self.updateDatasetNumberLabel()

    # Handle a countdown as a mean to record the informations with a delay
    #
    # @param    None
    # @return   None
    def recordCountdown(self):
        # Decrease the countdown and check if it needs to continue
        self.countdownRemaining -= 1

        if self.countdownRemaining <= 0:
            # Re-initialise the timer and record the data
            self.countdownTimer.stop()
            self.countdownButton.setText("Saving..")
            self.countdownRemaining = self.COUNTDOWN_SECONDS
            self.record([])
        else:
            self.countdownTimer.start()
            self.countdownSound.play()

        # Display the actual reminaining
        self.countdownButton.setText("Save in %ds" % (self.countdownRemaining))

    # Record a heatmap representation of the informations by successive captures
    #
    # @param    None
    # @return   False when no hand is selected, None otherwise
    def recordHeatmap(self):
        if self.data.hand == self.settings.NO_HAND:
            print("Unable to record as no hand is selected")
            return False

        if len(self.user.users) > 0 and len(self.data.skeleton["head"]) > 0:
            # Input the data into the feature extractor
            result = self.bpn.check(self.featureExtractor.getFeatures(self.data))

            # Add the depth of the finger tip
            # NOTE(review): this appends to the list held by the feature extractor itself
            point = self.featureExtractor.fingerTip[result[1]]
            point.append(self.utils.getDepthFromMap(self.data.depth_map, point))

            # Verify that informations are correct
            if point[0] != 0 and point[1] != 0 and point[2] != 0:
                # Add the result of the neural network
                point.append(result[0])

                self.heatmap.append(point)
                self.countdownSound.play()

        # Loop timer
        self.heatmapTimer.start()

    # Start the recording of the heatmap
    #
    # @param    None
    # @return   None
    def startRecordHeatmap(self):
        self.saveButton.setText("Stop recording")
        self.heatmapRunning = True
        self.heatmapTimer.start()

    # Stop the recording of the heatmap
    #
    # @param    None
    # @return   None
    def stopRecordHeatmap(self):
        self.heatmapTimer.stop()
        self.heatmapRunning = False
        self.countdownEndedSound.play()

        self.saveButton.setText("Record")

        self.accuracy.showHeatmap(self.heatmap, "front")
        self.heatmap = []

    # Raise a flag to record the informations when the chosen distance will be met
    #
    # @param    None
    # @return   None
    def startRecordWhenReady(self):
        self.recordIfReady = True

    # Hold the current informations to indicate the position of the target thanks to the dialog window
    #
    # @param    None
    # @return   None
    def saveForTarget(self):
        # Freeze the data
        self.timerScreen.stop()
        self.countdownEndedSound.play()

        # Translate the depth values to a frame and set it in the dialog window
        self.dialogWindow.setFrame(self._getDepthFrame())

        # Prompt the position of the target
        self.dialogWindow.exec_()

    # Toggle the type of dataset chosen
    #
    # @param    value   Identifier of the new type of dataset
    # @return   None
    def toggleType(self, value):
        self.data.toggleType(value)

        if value == self.data.TYPE_HEATMAP:
            self.saveButton.setText("Record")
            self.countdownButton.setText("Record in %ds" % (self.countdownRemaining))
            self.readyButton.setEnabled(False)

            # Create an array to hold all points
            self.heatmap = []
        else:
            self.updateDatasetNumberLabel()
            # The combo box fires this slot while the form is still being
            # built, i.e. before the buttons exist
            if hasattr(self, 'saveButton'):
                self.saveButton.setText("Save")
                self.countdownButton.setText("Save in %ds" % (self.countdownRemaining))
                self.readyButton.setEnabled(True)

    # Add a labelled drop down menu to a layout
    #
    # @param    parentLayout    Layout receiving the new row
    # @param    title           Text of the label
    # @param    handler         Slot connected to currentIndexChanged
    # @param    items           Entries of the drop down menu, in order
    # @return   None
    def _addComboRow(self, parentLayout, title, handler, items):
        hlayout = QtWidgets.QHBoxLayout()
        label = QtWidgets.QLabel(title)
        label.setFixedWidth(100)
        comboBox = QtWidgets.QComboBox()
        # Connected before the items are added, as the handler is expected
        # to run for the initial selection too
        comboBox.currentIndexChanged.connect(handler)
        comboBox.setFixedWidth(200)
        for item in items:
            comboBox.addItem(item)
        comboBox.setCurrentIndex(0)
        hlayout.addWidget(label)
        hlayout.addWidget(comboBox)
        parentLayout.addLayout(hlayout)

    # Create the acquisition form of the main window
    #
    # @param    None
    # @return   None
    def createAcquisitionForm(self):
        globalLayout = QtWidgets.QHBoxLayout()
        vlayout = QtWidgets.QVBoxLayout()

        # Drop down menu of the distance to record the informations when the pointing hand meet the corresponding value
        self._addComboRow(vlayout, "Distance", self.data.toggleDistance,
                          ["550", "750", "1000", "1250", "1500", "1750", "2000"])

        # Drop down menu to select the type of hand of the dataset
        self._addComboRow(vlayout, "Pointing hand", self.data.toggleHand,
                          ["Left", "Right", "None"])

        # Drop down menu of the dataset type
        self._addComboRow(vlayout, "Type", self.toggleType,
                          ["Positive", "Negative", "Accuracy", "Heat map"])

        globalLayout.addLayout(vlayout)
        vlayout = QtWidgets.QVBoxLayout()

        self.numberLabel.setAlignment(QtCore.Qt.AlignCenter)
        vlayout.addWidget(self.numberLabel)

        # Action buttons to record the way that suits the most
        hLayout = QtWidgets.QHBoxLayout()
        self.readyButton = QtWidgets.QPushButton('Save when ready',
                                                 clicked=self.startRecordWhenReady)
        hLayout.addWidget(self.readyButton)
        vlayout.addLayout(hLayout)

        item_layout = QtWidgets.QHBoxLayout()
        self.countdownButton = QtWidgets.QPushButton("Save in %ds" % (self.countdownRemaining),
                                                     clicked=self.countdownTimer.start)
        self.saveButton = QtWidgets.QPushButton('Save', clicked=self.record)
        item_layout.addWidget(self.countdownButton)
        item_layout.addWidget(self.saveButton)
        vlayout.addLayout(item_layout)

        globalLayout.addLayout(vlayout)
        self.layout.addLayout(globalLayout)
def test(model, args, epoch, dataset, logger, threshold):
    """Evaluate ``model`` at five fixed thresholds and log one line per threshold.

    The class-1 probability of every sentence is compared against 0.1, 0.2,
    0.3, 0.4 and 0.5, each with its own ``Accuracy`` accumulator. Relies on
    the module-level ``preds_stats`` accumulator, which is updated with the
    arg-max predictions of every batch and reset before returning.

    Args:
        model: Network to evaluate; called as
            ``model(data, sent_bert_vec, target_idx)`` and expected to return
            a pair whose first item holds the logits.
        args: Run options; ``stop_after`` and ``cuda`` are read here.
        epoch: Zero-based epoch index, only used in the log messages.
        dataset: Iterable of
            ``(data, target, paths, sent_bert_vec, target_idx)`` batches.
        logger: Logger receiving the summary lines.
        threshold: Unused; kept so existing callers keep working.

    Returns:
        The Pk value obtained with the 0.1 threshold.
    """
    model.eval()
    sweep_thresholds = (0.1, 0.2, 0.3, 0.4, 0.5)

    with tqdm(desc='Testing', total=len(dataset)) as pbar:
        accumulators = [accuracy.Accuracy() for _ in sweep_thresholds]

        for i, (data, target, paths, sent_bert_vec, target_idx) in enumerate(dataset):
            if i == args.stop_after:
                break

            pbar.update()
            output, _ = model(data, sent_bert_vec, target_idx)
            output_softmax = F.softmax(output, 1)
            targets_var = Variable(maybe_cuda(torch.cat(target, 0), args.cuda), requires_grad=False)
            output_seg = output.data.cpu().numpy().argmax(axis=1)
            target_seg = targets_var.data.cpu().numpy()
            preds_stats.add(output_seg, target_seg)

            # Move the class-1 probabilities to the host once per batch
            # instead of several times per document.
            boundary_probs = output_softmax.data.cpu().numpy()[:, 1]

            current_idx = 0
            for t in target:
                to_idx = int(current_idx + len(t))
                document_probs = boundary_probs[current_idx:to_idx]
                # Both sequences are extended with a trailing 1 before scoring.
                tt = np.append(t, [1])
                for cutoff, acc in zip(sweep_thresholds, accumulators):
                    acc.update(np.append(document_probs > cutoff, [1]), tt)
                current_idx = to_idx

        results = [acc.calc_accuracy() for acc in accumulators]

        for epoch_pk, epoch_windiff in results:
            logger.debug(
                'Testing Epoch: {}, accuracy: {:.4}, Pk: {:.4}, Windiff: {:.4}, F1: {:.4} . '.format(
                    epoch + 1, preds_stats.get_accuracy(), epoch_pk, epoch_windiff,
                    preds_stats.get_f1()))
        preds_stats.reset()

        return results[0][0]
def _build_arg_parser():
    """Return the argparse parser describing every command-line option of main()."""
    parser = argparse.ArgumentParser(
        description='Hierarchical Clustering and Classification')
    parser.add_argument('--batchsize', '-b', type=int, default=256,
                        help='Number of images in each mini-batch for clustering')
    parser.add_argument('--batchsize2', '-b2', type=int, default=64,
                        help='Number of images in each mini-batch for classification')
    parser.add_argument('--data_type', '-d', type=str, default='toy',
                        help='dataset name')
    parser.add_argument('--model_type', '-m', type=str, default='linear',
                        help='model to use')
    parser.add_argument('--model_path', '-mp', type=str, default='',
                        help='pre-trained model if necessary')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='gpu number to use')
    parser.add_argument('--cluster', '-c', type=int, default=2,
                        help='the size of cluster')
    parser.add_argument('--weight_decay', '-w', type=float, default=0.0000,
                        help='weight decay for classification')
    parser.add_argument('--unit', '-u', type=int, default=300,
                        help='unit size for DocModel')
    parser.add_argument('--alpha', '-a', type=float, default=0.001,
                        help='learning rate for clustering')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='the number of epochs for clustering')
    parser.add_argument('--epoch2', '-e2', type=int, default=100,
                        help='the number of epochs for classification')
    parser.add_argument('--mu', '-mu', type=float, default=150.0,
                        help='the hyper-parameter for clustering')
    parser.add_argument('--out', '-o', type=str, default='results',
                        help='output directory for result file')
    parser.add_argument('--train_file', '-train_f', type=str, default='',
                        help='training dataset file')
    parser.add_argument('--test_file', '-test_f', type=str, default='',
                        help='test dataset file')
    parser.add_argument('--seed', '-s', type=int, default=0)
    parser.add_argument('--resume', '-r', default='',
                        help='resume the training from snapshot')
    parser.add_argument('--resume2', '-r2', default='',
                        help='resume the training from snapshot')
    parser.add_argument('--optimizer', '-op', type=str, default='Adam',
                        help='optimizer for clustering')
    parser.add_argument('--optimizer2', '-op2', type=str, default='Adam',
                        help='optimizer for classification')
    parser.add_argument('--initial_lr', type=float, default=0.001,
                        help='initial learning rate for classification')
    parser.add_argument('--lr_decay_rate', type=float, default=0.5,
                        help='decay rate for classification if MomentumSGD is used')
    parser.add_argument('--lr_decay_epoch', type=float, default=25,
                        help='decay epoch for classification if MomentumSGD is used')
    parser.add_argument('--random', action='store_true', default=False,
                        help='Use random assignment or not')
    # NOTE(review): help text looks copy-pasted from --random, and the flag is
    # not read anywhere in main() — confirm intent before rewording.
    parser.add_argument('--valid', '--v', action='store_true',
                        help='Use random assignment or not')
    return parser


def main():
    """Cluster the classes of a dataset, then train a hierarchical classifier.

    Stage one trains ``model`` with the clustering updater and derives a
    class-to-cluster assignment from it (or draws a random assignment when
    ``--random`` is given). Stage two wraps the model in
    ``h_net.HierarchicalNetwork`` and trains it for classification, reporting
    accuracy on the test split after every epoch.

    Raises:
        ValueError: If ``--model_type`` is not supported for ``--data_type``.
    """
    args = _build_arg_parser().parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    gpu = args.gpu
    data_type = args.data_type
    model_type = args.model_type
    num_clusters = args.cluster
    initial_lr = args.initial_lr
    lr_decay_rate = args.lr_decay_rate
    lr_decay_epoch = args.lr_decay_epoch
    opt1 = args.optimizer
    opt2 = args.optimizer2
    model_path = args.model_path
    rand_assign = args.random
    train_file = args.train_file
    test_file = args.test_file

    unit = args.unit
    alpha = args.alpha
    sparse = False

    ndim = 1
    n_in = None
    train_transform = None
    test_transform = None

    # Pick the network matching the dataset / model combination.
    if data_type == 'toy':
        model = network.LinearModel(2, 2)
    elif data_type == 'mnist':
        if model_type == 'linear':
            model = network.LinearModel(784, num_clusters)
        elif model_type == 'DNN':
            model = network.MLP(1000, num_clusters)
        elif model_type == 'CNN':
            ndim = 3
            model = network.CNN(num_clusters)
        else:
            raise ValueError
    elif data_type == 'cifar100':
        train_transform = partial(dataset.transform, mean=0.0, std=1.0, train=True)
        test_transform = partial(dataset.transform, mean=0.0, std=1.0, train=False)
        if model_type == 'Resnet50':
            model = network.ResNet50(num_clusters)
            n_in = 2048
            load_npz(model_path, model, not_load_list=['fc7'])
        elif model_type == 'VGG':
            model = network.VGG(num_clusters)
            n_in = 1024
            load_npz(model_path, model, not_load_list=['fc6'])
        else:
            raise ValueError
    elif data_type == 'LSHTC1':
        sparse = True
        # NOTE(review): DocModel uses n_in=1024 here while the linear model
        # uses 92586; the Dmoz branch below is the other way round — confirm.
        if model_type == 'DocModel':
            model = network.DocModel(n_in=1024, n_mid=unit, n_out=num_clusters)
        elif model_type == 'DocModel2':
            model = network.DocModel2(n_in=1024, n_mid=unit, n_out=num_clusters)
        elif model_type == 'linear':
            model = network.LinearModel(n_in=92586, n_out=num_clusters)
        else:
            raise ValueError
    elif data_type == 'Dmoz':
        sparse = True
        if model_type == 'DocModel':
            model = network.DocModel(n_in=561127, n_mid=unit, n_out=num_clusters)
        elif model_type == 'linear':
            model = network.LinearModel(n_in=1024, n_out=num_clusters)
        else:
            raise ValueError
    else:
        if model_type == 'Resnet50':
            model = network.ResNet50(num_clusters)
        elif model_type == 'Resnet101':
            model = network.ResNet101(num_clusters)
        elif model_type == 'VGG':
            model = network.VGG(num_clusters)
        elif model_type == 'CNN':
            model = network.CNN(num_clusters)
        else:
            raise ValueError

    if gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    (train_instances, train_labels), (test_instances, test_labels), num_classes \
        = load_data(data_type, ndim, train_file, test_file)

    if rand_assign:
        assignment, count_classes = random_assignment(num_clusters, num_classes)
    else:
        # ---- start clustering ----
        if opt1 == 'Adam':
            optimizer = chainer.optimizers.Adam(alpha=alpha)
        else:
            optimizer = chainer.optimizers.SGD(lr=alpha)
        optimizer.setup(model)

        train = clustering.dataset.Dataset(train_instances, train_labels, sparse)
        test = clustering.dataset.Dataset(test_instances, test_labels, sparse)

        train_iter = chainer.iterators.SerialIterator(train, batch_size=args.batchsize)

        train_updater = clustering.updater.Updater(model, train, train_iter, optimizer,
                                                   num_clusters=num_clusters,
                                                   device=gpu, mu=args.mu)

        trainer = training.Trainer(train_updater, (args.epoch, 'epoch'), out=args.out)

        trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'iteration', 'main/loss', 'main/loss_cc',
                'main/loss_mut_info', 'main/H_Y', 'main/H_YX', 'elapsed_time'
            ]))
        trainer.extend(extensions.snapshot(), trigger=(5, 'epoch'))

        if args.resume:
            chainer.serializers.load_npz(args.resume, trainer)

        trainer.run()
        # ---- end clustering ----

        cluster_label = separate.det_cluster(model, train, num_classes, batchsize=128,
                                             device=gpu, sparse=sparse)

        assignment, count_classes = separate.assign(cluster_label, num_classes, num_clusters)

        # Drop the clustering-stage objects before the classification stage
        # builds its own.
        del optimizer
        del train_iter
        del train_updater
        del trainer
        del train
        del test

    print(assignment)

    # ---- start classification ----
    model = h_net.HierarchicalNetwork(model, num_clusters, count_classes, n_in=n_in)
    if opt2 == 'Adam':
        optimizer2 = chainer.optimizers.Adam(alpha=initial_lr)
    elif opt2 == 'SGD':
        optimizer2 = chainer.optimizers.SGD(lr=initial_lr)
    else:
        optimizer2 = chainer.optimizers.MomentumSGD(lr=initial_lr)
    optimizer2.setup(model)
    if args.weight_decay > 0:
        optimizer2.add_hook(chainer.optimizer.WeightDecay(args.weight_decay))

    if gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    train = classification.dataset.Dataset(train_instances, train_labels, assignment,
                                           _transform=train_transform, sparse=sparse)
    test = classification.dataset.Dataset(test_instances, test_labels, assignment,
                                          _transform=test_transform, sparse=sparse)

    train_iter = chainer.iterators.SerialIterator(train, batch_size=args.batchsize2)
    test_iter = chainer.iterators.SerialIterator(test, batch_size=1, repeat=False)

    train_updater = classification.updater.Updater(model, train, train_iter, optimizer2,
                                                   num_clusters, device=gpu)

    trainer = training.Trainer(train_updater, (args.epoch2, 'epoch'), args.out)

    acc = accuracy.Accuracy(model, assignment, num_clusters)
    trainer.extend(extensions.Evaluator(test_iter, acc, device=gpu))

    trainer.extend(
        extensions.snapshot(filename='snapshot_iter_{.updater.iteration}.npz'),
        trigger=(20, 'epoch'))
    trainer.extend(extensions.LogReport(trigger=(1, 'epoch')))
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'main/loss_cluster', 'main/loss_class',
            'validation/main/accuracy', 'validation/main/cluster_accuracy',
            'validation/main/loss', 'elapsed_time'
        ]))

    # The learning rate is decayed for every optimizer except Adam.
    if opt2 != 'Adam':
        trainer.extend(extensions.ExponentialShift('lr', lr_decay_rate),
                       trigger=(lr_decay_epoch, 'epoch'))

    if args.resume2:
        chainer.serializers.load_npz(args.resume2, trainer)

    trainer.run()
def evalModel(self):
    """Set up evaluation state, then run the embedding-quality computation.

    Stores a one-element float tensor holding 1.0, moved to ``self.device``,
    as ``self.one``; stores a fresh ``accuracy.Accuracy`` as ``self.accObj``;
    finally calls ``self.computeEmbeddingQuality()``. Nothing is returned.
    """
    unit = Variable(torch.FloatTensor([1.0]))
    self.one = unit.to(self.device)
    self.accObj = accuracy.Accuracy()
    self.computeEmbeddingQuality()
def _export_probabilities(probs_stats, article_stats, worksheet):
    """Group the recorded class-1 probabilities by article and mirror them to a sheet.

    Args:
        probs_stats: two parallel lists; ``probs_stats[0][b]`` holds the softmax
            rows of batch ``b`` and ``probs_stats[1][b]`` the matching targets.
        article_stats: ``article_stats[b]`` lists the sentence count of every
            article in batch ``b``.
        worksheet: object with a ``write(row, col, value)`` method. Article ``a``
            gets its probabilities in column ``2 * a`` and its targets in
            column ``2 * a + 1``, one row per sentence.

    Returns:
        A list with one inner list of class-1 probabilities per article.
    """
    export = []
    article = 0
    for batch, probs in enumerate(probs_stats[0]):
        # `boundary` walks the rows of this batch across its articles.
        boundary = 0
        for sentences in article_stats[batch]:
            article_probs = []
            export.append(article_probs)
            for sentence in range(sentences):
                probability = probs[boundary][1]
                article_probs.append(probability)
                worksheet.write(sentence, 2 * article, probability)
                worksheet.write(sentence, 2 * article + 1,
                                probs_stats[1][batch][boundary])
                boundary += 1
            article += 1
    return export


def main(args):
    """Evaluate a trained segmentation model and export its probabilities.

    Reads the configuration, optionally loads word2vec, downloads the model
    from S3, and runs it over every selected dataset folder. Accuracy, Pk and
    F1 are logged per folder. The per-sentence class-1 probabilities and
    targets of all folders are then written to ``output.xlsx`` and
    ``LSTM_probs.pkl``, and both files are uploaded to S3.

    The export runs once, after every folder has been evaluated, because the
    lists and the workbook it uses are shared by all folders.

    Args:
        args: parsed command-line options. This function reads ``config``,
            ``seg_threshold``, ``test``, ``wiki``, ``wiki_folder``,
            ``bySegLength``, ``model``, ``naive``, ``bs``, ``stop_after``,
            ``cuda`` and ``calc_word``.
    """
    start = timer()

    sys.path.append(str(Path(__file__).parent))

    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)

    logger.debug('Running with config %s', utils.config)
    print('Running with threshold: ' + str(args.seg_threshold))
    preds_stats = utils.predictions_analysis()
    # probs_stats[0] collects softmax rows per batch, probs_stats[1] the targets.
    probs_stats = [[], []]
    # One list of per-article sentence counts per evaluated batch.
    article_stats = []

    s3 = boto3.resource('s3')
    mybucket = s3.Bucket('data.data-science.signal')
    myfolder = 'summaries-segmentation'
    # NOTE(review): no sample download happens here any more (the
    # pullBucketSamples call was disabled), yet the message is still printed.
    print('Samples pulled successfully into container')

    # When running from a container the outputs were meant to go under /output/.
    workbook = excel.Workbook('output.xlsx')
    worksheet = workbook.add_worksheet()

    if not args.test:
        word2vec = gensim.models.KeyedVectors.load_word2vec_format(
            utils.config['word2vecfile'], binary=True)
    else:
        word2vec = None

    word2vec_done = timer()
    print('Loading word2vec ellapsed: ' + str(word2vec_done - start) +
          ' seconds')
    dirname = 'test'

    if args.wiki:
        dataset_folders = [Path(utils.config['wikidataset']) / dirname]
        if args.wiki_folder:
            dataset_folders = [args.wiki_folder]
        print('running on wikipedia')
    else:
        if args.bySegLength:
            dataset_folders = getSegmentsFolders(utils.config['choidataset'])
            print('run on choi by segments length')
        else:
            dataset_folders = [utils.config['choidataset']]
            print('running on Choi')

    key = myfolder + args.model
    mybucket.Object(key).download_file('trained_model')
    # NOTE(review): torch.load unpickles the downloaded file; only point
    # args.model at objects from a trusted bucket.
    with open('trained_model', 'rb') as f:
        model = torch.load(f)

    model = maybe_cuda(model)
    model.eval()

    if args.naive:
        model = naive.create()

    for dataset_path in dataset_folders:
        if args.bySegLength:
            # Same text the comma-separated print statement produced.
            print('Segment is  {}  :'.format(os.path.basename(dataset_path)))

        if args.wiki:
            if args.wiki_folder:
                dataset = WikipediaDataSet(dataset_path, word2vec, folder=True,
                                           high_granularity=False)
            else:
                dataset = WikipediaDataSet(dataset_path, word2vec,
                                           high_granularity=False)
        else:
            dataset = ChoiDataset(dataset_path, word2vec)

        dl = DataLoader(dataset, batch_size=args.bs, collate_fn=collate_fn,
                        shuffle=False)

        with tqdm(desc='Testing', total=len(dl)) as pbar:
            total_accurate = 0
            total_count = 0
            total_loss = 0
            acc = accuracy.Accuracy()

            for i, (data, targets, paths) in enumerate(dl):
                if i == args.stop_after:
                    break

                pbar.update()
                output = model(data)
                targets_var = Variable(maybe_cuda(torch.cat(targets, 0),
                                                  args.cuda),
                                       requires_grad=False)
                batch_loss = 0
                output_prob = softmax(output.data.cpu().numpy())
                probs_stats[0].append(output_prob.tolist())
                output_seg = output_prob[:, 1] > args.seg_threshold
                target_seg = targets_var.data.cpu().numpy()
                probs_stats[1].append(target_seg.tolist())
                batch_accurate = (output_seg == target_seg).sum()
                total_accurate += batch_accurate
                total_count += len(target_seg)
                total_loss += batch_loss
                preds_stats.add(output_seg, target_seg)

                current_target_idx = 0
                # Each batch owns its list. Indexing article_stats with `i`
                # would hit an earlier folder's entry, since `i` restarts at 0
                # for every folder while article_stats keeps growing.
                batch_article_lengths = []
                article_stats.append(batch_article_lengths)
                for k, t in enumerate(targets):
                    document_sentence_count = len(t)
                    batch_article_lengths.append(document_sentence_count)
                    sentences_length = [s.size()[0] for s in data[k]
                                        ] if args.calc_word else None
                    to_idx = int(current_target_idx + document_sentence_count)

                    h = output_seg[current_target_idx:to_idx]

                    # hypothesis and targets are missing classification of
                    # last sentence, and therefore we will add 1 for both
                    h = np.append(h, [1])
                    t = np.append(t.cpu().numpy(), [1])

                    acc.update(h, t, sentences_length=sentences_length)

                    current_target_idx = to_idx

                # Explicit float keeps this a true division on Python 2 too.
                batch_accuracy = batch_accurate / float(len(target_seg))
                logger.debug('Batch %s - error %7.4f, Accuracy: %7.4f', i,
                             batch_loss, batch_accuracy)
                pbar.set_description(
                    'Testing, Accuracy={:.4}'.format(batch_accuracy))

        average_loss = total_loss / float(len(dl))
        average_accuracy = total_accurate / float(total_count)
        calculated_pk, _ = acc.calc_accuracy()

        logger.info('Finished testing.')
        logger.info('Average loss: %s', average_loss)
        logger.info('Average accuracy: %s', average_accuracy)
        logger.info('Pk: {:.4}.'.format(calculated_pk))
        logger.info('F1: {:.4}.'.format(preds_stats.get_f1()))

    export = _export_probabilities(probs_stats, article_stats, worksheet)

    # Save dataset as pickle
    with open('LSTM_probs.pkl', 'wb') as f:
        pkl.dump({'probs': export}, f, pkl.HIGHEST_PROTOCOL)

    workbook.close()

    # NOTE(review): the object key ends in .jsonl but the payload is a pickle.
    key = myfolder + '/testing/softmax_probs.jsonl'
    mybucket.Object(key).upload_file('LSTM_probs.pkl')
    key = myfolder + '/testing/output.xlsx'
    mybucket.Object(key).upload_file('output.xlsx')

    end = timer()
    print('Seconds to execute to whole flow: ' + str(end - start))
import matplotlib.pyplot as plt
#import mpl_toolkits.axisartist as axisartist

# Typography settings for the figures.
font_size = 8
fig_width = 3.0
font = dict(family='serif', serif=['Times'], size=font_size)

# Render text through LaTeX with a serif face; siunitx is loaded in the preamble.
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plt.rcParams['text.latex.preamble'] = r'\usepackage{siunitx}'
#matplotlib.rc('font', **font)

speed_data = speed.Speed()
accuracy_data = accuracy.Accuracy()

# Empty result tables, filled in by later code.
speed_averages, speed_per_mat_element = {}, {}
accuracy_averages, accuracy_per_mat_element = {}, {}

# Row count associated with each series key.
series_to_rowcount = {
    1000.0: 3000.0,
    2000.0: 5400.0,
    3000.0: 8526.0,
    4000.0: 12288.0,
    6000.0: 18468.0,
    8000.0: 24000.0,
    12000.0: 33396.0,
    16000.0: 50700.0,
}
# Run accuracy.Accuracy over every entry of openDir and write one
# "<name>: <ans / len(inputRead)>" line per entry to evalFile.
fileList = os.listdir(openDir)
# NOTE(review): `eval` shadows the builtin; the name is kept because code
# outside this section may still read it.
eval = []
totalLen = len(fileList)
count = 0  # stays 0 when the directory is empty
for count, item in enumerate(fileList, 1):
    inFiles = os.path.join(openDir, item)
    outName = "".join(
        ["output", str(shouldNormalize), str(similarityType), "-", str(item)])
    outFiles = os.path.join(outDir, outName)

    ac1 = accuracy.Accuracy(iF=inFiles,
                            vF=vectorFile,
                            sN=shouldNormalize,
                            sT=similarityType,
                            oF=outFiles)
    vectorDict = ac1.vectorIn()
    inputRead = ac1.inputIn()
    finalVector = ac1.processVector(vectorDict=vectorDict, inputRead=inputRead)
    ans = ac1.finalAns(vectorDict=vectorDict,
                       inputRead=inputRead,
                       finalVector=finalVector)

    # Label the result with everything before the first dot of the entry name.
    fileName = str(item).split(".", 1)[0]
    eval.append("".join([fileName, ": ", str(ans / len(inputRead)), "\n"]))

    progress = str(count) + "/" + str(totalLen)
    print(fileName + " " + progress)

finalEval = "".join(eval)
with open(evalFile, "w") as eva:
    eva.write(finalEval)