def predict_text(self, text): input_text = InputText(text) for token in input_text.gold_tokens: if not token.is_punctuation(): token.word_vec = self.word2vec.get_vector(token.word.lower()) slidingWindow = SlidingWindow() instances = slidingWindow.list_windows(input_text) punctuations = [] for instance in instances: probs = self.predict_caffe(instance) #print instance #self.show_probs(probs) punctuations.append(numpy.argmax(probs)) #print punctuations print(">>> Sentence with boundaries:") for i in range(len(punctuations) - 1, -1, -1): input_text.gold_tokens.insert(i + PUNCTUATION_POS, classes_as_string[punctuations[i]]) print "{", for t in input_text.gold_tokens: print t, print "}"
def generate(self, parsers, database, is_test):
    """Create a LevelDB of acoustic training instances from talk parsers.

    Every instance is written both to the LevelDB at *database* and to a
    plain-text sibling file (test_instances.txt when *is_test*, else
    train_instances.txt) for inspection.
    """
    level_db = LevelDBCreator(database)
    window_slider = SlidingWindow()
    nr_instances = 0

    if is_test:
        txt_name = "test_instances.txt"
    else:
        txt_name = "train_instances.txt"
    # `with` guarantees the plain-text dump is closed even when a parser
    # raises half-way through (the original leaked the handle on error).
    with open(database + "/../" + txt_name, "w") as plain_text_instances_file:
        for talk_parser in parsers:
            talks = talk_parser.parse()
            prev_progress = 0
            print("")
            print("Processing file %s ..." % talk_parser.get_file_name())
            for talk in talks:
                # Coarse textual progress indicator (whole percents only).
                progress = int(talk_parser.progress() * 100)
                if progress > prev_progress:
                    sys.stdout.write(str(progress) + "% ")
                    sys.stdout.flush()
                    prev_progress = progress

                talk.build_interval_tree()
                base_dir = os.path.dirname(talk_parser.get_file_name())

                # Pitch feature values live next to the transcript file.
                pitch_level_file = base_dir + "/" + talk.group_name + \
                    "_talkid" + str(talk.talk_id) + ".pitch"
                talk.parse_pitch_feature(pitch_level_file)

                # Energy feature values follow the same naming scheme.
                energy_level_file = base_dir + "/" + talk.group_name + \
                    "_talkid" + str(talk.talk_id) + ".energy"
                talk.parse_energy_feature(energy_level_file)

                # Normalize features before windowing.
                talk.normalize()

                # Slice the talk into training windows.
                training_instances = window_slider.list_windows(talk)

                # Persist each instance: plain text first, then LevelDB.
                for training_instance in training_instances:
                    nr_instances += 1
                    s = unicode(training_instance) + "\n"
                    s += "\n"
                    plain_text_instances_file.write(s.encode('utf8'))
                    level_db.write_training_instance(training_instance)
def generate(self, parsers, database, is_test):
    """Build the LevelDB training set from the given talk *parsers*.

    Each windowed instance goes to the LevelDB at *database* and, for
    debugging, to ../test_instances.txt or ../train_instances.txt
    depending on *is_test*.
    """
    level_db = LevelDBCreator(database)
    window_slider = SlidingWindow()
    nr_instances = 0

    file_name = "test_instances.txt" if is_test else "train_instances.txt"
    # Context manager closes the dump file on every exit path; the
    # original version leaked the handle if parsing raised.
    with open(database + "/../" + file_name, "w") as plain_text_instances_file:
        for talk_parser in parsers:
            talks = talk_parser.parse()
            prev_progress = 0
            print("")
            print("Processing file %s ..." % talk_parser.get_file_name())
            for talk in talks:
                # Print progress only when the integer percentage advances.
                progress = int(talk_parser.progress() * 100)
                if progress > prev_progress:
                    sys.stdout.write(str(progress) + "% ")
                    sys.stdout.flush()
                    prev_progress = progress

                talk.build_interval_tree()
                base_dir = os.path.dirname(talk_parser.get_file_name())

                # Sibling feature files share the talk's naming scheme.
                prefix = base_dir + "/" + talk.group_name + \
                    "_talkid" + str(talk.talk_id)
                # get pitch feature values
                talk.parse_pitch_feature(prefix + ".pitch")
                # get energy feature values
                talk.parse_energy_feature(prefix + ".energy")

                # normalize features
                talk.normalize()

                # get the training instances
                training_instances = window_slider.list_windows(talk)

                # write training instances to level db (and to the dump)
                for training_instance in training_instances:
                    nr_instances += 1
                    s = unicode(training_instance) + "\n"
                    s += "\n"
                    plain_text_instances_file.write(s.encode('utf8'))
                    level_db.write_training_instance(training_instance)
def predict(self, input_audio):
    """Return the punctuation probability rows for every window of
    *input_audio*, in window order."""
    windows = SlidingWindow().list_windows(input_audio)

    punctuation_probs = []
    for window in windows:
        row = self._predict_caffe(window)
        # Copy so a later net invocation cannot mutate stored rows.
        punctuation_probs.extend(numpy.copy(row))
    return punctuation_probs
def predict(self, input_text):
    """Return punctuation probabilities for every window of *input_text*."""
    # Give each word token an embedding; without a word2vec model fall
    # back to a random 300-dimensional vector.
    for token in input_text.tokens:
        if token.is_punctuation():
            continue
        if self.word2vec:
            token.word_vec = self.word2vec.get_vector(token.word.lower())
        else:
            token.word_vec = numpy.random.rand(300)

    window_source = SlidingWindow()

    # Collect the net's probability rows, one per sliding window.
    punctuation_probs = []
    for window in window_source.list_windows(input_text):
        punctuation_probs.extend(numpy.copy(self._predict_caffe(window)))
    return punctuation_probs
def predict(self, input_text):
    """Predict punctuation probabilities for *input_text*, window by window."""
    for token in input_text.tokens:
        if not token.is_punctuation():
            # Random 300-dim fallback embedding when no word2vec model
            # is loaded; otherwise look the word up (lower-cased).
            token.word_vec = (
                self.word2vec.get_vector(token.word.lower())
                if self.word2vec
                else numpy.random.rand(300))

    slider = SlidingWindow()
    instances = slider.list_windows(input_text)

    # One probability row per window, copied out of the net's buffers.
    punctuation_probs = []
    for instance in instances:
        probs = self._predict_caffe(instance)
        punctuation_probs.extend(numpy.copy(probs))
    return punctuation_probs
def generate(self, parsers, database, is_test):
    """Build a LevelDB of text training instances from *parsers*.

    For training data (not *is_test*), instances may be skipped so the
    per-class share stays within CLASS_DISTRIBUTION_VARIATION of a
    uniform distribution over the punctuation labels.
    """
    level_db = LevelDBCreator(database)
    window_slider = SlidingWindow()

    # count how often each type (COMMA, PERIOD etc.) is in the instances
    class_distribution = dict()
    nr_instances = 0
    nr_instances_used = 0

    label_nr = len(Punctuation)
    if not self.USE_QUESTION_MARK:
        label_nr -= 1
    perfect_distribution = 1.0 / label_nr

    if is_test:
        plain_text_instances_file = open(
            database + "/../test_instances.txt", "w")
    else:
        plain_text_instances_file = open(
            database + "/../train_instances.txt", "w")

    # Debug dump of word/label pairs. Opened ONCE here: the original
    # re-opened it with mode "w" inside the parser loop, truncating all
    # but the last parser's output, and leaked the handle on error.
    token_label_dump = open("lineparsing", "w")
    try:
        for text_parser in parsers:
            texts = text_parser.parse()
            prev_progress = 0
            print("")
            print("Processing file %s ..." % text_parser.get_file_name())
            for text in texts:
                # Coarse progress indicator in whole percents.
                progress = int(text_parser.progress() * 100)
                if progress > prev_progress:
                    sys.stdout.write(str(progress) + "% ")
                    sys.stdout.flush()
                    prev_progress = progress

                for sentence in text.sentences:
                    tokens = sentence.get_tokens()
                    # Attach word vectors and dump each word with the gold
                    # label that follows it. (The original shadowed the
                    # outer parser index with this token index.)
                    for idx, token in enumerate(tokens):
                        if token.is_punctuation():
                            continue
                        if idx == len(tokens) - 1:
                            # Sentence-final word: implicit period.
                            punctuation_string = "PERIOD"
                        else:
                            next_token = tokens[idx + 1]
                            if next_token.is_punctuation():
                                punctuation_string = str(next_token.punctuation_type)
                                # Strip the "Punctuation." enum prefix.
                                punctuation_string = punctuation_string[12:]
                            else:
                                punctuation_string = "O"
                        token_label_dump.write(
                            token.word.lower() + "\t" + punctuation_string + "\n")
                        token.word_vec = self.word2vec.get_vector(
                            token.word.lower())

                # get the training instances
                training_instances = window_slider.list_windows(text)

                # write training instances to level db
                for training_instance in training_instances:
                    nr_instances += 1
                    # How far this label's current share deviates from a
                    # uniform distribution; max(..., 1) avoids 0-division
                    # before the first instance is accepted.
                    class_variation = (
                        class_distribution.get(training_instance.label, 0)
                        / float(max(nr_instances_used, 1))
                    ) - perfect_distribution
                    if is_test or (not self.CLASS_DISTRIBUTION_NORMALIZATION) \
                            or (class_variation <= self.CLASS_DISTRIBUTION_VARIATION):
                        # write instance to file
                        s = unicode(training_instance) + "\n"
                        s += "\n"
                        plain_text_instances_file.write(s.encode('utf8'))
                        # adapt class distribution
                        nr_instances_used += 1
                        class_distribution[training_instance.label] = \
                            class_distribution.get(training_instance.label, 0) + 1
                        # write to level db
                        level_db.write_training_instance(training_instance)
    finally:
        token_label_dump.close()
        plain_text_instances_file.close()

    print("")
    print("Originally " + str(nr_instances) + " instances.")
    print("Created " + str(nr_instances_used) + " instances.")
    print("Class distribution:")
    print(class_distribution)