Example #1
    def predict_text(self, text):
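        """Predict punctuation for the given text and print the tokens
        with the predicted punctuation marks inserted."""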
        input_text = InputText(text)

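        # look up a word vector for every non-punctuation token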
        for token in input_text.gold_tokens:
            if not token.is_punctuation():
                token.word_vec = self.word2vec.get_vector(token.word.lower())

        sliding_window = SlidingWindow()
        instances = sliding_window.list_windows(input_text)

        punctuations = []
        for instance in instances:
            probs = self.predict_caffe(instance)
            punctuations.append(numpy.argmax(probs))

        print(">>> Sentence with boundaries:")
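        # insert predicted punctuation from right to left so earlier
        # insertion indices remain valid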
        for i in range(len(punctuations) - 1, -1, -1):
            input_text.gold_tokens.insert(i + PUNCTUATION_POS, classes_as_string[punctuations[i]])
        print("{", end=" ")
        for t in input_text.gold_tokens:
            print(t, end=" ")
        print("}")
    def generate(self, parsers, database, is_test):
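        """Create training/test instances from parsed talks and store them
        in LevelDB, mirroring each instance to a plain-text file. Pitch and
        energy features are parsed and normalized per talk."""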
        level_db = LevelDBCreator(database)
        window_slider = SlidingWindow()

        nr_instances = 0

        if is_test:
            plain_text_instances_file = open(
                database + "/../test_instances.txt", "w", encoding="utf8")
        else:
            plain_text_instances_file = open(
                database + "/../train_instances.txt", "w", encoding="utf8")

        for talk_parser in parsers:
            talks = talk_parser.parse()

            prev_progress = 0
            print("")
            print("Processing file %s ..." % talk_parser.get_file_name())

            for talk in talks:
                progress = int(talk_parser.progress() * 100)
                if progress > prev_progress:
                    sys.stdout.write(str(progress) + "% ")
                    sys.stdout.flush()
                    prev_progress = progress

                talk.build_interval_tree()
                base_dir = os.path.dirname(talk_parser.get_file_name())

                # get pitch feature values
                pitch_level_file = os.path.join(
                    base_dir, talk.group_name + "_talkid" + str(talk.talk_id) + ".pitch")
                talk.parse_pitch_feature(pitch_level_file)

                # get energy feature values
                energy_level_file = os.path.join(
                    base_dir, talk.group_name + "_talkid" + str(talk.talk_id) + ".energy")
                talk.parse_energy_feature(energy_level_file)

                # normalize features
                talk.normalize()

                # get the training instances
                training_instances = window_slider.list_windows(talk)

                # write training instances to level db
                for training_instance in training_instances:
                    nr_instances += 1

                    # write the instance to the plain-text file,
                    # separated by a blank line
                    plain_text_instances_file.write(str(training_instance) + "\n\n")

                    # write to level db
                    level_db.write_training_instance(training_instance)

        plain_text_instances_file.close()
    def predict(self, input_audio):
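        """Return punctuation class probabilities for every sliding
        window over the given audio input."""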
        sliding_window = SlidingWindow()
        instances = sliding_window.list_windows(input_audio)

        # get caffe predictions
        punctuation_probs = []
        for instance in instances:
            probs = self._predict_caffe(instance)
            punctuation_probs.extend(numpy.copy(probs))

        return punctuation_probs
    def predict(self, input_text):
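        """Return punctuation class probabilities for every sliding
        window over the given text."""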
        for token in input_text.tokens:
            if not token.is_punctuation():
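                # without a loaded word2vec model, fall back to a random
                # 300-dimensional vector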
                if not self.word2vec:
                    token.word_vec = numpy.random.rand(300)
                else:
                    token.word_vec = self.word2vec.get_vector(token.word.lower())

        sliding_window = SlidingWindow()
        instances = sliding_window.list_windows(input_text)

        # get caffe predictions
        punctuation_probs = []
        for instance in instances:
            probs = self._predict_caffe(instance)
            punctuation_probs.extend(numpy.copy(probs))

        return punctuation_probs
    def generate(self, parsers, database, is_test):
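        """Create training/test instances from parsed texts and store them
        in LevelDB, optionally skipping instances of over-represented
        classes to keep the label distribution close to uniform."""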
        level_db = LevelDBCreator(database)
        window_slider = SlidingWindow()
        # count how often each type (COMMA, PERIOD etc.) is in the instances
        class_distribution = dict()

        nr_instances = 0
        nr_instances_used = 0
        label_nr = len(Punctuation)
        if not self.USE_QUESTION_MARK:
            label_nr -= 1
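        # share each class would have under a perfectly uniform distribution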
        perfect_distribution = 1.0 / label_nr

        if is_test:
            plain_text_instances_file = open(
                database + "/../test_instances.txt", "w", encoding="utf8")
        else:
            plain_text_instances_file = open(
                database + "/../train_instances.txt", "w", encoding="utf8")

        for text_parser in parsers:
            texts = text_parser.parse()

            prev_progress = 0
            print("")
            print("Processing file %s ..." % text_parser.get_file_name())

            # debug dump of token/label pairs, one pair per line
            lineparsing_file = open("lineparsing", "w")
            for text in texts:
                progress = int(text_parser.progress() * 100)
                if progress > prev_progress:
                    sys.stdout.write(str(progress) + "% ")
                    sys.stdout.flush()
                    prev_progress = progress

                for sentence in text.sentences:
                    tokens = sentence.get_tokens()
                    # get the word vectors for all tokens in the sentence
                    for i, token in enumerate(tokens):
                        if not token.is_punctuation():
                            # label each word with the punctuation that
                            # follows it; the last token implicitly gets
                            # a PERIOD
                            if i == len(tokens) - 1:
                                punctuation_string = "PERIOD"
                            else:
                                next_token = tokens[i + 1]
                                if next_token.is_punctuation():
                                    # strip the "Punctuation." enum prefix
                                    punctuation_string = str(next_token.punctuation_type)[12:]
                                else:
                                    punctuation_string = "O"
                            lineparsing_file.write(token.word.lower() + "\t" + punctuation_string + "\n")
                            token.word_vec = self.word2vec.get_vector(token.word.lower())

                # get the training instances
                training_instances = window_slider.list_windows(text)

                # write training instances to level db
                for training_instance in training_instances:
                    nr_instances += 1
                    # how far this label's share deviates from a uniform
                    # class distribution
                    class_variation = (class_distribution.get(training_instance.label, 0)
                                       / float(max(nr_instances_used, 1))) - perfect_distribution

                    if (is_test or not self.CLASS_DISTRIBUTION_NORMALIZATION
                            or class_variation <= self.CLASS_DISTRIBUTION_VARIATION):
                        # write the instance to the plain-text file,
                        # separated by a blank line
                        plain_text_instances_file.write(str(training_instance) + "\n\n")

                        # adapt class distribution
                        nr_instances_used += 1
                        class_distribution[training_instance.label] = (
                            class_distribution.get(training_instance.label, 0) + 1)

                        # write to level db
                        level_db.write_training_instance(training_instance)
            lineparsing_file.close()

        plain_text_instances_file.close()
        print("")

        print("Originally " + str(nr_instances) + " instances.")
        print("Created " + str(nr_instances_used) + " instances.")
        print("Class distribution:")
        print(class_distribution)
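
A minimal usage sketch (not part of the original example; the class name
"Demo" and its setup are assumptions for illustration):

    # Hypothetical driver: assumes the class above was instantiated with a
    # loaded word2vec model and a trained Caffe net behind predict_caffe().
    demo = Demo()
    demo.predict_text("this is a test sentence without any punctuation")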