Example #1
0
    def get_segs_for_file(self, audio_file, sample_rate):
        """Return the segments for ``audio_file`` read from a matching seg file.

        A seg file matches when the extension-less base name of
        ``audio_file`` is a substring of its entry in ``self.seg_files``.
        The first match is used; a warning is logged when several match.

        Args:
            audio_file: Path of the audio file to look up.
            sample_rate: Forwarded unchanged to ``get_eaf_segs``.

        Returns:
            Whatever ``get_eaf_segs`` returns for the chosen seg file, or an
            empty list when nothing matches (a warning is logged).
        """
        base_name = strip_ext(os.path.basename(audio_file))
        possible_files = [x for x in self.seg_files if base_name in x]

        if not possible_files:
            # The placeholder used to be "% in", which Python parses as the
            # integer conversion "% i" and which raised TypeError for a
            # string path instead of emitting this warning.
            get_logger().log(
                logging.WARNING,
                "No seg file found for %s in %s" % (audio_file, self.root_dir))
            return []

        seg_file = os.path.join(self.root_dir, possible_files[0])

        if len(possible_files) > 1:
            get_logger().log(
                logging.WARNING,
                "Found multiple matches for %s (%s). Using %s" %
                (audio_file, " ".join(possible_files), seg_file))

        return get_eaf_segs(seg_file, sample_rate, base_name, self.add_labels)
Example #2
0
    def run(self, filepath):
        """Extract features for one transcript into a CSV and emit its path.

        The output path is derived from ``filepath`` with a ``.csv``
        extension. Extraction only happens when ``file_utils.should_run``
        says so; the output path is emitted in either case.

        Args:
            filepath: Path of the plaintext transcript to process.
        """
        self.log(logging.INFO, "Starting %s" % filepath)

        out_file = self.derive_new_file_path(filepath, ".csv")

        if not file_utils.should_run(filepath, out_file):
            # Nothing to compute; still pass the output path downstream.
            self.emit(out_file)
            return

        main_transcript = transcript.PlaintextTranscript(
            filepath=filepath,
            label=None,
            pos_tagger_path=self.pos_tagger_path)

        # Tokens of the companion filler transcript, when one is registered
        # under the same extension-less base name.
        filler_tokens = None
        if self.filler_dir:
            file_id = os.path.basename(file_utils.strip_ext(filepath))
            if file_id in self.filler_files:
                filler_path = os.path.join(self.filler_dir,
                                           self.filler_files[file_id])
                filler_tokens = transcript.PlaintextTranscript(
                    filepath=filler_path,
                    label=None,
                    pos_tagger_path=self.pos_tagger_path).tokens

        self.feature_extractor.extract(
            main_transcript,
            out_csv=out_file,
            transcript_utterances_fillers=filler_tokens)

        self.log(logging.INFO, "Done %s -> %s" % (filepath, out_file))

        self.emit(out_file)
Example #3
0
    def derive_new_file_path(self, old_file, new_ext=None):
        """Map ``old_file`` to a path with the same file name in ``self.out_dir``.

        Args:
            old_file: Source path; only its base name is kept.
            new_ext: Optional replacement extension. A leading dot is added
                when missing. ``None`` keeps the original file name as is.

        Returns:
            ``self.out_dir`` joined with the (possibly re-suffixed) file name.
        """
        fname = os.path.basename(old_file)

        if new_ext is None:
            return os.path.join(self.out_dir, fname)

        suffix = new_ext if new_ext.startswith(".") else "." + new_ext
        return os.path.join(self.out_dir, file_utils.strip_ext(fname) + suffix)
Example #4
0
    def setup(self, cfg_file, utterance_sep=" . ", filler_dir=None):
        """Build the feature extractor and index the optional filler files.

        Sets ``self.output_csv``, ``self.filler_dir``,
        ``self.pos_tagger_path``, ``self.feature_extractor`` and
        ``self.filler_files``.

        Args:
            cfg_file: Passed to ``load_conf``, which yields the feature lists
                and on/off switches for the lexical, pragmatic, semantic and
                syntactic groups.
            utterance_sep: Utterance separator forwarded to the extractor.
            filler_dir: Optional directory of filler transcripts. When falsy,
                no directory is scanned and ``self.filler_files`` is empty.
        """
        self.output_csv = os.path.join(
            self.out_dir,
            'textfeatures%s.csv' % datetime.datetime.now().strftime("%Y%m%d"))

        (lexical_list, do_lexical, pragmatic_list, do_pragmatic,
         semantic_list, do_semantic, syntactic_list,
         do_syntactic) = load_conf(cfg_file)

        self.filler_dir = filler_dir
        self.pos_tagger_path = config.stanford_pos_path

        self.feature_extractor = feature.FeatureExtractor(
            utterance_sep=utterance_sep,
            path_output_lu_parses=os.path.join(self.out_dir, "lu_parses"),
            path_output_parses=os.path.join(self.out_dir, "stanford_parses"),
            parser_path=config.stanford_parser_path,
            cfg_rules_path=config.cfg_rules_path,
            pos_tagger_path=self.pos_tagger_path,
            path_to_dictionary=config.path_to_dictionary,
            lu_analyzer_path=config.lu_analyzer_path,
            path_to_freq_norms=config.path_to_freq_norms,
            path_to_image_norms=config.path_to_image_norms,
            path_to_anew=config.path_to_anew,
            path_to_warringer=config.path_to_warringer,
            do_wnic=True,
            path_to_mpqa_lexicon=config.path_to_mpqa_lexicon,
            path_to_rst_python=config.path_to_rst_python,
            path_to_rst=config.path_to_rst,
            path_output_rst=os.path.join(self.out_dir, "rst_output"),
            path_to_stanford_cp=config.path_to_stanford_cp,
            path_to_lda_model=config.path_to_lda_model,
            path_to_lda_wordids=config.path_to_lda_wordids,
            do_lexical=do_lexical,
            do_syntactic=do_syntactic,
            do_semantic=do_semantic,
            do_pragmatic=do_pragmatic,
            lexical_list=lexical_list,
            syntactic_list=syntactic_list,
            semantic_list=semantic_list,
            pragmatic_list=pragmatic_list)

        # Only scan when a directory was actually given: with the default
        # filler_dir=None, os.listdir would list the current working
        # directory on Python 3 (and raise TypeError on Python 2).
        if filler_dir:
            # Map extension-less base name -> file name inside filler_dir.
            self.filler_files = {
                os.path.basename(file_utils.strip_ext(x)): x
                for x in os.listdir(filler_dir)
            }
        else:
            self.filler_files = {}