def get_segs_for_file(self, audio_file, sample_rate):
    """Look up the segmentation file matching ``audio_file`` and load it.

    A segmentation file matches when the audio file's extension-less base
    name occurs anywhere in its name (substring match against
    ``self.seg_files``). When several files match, the first one is used
    and a warning is logged.

    Args:
        audio_file: Path of the audio file to find segments for.
        sample_rate: Passed through unchanged to ``get_eaf_segs``.

    Returns:
        Whatever ``get_eaf_segs`` returns for the chosen file, or an empty
        list when no segmentation file matches.
    """
    base_name = strip_ext(os.path.basename(audio_file))
    possible_files = [x for x in self.seg_files if base_name in x]

    if not possible_files:
        # The original format string was "% in %s"; "% i" is parsed as an
        # integer conversion and raised TypeError instead of logging.
        get_logger().log(
            logging.WARNING,
            "No seg file found for %s in %s" % (audio_file, self.root_dir))
        return []

    seg_file = os.path.join(self.root_dir, possible_files[0])
    if len(possible_files) > 1:
        get_logger().log(
            logging.WARNING,
            "Found multiple matches for %s (%s). Using %s"
            % (audio_file, " ".join(possible_files), seg_file))

    return get_eaf_segs(seg_file, sample_rate, base_name, self.add_labels)
def run(self, filepath):
    """Extract text features for one transcript file and emit the CSV path.

    The output path is ``filepath`` re-rooted via ``derive_new_file_path``
    with a ``.csv`` extension. Extraction only happens when
    ``file_utils.should_run`` says so. If a filler directory is configured
    and contains a file with the same extension-less base name, its tokens
    are handed to the extractor as ``transcript_utterances_fillers``.

    Args:
        filepath: Path of the plaintext transcript to process.
    """
    self.log(logging.INFO, "Starting %s" % (filepath))
    out_file = self.derive_new_file_path(filepath, ".csv")

    if file_utils.should_run(filepath, out_file):
        main_transcript = transcript.PlaintextTranscript(
            filepath=filepath,
            label=None,
            pos_tagger_path=self.pos_tagger_path)

        # Stays None unless a matching filler transcript is found.
        filler_tokens = None
        if self.filler_dir:
            file_id = os.path.basename(file_utils.strip_ext(filepath))
            if file_id in self.filler_files:
                filler_path = os.path.join(
                    self.filler_dir, self.filler_files[file_id])
                filler_tokens = transcript.PlaintextTranscript(
                    filepath=filler_path,
                    label=None,
                    pos_tagger_path=self.pos_tagger_path).tokens

        self.feature_extractor.extract(
            main_transcript,
            out_csv=out_file,
            transcript_utterances_fillers=filler_tokens)

    # NOTE(review): the original indentation was lost; this assumes the
    # completion log and emit run whether or not extraction was skipped.
    self.log(logging.INFO, "Done %s -> %s" % (filepath, out_file))
    self.emit(out_file)
def derive_new_file_path(self, old_file, new_ext=None):
    """Return the path in ``self.out_dir`` corresponding to ``old_file``.

    Only the base name of ``old_file`` is kept. When ``new_ext`` is given,
    the existing extension is stripped with ``file_utils.strip_ext`` and
    replaced by ``new_ext`` (a leading dot is added if missing).

    Args:
        old_file: Path of the input file.
        new_ext: Replacement extension, with or without the leading dot.
            ``None`` keeps the original file name unchanged.

    Returns:
        The output path joined onto ``self.out_dir``.
    """
    file_name = os.path.basename(old_file)
    if new_ext is None:
        return os.path.join(self.out_dir, file_name)

    suffix = new_ext if new_ext.startswith(".") else "." + new_ext
    return os.path.join(self.out_dir, file_utils.strip_ext(file_name) + suffix)
def setup(self, cfg_file, utterance_sep=" . ", filler_dir=None):
    """Configure the feature extractor and index optional filler files.

    Sets ``self.output_csv``, ``self.filler_dir``, ``self.pos_tagger_path``,
    ``self.feature_extractor`` and ``self.filler_files``.

    Args:
        cfg_file: Configuration file passed to ``load_conf`` to obtain the
            per-category feature lists and enable flags.
        utterance_sep: Utterance separator forwarded to the extractor.
        filler_dir: Optional directory of filler transcripts. When falsy,
            no directory is scanned and ``self.filler_files`` is empty.
    """
    path_output_parses = os.path.join(self.out_dir, "stanford_parses")
    path_output_lu_parses = os.path.join(self.out_dir, "lu_parses")
    path_output_rst = os.path.join(self.out_dir, "rst_output")

    # Output CSV name carries today's date (YYYYMMDD).
    self.output_csv = os.path.join(
        self.out_dir,
        "textfeatures%s.csv" % datetime.datetime.now().strftime("%Y%m%d"))

    do_wnic = True
    (lexical_list, do_lexical,
     pragmatic_list, do_pragmatic,
     semantic_list, do_semantic,
     syntactic_list, do_syntactic) = load_conf(cfg_file)

    self.filler_dir = filler_dir
    self.pos_tagger_path = config.stanford_pos_path

    self.feature_extractor = feature.FeatureExtractor(
        utterance_sep=utterance_sep,
        path_output_lu_parses=path_output_lu_parses,
        path_output_parses=path_output_parses,
        parser_path=config.stanford_parser_path,
        cfg_rules_path=config.cfg_rules_path,
        pos_tagger_path=self.pos_tagger_path,
        path_to_dictionary=config.path_to_dictionary,
        lu_analyzer_path=config.lu_analyzer_path,
        path_to_freq_norms=config.path_to_freq_norms,
        path_to_image_norms=config.path_to_image_norms,
        path_to_anew=config.path_to_anew,
        path_to_warringer=config.path_to_warringer,
        do_wnic=do_wnic,
        path_to_mpqa_lexicon=config.path_to_mpqa_lexicon,
        path_to_rst_python=config.path_to_rst_python,
        path_to_rst=config.path_to_rst,
        path_output_rst=path_output_rst,
        path_to_stanford_cp=config.path_to_stanford_cp,
        path_to_lda_model=config.path_to_lda_model,
        path_to_lda_wordids=config.path_to_lda_wordids,
        do_lexical=do_lexical,
        do_syntactic=do_syntactic,
        do_semantic=do_semantic,
        do_pragmatic=do_pragmatic,
        lexical_list=lexical_list,
        syntactic_list=syntactic_list,
        semantic_list=semantic_list,
        pragmatic_list=pragmatic_list)

    # Map extension-less base name -> file name for each filler transcript.
    # Only scan when a directory was given: os.listdir(None) would silently
    # list the current working directory instead.
    if filler_dir:
        self.filler_files = {
            os.path.basename(file_utils.strip_ext(x)): x
            for x in os.listdir(filler_dir)
        }
    else:
        self.filler_files = {}