def __init__(self, en_dir, img_path, jp_dir, img_original_dir=None,
             img_correspondence_path=None, jp_original_dir=None,
             compress_word_dim=100, compress_img_dim=100, line_flag=False):
    """Configure logging and build the feature loaders and CCA models.

    Args:
        en_dir: Passed to ``txt.TextFeatures`` for the English features
            (created with ``feature_name='eng'``).
        img_path: Passed to ``img.ImageFeatures`` as its first argument.
        jp_dir: Passed to ``txt.TextFeatures`` for the Japanese features.
        img_original_dir: Forwarded to ``img.ImageFeatures``.
        img_correspondence_path: Forwarded to ``img.ImageFeatures``.
        jp_original_dir: Forwarded to the Japanese ``txt.TextFeatures``.
        compress_word_dim: ``compress_dim`` used for both text feature sets.
        compress_img_dim: Forwarded to ``img.ImageFeatures``.
        line_flag: Stored on the instance as ``self.line_flag`` and logged.
    """
    # Logging: the logger is named after this module's ``__name__``.
    logger_name = os.path.basename(__name__)
    self.logger = logging.getLogger(logger_name)
    log_format = '%(asctime)s : %(name)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=log_format)

    # Feature sources: English text, Japanese text, then images.
    self.english_feature = txt.TextFeatures(
        en_dir,
        compress_dim=compress_word_dim,
        feature_name='eng',
    )
    self.japanese_feature = txt.TextFeatures(
        jp_dir,
        jp_original_dir,
        compress_dim=compress_word_dim,
    )
    self.image_feature = img.ImageFeatures(
        img_path,
        img_original_dir,
        img_correspondence_path,
        compress_img_dim,
    )

    # Correlation models.
    self.gcca = gcca.GCCA()
    self.cca = gcca.CCA()

    self.line_flag = line_flag

    self.logger.info("===== initializing =====")
    self.logger.info("line_flag:%s", self.line_flag)

    self.__prep_dir()
def get_trigram_features(self):
    """Build a binary top-trigram indicator row for every phrase.

    The list of top trigrams is unpickled from a fixed corpus path; the
    first element of each entry is used as the trigram key. Every phrase
    of ``self.s.alignment`` is lower-cased, split into 3-grams with
    ``text_features.TextFeatures(...).get_n_grams(3)``, and turned into a
    dict mapping each top trigram to 1 if it occurs in the phrase, else 0.
    Each dict is flattened with ``hash_to_list`` (defined elsewhere).

    Returns:
        ``np.array`` holding one flattened indicator row per phrase.
    """
    # NOTE: pickle can execute arbitrary code on load; this path must
    # remain a trusted local file.
    with open('/data/corpora/cspan/top_trigrams.pkl', 'rb') as f:
        trigrams = pickle.load(f)

    phrases = [t.lower() for t in self.s.alignment.get_phrase_text()]
    keys = [entry[0] for entry in trigrams]

    feats = []
    for phrase in phrases:
        # Fresh all-zero indicator dict per phrase, same key order as `keys`.
        indicators = dict.fromkeys(keys, 0)
        for gram in text_features.TextFeatures(phrase).get_n_grams(3):
            # Dict membership is O(1); scanning the `keys` list was
            # O(len(keys)) for every n-gram of every phrase.
            if gram in indicators:
                indicators[gram] = 1
        feats.append(hash_to_list(indicators))
    return np.array(feats)
# Make the project's core package importable before importing from it.
sys.path.insert(0, '../core')
import speech
import text_features

# For each n-gram order: `*_vocab` maps an n-gram to the index at which it
# was first seen, `*_counts` maps it to its number of occurrences.
bigram_vocab = {}
bigram_counts = {}
trigram_vocab = {}
trigram_counts = {}
bigram_index = 0
trigram_index = 0

# Pickle data is binary: open with 'rb' so loading works on Python 3 and is
# not subject to newline translation. The file is closed as soon as the
# speeches are loaded instead of staying open for the whole corpus pass.
# NOTE: pickle can execute arbitrary code on load; keep this a trusted file.
with open('/data/jrgillick/speeches.pkl', 'rb') as f:
    speeches = pickle.load(f)

for s in tqdm(speeches):
    phrases = s.alignment.get_phrase_text()
    for phrase in phrases:
        tf = text_features.TextFeatures(phrase)
        bigrams = tf.get_n_grams(2)
        trigrams = tf.get_n_grams(3)

        for bigram in bigrams:
            if bigram not in bigram_vocab:
                bigram_vocab[bigram] = bigram_index
                bigram_counts[bigram] = 1
                bigram_index += 1
            else:
                bigram_counts[bigram] += 1

        # NOTE(review): the original trigram branch never advanced
        # `trigram_index` nor counted repeated trigrams; it now mirrors the
        # bigram branch above - confirm this matches the intended behavior.
        for trigram in trigrams:
            if trigram not in trigram_vocab:
                trigram_vocab[trigram] = trigram_index
                trigram_counts[trigram] = 1
                trigram_index += 1
            else:
                trigram_counts[trigram] += 1
def get_euphony_features(self):
    """Return one flattened euphony feature entry per phrase.

    Every phrase of ``self.s.alignment`` is wrapped in a
    ``text_features.TextFeatures``; the result of its
    ``get_euphony_features()`` is flattened with ``hash_to_list``
    (defined elsewhere).

    Returns:
        A list with one ``hash_to_list`` result per phrase, in phrase order.
    """
    # Phase 1: build a feature extractor for every phrase.
    extractors = []
    for phrase in self.s.alignment.get_phrase_text():
        extractors.append(text_features.TextFeatures(phrase))

    # Phase 2: compute and flatten the euphony features.
    euphony_rows = []
    for extractor in extractors:
        euphony = extractor.get_euphony_features()
        euphony_rows.append(hash_to_list(euphony))
    return euphony_rows
def get_name_feature_names(self):
    """Return the sorted keys of the name features of the first phrase.

    The first phrase of ``self.s.alignment`` is wrapped in a
    ``text_features.TextFeatures`` and the keys of its
    ``get_name_features()`` result are returned in sorted order.

    Returns:
        A new sorted list of the feature keys.

    Raises:
        IndexError: If the alignment yields no phrases (``text[0]``).
    """
    text = self.s.alignment.get_phrase_text()
    name_features = text_features.TextFeatures(text[0]).get_name_features()
    # sorted() works on both a Python 2 key list and a Python 3 key view;
    # calling .sort() on the result of keys() fails on Python 3.
    return sorted(name_features.keys())
def get_mean_vector_features(self):
    """Return one ``get_mean_vector`` result per phrase of the alignment.

    For every phrase, a ``text_features.TextFeatures`` is built from the
    unicode conversion of the whole phrase list and asked for the mean
    vector of that phrase.

    Returns:
        A list of ``get_mean_vector(phrase)`` results, in phrase order.
    """
    phrases = self.s.alignment.get_phrase_text()
    mean_vectors = []
    for phrase in phrases:
        # NOTE(review): the extractor is built from unicode() of the entire
        # phrase list (not the single phrase) and rebuilt on each iteration;
        # looks unintended - confirm against TextFeatures.get_mean_vector.
        extractor = text_features.TextFeatures(unicode(phrases))
        mean_vectors.append(extractor.get_mean_vector(phrase))
    return mean_vectors
def get_name_features(self):
    """Return one flattened name feature entry per lower-cased phrase.

    Every phrase of ``self.s.alignment`` is lower-cased and wrapped in a
    ``text_features.TextFeatures``; the result of its
    ``get_name_features()`` is flattened with ``hash_to_list``
    (defined elsewhere).

    Returns:
        A list with one ``hash_to_list`` result per phrase, in phrase order.
    """
    # Phase 1: normalise case for all phrases.
    lowered = []
    for phrase in self.s.alignment.get_phrase_text():
        lowered.append(phrase.lower())

    # Phase 2: build a feature extractor for every phrase.
    extractors = []
    for phrase in lowered:
        extractors.append(text_features.TextFeatures(phrase))

    # Phase 3: compute and flatten the name features.
    name_rows = []
    for extractor in extractors:
        names = extractor.get_name_features()
        name_rows.append(hash_to_list(names))
    return name_rows