Exemple #1
0
    def __init__(
            self,
            en_dir,
            img_path,
            jp_dir,
            img_original_dir=None,
            img_correspondence_path=None,
            jp_original_dir=None,
            compress_word_dim=100,
            compress_img_dim=100,
            line_flag=False):
        """Create the logger, the feature extractors and the CCA/GCCA models.

        Args:
            en_dir: Passed to ``txt.TextFeatures`` for the feature set
                labelled ``'eng'``.
            img_path: First argument of ``img.ImageFeatures``.
            jp_dir: First argument of the second ``txt.TextFeatures``.
            img_original_dir: Forwarded to ``img.ImageFeatures``.
            img_correspondence_path: Forwarded to ``img.ImageFeatures``.
            jp_original_dir: Second positional argument of the second
                ``txt.TextFeatures``.
            compress_word_dim: ``compress_dim`` given to both text extractors.
            compress_img_dim: Last positional argument of
                ``img.ImageFeatures``.
            line_flag: Stored as ``self.line_flag`` and logged.
        """
        # Logging: the logger name is derived from this module's __name__.
        logger_name = os.path.basename(__name__)
        self.logger = logging.getLogger(logger_name)
        logging.basicConfig(
            format='%(asctime)s : %(name)s : %(levelname)s : %(message)s')

        # Feature extractors, built in the same order as before:
        # English text, Japanese text, then images.
        self.english_feature = txt.TextFeatures(
            en_dir,
            compress_dim=compress_word_dim,
            feature_name='eng')
        self.japanese_feature = txt.TextFeatures(
            jp_dir,
            jp_original_dir,
            compress_dim=compress_word_dim)
        self.image_feature = img.ImageFeatures(
            img_path,
            img_original_dir,
            img_correspondence_path,
            compress_img_dim)

        # Correlation-analysis models.
        self.gcca = gcca.GCCA()
        self.cca = gcca.CCA()

        self.line_flag = line_flag

        self.logger.info("===== initializing =====")
        self.logger.info("line_flag:%s", self.line_flag)

        # Private directory preparation step (defined elsewhere in the class).
        self.__prep_dir()
Exemple #2
0
 def get_trigram_features(self):
     """Return a trigram-presence matrix with one row per phrase.

     The vocabulary is the first element of every entry in the pickled
     ``top_trigrams.pkl`` list. For each lower-cased phrase of
     ``self.s.alignment.get_phrase_text()`` a dict mapping every vocabulary
     trigram to 0 is built, entries whose trigram occurs in the phrase are
     set to 1, and the dict is converted with ``hash_to_list``.

     Returns:
         ``np.array`` built from the per-phrase ``hash_to_list`` results.
     """
     # NOTE(review): pickle.load can execute arbitrary code; this path must
     # stay trusted. The file is also re-read on every call.
     with open('/data/corpora/cspan/top_trigrams.pkl', 'rb') as f:
         trigrams = pickle.load(f)
     keys = [u[0] for u in trigrams]
     phrases = [t.lower() for t in self.s.alignment.get_phrase_text()]
     feats = []
     for phrase in phrases:
         # Fresh all-zero indicator dict, keys inserted in vocabulary order.
         h = {k: 0 for k in keys}
         for gram in text_features.TextFeatures(phrase).get_n_grams(3):
             # h holds exactly the vocabulary keys, so this is the same
             # membership test as scanning the list, but in O(1).
             if gram in h:
                 h[gram] = 1
         feats.append(hash_to_list(h))
     return np.array(feats)
Exemple #3
0
sys.path.insert(0, '../core')

import speech, text_features

# Build bigram and trigram vocabularies (n-gram -> index assigned on first
# sight) plus occurrence counts over every phrase of every pickled speech.
# NOTE(review): the file is opened in text mode; pickle.load on Python 3
# requires a binary handle ('rb') -- confirm the target interpreter.
# NOTE(review): pickle.load can execute arbitrary code; the path must be trusted.
with open('/data/jrgillick/speeches.pkl') as f:
    bigram_vocab = {}     # bigram -> index in order of first appearance
    bigram_counts = {}    # bigram -> number of occurrences
    trigram_vocab = {}    # trigram -> index
    trigram_counts = {}   # trigram -> number of occurrences
    speeches = pickle.load(f)
    bigram_index = 0      # next unused bigram index
    trigram_index = 0     # next unused trigram index
    for s in tqdm(speeches):
        phrases = s.alignment.get_phrase_text()
        for phrase in phrases:
            tf = text_features.TextFeatures(phrase)
            bigrams = tf.get_n_grams(2)
            trigrams = tf.get_n_grams(3)

            for bigram in bigrams:
                if bigram not in bigram_vocab:
                    # First occurrence: register it and start its count at 1.
                    bigram_vocab[bigram] = bigram_index
                    bigram_counts[bigram] = 1
                    bigram_index += 1
                else:
                    bigram_counts[bigram] += 1

            # NOTE(review): the trigram branch mirrors the bigram one, but the
            # visible excerpt ends before any trigram_index increment or
            # else-branch -- presumably truncated; verify against the full file.
            for trigram in trigrams:
                if trigram not in trigram_vocab:
                    trigram_vocab[trigram] = trigram_index
                    trigram_counts[trigram] = 1
Exemple #4
0
 def get_euphony_features(self):
     """Return one euphony feature list per phrase.

     Every phrase from ``self.s.alignment.get_phrase_text()`` is wrapped in
     ``text_features.TextFeatures``; the result of its
     ``get_euphony_features()`` is converted with ``hash_to_list``.
     """
     phrases = self.s.alignment.get_phrase_text()

     # First pass: build an extractor for every phrase.
     extractors = []
     for phrase in phrases:
         extractors.append(text_features.TextFeatures(phrase))

     # Second pass: compute and flatten the euphony features.
     rows = []
     for extractor in extractors:
         rows.append(hash_to_list(extractor.get_euphony_features()))
     return rows
Exemple #5
0
 def get_name_feature_names(self):
     """Return the name-feature keys in sorted order.

     Only the first phrase of ``self.s.alignment.get_phrase_text()`` is
     inspected: its ``get_name_features()`` mapping supplies the keys.

     Returns:
         A new list of the mapping's keys, sorted ascending.
     """
     text = self.s.alignment.get_phrase_text()
     name_features = text_features.TextFeatures(text[0]).get_name_features()
     # sorted() returns a list on both Python 2 and 3; calling .sort() on
     # the result of .keys() fails on Python 3, where keys() is a view.
     return sorted(name_features.keys())
Exemple #6
0
 def get_mean_vector_features(self):
     """Return one ``get_mean_vector`` result per phrase.

     For each phrase ``t`` of ``self.s.alignment.get_phrase_text()`` a new
     ``text_features.TextFeatures`` is created and its
     ``get_mean_vector(t)`` result is collected.
     """
     phrases = self.s.alignment.get_phrase_text()
     vectors = []
     for t in phrases:
         # NOTE(review): the extractor is built from the unicode() of the
         # whole phrase collection, not from the single phrase -- confirm
         # this is intended.
         extractor = text_features.TextFeatures(unicode(phrases))
         vectors.append(extractor.get_mean_vector(t))
     return vectors
Exemple #7
0
 def get_name_features(self):
     """Return one name-feature list per lower-cased phrase.

     Phrases come from ``self.s.alignment.get_phrase_text()``; each is
     lower-cased, wrapped in ``text_features.TextFeatures``, and the result
     of its ``get_name_features()`` is converted with ``hash_to_list``.
     """
     # Lower-case every phrase first.
     lowered = []
     for phrase in self.s.alignment.get_phrase_text():
         lowered.append(phrase.lower())

     # Then build all extractors.
     extractors = []
     for phrase in lowered:
         extractors.append(text_features.TextFeatures(phrase))

     # Finally compute and flatten the name features.
     rows = []
     for extractor in extractors:
         rows.append(hash_to_list(extractor.get_name_features()))
     return rows