def __run_training():
    # Setup
    args = CliParser()
    args.print_params("EMOJI TRAINING")

    # Build knowledge base
    print("reading training data from: " + args.data_folder)
    kb, ind2phr, ind2emoji = build_kb(args.data_folder)

    # Save the mapping from index to emoji
    with open(args.mapping_file, "wb") as f:
        pk.dump(ind2emoji, f)

    # Get the embeddings for each phrase in the training set
    embeddings_array = generate_embeddings(
        ind2phr=ind2phr,
        kb=kb,
        embeddings_file=args.embeddings_file,
        word2vec_file=args.word2vec_file,
    )

    # Split the knowledge base into train and dev example sets. This is just a reprocessing
    # of the knowledge base for efficiency, so we don't have to regenerate the train and dev
    # sets on every training run
    train_set = get_examples_from_kb(kb=kb, example_type="train")
    dev_set = get_examples_from_kb(kb=kb, example_type="dev")

    train_save_evaluate(
        params=args.model_params,
        kb=kb,
        train_set=train_set,
        dev_set=dev_set,
        ind2emoji=ind2emoji,
        embeddings_array=embeddings_array,
        dataset_name=args.dataset,
    )
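
The index-to-emoji mapping dumped above can be reloaded later, for example at evaluation time. A minimal sketch, assuming pk is the standard pickle module under the alias used in this example:

import pickle as pk

def load_emoji_mapping(mapping_file):
    # Reload the index-to-emoji mapping that training saved with pk.dump
    with open(mapping_file, "rb") as f:
        return pk.load(f)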
Example #3
def __run_grid_search():
    # Read in arguments
    args = pp.CliParser()
    args.print_search_params("EMOJI2VEC GRID SEARCH", search_params)

    # Read in training data, generate mappings, and generate embeddings
    print("reading training data from: " + args.data_folder)
    kb, ind2phr, ind2emoji = build_kb(args.data_folder)
    with open(args.mapping_file, "wb") as f:
        pk.dump(ind2emoji, f)
    embeddings_array = generate_embeddings(
        ind2phr=ind2phr,
        kb=kb,
        embeddings_file=args.embeddings_file,
        word2vec_file=args.word2vec_file,
    )

    # Perform grid search
    print("performing grid search")
    results_dict = grid_search(
        params=search_params,
        learning_rate=args.model_params.learning_rate,
        threshold=args.model_params.class_threshold,
        in_dim=args.model_params.in_dim,
        kb=kb,
        embeddings_array=embeddings_array,
        ind2emoji=ind2emoji,
        dataset_name=args.dataset,
    )

    # Get top 5 results
    results = sorted(
        results_dict, key=(lambda x: results_dict[x]["auc"]), reverse=True
    )
    for result in results[:5]:
        print(str.format("{}\n{}", result, results_dict[result]))

    m = results_dict[results[0]]
    print(
        str.format(
            "The best combination, by auc score, is: {} at {}", results[0], m
        )
    )
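
search_params is defined at module level and not shown here. For illustration only, a hypothetical grid of the kind it might hold, with the combinations enumerated via itertools.product; this is not the module's actual grid_search implementation:

from itertools import product

# Hypothetical hyperparameter grid; the real search_params lives elsewhere in the module.
search_params = {
    "batch_size": [4, 8, 16],
    "dropout": [0.0, 0.1],
    "max_epochs": [10, 20],
}

# Enumerate every combination in the grid.
keys = sorted(search_params)
for values in product(*(search_params[k] for k in keys)):
    combo = dict(zip(keys, values))
    print(combo)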
Example #4
    def __init__(self, config):
        super().__init__()
        self.config = config
        annos = read_json(config['anno_file'])[config['emo_type']]

        ##################################################
        # The author had commented this block out; it controls whether only the
        # samples listed in id_file (the training split) are used for training.
        # In train_test mode, all of the data should be preprocessed in advance.
        if not config["val_file"]:
            print("Caution! Loading Samples from {}".format(config['id_file']))
            ids = []
            tmp_annos = []
            with open(config['id_file']) as fin:
                for line in fin.readlines():
                    ids.append(int(line.strip()))
            
            for jj, anno in enumerate(annos):
                if jj in ids:
                    tmp_annos.append(anno)
            annos = tmp_annos
        ##################################################
            
        emo_num = 9 if config['emo_type'] == 'primary' else 14
        self.emotion_classes = EMOTIONS[:emo_num]
        
        data = read_json(config['data_file'])
        self.visual_features, self.audio_features, self.text_features = [], [], []
        self.visual_valids, self.audio_valids, self.text_valids = [], [], []

        ################################
        # Lists for storing concepts, in preparation for adding the knowledge graph later
        self.visual_concepts, self.audio_concepts, self.text_concepts = list(), list(), list()
        self.visual_concepts_lengths, self.audio_concepts_lengths, self.text_concepts_lengths = list(), list(), list()
        ################################

        self.labels = []
        self.charcaters_seq = []
        self.time_seq = []
        self.target_loc = []
        self.seg_len = [] 
        self.n_character = []
        vfe = VisualFeatureExtractor(config)
        afe = AudioFeatureExtractor(config)
        tfe = TextFeatureExtractor(config)
        pfe = PersonalityFeatureExtractor(config)
        self.personality_list = pfe.get_features() # n_c
        self.personality_features = []

        ###################################
        print("Processing Concepts")
        self.concept2id_v, self.id2concept_v = build_vocab(config, 'visual')
        self.concept2id_a, self.id2concept_a = build_vocab(config, 'audio')
        self.concept2id_t, self.id2concept_t = build_vocab(config, 'text')

        vfe.concepts2id = self.concept2id_v
        afe.concepts2id = self.concept2id_a
        tfe.concepts2id = self.concept2id_t

        # print(self.concept2id_t)

        # print(afe.concepts2id)

        assert config["visual"]["concept_size"] == len(self.concept2id_v), "the size of concept in config ({}) mismatches the size captured from data ({})".format(config["visual"]["concept_size"], len(self.concept2id_v))
        assert config["audio"]["concept_size"] == len(self.concept2id_a), "the size of concept in config ({}) mismatches the size captured from data ({})".format(config["audio"]["concept_size"], len(self.concept2id_a))
        assert config["text"]["concept_size"] == len(self.concept2id_t), "the size of concept in config ({}) mismatches the size captured from data ({})".format(config["text"]["concept_size"], len(self.concept2id_t))
        ###################################

        ###################################
        print("Processing Knowledge") 
        vectors = Magnitude(config["knowledge"]["embedding_file"])
        self.embedding_concept_v = get_concept_embedding(self.concept2id_v, config, vectors)
        self.embedding_concept_a = get_concept_embedding(self.concept2id_a, config, vectors)
        self.embedding_concept_t = get_concept_embedding(self.concept2id_t, config, vectors)

        self.edge_matrix_v, self.affectiveness_v = build_kb(self.concept2id_v, config, "visual")
        self.edge_matrix_a, self.affectiveness_a = build_kb(self.concept2id_a, config, "audio")
        self.edge_matrix_t, self.affectiveness_t = build_kb(self.concept2id_t, config, "text")
        ###################################
        
        print('Processing Samples...')
        for jj, anno in enumerate(tqdm(annos)):
            # if jj >= 300: break
            clip = anno['clip']
            target_character = anno['character']
            target_moment = anno['moment']
            on_characters = data[clip]['on_character']
            if target_character not in on_characters:
                on_characters.append(target_character)
            on_characters = sorted(on_characters)
            
            charcaters_seq, time_seq, target_loc, personality_seq = [], [], [], []
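            # For every segment and every on-screen character, record a one-hot speaker
            # vector, the segment index, the character's personality vector, and a 1/0
            # flag marking whether this (character, segment) pair contains the annotated
            # target moment.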
            

            for ii in range(len(data[clip]['seg_start'])):
                for character in on_characters:
                    charcaters_seq.append([0 if character != i else 1 for i in range(len(config['speakers']))])
                    time_seq.append(ii)
                    personality_seq.append(self.personality_list[character])
                    if character == target_character and data[clip]['seg_start'][ii] <= target_moment < data[clip]['seg_end'][ii]:
                        target_loc.append(1)
                    else:
                        target_loc.append(0)
            # for character in on_characters:
            #     for ii in range(len(data[clip]['seg_start'])):
            #         charcaters_seq.append([0 if character != i else 1 for i in range(len(config['speakers']))])
            #         time_seq.append(ii)
            #         personality_seq.append(self.personality_list[character])
            #         if character == target_character and data[clip]['seg_start'][ii] <= target_moment < data[clip]['seg_end'][ii]:
            #             target_loc.append(1)
            #         else:
            #             target_loc.append(0)
            
            ####################################################
            # The *c values are the corresponding concepts, read into lists; how to handle action features is still undecided
            vf, v_valid, vc = vfe.get_feature(anno['clip'], target_character) # seqlen * n_c, dim_features_v
            af, a_valid, ac = afe.get_feature(anno['clip'], target_character) # seqlen * n_c, dim_features_a
            tf, t_valid, tc = tfe.get_feature(anno['clip'], target_character) # seqlen * n_c, dim_features_t
            ####################################################
            
            self.n_character.append(len(on_characters))
            self.seg_len.append(len(data[clip]['seg_start']))
    
            self.personality_features.append(torch.stack(personality_seq)) # num_anno, seqlen * n_c, dim_features_p
            self.charcaters_seq.append(torch.tensor(charcaters_seq)) # num_anno, seqlen * n_c, some
            self.time_seq.append(torch.tensor(time_seq)) # num_anno, seqlen * n_c, some
            self.target_loc.append(torch.tensor(target_loc, dtype=torch.int8)) # num_anno, seqlen * n_c
            self.visual_features.append(vf) # num_anno, seqlen * n_c, dim_features_v
            self.audio_features.append(af) # num_anno, seqlen * n_c, dim_features_a
            self.text_features.append(tf) # num_anno, seqlen * n_c, dim_features_t
            self.visual_valids.append(v_valid) # num_anno, seqlen * n_c
            self.audio_valids.append(a_valid) # num_anno, seqlen * n_c
            self.text_valids.append(t_valid) # num_anno, seqlen * n_c

            #######################################################
            # Save the concepts so that they stay aligned with their samples
            lengths = list()
            vc_new = list()
            for concepts in vc:
                new = torch.zeros(512, dtype=torch.long)
                lengths.append(concepts.size(0))
                new[:concepts.size(0)] = concepts[:]
                vc_new.append(new)
            self.visual_concepts_lengths.append(torch.tensor(lengths, dtype=torch.int8)) # num_anno, seqlen

            # assert len(vc_new) == len(vc) and len(vc_new) == len(data[clip]['seg_start'])

            ac_new = list()
            lengths = list()
            for concepts in ac:
                # print(concepts)
                new = torch.zeros(512, dtype=torch.long) # max_num_concept
                lengths.append(concepts.size(0))
                new[:concepts.size(0)] = concepts[:]
                ac_new.append(new)
            self.audio_concepts_lengths.append(torch.tensor(lengths, dtype=torch.int8)) # num_anno, seqlen

            tc_new = list()
            lengths = list()
            for concepts in tc:
                new = torch.zeros(512, dtype=torch.long)
                lengths.append(concepts.size(0))
                new[:concepts.size(0)] = concepts[:]
                tc_new.append(new)
            self.text_concepts_lengths.append(torch.tensor(lengths, dtype=torch.int8)) # num_anno, seqlen

            self.visual_concepts.append(torch.stack(vc_new, dim=0)) # num_anno, seqlen, max_num_concept
            # assert torch.stack(vc_new, dim=0).size(0) == len(data[clip]['seg_start'])
            self.audio_concepts.append(torch.stack(ac_new, dim=0)) # num_anno, seqlen, max_num_concept
            self.text_concepts.append(torch.stack(tc_new, dim=0)) # num_anno, seqlen, max_num_concept
            #######################################################

            self.labels.append(self.emotion_classes.index(anno['emotion']))
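
The three padding loops above (visual, audio, text) repeat the same pattern: each segment's variable-length tensor of concept ids is copied into a fixed 512-slot buffer and its true length is recorded. A minimal sketch of a helper that factors this out; the name pad_concepts is an assumption, and it uses torch.long lengths rather than the original int8 (which could overflow if a segment ever held more than 127 concepts):

import torch

def pad_concepts(concept_list, max_num_concept=512):
    # concept_list: one 1-D LongTensor of concept ids per segment.
    # Returns a (seqlen, max_num_concept) tensor of padded ids and a (seqlen,) tensor of true lengths.
    padded, lengths = [], []
    for concepts in concept_list:
        buf = torch.zeros(max_num_concept, dtype=torch.long)
        buf[:concepts.size(0)] = concepts
        padded.append(buf)
        lengths.append(concepts.size(0))
    return torch.stack(padded, dim=0), torch.tensor(lengths, dtype=torch.long)

With such a helper, each modality reduces to a single call, e.g. vc_padded, vc_lengths = pad_concepts(vc), before appending to the per-sample lists.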