Example #1
def get_test_dataset(cfg: DictConfig):
    """
    Get test dataset
    :param cfg: general hydra config
    :return: test dataset
    """
    random.seed(cfg.trainer.seed)

    with open(f'../../{cfg.data_params.folder_path}{cfg.inference.file_name}',
              'r',
              encoding='utf-8') as f:
        data = json.load(f)

    if cfg.data_params.debug_mode:
        data = random.sample(data, len(data) // 10)

    sentences = [s['text'] for s in data]
    labels = [s['label'] for s in data]

    dataset_class = load_obj(cfg.dataset.class_name)
    tokenizer = load_obj(cfg.model.tokenizer).from_pretrained(
        f"../../{cfg.inference.save_dir}", do_lower_case=True)

    test_dataset = dataset_class(
        cfg=cfg,
        data=sentences,
        targets=labels,
        tokenizer=tokenizer,
    )
    return test_dataset
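For reference, a minimal, hypothetical OmegaConf config sketch showing the keys this function reads; the key names mirror the lookups above, but the values are placeholders rather than the project's real settings:

from omegaconf import OmegaConf

cfg_sketch = OmegaConf.create({
    "trainer": {"seed": 42},
    "data_params": {"folder_path": "data/", "debug_mode": False},
    "inference": {"file_name": "test.json", "save_dir": "weights"},
    "dataset": {"class_name": "src.datasets.TextDataset"},   # assumed dotted path
    "model": {"tokenizer": "transformers.BertTokenizer"},     # assumed dotted path
})
# test_dataset = get_test_dataset(cfg_sketch)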
Example #2
    def __init__(self, image_ids, image_folder_path, mode='train',
                 vocab_file="", vocab_threshold=5, batch_size=10):
        assert mode in ['train', 'val', 'test']
        
        self.mode = mode
        self.image_folder_path = image_folder_path
        self.batch_size = batch_size
        
        # Get pre-processed objects
        all_captions_dict = load_obj('captions_dict')
        captions_dict = { image_id: all_captions_dict[image_id] for image_id in image_ids } # only include selected subset of captions

        # Obtain sample of training images
        #self.training_image_ids, captions_dict = get_training_indices(sample_size = sample_size, mode = "balanced_clean")
        
        # self.training_image_ids, self.images_path, self.image_id_dict, captions_dict \
        # = get_data(image_folder_path, annotations_path, sample_size, data_type)

        # Set up vocabulary or load from training set
        if self.mode == 'train':
            self.vocab = Vocabulary(captions_dict)
            print('Vocabulary successfully created')
        elif vocab_file != "":
            self.vocab = vocab_file
            self.word2idx = self.vocab.word2idx
            self.idx2word = self.vocab.idx2word
            #print('Vocabulary successfully loaded')
        else:
            self.vocab = load_obj("vocab")
            self.word2idx = self.vocab.word2idx
            self.idx2word = self.vocab.idx2word
            print('Vocabulary successfully loaded')

        # Batch size is set to 1 in test mode
        if self.mode == 'test':
            self.batch_size = 1
        
        # Set up dataset
        self.im_ids = [] # with duplicates for indexing, i.e. if caption 1-5 all correspond to image 8, the im_ids will be [8,8,8,8,8]
        self.captions = []
        self.images = []
        self.captions_len = []
        for im_id, captions_list in captions_dict.items():
            for item in captions_list:
                self.im_ids.append(im_id)
                self.captions.append(item)
                self.captions_len.append(len(nltk.tokenize.word_tokenize(item)))
        
        # Set up parameters for image feature extraction
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225)),
        ])
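captions_len is presumably collected so that a sampler can build batches of captions with the same length; a hypothetical sketch of such a sampler (not shown in the original excerpt):

import random

def sample_indices_by_length_sketch(captions_len, batch_size):
    # Hypothetical helper: pick a caption length at random, then sample up to
    # batch_size indices whose captions all share that length, so the batch
    # can be stacked into a single tensor without padding.
    target_len = random.choice(captions_len)
    candidates = [i for i, length in enumerate(captions_len) if length == target_len]
    return random.sample(candidates, min(batch_size, len(candidates)))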
Example #3
def get_training_datasets(cfg: DictConfig) -> Tuple:
    """
    Get train and validation datasets
    :param cfg: general hydra config
    :return: tuple of train and valid dataset objects
    """
    random.seed(cfg.trainer.seed)

    with open(
            f'../../{cfg.data_params.folder_path}{cfg.data_params.file_name}',
            'r',
            encoding='utf-8') as f:
        data = json.load(f)

    if cfg.data_params.debug_mode:
        print("DEBUG MOD ON!")
        data = random.sample(data, len(data) // 100)

    sentences = [s['text'] for s in data]
    labels = [s['label'] for s in data]
    train_data, valid_data, train_labels, val_labels = train_test_split(
        sentences,
        labels,
        test_size=cfg.data_params.test_size,
        shuffle=True,
        random_state=cfg.trainer.seed,
    )

    dataset_class = load_obj(cfg.dataset.class_name)
    tokenizer = load_obj(cfg.model.tokenizer).from_pretrained(
        cfg.model.model_path, do_lower_case=True)

    train_dataset = dataset_class(
        cfg=cfg,
        data=train_data,
        targets=train_labels,
        tokenizer=tokenizer,
    )
    valid_dataset = dataset_class(
        cfg=cfg,
        data=valid_data,
        targets=val_labels,
        tokenizer=tokenizer,
    )

    return train_dataset, valid_dataset
Example #4
def eval_model(image_folder_path,
               vocab_path='',
               model_path='',
               training_image_ids_path='',
               sample_size=100,
               mode='balanced_mode'):
    test_pred_captions = predict_for_test_samples(
        image_folder_path=image_folder_path,
        sample_size=sample_size,
        vocab_path=vocab_path,
        model_path=model_path,
        training_image_ids_path=training_image_ids_path,
        mode=mode)

    # load objects
    im_gender_summary = load_obj("im_gender_summary")
    captions_dict = load_obj("captions_dict")

    # populate ref_captions
    ref_captions = []
    for _, captions in captions_dict.items():
        for c in captions:
            ref_captions.append(c.split())
    # initialize lists
    gt = []
    pred_gender = []
    bleus = []

    for image_id, caption in test_pred_captions.items():
        gt.append(im_gender_summary[image_id]['pred_gt'])
        pred_gender.append(caption_to_gender(caption))
        bleus.append(sentence_bleu(ref_captions, caption.split()))

    labels = ['male', 'neutral', 'female']
    conf_matrix = confusion_matrix(gt, pred_gender, labels=labels)
    confusion_matrix_df = pd.DataFrame(conf_matrix,
                                       index=labels,
                                       columns=labels)
    accuracy = accuracy_score(gt, pred_gender)
    bleu = np.mean(bleus)

    print(
        f"Confusion Matrix (rows = Ground Truth gender, columns = Predicted gender):\n{confusion_matrix_df}"
    )
    print(f"\nAccuracy score: {accuracy}")
    print(
        f"\nTest Bleu Score (compared against original human labels of the test set): {bleu}"
    )

    return confusion_matrix_df, accuracy, bleu
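caption_to_gender is used above but not shown in this excerpt; a rough, hypothetical sketch of what it might do, reusing the load_obj, nltk and gender_nouns_lookup objects seen in the other examples (the real helper may differ):

def caption_to_gender_sketch(caption):
    # Hypothetical illustration only: label a caption 'male', 'female' or
    # 'neutral' from the first gendered noun it contains, defaulting to
    # 'neutral'. Assumes load_obj and gender_nouns_lookup as used elsewhere.
    gender_nouns_lookup = load_obj('gender_nouns_lookup')
    for token in nltk.word_tokenize(caption.lower()):
        if token in gender_nouns_lookup['female']:
            return 'female'
        if token in gender_nouns_lookup['male']:
            return 'male'
    return 'neutral'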
Example #5
def run(cfg: DictConfig) -> Dict:
    """
    Model's inference on hold-out dataset
    ----------
    Example:
        python predict.py
    :param cfg: general hydra config
    :return: dict: test metrics
    """
    hparams = flatten_omegaconf(cfg)

    exp = Experiment(
        api_key=cfg.logger.comet_api,
        project_name=cfg.general.project_name,
        workspace=cfg.general.workspace,
    )
    exp.log_parameters(hparams)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    bert_model = load_obj(cfg.model.class_name).from_pretrained(cfg)
    bert_model.model.load_state_dict(
        torch.load(f"../../{cfg.inference.save_dir}/pytorch_model.bin"))
    # print(bert_model)
    bert_model.to(device)
    bert_model.eval()

    collator = load_obj(cfg.dataset.collator)(
        percentile=cfg.data_params.percentile,
        pad_value=cfg.data_params.pad_value,
    )
    criterion = nn.CrossEntropyLoss()

    test = get_test_dataset(cfg)

    evaluator = Evaluator(val_dataset=test,
                          collator=collator,
                          criterion=criterion,
                          cfg=cfg,
                          device=device)

    set_seed(cfg.trainer.seed)
    test_metrics = evaluator(model=bert_model, experiment=exp, epoch=0, step=0)

    return test_metrics
Example #6
def train_test_split(training_image_ids, test_size=0.3, random_state=123):
    # Get pre-processed objects
    im_gender_summary = load_obj('im_gender_summary')

    X = np.asarray(training_image_ids)
    y = np.asarray([im_gender_summary[x]['pred_gt'] for x in X])
    # Use StratifiedShuffleSplit to ensure the gender ratio stays the same in the train set and validation set (whether balanced or random)
    sss = StratifiedShuffleSplit(n_splits=1,
                                 test_size=test_size,
                                 random_state=random_state)
    for train_idx, test_idx in sss.split(X, y):
        train_image_ids, test_image_ids = X[train_idx], X[test_idx]
        gender_train, gender_test = y[train_idx], y[test_idx]
    return train_image_ids, test_image_ids, gender_train, gender_test
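A quick, hypothetical sanity check (not part of the original code) that the stratified split above keeps the gender proportions roughly equal in both splits:

from collections import Counter

def gender_ratio_check(training_image_ids):
    # Illustration only: print the gender proportions in each split produced
    # by the train_test_split defined above.
    _, _, gender_train, gender_test = train_test_split(
        training_image_ids, test_size=0.3, random_state=123)
    for name, genders in [('train', gender_train), ('test', gender_test)]:
        counts = Counter(genders)
        total = sum(counts.values())
        print(name, {g: round(c / total, 2) for g, c in counts.items()})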
def predict_for_test_samples(sample_size, image_folder_path, vocab_path='',
                             model_path='', training_image_ids_path='',
                             embed_size=256, hidden_size=512,
                             mode='balanced_clean'):
    random.seed(123)
    # Init Dict
    test_pred_captions = dict()
    
    # Get the training image id paths
    if training_image_ids_path == '': # if not specified, load the training_image_ids pickle saved by save_obj
        training_image_ids = load_obj('training_image_ids')
    else:
        with open(training_image_ids_path, 'rb') as f:
            training_image_ids = pickle.load(f)

    # Get test image ids
    test_image_ids = get_test_indices(sample_size, training_image_ids, mode = mode)

    for test_image_id in test_image_ids:
        captions = predict_from_COCO(image_folder_path, vocab_path = vocab_path, model_path = model_path, training_image_ids_path = training_image_ids_path,\
              test_image_id = test_image_id, is_print = False, return_one = True)
        test_pred_captions[test_image_id] = captions

    return test_pred_captions
Example #8
def get_qualified_dataset(annotations_path, save_file=False):
    '''
    captions_dict (dict) - key: image_id, value: list of captions

    im_gender_summary (dict of dict) - key: image_id, value: dict()
    keys in dict: pred_gt - predicted ground-truth label of the gender noun
                per_gt - % of annotations (out of 5 total) that agreed with the GT
                agreement_score - agreement score calculated from the distance between the 5 predictions,
                                with 1 being the best; male = 1, female = -1, neutral = 0
                                e.g. annotations [f, f, f, f, f] -> agreement_score = 1.00
                                e.g. annotations [m, m, f, f, f] -> agreement_score = 0.00
                                e.g. annotations [n, n, f, f, f] -> agreement_score = 0.50
                anno_gender - list of gender sentiments, e.g. ['male', 'female', 'neutral', 'female', 'female']
                anno_noun - list of nouns used to describe the human
                clean_gender - binary variable indicating whether all annotations used the same gender or a gender-neutral noun
                clean_noun - binary variable indicating whether all annotations used the identical noun

    not_human_im_ids (list) - list of image ids where more than one caption does not mention a human.
    Since the COCO dataset does not label whether a human (or any other object) is the main subject
    of an image, this list helps us isolate images with human figures as the focus.
    '''
    captions_dict = dict()
    im_gender_summary = dict()
    not_human_im_ids = list()

    # load pre-processed data
    gender_nouns_lookup = load_obj('gender_nouns_lookup')

    for datatype in ['train', 'val']:
        print(f"\nEvaluating ground truth labels in {datatype} set")
        with open(f'{annotations_path}/captions_{datatype}2014.json') as f:
            captions_json = json.load(f)

            for i in range(len(captions_json['annotations'])):
                # Check that the image file exists: some captions appear in the json file even though the image itself is missing
                image_id = captions_json['annotations'][i]['image_id']
                l = len(str(image_id))
                fnames = [
                    "COCO_train2014_" + "0" * (12 - l) + str(image_id) +
                    '.jpg',
                    "COCO_val2014_" + "0" * (12 - l) + str(image_id) + '.jpg'
                ]
                image_check = glob.glob('./data/images/*/' +
                                        fnames[0]) + glob.glob(
                                            './data/images/*/' + fnames[1])

                if image_check != []:
                    caption = captions_json['annotations'][i]['caption']
                    tokens = nltk.word_tokenize(caption)
                    c_female = 0  # count of gender nouns and gender-neutral nouns
                    c_male = 0
                    c_neutral = 0
                    noun = []

                    # Evaluate annotator's noun used to describe humans
                    for t in tokens:
                        t = t.lower()
                        if t in gender_nouns_lookup['female']:
                            c_female += 1
                            noun.append(t)
                        elif t in gender_nouns_lookup['male']:
                            c_male += 1
                            noun.append(t)
                        elif t in gender_nouns_lookup['neutral']:
                            c_neutral += 1
                            noun.append(t)

                    # Only keep this caption if it mentions exactly one gendered or gender-neutral noun;
                    # captions with conflicting gender mentions are dropped, e.g. "a boy and a girl are on a beach"
                    if c_female + c_male + c_neutral == 1:
                        # Assign gender sentiment to the caption
                        if c_female > 0:
                            gender = 'female'
                        elif c_male > 0:
                            gender = 'male'
                        else:
                            gender = 'neutral'

                        # Populate captions dict and image gender summary dict
                        if image_id in captions_dict:
                            captions_dict[image_id] += [caption]
                            im_gender_summary[image_id]['anno_gender'].append(
                                gender)
                            im_gender_summary[image_id]['anno_noun'].append(
                                noun[0])
                        else:
                            captions_dict[image_id] = [caption]
                            im_gender_summary[image_id] = dict()
                            im_gender_summary[image_id]['anno_gender'] = [
                                gender
                            ]
                            im_gender_summary[image_id]['anno_noun'] = [
                                noun[0]
                            ]

                    if i % 100000 == 0:
                        print(
                            f"Caption {i} processed, out of {len(captions_json['annotations'])} captions"
                        )
                        print(
                            f"No. of qualified images processed: {len(im_gender_summary)}"
                        )

    for image_id in im_gender_summary:
        # Delete images where <3 annotators mentioned the human figure
        # Because it is impossible to estimate the ground truth using only 1 or 2 captions
        if len(im_gender_summary[image_id]['anno_gender']) < 3:
            not_human_im_ids.append(image_id)

        else:
            pred = im_gender_summary[image_id]['anno_gender']

            # Evaluate groundtruth guesses and agreement scores
            gt = max(set(pred), key=pred.count)

            # Populate dictionary
            im_gender_summary[image_id]['pred_gt'] = gt
            im_gender_summary[image_id]['per_gt'] = sum(
                [1 for p in pred if p == gt]) / len(pred)
            im_gender_summary[image_id]['agreement_score'] = agreement_score(
                pred)
            if len(set(pred)) == 1:
                im_gender_summary[image_id]['clean_gender'] = 1
            else:
                im_gender_summary[image_id]['clean_gender'] = 0
            if len(set(im_gender_summary[image_id]['anno_noun'])) == 1:
                im_gender_summary[image_id]['clean_noun'] = 1
            else:
                im_gender_summary[image_id]['clean_noun'] = 0

    for image_id in not_human_im_ids:
        try:
            del captions_dict[image_id]
            del im_gender_summary[image_id]
        except KeyError:
            pass

    if save_file:
        export_csv('./data/list/qualified_image_ids.csv',
                   list(im_gender_summary.keys()))
        save_obj(captions_dict, 'captions_dict')
        save_obj(im_gender_summary, 'im_gender_summary')
    else:
        return captions_dict, im_gender_summary
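agreement_score is called above but not defined in this excerpt; below is a hypothetical sketch that reproduces the three worked examples from the docstring (male = 1, female = -1, neutral = 0). The actual implementation may differ.

def agreement_score_sketch(pred):
    # Hypothetical illustration only, not the original helper. Map each
    # annotation to a number and score agreement as 1 minus half the spread
    # between the most extreme annotations, which matches the docstring
    # examples: [f,f,f,f,f] -> 1.00, [m,m,f,f,f] -> 0.00, [n,n,f,f,f] -> 0.50.
    values = {'male': 1, 'neutral': 0, 'female': -1}
    nums = [values[p] for p in pred]
    return 1.0 - (max(nums) - min(nums)) / 2.0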
Example #9
def get_test_indices(sample_size, training_image_ids=[], mode='random'):
    '''
    training_image_ids: image ids used in training or validation while training the model
    sample_size: # of image ids needed
    '''
    assert mode in ['random', 'balanced_mode', 'balanced_clean']
    assert isinstance(sample_size, int)

    test_captions_dict = dict()

    # Get pre-processed objects
    im_gender_summary = load_obj('im_gender_summary')
    captions_dict = load_obj('captions_dict')
    shuffle_im_keys = list(im_gender_summary.keys())
    random.shuffle(shuffle_im_keys)

    if mode == 'random':
        i = 0
        for image_id in shuffle_im_keys:
            if i < sample_size:
                if image_id not in training_image_ids:
                    test_captions_dict[image_id] = captions_dict[image_id]
                    i += 1

    elif mode == 'balanced_mode':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in shuffle_im_keys:
            if i < sample_size:
                if image_id not in training_image_ids:
                    if im_gender_summary[image_id]['pred_gt'] == 'male' and (
                            male_count < sample_size / 3):
                        test_captions_dict[image_id] = captions_dict[image_id]
                        male_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'female' and (female_count <
                                                        sample_size / 3):
                        test_captions_dict[image_id] = captions_dict[image_id]
                        female_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'neutral' and (neutral_count <
                                                         sample_size / 3):
                        test_captions_dict[image_id] = captions_dict[image_id]
                        neutral_count += 1
                        i += 1

    elif mode == 'balanced_clean':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in shuffle_im_keys:
            if i < sample_size:
                if image_id not in training_image_ids:
                    if im_gender_summary[image_id]['clean_gender'] == 1:
                        if im_gender_summary[image_id][
                                'pred_gt'] == 'male' and (male_count <
                                                          sample_size / 3):
                            test_captions_dict[image_id] = captions_dict[
                                image_id]
                            male_count += 1
                            i += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'female' and (female_count <
                                                            sample_size / 3):
                            test_captions_dict[image_id] = captions_dict[
                                image_id]
                            female_count += 1
                            i += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'neutral' and (neutral_count <
                                                             sample_size / 3):
                            test_captions_dict[image_id] = captions_dict[
                                image_id]
                            neutral_count += 1
                            i += 1

    return test_captions_dict
Example #10
def get_training_indices(sample_size, mode='random'):
    '''
    8 different modes of generating data
    - random: randomized selection of qualified images
    - balanced_mode: balanced ratio between male, female and neutral
    - balanced_clean: balanced ratio between male, female and neutral,
                      only use images where all captions agree on the same gender
    - balanced_gender_only: same as balanced_mode, but without neutral captions
    - balanced_clean_noun: balanced ratio between male, female and neutral, only use images where all captions
                           agree on the same noun
    - clean_noun: only use images where all captions agree on the same noun
    - activity_balanced: from activity-tagged image sets, choose the same ratio of male, female and neutral images
    - activity_balanced_clean: similar to activity_balanced, but all captions must agree on the same gender

    Note that the output size may be smaller than sample_size,
    especially for activity_balanced and activity_balanced_clean, since for certain activities the amount of
    clean data can be limited for some classes, e.g. women wearing ties.
    '''
    assert mode in ['random', 'balanced_mode', 'balanced_clean', 'balanced_gender_only',
                    'balanced_clean_noun', 'clean_noun', 'activity_balanced', 'activity_balanced_clean']
    assert isinstance(sample_size, int)

    random.seed(123)
    training_captions_dict = dict()

    # Get pre-processed objects
    im_gender_summary = load_obj('im_gender_summary')
    captions_dict = load_obj('captions_dict')
    activity_image_ids = load_obj('activity_image_ids')

    if mode == 'random':
        training_captions_dict = dict(
            random.sample(captions_dict.items(), sample_size))

    elif mode == 'balanced_mode':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['pred_gt'] == 'male' and (
                        male_count < sample_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    male_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'female' and (
                        female_count < sample_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    female_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'neutral' and (
                        neutral_count < sample_size / 3):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    neutral_count += 1
                    i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'balanced_clean':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['clean_gender'] == 1:
                    if im_gender_summary[image_id]['pred_gt'] == 'male' and (
                            male_count < sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        male_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'female' and (female_count <
                                                        sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        female_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'neutral' and (neutral_count <
                                                         sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        neutral_count += 1
                        i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'balanced_clean_noun':
        i = 0
        male_count = 0
        female_count = 0
        neutral_count = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['clean_noun'] == 1:
                    if im_gender_summary[image_id]['pred_gt'] == 'male' and (
                            male_count < sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        male_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'female' and (female_count <
                                                        sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        female_count += 1
                        i += 1
                    elif im_gender_summary[image_id][
                            'pred_gt'] == 'neutral' and (neutral_count <
                                                         sample_size / 3):
                        training_captions_dict[image_id] = captions_dict[
                            image_id]
                        neutral_count += 1
                        i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'clean_noun':
        i = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['clean_noun'] == 1:
                    training_captions_dict[image_id] = captions_dict[image_id]
                    i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'balanced_gender_only':
        i = 0
        male_count = 0
        female_count = 0
        for image_id in im_gender_summary.keys():
            if i < sample_size:
                if im_gender_summary[image_id]['pred_gt'] == 'male' and (
                        male_count < sample_size / 2):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    male_count += 1
                    i += 1
                elif im_gender_summary[image_id]['pred_gt'] == 'female' and (
                        female_count < sample_size / 2):
                    training_captions_dict[image_id] = captions_dict[image_id]
                    female_count += 1
                    i += 1

                if i % 1000 == 0:
                    print(f"captions of {i} images are added")

    elif mode == 'activity_balanced':
        activity_sample_size = sample_size / len(activity_image_ids.keys())
        i = 0
        for activity in activity_image_ids.keys():
            image_ids = activity_image_ids[activity]
            j = 0
            male_count = 0
            female_count = 0
            neutral_count = 0
            for image_id in image_ids:
                if j < activity_sample_size:
                    if image_id in im_gender_summary:
                        if im_gender_summary[image_id][
                                'pred_gt'] == 'male' and (
                                    male_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            male_count += 1
                            i += 1
                            j += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'female' and (
                                    female_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            female_count += 1
                            i += 1
                            j += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'neutral' and (
                                    neutral_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            neutral_count += 1
                            i += 1
                            j += 1

                    if i > 0 and i % 100 == 0:
                        print(f"captions of {i} images are added")

    elif mode == 'activity_balanced_clean':
        activity_sample_size = sample_size / len(activity_image_ids.keys())
        i = 0
        for activity in activity_image_ids.keys():
            image_ids = activity_image_ids[activity]
            j = 0
            male_count = 0
            female_count = 0
            neutral_count = 0
            for image_id in image_ids:
                if j < activity_sample_size:
                    if image_id in im_gender_summary and im_gender_summary[
                            image_id]['clean_noun'] == 1:
                        if im_gender_summary[image_id][
                                'pred_gt'] == 'male' and (
                                    male_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            male_count += 1
                            i += 1
                            j += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'female' and (
                                    female_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            female_count += 1
                            i += 1
                            j += 1
                        elif im_gender_summary[image_id][
                                'pred_gt'] == 'neutral' and (
                                    neutral_count < activity_sample_size / 3):
                            training_captions_dict[image_id] = captions_dict[
                                image_id]
                            neutral_count += 1
                            i += 1
                            j += 1

                        if i > 0 and i % 1000 == 0:
                            print(f"captions of {i} images are added")

    training_image_ids = list(training_captions_dict.keys())
    save_obj(training_image_ids, 'training_image_ids')
    return training_image_ids, training_captions_dict
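The balanced_* branches above (and the matching ones in get_test_indices) repeat the same quota-per-gender pattern; a hypothetical helper capturing that pattern is sketched below, with illustrative names that are not part of the original code:

def sample_balanced_sketch(candidate_ids, im_gender_summary, captions_dict,
                           sample_size, genders=('male', 'female', 'neutral')):
    # Illustrative refactoring sketch only: allow each gender a quota of
    # sample_size / len(genders) images and stop once the overall quota is met.
    quota = sample_size / len(genders)
    counts = {g: 0 for g in genders}
    picked = dict()
    for image_id in candidate_ids:
        if len(picked) >= sample_size:
            break
        gender = im_gender_summary[image_id]['pred_gt']
        if gender in counts and counts[gender] < quota:
            picked[image_id] = captions_dict[image_id]
            counts[gender] += 1
    return picked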
Example #11
    data_loading_cleaning.cleaning_data(dict_df, 'data/filtered/')
    print()

    # transforming the data
    G, dict_geo_data, dict_distances = graph_transformation.\
        graph_transforming('data/filtered/')

    print()
    # saving the results so that this step does not have to be performed again
    utils.save_obj(G, 'objects/graph.pkl')
    utils.save_obj(dict_geo_data, 'objects/dict_geo_data.pkl')
    utils.save_obj(dict_distances, 'objects/dict_distances.pkl')

    # analyzing the graph
    # loading the objects
    G = utils.load_obj('objects/graph.pkl')
    dict_geo_data = utils.load_obj('objects/dict_geo_data.pkl')
    dict_distances = utils.load_obj('objects/dict_distances.pkl')
    print('Computing some metrics...')
    print()
    graph_metrics.computing_metrics(G, 'current network',
                                    'figures/current_network')

    dict_avg_speed = utils.computing_avg_speed_mode(G, dict_geo_data)
    current_efficiency, g_ideal, denom = graph_metrics\
        .global_efficiency_weighted(G, dict_distances, dict_avg_speed['RER'])
    print('Current global efficiency of the network:', current_efficiency)

    # detecting new routes to create
    # setting the costs of each type of route
    # costs of creating a new line and exploiting it, in €/km
def predict_from_image(image_path, vocab_path='', model_path='',
                       embed_size=256, hidden_size=512, is_print=True,
                       return_one=False):

    sample_size = 1
    
    # Get model
    if model_path == '': # if not specified, assume it is best model saved in models
        model_path = './models/best-model.pkl'
    if torch.cuda.is_available():
        checkpoint = torch.load(model_path)
    else:
        checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
        #checkpoint = torch.load(model_path, map_location='cpu')
    #print(f'Best model is loaded from {model_path} . . .')
    
    # Get the vocabulary and its size
    if vocab_path != '': # if a vocab path is specified, load the vocab pickle from that path
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)
            #print("Loaded vocab file of pretrained model")
    else:
        vocab = load_obj('vocab')
    vocab_size = len(vocab)
    
    # Transform image
    transform = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomCrop(224),
            #transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),(0.229, 0.224, 0.225)),
        ])
    image = Image.open(image_path).convert("RGB")
    original_image = np.array(image)
    image = transform(image)
    
    if is_print:
        transformed_image = image.numpy()
        transformed_image = np.squeeze(transformed_image)
        transformed_image = transformed_image.transpose((1, 2, 0))
        transformed_image = np.clip(transformed_image, 0, 1)

        # Print sample image, before and after pre-processing
        plt.imshow(np.asarray(original_image))
        #plt.imshow(np.squeeze(original_image))
        plt.title('Test image- original')
        plt.show()
        plt.imshow(transformed_image)
        plt.title('Test image- transformed')
        plt.show()

    # Initialize the encoder and decoder, and set each to inference mode
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the pre-trained weights
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])

    # Move models to GPU if CUDA is available.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        image = image.cuda()
    
    image = image.unsqueeze(0)

    features = encoder(image).unsqueeze(1)
    output = decoder.sample_beam_search(features)
    sentences = clean_sentence(output, vocab)
    
    if is_print:
        print('Predicted caption: \n')
        for sentence in set(sentences):
            print(f'{sentence}')
    elif return_one:
        return sentences[0]
    else:
        return list(set(sentences))
def predict_from_COCO(image_folder_path, vocab_path='', model_path='',
                      training_image_ids_path='', embed_size=256,
                      hidden_size=512, mode='balanced_mode', test_image_id='',
                      is_print=True, return_one=False):

    sample_size = 1
    
    # Get model
    if model_path == '': # if not specified, assume it is best model saved in models
        model_path = './models/best-model.pkl'
    if torch.cuda.is_available():
        checkpoint = torch.load(model_path)
    else:
        checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
        #checkpoint = torch.load('./models/best-model.pkl', map_location='cpu')
    #print(f'Best model is loaded from {model_path} . . .')
    
    # Get the vocabulary and its size
    if vocab_path != '': # if a vocab path is specified, load the vocab pickle from that path
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)
            #print("Loaded vocab file of pretrained model")
    else:
        vocab = load_obj('vocab')
    vocab_size = len(vocab)
    
    # Get the training image id paths
    if training_image_ids_path == '': # if not specified, load the training_image_ids pickle saved by save_obj
        training_image_ids = load_obj('training_image_ids')
    else:
        with open(training_image_ids_path, 'rb') as f:
            training_image_ids = pickle.load(f)
    
    
    test_image_ids = get_test_indices(sample_size, training_image_ids, mode = mode)
    image_id = list(test_image_ids.keys())[0]
    if not is_print:
        test_image_ids[test_image_id] = test_image_ids.pop(image_id)
    test_loader = load_data(test_image_ids.keys(), image_folder_path, mode = 'test', vocab_file = vocab)
    original_image, image = next(iter(test_loader))
    
    if is_print:
        transformed_image = image.numpy()
        transformed_image = np.squeeze(transformed_image)
        transformed_image = transformed_image.transpose((1, 2, 0))
        transformed_image = np.clip(transformed_image, 0, 1)

        # Print sample image, before and after pre-processing
        print(f'\nTest_image_id: {image_id}')
        plt.imshow(np.squeeze(original_image))
        plt.title('Test image- original')
        plt.show()
        plt.imshow(transformed_image)
        plt.title('Test image- transformed')
        plt.show()

    # Initialize the encoder and decoder, and set each to inference mode
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the pre-trained weights
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])

    # Move models to GPU if CUDA is available.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        image = image.cuda()
        
    features = encoder(image).unsqueeze(1)
    output = decoder.sample_beam_search(features)
    sentences = clean_sentence(output, vocab)
    
    if is_print:
        print('Predicted caption: \n')
        for sentence in set(sentences):
            print(f'{sentence}')
            
        original_captions = test_image_ids[image_id]
        print('\n\nOriginal captions labelled by human annotators: \n')
        for caption in set(original_captions):
            print(caption)
    elif return_one:
        return sentences[0]
    else:
        return list(set(sentences))
def run(config: DictConfig, logger=None):
    # ----------- setup experiment ------------------- #
    # modify the datamodule
    def train_dataloader(self):
        ds = CutMixDatasetWrapper(self.train, **config.cutmix)
        return DataLoader(ds, shuffle=True, **self.config)

    LitDataModule.train_dataloader = train_dataloader

    # setup logging
    if logger is None:
        logger = logging.getLogger(__name__)

    set_seed(config.training.seed)
    logger.info(f"using seed {config.training.seed}")
    wandb.login(key=config.logger.api)

    # init wandb logger
    wb = load_obj(config.logger.class_name)(**config.logger.params)

    # log the training config to wandb
    # create a new hparam dictionary with the relevant hparams and
    # log the hparams to wandb
    wb_hparam = OrderedDict({
        "training_fold": config.fold_num,
        "input_dims": config.training.image_dim,
        "batch_size": config.training.dataloaders.batch_size,
        "optimizer": config.optimizer.class_name,
        "scheduler": config.scheduler.class_name,
        "learning_rate": config.optimizer.params.lr,
        "weight_decay": config.optimizer.params.weight_decay,
        "num_epochs": config.training.num_epochs,
    })
    wb.log_hyperparams(wb_hparam)

    # ----------- prepare datasets ------------------- #
    logger.info("Prepare Training/Validation Datasets.")

    processor = Preprocessor(config.csv_dir, config.json_dir, config.image_dir,
                             5)
    df = pd.read_csv(config.fold_csv_dir)
    imsdir = config.image_dir
    df.filePath = [
        os.path.join(imsdir, df.image_id[i]) for i in range(len(df))
    ]

    processor.dataframe = df
    fold_num = config.fold_num
    trainFold, valFold = processor.get_fold(fold_num)
    # testFold, valFold = train_test_split(valFold, stratify=valFold.label, test_size=0.5)

    trainFold.reset_index(drop=True, inplace=True)
    # testFold.reset_index(drop=True, inplace=True)
    valFold.reset_index(drop=True, inplace=True)

    # init weights for loss function
    weights = None  # no weights for cutmix

    tfms_config = config.augmentation
    trn_augs = A.Compose(
        [
            load_obj(augs.class_name)(**augs.params)
            for augs in tfms_config.train_augs
        ],
        p=1.0,
    )
    valid_augs = A.Compose(
        [
            load_obj(augs.class_name)(**augs.params)
            for augs in tfms_config.valid_augs
        ],
        p=1.0,
    )
    test_augs = A.Compose(
        [
            load_obj(augs.class_name)(**augs.params)
            for augs in tfms_config.test_augs
        ],
        p=1.0,
    )

    tfms = {
        "train": trn_augs,
        "valid": valid_augs,
        "test": test_augs,
    }
    # init datamodule
    dl_config = config.training.dataloaders
    dm = LitDataModule(trainFold, valFold, valFold, tfms, dl_config)
    dm.setup()

    # set training total steps
    config.training.total_steps = (len(dm.train_dataloader()) *
                                   config.training.num_epochs)

    logger.info(f"Train dataset size: {len(dm.train_dataloader())}")
    logger.info(f"Validation dataset size: {len(dm.val_dataloader())}")

    # ----------- load lightning trainer ------------------- #

    trainer_cfg = config.lightning
    # init lightning callbacks
    chkpt = pl.callbacks.ModelCheckpoint(**trainer_cfg.model_checkpoint)

    cb_config = config.lightning.callbacks
    cbs = [
        load_obj(module.class_name)(**module.params) for module in cb_config
    ]

    if config.log_to_stdout:
        cbs.append(PrintCallback(log=logger))

    # init trainer
    args = trainer_cfg.init_args
    trainer = pl.Trainer(callbacks=cbs,
                         checkpoint_callback=chkpt,
                         logger=wb,
                         **args)

    # ----------- init lightning module ------------------- #
    logger.info("Build network.")
    model = LitModel(config, weights=weights)
    # update model loss function to soft cross entropy loss
    model.loss_fn = SoftTargetCrossEntropy(weight=weights)
    model.unfreeze_classifier()

    wb.watch(model.net)

    model_name = config.model.params.model_name or config.model.class_name

    logger.info(f"Init from base net: {model_name}")
    logger.info(
        f"Uses {str(config.optimizer.class_name).split('.')[-1]} optimizer.")
    logger.info(
        f"Learning Rate: {config.optimizer.params.lr}, Weight Decay: {config.optimizer.params.weight_decay}"
    )
    logger.info(
        f"Uses {str(config.scheduler.class_name).split('.')[-1]} scheduler.")

    tr_config = config.training

    logger.info(
        f"Training over {tr_config.num_epochs} epochs ~ {tr_config.total_steps} steps."
    )

    # ----------- start train/validation/test ------------------- #
    # Pass the datamodule as arg to trainer.fit to override model hooks :)
    trainer.fit(model, datamodule=dm)
    # Compute metrics on test dataset
    _ = trainer.test(model, datamodule=dm, ckpt_path=chkpt.best_model_path)

    # ----------- finish experiment/cleanup/save weights ------------------- #
    PATH = chkpt.best_model_path  # path to the best performing model
    WEIGHTS_PATH = config.training.model_save_dir

    # init best model
    logger.info(f"Restored best model weights from {PATH}.")
    params = {"config": config, "weights": weights}

    loaded_model = model.load_from_checkpoint(PATH, **params)
    torchmodel = loaded_model.net

    torch.save(torchmodel.state_dict(), WEIGHTS_PATH)
    # upload the weights file to wandb
    wandb.save(WEIGHTS_PATH)

    # upload the full config file to wandb
    conf_pth = f"{config.run_name}.yaml"
    OmegaConf.save(config, f=conf_pth)
    logger.info(f"Saved config file {conf_pth}.")

    wandb.save(conf_pth)

    logger.info(f"Saved model {WEIGHTS_PATH}.")

    wandb.finish()
def analysis(STATE,
             method,
             method_kwargs,
             hyperparams_to_test,
             fig,
             spec,
             row,
             precomputed=False,
             separate=False,
             two_cols=False,
             NUM_STATES=1,
             configurations=None,
             default_cluster_num=5):
    #First, define appropriate paths
    SHAPE_PATH, FIGURE_PATH, RAW_DATA_PATH, INCOME_POPULATION_PATH = define_paths(
        STATE)

    #Load the data
    covid_, X, index_X, columns_X = load_data(RAW_DATA_PATH)

    #Do dim red
    print('##################D-RED#################')
    emb_method = method
    if not precomputed:
        errors_results, embeddings_results, trustws_results = choose_dimension(
            X, emb_method, hyperparams_to_test, **method_kwargs)

        save_obj(embeddings_results,
                 STATE + '_embeddings_results' + method.__name__)
        save_obj(errors_results, STATE + '_errors_results' + method.__name__)
        save_obj(trustws_results, STATE + '_trustws_result' + method.__name__)
    if precomputed:
        embeddings_results = load_obj(STATE + '_embeddings_results' +
                                      method.__name__)
        errors_results = load_obj(STATE + '_errors_results' + method.__name__)
        trustws_results = load_obj(STATE + '_trustws_result' + method.__name__)

    if (len(hyperparams_to_test['n_components']) >
            1) and (errors_results['n_components'][0] is not None):
        plt.plot(hyperparams_to_test['n_components'],
                 errors_results['n_components'])

    if (len(hyperparams_to_test['n_components']) > 1):
        kneedle = KneeLocator(hyperparams_to_test['n_components'],
                              np.array(trustws_results['n_components']),
                              S=1,
                              curve='concave',
                              direction='increasing',
                              interp_method='polynomial',
                              online=False)
        kneedle.plot_knee()
        plt.title(emb_method.__name__ + ' trustworthiness')
        plt.xlabel('n_components')
        plt.ylabel('trustworthiness')
        kneedle.knee, kneedle.knee_y

    #Save the dataframe with optimal dim
    if (len(hyperparams_to_test['n_components']) > 1):
        good_dim = int(
            np.squeeze(
                np.where(hyperparams_to_test['n_components'] == kneedle.knee)))
    else:
        good_dim = 0
    X_method = embeddings_results['n_components'][
        good_dim]  #pick the best (knee point) n_components
    X_method_df = pd.DataFrame(
        X_method,
        columns=['Mode {}'.format(i)
                 for i in range(X_method.shape[1])])  #, index = index_X)
    X_method_df.to_csv(
        os.path.join(
            configurations['DATA_PATH'], 'interim',
            method.__name__ + str(X_method.shape[1]) + 'D_' + STATE + '.csv'))
    print('Saving optimal embedding. Method: ', method.__name__, 'shape: ',
          X_method_df.shape)

    print('##################INITIAL VIZ#################')
    #Find the 2D and 3D embeddings and continuous colors based on that
    filename_initial = os.path.join(FIGURE_PATH, 'initial_' + method.__name__)
    if method.__name__ == 'Isomap':
        viz = viz_Isomap
    if method.__name__ == 'SpectralEmbedding':
        viz = viz_SE
    if method.__name__ == 'LocallyLinearEmbedding':
        viz = viz_LLE

    if precomputed:
        load_path = os.path.join('obj', STATE)
        save_path = None
    else:
        load_path = None
        save_path = os.path.join('obj', STATE)
    X_2D_emb, X_3D_emb = viz(X,
                             colors=None,
                             filename=filename_initial,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)
    cos_colors = find_cos_similarity(X_2D_emb)
    #Color the manifold continuously
    filename_initial_colored = os.path.join(
        FIGURE_PATH, 'initial_' + method.__name__ + '_colored')
    X_2D_emb, X_3D_emb = viz(X,
                             colors=cos_colors,
                             filename=filename_initial_colored,
                             cbar=None,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)

    print('##################GMM CLUSTERING#################')
    #Import R for clustering
    base = importr('base')
    mclust = importr('mclust')
    ro.r('set.seed(1)')

    if not precomputed:
        clusters, means, z, uncertainty = GMM_clustering_R(
            X_method_df, method, default_cluster_num=default_cluster_num
        )  # could set this to 5 to keep the cluster count consistent across states instead of auto-identifying it
        clusters_block_indexed = pd.Series(clusters, index=index_X)

        avg_per_clust = create_avg_df(clusters, index_X, covid_)

        reordered_clusters, reordered_means, reordered_z, reordered_uncertainty = relabel_clusters(
            clusters.astype('int'), avg_per_clust, means, z, uncertainty)
        reordered_avg_per_clust = create_avg_df(reordered_clusters, index_X,
                                                covid_)
        #Save
        np.save(
            os.path.join('obj', STATE + '_reordered_clusters.npy'),
            reordered_clusters,
        )
        reordered_means.to_csv(
            os.path.join('obj', STATE + '_reordered_means.csv'))
        reordered_z.to_csv(os.path.join('obj', STATE + '_reordered_z.csv'))
        np.save(os.path.join('obj', STATE + '_reordered_uncertainty.npy'),
                reordered_uncertainty)

        reordered_avg_per_clust.to_csv(
            os.path.join('obj', STATE + '_reordered_avg_per_clust.csv'))


    if precomputed:
        reordered_clusters = np.load(
            os.path.join('obj', STATE + '_reordered_clusters.npy'))
        reordered_means = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_means.csv'),
                                      index_col=0)
        reordered_z = pd.read_csv(os.path.join('obj',
                                               STATE + '_reordered_z.csv'),
                                  index_col=0)
        reordered_uncertainty = np.load(
            os.path.join('obj', STATE + '_reordered_uncertainty.npy'))
        reordered_avg_per_clust = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_avg_per_clust.csv'),
                                              index_col=0)

    #Save the data for Dennis (for only this method)
    index_with_blocks_and_save(STATE, X_method_df, X_2D_emb, X_3D_emb,
                               reordered_clusters, reordered_z,
                               reordered_uncertainty, index_X, emb_method)

    N_TIMESERIES = 5
    closest_to_mean_samples, closest_to_mean_block_ids = find_closest_time_series(
        X_method_df, reordered_means, covid_, index_X, n=N_TIMESERIES)

    print('##################FINAL VIZ#################')
    sns.set(style="whitegrid")
    if two_cols:
        reordered_clusters = cos_colors  #Change colors
    add_state_to_fig(STATE,
                     fig,
                     spec,
                     row,
                     NUM_STATES,
                     X,
                     reordered_clusters,
                     index_X,
                     reordered_avg_per_clust,
                     load_path=load_path,
                     save_path=save_path,
                     separate=separate,
                     two_cols=two_cols,
                     configurations=configurations)