Example #1
import os
import sys


def main():
    # Load data

    target_property = sys.argv[1]
    target_collection = sys.argv[2]
    dir_path = '../analysis/bin_distribution/'
    results_path = f'{dir_path}{target_property}.txt'
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    pairs_original = read_pairs(target_collection, source='original')
    pairs_resampled = read_pairs(target_collection, source='resampled')

    concepts_original = pairs_original[target_property]
    concepts_resampled = pairs_resampled[target_property]

    # sort data into bins
    set_info_dict = get_concepts_set(target_property, target_collection)
    general_bin_dict = load_general_bins()
    bin_dict_cosine = load_cosine_bins_prop(set_info_dict)
    general_bin_dict.update(bin_dict_cosine)

    # assign bin data to concepts in dataset
    set_bin_features = get_bin_feature_dict(general_bin_dict,
                                            set_info_dict.values())

    distribution_original = get_bin_distributions(general_bin_dict,
                                                  set_bin_features,
                                                  concepts_original)

    distribution_resampled = get_bin_distributions(general_bin_dict,
                                                   set_bin_features,
                                                   concepts_resampled)

    with open(results_path, 'w') as outfile:
        for name, d_original in distribution_original.items():
            outfile.write(f'\n{name}\n')
            outfile.write(
                'bin\toriginal (percent)\toriginal (absolute)\tresampled (percent)\tresampled (absolute)\n'
            )
            d_resampled = distribution_resampled[name]
            for b, percent_original in d_original.items():
                if b in d_resampled:
                    percent_resampled = d_resampled[b]
                else:
                    percent_resampled = (0, 0)
                outfile.write(
                    f'{b}\t{percent_original[0]}\t{percent_original[1]}\t{percent_resampled[0]}\t{percent_resampled[1]}\n'
                )

    print('Results written to:', results_path)
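For orientation, this example indexes the return value of read_pairs by property name, so the helper presumably returns a dict keyed by property. A toy stand-in with that shape (purely illustrative; not the real utils code):

def read_pairs(target_collection, source='original'):
    # Toy stand-in: the real helper loads this from the collection's files.
    return {
        'warm': ['fire', 'sun', 'ice'],
        'round': ['ball', 'wheel'],
    }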
Example #2
def load_lfw(batch_size=100):

    file_ext = 'jpg'  # note: no '.' before the extension

    dataset_path = './data/lfw'
    pairs_path = './data/pairs.txt'

    pairs = utils.read_pairs(pairs_path)
    path_list, issame_list = utils.get_paths(dataset_path, pairs, file_ext)

    print('==> Preparing data..')
    # Define data transforms
    RGB_MEAN = [0.485, 0.456, 0.406]
    RGB_STD = [0.229, 0.224, 0.225]
    test_transform = transforms.Compose([
        transforms.Scale((250, 250)),  # make 250x250
        transforms.CenterCrop(150),  # then take 150x150 center crop
        # resized to the network's required input size
        transforms.Scale((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ])

    # Create data loader
    test_loader = torch.utils.data.DataLoader(
        data_loader.LFWDataset(path_list, issame_list, test_transform),
        batch_size=batch_size,
        shuffle=False)

    return test_loader
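A minimal call site for the helper above (batch_size is the parameter assumed in this sketch; utils and data_loader are the project's own modules):

test_loader = load_lfw(batch_size=100)
for images in test_loader:
    print(images.size())  # e.g. torch.Size([100, 3, 224, 224])
    break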
Example #3
    def start_requests(self):
        cities = utils.read_pairs(self.cities).keys()

        if ',' in self.start_time:
            for d in self.start_time.split(','):
                start_time = datetime.datetime.strptime(d, '%Y-%m-%d')
                end_time = start_time + datetime.timedelta(days=1)
                for city_id in cities:
                    yield self._request(city_id=city_id,
                                        start_time=start_time,
                                        end_time=end_time)
        else:
            self.start_time = datetime.datetime.strptime(
                self.start_time, '%Y-%m-%d')
            if self.end_time:
                self.end_time = datetime.datetime.strptime(
                    self.end_time, '%Y-%m-%d')
            else:
                self.end_time = self.start_time + datetime.timedelta(days=1)

            delta = (self.end_time - self.start_time).days
            for offset in range(delta):
                start_time = self.start_time + datetime.timedelta(days=offset)
                end_time = start_time + datetime.timedelta(days=1)
                for city_id in cities:
                    yield self._request(city_id=city_id,
                                        start_time=start_time,
                                        end_time=end_time)
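Both branches above reduce to the same pattern: expand the requested dates into one-day [start, end) windows. A standalone check of that expansion using only the standard library:

import datetime

start = datetime.datetime.strptime('2021-03-01', '%Y-%m-%d')
end = datetime.datetime.strptime('2021-03-04', '%Y-%m-%d')
for offset in range((end - start).days):
    day = start + datetime.timedelta(days=offset)
    print(day.date(), '->', (day + datetime.timedelta(days=1)).date())
# 2021-03-01 -> 2021-03-02
# 2021-03-02 -> 2021-03-03
# 2021-03-03 -> 2021-03-04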
Example #4
    def get_data_dicts(self):
        data_dicts = []
        for coll in self.collections:
            # 'run' is assumed to be defined in the enclosing scope
            prop_data_dicts = utils.read_pairs(coll, run, source='test')
            for prop, dicts in prop_data_dicts.items():
                for d in dicts:
                    d['concept'] = d['lemma']
                    d.pop('lemma')
                    d['collection'] = coll
                    d['sources'] = 'test'
                data_dicts.extend(dicts)
        return data_dicts
Example #5
    def get_data_dicts(self):
        data_dicts = []
        for coll in self.collections:
            prop_data_dicts = read_pairs(coll)
            for prop, dicts in prop_data_dicts.items():
                for d in dicts:
                    d['concept'] = d['lemma']
                    d.pop('lemma')
                    d['collection'] = coll
                    d['sources'] = d['sources_str']
                    d.pop('sources_str')
                data_dicts.extend(dicts)
        return data_dicts
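Both get_data_dicts variants normalize each record in place by renaming the 'lemma' key to 'concept'. The same rename on its own, using pop to do it in one step:

d = {'lemma': 'dog', 'sources_str': 'wiki'}
d['concept'] = d.pop('lemma')
d['sources'] = d.pop('sources_str')
print(d)  # {'concept': 'dog', 'sources': 'wiki'}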
Example #6
    def __init__(self, data_path, c):
        import shutil

        self.data_path = data_path
        self.c = c
        self.output_path = _p.join('result', self.data_path)
        utils.mkdir_p(self.output_path)
        self.cgmdir = _p.join(self.data_path, 'cgm')
        self.spiketime_dir = _p.join(self.data_path, 'spiketime')

        single_data_file = 'statid_correlation_mu_sigma2_urate.dat'
        self.single_data = utils.read_table(
            _p.join(self.data_path, single_data_file))
        shutil.copyfile(_p.join(self.data_path, single_data_file),
                        _p.join(self.output_path, single_data_file))

        self.zero_firing = [i for i in self.single_data
                            if self.single_data[i]['urate'] < 1e-8]
        self.pairs = utils.read_pairs(self.cgmdir)
        self.pair_data = [{'id': p} for p in self.pairs]
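The zero_firing comprehension above selects units whose mean firing rate is numerically zero. The same filter on a toy dict of the structure implied by the read_table call (structure assumed, not taken from the actual utils module):

single_data = {
    'unit_a': {'urate': 0.0},
    'unit_b': {'urate': 3.2},
    'unit_c': {'urate': 5e-9},
}
zero_firing = [i for i in single_data if single_data[i]['urate'] < 1e-8]
print(zero_firing)  # ['unit_a', 'unit_c']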
Example #7
parser.add_argument('-sw', '--stopwords', dest='stop_words_file', help='Stop words file', metavar='<file>')
parser.add_argument('-idf', '--idffile', dest='idf_file', help='IDF file', metavar='<file>')
args = parser.parse_args()

input_file = args.input_file
output_file = args.output_file

stop_words_file = args.stop_words_file
stop_words = []
if stop_words_file:
    stop_words = utils.read_lines(stop_words_file)

idf_file = args.idf_file
idfs = {}
if idf_file:
    idfs = utils.read_pairs(idf_file, float)

scored_rows = []
field_names = []
with open(input_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=',')
    field_names = reader.fieldnames
    for row in reader:
        content = row['content']
        if 'after_user_comment' in row and len(row['after_user_comment']) > 0:
            content = content + '。' + row['after_user_comment']
        if 'answer_content' in row and len(row['answer_content']) > 0:
            content = content + '。' + row['answer_content']

        words = jieba.cut(content)
        words_set = set()
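In this example read_pairs takes a value converter (float), which suggests it parses one key/value pair per line into a dict. A minimal sketch under that assumption; the tab separator and exact signature are guesses, not the actual utils implementation:

def read_pairs(path, value_type=str):
    pairs = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')  # assumed tab-separated
            if len(parts) == 2:
                pairs[parts[0]] = value_type(parts[1])
    return pairs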
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--exp_name', default='lfw_eval')
    parser.add_argument('-g', '--gpu', type=int, default=0)
    parser.add_argument('-d', '--dataset_path', 
                        default='/srv/data1/arunirc/datasets/lfw-deepfunneled')
    parser.add_argument('--fold', type=int, default=0, choices=[0,10])
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('-m', '--model_path', default=None, required=True,
                        help='Path to pre-trained model')
    parser.add_argument('--model_type', default='resnet50',
                        choices=['resnet50', 'resnet101', 'resnet101-512d'])
    
    args = parser.parse_args()


    # CUDA setup
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    cuda = torch.cuda.is_available()
    torch.manual_seed(1337)
    if cuda:
        torch.cuda.manual_seed(1337)
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True # enable if all images are same size    

    if args.fold == 0:
        pairs_path = './lfw/data/pairsDevTest.txt'
    else:
        pairs_path = './lfw/data/pairs.txt'

    # -----------------------------------------------------------------------------
    # 1. Dataset
    # -----------------------------------------------------------------------------
    file_ext = 'jpg' # observe, no '.' before jpg
    num_class = 8631

    pairs = utils.read_pairs(pairs_path)
    path_list, issame_list = utils.get_paths(args.dataset_path, pairs, file_ext)

    # Define data transforms
    RGB_MEAN = [ 0.485, 0.456, 0.406 ]
    RGB_STD = [ 0.229, 0.224, 0.225 ]
    test_transform = transforms.Compose([
        transforms.Scale((250,250)),  # make 250x250
        transforms.CenterCrop(150),   # then take 150x150 center crop
        transforms.Scale((224,224)),  # resized to the network's required input size
        transforms.ToTensor(),
        transforms.Normalize(mean = RGB_MEAN,
                             std = RGB_STD),
    ])

    # Create data loader
    test_loader = torch.utils.data.DataLoader(
                        data_loader.LFWDataset(
                        path_list, issame_list, test_transform), 
                        batch_size=args.batch_size, shuffle=False )


    # -----------------------------------------------------------------------------
    # 2. Model
    # -----------------------------------------------------------------------------
    if args.model_type == 'resnet50':
        model = torchvision.models.resnet50(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101':
        model = torchvision.models.resnet101(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101-512d':
        model = torchvision.models.resnet101(pretrained=False)
        layers = []
        layers.append(torch.nn.Linear(2048, 512))
        layers.append(torch.nn.Linear(512, num_class))
        model.fc = torch.nn.Sequential(*layers)
    else:
        raise NotImplementedError
    
    checkpoint = torch.load(args.model_path)       

    if checkpoint['arch'] == 'DataParallel':
        # if we trained and saved our model using DataParallel
        model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4])
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.module # get network module from inside its DataParallel wrapper
    else:
        model.load_state_dict(checkpoint['model_state_dict'])

    if cuda:
        model = model.cuda()

    # Convert the trained network into a "feature extractor"
    feature_map = list(model.children())
    if args.model_type == 'resnet101-512d':
        model.eval()
        extractor = model
        extractor.fc = nn.Sequential(extractor.fc[0])
    else: 
        feature_map.pop()
        extractor = nn.Sequential(*feature_map)
    
    extractor.eval() # set to evaluation mode (fixes BatchNorm, dropout, etc.)


    # -----------------------------------------------------------------------------
    # 3. Feature extraction
    # -----------------------------------------------------------------------------
    features = []

    for batch_idx, images in tqdm.tqdm(enumerate(test_loader), 
                                        total=len(test_loader), 
                                        desc='Extracting features'): 
        x = Variable(images, volatile=True) # test-time memory conservation
        if cuda:
            x = x.cuda()
        feat = extractor(x)
        if cuda:
            feat = feat.data.cpu()
        else:
            feat = feat.data
        features.append(feat)

    features = torch.stack(features)
    sz = features.size()
    features = features.view(sz[0]*sz[1], sz[2])
    features = F.normalize(features, p=2, dim=1) # L2-normalize
    # TODO - cache features


    # -----------------------------------------------------------------------------
    # 4. Verification
    # -----------------------------------------------------------------------------
    num_feat = features.size()[0]
    feat_pair1 = features[np.arange(0,num_feat,2),:]
    feat_pair2 = features[np.arange(1,num_feat,2),:]
    feat_dist = (feat_pair1 - feat_pair2).norm(p=2, dim=1)
    feat_dist = feat_dist.numpy()

    # Eval metrics
    scores = -feat_dist
    gt = np.asarray(issame_list)
       
    if args.fold == 0:
        fig_path = osp.join(here, 
                args.exp_name + '_' + args.model_type + '_lfw_roc_devTest.png')
        roc_auc = sklearn.metrics.roc_auc_score(gt, scores)
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(gt, scores)
        print('ROC-AUC: %.04f' % roc_auc)
        # Plot and save ROC curve
        fig = plt.figure()
        plt.title('ROC - lfw dev-test')
        plt.plot(fpr, tpr, lw=2, label='ROC (auc = %0.4f)' % roc_auc)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.grid()
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.tight_layout()
    else:
        # 10 fold
        fold_size = 600 # 600 pairs in each fold
        roc_auc = np.zeros(10)
        roc_eer = np.zeros(10)

        fig = plt.figure()
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.grid()
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')

        for i in tqdm.tqdm(range(10)):
            start = i * fold_size
            end = (i+1) * fold_size
            scores_fold = scores[start:end]
            gt_fold = gt[start:end]
            roc_auc[i] = sklearn.metrics.roc_auc_score(gt_fold, scores_fold)
            fpr, tpr, _ = sklearn.metrics.roc_curve(gt_fold, scores_fold)
            # EER calc: https://yangcha.github.io/EER-ROC/
            roc_eer[i] = brentq(
                            lambda x: 1. - x - interpolate.interp1d(fpr, tpr)(x), 0., 1.)
            plt.plot(fpr, tpr, alpha=0.4, 
                    lw=2, color='darkgreen',
                    label='ROC(auc=%0.4f, eer=%0.4f)' % (roc_auc[i], roc_eer[i]) )

        plt.title( 'AUC: %0.4f +/- %0.4f, EER: %0.4f +/- %0.4f' % 
                    (np.mean(roc_auc), np.std(roc_auc),
                     np.mean(roc_eer), np.std(roc_eer)) )
        plt.tight_layout()

        fig_path = osp.join(here, 
                args.exp_name + '_' + args.model_type + '_lfw_roc_10fold.png')
        

    plt.savefig(fig_path, bbox_inches='tight')
    print('ROC curve saved at: ' + fig_path)
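The 10-fold branch computes the equal error rate by finding where the interpolated ROC curve satisfies TPR = 1 - FPR (per the linked post). A self-contained check of that recipe on synthetic scores (assumes numpy, scipy, and scikit-learn):

import numpy as np
import sklearn.metrics
from scipy.optimize import brentq
from scipy import interpolate

rng = np.random.RandomState(0)
scores = np.concatenate([rng.normal(1, 1, 300), rng.normal(-1, 1, 300)])
labels = np.concatenate([np.ones(300), np.zeros(300)])
fpr, tpr, _ = sklearn.metrics.roc_curve(labels, scores)
eer = brentq(lambda x: 1. - x - interpolate.interp1d(fpr, tpr)(x), 0., 1.)
print('EER: %.4f' % eer)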
Example #9
dataset_train = datasets.ImageFolder(TRAIN_PATH, train_transform)

# For unbalanced dataset we create a weighted sampler
#   * Balanced class sampling: https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3
weights = utils.make_weights_for_balanced_classes(dataset_train.imgs,
                                                  len(dataset_train.classes))
weights = torch.DoubleTensor(weights)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))
train_loader = torch.utils.data.DataLoader(dataset_train,
                                           batch_size=TRAIN_BATCH_SIZE,
                                           sampler=sampler,
                                           drop_last=True)
num_class = len(train_loader.dataset.classes)
print('Number of Training Classes: %d' % num_class)

pairs = utils.read_pairs(PAIR_TEXT_PATH)
path_list, issame_list = utils.get_paths(VAL_PATH, pairs, FILE_EXT)
val_loader = torch.utils.data.DataLoader(data_loader.LFWDataset(
    path_list, issame_list, val_transform),
                                         batch_size=VAL_BATCH_SIZE,
                                         shuffle=False)

#======= Model & Optimizer =======#
if MODEL_NAME.lower() == 'resnet18':
    model = torchvision.models.resnet18(pretrained=True)
elif MODEL_NAME.lower() == 'resnet34':
    model = torchvision.models.resnet34(pretrained=True)
elif MODEL_NAME.lower() == 'resnet50':
    model = torchvision.models.resnet50(pretrained=True)
elif MODEL_NAME.lower() == 'resnet101':
    model = torchvision.models.resnet101(pretrained=True)
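utils.make_weights_for_balanced_classes is the project's own helper; following the forum thread linked above, a common implementation gives each sample a weight inversely proportional to its class frequency, roughly:

def make_weights_for_balanced_classes(images, nclasses):
    # images: list of (path, class_index) pairs, as in ImageFolder.imgs
    count = [0] * nclasses
    for _, label in images:
        count[label] += 1
    total = float(sum(count))
    weight_per_class = [total / c for c in count]  # rarer class -> larger weight
    return [weight_per_class[label] for _, label in images]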
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--exp_name', default='lfw_eval')
    parser.add_argument('-g', '--gpu', type=int, default=0)
    parser.add_argument('-d',
                        '--dataset_path',
                        default='/srv/data1/arunirc/datasets/lfw-deepfunneled')
    parser.add_argument('--fold', type=int, default=0, choices=[0, 10])
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('-m',
                        '--model_path',
                        default=None,
                        required=True,
                        help='Path to pre-trained model')
    parser.add_argument('--model_type',
                        default='resnet50',
                        choices=['resnet50', 'resnet101', 'resnet101-512d'])

    args = parser.parse_args()

    # CUDA setup
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    cuda = torch.cuda.is_available()
    torch.manual_seed(1337)
    if cuda:
        torch.cuda.manual_seed(1337)
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True  # enable if all images are same size

    if args.fold == 0:
        pairs_path = './lfw/data/pairsDevTest.txt'
    else:
        pairs_path = './lfw/data/pairs.txt'

    # -----------------------------------------------------------------------------
    # 1. Dataset
    # -----------------------------------------------------------------------------
    file_ext = 'jpg'  # observe, no '.' before jpg
    num_class = 8631

    pairs = utils.read_pairs(pairs_path)
    path_list, issame_list = utils.get_paths(args.dataset_path, pairs,
                                             file_ext)

    # Define data transforms
    RGB_MEAN = [0.485, 0.456, 0.406]
    RGB_STD = [0.229, 0.224, 0.225]
    test_transform = transforms.Compose([
        transforms.Scale((250, 250)),  # make 250x250
        transforms.CenterCrop(150),  # then take 150x150 center crop
        transforms.Scale(
            (224, 224)),  # resized to the network's required input size
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ])

    # Create data loader
    test_loader = torch.utils.data.DataLoader(data_loader.LFWDataset(
        path_list, issame_list, test_transform),
                                              batch_size=args.batch_size,
                                              shuffle=False)

    # -----------------------------------------------------------------------------
    # 2. Model
    # -----------------------------------------------------------------------------
    if args.model_type == 'resnet50':
        model = torchvision.models.resnet50(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101':
        model = torchvision.models.resnet101(pretrained=False)
        model.fc = torch.nn.Linear(2048, num_class)
    elif args.model_type == 'resnet101-512d':
        model = torchvision.models.resnet101(pretrained=False)
        layers = []
        layers.append(torch.nn.Linear(2048, 512))
        layers.append(torch.nn.Linear(512, num_class))
        model.fc = torch.nn.Sequential(*layers)
    else:
        raise NotImplementedError

    checkpoint = torch.load(args.model_path)

    if checkpoint['arch'] == 'DataParallel':
        # if we trained and saved our model using DataParallel
        model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4])
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.module  # get network module from inside its DataParallel wrapper
    else:
        model.load_state_dict(checkpoint['model_state_dict'])

    if cuda:
        model = model.cuda()

    # Convert the trained network into a "feature extractor"
    feature_map = list(model.children())
    if args.model_type == 'resnet101-512d':
        model.eval()
        extractor = model
        extractor.fc = nn.Sequential(extractor.fc[0])
    else:
        feature_map.pop()
        extractor = nn.Sequential(*feature_map)

    extractor.eval()  # set to evaluation mode (fixes BatchNorm, dropout, etc.)

    # -----------------------------------------------------------------------------
    # 3. Feature extraction
    # -----------------------------------------------------------------------------
    features = []

    for batch_idx, images in tqdm.tqdm(enumerate(test_loader),
                                       total=len(test_loader),
                                       desc='Extracting features'):
        x = Variable(images, volatile=True)  # test-time memory conservation
        if cuda:
            x = x.cuda()
        feat = extractor(x)
        if cuda:
            feat = feat.data.cpu()
        else:
            feat = feat.data
        features.append(feat)

    features = torch.stack(features)
    sz = features.size()
    features = features.view(sz[0] * sz[1], sz[2])
    features = F.normalize(features, p=2, dim=1)  # L2-normalize
    # TODO - cache features

    # -----------------------------------------------------------------------------
    # 4. Verification
    # -----------------------------------------------------------------------------
    num_feat = features.size()[0]
    feat_pair1 = features[np.arange(0, num_feat, 2), :]
    feat_pair2 = features[np.arange(1, num_feat, 2), :]
    feat_dist = (feat_pair1 - feat_pair2).norm(p=2, dim=1)
    feat_dist = feat_dist.numpy()

    # Eval metrics
    scores = -feat_dist
    gt = np.asarray(issame_list)

    if args.fold == 0:
        fig_path = osp.join(
            here,
            args.exp_name + '_' + args.model_type + '_lfw_roc_devTest.png')
        roc_auc = sklearn.metrics.roc_auc_score(gt, scores)
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(gt, scores)
        print('ROC-AUC: %.04f' % roc_auc)
        # Plot and save ROC curve
        fig = plt.figure()
        plt.title('ROC - lfw dev-test')
        plt.plot(fpr, tpr, lw=2, label='ROC (auc = %0.4f)' % roc_auc)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.grid()
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.tight_layout()
    else:
        # 10 fold
        fold_size = 600  # 600 pairs in each fold
        roc_auc = np.zeros(10)
        roc_eer = np.zeros(10)

        fig = plt.figure()
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.grid()
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')

        for i in tqdm.tqdm(range(10)):
            start = i * fold_size
            end = (i + 1) * fold_size
            scores_fold = scores[start:end]
            gt_fold = gt[start:end]
            roc_auc[i] = sklearn.metrics.roc_auc_score(gt_fold, scores_fold)
            fpr, tpr, _ = sklearn.metrics.roc_curve(gt_fold, scores_fold)
            # EER calc: https://yangcha.github.io/EER-ROC/
            roc_eer[i] = brentq(
                lambda x: 1. - x - interpolate.interp1d(fpr, tpr)(x), 0., 1.)
            plt.plot(fpr,
                     tpr,
                     alpha=0.4,
                     lw=2,
                     color='darkgreen',
                     label='ROC(auc=%0.4f, eer=%0.4f)' %
                     (roc_auc[i], roc_eer[i]))

        plt.title('AUC: %0.4f +/- %0.4f, EER: %0.4f +/- %0.4f' %
                  (np.mean(roc_auc), np.std(roc_auc), np.mean(roc_eer),
                   np.std(roc_eer)))
        plt.tight_layout()

        fig_path = osp.join(
            here,
            args.exp_name + '_' + args.model_type + '_lfw_roc_10fold.png')

    plt.savefig(fig_path, bbox_inches='tight')
    print('ROC curve saved at: ' + fig_path)