def main():
    """Smoke-test loading: iterate every aligned training chunk once.

    Builds a ``TwitterDatasetChunk`` + ``DataLoader`` per file quadruple and
    drains all batches, so any malformed chunk fails loudly.
    """
    # Local import: the original body called pprint without importing it.
    from pprint import pprint
    from src.config import load_parser

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    # Skip the (very long) per-file lists when echoing the configuration.
    args_to_print = {name: args[name] for name in args if "files" not in name}
    pprint(args_to_print)
    pprint(unknown)

    # FIX: the headers were previously left undefined (NameError) whenever the
    # corresponding argument was falsy, yet were passed to the dataset below.
    data_header = None
    if args['data_header']:
        with open(args['data_header']) as f:
            data_header = f.readlines()[0].strip().split(",")
    label_header = None
    if args['label_header']:
        with open(args['label_header']) as f:
            label_header = f.readlines()[0].strip().split(",")

    train_data_files = args['train_data_files']
    train_image_files = args['train_image_files']
    train_text_files = args['train_text_files']
    train_label_files = args['train_label_files']
    key = args['key']
    user_size = args['user_size']
    text_size = args['text_size']
    image_size = args['image_size']
    dummy_user_vector = args['dummy_user_vector']
    shuffle = args['shuffle']
    batch_size = args['batch_size']
    num_workers = args['num_workers']

    # FIX: the original assigned into slices (`train_data_files[6:], ... =`),
    # which kept the first six possibly non-overlapping entries and destroyed
    # the cross-modality alignment. Rebind the full lists instead, matching
    # the other entry points in this file.
    train_data_files, train_image_files, train_text_files, train_label_files = \
        get_overlapping_data_files(train_data_files, train_image_files,
                                   train_text_files, train_label_files)

    for train_data_file, train_image_file, train_text_file, train_label_file in tqdm(
            zip(train_data_files, train_image_files, train_text_files,
                train_label_files),
            desc="files"):
        dataset = TwitterDatasetChunk(
            data_file=train_data_file,
            image_file=train_image_file,
            text_file=train_text_file,
            label_file=train_label_file,
            key=key,
            data_header=data_header,
            label_header=label_header,
            user_size=user_size,
            text_size=text_size,
            image_size=image_size,
            dummy_user_vector=dummy_user_vector)
        # Use the locals extracted above (they were previously unused).
        dataloader = DataLoader(dataset,
                                batch_size=batch_size,
                                shuffle=shuffle,
                                num_workers=num_workers)
        # Drain the loader; we only care that every batch loads cleanly.
        for batch in tqdm(dataloader, desc='dataloader'):
            pass
def main():
    """Stream every aligned training chunk through ``TwitterDataloader``.

    End-to-end pass over the multi-file loader; prints a completion message
    once all files have been consumed.
    """
    # Local import: the original body called pprint without importing it.
    from pprint import pprint
    from src.config import load_parser

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    # Skip the (very long) per-file lists when echoing the configuration.
    args_to_print = {name: args[name] for name in args if "files" not in name}
    pprint(args_to_print)
    pprint(unknown)

    # FIX: the headers were previously left undefined (NameError) whenever the
    # corresponding argument was falsy, yet were passed to the loader below.
    data_header = None
    if args['data_header']:
        with open(args['data_header']) as f:
            data_header = f.readlines()[0].strip().split(",")
    label_header = None
    if args['label_header']:
        with open(args['label_header']) as f:
            label_header = f.readlines()[0].strip().split(",")

    train_data_files = args['train_data_files']
    train_image_files = args['train_image_files']
    train_text_files = args['train_text_files']
    train_label_files = args['train_label_files']
    key = args['key']
    user_size = args['user_size']
    text_size = args['text_size']
    image_size = args['image_size']
    dummy_user_vector = args['dummy_user_vector']
    shuffle = args['shuffle']
    batch_size = args['batch_size']
    num_workers = args['num_workers']

    # Keep only the chunks present in all four modalities.
    train_data_files, train_image_files, train_text_files, train_label_files = \
        get_overlapping_data_files(train_data_files, train_image_files,
                                   train_text_files, train_label_files)

    data_loader = TwitterDataloader(
        data_files=train_data_files,
        image_files=train_image_files,
        text_files=train_text_files,
        label_files=train_label_files,
        key=key,
        data_header=data_header,
        label_header=label_header,
        user_size=user_size,
        text_size=text_size,
        image_size=image_size,
        dummy_user_vector=dummy_user_vector,
        shuffle=shuffle,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    # Drain the loader; we only care that every batch loads cleanly.
    for batch in tqdm(data_loader):
        pass
    print("Finished going through all files...")
def main():
    """Instantiate a ``UserModel`` from command-line arguments.

    Construction alone validates that the size arguments are consistent.
    """
    # NOTE: removed an unused local `from pprint import pprint` import.
    from src.config import load_parser

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    model = UserModel(user_size=args['user_size'],
                      hidden_size=args['hidden_size'],
                      joint_embedding_size=args['joint_embedding_size'])
def main():
    """Instantiate a ``ContentModel`` from command-line arguments.

    Construction alone validates that the size arguments are consistent.
    """
    # NOTE: removed an unused local `from pprint import pprint` import.
    from src.config import load_parser

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    model = ContentModel(
        image_embed_size=args['image_size'],
        text_embed_size=args['text_size'],
        hidden_size=args['hidden_size'],
        joint_embedding_size=args['joint_embedding_size'])
def main():
    """Check that every validation chunk yields a non-empty dataset.

    Raises:
        ValueError: if any chunk produces an empty ``TwitterDatasetChunk``.
    """
    from src.config import load_parser

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    # FIX: the headers were previously left undefined (NameError) whenever the
    # corresponding argument was falsy, yet were passed to the dataset below.
    data_header = None
    if args['data_header']:
        with open(args['data_header']) as f:
            data_header = f.readlines()[0].strip().split(",")
    label_header = None
    if args['label_header']:
        with open(args['label_header']) as f:
            label_header = f.readlines()[0].strip().split(",")

    data_files = args['valid_data_files']
    image_files = args['valid_image_files']
    text_files = args['valid_text_files']
    label_files = args['valid_label_files']
    key = args['key']
    user_size = args['user_size']
    text_size = args['text_size']
    image_size = args['image_size']
    dummy_user_vector = args['dummy_user_vector']

    for data_file, image_file, text_file, label_file in tqdm(
            zip(data_files, image_files, text_files, label_files)):
        dataset = TwitterDatasetChunk(
            data_file=data_file,
            image_file=image_file,
            text_file=text_file,
            label_file=label_file,
            key=key,
            data_header=data_header,
            label_header=label_header,
            user_size=user_size,
            text_size=text_size,
            image_size=image_size,
            dummy_user_vector=dummy_user_vector)
        if not len(dataset):
            # FIX: replaced a leftover interactive `ipdb.set_trace()` debugger
            # hook with an explicit failure naming the offending chunk.
            raise ValueError("Empty dataset for chunk: %s" % data_file)
def main():
    """Train the selected model (user-only, content-only, or joint feature).

    Parses command-line arguments, loads optional CSV headers, aligns the
    train/valid chunk files across modalities, builds the requested model on
    CPU or CUDA, and hands everything to ``Trainer.train``.

    Raises:
        NotImplementedError: if both ``--user_only`` and ``--content_only``
            are set (the combination is meaningless).
    """
    from pprint import pprint
    from src.config import load_parser

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    # Skip the (very long) per-file lists when echoing the configuration.
    args_to_print = {name: args[name] for name in args if "files" not in name}
    pprint(args_to_print)
    pprint(unknown)

    # FIX: the headers were previously left undefined (NameError) whenever the
    # corresponding argument was falsy, yet were passed to the Trainer below.
    data_header = None
    if args['data_header']:
        with open(args['data_header']) as f:
            data_header = f.readlines()[0].strip().split(",")
    label_header = None
    if args['label_header']:
        with open(args['label_header']) as f:
            label_header = f.readlines()[0].strip().split(",")

    train_data_files = args['train_data_files']
    train_image_files = args['train_image_files']
    train_text_files = args['train_text_files']
    train_label_files = args['train_label_files']
    valid_data_files = args['valid_data_files']
    valid_image_files = args['valid_image_files']
    valid_text_files = args['valid_text_files']
    valid_label_files = args['valid_label_files']

    key = args['key']
    seed = args['seed']
    user_size = args['user_size']
    text_size = args['text_size']
    image_size = args['image_size']
    dummy_user_vector = args['dummy_user_vector']
    shuffle = args['shuffle']
    batch_size = args['batch_size']
    num_workers = args['num_workers']
    micro_lambda = args['micro_lambda']
    macro_lambda = args['macro_lambda']
    max_epoch = args['epochs']
    log_dir = args['log_dir']
    checkpoint_file = args['checkpoint']
    verbosity = args['verbosity']
    save_frequency = args['save_frequency']

    # Keep only the chunks present in all four modalities, for both splits.
    # (The original tested `align_files` twice with identical guards; merged.)
    if args['align_files']:
        train_data_files, train_image_files, train_text_files, train_label_files = \
            get_overlapping_data_files(train_data_files, train_image_files,
                                       train_text_files, train_label_files)
        valid_data_files, valid_image_files, valid_text_files, valid_label_files = \
            get_overlapping_data_files(valid_data_files, valid_image_files,
                                       valid_text_files, valid_label_files)

    device = torch.device(
        "cpu" if (args['no_cuda'] or not torch.cuda.is_available()) else "cuda")

    # Model selection: imports are local so only the chosen module is loaded.
    if args['user_only'] and args['content_only']:
        raise NotImplementedError("What does user_only and content_only mean?")
    elif args['user_only']:
        from src.models.user_model import UserModel
        model = UserModel(user_size=args['user_size'],
                          hidden_size=args['hidden_size'],
                          joint_embedding_size=args['joint_embedding_size'])
    elif args['content_only']:
        from src.models.content_model import ContentModel
        model = ContentModel(image_embed_size=args['image_size'],
                             text_embed_size=args['text_size'],
                             hidden_size=args['hidden_size'],
                             joint_embedding_size=args['joint_embedding_size'])
    else:
        from src.models.feature_model import FeatureModel
        model = FeatureModel(user_size=args['user_size'],
                             image_embed_size=args['image_size'],
                             text_embed_size=args['text_size'],
                             hidden_size=args['hidden_size'],
                             joint_embedding_size=args['joint_embedding_size'])

    if (torch.cuda.device_count() > 1) and args['all_gpu']:
        print("Using %d GPUS!" % torch.cuda.device_count())
        model = nn.DataParallel(model)
    model = model.to(device)
    print(model)

    optimizer = optim.Adam(model.parameters(), lr=args['learning_rate'])

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        device=device,
        key=key,
        data_header=data_header,
        label_header=label_header,
        user_size=user_size,
        text_size=text_size,
        image_size=image_size,
        dummy_user_vector=dummy_user_vector,
        user_only=args['user_only'],
        content_only=args['content_only'],
        seed=seed,
        micro_lambda=micro_lambda,
        macro_lambda=macro_lambda,
        max_epoch=max_epoch,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        log_dir=log_dir,
        checkpoint_file=checkpoint_file,
        verbosity=verbosity,
        save_frequency=save_frequency,
    )
    trainer.train(train_data_files, train_image_files, train_text_files,
                  train_label_files, valid_data_files, valid_image_files,
                  valid_text_files, valid_label_files)