def tfidf_from_questions(names, args, dictionary, dataroot='data', target=['rad']):
    """Build a row-normalised TF-IDF co-occurrence matrix from question text.

    Every question is tokenised with ``dictionary.tokenize(text, True)``; each
    unordered pair of tokens in a question contributes (row, col) entries to a
    sparse matrix. Entries are weighted by ``1 / df[col]`` and then each row is
    normalised so that its entries sum to one.

    Args:
        names: Iterable of split names; each must be ``'train'`` or ``'test'``.
            The file ``<dataroot>/<name>set.json`` is read for every name.
        args: Object exposing ``use_RAD`` and ``RAD_dir``. When ``use_RAD`` is
            truthy, ``RAD_dir`` replaces ``dataroot``.
        dictionary: Object providing ``__len__``, ``tokenize(text, True)`` and
            ``idx2word``. Presumably ``tokenize`` returns integer token ids and
            appends unseen words to the dictionary -- confirm against its
            implementation.
        dataroot: Directory holding the question files and ``glove/``.
        target: Collection of dataset tags; questions are only read when it
            contains ``'rad'``.

    Returns:
        A ``(tfidf, weights)`` tuple: ``tfidf`` is the coalesced sparse float
        tensor, ``weights`` is the first value returned by
        ``utils.create_glove_embedding_init`` for ``dictionary.idx2word[N:]``.

    Raises:
        AssertionError: If a name is not ``'train'`` or ``'test'``.
    """
    inds = [[], []]  # rows, cols of the uncoalesced sparse matrix
    df = dict()  # token id -> number of occurrences seen so far
    # Size of the dictionary *before* any question is tokenised; only token
    # ids below N are allowed to become rows.
    N = len(dictionary)
    if args.use_RAD:
        dataroot = args.RAD_dir

    def populate(inds, df, text):
        """Count the tokens of ``text`` and record its pairwise entries."""
        tokens = dictionary.tokenize(text, True)
        for t in tokens:
            df[t] = df.get(t, 0) + 1
        for first, second in itertools.combinations(tokens, 2):
            if first < N:
                inds[0].append(first)
                inds[1].append(second)
            if second < N:
                inds[0].append(second)
                inds[1].append(first)

    if 'rad' in target:
        for name in names:
            assert name in ['train', 'test']
            question_path = os.path.join(dataroot, name + 'set.json')
            # Context manager so the file handle is closed deterministically.
            with open(question_path) as question_file:
                questions = json.load(question_file)
            for question in questions:
                populate(inds, df, question['question'])

    # TF-IDF: one value per (row, col) entry, scaled by the column token's
    # frequency. The value list is parallel to ``inds``, so it must be indexed
    # by entry position, not by the token id stored in ``col``.
    vals = []
    for col in inds[1]:
        assert df[col] >= 1, 'document frequency should be greater than zero!'
        vals.append(1.0 / df[col])

    # Make stochastic matrix
    def normalize(inds, vals):
        """Divide every value by the sum of the values sharing its row."""
        z = dict()
        for row, val in zip(inds[0], vals):
            z[row] = z.get(row, 0) + val
        for idx, row in enumerate(inds[0]):
            vals[idx] /= z[row]
        return vals

    vals = normalize(inds, vals)

    tfidf = torch.sparse.FloatTensor(torch.LongTensor(inds), torch.FloatTensor(vals))
    # coalesce() sums the values of duplicate (row, col) entries.
    tfidf = tfidf.coalesce()

    # Latent word embeddings for the words at dictionary index N and above.
    emb_dim = 300
    glove_file = os.path.join(dataroot, 'glove', 'glove.6B.%dd.txt' % emb_dim)
    weights, _ = utils.create_glove_embedding_init(dictionary.idx2word[N:], glove_file)
    print('tf-idf stochastic matrix (%d x %d) is generated.' % (tfidf.size(0), tfidf.size(1)))

    return tfidf, weights
def tfidf_from_questions(names, dictionary, dataroot='data', target=['vqa', 'vg', 'cap']):
    """Build a row-normalised TF-IDF co-occurrence matrix from text corpora.

    Text is gathered from up to three sources selected by ``target``: VQA 2.0
    questions (``'vqa'``), Visual Genome questions (``'vg'``) and MSCOCO
    captions (``'cap'``). Each text is tokenised with
    ``dictionary.tokenize(text, True)``; every unordered pair of tokens in a
    text contributes (row, col) entries to a sparse matrix. Entries are
    weighted by ``1 / df[col]`` and each row is normalised to sum to one.

    Args:
        names: Iterable of VQA split names, each one of ``'train'``, ``'val'``,
            ``'test-dev2015'`` or ``'test2015'``. Only used for ``'vqa'``.
        dictionary: Object providing ``__len__``, ``tokenize(text, True)`` and
            ``idx2word``. Presumably ``tokenize`` returns integer token ids and
            appends unseen words to the dictionary -- confirm against its
            implementation.
        dataroot: Directory holding the VQA and Visual Genome question files.
        target: Collection of source tags to include.

    Returns:
        A ``(tfidf, weights)`` tuple: ``tfidf`` is the coalesced sparse float
        tensor, ``weights`` is the first value returned by
        ``utils.create_glove_embedding_init`` for ``dictionary.idx2word[N:]``.

    Raises:
        AssertionError: If a VQA split name is not one of the accepted values.
    """
    inds = [[], []]  # rows, cols of the uncoalesced sparse matrix
    df = dict()  # token id -> number of occurrences seen so far
    # Size of the dictionary *before* any text is tokenised; only token ids
    # below N are allowed to become rows.
    N = len(dictionary)

    def populate(inds, df, text):
        """Count the tokens of ``text`` and record its pairwise entries."""
        tokens = dictionary.tokenize(text, True)
        for t in tokens:
            df[t] = df.get(t, 0) + 1
        for first, second in itertools.combinations(tokens, 2):
            if first < N:
                inds[0].append(first)
                inds[1].append(second)
            if second < N:
                inds[0].append(second)
                inds[1].append(first)

    def load_json(path):
        """Read a JSON file, closing the handle when done."""
        with open(path, 'r') as json_file:
            return json.load(json_file)

    if 'vqa' in target:  # VQA 2.0
        for name in names:
            assert name in ['train', 'val', 'test-dev2015', 'test2015']
            # Test splits already carry their year; the others get '2014'.
            split = name if name[:4] == 'test' else name + '2014'
            question_path = os.path.join(
                dataroot, 'v2_OpenEnded_mscoco_%s_questions.json' % split)
            for question in load_json(question_path)['questions']:
                populate(inds, df, question['question'])

    if 'vg' in target:  # Visual Genome
        question_path = os.path.join(dataroot, 'question_answers.json')
        for vg in load_json(question_path):
            for q in vg['qas']:
                populate(inds, df, q['question'])

    if 'cap' in target:  # MSCOCO Caption
        for split in ['train2017', 'val2017']:
            # NOTE(review): this path is hard-coded under 'data/' and does not
            # follow ``dataroot``; kept as-is for compatibility.
            captions = load_json('data/annotations/captions_%s.json' % split)
            for caps in captions['annotations']:
                populate(inds, df, caps['caption'])

    # TF-IDF: one value per (row, col) entry, scaled by the column token's
    # frequency. The value list is parallel to ``inds``, so it must be indexed
    # by entry position, not by the token id stored in ``col``.
    vals = []
    for col in inds[1]:
        assert df[col] >= 1, 'document frequency should be greater than zero!'
        vals.append(1.0 / df[col])

    # Make stochastic matrix
    def normalize(inds, vals):
        """Divide every value by the sum of the values sharing its row."""
        z = dict()
        for row, val in zip(inds[0], vals):
            z[row] = z.get(row, 0) + val
        for idx, row in enumerate(inds[0]):
            vals[idx] /= z[row]
        return vals

    vals = normalize(inds, vals)

    tfidf = torch.sparse.FloatTensor(torch.LongTensor(inds), torch.FloatTensor(vals))
    # coalesce() sums the values of duplicate (row, col) entries.
    tfidf = tfidf.coalesce()

    # Latent word embeddings for the words at dictionary index N and above.
    emb_dim = 300
    # NOTE(review): hard-coded under 'data/' regardless of ``dataroot``.
    glove_file = 'data/glove/glove.6B.%dd.txt' % emb_dim
    weights, _ = utils.create_glove_embedding_init(dictionary.idx2word[N:], glove_file)
    print('tf-idf stochastic matrix (%d x %d) is generated.' % (tfidf.size(0), tfidf.size(1)))

    return tfidf, weights
def tfidf_from_questions_gqa(names, dictionary, dataroot='data', target=['gqa']):
    """Build a row-normalised TF-IDF co-occurrence matrix from GQA questions.

    Each question is tokenised with ``dictionary.tokenize(text, True)``; every
    unordered pair of tokens in a question contributes (row, col) entries to a
    sparse matrix. Entries are weighted by ``1 / df[col]`` and each row is
    normalised to sum to one.

    Args:
        names: Iterable of GQA split names (``'train_all'``,
            ``'train_balanced'``, ``'val_all'``, ``'val_balanced'``,
            ``'challenge_all'``, ``'challenge_balanced'``, ``'testdev_all'``,
            ``'testdev_balanced'``, ``'test_all'``, ``'test_balanced'``). The
            file ``<dataroot>/gqa/questions/<name>_questions.pkl`` is loaded
            for each name.
        dictionary: Object providing ``__len__``, ``tokenize(text, True)`` and
            ``idx2word``. Presumably ``tokenize`` returns integer token ids and
            appends unseen words to the dictionary -- confirm against its
            implementation.
        dataroot: Directory holding ``gqa/questions/`` and ``glove/``.
        target: Collection of dataset tags; questions are only read when it
            contains ``'gqa'``.

    Returns:
        A ``(tfidf, weights)`` tuple: ``tfidf`` is the coalesced sparse float
        tensor, ``weights`` is the first value returned by
        ``utils.create_glove_embedding_init`` for ``dictionary.idx2word[N:]``.

    Raises:
        AssertionError: If a name is not one of the accepted split names.
    """
    inds = [[], []]  # rows, cols of the uncoalesced sparse matrix
    df = dict()  # token id -> number of occurrences seen so far
    # Size of the dictionary *before* any question is tokenised; only token
    # ids below N are allowed to become rows.
    N = len(dictionary)

    def populate(inds, df, text):
        """Count the tokens of ``text`` and record its pairwise entries."""
        tokens = dictionary.tokenize(text, True)
        for t in tokens:
            df[t] = df.get(t, 0) + 1
        for first, second in itertools.combinations(tokens, 2):
            if first < N:
                inds[0].append(first)
                inds[1].append(second)
            if second < N:
                inds[0].append(second)
                inds[1].append(first)

    # GQA
    if 'gqa' in target:
        for name in names:
            assert name in ['train_all', 'train_balanced', 'val_all', 'val_balanced',
                            'challenge_all', 'challenge_balanced', 'testdev_all',
                            'testdev_balanced', 'test_all', 'test_balanced']
            # Every accepted split lives in '<name>_questions.pkl'.
            questions_path = os.path.join(
                dataroot, 'gqa', 'questions', name + '_questions.pkl')
            # SECURITY: pickle can execute arbitrary code while loading; only
            # point this at question files from a trusted source.
            with open(questions_path, 'rb') as questions_file:
                questions = pickle.load(questions_file)
            print(name)
            count = 0
            for count, question in enumerate(questions, 1):
                populate(inds, df, question['question'])
            print(count)

    # TF-IDF: one value per (row, col) entry, scaled by the column token's
    # frequency. The value array is parallel to ``inds``, so it must be indexed
    # by entry position, not by the token id stored in ``col``.
    vals = np.ones((len(inds[1])))
    for idx, col in enumerate(inds[1]):
        assert df[col] >= 1, 'document frequency should be greater than zero!'
        vals[idx] /= df[col]

    # Make stochastic matrix
    def normalize(inds, vals):
        """Divide every value by the sum of the values sharing its row."""
        z = dict()
        for row, val in zip(inds[0], vals):
            z[row] = z.get(row, 0) + val
        for idx, row in enumerate(inds[0]):
            vals[idx] /= z[row]
        return vals

    vals = normalize(inds, vals)

    tfidf = torch.sparse.FloatTensor(torch.LongTensor(inds), torch.FloatTensor(vals))
    # coalesce() sums the values of duplicate (row, col) entries.
    tfidf = tfidf.coalesce()

    # Latent word embeddings for the words at dictionary index N and above.
    emb_dim = 300
    glove_file = dataroot + '/glove/glove.6B.%dd.txt' % emb_dim
    weights, _ = utils.create_glove_embedding_init(dictionary.idx2word[N:], glove_file)
    print('tf-idf stochastic matrix (%d x %d) is generated.' % (tfidf.size(0), tfidf.size(1)))

    return tfidf, weights