def main():
    arg_helper = CmdArgumentsHelper()
    arg_helper.add_argument('query', 'q', 'query', 1)
    arg_helper.add_argument('root_dir', 'r', 'root', 1)
    arg_helper.add_argument('division_id', 'd', 'division', 1)
    args = arg_helper.read_arguments()
    print(args)

    # Keyword used to query Flickr and the single search division to download.
    query_string = args['query']
    division_id = int(args['division_id'])

    # Prepare the per-query config and PCDB files, then download the requested
    # division.
    dbHelper = DBHelper()
    dbHelper.init(args['root_dir'])
    dbHelper.gen_config_file(query_string)
    configFileName = dbHelper.get_config_filepath(query_string)
    dbHelper.genPCDBFile(query_string)
    pcdbFileName = dbHelper.getPCDBPath(query_string)
    outputDir = dbHelper.rawdataDir
    downloadQuery(query_string, configFileName, pcdbFileName, outputDir,
                  division_id)
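
# A minimal usage sketch. The script filename and the flag spellings are
# assumptions inferred from the add_argument() calls above (short name / long
# name); they are not confirmed elsewhere in this file:
#
#   python download_single_division.py --query sunset --root /data/flickr --division 3
#
# Standard entry-point guard, assuming this main() is the script's top level:
if __name__ == '__main__':
    main()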
def main():
    arg_helper = CmdArgumentsHelper()
    arg_helper.add_argument('query', 'q', 'query', 1)
    arg_helper.add_argument('root_dir', 'r', 'root', 1)
    args = arg_helper.read_arguments()
    print(args)

    # Load keyword for querying Flickr.
    query_string = args['query']

    # Prepare the per-query config and PCDB files.
    dbHelper = DBHelper()
    dbHelper.init(args['root_dir'])
    dbHelper.gen_config_file(query_string)
    configFileName = dbHelper.get_config_filepath(query_string)
    dbHelper.genPCDBFile(query_string)
    pcdbFileName = dbHelper.getPCDBPath(query_string)
    outputDir = dbHelper.rawdataDir

    # Split the search further into config.numThreadPerDivisions divisions and
    # download the data for each division sequentially. Integer division keeps
    # the result usable by range() under Python 3.
    total_division = config.numSearchDivisions // config.numThreadPerDivisions
    for division_id in range(total_division):
        downloadQuery(query_string, configFileName, pcdbFileName, outputDir,
                      division_id)
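
# Illustrative check of the division bookkeeping above, using hypothetical
# config values (64 search divisions split across 4 threads per division);
# these numbers do not come from the project's real config module:
def _division_split_example():
    numSearchDivisions, numThreadPerDivisions = 64, 4
    total_division = numSearchDivisions // numThreadPerDivisions
    assert total_division == 16
    assert list(range(total_division))[:3] == [0, 1, 2]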
def main():
    arg_helper = CmdArgumentsHelper()
    arg_helper.add_argument('query', 'q', 'query', 1)
    arg_helper.add_argument('root_dir', 'r', 'root', 1)
    arg_helper.add_argument('stats_dir', 's', 'stats', 1)
    arg_helper.add_argument('knowledge_dir', 'k', 'knowledge', 1)
    arg_helper.add_argument('min_num_images', 'n', 'min_num_images', 1)
    args = arg_helper.read_arguments()
    print(args)

    root = args['root_dir']
    min_num_images = int(args['min_num_images'])
    query = args['query']
    stats_dir = args['stats_dir']
    knowledge_dir = args['knowledge_dir']

    # Settings from the config file.
    argv = cfg.vars
    numberOfThreads = int(argv["numberOfThreads"])
    save_description = "{}_{}".format(query, min_num_images)

    # If do_skip is True, items for which data was previously generated are
    # skipped. Be careful enabling it: only do so if you are certain the data
    # will not change. If you remove photos from or add photos to the image id
    # list, set it to False so that all related data is regenerated.
    do_skip = True

    # Find the most frequent concepts. The concept list is saved to a file.
    concept_dir = os.path.join(root, "concepts")
    if not os.path.exists(concept_dir):
        os.mkdir(concept_dir)
    concept_file = os.path.join(
        concept_dir, '{}_owner_per_concept.txt'.format(save_description))
    from database_builder.get_photo_meta_multiprocess import find_vocabulary
    tic()
    find_vocabulary(root, stats_dir, query, min_num_images, save_description)
    toc()
    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1])))
                        for x in [t.split('\t') for t in f.read().split('\n')]]
    all_concepts_list, all_concepts_freq = zip(*all_concepts)
    spio.savemat(concept_file[:-3] + 'mat', {'concepts': all_concepts_list})

    # Remove vocabulary entries that match certain filter criteria.
    filter_vocab_dir = os.path.join(root, 'filter_lists')
    all_concepts = filter_vocabulary(filter_vocab_dir, all_concepts)
    save_description = "{}_{}_extended".format(query, min_num_images)
    concept_file = os.path.join(
        concept_dir,
        '{}_filtered_owner_per_concept.txt'.format(save_description))
    with open(concept_file, 'w') as f:
        all_concepts_str = ["{}\t{}".format(t[0], t[1]) for t in all_concepts]
        f.write("\n".join(all_concepts_str))
    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1])))
                        for x in [t.split('\t') for t in f.read().split('\n')]]

    # Break concatenated word pairs.
    all_concepts = break_pairs(all_concepts)
    concept_file = os.path.join(
        concept_dir, '{0}_split_owner_per_concept.txt'.format(save_description))
    with open(concept_file, 'w') as f:
        all_concepts_str = ["{}\t{}".format(t[0], t[1]) for t in all_concepts]
        f.write("\n".join(all_concepts_str))
    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1])))
                        for x in [t.split('\t') for t in f.read().split('\n')]]

    # Find approximate statistics for concept pairs, used for the initial
    # vocabulary expansion.
    web_dir = os.path.join(root, 'output')
    if not os.path.exists(web_dir):
        os.mkdir(web_dir)
    from database_builder.get_photo_meta_multiprocess import (
        find_approximate_concept_pairs)
    tic()
    find_approximate_concept_pairs(root, stats_dir, query, save_description,
                                   all_concepts)
    toc()

    # Expand the vocabulary.
    all_concepts = extend_vocabulary(root, stats_dir, knowledge_dir,
                                     all_concepts, query, min_num_images,
                                     save_description, do_skip)
    concept_file = os.path.join(
        concept_dir, '{}_owner_per_concept.txt'.format(save_description))
    with open(concept_file, 'w') as f:
        all_concepts_str = ["{}\t{}".format(t[0], t[1]) for t in all_concepts]
        f.write("\n".join(all_concepts_str))
    with open(concept_file, 'r') as f:
        all_concepts = [(x[0], int(float(x[1])))
                        for x in [t.split('\t') for t in f.read().split('\n')]
                        if len(x) > 1]
    all_concepts = merge_pairs(all_concepts)
    all_concepts_list, all_concepts_freq = zip(*all_concepts)
    spio.savemat(concept_file[:-3] + 'mat', {'concepts': all_concepts_list})

    # Recount tag co-occurrence with the final vocabulary.
    from database_builder.get_photo_meta_multiprocess import find_concept_pairs
    # if total_new_concepts > 0:
    tic()
    web_dir = os.path.join(root, 'output')
    if not os.path.exists(web_dir):
        os.mkdir(web_dir)
    find_concept_pairs(root, stats_dir, web_dir, query, all_concepts)
    toc()

    # Process structured knowledge.
    from structured_knowledge.parse_cache import parse_cache
    from structured_knowledge.download_structured_knowledge import (
        download_structured_knowledge)
    download_structured_knowledge(knowledge_dir, all_concepts_list, do_skip)
    parse_cache(knowledge_dir, "ConceptNet", all_concepts, save_description,
                stats_dir)
    parse_cache(knowledge_dir, "Freebase", all_concepts, save_description,
                stats_dir)

    # Generate adjacency matrices.
    from structured_knowledge.build_adjacency_matrices import (
        build_adjacency_matrices)
    build_adjacency_matrices(knowledge_dir, stats_dir, all_concepts_list,
                             save_description)

    from database_builder.gen_concept_structure import task_gen_synonym_mask
    from database_builder.gen_concept_structure import task_gen_lemma_mask
    task_gen_synonym_mask(all_concepts_list, stats_dir, save_description)
    task_gen_lemma_mask(all_concepts_list, stats_dir, save_description)

    from structured_knowledge.parts_of_speech import parse_language
    from structured_knowledge.parts_of_speech import parse_proper_nouns
    from structured_knowledge.parts_of_speech import parse_parts_of_speech
    from structured_knowledge.parse_object_scene import parse_object_concepts
    from structured_knowledge.parse_object_scene import parse_scene_concepts
    parse_language(all_concepts_list, stats_dir, save_description)
    parse_proper_nouns(all_concepts_list, stats_dir, save_description)
    parse_parts_of_speech(all_concepts_list, knowledge_dir, stats_dir,
                          save_description)
    parse_scene_concepts(knowledge_dir, stats_dir, all_concepts_list,
                         save_description)
    parse_object_concepts(knowledge_dir, stats_dir, all_concepts_list,
                          save_description)
    gen_phrase_mask(all_concepts_list, stats_dir, save_description)

    # Extract GloVe features for the final vocabulary. Raw strings keep the
    # Windows paths free of accidental escape sequences.
    from database_builder.get_vocab_features import get_glove
    print("Start GloVe Feature")
    model_file = r'E:\data\GloVe\glove.42B.300d.txt'
    save_model_file = ''
    dim = 300
    save_feature_file = (
        r'E:\data\Iconic\data\word2vec_features'
        r'\{}_extended_feature_glove.42B.300d.mat').format(save_description)
    get_glove(dim, model_file, save_model_file, save_feature_file, concept_file)
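
# The concept files read and written repeatedly in main() above are plain
# tab-separated text with one "<concept>\t<owner count>" pair per line. Below
# is a minimal sketch of helpers that factor out that repeated pattern; these
# functions are illustrative only and are not part of the original pipeline:
def read_concept_file(path):
    """Read (concept, count) pairs from a tab-separated concept file."""
    concepts = []
    with open(path, 'r') as f:
        for line in f.read().split('\n'):
            parts = line.split('\t')
            if len(parts) > 1:  # skip blank or trailing lines
                concepts.append((parts[0], int(float(parts[1]))))
    return concepts


def write_concept_file(path, concepts):
    """Write (concept, count) pairs, one tab-separated pair per line."""
    with open(path, 'w') as f:
        f.write("\n".join("{}\t{}".format(c, n) for c, n in concepts))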