# -*- coding: utf-8 -*-
# Tag normalization and WordNet/ConceptNet vocabulary tools (Python 2).
# Helpers referenced below (norm_tag, map_wn2nltk, wn_get_hyponym,
# get_wnet_tags, options_get_wnet_tag, accumulate_bg, sort_bg,
# binary_mutual_info, print_synset_info, cache_flickr_info, FLICKR_KEY_FILE,
# and the ConceptNet Django models `cm` / `en`) are assumed to be defined or
# imported elsewhere in this repo.
import os
import codecs
import random
import sqlite3
from datetime import datetime
from itertools import izip
from operator import itemgetter
from optparse import OptionParser

import numpy as np
from scipy import io
import divisi2


def get_conceptnet_words(db_file, addl_vocab=[]):
    A = divisi2.network.conceptnet_matrix('en')
    wa = A.row_labels
    #wb = A.col_labels
    #all_t2 = map(lambda s: s[2], list(wb))
    #ww = filter(lambda s: s==s.split()[0], wa)  # keep 1-word terms only
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    wd = {}
    cn_vocab = {}
    null_count = 0
    for w in wa:
        ww = w.split()
        vv = map(lambda s: norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        wd[w] = vv
        null_count += (not vv)
        if vv:
            for v in vv:
                if v not in cn_vocab:
                    cn_vocab[v] = 1
    #wd = dict(izip(ww, vv))
    conn.close()
    cn_vocab = cn_vocab.keys()
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s mapped %d concept-net nodes, %d empty" % (tt, len(wa), null_count)
    return wd, cn_vocab
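
# Usage sketch for get_conceptnet_words(): the db path below is hypothetical;
# any dictionary db with the schema norm_tag() expects will do.
def _demo_conceptnet_vocab(db_dict='db/dict.db'):
    wd, cn_vocab = get_conceptnet_words(db_dict)
    print "%d ConceptNet nodes -> %d normalized vocabulary words" % (
        len(wd), len(cn_vocab))
    return cn_vocab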
def compile_synset_wordlist(w, cw, wn_cnt=None, db_wn=None, db_dict=None,
                            addl_vocab=None, tag_count=None):
    dbcn = sqlite3.connect(db_dict)  # dictionary db
    dbcsr = dbcn.cursor()
    conn = sqlite3.connect(db_wn)  # wnet tag info db
    cursor = conn.cursor()
    nltk_id, wlist, self_syn = map_wn2nltk(w, cursor)
    synset_exist = bool(nltk_id)
    if tag_count:
        td = tag_count
    else:
        tcnt = cursor.execute("SELECT tag,count FROM wn_tag WHERE wnid=?",
                              (w,)).fetchall()
        td = dict(tcnt)
    synset_info = {}
    synset_info['nltk_id'] = nltk_id
    synset_info['num_flickr'] = cw  #wn_cnt[w]
    synset_info['tag_count'] = td
    children_wn = wn_get_hyponym(w, full=0)
    if wn_cnt:
        synset_info['children_wn'] = filter(lambda c: c in wn_cnt, children_wn)
    else:
        synset_info['children_wn'] = children_wn
    if not synset_exist:
        synset_info['self'] = {'depth': -1, 'words': []}
        synset_info['ancestor'] = {}
        synset_info['descendant'] = {}
        synset_info['other'] = []  # list, matching the populated branch below
    else:
        nltk_id = nltk_id[0]
        ww = reduce(lambda u, v: u + v.split(), wlist, [])
        vv = map(lambda s: norm_tag(s, dbcsr, addl_vocab, 1), ww)
        vv = list(set(filter(len, vv)))
        synset_info['self'] = {'depth': self_syn.min_depth(), 'words': vv}
        synset_info['ancestor'] = get_ancestor_words(self_syn, 0, dbcsr,
                                                     addl_vocab)
        synset_info['descendant'] = get_descendant_words(self_syn, 0, dbcsr,
                                                         addl_vocab)
        all_tags = set(synset_info['tag_count'].keys())
        all_hierarchy = set(
            synset_info['self']['words']
            + reduce(lambda a, b: a + b, synset_info['ancestor'].values(), [])
            + reduce(lambda a, b: a + b, synset_info['descendant'].values(), []))
        synset_info['other'] = sorted(list(all_tags - all_hierarchy))
    dbcn.close()
    conn.close()
    return synset_info, wlist
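
# The dict returned by compile_synset_wordlist() groups normalized tags by
# their position in the WordNet hierarchy. A sketch of its shape (all values
# below are illustrative, not real output):
#
#   synset_info = {
#       'nltk_id': ...,                 # NLTK synset id, or empty if unmapped
#       'num_flickr': 1234,             # the count passed in as cw
#       'tag_count': {'dog': 57, ...},  # observed tag -> count
#       'children_wn': ['n02084071', ...],
#       'self': {'depth': 8, 'words': ['dog', 'puppy']},
#       'ancestor': {-1: ['canine'], -2: ['carnivore']},   # depth -> words
#       'descendant': {1: ['terrier'], 2: ['fox terrier']},
#       'other': ['cute', 'park'],      # observed tags outside the hierarchy
#   }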
def get_descendant_words(self_syn, self_depth, db_cursor, addl_vocab):
    psyn = self_syn.hyponyms()
    if not psyn:
        return {}
    vlist = []
    for p in psyn:
        plemma = map(lambda s: s.name, p.lemmas)
        wlist = map(lambda s: s.lower().replace("_", " ").split(), plemma)
        ww = reduce(lambda u, v: u + v, wlist, [])
        vv = map(lambda s: norm_tag(s, db_cursor, addl_vocab, 1), ww)
        vv = filter(len, vv)
        vlist += vv
    cur_d = self_depth + 1
    children_words = {}
    children_words[cur_d] = sorted(list(set(vlist)))
    #{'depth': self_depth-1,
    # 'words': sorted(list(set(vlist))) }
    for p in psyn:
        pw = get_descendant_words(p, cur_d, db_cursor, addl_vocab)
        # aggregate outcome by depth
        if pw:
            for d in pw.iterkeys():
                if d in children_words:
                    children_words[d] = sorted(
                        list(set(children_words[d]) | set(pw[d])))
                else:
                    children_words[d] = pw[d]
    return children_words
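
# Minimal usage sketch for the hierarchy walkers. Assumes the old-style NLTK
# WordNet API used above (Synset.lemmas and Lemma.name as attributes) and a
# dictionary db like the one opened in compile_synset_wordlist(); the path and
# synset name are illustrative choices.
def _demo_descendant_words(db_dict='db/dict.db'):
    from nltk.corpus import wordnet as wn
    conn = sqlite3.connect(db_dict)
    desc = get_descendant_words(wn.synset('dog.n.01'), 0, conn.cursor(), [])
    conn.close()
    return desc  # {1: [direct-hyponym words], 2: [...], ...}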
def get_conceptnet_words(db_file):
    # NOTE: shares a name with the richer variant above; if both live in one
    # module, whichever is defined last shadows the other.
    A = divisi2.network.conceptnet_matrix('en')
    wa = A.row_labels
    ww = filter(lambda s: s == s.split()[0], wa)  # keep 1-word terms only
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    vv = map(lambda s: norm_tag(s, cursor), ww)
    wd = dict(izip(ww, vv))
    conn.close()
    return wd
def norm_tag_file(argv):
    """ clean tags for one single file """
    if len(argv) < 2:
        argv = ['-h']
    parser = OptionParser(
        description='rank tags of a single image + compose sentences')
    parser.add_option('-i', '--in_file', dest='in_file', default="",
                      help='input file')
    parser.add_option('-o', '--out_dir', dest='out_dir', default="db",
                      help='output directory')
    parser.add_option('-d', '--db_dir', dest='db_dir', default="",
                      help='dir containing sqlite db and other data')
    parser.add_option("", '--db_dict', dest='db_dict', default="dict.db",
                      help='dictionary')
    parser.add_option("", '--addl_vocab', dest='addl_vocab',
                      default="places_etc.txt", help='')
    (opts, __args) = parser.parse_args(argv)

    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab),
                      'rt').read().split()
    out_fh = open(os.path.join(opts.out_dir, os.path.split(opts.in_file)[1]),
                  'wt')
    conn = sqlite3.connect(db_dict)
    cursor = conn.cursor()
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s processing %s ..." % (tt, opts.in_file)
    cnt = 0
    ecnt = 0
    tcnt = 0
    for cl in codecs.open(opts.in_file, encoding='utf-8'):
        imid = cl.split()[0]
        w = cl.split()[1]
        ww = w.split(",")
        vv = map(lambda s: norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        if vv:
            #tag_dict[imid] = vv
            out_fh.write("%s\t%s\n" % (imid, ",".join(vv)))
            tcnt += len(vv)
        else:
            ecnt += 1
        cnt += 1
        if cnt % 5000 == 0:
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d img-id processed, %d tags, %d empty" % (
                tt, cnt, tcnt, ecnt)
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s %d img-id processed, %d tags, %d empty. Done.\n" % (
        tt, cnt, tcnt, ecnt)
    out_fh.close()
    conn.close()
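
# Example invocation (module and file names are hypothetical). The input is
# one "img-id<space>tag1,tag2,..." line per image; the normalized version is
# written under the output dir with the same base name:
#
#   python tag_tools.py -i raw/0001.tags.txt -o db -d db \
#       --db_dict dict.db --addl_vocab places_etc.txt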
def get_wordnet_words(db_wn, wnid_file, db_dict, vocab):
    wn_freq = {}
    for line in open(wnid_file):
        t = line.strip().split()
        wn_freq[t[1]] = int(t[0])
    wn_words = {}
    condict = sqlite3.connect(db_dict)
    curdict = condict.cursor()
    conn = sqlite3.connect(db_wn)
    cursor = conn.cursor()
    wcnt = 0
    vcnt = 0
    for k in wn_freq.iterkeys():
        #stmt = ("SELECT W.wnid, W.word1, WW.word "
        #        "FROM wordnet as W, wordnet_word AS WW "
        #        "WHERE W.wnid='%s' AND W.wnid=WW.wnid" % k)
        cursor.execute("SELECT word FROM wordnet_word WHERE wnid=?", (k,))
        ww = map(lambda t: t[0], cursor.fetchall())
        vv = map(lambda s: norm_tag(s.lower(), curdict), ww)
        vv = filter(lambda s: len(s) and s in vocab, vv)
        vv = list(set(vv))
        wcnt += 1
        if vv:
            wn_words[k] = vv
            vcnt += 1
        if wcnt % 1000 == 0:
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d/%d synsets processed, %d have tags" % (
                tt, wcnt, len(wn_freq), vcnt)
    conn.close()
    condict.close()
    uniq_v = reduce(lambda v, s: v + s, wn_words.itervalues(), [])
    uniq_v = list(set(uniq_v))
    print "  %d / %d synsets have non-empty mappings to %d words" % (
        len(wn_words), len(wn_freq), len(uniq_v))
    return wn_words
def norm_words(in_wlist, db_file, addl_vocab=[]):
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    wd = {}
    cn_vocab = {}
    null_count = 0
    for w in in_wlist:
        ww = w.split()
        vv = map(lambda s: norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        wd[w] = vv
        null_count += (not vv)
        if vv:
            for v in vv:
                if v not in cn_vocab:
                    cn_vocab[v] = 1
    #wd = dict(izip(ww, vv))
    conn.close()
    cn_vocab = cn_vocab.keys()
    return cn_vocab
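
# norm_words() is the plain-list sibling of get_conceptnet_words(): the same
# normalization loop, but over caller-supplied strings. Sketch (hypothetical
# inputs):
#
#   vocab = norm_words(['new york city', 'golden gate'], 'db/dict.db')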
def get_ancestor_words(self_syn, self_depth, db_cursor, addl_vocab):
    psyn = self_syn.hypernyms()
    if not psyn:
        return {}
    vlist = []
    for p in psyn:
        plemma = map(lambda s: s.name, p.lemmas)
        wlist = map(lambda s: s.lower().replace("_", " ").split(), plemma)
        ww = reduce(lambda u, v: u + v, wlist, [])
        vv = map(lambda s: norm_tag(s, db_cursor, addl_vocab, 1), ww)
        vv = filter(len, vv)
        vlist += vv
    cur_d = self_depth - 1
    parent_words = {}
    parent_words[cur_d] = sorted(list(set(vlist)))
    #{'depth': self_depth-1,
    # 'words': sorted(list(set(vlist))) }
    for p in psyn:
        pw = get_ancestor_words(p, cur_d, db_cursor, addl_vocab)
        # aggregate outcome by depth
        if pw:
            try:
                for d in pw.iterkeys():
                    if d in parent_words:  # merge two dicts
                        parent_words[d] = sorted(
                            list(set(parent_words[d]) | set(pw[d])))
                    else:
                        parent_words[d] = pw[d]
            except:
                print pw
                raise
    return parent_words
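
# Note on depth conventions: get_ancestor_words() keys merged word lists by
# negative depth (-1 = direct hypernyms, -2 = their hypernyms, ...), while
# get_descendant_words() above uses positive depths. compile_synset_wordlist()
# stores the two side by side under 'ancestor' and 'descendant'.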
def construct_training_data(argv):
    opts, db_dict, _addl_vocab, _db_wn = options_get_wnet_tag(argv)
    addl_vocab = []
    dbcn = sqlite3.connect(db_dict)  # dictionary db
    dbcsr = dbcn.cursor()
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s start processing '%s'" % (tt, opts.in_wnet_list)
    if os.path.isfile(opts.in_wnet_list):
        wnet_list = open(opts.in_wnet_list, 'rt').read().split()
    else:
        wnet_list = opts.in_wnet_list.split(",")
    imgid_list, usr_list, tag_dict = ([], [], {})
    for (iw, wn) in enumerate(wnet_list):
        if iw > opts.endnum:
            break
        wtag_file = os.path.join(opts.data_home, opts.wnet_list_dir + "_tags",
                                 wn + '.tags.txt')
        ilist, ulist, tdict = get_wnet_tags(wn, wtag_file, opts, False)
        imgid_list += ilist
        usr_list += ulist
        for u in tdict:
            tag_dict[u] = tdict[u]  # imgid is unique
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "\n%s read %d synsets, found %d imgs from %d unique usrs" % (
        tt, len(wnet_list), len(imgid_list), len(set(usr_list)))
    tnum = map(lambda k: len(tag_dict[k]), tag_dict.iterkeys())
    snum = sum(tnum)
    print "%s %d tags in %d imgs, avg %0.4f tags per img" % (
        tt, snum, len(imgid_list), 1. * snum / len(tnum))
    usr_tag = {}
    tag_cnt = {}
    empty_cnt = 0
    for (imid, u) in zip(imgid_list, usr_list):
        if u not in usr_tag:
            usr_tag[u] = {}
        vv = map(lambda s: norm_tag(s, dbcsr, addl_vocab, 1), tag_dict[imid])
        vv = list(set(filter(len, vv)))
        for v in vv:
            if v in usr_tag[u]:
                usr_tag[u][v] += 1
            else:
                usr_tag[u][v] = 1  # first time seeing user u use tag v
            if v in tag_cnt:
                tag_cnt[v] += 1
            else:
                tag_cnt[v] = 1
    tag_list = filter(lambda t: tag_cnt[t] > 1, tag_cnt.keys())
    tag_list.sort()
    tag_num = map(lambda t: tag_cnt[t], tag_list)
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s obtained %d usrs, %d out of %d tags with cnt>1, %d empty imgs" % (
        tt, len(usr_tag), len(tag_list), len(tag_cnt), empty_cnt)
    lcnt = 0
    bg_dict = {}
    #wn_tagf = open(os.path.join(opts.data_home, opts.wnet_out_dir, wn+".txt"), "wt")
    for u in usr_tag:
        vv = filter(lambda v: v in tag_list, usr_tag[u].keys())
        if vv:
            #wn_tagf.write(u + "\t" + ",".join(vv) + "\n")
            accumulate_bg(vv, bg_dict, None, None, addl_vocab=[])
            lcnt += 1
            if lcnt < 20:
                print u + "\t" + ",".join(vv)
    #Nusr = len(usr_tag)
    lcnt = 0
    bg_tuples = sort_bg(bg_dict)
    #print bg_tuples
    bigram_list = []
    for u, v, c in bg_tuples:
        iu = tag_list.index(u)
        iv = tag_list.index(v)
        bigram_list += [iu, iv, c]
    wn_out_mat = os.path.split(opts.in_wnet_list)[1]
    wn_out_mat = os.path.splitext(wn_out_mat)[0]
    wn_out_mat = os.path.join(os.path.split(opts.in_wnet_list)[0],
                              wn_out_mat + ".mat")
    data = {'wnet_list': wnet_list,
            'usr_list': usr_list,  # full accumulated user list
            'tag_list': tag_list,
            'tag_cnt': tag_num,
            'bigram_list': bigram_list}
    io.savemat(wn_out_mat, data)
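
# The .mat files written by construct_training_data() (and by
# analyze_tag_pairs() below) round-trip through scipy. A sketch recovering the
# flat bigram list as (iu, iv, count) triples; the file name is hypothetical:
def _demo_load_training_mat(mat_file='wnet-50.mat'):
    d = io.loadmat(mat_file)
    bigrams = np.asarray(d['bigram_list']).reshape(-1, 3)  # (iu, iv, count)
    return d['tag_list'], bigrams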
def rank_tags(argv):
    if len(argv) < 2:
        argv = ['-h']
    parser = OptionParser(
        description='rank tags of a single image + compose sentences')
    parser.add_option('-d', '--db_dir', dest='db_dir', default="",
                      help='dir containing sqlite db and other data')
    parser.add_option('-n', '--num_output', dest='num_output', type="int",
                      default=3, help='number of output images to examine')
    parser.add_option("", '--db_dict', dest='db_dict', default="dict.db",
                      help='dictionary')
    parser.add_option("", '--vocab_score', dest='vocab_score',
                      default="flickr_vscore.txt",
                      help='file containing vocabulary count')
    parser.add_option("", '--tag_file', dest="tag_file",
                      default="demo-data/24.cache", help="")
    parser.add_option("", '--wn_list', dest='wn_list', default="wnet-50.txt",
                      help='')
    parser.add_option("", '--addl_vocab', dest='addl_vocab',
                      default="places_etc.txt", help='')
    (opts, __args) = parser.parse_args(argv)

    # intersect the two dictionaries first
    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    tag_file = os.path.join(opts.db_dir, opts.tag_file)
    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab),
                      'rt').read().split()
    vocab_lines = open(os.path.join(opts.db_dir, opts.vocab_score),
                       'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    vocab_score = {}
    for vl in vocab_lines:
        t = vl.split()  # [word, score, prc]
        vocab_score[t[0]] = map(float, t[1:])

    # gulp all the tags
    vocab_lines = open(tag_file, 'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    img_tag = {}
    for vl in vocab_lines:
        t = vl.split()
        #print t
        img_tag[t[0]] = t[1]
    print "read %d vocab words, %d images" % (len(vocab_score), len(img_tag))

    id_list = img_tag.keys()
    if opts.num_output < 0:
        random.shuffle(id_list)
        num_output = -opts.num_output
        id_select = id_list[:num_output * 10]
    elif opts.num_output > 1e5:
        id_select = [str(opts.num_output)]
        num_output = 1
    else:
        num_output = opts.num_output
        id_select = id_list[:num_output * 10]

    icnt = 0
    api_keys = open(FLICKR_KEY_FILE, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    conn = sqlite3.connect(db_dict)
    conn.text_factory = str
    cursor = conn.cursor()
    for cur_id in id_select:
        ww = img_tag[cur_id].split(",")
        vv = map(lambda s: norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        #find the vscore of vv
        vs = map(lambda v: vocab_score[v], vv)
        # get flickr picture url
        #http://farm{farm-id}.staticflickr.com/{server-id}/{id}_{secret}.jpg
        if 1:
            cur_key = api_keys[random.randint(0, len(api_keys) - 1)]
            jinfo = cache_flickr_info(cur_id, cur_key, rootdir="")
            p = jinfo['photo']
            imgurl = 'http://farm%s.staticflickr.com/%s/%s_%s.jpg' % (
                p["farm"], p["server"], p['id'], p['secret'])
        else:
            imgurl = ""
        #print zip(vv, vs)
        if len(vv) > 5:
            icnt += 1
            # print results
            print "\nimg: %s" % (imgurl if imgurl else cur_id)
            vtup = sorted(map(lambda s, t: (s, t[0], t[1]), vv, vs),
                          key=itemgetter(2), reverse=True)
            outstr = ""
            for i, t in enumerate(vtup):
                outstr += "%s (%0.3f,%2.1f%%)\t" % (t[0], t[1], 100 * t[2])
                if (i + 1) % 3 == 0:
                    outstr += "\n"
            print outstr
            """
            print "visual tags: " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]>.9 else "", vv, vs ) )
            print "other      : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<=.9 and s[1]>=.6 else "", vv, vs ) )
            print "non-visual : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<.6 else "", vv, vs ) )
            """
            print ""
        else:
            pass
        if icnt >= num_output:
            break
    conn.close()
def analyze_tag_pairs(argv):
    # parser = OptionParser(description='compile tags for all imgs in a wnet synset')
    opts, db_dict, addl_vocab, db_wn = options_get_wnet_tag(argv)
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s start processing '%s'" % (tt, opts.in_wnet_list)
    dbcn = sqlite3.connect(db_dict)  # dictionary db
    dbcsr = dbcn.cursor()
    if os.path.isfile(opts.in_wnet_list):
        wnet_list = open(opts.in_wnet_list, 'rt').read().split()
    else:
        wnet_list = opts.in_wnet_list.split(",")
    for (iw, wn) in enumerate(wnet_list):
        if iw > opts.endnum:
            break
        wtag_file = os.path.join(opts.data_home, opts.wnet_list_dir + "_tags",
                                 wn + '.tags.txt')
        imgid_list, usr_list, tag_dict = get_wnet_tags(wn, wtag_file, opts,
                                                       False)
        usr_tag = {}
        tag_cnt = {}
        empty_cnt = 0
        #for u, utuple in groupby(zip(usr_list, imgid_list), lambda x: x[0]):
        for (imid, u) in zip(imgid_list, usr_list):
            if u not in usr_tag:
                usr_tag[u] = {}
            vv = map(lambda s: norm_tag(s, dbcsr, addl_vocab, 1),
                     tag_dict[imid])
            vv = list(set(filter(len, vv)))
            for v in vv:
                if v in usr_tag[u]:
                    usr_tag[u][v] += 1
                else:
                    usr_tag[u][v] = 1  # first time seeing user u use tag v
                if v in tag_cnt:
                    tag_cnt[v] += 1
                else:
                    tag_cnt[v] = 1
            #for v in list(set(usr_tag[u])):
        tag_list = filter(lambda t: tag_cnt[t] > 1, tag_cnt.keys())
        tag_list.sort()
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print "%s obtained %d usrs, %d tags non-trivial, %d empty imgs" % (
            tt, len(usr_tag), len(tag_list), empty_cnt)
        ulist = []
        tcnt_mat = np.zeros((len(usr_list), len(tag_list)))
        icnt = 0
        #out_dat_fh = open(os.path.join(opts.data_home, opts.wnet_out_dir, wn+".dat"), "wt")
        for u in list(set(usr_list)):
            jj = -1
            outstr = ""
            if usr_tag[u]:
                for t in usr_tag[u]:
                    if t in tag_list:
                        jj = tag_list.index(t)
                        tcnt_mat[icnt, jj] += 1
                        outstr += "%d:%d " % (jj + 1, usr_tag[u][t])
            else:
                pass  # empty list of tags, skip
            if outstr:
                #out_dat_fh.write(outstr + "\n")
                icnt += 1
                ulist.append(u)
        tcnt = tcnt_mat[:icnt, :]
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print '%s wnet "%s": %d unique tags by %d users\n' % (
            tt, wn, len(tag_list), len(ulist))

        # get and print synset info
        td = dict([(k, tag_cnt[k]) for k in tag_list])
        syn_info, wlist = compile_synset_wordlist(wn, len(ulist), None, db_wn,
                                                  db_dict, addl_vocab, td)
        hi_words = ([syn_info['self']['words']]
                    + syn_info['ancestor'].values()
                    + syn_info['descendant'].values())
        hi_depth = ([0]
                    + syn_info['ancestor'].keys()
                    + syn_info['descendant'].keys())
        other_words = list(set(tag_list)
                           - set(reduce(lambda a, b: a + b, hi_words, [])))
        print_synset_info(syn_info, wn, wlist, len(ulist))
        wn_out_mat = os.path.join(opts.data_home, opts.wnet_out_dir,
                                  wn + ".mat")
        #mlab.save(wn_out_mat, 'img_list', 'tag_list', 'tcnt_mat')
        data = {'usr_list': ulist,
                'tag_list': tag_list,
                'tcnt': tcnt,
                'other_words': other_words,
                'hier_words': hi_words,
                'hier_depth': hi_depth}
        io.savemat(wn_out_mat, data)
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print '%s saved to %s \n\n' % (tt, wn_out_mat)

        print "\n tag occurrence:"
        hi_words = reduce(lambda a, b: a + b, hi_words, [])
        lcnt = 0
        bg_dict = {}
        wn_tagf = open(os.path.join(opts.data_home, opts.wnet_out_dir,
                                    wn + ".txt"), "wt")
        for u in usr_tag:
            vv = filter(lambda v: v in tag_list, usr_tag[u].keys())
            if vv:
                wn_tagf.write(u + "\t" + ",".join(vv) + "\n")
                accumulate_bg(vv, bg_dict, None, None, addl_vocab=[])
                lcnt += 1
                if lcnt < 20:
                    print u + "\t" + ",".join(vv)
        wn_tagf.close()

        print "\nbigrams#\tMI \ttype\ttag1,tag2\trelations"
        Nusr = len(usr_tag)
        lcnt = 0
        bg_tuples = sort_bg(bg_dict)
        wn_bgf = open(os.path.join(opts.data_home, opts.wnet_out_dir,
                                   wn + ".bigram.txt"), "wt")
        for u, v, c in bg_tuples:
            if c < 3:
                break
            assr = cm.Assertion.objects.filter(concept1__text=u,
                                               concept2__text=v, language=en)
            atxt = map(lambda a: str(a).strip("[]"), assr)
            atxt += map(lambda a: str(a).strip("[]"),
                        cm.Assertion.objects.filter(concept1__text=v,
                                                    concept2__text=u,
                                                    language=en))
            try:
                mi = binary_mutual_info(Nusr, tag_cnt[u], tag_cnt[v], c)
            except:
                print u, v, c, len(usr_tag)
                raise
            if u in other_words and v in other_words:
                btype = "OO"
            elif u in hi_words and v in hi_words:
                btype = "HH"
            else:
                btype = "HO"
            outstr = "%s\t%0.5f\t%0.5f\t%s\t%s,%s\t%s" % \
                (syn_info['nltk_id'], 1. * c / Nusr, mi, btype, u, v,
                 ";".join(atxt) if atxt else "None")
            wn_bgf.write(outstr + "\n")
            lcnt += 1
            if lcnt < 50:
                print outstr
        wn_bgf.close()
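
# binary_mutual_info() is defined elsewhere in the repo. For reference, a
# plausible implementation (an assumption, not necessarily the original):
# the MI of two binary "user used this tag" indicator variables, computed
# from the marginal counts cu, cv and the co-occurrence count c out of n
# users via the 2x2 contingency table.
def _binary_mutual_info_sketch(n, cu, cv, c):
    import math
    n = float(n)
    # (joint count, x-marginal count, y-marginal count) for each cell
    cells = [(c, cu, cv),                         # both tags used
             (cu - c, cu, n - cv),                # u only
             (cv - c, n - cu, cv),                # v only
             (n - cu - cv + c, n - cu, n - cv)]   # neither
    mi = 0.0
    for nxy, nx, ny in cells:
        if nxy > 0:
            mi += (nxy / n) * math.log((nxy * n) / (nx * ny), 2)
    return mi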