Example #1
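These snippets are Python 2 and assume module-level imports, constants, and helpers that are not shown on this page. A plausible header, reconstructed from the calls below (the constant values are placeholders, not from the source):

import os
import re
import sys
import gzip
import codecs
import random
import sqlite3
import urllib2
from datetime import datetime
from operator import itemgetter
from optparse import OptionParser

DEBUG = 0
FLICKR_KEY_FILE = 'flickr.key.txt'   # one API key per line
FLICKR_XML_DIR = 'flickr-json'       # cache dir for per-photo JSON metadata
WNET_OUT_DIR = 'wnet-out'            # output dir for wordnet-flickr logs

# helpers defined elsewhere in the project:
#   cache_flickr_info(imgid, key, rootdir)  -> dict parsed from flickr.photos.getInfo JSON
#   norm_tag(tag, cursor, addl_vocab)       -> normalized tag, or '' if not in the vocabulary
#   read_tag_cache(path)                    -> {imgid: "tag1,tag2,..."}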
def rank_tags(argv):
    if len(argv) < 2:
        argv = ['-h']
    parser = OptionParser(description='rank tags of a single image + compose sentences')
    parser.add_option('-d', '--db_dir', dest='db_dir', default="", help='dir containing sqlite db and other data')
    parser.add_option('-n', '--num_output', dest='num_output', type="int", 
                      default=3, help='number of output images to examine')
    parser.add_option("", '--db_dict', dest='db_dict', default="dict.db", help='dictionary')
    parser.add_option("", '--vocab_score', dest='vocab_score', default="flickr_vscore.txt", 
                      help='file containing vocabulary count')
    parser.add_option("", '--tag_file', dest="tag_file", default="demo-data/24.cache", help="")
    parser.add_option("", '--wn_list', dest='wn_list', default="wnet-50.txt", help='')
    parser.add_option("", '--addl_vocab', dest='addl_vocab', default="places_etc.txt", help='')
    
    (opts, __args) = parser.parse_args(argv)  # parse the argv passed in, not sys.argv
    
    # intersect the two dictionaries first
    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    
    tag_file = os.path.join(opts.db_dir, opts.tag_file)
    
    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab), 'rt').read().split()
    vocab_lines = open(os.path.join(opts.db_dir, opts.vocab_score), 'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    vocab_score = {}
    for vl in vocab_lines:
        t = vl.split()
        # [word, score, prc]
        vocab_score[t[0]] = map(float, t[1:])
    
    # gulp all the tags
    vocab_lines = open(tag_file, 'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    img_tag = {}
    for vl in vocab_lines:
        t = vl.split()        
        #print t
        img_tag[t[0]] = t[1]
    print "read %d tags, %d images" % ( len(vocab_score), len(img_tag) ) 
    
    id_list = img_tag.keys()
    if opts.num_output < 0:
        # a negative count means: sample that many images at random
        random.shuffle(id_list)
        num_output = -opts.num_output
        id_select = id_list[:num_output * 10]   # examine up to 10x candidates
    elif opts.num_output > 1e5:
        # a very large value is treated as a single flickr photo id
        id_select = [str(opts.num_output)]
        num_output = 1
    else:
        num_output = opts.num_output
        id_select = id_list[:num_output * 10]
    
    
    icnt = 0
    
    api_keys = open(FLICKR_KEY_FILE, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    
    conn = sqlite3.connect(db_dict)
    conn.text_factory = str
    cursor = conn.cursor()
    
    for cur_id in id_select:
        ww = img_tag[cur_id].split(",")
        vv = map(lambda s:norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        #find the vscore of vv
        vs = map(lambda v: vocab_score[v], vv)
        
        # get flickr picture url
        #http://farm{farm-id}.staticflickr.com/{server-id}/{id}_{secret}.jpg
        if 1:  # toggle: set to 0 to skip the Flickr URL lookup
            cur_key = random.choice(api_keys)
            jinfo = cache_flickr_info(cur_id, cur_key, rootdir="")
            p = jinfo['photo']
            imgurl = 'http://farm%s.staticflickr.com/%s/%s_%s.jpg' % (p["farm"], p["server"], p['id'], p['secret'])
        else:
            imgurl = ""
        #print zip(vv, vs)
        if len(vv) > 5:  # only report images with more than 5 normalized tags
            icnt += 1
            # print results
            print "\nimg: %s" % (imgurl if imgurl else cur_id)
            vtup = sorted(map(lambda s,t: (s, t[0], t[1]), vv, vs), key=itemgetter(2), reverse=True)
            outstr = ""
            for i, t in enumerate(vtup):
                outstr += "%s (%0.3f,%2.1f%%)\t"%(t[0],t[1],100*t[2])
                if (i+1)%3==0:
                    outstr += "\n" 
            print outstr
            """
            print "visual tags: " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]>.9 else "", vv, vs ) )
            print "other      : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<=.9 and s[1]>=.6 else "", vv, vs ) )
            print "non-visual : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<.6 else "", vv, vs ) )
            """
            print ""
        
        if icnt >= num_output:
            break
        
    conn.close()
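For orientation, a sketch of the two whitespace-delimited inputs rank_tags parses; the ids, words, and numbers below are made up:

# flickr_vscore.txt -- one word per line: word, score, percentile
#   sunset 0.812 0.97
#   friday 0.143 0.21
#
# demo-data/24.cache -- one image per line: flickrid, comma-separated raw tags
#   2411234567 sunset,beach,nikon
#
# typical invocation:
#   rank_tags(sys.argv)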
Example #2
def ingest_flickr_info(url_list, conn, argv):
    
    parser = OptionParser(description='return co-occurring tag counts for a given list of flickr URLs')
    parser.add_option('-w', '--wordnet_id', dest='wordnet_id', default='', help='current wnid')
    parser.add_option('-j', '--json_dir', dest='json_dir', default=FLICKR_XML_DIR, 
                    help='dir to cache json metadata of each photo')
    parser.add_option('-k', '--flickr_key_file', dest='flickr_key_file', 
        default='flickr.key.txt', help='file containing a list of API keys, one per line')
    """ 'http://static.flickr.com/2088/[id]_94dbc23839.jpg' """
    parser.add_option("-p", "--id_pattern", dest="id_pattern", 
        default="[^/]*//.*/[0-9]*/(?P<flickrid>[0-9]*)\_([0-9a-z]*).*", help="regexp to get flickr id")
    (opts, __args) = parser.parse_args(argv)
    
    api_keys = open(opts.flickr_key_file, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    id_pattern = re.compile(opts.id_pattern)
    
    csr = conn.cursor()
    
    icnt = 0
    ist_cnt = 0
    img_nct = 0
    fail_cnt = 0
    id_list = []
    for cur_u in url_list:
        try:
            m = id_pattern.match(cur_u)
            imgid = m.group('flickrid')
        except AttributeError:  # no match: match() returned None
            print "\t err parsing URL: " + cur_u
            continue
        icnt += 1        
        
        cur_key = random.choice(api_keys)
        jinfo = cache_flickr_info(imgid, cur_key, rootdir=opts.json_dir)
        
        flickrid = int(imgid)
        id_list.append(flickrid)
        csr.execute("SELECT COUNT(*) FROM imagenet_flickr WHERE wnid=? AND flickrid=?", 
                    (opts.wordnet_id, flickrid))
        wnf_exist = int(csr.fetchone()[0])
                
        if 'stat' not in jinfo or jinfo['stat'] != 'ok':
            fail_cnt += 1
            if wnf_exist <= 0:  # record the failure only if this (wnid, flickrid) pair has no row yet
                csr.execute("INSERT INTO imagenet_flickr (wnid, flickrid, status) VALUES (?,?,?)", 
                            (opts.wordnet_id, flickrid, 0))
            if icnt%100 == 0 or DEBUG>2:
                tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
                print "%s %5d/%5d flickr img not found: %s" % (tt, icnt, len(url_list), flickrid)
                
        else:
            if wnf_exist != 1:
                ist_cnt += 1
                csr.execute("INSERT INTO imagenet_flickr (wnid, flickrid,status) VALUES (?,?,?)", 
                         (opts.wordnet_id, flickrid, 1))
                
                csr.execute("SELECT COUNT(*) FROM flickr_info WHERE flickrid=?", (flickrid,))
                img_exist = int(csr.fetchone()[0])
                    
                """
                    INSERT OR REPLACE INTO Employee (id,name,role) 
                      VALUES (  1, 
                                'Susan Bar',
                                coalesce((select name from Employee where id = 1),'Benchwarmer')
                              );
                """    
                
                if not img_exist:
                    img_nct += 1
                    pinfo = jinfo["photo"]
                    userid = pinfo["owner"]["nsid"]
                    taken_time = pinfo["dates"]["taken"]
                    image_url = cur_u
                    title = pinfo["title"]["_content"]
                    description = pinfo["description"]["_content"]
                    
                    finfo = (flickrid,userid,taken_time,image_url,title,description)
                    csr.execute("INSERT INTO flickr_info VALUES (?,?,?,?,?,?)", finfo)
                    
                    tmp = pinfo["tags"]["tag"]
                    tg = map(lambda s: s["_content"], tmp)
                    if tg:                    
                        csr.executemany("INSERT INTO flickr_tag (flickrid,tag) VALUES (?,?)", 
                                        zip([flickrid]*len(tg), tg))
                else:
                    print "  img # %d already exist: %d" % (icnt, flickrid)
                    
                if DEBUG > 2 and not img_exist:  # finfo and tg are only defined on the fresh-insert path
                    print icnt, opts.wordnet_id, flickrid
                    print repr(finfo)
                    print repr( zip([flickrid]*len(tg), tg) )
                    print ""
            else:
                # the (wnid, flickrid) pair was already ingested, e.g. in an earlier run
                print "  entry already exists:"
                csr.execute("SELECT * FROM imagenet_flickr WHERE wnid=? AND flickrid=?", 
                    (opts.wordnet_id, flickrid))            
                for row in csr:
                    print "  " + repr(row)
                #csr.execute("SELECT * FROM flickr_info WHERE flickrid=%d" % flickrid)
                #for row in csr:
                #    print row
                #print flickrid in id_list
                
                
            
        if icnt%100 == 0: 
            conn.commit()
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d/%d urls processed, %d new records, %d new images, %d failed" \
                % (tt, icnt, len(url_list), ist_cnt, img_nct, fail_cnt)
            
                
    conn.commit()
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s %d/%d urls processed, %d new records, %d new images, %d failed \n\n" \
        % (tt, icnt, len(url_list), ist_cnt, img_nct, fail_cnt)
        
    return
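The sqlite tables are never created in these excerpts; a plausible schema reconstructed from the INSERT and SELECT statements above (column types are guesses):

SCHEMA = """
CREATE TABLE IF NOT EXISTS imagenet_flickr (
    wnid     TEXT,
    flickrid INTEGER,
    status   INTEGER            -- 1 = metadata fetched ok, 0 = fetch failed
);
CREATE TABLE IF NOT EXISTS flickr_info (
    flickrid    INTEGER PRIMARY KEY,
    userid      TEXT,
    taken_time  TEXT,
    image_url   TEXT,
    title       TEXT,
    description TEXT
);
CREATE TABLE IF NOT EXISTS flickr_tag (
    flickrid INTEGER,
    tag      TEXT
);
"""

def open_db(path):
    # hypothetical helper: open the db used by ingest_flickr_info and ensure the tables exist
    conn = sqlite3.connect(path)
    conn.text_factory = str
    conn.executescript(SCHEMA)
    return conn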
Example #3
def get_wnet_tags(wn, wtag_file, opts, img_usr=None, force_reload=False):
    """
        collect each image id, usr id, and tags for the input synset
    """

    wn_file = os.path.join(opts.data_home, opts.wnet_list_dir, wn + '.txt')
    img_list_from_file = []
    for cl in open(wn_file, "rt"):
        if not cl: continue
        tmp = cl.strip().split()
        if not tmp or len(tmp) < 2:
            continue
        img_list_from_file.append(tmp[1])
    img_list_from_file.sort()
    numim = len(img_list_from_file)

    imgid_list = []
    usr_list = []
    tag_dict = {}

    if os.path.isfile(wtag_file) and not force_reload:
        for cl in codecs.open(wtag_file, encoding='utf-8', mode="r"):
            tt = cl.strip().split()
            imgid_list.append(tt[0])
            usr_list.append(tt[1])
            tag_dict[tt[0]] = tt[2].split(",")
        print " read %d image entries from %s " % (len(imgid_list), wtag_file)
    else:
        #pass #imgid_list = []
        #if numim > len(imgid_list):
        imgid_list = img_list_from_file

        if not img_usr:
            fn = os.path.join(opts.data_home, opts.db_dir, opts.usr_file)
            fh = gzip.open(fn, "rb") if os.path.splitext(fn)[1] == ".gz" else open(fn, "r")
            lines = filter(len, fh.read().split("\n"))  # skip empty lines if any
            lines = [cl.strip().split() for cl in lines]
            img_usr = dict(lines)
            fh.close()
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s read %d usr ids from %s" % (tt, len(img_usr),
                                                  opts.usr_file)

        usr_list = map(lambda s: img_usr.get(s, "unk"), imgid_list)

        cur_cache_id = None  # force a cache read on the first iteration
        empty_cnt = 0
        del_id = []
        tag_dict = {}
        fh = codecs.open(wtag_file, encoding='utf-8', mode="w")
        for (imgid, uid, ii) in zip(imgid_list, usr_list,
                                    range(len(usr_list))):

            if opts.use_json:
                jinfo = cache_flickr_info(imgid,
                                          "",
                                          rootdir=os.path.join(
                                              opts.data_home, opts.json_dir))
                if not jinfo or jinfo.get('stat') != 'ok':
                    pass  # no valid metadata for this image
                else:
                    pinfo = jinfo["photo"]
                    usr = pinfo["owner"]["nsid"]
                    tt = pinfo["tags"]["tag"]
                    ww = map(lambda s: s["_content"], tt)
                    if uid != usr:
                        #print "  user info mismatch. (%s): %s (file) vs %s (json)" % (imgid, uid, usr)
                        uid = usr
            else:  # use tag cache
                if cur_cache_id != imgid[:2]:
                    # read new cache file
                    cur_cache_id = imgid[:2]
                    cur_cache_file = os.path.join(opts.data_home,
                                                  opts.tag_cache_dir,
                                                  cur_cache_id + ".cache")
                    cache_dict = read_tag_cache(cur_cache_file)
                    #print "\t read %s imgs from %s" % (len(cache_dict), cur_cache_id+".cache")

                if imgid in cache_dict:
                    ww = cache_dict[imgid].split(",")
                else:  # img has no tag
                    ww = []
                    empty_cnt += 1

            #print "%s\t%s\t%s" % (imgid, uid, ",".join(ww))

            if ww:
                tag_dict[imgid] = ww
                fh.write("%s\t%s\t%s\n" % (imgid, uid, ",".join(ww)))
            else:
                del_id.append(ii)

        fh.close()
        del_id.sort(reverse=True)  #pop bigger indexes first, trouble otherwise
        for ii in del_id:
            imgid_list.pop(ii)
            usr_list.pop(ii)

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print '%s wnet "%s" - %d imgs, %d with tags, %d unique users' % \
        (tt, wn, numim, len(imgid_list), len(set(usr_list)) )

    return imgid_list, usr_list, tag_dict
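read_tag_cache is not shown on this page; a minimal sketch consistent with how get_wnet_tags uses its result (a dict from image id to a comma-separated tag string), assuming the same one-line-per-image format that rank_tags reads:

def read_tag_cache(cache_file):
    # sketch only -- the real implementation is elsewhere in the project
    # each line: "<flickrid> <tag1,tag2,...>"; images without tags are simply absent
    cache_dict = {}
    if not os.path.isfile(cache_file):
        return cache_dict
    for cl in codecs.open(cache_file, encoding='utf-8', mode='r'):
        t = cl.strip().split()
        if len(t) >= 2:
            cache_dict[t[0]] = t[1]
    return cache_dict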
def download_sbu_imgs(url_file, id_file, img_root_dir, startnum=0, endnum=50, hash_level=2, chars_per_hash=2):
    ss = os.sep
    exist_cnt = 0
    err_cnt = 0
    good_cnt = 0
    cnt = 0
    url_lines = open(url_file, 'rt').read().split("\n")
    id_lines = open(id_file, 'rt').read().split("\n")
    
    api_keys = open(FLICKR_KEY_FILE, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    
    json_dir = os.path.join(os.path.split(img_root_dir)[0], 'sbu-json')
    
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s processing #%d - #%d of %d urls"  % (tt, startnum, endnum, len(url_lines))
    ii = startnum 
    while ii < endnum and ii < len(url_lines):
    #for (ul, imid) in enumerate(url_lines, id_lines):

        imgid = id_lines[ii]
        imgurl = url_lines[ii]
        cnt += 1
        
        hdir = []
        for i in range(hash_level):
            # each level consumes the next chars_per_hash characters of the image id
            curs = i * chars_per_hash
            cure = curs + chars_per_hash
            hdir.append(imgid[curs:cure])
        outdir = os.path.join(img_root_dir, ss.join(hdir))
        imfile = os.path.join(outdir, imgid+".jpg")
        
        cur_key = random.choice(api_keys)
        _jinfo = cache_flickr_info(imgid, cur_key, rootdir=json_dir)
        #if 'stat' not in jinfo or not jinfo['stat']=='ok' :
        #    err_cnt += 1  
        #    continue 
                
        if not os.path.exists(imfile):
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            try:
                buf = urllib2.urlopen(imgurl).read()
                fh = open(imfile, 'wb')
                fh.write(buf)
                fh.close()
                good_cnt += 1
            except Exception:  # avoid a bare except that would swallow KeyboardInterrupt
                print "  ERR downloading url #%d, img %s from %s" % (ii, imgid, imgurl)
                err_cnt += 1
        else:
            exist_cnt += 1
        
        ii += 1
        if cnt % 100 == 0:
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s processed %d urls: %d new, %d exist, %d err \n\t"  % (tt, cnt, good_cnt, exist_cnt, err_cnt)
    
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s processed %d urls: %d new, %d exist, %d err \n\t from %s"  % (tt, cnt, good_cnt, exist_cnt, err_cnt, url_file)
def compile_flickr_info(url_list, argv):
    
    parser = OptionParser(description='return co-occurring tag counts for a given list of flickr URLs')
    parser.add_option('-o', '--out_dir', dest='out_dir', default=WNET_OUT_DIR, help='output dir for wordnet-flickr log file')
    parser.add_option('-w', '--wordnet_id', dest='wordnet_id', default='', help='current wnid')
    
    parser.add_option('-j', '--json_dir', dest='json_dir', default=FLICKR_XML_DIR, 
                    help='dir to cache json metadata of each photo')
    parser.add_option('-k', '--flickr_key_file', dest='flickr_key_file', 
        default='flickr.key.txt', help='file containing a list of API keys, one per line')
    """ 'http://static.flickr.com/2088/[id]_94dbc23839.jpg' """
    parser.add_option("-p", "--id_pattern", dest="id_pattern", 
        default="[^/]*//.*/[0-9]*/(?P<flickrid>[0-9]*)\_([0-9a-z]*).*", help="regexp to get flickr id")
    (opts, __args) = parser.parse_args(argv)
    
    api_keys = open(opts.flickr_key_file, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    id_pattern = re.compile(opts.id_pattern)
    
    out_file = os.path.join(opts.out_dir, opts.wordnet_id+".txt")
    if os.path.exists(out_file):
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print "%s output '%s' already exist, RETURN \n\n" % (tt, out_file)
        return
    
    icnt = 0
    good_cnt = 0
    fail_cnt = 0
    dup_cnt = 0
    id_url = {}
    for cur_u in url_list:
        icnt += 1
        try:
            m = id_pattern.match(cur_u)
            imgid = m.group('flickrid')
        except AttributeError:  # no match: match() returned None (assume url contains flickr.com/)
            print "\t err parsing URL: " + cur_u
            continue
        
        flickrid = int(imgid)
        if flickrid in id_url:
            dup_cnt += 1  # count the duplicate before skipping
            continue
        else:            
            cur_key = random.choice(api_keys)
            jinfo = cache_flickr_info(imgid, cur_key, rootdir=opts.json_dir)
            if 'stat' not in jinfo or jinfo['stat'] != 'ok':
                fail_cnt += 1               
                #if icnt%100 == 0 or DEBUG>2:
                #    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
                #    print "%s %5d/%5d flickr img not found: %s" % (tt, icnt, len(url_list), flickrid)
                    
            else:
                good_cnt += 1
                id_url[flickrid] = cur_u
                   
        if icnt%200 == 0: 
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d/%d urls processed, %d good records, %d dups, %d failed" \
                % (tt, icnt, len(url_list), good_cnt, dup_cnt, fail_cnt)
            
    # write out the resulting tuples
    
    if not os.path.exists(out_file):
        fh = open(out_file, 'wt')
        for ii, uu in id_url.iteritems():
            fh.write("%s\t%11d\t%s\n" % (opts.wordnet_id, ii, uu) )
        fh.close()
        
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print "%s %d urls processed, %d good records, %d dups, %d failed \n\n" \
            % (tt, len(url_list), good_cnt, dup_cnt, fail_cnt)
    else:
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print "%s %d urls processed, SKIP existing output file %s \n\n" \
            % (tt, len(url_list), out_file)
    return
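cache_flickr_info, used throughout, is also not included here. A minimal sketch of what it plausibly does, assuming the standard flickr.photos.getInfo REST endpoint and a one-file-per-photo JSON cache (the filename scheme is a guess):

def cache_flickr_info(imgid, api_key, rootdir=""):
    # sketch only -- fetch photo metadata as JSON, caching the raw response under rootdir
    import json
    cache_file = os.path.join(rootdir, str(imgid) + ".json") if rootdir else ""
    if cache_file and os.path.isfile(cache_file):
        return json.load(open(cache_file))
    url = ("https://api.flickr.com/services/rest/"
           "?method=flickr.photos.getInfo&api_key=%s&photo_id=%s"
           "&format=json&nojsoncallback=1") % (api_key, imgid)
    try:
        buf = urllib2.urlopen(url).read()
        jinfo = json.loads(buf)
    except Exception:
        return {}
    if cache_file:
        if not os.path.isdir(rootdir):
            os.makedirs(rootdir)
        open(cache_file, "w").write(buf)
    return jinfo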