Example #1
def get_conceptnet_words(db_file, addl_vocab=[]):
    A = divisi2.network.conceptnet_matrix('en')
    wa = A.row_labels
    #wb = A.col_labels
    #all_t2 = map(lambda s: s[2], list(wb))
    #ww = filter(lambda s: s==s.split()[0], wa) # keep 1-word terms only

    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    wd = {}
    cn_vocab = {}
    null_count = 0
    for w in wa:
        ww = w.split()
        vv = map(lambda s: norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        wd[w] = vv
        null_count += (not vv)
        if vv:
            for v in vv:
                if v not in cn_vocab:
                    cn_vocab[v] = 1

    #wd = dict(izip(ww, vv))
    conn.close()

    cn_vocab = cn_vocab.keys()

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s mapped %d concept-net nodes, %d empty" % (tt, len(wa),
                                                        null_count)

    return wd, cn_vocab
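
A minimal, self-contained sketch of the normalization pattern used above (split a multi-word label, normalize each token, drop empties, and collect the distinct survivors into a vocabulary); normalize below is a hypothetical stand-in for norm_tag, which needs the sqlite dictionary cursor:

def normalize(token, addl_vocab=()):
    # stand-in for norm_tag: keep lowercase alphabetic tokens that are either
    # in the extra vocabulary or at least 3 characters long (illustrative rule)
    t = token.lower().strip()
    if not t.isalpha():
        return ""
    return t if (t in addl_vocab or len(t) >= 3) else ""

def map_labels(labels, addl_vocab=()):
    # mirrors the wd / cn_vocab bookkeeping in get_conceptnet_words
    wd, seen = {}, {}
    for label in labels:
        vv = [v for v in (normalize(t, addl_vocab) for t in label.split()) if v]
        wd[label] = vv
        for v in vv:
            seen[v] = 1
    return wd, sorted(seen.keys())

# map_labels(["new york city", "dog", "a"]) returns
#   ({'new york city': ['new', 'york', 'city'], 'dog': ['dog'], 'a': []},
#    ['city', 'dog', 'new', 'york'])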
Example #2
def compile_synset_wordlist(w,
                            cw,
                            wn_cnt=None,
                            db_wn=None,
                            db_dict=None,
                            addl_vocab=None,
                            tag_count=None):
    dbcn = sqlite3.connect(db_dict)  # dictionary db
    dbcsr = dbcn.cursor()

    conn = sqlite3.connect(db_wn)  # wnet tag info db
    cursor = conn.cursor()

    nltk_id, wlist, self_syn = map_wn2nltk(w, cursor)
    synset_exist = bool(nltk_id)

    if tag_count:
        td = tag_count
    else:
        tcnt = cursor.execute("SELECT tag,count FROM wn_tag WHERE wnid=?",
                              (w, )).fetchall()
        td = dict(tcnt)

    synset_info = {}
    synset_info['nltk_id'] = nltk_id
    synset_info['num_flickr'] = cw  #wn_cnt[w]
    synset_info['tag_count'] = td
    children_wn = wn_get_hyponym(w, full=0)
    if wn_cnt:
        synset_info['children_wn'] = filter(lambda c: c in wn_cnt, children_wn)
    else:
        synset_info['children_wn'] = children_wn

    if not synset_exist:
        synset_info['self'] = {'depth': -1, 'words': []}
        synset_info['ancestor'] = {}
        synset_info['descendant'] = {}
        synset_info['other'] = {}
    else:
        nltk_id = nltk_id[0]
        ww = reduce(lambda u, v: u + v.split(), wlist, [])
        vv = map(lambda s: norm_tag(s, dbcsr, addl_vocab, 1), ww)
        vv = list(set(filter(len, vv)))

        synset_info['self'] = {'depth': self_syn.min_depth(), 'words': vv}
        synset_info['ancestor'] = get_ancestor_words(self_syn, 0, dbcsr,
                                                     addl_vocab)
        synset_info['descendant'] = get_descendant_words(
            self_syn, 0, dbcsr, addl_vocab)
        all_tags = set(synset_info['tag_count'].keys())
        all_hierarchy = set(
            synset_info['self']['words'] +
            reduce(lambda a, b: a + b, synset_info['ancestor'].values(), []) +
            reduce(lambda a, b: a + b, synset_info['descendant'].values(), []))
        synset_info['other'] = sorted(list(all_tags - all_hierarchy))

    dbcn.close()
    conn.close()

    return synset_info, wlist
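
The 'other' bucket computed above is a plain set difference between the tags observed for the synset and every word appearing somewhere in its hierarchy. A small self-contained sketch of that step (the example word lists are made up):

def split_other_tags(tag_count, self_words, ancestor_words, descendant_words):
    # ancestor_words / descendant_words are depth-keyed dicts of word lists,
    # as returned by get_ancestor_words / get_descendant_words
    hierarchy = set(self_words)
    for words in ancestor_words.values():
        hierarchy.update(words)
    for words in descendant_words.values():
        hierarchy.update(words)
    return sorted(set(tag_count.keys()) - hierarchy)

# split_other_tags({'dog': 5, 'beach': 2, 'puppy': 3},
#                  ['dog'], {-1: ['canine']}, {1: ['puppy']})  ->  ['beach']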
Example #3
def get_descendant_words(self_syn, self_depth, db_cursor, addl_vocab):
    psyn = self_syn.hyponyms()
    if not psyn:
        return {}

    vlist = []
    for p in psyn:
        plemma = map(lambda s: s.name, p.lemmas)
        wlist = map(lambda s: s.lower().replace("_", " ").split(), plemma)
        ww = reduce(lambda u, v: u + v, wlist, [])
        vv = map(lambda s: norm_tag(s, db_cursor, addl_vocab, 1), ww)
        vv = filter(len, vv)

        vlist += vv

    cur_d = self_depth + 1
    children_words = {}
    children_words[cur_d] = sorted(list(set(vlist)))
    #{'depth': self_depth-1,
    #    'words': sorted(list(set(vlist))) }

    for p in psyn:
        pw = get_descendant_words(p, cur_d, db_cursor, addl_vocab)
        # aggregate outcome by depth
        if pw:
            for d in pw.iterkeys():
                if d in children_words:
                    children_words[d] = sorted(
                        list(set(children_words[d]) | set(pw[d])))
                else:
                    children_words[d] = pw[d]

    return children_words
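
The aggregation step of the recursion above (union two depth-keyed word dictionaries, level by level) does not depend on WordNet and can be checked on its own; a minimal sketch:

def merge_by_depth(acc, extra):
    # union two {depth: [words]} dicts, keeping each level sorted and unique
    for d, words in extra.items():
        if d in acc:
            acc[d] = sorted(set(acc[d]) | set(words))
        else:
            acc[d] = sorted(set(words))
    return acc

# merge_by_depth({1: ['cat', 'dog']}, {1: ['dog', 'wolf'], 2: ['puppy']})
#   -> {1: ['cat', 'dog', 'wolf'], 2: ['puppy']}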
Example #4
def get_descendant_words(self_syn, self_depth, db_cursor, addl_vocab):
    psyn = self_syn.hyponyms()
    if not psyn:
        return {}
    
    vlist = []
    for p in psyn:
        plemma = map(lambda s: s.name, p.lemmas)
        wlist = map(lambda s: s.lower().replace("_", " ").split(), plemma)
        ww = reduce(lambda u,v: u + v, wlist, [])
        vv = map(lambda s:norm_tag(s, db_cursor, addl_vocab,1), ww)
        vv = filter(len, vv)
        
        vlist += vv
    
    cur_d = self_depth + 1
    children_words = {}
    children_words[cur_d] = sorted(list(set(vlist)))
    #{'depth': self_depth-1,
    #    'words': sorted(list(set(vlist))) }
    
    for p in psyn:
        pw = get_descendant_words(p, cur_d, db_cursor, addl_vocab)
        # aggregate outcome by depth
        if pw:
            for d in pw.iterkeys():
                if d in children_words :
                    children_words[d] = sorted(list( set(children_words[d]) | set(pw[d]) ))
                else:
                    children_words[d] = pw[d]
    
    return children_words
Example #5
def get_conceptnet_words(db_file, addl_vocab=[]):
    A = divisi2.network.conceptnet_matrix('en')
    wa = A.row_labels
    #wb = A.col_labels
    #all_t2 = map(lambda s: s[2], list(wb))
    #ww = filter(lambda s: s==s.split()[0], wa) # keep 1-word terms only
    
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    wd = {}
    cn_vocab = {}
    null_count = 0
    for w in wa:
        ww = w.split()
        vv = map(lambda s:norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        wd[w] = vv
        null_count += (not vv)
        if vv :
            for v in vv:
                if v not in cn_vocab:
                    cn_vocab[v] = 1
            
    #wd = dict(izip(ww, vv))
    conn.close()
    
    cn_vocab = cn_vocab.keys()
    
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s mapped %d concept-net nodes, %d empty" % (tt, len(wa), null_count)
    
    return wd, cn_vocab
Example #6
def compile_synset_wordlist(w, cw, wn_cnt=None, db_wn=None, db_dict=None, addl_vocab=None, tag_count=None):
    dbcn = sqlite3.connect(db_dict) # dictionary db
    dbcsr = dbcn.cursor()
    
    conn = sqlite3.connect(db_wn) # wnet tag info db
    cursor = conn.cursor()
    
    nltk_id, wlist, self_syn = map_wn2nltk(w, cursor)
    synset_exist = bool(nltk_id)
    
    if tag_count:
        td = tag_count
    else:
        tcnt = cursor.execute("SELECT tag,count FROM wn_tag WHERE wnid=?", (w,)).fetchall()
        td = dict(tcnt)
        
    synset_info = {}
    synset_info['nltk_id'] = nltk_id
    synset_info['num_flickr'] = cw #wn_cnt[w]
    synset_info['tag_count'] = td
    children_wn = wn_get_hyponym(w, full=0)
    if wn_cnt:
        synset_info['children_wn'] = filter(lambda c: c in wn_cnt, children_wn)
    else:
        synset_info['children_wn'] = children_wn
    
    if not synset_exist:
        synset_info['self'] = {'depth': -1,
                              'words': [] }
        synset_info['ancestor'] = {}
        synset_info['descendant'] = {}
        synset_info['other'] = {}
    else:
        nltk_id = nltk_id[0]
        ww = reduce(lambda u,v: u + v.split(), wlist, [])
        vv = map(lambda s:norm_tag(s, dbcsr, addl_vocab,1), ww)
        vv = list(set(filter(len, vv)))
        
        synset_info['self'] = {'depth': self_syn.min_depth(),
                                  'words': vv }
        synset_info['ancestor'] = get_ancestor_words(self_syn, 0, dbcsr, addl_vocab)
        synset_info['descendant'] = get_descendant_words(self_syn, 0, dbcsr, addl_vocab)
        all_tags = set(synset_info['tag_count'].keys())
        all_hierarchy = set(synset_info['self']['words'] 
                             + reduce(lambda a,b: a+b, synset_info['ancestor'].values(), []) 
                             + reduce(lambda a,b: a+b, synset_info['descendant'].values(), []) )
        synset_info['other'] = sorted(list( all_tags - all_hierarchy ))
    
    
    dbcn.close()
    conn.close()
    
    return synset_info, wlist
Example #7
def get_conceptnet_words(db_file):
    A = divisi2.network.conceptnet_matrix('en')
    wa = A.row_labels

    ww = filter(lambda s: s == s.split()[0], wa)  # keep 1-word terms only

    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    vv = map(lambda s: norm_tag(s, cursor), ww)
    wd = dict(izip(ww, vv))
    conn.close()

    return wd
Example #8
def get_conceptnet_words(db_file):
    A = divisi2.network.conceptnet_matrix('en')
    wa = A.row_labels
    
    ww = filter(lambda s: s==s.split()[0], wa) # keep 1-word terms only
    
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()    
    vv = map(lambda s:norm_tag(s, cursor), ww)
    wd = dict(izip(ww, vv))
    conn.close()
    
    return wd
Example #9
def norm_tag_file(argv):
    """
        clean tags for one single file
    """
    if len(argv) < 2:
        argv = ['-h']
    parser = OptionParser(description='rank tags of a single image + compose sentences')
    parser.add_option('-i', '--in_file', dest='in_file', default="", help='input file')
    parser.add_option('-o', '--out_dir', dest='out_dir', default="db", help='output directory')
    parser.add_option('-d', '--db_dir', dest='db_dir', default="", help='dir containing sqlite db and other data')
    parser.add_option("", '--db_dict', dest='db_dict', default="dict.db", help='dictionary')
    parser.add_option("", '--addl_vocab', dest='addl_vocab', default="places_etc.txt", help='')
    
    (opts, __args) = parser.parse_args(sys.argv)
    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab), 'rt').read().split()
    
    out_fh = open(os.path.join(opts.out_dir, os.path.split(opts.in_file)[1]), 'wt')
    conn = sqlite3.connect(db_dict)
    cursor = conn.cursor()
    
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s processing %s ..." % (tt, opts.in_file)
    cnt = 0
    ecnt = 0
    tcnt = 0
    for cl in codecs.open(opts.in_file, encoding='utf-8'):
        imid = cl.split()[0]
        w = cl.split()[1]
        ww = w.split(",")
        vv = map(lambda s:norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        if vv:
            #tag_dict[imid] = vv
            out_fh.write("%s\t%s\n" % (imid, ",".join(vv)))
            tcnt += len(vv)
        else:
            ecnt += 1
        
        cnt += 1
        if cnt % 5000 == 0:
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d img-id processed, %d tags, %d empty" % (tt, cnt, tcnt, ecnt)
    
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s %d img-id processed, %d tags, %d empty. Done.\n" % (tt, cnt, tcnt, ecnt)
    conn.close()
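
The loop above assumes one image per line: an image id, whitespace, then a comma-joined tag string; lines whose tags all normalize to nothing are counted as empty. A sketch of that per-line transform, with a hypothetical normalizer standing in for norm_tag:

def norm_tag_line(line, normalize):
    # returns (imid, cleaned_tag_string) or (imid, None) if nothing survives
    parts = line.split()
    imid, raw = parts[0], parts[1]
    vv = [v for v in (normalize(t) for t in raw.split(",")) if v]
    return imid, (",".join(vv) if vv else None)

# norm_tag_line("12345 Dog,IMG_0001,beach",
#               lambda t: t.lower() if t.isalpha() else "")
#   -> ('12345', 'dog,beach')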
Example #10
def get_wordnet_words(db_wn, wnid_file, db_dict, vocab):
    wn_freq = {}
    for line in open(wnid_file):
        t = line.strip().split()
        wn_freq[t[1]] = int(t[0])

    wn_words = {}

    condict = sqlite3.connect(db_dict)
    curdict = condict.cursor()

    conn = sqlite3.connect(db_wn)
    cursor = conn.cursor()
    wcnt = 0
    vcnt = 0
    for k in wn_freq.iterkeys():
        #stmt = ("SELECT W.wnid, W.word1, WW.word " +
        #    "FROM wordnet as W, wordnet_word AS WW " +
        #    "WHERE W.wnid='%s' AND W.wnid=WW.wnid" % k )
        # parameterized query avoids manual quoting of the wnid value
        cursor.execute("SELECT word FROM wordnet_word WHERE wnid=?", (k, ))
        ww = map(lambda t: t[0], cursor.fetchall())
        vv = map(lambda s: norm_tag(s.lower(), curdict), ww)
        vv = filter(lambda s: len(s) and s in vocab, vv)
        vv = list(set(vv))
        wcnt += 1
        if vv:
            wn_words[k] = vv
            vcnt += 1
        if wcnt % 1000 == 0:
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d/%d synsets processed, %d have tags" % (
                tt, wcnt, len(wn_freq), vcnt)

    conn.close()
    condict.close()
    uniq_v = reduce(lambda v, s: v + s, wn_words.itervalues(), [])
    uniq_v = list(set(uniq_v))
    print " %d / %d synsets have non-empty mapping to %d words" % (
        len(wn_words), len(wn_freq), len(uniq_v))

    return wn_words
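
The final reduce concatenates every per-synset word list only to count distinct words; an equivalent way to get the same set without building the large intermediate list is a plain set union, sketched below:

def count_unique_words(wn_words):
    # wn_words: {wnid: [normalized words]}; returns the set of distinct words
    uniq = set()
    for words in wn_words.values():
        uniq.update(words)
    return uniq

# len(count_unique_words({'n02084071': ['dog', 'puppy'],
#                         'n02121620': ['cat', 'dog']}))  ->  3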
Example #11
def get_wordnet_words(db_wn, wnid_file, db_dict, vocab):
    wn_freq = {}
    for line in open(wnid_file):
        t = line.strip().split()
        wn_freq[t[1]] = int(t[0])
    
    wn_words = {}
    
    condict = sqlite3.connect(db_dict)
    curdict = condict.cursor()
    
    conn = sqlite3.connect(db_wn)
    cursor = conn.cursor()
    wcnt = 0
    vcnt = 0
    for k in wn_freq.iterkeys():
        #stmt = ("SELECT W.wnid, W.word1, WW.word " + 
        #    "FROM wordnet as W, wordnet_word AS WW " + 
        #    "WHERE W.wnid='%s' AND W.wnid=WW.wnid" % k )
        # parameterized query avoids manual quoting of the wnid value
        cursor.execute("SELECT word FROM wordnet_word WHERE wnid=?", (k,))
        ww = map(lambda t: t[0], cursor.fetchall())
        vv = map(lambda s:norm_tag(s.lower(), curdict), ww)
        vv = filter(lambda s: len(s) and s in vocab, vv)
        vv = list(set(vv))
        wcnt += 1
        if vv:
            wn_words[k] = vv
            vcnt += 1
        if wcnt % 1000==0:
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d/%d synsets processed, %d have tags" % (tt, wcnt, len(wn_freq), vcnt)
        
    conn.close()
    condict.close()
    uniq_v = reduce(lambda v, s: v+s, wn_words.itervalues(), [])
    uniq_v = list( set(uniq_v) )
    print " %d / %d synsets have non-empty mapping to %d words" % (len(wn_words), len(wn_freq), len(uniq_v))
    
    return wn_words
Example #12
def norm_words(in_wlist, db_file, addl_vocab=[]):
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    wd = {}
    cn_vocab = {}
    null_count = 0
    for w in in_wlist:
        ww = w.split()
        vv = map(lambda s: norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        wd[w] = vv
        null_count += (not vv)
        if vv:
            for v in vv:
                if v not in cn_vocab:
                    cn_vocab[v] = 1

    #wd = dict(izip(ww, vv))
    conn.close()

    cn_vocab = cn_vocab.keys()
    return cn_vocab
Example #13
def norm_words(in_wlist, db_file, addl_vocab=[]):
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    wd = {}
    cn_vocab = {}
    null_count = 0
    for w in in_wlist:
        ww = w.split()
        vv = map(lambda s:norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        wd[w] = vv
        null_count += (not vv)
        if vv :
            for v in vv:
                if v not in cn_vocab:
                    cn_vocab[v] = 1
            
    #wd = dict(izip(ww, vv))
    conn.close()
    
    cn_vocab = cn_vocab.keys()
    return cn_vocab
Example #14
def get_ancestor_words(self_syn, self_depth, db_cursor, addl_vocab):
    psyn = self_syn.hypernyms()
    if not psyn:
        return {}

    vlist = []
    for p in psyn:
        plemma = map(lambda s: s.name, p.lemmas)
        wlist = map(lambda s: s.lower().replace("_", " ").split(), plemma)
        ww = reduce(lambda u, v: u + v, wlist, [])
        vv = map(lambda s: norm_tag(s, db_cursor, addl_vocab, 1), ww)
        vv = filter(len, vv)

        vlist += vv

    cur_d = self_depth - 1
    parent_words = {}
    parent_words[cur_d] = sorted(list(set(vlist)))
    #{'depth': self_depth-1,
    #    'words': sorted(list(set(vlist))) }

    for p in psyn:
        pw = get_ancestor_words(p, cur_d, db_cursor, addl_vocab)
        # aggregate outcome by depth
        if pw:
            try:
                for d in pw.iterkeys():
                    if d in parent_words:  # merge two dicts
                        parent_words[d] = sorted(
                            list(set(parent_words[d]) | set(pw[d])))
                    else:
                        parent_words[d] = pw[d]
            except:
                print pw
                raise

    return parent_words
Example #15
def get_ancestor_words(self_syn, self_depth, db_cursor, addl_vocab):
    psyn = self_syn.hypernyms()
    if not psyn:
        return {}
    
    vlist = []
    for p in psyn:
        plemma = map(lambda s: s.name, p.lemmas)
        wlist = map(lambda s: s.lower().replace("_", " ").split(), plemma)
        ww = reduce(lambda u,v: u + v, wlist, [])
        vv = map(lambda s:norm_tag(s, db_cursor, addl_vocab,1), ww)
        vv = filter(len, vv)
        
        vlist += vv
    
    cur_d = self_depth-1
    parent_words = {}
    parent_words[cur_d] = sorted(list(set(vlist)))
    #{'depth': self_depth-1,
    #    'words': sorted(list(set(vlist))) }
    
    for p in psyn:
        pw = get_ancestor_words(p, cur_d, db_cursor, addl_vocab)
        # aggregate outcome by depth
        if pw:
            try:
                for d in pw.iterkeys():
                    if d in parent_words : # merge two dicts
                        parent_words[d] = sorted(list( set(parent_words[d]) | set(pw[d]) ))
                    else:
                        parent_words[d] = pw[d]
            except:
                print pw
                raise
    
    return parent_words
Example #16
def construct_training_data(argv):
    opts, db_dict, _addl_vocab, _db_wn = options_get_wnet_tag(argv)
    addl_vocab = []
    dbcn = sqlite3.connect(db_dict) # dictionary db
    dbcsr = dbcn.cursor()
    
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s start processing '%s' " % (tt, opts.in_wnet_list)
    
    if os.path.isfile(opts.in_wnet_list):
        wnet_list = open(opts.in_wnet_list, 'rt').read().split()
    else:
        wnet_list = opts.in_wnet_list.split(",")
        
    imgid_list, usr_list, tag_dict = ([], [], {})
    
    for (iw, wn) in enumerate(wnet_list):
        if iw > opts.endnum:
            break
        
        wtag_file = os.path.join(opts.data_home, opts.wnet_list_dir+"_tags", wn+'.tags.txt')
        ilist, ulist, tdict = get_wnet_tags(wn, wtag_file, opts, False)
        imgid_list += ilist
        usr_list += ulist
        for u in tdict:
            tag_dict[u] = tdict[u] # imgid is unique 
    
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "\n%s read %d synsets, found %d imgs from %d unique usrs" % (tt, len(wnet_list), len(imgid_list), len(set(usr_list)))
    tnum = map(lambda k: len(tag_dict[k]), tag_dict.iterkeys())
    snum = sum(tnum)
    print "%s %d tags in %d imgs, avg %0.4f tags per img" % (tt, snum, len(imgid_list), 1.*snum/len(tnum))
    
    #if 0:
    usr_tag = {}
    tag_cnt = {}
    empty_cnt = 0
    
    for (imid, u) in zip(imgid_list, usr_list):
        if u not in usr_tag:
            usr_tag[u] = {}
        
        vv = map(lambda s:norm_tag(s, dbcsr, addl_vocab,1), tag_dict[imid])
        vv = list(set(filter(len, vv)))
        for v in vv:
            if v in usr_tag[u]:
                usr_tag[u][v] += 1
            else:
                usr_tag[u][v] = 1 # first time seeing user u use tag v
                if v in tag_cnt:
                    tag_cnt[v] += 1
                else:
                    tag_cnt[v] = 1
        
    tag_list = filter(lambda t: tag_cnt[t]>1, tag_cnt.keys())
    tag_list.sort()
    tag_num = map(lambda t: tag_cnt[t], tag_list)
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s obtained %d usrs, %d out of %d tags with cnt>1, %d empty imgs " % (tt, len(usr_tag), len(tag_list), len(tag_cnt), empty_cnt)
    
    lcnt = 0
    bg_dict = {}
    #wn_tagf = open(os.path.join(opts.data_home, opts.wnet_out_dir, wn+".txt"), "wt")
    for u in usr_tag:
        vv = filter(lambda v: v in tag_list, usr_tag[u].keys())
        if vv:
            #wn_tagf.write(u+"\t"+ ",".join(vv) + "\n")
            accumulate_bg(vv, bg_dict, None, None, addl_vocab=[])
            lcnt += 1
            if lcnt<20: print u+"\t"+ ",".join(vv)
    
    #Nusr = len(usr_tag)
    lcnt = 0
    bg_tuples = sort_bg(bg_dict)
    #print bg_tuples
    bigram_list = []
    for u, v, c in bg_tuples:
        iu = tag_list.index(u)
        iv = tag_list.index(v)
        bigram_list += [iu, iv, c]
    
    wn_out_mat = os.path.split(opts.in_wnet_list)[1]
    wn_out_mat = os.path.splitext(wn_out_mat)[0]
    wn_out_mat = os.path.join(os.path.split(opts.in_wnet_list)[0], wn_out_mat+".mat")
    data = {'wnet_list': wnet_list, 'usr_list': usr_list, 'tag_list': tag_list, 'tag_cnt':tag_num, 'bigram_list': bigram_list}
    io.savemat(wn_out_mat, data)
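
The bigram_list saved above is a flat sequence of (tag-index-u, tag-index-v, count) triples, with indices taken from the sorted tag_list. A small sketch of that encoding and its inverse (the helper names here are illustrative only):

def encode_bigrams(bg_tuples, tag_list):
    # flatten (u, v, count) tag pairs into [iu, iv, c, iu, iv, c, ...]
    flat = []
    for u, v, c in bg_tuples:
        flat += [tag_list.index(u), tag_list.index(v), c]
    return flat

def decode_bigrams(flat, tag_list):
    return [(tag_list[flat[i]], tag_list[flat[i + 1]], flat[i + 2])
            for i in range(0, len(flat), 3)]

# tags = ['beach', 'dog', 'sand']
# encode_bigrams([('beach', 'sand', 7)], tags)  ->  [0, 2, 7]
# decode_bigrams([0, 2, 7], tags)               ->  [('beach', 'sand', 7)]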
Example #17
def rank_tags(argv):
    if len(argv) < 2:
        argv = ['-h']
    parser = OptionParser(description='rank tags of a single image + compose sentences')
    parser.add_option('-d', '--db_dir', dest='db_dir', default="", help='dir containing sqlite db and other data')
    parser.add_option('-n', '--num_output', dest='num_output', type="int", 
                      default=3, help='number of output images to examine')
    parser.add_option("", '--db_dict', dest='db_dict', default="dict.db", help='dictionary')
    parser.add_option("", '--vocab_score', dest='vocab_score', default="flickr_vscore.txt", 
                      help='file containing vocabulary count')
    parser.add_option("", '--tag_file', dest="tag_file", default="demo-data/24.cache", help="")
    parser.add_option("", '--wn_list', dest='wn_list', default="wnet-50.txt", help='')
    parser.add_option("", '--addl_vocab', dest='addl_vocab', default="places_etc.txt", help='')
    
    (opts, __args) = parser.parse_args(sys.argv)
    
    # intersect the two dictionaries first
    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    
    tag_file = os.path.join(opts.db_dir, opts.tag_file)
    
    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab), 'rt').read().split()
    vocab_lines = open(os.path.join(opts.db_dir, opts.vocab_score), 'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    vocab_score = {}
    for vl in vocab_lines:
        t = vl.split()
        # [word, score, prc]
        vocab_score[t[0]] = map(float, t[1:])
    
    # gulp all the tags
    vocab_lines = open(tag_file, 'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    img_tag = {}
    for vl in vocab_lines:
        t = vl.split()        
        #print t
        img_tag[t[0]] = t[1]
    print "read %d tags, %d images" % ( len(vocab_score), len(img_tag) ) 
    
    id_list = img_tag.keys()
    if opts.num_output<0:
        random.shuffle(id_list)
        num_output = - opts.num_output
        id_select = id_list[:num_output*10]
    elif opts.num_output>1e5:
        id_select = [str(opts.num_output)]
        num_output = 1
    else:
        num_output = opts.num_output
        id_select = id_list[:num_output*10]
    
    
    icnt = 0
    
    api_keys = open(FLICKR_KEY_FILE, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)
    
    conn = sqlite3.connect(db_dict)
    conn.text_factory = str
    cursor = conn.cursor()
    
    for cur_id in id_select:
        ww = img_tag[cur_id].split(",")
        vv = map(lambda s:norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        #find the vscore of vv
        vs = map(lambda v: vocab_score[v], vv)
        
        # get flickr picture url
        #http://farm{farm-id}.staticflickr.com/{server-id}/{id}_{secret}.jpg
        if 1:
            cur_key = api_keys [random.randint(0, len(api_keys)-1)]
            jinfo = cache_flickr_info(cur_id, cur_key, rootdir="")
            p = jinfo['photo']
            imgurl = 'http://farm%s.staticflickr.com/%s/%s_%s.jpg' % (p["farm"], p["server"], p['id'], p['secret'])
        else:
            imgurl = ""
        #print zip(vv, vs)
        if len(vv) > 5:
            icnt += 1
            # print results
            print "\nimg: %s" % (imgurl if imgurl else cur_id)
            vtup = sorted(map(lambda s,t: (s, t[0], t[1]), vv, vs), key=itemgetter(2), reverse=True)
            outstr = ""
            for i, t in enumerate(vtup):
                outstr += "%s (%0.3f,%2.1f%%)\t"%(t[0],t[1],100*t[2])
                if (i+1)%3==0:
                    outstr += "\n" 
            print outstr
            """
            print "visual tags: " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]>.9 else "", vv, vs ) )
            print "other      : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<=.9 and s[1]>=.6 else "", vv, vs ) )
            print "non-visual : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<.6 else "", vv, vs ) )
            """
            print ""
        else:
            pass
        
        if icnt >= num_output:
            break
        
    conn.close()
Example #18
def rank_tags(argv):
    if len(argv) < 2:
        argv = ['-h']
    parser = OptionParser(
        description='rank tags of a single image + compose sentences')
    parser.add_option('-d',
                      '--db_dir',
                      dest='db_dir',
                      default="",
                      help='dir containing sqlite db and other data')
    parser.add_option('-n',
                      '--num_output',
                      dest='num_output',
                      type="int",
                      default=3,
                      help='number of output images to examine')
    parser.add_option("",
                      '--db_dict',
                      dest='db_dict',
                      default="dict.db",
                      help='dictionary')
    parser.add_option("",
                      '--vocab_score',
                      dest='vocab_score',
                      default="flickr_vscore.txt",
                      help='file containing vocabulary count')
    parser.add_option("",
                      '--tag_file',
                      dest="tag_file",
                      default="demo-data/24.cache",
                      help="")
    parser.add_option("",
                      '--wn_list',
                      dest='wn_list',
                      default="wnet-50.txt",
                      help='')
    parser.add_option("",
                      '--addl_vocab',
                      dest='addl_vocab',
                      default="places_etc.txt",
                      help='')

    (opts, __args) = parser.parse_args(sys.argv)

    # intersect the two dictionaries first
    db_dict = os.path.join(opts.db_dir, opts.db_dict)

    tag_file = os.path.join(opts.db_dir, opts.tag_file)

    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab),
                      'rt').read().split()
    vocab_lines = open(os.path.join(opts.db_dir, opts.vocab_score),
                       'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    vocab_score = {}
    for vl in vocab_lines:
        t = vl.split()
        # [word, score, prc]
        vocab_score[t[0]] = map(float, t[1:])

    # gulp all the tags
    vocab_lines = open(tag_file, 'rt').read().split("\n")
    vocab_lines = filter(len, vocab_lines)
    img_tag = {}
    for vl in vocab_lines:
        t = vl.split()
        #print t
        img_tag[t[0]] = t[1]
    print "read %d tags, %d images" % (len(vocab_score), len(img_tag))

    id_list = img_tag.keys()
    if opts.num_output < 0:
        random.shuffle(id_list)
        num_output = -opts.num_output
        id_select = id_list[:num_output * 10]
    elif opts.num_output > 1e5:
        id_select = [str(opts.num_output)]
        num_output = 1
    else:
        num_output = opts.num_output
        id_select = id_list[:num_output * 10]

    icnt = 0

    api_keys = open(FLICKR_KEY_FILE, 'r').read().split()
    api_keys = map(lambda s: s.strip(), api_keys)

    conn = sqlite3.connect(db_dict)
    conn.text_factory = str
    cursor = conn.cursor()

    for cur_id in id_select:
        ww = img_tag[cur_id].split(",")
        vv = map(lambda s: norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        #find the vscore of vv
        vs = map(lambda v: vocab_score[v], vv)

        # get flickr picture url
        #http://farm{farm-id}.staticflickr.com/{server-id}/{id}_{secret}.jpg
        if 1:
            cur_key = api_keys[random.randint(0, len(api_keys) - 1)]
            jinfo = cache_flickr_info(cur_id, cur_key, rootdir="")
            p = jinfo['photo']
            imgurl = 'http://farm%s.staticflickr.com/%s/%s_%s.jpg' % (
                p["farm"], p["server"], p['id'], p['secret'])
        else:
            imgurl = ""
        #print zip(vv, vs)
        if len(vv) > 5:
            icnt += 1
            # print results
            print "\nimg: %s" % (imgurl if imgurl else cur_id)
            vtup = sorted(map(lambda s, t: (s, t[0], t[1]), vv, vs),
                          key=itemgetter(2),
                          reverse=True)
            outstr = ""
            for i, t in enumerate(vtup):
                outstr += "%s (%0.3f,%2.1f%%)\t" % (t[0], t[1], 100 * t[2])
                if (i + 1) % 3 == 0:
                    outstr += "\n"
            print outstr
            """
            print "visual tags: " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]>.9 else "", vv, vs ) )
            print "other      : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<=.9 and s[1]>=.6 else "", vv, vs ) )
            print "non-visual : " + ", ".join( map(lambda v, s: "%s (%0.3f)"%(v,s[0]) if s[1]<.6 else "", vv, vs ) )
            """
            print ""
        else:
            pass

        if icnt >= num_output:
            break

    conn.close()
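
The image URL is assembled from four fields of the cached photo-info record, following the farm/server/id/secret pattern noted in the comment above. A standalone sketch (the field values in the example are made up):

def flickr_static_url(photo):
    # http://farm{farm-id}.staticflickr.com/{server-id}/{id}_{secret}.jpg
    return 'http://farm%s.staticflickr.com/%s/%s_%s.jpg' % (
        photo["farm"], photo["server"], photo["id"], photo["secret"])

# flickr_static_url({'farm': '5', 'server': '4096',
#                    'id': '24', 'secret': 'abc123'})
#   -> 'http://farm5.staticflickr.com/4096/24_abc123.jpg'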
Example #19
def analyze_tag_pairs(argv):
    # parser = OptionParser(description='compile tags for all imgs in a wnet synset')
    opts, db_dict, addl_vocab, db_wn = options_get_wnet_tag(argv)

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s start processing '%s' " % (tt, opts.in_wnet_list)

    dbcn = sqlite3.connect(db_dict)  # dictionary db
    dbcsr = dbcn.cursor()

    if os.path.isfile(opts.in_wnet_list):
        wnet_list = open(opts.in_wnet_list, 'rt').read().split()
    else:
        wnet_list = opts.in_wnet_list.split(",")

    for (iw, wn) in enumerate(wnet_list):
        if iw > opts.endnum:
            break

        wtag_file = os.path.join(opts.data_home, opts.wnet_list_dir + "_tags",
                                 wn + '.tags.txt')
        imgid_list, usr_list, tag_dict = get_wnet_tags(wn, wtag_file, opts,
                                                       False)

        usr_tag = {}
        tag_cnt = {}
        empty_cnt = 0

        #for u, utuple in groupby(zip(usr_list, imgid_list), lambda x: x[0]):
        for (imid, u) in zip(imgid_list, usr_list):
            if u not in usr_tag:
                usr_tag[u] = {}

            vv = map(lambda s: norm_tag(s, dbcsr, addl_vocab, 1),
                     tag_dict[imid])
            vv = list(set(filter(len, vv)))
            for v in vv:
                if v in usr_tag[u]:
                    usr_tag[u][v] += 1
                else:
                    usr_tag[u][v] = 1  # first time seeing user u use tag v
                    if v in tag_cnt:
                        tag_cnt[v] += 1
                    else:
                        tag_cnt[v] = 1

            #for v in list(set(usr_tag[u])):

        tag_list = filter(lambda t: tag_cnt[t] > 1, tag_cnt.keys())
        tag_list.sort()
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print "%s obtained %d usrs, %d non-trivial tags, %d empty imgs" % (
            tt, len(usr_tag), len(tag_list), empty_cnt)

        ulist = []
        tcnt_mat = np.zeros((len(usr_list), len(tag_list)))
        icnt = 0
        #out_dat_fh = open(os.path.join(opts.data_home, opts.wnet_out_dir, wn+".dat"), "wt")
        for u in list(set(usr_list)):
            jj = -1
            outstr = ""
            if usr_tag[u]:
                for t in usr_tag[u]:
                    if t in tag_list:
                        jj = tag_list.index(t)
                        tcnt_mat[icnt, jj] += 1
                        outstr += "%d:%d " % (jj + 1, usr_tag[u][t])
            else:
                pass  # empty list of tags, skip

            if outstr:
                #out_dat_fh.write( outstr + "\n" )
                icnt += 1
                ulist.append(u)

        tcnt = tcnt_mat[:icnt, :]

        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print '%s wnet "%s": %d unique tags by %d users\n' % (
            tt, wn, len(tag_list), len(ulist))

        # get and print synset info
        td = dict([(k, tag_cnt[k]) for k in tag_list])
        syn_info, wlist = compile_synset_wordlist(wn, len(ulist), None, db_wn,
                                                  db_dict, addl_vocab, td)
        hi_words = [
            syn_info['self']['words']
        ] + syn_info['ancestor'].values() + syn_info['descendant'].values()
        hi_depth = [
            0
        ] + syn_info['ancestor'].keys() + syn_info['descendant'].keys()
        other_words = list(
            set(tag_list) - set(reduce(lambda a, b: a + b, hi_words, [])))
        print_synset_info(syn_info, wn, wlist, len(ulist))

        wn_out_mat = os.path.join(opts.data_home, opts.wnet_out_dir,
                                  wn + ".mat")
        tcnt = tcnt_mat[:icnt, :]
        #mlab.save(wn_out_mat, 'img_list', 'tag_list', 'tcnt_mat')
        data = {
            'usr_list': ulist,
            'tag_list': tag_list,
            'tcnt': tcnt,
            'other_words': other_words,
            'hier_words': hi_words,
            'hier_depth': hi_depth
        }
        io.savemat(wn_out_mat, data)

        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print '%s saved to %s \n\n' % (tt, wn_out_mat)

        print "\n tag occurrence:"
        hi_words = reduce(lambda a, b: a + b, hi_words, [])
        lcnt = 0
        bg_dict = {}
        wn_tagf = open(
            os.path.join(opts.data_home, opts.wnet_out_dir, wn + ".txt"), "wt")
        for u in usr_tag:
            vv = filter(lambda v: v in tag_list, usr_tag[u].keys())
            if vv:
                wn_tagf.write(u + "\t" + ",".join(vv) + "\n")
                accumulate_bg(vv, bg_dict, None, None, addl_vocab=[])
                lcnt += 1
                if lcnt < 20: print u + "\t" + ",".join(vv)
        wn_tagf.close()

        print "\nbigrams#\tMI \ttype\ttag1,tag2\trelations"
        Nusr = len(usr_tag)
        lcnt = 0
        bg_tuples = sort_bg(bg_dict)
        wn_bgf = open(
            os.path.join(opts.data_home, opts.wnet_out_dir,
                         wn + ".bigram.txt"), "wt")
        for u, v, c in bg_tuples:
            if c < 3: break
            assr = cm.Assertion.objects.filter(concept1__text=u,
                                               concept2__text=v,
                                               language=en)
            atxt = map(lambda a: str(a).strip("[]"), assr)
            atxt += map(
                lambda a: str(a).strip("[]"),
                cm.Assertion.objects.filter(concept1__text=v,
                                            concept2__text=u,
                                            language=en))
            try:
                mi = binary_mutual_info(Nusr, tag_cnt[u], tag_cnt[v], c)
            except:
                print u, v, c, len(usr_tag)
                raise

            if u in other_words and v in other_words:
                btype = "OO"
            elif u in hi_words and v in hi_words:
                btype = "HH"
            else:
                btype = "HO"

            outstr = "%s\t%0.5f\t%0.5f\t%s\t%s,%s\t%s" % \
                (syn_info['nltk_id'], 1.*c/Nusr, mi, btype, u, v, ";".join(atxt) if atxt else "None")
            wn_bgf.write(outstr + "\n")
            lcnt += 1
            if lcnt < 50: print outstr

        wn_bgf.close()
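
binary_mutual_info is not shown in these examples; one plausible implementation, assuming it measures the mutual information between the two binary "user used tag u" / "user used tag v" indicators given the four counts passed above, is sketched here:

from math import log

def binary_mutual_info(n, n_u, n_v, n_uv):
    # n users total, n_u used tag u, n_v used tag v, n_uv used both;
    # sum p(x,y) * log(p(x,y) / (p(x) * p(y))) over the four joint cells (nats)
    mi = 0.0
    cells = ((n_uv, n_u, n_v),
             (n_u - n_uv, n_u, n - n_v),
             (n_v - n_uv, n - n_u, n_v),
             (n - n_u - n_v + n_uv, n - n_u, n - n_v))
    for joint, mu, mv in cells:
        if joint > 0:
            p = float(joint) / n
            mi += p * log(p * n * n / (float(mu) * mv))
    return mi

# binary_mutual_info(100, 20, 30, 15) -> about 0.112 (tags co-occur more than chance)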
Example #20
def construct_training_data(argv):
    opts, db_dict, _addl_vocab, _db_wn = options_get_wnet_tag(argv)
    addl_vocab = []
    dbcn = sqlite3.connect(db_dict)  # dictionary db
    dbcsr = dbcn.cursor()

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s start processing '%s' " % (tt, opts.in_wnet_list)

    if os.path.isfile(opts.in_wnet_list):
        wnet_list = open(opts.in_wnet_list, 'rt').read().split()
    else:
        wnet_list = opts.in_wnet_list.split(",")

    imgid_list, usr_list, tag_dict = ([], [], {})

    for (iw, wn) in enumerate(wnet_list):
        if iw > opts.endnum:
            break

        wtag_file = os.path.join(opts.data_home, opts.wnet_list_dir + "_tags",
                                 wn + '.tags.txt')
        ilist, ulist, tdict = get_wnet_tags(wn, wtag_file, opts, False)
        imgid_list += ilist
        usr_list += ulist
        for u in tdict:
            tag_dict[u] = tdict[u]  # imgid is unique

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "\n%s read %d synsets, found %d imgs from %d unique usrs" % (
        tt, len(wnet_list), len(imgid_list), len(set(usr_list)))
    tnum = map(lambda k: len(tag_dict[k]), tag_dict.iterkeys())
    snum = sum(tnum)
    print "%s %d tags in %d imgs, avg %0.4f tags per img" % (
        tt, snum, len(imgid_list), 1. * snum / len(tnum))

    #if 0:
    usr_tag = {}
    tag_cnt = {}
    empty_cnt = 0

    for (imid, u) in zip(imgid_list, usr_list):
        if u not in usr_tag:
            usr_tag[u] = {}

        vv = map(lambda s: norm_tag(s, dbcsr, addl_vocab, 1), tag_dict[imid])
        vv = list(set(filter(len, vv)))
        for v in vv:
            if v in usr_tag[u]:
                usr_tag[u][v] += 1
            else:
                usr_tag[u][v] = 1  # first time seeing user u use tag v
                if v in tag_cnt:
                    tag_cnt[v] += 1
                else:
                    tag_cnt[v] = 1

    tag_list = filter(lambda t: tag_cnt[t] > 1, tag_cnt.keys())
    tag_list.sort()
    tag_num = map(lambda t: tag_cnt[t], tag_list)
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s obtained %d usrs, %d out of %d tags with cnt>1, %d empty imgs " % (
        tt, len(usr_tag), len(tag_list), len(tag_cnt), empty_cnt)

    lcnt = 0
    bg_dict = {}
    #wn_tagf = open(os.path.join(opts.data_home, opts.wnet_out_dir, wn+".txt"), "wt")
    for u in usr_tag:
        vv = filter(lambda v: v in tag_list, usr_tag[u].keys())
        if vv:
            #wn_tagf.write(u+"\t"+ ",".join(vv) + "\n")
            accumulate_bg(vv, bg_dict, None, None, addl_vocab=[])
            lcnt += 1
            if lcnt < 20: print u + "\t" + ",".join(vv)

    #Nusr = len(usr_tag)
    lcnt = 0
    bg_tuples = sort_bg(bg_dict)
    #print bg_tuples
    bigram_list = []
    for u, v, c in bg_tuples:
        iu = tag_list.index(u)
        iv = tag_list.index(v)
        bigram_list += [iu, iv, c]

    wn_out_mat = os.path.split(opts.in_wnet_list)[1]
    wn_out_mat = os.path.splitext(wn_out_mat)[0]
    wn_out_mat = os.path.join(
        os.path.split(opts.in_wnet_list)[0], wn_out_mat + ".mat")
    data = {
        'wnet_list': wnet_list,
        'usr_list': usr_list,
        'tag_list': tag_list,
        'tag_cnt': tag_num,
        'bigram_list': bigram_list
    }
    io.savemat(wn_out_mat, data)
Example #21
def analyze_tag_pairs(argv):
    # parser = OptionParser(description='compile tags for all imgs in a wnet synset')
    opts, db_dict, addl_vocab, db_wn = options_get_wnet_tag(argv)
    
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s start processing '%s' " % (tt, opts.in_wnet_list)
    
    
    dbcn = sqlite3.connect(db_dict) # dictionary db
    dbcsr = dbcn.cursor()
    
    if os.path.isfile(opts.in_wnet_list):
        wnet_list = open(opts.in_wnet_list, 'rt').read().split()
    else:
        wnet_list = opts.in_wnet_list.split(",")
        
    for (iw, wn) in enumerate(wnet_list):
        if iw > opts.endnum:
            break
        
        wtag_file = os.path.join(opts.data_home, opts.wnet_list_dir+"_tags", wn+'.tags.txt')
        imgid_list, usr_list, tag_dict = get_wnet_tags(wn, wtag_file, opts, False)
        
        usr_tag = {}
        tag_cnt = {}
        empty_cnt = 0
        
        #for u, utuple in groupby(zip(usr_list, imgid_list), lambda x: x[0]):
        for (imid, u) in zip(imgid_list, usr_list):
            if u not in usr_tag:
                usr_tag[u] = {}
                    
            vv = map(lambda s:norm_tag(s, dbcsr, addl_vocab,1), tag_dict[imid])
            vv = list(set(filter(len, vv)))
            for v in vv:
                if v in usr_tag[u]:
                    usr_tag[u][v] += 1
                else:
                    usr_tag[u][v] = 1 # first time seeing user u use tag v
                    if v in tag_cnt:
                        tag_cnt[v] += 1
                    else:
                        tag_cnt[v] = 1
            
            #for v in list(set(usr_tag[u])):
                
        
        tag_list = filter(lambda t: tag_cnt[t]>1, tag_cnt.keys())
        tag_list.sort()
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print "%s obtained %d usrs, %d non-trivial tags, %d empty imgs" % (tt, len(usr_tag), len(tag_list), empty_cnt)
        
        ulist = []
        tcnt_mat = np.zeros( (len(usr_list), len(tag_list)))
        icnt = 0
        #out_dat_fh = open(os.path.join(opts.data_home, opts.wnet_out_dir, wn+".dat"), "wt")
        for u in list(set(usr_list)):
            jj = -1
            outstr = ""
            if usr_tag[u]:
                for t in usr_tag[u]:
                    if t in tag_list:
                        jj  = tag_list.index(t)
                        tcnt_mat[icnt, jj] += 1
                        outstr += "%d:%d " % (jj+1, usr_tag[u][t])
            else:
                pass # empty list of tags, skip
           
            if outstr:
                #out_dat_fh.write( outstr + "\n" )
                icnt += 1
                ulist.append(u)
        
        tcnt = tcnt_mat[:icnt, :]
        
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print '%s wnet "%s": %d unique tags by %d users\n' % (tt, wn, len(tag_list), len(ulist))
        
        # get and print synset info
        td = dict([ (k, tag_cnt[k]) for k in tag_list ])
        syn_info, wlist = compile_synset_wordlist(wn, len(ulist), None, db_wn, db_dict, addl_vocab, td)
        hi_words = [syn_info['self']['words']] + syn_info['ancestor'].values() + syn_info['descendant'].values()
        hi_depth = [0 ] + syn_info['ancestor'].keys() + syn_info['descendant'].keys()
        other_words = list(set(tag_list) - set(reduce(lambda a,b: a+b, hi_words, [])) )
        print_synset_info(syn_info, wn, wlist, len(ulist))
        
        wn_out_mat = os.path.join(opts.data_home, opts.wnet_out_dir, wn+".mat")
        tcnt = tcnt_mat[:icnt, :]
        #mlab.save(wn_out_mat, 'img_list', 'tag_list', 'tcnt_mat')
        data = {'usr_list': ulist, 'tag_list': tag_list, 'tcnt':tcnt, 
                'other_words':other_words, 'hier_words':hi_words, 'hier_depth':hi_depth}
        io.savemat(wn_out_mat, data)
        
        tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
        print '%s saved to %s \n\n' % (tt, wn_out_mat)
        
        print "\n tag occurrence:"
        hi_words = reduce(lambda a,b: a+b, hi_words, [])
        lcnt = 0
        bg_dict = {}
        wn_tagf = open(os.path.join(opts.data_home, opts.wnet_out_dir, wn+".txt"), "wt")
        for u in usr_tag:
            vv = filter(lambda v: v in tag_list, usr_tag[u].keys())
            if vv:
                wn_tagf.write(u+"\t"+ ",".join(vv) + "\n")
                accumulate_bg(vv, bg_dict, None, None, addl_vocab=[])
                lcnt += 1
                if lcnt<20: print u+"\t"+ ",".join(vv) 
        wn_tagf.close()
        
        print "\nbigrams#\tMI \ttype\ttag1,tag2\trelations"
        Nusr = len(usr_tag)
        lcnt = 0
        bg_tuples = sort_bg(bg_dict)
        wn_bgf = open(os.path.join(opts.data_home, opts.wnet_out_dir, wn+".bigram.txt"), "wt")
        for u, v, c in bg_tuples:
            if c<3: break
            assr = cm.Assertion.objects.filter(concept1__text=u, concept2__text=v,language=en)
            atxt = map(lambda a: str(a).strip("[]"), assr)
            atxt += map(lambda a: str(a).strip("[]"), 
                        cm.Assertion.objects.filter(concept1__text=v, concept2__text=u,language=en))
            try:
                mi = binary_mutual_info(Nusr, tag_cnt[u], tag_cnt[v], c)
            except:
                print u, v, c, len(usr_tag)
                raise
            
            if u in other_words and v in other_words:
                btype = "OO"
            elif u in hi_words and v in hi_words:
                btype = "HH"
            else:
                btype = "HO"
                
            outstr = "%s\t%0.5f\t%0.5f\t%s\t%s,%s\t%s" % \
                (syn_info['nltk_id'], 1.*c/Nusr, mi, btype, u, v, ";".join(atxt) if atxt else "None")
            wn_bgf.write(outstr +"\n")
            lcnt += 1
            if lcnt<50: print outstr
            
        wn_bgf.close()
Example #22
def norm_tag_file(argv):
    """
        clean tags for one single file
    """
    if len(argv) < 2:
        argv = ['-h']
    parser = OptionParser(
        description='rank tags of a single image + compose sentences')
    parser.add_option('-i',
                      '--in_file',
                      dest='in_file',
                      default="",
                      help='input file')
    parser.add_option('-o',
                      '--out_dir',
                      dest='out_dir',
                      default="db",
                      help='output directory')
    parser.add_option('-d',
                      '--db_dir',
                      dest='db_dir',
                      default="",
                      help='dir containing sqlite db and other data')
    parser.add_option("",
                      '--db_dict',
                      dest='db_dict',
                      default="dict.db",
                      help='dictionary')
    parser.add_option("",
                      '--addl_vocab',
                      dest='addl_vocab',
                      default="places_etc.txt",
                      help='')

    (opts, __args) = parser.parse_args(sys.argv)
    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab),
                      'rt').read().split()

    out_fh = open(os.path.join(opts.out_dir,
                               os.path.split(opts.in_file)[1]), 'wt')
    conn = sqlite3.connect(db_dict)
    cursor = conn.cursor()

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s processing %s ..." % (tt, opts.in_file)
    cnt = 0
    ecnt = 0
    tcnt = 0
    for cl in codecs.open(opts.in_file, encoding='utf-8'):
        imid = cl.split()[0]
        w = cl.split()[1]
        ww = w.split(",")
        vv = map(lambda s: norm_tag(s, cursor, addl_vocab), ww)
        vv = filter(lambda s: len(s), vv)
        if vv:
            #tag_dict[imid] = vv
            out_fh.write("%s\t%s\n" % (imid, ",".join(vv)))
            tcnt += len(vv)
        else:
            ecnt += 1

        cnt += 1
        if cnt % 5000 == 0:
            tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            print "%s %d img-id processed, %d tags, %d empty" % (tt, cnt, tcnt,
                                                                 ecnt)

    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s %d img-id processed, %d tags, %d empty. Done.\n" % (tt, cnt,
                                                                  tcnt, ecnt)
    conn.close()