# Example 1
def intersect_vocab(db_dict, tag_file, addl_vocab=[], db_wn='', wn_list=[]):
    cn_words, cn_vocab = get_conceptnet_words(db_dict, addl_vocab)
    print "ConceptNet: %d words, %d cleaned" % (len(cn_words), len(cn_vocab) )
    
    fr_words = {}
    cnt,__ = read_unigram(tag_file, fr_words)
    fr_words = filter(lambda k: fr_words[k]>5, fr_words.keys())
    
    vocab = list( set(cn_vocab) & set(fr_words) )
    vocab = filter(lambda s: len(s)>0, vocab)
    print "Flickr kep %d/%d tags, %d in common with ConceptNet " % (len(fr_words), cnt, len(vocab) )
    print " %d words in the intersected vocab" % len(vocab)
    
    # now deal with wordnet
    #wn_words = get_wordnet_words(db_wn, wn_list, db_dict, vocab)
    
    return vocab, fr_words
# Example 2
def intersect_vocab(db_dict, tag_file, addl_vocab=[], db_wn='', wn_list=[]):
    cn_words, cn_vocab = get_conceptnet_words(db_dict, addl_vocab)
    print "ConceptNet: %d words, %d cleaned" % (len(cn_words), len(cn_vocab))

    fr_words = {}
    cnt, __ = read_unigram(tag_file, fr_words)
    fr_words = filter(lambda k: fr_words[k] > 5, fr_words.keys())

    vocab = list(set(cn_vocab) & set(fr_words))
    vocab = filter(lambda s: len(s) > 0, vocab)
    print "Flickr kep %d/%d tags, %d in common with ConceptNet " % (
        len(fr_words), cnt, len(vocab))
    print " %d words in the intersected vocab" % len(vocab)

    # now deal with wordnet
    #wn_words = get_wordnet_words(db_wn, wn_list, db_dict, vocab)

    return vocab, fr_words
# Example 3
def make_vocab(argv):
    if len(argv)<2:
            argv = ['-h']
    parser = OptionParser(description='construct+compare conceptnet and flickr word similarities')
    parser.add_option('-d', '--db_dir', dest='db_dir', default="", help='dir containing sqlite db files')
    parser.add_option("", '--db_dict', dest='db_dict', default="dict.db", help='dictionary')
    parser.add_option("", '--unigram_file', dest='unigram_file', default="unigram.txt", help='unigrams file %word count%')
    parser.add_option("", '--wn_list', dest='wn_list', default="wnet-50.txt", help='')
    parser.add_option("", '--addl_vocab', dest='addl_vocab', default="places_etc.txt", help='')
    #parser.add_option("", '--db_wordnet', dest='db_wordnet', default="wordnet.db", help='')
    #parser.add_option("", '--bigram_file', dest='bigram_file', default="bigram_filtered.txt", help='')
    (opts, __args) = parser.parse_args(sys.argv)
    
    # intersect the two dictionaries first
    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    ug_file = os.path.join(opts.db_dir, opts.unigram_file)
    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab), 'rt').read().split()
    #db_wn = os.path.join(opts.db_dir, opts.db_wordnet)
    #wn_list = os.path.join(opts.db_dir, opts.wn_list)
    
    #vocab, fr_words = intersect_vocab(db_dict, ug_file, addl_vocab=addl_vocab)
    
    cn_words, cn_vocab = get_conceptnet_words(db_dict, addl_vocab)
    print "ConceptNet: %d words, %d cleaned" % (len(cn_words), len(cn_vocab) )
    
    fr_words = {}
    cnt,__ = read_unigram(ug_file, fr_words)
    fr_words = filter(lambda k: fr_words[k]>5, fr_words.keys())
    
    vocab = list( set(cn_vocab) & set(fr_words) )
    vocab = filter(lambda s: len(s)>0, vocab)
    print "Flickr kep %d/%d tags, %d in common with ConceptNet " % (len(fr_words), cnt, len(vocab) )
    print " %d words in the intersected vocab" % len(vocab)
    
    #open(os.path.join(opts.db_dir, 'vocab.txt'), "wt").write("\n".join(vocab))
    fr_words.sort()
    open(os.path.join(opts.db_dir, 'vocab_flickr.txt'), "wt").write("\n".join(fr_words))
    
    fo = open(os.path.join(opts.db_dir, 'vocab_conceptnet.txt'), "wt")
    for k, v in cn_words.iteritems():
        if v:
            fo.write("%s\t%s\n" % (k, ",".join(v)) )
    fo.close()
# Example 4
def make_vocab(argv):
    if len(argv) < 2:
        argv = ['-h']
    parser = OptionParser(
        description='construct+compare conceptnet and flickr word similarities'
    )
    parser.add_option('-d',
                      '--db_dir',
                      dest='db_dir',
                      default="",
                      help='dir containing sqlite db files')
    parser.add_option("",
                      '--db_dict',
                      dest='db_dict',
                      default="dict.db",
                      help='dictionary')
    parser.add_option("",
                      '--unigram_file',
                      dest='unigram_file',
                      default="unigram.txt",
                      help='unigrams file %word count%')
    parser.add_option("",
                      '--wn_list',
                      dest='wn_list',
                      default="wnet-50.txt",
                      help='')
    parser.add_option("",
                      '--addl_vocab',
                      dest='addl_vocab',
                      default="places_etc.txt",
                      help='')
    #parser.add_option("", '--db_wordnet', dest='db_wordnet', default="wordnet.db", help='')
    #parser.add_option("", '--bigram_file', dest='bigram_file', default="bigram_filtered.txt", help='')
    (opts, __args) = parser.parse_args(sys.argv)

    # intersect the two dictionaries first
    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    ug_file = os.path.join(opts.db_dir, opts.unigram_file)
    addl_vocab = open(os.path.join(opts.db_dir, opts.addl_vocab),
                      'rt').read().split()
    #db_wn = os.path.join(opts.db_dir, opts.db_wordnet)
    #wn_list = os.path.join(opts.db_dir, opts.wn_list)

    #vocab, fr_words = intersect_vocab(db_dict, ug_file, addl_vocab=addl_vocab)

    cn_words, cn_vocab = get_conceptnet_words(db_dict, addl_vocab)
    print "ConceptNet: %d words, %d cleaned" % (len(cn_words), len(cn_vocab))

    fr_words = {}
    cnt, __ = read_unigram(ug_file, fr_words)
    fr_words = filter(lambda k: fr_words[k] > 5, fr_words.keys())

    vocab = list(set(cn_vocab) & set(fr_words))
    vocab = filter(lambda s: len(s) > 0, vocab)
    print "Flickr kep %d/%d tags, %d in common with ConceptNet " % (
        len(fr_words), cnt, len(vocab))
    print " %d words in the intersected vocab" % len(vocab)

    #open(os.path.join(opts.db_dir, 'vocab.txt'), "wt").write("\n".join(vocab))
    fr_words.sort()
    open(os.path.join(opts.db_dir, 'vocab_flickr.txt'),
         "wt").write("\n".join(fr_words))

    fo = open(os.path.join(opts.db_dir, 'vocab_conceptnet.txt'), "wt")
    for k, v in cn_words.iteritems():
        if v:
            fo.write("%s\t%s\n" % (k, ",".join(v)))
    fo.close()