Example #1
import os
import cPickle
import numpy as np


def main(argv):
  if len(argv) != 2:
    print('usage: extract_embeddings.py datadir')
    exit(1)

  np.random.seed(123)
  data_dir = argv[1]
  fname_vocab = os.path.join(data_dir, 'vocab.pickle')
  alphabet = cPickle.load(open(fname_vocab))
  words = alphabet.keys()
  print "Vocab size: ", len(alphabet)

  for fname in [
                'embeddings/aquaint+wiki.txt.gz.ndim=50.bin',
                ]:
    word2vec = load_bin_vec(fname, words)

    ndim = len(word2vec[word2vec.keys()[0]])
    print 'embedding dim: ', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim))
    for word, idx in alphabet.iteritems():
      word_vec = word2vec.get(word, None)
      if word_vec is None:
        word_vec = np.random.uniform(-0.25, 0.25, ndim)

        random_words_count += 1
      vocab_emb[idx] = word_vec
    print "Using zero vector as random"
    print 'random_words_count', random_words_count
    print vocab_emb.shape
    outfile = os.path.join(data_dir, 'emb_{}.npy'.format(os.path.basename(fname)))
    print outfile
    np.save(outfile, vocab_emb)
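Note: every example in this listing calls a load_bin_vec helper that the snippets themselves do not define. A minimal sketch of the two-argument form used here, assuming the standard word2vec binary format (a header line with vocabulary size and dimensionality, then one word followed by that many float32 values per entry), could look like the following; the three-argument variant in Example #6 passes the dimensionality and a word-to-id map instead, so its body would differ.

import numpy as np

def load_bin_vec(fname, vocab):
    # Sketch (assumption, not the original code): read word2vec-style binary
    # vectors and keep only the words that appear in `vocab`.
    word_vecs = {}
    with open(fname, 'rb') as f:
        vocab_size, ndim = map(int, f.readline().split())
        binary_len = np.dtype('float32').itemsize * ndim
        for _ in range(vocab_size):
            chars = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    break
                if ch != b'\n':  # some files terminate entries with a newline
                    chars.append(ch)
            word = b''.join(chars).decode('utf-8', errors='replace')
            vec = f.read(binary_len)
            if word in vocab:
                word_vecs[word] = np.frombuffer(vec, dtype='float32')
    return word_vecs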
Example #2
import scipy.sparse as sp
from sklearn.linear_model import LogisticRegression


def main(train, test, ngram, we):
    '''
    Given train and test dataset paths (train, test), an ngram rule string
    like "123" (ngram), and a word embedding file path (we).
    '''
    global times
    times = 0

    print 'loading data...'
    train_df = read_dataset(train)
    test_df = read_dataset(test)

    print 'cleaning data and add lex indicators...'
    train_df[['text_lex', 'lex_ws']] = train_df.apply(add_lex_indicator,
                                                      axis=1)
    test_df[['text_lex', 'lex_ws']] = test_df.apply(add_lex_indicator, axis=1)

    print "using train to build pos and neg dic..."
    pos_dic, neg_dic = build_dict(train_df, ngram)

    print "computing log-count ratio r..."
    dic, r, v = compute_ratio(pos_dic, neg_dic)

    print 'loading word embedding...'
    word2vec = load_bin_vec(we)

    print "building train and test features --- ngram part..."
    train_df.sort_index(inplace=True)
    test_df.sort_index(inplace=True)
    X_train_ngram, y_train = process_files_ngram(train_df, dic, r, v, ngram)
    X_test_ngram, y_test = process_files_ngram(test_df, dic, r, v, ngram)

    print "building train and test features --- pos embedding part..."
    X_train_embed = process_files_wemb(train_df, word2vec)
    X_test_embed = process_files_wemb(test_df, word2vec)

    print "combining log-count ratio and pos embedding features..."
    train_f = sp.hstack((X_train_ngram, X_train_embed), format='csr')
    test_f = sp.hstack((X_test_ngram, X_test_embed), format='csr')

    print "running model..."
    basemodel = LogisticRegression()
    f_score = model_run(basemodel, train_f, test_f, y_train, y_test)
    print '##############f_score is: ', f_score

    print 'model ended.'
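The compute_ratio step above is not shown in the snippet. The name "log-count ratio r" suggests the Naive Bayes feature weighting used in NBSVM-style models; a hypothetical sketch, assuming pos_dic and neg_dic map ngram tokens to their counts in the positive and negative class, and that the returned dic, r, v are a token-to-column index, the ratio vector, and the vocabulary size:

import numpy as np

def compute_ratio(pos_dic, neg_dic, alpha=1.0):
    # Hypothetical sketch: r = log((p / |p|_1) / (q / |q|_1)) with add-alpha
    # smoothing, where p and q are the per-class token count vectors.
    vocab = sorted(set(pos_dic) | set(neg_dic))
    p = np.array([pos_dic.get(w, 0) + alpha for w in vocab], dtype=float)
    q = np.array([neg_dic.get(w, 0) + alpha for w in vocab], dtype=float)
    r = np.log((p / p.sum()) / (q / q.sum()))
    dic = {w: i for i, w in enumerate(vocab)}  # token -> feature column
    return dic, r, len(vocab)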
Example #3
import os
import sys
import cPickle
import numpy as np


def main():

    folder = sys.argv[1]

    print "Folder: %s" % folder

    np.random.seed(123)

    data_dirs = [
        os.path.join(folder, 'TRAIN'),
    ]

    for data_dir in data_dirs:
        fname_vocab = os.path.join(data_dir, 'vocab.pickle')
        alphabet = cPickle.load(open(fname_vocab))
        words = alphabet.keys()
        print "Vocab size", len(alphabet)

        for fname in [
                'embeddings/aquaint+wiki.txt.gz.ndim=50.bin',
        ]:
            word2vec = load_bin_vec(fname, words)

            ndim = len(word2vec[word2vec.keys()[0]])
            print 'ndim', ndim

            random_words_count = 0
            vocab_emb = np.zeros((len(alphabet) + 1, ndim))
            for word, idx in alphabet.iteritems():
                word_vec = word2vec.get(word, None)
                if word_vec is None:
                    word_vec = np.random.uniform(-0.25, 0.25, ndim)

                    random_words_count += 1
                vocab_emb[idx] = word_vec
            print "Using zero vector as random"
            print 'random_words_count', random_words_count
            print vocab_emb.shape
            outfile = os.path.join(
                data_dir, 'emb_{}.npy'.format(os.path.basename(fname)))
            print outfile
            np.save(outfile, vocab_emb)
Example #4
import cPickle
import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors


def find_keyword(args):
    print "loading data"
    if args.word_vec_file == '':
        w2v_file = 'GoogleNews-vectors-negative300.bin'
        keywords = load_words()
        vocab = keywords
        w2v = load_bin_vec(w2v_file, vocab)
    else:
        w2v = cPickle.load(open(args.word_vec_file, "rb"))
    print "finish loading data"
    W = dict2Mat(w2v)
    kmeans = KMeans(n_clusters=args.keyword_num, random_state=0).fit(W)
    # save index to file
    cPickle.dump(kmeans.labels_, open(args.idx_save_file, "wb"))
    # get center vectors
    ctr_vecs = np.zeros(shape=(args.keyword_num, W.shape[1]))
    for i in range(args.keyword_num):
        ctr_vecs[i] = np.mean(W[kmeans.labels_ == i], axis=0)
    cPickle.dump(ctr_vecs, open('test.p', "wb"))
    print "center vecters saved"
    # save center words
    # get index of the closest vector to center vectors
    nbrs = NearestNeighbors(n_neighbors=1, algorithm=args.tree_algo).fit(W)
    distances, indices = nbrs.kneighbors(ctr_vecs)
    indices = np.reshape(indices, (len(indices)))
    # print words to file
    f_landmark = open(args.word_save_file, 'w')
    for i in range(args.keyword_num):
        print >> f_landmark, w2v.items()[indices[i]][0]
    f_landmark.close()
    print 'landmark words saved'
    # save words for vectors in W
    f_words = open(args.dict_file, 'w')
    for i in range(W.shape[0]):
        print >> f_words, w2v.items()[i][0]
    f_words.close()
    print 'words saved'
    print 'all done'
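A hypothetical invocation of find_keyword, with the argument names taken from the fields the function reads (all values below are placeholders, not from the original code):

from argparse import Namespace

args = Namespace(
    word_vec_file='',                   # empty string: load GoogleNews vectors via load_bin_vec
    keyword_num=50,                     # number of KMeans clusters (placeholder)
    idx_save_file='cluster_idx.pickle',
    tree_algo='auto',                   # passed to NearestNeighbors(algorithm=...)
    word_save_file='landmark_words.txt',
    dict_file='vocab_words.txt',
)
find_keyword(args)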
Example #5
import os
import numpy as np


def dump_embedding(outdir, embeddingfile, alphabet):
  words = alphabet.keys()
  print "Vocab size: ", len(alphabet)
  word2vec = load_bin_vec(embeddingfile, words)
  ndim = len(word2vec[word2vec.keys()[0]])
  print 'embedding dim: ', ndim
  random_words_count = 0
  np.random.seed(321)
  vocab_emb = np.zeros((len(alphabet) + 1, ndim))
  dummy_word_emb = np.random.uniform(-0.25, 0.25, ndim)
  for word, idx in alphabet.iteritems():
    word_vec = word2vec.get(word, None)
    if word_vec is None:
      word_vec = np.random.uniform(-0.25, 0.25, ndim)
      #word_vec = dummy_word_emb
      #word_vec = np.zeros(ndim)
      random_words_count += 1
    vocab_emb[idx] = word_vec
  print "Using zero vector as random"
  print 'random_words_count', random_words_count
  print 'vocab_emb.shape', vocab_emb.shape
  outfile = os.path.join(outdir, 'emb_{}.npy'.format(os.path.basename(embeddingfile)))
  print 'saving embedding file', outfile
  np.save(outfile, vocab_emb)
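The matrix written by dump_embedding has len(alphabet) + 1 rows, so the extra all-zero last row can act as the padding/dummy word (Example #7 takes alphabet.fid as its dummy_word_idx). A minimal consumer sketch, assuming alphabet behaves like a plain dict from word to row index and that the dummy index equals len(alphabet):

import numpy as np

vocab_emb = np.load(outfile)                       # matrix saved by dump_embedding
dummy_word_idx = len(alphabet)                     # the extra, all-zero row (assumption)
token_ids = [alphabet.get(w, dummy_word_idx) for w in ['what', 'is', 'this']]
token_vecs = vocab_emb[token_ids]                  # shape: (num_tokens, ndim)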
Example #6
    train_aspects, train_contexts, train_labels, train_aspect_lens, \
    train_context_lens, train_aspect_texts, train_context_texts = load_data(FLAGS.train_fname, FLAGS.word2id,
                                                                            FLAGS.max_aspect_len, FLAGS.max_context_len,
                                                                            FLAGS.train_data, FLAGS.pre_processed)


    test_aspects, test_contexts, test_labels, test_aspect_lens, \
    test_context_lens, test_aspect_texts, test_context_texts = load_data(FLAGS.test_fname, FLAGS.word2id,
                                                                         FLAGS.max_aspect_len, FLAGS.max_context_len,
                                                                         FLAGS.test_data, FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    if FLAGS.embedding == 'glove':
        FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                              FLAGS.embedding_dim,
                                              FLAGS.word2id)
    else:
        FLAGS.word2vec = load_bin_vec(FLAGS.embedding_fname,
                                      FLAGS.embedding_dim, FLAGS.word2id)

    # Building lexicon embedding
    lex_list = get_lex_file_list(FLAGS.lex_path)
    train = zip(train_aspect_texts, train_context_texts)
    test = zip(test_aspect_texts, test_context_texts)
    lex = LexHelper(lex_list,
                    train,
                    test,
                    max_aspect_len=FLAGS.max_aspect_len,
                    max_context_len=FLAGS.max_context_len)
    train_context_lex, train_aspect_lex, test_context_lex, test_aspect_lex, FLAGS.lex_dim = lex.build_lex_embeddings(
    )

    train_data = zip(train_aspects, train_contexts, train_labels,
                     train_aspect_lens, train_context_lens, train_aspect_lex,
Example #7
        print word2dfs.items()[:10]
        #########

        alphabet = Alphabet(start_feature_id=0)
        alphabet.add('UNKNOWN_WORD_IDX')

        add_to_vocab(answers, alphabet)
        add_to_vocab(questions, alphabet)

        basename = os.path.basename(train)
        cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'w'))
        print "alphabet", len(alphabet)

        embeddings_location = 'embeddings/aquaint+wiki.txt.gz.ndim=50.bin'

        word2vec = load_bin_vec(embeddings_location, alphabet.keys())

        dummy_word_idx = alphabet.fid

        q_max_sent_length = max(map(lambda x: len(x), questions))
        a_max_sent_length = max(map(lambda x: len(x), answers))
        print 'q_max_sent_length', q_max_sent_length
        print 'a_max_sent_length', a_max_sent_length

        # Convert dev and test sets
        for fname in [train, dev, test]:
            print fname
            # qids, questions, answers, labels = load_data(fname, stoplist)
            qids, questions, answers, labels = load_data(fname)

            overlap_feats = compute_overlap_features(questions,
Example #8
import scipy.sparse as sp
from sklearn.linear_model import LogisticRegression


def main(train, test, ngram, we):
    '''
    Given train and test dataset paths (train, test), an ngram rule string
    like "123" (ngram), and a word embedding file path (we).
    '''
    global times
    times = 0

    print 'loading data...'
    train_df = read_dataset(train)
    test_df = read_dataset(test)

    print 'cleaning data and add lex indicators...'
    train_df[['text_lex', 'lex_ws']] = train_df.apply(add_lex_indicator,
                                                      axis=1)
    test_df[['text_lex', 'lex_ws']] = test_df.apply(add_lex_indicator, axis=1)

    print "using train to build token count dict for each class..."
    counters = build_counters(train_df, ngram)

    print "computing log-count ratio r..."
    dic, r, v = compute_ratios(counters)

    print 'loading word embedding...'
    word2vec = load_bin_vec(we)

    print "building train and test features --- ngram part..."
    train_df.sort_index(inplace=True)
    test_df.sort_index(inplace=True)
    y_train = train_df['y']
    y_test = test_df['y']
    X_train_ngram = process_files_ngram(train_df, dic, r, v, ngram)
    X_test_ngram = process_files_ngram(test_df, dic, r, v, ngram)

    print "building train and test features --- pos embedding part..."
    X_train_embed = process_files_wemb(train_df, word2vec)
    X_test_embed = process_files_wemb(test_df, word2vec)

    print "combining log-count ratio and pos embedding features..."
    X_train = sp.hstack((X_train_ngram, X_train_embed), format='csr')
    X_test = sp.hstack((X_test_ngram, X_test_embed), format='csr')

    print "running model..."
    basemod = LogisticRegression()
    # tuning-parameter step, especially C and class_weight for the multiclass classifier
    cv = 10
    param_grid = [{
        'C': [1, 0.1],
        'class_weight': [{1: 1, -1: 1}, {1: 0.9, -1: 1}, {1: 1, -1: 0.9}]
    }]
    f_score = model_run(basemod, param_grid, cv, X_train, X_test, y_train,
                        y_test)
    print 'f_score is: ', f_score
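model_run is not defined in the listing; a hypothetical stand-in consistent with the arguments passed above (grid search over param_grid with cv folds, then F1 on the held-out test set) could be:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

def model_run(base_model, param_grid, cv, X_train, X_test, y_train, y_test):
    # Hypothetical sketch: tune C / class_weight by cross-validated grid
    # search, refit on the full training data, and report macro F1 on test.
    search = GridSearchCV(base_model, param_grid, cv=cv, scoring='f1_macro')
    search.fit(X_train, y_train)
    return f1_score(y_test, search.predict(X_test), average='macro')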