Code Example #1
def main(args):
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args)>3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path,'r',encoding='utf-8') as f:
        # for line in f:
        #     Xc = line.rstrip('\n')
        #     Xt.append(Xc[:MAX_LENGTH])
    # Input dataset with tweet+emoji instead of just tweets
        data = csv.reader(f, delimiter=',', quotechar="|")
        for line in data:
            Xc = line[0].rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])
    # Model
    print("Loading model params...")
    if len(args)>3:
        params = load_params('%s/model_%d.npz' % (model_path,m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt)//N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH*i:N_BATCH*(i+1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x,x_m)
        e = encode(x,x_m)
        ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
            out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK' for r in ranks[idx,:5]]))
            out_emb.append(e[idx,:])

    # Save
    print("Saving...")
    # Author - jagathshree
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with io.open('%s/predicted_tags.txt'%save_path,'w') as f:
        for item in out_pred:
            f.write(item + '\n')
    with open('%s/embeddings.npy'%save_path,'wb') as f:
        np.save(f,np.asarray(out_emb))
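
None of these snippets include their module-level imports or constants. A minimal header that would make Example #1 runnable might look like the sketch below; the module names and the command-line usage are assumptions inferred from the argument parsing, not something the snippet itself confirms.

# A minimal module header for Example #1, assuming the Python 2 / Theano era
# the original tweet2vec code targets. The helper locations (batch module) and
# the usage line are assumptions, not shown above.
import io
import csv
import os
import sys

import numpy as np
import cPickle as pkl            # Python 3: import pickle as pkl
import theano
import theano.tensor as T

import batch                     # assumed project module with prepare_data/BatchTweets
# classify, load_params, invert and the MAX_LENGTH/MAX_CLASSES/N_BATCH
# constants are defined elsewhere in the project; see the sketch of invert
# after Example #3.

if __name__ == '__main__':
    # usage: python encode_char.py <data_path> <model_path> <save_path> [model_num]
    main(sys.argv[1:])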
Code Example #2
File: test_char.py Project: MorLong/tweet2vec
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args)>3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    yt = []
    with io.open(data_path,'r',encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc.split(','))

    # Model
    print("Loading model params...")
    if len(args)>3:
        params = load_params('%s/model_%d.npz' % (model_path,m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    test_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH, max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.imatrix()

    # masks
    t_mask = T.fmatrix()

    # network for prediction
    # build the network once; classify returns (predictions, embeddings)
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],embeddings)

    # Test
    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr,y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x,x_m)
        e = encode(x,x_m)
        ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx,:])
            out_emb.append(e[idx,:])
            out_target.append(y[idx])

    # Save
    print("Saving...")
    with open('%s/data.pkl'%save_path,'wb') as f:
        pkl.dump(out_data,f)
    with open('%s/predictions.npy'%save_path,'wb') as f:
        np.save(f,np.asarray(out_pred))
    with open('%s/embeddings.npy'%save_path,'wb') as f:
        np.save(f,np.asarray(out_emb))
    with open('%s/targets.pkl'%save_path,'wb') as f:
        pkl.dump(out_target,f)
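
Once this `main` has run, its four output files can be loaded back for evaluation. A short sketch of reading them (the file names come from the snippet; `save_path` is a placeholder, and the `'rb'` modes mirror the binary-mode fix above):

import numpy as np
import cPickle as pkl            # Python 3: import pickle as pkl

save_path = 'results'            # placeholder: wherever main() saved its output
with open('%s/data.pkl' % save_path, 'rb') as f:
    data = pkl.load(f)           # original tweet strings
preds = np.load('%s/predictions.npy' % save_path)  # full label ranking per tweet
emb = np.load('%s/embeddings.npy' % save_path)     # one embedding vector per tweet
with open('%s/targets.pkl' % save_path, 'rb') as f:
    targets = pkl.load(f)        # gold label ids per tweet
print(len(data), preds.shape, emb.shape)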
Code Example #3
File: encode_char.py Project: MorLong/tweet2vec
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args)>3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path,'r',encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args)>3:
        params = load_params('%s/model_%d.npz' % (model_path,m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt)//N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH*i:N_BATCH*(i+1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x,x_m)
        e = encode(x,x_m)
        ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
            out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK' for r in ranks[idx,:5]]))
            out_emb.append(e[idx,:])

    # Save
    print("Saving...")
    with io.open('%s/predicted_tags.txt'%save_path,'w') as f:
        for item in out_pred:
            f.write(item + '\n')
    with open('%s/embeddings.npy'%save_path,'wb') as f:
        np.save(f,np.asarray(out_emb))
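
The `invert` helper used to build `inverse_labeldict` is not shown in any of these examples. Since `inverse_labeldict[r]` is indexed with the integer ids returned by `np.argsort`, it presumably swaps the keys and values of `labeldict`; a one-line sketch under that assumption:

def invert(d):
    # assumption: labeldict maps label string -> integer id,
    # so the inverse maps id -> label string for rank lookup
    return {v: k for k, v in d.items()}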
Code Example #4
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args)>3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path,'r',encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args)>3:
        params = load_params('%s/model_%d.npz' % (model_path,m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
#     predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt)//N_BATCH + 1
    print("Num Batches: "+str(numbatches))
    for i in range(numbatches):
        print("processing batch "+str(i))
        xr = Xt[N_BATCH*i:N_BATCH*(i+1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
#         p = predict(x,x_m)
        e = encode(x,x_m)
#         ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
#             out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK' for r in ranks[idx,:5]]))
            out_emb.append(e[idx,:])
        print("saving")
        stansFileName = save_path+"/embeddings_w266_"+str(i)+".npy"
        stansNPArr = np.asarray(out_emb)
        np.save(stansFileName,stansNPArr)
        out_emb = []
    print("DONE")
Code Example #5
def annotate_s2s_text():
    """
    Annotate a text file with tags.
    :return:
    """
    # path
    model_path = MODEL_PATH
    text_path = TEST_INPUT
    save_path = SAVE_PATH

    # regex for seq2seq
    pattern = "(.+?)(\n|\r\n)"
    r = re.compile(pattern)

    print("Loading model params...")
    params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)

    # Encoding cmnts
    cmnts = []
    Xt = []
    for line in io.open(text_path, 'r', encoding='utf-8'):
        m = r.search(line)
        if m is not None:
            cmnts.append(m.group(1))
            Xc_cmnt = m.group(1).replace(' ', '')  # strip half-width spaces before feeding tweet2vec
            Xt.append(Xc_cmnt[:MAX_LENGTH])

    out_pred = []
    numbatches = len(Xt) // N_BATCH + 1
    print('number of batches: %d' % numbatches)
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            # keep only the top-ranked tag for each comment
            r = ranks[idx, 0]
            out_pred.append(inverse_labeldict[r] if r in inverse_labeldict else 'UNK')
        print('%d batches end...' % i)

    # Save result
    with io.open(save_path, 'w') as f:
        for tag, cmnt in zip(out_pred, cmnts):
            f.write(tag + '\t' + cmnt + '\n')
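
The result file pairs each predicted tag with its source comment, one `tag<TAB>comment` line per input. A sketch of reading it back (`'annotated.txt'` is a placeholder for whatever SAVE_PATH pointed at):

import io

with io.open('annotated.txt', 'r', encoding='utf-8') as f:  # placeholder for SAVE_PATH
    for line in f:
        tag, cmnt = line.rstrip('\n').split('\t', 1)
        print(tag + '\t' + cmnt)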
Code Example #6
def generate_embeddings(args):

    data_path = args[0]
    model_path = args[1]
    # save_path = args[2]
    if len(args) > 2:
        m_num = int(args[2])

    print("Preparing Data...")
    # Test data
    # read tweet texts into an array
    Xt = []

    # read from file
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])
    print "Tweets:", len(Xt)
    print "Unique tweets:", len(set(Xt))
    # Model
    print("Loading model params...")
    if len(args) > 2:  # m_num is args[2] here because save_path is commented out
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    out_data = []
    out_pred = []
    out_emb = []
    numbatches = len(Xt) // N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            out_data.append(item)
            # print [r for r in ranks[idx,:5]]
            # out_pred.append(' '.join([inverse_labeldict[r] for r in ranks[idx,:5]]))
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
    return out_emb
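
Unlike the other examples, `generate_embeddings` returns the embeddings as a Python list rather than saving them. A caller would typically stack the list into a single array; a sketch with placeholder paths:

import numpy as np

emb_list = generate_embeddings(['tweets.txt', 'model_dir'])  # placeholder paths
emb = np.vstack(emb_list)        # shape: (num_tweets, embedding_dim)
print(emb.shape)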
Code Example #7
File: encode_char.py Project: Quartz/tweet2vec
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # JK/QUARTZ
            # here taking the string only up to the tab character we added
            # to every trump tweet line. Was:
            # Xc = line.rstrip('\n')
            Xc = re.match(r'^(.*)\t', line).group(1).rstrip('\n')
            print(Xc)
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    # Theano function
    print("Compiling theano functions...")
    # JK/QUARTZ Disabling the prediction function, because we just need the vectoring
    # predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    # JK/QUARTZ Disabling the prediction lines, because we just need the vectoring
    # out_pred = []
    out_emb = []
    numbatches = len(Xt) // N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        # JK/QUARTZ Disabling the prediction function, because we just need the vectoring
        # p = predict(x,x_m)
        e = encode(x, x_m)
        # JK/QUARTZ Disabling the prediction lines, because we just need the vectoring
        # ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
            # JK/QUARTZ Disabling the prediction lines, because we just need the vectoring
            # out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK' for r in ranks[idx,:5]]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    # JK/QUARTZ Disabling the prediction lines, because we just need the vectoring
    # with io.open('%s/predicted_tags.txt'%save_path,'w') as f:
    #     for item in out_pred:
    #         f.write(item + '\n')
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
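
The Quartz variant assumes every input line carries a trailing tab-separated field after the tweet text. A tiny sketch of what the `re.match` in the read loop extracts (the sample line is hypothetical):

import re

line = u'Example tweet text\t2016-11-08\n'   # hypothetical input line
text = re.match(r'^(.*)\t', line).group(1).rstrip('\n')
# greedy '.*' matches up to the last tab, so text == u'Example tweet text'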
Code Example #8
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    yt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc.split(','))

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    test_iter = batch.BatchTweets(Xt,
                                  yt,
                                  labeldict,
                                  batch_size=N_BATCH,
                                  max_classes=MAX_CLASSES,
                                  test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.imatrix()

    # masks
    t_mask = T.fmatrix()

    # network for prediction
    # build the network once; classify returns (predictions, embeddings)
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr, y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]

        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
            out_target.append(y[idx])

    # Save
    print("Saving...")
    with open('%s/data.pkl' % save_path, 'wb') as f:
        pkl.dump(out_data, f)
    with open('%s/predictions.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_pred))
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
    with open('%s/targets.pkl' % save_path, 'wb') as f:
        pkl.dump(out_target, f)
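
With ranked predictions and gold targets on disk, a quick sanity check is precision@1. A sketch, assuming each entry of `targets.pkl` is a list of gold label ids as yielded by `batch.BatchTweets` in test mode (that format is an assumption, as is the `'results'` path):

import numpy as np
import cPickle as pkl            # Python 3: import pickle as pkl

preds = np.load('results/predictions.npy')        # placeholder save_path
with open('results/targets.pkl', 'rb') as f:
    targets = pkl.load(f)

# count tweets whose top-ranked label appears among the gold labels
hits = sum(1 for p, t in zip(preds, targets) if p[0] in t)
print('precision@1 = %.3f' % (float(hits) / len(targets)))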