Example no. 1
def main():
    
    os.system("mkdir generated; mv score generated/score")
    os.system("mkdir data; mv trainfile data/train.tsv; mv testfile data/test.tsv")
    get_test() 
    scores = load('score')
    labels = get_labels()

    scores_with_raw = np.vstack(scores.values()).T
    scores_without_raw = np.vstack([scores[n] for n in scores if ('raw:' not in n)]).T
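    # Each column of these matrices holds one model's scores (rows = samples); the second matrix drops the "raw:" models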

    print 'Best Model:',
    print max([(auc(scores[name]), name) for name in scores])
    print
    print auc(scores_with_raw.mean(axis=1)),
    print 'Simple Average'
    print auc(weighted(scores_with_raw, labels)),
    print 'Weighted'
    print auc(weight_selected(scores_with_raw, labels)),
    print 'Weight selected'
    print
    print auc(scores_without_raw.mean(axis=1)),
    print 'Simple Average (without raw)'
    print auc(weighted(scores_without_raw, labels)),
    print 'Weighted (without raw)'
    print auc(weight_selected(scores_without_raw, labels)),
    print 'Weight selected (without raw)'
    print

    final = weight_selected(scores_without_raw, labels)
    submit(final[len(labels):])
Example no. 2
def go_test():
    print("=" * 30)
    model = densenet.densenet201(pretrained=True)
    # Replace classification layer
    model.classifier = torch.nn.Linear(model.classifier.in_features,
                                       num_classes)
    model.to(device)
    # Load best performing model based on validation score
    model.load_state_dict(
        torch.load(f"snapshots/densenet201_best_{MODEL_NAME}.pth"))

    print("[-] Performing validation...")
    train_loader, val_loader, val_loader2 = get_train_val_split(batch_size)

    with torch.no_grad():
        val_accuracy = validate(model, val_loader, -1)
        print("[*] Validation accuracy: {}".format(val_accuracy))
        val_accuracy2 = validate(model, val_loader2, -1)
        print("[*] Validation2 accuracy: {}\n".format(val_accuracy2))

    print("[-] Performing testing...")

    test_loader = get_test(batch_size)
    with torch.no_grad():
        test(model, test_loader)
Example no. 3
def evaluate_te(model, size=None):
    """
  Evaluate the model on a subset of test set.
  """
    model.eval()
    valid = list(map(preprocessing.process_test, data.get_test(size)))

    count = [0] * 10
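    # count[i]: number of examples where exactly i of the 9 distractors score at least as high as the true response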

    for e in valid:
        context, response, distractors = e

        cs = Variable(torch.stack(
            [torch.LongTensor(context) for i in range(10)], 0),
                      volatile=True).cuda()
        rs = [torch.LongTensor(response)]
        rs += [torch.LongTensor(distractor) for distractor in distractors]
        rs = Variable(torch.stack(rs, 0), volatile=True).cuda()

        results = model(cs, rs, [context for i in range(10)])
        results = [e.data.cpu().numpy()[0] for e in results]

        better_count = sum(1 for val in results[1:] if val >= results[0])
        count[better_count] += 1
    model.train()
    return count
Example no. 4
def go_test():
    print("=" * 30)
    model = ensemble_models.ensemble_ver1()
    model.to(device)
    model.load_state_dict(torch.load(f"snapshots/base_experiment_hidden.pth"))

    trained_models = [
        base_models.base_classifier(parameter=PARAMETERS[i]) for i in range(5)
    ]
    for i, mod in enumerate(trained_models):
        mod.classifier = torch.nn.Linear(mod.classifier.in_features,
                                         num_classes)
        mod.to(device)
        mod.load_state_dict(torch.load(f"snapshots/{MODEL_NAMES[i]}"))

    # print("[-] Performing validation...")
    # train_loader, val_loader, val_loader2 = get_train_val_split(batch_size)

    # with torch.no_grad():
    #     val_accuracy = validate(model, val_loader, -1)
    #     print("[*] Validation accuracy: {}".format(val_accuracy))
    #     val_accuracy2 = validate(model, val_loader2, -1)
    #     print("[*] Validation2 accuracy: {}\n".format(val_accuracy2))

    print("[-] Performing testing...")

    test_loader = get_test(batch_size)
    with torch.no_grad():
        test(model, trained_models, test_loader)
Example no. 5
def main():
    ##################
    # Initialization #
    ##################

    model = ensemble_models.ensemble_ver1()
    model.to(device)
    print(model)
    trained_models = [
        base_models.base_classifier(parameter=PARAMETERS[i]) for i in range(5)
    ]
    for i, mod in enumerate(trained_models):
        mod.classifier = torch.nn.Linear(mod.classifier.in_features,
                                         num_classes)
        mod.to(device)
        mod.load_state_dict(torch.load(f"snapshots/{MODEL_NAMES[i]}"))

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=base_lr)

    ############
    # Training #
    ############
    train_loader, val_loader, val_loader2 = get_train_val_split(batch_size)

    # Use model that performs best on validation for testing
    best_val_accuracy = 0

    for epoch in range(num_epochs):
        # Train
        print("=" * 30)
        train(model, trained_models, train_loader, optimizer, criterion, epoch)

        # Validate
        with torch.no_grad():
            val_accuracy = validate(model, trained_models, val_loader, epoch)
            print("[*] Validation accuracy: {}".format(val_accuracy))
            val_accuracy2 = validate(model, trained_models, val_loader2, epoch)
            print("[*] Validation2 accuracy: {}\n".format(val_accuracy2))

        # New best performing model
        if val_accuracy2 > best_val_accuracy:
            best_val_accuracy = val_accuracy2
            print("[*] New best accuracy!\n")
            torch.save(model.state_dict(),
                       f"snapshots/base_experiment_hidden.pth")

    ###########
    # Testing #
    ###########
    print("=" * 30)
    print("[-] Performing testing...")

    # Load best performing model based on validation score
    model.load_state_dict(torch.load(f"snapshots/base_experiment_hidden.pth"))

    test_loader = get_test(batch_size)
    with torch.no_grad():
        test(model, trained_models, test_loader)
Example no. 6
def main():
    ##################
    # Initialization #
    ##################

    # Build the ensemble model
    ensemble_model = ensemble.ensemble_ver3()
    ensemble_model.to(device)

    trained_models = [densenet.densenet201(pretrained=True) for i in range(5)]
    # Replace classification layer
    for i, model in enumerate(trained_models):
        model.classifier = torch.nn.Linear(model.classifier.in_features, num_classes)
        model.to(device)
        # Load best performing model based on validation score
        model.load_state_dict(torch.load(f"snapshots/densenet201_best_{MODEL_NAMES[i]}.pth"))

    criterion = torch.nn.CrossEntropyLoss()
    # Optimize only the ensemble's own parameters; the pretrained base models are left fixed
    optimizer = torch.optim.Adam(ensemble_model.parameters(), lr=base_lr)

    ############
    # Training #
    ############
    train_loader, val_loader, val_loader2 = get_train_val_split(batch_size)

    # Use model that performs best on validation for testing
    best_val_accuracy = 0

    for epoch in range(num_epochs):
        # Train
        print("="*30)
        train(ensemble_model, trained_models, train_loader, optimizer, criterion, epoch)

        # Validate
        val_accuracy = validate(ensemble_model, trained_models, val_loader, epoch)
        print("[*] Validation accuracy: {}".format(val_accuracy))
        val_accuracy2 = validate(ensemble_model, trained_models, val_loader2, epoch)
        print("[*] Validation2 accuracy: {}\n".format(val_accuracy2))

        # New best performing model
        if val_accuracy2 > best_val_accuracy:
            best_val_accuracy = val_accuracy2
            print("[*] New best accuracy!\n")
            torch.save(ensemble_model.state_dict(), f"snapshots/densenet201_experiment_usual.pth")

    ###########
    # Testing #
    ###########
    print("="*30)
    print("[-] Performing testing...")

    # Load best performing model based on validation score
    ensemble_model.load_state_dict(torch.load(f"snapshots/densenet201_experiment_usual.pth"))

    test_loader = get_test(batch_size)
    test(ensemble_model, trained_models, test_loader)
Example no. 7
def main():
    # Initialization

    model = base_models.base_classifier(parameter=PARAMETER)
    model.to(device)
    print(model)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=base_lr)

    # Training #
    train_loader, val_loader, ensemble_train_loader, ensemble_val_loader = get_train_val_split(
        batch_size, model_num=int(sys.argv[1]))

    # Use model that performs best on validation for testing
    best_val_accuracy = 0

    for epoch in range(num_epochs):
        print("=" * 30)
        train(model, train_loader, optimizer, criterion, epoch)

        # Validate
        with torch.no_grad():
            val_accuracy = validate(model, val_loader, epoch)
            print("[*] Validation accuracy: {}".format(val_accuracy))
            # second validation pass on the ensemble validation split (this split returns no val_loader2)
            val_accuracy2 = validate(model, ensemble_val_loader, epoch)
            print("[*] Validation2 accuracy: {}\n".format(val_accuracy2))

        # New best performing model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            print("[*] New best accuracy!\n")
            torch.save(model.state_dict(), f"snapshots/{MODEL_NAME}")

    ###########
    # Testing #
    ###########
    print("=" * 30)
    print("[-] Performing testing...")

    # Load best performing model based on validation score
    model.load_state_dict(torch.load(f"snapshots/{MODEL_NAME}"))

    test_loader = get_test(batch_size)
    with torch.no_grad():
        test(model, test_loader)
Example no. 8
def go_test():
    print("="*30)
    ensemble_model = ensemble.ensemble_ver3()
    ensemble_model.to(device)
    ensemble_model.load_state_dict(torch.load(f"snapshots/densenet201_experiment_usual.pth"))

    trained_models = [densenet.densenet201(pretrained=True) for i in range(5)]
    for i, model in enumerate(trained_models):
        model.classifier = torch.nn.Linear(model.classifier.in_features, num_classes)
        model.to(device)
        # Load best performing model based on validation score
        model.load_state_dict(torch.load(f"snapshots/densenet201_best_{MODEL_NAMES[i]}.pth"))


    print("[-] Performing testing...")


    test_loader = get_test(batch_size)
    test(ensemble_model, trained_models, test_loader)
Example no. 9
def go_test():
    print("=" * 30)
    model = base_models.base_classifier(parameter=PARAMETER)
    model.to(device)
    model.load_state_dict(torch.load(f"snapshots/{MODEL_NAME}"))

    # print("[-] Performing validation...")
    # train_loader, val_loader, val_loader2 = get_train_val_split(batch_size)

    # with torch.no_grad():
    #     val_accuracy = validate(model, val_loader, -1)
    #     print("[*] Validation accuracy: {}".format(val_accuracy))
    #     val_accuracy2 = validate(model, val_loader2, -1)
    #     print("[*] Validation2 accuracy: {}\n".format(val_accuracy2))

    print("[-] Performing testing...")

    test_loader = get_test(batch_size)
    with torch.no_grad():
        test(model, test_loader)
Example no. 10
def evaluate(model, size=None, split='dev'):
    """
    Evaluate the model on a subset of the dataset.
    """
    model = model.eval()
    if split == 'dev':
        ds = data.get_validation(size)
    else:
        ds = data.get_test(size)
    ds = list(map(preprocessing.process_valid, ds))
    recall_k = {k: 0 for k in range(1, 11)}
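    # recall@k: fraction of examples whose true response (candidate 0) ends up in the top k of the 10 ranked candidates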

    for e in tqdm(ds):
        context, response, distractors = e

        with torch.no_grad():
            cs = Variable(
                torch.stack([torch.LongTensor(context) for i in range(10)],
                            0)).cuda()
            rs = [torch.LongTensor(response)]
            rs += [torch.LongTensor(distractor) for distractor in distractors]
            rs = Variable(torch.stack(rs, 0)).cuda()

            results, responses = model(cs, rs)
        results = np.array([e.item() for e in results])

        ranking = np.argsort(-results)
        for k in recall_k.keys():
            k = int(k)
            if 0 in ranking[:k]:
                recall_k[k] += 1

    for k, v in recall_k.items():
        recall_k[k] = v / len(ds)

    return recall_k
Example no. 11
def go_test():
    print("=" * 30)
    # Build the ensemble model
    ensemble_model = ensemble_models.ensemble_ver1()
    ensemble_model.to(device)
    ensemble_model.load_state_dict(
        torch.load(f"snapshots/base_experiment_hidden.pth"))

    trained_models = [
        base_models.base_classifier(parameter=PARAMETERS[i]) for i in range(5)
    ]
    # Replace classification layer
    for i, model in enumerate(trained_models):
        model.classifier = torch.nn.Linear(model.classifier.in_features,
                                           num_classes)
        model.to(device)
        # Load best performing model based on validation score
        model.load_state_dict(torch.load(f"snapshots/{MODEL_NAMES[i]}"))

    print("[-] Performing testing...")

    test_loader = get_test(batch_size)
    test(ensemble_model, trained_models, test_loader)
Example no. 12
def main():
    import os.path
    import json
    filename = '/home/dlian/data/location_prediction/gowalla/Gowalla_totalCheckins.txt'
    loc_seq_index = processing(filename)
    loc_seq_index = loc_seq_index[:1000]
    num_locations = max(l for (u, time_loc) in loc_seq_index
                        for t, l in time_loc) + 1
    batch_size = 60
    max_seq_len = 10
    epocs = 50
    embedding_size = 50

    test = get_test(loc_seq_index, max_seq_len)

    seq_input = tf.placeholder(tf.int32, shape=[None, None], name='input_seq')
    class_output = tf.placeholder(tf.int32, shape=[None], name='output_class')
    seq_len = tf.placeholder(tf.int32, shape=[None], name='sequence_length')
    weight_mask = tf.placeholder(tf.float32,
                                 shape=[None, None],
                                 name='weight_mask')
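    # The placeholders above feed padded location-id sequences, target locations, true sequence lengths, and a per-step weight mask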
    loss, ndcg_op, acc_op = classifier_seq(seq=seq_input,
                                           labels=class_output,
                                           weight_mask=weight_mask,
                                           num_loc=num_locations,
                                           embed_size=embedding_size,
                                           seq_len=seq_len,
                                           k=50,
                                           num_samples=-1)

    train_op = tf.train.AdagradOptimizer(learning_rate=0.1).minimize(loss)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for iter in range(epocs):
            total_loss = 0
            for u in range(len(loc_seq_index)):
                X, Y, length, weight = get_batch(loc_seq_index[u][1],
                                                 batch_size, max_seq_len,
                                                 num_locations - 1)
                _, loss_value = sess.run(
                    [train_op, loss],
                    feed_dict={
                        seq_input: X,
                        class_output: Y,
                        seq_len: length,
                        weight_mask: weight
                    })
                total_loss += loss_value
            print(total_loss)

            X, Y, length, weight = test
            ndcg, acc = sess.run(
                [ndcg_op, acc_op],
                feed_dict={
                    seq_input: X,
                    class_output: Y,
                    seq_len: length,
                    weight_mask: weight
                })

            print(ndcg, acc)
Example no. 13
    else:
        predictions = predictions + np.load(predictions_path)#.ravel()
print "shape of predictions"
print(predictions.shape)
print(predictions.max())

import data

if len(sys.argv) == 3:
    subset = sys.argv[2]
    assert subset in ['train', 'valid', 'test', 'train_valid']
else:
    subset = 'test'

if subset == "test":
    _, mask, y, _ = data.get_test()
elif subset == "train":
    y = data.labels_train
    mask = data.mask_train
elif subset == "train_valid":
    y = data.labels
    mask = data.mask
else:
    y = data.labels_valid
    mask = data.mask_valid

acc = utils.proteins_acc(predictions, y, mask)

print "Accuracy (%s) is: %.5f" % (subset,acc)

## Alternative model avrg!! ##
Example no. 14
        precision = TP / pred_positive_sample_num
        recall = TP / positive_sample_num

        if recall != 0 and recall != pr_curve[i, 0]:
            precision_pts.append([recall, precision])

        pr_curve[i + 1, 0] = recall
        pr_curve[i + 1, 1] = precision

    return pr_curve, np.array(precision_pts)


if __name__ == '__main__':
    eval_range = range(1, 174)

    writer = SummaryWriter()

    test_data = get_test({"SyntheticData": {"only_point": True}}, loader=False)
    sample_index = random.sample(range(4500), 1000)
    test_sample = [test_data[i] for i in sample_index]
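    # every checkpoint below is evaluated on this one fixed random subset of 1,000 of the 4,500 test samples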

    for i in tqdm(eval_range):
        net = torch.load(f'/home/luo3300612/Workspace/PycharmWS/mySuperPoint/superpoint/result/epoch{i}',
                         map_location='cpu')
        pr_curve, precision_pts = mAP_final(test_sample)
        ap = np.mean(precision_pts[:, 1])
        writer.add_scalar("eval_ap/ap", ap, i)
        del pr_curve
        del precision_pts
    writer.close()
Example no. 15
sym_y = T.imatrix('target_output')
sym_x = T.tensor3()

metadata_path_all = glob.glob(sys.argv[1] + "*")

print("shape of metadata_path_all")
print(len(metadata_path_all))

if len(sys.argv) >= 3:
    subset = sys.argv[2]
    assert subset in ['train', 'valid', 'test', 'train_valid']
else:
    subset = 'test'

if subset == "test":
    X, mask, _, num_seq = data.get_test()
elif subset == "train":
    X_train, _, _, _, mask_train, _, num_seq = data.get_train()
elif subset == "train_valid":
    X_train, X_valid, _, _, mask_train, mask_valid, num_seq = data.get_train()
    X = np.concatenate((X_train[:-30], X_valid))
    mask = np.concatenate((mask_train[:-30], mask_valid))
else:
    _, X, _, _, _, mask, num_seq = data.get_train()

for metadata_path in metadata_path_all:

    print("Loading metadata file %s" % metadata_path)

    metadata = np.load(metadata_path)
Example no. 16
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import data
from matplotlib import pyplot as plt

train_x = tf.convert_to_tensor(
    data.get_description('..\\track1_round1_train_20210222.csv'))
train_y = tf.convert_to_tensor(
    data.get_label('..\\track1_round1_train_20210222.csv'))
print(train_x.shape)

test_x, test_y = data.get_test('..\\track1_round1_train_20210222.csv')

rnn_units = 64

input_layer = keras.Input(shape=(104, 1))
# x = layers.Embedding(input_dim=859, output_dim=10,mask_zero='True')(input_layer)
x = layers.LSTM(rnn_units,
                return_sequences=True,
                recurrent_initializer='orthogonal',
                activation='tanh')(input_layer)
x = layers.LSTM(rnn_units,
                return_sequences=True,
                recurrent_initializer='orthogonal',
                activation='tanh',
                dropout=0.5)(x)

x = layers.Attention()([x, x])
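# the Attention layer above is used as self-attention: query and value are both the LSTM output x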
Example no. 17
    sys.exit("Usage: python eval_avrg.py <predictions_path> [subset=test]")

predictions_path_all = glob.glob(sys.argv[1] + "*")

print("shape of metadata_path_all:", len(predictions_path_all))

import data

if len(sys.argv) == 3:
    subset = sys.argv[2]
    assert subset in ['train', 'valid', 'test', 'train_valid']
else:
    subset = 'test'

if subset == "test":
    _, mask, y, _ = data.get_test()
elif subset == "train":
    y = data.labels_train
    mask = data.mask_train
elif subset == "train_valid":
    y = data.labels
    mask = data.mask
else:
    y = data.labels_valid
    mask = data.mask_valid

acc_vec = np.zeros(len(predictions_path_all))
for i, predictions_path in enumerate(predictions_path_all):
    print(predictions_path)

    predictions = np.load(predictions_path)  # .ravel()
Example no. 18
def main():
    data = get_train() + get_test()

    f = file('generated/extracted_text', 'w')

    for i, item in enumerate(data):
        # status update
        if (i % 500) == 0:
            print i, datetime.datetime.now().time()

        #  parse file
        data = {}
        soup = boil_soup(item['urlid'])

        # given boilerplate
        data['boilerplate'] = [item['title'], item['body']]

        # extract text
        extractor = Extractor(extractor='ArticleExtractor', html=unicode(soup))
        data['boilerpipe'] = [extractor.getText()]

        # remove non-text tags
        for tag in ['script', 'style']:
            for el in soup.find_all(tag):
                el.extract()

        # extract text for each tag
        for tag in TAGS:
            items = []
            for el in soup.find_all(tag):
                el.extract()

                if tag == 'img':
                    try:
                        items.append(el['alt'])
                    except KeyError:
                        pass
                    try:
                        items.append(el['title'])
                    except KeyError:
                        pass
                else:
                    items.append(el.text)

            data[tag] = items

        # extract meta tags
        meta = soup.find_all('meta')
        for el in meta:
            prop = el.get('property') if el.get('property') else el.get('name')
            if not prop:
                continue
            prop = prop.lower()
            try:
                s = unicode(el['content'])
            except:
                continue

            data['meta-' + prop] = s.split(u',') if prop == 'keywords' else [s]

        # preprocess string
        for item in data:
            data[item] = map(clean_string, data[item])
            data[item] = filter(None, data[item])

        print >> f, json.dumps(data)

    f.close()
Example no. 19
# compute the means for different configurations
print('compute means')
mean_tab = df_train.groupby('ProductId').agg({'AdjDemand': logmean})
mean_tab2 = df_train.groupby(['ProductId', 'ClientId']).agg({'AdjDemand': logmean})
global_mean = logmean(df_train['AdjDemand'])


# generate estimation for each ProductID-ClientID-pair
def estimate(key):
    key = tuple(key) # key needs to be a tuple
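    # fall back: (ProductId, ClientId) mean -> ProductId mean -> global mean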
    try:
        est = mean_tab2.at[key,'AdjDemand']
    except KeyError:
        try :
            est = mean_tab.at[key[0],'AdjDemand']
        except KeyError:
            est = global_mean
    return est


# load the test data
print('load test data')
df_test = data.get_test(nrows=10000)
print('compute predictions')
df_test['Demanda_uni_equil'] = df_test[['ProductId', 'ClientId']].\
                apply(lambda x:estimate(x), axis=1)
df_submit = df_test[['id', 'Demanda_uni_equil']]
print(df_submit.shape)
df_submit = df_submit.set_index('id')
df_submit.to_csv('naive_product_client_logmean.csv')
Example no. 20
    train_loader, val_loader = get_train_val_split(batch_size)

    # Use model that performs best on validation for testing
    best_val_accuracy = 0

    for epoch in range(num_epochs):
        # Train
        train(model, train_loader)
        print("=" * 30)

        # Validate
        val_accuracy = validate(model, val_loader)
        print("Validation accuracy: {}".format(val_accuracy))

        # New best performing model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            print("New best accuracy!")
            torch.save(model.state_dict(), "snapshots/densenet121_best.pth")

    ###########
    # Testing #
    ###########
    print("=" * 30)
    print("Performing testing...")

    # Load best performing model based on validation score
    model.load_state_dict(torch.load("snapshots/densenet121_best.pth"))

    test_loader = get_test(batch_size)
    test(model, test_loader)
Example no. 21
    to_eval = range(108, 133)
    writer = SummaryWriter(log_dir="./eval")
    num_worker = 1

    for model_i in to_eval:
        print('Parent process %s.' % os.getpid())

        fe = SuperPointFrontend(
            weights_path=
            f'/home/luo3300612/Workspace/PycharmWS/mySuperPoint/superpoint/result/epoch{model_i + 1}',
            nms_dist=4,
            conf_thresh=1 / 65,
            border_remove=0)

        config = {"SyntheticData": {"only_point": True}}
        test_data = get_test(config, loader=False)

        st = datetime.now()
        if num_worker > 1:  # use multiprocess
            interval = np.linspace(0,
                                   len(test_data),
                                   num_worker + 1,
                                   endpoint=True,
                                   dtype=int)

            p = Pool(num_worker)
            results = [
                p.apply_async(eval, args=(i, interval[i], interval[i + 1]))
                for i in range(num_worker)
            ]
            print('Waiting for all subprocesses done...')
Example no. 22
print('compute means')
mean_tab = df_train.groupby('ProductId').agg({'AdjDemand': logmean})
mean_tab2 = df_train.groupby(['ProductId',
                              'ClientId']).agg({'AdjDemand': logmean})
global_mean = logmean(df_train['AdjDemand'])


# generate estimation for each ProductID-ClientID-pair
def estimate(key):
    key = tuple(key)  # key needs to be a tuple
    try:
        est = mean_tab2.at[key, 'AdjDemand']
    except KeyError:
        try:
            est = mean_tab.at[key[0], 'AdjDemand']
        except KeyError:
            est = global_mean
    return est


# load the test data
print('load test data')
df_test = data.get_test(nrows=10000)
print('compute predictions')
df_test['Demanda_uni_equil'] = df_test[['ProductId', 'ClientId']].\
                apply(lambda x:estimate(x), axis=1)
df_submit = df_test[['id', 'Demanda_uni_equil']]
print(df_submit.shape)
df_submit = df_submit.set_index('id')
df_submit.to_csv('naive_product_client_logmean.csv')
Example no. 23
def main():
    import os.path
    import json
    filename = '/home/dlian/data/location_prediction/gowalla/Gowalla_totalCheckins.txt'
    loc_seq_index = processing(filename)
    loc_seq_index = loc_seq_index[:1000]
    num_locations = max(l for (u, time_loc) in loc_seq_index
                        for t, l in time_loc) + 1
    print('{0} locations, {1} users'.format(num_locations, len(loc_seq_index)))
    batch_size = 64
    max_seq_len = 10
    epocs = 50
    embedding_size = 50
    learning_rate = 0.1
    print(
        'embed_size:{0}, max sequence length:{1}, batch size:{2}, learn_rate:{3}'
        .format(embedding_size, max_seq_len, batch_size, learning_rate))

    test = get_test(loc_seq_index, max_seq_len)
    batches = prepare_batches(loc_seq_index, -1, batch_size, max_seq_len)

    seq_input = tf.placeholder(tf.int32, shape=[None, None], name='input_seq')
    class_output = tf.placeholder(tf.int32, shape=[None], name='output_class')
    seq_len = tf.placeholder(tf.int32, shape=[None], name='sequence_length')
    weight_mask = tf.placeholder(tf.float32,
                                 shape=[None, None],
                                 name='weight_mask')
    keep_prob = tf.placeholder(tf.float32)
    loss, acc_op, pred_top_op = classifier_seq(seq=seq_input,
                                               labels=class_output,
                                               weight_mask=weight_mask,
                                               num_loc=num_locations,
                                               embed_size=embedding_size,
                                               seq_len=seq_len,
                                               k=50,
                                               num_samples=-1,
                                               keep_prob=keep_prob)
    merged = tf.summary.merge_all()

    train_op = tf.train.AdagradOptimizer(
        learning_rate=learning_rate).minimize(loss)
    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter(
            '/home/dlian/data/location_prediction/gowalla/train', sess.graph)
        test_writer = tf.summary.FileWriter(
            '/home/dlian/data/location_prediction/gowalla/test')
        sess.run(tf.global_variables_initializer())
        total_loss = 0
        for iter in range(3):
            summary = None
            for batch_index in range(len(batches)):
                batch = batches[batch_index]
                X, Y, length, weight = get_batch(loc_seq_index, batch)
                _, loss_value, summary = sess.run(
                    [train_op, loss, merged],
                    feed_dict={
                        seq_input: X,
                        class_output: Y,
                        seq_len: length,
                        weight_mask: weight,
                        keep_prob: 0.5
                    })
                total_loss += loss_value

            train_writer.add_summary(summary)

            X, Y, length, weight = test
            acc, pred, summary = sess.run(
                [acc_op, pred_top_op, merged],
                feed_dict={
                    seq_input: X,
                    class_output: Y,
                    seq_len: length,
                    weight_mask: weight,
                    keep_prob: 1
                })
            test_writer.add_summary(summary)
            print(total_loss, acc)
            total_loss = 0

            with open(
                    '/home/dlian/data/location_prediction/gowalla/pred{0}.txt'.
                    format(iter), 'w') as fout:
                for ii, (x, p, y) in enumerate(zip(X, pred[:, 0], Y)):
                    if p != y:
                        fout.writelines('{3}, {0}, {1}, {2}\n'.format(
                            y, p, x, ii))
        train_writer.close()
        test_writer.close()
Example no. 24
def main():
    #unzip raw_content file
    os.system("unzip zipRawcontent; mkdir data; mv raw_content data/")
    os.system("cp trainfile data/train.tsv")
    os.system("cp testfile data/test.tsv")
    os.system("mkdir ../generated ")
 
    data = get_train() + get_test()

    f = file('extracted_text', 'w')

    for i, item in enumerate(data):
        # status update
        if (i % 500) == 0:
            print i, datetime.datetime.now().time()

        #  parse file
        data = {}
        soup = boil_soup(item['urlid'])

        # given boilerplate
        data['boilerplate'] = [item['title'], item['body']]
       

        # extract text
        extractor = Extractor(extractor='ArticleExtractor', html=unicode(soup))
        data['boilerpipe'] = [extractor.getText()]

        # remove non-text tags
        for tag in ['script', 'style']:
            for el in soup.find_all(tag):
                el.extract()

        # extract text for each tag
        for tag in TAGS:
            items = []
            for el in soup.find_all(tag):
                el.extract()

                if tag == 'img':
                    try:
                        items.append(el['alt'])
                    except KeyError:
                        pass
                    try:
                        items.append(el['title'])
                    except KeyError:
                        pass
                else:
                    items.append(el.text)

            data[tag] = items

        # extract meta tags
        meta = soup.find_all('meta')
        for el in meta:
            prop = el.get('property') if el.get('property') else el.get('name')
            if not prop:
                continue
            prop = prop.lower()
            try:
                s = unicode(el['content'])
            except:
                continue

            data['meta-'+prop] = s.split(u',') if prop == 'keywords' else [s]

        # preprocess string
        for item in data:
            data[item] = map(clean_string, data[item])
            data[item] = filter(None, data[item])

        print >>f, json.dumps(data)

    f.close()
Example no. 25
sym_y = T.imatrix('target_output')
sym_x = T.tensor3()

metadata_path_all = glob.glob(sys.argv[1] + "*")

print "shape of metadata_path_all"
print(len(metadata_path_all))

if len(sys.argv) >= 3:
    subset = sys.argv[2]
    assert subset in ['train', 'valid', 'test', 'train_valid']
else:
    subset = 'test'

if subset == "test":
    X, mask, _, num_seq = data.get_test()
elif subset == "train":
    sys.exit("train not implemented")
elif subset == "train_valid":
    sys.exit("train_valid not implemented")
else:
    sys.exit("valid not implemented")


for metadata_path in metadata_path_all:

    print "Loading metadata file %s" % metadata_path

    metadata = np.load(metadata_path)

    config_name = metadata['config_name']