Example 1
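A class constructor that builds a vocabulary: each line of vocab.csv is cleaned and added word by word, padding and unknown-word tokens are appended, and store.log reports the resulting vocabulary size.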
    def __init__(self):

        self.wordSet = set()
        self.vocabGrowth = 0
        self.vocabulary = {}
        self.vocabulary_inv = []

        # Build Vocab
        with open('vocab.csv', 'rb') as f:
            reader = csv.reader(f, delimiter=',')

            for row in reader:
                if len(row) > 0:
                    words = preprocess.clean(row[0])
                    for word in words:
                        self.addWord(word)

        self.addWord(opts["sentence_padding_token"])
        self.addWord(opts["unknown_word_token"])
        self.vocabulary_size = len(self.wordSet)
        store.log("Vocabulary Size: %s" % self.vocabulary_size)


        self.embeddings = None
        self.data_index = 0
        self.data = []
Example 2
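A Flask-style view that hashes the visitor's codename, creates a per-source directory if it does not already exist, and calls store.log when a duplicate ID is detected.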
def create():
    sid = crypto_util.shash(session['codename'])
    if os.path.exists(store.path(sid)):
        # if this happens, we're not using very secure crypto
        store.log("Got a duplicate ID '%s'" % sid)
    else:
        os.mkdir(store.path(sid))
    session['logged_in'] = True
    session['flagged'] = False
    return redirect(url_for('lookup'))
Example 3
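A web.py POST handler with the same duplicate-ID check: the submitted id is hashed, store.log records a warning if the storage path already exists, and the request is passed on to store_endpoint.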
    def POST(self):
        i = web.input('id', fh={}, msg=None, mid=None, action=None)
        sid = crypto.shash(i.id)

        if os.path.exists(store.path(sid)):
            # if this happens, we're not using very secure crypto
            store.log('Got a duplicate ID.')
        else:
            os.mkdir(store.path(sid))
        return store_endpoint(i)
Example 4
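Another web.py handler that generates a random ID, logs duplicates through store.log, and sets no-cache headers before rendering the response.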
    def POST(self):
        iid = crypto.genrandomid()
        if os.path.exists(store.path(crypto.shash(iid))):
            # if this happens, we're not using very secure crypto
            store.log('Got a duplicate ID.')
        else:
            os.mkdir(store.path(crypto.shash(iid)))

        web.header('Cache-Control', 'no-cache, no-store, must-revalidate')
        web.header('Pragma', 'no-cache')
        web.header('Expires', '-1')
        return render.generate(iid)
Example 5
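A pub/sub worker loop (redis-py style): JSON messages trigger eval or reload actions, and store.log records each reload request.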
 def run(self):
     for item in self.pubsub.listen():
         if item["data"] == "KILL":
             self.pubsub.unsubscribe()
             print(self, "unsubscribe and finished")
             sys.stdout.flush()
             break
         elif isinstance(item["data"], str):
             data = json.loads(item["data"])
             if (data[0] == "eval"):
                 print("eval: %s" % data[1])
                 sys.stdout.flush()
                 self.eval(data[1])
             elif (data[0] == "reload"):
                 store.log("reload")
                 self.reload()
         else:
             print("bad data: %s" % item["data"])
             sys.stdout.flush()
Example 6
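A skip-gram word2vec training routine written against the old TensorFlow 0.x API (Python 2); store.log is used for sample batches, the running NCE loss, and nearest-neighbour reports.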
    def train(self):

        self.data = [ idx for word, idx in self.vocabulary.iteritems() ]

        store.log('Sample data %s' % self.data[:10])

        def generate_batch(batch_size, num_skips, skip_window):
            assert batch_size % num_skips == 0
            assert num_skips <= 2 * skip_window
            batch = np.ndarray(shape=(batch_size), dtype=np.int32)
            labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
            span = 2 * skip_window + 1 # [ skip_window target skip_window ]
            buffer = collections.deque(maxlen=span)
            for _ in range(span):
                buffer.append(self.data[self.data_index])
                self.data_index = (self.data_index + 1) % len(self.data)
            for i in range(batch_size // num_skips):
                target = skip_window  # target label at the center of the buffer
                targets_to_avoid = [ skip_window ]
                for j in range(num_skips):
                    while target in targets_to_avoid:
                        target = random.randint(0, span - 1)
                    targets_to_avoid.append(target)
                    batch[i * num_skips + j] = buffer[skip_window]
                    labels[i * num_skips + j, 0] = buffer[target]
                buffer.append(self.data[self.data_index])
                self.data_index = (self.data_index + 1) % len(self.data)
            return batch, labels

        batch, labels = generate_batch(batch_size=10, num_skips=10, skip_window=5)

        for i in range(10):
            store.log('%s -> %s' % (batch[i], labels[i, 0]))
            store.log('%s -> %s' % (self.vocabulary_inv[batch[i]], self.vocabulary_inv[labels[i, 0]]))

        batch_size = 20
        embedding_size = opts["embedding_dim"]  # Dimension of the embedding vector.
        skip_window = 10       # How many words to consider left and right.
        num_skips = 20         # How many times to reuse an input to generate a label.
        # We pick a random validation set to sample nearest neighbors. Here we limit the
        # validation samples to the words that have a low numeric ID, which by
        # construction are also the most frequent.
        valid_size = 16     # Random set of words to evaluate similarity on.
        valid_window = 100  # Only pick dev samples in the head of the distribution.
        valid_examples = np.array(random.sample(np.arange(valid_window), valid_size))
        num_sampled = 64    # Number of negative examples to sample.

        graph = tf.Graph()
        with graph.as_default():
            # Input data.
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
            # Ops and variables pinned to the CPU because of missing GPU implementation
            with tf.device('/cpu:0'):
                # Look up embeddings for inputs.
                embeddings = tf.Variable(
                    tf.random_uniform([self.vocabulary_size, embedding_size], -1.0, 1.0))
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)
                # Construct the variables for the NCE loss
                with tf.name_scope("nce_weights") as scope:
                    nce_weights = tf.Variable(
                        tf.truncated_normal([self.vocabulary_size, embedding_size],
                                            stddev=1.0 / math.sqrt(embedding_size)))
                nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))
                nce_biases_hist = tf.histogram_summary("nce_biases", nce_biases)

            # Compute the average NCE loss for the batch.
            # tf.nce_loss automatically draws a new sample of the negative labels each
            # time we evaluate the loss.
            with tf.name_scope("loss") as scope:
                loss = tf.reduce_mean(
                    tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                                 num_sampled, self.vocabulary_size))
            # Construct the SGD optimizer using a learning rate of 1.0.
            with tf.name_scope("train") as scope:
                optimizer = tf.train.GradientDescentOptimizer(0.25).minimize(loss)

            # Compute the cosine similarity between minibatch examples and all embeddings.
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
            normalized_embeddings = embeddings / norm
            valid_embeddings = tf.nn.embedding_lookup(
                normalized_embeddings, valid_dataset)
            similarity = tf.matmul(
                valid_embeddings, normalized_embeddings, transpose_b=True)


        # Step 5: Begin training.
        num_steps = 100001
        with tf.Session(graph=graph) as session:
            # We must initialize all variables before we use them.
            merged = tf.merge_all_summaries()
            writer = tf.train.SummaryWriter("/tmp/tensor_logs/expiriment_1", session.graph_def)

            # Add an op to initialize all variables
            init_op = tf.initialize_all_variables()

            # Run the init op
            init_op.run()

            store.log("Initialized")
            average_loss = 0
            for step in xrange(num_steps):
                batch_inputs, batch_labels = generate_batch(
                    batch_size, num_skips, skip_window)
                feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}
                # We perform one update step by evaluating the optimizer op (including it
                # in the list of returned values for session.run()).
                summary_str, _, loss_val = session.run([merged, optimizer, loss], feed_dict=feed_dict)
                writer.add_summary(summary_str, step)
                average_loss += loss_val
                if step % 2000 == 0:
                    if step > 0:
                        average_loss /= 2000
                    # The average loss is an estimate of the loss over the last 2000 batches.
                    store.log("Average loss at step %s: %s" % (step, average_loss))
                    average_loss = 0
                # Note that this is expensive (~20% slowdown if computed every 500 steps)
                if step % 5000 == 0:
                    sim = similarity.eval()
                    for i in xrange(valid_size):
                        valid_word = self.vocabulary_inv[valid_examples[i]]
                        top_k = 8 # number of nearest neighbors
                        nearest = (-sim[i, :]).argsort()[1:top_k+1]
                        log_str = "Nearest to %s:" % valid_word
                        for k in xrange(top_k):
                            close_word = self.vocabulary_inv[nearest[k]]
                            log_str = "%s %s" % (log_str, close_word)
                        store.log(log_str)

            final_embeddings = normalized_embeddings.eval()

            # Save the final embeddings for Experiment #2
            with open("word_embeddings.pkl", "wb") as f:
                pickle.dump(final_embeddings, f)

            self.embeddings = final_embeddings
Example 7
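Training of a TextCNN text classifier (TensorFlow 0.x, Python 2, pre-0.18 scikit-learn StratifiedShuffleSplit); store.log is used to inspect the embedding matrix before the graph is built.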
def train():

    sentences = []
    labels = []
    x = []
    y = []
    _y = []

    with open('data.csv', 'rb') as f:
        reader = csv.reader(f, delimiter=',')

        for row in reader:
            words = preprocess.clean(row[1])
            sentences.append(words)
            labels.append(([0, 1] if row[0] == "example" else [1, 0]))
            _y.append(1 if row[0] == "example" else 0)

    padded_sentences = [preprocess.pad(sentence) for sentence in sentences]

    x = np.array([[vocab.getIdFromWord(word) for word in sentence]
                  for sentence in padded_sentences])
    embeddings = np.array(
        map(np.unique, [
            vocab.getEmbeddingFromWord(word)
            for sentence in padded_sentences
            for word in sentence
        ]))
    store.log(embeddings)
    store.log(len(embeddings))
    store.log(embeddings[0])
    store.log(len(embeddings[0]))
    y = np.array(labels)

    # Split Dataset
    # ==================================================

    # Load data
    print("Loading data...")
    # Randomly shuffle data
    sss = StratifiedShuffleSplit(_y, 1, test_size=0.1, random_state=0)
    for train, test in sss:
        x_train = x[train]
        y_train = y[train]

        x_dev = x[test]
        y_dev = y[test]

    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=opts["allow_soft_placement"],
            log_device_placement=opts["log_device_placement"])
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=2,
                          vocab_size=len(embeddings),
                          embedding_size=opts["embedding_dim"],
                          embedding_tensor=embeddings,
                          filter_sizes=map(int,
                                           opts["filter_sizes"].split(",")),
                          num_filters=opts["num_filters"],
                          l2_reg_lambda=opts["l2_reg_lambda"])

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-4)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            saver = tf.train.Saver(tf.all_variables())

            # Initialize all variables
            sess.run(tf.initialize_all_variables())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: opts["dropout_keep_prob"]
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)

            def dev_step(x_batch, y_batch):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, loss, accuracy = sess.run(
                    [global_step, cnn.loss, cnn.accuracy], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))

            # Generate batches
            batches = batch_iter(zip(x_train, y_train), opts["batch_size"],
                                 opts["num_epochs"])
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % opts["evaluate_every"] == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev)
                    print("")

            saver.save(sess, opts["model_location"] + "model.chpt")
Example 8
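The tail end of a map-building script: store.zip_img(), store.kml() and store.log() archive the generated images, KML and log before the script exits.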
  print()
  printinfo(" Mapset for " + buildmap + " successfully created")
  print()
  quit()

"""
zip the images, kml and log

"""

if args.zip_img:
  store.zip_img()
  store.kml()

if args.log:
  store.log()

if os.path.exists(WORK_DIR + "o5m/bbox_map"):
  os.remove(WORK_DIR + "o5m/bbox_map")

today = datetime.datetime.now()
DATE = today.strftime('%Y%m%d_%H%M')


print()
print()
print(" ----- " + (DATE) + " ----- " + (buildmap) + " ready! -----")
print()
print()

quit()
Example 9
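A variant of the previous script where store.log() runs only if mkgmap logging is enabled, followed by optional contour-line generation.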
if config.get('mkgmap', 'logging') == "yes":
  config.set('mkgmap', 'logging', 'no')
  write_config()

"""
zip the images, kml and log
"""

import store

if args.zip_img:
  store.zip_img()
  store.kml()

if config.get('mkgmap', 'logging') == "yes":
  store.log()

"""
create the contourlines

"""

if args.contourlines:
  if os.path.exists("mystyles/contourlines_style"):
    import contourlines
    contourlines.create_cont()
  else:
    printwarning("dir mystyles/contourlines_style not found")

print("")