def train_single_sent_id(self, sentences, iteration, work=None, neu1=None, sent_vec=None, cat_vec=None):
    """Fit a sentence vector and a category vector to `sentences`.

    Runs `iteration` passes of `train_cat_vec` over every sentence and
    returns the pair `(sent_vec, cat_vec)`.

    `work` and `neu1` are scratch buffers handed to `train_cat_vec`; when
    omitted, buffers of `layer1_size + 8` elements are allocated here.
    When `sent_vec` is omitted, a new vector is filled with
    `(random.rand(...) - 0.5) / denom`, where `denom` is
    `sqrt(layer1_size)` if `self.init_adjust` is set and `layer1_size`
    otherwise. When `cat_vec` is omitted, a zero vector is used.
    """
    dim = self.layer1_size
    if work is None:
        work = matutils.zeros_aligned(dim + 8, dtype=REAL)
    if neu1 is None:
        neu1 = matutils.zeros_aligned(dim + 8, dtype=REAL)

    # one gradient-state row each for the sentence and the category vector
    sent_grad = self.init_grad_weight(1)
    cat_grad = self.init_grad_weight(1)

    if sent_vec is None:
        sent_vec = matutils.zeros_aligned(dim, dtype=REAL)
        denom = sqrt(dim) if self.init_adjust else dim
        sent_vec[:] = (random.rand(dim).astype(REAL) - 0.5) / denom
    if cat_vec is None:
        cat_vec = matutils.zeros_aligned(dim, dtype=REAL)
        # NOTE(review): cat_learn is reset only when no category vector is
        # supplied -- confirm this nesting against the original layout.
        self.cat_learn = 0

    lookup = self.vocab.get
    for step in range(iteration):
        if self.update_mode == 0:
            # linear decay of the learning rate, floored at min_alpha
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * step / iteration))
        else:
            alpha = self.alpha
        for sentence in sentences:
            # out-of-vocabulary words are passed on as None
            sampled = [lookup(word, None) for word in sentence]
            train_cat_vec(self, sent_vec, cat_vec, sampled, alpha, work, neu1, sent_grad, cat_grad)
    return sent_vec, cat_vec
def reset_weights(self):
    """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.

    Re-seeds `random` with `self.seed`, allocates zeroed `syn0` and `syn1`
    of shape `(len(self.vocab), self.layer1_size)`, adds
    `(random.rand(...) - 0.5) / layer1_size` to `syn0`, and clears
    `syn0norm`.
    """
    random.seed(self.seed)
    shape = (len(self.vocab), self.layer1_size)
    self.syn0 = matutils.zeros_aligned(shape, dtype=REAL)
    self.syn1 = matutils.zeros_aligned(shape, dtype=REAL)
    noise = random.rand(*shape) - 0.5
    self.syn0 += noise / self.layer1_size
    # any previously stored normalized vectors are dropped
    self.syn0norm = None
def worker_train():
    """Train the model, lifting lists of sentences from the jobs queue.

    Runs until a `None` job is received. Each job is iterated as
    `(sent_no, sentence)` pairs that are passed to `train_sent_vec`.
    `self`, `jobs`, `lock`, `word_count`, `sent_count`, `chunksize`,
    `total_words`, `total_sents`, `start` and `next_report` are taken from
    the enclosing scope.
    """
    # each thread must have its own work memory
    work = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)
    neu1 = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL)

    job = jobs.get()
    while job is not None:  # a None job means the data is finished
        # update the learning rate before every job
        if self.update_mode == 0:
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
        else:
            alpha = self.alpha

        job_words = 0
        for sent_no, sentence in job:
            job_words += train_sent_vec(self, self.sents[sent_no], sentence, alpha, work, neu1,
                                        self.sents_grad[sent_no])

        with lock:
            word_count[0] += job_words
            sent_count[0] += chunksize
            elapsed = time.time() - start
            if elapsed >= next_report[0]:
                percent = 100.0 * sent_count[0] / total_sents
                speed = word_count[0] / elapsed if elapsed else 0.0
                logger.info("PROGRESS: at %.2f%% sents, alpha %.05f, %.0f words/s" % (percent, alpha, speed))
                # don't flood the log, wait at least a second between progress reports
                next_report[0] = elapsed + 1.0

        job = jobs.get()
def build_vec(self, sentences, has_vocab=False):
    """Scan the corpus, build the vocabulary and allocate sentence/category vectors.

    Each item of `sentences` is indexed as `(words, sent_id, cat_id)`.
    Words are counted into `Vocab` entries and handed to `build_vocab`.
    Unseen sentence ids and category ids get consecutive numbers, recorded
    in `sent_no_hash`/`sent_id_list` and `cat_no_hash`/`cat_id_list`.
    Zeroed `self.sents` and `self.cats` matrices are allocated, then
    `reset_weights` is called. Finally `self.sent_cat_pair` is filled with
    the distinct `(sent_no, cat_no)` pairs, sorted by cat_no then sent_no,
    and `self.pair_len` holds their count.

    NOTE(review): as reconstructed here, `has_vocab` only controls a log
    message; the vocabulary is rebuilt either way -- confirm.
    """
    if not has_vocab:
        logger.info("build vocabulary and")
    logger.info("resetting vectors")
    random.seed(self.seed)
    vocab = {}
    total_words = 0
    self.sents_len = 0  # the num of sentence ids
    self.total_sents = 0  # the num of sentences
    self.cat_len = 0  # the num of category ids
    sent_cat_pairs = set()  # distinct (sent_no, cat_no) combinations
    for sentence_no, sent_tuple in enumerate(sentences):
        if sentence_no % 10000 == 0:
            logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                        (sentence_no, total_words, len(vocab)))
        sentence = sent_tuple[0]
        for word in sentence:
            total_words += 1
            if word in vocab:
                vocab[word].count += 1
            else:
                vocab[word] = Vocab(count=1)
        sent_id = sent_tuple[1]
        cat_id = sent_tuple[2]
        self.total_sents += 1
        if cat_id not in self.cat_no_hash:
            self.cat_no_hash[cat_id] = self.cat_len
            self.cat_id_list.append(cat_id)
            self.cat_len += 1
        if sent_id not in self.sent_no_hash:
            self.sent_no_hash[sent_id] = self.sents_len
            self.sent_id_list.append(sent_id)
            self.sents_len += 1
        sent_cat_pairs.add((self.sent_no_hash[sent_id], self.cat_no_hash[cat_id]))

    logger.info("collected %i word types from a corpus of %i words and %i sentences(ident:%i) with %i categories" %
                (len(vocab), total_words, self.total_sents, self.sents_len, self.cat_len))
    self.build_vocab(vocab)

    self.sents = matutils.zeros_aligned((self.sents_len, self.layer1_size), dtype=REAL)
    self.cats = matutils.zeros_aligned((self.cat_len, self.layer1_size), dtype=REAL)
    self.reset_weights()

    # make sent_cat_pair
    self.sent_cat_pair = empty((len(sent_cat_pairs), 2), dtype=uint32)
    self.pair_len = len(sent_cat_pairs)
    for idx, (sent_no, cat_no) in enumerate(sent_cat_pairs):
        self.sent_cat_pair[idx][0] = sent_no
        self.sent_cat_pair[idx][1] = cat_no
    # sort by cat_no, sent_no in place (set iteration order is arbitrary)
    self.sent_cat_pair.view('u4,u4').sort(order=['f1', 'f0'], axis=0)
def reset_weights(self):
    """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.

    Re-seeds `random` with `self.seed` and fills `syn0` row by row with
    `(random.rand(layer1_size) - 0.5) / layer1_size`. Zeroed `syn1` is
    allocated when `self.hs` is truthy and zeroed `syn1neg` when
    `self.negative` is truthy. `syn0norm` is cleared.
    """
    logger.info("resetting layer weights")
    random.seed(self.seed)
    vocab_size = len(self.vocab)
    dim = self.layer1_size
    self.syn0 = matutils.zeros_aligned((vocab_size, dim), dtype=REAL)
    # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
    for row in xrange(vocab_size):
        self.syn0[row] = (random.rand(dim) - 0.5) / dim
    if self.hs:
        self.syn1 = matutils.zeros_aligned((vocab_size, dim), dtype=REAL)
    if self.negative:
        self.syn1neg = matutils.zeros_aligned((vocab_size, dim), dtype=REAL)
    self.syn0norm = None
def init_pairnorm(self, _lock=threading.Lock()):
    """Allocate `self.pairnorm` and call `init_pairtable(self)`, at most once.

    Does nothing when `self.pairnorm` is already set (not None). Otherwise
    a zeroed `(pair_len, layer1_size)` matrix is stored in `self.pairnorm`
    and `init_pairtable(self)` is invoked.

    `_lock` is an implementation detail and should not be passed by
    callers. Its default is created once, when the function is defined, so
    every call shares the same lock. A lock created inside the call would be
    private to that call and could never block another thread.
    """
    # avoid initializing from multiple threads
    with _lock:
        if getattr(self, 'pairnorm', None) is not None:
            return
        self.pairnorm = matutils.zeros_aligned((self.pair_len, self.layer1_size), dtype=REAL)
        # stay under the lock so a waiting thread returns only after this call completes
        init_pairtable(self)
def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
    """
    Infer a vector for given post-bulk training document.

    Document should be a list of (word) tokens.

    The vector starts from `self.seeded_vector(' '.join(doc_words))` and is
    updated for `steps` rounds by the training routine selected through
    `self.sg` / `self.dm_concat`, with `learn_words` and `learn_hidden`
    switched off. `alpha` is recomputed after each round from `min_alpha`
    and the number of rounds left. Returns the resulting vector.
    """
    doctag_vectors = empty((1, self.vector_size), dtype=REAL)
    doctag_vectors[0] = self.seeded_vector(' '.join(doc_words))
    doctag_locks = ones(1, dtype=REAL)
    doctag_indexes = [0]

    work = zeros(self.layer1_size, dtype=REAL)
    if not self.sg:
        # only the DM routines take a second scratch buffer
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

    # keyword arguments shared by all three training routines
    frozen = dict(learn_words=False, learn_hidden=False,
                  doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)

    for step in range(steps):
        if self.sg:
            train_document_dbow(self, doc_words, doctag_indexes, alpha, work, **frozen)
        elif self.dm_concat:
            train_document_dm_concat(self, doc_words, doctag_indexes, alpha, work, neu1, **frozen)
        else:
            train_document_dm(self, doc_words, doctag_indexes, alpha, work, neu1, **frozen)
        remaining = steps - step
        alpha = ((alpha - min_alpha) / remaining) + min_alpha

    return doctag_vectors[0]
def train_single_sent_id(self, sentences, iteration, work=None, neu1=None):
    """Fit a fresh sentence vector to `sentences` and return it.

    A new vector is initialised as `(random.rand(...) - 0.5) / denom`,
    where `denom` is `sqrt(layer1_size)` if `self.init_adjust` is set and
    `layer1_size` otherwise. It is then trained with `train_sent_vec` for
    `iteration` passes over every sentence.

    `work` and `neu1` are scratch buffers handed to `train_sent_vec`; they
    are allocated here when omitted.
    """
    dim = self.layer1_size
    if work is None:
        work = zeros(dim, dtype=REAL)
    if neu1 is None:
        neu1 = matutils.zeros_aligned(dim, dtype=REAL)

    # size of the gradient-state buffer per update mode; other modes get an empty buffer
    grad_sizes = {1: dim, 2: 2 * dim, 3: 2 * dim + 3}
    sent_grad = zeros(grad_sizes.get(self.update_mode, 0), dtype=REAL)

    denom = sqrt(dim) if self.init_adjust else dim
    new_sent = (random.rand(dim).astype(REAL) - 0.5) / denom

    lookup = self.vocab.get
    for step in range(iteration):
        if self.update_mode == 0:
            # linear decay of the learning rate, floored at min_alpha
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * step / iteration))
        else:
            alpha = self.alpha
        for sentence in sentences:
            # out-of-vocabulary words are passed on as None
            sampled = [lookup(word, None) for word in sentence]
            train_sent_vec(self, new_sent, sampled, alpha, work, neu1, sent_grad)
    return new_sent
def worker_infer(): while True: job = jobs.get() if job is None: break diff = 0.0 work = np.zeros(model1.layer1_size, dtype=REAL) neu1 = matutils.zeros_aligned(model1.layer1_size, dtype=REAL) for sent_tuple in job: cat_id_gold = sent_tuple[2] sent_vec1 = model1.train_single_sent_id([sent_tuple[0]], 20, work, neu1) sims1 = np.empty(model1.sents_len, dtype=REAL) nearest_sent_fast(model1, sent_vec1, 0, sims1) sent_vec2 = model2.train_single_sent_id([sent_tuple[0]], 20, work, neu1) sims2 = np.empty(model2.sents_len, dtype=REAL) nearest_sent_fast(model2, sent_vec2, 0, sims2) sims1 += sims2 neighbors = np.argsort(sims1)[::-1] cat_ids = {} nearest = [] ident_cat = True for top_cand in neighbors: sent_id = model1.sent_id_list[top_cand] cat_id = sent_cat[sent_id] if not ident_cat or not cat_ids.has_key(cat_id): cat_ids[cat_id] = 1 nearest.append(cat_id) if len(nearest) == topK: break diff += 1.0 if cat_id_gold in nearest else 0.0 print nearest, cat_id_gold confusion_mtx.setdefault(cat_id_gold, {}) confusion_mtx[cat_id_gold].setdefault(nearest[0], 0) confusion_mtx[cat_id_gold][nearest[0]] += 1 qout.put(diff)
def worker_train():
    """Train the model, lifting lists of sentences from the jobs queue.

    Runs until a `None` job is received. Every sentence of a job is passed
    to `train_sentence`, whose return values are summed into the shared
    word counter. `self`, `jobs`, `lock`, `word_count`, `total_words`,
    `start` and `next_report` are taken from the enclosing scope.
    """
    # each thread must have its own work memory
    work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

    job = jobs.get()
    while job is not None:  # a None job means the data is finished
        # update the learning rate before every job
        alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))

        # how many words did we train on? out-of-vocabulary (unknown) words do not count
        job_words = 0
        for sentence in job:
            job_words += train_sentence(self, sentence, alpha, work)

        with lock:
            word_count[0] += job_words
            elapsed = time.time() - start
            if elapsed >= next_report[0]:
                percent = 100.0 * word_count[0] / total_words
                speed = word_count[0] / elapsed if elapsed else 0.0
                logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (percent, alpha, speed))
                # don't flood the log, wait at least a second between progress reports
                next_report[0] = elapsed + 1.0

        job = jobs.get()
def init_pairnorm(self, _lock=threading.Lock()):
    """Allocate `self.pairnorm` and call `init_pairtable(self)`, at most once.

    Returns immediately when `self.pairnorm` already exists and is not
    None. Otherwise a zeroed `(pair_len, layer1_size)` matrix is stored in
    `self.pairnorm` and `init_pairtable(self)` is invoked.

    `_lock` is private and callers should leave it at its default. The
    default lock is created once at definition time and is therefore shared
    by all calls. Creating the lock inside the function body would give each
    call its own lock and provide no mutual exclusion.
    """
    # avoid initializing from multiple threads
    with _lock:
        already_built = getattr(self, 'pairnorm', None) is not None
        if already_built:
            return
        shape = (self.pair_len, self.layer1_size)
        self.pairnorm = matutils.zeros_aligned(shape, dtype=REAL)
        # stay under the lock so a waiting thread returns only after this call completes
        init_pairtable(self)
def reset_weights(self):
    """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.

    `random` is re-seeded with `self.seed`. `syn0` is rebuilt one row at a
    time from `(random.rand(layer1_size) - 0.5) / layer1_size`. `syn1` and
    `syn1neg` are re-allocated as zeros only when `self.hs` and
    `self.negative`, respectively, are truthy. `syn0norm` is cleared.
    """
    logger.info("resetting layer weights")
    random.seed(self.seed)
    n_words = len(self.vocab)
    width = self.layer1_size
    weight_shape = (n_words, width)
    self.syn0 = matutils.zeros_aligned(weight_shape, dtype=REAL)
    # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
    for index in xrange(n_words):
        centered = random.rand(width) - 0.5
        self.syn0[index] = centered / width
    if self.hs:
        self.syn1 = matutils.zeros_aligned(weight_shape, dtype=REAL)
    if self.negative:
        self.syn1neg = matutils.zeros_aligned(weight_shape, dtype=REAL)
    self.syn0norm = None
def worker_infer(): while True: job = jobs.get() if job is None: break diff = 0. work = matutils.zeros_aligned(model.layer1_size + 8, dtype=REAL) neu1 = matutils.zeros_aligned(model.layer1_size + 8, dtype=REAL) for sent_tuple in job: cat_id = sent_tuple[2] ret = model.infer([sent_tuple[0]], iteration=20, k=topK, work=work, neu1=neu1) diff += 1. if cat_id in ret[2] else 0. print ret[2],cat_id confusion_mtx.setdefault(cat_id, {}) confusion_mtx[cat_id].setdefault(ret[2][0], 0) confusion_mtx[cat_id][ret[2][0]] += 1 qout.put(diff)
def worker_infer(): while True: job = jobs.get() if job is None: break diff = 0. work = matutils.zeros_aligned(model.layer1_size + 8, dtype=REAL) neu1 = matutils.zeros_aligned(model.layer1_size + 8, dtype=REAL) for sent_tuple in job: cat_id = sent_tuple[2] ret = model.infer([sent_tuple[0]], iteration=20, k=topK, work=work, neu1=neu1) diff += 1. if cat_id in ret[2] else 0. print ret[2], cat_id confusion_mtx.setdefault(cat_id, {}) confusion_mtx[cat_id].setdefault(ret[2][0], 0) confusion_mtx[cat_id][ret[2][0]] += 1 qout.put(diff)
def init_grad_weight(self, length):
    """Allocate the per-vector gradient state for `length` vectors.

    Returns a zeroed `(length, grad_size)` array, where `grad_size` depends
    on `self.update_mode`: `layer1_size` for mode 1, `2 * layer1_size` for
    mode 2, `2 * layer1_size + 3` for mode 3, and 0 otherwise. In mode 3 the
    last three columns are preset to `ADAM_BETA1`, `ADAM_BETA1` and
    `ADAM_BETA2`.
    """
    mode = self.update_mode
    dim = self.layer1_size
    grad_size = {1: dim, 2: 2 * dim, 3: 2 * dim + 3}.get(mode, 0)
    grad = matutils.zeros_aligned((length, grad_size), dtype=REAL)
    if mode == 3:
        # the three extra trailing columns hold the beta constants
        for offset, beta in ((3, ADAM_BETA1), (2, ADAM_BETA1), (1, ADAM_BETA2)):
            grad[:, grad_size - offset] = beta
    return grad
def init_grad_weight(self, length):
    """Allocate the per-vector gradient state for `length` vectors.

    The column count follows `self.update_mode`: `layer1_size` (mode 1),
    `2 * layer1_size` (mode 2), `2 * layer1_size + 3` (mode 3), or 0 for any
    other mode. The array is zero-filled. In mode 3 the final three columns
    are set to `ADAM_BETA1`, `ADAM_BETA1` and `ADAM_BETA2`, in that order.
    """
    width = self.layer1_size
    mode = self.update_mode
    if mode == 3:
        grad_size = 2 * width + 3
    elif mode == 2:
        grad_size = 2 * width
    elif mode == 1:
        grad_size = width
    else:
        grad_size = 0
    grad = matutils.zeros_aligned((length, grad_size), dtype=REAL)
    if mode == 3:
        first_beta_col = grad_size - 3
        grad[:, first_beta_col] = ADAM_BETA1
        grad[:, first_beta_col + 1] = ADAM_BETA1
        grad[:, first_beta_col + 2] = ADAM_BETA2
    return grad
def worker_train():
    """Train the model, lifting lists of sentences from the jobs queue.

    Runs until a `None` job is received. Each job is handed as a whole to
    `train_from_job`, whose return value is added to the shared word
    counter. `self`, `jobs`, `lock`, `word_count`, `sent_count`,
    `chunksize`, `total_words`, `total_sents`, `start` and `next_report`
    are taken from the enclosing scope.
    """
    # each thread must have its own work memory
    buf_size = self.layer1_size + 8
    work = matutils.zeros_aligned(buf_size, dtype=REAL)
    neu1 = matutils.zeros_aligned(buf_size, dtype=REAL)

    job = jobs.get()
    while job is not None:  # a None job means the data is finished
        # update the learning rate before every job
        alpha = self.alpha
        if self.update_mode == 0:
            alpha = max(self.min_alpha, alpha * (1 - 1.0 * word_count[0] / total_words))

        job_words = train_from_job(self, job, alpha, work, neu1)

        with lock:
            word_count[0] += job_words
            sent_count[0] += chunksize
            elapsed = time.time() - start
            if elapsed >= next_report[0]:
                percent = 100.0 * sent_count[0] / total_sents
                speed = word_count[0] / elapsed if elapsed else 0.0
                logger.info("PROGRESS: at %.2f%% sents, alpha %.05f, %.0f words/s" % (percent, alpha, speed))
                # don't flood the log, wait at least a second between progress reports
                next_report[0] = elapsed + 1.0

        job = jobs.get()
def train_single_sent_id(self, sentences, iteration, work=None, neu1=None, sent_vec=None, cat_vec=None):
    """Fit a sentence vector and a category vector to `sentences`.

    Performs `iteration` passes of `train_cat_vec` over all sentences and
    returns `(sent_vec, cat_vec)`.

    Missing `work`/`neu1` scratch buffers are allocated with
    `layer1_size + 8` elements. A missing `sent_vec` is created as
    `(random.rand(...) - 0.5) / denom`, where `denom` is
    `sqrt(layer1_size)` if `self.init_adjust` is set and `layer1_size`
    otherwise. A missing `cat_vec` is created as zeros.
    """
    size = self.layer1_size
    if work is None:
        work = matutils.zeros_aligned(size + 8, dtype=REAL)
    if neu1 is None:
        neu1 = matutils.zeros_aligned(size + 8, dtype=REAL)

    # one gradient-state row each for the sentence and the category vector
    sent_grad = self.init_grad_weight(1)
    cat_grad = self.init_grad_weight(1)

    if sent_vec is None:
        sent_vec = matutils.zeros_aligned(size, dtype=REAL)
        if self.init_adjust:
            scale = sqrt(size)
        else:
            scale = size
        sent_vec[:] = (random.rand(size).astype(REAL) - 0.5) / scale
    if cat_vec is None:
        cat_vec = matutils.zeros_aligned(size, dtype=REAL)
        # NOTE(review): cat_learn is reset only when no category vector is
        # supplied -- confirm this nesting against the original layout.
        self.cat_learn = 0

    for epoch in range(iteration):
        alpha = self.alpha
        if self.update_mode == 0:
            # linear decay of the learning rate, floored at min_alpha
            alpha = max(self.min_alpha, alpha * (1 - 1.0 * epoch / iteration))
        for sentence in sentences:
            # out-of-vocabulary words are passed on as None
            sampled = [self.vocab.get(token, None) for token in sentence]
            train_cat_vec(self, sent_vec, cat_vec, sampled, alpha, work, neu1, sent_grad, cat_grad)
    return sent_vec, cat_vec
def worker_infer(): while True: job = jobs.get() if job is None: break diff = 0. work = matutils.zeros_aligned(model1.layer1_size + 8, dtype=REAL) neu1 = matutils.zeros_aligned(model1.layer1_size + 8, dtype=REAL) for sent_tuple in job: cat_id_gold = sent_tuple[2] sent_vec1, cat_vec1 = model1.train_single_sent_id( [sent_tuple[0]], 20, work, neu1) sims1 = np.empty(model1.pair_len, dtype=REAL) catsentvec_sim_sum(model1, sent_vec1, cat_vec1, sims1) sent_vec2, cat_vec2 = model2.train_single_sent_id( [sent_tuple[0]], 20, work, neu1) sims2 = np.empty(model2.pair_len, dtype=REAL) catsentvec_sim_sum(model2, sent_vec2, cat_vec2, sims2) sims1 += sims2 #joint_catsentvec_sim_sum(pairtable, sent_vec1, cat_vec1, sent_vec2, cat_vec2, sims1) neighbors = np.argsort(sims1)[::-1] cat_ids = {} nearest = [] ident_cat = True for top_cand in neighbors: (sent_no, cat_no) = model1.sent_cat_pair[top_cand] cat_id = model1.cat_id_list[cat_no] if not ident_cat or not cat_ids.has_key(cat_id): cat_ids[cat_id] = 1 nearest.append(cat_id) if len(nearest) == topK: break diff += 1. if cat_id_gold in nearest else 0. print nearest, cat_id_gold confusion_mtx.setdefault(cat_id_gold, {}) confusion_mtx[cat_id_gold].setdefault(nearest[0], 0) confusion_mtx[cat_id_gold][nearest[0]] += 1 qout.put(diff)
def worker_train():
    """Train the model, lifting lists of sentences from the jobs queue.

    Loops until a `None` job arrives. Every sentence of a job goes through
    `train_sentence`, and the returned counts are accumulated into the
    shared word counter. `self`, `jobs`, `lock`, `word_count`,
    `total_words`, `start` and `next_report` are taken from the enclosing
    scope.
    """
    # each thread must have its own work memory
    work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

    while True:
        job = jobs.get()
        if job is None:
            # data finished, exit
            return

        # update the learning rate before every job
        progress = 1.0 * word_count[0] / total_words
        alpha = max(self.min_alpha, self.alpha * (1 - progress))

        # how many words did we train on? out-of-vocabulary (unknown) words do not count
        trained = [train_sentence(self, sentence, alpha, work) for sentence in job]
        job_words = sum(trained)

        with lock:
            word_count[0] += job_words
            elapsed = time.time() - start
            if elapsed < next_report[0]:
                continue
            percent = 100.0 * word_count[0] / total_words
            speed = word_count[0] / elapsed if elapsed else 0.0
            logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (percent, alpha, speed))
            # don't flood the log, wait at least a second between progress reports
            next_report[0] = elapsed + 1.0
def train_single_sent_id(self, sentences, iteration, work=None, neu1=None):
    """Fit a fresh sentence vector to `sentences` and return it.

    The vector starts as `(random.rand(...) - 0.5) / denom`, where `denom`
    is `sqrt(layer1_size)` if `self.init_adjust` is set and `layer1_size`
    otherwise. It is then updated by `train_sent_vec` for `iteration` passes
    over all sentences.

    `work` and `neu1` are scratch buffers for `train_sent_vec`; they are
    created here when not supplied.
    """
    size = self.layer1_size
    if work is None:
        work = zeros(size, dtype=REAL)
    if neu1 is None:
        neu1 = matutils.zeros_aligned(size, dtype=REAL)

    # length of the gradient-state buffer depends on the update mode
    mode = self.update_mode
    if mode == 3:
        num_of_grad = 2 * size + 3
    elif mode == 2:
        num_of_grad = 2 * size
    elif mode == 1:
        num_of_grad = size
    else:
        num_of_grad = 0
    sent_grad = zeros(num_of_grad, dtype=REAL)

    if self.init_adjust:
        scale = sqrt(size)
    else:
        scale = size
    new_sent = (random.rand(size).astype(REAL) - 0.5) / scale

    for epoch in range(iteration):
        alpha = self.alpha
        if self.update_mode == 0:
            # linear decay of the learning rate, floored at min_alpha
            alpha = max(self.min_alpha, alpha * (1 - 1.0 * epoch / iteration))
        for sentence in sentences:
            # out-of-vocabulary words are passed on as None
            sampled = [self.vocab.get(token, None) for token in sentence]
            train_sent_vec(self, new_sent, sampled, alpha, work, neu1, sent_grad)
    return new_sent
def build_vec(self, sentences, has_vocab=False):
    """Scan the corpus, build the vocabulary and allocate the sentence vectors.

    Each item of `sentences` is indexed as `(words, sent_id, ...)`. Words
    are counted into `Vocab` entries and handed to `build_vocab`. Every
    unseen sentence id gets the next consecutive number, recorded in
    `sent_no_hash` and `sent_id_list`. A zeroed
    `(sents_len, layer1_size)` matrix is stored in `self.sents`, and
    `reset_weights` is called last.

    NOTE(review): as reconstructed here, `has_vocab` only controls a log
    message; the vocabulary is rebuilt either way -- confirm.
    """
    logger.info("resetting vectors for sentences")
    if not has_vocab:
        logger.info("build vocabulary and")
    logger.info("resetting vectors")
    random.seed(self.seed)
    vocab = {}
    total_words = 0
    self.sents_len = 0  # the num of sentence ids
    self.total_sents = 0  # the num of sentences
    for sentence_no, sent_tuple in enumerate(sentences):
        if sentence_no % 10000 == 0:
            logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                        (sentence_no, total_words, len(vocab)))
        sentence = sent_tuple[0]
        for word in sentence:
            total_words += 1
            if word in vocab:
                vocab[word].count += 1
            else:
                vocab[word] = Vocab(count=1)
        sent_id = sent_tuple[1]
        self.total_sents += 1
        if sent_id not in self.sent_no_hash:
            self.sent_no_hash[sent_id] = self.sents_len
            self.sent_id_list.append(sent_id)
            self.sents_len += 1

    logger.info("collected %i word types from a corpus of %i words and %i sentences(ident:%i)" %
                (len(vocab), total_words, self.total_sents, self.sents_len))
    self.build_vocab(vocab)
    self.sents = matutils.zeros_aligned((self.sents_len, self.layer1_size), dtype=REAL)
    self.reset_weights()
def build_vec(self, sentences, has_vocab=False):
    """Scan the corpus, build the vocabulary and allocate the sentence vectors.

    Items of `sentences` are indexed as `(words, sent_id, ...)`. Word
    frequencies are collected in `Vocab` entries and passed to
    `build_vocab`. Sentence ids are numbered consecutively in order of first
    appearance (`sent_no_hash`, `sent_id_list`, `sents_len`).
    `self.sents` is allocated as zeros with one row per distinct sentence
    id, and `reset_weights` is called last.

    NOTE(review): as reconstructed here, `has_vocab` only controls a log
    message; the vocabulary is rebuilt either way -- confirm.
    """
    logger.info("resetting vectors for sentences")
    if not has_vocab:
        logger.info("build vocabulary and")
    logger.info("resetting vectors")
    random.seed(self.seed)
    vocab = {}
    total_words = 0
    self.sents_len = 0  # the num of sentence ids
    self.total_sents = 0  # the num of sentences
    for sentence_no, sent_tuple in enumerate(sentences):
        if sentence_no % 10000 == 0:
            logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                        (sentence_no, total_words, len(vocab)))
        for word in sent_tuple[0]:
            total_words += 1
            if word in vocab:
                vocab[word].count += 1
            else:
                vocab[word] = Vocab(count=1)
        sent_id = sent_tuple[1]
        self.total_sents += 1
        if sent_id in self.sent_no_hash:
            continue
        # first time this sentence id is seen: assign the next number
        self.sent_no_hash[sent_id] = self.sents_len
        self.sent_id_list.append(sent_id)
        self.sents_len += 1

    logger.info("collected %i word types from a corpus of %i words and %i sentences(ident:%i)" %
                (len(vocab), total_words, self.total_sents, self.sents_len))
    self.build_vocab(vocab)
    self.sents = matutils.zeros_aligned((self.sents_len, self.layer1_size), dtype=REAL)
    self.reset_weights()
def build_vec(self, sentences, has_vocab=False):
    """Scan the corpus, build the vocabulary and allocate sentence/category vectors.

    Items of `sentences` are indexed as `(words, sent_id, cat_id)`. Word
    frequencies are collected in `Vocab` entries and passed to
    `build_vocab`. Sentence ids and category ids are numbered consecutively
    in order of first appearance. `self.sents` and `self.cats` are
    allocated as zeros, then `reset_weights` is called.
    `self.sent_cat_pair` receives the distinct `(sent_no, cat_no)` pairs,
    sorted by cat_no then sent_no, and `self.pair_len` their count.

    NOTE(review): as reconstructed here, `has_vocab` only controls a log
    message; the vocabulary is rebuilt either way -- confirm.
    """
    if not has_vocab:
        logger.info("build vocabulary and")
    logger.info("resetting vectors")
    random.seed(self.seed)
    vocab = {}
    total_words = 0
    self.sents_len = 0  # the num of sentence ids
    self.total_sents = 0  # the num of sentences
    self.cat_len = 0  # the num of category ids
    seen_pairs = set()  # distinct (sent_no, cat_no) combinations
    for sentence_no, sent_tuple in enumerate(sentences):
        if sentence_no % 10000 == 0:
            logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                        (sentence_no, total_words, len(vocab)))
        for word in sent_tuple[0]:
            total_words += 1
            if word in vocab:
                vocab[word].count += 1
            else:
                vocab[word] = Vocab(count=1)
        sent_id = sent_tuple[1]
        cat_id = sent_tuple[2]
        self.total_sents += 1
        if cat_id not in self.cat_no_hash:
            self.cat_no_hash[cat_id] = self.cat_len
            self.cat_id_list.append(cat_id)
            self.cat_len += 1
        if sent_id not in self.sent_no_hash:
            self.sent_no_hash[sent_id] = self.sents_len
            self.sent_id_list.append(sent_id)
            self.sents_len += 1
        seen_pairs.add((self.sent_no_hash[sent_id], self.cat_no_hash[cat_id]))

    logger.info("collected %i word types from a corpus of %i words and %i sentences(ident:%i) with %i categories" %
                (len(vocab), total_words, self.total_sents, self.sents_len, self.cat_len))
    self.build_vocab(vocab)

    self.sents = matutils.zeros_aligned((self.sents_len, self.layer1_size), dtype=REAL)
    self.cats = matutils.zeros_aligned((self.cat_len, self.layer1_size), dtype=REAL)
    self.reset_weights()

    # make sent_cat_pair
    self.pair_len = len(seen_pairs)
    self.sent_cat_pair = empty((self.pair_len, 2), dtype=uint32)
    for idx, (sent_no, cat_no) in enumerate(seen_pairs):
        self.sent_cat_pair[idx][0] = sent_no
        self.sent_cat_pair[idx][1] = cat_no
    # sort by cat_no, sent_no in place (set iteration order is arbitrary)
    self.sent_cat_pair.view('u4,u4').sort(order=['f1', 'f0'], axis=0)