def mmr_geometricmedian_ker(K):
    m = K.shape[0]
    Ka = mean(K, axis=1)
    aKa = np_sum(Ka) / m
    niter = 1000
    xeps = sqrt(np_sum(Ka**2)) / 100
    xerr = 2 * xeps
    e1 = ones(m)
    for iiter in range(niter):
        ## d2u=sqrt((zeros(m)+aKa)+diag(K)-2*Ka)
        d2u_2 = aKa + diag(K) - 2 * Ka
        ineg = where(d2u_2 < 0)[0]
        d2u_2[ineg] = 0.0
        d2u = sqrt(d2u_2)
        inul = where(d2u < xeps)[0]
        d2u[inul] = xeps
        xdenom = np_sum(e1 / d2u)
        Kanext = np_sum(K / outer(d2u, e1), axis=0) / xdenom
        aKanext = np_sum(Ka / d2u) / xdenom
        if np_max(Kanext - Ka) < xerr:
            Ka = copy(Kanext)
            aKa = aKanext
            break
        Ka = copy(Kanext)
        aKa = aKanext
    return (Ka, aKa)
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            # train on the sliding window for target word
            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
        result += len(word_vocabs)
    return result
def train_sentence_dm(model, sentence, lbls, alpha, work=None, neu1=None, train_words=True, train_lbls=True):
    """
    Update distributed memory model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    lbl_indices = [lbl.index for lbl in lbls if lbl is not None]
    lbl_sum = np_sum(model.syn0[lbl_indices], axis=0)
    lbl_len = len(lbl_indices)
    neg_labels = []
    if model.negative:
        # precompute negative labels
        neg_labels = zeros(model.negative + 1)
        neg_labels[0] = 1.

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(model.syn0[word2_indices], axis=0) + lbl_sum  # 1 x layer1_size
        if word2_indices and model.cbow_mean:
            l1 /= (len(word2_indices) + lbl_len)
        neu1e = train_cbow_pair(model, word, word2_indices, l1, alpha, neg_labels, train_words, train_words)
        if train_lbls:
            model.syn0[lbl_indices] += neu1e

    return len([word for word in sentence if word is not None])
def train_sentence_dm(model, sentence, lbls, alpha, work=None, neu1=None, train_words=True, train_lbls=True):
    """
    Update distributed memory model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Doc2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed,
    gensim will use the optimized version from doc2vec_inner instead.

    """
    lbl_indices = [lbl.index for lbl in lbls if lbl is not None]
    if len(lbl_indices) <= model.K:
        return 0
    docIndxPos = int(model.index2word[lbl_indices[0]][5:])
    topKTopics = argsort(model.w_ld[docIndxPos])[::-1][:4]
    selected_lbl_indices = [lbl_indices[0]]
    for i in range(2):
        selected_lbl_indices.append(lbl_indices[topKTopics[i] + 1])
    lbl_sum = np_sum(model.syn0[lbl_indices[0]], axis=0)
    ## lbl_len = len(lbl_indices)
    lbl_len = 1
    neg_labels = []
    if model.negative:
        # precompute negative labels
        neg_labels = zeros(model.negative + 1)
        neg_labels[0] = 1.

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(model.syn0[word2_indices], axis=0) + lbl_sum  # 1 x layer1_size
        if word2_indices and model.cbow_mean:
            l1 /= (len(word2_indices) + lbl_len)
        neu1e = train_cbow_pair(model, word, word2_indices, l1, alpha, neg_labels, train_words, train_words)
        if train_lbls:
            model.syn0[selected_lbl_indices[0]] += neu1e
            model.syn0[selected_lbl_indices[1:]] += (neu1e / model.noOfLabels)
        word2_indices.append(word.index)
        a_1 = np_sum(model.syn0[word2_indices], axis=0) / len(word2_indices)
        docIndxNeg = selectNegativeDocs(docIndxPos)
        myTrain(model, docIndxPos, docIndxNeg, a_1)

    return len([word for word in sentence if word is not None])
def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                      learn_doctags=True, learn_words=True, learn_hidden=True,
                      word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
    method implements the DM model with a projection (input) layer that is
    either the sum or mean of the context vectors, depending on the model's
    `dm_mean` configuration field. See `train_dm_concat()` for the DM model
    with a concatenated input layer.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
                   model.vocab[w].sample_int > model.random.randint(2**32)]
    doctag_sum = np_sum(doctag_vectors[doctag_indexes], axis=0)
    doctag_len = len(doctag_indexes)

    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indexes = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(word_vectors[word2_indexes], axis=0) + doctag_sum  # 1 x layer1_size
        if word2_indexes and model.cbow_mean:
            l1 /= (len(word2_indexes) + doctag_len)
        neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                learn_vectors=False, learn_hidden=learn_hidden)
        if word2_indexes and not model.cbow_mean:
            neu1e /= (len(word2_indexes) + doctag_len)
        if learn_doctags:
            doctag_vectors[doctag_indexes] += neu1e * \
                np_repeat(doctag_locks[doctag_indexes], model.vector_size).reshape(-1, model.vector_size)
        if learn_words:
            word_vectors[word2_indexes] += neu1e * \
                np_repeat(word_locks[word2_indexes], model.vector_size).reshape(-1, model.vector_size)
    return len(word_vocabs)
def compute(self, today, assets, out, data, decay_rate):
    weights = self.weights(len(data), decay_rate)

    mean = average(data, axis=0, weights=weights)
    variance = average((data - mean) ** 2, axis=0, weights=weights)

    squared_weight_sum = np_sum(weights) ** 2
    bias_correction = squared_weight_sum / (squared_weight_sum - np_sum(weights ** 2))
    out[:] = sqrt(variance * bias_correction)
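# Hedged, self-contained sketch (not from the original source) of the same
# bias-corrected weighted standard deviation using plain NumPy. The weight
# scheme w_t = (1 - decay_rate) ** age is an assumption chosen for illustration;
# the real `self.weights()` helper may use a different convention.
import numpy as np

def ewm_std_sketch(data, decay_rate):
    # oldest observation first, newest last
    n = len(data)
    weights = (1.0 - decay_rate) ** np.arange(n - 1, -1, -1)
    mean = np.average(data, axis=0, weights=weights)
    variance = np.average((data - mean) ** 2, axis=0, weights=weights)
    squared_weight_sum = np.sum(weights) ** 2
    bias_correction = squared_weight_sum / (squared_weight_sum - np.sum(weights ** 2))
    return np.sqrt(variance * bias_correction)

# example: ewm_std_sketch(np.random.rand(30, 4), decay_rate=0.06)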
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    """Update CBOW model by training on a sequence of sentences.

    Called internally from :meth:`~gensim.models.fasttext.FastText.train`.

    Notes
    -----
    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from :mod:`gensim.models.fasttext_inner` instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        Model instance.
    sentences : iterable of list of str
        Iterable of the sentences.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`, optional
        UNUSED.
    neu1 : :class:`numpy.ndarray`, optional
        UNUSED.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                ngrams_subwords_indices.extend(model.wv.buckets_word[index])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            # train on the sliding window for target word
            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
        result += len(word_vocabs)
    return result
def jaccard_distance(self):
    def jaccard_similarity(list1, list2):
        intersection = len(list(set(list1).intersection(list2)))
        union = (len(list1) + len(list2)) - intersection
        return float(intersection) / union

    qlist = self.zero_filled_u_l[0]
    rlist = self.zero_filled_u_l[1]
    return np_sum(power(qlist - rlist, 2)) / (np_sum(power(qlist, 2)) +
                                              np_sum(power(rlist, 2)) - np_sum(qlist * rlist))
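# Hedged numeric check (illustrative, not from the original source): the value
# returned above is the Tanimoto-style distance for real-valued vectors,
# sum((q - r)^2) / (sum(q^2) + sum(r^2) - sum(q * r)).
import numpy as np

q = np.array([1.0, 0.0, 2.0])
r = np.array([1.0, 1.0, 0.0])
dist = np.sum((q - r) ** 2) / (np.sum(q ** 2) + np.sum(r ** 2) - np.sum(q * r))
# dist == 5/6 here; identical vectors give 0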
def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
                      learn_doctags=True, learn_words=True, learn_hidden=True,
                      word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
    """
    Update distributed memory model ("PV-DM") by training on a single document.

    Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
    method implements the DM model with a projection (input) layer that is
    either the sum or mean of the context vectors, depending on the model's
    `dm_mean` configuration field. See `train_document_dm_concat()` for the DM
    model with a concatenated input layer.

    The document is provided as `doc_words`, a list of word tokens which are looked up
    in the model's vocab dictionary, and `doctag_indexes`, which provide indexes
    into the doctag_vectors array.

    Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
    prevent learning-updates to those respective model weights, as if using the
    (partially-)frozen model to infer other compatible vectors.

    This is the non-optimized, Python version. If you have a C compiler, gensim
    will use the optimized version from doc2vec_inner instead.

    """
    if word_vectors is None:
        word_vectors = model.wv.syn0
    if word_locks is None:
        word_locks = model.syn0_lockf
    if doctag_vectors is None:
        doctag_vectors = model.docvecs.doctag_syn0
    if doctag_locks is None:
        doctag_locks = model.docvecs.doctag_syn0_lockf

    word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
                   model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]

    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos]

        l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0)
        count = len(word2_indexes) + len(doctag_indexes)
        if model.cbow_mean and count > 1:
            l1 /= count
        neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha,
                                learn_vectors=False, learn_hidden=learn_hidden)
        if not model.cbow_mean and count > 1:
            neu1e /= count
        if learn_doctags:
            for i in doctag_indexes:
                doctag_vectors[i] += neu1e * doctag_locks[i]
        if learn_words:
            for i in word2_indexes:
                word_vectors[i] += neu1e * word_locks[i]

    return len(word_vocabs)
def offspring(self, other):
    '''
    offspring takes two brains (parents) and returns one (child).
    The parents exchange some of their genes (weights and biases),
    and this makes a child.

    The child is the updated version of self.
    So, I need to change that, in order to be able to have more complex selection
    '''
    _child = self.brain

    # decide how many weights you will take from other
    _n = random.randint(np_sum(_child.nodes) + 1)  # "+1" is needed because of the definition of randint
    for __n in range(_n):
        '''
        remember that we represent weights as w^{l}_{ij}, with
        l: layer
        i the i^{th} node of layer l
        j the j^{th} node of layer l+1
        '''
        l = random.randint(_child.layers + 1)
        i = random.randint(_child.nodes[l])
        j = random.randint(_child.nodes[l + 1])
        #print '{},{},{}'.format(l,i,j)
        #print str(self.brain.weights[l][i][j]) +' <-- '+ str(other.brain.weights[l][i][j])
        _child.update_weight(l, i, j, other.brain.weights[l][i][j])

    # decide how many biases you will take from other
    # "+1" is not needed because the input biases are always 0 (ie there are total_nodes-1 biases)
    _n = random.randint(np_sum(_child.total_nodes))
    for __n in range(_n):
        '''
        remember that we represent biases as w^{l}_{i}, with
        l: layer
        i the i^{th} node of layer l
        '''
        l = random.randint(1, _child.layers + 1)  # b^{0}_{i}=0 (can't change)
        i = random.randint(_child.nodes[l])
        #print '{},{}'.format(l,i)
        #print str(self.brain.biases[l][i]) +' <-- '+ str(other.brain.biases[l][i])
        _child.update_bias(l, i, other.brain.biases[l][i])
def s2v_train(sentences, len_sentences, outer_vecs, max_seq_len, wv, weights):
    """Train sentence embedding on a list of sentences

    Called internally from :meth:`~fse.models.sentence2vec.Sentence2Vec.train`.

    Parameters
    ----------
    sentences : iterable of list of str
        The corpus used to train the model.
    len_sentences : int
        Length of the sentence iterable
    wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
        The BaseKeyedVectors instance containing the vectors used for training
    weights : np.ndarray
        Weights used in the summation of the vectors

    Returns
    -------
    np.ndarray
        The sentence embedding matrix of dim len(sentences) * vector_size
    int
        Number of words in the vocabulary actually used for training.
    int
        Number of sentences used for training.

    """
    size = wv.vector_size
    vlookup = wv.vocab

    w_trans = weights[:, None]

    output = empty((len_sentences, size), dtype=REAL)
    for i in range(len_sentences):
        output[i] = full(size, EPS, dtype=REAL)

    effective_words = 0
    effective_sentences = 0

    for i, s in enumerate(sentences):
        sentence_idx = [vlookup[w].index for w in s if w in vlookup]
        if len(sentence_idx):
            v = np_sum(outer_vecs[i][1:min(max_seq_len, len(sentence_idx) + 1), :] *
                       w_trans[sentence_idx[:max_seq_len - 1]], axis=0)
            effective_words += len(sentence_idx)
            effective_sentences += 1
            v *= 1 / len(sentence_idx)
            v /= sqrt(np_sum(v.dot(v)))
            output[i] = v

    return output.astype(REAL), effective_words, effective_sentences
def array_kwargs_ones():
    """ ones(shape, dtype=float, order='C') """

    from numpy import sum as np_sum
    from numpy import ones

    n = 4

    a = ones((n, n - 1), 'float', 'C')
    b = ones((n + 1, 2 * n), float, order='F')
    c = ones((1, n), complex)
    d = ones(dtype=int, shape=2 + n)

    return np_sum(a) + np_sum(b) + np_sum(c) + np_sum(d)
def norm_dist(distribution, smooth=True):
    """ Normalize distribution, and apply add-one smoothing to leave unused
        probability space.
    """
    global smoothing_parameter
    if smooth:
        add_one_smoothing = smoothing_parameter
        norming_factor = np_sum(distribution[:, 0] + add_one_smoothing)
        distribution[:, 0] = (distribution[:, 0] + add_one_smoothing) / norming_factor
    else:
        distribution[:, 0] = distribution[:, 0] / np_sum(distribution[:, 0])
    return distribution
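# Hedged numeric check (illustrative, not from the original source): the add-one
# smoothing above, with smoothing_parameter = 1, turns a raw count column into a
# distribution that reserves non-zero mass for unseen events.
import numpy as np

counts = np.array([[3.], [0.], [1.]])
smoothed = (counts[:, 0] + 1.0) / np.sum(counts[:, 0] + 1.0)
# smoothed -> [4/7, 1/7, 2/7]; the zero-count row keeps probability 1/7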
def _narrowImg(src, width):
    w = src.shape[1]
    l = 0
    r = w - 1
    suml = np_sum(src[:, l])
    sumr = np_sum(src[:, r])
    while w > width:
        if suml <= sumr:
            r -= 1
            sumr = np_sum(src[:, r])
        else:
            l += 1
            suml = np_sum(src[:, l])
        w -= 1
    return l
def mmr_geometricmedian(X):
    (m, n) = X.shape
    u = mean(X, axis=0)
    niter = 1000
    xeps = sqrt(np_sum(u**2)) / 1000
    xerr = 2 * xeps
    for i in range(niter):
        d2u = sqrt(np_sum((X - tile(u, (m, 1)))**2, axis=1))
        inul = where(d2u < xeps)[0]
        d2u[inul] = xeps
        unext = np_sum(X / tile(d2u.reshape((m, 1)), (1, n)), axis=0) / np_sum(ones(m) / d2u)
        if np_max(unext - u) < xerr:
            break
        u = copy(unext)
    return (unext, i, np_max(unext - u))
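# Hedged illustration (not from the original source): the loop above is the classic
# Weiszfeld iteration for the geometric median. A minimal NumPy sketch of the same
# update rule, u <- sum_i(x_i / d_i) / sum_i(1 / d_i) with d_i = ||x_i - u||:
import numpy as np

def weiszfeld_sketch(X, niter=100, eps=1e-8):
    u = X.mean(axis=0)
    for _ in range(niter):
        d = np.sqrt(((X - u) ** 2).sum(axis=1))
        d = np.maximum(d, eps)              # guard against division by zero
        u_next = (X / d[:, None]).sum(axis=0) / (1.0 / d).sum()
        if np.abs(u_next - u).max() < eps:
            return u_next
        u = u_next
    return u

# example: weiszfeld_sketch(np.array([[0., 0.], [1., 0.], [0., 1.], [10., 10.]]))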
def comp_wind_sym(wind_mat):
    """Computes the winding pattern periodicity and symmetries

    Parameters
    ----------
    wind_mat : numpy.ndarray
        Matrix of the Winding

    Returns
    -------
    Nperw: int
        Number of electrical periods of the winding

    """
    assert len(wind_mat.shape) == 4, "dim 4 expected for wind_mat"

    # Summing on all the layers (Nlay_r and Nlay_theta)
    wind_mat2 = squeeze(np_sum(np_sum(wind_mat, axis=1), axis=0))

    qs = wind_mat.shape[3]  # Number of phase
    Zs = wind_mat.shape[2]  # Number of Slot

    Nperw = 1  # Number of electrical periods of the winding
    Nperslot = 1  # Periodicity of the winding in number of slots

    # Looking for the periodicity of each phase
    for q in range(0, qs):
        k = 1
        is_sym = False
        while k <= Zs and not is_sym:
            # We shift the array around the slot and check if it's the same
            if array_equal(wind_mat2[:, q], roll(wind_mat2[:, q], shift=k)):
                is_sym = True
            else:
                k += 1
        # least common multiple to find common periodicity between different phase
        Nperslot = lcm(Nperslot, k)

    # If Nperslot > Zs no symmetry
    if Nperslot > 0 and Nperslot < Zs:
        # nb of periods of the winding (2 means 180°)
        Nperw = Zs / float(Nperslot)
        # if Zs cannot be divided by Nperslot (non integer)
        if Nperw % 1 != 0:
            Nperw = 1

    return int(Nperw)
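# Hedged illustration (not from the original source) of the roll-based periodicity
# test used above: a winding column is periodic with shift k if rolling it by k
# slots leaves it unchanged.
import numpy as np

col = np.array([1, 0, -1, 1, 0, -1])       # pattern repeats every 3 slots
k = next(k for k in range(1, len(col) + 1)
         if np.array_equal(col, np.roll(col, k)))
# k == 3, so this toy column has Zs / k = 2 electrical periods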
def mutate(self):
    '''
    Mutate an individual.
    Mutate a random number of weights and biases
    '''
    _c = self.brain

    # decide how many weights to mutate
    _n = random.randint(np_sum(_c.nodes) + 1)  # "+1" is needed because of the definition of randint
    for __n in range(_n):
        '''
        remember that we represent weights as w^{l}_{ij}, with
        l: layer
        i the i^{th} node of layer l
        j the j^{th} node of layer l+1
        '''
        l = random.randint(_c.layers + 1)
        i = random.randint(_c.nodes[l])
        j = random.randint(_c.nodes[l + 1])
        #print '{},{},{}'.format(l,i,j)
        _c.update_weight(l, i, j, random.choice([-1, 1]) * random.random())  # set a weight from -1 to 1

    # decide how many biases to mutate
    # "+1" is not needed because the input biases are always 0 (ie there are total_nodes-1 biases)
    _n = random.randint(np_sum(_c.total_nodes))
    for __n in range(_n):
        '''
        remember that we represent biases as w^{l}_{i}, with
        l: layer
        i the i^{th} node of layer l
        '''
        l = random.randint(1, _c.layers + 1)  # b^{0}_{i}=0 (can't change)
        i = random.randint(_c.nodes[l])
        print('{},{}'.format(l, i))
        _c.update_bias(l, i, random.choice([-1, 1]) * random.random())  # set a bias from -1 to 1
def preprocess_datasets(self, path_dataset, groupStage):
    PATH_DATA = os_path.join(path_dataset, "4")
    print("(INFO) EVALUATING DATASET ...")
    path_img = sorted(listdir(PATH_DATA))
    if path_img == []:
        return -1, -1
    num_img = len(path_img)

    # Histogram of all images in folder
    hChannel = []
    sChannel = []
    vChannel = []
    for image_path in path_img:
        img = imread(os_path.join(PATH_DATA, image_path))
        img = resize(img, (6000, 4000))
        img = img[500:-500, 750:-750, :]
        # HSV channel
        img = cvtColor(img, COLOR_BGR2HSV)
        # HSV histogram
        h = calcHist([img], [0], None, [256], [0, 256]).reshape(256, )
        s = calcHist([img], [1], None, [256], [0, 256]).reshape(256, )
        v = calcHist([img], [2], None, [256], [0, 256]).reshape(256, )
        hChannel.append(h)
        sChannel.append(s)
        vChannel.append(v)

    # Compute dissimilarity
    maxI = 0
    for i in range(num_img):
        one = []
        for j in range(num_img):
            c1 = np_sum(np_absolute(hChannel[j] - hChannel[i])) / (HEIGHT * WIDTH)
            c2 = np_sum(np_absolute(sChannel[j] - sChannel[i])) / (HEIGHT * WIDTH)
            c = (c1 + c2) / 2
            if c > maxI:
                maxI = c
                save = [i, j]

    img0 = path_img[save[0]]
    img1 = path_img[save[1]]
    imgSample1 = os_path.join(PATH_DATA, img0)
    imgSample2 = os_path.join(PATH_DATA, img1)
    return imgSample1, imgSample2
def log_score_per_ngram(self, corpus):
    """
    Given a corpus outside of training data. Finds the average ngram
    log probability of the corpus.
    :param corpus: String. ASCII encoded corpus to score
    :return: average ngram log probability
    """
    probability_keys = self._get_padded_ngrams(corpus, self.highest_order)
    for i in range(0, len(probability_keys)):
        if (probability_keys[i][-1], ) not in self.vocab:
            probability_keys[i] = *probability_keys[i][:-1], "<unk>"
    sentence_probabilities = [
        self.ngram_probabilities.get(key) for key in probability_keys
    ]
    for i in range(0, len(sentence_probabilities)):
        # this is the case for completely unknown
        if sentence_probabilities[i] is None:
            sentence_probabilities[i] = log(self.av_unk_probability)
        else:
            sentence_probabilities[i] = log(sentence_probabilities[i])
    log_sum = np_sum(sentence_probabilities)
    all_ngrams = []
    all_ngrams.extend(ngrams(corpus.split(), self.highest_order))
    ngram_count = len(all_ngrams)
    if not ngram_count:
        print(
            "Error: Not enough ngrams. Ensure that corpus contains at least "
            "as many words as the highest order"
        )
        return float("-inf")  # this case is impossible
    return log_sum / ngram_count
def construct_local_load(element, shape_functions, quad_data, f):
    #
    # Set quad data
    x_quad = quad_data.x
    w_quad = quad_data.w
    #
    # Init empty matrix
    num_shape_functions = len(shape_functions)
    f_el = zeros((num_shape_functions, 1))
    #
    x0 = element.x[0]
    xl = element.x[-1]
    #
    # Transform the quadrature points
    x_quad_transform = coord_transform(x_quad, x0, xl)
    #
    # Evaluate the functions at the quadrature points
    f_quad = f(x_quad_transform)
    psi_quad = [p.psi(p, x_quad) for p in shape_functions]
    for i in range(num_shape_functions):
        #
        # Perform quadrature
        f_el[i, 0] = ((xl - x0) / 2.) * np_sum(w_quad * (f_quad * psi_quad[i]))
    #
    return f_el
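# Hedged sketch (names are illustrative, not from the original source): each entry
# f_el[i] above is the 1-D load integral int_{x0}^{xl} f(x) psi_i(x) dx evaluated
# with Gauss quadrature, assuming the usual affine map from [-1, 1] to [x0, xl].
import numpy as np

x_quad, w_quad = np.polynomial.legendre.leggauss(3)   # 3-point rule on [-1, 1]
x0, xl = 0.0, 2.0
x_mapped = 0.5 * (xl - x0) * (x_quad + 1.0) + x0       # affine map to [x0, xl]
f = lambda x: x ** 2
psi = lambda x: np.ones_like(x)                        # constant shape function
integral = 0.5 * (xl - x0) * np.sum(w_quad * f(x_mapped) * psi(x_mapped))
# integral == 8/3 (the rule is exact for this polynomial integrand)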
def train_sentence_cbow(model, sentence, context_vector, alpha, work=None, neu1=None):
    """
    Update CBOW model by training on a single sentence.

    The sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from `word2mat.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2mat_inner instead.

    """
    word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
                   model.vocab[w].sample_int > model.random.rand() * 2**32]
    for pos, word in enumerate(word_vocabs):
        reduced_window = model.random.randint(model.window)  # `b` in the original word2mat code

        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(model.syn0[word2_indices], axis=0)  # 1 x vector_size
        l1 = l1.reshape(model.topic_size, model.vector_size)
        l1 = l1.T.dot(context_vector)
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        train_cbow_pair(model, word, word2_indices, context_vector, l1, alpha)

    return len(word_vocabs)
def _reduce_constraints(A, b):
    """ Make the constraint non-singular.

    If the constraint is of the form dot(A, x) = b, A may be singular. To avoid
    this problem, we extract the non-singular part of the equation thanks to the
    SVD: A = U*S*Vh with U.T*U = I and Vh.T*Vh = I.

    If r is the rank of A, we have:
    Ar = S[:r,:r]*Vh[:r,:]
    br = U[:,:r].T*b

    Hence: Ar*x = br
    """
    try:
        u, s, vh = svd(A, full_matrices=False)
        r = np_sum(where(s > 1e-3, 1, 0))  # compute the rank of A
        ur, sr, vhr = u[:, :r], s[:r], vh[:r, :]
        Ar = dot(diag(sr), vhr)
        br = dot(ur.T, b)
    except LinAlgError:
        Ar = A.copy()
        br = b.copy()
    return Ar, br
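# Hedged usage sketch (the matrices below are illustrative, not from the original
# source): reducing a rank-deficient equality constraint A x = b to an equivalent
# full-row-rank system Ar x = br via the truncated SVD, as done above.
import numpy as np

A = np.array([[1., 0., 0.],
              [0., 1., 0.],
              [1., 1., 0.]])   # third row is the sum of the first two -> rank 2
b = np.array([1., 2., 3.])     # consistent right-hand side

u, s, vh = np.linalg.svd(A, full_matrices=False)
r = int(np.sum(s > 1e-3))                 # numerical rank
Ar = np.diag(s[:r]) @ vh[:r, :]
br = u[:, :r].T @ b
# Ar has full row rank; when b is consistent, Ar @ x = br describes the same
# solution set as A @ x = b.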
def mmr_polypower_d(ndim, ndegree):
    ndegree = int(ndegree)
    ## number of terms = \binomial(ndegree+ndim,ndim)
    nd = 1
    for i in range(ndim):
        nd *= (ndegree + i + 1) / (i + 1)
    nd = int(nd)

    xpolydir = {}
    xpower = zeros(ndim, dtype=int)
    xpolydir[tuple(xpower)] = 1

    for i in range(nd):
        for j in range(ndim):
            if xpower[j] < ndegree - np_sum(xpower[j + 1:]):
                xpower[j] += 1
                xpolydir[tuple(xpower)] = 1
                break
            else:
                xpower[j] = 0

    xpolylist = [xpow for xpow in xpolydir.keys()]
    xpolylist.sort()
    xpolypower = array(xpolylist)

    return xpolypower
def mmr_polyfeature_d(xdata, ndegree):
    (m, ndim) = xdata.shape
    ndegree = int(ndegree)
    ## number of terms = \binomial(ndegree+ndim,ndim)
    nd = 1
    for i in range(ndim):
        nd *= (ndegree + i + 1) / (i + 1)
    nd = int(nd)

    xpolydir = {}
    xpower = zeros(ndim, dtype=int)
    xpolydir[tuple(xpower)] = ones(m)

    for i in range(nd):
        for j in range(ndim):
            if xpower[j] < ndegree - np_sum(xpower[j + 1:]):
                xterm = xpolydir[tuple(xpower)]
                xpower[j] += 1
                xpolydir[tuple(xpower)] = xterm * xdata[:, j]
                break
            else:
                xpower[j] = 0

    xpolydata = zeros((m, nd))
    xpolylist = [xpow for xpow in xpolydir.keys()]
    xpolylist.sort()
    for i in range(nd):
        xpow = xpolylist[i]
        xpolydata[:, i] = xpolydir[xpow]

    return xpolydata
def mmr_polypower_dn(ndim, maxdegree, ldegree):
    maxdegree = int(maxdegree)
    if len(ldegree) == 0:
        ldegree = [maxdegree] * ndim

    xpolydir = {}
    xpower = zeros(ndim, dtype=int)
    xpolydir[tuple(xpower)] = 1

    istate = 1
    while istate == 1:
        for j in range(ndim):
            if xpower[j] < min(maxdegree - np_sum(xpower[j + 1:]), ldegree[j]):
                xpower[j] += 1
                xpolydir[tuple(xpower)] = 1
                break
            else:
                if j < ndim - 1:
                    xpower[j] = 0
                else:
                    istate = 0

    xpolylist = [xpow for xpow in xpolydir.keys()]
    xpolylist.sort()
    xpolypower = array(xpolylist)

    return xpolypower
def mmr_polyfeature_dn(xdata, maxdegree, ldegree):
    (m, ndim) = xdata.shape
    maxdegree = int(maxdegree)
    if len(ldegree) == 0:
        ldegree = [maxdegree for i in range(ndim)]

    xpolydir = {}
    xpower = zeros(ndim, dtype=int)
    xpolydir[tuple(xpower)] = ones(m)

    istate = 1
    while istate == 1:
        for j in range(ndim):
            if xpower[j] < min(maxdegree - np_sum(xpower[j + 1:]), ldegree[j]):
                xterm = xpolydir[tuple(xpower)]
                xpower[j] += 1
                xpolydir[tuple(xpower)] = xterm * xdata[:, j]
                break
            else:
                if j < ndim - 1:
                    xpower[j] = 0
                else:
                    istate = 0

    xpolylist = [xpow for xpow in xpolydir.keys()]
    xpolylist.sort()
    nd = len(xpolylist)
    xpolydata = zeros((m, nd))
    for i in range(nd):
        xpow = xpolylist[i]
        xpolydata[:, i] = xpolydir[xpow]

    return xpolydata
def array_kwargs_full():
    """ full(shape, fill_value, dtype=None, order='C') """

    from numpy import sum as np_sum
    from numpy import full

    n = 3

    a = full((n, n - 1), 0.5, 'float', 'C')
    b = full((n + 1, 2 * n), 2.0, order='F')
    c = full((1, n), 3)
    d = full(2 + n, order='F', fill_value=5)
    e = full(dtype=int, fill_value=1.0, shape=2 * n)

    return np_sum(a) + np_sum(b) + np_sum(c) + np_sum(d) + np_sum(e)
def train_batch_labeled_cbow(model, sentences, alpha, work=None, neu1=None):
    result = 0
    for sentence in sentences:
        document, target = sentence
        word_vocabs = [model.wv.vocab[w] for w in document if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        target_vocabs = [model.lvocab[t] for t in target if t in model.lvocab]
        for target in target_vocabs:
            word2_indices = [w.index for w in word_vocabs]
            l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
            if word2_indices and model.cbow_mean:
                l1 /= len(word2_indices)
            if model.softmax:
                train_cbow_pair_softmax(model, target, word2_indices, l1, alpha)
            else:
                train_cbow_pair(model, target, word2_indices, l1, alpha)
        result += len(word_vocabs)
    return result
def train_sentence_sg(model, sentence, context_vector, alpha, work=None, neu1=None):
    """
    Update skip-gram model by training on a single sentence.

    The sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from `word2mat.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2mat_inner instead.

    """
    word_vocabs = [(model.vocab[w], t) for w, t in sentence if w in model.vocab and
                   model.vocab[w].sample_int > model.random.rand() * 2**32]
    for pos, item in enumerate(word_vocabs):
        word, topic = item
        reduced_window = model.random.randint(model.window)  # `b` in the original word2mat code

        topic_start = max(0, pos - model.topic_window)
        for i in xrange(model.topic_size):
            context_vector[i] = 0.
        for pos2, item2 in enumerate(word_vocabs[topic_start:(pos + model.topic_window + 1)], topic_start):
            word2, topic2 = item2
            context_vector[topic2] += 1
        context_vector = context_vector / np_sum(context_vector)

        # now go over all words from the (reduced) window, predicting each one in turn
        start = max(0, pos - model.window + reduced_window)
        for pos2, item2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
            word2, topic2 = item2
            # don't train on the `word` itself
            if pos2 != pos:
                train_sg_pair(model, model.index2word[word2.index], word.index, context_vector, alpha)

    return len(word_vocabs)
def train_sentence_fastsent(model, sentences, alpha, work=None, neu1=None):
    """
    Update parameters based on three consecutive sentences from the training data

    model: the model object
    sentences: an ordered list of three sentences as lists of words
    alpha: the learning rate

    """
    current_sent = sentences[1]
    if model.autoencode:
        context_sents = sentences[0] + sentences[1] + sentences[2]
    else:
        context_sents = sentences[0] + sentences[2]
    word_vocabs = [model.vocab[w] for w in current_sent if w in model.vocab and
                   model.vocab[w].sample_int > model.random.rand() * 2**32]
    context_vocabs = [model.vocab[w] for w in context_sents if w in model.vocab and
                      model.vocab[w].sample_int > model.random.rand() * 2**32]
    word2_indices = [word.index for word in word_vocabs]
    l1 = np_sum(model.syn0[word2_indices], axis=0)  # 1 x vector_size
    if word2_indices and model.fastsent_mean:
        l1 /= len(word2_indices)
    for word in context_vocabs:
        train_fastsent_pair(model, word, word2_indices, l1, alpha)
    return len(context_vocabs)
def wmd(document1, document2, model):
    # Remove out-of-vocabulary words.
    document1 = [token for token in document1 if token in model]
    document2 = [token for token in document2 if token in model]

    if len(document1) == 0 or len(document2) == 0:
        return 1.

    dictionary = Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)

    # Compute distance matrix.
    distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in list(dictionary.items()):
        for j, t2 in list(dictionary.items()):
            distance_matrix[i, j] = scipy.spatial.distance.cosine(model[t1], model[t2])

    if np_sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        return 0.

    def nbow(document):
        d = zeros(vocab_len, dtype=double)
        nbow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(document1)
    d2 = nbow(document2)

    # Compute WMD.
    res = emd(d1, d2, distance_matrix)
    return res if res >= 0 else 1
def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
    """
    Update CBOW model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Word2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.

    """
    labels = []
    if model.negative:
        # precompute negative labels
        labels = zeros(model.negative + 1)
        labels[0] = 1.

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(model.syn0[word2_indices], axis=0)  # 1 x layer1_size
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        train_cbow_pair(model, word, word2_indices, l1, alpha, labels)

    return len([word for word in sentence if word is not None])
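# Hedged sketch (illustrative, not from the original source) of the projection
# step shared by the CBOW-style training functions above: the network input is
# the sum of the context word vectors, optionally averaged when cbow_mean is set.
import numpy as np

syn0 = np.random.rand(10, 4)          # toy input-embedding matrix: 10 words, dim 4
word2_indices = [2, 5, 7]             # context word indices inside the window
cbow_mean = True

l1 = np.sum(syn0[word2_indices], axis=0)
if word2_indices and cbow_mean:
    l1 /= len(word2_indices)          # 1 x layer1_size projection fed to the output layer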
def get_field(self, axes_list):
    """Returns the values of the field (with symmetries and sums).

    Parameters
    ----------
    self: Data
        a Data object
    axes_list: list
        a list of RequestedAxis objects

    Returns
    -------
    values: ndarray
        values of the field
    """
    values = self.values
    for axis_requested in axes_list:
        # Rebuild symmetries only for fft case
        axis_symmetries = self.axes[axis_requested.index].symmetries
        if axis_requested.transform == "fft" and "antiperiod" in axis_symmetries:
            nper = axis_symmetries["antiperiod"]
            axis_symmetries["antiperiod"] = 2
            values = rebuild_symmetries(values, axis_requested.index, axis_symmetries)
            axis_symmetries["antiperiod"] = nper

        # Sum over sum axes
        if axis_requested.extension == "sum":
            values = np_sum(values, axis=axis_requested.index)
    return values
def compute_semantic_distance_matrix(model, noun_freq_polar1_terms, noun_freq_polar2_terms,
                                     dictionary, filename1, filename2):
    # Dictionary is doc1 terms * doc2 terms and
    # distance matrix is ((doc1 terms + doc2 terms) * (doc1 terms + doc2 terms))
    # This dimension of matrix is required for Earth Mover distance computation
    vocab_len = len(dictionary)
    docset1 = set(noun_freq_polar1_terms)
    docset2 = set(noun_freq_polar2_terms)
    distance_matrix = np.full((vocab_len, vocab_len), 0.0)

    for i, t1 in dictionary.items():
        for j, t2 in dictionary.items():
            if t1 not in docset1 or t2 not in docset2:
                continue
            if t1 == t2 and model.strategy != "doc2vec":
                distance_matrix[i, j] = 0.00001
                continue
            distance_matrix[i, j] = model.compute_semantic_distance(t1, t2, "cosine")

    if np_sum(distance_matrix) == 0.0:
        print('The distance matrix is all zeros.')
        return None

    return distance_matrix
def rmsle(actual, predicted):
    """ Root mean squared logarithmic error.
    """
    actual, predicted = _preformat_inputs(actual, predicted)

    count_of = predicted.shape[0]
    square_logarithm_difference = log((actual + 1) / (predicted + 1)) ** 2
    return sqrt((1 / count_of) * np_sum(square_logarithm_difference))
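# Hedged numeric check (illustrative, not from the original source): RMSLE reduces
# to sqrt(mean((log(a + 1) - log(p + 1))**2)); a quick verification with NumPy:
import numpy as np

actual = np.array([3., 5., 2.5, 7.])
predicted = np.array([2.5, 5., 4., 8.])
rmsle_value = np.sqrt(np.mean((np.log1p(actual) - np.log1p(predicted)) ** 2))
# rmsle_value ~= 0.199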
def area_and_convexity_single_batch(data_mb, num_constraints=None, area=None, polygons_number=None,
                                    img_width=None, img_height=None, **kwargs):
    # preallocate an output array that will contain the computed
    # constraints values for the batch
    mb_constraints_values = np_empty(shape=(data_mb.shape[0], num_constraints), dtype=np_float32)

    for i in prange(data_mb.shape[0]):
        sample = data_mb[i]

        # preallocate an output array that will contain the computed
        # constraints value for the i-th element
        constraints = np_empty(shape=(num_constraints, ), dtype=np_float32)

        target_area = area * polygons_number
        nonzero = np_sum(sample)
        norm = img_width * img_height - target_area

        greater_area_inner = min(1, max(0, nonzero - target_area) / norm)
        smaller_area_inner = min(1, max(0, target_area - nonzero) / norm)

        constraints[0] = greater_area_inner
        constraints[1] = smaller_area_inner

        # convexity
        constraints[2] = _convex(sample, img_width, img_height)

        mb_constraints_values[i] = constraints

    return mb_constraints_values
def get_field(self, axes_list):
    """Returns the values of the field (with symmetries and sums).

    Parameters
    ----------
    self: Data
        a Data object
    axes_list: list
        a list of RequestedAxis objects

    Returns
    -------
    values: ndarray
        values of the field
    """
    values = self.values
    for axis_requested in axes_list:
        # Rebuild symmetries only for fft case
        if (
            axis_requested.transform == "fft"
            and axis_requested.corr_name in self.symmetries.keys()
        ):
            if "antiperiod" in self.symmetries.get(axis_requested.corr_name):
                values = self.rebuild_symmetries(
                    values,
                    axis_requested.corr_name,
                    axis_requested.index,
                    is_antiperiod=True,
                )

        # Sum over sum axes
        if axis_requested.extension == "sum":
            values = np_sum(values, axis=axis_requested.index)
    return values
def coding_bases(self, seq_id):
    """Calculate number of coding bases in sequence."""

    # check if sequence has any genes
    if seq_id not in self.genes:
        return 0

    return np_sum(self.coding_mask[seq_id])
def kullback_leibler(actual, predicted):
    """ Kullback-Leibler error.
    """
    actual, predicted = _preformat_inputs(actual, predicted)

    count_of_inputs = actual.shape[0]
    return (1. / count_of_inputs) * np_sum(
        predicted * log(predicted / actual) +
        (1 - predicted) * log((1 - predicted) / (1 - actual))
    )
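# Hedged numeric check (illustrative, not from the original source): for 1-D inputs
# the quantity above is the mean element-wise binary KL divergence between
# `predicted` and `actual`, and it is 0 when the two agree exactly.
import numpy as np

actual = np.array([0.7, 0.2])
predicted = np.array([0.6, 0.3])
kl = np.mean(predicted * np.log(predicted / actual)
             + (1 - predicted) * np.log((1 - predicted) / (1 - actual)))
# kl ~= 0.025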
def train_cat_vec_cbow_pp(model, sent_vec, cat_vec, sentence, alpha, work=None, neu1=None,
                          sent_vec_grad=None, cat_vec_grad=None):
    """
    Update CBOW model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Sent2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.

    """
    w2vmodel = model.w2v
    if model.negative:
        # precompute negative labels
        labels = zeros(model.negative + 1)
        labels[0] = 1.

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(w2vmodel.syn0[word2_indices], axis=0)  # 1 x layer1_size
        l1 += sent_vec + cat_vec
        if word2_indices and model.cbow_mean:
            l1 /= (len(word2_indices) + 1)  ## modified by jmarui
        neu1e = zeros(l1.shape)

        if model.hs:
            l2a = w2vmodel.syn1[word.point]  # 2d matrix, codelen x layer1_size
            fa = 1. / (1. + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
            ga = (1. - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
            if model.word_learn == 1:
                w2vmodel.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
            neu1e += dot(ga, l2a)  # save error

        if model.negative:
            # use this word (label = 1) + `negative` other random words not from this sentence (label = 0)
            word_indices = [word.index]
            while len(word_indices) < model.negative + 1:
                w = w2vmodel.table[random.randint(w2vmodel.table.shape[0])]
                if w != word.index:
                    word_indices.append(w)
            l2b = w2vmodel.syn1neg[word_indices]  # 2d matrix, k+1 x layer1_size
            fb = 1. / (1. + exp(-dot(l1, l2b.T)))  # propagate hidden -> output
            gb = (labels - fb) * alpha  # vector of error gradients multiplied by the learning rate
            if model.word_learn == 1:
                w2vmodel.syn1neg[word_indices] += outer(gb, l1)  # learn hidden -> output
            neu1e += dot(gb, l2b)  # save error

        if model.word_learn == 1:
            w2vmodel.syn0[word2_indices] += neu1e  # learn input -> hidden, here for all words in the window separately
        sent_vec += neu1e  # learn input -> hidden, here for all words in the window separately
        if model.cat_learn == 1:
            cat_vec += neu1e  # learn input -> hidden, here for all words in the window separately

    return len([word for word in sentence if word is not None])
def train_sentence_cbow(model, sentence, alpha, work=None, neu1=None):
    """
    Update CBOW model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Word2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.

    """
    labels = []
    if model.negative:
        # precompute negative labels
        labels = zeros(model.negative + 1)
        labels[0] = 1.

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip

        #reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        #start = max(0, pos - model.window + reduced_window)
        start = max(0, pos - model.window)
        #window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start)
        window_pos = enumerate(sentence[start : pos + 1], start)
        #window_pos = enumerate(sentence[start : pos + model.window + 1], start)

        word2_indices_tmp = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        word2_indices = []
        for w2_i in range(0, len(word2_indices_tmp)):
            w2_index = word2_indices_tmp[w2_i]
            name = model.index2word[w2_index]
            if model.context_labeling == True:
                if w2_i >= model.window:
                    w2_i = w2_i - model.window
                labeled_name = "LabCon_" + str(name) + "_" + str(w2_i)
                vocab_obj = model.vocab[labeled_name]
                word2_indices.append(vocab_obj.index)
            else:
                vocab_obj = model.vocab[name]
                word2_indices.append(vocab_obj.index)

        l1 = np_sum(model.syn0[word2_indices], axis=0)  # 1 x layer1_size
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        train_cbow_pair(model, word, word2_indices, l1, alpha, labels)

    return len([word for word in sentence if word is not None])
def cross_entropy_error(actual, predicted, epsilon=1e-10):
    """ Cross entropy error.
    """
    actual, predicted = _preformat_inputs(actual, predicted)

    count_of_inputs = actual.shape[0]
    return -(1 / count_of_inputs) * np_sum(
        predicted * log(actual + epsilon) +
        (1 - predicted) * log(1 - actual + epsilon)
    )
def get_dNr_psi_w_mtx(self, r_pnt, node_ls_values, r_ls_value):
    '''
    Return the derivatives of the shape functions
    '''
    #print "in dN ",r_pnt
    p_N_mtx = self.parent_fets.get_N_mtx(r_pnt)
    p_dNr_mtx = self.get_dNr_mtx(r_pnt)
    p_N_red = vstack((p_N_mtx[2, 2::4], p_N_mtx[3, 3::4]))
    second = np_sum((abs(node_ls_values) * p_dNr_mtx), axis=-1)
    third = np_sum((p_N_red[0] * abs(node_ls_values)))
    fourth = np_sum((p_dNr_mtx * node_ls_values), axis=-1)
    A_mtx = p_N_red * (second - sign(r_ls_value) * fourth)[:, None]
    #A_mtx = p_N_red * ( -1.* sign(r_ls_value))
    B_mtx = p_dNr_mtx * (third - abs(r_ls_value))
    dNr_e_mtx = A_mtx + B_mtx
    return dNr_e_mtx
def train_online(self, sentence, epoch=20):
    # a deterministic seed for each sentence
    s1 = ' '.join(sentence[:10])[:10]
    #logger.info("online training for a single sentence '%s'" % s1)
    #start = time.time()

    # preprocess the sentence so that words not in the vocabulary are dropped
    sentence = filter(lambda x: x in self.vocab, sentence)

    # generate a document vector for the unseen review
    doc_vec = empty((1, self.layer1_size), dtype=REAL)
    random.seed(uint32(self.hashfxn(s1[:10] + str(self.seed))))
    doc_vec = (random.rand(self.layer1_size) - 0.5) / self.layer1_size
    alpha = self.alpha

    #logger.info("before training %s", doc_vec)
    for _ in xrange(epoch):
        # the code below is adapted from train_sentence_dbow and train_sg_pair
        #logger.info("epoch %d" % i)
        if self.sg:
            for w in sentence:
                word = self.vocab[w]
                neu1e = zeros(doc_vec.shape)
                # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance)
                l2a = deepcopy(self.syn1[word.point])  # 2d matrix, codelen x layer1_size
                fa = expit(dot(doc_vec, l2a.T))  # propagate hidden -> output
                ga = (1 - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                neu1e += dot(ga, l2a)  # save error
                doc_vec += neu1e
        else:
            # the code below is adapted from train_sentence_dm and train_cbow_pair
            for pos, w in enumerate(sentence):
                word = self.vocab[w]
                reduced_window = random.randint(self.window)  # `b` in the original doc2vec code
                start = max(0, pos - self.window + reduced_window)
                window_pos = enumerate(sentence[start : pos + self.window + 1 - reduced_window], start)
                word2_indices = [self.vocab[word2].index for pos2, word2 in window_pos
                                 if (word2 is not None and pos2 != pos)]
                l1 = np_sum(self.syn0[word2_indices], axis=0) + doc_vec  # 1 x layer1_size
                if word2_indices and self.cbow_mean:
                    l1 /= (len(word2_indices) + 1)
                neu1e = zeros(l1.shape)
                l2a = self.syn1[word.point]  # 2d matrix, codelen x layer1_size
                # use scipy.special.expit to avoid overflow/underflow
                fa = expit(dot(l1, l2a.T))  # propagate hidden -> output
                ga = (1. - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                neu1e += dot(ga, l2a)  # save error
                doc_vec += neu1e

    #logger.info("after training %s", doc_vec)
    #elapsed = time.time() - start
    #logger.info("training 1 sentence took %.1fs" % elapsed)
    return doc_vec
def normilize_error_output(output):
    """ Normalize error output when result is non-scalar.

    Parameters
    ----------
    output : array-like
        Input can be any numpy array or matrix.

    Returns
    -------
    int, float
        Return sum of all absolute values.
    """
    return np_sum(np_abs(output))
def train_cbow_pair(model, word, word2_indices, l1, alpha, labels, train_w1=True, train_w2=True):
    neu1e = zeros(l1.shape)

    if model.hs:
        if len(word2_indices) >= 1:
            l1 = np_sum(model.syn0[word2_indices], axis=0)  # 1 x layer1_size
            if word2_indices and model.cbow_mean:
                l1 /= len(word2_indices)
            l2a = deepcopy(model.syn1[word.point])  # 2d matrix, codelen x layer1_size
            fa = 1. / (1. + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
            ga = (1. - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
            if train_w1:
                model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
            neu1e += dot(ga, l2a)  # save error
        else:
            l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
            fa = 1. / (1. + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
            ga = (1. - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
            if train_w1:
                model.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
            neu1e += dot(ga, l2a)  # save error

    if model.negative:
        # use this word (label = 1) + `negative` other random words not from this sentence (label = 0)
        word_indices = [word.index]
        while len(word_indices) < model.negative + 1:
            w = model.table[random.randint(model.table.shape[0])]
            if w != word.index:
                word_indices.append(w)
        l2b = model.syn1neg[word_indices]  # 2d matrix, k+1 x layer1_size
        fb = 1. / (1. + exp(-dot(l1, l2b.T)))  # propagate hidden -> output
        gb = (labels - fb) * alpha  # vector of error gradients multiplied by the learning rate
        if train_w1:
            model.syn1neg[word_indices] += outer(gb, l1)  # learn hidden -> output
        neu1e += dot(gb, l2b)  # save error

    if train_w2:
        model.syn0[word2_indices] += neu1e  # learn input -> hidden, here for all words in the window separately

    return neu1e
def train_epoch(self, input_train, target_train):
    centers = self.centers
    old_centers = centers.copy()
    output_train = self.predict(input_train)

    for i, center in enumerate(centers):
        positions = argwhere(output_train[:, 0] == i)

        if not np_any(positions):
            continue

        class_data = take(input_train, positions, axis=0)
        centers[i, :] = (1 / len(class_data)) * np_sum(class_data, axis=0)

    return np_abs(old_centers - centers)
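# Hedged sketch (illustrative, not from the original source) of the center-update
# step above: each center moves to the mean of the samples currently assigned to it.
import numpy as np

input_train = np.array([[0.0, 0.0], [0.2, 0.1], [5.0, 5.1], [4.8, 5.0]])
assignment = np.array([0, 0, 1, 1])          # cluster index per sample
centers = np.vstack([input_train[assignment == i].mean(axis=0) for i in range(2)])
# centers -> [[0.1, 0.05], [4.9, 5.05]]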
def score_document_labeled_cbow(model, document, labels=None, work=None, neu1=None):
    word_vocabs = [model.wv.vocab[w] for w in document if w in model.wv.vocab]

    if labels is not None:
        targets = [model.lvocab[label] for label in labels]
    else:
        targets = model.lvocab.values()
        labels = model.lvocab.keys()

    word2_indices = [word2.index for word2 in word_vocabs]
    l1 = np_sum(model.wv.syn0[word2_indices], axis=0)  # 1 x layer1_size
    if word2_indices and model.cbow_mean:
        l1 /= len(word2_indices)

    return zip(labels, score_cbow_labeled_pair(model, targets, l1))
def train_epoch(self, input_data, target_train):
    weights = self.weights

    minimized = dot(input_data, weights)
    reconstruct = dot(minimized, weights.T)
    error = input_data - reconstruct

    weights += self.step * dot(error.T, minimized)

    mae = np_sum(np_abs(error)) / input_data.size

    del minimized
    del reconstruct
    del error

    return mae
def recognize(self, image):
    mem = self.mem
    converg = 0
    result_img = copy(image)  #np.array(dummy.shared_array) #copy(image)
    for idx in range(8):
        pred_img = copy(result_img)
        col = 0
        for idx1 in range(self.im_size_sq):
            assoc = np_sum(pred_img * mem[idx1])
            result_img[idx1] = 1 if neuro_tools.sign(assoc) else -1
            if pred_img[idx1] == result_img[idx1]:
                col += 1
            converg += abs(pred_img[idx1] - result_img[idx1])
        if col == self.im_size_sq:
            return converg / (self.img_in_memory ** .5)
    return sys.float_info.max
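# Hedged illustration (not from the original source) of the Hopfield-style update
# used above: each pixel is set according to the sign of its weighted input from
# the stored-pattern weight matrix. The sign convention (>= 0 maps to +1) is this
# sketch's choice, not necessarily that of `neuro_tools.sign`.
import numpy as np

pattern = np.array([1, -1, 1, -1])
mem = np.outer(pattern, pattern)            # Hebbian weights for one stored pattern
np.fill_diagonal(mem, 0)
noisy = np.array([1, 1, 1, -1])             # one flipped pixel
recalled = noisy.copy()
for i in range(len(recalled)):
    recalled[i] = 1 if np.sum(recalled * mem[i]) >= 0 else -1
# recalled now equals the stored pattern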