def testSkipGram(vocabulary_file, training_dir):
    """Train a SkipGram model over a directory of text and report progress.

    The vocabulary is loaded with ``read_vocabulary`` (capped at
    ``MAX_VOCAB_SIZE``), then ``NUM_ITER`` passes are made over the sentences
    produced by ``tokenize_files``, using at most ``MAX_SENTENCES`` sentences
    per pass. After each pass the word count and a log-likelihood are
    printed; the total elapsed time is printed at the end.

    Args:
        vocabulary_file: Path of the vocabulary file; concatenated into the
            progress message, so it must be a string.
        training_dir: Location handed to ``tokenize_files``.
    """
    print("Reading vocabulary " + vocabulary_file + "...")
    words, dictionary = read_vocabulary(vocabulary_file, MAX_VOCAB_SIZE)

    print("Reading sentences and training SkipGram...")
    started_at = timer()
    model = SkipGram(len(words), WINDOW_SIZE, HIDDEN_LAYER_SIZE)

    # Deliberately declared outside the iteration loop: if a pass yields no
    # sentences, the sentence remembered from an earlier pass (or None on the
    # very first pass) is what gets used for the log-likelihood call.
    current = None
    for iteration in range(1, NUM_ITER + 1):
        words_this_pass = 0
        corpus = tokenize_files(dictionary, training_dir)
        # The loop variable keeps its final value after the loop ends, which
        # is exactly the "last sentence seen" needed below.
        for current in itertools.islice(corpus, MAX_SENTENCES):
            model.train(current)
            words_this_pass += len(current)

        # The log-likelihood is obtained through another train() call on the
        # last sentence, this time with compute_ll enabled.
        log_likelihood = model.train(current, compute_ll=True)
        print("Iteration %s/%s finished (%s words)"
              % (iteration, NUM_ITER, words_this_pass))
        print("Log-likelihood: %s" % (log_likelihood,))

    print("- Took %.2f sec" % (timer() - started_at))
def run_skipgram():
    """Construct a SkipGram with no arguments and call its ``train`` method."""
    # The instance is not needed afterwards, so it is never bound to a name.
    SkipGram().train()
class Ensemble(BinaryPredictor):
    """Per-diagnosis weighted blend of three underlying predictors.

    The ensemble owns a ``CollaborativeFiltering``, a ``CbowSim`` and a
    ``SkipGram`` model. ``train`` fits all three, then learns one weight per
    (model, diagnosis) pair from how well each model scores the training
    lines. ``predict`` returns the weighted sum of the three models'
    predictions for every diagnosis in ``self._diags``.

    ``self._diags`` is not assigned in this class; it is presumably set up by
    ``BinaryPredictor.__init__`` -- confirm against the base class.
    """

    def __init__(self, filename, window=10, size=600, decay=8.0,
                 balanced=False, prior=True):
        """Create the ensemble and its three sub-models.

        Args:
            filename: Passed to the base class and to every sub-model.
            window: Recorded on the instance and in ``_props`` only; the
                sub-models are built with their own fixed values.
            size: Recorded on the instance and in ``_props`` only; the
                sub-models are built with their own fixed values.
            decay: Recorded and forwarded to every sub-model.
            balanced: Recorded in ``_props`` and forwarded to every sub-model.
            prior: Recorded as ``_prior_pred`` and in ``_props``; not
                forwarded (the sub-models receive fixed True/True/False).
        """
        self._window = window
        self._size = size
        self._decay = decay
        self._prior_pred = prior
        self._stopwordslist = []
        self._props = {
            "window": window,
            "size": size,
            "decay": decay,
            "prior": prior,
            "balanced": balanced,
        }
        super(Ensemble, self).__init__(filename)

        # Each sub-model gets its own hard-coded second and third arguments
        # (presumably window and size, mirroring this constructor -- confirm
        # against the sub-model signatures).
        self.collaborative = CollaborativeFiltering(
            filename, 27, 300, decay, balanced, True)
        self.cbowsim = CbowSim(filename, 45, 275, decay, balanced, True)
        self.skipgram = SkipGram(filename, 23, 350, decay, balanced, False)

        # Attribute names of the sub-models, in the order they are combined.
        self._models = ["collaborative", "cbowsim", "skipgram"]

    def _predict_all(self, feed_events):
        """Return ``{model name: that model's predict(feed_events)}``."""
        return {name: getattr(self, name).predict(feed_events)
                for name in self._models}

    def train(self, filename):
        """Train the sub-models, then learn normalised per-diagnosis weights.

        Every line of ``filename`` is split on ``"|"``: field 0 is a
        comma-separated list of the diagnoses that actually occurred and
        field 2 is a space-separated list of events fed to the models.
        For each diagnosis a model earns its prediction ``p`` when the
        diagnosis occurred and ``1 - p`` when it did not. The accumulated
        scores are then normalised so the models' weights for a diagnosis
        sum to 1.

        Args:
            filename: Path of the training file.

        Raises:
            IndexError: If a line has fewer than three ``"|"`` fields.
        """
        for name in self._models:
            getattr(self, name).train(filename)
        self._prior = self.cbowsim._prior

        # defaultdict(int) has the same default (0) as the former lambda but,
        # unlike a lambda factory, does not prevent pickling the weights.
        self._weights = {name: defaultdict(int) for name in self._models}

        with open(filename) as f:
            for line in f:
                fields = line.split("|")
                # NOTE(review): no stripping is done, so if field 2 is the
                # last field its final event keeps the trailing newline --
                # confirm this matches what the sub-models expect.
                feed_events = fields[2].split(" ")
                actual = set(fields[0].split(","))
                model_preds = self._predict_all(feed_events)
                for diag in self._diags:
                    occurred = diag in actual
                    for name in self._models:
                        p = model_preds[name][diag]
                        self._weights[name][diag] += p if occurred else 1 - p

        # Normalise weights so they sum to 1 across models for each diagnosis.
        model_count = len(self._models)
        for diag in self._diags:
            norm = sum(self._weights[name][diag] for name in self._models)
            for name in self._models:
                if norm:
                    self._weights[name][diag] /= norm
                else:
                    # No usable evidence (e.g. empty training file): fall
                    # back to equal weights instead of dividing by zero.
                    self._weights[name][diag] = 1.0 / model_count
        print(self._weights)

    def predict(self, feed_events):
        """Return the weighted combination of the sub-models' predictions.

        ``train`` must have been called first, since it is the only place
        ``self._weights`` is created.

        Args:
            feed_events: Events forwarded unchanged to each sub-model's
                ``predict``.

        Returns:
            dict mapping every diagnosis in ``self._diags`` to the sum over
            models of ``prediction * weight`` for that diagnosis.
        """
        model_preds = self._predict_all(feed_events)
        predictions = {}
        for diag in self._diags:
            predictions[diag] = sum(
                model_preds[name][diag] * self._weights[name][diag]
                for name in self._models)
        return predictions
class Ensemble(BinaryPredictor):
    """Per-diagnosis weighted blend of three underlying predictors.

    The ensemble owns a ``CollaborativeFiltering``, a ``CbowSim`` and a
    ``SkipGram`` model. ``train`` fits all three, then learns one weight per
    (model, diagnosis) pair from how well each model scores the training
    lines. ``predict`` returns the weighted sum of the three models'
    predictions for every diagnosis in ``self._diags``.

    ``self._diags`` is not assigned in this class; it is presumably set up by
    ``BinaryPredictor.__init__`` -- confirm against the base class.
    """

    def __init__(self, filename, window=10, size=600, decay=8.0,
                 balanced=False, prior=True):
        """Create the ensemble and its three sub-models.

        Args:
            filename: Passed to the base class and to every sub-model.
            window: Recorded on the instance and in ``_props`` only; the
                sub-models are built with their own fixed values.
            size: Recorded on the instance and in ``_props`` only; the
                sub-models are built with their own fixed values.
            decay: Recorded and forwarded to every sub-model.
            balanced: Recorded in ``_props`` and forwarded to every sub-model.
            prior: Recorded as ``_prior_pred`` and in ``_props``; not
                forwarded (the sub-models receive fixed True/True/False).
        """
        self._window = window
        self._size = size
        self._decay = decay
        self._prior_pred = prior
        self._stopwordslist = []
        self._props = {
            "window": window,
            "size": size,
            "decay": decay,
            "prior": prior,
            "balanced": balanced,
        }
        super(Ensemble, self).__init__(filename)

        # Each sub-model gets its own hard-coded second and third arguments
        # (presumably window and size, mirroring this constructor -- confirm
        # against the sub-model signatures).
        self.collaborative = CollaborativeFiltering(
            filename, 27, 300, decay, balanced, True)
        self.cbowsim = CbowSim(filename, 45, 275, decay, balanced, True)
        self.skipgram = SkipGram(filename, 23, 350, decay, balanced, False)

        # Attribute names of the sub-models, in the order they are combined.
        self._models = ["collaborative", "cbowsim", "skipgram"]

    def _predict_all(self, feed_events):
        """Return ``{model name: that model's predict(feed_events)}``."""
        return {name: getattr(self, name).predict(feed_events)
                for name in self._models}

    def train(self, filename):
        """Train the sub-models, then learn normalised per-diagnosis weights.

        Every line of ``filename`` is split on ``"|"``: field 0 is a
        comma-separated list of the diagnoses that actually occurred and
        field 2 is a space-separated list of events fed to the models.
        For each diagnosis a model earns its prediction ``p`` when the
        diagnosis occurred and ``1 - p`` when it did not. The accumulated
        scores are then normalised so the models' weights for a diagnosis
        sum to 1.

        Args:
            filename: Path of the training file.

        Raises:
            IndexError: If a line has fewer than three ``"|"`` fields.
        """
        for name in self._models:
            getattr(self, name).train(filename)
        self._prior = self.cbowsim._prior

        # defaultdict(int) has the same default (0) as the former lambda but,
        # unlike a lambda factory, does not prevent pickling the weights.
        self._weights = {name: defaultdict(int) for name in self._models}

        with open(filename) as f:
            for line in f:
                fields = line.split("|")
                # NOTE(review): no stripping is done, so if field 2 is the
                # last field its final event keeps the trailing newline --
                # confirm this matches what the sub-models expect.
                feed_events = fields[2].split(" ")
                actual = set(fields[0].split(","))
                model_preds = self._predict_all(feed_events)
                for diag in self._diags:
                    occurred = diag in actual
                    for name in self._models:
                        p = model_preds[name][diag]
                        self._weights[name][diag] += p if occurred else 1 - p

        # Normalise weights so they sum to 1 across models for each diagnosis.
        model_count = len(self._models)
        for diag in self._diags:
            norm = sum(self._weights[name][diag] for name in self._models)
            for name in self._models:
                if norm:
                    self._weights[name][diag] /= norm
                else:
                    # No usable evidence (e.g. empty training file): fall
                    # back to equal weights instead of dividing by zero.
                    self._weights[name][diag] = 1.0 / model_count
        print(self._weights)

    def predict(self, feed_events):
        """Return the weighted combination of the sub-models' predictions.

        ``train`` must have been called first, since it is the only place
        ``self._weights`` is created.

        Args:
            feed_events: Events forwarded unchanged to each sub-model's
                ``predict``.

        Returns:
            dict mapping every diagnosis in ``self._diags`` to the sum over
            models of ``prediction * weight`` for that diagnosis.
        """
        model_preds = self._predict_all(feed_events)
        predictions = {}
        for diag in self._diags:
            predictions[diag] = sum(
                model_preds[name][diag] * self._weights[name][diag]
                for name in self._models)
        return predictions