def _report_clusters(method, vectors, y, end_cluster, corpus):
    """Print the embedding matrix for *method*, then run every configured
    clustering algorithm over it, printing and persisting each assignment.

    method      -- label used by printMatrix/printVector ("Trace2Vec", ...)
    vectors, y  -- embedding matrix and ground-truth labels
    end_cluster -- the embedding module's endCluster callable
    corpus      -- fifth argument forwarded to end_cluster (the corpus for
                   Trace2Vec/Node2Vec, a zero placeholder list for NGrams)
    """
    printMatrix(vectors, method, "vectors")
    for alg in clustering:
        assigned_clusters = cluster(alg, vectors, y)
        printVector(assigned_clusters, method, "clusters", alg)
        end_cluster(logName, assigned_clusters, vectorsize, alg, corpus)


def main():
    """Build embeddings of the configured log with Trace2Vec, Node2Vec and
    NGrams, cluster each embedding space with every configured algorithm,
    report comparative scores, and plot the embeddings when they are 2-D."""
    prepareInput.createInput(logName)
    scores = []

    # ---------- Trace2Vec ----------
    Trace2Vec.learn(logName, vectorsize)
    y = Trace2Vec.getY(logName)
    vectors, corpus = Trace2Vec.startCluster(logName, vectorsize)
    _report_clusters("Trace2Vec", vectors, y, Trace2Vec.endCluster, corpus)

    # ---------- Node2Vec ----------
    args = Node2Vec.parse_args()
    args.input = "input/" + logName + ".graph"
    args.output = "output/" + logName + "N2VVS" + str(vectorsize) + ".node2vec"
    nx_G = Node2Vec.read_graph(args)
    # NOTE(review): second argument looks like the is_directed flag of the
    # node2vec reference implementation -- confirm.
    G = node2vec.Graph(nx_G, True, args.p, args.q)
    G.preprocess_transition_probs()
    walks = G.simulate_walks(args.num_walks, args.walk_length)
    Node2Vec.learn_embeddings(args, logName, vectorsize, walks)
    Node2Vec.extract(logName, vectorsize)
    y = Node2Vec.getY(logName)
    vectors, corpus = Node2Vec.startCluster(logName, vectorsize)
    _report_clusters("Node2Vec", vectors, y, Node2Vec.endCluster, corpus)

    # ---------- NGrams ----------
    vectors, y = NGrams.ngrams_BPI_2015(logName, vectorsize)
    # NGrams has no corpus; endCluster receives one zero per vector instead.
    _report_clusters("NGrams", vectors, y, NGrams.endCluster, [0] * len(vectors))

    # Collect and report evaluation scores for all three embeddings.
    for method in ("Trace2Vec", "Node2Vec", "NGrams"):
        scores.append(get_scores(method))
    for score in scores:
        print_scores(score)

    # Only 2-D embeddings can be plotted directly.
    if vectorsize == 2:
        for emb in embed:
            myPlot.plot(emb)
def get_salt_file_character_n_grams():
    """Return the distinct character n-grams of the salt file's contents
    as a list (order unspecified).

    Fix: the previous version passed the file NAME (NGramHash.SALT_FILE_NAME)
    straight to NGrams.get_character_n_grams, so the n-grams were computed
    over the path string instead of the salt text. Both
    get_character_n_gram_set and the private
    __get_salt_file_character_n_grams read the file first -- this now does
    the same.
    """
    salt_text = FileIo.get_text_file_contents(NGramHash.SALT_FILE_NAME)
    char_n_grams = NGrams.get_character_n_grams(salt_text, NGramHash.N_GRAMS_SIZE)
    # De-duplicate, keeping the list return type callers expect.
    return list(set(char_n_grams))
def get_character_n_gram_set(file_name, n):
    """Return the set of distinct character n-grams of length *n* found in
    the contents of *file_name*."""
    contents = FileIo.get_text_file_contents(file_name)
    return set(NGrams.get_character_n_grams(contents, n))
def get_word_n_gram_set(file_name, n):
    """Return the set of distinct word n-grams of length *n* found in the
    contents of *file_name*."""
    contents = FileIo.get_text_file_contents(file_name)
    return set(NGrams.get_word_n_grams(contents, n))
def evaluateLine(self, line):
    """Score *line* with n-gram probabilities and reject it as soon as any
    word transition looks too unlikely.

    Slides a phrase window over the words, first growing the window (the
    n-gram order) from 1 up to 4, then advancing it one word at a time.
    Returns False the first time the probability of the next word given
    the phrase falls below -3.75, True if the whole line survives.
    (Python 2 code: print statement and xrange.)
    """
    print "Evaluating", line
    words = line.split(' ')
    currentOrder = 1  # current context length; grows to at most 4
    index = 0         # start of the phrase window
    while (index + currentOrder) < len(words):
        # Build the context phrase from words[index:index+currentOrder].
        phrase = ""
        for i in xrange(index, currentOrder + index):
            phrase = phrase + " " + words[i]
        phrase = phrase.strip()
        # The candidate word right after the phrase. Relies on the loop
        # variable `i` leaking out of the for-loop (i == index+currentOrder-1).
        possibleWord = words[i+1]
        #print phrase, possibleWord, currentOrder+1
        nGramMap = NGrams.getProbabilities(phrase, [possibleWord], currentOrder+1) #returns an array of maps -- need to convert
        nGramProbability = nGramMap[0]["probability"]
        #print phrase, possibleWord, nGramProbability
        # Reject the whole line on any sufficiently unlikely transition.
        # NOTE(review): -3.75 is presumably a log-probability threshold -- confirm.
        if nGramProbability < -3.75:
            return False
        # Grow the order until it reaches 4; only then start sliding the window.
        if currentOrder == 4:
            index += 1
        if currentOrder < 4:
            currentOrder += 1
    return True
def __get_salt_file_character_n_grams(self):
    """Return the salt file's distinct character n-grams as a list
    (order unspecified)."""
    salt_text = FileIo.get_text_file_contents(SALT_FILE_NAME)
    return list(set(NGrams.get_character_n_grams(salt_text, N_GRAMS_SIZE)))
def getNewLine(self, PoS, editedLine, transformedText, rhymeScheme, meter, newTheme, oldTheme, currentLineNumber):
    """Rebuild *editedLine* by replacing each "_" placeholder with a word
    that fits the slot's meter, part of speech, the line's rhyme scheme,
    and the new theme; non-placeholder words are kept verbatim.

    PoS               -- per-word (token, tag) pairs for the original line
    editedLine        -- line text with replaceable words blanked out as "_"
    transformedText   -- lines already generated, indexed by line number
    rhymeScheme       -- rhymeScheme[k] is the index of the line that line k
                         must rhyme with (k itself when it opens a series)
    meter             -- per-slot emphasis pattern for this line
    newTheme          -- theme words drive cosine-similarity weighting
    oldTheme          -- unused here
    currentLineNumber -- index of the line being generated

    Returns the completed line as a stripped string.
    """
    #magic happens
    newLine = ""
    allwords = self.ctr.getAllWords()
    # Prefix each POS tag with "_" to match the tag vocabulary used below.
    originalPoS = ["_"+PoS[i][1] for i in range(len(PoS))]
    newLine = ""
    i = 0
    for word, part in zip(editedLine.split(), originalPoS):
        newWord = ""
        if word == "_":
            # Map proper-noun tags to common-noun tags so the candidate
            # pool is not restricted to names.
            if part == "_NNP":
                part = "_NN"
            elif part == "_NNPS":
                part = "_NNS"
            # Candidates must match this slot's emphasis (meter) ...
            allwords = self.ctr.getWordsWithEmphasis(meter[i])
            # ... and be popular enough, not a proper name, and match the POS
            # tag. (Note: the comprehension variable shadows the loop's `word`.)
            tempWords = [word for word in allwords if self.robotBrain.get_popularity(word) > 175000 and word not in self.nameList.names and self.robotBrain.get_most_likely_POS_tag(word) == part]
            if len(tempWords) != 0:
                allwords = tempWords
            else:
                # No candidate survived every filter: report, fall back to a
                # random meter-matching word, and move on to the next slot.
                # NOTE(review): this append has no leading " " separator,
                # unlike every other branch -- confirm whether intentional.
                print("failed", meter[i], part)
                newLine = newLine + allwords[random.randint(0, len(allwords)-1)]
                i = i + 1
                continue
            # if we are at the last word and the current line is not the first rhyming line in the series. ie [0,0,2,2] index != 1 || index != 3
            if i == len(editedLine.split())-1 and rhymeScheme[currentLineNumber] != currentLineNumber:
                # we retrieve the first line in the current rhyme series.
                # ie [['hi', 'guys'],['burgers', 'fries'], ['spies', 'lies']], we would retrieve ['hi', 'guys']
                transformedLineBefore = transformedText[rhymeScheme[currentLineNumber]]
                # retrieve the last word from transformedLineBefore
                wordToRhyme = transformedLineBefore.split()[len(transformedLineBefore.split())-1]
                # retrieve all the rhyming words
                rhymes = self.rhymeDictionary.getRhymes(wordToRhyme)
                rhymes.append(wordToRhyme)
                if len(set(allwords) & set(rhymes)) > 0:
                    # combine the rhyming words with the words that have the proper meter
                    rhymes = list(set(allwords) & set(rhymes))
                    # randomly select a word from this concatenated list; may not necessarily rhyme
                    similarity = []
                    for word in rhymes:
                        similarity.append(self.get_cosine_similarity(word, newTheme))
                    similarity = np.asarray(similarity)
                    # Shift and normalise similarities into a probability vector.
                    if similarity.min() < 0:
                        similarity = similarity - similarity.min()
                    if sum(similarity) == 0:
                        similarity = np.ones(similarity.shape)
                    similarity = similarity / sum(similarity)
                    # Condition n-gram probabilities on the line built so far,
                    # or on the previous line when this line is still empty.
                    if newLine != "" or currentLineNumber == 0:
                        probabilities = NGrams.getNGramProbabilities(newLine, rhymes)
                    else:
                        probabilities = NGrams.getNGramProbabilities(transformedText[currentLineNumber-1], rhymes)
                    # Blend theme similarity (1 part) with n-gram fluency (4 parts).
                    similarity = (similarity + 4 * probabilities) / 5.
                    # Sample, rejecting picks with below-uniform probability.
                    index = np.argmax(np.random.multinomial(1, similarity))
                    while similarity[index] < 1./len(similarity):
                        index = np.argmax(np.random.multinomial(1, similarity))
                    newWord = rhymes[index]
                # NOTE(review): if no meter candidate rhymes, newWord stays ""
                # and an empty token is appended below -- confirm intended.
            elif i == len(editedLine.split()) - 1:
                # Last word of a line that OPENS its rhyme series: pick any
                # candidate the rhyme dictionary knows, weighted by n-gram fit.
                newWord = ""
                possibleWords = [w for w in allwords if self.rhymeDictionary.wordList.has_key(w)]
                probabilities = np.asarray(NGrams.getNGramProbabilities(newLine, possibleWords))
                index = np.argmax(np.random.multinomial(1, probabilities))
                while probabilities[index] < 1./len(probabilities):
                    index = np.argmax(np.random.multinomial(1, probabilities))
                newWord = possibleWords[index]
            else:
                # Interior slot: blend theme similarity with n-gram fit over
                # every meter/POS candidate, then sample as above.
                similarity = []
                for word in allwords:
                    similarity.append(self.get_cosine_similarity(word, newTheme))
                similarity = np.asarray(similarity)
                if similarity.min() < 0:
                    similarity = similarity - similarity.min()
                if sum(similarity) == 0:
                    similarity = np.ones(similarity.shape)
                similarity = similarity / sum(similarity)
                if newLine != "" or currentLineNumber == 0:
                    probabilities = NGrams.getNGramProbabilities(newLine, allwords)
                else:
                    probabilities = NGrams.getNGramProbabilities(transformedText[currentLineNumber-1], allwords)
                similarity = (similarity + 4 * probabilities) / 5.
                index = np.argmax(np.random.multinomial(1, similarity))
                while similarity[index] < 1./len(similarity):
                    index = np.argmax(np.random.multinomial(1, similarity))
                newWord = allwords[index]
            newLine = newLine + " " + newWord
        else:
            # Word was not blanked out -- keep it verbatim.
            newLine = newLine + " " + word
        i = i + 1
    return newLine.strip()