def signature_similarity(self, paths, signature1, signature2):
    """Pairwise-compare the stack traces of two crash signatures.

    Returns a list of (trace1, trace2, wmd) tuples sorted by ascending
    Word Mover's Distance, skipping identical or already-seen pairs.
    """
    model = self._model
    # L2-normalize the word vectors in place so dot products below are cosine similarities.
    model.init_sims(replace=True)

    traces1 = StackTracesGetter.get_stack_traces_for_signature(paths, signature1)
    traces2 = StackTracesGetter.get_stack_traces_for_signature(paths, signature2)

    similarities = []
    already_processed = set()

    for doc1 in traces1:
        # Unique, sorted in-vocabulary tokens of the first trace.
        words1 = np.unique([
            word
            for word in StackTraceProcessor.preprocess(doc1)
            if word in model.wv.vocab
        ]).tolist()

        # Cosine distance from every vocabulary word to each token of doc1.
        if self.get_model_name() == 'Word2Vec':
            matrix = model.wv.vectors_norm
        else:
            matrix = model.wv.vectors
        rows = [model.wv.vocab[word].index for word in words1]
        distances = np.array(
            1.0 - np.dot(matrix, matrix[rows].transpose()),
            dtype=np.double)

        for doc2 in traces2:
            words2 = [
                word
                for word in StackTraceProcessor.preprocess(doc2)
                if word in model.wv.vocab
            ]

            # NOTE(review): words1 is deduplicated/sorted while words2 is not,
            # so the equality short-circuit rarely fires for identical traces —
            # preserved as-is; confirm whether that is intended.
            pair_key = frozenset([tuple(words1), tuple(words2)])
            if words1 == words2 or pair_key in already_processed:
                continue
            already_processed.add(pair_key)

            similarities.append(
                (doc1, doc2, self.wmdistance(words1, words2, distances)))

    return sorted(similarities, key=lambda v: v[2])
def test_preprocess(self):
    """preprocess lowercases frames, truncates @0x offsets, and honors the frame-limit argument."""
    stack_trace = 'js::GCM::pMSt | js::GCM::d | js::gc::GCR::w | JS::SIGC | CoreF@0xa74b0 | HTlb@0x312ab | AppKit@0x476cc'
    expected = [
        'js::gcm::pmst',
        'js::gcm::d',
        'js::gc::gcr::w',
        'js::sigc',
        'coref@0x',
        'htlb@0x',
        'appkit@0x',
    ]

    # Whole trace.
    self.assertEqual(StackTraceProcessor.preprocess(stack_trace), expected)

    # Limited to the first three frames.
    self.assertEqual(StackTraceProcessor.preprocess(stack_trace, 3), expected[:3])
def test_clean_func(self):
    """Single frames are trimmed, lowercased, and stripped of hex offsets after '@0x'."""
    cases = [
        ('js::jit::MakeMRegExpHoistable ', 'js::jit::makemregexphoistable'),
        (' AppKit@0x7be82f ', 'appkit@0x'),
        (' __RtlUserThreadStart ', '__rtluserthreadstart'),
        (' xul.dll@0x1ade7cf ', 'xul.dll@0x'),
        ('XUL@0x7bd20f', 'xul@0x'),
        ('libxul.so@0xe477b4 ', 'libxul.so@0x'),
    ]
    for raw, cleaned in cases:
        self.assertEqual(StackTraceProcessor.preprocess(raw), [cleaned])
def top_similar_traces(self, stack_trace, top=10):
    """Return the `top` corpus traces most similar to `stack_trace`.

    Two-phase search: a cheap Relaxed Word Mover's Distance (RWMD) lower
    bound ranks every corpus document, then exact WMD is computed in that
    order until the lower bounds can no longer beat the confirmed top-k.
    Returns a list of (doc_id, wmd) pairs sorted by ascending distance.
    """
    model = self._model
    # Normalize word vectors in place so the dot products below are cosine similarities.
    model.init_sims(replace=True)

    words_to_test = StackTraceProcessor.preprocess(stack_trace)
    # Unique, in-vocabulary tokens of the query trace.
    words_to_test_clean = [w for w in np.unique(words_to_test).tolist() if w in model.wv.vocab]

    # TODO: Test if a first sorting with the average vectors is useful.
    '''
    inferred_vector = model.infer_vector(words_to_test)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    '''

    # Cos-similarity: distance from every vocabulary word to each query token
    # (Word2Vec keeps its normalized vectors in `vectors_norm`; other models in `vectors`).
    if self.get_model_name() == 'Word2Vec':
        all_distances = np.array(1.0 - np.dot(model.wv.vectors_norm, model.wv.vectors_norm[[model.wv.vocab[word].index for word in words_to_test_clean]].transpose()), dtype=np.double)
    else:
        all_distances = np.array(1.0 - np.dot(model.wv.vectors, model.wv.vectors[[model.wv.vocab[word].index for word in words_to_test_clean]].transpose()), dtype=np.double)

    # Relaxed Word Mover's Distance for selecting
    # (max of the two one-sided nearest-neighbor sums is a valid WMD lower bound).
    t = time.time()
    distances = []
    for doc_id in range(0, len(self._corpus)):
        doc_words = self._extract_indices_from_model(doc_id)
        if len(doc_words) != 0:
            word_dists = all_distances[doc_words]
            rwmd = max(np.sum(np.min(word_dists, axis=0)), np.sum(np.min(word_dists, axis=1)))
        else:
            # Empty documents can never match: push them to the end of the ranking.
            rwmd = float('inf')
        distances.append((doc_id, rwmd))
    # Process candidates cheapest-lower-bound first so we can stop early.
    distances.sort(key=lambda v: v[1])
    logging.info('First part done in ' + str(time.time() - t) + ' s.')

    t = time.time()
    # Exact WMDs confirmed so far, kept sorted ascending; ids kept in lockstep.
    confirmed_distances_ids = []
    confirmed_distances = []
    for i, (doc_id, rwmd_distance) in enumerate(distances):
        # Stop once we have 'top' confirmed distances and all the rwmd lower bounds are higher than the smallest top confirmed distance.
        if len(confirmed_distances) >= top and rwmd_distance > confirmed_distances[top - 1]:
            logging.debug('stopping at ' + str(i))
            logging.debug(top)
            break
        doc_words_clean = self._extract_words_from_model(doc_id)
        wmd = self.wmdistance(words_to_test_clean, doc_words_clean, all_distances)
        # Sorted insert keeps confirmed_distances ordered for the pruning test above.
        j = bisect.bisect(confirmed_distances, wmd)
        confirmed_distances.insert(j, wmd)
        confirmed_distances_ids.insert(j, doc_id)

    similarities = zip(confirmed_distances_ids, confirmed_distances)
    logging.info('Query done in ' + str(time.time() - t) + ' s.')
    return sorted(similarities, key=lambda v: v[1])[:top]
def wmdistance_cosine_non_zero_distance(self, model, trained_model):
    """Two different real-world traces must produce a finite WMD under cosine distance."""
    doc1 = "KiFastSystemCallRet | NtWaitForMultipleObjects | WaitForMultipleObjectsEx | RealMsgWaitForMultipleObjectsEx | CCliModalLoop::BlockFn | CoWaitForMultipleHandles | mozilla::ipc::MessageChannel::WaitForSyncNotifyWithA11yReentry | mozilla::ipc::MessageChannel::WaitForSyncNotify | mozilla::ipc::MessageChannel::Send | mozilla::dom::PScreenManagerChild::SendScreenRefresh | mozilla::widget::ScreenProxy::EnsureCacheIsValid | mozilla::widget::ScreenProxy::GetColorDepth | gfxPlatform::PopulateScreenInfo | gfxPlatform::Init | mozilla::dom::ContentProcess::Init | XRE_InitChildProcess | content_process_main | wmain | remainder | remainder | WinSqmStartSession | _SEH_epilog4 | WinSqmStartSession | _RtlUserThreadStart"
    doc2 = "Assertion::~Assertion | Assertion::Destroy | InMemoryDataSource::DeleteForwardArcsEntry | PL_DHashTableEnumerate | InMemoryDataSource::~InMemoryDataSource | InMemoryDataSource::`vector deleting destructor' | InMemoryDataSource::Internal::Release | InMemoryDataSource::Release | nsCOMPtr_base::~nsCOMPtr_base | RDFXMLDataSourceImpl::`vector deleting destructor' | RDFXMLDataSourceImpl::Release | DoDeferredRelease<T> | XPCJSRuntime::GCCallback | Collect | js::GC | js::GCForReason | nsXPConnect::Collect | nsCycleCollector::GCIfNeeded | nsCycleCollector::Collect | nsCycleCollector::Shutdown | nsCycleCollector_shutdown | mozilla::ShutdownXPCOM | ScopedXPCOMStartup::~ScopedXPCOMStartup | XREMain::XRE_main | XRE_main | wmain | __tmainCRTStartup | BaseThreadInitThunk | __RtlUserThreadStart | _RtlUserThreadStart"

    # Unique, in-vocabulary tokens for each trace.
    clean1 = [
        w for w in np.unique(StackTraceProcessor.preprocess(doc1)).tolist()
        if w in trained_model.wv.vocab
    ]
    clean2 = [
        w for w in np.unique(StackTraceProcessor.preprocess(doc2)).tolist()
        if w in trained_model.wv.vocab
    ]

    # Cosine-distance matrix from the whole vocabulary to the tokens of doc1.
    if model.get_model_name() == 'Word2Vec':
        matrix = trained_model.wv.vectors_norm
    else:
        matrix = trained_model.wv.vectors
    rows = [trained_model.wv.vocab[word].index for word in clean1]
    all_distances = np.array(
        1.0 - np.dot(matrix, matrix[rows].transpose()),
        dtype=np.double)

    distance = model.wmdistance(clean1, clean2, all_distances)
    self.assertNotEqual(float('inf'), distance)
def wmdistance_euclidean_zero_distance(self, model, trained_model):
    """Identical one-token traces must produce an infinite WMD under the euclidean metric."""
    doc1 = "A | A | A"
    doc2 = "A | A | A"

    # Unique, in-vocabulary tokens for each (identical) trace.
    clean1 = [
        w for w in np.unique(StackTraceProcessor.preprocess(doc1)).tolist()
        if w in trained_model.wv.vocab
    ]
    clean2 = [
        w for w in np.unique(StackTraceProcessor.preprocess(doc2)).tolist()
        if w in trained_model.wv.vocab
    ]

    # Cosine-distance matrix from the whole vocabulary to the tokens of doc1.
    if model.get_model_name() == 'Word2Vec':
        matrix = trained_model.wv.vectors_norm
    else:
        matrix = trained_model.wv.vectors
    rows = [trained_model.wv.vocab[word].index for word in clean1]
    all_distances = np.array(
        1.0 - np.dot(matrix, matrix[rows].transpose()),
        dtype=np.double)

    distance = model.wmdistance(clean1, clean2, all_distances,
                                distance_metric='euclidean')
    self.assertEqual(float('inf'), distance)
def _read_traces(self):
    """Read the backing files and preprocess their traces (second arg 10 is
    passed straight to process — presumably a frame/token limit; confirm)."""
    raw = utils.read_files(self._fnames)
    return StackTraceProcessor.process(raw, 10)
def build_from_raw_traces(traces, file_name=None):
    """Preprocess raw traces and build a TracesCache from them."""
    processed = list(StackTraceProcessor.process(traces))
    return TracesCache.build(processed, file_name)
def test_process(self):
    """process() maps the raw fixture traces to the expected token lists."""
    self.assertEqual(
        list(StackTraceProcessor.process(self.raw_traces)),
        self.expected_traces)