Exemple #1
0
    def signature_similarity(self, paths, signature1, signature2):
        """Score all cross pairs of stack traces from two crash signatures by WMD.

        Args:
            paths: locations passed through to StackTracesGetter to load traces.
            signature1: first crash signature whose traces are compared.
            signature2: second crash signature whose traces are compared.

        Returns:
            List of (trace1, trace2, distance) tuples sorted by ascending
            Word Mover's Distance, skipping identical word lists and
            unordered pairs that were already scored.
        """
        model = self._model
        # Normalize the word vectors in place so the dot products below
        # are cosine similarities.
        model.init_sims(replace=True)
        traces1 = StackTracesGetter.get_stack_traces_for_signature(paths, signature1)
        traces2 = StackTracesGetter.get_stack_traces_for_signature(paths, signature2)

        similarities = []
        already_processed = set()

        for doc1 in traces1:
            # Unique, vocabulary-filtered tokens of the first trace.
            words1 = np.unique([word for word in StackTraceProcessor.preprocess(doc1) if word in model.wv.vocab]).tolist()

            # Word2Vec keeps its L2-normalized vectors in vectors_norm; other
            # models' vectors were normalized in place by init_sims above.
            # Select the matrix once instead of duplicating the whole
            # distance expression in both branches.
            if self.get_model_name() == 'Word2Vec':
                vectors = model.wv.vectors_norm
            else:
                vectors = model.wv.vectors
            # distances[i, j] = cosine distance between vocab word i and words1[j].
            distances = np.array(1.0 - np.dot(vectors, vectors[
                [model.wv.vocab[word].index for word in words1]].transpose()), dtype=np.double)

            # Loop-invariant: hoisted out of the inner loop (was rebuilt twice
            # per iteration inside the frozenset keys).
            words1_key = tuple(words1)

            for doc2 in traces2:
                words2 = [word for word in StackTraceProcessor.preprocess(doc2) if word in model.wv.vocab]

                # Build the unordered pair key once; skip identical word lists
                # and pairs that were already scored (in either order).
                pair_key = frozenset([words1_key, tuple(words2)])
                if words1 == words2 or pair_key in already_processed:
                    continue
                already_processed.add(pair_key)

                similarities.append((doc1, doc2, self.wmdistance(words1, words2, distances)))

        return sorted(similarities, key=lambda v: v[2])
Exemple #2
0
 def test_preprocess(self):
     """Preprocessing lowercases frames, trims hex offsets, and honors the frame limit."""
     stack_trace = 'js::GCM::pMSt | js::GCM::d | js::gc::GCR::w | JS::SIGC | CoreF@0xa74b0 | HTlb@0x312ab | AppKit@0x476cc'
     expected = ['js::gcm::pmst', 'js::gcm::d', 'js::gc::gcr::w', 'js::sigc', 'coref@0x', 'htlb@0x', 'appkit@0x']
     self.assertEqual(StackTraceProcessor.preprocess(stack_trace), expected)
     # The optional second argument caps how many frames are kept.
     self.assertEqual(StackTraceProcessor.preprocess(stack_trace, 3), expected[:3])
Exemple #3
0
 def test_clean_func(self):
     """Each raw frame string is normalized to exactly its cleaned form."""
     cases = {
         'js::jit::MakeMRegExpHoistable ': 'js::jit::makemregexphoistable',
         ' AppKit@0x7be82f ': 'appkit@0x',
         ' __RtlUserThreadStart ': '__rtluserthreadstart',
         ' xul.dll@0x1ade7cf ': 'xul.dll@0x',
         'XUL@0x7bd20f': 'xul@0x',
         'libxul.so@0xe477b4 ': 'libxul.so@0x',
     }
     for raw, cleaned in cases.items():
         self.assertEqual(StackTraceProcessor.preprocess(raw), [cleaned])
Exemple #4
0
    def top_similar_traces(self, stack_trace, top=10):
        """Return the `top` corpus documents closest to `stack_trace` by Word Mover's Distance.

        Uses the Relaxed WMD (RWMD) as a cheap lower bound to sort and prune
        candidates before computing the exact (more expensive) WMD on each.

        Args:
            stack_trace: raw stack trace string to query with.
            top: maximum number of most similar documents to return.

        Returns:
            List of up to `top` (doc_id, wmd_distance) tuples, ascending by distance.
        """
        model = self._model
        # Normalize the word vectors in place so the dot products below are cosines.
        model.init_sims(replace=True)

        words_to_test = StackTraceProcessor.preprocess(stack_trace)
        # Unique query tokens restricted to the model vocabulary.
        words_to_test_clean = [w for w in np.unique(words_to_test).tolist() if w in model.wv.vocab]

        # TODO: Test if a first sorting with the average vectors is useful.
        '''
        inferred_vector = model.infer_vector(words_to_test)
        sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
        '''

        # Cos-similarity
        # all_distances[i, j] = cosine distance between vocab word i and query word j.
        # Word2Vec exposes its normalized vectors as vectors_norm; other models'
        # vectors were normalized in place by init_sims above.
        if self.get_model_name() == 'Word2Vec':
            all_distances = np.array(1.0 - np.dot(model.wv.vectors_norm, model.wv.vectors_norm[
                [model.wv.vocab[word].index for word in words_to_test_clean]].transpose()), dtype=np.double)
        else:
            all_distances = np.array(1.0 - np.dot(model.wv.vectors, model.wv.vectors[
                [model.wv.vocab[word].index for word in words_to_test_clean]].transpose()), dtype=np.double)

        # Relaxed Word Mover's Distance for selecting
        t = time.time()
        distances = []
        for doc_id in range(0, len(self._corpus)):
            doc_words = self._extract_indices_from_model(doc_id)
            if len(doc_words) != 0:
                word_dists = all_distances[doc_words]
                # RWMD: the larger of the two one-sided nearest-neighbor sums
                # is the tighter lower bound on the exact WMD.
                rwmd = max(np.sum(np.min(word_dists, axis=0)), np.sum(np.min(word_dists, axis=1)))
            else:
                # Documents with no in-vocabulary words can never match.
                rwmd = float('inf')
            distances.append((doc_id, rwmd))

        # Process candidates in order of increasing lower bound.
        distances.sort(key=lambda v: v[1])
        logging.info('First part done in ' + str(time.time() - t) + ' s.')

        t = time.time()
        confirmed_distances_ids = []
        confirmed_distances = []

        for i, (doc_id, rwmd_distance) in enumerate(distances):
            # Stop once we have 'top' confirmed distances and all the rwmd lower bounds are higher than the smallest top confirmed distance.
            if len(confirmed_distances) >= top and rwmd_distance > confirmed_distances[top - 1]:
                logging.debug('stopping at ' + str(i))
                logging.debug(top)
                break

            doc_words_clean = self._extract_words_from_model(doc_id)
            wmd = self.wmdistance(words_to_test_clean, doc_words_clean, all_distances)

            # Insert in sorted position so confirmed_distances[top - 1] is
            # always the current cutoff used by the stopping test above.
            j = bisect.bisect(confirmed_distances, wmd)
            confirmed_distances.insert(j, wmd)
            confirmed_distances_ids.insert(j, doc_id)

        similarities = zip(confirmed_distances_ids, confirmed_distances)

        logging.info('Query done in ' + str(time.time() - t) + ' s.')

        return sorted(similarities, key=lambda v: v[1])[:top]
Exemple #5
0
    def wmdistance_cosine_non_zero_distance(self, model, trained_model):
        """Two different real-world traces must yield a finite cosine WMD."""
        doc1 = "KiFastSystemCallRet | NtWaitForMultipleObjects | WaitForMultipleObjectsEx | RealMsgWaitForMultipleObjectsEx | CCliModalLoop::BlockFn | CoWaitForMultipleHandles | mozilla::ipc::MessageChannel::WaitForSyncNotifyWithA11yReentry | mozilla::ipc::MessageChannel::WaitForSyncNotify | mozilla::ipc::MessageChannel::Send | mozilla::dom::PScreenManagerChild::SendScreenRefresh | mozilla::widget::ScreenProxy::EnsureCacheIsValid | mozilla::widget::ScreenProxy::GetColorDepth | gfxPlatform::PopulateScreenInfo | gfxPlatform::Init | mozilla::dom::ContentProcess::Init | XRE_InitChildProcess | content_process_main | wmain | remainder | remainder | WinSqmStartSession | _SEH_epilog4 | WinSqmStartSession | _RtlUserThreadStart"
        doc2 = "Assertion::~Assertion | Assertion::Destroy | InMemoryDataSource::DeleteForwardArcsEntry | PL_DHashTableEnumerate | InMemoryDataSource::~InMemoryDataSource | InMemoryDataSource::`vector deleting destructor' | InMemoryDataSource::Internal::Release | InMemoryDataSource::Release | nsCOMPtr_base::~nsCOMPtr_base | RDFXMLDataSourceImpl::`vector deleting destructor' | RDFXMLDataSourceImpl::Release | DoDeferredRelease<T> | XPCJSRuntime::GCCallback | Collect | js::GC | js::GCForReason | nsXPConnect::Collect | nsCycleCollector::GCIfNeeded | nsCycleCollector::Collect | nsCycleCollector::Shutdown | nsCycleCollector_shutdown | mozilla::ShutdownXPCOM | ScopedXPCOMStartup::~ScopedXPCOMStartup | XREMain::XRE_main | XRE_main | wmain | __tmainCRTStartup | BaseThreadInitThunk | __RtlUserThreadStart | _RtlUserThreadStart"

        def clean_words(doc):
            # Unique, vocabulary-filtered tokens of a preprocessed trace.
            return [w for w in np.unique(StackTraceProcessor.preprocess(doc)).tolist() if w in trained_model.wv.vocab]

        words_to_test_clean1 = clean_words(doc1)
        words_to_test_clean2 = clean_words(doc2)

        # Pick the vector matrix matching the model type, then build the
        # cosine-distance matrix against the first document's words.
        if model.get_model_name() == 'Word2Vec':
            vectors = trained_model.wv.vectors_norm
        else:
            vectors = trained_model.wv.vectors
        indices = [trained_model.wv.vocab[word].index for word in words_to_test_clean1]
        all_distances = np.array(1.0 - np.dot(vectors, vectors[indices].transpose()), dtype=np.double)

        distance = model.wmdistance(words_to_test_clean1, words_to_test_clean2, all_distances)
        self.assertNotEqual(float('inf'), distance)
Exemple #6
0
    def wmdistance_euclidean_zero_distance(self, model, trained_model):
        """Identical documents under the euclidean metric must report an infinite WMD."""
        doc1 = "A | A | A"
        doc2 = "A | A | A"

        def clean_words(doc):
            # Unique, vocabulary-filtered tokens of a preprocessed trace.
            return [w for w in np.unique(StackTraceProcessor.preprocess(doc)).tolist() if w in trained_model.wv.vocab]

        words_to_test_clean1 = clean_words(doc1)
        words_to_test_clean2 = clean_words(doc2)

        # Pick the vector matrix matching the model type, then build the
        # cosine-distance matrix against the first document's words.
        if model.get_model_name() == 'Word2Vec':
            vectors = trained_model.wv.vectors_norm
        else:
            vectors = trained_model.wv.vectors
        indices = [trained_model.wv.vocab[word].index for word in words_to_test_clean1]
        all_distances = np.array(1.0 - np.dot(vectors, vectors[indices].transpose()), dtype=np.double)

        distance = model.wmdistance(words_to_test_clean1, words_to_test_clean2, all_distances, distance_metric='euclidean')
        self.assertEqual(float('inf'), distance)
Exemple #7
0
 def _read_traces(self):
     """Load the configured files and process their stack traces (10-frame limit)."""
     raw_traces = utils.read_files(self._fnames)
     return StackTraceProcessor.process(raw_traces, 10)
Exemple #8
0
 def build_from_raw_traces(traces, file_name=None):
     """Build a TracesCache from unprocessed traces, optionally backed by file_name."""
     processed = list(StackTraceProcessor.process(traces))
     return TracesCache.build(processed, file_name)
Exemple #9
0
 def test_process(self):
     """process yields exactly the expected cleaned traces, in order."""
     self.assertEqual(list(StackTraceProcessor.process(self.raw_traces)), self.expected_traces)