def test_part_doc_vec_search(self, processed_files_path, percent):
    """Measure search accuracy when only *percent* of each input doc is kept.

    For every processed file, a subset of its words is sampled according to
    ``percent``, the truncated doc is fed to the vectorized search, and we
    count how often the top hit is the doc itself.

    Args:
        processed_files_path: path passed to ``vec_search.create_vectorizer``.
        percent: share of the document to keep; must satisfy 0 < percent <= 100.

    Returns:
        float: fraction of documents (0.0-1.0) whose truncated version still
        retrieves the original document as the best match.

    Raises:
        ValueError: if ``percent`` is outside (0, 100].
    """
    if not 0 < percent <= 100:
        raise ValueError("percent must be in (0, 100], got %r" % (percent,))
    vectorizer, corpus, processed_files = vec_search.create_vectorizer(
        processed_files_path)
    num_samples = len(processed_files)
    expected = np.arange(num_samples)
    result = np.zeros(num_samples, dtype=int)
    for i, doc in enumerate(processed_files):
        words = doc.split(' ')
        if percent == 100:
            # Keep the whole document; the original `100 // (100 - percent)`
            # raised ZeroDivisionError for this value.
            sampled = words
        elif percent >= 50:
            # Drop every `miss`-th word, keeping roughly `percent`% of them.
            miss = 100 // (100 - percent)
            sampled = [w for j, w in enumerate(words) if j % miss > 0]
        else:
            # Keep only every `miss`-th word.
            miss = 100 // percent
            sampled = [w for j, w in enumerate(words) if j % miss == 0]
        distances, key_words = vec_search.vec_search(
            vectorizer, corpus, ' '.join(sampled))
        result[i] = distances[0][1]
    score = np.sum(expected == result) / num_samples
    print("Testing vec model, score: ", score)
    return score
def test_vec_search(self, processed_files_path):
    """Search the corpus with each of its own docs and score exact self-hits.

    Every document is maximally similar to itself (its vector is collinear
    with itself), so a correct vectorized search should return the document's
    own index as the closest match; the returned score is the fraction of
    docs for which that holds.
    """
    vectorizer, corpus, processed_files = vec_search.create_vectorizer(processed_files_path)
    num_samples = len(processed_files)
    result = np.zeros(num_samples, dtype=int)
    for idx, doc in enumerate(processed_files):
        distances, key_words = vec_search.vec_search(vectorizer, corpus, doc)
        result[idx] = distances[0][1]
    expected = np.arange(num_samples)
    score = np.sum(expected == result) / num_samples
    print("Testing vec model, score: ", score)
    return score
def test_vec_search(self, processed_files_path):
    """Verify each corpus doc retrieves itself as the closest match.

    A document's vector is collinear with itself, so the nearest neighbour of
    any corpus document should be that very document; returns the fraction of
    documents for which this self-retrieval succeeds.
    """
    vectorizer, corpus, processed_files = vec_search.create_vectorizer(
        processed_files_path)
    num_samples = len(processed_files)
    expected = np.array(range(num_samples))
    # Index of the top search hit for every document, queried with itself.
    top_hits = [vec_search.vec_search(vectorizer, corpus, doc)[0][0][1]
                for doc in processed_files]
    result = np.array([0] * num_samples)
    for pos, hit in enumerate(top_hits):
        result[pos] = hit
    score = np.sum(expected == result) / num_samples
    print("Testing vec model, score: ", score)
    return score
def test_part_doc_vec_search(self, processed_files_path, percent):
    """Score vectorized search when each query doc is truncated to *percent*%.

    Each corpus document is reduced to roughly ``percent`` percent of its
    words and used as a query; a search counts as correct when the top
    result is the document itself.

    Args:
        processed_files_path: path forwarded to ``vec_search.create_vectorizer``.
        percent: portion of each doc to keep; must satisfy 0 < percent <= 100.

    Returns:
        float: fraction of correct self-retrievals, in [0.0, 1.0].

    Raises:
        ValueError: when ``percent`` is not in (0, 100].
    """
    if not 0 < percent <= 100:
        raise ValueError(f"percent must be in (0, 100], got {percent!r}")
    vectorizer, corpus, processed_files = vec_search.create_vectorizer(processed_files_path)
    num_samples = len(processed_files)
    expected = np.array(range(num_samples))
    result = np.array([0] * num_samples)
    for i, file in enumerate(processed_files):
        tokens = re.split(' ', file)
        if percent == 100:
            # Full document: the old `100 // (100 - percent)` raised
            # ZeroDivisionError for this value.
            kept = tokens
        elif percent >= 50:
            step = 100 // (100 - percent)
            # Discard every `step`-th token, keeping ~percent% of the doc.
            kept = [tok for j, tok in enumerate(tokens) if j % step > 0]
        else:
            step = 100 // percent
            # Retain only every `step`-th token.
            kept = [tok for j, tok in enumerate(tokens) if j % step == 0]
        distances, key_words = vec_search.vec_search(vectorizer, corpus, ' '.join(kept))
        result[i] = distances[0][1]
    score = np.sum(expected == result) / num_samples
    print("Testing vec model, score: ", score)
    return score