def rank_relevant_docs_w2v(self, w2v_model, query_as_list, relevant_docs):
    # relevant_docs[0] is the posting dictionary of all query terms;
    # relevant_docs[1] maps each term to the number of times it appears in the query.
    relevant_doc = relevant_docs[0]
    qvector = self.get_embedding_w2v(w2v_model, query_as_list)
    tweet_id_data = {}
    tweet_id_CosSim = {}
    # collect every tweet that contains at least one query term
    for value in relevant_doc.values():
        for v in value.keys():
            if v not in tweet_id_data:
                tweet_id_data[v] = self.docs_dic[v]
                tweet_id_CosSim[v] = 0
    norm_vec_q = norm(qvector)
    # cosine similarity between the query vector and each tweet vector
    for key in tweet_id_data:
        vec = self.get_embedding_w2v(w2v_model, self.tweet_dic[key])
        tweet_id_CosSim[key] = np.dot(qvector, vec) / (norm_vec_q * norm(vec))
    # sort tweet ids by descending similarity
    res1 = dict(sorted(tweet_id_CosSim.items(), key=lambda e: e[1], reverse=True))
    return list(res1.keys())
def Recognition1(img, m, A, Eigenfaces):
    """Project a test image into the eigenface space and return the file name
    of the closest training image."""
    Train_Number = Eigenfaces.shape[1]
    # project every mean-centred training image (columns of A) onto the eigenfaces
    ProjectedImages = np.empty((Eigenfaces.shape[1], 0))
    for i in range(Train_Number):
        temp = np.dot(np.transpose(Eigenfaces), A[:, i])
        ProjectedImages = np.c_[ProjectedImages, temp]
    # use a single colour channel of the test image and flatten it column-wise,
    # matching the layout used when the training database was built
    temp = img[:, :, 1]
    row, col = temp.shape
    InImage = np.transpose(temp).reshape(row * col, 1)
    # centre the test image on the mean face
    Difference = np.double(InImage) - m
    # project the centred test image onto the eigenfaces
    ProjectedTestImages = np.dot(np.transpose(Eigenfaces), Difference).ravel()
    # squared Euclidean distance to every projected training image
    Euc_dist = []
    for i in range(Train_Number):
        q = ProjectedImages[:, i]
        Euc_dist.append(norm(ProjectedTestImages - q) ** 2)
    Recognized_index = Euc_dist.index(min(Euc_dist))
    OutputName = str(Recognized_index) + '.jpg'
    return OutputName


# Parameters and code below are only needed for standalone module testing
# img = cv2.imread("C:/Users/78111/Desktop/TestDatabase/9.jpg")
# T = CreateDatabase.CreateDatabase()
# m, A, Eigenfaces = EigenfaceCore(T)
# Recognition1(img, m, A, Eigenfaces)
def __init__(self, files, language='french'):
    self.parser = Parser(language=language,
                         default_remove_stopwords=True,
                         default_stem=True)
    self.files = {}
    self.vectors = {}
    # retrieve each file's contents and tokenize them
    ParsedFile = namedtuple('ParsedFile', 'title content original_content uniq_words')
    for file in files:
        title, original_content = parse_course(file)
        content = self.parser.tokenize(original_content) + self.parser.tokenize(title)
        self.files[basename(file)[:-4]] = ParsedFile(title, content, original_content, set(content))

    # set of all unique words, already reduced by stemming and stopword removal
    word_list = set(word for file in self.files.values() for word in file.uniq_words)
    number_of_documents = len(files)

    # compute the idf of every word in the collection; this can take a while
    self.words_index = {
        word: (index, number_of_documents / self.__count_docs(word))
        for (index, word) in enumerate(word_list)
    }

    for acronym, file in self.files.items():
        vector = [0] * len(self.words_index)
        for word in file.content:
            # adding the idf once per occurrence accumulates tf * idf
            vector[self.words_index[word][0]] += self.words_index[word][1]
        self.vectors[acronym] = vector

    self.norms = Keydefaultdict(lambda acronym: norm(self.vectors[acronym]))
    self.cosines = Keydefaultdict(lambda acr_a: Keydefaultdict(
        lambda acr_b: self.__cosine(self.vectors[acr_a], self.vectors[acr_b],
                                    self.norms[acr_a], self.norms[acr_b])))
def fromLoop(cls, loop):
    """Returns a Model representing the loop"""
    # get the necessary vectors
    offset_v = [loop.r_anchor[0].__dict__[c] - loop.l_anchor[0].__dict__[c] for c in 'xyz']
    sse0_v = Model.__get_sse_vector(loop.l_anchor, loop.atoms[0])
    sse1_v = Model.__get_sse_vector(loop.r_anchor, loop.atoms[-1])

    sFrame = TransformFrame.createFromVectors(loop.l_anchor[0],
                                              transform.Vec.from_array(offset_v),
                                              transform.Vec.from_array(sse0_v))

    # theta and phi are the angles between each SSE and the anchor-to-anchor vector
    theta = arccos(dot(sse0_v, negative(offset_v)) / (norm(sse0_v) * norm(offset_v)))
    phi = arccos(dot(sse1_v, offset_v) / (norm(sse1_v) * norm(offset_v)))

    # length of the anchor-to-anchor vector
    anchor_dist = norm(offset_v)

    return Model([loop],
                 [Vec.from_array(sFrame.transformInto(atom)) for atom in loop.atoms],
                 theta, phi, anchor_dist,
                 [loop.l_type, loop.r_type],
                 Model.__gen_seq([loop.seq]), 1)
def similarity(a, b):
    # do the two cars share the same edge?
    if int(a[1]) == int(b[1]):
        # if so, return the distance between them
        # (indices 2:4 address the position columns of the dataframe)
        return norm(a[2:4] - b[2:4])
    else:
        # a very large value that keeps cars on different edges
        # from ending up in the same cluster
        return 1000000
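
# A minimal usage sketch of similarity(), assuming `norm` is numpy.linalg.norm and
# that each row is a numpy array laid out as [car_id, edge_id, x, y]; the exact
# column meaning is an assumption here, the original only documents that index 1
# is the edge and indices 2:4 are the coordinates.
import numpy as np

car_a = np.array([1.0, 7.0, 0.0, 0.0])   # car 1 on edge 7 at (0, 0)
car_b = np.array([2.0, 7.0, 3.0, 4.0])   # car 2 on the same edge at (3, 4)
car_c = np.array([3.0, 9.0, 3.0, 4.0])   # car 3 on a different edge
print(similarity(car_a, car_b))  # -> 5.0, the Euclidean distance
print(similarity(car_a, car_c))  # -> 1000000, different edges never cluster together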
def rank_relevant_docs(relevant_docs, norm_query, k=None):
    """
    This function provides a rank for each relevant document and sorts them by their scores.
    The score is the cosine similarity between the document's weight vector and the query vector.
    :param relevant_docs: dictionary of documents that contain at least one term from the query,
                          mapped to their weight vectors.
    :param norm_query: weight vector of the query.
    :param k: number of most relevant docs to return, defaults to everything.
    :return: sorted list of documents by score
    """
    ranked_results = {}
    for doc in relevant_docs.keys():
        # cosine similarity = dot(d, q) / (|d| * |q|)
        sum_wij_wiq = dot(relevant_docs[doc], norm_query)
        cos_similarity = sum_wij_wiq / (norm(relevant_docs[doc]) * norm(norm_query))
        ranked_results[doc] = cos_similarity
    ranked_results = sorted(ranked_results.items(), key=lambda item: item[1], reverse=True)
    if k is not None:
        ranked_results = ranked_results[:k]
    return [d[0] for d in ranked_results]
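
# A minimal usage sketch for the rank_relevant_docs(relevant_docs, norm_query, k)
# variant above, assuming `dot` and `norm` come from numpy and that each document
# is already represented by a weight vector in the same space as the query.
# The doc ids and vectors below are made up purely for illustration.
import numpy as np

example_docs = {
    'doc_1': np.array([0.9, 0.1, 0.0]),
    'doc_2': np.array([0.1, 0.8, 0.3]),
    'doc_3': np.array([0.5, 0.5, 0.5]),
}
example_query = np.array([1.0, 0.0, 0.0])
print(rank_relevant_docs(example_docs, example_query, k=2))
# -> ['doc_1', 'doc_3'], since doc_1 is most aligned with the query vector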
def first_pc(a):
    """Return the first principal component of `a` via power iteration."""
    epsilon = 1e-10
    a = np.array(a, dtype=float)
    cov = a.dot(a.T)
    n = cov.shape[0]
    current_v = random_unit_vector(n)
    # power iteration: repeatedly multiply by the covariance matrix and renormalize
    while True:
        last_v = current_v
        current_v = np.dot(cov, last_v)
        current_v = current_v / norm(current_v)
        # stop once the direction no longer changes
        if abs(np.dot(current_v, last_v)) > 1 - epsilon:
            return current_v
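
# A small sanity check for first_pc(), assuming `random_unit_vector(n)` (used by
# the function above) returns a unit-length numpy vector and `norm` is
# numpy.linalg.norm. Rows of `a` are treated as features, so the returned vector
# has one entry per row; for data that varies almost entirely along one feature,
# the first principal component should point (up to sign) along that feature.
import numpy as np

a = np.array([[10.0, -9.5, 10.5, -10.0],   # high-variance feature
              [ 0.1,  0.1, -0.1,  0.05]])  # low-variance feature
print(first_pc(a))  # expected to be close to [±1, 0]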
def extract_transformation(homography):
    """
    Take a 3x3 homography representing a truncated rigid transformation (where *z* is
    assumed to be zero, so the third column of the rotation matrix is removed) and
    extract a `RigidTransformation` from it.
    """
    # It's easier to deal with columns
    homography = homography.T
    # Normalize to remove any scaling factor
    homography /= norm(homography[0])
    trans = np.empty((4, 3))
    trans[:2] = homography[:2]
    trans[2] = np.cross(homography[0], homography[1])
    trans[3] = homography[2]
    trans = trans.T
    # Constrain the first three columns to be orthogonal.
    # NOTE: The decomposition used is extremely unstable. For now, just skip this step
    # and hope for the best.
    # trans[:, 0:3] = nearest_rotation_matrix(trans[:, 0:3])
    return RigidTransformation(matrix=trans)
def rank_relevant_docs(relevant_doc, query, indexer, model_vector=None, k=None):
    """
    This function provides a rank for each relevant document and sorts them by their scores.
    The score is the cosine similarity between the query and the document, where both are
    represented as tf-idf-weighted sums of word2vec term vectors.
    :param relevant_doc: dictionary of relevant documents. key = doc_id,
                         value = (number of query terms appearing in the doc,
                         [(term, number of appearances in the doc)]).
    :param query: dictionary mapping each query term to its frequency in the query.
    :param indexer: indexer holding the inverted index and per-document statistics.
    :param model_vector: trained word2vec model used to embed terms.
    :param k: number of most relevant docs to return, defaults to everything.
    :return: sorted list of documents by score
    """
    # inverted_idx: keys = terms, values = [line number in posting file,
    #               number of documents the term appears in, total appearances in corpus]
    inverted_index = indexer.inverted_idx
    # documents_data: keys = document id, values = [max freq term,
    #                 number of different words, number of words, ...]
    documents_data = indexer.documents_data

    dict_doc = {}           # key = doc_id, value = sigma(tf * idf * term_vector)
    idf_dict = {}           # key = term, value = idf
    dict_vector_model = {}  # key = term, value = vector representation from the model
    vector_query = numpy.zeros(300)

    # build the query vector as a tf-idf-weighted sum of term vectors
    for term in query:
        if term in model_vector.vocab:
            term_vector = model_vector.wv.get_vector(term)
        else:  # term does not exist in the model
            term_vector = numpy.zeros(300)
        dict_vector_model[term] = term_vector  # keep the vector of the term
        tf_q = query[term]
        df_term = inverted_index[term][1]
        total_docs = len(documents_data.keys())
        idf_term = math.log((total_docs / df_term), 2)
        idf_dict[term] = idf_term
        vector_query += tf_q * idf_term * term_vector

    # build a tf-idf-weighted vector for every relevant document
    for doc_id in relevant_doc.keys():
        for term in relevant_doc[doc_id][1]:  # [(term, number of appearances in doc)]
            term_name = term[0]
            tf_term = int(term[1]) / documents_data[doc_id][2]
            idf_term = float(idf_dict[term_name])
            tf_idf_vector = tf_term * idf_term * dict_vector_model[term_name]
            if doc_id not in dict_doc:
                dict_doc[doc_id] = numpy.zeros(300)
            dict_doc[doc_id] += tf_idf_vector

    # score every document by cosine similarity against the query vector
    rank_cosine = []
    for doc_id, doc_as_vec in dict_doc.items():
        if numpy.all(doc_as_vec == 0) or numpy.all(vector_query == 0):
            # a zero vector gets a zero score
            rank_cosine.append((0, doc_id, relevant_doc[doc_id][0]))
        else:
            dot = numpy.dot(vector_query, doc_as_vec)
            doc_weight = math.sqrt(float(documents_data[doc_id][3]))
            normalize = norm(vector_query) * norm(doc_as_vec)
            cosine_sim = dot / (normalize * doc_weight)
            if cosine_sim < 0.33:
                continue
            # (cosine similarity, doc_id, number of query terms shared with the doc)
            rank_cosine.append((cosine_sim, doc_id, relevant_doc[doc_id][0]))

    rank_list_sorted = sorted(rank_cosine, reverse=True)
    if k is not None:
        rank_list_sorted = rank_list_sorted[:k]
    return [d[1] for d in rank_list_sorted]
def inspect_infeasible(f, h, x, ν, k):
    return np.concatenate([
        [norm(ConstrainedNewtonInfeasible.r_primal(f, h, x, ν)),
         norm(ConstrainedNewtonInfeasible.r_dual(h, x))],
        [norm(ConstrainedNewtonInfeasible.r(f, h)(x, ν))],
        x.T[0],
    ])
def inspect_2d_infeasible(f, h, x, ν, k):
    return np.concatenate([
        [f(x)],
        *x,
        [norm(ConstrainedNewtonInfeasible.r_primal(f, h, x, ν)),
         norm(ConstrainedNewtonInfeasible.r_dual(h, x))]
    ])
def cos_similarity(vec1, vec2):
    return dot(vec2, vec1) / (norm(vec2) * norm(vec1))
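
# A minimal usage sketch of cos_similarity(), assuming `dot` and `norm` come from
# numpy (e.g. `from numpy import dot` and `from numpy.linalg import norm`).
# Parallel vectors score 1.0 and orthogonal vectors score 0.0.
import numpy as np

v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([2.0, 4.0, 6.0])   # same direction as v1
v3 = np.array([-2.0, 1.0, 0.0])  # orthogonal to v1
print(cos_similarity(v1, v2))    # -> 1.0 (up to floating-point rounding)
print(cos_similarity(v1, v3))    # -> 0.0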