def calculate_rankings(sketches):
    n_genomes = len(sketches)
    jaccard_matrix = np.zeros((n_genomes, n_genomes))
    sd_matrix = np.zeros((n_genomes, n_genomes))
    forbes_matrix = np.zeros((n_genomes, n_genomes))
    print('Calculating pairwise similarities...')
    for i in range(n_genomes):
        for j in range(i, n_genomes):
            h1 = sketches[i]
            h2 = sketches[j]
            union = Similarity.union(h1, h2).cardinality()
            a_excl, b_excl, intersection = Similarity.getJointEstimators(h1, h2)
            a = h1.cardinality()
            b = h2.cardinality()
            jaccard = intersection / union
            sd = 2 * intersection / (a + b)
            forbes = (intersection * union) / (intersection * union + 1.5 * a_excl * b_excl)
            jaccard_matrix[i, j] = jaccard
            sd_matrix[i, j] = sd
            forbes_matrix[i, j] = forbes
    jaccard_rankings = GenomeRankings.rank_genomes(jaccard_matrix)
    sd_rankings = GenomeRankings.rank_genomes(sd_matrix)
    forbes_rankings = GenomeRankings.rank_genomes(forbes_matrix)
    return jaccard_rankings, forbes_rankings, sd_rankings
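The three measures computed above reduce to simple set formulas. A minimal sketch (not part of the original code; the helper name exact_similarities is hypothetical) computes them on exact Python sets, which can be useful for sanity-checking the HLL-estimator-based values:

# Hypothetical helper: the same three measures on exact sets rather than sketches.
def exact_similarities(A, B):
    inter = len(A & B)
    union = len(A | B)
    a_excl = len(A - B)          # elements only in A
    b_excl = len(B - A)          # elements only in B
    jaccard = inter / union
    sorensen_dice = 2 * inter / (len(A) + len(B))
    forbes = (inter * union) / (inter * union + 1.5 * a_excl * b_excl)
    return jaccard, sorensen_dice, forbes

# Example: exact_similarities({'AAC', 'ACG', 'CGT'}, {'ACG', 'CGT', 'GTT'})
# -> (0.5, 0.666..., 0.842...)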
def davies_bouldin(self):
    """
    This method computes the Davies-Bouldin (DB) value of a given clustering.
    :return: the Davies-Bouldin value of the clustering
    """
    # get the average internal cluster distances
    cluster_averages = self.cluster_averages()
    # create variable for db
    davies_bouldin = 0.0
    s = Similarity(self.e)
    # for each cluster / centroid i
    for i in range(self.solution.num_clusters):
        # for each cluster / centroid j
        for j in range(self.solution.num_clusters):
            # when i and j are not the same cluster / centroid
            if j != i:
                # calculate the distance between the two centroids of i and j
                d_ij = s.fractional_distance(self.solution.centroids[i], self.solution.centroids[j])
                # sum of the internal cluster distances of clusters i and j divided by
                # the previously computed value, i.e. the distance between centroid i and centroid j
                d_ij = (cluster_averages[i] + cluster_averages[j]) / d_ij
                # update db if this is larger than any value seen before
                davies_bouldin = max(d_ij, davies_bouldin)
    return davies_bouldin
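For reference, the textbook Davies-Bouldin index averages the worst-case ratio over clusters,

DB = \frac{1}{k} \sum_{i=1}^{k} \max_{j \neq i} \frac{s_i + s_j}{d(c_i, c_j)}

where s_i is the average internal distance of cluster i and d(c_i, c_j) the distance between centroids. The method above instead returns the single largest ratio over all pairs, which serves the same "lower is better" purpose of penalizing loose, poorly separated clusters.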
def averageSimilarityFeatureExtractor(self, originalDocuments, machineSummary, humanSummaries):
    simWithOD = s.calculateAverageSimilarity(machineSummary, originalDocuments)
    simWithHS = s.calculateAverageSimilarity(machineSummary, humanSummaries)
    return [simWithOD, simWithHS]
def computeSimilarity(self):
    """
    compute the similarity between nodes using the distances computed earlier
    :return: none
    """
    if self.verbose:
        print("Computing Similarities...", end=" ", flush=True)
    # 2d arrays to hold all distances and relation ids
    distances = []
    ids = []
    # get list of patient ids
    nodes = self.conn.getSortedIDList()
    for i in range(len(nodes)):
        # compute average distance to neighbors for node i
        xi_N = self.conn.getPatientRelations(nodes[i][0])
        distances.append([x[2] for x in xi_N])
        ids.append([x[1] for x in xi_N])
    # compute the similarities
    buffer = Similarity.measure(ids, distances)
    # write the similarities to disk
    if self.verbose:
        print("Done.")
        print("Writing Similarities...", end=" ", flush=True)
    self.conn.updateRelationsFromBuffer(buffer)
    if self.verbose:
        print(" Done.")
def parseCreatorFile(self, fileName, delimiter):
    """
    Parse first data type to create the nodes for the graph
    :param fileName: file from which to read (.csv)
    :param delimiter: delimiter of file
    :return: none
    """
    # allocate nodes in the background; pass the callable and its arguments
    # separately so the Thread runs it concurrently instead of calling it here
    task = threading.Thread(target=self.allocateNodes, args=(fileName, delimiter))
    task.start()
    # new reader to measure the distances
    with open(fileName, 'r') as f_in:
        if self.verbose:
            print("Calculating Distances...", end=" ", flush=True)
        reader = csv.reader(f_in, delimiter=delimiter)
        # skip headers
        next(reader)
        patients = [row for row in reader if row != []]
        ids = [row[0] for row in patients]
        buffer = Similarity.initialDistance(ids, patients)
    # wait until all new nodes are allocated
    task.join()
    if self.verbose:
        print(" Done.")
        print("Writing Distances...", end=" ", flush=True)
    self.conn.addRelationsFromBuffer(buffer)
    self.files_read += 1
    if self.verbose:
        print(" Done.")
def sequenceSimilarities(subtrees):
    # print(subtrees)
    length = len(subtrees)
    maxSimilarity = -1
    maxTreeX = ""
    maxNodeX = ""
    maxTreeY = ""
    maxNodeY = ""
    for fileA, sequencesA in sorted(subtrees.items()):
        for fileB, sequencesB in sorted(subtrees.items()):
            # print(fileA)
            # print(fileB)
            if fileA != fileB:
                root = subtrees[fileA]['ROOT']
                for sequence in subtrees[fileB]:
                    similarity = Similarity.domainSim(root, subtrees[fileB][sequence])
                    if similarity > maxSimilarity:
                        maxSimilarity = similarity
                        maxTreeX = fileA
                        maxNodeX = 'ROOT'
                        maxTreeY = fileB
                        maxNodeY = sequence
    # indices = sorted([(maxTreeX, maxNodeX), (maxTreeY, maxNodeY)], key=lambda x: x[0])
    return maxTreeX, maxTreeY, maxNodeY, maxSimilarity
def SimilarityIndexes(expList, simList, indextype, indexname):
    index = 0
    index_sum = 0
    number = 0
    for i in range(len(expList)):
        for j in range(len(simList)):
            index = Similarity.SimilarityIndex(expList[i], simList[j], indextype)
            if index != 0:
                index_sum += index
                number += 1
    outdir = './ResultData'
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    pd.DataFrame(expList).to_csv(os.path.join(outdir, 'explist-' + indextype + indexname + '.txt'), mode='a', sep=' ')
    pd.DataFrame(simList).to_csv(os.path.join(outdir, 'simlist-' + indextype + indexname + '.txt'), mode='a', sep=' ')
    # pd.DataFrame(simList).to_csv(outdir + '/explist-' + indextype + indexname + '.txt', mode='a', sep=' ')
    if number == 0:
        number = 1
    return index_sum / number
def SimilarityIndexes(expList, simList, indextype, indexname):
    index = 0
    index_sum = 0
    number = 0
    index_max = 0
    for i in range(len(expList)):
        for j in range(len(simList)):
            index = Similarity.SimilarityIndex(expList[i], simList[j], indextype)
            if index != 0:
                index_sum += index
                number += 1
                index_max = max(index, index_max)
    pd.DataFrame(expList).to_csv(r'ResultData//explist' + indextype + indexname + '.txt', mode='a', sep=' ')
    pd.DataFrame(simList).to_csv(r'ResultData//simlist' + indextype + indexname + '.txt', mode='a', sep=' ')
    if number == 0:
        number = 1
    index = index_sum / number
    return index
def estSeries(M, ts):
    # numlags and wgtfunc are resolved from the enclosing scope
    est = np.zeros(ts.shape)
    for k in range(M.shape[0]):
        poi = M[k, :]
        dists, inds = Similarity.findClosestInclusive(poi, M, numlags + 1)
        w = wgtfunc(np.array(dists))
        est[k] = (w * ts[list(inds)]).sum()
    return est
def __init__(self, indirizzo, doc):
    # Takes as input the address of the feedback file ("TIME.REL") and a VSM.
    # Fills a dictionary: the key is the query ID and the value is the list of
    # relevant document IDs for that query.
    self.VSM = doc
    self.SIM = SI.Similarity(doc)
    self.relevance = {}  # query ID -> list of relevant document IDs
    lines = [line.strip() for line in open(indirizzo)]
    lines = list(filter(None, lines))
    for i in range(0, doc.numquery):
        l = [int(s) for s in " ".join(lines[i].split()).split(' ')]
        self.relevance[l[0]] = l[1:]
def plotSerials(plotlist, methods):
    testFile = join("Test", "pattern_Test.csv")
    kpi = loadTestKPI(testFile)
    targetMap, serialMatrix = loadTest(testFile)
    s1 = serialMatrix[plotlist[0]]
    s2 = serialMatrix[plotlist[1]]
    SL = [serialMatrix[i] for i in plotlist]
    NL = ['serial{0}'.format(i) for i in plotlist]
    TL = [targetMap[i] for i in plotlist]
    offsets = {}
    dtw_ss = [e for e in methods if "dtw_s" in e]
    for m in dtw_ss:
        s = Similarity.Similarity(s1, s2)
        NL.append('serial{0} shifted by {1}({2})'.format(plotlist[-1], m, s.use_method(m)))
        TL.append(targetMap[plotlist[-1]])
        SL.append(s2 + s.bestShiftY)
    dtw_ss = [e for e in methods if e.startswith('m')]
    for m in dtw_ss:
        s = Similarity.Similarity(s1, s2)
        TL.append(targetMap[plotlist[-1]])
        NL.append('serial{0} shifted by {1} :({2})'.format(plotlist[-1], m, s.use_method(m)))
        if (s.bestShiftX > 0):
            offsets[len(SL)] = s.bestShiftX
            SL.append(s2[:-s.bestShiftX])
        else:
            SL.append(s2[-s.bestShiftX:])
    CommonOperation.plotSerials(SL, NL, TL, plotList=range(len(SL)), lowerbound=0, offsets=offsets)
def params(self):
    self.tb_defalt = "new pose name or filename to load and save"
    self.Min = 0.95
    self.Posenames = []
    self.finger_state = []
    self.max_power = []
    self.sim = Similarity.Similarity()
    self.st_flg = False
    self.file_name = ""
    self.portname = "/dev/ttyACM1"
    self.baudrate = 115200
    self.connected = False
    self.rpy = {"roll": 0, "pitch": 1, "yaw": 2}
def estManifold(Mx, My, wgtfunc):
    '''
    Estimate My from Mx.
    '''
    Mest = np.zeros(My.shape)
    for k in range(Mx.shape[0]):
        poi = Mx[k, :]
        dists, inds = Similarity.findClosestInclusive(poi, Mx, Mx.shape[1] + 1)
        w = wgtfunc(np.array(dists))
        pts = [My[j, :] for j in inds]
        Mest[k, :] = sum([w[j] * pts[j] for j in range(len(w))])
    return Mest
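Both estSeries and estManifold above take a wgtfunc that maps an array of neighbor distances to weights. The actual weight function is not shown in this section; a minimal, hypothetical example is normalized inverse-distance weighting:

import numpy as np

# Hypothetical weight function for estSeries / estManifold: normalized
# inverse-distance weights (the wgtfunc used by the original code is not shown here).
def inverse_distance_weights(dists, eps=1e-12):
    inv = 1.0 / (np.asarray(dists, dtype=float) + eps)  # avoid division by zero
    return inv / inv.sum()                               # weights sum to 1

# e.g. Mest = estManifold(Mx, My, inverse_distance_weights)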
def forbes(num_cards, base_start, base_stop, num_trials, exp_forbes, read_lengths):
    # num_cards = 50
    # Generates a range of cardinalities for set generation
    cardinalities = np.logspace(base_start, base_stop, num_cards)
    plot = np.zeros(num_cards)
    for i in range(num_cards):
        print('Starting ' + str(i + 1) + ' out of ' + str(num_cards))
        card = int(cardinalities[i])
        # do 10 trials for each cardinality
        results = np.zeros(num_trials)
        for j in range(num_trials):
            h1 = HLL(12)
            h2 = HLL(12)
            # Generate reads of length 40 based on the expected Forbes, 0.1
            a, b, exp_forbes = Rangen_forbes.generate_reads(exp_forbes, card, card, read_lengths)
            for s in a:
                h1.insert(s)
            for s in b:
                h2.insert(s)
            union = Similarity.union(h1, h2).cardinality()
            intersection = Similarity.intersection(h1, h2)
            b_size, c_size, a_size = Similarity.getJointEstimators(h1, h2)
            obs_forbes = (intersection * union) / ((intersection * union) + 3 / 2 * (b_size * c_size))
            error = 100 * (obs_forbes - exp_forbes) / exp_forbes
            results[j] = error
        plot[i] = np.mean(results)
    print(plot)
    plt.xscale('log')
    plt.title('Forbes (set at 0.1) Accuracy for reads of length 40')
    plt.xlabel('Cardinality')
    plt.ylabel('% Error (mean of 10)')
    plt.ylim(-100, 100)
    plt.scatter(cardinalities, plot)
    plt.show()
def jaccard(num_cards, base_start, base_stop, num_trials, exp_jaccard, read_lengths):
    # Generates a range of cardinalities for set generation
    cardinalities = np.logspace(base_start, base_stop, num_cards)
    plot = np.zeros(num_cards)
    for i in range(num_cards):
        print('Starting ' + str(i + 1) + ' out of ' + str(num_cards))
        card = int(cardinalities[i])
        # do 10 trials for each cardinality
        results = np.zeros(num_trials)
        for j in range(num_trials):
            h1 = HLL(12)
            h2 = HLL(12)
            # Generate reads of length 40 based on the expected Jaccard, 0.02
            a, b, exp_jaccard, forbes, sd = Rangen_jaccard.generate_reads(exp_jaccard, card, card, read_lengths)
            for s in a:
                h1.insert(s)
            for s in b:
                h2.insert(s)
            # Union and intersection calculations for the Jaccard estimate
            union = Similarity.union(h1, h2).cardinality()
            intersection = Similarity.intersection(h1, h2)
            obs_jaccard = intersection / union
            # Expected Jaccard is 0.02, error calculation
            error = 100 * (obs_jaccard - exp_jaccard) / exp_jaccard
            results[j] = error
        plot[i] = np.mean(results)
    # Percent errors being displayed
    print(plot)
    plt.xscale('log')
    plt.title('Jaccard (set at 0.02) Accuracy for reads of length 40')
    plt.xlabel('Cardinality')
    plt.ylabel('% Error (mean of 10)')
    plt.ylim(-100, 100)
    plt.scatter(cardinalities, plot)
    plt.show()
def scoreHandler(event, context):
    file1Url = event["file1"]
    file2Url = event["file2"]
    file1 = requests.get(file1Url)
    file2 = requests.get(file2Url)
    if file1.status_code == 200 and file2.status_code == 200:
        sim = Similarity.Similarity()
        pdf = ConvertPdf.ConvertPDF()
        fileStream1 = io.BytesIO(file1.content)
        fileStream2 = io.BytesIO(file2.content)
        text1 = pdf.convertPDF(fileStream1)
        text2 = pdf.convertPDF(fileStream2)
        returnVal = {}
        returnVal["sim"] = sim.similarity(text1, text2)
        return json.dumps(returnVal)
    return 0
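A sketch of how this handler might be invoked locally; the URLs are placeholders, the context argument is unused by the handler above, and the returned value is illustrative only:

# Hypothetical local invocation of scoreHandler with placeholder PDF URLs.
event = {
    "file1": "https://example.com/paper_a.pdf",
    "file2": "https://example.com/paper_b.pdf",
}
print(scoreHandler(event, context=None))  # e.g. '{"sim": 0.87}' (illustrative value)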
def test_main(filename_s, filename_e, filename_a):
    wordlist_s = TextPreprocess.Tokens(filename_s)
    wordlist_e = TextPreprocess.Tokens(filename_e)
    dict_s = Process.DictBiulder(wordlist_s)
    dict_e = Process.DictBiulder(wordlist_e)
    Fin_dict = Process.MergeDict(dict_s, dict_e)
    V1 = Features.GetVector(dict_s, Fin_dict)
    V2 = Features.GetVector(dict_e, Fin_dict)
    ans = Similarity.CosineSimilarity(V1, V2)
    with open(filename_a, 'w') as f_obj:
        temp = str(ans)
        contents = ''
        for i in range(0, 4):
            contents = contents + temp[i]
        f_obj.write(contents)
        print(contents)
def intersection(num_cards, base_start, base_stop, num_trials, exp_jaccard, read_lengths):
    # Creates a range of cardinalities for set generation
    cardinalities = np.logspace(base_start, base_stop, num_cards)
    plot = np.zeros(num_cards)
    for i in range(num_cards):
        print('Starting ' + str(i + 1) + ' out of ' + str(num_cards))
        card = int(cardinalities[i])
        # do 10 trials for each cardinality
        results = np.zeros(num_trials)
        for j in range(num_trials):
            h1 = HLL(12)
            h2 = HLL(12)
            # Random read (length 40) generator based on the cardinalities and the expected Jaccard value
            a, b, exp_jaccard, forbes, sd = Rangen_jaccard.generate_reads(exp_jaccard, card, card, read_lengths)
            for s in a:
                h1.insert(s)
            for s in b:
                h2.insert(s)
            # Calculation of the intersection between 2 HLLs
            intersection = Similarity.intersection(h1, h2)
            # Expected intersection based on the Jaccard formula (expected Jaccard of 0.02)
            num_overlapped = math.ceil(0.02 * (card * 2) / (0.02 + 1))
            # Percent error calculation
            error = 100 * (intersection - num_overlapped) / num_overlapped
            results[j] = error
        plot[i] = np.mean(results)
    # Print out percent errors
    print(plot)
    plt.xscale('log')
    plt.title('Intersection Accuracy for reads of length 40')
    plt.xlabel('Cardinality')
    plt.ylabel('% Error (mean of 10)')
    plt.ylim(-100, 100)
    plt.scatter(cardinalities, plot)
    plt.show()
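The hard-coded expected overlap above follows from inverting the Jaccard definition when both sets have the same cardinality; a small hypothetical helper makes the formula explicit:

import math

# From J = I / (|A| + |B| - I) it follows that I = J * (|A| + |B|) / (1 + J),
# which for J = 0.02 and |A| = |B| = card reduces to 0.02 * (card * 2) / 1.02.
def expected_overlap(jaccard, card_a, card_b):
    return math.ceil(jaccard * (card_a + card_b) / (1 + jaccard))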
def sd(num_cards, base_start, base_stop, num_trials, exp_sd, read_lengths):
    # Generates a range of cardinalities for set generation
    cardinalities = np.logspace(base_start, base_stop, num_cards)
    plot = np.zeros(num_cards)
    for i in range(num_cards):
        print('Starting ' + str(i + 1) + ' out of ' + str(num_cards))
        card = int(cardinalities[i])
        # do 10 trials for each cardinality
        results = np.zeros(num_trials)
        for j in range(num_trials):
            h1 = HLL(12)
            h2 = HLL(12)
            # Generate reads of length 40 based on the expected Sorensen-Dice, 0.04
            a, b, exp_sd = Rangen_sorensen_dice.generate_reads(exp_sd, card, card, read_lengths)
            for s in a:
                h1.insert(s)
            for s in b:
                h2.insert(s)
            # Calculation of intersection between two HLLs, necessary for SD calculation
            intersection = Similarity.intersection(h1, h2)
            # Calculation of the observed Sorensen-Dice
            obs_sd = 2 * intersection / (h1.cardinality() + h2.cardinality())
            # Percent error calculation for SD
            error = 100 * (obs_sd - exp_sd) / exp_sd
            results[j] = error
        plot[i] = np.mean(results)
    # Print out all percent errors
    print(plot)
    plt.xscale('log')
    plt.title('Sorensen-Dice (set at 0.04) Accuracy for reads of length 40')
    plt.xlabel('Cardinality')
    plt.ylabel('% Error (mean of 10)')
    plt.ylim(-100, 100)
    plt.scatter(cardinalities, plot)
    plt.show()
def parseNewDataType(self, fileName, delimiter):
    """
    Parse additional data types for existing nodes in the graph
    :param fileName: file from which to read (.csv)
    :param delimiter: delimiter of file
    :return: none
    """
    # add attributes in the background; pass the callable and its arguments
    # separately so the Thread runs it concurrently instead of calling it here
    task = threading.Thread(target=self.addAttributes, args=(fileName, delimiter))
    task.start()
    with open(fileName, 'r') as f_in:
        if self.verbose:
            print("Calculating Distances for Data Type " + str(self.files_read) + " ...", end=" ", flush=True)
        reader = csv.reader(f_in, delimiter=delimiter)
        # skip headers
        next(reader)
        patients = [row for row in reader if row != []]
        distances = Similarity.initialDistance(patients)
        ids = [row[0] for row in patients]
    task.join()
    # write initial distances
    self.conn.addRelationsFromBuffer(ids, distances, "Similarity " + str(self.files_read))
    self.files_read += 1
    if self.verbose:
        print("Done.")
def calcErrs(summary, M1est, M2est, M1ref=M1[corr:, :], M2ref=M2[corr:, :],
             name1='M{0}'.format(names[compind1]), name2='M{0}'.format(names[compind2])):
    print('############################################################################')
    print(summary)
    sys.stdout.flush()
    err1 = Similarity.RootMeanSquaredErrorManifold(M1ref, M1est)
    err2 = Similarity.RootMeanSquaredErrorManifold(M2ref, M2est)
    printMe('RMSE', name1, name2, err1, err2)
    err1 = Similarity.MeanErrorManifold(M1ref, M1est)
    err2 = Similarity.MeanErrorManifold(M2ref, M2est)
    printMe('Mean error per point', name1, name2, err1, err2)
    err1 = Similarity.HausdorffDistance(M1ref, M1est)
    err2 = Similarity.HausdorffDistance(M2ref, M2est)
    printMe('Hausdorff distance', name1, name2, err1, err2)
def reveal(self):
    # handle blank entries; weight default values are 1
    if len(self.Holiday_Type_cb.get()) == 0:
        self.Holiday_Type_cb.set("Arbitrary")
    if len(self.Holiday_Type_Weight_cb.get()) == 0:
        self.Holiday_Type_Weight_cb.set(1)
    if len(self.Price_Entry.get()) == 0:
        self.Price_Entry.insert(0, 10000)  # maximum price
    # handle non-integer input for price
    try:
        x = int(self.Price_Entry.get())
    except ValueError:
        output = "Value for Price must be an integer"
        self.text.delete(0.0, END)
        self.text.insert(0.0, output)
    if len(self.Price_Weight_cb.get()) == 0:
        self.Price_Weight_cb.set(1)
    if len(self.Number_Of_Persons_cb.get()) == 0:
        self.Number_Of_Persons_cb.set(1)  # minimum number of people
    if len(self.Number_Of_Persons_Weight_cb.get()) == 0:
        self.Number_Of_Persons_Weight_cb.set(1)
    if len(self.Region_cb.get()) == 0:
        self.Region_cb.set("Arbitrary")
    if len(self.Region_Weight_cb.get()) == 0:
        self.Region_Weight_cb.set(1)
    if len(self.Transportation_cb.get()) == 0:
        self.Transportation_cb.set("Arbitrary")
    if len(self.Transportation_Weight_cb.get()) == 0:
        self.Transportation_Weight_cb.set(1)
    if len(self.Duration_cb.get()) == 0:
        self.Duration_cb.set(1)
    if len(self.Duration_Weight_cb.get()) == 0:
        self.Duration_Weight_cb.set(1)
    if len(self.Season_cb.get()) == 0:
        self.Season_cb.set("Arbitrary")
    if len(self.Season_Weight_cb.get()) == 0:
        self.Season_Weight_cb.set(1)
    if len(self.Accommodation_Type_cb.get()) == 0:
        self.Accommodation_Type_cb.set("Arbitrary")
    if len(self.Accommodation_Type_Weight_cb.get()) == 0:
        self.Accommodation_Type_Weight_cb.set(1)

    query_case = Case('Query Journey', '0', self.Holiday_Type_cb.get(), self.Price_Entry.get(),
                      self.Number_Of_Persons_cb.get(), self.Region_cb.get(), self.Transportation_cb.get(),
                      self.Duration_cb.get(), self.Season_cb.get(), self.Accommodation_Type_cb.get(), "Hotel")

    Total_Weight = int(self.Holiday_Type_Weight_cb.get()) + int(self.Price_Weight_cb.get()) + \
        int(self.Number_Of_Persons_Weight_cb.get()) + int(self.Region_Weight_cb.get()) + \
        int(self.Transportation_Weight_cb.get()) + int(self.Duration_Weight_cb.get()) + \
        int(self.Season_Weight_cb.get()) + int(self.Accommodation_Type_Weight_cb.get())

    Score_list = []
    output = ""
    index_list = []
    for i in range(len(cases)):
        # Calculate local similarity scores
        holiday_type_similarity = Similarity.holiday_type(query_case, cases[i])
        price_similarity = Similarity.price(query_case, cases[i])
        number_of_persons_similarity = Similarity.number_of_persons(query_case, cases[i])
        region_similarity = Similarity.region(query_case, cases[i])
        transportation_similarity = Similarity.transportation(query_case, cases[i])
        duration_similarity = Similarity.duration(query_case, cases[i])
        season_similarity = Similarity.season(query_case, cases[i])
        accommodation_similarity = Similarity.accommodation(query_case, cases[i])
        # Calculate weighted global score
        global_holiday_type = holiday_type_similarity * int(self.Holiday_Type_Weight_cb.get())
        global_price = price_similarity * int(self.Price_Weight_cb.get())
        global_number_of_persons = number_of_persons_similarity * int(self.Number_Of_Persons_Weight_cb.get())
        global_region = region_similarity * int(self.Region_Weight_cb.get())
        global_transportation = transportation_similarity * int(self.Transportation_Weight_cb.get())
        global_duration = duration_similarity * int(self.Duration_Weight_cb.get())
        global_season = season_similarity * int(self.Season_Weight_cb.get())
        global_accommodation = accommodation_similarity * int(self.Accommodation_Type_Weight_cb.get())
        total_similarity = global_holiday_type + global_price + global_number_of_persons + \
            global_region + global_transportation + global_duration + \
            global_season + global_accommodation
        global_similarity = total_similarity / Total_Weight
        index_list.append([i, global_similarity,
                           [holiday_type_similarity, price_similarity, number_of_persons_similarity,
                            region_similarity, transportation_similarity, duration_similarity,
                            season_similarity, accommodation_similarity]])

    # Rank and RETRIEVE the 10 most similar cases
    index_list.sort(key=lambda x: x[1], reverse=True)
    k = 10
    for i in range(k):
        cases_index = index_list[i][0]
        case = cases[cases_index]
        output += "Similarity score: " + str(round(index_list[i][1], 3)) + "\n" + \
                  "Holiday Type: " + case.holiday_type + "\n" + \
                  "Price: " + str(case.price) + "\n" + \
                  "Number of Persons: " + str(case.number_of_persons) + "\n" + \
                  "Region: " + case.region + "\n" + \
                  "Transportation: " + case.transportation + "\n" + \
                  "Duration: " + str(case.duration) + "\n" + \
                  "Season: " + case.season + "\n" + \
                  "Accommodation: " + case.accommodation + "\n" + \
                  "Hotel: " + case.hotel + "\n \n"
        # An alternative output format also appends the individual local similarity
        # scores (index_list[i][2][0..7]) after each attribute line.

    # Display output
    self.text.delete(0.0, END)
    self.text.insert(0.0, output)
    # [ser.write(i.to_bytes(1, byteorder='little')) for i in buf]
    if connected:
        ser.flushInput()
        ser.flushOutput()
        [ser.write(chr(i)) for i in buf]


def sum_str(str_arr):
    string = ""
    for i in str_arr:
        string += i
    return string


# ----------------------------------- Variables ----------------------------------- #
sub = Subscribers()
init_pose_pub = rospy.Publisher("/init_pose", UInt8, queue_size=1)
sim = Similarity.Similarity()
Posenames = []
finger_state = []
max_power = []
root = Tk()
Min = 0.95
tb_defalt = "new pose name or filename to load and save"
th1 = th.Thread(target=find_proc)
st_flg = False
file_path = "/home/fumyia/"
portname = "/dev/ttyACM1"
baudrate = 115200
connected = False
try:
    ser = serial.Serial(portname, baudrate)
    connected = True
#             words[key[0]] = scores[scores_index[i]] * key[1]
# words = sorted(words.items(), key=lambda i: i[1], reverse=True)[:10]
# print('dot_att_', words)
if len(udata) > 0:
    ldaModel = LDA.LDAModel(data, n_topics=num_topics)
    data_vec = []
    for i in range(num_data_final):
        data_vec.append(ldaModel.get_doc_vec(data[i]))
    user_doc_vec = []
    for text in udata:
        user_doc_vec.append(ldaModel.get_doc_vec(text))
    scores = Similarity.avg_sim(user_doc_vec, data_vec)
    scores_index = np.argsort(scores)[::-1]
    words = {}
    for i in range(num_cand):
        # print(data[scores_index[i]])
        keys = jieba.analyse.extract_tags(ldaModel.preprocess.deltag(data[scores_index[i]]),
                                          num_word, withWeight=True)
        # print(keys)
        for key in keys:
            if key[0] not in key_set:
                if key[0] in words:
                    words[key[0]] += scores[scores_index[i]] * key[1]
                else:
                    words[key[0]] = scores[scores_index[i]] * key[1]
    words = sorted(words.items(), key=lambda i: i[1], reverse=True)[:10]
    raise ValueError("Test file list path '{}' does not exist!".format(args.test_file_list))
else:
    with open(args.test_file_list, mode='r') as f:
        all_test_file = f.readlines()
    for file_path in all_test_file:
        if not os.path.exists(file_path.strip()):
            # print(file_path)
            raise ValueError('Please move all test files into the current folder, '
                             'or use absolute paths in the list!')

if args.flag:
    print('Loading...')
    wv_from_text = KeyedVectors.load_word2vec_format(args.pre_train_txt, binary=False)
    print('Done\n')
    S = Similarity(wv_from_text)
else:
    print('Training...')
    model = word2vec.Word2Vec(whole_file_sens, hs=1, min_count=1, window=3, size=200)
    print('Done\n')
    wv_simple = model.wv
    del model
    S = Similarity(wv_simple)

for file_path in all_test_file:
    f = codecs.open(file_path.strip())
    txt = f.read()
import TextPreprocess
import Process
import Features
import Similarity
import sys

wordlist_s = TextPreprocess.Tokens(sys.argv[1])
wordlist_e = TextPreprocess.Tokens(sys.argv[2])
dict_s = Process.DictBiulder(wordlist_s)
dict_e = Process.DictBiulder(wordlist_e)
Fin_dict = Process.MergeDict(dict_s, dict_e)
V1 = Features.GetVector(dict_s, Fin_dict)
V2 = Features.GetVector(dict_e, Fin_dict)
ans = Similarity.CosineSimilarity(V1, V2)
with open(sys.argv[3], 'w') as f_obj:
    if ans == 0:
        f_obj.write("0.00")
    else:
        temp = str(ans)
        contents = ''
        for i in range(0, 4):
            contents = contents + temp[i]
        f_obj.write(contents)
def testMethods(testFile, methodList, output_dir):
    CommonOperation.checkDirectory(output_dir)
    index2Class, serialMatrix = loadTest(testFile)
    N = len(index2Class)
    assert N == serialMatrix.shape[0]
    clsSizes = dict([[cls, len([e for e in index2Class if e == cls])] for cls in set(index2Class)])
    clsMaxScores = dict([[cls, np.sum([N - e for e in range(clsSizes[cls])])] for cls in set(index2Class)])
    clsMinScores = dict([[cls, np.sum([e + 1 for e in range(clsSizes[cls] - 1)]) + N] for cls in set(index2Class)])
    method2ScoreRate = {}
    method2ErrorRate = {}
    for method in methodList:
        print("method:{0}".format(method))
        scoreValues = []
        scoreRates = []
        errorCounts = []
        errorRates = []
        matrix_dump_file = join(output_dir, "similarity_matrix", "{0}".format(method))
        if (os.path.exists(matrix_dump_file)):
            simiarityMatrix = CommonOperation.pickleLoad(matrix_dump_file, {})
        else:
            simiarityMatrix = np.zeros([N, N], dtype=float)
            for i in range(N):
                cls = index2Class[i]
                print("{0}(id:\033[1;{2}m{1}\033[0m)".format(cls, i, cls + 31), end=",")
                for j in range(i, N):
                    s = Similarity.Similarity(serialMatrix[i], serialMatrix[j])
                    simiarity_ij = s.use_method(method)
                    simiarityMatrix[i][j] = simiarity_ij
                    simiarityMatrix[j][i] = simiarity_ij
            print()
            CommonOperation.pickleDump(matrix_dump_file, simiarityMatrix)
        for i in range(N):
            cls = index2Class[i]
            similarRank = list(np.argsort(simiarityMatrix[i]))
            similarRank.reverse()
            scoreValues.append(np.sum([(N - j) for j, r in enumerate(similarRank) if index2Class[r] == cls]))
            scoreRates.append(100.0 * (scoreValues[-1] - clsMinScores[cls]) / (clsMaxScores[cls] - clsMinScores[cls]))
            errorCounts.append(clsSizes[cls] - len([r for r in similarRank[:clsSizes[cls]] if index2Class[r] == cls]))
            errorRates.append(100.0 * errorCounts[-1] / clsSizes[cls])
            # for j in range(N):
            #     print("{0}(id:\033[1;{2}m{1}\033[0m,sim:{3})".format(index2Class[similarRank[j]], similarRank[j], index2Class[similarRank[j]] + 31, simiarityMatrix[i][similarRank[j]]), end=",")
            print("statistic:score:{0}({1}%),errorCount:{2}/({3}%)".format(
                scoreValues[-1], scoreRates[-1], errorCounts[-1], errorRates[-1]))
        # print("final statistic(average):\n\tscoreRate:{0}\n\terrorRate:{1}".format(np.average(scoreRates), np.average(errorRates)))
        method2ScoreRate[method] = scoreRates
        method2ErrorRate[method] = errorRates
        plt.figure(figsize=(40, 10))
        plt.bar(range(N), scoreRates)
        plt.xlabel("index")  # set the x-axis label
        plt.ylabel("score rate")  # set the y-axis label
        plt.title("{0} score rate ".format(method))
        plt.ylim(50, 100)
        plt.savefig(join(output_dir, "{0}_score_rate.png".format(method)))
        plt.close()
        plt.figure(figsize=(40, 10))
        plt.bar(range(N), errorRates)
        plt.xlabel("index")  # set the x-axis label
        plt.ylabel("error rate")  # set the y-axis label
        plt.title("{0} error rate ".format(method))
        plt.ylim(0, 50)
        plt.savefig(join(output_dir, "{0}_error_rate.png".format(method)))
        plt.close()
        print("statistic(average):\n|algorithm|scoreRate|errorRate|\n|:-:| :-: | :-: |")
        print("|{0}|{1}|{2}|".format(method, "%.3f" % np.average(scoreRates), "%.3f" % np.average(errorRates)))
    methodList = sorted(methodList, key=lambda method: np.average(method2ErrorRate[method]))
    print(methodList)
    print("final statistic(average):\n|algorithm|scoreRate(%)| errorRate(%)|\n|:-:| :-: | :-: |")
    for method in methodList:
        print("|{0}|{1}|{2}|".format(method, "%.2f" % np.average(method2ScoreRate[method]),
                                     "%.2f" % np.average(method2ErrorRate[method])))
    CommonOperation.pickleDump(join(output_dir, "method2ScoreRate"), method2ScoreRate)
    CommonOperation.pickleDump(join(output_dir, "method2ErrorRate"), method2ErrorRate)
class LexC:
    d = s.Similarity()
    sim_dict = {}

    def calculate_centroid(self, c, features, weights, prev_dict, next_dict):
        clusters = c
        centroid = []
        print("Calculating Centroids------")
        for wordi in clusters:
            r = []
            for wordj in clusters[wordi]:
                self.sim_dict.setdefault(wordj, {})
                rj = 0
                for word in clusters[wordi]:
                    di = self.d.calculate_distance(wordj, word, features, weights, prev_dict, next_dict)
                    self.sim_dict[wordj][word] = di
                    rj = rj + di
                r.append(rj)
            m = np.argmax(r)
            centroid.append(clusters[wordi][m])
        print("Centroids Calculated------")
        return centroid

    def make_clusters(self, initial_clusters, max_iteration, threshold, features, weights, prev_dict, next_dict):
        words = list(prev_dict.keys())
        c = initial_clusters
        for it in range(max_iteration):
            print("Iteration Number:" + str(it) + "-----")
            centroid = self.calculate_centroid(c, features, weights, prev_dict, next_dict)
            cluster = {}
            for i in centroid:
                cluster.setdefault(i, [])
            print("Making Clusters----")
            for w in words:
                print(w)
                closest = 0
                maxSim = 0
                x = list(self.sim_dict[w].keys())
                inter = list(set(x).intersection(set(centroid)))
                for ci in inter:
                    try:
                        if (w[0] == ci[0] and w[1] == ci[1]):
                            if self.sim_dict[w][ci] > threshold and self.sim_dict[w][ci] > maxSim:
                                maxSim = self.sim_dict[w][ci]
                                closest = ci
                    except:
                        if (w[0] == ci[0]):
                            if self.sim_dict[w][ci] > threshold and self.sim_dict[w][ci] > maxSim:
                                maxSim = self.sim_dict[w][ci]
                                closest = ci
                if closest != 0:
                    cluster[closest].append(w)
                else:
                    key = w
                    cluster.setdefault(key, [])
                    centroid.append(key)
            clusters = {}
            x = 0
            for i in cluster:
                clusters[x] = cluster[i]
                if (i not in clusters[x]):
                    clusters[x].append(i)
                x += 1
            c = clusters
        print("Writing Clusters to file-----")
        self.write_results(clusters, features, weights)
        return clusters

    def write_results(self, clusters, features, weights):
        filename = features + str(weights[0]) + "_" + str(weights[1]) + "_" + str(weights[2]) + "_" + str(weights[3])
        file = open("Result/" + filename + " Cluster.txt", "w")
        for i in clusters:
            file.write(str(i) + " ")
            for j in clusters[i]:
                file.write(j + ",")
            file.write("\n")
def compute_similarity(patient_list):
    global patients_pointer, tanimoto_edges_output, tanimoto_nodes_output, tanimoto_weighted_output, \
        tanimoto_bigrams_output, jaccard_output
    # calculate the similarity and output it to the files
    od = OrderedDict(sorted(patient_list.items()))
    patients_pointer = od
    # create a list of the other patients
    other_patients = copy.deepcopy(patients_pointer)
    # iterate over the patients and compute the similarity
    for keyA, patientA in patients_pointer.items():
        # if keyA != 10013:
        #     continue
        # store the scores of the test
        tanimoto_edges_scores = []
        tanimoto_nodes_scores = []
        tanimoto_bigrams_scores = []
        jaccard_scores = []
        # iterate over all of the other patients
        for keyB, patientB in other_patients.items():
            # if keyB != 10023:
            #     continue
            # Find all of the common edges between patient A and patient B
            common_list = sorted(list(set(patientA.get_all_edges()) | set(patientB.get_all_edges())))
            # Find all of the common node/node pairs between patient A and patient B
            common_combined_nodes = sorted(
                list(set(patientA.get_all_combined_nodes()) | set(patientB.get_all_combined_nodes())))
            bigrams_a = patientA.get_all_combined_nodes_bigrams()
            bigrams_b = patientB.get_all_combined_nodes_bigrams()
            common_nodes_bigrams = sorted(list(set(bigrams_a) | set(bigrams_b)))
            # Create the edge vector for each patient
            vector_a_edges = patientA.get_edges_vector(common_list)
            vector_b_edges = patientB.get_edges_vector(common_list)
            # Create the node vector for each patient
            vector_a_nodes = patientA.get_nodes_vector(common_combined_nodes, False)
            vector_b_nodes = patientB.get_nodes_vector(common_combined_nodes, False)
            vector_a_nodes_bigrams = patientA.get_nodes_vector(common_nodes_bigrams, True)
            vector_b_nodes_bigrams = patientB.get_nodes_vector(common_nodes_bigrams, True)
            # Compute the scores
            tanimoto_nodes = sim.compute_tanimoto_coeff(vector_a_nodes, vector_b_nodes)
            tanimoto_edges = sim.compute_tanimoto_coeff(vector_a_edges, vector_b_edges)
            tanimoto_bigrams = sim.compute_tanimoto_coeff(vector_a_nodes_bigrams, vector_b_nodes_bigrams)
            jaccard = sim.compute_jaccard_coeff(patientA.get_all_unique_nodes(), patientB.get_all_unique_nodes())
            # Save the scores to their respective arrays
            tanimoto_edges_scores.append(tanimoto_edges)
            tanimoto_nodes_scores.append(tanimoto_nodes)
            tanimoto_bigrams_scores.append(tanimoto_bigrams)
            jaccard_scores.append(jaccard)
        # Find the maximum score (necessary because there may be no edges)
        max_edge_score = max(tanimoto_edges_scores)
        if max_edge_score == 0:
            tanimoto_edges_scores = [0 for i in tanimoto_edges_scores]
        else:
            tanimoto_edges_scores = [float(i) / max(tanimoto_edges_scores) for i in tanimoto_edges_scores]
        # Normalize all of the scores (tanimoto nodes, jaccard, and weighted tanimoto)
        tanimoto_nodes_scores = [float(i) / max(tanimoto_nodes_scores) for i in tanimoto_nodes_scores]
        tanimoto_bigrams_scores = [float(i) / max(tanimoto_bigrams_scores) for i in tanimoto_bigrams_scores]
        jaccard_scores = [float(i) / max(jaccard_scores) for i in jaccard_scores]
        tanimoto_weighted = [tanimoto_edges_scores[i] * 0.5 + tanimoto_bigrams_scores[i] * 0.5
                             for i in range(len(tanimoto_edges_scores))]
        tanimoto_edges_output[keyA] = sort_by_scores(tanimoto_edges_scores, other_patients)
        tanimoto_nodes_output[keyA] = sort_by_scores(tanimoto_nodes_scores, other_patients)
        tanimoto_bigrams_output[keyA] = sort_by_scores(tanimoto_bigrams_scores, other_patients)
        tanimoto_weighted_output[keyA] = sort_by_scores(tanimoto_weighted, other_patients)
        jaccard_output[keyA] = sort_by_scores(jaccard_scores, other_patients)
    query_users = 'select UserID from users'
    cursor.execute(query_users)
    for userID in cursor:
        users.append(userID[0])
except mysql.connector.Error as e:
    print('connect failed!{}'.format(e))

# print(movies[-1][0], users[-1][0])
# initialize the user-item matrix
print(users[-1], movies[-1])
dataMat = zeros((users[-1] + 1, movies[-1] + 1))
try:
    query_ratings = "select UserID, MovieID,Rating from ratings"
    cursor.execute(query_ratings)
    for userid, movieid, rating in cursor:
        # print(userid, movieid, rating)
        dataMat[userid, movieid] = rating
finally:
    cursor.close()
    conn.close()

# the result is a list of (index, rating) pairs
# print(Similarity.userSimiliar(mat(dataMat), 1, Similarity.cosSim))
result = Similarity.simBetweenUsers(mat(dataMat), users, Similarity.cosSim)
for user1 in users:
    for user2 in users:
        print(user1, user2, result[user1, user2])