import time


def calc_similarity(kpi, similarity_method='ecc', weights=(0.5, 0.5),
                    default_dict=None, default_value=0):
    # Compute pairwise similarity; the combined result is
    # Similarity(value) * weights[0] + Similarity(score) * weights[1].
    similarity_dict = default_dict if isinstance(default_dict, dict) else dict()
    for k1, v1 in kpi.items():
        for k2, v2 in kpi.items():
            if k1 not in similarity_dict:
                similarity_dict[k1] = dict()
            if k2 not in similarity_dict:
                similarity_dict[k2] = dict()
            if k1 == k2:
                similarity_dict[k1][k1] = [default_value, default_value, default_value]
            elif k1 in similarity_dict[k2]:
                # The pair was already computed in the symmetric direction.
                continue
            else:
                t0 = time.time()
                print([k1, k2], end=" ")
                s1 = Similarity(v1.value.values, v2.value.values).use_method(similarity_method)
                s2 = Similarity(v1.score.values, v2.score.values).use_method(similarity_method)
                print("[{1},{2}]:{0}".format(time.time() - t0, s1, s2), flush=True)
                combined = s1 * weights[0] + s2 * weights[1]
                similarity_dict[k1][k2] = [s1, s2, combined]
                similarity_dict[k2][k1] = [s1, s2, combined]
    print('finished calculating similarity; current dict size is {0} ({1})'.format(
        len(similarity_dict), time.asctime()), flush=True)
    return similarity_dict
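# A minimal, hypothetical usage sketch of calc_similarity. It assumes each kpi
# entry is a pandas DataFrame with 'value' and 'score' columns, and it stubs
# the external Similarity class with a Pearson-correlation stand-in for the
# 'ecc' method; the real Similarity implementation is not shown in this file.
import numpy as np
import pandas as pd


class Similarity:
    def __init__(self, a, b):
        self.a = np.asarray(a, dtype=float)
        self.b = np.asarray(b, dtype=float)

    def use_method(self, method):
        # Stand-in: Pearson correlation regardless of the requested method.
        return float(np.corrcoef(self.a, self.b)[0, 1])


kpi = {
    'cpu': pd.DataFrame({'value': [1.0, 2.0, 3.0], 'score': [0.1, 0.2, 0.3]}),
    'mem': pd.DataFrame({'value': [3.0, 2.0, 1.0], 'score': [0.3, 0.2, 0.1]}),
}
print(calc_similarity(kpi))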
def print_equalities():
    Constants.SIMILARITY = Similarity()
    Constants.PARAPHRASE = True
    sentences = read_test_data(
        r"C:\Users\Nikolaj\PycharmProjects\LitteralJapaneseTranslation\data\sentences_dev.txt")
    for sentence in sentences:
        print([t.english for t in sentence.tokens])
        translations = LiteralJapanese.translate(sentence.japanese)
        print([t.english for t in translations])
        for translation in translations:
            for token in sentence.tokens:
                (equal, rule) = Equality.equals_rule(token.english, translation.english)
                if equal:
                    print(token.english + " = " + translation.english + "\t" + "(" + rule + ")")
def similarity():
    datatext = DataText()
    datatext.setPTBData()
    cooccurrence = Cooccurrence(datatext, 2)
    cooccurrence.createMatrix()
    cooccurrence.calcPMIfromCoMatrix()
    cooccurrence.reduceDimensions(100)
    # Avoid shadowing this function's own name with the local instance.
    sim = Similarity(datatext, cooccurrence.matrix)
    print("n_sequence: ", len(datatext.words))
    print("n_words: ", datatext.n_words)
    queries = ["you", "year", "car", "toyota"]
    for query in queries:
        print("query: ", query)
        sim.rankSimilarity(query)
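# A rough sketch of the PMI step that calcPMIfromCoMatrix performs, assuming a
# plain co-occurrence count matrix C; this is illustrative only, not the actual
# Cooccurrence implementation (which is defined elsewhere).
import numpy as np


def pmi(C, eps=1e-8):
    total = C.sum()
    row = C.sum(axis=1, keepdims=True)
    col = C.sum(axis=0, keepdims=True)
    # PMI(i, j) = log2( P(i, j) / (P(i) * P(j)) ), clipped at 0 (positive PMI).
    pmi_matrix = np.log2((C * total) / (row * col + eps) + eps)
    return np.maximum(pmi_matrix, 0)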
def main():
    similarity = Similarity()
    pg.init()
    display_width = 740
    display_height = 480
    clock = pg.time.Clock()
    gameDisplay = pg.display.set_mode((display_width, display_height))
    pg.display.set_caption('Face Dance Machine')
    faceDanceMachine = FaceDanceMachine(gameDisplay, similarity)
    faceDanceMachine.run()
    pg.quit()
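# For context, a hypothetical sketch of what FaceDanceMachine.run() might look
# like; the real class is defined elsewhere. It assumes a standard pygame event
# loop driven by the display surface and the Similarity helper passed in above.
import pygame as pg


class FaceDanceMachine:
    def __init__(self, display, similarity):
        self.display = display
        self.similarity = similarity
        self.clock = pg.time.Clock()

    def run(self):
        running = True
        while running:
            for event in pg.event.get():
                if event.type == pg.QUIT:
                    running = False
            self.display.fill((0, 0, 0))  # clear the frame
            pg.display.update()
            self.clock.tick(30)  # cap at 30 FPS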
import pandas as pd


def create_statistic(data_path):
    # Init classes
    sim = Similarity()
    # Load the data into a pandas data frame
    data_frame = pd.DataFrame.from_dict(data_path)
    # Filter out the questions that went unanswered
    nao_respondidas = []
    # Expected fallback response
    i_dont_know = 'I don\'t know how to respond this'
    for index, row in data_frame.iterrows():
        val = sim.symmetric_sentence_similarity(i_dont_know, row.response)
        if val > 0.88:
            nao_respondidas.append([row.input, 1])
    total_nao_respondidas = len(nao_respondidas)
    # A 0.55 activation threshold is used, so questions on the same topic are
    # merged into a single entry.
    i = 0
    while i < len(nao_respondidas):
        ask = nao_respondidas[i]
        j = i + 1
        while j < len(nao_respondidas):
            val = sim.symmetric_sentence_similarity(ask[0], nao_respondidas[j][0])
            if val > 0.55:
                # Similar question: remove it and count it on the first one.
                del nao_respondidas[j]
                ask[1] += 1
            else:
                j += 1
        i += 1
    # Sort by count and keep the ten most frequent unanswered questions.
    nao_respondidas.sort(key=lambda item: item[1], reverse=True)
    top_10 = nao_respondidas[:10]
    response = {"total": total_nao_respondidas, "top": top_10}
    return response
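# A hypothetical, self-contained call to create_statistic. The real Similarity
# class lives elsewhere; here a crude token-overlap (Jaccard) stub stands in
# for symmetric_sentence_similarity so the flow can be exercised end to end.
class Similarity:
    def symmetric_sentence_similarity(self, a, b):
        ta, tb = set(a.lower().split()), set(b.lower().split())
        return len(ta & tb) / len(ta | tb) if ta | tb else 0.0


logs = {
    "input": ["what is your name", "how old are you"],
    "response": ["I don't know how to respond this", "I am two years old"],
}
print(create_statistic(logs))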
def __init__(self, urls, onchange=None):
    # onchange is a callback with signature onchange(url)
    ########################################################################
    # HYPERPARAMETERS
    self.datapath = "./data"
    self.periode = 10 * 60  # check every 10 minutes
    self.deltastart = 3  # stagger thread starts by 3 seconds
    self.max_change_ratio = 0.10
    ########################################################################
    super(PagesChecker, self).__init__()
    self.isrunning = False
    self.urls = urls
    self.onchange = onchange
    self.threads = []
    self.sm = Similarity()
    # Create the data directory if it does not already exist as a directory.
    if not os.path.isdir(self.datapath):
        os.mkdir(self.datapath, 0o755)
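# A hypothetical usage sketch, assuming PagesChecker is a threading.Thread
# subclass (the super().__init__() call above suggests it) so that start() is
# inherited; the URL and callback are illustrative.
def alert(url):
    print("page changed:", url)


checker = PagesChecker(["https://example.com"], onchange=alert)
checker.start()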
import traceback

from flask import Flask, request, jsonify

from Similarity import Similarity

app = Flask(__name__)
sim = Similarity()


@app.route('/find_person', methods=['POST'])
def find_person():
    result = {}
    result['status'] = 1
    try:
        fp = request.files['img']
        person_id, face_sim = sim.find_person(fp)
        data = {}
        data['person_id'] = person_id
        data['face_sim'] = face_sim
        result['data'] = data
    except Exception as e:
        traceback.print_exc()
        result['err'] = "{}".format(e)
        result['status'] = 0  # error
    return jsonify(result)


@app.route('/rm_person', methods=['POST'])
def rm_person():
    result = {}
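# A hypothetical client call against the /find_person endpoint above, using
# the requests library; 'face.jpg' and the host/port are assumptions.
import requests

with open('face.jpg', 'rb') as f:
    resp = requests.post('http://localhost:5000/find_person', files={'img': f})
print(resp.json())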
print "Loading Data" mentions = [ convert_json(json.loads(x)) for x in open("Data/Mentions.json") ] mid_mention = dict([(x['mid'], x) for x in mentions]) pairs = json.load( open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/Train_Test_Pairs")) mid_eid = dict([ apply(lambda x, y: (int(x), y), line.strip().split("\t")) for line in open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/mid_eid") ]) loaded = True print "Computing Training Similarities" s = Similarity() labels = [] features = [] for (mid0, mid1) in pairs['training']: labels.append(int(mid_eid[mid0] == mid_eid[mid1])) features.append(s.compute(mid_mention[mid0], mid_mention[mid1])) #Adding features here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # A little hack so that first class is always 1: labels.insert(0, 1) features.insert(0, {}) print "Active Similarities:" print "\t".join(reduce(lambda x, y: x | y, [set(x.keys()) for x in features])) print "Training" # We need to convert string feature names to feature numbers for liblinear:
def build(self):
    log.info('building rnn cell....')
    if self.cell == 'gru':
        recurent_x = GRU(self.rng, self.n_input, self.n_hidden, self.x,
                         self.E, self.xmask, self.is_train, self.dropout)
        recurent_y = GRU(self.rng, self.n_input, self.n_hidden, self.y,
                         self.E, self.ymask, self.is_train, self.dropout)
    elif self.cell == 'lstm':
        recurent_x = LSTM(self.rng, self.n_input, self.n_hidden, self.x,
                          self.E, self.xmask, self.is_train, self.dropout)
        recurent_y = LSTM(self.rng, self.n_input, self.n_hidden, self.y,
                          self.E, self.ymask, self.is_train, self.dropout)

    log.info('build the sim matrix....')
    sim_layer = Similarity(recurent_x.activation, recurent_y.activation,
                           metrics=self.sim)

    log.info('building convolution pooling layer....')
    conv_pool_layer = ConvPool(
        input=sim_layer.activation,
        filter_shape=(2, 1, 3, 3),  # (feature_maps, 1, filter_h, filter_w)
        input_shape=(self.batch_size, 1, 50, 50))

    projected_layer = basicLayer(conv_pool_layer.activation, input_shape=1152)

    rav_cost = T.nnet.binary_crossentropy(projected_layer.activation, self.label)
    cost = T.mean(rav_cost)
    acc = T.eq(projected_layer.activation > 0.5, self.label)
    log.info('cost calculated.....')

    self.params = [self.E]
    self.params += recurent_x.params
    self.params += recurent_y.params
    self.params += conv_pool_layer.params
    self.params += projected_layer.params

    lr = T.scalar('lr')
    # Clip gradients to [-3, 3] to keep them from exploding.
    gparams = [T.clip(T.grad(cost, p), -3, 3) for p in self.params]
    if self.optimizer == 'sgd':
        updates = sgd(self.params, gparams, lr)
    elif self.optimizer == 'adam':
        updates = adam(self.params, gparams, lr)
    elif self.optimizer == 'rmsprop':
        updates = rmsprop(self.params, gparams, lr)
    log.info('gradient calculated.....')

    self.train = theano.function(
        inputs=[self.x, self.xmask, self.y, self.ymask, self.label, lr],
        outputs=[cost, acc],
        updates=updates,
        givens={self.is_train: np.cast['int32'](1)})
    self.predict = theano.function(
        inputs=[self.x, self.xmask, self.y, self.ymask, self.label],
        outputs=[rav_cost, acc],
        givens={self.is_train: np.cast['int32'](0)})
    self.test = theano.function(
        inputs=[self.x, self.xmask, self.y, self.ymask],
        outputs=projected_layer.activation,
        givens={self.is_train: np.cast['int32'](0)})
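# A hypothetical training loop over the compiled Theano functions above; the
# model instance, iterate_minibatches helper, and learning rate are all
# assumptions, not part of the class shown here.
for epoch in range(10):
    for x, xmask, y, ymask, label in iterate_minibatches(train_data, batch_size=32):
        cost, acc = model.train(x, xmask, y, ymask, label, 0.001)
    print("epoch {}: last-batch cost={:.4f}".format(epoch, float(cost)))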
def _init_Sim(self):
    self.sim = Similarity()
    self.shelf_db = shelve.open('image_db.dat')
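# For reference, a small illustration of the shelve persistence used above;
# the 'features' key is an assumption about what the image DB might store.
import shelve

with shelve.open('image_db.dat') as db:
    db['person_1'] = {'features': [0.1, 0.2, 0.3]}
    print(db.get('person_1'))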
# responseDoc = fileName5
# requestDoc = 'abstract.txt'
# NOTE: responseDoc must be assigned before the similarity check below; it is
# the document the web-search step produces for comparison.
requestDoc = fileName5

readDoc = ReadDocument()
print("Reading input document")
fileContent = readDoc.readFile(requestDoc)
keywords = readDoc.getKeywords(fileContent)

webSearch = WebSearch()
print("crawling the web")
webSearch.google_search(keywords)

checkSimilarity = Similarity()
print("checking similarity")
value = checkSimilarity.similarValue(requestDoc, responseDoc)

# Empty the response document once the score has been computed.
with open(responseDoc, 'r+') as f:
    f.truncate()

print("----------SIMILARITY-----------")
resultValue = str(round(value * 100, 2)) + "%"
print(resultValue)
with open('similarityValue.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(resultValue)
print("-------------------------------")
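# The Similarity class used above is external to this script. As a rough
# illustration only, similarValue could be implemented as cosine similarity
# over term-frequency vectors of the two documents, sketched here under that
# assumption; it is not the actual implementation.
import math
from collections import Counter


def cosine_doc_similarity(path_a, path_b):
    with open(path_a, encoding='utf-8') as fa, open(path_b, encoding='utf-8') as fb:
        ta = Counter(fa.read().lower().split())
        tb = Counter(fb.read().lower().split())
    dot = sum(ta[w] * tb[w] for w in ta)
    norm_a = math.sqrt(sum(c * c for c in ta.values()))
    norm_b = math.sqrt(sum(c * c for c in tb.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0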