def main():
    filename = "candidate_synonyms.txt"
    file = open(filename)
    for line in file:
        line = line.strip().split(',')
        print line
        if not dict_opener.has_key(line[0]):
            opener1 = Opener(line[0])
            dict_opener[line[0]] = opener1
        else:
            opener1 = dict_opener[line[0]]
        if not dict_opener.has_key(line[1]):
            opener2 = Opener(line[1])
            dict_opener[line[1]] = opener2
        else:
            opener2 = dict_opener[line[1]]
        if Similarity.similarity_eval_equal(line[0], opener1.get_related_searchs(),
                                            line[1], opener2.get_related_searchs()):
            f.write(line[0] + ";" + line[1] + ";" + "true\n")
        elif Similarity.similarity_eval_cosine(opener1.get_dict(), opener2.get_dict()):
            f.write(line[0] + ";" + line[1] + ";" + "true\n")
        else:
            f.write(line[0] + ";" + line[1] + ";" + "false\n")
class TestCalculatePairSimilarity:
    def setup(self):
        self.userDic = {1: [(1, 1), (2, 2), (3, 3)], 2: [(4, 4), (5, 5)]}
        self.similarity = Similarity(self.userDic)

    def test_with_normal_data(self):
        userI = [(1, -2), (2, -1), (3, 0)]
        userJ = [(1, 2), (2, 1), (4, 0)]
        similarity = self.similarity.calculatePairSimilarity(userI, userJ)
        assert_true(abs(similarity + 1.0) < 0.001)

    def test_with_no_common_data(self):
        userI = [(1, -2), (2, -1), (3, 0)]
        userJ = [(4, 2), (5, 1), (6, 2)]
        similarity = self.similarity.calculatePairSimilarity(userI, userJ)
        assert_equals(similarity, 0.0)

    def test_with_all_zero_value(self):
        userI = [(1, -2), (2, -1), (3, 0)]
        userJ = [(1, 0), (2, 0), (3, 0)]
        similarity = self.similarity.calculatePairSimilarity(userI, userJ)
        assert_equals(similarity, 0.0)
def get_list_url_similar(abs_url, length=10):
    fea = feature_all(abs_url)
    list_cosin = Similarity.distance_list(fea, matrix, cosine=True)
    list_eu = Similarity.distance_list(fea, matrix, cosine=False)
    list_url_cosin = get_top(list_cosin.flatten(), length)
    list_url_eu = get_top(list_eu.flatten(), length)
    return list_url_cosin, list_url_eu
def similarity():
    datatext = DataText()
    datatext.setPTBData()
    cooccurrence = Cooccurrence(datatext, 2)
    cooccurrence.createMatrix()
    cooccurrence.calcPMIfromCoMatrix()
    cooccurrence.reduceDimensions(100)
    similarity = Similarity(datatext, cooccurrence.matrix)
    print("n_sequence: ", len(datatext.words))
    print("n_words: ", datatext.n_words)
    queries = ["you", "year", "car", "toyota"]
    for query in queries:
        print("query: ", query)
        similarity.rankSimilarity(query)
def create_statistic(data_path):
    # Init classes
    sim = Similarity()

    # Load the data into a pandas data frame
    data_frame = pd.DataFrame.from_dict(data_path)

    # Filter out the questions that were not answered
    nao_respondidas = []
    # Expected fallback response
    i_dont_know = 'I don\'t know how to respond this'
    for index, row in data_frame.iterrows():
        val = sim.symmetric_sentence_similarity(i_dont_know, row.response)
        if val > 0.88:
            nao_respondidas.append([row.input, 1])

    total_nao_respondidas = len(nao_respondidas)

    # An activation of 0.55 is taken to mean the questions cover the same topic
    i = 0
    for ask in nao_respondidas:
        j = 0
        deletado = False
        for ask1 in nao_respondidas:
            val = sim.symmetric_sentence_similarity(ask[0], ask1[0])
            if val > 0.55:
                # Similar question: drop the duplicate and add its count to the first one
                if i != j:
                    del nao_respondidas[j]
                    deletado = True
                else:
                    deletado = False
                ask[1] += 1
            j += 1
        if not deletado:
            i += 1

    # Convert the list to a numpy array
    np_nao_respondidas = np.array(nao_respondidas, dtype="O")
    # Sort the numpy array
    np_nao_respondidas.sort(axis=0, kind='heapsort')
    # Keep only the top 10
    top_10 = np_nao_respondidas.tolist()[-10:][::-1]

    response = {"total": total_nao_respondidas, "top": top_10}
    return response
def calc_similarity(kpi, similarity_method='ecc', weights=(0.5, 0.5),
                    default_dict=dict(), default_value=0):
    # Pairwise similarity; the combined result is
    # Similarity(value) * weights[0] + Similarity(score) * weights[1].
    similarity_dict = default_dict if isinstance(default_dict, dict) else dict()
    for k1, v1 in kpi.items():
        for k2, v2 in kpi.items():
            if k1 not in similarity_dict:
                similarity_dict[k1] = dict()
            if k2 not in similarity_dict:
                similarity_dict[k2] = dict()
            if k1 == k2:
                similarity_dict[k1][k1] = [default_value, default_value, default_value]
            elif k2 in similarity_dict and k1 in similarity_dict[k2]:
                continue
            else:
                T0 = time.time()
                print([k1, k2], end=" ")
                s1 = Similarity(v1.value.values, v2.value.values).use_method(similarity_method)
                s2 = Similarity(v1.score.values, v2.score.values).use_method(similarity_method)
                print("[{1},{2}]:{0}".format(time.time() - T0, s1, s2), flush=True)
                combined = s1 * weights[0] + s2 * weights[1]
                similarity_dict[k1][k2] = [s1, s2, combined]
                similarity_dict[k2][k1] = [s1, s2, combined]
    print('finished calculating similarity. current dict size reaches {0} ({1})'.format(
        len(similarity_dict.keys()), time.asctime()), flush=True)
    return similarity_dict
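A hedged usage sketch for calc_similarity, assuming each KPI entry behaves like a pandas DataFrame with value and score columns (which the v1.value.values / v1.score.values accesses above imply); the KPI names and numbers are illustrative, not from the source.

import pandas as pd

# Two toy KPIs, each with a 'value' and a 'score' series.
kpi = {
    "kpi_a": pd.DataFrame({"value": [1.0, 2.0, 3.0], "score": [0.1, 0.2, 0.3]}),
    "kpi_b": pd.DataFrame({"value": [2.0, 4.0, 6.0], "score": [0.2, 0.4, 0.6]}),
}
sims = calc_similarity(kpi, similarity_method='ecc', weights=(0.5, 0.5))
print(sims["kpi_a"]["kpi_b"])  # [value similarity, score similarity, weighted combination]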
class TestSimilarity:
    def setup(self):
        self.userDic = {1: [(1, 1), (2, 1), (3, 4)], 2: [(1, 5), (2, 5), (3, 2), (4, 3)]}
        self.similarity = Similarity(self.userDic)

    def test_adjust_value(self):
        adjustedUserDic = self.similarity.adjustValue(self.userDic)
        assert_equals(adjustedUserDic,
                      {1: [(1, -2), (2, -2), (3, 1)],
                       2: [(1, 2), (2, 2), (3, -1), (4, 0)]})

    def test_calculate_similarity(self):
        similarityDict = self.similarity.calculateSimilarity()
        assert_equals(similarityDict[1][2], -1.0)
        assert_equals(similarityDict[2][1], -1.0)
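A minimal sketch of the Similarity class that the TestCalculatePairSimilarity and TestSimilarity cases above appear to assume. This is a hypothetical reconstruction from the assertions, not the project's actual implementation: the test data is consistent with adjustValue() subtracting the overall mean rating (3 in the fixture) and with the pair similarity being a cosine over the commonly rated items, returning 0.0 when there is no overlap or a zero norm.

from math import sqrt

class Similarity:
    def __init__(self, userDic):
        self.userDic = userDic

    def adjustValue(self, userDic):
        # Subtract the mean of all ratings from every (item, rating) pair.
        ratings = [r for pairs in userDic.values() for (_, r) in pairs]
        mean = sum(ratings) / float(len(ratings))
        return {u: [(i, r - mean) for (i, r) in pairs]
                for u, pairs in userDic.items()}

    def calculatePairSimilarity(self, userI, userJ):
        # Cosine similarity restricted to items rated by both users.
        dictJ = dict(userJ)
        common = [(v, dictJ[i]) for (i, v) in userI if i in dictJ]
        if not common:
            return 0.0
        dot = sum(a * b for a, b in common)
        normI = sqrt(sum(a * a for a, _ in common))
        normJ = sqrt(sum(b * b for _, b in common))
        if normI == 0 or normJ == 0:
            return 0.0
        return dot / (normI * normJ)

    def calculateSimilarity(self):
        # Pairwise user-user similarities on the mean-adjusted ratings.
        adjusted = self.adjustValue(self.userDic)
        sims = {u: {} for u in adjusted}
        for u in adjusted:
            for v in adjusted:
                if u != v:
                    sims[u][v] = self.calculatePairSimilarity(adjusted[u], adjusted[v])
        return sims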
def worker(line):
    line = line.strip().split(',')
    if not dict_opener.has_key(line[0]):
        opener1 = Opener(line[0])
        dict_opener[line[0]] = opener1
    else:
        opener1 = dict_opener[line[0]]
    if not dict_opener.has_key(line[1]):
        opener2 = Opener(line[1])
        dict_opener[line[1]] = opener2
    else:
        opener2 = dict_opener[line[1]]
    if Similarity.similarity_eval_equal(line[0], opener1.get_related_searchs(),
                                        line[1], opener2.get_related_searchs()):
        f.write(line[0] + ";" + line[1] + ";" + "true\n")
    elif Similarity.similarity_eval_cosine(opener1.get_dict(), opener2.get_dict()):
        f.write(line[0] + ";" + line[1] + ";" + "true\n")
    else:
        f.write(line[0] + ";" + line[1] + ";" + "false\n")
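The worker above relies on a module-level dict_opener cache and an output file f. A minimal serial driver, mirroring the main() variant earlier in this listing, might look like the following sketch; the output file name is an assumption.

# Driver sketch for worker(): dict_opener and f are the module-level objects
# the function expects. The result file name is an assumption.
dict_opener = {}
f = open("synonyms_result.txt", "w")
for line in open("candidate_synonyms.txt"):
    worker(line)
f.close()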
def print_equalities():
    Constants.SIMILARITY = Similarity()
    Constants.PARAPHRASE = True
    sentences = read_test_data(r"C:\Users\Nikolaj\PycharmProjects\LitteralJapaneseTranslation\data\sentences_dev.txt")
    for sentence in sentences:
        print([t.english for t in sentence.tokens])
        translations = LiteralJapanese.translate(sentence.japanese)
        print([t.english for t in translations])
        for translation in translations:
            for token in sentence.tokens:
                (equal, rule) = Equality.equals_rule(token.english, translation[1])
                if equal:
                    print(token.english + " = " + translation.english + "\t" + "(" + rule + ")")
def __init__(self, urls, onchange=None):
    # onchange is a callback invoked as onchange(url)
    ########################################################################
    # HYPERPARAMETERS
    self.datapath = "./data"
    self.periode = 10 * 60  # recheck interval in seconds (10 minutes)
    self.deltastart = 3  # start each thread with a delta of 3 seconds
    self.max_change_ratio = 0.10
    ########################################################################
    super(PagesChecker, self).__init__()
    self.isrunning = False
    self.urls = urls
    self.onchange = onchange
    self.threads = []
    self.sm = Similarity()
    if not os.path.isdir(self.datapath):
        os.mkdir(self.datapath, 0o755)
def main():
    similarity = Similarity()
    pg.init()
    display_width = 740
    display_height = 480
    clock = pg.time.Clock()
    gameDisplay = pg.display.set_mode([display_width, display_height])
    pg.display.set_caption('Face Dance Machine')
    faceDanceMachine = FaceDanceMachine(gameDisplay, similarity)
    faceDanceMachine.run()
    pg.quit()
class MainWidget(QWidget):
    def __init__(self):
        super().__init__()
        self._setup_ui()
        self._init_Sim()
        self.current_show_name = ''
        self.current_show_image_path = ''
        self.add_person_btn.clicked.connect(self._clilked_add_persion_btn_slot)
        self.find_person_btn.clicked.connect(self._clilked_find_persion_btn_slot)

    def _setup_ui(self):
        self.lefttop = (100, 100)
        self.size = (1200, 600)
        self.setGeometry(*self.lefttop, *self.size)
        self.add_person_btn = QPushButton('add')
        self.rm_person_btn = QPushButton('rm')
        self.find_person_btn = QPushButton('find')
        self.person_id_lable = QLabel('person_id')
        self.person_image_lable = QLabel('person_image')
        self.person_info_list = QListWidget()
        self.vbox_1L = QVBoxLayout(self)
        self.vbox_2L_btns = QHBoxLayout()
        self.vbox_2L_labels = QHBoxLayout()
        self.vbox_3L_lists = QVBoxLayout()
        self.vbox_1L.addLayout(self.vbox_2L_btns)
        self.vbox_1L.addLayout(self.vbox_2L_labels)
        self.vbox_2L_labels.addLayout(self.vbox_3L_lists, stretch=3)
        self.vbox_2L_btns.addWidget(self.add_person_btn)
        self.vbox_2L_btns.addWidget(self.rm_person_btn)
        self.vbox_2L_btns.addWidget(self.find_person_btn)
        self.vbox_2L_labels.addWidget(self.person_image_lable, stretch=7)
        self.vbox_3L_lists.addWidget(self.person_id_lable)
        self.vbox_3L_lists.addWidget(self.person_info_list)
        self.show()

    def _init_Sim(self):
        self.sim = Similarity()
        self.shelf_db = shelve.open('image_db.dat')

    def __del__(self):
        self.shelf_db.close()

    def add_person(self, image_path: str):
        _, name = self.sim.add_person(image_path)
        self.shelf_db[name] = image_path  # keyed by the returned person name

    def rm_person(self, person_id: str):
        if person_id in self.shelf_db.keys():
            del self.shelf_db[person_id]
            self.rm_persion(person_id)  # helper not defined in this snippet

    def find_person(self, image_path: str) -> Union[str, None]:
        name, score = self.sim.find_person(image_path)
        if score > 0.7:
            return name

    def update_image_ui(self):
        if self.current_show_image_path and self.current_show_name:
            self.person_id_lable.setText(self.current_show_name)
            img = QPixmap(self.current_show_image_path)
            if img.width() > 300:
                img = img.scaledToWidth(300)
            self.person_image_lable.setPixmap(img)
        else:
            self.person_id_lable.setText("none")
            self.person_image_lable.setPixmap(QPixmap())

    def update_person_list_ui(self):
        # for k, v in self.shelf_db.items():
        #     item = QListWidgetItem(self.person_info_list)
        pass

    def _clilked_add_persion_btn_slot(self):
        files, _ = QFileDialog.getOpenFileNames(
            self, 'select files', filter='jpg(*.jpg);;png(*.png)')
        for file in files:
            is_unique, name = self.sim.add_person(file)
            if is_unique:
                self.shelf_db[name] = file

    def _clilked_find_persion_btn_slot(self):
        files, _ = QFileDialog.getOpenFileNames(
            self, 'select files', filter='jpg(*.jpg);;png(*.png)')
        if files:
            file = files[0]
            name, score = self.sim.find_person(file)
            if score > 0.7:
                self.current_show_name = name
                self.current_show_image_path = self.shelf_db[name]
            else:
                self.current_show_name = ""
                self.current_show_image_path = ''
            self.update_image_ui()
import traceback

from flask import Flask, request, jsonify

from Similarity import Similarity

app = Flask(__name__)
sim = Similarity()


@app.route('/find_person', methods=['POST'])
def find_person():
    result = {}
    result['status'] = 1
    try:
        fp = request.files['img']
        person_id, face_sim = sim.find_person(fp)
        data = {}
        data['person_id'] = person_id
        data['face_sim'] = face_sim
        result['data'] = data
    except BaseException as e:
        traceback.print_exc()
        result['err'] = "{}".format(e)
        result['status'] = 0  # error
    return jsonify(result)


@app.route('/rm_person', methods=['POST'])
def rm_person():
    result = {}
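A hedged client-side sketch for exercising the /find_person route above. The host and port are assumptions; the response shape follows the result dict the handler builds.

import requests

def query_find_person(image_path, host="http://127.0.0.1:5000"):
    # Post the image under the 'img' field, as the route expects.
    with open(image_path, "rb") as img:
        resp = requests.post(host + "/find_person", files={"img": img})
    payload = resp.json()
    if payload.get("status") == 1:
        return payload["data"]["person_id"], payload["data"]["face_sim"]
    raise RuntimeError(payload.get("err", "unknown error"))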
class PagesChecker(object):
    """docstring for PagesChecker."""

    def __init__(self, urls, onchange=None):
        # onchange is a callback invoked as onchange(url)
        ########################################################################
        # HYPERPARAMETERS
        self.datapath = "./data"
        self.periode = 10 * 60  # recheck interval in seconds (10 minutes)
        self.deltastart = 3  # start each thread with a delta of 3 seconds
        self.max_change_ratio = 0.10
        ########################################################################
        super(PagesChecker, self).__init__()
        self.isrunning = False
        self.urls = urls
        self.onchange = onchange
        self.threads = []
        self.sm = Similarity()
        if not os.path.isdir(self.datapath):
            os.mkdir(self.datapath, 0o755)

    def run(self):
        self.isrunning = True
        urls = self.urls
        self.urls = []
        for url in urls:
            self.addNewChecker(url)
            time.sleep(self.deltastart)

    def stop(self):
        self.isrunning = False
        for thread in self.threads:
            thread.stop()

    def state(self):
        print("Got " + str(len(self.threads)) + " running !")

    def getFileNameFromUrl(self, url):
        nurl = (url.replace("https://", "").replace("http://", "")
                   .replace("/", "-").replace(".html", "").replace(".php", "")
                   .replace(".js", "").replace(".", "_"))
        return self.datapath + "/" + nurl

    def formatDate(self):
        return '{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())

    def saveState(self, url, data):
        f = open(self.getFileNameFromUrl(url), "w+")
        f.truncate(0)
        f.seek(0)
        f.write(json.dumps(data))
        f.close()

    def strIsSame(self, html1, html2):
        if len(html1) == len(html2):
            return self.sm.isSimilar(html1, html2)
        else:
            return False

    def addNewChecker(self, url):
        if not os.path.exists(self.getFileNameFromUrl(url)):
            f = open(self.getFileNameFromUrl(url), "w+")
            f.write(json.dumps({
                "url": url,
                "created_at": self.formatDate(),
                "html": "",
                "last_check": "0",
                "nb_change": 0
            }))
            f.close()
        t = threading.Thread(target=self.worker, args=(url, ))
        self.threads.append(t)
        self.urls.append(url)
        t.start()

    def stopChecker(self, url):
        for i in range(0, len(self.urls)):
            if self.urls[i] == url:
                self.threads[i].stop()
                self.threads.pop(i)
                self.urls.pop(i)
                break

    def worker(self, url):
        html = str(requests.get(url).content, "utf-8")
        datafile = open(self.getFileNameFromUrl(url), "r+")
        raw = datafile.read()
        data = json.loads(raw)
        datafile.close()
        #isSame = self.strIsSame(html, data["html"])
        distance = sum(
            [1 for x, y in zip(html, data["html"]) if x.lower() != y.lower()])
        change_ratio = distance * 2 / (len(html) + len(data["html"]))
        if change_ratio > self.max_change_ratio or data["html"] == "":
            data["nb_change"] += 1
            data["html"] = html
            if self.onchange is not None:
                self.onchange(url)
        data["last_check"] = self.formatDate()
        self.saveState(url, data)
        time.sleep(self.periode)
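A minimal usage sketch for PagesChecker, assuming the class is importable from this module; the URLs and the onchange callback are illustrative.

def notify(url):
    # Called whenever a monitored page changes by more than max_change_ratio.
    print("page changed:", url)

if __name__ == "__main__":
    checker = PagesChecker(["https://example.com", "https://example.org"], onchange=notify)
    checker.run()    # starts one checking thread per URL, staggered by deltastart seconds
    checker.state()  # prints how many checkers are running
    # ...
    checker.stop()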
print "Loading Data" mentions = [ convert_json(json.loads(x)) for x in open("Data/Mentions.json") ] mid_mention = dict([(x['mid'], x) for x in mentions]) pairs = json.load( open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/Train_Test_Pairs")) mid_eid = dict([ apply(lambda x, y: (int(x), y), line.strip().split("\t")) for line in open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/mid_eid") ]) loaded = True print "Computing Training Similarities" s = Similarity() labels = [] features = [] for (mid0, mid1) in pairs['training']: labels.append(int(mid_eid[mid0] == mid_eid[mid1])) features.append(s.compute(mid_mention[mid0], mid_mention[mid1])) #Adding features here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # A little hack so that first class is always 1: labels.insert(0, 1) features.insert(0, {}) print "Active Similarities:" print "\t".join(reduce(lambda x, y: x | y, [set(x.keys()) for x in features])) print "Training" # We need to convert string feature names to feature numbers for liblinear:
print "Using Default output Results/Test"; out_name = "Test"; else: out_name = sys.argv[1]; try: loaded except NameError: print "Loading Data"; mentions = [convert_json(json.loads(x)) for x in open("Data/Mentions.json")]; mid_mention = dict([(x['mid'],x) for x in mentions]); pairs = json.load(open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/Train_Test_Pairs")); mid_eid = dict([apply(lambda x,y:(int(x),y),line.strip().split("\t")) for line in open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/mid_eid")]); loaded = True; print "Computing Training Similarities"; s = Similarity(); labels = []; features = []; for (mid0,mid1) in pairs['training']: labels.append(int(mid_eid[mid0] == mid_eid[mid1])); features.append(s.compute(mid_mention[mid0],mid_mention[mid1])); #Adding features here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # A little hack so that first class is always 1: labels.insert(0,1); features.insert(0,{}); print "Active Similarities:" print "\t".join(reduce(lambda x,y:x|y, [set(x.keys()) for x in features])); print "Training"; # We need to convert string feature names to feature numbers for liblinear:
def build(self):
    log.info('building rnn cell....')
    if self.cell == 'gru':
        recurent_x = GRU(self.rng, self.n_input, self.n_hidden, self.x, self.E,
                         self.xmask, self.is_train, self.dropout)
        recurent_y = GRU(self.rng, self.n_input, self.n_hidden, self.y, self.E,
                         self.ymask, self.is_train, self.dropout)
    elif self.cell == 'lstm':
        recurent_x = LSTM(self.rng, self.n_input, self.n_hidden, self.x, self.E,
                          self.xmask, self.is_train, self.dropout)
        recurent_y = LSTM(self.rng, self.n_input, self.n_hidden, self.y, self.E,
                          self.ymask, self.is_train, self.dropout)

    log.info('build the sim matrix....')
    sim_layer = Similarity(recurent_x.activation, recurent_y.activation, metrics=self.sim)

    log.info('building convolution pooling layer....')
    conv_pool_layer = ConvPool(
        input=sim_layer.activation,
        filter_shape=(2, 1, 3, 3),  # feature_maps, 1, filter_h, filter_w
        input_shape=(self.batch_size, 1, 50, 50))  # sim_layer.activation.shape
    projected_layer = basicLayer(conv_pool_layer.activation, input_shape=1152)

    rav_cost = T.nnet.binary_crossentropy(projected_layer.activation, self.label)
    cost = T.mean(rav_cost)
    acc = T.eq(projected_layer.activation > 0.5, self.label)
    log.info('cost calculated.....')

    self.params = [self.E, ]
    self.params += recurent_x.params
    self.params += recurent_y.params
    self.params += conv_pool_layer.params
    self.params += projected_layer.params

    lr = T.scalar('lr')
    gparams = [T.clip(T.grad(cost, p), -3, 3) for p in self.params]
    #gparams = [T.grad(cost, p) for p in self.params]
    if self.optimizer == 'sgd':
        updates = sgd(self.params, gparams, lr)
    elif self.optimizer == 'adam':
        updates = adam(self.params, gparams, lr)
    elif self.optimizer == 'rmsprop':
        updates = rmsprop(self.params, gparams, lr)
    log.info('gradient calculated.....')

    self.train = theano.function(
        inputs=[self.x, self.xmask, self.y, self.ymask, self.label, lr],
        outputs=[cost, acc],
        updates=updates,
        givens={self.is_train: np.cast['int32'](1)})
    self.predict = theano.function(
        inputs=[self.x, self.xmask, self.y, self.ymask, self.label],
        outputs=[rav_cost, acc],
        givens={self.is_train: np.cast['int32'](0)})
    self.test = theano.function(
        inputs=[self.x, self.xmask, self.y, self.ymask],
        outputs=projected_layer.activation,
        givens={self.is_train: np.cast['int32'](0)})
__credits__ = ["Samuel de Oliveira Gamito"] __license__ = "GPL" __maintainer__ = "Samuel de Oliveira Gamito" __email__ = "*****@*****.**" __status__ = "Production" from Similarity import Similarity import pandas as pd import numpy as np import sys import json if __name__ == "__main__": sys.stderr.close() #Close error output #Init classes sim = Similarity() if (not sys.argv[1] or sys.argv[1] == ""): print("erro") #Load path from arg data_path = sys.argv[1] #Load vectoro to data frame pandas data_frame = pd.read_json(data_path, orient='records') #Loop para filtrar as perguntas não respondidas nao_respondidas = [] #Padrão de resposta esperada i_dont_know = 'I don\'t know how to respond this' for index, row in data_frame.iterrows(): val = sim.symmetric_sentence_similarity(i_dont_know, row.resposta)
def setup(self):
    self.userDic = {1: [(1, 1), (2, 2), (3, 3)], 2: [(4, 4), (5, 5)]}
    self.similarity = Similarity(self.userDic)
def _init_Sim(self):
    self.sim = Similarity()
    self.shelf_db = shelve.open('image_db.dat')
def setup(self):
    self.userDic = {1: [(1, 1), (2, 1), (3, 4)], 2: [(1, 5), (2, 5), (3, 2), (4, 3)]}
    self.similarity = Similarity(self.userDic)
#responseDoc = fileName5
#requestDoc = 'abstract.txt'
requestDoc = fileName5

readDoc = ReadDocument()
print("Reading input document")
fileContent = readDoc.readFile(requestDoc)
keywords = readDoc.getKeywords(fileContent)
#print(keywords)
#print(len(keywords))

webSearch = WebSearch()
print("crawling the web")
webSearch.google_search(keywords)
#webSearch.trialSearch()

checkSimilarity = Similarity()
print("checking similarity")
value = checkSimilarity.similarValue(requestDoc, responseDoc)

f = open(responseDoc, 'r+')
f.truncate()

print("----------SIMILARITY-----------")
#print(value)
resultValue = str(round((value * 100), 2)) + "%"
#print(str(round((value*100),2))+"%")
print(resultValue)

file = open('similarityValue.txt', 'w', encoding='utf-8')
file.write(str(resultValue))
print("-------------------------------")