def main():
    """Classify candidate synonym pairs from candidate_synonyms.txt.

    Each input line is "term1,term2". For every term an Opener is created
    once and cached in the module-level dict_opener; the verdict is written
    to the module-level file handle f as "term1;term2;true|false".

    NOTE(review): dict_opener, f, Opener and Similarity are expected to be
    module-level names defined elsewhere in this file — confirm.
    """
    filename = "candidate_synonyms.txt"
    # `with` closes the input file (the original leaked the handle);
    # `in` replaces the Python-2-only dict.has_key().
    with open(filename) as infile:
        for raw in infile:
            pair = raw.strip().split(',')
            print(pair)

            # Cache one Opener per term so repeated terms are fetched once.
            if pair[0] not in dict_opener:
                dict_opener[pair[0]] = Opener(pair[0])
            opener1 = dict_opener[pair[0]]

            if pair[1] not in dict_opener:
                dict_opener[pair[1]] = Opener(pair[1])
            opener2 = dict_opener[pair[1]]

            # Exact-match test first, cosine similarity as fallback; a single
            # write replaces the three duplicated f.write branches.
            if Similarity.similarity_eval_equal(pair[0], opener1.get_related_searchs(),
                                                pair[1], opener2.get_related_searchs()):
                verdict = "true"
            elif Similarity.similarity_eval_cosine(opener1.get_dict(), opener2.get_dict()):
                verdict = "true"
            else:
                verdict = "false"
            f.write(pair[0] + ";" + pair[1] + ";" + verdict + "\n")
class TestCalculatePairSimilarity:
    """Tests for Similarity.calculatePairSimilarity on mean-centered vectors."""

    def setup(self):
        # user id -> list of (item id, rating) pairs.
        self.userDic = {1: [(1, 1), (2, 2), (3, 3)], 2: [(4, 4), (5, 5)]}
        self.similarity = Similarity(self.userDic)

    def test_with_normal_data(self):
        # Opposed ratings on the shared items 1 and 2 -> similarity close to -1.
        user_i = [(1, -2), (2, -1), (3, 0)]
        user_j = [(1, 2), (2, 1), (4, 0)]
        result = self.similarity.calculatePairSimilarity(user_i, user_j)
        assert_true(abs(result + 1.0) < 0.001)

    def test_with_no_common_data(self):
        # Disjoint item sets -> similarity is defined as 0.
        user_i = [(1, -2), (2, -1), (3, 0)]
        user_j = [(4, 2), (5, 1), (6, 2)]
        result = self.similarity.calculatePairSimilarity(user_i, user_j)
        assert_equals(result, 0.0)

    def test_with_all_zero_value(self):
        # A zero vector has no direction -> similarity is defined as 0.
        user_i = [(1, -2), (2, -1), (3, 0)]
        user_j = [(1, 0), (2, 0), (3, 0)]
        result = self.similarity.calculatePairSimilarity(user_i, user_j)
        assert_equals(result, 0.0)
def get_list_url_similar(abs_url, length=10):
    """Return the `length` closest URLs to `abs_url` under both metrics.

    Returns a (cosine_top, euclidean_top) pair computed against the
    module-level `matrix` of precomputed features.
    """
    features = feature_all(abs_url)

    # Distance of the query features to every row, under each metric.
    by_cosine = Similarity.distance_list(features, matrix, cosine=True)
    by_euclid = Similarity.distance_list(features, matrix, cosine=False)

    top_cosine = get_top(by_cosine.flatten(), length)
    top_euclid = get_top(by_euclid.flatten(), length)

    return top_cosine, top_euclid
Example #4
0
def similarity():
    """Build a PMI-based word-similarity model over PTB and rank queries.

    Pipeline: co-occurrence matrix (window 2) -> PMI -> SVD to 100 dims,
    then print the most similar words for a few sample queries.
    """
    datatext = DataText()
    datatext.setPTBData()

    cooccurrence = Cooccurrence(datatext, 2)
    cooccurrence.createMatrix()
    cooccurrence.calcPMIfromCoMatrix()
    cooccurrence.reduceDimensions(100)

    sim = Similarity(datatext, cooccurrence.matrix)

    print("n_sequense: ", len(datatext.words))
    print("n_words: ", datatext.n_words)

    for query in ["you", "year", "car", "toyota"]:
        print("query: ", query)
        sim.rankSimilarity(query)
Example #5
0
def create_statistic(data_path):
    """Summarize which questions the bot failed to answer.

    Parameters
    ----------
    data_path : mapping accepted by pandas.DataFrame.from_dict; rows must
        expose `.input` (question) and `.response` (bot answer).

    Returns
    -------
    dict with:
        "total": number of unanswered questions found,
        "top":   up to 10 [question, count] pairs, most frequent topic first.
    """
    sim = Similarity()
    data_frame = pd.DataFrame.from_dict(data_path)

    # Collect inputs whose response is essentially "I don't know ...".
    nao_respondidas = []
    i_dont_know = 'I don\'t know how to respond this'
    for index, row in data_frame.iterrows():
        val = sim.symmetric_sentence_similarity(i_dont_know, row.response)
        if val > 0.88:
            nao_respondidas.append([row.input, 1])

    total_nao_respondidas = len(nao_respondidas)

    # Group questions about the same topic (similarity > 0.55): keep the
    # first occurrence and add each duplicate's count to it.
    # (The original mutated the list while iterating it, which skipped
    # elements, desynced the index `j`, and could raise UnboundLocalError
    # on `deletado` for an empty list.)
    merged = []
    for ask in nao_respondidas:
        for kept in merged:
            if sim.symmetric_sentence_similarity(kept[0], ask[0]) > 0.55:
                kept[1] += 1
                break
        else:
            merged.append(ask)

    # Top 10 topics by occurrence count, most frequent first.
    # (The original numpy sort with axis=0 sorted questions and counts
    # independently, breaking the question<->count pairing.)
    top_10 = sorted(merged, key=lambda item: item[1], reverse=True)[:10]

    return {"total": total_nao_respondidas, "top": top_10}
def calc_similarity(kpi,
                    similarity_method='ecc',
                    weights=(0.5, 0.5),
                    default_dict=None,
                    default_value=0):
    """Compute pairwise similarities between all KPI series.

    For every pair (k1, k2) the similarity of the `value` series and of the
    `score` series is computed and combined as
    s_value * weights[0] + s_score * weights[1].

    Parameters
    ----------
    kpi : dict mapping key -> object exposing `.value.values` and `.score.values`
    similarity_method : method name forwarded to Similarity.use_method
    weights : (value_weight, score_weight) for the combined score
    default_dict : optional dict to accumulate results into; a fresh dict is
        used when omitted. (The original `default_dict=dict()` was a mutable
        default silently shared across calls.)
    default_value : placeholder stored for the diagonal (k, k) entries

    Returns
    -------
    dict of dicts: similarity_dict[k1][k2] == [s_value, s_score, combined],
    symmetric in k1/k2.
    """
    similarity_dict = default_dict if isinstance(default_dict,
                                                 dict) else dict()

    for k1, v1 in kpi.items():
        if k1 not in similarity_dict:
            similarity_dict[k1] = dict()
        for k2, v2 in kpi.items():
            if k2 not in similarity_dict:
                similarity_dict[k2] = dict()
            if k1 == k2:
                # Diagonal: no real comparison, store the placeholder triple.
                similarity_dict[k1][k1] = [
                    default_value, default_value, default_value
                ]
            elif k1 in similarity_dict[k2]:
                # Already computed as (k2, k1); the result is symmetric.
                continue
            else:
                T0 = time.time()
                print([k1, k2], end=" ")
                s1 = Similarity(v1.value.values,
                                v2.value.values).use_method(similarity_method)
                s2 = Similarity(v1.score.values,
                                v2.score.values).use_method(similarity_method)
                print("[{1},{2}]:{0}".format(time.time() - T0, s1, s2),
                      flush=True)
                combined = s1 * weights[0] + s2 * weights[1]
                similarity_dict[k1][k2] = [s1, s2, combined]
                similarity_dict[k2][k1] = [s1, s2, combined]
    print('finish calculate similarity. current dict size reaches {0} ({1})'.
          format(len(similarity_dict.keys()), time.asctime()),
          flush=True)
    return similarity_dict
class TestSimilarity:
    """Tests for Similarity.adjustValue and Similarity.calculateSimilarity."""

    def setup(self):
        # user id -> list of (item id, rating) pairs with overlapping items.
        self.userDic = {
            1: [(1, 1), (2, 1), (3, 4)],
            2: [(1, 5), (2, 5), (3, 2), (4, 3)],
        }
        self.similarity = Similarity(self.userDic)

    def test_adjust_value(self):
        # Ratings are mean-centered per user (user 1 mean = 3, user 2 mean = 3.75 -> rounded).
        expected = {
            1: [(1, -2), (2, -2), (3, 1)],
            2: [(1, 2), (2, 2), (3, -1), (4, 0)],
        }
        assert_equals(self.similarity.adjustValue(self.userDic), expected)

    def test_calculate_similarity(self):
        # The similarity matrix is symmetric; these users are perfectly opposed.
        result = self.similarity.calculateSimilarity()
        assert_equals(result[1][2], -1.0)
        assert_equals(result[2][1], -1.0)
def worker(line):
    """Classify one candidate synonym pair given as the line "term1,term2".

    Looks each term's Opener up in the shared module-level cache
    dict_opener (creating it on first use) and appends the verdict
    "term1;term2;true|false" to the module-level file handle f.

    NOTE(review): dict_opener, f, Opener and Similarity are expected to be
    module-level names defined elsewhere in this file; dict_opener/f are
    shared mutable state, so concurrent workers need external locking —
    confirm.
    """
    pair = line.strip().split(',')

    # `in` replaces the Python-2-only dict.has_key(); cache one Opener per term.
    if pair[0] not in dict_opener:
        dict_opener[pair[0]] = Opener(pair[0])
    opener1 = dict_opener[pair[0]]

    if pair[1] not in dict_opener:
        dict_opener[pair[1]] = Opener(pair[1])
    opener2 = dict_opener[pair[1]]

    # Exact-match test first, cosine similarity as fallback; a single write
    # replaces the three duplicated f.write branches.
    if Similarity.similarity_eval_equal(pair[0], opener1.get_related_searchs(),
                                        pair[1], opener2.get_related_searchs()):
        verdict = "true"
    elif Similarity.similarity_eval_cosine(opener1.get_dict(), opener2.get_dict()):
        verdict = "true"
    else:
        verdict = "false"
    f.write(pair[0] + ";" + pair[1] + ";" + verdict + "\n")
Example #9
0
def print_equalities():
    """Print token equalities between gold sentences and literal translations.

    For every dev sentence, translate its Japanese side literally and print
    each (gold token, translated token) pair that Equality considers equal,
    together with the rule that matched.
    """
    Constants.SIMILARITY = Similarity()
    Constants.PARAPHRASE = True
    sentences = read_test_data(r"C:\Users\Nikolaj\PycharmProjects\LitteralJapaneseTranslation\data\sentences_dev.txt")
    for sentence in sentences:
        print([tok.english for tok in sentence.tokens])
        translations = LiteralJapanese.translate(sentence.japanese)
        print([tok.english for tok in translations])
        for translation in translations:
            for token in sentence.tokens:
                equal, rule = Equality.equals_rule(token.english, translation[1])
                if equal:
                    print(token.english + " = " + translation.english + "\t" + "("+rule+")")
Example #10
0
    def __init__(self,
                 urls,
                 onchange=None):  # onchange: callable invoked as onchange(url)
        """Configure the checker for `urls`; creates ./data if needed."""

        # --- hyperparameters -------------------------------------------------
        self.datapath = "./data"
        self.periode = 10 * 60       # seconds between checks
        self.deltastart = 3          # seconds between starting worker threads
        self.max_change_ratio = 0.10
        # ---------------------------------------------------------------------

        super(PagesChecker, self).__init__()
        self.isrunning = False
        self.urls = urls
        self.onchange = onchange
        self.threads = []
        self.sm = Similarity()

        # os.path.isdir is False both when the path is missing and when it is
        # a plain file — equivalent to the original compound condition.
        if not os.path.isdir(self.datapath):
            os.mkdir(self.datapath, 0o755)
def main():
    """Entry point: open the game window and run the Face Dance Machine."""
    similarity = Similarity()

    pg.init()

    # Fixed window geometry.
    width, height = 740, 480

    clock = pg.time.Clock()
    screen = pg.display.set_mode([width, height])
    pg.display.set_caption('Face Dance Machine')

    game = FaceDanceMachine(screen, similarity)
    game.run()
    pg.quit()
Example #12
0
class MainWidget(QWidget):
    """Main window of the face-recognition demo.

    Lets the user register face images ("add") and look a person up from a
    query image ("find"). Name -> image-path pairs are persisted in a
    shelve database next to the script.
    """

    def __init__(self):
        super().__init__()
        self._setup_ui()
        self._init_Sim()

        # Currently displayed search result (empty = nothing found yet).
        self.current_show_name = ''
        self.current_show_image_path = ''

        # NOTE: rm_person_btn is created but never connected to a slot.
        self.add_person_btn.clicked.connect(self._clilked_add_persion_btn_slot)
        self.find_person_btn.clicked.connect(
            self._clilked_find_persion_btn_slot)

    def _setup_ui(self):
        """Build the widget tree: button row on top, info list + image below."""
        self.lefttop = (100, 100)
        self.size = (1200, 600)
        self.setGeometry(*self.lefttop, *self.size)

        self.add_person_btn = QPushButton('add')
        self.rm_person_btn = QPushButton('rm')
        self.find_person_btn = QPushButton('find')

        self.person_id_lable = QLabel('person_id')
        self.person_image_lable = QLabel('person_image')
        self.person_info_list = QListWidget()

        self.vbox_1L = QVBoxLayout(self)
        self.vbox_2L_btns = QHBoxLayout()
        self.vbox_2L_labels = QHBoxLayout()
        self.vbox_3L_lists = QVBoxLayout()
        self.vbox_1L.addLayout(self.vbox_2L_btns)
        self.vbox_1L.addLayout(self.vbox_2L_labels)
        self.vbox_2L_labels.addLayout(self.vbox_3L_lists, stretch=3)

        self.vbox_2L_btns.addWidget(self.add_person_btn)
        self.vbox_2L_btns.addWidget(self.rm_person_btn)
        self.vbox_2L_btns.addWidget(self.find_person_btn)

        self.vbox_2L_labels.addWidget(self.person_image_lable, stretch=7)

        self.vbox_3L_lists.addWidget(self.person_id_lable)
        self.vbox_3L_lists.addWidget(self.person_info_list)

        self.show()

    def _init_Sim(self):
        """Create the similarity engine and open the persistent store."""
        self.sim = Similarity()
        self.shelf_db = shelve.open('image_db.dat')

    def __del__(self):
        self.shelf_db.close()

    def add_person(self, image_path: str):
        """Register one face image and persist its name -> path mapping."""
        _, name = self.sim.add_person(image_path)
        # BUG FIX: the original stored the literal key 'name' instead of the
        # name returned by the engine (cf. _clilked_add_persion_btn_slot).
        self.shelf_db[name] = image_path

    def rm_person(self, person_id: str):
        """Remove `person_id` from both the shelve DB and the engine."""
        if person_id in self.shelf_db:
            del self.shelf_db[person_id]
            # BUG FIX: the original called self.rm_persion(...), a method that
            # does not exist (AttributeError). Forwarding to the engine looks
            # intended — NOTE(review): confirm Similarity exposes rm_person.
            self.sim.rm_person(person_id)

    def find_person(self, image_path: str) -> Union[str, None]:
        """Return the matched name when confidence exceeds 0.7, else None."""
        name, score = self.sim.find_person(image_path)
        if score > 0.7:
            return name

    def update_image_ui(self):
        """Show the current match's name and (scaled) image, or placeholders."""
        if self.current_show_image_path and self.current_show_name:
            self.person_id_lable.setText(self.current_show_name)
            img = QPixmap(self.current_show_image_path)
            if img.width() > 300:
                img = img.scaledToWidth(300)
            self.person_image_lable.setPixmap(img)
        else:
            self.person_id_lable.setText("none")
            self.person_image_lable.setPixmap(QPixmap())

    def update_person_list_ui(self):
        # TODO: populate person_info_list from shelf_db entries.
        pass

    def _clilked_add_persion_btn_slot(self):
        """'add' button: register every selected image that is a new face."""
        files, _ = QFileDialog.getOpenFileNames(
            self, 'select files', filter='jpg(*.jpg);;png(*.png)')
        for file in files:
            is_unique, name = self.sim.add_person(file)
            if is_unique:
                self.shelf_db[name] = file

    def _clilked_find_persion_btn_slot(self):
        """'find' button: search with the first selected image and show it."""
        files, _ = QFileDialog.getOpenFileNames(
            self, 'select files', filter='jpg(*.jpg);;png(*.png)')
        if files:
            file = files[0]
            name, score = self.sim.find_person(file)
            if score > 0.7:
                self.current_show_name = name
                self.current_show_image_path = self.shelf_db[name]
            else:
                self.current_show_name = ""
                self.current_show_image_path = ''
            self.update_image_ui()
Example #13
0
import traceback
from flask import Flask, request, jsonify
from Similarity import Similarity

app = Flask(__name__)
sim = Similarity()


@app.route('/find_person', methods=['POST'])
def find_person():
    """Look up a person from the uploaded face image (form field 'img').

    Returns JSON {"status": 1, "data": {"person_id", "face_sim"}} on
    success, or {"status": 0, "err": message} on failure.
    """
    result = {'status': 1}
    try:
        fp = request.files['img']
        person_id, face_sim = sim.find_person(fp)
        result['data'] = {'person_id': person_id, 'face_sim': face_sim}
    except BaseException as e:
        # NOTE(review): BaseException also catches KeyboardInterrupt/SystemExit;
        # Exception is usually what is wanted — kept for behavior parity.
        traceback.print_exc()
        result['err'] = "{}".format(e)
        # Assign directly instead of via a local named `type`, which
        # shadowed the builtin in the original.
        result['status'] = 0  # error
    return jsonify(result)


@app.route('/rm_person', methods=['POST'])
def rm_person():
    result = {}
Example #14
0
class PagesChecker(object):
    """Watch a set of URLs and report when a page's content changes.

    One worker thread is started per URL; each page's last snapshot is
    persisted as a JSON file under `self.datapath`. When a fetched page
    differs from its snapshot by more than `max_change_ratio`, the
    `onchange(url)` callback is invoked.
    """

    def __init__(self,
                 urls,
                 onchange=None):  # onchange is a function with onchange(url)

        ########################################################################
        # HYPERPARAMETERS
        self.datapath = "./data"
        self.periode = 10 * 60  # seconds between checks (10 minutes, not 1h as the original comment claimed)
        self.deltastart = 3  # stagger thread starts by 3 seconds
        self.max_change_ratio = 0.10
        ########################################################################

        super(PagesChecker, self).__init__()
        self.isrunning = False
        self.urls = urls
        self.onchange = onchange
        self.threads = []
        self.sm = Similarity()

        # isdir() is False both when the path is missing and when it is a
        # plain file — equivalent to the original compound condition.
        if not os.path.isdir(self.datapath):
            os.mkdir(self.datapath, 0o755)

    def run(self):
        """Start one checker thread per configured URL, staggered in time."""
        self.isrunning = True
        urls = self.urls
        self.urls = []
        for url in urls:
            self.addNewChecker(url)
            time.sleep(self.deltastart)

    def stop(self):
        """Stop all checker threads.

        NOTE(review): threading.Thread has no stop() method — these calls
        raise AttributeError unless the threads are a custom subclass.
        Confirm the intended thread type.
        """
        self.isrunning = False
        for thread in self.threads:
            thread.stop()

    def state(self):
        """Print how many checker threads are registered."""
        print("Got " + str(len(self.threads)) + " running !")

    def getFileNameFromUrl(self, url):
        """Map a URL to its snapshot file path under self.datapath."""
        nurl = url.replace("https://", "").replace("http://", "").replace(
            "/", "-").replace(".html",
                              "").replace(".php",
                                          "").replace(".js",
                                                      "").replace(".", "_")
        return self.datapath + "/" + nurl

    def formatDate(self):
        """Current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return '{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())

    def saveState(self, url, data):
        """Persist `data` as JSON, replacing any previous snapshot.

        Mode 'w' truncates on open, so the original's explicit
        truncate(0)/seek(0) were redundant; `with` closes the handle
        (the original leaked it only on write errors, but this is safer).
        """
        with open(self.getFileNameFromUrl(url), "w") as f:
            f.write(json.dumps(data))

    def strIsSame(self, html1, html2):
        """True when both strings have equal length and are judged similar."""
        if len(html1) == len(html2):
            return self.sm.isSimilar(html1, html2)
        else:
            return False

    def addNewChecker(self, url):
        """Create the snapshot file if missing and start a worker for `url`."""
        if not os.path.exists(self.getFileNameFromUrl(url)):
            with open(self.getFileNameFromUrl(url), "w") as f:
                f.write(
                    json.dumps({
                        "url": url,
                        "created_at": self.formatDate(),
                        "html": "",
                        "last_check": "0",
                        "nb_change": 0
                    }))
        t = threading.Thread(target=self.worker, args=(url, ))
        self.threads.append(t)
        self.urls.append(url)
        t.start()

    def stopChecker(self, url):
        """Stop and forget the checker thread responsible for `url`."""
        for i, known in enumerate(self.urls):
            if known == url:
                self.threads[i].stop()
                self.threads.pop(i)
                self.urls.pop(i)
                break

    def worker(self, url):
        """Fetch `url` once, compare against the snapshot, record the result.

        NOTE(review): there is no loop here — the thread performs a single
        check and then sleeps once; periodic re-checking presumably depends
        on something restarting the worker. Confirm the intent.
        """
        html = str(requests.get(url).content, "utf-8")

        with open(self.getFileNameFromUrl(url), "r+") as datafile:
            data = json.loads(datafile.read())

        # Fraction of positionally differing characters (case-insensitive).
        distance = sum(
            [1 for x, y in zip(html, data["html"]) if x.lower() != y.lower()])
        total = len(html) + len(data["html"])
        # Guard the division: both strings empty means nothing changed.
        change_ratio = distance * 2 / total if total else 0.0

        if (change_ratio > self.max_change_ratio or data["html"] == ""):
            data["nb_change"] += 1
            data["html"] = html

            # Only notify when a callback was supplied (default is None;
            # the original crashed with TypeError here in that case).
            if self.onchange is not None:
                self.onchange(url)

        data["last_check"] = self.formatDate()
        self.saveState(url, data)
        time.sleep(self.periode)
Example #15
0
    print "Loading Data"
    mentions = [
        convert_json(json.loads(x)) for x in open("Data/Mentions.json")
    ]
    mid_mention = dict([(x['mid'], x) for x in mentions])
    pairs = json.load(
        open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/Train_Test_Pairs"))
    mid_eid = dict([
        apply(lambda x, y: (int(x), y),
              line.strip().split("\t"))
        for line in open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/mid_eid")
    ])
    loaded = True

# NOTE(review): Python 2 fragment — this continues a try/except NameError
# guard (above, outside this view) that caches the parsed mention data in
# the module globals `mentions`, `mid_mention`, `pairs` and `mid_eid`.
print "Computing Training Similarities"
s = Similarity()
labels = []
features = []
# One training example per mention pair: label is 1 iff both mentions map
# to the same entity id; features come from the Similarity measure.
for (mid0, mid1) in pairs['training']:
    labels.append(int(mid_eid[mid0] == mid_eid[mid1]))
    features.append(s.compute(mid_mention[mid0], mid_mention[mid1]))
# TODO: additional features can be added here.
# A little hack so that first class is always 1:
labels.insert(0, 1)
features.insert(0, {})

print "Active Similarities:"
# Union of all feature names that appear in any example.
print "\t".join(reduce(lambda x, y: x | y, [set(x.keys()) for x in features]))

print "Training"
# We need to convert string feature names to feature numbers for liblinear:
Example #16
0
    print "Using Default output Results/Test";
    out_name = "Test";
else:
    out_name = sys.argv[1];
# NOTE(review): Python 2, semicolon-terminated variant of the training
# script above; the orphan `else:` just before this span belongs to an
# argument-parsing `if` outside this view.
# Load the mention data only once per interpreter session: `loaded` is
# undefined on the first run, so the except branch does the actual loading.
try:
    loaded
except NameError:
    print "Loading Data";
    mentions = [convert_json(json.loads(x)) for x in open("Data/Mentions.json")];
    mid_mention = dict([(x['mid'],x) for x in mentions]);
    pairs = json.load(open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/Train_Test_Pairs"));
    mid_eid = dict([apply(lambda x,y:(int(x),y),line.strip().split("\t")) for line in open("/afs/inf.ed.ac.uk/group/teaching/anlp/asgn3/mid_eid")]);
    loaded = True;

print "Computing Training Similarities";
s = Similarity();
labels = [];
features = [];
# One example per mention pair: label 1 iff same entity id.
for (mid0,mid1) in pairs['training']:
	labels.append(int(mid_eid[mid0] == mid_eid[mid1]));
	features.append(s.compute(mid_mention[mid0],mid_mention[mid1]));
    # TODO: additional features can be added here.
# A little hack so that first class is always 1:
labels.insert(0,1);
features.insert(0,{});

print "Active Similarities:"
# Union of all feature names that appear in any example.
print "\t".join(reduce(lambda x,y:x|y, [set(x.keys()) for x in features]));

print "Training";
# We need to convert string feature names to feature numbers for liblinear:
    def build(self):
        """Compile the theano train/predict/test functions for the
        sentence-pair similarity model.

        Pipeline: encode both token sequences (x, y) with a shared-embedding
        GRU or LSTM, form a pairwise similarity matrix between the two
        hidden-state sequences, run it through a conv+pool layer and a
        projection layer, and train with binary cross-entropy on self.label.
        """
        log.info('building rnn cell....')
        # NOTE(review): if self.cell is neither 'gru' nor 'lstm',
        # recurent_x/recurent_y are unbound and the code below raises.
        if self.cell == 'gru':
            recurent_x = GRU(self.rng, self.n_input, self.n_hidden, self.x,
                             self.E, self.xmask, self.is_train, self.dropout)

            recurent_y = GRU(self.rng, self.n_input, self.n_hidden, self.y,
                             self.E, self.ymask, self.is_train, self.dropout)
        elif self.cell == 'lstm':
            recurent_x = LSTM(self.rng, self.n_input, self.n_hidden, self.x,
                              self.E, self.xmask, self.is_train, self.dropout)

            recurent_y = LSTM(self.rng, self.n_input, self.n_hidden, self.y,
                              self.E, self.ymask, self.is_train, self.dropout)
        log.info('build the sim matrix....')
        # Pairwise similarity between the two encoded sequences, using the
        # metric configured in self.sim.
        sim_layer = Similarity(recurent_x.activation,
                               recurent_y.activation,
                               metrics=self.sim)

        log.info('building convolution pooling layer....')
        # Input is treated as a 1-channel 50x50 "image" of similarities.
        conv_pool_layer = ConvPool(
            input=sim_layer.activation,
            filter_shape=(2, 1, 3, 3),  # feature_maps, 1, filter_h, filter_w
            input_shape=(self.batch_size, 1, 50,
                         50))  #sim_layer.activation.shape)
        projected_layer = basicLayer(conv_pool_layer.activation,
                                     input_shape=1152)
        # Per-example binary cross-entropy; the mean is the training cost.
        rav_cost = T.nnet.binary_crossentropy(projected_layer.activation,
                                              self.label)
        cost = T.mean(rav_cost)
        # Accuracy from thresholding the sigmoid output at 0.5.
        acc = T.eq(projected_layer.activation > 0.5, self.label)
        log.info('cost calculated.....')

        # All trainable parameters: embedding, both encoders, conv and
        # projection layers.
        self.params = [
            self.E,
        ]
        self.params += recurent_x.params
        self.params += recurent_y.params
        self.params += conv_pool_layer.params
        self.params += projected_layer.params

        lr = T.scalar('lr')
        # Gradients clipped to [-3, 3] for stability.
        gparams = [T.clip(T.grad(cost, p), -3, 3) for p in self.params]
        #gparams = [T.grad(cost, p) for p in self.params]

        # NOTE(review): an unknown self.optimizer leaves `updates` unbound.
        if self.optimizer == 'sgd':
            updates = sgd(self.params, gparams, lr)
        elif self.optimizer == 'adam':
            updates = adam(self.params, gparams, lr)
        elif self.optimizer == 'rmsprop':
            updates = rmsprop(self.params, gparams, lr)

        log.info('gradient calculated.....')

        # Training applies updates with dropout enabled (is_train=1).
        self.train = theano.function(
            inputs=[self.x, self.xmask, self.y, self.ymask, self.label, lr],
            outputs=[cost, acc],
            updates=updates,
            givens={self.is_train: np.cast['int32'](1)})

        # Evaluation functions run with dropout disabled (is_train=0).
        self.predict = theano.function(
            inputs=[self.x, self.xmask, self.y, self.ymask, self.label],
            outputs=[rav_cost, acc],
            givens={self.is_train: np.cast['int32'](0)})

        self.test = theano.function(
            inputs=[self.x, self.xmask, self.y, self.ymask],
            outputs=projected_layer.activation,
            givens={self.is_train: np.cast['int32'](0)})
Example #18
0
__credits__ = ["Samuel de Oliveira Gamito"]
__license__ = "GPL"
__maintainer__ = "Samuel de Oliveira Gamito"
__email__ = "*****@*****.**"
__status__ = "Production"
from Similarity import Similarity

import pandas as pd
import numpy as np
import sys
import json

if __name__ == "__main__":
    # Silence everything written to stderr for the rest of the run.
    sys.stderr.close()  #Close error output
    # Init classes
    sim = Similarity()

    # NOTE(review): this only prints "erro" and falls through instead of
    # exiting; sys.argv[1] also raises IndexError before the check when no
    # argument is supplied — confirm intended behavior.
    if (not sys.argv[1] or sys.argv[1] == ""):
        print("erro")

    # Load the input path from the first CLI argument.
    data_path = sys.argv[1]
    # Load the JSON records into a pandas data frame.
    data_frame = pd.read_json(data_path, orient='records')

    # Filter out the questions the bot could not answer.
    nao_respondidas = []
    # Expected "no answer" response pattern.
    i_dont_know = 'I don\'t know how to respond this'
    # NOTE(review): the snippet appears truncated here — `val` is computed
    # but never used within this view.
    for index, row in data_frame.iterrows():
        val = sim.symmetric_sentence_similarity(i_dont_know, row.resposta)
	def setup(self):
		"""Create the user -> [(item, rating)] fixture and the Similarity under test."""
		self.userDic = {1: [(1, 1), (2, 2), (3, 3)], 2: [(4, 4), (5, 5)]}
		self.similarity = Similarity(self.userDic)
Example #20
0
 def _init_Sim(self):
     """Create the face-similarity engine and open the persistent
     name -> image-path store."""
     self.sim = Similarity()
     self.shelf_db = shelve.open('image_db.dat')
	def setup(self):
		"""Create the user -> [(item, rating)] fixture and the Similarity under test."""
		self.userDic = {1: [(1, 1), (2, 1), (3, 4)], 2: [(1, 5), (2, 5), (3, 2), (4, 3)]}
		self.similarity = Similarity(self.userDic)
Example #22
0
# Plagiarism check: extract keywords from the request document, crawl the
# web for related content, then score the similarity between the request
# document and the crawled response document.
#
# NOTE(review): `fileName5` and `responseDoc` must be defined earlier in
# this script — `responseDoc` only appeared here in a commented-out
# assignment. Confirm both are bound before this point.
requestDoc = fileName5

readDoc = ReadDocument()
print("Reading input document")
fileContent = readDoc.readFile(requestDoc)
keywords = readDoc.getKeywords(fileContent)

webSearch = WebSearch()
print("crawling the web")
webSearch.google_search(keywords)

checkSimilarity = Similarity()
print("checking similarity")
value = checkSimilarity.similarValue(requestDoc, responseDoc)

# Empty the crawled response document now that it has been scored; `with`
# closes the handle (the original leaked both file objects it opened).
with open(responseDoc, 'r+') as response_file:
    response_file.truncate()

print("----------SIMILARITY-----------")
resultValue = str(round((value * 100), 2)) + "%"
print(resultValue)
# `out_file` avoids shadowing the builtin `file` as the original did.
with open('similarityValue.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(str(resultValue))
print("-------------------------------")