Exemple #1
0
def REACH_extraction(candidate_sentence, query_genes, Positive, Negative):
    """Classify the regulation sign between two query genes using REACH.

    Runs the REACH reader over each candidate sentence and inspects the
    extracted statements.  For the first statement whose subject is in
    query_genes[0] and whose object is in query_genes[1], the statement type
    is fuzzy-matched (Jaro) against the Positive and Negative term lists.

    :param candidate_sentence: dataframe with a ``sentence`` column
    :param query_genes: pair of collections: (source genes, target genes)
    :param Positive: terms describing up-regulation statement types
    :param Negative: terms describing down-regulation statement types
    :return: 'positive' or 'negative' for the first statement linking the
             query genes, or None when no such statement is found
    """
    for sent in candidate_sentence.itertuples():
        reach_processor = reach.process_text(sent.sentence)
        if reach_processor is None:
            continue
        # Scan only the statements produced by this sentence; the original
        # code rescanned the whole accumulated list after every sentence,
        # re-parsing already-rejected statements (O(n^2)).
        for st in reach_processor.statements:
            st = str(st)
            # Statement reprs look like "Type(SRC(), TRG())"; grab the text
            # between the outermost parentheses.
            event1 = st[st.find("(") + 1:st.rfind(")")]
            parts = event1.split(',')
            if len(parts) < 2:
                # Unary statements carry no source/target pair; the original
                # raised IndexError here.
                continue
            src = parts[0].replace('()', '').strip().upper()
            trg = parts[1].replace('()', '').strip().upper()
            mode = st.replace('(' + event1 + ')', '').strip().upper()
            if src in query_genes[0] and trg in query_genes[1]:
                max_pos = max((jellyfish.jaro_distance(term, mode)
                               for term in Positive), default=0)
                max_neg = max((jellyfish.jaro_distance(term, mode)
                               for term in Negative), default=0)
                return 'positive' if max_pos > max_neg else 'negative'
    return None
Exemple #2
0
def fix(word):
    """Print the Spanish dictionary word closest to *word* by Jaro similarity.

    Scans the NLTK Spanish word list once, tracks the best score seen, and
    prints the first word that achieved it.  Prints nothing when the word
    list is empty (the original raised IndexError and its result depended on
    nondeterministic set ordering).

    :param word: the (possibly misspelled) word to correct
    """
    word_list = words.words(lang='es')
    best_score = 0
    best_matches = []
    for candidate in word_list:
        # compute the similarity once per candidate (was computed twice)
        score = jellyfish.jaro_distance(word, candidate)
        if score > best_score:
            best_score = score
            best_matches = [candidate]
        elif score == best_score:
            best_matches.append(candidate)
    if best_matches:
        print(best_matches[0])
Exemple #3
0
def fda_process(url, gname, HLGT_dict, SOC_dict, all_reports):
    """Fetch one openFDA adverse-event result page and collect reactions.

    Keeps only reports whose primary source has qualification '1' and that
    contain a drug with drugcharacterization '1' whose name fuzzy-matches
    *gname* (Jaro similarity >= 0.8 on the active substance name, or on the
    medicinal product name as a fallback).  For each kept report, reaction
    preferred terms (PTs) are recorded and mapped via map_adr_to_meddra
    using HLGT_dict and SOC_dict.

    :param url: openFDA API query URL
    :param gname: drug name of interest (punctuation-stripped, upper-cased in place)
    :param HLGT_dict: mapping used by map_adr_to_meddra for the HLGT level
    :param SOC_dict: mapping used by map_adr_to_meddra for the SOC level
    :param all_reports: dict mutated in place, safetyreportid -> term lists
    :return: the (mutated) all_reports dict
    """
    resp = requests.get(url=url)
    data = resp.json()
    try:
        for result in data['results']:
            try:
                if ('primarysource' in result.keys()):
                    if ('qualification' in result['primarysource'].keys()):
                        if (result['primarysource']['qualification'] == '1'):
                            for drug in result['patient']['drug']:
                                if ('drugcharacterization' in drug.keys()):
                                    if (drug['drugcharacterization'] == '1'):
                                        # strip punctuation before matching
                                        exclude = set(string.punctuation)
                                        gname = ''.join(ch for ch in gname
                                                        if ch not in exclude)
                                        gname = gname.strip().upper()
                                        flag = 0
                                        if ('activesubstance' in drug.keys()):
                                            drugname = drug['activesubstance'][
                                                'activesubstancename'].upper()
                                            score = jellyfish.jaro_distance(
                                                gname, drugname)
                                            if (score >= 0.8):
                                                flag = 1
                                        # fallback match on the product name
                                        if ('medicinalproduct' in drug.keys()
                                                and flag == 0):
                                            drugname = drug[
                                                'medicinalproduct'].upper()
                                            score = jellyfish.jaro_distance(
                                                gname, drugname)
                                            if (score >= 0.8):
                                                flag = 1
                                        # NOTE(review): this 'else' belongs to
                                        # the medicinalproduct 'if'; 'continue'
                                        # is the last statement of the loop
                                        # body, so it has no effect — confirm
                                        # the intended control flow.
                                        else:
                                            continue
                            # NOTE(review): flag is only bound inside the drug
                            # loop; if no drug had drugcharacterization this
                            # raises NameError, which the KeyError handler
                            # below does not catch — confirm.
                            if (flag == 1):
                                reportid = result['safetyreportid']
                                PTs = []
                                HGLTs = []
                                SOCs = []
                                for reaction in result['patient']['reaction']:
                                    if ('reactionmeddrapt' in reaction.keys()):
                                        PT = reaction[
                                            'reactionmeddrapt'].lower()
                                        PTs.append(PT)
                                        mapped_hglt = map_adr_to_meddra(
                                            PT, HLGT_dict)
                                        HGLTs.append(mapped_hglt)
                                        mapped_soc = map_adr_to_meddra(
                                            PT, SOC_dict)
                                        SOCs.append(mapped_soc)
                                all_reports[reportid] = {
                                    'PTs': PTs,
                                    'HGLTs': HGLTs,
                                    'SOCs': SOCs
                                }
            except KeyError:
                continue
    except KeyError:
        pass
    return all_reports
Exemple #4
0
def entity_cell_score(e, c):
    """Score how well a SPARQL value *e* matches a table cell string *c*.

    IRIs under dbp_prefix are compared by label; numeric literals by value
    equality; date literals by year containment; untyped literals by Jaro
    similarity against FLAGS.str_match_threshold.

    :return: 1.0 on a match, 0.0 otherwise
    """
    if type(e) == sparql.IRI:
        iri = str(e)
        if dbp_prefix in iri:
            # derive a human-readable label from the IRI tail
            label = iri.split(dbp_prefix)[1].replace('_', ' ')
            if not Is_Number(c) and \
                    jaro_distance(unicode(label), c) > FLAGS.str_match_threshold:
                return 1.0
    elif type(e) == sparql.Literal:
        try:
            literal = str(e)
            if Is_Number(literal):
                # numeric literal: require exact numeric equality
                if Is_Number(c) and float(c) == float(literal):
                    return 1.0
            elif e.datatype in date_types:
                # date/datetime literal: compare on the year component only
                if literal.split('-')[0] in c:
                    return 1.0
            elif e.datatype is None:
                # plain text literal: fuzzy string match
                if not Is_Number(c) and \
                        jaro_distance(unicode(literal), c) > FLAGS.str_match_threshold:
                    return 1.0
        except UnicodeEncodeError:
            pass
    return 0.0
def get_closest_jaro(needle, haystack):
    """Return the element of *haystack* most Jaro-similar to *needle*.

    Ties keep the earliest element; returns None for an empty haystack.
    (Cleaned up: tabs, trailing semicolons, and ``== None`` comparisons.)
    """
    best = None
    for candidate in haystack:
        score = jellyfish.jaro_distance(needle, candidate)
        if best is None or score > best[1]:
            best = (candidate, score)
    return None if best is None else best[0]
Exemple #6
0
def get_closest_jaro(needle, haystack):
    """Return the item in *haystack* with the highest Jaro similarity to *needle*.

    Ties keep the earliest item; returns None when *haystack* is empty.
    (Fixed: identity comparison idiom — ``is None`` instead of ``== None``.)
    """
    best_item = None
    best_score = -1.0  # any real Jaro score (>= 0) beats this sentinel
    for item in haystack:
        score = jellyfish.jaro_distance(needle, item)
        if score > best_score:
            best_item, best_score = item, score
    return best_item
Exemple #7
0
def _compare_inv(f, o=None):
    """Resolve a detected weapon name *f* against the known weapons list.

    Falls back to *o* when *f* is empty, already close to *o* (Jaro > .75),
    or cannot be matched to any known weapon.
    """
    # nothing detected, or the detection already agrees with the old value
    if not f:
        return o
    if o and jfish.jaro_distance(f, o) > .75:
        return o
    if f in weapons:
        return f
    # try a known-typo substitution, then fuzzy-match against the list
    corrected = common_errors.get(f, f)
    scores = [jfish.jaro_distance(corrected, w) for w in weapons]
    best = max(scores)
    if best > .75:
        return weapons[scores.index(best)]
    return o
Exemple #8
0
def jelly():
    """Ad-hoc demo: print several string-similarity metrics for two hospital names.

    NOTE: Python 2 code (print statements); imports are deliberately local.
    """
    import jellyfish
    a = u'Korle Bu Teaching Hospital Sickle Cell Dept'
    b = u'Korle Bu Teaching Hospital'
    # a = u'x'
    # b = u'a'
    print jellyfish.levenshtein_distance(a, b)
    print jellyfish.jaro_distance(a, b)
    print jellyfish.damerau_levenshtein_distance(a, b)
    # print jellyfish.match_rating_comparison(a,b)

    from fuzzywuzzy import fuzz

    print fuzz.ratio(a, b)
def test_jellyfish():
    """Print jellyfish similarity metrics for two ad texts, raw and normalized."""
    text1 = 'Телефон в хорошем состоянии, трещин и сколов нет, за все время менялся только аккумулятор(поэтому заряд держит хорошо), остальное все родное, в целом работает отлично! В комплекте кабель. Обмен не интересен.'
    text2 = 'Продам телефон в хорошем состоянии Полностью рабочий есть WiFi'
    # normalized variants: token lists re-joined into plain strings
    text_norm1 = ' '.join(normalize(text1))
    text_norm2 = ' '.join(normalize(text2))
    for metric in (jellyfish.jaro_distance, jellyfish.jaro_winkler):
        print(metric(text1, text2))
        print(metric(text_norm1, text_norm2))
    print(jellyfish.nysiis(text1))
    print(jellyfish.nysiis(text2))
    exit()
def test_jellyfish():
    """Ad-hoc check of jellyfish distance/phonetic functions on two ad texts."""
    text1 = 'Телефон в хорошем состоянии, трещин и сколов нет, за все время менялся только аккумулятор(поэтому заряд держит хорошо), остальное все родное, в целом работает отлично! В комплекте кабель. Обмен не интересен.'
    text2 = 'Продам телефон в хорошем состоянии Полностью рабочий есть WiFi'
    raw = (text1, text2)
    # same pair after tokenization/normalization
    norm = (' '.join(normalize(text1)), ' '.join(normalize(text2)))
    print(jellyfish.jaro_distance(*raw))
    print(jellyfish.jaro_distance(*norm))
    print(jellyfish.jaro_winkler(*raw))
    print(jellyfish.jaro_winkler(*norm))
    print(jellyfish.nysiis(text1))
    print(jellyfish.nysiis(text2))
    exit()
def fuzzyLocMatch_wGT(locList1, locList2):
    """
    Fuzzy location match using string comparision with jellyfish, can be applied to tweets extracted local gazetteers
    and ground truth or url extracted local gazetteers and ground truth

    :param locList1: tw extracted local gazetteers
    :param locList2: ground truth
    :return: score with tid (how reliable the tw is based only on either address or place name fuzzy match)
    """
    # Hoisted out of the loops: the ground-truth names were re-formatted for
    # every extracted location (loop-invariant work).
    formatted_gt = [roadNameFormat(loc2) for loc2 in locList2]
    scores = []
    for loc1 in locList1:
        print(loc1[-1])
        score = 0
        for loc11 in loc1[0]:
            loc11 = roadNameFormat(loc11)
            # keep the best similarity over all ground-truth names
            for loc2 in formatted_gt:
                s = jellyfish.jaro_distance(str(loc11), str(loc2))
                score = max(score, s)
        scores.append((round(score, 2), loc1[-1]))
    return scores
    def get_similar_titles(title: str) -> list:
        """Search BakaUpdates for *title* and return alternate titles.

        Each result dict carries 'title', 'link', and a 'similarity' score
        (Jaro, case-insensitive vs *title*); results are sorted by similarity
        descending, with duplicate titles skipped.

        :type title: str
        :return: list of result dicts
        """
        response = requests.get(url=BakaUpdates.SEARCH_URL,
                                params={'stype': 'title', 'search': title})
        soup = Soup(response.text, 'html.parser')

        results = []
        seen_titles = []
        for cell in soup.find_all('td', attrs={"class": "text pad col1"}):
            cleaned = BakaUpdates.clean_title(cell.text)
            # Skip duplicate titles: the search returns the base title first,
            # so a repeated (likely novel) entry is dropped.
            if cleaned in seen_titles:
                continue
            seen_titles.append(cleaned)
            results.append({
                'title': cleaned,
                'link': cell.find_next('a', href=True)['href'],
                'similarity': jellyfish.jaro_distance(cleaned.lower(), title.lower())
            })

        results.sort(key=lambda item: item['similarity'], reverse=True)
        return results
Exemple #13
0
def ActionneEntity(name, action, myListSceneOrSwitch, conf):
    """Find the Domoticz scene/switch whose name best matches *name* and act on it.

    :param name: spoken device name to match
    :param action: command forwarded to curlCmd
    :param myListSceneOrSwitch: dict idx -> {'Name': bytes, 'Type': str}
    :param conf: configuration forwarded to curlCmd
    :return: (True, matched_name) when a device within MAX_JARO_DISTANCE was
             found and commanded, otherwise (False, "")
    """
    # derived from nice work of https://github.com/iMartyn/domoticz-snips
    lowest_distance = MAX_JARO_DISTANCE
    lowest_idx = 65534
    lowest_name = "Unknown"
    lowest_Type = ""  # initialized defensively so no-match paths cannot NameError
    MyWord = name
    DomoticzRealName = ""
    print(" - ActionneEntity: "+MyWord)
    for idx, scene in myListSceneOrSwitch.items():
        print("Scene/Schalter: "+str(scene['Name'], 'utf-8')+" idx: "+idx)
        # Jaro similarity flipped into a distance: lower means closer
        distance = 1-jellyfish.jaro_distance(str(scene['Name'], 'utf-8'), MyWord)
        print("  Distance is "+str(distance))
        if distance < lowest_distance:
            print("  Low enough and lowest!")
            lowest_distance = distance
            lowest_idx = idx
            lowest_name = scene['Name']
            lowest_Type = scene['Type']
    if lowest_distance < MAX_JARO_DISTANCE:
        print(" - ActionneEntity - lowest_Type: "+lowest_Type)
        DomoticzRealName = str(lowest_name, 'utf-8')
        print(" - ActionneEntity - DomoticzRealName: "+DomoticzRealName)
        print(" - ActionneEntity - lowest_idx: "+lowest_idx)
        curlCmd(lowest_idx, action, lowest_Type, conf)
        # BUG FIX: removed an unreachable hermes.publish_end_session(...) call
        # that sat after this return and referenced undefined names.
        return True, DomoticzRealName
    else:
        return False, DomoticzRealName
Exemple #14
0
 def calc_similarity(self,
                     target: Entity or any,
                     distance_table: dict,
                     threshold: float = 1) -> float:
     """
     to calculate the similarity between this cluster and an entity,
     i.e the possibility of an entity to be a member of this cluster
     :param target: the target entity to be compared with, or the target Cluster to be compared with
                     (error when put Cluster in param type T_T)
     :param distance_table: {(ent_1_uri, ent_2_uri): distance_float} where ent_1_uri < ent_2_uri
     :param threshold: float, return early once the similarity exceeds this value
     :return: float number in [0, 1] to represent the similarity
     """
     # TODO: ignore outliers? or use more sophisticated methods
     # NOTE(review): with the default threshold=1 the early-exit below can
     # never fire, since Jaro scores are bounded by 1 — confirm intent.
     max_similarity = 0
     if isinstance(target, Entity):
         # entity case: compare the entity name against every cluster name,
         # memoizing pair distances in distance_table (key ordered by name)
         if target.type in self.types:
             for name, cnt in self.names.items():
                 _key = (name, target.name) if name < target.name else (
                     target.name, name)
                 if _key not in distance_table:
                     distance_table[_key] = jaro_distance(name, target.name)
                 if max_similarity < distance_table[_key]:
                     max_similarity = distance_table[_key]
                     if max_similarity > threshold:
                         return max_similarity
     else:
         # cluster case: best member-wise similarity via recursion
         # NOTE(review): .union() is truthy whenever EITHER side has any
         # type; an intersection (shared types) may have been intended,
         # mirroring the `target.type in self.types` check above — confirm.
         if set(target.types.keys()).union(self.types.keys()):
             for ent_uri, ent in target.members.items():
                 similarity = self.calc_similarity(ent, distance_table)
                 if max_similarity < similarity:
                     max_similarity = similarity
                     if max_similarity > threshold:
                         return max_similarity
     return max_similarity
def get_links_edge_list(path_to_cluster_heads, path_to_output):
    """Build an entity-linking graph over cluster heads and write its edges.

    Two cluster heads are connected when their field [1] matches and either
    their field [2] is identical (and non-empty), or one of them is a "NIL"
    (unlinked) entry and their field [0] strings have Jaro similarity > 0.9.
    NOTE(review): assumes each value is [name, group-key, link-id] —
    confirm against the JSON producer.  Python 2 code (print statements,
    unicode()).
    """
    cluster_heads = json.load(open(path_to_cluster_heads))

    IDs = list(cluster_heads.keys())

    G = nx.Graph()
    G.add_nodes_from(IDs)

    # compare every unordered pair of cluster heads exactly once
    for i, id1 in enumerate(IDs):
        for j in range(i + 1, len(IDs)):
            id2 = IDs[j]
            if cluster_heads[id1][1] == cluster_heads[id2][1]:
                if cluster_heads[id1][2] == cluster_heads[id2][
                        2] and cluster_heads[id1][2] != '':
                    G.add_edge(id1, id2)
                elif "NIL" in cluster_heads[id1][2] or "NIL" in cluster_heads[
                        id2][2]:
                    # unlinked entries fall back to fuzzy name matching
                    name1 = unicode(cluster_heads[id1][0])
                    name2 = unicode(cluster_heads[id2][0])
                    #score = jf.jaro_winkler(name1,name2)
                    score = jf.jaro_distance(name1, name2)
                    if score > 0.9:
                        print cluster_heads[id1]
                        print cluster_heads[id2]
                        print ""
                        G.add_edge(id1, id2)

    # dump the node list followed by one edge per line
    with open(path_to_output, 'w') as output:
        output.write(str(G.nodes()) + '\n')
        for e in G.edges:
            output.write(str(e) + '\n')
def pair_name(name, names_and_ids, existing_names):
    """Interactively pair *name* with an id from *names_and_ids*.

    Candidates are ranked by case-insensitive Jaro similarity and the top 5
    are offered at a prompt.  Ctrl-C offers to create a new MP record
    instead; skipping leaves the selection empty.

    :param name: the name to pair
    :param names_and_ids: dict id -> candidate name
    :param existing_names: dict name -> id of already-confirmed pairings
    :return: (name, selected_id) — selected_id is '' when skipped
    """
    # already paired in a previous run: reuse the stored id
    if name in existing_names:
        return name, existing_names[name]
    # rank all candidates, then keep the 5 best (highest score first)
    options = ((jellyfish.jaro_distance(name.lower(), new_name.lower()), id_)
               for id_, new_name in names_and_ids.items())
    options = tuple(enumerate(sorted(options, reverse=True)[:5]))
    selection = ''
    try:
        # empty input defaults to option 0 (the best match)
        _, (_, selection) = options[int(input('''\

Select one of the following for {!r}.
Press Enter to select the first option and ^C and Enter to skip or
^C again to exit.

{}
'''.format(name, '\n'.join(map(repr, options)))) or 0)]
    except KeyboardInterrupt:
        # ^C during selection: offer to create a fresh record for this name
        if input('''\

Create record?  [y/N]
''') == 'y':
            mp = MP(name=MultilingualField(el=name,
                                           en=translit_elGrek2Latn(name),
                                           tr=translit_el2tr(name)))
            mp.insert()
            selection = mp._id
    return name, selection
Exemple #17
0
def main(filename, motiflength):
    """Run the motif search over a FASTA file and report the best result.

    Launches Config.batch independent runs on a thread pool, keeps the run
    with the lowest score (motif[1]), prints the final profile/consensus and
    returns the best (profile, score, ...) tuple.

    :param filename: path to a FASTA file readable by SeqIO
    :param motiflength: motif length passed through to run()
    :return: the best motif tuple produced by run()
    """
    records = list(SeqIO.parse(filename, "fasta"))
    pool = ThreadPool(processes=cpu_count())
    # BUG FIX: submit every batch first, THEN collect.  The original called
    # .get() immediately after each apply_async, which blocked on each batch
    # in turn and serialized the whole pool.
    async_results = [pool.apply_async(run, (records, motiflength))
                     for _ in range(Config.batch)]
    motif_list = {i: r.get() for i, r in enumerate(async_results)}
    pool.close()
    pool.join()
    # getting the best motif (lowest score at index 1)
    best = None
    for count, motif in motif_list.items():
        if best is None or motif[1] < best[1]:
            best = motif
    # printing the result
    # enable logging for this part (stdout may have been redirected earlier)
    sys.stdout = sys.__stdout__
    print("Finale Profile")
    print_pseudo(best[0])
    print("Consensus sequence")
    if Config.max_gapsize > 0:
        print("gapsize: " + str(best[2]))
    print("new solution: %d" % best[1])
    if Config.palindrome_enable:
        # compare the consensus to its reverse complement for palindromes
        print(best[0].consensus + "----" + best[0].consensus.reverse_complement())
        print(jaro_distance(str(best[0].consensus), str(best[0].consensus.reverse_complement())))
    else:
        print(best[0].consensus)

    return best
def get_jaro_avg(row1, row2):
    """Average Jaro similarity over columns 1-14 of two rows.

    Column 0 is deliberately skipped (matches the original xrange(1, 15)).
    (Cleaned up: tabs, Python 2 ``xrange``, and shadowing of builtin ``sum``.)

    :param row1: first indexable row of strings
    :param row2: second indexable row of strings
    :return: mean Jaro similarity as a float
    """
    total = sum(jellyfish.jaro_distance(row1[i], row2[i])
                for i in range(1, 15))
    return total / 14.0
def check_nan(df):
    '''
    Match fuzzy city names.

    For rows whose fips is the "00nan" placeholder, look up a same-state city
    in uscities.csv whose name fuzzy-matches (Jaro >= 0.85 or substring
    either way); on a match, overwrite City and fips in place.  Rows with no
    match are dropped.  Reads "uscities.csv" from the working directory.
    '''
    cities = pd.read_csv("uscities.csv")
    cities = cities.astype({
        'city': 'str',
        "state_id": "str",
        "county_name": "str",
        "county_fips": "str"
    })

    # rows still carrying the placeholder fips need fuzzy resolution
    df_fuzzy = df[df.fips == "00nan"]
    df_tuple = list(zip(df_fuzzy.state_id, df_fuzzy.City, df_fuzzy.index))
    cities_tuple = list(zip(cities.state_id, cities.city, cities.county_fips))

    rv = []
    for d in df_tuple:
        for c in cities_tuple:
            # same state, then fuzzy/substring name match
            if c[0] == d[0]:
                if jellyfish.jaro_distance(c[1], d[1]) >= 0.85 or c[1] in d[1]\
                    or d[1] in c[1]:
                    df.loc[df.index == d[2], ['City']] = c[1]
                    df.loc[df.index == d[2], ['fips']] = c[2]
                    break
        else:
            # inner loop finished without break: no candidate matched,
            # so the row cannot be geocoded — drop it
            df = df.drop(d[2])
    return df
def sceneOn_received(hermes, intent_message):
    """Snips intent handler: switch on the Domoticz scene closest to the spoken name.

    Compares the spoken slot value against every scene name with Jaro
    distance and fires the switchscene URL for the best match under
    MAX_JARO_DISTANCE.  NOTE: Python 2 code (print statements, unicode(),
    urllib2).
    """
    print('Intent {}'.format(intent_message.intent))

    for (slot_value, slot) in intent_message.slots.items():
        print('Slot {} -> \n\tRaw: {} \tValue: {}'.format(
            slot_value, slot[0].raw_value, slot[0].slot_value.value.value))
    # NOTE(review): `slot` below is whatever the LAST loop iteration left
    # bound — this assumes the intent carries exactly one slot; confirm.
    scenes = getSceneNames(domoticz_base_url)
    lowest_distance = MAX_JARO_DISTANCE
    lowest_idx = 65534
    lowest_name = "Unknown"
    for idx, scene in scenes.items():
        print "Comparing " + scene + " and " + slot[0].slot_value.value.value
        # Jaro similarity flipped into a distance: lower is closer
        distance = 1 - jellyfish.jaro_distance(
            scene, unicode(slot[0].slot_value.value.value, "utf-8"))
        print "Distance is " + str(distance)
        if distance < lowest_distance:
            print "Low enough and lowest!"
            lowest_distance = distance
            lowest_idx = idx
            lowest_name = scene
    if lowest_distance < MAX_JARO_DISTANCE:
        # fire the Domoticz JSON API to switch the matched scene on
        command_url = global_conf.get("secret").get(
            "domoticz url"
        ) + '/json.htm?type=command&param=switchscene&idx=' + str(
            lowest_idx) + '&switchcmd=On'
        print '"curl"ing ' + command_url
        ignore_result = urllib2.urlopen(command_url)
        #ignore_result.read() # So we finish the connection correctly.
        hermes.publish_end_session(intent_message.session_id,
                                   "Turning on scene " + lowest_name)
    else:
        hermes.publish_end_session(
            intent_message.session_id,
            "I'm sorry, I couldn't find a scene like " + lowest_name)
Exemple #21
0
 def strCompare(self, new, ratiomin=0.8):
     """
     Compare a new string against self.Lastswithstring.
     Uses one of the "similarity hashing" modules if available.
     Returns a ratio describing the similarity level of
         new vs self.Lastswithstring
     where a ratio of 1.0 means full equality and 0.0 means no similarity.
     """
     if self.Lastswithstring is None:
         self.Lastswithstring = ""
     old = self.Lastswithstring  # can be Lastswithstring or LastComparedString
     if levenshtein_available:
         ratio = Levenshtein.ratio(new, old)
     elif jellyfish_available:
         ratio = jellyfish.jaro_distance(new, old)
     elif fuzzywuzzy_available:
         # BUG FIX: the result was discarded, leaving `ratio` unbound and
         # raising UnboundLocalError on the return below.
         # NOTE(review): fuzz.ratio returns 0-100 while the other branches
         # return 0-1 — confirm whether it should be divided by 100.
         ratio = fuzzywuzzy.fuzz.ratio(new, old)
     elif difflib_available:
         print("Comparing {} with {} using difflib.SequenceMatcher.".format(
             new, old))
         ratio = difflib.SequenceMatcher(None, new, old).ratio()
     elif simhash_available:
         ratio = simhash_compare(new, old)
     else:
         print("No string diff lib available!")
         ratio = 0.9
     return ratio
Exemple #22
0
def get_distance(string_a, string_b):
    """Mean of three normalized string distances between the two inputs.

    Each similarity score is flipped into a distance (1 - similarity) before
    averaging; fuzz.ratio is scaled from its native 0-100 range first.
    """
    lev = 1 - fuzz.ratio(string_a, string_b) / 100
    jar = 1 - jellyfish.jaro_distance(string_a, string_b)
    jw = 1 - jellyfish.jaro_winkler(string_a, string_b)
    # arithmetic mean of the three distances
    return (lev + jar + jw) / 3
Exemple #23
0
 def distance(word):
     """Indices of the known words whose Jaro similarity to *word* exceeds
     ``thresh``, excluding *word* itself."""
     return [i for i in range(num)
             if jaro_distance(word, words[i]) > thresh and words[i] != word]
Exemple #24
0
async def message_resolve(client, message, cmd_prefix):
    """Dispatch an incoming chat message to the matching bot command.

    Ignores bot authors and no-fly-listed users.  Messages starting with
    *cmd_prefix* are routed to help/fullhelp or the `functions` table; for an
    unknown command, near matches (Jaro similarity > 0.8) are suggested.
    Every message (command or not) is then passed to all `handlers`.
    """
    if message.author.bot:
        return
    if message.author.id in variables.noflylist:
        return
    if message.content.startswith(cmd_prefix):
        await log(message.author, message.guild, message.content)
        args = split_args(message.content[len(cmd_prefix):])
        command = args[0].lower()
        # NOTE(review): `args[len(cmd_prefix):]` slices the TOKEN list by the
        # prefix's character length; `args[1:]` (everything after the command
        # word) looks like the intent — confirm before changing.
        if command == "help":
            await print_help(client,
                             message,
                             *args[len(cmd_prefix):],
                             full=False)
        elif command == "fullhelp":
            await print_help(client,
                             message,
                             *args[len(cmd_prefix):],
                             full=True)
        elif command in functions.keys():
            await functions[command][0](client, message,
                                        *args[len(cmd_prefix):])
        else:
            # unknown command: suggest close matches by Jaro similarity
            jaro_dists = [(i, jellyfish.jaro_distance(command, i))
                          for i in functions.keys()]
            jaro_dists = [i for i in jaro_dists if i[1] > 0.8]
            if len(jaro_dists) == 0:
                return
            jaro_dists.sort(key=lambda i: i[1], reverse=True)
            txt = ",".join([f"`{i[0]}`" for i in jaro_dists])
            await message.channel.send(
                f"`{variables.PREFIX}{command}` not found. Did you mean: {txt}"
            )
    for handler in handlers:
        await handler(client, message)
Exemple #25
0
def ActionneEntity(name, action, myListSceneOrSwitch, conf):
    """Switch the Domoticz scene/switch whose name best matches *name*.

    Returns True after issuing the command via curlCmd, or False when no
    device name is within MAX_JARO_DISTANCE of the spoken word.
    NOTE: Python 2 code (unicode()).
    """
    # derived from nice work of https://github.com/iMartyn/domoticz-snips
    best_distance = MAX_JARO_DISTANCE
    best_idx = 65534
    best_name = "Unknown"
    for idx, scene in myListSceneOrSwitch.items():
        # Jaro similarity flipped into a distance: lower means closer
        candidate_distance = 1 - jellyfish.jaro_distance(
            unicode(scene['Name'], 'utf-8'), name)
        if candidate_distance < best_distance:
            best_distance = candidate_distance
            best_idx = idx
            best_name = scene['Name']
            best_type = scene['Type']
    if best_distance < MAX_JARO_DISTANCE:
        curlCmd(best_idx, action, best_type, conf)
        return True
    return False
Exemple #26
0
def add_query_features(df, inc, exc, k1list, k2list):
    """
    Return a copy of a dataframe with summary features added for
    the named text files defining the query.
    """
    out = df.copy()
    # keyword-list summary columns (constant per query)
    out['k1_count'] = len(k1list)
    out['k2_count'] = len(k2list)
    out['k1_max'] = max(len(k) for k in k1list)
    out['k2_max'] = max(len(k) for k in k2list)
    # pairwise similarity/distance measures between include and exclude text
    #jellyfish.damerau_levenshtein_distance(inc,exc)
    #jellyfish.jaro_winkler(inc,exc)
    out['inc_jaro_exc'] = jellyfish.jaro_distance(inc, exc)
    out['inc_lev_exc'] = jellyfish.levenshtein_distance(inc, exc)
    out['inc_ji_exc'] = textdistance.jaccard(inc, exc)
    out['inc_sd_exc'] = textdistance.sorensen(inc, exc)
    out['inc_ro_exc'] = textdistance.ratcliff_obershelp(inc, exc)
    return out
Exemple #27
0
    def A_vio(self, col_target, alpha=0.8):
        '''Build the N_lbl x N_lbl label "violation" matrix self.A_V.

        A pair of labels (i, j) is marked 1 when the Jaro similarity of
        their extracted strings is below *alpha*, else 0.
        '''
        logging.info('A_vio')
        # extract the target column for every data split
        J_train = self.extract(TRAIN, col_target)
        J_valid = self.extract(VALID, col_target)
        J_test = self.extract(TEST, col_target)

        J = []
        J.extend(J_train)
        J.extend(J_valid)
        J.extend(J_test)

        L = []
        L.extend(self.labels_train)
        L.extend(self.labels_test)

        # map label -> extracted string
        # NOTE(review): L covers train+test labels while J covers
        # train+valid+test values; zip silently truncates to the shorter
        # sequence, so the pairing may be misaligned — confirm.
        d = dict(set(zip(L, J)))
        n_lbl = self.N_lbl
        V = np.zeros((n_lbl, n_lbl), dtype='int16')
        print(self.N_lbl)
        # every ordered label pair; missing labels compare as empty strings
        for (i, j) in itertools.product(range(n_lbl), range(n_lbl)):
            sim = jellyfish.jaro_distance(d.get(i, ''), d.get(j, ''))
            # if i % 1000 == 0:
            #     print(i)
            if sim < alpha:
                V[i, j] = 1
        self.A_V = V
Exemple #28
0
def call_reia():
    """Poll the REIA message queue file and dispatch each request.

    Reads the first queued line, classifies it, fuzzy-maps it to a known
    command (Jaro), and either executes the command or posts suggestions
    back to the user.  Loops forever.
    """
    while (True):
        max_score = 0.1
        map_val = ""
        with open('/media/ubuntu/Local Disk/MAJOR_PROJECT/REIA/mqueue.txt',
                  'r') as f:
            first_line = f.readline()
            # NOTE(review): this busy-wait recurses into call_reia() instead
            # of re-polling in the current loop — each empty poll adds a
            # stack frame and never returns, so the stack grows without
            # bound while the queue is empty.  Confirm/replace with a plain
            # retry loop.
            while first_line == "":
                time.sleep(1)
                call_reia()
        print('-----------------------')
        # queue line format: "<username> <request text>"
        user_input = first_line.split(' ', 1)[1]
        user_name = get_username(first_line.split(' ', 1)[0])
        suggest_list = []
        suggest_message = ""
        #prev_ts = ts
        print("\nINPUT = ")
        print(user_input)
        label = classify(user_input)
        if label == "":
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            continue
        print("Classified as : " + str(label))
        tokens = nltk.word_tokenize(user_input)
        print(tokens)
        st = StanfordPOSTagger(config['tagger']['model'],
                               path_to_jar=config['tagger']['path'])
        stanford_tag = st.tag(user_input.split())
        print("Tags")
        print(stanford_tag)
        with open(MAPPING_PATH, 'r') as data_file:
            data = json.load(data_file)
        # fuzzy-match the request against every known phrase for this label
        for i in data[label]:
            dist = jf.jaro_distance(str(user_input), str(i))
            suggest_list.append(tuple((dist, i)))
            print(dist)
            if (dist > max_score):
                max_score = dist
                map_val = i
        if max_score < config['preferences']['similarity_threshold']:
            # below threshold: apologize and optionally offer suggestions
            post_message(
                "Sorry, I could not understand. Please rephrase and try again."
            )
            consume_message()
            if config['preferences']['suggestions'] == True:
                suggest = suggestions(suggest_list)
                post_message("Did you mean :")
                for i in suggest:
                    suggest_message += (str(i[1]) + "\n")
                post_message(suggest_message)
            continue
        print("\nMapped to : " + map_val)
        #post_message(map_val)
        construct_command(user_input, label, tokens, map_val, stanford_tag,
                          exec_command, user_name)
        #call('sed -i -e "1d	" REIA/mqueue.txt')
        consume_message()
def build_list(file1, file2):
    """Pair each operator in *file2* with its closest full name in *file1*.

    The operator key's suffix (after the last underscore) is fuzzy-matched
    (Jaro) against lower-cased, space-stripped full names; the resulting
    tuples are written, tab-aligned, to particle_operators.txt in the
    current directory.

    :param file1: iterable of (fullName, group) pairs
    :param file2: mapping name -> count
    :return: list of (name, partner, partnerGroup, count) tuples
    """
    operatorList = []
    for name, count in file2.items():
        # compare only the part after the final underscore
        index = name.rfind("_")
        newName = name[index + 1:]
        maxRatio = 0
        partner = ""
        partnerGroup = ""
        for fullName, group in file1:
            newFullName = fullName.lower().replace(" ", "")
            ratio = jellyfish.jaro_distance(newName, newFullName)
            if ratio > maxRatio:
                maxRatio = ratio
                partner = fullName
                partnerGroup = group
        operatorList.append((name, partner, partnerGroup, count))

    path = os.getcwd()
    with open(os.path.join(path, "particle_operators.txt"), "w") as file:
        text = ""
        # pad each column with tabs so the output lines up in 4-space stops
        for a, b, c, d in operatorList:
            aLen = len(a)
            aTabs = "\t" * (12 - int(aLen / 4))
            bLen = len(b)
            bTabs = "\t" * (16 - int(bLen / 4))
            cLen = len(c)
            cTabs = "\t" * (4 - int(cLen / 4))
            # NOTE(review): `d` is the count from file2; if counts are ints
            # this concatenation raises TypeError — confirm they are strings.
            text += a + aTabs + b + bTabs + c + cTabs + d + "\n"
        file.write(text)

    return operatorList
Exemple #30
0
def score_jaro_distance(string1, string2):
    """Jaro-score two strings against a fixed 0.90 threshold.

    :return: (flag, score) where flag is True when score > 0.90
    """
    threshold = float(0.90)
    score = jellyfish.jaro_distance(unicode(string1), unicode(string2))
    # flag is simply whether the similarity clears the threshold
    return score > threshold, score
    def match2(self, query):
        """Find the stored sentence best matching *query* with a sliding window.

        The shorter of (query, sentence) is compared against word-windows of
        the longer one using Jaro similarity.

        :return: ((best_index, best_index), best_ratio)
        """
        bestRatio = 0
        bestIndex = None
        print(len(self.sents))
        # dists = set()
        for index, target in enumerate(self.sents):
            # orient so `smaller` is the shorter string of the pair
            smaller = query
            larger = target
            if (len(smaller) > len(larger)):
                smaller, larger = larger, smaller
            smallerLength, largerLength = len(smaller), len(larger)

            words = larger.split(" ")
            offset = 0
            # matcher = SequenceMatcher(None, smaller)
            # NOTE(review): the loop bound mixes CHARACTER lengths with a
            # WORD-list offset, and the slice end below is the fixed
            # smallerLength rather than offset + window size — the window
            # shrinks as offset grows.  Looks like a bug; confirm the
            # intended windowing before changing.
            while offset <= largerLength - smallerLength:
                windowed = " ".join(words[offset:smallerLength])
                # matcher.set_seq2(windowed)
                # current = matcher.quick_ratio()
                current = jellyfish.jaro_distance(windowed, smaller)
                # current = jellyfish.jaro_winkler(windowed, smaller)
                # dists.add(current)
                if (current > bestRatio):
                    bestRatio = current
                    bestIndex = index
                offset += 1
        # print(list(reversed(sorted(list(dists))))[0:20])
        return (bestIndex, bestIndex), bestRatio
def alldist(filex, filey):
    """Read two files whole and return a 7-tuple of dissimilarity scores:
    (levenshtein, damerau-levenshtein, jaro, jaro-winkler, hamming,
    kl-divergence, spamsum), each converted so that larger = more different.

    NOTE(review): Python 2 code (print statements). The /100.00
    normalization presumably assumes edit distances stay <= 100 — confirm
    for long files. `spamsum`, `kldiv` and `tokenize` are defined elsewhere.
    """
    xread = open(filex, 'r').read()
    yread = open(filey, 'r').read()
    lvd = jellyfish.levenshtein_distance(xread,yread)
    dlvd= jellyfish.damerau_levenshtein_distance(xread,yread)
    # spamsum returns a 0..100 match score; invert it into a 0..1 distance.
    spsum = spamsum.match(xread,yread)
    spsum = 100 - spsum
    spsum = float(spsum/100.00)
#    print lvd
    res = float( lvd / 100.00 )
    dres= float(dlvd / 100.00 )
#    print res
#    print "Levenshtein Distance=",res
    jaro = jellyfish.jaro_distance(xread,yread)
## Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread,yread)
    # jaro/jaro_winkler are similarities; invert into distances.
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink
#   print "Jaro Distance = ",jaro
    ham = jellyfish.hamming_distance(xread,yread)
    ham = float ( ham / 100.00)
    print "Hamming Distance = ", ham
#	print "KL-divergence between d1 and d2:", kldiv(tokenize(d1), tokenize(d2))
#	print "KL-divergence between d2 and d1:", kldiv(tokenize(d2), tokenize(d1))
#    print "Spamsum Match score: ", spsum
    kl = kldiv(tokenize(xread), tokenize(yread))

    return res, dres , jaro, jarowink, ham, kl, spsum
Exemple #33
0
def join_by_name_distance(yelp_result, candidates, threshold=0.75):
    """Return the candidate most similar to ``yelp_result`` by Jaro score.

    Inputs:
        - yelp_result: a yelp result (name) as a string
        - candidates: a list of candidate matching names (strings)
        - threshold: minimum Jaro similarity required for a match

    Outputs:
        - match: best match for the yelp_result (string), or None when
          candidates is empty or no candidate reaches the threshold.
    """
    # Guard: PriorityQueue.get() would block forever on an empty queue.
    if not candidates:
        return None
    eligible_matches = PriorityQueue()
    # Store (1 - similarity, name) so the queue pops the BEST match first.
    for candidate in candidates:
        eligible_matches.put((1 - jaro_distance(yelp_result, candidate), candidate))
    best_distance, match = eligible_matches.get()
    # Bug fixes vs. original: it assigned the undefined name `yelp_list`
    # (NameError on every call) and compared the match *string* against the
    # numeric threshold; compare the similarity score instead.
    if 1 - best_distance >= threshold:
        return match
    return None
Exemple #34
0
def CleanVillageNames():
    """Link villages lacking a ``village_mcts_id`` to the phonetically
    closest legitimate village in the same subcenter (NYSIIS code + Jaro),
    marking matched rows with a '_m' suffix.

    NOTE(review): jellyfish.jaro_distance returns a *similarity* (1.0 =
    identical), yet the loop keeps the candidate with the MINIMUM score
    (`dist <= min_dist`), i.e. the *least* similar village — the comparison
    looks inverted. `min_dist < 1.0` then accepts any imperfect "best".
    Confirm before trusting the produced links.
    """
    import jellyfish
    subcenters = SubCenter.objects.all()
    for subc in subcenters:
        villages = Address.objects.filter(beneficiaries__subcenter=subc).distinct()
        nl_vills = villages.filter(village_mcts_id = None) 
        l_vills = villages.exclude(village_mcts_id = None)
        phonetic_codes = []
        # Phonetic (NYSIIS) codes of the legitimate villages, index-aligned
        # with l_vills.
        for l_vill in l_vills:
            phonetic_codes.append(jellyfish.nysiis(l_vill.village))
        #match the non-legitimate ones
        for nl_vill in nl_vills:
            pc = jellyfish.nysiis(nl_vill.village)
            min_dist = 100
            min_ind = 0
            ind = 0
            for spc in phonetic_codes:
                dist = jellyfish.jaro_distance(spc ,pc)
                if dist <= min_dist:
                    min_ind = ind
                    min_dist = dist
                ind +=1
            if min_dist < 1.0:
                match_vill = l_vills[min_ind]
                nl_vill.village_mcts_id = match_vill.village_mcts_id
                nl_vill.value = nl_vill.value+'_m'
                nl_vill.save()
 def interpretAsColor(self, words):
     '''tries to map to first couple of words to a color

     Slides over *words*, fuzzy-matching each (possibly multi-word) color
     name from COLORS token by token and averaging the Jaro scores.
     Returns the color as an [r, g, b, 1] list with channels scaled to
     0..1, or None when nothing clears the threshold.

     NOTE(review): returns as soon as ANY color clears the threshold (the
     inner `> 0` check always holds at that point), so this yields the
     first acceptable color, not the overall best. A namedtuple *class* is
     mutated as a record, and `eval` runs on each COLORS 'rgb' string —
     safe only if COLORS is trusted, static data. Python 2 code
     (`unicode`, list-returning `map`).
     '''
     best_match_color = collections.namedtuple('Match', 'index chance')
     best_match_color.chance = 0
     for i, word in enumerate(words):
         for j, color in enumerate(COLORS):
             matches = []
             for k, c in enumerate(color['name'].split(), start=0):
                 try:
                     m = jellyfish.jaro_distance(unicode(c),
                                                 unicode(words[i + k]))
                 except IndexError:
                     break  # end of words
                 else:
                     matches.append(m)
             match = sum(matches) / len(matches)
             if match > self.threshold and match > best_match_color.chance:
                 best_match_color.chance = match
                 best_match_color.index = j
                 if best_match_color.chance > 0:
                     rgb = map(
                         lambda x: x / 255.0,
                         list(eval(
                             COLORS[best_match_color.index]['rgb']))) + [1]
                     return rgb
     return None
def error_highlight_table(dropdown_value, data):
    """Build a table of rows where the selected column disagrees with its
    ``winter_`` counterpart, ranked by Jaro similarity (most similar
    mismatches first). Returns (records, column specs) for a Dash table."""
    df = pd.DataFrame(data)

    a = dropdown_value
    b = "winter_{a}".format(a=dropdown_value)

    # Backfill each adjacent column pair from its partner: NaNs and zeros
    # in one column are replaced by the partner's value.
    for i in range(0, len(combined_columns), 2):
        left = combined_columns[i]
        right = combined_columns[i + 1]
        df[left] = df[left].fillna(df[right])
        df[left] = df[left].mask(df[left] == 0).fillna(df[right])
        df[right] = df[right].fillna(df[left])
        df[right] = df[right].mask(df[right] == 0).fillna(df[left])

    mismatches = df.loc[df[a] != df[b], [a, b]].drop_duplicates()
    mismatches["jaro_distance"] = mismatches.apply(
        lambda row: jellyfish.jaro_distance(row[a], row[b]), axis=1)
    mismatches = mismatches.sort_values(by="jaro_distance", ascending=False)

    columns = [{'id': c, 'name': c, } for c in mismatches.columns]

    return mismatches.to_dict('records'), columns
Exemple #37
0
	def _similarTeams(self, optteam, opttable=None):
		"""Do fuzzy string matching to find similar team names.

		Scores *optteam* against both the nickname and the full team name
		of every row in the cfb table and returns the five best-scoring
		candidate dicts ({'jaro', 'team', 'id'}).
		"""

		similar = [] # empty lists to put our results in.
		# now do our sql work.
		with sqlite3.connect(self._cfbdb) as db:
			cursor = db.cursor() # select all fullnames, eid, rid.
			# NOTE(review): the column name is interpolated into the SQL
			# string (placeholders cannot bind identifiers) — opttable must
			# come from trusted code, never user input.
			cursor.execute("SELECT nn, team, %s FROM cfb" % opttable)
			rows = cursor.fetchall()
		# iterate over all rows and do math.
		for row in rows:  # row[0] = nn, row[1] = team, row[2] (what we're looking for.)
			similar.append({'jaro':jellyfish.jaro_distance(optteam, row[0]), 'team':row[1], 'id':row[2]})
			similar.append({'jaro':jellyfish.jaro_distance(optteam, row[1]), 'team':row[1], 'id':row[2]})
		# sort descending by jaro score and keep the top five matches.
		matching = sorted(similar, key=itemgetter('jaro'), reverse=True)[0:5] # top five.
		# return matching now.
		return matching
Exemple #38
0
    def jaro_apply(x):
        """Jaro similarity of the pair ``(x[0], x[1])``.

        Returns NaN when either side is a missing value; any other failure
        is re-raised unchanged.
        """
        try:
            return jellyfish.jaro_distance(x[0], x[1])
        except Exception as err:
            either_missing = pandas.isnull(x[0]) or pandas.isnull(x[1])
            if either_missing:
                return np.nan
            raise err
Exemple #39
0
def jaroWinklerDistanceAffiliation(authorId, paperId):
	"""Jaro similarity between an author's recorded affiliation and the
	affiliation on the paper-author link; 0.5 (neutral) when either name
	field is missing.

	NOTE(review): despite the function name this uses jaro_distance, not
	Jaro-Winkler; and the guard checks the *name* fields while the
	comparison uses the *affiliation* fields — confirm both are intended.
	Python 2 code (`__builtin__` globals).
	"""
	authors = __builtin__.authors
	paperauthor = __builtin__.paperauthor

	if authors[authorId]['name'] and paperauthor[paperId][authorId]['authorName']:
		return jellyfish.jaro_distance(authors[authorId]['affiliation'], 
								       paperauthor[paperId][authorId]['affiliation'])
	else:
		return 0.5
def fuzzy_match(s1, s2, max_dist=.9):
    """Return ``(is_match, distance)``: *distance* is the Jaro similarity
    of *s1* and *s2*, and *is_match* is True when it is at least
    ``max_dist``.

    Note: despite its name, ``max_dist`` acts as a *minimum similarity*
    threshold (jaro_distance returns a similarity in [0, 1]).
    Any failure (e.g. non-string input) yields ``(False, 0)``.
    """
    try:
        distance = jellyfish.jaro_distance(s1, s2)
        is_match = distance >= max_dist
    except Exception:
        # Was a bare ``except:`` — narrowed so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        is_match = False
        distance = 0

    return is_match, distance
Exemple #41
0
    def test_jaro_distance(self):
        """Spot-check jaro_distance against known reference values."""
        expected = {
            ("dicksonx", "dixon"): 0.767,
            ("dixon", "dicksonx"): 0.767,
            ("martha", "marhta"): 0.944,
            ("dwayne", "duane"): 0.822,
        }
        for (left, right), value in expected.items():
            self.assertAlmostEqual(
                jellyfish.jaro_distance(left, right), value, places=3)
Exemple #42
0
 def _fuzzy_match(self, term, text):
     """
     Fuzzy match on phrases: True as soon as any n-gram of *text* reaches
     the fuzzy threshold against *term*, otherwise False.
     """
     n = phrase_grams(term)
     return any(
         jellyfish.jaro_distance(term, gram) >= self.fuzzy_threshold
         for gram in tokenizer(text, n))
 def compare_two_texts(self, string_a, string_b):
     """
     Compare two string and return the value of Jaro algorithm
     the value is normalized between 0 and 1 values.

     Raises TypeError unless both arguments are unicode or both are str
     (Python 2 type model — mixed str/unicode pairs are rejected).
     """
     if ((isinstance(string_a, unicode) and isinstance(string_b, unicode)) or
             (isinstance(string_a, str) and isinstance(string_b, str))):
         return jellyfish.jaro_distance(string_a, string_b)
     else:
         raise TypeError
    def flag(self, text):
        """Very simple check for naughty words: sum (score * weight) over
        all fuzzy matches (Jaro > 0.7) between the text's words and the
        weighted word list, and compare against the threshold."""
        tokens = text.lower().split()
        total_weight = 0
        for naughty, weight in self.words.items():
            for token in tokens:
                score = jellyfish.jaro_distance(token, naughty)
                if score > 0.7:
                    total_weight += score * weight

        return total_weight > self.threshold
def getCityStateResolved(address):
    """Resolve a free-text address to a canonical (city, state) pair.

    Extracts a city/state guess via getCityState, then scores every known
    (city, state) row by the average of the city and state Jaro
    similarities (case-insensitive) and returns the best-scoring pair.
    """
    dfCities = pd.read_csv("./locdata/google_cities_nodups.csv")
    dfStates = pd.read_csv("./locdata/google_states.csv")
    city, state = getCityState(address)

    ## Merge the two dfs
    df_merged = pd.merge(dfCities, dfStates, left_on='SID', right_on='ID')

    ##Add two new columns for jaro scores in the df
    df_merged['jaroCity'] = df_merged['city'].map(lambda x: jf.jaro_distance(x.lower(), city.lower()))
    df_merged['jaroState'] = df_merged['state'].map(lambda x: jf.jaro_distance(x.lower(), state.lower()))

    df_merged['jaroFinal'] = 0.5*df_merged['jaroCity'] + 0.5*df_merged['jaroState']
    ##Select those rows whose jaro is over a threshold
    ##df_merged = df_merged[df_merged['jaroFinal'] > 0.9]

    ##Find row with max value for jaro
    # .loc replaces the removed DataFrame.ix indexer: idxmax() returns a
    # row label, so label-based .loc is the correct equivalent.
    resolvedCityState = df_merged.loc[df_merged['jaroFinal'].idxmax()]

    return resolvedCityState['city'], resolvedCityState['state']
 def findParameters(self, words, effectname):
     ''' only check words after a parameter indicator

     Starts from the effect's default parameters and overwrites every
     parameter whose name fuzzy-matches (Jaro > self.threshold) one of the
     words; the new value is interpreted from the words *after* the match.
     Python 2 code (`unicode`).
     '''
     parameters = LEDMaster.getDefaultParameters(
         effectname)  # always load default
     for i, word in enumerate(words):
         for j, p in enumerate(parameters.keys()):
             match = jellyfish.jaro_distance(unicode(p), unicode(word))
             if match > self.threshold:
                 value = self.understandParameterValue(
                     p, parameters[p], words[i+1:])
                 parameters[p] = value
     return parameters
 def findAreas(self, words):
     '''find every word that could indicate the area.
     TODO: handle numbers, handle aliases, choose lowest granularity'''
     found = []
     # TODO detect stuff like Balken1, Wand2
     for token in words:
         for area in AREAS:
             if area[-1] in '1234':  # skip Balken1, 2 and so on
                 continue  # TODO detect if number is attached
             if area in found:
                 continue
             if jellyfish.jaro_distance(unicode(area), unicode(token)) > self.threshold:
                 found.append(area)
     return found
def fgen_normname_tokens(x, y):
    """Jaro similarity of the two normalized names (second tuple field)
    with their final token dropped; returns -1.0 when the token signatures
    have equal length but disagree."""
    left_tokens = x[1].split(' ')
    right_tokens = y[1].split(' ')
    left_sig = get_signature_from_tokens(x[2])
    right_sig = get_signature_from_tokens(y[2])
    if len(left_sig) == len(right_sig) and left_sig != right_sig:
        return -1.0
    return jellyfish.jaro_distance(
        ' '.join(left_tokens[:-1]), ' '.join(right_tokens[:-1]))
Exemple #49
0
def get_similar(queryset, criteria):
    """Return the id of the queryset element whose name is most similar
    (Jaro) to *criteria*, or 0 when the queryset is empty or the best
    score does not beat JARO_SIMULARITY.

    NOTE(review): Python 2 — ``.encode("utf8")`` feeds byte strings to
    jaro_distance; the 0 sentinel would collide with a real element id of
    0 if one existed.
    """
    if not queryset.count():
        return 0
    result = {
        element.id: jellyfish.jaro_distance(element.name.encode("utf8"), criteria.encode("utf8"))
        for element in queryset
    }
    max_simular = max(result, key=result.get)
    if result[max_simular] > JARO_SIMULARITY:
        return max_simular
    else:
        return 0
Exemple #50
0
def findClosest(g, actor):
	import jellyfish
	i = -1
	maxMatch = -1
	rs = None
	for node in g.nodes():
		tmp = jellyfish.jaro_distance(node, actor)
		if tmp>maxMatch:
			rs = node
			maxMatch = tmp

	print actor + " is matched with value " + str(maxMatch) + " against " + rs
	return rs
 def getParameterIndicatorIndex(self, words):
     '''return index of word which could mostly indicate the beginning of parameter inputs

     The best fuzzy match (Jaro > self.threshold) against
     self.parameters_indicators wins; an exact match returns immediately.
     Falls back to index 0 when nothing matches. Python 2 (`unicode`).
     '''
     best_match = 0
     best_index = 0
     for i, word in enumerate(words):
         for pi in self.parameters_indicators:
             match = jellyfish.jaro_distance(pi, unicode(word))
             if match > self.threshold and match > best_match:
                 best_match = match
                 best_index = i
                 if match == 1:
                     return i  # shortcut, no need to search for more
     return best_index
    def flag(self, text):
        """Very simple check for naughty words.

        Returns True when the summed (score * weight) of fuzzy matches
        (Jaro > 0.7) between the text's words and self.words exceeds
        self.threshold.
        """
        # Normalize diacritic characters into ASCII since current version of 
        # jaro_distance cannot handle them.
        normalized_text = ''.join((c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn'))
        total_weight = 0
        words = normalized_text.lower().split()        
        for naughty in self.words:
            for word in words:
                score = jellyfish.jaro_distance(word, naughty)
                if score > 0.7:
                    total_weight = total_weight + (score * self.words[naughty])

        return total_weight > self.threshold
Exemple #53
0
    def flag(text, threshold, words):
        """Very simple check for naughty words.

        *words* maps a naughty word to its weight; returns True when the
        summed (score * weight) of fuzzy matches (Jaro > 0.7) exceeds
        *threshold*. ``force_unicode`` is Django's coercion helper.
        """
        # Normalize diacritic characters into ASCII since current version of
        # jaro_distance cannot handle them.
        normalized_text = unicodedata.normalize('NFKD', force_unicode(text)).encode('ascii', 'ignore')
        total_weight = 0
        lwords = normalized_text.lower().split()
        for naughty in words:
            for word in lwords:
                score = jellyfish.jaro_distance(word, naughty)
                if score > 0.7:
                    total_weight = total_weight + (score * words[naughty])

        return total_weight > threshold
Exemple #54
0
def autocorrect(request, word):
    """Return (as an HttpResponse) every stored Hindi word whose Jaro
    similarity to *word* exceeds 0.85.

    Bug fixes vs. original: the score variable was left unbound (NameError)
    or stale when jaro_distance raised — a failed comparison could re-use
    the previous word's score and append phantom matches; it also issued a
    second redundant ``.count()`` query just to drive the loop.
    """
    auto = []
    words = list(Master_HindiWords.objects.all())
    for candidate in words:
        try:
            score = jellyfish.jaro_distance(word, str(candidate))
        except Exception:
            log.exception("Something wrong with call")
            continue  # skip this candidate instead of reusing a stale score
        if score > 0.85:
            auto.append(str(candidate))
    return HttpResponse(auto)
def similarityMeasures(row1, row2):
	"""Per-row similarity summary for columns 1..14 (column 0 = id is
	skipped): a CSV string of the four averaged similarity measures (jaro,
	jaro-winkler, normalized levenshtein, normalized damerau-levenshtein)
	followed by the 14 per-column jaro scores, each "%.6f"-formatted."""
	jaro_scores = []
	jw_scores = []
	lev_scores = []
	dl_scores = []

	for col in range(1, 15):  # skips id column
		a = row1[col]
		b = row2[col]
		longest = float(max(len(a), len(b)))
		jaro_scores.append(jellyfish.jaro_distance(a, b))
		jw_scores.append(jellyfish.jaro_winkler(a, b))
		lev_scores.append(1 - jellyfish.levenshtein_distance(a, b) / longest)
		dl_scores.append(1 - jellyfish.damerau_levenshtein_distance(a, b) / longest)

	returnV = "%.6f,%.6f,%.6f,%.6f" % (
		sum(jaro_scores) / 14.0,
		sum(jw_scores) / 14.0,
		sum(lev_scores) / 14.0,
		sum(dl_scores) / 14.0)

	# Append the per-column jaro scores (same values as computed above).
	for score in jaro_scores:
		returnV += ",%.6f" % (score)

	return returnV
    def findEffectName(self, words):
        '''search only in words before buzzword paramerers

        Returns the effect from self.effect_choices whose name best
        fuzzy-matches (Jaro > self.threshold) any word.

        NOTE(review): a namedtuple *class* is used as a mutable record; if
        no word clears the threshold, the final `best_match_effect.effect`
        raises AttributeError — confirm callers guarantee a match.
        Python 2 (`unicode`).
        '''
        # find effect name
        # word with highest match and before keyword parameters
        choices = self.effect_choices
        best_match_effect = collections.namedtuple('Match', 'effect chance')
        best_match_effect.chance = 0
        for effect in choices:
            for i, word in enumerate(words):
                match = jellyfish.jaro_distance(unicode(effect), unicode(word))
                if match > best_match_effect.chance and match > self.threshold:
                    best_match_effect.chance = match
                    best_match_effect.effect = effect

        return best_match_effect.effect
Exemple #57
0
def string_compare(str1, str2, method='JARO'):
    ''' (string, string, string) -> double
    returns the similarity of str1 and str2 according to the method: LEV or JARO

    Note: "LEV" returns a raw edit distance (0 = identical, larger = more
    different) on lowercased input, while "JARO" returns a similarity in
    [0, 1] on the original case — the two scales are not comparable. An
    unrecognized method prints an error and implicitly returns None.
    '''

    if method == "LEV":
        # computes Levnenshtein distance which is an integer larger or equal to zero
        # return jellyfish.levenshtein_distance(str1,str2)
        return jellyfish.levenshtein_distance(str1.lower(), str2.lower())

    if method == "JARO":
        # computes the Jaro similarity which is always between 0 and 1
        # (this is jaro_distance, not the Jaro-Winkler variant)
        return jellyfish.jaro_distance(str1, str2)

    print("ERROR: Choose the right string similarity measure : LEV or JARO")
Exemple #58
0
def distance(string_1, string_2):
    """Compute the edit distance between two strings.

    Returns a JSON response bundling several similarity/distance metrics.
    NOTE(review): the "match_rating_codex" key actually carries the
    boolean result of match_rating_comparison, not the codex itself —
    left untouched because response keys are part of the API.
    """
    return jsonify({
        "levenshtein": jellyfish.levenshtein_distance(string_1, string_2),
        "damerau-levenshtein": jellyfish.damerau_levenshtein_distance(
            string_1,
            string_2
        ),
        "jaro": jellyfish.jaro_distance(string_1, string_2),
        "jaro-winkler": jellyfish.jaro_winkler(string_1, string_2),
        "match_rating_codex": jellyfish.match_rating_comparison(
            string_1,
            string_2
        ),
        "sift3": pymailcheck.sift3_distance(string_1, string_2),
    })
Exemple #59
0
def stringDistance(str1, str2):
  """
  Return distance between two strings
    String distance : jaro + levenshtein + damerau

  Combined score = 0.5 * jaro similarity
                 + 0.25 * (1 - levenshtein / norm)
                 + 0.25 * (1 - damerau / norm),
  with norm = length of the longer string; 0 when either input is empty.
  (Python 2 code — inputs are byte strings decoded to unicode.)
  """
  distance = 0
  if len(str1) > 0 and len(str2) > 0:
    str1 = str1.decode('utf-8')
    str2 = str2.decode('utf-8')

    jaro = jellyfish.jaro_distance(str1, str2)
    leven = jellyfish.levenshtein_distance(str1, str2)
    damerau = jellyfish.damerau_levenshtein_distance(str1, str2)

    # float() guards against Python 2 integer division, which floored
    # leven/norm and damerau/norm to 0 for most inputs and skewed the
    # combined score toward 1.
    norm = float(max(len(str1), len(str2)))
    distance = 0.5 * jaro + 0.25 * (1 - leven / norm)   \
                          + 0.25 * (1 - damerau / norm)

  return distance
Exemple #60
0
def index(request):
    """Debug-only view for de-duplicating Person records.

    Optionally merges the people listed in the ``combine`` GET parameters
    into the person given by ``into``, then lists every unmerged Person,
    ranked by Jaro similarity of name_hash to the person given by ``id``
    (when present).

    NOTE(review): Python 2 code (print statement, list-returning map).
    Returns None (not an HttpResponse) when DEBUG is off — presumably
    acceptable for a debug endpoint, but confirm.
    """
    
    if not DEBUG:
        return
    
    DEFAULT_DISTANCE = 0
        
    person_into = request.GET.get('into', False)
    # ids requested for merging; the merge target itself is removed below.
    victims = map(lambda x: int(x), request.GET.getlist('combine'))
    if person_into is not False:
        victims.remove(int(person_into))
        args_array = [person_into] + victims
        # call_command('mail_combine_people', *args_array)
        combcomm = CombineCommand()
        print person_into, victims
        result = combcomm.merge(person_into, victims, noprint=True)
    
    
    people = []
    for p in Person.objects.filter(merged_into=None).order_by('name_hash'):
        people.append({'obj': p, 'dist': DEFAULT_DISTANCE})
    
    target_person = None
    target_id = request.GET.get('id', False)
    if target_id is not False:
        target_person = Person.objects.get(id=target_id)
    
        if target_person:
            # Rank everyone by name-hash similarity to the target person.
            for (i,p) in enumerate(people):
                people[i]['dist'] = jellyfish.jaro_distance(target_person.name_hash, p['obj'].name_hash)
            people.sort(key=lambda x: x['dist'], reverse=True)
    
    total = len(people)
    
    template_vars = {
        'people': people,
        'total': total
    }
    
    return render_to_response('dedupe.html', template_vars, context_instance=RequestContext(request))