def processText4(directory, newfile):
    for i in range(1, 41):
        filename = '{0}data{1}.json'.format(directory, i)
        if not os.path.isfile(filename):
            continue
        with open(filename, 'r') as f:
            data = json.load(f)
        keyPressData, keyReleaseData = data['keyPressData'], data['keyReleaseData']
        if len(keyPressData) > 15 and len(keyReleaseData) > 15:
            keyPressData, keyReleaseData = removeExtraKeys(
                dataProcessKeyPress(keyPressData)), removeExtraKeys(
                    dataProcessKeyRelease(keyReleaseData))
            keyPressData, keyReleaseData = swapShift(
                *removeExtraShift(keyPressData, keyReleaseData))
            kp = ''.join([k.split('-')[1] for k in keyPressData]).replace('\'', '')
            kr = ''.join([k.split('-')[1] for k in keyReleaseData]).replace('\'', '')
            pressString = [
                r'worldKey.shift%99,Key.space12Key.shift@hello;Key.spacewhy.not72,Key.spaceKey.shift#dream5Key.shift$0Key.shift*[email protected]*Key.space32greatKey.shift#Key.shift%have',
                r'worldKey.shift_r%99,Key.space12Key.shift_r@hello;Key.spacewhy.not72,Key.spaceKey.shift_r#dream5Key.shift_r$0Key.shift_r*[email protected]_r*Key.space32greatKey.shift_r#Key.shift_r%have'
            ]
            if (sm(None, pressString[0], kp).ratio() > 0.90
                    or sm(None, pressString[1], kp).ratio() > 0.90):
                keyPressData, keyReleaseData = extractTimings(keyPressData,
                                                              keyReleaseData)
                dt = dict(keyPressData=keyPressData, keyReleaseData=keyReleaseData)
                with open(newfile, 'a') as f:
                    json.dump(dt, f)
                    f.write(os.linesep)
def saveImg(data, base_host, base_url, base_path, image_title): maxval = 0 maxurl = "" imglinks = data.findAll("img") for link in imglinks: img_size = getsize(link['src'].strip()) sequence_match_ratio = 0 if 'alt' in link.attrs: sequence_match_ratio = sm(None, str(link['alt']), base_url.replace("http", "")).ratio() if sequence_match_ratio > maxval and img_size > 99999: maxval = sm(None, str(link['alt']), base_url).ratio() maxl = link maxurl = link['src'].strip() if "http" not in str(maxurl): maxurl = base_host.split("//")[0] + maxurl try: # uncomment if proxy settings # image = requests.get(maxurl, proxies=proxyDict) # comment if proxy settings image = requests.get(maxurl) print("Images Saved Successfully") except: print("Error") exit(0) saveImageToFile(base_path, image_title, image) return maxl
def saveImg(data, base_host, base_url, base_path, image_title): maxval = 0 maxurl = "" imglinks = data.findAll("img") check_link = base_host.split("//")[1] if len(check_link.split('.')) > 2: check_link = check_link.split('.')[1] + "." + check_link.split('.')[2] for link in imglinks: if check_link in str(link['src']) and sm(None, str(link['src']), base_url.replace("http", "")).ratio() > maxval: maxval = sm(None, str(link['src']), base_url).ratio() maxl = link maxurl = link['src'] if maxurl == '': for link in imglinks: if sm(None, str(link['src']), base_url.replace("http", "")).ratio() > maxval: maxval = sm(None, str(link['src']), base_url).ratio() maxl = link maxurl = link['src'] if "http" not in str(maxurl): maxurl = base_host.split("//")[0] + maxurl try: image = requests.get(maxurl, proxies=proxyDict) print "Images Saved Successfully" except: print "Error " exit(0) file = open(os.path.join(base_path, "%s.jpg") % image_title, 'wb') try: Image.open(StringIO(image.content)).save(file, 'JPEG') except IOError, e: print "Couldnt Save:", e
def saveImg(data): maxval = 0 maxurl = "" imglinks = data.findAll("img") check_link = BASE_HOST.split("//")[1] if len(check_link.split('.')) > 2: check_link = check_link.split('.')[1] + "." + check_link.split('.')[2] for link in imglinks: if check_link in str(link['src']) and sm(None, str(link['src']), BASE_URL.replace("http", "")).ratio() > maxval: maxval = sm(None, str(link['src']), BASE_URL).ratio() maxl = link maxurl = link['src'] if maxurl == '': for link in imglinks: if sm(None, str(link['src']), BASE_URL.replace("http", "")).ratio() > maxval: maxval = sm(None, str(link['src']), BASE_URL).ratio() maxl = link maxurl = link['src'] if "http" not in str(maxurl): maxurl = BASE_HOST.split("//")[0] + maxurl try: image = requests.get(maxurl, proxies=proxyDict) print "Images Saved Successfully" except: print "Error " exit(0) file = open(os.path.join(BASE_PATH, "%s.jpg") % IMAGE_TITLE, 'wb') try: Image.open(StringIO(image.content)).save(file, 'JPEG') except IOError, e: print "Couldnt Save:", e
def get_valid_image_obj(self, img_objs):
    """
    Find the valid BeautifulSoup image object from the various different
    image objects present in the current page or URL, based on string
    matching of the image link and base_url, and the size of the image
    in the image obj.

    :param img_objs: List of all BeautifulSoup image objects present in
        the current page
    """
    maxval = 0
    match_obj = None
    for img_obj in img_objs:
        img_size = get_content_size(img_obj['src'].strip(), self.proxy_dict)
        if 'alt' in img_obj.attrs:
            sequence_match_ratio = sm(None, str(img_obj['alt']),
                                      self.base_url.replace("http", "")).ratio()
            if sequence_match_ratio > maxval and img_size > 99999:
                maxval = sm(None, str(img_obj['alt']), self.base_url).ratio()
                match_obj = img_obj
    return match_obj
def downloader(songs, lang):
    for song, album in songs:
        print 'http://music.vidmate.mobi/search-' + song.strip().replace(
            ' ', '%20') + '%20' + album.strip().replace(' ', '%20') + '.html'
        response = urllib2.urlopen(
            'http://music.vidmate.mobi/search-' + song.strip().replace(' ', '%20')
            + '%20' + album.strip().replace(' ', '%20') + '.html')
        htmlsrc = response.read()
        htmlsrc = htmlsrc.split('id="music-search-song-container">', 1)[1]
        results = htmlsrc.split('music-song-search-item-open')
        for result in results:
            try:
                ps = result.split('<p')
                s = ps[1].split('</p>', 1)[0][1:]
                a = ps[2].split('<a', 1)[1].split('>', 1)[1].split(
                    '</a>', 1)[0].split('|')[0].strip()
                link = 'http://music.vidmate.mobi' + ps[2].split(
                    '<a', 1)[1].split('>', 1)[0].split('href="')[1].split('">')[0]
                # print a, s
                # print sm(None, song, s).ratio(), sm(None, album, a).ratio()
                if sm(None, song, s).ratio() > 0.8 and sm(None, album, a).ratio() > 0.8:
                    print s
                    download(link, s, lang)
                    break
            except:
                pass
def processText1(directory, newfile):
    for i in range(1, 41):
        filename = '{0}data{1}.json'.format(directory, i)
        if not os.path.isfile(filename):
            continue
        with open(filename, 'r') as f:
            data = json.load(f)
        keyPressData, keyReleaseData = data['keyPressData'], data['keyReleaseData']
        if len(keyPressData) > 6 and len(keyReleaseData) > 6:
            keyPressData, keyReleaseData = removeExtraKeys(
                dataProcessKeyPress(keyPressData)), removeExtraKeys(
                    dataProcessKeyRelease(keyReleaseData))
            keyPressData, keyReleaseData = swapShift(
                *removeExtraShift(keyPressData, keyReleaseData))
            kp = ''.join([k.split('-')[1] for k in keyPressData]).replace('\'', '')
            kr = ''.join([k.split('-')[1] for k in keyReleaseData]).replace('\'', '')
            pressString = [
                'abuKey.shift@9,l12Key.shift$',
                'abuKey.shift_r@9,l12Key.shift_r$'
            ]
            if (sm(None, pressString[0], kp).ratio() > 0.95
                    or sm(None, pressString[1], kp).ratio() > 0.95):
                keyPressData, keyReleaseData = extractTimings(keyPressData,
                                                              keyReleaseData)
                dt = dict(keyPressData=keyPressData, keyReleaseData=keyReleaseData)
                with open(newfile, 'a') as f:
                    json.dump(dt, f)
                    f.write(os.linesep)
def songsAreSame(s1, s2):
    from difflib import SequenceMatcher as sm
    # Idea credit: https://bigishdata.com/2016/10/25/
    seqA = sm(None, s1.lyrics, s2['lyrics'])
    if seqA.ratio() > 0.4:
        seqB = sm(None, s2['lyrics'], s1.lyrics)
        return seqA.ratio() > 0.5 or seqB.ratio() > 0.5
    return False
def MatchWord(word):
    ratio = 0
    nearestWord = "0"
    for exitsWord in words.keys():
        r = sm(None, word, exitsWord).ratio()
        if ratio < r:
            ratio = r
            nearestWord = exitsWord
    if ratio >= 0.8:
        return nearestWord
    else:
        return "0"
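# Usage sketch for MatchWord (hypothetical): `words` is the module-level dict
# the function iterates over; the vocabulary below is made up for the demo.
words = {"apple": 1, "banana": 2}

print(MatchWord("aple"))  # -> "apple" (ratio 8/9 ~ 0.89, above the 0.8 cutoff)
print(MatchWord("kiwi"))  # -> "0" (no candidate reaches 0.8)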
def geocode(origin, destination, sensor):
    geo_args = {
        'origin': origin,
        'destination': destination,
        'sensor': sensor
    }
    url = GEOCODE_BASE_URL + '?' + urllib.urlencode(geo_args)
    result = simplejson.load(urllib.urlopen(url))
    res = {}
    status = False
    if simplejson.dumps(result['status']) == "\"OK\"":
        leg = result['routes'][0]['legs'][0]
        duration = simplejson.dumps(leg['duration']['value']).strip("\"")
        distance = simplejson.dumps(leg['distance']['value']).strip("\"")
        # conversions
        duration = float(duration) / 60    # seconds -> minutes
        distance = float(distance) / 1000  # metres -> kilometres
        # string comparison
        start_index = sm(None, origin, leg['start_address']).ratio()
        end_index = sm(None, destination, leg['end_address']).ratio()
        if (start_index < THRESHOLD) or (end_index < THRESHOLD):
            res['duration'] = "%.0f *" % duration
            res['distance'] = "%.1f *" % distance
            print >> sys.stderr, "WARNING! Some cities may not have been " \
                "found.\n Requested origin: \"%s\", suggested origin" \
                ": \"%s\".\n Requested destination: %s, " \
                " suggested destination: \"%s\"" % (origin, leg['start_address'],
                                                    destination,
                                                    leg['end_address'])
        else:
            res['duration'] = "%.0f" % duration
            res['distance'] = "%.1f" % distance
        status = True
    elif simplejson.dumps(result['status']) == "\"ZERO_RESULTS\"":
        res['duration'] = "?"
        res['distance'] = "?"
        print >> sys.stderr, "Error in %s -> %s" % (origin, destination)
        status = True
    else:
        res['duration'] = "?"
        res['distance'] = "?"
        print >> sys.stderr, "Error in %s -> %s" % (origin, destination)
        print >> sys.stderr, simplejson.dumps(result['status'])
    return res, status, simplejson.dumps(result['status'])
def processText2(directory, newfile):
    for i in range(1, 41):
        filename = '{0}data{1}.json'.format(directory, i)
        if not os.path.isfile(filename):
            continue
        with open(filename, 'r') as f:
            data = json.load(f)
        keyPressData, keyReleaseData = data['keyPressData'], data['keyReleaseData']
        if len(keyPressData) > 15 and len(keyReleaseData) > 15:
            keyPressData, keyReleaseData = removeExtraKeys(
                dataProcessKeyPress(keyPressData)), removeExtraKeys(
                    dataProcessKeyRelease(keyReleaseData))
            kp = ''.join([k.split('-')[1] for k in keyPressData]).replace('\'', '')
            kr = ''.join([k.split('-')[1] for k in keyReleaseData]).replace('\'', '')
            pressString = [
                'theKey.spacepersonKey.spaceandKey.spacegreatKey.spaceforKey.spacegovernmentKey.spaceknowKey.spaceskillKey.spacenewKey.spacehaveKey.spaceyearKey.spaceevenKey.spaceaboutKey.spacefromKey.spaceforKey.spacemakeKey.spacewhichKey.spacepeopleKey.spacehowKey.spacenot'
            ]
            if sm(None, pressString[0], kp).ratio() > 0.95:
                keyPressData, keyReleaseData = extractTimings(keyPressData,
                                                              keyReleaseData)
                dt = dict(keyPressData=keyPressData, keyReleaseData=keyReleaseData)
                with open(newfile, 'a') as f:
                    json.dump(dt, f)
                    f.write(os.linesep)
def cmp_list(list1: list, list2: list):
    """
    Compare the similarity of two lists.

    :param list1:
    :param list2:
    :return: similarity score
    """
    len1 = len(list1)
    len2 = len(list2)
    list1.sort()
    list2.sort()
    mark = 0
    if len1 <= len2:
        min_l = list1
        max_l = list2
    else:
        max_l = list1
        min_l = list2
    for s in min_l:
        if s in max_l:
            mark += 1
        else:
            for t in max_l:
                if sm(None, s.lower(), t.lower()).ratio() > 0.9:
                    mark += 1
                    break  # count each element at most once
    return (mark / len1 + mark / len2) * 0.5
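# Worked example (hypothetical data) for cmp_list: "Meg Ryan" matches exactly,
# "Tom Hanks" only matches "tom hanks" through the fuzzy (> 0.9) branch, so
# mark == 2 and the score is (2/2 + 2/3) * 0.5 ~ 0.83.
print(round(cmp_list(["Tom Hanks", "Meg Ryan"],
                     ["tom hanks", "Meg Ryan", "Extra"]), 2))  # 0.83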
def cmp_items(item1: dict, item2: dict):
    """
    Compare the similarity of two items.

    :param item1:
    :param item2:
    :return:
    """
    # title similarity
    r_title = sm(None, item1[TITLE].lower(), item2[TITLE].lower()).ratio()
    # directors
    if DIRECTORS in item1 and DIRECTORS in item2:
        r_dir = Deduplicate.cmp_list(item1[DIRECTORS], item2[DIRECTORS])
    else:
        r_dir = 0.5
    # actors
    if ACTORS in item1 and ACTORS in item2:
        r_act = Deduplicate.cmp_list(item1[ACTORS], item2[ACTORS])
    else:
        r_act = 0.5
    # comment count
    if item1[COUNT] == item2[COUNT] and item1[COUNT] != 0:
        r_count = 0.3
    else:
        r_count = 0
    total = 0.4 * r_title + 0.3 * r_dir + 0.3 * r_act + r_count
    return total
def get_similarity(a, b):
    """
    :param a: string
    :param b: string
    :return: similarity ratio between a and b, in [0, 1]
    """
    return sm(None, a, b).ratio()
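# Quick check for get_similarity: SequenceMatcher.ratio() is 2*M/T, where M is
# the number of matched characters and T the combined length of both strings.
# For "color"/"colour", M = 5 and T = 11, giving 10/11.
print(get_similarity("color", "colour"))  # 0.9090...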
def find_route_list(list_eng, route, check=0):
    # check determines whether the function finds the number of matched
    # destinations (check = 0) or collects the matched destinations (check = 1)
    count = 0
    index = 0
    for i in range(len(list_eng)):
        ratio = 0
        for j in range(len(route)):
            # similarity ratio between the strings
            test_ratio = sm(None, list_eng[i], route[j]).ratio()
            # Levenshtein distance between the strings
            val = td.levenshtein(list_eng[i], route[j])
            if (test_ratio > 0.73) and (test_ratio > ratio) and (val < 5):
                ratio = test_ratio
                index = j  # index of most probable destination
        if ratio > 0.73:
            if check == 0:
                # print(list_eng[i], " has ratio ", ratio, "with", route[index], " AT", index, "\n")
                count = count + 1
            elif check == 1:
                list_final.append(route[index])
    if check == 0:
        count_final.append(count)
def sum_news_sen(sum_org, news_org, news_con, summary):
    news_idx = []
    extr_sum = []
    reference = []
    core_sim = []
    for i in range(len(sum_org)):
        max_news_idx = []
        temp_sum = []
        sum_con = []
        con = []
        temp_max = []
        for j in range(len(sum_org[i])):
            sim_ratio = []
            for k in range(len(news_org[i])):
                sim_ratio.append(sm(None, sum_org[i][j], news_org[i][k]).ratio())
            idx = [m[0] for m in sorted(enumerate(sim_ratio),
                                        key=lambda x: x[1], reverse=True)]
            mx = max(sim_ratio)
            temp_max.append(mx)
            for m in range(len(idx)):
                if idx[m] not in max_news_idx:
                    max_news_idx.append(idx[m])
                    temp_sum += news_con[i][idx[m]]
                    break
            con += summary[i][j]
        # print 'Most rep news sens for doc {} is {}'.format(i, max_news_idx)
        core_sim.append(temp_max)
        sum_con.append(con)
        reference.append(sum_con)
        extr_sum.append(temp_sum)
        news_idx.append(max_news_idx)
    return news_idx, extr_sum, reference, core_sim
def is_same_name(a, b):
    for (x, y) in [(a, b), (b, a)]:
        for word in x.split():
            if word in y:
                return True
    # compare the (unordered) character sets of the two names
    if sm(a=list(set(b)), b=list(set(a))).ratio() > 0.7:
        return True
    return False
def org_sim(news_org):
    sim_mat = []
    for i in range(len(news_org)):
        row_sim = []
        for j in range(len(news_org)):
            row_sim.append(sm(None, news_org[i], news_org[j]).ratio())
        sim_mat.append(row_sim)
    return sim_mat
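# Minimal demo (hypothetical data) for org_sim: the result is a symmetric
# similarity matrix with 1.0 on the diagonal; "abc" and "abd" share 2 of 3
# characters, so their ratio is 2*2/6.
print(org_sim(["abc", "abd"]))  # [[1.0, 0.666...], [0.666..., 1.0]]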
def pmit(phrase, text, min_word_size=3):
    if not (isinstance(phrase, str) and isinstance(text, str)
            and isinstance(min_word_size, int)):
        print("One of the arguments is not valid")
        input("")
        return
    non_relevant_char = [
        ",", ".", ":", ";", "?", "/", "'", '"', "!", "<", ">", "@", "#", "$",
        "%", "^", "&", "*", "(", ")", "_", "=", "+", "~", "`"
    ]
    # drop words shorter than min_word_size (filter instead of removing from
    # the list while iterating over it, which skips elements)
    phrase = [word for word in phrase.split() if len(word) > min_word_size - 1]
    text = [word for word in text.split() if len(word) > min_word_size - 1]
    phrase = " ".join(phrase)
    text = " ".join(text)
    # str.replace returns a new string, so the result must be assigned back
    for character in non_relevant_char:
        text = text.replace(character, " ")
        phrase = phrase.replace(character, " ")
    while text.find("  ") != -1:
        text = text.replace("  ", " ")
    while phrase.find("  ") != -1:
        phrase = phrase.replace("  ", " ")
    phrase = phrase.split()
    if text.find("\n") == -1:
        paragraphs = [text]
    else:
        paragraphs = text.splitlines()
    highest_match_ratio = [0]
    for paragraph in paragraphs:
        paragraph = paragraph.split()
        matches = []
        for pword in phrase:
            close_matches = gcm(pword, paragraph)
            for index in range(0, len(paragraph) - 1):
                if (paragraph[index] in close_matches) and (index not in matches):
                    matches.append(index)
        for match in matches:
            if match - int(len(phrase) / 2) < 0:
                begin = 0
            else:
                begin = match - int(len(phrase) / 2)
            if match + int(len(phrase) / 2) + len(phrase) > len(paragraph) - 1:
                end = len(paragraph) - 1
            else:
                end = match + int(len(phrase) / 2) + len(phrase)
            match_ratio = sm(None, " ".join(phrase),
                             " ".join(paragraph[begin:end])).ratio()
            if match_ratio > highest_match_ratio[0]:
                highest_match_ratio[0] = match_ratio
    result = highest_match_ratio[0] ** (1 / len(phrase))
    return result
def mostAlikeRatio(key, command):
    # pad the command so the sliding window below is always well-defined
    cmd = command if len(key) <= len(command) else command.ljust(len(key))
    bestRatio = 0.0
    for i in range(len(cmd) - (len(key) - 1)):
        ratio = sm(None, cmd[i:i + len(key)], key).ratio()
        bestRatio = max(bestRatio, ratio)
        if bestRatio == 1:
            return 1
    return bestRatio
def greet_engine(self):
    assistant_name = self.c.config.get('SYSTEM', 'assistant_name')
    meta_name = dm(assistant_name)[0]
    for index, raw_text in enumerate(self.raw_text_array):
        meta_text = dm(raw_text)[0]
        chances = sm(None, meta_name, meta_text).ratio()
        if chances > 0.7:
            self.raw_text_array = self.raw_text_array[index + 1:]
            return
def grd_news(sum_org, news_org):
    top_news_grd = []
    num3 = []
    rest = []
    news_grd_sim_value = []
    news_grd_sim_rank = []
    for i in range(len(sum_org)):
        top_sim_dict = {}
        n3 = []
        re = []
        asp_sim = []
        asp_rank = []
        for j in range(len(sum_org[i])):
            sim_ratio = []
            for k in range(len(news_org[i])):
                sim_ratio.append(sm(None, sum_org[i][j], news_org[i][k]).ratio())
            asp_sim.append(sim_ratio)
            idx = [m[0] for m in sorted(enumerate(sim_ratio),
                                        key=lambda x: x[1], reverse=True)]
            sort_sim = [m[1] for m in sorted(enumerate(sim_ratio),
                                             key=lambda x: x[1], reverse=True)]
            sim_rank = []
            for id in range(len(idx)):
                sim_rank.append(idx.index(id))
            asp_rank.append(sim_rank)
            idx_fin = []
            n3.append(idx[:2])
            for m in idx[-5:]:
                re.append(idx[m])
            for s in range(len(sort_sim)):
                if sort_sim[s] > 0.5:
                    idx_fin.append(idx[s])
            if len(idx_fin) < 5:
                idx_fin = idx[:5]
            else:
                print 'Number of sim news larger than 0.5 is ', len(idx_fin)
            top_num = len(idx_fin)
            for m in range(top_num):
                if idx_fin[m] not in top_sim_dict:
                    top_sim_dict[idx_fin[m]] = 1
                else:
                    num = top_sim_dict[idx_fin[m]]
                    top_sim_dict[idx_fin[m]] = num + 1
        # print 'The least similar news sentence length is ', len(set(re))
        new_asp_sim = map(list, zip(*asp_sim))
        news_grd_sim_value.append(new_asp_sim)
        new_asp_rank = map(list, zip(*asp_rank))
        news_grd_sim_rank.append(new_asp_rank)
        top_news_grd.append(top_sim_dict)
        num3.append(n3)
        rest.append(re)
    return top_news_grd, num3, rest, news_grd_sim_value, news_grd_sim_rank
def double_substring_sm(line: str) -> int:
    """Length of the longest substring that repeats more than once
    without overlapping."""
    counter = 0
    for i, _ in enumerate(line, start=1):
        left, right = line[:i], line[i:]
        match = sm(None, left, right).find_longest_match(0, len(left),
                                                         0, len(right))
        counter = max(counter, match.size)
    return counter
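# Worked example for double_substring_sm: in "aabcdabc" the longest substring
# that repeats without overlapping is "abc"; splitting the string at every
# index i guarantees the two occurrences cannot overlap.
assert double_substring_sm("aabcdabc") == 3
assert double_substring_sm("abcd") == 0  # nothing repeats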
def get_similarities(tags):
    similars = []
    for tag1 in tags:
        for tag2 in tags:
            seq = sm(None, tag1, tag2)
            ratio = seq.ratio()
            if ratio >= SIMILAR and tag1 != tag2:
                similars.append(sorted((tag1, tag2), key=lambda t: len(t)))
    return similars
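# Usage sketch for get_similarities: SIMILAR is a module-level threshold in
# the original code; 0.8 is an assumed value here. Note that each close pair
# is appended twice (once per iteration order), and sorting by length makes
# both copies identical.
SIMILAR = 0.8
print(get_similarities(["python", "pythons", "java"]))
# [['python', 'pythons'], ['python', 'pythons']]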
def smratio(pair):
    f1, f2 = pair
    while True:  # retry until both files become readable
        try:
            r = int(100 * sm(None, open(f1).read(), open(f2).read()).ratio())
        except OSError:
            print("access lock at " + str(f1) + " and " + str(f2))
            continue
        break
    return (r, f1, f2)
def getNearestMatchIn(word, list):
    curClosest = None
    ratio = 0.0
    for each in list:
        curr_ratio = sm(None, word, each).ratio()
        if curr_ratio > ratio:
            curClosest = each
            ratio = curr_ratio
    if ratio > 0.76:
        return curClosest
def news_grd_sim(sum_org, news_org, news_idx):
    news_grd_max_sim = {}
    for n in news_idx:
        max_sim = 0
        for s in range(len(sum_org)):
            sim = sm(None, sum_org[s], news_org[n]).ratio()
            if sim > max_sim:
                max_sim = sim
        news_grd_max_sim[n] = max_sim
    return news_grd_max_sim
def saveImg(data): maxval = 0 maxurl = "" imglinks = data.findAll("img") for link in imglinks: if sm(None, str(link['src']), BASE_URL).ratio() > maxval: maxval = sm(None, str(link['src']),BASE_URL).ratio() maxl = link maxurl = link['src'] try: image = requests.get(maxurl) except: print "Error " file = open(os.path.join(BASE_PATH, "%s.jpg") % IMAGE_TITLE, 'wb') try: Image.open(StringIO(image.content)).save(file, 'JPEG') except IOError, e: print "Couldnt Save:"
def is_same_name(a, b):
    for (x, y) in [(a, b), (b, a)]:
        for word in x.split():
            if word in y:
                return True
    ratio = sm(a=list(set(b)), b=list(set(a))).ratio()
    if ratio > 0.7:
        log.msg(" ============> %s : %s = %s" % (a, b, ratio))
        return True
    return False
def get_variable_str(common_str, log_msg):
    y = sm(None, a=common_str, b=log_msg)
    matched = y.get_matching_blocks()
    variables = []
    end_of_prev_block = 0
    for m in matched:
        if end_of_prev_block != m.b:
            variables.append(parse_num(log_msg[end_of_prev_block:m.b]))
        end_of_prev_block = m.b + m.size
    return variables
def _process_chances(dataset, query):  # module kwds, cmd kwds
    scores = []
    for data in dataset:  # for each keyword list in the module kwds
        # [[r(q1, d1)... r(qn, d1)], [r(q1, d2)... r(qn, d2)]...]
        avg_scores = [[sm(a=kw, b=sw).ratio() for kw in query] for sw in data]
        print(f"avg_scores=\n{avg_scores}")
        word_score = [score[0] for score in _hungarian_algorithm(avg_scores)]
        print(f"word_score=\n{word_score}\n")
        avg_score = sum(word_score) / len(word_score)
        scores.append([avg_score, word_score])
    return scores
def saveImg(data, base_host, base_url, base_path, image_title): maxval = 0 maxurl = "" imglinks = data.findAll("img") for link in imglinks: img_size = getsize(link['src'].strip()) sequence_match_ratio = 0 if 'alt' in link.attrs: sequence_match_ratio = sm(None, str(link['alt']), base_url.replace("http", "")).ratio() if sequence_match_ratio > maxval and img_size > 99999: maxval = sm(None, str(link['alt']), base_url).ratio() maxl = link maxurl = link['src'].strip() if "http" not in str(maxurl): maxurl = base_host.split("//")[0] + maxurl try: # uncomment if proxy settings # image = requests.get(maxurl, proxies=proxyDict) # comment if proxy settings image = requests.get(maxurl) print("Images Saved Successfully") except: print ("Error") exit(0) file = open(os.path.join(base_path, "%s.jpg") % image_title, 'wb') try: Image.open(BytesIO(image.content)).save(file, 'JPEG') except IOError as e: print("Couldnt Save:", e) finally: file.close() return maxl
def evaluateByTarget(score_dict, test_target_matches, num):
    '''
    Finds matches with prior RIA as the number of sentences outputted increases

    Args:
        score_dict (dict): Our target matches
        test_target_matches (dict): Target matches from prior RIA
        num (int): Number of output sentences to match.

    Returns:
        (dict): Dictionary of how many matches were found after each sentence
        per target
    '''
    truths = []
    match_by_sent = {}
    truth_dict = {}
    check = []
    for target in score_dict.keys():
        for result in get_matches(target, score_dict, num):
            if target in test_target_matches and len(test_target_matches[target]) > 1:
                sentences = result.split('.')
                for sent in sentences:
                    for ground_truth in test_target_matches[target]:
                        score = sm(None, ground_truth, sent).ratio()
                        if score > 0.50:
                            if score < 0.55:
                                check.append((ground_truth, sent))
                            if target in truth_dict and ground_truth not in truths:
                                truths.append(ground_truth)
                                truth_dict[target].append(ground_truth)
                            elif target not in truth_dict and ground_truth not in truths:
                                truth_dict[target] = [ground_truth]
                                truths.append(ground_truth)
            if target in truth_dict:
                if target in match_by_sent:
                    match_by_sent[target].append(
                        len(truth_dict[target]) /
                        (len(test_target_matches[target]) - 1))
                else:
                    match_by_sent[target] = [
                        len(truth_dict[target]) /
                        (len(test_target_matches[target]) - 1)
                    ]
            else:
                if target in match_by_sent:
                    match_by_sent[target].append(0)
                else:
                    match_by_sent[target] = [0]
    return match_by_sent
def results(self):
    if isinstance(self.words, str):
        self.words = [self.words]
    similarities = {}
    words = [word.strip() for word in self.words]
    if not words:
        return None
    for x in words:
        ratio = sm(None, self.word, x).ratio()
        similarities[x] = ratio
    results = dict(sorted(similarities.items(), key=lambda x: x[1],
                          reverse=True))
    return results
def map_audio(self):
    """
    Takes in a string that represents the file directory that will be
    inspected for MP3 and M4A files. Returns a dictionary. The key of the
    dictionary is an Artist. The value is another dictionary, whose key is
    the Album title, and value is a dictionary of songs. Each song
    dictionary is keyed by its name, and its value is a list of the songs
    that were considered duplicates. Each entry in that list is a
    dictionary with keys for bitrate, path, and file name.
    """
    for root, dirs, files in os.walk(self.dir):
        for name in files:
            if name.split(".")[-1].lower() in ('m4a', 'mp3'):
                cur_path = "{0}/{1}".format(root, name)
                cur_file = auto.File(cur_path)
                artist = cur_file.artist.lower().strip()
                album = cur_file.album.lower().strip()
                title = cur_file.title.lower().strip()
                bitrate = cur_file.bitrate
                if artist not in self.audio_dict:
                    self.audio_dict[artist] = {}
                if album not in self.audio_dict[artist]:
                    self.audio_dict[artist][album] = {}
                # fold near-duplicate titles into one key
                title_key = title
                for in_album_title in self.audio_dict[artist][album]:
                    if sm(None, title, in_album_title).ratio() > 0.9:
                        title_key = in_album_title
                if title_key not in self.audio_dict[artist][album]:
                    self.audio_dict[artist][album][title_key] = []
                self.audio_dict[artist][album][title_key].append({
                    'path': cur_path,
                    'bitrate': bitrate,
                    'file_name': name
                })
    return self
def get_common_str(log_msgs):
    common_str = None
    for i in log_msgs:
        if common_str is None:
            common_str = i
            continue
        y = sm(None, a=common_str, b=i)
        matched = y.get_matching_blocks()
        common_str = ''
        for m in matched[:-1]:
            if m.b == 0 or m.b + m.size == len(i) or m.size > 1:
                beg_idx = m.b
                end_idx = beg_idx + m.size
                # A heuristic for decimal fractions such as 0.xxx
                if i[end_idx - 2:end_idx] == '0.':
                    end_idx -= 2
                assert end_idx >= beg_idx
                common_str += i[beg_idx:end_idx]
    return common_str
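# Usage sketch: get_common_str and get_variable_str (earlier in this
# collection) together do simple log template mining. parse_num is an unshown
# helper of that code; the stand-in below is hypothetical, just to make the
# demo runnable.
def parse_num(s):
    try:
        return float(s)
    except ValueError:
        return s

logs = ["connect from 10.0.0.1 port 22",
        "connect from 10.0.0.2 port 80"]
template = get_common_str(logs)             # e.g. "connect from 10.0. port "
print(get_variable_str(template, logs[0]))  # the varying fragments: [0.1, 22.0]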
from nimble import cmds as mc
from difflib import SequenceMatcher as sm


def findName(namelike):
    objs = mc.ls()
    bodyName = ""
    compareScore = 0
    newScore = 0
    for nm in objs:
        newScore = sm(None, namelike, nm).ratio()
        if newScore > compareScore:
            compareScore = newScore
            bodyName = nm
    print("query for: \"" + namelike + "\" returns: " + bodyName)
    return bodyName


# alters the aspects of some object.
def bAlter(bName, timeFrame, changeVal, chStr):
    mc.select(bName)
    startTime = mc.currentTime(query=True)
    endTime = startTime + timeFrame
    rotBy = mc.getAttr(bName + "." + chStr) + changeVal
    mc.setKeyframe(attribute=chStr)
    mc.currentTime(endTime)
    mc.setKeyframe(attribute=chStr, v=rotBy)
    mc.currentTime(startTime)


def limbRotate(limbName, timeFrame, changeVal):
    bAlter(limbName, timeFrame, changeVal, "rotateX")
def best_match_for_query(code_string, elbow=10, local=False):
    # DEC strings come in as unicode so we have to force them to ASCII
    code_string = code_string.encode("utf8")
    tic = int(time.time() * 1000)

    # First see if this is a compressed code
    if re.match('[A-Za-z\/\+\_\-]', code_string) is not None:
        code_string = decode_code_string(code_string)
        if code_string is None:
            return Response(Response.CANNOT_DECODE, tic=tic)

    code_len = len(code_string.split(" ")) / 2
    if code_len < elbow:
        logger.warn("Query code length (%d) is less than elbow (%d)"
                    % (code_len, elbow))
        return Response(Response.NOT_ENOUGH_CODE, tic=tic)

    code_string = cut_code_string_length(code_string)
    code_len = len(code_string.split(" ")) / 2

    # Query the FP flat directly.
    response = query_fp(code_string, rows=30, local=local, get_data=True)
    logger.debug("solr qtime is %d" % (response.header["QTime"]))

    if len(response.results) == 0:
        return Response(Response.NO_RESULTS, qtime=response.header["QTime"],
                        tic=tic)

    # If we just had one result, make sure that it is close enough. We rarely
    # if ever have a single match so this is not helpful (and probably
    # doesn't work well.)
    top_match_score = int(response.results[0]["score"])
    if len(response.results) == 1:
        trackid = response.results[0]["track_id"]
        trackid = trackid.split("-")[0]  # will work even if no `-` in trid
        meta = metadata_for_track_id(trackid, local=local)
        if code_len - top_match_score < elbow:
            return Response(Response.SINGLE_GOOD_MATCH, TRID=trackid,
                            score=top_match_score,
                            qtime=response.header["QTime"], tic=tic,
                            metadata=meta)
        else:
            return Response(Response.SINGLE_BAD_MATCH,
                            qtime=response.header["QTime"], tic=tic)

    # If the scores are really low (less than 5% of the query length)
    # then say no results
    if top_match_score < code_len * 0.05:
        return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                        qtime=response.header["QTime"], tic=tic)

    # Not a strong match, so we look up the codes in the keystore and
    # compute actual matches...
    # Get the actual score for all responses
    original_scores = {}
    actual_scores = {}

    trackids = [r["track_id"].encode("utf8") for r in response.results]
    if local:
        tcodes = [_fake_solr["store"][t] for t in trackids]
    else:
        tcodes = get_tyrant().multi_get(trackids)

    # For each result compute the "actual score" (based on the histogram
    # matching)
    for (i, r) in enumerate(response.results):
        track_id = r["track_id"]
        original_scores[track_id] = int(r["score"])
        track_code = tcodes[i]
        if track_code is None:
            # Solr gave us back a track id but that track
            # is not in our keystore
            continue
        actual_scores[track_id] = actual_matches(code_string, track_code,
                                                 elbow=elbow)
        # logger.debug("Actual score for %s is %d (code_len %d), original was %d"
        #              % (r["track_id"], actual_scores[r["track_id"]],
        #                 code_len, top_match_score))

    # Sort the actual scores
    sorted_actual_scores = sorted(actual_scores.iteritems(),
                                  key=lambda (k, v): (v, k), reverse=True)

    # Because we split songs up into multiple parts, sometimes the results
    # will have the same track in the first few results. Remove these
    # duplicates so that the falloff is (potentially) higher.
    new_sorted_actual_scores = []
    existing_trids = []
    for trid, result in sorted_actual_scores:
        trid_split = trid.split("-")[0]
        if trid_split not in existing_trids:
            new_sorted_actual_scores.append((trid, result))
            existing_trids.append(trid_split)
    sorted_actual_scores = new_sorted_actual_scores

    # Remove duplicate matches - happens when the same song appears in the
    # database under a different track id. Use artist/track name similarity
    # against the top hit to spot them.
    new_sorted_actual_scores = []
    trid, result = sorted_actual_scores[0]
    metaTop = metadata_for_track_id(trid, local=local)
    try:
        topArtist = metaTop['artist']
    except:
        topArtist = 'NA'
    try:
        topTrack = metaTop['track']
    except:
        topTrack = 'NA'
    new_sorted_actual_scores.append((trid, result))
    for trid, result in sorted_actual_scores:
        trid_split = trid.split("-")[0]
        meta = metadata_for_track_id(trid, local=local)
        try:
            artist = meta['artist']
        except:
            artist = 'NA'
        try:
            track = meta['track']
        except:
            track = 'NA'
        if sm(None, artist, topArtist).ratio() < 0.6 and \
                sm(None, track, topTrack).ratio() < 0.6:
            new_sorted_actual_scores.append((trid, result))
            existing_trids.append(trid_split)
    sorted_actual_scores = new_sorted_actual_scores

    # We might have reduced the length of the list to 1
    if len(sorted_actual_scores) == 1:
        logger.info("only have 1 score result...")
        (top_track_id, top_score) = sorted_actual_scores[0]
        if top_score < code_len * 0.1:
            logger.info("only result less than 10%% of the query string "
                        "(%d < %d * 0.1 (%d)) SINGLE_BAD_MATCH",
                        top_score, code_len, code_len * 0.1)
            return Response(Response.SINGLE_BAD_MATCH,
                            qtime=response.header["QTime"], tic=tic)
        else:
            if top_score > (original_scores[top_track_id] / 2):
                logger.info("top_score > original_scores[%s]/2 (%d > %d) "
                            "GOOD_MATCH_DECREASED",
                            top_track_id, top_score,
                            original_scores[top_track_id] / 2)
                trid = top_track_id.split("-")[0]
                meta = metadata_for_track_id(trid, local=local)
                return Response(Response.MULTIPLE_GOOD_MATCH_HISTOGRAM_DECREASED,
                                TRID=trid, score=top_score,
                                qtime=response.header["QTime"], tic=tic,
                                metadata=meta)
            else:
                logger.info("top_score NOT > original_scores[%s]/2 (%d <= %d) "
                            "BAD_HISTOGRAM_MATCH",
                            top_track_id, top_score,
                            original_scores[top_track_id] / 2)
                return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                                qtime=response.header["QTime"], tic=tic)

    # Get the top one
    (actual_score_top_track_id, actual_score_top_score) = sorted_actual_scores[0]
    # Get the 2nd top one (we know there is always at least 2 matches)
    (actual_score_2nd_track_id, actual_score_2nd_score) = sorted_actual_scores[1]

    trackid = actual_score_top_track_id.split("-")[0]
    meta = metadata_for_track_id(trackid, local=local)

    if actual_score_top_score < code_len * 0.05:
        return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                        qtime=response.header["QTime"], tic=tic)
    else:
        # If the actual score went down it still could be close enough,
        # so check for that
        if actual_score_top_score > (original_scores[actual_score_top_track_id] / 4):
            if (actual_score_top_score - actual_score_2nd_score) >= \
                    (actual_score_top_score / 3):
                # e.g. [10, 4]: 10 - 4 = 6, which >= 10/3, so OK
                return Response(Response.MULTIPLE_GOOD_MATCH_HISTOGRAM_DECREASED,
                                TRID=trackid, score=actual_score_top_score,
                                qtime=response.header["QTime"], tic=tic,
                                metadata=meta)
            else:
                return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                                qtime=response.header["QTime"], tic=tic)
        else:
            # If the actual score was not close enough, then no match.
            return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                            qtime=response.header["QTime"], tic=tic)
def matching_score(first_name, second_name):
    return sm(None, first_name, second_name).ratio() * 100
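# Quick check for matching_score: ratio() = 2*M/T with M = 9 matched
# characters ("Jo" + "n Smith") and T = 19, so the score is ~94.74 out of 100.
print(matching_score("Jon Smith", "John Smith"))  # 94.736...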
def compare(school, name):
    return sm(None, deUnique(school), deUnique(name)).ratio()