def processText4(directory, newfile):
    for i in range(1, 41):
        filename = '{0}data{1}.json'.format(directory, i)
        if not os.path.isfile(filename):
            continue
        with open(filename, 'r') as f:
            data = json.load(f)
        keyPressData, keyReleaseData = data['keyPressData'], data['keyReleaseData']
        if len(keyPressData) > 15 and len(keyReleaseData) > 15:
            keyPressData, keyReleaseData = removeExtraKeys(
                dataProcessKeyPress(keyPressData)), removeExtraKeys(
                    dataProcessKeyRelease(keyReleaseData))
            keyPressData, keyReleaseData = swapShift(
                *removeExtraShift(keyPressData, keyReleaseData))
            kp = ''.join([k.split('-')[1] for k in keyPressData]).replace('\'', '')
            kr = ''.join([k.split('-')[1] for k in keyReleaseData]).replace('\'', '')
            pressString = [
                r'worldKey.shift%99,Key.space12Key.shift@hello;Key.spacewhy.not72,Key.spaceKey.shift#dream5Key.shift$0Key.shift*[email protected]*Key.space32greatKey.shift#Key.shift%have',
                r'worldKey.shift_r%99,Key.space12Key.shift_r@hello;Key.spacewhy.not72,Key.spaceKey.shift_r#dream5Key.shift_r$0Key.shift_r*[email protected]_r*Key.space32greatKey.shift_r#Key.shift_r%have'
            ]
            if (sm(None, pressString[0], kp).ratio() > 0.90
                    or sm(None, pressString[1], kp).ratio() > 0.90):
                keyPressData, keyReleaseData = extractTimings(keyPressData,
                                                              keyReleaseData)
                dt = dict(keyPressData=keyPressData, keyReleaseData=keyReleaseData)
                with open(newfile, 'a') as f:
                    json.dump(dt, f)
                    f.write(os.linesep)
def saveImg(data, base_host, base_url, base_path, image_title): maxval = 0 maxurl = "" imglinks = data.findAll("img") for link in imglinks: img_size = getsize(link['src'].strip()) sequence_match_ratio = 0 if 'alt' in link.attrs: sequence_match_ratio = sm(None, str(link['alt']), base_url.replace("http", "")).ratio() if sequence_match_ratio > maxval and img_size > 99999: maxval = sm(None, str(link['alt']), base_url).ratio() maxl = link maxurl = link['src'].strip() if "http" not in str(maxurl): maxurl = base_host.split("//")[0] + maxurl try: # uncomment if proxy settings # image = requests.get(maxurl, proxies=proxyDict) # comment if proxy settings image = requests.get(maxurl) print("Images Saved Successfully") except: print("Error") exit(0) saveImageToFile(base_path, image_title, image) return maxl
def saveImg(data, base_host, base_url, base_path, image_title): maxval = 0 maxurl = "" imglinks = data.findAll("img") check_link = base_host.split("//")[1] if len(check_link.split('.')) > 2: check_link = check_link.split('.')[1] + "." + check_link.split('.')[2] for link in imglinks: if check_link in str(link['src']) and sm(None, str(link['src']), base_url.replace("http", "")).ratio() > maxval: maxval = sm(None, str(link['src']), base_url).ratio() maxl = link maxurl = link['src'] if maxurl == '': for link in imglinks: if sm(None, str(link['src']), base_url.replace("http", "")).ratio() > maxval: maxval = sm(None, str(link['src']), base_url).ratio() maxl = link maxurl = link['src'] if "http" not in str(maxurl): maxurl = base_host.split("//")[0] + maxurl try: image = requests.get(maxurl, proxies=proxyDict) print "Images Saved Successfully" except: print "Error " exit(0) file = open(os.path.join(base_path, "%s.jpg") % image_title, 'wb') try: Image.open(StringIO(image.content)).save(file, 'JPEG') except IOError, e: print "Couldnt Save:", e
def saveImg(data): maxval = 0 maxurl = "" imglinks = data.findAll("img") check_link = BASE_HOST.split("//")[1] if len(check_link.split('.')) > 2: check_link = check_link.split('.')[1] + "." + check_link.split('.')[2] for link in imglinks: if check_link in str(link['src']) and sm(None, str(link['src']), BASE_URL.replace("http", "")).ratio() > maxval: maxval = sm(None, str(link['src']), BASE_URL).ratio() maxl = link maxurl = link['src'] if maxurl == '': for link in imglinks: if sm(None, str(link['src']), BASE_URL.replace("http", "")).ratio() > maxval: maxval = sm(None, str(link['src']), BASE_URL).ratio() maxl = link maxurl = link['src'] if "http" not in str(maxurl): maxurl = BASE_HOST.split("//")[0] + maxurl try: image = requests.get(maxurl, proxies=proxyDict) print "Images Saved Successfully" except: print "Error " exit(0) file = open(os.path.join(BASE_PATH, "%s.jpg") % IMAGE_TITLE, 'wb') try: Image.open(StringIO(image.content)).save(file, 'JPEG') except IOError, e: print "Couldnt Save:", e
def get_valid_image_obj(self, img_objs):
    """
    Find the valid BeautifulSoup image object from the various different
    image objects present in the current page or URL, based on string
    matching of the image link and base_url, and the size of the image
    in the image obj.

    :param img_objs: List of all BeautifulSoup image objects present in
        the current page
    """
    maxval = 0
    match_obj = None
    for img_obj in img_objs:
        img_size = get_content_size(img_obj['src'].strip(), self.proxy_dict)
        if 'alt' in img_obj.attrs:
            sequence_match_ratio = sm(None, str(img_obj['alt']),
                                      self.base_url.replace("http", "")).ratio()
            if sequence_match_ratio > maxval and img_size > 99999:
                maxval = sm(None, str(img_obj['alt']), self.base_url).ratio()
                match_obj = img_obj
    return match_obj
def downloader(songs, lang):
    for song, album in songs:
        print 'http://music.vidmate.mobi/search-' + song.strip().replace(
            ' ', '%20') + '%20' + album.strip().replace(' ', '%20') + '.html'
        response = urllib2.urlopen(
            'http://music.vidmate.mobi/search-' + song.strip().replace(' ', '%20')
            + '%20' + album.strip().replace(' ', '%20') + '.html')
        htmlsrc = response.read()
        htmlsrc = htmlsrc.split('id="music-search-song-container">', 1)[1]
        results = htmlsrc.split('music-song-search-item-open')
        for result in results:
            try:
                ps = result.split('<p')
                s = ps[1].split('</p>', 1)[0][1:]
                a = ps[2].split('<a', 1)[1].split('>', 1)[1].split(
                    '</a>', 1)[0].split('|')[0].strip()
                link = 'http://music.vidmate.mobi' + ps[2].split(
                    '<a', 1)[1].split('>', 1)[0].split('href="')[1].split('">')[0]
                # print a, s
                # print sm(None, song, s).ratio(), sm(None, album, a).ratio()
                if sm(None, song, s).ratio() > 0.8 and sm(None, album, a).ratio() > 0.8:
                    print s
                    download(link, s, lang)
                    break
            except:
                pass
def processText1(directory, newfile):
    for i in range(1, 41):
        filename = '{0}data{1}.json'.format(directory, i)
        if not os.path.isfile(filename):
            continue
        with open(filename, 'r') as f:
            data = json.load(f)
        keyPressData, keyReleaseData = data['keyPressData'], data['keyReleaseData']
        if len(keyPressData) > 6 and len(keyReleaseData) > 6:
            keyPressData, keyReleaseData = removeExtraKeys(
                dataProcessKeyPress(keyPressData)), removeExtraKeys(
                    dataProcessKeyRelease(keyReleaseData))
            keyPressData, keyReleaseData = swapShift(
                *removeExtraShift(keyPressData, keyReleaseData))
            kp = ''.join([k.split('-')[1] for k in keyPressData]).replace('\'', '')
            kr = ''.join([k.split('-')[1] for k in keyReleaseData]).replace('\'', '')
            pressString = [
                'abuKey.shift@9,l12Key.shift$',
                'abuKey.shift_r@9,l12Key.shift_r$'
            ]
            if (sm(None, pressString[0], kp).ratio() > 0.95
                    or sm(None, pressString[1], kp).ratio() > 0.95):
                keyPressData, keyReleaseData = extractTimings(keyPressData,
                                                              keyReleaseData)
                dt = dict(keyPressData=keyPressData, keyReleaseData=keyReleaseData)
                with open(newfile, 'a') as f:
                    json.dump(dt, f)
                    f.write(os.linesep)
def songsAreSame(s1, s2):
    from difflib import SequenceMatcher as sm
    # Idea credit: https://bigishdata.com/2016/10/25/
    seqA = sm(None, s1.lyrics, s2['lyrics'])
    if seqA.ratio() > 0.4:
        seqB = sm(None, s2['lyrics'], s1.lyrics)
        return seqA.ratio() > 0.5 or seqB.ratio() > 0.5
    return False
def MatchWord(word):
    ratio = 0
    nearestWord = "0"
    for exitsWord in words.keys():
        r = sm(None, word, exitsWord).ratio()
        if ratio < r:
            ratio = r
            nearestWord = exitsWord
    if ratio >= 0.8:
        return nearestWord
    else:
        return "0"
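# Usage sketch for MatchWord (hypothetical): `words` is the module-level dict
# the function iterates over; the vocabulary below is made up for the demo.
words = {"apple": 1, "banana": 2}

print(MatchWord("aple"))  # -> "apple" (ratio 8/9 ~ 0.89, above the 0.8 cutoff)
print(MatchWord("kiwi"))  # -> "0" (no candidate reaches 0.8)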
def geocode(origin, destination, sensor):
    geo_args = {
        'origin': origin,
        'destination': destination,
        'sensor': sensor
    }
    url = GEOCODE_BASE_URL + '?' + urllib.urlencode(geo_args)
    result = simplejson.load(urllib.urlopen(url))
    res = {}
    status = False
    if simplejson.dumps(result['status']) == "\"OK\"":
        leg = result['routes'][0]['legs'][0]
        duration = simplejson.dumps(leg['duration']['value']).strip("\"")
        distance = simplejson.dumps(leg['distance']['value']).strip("\"")
        # conversions
        duration = float(duration) / 60    # seconds -> minutes
        distance = float(distance) / 1000  # metres -> kilometres
        # string comparison
        start_index = sm(None, origin, leg['start_address']).ratio()
        end_index = sm(None, destination, leg['end_address']).ratio()
        if (start_index < THRESHOLD) or (end_index < THRESHOLD):
            res['duration'] = "%.0f *" % duration
            res['distance'] = "%.1f *" % distance
            print >> sys.stderr, "WARNING! Some cities may not have been " \
                "found.\n Requested origin: \"%s\", suggested origin" \
                ": \"%s\".\n Requested destination: %s, " \
                " suggested destination: \"%s\"" % (origin, leg['start_address'],
                                                    destination,
                                                    leg['end_address'])
        else:
            res['duration'] = "%.0f" % duration
            res['distance'] = "%.1f" % distance
        status = True
    elif simplejson.dumps(result['status']) == "\"ZERO_RESULTS\"":
        res['duration'] = "?"
        res['distance'] = "?"
        print >> sys.stderr, "Error in %s -> %s" % (origin, destination)
        status = True
    else:
        res['duration'] = "?"
        res['distance'] = "?"
        print >> sys.stderr, "Error in %s -> %s" % (origin, destination)
        print >> sys.stderr, simplejson.dumps(result['status'])
    return res, status, simplejson.dumps(result['status'])
def processText2(directory, newfile):
    for i in range(1, 41):
        filename = '{0}data{1}.json'.format(directory, i)
        if not os.path.isfile(filename):
            continue
        with open(filename, 'r') as f:
            data = json.load(f)
        keyPressData, keyReleaseData = data['keyPressData'], data['keyReleaseData']
        if len(keyPressData) > 15 and len(keyReleaseData) > 15:
            keyPressData, keyReleaseData = removeExtraKeys(
                dataProcessKeyPress(keyPressData)), removeExtraKeys(
                    dataProcessKeyRelease(keyReleaseData))
            kp = ''.join([k.split('-')[1] for k in keyPressData]).replace('\'', '')
            kr = ''.join([k.split('-')[1] for k in keyReleaseData]).replace('\'', '')
            pressString = [
                'theKey.spacepersonKey.spaceandKey.spacegreatKey.spaceforKey.spacegovernmentKey.spaceknowKey.spaceskillKey.spacenewKey.spacehaveKey.spaceyearKey.spaceevenKey.spaceaboutKey.spacefromKey.spaceforKey.spacemakeKey.spacewhichKey.spacepeopleKey.spacehowKey.spacenot'
            ]
            if sm(None, pressString[0], kp).ratio() > 0.95:
                keyPressData, keyReleaseData = extractTimings(keyPressData,
                                                              keyReleaseData)
                dt = dict(keyPressData=keyPressData, keyReleaseData=keyReleaseData)
                with open(newfile, 'a') as f:
                    json.dump(dt, f)
                    f.write(os.linesep)
def cmp_list(list1: list, list2: list):
    """
    Compare the similarity of two lists.

    :param list1:
    :param list2:
    :return: similarity score
    """
    len1 = len(list1)
    len2 = len(list2)
    list1.sort()
    list2.sort()
    mark = 0
    if len1 <= len2:
        min_l = list1
        max_l = list2
    else:
        max_l = list1
        min_l = list2
    for s in min_l:
        if s in max_l:
            mark += 1
        else:
            for t in max_l:
                if sm(None, s.lower(), t.lower()).ratio() > 0.9:
                    mark += 1
                    break  # count each element at most once
    return (mark / len1 + mark / len2) * 0.5
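# Worked example (hypothetical data) for cmp_list: "Meg Ryan" matches exactly,
# "Tom Hanks" only matches "tom hanks" through the fuzzy (> 0.9) branch, so
# mark == 2 and the score is (2/2 + 2/3) * 0.5 ~ 0.83.
print(round(cmp_list(["Tom Hanks", "Meg Ryan"],
                     ["tom hanks", "Meg Ryan", "Extra"]), 2))  # 0.83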
def cmp_items(item1: dict, item2: dict):
    """
    Compare the similarity of two items.

    :param item1:
    :param item2:
    :return:
    """
    # title similarity
    r_title = sm(None, item1[TITLE].lower(), item2[TITLE].lower()).ratio()
    # directors
    if DIRECTORS in item1 and DIRECTORS in item2:
        r_dir = Deduplicate.cmp_list(item1[DIRECTORS], item2[DIRECTORS])
    else:
        r_dir = 0.5
    # actors
    if ACTORS in item1 and ACTORS in item2:
        r_act = Deduplicate.cmp_list(item1[ACTORS], item2[ACTORS])
    else:
        r_act = 0.5
    # comment count
    if item1[COUNT] == item2[COUNT] and item1[COUNT] != 0:
        r_count = 0.3
    else:
        r_count = 0
    total = 0.4 * r_title + 0.3 * r_dir + 0.3 * r_act + r_count
    return total
def get_similarity(a, b):
    """
    :param a: string
    :param b: string
    :return: similarity ratio between a and b, in [0, 1]
    """
    return sm(None, a, b).ratio()
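# Quick check for get_similarity: SequenceMatcher.ratio() is 2*M/T, where M is
# the number of matched characters and T the combined length of both strings.
# For "color"/"colour", M = 5 and T = 11, giving 10/11.
print(get_similarity("color", "colour"))  # 0.9090...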
def find_route_list(list_eng, route, check=0):
    # check determines whether the function finds the number of matched
    # destinations (check = 0) or collects the matched destinations (check = 1)
    count = 0
    index = 0
    for i in range(len(list_eng)):
        ratio = 0
        for j in range(len(route)):
            # similarity ratio between the strings
            test_ratio = sm(None, list_eng[i], route[j]).ratio()
            # Levenshtein distance between the strings
            val = td.levenshtein(list_eng[i], route[j])
            if (test_ratio > 0.73) and (test_ratio > ratio) and (val < 5):
                ratio = test_ratio
                index = j  # index of most probable destination
        if ratio > 0.73:
            if check == 0:
                # print(list_eng[i], " has ratio ", ratio, "with", route[index], " AT", index, "\n")
                count = count + 1
            elif check == 1:
                list_final.append(route[index])
    if check == 0:
        count_final.append(count)
def sum_news_sen(sum_org, news_org, news_con, summary):
    news_idx = []
    extr_sum = []
    reference = []
    core_sim = []
    for i in range(len(sum_org)):
        max_news_idx = []
        temp_sum = []
        sum_con = []
        con = []
        temp_max = []
        for j in range(len(sum_org[i])):
            sim_ratio = []
            for k in range(len(news_org[i])):
                sim_ratio.append(sm(None, sum_org[i][j], news_org[i][k]).ratio())
            idx = [m[0] for m in sorted(enumerate(sim_ratio),
                                        key=lambda x: x[1], reverse=True)]
            mx = max(sim_ratio)
            temp_max.append(mx)
            for m in range(len(idx)):
                if idx[m] not in max_news_idx:
                    max_news_idx.append(idx[m])
                    temp_sum += news_con[i][idx[m]]
                    break
            con += summary[i][j]
        # print 'Most rep news sens for doc {} is {}'.format(i, max_news_idx)
        core_sim.append(temp_max)
        sum_con.append(con)
        reference.append(sum_con)
        extr_sum.append(temp_sum)
        news_idx.append(max_news_idx)
    return news_idx, extr_sum, reference, core_sim
def is_same_name(a, b):
    for (x, y) in [(a, b), (b, a)]:
        for word in x.split():
            if word in y:
                return True
    # compare the (unordered) character sets of the two names
    if sm(a=list(set(b)), b=list(set(a))).ratio() > 0.7:
        return True
    return False
def org_sim(news_org):
    sim_mat = []
    for i in range(len(news_org)):
        row_sim = []
        for j in range(len(news_org)):
            row_sim.append(sm(None, news_org[i], news_org[j]).ratio())
        sim_mat.append(row_sim)
    return sim_mat
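# Minimal demo (hypothetical data) for org_sim: the result is a symmetric
# similarity matrix with 1.0 on the diagonal; "abc" and "abd" share 2 of 3
# characters, so their ratio is 2*2/6.
print(org_sim(["abc", "abd"]))  # [[1.0, 0.666...], [0.666..., 1.0]]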
def pmit(phrase, text, min_word_size=3):
    if not (isinstance(phrase, str) and isinstance(text, str)
            and isinstance(min_word_size, int)):
        print("One of the arguments is not valid")
        input("")
        return
    non_relevant_char = [
        ",", ".", ":", ";", "?", "/", "'", '"', "!", "<", ">", "@", "#", "$",
        "%", "^", "&", "*", "(", ")", "_", "=", "+", "~", "`"
    ]
    # drop words shorter than min_word_size (filter instead of removing from
    # the list while iterating over it, which skips elements)
    phrase = [word for word in phrase.split() if len(word) > min_word_size - 1]
    text = [word for word in text.split() if len(word) > min_word_size - 1]
    phrase = " ".join(phrase)
    text = " ".join(text)
    # str.replace returns a new string, so the result must be assigned back
    for character in non_relevant_char:
        text = text.replace(character, " ")
        phrase = phrase.replace(character, " ")
    while text.find("  ") != -1:
        text = text.replace("  ", " ")
    while phrase.find("  ") != -1:
        phrase = phrase.replace("  ", " ")
    phrase = phrase.split()
    if text.find("\n") == -1:
        paragraphs = [text]
    else:
        paragraphs = text.splitlines()
    highest_match_ratio = [0]
    for paragraph in paragraphs:
        paragraph = paragraph.split()
        matches = []
        for pword in phrase:
            close_matches = gcm(pword, paragraph)
            for index in range(0, len(paragraph) - 1):
                if (paragraph[index] in close_matches) and (index not in matches):
                    matches.append(index)
        for match in matches:
            if match - int(len(phrase) / 2) < 0:
                begin = 0
            else:
                begin = match - int(len(phrase) / 2)
            if match + int(len(phrase) / 2) + len(phrase) > len(paragraph) - 1:
                end = len(paragraph) - 1
            else:
                end = match + int(len(phrase) / 2) + len(phrase)
            match_ratio = sm(None, " ".join(phrase),
                             " ".join(paragraph[begin:end])).ratio()
            if match_ratio > highest_match_ratio[0]:
                highest_match_ratio[0] = match_ratio
    result = highest_match_ratio[0] ** (1 / len(phrase))
    return result
def mostAlikeRatio(key, command):
    # pad the command so the sliding window below is always well-defined
    cmd = command if len(key) <= len(command) else command.ljust(len(key))
    bestRatio = 0.0
    for i in range(len(cmd) - (len(key) - 1)):
        ratio = sm(None, cmd[i:i + len(key)], key).ratio()
        bestRatio = max(bestRatio, ratio)
        if bestRatio == 1:
            return 1
    return bestRatio
def greet_engine(self):
    assistant_name = self.c.config.get('SYSTEM', 'assistant_name')
    meta_name = dm(assistant_name)[0]
    for index, raw_text in enumerate(self.raw_text_array):
        meta_text = dm(raw_text)[0]
        chances = sm(None, meta_name, meta_text).ratio()
        if chances > 0.7:
            self.raw_text_array = self.raw_text_array[index + 1:]
            return
def grd_news(sum_org, news_org):
    top_news_grd = []
    num3 = []
    rest = []
    news_grd_sim_value = []
    news_grd_sim_rank = []
    for i in range(len(sum_org)):
        top_sim_dict = {}
        n3 = []
        re = []
        asp_sim = []
        asp_rank = []
        for j in range(len(sum_org[i])):
            sim_ratio = []
            for k in range(len(news_org[i])):
                sim_ratio.append(sm(None, sum_org[i][j], news_org[i][k]).ratio())
            asp_sim.append(sim_ratio)
            idx = [m[0] for m in sorted(enumerate(sim_ratio),
                                        key=lambda x: x[1], reverse=True)]
            sort_sim = [m[1] for m in sorted(enumerate(sim_ratio),
                                             key=lambda x: x[1], reverse=True)]
            sim_rank = []
            for id in range(len(idx)):
                sim_rank.append(idx.index(id))
            asp_rank.append(sim_rank)
            idx_fin = []
            n3.append(idx[:2])
            for m in idx[-5:]:
                re.append(idx[m])
            for s in range(len(sort_sim)):
                if sort_sim[s] > 0.5:
                    idx_fin.append(idx[s])
            if len(idx_fin) < 5:
                idx_fin = idx[:5]
            else:
                print 'Number of sim news larger than 0.5 is ', len(idx_fin)
            top_num = len(idx_fin)
            for m in range(top_num):
                if idx_fin[m] not in top_sim_dict:
                    top_sim_dict[idx_fin[m]] = 1
                else:
                    num = top_sim_dict[idx_fin[m]]
                    top_sim_dict[idx_fin[m]] = num + 1
        # print 'The least similar news sentence length is ', len(set(re))
        new_asp_sim = map(list, zip(*asp_sim))
        news_grd_sim_value.append(new_asp_sim)
        new_asp_rank = map(list, zip(*asp_rank))
        news_grd_sim_rank.append(new_asp_rank)
        top_news_grd.append(top_sim_dict)
        num3.append(n3)
        rest.append(re)
    return top_news_grd, num3, rest, news_grd_sim_value, news_grd_sim_rank
def double_substring_sm(line: str) -> int:
    """Length of the longest substring that repeats more than once
    without overlapping."""
    counter = 0
    for i, _ in enumerate(line, start=1):
        left, right = line[:i], line[i:]
        match = sm(None, left, right).find_longest_match(0, len(left),
                                                         0, len(right))
        counter = max(counter, match.size)
    return counter
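# Worked example for double_substring_sm: in "aabcdabc" the longest substring
# that repeats without overlapping is "abc"; splitting the string at every
# index i guarantees the two occurrences cannot overlap.
assert double_substring_sm("aabcdabc") == 3
assert double_substring_sm("abcd") == 0  # nothing repeats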
def get_similarities(tags):
    similars = []
    for tag1 in tags:
        for tag2 in tags:
            seq = sm(None, tag1, tag2)
            ratio = seq.ratio()
            if ratio >= SIMILAR and tag1 != tag2:
                similars.append(sorted((tag1, tag2), key=lambda t: len(t)))
    return similars
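# Usage sketch for get_similarities: SIMILAR is a module-level threshold in
# the original code; 0.8 is an assumed value here. Note that each close pair
# is appended twice (once per iteration order), and sorting by length makes
# both copies identical.
SIMILAR = 0.8
print(get_similarities(["python", "pythons", "java"]))
# [['python', 'pythons'], ['python', 'pythons']]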
def smratio(pair):
    f1, f2 = pair
    while True:  # retry until both files become readable
        try:
            r = int(100 * sm(None, open(f1).read(), open(f2).read()).ratio())
        except OSError:
            print("access lock at " + str(f1) + " and " + str(f2))
            continue
        break
    return (r, f1, f2)
def getNearestMatchIn(word, list):
    curClosest = None
    ratio = 0.0
    for each in list:
        curr_ratio = sm(None, word, each).ratio()
        if curr_ratio > ratio:
            curClosest = each
            ratio = curr_ratio
    if ratio > 0.76:
        return curClosest
def news_grd_sim(sum_org, news_org, news_idx):
    news_grd_max_sim = {}
    for n in news_idx:
        max_sim = 0
        for s in range(len(sum_org)):
            sim = sm(None, sum_org[s], news_org[n]).ratio()
            if sim > max_sim:
                max_sim = sim
        news_grd_max_sim[n] = max_sim
    return news_grd_max_sim
def saveImg(data): maxval = 0 maxurl = "" imglinks = data.findAll("img") for link in imglinks: if sm(None, str(link['src']), BASE_URL).ratio() > maxval: maxval = sm(None, str(link['src']),BASE_URL).ratio() maxl = link maxurl = link['src'] try: image = requests.get(maxurl) except: print "Error " file = open(os.path.join(BASE_PATH, "%s.jpg") % IMAGE_TITLE, 'wb') try: Image.open(StringIO(image.content)).save(file, 'JPEG') except IOError, e: print "Couldnt Save:"
def is_same_name(a, b):
    for (x, y) in [(a, b), (b, a)]:
        for word in x.split():
            if word in y:
                return True
    ratio = sm(a=list(set(b)), b=list(set(a))).ratio()
    if ratio > 0.7:
        log.msg(" ============> %s : %s = %s" % (a, b, ratio))
        return True
    return False
def get_variable_str(common_str, log_msg):
    y = sm(None, a=common_str, b=log_msg)
    matched = y.get_matching_blocks()
    variables = []
    end_of_prev_block = 0
    for m in matched:
        if end_of_prev_block != m.b:
            variables.append(parse_num(log_msg[end_of_prev_block:m.b]))
        end_of_prev_block = m.b + m.size
    return variables
def _process_chances(dataset, query):  # module kwds, cmd kwds
    scores = []
    for data in dataset:  # for each keyword list in the module kwds
        # [[r(q1, d1)... r(qn, d1)], [r(q1, d2)... r(qn, d2)]...]
        avg_scores = [[sm(a=kw, b=sw).ratio() for kw in query] for sw in data]
        print(f"avg_scores=\n{avg_scores}")
        word_score = [score[0] for score in _hungarian_algorithm(avg_scores)]
        print(f"word_score=\n{word_score}\n")
        avg_score = sum(word_score) / len(word_score)
        scores.append([avg_score, word_score])
    return scores
def saveImg(data, base_host, base_url, base_path, image_title): maxval = 0 maxurl = "" imglinks = data.findAll("img") for link in imglinks: img_size = getsize(link['src'].strip()) sequence_match_ratio = 0 if 'alt' in link.attrs: sequence_match_ratio = sm(None, str(link['alt']), base_url.replace("http", "")).ratio() if sequence_match_ratio > maxval and img_size > 99999: maxval = sm(None, str(link['alt']), base_url).ratio() maxl = link maxurl = link['src'].strip() if "http" not in str(maxurl): maxurl = base_host.split("//")[0] + maxurl try: # uncomment if proxy settings # image = requests.get(maxurl, proxies=proxyDict) # comment if proxy settings image = requests.get(maxurl) print("Images Saved Successfully") except: print ("Error") exit(0) file = open(os.path.join(base_path, "%s.jpg") % image_title, 'wb') try: Image.open(BytesIO(image.content)).save(file, 'JPEG') except IOError as e: print("Couldnt Save:", e) finally: file.close() return maxl
def evaluateByTarget(score_dict, test_target_matches, num):
    '''
    Finds matches with prior RIA as the number of sentences outputted increases

    Args:
        score_dict (dict): Our target matches
        test_target_matches (dict): Target matches from prior RIA
        num (int): Number of output sentences to match.

    Returns:
        (dict): Dictionary of how many matches were found after each sentence
        per target
    '''
    truths = []
    match_by_sent = {}
    truth_dict = {}
    check = []
    for target in score_dict.keys():
        for result in get_matches(target, score_dict, num):
            if target in test_target_matches and len(test_target_matches[target]) > 1:
                sentences = result.split('.')
                for sent in sentences:
                    for ground_truth in test_target_matches[target]:
                        score = sm(None, ground_truth, sent).ratio()
                        if score > 0.50:
                            if score < 0.55:
                                check.append((ground_truth, sent))
                            if target in truth_dict and ground_truth not in truths:
                                truths.append(ground_truth)
                                truth_dict[target].append(ground_truth)
                            elif target not in truth_dict and ground_truth not in truths:
                                truth_dict[target] = [ground_truth]
                                truths.append(ground_truth)
            if target in truth_dict:
                if target in match_by_sent:
                    match_by_sent[target].append(
                        len(truth_dict[target]) /
                        (len(test_target_matches[target]) - 1))
                else:
                    match_by_sent[target] = [
                        len(truth_dict[target]) /
                        (len(test_target_matches[target]) - 1)
                    ]
            else:
                if target in match_by_sent:
                    match_by_sent[target].append(0)
                else:
                    match_by_sent[target] = [0]
    return match_by_sent
def results(self):
    if isinstance(self.words, str):
        self.words = [self.words]
    similarities = {}
    words = [word.strip() for word in self.words]
    if not words:
        return None
    for x in words:
        ratio = sm(None, self.word, x).ratio()
        similarities[x] = ratio
    results = dict(sorted(similarities.items(), key=lambda x: x[1],
                          reverse=True))
    return results
def map_audio(self):
    """
    Takes in a string that represents the file directory that will be
    inspected for MP3 and M4A files. Returns a dictionary. The key of the
    dictionary is an Artist. The value is another dictionary, whose key is
    the Album title, and value is a dictionary of songs. Each song
    dictionary is keyed by its name, and its value is a list of the songs
    that were considered duplicates. Each entry in that list is a
    dictionary with keys for bitrate, path, and file name.
    """
    for root, dirs, files in os.walk(self.dir):
        for name in files:
            if name.split(".")[-1].lower() in ('m4a', 'mp3'):
                cur_path = "{0}/{1}".format(root, name)
                cur_file = auto.File(cur_path)
                artist = cur_file.artist.lower().strip()
                album = cur_file.album.lower().strip()
                title = cur_file.title.lower().strip()
                bitrate = cur_file.bitrate
                if artist not in self.audio_dict:
                    self.audio_dict[artist] = {}
                if album not in self.audio_dict[artist]:
                    self.audio_dict[artist][album] = {}
                # fold near-duplicate titles into one key
                title_key = title
                for in_album_title in self.audio_dict[artist][album]:
                    if sm(None, title, in_album_title).ratio() > 0.9:
                        title_key = in_album_title
                if title_key not in self.audio_dict[artist][album]:
                    self.audio_dict[artist][album][title_key] = []
                self.audio_dict[artist][album][title_key].append({
                    'path': cur_path,
                    'bitrate': bitrate,
                    'file_name': name
                })
    return self
def get_common_str(log_msgs):
    common_str = None
    for i in log_msgs:
        if common_str is None:
            common_str = i
            continue
        y = sm(None, a=common_str, b=i)
        matched = y.get_matching_blocks()
        common_str = ''
        for m in matched[:-1]:
            if m.b == 0 or m.b + m.size == len(i) or m.size > 1:
                beg_idx = m.b
                end_idx = beg_idx + m.size
                # A heuristic for decimal fractions such as 0.xxx
                if i[end_idx - 2:end_idx] == '0.':
                    end_idx -= 2
                assert end_idx >= beg_idx
                common_str += i[beg_idx:end_idx]
    return common_str
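# Usage sketch: get_common_str and get_variable_str (earlier in this
# collection) together do simple log template mining. parse_num is an unshown
# helper of that code; the stand-in below is hypothetical, just to make the
# demo runnable.
def parse_num(s):
    try:
        return float(s)
    except ValueError:
        return s

logs = ["connect from 10.0.0.1 port 22",
        "connect from 10.0.0.2 port 80"]
template = get_common_str(logs)             # e.g. "connect from 10.0. port "
print(get_variable_str(template, logs[0]))  # the varying fragments: [0.1, 22.0]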
from nimble import cmds as mc
from difflib import SequenceMatcher as sm


def findName(namelike):
    objs = mc.ls()
    bodyName = ""
    compareScore = 0
    newScore = 0
    for nm in objs:
        newScore = sm(None, namelike, nm).ratio()
        if newScore > compareScore:
            compareScore = newScore
            bodyName = nm
    print("query for: \"" + namelike + "\" returns: " + bodyName)
    return bodyName


# alters the aspects of some object.
def bAlter(bName, timeFrame, changeVal, chStr):
    mc.select(bName)
    startTime = mc.currentTime(query=True)
    endTime = startTime + timeFrame
    rotBy = mc.getAttr(bName + "." + chStr) + changeVal
    mc.setKeyframe(attribute=chStr)
    mc.currentTime(endTime)
    mc.setKeyframe(attribute=chStr, v=rotBy)
    mc.currentTime(startTime)


def limbRotate(limbName, timeFrame, changeVal):
    bAlter(limbName, timeFrame, changeVal, "rotateX")
def best_match_for_query(code_string, elbow=10, local=False):
    # DEC strings come in as unicode so we have to force them to ASCII
    code_string = code_string.encode("utf8")
    tic = int(time.time() * 1000)

    # First see if this is a compressed code
    if re.match('[A-Za-z\/\+\_\-]', code_string) is not None:
        code_string = decode_code_string(code_string)
        if code_string is None:
            return Response(Response.CANNOT_DECODE, tic=tic)

    code_len = len(code_string.split(" ")) / 2
    if code_len < elbow:
        logger.warn("Query code length (%d) is less than elbow (%d)"
                    % (code_len, elbow))
        return Response(Response.NOT_ENOUGH_CODE, tic=tic)

    code_string = cut_code_string_length(code_string)
    code_len = len(code_string.split(" ")) / 2

    # Query the FP flat directly.
    response = query_fp(code_string, rows=30, local=local, get_data=True)
    logger.debug("solr qtime is %d" % (response.header["QTime"]))

    if len(response.results) == 0:
        return Response(Response.NO_RESULTS, qtime=response.header["QTime"],
                        tic=tic)

    # If we just had one result, make sure that it is close enough. We rarely
    # if ever have a single match so this is not helpful (and probably
    # doesn't work well.)
    top_match_score = int(response.results[0]["score"])
    if len(response.results) == 1:
        trackid = response.results[0]["track_id"]
        trackid = trackid.split("-")[0]  # will work even if no `-` in trid
        meta = metadata_for_track_id(trackid, local=local)
        if code_len - top_match_score < elbow:
            return Response(Response.SINGLE_GOOD_MATCH, TRID=trackid,
                            score=top_match_score,
                            qtime=response.header["QTime"], tic=tic,
                            metadata=meta)
        else:
            return Response(Response.SINGLE_BAD_MATCH,
                            qtime=response.header["QTime"], tic=tic)

    # If the scores are really low (less than 5% of the query length)
    # then say no results
    if top_match_score < code_len * 0.05:
        return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                        qtime=response.header["QTime"], tic=tic)

    # Not a strong match, so we look up the codes in the keystore and
    # compute actual matches...
    # Get the actual score for all responses
    original_scores = {}
    actual_scores = {}

    trackids = [r["track_id"].encode("utf8") for r in response.results]
    if local:
        tcodes = [_fake_solr["store"][t] for t in trackids]
    else:
        tcodes = get_tyrant().multi_get(trackids)

    # For each result compute the "actual score" (based on the histogram
    # matching)
    for (i, r) in enumerate(response.results):
        track_id = r["track_id"]
        original_scores[track_id] = int(r["score"])
        track_code = tcodes[i]
        if track_code is None:
            # Solr gave us back a track id but that track
            # is not in our keystore
            continue
        actual_scores[track_id] = actual_matches(code_string, track_code,
                                                 elbow=elbow)
        # logger.debug("Actual score for %s is %d (code_len %d), original was %d"
        #              % (r["track_id"], actual_scores[r["track_id"]],
        #                 code_len, top_match_score))

    # Sort the actual scores
    sorted_actual_scores = sorted(actual_scores.iteritems(),
                                  key=lambda (k, v): (v, k), reverse=True)

    # Because we split songs up into multiple parts, sometimes the results
    # will have the same track in the first few results. Remove these
    # duplicates so that the falloff is (potentially) higher.
    new_sorted_actual_scores = []
    existing_trids = []
    for trid, result in sorted_actual_scores:
        trid_split = trid.split("-")[0]
        if trid_split not in existing_trids:
            new_sorted_actual_scores.append((trid, result))
            existing_trids.append(trid_split)
    sorted_actual_scores = new_sorted_actual_scores

    # Remove duplicate matches - happens when the same song appears in the
    # database under a different track id. Use artist/track name similarity
    # against the top hit to spot them.
    new_sorted_actual_scores = []
    trid, result = sorted_actual_scores[0]
    metaTop = metadata_for_track_id(trid, local=local)
    try:
        topArtist = metaTop['artist']
    except:
        topArtist = 'NA'
    try:
        topTrack = metaTop['track']
    except:
        topTrack = 'NA'
    new_sorted_actual_scores.append((trid, result))
    for trid, result in sorted_actual_scores:
        trid_split = trid.split("-")[0]
        meta = metadata_for_track_id(trid, local=local)
        try:
            artist = meta['artist']
        except:
            artist = 'NA'
        try:
            track = meta['track']
        except:
            track = 'NA'
        if sm(None, artist, topArtist).ratio() < 0.6 and \
                sm(None, track, topTrack).ratio() < 0.6:
            new_sorted_actual_scores.append((trid, result))
            existing_trids.append(trid_split)
    sorted_actual_scores = new_sorted_actual_scores

    # We might have reduced the length of the list to 1
    if len(sorted_actual_scores) == 1:
        logger.info("only have 1 score result...")
        (top_track_id, top_score) = sorted_actual_scores[0]
        if top_score < code_len * 0.1:
            logger.info("only result less than 10%% of the query string "
                        "(%d < %d * 0.1 (%d)) SINGLE_BAD_MATCH",
                        top_score, code_len, code_len * 0.1)
            return Response(Response.SINGLE_BAD_MATCH,
                            qtime=response.header["QTime"], tic=tic)
        else:
            if top_score > (original_scores[top_track_id] / 2):
                logger.info("top_score > original_scores[%s]/2 (%d > %d) "
                            "GOOD_MATCH_DECREASED",
                            top_track_id, top_score,
                            original_scores[top_track_id] / 2)
                trid = top_track_id.split("-")[0]
                meta = metadata_for_track_id(trid, local=local)
                return Response(Response.MULTIPLE_GOOD_MATCH_HISTOGRAM_DECREASED,
                                TRID=trid, score=top_score,
                                qtime=response.header["QTime"], tic=tic,
                                metadata=meta)
            else:
                logger.info("top_score NOT > original_scores[%s]/2 (%d <= %d) "
                            "BAD_HISTOGRAM_MATCH",
                            top_track_id, top_score,
                            original_scores[top_track_id] / 2)
                return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                                qtime=response.header["QTime"], tic=tic)

    # Get the top one
    (actual_score_top_track_id, actual_score_top_score) = sorted_actual_scores[0]
    # Get the 2nd top one (we know there is always at least 2 matches)
    (actual_score_2nd_track_id, actual_score_2nd_score) = sorted_actual_scores[1]

    trackid = actual_score_top_track_id.split("-")[0]
    meta = metadata_for_track_id(trackid, local=local)

    if actual_score_top_score < code_len * 0.05:
        return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                        qtime=response.header["QTime"], tic=tic)
    else:
        # If the actual score went down it still could be close enough,
        # so check for that
        if actual_score_top_score > (original_scores[actual_score_top_track_id] / 4):
            if (actual_score_top_score - actual_score_2nd_score) >= \
                    (actual_score_top_score / 3):
                # e.g. [10, 4]: 10 - 4 = 6, which >= 10/3, so OK
                return Response(Response.MULTIPLE_GOOD_MATCH_HISTOGRAM_DECREASED,
                                TRID=trackid, score=actual_score_top_score,
                                qtime=response.header["QTime"], tic=tic,
                                metadata=meta)
            else:
                return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                                qtime=response.header["QTime"], tic=tic)
        else:
            # If the actual score was not close enough, then no match.
            return Response(Response.MULTIPLE_BAD_HISTOGRAM_MATCH,
                            qtime=response.header["QTime"], tic=tic)
def matching_score(first_name, second_name):
    return sm(None, first_name, second_name).ratio() * 100
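# Quick check for matching_score: ratio() = 2*M/T with M = 9 matched
# characters ("Jo" + "n Smith") and T = 19, so the score is ~94.74 out of 100.
print(matching_score("Jon Smith", "John Smith"))  # 94.736...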
def compare(school, name):
    return sm(None, deUnique(school), deUnique(name)).ratio()