import json

from ngram import NGram


def rank_ngram(query):
    # `cleaning` is an external helper from the surrounding project.
    ngram_analyse = []
    clean_query = cleaning(query)
    length_query = len(clean_query)
    if length_query == 1:
        return {}, {}

    docs = []
    for line in open('ngram.json', 'r'):
        docs.append(json.loads(line))

    rank_sum_dict = {}
    for doc in docs:
        rank_sum_dict[doc['url']] = 0

    # Build the query bigrams.
    query_gram2 = []
    for index, word in enumerate(clean_query):
        if index == length_query - 1:
            break
        two_word = word + ' ' + clean_query[index + 1]
        query_gram2.append(two_word)
    #print(query_gram2)

    # Build the query trigrams.
    query_gram3 = []
    for index, word in enumerate(clean_query):
        if index == length_query - 2:
            break
        three_word = word + ' ' + clean_query[index + 1] + ' ' + clean_query[index + 2]
        query_gram3.append(three_word)
    #print(query_gram3)

    # Score each document: +1 per matching bigram, +3 per matching trigram.
    for doc in docs:
        for query_word_gram2 in query_gram2:
            for doc_word in doc['gram2']:
                if NGram.compare(query_word_gram2, doc_word) >= 0.5:
                    rank_sum_dict[doc['url']] += 1
                    ngram_analyse.append(doc_word)
    ngram_analyse = list(set(ngram_analyse))

    for doc in docs:
        for query_word_gram3 in query_gram3:
            for doc_word in doc['gram3']:
                if NGram.compare(query_word_gram3, doc_word) >= 0.5:
                    rank_sum_dict[doc['url']] += 3
                    ngram_analyse.append(doc_word)
    ngram_analyse = list(set(ngram_analyse))
    #print(rank_sum_dict)

    # Keep only documents with a positive score.
    rank_sum_dict_unsorted = {}
    for key, value in rank_sum_dict.items():
        if value > 0:
            rank_sum_dict_unsorted[key] = value
    return rank_sum_dict_unsorted, ngram_analyse
import csv
import re

from ngram import NGram


def ngram_similarity(univ_name):
    out = {}
    with open("static/UniqueFBUnivNames.csv", 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            row = re.sub('[^A-Za-z0-9 ]+', ' ', str(row))
            row = re.sub(' +', ' ', str(row))  # collapse repeated spaces left by the substitution above
            out['score'] = NGram.compare(str(row).lower(), univ_name, N=1)
            if NGram.compare(str(row).lower(), str(univ_name).lower()) > 0.5:
                out['score_used'] = NGram.compare(str(row).lower(), univ_name)
                out['univ'] = str(row)
                return out
    return out
def FixedText(textt):
    word = {}
    word_count_index = []
    Value = [0.5]
    a = 0
    fix = ""
    maxvalue = Value[0]
    for key in r.scan_iter():
        x = NGram.compare(textt, key, N=1)
        if x >= 0.5:
            a = float(r.get(key))
            b = 100 * x + float(a)
            if b > maxvalue:
                maxvalue = b
                fix = key
                word[fix] = b
        else:
            continue
    for key in word.keys():
        if key is not None:
            word_count_index.append(r.get(key))
    return word_count_index, word.values(), word.keys()
def compare(initial, other):
    """
    Return the similarity index between two strings

    :returns: a number between 0 and 1
    """
    return NGram.compare(initial, other)
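# Minimal usage sketch for the wrapper above (hedged: assumes the ngram
# package's defaults, N=3 with '$' padding). NGram.compare returns a float
# in [0.0, 1.0], 1.0 meaning the two strings share all their n-grams.
from ngram import NGram

print(NGram.compare('colour', 'color'))   # non-zero, below 1.0
print(NGram.compare('colour', 'colour'))  # 1.0 for identical strings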
def search_for_track_by_album(self, query_track, api_version=ApiVersion.V2_2):
    perms = self.get_search_permutations(query_track)
    for query in perms:
        # query = '{} {}'.format(query_track.artist_name, query_track.title)
        query_params = urlencode({
            'apikey': self._api_key,
            'query': query,
            'type': 'track'
        })
        url = self.SEARCH_VERBOSE_URL.format(api_version=api_version)
        headers = {'Authorization': 'Bearer {}'.format(self._access_token_provider())}
        response = json.loads(requests.get(url, query_params, headers=headers).text)
        simple_tracks = {
            SimpleTrackNapster(track)
            for track in response['search']['data']['tracks']
        }
        final_track = {
            track
            for track in simple_tracks
            if (
                (
                    NGram.compare(query_track.album_title, track.album_title, N=1) > 0.8
                    or query_track.album_title in track.album_title
                )
                and query_track.artist_name == track.artist_name
                and query_track.title in track.title
            )
        }
        if len(final_track) > 0:
            return final_track
    return {}
def get(self, title, ner_tag):
    if ner_tag not in self.CHARACTERISTIC_PROPERTIES.keys():
        raise ValueError('NER tag is not supported for entity lookup.')

    # Prepare title: restore German umlauts (the third replace presumably
    # targets 'oe' -> 'ö' rather than repeating 'ue' -> 'ü').
    title = title.replace('ue', 'ü').replace('ae', 'ä').replace('oe', 'ö')

    # Get candidate items
    candidate_ids = flatten(
        [self._search_items(x) for x in self._extend_title(title)], True)
    candidates = self._get_items(candidate_ids)

    # Remove items from candidates which do not have any of the
    # characteristic properties of their NER tag
    present_properties = {
        item['id']: item['claims'].keys()
        for item in candidates
    }
    characteristic_properties = self.CHARACTERISTIC_PROPERTIES[ner_tag]
    candidates = [
        item for item in candidates
        if characteristic_properties.intersection(present_properties[item['id']])
    ]

    # Return candidate with the maximal similarity of its label and the
    # provided title
    if candidates:
        return max(candidates,
                   key=lambda item: NGram.compare(
                       title, item['labels']['en']['value'], N=2))
def analyze_pdb(self, dict_s, dict_t):
    '''
    Compare the PDB GUIDs by hash (match / no match) and the PDB paths by
    n-gram similarity; each part is weighted 0.5.
    :return: combined score rounded to two decimals, or "-" if no GUID
    '''
    guid_score = 0
    path_score = 0
    if dict_s['pe_pdb_GUID'] == "-" or dict_t['pe_pdb_GUID'] == "-":
        return "-"
    else:
        s_guid = hashlib.md5(dict_s['pe_pdb_GUID'].encode()).hexdigest()
        t_guid = hashlib.md5(dict_t['pe_pdb_GUID'].encode()).hexdigest()
        if s_guid == t_guid:
            guid_score += 50
        else:
            guid_score += 0
    if dict_s['pe_pdb_Pdbpath'] == "-" or dict_t['pe_pdb_Pdbpath'] == "-":
        path_score += 0
    else:
        path_score += NGram.compare(
            dict_s['pe_pdb_Pdbpath'], dict_t['pe_pdb_Pdbpath'], N=2) * 50
    score = guid_score + path_score
    #score = str(guid_score) + ',' + str(path_score)  # would need to be combined into one value
    return round(score, 2)
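# Illustrative sketch, not part of the original module: the same 50/50
# weighting as analyze_pdb above, applied to two hypothetical PE records.
# The dict keys mirror those used above; the sample GUIDs and paths are made up.
import hashlib

from ngram import NGram

sample_s = {'pe_pdb_GUID': 'AAAA-1111', 'pe_pdb_Pdbpath': r'C:\build\app\app.pdb'}
sample_t = {'pe_pdb_GUID': 'AAAA-1111', 'pe_pdb_Pdbpath': r'C:\build\app2\app.pdb'}

guid_match = (hashlib.md5(sample_s['pe_pdb_GUID'].encode()).hexdigest()
              == hashlib.md5(sample_t['pe_pdb_GUID'].encode()).hexdigest())
guid_score = 50 if guid_match else 0                      # GUID contributes up to 50 points
path_score = NGram.compare(sample_s['pe_pdb_Pdbpath'],
                           sample_t['pe_pdb_Pdbpath'], N=2) * 50  # path contributes up to 50 points
print(round(guid_score + path_score, 2))                  # combined score in [0, 100]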
def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.7, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare(article.title, nearest.title) < 0.7:
                results.append(article)
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={
        "article_list": results,
    })
def verify(self, text_compare):
    results = []
    dictio = []
    file2 = open(text_compare, "r")
    linea2 = file2.readline()
    while linea2 != '':
        if linea2 != '\n':
            dictio += [self.ng.items_sharing_ngrams(linea2)]
            compares = 0.0
            for parrafo in self.lsn:
                comp = NGram.compare(parrafo, linea2)
                if compares < comp:
                    compares = comp
            results += [compares]
        linea2 = file2.readline()
    file2.close()
    major_ocurrences = []
    for d in dictio:
        major = 0
        for val in d.values():
            if major < val:
                major = val
        major_ocurrences += [major]
    avg_perc = 0.0
    for r in results:
        avg_perc += r
    avg_perc = avg_perc / len(results)
    print("Highest number of shared-ngram occurrences per paragraph of the copied text: " + repr(major_ocurrences))
    print("Similarity percentage: " + repr(avg_perc))
def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=WARP, iconv=enrich, key=lambda x: x.title)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.4, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare(article.title, nearest.title) < 0.7:
                results.append(article)
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={
        "article_list": results,
    })
def compare_flow_const(self, base_flow_const, tar_flow_const):
    if base_flow_const == tar_flow_const:
        return 1.0
    else:
        return NGram.compare(' '.join(base_flow_const),
                             ' '.join(tar_flow_const), N=2)
def compare_ngrams(left, right, N=2, pad_len=0):
    left = ascii(left)
    right = ascii(right)
    if len(left) == 1 and len(right) == 1:
        # NGram.compare returns 0.0 for 1 letter comparison, even if letters
        # are equal.
        return 1.0 if left == right else 0.0
    return NGram.compare(left, right, N=N, pad_len=pad_len)
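# Small check of the edge case the helper above guards against (hedged: with
# pad_len=0 a one-character string yields no bigrams, so NGram.compare falls
# back to 0.0 even for identical inputs; the library's default padding behaves
# differently).
from ngram import NGram

print(NGram.compare('a', 'a', N=2, pad_len=0))  # 0.0, hence the explicit special case above
print(NGram.compare('a', 'a'))                  # 1.0 with the default '$' padding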
def test_ngram_search(self):
    """Tests from the original ngram.py, to check that the rewrite
    still uses the same underlying algorithm"""
    # Basic searching of the index
    idx = NGram(self.items)
    self.assertEqual(idx.search('askfjwehiuasdfji'), [
        ('askfjwehiuasdfji', 1.0),
        ('asdfawe', 0.17391304347826086),
        ('asfwef', 0.083333333333333329),
        ('adfwe', 0.041666666666666664)])
    self.assertEqual(idx.search('afadfwe')[:2],
                     [('adfwe', 0.59999999999999998),
                      ('asdfawe', 0.20000000000000001)])
    # Pairwise comparison of strings
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
def cumulative_score_strings(iline, jline, N):
    iString = " ".join(iline.split(" :::: ")[:3])
    jString = " ".join(jline.split(" :::: ")[:3])
    score = 0
    while N >= 1:
        score += NGram.compare(iString, jString, N=N)  # * N
        N = N - 1
    return score
def backoff_score_strings(iline, jline, N, T=0.0):
    iString = " ".join(iline.split(" :::: ")[:3])
    jString = " ".join(jline.split(" :::: ")[:3])
    score = -1
    while score <= T and N >= 1:
        score = NGram.compare(iString, jString, N=N)
        N = N - 1
    return score
def distance_ngrames(source_string, comparaison_string):
    """
    Return the degree of similarity between two strings using the n-gram
    method with N=2.
    - The bigram method builds every two-character combination of the
      sequential characters of each string, then compares the pairs to
      measure how well the strings match.
    :param source_string: the string we want to compare
    :param comparaison_string: another string
    :return: the matching percentage
    """
    return NGram.compare(source_string.lower(), comparaison_string.lower(), N=2)
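# Quick illustration of the bigram comparison described in the docstring above
# (hedged: the example strings are arbitrary). Lower-casing first, as the
# function does, keeps the score case-insensitive.
from ngram import NGram

print(NGram.compare('Bonjour'.lower(), 'bonsoir'.lower(), N=2))  # partial bigram overlap
print(NGram.compare('Bonjour'.lower(), 'BONJOUR'.lower(), N=2))  # 1.0 after lower-casing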
def get_string_similarity(standard, target):
    '''
    Compare two strings and return how similar they are, as one score per N.
    :param standard: reference string for the comparison
    :param target: target string for the comparison
    :return: result : dictionary {'1-Gram': result, '2-Gram': result, ..., '5-Gram': result}
    '''
    result = dict()
    # print(f'[+]Stand Binary(length {len(s)}) ::: target Binary(length {len(t)})')
    for i in range(1, 6):
        print(f"ngram base :: {standard}")
        print(f"ngram target :: {target}")
        result.update({str(i) + "-Gram": NGram.compare(standard, target, N=i)})
    return result
def guess_image(name):
    '''
    Guess which meme image they mean by finding the alias with greatest
    ngram similarity
    '''
    name = tokenize(name)
    best = '404'
    best_score = None
    for guess_image, names in IMAGES.iteritems():
        for guess in names:
            score = NGram.compare(guess, name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
    app.logger.info('Pick image %s for name "%s"' % (best, name))
    return best
def get(self, person_id):
    n = 2
    occurs = []
    grams_arr = []
    sixgrams = ngrams(str_read.split(), n)
    for grams in sixgrams:
        #print str(grams)
        x = NGram.compare('{}'.format(person_id.decode('latin-1')), str(grams))
        occurs.append(x)
        grams_arr.append(str(grams))
    main_fields = {'occurs': fields.String, "word": fields.String}
    datas = {
        'occurs': "{}".format(max(occurs) * 1000),
        'word': "{}".format(grams_arr[occurs.index(max(occurs))])
    }
    x = marshal(datas, main_fields)  # json.dumps(marshal(datas, main_fields))
    return x
def predict_char_ngram(N, paragraph, question, options, isSimilarity):
    paragraph = removePunctuation(paragraph)
    options = [removePunctuation(x) for x in options]
    # similarities = [NGram.compare(paragraph, s, N=N) for s in options]
    # similarities = [ngram_compare(paragraph, s, N=N) for s in options]
    similarities = [
        NGram.compare(paragraph, s, N=N, pad_len=0) for s in options
    ]
    if isSimilarity:
        prob = [x / sum(similarities) for x in similarities]
    else:
        similarities = [x + 0.00001 for x in similarities]  # avoid division by zero
        prob = [(1 / x) / sum(1 / y for y in similarities) for x in similarities]
    return prob
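# Standalone sketch of the two normalisations used above (hedged: toy numbers,
# not taken from the original project). In "similarity" mode each option's
# score is divided by the total; otherwise the inverse scores are normalised,
# after the small epsilon that prevents division by zero.
similarities = [0.4, 0.1, 0.0]

prob_similarity = [x / sum(similarities) for x in similarities]
epsiloned = [x + 0.00001 for x in similarities]
prob_inverse = [(1 / x) / sum(1 / y for y in epsiloned) for x in epsiloned]

print(prob_similarity)  # favours the most similar option
print(prob_inverse)     # favours the least similar option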
def guess_meme_image(meme_name):
    '''
    Guess which meme image they mean by finding the alias with greatest
    ngram similarity
    '''
    meme_name = tokenize(meme_name)
    best = ''
    best_score = None
    for guess_image, names in MEMES.items():
        for guess in names:
            guess = tokenize(guess)
            score = NGram.compare(guess, meme_name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
                app.logger.info('New best meme for "%s": "%s" (Score: %s)',
                                meme_name, guess, score)
    app.logger.info('Picked meme "%s" for name "%s"' % (best, meme_name))
    return best
def get(self, param_word):
    status = False
    n = 2
    occurs = []
    grams_arr = []
    words = []
    for key in r_server.scan_iter():
        words.append(key)
    #sixgrams = ngrams(str_read.split(), n)
    for keys in words:
        #print str(grams)
        x = NGram.compare('{}'.format(param_word.decode('latin-1')), str(keys))
        occurs.append(x)
        grams_arr.append(str(keys))
    for key in r_server.scan_iter():
        if key == param_word:
            status = True
    if status is True:
        main_fields_true = {
            "word": fields.String,
            "status": fields.Boolean
        }
        datas_true = {'word': "{}".format(param_word), 'status': status}
        x_true = marshal(datas_true, main_fields_true)
        return x_true
    else:
        main_fields_false = {
            'occurs': fields.String,
            "word": fields.String,
            "freq": fields.String,
            "status": fields.Boolean
        }
        datas_false = {
            'occurs': "{}".format(max(occurs) * 1000),
            'word': "{}".format(grams_arr[occurs.index(max(occurs))]),
            'freq': r_server.get(param_word),
            'status': status
        }
        x_false = marshal(datas_false, main_fields_false)
        return x_false
def get_similarity(self, content):
    """
    Return the similarity index with another content object

    :returns: similarity between 0.0 and 1.0, 1.0 meaning identical
    """
    labels = {'title', 'body', 'tags'}
    this, that = ({
        label: render_block_to_string("content/similarity/content.djhtml",
                                      item, {'content': self})
        for label in labels
    } for item in [self, content])
    weights = text_to_dict(
        render_to_string("content/similarity/weights.txt", {}),
        evaluate=True)  # text of the form key:value\n
    return sum([
        NGram.compare(this[label], that[label]) * weights[label]
        for label in labels
    ]) / sum(weights.values())
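# Minimal sketch of the weighted combination computed above, without the
# Django template rendering (hedged: the field texts and weights are
# placeholders, not values from the original project).
from ngram import NGram

this = {'title': 'Paris travel guide', 'body': 'Where to stay in Paris', 'tags': 'travel paris'}
that = {'title': 'Paris city guide', 'body': 'Hotels and food in Paris', 'tags': 'travel france'}
weights = {'title': 3, 'body': 1, 'tags': 2}

score = sum(NGram.compare(this[label], that[label]) * weights[label]
            for label in weights) / sum(weights.values())
print(score)  # weighted similarity in [0.0, 1.0]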
def guess_meme_image(meme_name):
    '''
    Guess which meme image they mean by finding the alias with greatest
    ngram similarity
    '''
    meme_name = tokenize(meme_name)
    best = ''
    best_score = None
    for guess_image, names in MEMES.items():
        for guess in names:
            guess = tokenize(guess)
            score = NGram.compare(guess, meme_name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
                app.logger.debug('New best meme for "%s": "%s" (Score: %s)',
                                 meme_name, guess, score)
    app.logger.info('Picked meme "%s" for name "%s" (Score: %s)',
                    best, meme_name, best_score)
    return best
def process(hr, sr, he, se):
    categories_relevant = {}
    categories_extracted = {}
    category_idx_list = []
    for i, h in enumerate(hr):
        for j, h1 in enumerate(he):
            if NGram.compare(hr[i], he[j]) > 0.95:
                category_idx_list.append((i, j))
    if he:
        if len(he) != len(se):
            return 0, 0
    for i, C in enumerate(category_idx_list):
        categories_relevant[i] = sr[C[0]]
        tmp = se[C[1]].replace('\r', '').replace('\n', '')
        categories_extracted[i] = tmp
    e = Evaluator(categories_relevant, categories_extracted)
    p, r = e.evaluate_using_ngrams(3)
    return p, r
def smart_read(url):
    resp = urllib2.urlopen(url)  # resolve url
    url = resp.url
    domain = urlparse(url).netloc
    path = urlparse(url).path
    html = resp.read()
    tree = etree.parse(StringIO.StringIO(html), parser)
    links = tree.xpath("//body//@href")
    nmax = 0
    for link in links:
        if urlparse(link).netloc == domain:
            ng = NGram.compare(urlparse(link).path, path)
            #print link, ng
            if ng > nmax and ng < 1:
                nmax = ng
                mirror = link
    diffh = htmldiff(visit_page(url)["body"], visit_page(mirror)["body"])
    tree = etree.parse(StringIO.StringIO(diffh), parser)
    diff = tree.xpath("//ins//text()")
    for d in diff:
        print d
def FixedText(textt):
    word = {}
    Value = [0.5]
    a = 0
    fix = ""
    maxvalue = Value[0]
    for key in r.scan_iter():
        x = NGram.compare(textt, key, N=1)
        if x >= 0.5:
            a = float(r.lindex(key, 0))
            b = 100 * x + float(a)
            if b > maxvalue:
                maxvalue = b
                fix = key
                word[fix] = b
        else:
            continue
    return word
my_dict = get_filelist("dict.txt") misspell = get_filelist("misspell.txt") correct = get_filelist("correct.txt") match = 0 total = 0 test_size = len(misspell) ngrams_result = list(range(test_size)) for i in range(test_size): max_similarity = 0 similarity_dict = {} for item in my_dict: similarity = NGram.compare(misspell[i], item) similarity_dict[item] = similarity if similarity > max_similarity: max_similarity = similarity if max_similarity > 0: ngrams_result[i] = get_keys(similarity_dict, max_similarity) else: ngrams_result[i] = [] print('Word Number:', i + 1, '/', test_size) print('MISSPELL:', misspell[i], '\nN-GRAMS:', ngrams_result[i], '\nCORRECT:', correct[i], '\n--------------------------------') total += len(ngrams_result[i]) if correct[i] in ngrams_result[i]: match += 1 precision = match / total
from ngram import NGram

# Note: NGram.compare returns a similarity in [0, 1], not a distance,
# despite the variable name below.
distance = NGram.compare("Ham", "Spam", N=2)
print(distance)
def nearlySameText(text_1, text_2):
    return NGram.compare(text_1.strip(), text_2.strip()) >= 0.9
def ng_pare(needle='default', fn='/usr/share/dict/words', pth=.50):
    with open(fn, 'r') as fh:
        ng_haystack = {line.lower() for line in fh
                       if NGram.compare(needle, line.lower(), N=1) - pth >= 0.0}
    return ng_haystack
sim_arry7 = [
    jellyfish.match_rating_comparison(unicode(string[0]), unicode(s))
    for s in string
]
print 'match rating comparison', sim_arry7
# tokens = word_tokenize([string])
# print(string_token)
# print tfidf_matrix
# print(y.toarray())
ngram_array = [word_grams(s.split(' ')) for s in string]
# print ngram_array
n = NGram()
# print list(n.split(string[0]))
ngram_array = [list(n.split(s)) for s in string]
# print ngram_array
sim_arry8 = [NGram.compare(string[0].lower(), s.lower(), N=4) for s in string]
print 'ngram', sim_arry8


def jaccard_distance(a, b):
    # print a, b
    inter_len = float(len(list(a.intersection(b))))
    union_len = float(len(list(a.union(b))))
    return inter_len / union_len


# print list(ngram_array[0].intersection(ngram_array[1]))
sim_arry9 = [
    jaccard_distance(NGram(ngram_array[0]), NGram(s)) for s in ngram_array
]
print 'jaccard', sim_arry9
class KCM(object):
    """docstring for KCM"""
    '''args
        lang            help='language, english or chinese (eng/cht)', required=True
        io_dir          help='input output directory', required=True
        max_file_count  help='maximum number of input files, 0 for no limit', type=int, default=0
        thread_count    help='number of thread used', type=int, default=1
    '''

    def __init__(self, lang, io_dir, max_file_count=0, thread_count=1, uri=None):
        self.BASE_DIR = BASE_DIR
        self.lang = lang
        self.io_dir = os.path.join(io_dir, self.lang)
        self.max_file_count = max_file_count
        self.thread_count = thread_count
        self.client = MongoClient(uri)
        self.db = self.client['nlp']
        self.Collect = self.db['kcm']

        # ngram search
        self.kcmNgram = NGram((i['key'] for i in self.Collect.find({}, {
            'key': 1,
            '_id': False
        })))
        logging.basicConfig(format='%(levelname)s : %(asctime)s : %(message)s',
                            filename='KCM_{}.log'.format(self.lang),
                            level=logging.INFO)
        logging.info('Begin gen_kcm.py')
        logging.info('input {self.max_file_count} files, '
                     'output to {self.io_dir}, '
                     'maximum file count {self.max_file_count}, '
                     'use {self.thread_count} threads'.format(**locals()))

    def get_source_file_list(self):
        """Generate list of term data source files

        Args:
            args: input arguments, use self.lang, self.max_file_count
        Returns:
            list of source files
        """
        file_list = []
        # wiki files
        for (dir_path, dir_names, file_names) in os.walk(self.io_dir):
            for file_name in file_names:
                if self.max_file_count and len(file_list) >= self.max_file_count:
                    break
                if file_name == '.DS_Store' or '.model' in file_name:  # for OS X
                    continue
                file_list.append(os.path.join(dir_path, file_name))
                logging.info('appended file {}'.format(
                    os.path.join(dir_path, file_name)))
        if not file_list:
            logging.info('no file selected, end of script')
            exit()
        return file_list

    @timing
    def remove_symbols_tags(self, if_name):
        """Remove symbols and tags. Read input file, output to output file.

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang
        Returns:
            output file name
        """
        return rm_tags(if_name)

    @timing
    def paragraphs_to_sentences(self, inputData):
        """Generate sentences from paragraphs. Read input file, output to output file

        Args:
            inputData: input data from former process.
            args: input arguments, use self.io_dir, self.lang
        Returns:
            output file name
        """
        return paragraphs_to_sentences_cht(inputData)

    @timing
    def sentences_to_terms(self, if_name, inputData):
        """generate terms from sentences

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang
        Returns:
            output file name
        """
        prefix = if_name.split('/')[-1].split('_')[0]
        of_name = '{self.io_dir}/{prefix}_terms_{self.lang}'.format(**locals())
        PosTokenizer(self.BASE_DIR, inputData, of_name, 'r')
        return of_name

    @removeInputFile
    @timing
    def terms_to_term_pairs(self, if_name):
        """Generate term pairs from terms.

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang
        Returns:
            output file name
        """
        of_name = '{self.io_dir}/{self.lang}.model'.format(**locals())
        script_file = 'build/terms_to_term_pair_freq.py'
        terms_to_term_pair_freq(if_name, of_name, min_freq=1, max_term_len=20)

    @timing
    def join_terms_files(self, if_names):
        """Join terms files into one

        Args:
            if_names: input terms files names
            args: input arguments
        """
        of_name = '{self.io_dir}/terms_{self.lang}'.format(**locals())
        with open(of_name, 'w') as output_file:
            for if_name in if_names:
                with open(if_name, 'r') as input_file:
                    for line in input_file:
                        output_file.write(line)
        return of_name

    def gen_terms_file(self, if_name, o_list):
        """Generate terms file

        Args:
            if_name: input wiki source file name
            args: input arguments
            o_list: output list saving generated file name
        """
        result = self.remove_symbols_tags(if_name)
        result = self.paragraphs_to_sentences(result)
        of_name = self.sentences_to_terms(if_name, result)
        o_list.append(of_name)

    def thread_job(self, input_file_queue, o_term_files):
        """Job to be done by thread (generate terms file)

        Args:
            input_file_queue: queue containing input files that needs processing
            args: input arguments
            o_term_files: list for outputting generated term file names
        """
        while True:
            if_name = input_file_queue.get()
            if if_name is None:
                break  # end of thread
            self.gen_terms_file(if_name, o_list=o_term_files)
            input_file_queue.task_done()

    @timing
    def main(self):
        """main function"""
        if_list = self.get_source_file_list()
        term_files = []

        input_file_queue = queue.Queue()
        threads = []
        for i in range(self.thread_count):
            t = Thread(target=self.thread_job,
                       args=(input_file_queue, term_files))
            t.start()
            threads.append(t)

        for if_name in if_list:
            input_file_queue.put(if_name)

        # block till all tasks are done (here means all input file processed)
        input_file_queue.join()

        # stop all threads
        for i in range(self.thread_count):
            # in thread_job, when input_file_queue.get() == None, thread will end
            input_file_queue.put(None)
        for t in threads:
            t.join()  # wait till all threads really end

        of_name = self.join_terms_files(term_files)
        self.terms_to_term_pairs(of_name)
        self.import2DB()

    def setLang(self, lang):
        self.lang = lang
        self.io_dir = os.path.join(io_dir, self.lang)

    def removeDB(self):
        self.Collect.remove({})

    def import2DB(self):
        result = dict()
        with open(os.path.join(self.io_dir, "{0}.model".format(self.lang)),
                  'r', encoding='utf8') as f:
            for i in f:
                tmp = i.split()
                result.setdefault(tmp[0], []).append([tmp[1], int(tmp[2])])
                result.setdefault(tmp[1], []).append([tmp[0], int(tmp[2])])
        documentArr = [{
            'key': pair[0],
            'value': sorted(pair[1], key=lambda x: -x[1])
        } for pair in result.items()]
        del result
        self.Collect.insert(documentArr)
        self.Collect.create_index([("key", pymongo.HASHED)])

    def get(self, keyword, amount):
        result = self.Collect.find({'key': keyword}, {'_id': False}).limit(1)
        if result.count():
            return {**(result[0]), 'similarity': 1}
        else:
            ngramKeyword = self.kcmNgram.find(keyword)
            if ngramKeyword:
                result = self.Collect.find({
                    'key': ngramKeyword
                }, {
                    '_id': False
                }).limit(1)
                return {
                    'key': ngramKeyword,
                    'value': result[0]['value'][:amount],
                    'similarity': self.kcmNgram.compare(keyword, ngramKeyword)
                }
            return {'key': ngramKeyword, 'value': [], 'similarity': 0}
        string += str(index)
        return string


if __name__ == '__main__':
    n = NGramCompare()
    solos, generatedSolo = n.importFiles(
        'Recreated 2, 100 epochs, learning rate = 0.0055.mid')
    trans = []
    for solo in solos:
        trans.append(n.translateToString(solo))
    s1 = n.translateToString(generatedSolo)
    ngramScores = []
    editdistances = []
    for solo in trans:
        ngramScores.append(NGram.compare(s1, solo))
        editdistances.append(editdistance.eval(s1, solo))
    avgngram = 0
    avgedit = 0
    for g in ngramScores:
        avgngram += g
    avgngram = avgngram / len(ngramScores)
    for dist in editdistances:
        avgedit += dist
    avgedit = avgedit / len(editdistances)
    ngramScores.sort(reverse=True)
    editdistances.sort()
    print('Average n-gram score: ' + str(avgngram))
    print('Average edit-distance score: ' + str(avgedit))
    print('Best n-gram score: ' + str(ngramScores[0]))
    print('Best edit distance score: ' + str(editdistances[0]))
def score_oro_string(iline, jline, N):
    scores = [NGram.compare(x, y, N=N)
              for x, y in zip(iline.split(" :::: ")[:3],
                              jline.split(" :::: ")[:3])]
    scores += [NGram.compare(" ".join(iline.split(" :::: ")[:3]),
                             " ".join(jline.split(" :::: ")[:3]), N=N) * 10]
    return sum([log(x) if x > 0 else -10 for x in scores])
query = dup_label

# Set up
pp = pprint.PrettyPrinter(indent=2)
sparql = SPARQLWrapper("http://husky-big.cs.uwaterloo.ca:8890/sparql")

# Send query
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
entries = results["results"]["bindings"]

"""
for result in entries:
    pp.pprint((result['s']['value'], result['o']["value"]))
"""

# Approximately 20k entries
print(len(entries), " entries retrieved")

for (indexA, entryA) in enumerate(entries):
    for (indexB, entryB) in enumerate(entries):
        if (indexA <= indexB):
            distance = NGram.compare(entryA['o']['value'], entryB['o']['value'])
            if (distance > 0.8 and entryA['s']['value'] != entryB['s']['value']):
                pp.pprint((distance,
                           entryA['s']['value'], entryA['o']['value'],
                           entryB['s']['value'], entryB['o']['value']))
from nltk import ngrams
from ngram import NGram

sentence = 'this is a foo bar sentences and i want to ngramize it'
n = 2
sixgrams = ngrams(sentence.split(), n)
for grams in sixgrams:
    print grams
print NGram.compare('spa', 'spam')
# Open the corresponding edited subtitle file
subs = pysrt.open(osp.join('edited', 'edited_' + basename + '.srt'))

num_speakers_matched = 0    # Number of speakers identified
match_threshold = 0.15      # NGram matching confidence required to declare a match
dialogue_idx = 0            # Index of dialogue list
sub_idx = 0                 # Index of subtitle list
num_fails = 0               # Number of dialogue pairs in which no match was found for current subtitle
return_to_dialogue_idx = 0  # Dialogue index to return to for next subtitle if no match is ever found for current subtitle

while dialogue_idx < len(dialogue) and sub_idx < len(subs):
    next_dialogue_idx = min(dialogue_idx + 1, len(dialogue) - 1)
    curr_line = dialogue[dialogue_idx]
    next_line = dialogue[next_dialogue_idx]
    curr_sub = subs[sub_idx].text
    curr_score = NGram.compare(curr_line, curr_sub)
    next_score = NGram.compare(next_line, curr_sub)
    #flog.write('SUBTITLE: ' + curr_sub + '\n')
    #flog.write('DIALOGUE_PAIR: ({}, {}), ({}, {})\n'.format(curr_line, curr_score, next_line, next_score))

    # If we're somewhat confident that a match is found, handle it
    if curr_score > match_threshold or next_score > match_threshold:
        # If the subtitle matches to the current line of dialogue
        # better than it matches to the next one, assign it the speaker of this line
        # and increment the subtitle index to the next one
        if curr_score >= next_score:
            subs[sub_idx].text = speakers[dialogue_idx] + ': ' + curr_sub
            #flog.write('RESULT: ' + subs[sub_idx].text + '\n')
            sub_idx += 1
            num_fails = 0
        # If the better match is to the next line of dialogue, then
from ngram import NGram

a = NGram.compare('sin', 'sing')
print(a)
def sim(a, b):
    return 1 - NGram.compare(a.title, b.title, warp=WARP, iconv=enrich)
def is_album_name_the_same(s1, s2):
    return NGram.compare(s1.lower(), s2.lower()) > 0.8
class PMI(object):
    """docstring for PMI"""

    def __init__(self, lang, uri=None, ngram=False):
        self.client = pymongo.MongoClient(uri)
        self.uri = uri
        self.lang = lang
        self.db = self.client['nlp_{}'.format(self.lang)]
        self.fs = gridfs.GridFS(self.db)
        self.Collect = self.db['pmi']
        self.cpus = math.ceil(mp.cpu_count() * 0.2)
        self.frequency = {}
        if ngram:
            # use ngram for searching
            self.pmiNgram = NGram((i['key'] for i in self.db['pmi'].find({}, {
                'key': 1,
                '_id': False
            })))

    def getWordFreqItems(self):
        # use cache
        if os.path.exists('frequency.pkl'):
            self.frequency = pickle.load(open('frequency.pkl', 'rb'))
            frequency_of_total_keyword = pickle.load(
                open('frequency_of_total_keyword.pkl', 'rb'))
            return frequency_of_total_keyword

        # return all frequency of word in type of dict.
        self.frequency = {}
        frequency_of_total_keyword = 0

        # iterate through gridFS
        for keyword in self.fs.list():
            cursor = self.fs.find({"filename": keyword})[0]
            value = {
                'PartOfSpeech': cursor.contentType,
                'value': json.loads(self.fs.get(cursor._id).read().decode('utf-8'))
            }
            for correlation_keyword, PartOfSpeech, corTermCount in value['value']:
                frequency_of_total_keyword += corTermCount
                # accumulate keyword's frequency.
                self.frequency[keyword] = self.frequency.setdefault(
                    keyword, 0) + corTermCount

        # iterate through all normal collection
        for i in self.db['kcm'].find({}):
            keyword = i['key']
            for correlation_keyword, PartOfSpeech, corTermCount in i['value']:
                frequency_of_total_keyword += corTermCount
                # accumulate keyword's frequency.
                self.frequency[keyword] = self.frequency.setdefault(
                    keyword, 0) + corTermCount

        pickle.dump(self.frequency, open('frequency.pkl', 'wb'))
        pickle.dump(frequency_of_total_keyword,
                    open('frequency_of_total_keyword.pkl', 'wb'))
        return frequency_of_total_keyword

    def build(self):
        self.Collect.remove({})
        # read all frequency from KCM and build all PMI of KCM in MongoDB.
        # with format {key:'中興大學', freq:100, value:[(keyword, PMI-value), (keyword, PMI-value)...]}
        frequency_of_total_keyword = self.getWordFreqItems()
        print('frequency of total keyword:' + str(frequency_of_total_keyword))

        @graceful_auto_reconnect
        def process_job(job_list):
            # Each process need independent Mongo Client
            # or it may raise Deadlock in Mongo.
            client = pymongo.MongoClient(self.uri)
            db = client['nlp_{}'.format(self.lang)]
            process_collect = db['pmi']
            kcm_collect = db['kcm']
            fs = gridfs.GridFS(db)

            result = []
            for keyword, keyword_freq in job_list:
                pmiResult = []

                collection_cursor = kcm_collect.find({
                    'key': keyword
                }, {
                    'value': 1,
                    '_id': False
                }).limit(1)
                if collection_cursor.count() == 0:
                    gridfs_cursor = fs.find({"filename": keyword}).limit(1)[0]
                    cursor_result = json.loads(
                        fs.get(gridfs_cursor._id).read().decode('utf-8'))[:500]
                else:
                    cursor_result = collection_cursor[0]['value']
                for kcmKeyword, PartOfSpeech, kcmCount in cursor_result:
                    # algorithm:
                    # PMI = log2(p(x, y)/p(x)*p(y))
                    # p(x, y) = frequency of (x, y) / frequency of total keyword.
                    # p(x) = frequency of x / frequency of total keyword.
                    value = math.log2(
                        kcmCount * frequency_of_total_keyword /
                        (keyword_freq * self.frequency[kcmKeyword]))

                    # this equation is contributed by 陳聖軒.
                    # contact him with facebook: https://www.facebook.com/henrymayday
                    value *= math.log2(self.frequency[kcmKeyword])

                    pmiResult.append((kcmKeyword, value))

                pmiResult = sorted(pmiResult, key=lambda x: -x[1])
                result.append({
                    'key': keyword,
                    'freq': keyword_freq,
                    'value': pmiResult
                })

                # Insert Collections into MongoDB
                if len(result) > 5000:
                    process_collect.insert(result)
                    result = []

        amount = math.ceil(len(self.frequency) / self.cpus)
        job_list = list(self.frequency.items())
        job_list = [
            job_list[i:i + amount]
            for i in range(0, len(self.frequency), amount)
        ]
        processes = [
            mp.Process(target=process_job, kwargs={'job_list': job_list[i]})
            for i in range(self.cpus)
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()

        self.Collect.create_index([("key", pymongo.HASHED)])

    def get(self, keyword, amount):
        cursor = self.Collect.find({
            'key': keyword
        }, {
            'value': 1,
            '_id': False
        }).limit(1)
        if cursor.count() != 0:
            return {
                'key': keyword,
                'value': cursor[0]['value'][:amount],
                'similarity': 1
            }
        else:
            pmiNgramKeyword = self.pmiNgram.find(keyword)
            if pmiNgramKeyword:
                result = self.Collect.find({
                    'key': pmiNgramKeyword
                }, {
                    'value': 1,
                    '_id': False
                }).limit(1)[0]['value'][:amount]
                return {
                    'key': pmiNgramKeyword,
                    'value': result,
                    'similarity': self.pmiNgram.compare(pmiNgramKeyword, keyword)
                }
            return {}
def simple_score_strings(iline, jline, N):
    iString = " ".join(iline.split(" :::: ")[:3])
    jString = " ".join(jline.split(" :::: ")[:3])
    return NGram.compare(iString, jString, N=N)
def ngram_compare(source_wl, target_wl):
    _w1, _w2 = " ".join(source_wl).lower(), " ".join(target_wl).lower()
    return NGram.compare(_w1, _w2)
def ngram_test(w1, w2, n):
    return NGram.compare(w1, w2, N=n)