def rank_ngram(query):
    ngram_analyse = []
    clean_query = cleaning(query)
    length_query = len(clean_query)
    if length_query == 1:
        return {}, {}

    docs = []

    with open('ngram.json', 'r') as ngram_file:
        for line in ngram_file:
            docs.append(json.loads(line))

    rank_sum_dict = {}
    for doc in docs:
        rank_sum_dict[doc['url']] = 0

    query_gram2 = []
    for index, word in enumerate(clean_query):
        if index == length_query - 1:
            break
        two_word = word + ' ' + clean_query[index + 1]
        query_gram2.append(two_word)

    #print(query_gram2)

    query_gram3 = []
    for index, word in enumerate(clean_query):
        if index == length_query - 2:
            break
        three_word = word + ' ' + clean_query[index +
                                              1] + ' ' + clean_query[index + 2]
        query_gram3.append(three_word)

    #print(query_gram3)

    for doc in docs:
        for query_word_gram2 in query_gram2:
            for doc_word in doc['gram2']:
                if NGram.compare(query_word_gram2, doc_word) >= 0.5:
                    rank_sum_dict[doc['url']] = rank_sum_dict[doc['url']] + 1
                    ngram_analyse.append(doc_word)

    ngram_analyse = list(set(ngram_analyse))

    for doc in docs:
        for query_word_gram3 in query_gram3:
            for doc_word in doc['gram3']:
                if NGram.compare(query_word_gram3, doc_word) >= 0.5:
                    rank_sum_dict[doc['url']] = rank_sum_dict[doc['url']] + 3
                    ngram_analyse.append(doc_word)

    ngram_analyse = list(set(ngram_analyse))

    #print(rank_sum_dict)
    rank_sum_dict_unsorted = {}
    for key, value in rank_sum_dict.items():
        if value > 0:
            rank_sum_dict_unsorted[key] = value

    return rank_sum_dict_unsorted, ngram_analyse
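A minimal sketch of the bigram scoring used above, with a made-up document entry shaped like one line of ngram.json and the same 0.5 NGram.compare threshold:

from ngram import NGram

# Hypothetical document entry shaped like one line of ngram.json above.
doc = {'url': 'http://example.com/a', 'gram2': ['machine learning', 'learning models']}

clean_query = ['machine', 'learning', 'model']
query_gram2 = [' '.join(pair) for pair in zip(clean_query, clean_query[1:])]

score = 0
for query_word_gram2 in query_gram2:
    for doc_word in doc['gram2']:
        if NGram.compare(query_word_gram2, doc_word) >= 0.5:  # same threshold as rank_ngram
            score += 1
print(doc['url'], score)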
def ngram_similarity(univ_name):
    out = {}
    with open("static/UniqueFBUnivNames.csv", 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            row = re.sub('[^A-Za-z0-9 ]+', ' ', str(row))
            row = re.sub('  ', ' ', str(row))
            out['score'] = NGram.compare(str(row).lower(), univ_name, N=1)
            if NGram.compare(str(row).lower(), str(univ_name).lower()) > 0.5:
                out['score_used'] = NGram.compare(str(row).lower(), univ_name)
                out['univ'] = str(row)
                return out
    return out
def FixedText(textt):

    word = {}
    word_count_index = []
    Value = [0.5]
    a = 0
    fix = ""
    maxvalue = Value[0]
    for key in r.scan_iter():
        x = NGram.compare(textt, key, N=1)
        if x >= 0.5:
            a = float(r.get(key))
            b = 100 * x + float(a)

            if b > maxvalue:
                maxvalue = b
                fix = key

            word[fix] = b

        else:
            continue
    for key in word.keys():
        if key is None:
            pass
        else:
            word_count_index.append(r.get(key))

    return word_count_index, word.values(), word.keys()
Example 5
def compare(initial, other):
    """
    Return the similarity index between two strings

    :returns: a number between 0 and 1
    """
    return NGram.compare(initial, other)
Example 6
    def search_for_track_by_album(self, query_track, api_version=ApiVersion.V2_2):
        perms = self.get_search_permutations(query_track)

        for query in perms:
            # query = '{} {}'.format(query_track.artist_name, query_track.title)
            query_params = urlencode({
                'apikey': self._api_key,
                'query': query,
                'type': 'track'
            })
            url = self.SEARCH_VERBOSE_URL.format(api_version=api_version)
            headers = {'Authorization': 'Bearer {}'.format(self._access_token_provider())}
            response = json.loads(requests.get(url, query_params, headers=headers).text)
            simple_tracks = {
                SimpleTrackNapster(track)
                for track
                in response['search']['data']['tracks']
            }

            final_track = {
                track for track in simple_tracks
                if (
                       (
                           NGram.compare(query_track.album_title, track.album_title, N=1) > 0.8 or
                           query_track.album_title in track.album_title
                        ) and
                       query_track.artist_name == track.artist_name and
                       query_track.title in track.title

                )
            }

            if len(final_track) > 0:
                return final_track
        return {}
Example 7
    def get(self, title, ner_tag):
        if ner_tag not in self.CHARACTERISTIC_PROPERTIES.keys():
            raise ValueError('NER tag is not supported for entity lookup.')

        # Prepare title
        title = title.replace('ue', 'ü').replace('ae', 'ä').replace('oe', 'ö')

        # Get candidate items
        candidate_ids = flatten(
            [self._search_items(x) for x in self._extend_title(title)], True)
        candidates = self._get_items(candidate_ids)

        # Remove items from candidates, which do not have any of the
        # characteristic properties regarding their NER tag
        present_properties = {
            item['id']: item['claims'].keys()
            for item in candidates
        }
        characteristic_properties = self.CHARACTERISTIC_PROPERTIES[ner_tag]
        candidates = [
            item for item in candidates
            if characteristic_properties.intersection(present_properties[
                item['id']])
        ]

        # Return candidate with the maximal similarity of its label and the
        # provided title
        if candidates:
            return max(candidates,
                       key=lambda item: NGram.compare(
                           title, item['labels']['en']['value'], N=2))
Example 8
    def analyze_pdb(self, dict_s, dict_t):
        '''
        Compare the PDB GUIDs by hash (True/False).
        Compare the PDB paths with n-grams (similarity score).
        Each component is weighted 0.5.
        :return:
        '''
        guid_score = 0
        path_score = 0
        if dict_s['pe_pdb_GUID'] == "-" or dict_t['pe_pdb_GUID'] == "-":
            return "-"
        else:
            s_guid = hashlib.md5(dict_s['pe_pdb_GUID'].encode()).hexdigest()
            t_guid = hashlib.md5(dict_t['pe_pdb_GUID'].encode()).hexdigest()
            if s_guid == t_guid:
                guid_score += 50
            else:
                guid_score += 0
        if dict_s['pe_pdb_Pdbpath'] == "-" or dict_t['pe_pdb_Pdbpath'] == "-":
            path_score += 0
        else:
            path_score += NGram.compare(
                dict_s['pe_pdb_Pdbpath'], dict_t['pe_pdb_Pdbpath'], N=2) * 50

        score = (guid_score + path_score)
        #score = str(guid_score)+','+str(path_score)     # should be combined into one value

        return round(score, 2)
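A self-contained sketch of the 50/50 weighting described in the docstring above, on made-up PE metadata in the shape analyze_pdb expects:

import hashlib
from ngram import NGram

# Made-up records in the shape analyze_pdb expects.
dict_s = {'pe_pdb_GUID': 'ABC123', 'pe_pdb_Pdbpath': r'c:\build\release\app.pdb'}
dict_t = {'pe_pdb_GUID': 'ABC123', 'pe_pdb_Pdbpath': r'c:\build\debug\app.pdb'}

s_guid = hashlib.md5(dict_s['pe_pdb_GUID'].encode()).hexdigest()
t_guid = hashlib.md5(dict_t['pe_pdb_GUID'].encode()).hexdigest()
guid_score = 50 if s_guid == t_guid else 0                      # exact GUID match: 50 points
path_score = NGram.compare(dict_s['pe_pdb_Pdbpath'],
                           dict_t['pe_pdb_Pdbpath'], N=2) * 50  # fuzzy path match: up to 50 points
print(round(guid_score + path_score, 2))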
Example 9
def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.7, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare(article.title, nearest.title) < 0.7:
                    results.append(article)
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={
        "article_list": results,
    })
Example 10
	def verify(self,text_compare):
		results = []
		dictio = []
		file2 = open(text_compare,"r")
		linea2 = file2.readline()
		while linea2 != '':	
			if linea2 != '\n':
				dictio += [self.ng.items_sharing_ngrams(linea2)]
				compares = 0.0
				for parrafo in self.lsn:
					comp = NGram.compare(parrafo,linea2)
					if compares < comp:
						compares = comp
				results += [compares]
			linea2 = file2.readline()
		file2.close()

		major_ocurrences=[]
		for d in dictio:
			major=0
			for val in d.values():
				if major<val:
					major=val
			major_ocurrences+=[major]
			

		avg_perc=0.0
		for r in results:
			avg_perc+=r
		avg_perc=avg_perc/len(results)

		print("Highest number of n-gram occurrences per paragraph of the copied text: "+repr(major_ocurrences))
		print("Similarity percentage: "+repr(avg_perc))
Example 11
def simtitle( request ):
    """calculate similarity based on title and naive threshold"""
    n = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter( lambda a: a[1] >= 0.4, n.search( article.title ) )
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare( article.title, nearest.title ) < 0.7:
                    results.append( article )
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append( article )
        n.add( article )
    return render( request, "dump.html", dictionary = { "article_list": results, } )
Example 12
    def compare_flow_const(self, base_flow_const, tar_flow_const):

        if base_flow_const == tar_flow_const:
            return 1.0
        else:
            return NGram.compare(' '.join(base_flow_const),
                                 ' '.join(tar_flow_const),
                                 N=2)
Example 13
def compare_ngrams(left, right, N=2, pad_len=0):
    left = ascii(left)
    right = ascii(right)
    if len(left) == 1 and len(right) == 1:
        # NGram.compare returns 0.0 for 1 letter comparison, even if letters
        # are equal.
        return 1.0 if left == right else 0.0
    return NGram.compare(left, right, N=N, pad_len=pad_len)
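The guard above exists because, with pad_len=0, a single character produces no bigrams at all, so the library reports zero similarity even for equal letters; a quick check:

from ngram import NGram

print(NGram.compare('a', 'a', N=2, pad_len=0))    # 0.0: no bigrams to share
print(NGram.compare('ab', 'ab', N=2, pad_len=0))  # 1.0: identical bigram sets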
Example 15
    def test_ngram_search(self):
        """Tests from the original ngram.py, to check that the
        rewrite still uses the same underlying algorithm"""

        # Basic searching of the index
        idx = NGram(self.items)
        self.assertEqual(idx.search('askfjwehiuasdfji'), [
            ('askfjwehiuasdfji', 1.0),
            ('asdfawe', 0.17391304347826086),
            ('asfwef', 0.083333333333333329),
            ('adfwe', 0.041666666666666664)])
        self.assertEqual(idx.search('afadfwe')[:2],
                [('adfwe', 0.59999999999999998),
                 ('asdfawe', 0.20000000000000001)])

        # Pairwise comparison of strings
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
Example 17
def cumulative_score_strings(iline, jline, N):
	iString = " ".join(iline.split(" :::: ")[:3])
	jString = " ".join(jline.split(" :::: ")[:3])

	score = 0
	while N >= 1:
		score += (NGram.compare(iString, jString, N=N)) #* N)
		N = N - 1

	return score
Example 18
def backoff_score_strings(iline, jline, N, T=0.0):
	iString = " ".join(iline.split(" :::: ")[:3])
	jString = " ".join(jline.split(" :::: ")[:3])

	score = -1
	while score <= T and N >= 1:
		score = NGram.compare(iString, jString, N=N)
		N = N - 1

	return score
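A self-contained walk-through of the backoff idea above on two made-up ' :::: '-separated records (the field layout is an assumption):

from ngram import NGram

iline = "John Smith :::: 1952 :::: London :::: extra-field"
jline = "Jon Smith :::: 1952 :::: London :::: other-field"

iString = " ".join(iline.split(" :::: ")[:3])
jString = " ".join(jline.split(" :::: ")[:3])

# Back off from 4-grams towards unigrams until the score clears the threshold T.
N, T = 4, 0.0
score = -1
while score <= T and N >= 1:
    score = NGram.compare(iString, jString, N=N)
    N = N - 1
print(score)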
Example 19
def distance_ngrames(source_string, comparaison_string):
    """
    This method returns the degree of similarity between two strings using the
    n-gram method, with N=2.

    - The bigram method builds every pair of sequential characters in each
    string, then compares those pairs to determine how well the strings match.
    :param source_string: the string we want to compare
    :param comparaison_string: another string
    :return: the matching percentage
    """
    return NGram.compare(source_string.lower(), comparaison_string.lower(), N=2)
Example 20
def get_string_similarity(standard, target):  # input json path string 1, 2
    '''
    Given two strings, compare them and return how similar they are as scores.
    :param standard: reference string for the comparison
    :param target: target string for the comparison
    :return: result : dictionary
            {'1-Gram': result, '2-Gram': result,  .... '5-Gram': result}
    '''
    result = dict()
    # print(f'[+]Stand Binary(length {len(s)}) ::: target Binary(length {len(t)})')

    for i in range(1, 6):
        print(f"ngram base :: {standard}")
        print(f"ngram target :: {target}")
        result.update({str(i) + "-Gram": NGram.compare(standard, target, N=i)})
    return result
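The same 1- to 5-gram sweep as the helper above, inlined on made-up binary names:

from ngram import NGram

standard, target = 'kernel32.dll', 'kerne132.dll'   # made-up file names
result = {str(i) + '-Gram': NGram.compare(standard, target, N=i) for i in range(1, 6)}
print(result)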
Example 21
def guess_image(name):
    '''
    Guess which meme image they mean by finding the alias with greatest ngram
    similarity
    '''
    name = tokenize(name)
    best = '404'
    best_score = None
    for guess_image, names in IMAGES.iteritems():
        for guess in names:
            score = NGram.compare(guess, name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
    app.logger.info('Pick image %s for name "%s"' % (best, name))
    return best
Example 22
    def get(self,person_id):
        n=2
        occurs=[]
        grams_arr=[]
        sixgrams = ngrams(str_read.split(), n)
        for grams in sixgrams:
            #print str(grams)
            x=NGram.compare('{}'.format(person_id.decode('latin-1')),str(grams))
            occurs.append(x)
            grams_arr.append(str(grams))

        main_fields={'occurs':fields.String,"word":fields.String}
        datas={'occurs':"{}".format(max(occurs)*1000),'word':"{}".format(grams_arr[occurs.index(max(occurs))])}
        x=marshal(datas,main_fields)
        #json.dumps(marshal(datas,main_fields))
        return x
Example 23
def predict_char_ngram(N, paragraph, question, options, isSimilarity):
    paragraph = removePunctuation(paragraph)
    options = [removePunctuation(x) for x in options]
    # similarities = [NGram.compare(paragraph, s, N=N) for s in options]
    # similarities = [ngram_compare(paragraph, s, N=N) for s in options]
    similarities = [
        NGram.compare(paragraph, s, N=N, pad_len=0) for s in options
    ]

    if isSimilarity:
        prob = [x / sum(similarities) for x in similarities]
    else:
        similarities = [x + 0.00001
                        for x in similarities]  # to avoid division by zero
        prob = [(1 / x) / sum(1 / y for y in similarities)
                for x in similarities]
    return prob
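A hedged, self-contained run of the isSimilarity branch above; removePunctuation here is a minimal stand-in for the project's helper:

import string
from ngram import NGram

def removePunctuation(text):  # minimal stand-in for the project's helper
    return text.translate(str.maketrans('', '', string.punctuation))

paragraph = removePunctuation("The cat sat on the mat, while the dog slept.")
options = [removePunctuation(o) for o in ["cat on the mat!", "airplane engine", "quantum physics"]]

similarities = [NGram.compare(paragraph, s, N=2, pad_len=0) for s in options]
prob = [x / sum(similarities) for x in similarities]  # isSimilarity=True branch
print(prob)  # one probability per option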
Example 24
def guess_meme_image(meme_name):
    '''
    Guess which meme image they mean by finding the alias with greatest ngram
    similarity
    '''
    meme_name = tokenize(meme_name)
    best = ''
    best_score = None
    for guess_image, names in MEMES.items():
        for guess in names:
            guess = tokenize(guess)
            score = NGram.compare(guess, meme_name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
                app.logger.info('New best meme for "%s": "%s" (Score: %s)', meme_name, guess, score)
    app.logger.info('Picked meme "%s" for name "%s"' % (best, meme_name))
    return best
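The same best-alias search can be exercised without the Flask app and logger; MEMES and tokenize below are made-up stand-ins for the real alias table and normalizer:

from ngram import NGram

MEMES = {  # hypothetical alias table
    'distracted.jpg': ['distracted boyfriend', 'man looking back'],
    'doge.jpg': ['doge', 'shibe', 'such wow'],
}

def tokenize(name):  # stand-in normalizer
    return name.lower().strip()

def guess_meme_image(meme_name):
    meme_name = tokenize(meme_name)
    best, best_score = '', None
    for guess_image, names in MEMES.items():
        for guess in names:
            score = NGram.compare(tokenize(guess), meme_name)
            if best_score is None or score > best_score:
                best_score, best = score, guess_image
    return best

print(guess_meme_image('Distracted Boyfrend'))  # highest-similarity alias wins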
Example 25
    def get(self, param_word):
        status = False
        n = 2
        occurs = []
        grams_arr = []
        words = []
        for key in r_server.scan_iter():
            words.append(key)

        #sixgrams = ngrams(str_read.split(), n)
        for keys in words:
            #print str(grams)
            x = NGram.compare('{}'.format(param_word.decode('latin-1')),
                              str(keys))
            occurs.append(x)
            grams_arr.append(str(keys))

        for key in r_server.scan_iter():
            if key == param_word:
                status = True

        if status is True:
            main_fields_true = {
                "word": fields.String,
                "status": fields.Boolean
            }
            datas_true = {'word': "{}".format(param_word), 'status': status}
            x_true = marshal(datas_true, main_fields_true)
            return x_true
        else:
            main_fields_false = {
                'occurs': fields.String,
                "word": fields.String,
                "freq": fields.String,
                "status": fields.Boolean
            }
            datas_false = {
                'occurs': "{}".format(max(occurs) * 1000),
                'word': "{}".format(grams_arr[occurs.index(max(occurs))]),
                'freq': r_server.get(param_word),
                'status': status
            }
            x_false = marshal(datas_false, main_fields_false)
            return x_false
Example 26
    def get_similarity(self, content):
        """
        Return the similarity index with another piece of content

        :returns: similarity between 0.0 and 1.0, 1.0 being identity
        """
        labels = {'title', 'body', 'tags'}
        this, that = ({
            label: render_block_to_string("content/similarity/content.djhtml",
                                          item, {'content': self})
            for label in labels
        } for item in [self, content])
        weights = text_to_dict(render_to_string(
            "content/similarity/weights.txt", {}),
                               evaluate=True)  # text of the form key:value\n
        return sum([
            NGram.compare(this[label], that[label]) * weights[label]
            for label in labels
        ]) / sum(weights.values())
Example 27
def guess_meme_image(meme_name):
    '''
    Guess which meme image they mean by finding the alias with greatest ngram
    similarity
    '''
    meme_name = tokenize(meme_name)
    best = ''
    best_score = None
    for guess_image, names in MEMES.items():
        for guess in names:
            guess = tokenize(guess)
            score = NGram.compare(guess, meme_name)
            if best_score is None or score > best_score:
                best_score = score
                best = guess_image
                app.logger.debug('New best meme for "%s": "%s" (Score: %s)',
                                 meme_name, guess, score)
    app.logger.info('Picked meme "%s" for name "%s" (Score: %s)', best,
                    meme_name, best_score)
    return best
Example 28
def process(hr,sr,he,se):

    categories_relevant = {}
    categories_extracted = {}

    category_idx_list = []
    for i,h in enumerate(hr):
        for j,h1 in enumerate(he):
            if NGram.compare(hr[i], he[j]) > 0.95:
                category_idx_list.append((i,j))

    if he:
        if len(he) != len(se):
            return 0 , 0
    for i,C in enumerate(category_idx_list):
        categories_relevant[i] = sr[C[0]]
        tmp = se[C[1]].replace('\r', '').replace('\n','')
        categories_extracted[i] = tmp

    e = Evaluator(categories_relevant, categories_extracted)
    p, r = e.evaluate_using_ngrams(3)

    return p, r
Example 29
def smart_read(url):
    resp = urllib2.urlopen(url)
    #resolve url
    url = resp.url
    domain = urlparse(url).netloc
    path = urlparse(url).path
    
    html = resp.read()
    tree = etree.parse(StringIO.StringIO(html), parser)
    links = tree.xpath("//body//@href")
    nmax = 0
    for link in links:
        if urlparse(link).netloc == domain:
            ng = NGram.compare(urlparse(link).path,path)
            #print link,ng
            if ng > nmax and ng < 1:
                nmax = ng
                mirror = link
    diffh = htmldiff(visit_page(url)["body"], visit_page(mirror)["body"])
    tree = etree.parse(StringIO.StringIO(diffh), parser)
    diff = tree.xpath("//ins//text()")
    for d in diff:
        print d
Example 30
def FixedText(textt):

    word = {}

    Value = [0.5]
    a = 0
    fix = ""
    maxvalue = Value[0]
    for key in r.scan_iter():
        x = NGram.compare(textt, key, N=1)
        if x >= 0.5:
            a = float(r.lindex(key, 0))
            b = 100 * x + float(a)

            if b > maxvalue:
                maxvalue = b
                fix = key

            word[fix] = b

        else:
            continue

    return word
Example 31

my_dict = get_filelist("dict.txt")
misspell = get_filelist("misspell.txt")
correct = get_filelist("correct.txt")

match = 0
total = 0
test_size = len(misspell)
ngrams_result = list(range(test_size))

for i in range(test_size):
    max_similarity = 0
    similarity_dict = {}
    for item in my_dict:
        similarity = NGram.compare(misspell[i], item)
        similarity_dict[item] = similarity
        if similarity > max_similarity:
            max_similarity = similarity
    if max_similarity > 0:
        ngrams_result[i] = get_keys(similarity_dict, max_similarity)
    else:
        ngrams_result[i] = []
    print('Word Number:', i + 1, '/', test_size)
    print('MISSPELL:', misspell[i], '\nN-GRAMS:', ngrams_result[i],
          '\nCORRECT:', correct[i], '\n--------------------------------')
    total += len(ngrams_result[i])
    if correct[i] in ngrams_result[i]:
        match += 1

precision = match / total
Example 32
from ngram import NGram

distance = NGram.compare("Ham", "Spam", N=2)
print(distance)
Example 33
def nearlySameText(text_1, text_2):
    return NGram.compare(text_1.strip(), text_2.strip()) >= 0.9
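A quick check of the 0.9 cutoff above on made-up strings (the first call is True because the strings are identical after strip):

from ngram import NGram

def nearlySameText(text_1, text_2):
    return NGram.compare(text_1.strip(), text_2.strip()) >= 0.9

print(nearlySameText('  The quick brown fox ', 'The quick brown fox'))  # True
print(nearlySameText('The quick brown fox', 'A completely different sentence'))  # False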
Example 34
def ng_pare(needle='default', fn='/usr/share/dict/words', pth=.50):
    with open(fn, 'r') as fh:
        ng_haystack = {line.lower()
                       for line in fh
                       if NGram.compare(needle, line.lower(), N=1) - pth >= 0.0}
    return ng_haystack
Example 35
    jellyfish.match_rating_comparison(unicode(string[0]), unicode(s))
    for s in string
]
print 'match rating comparison', sim_arry7
# tokens = word_tokenize([string])
# print(string_token)
# print tfidf_matrix

# print(y.toarray()
ngram_array = [word_grams(s.split(' ')) for s in string]
# print ngram_array
n = NGram()
# print list(n.split(string[0]))
ngram_array = [list(n.split(s)) for s in string]
# print ngram_array
sim_arry8 = [NGram.compare(string[0].lower(), s.lower(), N=4) for s in string]
print 'ngram', sim_arry8


def jaccard_distance(a, b):
    # print a, b
    inter_len = float(len(list(a.intersection(b))))
    union_len = float(len(list(a.union(b))))
    return inter_len / union_len


# print list(ngram_array[0].intersection(ngram_array[1]))
sim_arry9 = [
    jaccard_distance(NGram(ngram_array[0]), NGram(s)) for s in ngram_array
]
print 'jaccard', sim_arry9
Example 36
class KCM(object):
    """docstring for KCM"""
    '''args
    lang                help='language, english or chinese (eng/cht)', required=True)
    io_dir             help='input output directory, required=True)
    max_file_count      help='maximum number of input files, 0 for no limit, type=int, default=0)
    thread_count        help='number of thread used, type=int, default=1)
    '''
    def __init__(self,
                 lang,
                 io_dir,
                 max_file_count=0,
                 thread_count=1,
                 uri=None):
        self.BASE_DIR = BASE_DIR
        self.lang = lang
        self.io_dir = os.path.join(io_dir, self.lang)
        self.max_file_count = max_file_count
        self.thread_count = thread_count

        self.client = MongoClient(uri)
        self.db = self.client['nlp']
        self.Collect = self.db['kcm']

        # ngram search
        self.kcmNgram = NGram((i['key'] for i in self.Collect.find({}, {
            'key': 1,
            '_id': False
        })))
        logging.basicConfig(format='%(levelname)s : %(asctime)s : %(message)s',
                            filename='KCM_{}.log'.format(self.lang),
                            level=logging.INFO)
        logging.info('Begin gen_kcm.py')
        logging.info('input {self.max_file_count} files, '
                     'output to {self.io_dir}, '
                     'maximum file count {self.max_file_count}, '
                     'use {self.thread_count} threads'.format(**locals()))

    def get_source_file_list(self):
        """Generate list of term data source files

        Args:
            args: input arguments, use self.lang, self.max_file_count

        Returns:
            list of source files
        """

        file_list = []  # wiki files
        for (dir_path, dir_names, file_names) in os.walk(self.io_dir):
            for file_name in file_names:
                if self.max_file_count and len(
                        file_list) >= self.max_file_count:
                    break
                if file_name == '.DS_Store' or '.model' in file_name:  # for OS X
                    continue
                file_list.append(os.path.join(dir_path, file_name))
                logging.info('appended file {}'.format(
                    os.path.join(dir_path, file_name)))

        if not file_list:
            logging.info('no file selected, end of script')
            exit()

        return file_list

    @timing
    def remove_symbols_tags(self, if_name):
        """Remove symbols and tags. Read input file, output to output file.

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang

        Returns:
            output file name
        """
        return rm_tags(if_name)

    @timing
    def paragraphs_to_sentences(self, inputData):
        """Generate sentences from paragraphs. Read input file, output to output file

        Args:
            inputData: input data from former process.
            args: input arguments, use self.io_dir, self.lang

        Returns:
            output file name
        """
        return paragraphs_to_sentences_cht(inputData)

    @timing
    def sentences_to_terms(self, if_name, inputData):
        """generate terms from sentences

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang

        Returns:
            output file name
        """
        prefix = if_name.split('/')[-1].split('_')[0]
        of_name = '{self.io_dir}/{prefix}_terms_{self.lang}'.format(**locals())
        PosTokenizer(self.BASE_DIR, inputData, of_name, 'r')

        return of_name

    @removeInputFile
    @timing
    def terms_to_term_pairs(self, if_name):
        """Generate term pairs from terms.

        Args:
            if_name: input file name
            args: input arguments, use self.io_dir, self.lang

        Returns:
            output file name
        """
        of_name = '{self.io_dir}/{self.lang}.model'.format(**locals())
        script_file = 'build/terms_to_term_pair_freq.py'

        terms_to_term_pair_freq(if_name, of_name, min_freq=1, max_term_len=20)

    @timing
    def join_terms_files(self, if_names):
        """Join terms files into one

        Args:
            if_names: input terms files names
            args: input arguments
        """
        of_name = '{self.io_dir}/terms_{self.lang}'.format(**locals())
        with open(of_name, 'w') as output_file:
            for if_name in if_names:
                with open(if_name, 'r') as input_file:
                    for line in input_file:
                        output_file.write(line)

        return of_name

    def gen_terms_file(self, if_name, o_list):
        """Generate terms file

        Args:
            if_name: input wiki source file name
            args: input arguments
            o_list: output list saving generated file name
        """
        result = self.remove_symbols_tags(if_name)
        result = self.paragraphs_to_sentences(result)
        of_name = self.sentences_to_terms(if_name, result)
        o_list.append(of_name)

    def thread_job(self, input_file_queue, o_term_files):
        """Job to be done by thread (generate terms file)

        Args:
            input_file_queue: queue containing input files that needs processing
            args: input arguments
            o_term_files: list for outputting generated term file names
        """
        while True:
            if_name = input_file_queue.get()
            if if_name is None:
                break  # end of thread
            self.gen_terms_file(if_name, o_list=o_term_files)
            input_file_queue.task_done()

    @timing
    def main(self):
        """main function"""
        if_list = self.get_source_file_list()

        term_files = []
        input_file_queue = queue.Queue()
        threads = []
        for i in range(self.thread_count):
            t = Thread(target=self.thread_job,
                       args=(input_file_queue, term_files))
            t.start()
            threads.append(t)

        for if_name in if_list:
            input_file_queue.put(if_name)

        # block till all tasks are done (here means all input file processed)
        input_file_queue.join()

        # stop all threads
        for i in range(self.thread_count):
            input_file_queue.put(None)
            # in thread_job, when input_file_queue.get == None, thread will end
        for t in threads:
            t.join()  # wait till all threads really end

        of_name = self.join_terms_files(term_files)
        self.terms_to_term_pairs(of_name)
        self.import2DB()

    def setLang(self, lang):
        self.lang = lang
        self.io_dir = os.path.join(io_dir, self.lang)

    def removeDB(self):
        self.Collect.remove({})

    def import2DB(self):
        result = dict()
        with open(os.path.join(self.io_dir, "{0}.model".format(self.lang)),
                  'r',
                  encoding='utf8') as f:
            for i in f:
                tmp = i.split()
                result.setdefault(tmp[0], []).append([tmp[1], int(tmp[2])])
                result.setdefault(tmp[1], []).append([tmp[0], int(tmp[2])])

        documentArr = [{
            'key': pair[0],
            'value': sorted(pair[1], key=lambda x: -x[1])
        } for pair in result.items()]
        del result

        self.Collect.insert(documentArr)
        self.Collect.create_index([("key", pymongo.HASHED)])

    def get(self, keyword, amount):
        result = self.Collect.find({'key': keyword}, {'_id': False}).limit(1)
        if result.count():
            return {**(result[0]), 'similarity': 1}
        else:
            ngramKeyword = self.kcmNgram.find(keyword)
            if ngramKeyword:
                result = self.Collect.find({
                    'key': ngramKeyword
                }, {
                    '_id': False
                }).limit(1)
                return {
                    'key': ngramKeyword,
                    'value': result[0]['value'][:amount],
                    'similarity': self.kcmNgram.compare(keyword, ngramKeyword)
                }
            return {'key': ngramKeyword, 'value': [], 'similarity': 0}
Example 37
                string += str(index)
        return string


if __name__ == '__main__':
    n = NGramCompare()
    solos, generatedSolo = n.importFiles(
        'Recreated 2, 100 epochs, learning rate = 0.0055.mid')
    trans = []
    for solo in solos:
        trans.append(n.translateToString(solo))
    s1 = n.translateToString(generatedSolo)
    ngramScores = []
    editdistances = []
    for solo in trans:
        ngramScores.append(NGram.compare(s1, solo))
        editdistances.append(editdistance.eval(s1, solo))
    avgngram = 0
    avgedit = 0
    for g in ngramScores:
        avgngram += g
    avgngram = avgngram / len(ngramScores)
    for dist in editdistances:
        avgedit += dist
    avgedit = avgedit / len(editdistances)
    ngramScores.sort(reverse=True)
    editdistances.sort()
    print('Average n-gram score: ' + str(avgngram))
    print('Average edit-distance score: ' + str(avgedit))
    print('Best n-gram score: ' + str(ngramScores[0]))
    print('Best edit distance score: ' + str(editdistances[0]))
Example 38
def score_oro_string(iline, jline, N):
	scores = [NGram.compare(x, y, N=N) for x,y in zip(iline.split(" :::: ")[:3], jline.split(" :::: ")[:3])]
	scores += [NGram.compare(" ".join(iline.split(" :::: ")[:3]), " ".join(jline.split(" :::: ")[:3]), N=N) * 10]
	return sum([log(x) if x > 0 else -10 for x in scores])
Example 39
query = dup_label

# Set up
pp = pprint.PrettyPrinter(indent=2)
sparql = SPARQLWrapper("http://husky-big.cs.uwaterloo.ca:8890/sparql")

# Send query
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
entries = results["results"]["bindings"]
"""
for result in entries:
    pp.pprint((result['s']['value'], result['o']["value"]))
    """
# Approximately 20k entries
print(len(entries), " entries retrieved")

for (indexA, entryA) in enumerate(entries):
    for (indexB, entryB) in enumerate(entries):
        if (indexA <= indexB):
            distance = NGram.compare(entryA['o']['value'], entryB['o']['value'])
            if (distance > 0.8 and entryA['s']['value'] != entryB['s']['value']):
                pp.pprint((distance, entryA['s']['value'], entryA['o']['value'], 
                    entryB['s']['value'],entryB['o']['value']))

Example 40
from nltk import ngrams
from ngram import NGram
sentence = 'this is a foo bar sentences and i want to ngramize it'
n = 2
sixgrams = ngrams(sentence.split(), n)
for grams in sixgrams:
    print grams

print NGram.compare('spa', 'spam')
Example 41
            # Open the corresponding edited subtitle file
            subs = pysrt.open(osp.join('edited',
                                       'edited_' + basename + '.srt'))

            num_speakers_matched = 0  # Number of speakers identified
            match_threshold = 0.15  # NGram matching confidence required to declare a match
            dialogue_idx = 0  # Index of dialogue list
            sub_idx = 0  # Index of subtitle list
            num_fails = 0  # Number of dialogue pairs in which no match was found for current subtitle
            return_to_dialogue_idx = 0  # Dialogue index to return to for next subtitle if no match is ever found for current subtitle
            while dialogue_idx < len(dialogue) and sub_idx < len(subs):
                next_dialogue_idx = min(dialogue_idx + 1, len(dialogue) - 1)
                curr_line = dialogue[dialogue_idx]
                next_line = dialogue[next_dialogue_idx]
                curr_sub = subs[sub_idx].text
                curr_score = NGram.compare(curr_line, curr_sub)
                next_score = NGram.compare(next_line, curr_sub)
                #flog.write('SUBTITLE: ' + curr_sub + '\n')
                #flog.write('DIALOGUE_PAIR: ({}, {}), ({}, {})\n'.format(curr_line, curr_score, next_line, next_score))
                # If we're somewhat confident that a match is found, handle it
                if curr_score > match_threshold or next_score > match_threshold:
                    # If the subtitle matches to the current line of dialogue
                    # better than it matches to the next one, assign it the speaker of this line
                    # and increment the subtitle index to the next one
                    if curr_score >= next_score:
                        subs[sub_idx].text = speakers[
                            dialogue_idx] + ': ' + curr_sub
                        #flog.write('RESULT: ' + subs[sub_idx].text + '\n')
                        sub_idx += 1
                        num_fails = 0
                    # If the better match is to the next line of dialogue, then
Example 42
from ngram import NGram

a = NGram.compare('sin', 'sing')
print(a)
Example 43
def sim(a, b):
    return 1 - NGram.compare(a.title, b.title, warp=WARP, iconv=enrich)
Example 44
def is_album_name_the_same(s1, s2):
    return NGram.compare(s1.lower(), s2.lower()) > 0.8
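Usage of the album-name check above on made-up titles; the first pair is identical after lowercasing and therefore passes the 0.8 threshold:

from ngram import NGram

def is_album_name_the_same(s1, s2):
    return NGram.compare(s1.lower(), s2.lower()) > 0.8

print(is_album_name_the_same('Abbey Road', 'abbey road'))               # True
print(is_album_name_the_same('Abbey Road', 'Abbey Road (Remastered)'))  # likely below the threshold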
Example 45
class PMI(object):
    """docstring for PMI"""
    def __init__(self, lang, uri=None, ngram=False):
        self.client = pymongo.MongoClient(uri)
        self.uri = uri
        self.lang = lang
        self.db = self.client['nlp_{}'.format(self.lang)]
        self.fs = gridfs.GridFS(self.db)

        self.Collect = self.db['pmi']
        self.cpus = math.ceil(mp.cpu_count() * 0.2)
        self.frequency = {}

        if ngram:
            # use ngram for searching
            self.pmiNgram = NGram((i['key']
                                   for i in self.db['pmi'].find({}, {
                                       'key': 1,
                                       '_id': False
                                   })))

    def getWordFreqItems(self):
        # use cache
        if os.path.exists('frequency.pkl'):
            self.frequency = pickle.load(open('frequency.pkl', 'rb'))
            frequency_of_total_keyword = pickle.load(
                open('frequency_of_total_keyword.pkl', 'rb'))
            return frequency_of_total_keyword

        # return all frequency of word in type of dict.
        self.frequency = {}
        frequency_of_total_keyword = 0

        # iterate through gridFS
        for keyword in self.fs.list():
            cursor = self.fs.find({"filename": keyword})[0]
            value = {
                'PartOfSpeech': cursor.contentType,
                'value':
                json.loads(self.fs.get(cursor._id).read().decode('utf-8'))
            }
            for correlation_keyword, PartOfSpeech, corTermCount in value[
                    'value']:
                frequency_of_total_keyword += corTermCount
                # accumulate keyword's frequency.
                self.frequency[keyword] = self.frequency.setdefault(
                    keyword, 0) + corTermCount

        # iterate through all normal collection
        for i in self.db['kcm'].find({}):
            keyword = i['key']
            for correlation_keyword, PartOfSpeech, corTermCount in i['value']:
                frequency_of_total_keyword += corTermCount
                # accumulate keyword's frequency.
                self.frequency[keyword] = self.frequency.setdefault(
                    keyword, 0) + corTermCount

        pickle.dump(self.frequency, open('frequency.pkl', 'wb'))
        pickle.dump(frequency_of_total_keyword,
                    open('frequency_of_total_keyword.pkl', 'wb'))
        return frequency_of_total_keyword

    def build(self):
        self.Collect.remove({})
        # read all frequency from KCM and build all PMI of KCM in MongoDB.
        # with format {key:'中興大學', freq:100, value:[(keyword, PMI-value), (keyword, PMI-value)...]}
        frequency_of_total_keyword = self.getWordFreqItems()
        print('frequency of total keyword:' + str(frequency_of_total_keyword))

        @graceful_auto_reconnect
        def process_job(job_list):
            # Each process need independent Mongo Client
            # or it may raise Deadlock in Mongo.
            client = pymongo.MongoClient(self.uri)
            db = client['nlp_{}'.format(self.lang)]
            process_collect = db['pmi']
            kcm_collect = db['kcm']
            fs = gridfs.GridFS(db)

            result = []
            for keyword, keyword_freq in job_list:
                pmiResult = []

                collection_cursor = kcm_collect.find({
                    'key': keyword
                }, {
                    'value': 1,
                    '_id': False
                }).limit(1)
                if collection_cursor.count() == 0:
                    gridfs_cursor = fs.find({"filename": keyword}).limit(1)[0]
                    cursor_result = json.loads(
                        fs.get(gridfs_cursor._id).read().decode('utf-8'))[:500]
                else:
                    cursor_result = collection_cursor[0]['value']
                for kcmKeyword, PartOfSpeech, kcmCount in cursor_result:
                    # algorithm:
                    # PMI = log2(p(x, y)/p(x)*p(y))
                    # p(x, y) = frequency of (x, y) / frequency of total keyword.
                    # p(x) = frequency of x / frequency of total keyword.
                    value = math.log2(
                        kcmCount * frequency_of_total_keyword /
                        (keyword_freq * self.frequency[kcmKeyword]))

                    # this equation is contributed by 陳聖軒.
                    # contact him with facebook: https://www.facebook.com/henrymayday
                    value *= math.log2(self.frequency[kcmKeyword])

                    pmiResult.append((kcmKeyword, value))

                pmiResult = sorted(pmiResult, key=lambda x: -x[1])
                result.append({
                    'key': keyword,
                    'freq': keyword_freq,
                    'value': pmiResult
                })

                # Insert Collections into MongoDB
                if len(result) > 5000:
                    process_collect.insert(result)
                    result = []

        amount = math.ceil(len(self.frequency) / self.cpus)
        job_list = list(self.frequency.items())
        job_list = [
            job_list[i:i + amount]
            for i in range(0, len(self.frequency), amount)
        ]
        processes = [
            mp.Process(target=process_job, kwargs={'job_list': job_list[i]})
            for i in range(self.cpus)
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()
        self.Collect.create_index([("key", pymongo.HASHED)])

    def get(self, keyword, amount):
        cursor = self.Collect.find({
            'key': keyword
        }, {
            'value': 1,
            '_id': False
        }).limit(1)
        if cursor.count() != 0:
            return {
                'key': keyword,
                'value': cursor[0]['value'][:amount],
                'similarity': 1
            }
        else:
            pmiNgramKeyword = self.pmiNgram.find(keyword)
            if pmiNgramKeyword:
                result = self.Collect.find({
                    'key': pmiNgramKeyword
                }, {
                    'value': 1,
                    '_id': False
                }).limit(1)[0]['value'][:amount]
                return {
                    'key': pmiNgramKeyword,
                    'value': result,
                    'similarity':
                    self.pmiNgram.compare(pmiNgramKeyword, keyword)
                }
        return {}
Example 46
def simple_score_strings(iline, jline, N):
	iString = " ".join(iline.split(" :::: ")[:3])
	jString = " ".join(jline.split(" :::: ")[:3])

	return NGram.compare(iString, jString, N=N)
Example 47
def ngram_compare(source_wl, target_wl):
    _w1, _w2 = " ".join(source_wl).lower(), " ".join(target_wl).lower()
    return NGram.compare(_w1, _w2)
Example 48
def ngram_test(w1, w2, n):
    return NGram.compare(w1, w2, N=n)
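Finally, a small sweep over N with the thin wrapper above, printing how the score changes with the n-gram size on made-up words:

from ngram import NGram

def ngram_test(w1, w2, n):
    return NGram.compare(w1, w2, N=n)

for n in range(1, 5):
    print(n, ngram_test('similarity', 'similarly', n))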