def ngram_similarity(data, col1, col2):
    cos = []
    for i in range(len(data.id)):
        st = data[col1][i]
        title = data[col2][i]
        # Build an n-gram index over the title tokens and probe it with each
        # search-term token (the search results themselves are not kept).
        n = NGram(title.split(), key=lambda x: x[1])
        for s in st.split():
            n.search(s)
        # The score that is actually returned is the TF-IDF cosine similarity
        # of the two strings.
        tfidf = sktf.TfidfVectorizer().fit_transform([st, title])
        c = ((tfidf * tfidf.T).A)[0, 1]
        cos.append(c)
    return cos
def build_multiclusters(inlines, threshold=0.05, N=4):
    clusters = []
    ignoreus = []
    for i, iline in enumerate(inlines):
        if i in ignoreus:
            continue
        iString = " ".join(iline.split(" :::: ")[:3])
        ignoreus.append(i)
        icluster = {}
        icluster[iline] = -1
        iModel = NGram(iString)
        for j in range(i, len(inlines)):
            if j in ignoreus:
                continue
            jline = inlines[j]
            jString = " ".join(jline.split(" :::: ")[:3])
            results = iModel.search(jString)
            score = sum([y for x, y in results]) / len(results) \
                if len(results) > 0 else 0.0
            print(score)
            if score > threshold:
                icluster[jline] = score
                iModel.add(jString)
                ignoreus.append(j)
        clusters.append(icluster)
    return clusters
def map(self, phrase):
    for term in phrase:
        if len(term) > 4:
            continue
        for word in self.corpus:
            z = Set(term) & Set(word)
            matches = []
            if len(z) > 0 and len(z) < len(term):
                #
                # g = NGram(z - Set(term))
                # matches = g.search(term)
                pass
            else:
                #
                # At this point we assume context is not informative.
                # In the event of context not being informative, we resort to fuzzy lookup.
                #
                g = NGram(word)
                # matches = g.search(term)
                g.remove(term)
                matches = g.search(term)
            key = None
            value = None
            if len(matches) > 0:
                matches = list(matches[0])
                Pz_ = len(matches) / self.size
                Px_ = fuzz.ratio(term, matches[0]) / 100
                if Px_ > 0.5 and len(term) < len(matches[0]) and len(matches[0]) >= 4:
                    key = term
                    value = {}
                    value = [matches[0], Pz_, Px_, 1]
                    self.emit(key, value)
def simtitle( request ):
    """calculate similarity based on title and naive threshold"""
    n = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter( lambda a: a[1] >= 0.4, n.search( article.title ) )
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare( article.title, nearest.title ) < 0.7:
                results.append( article )
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append( article )
        n.add( article )
    return render( request, "dump.html", dictionary = {
        "article_list": results,
    } )
def main(left_path, left_column, right_path, right_column, outfile,
         titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row:
            continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
def sonucbul():
    kelimeler = list()
    v = NGram(ngramdatawords)
    sonucthreshold = list()
    sonuckelime = list()
    kelimedizisi = np.zeros((1, len(ngramdatawords)), dtype='int8')
    yorum = e1.get()
    ###############
    # Strip punctuation and digits from the review (yorum) and lowercase it.
    cevirici = str.maketrans('', '', punctuation)
    yorum = yorum.translate(cevirici)
    cevirici = str.maketrans('', '', digits)
    yorum = yorum.translate(cevirici)
    yorum = yorum.lower()
    kelimeler.clear()
    kelimeler = yorum.split()
    for j in range(0, len(kelimeler), 1):
        sonucthreshold.clear()
        sonuckelime.clear()
        # Collect every vocabulary word matching this token above the threshold,
        # then increment the bag-of-words slot of the best-scoring match.
        for ngrami in v.search(kelimeler[j], threshold=0.4):
            sonuckelime.append(str(ngrami[0]))
            sonucthreshold.append(int(ngrami[1]))
        if (len(sonuckelime) != 0):
            kelimedizisi[0][ngramdatawords.index(
                sonuckelime[sonucthreshold.index(max(sonucthreshold))])] += 1
    tmpdf = pd.DataFrame(kelimedizisi)
    sonuc = ngrammodel.predict(tmpdf)
    cevirici = str.maketrans('', '', punctuation)
    cevap = str(sonuc).translate(cevirici)
    # "Yorum" = review, "Yorum Sonucu" = predicted result, "Puan" = score (1-5).
    print("Yorum= " + yorum + " Yorum Sonucu= " + str(sonuc))
    e1.delete(0, END)
    Label(master, text="Puan(1-5) =" + str(cevap)).grid(row=2)
def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.7, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare(article.title, nearest.title) < 0.7:
                results.append(article)
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={
        "article_list": results,
    })
def main(left_path, left_column, right_path, right_column, outfile,
         titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'))
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row:
            continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
def test_ngram_search(self):
    """Tests from the original ngram.py, to check that the
    rewrite still uses the same underlying algorithm"""
    # Basic searching of the index
    idx = NGram(self.items)
    self.assertEqual(idx.search('askfjwehiuasdfji'),
                     [('askfjwehiuasdfji', 1.0),
                      ('asdfawe', 0.17391304347826086),
                      ('asfwef', 0.083333333333333329),
                      ('adfwe', 0.041666666666666664)])
    self.assertEqual(idx.search('afadfwe')[:2],
                     [('adfwe', 0.59999999999999998),
                      ('asdfawe', 0.20000000000000001)])
    # Pairwise comparison of strings
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
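The test above pins down the behaviour most of these snippets rely on. As a minimal standalone sketch (an addition for illustration, not part of the test suite), assuming the `ngram` package from PyPI: `search()` returns `(item, similarity)` pairs sorted by decreasing similarity, and `NGram.compare()` scores a single pair of strings.

from ngram import NGram  # assumes the PyPI "ngram" package

index = NGram(['apple', 'apply', 'banana'])
# Indexed items that share n-grams with the query, best match first.
print(index.search('appel'))
# An optional threshold drops low-scoring matches.
print(index.search('appel', threshold=0.3))
# Pairwise similarity of two strings, in the range [0, 1].
print(NGram.compare('apple', 'appel'))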
def test_set_operations(self):
    """Test advanced set operations"""
    items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
    items2 = set(["cdefg", "lmnop"])
    idx1 = NGram(items1)
    idx2 = NGram(items2)
    results = lambda L: sorted(x[0] for x in L)
    # Item removal
    self.assertEqual(results(idx1.search('cde')), ["abcde", "cdefg"])
    idx1.remove('abcde')
    self.assertEqual(results(idx1.search('cde')), ["cdefg"])
    # Set intersection operation
    items1.remove('abcde')
    idx1.intersection_update(idx2)
    self.assertEqual(idx1, items1.intersection(items2))
    self.assertEqual(results(idx1.search('lmn')), [])
    self.assertEqual(results(idx1.search('ijk')), [])
    self.assertEqual(results(idx1.search('def')), ['cdefg'])
def main(left_path, left_column, right_path, right_column, outfile,
         titles, join, minscore, count, warp):
    """Perform the similarity join

    >>> open('left.csv', 'w').write('''ID,NAME
    ... 1,Joe
    ... 2,Kin
    ... 3,ZAS''')

    >>> open('right.csv', 'w').write('''ID,NAME
    ... ID,NAME
    ... A,Joe
    ... B,Jon
    ... C,Job
    ... D,Kim''')

    >>> main(left_path='left.csv', left_column=1,
    ...      right_path='right.csv', right_column=1, outfile='out.csv',
    ...      titles=True, join='outer', minscore=0.24, count=5, warp=1.0)

    >>> print open('out.csv').read()  #doctest: +NORMALIZE_WHITESPACE
    ID,NAME,Rank,Similarity,ID,NAME
    1,Joe,1,1.0,A,Joe
    1,Joe,2,0.25,B,Jon
    1,Joe,3,0.25,C,Job
    2,Kin,1,0.25,D,Kim
    3,ZAS
    <BLANKLINE>
    """
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = right_file.next()
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'))
    if titles:
        left_header = left_file.next()
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row:
            continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
def get_ngram_similarity(gold, candidates, N=3, strip_space=True):
    def _strip_space(s):
        if not strip_space:
            return s
        return "\n".join([part.strip(" ") for part in s.split("\n")])

    ng = NGram([_strip_space(gold)], N=N)
    sims = []
    for c in candidates:
        ng_out = ng.search(_strip_space(c))
        if len(ng_out) == 0:
            sims.append(0.0)
        else:
            sims.append(ng_out[0][1])
    return sims
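A hypothetical call for the helper above (the strings here are made up for illustration): each candidate is scored against the single gold string, and a candidate with no n-gram overlap comes back as 0.0.

from ngram import NGram  # assumed to be imported wherever get_ngram_similarity lives

gold = "return a + b"
candidates = ["return a + b", "return a - b", "pass"]
# The identical candidate scores 1.0; the unrelated one should score 0.0.
print(get_ngram_similarity(gold, candidates, N=3))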
def wordsoccurrences(self, words_list, option='ortony'):
    frequencies = FreqDist(words_list)
    ordered_unigrams = frequencies.most_common()
    if option == 'ortony':
        lexicon = self.ortony_list
    else:
        lexicon = self.profane_words
    count = 0
    for t_word, count_w in ordered_unigrams:
        lower_word = t_word.lower()
        three_grams = NGram(lexicon)
        likely_words = three_grams.search(lower_word)
        if len(likely_words) > 0:
            # if lower_word in lexicon:
            count += 1 * count_w
        if lower_word in lexicon:
            count += 1
    return count
def verify(self, text_compare):
    results = []
    texto = []
    '''
    file2 = open(text_compare,"r")
    for linea2 in file2.readlines():
        texto+=linea2.split(" ")
    tng=NGram(texto)
    file2.close()
    '''
    file2 = open(text_compare, "r")
    linea2 = file2.readline()
    while linea2 != '':
        texto += linea2.split(" ")
        linea2 = file2.readline()
    tng = NGram(texto)
    file2.close()
    # For each known topic, count how many of its words have a close
    # (similarity > 0.3) match somewhere in the compared text.
    for ngs in self.ng:
        count = 0
        for word in list(ngs):
            for porc in tng.search(word):
                if porc[1] > 0.3:
                    count += 1
        results += [count]
    print(list(results))
    # Pick the topic with the highest count.
    pos = 0
    count = 0
    i = 0
    for res in results:
        if count < res:
            count = res
            pos = i
        i += 1
    if results[pos] > 2:
        # "Most likely topic of the text: ..."
        print("Tema mas preciso del texto: " + repr(self.topic[pos]))
    else:
        # "Could not determine what the text is about"
        print("No se ha podido precisar de que trata")
    print("")
def _location_choices(self, search):
    ngram_index = NGram(key=self._location_to_name)
    ngram_index.update(Ward.objects.all())
    ngram_index.update(District.objects.all())
    locations = ngram_index.search(search)[:self.num_choices]
    return [self._location_to_choice(l) for l, _score in locations]
def test_scifi_genre(self):
    index = NGram(items=['Sci-Fi'], key=lambda x: x.lower())
    self.assertGreater(index.search('science fiction')[0][1], 0)
    self.assertEqual(index.search('sci-fi')[0][1], 1)
""" address_longlat = [] for address in location: g = geocoder.google(address) list_longlat = g.latlnga list_longlat.insert(0,address) address_longlat.append(list_longlat) print address_longlat """ get long lat from data POI using Ngram """ with open("D:/tasya/python/code/Geo-Tag/corpus/sample-poi1.csv") as file: reader = csv.reader(file) #reader.next() corpus = [] for row in reader: corpus.append(row[0]) corpus_name = [] for word in corpus: corpus_name.append(word.split(';')[0]) address = [] G = NGram(corpus_name) G_latlng = NGram(corpus) for word in location: out = G.search(word) out1 = G_latlng.append(out[0][0]) address.append(out1[0][0])
def run(self):
    N = len(self.context)
    imatches = []
    found = {}
    Y = range(0, len(self.bag))
    for i in range(0, N):
        Xo_ = list(self.bag[i])  # skip_gram
        #Y = (Set(range(0,N)) - (Set([i]) | Set(imatches)))
        for ii in Y:
            if self.bag[i] == self.bag[ii]:
                imatches.append(ii)
                continue
            #
            # We are sure we are not comparing the identical phrase.
            # NOTE: Repetition doesn't yield learning, rather context does.
            # Let's determine if there are common terms.
            #
            Z = Set(self.bag[i]) & Set(self.bag[ii])
            if len(Z) > 0 and len(Xo_) > 0:
                Xo_ = Set(Xo_) - Z  # - list(Set(bag[i]) - Set(bag[ii]))
                Yo_ = Set(self.bag[ii]) - Z  # list(Set(bag[ii]) - Set(bag[i]))
                size = len(Xo_)
                g = NGram(Yo_)
                for term in Xo_:
                    xo = g.search(term)
                    if len(xo) > 0 and len(term) < 4:
                        xo = xo[0]
                    else:
                        continue
                    xo = list(xo)
                    xo_i = self.bag[i].index(term)
                    yo_i = self.bag[ii].index(xo[0])
                    #
                    # We have the pair, and we will compute the distance
                    #
                    ratio = fuzz.ratio(term, xo[0]) / 100
                    is_subset = len(Set(term) & Set(xo[0])) == len(term)
                    if is_subset and len(term) < len(xo[0]) and ratio > 0.5 and xo_i == yo_i:
                        xo[1] = [ratio, xo_i]
                        if term not in self.info:
                            #xo[1] = ratio
                            self.info[term] = [term, xo[0]] + xo[1]
                        elif term in self.info and ratio > self.info[term][1]:
                            self.info[term] = [term, xo[0]] + xo[1]
                        imatches.append(ii)
                        break
    #
    # At this point we consolidate all that has been learnt
    # and make it available to the outside world, otherwise the client should retrieve it.
    #
    self.lock.acquire()
    if self.queue is not None:
        for term in self.info:
            value = ['thread # ', self.name] + list(self.info[term])
            self.queue.put(value)
    self.lock.release()
def handle(self, *args, **options):
    if "simonly" in args:
        new_count = 100000
    else:
        new_count = 0
        for source in Source.objects.filter(scraper='feedparser',
                                            status__in=('silent', 'live')):
            l = feedparser.parse(source.scraper_config)
            ok = True
            if l["bozo"] == 1:
                if not isinstance(l["bozo_exception"],
                                  feedparser.ThingsNobodyCaresAboutButMe):
                    ok = False
            if ok:
                for article in l["entries"]:
                    #print "Reading feed entry %s: '%s'" % ( article[ "id" ], article[ "title" ] )
                    a, created = Article.objects.get_or_create(
                        source=source,
                        # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                        source_reference=article["id"],
                        defaults={
                            'date_created': datetime.now(),
                            'source_url': article["link"],
                            'title': self.normalise(article["title"]),
                            'num_comments': article.get("slash_comments", 0),
                            'summary': article["summary"],
                            'author': article.get("author", ""),
                            'date_published': datetime(*(article["updated_parsed"][:6])),
                            'status': "live"
                        })
                    if created:
                        #print "Creating new article."
                        pass
                    else:
                        #print "Updating article."
                        pass
                    new_count += 1
                    if article.has_key("content"):
                        # TODO test for multiple content blocks and pick most appropriate
                        a.body = article["content"][0]["value"]
                    a.tags.clear()
                    for tag in article.get("tags", ()):
                        a.tags.add(tag["term"])
                    a.save()
            else:
                logging.error("Could not read feed for file '%s': %s" %
                              (source.scraper_config, l["bozo_exception"]))
                logging.error("Skipping '%s': %s" %
                              (source.scraper_config, l["bozo_exception"]))
                break

    # calculate similarities
    # create a similarity corpus of last 200 docs
    def enrich(obj):
        s = unicode(obj)
        # simple stop words
        s = re.sub(r"\b(the|of|in|a)\b", "", s, re.IGNORECASE)
        # type prefixes
        s = re.sub(r"^(trailer|review|report|screenshots|video):\s*", "", s, re.IGNORECASE)
        return s

    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:(new_count * 4)]
    for article in articles:
        if "simonly" in args:
            article.is_duplicate = False
            article.duplicate_of = None
            article.save()
            continue
        #articles = Article.objects.filter( status = "live", is_duplicate = False ).order_by( "-date_published" )[:new_count]
        #for article in articles:
        #print( u"similarity for %s" % ( article.title, ) )
        sim = filter(lambda a: a[1] > 0.4, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.source == article.source:
                continue
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                # do it again!
                if nearest.source == article.source:
                    continue
            article.is_duplicate = True
            article.duplicate_of = nearest
            #print u" is duplicate of %s" % ( nearest.title, )
            article.save()
            break
        n.add(article)
def get_similars(data, target, threshold):
    G = NGram(target)
    return G.search(data, threshold=threshold)[0][0]
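A hypothetical call clarifying the argument order of get_similars above: target is the collection to index, data is the query string, and only the single best match is returned (nothing above the threshold would make the [0][0] lookup raise IndexError).

from ngram import NGram  # assumed import used by get_similars

cities = ['new york', 'newark', 'boston']
# Most likely returns 'new york'; raises IndexError if no item clears 0.2.
print(get_similars('new yrok', cities, 0.2))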
get long lat from geocoder
"""
address_longlat = []
for address in location:
    g = geocoder.google(address)
    list_longlat = g.latlnga
    list_longlat.insert(0, address)
    address_longlat.append(list_longlat)
print(address_longlat)

"""
get long lat from data POI using Ngram
"""
with open("D:/tasya/python/code/Geo-Tag/corpus/sample-poi1.csv") as file:
    reader = csv.reader(file)
    #reader.next()
    corpus = []
    for row in reader:
        corpus.append(row[0])

corpus_name = []
for word in corpus:
    corpus_name.append(word.split(';')[0])

address = []
G = NGram(corpus_name)    # index of POI names only
G_latlng = NGram(corpus)  # index of full "name;lat;lng" records
for word in location:
    out = G.search(word)
    # Look up the full POI record that corresponds to the best-matching name.
    out1 = G_latlng.search(out[0][0])
    address.append(out1[0][0])
    inst = CodigoAritm(alfabeto, probabilidades)
except SimbProbsError as e:
    print(e)
except ItemVacioError as e:
    print(e)
else:
    mensajes = tuple(muestreo)
    for mensaje in mensajes:
        caracteres = NGram(mensaje.split(' '))
        try:
            # Prints the message entropy, vowel count and word count.
            print('\nEntropía de \'{0}\': {1} \nTotal de vocales: {2} \t Total de palabras: {3}'
                  .format(mensaje, str(inst.entropiadelmensaje(mensaje)),
                          contarvocales(mensaje), len(word_tokenize(mensaje))))
            inst.precodmsj(mensaje + '~')
        except ExistSimbError as e:
            # "Ignoring message"
            print('{0} \t Ignorando mensaje'.format(e))
        else:
            for palabrota in lexico:
                minusculas = palabrota[0].lower()
                query = caracteres.search(minusculas)
                coincidencias = [match for match in query if match[1] > 0.29]
                if len(coincidencias) > 0:
                    # "Searching >> word: best match"
                    print('\tBuscando >> {0}: {1}'.format(minusculas, coincidencias[0]))
finally:
    # "Terminating program execution..."
    print('\nTerminando ejecución del programa...')
    #flog.write('RESULT: ' + subs[sub_idx].text + '\n')
    sub_idx += 1
    dialogue_idx += 1
    return_to_dialogue_idx = dialogue_idx
    num_fails = 0
    num_speakers_matched += 1

# If we're not very confident in a match, find the matching scores of the
# subtitle against each substring of the same length within the line.
else:
    #flog.write('--CHECKING SUBSTRINGS--\n')
    num_words = len(curr_sub.split(' '))

    # Evaluate current dialogue line's substrings
    curr_line_substrings = get_all_substrings(
        num_words, num_words, curr_line)
    curr_line_ngrams = NGram(curr_line_substrings)
    curr_searches = curr_line_ngrams.search(curr_sub)
    if curr_searches:
        curr_candidate_matches, curr_candidate_scores = zip(
            *curr_searches)
        curr_max_substring_score = max(curr_candidate_scores)
        curr_line_substring_line = curr_candidate_matches[
            np.argmax(curr_candidate_scores)]
    else:
        # If no matches are returned, give a max score of 0
        curr_max_substring_score = 0.0
        curr_line_substring_line = None

    # Evaluate next dialogue line's substrings
    next_line_substrings = get_all_substrings(
        num_words, num_words, next_line)
    next_line_ngrams = NGram(next_line_substrings)
    next_searches = next_line_ngrams.search(curr_sub)