def simtitle( request ):
    """calculate similarity based on title and naive threshold"""
    n = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter( lambda a: a[1] >= 0.4, n.search( article.title ) )
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare( article.title, nearest.title ) < 0.7:
                results.append( article )
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append( article )
        n.add( article )
    return render( request, "dump.html", dictionary = { "article_list": results, } )
def build_title_index(movies, tvshows):
    start = time.time()
    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = [entity['title'] for entity in entities]
    mapped_entities = {}
    for entity in entities:
        value = entity['title']
        if value not in mapped_entities:
            mapped_entities[value] = []
        mapped_entities[value].append(entity)
    logger.debug('Iterating title took {} ms'.format(
        int((time.time() - start) * 1000)))
    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building title index took {} ms'.format(
        int((time.time() - start) * 1000)))
    return index, mapped_entities
def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.7, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare(article.title, nearest.title) < 0.7:
                results.append(article)
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={
        "article_list": results,
    })
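# A small illustration (not part of the original views) of the two ngram-library
# calls the simtitle() views above rely on: search() returns (item, similarity)
# pairs for a query string, and NGram.compare() scores two strings directly.
# The sample titles are made up; the 0.4/0.7 cut-offs in the views are their
# own heuristics.
from ngram import NGram

index = NGram(['Review: Portal 2', 'Portal 2 screenshots leaked'])
print(index.search('Portal 2 review'))         # list of (item, similarity) pairs, best first
print(NGram.compare('Portal 2', 'Portal II'))  # similarity as a float in [0, 1]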
def build_cast_index(movies, tvshows, key):
    start = time.time()
    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = [[cast[key] for cast in entity['cast']] for entity in entities]
    values = list(set(itertools.chain.from_iterable(values)))
    mapped_entities = {}
    for entity in entities:
        for cast in entity['cast']:
            value = cast[key]
            if value not in mapped_entities:
                mapped_entities[value] = []
            mapped_entities[value].append(entity)
    logger.debug('Iterating {} took {} ms'.format(
        key, int((time.time() - start) * 1000)))
    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building {} index took {} ms'.format(
        key, int((time.time() - start) * 1000)))
    return index, mapped_entities
def build_collection_index(movies, tvshows):
    start = time.time()
    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = list(
        set([
            parse_collection(entity['set']) for entity in entities
            if 'set' in entity and len(entity['set']) > 0
        ]))
    mapped_entities = {}
    for entity in entities:
        if 'set' in entity and entity['set']:
            value = parse_collection(entity['set'])
            if value not in mapped_entities:
                mapped_entities[value] = []
            mapped_entities[value].append(entity)
    logger.debug('Iterating collection took {} ms'.format(
        int((time.time() - start) * 1000)))
    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building collection index took {} ms'.format(
        int((time.time() - start) * 1000)))
    return index, mapped_entities
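# A small, hypothetical driver for the index builders above. The movie/show
# dicts and the 0.3 threshold are illustrative only, and the snippet assumes
# the same module-level imports the builders use (time, itertools, logging,
# ngram.NGram). It shows the intended pattern: fuzzy-match a string against
# the NGram index, then map the matched string back to its entities.
movies = [{'title': 'Blade Runner'}, {'title': 'Blade Runner 2049'}]
tvshows = [{'title': 'Blade Runner: Black Lotus'}]

title_index, title_to_entities = build_title_index(movies, tvshows)
for matched_title, score in title_index.search('Blade Runner', threshold=0.3):
    for entity in title_to_entities[matched_title]:
        print(matched_title, score, entity)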
def build_multiclusters(inlines, threshold=0.05, N=4):
    clusters = []
    ignoreus = []
    for i, iline in enumerate(inlines):
        if i in ignoreus:
            continue
        iString = " ".join(iline.split(" :::: ")[:3])
        ignoreus.append(i)
        icluster = {}
        icluster[iline] = -1
        # seed the index with this line's key fields as a single item
        iModel = NGram([iString])
        for j in range(i, len(inlines)):
            if j in ignoreus:
                continue
            jline = inlines[j]
            jString = " ".join(jline.split(" :::: ")[:3])
            results = iModel.search(jString)
            score = sum([y for x, y in results]) / len(results) \
                if len(results) > 0 else 0.0
            print(score)
            if score > threshold:
                icluster[jline] = score
                iModel.add(jString)
                ignoreus.append(j)
        clusters.append(icluster)
    return clusters
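# Hypothetical driver for build_multiclusters() above. The file name and the
# " :::: "-delimited record format are assumptions inferred from how the
# function splits its input; only the call signature comes from the snippet.
with open('records.txt') as fh:
    lines = [line.rstrip('\n') for line in fh if line.strip()]
for cluster in build_multiclusters(lines, threshold=0.05):
    # each cluster maps an input line to its similarity score (seed line: -1)
    print(sorted(cluster, key=cluster.get, reverse=True))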
class Plagiarism:

    def __init__(self, text):
        self.ng = NGram()
        file = open(text, "r")
        linea = file.readline()
        while linea != '':
            if linea != '\n':
                self.ng.add(linea)
            linea = file.readline()
        self.lsn = list(self.ng)
        file.close()

    def verify(self, text_compare):
        results = []
        dictio = []
        file2 = open(text_compare, "r")
        linea2 = file2.readline()
        while linea2 != '':
            if linea2 != '\n':
                dictio += [self.ng.items_sharing_ngrams(linea2)]
                compares = 0.0
                for parrafo in self.lsn:
                    comp = NGram.compare(parrafo, linea2)
                    if compares < comp:
                        compares = comp
                results += [compares]
            linea2 = file2.readline()
        file2.close()
        major_ocurrences = []
        for d in dictio:
            major = 0
            for val in d.values():
                if major < val:
                    major = val
            major_ocurrences += [major]
        avg_perc = 0.0
        for r in results:
            avg_perc += r
        avg_perc = avg_perc / len(results)
        print("Highest number of occurrences per paragraph of the copied text: " + repr(major_ocurrences))
        print("Similarity percentage: " + repr(avg_perc))
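# A minimal, hypothetical driver for the Plagiarism class above. The file
# names are placeholders: the constructor indexes the reference text line by
# line, then verify() compares a second file against it and prints its report.
checker = Plagiarism('original.txt')
checker.verify('suspect.txt')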
def handle( self, *args, **options ):
    if "simonly" in args:
        new_count = 100000
    else:
        new_count = 0
        for source in Source.objects.filter( scraper = 'feedparser', status__in = ( 'silent', 'live' ) ):
            l = feedparser.parse( source.scraper_config )
            ok = True
            if l[ "bozo" ] == 1:
                if not isinstance( l[ "bozo_exception" ], feedparser.ThingsNobodyCaresAboutButMe ):
                    ok = False
            if ok:
                for article in l[ "entries" ]:
                    #print "Reading feed entry %s: '%s'" % ( article[ "id" ], article[ "title" ] )
                    a, created = Article.objects.get_or_create(
                        source = source,
                        # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                        source_reference = article[ "id" ],
                        defaults = {
                            'date_created' : datetime.now(),
                            'source_url' : article[ "link" ],
                            'title' : self.normalise( article[ "title" ] ),
                            'num_comments' : article.get( "slash_comments", 0 ),
                            'summary' : article[ "summary" ],
                            'author' : article.get( "author", "" ),
                            'date_published' : datetime(*(article[ "updated_parsed" ][:6])),
                            'status' : "live"
                        }
                    )
                    if created:
                        #print "Creating new article."
                        pass
                    else:
                        #print "Updating article."
                        pass
                    new_count += 1
                    if article.has_key( "content" ):
                        # TODO test for multiple content blocks and pick most appropriate
                        a.body = article[ "content" ][0][ "value" ]
                    a.tags.clear()
                    for tag in article.get( "tags", () ):
                        a.tags.add( tag[ "term" ] )
                    a.save()
            else:
                logging.error( "Could not read feed for file '%s': %s" % ( source.scraper_config, l[ "bozo_exception" ] ) )
                logging.error( "Skipping '%s': %s" % ( source.scraper_config, l[ "bozo_exception" ] ) )
                break

    #calculate similarities
    #create a similarity corpus of last 200 docs
    def enrich( obj ):
        s = unicode( obj )
        # simple stop words
        s = re.sub( r"\b(the|of|in|a)\b", "", s, re.IGNORECASE )
        # type prefixes
        s = re.sub( r"^(trailer|review|report|screenshots|video):\s*", "", s, re.IGNORECASE )
        return s

    n = NGram( warp=2.5, iconv=enrich )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:(new_count*4)]
    for article in articles:
        if "simonly" in args:
            article.is_duplicate = False
            article.duplicate_of = None
            article.save()
            continue
        #articles = Article.objects.filter( status = "live", is_duplicate = False ).order_by( "-date_published" )[:new_count]
        #for article in articles:
        #print( u"similarity for %s" % ( article.title, ) )
        sim = filter( lambda a: a[1] > 0.4, n.search( article.title ) )
        for match in sim:
            nearest = match[0]
            if nearest.source == article.source:
                continue
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                # do it again!
                if nearest.source == article.source:
                    continue
            article.is_duplicate = True
            article.duplicate_of = nearest
            #print u" is duplicate of %s" % ( nearest.title, )
            article.save()
            break
        n.add( article )
import csv

from ngram import NGram

records = NGram()
with open('./data/houses.csv', 'r', encoding='windows-1251') as f:
    for line in csv.reader(f, delimiter=';'):
        records.add(' '.join(list(line)).lower())

while True:
    print('Enter search text:')
    search_text = input().lower()
    # best fuzzy match above a 0.8 similarity threshold
    print('find', records.find(search_text, 0.8))
def test_unigram(self):
    n = NGram(0)
    n.add('after')
    assert n.next_word() == 'after'
def test_trigram(self):
    n = NGram(2)
    n.add('after', ('before', 'other'))
    assert n.next_word(('before', 'other')) == 'after'
def test_bigram(self):
    n = NGram(1)
    n.add('after', ('before',))
    assert n.next_word(('before',)) == 'after'
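# The three tests above target a word-prediction NGram class (constructed with
# an order, fed (word, context) pairs, queried with next_word()) rather than
# the string-similarity ngram library used elsewhere on this page. A minimal
# sketch of such a class, inferred only from the tests, might look like this;
# the real implementation is not shown in these snippets.
from collections import Counter, defaultdict

class NGram:
    def __init__(self, order):
        self.order = order                  # number of words of context
        self.counts = defaultdict(Counter)  # context -> counts of next words

    def add(self, word, context=()):
        self.counts[context][word] += 1

    def next_word(self, context=()):
        best = self.counts[context].most_common(1)
        return best[0][0] if best else None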