# Public: builds the near-duplicates index from every row of the
# dj_document table in the "djangology" PostgreSQL database.
#
# test_docs_dir - directory name stored on the instance; this
#                 database-backed loader does not read from it
#                 (default: "./test")
#
# Raises psycopg2.Error if the connection or the query fails.
def __init__(self, test_docs_dir="./test"):
    self.test_docs_dir = test_docs_dir
    self.files = []
    self.index = NearDuplicatesIndex()

    # print(...) with a single argument behaves the same on Python 2 and 3.
    try:
        conn = psycopg2.connect(
            "dbname='djangology' user='******' password=''")
    except psycopg2.Error:
        # Nothing below can run without a connection: report and re-raise
        # instead of falling through to a NameError on `conn`.
        print("I am unable to connect to the database.")
        raise

    try:
        cur = conn.cursor()
        try:
            try:
                cur.execute("""SELECT * from dj_document""")
            except psycopg2.Error:
                print("I can't SELECT from dj_document")
                raise
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        # The connection is only needed to load the rows; release it even
        # when the query fails.
        conn.close()

    # NOTE(review): depends on the column order returned by SELECT * --
    # presumably row[0] identifies the document and row[1] is its
    # content; confirm against the dj_document schema.
    for row in rows:
        self.index.append(row[1], row[0])
        self.files.append(row[0])
# Public: scans the test directory and adds every visible regular file to
# the near-duplicates index.
#
# test_docs_dir - directory holding the documents to compare
#                 (default: "./test")
def __init__(self, test_docs_dir="./test"):
    self.test_docs_dir = test_docs_dir

    # Keep regular files only, leaving out names that start with a dot.
    visible = []
    for entry in os.listdir(test_docs_dir):
        if not os.path.isfile(os.path.join(test_docs_dir, entry)):
            continue
        if entry[0] == ".":
            continue
        visible.append(entry)
    self.files = visible

    self.index = NearDuplicatesIndex()

    # Index each file under its full path (as built by self.filename).
    for name in self.files:
        path = self.filename(name)
        with open(path) as handle:
            text = handle.read()
            # Whitespace and punctuation are trimmed from the two ends of
            # the whole text only; the words are then split on single
            # spaces.
            trimmed = text.strip().strip(",.!|&-_()[]<>{}/\"'").strip()
            self.index.append(trimmed.split(" "), path)
# Public: detects near-duplicate documents among the files of a directory
# by comparing every pair of files through a NearDuplicatesIndex.
class Detector:

    # Public: lists the visible regular files of the test directory and
    # adds the words of each one to the index.
    #
    # test_docs_dir - directory holding the documents to compare
    #                 (default: "./test")
    def __init__(self, test_docs_dir="./test"):
        self.test_docs_dir = test_docs_dir
        self.files = [
            name for name in os.listdir(test_docs_dir)
            if os.path.isfile(os.path.join(test_docs_dir, name))
            and name[0] != "."
        ]
        self.index = NearDuplicatesIndex()

        # Calculate near-duplicates index
        for name in self.files:
            path = self.filename(name)
            with open(path) as f:
                # Whitespace and punctuation are trimmed from the two ends
                # of the whole text only, then the text is split on single
                # spaces.
                doc = f.read().strip().strip(
                    ",.!|&-_()[]<>{}/\"'").strip().split(" ")
            self.index.append(doc, path)

    # Public: returns the full relative path from the base dir of the project
    # to the filename input
    #
    # filename - the filename relative to the test directory
    #
    # Returns full filename (including test directory)
    def filename(self, filename):
        return "%s/%s" % (self.test_docs_dir, filename)

    # Public: checks for near-duplicates in the set of files based on a
    # jaccard coefficient threshold
    #
    # threshold - pairs are reported only when their jaccard coefficient
    #             is strictly greater than this value (default: 0.5)
    #
    # Returns a string with one line per matching pair, giving both file
    # names and the coefficient; an empty string when nothing matches
    def check_for_duplicates(self, threshold=0.5):
        # Imported here so the method does not depend on the import block
        # at the top of the file.
        import itertools

        matches = []
        # combinations() yields each unordered pair once, in the same order
        # as a nested "every later file" loop.
        for f1, f2 in itertools.combinations(self.files, 2):
            jaccard = self.index.get_jaccard(
                self.filename(f1), self.filename(f2))
            if jaccard > threshold:
                matches.append(
                    "%s and %s are near-duplicates, with Jaccard value of %0.3f."
                    % (f1, f2, jaccard))
        return "\n".join(matches)
# Public: collects the visible regular files of the test directory and
# feeds the words of each one into a new NearDuplicatesIndex.
#
# test_docs_dir - directory holding the documents to compare
#                 (default: "./test")
def __init__(self, test_docs_dir="./test"):
    self.test_docs_dir = test_docs_dir

    entries = os.listdir(test_docs_dir)
    self.files = [
        entry for entry in entries
        if not entry.startswith(".")
        and os.path.isfile(os.path.join(test_docs_dir, entry))
    ]

    self.index = NearDuplicatesIndex()

    # Characters trimmed from the two ends of a document's text.
    punctuation = ",.!|&-_()[]<>{}/\"'"

    for entry in self.files:
        full_name = self.filename(entry)
        with open(full_name) as source:
            content = source.read().strip()
            content = content.strip(punctuation).strip()
            # Words are whatever sits between single spaces.
            words = content.split(" ")
            self.index.append(words, full_name)
# Public: detects near-duplicate documents among the files of a directory
# by comparing every pair of files through a NearDuplicatesIndex.
class Detector:

    # Public: lists the visible regular files of the test directory and
    # adds the words of each one to the index.
    #
    # test_docs_dir - directory holding the documents to compare
    #                 (default: "./test")
    def __init__(self, test_docs_dir="./test"):
        self.test_docs_dir = test_docs_dir
        self.files = [
            name for name in os.listdir(test_docs_dir)
            if os.path.isfile(os.path.join(test_docs_dir, name))
            and name[0] != "."
        ]
        self.index = NearDuplicatesIndex()

        # Calculate near-duplicates index
        for name in self.files:
            path = self.filename(name)
            with open(path) as f:
                # Only the two ends of the whole text are trimmed of
                # whitespace and punctuation; the result is split on
                # single spaces.
                doc = f.read().strip().strip(
                    ",.!|&-_()[]<>{}/\"'").strip().split(" ")
            self.index.append(doc, path)

    # Public: returns the full relative path from the base dir of the project
    # to the filename input
    #
    # filename - the filename relative to the test directory
    #
    # Returns full filename (including test directory)
    def filename(self, filename):
        return "%s/%s" % (self.test_docs_dir, filename)

    # Public: checks for near-duplicates in the set of files based on a
    # jaccard coefficient threshold
    #
    # threshold - pairs are reported only when their jaccard coefficient
    #             is strictly greater than this value (default: 0.5)
    #
    # Returns a string with one line per matching pair, giving both file
    # names and the coefficient; an empty string when nothing matches
    def check_for_duplicates(self, threshold=0.5):
        # Local import: keeps the method independent of the import block
        # at the top of the file.
        import itertools

        matches = []
        # Each unordered pair is visited once, earlier file first.
        for f1, f2 in itertools.combinations(self.files, 2):
            jaccard = self.index.get_jaccard(
                self.filename(f1), self.filename(f2))
            if jaccard > threshold:
                matches.append(
                    "%s and %s are near-duplicates, with Jaccard value of %0.3f."
                    % (f1, f2, jaccard))
        return "\n".join(matches)
# Internal: prepares the fixtures used by the tests: two four-word
# documents that differ only in their third word, and a newly constructed
# NearDuplicatesIndex.
def setUp(self):
    first = ['this', 'is', 'a', 'document']
    second = ['this', 'is', 'b', 'document']
    self.docs = [first, second]
    self.index = NearDuplicatesIndex()
# Unit tests for NearDuplicatesIndex: appending documents, rejecting
# duplicated names, jaccard lookup and conditional appending.
class TestNearDuplicatesIndex(unittest.TestCase):

    # Two four-word documents differing only in the third word, plus a
    # newly constructed index for every test.
    def setUp(self):
        self.docs = [
            ['this', 'is', 'a', 'document'],
            ['this', 'is', 'b', 'document'],
        ]
        self.index = NearDuplicatesIndex()

    # Appending two documents under distinct names gives an index of
    # length 2.
    def test_should_allow_to_append_documents(self):
        first, second = self.docs
        self.index.append(first, 'doc1')
        self.index.append(second, 'doc2')
        size = len(self.index)
        self.assertEqual(size, 2)

    # Re-using an existing document name must raise.
    def test_should_raise_an_error_when_docname_is_duplicated(self):
        first, second = self.docs
        self.index.append(first, 'doc1')
        with self.assertRaises(Exception):
            self.index.append(second, 'doc1')

    # The same word list stored under two names has a coefficient of 1.0.
    def test_should_calculate_jaccard_coefficient(self):
        words = self.docs[0]
        self.index.append(words, 'doc1')
        self.index.append(words, 'doc2')
        coefficient = self.index.get_jaccard('doc1', 'doc2')
        self.assertEqual(coefficient, 1.0)

    # Looking up names that were never appended must raise.
    def test_should_raise_an_error_if_document_does_not_exist(self):
        with self.assertRaises(Exception):
            self.index.get_jaccard('doc1', 'doc3')

    # NOTE(review): the third argument of appendif is presumably a jaccard
    # threshold; with 1.0 the second document is expected to be added.
    def test_should_append_a_document_if_its_not_duplicated(self):
        first, second = self.docs
        self.index.append(first, 'doc1')
        self.index.appendif(second, 'doc2', 1.0)
        size = len(self.index)
        self.assertEqual(size, 2)

    # With -1.0 as the third argument the second document is expected to
    # be left out, so the index keeps a single entry.
    def test_should_not_append_a_document_if_its_duplicated(self):
        first, second = self.docs
        self.index.append(first, 'doc1')
        self.index.appendif(second, 'doc2', -1.0)
        size = len(self.index)
        self.assertEqual(size, 1)
# Public: detects near-duplicate documents among the rows of the
# dj_document table by comparing every pair through a NearDuplicatesIndex.
class Detector:

    # Public: loads every row of dj_document and adds it to the index.
    #
    # test_docs_dir - directory name stored on the instance and used by
    #                 filename(); the database loader does not read from
    #                 it (default: "./test")
    #
    # Raises psycopg2.Error if the connection or the query fails.
    def __init__(self, test_docs_dir="./test"):
        self.test_docs_dir = test_docs_dir
        self.files = []
        self.index = NearDuplicatesIndex()

        # Calculate near-duplicates index
        # NOTE(review): depends on the column order returned by SELECT * --
        # presumably row[0] identifies the document and row[1] is its
        # content; confirm against the dj_document schema.
        for row in self._fetch_documents():
            self.index.append(row[1], row[0])
            self.files.append(row[0])

    # Internal: reads all rows of dj_document, always releasing the cursor
    # and the connection.
    #
    # Returns the rows produced by cursor.fetchall().
    # Raises psycopg2.Error if the connection or the query fails.
    def _fetch_documents(self):
        # print(...) with a single argument behaves the same on Python 2
        # and 3.
        try:
            conn = psycopg2.connect(
                "dbname='djangology' user='******' password=''")
        except psycopg2.Error:
            # Report and re-raise instead of falling through to a
            # NameError on `conn`.
            print("I am unable to connect to the database.")
            raise

        try:
            cur = conn.cursor()
            try:
                try:
                    cur.execute("""SELECT * from dj_document""")
                except psycopg2.Error:
                    print("I can't SELECT from dj_document")
                    raise
                return cur.fetchall()
            finally:
                cur.close()
        finally:
            conn.close()

    # Public: returns the full relative path from the base dir of the project
    # to the filename input
    #
    # filename - the filename relative to the test directory
    #
    # Returns full filename (including test directory)
    def filename(self, filename):
        return "%s/%s" % (self.test_docs_dir, filename)

    # Public: checks for near-duplicates in the set of documents based on a
    # jaccard coefficient threshold
    #
    # threshold - pairs are reported only when their jaccard coefficient
    #             is strictly greater than this value (default: 0.5)
    #
    # Returns a string with one line per matching pair, giving both
    # document identifiers and the coefficient; an empty string when
    # nothing matches
    def check_for_duplicates(self, threshold=0.5):
        # Local import: keeps the method independent of the import block
        # at the top of the file.
        import itertools

        matches = []
        # Documents are indexed under the identifiers stored in self.files,
        # so those are passed to the index unchanged.
        for f1, f2 in itertools.combinations(self.files, 2):
            jaccard = self.index.get_jaccard(f1, f2)
            if jaccard > threshold:
                matches.append(
                    "%s and %s are near-duplicates, with Jaccard value of %0.3f."
                    % (f1, f2, jaccard))
        return "\n".join(matches)