class TestMetrics(unittest.TestCase):
    """Checks for ``Metrics.euclid_vectors`` and ``Metrics.cos_vectors``."""

    def setUp(self):
        # Object under test, stored on the test case as ``self.m``.
        self.m = Metrics()

    def test_metrics(self):
        # The distance between (1, 1) and (4, 5) is expected to be 5.
        distance = self.m.euclid_vectors([1, 1], [4, 5])
        self.assertEqual(distance, 5)

        # (left vector, right vector, expected cosine rounded to 5 places)
        cosine_cases = (
            ([1, 1, 1], [1, 1, 1], 1.0),
            ([1, 0, 1], [0, 1, 0], 0.0),
        )
        for left, right, expected in cosine_cases:
            similarity = self.m.cos_vectors(left, right)
            self.assertEqual(round(similarity, 5), expected)
class TestMetrics(unittest.TestCase):
    """Exercise the Euclidean and cosine helpers of ``Metrics``."""

    def setUp(self):
        # Build the object under test and keep it on ``self.m``.
        metrics = Metrics()
        self.m = metrics

    def test_metrics(self):
        # Euclidean check: the result for (1, 1) vs (4, 5) must equal 5.
        origin, target = [1, 1], [4, 5]
        self.assertEqual(self.m.euclid_vectors(origin, target), 5)

        # Cosine check on two identical vectors: rounds to exactly 1.0.
        ones = [1, 1, 1]
        same_direction = self.m.cos_vectors(ones, [1, 1, 1])
        self.assertEqual(round(same_direction, 5), 1.0)

        # Cosine check on vectors with no shared non-zero position: 0.0.
        no_overlap = self.m.cos_vectors([1, 0, 1], [0, 1, 0])
        self.assertEqual(round(no_overlap, 5), 0.0)
def validateResponse(self, response, queryTerms):
    """Print a table of how often each query term occurs in a response body.

    Args:
        response: Object exposing ``status_code``, ``url`` and ``text``.
        queryTerms: String of whitespace-separated terms; matching against
            the page's tokens is case-insensitive.

    Returns:
        None. For a 200 response a ``PrettyTable`` with "Term" and
        "Frequency" columns is printed, most frequent term first; for any
        other status a one-line notice is printed instead.
    """
    if response.status_code != 200:
        print("[-] Response for %s is %s " % (response.url,
                                              response.status_code))
        return

    # Imported only once we know there is a body to analyse, so the
    # non-200 path does not depend on these third-party packages.
    from bs4 import BeautifulSoup
    from irlib.preprocessor import Preprocessor
    from irlib.matrix import Matrix

    soup = BeautifulSoup(response.text)
    prep = Preprocessor()
    terms = prep.ngram_tokenizer(text=soup.get_text())

    # NOTE(review): the matrix is populated but never read in this method;
    # kept in case add_doc() is relied on for validation - confirm.
    mx = Matrix()
    mx.add_doc(doc_id=response.url, doc_terms=terms,
               frequency=True, do_padding=True)

    # Lower-case and split the query once instead of on every iteration.
    # No ASCII round-trip: it raised on any non-ASCII token or query.
    wanted = set(queryTerms.lower().split())

    tableTerms = PrettyTable(["Term", "Frequency"])
    for word, frequency in Counter(terms).most_common():
        if word.lower() in wanted:
            tableTerms.add_row([word, frequency])
    print(tableTerms)
def __init__(self):
    """Initialise the default file name, empty bookkeeping and IR helpers."""
    # Plain state: source file name, an empty dict and a counter at zero.
    self.file_name = 'qa.txt'
    self.qa_list, self.qa_id = {}, 0

    # Helper objects, constructed in the same order as before.
    preprocessor = Preprocessor()
    matrix = Matrix()
    metrics = Metrics()
    self.prep = preprocessor
    self.mx = matrix
    self.metric = metrics
def __validateResponse(self, response, queryTerms):
    """Print a table of how often each query term occurs in a response body.

    Args:
        response: Object exposing ``status_code``, ``url`` and ``text``.
        queryTerms: String of whitespace-separated terms; matching against
            the page's tokens is case-insensitive.

    Returns:
        None. For a 200 response a two-column ``Texttable`` ("Term",
        "Frequency") is printed, most frequent term first, followed by a
        blank line; for any other status a one-line notice is printed.
    """
    if response.status_code != 200:
        print("[-] Response for %s is %s " % (response.url,
                                              response.status_code))
        return

    # Imported only once we know there is a body to analyse, so the
    # non-200 path does not depend on these third-party packages.
    from bs4 import BeautifulSoup
    from irlib.preprocessor import Preprocessor
    from irlib.matrix import Matrix

    soup = BeautifulSoup(response.text)
    prep = Preprocessor()
    terms = prep.ngram_tokenizer(text=soup.get_text())

    # NOTE(review): the matrix is populated but never read in this method;
    # kept in case add_doc() is relied on for validation - confirm.
    mx = Matrix()
    mx.add_doc(doc_id=response.url, doc_terms=terms,
               frequency=True, do_padding=True)

    # Lower-case and split the query once instead of on every iteration.
    wanted = set(queryTerms.lower().split())

    table = Texttable()
    table.set_cols_align(["l", "l"])
    table.set_cols_valign(["m", "m"])
    table.set_cols_width([40, 55])

    # Header row first, then one row per matching term by falling count.
    rows = [["Term", "Frequency"]]
    rows.extend(
        [word, frequency]
        for word, frequency in Counter(terms).most_common()
        if word.lower() in wanted
    )
    table.add_rows(rows)
    print(table.draw() + "\n")
class QA:
    """Tiny question-answering helper backed by an irlib term matrix.

    Questions read from ``file_name`` are indexed in ``mx``; ``ask`` prints
    the answer stored for the indexed question whose term vector has the
    smallest Euclidean distance to the query vector.
    """

    def __init__(self):
        """Create empty Q/A storage and the irlib helper objects."""
        self.file_name = 'qa.txt'
        # Maps a question id (int, starting at 1) to {'q': ..., 'a': ...}.
        self.qa_list = {}
        # Id of the most recently read question; 0 means "none yet".
        self.qa_id = 0
        self.prep = Preprocessor()
        self.mx = Matrix()
        self.metric = Metrics()

    def randomize(self, a):
        """Overwrite every element of ``a`` in place with a random 0 or 1."""
        for index, _ in enumerate(a):
            a[index] = random.randint(0, 1)

    def readfile(self):
        """Load question/answer pairs from ``self.file_name``.

        Each line is stripped, lower-cased and split on ':'. Only lines
        that yield exactly two parts are used: ``q:<text>`` starts a new
        question and indexes it in the matrix, ``a:<text>`` sets the answer
        of the most recent question. An ``a:`` line that appears before any
        question is ignored instead of raising ``KeyError``.
        """
        # ``with`` guarantees the handle is closed even if indexing fails.
        with open(self.file_name, 'r') as fd:
            for raw_line in fd:
                parts = raw_line.strip().lower().split(':')
                if len(parts) != 2:
                    continue
                tag, text = parts
                if tag == 'q':
                    self.qa_id += 1
                    self.qa_list[self.qa_id] = {'q': text, 'a': ''}
                    terms = self.prep.ngram_tokenizer(text=text)
                    self.mx.add_doc(doc_id=self.qa_id, doc_terms=terms,
                                    frequency=True, do_padding=True)
                elif tag == 'a' and self.qa_id in self.qa_list:
                    self.qa_list[self.qa_id]['a'] = text

    def ask(self, q=''):
        """Print the stored answer of the indexed question closest to ``q``.

        If the query vector sums to zero, it is replaced by random 0/1
        values before comparison, so some answer is still selected.

        Raises:
            KeyError: if no document was matched (for example the matrix
                holds no documents), because the fallback id 0 is not
                expected to be present in ``qa_list``.
        """
        terms = self.prep.ngram_tokenizer(text=q)
        q_vector = self.mx.query_to_vector(terms, frequency=False)
        if sum(q_vector) == 0:
            self.randomize(q_vector)

        best_id = 0
        # Infinity rather than a magic 99999, so that arbitrarily large
        # distances can still win the comparison.
        best_distance = float('inf')
        for doc in self.mx.docs:
            distance = self.metric.euclid_vectors(doc['terms'], q_vector)
            if distance < best_distance:
                best_distance = distance
                best_id = doc['id']
        print('Tarek: %s' % (self.qa_list[best_id]['a'],))
def setUp(self):
    # Fixture hook: build a Metrics instance and expose it as ``self.m``.
    metrics = Metrics()
    self.m = metrics
# Load the three modules:
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics
import difflib

# Create instances for their classes:
prep = Preprocessor()
mx = Matrix()
metric = Metrics()
q_vector = []


def generateMatrix():
    """Index every line of the training source file and dump the matrix.

    Each line of ``roto-sent-data.train-ONLYTAGS.src`` becomes one document
    in the module-level ``mx``: the line is split on single spaces, each
    token is stripped, and the 1-based line number (as a string) is used as
    the document id. After all lines are added, the matrix is written to
    ``IRmatrix.train.src.csv`` as tab-separated values with a header.
    """
    # ``with`` closes the input file even if add_doc() raises.
    with open('./content_transfer_data/roto-sent-data.train-ONLYTAGS.src',
              'r') as fd:
        for doc_number, line in enumerate(fd, 1):
            terms = [token.strip() for token in line.split(' ')]
            mx.add_doc(doc_id=str(doc_number), doc_terms=terms,
                       frequency=True, do_padding=True)
            # Progress marker: the number of documents added so far
            # (previously this printed one more than had been added).
            if doc_number % 1000 == 0:
                print(doc_number)
    mx.dump('IRmatrix.train.src.csv', delimiter='\t', header=True)
class TestMetrics(TestCase):
    """Unit tests for the Jaccard, Euclidean and cosine ``Metrics`` helpers."""

    def setUp(self):
        # Object under test, stored on the test case as ``self.m``.
        self.m = Metrics()

    # --- jaccard_vectors ---------------------------------------------------

    def test_jaccard_same_len(self):
        # Vectors of different lengths must be rejected with ValueError.
        short, longer = [0, 1], [0, 1, 2, 3]
        with self.assertRaises(ValueError):
            self.m.jaccard_vectors(short, longer)

    def test_jaccard_empty(self):
        # Two empty vectors are expected to yield 1.
        result = self.m.jaccard_vectors([], [])
        self.assertEqual(result, 1)

    def test_jaccard_int(self):
        left = [0, 2, 1, 3]
        right = [0, 1, 2, 3]
        result = self.m.jaccard_vectors(left, right)
        self.assertEqual(result, 0.75)

    def test_jaccard_bool(self):
        left = [False, False, True, True, True]
        right = [False, True, True, True, False]
        result = self.m.jaccard_vectors(left, right)
        self.assertEqual(result, 0.4)

    # --- euclid_vectors ----------------------------------------------------

    def test_euclid_same_len(self):
        # Vectors of different lengths must be rejected with ValueError.
        longer, short = [0, 1, 2, 3], [0, 1]
        with self.assertRaises(ValueError):
            self.m.euclid_vectors(longer, short)

    def test_euclid(self):
        # The distance between (1, 1) and (4, 5) is expected to be 5.
        distance = self.m.euclid_vectors([1, 1], [4, 5])
        self.assertEqual(distance, 5)

    # --- cos_vectors -------------------------------------------------------

    def test_cos_same_len(self):
        # Vectors of different lengths must be rejected with ValueError.
        with self.assertRaises(ValueError):
            self.m.cos_vectors([0, 1, 2], [1, 1])

    def test_cos_0(self):
        # No shared non-zero position: expected to round to 0.0.
        similarity = self.m.cos_vectors([1, 0, 1], [0, 1, 0])
        self.assertEqual(round(similarity, 5), 0.0)

    def test_cos_1(self):
        # Identical vectors: expected to round to 1.0.
        similarity = self.m.cos_vectors([1, 1, 1], [1, 1, 1])
        self.assertEqual(round(similarity, 5), 1.0)