Beispiel #1
0
class TestMetrics(unittest.TestCase):
    """Sanity checks for ``Metrics.euclid_vectors`` and ``Metrics.cos_vectors``."""

    def setUp(self):
        # Every test method gets its own Metrics instance.
        self.m = Metrics()

    def test_metrics(self):
        """Check one Euclidean result and two cosine results."""
        # (1, 1) -> (4, 5) differs by 3 and 4, so 5 is expected.
        distance = self.m.euclid_vectors([1, 1], [4, 5])
        self.assertEqual(distance, 5)

        # Identical vectors are expected to give 1.0; vectors that share
        # no non-zero component are expected to give 0.0.
        cosine_cases = (
            ([1, 1, 1], [1, 1, 1], 1),
            ([1, 0, 1], [0, 1, 0], 0),
        )
        for left, right, expected in cosine_cases:
            cosine = self.m.cos_vectors(left, right)
            self.assertEqual(round(cosine, 5), float(expected))
Beispiel #2
0
class TestMetrics(unittest.TestCase):
    """Exercise the Euclidean and cosine helpers of ``Metrics``."""

    def setUp(self):
        """Create the ``Metrics`` object used by the assertions."""
        self.m = Metrics()

    def test_metrics(self):
        """Compare each measure against a hand-computed value."""
        # Component differences of 3 and 4 give an expected result of 5.
        self.assertEqual(self.m.euclid_vectors([1, 1], [4, 5]), 5)

        # Results are rounded to 5 decimals before the comparison.
        identical = self.m.cos_vectors([1, 1, 1], [1, 1, 1])
        self.assertEqual(round(identical, 5), float(1))

        disjoint = self.m.cos_vectors([1, 0, 1], [0, 1, 0])
        self.assertEqual(round(disjoint, 5), float(0))
    def validateResponse(self, response, queryTerms):
        """Print how often each query term occurs in a fetched page.

        For a response with status code 200 the page text is extracted with
        BeautifulSoup, tokenized with ``Preprocessor.ngram_tokenizer`` and
        counted.  A "Term"/"Frequency" table is then printed for the tokens
        that also occur in ``queryTerms``, most frequent first.  For any
        other status code a one-line notice with the URL and the status
        code is printed instead.

        :param response: object exposing ``status_code``, ``text`` and
            ``url`` (presumably a ``requests`` response -- confirm at the
            call site).
        :param queryTerms: whitespace-separated search terms; compared
            case-insensitively against the page tokens.
        """
        from bs4 import BeautifulSoup

        if response.status_code != 200:
            print("[-] Response for %s is %s " % (response.url,
                                                  response.status_code))
            return

        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix

        # NOTE(review): no parser is named here, so bs4 chooses one itself.
        soup = BeautifulSoup(response.text)
        terms = Preprocessor().ngram_tokenizer(text=soup.get_text())

        # NOTE(review): this matrix is populated but never read in this
        # method; the call is kept because add_doc's effects are not
        # visible from here.
        mx = Matrix()
        mx.add_doc(doc_id=response.url,
                   doc_terms=terms,
                   frequency=True,
                   do_padding=True)

        # Split the query once instead of once per page token.  Terms are
        # compared as text: forcing them through the ASCII codec made any
        # non-ASCII token raise UnicodeEncodeError and abort the report.
        wanted = set(queryTerms.lower().split())

        tableTerms = PrettyTable(["Term", "Frequency"])
        for word, frequency in Counter(terms).most_common():
            if word.lower() in wanted:
                tableTerms.add_row([word, frequency])
        print(tableTerms)
Beispiel #4
0
 def __init__(self):
     """Set the default file name, empty bookkeeping and helper objects.

     ``qa_list`` starts as an empty dict and ``qa_id`` as 0; one
     ``Preprocessor``, one ``Matrix`` and one ``Metrics`` instance are
     created and stored on the object.
     """
     # Plain-value state.
     self.qa_id = 0
     self.qa_list = {}
     self.file_name = 'qa.txt'

     # Helper objects.
     self.prep = Preprocessor()
     self.mx = Matrix()
     self.metric = Metrics()
Beispiel #5
0
    def __validateResponse(self, response, queryTerms):
        """Print a table of query-term frequencies for a fetched page.

        For a response with status code 200 the page text is extracted with
        BeautifulSoup, tokenized with ``Preprocessor.ngram_tokenizer`` and
        counted.  A two-column "Term"/"Frequency" table is drawn for the
        tokens that also occur in ``queryTerms``, most frequent first, and
        printed followed by a blank line.  For any other status code a
        one-line notice with the URL and the status code is printed.

        :param response: object exposing ``status_code``, ``text`` and
            ``url`` (presumably a ``requests`` response -- confirm at the
            call site).
        :param queryTerms: whitespace-separated search terms; compared
            case-insensitively against the page tokens.
        """
        from bs4 import BeautifulSoup

        if response.status_code != 200:
            print("[-] Response for %s is %s " % (response.url,
                                                  response.status_code))
            return

        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix

        # NOTE(review): no parser is named here, so bs4 chooses one itself.
        soup = BeautifulSoup(response.text)
        terms = Preprocessor().ngram_tokenizer(text=soup.get_text())

        # NOTE(review): this matrix is populated but never read in this
        # method; the call is kept because add_doc's effects are not
        # visible from here.
        mx = Matrix()
        mx.add_doc(doc_id=response.url,
                   doc_terms=terms,
                   frequency=True,
                   do_padding=True)

        # Split the query once instead of once per page token.
        wanted = set(queryTerms.lower().split())

        table = Texttable()
        table.set_cols_align(["l", "l"])
        table.set_cols_valign(["m", "m"])
        table.set_cols_width([40, 55])

        # Header row first, then one row per matching token.
        rows = [["Term", "Frequency"]]
        rows.extend([word, frequency]
                    for word, frequency in Counter(terms).most_common()
                    if word.lower() in wanted)
        table.add_rows(rows)
        print(table.draw() + "\n")
Beispiel #6
0
class QA:
    """Answer questions by finding the closest stored question.

    Question/answer pairs are read from ``file_name``.  Each stored
    question is added to a ``Matrix``; ``ask`` converts the incoming
    question to a vector and prints the answer of the stored question with
    the smallest ``euclid_vectors`` value.
    """

    def __init__(self):
        """Set the default file name, empty bookkeeping and helper objects."""
        self.file_name = 'qa.txt'
        self.qa_list = {}   # qa_id -> {'q': question text, 'a': answer text}
        self.qa_id = 0      # id of the most recently read question
        self.prep = Preprocessor()
        self.mx = Matrix()
        self.metric = Metrics()

    def randomize(self, a):
        """Overwrite every element of ``a`` in place with a random 0 or 1.

        :param a: mutable, index-assignable sequence; its length is kept.
        """
        for index in range(len(a)):
            a[index] = random.randint(0, 1)

    def readfile(self):
        """Load question/answer pairs from ``self.file_name``.

        Each line is stripped, lower-cased and split on ``:``.  Lines that
        do not split into exactly two parts are skipped.  A ``q`` line
        starts a new entry and adds its tokens to the matrix; an ``a``
        line sets the answer of the most recent question.  An ``a`` line
        that appears before any question is skipped like other malformed
        lines instead of raising ``KeyError``.
        """
        # The context manager closes the file even if parsing raises.
        with open(self.file_name, 'r') as fd:
            for raw_line in fd:
                parts = raw_line.strip().lower().split(':')
                if len(parts) != 2:
                    continue
                tag, text = parts
                if tag == 'q':
                    self.qa_id += 1
                    self.qa_list[self.qa_id] = {'q': text, 'a': ''}
                    terms = self.prep.ngram_tokenizer(text=text)
                    self.mx.add_doc(doc_id=self.qa_id, doc_terms=terms,
                                    frequency=True, do_padding=True)
                elif tag == 'a' and self.qa_id in self.qa_list:
                    self.qa_list[self.qa_id]['a'] = text

    def ask(self, q=''):
        """Print the answer of the stored question closest to ``q``.

        :param q: the question text.
        :raises KeyError: when no stored question is selected (for example
            nothing has been loaded), because the fallback id 0 is never
            assigned by ``readfile``.
        """
        terms = self.prep.ngram_tokenizer(text=q)
        q_vector = self.mx.query_to_vector(terms, frequency=False)

        # An all-zero query vector is replaced by random 0/1 values --
        # presumably so an unrecognised question still gets some answer.
        if sum(q_vector) == 0:
            self.randomize(q_vector)

        q_id = 0
        # Infinity rather than a magic 99999, so any finite distance wins.
        q_distance = float('inf')
        for doc in self.mx.docs:
            distance = self.metric.euclid_vectors(doc['terms'], q_vector)
            if distance < q_distance:
                q_distance = distance
                q_id = doc['id']

        print('Tarek: %s' % (self.qa_list[q_id]['a'],))
Beispiel #7
0
 def setUp(self):
     # Store a new Metrics instance on the object as ``self.m``.
     # NOTE(review): the enclosing class is not visible here; by unittest
     # convention setUp runs before each test method -- confirm.
     self.m = Metrics()
Beispiel #8
0
# Load the three irlib modules (difflib comes from the standard library):
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics
import difflib

# Module-level helper objects, created once when this module is loaded:
prep = Preprocessor()
mx = Matrix()  # filled by generateMatrix()
metric = Metrics()
q_vector = []  # starts empty; nothing in the visible code writes to it


def generateMatrix(
        src_path='./content_transfer_data/roto-sent-data.train-ONLYTAGS.src',
        out_path='IRmatrix.train.src.csv'):
    """Add every line of ``src_path`` to ``mx`` and dump the matrix.

    Each line is split on single spaces, the pieces are stripped, and the
    resulting terms are added to the module-level ``mx`` as one document
    whose id is the 1-based line number as a string.  After the whole file
    has been read the matrix is dumped to ``out_path`` as tab-delimited
    text with a header.

    :param src_path: input file, one document per line; defaults to the
        path that was previously hard-coded.
    :param out_path: file the matrix is dumped to; defaults to the name
        that was previously hard-coded.
    """
    count = 1
    # The context manager closes the input even if add_doc raises, and the
    # file is streamed line by line instead of being loaded whole.
    with open(src_path, 'r') as fd:
        for line in fd:
            terms = [piece.strip() for piece in line.split(' ')]
            mx.add_doc(doc_id=str(count),
                       doc_terms=terms,
                       frequency=True,
                       do_padding=True)

            count += 1
            # NOTE(review): count is already the id of the NEXT document,
            # so this prints 1000 after 999 documents; the original timing
            # is kept.
            if count % 1000 == 0:
                print(count)

    mx.dump(out_path, delimiter='\t', header=True)

Beispiel #9
0
 def setUp(self):
     # Store a new Metrics instance on the object as ``self.m``.
     # NOTE(review): the enclosing class is not visible here; by unittest
     # convention setUp runs before each test method -- confirm.
     self.m = Metrics()
Beispiel #10
0
class TestMetrics(TestCase):
    """Tests for the Jaccard, Euclidean and cosine helpers of ``Metrics``."""

    def setUp(self):
        # Every test method gets its own Metrics instance.
        self.m = Metrics()

    # -- jaccard_vectors ---------------------------------------------------

    def test_jaccard_same_len(self):
        """Vectors of different lengths must raise ValueError."""
        shorter = [0, 1]
        longer = [0, 1, 2, 3]
        with self.assertRaises(ValueError):
            self.m.jaccard_vectors(shorter, longer)

    def test_jaccard_empty(self):
        """Two empty vectors are expected to give 1."""
        result = self.m.jaccard_vectors([], [])
        self.assertEqual(result, 1)

    def test_jaccard_int(self):
        """Integer vectors: the expected result for this pair is 0.75."""
        first = [0, 2, 1, 3]
        second = [0, 1, 2, 3]
        self.assertEqual(self.m.jaccard_vectors(first, second), 0.75)

    def test_jaccard_bool(self):
        """Boolean vectors: the expected result for this pair is 0.4."""
        first = [False, False, True, True, True]
        second = [False, True, True, True, False]
        self.assertEqual(self.m.jaccard_vectors(first, second), 0.4)

    # -- euclid_vectors ----------------------------------------------------

    def test_euclid_same_len(self):
        """Vectors of different lengths must raise ValueError."""
        longer = [0, 1, 2, 3]
        shorter = [0, 1]
        with self.assertRaises(ValueError):
            self.m.euclid_vectors(longer, shorter)

    def test_euclid(self):
        """Component differences of 3 and 4 give an expected result of 5."""
        distance = self.m.euclid_vectors([1, 1], [4, 5])
        self.assertEqual(distance, 5)

    # -- cos_vectors -------------------------------------------------------

    def test_cos_same_len(self):
        """Vectors of different lengths must raise ValueError."""
        longer = [0, 1, 2]
        shorter = [1, 1]
        with self.assertRaises(ValueError):
            self.m.cos_vectors(longer, shorter)

    def test_cos_0(self):
        """Vectors sharing no non-zero component are expected to give 0.0."""
        cosine = self.m.cos_vectors([1, 0, 1], [0, 1, 0])
        self.assertEqual(round(cosine, 5), float(0))

    def test_cos_1(self):
        """Identical vectors are expected to give 1.0."""
        cosine = self.m.cos_vectors([1, 1, 1], [1, 1, 1])
        self.assertEqual(round(cosine, 5), float(1))