def validateResponse(self, response, queryTerms):
    """Print how often each query term occurs in the text of a response.

    For a 200 response the body is parsed as HTML, its text is tokenized
    with ``Preprocessor.ngram_tokenizer`` and a PrettyTable listing the
    tokens that also appear in ``queryTerms`` is printed, most frequent
    first.  Any other status code is reported on a single line instead.

    :param response: object exposing ``status_code``, ``url`` and ``text``
        (presumably a ``requests.Response`` -- confirm against the caller).
    :param queryTerms: whitespace-separated search terms; matching against
        the page tokens is case-insensitive.
    :returns: None; all output goes to stdout.
    """
    if response.status_code != 200:
        print("[-] Response for %s is %s " % (response.url,
                                              response.status_code))
        return

    # Imported only on the success path so that reporting a failed request
    # does not depend on the parsing libraries being importable.
    from bs4 import BeautifulSoup
    from irlib.preprocessor import Preprocessor

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and results differ between machines.
    soup = BeautifulSoup(response.text, 'html.parser')
    terms = Preprocessor().ngram_tokenizer(text=soup.get_text())
    cnt = Counter(terms)

    # Split the query once; compare text directly rather than forcing an
    # ASCII encode, which raised on any non-ASCII token in the page.
    wanted = set(queryTerms.lower().split())

    tableTerms = PrettyTable(["Term", "Frequency"])
    for word in sorted(cnt, key=cnt.get, reverse=True):
        if word.lower() in wanted:
            tableTerms.add_row([word, cnt[word]])
    print(tableTerms)
Exemple #2
0
    def __validateResponse(self, response, queryTerms):
        """Print a Texttable of query-term frequencies found in a response.

        For a 200 response the body is parsed as HTML, its text is tokenized
        with ``Preprocessor.ngram_tokenizer`` and the tokens that also appear
        in ``queryTerms`` are drawn in a two-column table, most frequent
        first.  Any other status code is reported on a single line instead.

        :param response: object exposing ``status_code``, ``url`` and
            ``text`` (presumably a ``requests.Response`` -- confirm against
            the caller).
        :param queryTerms: whitespace-separated search terms; matching
            against the page tokens is case-insensitive.
        :returns: None; all output goes to stdout.
        """
        if response.status_code != 200:
            print("[-] Response for %s is %s " % (response.url,
                                                  response.status_code))
            return

        # Imported only on the success path so that reporting a failed
        # request does not depend on the parsing libraries.
        from bs4 import BeautifulSoup
        from irlib.preprocessor import Preprocessor

        # Name the parser explicitly: without it BeautifulSoup picks
        # whichever parser is installed and results differ between machines.
        soup = BeautifulSoup(response.text, 'html.parser')
        terms = Preprocessor().ngram_tokenizer(text=soup.get_text())
        cnt = Counter(terms)

        table = Texttable()
        table.set_cols_align(["l", "l"])
        table.set_cols_valign(["m", "m"])
        table.set_cols_width([40, 55])

        # Split the query once instead of once per candidate word.
        wanted = set(queryTerms.lower().split())

        rows = [["Term", "Frequency"]]
        rows.extend([word, cnt[word]]
                    for word in sorted(cnt, key=cnt.get, reverse=True)
                    if word.lower() in wanted)
        table.add_rows(rows)
        print(table.draw() + "\n")
Exemple #3
0
 def test_get_doc_by_id(self):
     # Add three documents with ids '0'..'2', then check that looking up
     # id '1' through docs.index() leads back to the document with that id.
     matrix = Matrix()
     sentences = ['hello', 'how are you', 'fine thank you']
     for position, sentence in enumerate(sentences):
         matrix.add_doc(doc_id=str(position),
                        doc_class='Email',
                        doc_terms=sentence.split(),
                        do_padding=True,
                        frequency=True)
     position_of_one = matrix.docs.index('1')
     fetched = matrix.docs[position_of_one]
     self.assertEqual(fetched['id'], '1')
Exemple #4
0
 def test_white_and_black_lists(self):
     # With 'test' whitelisted and 'this'/'is'/'a' blacklisted, the
     # vocabulary after adding one document is expected to be just 'test'.
     allowed = ['test']
     blocked = ['this', 'is', 'a']
     matrix = Matrix(whitelist=allowed, blacklist=blocked)
     matrix.add_doc(doc_id='doc1',
                    doc_class='TestClass',
                    doc_terms=['this', 'is', 'a', 'new', 'test'],
                    do_padding=True,
                    frequency=True)
     vocabulary = matrix.vocabulary()
     self.assertItemsEqual(vocabulary, ['test'])
Exemple #5
0
 def test_meta_data(self):
     # Values passed through meta_data should be readable back from the
     # stored document under the same keys.
     matrix = Matrix()
     for position, text in enumerate(['hello', 'world']):
         extra = {
             'original_text': text,
             'original_text_len': len(text)
         }
         matrix.add_doc(doc_id=str(position),
                        doc_class='Email',
                        doc_terms=text.split(),
                        do_padding=True,
                        frequency=True,
                        meta_data=extra)
     self.assertEqual(matrix.docs[1]['original_text'], 'world')
     self.assertEqual(matrix.docs[1]['original_text_len'], 5)
Exemple #6
0
 def test_docs_unique_ids(self):
     # Three documents go in with ids '0'..'2'; re-adding id '1' with
     # unique_ids=True must leave the matrix length at three.
     matrix = Matrix()
     sentences = ['hello', 'how are you', 'fine thank you']
     for position, sentence in enumerate(sentences):
         matrix.add_doc(doc_id=str(position),
                        doc_class='Email',
                        doc_terms=sentence.split(),
                        do_padding=True,
                        frequency=True)
     duplicate_terms = 'goodbye'.split()
     matrix.add_doc(doc_id='1',
                    doc_class='Email',
                    doc_terms=duplicate_terms,
                    do_padding=True,
                    frequency=True,
                    unique_ids=True)
     size = len(matrix)
     self.assertEqual(size, 3)
Exemple #7
0
def readfiles(fold_path='all-folds/fold1/'):
    """Tokenize every file in ``fold_path`` into a Matrix and print a summary.

    Each directory entry is read in full, tokenized with
    ``Preprocessor.ngram_tokenizer`` and added to a fresh ``Matrix`` under
    its file name.  The document count, term count, terms and documents are
    then printed to stdout.

    :param fold_path: directory whose entries are read; a trailing path
        separator is optional.
    :returns: None.
    """
    prep = Preprocessor()
    mx = Matrix()

    for filename in os.listdir(fold_path):
        # os.path.join avoids the doubled separator the old '%s/%s' format
        # produced with the default argument; the with-block guarantees the
        # handle is closed even if reading fails.
        with open(os.path.join(fold_path, filename), 'r') as fd:
            file_data = fd.read()
        terms = prep.ngram_tokenizer(text=file_data)
        mx.add_doc(doc_id=filename,
                   doc_terms=terms,
                   frequency=True,
                   do_padding=True)

    print('Number of read documents: %s' % len(mx.docs))
    print('Number of read terms %s' % len(mx.terms))
    print(mx.terms)
    print(mx.docs)
Exemple #8
0
# Load the three modules:
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics
import difflib

# One shared instance of each irlib class, created at import time, plus a
# list named q_vector that starts out empty.
prep, mx, metric = Preprocessor(), Matrix(), Metrics()
q_vector = list()


def generateMatrix(
        src_path='./content_transfer_data/roto-sent-data.train-ONLYTAGS.src',
        out_path='IRmatrix.train.src.csv'):
    """Load one document per input line into the module-level ``mx`` and dump it.

    Every line of ``src_path`` is split on single spaces, each piece is
    stripped of surrounding whitespace, and the result is added to ``mx``
    with the 1-based line number as document id.  The matrix is then written
    to ``out_path`` as a tab-delimited file with a header row.

    :param src_path: text file to read; defaults to the original
        hard-coded training file.
    :param out_path: destination passed to ``mx.dump``; defaults to the
        original hard-coded output name.
    :returns: None.
    """
    # The with-block closes the file; iterating the handle streams lines
    # instead of materialising the whole file with readlines().
    with open(src_path, 'r') as fd:
        for count, line in enumerate(fd, 1):
            terms = [x.strip() for x in line.split(' ')]
            mx.add_doc(doc_id=str(count),
                       doc_terms=terms,
                       frequency=True,
                       do_padding=True)

            # Progress marker: count is the number of documents added so
            # far (previously it was printed one document early).
            if count % 1000 == 0:
                print(count)

    mx.dump(out_path, delimiter='\t', header=True)

Exemple #9
0
 def setUp(self):
     # Store a newly constructed Matrix on the fixture as ``self.m``.
     fresh_matrix = Matrix()
     self.m = fresh_matrix
Exemple #10
0
 def __init__(self):
     """Create the Matrix and the Preprocessor held by this object.

     The Preprocessor is configured with the pattern ``\\W+`` and with
     lower-casing and stemming switched on.
     """
     self._mx = Matrix()
     # Raw string: '\W' is not a recognised string escape, so the non-raw
     # form triggers an invalid-escape warning on modern Python.  The
     # pattern value itself is unchanged.
     self._prep = Preprocessor(pattern=r'\W+', lower=True, stem=True)