Example #1
0
    def validateResponse(self, response, queryTerms):
        from bs4 import BeautifulSoup

        if response.status_code == 200:
            soup = BeautifulSoup(response.text)
            from irlib.preprocessor import Preprocessor
            from irlib.matrix import Matrix
            from irlib.metrics import Metrics
            prep = Preprocessor()
            mx = Matrix()
            metric = Metrics()
            terms = prep.ngram_tokenizer(text=soup.get_text())
            mx.add_doc(doc_id=response.url,
                       doc_terms=terms,
                       frequency=True,
                       do_padding=True)
            '''for doc in mx.docs:
                distance = metric.euclid_vectors(doc['terms'], q_vector)
                print distance
            '''
            cnt = Counter()
            for word in terms:
                cnt[word] += 1
            tableTerms = PrettyTable(["Term", "Frequency"])
            for word in sorted(cnt, key=cnt.get, reverse=True):
                if word.encode('ascii').lower() in queryTerms.encode(
                        'ascii').lower().split():
                    tableTerms.add_row([word, cnt[word]])
            print tableTerms
        else:
            print "[-] Response for %s is %s " % (response.url,
                                                  response.status_code)
Example #2
0
    def __validateResponse(self, response, queryTerms):
        from bs4 import BeautifulSoup

        if response.status_code == 200:
            soup = BeautifulSoup(response.text)
            from irlib.preprocessor import Preprocessor
            from irlib.matrix import Matrix
            from irlib.metrics import Metrics
            prep = Preprocessor()
            mx = Matrix()
            metric = Metrics()
            terms = prep.ngram_tokenizer(text=soup.get_text())
            mx.add_doc(doc_id=response.url,
                       doc_terms=terms,
                       frequency=True,
                       do_padding=True)
            cnt = Counter()

            for word in terms:
                cnt[word] += 1
            table = Texttable()
            table.set_cols_align(["l", "l"])
            table.set_cols_valign(["m", "m"])
            table.set_cols_width([40, 55])

            rows = [["Term", "Frequency"]]
            for word in sorted(cnt, key=cnt.get, reverse=True):
                if word.lower() in queryTerms.lower().split():
                    rows.append([word, cnt[word]])
            table.add_rows(rows)
            print table.draw() + "\n"

        else:
            print "[-] Response for %s is %s " % (response.url,
                                                  response.status_code)
Example #3
0
# Load the three modules:
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics
import difflib

# Create instances for their classes:
prep = Preprocessor()
mx = Matrix()
metric = Metrics()
q_vector = []


def generateMatrix():
    fd = open('./content_transfer_data/roto-sent-data.train-ONLYTAGS.src', 'r')
    count = 1
    for line in fd.readlines():
        terms = line.split(' ')
        terms = [x.strip() for x in terms]
        mx.add_doc(doc_id=str(count),
                   doc_terms=terms,
                   frequency=True,
                   do_padding=True)

        count += 1
        if count % 1000 == 0:
            print count

    mx.dump('IRmatrix.train.src.csv', delimiter='\t', header=True)

Example #4
0
 def setUp(self):
     self.m = Metrics()