def validateResponse(self, response, queryTerms):
    """Print how often each query term occurs in the body of *response*.

    For a 200 response the body is parsed with BeautifulSoup, its text is
    tokenized with ``Preprocessor.ngram_tokenizer`` and the tokens are
    counted.  Tokens that match, case-insensitively, one of the
    whitespace-separated words of *queryTerms* are printed as a
    ``PrettyTable`` with "Term" and "Frequency" columns, most frequent
    first.  For any other status code a single diagnostic line is printed.

    Args:
        response: object exposing ``status_code``, ``text`` and ``url``
            (presumably a ``requests.Response`` -- confirm against callers).
        queryTerms: string of whitespace-separated search terms.

    Returns:
        None; all output goes to stdout.
    """
    if response.status_code != 200:
        print("[-] Response for %s is %s "
              % (response.url, response.status_code))
        return

    # Imported only once a body actually has to be analysed.
    from bs4 import BeautifulSoup
    from irlib.matrix import Matrix
    from irlib.preprocessor import Preprocessor

    # NOTE(review): no parser is named here, so the choice is left to bs4;
    # consider passing one explicitly once the intended parser is confirmed.
    soup = BeautifulSoup(response.text)
    terms = Preprocessor().ngram_tokenizer(text=soup.get_text())

    # NOTE(review): this matrix is local and never read again in this
    # method; the call is kept in case add_doc is relied on elsewhere.
    mx = Matrix()
    mx.add_doc(doc_id=response.url, doc_terms=terms,
               frequency=True, do_padding=True)

    cnt = Counter(terms)

    # Compare lower-cased text directly: forcing both sides through
    # .encode('ascii') raised on any non-ASCII token or query term.
    # The query is split once instead of once per distinct token.
    wanted = set(queryTerms.lower().split())

    tableTerms = PrettyTable(["Term", "Frequency"])
    for word in sorted(cnt, key=cnt.get, reverse=True):
        if word.lower() in wanted:
            tableTerms.add_row([word, cnt[word]])
    print(tableTerms)
def __validateResponse(self, response, queryTerms):
    """Print a table of query-term frequencies found in *response*.

    For a 200 response the body is parsed with BeautifulSoup, its text is
    tokenized with ``Preprocessor.ngram_tokenizer`` and the tokens are
    counted.  Tokens that match, case-insensitively, one of the
    whitespace-separated words of *queryTerms* are drawn as a two-column
    ``Texttable`` ("Term", "Frequency"), most frequent first, followed by
    a blank line.  For any other status code a single diagnostic line is
    printed.

    Args:
        response: object exposing ``status_code``, ``text`` and ``url``
            (presumably a ``requests.Response`` -- confirm against callers).
        queryTerms: string of whitespace-separated search terms.

    Returns:
        None; all output goes to stdout.
    """
    if response.status_code != 200:
        print("[-] Response for %s is %s "
              % (response.url, response.status_code))
        return

    # Imported only once a body actually has to be analysed.
    from bs4 import BeautifulSoup
    from irlib.matrix import Matrix
    from irlib.preprocessor import Preprocessor

    # NOTE(review): no parser is named here, so the choice is left to bs4;
    # consider passing one explicitly once the intended parser is confirmed.
    soup = BeautifulSoup(response.text)
    terms = Preprocessor().ngram_tokenizer(text=soup.get_text())

    # NOTE(review): this matrix is local and never read again in this
    # method; the call is kept in case add_doc is relied on elsewhere.
    mx = Matrix()
    mx.add_doc(doc_id=response.url, doc_terms=terms,
               frequency=True, do_padding=True)

    cnt = Counter(terms)

    # Split the query once; the original re-split it for every token.
    wanted = set(queryTerms.lower().split())

    table = Texttable()
    table.set_cols_align(["l", "l"])
    table.set_cols_valign(["m", "m"])
    table.set_cols_width([40, 55])

    # First row is the header, followed by one row per matching token.
    rows = [["Term", "Frequency"]]
    rows.extend(
        [word, cnt[word]]
        for word in sorted(cnt, key=cnt.get, reverse=True)
        if word.lower() in wanted
    )
    table.add_rows(rows)
    print(table.draw() + "\n")
# Load the three modules:
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics
import difflib

# Create instances for their classes:
prep = Preprocessor()
mx = Matrix()
metric = Metrics()
q_vector = []


def generateMatrix(
        src_path='./content_transfer_data/roto-sent-data.train-ONLYTAGS.src',
        out_path='IRmatrix.train.src.csv'):
    """Fill the module-level matrix ``mx`` from a text file and dump it.

    Every line of *src_path* becomes one document.  The line is split on
    single spaces and each piece is stripped of surrounding whitespace.
    Documents are numbered from 1 in file order and that number, as a
    string, is used as the ``doc_id``.  After all lines are added the
    matrix is written to *out_path* as tab-delimited text with a header.

    Args:
        src_path: file to read, one document per line.  Defaults to the
            path that was previously hard-coded.
        out_path: destination passed to ``mx.dump``.  Defaults to the
            file name that was previously hard-coded.

    Returns:
        None.
    """
    # The context manager closes the handle even if add_doc raises; the
    # original never closed it.  Iterating the file also avoids loading
    # every line into memory at once.
    with open(src_path, 'r') as fd:
        for doc_no, line in enumerate(fd, 1):
            terms = [piece.strip() for piece in line.split(' ')]
            mx.add_doc(doc_id=str(doc_no), doc_terms=terms,
                       frequency=True, do_padding=True)

            # Progress output.  The printed value is the number of the
            # NEXT document (one more than the count added so far), which
            # matches what the original counter printed.
            next_no = doc_no + 1
            if next_no % 1000 == 0:
                print(next_no)

    mx.dump(out_path, delimiter='\t', header=True)
def setUp(self):
    """Create a fresh ``Metrics`` instance and store it on ``self.m``.

    NOTE(review): presumably a unittest-style fixture hook; the enclosing
    class is not visible here -- confirm.
    """
    self.m = Metrics()