def validateResponse(self, response, queryTerms):
    """Tokenize a successful HTTP response and print how often each
    query term occurs in the page text.

    Args:
        response: requests-style response object exposing ``status_code``,
            ``text`` and ``url``.
        queryTerms: whitespace-separated query string; matching is
            case-insensitive.
    """
    from bs4 import BeautifulSoup
    if response.status_code == 200:
        # Name the parser explicitly so bs4 does not warn and results do
        # not depend on which parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix
        prep = Preprocessor()
        mx = Matrix()
        terms = prep.ngram_tokenizer(text=soup.get_text())
        mx.add_doc(doc_id=response.url, doc_terms=terms,
                   frequency=True, do_padding=True)
        # Counter(iterable) counts in C; no manual loop needed.
        cnt = Counter(terms)
        tableTerms = PrettyTable(["Term", "Frequency"])
        # Plain .lower() comparison: the original .encode('ascii') raised
        # UnicodeEncodeError on any non-ASCII token (the sibling
        # __validateResponse already compares this way).
        query_words = queryTerms.lower().split()
        for word in sorted(cnt, key=cnt.get, reverse=True):
            if word.lower() in query_words:
                tableTerms.add_row([word, cnt[word]])
        print(tableTerms)
    else:
        print("[-] Response for %s is %s " % (response.url, response.status_code))
def __validateResponse(self, response, queryTerms):
    """Tokenize a successful HTTP response and draw a two-column table
    (term, frequency) for every term that appears in ``queryTerms``.

    Args:
        response: requests-style response object exposing ``status_code``,
            ``text`` and ``url``.
        queryTerms: whitespace-separated query string; matching is
            case-insensitive.
    """
    from bs4 import BeautifulSoup
    if response.status_code == 200:
        # Name the parser explicitly so bs4 does not warn and results do
        # not depend on which parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix
        from irlib.metrics import Metrics
        prep = Preprocessor()
        mx = Matrix()
        metric = Metrics()
        terms = prep.ngram_tokenizer(text=soup.get_text())
        mx.add_doc(doc_id=response.url, doc_terms=terms,
                   frequency=True, do_padding=True)
        # Counter(iterable) counts in C; no manual loop needed.
        cnt = Counter(terms)
        table = Texttable()
        table.set_cols_align(["l", "l"])
        table.set_cols_valign(["m", "m"])
        table.set_cols_width([40, 55])
        rows = [["Term", "Frequency"]]
        query_words = queryTerms.lower().split()
        for word in sorted(cnt, key=cnt.get, reverse=True):
            if word.lower() in query_words:
                rows.append([word, cnt[word]])
        table.add_rows(rows)
        print(table.draw() + "\n")
    else:
        print("[-] Response for %s is %s " % (response.url, response.status_code))
def test_stemmer_lower(self):
    """Stemming with lower-casing should reduce 'Running' to 'run'."""
    p = Preprocessor(lower=True, stem=True)
    stemmed = p.stemmer('Running')
    if my_nltk:
        self.assertEqual(stemmed, 'run')
    else:
        # self.fail() is the idiomatic way to force a failure with a
        # message, rather than asserting a constant False.
        self.fail('NLTK is not installed')
def main(config_file='/home/huma/Downloads/irlib-0.1.1/irlib/classify.conf'):
    """Run cross-validated classification as configured in *config_file*.

    For every fold, build the classifier named in the configuration
    (rocchio / knn / naive bayes by default), train, test, and print
    per-fold and average accuracy.

    Args:
        config_file: path to the classifier configuration file. Defaults
            to the original hard-coded location for backward compatibility.
    """
    # Load configuration from file
    config = Configuration(config_file=config_file)
    try:
        config.load_configuration()
        config_data = config.get_configuration()
    except Exception:
        # Narrowed from a bare `except:` (which would also swallow
        # KeyboardInterrupt/SystemExit); still re-raised after logging.
        print("Error loading configuration file.")
        print("Classifier aborting.")
        raise
    # config.display_configuration()
    print(config)
    myfolds = config.get_folds()
    correctness = 0
    # Preprocessor: tokenizer, stemmer, etc.
    prep_lower = config_data['lower']
    prep_stem = config_data['stem']
    prep_pos = config_data['pos']
    prep_ngram = config_data['ngram']
    # Raw string for the regex: '\W' in a plain literal is an invalid
    # escape sequence in modern Python.
    prep = Preprocessor(pattern=r'\W+', lower=prep_lower, stem=prep_stem,
                        pos=prep_pos, ngram=prep_ngram)
    for myfold in myfolds:
        ev = Evaluation(config=config, fold=myfold)
        if config_data['classifier'] == 'rocchio':
            ml = Rocchio(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        elif config_data['classifier'] == 'knn':
            ml = KNN(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        else:
            ml = NaiveBayes(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        training(config, myfold, ml, prep)
        ml.do_padding()
        ml.calculate_training_data()
        # r.display_idx()
        ml.diagnose()
        testing(config, myfold, ml, ev, prep)
        k = config_data['k']
        results = ev.calculate(review_spam=True, k=k)
        print('Accuracy for fold %d: %s' % (myfold, results))
        correctness += results
    print("Average accuracy for all folds:", correctness / len(myfolds))
def readfiles(fold_path='all-folds/fold1/'):
    """Load every file under *fold_path* into a term-document matrix and
    print summary statistics.

    Args:
        fold_path: directory whose files are read; each filename becomes
            a document id.
    """
    prep = Preprocessor()
    mx = Matrix()
    for filename in os.listdir(fold_path):
        # `with` guarantees the descriptor is closed; the original leaked
        # one open file per document.
        with open(os.path.join(fold_path, filename), 'r') as fd:
            file_data = fd.read()
        terms = prep.ngram_tokenizer(text=file_data)
        mx.add_doc(doc_id=filename, doc_terms=terms,
                   frequency=True, do_padding=True)
    # print() calls for consistency with main(); single string+value
    # output matches the old py2 statements.
    print('Number of read documents:', len(mx.docs))
    print('Number of read terms', len(mx.terms))
    print(mx.terms)
    print(mx.docs)
# Load the three modules:
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics
import difflib

# Create instances for their classes:
prep = Preprocessor()
mx = Matrix()
metric = Metrics()
q_vector = []


def generateMatrix(src_path='./content_transfer_data/roto-sent-data.train-ONLYTAGS.src',
                   out_path='IRmatrix.train.src.csv'):
    """Populate the module-level matrix ``mx`` from a whitespace-tokenized
    source file and dump it as a tab-separated CSV.

    Each line of *src_path* becomes one document whose id is its 1-based
    line number. Prints a progress marker every 1000 documents.

    Args:
        src_path: training source file, one document per line. Defaults to
            the original hard-coded path for backward compatibility.
        out_path: destination for ``mx.dump``.
    """
    # Iterate the file object directly (readlines() materialized the whole
    # file) and close it deterministically with `with`.
    with open(src_path, 'r') as fd:
        for count, line in enumerate(fd, start=1):
            terms = [tok.strip() for tok in line.split(' ')]
            mx.add_doc(doc_id=str(count), doc_terms=terms,
                       frequency=True, do_padding=True)
            # Progress marker; the original incremented before testing and
            # therefore fired one document early.
            if count % 1000 == 0:
                print(count)
    mx.dump(out_path, delimiter='\t', header=True)
def __init__(self):
    """Create an empty term-document matrix and a preprocessor that
    lower-cases, stems, and splits on runs of non-word characters."""
    self._mx = Matrix()
    # Raw string: '\W' in a plain literal is an invalid escape sequence
    # (DeprecationWarning today, a SyntaxError in future Python).
    self._prep = Preprocessor(pattern=r'\W+', lower=True, stem=True)
def test_term2ch(self):
    """term2ch should explode a term into its individual characters."""
    preprocessor = Preprocessor()
    result = preprocessor.term2ch('help')
    self.assertEqual(result, list('help'))
def test_3gram_tokenizer(self):
    """A 3-gram tokenizer over a four-token sentence yields two trigrams."""
    preprocessor = Preprocessor(lower=False, stem=False, ngram=3)
    actual = preprocessor.ngram_tokenizer('how do you do?')
    expected = ['how do you', 'do you do']
    self.assertEqual(actual, expected)
def test_tokenizer_lower(self):
    """tokenizer with lower=True should lower-case every token."""
    preprocessor = Preprocessor(lower=True, stem=False)
    result = preprocessor.tokenizer('This is IRLib')
    self.assertEqual(result, 'this is irlib'.split())