def test_stemmer_lower(self):
    p = Preprocessor(lower=True, stem=True)
    stemmed = p.stemmer('Running')
    if my_nltk:
        self.assertEqual(stemmed, 'run')
    else:
        self.assertTrue(False, 'NLTK is not installed')
def __validateResponse(self, response, queryTerms):
    from bs4 import BeautifulSoup
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix
        from irlib.metrics import Metrics
        prep = Preprocessor()
        mx = Matrix()
        metric = Metrics()
        # Tokenize the page text and add it to the term/document matrix
        terms = prep.ngram_tokenizer(text=soup.get_text())
        mx.add_doc(doc_id=response.url, doc_terms=terms,
                   frequency=True, do_padding=True)
        cnt = Counter()
        for word in terms:
            cnt[word] += 1
        # Render the frequencies of the query terms as a two-column table
        table = Texttable()
        table.set_cols_align(["l", "l"])
        table.set_cols_valign(["m", "m"])
        table.set_cols_width([40, 55])
        rows = [["Term", "Frequency"]]
        for word in sorted(cnt, key=cnt.get, reverse=True):
            if word.lower() in queryTerms.lower().split():
                rows.append([word, cnt[word]])
        table.add_rows(rows)
        print table.draw() + "\n"
    else:
        print "[-] Response for %s is %s " % (response.url, response.status_code)
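# Hedged standalone sketch of the table-drawing pattern above: count terms
# with collections.Counter and render the counts with texttable.Texttable,
# the same two libraries the method relies on; the sample terms are made up.
from collections import Counter
from texttable import Texttable

terms = ['search', 'engine', 'search', 'index']
cnt = Counter(terms)
table = Texttable()
table.set_cols_align(["l", "l"])
rows = [["Term", "Frequency"]]
for word in sorted(cnt, key=cnt.get, reverse=True):
    rows.append([word, cnt[word]])
table.add_rows(rows)
print table.draw()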
def validateResponse(self, response, queryTerms):
    from bs4 import BeautifulSoup
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix
        from irlib.metrics import Metrics
        prep = Preprocessor()
        mx = Matrix()
        metric = Metrics()
        terms = prep.ngram_tokenizer(text=soup.get_text())
        mx.add_doc(doc_id=response.url, doc_terms=terms,
                   frequency=True, do_padding=True)
        '''for doc in mx.docs:
            distance = metric.euclid_vectors(doc['terms'], q_vector)
            print distance
        '''
        cnt = Counter()
        for word in terms:
            cnt[word] += 1
        tableTerms = PrettyTable(["Term", "Frequency"])
        for word in sorted(cnt, key=cnt.get, reverse=True):
            if word.encode('ascii').lower() in \
                    queryTerms.encode('ascii').lower().split():
                tableTerms.add_row([word, cnt[word]])
        print tableTerms
    else:
        print "[-] Response for %s is %s " % (response.url, response.status_code)
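# Hedged sketch of the commented-out distance block above: q_vector is never
# defined in that method. One plausible construction (an assumption that
# mirrors the QA class later in this file) is to vectorize the query terms
# and rank documents by Euclidean distance; rank_by_distance is hypothetical.
from irlib.preprocessor import Preprocessor
from irlib.metrics import Metrics

def rank_by_distance(mx, queryTerms):
    prep = Preprocessor()
    metric = Metrics()
    q_terms = prep.ngram_tokenizer(text=queryTerms)
    q_vector = mx.query_to_vector(q_terms, frequency=False)
    ranked = []
    for doc in mx.docs:
        # Smaller Euclidean distance means the page is closer to the query
        ranked.append((metric.euclid_vectors(doc['terms'], q_vector), doc['id']))
    ranked.sort()
    return ranked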
def test_is_link(self):
    is_it = Preprocessor.is_link('hello world')
    self.assertEqual(is_it, False)
    is_it = Preprocessor.is_link('http://www.yahoo.com')
    self.assertEqual(is_it, True)
    is_it = Preprocessor.is_link('https://www.yahoo.com')
    self.assertEqual(is_it, True)
    is_it = Preprocessor.is_link('www.yahoo.com')
    self.assertEqual(is_it, True)
class QA:

    def __init__(self):
        self.file_name = 'qa.txt'
        self.qa_list = {}
        self.qa_id = 0
        self.prep = Preprocessor()
        self.mx = Matrix()
        self.metric = Metrics()

    def randomize(self, a):
        # Fill the vector with random 0/1 values in place
        for i in range(len(a)):
            a[i] = random.randint(0, 1)

    def readfile(self):
        # Parse alternating 'q:...' / 'a:...' lines into the term matrix
        fd = open(self.file_name, 'r')
        for line in fd.readlines():
            line = line.strip().lower().split(':')
            if len(line) != 2:
                continue
            elif line[0] == 'q':
                q_line = ' '.join(line[1:])
                self.qa_id += 1
                self.qa_list[self.qa_id] = {'q': q_line, 'a': ''}
                terms = self.prep.ngram_tokenizer(text=q_line)
                self.mx.add_doc(doc_id=self.qa_id, doc_terms=terms,
                                frequency=True, do_padding=True)
            elif line[0] == 'a':
                a_line = ' '.join(line[1:])
                self.qa_list[self.qa_id]['a'] = a_line
        #print 'Number of read questions and answers:', len(self.mx.docs)
        #print 'Number of read terms', len(self.mx.terms)

    def ask(self, q=''):
        # Answer with the stored answer of the nearest known question
        q_id = 0
        q_distance = 99999
        terms = self.prep.ngram_tokenizer(text=q)
        q_vector = self.mx.query_to_vector(terms, frequency=False)
        if sum(q_vector) == 0:
            # No overlap with known terms; fall back to a random vector
            self.randomize(q_vector)
        for doc in self.mx.docs:
            distance = self.metric.euclid_vectors(doc['terms'], q_vector)
            if distance < q_distance:
                q_distance = distance
                q_id = doc['id']
        print 'Tarek:', self.qa_list[q_id]['a']
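# Minimal driver for the QA class above; a hedged sketch, assuming qa.txt
# sits in the working directory and holds alternating 'q:...' / 'a:...'
# lines, which is the format readfile() parses.
if __name__ == '__main__':
    qa = QA()
    qa.readfile()
    while True:
        question = raw_input('You: ')
        if not question:
            break
        qa.ask(question)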
def readfiles(fold_path='all-folds/fold1/'):
    prep = Preprocessor()
    mx = Matrix()
    files = os.listdir(fold_path)
    for filename in files:
        fd = open('%s/%s' % (fold_path, filename), 'r')
        file_data = fd.read()
        terms = prep.ngram_tokenizer(text=file_data)
        mx.add_doc(doc_id=filename, doc_terms=terms,
                   frequency=True, do_padding=True)
    print 'Number of read documents:', len(mx.docs)
    print 'Number of read terms', len(mx.terms)
    #print mx.terms[0:5], mx.terms[-5:-1]
    print mx.terms
    print mx.docs
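# Hedged usage sketch: point readfiles() at a directory of plain-text
# documents; the default path below is the snippet's own and is assumed
# to exist relative to the working directory.
if __name__ == '__main__':
    readfiles(fold_path='all-folds/fold1/')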
def main():
    # Load configuration from file
    config = Configuration(
        config_file='/home/huma/Downloads/irlib-0.1.1/irlib/classify.conf')
    try:
        config.load_configuration()
        config_data = config.get_configuration()
    except:
        print("Error loading configuration file.")
        print("Classifier aborting.")
        raise
    # config.display_configuration()
    print(config)
    # sys.exit()
    myfolds = config.get_folds()
    correctness = 0
    # Preprocessor: tokenizer, stemmer, etc.
    prep_lower = config_data['lower']
    prep_stem = config_data['stem']
    prep_pos = config_data['pos']
    prep_ngram = config_data['ngram']
    prep = Preprocessor(pattern='\W+', lower=prep_lower, stem=prep_stem,
                        pos=prep_pos, ngram=prep_ngram)
    for myfold in myfolds:
        ev = Evaluation(config=config, fold=myfold)
        if config_data['classifier'] == 'rocchio':
            ml = Rocchio(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        elif config_data['classifier'] == 'knn':
            ml = KNN(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        else:
            ml = NaiveBayes(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        training(config, myfold, ml, prep)
        ml.do_padding()
        ml.calculate_training_data()
        # ml.display_idx()
        ml.diagnose()
        testing(config, myfold, ml, ev, prep)
        k = config_data['k']
        results = ev.calculate(review_spam=True, k=k)
        print('Accuracy for fold %d: %s' % (myfold, results))
        correctness += results
    print("Average accuracy for all folds:", correctness / len(myfolds))
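# A hedged refactoring sketch (not from the source): the if/elif classifier
# selection in main() can be table-driven. Rocchio, KNN and NaiveBayes are
# the same irlib classes main() already instantiates; make_classifier is a
# hypothetical helper name.
def make_classifier(config_data, myfold, config, ev):
    classifiers = {'rocchio': Rocchio, 'knn': KNN}
    cls = classifiers.get(config_data['classifier'], NaiveBayes)
    return cls(verbose=VERBOSE, fold=myfold, config=config, ev=ev)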
def test_term2ch(self):
    p = Preprocessor()
    charlist = p.term2ch('help')
    self.assertEqual(charlist, ['h', 'e', 'l', 'p'])
def test_tokenizer_lower(self):
    p = Preprocessor(lower=True, stem=False)
    tokens = p.tokenizer('This is IRLib')
    self.assertEqual(tokens, ['this', 'is', 'irlib'])
# Load the three modules:
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics
import difflib

# Create instances for their classes:
prep = Preprocessor()
mx = Matrix()
metric = Metrics()
q_vector = []


def generateMatrix():
    fd = open('./content_transfer_data/roto-sent-data.train-ONLYTAGS.src', 'r')
    count = 1
    for line in fd.readlines():
        terms = line.split(' ')
        terms = [x.strip() for x in terms]
        mx.add_doc(doc_id=str(count), doc_terms=terms,
                   frequency=True, do_padding=True)
        count += 1
        if count % 1000 == 0:
            print count
    mx.dump('IRmatrix.train.src.csv', delimiter='\t', header=True)
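# Hedged usage note: running this module builds the matrix from the
# project-specific .src file opened above and dumps it as a tab-separated
# CSV; both file paths are the snippet's own.
if __name__ == '__main__':
    generateMatrix()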
class Search:

    def __init__(self):
        self._mx = Matrix()
        self._prep = Preprocessor(pattern='\W+', lower=True, stem=True)

    def readfiles(self, fold_path='all-folds/fold1/'):
        # Index every file in fold_path, then pickle the matrix to disk
        ruta = os.path.split(sys.argv[0])
        abs = os.path.join(ruta[0], fold_path)
        files = os.listdir(abs)
        for filename in files:
            abs_arch = os.path.join(abs, filename)
            fd = open(abs_arch, 'r')
            file_data = fd.read()
            self.createMX(filename, file_data)
        print 'Number of read documents:', len(self._mx.docs)
        print 'Number of read terms', len(self._mx.terms)
        #print mx.terms[0:5], mx.terms[-5:-1]
        '''print mx.terms
        for doc in mx.docs:
            print doc'''
        self.saveMX(self._mx)
        print 'Process finished'

    def saveMX(self, mx):
        ruta = os.path.split(sys.argv[0])
        abs = os.path.join(ruta[0], "db/matrix.mx")
        filemx = open(abs, 'w')
        serializer = Pickler(filemx)
        serializer.dump(mx)
        print 'Matrix saved'

    def createMX(self, file_id, file_data, lenguaje='english'):
        # Remove stopwords before tokenizing and adding the document
        stop = stopwords.words(lenguaje)
        file = file_data.split(" ")
        content = [w for w in file if w.lower() not in stop]
        data = content.__str__()
        terms = self._prep.ngram_tokenizer(text=data)
        if len(terms) > 0:
            self._mx.add_doc(doc_id=file_id, doc_terms=terms,
                             frequency=True, do_padding=True)

    def search(self):
        # Load the pickled matrix and score it against the CLI query terms
        ruta = os.path.split(sys.argv[0])
        abs = os.path.join(ruta[0], "db/matrix.mx")
        filemx = open(abs, 'r')
        serializer = Unpickler(filemx)
        self._mx = serializer.load()
        cadena = sys.argv
        del cadena[0]
        del cadena[0]
        cade = cadena.__str__()
        cade = cade.lower()
        cad = self._prep.ngram_tokenizer(text=cade)
        resultado = list()
        for doc in self._mx.docs:
            vector = list()
            for q in cad:
                if q in self._mx.terms:
                    pos = self._mx.terms.index(q)
                    vector.append(doc['terms'][pos])
            resultado.append((doc['id'], vector))
        resultado.sort(lambda a, b: self.__Deuclidiana(a[1]) -
                       self.__Deuclidiana(b[1]), reverse=True)
        print resultado

    def __Deuclidiana(self, vector):
        # Squared Euclidean length of the match vector
        dist = 0
        for v in vector:
            dist += v ** 2
        return dist.__int__()

    def main(self):
        #self.readfiles()
        self.search()
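# Hedged usage sketch: index once with readfiles() (which pickles the matrix
# to db/matrix.mx), then query from the command line. search() discards
# sys.argv[0] and sys.argv[1], so an invocation such as
# `python search.py search term1 term2` is assumed; the script name and the
# terms are made up.
if __name__ == '__main__':
    Search().main()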
def test_3gram_tokenizer(self):
    p = Preprocessor(lower=False, stem=False, ngram=3)
    returned_tokens = p.ngram_tokenizer('how do you do?')
    expected_tokens = ['how do you', 'do you do']
    self.assertEqual(returned_tokens, expected_tokens)
def test_is_mention(self):
    is_it = Preprocessor.is_mention('@twitter')
    self.assertEqual(is_it, True)
    is_it = Preprocessor.is_mention('#twitter')
    self.assertEqual(is_it, False)
def test_is_hashtag(self):
    is_it = Preprocessor.is_hashtag('@twitter')
    self.assertEqual(is_it, False)
    is_it = Preprocessor.is_hashtag('#twitter')
    self.assertEqual(is_it, True)
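# Hedged sketch combining the static helpers tested above: drop links,
# mentions and hashtags from a token list before indexing. The sample
# tweet and the filtering loop are made up; only the three is_* helpers
# come from irlib's Preprocessor.
from irlib.preprocessor import Preprocessor

tokens = 'check this http://t.co/xyz via @someone #irlib'.split()
plain = [t for t in tokens
         if not (Preprocessor.is_link(t) or
                 Preprocessor.is_mention(t) or
                 Preprocessor.is_hashtag(t))]
# plain == ['check', 'this', 'via']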