def clean(text):
    """Returns a list of words from text, removing unnecessary punctuation and stopwords."""
    from stopwords import stopwords
    import string

    # trimming stopwords:
    stops = stopwords()
    text = text.split()
    ret = []
    for word in text:
        if (word not in stops) and (len(word) > 1):
            ret.append(word)

    # trimming punctuation:
    punc = string.punctuation
    for i in range(len(ret)):
        og = ret[i]
        if og[0] in punc:
            og = og[1:]
        if og[-1] in punc:
            og = og[:-1]
        ret[i] = og
    return ret
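# A minimal usage sketch for clean(); it assumes the project's local stopwords
# module (imported inside the function) is importable. The sample sentence and
# the expected output are illustrative only.
sample = "the quick, brown fox jumps over the lazy dog!"
print(clean(sample))
# With a typical English stopword list this prints something like:
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']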
def index(request):
    global swords
    if len(swords) == 0:
        swords = set(stopwords("cache"))
        swords.add("rlq")
        swords.add("u")
        swords.add("wp")
        print("stopwords fixed!")
    inCache = IndexedUrl.objects.all()
    return render(request, 'query/index.html', {'inCache': inCache})
def __init__(self):
    '''
    self.saver : Pickles the object onto the disk.
    self.ps    : Porter stemmer class object, required to get the stem of a word.
    self.st    : Class object to check for stop words.
    '''
    self.saver = save_object()
    self.ps = PorterStemmer()
    self.st = stopwords()
    self.m_ds = {}
    self.models = {}
    self.m_mod = model()
def main(csv_file=None, number_of_articles=None):
    """
    Main method.

    Purpose: Aggregator and a central controller to control all the modules.

    Working:
    >> main(csv_file='~/suyash/Sample.csv')
    """
    working_string = csv_to_str.csv_2_str(csv_file=csv_file)
    stopword_set = stopwords.stopwords()
    print(nltk_stemmer.word_tokenizer(
        string_to_tokenize=working_string,
        stopset=stopword_set,
        top_number_of_articles=number_of_articles))
def search(path_to_index, queries):
    a = os.path.join(path_to_index, 'inverted_index')
    from stemmer import stemmer
    from stopwords import stopwords
    from search_handler import search

    # Note: the local names below deliberately shadow the imported classes.
    stemmer = stemmer()
    stopwords = stopwords()
    search = search(a, stemmer, stopwords)

    outputs = []
    for query in queries:
        the_result = search.search(query)
        if the_result is None:
            outputs.append(['.'] * 10)
        else:
            outputs.append(the_result)
    return outputs
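# A hedged driver for search() above: the index directory and the queries are
# illustrative assumptions, and the local stemmer, stopwords and search_handler
# modules must be importable for this to run.
sample_queries = ["machine learning", "title:Gandhi category:history"]
results = search("/data/wiki_index", sample_queries)
for query, result in zip(sample_queries, results):
    print(query, "->", result)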
def search(request):
    global swords
    if len(swords) == 0:
        swords = set(stopwords("cache"))
        swords.add("rlq")
        swords.add("u")
        swords.add("WP")
        print("stopwords fixed!")
    inCache = IndexedUrl.objects.all()
    backupnumDocs = 0
    backup = []
    qaug = ""
    try:  # if True:
        q = request.GET.get('q', None)
        try:
            k = int(request.GET.get('k', None))
        except:
            k = 0
        print("searching ", q, ", ", k)
        rankedfiles = SearchFiles(q)
        if k != 0:
            for docID in rankedfiles:
                backup.append(Docs.objects.get(num=int(docID)))
            backupnumDocs = len(backup)
            qaug = queryenhancement(q, swords, k, rankedfiles, "cache", inCache[0].number)
            # print qaug
            rankedfiles = SearchFiles(q + qaug)
        url_title = []
        for docID in rankedfiles:
            url_title.append(Docs.objects.get(num=int(docID)))
        numDocs = len(url_title)
        return render(request, 'query/present.html', {
            'docs': url_title,
            'q': q,
            'qaug': qaug,
            'numDocs': numDocs,
            'inCache': inCache,
            'backup': backup,
            'backupnumDocs': backupnumDocs
        })
    except:
        return render(request, 'query/error.html', {'inCache': inCache})
def fix(request):
    global swords
    print("start fixing")
    swords = set(stopwords("cache"))
    swords.add("rlq")
    swords.add("u")
    swords.add("wp")
    fp = open("stopwords.txt", "w")
    fp.write("<table>\n<tr>\n")
    i = 0
    for w in swords:
        fp.write(" <td> %s </td> " % w)
        i += 1
        if i % 8 == 0:
            fp.write("</tr>\n<tr>")
    fp.write("</tr>\n</table>")
    fp.close()
    return HttpResponse("StopWords fixed!")
def Tokens(filename):
    """Tokenize the raw text, remove stopwords, and return the list of tokens."""
    # Strip whitespace and blank lines from the text
    with open(filename, encoding='utf-8') as f_obj:
        lines = f_obj.readlines()
    contents = ''
    for line in lines:
        contents = contents + line.strip()

    # Tokenize with jieba
    text = jieba.lcut(contents)

    # Load the stopword list
    stopword_list = stopwords.stopwords()

    # Remove stopwords
    seg = []
    for word in text:
        if word not in stopword_list:
            seg.append(word)
    return seg
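# Brief usage sketch for Tokens(); the file path is a made-up assumption, and
# jieba plus the local stopwords module must be installed/importable.
tokens = Tokens('data/article.txt')
print(len(tokens))
print(tokens[:20])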
def loginFormHandling():
    from stemmer import stemmer
    from stopwords import stopwords
    from search_handler import search
    stemmer = stemmer()
    stopwords = stopwords()
    search = search("path", stemmer, stopwords)

    data = request.form
    query = request.form['query']
    flag = 0
    if data['title'] != '':
        flag = 1
        query += " title:"
        query += data['title']
    if data['infobox'] != '':
        flag = 1
        query += " infobox:"
        query += data['infobox']
    if data['references'] != '':
        flag = 1
        query += " ref:"
        query += data['references']
    if data['category'] != '':
        flag = 1
        query += " category:"
        query += data['category']
    if data['links'] != '':
        flag = 1
        query += " links:"
        query += data['links']
    if data['body'] != '':
        flag = 1
        query += " body:"
        query += data['body']
    # if (flag == 1):
    #     r = len(request.form['query'])
    #     query = query[r:]
    print(query)
    the_result = search.search(query)
    return render_template('results.html', query=query, results=the_result)
def lambda_handler(event, context):
    book = event['queryStringParameters']['book']
    query = event['queryStringParameters']['query']

    s3 = boto3.resource('s3')
    wordMap = pickle.loads(
        s3.Bucket("pdftags").Object(book + ".pickle").get()['Body'].read())
    # with BytesIO() as data:
    #     s3.Bucket("pdftags").download_fileobj(book + ".pkl", data)
    #     data.seek(0)  # move back to the beginning after writing
    #     wordMap = pickle.load(data)

    keywords = query.split()
    match = {}
    for keyword in keywords:
        keyword = keyword.lower()
        if keyword in stopwords.stopwords().words:
            continue
        for head in wordMap:
            wordMap[head].text = wordMap[head].text.lower()
            headWords = wordMap[head].text.split()
            if keyword in wordMap[head].text:
                match[head] = match.get(head, 0) + 1

    sort_match = sorted(match.items(), key=lambda x: wordMap[x[0]].fontSize, reverse=True)
    sort_match = sorted(sort_match, key=lambda x: x[1], reverse=True)

    return {
        'statusCode': 200,
        'headers': {'Content-Type': 'application/json'},
        'body': json.dumps(sort_match)
    }
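# Local-test sketch for lambda_handler(). The 'pdftags' bucket comes from the
# code above, but the book key and query string are made-up values; real AWS
# credentials plus the pickled word map are required for this to actually run.
sample_event = {
    'queryStringParameters': {
        'book': 'sample-textbook',
        'query': 'dynamic programming'
    }
}
print(lambda_handler(sample_event, context=None))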
def crawl(request):
    inCache = IndexedUrl.objects.all()
    start = request.POST['start']
    number = int(request.POST['number'])
    domain = request.POST['domain']
    global swords
    try:
        fHash = BuildSearchEngine(start, number, domain)
        for q in IndexedUrl.objects.all():
            q.delete()
        for q in Docs.objects.all():
            q.delete()
        '''for q in StopWords.objects.all():
            q.delete()'''
        log = IndexedUrl.objects.create(start=start, number=number, domain=domain,
                                        timestamp=timezone.now())
        log.save()
        for num in fHash:
            url, title = fHash[num]
            doc = Docs.objects.create(num=num, url=url, title=title)
            doc.save()
        swords = set(stopwords("cache"))
        '''for w in stopwords("cache"):
            word = StopWords.objects.create(word=w)
            word.save()'''
        # print fHash
        return render(request, 'query/results.html', {
            'start': start,
            'number': num + 1,
            'domain': domain
        })
    except:
        return render(request, 'query/error.html', {'inCache': inCache})
import ssl
import urllib.request

from bs4 import BeautifulSoup

# Note: `st` below is the project's local helper module (stopword / word-frequency
# utilities); its import is not shown in this fragment.

ssl._create_default_https_context = ssl._create_unverified_context

url = 'https://www.bbc.com/news/uk-51991887'
stopword = []
freq = []
word = []

response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text(strip=True)

fullwordlist = st.stripNonAlphaNum(text)
stopword = st.stopwords()
# This method still has some problems, so the call is left commented out for now:
# wordList = st.removeStopWord(fullwordlist, stopword)
wordlist = st.remove(fullwordlist, stopword)
dictionary = st.wordListToFreqDict(wordlist)
sorteddict = st.sortFreqDict(dictionary)

for t in sorteddict:
    freq.append(t[0])
for i in sorteddict:
    word.append(i[1])
print(sorteddict)
# -*- coding: utf-8 -*-
from thinkbayes import Pmf
import string
from stopwords import stopwords

# initialize constructor
pmf = Pmf()

# generate word list from text
fname = 'data/aristotle.txt'
words = []
stopwords = stopwords('data/stopwords.txt')
with open(fname) as f:
    for line in f:
        processed = line.strip().translate(
            str.maketrans('', '', string.punctuation)).split()
        for word in processed:
            if word.lower() in stopwords:
                continue
            words.append(word.lower())

# generate unique word set
u_words = set(words)

# count words
for word in words:
    pmf.Incr(word, 1)

# normalize word frequencies as probabilities
pmf.Normalize()
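# A small follow-up sketch: after Normalize(), the Pmf can be inspected for the
# most probable words. Items() and Prob() are standard thinkbayes Pmf methods;
# the actual ranking naturally depends on the corpus and stopword list.
top_words = sorted(pmf.Items(), key=lambda kv: kv[1], reverse=True)[:10]
for w, p in top_words:
    print(w, round(p, 4))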
term = Term(text=term_text)
session.add(term)

frequency = session.query(TermFrequency).\
    filter(TermFrequency.document == doc, TermFrequency.term == term).first()
if not frequency:
    frequency = TermFrequency(document=doc, term=term)
    session.add(frequency)
else:
    frequency.frequency += 1
session.flush()
session.commit()


if __name__ == '__main__':
    stopword_list = stopwords()

    def document_terms():
        for filepath, content, date in documents():
            print(filepath)
            extension = path.splitext(filepath)[1]
            words = None
            title = filename(filepath)
            if extension in ['.html', '.htm', '.jspy']:
                html_title, words, links = tokenize_html(content)
                html_title = html_title.strip()
                if html_title:
                    title = html_title
# - Site HTML

# Imports
from re import findall
from os import listdir
from collections import defaultdict, Counter
from math import sqrt
from datetime import datetime
from pdb import set_trace

from pymongo import MongoClient
from stopwords import stopwords
from bs4 import BeautifulSoup

# setup
stopwords = stopwords()
directory = "/Users/mruttley/Documents/2015-04-22 AdGroups/Bucketerer/data_crunching/alexa_data/"
html_directory = "/Users/mruttley/Documents/2015-05-11 StatCounter HTML Crawler/html/"
ranking_directory = "/Users/mruttley/Documents/2015-04-22 AdGroups/Bucketerer/data_crunching/ranking_files/"
verbose = False


# Auxiliary functionality

def get_ranking():
    """Gets a ranking of domains"""
    timestamp = datetime.strftime(datetime.now(), "%Y-%m-%d")
    filename = timestamp + "top-1m.csv"
    domains = []
    with open(ranking_directory + filename) as f:
        for line in f:
def __pipeline(classifier, train, test, train_y, test_y, scoring, task='train'):
    cls_name = classifier.__class__.__name__
    print(f'##### {cls_name}:{task} #####')
    timer = Timer()
    timer.start()

    pipe = Pipeline([
        ('features', FeatureUnion([
            ('tfidf', TransformPipeline([
                ('extract', ColumnExtractor(col='text', as_type=str)),
                ('vector', TfidfVectorizer(analyzer='word', tokenizer=__tokenizer,
                                           preprocessor=__preprocessor, stop_words=stopwords(),
                                           min_df=10, sublinear_tf=True))])),
            ('topic', TransformPipeline([
                ('extract', ColumnExtractor(col='text', as_type=str)),
                ('vector', CountVectorizer(analyzer='word', tokenizer=__tokenizer,
                                           preprocessor=__preprocessor, stop_words=stopwords(),
                                           min_df=10)),
                ('lda', TransformLatentDirichletAllocation(
                    n_components=8, max_iter=3, learning_method='online',
                    learning_offset=10., n_jobs=4, random_state=__random_state))])),
            ('author', PrefixColumnExtractor(prefix='author_', as_type=int)),
            ('token_count', TransformPipeline([
                ('extract', ColumnExtractor(col='token_count', as_type=int, as_matrix=True)),
                ('scale', MaxAbsScaler())])),
            ('token_length_mean', TransformPipeline([
                ('extract', ColumnExtractor(col='token_length_mean', as_type=float, as_matrix=True)),
                ('scale', MaxAbsScaler())])),
            ('char_count', TransformPipeline([
                ('extract', ColumnExtractor(col='char_count', as_type=int, as_matrix=True)),
                ('scale', MaxAbsScaler())])),
            ('digit_char_ratio', ColumnExtractor(col='digit_char_ratio', as_type=float, as_matrix=True)),
            ('alpha_char_ratio', ColumnExtractor(col='alpha_char_ratio', as_type=float, as_matrix=True)),
            ('upper_char_ratio', ColumnExtractor(col='upper_char_ratio', as_type=float, as_matrix=True)),
            ('space_char_ratio', ColumnExtractor(col='space_char_ratio', as_type=float, as_matrix=True)),
            ('punctuation_char_ratio', ColumnExtractor(col='punctuation_char_ratio', as_type=float, as_matrix=True))
        ])),
        ('model', classifier)
    ])
    # print(f'pipeline steps={repr(pipe.steps)}')

    if task == 'test':
        __train(pipe, train, train_y)
        __test(pipe, test, test_y)
    elif task == 'train':
        __train(pipe, train, train_y)
    elif task == 'tune':
        param_grid = {
            # 'features__token_length_mean__scale': [None, MaxAbsScaler()]
            # 'features__tfidf__vector__min_df': [1, 10, 100]
            # 'features__topic__lda__n_components': [8, 16, 32]
            # 'features__topic__lda__max_iter': [3, 6, 10]
            # 'model__penalty': ['l2', 'l1', 'elasticnet']
            # 'model__alpha': [0.0001, 0.001, 0.01]
            # 'model__loss': ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
            'model__max_iter': [1000, 40000]
        }
        __grid_search(pipe, param_grid, train, train_y, scoring=scoring)
    elif task == 'validate':
        __validate(pipe, train, train_y, scoring)
    else:
        raise ValueError(f'Invalid value: task={task}')

    timer.stop()
    print(f'__pipeline {cls_name}:{task} took {seconds_to_hhmmss(timer.elapsed)}')
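# A hedged invocation sketch for __pipeline(). The classifier and the
# train/test frames are assumptions for illustration: the commented-out grid
# keys above hint at an SGD-style linear model, but any scikit-learn estimator
# with a compatible interface would slot in, and the DataFrames (train_df,
# test_df) must contain the columns the ColumnExtractor steps expect.
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(max_iter=1000, random_state=42)
__pipeline(clf, train_df, test_df, train_y, test_y,
           scoring='f1_macro', task='validate')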