def content_based_calculator(self, stringofwords):
    # Split the query on non-word characters and lowercase each term
    divider = re.compile('\\W*')
    res = [x.lower() for x in divider.split(stringofwords) if x != '']
    content_scores = {}
    paperid_index = {}
    content_out = {}
    for word in res:
        if word not in self.wordlocations:
            continue
        for papid in self.wordlocations[word]:
            # Score a paper by how often this term appears in it
            papid_score = len(self.wordlocations[word][papid])
            if papid not in content_scores:
                content_scores[papid] = papid_score
                paperid_index[papid] = 1
            else:
                # Multiply per-term frequencies together
                content_scores[papid] = content_scores[papid] * papid_score
                paperid_index[papid] += 1
    # Keep only papers that matched every query term (AND semantics)
    for pid in paperid_index:
        if paperid_index[pid] == len(res):
            content_out[pid] = content_scores[pid]
    inst = searchengine.searcher('database')
    content_out = inst.normalizescores(content_out)
    self.contentscore = content_out
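# A rough usage sketch (hypothetical driver code; a populated
# self.wordlocations index on the surrounding class is assumed):
#
#   ranker.content_based_calculator('neural network memory')
#   best = sorted(ranker.contentscore.items(),
#                 key=lambda kv: kv[1], reverse=True)[:10]
#
# Only papers containing every query term survive the filter, and
# normalizescores rescales the surviving products into the 0..1 range so
# they can be combined with other ranking signals.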
def generateFig(filePath):
    e = searchengine.searcher('searchindex.db')
    frequencies = e.getFrequentWords()
    # take relative word frequencies into account, lower max_font_size
    # wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
    # Note: set.add() returns None, so the stopword must be added first
    # rather than inline in the constructor call
    STOPWORDS.add(u"黄豆")
    wordcloud = WordCloud(font_path='/home/jamin/Documents/resource/msyh.ttf',
                          background_color="white",
                          stopwords=STOPWORDS,
                          max_font_size=40,
                          relative_scaling=.25).fit_words(frequencies)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(filePath)
def makeindex(key):
    e = s.searcher('searchindex.db')
    result = e.query(key)
    urls = []
    for row in result:
        for j in row:
            urls.append(e.geturlname(j))
    return urls
def pageRank():
    reload(searchengine)
    crawler = searchengine.crawler('searchindex.db')
    e = searchengine.searcher('searchindex.db')
    # crawler.calculatepagerank()
    cur = crawler.con.execute('select * from pagerank order by score desc')
    for i in range(3):
        d = cur.next()
        print d, e.geturlname(d[0])
def queryhandler():
    e = searchengine.searcher()
    q = bottle.request.forms.get("query")
    mywords, myurls = e.query(q)
    s = bottle.request.environ.get('beaker.session')
    s['mywords'] = mywords
    s['myurls'] = myurls
    s.save()
    bottle.redirect('/results')
def test_calculate_pagerank():
    sys.stderr.write("testing pagerank calculation...\n")
    crawler = searchengine.crawler('searchindex.db')
    crawler.calculatepagerank()
    sys.stderr.write("checking pagerank result...\n")
    cur = crawler.con.execute('select * from pagerank order by score desc')
    for i in range(3):
        print cur.next()
    sys.stderr.write("checking pagerank top url...\n")
    e = searchengine.searcher('searchindex.db')
    # re-run the query: the cursor above has already consumed the top rows
    cur = crawler.con.execute('select * from pagerank order by score desc')
    urlid = cur.next()[0]
    print e.geturlname(urlid)
def testQueryIndian(self):
    wordids = []
    rows = []
    if self.config.queries is None or len(self.config.queries) <= 0:
        queries = ['memory', 'mental', 'mind', 'storage', 'magnetic',
                   'cache', 'psychological', 'semiconductor', 'transistor',
                   'random access', 'data storage']
    else:
        queries = self.config.queries
    s = searcher(self.dbname)
    if self.numusers >= 1:
        for q in queries:
            for userid in [x + 1 for x in range(self.numusers)]:
                wordids, rows = s.query(q, userid)
def firesearch():
    outputwidget.delete(1.0, END)
    fillconfig()
    s = searcher(config.dbname)
    q = queryvar.get()
    urllist = []
    try:
        widlist, urlidlist = s.query(q, config.userid,
                                     config.userurlhitscoresweight)
        for urlid in urlidlist:
            url = s.geturlname(urlid)
            urllist.append(url)
        outputwidget.insert(END, '\n'.join(urllist))
    except:
        print "Error:", sys.exc_info()
        tkMessageBox.showerror("Input Error", sys.exc_info())
        raise
def serve_search(environ, start_response):
    query_words = ''
    results = ''
    if 'QUERY_STRING' in environ:
        query_dict = cgi.parse_qs(environ['QUERY_STRING'])
        if 'q' in query_dict:
            # parse_qs returns a list for values, as query parameters can
            # appear several times (e.g. 'q=ddsview&q=makeicns'). Ignore all
            # but the first occurrence of q.
            query_words = query_dict['q'][0]
            s = searchengine.searcher('searchindex.db')
            results = '<br>\n'.join(['%f: <a href="%s">%s</a>' % (score, url, url)
                                     for score, url in s.query(query_words)])
            results = results.encode('utf-8')
    # Note: this also returns html for favicon queries.
    start_response('200 OK', [('Content-type', 'text/html')])
    return [template % locals()]
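# A minimal way to serve this WSGI handler locally (a sketch: the host, port,
# and the module-level 'template' string are assumptions, not shown above):
#
#   from wsgiref.simple_server import make_server
#   make_server('localhost', 8080, serve_search).serve_forever()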
def pagerank_calculator(self, iterations=20):
    pageranks = {}
    for item in self.citations:
        pageranks.setdefault(item, 1.0)
    for i in range(iterations):
        # print 'Iteration %d' % i
        pr = 0.15
        for item in pageranks:
            init_score = 0
            for element in self.citations[item]:
                # Papers outside the index contribute a default score of 1.0
                if element not in pageranks:
                    val = 1.0
                else:
                    val = pageranks[element]
                linknum = self.citationcounts[element]
                init_score += float(val) / linknum
            pageranks[item] = pr + (0.85 * init_score)
    inst = searchengine.searcher('database')
    pageranks = inst.normalizescores(pageranks)
    # print pageranks['9402117']
    self.pagerankscore = pageranks
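# The loop above is the standard damped PageRank update (assuming
# self.citations maps each paper to the papers that cite it): for a paper p,
#
#     PR(p) = 0.15 + 0.85 * sum_q PR(q) / C(q)
#
# summed over each citing paper q, where C(q) is q's total citation count from
# self.citationcounts. With damping factor 0.85 the iteration converges from
# any starting assignment; 20 rounds is a conventional cutoff, not a tuned one.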
def generatePosNegFile(filepath):
    f = open(filepath, 'w')
    e = searchengine.searcher('searchindex.db')
    # Three most positive urls first, then the three most negative
    for order, label in (('desc', 'pos'), ('asc', 'neg')):
        cursor = e.con.execute(
            'select * from urllist where posnegscore is not null '
            'order by posnegscore %s limit 3' % order)
        for row in cursor:
            f.write('%s\t%s\t%s\n' % (row[0], row[1], label))
    f.close()
#!/usr/bin/env python
# coding: utf-8
__author__ = 'dick'

import searchengine

craw = searchengine.crawler('searchindex.db')
# craw.createindextables()
pages = [
    # 'http://www.bbc.com/',
    'https://www.hao123.com/?1477704964',
    # 'https://www.baidu.com',
]
# craw.crawl(pages)
e = searchengine.searcher('searchindex.db')
print e.getmatchrows('hao weather yes')
'''
Created on Feb 16, 2014

@author: ssashita

A query is given as

    python runquery.py 1 functional programming

where 1 is the userid and the rest are the query words.
'''
from searchengine import searcher
import sys
import cconfigurator

if __name__ == '__main__':
    config = cconfigurator.configure('crawled.db')
    listargs = []
    if len(sys.argv) > 2:
        for arg in sys.argv[2:]:
            listargs.append(arg)
        s = searcher('crawled.db')
        s.query(' '.join([str(x) for x in listargs]), sys.argv[1])
    else:
        print("At least 3 args required. Second one is the userid and rest are the query words")
def query():
    e = searchengine.searcher('searchIndex.db')
    print e.query('functional programming...')
def testquery(q='functional programming'):
    search = searchengine.searcher()
    search.query(q)
def test_se_search():
    searcher = se.searcher('crawler.db')
    result = searcher.query('python language blog')
    print result
def test_query_ranking(weightFunc):
    sys.stderr.write("testing query with weighting function '%s'...\n" % weightFunc)
    e = searchengine.searcher('searchindex.db')
    print e.query('programming', weightFunc)
from flask import Flask, render_template, request, redirect
import searchengine, neuralnet, crawler

searcher = searchengine.searcher('searchengine.db')
crawler = crawler.crawler('searchengine.db')
nnet = neuralnet.searchnet('nn.db')
app = Flask(__name__)


@app.route("/")
def search():
    if request.args:
        queryText = request.args.get('q')
        (wordids, scores, urlIdsList, urlsList) = searcher.query(queryText)
        if len(urlIdsList) != 0:
            listOfItems = [{'id': urlIdsList[i],
                            'url': urlsList[i],
                            'score': scores[i]}
                           for i in range(len(urlIdsList))]
        else:
            listOfItems = []
        return render_template('index.html', list=listOfItems, q=queryText)
    return render_template('index.html', list=None)


@app.route('/train', methods=['POST', 'GET'])
def train():
    if request.method == 'POST':
        queryPhrase = request.json['q']
        selectedURLId = int(request.json['clicked'])
        app.logger.debug('queryPhrase: %s => selectedURLId: %s'
                         % (queryPhrase, selectedURLId))
        (wordids, scores, urlIdsList, urlsList) = searcher.query(queryPhrase)
        nnet.trainquery(wordids, urlIdsList, selectedURLId)
    # a Flask view must return a response object or string
    return 'OK'
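# A minimal entry point for local testing (a sketch; the host, port, and
# debug settings are assumptions, not part of the original app):
if __name__ == '__main__':
    app.run(debug=True)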
def test_full_match_words():
    s = searcher("output/search.db")
    print s.getfullmatchrows("simple web page")
def wordFrequency():
    reload(searchengine)
    e = searchengine.searcher('searchindex.db')
    e.query('sqlite3 python')
from django.conf.urls.static import static
from django.shortcuts import render
from django.http import HttpResponse, HttpRequest
from django.shortcuts import render_to_response
from django.template import RequestContext
import searchengine
import nn

e = searchengine.searcher('wikipedia.db')
allurls = e.getallurls("functional")


# Create your views here.
def home_view(request):
    return HttpResponse(request.method)


def search_string(request):
    query = request.GET['searchquery']
    data = e.query(query)
    context_dict = {'results': data, 'query': query}
    return render_to_response('results_page.html', context_dict)


def train_nn(request, page_alias, selected_result):
    network = nn.searchnet('nn.db')
    words = e.getwordids(page_alias)
    if selected_result.endswith("/"):
        selected_result = selected_result[:-1]
    urlid = e.geturlid(selected_result)
def setUp(self):
    self.s = searchengine.searcher("test.db")
import searchengine

pages = ['https://news.google.com.tw/']
crawler = searchengine.crawler('test')
crawler.createindextables()  # create tables
crawler.crawl(pages)
crawler.calculatepagerank()
e = searchengine.searcher('test')
e.query('單場 球季')  # zh-TW query terms: "single game", "season"
def contentranking():
    reload(searchengine)
    # mynet = nn.searchnet('nn.db')
    # mynet.maketables()
    e = searchengine.searcher('searchindex.db')
    e.query('sqlite3 python')
# -*- coding: utf-8 -*-
from tornado.ioloop import IOLoop
from tornado.web import RequestHandler, Application, url, StaticFileHandler
import os.path
import sys
sys.path.insert(0, os.path.abspath("../collective-intelligence"))
import searchengine as se

searcher = se.searcher("index.db")
foofle_data = {"query": "", "results": []}


def update_data(query):
    foofle_data["query"] = query
    foofle_data["results"] = searcher.query(query)


class MainHandler(RequestHandler):
    def initialize(self, data):
        self.data = data

    def get(self):
        self.render("index.html",
                    query=self.data["query"],
                    results=self.data["results"])

    def post(self):
        query = self.get_argument("input-query")
        print "The search will use the string '%s' as the query" % query
        update_data(query)
        self.get()
def documentLocation():
    reload(searchengine)
    e = searchengine.searcher('searchindex.db')
    e.query('sqlite3 python')
def test_getmatchrows():
    sys.stderr.write("testing get match rows...\n")
    e = searchengine.searcher('searchindex.db')
    print e.getmatchrows('programming')
# -*- coding: utf-8 -*-
import searchengine

e = searchengine.searcher('searchindex.db')
# print e.getmatchrows('perl python functional')
while 1:
    print "Enter the query words (en)"
    q = raw_input()
    print e.query(q)
def test_query():
    sys.stderr.write("testing query...\n")
    e = searchengine.searcher('searchindex.db')
    print e.query('programming')
import ast

import searchengine as se

# eval_tests and MyVisitor are defined elsewhere in the original module
for test in eval_tests:
    node = ast.parse(test)
    print ast.dump(node)
    # MyVisitor().visit(node)
    print '\n'

if __name__ == '__main__':
    '''
    2. Boolean operations. Many search engines support Boolean queries, which
    allow users to construct searches like "python OR perl." An OR search can
    work by doing the queries separately and combining the results, but what
    about "python AND (program OR code)"? Modify the query methods to support
    some basic Boolean operations.

    3. Exact matches. Search engines often support "exact match" queries,
    where the words in the page must match the words in the query in the same
    order with no additional words in between. Create a new version of getrows
    that only returns results that are exact matches. (Hint: you can use
    subtraction in SQL to get the difference between the word locations.)
    '''
    dbname = 'searchindex.db'
    if True:
        crawler = se.crawler(dbname)
        crawler.createindextables()
        pages = ['https://www.zhihu.com/', 'https://github.com/']
        crawler.crawl(pages, depth=2)
        crawler.calculatepagerank()
    else:
        searcher = se.searcher(dbname)
        q = 'zhihu career'
        print searcher.query(q)
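# A sketch of exercise 3 (exact matches), assuming the book's wordlocation
# schema (urlid, wordid, location); the function and parameter names here are
# illustrative, not taken from the snippets above:
def getexactmatchrows(con, wordids):
    # Join one wordlocation alias per query word and require consecutive
    # locations: w1 must sit exactly one position after w0, and so on, which
    # is the "subtraction in SQL" the hint refers to.
    tables = ','.join(['wordlocation w%d' % i for i in range(len(wordids))])
    clauses = ['w%d.wordid=%d' % (i, wid) for i, wid in enumerate(wordids)]
    clauses += ['w%d.urlid=w0.urlid' % i for i in range(1, len(wordids))]
    clauses += ['w%d.location-w%d.location=1' % (i, i - 1)
                for i in range(1, len(wordids))]
    sql = 'select w0.urlid, w0.location from %s where %s' % (
        tables, ' and '.join(clauses))
    return con.execute(sql).fetchall()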
#!/usr/bin/python
# coding: UTF-8
# Author: David
# Email: [email protected]
# Created: 2016-08-01 14:08
# Last modified: 2016-08-01 15:54
# Filename: search_test.py
# Description:
import searchengine

e = searchengine.searcher()
e.query('form authentication')