Example #1
    def content_based_calculator(self, stringofwords):
        # split the query into lowercase words, dropping empty tokens
        divider = re.compile('\\W*')
        res = [x.lower() for x in divider.split(stringofwords) if x != '']
        content_scores = {}
        paperid_index = {}
        content_out = {}
        index = 1
        for word in res:
            if word not in self.wordlocations:
                continue
            for papid in self.wordlocations[word]:
                # a paper's score for this word is its occurrence count
                papid_score = len(self.wordlocations[word][papid])
                if papid not in content_scores:
                    content_scores[papid] = papid_score
                    paperid_index[papid] = index
                else:
                    # multiply per-word counts and track how many query words matched
                    content_scores[papid] = content_scores[papid] * papid_score
                    paperid_index[papid] += 1

        # keep only papers that matched every query word (an implicit AND)
        for pid in paperid_index:
            if paperid_index[pid] == len(res):
                content_out[pid] = content_scores[pid]

        inst = searchengine.searcher('database')
        content_out = inst.normalizescores(content_out)
        self.contentscore = content_out
Example #2
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import searchengine

def generateFig(filePath):
    e = searchengine.searcher('searchindex.db')
    frequencies = e.getFrequentWords()
    # take relative word frequencies into account, lower max_font_size
    #wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
    # set.add() returns None, so add the extra stopword before the call
    STOPWORDS.add(u"黄豆")
    wordcloud = WordCloud(font_path='/home/jamin/Documents/resource/msyh.ttf',
                          background_color="white", stopwords=STOPWORDS,
                          max_font_size=40, relative_scaling=.25).fit_words(frequencies)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig(filePath)
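A hypothetical call, assuming 'searchindex.db' and the font path above exist:

generateFig('wordcloud.png')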
Example #3
import searchengine as s

def makeindex(key):
    e = s.searcher('searchindex.db')
    result = e.query(key)
    List = []
    size = len(result)
    for i in range(size):
        for j in result[i]:
            List.append(e.geturlname(j))
    return List
Example #4
File: run.py Project: wz125/courses
def pageRank():
  reload(searchengine)
  crawler=searchengine.crawler('searchindex.db')
  e=searchengine.searcher('searchindex.db')
  #crawler.calculatepagerank( )
  cur=crawler.con.execute('select * from pagerank order by score desc')
  # print the three highest-ranked urls with their scores
  for i in range(3):
    d=cur.next()
    print d,e.geturlname(d[0])
Example #5
import searchengine as s

def makeindex(key):
  e = s.searcher('searchindex.db')
  result = e.query(key)
  List = []
  size = len(result)
  for i in range(size):
    for j in result[i]:
      List.append(e.geturlname(j))
  return List
Example #6
import bottle
import searchengine

def queryhandler():
    e = searchengine.searcher()
    q = bottle.request.forms.get("query")
    mywords, myurls = e.query(q)
    s = bottle.request.environ.get('beaker.session')
    s['mywords'] = mywords
    s['myurls'] = myurls
    s.save()
    bottle.redirect('/results')
Example #7
def test_calculate_pagerank():
    sys.stderr.write("testing pagerank calculation...\n")
    crawler=searchengine.crawler('searchindex.db')
    crawler.calculatepagerank()
    sys.stderr.write("checking pagerank result...\n")
    cur=crawler.con.execute('select * from pagerank order by score desc')
    for i in range(3): print cur.next()
    sys.stderr.write("checking pagerank top url...\n")
    e=searchengine.searcher('searchindex.db')
    urlid=cur.next()[0]
    print e.geturlname(urlid)
Example #8
 def testQueryIndian(self):
     wordids=[]
     rows=[]
     # fall back to a default query list if none is configured
     if self.config.queries is None or len(self.config.queries) <= 0:
         queries=['memory', 'mental', 'mind', 'storage', 'magnetic', 'cache', 'psychological', 'semiconductor', 'transistor', 'random access', 'data storage']
     else:
         queries = self.config.queries
     s=searcher(self.dbname)
     if self.numusers >= 1:
         # run every query once for each user id (user ids are 1-based)
         for q in queries:
             for userid in [x+1 for x in range(self.numusers)]:
                 wordids,rows = s.query(q, userid)
Example #9
def firesearch():
    outputwidget.delete(1.0,END)
    fillconfig()
    s=searcher(config.dbname)
    q=queryvar.get()
    urllist=[]
    try:
        widlist,urlidlist = s.query(q,config.userid,config.userurlhitscoresweight)
        for urlid in urlidlist:
            url=s.geturlname(urlid)
            urllist.append(url)
        outputwidget.insert(END, '\n'.join(urllist))
    except:
        print "Error:", sys.exc_info()
        tkMessageBox.showerror("Input Error", sys.exc_info())
        raise
Example #10
def serve_search(environ, start_response):

  query_words = ''
  results = ''
  if 'QUERY_STRING' in environ:
    query_dict = cgi.parse_qs(environ['QUERY_STRING'])
    if 'q' in query_dict:
      # parse_qs returns a list for values as query parameters can appear
      # several times (e.g. 'q=ddsview&q=makeicns'). Ignore all but the first
      # occurrence of q.
      query_words = query_dict['q'][0]
      s = searchengine.searcher('searchindex.db')
      results = '<br>\n'.join(['%f: <a href="%s">%s</a>' % (score, url, url)
        for score, url in s.query(query_words)])
      results = results.encode('utf-8')

  # Note: this also returns html for favicon queries.
  start_response('200 OK',[('Content-type','text/html')])
  return [template % locals()]
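As the comment in serve_search notes, cgi.parse_qs keeps every occurrence of a repeated query parameter, so each value comes back as a list; a quick interpreter check:

>>> import cgi
>>> cgi.parse_qs('q=ddsview&q=makeicns')
{'q': ['ddsview', 'makeicns']}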
Example #11
 def pagerank_calculator(self, iterations=20):
     pageranks={}
     # every known item starts with a PageRank of 1.0
     for item in self.citations:
         pageranks.setdefault(item,1.0)
     for i in range(iterations):
         #print 'Iteration %d' % i
         pr=0.15
         for item in pageranks:
             init_score=0
             for element in self.citations[item]:
                 # items outside the index contribute a default rank of 1.0
                 if element not in pageranks:
                     val=1.0
                 else:
                     val=pageranks[element]
                 linknum=self.citationcounts[element]
                 init_score+=float(val)/linknum
             # damped PageRank update
             pageranks[item]=pr+(0.85*init_score)
     inst=searchengine.searcher('database')
     pageranks=inst.normalizescores(pageranks)
     #print pageranks['9402117']
     self.pagerankscore=pageranks
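The inner loop above is the standard damped PageRank update: with damping factor 0.85, each iteration sets pr(item) = 0.15 + 0.85 * sum(pr(element) / citationcounts[element]) over the entries in self.citations[item], and normalizescores then rescales the scores to the 0..1 range, as in the book's calculatepagerank.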
Example #12
def serve_search(environ, start_response):

    query_words = ''
    results = ''
    if 'QUERY_STRING' in environ:
        query_dict = cgi.parse_qs(environ['QUERY_STRING'])
        if 'q' in query_dict:
            # parse_qs returns a list for values as query parameters can appear
            # several times (e.g. 'q=ddsview&q=makeicns'). Ignore all but the first
            # occurrence of q.
            query_words = query_dict['q'][0]
            s = searchengine.searcher('searchindex.db')
            results = '<br>\n'.join([
                '%f: <a href="%s">%s</a>' % (score, url, url)
                for score, url in s.query(query_words)
            ])
            results = results.encode('utf-8')

    # Note: this also returns html for favicon queries.
    start_response('200 OK', [('Content-type', 'text/html')])
    return [template % locals()]
Example #13
def generatePosNegFile(filepath):
    f=open(filepath,'w')
    e=searchengine.searcher('searchindex.db')
    cursor= e.con.execute(
            " select * from urllist where posnegscore is not null order by posnegscore desc limit 3  " )
    for row in cursor:
        f.write(row[0])
        f.write("\t")
        f.write(str(row[1]))
        f.write("\t")
        f.write("pos")
        f.write("\n")
    cursor= e.con.execute(
            " select * from urllist where posnegscore is not null order by posnegscore asc limit 3  " )
    for row in cursor:
        f.write(row[0])
        f.write("\t")
        f.write(str(row[1]))
        f.write("\t")
        f.write("neg")
        f.write("\n")
    f.close()
Example #14
#!/usr/bin/env python
#coding:utf-8
__author__ = 'dick'

import searchengine

craw = searchengine.crawler('searchindex.db')
# craw.createindextables()
pages = [
    # 'http://www.bbc.com/',
    'https://www.hao123.com/?1477704964',
    # 'https://www.baidu.com',
]

# craw.crawl(pages)

e = searchengine.searcher('searchindex.db')
print e.getmatchrows('hao weather yes')
Example #15
'''
Created on Feb 16, 2014

@author: ssashita
A query is to be given as 
 python runquery.py 1 functional programming
 {1 is the userid, and the rest are the query words} 
'''

from searchengine import searcher
import sys
import cconfigurator

if __name__ == '__main__':
    config = cconfigurator.configure('crawled.db')
    listargs=[]
    if len(sys.argv) > 2:
        for arg in sys.argv[2:]:
            listargs.append(arg)
        s=searcher('crawled.db')
        s.query(' '.join([str(x) for x in listargs]),sys.argv[1])
    else:
        print("Usage: python runquery.py <userid> <query words...>")
Example #16
def query():
    e = searchengine.searcher('searchIndex.db')
    print e.query('functional programming...')
Example #17
def testquery(q = 'functional programming'):
   search = searchengine.searcher()
   search.query(q)
Example #18
import searchengine as se

if __name__ == '__main__':
    '''
2. Boolean operations. Many search engines support Boolean queries, which allow
users to construct searches like "python OR perl." An OR search can work by
doing the queries separately and combining the results, but what about "python
AND (program OR code)"? Modify the query methods to support some basic
Boolean operations.
3. Exact matches. Search engines often support "exact match" queries, where the
words in the page must match the words in the query in the same order with no
additional words in between. Create a new version of getrows that only returns
results that are exact matches. (Hint: you can use subtraction in SQL to get the
difference between the word locations.) 
    '''
    dbname = 'searchindex.db'
    # flip this toggle to False after crawling to run queries instead
    if True:
        crawler = se.crawler(dbname)
        crawler.createindextables()
        pages = [
            'https://www.zhihu.com/',
            'https://github.com/'
        ]
        crawler.crawl(pages, depth=2)
        crawler.calculatepagerank()
    else:
        searcher = se.searcher(dbname)
        q = 'zhihu career'
        print searcher.query(q)
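A minimal sketch of the Boolean-OR exercise from the docstring above; run_query is a hypothetical helper wrapping searcher.query that returns a {urlid: score} dict (the book's query prints results and returns other types, so adapt as needed):

def query_or(searcher, query_a, query_b):
    # OR semantics: union of both result sets, keeping the best score per url
    # run_query (hypothetical) -> {urlid: score}
    combined = {}
    for q in (query_a, query_b):
        for urlid, score in run_query(searcher, q).items():
            combined[urlid] = max(combined.get(urlid, 0.0), score)
    return combined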

Example #19
def test_se_search():
    searcher = se.searcher('crawler.db')
    result = searcher.query('python language blog')
    print result
Example #20
def test_query_ranking(weightFunc):
    sys.stderr.write("testing query with weighting function '%s'...\n" % weightFunc)
    e=searchengine.searcher('searchindex.db')
    print e.query('programming',weightFunc)
Example #21
from flask import Flask, render_template, request, redirect
import searchengine, neuralnet, crawler
searcher = searchengine.searcher('searchengine.db')
crawler = crawler.crawler('searchengine.db')
nnet = neuralnet.searchnet('nn.db')


app = Flask(__name__)


@app.route("/")
def search():
	if request.args:
		queryText = request.args.get('q')
		(wordids, scores, urlIdsList, urlsList) = searcher.query(queryText)
		if len(urlIdsList) != 0:
			listOfItems = [{'id': urlIdsList[i], 'url': urlsList[i], 'score': scores[i]} for i in range(len(urlIdsList))]
		else:
			listOfItems = []
		return render_template('index.html', list=listOfItems, q=queryText)
	return render_template('index.html', list=None)


@app.route('/train', methods=['POST', 'GET'])
def train():
	if request.method == 'POST':
		queryPhrase = request.json['q']
		selectedURLId = int(request.json['clicked'])
		app.logger.debug('queryPhrase: %s => selectedURLId: %s' % (queryPhrase, selectedURLId))
		(wordids, scores, urlIdsList, urlsList) = searcher.query(queryPhrase)
		nnet.trainquery(wordids, urlIdsList, selectedURLId)
	# a Flask view must return a response; an empty body is enough here
	return ''
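A hypothetical training request against the /train route above, assuming the app runs locally on Flask's default port (request.json needs the application/json content type, which requests.post(json=...) sets automatically):

import requests
# 'clicked' is the url id the user selected from the result list
requests.post('http://127.0.0.1:5000/train',
              json={'q': 'functional programming', 'clicked': '3'})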
Example #22
def test_full_match_words():
    s = searcher("output/search.db")
    print s.getfullmatchrows("simple web page")
Example #23
File: run.py Project: wz125/courses
def wordFrequency():
  reload(searchengine)
  e=searchengine.searcher('searchindex.db')
  e.query('sqlite3 python')
Example #24
from django.conf.urls.static import static
from django.shortcuts import render
from django.http import HttpResponse, HttpRequest
from django.shortcuts import render_to_response
from django.template import RequestContext
import searchengine
import nn

e = searchengine.searcher('wikipedia.db')
allurls = e.getallurls("functional")

# Create your views here.


def home_view(request):
    return HttpResponse(request.method)


def search_string(request):
    query = request.GET['searchquery']
    data = e.query(query)
    context_dict = {'results': data, 'query': query}
    return render_to_response('results_page.html', context_dict)


def train_nn(request, page_alias, selected_result):
    network = nn.searchnet('nn.db')
    words = e.getwordids(page_alias)
    if selected_result.endswith("/"):
        selected_result = selected_result[:-1]
    urlid = e.geturlid(selected_result)
Example #25
 def setUp(self):
     self.s = searchengine.searcher("test.db")
Example #26
def query():
	e=searchengine.searcher('searchIndex.db')
	print e.query('functional programming...')
Example #27
File: run.py Project: yiran02/study
import searchengine
pages = ['https://news.google.com.tw/']
crawler = searchengine.crawler('test')
crawler.createindextables()  #create tables

crawler.crawl(pages)

crawler.calculatepagerank()

e = searchengine.searcher('test')
e.query('單場 球季')
Example #28
File: run.py Project: wz125/courses
def contentranking():
  reload(searchengine)
  #mynet=nn.searchnet('nn.db')
  #mynet.maketables()
  e=searchengine.searcher('searchindex.db')
  e.query('sqlite3 python')
Example #29
# -*- coding: utf-8 -*-
from tornado.ioloop import IOLoop
from tornado.web import RequestHandler, Application, url, StaticFileHandler
import os.path
import sys

sys.path.insert(0, os.path.abspath("../collective-intelligence"))

import searchengine as se
searcher = se.searcher("index.db")

foofle_data = {"query" : "",
               "results" : []}

def update_data(query):
    foofle_data["query"] = query
    foofle_data["results"] = searcher.query(query)

class MainHandler(RequestHandler):
    def initialize(self, data):
        self.data = data

    def get(self):
        self.render("index.html", query = self.data["query"], results = self.data["results"])

    def post(self):
        query = self.get_argument("input-query")
        print "La busqueda que se realizara utilizara la cadena '%s' como consulta" % query
        update_data(query)
        self.get()
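The snippet ends before the Application is built; a minimal sketch of the missing wiring, assuming port 8888 and an index.html template in the working directory:

if __name__ == "__main__":
    # pass the shared data dict into MainHandler.initialize
    app = Application([url(r"/", MainHandler, dict(data=foofle_data))])
    app.listen(8888)  # hypothetical port
    IOLoop.current().start()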
Example #30
File: run.py Project: wz125/courses
def documentLocation():
  reload(searchengine)
  e=searchengine.searcher('searchindex.db')
  e.query('sqlite3 python')
Example #31
def test_getmatchrows():
    sys.stderr.write("testing get match rows...\n")
    e=searchengine.searcher('searchindex.db')
    print e.getmatchrows('programming')
Example #32
## coding:utf-8 ##
import searchengine

e=searchengine.searcher('searchindex.db')
#print e.getmatchrows('perl python functional')
while 1:
	print "输入查询的单词(en)"
	q=raw_input()
	print e.query(q)
Example #33
def test_query():
    sys.stderr.write("testing query...\n")
    e=searchengine.searcher('searchindex.db')
    print e.query('programming')
Example #34
import ast
import searchengine as se

# eval_tests is defined earlier in the original source file
for test in eval_tests:
    node = ast.parse(test)
    print ast.dump(node)
    # MyVisitor().visit(node)
    print '\n'

if __name__ == '__main__':
    '''
2. Boolean operations. Many search engines support Boolean queries, which allow
users to construct searches like "python OR perl." An OR search can work by
doing the queries separately and combining the results, but what about "python
AND (program OR code)"? Modify the query methods to support some basic
Boolean operations.
3. Exact matches. Search engines often support "exact match" queries, where the
words in the page must match the words in the query in the same order with no
additional words in between. Create a new version of getrows that only returns
results that are exact matches. (Hint: you can use subtraction in SQL to get the
difference between the word locations.) 
    '''
    dbname = 'searchindex.db'
    # flip this toggle to False after crawling to run queries instead
    if True:
        crawler = se.crawler(dbname)
        crawler.createindextables()
        pages = ['https://www.zhihu.com/', 'https://github.com/']
        crawler.crawl(pages, depth=2)
        crawler.calculatepagerank()
    else:
        searcher = se.searcher(dbname)
        q = 'zhihu career'
        print searcher.query(q)
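For the exact-match exercise repeated in the docstring above, the SQL-subtraction hint can be sketched for a two-word phrase, assuming the book's wordlocation(urlid, wordid, location) schema; con is an open sqlite3 connection and the two wordids are hypothetical ids for the query words:

def getexactmatchrows(con, wordid1, wordid2):
    # the second word must sit exactly one location after the first
    cur = con.execute(
        "select w0.urlid from wordlocation w0, wordlocation w1 "
        "where w0.urlid = w1.urlid and w0.wordid = ? "
        "and w1.wordid = ? and w1.location - w0.location = 1",
        (wordid1, wordid2))
    return [row[0] for row in cur]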
Example #35
#!/usr/bin/python
# coding: UTF-8
# Author: David
# Email: [email protected]
# Created: 2016-08-01 14:08
# Last modified: 2016-08-01 15:54
# Filename: search_test.py
# Description:
import searchengine
e = searchengine.searcher()
e.query('form authentication')