Example #1
import sqlite3 as sqlite
import neuralnetwork as nn

class crawler:
  def __init__(self, dbname):
    self.con = sqlite.connect(dbname)   # index database
    self.net = nn.searchnet(dbname)     # neural net stored in the same file
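The fragment assumes a neuralnetwork module providing searchnet; a minimal sketch of the constructor it relies on (an assumption, modeled on the book's nn.py):

import sqlite3 as sqlite

class searchnet:
  def __init__(self, dbname):
    # the net keeps its own connection to the same sqlite file
    self.con = sqlite.connect(dbname)

  def __del__(self):
    self.con.close()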
Example #2
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.request
import sqlite3 as sqlite  # on older setups: from pysqlite2 import dbapi2 as sqlite
import re
import neuralnetwork as nn
import ssl  # sites served over https raise an error if their SSL certificate fails verification
mynet = nn.searchnet('nn.db')
# Stop words to ignore when indexing
ignorewords = {'the': 1, 'of': 1, 'to': 1, 'and': 1, 'a': 1, 'in': 1, 'is': 1, 'it': 1}
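# Sketch of how ignorewords is consumed during indexing: the book's addtoindex
# skips any token found in this dict. filterwords is a hypothetical helper
# shown here only to illustrate the check.
def filterwords(words):
  return [w for w in words if w not in ignorewords]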

class crawler:
  # Initialize the crawler with the name of the database
  def __init__(self, dbname):
    self.con = sqlite.connect(dbname)

  def __del__(self):
    self.con.close()

  def dbcommit(self):
    self.con.commit()
  # Auxiliary function for getting an entry id, adding
  # the row if it's not already present
  def getentryid(self, table, field, value, createnew=True):
    cur = self.con.execute(
        "select rowid from %s where %s='%s'" % (table, field, value))
    res = cur.fetchone()
    if res is None:
      cur = self.con.execute(
          "insert into %s (%s) values ('%s')" % (table, field, value))
      return cur.lastrowid
    else:
      return res[0]
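For reference, a hypothetical call; the urllist table and the searchindex.db filename follow the book's schema and are assumptions here:

c = crawler('searchindex.db')
urlid = c.getentryid('urllist', 'url', 'http://example.com/')  # inserts the row on first use
c.dbcommit()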
Example #3
import neuralnetwork

mynet = neuralnetwork.searchnet('nn.db')
# mynet.maketables()  # only needed the first time, to create the tables
wWorld, wRiver, wBank = 101, 102, 103
uWorldBank, uRiver, uEarthquake = 201, 202, 203
mynet.generatehiddennode([wWorld, wBank], [uWorldBank, uRiver, uEarthquake])
for c in mynet.con.execute("SELECT * FROM wordhidden"):
    print(c)
print("------------------")
for c in mynet.con.execute("SELECT * FROM hiddenurl"):
    print(c)
Example #4
import searchengine

fn = 'searchindex.db'  # database file; the searcher below assumes this name
crawler = searchengine.crawler(fn)
#crawler.createindextables()  # comment this out once the db has already been built

# crawl some pages:
#pagelist = ['https://en.wikipedia.org/wiki/R_(programming_language)']
#crawler.crawl(pagelist)

pagelist = ['http://www.diveintopython.net']
#crawler.crawl(pagelist)

# comment out the next line once the db has already been built
#[row for row in crawler.con.execute('select rowid from wordlocation where wordid=1')]

import neuralnetwork as nn
mynet = nn.searchnet('nn.db')
# mynet.maketables()

wordstosearch = 'python programming'
e = searchengine.searcher('searchindex.db')
e.getmatchrows(wordstosearch)
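# getmatchrows returns a pair (rows, wordids): each row is a tuple of
# (urlid, location, location, ...) with one location column per query word,
# and wordids are the rowids of the query words (per the book's searcher).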

# create the tables needed for the PageRank algorithm and fill them:
crawler.calculatepagerank()

# score the query before PageRank is added to the weights in the scoring function
e.query(wordstosearch)

#cur = crawler.con.execute('select * from pagerank order by score desc')
#for i in range(3): print(next(cur))
#e.geturlname(17)  # the word "python" has rowid 17
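For context, the book's searcher combines several scorers with fixed weights inside query(); a sketch of that combination (the method names and weights here are assumptions based on the book's getscoredlist):

    def getscoredlist(self, rows, wordids):
        totalscores = dict([(row[0], 0) for row in rows])
        weights = [(1.0, self.frequencyscore(rows)),
                   (1.0, self.pagerankscore(rows)),
                   (1.0, self.nnscore(rows, wordids))]
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores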
Example #5
                                 for (u, score) in linkscores.items()])
        return normalizedScores

    def neuralNetworkScore(self, rows, wordids):
        # Score each distinct url with the trained network's output
        urlids = list(set([row[0] for row in rows]))
        nnres = mynet.getResult(wordids, urlids)
        scores = dict([(urlids[i], nnres[i]) for i in range(len(urlids))])
        return self.normalizeScores(scores)
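    # normalizeScores is used above but not shown in this snippet; a minimal
    # sketch (an assumption, adapted from the book's normalizescores) that
    # rescales every score into the range [0, 1]:
    def normalizeScores(self, scores, smallIsBetter=False):
        vsmall = 0.00001  # avoid division by zero
        if smallIsBetter:
            minscore = min(scores.values())
            return dict([(u, float(minscore) / max(vsmall, l))
                         for (u, l) in scores.items()])
        maxscore = max(scores.values())
        if maxscore == 0:
            maxscore = vsmall
        return dict([(u, float(c) / maxscore) for (u, c) in scores.items()])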


if __name__ == '__main__':
    import os, neuralnetwork  # assumed to be imported at the top of the full module

    pagelist = []
    seedpage = input('Enter the website page address: ')
    if not seedpage.startswith(('http://', 'https://')):
        seedpage = 'http://' + seedpage
    pagelist.append(seedpage)
    dbname = input('Enter the database name: ')
    if not dbname.endswith('.db'):
        dbname += '.db'
    # Check for the db before the crawler's connect() creates the file,
    # otherwise the index tables would never be built
    newdb = not os.path.isfile(dbname)
    crawler = crawler(dbname)
    if newdb:
        crawler.createIndexTables()
        crawler.crawl(pagelist)
    crawler.calculatePageRank()
    searcher = searcher(dbname)
    mynet = neuralnetwork.searchnet('nn.db')
    while True:
        qStr = input('Enter the query string (press Enter to quit): ')
        if qStr.strip() == '':
            break
        searcher.query(qStr)