Esempio n. 1
0
def query(s):
    """Run a phrase query for *s* against the ``name`` field of the index.

    The input is segmented with ``jieba.cut`` (``cut_all=False``), the tokens
    are joined with single spaces and wrapped in a quoted ``name:"..."``
    clause that is passed to ``WebIndexer.query`` together with the field
    name ``'name'``.

    Errors are printed to stdout rather than raised.

    NOTE(review): the query result is bound to ``res`` but never returned, so
    callers receive ``None``. This looks like a truncated snippet -- confirm
    against the complete source before relying on a return value.
    """
    # Attach the calling thread to the Lucene JVM before using the index.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    print(s)
    indexer = WebIndexer()
    try:
        phrase = ' '.join(jieba.cut(s, cut_all=False))
        res = indexer.query(u'name:"' + phrase + '" ', 'name')
    except Exception as e:
        print(e)
        # Only exceptions that wrap a Java error expose getJavaException();
        # calling it blindly on any other exception would raise
        # AttributeError from inside the handler and hide the real failure.
        get_java_exception = getattr(e, 'getJavaException', None)
        if get_java_exception is not None:
            print(unicode(get_java_exception()))
Esempio n. 2
0
def query(s):
    """Search the ``name`` field of the index for the phrase *s*.

    *s* is tokenised with ``jieba.cut`` (``cut_all=False``); the tokens are
    space-joined and embedded in a quoted ``name:"..."`` query string, which
    is handed to ``WebIndexer.query`` along with the field name ``'name'``.

    Any exception is reported on stdout instead of propagating.

    NOTE(review): the value returned by ``WebIndexer.query`` is stored in
    ``res`` but the function returns nothing. The snippet appears to be cut
    short -- verify against the full source.
    """
    # The current thread must be attached to the JVM before Lucene is used.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    print(s)
    indexer = WebIndexer()
    try:
        tokens = jieba.cut(s, cut_all=False)
        res = indexer.query(u'name:"' + ' '.join(tokens) + '" ', 'name')
    except Exception as e:
        print(e)
        # getJavaException() exists only on exceptions that wrap a Java
        # error; guard the call so other exception types do not trigger an
        # AttributeError inside this handler.
        if hasattr(e, 'getJavaException'):
            print(unicode(e.getJavaException()))
Esempio n. 3
0
def getContent(s):
    """Return the text of HTML input *s* without script, link and style tags.

    The markup is parsed with ``BeautifulSoup`` (``from_encoding='utf-8'``).
    For each of the tag names ``script``, ``link`` and ``style`` in turn,
    every matching element is removed with ``decompose()`` and the tree is
    then rebuilt from its ``prettify()`` output. The text of the final tree
    is returned via ``get_text()``.
    """
    document = BeautifulSoup(s, from_encoding='utf-8')
    for tag_name in ('script', 'link', 'style'):
        for node in document.findAll(tag_name):
            node.decompose()
        # Rebuild the tree from the pretty-printed markup so the next pass
        # (and the final text extraction) works on freshly parsed output.
        document = BeautifulSoup(document.prettify())
    return document.get_text()

if __name__ == '__main__':
    # Feed rows of the ``info`` table (ids above the hard-coded threshold in
    # the query below) from ../info.db into the index, one document per row.
    crawler = WebIndexer()
    conn = sqlite3.connect('../info.db')
    try:
        c = conn.cursor()
        c.execute('select * from info where id>11214893')
        while True:
            # Best-effort indexing: a failure on one row is printed and the
            # loop moves on to the next row instead of aborting the run.
            try:
                raw = c.fetchone()
                if raw is None:
                    # fetchone() returns None once the result set is exhausted.
                    break
                print(raw)
                crawler.add(
                    id_=str(raw[0]),
                    # The name is segmented with jieba and lower-cased for
                    # indexing; the untouched value is kept as ``rawname``.
                    name=' '.join(jieba.cut(raw[1])).lower(),
                    price=str(raw[2]),
                    imgurl=raw[3],
                    author=raw[4].lower(),
                    url=raw[5],
                    category=raw[6].lower(),
                    detail=raw[7].lower(),
                    rawname=raw[1],
                )
            except Exception as e:
                print(unicode(e))
    finally:
        # Release the database handle even if the loop exits via an error.
        conn.close()