Example #1
0
def get_site_id(conn, url):
    """Return the (siteid, name) row for url's domain, inserting it first if absent.

    conn -- open MySQL connection handle (as returned by mysql.connect)
    url  -- full URL; only its network location (domain) is used

    Returns the row tuple from xsite, or None if the insert + re-select
    still finds nothing.
    """
    domain = urlparse.urlparse(url).netloc
    # Escape before splicing into SQL -- the raw interpolation here was an
    # injection / quote-breakage hazard for attacker-controlled URLs.
    safe_domain = conn.escape_string(domain)
    sql = "select siteid,name from xsite where url='%s';" % (safe_domain)
    data = mysql.query_one(conn, sql)
    if not data:
        # Domain not yet registered: insert it, commit, then re-run the
        # same select so the caller gets the freshly created row.
        isql = "insert into xsite(name,url) values('%s','%s');" % (safe_domain, safe_domain)
        mysql.insert(conn, isql)
        mysql.commit(conn)
        data = mysql.query_one(conn, sql)
    return data
Example #2
0
def get_site_id(conn, url):
    """Look up the xsite row for url's domain, creating it on first sight.

    Returns the (siteid, name) tuple from xsite, or None if even the
    insert-then-reselect path yields nothing.
    """
    domain = urlparse.urlparse(url).netloc
    select_sql = "select siteid,name from xsite where url='%s';" % (domain)
    row = mysql.query_one(conn, select_sql)
    if row:
        return row
    # First time this domain is seen: record it, commit, then select
    # again so the caller receives the freshly assigned siteid.
    mysql.insert(conn,
                 "insert into xsite(name,url) values('%s','%s');" % (domain,
                                                                     domain))
    mysql.commit(conn)
    return mysql.query_one(conn, select_sql)
Example #3
0
def get_domainID(handle, cursor, domain, source):
    """Return the DomainID for (source, domain), inserting the row if missing.

    handle -- MySQL connection (used to escape values and commit the insert)
    cursor -- open cursor on the same connection
    domain -- value stored in domain.url
    source -- value stored in domain.name
    """
    # Escape values once up front; raw interpolation was an injection hazard.
    safe_source = handle.escape_string(source)
    safe_domain = handle.escape_string(domain)
    # Single select template, reused after the insert instead of the
    # original's duplicated inline SQL literal.
    sql = "select DomainID from domain where name='%s' and url='%s';" % (safe_source, safe_domain)
    cursor.execute(sql)
    domainID = cursor.fetchone()
    if not domainID:
        cursor.execute("insert into domain(name,url) values('%s','%s');" % (safe_source, safe_domain))
        mysql.commit(handle)
        cursor.execute(sql)
        domainID = cursor.fetchone()
    return domainID[0]
Example #4
0
def get_authorID(handle, cursor, domainID, author):
    """Return the AuthorID for (author, domainID), inserting the row if missing.

    handle   -- MySQL connection (used to escape values and commit)
    cursor   -- open cursor on the same connection
    domainID -- numeric domain id stored in author.domain
    author   -- free-text nickname stored in author.nickname
    """
    # author is free text from scraped pages -- escape it; raw
    # interpolation was an injection / quote-breakage hazard.
    safe_author = handle.escape_string(author)
    # Single select template, reused after the insert instead of the
    # original's duplicated inline SQL literal.
    sql = "select AuthorID from author where nickname='%s' and domain='%s';" % (safe_author, domainID)
    cursor.execute(sql)
    authorID = cursor.fetchone()
    if not authorID:
        cursor.execute("insert into author(nickname,domain) values('%s','%s');" % (safe_author, domainID))
        mysql.commit(handle)
        cursor.execute(sql)
        authorID = cursor.fetchone()
    return authorID[0]
 def fetch_query_results(self, query_string, num=200):
     """Run query_string against the on-disk index, cache up to 200 hits
     in MySQL keyed by (self.keyword, self.site), and return them.

     query_string -- query to execute against the kol_search index
     num          -- max docs requested from the index (default 200)

     Returns a dict {rank: {"userid", "site", "relevance"}} (empty when
     the query matched nothing).
     """
     index_path = "/disk1/kol_search_index/index"
     query_index = QueryEnvironment()
     query_index.addIndex(index_path)
     try:
         # Run the query and decode the raw hits.
         docs = query_index.runQuery(query_string, num)
         results = get_query_results(query_index, docs)
         now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         datas = {}
         rank = 0
         for result in results:
             # Cache at most 200 rows regardless of num (historical cap).
             if rank >= 200:
                 break
             datas[rank] = {
                 "userid": result["userid"],
                 "site": result["site"],
                 "relevance": process_relevance(result["relevance"]),
             }
             rank += 1
         if datas:
             conn = mysql.connect("kol_search")
             cached = conn.escape_string(json.dumps(datas))
             query = str(self.keyword) + "#$#" + str(self.site)
             sql = "insert into search_result_cache(query,result,update_time) values('%s','%s','%s');" % (
                 query,
                 cached,
                 now,
             )
             mysql.insert(conn, sql)
             mysql.commit(conn)
             mysql.close(conn)
     finally:
         # Always release the index handle; the original leaked it
         # whenever the query returned no results.
         query_index.close()
     return datas
Example #6
0
    def process_item(self, item, spider):
        self.total_cnt += 1
        if isinstance(item, PageMetaItem):
            http_code = item['http_code']
            self.file.write(item['url'] + '\n')
            self.redis.add_url(item['url'])
            if http_code >= 200 and http_code < 300:
                self.total += 1
                try:
                    if self.total % self.nums_in_eachDBFile == 0:
                        self.db.closeDb()
                        if os.path.exists(self.db_file):
                            shutil.move(self.db_file, self.dbfile_move_target)
                        else:
                            err = '+++no_db_file:', self.db_file
                            print err
                            log.msg(err, level=log.ERROR)
                        self._createNewDBFile()

                    if item['url'] and item['content']:
                        self._writeDBFile(item)
                except:
                    print '=URL=', item['url'], '=body=', item['content']
                    info = sys.exc_info()
                    print info[0], ":", info[1]
        elif isinstance(item, ReplycountItem):
            url = item['url']
            appnameid = item['appnameid']
            replyCount = item.get('replyCount', 0)
            readnum = item.get('readnum', 0)
            likenum = item.get('likenum', 0)
            unlikenum = item.get('unlikenum', 0)
            playnum = item.get('playnum', 0)
            repostsnum = item.get('repostsnum', 0)
            updatetime = item.get('updatetime', 0)
            sql = 'insert into container(url,appnameid,replyCount,readnum,likenum,unlikenum,playnum,repostsnum,updatetime) values("%s",%s,%s,%s,%s,%s,%s,%s,"%s");'
            sql = sql % (url, appnameid, replyCount, readnum, likenum,
                         unlikenum, playnum, repostsnum, updatetime)
            mysql.insert(self.conn, sql)
            #cursor = self.conn.cursor()
            #cursor.execute(sql)
            mysql.commit(self.conn)
Example #7
0
    def process_item(self, item, spider):
        self.total_cnt += 1
        if isinstance(item, PageMetaItem):
            http_code = item['http_code']
            self.file.write(item['url']+'\n')
            self.redis.add_url(item['url'])
            if http_code >= 200 and http_code < 300:
                self.total += 1
                try:
                    if self.total % self.nums_in_eachDBFile == 0:
                        self.db.closeDb()
                        if os.path.exists(self.db_file):
                            shutil.move(self.db_file,self.dbfile_move_target)
                        else:
                            err = '+++no_db_file:',self.db_file
                            print err
                            log.msg(err,level=log.ERROR)
                        self._createNewDBFile()
 
                    if item['url'] and item['content']:
                        self._writeDBFile(item)
                except:
                    print '=URL=',item['url'],'=body=',item['content']
                    info=sys.exc_info()
                    print info[0],":",info[1]
        elif isinstance(item,ReplycountItem):
            url = item['url']
            appnameid = item['appnameid']
            replyCount = item.get('replyCount',0)
            readnum = item.get('readnum',0) 
            likenum = item.get('likenum',0) 
            unlikenum = item.get('unlikenum',0) 
            playnum = item.get('playnum',0) 
            repostsnum = item.get('repostsnum',0) 
            updatetime =  item.get('updatetime',0)
            sql = 'insert into container(url,appnameid,replyCount,readnum,likenum,unlikenum,playnum,repostsnum,updatetime) values("%s",%s,%s,%s,%s,%s,%s,%s,"%s");'
            sql = sql%(url,appnameid,replyCount,readnum,likenum,unlikenum,playnum,repostsnum,updatetime)
            mysql.insert(self.conn, sql)
	    #cursor = self.conn.cursor()
	    #cursor.execute(sql)
            mysql.commit(self.conn)
 def fetch_query_results(self, query_string, num=200):
     """Query the kol_search index, cache up to 200 results in MySQL keyed
     by (self.keyword, self.site), and return them.

     Returns a dict {rank: {'userid', 'site', 'relevance'}}.
     """
     query_index = QueryEnvironment()
     query_index.addIndex('/disk1/kol_search_index/index')
     # Execute the query and decode the raw hit list.
     docs = query_index.runQuery(query_string, num)
     hits = get_query_results(query_index, docs)
     now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     datas = {}
     for rank, hit in enumerate(hits):
         if rank >= 200:  # historical hard cap on cached rows
             break
         datas[rank] = {
             'userid': hit['userid'],
             'site': hit['site'],
             'relevance': process_relevance(hit['relevance']),
         }
     if datas:
         conn = mysql.connect('kol_search')
         payload = conn.escape_string(json.dumps(datas))
         cache_key = str(self.keyword) + '#$#' + str(self.site)
         sql = ("insert into search_result_cache(query,result,update_time)"
                " values('%s','%s','%s');" % (cache_key, payload, now))
         mysql.insert(conn, sql)
         mysql.commit(conn)
         mysql.close(conn)
         query_index.close()
     return datas
Example #9
0
    # NOTE(review): fragment of a larger ingest loop -- qualified_datas,
    # conn and cursor are defined by the enclosing (unseen) function, and
    # the loop body continues past this excerpt. Indentation mixes literal
    # tabs and spaces; left byte-identical.
    insert_num = 0
    for qualified_data in qualified_datas:
        objectid = qualified_data['objectid']
        date = qualified_data['pubtime'].strftime('%Y-%m-%d')
        # Look up the type-1 facet for this object; [0] raises when the
        # query returns nothing, and such records are skipped.
        facet_sql = 'select id from xfacet where objectid=%s and type=1;'%(objectid)
	try:
            facetid = mysql.query_one(conn, facet_sql)[0]
	except Exception,e:
	    #print e
	    continue
        # Find (or lazily create) the per-facet, per-day xentry row.
        xentry_sql = 'select entryid from xentry where facetid=%s and date="%s";'%(facetid,date)
        xentryid = mysql.query_one(conn, xentry_sql)
        if not xentryid:
            xentry_insert_sql = 'insert into xentry(facetid,date) values(%s,"%s");'%(facetid,date)
            mysql.insert(conn, xentry_insert_sql)
            mysql.commit(conn)
            xentryid = mysql.query_one(conn, xentry_sql)
        try:
            xentryid = xentryid[0]
        except Exception,e:
            print e
            return -1
        # Posts are sharded across 8 xpost tables by entry id.
        xpostnum = xentryid%8
        title,abstract,posttime,url,author,comment_count,click_count,template_type = qualified_data['title'],'',qualified_data['pubtime'],qualified_data['url'],qualified_data['author'],0,0,qualified_data['type']
        # Skip posts already stored for this (entryid, url) pair.
        duplicate_sql = 'select postid from xpost%d where entryid=%d and url="%s";'%(xpostnum,xentryid,conn.escape_string(url))
        cursor.execute(duplicate_sql)
        postid = cursor.fetchall()
        if postid:
	    #print xpostnum,postid
            continue
        sourcetype = SOURCE_TYPE_MAP.get(template_type,4)
Example #10
0
 # NOTE(review): truncated fragment -- interior of the same ingest loop as
 # the previous example. qualified_data, conn, cursor and objectid come
 # from the enclosing (unseen) scope, `continue` / `return -1` target that
 # scope, and the final `if postid:` body is cut off in this excerpt.
 date = qualified_data['pubtime'].strftime('%Y-%m-%d')
 # Look up the type-1 facet for this object; [0] raises when the query
 # returns nothing, and such records are skipped.
 facet_sql = 'select id from xfacet where objectid=%s and type=1;' % (
     objectid)
 try:
     facetid = mysql.query_one(conn, facet_sql)[0]
 except Exception, e:
     #print e
     continue
 # Find (or lazily create) the per-facet, per-day xentry row.
 xentry_sql = 'select entryid from xentry where facetid=%s and date="%s";' % (
     facetid, date)
 xentryid = mysql.query_one(conn, xentry_sql)
 if not xentryid:
     xentry_insert_sql = 'insert into xentry(facetid,date) values(%s,"%s");' % (
         facetid, date)
     mysql.insert(conn, xentry_insert_sql)
     mysql.commit(conn)
     xentryid = mysql.query_one(conn, xentry_sql)
 try:
     xentryid = xentryid[0]
 except Exception, e:
     print e
     return -1
 # Posts are sharded across 8 xpost tables by entry id.
 xpostnum = xentryid % 8
 title, abstract, posttime, url, author, comment_count, click_count, template_type = qualified_data[
     'title'], '', qualified_data['pubtime'], qualified_data[
         'url'], qualified_data['author'], 0, 0, qualified_data['type']
 # Skip posts already stored for this (entryid, url) pair.
 duplicate_sql = 'select postid from xpost%d where entryid=%d and url="%s";' % (
     xpostnum, xentryid, conn.escape_string(url))
 cursor.execute(duplicate_sql)
 postid = cursor.fetchall()
 if postid: