def get_site_id(conn, url):
    domain = urlparse.urlparse(url).netloc
    sql = "select siteid,name from xsite where url='%s';" % (domain)
    data = mysql.query_one(conn, sql)
    if not data:
        isql = "insert into xsite(name,url) values('%s','%s');" % (domain, domain)
        mysql.insert(conn, isql)
        mysql.commit(conn)
        data = mysql.query_one(conn, sql)
    return data
def get_domainID(handle, cursor, domain, source):
    sql = "select DomainID from domain where name='%s' and url='%s';" % (source, domain)
    cursor.execute(sql)
    domainID = cursor.fetchone()
    if not domainID:
        sql = "insert into domain(name,url) values('%s','%s');" % (source, domain)
        cursor.execute(sql)
        mysql.commit(handle)
        sql = "select DomainID from domain where name='%s' and url='%s';" % (source, domain)
        cursor.execute(sql)
        domainID = cursor.fetchone()
    return domainID[0]
def get_authorID(handle, cursor, domainID, author):
    sql = "select AuthorID from author where nickname='%s' and domain='%s';" % (author, domainID)
    # print sql
    cursor.execute(sql)
    authorID = cursor.fetchone()
    if not authorID:
        sql = "insert into author(nickname,domain) values('%s','%s');" % (author, domainID)
        cursor.execute(sql)
        mysql.commit(handle)
        sql = "select AuthorID from author where nickname='%s' and domain='%s';" % (author, domainID)
        cursor.execute(sql)
        authorID = cursor.fetchone()
    return authorID[0]
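# The three helpers above share a select-or-insert-then-select pattern and
# build SQL by string interpolation, which leaves them open to SQL injection
# and quoting bugs. A minimal sketch of the same pattern with bound
# parameters, assuming a raw MySQLdb connection (whether the project's
# mysql.* wrapper supports parameter binding is unknown; the helper name
# below is hypothetical):
import MySQLdb

def get_or_create_domain_id(conn, source, domain):
    cursor = conn.cursor()
    select_sql = "select DomainID from domain where name=%s and url=%s;"
    cursor.execute(select_sql, (source, domain))  # driver escapes the values
    row = cursor.fetchone()
    if not row:
        cursor.execute("insert into domain(name,url) values(%s,%s);",
                       (source, domain))
        conn.commit()
        cursor.execute(select_sql, (source, domain))
        row = cursor.fetchone()
    return row[0]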
def fetch_query_results(self, query_string, num=200):
    index_path = '/disk1/kol_search_index/index'
    query_index = QueryEnvironment()
    query_index.addIndex(index_path)
    # Run the query against the index.
    # print query_string
    docs = query_index.runQuery(query_string, num)
    # Parse the returned documents.
    results = get_query_results(query_index, docs)
    datas = {}
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    flag = 0
    for result in results:
        if flag >= 200:  # cap the cached result set at 200 entries
            break
        data = {}
        userid = result['userid']
        site = result['site']
        relevance = process_relevance(result['relevance'])
        data.update({'userid': userid})
        data.update({'site': site})
        data.update({'relevance': relevance})
        datas.update({flag: data})
        flag += 1
    if datas:
        # Cache the serialized results keyed by keyword + '#$#' + site.
        conn = mysql.connect('kol_search')
        results = json.dumps(datas)
        results = conn.escape_string(results)
        query = str(self.keyword) + '#$#' + str(self.site)
        sql = "insert into search_result_cache(query,result,update_time) values('%s','%s','%s');" % (
            query, results, now)
        # print sql
        mysql.insert(conn, sql)
        mysql.commit(conn)
        mysql.close(conn)
    query_index.close()
    return datas
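# Hypothetical companion to fetch_query_results above: a lookup that reads
# the cached JSON blob back out of search_result_cache. The table name and
# the keyword + '#$#' + site key format come from the insert above; the
# helper name and the use of a raw MySQLdb cursor are assumptions.
import json

def fetch_cached_results(conn, keyword, site):
    cursor = conn.cursor()
    query = str(keyword) + '#$#' + str(site)
    cursor.execute(
        "select result from search_result_cache where query=%s "
        "order by update_time desc limit 1;", (query,))
    row = cursor.fetchone()
    if not row:
        return None  # cache miss; caller should run the index query
    return json.loads(row[0])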
def process_item(self, item, spider):
    self.total_cnt += 1
    if isinstance(item, PageMetaItem):
        http_code = item['http_code']
        self.file.write(item['url'] + '\n')
        self.redis.add_url(item['url'])
        if 200 <= http_code < 300:
            self.total += 1
            try:
                # Rotate the DB file every nums_in_eachDBFile successful pages.
                if self.total % self.nums_in_eachDBFile == 0:
                    self.db.closeDb()
                    if os.path.exists(self.db_file):
                        shutil.move(self.db_file, self.dbfile_move_target)
                    else:
                        err = '+++no_db_file:%s' % self.db_file
                        print err
                        log.msg(err, level=log.ERROR)
                    self._createNewDBFile()
                if item['url'] and item['content']:
                    self._writeDBFile(item)
            except:
                print '=URL=', item['url'], '=body=', item['content']
                info = sys.exc_info()
                print info[0], ":", info[1]
    elif isinstance(item, ReplycountItem):
        url = item['url']
        appnameid = item['appnameid']
        replyCount = item.get('replyCount', 0)
        readnum = item.get('readnum', 0)
        likenum = item.get('likenum', 0)
        unlikenum = item.get('unlikenum', 0)
        playnum = item.get('playnum', 0)
        repostsnum = item.get('repostsnum', 0)
        updatetime = item.get('updatetime', 0)
        sql = 'insert into container(url,appnameid,replyCount,readnum,likenum,unlikenum,playnum,repostsnum,updatetime) values("%s",%s,%s,%s,%s,%s,%s,%s,"%s");'
        sql = sql % (url, appnameid, replyCount, readnum, likenum, unlikenum, playnum, repostsnum, updatetime)
        mysql.insert(self.conn, sql)
        #cursor = self.conn.cursor()
        #cursor.execute(sql)
        mysql.commit(self.conn)
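# process_item above rotates its output DB file every nums_in_eachDBFile
# successful pages: close, move the full file aside, open a fresh one. A
# minimal standalone sketch of that rotation pattern (class and attribute
# names here are hypothetical, not from the pipeline):
import os
import shutil

class RotatingWriter(object):
    def __init__(self, path, target_dir, batch_size):
        self.path = path              # active file being written
        self.target_dir = target_dir  # where full batches are moved
        self.batch_size = batch_size
        self.count = 0
        self.fh = open(self.path, 'w')

    def write(self, record):
        self.count += 1
        if self.count % self.batch_size == 0:
            # Close and archive the full batch, then start a new file.
            self.fh.close()
            if os.path.exists(self.path):
                shutil.move(self.path, self.target_dir)
            self.fh = open(self.path, 'w')
        self.fh.write(record + '\n')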
insert_num = 0
for qualified_data in qualified_datas:
    objectid = qualified_data['objectid']
    date = qualified_data['pubtime'].strftime('%Y-%m-%d')
    facet_sql = 'select id from xfacet where objectid=%s and type=1;' % (objectid)
    try:
        facetid = mysql.query_one(conn, facet_sql)[0]
    except Exception, e:
        # No facet row for this object; skip the record.
        #print e
        continue
    xentry_sql = 'select entryid from xentry where facetid=%s and date="%s";' % (facetid, date)
    xentryid = mysql.query_one(conn, xentry_sql)
    if not xentryid:
        xentry_insert_sql = 'insert into xentry(facetid,date) values(%s,"%s");' % (facetid, date)
        mysql.insert(conn, xentry_insert_sql)
        mysql.commit(conn)
        xentryid = mysql.query_one(conn, xentry_sql)
    try:
        xentryid = xentryid[0]
    except Exception, e:
        print e
        return -1
    # Posts are sharded across xpost0..xpost7 by entryid % 8.
    xpostnum = xentryid % 8
    title, abstract, posttime, url, author, comment_count, click_count, template_type = (
        qualified_data['title'], '', qualified_data['pubtime'], qualified_data['url'],
        qualified_data['author'], 0, 0, qualified_data['type'])
    duplicate_sql = 'select postid from xpost%d where entryid=%d and url="%s";' % (
        xpostnum, xentryid, conn.escape_string(url))
    cursor.execute(duplicate_sql)
    postid = cursor.fetchall()
    if postid:
        # Already inserted; skip duplicates.
        #print xpostnum, postid
        continue
    sourcetype = SOURCE_TYPE_MAP.get(template_type, 4)
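# The dedup loop above shards posts across eight tables, xpost0..xpost7,
# by entryid % 8. A sketch of the matching read path under that assumption;
# the helper name is hypothetical. The table name cannot be a bound
# parameter, so it is formatted in while entryid stays a bound value.
def fetch_posts_for_entry(cursor, xentryid):
    table = 'xpost%d' % (xentryid % 8)  # same shard rule as the writer
    cursor.execute('select postid, url from %s where entryid=%%s;' % table,
                   (xentryid,))
    return cursor.fetchall()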