def basd_info_add(sql):
    try:
        op = OrclPool()
        op.execute_sql(sql)
        print('inserted into the database')
        # print(sql)
    except Exception as e:
        # the empty-batch sentinel is expected to fail; only log real errors
        if sql != 'insert all select 1 from dual':
            export_log({"type": "batch insert sql", "data": sql, "exception": str(e)})
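# Hedged sketch (not part of the original source): basd_info_add appears to receive an
# Oracle multi-row "INSERT ALL ... SELECT 1 FROM DUAL" statement, with
# 'insert all select 1 from dual' acting as the empty-batch sentinel checked above.
# The table and column names below are assumptions made for illustration only.
def build_batch_insert_sql(rows):
    sql = 'insert all'
    for row in rows:
        # one "into ... values (...)" clause per row
        sql += " into BASE_ANALYSIS_SENTIMENT_DETAIL (ID, TITLE, URL) values ('%s', '%s', '%s')" % (
            row['id'], row['title'], row['url'])
    return sql + ' select 1 from dual'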
def update_keywords():
    keywords = []
    op = OrclPool()
    sql = 'select * from BASE_ANALYSIS_SENTIMENT where DICT_ENABLED_VALUE=300010000000001'
    key_list = op.fetch_all(sql)
    for ld in key_list:
        key = {}
        key['id'] = ld[0]
        key['name'] = ld[1]
        key['main_word'] = ld[2]
        key['key_word'] = ld[2].split(',')
        keywords.append(key)
    return keywords
def main(argv):
    http_url = argv[1]
    sql = ''
    try:
        sql = "select ID from BASE_ANALYSIS_SENTIMENT_DETAIL where URL='%s'" % http_url
        op = OrclPool()
        res_check = op.fetch_all(sql)
        if len(res_check) == 0:
            check_httpurl = []
        else:
            check_httpurl = [str(rc[0]) for rc in res_check]
        print(http_url)
        with open('check_httpurl.txt', 'w', encoding='utf8') as fp:
            fp.write(json.dumps(check_httpurl, ensure_ascii=False))
    except Exception as e:
        export_log({"type": "check_sql", "data": sql, "exception": str(e)})
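# Hedged sketch (not in the original source): the duplicate-check result written to
# check_httpurl.txt above can be read back as a JSON list of existing detail IDs,
# e.g. to decide whether a URL is already stored. The consumer side is an assumption.
import json

with open('check_httpurl.txt', 'r', encoding='utf8') as fp:
    existing_ids = json.loads(fp.read())
if len(existing_ids) == 0:
    print('URL not stored yet, safe to insert')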
def getKeylist():
    # fetch the sentiment keywords
    os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
    op = OrclPool()
    sql = "select key_word from BASE_ANALYSIS_SENTIMENT where DICT_ENABLED_VALUE=300010000000001"
    rows = op.fetch_all(sql)
    keylist = []
    for row in rows:
        # the first column of each row holds a comma-separated keyword string
        for key in str(row[0] or '').split(','):
            key = key.strip()
            if key != '':
                keylist.append(key)
    # deduplicate and return a plain list
    return list(set(keylist))
def __init__(self):
    ###############################################################################
    global producer
    # producer = KafkaProducer(bootstrap_servers=['172.16.54.139:6667'])
    producer = KafkaProducer(bootstrap_servers=[
        '172.16.54.139:6667', '172.16.54.140:6667',
        '172.16.54.141:6667', '172.16.54.148:6667'
    ])

    # custom jieba user dictionary --- begin
    op = OrclPool()
    sql = 'select WORD from CONF_WORD'
    lex_list = op.fetch_all(sql)
    lex_str = '$$$$$'
    for lex in lex_list:
        lex_str = '%s\n%s' % (lex_str, lex[0])
    with open('userdict.txt', 'w', encoding='utf8') as fp:
        fp.write(lex_str)
    jieba.load_userdict('userdict.txt')
    # custom jieba user dictionary --- end
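# Hedged sketch: one way the global KafkaProducer created above could be used to publish a
# crawled item. The topic name 'sentiment_detail' is an assumption for illustration;
# kafka-python's send() takes the value as bytes.
import json

def send_item(item):
    producer.send('sentiment_detail',
                  json.dumps(item, ensure_ascii=False).encode('utf8'))
    producer.flush()  # block until the message has actually been delivered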
def update_keywords():
    keywords = {}
    op = OrclPool()
    sql = 'select * from BASE_ANALYSIS_SENTIMENT where DICT_ENABLED_VALUE=300010000000001'
    key_list = op.fetch_all(sql)
    for ld in key_list:
        key = {}
        key['id'] = ld[0]
        key['name'] = ld[1]
        key['main_word'] = ld[2]
        # key['key_word'] = ld[2].split(',')
        # keywords.append(key)
        kws = ld[2].split(',')
        # keyword structure; adjust it to whatever the matching rules need
        for kw in kws:
            kw = kw.lower()
            keywords[kw] = key
    return keywords
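# Hedged sketch: with update_keywords() returning {keyword: sentiment entry}, a simple
# matching rule could look each lower-cased keyword up in a crawled text. match_sentiment
# is illustrative only and not part of the original source.
def match_sentiment(text, keywords):
    text = text.lower()
    for kw, entry in keywords.items():
        if kw and kw in text:
            return entry  # the matching topic's id / name / main_word
    return None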
class SimpleBaiduSpider(scrapy.Spider):
    name = 'sbaidu'
    # content = '户户通'
    default_scope_day = 365  # crawl window (days)
    allowed_domains = ['tieba.baidu.com']
    # start_urls = ['https://tieba.baidu.com/f?kw='+content]
    if (read_json.read_json(name)):
        default_scope_day = 50  # window for the first crawl
    else:
        default_scope_day = 30  # window for incremental crawls

    # fetch the keywords
    os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
    op = OrclPool()
    sql = "select key_word from BASE_ANALYSIS_SENTIMENT where DICT_ENABLED_VALUE=300010000000001"
    list1 = op.fetch_all(sql)
    keylist = []
    for node in list1:
        # the first column of each row holds a comma-separated keyword string
        for key in str(node[0] or '').split(','):
            key = key.strip()
            if key != '':
                keylist.append(key)
    keylist = list(set(keylist))

    # one start URL per keyword
    urlList = []
    for key in keylist:
        urlList.append('https://tieba.baidu.com/f?kw=' + str(key))
    start_urls = urlList

    def parse(self, response):
        # all posts on the current page
        nodelist = response.xpath('//div[@class="col2_right j_threadlist_li_right "]')
        item = BaiduspiderItem()
        isHasContent = False  # does this page contain any post inside the time window?
        NextPageUrl = ''
        for node in nodelist:
            # extract the post fields
            item["title"] = node.xpath("./div[1]/div/a[@title]/text()").extract_first()
            item["UrlId"] = node.xpath("./div[1]/div/a[@href]/@href").extract_first()
            item["info"] = node.xpath(
                './div[2]/div[@class="threadlist_text pull_left"]/div[1]/text()').extract_first()
            item["time"] = node.xpath('./div[1]/div[2]/span[@title="创建时间"]/text()').extract_first()
            item["time"] = TimeCalculate.time_calculate(item["time"], self.name)
            # check whether any post on this page falls within the time window
            if (isHasContent == False):
                isHasContent = TimeMarch.time_March(item["time"], self.default_scope_day)
            # flag whether this particular post falls within the time window
            if (TimeMarch.time_March(item["time"], self.default_scope_day) == True):
                item["IsLimitedTime"] = 'y'
            else:
                item["IsLimitedTime"] = 'n'
            # build the absolute post URL
            childUrl = "https://tieba.baidu.com" + item["UrlId"]
            item["UrlId"] = childUrl
            # handle posts with an empty summary
            if item["info"] == None:
                item["info"] = ''
            else:
                item["info"] = item["info"].strip()  # drop extra whitespace
            item["time"] = item["time"].strip()
            try:
                if (NextPageUrl == ''):
                    # remember the link to the next page
                    NextPageUrl = 'https:' + response.xpath(
                        '//a[@class = "next pagination-item "]/@href').extract_first()
            except Exception:
                # fall back to building the next-page URL from the current one
                url = response.url
                print("url is :----------" + url)
                temp = url.split("kw=")[1]
                keyword = temp.split("&")[0]
                if ("pn=" not in url):
                    NextPageUrl = "https://tieba.baidu.com/f?kw=" + keyword + "&ie=utf-8&pn=50"
                else:
                    page = temp.split("pn=")[1]
                    page = int(page) + 50
                    NextPageUrl = "https://tieba.baidu.com/f?kw=" + keyword + "&ie=utf-8&pn=" + str(page)
            yield item  # hand the item to the pipeline
        if (isHasContent == False):
            # nothing inside the time window on this page: stop crawling
            self.crawler.engine.close_spider(self, 'Finished')
        else:
            yield scrapy.Request(NextPageUrl, callback=self.parse)
            print("moved to the next page!!!!!!!!!!!!!!!!!")
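# Hedged sketch: one common way to launch the spider above without "scrapy crawl sbaidu",
# assuming the standard Scrapy project layout this repo appears to use.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('sbaidu')  # look the spider up by its name
    process.start()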
def run():
    global producer

    # custom jieba user dictionary --- begin
    op = OrclPool()
    sql = 'select WORD from CONF_WORD'
    lex_list = op.fetch_all(sql)
    lex_str = '自定义'
    for lex in lex_list:
        lex_str = '%s\n%s' % (lex_str, lex[0])
    with open('userdict.txt', 'w', encoding='utf8') as fp:
        fp.write(lex_str)
    jieba.load_userdict('userdict.txt')
    # custom jieba user dictionary --- end

    print('==================begin==================')
    ip_list = read_Proxies()
    if len(ip_list) == 0:
        if (get_ip() == False):
            print("could not get a proxy")
            return
    else:
        check_ip_list(ip_list)

    testlist = [{'title': '1', 'info': '11', 'time': '1', 'url': '1'},  # test data
                {'title': '2', 'info': '22', 'time': '1', 'url': '1'},
                {'title': '3', 'info': '33', 'time': '1', 'url': '1'},
                {'title': '4', 'info': '44', 'time': '1', 'url': '1'},
                {'title': '5', 'info': '55', 'time': '1', 'url': '1'}]

    keylist = getKeylist()
    count_art = 0
    for key in keylist:
        # crawl the WeChat accounts for this keyword
        # handle the first run, when the json file has no entry for this keyword yet
        try:
            titleList_key = read_file("./baiduspiderProject_new/baiduspider/jsonfile/sougou.json")[key]
        except Exception:
            titleList_key = []
        page = 1
        title_list = []  # final article list
        isEnd = False  # reached the last page?
        while (1):
            gzhList = []
            print("account %s===================== page %d" % (key, page))
            tempList = GetgzhList(key, page)  # fetch the account list
            try:
                testList = tempList[0]  # used only to check for an empty result
            except Exception:
                isEnd = True
                break
            count_art = count_art + 1
            isEnd = tempList[1]  # reached the last page?
            for gzh in tempList[0]:
                print(gzh)
                if (gzh != None):
                    gzhList.append(gzh)
            page = page + 1
            # #################################
            pageCount = 0
            for gzh in gzhList:
                print(gzh)
                pageCount = pageCount + 1
                # fetch the account's articles
                print('keyword: %s, crawling account ====%s==== articles, page %d, account %d'
                      % (key, gzh, page - 1, pageCount))
                article_list = get_article(gzh, titleList_key)
                if (article_list == False):
                    print('failed, sleeping 5s=============================================')
                    time.sleep(5)  # pause 5s after a failure
                    continue
                else:
                    for article in article_list:
                        # record the article in title_list
                        title_list.append(article['title'] + '/' + article['time'])
            if (isEnd == True):
                break
        # persist the crawl state for this keyword
        tempdic = read_dic("./baiduspiderProject_new/baiduspider/jsonfile/sougou.json")
        tempdic.update({key: title_list})
        write_file("./baiduspiderProject_new/baiduspider/jsonfile/sougou.json", tempdic)
    if count_art == 0:
        writeProxies([])
    print('==================end==================')
def run():
    global producer

    # custom jieba user dictionary --- begin
    op = OrclPool()
    sql = 'select WORD from CONF_WORD'
    lex_list = op.fetch_all(sql)
    lex_str = '$$$$$'
    for lex in lex_list:
        lex_str = '%s\n%s' % (lex_str, lex[0])
    with open('userdict.txt', 'w', encoding='utf8') as fp:
        fp.write(lex_str)
    jieba.load_userdict('userdict.txt')
    # custom jieba user dictionary --- end

    print('==================begin==================')
    ip_list = read_Proxies()
    if len(ip_list) == 0:
        if (get_ip() == False):
            print("can not get IP")
            return
    else:
        check_ip_list(ip_list)

    # testlist = [{'title': '1', 'info': '11', 'time': '1', 'url': '1'},  # test data
    #             {'title': '2', 'info': '22', 'time': '1', 'url': '1'},]

    gzhList = [
        '户户通315行业网站', '户户通微平台', '户户通行业服务中心', '户户通中九卫星用户交流平台',
        '广播电视信息', '广电猎酷', '国家广电智库'
    ]
    count_art = 0
    title_list = []  # final article list
    pageCount = 0
    for gzh in gzhList:
        print(gzh)
        pageCount = pageCount + 1
        # crawl this WeChat account
        # handle the first run, when the json file has no entry for this account yet
        try:
            titleList_key = read_file(
                "./baiduspiderProject_new/baiduspider/jsonfile/sougou.json")[gzh]
        except Exception:
            titleList_key = []
        # fetch the account's articles
        print('scrapy weixin====%s====article' % (gzh))
        article_list = get_article(gzh, titleList_key)
        if (article_list == False):
            print('failed, sleeping 5s=============================================')
            time.sleep(5)  # pause 5s after a failure
            continue
        else:
            title_list = article_list
        time.sleep(20)
        # persist the crawl state for this account
        tempdic = read_dic(
            "./baiduspiderProject_new/baiduspider/jsonfile/sougou.json")
        tempdic.update({gzh: title_list})
        write_file("./baiduspiderProject_new/baiduspider/jsonfile/sougou.json", tempdic)
    print('==================end==================')
sql = "" with open('update_basd_sql.txt','r') as fp: sql = fp.read() try: # sql = "delete from BASE_ANALYSIS_SENTIMENT_DETAIL where ID in ('111','122')" ID_list_str = sql.split('(')[1].split(')')[0] ID_list = ID_list_str.split(',') count = 0 ID_list_part = [] bsql = '' length = len(ID_list) for bid in ID_list: count = count + 1 ID_list_part.append(str(bid)) if count % 100 == 0: bsql = 'delete from BASE_ANALYSIS_SENTIMENT_DETAIL where ID in (%s)'%(','.join(ID_list_part)) ID_list_part = [] op = OrclPool() op.execute_sql(bsql) bsql = '' elif count == length-1: bsql = 'delete from BASE_ANALYSIS_SENTIMENT_DETAIL where ID in (%s)'%(','.join(ID_list_part)) ID_list_part = [] op = OrclPool() op.execute_sql(bsql) bsql = '' except Exception as e: export_log({"type":"update_sql","data":sql,"exception":str(e)})