Example #1
def basd_info_add(sql):
    try:
        op = OrclPool()
        op.execute_sql(sql)
        print('inserted into database')
        # print(sql)
    except Exception as e:
        if sql != 'insert all select 1 from dual':  # skip logging for the empty-batch placeholder statement
            export_log({"type": "批量插入sql", "data": sql, "exception": str(e)})
Example #2
def update_keywords():
    keywords = []
    op = OrclPool()
    sql = 'select * from BASE_ANALYSIS_SENTIMENT where DICT_ENABLED_VALUE=300010000000001'
    key_list = op.fetch_all(sql)
    for ld in key_list:
        key = {}
        key['id'] = ld[0]
        key['name'] = ld[1]
        key['main_word'] = ld[2]
        key['key_word'] = ld[2].split(',')
        keywords.append(key)
    return keywords
def main(argv):
    http_url = argv[1]
    try:
        sql = "select ID from BASE_ANALYSIS_SENTIMENT_DETAIL where URL='%s'" % http_url
        op = OrclPool()
        res_check = op.fetch_all(sql)
        check_httpurl = []
        if res_check:
            check_httpurl = [str(rc[0]) for rc in res_check]
            print(http_url)
        with open('check_httpurl.txt', 'w', encoding='utf8') as fp:
            fp.write(json.dumps(check_httpurl, ensure_ascii=False))
    except Exception as e:
        export_log({"type": "check_sql", "data": sql, "exception": str(e)})
def getKeylist():
    # fetch the keywords
    os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
    op = OrclPool()
    sql = "select key_word from BASE_ANALYSIS_SENTIMENT where DICT_ENABLED_VALUE=300010000000001"
    list1 = op.fetch_all(sql)
    keylist = []
    for node in list1:
        # each row is a 1-tuple like ('word1,word2',); strip the tuple formatting and split
        temp1 = str(node).replace("'", '')
        temp2 = temp1.replace("(", "")
        temp3 = temp2.replace(")", "")
        temp4 = temp3.split(",")
        for key in temp4:
            if key != '':
                keylist.append(key)
    keylist = set(keylist)
    keylist = list(keylist)
    return keylist
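`getKeylist` turns each fetched row into a string and strips the tuple punctuation by hand. Since `fetch_all` returns plain tuples, the same keyword set can be built by splitting the column value directly; a sketch, assuming KEY_WORD holds a comma-separated string:

# Sketch: equivalent keyword extraction without round-tripping through str(tuple).
def get_keylist_direct(rows):
    keylist = set()
    for (key_word,) in rows:  # each row is a 1-tuple: ('word1,word2,...',)
        for key in (key_word or '').split(','):
            if key:
                keylist.add(key)
    return list(keylist)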
Example #5
 def __init__(self):
     ###############################################################################
     global producer
     # producer = KafkaProducer(bootstrap_servers=['172.16.54.139:6667'])
     producer = KafkaProducer(bootstrap_servers=[
         '172.16.54.139:6667', '172.16.54.140:6667', '172.16.54.141:6667',
         '172.16.54.148:6667'
     ])
     # custom segmentation lexicon --- begin
     op = OrclPool()
     sql = 'select WORD from CONF_WORD'
     lex_list = op.fetch_all(sql)
     lex_str = '$$$$$'
     for lex in lex_list:
         lex_str = '%s\n%s' % (lex_str, lex[0])
     with open('userdict.txt', 'w', encoding='utf8') as fp:
         fp.write(lex_str)
     jieba.load_userdict('userdict.txt')
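This constructor only prepares the Kafka producer and the jieba user dictionary; publishing happens elsewhere in the project. With kafka-python, sending a crawled record would look roughly like this (the topic name is a placeholder, not taken from the project):

# Sketch: publishing one record with the kafka-python producer created above.
# 'sentiment_detail' is a placeholder topic name.
import json

record = {'title': 'example', 'url': 'https://example.com', 'time': '2019-01-01'}
producer.send('sentiment_detail', json.dumps(record, ensure_ascii=False).encode('utf8'))
producer.flush()  # block until the message has actually been delivered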
Example #6
def update_keywords():
    keywords = {}
    op = OrclPool()
    sql = 'select * from BASE_ANALYSIS_SENTIMENT where DICT_ENABLED_VALUE=300010000000001'
    key_list = op.fetch_all(sql)
    for ld in key_list:
        key = {}
        key['id'] = ld[0]
        key['name'] = ld[1]
        key['main_word'] = ld[2]
        # key['key_word'] = ld[2].split(',')
        # keywords.append(key)
        kws = ld[2].split(',')
        # keyword structure; adjust it according to the matching rules
        for kw in kws:
            kw = kw.lower()
            keywords[kw] = key
    return keywords
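This variant returns a dict keyed by the lowercased keyword, so matching a text against all keywords becomes a simple membership test; a sketch of how such a dict might be used:

# Sketch: matching a piece of text against the keyword dict built above.
def match_keywords(text, keywords):
    text = text.lower()
    # return the sentiment records whose keyword occurs in the text
    return [rec for kw, rec in keywords.items() if kw in text]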
Example #7
class SimpleBaiduSpider(scrapy.Spider):
    name = 'sbaidu'
    #content = '户户通'
    default_scope_day = 365  # crawl window in days
    allowed_domains = ['tieba.baidu.com']
    #start_urls = ['https://tieba.baidu.com/f?kw='+content]
    if (read_json.read_json(name)):
        default_scope_day = 50  # window for the first crawl
    else:
        default_scope_day = 30  # window for incremental crawls

    # fetch the keywords
    os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
    op = OrclPool()
    sql = "select key_word from BASE_ANALYSIS_SENTIMENT where DICT_ENABLED_VALUE=300010000000001"
    list1 = op.fetch_all(sql)
    keylist = []
    for node in list1:
        # each row is a 1-tuple like ('word1,word2',); strip the tuple formatting and split
        temp1 = str(node).replace("'", '')
        temp2 = temp1.replace("(", "")
        temp3 = temp2.replace(")", "")
        temp4 = temp3.split(",")
        for key in temp4:
            if key != '':
                keylist.append(key)
    keylist = set(keylist)
    keylist = list(keylist)
    urlList = []
    for key in keylist:
        urlList.append('https://tieba.baidu.com/f?kw=' + str(key))
    # use the assembled URLs as the spider's start URLs
    start_urls = urlList

    def parse(self, response):
        nodelist = response.xpath(
            '//div[@class="col2_right j_threadlist_li_right "]')  # all posts on this page
        item = BaiduspiderItem()
        isHasContent = False  # whether this page contains any post within the time window
        NextPageUrl = ''
        for node in nodelist:  # parse each post's info
            item["title"] = node.xpath(
                "./div[1]/div/a[@title]/text()").extract_first()
            item["UrlId"] = node.xpath(
                "./div[1]/div/a[@href]/@href").extract_first()
            item["info"] = node.xpath(
                './div[2]/div[@class="threadlist_text pull_left"]/div[1]/text()'
            ).extract_first()
            item["time"] = node.xpath(
                './div[1]/div[2]/span[@title="创建时间"]/text()').extract_first()
            item["time"] = item["time"] = TimeCalculate.time_calculate(
                item["time"], self.name)
            # 判断一页中是否有符合年限的帖子
            if (isHasContent == False):
                isHasContent = TimeMarch.time_March(item["time"],
                                                    self.default_scope_day)
            # 判断这个帖子是否符合时间
            if (TimeMarch.time_March(item["time"],
                                     self.default_scope_day) == True):
                item["IsLimitedTime"] = 'y'
            else:
                item["IsLimitedTime"] = 'n'
            # build the full post URL
            childUrl = "https://tieba.baidu.com" + item["UrlId"]
            item["UrlId"] = childUrl
            # handle an empty summary
            if item["info"] is None:
                item["info"] = ''
            else:
                item["info"] = item["info"].strip()  # strip extra whitespace
            item["time"] = item["time"].strip()
            try:
                if (NextPageUrl == ''):  # record the next-page link
                    NextPageUrl = 'https:' + response.xpath(
                        '//a[@class = "next pagination-item "]/@href'
                    ).extract_first()
            except:
                url = response.url
                print("url is :----------" + url)
                temp = url.split("kw=")[1]
                keyword = temp.split("&")[0]
                if ("page" not in url):
                    NextPageUrl = "https://tieba.baidu.com/f?kw=" + keyword + "&ie=utf-8&pn=50"
                else:
                    page = temp.split("pn=")[1]
                    page = int(page) + 50
                    NextPageUrl = "https://tieba.baidu.com/f?kw=" + keyword + "&ie=utf-8&pn=" + str(
                        page)

            yield item  # hand the item to the pipeline
        if (isHasContent == False):  # decide whether to keep crawling or stop
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
        else:
            yield scrapy.Request(NextPageUrl, callback=self.parse)
            print("翻页了!!!!!!!!!!!!!!!!!")
def run():
    global producer
    # custom segmentation lexicon --- begin
    op = OrclPool()
    sql = 'select WORD from CONF_WORD'
    lex_list = op.fetch_all(sql)
    lex_str = '自定义'
    for lex in lex_list:
        lex_str = '%s\n%s' % (lex_str, lex[0])
    with open('userdict.txt', 'w', encoding='utf8') as fp:
        fp.write(lex_str)
    jieba.load_userdict('userdict.txt')
    # custom segmentation lexicon --- end
    print('==================begin==================')
    ip_list = read_Proxies()
    if len(ip_list) == 0:
        if (get_ip() == False):
            print("unable to obtain a proxy")
            return
    else:
        check_ip_list(ip_list)

    testlist = [{'title': '1', 'info': '11', 'time': '1', 'url': '1'},  # test data
                {'title': '2', 'info': '22', 'time': '1', 'url': '1'},
                {'title': '3', 'info': '33', 'time': '1', 'url': '1'},
                {'title': '4', 'info': '44', 'time': '1', 'url': '1'},
                {'title': '5', 'info': '55', 'time': '1', 'url': '1'}]
    keylist = getKeylist()
    count_art = 0
    for key in keylist:
        # handle this WeChat official account
        # on the first run the key is not in the JSON file yet, so fall back to an empty list
        try:
            titleList_key = read_file("./baiduspiderProject_new/baiduspider/jsonfile/sougou.json")[key]
        except:
            titleList_key = []
        page = 1
        title_list = []  # final article list
        isEnd = False  # whether the last page has been reached
        while (1):
            gzhList = []
            print("official account %s ===================== page %d" % (key, page))
            tempList = GetgzhList(key, page)  # get the list of official accounts
            try:
                testList = tempList[0]  # used to check whether the result is empty
            except:
                isEnd = True
                break
            count_art = count_art + 1
            isEnd = tempList[1]  # whether the last page has been reached
            for gzh in tempList[0]:
                print(gzh)
                if (gzh != None):
                    gzhList.append(gzh)
            page = page + 1

            # #################################
            pageCount = 0
            for gzh in gzhList:
                print(gzh)
                pageCount = pageCount+1
                # fetch this account's articles
                print('keyword: %s, crawling account ====%s==== articles, page %d, item %d' % (key, gzh, page - 1, pageCount))
                article_list = get_article(gzh, titleList_key)
                if (article_list == False):
                    print('failed, pausing 5s =============================================')
                    time.sleep(5)  # pause 5s on failure
                    continue
                else:
                    for article in article_list:  # store the articles into title_list
                        title_list.append(article['title'] + '/' + article['time'])

            if (isEnd == True): break
        # record this keyword's data in the dict file
        tempdic = read_dic("./baiduspiderProject_new/baiduspider/jsonfile/sougou.json")
        tempdic.update({key: title_list})
        write_file("./baiduspiderProject_new/baiduspider/jsonfile/sougou.json", tempdic)
    if count_art==0:
        writeProxies([])
    print('==================end==================')
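`run` keeps its progress in sougou.json through `read_file`, `read_dic` and `write_file`, none of which are shown here; from the call sites they look like thin JSON-file helpers. A sketch under that assumption (the bodies below are guesses, not the project's actual code):

# Sketch of the assumed JSON helpers behind sougou.json.
import json
import os

def read_dic(path):
    # return the stored dict, or an empty one if the file does not exist yet
    if not os.path.exists(path):
        return {}
    with open(path, 'r', encoding='utf8') as fp:
        return json.load(fp)

def read_file(path):
    return read_dic(path)

def write_file(path, data):
    with open(path, 'w', encoding='utf8') as fp:
        fp.write(json.dumps(data, ensure_ascii=False))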
def run():
    global producer
    # custom segmentation lexicon --- begin
    op = OrclPool()
    sql = 'select WORD from CONF_WORD'
    lex_list = op.fetch_all(sql)
    lex_str = '$$$$$'
    for lex in lex_list:
        lex_str = '%s\n%s' % (lex_str, lex[0])
    with open('userdict.txt', 'w', encoding='utf8') as fp:
        fp.write(lex_str)
    jieba.load_userdict('userdict.txt')
    # custom segmentation lexicon --- end
    print('==================begin==================')
    ip_list = read_Proxies()
    if len(ip_list) == 0:
        if (get_ip() == False):
            print("can not get IP")
            return
    else:
        check_ip_list(ip_list)

    # testlist = [{'title': '1', 'info': '11', 'time': '1', 'url': '1'},  # test data
    #             {'title': '2', 'info': '22', 'time': '1', 'url': '1'},]
    gzhList = [
        '户户通315行业网站', '户户通微平台', '户户通行业服务中心', '户户通中九卫星用户交流平台', '广播电视信息', '广电猎酷',
        '国家广电智库'
    ]
    count_art = 0

    title_list = []  # final article list
    pageCount = 0
    for gzh in gzhList:
        print(gzh)
        pageCount = pageCount + 1
        # handle this WeChat official account
        # on the first run the account is not in the JSON file yet, so fall back to an empty list
        try:
            titleList_key = read_file(
                "./baiduspiderProject_new/baiduspider/jsonfile/sougou.json"
            )[gzh]
        except:
            titleList_key = []
        # fetch this account's articles
        print('scrapy weixin====%s====article' % (gzh))
        article_list = get_article(gzh, titleList_key)
        if (article_list == False):
            print(
                'failed time sleep 5s============================================='
            )
            time.sleep(5)  # pause 5s on failure
            continue
        else:
            title_list = article_list
        time.sleep(20)

    # record the data in the dict file
    tempdic = read_dic(
        "./baiduspiderProject_new/baiduspider/jsonfile/sougou.json")
    tempdic.update({gzh: title_list})
    write_file("./baiduspiderProject_new/baiduspider/jsonfile/sougou.json",
               tempdic)
    print('==================end==================')
Example #10
sql = ""
with open('update_basd_sql.txt','r') as fp:
	sql = fp.read()

try:
	# sql = "delete from BASE_ANALYSIS_SENTIMENT_DETAIL where ID in ('111','122')"
	ID_list_str = sql.split('(')[1].split(')')[0]
	ID_list = ID_list_str.split(',')
	count = 0
	ID_list_part = []
	bsql = ''
	length = len(ID_list)
	for bid in ID_list:
		count = count + 1
		ID_list_part.append(str(bid))
		if count % 100 == 0:
			bsql = 'delete from BASE_ANALYSIS_SENTIMENT_DETAIL where ID in (%s)'%(','.join(ID_list_part))
			ID_list_part = []
			op = OrclPool()
			op.execute_sql(bsql)
			bsql = ''
		elif count == length:  # flush the remaining IDs at the end
			bsql = 'delete from BASE_ANALYSIS_SENTIMENT_DETAIL where ID in (%s)'%(','.join(ID_list_part))
			ID_list_part = []
			op = OrclPool()
			op.execute_sql(bsql)
			bsql = ''
except Exception as e:
	export_log({"type":"update_sql","data":sql,"exception":str(e)})