Example #1
    def getDatas(self, timestampes):
        # Fetch up to 100 news-flash items from bishijie.com, paged by timestamp.
        try:
            url = 'http://www.bishijie.com/api/news/?size=100&timestamp=' + str(
                timestampes)
            response_result = urllib.request.urlopen(url).read()
            tmp = json.loads(response_result)
        except Exception:
            return None

        # Today's items sit under data -> <today's date> -> 'buttom'.
        try:
            all_div = tmp['data'][utils.gettoday()]['buttom']
        except KeyError:
            return None

        for item in all_div:
            newsId = item['newsflash_id']
            newsTime = bishijie.getTimeFromStampe(item['issue_time'])
            if newsId in self.ids:
                # Already saved in an earlier pass.
                continue

            if newsTime.split(" ")[0] != utils.gettoday():
                # Reached items older than today: stop paging.
                return None
            content = self.processContent(item)
            newsurl = item['link']
            source = "币世界"
            insertStr = "{},\"{}\",\"{}\",\"{}\",\"{}\"".format(
                newsId, newsTime, content, newsurl, source)
            self.dao.saveInfo(tableName=utils.kuaixun_tbl,
                              columesName=utils.kuaixun_columes,
                              values=insertStr)
        # Assumption: return the last item's timestamp so update() can request the next page.
        return item['issue_time']
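Example #1 converts each item's issue_time with bishijie.getTimeFromStampe before comparing its date prefix to utils.gettoday(). That helper is not shown here; a minimal sketch, assuming issue_time is a Unix timestamp in seconds and the expected result is a 'YYYY-MM-DD HH:MM:SS' string:

    import time

    def getTimeFromStampe(stampe):
        # Assumed helper (not the original): format a Unix timestamp in seconds
        # as 'YYYY-MM-DD HH:MM:SS', so split(" ")[0] yields the date part.
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(stampe)))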
Example #2
    def update(self):
        # Ids already stored today for this source, used to skip duplicates.
        self.ids = self.dao.getIdsBySource(utils.kuaixun_tbl, utils.gettoday(),
                                           "金色财经")
        indexId = self.getDatas(0)
        while indexId is not None:
            indexId = self.getDatas(indexId)
            time.sleep(5)
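Every example keeps only today's items by comparing the date half of a 'YYYY-MM-DD HH:MM:SS' string with utils.gettoday(). The helper itself is not shown; a minimal sketch, assuming it returns the current local date in that format:

    import datetime

    def gettoday():
        # Assumed helper (not the original): today's local date as 'YYYY-MM-DD',
        # matching the prefix of timestamps such as '2018-05-01 12:34:56'.
        return datetime.datetime.now().strftime('%Y-%m-%d')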
Example #3
    def flush(self, index):
        cnblogs = self.requestCnblogs(index)
        if cnblogs is None:
            return None
        soup = BeautifulSoup(cnblogs, 'html.parser')
        all_div = soup.find_all('div', attrs={'class': 'list-art clear'})

        for item in all_div:
            # processContent() collapses the item into ';'-separated fields,
            # read back below as title, author, time and hots.
            content = self.processContent(item)
            it = content.split(";")
            if it[0] in self.titles:
                # Title already stored today.
                continue
            newstime = re.sub("/", "-", it[2])
            if newstime.split(" ")[0] != utils.gettoday():
                # Older than today: stop.
                return None
            print(content)
            print('http://www.qukuaiwang.com.cn' + item.a['href'])
            title = it[0]
            author = it[1]
            hots = it[3]
            img = self.baseUrl + item.img['src']
            newsuri = 'http://www.qukuaiwang.com.cn' + item.a['href']
            insertStr = "\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",{}".format(
                title, author, newstime, newsuri, img, hots)
            self.dao.saveInfo(utils.qukuaiwang_tbl, utils.qukuaiwang_columes,
                              insertStr)
        time.sleep(5)
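Example #3 relies on self.processContent(item) to flatten a list item into one ';'-separated string whose fields are read back as title, author, time and hots. Example #7 below inlines the same transformation, so a sketch consistent with it (an assumption, not the original method) would be:

    import re

    def processContent(self, item):
        # Collapse the div's visible text into a single ';'-separated line
        # (title;author;time;hots), mirroring the inline code in Example #7.
        content = item.text.strip()
        return re.sub("\n+", ";", content)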
Example #4
    def update(self):
        # Ids already stored today for this source, used to skip duplicates.
        self.ids = self.dao.getIdsBySource(utils.kuaixun_tbl, utils.gettoday(),
                                           "币世界")
        timest = self.getDatas("")
        while timest is not None:
            timest = self.getDatas(timest)
            time.sleep(5)
Example #5
    def getDatas(self, index):
        # Page down through jinse.com's weibo feed, starting after the given id.
        url = 'http://www.jinse.com/ajax/weibo/getList?flag=down&id=' + str(
            index)
        try:
            response_result = urllib.request.urlopen(url).read()
            tmp = json.loads(response_result)
        except Exception:
            return None

        try:
            all_div = tmp['data']
        except KeyError:
            return None
        for item in all_div:
            infoId = item['id']
            if infoId in self.ids:
                # Already saved in an earlier pass.
                continue
            infoDatetime = item['created_at']
            name = item['source_uri']
            if infoDatetime.split(" ")[0] != utils.gettoday():
                # Older than today: stop paging.
                return None
            content = self.processContent(item)
            source = "weibo"
            insertStr = "{},\"{}\",\"{}\",\"{}\",\"{}\"".format(
                infoId, name, infoDatetime, content.strip('\n'), source)
            self.dao.saveInfo(tableName=utils.weibo_tbl,
                              columesName=utils.weibo_columes,
                              values=insertStr)
        # Return the last item's id so update() can request the next page.
        return int(item['id'])
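Examples #5, #6 and #10 also call self.processContent(item), this time on jinse.com feed items, and its body is not shown either. Example #8 cleans the raw content field inline, so a sketch along the same lines (an assumption, not the original method) would be:

    import re

    def processContent(self, item):
        # Assumed: strip HTML tags, escape double quotes and drop newlines from
        # the item's raw 'content' field, as Example #8 does inline.
        content = item['content']
        content = re.sub(r'<[^<]+>', "", content)
        content = re.sub(r'\"', "\\\"", content)
        return re.sub('\n', "", content)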
Example #6
    def getDatas(self, index):
        # Page down through jinse.com's twitter feed, starting after the given id.
        url = 'http://www.jinse.com/ajax/twitters/getList?flag=down&id=' + str(
            index)
        try:
            response_result = urllib.request.urlopen(url).read()
            tmp = json.loads(response_result)
        except Exception:
            return None
        # Loop over the returned items and extract the details.
        try:
            all_div = tmp['data']
        except KeyError:
            return None
        for item in all_div:
            infoId = item['id']
            if infoId in self.ids:
                # Already saved in an earlier pass.
                continue
            name = item['source_uri']
            infoDatetime = item['published_at']
            if infoDatetime.split(" ")[0] != utils.gettoday():
                # Older than today: stop paging.
                return None
            content = self.processContent(item)
            if not content or not content.strip():
                # Skip tweets whose extracted content is empty.
                continue
            source = "twitter"
            insertStr = "{},\"{}\",\"{}\",\"{}\",\"{}\"".format(
                infoId, name, infoDatetime, content, source)
            print(insertStr)
            self.dao.saveInfo(tableName=utils.twitter_tbl,
                              columesName=utils.twitter_columes,
                              values=insertStr)
        # Return the last item's id so update() can request the next page.
        return int(item['id'])
Example #7
    def flush(self, index):
        cnblogs = self.requestCnblogs(index)
        if cnblogs is None:
            return None
        soup = BeautifulSoup(cnblogs, 'html.parser')
        all_div = soup.find_all('div', attrs={'class': 'list-art clear'})

        for item in all_div:
            # Collapse the item's text into ';'-separated fields: title, author, time, ...
            content = item.text.strip()
            content = re.sub("\n+", ";", content)
            it = content.split(";")
            if it[0] in self.titles:
                # Title already stored today.
                continue
            newstime = re.sub("/", "-", it[2])
            if newstime.split(" ")[0] != utils.gettoday():
                # Older than today: stop.
                return None
            print(content)
            print('http://www.qukuaiwang.com.cn' + item.a['href'])
            title = it[0]
            author = it[1]
            newsuri = 'http://www.qukuaiwang.com.cn' + item.a['href']
            insertStr = "\"{}\",\"{}\",\"{}\",\"{}\"".format(title, author, newstime, newsuri)
            self.dao.saveInfo("tbl_qukuaiwang", "title,author,newstime,newsuri", insertStr)
        time.sleep(5)
Example #8
    def getDatas(self, index):
        # Page down through jinse.com's live news feed, starting after the given id.
        try:
            url = 'http://www.jinse.com/ajax/lives/getList?search=&id=' + str(
                index) + '&flag=down'
            response_result = urllib.request.urlopen(url).read()
            tmp = json.loads(response_result)
        except Exception as e:
            print("request failed:", e)
            return None
        try:
            all_div = tmp['data'][utils.gettoday()]
        except KeyError:
            return None

        for item in all_div:
            if item['day_name'] == '今天':  # only keep items the API marks as today's
                infoId = item['id']
                sourceurl = item['source_url']
                infoDatetime = item['publish_time']

                # Fall back to today's date plus the created_at clock time when
                # publish_time is missing or obviously invalid.
                if infoDatetime is None or infoDatetime == "0000-00-00 00:00:00":
                    infoDatetime = utils.gettoday() + " " + item['created_at'] + ":00"
                if infoDatetime.split(" ")[0] != utils.gettoday():
                    print(infoDatetime)
                    infoDatetime = utils.gettoday() + " " + item['created_at'] + ":00"
                if infoId in self.ids:
                    continue
                # Strip HTML tags, escape double quotes and drop newlines before saving.
                content = item['content']
                content = re.sub(r'<[^<]+>', "", content)
                content = re.sub(r'\"', "\\\"", content)
                content = re.sub('\n', "", content)
                print(content)
                insertStr = "{},\"{}\",\"{}\",\"{}\",\"金色财经\"".format(
                    infoId, infoDatetime, content, sourceurl)
                print(insertStr)
                self.dao.saveInfo(tableName=utils.kuaixun_tbl,
                                  columesName=utils.kuaixun_columes,
                                  values=insertStr)
            else:
                # Reached items from a previous day: stop paging.
                return None
        return item['id']
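All of the examples hand dao.saveInfo a table name, a comma-separated column list, and a pre-quoted value string, which is why Example #8 escapes double quotes inside content. The DAO itself is not shown; a minimal sketch of what such a saveInfo might do, assuming a DB-API connection on self.conn:

    def saveInfo(self, tableName, columesName, values):
        # Assumed DAO method: splice the pre-escaped values into a plain INSERT,
        # e.g. INSERT INTO tbl (c1,c2) VALUES (1,"x").
        sql = "INSERT INTO {} ({}) VALUES ({})".format(tableName, columesName, values)
        cursor = self.conn.cursor()
        cursor.execute(sql)
        self.conn.commit()

String interpolation like this depends entirely on the callers escaping quotes correctly; parameterized queries would be safer, but the escaping in the examples suggests the values string is consumed verbatim.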
Example #9
    def getTitles(self):
        # Today's already-stored titles for the qukuaiwang table.
        return self.dao.getData(utils.qukuaiwang_tbl, "title",
                                utils.gettoday())
Example #10
    def getDatas(self, index):
        # Page down through jinse.com's weibo feed, starting after the given id.
        url = 'http://www.jinse.com/ajax/weibo/getList?flag=down&id=' + str(
            index)
        try:
            # Send a desktop-browser User-Agent so the endpoint serves the request.
            headers = {
                'User-Agent':
                r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
                'Connection': 'keep-alive'
            }
            req = urllib.request.Request(url, headers=headers)
            response_result = urllib.request.urlopen(req).read()
            tmp = json.loads(response_result)
        except Exception:
            return None

        try:
            all_div = tmp['data']
        except KeyError:
            return None
        for item in all_div:
            infoId = item['id']
            # Collect the post's own image urls plus any retweeted image urls.
            contentImg = item['image_urls']
            retweetedImg = item['retweeted_image_urls']
            contentImgs = ""
            for img in contentImg:
                contentImgs += img['url'] + ","
            for img in retweetedImg:
                contentImgs += img['url'] + ","

            headImg = item['user']['avatar']
            if infoId in self.ids:
                # Already saved in an earlier pass.
                continue
            infoDatetime = item['created_at']
            name = item['source_uri']
            if infoDatetime.split(" ")[0] != utils.gettoday():
                # Older than today: stop paging.
                return None
            content = self.processContent(item)
            if item['retweeted_content']:
                content = content + "," + item['retweeted_content']
            if len(content.strip()) == 0:
                # Skip posts whose extracted content is empty.
                continue
            source = "weibo"
            insertStr = "{},\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\"".format(
                infoId, name, infoDatetime, content.strip('\n'), source,
                headImg, contentImgs)
            self.dao.saveInfo(tableName=utils.weibo_tbl,
                              columesName=utils.weibo_columes,
                              values=insertStr)
        # Return the last item's id so update() can request the next page.
        return int(item['id'])
Example #11
    def update(self):
        # Ids already stored today in the weibo table, used to skip duplicates.
        self.ids = self.dao.getIds("tbl_weibo", utils.gettoday())
        indexId = self.getDatas(0)
        while indexId is not None:
            indexId = self.getDatas(indexId)
            time.sleep(5)
Example #12
    def getTitles(self):
        return self.dao.getData("tbl_qukuaiwang", "title", utils.gettoday())