def __init__(self,
              minutes,
              isDaemon=False,
              logger=LoggerUtil.getFileLogger()):
     BaseExecuter.__init__(self, logger)
     self.minutes = int(minutes)
     self.thread = threading.Thread(name=self.__class__.__name__,
                                    target=self.proc)
     self.thread.setDaemon(isDaemon)
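
The constructor above wires the thread to self.proc, but that method is not shown in the snippet; a minimal sketch of what such a periodic worker could look like (the stop event and the execute() hook below are assumptions, not part of the original class):

import threading

class PeriodicExecuterSketch:
    """Hypothetical companion to the constructor above: run execute() every `minutes` minutes."""

    def __init__(self, minutes, isDaemon=False, logger=None):
        self.logger = logger
        self.minutes = int(minutes)
        self._stop = threading.Event()
        self.thread = threading.Thread(name=self.__class__.__name__, target=self.proc)
        self.thread.daemon = isDaemon

    def start(self):
        self.thread.start()

    def stop(self):
        self._stop.set()

    def proc(self):
        # Event.wait() doubles as an interruptible sleep, so stop() takes effect quickly.
        while not self._stop.wait(self.minutes * 60):
            self.execute()

    def execute(self):
        # A subclass would override this with the actual periodic work.
        raise NotImplementedError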
Example No. 2
 def __init__(self, logger = LoggerUtil.getFileLogger()):
     BaseExecuter.__init__(self, logger)
     self.connection = ParseConnection()
     self.track = []
     self.follow = []
     self.twitterIdInfo = {}
     self.hashtagAlbumIdMap = {}
     self.running = threading.Event()
     self.thread = threading.Thread(target = self.main)
Example No. 3
 def __init__(self, connection, logger=LoggerUtil.getFileLogger()):
     self.connection = connection
     self.logger = logger
Example No. 4
 def debugLog(self, message):
     LoggerUtil.encodedLog(self.logger.debug, message)
Example No. 5
 def warnLog(self, message):
     LoggerUtil.encodedLog(self.logger.warn, message)
Example No. 6
 def errorLog(self, message):
     LoggerUtil.encodedLog(self.logger.error, message)
Example No. 7
 def infoLog(self, message):
     LoggerUtil.encodedLog(self.logger.info, message)
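
debugLog/warnLog/errorLog/infoLog all funnel through LoggerUtil.encodedLog, whose body is not shown in these examples; a plausible minimal version, assuming its only job is to coerce the message into a cleanly encodable str before calling the bound logging method:

class LoggerUtil:
    @staticmethod
    def encodedLog(logFunc, message):
        # Sketch only: decode bytes and repr() other objects so the bound
        # logging method (logger.debug, logger.warn, ...) always gets a str.
        if isinstance(message, bytes):
            message = message.decode("utf-8", errors="replace")
        elif not isinstance(message, str):
            message = repr(message)
        logFunc(message)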
Example No. 8
class xljs2Spider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("xljs2Spider")

    name = 'xljs2'
    allowed_domains = [
        'mil.news.sina.com.cn', "comment.sina.com.cn",
        "roll.mil.news.sina.com.cn"
    ]

    start_urls = ["http://roll.mil.news.sina.com.cn/col/zgjq/index.shtml"]

    def parse(self, response):
        for p in range(1, 800):
            next_url = "http://roll.mil.news.sina.com.cn/col/zgjq/index_{0}.shtml"
            next_url = next_url.format(p)
            print("页码====", next_url, "====")
            yield response.follow(url=next_url, callback=self.parese_news_list)

    def parese_news_list(self, response):
        datas = set(response.xpath("//div[@class='fixList']//a"))
        for item in datas:
            new_url = item.xpath("@href").extract()[0]
            title = item.xpath("text()").extract()[0]
            print(new_url, title)
            metadata = {}
            metadata["title"] = title
            yield response.follow(url=new_url,
                                  meta=metadata,
                                  callback=self.parse_news)

    def parse_news(self, response):
        url = response.url
        title = response.meta["title"]
        id = re.findall(r"-(.*)\.", url.split("/")[-1])[0]
        id = id[1:]
        # context = response.xpath("//div[@class='article']//p/text()").extract()
        # publish_time = response.xpath("//div[@class='date-source']/span[1]/text()").extract()[0]
        # print("id,title",id,title,publish_time,context)
        context = response.xpath(
            "//div[@class='article' or @id='artibody']//p/text()").extract()
        publish_time = response.xpath(
            "//div[@class='date-source']/span[1]/text() | //span[@class='time-source']/span[@class='titer']/text()"
        ).extract()[0]
        print("id,title", id, title, context, publish_time)

        comment_url = "http://comment.sina.com.cn/page/info?version=1&format=json&channel=jc&newsid=comos-{0}&group=0&compress=0&ie=utf-8&oe=utf-8" \
                      "&page={1}&page_size=10&t_size=3&h_size=3&thread=1&uid=unlogin_user&callback=jsonp_1602817818772&_=1602817818772"
        comment_url = comment_url.format(id, 1)

        metadata = {}
        metadata["id"] = id
        metadata["title"] = title
        metadata["context"] = context
        metadata["publish_time"] = publish_time
        print(metadata)
        print("comment_url===", comment_url)
        yield response.follow(url=comment_url,
                              meta=metadata,
                              callback=self.parse_comment)

    def parse_comment(self, response):
        id = response.meta["id"]
        title = response.meta["title"]
        context = response.meta["context"]
        publish_time = response.meta["publish_time"]
        data = re.findall("{(.*)}", response.text)
        if len(data) > 0:
            jsondata_result = "{" + re.findall("{(.*)}",
                                               response.text)[0] + "}"
            jsondata = json.loads(jsondata_result)["result"]
            count = jsondata["count"]
            join_count = count["total"]
            comment_count = count["show"]
            xljsxw = FHJSXW()
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            xljsxw["id"] = id
            xljsxw["title"] = title
            xljsxw["context"] = "|".join(context)
            xljsxw["publish_time"] = publish_time
            xljsxw["comment_count"] = comment_count
            xljsxw["join_count"] = join_count
            xljsxw["accumulator_count"] = 0
            xljsxw["pdate"] = pdate
            xljsxw["data_source"] = "新浪网"
            xljsxw["data_module"] = "中国军情"
            print(xljsxw)
            yield xljsxw
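
parse_news requests the comment endpoint with a jsonp_... callback, and parse_comment digs the JSON out of that wrapper with a greedy {(.*)} regex; a slightly more explicit way to unwrap such a JSONP body (hypothetical helper, not part of the spider) is to slice from the first '{' to the last '}':

import json


def unwrap_jsonp(text):
    # Hypothetical helper: drop the "callback(...)" wrapper around a JSONP body
    # by keeping everything between the first '{' and the last '}'.
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end < start:
        raise ValueError("no JSON object found in response")
    return json.loads(text[start:end + 1])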
Example No. 9
 def __init__(self, connection, logger=LoggerUtil.getFileLogger()):
     self.logger = logger
     self.connection = connection
     self.className = self.getClassName()
Example No. 10
 def __init__(self, logger=LoggerUtil.getFileLogger()):
     self.logger = logger
Example No. 11
class MysqlPipeline():
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("mysqlPipelineLogger")

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host,
                                  user=self.user,
                                  password=self.password,
                                  database=self.database,
                                  charset='utf8',
                                  port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        if isinstance(item, FHJSXW):
            table = "yq_data"
            data = dict(item)
            self.logger.info("插入数据: ", data)
            keys = ', '.join(data.keys())
            values = ', '.join(['%s'] * len(data))
            sql = 'insert into %s (%s) values (%s)' % (table, keys, values)
            print("执行sql:", sql)
            print(tuple(data.values()))
            try:
                self.cursor.execute(sql, tuple(data.values()))
                self.db.commit()
            except Exception as e:
                print("错误:", e)
            return item
        elif isinstance(item, FHJSXWPL):
            table = "yq_pl_data"
            data = dict(item)
            self.logger.info("插入数据: ", data)
            keys = ', '.join(data.keys())
            values = ', '.join(['%s'] * len(data))
            sql = 'insert into %s (%s) values (%s)' % (table, keys, values)
            print("执行sql:", sql)
            print(tuple(data.values()))
            try:
                self.cursor.execute(sql, tuple(data.values()))
                self.db.commit()
            except Exception as e:
                print("错误:", e)
            return item
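
from_crawler pulls every connection parameter from the crawler settings, so the pipeline expects entries along these lines in the project's settings.py (all values and the module path below are placeholders):

# settings.py (placeholder values)
MYSQL_HOST = "127.0.0.1"
MYSQL_PORT = 3306
MYSQL_DATABASE = "your_database"
MYSQL_USER = "your_user"
MYSQL_PASSWORD = "your_password"

ITEM_PIPELINES = {
    "yourproject.pipelines.MysqlPipeline": 300,  # hypothetical module path
}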
Example No. 12
class fhjwSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("fhjwSpider")

    name = 'fhjw2'
    allowed_domains = ['shankapi.ifeng.com',"comment.ifeng.com","mil.ifeng.com",
                       "tech.ifeng.com","ishare.ifeng.com","news.ifeng.com","survey.news.ifeng.com"]
    start_urls = ['https://mil.ifeng.com/']

    def parse(self, response):
        aitems = set(response.xpath("//div[@class='news-34dpVmYc']//a"))
        for aitem in aitems:
            item = {}
            url = "https:"+aitem.xpath("@href").extract()[0]
            title = aitem.xpath("@title").extract()[0]
            print(url,title)
            id = url.split("/")[-1]
            commentUrl = "ucms_"+ id
            item["id"] = id
            item["title"] = title
            item["commentUrl"] = commentUrl
            item["url"] = url
            comment_url = "https://comment.ifeng.com/get.php?orderby=create_time&" \
                          "docUrl={0}&format=js&job=1&p=1&pageSize=1".format(commentUrl)
            yield response.follow(comment_url, meta=item, callback=self.parse_comment)

    def parse_comment(self,response):
        id = response.meta["id"]
        url = response.meta["url"]
        commentUrl = response.meta["commentUrl"]
        title = response.meta["title"]
        data = re.findall("{(.*)}", response.text.encode('utf-8').decode('unicode_escape'))
        if len(data) >0:
            jsondata_result = "{"+re.findall("{(.*)}",response.text)[0]+"}"
            jsondata = json.loads(jsondata_result)
            # print(jsondata)
            count = jsondata["count"]
            join_count = jsondata["join_count"]
            comments = jsondata["comments"]
            print("count===", id, title, count, join_count)
            metaitem= {}
            metaitem["id"] = id
            metaitem["title"] = title
            metaitem["count"] = count
            metaitem["join_count"] = join_count
            metaitem["commentUrl"] = commentUrl
            print("url====",url)
            yield response.follow(url, meta=metaitem, callback=self.parse_context)

    def parse_context(self, response):
        id = response.meta["id"]
        title = response.meta["title"]
        count = response.meta["count"]
        join_count = response.meta["join_count"]
        commentUrl = response.meta["commentUrl"]
        publish_time = response.xpath("//p[@class='time-1Mgp9W-1']/span[1]/text()").extract()[0].strip()
        context = response.xpath("////div[@class='text-3w2e3DBc']//p/text()").extract()
        # print("结果数据:",id,title,publish_time,context,count,join_count)
        accumulator_url = "https://survey.news.ifeng.com/api/getaccumulatorweight?format=js&" \
        "key={0}ding&serviceid=2&callback=getaccumulator"
        accumulator_url = accumulator_url.format(commentUrl)
        accumulator_meta = {}
        accumulator_meta["id"] = id
        accumulator_meta["title"] = title
        accumulator_meta["publish_time"] = publish_time
        accumulator_meta["context"] = context
        accumulator_meta["count"] = count
        accumulator_meta["join_count"] = join_count
        accumulator_meta["commentUrl"] = commentUrl
        print("获取推荐数:",accumulator_url)
        yield scrapy.Request(accumulator_url,meta=accumulator_meta, callback=self.parese_context2)

    def parese_context2(self,response):
        fhjsxw = FHJSXW()
        id = response.meta["id"]
        title = response.meta["title"]
        context = response.meta["context"]
        publish_time = response.meta["publish_time"]
        count = response.meta["count"]
        join_count = response.meta["join_count"]
        commentUrl = response.meta["commentUrl"]
        data = re.findall("\"browse\":{(.*)}}}", response.text)
        print(data)
        if len(data) > 0:
            jsondata_result = "{" + data[0] + "}"
            jsondata = json.loads(jsondata_result)
            print(jsondata)
            accumulator_count = jsondata[commentUrl+"ding"]
            print("结果数据:",id,title,context,publish_time,count,join_count,accumulator_count)
            fhjsxw["id"] = id
            fhjsxw["title"] = title
            fhjsxw["context"] = "|".join(context)
            fhjsxw["publish_time"] = publish_time
            fhjsxw["comment_count"] = count
            fhjsxw["join_count"] = join_count
            fhjsxw["accumulator_count"] = accumulator_count
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            fhjsxw["pdate"] = pdate
            fhjsxw["data_source"] = "凤凰网"
            fhjsxw["data_module"] = "军事首页"
            yield fhjsxw
        else:
            fhjsxw["id"] = id
            fhjsxw["title"] = title
            fhjsxw["context"] = context
            fhjsxw["publish_time"] = publish_time
            fhjsxw["comment_count"] = count
            fhjsxw["join_count"] = join_count
            fhjsxw["accumulator_count"] = 0
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            fhjsxw["pdate"] = pdate
            fhjsxw["data_source"] = "凤凰网"
            fhjsxw["data_module"] = "军事首页"
            yield fhjsxw
Example No. 13
class xljsSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("xljsSpider")

    name = 'xljspl'
    allowed_domains = [
        'mil.news.sina.com.cn', "comment.sina.com.cn",
        "roll.mil.news.sina.com.cn"
    ]

    start_urls = ["http://roll.mil.news.sina.com.cn/col/gjjq/index.shtml"]

    def parse(self, response):
        for p in range(1, 500):
            next_url = "http://roll.mil.news.sina.com.cn/col/gjjq/index_{0}.shtml"
            next_url = next_url.format(p)
            print("页码====", next_url, "====")
            yield response.follow(url=next_url, callback=self.parese_news_list)

    def parese_news_list(self, response):
        datas = set(response.xpath("//div[@class='fixList']//a"))
        print("datas:", len(datas))
        for item in datas:
            new_url = item.xpath("@href").extract()[0]
            title = item.xpath("text()").extract()[0]
            print(new_url, title)
            metadata = {}
            metadata["title"] = title
            yield response.follow(url=new_url,
                                  meta=metadata,
                                  callback=self.parse_news)

    def parse_news(self, response):
        url = response.url
        title = response.meta["title"]
        id = re.findall(r"-(.*)\.", url.split("/")[-1])[0]
        id = id[1:]

        print("id,title", id, title)
        # datas = set(response.xpath("//div[@class='fixList']//a"))

        comment_url = "http://comment.sina.com.cn/page/info?version=1&format=json&channel=jc&newsid=comos-{0}&group=0&compress=0&ie=utf-8&oe=utf-8" \
                      "&page={1}&page_size=10&t_size=3&h_size=3&thread=1&uid=unlogin_user&callback=jsonp_1602817818772&_=1602817818772"
        comment_url = comment_url.format(id, 1)

        metadata = {}
        metadata["id"] = id
        metadata["title"] = title
        metadata["page"] = 1
        print(metadata)
        print("comment_url===", comment_url)
        yield response.follow(url=comment_url,
                              meta=metadata,
                              callback=self.parse_comment)

    def parse_comment(self, response):
        id = response.meta["id"]
        title = response.meta["title"]
        page = response.meta["page"]

        data = re.findall("{(.*)}", response.text)
        if len(data) > 0:
            jsondata_result = "{" + re.findall("{(.*)}",
                                               response.text)[0] + "}"
            jsondata = json.loads(jsondata_result)["result"]
            print(jsondata)
            count = jsondata["count"]
            join_count = count["total"]
            comment_count = count["show"]

            cmntlist = jsondata["cmntlist"]
            threaddict = jsondata["threaddict"]
            if len(cmntlist) > 0:

                for cmn in cmntlist:
                    xljsxwpl = FHJSXWPL()
                    xljsxwpl["id"] = id
                    xljsxwpl["title"] = title
                    xljsxwpl["user_name"] = cmn["nick"]
                    xljsxwpl["user_id"] = cmn["uid"]
                    xljsxwpl["comment_id"] = cmn["mid"]
                    xljsxwpl["comment_contents"] = cmn["content"]
                    xljsxwpl["comment_date"] = cmn["time"]
                    xljsxwpl["uptimes"] = cmn["agree"]
                    xljsxwpl["reply_comment_ids"] = ""
                    pdate = datetime.datetime.now().strftime('%Y-%m-%d')
                    xljsxwpl["pdate"] = pdate
                    xljsxwpl["data_source"] = "新浪网"
                    xljsxwpl["data_module"] = "国际军情"
                    print(xljsxwpl)
                    yield xljsxwpl

                if len(threaddict.values()) > 0:
                    threaddictlist = threaddict.values()
                    for threaddictitem in threaddictlist:
                        for thread in threaddictitem["list"]:
                            xljsxwpl = FHJSXWPL()
                            xljsxwpl["id"] = id
                            xljsxwpl["title"] = title
                            xljsxwpl["user_name"] = thread["nick"]
                            xljsxwpl["user_id"] = thread["uid"]
                            xljsxwpl["comment_id"] = thread["mid"]
                            xljsxwpl["comment_contents"] = thread["content"]
                            xljsxwpl["comment_date"] = thread["time"]
                            xljsxwpl["uptimes"] = thread["agree"]
                            xljsxwpl["reply_comment_ids"] = ""
                            pdate = datetime.datetime.now().strftime(
                                '%Y-%m-%d')
                            xljsxwpl["pdate"] = pdate
                            xljsxwpl["data_source"] = "新浪网"
                            xljsxwpl["data_module"] = "国际军情"
                            print(xljsxwpl)
                            yield xljsxwpl

                page = page + 1
                comment_url = "http://comment.sina.com.cn/page/info?version=1&format=json&channel=jc&newsid=comos-{0}&group=0&compress=0&ie=utf-8&oe=utf-8" \
                              "&page={1}&page_size=10&t_size=3&h_size=3&thread=1&uid=unlogin_user&callback=jsonp_1602817818772&_=1602817818772"
                comment_url = comment_url.format(id, page)
                print("next_page==", comment_url)
                yield scrapy.Request(comment_url,
                                     meta=response.meta,
                                     callback=self.parse_comment)
            else:
                print("结束")
Example No. 14
class LhgtjsSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("LhgtjsSpider")

    name = 'lhgtjs'
    filepath = "jszlwz/data/lhgtjsdatas2.csv"
    allowed_domains = ['data.un.org']
    # start_urls = ['http://data.un.org/Handlers/ExplorerHandler.ashx?m=EDATA',
    #               'http://data.un.org/Handlers/ExplorerHandler.ashx?m=FAO',
    #               'http://data.un.org/Handlers/ExplorerHandler.ashx?m=ICS']
    start_urls = ['http://data.un.org/Handlers/ExplorerHandler.ashx?m=ICS']

    def __init__(self, url):
        if os.path.exists(self.filepath):
            os.remove(self.filepath)
        print("启动url===", url)

    def parse(self, response):
        # crawl the items on this page
        items = set(response.xpath('//@href'))
        self.logger.warning("测试", items)
        for item in items:
            tempItem = item.extract()
            if re.findall(r"Data.aspx?.*[\d]", tempItem):
                dataurl = urllib.parse.unquote(str(tempItem).replace("\\\"",""))
                new_url = 'http://data.un.org/'+dataurl
                print(new_url)
                # self.logger.warning(new_url)
                yield response.follow(new_url, callback=self.parse_datadetail)

    def get_nextpage_url(self,page,url):
        print("url===",url)
        datatemp = url.split("?")[1].split("&")
        dataMartId = datatemp[0].split("=")[1]
        dataFilter = datatemp[1].split("=")[1]
        if dataMartId == "ICS":
            new_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page' \
                      '&Page={0}' \
                      '&DataFilter={1}' \
                      '&DataMartId={2}' \
                      '&UserQuery=&c=2,5,6,7,8' \
                      '&s=_crEngNameOrderBy:asc,,yr:desc,_utEngNameOrderBy:asc' \
                .format(page, dataFilter, dataMartId)
        else:
            new_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page' \
                  '&Page={0}' \
                  '&DataFilter={1}' \
                  '&DataMartId={2}' \
                  '&UserQuery=&c=2,5,6,7,8' \
                  '&s=_crEngNameOrderBy:asc,_enID:asc,yr:desc'\
                .format(page,dataFilter,dataMartId)
        print(new_url,dataMartId)
        return new_url,dataMartId

    def parse_data(self,response):
        name = response.meta['name']
        type = response.meta['type']
        items = response.xpath('//div[@class="DataContainer"]//tr')
        for item in items:
            tds = item.xpath("./td//text()").extract()
            print("原始数据===",type+"|"+name+"|"+"|".join(tds))
            with open(self.filepath,mode="a",encoding="utf-8") as f:
                f.write(type+"|"+name+"|"+"|".join(tds)+"\n")
            # lhgItem = LhgItem()
            # lhgItem.name = name
            # lhgItem.value = tds
            # yield lhgItem

    def parse_datadetail(self, response):
        print("test===============================")
        name = response.xpath('//div[@class="SeriesMeta"]//h2/text()').extract_first()
        # page = response.xpath('//span[contains(@id, "spanPageIndexB")]/text()').extract_first()
        total = response.xpath('//span[contains(@id, "spanPageCountB")]/text()').extract_first()
        for i in range(int(total)):
            page = i+1
            new_url,dataMartId = self.get_nextpage_url(page,response.url)
            print("test===",new_url,dataMartId)
            self.logger.warning(new_url)
            yield response.follow(new_url,meta={'name': name, 'type': dataMartId}, callback=self.parse_data)
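
Because __init__ takes a url argument (and deletes any previous CSV), the spider presumably receives that argument through Scrapy's spider-argument mechanism; a sketch of an equivalent programmatic launch, assuming the project's default settings and that LhgtjsSpider is importable:

# Roughly equivalent to: scrapy crawl lhgtjs -a url=<start url>
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(LhgtjsSpider,
              url="http://data.un.org/Handlers/ExplorerHandler.ashx?m=ICS")
process.start()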
Example No. 15
class fhjwSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("fhjwSpider")

    name = 'fhjwpl'
    allowed_domains = ['shankapi.ifeng.com',"comment.ifeng.com"]

    def start_requests(self):
        url = 'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default' \
              '/000/{0}/20/14-35083-/getColumnInfoCallback?callback=getColumnInfoCallback'
        t = time.time()
        nowTime = int(round(t * 1000))
        url = url.format(nowTime)
        yield Request(url)

    def parse(self, response):
        print(response.text)
        data = re.findall("{(.*)}",response.text)
        if len(data) >0:
            jsondata_result = "{"+re.findall("{(.*)}",response.text)[0]+"}"
            jsondata = json.loads(jsondata_result)
            print(jsondata)
            code = jsondata.get("code")
            data =  jsondata.get("data")
            print(code)
            if code == 0:
                isEnd = data.get("isEnd")
                newsstream = data.get("newsstream")
                for index,item in enumerate(newsstream):
                    print("数据:",item)
                    id = item["id"]
                    newsTime = item["newsTime"]
                    skey = item["skey"]
                    url = item["url"]
                    commentUrl = item["commentUrl"]
                    source = item["source"]
                    title = item["title"]
                    item["p"] = 1
                    comment_url = "https://comment.ifeng.com/get.php?orderby=create_time&" \
                                  "docUrl={0}&format=js&job=1&p=1&pageSize=20".format(commentUrl)
                    yield response.follow(comment_url, meta=item, callback=self.parse_comment)
                    if index == len(newsstream)-1:
                        new_url = 'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default' \
                              '/{0}/{1}/20/14-35083-/getColumnInfoCallback?callback=getColumnInfoCallback'
                        timeArray = time.strptime(newsTime, "%Y-%m-%d %H:%M:%S")
                        timeStamp = int(time.mktime(timeArray))
                        print(id,timeStamp)
                        new_url = new_url.format(id,timeStamp)
                        print(new_url)
                        if isEnd:
                            print("结束")
                        else:
                            yield scrapy.Request(url=new_url, callback=self.parse)

    def parse_comment(self,response):
        id = response.meta["id"]
        newsTime = response.meta["newsTime"]
        skey = response.meta["skey"]
        url = response.meta["url"]
        commentUrl = response.meta["commentUrl"]
        source = response.meta["source"]
        title = response.meta["title"]
        p = response.meta["p"]
        data = re.findall("{(.*)}", response.text.encode('utf-8').decode('unicode_escape'))
        if len(data) >0:
            jsondata_result = "{"+re.findall("{(.*)}",response.text)[0]+"}"
            jsondata = json.loads(jsondata_result)
            # print(jsondata)
            count = jsondata["count"]
            join_count = jsondata["join_count"]
            comments = jsondata["comments"]
            if len(comments) >0:
                for comment in comments:
                    comment_id = comment["comment_id"]
                    uname = comment["uname"]
                    user_id = comment["user_id"]
                    comment_contents = comment["comment_contents"]
                    comment_date = comment["comment_date"]
                    uptimes = comment["uptimes"]
                    parents = comment["parent"]
                    reply_comment_ids = []
                    if len(parents) >0 :
                        for parent in parents:
                            reply_comment_id = parent["comment_id"]
                            reply_comment_ids.append(reply_comment_id)
                    print("comment===",id, title, comment_id, uname, user_id, comment_contents, comment_date, uptimes,reply_comment_ids)

                    fhjsxwpl = FHJSXWPL()
                    fhjsxwpl["id"] = id
                    fhjsxwpl["title"] = title
                    fhjsxwpl["comment_id"] = comment_id
                    fhjsxwpl["comment_contents"] = comment_contents
                    fhjsxwpl["comment_date"] = comment_date
                    fhjsxwpl["user_name"] = uname
                    fhjsxwpl["user_id"] = user_id
                    fhjsxwpl["uptimes"] = uptimes
                    fhjsxwpl["reply_comment_ids"] = ",".join(reply_comment_ids)
                    pdate = datetime.datetime.now().strftime('%Y-%m-%d')
                    fhjsxwpl["pdate"] = pdate
                    fhjsxwpl["data_source"] = "凤凰网"
                    fhjsxwpl["data_module"] = "军情热点"
                    yield fhjsxwpl

                p = p+1
                comment_url = "https://comment.ifeng.com/get.php?orderby=create_time&" \
                              "docUrl={0}&format=js&job=1&p={1}&pageSize=20".format(commentUrl,p)
                yield scrapy.Request(comment_url, meta=response.meta, callback=self.parse_comment)
            else:
                print("结束")
Example No. 16
class fhjwSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("fhjwSpider")

    name = 'fhjwpl2'
    allowed_domains = [
        'shankapi.ifeng.com', "comment.ifeng.com", "mil.ifeng.com"
    ]
    start_urls = ['https://mil.ifeng.com/']

    def parse(self, response):
        aitems = set(response.xpath("//div[@class='news-34dpVmYc']//a"))
        for aitem in aitems:
            item = {}
            url = "https:" + aitem.xpath("@href").extract()[0]
            title = aitem.xpath("@title").extract()[0]
            print(url, title)
            id = url.split("/")[-1]
            commentUrl = "ucms_" + id
            item["id"] = id
            item["title"] = title
            item["commentUrl"] = commentUrl
            item["url"] = url
            item["p"] = 1
            comment_url = "https://comment.ifeng.com/get.php?orderby=create_time&" \
                          "docUrl={0}&format=js&job=1&p=1&pageSize=20".format(commentUrl)
            yield response.follow(comment_url,
                                  meta=item,
                                  callback=self.parse_comment)

    def parse_comment(self, response):
        id = response.meta["id"]
        url = response.meta["url"]
        commentUrl = response.meta["commentUrl"]
        title = response.meta["title"]
        p = response.meta["p"]
        data = re.findall(
            "{(.*)}",
            response.text.encode('utf-8').decode('unicode_escape'))
        if len(data) > 0:
            jsondata_result = "{" + re.findall("{(.*)}",
                                               response.text)[0] + "}"
            jsondata = json.loads(jsondata_result)
            # print(jsondata)
            count = jsondata["count"]
            join_count = jsondata["join_count"]
            comments = jsondata["comments"]
            if len(comments) > 0:
                for comment in comments:
                    comment_id = comment["comment_id"]
                    uname = comment["uname"]
                    user_id = comment["user_id"]
                    comment_contents = comment["comment_contents"]
                    comment_date = comment["comment_date"]
                    uptimes = comment["uptimes"]
                    parents = comment["parent"]
                    reply_comment_ids = []
                    if len(parents) > 0:
                        for parent in parents:
                            reply_comment_id = parent["comment_id"]
                            reply_comment_ids.append(reply_comment_id)
                    print("comment===", id, title, comment_id, uname, user_id,
                          comment_contents, comment_date, uptimes,
                          reply_comment_ids)

                    fhjsxwpl = FHJSXWPL()
                    fhjsxwpl["id"] = id
                    fhjsxwpl["title"] = title
                    fhjsxwpl["comment_id"] = comment_id
                    fhjsxwpl["comment_contents"] = comment_contents
                    fhjsxwpl["comment_date"] = comment_date
                    fhjsxwpl["user_name"] = uname
                    fhjsxwpl["user_id"] = user_id
                    fhjsxwpl["uptimes"] = uptimes
                    fhjsxwpl["reply_comment_ids"] = ",".join(reply_comment_ids)
                    pdate = datetime.datetime.now().strftime('%Y-%m-%d')
                    fhjsxwpl["pdate"] = pdate
                    fhjsxwpl["data_source"] = "凤凰网"
                    fhjsxwpl["data_module"] = "军事首页"
                    yield fhjsxwpl

                p = p + 1
                comment_url = "https://comment.ifeng.com/get.php?orderby=create_time&" \
                              "docUrl={0}&format=js&job=1&p={1}&pageSize=20".format(commentUrl,p)
                yield scrapy.Request(comment_url,
                                     meta=response.meta,
                                     callback=self.parse_comment)
            else:
                print("结束")
Example No. 17
class fhjwSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("fhjwSpider")

    name = 'fhjw'
    allowed_domains = [
        'shankapi.ifeng.com', "comment.ifeng.com", "mil.ifeng.com",
        "tech.ifeng.com", "ishare.ifeng.com", "news.ifeng.com",
        "survey.news.ifeng.com"
    ]

    def start_requests(self):
        url = 'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default' \
              '/000/{0}/20/14-35083-/getColumnInfoCallback?callback=getColumnInfoCallback'
        t = time.time()
        nowTime = int(round(t * 1000))
        url = url.format(nowTime)
        yield Request(url)

    def parse(self, response):
        print(response.text)
        data = re.findall("{(.*)}", response.text)
        if len(data) > 0:
            jsondata_result = "{" + re.findall("{(.*)}",
                                               response.text)[0] + "}"
            jsondata = json.loads(jsondata_result)
            print(jsondata)
            code = jsondata.get("code")
            data = jsondata.get("data")
            print(code)
            if code == 0:
                isEnd = data.get("isEnd")
                newsstream = data.get("newsstream")
                for index, item in enumerate(newsstream):
                    print("数据:", item)
                    id = item["id"]
                    newsTime = item["newsTime"]
                    skey = item["skey"]
                    url = item["url"]
                    commentUrl = item["commentUrl"]
                    source = item["source"]
                    title = item["title"]
                    item["p"] = 1
                    comment_url = "https://comment.ifeng.com/get.php?orderby=create_time&" \
                                  "docUrl={0}&format=js&job=1&p=1&pageSize=1".format(commentUrl)
                    yield response.follow(comment_url,
                                          meta=item,
                                          callback=self.parse_comment)
                    if index == len(newsstream) - 1:
                        new_url = 'https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default' \
                              '/{0}/{1}/20/14-35083-/getColumnInfoCallback?callback=getColumnInfoCallback'
                        timeArray = time.strptime(newsTime,
                                                  "%Y-%m-%d %H:%M:%S")
                        timeStamp = int(time.mktime(timeArray))
                        print(id, timeStamp)
                        new_url = new_url.format(id, timeStamp)
                        print(new_url)
                        if isEnd:
                            print("结束")
                        else:
                            yield scrapy.Request(url=new_url,
                                                 callback=self.parse)

    def parse_comment(self, response):
        id = response.meta["id"]
        newsTime = response.meta["newsTime"]
        skey = response.meta["skey"]
        url = response.meta["url"]
        commentUrl = response.meta["commentUrl"]
        source = response.meta["source"]
        title = response.meta["title"]
        p = response.meta["p"]
        data = re.findall(
            "{(.*)}",
            response.text.encode('utf-8').decode('unicode_escape'))
        if len(data) > 0:
            jsondata_result = "{" + re.findall("{(.*)}",
                                               response.text)[0] + "}"
            jsondata = json.loads(jsondata_result)
            # print(jsondata)
            count = jsondata["count"]
            join_count = jsondata["join_count"]
            comments = jsondata["comments"]
            print("count===", id, title, newsTime, count, join_count)
            metaitem = {}
            metaitem["id"] = id
            metaitem["title"] = title
            metaitem["newsTime"] = newsTime
            metaitem["count"] = count
            metaitem["join_count"] = join_count
            metaitem["commentUrl"] = commentUrl
            print("url====", url)
            yield response.follow(url,
                                  meta=metaitem,
                                  callback=self.parse_context)

    def parse_context(self, response):
        id = response.meta["id"]
        title = response.meta["title"]
        count = response.meta["count"]
        join_count = response.meta["join_count"]
        commentUrl = response.meta["commentUrl"]
        publish_time = response.xpath(
            "//p[@class='time-1Mgp9W-1']/span[1]/text()").extract()[0].strip()
        context = response.xpath(
            "////div[@class='text-3w2e3DBc']//p/text()").extract()
        # print("结果数据:",id,title,publish_time,context,count,join_count)
        accumulator_url = "https://survey.news.ifeng.com/api/getaccumulatorweight?format=js&" \
        "key={0}ding&serviceid=2&callback=getaccumulator"
        accumulator_url = accumulator_url.format(commentUrl)
        accumulator_meta = {}
        accumulator_meta["id"] = id
        accumulator_meta["title"] = title
        accumulator_meta["publish_time"] = publish_time
        accumulator_meta["context"] = context
        accumulator_meta["count"] = count
        accumulator_meta["join_count"] = join_count
        accumulator_meta["commentUrl"] = commentUrl
        print("获取推荐数:", accumulator_url)
        yield scrapy.Request(accumulator_url,
                             meta=accumulator_meta,
                             callback=self.parese_context2)

    def parese_context2(self, response):
        fhjsxw = FHJSXW()
        id = response.meta["id"]
        title = response.meta["title"]
        context = response.meta["context"]
        publish_time = response.meta["publish_time"]
        count = response.meta["count"]
        join_count = response.meta["join_count"]
        commentUrl = response.meta["commentUrl"]
        data = re.findall("\"browse\":{(.*)}}}", response.text)
        print(data)
        if len(data) > 0:
            jsondata_result = "{" + data[0] + "}"
            jsondata = json.loads(jsondata_result)
            print(jsondata)
            accumulator_count = jsondata[commentUrl + "ding"]
            print("结果数据:", id, title, context, publish_time, count, join_count,
                  accumulator_count)
            fhjsxw["id"] = id
            fhjsxw["title"] = title
            fhjsxw["context"] = "|".join(context)
            fhjsxw["publish_time"] = publish_time
            fhjsxw["comment_count"] = count
            fhjsxw["join_count"] = join_count
            fhjsxw["accumulator_count"] = accumulator_count
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            fhjsxw["pdate"] = pdate
            fhjsxw["data_source"] = "凤凰网"
            fhjsxw["data_module"] = "军情热点"
            yield fhjsxw
        else:
            fhjsxw["id"] = id
            fhjsxw["title"] = title
            fhjsxw["context"] = context
            fhjsxw["publish_time"] = publish_time
            fhjsxw["comment_count"] = count
            fhjsxw["join_count"] = join_count
            fhjsxw["accumulator_count"] = 0
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            fhjsxw["pdate"] = pdate
            fhjsxw["data_source"] = "凤凰网"
            fhjsxw["data_module"] = "军情热点"
            yield fhjsxw
Example No. 18
class CmanoSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("CmanoSpider")

    name = 'cmano'
    allowed_domains = ['www.cmano-db.com']
    start_urls = ['http://www.cmano-db.com/']

    def __init__(self, url):
        print("启动url===", url)

    def parse(self, response):
        # crawl the items on this page
        items = set(
            response.xpath(
                '//ul[contains(@class, "nav navbar-nav")]/li/ul/li/a/@href'))
        self.logger.warning("测试", items)
        listCountrys = ['Ship', 'Submarine', 'Facility', 'Aircraft']
        noListCountrys = ["Weapon", "Sensor"]
        for item in items:
            tempItem = item.extract()
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(
                tempItem)
            self.logger.warning(tempItem)
            if tempItem[0:-1].lower() in [
                    listCountry.lower() for listCountry in listCountrys
            ]:
                self.logger.warning("测试", tempItem)
                yield response.follow(new_url, callback=self.parse_country)
            elif tempItem[0:-1].lower() in [
                    noListCountry.lower() for noListCountry in noListCountrys
            ]:
                self.logger.warning("测试", tempItem)
                yield response.follow(new_url,
                                      callback=self.parse_no_country_list)

    def parse_country(self, response):
        # self.logger.warning("country测试", response.url)
        items = set(
            response.xpath('//div[contains(@class, "country")]/h4/a/@href'))
        self.logger.warning("测试", items)
        for index, item in enumerate(items):
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(
                item.extract())
            self.logger.warning("country测试", new_url)
            yield response.follow(new_url, callback=self.parse_country_list)

    def parse_country_list(self, response):
        self.logger.warning("country-list测试", response.url)
        items = set(
            response.xpath(
                '//table[contains(@class, "table table-striped table-hover")]//@href'
            ))
        gjitem = GJItem()
        gjitem["name"] = response.url.split("/")[-2]
        print("-----------response.url", response.url)
        print("-----------gjname", gjitem)
        self.logger.warning("测试", items)
        for index, item in enumerate(items):
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(
                item.extract())
            self.logger.warning("测试", new_url)
            yield response.follow(new_url,
                                  meta={'item': gjitem},
                                  callback=self.parse_country_detail)

    def parse_no_country_list(self, response):
        self.logger.warning("country-list测试", response.url)
        items = set(
            response.xpath('//div[contains(@class, "country")]//@href'))
        self.logger.warning("测试", items)
        for index, item in enumerate(items):
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(
                item.extract())
            self.logger.warning("测试", new_url)
            yield response.follow(new_url,
                                  meta={'item': None},
                                  callback=self.parse_country_detail)

    def parse_country_detail(self, response):

        label = response.url.split("/")[-3].strip()
        print("--------------label", label)
        name = response.xpath(
            "//h3[@id='typography']/text()").extract()[0].strip()
        print("--------------name", name)

        img = CmanoItem()
        self.logger.warning("country-detial测试", response.url)
        imageitems = response.xpath(
            "//div[contains(@class,'col-lg-7')]/a/img/@src")
        image_urls = []
        for index, item in enumerate(imageitems):
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(
                item.extract())
            image_urls.append(new_url)
        img["image_urls"] = image_urls
        yield img

        img1 = AttrItem()
        img1["name"] = name
        img1["attr"] = "label"
        img1["value"] = label
        print("--------------label", img1)
        yield img1

        item = response.meta['item']
        print("--------------item", item)
        if item is not None:
            imggj = RelaItem()
            imggj["name_partA"] = name
            imggj["name_partB"] = item["name"]
            imggj["rela"] = "服役国家"
            print("--------------服役国家", imggj)
            yield imggj

        if len(image_urls) > 0:
            img0 = AttrItem()
            imagesname = image_urls[0].split("/")[-1].strip()
            filepath = "images/" + imagesname.split("_")[0] + "/" + imagesname
            print("--------------filepath", filepath)
            img0["name"] = name
            img0["attr"] = "image_path"
            img0["value"] = filepath
            yield img0

        items = response.xpath(
            "//div[contains(@class,'col-lg-7')]/table[1]//td/text()")
        for index, item in enumerate(items):
            tempdata = item.extract().split(":")
            print("--------------tempdata", tempdata)
            if len(tempdata[0].strip()) > 0 and len(tempdata) == 2:
                img2 = AttrItem()
                img2["name"] = name
                img2["attr"] = tempdata[0].strip()
                img2["value"] = tempdata[1].strip()
                print("--------------attr", img2)
                yield img2

        weaponsList = ["Weapons:", "Weapons / Loadouts:"]
        sensorsList = ["Sensors / EW:", "Sensors:"]

        itemsrela1 = response.xpath(
            "//div[contains(@class,'col-lg-7')]/table[2]//a/text()")
        print("------------itemsrela1", itemsrela1)
        relatype = response.xpath(
            "//div[contains(@class,'col-lg-7')]/table[2]//u/text()").extract()
        print("------------relatype", relatype)
        for index, item in enumerate(itemsrela1):
            tempdata = item.extract()
            print("--------------tempdata", tempdata)
            if len(tempdata.strip()) > 0:
                img3 = RelaItem()
                img3["name_partA"] = name
                img3["name_partB"] = tempdata.strip()
                if relatype[0] in sensorsList:
                    img3["rela"] = "传感器配置"
                elif relatype[0] in weaponsList:
                    img3["rela"] = "武器负载"
                print("--------------rela", img3)
                yield img3

        itemsrela12 = response.xpath(
            "//div[contains(@class,'col-lg-7')]/table[3]//a/text()")
        print("------------itemsrela1", itemsrela12)
        relatype2 = response.xpath(
            "//div[contains(@class,'col-lg-7')]/table[3]//u/text()").extract()
        print("------------relatype", relatype2)
        for index, item in enumerate(itemsrela12):
            tempdata = item.extract()
            print("--------------tempdata", tempdata)
            if len(tempdata.strip()) > 0:
                img4 = RelaItem()
                img4["name_partA"] = name
                img4["name_partB"] = tempdata.strip()
                if relatype2[0] in sensorsList:
                    img4["rela"] = "传感器配置"
                elif relatype2[0] in weaponsList:
                    img4["rela"] = "武器负载"
                print("--------------rela", img4)
                yield img4
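
CmanoItem only carries image_urls, which is the field Scrapy's stock ImagesPipeline reads by default; the "images/<prefix>/<name>" path stored in AttrItem suggests a customized file_path() that is not shown in this example. A minimal settings sketch for the image downloads (the pipeline choice, priority, and store path are assumptions):

# settings.py sketch (assumed values)
ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 1,
}
IMAGES_STORE = "images"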