Beispiel #1
0
 def __init__(self):
     """Create the item with both of its fields blank.

     Runs the base ``Item`` initialiser, then stores an empty string
     under the ``'coal'`` and ``'non_coal'`` keys.
     """
     Item.__init__(self)
     for field_name in ('coal', 'non_coal'):
         self[field_name] = ''
Beispiel #2
0
    def __init__(self):
        """Initialise the item with a default value for every field.

        Runs the base ``Item`` initialiser and then assigns each field
        its starting value: an empty string for most fields, a new empty
        list for ``authors`` and ``keywords``, and ``'text'`` for ``type``.
        """
        Item.__init__(self)
        # (field, default) pairs, listed in the order they are assigned.
        # The tuple is built anew on every call, so each instance receives
        # its own list objects for 'authors' and 'keywords'.
        initial_values = (
            ('crawlerid', ''),    # globally unique ID of the crawled record
            ('url', ''),          # page URL
            ('html_code', ''),    # HTML source
            ('encoding', ''),     # page encoding
            ('title', ''),        # title
            ('authors', []),      # authors
            ('content', ''),      # body text
            ('time', ''),         # news time
            ('source', ''),       # source
            ('editor', ''),       # editor
            ('ctype', ''),        # channel category
            # NOTE(review): the original comment here repeated "channel
            # category"; presumably this is the sub-category - confirm.
            ('subtype', ''),
            ('keywords', []),     # keywords
            ('abstract', ''),     # abstract
            ('copyright', ''),
            ('originality', ''),
            ('type', 'text'),
        )
        for field_name, default in initial_values:
            self[field_name] = default
Beispiel #3
0
 def __init__(self):
     """Set every value of the item to an empty string.

     Calls the base ``Item`` initialiser first, then blanks the
     ``'coal'`` and ``'non_coal'`` entries.
     """
     Item.__init__(self)
     # Chained assignment binds targets left to right: 'coal', then 'non_coal'.
     self['coal'] = self['non_coal'] = ''
Beispiel #4
0
 def __init__(self, *args, **kwargs):
     """Populate the item from keyword arguments.

     All positional and keyword arguments are forwarded to the base
     ``Item`` initialiser. Afterwards each field below is set from the
     keyword argument of the same name, or to ``None`` when that keyword
     was not supplied. ``crawl_time`` is always set to the current Unix
     time in whole seconds.

     Args:
         *args: Forwarded unchanged to ``Item.__init__``.
         **kwargs: Field values; recognised keys are ``platform``,
             ``keyword``, ``url``, ``real_url``, ``title``,
             ``source_url``, ``spider``, ``skip_url``, ``snapshot_url``,
             ``show_url``, ``is_ad`` and ``content``.
     """
     Item.__init__(self, *args, **kwargs)
     # Fixed: this previously read kwargs["keyword"], so 'platform' always
     # mirrored the search keyword.
     self['platform'] = kwargs.get("platform")
     self['keyword'] = kwargs.get("keyword")
     self['crawl_time'] = int(time.time())
     self['url'] = kwargs.get("url")
     self['real_url'] = kwargs.get("real_url")
     self['title'] = kwargs.get("title")
     # Fixed: this previously read kwargs["title"], so 'source_url' held
     # the title instead of a URL.
     self['source_url'] = kwargs.get("source_url")
     self['spider'] = kwargs.get("spider")
     self['skip_url'] = kwargs.get("skip_url")
     self['snapshot_url'] = kwargs.get("snapshot_url")
     self['show_url'] = kwargs.get("show_url")
     self['is_ad'] = kwargs.get("is_ad")
     self['content'] = kwargs.get("content")
Beispiel #5
0
 def __init__(self, year):
     """Record ``year`` in the module-level ``this_year`` and initialise the item.

     Args:
         year: Value stored in the module global ``this_year`` before the
             base ``Item`` initialiser runs.
     """
     # Write the module global first, then run the base initialiser.
     globals()['this_year'] = year
     Item.__init__(self)