コード例 #1
0
ファイル: parse_item.py プロジェクト: LunaBlack/BaiduCrawler
    def trans_time(self, t):
        """Convert a scraped timestamp string into a time struct.

        Three input shapes are recognised, checked in this order:

        * text containing ``日``: everything up to and including the first
          ``日`` is passed to ``TimeTransform.date_to_struct``;
        * text containing ``前`` together with one of the units ``天``,
          ``小时``, ``分钟`` or ``秒``: the integer in front of the unit is
          converted to seconds and subtracted from the current time;
        * anything else: the current local time is returned.

        :param t: unicode text of the timestamp.
        :returns: a ``time.struct_time`` for the relative and fallback cases,
            or whatever ``TimeTransform.date_to_struct`` returns for the
            absolute-date case.
        :raises ValueError: if the text in front of the unit is not an integer.
        """
        # Unicode literals replace the original ``'...'.decode('utf8')`` calls:
        # same value on Python 2 (UTF-8 source), and valid on Python 3 as well.
        day_mark = u'日'
        if day_mark in t:
            return TimeTransform.date_to_struct(t[:t.index(day_mark) + 1])

        if u'前' in t:
            # Ordered: the first unit found in the text wins.
            units = (
                (u'天', 24 * 60 * 60),
                (u'小时', 60 * 60),
                (u'分钟', 60),
                (u'秒', 1),
            )
            for unit, seconds in units:
                if unit in t:
                    amount = int(t[:t.index(unit)])
                    # NOTE(review): relative times are built with gmtime (UTC)
                    # while the fallback below uses localtime -- confirm that
                    # this mix is intended.
                    return time.gmtime(time.time() - amount * seconds)

        return time.localtime()
コード例 #2
0
ファイル: parse_item.py プロジェクト: LunaBlack/BaiduCrawler
    def parse_publishedtime(self, element):
        """Extract the published date of a search result as a string.

        The Baidu Tieba layout (``div`` with class ``bbs f13``) is tried
        first; its first text node is expected to contain ``发帖时间`` followed
        by the date. Otherwise the ``span`` with class
        ``' newTimeFactor_before_abs m'`` inside ``div.c-abstract`` is used.
        The first whitespace-separated token of the selected text is parsed
        with ``TimeTransform.date_to_struct`` and formatted with
        ``TimeTransform.struct_to_string``.

        :param element: node supporting ``xpath()`` for one search result.
        :returns: the formatted date string, or ``None`` when no date text is
            found (no matching node, marker missing, or text is blank).
        """
        marker = u'发帖时间'
        found = element.xpath(".//div[@class='bbs f13']/text()")  # Baidu Tieba results

        if found:
            text = found[0]
            pos = text.find(marker)
            if pos < 0:
                # Marker missing: report "no date" instead of raising ValueError.
                return None
            # Skip the marker plus the one character after it
            # (presumably a separator such as a colon).
            text = text[pos + len(marker) + 1:]
        else:
            found = element.xpath(
                ".//div[@class='c-abstract']//span[@class=' newTimeFactor_before_abs m']/text()"
            )
            if not found:
                return None
            text = found[0]

        tokens = text.split()
        if not tokens:
            # Blank text: nothing to parse.
            return None
        return TimeTransform.struct_to_string(TimeTransform.date_to_struct(tokens[0]))
コード例 #3
0
ファイル: readsetting.py プロジェクト: LunaBlack/BaiduCrawler
    def read_keywords(self):  # Read the search keywords and the time range
        """Parse the first non-blank line of ``self.text`` into settings.

        The line has the form ``<start> <end>[;<word> <word> ...]``. The two
        dates are passed through ``Utils.transform_coding`` and
        ``TimeTransform.date_to_struct`` and stored on ``self.starttime`` and
        ``self.endtime``. The keywords after the ``;`` are de-duplicated
        (first occurrence kept, order preserved) and stored on
        ``self.allwords``; when the line has no ``;`` the single default
        keyword ``地震`` is used.

        Only the first non-blank line is consumed. If every line is blank,
        ``self.allwords`` is set to ``[]`` and the time attributes are left
        untouched.

        :raises ValueError: if the date part does not consist of exactly two
            whitespace-separated tokens.
        """
        allwords = []

        for raw in self.text:
            line = raw.strip()
            if not line:
                continue

            try:
                line = line.decode('utf8')
            except (UnicodeError, AttributeError):
                # Best effort: keep the line as-is when it is not UTF-8 bytes
                # (or is already text with no ``decode`` method).
                pass

            fields = line.split(';')

            # Split on any whitespace so a space before ';' or doubled spaces
            # between the dates do not break the unpacking.
            starttime, endtime = fields[0].split()
            self.starttime = TimeTransform.date_to_struct(
                Utils.transform_coding(starttime))
            # The end date is used as given; a variant that added one day to
            # it was commented out in the original.
            self.endtime = TimeTransform.date_to_struct(
                Utils.transform_coding(endtime))

            if len(fields) > 1:
                seen = set()
                for word in fields[1].split():
                    if word not in seen:
                        seen.add(word)
                        allwords.append(word)
            else:
                allwords = [u'地震']

            # Only the first non-blank line is used.
            break

        self.allwords = allwords
コード例 #4
0
ファイル: sohu.py プロジェクト: LunaBlack/BaiduCrawler
    def extract_publishedtime(self, tree):
        """Return the page's published time as a formatted string.

        Takes the first whitespace-separated token of the first node matched
        by ``self.publishedtime_xpath`` and tries to parse it as
        ``%Y-%m-%d``, then as ``%Y年%m月%d日%H%M``. The result is passed to
        ``TimeTransform.struct_to_string``.

        :param tree: parsed document supporting ``xpath()``.
        :returns: whatever ``TimeTransform.struct_to_string`` returns.
        :raises IndexError: if the XPath matches nothing or the text is blank.
        """
        t = tree.xpath(self.publishedtime_xpath)[0].strip().split()[0]

        for fmt in ("%Y-%m-%d", u"%Y年%m月%d日%H%M"):
            try:
                t = time.strptime(t, fmt)
            except ValueError:
                # Format mismatch: try the next known layout.
                continue
            break

        # NOTE(review): when neither format matches, the raw string (not a
        # struct_time) is handed on, as in the original -- confirm that
        # struct_to_string copes with that.
        return TimeTransform.struct_to_string(t)
コード例 #5
0
ファイル: huanqiu.py プロジェクト: LunaBlack/BaiduCrawler
 def extract_publishedtime(self, tree):
     """Return the page's published time as a formatted string.

     The stripped text of the first node matched by
     ``self.publishedtime_xpath`` is parsed as ``%Y-%m-%d %H:%M:%S`` and
     formatted with ``TimeTransform.struct_to_string``.
     """
     raw = tree.xpath(self.publishedtime_xpath)[0].strip()
     parsed = time.strptime(raw, "%Y-%m-%d %H:%M:%S")
     return TimeTransform.struct_to_string(parsed)
コード例 #6
0
 def extract_publishedtime(self, tree):
     """Return the page's published time as a formatted string.

     The stripped text of the first node matched by
     ``self.publishedtime_xpath`` is parsed as ``%Y年%m月%d日 %H:%M`` and
     formatted with ``TimeTransform.struct_to_string``.

     :raises ValueError: if the text does not match the expected layout.
     """
     raw = tree.xpath(self.publishedtime_xpath)[0].strip()
     # Unicode literal instead of decoding a byte literal on every call;
     # same value on Python 2 (UTF-8 source) and valid on Python 3.
     parsed = time.strptime(raw, u"%Y年%m月%d日 %H:%M")
     return TimeTransform.struct_to_string(parsed)
コード例 #7
0
 def extract_publishedtime(self, tree):
     """Return the page's published date as a formatted string.

     Uses the last non-blank node matched by ``self.publishedtime_xpath``,
     parses it as ``%Y-%m-%d`` and formats it with
     ``TimeTransform.struct_to_string``.

     :raises IndexError: if no matched node contains non-blank text.
     :raises ValueError: if the text does not match ``%Y-%m-%d``.
     """
     # Strip while filtering: strptime rejects leading/trailing whitespace,
     # so the node must be parsed in its stripped form.
     candidates = [
         text.strip()
         for text in tree.xpath(self.publishedtime_xpath)
         if text.strip()
     ]
     parsed = time.strptime(candidates[-1], "%Y-%m-%d")
     return TimeTransform.struct_to_string(parsed)
コード例 #8
0
 def extract_publishedtime(self, tree):
     """Return the page's published time as a formatted string.

     Takes the stripped text of the first node matched by
     ``self.publishedtime_xpath``, drops its first five characters and
     passes the remainder to ``TimeTransform.struct_to_string``.
     """
     first = tree.xpath(self.publishedtime_xpath)[0]
     # The first five characters are discarded (presumably a label prefix).
     stamp = first.strip()[5:]
     # NOTE(review): a plain string, not a struct_time, is passed here; no
     # strptime step exists in this variant -- confirm this is intended.
     return TimeTransform.struct_to_string(stamp)
コード例 #9
0
 def extract_publishedtime(self, tree):
     """Return the page's published date as a formatted string.

     All nodes matched by ``self.publishedtime_xpath`` are concatenated; the
     first whitespace-separated token is parsed as ``%Y-%m-%d`` and
     formatted with ``TimeTransform.struct_to_string``.
     """
     joined = ''.join(tree.xpath(self.publishedtime_xpath))
     # split() already ignores leading and trailing whitespace.
     date_token = joined.split()[0]
     parsed = time.strptime(date_token, "%Y-%m-%d")
     return TimeTransform.struct_to_string(parsed)
コード例 #10
0
 def extract_publishedtime(self, tree):
     """Return the page's published date as a formatted string.

     Takes the first whitespace-separated token of the first node matched
     by ``self.publishedtime_xpath``, drops its first five characters,
     parses the rest as ``%Y-%m-%d`` and formats it with
     ``TimeTransform.struct_to_string``.

     :raises ValueError: if the remaining text does not match ``%Y-%m-%d``.
     """
     token = tree.xpath(self.publishedtime_xpath)[0].strip().split()[0]
     # The first five characters are discarded (presumably a label prefix).
     date_text = token[5:]
     # The format is pure ASCII, so no decoding step is needed.
     parsed = time.strptime(date_text, "%Y-%m-%d")
     return TimeTransform.struct_to_string(parsed)