def _process_dates(self):
   """internal method to parse the gcal_url for start and end date info and
     set the _start_date_arrow and _end_date_arrow to instances of arrow objs
   """
   #dont rerun if _start_date_arrow or _end_date_arrow is set or if gcal_url not found
   if (self._start_date_arrow or self._end_date_arrow) or not self.gcal_url: return
   gcal_url = self.gcal_url
   gcal_url_date_time_match = self.gcal_url_date_time_pattern.search(gcal_url)
   if not gcal_url_date_time_match: return
   (gcal_url_start_date_str, gcal_url_end_date_str) = gcal_url_date_time_match.groups()
    # Add a midnight time component if no time is specified.
   if 'T' not in gcal_url_start_date_str: gcal_url_start_date_str += 'T000000'
   if 'T' not in gcal_url_end_date_str: gcal_url_end_date_str += 'T000000'
   self._start_date_arrow = Arrow.strptime(gcal_url_start_date_str, self.gcal_url_date_time_format, tzinfo=self.event_timezone)
   self._end_date_arrow = Arrow.strptime(gcal_url_end_date_str, self.gcal_url_date_time_format, tzinfo=self.event_timezone)
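For reference, Arrow.strptime mirrors datetime.strptime but returns an Arrow instance and accepts a tzinfo argument (a tzinfo object or an IANA name). A minimal sketch of the call the method above builds, assuming the compact YYYYMMDDTHHMMSS format that Google Calendar URLs use:

from arrow import Arrow

# Pad a date-only value with a midnight time, as the method above does,
# then parse with an explicit timezone (the format string is an assumption).
start = Arrow.strptime('20240315' + 'T000000', '%Y%m%dT%H%M%S',
                       tzinfo='Asia/Shanghai')
print(start)  # 2024-03-15T00:00:00+08:00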
Example #2
 def run(self, symbol):
     uid = self.get_uuid(symbol)
     if uid is None:
         return
     url = 'http://www.newrank.cn/xdnphb/detail/getAccountArticle'
     params = {
         'uuid': uid,
     }
     r = self.req_post(url, data=params)
     datas = r.json()
     try:
         infos = datas['value']['lastestArticle']  # 'lastestArticle' is spelled as the API returns it
         for info in infos:
             source_url = self.parse_url(info.get('url'))
             if self.repeat_check(source_url):
                 continue
             title = info.get('title')
             wx_id = info.get('account')
             author = info.get('author')
             post_time = info.get('publicTime')
             post_time = Arrow.strptime(post_time, '%Y-%m-%d %H:%M:%S', tzinfo='Asia/Shanghai').timestamp
             summary = info.get('summary')
             content, img = self.get_content(source_url)
             if info.get('imageUrl') is None:
                 image = img
             else:
                 image = info.get('imageUrl')
             self.add_result(title=title, author=author, post_time=post_time, source_name=author,
                             source_url=source_url, summary=summary, spider_name=self.spider_name,
                             content=content, image=image, category=self.category, aid=wx_id)
     except Exception as e:
         self.log.error(e)
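Several of these examples end with .timestamp as a property, which is arrow < 1.0 behaviour; in arrow 1.0 it became a method returning a float. A short sketch of the difference:

from arrow import Arrow

t = Arrow.strptime('2017-05-25 15:18:25', '%Y-%m-%d %H:%M:%S',
                   tzinfo='Asia/Shanghai')
ts = t.timestamp        # arrow < 1.0: integer property (what these examples use)
# ts = t.timestamp()    # arrow >= 1.0: method returning a float
# ts = t.int_timestamp  # recent releases: integer epoch seconds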
Example #3
 def parse_stamps(self, expr=STAMP_RE, fmt='%H:%M, %d %B %Y (%Z)'):
     stamps = []
     algo = self.archiver.config['algo']
     try:
         maxage = str2time(re.search(r"^old\((\w+)\)$", algo).group(1))
     except AttributeError as e:
         e.args = ("Malformed archive algorithm",)
         raise ArchiveError(e)
     for thread in self.threads:
         if mwp_parse(thread['header']).get(0).level != 2:
             # the header is not level 2
             stamps = []
             continue
         for stamp in expr.finditer(thread['content']):
              # This for loop can probably be optimised, but ain't nobody
              # got time fo' dat
             try:
                 stamps.append(Arrow.strptime(stamp.group(0), fmt))
             except ValueError:  # Invalid stamps should not be parsed, ever
                 continue
         if stamps:
             # The most recent stamp should be used to see if we should archive
             most_recent = max(stamps)
             thread['stamp'] = most_recent
             thread['oldenough'] = Arrow.utcnow() - most_recent > maxage
          else:
              pass  # No stamps were found; abandon the thread
         stamps = []
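The age test above works because Arrow instances compare chronologically and their difference is a datetime.timedelta, so max() picks the newest stamp and the check reduces to a timedelta comparison. A minimal sketch, with a fixed timedelta standing in for the str2time(...) result:

from datetime import timedelta
from arrow import Arrow

stamps = [Arrow.strptime(s, '%H:%M, %d %B %Y (%Z)')
          for s in ('12:30, 1 January 2020 (UTC)',
                    '09:15, 4 March 2020 (UTC)')]
most_recent = max(stamps)          # the newest stamp wins
maxage = timedelta(days=30)        # stand-in for str2time(...)
old_enough = Arrow.utcnow() - most_recent > maxage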
Example #4
 def parse(self, kind, aid, summary):
     url = 'http://api.smzdm.com/v1/%s/articles/%s' % (kind, aid)
     if self.blf.exist(url):
         return
     self.blf.add(url)
     try:
         r = self.req_get(url)
         data = r.json().get('data')
         title = data.get('article_title')
         author = data.get('article_referrals')
         post_time = data.get('article_date')
         post_time = Arrow.strptime(post_time, '%Y-%m-%d %H:%M:%S', tzinfo='Asia/Shanghai').timestamp
         source_url = data.get('article_url')
          # summary arrives as a parameter rather than from the payload
         content = data.get('article_filter_content')
         try:
             content = self.get_img(BeautifulSoup('<div>%s</div>' % content, 'lxml'), 'src')
         except Exception as e:
             self.log.exception(e)
         image = data.get('article_pic')
         self.add_result(title=title, author=author, post_time=post_time, source_name='什么值得买',
                         source_url=source_url, summary=summary, spider_name=self.spider_name,
                         content=content, image=image, category=self.category, aid=kind)
     except Exception as e:
         self.log.error(e)
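A side note on the BeautifulSoup call: wrapping the fragment in a <div> gives the parser a single root element before the image pass (get_img is the spider's own helper; this sketch only shows the wrapping trick):

from bs4 import BeautifulSoup

content = '<p>text</p><img src="a.jpg">'
soup = BeautifulSoup('<div>%s</div>' % content, 'lxml')
print([img.get('src') for img in soup.find_all('img')])  # ['a.jpg']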
Example #5
 def get_datestr_and_dateint(self, datestr_area):
     rt = dict(datestr='', dateint=0)
     if isinstance(datestr_area, str):  # Python 3: str covers the old unicode check
         for time_re, arrow_fmt in time_formats:
             findall = time_re.findall(datestr_area)
             if findall:
                 # strptime wants text, not bytes, on Python 3
                 ar = Arrow.strptime(findall[0], arrow_fmt, 'Asia/Shanghai')
                 if ar.year < 2000:
                     # A missing year parses as 1900; assume the current year
                     ar = ar.replace(year=Arrow.now().year)
                 rt = dict(datestr=findall[0], dateint=ar.timestamp)
                 break
     return rt
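The ar.year < 2000 guard compensates for formats that omit the year: strptime defaults a missing year to 1900, so the method swaps in the current year before taking the timestamp. A minimal sketch (the month-day format here is an assumption; time_formats in the original is a list of compiled-pattern/format pairs):

from arrow import Arrow

ar = Arrow.strptime('03-15 12:00', '%m-%d %H:%M', 'Asia/Shanghai')
print(ar.year)  # 1900, because the year was missing
if ar.year < 2000:
    ar = ar.replace(year=Arrow.now().year)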
Example #6
    def __init__(self, status_json):
        self.metadata = status_json
        self.id = status_json["id"]
        self.userName = status_json["user"]["screen_name"]
        self.userID = status_json["user"]["id"]
        # creation converts, e.g. "Thu May 25 15:18:25 +0000 2017" to int timestamp
        self.creation = Arrow.strptime(status_json["created_at"],
                                       "%a %b %d %H:%M:%S %z %Y").timestamp
        self.text = status_json["full_text"]
        # source converts the html "a" tag string to its inner text
        self.source = re.search(">.*?<", status_json["source"])[0].strip("><")
        self.favoriteCount = status_json["favorite_count"]
        self.retweets = status_json["retweet_count"]
        self.language = status_json["lang"]
        self.mentions = self.getMentions()
        self.hashtags = self.getHashtags()
        self.url = self.getUrls()
        self.medias = self.getMedia()

        self.filename = datetime.strftime(
            datetime.utcfromtimestamp(self.creation), "%Y-%m-%d_%H-%M-%S_UTC")
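Because %z makes the parsed stamp offset-aware, the round trip through an epoch timestamp and datetime.utcfromtimestamp is not strictly needed; keeping the Arrow object and formatting it directly would be shorter. A sketch of that alternative:

from arrow import Arrow

created = Arrow.strptime('Thu May 25 15:18:25 +0000 2017',
                         '%a %b %d %H:%M:%S %z %Y')
filename = created.to('utc').format('YYYY-MM-DD_HH-mm-ss') + '_UTC'
print(filename)  # 2017-05-25_15-18-25_UTC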
Example #7
 def parse_time(time_str):
     t = ct.search(time_str).group(0)
     return Arrow.strptime(t, '%Y-%m-%d %H:%M:%S', tzinfo='Asia/Shanghai').timestamp
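ct is not defined in the snippet; it is presumably a module-level compiled pattern that pulls the datetime substring out of surrounding text. A hypothetical stand-in plus a usage example:

import re
from arrow import Arrow

ct = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')  # hypothetical pattern

def parse_time(time_str):
    t = ct.search(time_str).group(0)
    return Arrow.strptime(t, '%Y-%m-%d %H:%M:%S', tzinfo='Asia/Shanghai').timestamp

parse_time('posted at 2020-06-01 08:30:00')  # 1590971400 on arrow < 1.0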
Example #8
def isostrptime(stamp):
    """I'm lazy, and can never remember the format string"""
    return Arrow.strptime(stamp, "%Y-%m-%dT%H:%M:%SZ")
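Worth noting: the trailing Z is matched as a literal character rather than parsed as an offset, and since no tzinfo is supplied Arrow defaults the result to UTC, which is what the Z denotes anyway:

from arrow import Arrow

stamp = Arrow.strptime('2017-05-25T15:18:25Z', '%Y-%m-%dT%H:%M:%SZ')
print(stamp)  # 2017-05-25T15:18:25+00:00, UTC by default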