# Assumed module context: BotJsonMeta, PlayerRace, BotType, parse_iso_date and
# ISO8601_FORMAT_Z are defined in (or imported by) the surrounding module.
from typing import Dict
import datetime as dt


def parse_meta(json_spec: Dict) -> BotJsonMeta:
    meta = BotJsonMeta()
    meta.name = json_spec['name']
    meta.race = PlayerRace[json_spec['race'].upper()]

    bot_type = json_spec['botType']
    if bot_type == "JAVA_JNI" or bot_type == "JAVA_MIRROR":
        bot_type = "JAVA"
    meta.botType = BotType[bot_type]

    # Optional fields default to None when absent from the spec.
    meta.description = json_spec.get('description')
    meta.update = parse_iso_date(json_spec['update']) if 'update' in json_spec else None
    meta.botBinary = json_spec.get('botBinary')
    meta.bwapiDLL = json_spec.get('bwapiDLL')
    meta.botProfileURL = json_spec.get('botProfileURL')
    meta.javaDebugPort = json_spec.get('javaDebugPort')
    meta.javaOpts = json_spec.get('javaOpts')
    return meta
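# A hedged usage sketch for parse_meta(): the spec below is invented for
# illustration (only its keys mirror what parse_meta actually reads), and it
# assumes PlayerRace.TERRAN exists as the race lookup above implies. Note that
# a second, stricter parse_meta is defined later in this file and would shadow
# this one at module level; the sketch targets the lenient variant above.
def _example_parse_meta() -> None:
    spec = {
        'name': 'ExampleBot',              # hypothetical bot name
        'race': 'terran',                  # looked up as PlayerRace['TERRAN']
        'botType': 'JAVA_MIRROR',          # folded into BotType['JAVA']
        'update': '2018-01-01T00:00:00Z',  # parsed via parse_iso_date()
    }
    meta = parse_meta(spec)
    assert meta.botType == BotType['JAVA']
    assert meta.javaOpts is None           # optional keys default to None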
def format_datetime_like(dt_object):
    """Normalize None, an ISO string, or a datetime-like value to an ISO 8601 string."""
    if dt_object is None:
        return None
    if isinstance(dt_object, str):
        parse_iso_date(dt_object)  # validate only; raises on a malformed ISO string
        return dt_object
    if isinstance(dt_object, dt.datetime):
        return dt_object.astimezone(dt.timezone.utc).strftime(ISO8601_FORMAT_Z)
    # Date-like objects (e.g. dt.date) carry no timezone; format them directly.
    return dt_object.strftime(ISO8601_FORMAT_Z)
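# A minimal sketch of the three input shapes format_datetime_like() accepts; it
# assumes ISO8601_FORMAT_Z renders UTC datetimes with a trailing "Z", as the
# constant's name suggests.
def _example_format_datetime_like() -> None:
    assert format_datetime_like(None) is None
    # An ISO string is validated by parse_iso_date() and passed through as-is.
    assert format_datetime_like('2018-01-01T00:00:00Z') == '2018-01-01T00:00:00Z'
    # An aware datetime is converted to UTC before formatting.
    aware = dt.datetime(2018, 1, 1, 1, 0, tzinfo=dt.timezone(dt.timedelta(hours=1)))
    print(format_datetime_like(aware))  # expected: 2018-01-01T00:00:00Z (format assumed)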
def parse_meta(json_spec: Dict):
    meta = BotJsonMeta()
    meta.name = json_spec['name']
    meta.race = PlayerRace[json_spec['race'].upper()]
    meta.description = json_spec['description']

    bot_type = json_spec['botType']
    if bot_type == "JAVA_JNI" or bot_type == "JAVA_MIRROR":
        bot_type = "JAVA"
    meta.botType = BotType[bot_type]

    meta.update = parse_iso_date(json_spec['update'])
    meta.botBinary = json_spec['botBinary']
    meta.bwapiDLL = json_spec['bwapiDLL']
    meta.botProfileURL = json_spec['botProfileURL']
    return meta
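# Unlike the lenient variant earlier in this file, this parse_meta() treats
# every listed key as required. A hedged sketch of that difference; the spec
# values are invented, and PlayerRace['ZERG'] is assumed to exist:
def _example_parse_meta_strict() -> None:
    try:
        parse_meta({'name': 'ExampleBot', 'race': 'zerg', 'botType': 'JAVA'})
    except KeyError as missing:
        print('missing required key:', missing)  # first hit: 'description'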
# Requires: from datetime import datetime; from urllib.parse import urlparse
def task_post(self, grab, task):
    print('task_post', grab.response.url)
    self.platform.inc_api_calls()
    title = content = date = None

    # The post's URL
    url = grab.response.url
    print("Checking url %s" % url)

    # The post's title: try the common Blogger layouts in turn.
    post_title = grab.doc.select(
        '//div[contains(@id,"Blog")]//*[contains(@class,"post-title") or contains(@class,"postTitle")]'
    )
    if not post_title.count():
        post_title = grab.doc.select(
            '//div[contains(@id,"Blog")]//div[contains(@class,"Post")]//*[contains(@class,"PostHeader")]'
        )
    if not post_title.count():
        post_title = grab.doc.select(
            '//div[contains(@id,"Blog")]/div[@class="post"]/text()[1]')
    if post_title.count():
        title = post_title.text().strip()

    # The post's content
    post_content = grab.doc.select(
        '//div[contains(@id,"Blog")]//div[(contains(@class,"post-body") or contains(@class,"postBody"))]'
    )
    if not post_content.count():
        post_content = grab.doc.select(
            '//div[contains(@id,"Blog")]//div[contains(@class,"Post")]//div[contains(@class,"PostContent")]'
        )
    if post_content.count():
        content = post_content.html()
    else:
        return

    # The post date appears in several variations, so we try three sources in
    # order: the <abbr> element (ISO format), then the date-header, and finally
    # the URL. In the header the date may use different formats ("Tuesday,
    # 6 December 2011", "February 18, 2014", etc.), and sometimes
    # (http://www.tieandi.com/2014/02/valentines-day-wishlist.html) it is not
    # written in English. In that case we can only get the year and month from
    # the URL ("/2014/02/") and we set day=1. We keep retrying until "date"
    # becomes non-None; if it is still None after all attempts, the
    # publication date is simply unknown.

    # 1) <abbr>
    post_date = grab.doc.select(
        '//div[contains(@class,"post-footer")]//abbr[contains(@class,"published")]/@title'
    )
    if post_date.count():
        iso_date = post_date.text().strip()
        try:
            date = parse_iso_date(iso_date)
        except ValueError:
            # Bad ISO format? Let's try another way.
            pass

    # 2) "date-header"
    if not date:
        post_date = grab.doc.select('//*[contains(@class,"date-header")]//span')
        if not post_date.count():
            post_date = grab.doc.select('//*[contains(@class,"date-header")]')
        if not post_date.count():
            post_date = grab.doc.select(
                '//div[contains(@class,"post")]/*[contains(@class,"postAuthor")]/a'
            )
        if post_date.count():
            YMDhm = self.parse_date(post_date.text())
            if YMDhm:
                date = datetime(*YMDhm)

    # 3) URL
    if not date:
        url_path = urlparse(grab.response.url).path
        if url_path.startswith('/'):
            url_path_parts = url_path.split('/')
            month = year = None
            year_from_url = url_path_parts[1]
            month_from_url = url_path_parts[2]
            if month_from_url.isdigit() and int(month_from_url) in range(1, 13):
                month = int(month_from_url)
            if year_from_url.isdigit() and int(year_from_url) in range(2000, 2020):
                year = int(year_from_url)
            if month and year:
                date = datetime(year, month, 1, 0, 0)

    post = models.Posts()
    post.influencer = self.platform.influencer
    post.show_on_search = self.platform.influencer.show_on_search
    post.platform = self.platform
    post.title = title
    post.url = url
    post.content = content
    post.create_date = date
    post.save()
    self._inc('posts_saved')
    print("Created post: %s " % post)
    self.posts[post] = []

    # Comments: this section also comes in different variations.
    comments = []

    # The first type: <dl id="comments-block"> markup.
    comments_blocks = grab.doc.select(
        '//div[contains(@id,"comments")]//dl[contains(@id,"comments-block")]'
    )
    if comments_blocks.count():
        comments_authors = comments_blocks.select('.//dt[contains(@class,"author")]')
        for author in comments_authors:
            author_name = author_url = url = None
            author_a = author.select('./a[@rel="nofollow"]')
            if not author_a.count():
                author_a = author.select('.//a[@rel="nofollow"]')
            if author_a.count():
                author_name = author_a.text().strip()
                try:
                    author_url = author_a.attr('href')
                except Exception:
                    pass
            comment_body = author.select(
                './following-sibling::dd[contains(@class,"comment-body")][1]'
            )
            if comment_body.count():
                content = comment_body.html()
                # content = ' '.join([_.text().strip() for _ in comment_body.select('.//text()')])
            else:
                continue
            comment_footer = author.select(
                './following-sibling::dd[contains(@class,"comment-footer")][1]'
            )
            if comment_footer.count():
                comment_timestamp = comment_footer.select(
                    './/span[contains(@class,"comment-timestamp")]/a')
            else:
                comment_timestamp = author.select(
                    './/span[contains(@class,"comment-timestamp")]/a')
            if comment_timestamp.count():
                try:
                    url = comment_timestamp.attr('href')
                except Exception:
                    pass
                timestamp = comment_timestamp.text()
                # If the timestamp lacks a year (checked loosely against the
                # post year with its last digit dropped) but contains a time,
                # prepend the post's own month.day.year so parse_date() copes.
                if date and str(date.year)[:-1] not in timestamp and ':' in timestamp:
                    timestamp = '.'.join(
                        [str(_) for _ in (date.month, date.day, date.year)]
                    ) + ' ' + timestamp
                YMDhm = self.parse_date(timestamp)
                if YMDhm:
                    date = datetime(*YMDhm)
            comments.append(dict(
                author_name=author_name,
                author_url=author_url,
                content=content,
                date=date,
                url=url,
            ))
    # The second type: <div class="comment-header"> markup.
    else:
        comments_blocks = grab.doc.select('//div[@*="comments"]//div[@*="comment-header"]')
        for comment_header in comments_blocks:
            author_name = author_url = url = None
            author = comment_header.select('.//cite[contains(@class,"user")]')
            if author.count():
                author_a = author.select('./a')
                if author_a.count():
                    author_name = author_a.text().strip()
                    try:
                        author_url = author_a.attr('href')
                    except Exception:
                        pass
                else:
                    author_name = author.text().strip()
            else:
                # "autor-name" (sic) matches the class actually used in the markup.
                author = comment_header.select('.//a[contains(@class, "autor-name")]')
                if author.count():
                    author_name = author.text().strip()
                    try:
                        author_url = author.attr('href')
                    except Exception:
                        pass
            comment_timestamp = comment_header.select(
                './/span[contains(@class,"datetime") or contains(@class,"timestamp") or contains(@id,"timestamp")]/a'
            )
            if comment_timestamp.count():
                try:
                    url = comment_timestamp.attr('href')
                except Exception:
                    pass
                timestamp = comment_timestamp.text()
                if date and str(date.year)[:-1] not in timestamp and ':' in timestamp:
                    timestamp = '.'.join(
                        [str(_) for _ in (date.month, date.day, date.year)]
                    ) + ' ' + timestamp
                YMDhm = self.parse_date(timestamp)
                if YMDhm:
                    date = datetime(*YMDhm)
            comment_body = comment_header.select(
                './/p[contains(@class,"comment-content") or contains(@class,"comment-body")]'
            )
            if not comment_body.count():
                comment_body = comment_header.select(
                    './following-sibling::p[contains(@class,"comment-content") or contains(@class,"comment-body")][1]'
                )
            if comment_body.count():
                content = comment_body.html()
                # content = ' '.join([_.text().strip() for _ in comment_body.select('.//text()')])
            else:
                continue
            comments.append(dict(
                author_name=author_name,
                author_url=author_url,
                content=content,
                date=date,
                url=url,
            ))

    print("got %d comments" % len(comments))
    for comment in comments:
        self.posts[post].append(comment)
    # This calls self.fetch_post_interactions() first to see if we got any
    # comments; if not, Disqus will be crawled.
    self.fetch_post_interactions_extra([post])
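# The date-fallback chain in task_post() ends with the URL path. This
# standalone sketch isolates that last step; _date_from_blogspot_path is a
# hypothetical helper, not part of the crawler, mirroring the same bounds
# checks (year in 2000-2019, month in 1-12, day defaulted to 1).
from datetime import datetime
from typing import Optional
from urllib.parse import urlparse


def _date_from_blogspot_path(url: str) -> Optional[datetime]:
    path = urlparse(url).path
    if not path.startswith('/'):
        return None
    parts = path.split('/')
    if len(parts) < 3:
        return None
    year_s, month_s = parts[1], parts[2]
    if year_s.isdigit() and month_s.isdigit():
        year, month = int(year_s), int(month_s)
        if 2000 <= year < 2020 and 1 <= month <= 12:
            return datetime(year, month, 1)  # day unknown; default to the 1st
    return None

# _date_from_blogspot_path('http://www.tieandi.com/2014/02/valentines-day-wishlist.html')
# -> datetime.datetime(2014, 2, 1, 0, 0)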