def parse_thread(self, response):
    """Yield one Message per post of a punBB-style thread, plus follow-up
    requests for each poster's profile and for the thread's other pages."""
    thread_id = self.get_url_param(response.url, 'id')
    for block in response.css("#brdmain div.blockpost"):
        try:
            profile_href = block.css(
                ".postleft dt:first-child a::attr(href)").extract_first()
            body = block.css("div.postmsg")
            item = items.Message()
            item['threadid'] = thread_id
            item['postid'] = block.xpath("@id").extract_first()
            item['author_username'] = self.get_text(
                block.css(".postleft dt:first-child a"))
            item['posted_on'] = self.parse_timestr(
                self.get_text(block.css("h2 a")))
            item['contenttext'] = self.get_text(body)
            item['contenthtml'] = self.get_text(body.extract_first())
            yield item
            yield self.make_request('userprofile',
                                    url=profile_href,
                                    relativeurl=profile_href)
        except Exception as e:
            self.logger.warning("Invalid thread page. %s" % e)
    # Pagination: enqueue every other page of this thread.
    for href in response.css("#brdmain .pagelink a::attr(href)").extract():
        yield self.make_request('thread', url=href)
def parse_threadpage(self, response):
    """Yield Message items for one page of a XenForo-style thread, plus
    profile requests for linked users and requests for sibling pages.

    Bug fixes:
    - ``e.message`` does not exist on Python 3 exceptions (removed in
      Python 3.0), so the error handler itself raised; log ``e`` instead.
    - The handler promised "Skipping thread" but the partially-filled item
      was still yielded afterwards; now ``continue`` skips it for real.
    """
    threadid = response.meta['threadid']
    for message in response.css(".messageList .message"):
        msgitem = items.Message()
        try:
            fullid = message.xpath("@id").extract_first()
            # DOM id looks like "post-<digits>"; keep the numeric part.
            msgitem['postid'] = re.match(r"post-(\d+)", fullid).group(1)
            msgitem['author_username'] = self.get_text(
                message.css(".messageDetails .username"))
            msgitem['posted_on'] = self.read_datetime_div(
                message.css(".messageDetails .DateTime"))
            textnode = message.css(".messageContent")
            msgitem['contenthtml'] = textnode.extract_first()
            msgitem['contenttext'] = self.get_text(textnode)
            msgitem['threadid'] = threadid
        except Exception as e:
            self.logger.error(
                "Failed parsing response for thread at %s. Error is %s.\n Skipping thread\n %s"
                % (response.url, e, traceback.format_exc()))
            continue  # do not yield a half-filled item
        yield msgitem
    # Duplicates will be removed by dupefilter.
    for link in response.css("a.username::attr(href)").extract():
        yield self.make_request('userprofile', url=self.make_url(link))
    # Start looking for previous page.
    for link in response.css("div.PageNav nav a::attr(href)").extract():
        yield self.make_request("threadpage", url=link, threadid=threadid)
def parse_message(self, response):
    """Yield Message items from an SMF-style topic page.

    Bug fix: when the URL did not contain "?topic=<id>", ``threadid`` was
    never assigned and the loop below raised a NameError; we now warn and
    bail out early instead.
    """
    m = re.search(r"\?topic=(\d+)", response.url)
    if not m:
        self.logger.warning("Couldn't find a topic id in URL %s" %
                            response.url)
        return
    threadid = m.group(1).strip()
    for postwrapper in response.css(".post_wrapper"):
        messageitem = items.Message()
        # Post metadata line; drop non-ASCII so the date regex is stable.
        postmeta = self.get_text(
            postwrapper.css(".flow_hidden .keyinfo div"))
        postmeta_ascii = re.sub(r'[^\x00-\x7f]', r'', postmeta).strip()
        m = re.search(r'on:\s*(.+)', postmeta_ascii)
        if m:
            if "N/A" not in m.group(1):
                messageitem['posted_on'] = self.parse_timestr(m.group(1))
        postcontent = postwrapper.css(".postarea .post").xpath(
            "./div[contains(@id, 'msg_')]")
        # Post id is the numeric suffix of the "msg_<digits>" DOM id.
        m = re.search(r'msg_(\d+)',
                      postcontent.xpath('@id').extract_first())
        if m:
            messageitem['postid'] = m.group(1)
        messageitem['threadid'] = threadid
        messageitem['author_username'] = self.get_text(
            postwrapper.css(".poster h4"))
        messageitem['contenthtml'] = postcontent.extract_first()
        messageitem['contenttext'] = self.get_text(postcontent)
        yield messageitem
def parse_message(self, response):
    """Yield Message items from a MyBB-style thread page."""
    try:
        thread_id = self.get_url_param(response.url, 'tid')
    except Exception as e:
        self.logger.warning("Couldn't get threadid at %s with error %s" %
                            (response.url, e))
        return
    for post in response.css("#posts div.post"):
        try:
            item = items.Message()
            item['threadid'] = thread_id
            # DOM id is "post_<digits>"; keep the numeric part.
            item['postid'] = post.xpath("@id").extract_first(" ").replace(
                "post_", "").strip()
            item['author_username'] = self.get_text(
                post.xpath(
                    ".//div[@class='author_information']//span[@class='largetext']/a"
                ))
            # Date text carries a trailing "(...)" qualifier; keep the prefix.
            raw_date = self.get_text(
                post.css("div.post_head span.post_date")).split("(")[0]
            item['posted_on'] = self.parse_datetime(raw_date)
            body = post.css("div.post_body")
            item['contenttext'] = self.get_text(body)
            item['contenthtml'] = self.get_text(body.extract_first())
            yield item
        except Exception as e:
            self.logger.warning("Invalid thread page. %s" % e)
def parse_message(self, response):
    """Yield Message items from a phpBB-style topic page.

    Bug fix: the exception handler previously called
    ``inspect_response(response, self)`` — a Scrapy debugging hook that
    opens an interactive shell and blocks the crawl. Leftover from
    development; removed (the dead commented-out KeyError block at the top
    was dropped as well).
    """
    try:
        threadid = self.get_url_param(response.url, 't')
        posts = response.xpath(
            ".//div[@id='page-body']/div[contains(@id, 'p')]")
        for post in posts:
            messageitem = items.Message()
            messageitem['threadid'] = threadid
            author = post.xpath(
                './/a[starts-with(@class, "username")]/text()'
            ).extract_first()
            messageitem['author_username'] = author
            # The last text node of the author line is the timestamp.
            post_time = post.css('p.author *::text').extract()
            messageitem['posted_on'] = dateutil.parser.parse(
                post_time[-1].strip())
            post_link = post.css('p.author > a::attr(href)').extract_first()
            messageitem['postid'] = self.get_url_param(post_link, 'p')
            msg = post.css("div.content")
            messageitem['contenttext'] = self.get_text(msg)
            messageitem['contenthtml'] = self.get_text(msg.extract_first())
            yield messageitem
    except Exception as e:
        self.logger.warning("Invalid thread page. %s" % e)
def parse_thread(self, response):
    """Yield a Message and a User item for every post of a MyBB-style
    thread.

    Guest ("Unregistered") posts carry less markup than member posts, so
    both extraction passes branch on the poster type.
    """
    posts = response.xpath('.//div[@class="post "]')
    for post in posts:
        messageitem = items.Message()
        # Guest posts are marked by an "Unregistered" span; a "special"
        # user is a guest-marked post that nevertheless links to a member
        # profile.
        guest_user = len(post.xpath('.//span[contains(text(), "Unregistered")]')) > 0
        special_user = guest_user is True and post.xpath('.//div[@class="author_information"]/strong/span/a[contains(@href, "member")]//text()').extract_first() is not None
        if guest_user is False or special_user is True:
            # Registered member: username comes from the profile link
            # (with a fallback to the nested <strong>/<span> variant).
            messageitem['author_username'] = post.xpath('.//div[@class="author_information"]//a[contains(@href, "member")]//text()').extract_first()
            if messageitem['author_username'] is None:
                messageitem['author_username'] = post.xpath('.//div[@class="author_information"]/strong/span/a[contains(@href, "member")]//text()').extract_first()
            # NOTE(review): lstrip('post_') strips a character *set*, not
            # the literal prefix; safe only while the remaining id is digits.
            messageitem['postid'] = post.xpath('@id').extract_first().lstrip('post_')
            messageitem['threadid'] = re.search('tid\=([0-9]+)', response.url).group(1)
            msg = post.xpath('.//div[contains(@class, "post_body")]')
            messageitem['contenttext'] = self.get_text(msg)
            messageitem['contenthtml'] = self.get_text(msg.extract_first())
            # Post date handling
            posted_on = post.xpath('.//span[@class="post_date"]/text()').extract_first()
            messageitem['posted_on'] = self.parse_datetime(posted_on)
        else:
            # Guest post: username is plain text, no profile link.
            messageitem['author_username'] = post.xpath('div/div/strong/span/text()').extract_first()
            messageitem['postid'] = post.xpath('@id').extract_first().lstrip('post_')
            messageitem['threadid'] = re.search('tid\=([0-9]+)', response.url).group(1)
            msg = post.xpath('.//div[contains(@class, "post_body")]')
            messageitem['contenttext'] = self.get_text(msg)
            messageitem['contenthtml'] = self.get_text(msg.extract_first())
            # Post date handling
            posted_on = post.xpath('.//span[@class="post_date"]/text()').extract_first()
            messageitem['posted_on'] = self.parse_datetime(posted_on)
        # NOTE(review): the item is yielded even when the username is still
        # None, despite the warning text — confirm whether a skip was meant.
        if messageitem['author_username'] is None:
            self.logger.warning("Author username is still None at URL: %s. Can't yield item." % response.url)
        yield messageitem
        # Yield user.
        useritem = items.User()
        if guest_user is False or special_user is True:
            useritem['username'] = messageitem['author_username']
            useritem['fullurl'] = post.xpath('.//div[@class="author_information"]//span[@class="largetext"]/a/@href').extract_first()
            # Relative path is whatever follows the ".onion" host part.
            useritem['relativeurl'] = useritem['fullurl'].split('.onion')[1]
            useritem['title'] = post.xpath('.//div[@class="author_information"]//span[@class="smalltext"]/text()[1]').extract_first().strip()
            message_count = post.xpath('.//div[@class="author_statistics"]/text()[2]').extract_first()
            useritem['message_count'] = int(re.sub('[^0-9]', '', message_count))
            post_count = post.xpath('.//div[@class="author_statistics"]/text()[3]').extract_first()
            useritem['post_count'] = int(re.sub('[^0-9]', '', post_count))
            # "Registrato: " is the (Italian) join-date label being stripped.
            useritem['joined_on'] = self.parse_datetime(post.xpath('.//div[@class="author_statistics"]/text()[4]').extract_first().replace("Registrato: ", ''))
            useritem['reputation'] = post.xpath('.//strong[contains(@class, "reputation")]/text()').extract_first()
            # NOTE(review): duplicate assignment of post_count (same value).
            useritem['post_count'] = int(re.sub('[^0-9]', '', post_count))
            useritem['username_id'] = re.search('([0-9]+)', useritem['relativeurl']).group(1)
            useritem['membergroup'] = post.xpath('.//img[not(@class="buddy_status")]/@title').extract_first()
        else:
            # Unregistered users have no message count, join date, post count, reputation, id..
            useritem['username'] = messageitem['author_username']
            useritem['fullurl'] = self.spider_settings['endpoint'] + "/" + useritem['username']
            useritem['relativeurl'] = useritem['username']
            useritem['title'] = post.xpath('.//div[@class="author_information"]//span[@class="smalltext"]/text()[1]').extract_first().strip()
        yield useritem
def parse_message(self, response):
    """Yield one Message per post in the thread's post list."""
    for entry in response.css('ul.row.list-posts > li'):
        item = items.Message()
        # The poster link may embed a "flair" span; strip its text from
        # the full anchor text to recover the bare username.
        poster_text = self.get_text(entry.css('.post-header a.poster'))
        flair_text = self.get_text(
            entry.css('.post-header a.poster span.flair'))
        item["author_username"] = poster_text.replace(flair_text, "")
        item['postid'] = self.get_post_id(
            entry.css('span:first-child::attr(id)').extract_first())
        item['threadid'] = self.get_thread_id(response.url)
        item['posted_on'] = self.parse_timestr(
            self.get_text(
                entry.css('.footer .cols-10 .col-4:first-child strong')))
        body = entry.css("div.content")
        item['contenttext'] = self.get_text(body)
        item['contenthtml'] = self.get_text(body.extract_first())
        yield item
def parse_message(self, response):
    """Yield Message items from a phpBB3 topic page.

    Handles deleted topics and URLs that identify a single post (p=)
    rather than the topic (t=).
    """
    notice = response.xpath('.//div[@class="inner"]/p/text()').extract_first()
    if notice and "The requested topic does not exist." in notice:
        self.logger.warning('Post not available. Likely deleted: "%s"' %
                            response.url)
        return
    match = re.search("t=(\d+)", response.url)
    if match:
        threadid = match.group(1).strip()
    else:
        # URL only has p=; recover the topic id from the title link.
        threadid = response.xpath(
            './/h2[@class="topic-title"]/a/@href').extract_first()
        if threadid:
            threadid = re.search('t=(\d+)', threadid).group(1)
        else:
            self.logger.warning("Couldn't identify the threadid at URL %s" %
                                response.url)
        #m = re.search("p=(\d+)", response.url)
        #if m:
        #    threadid = m.group(1).strip()
    for post in response.xpath('//div[contains(@class, "post has-profile")]'):
        try:
            item = items.Message()
            item['threadid'] = threadid
            item['postid'] = post.xpath('@id').extract_first()
            item['author_username'] = post.xpath(
                './/a[contains(@class, "username")]/text()').extract_first()
            stamp = post.xpath(
                './/span[@class="responsive-hide"]/following-sibling::text()'
            ).extract_first()
            if stamp:
                item['posted_on'] = self.parse_timestr(stamp)
            body = post.xpath('.//div[@class="content"]')
            item['contenttext'] = self.get_text(body)
            item['contenthtml'] = self.get_text(body.extract_first())
            yield item
        except Exception as e:
            self.logger.warning("Invalid thread page. %s" % e)
def parse_message(self, response):
    """Yield a Message and a User item for every post of a punBB topic."""
    thread_id = self.get_url_param(response.url, 'id')
    for post in response.css("#brdmain .blockpost"):
        body = post.xpath(".//div[@class='postmsg']")
        poster = self.get_text(post.css(".postleft dl dt strong span"))
        # Message item.
        msg = items.Message()
        msg['contenthtml'] = post.xpath(
            ".//div[@class='postmsg']").extract_first()
        msg['contenttext'] = self.get_text(body)
        # Post id comes from the permalink's "pid" query parameter.
        msg['postid'] = self.get_url_param(
            post.css("h2 span a::attr(href)").extract_first(), 'pid')
        msg['threadid'] = thread_id
        msg['author_username'] = poster
        msg['posted_on'] = self.parse_timestr(
            self.get_text(post.css("h2 span a")))
        yield msg
        # User item.
        user = items.User()
        user['username'] = poster
        member_group = post.css(".postleft dd.usertitle")
        if len(member_group) > 0:
            user['membergroup'] = self.get_text(member_group)
        website = post.css(
            ".postleft dd.usercontacts span.website a::attr(href)")
        if len(website) > 0:
            user['website'] = self.get_text(website)
        # Remaining "key: value" sidebar entries map onto user fields.
        for attribute in post.css(".postleft dd"):
            if attribute.css("span::attr(class)"):
                continue
            content = self.get_text(attribute.css("span"))
            match = re.search('(.+): (.+)', content)
            if not match:
                continue
            key = match.group(1)
            value = match.group(2)
            if 'From' in key or 'Lieu' in key:
                user['location'] = value
            elif 'Posts' in key or 'Messages' in key:
                user['post_count'] = value
            elif 'Registered' in key or 'Inscription' in key:
                user['joined_on'] = self.parse_timestr(value)
            else:
                self.logger.warning('New information found : %s' % key)
        yield user
def parse_message(self, response):
    """Yield Message items from a punBB "viewtopic" page.

    Bug fix: the warning in the exception handler used a format string
    with one "%s" placeholder but two arguments, so the handler itself
    raised "not all arguments converted during string formatting" and
    masked the real parse error. The URL placeholder is now present.
    """
    threadid = self.get_url_param(response.url, 'id')
    posts = response.css("#punviewtopic div.blockpost")
    for post in posts:
        try:
            messageitem = items.Message()
            posttime = self.parse_timestr(self.get_text(post.css("h2 a")))
            messageitem['author_username'] = self.get_text(
                post.xpath(".//div[@class='postleft']/dl/dt/strong/a/text()"
                           ).extract_first())
            messageitem['postid'] = post.xpath("@id").extract_first()
            messageitem['threadid'] = threadid
            messageitem['posted_on'] = posttime
            msg = post.css("div.postmsg")
            messageitem['contenttext'] = self.get_text(msg)
            messageitem['contenthtml'] = self.get_text(msg.extract_first())
            yield messageitem
        except Exception as e:
            self.logger.warning("Invalid thread page. Error: '%s'. URL: %s" %
                                (e, response.url))
def parse_message(self, response):
    """Yield Message items from a MyBB thread page, skipping posts whose
    class marks them as deleted/hidden."""
    # self.logger.info("Yielding messages from %s" % response.url)
    threadid = self.get_url_param(response.url, 'tid')
    posts = response.css("#posts .post")
    for post in posts:
        if not 'deleted_post_hidden' in post.xpath(
                '@class').extract_first():
            try:
                # Date: prefer the helper's cleaned text, fall back to the
                # raw first text node when it comes back empty.
                post_date_string = self.get_text(
                    post.css('span.post_date::text'))
                if post_date_string == '':
                    post_date_string = post.css(
                        'span.post_date::text').extract_first()
                post_date = self.parse_timestr(post_date_string, response)
                author_username = self.get_text(
                    post.xpath('.//span[@class="largetext"]'))
                contenttext = post.css('.post_body')
                # DOM id is "post_<digits>"; the numeric part is the post id.
                match = re.match('post_(\d+)',
                                 post.xpath("@id").extract_first())
                # NOTE(review): when the id does not match, the post is
                # silently skipped (no item yielded) — confirm intended.
                if match:
                    post_id = match.group(1)
                    messageitem = items.Message()
                    messageitem['author_username'] = author_username
                    messageitem['postid'] = post_id
                    messageitem['threadid'] = threadid
                    messageitem['posted_on'] = post_date
                    messageitem['contenttext'] = self.get_text(contenttext)
                    messageitem['contenthtml'] = contenttext.extract_first()
                    yield messageitem
            except Exception as e:
                self.logger.warning(
                    "Cannot parse message item at URL %s because %s" %
                    (response.url, e))
                pass
        else:
            self.logger.warning(
                "Did not yield post because it was deleted or hidden at %s" %
                response.url)
def get_message_item_from_postwrapper(self, postwrapper, response):
    """Build and return a Message item from one SMF ".post_wrapper" node."""
    item = items.Message()
    item['threadid'] = response.meta['threadid']
    item['author_username'] = self.get_text(postwrapper.css(".poster h4"))
    # The date lives in the key-info line; drop non-ASCII noise first so
    # the "on: <date>" regex matches reliably.
    meta_text = self.get_text(postwrapper.css(".flow_hidden .keyinfo div"))
    meta_ascii = re.sub(r'[^\x00-\x7f]', r'', meta_text).strip()
    date_match = re.search('on:\s*(.+)', meta_ascii)
    if date_match:
        item['posted_on'] = self.parse_timestr(date_match.group(1))
    content = postwrapper.css(".postarea .post").xpath(
        "./div[contains(@id, 'msg_')]")
    # Post id is the numeric suffix of the "msg_<digits>" DOM id.
    id_match = re.search('msg_(\d+)', content.xpath('@id').extract_first())
    if id_match:
        item['postid'] = id_match.group(1)
    item['contenthtml'] = self.get_text(content.extract_first())
    item['contenttext'] = self.get_text(content)
    return item
def parse_thread(self, response):
    """Yield Messages for a XenForo thread page, request each poster's
    profile, and follow the thread's pagination links."""
    thread_id = response.meta['threadid']
    for post in response.css("#messageList li.message"):
        try:
            dom_id = post.xpath("@id").extract_first()
            body = post.css("blockquote.messageText")
            profile_href = post.css(
                "div.messageDetails a.username.author::attr(href)"
            ).extract_first()
            item = items.Message()
            item['threadid'] = thread_id
            # DOM id is "post-<digits>"; keep the numeric part.
            item['postid'] = re.match("post-(\d+)", dom_id).group(1)
            item['author_username'] = post.xpath(
                './/div[@class="uix_userTextInner"]/a/text()').extract_first()
            item['posted_on'] = self.parse_datetime(
                self.get_text(post.xpath(".//a[@class='datePermalink']")))
            item['contenttext'] = self.get_text(body)
            item['contenthtml'] = self.get_text(body.extract_first())
            yield item
            yield self.make_request('userprofile',
                                    url=profile_href,
                                    relativeurl=profile_href,
                                    username=item['author_username'])
        except Exception as e:
            self.logger.warning("Invalid thread page %s. %s" %
                                (response.url, e))
    for href in response.css(".PageNav nav a::attr(href)").extract():
        yield self.make_request('thread',
                                url=href,
                                threadid=response.meta['threadid'])
def parse_thread(self, response):
    """Parse a marketplace-style thread page: yield the opening post and
    its author, then every comment and each commenter.

    Bug fix: the second statement of this method was
    ``inspect_response(response, self)`` — a Scrapy debugging hook that
    drops every call into an interactive shell and stalls the crawl.
    Leftover from development; removed.
    """
    threadid = self.get_id_from_url(response.url)
    # We first parse the first post.
    messageitem = items.Message()
    messageitem['threadid'] = threadid
    messageitem['postid'] = "thread" + threadid
    msg = response.xpath(
        './/div[@class="col-xs-10 alert alert-info whitebg"]')
    messageitem['contenttext'] = self.get_text(msg)
    messageitem['contenthtml'] = self.get_text(msg.extract_first())
    # There are 3 user classes: Buyer, Vendor and Support.
    vendor = response.xpath(
        ".//div[@class='col-xs-12']/small/a/text()").extract_first() is not None
    support = response.xpath(
        ".//div[@class='col-xs-12']/small/b/text()").extract_first() == 'Support'
    buyer = vendor is False and support is False
    # Buyer username.
    if buyer is True:
        author_username = response.xpath(
            ".//div[@class='col-xs-12']/small/text()").extract_first().strip()
        author_username = re.search('by (.*)$', author_username).group(1)
        messageitem['author_username'] = author_username
        membergroup = "Buyer"
    # Support staff.
    elif support is True:
        author_username = response.xpath(
            ".//div[@class='col-xs-12']/small/b/text()").extract_first().strip()
        messageitem['author_username'] = author_username
        membergroup = "Support"
    # Vendor username.
    elif vendor is True:
        author_username = response.xpath(
            ".//div[@class='col-xs-12']/small/a/text()").extract_first()
        messageitem['author_username'] = author_username
        membergroup = "Vendor"
    else:
        self.logger.warning('Unknown member group at %s' % response.url)
    # Get info about the post (posting time).
    postinfo = self.get_text(
        response.xpath(".//div[@class='col-xs-12']/small"))
    if postinfo:
        matches = re.search(r'(\d+) (.+) ago by ([^ ]+)', postinfo)
        messageitem['posted_on'] = self.parse_datetime(matches.group(0))
    else:
        self.logger.warning("No postinfo yielded at %s" % response.url)
    yield messageitem
    # Yield the opening poster.
    user = items.User()
    user['username'] = author_username
    user['membergroup'] = membergroup
    if membergroup in ["Buyer", "Support"]:
        user['relativeurl'] = user['username']
        user['fullurl'] = self.spider_settings['endpoint'] + user['username']
    elif membergroup == "Vendor":
        user['relativeurl'] = response.xpath(
            ".//div[@class='col-xs-12']/small/a/@href").extract_first()
        user['fullurl'] = self.spider_settings['endpoint'] + user['relativeurl']
    else:
        self.logger.warning('Unknown member group at %s' % response.url)
    poster_block = response.xpath(".//div[@class='col-xs-12']")
    if membergroup in ['Buyer', 'Vendor']:
        stars = poster_block.xpath(
            './/span[@class="nowrap btn-xs alert brightBlueBG"]/text()'
        ).extract_first()
        if stars:
            # NOTE(review): "[Vendor|Buyer]" is a character class, not an
            # alternation; it still matches the char before ":" so the
            # capture works — left untouched to preserve behavior.
            stars = re.search('[Vendor|Buyer]: ([0-9]{1,1000})',
                              stars).group(1)
            user['stars'] = stars
        else:
            self.logger.warning('No stars at URL %s' % response.url)
    yield user
    post = response.css('.row .col-lg-8 > div')
    # Parse the remaining comments. Post IDs are not caught by the comment
    # selector, so we walk them in parallel using an index.
    reply_index = 0
    msg_ids = post.xpath(".//span[@class='forumMsgOffset']")
    for comment in post.css('div.comment p'):
        messageitem = items.Message()
        messageitem['threadid'] = threadid
        messageitem['postid'] = msg_ids[reply_index].xpath(
            "@id").extract_first()
        reply_index += 1
        post_info = comment.css('small::text').extract_first()
        if post_info:
            matches = re.search(r'(\d+) point([s]*) (.+)', post_info)
            if matches:
                messageitem['posted_on'] = self.parse_timestr(
                    matches.group(3))
        author_name = comment.css('a.vendorname::text').extract_first()
        if not author_name:
            author_name = comment.css('*::text').extract_first()
        messageitem['author_username'] = author_name.strip()
        messageitem['contenttext'] = ''.join(
            comment.css('p::text').extract()[1:])
        messageitem['contenthtml'] = self.get_text(
            comment.css('p').extract_first())
        yield messageitem
    # Second pass over the comments: yield one User item per commenter.
    for comment in post.css('div.comment p'):
        useritem = items.User()
        vendor = comment.xpath(
            './/a[@class="vendorname"]/text()').extract_first() is not None
        buyer = comment.xpath(
            './/span[@class="left lightGrey"]').extract_first() is not None \
            and self.get_text(comment).startswith('Support') is False
        support = comment.xpath('.//span/b') is not None \
            and self.get_text(comment).startswith('Support') is True
        if vendor is True:
            useritem['username'] = comment.xpath(
                './/a[@class="vendorname"]/text()').extract_first()
            useritem['relativeurl'] = comment.xpath(
                './/a[@class="vendorname"]/@href').extract_first()
            useritem['fullurl'] = self.spider_settings['endpoint'] + \
                useritem['relativeurl']
            membergroup = "Vendor"
            useritem['stars'] = comment.xpath(
                './/span[@class="nowrap btn-xs alert brightBlueBG"]/text()'
            ).extract_first().replace('Vendor: ', '')
        elif support is True:
            username = self.get_text(comment)
            username = re.search('^(Support)[0-9]{1,100} ',
                                 username).group(1)
            useritem['username'] = username
            useritem['relativeurl'] = useritem['username']
            useritem['fullurl'] = self.spider_settings['endpoint'] + \
                useritem['username']
            membergroup = "Support"
        elif buyer is True:
            username = self.get_text(comment)
            username = re.search('^(.*?) Buyer', username).group(1)
            useritem['username'] = username
            useritem['relativeurl'] = useritem['username']
            useritem['fullurl'] = self.spider_settings['endpoint'] + \
                useritem['username']
            membergroup = "Buyer"
            useritem['stars'] = comment.xpath(
                './/span[@class="nowrap btn-xs alert brightBlueBG"]/text()'
            ).extract_first().replace('Buyer: ', '')
        else:
            self.logger.warning("Unknown commenter group at %s" %
                                response.url)
        # NOTE(review): on the "unknown" branch, membergroup keeps the
        # value from a previous iteration (or the thread-level one) —
        # likely stale; confirm intended.
        useritem['membergroup'] = membergroup
        yield useritem
def parse_message(self, response):
    """Yield a Message and a User item per post of an SMF topic page.

    The SMF skin alternates "windowbg"/"windowbg2" classes between posts,
    so both selectors are combined.
    """
    #self.logger.info("Yielding messages from %s" % response.url)
    # The "topic" param looks like "<id>.<page>"; keep the id part only.
    threadid = self.get_url_param(response.url, 'topic').split(".")[0]
    posts = response.css("#forumposts div.windowbg") + response.css(
        "#forumposts div.windowbg2")
    for post in posts:
        messageitem = items.Message()
        # The date is embedded as "« ... on: <date> »" in the key-info line.
        posttime = self.parse_timestr(
            re.search("«.*on:(.*?)»",
                      self.get_text(post.css("div.keyinfo div.smalltext")),
                      re.S | re.M).group(1).strip())
        author_username = post.xpath(".//h4/a/text()").extract_first()
        if author_username is not None:
            # Verified posters.
            messageitem['author_username'] = author_username.strip()
        elif post.xpath(".//h4/text()").extract_first() is not None:
            # Guests: the name is plain text in the h4, no profile link.
            messageitem['author_username'] = post.xpath(
                ".//h4/text()").extract_first().strip()
        else:
            self.logger.warning('Unknown problem yielding user at URL %s' %
                                response.url)
        # Post id is the numeric suffix of the inner div's "msg_<id>" id.
        messageitem['postid'] = post.css(
            "div.post div.inner::attr(id)").extract_first().replace(
                "msg_", "")
        messageitem['threadid'] = threadid
        messageitem['posted_on'] = posttime
        msg = post.css("div.post")
        messageitem['contenttext'] = self.get_text(msg)
        messageitem['contenthtml'] = self.get_text(msg.extract_first())
        yield messageitem
    # Second pass: one User item per post.
    for post in posts:
        useritem = items.User()
        username = post.xpath(".//h4/a/text()").extract_first()
        if username is not None:
            # Verified posters.
            useritem['username'] = username.strip()
            useritem["relativeurl"] = self.get_relative_url(
                post.css(".poster h4 a::attr(href)").extract_first())
            useritem["fullurl"] = self.make_url(
                post.css(".poster h4 a::attr(href)").extract_first())
        elif post.xpath(".//h4/text()").extract_first() is not None:
            # Guests have no profile URL; fall back to the bare username.
            useritem['username'] = post.xpath(
                ".//h4/text()").extract_first().strip()
            useritem["relativeurl"] = useritem['username']
            useritem["fullurl"] = self.spider_settings[
                'endpoint'] + useritem['username']
        else:
            self.logger.warning('Unknown problem yielding user at URL %s' %
                                response.url)
        # Map each profile-sidebar <li> onto a User field by its class.
        for li in post.xpath(".//ul/li"):
            key = li.xpath(".//@class").extract_first()
            keytext = li.xpath(".//text()").extract_first()
            if key == "postgroup":
                useritem['postgroup'] = keytext
            elif key == "membergroup":
                useritem['membergroup'] = keytext
            elif key == 'karma':
                useritem['karma'] = keytext.replace('Karma: ', '')
            elif key == 'title':
                useritem['title'] = keytext
            elif key == 'stars':
                useritem['stars'] = keytext
            elif key == 'postcount':
                useritem['post_count'] = keytext.replace('Posts: ', '')
            elif key == 'custom':
                awards = li.xpath(".//text()").extract()
                useritem['awards'] = '|'.join(awards).replace(
                    'Awards: |', '')
            elif key is None or key in [
                    'blurb', 'avatar', 'profile', 'new_win', 'quote',
                    'quote_button'
            ]:
                # Known presentation-only classes: nothing to extract.
                pass
            else:
                self.logger.warning(
                    "Unknown key in user profile '%s' with value '%s'" %
                    (key, keytext))
        yield useritem
def parse_message(self, response):
    """Yield Message items from a punBB thread, working around the admin's
    spoofed timestamps.

    The admin ("SpeedStepper") obfuscates/spoofs their posting time, so
    their posts are timestamped relative to the *next* genuine post (or
    the previous inferred time) instead of their own header.
    """
    threadid = self.get_url_param(response.url, 'id')
    posts = response.css("#brdmain div.blockpost")
    index = 0
    last_posttime = None
    # All author names up front, to detect admin-only threads.
    authors = posts.xpath(
        ".//div[@class='postleft']/dl/dt/strong/a/text()").extract()
    for post in posts:
        try:
            messageitem = items.Message()
            # NOTE(review): userprofile_link is computed but never used.
            userprofile_link = post.css(
                ".postleft dt:first-child a::attr(href)").extract_first()
            messageitem['author_username'] = self.get_text(
                post.xpath(
                    ".//div[@class='postleft']/dl/dt/strong/a/text()").
                extract_first())
            # The admin (SpeedStepper) obfuscates/spoofs their time of posting.
            # Their posts are therefore tagged as coming *just before* the proceding post.
            # SpeedStepper frequently makes 2+ posts in a row, so we need to hack around a bit.
            # a while-loop would be better.
            only_admin = len(list(set(authors))) == 1 and list(
                set(authors))[0] == 'SpeedStepper'
            if only_admin is True:
                # No genuine timestamp anywhere in the thread.
                posttime = None
                self.logger.warning(
                    "Only SpeedStepper has posted in this thread. No posted_on could be determined from %s."
                    % response.url)
            elif messageitem[
                    'author_username'] == 'SpeedStepper' and index == 0:
                # Admin opened the thread: derive the time from the next
                # post with a readable header (offset by 1-2 seconds).
                if self.get_text(posts[index + 1].xpath(
                        "h2/span/a/text()").extract_first()) == '':
                    posttime = self.parse_datetime(
                        self.get_text(
                            posts[index + 2].xpath("h2/span/a/text()").
                            extract_first())) - timedelta(seconds=2)
                else:
                    posttime = self.parse_datetime(
                        self.get_text(
                            posts[index + 1].xpath("h2/span/a/text()").
                            extract_first())) - timedelta(seconds=1)
                last_posttime = posttime  # A failsafe ensuring we always have a time we can refer to and accomodate SpeedStepper.
            elif messageitem[
                    'author_username'] == 'SpeedStepper' and index > 0 and last_posttime is not None:
                # Later admin post: one second after the previous post.
                posttime = last_posttime + timedelta(seconds=1)
                last_posttime = posttime  # A failsafe ensuring we always have a time we can refer to and accomodate SpeedStepper.
            else:
                # Regular poster: trust the post's own header time.
                posttime = self.parse_datetime(
                    self.get_text(
                        post.xpath("h2/span/a/text()").extract_first()))
                last_posttime = posttime  # A failsafe ensuring we always have a time we can refer to and accomodate SpeedStepper.
            messageitem['posted_on'] = posttime
            messageitem['postid'] = post.xpath("@id").extract_first()
            messageitem['threadid'] = threadid
            #messageitem['subforum'] = self.get_text(response.css('ul.crumbs:nth-child(2) > li:nth-child(2) > a:nth-child(2)'))
            #self.logger.info("subforum is %s" % messageitem['subforum'])
            msg = post.css("div.postmsg")
            messageitem['contenttext'] = self.get_text(msg)
            messageitem['contenthtml'] = self.get_text(msg.extract_first())
            index = index + 1
            yield messageitem
        except Exception as e:
            self.logger.warning("Invalid thread page at %s (Error: '%s'" %
                                (response.url, e))