def parse_threadlisting(self, response):
    """Yield one items.Thread per row of an SMF-style message index table.

    Rows whose subject cell has no link are logged and skipped.
    """
    for line in response.css("#messageindex table tbody tr"):
        threaditem = items.Thread()
        # Last-post cell reads "<time> by <user>"; keep only the time part.
        last_post_time = self.parse_timestr(
            self.get_text(line.css("td:last-child")).split("by")[0].strip())
        # First link of the subject cell, or None when the row has no link.
        threadlinkobj = next(iter(line.css("td:nth-child(3) span a") or []), None)
        if threadlinkobj:
            # threadlinkobj is known truthy here; no conditional needed.
            threadlinkhref = threadlinkobj.xpath("@href").extract_first()
            threaditem['title'] = self.get_text(threadlinkobj)
            threaditem['relativeurl'] = self.get_relative_url(threadlinkhref)
            threaditem['fullurl'] = self.make_url(threadlinkhref)
            threaditem['threadid'] = self.get_url_param(
                threaditem['fullurl'], 'topic').split(".")[0]
            byuser = self.get_text(line.xpath(".//div/p/a"))
            if byuser == '':
                # Author shown as plain "Started by X" text instead of a link.
                byuser = line.xpath(
                    ".//div/p[contains(text(), 'Started by')]/text()"
                ).extract_first().strip().replace("Started by ", "")
            threaditem['author_username'] = byuser
            threaditem['last_update'] = last_post_time
            # Stats cell reads "<n> Replies ... <m> Views"; guard the matches
            # so an unexpected layout skips the field instead of raising
            # AttributeError on None.group(1).
            reply_review = self.get_text(line.css("td:nth-child(4)"))
            replies_m = re.search(r"(\d+) Replies", reply_review, re.S | re.M)
            if replies_m:
                threaditem['replies'] = replies_m.group(1)
            views_m = re.search(r"(\d+) Views", reply_review, re.S | re.M)
            if views_m:
                threaditem['views'] = views_m.group(1)
            yield threaditem
        else:
            self.logger.warning(
                'Couldn\'t yield thread. Please review: %s' % response.url)
def parse_thread_listing(self, response):
    """Yield Thread items plus thread and pagination requests from a punBB index."""
    for line in response.css("#brdmain tbody tr"):
        # Thread-name link, or None if the row is not a thread row.
        threadlinkobj = next(iter(line.css("td:first-child a") or []), None)
        if threadlinkobj:
            threaditem = items.Thread()
            # threadlinkobj is known truthy here; no conditional needed.
            threadlinkhref = threadlinkobj.xpath("@href").extract_first()
            threaditem['title'] = self.get_text(threadlinkobj)
            threaditem['relativeurl'] = threadlinkhref
            threaditem['fullurl'] = self.make_url(threadlinkhref)
            threaditem['threadid'] = self.get_url_param(threaditem['fullurl'], 'id')
            # Author cell reads "by <name>".
            byuser = self.get_text(line.css("td:first-child span.byuser"))
            m = re.match("by (.+)", byuser)
            if m:
                threaditem['author_username'] = m.group(1)
            threaditem['last_update'] = self.parse_timestr(
                self.get_text(line.css("td:last-child a")))
            threaditem['replies'] = self.get_text(line.css("td:nth-child(2)"))
            threaditem['views'] = self.get_text(line.css("td:nth-child(3)"))
            yield threaditem
            yield self.make_request('thread', url=threadlinkhref)
    # Pagination depends only on the response, so it is requested once per page.
    for link in response.css("#brdmain .pagelink a::attr(href)").extract():
        yield self.make_request('threadlisting', url=link)
def parse_threadlisting(self, response):
    """Yield Thread items from a punBB board index, skipping moved threads.

    Rows flagged with span.movedtext point at another board and are not
    collected; rows without a thread link are logged.
    """
    for line in response.css("#brdmain tbody tr"):
        threaditem = items.Thread()
        threadlinkobj = next(iter(line.css("td:first-child a") or []), None)
        if line.xpath(".//span[@class='movedtext']"):
            self.logger.warning("Thread was moved. Not collected.")
        elif threadlinkobj:
            last_post_time = self.parse_datetime(
                self.get_text(line.css("td:last-child a")))
            # threadlinkobj is known truthy here; no conditional needed.
            threadlinkhref = threadlinkobj.xpath("@href").extract_first()
            threaditem['title'] = self.get_text(threadlinkobj)
            if threaditem['title'] == '':
                # Keep the item rather than dropping untitled threads.
                threaditem['title'] = "[Untitled thread]"
                self.logger.warning(
                    "Encountered a thread with no title at %s. Inserted %s as title."
                    % (response.url, threaditem['title']))
            threaditem['relativeurl'] = threadlinkhref
            threaditem['fullurl'] = self.make_url(threadlinkhref)
            threaditem['threadid'] = self.get_url_param(
                threaditem['fullurl'], 'id')
            byuser = self.get_text(line.css("td:first-child span.byuser"))
            m = re.match("by (.+)", byuser)
            if m:
                threaditem['author_username'] = m.group(1)
            threaditem['last_update'] = last_post_time
            threaditem['replies'] = self.get_text(line.css("td:nth-child(2)"))
            threaditem['views'] = self.get_text(line.css("td:nth-child(3)"))
            # Yield only populated items; moved/linkless rows produce nothing.
            yield threaditem
        else:
            self.logger.warning("no threadlinkobj")
def parse_threadlisting(self, response):
    """Yield Thread items from the #vf topic table, logging rows that fail."""
    threads = response.css('#vf table tbody tr')
    for thread in threads:
        try:
            threadurl = thread.css(
                'td:first-child a::attr(href)').extract_first()
            thread_last_update = self.get_text(
                thread.css('td:last-child a'))
            threaditem = items.Thread()
            threaditem['threadid'] = self.get_url_param(threadurl, 'id')
            threaditem['title'] = thread.xpath(
                ".//a[contains(@href, 'viewtopic')]/text()").extract_first()
            threaditem['author_username'] = self.get_text(
                thread.css('td:first-child span.byuser span'))
            threaditem['last_update'] = self.parse_timestr(
                thread_last_update)
            threaditem['relativeurl'] = threadurl
            threaditem['fullurl'] = self.make_url(threadurl)
            threaditem['replies'] = self.get_text(
                thread.css('td:nth-child(2)'))
            threaditem['views'] = self.get_text(
                thread.css('td:nth-child(3)'))
            yield threaditem
        except Exception as e:
            # Best-effort parsing: one bad row must not abort the page.
            self.logger.error(
                "Cannot parse thread item at %s (Error: %s)"
                % (response.url, e))
def parse_threadlisting(self, response):
    """Yield Thread items (and a User item when a flair is shown) per topic."""
    topics = response.css('ul.row.big-list.zebra > li')
    for topic in topics:
        threaditem = items.Thread()
        threaditem['title'] = self.get_text(topic.css("div.main > div > a"))
        href = topic.css("div.main > div > a::attr(href)").extract_first()
        threaditem['relativeurl'] = self.get_relative_url(href)
        # extract_first() returns None when the link is missing; the old
        # `href != ""` check let None through to make_url/get_thread_id.
        if href:
            threaditem['fullurl'] = self.make_url(href)
            threaditem['threadid'] = self.get_thread_id(href)
        threaditem['author_username'] = topic.css(
            "div.main > div > span a::text").extract_first("").strip()
        # Replies must be purely numeric; anything else is treated as zero.
        replies = self.get_text(
            topic.css("div.main > div > span strong:last-child"))
        if re.match(r'^\d+$', replies) is None:
            replies = 0
        threaditem['replies'] = replies
        yield threaditem
        # A data-flair attribute means the author row carries user metadata.
        flair = topic.css(
            "div.main > div > span a::attr(data-flair)").extract_first()
        if flair is not None:
            user = items.User()
            user["username"] = topic.css(
                "div.main > div > span a::text").extract_first("").strip()
            user["flair"] = flair.strip()
            user['fullurl'] = topic.css(
                "div.main > div > span a::attr(href)").extract_first("").strip()
            user["relativeurl"] = self.get_relative_url(user['fullurl'])
            yield user
def parse_board(self, response):
    """Yield Thread items plus userprofile/thread/pagination requests
    from an SMF message index page.
    """
    for threadline in response.css('#messageindex table tbody tr'):
        try:
            threaditem = items.Thread()
            threadcell = threadline.css(".subject")
            authorlink = threadcell.xpath(
                ".//p[contains(., 'Started by')]").css('a')
            threadlink = threadcell.xpath('.//span[contains(@id, "msg_")]/a')
            threaditem['author_username'] = self.get_text_first(authorlink)
            threadurl = threadlink.xpath("@href").extract_first()
            # Raw string: "\?" / "\d" are invalid escapes in a plain literal.
            m = re.search(r"\?topic=(\d+)", threadurl)
            if m:
                threaditem['threadid'] = m.group(1).strip()
            threaditem['title'] = self.get_text(threadlink)
            threaditem['relativeurl'] = threadurl
            threaditem['fullurl'] = self.make_url(threadurl)
            # Last update: cell reads "<time> by <user>".
            lastpost_str = self.get_text(threadline.css(".lastpost"))
            m = re.search("(.+) by (.+)", lastpost_str)
            if m:
                threaditem['last_update'] = self.parse_timestr(m.group(1))
            # Stats cell: "<n> Replies ... <m> Views".
            statcellcontent = self.get_text(threadline.css("td.stats"))
            m = re.search(r"(\d+) Replies [^\d]+(\d+) Views", statcellcontent)
            if m:
                threaditem['replies'] = m.group(1)
                threaditem['views'] = m.group(2)
            yield threaditem
            for userlink in threadline.xpath(
                    './/a[contains(@href, "action=profile")]'):
                u = userlink.xpath("@href").extract_first()
                yield self.make_request('userprofile', url=u, relativeurl=u)
            for threadlink in threadline.xpath(
                    './/a[contains(@href, "?topic=") and not(contains(@href, "#new"))]'
            ):
                yield self.make_request(
                    'thread',
                    url=threadlink.xpath("@href").extract_first(),
                    threadid=threaditem['threadid'])
        except Exception as e:
            self.logger.error("Cannot parse thread item : %s" % e)
            raise
    # Pagination depends only on the response: request it once per page,
    # not once per thread row as before.
    for pagelink in response.css(".pagelinks a.navPages"):
        yield self.make_request(
            'board', url=pagelink.xpath("@href").extract_first())
def parse_threadlisting(self, response):
    """Yield a Thread item for every inline row of a MyBB-style table."""
    for row in response.xpath('.//tr[@class="inline_row"]'):
        item = items.Thread()
        title_link = row.xpath('.//span[contains(@id, "tid")]/a')
        item['title'] = title_link.xpath('text()').extract_first()
        item['relativeurl'] = title_link.xpath('@href').extract_first()
        item['fullurl'] = self.make_url(item['relativeurl'])
        # The first run of digits in the relative URL is the thread id.
        item['threadid'] = re.search('([0-9]+)', item['relativeurl']).group(1)
        item['author_username'] = row.xpath(
            './/div[contains(@class, "author")]/a/text()').extract_first()
        # Strip everything that is not a digit from the counters.
        item['replies'] = re.sub(
            '[^0-9]', '', row.xpath('.//td[4]/a/text()').extract_first())
        item['views'] = re.sub(
            '[^0-9]', '', row.xpath('.//td[5]/text()').extract_first())
        # The last-post column's first text node holds the timestamp.
        stamp = row.xpath(
            './/span[contains(@class, "lastpost")]/text()[1]').extract_first()
        item['last_update'] = self.parse_datetime(stamp)
        yield item
def parse_threadlisting(self, response):
    """Yield Thread items from the listing, skipping rows marked as deleted."""
    threads = response.css("#content tr.inline_row")
    for thread in threads:
        try:
            threaditem = items.Thread()
            threadlink = thread.css("td:nth-child(3)").xpath(
                ".//span[contains(@id, 'tid_')]/a")
            # A row with no title link and a 'Deleted Thread' marker is skipped.
            deleted_thread = thread.xpath(
                './/td/em/text()').extract_first()
            if len(threadlink) < 1 and deleted_thread == 'Deleted Thread':
                self.logger.warning(
                    "A deleted thread was not collected from %s." % response.url)
            else:
                # Only assign the title for rows that are actually collected.
                threaditem['title'] = self.get_text(threadlink)
                threadurl = threadlink.xpath('@href').extract_first()
                # Cell reads "<time>Ultimo ..."; keep the leading timestamp.
                lastpost_content = self.get_text(
                    thread.css("td:last-child span.lastpost"))
                match = re.search("(.+)Ultimo", lastpost_content)
                last_post_time = self.parse_timestr(
                    match.group(1), response) if match else None
                threaditem['threadid'] = self.get_url_param(threadurl, 'tid')
                threaditem['relativeurl'] = threadurl
                threaditem['fullurl'] = self.make_url(threadurl)
                threaditem['author_username'] = self.get_text(
                    thread.css("td:nth-child(3) div.author a"))
                threaditem['last_update'] = last_post_time
                threaditem['replies'] = self.get_text(
                    thread.css("td:nth-child(4) a"))
                threaditem['views'] = self.get_text(
                    thread.css("td:nth-child(5)"))
                yield threaditem
        except Exception as e:
            # Best-effort: one unparsable row must not abort the page.
            self.logger.warning(
                "Cannot parse thread item at URL %s because %s"
                % (response.url, e))
def parse_threadlisting(self, response):
    """Yield Thread items plus thread/pagination requests from a XenForo list."""
    for line in response.css("div.discussionList li.discussionListItem"):
        threaditem = items.Thread()
        threadlink = line.css(
            "div.main h3.title a::attr(href)").extract_first()
        threadid = self.read_threadid_from_url(threadlink)
        threaditem['title'] = self.get_text(
            line.css("div.main h3.title a"))
        threaditem['author_username'] = line.xpath(
            '@data-author').extract_first()
        threaditem['replies'] = self.get_text(
            line.css("div.stats .major dd"))
        threaditem['views'] = self.get_text(
            line.css("div.stats .minor dd"))
        # last_update comes in two formats with different layout.
        short_timestring = line.xpath(
            ".//span[@class='DateTime']/text()").extract_first()
        long_timestring = line.xpath(
            ".//abbr[@class='DateTime']/text()").extract_first()
        if long_timestring is not None:
            threaditem['last_update'] = self.parse_datetime(
                long_timestring)
        elif short_timestring is not None:
            # Fixed condition: the old `long_timestring is None or ...` test
            # absorbed every remaining row, making the warning below
            # unreachable and allowing parse_datetime(None).
            threaditem['last_update'] = self.parse_datetime(
                short_timestring)
        else:
            self.logger.warning(
                "Couldn't get the correct time for the last update of post at %s."
                % response.url)
        threaditem['relativeurl'] = threadlink
        threaditem['fullurl'] = self.make_url(threadlink)
        threaditem['threadid'] = threadid
        yield threaditem
        yield self.make_request('thread', url=threadlink, threadid=threadid)
    # Pagination is per page, not per thread row.
    for link in response.css(".PageNav nav a::attr(href)").extract():
        yield self.make_request('threadlisting', url=link)
def parse_threadlisting(self, response):
    """Yield Thread items and follow-up requests; skip rows that fail to parse."""
    threaddivs = response.css("li.discussionListItem")
    oldestthread_datetime = datetime.utcnow()
    for threaddiv in threaddivs:
        try:
            threaditem = items.Thread()
            last_message_datestr = threaddiv.css(
                ".lastPostInfo .DateTime::text").extract_first()
            threaditem['last_update'] = self.to_utc(
                AlphabayDatetimeParser.tryparse(last_message_datestr))
            oldestthread_datetime = threaditem[
                'last_update']  # We assume that threads are ordered by time.
            link = threaddiv.css(".title a.PreviewTooltip")
            threadurl = link.xpath("@href").extract_first()
            threaditem['relativeurl'] = threadurl
            threaditem['fullurl'] = self.make_url(threadurl)
            threaditem['title'] = self.get_text_first(link)
            threaditem['author_username'] = self.get_text_first(
                threaddiv.css(".username"))
            threaditem['threadid'] = self.read_threadid_from_url(threadurl)
            author_url = threaddiv.css(
                ".username::attr(href)").extract_first()
            yield self.make_request('userprofile', url=author_url)
            yield self.make_request(
                'threadpage', url=threadurl,
                threadid=threaditem['threadid'])  # First page of thread
            yield threaditem  # sends data to pipeline
        except Exception as e:
            # Fixed: exceptions have no `.message` attribute on Python 3,
            # so the old handler itself raised AttributeError.
            self.logger.error(
                "Failed parsing response for threadlisting at %s. Error is %s.\n Skipping thread\n %s"
                % (response.url, e, traceback.format_exc()))
            continue
    # Parse next page.
    for link in response.css("div.PageNav nav a::attr(href)").extract():
        yield self.make_request(reqtype='threadlisting', url=link)
def parse_threadlisting(self, response):
    """Yield Thread items from a punBB forum view, one per visible row."""
    for line in response.css('#punviewforum tbody tr:not([class*="inone"])'):
        threaditem = items.Thread()
        last_post_time = self.parse_timestr(
            self.get_text(line.css("td:last-child a")))
        # First link of the first cell, or None when the row has none.
        threadlinkobj = next(iter(line.css("td:first-child a") or []), None)
        if threadlinkobj:
            # threadlinkobj is known truthy here; no conditional needed.
            threadlinkhref = threadlinkobj.xpath("@href").extract_first()
            threaditem['title'] = self.get_text(threadlinkobj)
            threaditem['relativeurl'] = threadlinkhref
            threaditem['fullurl'] = self.make_url(threadlinkhref)
            threaditem['threadid'] = self.get_url_param(
                threaditem['fullurl'], 'id')
            # Author cell reads "by <name>".
            byuser = self.get_text(line.css("td:first-child span.byuser"))
            m = re.match("by (.+)", byuser)
            if m:
                threaditem['author_username'] = m.group(1)
            threaditem['last_update'] = last_post_time
            threaditem['replies'] = self.get_text(line.css("td:nth-child(2)"))
            threaditem['views'] = self.get_text(line.css("td:nth-child(3)"))
            yield threaditem
def parse_threadlisting(self, response):
    """Yield Thread items from an SMF message index table."""
    for threadline in response.css('#messageindex table tbody tr'):
        try:
            threaditem = items.Thread()
            threadcell = threadline.css(".subject")
            authorlink = threadcell.xpath(
                ".//p[contains(., 'Started by')]").css('a')
            threadlink = threadcell.xpath('.//span[contains(@id, "msg_")]/a')
            threaditem['author_username'] = self.get_text_first(authorlink)
            threadurl = threadlink.xpath("@href").extract_first()
            # Raw string: "\?" / "\d" are invalid escapes in a plain literal.
            m = re.search(r"\?topic=(\d+)", threadurl)
            if m:
                threaditem['threadid'] = m.group(1).strip()
            threaditem['title'] = self.get_text(threadlink)
            threaditem['relativeurl'] = threadurl
            threaditem['fullurl'] = self.make_url(threadurl)
            # Last update: the text node right after the last-post link.
            lastpost_str = threadline.xpath(
                'td[contains(@class, "lastpost")]/a/following-sibling::text()'
            ).extract_first()
            if lastpost_str:
                if "N/A" not in lastpost_str:
                    threaditem['last_update'] = self.parse_timestr(
                        lastpost_str.strip())
            # Stats cell: first text node holds replies, second holds views.
            statcellcontent = threadline.xpath(
                'td[contains(@class, "stats")]//text()').extract()
            m1 = re.search(r"(\d+) Replies", statcellcontent[0])
            if m1:
                threaditem['replies'] = m1.group(1)
            m2 = re.search(r"[^\d]+(\d+) Views", statcellcontent[1])
            if m2:
                threaditem['views'] = m2.group(1)
            yield threaditem
        except Exception as e:
            self.logger.error("Cannot parse thread item : %s" % e)
            raise
def parse_thread_listing(self, response):
    """Collect Thread items from a 4-column forum table and queue follow-ups."""
    for row in response.css('.table.forum > tbody > tr'):
        try:
            columns = row.css('td')
            # Only real thread rows have exactly four cells and a title link.
            if len(columns) != 4:
                continue
            href = columns[1].css('h4 div a::attr(href)').extract_first()
            if not href:
                continue
            thread = items.Thread()
            thread['title'] = columns[1].css('h4 div a::text').extract_first()
            thread['relativeurl'] = href
            thread['fullurl'] = self.make_url(href)
            thread['threadid'] = self.get_id_from_url(href)
            author_sel = columns[1].css('h4 div small a')
            if author_sel:
                thread['author_username'] = author_sel.css(
                    '::text').extract_first().strip()
            else:
                # No author link: fall back to the "... ago by <name>" text.
                fallback = ''.join(
                    columns[1].xpath('.//h4/div/small//text()').extract())
                if fallback:
                    found = re.search(" ago by (.+)", fallback)
                    if found:
                        thread['author_username'] = found.group(1).strip()
            # Cannot get last update time exactly, that's because the update
            # time doesn't follow time format, it's something like "XX days ago".
            relative_stamp = columns[3].css('small::text').extract()[-1]
            thread['last_update'] = self.parse_datetime(relative_stamp).date()
            thread['replies'] = columns[2].css('::text').extract_first()
            yield thread
            yield self.make_request('thread', url=href, shared=True)
        except Exception as ex:
            self.logger.warning(
                "Error in retrieving theads. %s at URL %s" % (ex, response.url))
    for link in response.css("a.paginate[rel='next']::attr(href)").extract():
        yield self.make_request('threadlisting', url=link, shared=True)
def parse_thread_listing(self, response):
    """Yield one Thread item per topic of a big-list page."""
    topics = response.css('ul.row.big-list.zebra > li')
    for topic in topics:
        threaditem = items.Thread()
        threaditem['title'] = self.get_text(topic.css("div.main > div > a"))
        href = topic.css("div.main > div > a::attr(href)").extract_first()
        threaditem['relativeurl'] = href
        # extract_first() returns None when the link is absent; deriving the
        # full URL or thread id from None would raise, so guard it.
        if href:
            threaditem['fullurl'] = self.make_url(href)
            threaditem['threadid'] = self.get_thread_id(href)
        threaditem['author_username'] = topic.css(
            "div.main > div > span a::text").extract_first()
        # Replies must be purely numeric; anything else is treated as zero.
        replies = self.get_text(
            topic.css("div.main > div > span strong:last-child"))
        if re.match(r'^\d+$', replies) is None:
            replies = 0
        threaditem['replies'] = replies
        yield threaditem
def parse_threadlisting(self, response):
    """Yield one Thread item per inline row of the wrapper table."""
    for row in response.css("div.wrapper table tr.inline_row"):
        item = items.Thread()
        item['title'] = self.get_text(row.xpath("td[3]/div/span/span/a"))
        # Rows without a title link are not threads; skip them.
        if item['title'] == "":
            continue
        item['replies'] = self.get_text(row.css("td:nth-child(4)"))
        item['views'] = self.get_text(row.css("td:nth-child(5)"))
        item['relativeurl'] = row.xpath(
            "td[3]/div/span/span/a/@href").extract_first()
        item['fullurl'] = self.make_url(item['relativeurl'])
        # The last-post cell holds both timestamp and author:
        # "<time> last post: <user>".
        lastpost = self.get_text(row.css("td:nth-child(6) span.lastpost"))
        flags = re.M | re.I | re.S
        try:
            item['last_update'] = self.parse_datetime(
                re.search("(.*)last ", lastpost, flags).group(1).strip())
        except Exception as e:
            self.logger.warning("last_update %s error %s" % (response.url, e))
        try:
            item['author_username'] = re.search(
                "post:(.*)", lastpost, flags).group(1).strip()
        except Exception as e:
            self.logger.warning("author_username %s error value %s"
                                % (response.url, e))
        item['threadid'] = self.get_url_param(item['fullurl'], 'tid')
        yield item
def parse_threadlisting(self, response):
    """Yield Thread items from a phpBB topic list."""
    for line in response.xpath('//ul[@class="topiclist topics"]/li'):
        threaditem = items.Thread()
        # Extract the title once instead of running the same XPath twice
        # (the old code computed `title` and then re-extracted it).
        title = line.xpath(
            './/a[@class="topictitle"]/text()').extract_first()
        last_post_time = self.parse_timestr(
            line.xpath(
                './/a[@title="Go to last post"]/text()').extract_first())
        threaditem['title'] = title
        threaditem['relativeurl'] = line.xpath(
            './/a[@class="topictitle"]/@href').extract_first()
        threaditem['fullurl'] = self.make_url(threaditem['relativeurl'])
        # phpBB thread ids live in the "t" query parameter.
        threaditem['threadid'] = threaditem['relativeurl'].split('&t=')[-1]
        threaditem['author_username'] = line.xpath(
            './/a[contains(@class, "username")]/text()').extract_first()
        threaditem['last_update'] = last_post_time
        threaditem['replies'] = line.xpath(
            './/dd[@class="posts"]/text()').extract_first().strip()
        threaditem['views'] = line.xpath(
            './/dd[@class="views"]/text()').extract_first().strip()
        yield threaditem
def parse_threadlisting(self, response):
    """Yield Thread items from a phpBB3 topic list; unparsable rows are logged."""
    for row in response.css("ul.topiclist.topics li.row"):
        try:
            title_link = row.css("dt div.list-inner > a")
            item = items.Thread()
            item['title'] = self.get_text(title_link)
            item['relativeurl'] = title_link.xpath('@href').extract_first()
            item['fullurl'] = self.make_url(item['relativeurl'])
            # phpBB3 thread ids live in the "t" query parameter.
            item['threadid'] = self.get_url_param(item['fullurl'], 't')
            item['author_username'] = row.css(
                'div.topic-poster a::text').extract_first()
            item['replies'] = row.css(
                'dd.posts *::text').extract_first().strip()
            item['views'] = row.css(
                'dd.views *::text').extract_first().strip()
            yield item
        except Exception as e:
            self.logger.warning("Invalid thread listing page. %s" % e)