def parse_pm_listing(conn, html):
    """
    Given a connection and some html containing a PM inbox's contents,
    return a list of dicts, each containing attributes of a thread in the inbox.
    """
    threads = []
    soup = bs4.BeautifulSoup(html)
    pmTable = soup.find('table', {'class': 'grid'})
    # first row is header row.
    for row in pmTable.find_all('tr')[1:]:
        rowSections = row.find_all('td')
        pmProps = {'read': False, 'unreadCount': 0}
        if rowSections[0].find('b') is None:
            pmProps['read'] = True
        pmProps['subject'] = rowSections[0].find('a').text
        pmProps['id'] = int(
            albatross.getEnclosedString(rowSections[0].find('a').get('href'),
                                        'thread=', ''))
        user_id = int(
            albatross.getEnclosedString(rowSections[1].find('a').get('href'),
                                        'user=', ''))
        user_name = rowSections[1].find('a').text
        pmProps['user'] = conn.user(user_id, name=user_name)
        unreadNode = rowSections[2].find('a')
        if unreadNode is not None:
            pmProps['unreadCount'] = int(unreadNode.text[1:])
            unreadNode.extract()
        pmProps['pmCount'] = int(rowSections[2].text.replace(' ()', ''))
        pmProps['lastPMTime'] = pytz.timezone('America/Chicago').localize(
            datetime.datetime.strptime(rowSections[3].text, "%m/%d/%Y %H:%M"))
        threads.append(pmProps)
    return threads
def testGetNormalEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r'">', r'<a') == "This is a test"
    assert albatross.getEnclosedString(
        self.testString,
        r"""<a href="test.php\?id=[0-9]+&topic=[0-9]+">""",
        r"</a>"
    ) == u"""This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed"""
def parse_pm(conn, html):
    """
    Given a connection and some html containing a PM's contents,
    return a dict, containing attributes of the PM.
    """
    soup = bs4.BeautifulSoup(html)
    pmInfo = soup.find('div', {'class': 'message-top'})
    user_id = int(albatross.getEnclosedString(pmInfo.find('a').get('href'), 'user=', ''))
    user_name = pmInfo.find('a').text
    posted_elt = pmInfo.find('b', text='Posted:')
    posted_date = pytz.timezone('America/Chicago').localize(
        datetime.datetime.strptime(posted_elt.next_sibling.strip(), "%m/%d/%Y %I:%M:%S %p |"))
    quote_elt = pmInfo.find('a', text='Quote')
    pm_id = int(albatross.getEnclosedString(quote_elt.get('href'), 'quote=', ''))
    pm_thread_id = int(albatross.getEnclosedString(quote_elt.get('href'), 'pm=', '&quote='))
    pmContents = soup.find('table', {'class': 'message-body'}).find('td', {'class': 'message'}).contents
    separators = [i for i, j in enumerate(pmContents) if j == u'\n---']
    if separators:
        lastSeparator = separators[-1]
        pm_html = ''.join(unicode(x) for x in pmContents[:lastSeparator])
        if lastSeparator + 2 > len(pmContents):
            pm_sig = ''.join(unicode(x) for x in pmContents[lastSeparator+1:])
        else:
            pm_sig = ''.join(unicode(x) for x in pmContents[lastSeparator+2:])
    else:
        pm_html = ''.join(unicode(x) for x in pmContents)
        pm_sig = ''
        lastSeparator = len(separators)
    return {
        'id': pm_id,
        'user': conn.user(user_id, name=user_name),
        'date': posted_date,
        'thread': conn.pmThread(pm_thread_id),
        'html': pm_html,
        'sig': pm_sig
    }
def parse_pm_listing(conn, html):
    """
    Given a connection and some html containing a PM inbox's contents,
    return a list of dicts, each containing attributes of a thread in the inbox.
    """
    threads = []
    soup = bs4.BeautifulSoup(html)
    pmTable = soup.find('table', {'class': 'grid'})
    # first row is header row.
    for row in pmTable.find_all('tr')[1:]:
        rowSections = row.find_all('td')
        pmProps = {'read': False, 'unreadCount': 0}
        if rowSections[0].find('b') is None:
            pmProps['read'] = True
        pmProps['subject'] = rowSections[0].find('a').text
        pmProps['id'] = int(albatross.getEnclosedString(rowSections[0].find('a').get('href'), 'thread=', ''))
        user_id = int(albatross.getEnclosedString(rowSections[1].find('a').get('href'), 'user=', ''))
        user_name = rowSections[1].find('a').text
        pmProps['user'] = conn.user(user_id, name=user_name)
        unreadNode = rowSections[2].find('a')
        if unreadNode is not None:
            pmProps['unreadCount'] = int(unreadNode.text[1:])
            unreadNode.extract()
        pmProps['pmCount'] = int(rowSections[2].text.replace(' ()', ''))
        pmProps['lastPMTime'] = pytz.timezone('America/Chicago').localize(
            datetime.datetime.strptime(rowSections[3].text, "%m/%d/%Y %H:%M"))
        threads.append(pmProps)
    return threads
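# Hypothetical usage sketch for parse_pm_listing above. The inbox URL and the
# conn.page(...) helper are assumptions based on how the connection object is used
# elsewhere in this code, not a confirmed API:
#
#   inbox_html = conn.page('https://endoftheinter.net/inbox.php').html
#   for thread in parse_pm_listing(conn, inbox_html):
#       print(thread['id'], thread['subject'], thread['pmCount'],
#             thread['unreadCount'], thread['lastPMTime'])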
def parse_pm(conn, html):
    """
    Given a connection and some html containing a PM's contents,
    return a dict, containing attributes of the PM.
    """
    soup = bs4.BeautifulSoup(html)
    pmInfo = soup.find('div', {'class': 'message-top'})
    user_id = int(albatross.getEnclosedString(pmInfo.find('a').get('href'), 'user=', ''))
    user_name = pmInfo.find('a').text
    posted_elt = pmInfo.find('b', text='Posted:')
    posted_date = pytz.timezone('America/Chicago').localize(
        datetime.datetime.strptime(posted_elt.next_sibling.strip(), "%m/%d/%Y %I:%M:%S %p |"))
    quote_elt = pmInfo.find('a', text='Quote')
    pm_id = int(albatross.getEnclosedString(quote_elt.get('href'), 'quote=', ''))
    pm_thread_id = int(albatross.getEnclosedString(quote_elt.get('href'), 'pm=', '&quote='))
    pmContents = soup.find('table', {'class': 'message-body'}).find('td', {'class': 'message'}).contents
    separators = [i for i, j in enumerate(pmContents) if j == u'\n---']
    if separators:
        lastSeparator = separators[-1]
        pm_html = ''.join(str(x) for x in pmContents[:lastSeparator])
        if lastSeparator + 2 > len(pmContents):
            pm_sig = ''.join(str(x) for x in pmContents[lastSeparator+1:])
        else:
            pm_sig = ''.join(str(x) for x in pmContents[lastSeparator+2:])
    else:
        pm_html = ''.join(str(x) for x in pmContents)
        pm_sig = ''
        lastSeparator = len(separators)
    return {
        'id': pm_id,
        'user': conn.user(user_id, name=user_name),
        'date': posted_date,
        'thread': conn.pmThread(pm_thread_id),
        'html': pm_html,
        'sig': pm_sig
    }
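# Minimal illustration (input shape assumed) of the signature split in parse_pm above:
# the td.message contents are a list of nodes, and the last u'\n---' text node marks
# where the signature starts; the node immediately after it (typically a <br/>) is skipped.
#
#   contents = ['Hello there.', u'\n---', <br/>, 'my sig']
#   -> pm_html == 'Hello there.'   (everything before the last separator)
#   -> pm_sig  == 'my sig'         (everything after the separator and the <br/>)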
def __init__(self, conn, tags=None, active=False):
    super(TagList, self).__init__(conn)
    if tags is None:
        tags = []
    self._tagNames = dict(zip(tags, [1] * len(tags)))
    self._tags = None
    if active:
        parser = HTMLParser.HTMLParser()
        mainPage = page.Page(self.connection, "https://endoftheinter.net/main.php")
        tagLinksHTML = albatross.getEnclosedString(mainPage.html, r'<div style="font-size: 14px">', r'</div>', multiLine=True)
        tagLinks = tagLinksHTML.split(' &bull; ')
        for text in tagLinks:
            self._tagNames[parser.unescape(albatross.getEnclosedString(text, '">', '</a>')).strip()] = 1
    self.load()
def parse(self, text):
    """
    Given some HTML containing a post, return a dict of attributes.
    """
    parser = HTMLParser.HTMLParser()
    timeString = albatross.getEnclosedString(text, r'<b>Posted:</b> ', r' \| ', greedy=False)
    altTimeString = albatross.getEnclosedString(text, r'<b>Posted:</b> ', r'</div>', greedy=False)
    timeString = timeString if timeString and len(timeString) < len(altTimeString) else altTimeString
    user = self.connection.user(int(albatross.getEnclosedString(text, r'<b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=', r'">'))).set({'name': parser.unescape(True and albatross.getEnclosedString(text, r'<b>From:</b>\ <a href="//endoftheinter\.net/profile\.php\?user=\d+">', r'</a>') or u'Human')})
    attrs = {
        'id': int(albatross.getEnclosedString(text, r'<div class="message-container" id="m', r'">')),
        'user': user,
        'date': pytz.timezone('America/Chicago').localize(datetime.datetime.strptime(timeString, "%m/%d/%Y %I:%M:%S %p")),
        'html': albatross.getEnclosedString(text, r' class="message">', '(\n)?---<br />(\n)?', multiLine=True, greedy=True),
        'sig': albatross.getEnclosedString(text, '(\n)?---<br />(\n)?', r'</td>', multiLine=True, greedy=False)
    }
    if attrs['html'] is False:
        # sigless and on message detail page.
        attrs['html'] = albatross.getEnclosedString(text, r' class="message">', r'</td>', multiLine=True, greedy=False)
        attrs['sig'] = u""
    if attrs['html'] is False:
        # sigless and on topic listing.
        attrs['html'] = albatross.getEnclosedString(text, r' class="message">', r'', multiLine=True, greedy=True)
    if attrs['html'] is False:
        raise MalformedPostError(self, self.topic, unicode(text))
    attrs['html'] = attrs['html'].rstrip("\n")
    if attrs['sig'] is not False:
        attrs['sig'] = attrs['sig'].rstrip("\n")
    return attrs
def testGetGreedyEnclosedString(self):
    assert albatross.getEnclosedString(
        self.testString,
        r"""<a href="test.php\?id=[0-9]+&topic=[0-9]+">""",
        r"</a>",
        greedy=True
    ) == u"""This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed</a>String"""
def search(self, query="", maxTime=None, maxID=None, activeSince=None, topics=None, recurse=False): """ Searches for topics using given parameters, and returns a list of dicts of returned topics. By default, recursively iterates through every page of search results. Upon failure returns False. """ if topics is None: self._topics = [] # if allowedTags or forbiddenTags is provided, it overrides this topiclist object's personal allowed or forbidden tags. if maxID is None: maxID = "" if activeSince is None: activeSince = pytz.timezone('America/Chicago').localize(datetime.datetime(1970, 1, 1)) else: # the topic listing only provides minute-level resolution, so remove seconds and microseconds from activeSince. activeSince = activeSince - datetime.timedelta(0, activeSince.second, activeSince.microsecond) while not maxTime or maxTime > activeSince: # assemble the search query and request this search page's topic listing. requestArgs = { 'q': unicode(query).encode('utf-8') } if maxTime is not None: if isinstance(maxTime, datetime.datetime): maxTime = calendar.timegm(maxTime.utctimetuple()) requestArgs['ts'] = unicode(maxTime).encode('utf-8') if maxID is not None: requestArgs['t'] = unicode(maxID).encode('utf-8') searchQuery = urllib.urlencode(requestArgs) url = 'https://boards.endoftheinter.net/topics/' + self.formatTagQueryString() + '?' + searchQuery topicPageHTML = self.connection.page(url).html # split the topic listing string into a list so that one topic is in each element. topicListingHTML = albatross.getEnclosedString(topicPageHTML, '<th>Last Post</th></tr>', '</tr></table>', multiLine=True) if not topicListingHTML: # No topic listing table. this means there are no topics that matched the search. break topicListingHTML = topicListingHTML.split('</tr>') if topicListingHTML else [] originalTopicsNum = len(self._topics) for topic in topicListingHTML: topicInfo = self.parse(topic) if topicInfo and topicInfo['lastPostTime'] >= activeSince: self._topics.append(self.connection.topic(topicInfo['id']).set(topicInfo)) if len(self._topics) == originalTopicsNum: # No matching topics; end our search. break if not recurse: break # we can't parallelize this, since we have no way of predicting the next ts and t parameters. DAMN YOU KEYSET PAGING maxTime = self._topics[-1].lastPostTime maxID = self._topics[-1].id self._topics = sorted(self._topics, key=lambda topic: topic.lastPostTime, reverse=True) return self
def __init__(self, conn, tags=None, active=False):
    super(TagList, self).__init__(conn)
    if tags is None:
        tags = []
    self._tagNames = dict(zip(tags, [1] * len(tags)))
    self._tags = None
    if active:
        parser = HTMLParser()
        mainPage = page.Page(self.connection, "https://endoftheinter.net/main.php")
        tagLinksHTML = albatross.getEnclosedString(
            mainPage.html, r'<div id="active-tags" style="font-size: 14px">', r'</div>',
            multiLine=True)
        tagLinks = tagLinksHTML.split(' &bull; ')
        for text in tagLinks:
            self._tagNames[parser.unescape(
                albatross.getEnclosedString(text, '">', '</a>')).strip()] = 1
    self.load()
def parse(self, html):
    """
    Given the HTML of a topic page, returns a dict of attributes.
    """
    attrs = {}
    parser = HTMLParser.HTMLParser()
    soup = bs4.BeautifulSoup(html)
    attrs['archived'] = bool(re.search(r'<h2><em>This topic has been archived\. No additional messages may be posted\.</em></h2>', html))
    subdomain = "archives" if attrs['archived'] else "boards"
    attrs['title'] = parser.unescape(albatross.getEnclosedString(html, r'\<h1\>', r'\<\/h1\>'))
    attrs['date'] = pytz.timezone('America/Chicago').localize(datetime.datetime.strptime(albatross.getEnclosedString(html, r'<b>Posted:</b> ', r' \| '), "%m/%d/%Y %I:%M:%S %p"))
    userID = int(albatross.getEnclosedString(html, r'<div class="message-top"><b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=', r'">'))
    username = parser.unescape(True and albatross.getEnclosedString(html, r'<div class="message-top"><b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=\d+">', r'</a>') or 'Human')
    attrs['user'] = self.connection.user(userID).set({'name': username})
    attrs['pages'] = int(albatross.getEnclosedString(html, r'">(First Page</a> \| )?(<a href)?(\S+)?(Previous Page</a> \| )?Page \d+ of <span>', r'</span>'))
    attrs['closed'] = attrs['archived']
    tagNames = [
        urllib2.unquote(albatross.getEnclosedString(tagEntry, '<a href="/topics/', r'">'))
        for tagEntry in albatross.getEnclosedString(html, r"<h2><div", r"</div></h2>").split(r"</a>")[:-1]
        if not tagEntry.startswith(' <span')
    ]
    # we need to process tag names
    # e.g. remove enclosing square braces and decode html entities.
    cleanedTagNames = []
    for tagName in tagNames:
        if tagName.startswith("[") and tagName.endswith("]"):
            tagName = tagName[1:-1]
        cleanedTagNames.append(parser.unescape(tagName.replace("_", " ")))
    attrs['tags'] = self.connection.tags(tags=cleanedTagNames)
    lastPage = self.connection.page('https://' + subdomain + '.endoftheinter.net/showmessages.php?topic=' + unicode(self.id) + '&page=' + unicode(attrs['pages']))
    if lastPage.authed:
        lastPagePosts = self.getPagePosts(lastPage.html)
        lastPost = self.connection.post(1, self)
        lastPost = lastPost.set(lastPost.parse(lastPagePosts[-1]))
        attrs['lastPostTime'] = lastPost.date
    csrfTag = soup.find("input", {"name": "h"})
    if csrfTag:
        attrs['csrfKey'] = csrfTag.get('value')
    return attrs
def search(self, query="", maxID=None, activeSince=None, createdSince=None, startPageNum=None, endPageNum=None, recurse=False): """ Searches for users using given parameters, and returns the current user listing object Performs operation in parallel. """ self._users = [] self._userIDs = {} maxID = float("inf") if maxID is None else int(maxID) activeSince = pytz.timezone('America/Chicago').localize(datetime.datetime(1970, 1, 1)) if activeSince is None else activeSince createdSince = pytz.timezone('America/Chicago').localize(datetime.datetime(1970, 1, 1)) if createdSince is None else createdSince startPageNum = 1 if startPageNum is None else int(startPageNum) paramArray = {'maxID': maxID, 'activeSince': activeSince, 'createdSince': createdSince} if endPageNum is None or not recurse: # fetch first page to grab number of pages, and grab users while we're at it. userListParams = urllib.urlencode([('user', unicode(query)), ('page', str(startPageNum))]) firstUrl = 'https://endoftheinter.net/userlist.php?' + userListParams firstUserPage = self.connection.page(firstUrl) self.appendUsers(firstUserPage.html, firstUrl, None, paramArray) endPageNum = int(albatross.getEnclosedString(firstUserPage.html, r'Page ' + str(startPageNum) + r' of <span>', r'</span>')) # increment start page num. startPageNum += 1 else: endPageNum = int(endPageNum) if not recurse: return self # now loop over all the other pages (if there are any) for pageNum in range(startPageNum, endPageNum+1): userListParams = urllib.urlencode([('user', unicode(query)), ('page', str(pageNum))]) self.connection.parallelCurl.startrequest('https://endoftheinter.net/userlist.php?' + userListParams, self.appendUsers, paramArray) self.connection.parallelCurl.finishallrequests() self._users = sorted(self._users, key=lambda userObject: userObject.id) return self
def testGetBeginningEndEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, "<span>", "") == u"</span>"
    assert albatross.getEnclosedString(self.testString, "", "</a>") == u"""<a href="test.php?id=58&topic=23">This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed"""
def parse(self, text):
    """
    Given some JSON containing a tag or list of tags,
    return a dict of attributes for the current tag.
    """
    text = text[1:]
    try:
        tagJSON = json.loads(text)
    except ValueError:
        raise MalformedTagError(self, unicode(text))
    if len(tagJSON) < 1:
        raise MalformedTagError(self, unicode(tagJSON))
    # match only the tag in this JSON that has this tag's name, if it's set.
    if self.name:
        tagJSON = filter(lambda x: x[0] == self.name if not x[0].startswith("[") else x[0][1:-1] == self.name, tagJSON)
        if not tagJSON:
            raise InvalidTagError(self)
    tagJSON = tagJSON[0]
    name = tagJSON[0]
    if name.startswith("["):
        name = name[1:]
    if name.endswith("]"):
        name = name[:-1]
    tag = {'name': name}
    tag['staff'] = []
    moderatorText = albatross.getEnclosedString(tagJSON[1][0], r"<b>Moderators: </b>", r"<br /><b>Administrators:")
    if moderatorText:
        descriptionEndTag = "<br /><b>Moderators:"
        moderatorTags = moderatorText.split(", ")
        for moderator in moderatorTags:
            user = self.connection.user(int(albatross.getEnclosedString(moderator, r"\?user=", r'">'))).set({'name': albatross.getEnclosedString(moderator, r'">', r"</a>")})
            tag['staff'].append({'user': user, 'role': 'moderator'})
    else:
        descriptionEndTag = "<br /><b>Administrators:"
    administratorText = albatross.getEnclosedString(tagJSON[1][0], startString="<br /><b>Administrators: </b>", greedy=True)
    if administratorText:
        administratorTags = administratorText.split(", ")
        for administrator in administratorTags:
            user = self.connection.user(int(albatross.getEnclosedString(administrator, r"\?user=", r'">'))).set({'name': albatross.getEnclosedString(administrator, r'">', r"</a>")})
            tag['staff'].append({'user': user, 'role': 'administrator'})
    parser = HTMLParser.HTMLParser()
    descriptionText = albatross.getEnclosedString(tagJSON[1][0], r":</b> ", descriptionEndTag)
    if descriptionText:
        tag['description'] = parser.unescape(descriptionText)
    else:
        tag['description'] = ''
    tagInteractions = tagJSON[1][1]
    tag['related'] = tag['forbidden'] = tag['dependent'] = []
    if len(tagInteractions) > 0:
        if '0' in tagInteractions:
            tag['forbidden'] = [Tag(self.connection, key) for key in tagInteractions['0'].keys()]
        if '1' in tagInteractions:
            tag['dependent'] = [Tag(self.connection, key) for key in tagInteractions['1'].keys()]
        if '2' in tagInteractions:
            tag['related'] = [Tag(self.connection, key) for key in tagInteractions['2'].keys()]
    return tag
def testGetBeginningEndEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, "<span>", "") == u"</span>"
    assert albatross.getEnclosedString(
        self.testString, "", "</a>"
    ) == u"""<a href="test.php?id=58&topic=23">This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed"""
def parse(self, text):
    """
    Given some HTML containing a post, return a dict of attributes.
    """
    parser = HTMLParser()
    timeString = albatross.getEnclosedString(text, r'<b>Posted:</b> ', r' \| ', greedy=False)
    altTimeString = albatross.getEnclosedString(text, r'<b>Posted:</b> ', r'</div>', greedy=False)
    timeString = timeString if timeString and len(timeString) < len(altTimeString) else altTimeString
    user = self.connection.user(
        int(albatross.getEnclosedString(text, r'<b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=', r'">'))
    ).set({
        'name': parser.unescape(True and albatross.getEnclosedString(
            text, r'<b>From:</b>\ <a href="//endoftheinter\.net/profile\.php\?user=\d+">', r'</a>') or u'Human')
    })
    attrs = {
        'id': int(albatross.getEnclosedString(text, r'<div class="message-container" id="m', r'">')),
        'user': user,
        'date': pytz.timezone('America/Chicago').localize(datetime.datetime.strptime(timeString, "%m/%d/%Y %I:%M:%S %p")),
        'html': albatross.getEnclosedString(text, r' class="message">', '(\n)?---<br />(\n)?', multiLine=True, greedy=True),
        'sig': albatross.getEnclosedString(text, '(\n)?---<br />(\n)?', r'</td>', multiLine=True, greedy=False),
        'replies': 0
    }
    replies = albatross.getEnclosedString(text, r'amp;thread={}">Replies \('.format(attrs['id']), r'\)</a>', greedy=False)
    if replies is not False:
        attrs['replies'] = int(replies)
    if attrs['html'] is False:
        # sigless and on message detail page.
        attrs['html'] = albatross.getEnclosedString(text, r' class="message">', r'</td>', multiLine=True, greedy=False)
        attrs['sig'] = u""
    if attrs['html'] is False:
        # sigless and on topic listing.
        attrs['html'] = albatross.getEnclosedString(text, r' class="message">', r'', multiLine=True, greedy=True)
    if attrs['html'] is False:
        raise MalformedPostError(self, self.topic, str(text))
    attrs['html'] = attrs['html'].rstrip("\n")
    if attrs['sig'] is not False:
        attrs['sig'] = attrs['sig'].rstrip("\n")
    return attrs
def parse(self, text):
    """
    Given some JSON containing a tag or list of tags,
    return a dict of attributes for the current tag.
    """
    text = text[1:]
    try:
        tagJSON = json.loads(text)
    except ValueError:
        raise MalformedTagError(self, str(text))
    if len(tagJSON) < 1:
        raise MalformedTagError(self, str(tagJSON))
    # match only the tag in this JSON that has this tag's name, if it's set.
    if self.name:
        tagJSON = list(filter(
            lambda x: x[0] == self.name if not x[0].startswith("[") else x[0][1:-1] == self.name,
            tagJSON))
        if not tagJSON:
            raise InvalidTagError(self)
    tagJSON = tagJSON[0]
    name = tagJSON[0]
    if name.startswith("["):
        name = name[1:]
    if name.endswith("]"):
        name = name[:-1]
    tag = {'name': name}
    tag['staff'] = []
    moderatorText = albatross.getEnclosedString(tagJSON[1][0], r"<b>Moderators: </b>", r"<br /><b>Administrators:")
    if moderatorText:
        descriptionEndTag = "<br /><b>Moderators:"
        moderatorTags = moderatorText.split(", ")
        for moderator in moderatorTags:
            user = self.connection.user(
                int(albatross.getEnclosedString(moderator, r"\?user=", r'">'))
            ).set({'name': albatross.getEnclosedString(moderator, r'">', r"</a>")})
            tag['staff'].append({'user': user, 'role': 'moderator'})
    else:
        descriptionEndTag = "<br /><b>Administrators:"
    administratorText = albatross.getEnclosedString(tagJSON[1][0], startString="<br /><b>Administrators: </b>", greedy=True)
    if administratorText:
        administratorTags = administratorText.split(", ")
        for administrator in administratorTags:
            user = self.connection.user(
                int(albatross.getEnclosedString(administrator, r"\?user=", r'">'))
            ).set({'name': albatross.getEnclosedString(administrator, r'">', r"</a>")})
            tag['staff'].append({'user': user, 'role': 'administrator'})
    parser = HTMLParser()
    descriptionText = albatross.getEnclosedString(tagJSON[1][0], r":</b> ", descriptionEndTag)
    if descriptionText:
        tag['description'] = parser.unescape(descriptionText)
    else:
        tag['description'] = ''
    tagInteractions = tagJSON[1][1]
    tag['related'] = tag['forbidden'] = tag['dependent'] = []
    if len(tagInteractions) > 0:
        if '0' in tagInteractions:
            tag['forbidden'] = [Tag(self.connection, key) for key in tagInteractions['0'].keys()]
        if '1' in tagInteractions:
            tag['dependent'] = [Tag(self.connection, key) for key in tagInteractions['1'].keys()]
        if '2' in tagInteractions:
            tag['related'] = [Tag(self.connection, key) for key in tagInteractions['2'].keys()]
    return tag
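# Illustrative (assumed) shape of the tag JSON consumed by parse() above, inferred only
# from the indexing it performs: tagJSON[0] is the (possibly bracketed) name,
# tagJSON[1][0] is an HTML blob with staff and description, and tagJSON[1][1] maps
# interaction type ('0' forbidden, '1' dependent, '2' related) to tag-name dicts.
# The actual ETI response may differ; names below are placeholders.
#
#   [
#     ["[SomeTag]",
#      ["<b>Moderators: </b><a href=\"...?user=1\">someMod</a><br /><b>Administrators: </b>...",
#       {"0": {"forbiddenTag": 1}, "1": {"dependentTag": 1}, "2": {"relatedTag": 1}}]]
#   ]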
def testGetNonexistentEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r"<span>", "this ending doesn't exist") == False
    assert albatross.getEnclosedString(self.testString, r"This beginning doesn't exist", "</span>") == False
def testGetMultilineEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r"""<span class='secondLine'>""", r"</span>", multiLine=True) == u"""This is the second line of the test string.
def testGetEmptyEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, "<span>", "</span>") == u""
def testGetMultilineEnclosedString(self):
    assert albatross.getEnclosedString(
        self.testString, r"""<span class='secondLine'>""", r"</span>",
        multiLine=True) == u"""This is the second line of the test string.
def parse(self, html):
    """
    Parses a user's profile page.
    Returns a dict of attributes.
    """
    attrs = {}
    parser = HTMLParser.HTMLParser()
    centralTime = pytz.timezone("America/Chicago")
    attrs['id'] = int(albatross.getEnclosedString(html, "<td>User ID</td>\s+<td>", r"</td>"))
    attrs['name'] = parser.unescape(albatross.getEnclosedString(html, r'<th colspan="2">Current Information for ', r'</th>'))
    try:
        attrs['level'] = int(albatross.getEnclosedString(html, r"""<td><a href="//endoftheinter\.net/profile\.php\?user=""" + str(attrs['id']) + """\">""" + re.escape(xml.sax.saxutils.escape(attrs['name'])) + """</a> \(""", r'\)'))
    except ValueError:
        # User has a non-integral level.
        attrs['level'] = 0
    matchStatus = albatross.getEnclosedString(html, "<td>Status</td>\s+<td>", r"</td>")
    attrs['banned'] = False
    attrs['suspended'] = False
    if matchStatus:
        if matchStatus == '<b>Banned</b>':
            attrs['banned'] = True
        else:
            attrs['suspended'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(matchStatus, '<b>Suspended until:</b> ', ''), "%m/%d/%Y %I:%M:%S %p"))
    attrs['formerly'] = None
    nameChanged = albatross.getEnclosedString(html, "<td>Formerly</td>\s+<td>", "</td>")
    if nameChanged:
        attrs['formerly'] = nameChanged.split(", ")
    attrs['reputation'] = {}
    reputationText = albatross.getEnclosedString(html, r'<td>Reputation</td><td style="line-height:1.6em">', r'</td>')
    if reputationText:
        for repLine in reputationText.split("&bull; "):
            tagName = parser.unescape(albatross.getEnclosedString(repLine, r'">', r'</a>'))
            tagRep = int(re.sub('\([0-9\,]+\)', '', albatross.getEnclosedString(repLine, r': ', '').replace(' ', '')).replace(",", ""))
            attrs['reputation'][self.connection.tag(tagName)] = tagRep
    tokenText = albatross.getEnclosedString(html, '<td>Tokens</td>\s+<td>', '</td>')
    if not tokenText:
        tokenText = 0
    attrs['tokens'] = int(tokenText)
    attrs['goodTokens'] = int(albatross.getEnclosedString(html, '<td>(<a href="tokenlist\.php\?user=' + str(attrs['id']) + '&type=2">)?Good Tokens(</a>)?</td>\s+<td>', '</td>'))
    attrs['badTokens'] = int(albatross.getEnclosedString(html, '<td>(<a href="tokenlist\.php\?user=' + str(attrs['id']) + '&type=1">)?Bad Tokens(</a>)?</td>\s+<td>', '</td>'))
    attrs['created'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(html, '<td>Account Created</td>\s+<td>', '</td>'), "%m/%d/%Y"))
    attrs['active'] = bool(re.search('\(online now\)', albatross.getEnclosedString(html, '<td>Last Active</td>\s+<td>', '</td>')))
    attrs['lastActive'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(html, '<td>Last Active</td>\s+<td>', '( \(online now\))?</td>'), "%m/%d/%Y"))
    attrs['sig'] = True and albatross.getEnclosedString(html, '<td>Signature</td>\s+<td>', '</td>') or None
    attrs['quote'] = True and albatross.getEnclosedString(html, '<td>Quote</td>\s+<td>', '</td>') or None
    attrs['email'] = True and albatross.getEnclosedString(html, '<td>Email Address</td>\s+<td>', '</td>') or None
    attrs['im'] = True and albatross.getEnclosedString(html, '<td>Instant Messaging</td>\s+<td>', '</td>') or None
    attrs['picture'] = True and albatross.getEnclosedString(html, '<td>Picture</td>\s+<td>\s*<a target="_blank" imgsrc="http:', '" href') or None
    return attrs
def parse(self, html):
    """
    Parses a user's profile page.
    Returns a dict of attributes.
    """
    attrs = {}
    parser = HTMLParser()
    centralTime = pytz.timezone("America/Chicago")
    attrs['id'] = int(albatross.getEnclosedString(html, "<td>User ID</td>\s+<td>", r"</td>"))
    attrs['name'] = parser.unescape(albatross.getEnclosedString(html, r'<th colspan="2">Current Information for ', r'</th>'))
    try:
        attrs['level'] = int(albatross.getEnclosedString(html, r"""<td><a href="//endoftheinter\.net/profile\.php\?user=""" + str(attrs['id']) + """\">""" + re.escape(xml.sax.saxutils.escape(attrs['name'])) + """</a> \(""", r'\)'))
    except ValueError:
        # User has a non-integral level.
        attrs['level'] = 0
    matchStatus = albatross.getEnclosedString(html, "<td>Status</td>\s+<td>", r"</td>")
    attrs['banned'] = False
    attrs['suspended'] = False
    if matchStatus:
        if matchStatus == '<b>Banned</b>':
            attrs['banned'] = True
        else:
            attrs['suspended'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(matchStatus, '<b>Suspended until:</b> ', ''), "%m/%d/%Y %I:%M:%S %p"))
    attrs['formerly'] = None
    nameChanged = albatross.getEnclosedString(html, "<td>Formerly</td>\s+<td>", "</td>")
    if nameChanged:
        attrs['formerly'] = nameChanged.split(", ")
    attrs['reputation'] = {}
    reputationText = albatross.getEnclosedString(html, r'<td>Reputation</td><td style="line-height:1.6em">', r'</td>')
    if reputationText:
        for repLine in reputationText.split("&bull; "):
            tagName = parser.unescape(albatross.getEnclosedString(repLine, r'">', r'</a>'))
            tagRep = int(re.sub('\([0-9\,]+\)', '', albatross.getEnclosedString(repLine, r': ', '').replace(' ', '')).replace(",", ""))
            attrs['reputation'][self.connection.tag(tagName)] = tagRep
    tokenText = albatross.getEnclosedString(html, '<td>Tokens</td>\s+<td>', '</td>')
    if not tokenText:
        tokenText = 0
    attrs['tokens'] = int(tokenText)
    attrs['goodTokens'] = int(albatross.getEnclosedString(html, '<td>(<a href="tokenlist\.php\?user=' + str(attrs['id']) + '&type=2">)?Good Tokens(</a>)?</td>\s+<td>', '</td>'))
    attrs['badTokens'] = int(albatross.getEnclosedString(html, '<td>(<a href="tokenlist\.php\?user=' + str(attrs['id']) + '&type=1">)?Bad Tokens(</a>)?</td>\s+<td>', '</td>'))
    attrs['created'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(html, '<td>Account Created</td>\s+<td>', '</td>'), "%m/%d/%Y"))
    attrs['active'] = bool(re.search('\(online now\)', albatross.getEnclosedString(html, '<td>Last Active</td>\s+<td>', '</td>')))
    attrs['lastActive'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(html, '<td>Last Active</td>\s+<td>', '( \(online now\))?</td>'), "%m/%d/%Y"))
    attrs['sig'] = True and albatross.getEnclosedString(html, '<td>Signature</td>\s+<td>', '</td>') or None
    attrs['quote'] = True and albatross.getEnclosedString(html, '<td>Quote</td>\s+<td>', '</td>') or None
    attrs['email'] = True and albatross.getEnclosedString(html, '<td>Email Address</td>\s+<td>', '</td>') or None
    attrs['im'] = True and albatross.getEnclosedString(html, '<td>Instant Messaging</td>\s+<td>', '</td>') or None
    attrs['picture'] = True and albatross.getEnclosedString(html, '<td>Picture</td>\s+<td>\s*<a target="_blank" imgsrc="http:', '" href') or None
    return attrs
def search(self, query="", maxTime=None, maxID=None, activeSince=None, topics=None, recurse=False): """ Searches for topics using given parameters, and returns a list of dicts of returned topics. By default, recursively iterates through every page of search results. Upon failure returns False. """ if topics is None: self._topics = [] # if allowedTags or forbiddenTags is provided, it overrides this topiclist object's personal allowed or forbidden tags. if maxID is None: maxID = "" if activeSince is None: activeSince = pytz.timezone('America/Chicago').localize( datetime.datetime(1970, 1, 1)) else: # the topic listing only provides minute-level resolution, so remove seconds and microseconds from activeSince. activeSince = activeSince - datetime.timedelta( 0, activeSince.second, activeSince.microsecond) while not maxTime or maxTime > activeSince: # assemble the search query and request this search page's topic listing. requestArgs = {'q': str(query).encode('utf-8')} if maxTime is not None: if isinstance(maxTime, datetime.datetime): maxTime = calendar.timegm(maxTime.utctimetuple()) requestArgs['ts'] = str(maxTime).encode('utf-8') if maxID is not None: requestArgs['t'] = str(maxID).encode('utf-8') searchQuery = urllib.parse.urlencode(requestArgs) url = 'https://boards.endoftheinter.net/topics/' + self.formatTagQueryString( ) + '?' + searchQuery topicPageHTML = self.connection.page(url).html # split the topic listing string into a list so that one topic is in each element. topicListingHTML = albatross.getEnclosedString( topicPageHTML, '<th>Last Post</th></tr>', '</tr></table>', multiLine=True) if not topicListingHTML: # No topic listing table. this means there are no topics that matched the search. break topicListingHTML = topicListingHTML.split( '</tr>') if topicListingHTML else [] originalTopicsNum = len(self._topics) for topic in topicListingHTML: topicInfo = self.parse(topic) print( datetime.datetime.strftime(topicInfo['lastPostTime'], '%Y-%m-%d')) if topicInfo and topicInfo['lastPostTime'] >= activeSince: self._topics.append( self.connection.topic(topicInfo['id']).set(topicInfo)) if len(self._topics) == originalTopicsNum: # No matching topics; end our search. break if not recurse: break # we can't parallelize this, since we have no way of predicting the next ts and t parameters. DAMN YOU KEYSET PAGING maxTime = self._topics[-1].lastPostTime maxID = self._topics[-1].id self._topics = sorted(self._topics, key=lambda topic: topic.lastPostTime, reverse=True) return self
def testGetNonexistentEnclosedString(self):
    assert albatross.getEnclosedString(
        self.testString, r"<span>", "this ending doesn't exist") == False
    assert albatross.getEnclosedString(self.testString, r"This beginning doesn't exist", "</span>") == False
def parse(self, html):
    """
    Given the HTML of a topic page, returns a dict of attributes.
    """
    attrs = {}
    parser = HTMLParser()
    soup = bs4.BeautifulSoup(html)
    attrs['archived'] = bool(re.search(r'<h2><em>This topic has been archived\. No additional messages may be posted\.</em></h2>', html))
    subdomain = "archives" if attrs['archived'] else "boards"
    attrs['title'] = parser.unescape(albatross.getEnclosedString(html, r'\<h1\>', r'\<\/h1\>'))
    attrs['date'] = pytz.timezone('America/Chicago').localize(
        datetime.datetime.strptime(albatross.getEnclosedString(html, r'<b>Posted:</b> ', r' \| '), "%m/%d/%Y %I:%M:%S %p"))
    userID = int(albatross.getEnclosedString(html, r'<div class="message-top"><b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=', r'">'))
    username = parser.unescape(True and albatross.getEnclosedString(html, r'<div class="message-top"><b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=\d+">', r'</a>') or 'Human')
    attrs['user'] = self.connection.user(userID).set({'name': username})
    attrs['pages'] = int(albatross.getEnclosedString(html, r'">(First Page</a> \| )?(<a href)?(\S+)?(Previous Page</a> \| )?Page \d+ of <span>', r'</span>'))
    attrs['closed'] = attrs['archived']
    tagNames = [
        urllib.parse.unquote(albatross.getEnclosedString(tagEntry, '<a href="/topics/', r'">'))
        for tagEntry in albatross.getEnclosedString(html, r"<h2><div", r"</div></h2>").split(r"</a>")[:-1]
        if not tagEntry.startswith(' <span')
    ]
    # we need to process tag names
    # e.g. remove enclosing square braces and decode html entities.
    cleanedTagNames = []
    for tagName in tagNames:
        if tagName.startswith("[") and tagName.endswith("]"):
            tagName = tagName[1:-1]
        cleanedTagNames.append(parser.unescape(tagName.replace("_", " ")))
    attrs['tags'] = self.connection.tags(tags=cleanedTagNames)
    lastPage = self.connection.page('https://' + subdomain + '.endoftheinter.net/showmessages.php?topic=' + str(self.id) + '&page=' + str(attrs['pages']))
    if lastPage.authed:
        lastPagePosts = self.getPagePosts(lastPage.html)
        lastPost = self.connection.post(1, self)
        lastPost = lastPost.set(lastPost.parse(lastPagePosts[-1]))
        attrs['lastPostTime'] = lastPost.date
    csrfTag = soup.find("input", {"name": "h"})
    if csrfTag:
        attrs['csrfKey'] = csrfTag.get('value')
    return attrs
def scrape_imagemaps(self):
    '''
    Processes the imagemap scraping queue.
    '''
    if (datetime.datetime.now(tz=pytz.utc) - self.info['last_run_time']) < datetime.timedelta(seconds=10):
        return
    self.info['last_run_time'] = datetime.datetime.now(tz=pytz.utc)
    self.daemon.log.info("Processing imagemap queue.")
    scrape_requests = self.dbs['imagemap'].table('scrape_requests').fields(
        'scrape_requests.user_id', 'scrape_requests.date', 'scrape_requests.password',
        'scrape_requests.private', 'scrape_requests.permanent', 'scrape_requests.max_pages',
        'users.name').join('users ON users.id=scrape_requests.user_id').where(
        'password IS NOT NULL', progress=0).order('date ASC').list()
    for request in scrape_requests:
        # process scrape request.
        self.daemon.log.info("Processing usermap ID " + str(request['user_id']) + ".")
        self.dbs['imagemap'].table('scrape_requests').set(progress=1).where(user_id=request['user_id']).update()
        # attempt to use a cookie string for this user, if one is provided.
        try:
            if request['user_id'] not in self.info['cookie_strings']:
                eti = albatross.Connection(username=request['name'], password=request['password'], loginSite=albatross.SITE_MOBILE)
            else:
                try:
                    eti = albatross.Connection(cookieString=self.info['cookie_strings'][request['user_id']], loginSite=albatross.SITE_MOBILE)
                except albatross.UnauthorizedError:
                    # cookie string is expired. try to login to grab a new one.
                    del self.info['cookie_strings'][request['user_id']]
                    eti = albatross.Connection(username=request['name'], password=request['password'], loginSite=albatross.SITE_MOBILE)
        except albatross.UnauthorizedError:
            # incorrect password, or ETI is down.
            self.daemon.log.info("Incorrect password or ETI down for usermap ID " + str(request['user_id']) + ". Skipping.")
            self.dbs['imagemap'].table('scrape_requests').set(password=None, progress=-1).where(user_id=request['user_id']).update()
            continue
        # store the latest cookie string for this user.
        self.info['cookie_strings'][request['user_id']] = eti.cookieString
        # get this user's currently-uploaded image hashes.
        user_hashes = self.dbs['imagemap'].table('images').fields('hash').where(user_id=request['user_id']).list(valField='hash')
        user_hashes = {image_hash: 1 for image_hash in user_hashes}
        base_datetime = datetime.datetime.now(tz=pytz.utc)
        images_to_add = []
        params = {
            'images': images_to_add,
            'hashes': user_hashes,
            'user_id': request['user_id'],
            'base_datetime': base_datetime,
            'page_num': 1,
            'private': request['private']
        }
        start_page_num = 1
        if request['max_pages'] is None:
            # fetch imagemap's first page to get number of pages.
            imap_first_page_html = eti.page('https://images.endoftheinter.net/imagemap.php').html
            imap_first_page = bs4.BeautifulSoup(imap_first_page_html)
            infobar = imap_first_page.find('div', {'class': 'infobar'})
            last_page_link = infobar.find_all('a')[-1]
            last_page_num = int(albatross.getEnclosedString(last_page_link.attrs['href'], 'page=', ''))
            # process the first imagemap page that we've already gotten.
            start_page_num = 2
            self.process_imagemap_page(imap_first_page_html, 'https://images.endoftheinter.net/imagemap.php?page=1', None, params)
            if not params['images']:
                # usermap is unchanged. break.
                self.daemon.log.info('First imagemap page is unchanged for userID ' + str(request['user_id']) + '. Skipping.')
                self.dbs['imagemap'].table('scrape_requests').set(progress=0).where(user_id=request['user_id']).update()
                continue
        else:
            last_page_num = int(request['max_pages'])
        # now loop over all the other pages (if there are any).
        # if this is the user's first scrape, do this in parallel.
        # otherwise do this in serial so we can break.
        if not user_hashes:
            self.daemon.log.info('Fetching imagemap in parallel.')
            self.scrape_map_parallel(eti, start_page_num, last_page_num, params)
        else:
            self.daemon.log.info('Fetching imagemap in serial.')
            self.scrape_map_serial(eti, start_page_num, last_page_num, params)
        # add images to the database.
        if images_to_add:
            self.dbs['imagemap'].table('images').fields('server', 'hash', 'filename', 'type', 'user_id', 'created', 'hits', 'tags', 'private').values(images_to_add).onDuplicateKeyUpdate('hash=hash').insert()
            self.dbs['imagemap'].table('users').set('image_count=image_count+' + str(len(images_to_add))).where(id=request['user_id']).update()
        # set progress to finished.
        if request['permanent'] > 0:
            # this is a permanent scrape request. insert this back into the queue with the current time.
            current_time = datetime.datetime.now(tz=pytz.utc).strftime('%Y-%m-%d %H:%M:%S')
            self.dbs['imagemap'].table('scrape_requests').set(progress=0, date=current_time).where(user_id=request['user_id']).update()
        else:
            self.dbs['imagemap'].table('scrape_requests').set(password=None, progress=100).where(user_id=request['user_id']).update()
        self.daemon.log.info("Inserted " + str(len(images_to_add)) + " images for userID " + str(request['user_id']) + ".")
def testGetNormalEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r'">', r'<a') == "This is a test"
    assert albatross.getEnclosedString(self.testString, r"""<a href="test.php\?id=[0-9]+&topic=[0-9]+">""", r"</a>") == u"""This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed"""
def testGetGreedyEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r"""<a href="test.php\?id=[0-9]+&topic=[0-9]+">""", r"</a>", greedy=True) == u"""This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed</a>String"""
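# A minimal sketch of getEnclosedString consistent with the tests above. This is an
# assumption about its behavior, not albatross's actual implementation: startString and
# endString are treated as regexes, an empty endString captures to the end of the line
# (or of the string when multiLine is set), and False is returned when nothing matches.
import re

def getEnclosedString(text, startString="", endString="", multiLine=False, greedy=False):
    # DOTALL lets the captured group span newlines when multiLine is requested.
    flags = re.DOTALL if multiLine else 0
    middle = "(.*)" if greedy else "(.*?)"
    if endString:
        pattern = startString + middle + endString
    else:
        # No end delimiter: capture everything after startString.
        pattern = startString + "(.*)"
    match = re.search(pattern, text, flags)
    return match.group(1) if match else False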
def search(self, query="", maxID=None, activeSince=None, createdSince=None, startPageNum=None, endPageNum=None, recurse=False): """ Searches for users using given parameters, and returns the current user listing object Performs operation in parallel. """ self._users = [] self._userIDs = {} maxID = float("inf") if maxID is None else int(maxID) activeSince = pytz.timezone('America/Chicago').localize( datetime.datetime(1970, 1, 1)) if activeSince is None else activeSince createdSince = pytz.timezone('America/Chicago').localize( datetime.datetime(1970, 1, 1)) if createdSince is None else createdSince startPageNum = 1 if startPageNum is None else int(startPageNum) paramArray = { 'maxID': maxID, 'activeSince': activeSince, 'createdSince': createdSince } if endPageNum is None or not recurse: # fetch first page to grab number of pages, and grab users while we're at it. userListParams = urllib.parse.urlencode([('user', str(query)), ('page', str(startPageNum))]) firstUrl = 'https://endoftheinter.net/userlist.php?' + userListParams firstUserPage = self.connection.page(firstUrl) self.appendUsers(firstUserPage.html, firstUrl, None, paramArray) endPageNum = int( albatross.getEnclosedString( firstUserPage.html, r'Page ' + str(startPageNum) + r' of <span>', r'</span>')) # increment start page num. startPageNum += 1 else: endPageNum = int(endPageNum) if not recurse: return self # now loop over all the other pages (if there are any) for pageNum in range(startPageNum, endPageNum + 1): userListParams = urllib.parse.urlencode([('user', str(query)), ('page', str(pageNum))]) self.connection.parallelCurl.startrequest( 'https://endoftheinter.net/userlist.php?' + userListParams, self.appendUsers, paramArray) self.connection.parallelCurl.finishallrequests() self._users = sorted(self._users, key=lambda userObject: userObject.id) return self