def parse_pm_listing(conn, html):
    """
    Given a connection and some html containing a PM inbox's contents,
    return a list of dicts, each containing attributes of a thread in the inbox.
    """
    threads = []
    soup = bs4.BeautifulSoup(html)
    pmTable = soup.find('table', {'class': 'grid'})
    # first row is header row.
    for row in pmTable.find_all('tr')[1:]:
        rowSections = row.find_all('td')
        pmProps = {'read': False, 'unreadCount': 0}
        if rowSections[0].find('b') is None:
            pmProps['read'] = True
        pmProps['subject'] = rowSections[0].find('a').text
        pmProps['id'] = int(
            albatross.getEnclosedString(rowSections[0].find('a').get('href'),
                                        'thread=', ''))
        user_id = int(
            albatross.getEnclosedString(rowSections[1].find('a').get('href'),
                                        'user=', ''))
        user_name = rowSections[1].find('a').text
        pmProps['user'] = conn.user(user_id, name=user_name)
        unreadNode = rowSections[2].find('a')
        if unreadNode is not None:
            pmProps['unreadCount'] = int(unreadNode.text[1:])
            unreadNode.extract()
        pmProps['pmCount'] = int(rowSections[2].text.replace(' ()', ''))
        pmProps['lastPMTime'] = pytz.timezone('America/Chicago').localize(
            datetime.datetime.strptime(rowSections[3].text, "%m/%d/%Y %H:%M"))
        threads.append(pmProps)
    return threads
def testGetNormalEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r'">', r'<a') == "This is a test"
    assert albatross.getEnclosedString(
        self.testString,
        r"""<a href="test.php\?id=[0-9]+&topic=[0-9]+">""",
        r"</a>"
    ) == u"""This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed"""
def parse_pm(conn, html):
    """
    Given a connection and some html containing a PM's contents,
    return a dict, containing attributes of the PM.
    """
    soup = bs4.BeautifulSoup(html)
    pmInfo = soup.find('div', {'class': 'message-top'})
    user_id = int(albatross.getEnclosedString(pmInfo.find('a').get('href'), 'user=', ''))
    user_name = pmInfo.find('a').text
    posted_elt = pmInfo.find('b', text='Posted:')
    posted_date = pytz.timezone('America/Chicago').localize(
        datetime.datetime.strptime(posted_elt.next_sibling.strip(), "%m/%d/%Y %I:%M:%S %p |"))
    quote_elt = pmInfo.find('a', text='Quote')
    pm_id = int(albatross.getEnclosedString(quote_elt.get('href'), 'quote=', ''))
    pm_thread_id = int(albatross.getEnclosedString(quote_elt.get('href'), 'pm=', '&quote='))
    pmContents = soup.find('table', {'class': 'message-body'}).find('td', {'class': 'message'}).contents
    separators = [i for i, j in enumerate(pmContents) if j == u'\n---']
    if separators:
        lastSeparator = separators[-1]
        pm_html = ''.join(unicode(x) for x in pmContents[:lastSeparator])
        if lastSeparator + 2 > len(pmContents):
            pm_sig = ''.join(unicode(x) for x in pmContents[lastSeparator+1:])
        else:
            pm_sig = ''.join(unicode(x) for x in pmContents[lastSeparator+2:])
    else:
        pm_html = ''.join(unicode(x) for x in pmContents)
        pm_sig = ''
        lastSeparator = len(separators)
    return {
        'id': pm_id,
        'user': conn.user(user_id, name=user_name),
        'date': posted_date,
        'thread': conn.pmThread(pm_thread_id),
        'html': pm_html,
        'sig': pm_sig
    }
def parse_pm_listing(conn, html):
    """
    Given a connection and some html containing a PM inbox's contents,
    return a list of dicts, each containing attributes of a thread in the inbox.
    """
    threads = []
    soup = bs4.BeautifulSoup(html)
    pmTable = soup.find('table', {'class': 'grid'})
    # first row is header row.
    for row in pmTable.find_all('tr')[1:]:
        rowSections = row.find_all('td')
        pmProps = {'read': False, 'unreadCount': 0}
        if rowSections[0].find('b') is None:
            pmProps['read'] = True
        pmProps['subject'] = rowSections[0].find('a').text
        pmProps['id'] = int(albatross.getEnclosedString(rowSections[0].find('a').get('href'), 'thread=', ''))
        user_id = int(albatross.getEnclosedString(rowSections[1].find('a').get('href'), 'user=', ''))
        user_name = rowSections[1].find('a').text
        pmProps['user'] = conn.user(user_id, name=user_name)
        unreadNode = rowSections[2].find('a')
        if unreadNode is not None:
            pmProps['unreadCount'] = int(unreadNode.text[1:])
            unreadNode.extract()
        pmProps['pmCount'] = int(rowSections[2].text.replace(' ()', ''))
        pmProps['lastPMTime'] = pytz.timezone('America/Chicago').localize(
            datetime.datetime.strptime(rowSections[3].text, "%m/%d/%Y %H:%M"))
        threads.append(pmProps)
    return threads
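# Hypothetical usage sketch for parse_pm_listing above. The inbox URL and the
# conn.page(...) helper are assumptions based on how the connection object is used
# elsewhere in this code, not a confirmed API:
#
#   inbox_html = conn.page('https://endoftheinter.net/inbox.php').html
#   for thread in parse_pm_listing(conn, inbox_html):
#       print(thread['id'], thread['subject'], thread['pmCount'],
#             thread['unreadCount'], thread['lastPMTime'])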
def parse_pm(conn, html):
    """
    Given a connection and some html containing a PM's contents,
    return a dict, containing attributes of the PM.
    """
    soup = bs4.BeautifulSoup(html)
    pmInfo = soup.find('div', {'class': 'message-top'})
    user_id = int(albatross.getEnclosedString(pmInfo.find('a').get('href'), 'user=', ''))
    user_name = pmInfo.find('a').text
    posted_elt = pmInfo.find('b', text='Posted:')
    posted_date = pytz.timezone('America/Chicago').localize(
        datetime.datetime.strptime(posted_elt.next_sibling.strip(), "%m/%d/%Y %I:%M:%S %p |"))
    quote_elt = pmInfo.find('a', text='Quote')
    pm_id = int(albatross.getEnclosedString(quote_elt.get('href'), 'quote=', ''))
    pm_thread_id = int(albatross.getEnclosedString(quote_elt.get('href'), 'pm=', '&quote='))
    pmContents = soup.find('table', {'class': 'message-body'}).find('td', {'class': 'message'}).contents
    separators = [i for i, j in enumerate(pmContents) if j == u'\n---']
    if separators:
        lastSeparator = separators[-1]
        pm_html = ''.join(str(x) for x in pmContents[:lastSeparator])
        if lastSeparator + 2 > len(pmContents):
            pm_sig = ''.join(str(x) for x in pmContents[lastSeparator+1:])
        else:
            pm_sig = ''.join(str(x) for x in pmContents[lastSeparator+2:])
    else:
        pm_html = ''.join(str(x) for x in pmContents)
        pm_sig = ''
        lastSeparator = len(separators)
    return {
        'id': pm_id,
        'user': conn.user(user_id, name=user_name),
        'date': posted_date,
        'thread': conn.pmThread(pm_thread_id),
        'html': pm_html,
        'sig': pm_sig
    }
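# Minimal illustration (input shape assumed) of the signature split in parse_pm above:
# the td.message contents are a list of nodes, and the last u'\n---' text node marks
# where the signature starts; the node immediately after it (typically a <br/>) is skipped.
#
#   contents = ['Hello there.', u'\n---', <br/>, 'my sig']
#   -> pm_html == 'Hello there.'   (everything before the last separator)
#   -> pm_sig  == 'my sig'         (everything after the separator and the <br/>)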
def __init__(self, conn, tags=None, active=False):
    super(TagList, self).__init__(conn)
    if tags is None:
        tags = []
    self._tagNames = dict(zip(tags, [1] * len(tags)))
    self._tags = None
    if active:
        parser = HTMLParser.HTMLParser()
        mainPage = page.Page(self.connection, "https://endoftheinter.net/main.php")
        tagLinksHTML = albatross.getEnclosedString(mainPage.html, r'<div style="font-size: 14px">', r'</div>', multiLine=True)
        tagLinks = tagLinksHTML.split(' &bull; ')
        for text in tagLinks:
            self._tagNames[parser.unescape(albatross.getEnclosedString(text, '">', '</a>')).strip()] = 1
    self.load()
def parse(self, text):
    """
    Given some HTML containing a post, return a dict of attributes.
    """
    parser = HTMLParser.HTMLParser()
    timeString = albatross.getEnclosedString(text, r'<b>Posted:</b> ', r' \| ', greedy=False)
    altTimeString = albatross.getEnclosedString(text, r'<b>Posted:</b> ', r'</div>', greedy=False)
    timeString = timeString if timeString and len(timeString) < len(altTimeString) else altTimeString
    user = self.connection.user(int(albatross.getEnclosedString(text, r'<b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=', r'">'))).set({'name': parser.unescape(True and albatross.getEnclosedString(text, r'<b>From:</b>\ <a href="//endoftheinter\.net/profile\.php\?user=\d+">', r'</a>') or u'Human')})
    attrs = {
        'id': int(albatross.getEnclosedString(text, r'<div class="message-container" id="m', r'">')),
        'user': user,
        'date': pytz.timezone('America/Chicago').localize(datetime.datetime.strptime(timeString, "%m/%d/%Y %I:%M:%S %p")),
        'html': albatross.getEnclosedString(text, r' class="message">', '(\n)?---<br />(\n)?', multiLine=True, greedy=True),
        'sig': albatross.getEnclosedString(text, '(\n)?---<br />(\n)?', r'</td>', multiLine=True, greedy=False)
    }
    if attrs['html'] is False:
        # sigless and on message detail page.
        attrs['html'] = albatross.getEnclosedString(text, r' class="message">', r'</td>', multiLine=True, greedy=False)
        attrs['sig'] = u""
    if attrs['html'] is False:
        # sigless and on topic listing.
        attrs['html'] = albatross.getEnclosedString(text, r' class="message">', r'', multiLine=True, greedy=True)
    if attrs['html'] is False:
        raise MalformedPostError(self, self.topic, unicode(text))
    attrs['html'] = attrs['html'].rstrip("\n")
    if attrs['sig'] is not False:
        attrs['sig'] = attrs['sig'].rstrip("\n")
    return attrs
def testGetGreedyEnclosedString(self):
    assert albatross.getEnclosedString(
        self.testString,
        r"""<a href="test.php\?id=[0-9]+&topic=[0-9]+">""",
        r"</a>",
        greedy=True
    ) == u"""This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed</a>String"""
def search(self, query="", maxTime=None, maxID=None, activeSince=None, topics=None, recurse=False): """ Searches for topics using given parameters, and returns a list of dicts of returned topics. By default, recursively iterates through every page of search results. Upon failure returns False. """ if topics is None: self._topics = [] # if allowedTags or forbiddenTags is provided, it overrides this topiclist object's personal allowed or forbidden tags. if maxID is None: maxID = "" if activeSince is None: activeSince = pytz.timezone('America/Chicago').localize(datetime.datetime(1970, 1, 1)) else: # the topic listing only provides minute-level resolution, so remove seconds and microseconds from activeSince. activeSince = activeSince - datetime.timedelta(0, activeSince.second, activeSince.microsecond) while not maxTime or maxTime > activeSince: # assemble the search query and request this search page's topic listing. requestArgs = { 'q': unicode(query).encode('utf-8') } if maxTime is not None: if isinstance(maxTime, datetime.datetime): maxTime = calendar.timegm(maxTime.utctimetuple()) requestArgs['ts'] = unicode(maxTime).encode('utf-8') if maxID is not None: requestArgs['t'] = unicode(maxID).encode('utf-8') searchQuery = urllib.urlencode(requestArgs) url = 'https://boards.endoftheinter.net/topics/' + self.formatTagQueryString() + '?' + searchQuery topicPageHTML = self.connection.page(url).html # split the topic listing string into a list so that one topic is in each element. topicListingHTML = albatross.getEnclosedString(topicPageHTML, '<th>Last Post</th></tr>', '</tr></table>', multiLine=True) if not topicListingHTML: # No topic listing table. this means there are no topics that matched the search. break topicListingHTML = topicListingHTML.split('</tr>') if topicListingHTML else [] originalTopicsNum = len(self._topics) for topic in topicListingHTML: topicInfo = self.parse(topic) if topicInfo and topicInfo['lastPostTime'] >= activeSince: self._topics.append(self.connection.topic(topicInfo['id']).set(topicInfo)) if len(self._topics) == originalTopicsNum: # No matching topics; end our search. break if not recurse: break # we can't parallelize this, since we have no way of predicting the next ts and t parameters. DAMN YOU KEYSET PAGING maxTime = self._topics[-1].lastPostTime maxID = self._topics[-1].id self._topics = sorted(self._topics, key=lambda topic: topic.lastPostTime, reverse=True) return self
def __init__(self, conn, tags=None, active=False):
    super(TagList, self).__init__(conn)
    if tags is None:
        tags = []
    self._tagNames = dict(zip(tags, [1] * len(tags)))
    self._tags = None
    if active:
        parser = HTMLParser()
        mainPage = page.Page(self.connection, "https://endoftheinter.net/main.php")
        tagLinksHTML = albatross.getEnclosedString(
            mainPage.html, r'<div id="active-tags" style="font-size: 14px">', r'</div>',
            multiLine=True)
        tagLinks = tagLinksHTML.split(' &bull; ')
        for text in tagLinks:
            self._tagNames[parser.unescape(
                albatross.getEnclosedString(text, '">', '</a>')).strip()] = 1
    self.load()
def parse(self, html):
    """
    Given the HTML of a topic page, returns a dict of attributes.
    """
    attrs = {}
    parser = HTMLParser.HTMLParser()
    soup = bs4.BeautifulSoup(html)
    attrs['archived'] = bool(re.search(r'<h2><em>This topic has been archived\. No additional messages may be posted\.</em></h2>', html))
    subdomain = "archives" if attrs['archived'] else "boards"
    attrs['title'] = parser.unescape(albatross.getEnclosedString(html, r'\<h1\>', r'\<\/h1\>'))
    attrs['date'] = pytz.timezone('America/Chicago').localize(datetime.datetime.strptime(albatross.getEnclosedString(html, r'<b>Posted:</b> ', r' \| '), "%m/%d/%Y %I:%M:%S %p"))
    userID = int(albatross.getEnclosedString(html, r'<div class="message-top"><b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=', r'">'))
    username = parser.unescape(True and albatross.getEnclosedString(html, r'<div class="message-top"><b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=\d+">', r'</a>') or 'Human')
    attrs['user'] = self.connection.user(userID).set({'name': username})
    attrs['pages'] = int(albatross.getEnclosedString(html, r'">(First Page</a> \| )?(<a href)?(\S+)?(Previous Page</a> \| )?Page \d+ of <span>', r'</span>'))
    attrs['closed'] = attrs['archived']
    tagNames = [
        urllib2.unquote(albatross.getEnclosedString(tagEntry, '<a href="/topics/', r'">'))
        for tagEntry in albatross.getEnclosedString(html, r"<h2><div", r"</div></h2>").split(r"</a>")[:-1]
        if not tagEntry.startswith(' <span')
    ]
    # we need to process tag names
    # e.g. remove enclosing square braces and decode html entities.
    cleanedTagNames = []
    for tagName in tagNames:
        if tagName.startswith("[") and tagName.endswith("]"):
            tagName = tagName[1:-1]
        cleanedTagNames.append(parser.unescape(tagName.replace("_", " ")))
    attrs['tags'] = self.connection.tags(tags=cleanedTagNames)
    lastPage = self.connection.page('https://' + subdomain + '.endoftheinter.net/showmessages.php?topic=' + unicode(self.id) + '&page=' + unicode(attrs['pages']))
    if lastPage.authed:
        lastPagePosts = self.getPagePosts(lastPage.html)
        lastPost = self.connection.post(1, self)
        lastPost = lastPost.set(lastPost.parse(lastPagePosts[-1]))
        attrs['lastPostTime'] = lastPost.date
    csrfTag = soup.find("input", {"name": "h"})
    if csrfTag:
        attrs['csrfKey'] = csrfTag.get('value')
    return attrs
def search(self, query="", maxID=None, activeSince=None, createdSince=None, startPageNum=None, endPageNum=None, recurse=False): """ Searches for users using given parameters, and returns the current user listing object Performs operation in parallel. """ self._users = [] self._userIDs = {} maxID = float("inf") if maxID is None else int(maxID) activeSince = pytz.timezone('America/Chicago').localize(datetime.datetime(1970, 1, 1)) if activeSince is None else activeSince createdSince = pytz.timezone('America/Chicago').localize(datetime.datetime(1970, 1, 1)) if createdSince is None else createdSince startPageNum = 1 if startPageNum is None else int(startPageNum) paramArray = {'maxID': maxID, 'activeSince': activeSince, 'createdSince': createdSince} if endPageNum is None or not recurse: # fetch first page to grab number of pages, and grab users while we're at it. userListParams = urllib.urlencode([('user', unicode(query)), ('page', str(startPageNum))]) firstUrl = 'https://endoftheinter.net/userlist.php?' + userListParams firstUserPage = self.connection.page(firstUrl) self.appendUsers(firstUserPage.html, firstUrl, None, paramArray) endPageNum = int(albatross.getEnclosedString(firstUserPage.html, r'Page ' + str(startPageNum) + r' of <span>', r'</span>')) # increment start page num. startPageNum += 1 else: endPageNum = int(endPageNum) if not recurse: return self # now loop over all the other pages (if there are any) for pageNum in range(startPageNum, endPageNum+1): userListParams = urllib.urlencode([('user', unicode(query)), ('page', str(pageNum))]) self.connection.parallelCurl.startrequest('https://endoftheinter.net/userlist.php?' + userListParams, self.appendUsers, paramArray) self.connection.parallelCurl.finishallrequests() self._users = sorted(self._users, key=lambda userObject: userObject.id) return self
def testGetBeginningEndEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, "<span>", "") == u"</span>"
    assert albatross.getEnclosedString(self.testString, "", "</a>") == u"""<a href="test.php?id=58&topic=23">This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed"""
def parse(self, text):
    """
    Given some JSON containing a tag or list of tags,
    return a dict of attributes for the current tag.
    """
    text = text[1:]
    try:
        tagJSON = json.loads(text)
    except ValueError:
        raise MalformedTagError(self, unicode(text))
    if len(tagJSON) < 1:
        raise MalformedTagError(self, unicode(tagJSON))
    # match only the tag in this JSON that has this tag's name, if it's set.
    if self.name:
        tagJSON = filter(lambda x: x[0] == self.name if not x[0].startswith("[") else x[0][1:-1] == self.name, tagJSON)
        if not tagJSON:
            raise InvalidTagError(self)
    tagJSON = tagJSON[0]
    name = tagJSON[0]
    if name.startswith("["):
        name = name[1:]
    if name.endswith("]"):
        name = name[:-1]
    tag = {'name': name}
    tag['staff'] = []
    moderatorText = albatross.getEnclosedString(tagJSON[1][0], r"<b>Moderators: </b>", r"<br /><b>Administrators:")
    if moderatorText:
        descriptionEndTag = "<br /><b>Moderators:"
        moderatorTags = moderatorText.split(", ")
        for moderator in moderatorTags:
            user = self.connection.user(int(albatross.getEnclosedString(moderator, r"\?user=", r'">'))).set({'name': albatross.getEnclosedString(moderator, r'">', r"</a>")})
            tag['staff'].append({'user': user, 'role': 'moderator'})
    else:
        descriptionEndTag = "<br /><b>Administrators:"
    administratorText = albatross.getEnclosedString(tagJSON[1][0], startString="<br /><b>Administrators: </b>", greedy=True)
    if administratorText:
        administratorTags = administratorText.split(", ")
        for administrator in administratorTags:
            user = self.connection.user(int(albatross.getEnclosedString(administrator, r"\?user=", r'">'))).set({'name': albatross.getEnclosedString(administrator, r'">', r"</a>")})
            tag['staff'].append({'user': user, 'role': 'administrator'})
    parser = HTMLParser.HTMLParser()
    descriptionText = albatross.getEnclosedString(tagJSON[1][0], r":</b> ", descriptionEndTag)
    if descriptionText:
        tag['description'] = parser.unescape(descriptionText)
    else:
        tag['description'] = ''
    tagInteractions = tagJSON[1][1]
    tag['related'] = tag['forbidden'] = tag['dependent'] = []
    if len(tagInteractions) > 0:
        if '0' in tagInteractions:
            tag['forbidden'] = [Tag(self.connection, key) for key in tagInteractions['0'].keys()]
        if '1' in tagInteractions:
            tag['dependent'] = [Tag(self.connection, key) for key in tagInteractions['1'].keys()]
        if '2' in tagInteractions:
            tag['related'] = [Tag(self.connection, key) for key in tagInteractions['2'].keys()]
    return tag
def testGetBeginningEndEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, "<span>", "") == u"</span>"
    assert albatross.getEnclosedString(
        self.testString, "", "</a>"
    ) == u"""<a href="test.php?id=58&topic=23">This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed"""
def parse(self, text):
    """
    Given some HTML containing a post, return a dict of attributes.
    """
    parser = HTMLParser()
    timeString = albatross.getEnclosedString(text, r'<b>Posted:</b> ', r' \| ', greedy=False)
    altTimeString = albatross.getEnclosedString(text, r'<b>Posted:</b> ', r'</div>', greedy=False)
    timeString = timeString if timeString and len(timeString) < len(altTimeString) else altTimeString
    user = self.connection.user(
        int(albatross.getEnclosedString(text, r'<b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=', r'">'))
    ).set({
        'name': parser.unescape(True and albatross.getEnclosedString(
            text, r'<b>From:</b>\ <a href="//endoftheinter\.net/profile\.php\?user=\d+">', r'</a>') or u'Human')
    })
    attrs = {
        'id': int(albatross.getEnclosedString(text, r'<div class="message-container" id="m', r'">')),
        'user': user,
        'date': pytz.timezone('America/Chicago').localize(datetime.datetime.strptime(timeString, "%m/%d/%Y %I:%M:%S %p")),
        'html': albatross.getEnclosedString(text, r' class="message">', '(\n)?---<br />(\n)?', multiLine=True, greedy=True),
        'sig': albatross.getEnclosedString(text, '(\n)?---<br />(\n)?', r'</td>', multiLine=True, greedy=False),
        'replies': 0
    }
    replies = albatross.getEnclosedString(text, r'amp;thread={}">Replies \('.format(attrs['id']), r'\)</a>', greedy=False)
    if replies is not False:
        attrs['replies'] = int(replies)
    if attrs['html'] is False:
        # sigless and on message detail page.
        attrs['html'] = albatross.getEnclosedString(text, r' class="message">', r'</td>', multiLine=True, greedy=False)
        attrs['sig'] = u""
    if attrs['html'] is False:
        # sigless and on topic listing.
        attrs['html'] = albatross.getEnclosedString(text, r' class="message">', r'', multiLine=True, greedy=True)
    if attrs['html'] is False:
        raise MalformedPostError(self, self.topic, str(text))
    attrs['html'] = attrs['html'].rstrip("\n")
    if attrs['sig'] is not False:
        attrs['sig'] = attrs['sig'].rstrip("\n")
    return attrs
def parse(self, text):
    """
    Given some JSON containing a tag or list of tags,
    return a dict of attributes for the current tag.
    """
    text = text[1:]
    try:
        tagJSON = json.loads(text)
    except ValueError:
        raise MalformedTagError(self, str(text))
    if len(tagJSON) < 1:
        raise MalformedTagError(self, str(tagJSON))
    # match only the tag in this JSON that has this tag's name, if it's set.
    if self.name:
        tagJSON = list(filter(
            lambda x: x[0] == self.name if not x[0].startswith("[") else x[0][1:-1] == self.name,
            tagJSON))
        if not tagJSON:
            raise InvalidTagError(self)
    tagJSON = tagJSON[0]
    name = tagJSON[0]
    if name.startswith("["):
        name = name[1:]
    if name.endswith("]"):
        name = name[:-1]
    tag = {'name': name}
    tag['staff'] = []
    moderatorText = albatross.getEnclosedString(tagJSON[1][0], r"<b>Moderators: </b>", r"<br /><b>Administrators:")
    if moderatorText:
        descriptionEndTag = "<br /><b>Moderators:"
        moderatorTags = moderatorText.split(", ")
        for moderator in moderatorTags:
            user = self.connection.user(
                int(albatross.getEnclosedString(moderator, r"\?user=", r'">'))
            ).set({'name': albatross.getEnclosedString(moderator, r'">', r"</a>")})
            tag['staff'].append({'user': user, 'role': 'moderator'})
    else:
        descriptionEndTag = "<br /><b>Administrators:"
    administratorText = albatross.getEnclosedString(tagJSON[1][0], startString="<br /><b>Administrators: </b>", greedy=True)
    if administratorText:
        administratorTags = administratorText.split(", ")
        for administrator in administratorTags:
            user = self.connection.user(
                int(albatross.getEnclosedString(administrator, r"\?user=", r'">'))
            ).set({'name': albatross.getEnclosedString(administrator, r'">', r"</a>")})
            tag['staff'].append({'user': user, 'role': 'administrator'})
    parser = HTMLParser()
    descriptionText = albatross.getEnclosedString(tagJSON[1][0], r":</b> ", descriptionEndTag)
    if descriptionText:
        tag['description'] = parser.unescape(descriptionText)
    else:
        tag['description'] = ''
    tagInteractions = tagJSON[1][1]
    tag['related'] = tag['forbidden'] = tag['dependent'] = []
    if len(tagInteractions) > 0:
        if '0' in tagInteractions:
            tag['forbidden'] = [Tag(self.connection, key) for key in tagInteractions['0'].keys()]
        if '1' in tagInteractions:
            tag['dependent'] = [Tag(self.connection, key) for key in tagInteractions['1'].keys()]
        if '2' in tagInteractions:
            tag['related'] = [Tag(self.connection, key) for key in tagInteractions['2'].keys()]
    return tag
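# Illustrative (assumed) shape of the tag JSON consumed by parse() above, inferred only
# from the indexing it performs: tagJSON[0] is the (possibly bracketed) name,
# tagJSON[1][0] is an HTML blob with staff and description, and tagJSON[1][1] maps
# interaction type ('0' forbidden, '1' dependent, '2' related) to tag-name dicts.
# The actual ETI response may differ; names below are placeholders.
#
#   [
#     ["[SomeTag]",
#      ["<b>Moderators: </b><a href=\"...?user=1\">someMod</a><br /><b>Administrators: </b>...",
#       {"0": {"forbiddenTag": 1}, "1": {"dependentTag": 1}, "2": {"relatedTag": 1}}]]
#   ]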
def testGetNonexistentEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r"<span>", "this ending doesn't exist") == False
    assert albatross.getEnclosedString(self.testString, r"This beginning doesn't exist", "</span>") == False
def testGetMultilineEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r"""<span class='secondLine'>""", r"</span>", multiLine=True) == u"""This is the second line of the test string.
def testGetEmptyEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, "<span>", "</span>") == u""
def testGetMultilineEnclosedString(self):
    assert albatross.getEnclosedString(
        self.testString, r"""<span class='secondLine'>""", r"</span>",
        multiLine=True) == u"""This is the second line of the test string.
def parse(self, html):
    """
    Parses a user's profile page.
    Returns a dict of attributes.
    """
    attrs = {}
    parser = HTMLParser.HTMLParser()
    centralTime = pytz.timezone("America/Chicago")
    attrs['id'] = int(albatross.getEnclosedString(html, "<td>User ID</td>\s+<td>", r"</td>"))
    attrs['name'] = parser.unescape(albatross.getEnclosedString(html, r'<th colspan="2">Current Information for ', r'</th>'))
    try:
        attrs['level'] = int(albatross.getEnclosedString(html, r"""<td><a href="//endoftheinter\.net/profile\.php\?user=""" + str(attrs['id']) + """\">""" + re.escape(xml.sax.saxutils.escape(attrs['name'])) + """</a> \(""", r'\)'))
    except ValueError:
        # User has a non-integral level.
        attrs['level'] = 0
    matchStatus = albatross.getEnclosedString(html, "<td>Status</td>\s+<td>", r"</td>")
    attrs['banned'] = False
    attrs['suspended'] = False
    if matchStatus:
        if matchStatus == '<b>Banned</b>':
            attrs['banned'] = True
        else:
            attrs['suspended'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(matchStatus, '<b>Suspended until:</b> ', ''), "%m/%d/%Y %I:%M:%S %p"))
    attrs['formerly'] = None
    nameChanged = albatross.getEnclosedString(html, "<td>Formerly</td>\s+<td>", "</td>")
    if nameChanged:
        attrs['formerly'] = nameChanged.split(", ")
    attrs['reputation'] = {}
    reputationText = albatross.getEnclosedString(html, r'<td>Reputation</td><td style="line-height:1.6em">', r'</td>')
    if reputationText:
        for repLine in reputationText.split("&bull; "):
            tagName = parser.unescape(albatross.getEnclosedString(repLine, r'">', r'</a>'))
            tagRep = int(re.sub('\([0-9\,]+\)', '', albatross.getEnclosedString(repLine, r': ', '').replace(' ', '')).replace(",", ""))
            attrs['reputation'][self.connection.tag(tagName)] = tagRep
    tokenText = albatross.getEnclosedString(html, '<td>Tokens</td>\s+<td>', '</td>')
    if not tokenText:
        tokenText = 0
    attrs['tokens'] = int(tokenText)
    attrs['goodTokens'] = int(albatross.getEnclosedString(html, '<td>(<a href="tokenlist\.php\?user=' + str(attrs['id']) + '&type=2">)?Good Tokens(</a>)?</td>\s+<td>', '</td>'))
    attrs['badTokens'] = int(albatross.getEnclosedString(html, '<td>(<a href="tokenlist\.php\?user=' + str(attrs['id']) + '&type=1">)?Bad Tokens(</a>)?</td>\s+<td>', '</td>'))
    attrs['created'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(html, '<td>Account Created</td>\s+<td>', '</td>'), "%m/%d/%Y"))
    attrs['active'] = bool(re.search('\(online now\)', albatross.getEnclosedString(html, '<td>Last Active</td>\s+<td>', '</td>')))
    attrs['lastActive'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(html, '<td>Last Active</td>\s+<td>', '( \(online now\))?</td>'), "%m/%d/%Y"))
    attrs['sig'] = True and albatross.getEnclosedString(html, '<td>Signature</td>\s+<td>', '</td>') or None
    attrs['quote'] = True and albatross.getEnclosedString(html, '<td>Quote</td>\s+<td>', '</td>') or None
    attrs['email'] = True and albatross.getEnclosedString(html, '<td>Email Address</td>\s+<td>', '</td>') or None
    attrs['im'] = True and albatross.getEnclosedString(html, '<td>Instant Messaging</td>\s+<td>', '</td>') or None
    attrs['picture'] = True and albatross.getEnclosedString(html, '<td>Picture</td>\s+<td>\s*<a target="_blank" imgsrc="http:', '" href') or None
    return attrs
def parse(self, html):
    """
    Parses a user's profile page.
    Returns a dict of attributes.
    """
    attrs = {}
    parser = HTMLParser()
    centralTime = pytz.timezone("America/Chicago")
    attrs['id'] = int(albatross.getEnclosedString(html, "<td>User ID</td>\s+<td>", r"</td>"))
    attrs['name'] = parser.unescape(albatross.getEnclosedString(html, r'<th colspan="2">Current Information for ', r'</th>'))
    try:
        attrs['level'] = int(albatross.getEnclosedString(html, r"""<td><a href="//endoftheinter\.net/profile\.php\?user=""" + str(attrs['id']) + """\">""" + re.escape(xml.sax.saxutils.escape(attrs['name'])) + """</a> \(""", r'\)'))
    except ValueError:
        # User has a non-integral level.
        attrs['level'] = 0
    matchStatus = albatross.getEnclosedString(html, "<td>Status</td>\s+<td>", r"</td>")
    attrs['banned'] = False
    attrs['suspended'] = False
    if matchStatus:
        if matchStatus == '<b>Banned</b>':
            attrs['banned'] = True
        else:
            attrs['suspended'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(matchStatus, '<b>Suspended until:</b> ', ''), "%m/%d/%Y %I:%M:%S %p"))
    attrs['formerly'] = None
    nameChanged = albatross.getEnclosedString(html, "<td>Formerly</td>\s+<td>", "</td>")
    if nameChanged:
        attrs['formerly'] = nameChanged.split(", ")
    attrs['reputation'] = {}
    reputationText = albatross.getEnclosedString(html, r'<td>Reputation</td><td style="line-height:1.6em">', r'</td>')
    if reputationText:
        for repLine in reputationText.split("&bull; "):
            tagName = parser.unescape(albatross.getEnclosedString(repLine, r'">', r'</a>'))
            tagRep = int(re.sub('\([0-9\,]+\)', '', albatross.getEnclosedString(repLine, r': ', '').replace(' ', '')).replace(",", ""))
            attrs['reputation'][self.connection.tag(tagName)] = tagRep
    tokenText = albatross.getEnclosedString(html, '<td>Tokens</td>\s+<td>', '</td>')
    if not tokenText:
        tokenText = 0
    attrs['tokens'] = int(tokenText)
    attrs['goodTokens'] = int(albatross.getEnclosedString(html, '<td>(<a href="tokenlist\.php\?user=' + str(attrs['id']) + '&type=2">)?Good Tokens(</a>)?</td>\s+<td>', '</td>'))
    attrs['badTokens'] = int(albatross.getEnclosedString(html, '<td>(<a href="tokenlist\.php\?user=' + str(attrs['id']) + '&type=1">)?Bad Tokens(</a>)?</td>\s+<td>', '</td>'))
    attrs['created'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(html, '<td>Account Created</td>\s+<td>', '</td>'), "%m/%d/%Y"))
    attrs['active'] = bool(re.search('\(online now\)', albatross.getEnclosedString(html, '<td>Last Active</td>\s+<td>', '</td>')))
    attrs['lastActive'] = centralTime.localize(datetime.datetime.strptime(albatross.getEnclosedString(html, '<td>Last Active</td>\s+<td>', '( \(online now\))?</td>'), "%m/%d/%Y"))
    attrs['sig'] = True and albatross.getEnclosedString(html, '<td>Signature</td>\s+<td>', '</td>') or None
    attrs['quote'] = True and albatross.getEnclosedString(html, '<td>Quote</td>\s+<td>', '</td>') or None
    attrs['email'] = True and albatross.getEnclosedString(html, '<td>Email Address</td>\s+<td>', '</td>') or None
    attrs['im'] = True and albatross.getEnclosedString(html, '<td>Instant Messaging</td>\s+<td>', '</td>') or None
    attrs['picture'] = True and albatross.getEnclosedString(html, '<td>Picture</td>\s+<td>\s*<a target="_blank" imgsrc="http:', '" href') or None
    return attrs
def search(self, query="", maxTime=None, maxID=None, activeSince=None, topics=None, recurse=False): """ Searches for topics using given parameters, and returns a list of dicts of returned topics. By default, recursively iterates through every page of search results. Upon failure returns False. """ if topics is None: self._topics = [] # if allowedTags or forbiddenTags is provided, it overrides this topiclist object's personal allowed or forbidden tags. if maxID is None: maxID = "" if activeSince is None: activeSince = pytz.timezone('America/Chicago').localize( datetime.datetime(1970, 1, 1)) else: # the topic listing only provides minute-level resolution, so remove seconds and microseconds from activeSince. activeSince = activeSince - datetime.timedelta( 0, activeSince.second, activeSince.microsecond) while not maxTime or maxTime > activeSince: # assemble the search query and request this search page's topic listing. requestArgs = {'q': str(query).encode('utf-8')} if maxTime is not None: if isinstance(maxTime, datetime.datetime): maxTime = calendar.timegm(maxTime.utctimetuple()) requestArgs['ts'] = str(maxTime).encode('utf-8') if maxID is not None: requestArgs['t'] = str(maxID).encode('utf-8') searchQuery = urllib.parse.urlencode(requestArgs) url = 'https://boards.endoftheinter.net/topics/' + self.formatTagQueryString( ) + '?' + searchQuery topicPageHTML = self.connection.page(url).html # split the topic listing string into a list so that one topic is in each element. topicListingHTML = albatross.getEnclosedString( topicPageHTML, '<th>Last Post</th></tr>', '</tr></table>', multiLine=True) if not topicListingHTML: # No topic listing table. this means there are no topics that matched the search. break topicListingHTML = topicListingHTML.split( '</tr>') if topicListingHTML else [] originalTopicsNum = len(self._topics) for topic in topicListingHTML: topicInfo = self.parse(topic) print( datetime.datetime.strftime(topicInfo['lastPostTime'], '%Y-%m-%d')) if topicInfo and topicInfo['lastPostTime'] >= activeSince: self._topics.append( self.connection.topic(topicInfo['id']).set(topicInfo)) if len(self._topics) == originalTopicsNum: # No matching topics; end our search. break if not recurse: break # we can't parallelize this, since we have no way of predicting the next ts and t parameters. DAMN YOU KEYSET PAGING maxTime = self._topics[-1].lastPostTime maxID = self._topics[-1].id self._topics = sorted(self._topics, key=lambda topic: topic.lastPostTime, reverse=True) return self
def testGetNonexistentEnclosedString(self):
    assert albatross.getEnclosedString(
        self.testString, r"<span>", "this ending doesn't exist") == False
    assert albatross.getEnclosedString(self.testString, r"This beginning doesn't exist", "</span>") == False
def parse(self, html):
    """
    Given the HTML of a topic page, returns a dict of attributes.
    """
    attrs = {}
    parser = HTMLParser()
    soup = bs4.BeautifulSoup(html)
    attrs['archived'] = bool(re.search(r'<h2><em>This topic has been archived\. No additional messages may be posted\.</em></h2>', html))
    subdomain = "archives" if attrs['archived'] else "boards"
    attrs['title'] = parser.unescape(albatross.getEnclosedString(html, r'\<h1\>', r'\<\/h1\>'))
    attrs['date'] = pytz.timezone('America/Chicago').localize(
        datetime.datetime.strptime(albatross.getEnclosedString(html, r'<b>Posted:</b> ', r' \| '), "%m/%d/%Y %I:%M:%S %p"))
    userID = int(albatross.getEnclosedString(html, r'<div class="message-top"><b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=', r'">'))
    username = parser.unescape(True and albatross.getEnclosedString(html, r'<div class="message-top"><b>From:</b> <a href="//endoftheinter\.net/profile\.php\?user=\d+">', r'</a>') or 'Human')
    attrs['user'] = self.connection.user(userID).set({'name': username})
    attrs['pages'] = int(albatross.getEnclosedString(html, r'">(First Page</a> \| )?(<a href)?(\S+)?(Previous Page</a> \| )?Page \d+ of <span>', r'</span>'))
    attrs['closed'] = attrs['archived']
    tagNames = [
        urllib.parse.unquote(albatross.getEnclosedString(tagEntry, '<a href="/topics/', r'">'))
        for tagEntry in albatross.getEnclosedString(html, r"<h2><div", r"</div></h2>").split(r"</a>")[:-1]
        if not tagEntry.startswith(' <span')
    ]
    # we need to process tag names
    # e.g. remove enclosing square braces and decode html entities.
    cleanedTagNames = []
    for tagName in tagNames:
        if tagName.startswith("[") and tagName.endswith("]"):
            tagName = tagName[1:-1]
        cleanedTagNames.append(parser.unescape(tagName.replace("_", " ")))
    attrs['tags'] = self.connection.tags(tags=cleanedTagNames)
    lastPage = self.connection.page('https://' + subdomain + '.endoftheinter.net/showmessages.php?topic=' + str(self.id) + '&page=' + str(attrs['pages']))
    if lastPage.authed:
        lastPagePosts = self.getPagePosts(lastPage.html)
        lastPost = self.connection.post(1, self)
        lastPost = lastPost.set(lastPost.parse(lastPagePosts[-1]))
        attrs['lastPostTime'] = lastPost.date
    csrfTag = soup.find("input", {"name": "h"})
    if csrfTag:
        attrs['csrfKey'] = csrfTag.get('value')
    return attrs
def scrape_imagemaps(self):
    '''
    Processes the imagemap scraping queue.
    '''
    if (datetime.datetime.now(tz=pytz.utc) - self.info['last_run_time']) < datetime.timedelta(seconds=10):
        return
    self.info['last_run_time'] = datetime.datetime.now(tz=pytz.utc)
    self.daemon.log.info("Processing imagemap queue.")
    scrape_requests = self.dbs['imagemap'].table('scrape_requests').fields(
        'scrape_requests.user_id', 'scrape_requests.date', 'scrape_requests.password',
        'scrape_requests.private', 'scrape_requests.permanent', 'scrape_requests.max_pages',
        'users.name').join('users ON users.id=scrape_requests.user_id').where(
        'password IS NOT NULL', progress=0).order('date ASC').list()
    for request in scrape_requests:
        # process scrape request.
        self.daemon.log.info("Processing usermap ID " + str(request['user_id']) + ".")
        self.dbs['imagemap'].table('scrape_requests').set(progress=1).where(user_id=request['user_id']).update()
        # attempt to use a cookie string for this user, if one is provided.
        try:
            if request['user_id'] not in self.info['cookie_strings']:
                eti = albatross.Connection(username=request['name'], password=request['password'], loginSite=albatross.SITE_MOBILE)
            else:
                try:
                    eti = albatross.Connection(cookieString=self.info['cookie_strings'][request['user_id']], loginSite=albatross.SITE_MOBILE)
                except albatross.UnauthorizedError:
                    # cookie string is expired. try to login to grab a new one.
                    del self.info['cookie_strings'][request['user_id']]
                    eti = albatross.Connection(username=request['name'], password=request['password'], loginSite=albatross.SITE_MOBILE)
        except albatross.UnauthorizedError:
            # incorrect password, or ETI is down.
            self.daemon.log.info("Incorrect password or ETI down for usermap ID " + str(request['user_id']) + ". Skipping.")
            self.dbs['imagemap'].table('scrape_requests').set(password=None, progress=-1).where(user_id=request['user_id']).update()
            continue
        # store the latest cookie string for this user.
        self.info['cookie_strings'][request['user_id']] = eti.cookieString
        # get this user's currently-uploaded image hashes.
        user_hashes = self.dbs['imagemap'].table('images').fields('hash').where(user_id=request['user_id']).list(valField='hash')
        user_hashes = {image_hash: 1 for image_hash in user_hashes}
        base_datetime = datetime.datetime.now(tz=pytz.utc)
        images_to_add = []
        params = {
            'images': images_to_add,
            'hashes': user_hashes,
            'user_id': request['user_id'],
            'base_datetime': base_datetime,
            'page_num': 1,
            'private': request['private']
        }
        start_page_num = 1
        if request['max_pages'] is None:
            # fetch imagemap's first page to get number of pages.
            imap_first_page_html = eti.page('https://images.endoftheinter.net/imagemap.php').html
            imap_first_page = bs4.BeautifulSoup(imap_first_page_html)
            infobar = imap_first_page.find('div', {'class': 'infobar'})
            last_page_link = infobar.find_all('a')[-1]
            last_page_num = int(albatross.getEnclosedString(last_page_link.attrs['href'], 'page=', ''))
            # process the first imagemap page that we've already gotten.
            start_page_num = 2
            self.process_imagemap_page(imap_first_page_html, 'https://images.endoftheinter.net/imagemap.php?page=1', None, params)
            if not params['images']:
                # usermap is unchanged. break.
                self.daemon.log.info('First imagemap page is unchanged for userID ' + str(request['user_id']) + '. Skipping.')
                self.dbs['imagemap'].table('scrape_requests').set(progress=0).where(user_id=request['user_id']).update()
                continue
        else:
            last_page_num = int(request['max_pages'])
        # now loop over all the other pages (if there are any).
        # if this is the user's first scrape, do this in parallel.
        # otherwise do this in serial so we can break.
        if not user_hashes:
            self.daemon.log.info('Fetching imagemap in parallel.')
            self.scrape_map_parallel(eti, start_page_num, last_page_num, params)
        else:
            self.daemon.log.info('Fetching imagemap in serial.')
            self.scrape_map_serial(eti, start_page_num, last_page_num, params)
        # add images to the database.
        if images_to_add:
            self.dbs['imagemap'].table('images').fields('server', 'hash', 'filename', 'type', 'user_id', 'created', 'hits', 'tags', 'private').values(images_to_add).onDuplicateKeyUpdate('hash=hash').insert()
            self.dbs['imagemap'].table('users').set('image_count=image_count+' + str(len(images_to_add))).where(id=request['user_id']).update()
        # set progress to finished.
        if request['permanent'] > 0:
            # this is a permanent scrape request. insert this back into the queue with the current time.
            current_time = datetime.datetime.now(tz=pytz.utc).strftime('%Y-%m-%d %H:%M:%S')
            self.dbs['imagemap'].table('scrape_requests').set(progress=0, date=current_time).where(user_id=request['user_id']).update()
        else:
            self.dbs['imagemap'].table('scrape_requests').set(password=None, progress=100).where(user_id=request['user_id']).update()
        self.daemon.log.info("Inserted " + str(len(images_to_add)) + " images for userID " + str(request['user_id']) + ".")
def testGetNormalEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r'">', r'<a') == "This is a test"
    assert albatross.getEnclosedString(self.testString, r"""<a href="test.php\?id=[0-9]+&topic=[0-9]+">""", r"</a>") == u"""This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed"""
def testGetGreedyEnclosedString(self):
    assert albatross.getEnclosedString(self.testString, r"""<a href="test.php\?id=[0-9]+&topic=[0-9]+">""", r"</a>", greedy=True) == u"""This is a test<a href="test.php?id=62&topic=13"> string for getEnclosed</a>String"""
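# A minimal sketch of getEnclosedString consistent with the tests above. This is an
# assumption about its behavior, not albatross's actual implementation: startString and
# endString are treated as regexes, an empty endString captures to the end of the line
# (or of the string when multiLine is set), and False is returned when nothing matches.
import re

def getEnclosedString(text, startString="", endString="", multiLine=False, greedy=False):
    # DOTALL lets the captured group span newlines when multiLine is requested.
    flags = re.DOTALL if multiLine else 0
    middle = "(.*)" if greedy else "(.*?)"
    if endString:
        pattern = startString + middle + endString
    else:
        # No end delimiter: capture everything after startString.
        pattern = startString + "(.*)"
    match = re.search(pattern, text, flags)
    return match.group(1) if match else False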
def search(self, query="", maxID=None, activeSince=None, createdSince=None, startPageNum=None, endPageNum=None, recurse=False): """ Searches for users using given parameters, and returns the current user listing object Performs operation in parallel. """ self._users = [] self._userIDs = {} maxID = float("inf") if maxID is None else int(maxID) activeSince = pytz.timezone('America/Chicago').localize( datetime.datetime(1970, 1, 1)) if activeSince is None else activeSince createdSince = pytz.timezone('America/Chicago').localize( datetime.datetime(1970, 1, 1)) if createdSince is None else createdSince startPageNum = 1 if startPageNum is None else int(startPageNum) paramArray = { 'maxID': maxID, 'activeSince': activeSince, 'createdSince': createdSince } if endPageNum is None or not recurse: # fetch first page to grab number of pages, and grab users while we're at it. userListParams = urllib.parse.urlencode([('user', str(query)), ('page', str(startPageNum))]) firstUrl = 'https://endoftheinter.net/userlist.php?' + userListParams firstUserPage = self.connection.page(firstUrl) self.appendUsers(firstUserPage.html, firstUrl, None, paramArray) endPageNum = int( albatross.getEnclosedString( firstUserPage.html, r'Page ' + str(startPageNum) + r' of <span>', r'</span>')) # increment start page num. startPageNum += 1 else: endPageNum = int(endPageNum) if not recurse: return self # now loop over all the other pages (if there are any) for pageNum in range(startPageNum, endPageNum + 1): userListParams = urllib.parse.urlencode([('user', str(query)), ('page', str(pageNum))]) self.connection.parallelCurl.startrequest( 'https://endoftheinter.net/userlist.php?' + userListParams, self.appendUsers, paramArray) self.connection.parallelCurl.finishallrequests() self._users = sorted(self._users, key=lambda userObject: userObject.id) return self