Example #1
    def _parseComment(self, communityId, liveInfoFilePath, commentFilePath):
        chatList = []
        if not (os.path.exists(liveInfoFilePath) and os.path.exists(commentFilePath)):
            return chatList

        infoParser = BeautifulSoup(open(liveInfoFilePath, u'r'))
        if infoParser.find(u'communityid').renderContents() != communityId:
            return chatList

        commentParser = BeautifulSoup(open(commentFilePath, u'r'))
        chatTagList = commentParser.findAll(u'chat', attrs={u'msgkind': u'message_msg'})
        # communityId and liveId do not change per chat tag, so decode them once
        communityId = communityId.decode(u'utf-8')
        liveId = infoParser.find(u'liveid').renderContents().decode(u'utf-8')
        for chatTag in chatTagList:
            userId = chatTag.get(u'user').decode(u'utf-8')
            name = chatTag.get(u'nickname').decode(u'utf-8')
            message = chatTag.renderContents().decode(u'utf-8')
            # a missing mail attribute would make .decode() blow up, so guard on truthiness
            option = chatTag.get(u'mail').decode(u'utf-8') if chatTag.get(u'mail') else None
            # BeautifulSoup 3 attribute values are already unicode, so no extra decode here
            date = re.sub(
                ur'(\d{4})/(\d{1,2})/(\d{1,2})\s(\d{1,2}):(\d{1,2}):(\d{1,2})',
                lambda match: u'{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}'.format(*map(int, match.groups())),
                chatTag.get(u'date')
            )
            chatList.append((communityId, liveId, userId, name, message, option, date))

        return chatList
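The date normalization in this example stands alone; a minimal sketch of the same rewrite (stdlib `re` only; the input value is invented for illustration):

import re

raw = u'2011/5/3 9:4:1'  # hypothetical input in the comment log's date format
print re.sub(
    ur'(\d{4})/(\d{1,2})/(\d{1,2})\s(\d{1,2}):(\d{1,2}):(\d{1,2})',
    lambda m: u'{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}'.format(*map(int, m.groups())),
    raw)  # prints 2011-05-03 09:04:01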
Example #2
    def firstPass(self, page):
        soup = BeautifulSoup(page)

        loginform = soup.find('form')

        action = loginform['action']

        urls = urlparse(action)
        self.urlBase = "https://" + urls.netloc
        logging.info("Base URL = " + self.urlBase)

        inputs = loginform.findAllNext('input')

        values = {}

        values['userid'] = self.filledCreds['03']  #username

        # build the body content
        data = urllib.urlencode(values)

        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 2
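ByteToHex and HexToByte are helpers of this scraper that are not shown in these excerpts; a plausible stdlib-based equivalent (an assumption, for illustration only):

import binascii

def ByteToHex(data):
    # assumed behaviour: hex-encode a byte string for transport in the response dict
    return binascii.hexlify(data)

def HexToByte(data):
    # assumed behaviour: the inverse decode
    return binascii.unhexlify(data)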
Example #3
    def doStep2(self, allofit, page):

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("natwest-username.html", page)

        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        if loginform == None:
            logging.debug('NatWest no login form')
            return 'bank error'

        action = self.urlBase + '/' + loginform['action']

        values = self.parseForm(loginform)

        # fill in our credentials
        values["ctl00$mainContent$LI5TABA$DBID_edit"] = self.filledCreds[
            '01']  #customer number

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 3

        return 'good'
Example #4
    def DoStep3(self, allofit):

        scrape_result = "good"

        page = self.HexToByte(allofit['body'])

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("fd-summary.html", page)

        accountTable = soup.find('table', attrs={'class': 'fdBalancesTable'})

        if accountTable != None:
            self.accountLinks = accountTable.findAll(
                'a', attrs={'class': 'fdActionLink'})

            if len(self.accountLinks) == 0:
                #got some kind of message
                scrape_result = 'bank error'
                logging.info('Still got no accounts')
        else:
            logging.debug("No fd table")
            scrape_result = 'credentials incorrect'

        return scrape_result
Example #5
    def doStep7(self, page):

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("natwest-acclink.html", page)

        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        if loginform == None:
            logging.debug('NatWest no view account form')
            return 'bank error'

        action = self.urlBase + '/' + loginform['action']

        values = self.parseForm(loginform)

        # fill in our selection - 1 month
        values['ctl00$mainContent$SS2SPDDA'] = 'M1'

        # default button - needed
        values['ctl00$mainContent$NextButton_button'] = 'View Transactions'

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 20

        return 'good'
Example #6
    def doStep1(self, allofit, page):

        body = page

        scrape_result = 'good'
        logging.info("NatWest page1")

        # the following is how you could retrieve the headers from the request
        # for head in allofit['headers']:
        #     name = self.HexToByte(head['name'])
        #     val = self.HexToByte(head['value'])

        # write out the start page
        self.output_page("1_first_page.html", body)

        soup = BeautifulSoup(body)

        frame = soup.find('frame', id='ctl00_secframe')

        if frame != None:

            action = self.urlBase + '/' + frame['src']

            #<frame id="ctl00_secframe" title="Security Frame" frameborder="0" src="login.aspx?refererident=774C53DCE4C17556595C91973A6DF1A0A1F6242E&amp;cookieid=100714"></frame>

            self.response = {}
            self.response['url'] = self.ByteToHex(action)
            self.response['data'] = ""
            self.response['method'] = 'GET'
            self.response['step'] = 2
        else:
            logging.debug('NatWest frame link error - ')
            scrape_result = 'bank error'

        return scrape_result
Example #7
    def doAllLink(self, page):

        soup = BeautifulSoup(page)

        self.output_page("natwest-xactlist-all-look.html", page)

        #<a href="/StatementsFixedPeriod.aspx?id=B7879D8CABBF283B38AE447E07A4EA5D8DA9A859&amp;persist=%2fwEPBQ1BY2NvdW50TnVtYmVyBQg4ODU3MjIxOA%3d%3d%7c%2fwEPBQhGcm9tRGF0ZQUTMTgvMDUvMjAxMSAwMDowMDowMA%3d%3d%7c%2fwEPBQhTb3J0Q29kZQUGNjAxNzIx%7c%2fwEPBQZUb0RhdGUFEzE4LzA3LzIwMTEgMDA6MDA6MDA%3d%7c%2fwEWBh4JU1MyQUNDRERBDwUUU0lOSEEgTS9TVFUgODg1NzIyMTgFKEI3ODc5RDhDQUJCRjI4M0IzOEFFNDQ3RTA3QTRFQTVEOERBOUE4NTkeCFNTMlNQRERBDxBkZBYBAgNnHgZTUzJXTEEPAgFo&amp;showall=1" title="Show all items on a single page">All</a>

        logging.debug('NatWest checking for all links')

        # find any all link
        links = soup.findAll('a')

        link = None
        for a in links:
            # detect our link
            try:
                if re.search(".tatements.ixed.eriod", a['href']):
                    logging.debug("natwest - got a statement  link")
                    if re.search(".ll", a.text):  # the one that says all
                        link = self.composeLink(a['href'][:])
                        logging.debug("natwest - got an All statement link")
                        break  # only need the first one so break the for loop
            except Exception, e:
                logging.debug('NatWest a link error missing href - ' + str(e))
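The dotted pattern .tatements.ixed.eriod is a crude way to match StatementsFixedPeriod regardless of letter case; re.IGNORECASE states the same intent directly (the sample href is invented):

import re

href = 'StatementsFixedPeriod.aspx?id=ABC123'  # hypothetical link target
print bool(re.search('statementsfixedperiod', href, re.IGNORECASE))  # True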
Example #8
 def __init__(self):
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL +
                              unquote_plus(params['showUrl']))
     showIndex = BeautifulSoup(source)
     vidInfo = {
         'tvshowtitle':
         showIndex.find('div',
                        id='showDashboard').find('span', {
                            'class': 'blueText'
                        }).string,
         'studio':
         'FOX'
     }
     seasonsListing = showIndex.findAll(
         'div', {
             'class': re.compile('dashPageHolder'),
             'id': re.compile('^fullEp')
         })
     print len(seasonsListing)
     for season in seasonsListing:
         episodesListing = season.findAll('div',
                                          {'class': 'episodeListing'})
         for episode in episodesListing:
             listitem = xbmcgui.ListItem(
                 episode.find('h3').find('a').string)
             listitem.setThumbnailImage(
                 episode.find('img', id=re.compile('^epThumb'))['src'])
             episodeLink = episode.find('a', {'class': 'thumbnailLink'})
             if episodeLink['href'][0] == '/':
                 episodeUrl = episodeLink['href'][1:]
             else:
                 episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
             airedDateAndPlot = re.search(
                 'Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',
                 str(episode.find('div', {'class': 'episodeInfo'})))
             seasonNum = re.search(
                 'Season\s+([0-9]+?)[\s:]',
                 str(episode.find('p', {'class': 'seasonNum'})))
             episodeNumAndDuration = re.search(
                 'Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',
                 str(episode.find('p', {'class': 'episodeNumLine'})))
             vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),
                                              airedDateAndPlot.group(1),
                                              airedDateAndPlot.group(2))
             vidInfo['season'] = int(seasonNum.group(1))
             vidInfo['episode'] = int(episodeNumAndDuration.group(1))
             vidInfo['duration'] = episodeNumAndDuration.group(2)
             vidInfo['title'] = episode.find('h3').find('a').string
             vidInfo['plot'] = decode_htmlentities(
                 airedDateAndPlot.group(4))
             print vidInfo
             listitem.setInfo("video", vidInfo)
             xbmcplugin.addDirectoryItem(
                 handle=int(sys.argv[1]),
                 listitem=listitem,
                 url="%s?episodeUrl=%s" %
                 (sys.argv[0], quote_plus(episodeUrl)))
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
Example #9
 def _get_soup(self, html):
     try:
         return BeautifulSoup(html)
     except:
         logging.warning(
             "BeautifulSoup exception cleaning up html, can't cache images offline"
         )
         return None
Example #10
    def rewrite_html(self, guid, html=None, ajax_url=None):
        """if we are not using ajax, then html is IGNORED and we go by the
		cached copy.  html is sometimes used to see if there should be a
		cached copy at all, or if something goes wrong and we just need to
		return unaltered html
		"""

        guid = str(guid)
        cache_dir = os.path.join(self._store_location, guid_hash(guid))
        mapping_file = os.path.join(cache_dir, guid + "-" + "mapping.pickle")

        if not os.path.isfile(mapping_file):
            # quick and dirty check: are there images?  if not, plain
            # html is fine (guard against html being None, its default)
            if html and html.lower().find('<img') >= 0:
                #logging.warning("Should be downloaded images, but couldn't open mapping.  Recaching")
                self.cache_html(guid, html)
            return html

        try:
            mapping = open(mapping_file, 'r')
            rewrite_hash = pickle.load(mapping)
            non_ajax_html = pickle.load(mapping)
            mapping.close()
        except:
            logging.error("error opening cache pickle for guid %s %s" %
                          (guid, mapping_file))
            logging.error(
                "If you have upgraded penguintv, you might need to delete your image cache"
            )
            return html

        if ajax_url is None:
            return non_ajax_html

        #else, rewrite on the fly
        soup = BeautifulSoup(html)
        img_tags = soup.findAll('img')

        if len(img_tags) == 0:
            return html

        for result in img_tags:
            # believe it or not, some img tags don't have a src, they have an id
            # that points to CSS.  At least I think that's what's going on
            if result.has_key('src'):
                if rewrite_hash.has_key(result['src']):
                    if rewrite_hash[result['src']][1] == UrlCacher.DOWNLOADED:
                        #if os.path.isfile(os.path.join(self._store_location, rewrite_hash[result['src']][0])):
                        result['src'] = ajax_url + "/cache/" + rewrite_hash[
                            result['src']][0]
                        #else:
                        #	logging.warning("file not found, not replacing")
                        #	logging.debug("(should we attempt to recache here?")

        return soup.prettify()
Example #11
    def _parseNatWestLinks(self, raw):
        soup = BeautifulSoup(raw)
        accountBLock = soup.findAll('a', attrs={'class': 'accountNameExpand'})

        # no account links at all means something went wrong
        if len(accountBLock) == 0:
            logging.warning('NatWest no accounts after continue form')
            return 'account problem'

        for ac_link in accountBLock:
            ac_link.string = ac_link.text
            self.accountLinks.append(ac_link)

            # now the account number list - we need the pair data here because we can't get it from the link
            row = ac_link.parent.parent
            try:
                # find the account number span
                acnumSpan = row.find('span', attrs={'class': 'AccountNumber'})
                acnum = acnumSpan.text
                acnum = acnum.replace(' ', '')

                # find the sort code span
                sortSpan = row.find('span', attrs={'class': 'SortCode'})
                sortc = sortSpan.text
                sortc = sortc.replace(' ', '')
                sortc = sortc.replace('-', '')
            except Exception, e:
                logging.exception('NatWest form error - ' + str(e))
                return 'bank error'

            # combine the two to form our matching number
            num = sortc + "-" + acnum

            actype = 'Cheque'
            # might be a credit card
            if len(acnum) > 14:
                actype = 'Credit'

            # now get balances...
            balance = 0
            baltr = ac_link.parent.parent
            baltds = baltr.findAll('td')
            if len(baltds) > 2:
                baltext = self.tidy_text(baltds[3].text)
                balance = self.normalise_ammount(baltext)

            # and add it to our account list
            acpair = {
                'name': ac_link.text,
                'num': num,
                'type': actype,
                'bal': balance
            }

            self.myAccounts.append(acpair)
Example #12
    def doStep3(self, allofit, page):

        scrape_result = "good"

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        # write out the start page
        self.output_page("natwest-security.html", page)

        logging.info("NatWest security page2")

        # check whether we got bounced back: if the password input
        # ctl00$mainContent$LI5TABA$DBID_edit is still present, we didn't move on
        errorDiv = soup.findAll(
            'input', attrs={'name': 'ctl00$mainContent$LI5TABA$DBID_edit'})

        if len(errorDiv) != 0:
            logging.info("NatWest security page1 still - customer number bad")
            return 'credentials incorrect'  # if we get here then the form was found hence creds must be wrong

        # find our form
        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        if loginform == None:
            logging.debug('NatWest no security form')
            return 'bank error'

        values = self.parseForm(loginform)

        # define some variables that would otherwise only exist inside a try/except block's scope
        # the label text split on spaces
        which1arr = ""
        which2arr = ""
        which3arr = ""

        # the challenges
        firstDigit = ""
        secondDigit = ""
        thirdDigit = ""

        #>>>>>>> The first set of Pin fields
        #-------------------- get the questions --------------#

        #<label for="ctl00_mainContent_Tab1_LI6PPEA_edit" id="ctl00_mainContent_Tab1_LI6DDALALabel" class="wizCrl wizardLabelRememberMeWide">Enter the 2nd number</label>
        useNewTab = False
        try:
            which1 = soup.find('label',
                               attrs={
                                   'for': 'ctl00_mainContent_LI6PPEA_edit'
                               }).text
        except Exception, e:
            useNewTab = True
Example #13
    def __makerequest(self, cmd, **kwargs):
        kwargs["cmd"] = cmd
        if self._token:
            kwargs["token"] = self._token

        try:
            encoded = dict(
                (k, v.encode('utf-8') if isinstance(v, basestring) else v)
                for k, v in kwargs.items())
            response = BeautifulSoup(
                self._opener.open(self._url +
                                  urllib.urlencode(encoded))).response
        except urllib2.URLError, e:
            raise FogBugzConnectionError(e)
Example #14
    def read_timetable_file(self, route_filename, weekday, direction):
        # Example of how the timetables can be read and returned as a Map

        from BeautifulSoup.BeautifulSoup import BeautifulSoup
        import urllib2, re, time

        filestr = ('data/timetables/%s_%s_%s.html' %
                   (route_filename, weekday, direction))
        fil = open(filestr, "r")
        soup = BeautifulSoup(fil.read(), fromEncoding='utf8')
        fil.close()

        divs = soup.html.body.findAll('div')
        children = divs[0].contents

        #timetable
        tt = children[1].contents[3].contents[3].contents[3].contents[
            1].contents[2]

        route_list = []
        route_times_list = []
        # stop names values
        for (j, name) in enumerate(tt.contents[0].contents[4].contents):

            route_times_list = []
            route_name = name.contents[1].find('a').contents[0]
            print route_name
            # am / pm values; use a distinct loop variable so the outer 'name' is not shadowed
            for (i, prefix_tag) in enumerate(
                    tt.contents[0].contents[3].contents[2].contents):
                time_value = tt.contents[0].contents[5].contents[
                    j + 1].contents[i].contents[0].text
                if time_value == '-':
                    print time_value
                    continue
                time_prefix = prefix_tag.text
                # values minus the first
                time_str = time_value + ' ' + time_prefix
                try:
                    time_result = time.strftime(
                        '%H:%M:%S', time.strptime(time_str, '%I:%M %p'))
                    route_times_list.append(time_result)
                    print time_result
                except:
                    print "ERR", time_str

            route_list.append((route_name, route_times_list))

        return route_list
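The %I:%M %p round-trip above converts 12-hour clock readings to 24-hour strings; a standalone check (the sample times are invented):

import time

for time_str in ('5:30 pm', '12:05 am'):
    print time.strftime('%H:%M:%S', time.strptime(time_str, '%I:%M %p'))
# prints 17:30:00 then 00:05:00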
Example #15
 def __init__(self):
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL +
                              unquote_plus(params['showUrl']))
     print 'Loading ' + self.BASE_URL + unquote_plus(params['showUrl'])
     seasonIndex = BeautifulSoup(source)
     tvshowcontainer = seasonIndex.find(
         'div',
         id=re.compile(
             'scet_header|scet_top|show-header|show-header-scet|^header$'))
     if tvshowcontainer == None:
         tvshowcontainer = seasonIndex.find(
             'div', {
                 'class':
                 re.compile(
                     'scet_header|scet_top|show-header|show-header-scet')
             })
     if tvshowcontainer != None:
         tvshowtitle = tvshowcontainer.find('h1').string
     else:
         tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
     print 'Parsing seasons for "%s"' % tvshowtitle
     showsListing = seasonIndex.find('div', {
         "class": re.compile('scet-gallery-nav')
     }).find(
         'h3',
         text='Full Episodes').parent.findNextSibling('ul').findAll('li')
     for show in showsListing:
         showLink = show.find('a')
         print 'Found ' + showLink.string
         listitem = xbmcgui.ListItem(decode_htmlentities(showLink.string))
         listitem.setInfo('video', {'tvshowtitle': tvshowtitle})
         #listitem.setThumbnailImage(showLink.find('img')['src'])
         if showLink['href'][0] == '/':
             showUrl = showLink['href'][1:]
         else:
             showUrl = showLink['href'].replace(self.BASE_URL, '')
         xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                     listitem=listitem,
                                     url="%s?seasonUrl=%s" % (
                                         sys.argv[0],
                                         quote_plus(showUrl),
                                     ),
                                     totalItems=len(showsListing),
                                     isFolder=True)
     xbmcplugin.setContent(handle=int(sys.argv[1]), content='seasons')
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
Example #16
    def DoStep2(self, allofit):

        page = self.HexToByte(allofit['body'])

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("fd-username.html", page)

        loginform = soup.find('form')

        action = loginform['action']

        inputs = loginform.findAllNext('input')

        values = {}

        self.response = {}

        # build the post values up - there aren't any others, as far as we know

        ps = loginform.findAllNext('p')

        numbers = ps[1].findAllNext('strong')

        # if there aren't enough lookup digits, the indexing below will raise
        try:
            password = self.lookupdigit(numbers[0].text) + self.lookupdigit(
                numbers[1].text) + self.lookupdigit(numbers[2].text)
        except:
            logging.debug("credentials incorrect")
            return 'credentials incorrect'

        answer = self.filledCreds['06']

        values['password'] = password
        values['memorableAnswer'] = answer

        # build the body content
        data = urllib.urlencode(values)

        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 3

        return 'good'
Example #17
    def __init__(self, prontuario_html):
        '''
        Constructor
        '''

        self.__soup = \
            BeautifulSoup(prontuario_html, fromEncoding='iso-8859-1',
                          convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.__prontuario = {}

        self._parsear_dados_veiculo()
        self._parsear_debitos()
        self._parsear_infracoes_em_autuacao()
        self._parsear_listagem_multas()
        self._parsear_historico_multas()
        self._parsear_ultimo_processo()
        self._parsear_recurso_infracao()
Example #18
 def __init__(self):
     source = self._fetch_url(self.BASE_FOD_URL)
     fodIndex = BeautifulSoup(source)
     showsListing = fodIndex.find('div',
                                  id='episodes-listing').findAll('li')
     for show in showsListing:
         showLink = show.find('a')
         if showLink['href'][0] == '/':
             showUrl = showLink['href'][1:]
         else:
             showUrl = showLink['href'].replace(self.BASE_URL, '')
         xbmcplugin.addDirectoryItem(
             handle=int(sys.argv[1]),
             listitem=xbmcgui.ListItem(showLink.string),
             url="%s?showUrl=%s" % (sys.argv[0], quote_plus(showUrl)),
             totalItems=len(showsListing),
             isFolder=True)
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
Example #19
    def __init__(self, url, token=None):
        self.__handlerCache = {}
        if not url.endswith('/'):
            url += '/'

        if token:
            self._token = token.encode('utf-8')
        else:
            self._token = None

        self._opener = urllib2.build_opener()
        try:
            soup = BeautifulSoup(self._opener.open(url + 'api.xml'))
        except URLError:
            # self._url is not set yet at this point, so report the url argument
            raise FogBugzConnectionError(
                "Library could not connect to the FogBugz API.  Either this installation of FogBugz does not support the API, or the url, %s, is incorrect."
                % (url, ))
        self._url = url + soup.response.url.string
        self.currentFilter = None
Example #20
    def _loadComment(self, communityId, userSettingFilePath, commentLogFolder):
        nameDict = self._loadUserSetting(communityId, userSettingFilePath)
        commentLogFileList = filter(lambda fname: re.match(ur'ncvLog_lv\d+-{0}\.xml$'.format(communityId), fname), os.listdir(commentLogFolder))

        chatList = []
        for commentFile in commentLogFileList:
            parser = BeautifulSoup(open(os.path.join(commentLogFolder, commentFile), u'r'))
            liveId = u'lv' + parser.find(u'livenum').renderContents().decode(u'utf-8')
            chatTagList = parser.find(u'livecommentdataarray').findAll(u'chat', recursive=False)
            for chatTag in chatTagList:
                userId = chatTag.get(u'user_id')
                # skip entries with a missing or empty user_id
                if not userId:
                    continue

                name = nameDict.get(userId)
                message = chatTag.renderContents().decode(u'utf-8')
                option = chatTag.get(u'mail')
                unixtime = time.localtime(int(chatTag.get(u'date')))
                # only decode when there is a timestamp; None has no .decode()
                date = datetime.datetime(*unixtime[:-3]).strftime(u'%Y-%m-%d %H:%M:%S').decode(u'utf-8') if unixtime else None
                chatList.append((communityId, liveId, userId, name, message, option, date))

        return chatList
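The date conversion above turns an epoch-seconds attribute into a local timestamp string; the same steps in isolation (the epoch value is invented):

import time, datetime

unixtime = time.localtime(1300000000)  # hypothetical epoch-seconds value
print datetime.datetime(*unixtime[:-3]).strftime('%Y-%m-%d %H:%M:%S')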
Example #21
    def doStep12(self, page):

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        self.output_page("natwest-xactlist-cc-poss.html", page)

        rightButtons = soup.findAll('a', attrs={'class': 'link-button-right'})

        # any buttons?
        if len(rightButtons) == 0:
            logging.error('NatWest no cc accountbuttons')
            return 'bank error'

        # NatWest is not dynamic, so this static list is fine (unlike Smile)
        acLink = None
        for a in rightButtons:
            # filter out the account detail buttons, matching just the statement buttons;
            # we rely on this regex finding the links in the right order
            if re.search(".ard.tatement.etail", a['href']):
                acLink = a['href'][:]

        if acLink == None:
            logging.debug('NatWest no cc detail link')
            return 'bank error'

        # action = self.urlBase + '/' + loginform['action']

        action = acLink
        try:
            logging.debug("checking link - " + acLink)
            urls = urlparse(acLink)

            # if it parses properly good else

        except Exception, e:
            logging.error('NatWest cc link error - ' + str(e))
            action = self.urlBase + '/' + acLink
Example #22
 def __init__(self):
     print 'Fetching %s' % self.INDEX_URL
     source = self._fetch_url(self.INDEX_URL)
     fodIndex = BeautifulSoup(source)
     showsListing = fodIndex.find('div', {
         "class": re.compile('group-full-eps')
     }).findAll('li')
     print 'Parsed listing and found %d shows' % len(showsListing)
     for show in showsListing:
         showLink = show.find('a')
         listitem = xbmcgui.ListItem(decode_htmlentities(showLink['title']))
         episodeCount = show.find('div',
                                  text=re.compile('^[0-9]+ Videos?$'))
         if episodeCount:
             episodeCount = int(
                 re.search('^([0-9]+)\s*Videos?$',
                           episodeCount.string).group(1))
             print 'Found "%s" with %d episodes' % (decode_htmlentities(
                 showLink['title']), episodeCount)
             listitem.setInfo('video', {'episode': episodeCount})
         else:
             print 'Found "%s" but did not find how many episodes' % decode_htmlentities(
                 showLink['title'])
         listitem.setThumbnailImage(showLink.find('img')['src'])
         if showLink['href'][0] == '/':
             showUrl = showLink['href'][1:]
         else:
             showUrl = showLink['href'].replace(self.BASE_URL, '')
         xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                     listitem=listitem,
                                     url="%s?showUrl=%s" %
                                     (sys.argv[0], quote_plus(showUrl)),
                                     totalItems=len(showsListing),
                                     isFolder=True)
     xbmcplugin.setContent(handle=int(sys.argv[1]), content='tvshows')
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
Example #23
    def processAccount(self, acCount, acName, account_path, allofit):

        page = self.HexToByte(allofit['body'])

        # save this page
        self.output_page("account" + str(acCount) + ".html", page)

        soup = BeautifulSoup(page)

        logging.debug('ac path - ' + str(account_path) + ' - end')

        if account_path != "":
            # delete existing current xactions

            logging.debug('Processing :) ')

            self.statementbuilder = StatementBuilder(self.facade, account_path,
                                                     self.token)

            # need to get last statement and make a new one every time
            self.statementbuilder.make_recent_dif_statement(
                'Fd-recent', 'Scraper', None)  #TODO change this

            isVisa = False
            # the sort-by-reference control only appears on Visa statements
            visaSortInput = soup.find(
                'input', attrs={'name': 'cmd_sort_referenceAscending'})
            if visaSortInput != None:
                isVisa = True

                bal_tables = soup.findAll(
                    'table', attrs={'class': 'fdTableBackgroundOne'})
                balance_table = bal_tables[2]

                if balance_table != None:
                    vals = balance_table.findAll('td')

                    if vals:
                        bal = vals[1].text
                        data = bal.replace('&#163;', u'£')
                        data = data.strip(u'£')
                        if data[-1] == 'D':
                            data = data.replace('DB', '')
                            data = data.replace('D', '')
                            lastbal = int(float(data) * 100)
                            firstbal = 0 - lastbal
                        else:
                            data = data.replace('CR', '')
                            data = data.replace('C', '')
                            firstbal = int(float(data) * 100)

                        self.statementbuilder.set_current_balance(firstbal)

            logging.debug(
                "-----------------------------*******---------------------")
            if isVisa:
                logging.debug("found visa --")

            acTable = soup.find('table', attrs={'class': 'fdStatTable'})

            # if there's no table then there's no new data, as far as we know
            if acTable != None:
                datarows = acTable.findAll('tr')


                # build the post values up
                atts = {}

                isFirst = True
                firstbal = 0
                firstdate = ""

                lastbal = 0
                lastdate = ""

                doBalance = False

                dp = DateParser()

                for rows in datarows:
                    vals = rows.findAll('td')

                    if vals:
                        for i, val in enumerate(vals):

                            if val.text:
                                data = val.text.strip()
                                data = unescape(data)
                                data = unicode(data)

                            else:
                                data = ""

                            if data != "&nbsp;":
                                data = data.replace('&nbsp;', '')
                                if i == 0:
                                    if data != "":
                                        try:
                                            lastdate = dp.ymd_from_date(
                                                dp.date_from_dmy(data, '/'))
                                        except:
                                            logging.warn(
                                                "Invalid FD date format - probably no transactions"
                                            )
                                            return

                                        if firstdate == "":
                                            firstdate = lastdate

                                    atts['date'] = lastdate

                                if (i == 1 and not isVisa) or (i == 2
                                                               and isVisa):
                                    atts['display'] = data[0:19]
                                    atts['extradisplay'] = data[19:]

                                if (i == 2 and not isVisa) or (i == 3
                                                               and isVisa):
                                    if data != "":
                                        data = data.strip(u'£')
                                        data = data.strip(u'D')
                                        data = data.strip(u'B')
                                        if data == '':
                                            atts['amount'] = 0
                                        else:
                                            atts['amount'] = int(
                                                float(data) * 100)
                                        atts['type'] = 'Debit'

                                if (i == 3 and not isVisa) or (i == 4
                                                               and isVisa):
                                    if data != "":
                                        data = data.strip(u'£')
                                        data = data.strip(u'C')
                                        data = data.strip(u'R')
                                        if data == '':
                                            atts['amount'] = 0
                                        else:
                                            atts['amount'] = int(
                                                float(data) * 100)
                                        atts['type'] = 'Credit'

                                if not isVisa:
                                    if i == 4:
                                        data = data.strip(u'£')
                                        if data != "":
                                            lastbal = int(float(data) * 100)

                                            if isFirst:
                                                isFirst = False
                                                firstbal = lastbal
                                                doBalance = True

                                    if i == 5:
                                        if doBalance:
                                            doBalance = False
                                            if data == "D":
                                                firstbal = 0 - firstbal
                                            self.statementbuilder.set_current_balance(
                                                firstbal)

                        self.statementbuilder.make_xact(atts)

                self.statementbuilder.put_statement()
                self.current_statement = self.current_statement + 1
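The balance parsing above stores amounts as integer pence and treats a trailing D/DB as a debit and CR/C as a credit; a distilled sketch of that arithmetic (the helper name and sample strings are invented):

def to_pence(data):
    # trailing D/DB marks a debit (negative); CR/C marks a credit (positive)
    data = data.strip(u'£')
    if data.endswith(('D', 'DB')):
        return -int(float(data.replace('DB', '').replace('D', '')) * 100)
    return int(float(data.replace('CR', '').replace('C', '')) * 100)

print to_pence(u'£123.45D')   # -12345
print to_pence(u'£67.00CR')   # 6700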
Example #24
 def __init__(self):
     params = self._parse_argv()
     source = self._fetch_url(self.BASE_URL +
                              unquote_plus(params['seasonUrl']))
     showIndex = BeautifulSoup(source)
     tvshowcontainer = showIndex.find(
         'div',
         id=re.compile(
             'scet_header|scet_top|show-header|show-header-scet|^header$'))
     if tvshowcontainer == None:
         tvshowcontainer = showIndex.find(
             'div', {
                 'class':
                 re.compile(
                     'scet_header|scet_top|show-header|show-header-scet')
             })
     if tvshowcontainer != None:
         tvshowtitle = tvshowcontainer.find('h1').string
     else:
         tvshowtitle = re.search('var siteName = "(.+?)";', source).group(1)
     pages = 1
     if showIndex.find('div', {'class': re.compile('nbcu_pager')}):
         pageLinks = showIndex.find('div', {
             'class': re.compile('nbcu_pager')
         }).findAll('a', {'class': re.compile('nbcu_pager_page')})
         pages = len(pageLinks)
     for i in range(0, pages):
         if i > 0:
             source = self._fetch_url(self.BASE_URL + pageLinks[i]['href'])
             showIndex = BeautifulSoup(source)
         episodesListing = showIndex.find(
             'ul', {
                 'class': re.compile('scet_th_full')
             }).findAll('li')
         for episode in episodesListing:
             vidInfo = {'tvshowtitle': tvshowtitle, 'studio': 'NBC'}
             title = decode_htmlentities(
                 episode.find('p', {
                     'class': re.compile('list_full_det_title')
                 }).find('a').string)
             listitem = xbmcgui.ListItem(title)
             listitem.setThumbnailImage(episode.find('img')['src'])
             episodeLink = episode.find('a')
             if episodeLink['href'][0] == '/':
                 episodeUrl = episodeLink['href'][1:]
             else:
                 episodeUrl = episodeLink['href'].replace(self.BASE_URL, '')
             if episode.find('p', {'class': re.compile('list_full_des')}):
                 vidInfo['plot'] = decode_htmlentities(
                     episode.find('p', {
                         'class': re.compile('list_full_des')
                     }).find('em').string)
             epNum = re.search(
                 '^Ep(?:\.\s*)?([0-9]{1,2})([0-9][0-9])(?:\s*:\s*)?(.+)$',
                 title)
             if epNum != None:
                 vidInfo['season'] = int(epNum.group(1))
                 vidInfo['episode'] = int(epNum.group(2))
                 vidInfo['title'] = epNum.group(3)
             else:
                 # no parsable season/episode in the title; fall back so the
                 # url format string below still has values to interpolate
                 vidInfo['title'] = title
                 vidInfo['season'] = 0
                 vidInfo['episode'] = 0
             #airedDateAndPlot = re.search('Aired\s+([01]?[0-9])/([0-3]?[0-9])/([0-9]{2,4})\s*(?:<br\s*/?>)?\s*(.+?)\s*</div>$',str(episode.find('div',{'class':'episodeInfo'})))
             #seasonNum = re.search('Season\s+([0-9]+?)[\s:]',str(episode.find('p',{'class':'seasonNum'})))
             #episodeNumAndDuration = re.search('Episode\s+([0-9]+?)\s+?\(((?:[0-9]*?:)?[0-9]*?:[0-9]+?)\)',str(episode.find('p',{'class':'episodeNumLine'})))
             #vidInfo['aired'] = '%s-%s-%s' % (airedDateAndPlot.group(3),airedDateAndPlot.group(1),airedDateAndPlot.group(2))
             #vidInfo['season'] = int(seasonNum.group(1))
             #vidInfo['episode'] = int(episodeNumAndDuration.group(1))
             #vidInfo['duration'] = episodeNumAndDuration.group(2)
             #vidInfo['title'] = episode.find('h3').find('a').string
             #vidInfo['plot'] = decode_htmlentities(airedDateAndPlot.group(4))
             #print vidInfo
             listitem.setInfo("video", vidInfo)
             xbmcplugin.addDirectoryItem(
                 handle=int(sys.argv[1]),
                 listitem=listitem,
                 url="%s?episodeUrl=%s&episode=%s&season=%s" %
                 (sys.argv[0], quote_plus(episodeUrl), vidInfo['episode'],
                  vidInfo['season']))
     xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                              sortMethod=xbmcplugin.SORT_METHOD_EPISODE)
     xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                              sortMethod=xbmcplugin.SORT_METHOD_DATE)
     xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                              sortMethod=xbmcplugin.SORT_METHOD_LABEL)
     xbmcplugin.addSortMethod(handle=int(sys.argv[1]),
                              sortMethod=xbmcplugin.SORT_METHOD_DURATION)
     xbmcplugin.setContent(handle=int(sys.argv[1]), content='episodes')
     xbmcplugin.endOfDirectory(handle=int(sys.argv[1]), succeeded=1)
Example #25
###################################################
# RMIT University, Melbourne
# Date 27 Mar 2012
# By Emil Broegger Kjer
# For questions or comments contact [email protected]
###################################################

from BeautifulSoup.BeautifulSoup import BeautifulSoup
import urllib2, re

#### Read from URL
page = urllib2.urlopen(
    "http://tt.metlinkmelbourne.com.au/tt/XSLT_TTB_REQUEST?command=direct&language=en&outputFormat=0&net=vic&line=02EPP&project=ttb&itdLPxx_selLineDir=R&sup=B"
)
soup = BeautifulSoup(page)

#### Read from file
# transport_line = "epping_line"
# weekday = "weekday"
# direction = "true"
# filestr = ('data/timetables/%s_%s_%s.html' % (transport_line, weekday, direction))
# fil = open(filestr, "r")
# soup = BeautifulSoup(fil.read(), fromEncoding='utf8')
# fil.close()

divs = soup.html.body.findAll('div')
children = divs[0].contents

#### Set the timetable
tt = children[1].contents[3].contents[3].contents[3].contents[1].contents[2]
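Navigating by fixed .contents indexes, as above, breaks as soon as the page layout shifts; if the target table carried a distinguishing attribute, an attribute lookup would be sturdier. A sketch under that assumption (the class name 'ttb' is hypothetical):

# hypothetical: look the timetable up by a class attribute instead of by position
tt = soup.find('table', attrs={'class': 'ttb'})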
Example #26
    def doStep4(self, allofit, page):

        scrape_result = "good"

        #-------------------------------- Grab the form values -----------------------------------------------
        soup = BeautifulSoup(page)

        # write out the start page
        self.output_page("natwest-pos-accounts.html", page)

        logging.info("NatWest message or bad cred check ")

        # if we still have the input then def bad credentials
        errorDiv = soup.findAll(
            'input', attrs={'name': 'ctl00$mainContent$LI6PPEA_edit'})

        if len(errorDiv) != 0:
            logging.info("NatWest defiantely bad credentials")
            return 'credentials incorrect'

        accountBLock = soup.findAll('table', attrs={'class': 'AccountTable'})
        # got some account details, so all good
        if len(accountBLock) > 0:
            logging.debug("NatWest definitely got some good accounts")
            return 'good'

        # check for the normal continue button; failing that, treat everything
        # else as a credentials failure
        continueButton = soup.find(
            'input', attrs={'id': 'ctl00_mainContent_FinishButton_button'})

        if (continueButton == None):
            logging.warning(
                "NatWest can't find finish button - credentials incorrect")

            nextButton = soup.find(
                'input', attrs={'id': 'ctl00_mainContent_NextButton_button'})

            if (nextButton == None):
                logging.warning("NatWest cant find next button either")
                return 'credentials incorrect'

        # now find the form that these buttons belong to
        loginform = soup.find('form', attrs={'name': 'aspnetForm'})

        if loginform == None:
            logging.debug('NatWest no continue form')
            return 'bank error'
        else:
            logging.debug('found a continue form - so clicking it')
        action = self.urlBase + '/' + loginform['action']

        # any hidden values etc
        values = self.parseForm(loginform)

        # build the body content
        data = urllib.urlencode(values)
        self.response = {}
        self.response['url'] = self.ByteToHex(action)
        self.response['data'] = self.ByteToHex(data)
        self.response['method'] = 'POST'
        self.response['step'] = 4

        return 'messages'
Example #27
    def _processNormAccount(self, raw, account_path, balance):

        soup = BeautifulSoup(raw)

        logging.debug('Norm ac path - ' + str(account_path) + ' - end')

        try:
            if account_path != "":
                # delete existing current xactions
                logging.debug('Processing :) norm ')

                builder = StatementBuilder(self.facade, account_path,
                                           self.token)
                self.statementlist.append(builder)
                self.statementbuilder = self.statementlist[
                    self.current_statement]

                # we know this is not a credit card
                isCCard = False

                # get a fixed balance somewhere??
                # balance passed in for natwest

                # set up our statement
                self.statementbuilder.make_recent_dif_statement(
                    'NatWest-recent', 'Scraper', None)

                # now set the final balance
                logging.debug("Balance - - - - - - - > " + str(balance))
                self.statementbuilder.set_current_balance(balance)

                # now find all the recent transactions
                x_table = soup.find('table', attrs={'class': 'ItemTable'})

                if x_table == None:
                    # could easily be no transactions
                    logging.debug(" No xtable ======>")

                if x_table != None:
                    x_body = x_table.find('tbody')
                    inputs = x_body.findAll('tr')

                    # build the post values up
                    for rows in inputs:
                        atts = {}

                        vals = rows.findAll('td')
                        if vals:
                            cash = ''
                            for i, val in enumerate(vals):
                                data = self.tidy_text(val.text)
                                if i == 0:

                                    dp = DateParser()
                                    try:
                                        atts['date'] = dp.ymd_from_date(
                                            dp.date_from_small(data))
                                    except:
                                        atts['date'] = ''
                                if i == 1:
                                    if data == 'ATM':
                                        cash = 'CASH - '

                                if i == 2:
                                    if data != "":
                                        extra = ""
                                        datebit = ""
                                        parts = data.split(',')
                                        if len(parts) > 1:
                                            # match NatWest dates, e.g. "8062 14APR11"
                                            if re.match(
                                                    '\d{4}\s\d\d[A-Z]{3}\d\d',
                                                    parts[0]) != None:
                                                datebit = (parts[0][0:4] + ' ' +
                                                           parts[0][5:7] + ' ' +
                                                           parts[0][7:10])
                                                # remember pretty_display strips out any words containing a sequence of 3 or more numbers

                                                parts = parts[1:]

                                        if len(parts) > 1:
                                            extra = parts[-1]
                                            parts = parts[0:-1]

                                        data = ' '.join(parts)

                                        disp = (cash + data).strip()

                                        atts['display'] = " ".join(
                                            disp.split())

                                        atts['extradisplay'] = " ".join(
                                            (extra + " " + datebit).split())

                                if i > 2:  # the numbers

                                    if data != "" and data != '-':

                                        amount = self.normalise_ammount(data)

                                        if i == 3:
                                            atts['amount'] = amount
                                            atts['type'] = 'Credit'

                                        if i == 4:
                                            atts['amount'] = amount
                                            atts['type'] = 'Debit'

                                    if i == 5:
                                        self.statementbuilder.make_xact(atts)

            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1

        except Exception, e:
            logging.exception('NatWest parsing error - ' + str(e))
Example #28
    def _processCCAccount(self, raw, account_path, balance):
        soup = BeautifulSoup(raw)

        logging.debug('CC ac path - ' + str(account_path) + ' - end')

        try:
            if account_path != "":
                # delete existing current xactions
                logging.debug('Processing :) ')

                builder = StatementBuilder(self.facade, account_path,
                                           self.token)
                self.statementlist.append(builder)
                self.statementbuilder = self.statementlist[
                    self.current_statement]

                # we know this is a credit card
                isCCard = True

                # get a fixed balance somewhere??
                # passed in for natwest

                # set up our statement
                self.statementbuilder.make_recent_dif_statement(
                    'NatWest-recent', 'Scraper', None)

                # now set the final balance
                logging.debug("Balance - - - - - - - > " + str(balance))
                self.statementbuilder.set_current_balance(balance)

                # now find all the recent transactions
                x_table = soup.find('table', attrs={'class': 'ItemTable'})

                if x_table != None:
                    x_body = x_table.find('tbody')
                    inputs = x_body.findAll('tr')

                    # build the post values up
                    for rows in inputs:
                        atts = {}

                        vals = rows.findAll('td')
                        if vals:
                            datebit = ''
                            for i, val in enumerate(vals):
                                data = self.tidy_text(val.text)
                                if i == 0:
                                    dp = DateParser()
                                    try:
                                        atts['date'] = dp.ymd_from_date(
                                            dp.date_from_small(data))
                                    except:
                                        atts['date'] = ''

                                if i == 1:
                                    datebit = data[:-5]

                                if i == 2:
                                    if data != 'SALE':  # only keep extra xact date for Sales
                                        datebit = ''

                                if i == 3:
                                    if data != "":
                                        atts['display'] = " ".join(
                                            data.split()).encode('utf8')
                                        atts['extradisplay'] = datebit.encode(
                                            'utf8')

                                if i > 3:  # the numbers

                                    if data != "" and data != '-':
                                        amount = self.normalise_ammount(data)

                                        if i == 4:
                                            atts['amount'] = amount
                                            atts['type'] = 'Credit'

                                        if i == 5:
                                            atts['amount'] = amount
                                            atts['type'] = 'Debit'

                                    if i == 5:
                                        self.statementbuilder.make_xact(atts)

            self.statementbuilder.put_statement()
            self.current_statement = self.current_statement + 1

        except Exception, e:
            logging.exception('NatWest parsing error - ' + str(e))
Example #29
 def _loadUserSetting(self, communityId, userSettingFilePath):
     parser = BeautifulSoup(open(userSettingFilePath, u'r'))
     nameTagList = parser.findAll(u'user', attrs={ u'community': communityId, u'name': True })
     return dict(map(lambda tag: (tag.renderContents(), tag.get(u'name')), nameTagList))
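Fed a minimal document, the same calls produce a user-id-to-name mapping; a self-contained check (the markup is invented, assuming the standard BeautifulSoup 3 import):

from BeautifulSoup import BeautifulSoup

parser = BeautifulSoup(u'<user community="co123" name="alice">u1</user>')
nameTagList = parser.findAll(u'user', attrs={u'community': u'co123', u'name': True})
print dict((tag.renderContents(), tag.get(u'name')) for tag in nameTagList)
# {'u1': u'alice'}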