def _read_ib_html(fname, table_ref):
    """
    Reads a single table from an .html file fname produced by IB reports, and returns a pandas dataframe.

    table_ref gives the position of the table in the .html stack.
    """

    ## Open the file
    with open(fname, 'r') as file_handle:
        soup = BeautifulSoup(file_handle.read())
    if len(soup) == 0:
        raise Exception("Empty or non existent html file %s" % fname)

    ## Find the right table and extract the rows
    tables = soup.findAll('table')
    table = tables[table_ref]
    table_rows = table.findAll('tr')

    ## Process the rows from html into lists
    (headerrow, table_data) = _parse_html_table(table_rows)

    soup.close()

    ## Convert to pandas dataframe
    main_table = _html_table_to_pddataframe(headerrow, table_data)

    return main_table
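For orientation, a minimal usage sketch for _read_ib_html; the file name and table index below are purely illustrative, and it assumes the legacy BeautifulSoup import style used throughout these examples plus the _parse_html_table and _html_table_to_pddataframe helpers defined alongside the function.

# Illustrative call only -- 'activity_report.html' and table_ref=2 are made up,
# and _parse_html_table / _html_table_to_pddataframe must exist in the same module.
from BeautifulSoup import BeautifulSoup

positions = _read_ib_html('activity_report.html', table_ref=2)
print positions.head()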
    def processCaptcha(self, kParam):

        vers, language, jsh, questionfile = self._collect_api_info()

        responseQFile = self.openPage(questionfile, buildHref=False)
        if responseQFile is None or responseQFile.getcode() != 200:
            return

        # ___ https://www.google.com/recaptcha/api2/anchor?k=
        gurl = 'https://www.google.com/recaptcha/api2/anchor?k=' + kParam + '&co=aHR0cDovL2xpbmtjYXB0Y2hhLmNvbTo4MA..&hl=' + language + '&v=' + vers + '&size=normal&cb=z6ubp1ln1ecg'

        responseAnchor = self.openPage(gurl, buildHref=False)
        if responseAnchor is None or responseAnchor.getcode() != 200:
            return

        contentAnchor = responseAnchor.read()

        # ___ Initialize BeautifulSoup
        soupAnchor = BeautifulSoup(contentAnchor)
        soupAnchor.close()

        # ___ Get the captcha token
        token = soupAnchor.find('input', {'id': 'recaptcha-token'})
        if token is None:
            return

        recaptchaToken = token['value']
        print contentAnchor

        # NOTE: debug short-circuit -- everything below this return is never executed
        return
        userverifyUrl = 'https://www.google.com/recaptcha/api2/userverify?k=' + kParam
        data = {
            'v': 'r20171212152908',
            'c': recaptchaToken,
            'response': 'eyJyZXNwb25zZSI6IiIsInMiOiI2ZTVhIn0',
            't': 771,
            'ct': 771,
            'bg':
            '!QUegR2ZHqovVLlrzLJxBBzBhg5mRh1oHAAAAY1cAAAB-nAOGhu4fgbzGrh0Of91RdQt0tdbJYOeKtFu1_y_v_N4nuCILtj4Cm8f605XqMDHhXJE-k6u9R1qmmgMUX3WJ8wmU439ZzqnyxvMFYewQW02z_gCxsxAUPkMzGaqOat8zy1a48JSgdMe1Bd5QXdJxABqcjfltK52rYsJ8ZePokzTjOZSiNRSm8fI_cU9FOJnjiBjZfLtE8y-a5rUnpAkimAPDXbcwybbu_4J5nvfStYXv4jfI2mFOBKkuNfSNrhrtoXUX83Qv5JbxDNdytuSfzRmq126a46jP-jZIsarKP-Mh9lJTIX4rgE6hxvXWzXBQKF0gwbafsM7yWRfn8_-f_Fv9Y60KV5y7JQK8PfrU546MAGpOmgKhs5nMOPkwwfCKk31IL6rc3deFBG_0TtVG4rGP7ZcDM3w6C0Dd3wIdeEJYpA4yEpgCswaotgnkMpvnoCNgVABLasxkFIOSEzxRpOvNBdo-2z6vLwkNZNOAoMTdL8VWfW5fSbr9qMUJihl5cVHyJm8km3b8HCmI-wkIQXPOVVnFu_tIdHqvKux-InVPGsLgg8eCm8mG7ZphYfsOtezccmYnne9kpduSHRLTUm0tgZqmvEIT0QpyhyYB7RpYzPE2MY43tPoDl0Ap3VsDH_gEvFPpLR0ZLoKZGwmsywLW4NOZxbIPYjWB13i8xdSI3uGrmcYp16jt_-au8TWCkz7eDSc6O77uVw6moB_KbmGmFxNcO-Ob6UhgUHZP3g8dOFcz30SXIeVTiVl8fgsxb6LPuhdf5_x9gE0oYv-91q75TFlB3lC4NIq802g2SouWK72J8S5taQwLY2eVy-Qc4Q5P2q_dV3WraHFBvUeL8kiFwAzOypOfzD3-AEZnKxF3i2MTh7kaTO2cYdmjhhpWMO74YcJYB5ZCQKX8t8JawFZ0pdjFgfTPRK8de9TRM0bI8azbIUNnBtzrKrxrPeP6U8-6NsALgifqhP1PetFc40K0CSPqb-jRDZKT_MrQ66ugTKHyknTRrI0Zqr3uv59g5U7-LBQAh5Yx0iJGdwJU9p0_j_LOilV4GOH0-GDyhTyDzOgMiK9Lml4ErJitZ99bZfZHrjeJXaahhfoTBilxZmf6xMvwCZg_MlXwoGupo4GkEhd9s33SBJ653oRdCpvbQMRIkj77rYBE_ANGIPGtgnnxrl_RIV4ZSCz2MiWIKkF5YjQIHWm_uTM'
        }
        responseVerifyUser = self.postPage(userverifyUrl, data, buildHref=False)
        print responseVerifyUser.read()

        print '------'

        post_captcha_href = 'http://linkcaptcha.com/verifyCaptchaNew.php'
        data = {'g-recaptcha-response': token['value']}
        responseToken = self.postPage(post_captcha_href, data, buildHref=False)
        print responseToken
        if responseToken and responseToken.getcode() == 200:
            print responseToken.read()
            responseToken.close()
 def run(self):
     while True:
         chunk = self.outcoming.get()
         soup = BeautifulSoup(chunk)
         print '====================================='
         print soup.findAll('title')[0].renderContents()
         soup.close()
         self.outcoming.task_done()
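The run() worker above pulls raw HTML chunks from self.outcoming and prints each page title; a minimal sketch of how such a worker could be wired to a queue (the class name and sample markup are illustrative, not from the original project):

# Hypothetical wiring for a worker like the one above (Python 2, BeautifulSoup 3 style).
import threading
import Queue
from BeautifulSoup import BeautifulSoup

class TitleWorker(threading.Thread):
    def __init__(self, outcoming):
        threading.Thread.__init__(self)
        self.outcoming = outcoming
        self.daemon = True

    def run(self):
        while True:
            chunk = self.outcoming.get()
            soup = BeautifulSoup(chunk)
            print soup.findAll('title')[0].renderContents()
            self.outcoming.task_done()

outcoming = Queue.Queue()
TitleWorker(outcoming).start()
outcoming.put('<html><head><title>hello</title></head><body></body></html>')
outcoming.join()   # returns once task_done() has been called for every put()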
Example #5
 def get_all_form(content):
     try:
         soup = BeautifulSoup(str(content))
         all_form = soup.findAll('form')
         soup.close()
         return all_form
     except Exception, e:
         return []
 def run(self):
     if self._callback:
         self._callback.update_state("start")
     itemhtml = self._file.open(str(self._item))
     soup = BeautifulSoup(itemhtml)
     converted = self._processor.get_converted_html(soup)
     self._file.writestr(self._item, converted)
     soup.close()
     itemhtml.close()
     if self._callback:
         self._callback.update_state("finish")
Example #9
def getChildPageScoreAndNum(url):
    req = urllib2.Request(url=url, headers=headers)
    page = urllib2.urlopen(req).read()
    soup = BeautifulSoup(page)
    tag = soup.find('span', {'class': 'Goldnum'})
    if tag == None:
        return None

    score = float(tag.string)
    tag = tag.nextSibling.find('span', {'class': 'Golder'})
    if tag == None:
        return None

    num = int(tag.string)
    soup.close()
    return (score, num)
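getChildPageScoreAndNum relies on a module-level urllib2 import and a headers dict defined elsewhere in that script; a minimal sketch of that surrounding setup (the User-Agent string and page URL are illustrative):

# Illustrative module-level setup assumed by getChildPageScoreAndNum (Python 2).
import urllib2
from BeautifulSoup import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}   # placeholder user agent

result = getChildPageScoreAndNum('http://www.senanb.com/some-detail-page.htm')
if result is not None:
    score, num = result
    print 'score %s, count %s' % (score, num)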
Example #11
 def parse_page(self, url):
     f = urllib.urlopen(url)
     if f.getcode() != 200:
         print "Could not fetch participants page %s!" % url
         raise FetchParticipantException(url)
     soup = BeautifulSoup(f)
     links = map(lambda tag: tag.a.get("href"), soup.findAll("div", "news"))
     participants = []
     for link in links:
         try:
             parsed_p = self.parse_participant(link)
             participants.append(parsed_p)
         except FetchParticipantException:
             continue
     soup.close()
     return participants
Example #12
 def parse_participant(self, url):
     str_data = urllib.urlopen(url).read()
     data = str_data.decode('utf-8')
     soup = BeautifulSoup(data)
     partic_info = soup.find("div", "u4asn-1")
     if partic_info is None:
         raise FetchParticipantException(url)
     info = partic_info.find("div", "u4a-desc")
     results = [x.text for x in (info.h3, info.span, info.p) if x is not None]
     name = results[0]
     style = results[1]
     descr = results[2] if len(results) > 2 else u""
     votes = int(soup.find("div", "u4asn-2").find("div", "total").text)
     soup.close()
     return Participant(name=name,
                        link=url,
                        style=style,
                        description=descr,
                        votes=votes)
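Participant and FetchParticipantException are defined elsewhere in that project; purely for context, minimal stand-ins could look like this (field names follow the keyword arguments used above):

# Hypothetical stand-ins for the names used by parse_page / parse_participant.
from collections import namedtuple

Participant = namedtuple('Participant',
                         ['name', 'link', 'style', 'description', 'votes'])

class FetchParticipantException(Exception):
    """Raised when a participant page cannot be fetched or parsed."""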
Example #13
def getMainPageScoreAndUrl(url):
    #print url
    req = urllib2.Request(url=url, headers=headers)
    page = urllib2.urlopen(req).read()
    soup = BeautifulSoup(page)
    #print soup.originalEncoding
    #print soup
    datas = []
    for tag in soup.findAll('span', {'class': 'play-icon'}):
        ptag = tag.parent
        href = 'http://www.senanb.com/' + ptag['href']

        ptag = ptag.parent
        ptag = ptag.find('strong', {'class': 'ratbar-num'})
        if ptag == None: continue

        score = float(ptag.string)
        #score=ptag.contents[0]
        datas.append((score, href))
    soup.close()
    return datas
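The two senanb.com helpers above are naturally chained: crawl the listing page, then fetch each linked detail page; a minimal driver sketch (the listing URL is illustrative):

# Illustrative driver chaining getMainPageScoreAndUrl and getChildPageScoreAndNum (Python 2).
for list_score, href in getMainPageScoreAndUrl('http://www.senanb.com/index.htm'):
    child = getChildPageScoreAndNum(href)
    if child is None:
        continue
    page_score, num = child
    print '%s  list %.1f  page %.1f (%d)' % (href, list_score, page_score, num)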
Example #15
    def am_I_logged_in_is_form_here(self, page1, page2):
        soup1 = BeautifulSoup(str(page1))
        soup2 = BeautifulSoup(str(page2))
        flag1 = True
        flag2 = True
        for name in self.Form_tags_password_type:
            res_p = soup1.find("input", attrs={"type": re.compile(name)})
            if res_p:
                flag1 = False
        soup1.close()

        for name in self.Form_tags_password_type:
            res_p = soup2.find("input", attrs={"type": re.compile(name)})
            if res_p:
                flag2 = False
                break
        soup2.close()

        # only the check on page2 decides the result; flag1 is computed above but never used
        if not flag2:
            return False
        else:
            return True
Example #17
    def get_html_files_ref(self):
        htmlfiles = []

        with ZipFile(self._filepath, 'r') as f:
            foo = f.open('META-INF/container.xml')
            soup = BeautifulSoup(foo)
            foo.close()
            contentfile = dict(soup.find('rootfile').attrs)['full-path']
            soup.close()

            root = re.sub(r'[^/]*(\.opf)', '', contentfile)

            foo = f.open(contentfile)
            soup = BeautifulSoup(foo)
            for item in soup.findAll('item'):
                itemdict = dict(item.attrs)
                if itemdict['href'].endswith('html'):
                    htmlfiles.append(root + itemdict['href'])

            foo.close()
            soup.close()
            f.close()

        return htmlfiles
Example #19
     """
     downloadScheduler = DownloadScheduler()
     downloadScheduler.start()
     """
     
     mainUrl = 'http://mabilo.com/ringtones.htm'
     urlHeader = 'http://mabilo.com'
     
     timeout = 40
     socket.setdefaulttimeout(timeout)
     
     # get category_list
     u = urllib.urlopen(mainUrl)
     category_list = []
     soup = BeautifulSoup(u.read())
     u.close()
     for elem in soup.findAll('ul')[-1]:
         elem = str(elem)
         start = elem.find('"',0)
         stop = elem.find('"',start+1)
         category_list.append(elem[start+1:stop])
     soup.close()
     
     # process category_list
     for elem in category_list[15:16]:
         solveEachCategory(urlHeader+elem) 
     """
     while  not downloadQueue.empty():
         time.sleep(20)
     """
     
Example #20
def solveEachCategory(origurl):
    global recordIndex, urlHeader
    cnt = 1
    while True:
        if cnt == 1:
            url = origurl
        else:
            url = origurl[0:origurl.index('.htm')]+'-'+str(cnt)+'-tr.htm'
        cnt = cnt + 1

        if cnt > 1501:
            break

        numTry = 3
        while numTry > 0:
            try:
                u = urllib.urlopen(url)
            except:
                numTry = numTry - 1
                print 'category '+url+' open err '+str(3-numTry)+' try'
                #time.sleep(1)
            else:
                try:
                    data = u.read()
                except:
                    u.close()
                    numTry = numTry - 1
                    print 'category '+url+' get data err '+str(3-numTry)+' try'
                    #time.sleep(1)
                else:
                    break

        if numTry <= 0:
            recordIndex += 10       # assume there are 10 rings per page
            continue

        if u.url != url:            # judge if we reached the last page
            u.close()
            return 0

        soup = BeautifulSoup(data)
        u.close()

        for divElem in soup.findAll('div', attrs={'class': 'row2'}):  # get TAG with attribute
            # process each ring
            print 'processing '+str(recordIndex)
            ringurl = divElem.find('a')['href']
            if ringurl == None:
                # NOTE: 'self' is undefined in this module-level function; this error
                # path would raise a NameError if it were ever reached
                self.errorLog('ring')
                continue
            ringurl = urlHeader+ringurl
            ringThread = RingThread(ringurl, recordIndex)
            ringThread.start()
            queue.put(1)
            recordIndex = recordIndex + 1
            # for test
            #if recordIndex > 1080:
            #    return

        soup.close()
Example #21
 def run(self):
     numTry = 3
     while numTry > 0:
         try:
             #print self.url
             u = urllib.urlopen(self.url)
         except:
             numTry = numTry - 1
             self.logError('url can not open '+str(3-numTry)+' try')
             #time.sleep(1)
         else:
             try:
                 data = u.read()
             except:
                 numTry = numTry - 1
                 self.logError('url data not get '+str(3-numTry)+' try')
                 u.close()
                 #time.sleep(1)
             else:
                 break

     if numTry <= 0:
         self.finish()
         return -1

     # ring url open success
     soup = BeautifulSoup(data)
     u.close()
     record = ['<Record>']

     # attri Title
     temp = soup.find('h4')
     if temp != None:
         temp = str(temp.next)
         record.append('<Title>'+temp[temp.index('Free')+5:temp.index('Ringtone')-1]+'</Title>')
     else:
         self.logError('title not found')
         self.finish()
         return -1

     # attri Image
     temp = soup.find('div', attrs={'class': 'image'})
     if temp != None:
         imageurl = str(temp.next['src'])
         splitPath = imageurl.split('/')
         imagefile = splitPath.pop().strip()
         record.append('<Image>'+imagefile+'</Image>')
         # download image
         #print 'imageurl '+ imageurl
         imageThread = DownloadThread(imageurl, imagefile, self.index)
         imageThread.start()
         downloadQueue.put(1)
         """imageThread = DownloadThread(imageurl, imagefile)
         imageThread.start()"""
         """if imageThread.getResult() < 0:
             self.logError('image download error')
             return -1"""
     else:
         self.logError('image not found')
         self.finish()
         return -1

     # attrs including Artist, Downloads, Size etc
     for infoElem in soup.findAll('div', attrs={'class': 'info'}):
         for specInfoElem in infoElem.findAll('span', attrs={'class': 'grey'}):
             item = specInfoElem.next
             itemname = str(item).strip()[:-1]
             if itemname == 'Artist' or itemname == 'Category':
                 record.append('<'+itemname+'>'+str(item.nextSibling.next).strip()+'</'+itemname+'>')
             elif itemname == 'Date Added':
                 record.append('<Date>'+str(specInfoElem.nextSibling).strip()+'</Date>')
             else:
                 record.append('<'+itemname+'>'+str(specInfoElem.nextSibling).strip()+'</'+itemname+'>')

     # star-rating
     infoElem = soup.find('li', attrs={'id': 'rsli'})
     itemname = str(infoElem['style'])
     record.append('<Mark>'+itemname[itemname.index(':')+1:itemname.index('%')]+'</Mark>')

     # attri Ring
     divElem = soup.find('div', attrs={'class': 'det2'})
     if divElem == None:
         self.logError('ring not found')
         self.finish()
         return -1
     ringurl = str(divElem.find('a')['href'])
     if ringurl == None:
         self.logError('ring not found')
         self.finish()
         return -1

     pos = ringurl.find('.mp3')+4
     if pos == 3:
         pos = ringurl.find('.wav')+4
         if pos == 3:
             self.logError('not mp3 or wav format')
             self.finish()
             return -1
     ringfile = ringurl[ringurl.index('file=')+5:pos].strip()
     record.append('<Ring>'+ringfile+'</Ring>')
     ringurl = 'http://music.mabilo.com/dl'+ringurl[ringurl.index('.php'):pos]
     # download ring
     #print 'ringurl '+ringurl
     downloadThread = DownloadThread(ringurl, ringfile, self.index)
     downloadThread.start()
     downloadQueue.put(1)
     """ringThread = DownloadThread(ringurl, ringfile)
     ringThread.start()"""
     """if ringThread.getResult() < 0:
         self.logError('ring download error')
         return -1"""
     soup.close()
     record.append('</Record>')

     self.storeRecord(record)
     self.finish()
     return 0
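RingThread.run() hands each image and ring URL to a DownloadThread and tracks outstanding downloads through downloadQueue; those names belong to the original script, but a minimal stand-in consistent with how they are used above could look like this (the file naming scheme is illustrative):

# Hypothetical stand-in for the DownloadThread / downloadQueue used above (Python 2).
import threading
import urllib
import Queue

downloadQueue = Queue.Queue()

class DownloadThread(threading.Thread):
    def __init__(self, url, filename, index):
        threading.Thread.__init__(self)
        self.url = url
        self.filename = filename
        self.index = index

    def run(self):
        try:
            # prefix the record index so files from different rings do not collide
            urllib.urlretrieve(self.url, str(self.index) + '_' + self.filename)
        finally:
            # balance the downloadQueue.put(1) made by the caller when it started us
            downloadQueue.get()
            downloadQueue.task_done()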