Beispiel #1
0
def extract_trains():
    """Rebuild TempTrain from the stored HTML of each Raw url found in train_urls."""
    # Empty TempTrain before re-importing.
    db.query(TempTrain).delete()
    db.commit()

    for record in db.query(Raw.url):
        page_url = record.url
        if page_url in train_urls:
            debug("Processing html of %s" % page_url)
            stored = db.query(Raw.html).filter_by(url=page_url).first()
            soup = BeautifulSoup(stored.html)
            results = soup.find('table', id='SearchResultsTable')
            update_departure_days(results)

            # extract() removes the header row from the table and returns it.
            header_row = results.find('tr', 'tableheader').extract()
            column_names = []
            for cell in header_row:
                column_names.append(cell.text)

            parsed_trains = clean_keys(to_dict(results, column_names))

            links_by_number = get_train_links(results)
            soup.decompose()

            for entry in parsed_trains:
                # Each link entry is indexed as (name, url).
                link = links_by_number[entry['number']]
                entry['name'] = link[0]
                entry['url'] = link[1]

                db.add(TempTrain(**entry))
                db.commit()
Beispiel #2
0
    def testPixivImageParseInfo(self):
        # Parse the saved image-info page and check the extracted metadata.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-info.html', 'r') as p:
            page = BeautifulSoup(p.read())
        image2 = PixivImage(32039274, page)
        page.decompose()
        del page

        self.assertEqual(image2.imageId, 32039274)
        self.assertEqual(image2.imageTitle, u"新しいお姫様")
        self.assertGreater(len(image2.imageCaption), 0)

        for tag in (u'MAYU', u'VOCALOID', u'VOCALOID3',
                    u'なにこれかわいい', u'やはり存在する斧'):
            self.assertIn(tag, image2.imageTags)

        self.assertEqual(image2.imageMode, "big")
        self.assertEqual(image2.worksDate, '12/10/12 15:23')
        self.assertEqual(image2.worksResolution, '642x900')
        # Disabled assertions (tools and rating counters):
        # self.assertEqual(image2.worksTools, 'Photoshop SAI')
        # self.assertEqual(image2.jd_rtv, 88190)
        # self.assertEqual(image2.jd_rtc, 6711)
        # self.assertEqual(image2.jd_rtt, 66470)
        self.assertEqual(image2.artist.artistToken, 'nardack')
Beispiel #3
0
    def testPixivImageParseInfo(self):
        # Parse the saved image-info page and check the extracted metadata.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-info.html', 'r') as p:
            page = BeautifulSoup(p.read())
        image2 = PixivImage(32039274, page)
        page.decompose()
        del page

        self.assertEqual(image2.imageId, 32039274)
        self.assertEqual(image2.imageTitle, u"新しいお姫様")

        for tag in (u'MAYU', u'VOCALOID', u'VOCALOID3', u'うさぎになりたい',
                    u'なにこれかわいい', u'やはり存在する斧'):
            self.assertIn(tag, image2.imageTags)

        self.assertEqual(image2.imageMode, "bigNew")
        self.assertEqual(image2.worksDate, '12-11-2012 00:23')
        self.assertEqual(image2.worksResolution, '642x900')
        self.assertEqual(image2.worksTools, 'Photoshop SAI')
        # Disabled rating-counter assertions:
        # self.assertEqual(image2.jd_rtv, 88190)
        # self.assertEqual(image2.jd_rtc, 6711)
        # self.assertEqual(image2.jd_rtt, 66470)
        self.assertEqual(image2.artist.artistToken, 'nardack')
    def testPixivImageParseInfo(self):
        # Parse the saved image-info page and check the extracted metadata,
        # including the full HTML caption.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-info.html', 'r') as p:
            page = BeautifulSoup(p.read())
        image2 = PixivImage(32039274, page)
        page.decompose()
        del page

        self.assertEqual(image2.imageId, 32039274)
        self.assertEqual(image2.imageTitle, u"新しいお姫様")
        self.assertEqual(image2.imageCaption, u'EXIT TUNES様より冬コミ発売予定の「MAYU画集(仮)」に1枚描かせて頂きました。詳しくはこちらをご確認下さい!★ <a href="/jump.php?http%3A%2F%2Fexittunes.com%2Fevent%2Fc83%2Findex.html" target="_blank">http://exittunes.com/event/c83/index.html</a> ★「MAYU」公式サイト<a href="/jump.php?http%3A%2F%2Fmayusan.jp%2F" target="_blank">http://mayusan.jp/</a>')

        for tag in (u'MAYU', u'VOCALOID', u'VOCALOID3', u'うさぎになりたい',
                    u'なにこれかわいい', u'やはり存在する斧', u'ヤンデレ',
                    u'吸いこまれそうな瞳の色'):
            self.assertIn(tag, image2.imageTags)

        self.assertEqual(image2.imageMode, "big")
        self.assertEqual(image2.worksDate, '12-11-2012 00:23')
        self.assertEqual(image2.worksResolution, '642x900')
        self.assertEqual(image2.worksTools, 'Photoshop SAI')
        # Disabled rating-counter assertions:
        # self.assertEqual(image2.jd_rtv, 88190)
        # self.assertEqual(image2.jd_rtc, 6711)
        # self.assertEqual(image2.jd_rtt, 66470)
        self.assertEqual(image2.artist.artistToken, 'nardack')
    def testCreateFilenameUnicode(self):
        # Build a filename for an image with a non-ASCII title and compare the
        # parsed artist info with the saved JSON detail.
        # Both fixtures are read inside with-blocks so the handles are closed.
        with open('./test/test-image-unicode.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        imageInfo = PixivImage(2493913, page)
        page.decompose()
        del page

        # cross check with json value for artist info
        with open('./test/detail-267014.json', 'r') as js_file:
            js = json.load(js_file)

        self.assertEqual(imageInfo.artist.artistId, str(js["user"]["id"]))
        self.assertEqual(imageInfo.artist.artistToken, js["user"]["account"])
        self.assertEqual(
            imageInfo.artist.artistAvatar,
            js["user"]["profile_image_urls"]["medium"].replace("_170", ""))

        nameFormat = '%member_token% (%member_id%)\\%urlFilename% %works_date_only% %works_res% %works_tools% %title%'
        expected = u'balzehn (267014)\\2493913 12/23/08 852x1200 アラクネのいる日常2.jpg'
        result = PixivHelper.makeFilename(
            nameFormat,
            imageInfo,
            artistInfo=None,
            tagsSeparator=' ',
            fileUrl='http://i2.pixiv.net/img16/img/balzehn/2493913.jpg')
        self.assertEqual(result, expected)
Beispiel #6
0
    def testPixivImageParseInfo(self):
        # Parse the saved image-info page and check the extracted metadata.
        # The fixture is read inside a with-block so the file handle is closed.
        with open("./test/test-image-info.html", "r") as p:
            page = BeautifulSoup(p.read())
        image2 = PixivImage(32039274, page)
        page.decompose()
        del page

        self.assertEqual(image2.imageId, 32039274)
        self.assertEqual(image2.imageTitle, u"新しいお姫様")

        for tag in (u"MAYU", u"VOCALOID", u"VOCALOID3", u"うさぎになりたい",
                    u"なにこれかわいい", u"やはり存在する斧"):
            self.assertIn(tag, image2.imageTags)

        self.assertEqual(image2.imageMode, "bigNew")
        self.assertEqual(image2.worksDate, "12-11-2012 00:23")
        self.assertEqual(image2.worksResolution, "642x900")
        self.assertEqual(image2.worksTools, "Photoshop SAI")
        # Disabled rating-counter assertions:
        # self.assertEqual(image2.jd_rtv, 88190)
        # self.assertEqual(image2.jd_rtc, 6711)
        # self.assertEqual(image2.jd_rtt, 66470)
        self.assertEqual(image2.artist.artistToken, "nardack")
Beispiel #7
0
def _CNHistoryParse(indexpage, only_get_EIN=True):
    '''Parse a CN History page and return its IRS search link, if any.

    indexpage: path of the saved HTML page on disk.
    only_get_EIN: when true (the default), scan the page's links and return
        the first href containing 'search.irs'; when false nothing is
        extracted from the page.

    Returns the matching href string, or None when no link matches or
    only_get_EIN is false.
    Raises ValueError when indexpage does not exist on disk.
    '''
    if not os.path.exists(indexpage):
        raise ValueError(
            "Page {} was indexed but doesn't exist??".format(indexpage))

    # BeautifulSoup reads the whole file in its constructor, so the handle
    # can be closed as soon as the tree is built.
    with open(indexpage, 'r') as html:
        soup = BeautifulSoup(html)

    try:
        if only_get_EIN:  # ONLY GET THE EIN AND THEN EXIT
            # all of the text names are in links (a href ...)
            for link in soup.findAll('a'):
                href = link.get('href')
                # Anchors without an href yield None; skip them explicitly
                # instead of relying on a blanket except.
                if href and 'search.irs' in href:
                    return href
        return None
    finally:
        # Release the parse tree on every exit path.
        soup.decompose()
Beispiel #8
0
def _CNHistoryParse(indexpage,only_get_EIN=True):
    '''Parse a CN History page and return its IRS search link, if any.

    indexpage: path of the saved HTML page on disk.
    only_get_EIN: when true (the default), scan the page's links and return
        the first href containing 'search.irs'; when false nothing is
        extracted from the page.

    Returns the matching href string, or None when no link matches or
    only_get_EIN is false.
    Raises ValueError when indexpage does not exist on disk.
    '''
    if not os.path.exists(indexpage):
        raise ValueError("Page {} was indexed but doesn't exist??".format(indexpage))

    # BeautifulSoup reads the whole file in its constructor, so the handle
    # can be closed as soon as the tree is built.
    with open(indexpage, 'r') as html:
        soup = BeautifulSoup(html)

    try:
        if only_get_EIN:  # ONLY GET THE EIN AND THEN EXIT
            # all of the text names are in links (a href ...)
            for link in soup.findAll('a'):
                href = link.get('href')
                # Anchors without an href yield None; skip them explicitly
                # instead of relying on a blanket except.
                if href and 'search.irs' in href:
                    return href
        return None
    finally:
        # Release the parse tree on every exit path.
        soup.decompose()
Beispiel #9
0
def extract_train_schedules():
    """Rebuild TempSchedule from the stored HTML of each Raw url not in train_urls."""
    # Empty TempSchedule before re-importing.
    db.query(TempSchedule).delete()
    db.commit()

    total = db.query(func.count(Raw.id)).scalar()
    for index, record in enumerate(db.query(Raw.url)):
        page_url = record.url
        if page_url in train_urls:
            continue

        remaining = total - index
        debug("Processing html of %s (%s of %s. Remaining %s)" % (page_url, index, total, remaining))
        train = db.query(TempTrain).filter_by(url=page_url).first()
        stored = db.query(Raw.html).filter_by(url=page_url).first()
        soup = BeautifulSoup(stored.html)

        # Sanity check: the page text must mention the train's number.
        assert train.number in soup.text
        #assert unicode(train.name, 'utf-8') in soup.text

        timetable = soup.find('table', 'schtable')
        # extract() removes the header row from the table and returns it.
        header_row = timetable.find('tr', 'first-child').extract()
        column_names = []
        for cell in header_row:
            column_names.append(cell.text)

        stops = clean_keys(to_dict(timetable, column_names))

        for stop in stops:
            stop['train_number'] = train.number
            db.add(TempSchedule(**stop))

        update_return_train(train, soup)
        soup.decompose()
        db.commit()
Beispiel #10
0
    def testCreateMangaFilename(self):
        # Build filenames for several pages of a manga work and compare the
        # parsed artist info with the saved JSON detail.
        # Both fixtures are read inside with-blocks so the handles are closed.
        with open('./test/test-image-manga.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        imageInfo = PixivImage(28820443, page)
        imageInfo.imageCount = 100
        page.decompose()
        del page

        # cross check with json value for artist info
        with open('./test/detail-554800.json', 'r') as js_file:
            js = json.load(js_file)

        self.assertEqual(imageInfo.artist.artistId, str(js["user"]["id"]))
        self.assertEqual(imageInfo.artist.artistToken, js["user"]["account"])
        self.assertEqual(imageInfo.artist.artistAvatar, js["user"]["profile_image_urls"]["medium"].replace("_170", ""))

        nameFormat = '%member_token% (%member_id%)\\%urlFilename% %page_number% %works_date_only% %works_res% %works_tools% %title%'

        # (file url, expected filename) pairs, checked in order.
        cases = [
            ('http://i2.pixiv.net/img26/img/ffei/28865189_p0.jpg',
             u'maidoll (554800)\\28865189_p0 001 07/22/12 Multiple images: 2P C82おまけ本 「沙耶は俺の嫁」サンプル.jpg'),
            ('http://i2.pixiv.net/img26/img/ffei/28865189_p14.jpg',
             u'maidoll (554800)\\28865189_p14 015 07/22/12 Multiple images: 2P C82おまけ本 「沙耶は俺の嫁」サンプル.jpg'),
            ('http://i2.pixiv.net/img26/img/ffei/28865189_p921.jpg',
             u'maidoll (554800)\\28865189_p921 922 07/22/12 Multiple images: 2P C82おまけ本 「沙耶は俺の嫁」サンプル.jpg'),
        ]
        for fileUrl, expected in cases:
            result = PixivHelper.makeFilename(nameFormat, imageInfo, artistInfo=None, tagsSeparator=' ', fileUrl=fileUrl)
            self.assertEqual(result, expected)
Beispiel #11
0
    def getImagePage(self, imageId, parent=None, fromBookmark=False,
                     bookmark_count=-1, image_response_count=-1):
        # Fetch the page for imageId and wrap it in the matching PixivImage
        # model. Returns the pair (image, raw response body).
        PixivHelper.GetLogger().debug("Getting image page: {0}".format(imageId))

        if not self._isWhitecube:
            # Classic site: parse the HTML page with BeautifulSoup.
            page_url = "http://www.pixiv.net/member_illust.php?mode=medium&illust_id={0}".format(imageId)
            response = self.open(page_url).read()
            soup = BeautifulSoup(response)
            image = PixivModel.PixivImage(
                imageId, soup, parent, fromBookmark,
                bookmark_count, image_response_count,
                dateFormat=self._config.dateFormat)
            if image.imageMode in ("ugoira_view", "bigNew"):
                image.ParseImages(soup)
            soup.decompose()
            return (image, response)

        # Whitecube: the model is built from the raw RPC response.
        page_url = "https://www.pixiv.net/rpc/whitecube/index.php?mode=work_details_modal_whitecube&id={0}&tt={1}".format(imageId, self._whitecubeToken)
        response = self.open(page_url).read()
        PixivHelper.GetLogger().debug(response)
        image = PixivModelWhiteCube.PixivImage(
            imageId, response, parent, fromBookmark,
            bookmark_count, image_response_count,
            dateFormat=self._config.dateFormat)
        # overwrite artist info
        self.getMemberInfoWhitecube(image.artist.artistId, image.artist)
        return (image, response)
Beispiel #12
0
    def testPixivImageParseInfoPixivPremiumOffer(self):
        # Parse a saved image page from the pixiv-premium fixture and check
        # the extracted metadata.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-parse-image-38826533-pixiv-premium.html',
                  'r') as p:
            page = BeautifulSoup(p.read())
        image2 = PixivImage(38826533, page)
        page.decompose()
        del page

        self.assertEqual(image2.imageId, 38826533)
        self.assertEqual(image2.imageTitle, u"てやり")
        self.assertEqual(image2.imageCaption, u'一応シーダ様です。')

        for tag in (u'R-18', u'FE', u'ファイアーエムブレム', u'シーダ'):
            self.assertIn(tag, image2.imageTags)

        self.assertEqual(image2.imageMode, "big")
        self.assertEqual(image2.worksDate, '9/30/2013 01:43')
        self.assertEqual(image2.worksResolution, '1000x2317')
        self.assertEqual(image2.worksTools, 'CLIP STUDIO PAINT')
        # Disabled rating-counter assertions:
        # self.assertEqual(image2.jd_rtv, 88190)
        # self.assertEqual(image2.jd_rtc, 6711)
        # self.assertEqual(image2.jd_rtt, 66470)
        self.assertEqual(image2.artist.artistToken, 'hvcv')
Beispiel #13
0
    def testCreateMangaFilename(self):
        # Build filenames for several pages of a manga work and compare the
        # parsed artist info with the saved JSON detail.
        # Both fixtures are read inside with-blocks so the handles are closed.
        with open('./test/test-image-manga.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        imageInfo = PixivImage(28820443, page)
        imageInfo.imageCount = 100
        page.decompose()
        del page

        # cross check with json value for artist info
        with open('./test/detail-554800.json', 'r') as js_file:
            js = json.load(js_file)

        self.assertEqual(imageInfo.artist.artistId, str(js["user"]["id"]))
        self.assertEqual(imageInfo.artist.artistToken, js["user"]["account"])
        self.assertEqual(imageInfo.artist.artistAvatar, js["user"]["profile_image_urls"]["medium"].replace("_170", ""))

        nameFormat = '%member_token% (%member_id%)\\%urlFilename% %page_number% %works_date_only% %works_res% %works_tools% %title%'

        # (file url, expected filename) pairs, checked in order.
        cases = [
            ('http://i2.pixiv.net/img26/img/ffei/28865189_p0.jpg',
             u'maidoll (554800)\\28865189_p0 001 07/22/12 Multiple images: 2P C82おまけ本 「沙耶は俺の嫁」サンプル.jpg'),
            ('http://i2.pixiv.net/img26/img/ffei/28865189_p14.jpg',
             u'maidoll (554800)\\28865189_p14 015 07/22/12 Multiple images: 2P C82おまけ本 「沙耶は俺の嫁」サンプル.jpg'),
            ('http://i2.pixiv.net/img26/img/ffei/28865189_p921.jpg',
             u'maidoll (554800)\\28865189_p921 922 07/22/12 Multiple images: 2P C82おまけ本 「沙耶は俺の嫁」サンプル.jpg'),
        ]
        for fileUrl, expected in cases:
            result = PixivHelper.makeFilename(nameFormat, imageInfo, artistInfo=None, tagsSeparator=' ', fileUrl=fileUrl)
            self.assertEqual(result, expected)
Beispiel #14
0
    def testPixivImageParseInfo(self):
        # Parse the saved image-info page and check the extracted metadata.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-info.html', 'r') as p:
            page = BeautifulSoup(p.read())
        image2 = PixivImage(32039274, page)
        page.decompose()
        del page

        self.assertEqual(image2.imageId, 32039274)
        self.assertEqual(image2.imageTitle, u"新しいお姫様")

        for tag in (u'MAYU', u'VOCALOID', u'VOCALOID3', u'うさぎになりたい',
                    u'なにこれかわいい', u'やはり存在する斧', u'ヤンデレ',
                    u'吸いこまれそうな瞳の色'):
            self.assertIn(tag, image2.imageTags)

        self.assertEqual(image2.imageMode, "big")
        self.assertEqual(image2.worksDate, '12-11-2012 00:23')
        self.assertEqual(image2.worksResolution, '642x900')
        self.assertEqual(image2.worksTools, 'Photoshop SAI')
        # Disabled rating-counter assertions:
        #self.assertEqual(image2.jd_rtv, 88190)
        #self.assertEqual(image2.jd_rtc, 6711)
        #self.assertEqual(image2.jd_rtt, 66470)
        self.assertEqual(image2.artist.artistToken, 'nardack')
Beispiel #15
0
    def testPixivImageParseInfoPixivPremiumOffer(self):
        # Parse a saved image page from the pixiv-premium fixture and check
        # the extracted metadata.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-parse-image-38826533-pixiv-premium.html',
                  'r') as p:
            page = BeautifulSoup(p.read())
        image2 = PixivImage(38826533, page)
        page.decompose()
        del page

        self.assertEqual(image2.imageId, 38826533)
        self.assertEqual(image2.imageTitle, u"てやり")
        self.assertEqual(image2.imageCaption, u'一応シーダ様です。')

        for tag in (u'R-18', u'FE', u'ファイアーエムブレム', u'シーダ'):
            self.assertIn(tag, image2.imageTags)

        self.assertEqual(image2.imageMode, "big")
        self.assertEqual(image2.worksDate, '9-30-2013 01:43')
        self.assertEqual(image2.worksResolution, '1000x2317')
        self.assertEqual(image2.worksTools, 'CLIP STUDIO PAINT')
        # Disabled rating-counter assertions:
        #self.assertEqual(image2.jd_rtv, 88190)
        #self.assertEqual(image2.jd_rtc, 6711)
        #self.assertEqual(image2.jd_rtt, 66470)
        self.assertEqual(image2.artist.artistToken, 'hvcv')
Beispiel #16
0
 def testPixivArtistNoImage(self):
     # Member page with no image: PixivArtist must raise PixivException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open("./test/test-noimage.htm", "r") as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException):
         PixivArtist(1233, page)
     page.decompose()
     del page
Beispiel #17
0
 def testPixivImageDeleted(self):
     # Deleted image page: PixivImage must raise PixivException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open("./test/test-image-deleted.htm", "r") as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException):
         PixivImage(123, page)
     page.decompose()
     del page
Beispiel #18
0
 def testPixivArtistNotLoggedIn(self):
     # Not-logged-in member page: PixivArtist must raise PixivException
     # with errorCode 100.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-member-nologin.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         PixivArtist(143229, page)
     self.assertEqual(ex.exception.errorCode, 100)
     page.decompose()
     del page
Beispiel #19
0
 def testPixivArtistDeleted(self):
     # Deleted member page: PixivArtist must raise PixivModelException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-member-deleted.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivModelException):
         PixivArtist(123, page)
     page.decompose()
     del page
Beispiel #20
0
 def testPixivArtistNoImage(self):
     # Member page with no image: PixivArtist must raise PixivModelException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-noimage.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivModelException):
         PixivArtist(363073, page)
     page.decompose()
     del page
Beispiel #21
0
 def testPixivArtistNoMember(self):
     # Non-existent member page: PixivArtist must raise PixivException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-nouser.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException):
         PixivArtist(1, page)
     page.decompose()
     del page
Beispiel #22
0
 def testPixivArtistNoImage(self):
     # Member page with no image: PixivArtist must raise PixivException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-noimage.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException):
         PixivArtist(1233, page)
     page.decompose()
     del page
Beispiel #23
0
 def testPixivArtistNotLoggedIn(self):
     # Not-logged-in member page: PixivArtist must raise PixivException
     # with errorCode 100.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-member-nologin.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         PixivArtist(143229, page)
     self.assertEqual(ex.exception.errorCode, 100)
     page.decompose()
     del page
Beispiel #24
0
 def testPixivImageNoImageEng(self):
     # English "no image" page: PixivImage must raise PixivException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-image-noimage-eng.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException):
         PixivImage(123, page)
     page.decompose()
     del page
Beispiel #25
0
 def testPixivArtistNoMember(self):
     # Non-existent member page: PixivArtist must raise PixivException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-nouser.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException):
         PixivArtist(1, page)
     page.decompose()
     del page
Beispiel #26
0
 def testPixivImageDeleted(self):
     # Deleted image page: PixivImage must raise PixivException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-image-deleted.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException):
         PixivImage(123, page)
     page.decompose()
     del page
Beispiel #27
0
 def testPixivImageNoImageEng(self):
     # English "no image" page: PixivImage must raise PixivException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-image-noimage-eng.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException):
         PixivImage(123, page)
     page.decompose()
     del page
Beispiel #28
0
 def testPixivArtistNoImage(self):
     # Member page with no image: PixivArtist must raise PixivException.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-noimage.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException):
         # The constructor is expected to raise, so its result is not bound.
         PixivArtist(1233, page)
     page.decompose()
     del page
Beispiel #29
0
 def testPixivArtistServerError(self):
     # Server-error page: PixivArtist must raise PixivException with the
     # SERVER_ERROR code.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-server-error.html', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         # The constructor is expected to raise, so its result is not bound.
         PixivArtist(234753, page)
     self.assertEqual(ex.exception.errorCode, PixivException.SERVER_ERROR)
     page.decompose()
     del page
Beispiel #30
0
 def testPixivImageServerError2(self):
     # Generic error page: PixivImage must raise PixivException with the
     # UNKNOWN_IMAGE_ERROR code.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-image-generic-error.html', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         # The constructor is expected to raise, so its result is not bound.
         PixivImage(37882549, page)
     self.assertEqual(ex.exception.errorCode, PixivException.UNKNOWN_IMAGE_ERROR)
     page.decompose()
     del page
Beispiel #31
0
 def testPixivArtistServerError(self):
     # Server-error page: PixivArtist must raise PixivException with the
     # SERVER_ERROR code.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-server-error.html', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         # The constructor is expected to raise, so its result is not bound.
         PixivArtist(234753, page)
     self.assertEqual(ex.exception.errorCode, PixivException.SERVER_ERROR)
     page.decompose()
     del page
Beispiel #32
0
 def testPixivArtistSuspended(self):
     # Suspended member page: PixivArtist must raise PixivException with
     # errorCode 1002.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-member-suspended.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         PixivArtist(123, page)
     self.assertEqual(ex.exception.errorCode, 1002)
     page.decompose()
     del page
Beispiel #33
0
 def testPixivImageServerError(self):
     # Server-error page: PixivImage must raise PixivException with the
     # SERVER_ERROR code.
     # The fixture is read inside a with-block so the file handle is closed.
     with open("./test/test-server-error.html", "r") as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         # The constructor is expected to raise, so its result is not bound.
         PixivImage(9138317, page)
     self.assertEqual(ex.exception.errorCode, PixivException.SERVER_ERROR)
     page.decompose()
     del page
Beispiel #34
0
 def testPixivImageServerError2(self):
     # Generic error page: PixivImage must raise PixivException with the
     # SERVER_ERROR code.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-image-generic-error.html', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         # The constructor is expected to raise, so its result is not bound.
         PixivImage(37882549, page)
     self.assertEqual(ex.exception.errorCode, PixivException.SERVER_ERROR)
     page.decompose()
     del page
Beispiel #35
0
 def testPixivArtistSuspended(self):
     # Suspended member page: PixivArtist must raise PixivException with
     # errorCode 1002.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-member-suspended.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         PixivArtist(123, page)
     self.assertEqual(ex.exception.errorCode, 1002)
     page.decompose()
     del page
Beispiel #36
0
 def testPixivImageServerError(self):
     # Server-error page: PixivImage must raise PixivException with the
     # SERVER_ERROR code.
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-server-error.html', 'r') as p:
         page = BeautifulSoup(p.read())
     with self.assertRaises(PixivException) as ex:
         # The constructor is expected to raise, so its result is not bound.
         PixivImage(9138317, page)
     self.assertEqual(ex.exception.errorCode, PixivException.SERVER_ERROR)
     page.decompose()
     del page
Beispiel #37
0
 def testPixivImageUgoira(self):
     # Ugoira image page: after ParseImages the first image url must
     # contain ".zip".
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-image-ugoira.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     image = PixivImage(46281014, page)
     # Called for its effect on image.imageUrls; the return value is unused.
     image.ParseImages(page)
     self.assertIn(".zip", image.imageUrls[0])
     page.decompose()
     del page
Beispiel #38
0
 def testPixivImageUgoira(self):
     # Ugoira image page: after ParseImages the first image url must
     # contain ".zip".
     # The fixture is read inside a with-block so the file handle is closed.
     with open('./test/test-image-ugoira.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     image = PixivImage(46281014, page)
     # Called for its effect on image.imageUrls; the return value is unused.
     image.ParseImages(page)
     self.assertIn(".zip", image.imageUrls[0])
     page.decompose()
     del page
Beispiel #39
0
    def testPixivImageModeManga(self):
        # Manga image page: the parsed image must report mode 'manga'.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-manga.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        image = PixivImage(28820443, page)
        page.decompose()
        del page

        self.assertNotEqual(image, None)
        self.assertEqual(image.imageId, 28820443)
        self.assertEqual(image.imageMode, 'manga')
Beispiel #40
0
    def testPixivImageModeManga(self):
        # Manga image page: the parsed image must report mode 'manga'.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-manga.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        image = PixivImage(28820443, page)
        page.decompose()
        del page

        self.assertNotEqual(image, None)
        self.assertEqual(image.imageId, 28820443)
        self.assertEqual(image.imageMode, 'manga')
Beispiel #41
0
    def testPixivImageRateCount(self):
        # Rate-count fixture: the parsed image must expose positive
        # jd_rtv and jd_rtc counters.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-rate_count.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        image = PixivImage(28865189, page)
        page.decompose()
        del page

        self.assertNotEqual(image, None)
        self.assertEqual(image.imageId, 28865189)
        self.assertEqual(image.imageMode, 'manga')
        self.assertGreater(image.jd_rtv, 0)
        self.assertGreater(image.jd_rtc, 0)
Beispiel #42
0
    def testPixivImageRateCount(self):
        # Rate-count fixture: the parsed image must expose positive
        # jd_rtv and jd_rtc counters.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-rate_count.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        image = PixivImage(28865189, page)
        page.decompose()
        del page

        self.assertNotEqual(image, None)
        self.assertEqual(image.imageId, 28865189)
        self.assertEqual(image.imageMode, 'manga')
        self.assertGreater(image.jd_rtv, 0)
        self.assertGreater(image.jd_rtc, 0)
Beispiel #43
0
    def testCreateFilenameUnicode(self):
        # Build a filename for an image with a non-ASCII title and tools.
        # The fixture is read inside a with-block so the file handle is closed.
        with open('./test/test-image-unicode.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        imageInfo = PixivImage(2493913, page)
        page.decompose()
        del page

        # The backslash is written as '\\' so the literal has no invalid
        # escape sequence; the string value is unchanged.
        nameFormat = '%member_token% (%member_id%)\\%urlFilename% %works_date_only% %works_res% %works_tools% %title%'
        expected = u'balzehn (267014)\\2493913 12-23-2008 852x1200 Photoshop SAI つけペン アラクネのいる日常2.jpg'
        result = PixivHelper.makeFilename(nameFormat, imageInfo, artistInfo=None, tagsSeparator=' ', fileUrl='http://i2.pixiv.net/img16/img/balzehn/2493913.jpg')
        self.assertEqual(result, expected)
  def testCreateFilenameUnicode(self):
      # Build a filename, including the tag list, for an image with a
      # non-ASCII title, tools and tags.
      # The fixture is read inside a with-block so the file handle is closed.
      with open('./test/test-image-unicode.htm', 'r') as p:
          page = BeautifulSoup(p.read())
      imageInfo = PixivImage(2493913, page)
      page.decompose()
      del page

      # The backslash is written as '\\' so the literal has no invalid
      # escape sequence; the string value is unchanged.
      nameFormat = '%member_token% (%member_id%)\\%urlFilename% %works_date_only% %works_res% %works_tools% %title% - %tags%'
      expected = u'balzehn (267014)\\2493913 12-23-2008 852x1200 Photoshop SAI つけペン アラクネのいる日常2 - R-18 これは萌える アラクネ ツンデレ ピロートークの上手さに定評のある兄弟 モンスター娘 モン娘のいる日常シリーズ 人外 魔物娘 魔界全土喝采.jpg'
      result = PixivHelper.makeFilename(nameFormat, imageInfo, artistInfo=None, tagsSeparator=' ', fileUrl='http://i2.pixiv.net/img16/img/balzehn/2493913.jpg')
      self.assertEqual(result, expected)
    def crawlAppsForCategory(self, url, cat, cat2):
        """Crawl the paginated app listing of one category.

        Pages are fetched from ``url % (cat, pageIndex, self.pageIncrements)``.
        The app URLs of every page are handed to ``extractPermissionsIntoDB``,
        whose return value is treated as the collection of duplicate entries
        found on that page.  The first time a page is (almost) entirely
        duplicates the crawler steps back one page and then advances one
        entry at a time; the second time it stops.

        Args:
            url: Format string taking (category, start index, page size).
            cat: Category identifier substituted into ``url``.
            cat2: Passed through unchanged to ``extractPermissionsIntoDB``.

        Returns:
            False when crawling stopped because of repeated duplicates,
            None when it stopped because of an HTTP error.
        """
        def page_repeated(count):
            # A page counts as a repetition when every entry is a duplicate
            # or the module-level duplicate threshold is reached.
            return count == self.pageIncrements or count >= myThreshold

        pageIndex = 0
        twice = False
        # The opener carries no per-request state, so build it once.
        opener = urllib2.build_opener()

        while True:
            curl = url % (cat, pageIndex, self.pageIncrements)
            try:
                request = urllib2.Request(curl)
                request.add_header("User-Agent", "PermissionCrawler")
                response = opener.open(request)
                try:
                    content = response.read()
                finally:
                    # Release the connection even if reading fails.
                    response.close()

                soup = BeautifulSoup(content)
                try:
                    print(" crawling next %d entries starting with #%d" % (self.pageIncrements, pageIndex + 1))
                    appURLS = self.extractAppUrls(soup)
                    duplicates = self.extractPermissionsIntoDB(appURLS, cat, cat2)
                finally:
                    # Free the parse tree even when extraction raises.
                    soup.decompose()

                if len(duplicates) == 0:
                    pageIndex += self.pageIncrements
                # if we got first full repetition of page 1, go back one page
                # and move on slowly until second full repetition
                elif page_repeated(len(duplicates)) and not twice:
                    print >> sys.stderr, "  ! %d duplicate entries on last iteration" % len(duplicates)
                    pageIndex = max(pageIndex - self.pageIncrements, 0)
                    twice = True
                    # Forget this page's duplicates so the stop check below
                    # does not fire on the very first repetition.
                    duplicates = set()
                elif twice:
                    pageIndex += 1
                # resorting of top n apps may produce 1 or 2 duplicates -
                # ignore low number of duplicates
                else:
                    pageIndex += self.pageIncrements

                if TERMAPP == True:
                    connection.close()
                    sys.exit()

                if page_repeated(len(duplicates)) and twice:
                    print >> sys.stderr, "INFO: stopped crawling categrory %s due to %s duplicates at last iteration twice" % (cat, len(duplicates))
                    return False

            except urllib2.HTTPError as error:
                # elif chain: a 404 must not also fall into the generic
                # "ERROR" branch that belongs to the 403 test.
                if error.code == 404:
                    print >> sys.stderr, "404 ERROR: %s -> %s" % (error, error.url)
                elif error.code == 403:
                    print >> sys.stderr, "403 (NO MORE APP PAGES FOR THIS CATEGORY)ERROR: %s -> %s" % (error, error.url)
                else:
                    print >> sys.stderr, "ERROR: %s" % error
                break
    def getSearchTagPage(self, tags,
                         current_page,
                         wild_card=True,
                         title_caption=False,
                         start_date=None,
                         end_date=None,
                         member_id=None,
                         oldest_first=False,
                         start_page=1):
        """Fetch one page of tag search results and parse it.

        With ``member_id`` set, the member page is fetched through
        ``getMemberPage`` and converted with
        ``PixivModelWhiteCube.PixivTags.parseMemberTags``.  Otherwise the URL
        built by ``PixivHelper.generateSearchTagUrl`` is downloaded and parsed
        with ``PixivModel.PixivTags.parseTags``.

        ``start_page`` is accepted but not used by this method.

        Returns:
            Tuple ``(result, response)``: the populated PixivTags object and
            the response obtained for the page (second element returned by
            ``getMemberPage``, or the body read from ``getPixivPage``).

        Raises:
            Any exception raised by ``parseTags`` is re-raised after the page
            has been dumped with ``PixivHelper.dumpHtml``.
        """
        if member_id is not None:
            # from member id search by tags
            (artist, response) = self.getMemberPage(member_id, current_page, False, tags)

            # convert to PixivTags
            result = PixivModelWhiteCube.PixivTags()
            result.parseMemberTags(artist, member_id, tags)
            return (result, response)

        # search by tags (member_id is None on this path)
        url = PixivHelper.generateSearchTagUrl(tags, current_page,
                                               title_caption,
                                               wild_card,
                                               oldest_first,
                                               start_date,
                                               end_date,
                                               member_id,
                                               self._config.r18mode)

        PixivHelper.print_and_log('info', 'Looping... for ' + url)
        response = self.getPixivPage(url, returnParsed=False).read()
        self.handleDebugTagSearchPage(response, url)

        parse_search_page = BeautifulSoup(response)
        try:
            result = PixivModel.PixivTags()
            try:
                result.parseTags(parse_search_page, tags)
            except BaseException:
                # Keep the offending page for diagnosis, then propagate.
                PixivHelper.dumpHtml("Dump for SearchTags " + tags + ".html", response)
                raise
        finally:
            # Release the parse tree on success and on failure alike.
            parse_search_page.decompose()
            del parse_search_page

        return (result, response)
Beispiel #47
0
    def getSearchTagPage(self, tags,
                         current_page,
                         wild_card=True,
                         title_caption=False,
                         start_date=None,
                         end_date=None,
                         member_id=None,
                         oldest_first=False,
                         start_page=1):
        """Fetch one page of tag search results and parse it.

        With ``member_id`` set, the member page is fetched through
        ``getMemberPage`` and converted with
        ``PixivModelWhiteCube.PixivTags.parseMemberTags``.  Otherwise the URL
        built by ``PixivHelper.generateSearchTagUrl`` is downloaded and parsed
        with ``PixivModel.PixivTags.parseTags``.

        ``start_page`` is accepted but not used by this method.

        Returns:
            Tuple ``(result, response)``: the populated PixivTags object and
            the response obtained for the page (second element returned by
            ``getMemberPage``, or the body read from ``getPixivPage``).

        Raises:
            Any exception raised by ``parseTags`` is re-raised after the page
            has been dumped with ``PixivHelper.dumpHtml``.
        """
        if member_id is not None:
            # from member id search by tags
            (artist, response) = self.getMemberPage(member_id, current_page, False, tags)

            # convert to PixivTags
            result = PixivModelWhiteCube.PixivTags()
            result.parseMemberTags(artist, member_id, tags)
            return (result, response)

        # search by tags (member_id is None on this path)
        url = PixivHelper.generateSearchTagUrl(tags, current_page,
                                               title_caption,
                                               wild_card,
                                               oldest_first,
                                               start_date,
                                               end_date,
                                               member_id,
                                               self._config.r18mode)

        PixivHelper.print_and_log('info', 'Looping... for ' + url)
        response = self.getPixivPage(url, returnParsed=False).read()
        self.handleDebugTagSearchPage(response, url)

        parse_search_page = BeautifulSoup(response)
        try:
            result = PixivModel.PixivTags()
            try:
                result.parseTags(parse_search_page, tags)
            except BaseException:
                # Keep the offending page for diagnosis, then propagate.
                PixivHelper.dumpHtml("Dump for SearchTags " + tags + ".html", response)
                raise
        finally:
            # Release the parse tree on success and on failure alike.
            parse_search_page.decompose()
            del parse_search_page

        return (result, response)
Beispiel #48
0
def parseJs(page):
    """Extract and decode the preload payload embedded in a page.

    The page is decoded as UTF-8, parsed, and searched for the
    ``<meta id="meta-preload-data">`` element; its ``content`` attribute is
    decoded with ``demjson.decode``.

    Args:
        page: Encoded page body; must support ``.decode("utf8")``.

    Returns:
        The decoded payload, or None when the meta element is missing or has
        no / empty ``content`` (possibly an error page).
    """
    parsed = BeautifulSoup(page.decode("utf8"))
    try:
        jss = parsed.find('meta', attrs={'id': 'meta-preload-data'})
        # Read the attribute while the tree is still intact: decompose()
        # destroys the nodes of the tree, so `jss` must not be used afterwards.
        content = jss.get("content") if jss is not None else None
    finally:
        # cleanup
        parsed.decompose()
        del parsed

    if not content:
        return None  # Possibly error page

    return demjson.decode(content)
 def testPixivArtistProfileDataSrc(self):
   """Parse the avatar/name fixture page and check artist id and token."""
   # Context manager closes the fixture file as soon as it has been read.
   with open('./test/test-helper-avatar-name.htm', 'r') as p:
     page = BeautifulSoup(p.read())
   # Pre-bind so a parse failure shows up as a failed assertion below
   # rather than a NameError on an unbound local.
   artist = None
   try:
     artist = PixivArtist(1107124, page)
   except PixivModelException as ex:
     print(ex)
   finally:
     page.decompose()
     del page
   self.assertNotEqual(artist, None)
   self.assertEqual(artist.artistId, 1107124)
   self.assertEqual(artist.artistToken, 'kirabara29')
Beispiel #50
0
    def testPixivImageUnicode(self):
        """Parse the unicode image fixture and check id, mode, date and size."""
        # Context manager closes the fixture file as soon as it has been read.
        with open('./test/test-image-unicode.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        try:
            image = PixivImage(2493913, page)
        finally:
            # Release the parse tree even if the constructor raises.
            page.decompose()
            del page

        self.assertNotEqual(image, None)
        self.assertEqual(image.imageId, 2493913)
        self.assertEqual(image.imageMode, 'big')
        self.assertEqual(image.worksDate, '12/23/08 12:01')
        self.assertEqual(image.worksResolution, '852x1200')
Beispiel #51
0
 def testPixivArtistPage(self):
   """Parse the member page fixture, print its info and check the artist id."""
   # Context manager closes the fixture file as soon as it has been read.
   with open('./test/test.htm', 'r') as p:
     page = BeautifulSoup(p.read())
   # Pre-bind so a parse failure shows up as a failed assertion below
   # rather than a NameError on an unbound local.
   artist = None
   try:
     artist = PixivArtist(363073, page)
     artist.PrintInfo()
   except PixivModelException as ex:
     print(ex)
   finally:
     page.decompose()
     del page
   self.assertNotEqual(artist, None)
   self.assertEqual(artist.artistId, 363073)
Beispiel #52
0
    def testPixivArtistNoAvatar(self):
        """Parse a member page without avatar and check the placeholder image."""
        # Context manager closes the fixture file as soon as it has been read.
        with open('./test/test-member-noavatar.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        artist = None
        try:
            artist = PixivArtist(26357, page)
        finally:
            # Release the parse tree even if the constructor raises.
            page.decompose()
            del page

        self.assertNotEqual(artist, None)
        self.assertEqual(artist.artistId, 26357)
        self.assertEqual(artist.artistToken, 'yukimaruko')
        # Containment test: `find(...) > 0` would wrongly fail for a match at
        # index 0, since find() signals "not found" with -1, not 0.
        self.assertTrue("no_profile.png" in artist.artistAvatar)
Beispiel #53
0
 def testPixivArtistBookmark(self):
   """Parse the member bookmark fixture page and check the artist id."""
   # Context manager closes the fixture file as soon as it has been read.
   with open('./test/test-member-bookmark.htm', 'r') as p:
     page = BeautifulSoup(p.read())
   # Pre-bind so a parse failure shows up as a failed assertion below
   # rather than a NameError on an unbound local.
   artist = None
   try:
     artist = PixivArtist(3281699, page)
   except PixivException as ex:
     print(ex)
   finally:
     page.decompose()
     del page
   self.assertNotEqual(artist, None)
   self.assertEqual(artist.artistId, 3281699)
Beispiel #54
0
 def testPixivArtistProfileDataSrc(self):
   """Parse the avatar/name fixture page and check artist id and token."""
   # Context manager closes the fixture file as soon as it has been read.
   with open('./test/test-helper-avatar-name.htm', 'r') as p:
     page = BeautifulSoup(p.read())
   # Pre-bind so a parse failure shows up as a failed assertion below
   # rather than a NameError on an unbound local.
   artist = None
   try:
     artist = PixivArtist(1107124, page)
   except PixivException as ex:
     print(ex)
   finally:
     page.decompose()
     del page
   self.assertNotEqual(artist, None)
   self.assertEqual(artist.artistId, 1107124)
   self.assertEqual(artist.artistToken, 'kirabara29')
Beispiel #55
0
    def testPixivImageUnicode(self):
        """Parse the unicode image fixture and check id, mode, date and size."""
        # Context manager closes the fixture file as soon as it has been read.
        with open('./test/test-image-unicode.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        try:
            image = PixivImage(2493913, page)
        finally:
            # Release the parse tree even if the constructor raises.
            page.decompose()
            del page

        self.assertNotEqual(image, None)
        self.assertEqual(image.imageId, 2493913)
        self.assertEqual(image.imageMode, 'big')
        self.assertEqual(image.worksDate, '12/23/08 12:01')
        self.assertEqual(image.worksResolution, '852x1200')
Beispiel #56
0
    def testPixivImageParseNoTags(self):
        """Parse an image page without tags and expect an empty tag list."""
        # Context manager closes the fixture file as soon as it has been read.
        with open('./test/test-image-no_tags.htm', 'r') as p:
            page = BeautifulSoup(p.read())
        try:
            image = PixivImage(9175987, page)
        finally:
            # Release the parse tree even if the constructor raises.
            page.decompose()
            del page

        self.assertNotEqual(image, None)
        self.assertEqual(image.imageId, 9175987)
        self.assertEqual(image.worksDate, '03/05/10 18:04')
        self.assertEqual(image.worksResolution, '1155x768')
        # self.assertEqual(image.worksTools, u'SAI')
        self.assertEqual(image.imageTags, [])
Beispiel #57
0
 def testPixivArtistNoAvatar(self):
   """Parse a member page without avatar and check artist id and token."""
   # Context manager closes the fixture file as soon as it has been read.
   with open('./test/test-member-noavatar.htm', 'r') as p:
     page = BeautifulSoup(p.read())
   # Pre-bind so a parse failure shows up as a failed assertion below
   # rather than a NameError on an unbound local.
   artist = None
   try:
     artist = PixivArtist(26357, page)
   except PixivException as ex:
     print(ex)
   finally:
     page.decompose()
     del page
   self.assertNotEqual(artist, None)
   self.assertEqual(artist.artistId, 26357)
   self.assertEqual(artist.artistToken, 'yukimaruko')
Beispiel #58
0
 def testPixivImageModeManga(self):
     """Parse the manga image fixture and check the image id and mode."""
     # Context manager closes the fixture file as soon as it has been read.
     with open('./test/test-image-manga.htm', 'r') as p:
         page = BeautifulSoup(p.read())
     # Pre-bind so a parse failure shows up as a failed assertion below
     # rather than a NameError on an unbound local.
     image = None
     try:
         image = PixivImage(28820443, page)
     except PixivException as ex:
         print(ex)
     finally:
         page.decompose()
         del page
     self.assertNotEqual(image, None)
     self.assertEqual(image.imageId, 28820443)
     self.assertEqual(image.imageMode, 'manga')
Beispiel #59
0
 def testPixivImageNoAvatar(self):
     # print('\nTesting artist page without avatar image')
     p = open('./test/test-image-noavatar.htm', 'r')
     page = BeautifulSoup(p.read())
     image = PixivImage(20496355, page)
     page.decompose()
     del page
     # self.assertNotEqual(image, None)
     self.assertEqual(image.artist.artistToken, 'iymt')
     self.assertEqual(image.imageId, 20496355)
     # 07/22/2011 03:09|512×600|RETAS STUDIO&nbsp;
     # print(image.worksDate, image.worksResolution, image.worksTools)
     self.assertEqual(image.worksDate, '07/21/11 18:09')
     self.assertEqual(image.worksResolution, '512x600')