Beispiel #1
0
    def handle(self, *args, **options):
        """Rebuild the CardSet table from the Bulbapedia card set list.

        Deletes every existing CardSet, downloads the set list page and, for
        each table row after the first, tries to store one "GB" and one "JP"
        CardSet. Progress lines and the final CardSet count are written to
        ``self.stdout``.
        """
        self.stdout.write("start scraping from Bulbapedia...\n")
        # Raw string: "\d" inside a plain literal is an invalid escape
        # sequence; the compiled pattern itself is unchanged.
        count_re = re.compile(r"official (count|total) (?P<count>\d+)")

        CardSet.objects.all().delete()

        response = urlopen(BASE_URL.format(CARDSET_PAGE))
        try:
            html = json.load(response)['parse']['text']['*']
        finally:
            # Close the HTTP response as soon as the payload is parsed.
            response.close()
        rows = BeautifulSoup(html).find("table").find("table").find_all(
            "tr")[1:]

        for row in rows:
            cells = row.find_all("td")
            # "GB" set: number in the row's <th>; name, count and release
            # date in columns 2, 4 and 6.
            self._scrape_set(count_re, cells, "GB", row.find("th"),
                             name_idx=2, count_idx=4, release_idx=6)
            # "JP" set: number in column 0; name, count and release date in
            # columns 3, 5 and 7.
            self._scrape_set(count_re, cells, "JP", cells[0],
                             name_idx=3, count_idx=5, release_idx=7)

        self.stdout.write("total sets {0}\n".format(
            str(CardSet.objects.all().count())))

    def _scrape_set(self, count_re, cells, country, set_no_cell, name_idx,
                    count_idx, release_idx):
        """Build and save one CardSet for ``country`` from a table row.

        ``cells`` are the row's <td> elements, ``set_no_cell`` is the element
        whose text holds the set number, and the ``*_idx`` arguments select
        the columns for name, official count and release date. The set is
        skipped (nothing is saved by this method's final ``save()``) when a
        ValueError is raised, e.g. because the set number, the release date
        or the plain-text count cannot be parsed.
        """
        try:
            card_set = CardSet(country=country)
            card_set.set_no = int(set_no_cell.text.strip())
            card_set.name = cells[name_idx].text.strip()
            self._save_logo(card_set, cells[1])
            try:
                # Both variants take the link title from column 2.
                card_set.partial_url = cells[2].a['title']
            except TypeError:
                # No <a> in the cell: leave partial_url unset.
                pass
            card_set.release = datetime.strptime(
                cells[release_idx].text.strip(), "%B %d, %Y")

            count_cell = cells[count_idx]
            explain = count_cell.find("span", "explain")
            try:
                # Prefer the number embedded in the span's title text.
                card_set.official_count = int(
                    count_re.search(explain['title']).group("count"))
            except (TypeError, AttributeError):
                # No span, or its title does not match: fall back to the
                # cell's plain text when it is not empty.
                count_text = count_cell.text.strip()
                if count_text != "":
                    card_set.official_count = int(count_text)

            self.stdout.write("scraped: {0}\n".format(card_set))
            card_set.save()
        except ValueError:
            # If there is no valid set_no (or another value fails to parse)
            pass

    def _save_logo(self, card_set, logo_cell):
        """Download the logo linked in ``logo_cell`` and attach it to the set.

        Best effort: a missing link/image (or any other TypeError or
        AttributeError raised along the way) leaves the set without a logo.
        """
        try:
            logo_url = logo_cell.a.img['src']
            response = urlopen(logo_url)
            try:
                logo_bytes = response.read()
            finally:
                response.close()
            logo_ext = urlparse(logo_url).path.split('.')[-1]
            logo_filename = "{0}_{1}.{2}".format(
                card_set.country, card_set.set_no, logo_ext)
            # A fresh temporary file per logo: reusing one file for several
            # sets appends each download to the bytes already written, and
            # the `with` block guarantees the file is closed afterwards.
            with NamedTemporaryFile() as logo_temp:
                logo_temp.write(logo_bytes)
                logo_temp.flush()
                # NOTE(review): if `logo` is a Django FileField, this call
                # also saves the model instance (save=True is the default)
                # before release/official_count are set - confirm intended.
                card_set.logo.save(logo_filename, File(logo_temp))
        except (TypeError, AttributeError):
            pass