Beispiel #1
0
    def process_url(self, url):
        """Import every club listed at ``url`` and on all following pages.

        Each ``.grpl .grpl-grp`` entry of a listing page is parsed into a
        name, image URL, group type, description and contact email. The club
        is looked up case-insensitively by name and, failing that, by the
        slugified name used as its code; it is then created or updated. Only
        blank ``name``, ``description`` and ``email`` fields are overwritten.
        Pages are followed through their "Next >" link until a page has no
        such link.

        With ``self.dry_run`` set, nothing is saved and no image is probed or
        downloaded. With ``self.skip_tags`` set, no tag is created or
        assigned. ``self.count`` and ``self.club_count`` are incremented per
        page and per club respectively.

        Args:
            url: Address of the first listing page to process.

        Raises:
            CommandError: If more than one club matches a scraped name.

        HTTP errors reported by ``raise_for_status()`` for a listing page or
        for a newly downloaded image propagate to the caller.
        """
        # Iterate instead of recursing once per page, so a long listing cannot
        # exhaust the interpreter's recursion limit.
        while True:
            self.stdout.write("Processing Page {}".format(self.count))
            self.count += 1
            page = self.session.get(url)
            page.raise_for_status()

            soup = BeautifulSoup(page.content, "html.parser")
            for grp in soup.select(".grpl .grpl-grp"):
                name = grp.select_one("h3 a").text.strip()

                image_url = urljoin(url, grp.select_one("img")["src"]).strip()
                if image_url.endswith("/group_img.png"):
                    # presumably the site's default placeholder image
                    image_url = None

                group_tag = grp.select_one(".grpl-type")
                if group_tag is not None:
                    group_type = group_tag.text.strip()
                else:
                    group_type = None

                description = grp.select_one(".grpl-purpose").text.replace(
                    "\r\n", "\n").strip()
                if description == "This group has not written a purpose":
                    description = ""
                else:
                    description = clean(description)

                contact_tag = grp.select_one(".grpl-contact")
                if contact_tag is not None:
                    contact_email = contact_tag.text.strip()
                else:
                    contact_email = None

                if (group_type is not None and not self.dry_run
                        and not self.skip_tags):
                    tag = Tag.objects.get_or_create(name=group_type)[0]
                else:
                    tag = None

                clubs = Club.objects.filter(name__iexact=name)
                if clubs.exists():
                    if clubs.count() > 1:
                        raise CommandError(
                            "Club with name '{}' exists twice!".format(name))
                    club = clubs.first()
                    created = False
                else:
                    code = slugify(name)
                    if not self.dry_run:
                        club, created = Club.objects.get_or_create(code=code)
                    elif Club.objects.filter(code=code).exists():
                        club = Club.objects.get(code=code)
                        created = False
                    else:
                        # dry run: build an unsaved instance for reporting
                        club = Club(code=code)
                        created = True

                # only overwrite blank fields
                if not club.name:
                    club.name = name
                if not club.description:
                    club.description = description

                # Only replace an image that is missing or whose link is
                # broken. The probe targets the image the club already has;
                # a non-HTTP (local) URL is assumed to exist.
                use_image = False
                if image_url and not self.dry_run:
                    if not club.image:
                        use_image = True
                    elif club.image.url.startswith("http"):
                        try:
                            probe = requests.head(club.image.url,
                                                  allow_redirects=True)
                        except requests.RequestException:
                            # an unreachable host counts as a broken link
                            use_image = True
                        else:
                            use_image = not probe.ok
                    if use_image:
                        download = requests.get(image_url,
                                                allow_redirects=True)
                        download.raise_for_status()
                        club.image.save(os.path.basename(image_url),
                                        ContentFile(download.content))

                if not club.email:
                    club.email = contact_email

                # mark newly created clubs as inactive (has no owner)
                if created:
                    club.active = False
                if not self.dry_run:
                    club.save()
                    # never replace tags that were already assigned
                    if tag is not None and not club.tags.count():
                        club.tags.set([tag])
                self.club_count += 1
                self.stdout.write("{} '{}' (image: {})".format(
                    "Created" if created else "Updated", name, use_image))

            # Follow the pagination link; stop when the label is missing or is
            # not wrapped in a usable anchor.
            next_tag = soup.find(text="Next >")
            anchor = next_tag.find_parent("a") if next_tag is not None else None
            next_link = anchor.get("href") if anchor is not None else None
            if not next_link:
                return
            url = url.split("?", 1)[0] + next_link
Beispiel #2
0
    def process_url(self, url):
        """Import every club listed at ``url`` and on all following pages.

        Each ``.grpl .grpl-grp`` entry of a listing page is parsed into a
        name, image URL, group type, description and contact email. An
        existing club is located with ``fuzzy_lookup_club(name)``; otherwise
        a new ``Club`` is built whose code is the slugified name with a
        trailing parenthesised suffix removed. Only blank ``name``,
        ``description`` and ``email`` fields are overwritten. Pages are
        followed through their "Next >" link until a page has no such link.

        With ``self.dry_run`` set, nothing is saved and no image is probed or
        downloaded. With ``self.skip_tags`` set, no tag is created or
        assigned. With ``self.create_only`` set, clubs that already exist are
        skipped and counted in ``self.ignore_count``. ``self.count``,
        ``self.club_count``, ``self.create_count`` and ``self.update_count``
        are incremented as pages and clubs are processed.

        Args:
            url: Address of the first listing page to process.

        HTTP errors reported by ``raise_for_status()`` for a listing page or
        for a newly downloaded image propagate to the caller.
        """
        # Iterate instead of recursing once per page, so a long listing cannot
        # exhaust the interpreter's recursion limit.
        while True:
            self.stdout.write(f"Processing Page {self.count}")
            self.count += 1
            page = self.session.get(url)
            page.raise_for_status()

            soup = BeautifulSoup(page.content, "html.parser")
            for grp in soup.select(".grpl .grpl-grp"):
                # parse name
                name = grp.select_one("h3 a").text.strip()

                # parse image url
                image_url = urljoin(url, grp.select_one("img")["src"]).strip()
                if image_url.endswith("/group_img.png"):
                    # presumably the site's default placeholder image
                    image_url = None

                # parse tag
                group_tag = grp.select_one(".grpl-type")
                if group_tag is not None:
                    group_type = group_tag.text.strip()
                else:
                    group_type = None

                # parse description
                description = grp.select_one(".grpl-purpose").text.replace(
                    "\r\n", "\n").strip()
                if description == "This group has not written a purpose":
                    description = ""
                else:
                    description = clean(description)

                # parse email contact
                contact_tag = grp.select_one(".grpl-contact")
                if contact_tag is not None:
                    contact_email = contact_tag.text.strip()
                else:
                    contact_email = None

                # create or update tag
                if (group_type is not None and not self.dry_run
                        and not self.skip_tags):
                    tag = Tag.objects.get_or_create(name=group_type)[0]
                else:
                    tag = None

                # don't include parentheses content in code
                slug_name = re.sub(r"\(.+?\)$", "", name).strip()

                # create or update club
                club = fuzzy_lookup_club(name)
                created = club is None
                if created:
                    club = Club(code=slugify(slug_name))

                if not created and self.create_only:
                    self.ignore_count += 1
                    self.stdout.write(f"Ignoring {name}, club already exists")
                    continue

                # only overwrite blank fields
                if not club.name:
                    club.name = name
                if not club.description:
                    club.description = description

                # only update image if existing image is nonexistent/broken
                # link; if image is local and set, assume that it exists
                use_image = False
                if image_url and self.dry_run:
                    # dry run only reports whether the club lacks an image;
                    # no network request is made
                    use_image = not bool(club.image)
                elif image_url:
                    if not club.image:
                        use_image = True
                    elif club.image.url.startswith("http"):
                        try:
                            probe = requests.head(club.image.url,
                                                  allow_redirects=True)
                        except requests.RequestException:
                            # an unreachable host counts as a broken link
                            use_image = True
                        else:
                            use_image = not probe.ok

                    if use_image:
                        download = requests.get(image_url,
                                                allow_redirects=True)
                        download.raise_for_status()
                        # save=False: store the file now and let the single
                        # club.save() below persist the row, so a new club is
                        # first written with active=False inside the
                        # transaction.
                        club.image.save(os.path.basename(image_url),
                                        ContentFile(download.content),
                                        save=False)

                # update email if there is no email
                if not club.email:
                    club.email = contact_email

                # mark newly created clubs as inactive (has no owner)
                if created:
                    club.active = False

                if not self.dry_run:
                    with transaction.atomic():
                        club.save()
                        # never replace tags that were already assigned
                        if tag is not None and not club.tags.count():
                            club.tags.set([tag])

                self.club_count += 1
                action_verb = "Created" if created else "Updated"
                out_string = f"{action_verb} '{name}' (image: {use_image})"
                if created:
                    self.stdout.write(self.style.SUCCESS(out_string))
                    self.create_count += 1
                else:
                    self.stdout.write(out_string)
                    self.update_count += 1

            # Follow the pagination link; stop when the label is missing or is
            # not wrapped in a usable anchor.
            next_tag = soup.find(text="Next >")
            anchor = next_tag.find_parent("a") if next_tag is not None else None
            next_link = anchor.get("href") if anchor is not None else None
            if not next_link:
                return
            url = url.split("?", 1)[0] + next_link