def process_url(self, url):
    """
    Scrape a paginated group listing starting at ``url`` and import each group as a club.

    Every ``.grpl .grpl-grp`` element on a page is parsed for a name, image, type,
    purpose and contact. The club is matched by case-insensitive name, then by the
    slugified name as its code, and is created if neither matches. Only blank fields
    of an existing club are filled in. Pages are followed through their "Next >" link
    until no such link is present.

    When ``self.dry_run`` is set, no tags or clubs are created or saved and no images
    are downloaded. ``self.count`` is incremented once per page and
    ``self.club_count`` once per group.

    :param url: URL of the first listing page to process.
    :raises CommandError: if more than one club matches a scraped name.

    Errors raised by ``raise_for_status()`` on a listing page or an image download
    are propagated to the caller.
    """
    # Iterate over pages instead of recursing so that long listings neither deepen
    # the call stack nor keep every previous page's parse tree alive.
    while True:
        self.stdout.write(f"Processing Page {self.count}")
        self.count += 1
        resp = self.session.get(url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        for grp in soup.select(".grpl .grpl-grp"):
            # parse name
            name = grp.select_one("h3 a").text.strip()

            # parse image url; a group without a usable <img> simply has no image
            image_url = None
            img_tag = grp.select_one("img")
            if img_tag is not None and img_tag.get("src"):
                image_url = urljoin(url, img_tag["src"]).strip()
                # the site's placeholder picture is not worth importing
                if image_url.endswith("/group_img.png"):
                    image_url = None

            # parse group type
            group_tag = grp.select_one(".grpl-type")
            group_type = group_tag.text.strip() if group_tag is not None else None

            # parse description; a missing purpose element is treated as blank
            purpose_tag = grp.select_one(".grpl-purpose")
            if purpose_tag is None:
                description = ""
            else:
                description = purpose_tag.text.replace("\r\n", "\n").strip()
            if description == "This group has not written a purpose":
                description = ""
            else:
                description = clean(description)

            # parse email contact
            contact_tag = grp.select_one(".grpl-contact")
            contact_email = contact_tag.text.strip() if contact_tag is not None else None

            # create or fetch the tag (never in dry-run mode or when tags are skipped)
            tag = None
            if group_type is not None and not self.dry_run and not self.skip_tags:
                tag, _ = Tag.objects.get_or_create(name=group_type)

            # Look the club up by name. Fetching at most two rows is enough to tell
            # "none", "exactly one" and "ambiguous" apart with a single query.
            matches = list(Club.objects.filter(name__iexact=name)[:2])
            if len(matches) > 1:
                raise CommandError(f"Club with name '{name}' exists twice!")
            if matches:
                club = matches[0]
                created = False
            else:
                code = slugify(name)
                if not self.dry_run:
                    club, created = Club.objects.get_or_create(code=code)
                else:
                    # dry run: never insert, only build an unsaved instance if needed
                    club = Club.objects.filter(code=code).first()
                    created = club is None
                    if created:
                        club = Club(code=code)

            # only overwrite blank fields
            if not club.name:
                club.name = name
            if not club.description:
                description_is_blank = True
                club.description = description

            # Only import the scraped image when the club has no image, or when its
            # existing remote image is a broken link. An existing image whose URL is
            # not absolute http(s) is assumed to be present.
            use_image = False
            if image_url:
                if self.dry_run:
                    # report what would happen without touching the network
                    use_image = not club.image
                else:
                    if club.image:
                        existing_url = club.image.url
                        if existing_url.startswith("http"):
                            head = requests.head(existing_url, allow_redirects=True)
                            use_image = not head.ok
                    else:
                        use_image = True

                    if use_image:
                        resp = requests.get(image_url, allow_redirects=True)
                        resp.raise_for_status()
                        club.image.save(
                            os.path.basename(image_url), ContentFile(resp.content)
                        )

            # update email if there is no email
            if not club.email:
                club.email = contact_email

            # mark newly created clubs as inactive (has no owner)
            if created:
                club.active = False

            if not self.dry_run:
                club.save()
                # only assign the scraped tag to clubs that have no tags yet
                if tag is not None and not club.tags.count():
                    club.tags.set([tag])

            self.club_count += 1
            action_verb = "Created" if created else "Updated"
            self.stdout.write(f"{action_verb} '{name}' (image: {use_image})")

        # Follow the "Next >" link. The text may be missing (last page) or present
        # without an enclosing link, and both cases end the pagination.
        next_tag = soup.find(text="Next >")
        next_anchor = next_tag.find_parent("a") if next_tag is not None else None
        if next_anchor is None or not next_anchor.get("href"):
            break
        # the href is appended to the page URL with its query string removed
        url = url.split("?", 1)[0] + next_anchor["href"]
def process_url(self, url):
    """
    Scrape a paginated group listing starting at ``url`` and import each group as a club.

    Every ``.grpl .grpl-grp`` element on a page is parsed for a name, image, type,
    purpose and contact. The club is looked up with ``fuzzy_lookup_club(name)`` and,
    when nothing is found, a new ``Club`` is built whose code is the slugified name
    with any trailing parenthesised text removed. Only blank fields of an existing
    club are filled in. Pages are followed through their "Next >" link until no such
    link is present.

    When ``self.dry_run`` is set, no tags are created, no clubs are saved and no
    images are downloaded. When ``self.create_only`` is set, existing clubs are
    skipped and counted in ``self.ignore_count``. ``self.count`` is incremented once
    per page; ``self.club_count`` and one of ``self.create_count`` /
    ``self.update_count`` once per processed group.

    :param url: URL of the first listing page to process.

    Errors raised by ``raise_for_status()`` on a listing page or an image download
    are propagated to the caller.
    """
    # Iterate over pages instead of recursing so that long listings neither deepen
    # the call stack nor keep every previous page's parse tree alive.
    while True:
        self.stdout.write(f"Processing Page {self.count}")
        self.count += 1
        resp = self.session.get(url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        for grp in soup.select(".grpl .grpl-grp"):
            # parse name
            name = grp.select_one("h3 a").text.strip()

            # parse image url; a group without a usable <img> simply has no image
            image_url = None
            img_tag = grp.select_one("img")
            if img_tag is not None and img_tag.get("src"):
                image_url = urljoin(url, img_tag["src"]).strip()
                # the site's placeholder picture is not worth importing
                if image_url.endswith("/group_img.png"):
                    image_url = None

            # parse tag
            group_tag = grp.select_one(".grpl-type")
            group_type = group_tag.text.strip() if group_tag is not None else None

            # parse description; a missing purpose element is treated as blank
            purpose_tag = grp.select_one(".grpl-purpose")
            if purpose_tag is None:
                description = ""
            else:
                description = purpose_tag.text.replace("\r\n", "\n").strip()
            if description == "This group has not written a purpose":
                description = ""
            else:
                description = clean(description)

            # parse email contact
            contact_tag = grp.select_one(".grpl-contact")
            contact_email = contact_tag.text.strip() if contact_tag is not None else None

            # create or fetch the tag (never in dry-run mode or when tags are skipped)
            tag = None
            if group_type is not None and not self.dry_run and not self.skip_tags:
                tag, _ = Tag.objects.get_or_create(name=group_type)

            # create or update club
            club = fuzzy_lookup_club(name)
            created = club is None
            if created:
                # don't include parentheses content in code
                slug_name = re.sub(r"\(.+?\)$", "", name).strip()
                club = Club(code=slugify(slug_name))
            elif self.create_only:
                self.ignore_count += 1
                self.stdout.write(f"Ignoring {name}, club already exists")
                continue

            # only overwrite blank fields
            if not club.name:
                club.name = name
            if not club.description:
                club.description = description

            # only update image if existing image is nonexistent/broken link
            # if image is local and set, assume that it exists
            use_image = False
            if image_url:
                if self.dry_run:
                    # report what would happen without touching the network
                    use_image = not bool(club.image)
                else:
                    if club.image:
                        existing_url = club.image.url
                        if existing_url.startswith("http"):
                            head = requests.head(existing_url, allow_redirects=True)
                            use_image = not head.ok
                    else:
                        use_image = True

                    if use_image:
                        resp = requests.get(image_url, allow_redirects=True)
                        resp.raise_for_status()
                        club.image.save(
                            os.path.basename(image_url), ContentFile(resp.content)
                        )

            # update email if there is no email
            if not club.email:
                club.email = contact_email

            # mark newly created clubs as inactive (has no owner)
            if created:
                club.active = False

            if not self.dry_run:
                # save the club and its tag assignment together
                with transaction.atomic():
                    club.save()
                    # only assign the scraped tag to clubs that have no tags yet
                    if tag is not None and not club.tags.count():
                        club.tags.set([tag])

            self.club_count += 1
            action_verb = "Created" if created else "Updated"
            out_string = f"{action_verb} '{name}' (image: {use_image})"
            if created:
                self.stdout.write(self.style.SUCCESS(out_string))
                self.create_count += 1
            else:
                self.stdout.write(out_string)
                self.update_count += 1

        # Follow the "Next >" link. The text may be missing (last page) or present
        # without an enclosing link, and both cases end the pagination.
        next_tag = soup.find(text="Next >")
        next_anchor = next_tag.find_parent("a") if next_tag is not None else None
        if next_anchor is None or not next_anchor.get("href"):
            break
        # the href is appended to the page URL with its query string removed
        url = url.split("?", 1)[0] + next_anchor["href"]