def handle(self, *args, **kwargs):
    for club in Club.objects.all():
        if club.description:
            # extract testimonials by finding the string "From members of <club>:"
            # the description is before this string and the testimonials are after it
            match = re.match(
                r"(.*)From members of ([^:]+)[:;](.*)",
                club.description,
                re.M | re.I | re.S,
            )
            if match is not None:
                desc, _, testimonials = match.groups()

                # remove testimonials from description
                club.description = clean(desc)
                club.save()

                # save testimonials
                count = 0
                testimonials = bleach.clean(testimonials, strip=True, tags=[])
                # find text enclosed in quotes followed by a newline,
                # except for the last quote, which is at the end of the string
                for testimonial in re.findall(
                    r'"(.*?)"(?:\r?\n|$)', testimonials, re.M | re.I | re.S
                ):
                    text = testimonial.strip()
                    Testimonial.objects.create(club=club, text=text)
                    count += 1

                self.stdout.write(
                    self.style.SUCCESS(
                        "Extracted {} testimonial(s) from {}".format(count, club.code)
                    )
                )
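
# A minimal, standalone sketch of how the testimonial regex above behaves.
# The sample description and quotes below are illustrative only, not real data.
import re

sample = (
    "We build things.\n"
    'From members of Example Club: "Great community!"\n"Learned a lot."'
)
match = re.match(r"(.*)From members of ([^:]+)[:;](.*)", sample, re.M | re.I | re.S)
if match:
    desc, _, testimonials = match.groups()
    print(desc.strip())  # -> We build things.
    # each quoted block followed by a newline (or the end of the string)
    # becomes one testimonial
    print(re.findall(r'"(.*?)"(?:\r?\n|$)', testimonials, re.M | re.I | re.S))
    # -> ['Great community!', 'Learned a lot.']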
def fix_clubs(self):
    self.clubs_to_scrape = []
    self.process_url(self.START_URL)
    for club, url in self.clubs_to_scrape:
        desc = self.extract_club_desc(url)
        if desc is None:
            continue
        club.description = clean(desc.text.strip())
        if not self.dry_run:
            club.save()
        self.stdout.write(f"Fixing club {club.name}.")
        self.club_count += 1
    self.stdout.write(f"Updated {self.club_count} clubs!")
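
# extract_club_desc is referenced above but not shown in this section. A
# hypothetical sketch of what it might look like, assuming the same
# ".grpl-purpose" markup that process_url parses below; this is an assumption
# for illustration, not the repository's actual implementation:
def extract_club_desc(self, url):
    # fetch the club detail page and return the purpose element,
    # or None if the page does not contain one (hypothetical helper)
    resp = self.session.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")
    return soup.select_one(".grpl-purpose")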
def add_ics_events(self):
    """
    Fetch the ICS events from the club's calendar URL
    and return the number of modified events.
    """
    # random but consistent uuid used to generate uuid5s from invalid uuids
    ics_import_uuid_namespace = uuid.UUID("8f37c140-3775-42e8-91d4-fda7a2e44152")

    extractor = URLExtract()

    url = self.ics_import_url
    if url:
        calendar = Calendar(requests.get(url).text)
        event_list = Event.objects.filter(is_ics_event=True, club=self)
        modified_events = []
        for event in calendar.events:
            tries = [
                Event.objects.filter(
                    club=self,
                    start_time=event.begin.datetime,
                    end_time=event.end.datetime,
                ).first(),
                Event(),
            ]

            # try matching using uuid if it is valid
            if event.uid:
                try:
                    event_uuid = uuid.UUID(event.uid[:36])
                except ValueError:
                    # generate uuid from malformed/invalid uuids
                    event_uuid = uuid.uuid5(ics_import_uuid_namespace, event.uid)
                tries.insert(0, Event.objects.filter(ics_uuid=event_uuid).first())
            else:
                event_uuid = None

            for ev in tries:
                if ev:
                    ev.club = self
                    ev.name = event.name.strip()
                    ev.start_time = event.begin.datetime
                    ev.end_time = event.end.datetime
                    ev.description = clean(event.description.strip())
                    ev.location = event.location
                    ev.is_ics_event = True

                    # very simple type detection, only perform on first time
                    if ev.pk is None:
                        ev.type = Event.OTHER
                        for val, lbl in Event.TYPES:
                            if val in {Event.FAIR}:
                                continue
                            if (
                                lbl.lower() in ev.name.lower()
                                or lbl.lower() in ev.description.lower()
                            ):
                                ev.type = val
                                break

                    # extract urls from description
                    if ev.description:
                        urls = extractor.find_urls(ev.description)
                        urls.sort(
                            key=lambda u: any(
                                domain in u
                                for domain in {
                                    "zoom.us",
                                    "bluejeans.com",
                                    "hangouts.google.com",
                                }
                            ),
                            reverse=True,
                        )
                        if urls:
                            ev.url = urls[0]

                    # extract url from url or location
                    if event.url:
                        ev.url = event.url
                    elif ev.location:
                        location_urls = extractor.find_urls(ev.location)
                        if location_urls:
                            ev.url = location_urls[0]

                    # format url properly with schema
                    if ev.url:
                        parsed = urlparse(ev.url)
                        if not parsed.netloc:
                            parsed = parsed._replace(netloc=parsed.path, path="")
                        if not parsed.scheme:
                            parsed = parsed._replace(scheme="https")
                        ev.url = parsed.geturl()

                    # add uuid if it exists, otherwise will be autogenerated
                    if event_uuid:
                        ev.ics_uuid = event_uuid

                    # ensure length limits are met before saving
                    if ev.location:
                        ev.location = ev.location[:255]
                    if ev.name:
                        ev.name = ev.name[:255]
                    if ev.code:
                        ev.code = ev.code[:255]
                    if ev.url:
                        ev.url = ev.url[:2048]

                    ev.save()
                    modified_events.append(ev)
                    break

        event_list.exclude(pk__in=[e.pk for e in modified_events]).delete()
        return len(modified_events)
    return 0
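
# A small standalone check of the scheme/netloc fix-up above: a bare link
# such as "zoom.us/j/123" parses with its host in `path`, so the code moves
# it into `netloc` and defaults the scheme to https. The URL is illustrative.
from urllib.parse import urlparse

parsed = urlparse("zoom.us/j/123")
if not parsed.netloc:
    parsed = parsed._replace(netloc=parsed.path, path="")
if not parsed.scheme:
    parsed = parsed._replace(scheme="https")
print(parsed.geturl())  # -> https://zoom.us/j/123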
def process_url(self, url): self.stdout.write("Processing Page {}".format(self.count)) self.count += 1 resp = self.session.get(url) resp.raise_for_status() soup = BeautifulSoup(resp.content, "html.parser") grps = soup.select(".grpl .grpl-grp") for grp in grps: name = grp.select_one("h3 a").text.strip() image_url = urljoin(url, grp.select_one("img")["src"]).strip() if image_url.endswith("/group_img.png"): image_url = None group_tag = grp.select_one(".grpl-type") if group_tag is not None: group_type = group_tag.text.strip() else: group_type = None description = grp.select_one(".grpl-purpose").text.replace( "\r\n", "\n").strip() if description == "This group has not written a purpose": description = "" else: description = clean(description) contact_tag = grp.select_one(".grpl-contact") if contact_tag is not None: contact_email = contact_tag.text.strip() else: contact_email = None if group_type is not None and not self.dry_run and not self.skip_tags: tag = Tag.objects.get_or_create(name=group_type)[0] else: tag = None clubs = Club.objects.filter(name__iexact=name) if clubs.exists(): if clubs.count() > 1: raise CommandError( "Club with name '{}' exists twice!".format(name)) club = clubs.first() flag = False else: code = slugify(name) if not self.dry_run: club, flag = Club.objects.get_or_create(code=code) elif Club.objects.filter(code=code).exists(): club = Club.objects.get(code=code) flag = False else: club = Club(code=code) flag = True # only overwrite blank fields if not club.name: club.name = name if not club.description: club.description = description use_image = False if image_url: if not self.dry_run: if club.image: resp = requests.head(image_url, allow_redirects=True) use_image = not resp.ok else: use_image = True if use_image: resp = requests.get(image_url, allow_redirects=True) resp.raise_for_status() club.image.save(os.path.basename(image_url), ContentFile(resp.content)) if not club.email: club.email = contact_email # mark newly created clubs as inactive (has no owner) if flag: club.active = False if not self.dry_run: club.save() if tag is not None and not club.tags.count(): club.tags.set([tag]) self.club_count += 1 self.stdout.write("{} '{}' (image: {})".format( "Created" if flag else "Updated", name, use_image)) next_tag = soup.find(text="Next >") if next_tag is not None: next_link = next_tag.find_parent("a")["href"] next_url = url.split("?", 1)[0] + next_link self.process_url(next_url)
def process_url(self, url): self.stdout.write(f"Processing Page {self.count}") self.count += 1 resp = self.session.get(url) resp.raise_for_status() soup = BeautifulSoup(resp.content, "html.parser") grps = soup.select(".grpl .grpl-grp") for grp in grps: # parse name name = grp.select_one("h3 a").text.strip() # parse image url image_url = urljoin(url, grp.select_one("img")["src"]).strip() if image_url.endswith("/group_img.png"): image_url = None # parse tag group_tag = grp.select_one(".grpl-type") if group_tag is not None: group_type = group_tag.text.strip() else: group_type = None # parse description description = grp.select_one(".grpl-purpose").text.replace( "\r\n", "\n").strip() if description == "This group has not written a purpose": description = "" else: description = clean(description) # parse email contact contact_tag = grp.select_one(".grpl-contact") if contact_tag is not None: contact_email = contact_tag.text.strip() else: contact_email = None # create or update tag if group_type is not None and not self.dry_run and not self.skip_tags: tag = Tag.objects.get_or_create(name=group_type)[0] else: tag = None # don't include parentheses content in code slug_name = re.sub(r"\(.+?\)$", "", name).strip() # create or update club code = slugify(slug_name) club = fuzzy_lookup_club(name) if club is not None: code = club.code flag = False else: club = Club(code=code) flag = True if not flag and self.create_only: self.ignore_count += 1 self.stdout.write(f"Ignoring {name}, club already exists") continue # only overwrite blank fields if not club.name: club.name = name if not club.description: club.description = description # only update image if existing image is nonexistent/broken link # if image is local and set, assume that it exists use_image = False if image_url: if not self.dry_run: if club.image: if club.image.url.startswith("http"): resp = requests.head(club.image.url, allow_redirects=True) use_image = not resp.ok else: use_image = False else: use_image = True if use_image: resp = requests.get(image_url, allow_redirects=True) resp.raise_for_status() club.image.save(os.path.basename(image_url), ContentFile(resp.content)) else: use_image = not bool(club.image) # update email if there is no email if not club.email: club.email = contact_email # mark newly created clubs as inactive (has no owner) if flag: club.active = False if not self.dry_run: with transaction.atomic(): club.save() if tag is not None and not club.tags.count(): club.tags.set([tag]) self.club_count += 1 action_verb = "Created" if flag else "Updated" out_string = f"{action_verb} '{name}' (image: {use_image})" if flag: self.stdout.write(self.style.SUCCESS(out_string)) self.create_count += 1 else: self.stdout.write(out_string) self.update_count += 1 next_tag = soup.find(text="Next >") if next_tag is not None: next_link = next_tag.find_parent("a")["href"] next_url = url.split("?", 1)[0] + next_link self.process_url(next_url)