def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") victim_divs = soup.find("div", class_="row mt-3 mb-3").find_all( "div", recursive=False) for div in victim_divs: # parse all the stuff out of the html parent_div = div.find("div") header_div = parent_div.find("div", class_="header") # get the name from the header h5 = header_div.find("div").find("div", class_="col-8").find("h5") name = h5.text.split("- ")[0].strip() # get the published date from the header published_span = header_div.find("div").find( "div", class_="col-4 text-right").find("span") published_dt = datetime.strptime(published_span.text.strip(), "%d.%m.%Y") # parse out the details link # this is ugly but it works body_div = parent_div.find("div", class_="body") link_div = body_div.find_all("div")[-1] a = body_div.find_all("div") b = a[-1] c = b.find("a") url = c.attrs["href"] logging.debug(f"Found victim: {name}") # check if the org is already seen (search by url because name isn't guarenteed unique) q = self.session.query(Victim).filter_by(url=url, site=self.site) if q.count() == 0: # new org v = Victim(name=name, url=url, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.site.last_scraped = datetime.utcnow() self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}/rss", headers=self.headers) soup = BeautifulSoup(r.content, features="xml") items = soup.findAll('item') for item in items: name = item.title.text logging.debug(f"Found victim: {name}") publish_dt = datetime.strptime(item.pubDate.text, "%a, %d %b %Y %H:%M:%S %Z") q = self.session.query(Victim).filter_by(site=self.site, name=name) if q.count() == 0: # new victim v = Victim(name=name, url=None, published=publish_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.site.last_scraped = datetime.utcnow() self.session.commit()
def _handle_page(self, body: str): soup = BeautifulSoup(body, "html.parser") victim_divs = soup.find_all("div", class_="border-top border-light pt-3 mb-4") for div in victim_divs: # parse all the stuff out of the html name = div.find("h3").text.split("\n")[0].strip() url = div.find_all("div")[-1].find("a").attrs["href"] logging.debug(f"Found victim: {name}") # check if the org is already seen (search by url because name isn't guarenteed unique) q = self.session.query(Victim).filter_by(url=url, site=self.site) if q.count() == 0: # new org v = Victim(name=name, url=url, published=None, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit()
def _handle_page(self, body: str): soup = BeautifulSoup(body, "html.parser") victim_list = soup.find_all("div", class_="post-block") for victim in victim_list: victim_name = victim.find("div", class_="post-title").text.strip() victim_leak_site = victim.find( "div", class_="post-block-body").find("a").attrs["href"] published_dt = datetime.now() q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit()
def _handle_page(self, body: str): soup = BeautifulSoup(body, "html.parser") victim_list = soup.find_all("div", class_="col p-4 d-flex flex-column position-static") for victim in victim_list: victim_name = victim.find("h3", class_="mb-0").text.strip() victim_name = victim_name[:victim_name.find("\n")] victim_leak_site = self.url + victim.find("a").attrs["href"] published = victim.find("div", class_="mb-1 text-muted").text.strip() published_dt = datetime.strptime( published, "%Y-%m-%d") q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) #print(v) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit()
def _handle_page(self, soup): victim_list = soup.find_all("a", class_="post") for victim in victim_list: victim_name = victim.find("h2", class_="post-title").text.strip() published = victim.find("div", class_="time").text.strip() published_dt = dateparser.parse(published) victim_leak_site = victim['href'] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() self.current_victims.append(v) self.session.commit() # Lets delay execution of next in case of timeout of server/proxy relay time.sleep(1.0)
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") script_list = soup.find_all("script") # they include the list in javascript code instead of HTML # So we have to parse it js_victims_raw = "" js_marker = "var post_links = " for script in script_list: script = str(script) if js_marker in script: js_victims_raw = script break if not js_victims_raw: raise Exception(f"js victim list not found (tried to locate '{js_marker}')") raw_victim_list = js_victims_raw.split(f"{js_marker}[{{")[1].split( "}]" )[0] victim_list = json.loads(f"[{{{raw_victim_list}}}]") for victim in victim_list: victim_name = victim["title"] if "-" in victim_name: victim_name = victim_name[:victim_name.find("-")] published = int(victim["timestamp"]) published_dt = datetime.utcfromtimestamp(published) victim_leak_site = self.url + "/?" + victim["link"] + "/" q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") script_list = soup.find_all("script") # they include the list in javascript code instead of HTML # So we have to parse it javascript_code = "" for script in script_list: script = str(script) if "var post_links = " in script: javascript_code = script break start_index = javascript_code.find("var post_links = ") end_index = javascript_code[start_index:].find("var baseUrl") + start_index javascript_code = javascript_code[start_index:end_index].strip() start_index = javascript_code.find("[") end_index = javascript_code.rfind("]") + 1 javascript_code = javascript_code[start_index:end_index].strip().replace("null", "None") # convert javascript list of dictionary to python's list of dictionary victim_list = list(eval(javascript_code)) for victim in victim_list: victim_name = victim["title"] if "-" in victim_name: victim_name = victim_name[:victim_name.find("-")] published = int(victim["timestamp"]) published_dt = datetime.utcfromtimestamp(published) victim_leak_site = self.url + "/?" + victim["link"] + "/" q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: url = self.url + '/partners.html' r = p.get(f"{url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="page-header") for victim in victim_list: victim_name = victim.find_all("a")[0].text.strip() published = victim.find_all("span")[1].text.strip() published_dt = None # they use a bunch of different date format... if published == "29/01/21": published_dt = datetime.strptime(published, "%d/%m/%y") elif published[6:8] == "20" and published[8:] != "": published_dt = datetime.strptime(published, "%m/%d/%Y") else: published_dt = datetime.strptime(published, "%m/%d/%y") victim_leak_site = self.url + '/' + victim.find_all( "a")[0].attrs["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def _handle_page(self, body: str, p: Proxy): soup = BeautifulSoup(body, "html.parser") victim_list = soup.find_all("div", class_="list-text") for victim in victim_list: victim_description = victim.find("a").find("p").text.strip().split( " ") # extract company name by getting only the first few words that start with a capitalized letter victim_name = "" for word in victim_description: if word[0].isupper() or word == "and": victim_name += word + " " else: break victim_name = victim_name[:-1] # Delete the last space if victim_name[-2:] == "is": # hard-code this. They forgot to add a space to one name, so I can't properly scrape it victim_name = victim_name[:-2] # they put the published date in the victim's leak page victim_leak_site = victim.find("a").attrs["href"] r = p.get(victim_leak_site, headers=self.headers) published_dt = self.extract_published_date(r.content.decode()) q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit()
def scrape_victims(self): with Proxy() as p: url = self.url + '/partners.html' r = p.get(f"{url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="page-header") for victim in victim_list: victim_name = victim.find_all("a")[0].text.strip() published = victim.find_all("span")[1].text.strip() # they use a bunch of different date format # use a nice dateparsing library to handle them all in an easier manner published_dt = dateparser.parse(published) # sometimes they don't have a timestamp if published_dt is None and len(published) > 0: logging.warning(f"couldn't parse timestamp: {published}") victim_leak_site = self.url + '/' + victim.find_all( "a")[0].attrs["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="card-body") for victim in victim_list: victim_name = victim.find("h5", class_="card-title").text.strip() published = victim.find( "p", class_="card-text mt-3 text-secondary").text[11:21] published_dt = datetime.strptime(published, "%Y-%m-%d") victim_leak_site = self.url + victim.find( "a", class_="btn btn-outline-primary").attrs["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="lot-card row m-0") for victim in victim_list: victim_name = victim.find( "div", class_="text-left text-grey d-block overflow-hidden").find( "a").attrs["href"] published_dt = None victim_leak_site = None q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all( "div", class_="blog-post blog-main posts_at_first") for victim in victim_list: print(victim) victim_name = victim.find( "h2", class_="blog-post-title").find("a").text.strip() published = "" victim_leak_site = self.url + victim.find( "h2", class_="blog-post-title").find("a").attr["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="blog-one__single") for victim in victim_list: victim_name = victim.find("h3").text.strip() client_site = victim.find("h3").find("a", title="Visit Client Website").text.strip() victim_name = victim_name.replace(client_site, "").strip() published = victim.find("div", class_="blog-one__meta").text.strip()[:10] published_dt = datetime.strptime( published, "%Y-%m-%d") victim_leak_site = self.url + "/" + victim.find("h3").find("a").attrs["href"] q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("a", class_="leak-card p-3") for victim in victim_list: victim_name = victim.find("h5").text.strip() published = victim.find("div", class_="col-auto published") published_dt = datetime.strptime( published.text.strip(), "%Y-%m-%d %H:%M:%S") if victim_name == "Hello world 1" or victim_name == "Mercy, journalists,chonky boi": # skipping news and updates continue victim_leak_site = self.url + victim.attrs["href"] q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find("div", class_="collapse-section").find_all("li") for victim in victim_list: victim_name = victim.find("a").text.strip() if victim_name == "HOME" or victim_name == "HOW TO DOWNLOAD?": continue victim_leak_site = self.url + victim.find("a").attrs["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=None, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="col p-4 d-flex flex-column position-static") for victim in victim_list: victim_name = victim.find("h3", class_="mb-0").text.strip() victim_name = victim_name[:victim_name.find("\n")] published = victim.find("div", class_="mb-1 text-muted") published_dt = datetime.strptime( published.text.strip(), "%Y-%m-%d") victim_leak_site = self.url + victim.find("a", class_="stretched-link").attrs["href"] q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def _handle_page(self, body: str): soup = BeautifulSoup(body, "html.parser") victim_list = soup.find_all("div", {"id": re.compile("comp.*")}) for victim in victim_list: victim_h3 = victim.find("div", class_="panel-heading").find("h3") if victim_h3 is None: # unpublished victims are in a h4 continue victim_name = victim_h3.text.strip() victim_leak_site = self.url + "/#" + victim.get("id") if victim.find("span", class_="glyphicon"): published = victim.find("span", class_="glyphicon").next_sibling published = published.lstrip() published_dt = datetime.strptime(published, "Posted %b %d, %Y.") else: published = "" q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit()
def _handle_page(self, soup): victim_list = soup.find_all("header", class_="entry-header") for victim in victim_list: victim_title = victim.find("h2", class_="entry-title").text.strip() victim_name = victim_title[0:victim_title.find(". Part")] meta = victim.find("div", class_="entry-meta") published = meta.find("time", class_="entry-date").attrs["datetime"] published_dt = datetime.strptime(published.strip()[:-6], "%Y-%m-%dT%H:%M:%S") victim_leak_site = meta.find( "span", class_="posted-on").find("a").attrs["href"] q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() self.current_victims.append(v) self.session.commit() # server was timing out so slows it down a bit time.sleep(1.0)
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}", headers=self.headers) soup = BeautifulSoup(r.content.decode(), "html.parser") # get max page number victim_list = soup.find_all("div", class_="col py-3") for victim in victim_list: victim_name = victim.find("h3", class_="mb-3").text.strip() # it's less than ideal that there aren't other properties to search on # but I don't want to store leak data URLs q = self.session.query(Victim).filter_by(site=self.site, name=victim_name) if q.count() == 0: # new victim v = Victim(name=victim_name, published=None, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit()
def _handle_page(self, body: str): soup = BeautifulSoup(body, "html.parser") victim_list = soup.find_all( "header", class_="entry-header has-text-align-center") for victim in victim_list: victim_name = victim.find( "h2", class_="entry-title heading-size-1").text.strip() victim_leak_site = victim.find( "h2", class_="entry-title heading-size-1").find("a").attrs["href"] published = victim.find( "li", class_="post-date meta-wrapper").find("a").text.strip() published_dt = datetime.strptime(published, "%B %d, %Y") q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit()
def scrape_victims(self): with Proxy() as p: r = p.get(f"{self.url}/v1/companies/disclosed", headers=self.headers) j = r.json() for entry in j: name = entry["title"] logging.debug(f"Found victim: {name}") publish_dt = datetime.strptime(entry["disclosed_at"], "%Y-%m-%dT%H:%M:%SZ") q = self.session.query(Victim).filter_by(site=self.site, name=name) if q.count() == 0: # new victim v = Victim(name=name, url=None, published=publish_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.site.last_scraped = datetime.utcnow() self.session.commit()
def _handle_page(self, body: str): soup = BeautifulSoup(body, "html.parser") victim_list = soup.find_all("article", {"id": re.compile("post.*")}) for victim in victim_list: victim_name = victim.find("h2", class_="type-list-title").text.strip() victim_leak_site = victim.find( "h2", class_="type-list-title").find("a").attrs["href"] published = victim.find("div", class_="type-list-date").text.strip() published_dt = dateparser.parse(published) if published_dt is None and len(published) > 0: logging.warning(f"couldn't parse timestamp: {published}") q = self.session.query(Victim).filter_by(url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit()
def _handle_page(self, soup): victim_list = soup.find_all("div", class_="card-body") for victim in victim_list: victim_name = victim.find("h5", class_="card-title").text.strip() published = victim.find("p", class_="card-text mt-3 text-secondary").text[11:21] published_dt = datetime.strptime( published, "%Y-%m-%d") victim_leak_site = self.url + victim.find("a", class_="btn btn-outline-primary").attrs["href"] q = self.session.query(Victim).filter_by( url=victim_leak_site, site=self.site) if q.count() == 0: # new victim v = Victim(name=victim_name, url=victim_leak_site, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site) self.session.add(v) self.new_victims.append(v) else: # already seen, update last_seen v = q.first() v.last_seen = datetime.utcnow() # add the org to our seen list self.current_victims.append(v) self.session.commit() self.site.last_scraped = datetime.utcnow() # just for good measure self.session.commit() # in case server/tor proxy relay times out, slowing down scraping a bit time.sleep(1.0)