def scrape():
    csv = MultiCSV()
    threaded(
        gdocs_persons(),
        lambda data: scrape_image(
            data['Full Name'], data['Image URL'], csv, data['Image Credit']),
        num_threads=THREAD_COUNT)
    csv.close()
def scrape():
    csv = MultiCSV()
    threaded(
        make_names_from_gdocs(),
        lambda name: scrape_image(name, csv),
        num_threads=THREAD_COUNT)
    csv.close()
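# None of the entry points above define `threaded` or `MultiCSV`; both come
# from the repo's shared helpers. As a reading aid, this is a minimal sketch
# of what `threaded` is assumed to do: fan a worker function out over an
# iterable with a fixed-size thread pool. The queue-based implementation
# below is an assumption, not the repo's actual helper.
import threading
from queue import Queue

def threaded(items, fn, num_threads=10):
    queue = Queue()

    def worker():
        while True:
            item = queue.get()
            try:
                fn(item)
            finally:
                queue.task_done()

    for _ in range(num_threads):
        thread = threading.Thread(target=worker)
        thread.daemon = True  # don't block interpreter exit on stuck workers
        thread.start()
    for item in items:
        queue.put(item)
    queue.join()  # return only once every item has been processed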
def load():
    csv = MultiCSV()
    api_meta = requests.get(INSTANCE_URL).json().get('meta')
    orgs = load_organizations(api_meta, csv)
    # orgs = {}
    load_persons(api_meta, csv, orgs)
    csv.close()
def scrape(limit):
    searcher = Searcher(limit)
    csv = MultiCSV()
    searcher.init_session(csv)
    csv.close()
    searcher.report()
def scrape_npos():
    csv = MultiCSV()
    threaded(make_urls(), lambda i: scrape_npo(csv, i), num_threads=30)
    csv.close()

def scrape_ngos():
    csv = MultiCSV()
    threaded(make_urls(), lambda url: scrape_ngo(csv, url), num_threads=3)
    csv.close()

def scrape_companies():
    csv = MultiCSV()
    threaded(make_urls(), lambda url: scrape_company(csv, url), num_threads=5)
    csv.close()

def scrape_persons():
    csv = MultiCSV()
    threaded(make_urls(), lambda i: scrape_person(csv, i), num_threads=25)
    csv.close()

def scrape():
    csv = MultiCSV()
    threaded(scrape_index(), lambda i: scrape_record(csv, i), num_threads=30)
    csv.close()

def scrape_contracts():
    csv = MultiCSV()
    threaded(make_urls2(), lambda i: scrape_contract(csv, i), num_threads=30)
    csv.close()
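# `MultiCSV`, shared by every scraper above, is assumed to be a thread-safe
# sink that multiplexes rows into one CSV file per logical table, creating
# writers lazily on first write. This sketch is an assumption about its
# shape; the repo's real class may buffer, name files, or pick columns
# differently.
import csv as csvlib
import threading

class MultiCSV(object):
    def __init__(self):
        self.lock = threading.Lock()
        self.handles = {}
        self.writers = {}

    def write(self, table, row):
        # serialize writes so concurrent workers cannot interleave rows
        with self.lock:
            if table not in self.writers:
                handle = open('%s.csv' % table, 'w')
                writer = csvlib.DictWriter(handle, fieldnames=sorted(row))
                writer.writeheader()
                self.handles[table] = handle
                self.writers[table] = writer
            self.writers[table].writerow(row)

    def close(self):
        for handle in self.handles.values():
            handle.close()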
    # Tail of NetworkScraper.add_edge(src_url, dest_url); this_distance and
    # min_distance (the new and previously recorded hop counts for dest_url)
    # are computed in the elided part of the method above.
    self.url_graph.add_edge(src_url, dest_url)
    self.url_graph.node[dest_url]['min_distance'] = this_distance
    # if dest_url is within range and it used to be outside range
    if this_distance <= self.degrees and min_distance > self.degrees:
        self.url_to_scrape.add(dest_url)
    # if dest_url's children are within range and used to be outside range
    if this_distance < self.degrees and min_distance + 1 > self.degrees:
        for child_url in self.url_graph[dest_url].keys():
            self.add_edge(dest_url, child_url)


if __name__ == '__main__':
    degrees = 0
    try:
        degrees = int(sys.argv[1])
    except (IndexError, ValueError):
        pass
    scraper = NetworkScraper(csv=MultiCSV(), thread_count=5, degrees=degrees)
    scraper.start()
    for data in gdocs_persons():
        try:
            scraper.scrape(
                search_term=data['Full Name'],
                start_url=data['WhosWho'],
            )
        except Exception as e:
            sys.stderr.write("%s\n" % str(e))
    scraper.finish()
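# The two range checks above encode the crawl invariant: a page is (re)queued
# when a newly found edge brings it within `degrees` hops of the seed, and its
# children are relaxed when the shortened path could pull them into range too.
# A minimal illustration of that relaxation on a plain dict-of-sets graph; the
# names here are hypothetical, not the NetworkScraper API.
def relax(graph, dist, frontier, src, dest, degrees):
    old = dist.get(dest, float('inf'))
    new = dist[src] + 1
    if new >= old:
        return  # no shorter path found; nothing changes
    dist[dest] = new
    if new <= degrees and old > degrees:
        frontier.add(dest)  # dest itself just came within range
    if new < degrees and old + 1 > degrees:
        for child in graph.get(dest, ()):
            relax(graph, dist, frontier, dest, child, degrees)

graph = {'a': {'b'}, 'b': {'c'}}
dist, frontier = {'a': 0}, set()
relax(graph, dist, frontier, 'a', 'b', degrees=1)
# frontier == {'b'}; 'c' stays out of range at distance 2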
def scrape():
    scraper = ResultsScraper()
    csv = MultiCSV()
    scraper.init_session(csv)
    csv.close()
    scraper.report()