def worker(sublist):
    """Fetch one staff profile page and build a ``Staff`` object from it.

    ``sublist`` is concatenated onto the module-level ``base_url`` to form
    the address that is requested, so despite its name it is used here as a
    single URL path rather than as a collection of links.

    Both the raw path and the resulting URL are printed to stdout before
    the page is downloaded and parsed with the "lxml" parser.

    Returns the ``Staff`` instance built from the values produced by
    ``Staff.populateObjectFromHTML`` for the parsed page, followed by the
    profile URL as the final constructor argument.
    """
    print(sublist)

    # The same URL is used for logging, for the request and as the last
    # Staff constructor argument, so build it once.
    profile_url = base_url + sublist
    print(profile_url)

    response = requests.get(profile_url)
    profile_tree = BeautifulSoup(response.text, "lxml")

    extracted_fields = Staff.populateObjectFromHTML(profile_tree)
    return Staff(*extracted_fields, profile_url)
htmlTree = BeautifulSoup( requests.get(base_url + "/de/einblicke/mitarbeiter").text, 'lxml') # get all the staff and their corresponding profile link workerList = htmlTree.find( "div", class_="panel-pane pane-views-panes pane-mitarbeiter-kontakt-panel-pane-3" ).find_all("a", href=re.compile("/(.*)")) # delete "/de" prefixes and clean list from duplicates via conversion to a set workerSet = {(link.get("href")[3:] if link.get("href").startswith("/de") else link.get("href")) for link in workerList} stafflist = [] for link in workerSet: workerHtmlTree = BeautifulSoup(requests.get(base_url + link).text, "lxml") staff = Staff(*Staff.populateObjectFromHTML(workerHtmlTree), base_url + link) print(staff.toJSON().decode('utf8')) stafflist.append(staff) # determine the output format and print in the corresponding format to a file if sys.argv[1] == "-csv": print("CSV") with open("staff.csv", "w+", newline="\n") as csvout: writer = csv.writer(csvout, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL) writer.writerow([ "Name", "E-Mail", "Telefon", "Fax", "Adresse", "Foto", "URL", "Nähere Informationen"