def process_author(author, k):
    """Collect and store the books of one author if they total at least k sentences.

    Parameters:
        author - an author soup object; the tag that follows it is expected
                 to be the ``ul`` listing the author's books
        k - the number of sentences required

    Output:
        books_list - list of ``{'name': ..., 'path': ...}`` dicts, one per saved book
        author_path - the folder (under ./lab1_data/) where the books were saved

    ``(None, None)`` is returned instead when the author is skipped: the
    folder name does not sort after the resume bound, the author is already
    in the db, no ``ul`` directly follows the author tag, the name contains
    "Anonymous" or "Unknown", or the sentence budget k was not used up.
    """
    author_name = author.getText()
    # Keep only letters and spaces from the part of the name before any "(".
    authorclean = ''.join(
        ch for ch in author_name.split('(')[0] if ch.isalpha() or ch == ' '
    )
    author_path = unidecode(clean_name(author_name))

    # NOTE(review): "Verna Draba" looks like a resume marker left over from an
    # interrupted run -- every folder sorting at or before it is skipped.
    # Confirm it is still wanted before running on a fresh database.
    if author_path <= "Verna Draba":
        return None, None

    # Skip authors already in the db. find_one() replaces find(...).count():
    # Cursor.count() was removed in PyMongo 4, while find_one() is available
    # in every version and answers the same "does any match exist" question.
    if db['lab1'].find_one({'folder': author_path}) is not None:
        return None, None

    print(authorclean)

    # Proceed only if the tag right after the author is the ul of books.
    # find_next_sibling() returns None when nothing follows the author tag,
    # so that case is checked before touching .name.
    list_of_books = author.find_next_sibling()
    if list_of_books is None or list_of_books.name != 'ul':
        return None, None

    if "Anonymous" in authorclean or "Unknown" in authorclean:
        return None, None

    books = list_of_books.findAll("li", {"class": "pgdbetext"})
    books_to_save, k = get_books_up_to_k_sentences(books, k)
    # The helper hands back the updated k; only when it has reached 0 are the
    # books saved (presumably 0 means the required sentences were all found).
    if k != 0:
        return None, None

    books_list = []
    for book in books_to_save:
        # book[0] and book[1] are passed through to save_file, book[2] is
        # recorded as the book name; save_file's return value is kept as path.
        book_path = save_file(book[0], './lab1_data/' + author_path + '/', book[1])
        books_list.append({'name': book[2], 'path': book_path})

    # Get the multilingual abstracts and, when an author URL came back with
    # them, the literary movements as well.
    abstracts, url_author = get_multilingual_abstracts(authorclean)
    movements = []
    if url_author is not None:
        movements = get_literary_movements(url_author)

    # Create an author object, record all the related data and save it to the db.
    record = Author(url_author)
    record.name = authorclean.strip()
    record.folder = author_path
    record.books = books_list
    record.abstracts = abstracts
    record.movements = movements
    db['lab1'].insert_one(record.__dict__)

    return books_list, author_path