コード例 #1
0
def process_author(author, k):
	'''Save an author's books and database record if the books reach k sentences.

	Parameters:
	author - an author soup object (the tag whose text is the author name and
			whose next sibling tag is expected to be the ul of books)
	k - the number of sentences required

	Output: books_list - list of {'name': ..., 'path': ...} dicts, one per saved book
			author_path - the folder name under ./lab1_data/ where the books are saved
	Returns (None, None) when the author is skipped: name not past the
	hard-coded checkpoint, already in the db, no ul directly after the
	author tag, Anonymous/Unknown author, or the sentence budget k was
	not reduced to 0.
	'''
	author_name = author.getText()
	# Keep only letters and spaces from the part of the name before the first '('.
	authorclean = ''.join(x for x in author_name.split('(')[0] if x.isalpha() or x == ' ')
	author_path = unidecode(clean_name(author_name))

	# NOTE(review): hard-coded checkpoint; only authors whose folder name sorts
	# after "Verna Draba" are processed. Presumably left over from resuming an
	# interrupted run -- confirm before reusing this on a fresh crawl.
	if not author_path > "Verna Draba":
		return None, None

	# Skip authors that are already in the db. find_one() is used instead of
	# find().count() because Cursor.count() was removed in PyMongo 4.
	if db['lab1'].find_one({'folder': author_path}) is not None:
		return None, None

	print(authorclean)

	# Proceed only if the tag right after the author is the ul of books.
	# find_next_sibling() returns None when there is no following tag, so
	# guard against that before reading .name.
	next_sibling = author.find_next_sibling()
	if next_sibling is None or next_sibling.name != 'ul':
		return None, None

	books = next_sibling.findAll("li", {"class" : "pgdbetext"}) #get all books

	# Works attributed to Anonymous or Unknown are not collected.
	if "Anonymous" in authorclean or "Unknown" in authorclean:
		return None, None

	# The helper returns the selected books together with the remaining k;
	# the author is kept only when that remainder is exactly 0.
	books_to_save, k = get_books_up_to_k_sentences(books, k)
	if k != 0:
		return None, None

	author_dir = './lab1_data/'+author_path+'/'
	books_list = []
	for book in books_to_save:
		# save_file receives book[0] and book[1] and returns the saved path;
		# book[2] is recorded as the book name.
		book_path = save_file(book[0], author_dir, book[1])
		books_list.append({'name': book[2], 'path':book_path})

	#get the multilingual abstracts and the literary movements
	abstracts, url_author = get_multilingual_abstracts(authorclean)
	movements = []
	if url_author is not None:
		movements = get_literary_movements(url_author)

	#create an author object, record all the related data and save it to the db
	a = Author(url_author)
	a.name = authorclean.strip()
	a.folder = author_path
	a.books = books_list
	a.abstracts = abstracts
	a.movements = movements
	db['lab1'].insert_one(a.__dict__)
	return books_list, author_path