def export_dataset(df):
    titles = get_featured_biographies()  # Get the titles of featured biographies
    infos = []  # Generate a list to save the page information
    print()
    print("The sample featured biographies are listed as follows:")
    for page in titles[:20]:  # Get a new sample by changing the slice of the title list
        print(page)
        firstParagraph = get_first_paragraph(page)  # Retrieve the first paragraph
        text = page_text(page, 'text')  # Retrieve the text of the page
        pronouns = get_pronouns(text)  # Get the most likely gender
        success = additional_analysis(text)  # Get the success or failure cases
        info = {
            'title': page,
            'most_pronouns': pronouns,
            'introduction': firstParagraph,
            'stableness_cases': success
        }
        infos.append(info)
    df_info = pd.DataFrame(infos)
    df_info.to_csv(df + '.csv')  # Save the information in CSV format
    df_info.to_json(df + '.json')  # Save the information in JSON format
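# A minimal usage sketch for the export_dataset defined above. Note that its
# `df` argument is used as a filename prefix rather than a DataFrame;
# 'featured_sample' is an assumed prefix chosen for illustration and would
# produce featured_sample.csv and featured_sample.json.
# export_dataset('featured_sample')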
def get_first_paragraph(final_list):
    all_first_p = []
    successes = 0
    fails = 0
    for name in final_list:
        # Extract the first paragraph from each element in final_list
        try:
            biography = page_text(name, 'html')
            p_start = biography.find("<p>")
            biography = biography[p_start + 3:]
            p_break = biography.find("</p>")
            biography = biography[0:p_break]
            # Clean raw text: remove HTML tags and entities from the extracted first paragraph
            cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
            clean_bio = re.sub(cleanr, '', biography)
            clean_bio = clean_bio.replace('\\n', '')
            clean_bio = clean_bio.replace('\\', '')
            all_first_p.append(clean_bio)
            successes += 1
        except Exception:
            fails += 1
            continue
    print(f'There were {successes} successes.')
    print(f'There were {fails} fails.')
    return all_first_p
def p2_extracting_info():
    """Applies the get_pronouns(text) function to all of the biographies and prints the result"""
    print("Part 2")
    print("#2.3 extracting common pronouns")
    common_male = 0
    common_female = 0
    common_plural = 0
    biographies = get_featured_biographies()
    for article in biographies:
        try:
            page_content = page_text(article, "text")
            pronoun = get_pronouns(page_content)
            if pronoun == "Male Pronouns":
                common_male += 1
            elif pronoun == "Female Pronouns":
                common_female += 1
            elif pronoun == "Plural or Non-binary Pronouns":
                common_plural += 1
            else:
                print(article)
        except UnboundLocalError:
            pass
    total = len(biographies)
    print(f"male common number: {common_male}\n"
          f"male common percentage: {round(common_male / total * 100, 2)}%")
    print(f"female common number: {common_female}\n"
          f"female common percentage: {round(common_female / total * 100, 2)}%")
    print(f"plural common number: {common_plural}\n"
          f"plural common percentage: {round(common_plural / total * 100, 2)}%")
def Wiki_Q3():
    biographies_names = get_featured_biographies()
    number_of_failures = 0
    num_male, num_female, num_plural, num_unknown = 0, 0, 0, 0
    for name in biographies_names:
        try:
            text = page_text(name, 'text')
            gender = get_pronouns(text)
            if gender == 'Male':
                num_male += 1
            elif gender == 'Female':
                num_female += 1
            elif gender == 'Plural':
                num_plural += 1
            else:
                num_unknown += 1
        except Exception:
            number_of_failures += 1
    percentage_of_male = num_male / len(biographies_names) * 100
    percentage_of_female = num_female / len(biographies_names) * 100
    percentage_of_plural = num_plural / len(biographies_names) * 100
    percentage_of_unknown = num_unknown / len(biographies_names) * 100
    percentage_of_failures = number_of_failures / len(biographies_names) * 100
    print(f'{percentage_of_male:.2f}% of biographies use he/his pronouns.')
    print(f'{percentage_of_female:.2f}% of biographies use she/her pronouns.')
    print(f'{percentage_of_plural:.2f}% of biographies use they/them pronouns.')
    print(f'{percentage_of_unknown:.2f}% of biographies use unknown pronouns.')
    print(f'Failed to parse {percentage_of_failures:.2f}% of pages.')
def q5_addtional():
    """This function tests additional_analysis_q5(text) before using it in q5"""
    for article in get_featured_biographies():
        try:
            page_content = page_text(article, "text")
            print(additional_analysis_q5(page_content))
        except UnboundLocalError:
            pass
def get_featured_biographies():
    page_list = page_text('Wikipedia:Featured articles', 'list')
    # print(page_list)
    # After checking the structure of the outcome, we find that a row ending
    # with '[edit]' is a sub-title marking the start of a topic.
    # Therefore, treat the first occurrence of '[edit]' as the start
    # to skip over the introductory description of the webpage.
    for page_list_row in range(len(page_list)):
        txt = page_list[page_list_row].strip()
        if '[edit]' in txt:
            break
    # Here, page_list_row is the first row of a topic and txt is the title of the first topic
    featuredArticle = {}  # Dictionary mapping each topic to the titles of its articles
    Articles_each_topic = []
    subTopic = txt
    while page_list_row < len(page_list):
        txt = page_list[page_list_row].strip()
        if '[edit]' in txt:
            featuredArticle[subTopic] = Articles_each_topic
            Articles_each_topic = []
            subTopic = txt
            page_list_row += 1
            continue
        if txt is not None:
            Articles_each_topic.append(txt)
        page_list_row += 1
    featuredArticle[subTopic] = Articles_each_topic  # Save the last topic as well
    # The featured articles that are also biographies have
    # a sub-topic containing "biographies" or "Biographies"
    biographies = []  # Generate a list to save these articles
    for subTopic in featuredArticle.keys():
        if 'biographies' in subTopic.lower() and 'Autobiographies' not in subTopic:
            biographies.extend(featuredArticle[subTopic])
    # Clean the featured biographies list
    del biographies[301]
    del biographies[516]
    biographyCount = len(biographies)
    articleCount = 0  # Count the number of featured articles
    for subTopic in featuredArticle:
        articleCount += len(featuredArticle[subTopic])
    print(f"There are {articleCount} featured articles, among which {biographyCount} are also biographies.")
    article_perct = round(biographyCount / articleCount * 100, 2)
    print(f"The percentage of biographies among the featured articles is {article_perct}%.")
    return biographies
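# A minimal usage sketch for the get_featured_biographies defined above:
# calling it prints the article/biography counts as a side effect and
# returns the cleaned list of biography titles.
# bios = get_featured_biographies()
# print(bios[:5])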
def get_infobox(page):
    """Returns a dictionary containing information extracted from infoboxes"""
    # To identify an infobox:
    # 1. find "table class=" in the line
    # 2. parse the line and find the <tr> labels
    # 3. extract the categorical label of each entry using entry_category_regex
    # 4. extract the details following the label using entry_answer_regex
    page_content = page_text(page, "html", include_tables=True).split("\\n")
    entry_regex = re.compile(r'<tr>.*?</tr>')
    entry_category_regex = re.compile(r'\">.*?</th>')
    entry_answer_regex = re.compile(r'\>.*?<')
    info_dict = {"Name": page}
    # Remove empty lines from the page content
    page_content = [line for line in page_content if line != ""]
    for line in page_content:
        if "<table class=\"infobox" in line and "vcard" in line:
            table_entries = entry_regex.findall(line)
            for i in table_entries:
                # print(i)
                entry_category = entry_category_regex.findall(i)
                entry_answer = entry_answer_regex.findall(i)
                entry_answer.pop(0)
                if len(entry_category) != 0:
                    answer = []
                    for j in entry_answer:
                        # Remove the >< marks
                        j = j[1:-1]
                        if len(j.strip()) > 1:
                            try:
                                # Remove stray punctuation marks
                                while not j[0].isalpha() and not j[0].isdigit():
                                    j = j[1:]
                                while not j[-1].isalpha() and not j[-1].isdigit():
                                    j = j[:-1]
                                answer.append(j)
                            except IndexError:
                                pass
                    # print(answer)
                    if len(answer) > 1:
                        # Join the values into a single string to improve readability
                        answer_value = ', '.join(answer[1:])
                        answer_key = answer[0]
                        if "(" in answer_key:
                            answer_key = answer_key.replace("(", "")
                        info_dict[answer_key] = answer_value
    # print(info_dict)
    return info_dict
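# A minimal usage sketch for get_infobox above. "Ada Lovelace" is an assumed
# example title; the 'Born' key only exists if the page's infobox has a
# matching row, hence the .get() lookup.
# info = get_infobox("Ada Lovelace")
# print(info.get("Born"))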
def Wiki_Q1():
    page = page_text('Wikipedia:Featured_articles', 'html')
    soup = BeautifulSoup(page, 'html.parser')
    featured_articles_names = []
    for sub in soup.find('h2').parent.descendants:
        if sub.name == 'li':
            featured_articles_names.append(sub.string)
    biographies_names = get_featured_biographies()
    percentage_of_biographies = len(biographies_names) / len(featured_articles_names) * 100
    print(f"Total articles: {len(featured_articles_names)}")
    print(f"Biographies articles: {len(biographies_names)}")
    print(f"Percentage of biographies: {percentage_of_biographies:.2f}%")
def get_first_paragraph(page):
    first_paragraphs = ''
    try:
        soup = BeautifulSoup(page_text(page, 'html'), 'html.parser')
        for p in soup.find_all('p'):
            if p.get('class', '') == '':
                for paragraph in p.children:
                    first_paragraphs += str(paragraph.string)
                break
    except Exception:
        pass
    return first_paragraphs
def get_first_paragraph(page):
    info = page_text(page, 'text')  # Retrieve the text of the page
    # Split the retrieved text into paragraphs on line breaks (`\n`)
    info = re.split('\n+', info)
    # After going through several webpages, the common characteristics of the first paragraph are:
    # 1. it begins at the second item
    # 2. it covers more than 100 characters
    for txt in info[1:]:
        if len(txt) > 100:
            return txt
    # If no paragraph contains more than 100 characters,
    # use the second item directly
    return info[1]
def Extra_Credit():
    biographies_names = get_featured_biographies()
    num_no_infobox = 0
    birth_and_death = []
    year_list = []
    for name in biographies_names:
        try:
            page = page_text(name, 'html', include_tables=True)
            soup = BeautifulSoup(page, 'html.parser')
            if 'infobox biography vcard' in page:
                table = soup.find_all('table', 'infobox biography vcard')[0]
                year = get_birth_and_death(table)
                year_list.append(year)
            else:
                year = (-1, -1)
                num_no_infobox += 1
                year_list.append(year)
        except Exception:
            year = (-1, -1)
            year_list.append(year)
        years_dictionary = {'title': name, 'year': year}
        birth_and_death.append(years_dictionary)
    df = pd.DataFrame(birth_and_death)
    df.to_csv('extra_credit.csv', index=False)
    print('Extra credit is saved in extra_credit.csv')
    born, died = [], []
    no_birth, no_death, unknown = 0, 0, 0
    for i in year_list:
        if i[0] == -1 and i[1] == -1:
            unknown += 1
        elif i[0] == -1 and i[1] != -1:
            no_birth += 1
            died.append(i[1])
        elif i[0] != -1 and i[1] == -1:
            born.append(i[0])
            no_death += 1
        else:
            born.append(i[0])
            died.append(i[1])
    plt.boxplot([born, died])
    plt.show()
    percentage_of_alive = no_death / len(year_list) * 100
    percentage_of_unknown = unknown / len(year_list) * 100
    print(f'{percentage_of_alive:.2f}% of the people are still alive.')
    print(f'{percentage_of_unknown:.2f}% of the featured biographies have unknown birth and death years.')
def get_first_paragraph(name_list):
    # \d is any number character, 0-9
    # \w is any alphabetical character, A-Z, uppercase or lowercase
    # \s is any space character, including tabs, spaces, or other blanks
    first_paragraph = []
    for name in name_list:
        page = page_text(name, "list")
        # Escape the name so regex metacharacters in titles are matched literally
        life_span_pattern = re.escape(name) + r'\s*\('
        for paragraph in page:
            match = re.search(life_span_pattern, paragraph)
            if match:
                first_paragraph.append(paragraph)
                break
    return first_paragraph
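# A tiny illustration (with an assumed title and sentence) of why re.escape
# is applied above: without it, regex metacharacters inside a page title
# would change the meaning of the pattern or raise re.error.
# sample = "Mother Teresa (born Anjeze Gonxhe Bojaxhiu; 26 August 1910) was ..."
# print(re.search(re.escape("Mother Teresa") + r'\s*\(', sample) is not None)  # True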
def get_featured_biographies():
    page = page_text('Wikipedia:Featured_articles', 'html')
    soup = BeautifulSoup(page, 'html.parser')
    featured_articles_names = []
    for sub in soup.find('h2').parent.descendants:
        if sub.name == 'li':
            featured_articles_names.append(sub.string)
    biographies_names = []
    for h3 in soup.find_all('h3'):
        for sub in h3.children:
            if 'mw-headline' in sub.get('class', '') and 'biographies' in sub.string:
                for li in h3.next_sibling.next_sibling.children:
                    if li.name == 'li':
                        biographies_names.append(li.string)
                break
    return biographies_names
def additional_analysis():
    biographies_names = get_featured_biographies()
    page_length = []
    for name in biographies_names:
        try:
            biography_text = page_text(name, 'text')
            length = len(biography_text.split())
            page_length.append(length)
        except Exception:
            pass
    max_length = max(page_length)
    min_length = min(page_length)
    mean_length = np.mean(page_length)
    median_length = np.median(page_length)
    std_length = np.std(page_length)
    print(f'max: {max_length}, min: {min_length}, mean: {mean_length:.2f}, '
          f'median: {median_length}, std: {std_length:.2f}')
def Wiki_Q5():
    biographies_names = get_featured_biographies()
    list_biography = []
    number_of_failures = 0
    for name in biographies_names:
        try:
            biography_text = page_text(name, 'text')
            gender = get_pronouns(biography_text)
            length = len(biography_text.split())
            list_biography.append({'title': name, 'pronoun': gender, 'len': length})
        except Exception:
            number_of_failures += 1
    print(f'Failed to scrape {number_of_failures} pages.')
    df_biography = pd.DataFrame(list_biography, columns=['title', 'pronoun', 'len'])
    export_dataset(df_biography)
    print(pd.read_csv('export_dataset.csv'))
def additional_analysis():
    """Returns a list of century counts"""
    # Look for year information as 3-4 digits (2-digit years are omitted)
    # and check whether the year is BC or not
    year_pattern = r"\d{3,4}\)*\s"
    bc_str = r"\sBC\s\)*"
    bc_pattern = re.compile(bc_str)
    century_count = {}
    for article in get_featured_biographies():
        print(article)
        try:
            page_content = page_text(article, "text")
            try:
                birth_year = re.findall(year_pattern, page_content)[0]
                try:
                    birth_year = int(birth_year)
                # if the ")" is not parsed out of the string, it causes an error
                except ValueError:
                    birth_year = birth_year[:-2]
                    birth_year = int(birth_year)
                print(birth_year)
                birth_year_century = math.floor(birth_year / 100) + 1
                if bc_pattern.search(page_content):
                    birth_year_century = -birth_year_century
                if birth_year_century not in century_count:
                    century_count[birth_year_century] = 1
                else:
                    century_count[birth_year_century] += 1
            except IndexError:
                pass
        except UnboundLocalError:
            pass
    century_count_items = list(century_count.items())
    sorted_century_count = sorted(century_count_items, key=lambda item: item[0], reverse=True)
    # Write the count into a csv file
    # with open('century_count.csv', 'w') as f3:
    #     [f3.write('{0},{1}\n'.format(key, value)) for key, value in century_count.items()]
    print(sorted_century_count)
    return sorted_century_count
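# A quick worked example of the century bucketing used above, with assumed
# sample years rather than values scraped from any page: floor(year/100)+1
# maps 356 -> 4, 1215 -> 13, and 1856 -> 19 (a year such as 1900 lands in the
# 20th bucket under this simple rule, off by one from the strict convention).
# for sample_year in (356, 1215, 1856):
#     print(sample_year, math.floor(sample_year / 100) + 1)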
def get_first_paragraph(page):
    page_content = page_text(page, "text").split("\n")
    # Remove empty lines from the list of content
    page_content = [line for line in page_content if line != ""]
    i = 0
    # Criteria to search for the first info paragraph:
    # 1. the number of characters in the line is greater than 100
    # 2. the line includes the person's family name
    # 3. the line does not start with functional words such as "For" or "This"
    # If the paragraph is found successfully based on the rules above, return True;
    # otherwise, return False
    while len(page_content[i].strip()) < 100:
        i += 1
    first_para = page_content[i].strip()
    first_word = first_para.split(" ")[0]
    try:
        if page[-1] in first_para:
            if first_word != "This" and first_word != "For" \
                    and first_word != "The" and first_word != "In" \
                    and first_para[0] != "\"" and first_para[0] != "(":
                print(first_para)
                return True
            else:
                first_para = page_content[i + 1].strip()
                first_word = first_para.split(" ")[0]
                if len(first_para) > 100:
                    if first_word != "This" and first_word != "For" \
                            and first_word != "The" and first_word != "In" \
                            and first_para[0] != "\"" and first_para[0] != "(":
                        print(first_para)
                        return True
                    else:
                        return False
                else:
                    return False
        else:
            return False
    except IndexError:
        return False
def df_summary_list():
    """This function creates a list of dictionaries that store people's info"""
    summary_list = []
    biographies = get_featured_biographies()
    for article in biographies:
        try:
            page_content = page_text(article, "text")
            sub_dict = {
                "Name": article,
                "Year_of_Birth": additional_analysis_q5(page_content),
                "Most_Common_Pronoun": get_pronouns(page_content),
            }
            i = biographies.index(article)
            print(i)
            print(sub_dict)
            summary_list.append(sub_dict)
        except UnboundLocalError:
            pass
    df = pd.DataFrame(summary_list)
    return df
for i in ratings_filtered.columns:
    print(i.upper())
    print(ratings_filtered[i].value_counts())
    print()

# This command takes a while to run
print(q3_yelp(ratings))

## PART TWO: WIKIPEDIA

# To import this function you will need to install the lxml library using Conda.
from wiki_api import page_text
import re

# Pulling the featured articles page in html, text, and list form
wiki_html = page_text("Wikipedia:Featured articles", "html")
wiki_text = page_text("Wikipedia:Featured articles", "text")
wiki_list = page_text("Wikipedia:Featured articles", "list")

# Part 2, Question 1
def get_featured_biographies(list):
    print(f"There are {len(wiki_list)} items in the object wiki_list.")
    final_list = []
    boolean = False
    title = []
    for title in wiki_list:
        if ('[edit]' in title) and ('Autobiographies' in title):
            boolean = False
            continue
        elif ('[edit]' in title) and (('biographies' in title) or
    return gender_list

def additional_analysis():
    pass

def export_dataset(df, format):
    with open(f"export_dataset.{format}", "w", encoding='utf-8') as out_file:
        # The caller has the option to export in csv or json format.
        # encoding='utf-8' because we ran into UnicodeEncodeError:
        # 'gbk' codec can't encode character '\xa0' in position
        if format == "csv":
            out_file.write(df.to_csv())
        elif format == "json":
            out_file.write(df.to_json())

if __name__ == "__main__":
    ls = page_text("Wikipedia:Featured articles", "list")
    ls = ls[40:]
    ls = ls[:-7]
    ls = list(filter(lambda x: x != "", ls))
    name_list = get_featured_biographies()
    print(f"Among {len(ls)} featured articles, {len(name_list)/len(ls)*100:.2f}% are biographies.")
    first_para_list = get_first_paragraph(name_list)
    print(f"Among {len(name_list)} biographies, {len(first_para_list)/len(name_list)*100:.2f}% can be scraped as first paragraphs.")
    gender_list = get_pronouns(first_para_list)
    female_count = sum(gender == 'Female' for gender in gender_list)
    male_count = sum(gender == 'Male' for gender in gender_list)
    plural_count = sum(gender == 'Plural' for gender in gender_list)
    total_count = int(len(first_para_list))
    other_count = total_count - female_count - male_count - plural_count