Example #1
def export_dataset(filename):
    titles = get_featured_biographies()  # Get the titles of featured biographies
    infos = []  # Collect one info dictionary per page
    print()
    print("The sampled featured biographies are listed as follows:")
    for page in titles[:20]:  # Change the slice bound to adjust the sample size
        print(page)
        first_paragraph = get_first_paragraph(page)  # Retrieve the first paragraph
        text = page_text(page, 'text')  # Retrieve the plain text of the page
        pronouns = get_pronouns(text)  # Infer the most likely pronouns
        success = additional_analysis(text)  # Record the success or failure cases

        info = {
            'title': page,
            'most_pronouns': pronouns,
            'introduction': first_paragraph,
            'stableness_cases': success
        }

        infos.append(info)

    df_info = pd.DataFrame(infos)
    df_info.to_csv(filename + '.csv')  # Save the information in CSV format
    df_info.to_json(filename + '.json')  # Save the information in JSON format
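A minimal usage sketch, assuming the helper functions above are in scope; the base filename is hypothetical:

# Hypothetical base filename: writes featured_bios.csv and featured_bios.json
export_dataset('featured_bios')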
Example #2
def get_first_paragraph(name_list):
    all_first_p = []
    successes = 0
    fails = 0
    for name in name_list:
        # Extract the first paragraph from each page in name_list
        try:
            biography = page_text(name, 'html')
            p_start = biography.find("<p>")
            biography = biography[p_start + 3:]

            p_break = biography.find("</p>")
            biography = biography[:p_break]

            # Clean the raw text: strip HTML tags and entities from the extracted paragraph
            cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
            clean_bio = re.sub(cleanr, '', biography)
            clean_bio = clean_bio.replace('\\n', '')
            clean_bio = clean_bio.replace('\\', '')

            all_first_p.append(clean_bio)
            successes += 1
        except Exception:
            fails += 1
            continue

    print(f'There were {successes} successes.')
    print(f'There were {fails} fails.')

    return all_first_p
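A quick check of the cleaning regex above, run on a hypothetical HTML snippet:

import re

cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
sample = '<b>Ada Lovelace</b>&nbsp;(born 1815) was a mathematician.'
print(re.sub(cleanr, '', sample))
# -> 'Ada Lovelace(born 1815) was a mathematician.'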
Example #3
def p2_extracting_info():

    """Applies the get_pronouns(text) function to all of the biographies and prints the result"""

    print("Part 2")
    print("#2.3 extracting common pronouns")
    common_male = 0
    common_female = 0
    common_plural = 0

    biographies = get_featured_biographies()  # fetch the list once and reuse it below
    for article in biographies:
        try:
            page_content = page_text(article, "text")
            pronouns = get_pronouns(page_content)  # classify once per page
            if pronouns == "Male Pronouns":
                common_male += 1
            elif pronouns == "Female Pronouns":
                common_female += 1
            elif pronouns == "Plural or Non-binary Pronouns":
                common_plural += 1
            else:
                print(article)
        except UnboundLocalError:
            pass

    total = len(biographies)
    print(f"male common number: {common_male}\n"
          f"male common percentage: {round(common_male / total * 100, 2)}%")
    print(f"female common number: {common_female}\n"
          f"female common percentage: {round(common_female / total * 100, 2)}%")
    print(f"plural common number: {common_plural}\n"
          f"plural common percentage: {round(common_plural / total * 100, 2)}%")
Example #4
def Wiki_Q3():
    biographies_names = get_featured_biographies()
    number_of_failures = 0
    num_male, num_female, num_plural, num_unknown = 0, 0, 0, 0
    for name in biographies_names:
        try:
            text = page_text(name, 'text')
            gender = get_pronouns(text)
            if gender == 'Male':
                num_male += 1
            elif gender == 'Female':
                num_female += 1
            elif gender == 'Plural':
                num_plural += 1
            else:
                num_unknown += 1
        except Exception:
            number_of_failures += 1
    percentage_of_male = num_male / len(biographies_names) * 100
    percentage_of_female = num_female / len(biographies_names) * 100
    percentage_of_plural = num_plural / len(biographies_names) * 100
    percentage_of_unknown = num_unknown / len(biographies_names) * 100
    percentage_of_failures = number_of_failures / len(biographies_names) * 100
    print(f'{percentage_of_male:.2f}% of biographies use he/his pronouns.')
    print(f'{percentage_of_female:.2f}% of biographies use she/her pronouns.')
    print(f'{percentage_of_plural:.2f}% of biographies use they/them pronouns.')
    print(f'{percentage_of_unknown:.2f}% of biographies use unknown pronouns.')
    print(f'Failed to parse {percentage_of_failures:.2f}% of pages.')
Example #5
def q5_additional():

    """This function tests additional_analysis_q5(text) before using it in q5"""

    for article in get_featured_biographies():
        try:
            page_content = page_text(article, "text")
            print(additional_analysis_q5(page_content))
        except UnboundLocalError:
            pass
Example #6
def get_featured_biographies():
    page_list = page_text('Wikipedia:Featured articles', 'list')
    # After checking the structure of the output, it turns out that a row
    # ending with '[edit]' is a sub-heading, so the first occurrence of
    # '[edit]' marks the start of the article listing; starting there skips
    # over the introductory description of the webpage.
    for page_list_row in range(len(page_list)):
        txt = page_list[page_list_row].strip()
        if '[edit]' in txt:
            break
    # Here, page_list_row is the first row of a topic
    # and txt is the title of the first topic
    featuredArticle = {}  # Map each topic to the titles of its articles
    Articles_each_topic = []
    subTopic = txt
    while page_list_row < len(page_list):
        txt = page_list[page_list_row].strip()
        if '[edit]' in txt:
            featuredArticle[subTopic] = Articles_each_topic
            Articles_each_topic = []
            subTopic = txt
            page_list_row += 1
            continue
        if txt:  # skip empty rows
            Articles_each_topic.append(txt)
        page_list_row += 1
    featuredArticle[subTopic] = Articles_each_topic  # store the last topic as well

    # The previous check also shows that featured articles that are
    # biographies sit under sub-topics containing "biographies" or "Biographies"
    biographies = []  # Collect these articles
    for subTopic in featuredArticle.keys():
        if 'biographies' in subTopic.lower() and 'Autobiographies' not in subTopic:
            biographies.extend(featuredArticle[subTopic])
    # Clean the featured biographies list
    # (note that the second index refers to the list after the first deletion)
    del biographies[301]
    del biographies[516]

    biographyCount = len(biographies)
    articleCount = 0  # Count the number of featured articles
    for subTopic in featuredArticle:
        articleCount += len(featuredArticle[subTopic])
    print(
        f"There are {articleCount} featured articles, among which {biographyCount} are also biographies."
    )
    article_perct = biographyCount / articleCount
    article_perct = round(article_perct * 100, 2)
    print(
        f"The percentage of biographies among the featured articles is {article_perct}%."
    )
    return biographies
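A small check of the sub-topic filter above, on hypothetical headings; the second condition is needed because 'Autobiographies' also contains the substring 'biographies':

topics = ['Art[edit]', 'Biographies of scientists[edit]', 'Autobiographies[edit]']
picked = [t for t in topics if 'biographies' in t.lower() and 'Autobiographies' not in t]
print(picked)  # ['Biographies of scientists[edit]']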
Example #7
def get_infobox(page):
    """Returns a dictionary containing information extracted from infoboxes"""

    # To identify an infobox:
    # 1. find "table class=" in the line
    # 2. parse the line and find the <tr> labels
    # 3. extract the categorical label of each entry using entry_category_regex
    # 4. extract the details that follow the label using entry_answer_regex
    page_content = page_text(page, "html", include_tables=True).split("\\n")
    entry_regex = re.compile(r'<tr>.*?</tr>')
    entry_category_regex = re.compile(r'\">.*?</th>')
    entry_answer_regex = re.compile(r'\>.*?<')
    info_dict = {"Name": page}

    # Drop empty lines; filtering into a new list avoids
    # mutating the list while iterating over it
    page_content = [line for line in page_content if line != ""]
    for line in page_content:
        if "<table class=\"infobox" in line and "vcard" in line:
            table_entries = entry_regex.findall(line)
            for i in table_entries:
                entry_category = entry_category_regex.findall(i)
                entry_answer = entry_answer_regex.findall(i)
                entry_answer.pop(0)
                if len(entry_category) != 0:
                    answer = []
                    for j in entry_answer:
                        # Remove the enclosing > and < marks
                        j = j[1:-1]
                        if len(j.strip()) > 1:
                            try:
                                # Trim stray punctuation marks from both ends
                                while not j[0].isalpha() and not j[0].isdigit():
                                    j = j[1:]
                                while not j[-1].isalpha() and not j[-1].isdigit():
                                    j = j[:-1]
                                answer.append(j)
                            except IndexError:
                                pass
                    if len(answer) > 1:
                        # Join the values into one string to improve readability
                        answer_value = ', '.join(answer[1:])
                        answer_key = answer[0].replace("(", "")
                        info_dict[answer_key] = answer_value

    return info_dict
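A sketch of how the entry regexes behave on a hypothetical infobox row; the leading '><' match is why entry_answer.pop(0) is needed:

import re

row = '<tr><th class="label">Born</th><td>10 December 1815</td></tr>'
print(re.findall(r'\">.*?</th>', row))  # ['">Born</th>']
print(re.findall(r'\>.*?<', row))       # ['><', '>Born<', '><', '>10 December 1815<', '><']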
Example #8
def Wiki_Q1():
    page = page_text('Wikipedia:Featured_articles', 'html')
    soup = BeautifulSoup(page, 'html.parser')
    featured_articles_names = []
    for sub in soup.find('h2').parent.descendants:
        if sub.name == 'li':
            featured_articles_names.append(sub.string)
    biographies_names = get_featured_biographies()
    percentage_of_biographies = len(biographies_names)/len(featured_articles_names) * 100
    print(f"Total articles: {len(featured_articles_names)}")
    print(f"Biographies articles: {len(biographies_names)}")
    print(f"Percentage of biographies: {percentage_of_biographies:.2f}%")
Example #9
def get_first_paragraph(page):
    first_paragraphs = ''
    try:
        soup = BeautifulSoup(page_text(page, 'html'), 'html.parser')
        for p in soup.find_all('p'):
            if p.get('class', '') == '':  # only consider paragraphs without a class attribute
                for paragraph in p.children:
                    first_paragraphs += str(paragraph.string)
                break
    except Exception:
        pass
    return first_paragraphs
Example #10
def get_first_paragraph(page):
    info = page_text(page, 'text')  # Retrieve the plain text of the page
    # Split the retrieved text into paragraphs at runs of line breaks (`\n`)
    info = re.split('\n+', info)
    # After going through several webpages, the first real paragraph shares
    # two common characteristics:
    # 1. it begins at the second item or later
    # 2. it contains more than 100 characters
    for txt in info[1:]:
        if len(txt) > 100:
            return txt
    # If no paragraph contains more than 100 characters,
    # fall back to the second item directly
    return info[1]
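A quick check of the paragraph split above, on hypothetical page text:

import re

text = 'Ada Lovelace\n\nAugusta Ada King, Countess of Lovelace, was an English mathematician...\nSee also'
print(re.split('\n+', text))
# -> ['Ada Lovelace', 'Augusta Ada King, Countess of Lovelace, was an English mathematician...', 'See also']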
Example #11
def Extra_Credit():
    biographies_names = get_featured_biographies()
    num_no_infobox = 0
    birth_and_death = []
    year_list = []
    for name in biographies_names:
        try:
            page = page_text(name, 'html', include_tables=True)
            soup = BeautifulSoup(page, 'html.parser')
            if 'infobox biography vcard' in page:
                table = soup.find_all('table', 'infobox biography vcard')[0]
                year = get_birth_and_death(table)
                year_list.append(year)
            else:
                year = (-1, -1)
                num_no_infobox += 1
                year_list.append(year)
        except Exception:
            year = (-1, -1)
            year_list.append(year)
        years_dictionary = {
            'title': name,
            'year': year
        }
        birth_and_death.append(years_dictionary)
    df = pd.DataFrame(birth_and_death)
    df.to_csv('extra_credit.csv', index=False)
    print('Extra credit is saved in extra_credit.csv')

    born, died = [], []
    no_birth, no_death, unknown = 0, 0, 0
    for birth, death in year_list:
        if birth == -1 and death == -1:
            unknown += 1
        elif birth == -1:
            no_birth += 1
            died.append(death)
        elif death == -1:
            born.append(birth)
            no_death += 1
        else:
            born.append(birth)
            died.append(death)
    # Box plots of the known birth and death years
    # (the -1 placeholders are filtered out by the loop above)
    plt.boxplot([born, died], labels=['born', 'died'])
    plt.show()
    percentage_of_alive = no_death / len(year_list) * 100
    percentage_of_unknown = unknown / len(year_list) * 100
    print(f'{percentage_of_alive:.2f}% of the people are still alive.')
    print(f'{percentage_of_unknown:.2f}% of the featured biographies have unknown years.')
Example #12
def get_first_paragraph(name_list):
    # \d matches any digit, 0-9
    # \w matches any "word" character: letters, digits, or the underscore
    # \s matches any whitespace character, including tabs and spaces
    first_paragraph = []
    for name in name_list:
        page = page_text(name, "list")
        # Escape the name in case it contains regex metacharacters such as parentheses
        life_span_pattern = re.escape(name) + r'\s*\('
        for paragraph in page:
            match = re.search(life_span_pattern, paragraph)
            if match:
                first_paragraph.append(paragraph)
                break
    return first_paragraph
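A quick check of the life-span pattern above, on a hypothetical first paragraph:

import re

name = 'Ada Lovelace'
pattern = re.escape(name) + r'\s*\('
print(bool(re.search(pattern, 'Ada Lovelace (10 December 1815 - 27 November 1852) was an English mathematician.')))  # True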
Example #13
def get_featured_biographies():
    page = page_text('Wikipedia:Featured_articles', 'html')
    soup = BeautifulSoup(page, 'html.parser')
    featured_articles_names = []  # all featured articles (collected here but unused below)
    for sub in soup.find('h2').parent.descendants:
        if sub.name == 'li':
            featured_articles_names.append(sub.string)
    biographies_names = []
    for h3 in soup.find_all('h3'):
        for sub in h3.children:
            # Guard against text nodes (which have no .get) and headlines without a string
            if sub.name is not None and 'mw-headline' in sub.get('class', '') \
                    and sub.string and 'biographies' in sub.string:
                for li in h3.next_sibling.next_sibling.children:
                    if li.name == 'li':
                        biographies_names.append(li.string)
                break
    return biographies_names
Example #14
def additional_analysis():
    biographies_names = get_featured_biographies()
    page_length = []
    for name in biographies_names:
        try:
            biography_text = page_text(name, 'text')
            length = len(biography_text.split())
            page_length.append(length)
        except Exception:
            pass
    max_length = max(page_length)
    min_length = min(page_length)
    mean_length = np.mean(page_length)
    median_length = np.median(page_length)
    std_length = np.std(page_length)
    print(f'max: {max_length}, min: {min_length}, mean: {mean_length:.2f}, median: {median_length}, std: {std_length:.2f}')
Example #15
def Wiki_Q5():
    biographies_names = get_featured_biographies()
    list_biography = []
    number_of_failures = 0
    for name in biographies_names:
        try:
            biography_text = page_text(name, 'text')
            gender = get_pronouns(biography_text)
            length = len(biography_text.split())
            list_biography.append({'title': name, 'pronoun': gender, 'len': length})
        except Exception:
            number_of_failures += 1
    print(f'Failed to scrape {number_of_failures} pages.')
    df_biography = pd.DataFrame(list_biography, columns=['title', 'pronoun', 'len'])
    export_dataset(df_biography)
    print(pd.read_csv('export_dataset.csv'))
Example #16
def additional_analysis():

    """Returns a list of century counts"""

    # Look for year information as 3-4 digits (2-digit years are omitted)
    # and check whether the year is BC
    year_pattern = r"\d{3,4}\)*\s"
    bc_str = r"\sBC\s\)*"
    bc_pattern = re.compile(bc_str)
    century_count = {}

    for article in get_featured_biographies():
        print(article)
        try:
            page_content = page_text(article, "text")
            try:
                birth_year = re.findall(year_pattern, page_content)[0]
                try:
                    birth_year = int(birth_year)
                except ValueError:
                    # a ')' left in the string makes int() fail,
                    # so trim the trailing characters and retry
                    birth_year = birth_year[:-2]
                    birth_year = int(birth_year)
                print(birth_year)
                birth_year_century = math.floor(birth_year / 100) + 1
                if bc_pattern.search(page_content):
                    birth_year_century = -birth_year_century

                if birth_year_century not in century_count:
                    century_count[birth_year_century] = 1
                else:
                    century_count[birth_year_century] += 1
            except IndexError:
                pass
        except UnboundLocalError:
            pass

    century_count_items = list(century_count.items())
    sorted_century_count = sorted(century_count_items, key=lambda item: item[0], reverse=True)

    # write the count into a csv file
    # with open('century_count.csv', 'w') as f3:
    #     [f3.write('{0},{1}\n'.format(key, value)) for key, value in century_count.items()]
    print(sorted_century_count)
    return sorted_century_count
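A worked check of the century computation above: a birth year of 1815 falls in the 19th century, and a BC year such as 356 maps to a negative century:

import math

print(math.floor(1815 / 100) + 1)    # 19 (19th century)
print(-(math.floor(356 / 100) + 1))  # -4 (4th century BC)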
Example #17
def get_first_paragraph(page):
    page_content = page_text(page, "text").split("\n")
    # Remove empty lines from the content; filtering into a new list
    # avoids mutating the list while iterating over it
    page_content = [line for line in page_content if line != ""]
    i = 0

    # Criteria for finding the first info paragraph:
    # 1. the line contains more than 100 characters
    # 2. the line includes the person's family name
    # 3. the line does not start with functional words such as "For" or "This"
    # Return True if a paragraph is found based on the rules above,
    # otherwise return False.
    try:
        while len(page_content[i].strip()) < 100:
            i += 1
        first_para = page_content[i].strip()
        first_word = first_para.split(" ")[0]
        if page.split(" ")[-1] in first_para:  # look for the family name
            if first_word != "This" and first_word != "For" \
                    and first_word != "The" and first_word != "In" \
                    and first_para[0] != "\"" and first_para[0] != "(":
                print(first_para)
                return True
            else:
                first_para = page_content[i + 1].strip()
                first_word = first_para.split(" ")[0]
                if len(first_para) > 100:
                    if first_word != "This" and first_word != "For" \
                            and first_word != "The" and first_word != "In" \
                            and first_para[0] != "\"" and first_para[0] != "(":
                        print(first_para)
                        return True
                    else:
                        return False
                else:
                    return False
        else:
            return False
    except IndexError:
        return False
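A quick check of the family-name criterion above, on a hypothetical title:

page = 'Ada Lovelace'
print(page.split(" ")[-1])  # 'Lovelace', the family name to look for
print('Lovelace' in 'Ada Lovelace (1815-1852) was an English mathematician.')  # True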
Example #18
def df_summary_list():

    """This function creates a list of dictionaries that store people's info"""

    summary_list = []
    biographies = get_featured_biographies()  # fetch the list once, not on every iteration
    for i, article in enumerate(biographies):
        try:
            page_content = page_text(article, "text")
            sub_dict = {"Name": article,
                        "Year_of_Birth": additional_analysis_q5(page_content),
                        "Most_Common_Pronoun": get_pronouns(page_content),
                        }
            print(i)
            print(sub_dict)
            summary_list.append(sub_dict)
        except UnboundLocalError:
            pass
    df = pd.DataFrame(summary_list)
    return df
Example #19
    for i in ratings_filtered.columns:
        print(i.upper())
        print(ratings_filtered[i].value_counts())
        print()


# This command takes a while to run
print(q3_yelp(ratings))

## PART TWO: WIKIPEDIA
# To import this function you will need to install the lxml library using Conda.
from wiki_api import page_text
import re

# Pull the page content in the three supported formats
wiki_html = page_text("Wikipedia:Featured articles", "html")
wiki_text = page_text("Wikipedia:Featured articles", "text")
wiki_list = page_text("Wikipedia:Featured articles", "list")


# Part 2, Question 1
def get_featured_biographies():
    print(f"There are {len(wiki_list)} items in the object wiki_list.")
    final_list = []
    boolean = False
    for title in wiki_list:
        if ('[edit]' in title) and ('Autobiographies' in title):
            boolean = False
            continue
        elif ('[edit]' in title) and (('biographies' in title) or
Example #20
    return gender_list


def additional_analysis():
    pass

def export_dataset(df, format):
    # Offer the option to export in csv or json format.
    # encoding='utf-8' is needed to avoid UnicodeEncodeError:
    # 'gbk' codec can't encode character '\xa0'
    with open(f"export_dataset.{format}", "w", encoding='utf-8') as out_file:
        if format == "csv":
            out_file.write(df.to_csv())
        elif format == "json":
            out_file.write(df.to_json())

if __name__ == "__main__":
    ls = page_text("Wikipedia:Featured articles", "list")
    ls = ls[40:]
    ls = ls[:-7]
    ls = list(filter(lambda x: x != "",ls))

    name_list=get_featured_biographies()
    print(f"Among {len(ls)} number of featured articles, {len(name_list)/len(ls)*100:.2f}% are biographies.")
    first_para_list = get_first_paragraph(name_list)
    print(f"Among {len(name_list)} number of biographies, {len(first_para_list)/len(name_list)*100:.2f}% can be scraped as first paragraphs.")

    gender_list = get_pronouns(first_para_list)
    female_count = sum(gender == 'Female' for gender in gender_list)
    male_count = sum(gender == 'Male' for gender in gender_list)
    plural_count = sum(gender == 'Plural' for gender in gender_list)
    total_count = len(first_para_list)  # len() already returns an int
    other_count = total_count - female_count - male_count - plural_count
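The same tallies could also be read off a collections.Counter; a minimal alternative sketch, assuming gender_list holds the labels counted above:

from collections import Counter

counts = Counter(gender_list)
female_count = counts['Female']
male_count = counts['Male']
plural_count = counts['Plural']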