Python cleanText Exemples, lib.utils.scraper_utils.cleanText Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : events.py Projet : dowel015/Level-Up

def getStyle(text):
    """Classify the dance style named in an event description.

    The text is normalised once with ``cleanText`` and then inspected in
    three stages:

    1. Decide whether it mentions an American or International keyword
       (``set_library.american_styles`` / ``international_styles``,
       substring match). International wins when both are present.
    2. Look for an explicit style word ("rhythm", "latin", "smooth",
       "ballroom"/"standard", "fun") among the space-separated words.
    3. Otherwise infer the style from dance names, treating the entries of
       the ``set_library`` dance collections as regular expressions.

    Args:
        text: Event description to classify.

    Returns:
        One of "none", "rhythm", "latin", "smooth", "standard", "fun",
        "other", an entry of ``set_library.dance_styles``, or the bare
        "american" / "intl" marker when only that much could be determined.
    """
    # Blank input carries no style information.
    if re.search(r"^ *$", text):
        return "none"

    # Normalise once; every check below works on the same cleaned string.
    cleaned = cleanText(text)

    # International is checked last so it overrides American when both match.
    am_or_intl = ""
    if any(word in cleaned for word in set_library.american_styles):
        am_or_intl = "american"
    if any(word in cleaned for word in set_library.international_styles):
        am_or_intl = "intl"

    # An explicit style word wins outright; the first such word decides.
    explicit_styles = {"rhythm", "latin", "smooth", "fun"}
    standard = {"ballroom", "standard"}
    for word in cleaned.split(" "):
        if word in explicit_styles:
            return word
        if word in standard:
            return "standard"

    # Figure out style from dances and am_or_intl
    if am_or_intl == "american":
        for dance in set_library.smooth_dances:
            if re.search(dance, cleaned):
                return "smooth"
        for dance in set_library.rhythm_dances:
            if re.search(dance, cleaned):
                return "rhythm"
        return am_or_intl

    if am_or_intl == "intl":
        for dance in set_library.standard_dances:
            if re.search(dance, cleaned):
                return "standard"
        for dance in set_library.latin_dances:
            if re.search(dance, cleaned):
                return "latin"
        return am_or_intl

    # Neither American nor International was mentioned.
    for dance in set_library.other_dances:
        if re.search(dance, cleaned):
            return "other"
    for dance_style in set_library.dance_styles:
        if re.search(dance_style, cleaned):
            return dance_style
    return "none"

Exemple #2

0

Afficher le fichier

Fichier : events.py Projet : dowel015/Level-Up

def getAge(event_text):
    """Return the age categories mentioned in an event description.

    Entries of ``set_library.ages`` are used as regular expressions and must
    be followed by a space in the cleaned text to count as a match. Ages that
    are not also in ``set_library.age_groups`` are tried first; only when
    none of those match is the full ``set_library.ages`` collection tried.
    Backslashes are stripped from each matched entry before it is returned.

    Args:
        event_text: Event description to inspect.

    Returns:
        A list of matched age strings, or ``["none"]`` when the text is
        blank or nothing matches. The result is always a list, so callers
        can take ``len()`` of it or iterate it without special-casing.
    """
    # Blank input: same test as getStyle, and the same list shape as the
    # "nothing matched" result below (previously a bare string).
    if re.search(r"^ *$", event_text):
        return ["none"]

    # Normalise once; every pattern is searched in the same cleaned string.
    cleaned = cleanText(event_text)

    def matching_ages(candidates):
        # Keep each candidate whose pattern, followed by a space, occurs.
        return [
            age.replace('\\', '')
            for age in candidates
            if re.search(age + ' ', cleaned)
        ]

    age_group = matching_ages(set_library.ages - set_library.age_groups)

    if not age_group:
        age_group = matching_ages(set_library.ages)

    if not age_group:
        age_group.append("none")

    return age_group

Exemple #3

0

Afficher le fichier

Fichier : competitions.py Projet : dowel015/Level-Up

def buildO2CMCompTable(comp_ids, quick):
    """Scrape the competition list from results.o2cm.com into the database.

    Starting at the current month and walking backwards one month at a time
    until the year reaches 2004, the results page is filtered by month and
    year and every table row that links to a competition is examined. New
    competitions are inserted into the ``competitions`` table (created on
    first use) and their ids are added to ``comp_ids``.

    Args:
        comp_ids: Collection of competition ids already known; mutated in
            place as new competitions are stored.
        quick: When truthy, return as soon as an already-known competition
            id is encountered.

    Returns:
        None.
    """
    print("\nScraping competitions from o2cm")

    # Connect to dreamhost db
    mydb = database.getDB()
    cursor = mydb.cursor()

    # The cursor is released on every exit path, including the early
    # return taken in quick mode.
    try:
        # Create the competitions table if it does not exist yet
        cursor.execute("CREATE TABLE IF NOT EXISTS competitions "
                       "(comp_id varchar(255) NOT NULL, "
                       "comp_name varchar(255), "
                       "is_nqe varchar(255), "
                       "date varchar(255), "
                       "PRIMARY KEY (comp_id))")

        # NOTE(review): "%s" placeholders assume a MySQL-style DB-API driver
        # (paramstyle "format"/"pyformat") - confirm against database.getDB().
        insert = ("INSERT INTO competitions "
                  "(comp_id, comp_name, is_nqe, date) "
                  "VALUES (%s, %s, %s, %s)")

        current_date = datetime.datetime.now()
        year = current_date.year
        month = current_date.month

        while year != 2004:

            print("searching", year, month)

            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            driver = webdriver.Chrome(options=options)

            # Only the rendered page source is needed, so the browser is
            # shut down as soon as it has been captured.
            try:
                driver.get('http://results.o2cm.com/')

                # Find form and filter by year and month
                month_element = driver.find_element_by_id("inmonth")
                month_element.clear()
                month_element.send_keys(month)

                year_element = driver.find_element_by_id("inyear")
                year_element.clear()
                year_element.send_keys(year)

                driver.find_element_by_name("Go").click()

                allCompResultsPage = BeautifulSoup(driver.page_source,
                                                   'html.parser')
            finally:
                driver.quit()

            compHTMLTable = allCompResultsPage.select('table[id=main_tbl]')[0]
            rows = compHTMLTable.select('tr')

            # Countdown to help sort results from same date
            counter = len(rows)

            for row in rows:

                # An anchor element with href attribute indicates link to
                # competition results page
                links = row.select('a[href]')
                if links:
                    href = links[0]['href']
                    comp_id = re.search('(?<=event=).*?(?=&)', href).group(0)

                    print(comp_id)

                    if comp_id in comp_ids:
                        print("competition id " + comp_id +
                              "' already present")

                        if quick:
                            return

                    else:
                        comp_ids.add(comp_id)

                        # Date: first cell holds a three-letter month, a
                        # separator character and the day
                        day_string = row.select('td')[0].get_text(strip=True)
                        month_string = scraper_utils.numericalMonth(
                            day_string[0:3])
                        day = day_string[4:]

                        # Competition Name
                        comp_name = links[0].get_text(strip=True)

                        # NQE? If a competition's name includes the string
                        # 'NQE', we assume it's an NQE.
                        is_NQE = bool(
                            re.search("nqe",
                                      scraper_utils.cleanText(comp_name)))

                        # The row counter is appended so competitions that
                        # share a date still sort deterministically.
                        date = (str(year) + '-' + month_string + "-" + day +
                                "-" + str(counter))

                        # Scraped text is passed as bound parameters so
                        # quotes in a competition name cannot break or alter
                        # the statement.
                        values = (comp_id, comp_name, str(is_NQE), date)
                        print(insert, values)
                        cursor.execute(insert, values)
                        mydb.commit()

                # Decrement counter
                counter = counter - 1

            # Step back one month, rolling over into the previous year
            month = month - 1
            if month == 0:
                month = 12
                year = year - 1

    finally:
        cursor.close()

Exemple #4

0

Afficher le fichier

Fichier : placements.py Projet : dowel015/Level-Up

def buildO2CMPlacementsTable(comp_ids, quick):
    """Scrape per-couple placements for every competition on results.o2cm.com.

    Starting at the current month and walking backwards until the year
    reaches 2004, each competition linked from the monthly results page that
    is not already in ``comp_ids`` is opened, its full results table is
    loaded, and every result row is split into placement, competitor number,
    lead, follow and location. Each parsed row is handed to
    ``checkPlacement``. Rows that do not look like a placement are appended
    to ``./output/failed-pattern.txt``; competitions without a submit button
    are appended to ``output/no-button-comp.txt``.

    Args:
        comp_ids: Collection of competition ids to skip.
        quick: When truthy, return as soon as a competition id that is
            already in ``comp_ids`` is encountered.

    Returns:
        None.
    """
    print("\nScraping O2CM placements")

    # Shape of a result row, e.g.
    # '1) 210 Jackson Fossen & Claire Thompson - MN' or '8) TBA TBA& TBA TBA'
    placement_pattern = (
        r'^\d+\)\s\d?\d?\d?\s?[\w\d\"\'\`\-\.\,\?\!\_\/\#\(\)\s]*\s?&\s?')
    # Optional trailing " - <location>" after the follow's name
    location_pattern = r'\ *(\-\ )?\-\ \ ?[\w\s\-\(\)]+$'

    current_date = datetime.datetime.now()
    year = current_date.year
    month = current_date.month

    while year != 2004:

        print("searching", year, month)

        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        driver = webdriver.Chrome(options=options)

        # Only the rendered page source is needed, so the browser is shut
        # down as soon as it has been captured.
        try:
            driver.get('http://results.o2cm.com/')

            # Find form and filter by year and month
            month_element = driver.find_element_by_id("inmonth")
            month_element.clear()
            month_element.send_keys(month)

            year_element = driver.find_element_by_id("inyear")
            year_element.clear()
            year_element.send_keys(year)

            driver.find_element_by_name("Go").click()

            allCompResultsPage = BeautifulSoup(driver.page_source,
                                               'html.parser')
        finally:
            driver.quit()

        # Iterates through every competition on results.o2cm.com
        for comp in allCompResultsPage.select('a'):

            # e.g. scoresheet3.asp?event=okc19&heatid=40453020&bclr=#FFFFFF&tclr=#000000
            # Anchors without an href, or without an event id, are skipped.
            comp_match = re.search('(?<=event=).*?(?=&)',
                                   comp.get('href', ''))
            if not comp_match:
                continue

            comp_id = comp_match.group(0)
            print(comp_id)

            if comp_id in comp_ids:
                print(comp_id, "present in DB")

                if quick:
                    return

                continue

            # A separate browser is used for the competition's own page.
            comp_options = webdriver.ChromeOptions()
            comp_options.add_argument('headless')
            comp_driver = webdriver.Chrome(options=comp_options)

            try:
                comp_driver.get('http://results.o2cm.com/?event=' + comp_id)

                # Locate the 'submit' button and click it to reach the
                # complete results page
                ok_button = comp_driver.find_element_by_xpath(
                    "//input[@type='submit']")
                ok_button.click()
                comp_all_page = BeautifulSoup(comp_driver.page_source,
                                              'html.parser')
                results_table = comp_all_page.select('table[width]')[1]

                # Map each formatted name in the competitor drop-down to
                # its option value
                competitor_ids = {}
                competitors = comp_all_page.find(id='selEnt')
                if competitors is not None:
                    for competitor in competitors.find_all('option'):
                        competitor_ids[format_name(
                            competitor.get_text(
                                strip=True))] = competitor['value']

                heat_id = ''

                for row in results_table.find_all('tr'):

                    # A row with an anchor element denotes a new event
                    anchors = row.select('a')
                    if len(anchors) > 0:
                        href = anchors[0]['href']
                        heat_id = re.search('(?<=heatid=).*?(?=&)',
                                            href).group(0)
                        # Prefix with the competition id
                        heat_id = comp_id + heat_id
                        continue

                    # Otherwise: another entry in the same event
                    cells = row.select('td')
                    if len(cells) < 3:
                        continue

                    row_text = cells[2].get_text(strip=True)
                    clean_row = scraper_utils.cleanText(row_text)

                    # '----' rows and empty rows carry no placement
                    if clean_row == '----' or clean_row == '':
                        continue

                    if not re.search(placement_pattern, clean_row):
                        with open("./output/failed-pattern.txt", "a") as f:
                            f.write(clean_row)
                            f.write("\n")
                        continue

                    # Overall placement in event: up to three leading
                    # digits directly before ')'
                    numeric_placement = "0"
                    placement_match = re.search(r'^\d\d?\d?(?=\))',
                                                clean_row)
                    if placement_match:
                        numeric_placement = placement_match.group(0)
                        clean_row = re.sub(
                            numeric_placement + r"\)",
                            scraper_utils.subSpace(numeric_placement + ")"),
                            clean_row)
                        clean_row = re.sub(r'^\s*', '', clean_row)

                    # Competitor Number ("X" when the row has none)
                    competitor_number = "X"
                    number_match = re.search(r'^\s*\d+', clean_row)
                    if number_match:
                        competitor_number = re.sub(r'\s', '',
                                                   number_match.group(0))

                    # NOTE(review): this substitutes every occurrence of
                    # the number in the row, not only the leading one -
                    # confirm that is intended.
                    clean_row = re.sub(
                        competitor_number,
                        scraper_utils.subSpace(competitor_number),
                        clean_row)

                    # Lead name is any text before an ampersand
                    lead_name = " "
                    lead_match = re.search(r'^.*(?=\ *\&)', clean_row)
                    if lead_match:
                        lead_name = lead_match.group(0)

                    # Remove leading/trailing spaces from lead name
                    lead_name = re.sub(r'^\ *', '', lead_name)
                    lead_name = re.sub(r'\ *$', '', lead_name)

                    # Lead ID (for this comp anyway)
                    lead_id = competitor_ids.get(lead_name) or ''

                    # Follow name is any text after an ampersand
                    follow_name = re.search(r'(?<=\&).*$',
                                            clean_row).group(0)

                    # Remove leading spaces from follow name
                    follow_name = re.sub(r'^\ *', '', follow_name)

                    # Filter potential location info
                    location = ''
                    location_match = re.search(location_pattern,
                                               follow_name)
                    if location_match:
                        location = re.sub(r'^\ *(\-\ )?\-\ \ ?', '',
                                          location_match.group(0))

                        # Remove location info from follow_name
                        follow_name = re.sub(location_pattern, '',
                                             follow_name)

                    # Remove trailing space/hyphens from follow_name
                    follow_name = re.sub(r'[\ \-]*$', '', follow_name)

                    # Follow ID (for this comp anyway)
                    follow_id = competitor_ids.get(follow_name) or ''

                    # Complete Placement; the raw row text is kept last
                    placement = [
                        heat_id, numeric_placement, competitor_number,
                        lead_name, lead_id, follow_name, follow_id,
                        location, row_text
                    ]  # TO DO: couple_id? lead_id? follow_id?

                    checkPlacement(placement)

            except NoSuchElementException:
                print('No button for ' + comp_id)
                with open("output/no-button-comp.txt", "a") as no_button_file:
                    no_button_file.write(comp_id)
                    no_button_file.write("\n")

            finally:
                # Always release the per-competition browser.
                comp_driver.quit()

        # Step back one month, rolling over into the previous year
        month = month - 1
        if month == 0:
            month = 12
            year = year - 1

Exemple #5

0

Afficher le fichier

Fichier : events.py Projet : dowel015/Level-Up

def buildO2CMEventsTable(comp_ids, quick):
    """Scrape the events of every competition listed on results.o2cm.com.

    Starting at the current month and walking backwards until the year
    reaches 2004, each competition in the monthly results table that is not
    already in ``comp_ids`` is opened and its full results table is read.
    Every row containing a link starts a new event whose title is broken
    down into status, age group, style, skill level and dances; the rows
    that follow are counted as couples, and '----' rows as additional
    rounds. Each event summary is handed to ``checkEvent``. Heat ids seen
    more than once within a month's scrape are appended to
    ``./output/heat-ids.txt``; competitions without a submit button are
    appended to ``output/no-button-comp.txt``.

    Args:
        comp_ids: Collection of competition ids already processed; mutated
            in place as new competitions are scraped.
        quick: When truthy, return as soon as an already-known competition
            id is encountered.

    Returns:
        None.
    """
    print("\nScraping O2CM events")

    current_date = datetime.datetime.now()
    year = current_date.year
    month = current_date.month

    while year != 2004:

        print("searching", year, month)

        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        driver = webdriver.Chrome(options=options)

        # Only the rendered page source is needed, so the browser is shut
        # down as soon as it has been captured.
        try:
            driver.get('http://results.o2cm.com/')

            # Find form and filter by year and month
            month_element = driver.find_element_by_id("inmonth")
            month_element.clear()
            month_element.send_keys(month)

            year_element = driver.find_element_by_id("inyear")
            year_element.clear()
            year_element.send_keys(year)

            driver.find_element_by_name("Go").click()

            allCompResultsPage = BeautifulSoup(driver.page_source,
                                               'html.parser')
        finally:
            driver.quit()

        compHTMLTable = allCompResultsPage.select('table[id=main_tbl]')[0]

        # Track event ID's to check for potential duplicates
        heat_uids = set()

        # Iterates through every competition on results.o2cm.com
        for comp in compHTMLTable.select('a'):

            # e.g. scoresheet3.asp?event=okc19&heatid=40453020&bclr=#FFFFFF&tclr=#000000
            # Anchors without an href, or without an event id, are skipped.
            comp_match = re.search('(?<=event=).*?(?=&)',
                                   comp.get('href', ''))
            if not comp_match:
                continue

            comp_id = comp_match.group(0)
            print(comp_id)

            if comp_id in comp_ids:
                print(comp_id, "present in DB")

                if quick:
                    return

                continue

            comp_ids.add(comp_id)

            # A separate browser is used for the competition's own page.
            comp_options = webdriver.ChromeOptions()
            comp_options.add_argument('headless')
            comp_driver = webdriver.Chrome(options=comp_options)

            try:
                comp_driver.get('http://results.o2cm.com/?event=' + comp_id)

                # Locate the 'submit' button and click it to reach the
                # complete results page
                ok_button = comp_driver.find_element_by_xpath(
                    "//input[@type='submit']")
                ok_button.click()
                comp_all_page = BeautifulSoup(comp_driver.page_source,
                                              'html.parser')
                results_table = comp_all_page.select('table[width]')[1]

                # State of the event currently being accumulated
                heat_id = ''
                status = []
                age_group = []
                style = ''
                skill_level = []
                dances = []
                num_rounds = 1
                num_couples = 0

                raw_event_text = ""

                for row in results_table.find_all('tr'):

                    # A row with an anchor element denotes a new event
                    if len(row.select('a')) > 0:

                        # Save info from previous event if it exists
                        if heat_id != '':
                            event_summary = [
                                comp_id, heat_id, status, age_group, style,
                                skill_level, num_couples, num_rounds,
                                dances, raw_event_text
                            ]
                            checkEvent(event_summary)

                        # Reset counts
                        num_couples = 0
                        num_rounds = 1

                        # Get heat_id, prefixed with the competition id
                        href = row.select('a')[0]['href']
                        heat_id = re.search('(?<=heatid=).*?(?=&)',
                                            href).group(0)
                        heat_id = comp_id + heat_id

                        # Record a repeated heat id; otherwise remember it
                        # so a later repeat can be detected.
                        if heat_id in heat_uids:
                            with open("./output/heat-ids.txt",
                                      "a") as heat_ids_file:
                                heat_ids_file.write(
                                    str([comp_id, heat_id, "DUPLICATE"]))
                                heat_ids_file.write("\n")
                        else:
                            heat_uids.add(heat_id)

                        raw_event_text = row.select('a')[0].get_text(
                            strip=True)
                        event_text = cleanText(raw_event_text)
                        print(raw_event_text)
                        print(event_text)

                        # Get each attribute with get<Attribute>(event_text).
                        # After a determination is made, the value for that
                        # field is replaced in event_text (via subSpace) to
                        # avoid the same section of the string being
                        # interpreted multiple ways

                        # Status
                        status = getStatus(event_text)

                        if len(status) == 1:
                            event_text = re.sub(status[0],
                                                subSpace(status[0]),
                                                event_text)

                        print(event_text)

                        # Age
                        age_group = getAge(event_text)

                        if len(age_group) == 1:
                            event_text = re.sub(age_group[0],
                                                subSpace(age_group[0]),
                                                event_text)

                        print(event_text)

                        # Style
                        style = getStyle(event_text)

                        event_text = re.sub(style, subSpace(style),
                                            event_text)

                        print(event_text)

                        # Level
                        skill_level = getLevel(event_text)

                        if len(skill_level) == 1:
                            event_text = re.sub(skill_level[0],
                                                subSpace(skill_level[0]),
                                                event_text)

                        print(event_text)

                        # Dances
                        dances = getDances(event_text)

                        dances = re.sub(" ", "", re.escape(dances))

                        event_text = re.sub(dances, subSpace(dances),
                                            event_text)

                        print(event_text)

                        # More figuring out level, should move to getLevel:
                        # if nothing is left once every recognised field is
                        # removed and no level was found, the event has no
                        # level.
                        stripped_event_text = cleanText(event_text)
                        stripped_event_text = re.sub(
                            " ", "", stripped_event_text)

                        for stat in status:
                            stripped_event_text = re.sub(
                                stat, "", stripped_event_text)

                        for age in age_group:
                            stripped_event_text = re.sub(
                                age, "", stripped_event_text)

                        stripped_event_text = re.sub(
                            style, "", stripped_event_text)
                        stripped_event_text = re.sub(
                            re.escape(dances), "", stripped_event_text)

                        if stripped_event_text == "" and skill_level == []:
                            skill_level = "none"

                    # Another entry in same event
                    else:
                        if len(row.select('td')) >= 3:
                            row_text = row.select('td')[2].get_text(
                                strip=True)

                            # '----' separates a previous round
                            if row_text == '----':
                                num_rounds = num_rounds + 1

                            # Another Result
                            else:
                                num_couples = num_couples + 1

                # The last event has no following anchor row to trigger its
                # save, so it is submitted here.
                event_summary = [
                    comp_id, heat_id, status, age_group, style,
                    skill_level, num_couples, num_rounds, dances,
                    raw_event_text
                ]

                print(event_summary)

                # checkEvent validates each event and writes valid ones to
                # Events table/file, invalid ones to invalid collection
                checkEvent(event_summary)

            except NoSuchElementException:
                print('No button for ' + comp_id)
                with open("output/no-button-comp.txt", "a") as no_button_file:
                    no_button_file.write(comp_id)
                    no_button_file.write("\n")

            finally:
                # Always release the per-competition browser.
                comp_driver.quit()

        # Step back one month, rolling over into the previous year
        month = month - 1
        if month == 0:
            month = 12
            year = year - 1