Example 1
import sys

def main():
    # Dispatch on the first command-line argument: 'scrape' or 'convert'.
    args = sys.argv
    if args[1] == 'scrape':
        patients_url, querents_url, inspections_url = get_url()
        if args[2] in {'inspections', 'all'}:
            create_inspections_csv(inspections_url)
        if args[2] in {'patients', 'all'}:
            create_patients_csv(patients_url)
        if args[2] in {'querents', 'all'}:
            create_querents_csv(querents_url)
        if args[2] not in {'all', 'inspections', 'patients', 'querents'}:
            print('invalid second argument!')
    elif args[1] == 'convert':
        convert()
    else:
        print('invalid first argument!')
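
For completeness: the snippet assumes get_url, the create_*_csv helpers, and convert are defined in the same module. A minimal, hypothetical entry-point guard (not part of the original) that avoids an IndexError on a bare invocation might be:

if __name__ == '__main__':
    # Hypothetical guard -- checks the argument count before main() indexes sys.argv.
    if len(sys.argv) < 2 or (sys.argv[1] == 'scrape' and len(sys.argv) < 3):
        print('usage: main.py scrape {all|inspections|patients|querents} | convert')
    else:
        main()
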
Example 2
# local module imports
from config import urls
import data_processor as dp
import scraper as scrape

# external module imports
from bs4 import BeautifulSoup

page = scrape.get_url(urls['titanic_data']).text
soup = BeautifulSoup(page, 'html.parser')

titanic = dp.populate(soup)
dp.analyse(titanic.passengers)
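
The config module isn't shown; it presumably just exposes the urls dictionary used above. A placeholder sketch (the URL below is a stand-in, not the project's real source):

# config.py -- hypothetical sketch; the URL is a placeholder.
urls = {
    'titanic_data': 'https://example.com/titanic-passenger-list',
}
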
Example 3
import custom_exception
import data_processor as dp
import scraper as scrape

from bs4 import BeautifulSoup

# Dictionary of websites to access
urls = {
    # 'Web Scraper Test Site': 'http://webscraper.io',
    'StarLadder': 'https://starladder.com/en/starseries-i-league-pubg',
    # 'Google': 'http://www.google.com',
    # 'Reddit': 'https://www.reddit.com'
}

for key, value in urls.items():
    try:
        page = BeautifulSoup(scrape.get_url(value).text, 'html.parser')

        team_standings = dp.parse_historic(page)
        team_stats = dp.parse_historic_stats(page)

        final_results = dp.aggregate_historic_results(team_standings,
                                                      team_stats)

        print(final_results)

    except custom_exception.DisallowedException as e:
        print(
            'Connection to {} not permitted with HTTP code {}. Does its robots.txt allow access?'
            .format(value, e.status))
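
data_processor and custom_exception are project-local modules not shown here. Judging from the handler above, DisallowedException carries an HTTP-style status and is raised when a fetch is not permitted. A rough sketch under that assumption, using urllib.robotparser to honour robots.txt:

# Hypothetical sketches of the project-local pieces, not the real modules.
import urllib.robotparser
from urllib.parse import urljoin

import requests


class DisallowedException(Exception):
    """Raised when robots.txt (or the server) disallows a fetch."""
    def __init__(self, url, status):
        super().__init__('fetch of {} disallowed (status {})'.format(url, status))
        self.url = url
        self.status = status


def get_url(url, user_agent='*'):
    """Fetch url, refusing if robots.txt disallows it."""
    robots = urllib.robotparser.RobotFileParser(urljoin(url, '/robots.txt'))
    robots.read()
    if not robots.can_fetch(user_agent, url):
        raise DisallowedException(url, 403)
    return requests.get(url)
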
Example 4
import scraper
from bs4 import BeautifulSoup

html = scraper.get_url(
    "https://www.kijiji.ca/b-motorcycles/canada/ninja-300/k0c30l0?price=1000__&dc=true"
)
soup = scraper.ad(html)

# Visual separator before the listing output.
for _ in range(5):
    print("---------------------------------------")

# Print the link extracted from each ad node.
for item in soup:
    print(scraper.link(item))
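
The ad and link helpers are specific to this project's scraper module and aren't shown. A hypothetical reading, assuming ad selects the listing nodes from a fetched page and link pulls the href out of one:

# Hypothetical helpers -- the selector and names are assumptions, not the
# project's actual scraper module.
from bs4 import BeautifulSoup

def ad(response):
    """Return the ad/listing nodes parsed from a fetched page."""
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find_all('div', class_='search-item')  # assumed class name

def link(node):
    """Return the first href found inside a single listing node."""
    anchor = node.find('a', href=True)
    return anchor['href'] if anchor else None
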
Example 5
from scrapeFlowsheet import get_flowsheet_info
from scraper import get_url
from app import db, UBClasses, UBRecitation, Degree
import re

base_url = 'https://www.eng.buffalo.edu/undergrad/advisement/flowsheets/'
path = '//*[@id="div_content"]/ul/li/a'

links = get_url(base_url, path)
# Keep only the flowsheet links at these positions. Comparing integers
# with `is` tests identity, not equality, so use set membership instead
# of the original chain of `elif i is N` branches.
wanted_indices = {1, 4, 5, 6, 7, 9, 11, 14, 15, 17}
modified_links = [links[i] for i in range(len(links)) if i in wanted_indices]
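
Note that get_url takes different shapes across these examples; here it receives a URL plus an XPath and returns the matched links. A minimal sketch of that flavour, assuming lxml handles the XPath evaluation:

# Hypothetical xpath-based get_url -- an assumption about this project's
# scraper module, shown only to make the two-argument calls concrete.
import requests
from lxml import html as lxml_html

def get_url(url, xpath):
    tree = lxml_html.fromstring(requests.get(url).content)
    # Return the href of every element matched by the XPath expression.
    return [element.get('href') for element in tree.xpath(xpath)]
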
Example 6
                else:
                    instructor.append(curr_data)
            elif position == 11:
                status.append(curr_data)
            # Columns cycle 1..11; wrap back to the first column.
            if position >= 11:
                position = 1
            else:
                position += 1
            curr_data = ''
        else:
            curr_data += data[i]
    return data

index = 0
semester_links = get_url(base_url, '//*[@id="content_internal"]/ul/li/a')
# The original removed the link at index 1 and then the head of the list
# five times; assuming the links are unique, that just drops the first
# six entries.
semester_links = semester_links[6:]
for semm in semester_links:
    print(semm)


broken_departments = []
for semester in semester_links:
    print(semester)  # print the semester link being processed
    department_links = get_url(semester, '/html/body/table[4]/tr/td[1]/table/tr/td[1]/a')
    for department in department_links: