def generate():
    "----------Start up the course times------------"
    autotable = AutoTable()
    scraper = Scraper(autotable)
    autotable = scraper.build_table()
    #builder = Builder(autotable)
    #autotable = builder.build_table()

    start_time = time.time()

    print("Year Space")
    year_solutions = generate_semester(autotable.year.courses)
    if len(year_solutions) == 0:
        optimal_solution = fall_winter_merge(autotable.fall.courses, autotable.winter.courses)
    else:
        optimal_solution = year_fall_winter_merge(year_solutions, autotable.fall.courses, autotable.winter.courses)

    print("Fall")
    for day in optimal_solution[0][0]:
        for timeslot in day:
            print(timeslot)

    print("Winter")
    for day in optimal_solution[0][1]:
        for timeslot in day:
            print(timeslot)

    print("Fall Distance: " + str(optimal_solution[1]) + " Winter Distance: " + str(optimal_solution[2]))
    print("--- Full algorithm %s seconds ---" % (time.time() - start_time))

    root = Tk()
    gui1 = MyFirstGUI(optimal_solution[0][0], "Fall", root)
    root.mainloop()

    root = Tk()
    gui2 = MyFirstGUI(optimal_solution[0][1], "Winter", root)
    root.mainloop()
def generate():
    "----------Start up the course times------------"
    autotable = AutoTable()
    scraper = Scraper(autotable)
    autotable = scraper.build_table()
    #builder = Builder(autotable)
    #autotable = builder.build_table()

    start_time = time.time()

    "----------Get all Fall Timetables------------"
    courses = autotable.fall.courses
    courses.extend(autotable.year.courses)
    space1 = autotable.solution_space(courses)

    "----------Get all Winter Timetables------------"
    courses = autotable.winter.courses
    courses.extend(autotable.year.courses)
    space2 = autotable.solution_space(courses)

    "-----------Combine fall and winter-------------"
    listed = autotable.index_year_courses(autotable.year.courses)
    compatible = autotable.construct_year(space1, space2, listed)

    print("Fall:")
    for section in compatible[0][0][0]:
        print(section.name)
        print(section)

    print("Winter:")
    for section in compatible[0][1][0]:
        print(section.name)
        print(section)

    print("Distance: " + str(compatible[0][2]))
    print("--- %s seconds ---" % (time.time() - start_time))
def crawler_main(self, indice_link):
    source = re.findall(REGEX_SOURCE, self.__url)
    url = self.__url
    parsed = Crawler.crawlear_web(self, url)
    ultima_pagina = (parsed.xpath(XPATH_ULTIMA_PAGINA)[0])

    # convert the string that uses "," as the thousands separator into an integer
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    last_page = int(locale.atof((re.findall(REGEX_ULTIMA_PAGINA, ultima_pagina))[0]))
    cantidad_paginas = math.ceil(last_page / 24)

    indice_pagina = 1  # index used to advance to the next page
    while cantidad_paginas >= indice_pagina:
        contador = 1
        sigiente_pagina = self.__nuevo_link.format(indice_pagina)
        parsed3 = Crawler.crawlear_web(self, sigiente_pagina)
        unidades_href = parsed3.xpath(XPATH_HREF_UNIDADES)
        for elemento in unidades_href:
            parsed2 = Crawler.crawlear_web(self, elemento)
            instancia_scraper = Scraper.Scraper()
            instancia_scraper.crear_dicc(parsed2, elemento, source, indice_link)
            print("Nuevo Aviso", contador)
            contador += 1
        indice_pagina += 1
        print("Cambio de pagina")
        print("Nueva Url")
def init():
    global crawler
    global domain
    global bot
    global timeout
    global lag
    global depth
    global emailscan
    global tprint

    # Remove previous files
    try:
        shutil.rmtree(".cache")
    except OSError:
        pass
    try:
        os.remove("emails.txt")
    except OSError:
        pass

    # Process cmd line arguments
    cmdArgs()

    # Initialize scraper object
    crawler = Scraper(domain, bot, timeout, lag, depth, emailscan)

    # Pretty print thread
    tprint = threading.Thread(target=ThreadPrettyPrint)
def writeToCSV(self):
    scraper = Scraper()
    finishedUltimaList = scraper.getUltimaList()
    f = open("{}.csv".format(self.animeName.get()), 'w', newline='')
    writer = csv.writer(f)
    for row in finishedUltimaList:
        self.progressBar.step(10)
        writer.writerow(row)
    self.progressBar.stop()
    f.close()
def update_navigation() -> None:
    scraper = Scraper.Scraper()
    nav_data = scraper.get_all_nav()

    # Subject Names
    json.dump(nav_data["subject_names"], open(make_path(navigation_dir, 'subject_names.json'), 'w'))
    # Subject -> Colleges
    json.dump(nav_data["subject_colleges"], open(make_path(navigation_dir, 'subject_colleges.json'), 'w'))
    # College -> Subjects
    json.dump(nav_data["college_subjects"], open(make_path(navigation_dir, 'college_subjects.json'), 'w'))
    # Errors
    json.dump(nav_data["errors"], open(make_path(error_dir, 'update_nav.json'), 'w'))
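The make_path helper used above is not shown in this example; a minimal sketch of what it presumably does (join a directory and a file name, creating the directory if needed) — the exact behavior is an assumption:

import os

def make_path(directory: str, filename: str) -> str:
    # Ensure the output directory exists, then return the full file path.
    os.makedirs(directory, exist_ok=True)
    return os.path.join(directory, filename)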
def __init__(self):
    self.scraper = Scraper.Scraper()
    self.materias = {}  # Dictionary: subject_code -> Materia object
    self.nomes_materias = {}  # Dictionary: subject_code -> subject name
    self.curso_materias = {}  # Dictionary: program_code -> list of that program's subjects (alphabetical order)
    self.lista_cursos = []  # List with the names of all programs, stored as (code, name) tuples

    if os.path.exists(DICT_FILE):
        with open(DICT_FILE, "rb") as f:
            self.materias, self.nomes_materias, self.curso_materias, self.lista_cursos = cPickle.load(f)
def initUI(self):
    # Instantiate the Scraper class
    self.scraper = Scraper.Scraper()

    self.setWindowTitle(self.title)
    self.mainWidget = MainWidget()
    self.setCentralWidget(self.mainWidget)

    self.statusBar = QStatusBar(self)
    self.setStatusBar(self.statusBar)

    self.show()

    self.mainWidget.pathSetButton.clicked.connect(self.openFileNameDialog)
    self.openFileNameDialog()

    # Connect the scrape function to the scrape button
    self.mainWidget.scrapeButton.clicked.connect(self.scrape)
def on_click(self):
    s = Scraper.Scraper(self.url.text(), self.container.text(), self.records.text())

    formatted = ""
    for record in s.run():
        for item in record:
            formatted += item + '; '
        formatted += '\n'

    msg = QMessageBox()
    msg.setIcon(QMessageBox.Information)
    msg.setText("Data Scrape Complete")
    msg.setInformativeText("Press Show Details to view")
    msg.setWindowTitle("Complete")
    msg.setDetailedText(str(formatted))
    msg.setStandardButtons(QMessageBox.Ok | QMessageBox.Close)
    msg.setFixedWidth(500)
    msg.exec_()
def influencer_bin_data(influencers, user, pw):
    for i in range(len(influencers)):
        print('Extracting info from ' + influencers[i].decode('utf-8') + '... using ' + user)
        scraper = Scraper(influencers[i], user, pw)
        user_data = scraper.run()
        print('\n' + 'No. of followers scraped for ' + influencers[i].decode('utf-8') + ' : ' + str(len(user_data)))
        scraper.close()

        # save data for each user
        file_name = 'data/followers_' + influencers[i].decode('utf-8') + '.pickle'
        with open(file_name, 'wb') as file:
            pickle.dump(user_data, file)

        # track done list of users
        with open('done_list.txt', 'a') as file:
            file.write(influencers[i].decode('utf-8') + '\n')
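A small follow-up sketch, not part of the original example, showing how one of the pickled follower lists written above could be read back later; the file name is hypothetical:

import pickle

# Load a previously saved follower list (hypothetical file name for illustration).
with open('data/followers_example_user.pickle', 'rb') as file:
    followers = pickle.load(file)
print('Followers loaded:', len(followers))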
def main():
    # initialize core objects
    scraper = Scraper(Config.praw('CLIENT_ID'), Config.praw('CLIENT_SECRET'),
                      Config.praw('USER_AGENT'), Config.praw('USERNAME'),
                      Config.praw('PASSWORD'))
    parser = Parser()
    notifier = Notifier(Config.twilio('ACCOUNT_SID'), Config.twilio('AUTH_TOKEN'))

    # initialize time for loop
    startTime = time.time()

    while True:
        try:
            # grab last 100 new mechmarket posts
            posts = scraper.grabNewPosts('mechmarket', 100)

            # loop through posts
            for post in posts:
                have = parser.parseHave(post.title)

                # does this need the or? or is the search case insensitive?
                if parser.keywordSearch('milkshake', have) or parser.keywordSearch('Milkshake', have):
                    # notify if we found it
                    notify(notifier, post,
                           f'Milkshake found '
                           f'{post.title} '
                           f'{post.url} ')

            # sleep for 60 seconds
            time.sleep(60.0 - ((time.time() - startTime) % 60.0))
            print(f'Starting new loop at {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')
        except:
            # disable during testing
            # notifier.sendMessage('8042100901','17572092974','Something broke and the bot has stopped')
            break
def executeScript(self):
    scraper = Scraper()
    tempName = self.animeName.get()
    tempName = tempName.split()
    tempName.append('episode')

    animeName = ""
    for i in tempName:
        animeName = animeName + i + "-"
    permAnimeName = animeName

    self.count = 1
    a = True
    while a:
        animeName = animeName + str(self.count)
        a = scraper.getHTMLTags("dummy string", animeName)
        animeName = permAnimeName
        self.count += 1

    self.writeToCSV()
def startScraping(keyword=None):
    global resultDict
    SID = request.cookies.get(session.sessionID, None)

    if keyword == "":
        return """
        <script>
        window.location.href = "/"
        </script>"""

    if SID in scraperDict.keys():
        if threadDict[SID].is_alive():
            return "ただいま処理中です。お待ちください。{now}".format(now=str(datetime.datetime.now()))
        else:
            try:
                with open(resultJsonPath + keyword, "r", encoding="utf-8") as res:
                    return res.read()
            except FileNotFoundError:
                response = make_response("""<script>
                window.location.href = "/scraping/{keyword}"
                </script>""".format(keyword=keyword))
                max_age = liveLimit
                expires = int(datetime.datetime.now().timestamp()) + max_age
                response.set_cookie(session.sessionID, value=str(session.getSID()), max_age=max_age,
                                    expires=expires, path='/', secure=None, httponly=False)
                return response
    else:
        scraperDict[SID] = Scraper.Scraper(0, 0)
        threadDict[SID] = threading.Thread(target=wrapScraping, args=(scraperDict[SID], keyword, 10,))
        threadDict[SID].daemon = True
        threadDict[SID].start()
        return "ただいま処理中です。お待ちください。{now}".format(now=str(datetime.datetime.now()))
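The wrapScraping function used as the thread target above is not shown; a minimal sketch, assuming the Scraper object exposes a scrape(keyword, limit) entry point (hypothetical method name) and that results are cached as JSON at resultJsonPath + keyword, where startScraping looks for them on the next request:

import json

def wrapScraping(scraper, keyword, limit):
    # Hypothetical: run the scrape for this keyword, then cache the result to disk.
    # resultJsonPath is the module-level path used by startScraping above.
    results = scraper.scrape(keyword, limit)
    with open(resultJsonPath + keyword, "w", encoding="utf-8") as out:
        json.dump(results, out, ensure_ascii=False)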
def find_courses():
    username = "******"
    password = get_password()

    driver = Driver('firefox')
    driver.home()
    assert (driver.login(username, password))
    driver.goto_menu("Onderwijs")
    driver.fillout()

    scraper = Scraper(driver)
    scraper.find_courses()
    scraper.print_courses()
    scraper.find_courses_names()
    scraper.scrape_course_elements()
    driver.shutdown()

    with open("courses.dat", 'wb+') as courses_file:
        pickle.dump(scraper.courses, courses_file)

    for course in scraper.courses:
        print(course)
    return scraper.courses
#encoding:utf-8
import sys
import Queue
from Scraper import *
import time
import urlparse
import bs4
import threading

test_obj = Scraper(single_page=False, workers_num=15)
test_obj.feed(['http://www.freebuf.com'])
time.sleep(5)

z = test_obj.get_result_urls_queue()
while True:
    try:
        print z.get(timeout=4)
    except:
        break
'''
Created on Oct 19, 2017

@author: FB
'''
import datetime
from pandas.io.pytables import HDFStore
import Scraper
import os

if __name__ == '__main__':
    '''TODO: Dynamic lookup of the date up to which the database already holds patents'''
    startdate = datetime.date(2004, 3, 13)
    waiting_time = 50  # Waiting time to not overstrain google
    cwd = os.getcwd()
    download_dir = cwd + "\\PatentScraper"

    Patent_info = Scraper().downloadCSV(startdate=startdate,
                                        waiting_time=50,
                                        download_dir=download_dir,
                                        DaysTillStore=10)
import Scraper

foo = Scraper.Scraper(wr_bb_100=10, wr_observed=10, std_dev=140, hands=151)

#foo.scrape_page_x_times(500, file_name="rawdatafrom500scrapes.csv")
#print(foo.average_with_rake_exclude_first_x_cols(rake=3, cols=8))
#foo.sort_data_by_plus_minus(file_name="plus_minus_500_scrapes.csv")

foo.read_raw_file_from_csv("rawdatafrom500scrapes.csv")
#foo.sort_data_by_plus_minus(file_name="plus_minus_500.csv")
print(foo.average_with_rake_exclude_first_x_cols())
#print(foo.get_avg_of_plus_minus(two_col_array=foo.read_plus_minus_file_from_csv("plus_minus_1k_no_bw.csv")))
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from fake_useragent import UserAgent

import Scraper

if __name__ == '__main__':
    s = Scraper.Scraper()
    s.scrape_product_page()
'''
DeviantArt Image downloader - Takes one or more keywords as input and downloads
the specified number of images from deviantArt
'''
import sys, os
import re
import requests
import Downloader
import Scraper

#Input keywords
keywords = input("Enter search terms : ").split()

d = Downloader.Downloader()
sc = Scraper.Scraper()

#Download the corresponding DeviantArt page
response = d.downloadPage(keywords)

#Scrape all page urls
linkTags = sc.getLinkTags(response)

#Open all page urls, download them and scrape the image urls along with names
imageURLs, imageNames = sc.getImageTags(linkTags)

#Create a new directory in C:\Users\HP\Pictures\Scraped if it does not already exist. Then switch to that directory
try:
    os.chdir(r'C:\Users\HP\Pictures\Scraped' + '\\' + " ".join(keywords).title())
except FileNotFoundError:
    os.makedirs(r'C:\Users\HP\Pictures\Scraped' + '\\' + " ".join(keywords).title())
    os.chdir(r'C:\Users\HP\Pictures\Scraped' + '\\' + " ".join(keywords).title())
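The excerpt stops after switching into the output directory; a minimal continuation sketch (not part of the original), assuming imageURLs and imageNames are parallel lists of direct image URLs and base file names — the .jpg extension is an assumption:

# Download each scraped image into the current directory.
for url, name in zip(imageURLs, imageNames):
    image = requests.get(url)
    with open(name + '.jpg', 'wb') as f:
        f.write(image.content)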
def main(argv):
    logging.basicConfig(filename='AutoDownload.log',
                        level=logging.INFO,
                        format='%(asctime)s - %(levelname)s:%(message)s')

    # define a Handler which writes INFO messages or higher to the sys.stderr console
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    # add the handler to the root logger
    logging.getLogger('').addHandler(console)

    rootlogger = logging.getLogger('AutoDownload')
    logger = logging.getLogger('AutoDownload.main')
    logger.info("*** START ***")
    logger.info("command args: [%s]", " ".join(argv))

    try:
        current_month = date.today().month
        current_year = date.today().year
        current_day = 0
        month_year_str = '{!s}-{!s}'.format(month_digit_to_string(current_month), str(current_year))
        cfgstr = "config.ini"
        card_type = 'cd'
        statement_issued_time = 'd'
        security_mode = False

        try:
            opts, args = getopt.getopt(argv, "hdasc:m:t:", ["month="])
        except getopt.GetoptError:
            logger.exception("usage error")
            print_usage()
            sys.exit(2)

        for opt, arg in opts:
            if opt == '-h':
                print_usage()
                sys.exit()
            elif opt == '-c':
                cfgstr = arg
            elif opt == "-m":
                statement_issued_time = 'm'
                month_year_str = arg
            elif opt == '-d':
                statement_issued_time = 'd'
                current_day = int(date.today().day)
            elif opt == '-a':
                statement_issued_time = 'a'
            elif opt == '-t':
                card_type = arg
            elif opt == '-s':
                security_mode = True
            else:
                print("Invalid arguments!")
                logger.error("invalid argument [%s]", opt)
                return

        cfg_path = Path(cfgstr)
        if not cfg_path.is_file():
            print("missing config file")
            logger.error("missing config file (%s)", str(cfg_path))
            return

        cfg = configparser.ConfigParser()
        cfg.read(str(cfg_path))

        new_log_level = logging.getLevelName(cfg['Log']['level'])
        logging.getLogger('AutoDownload').setLevel(new_log_level)
        console.setLevel(new_log_level)
        logger.info("log level is %s", logger.getEffectiveLevel())

        scraper = Scraper.Scraper(cfg)
        scraper.download(statement_issued_time, month_year_str, current_day, card_type, security_mode)
        logger.info("--- DONE ---")
    except:
        logger.exception("!!! FAILED !!!")
        sys.exit(3)

    logger.info("### END ###")
#encoding:utf-8
import sys
import Queue
from Scraper import *
import time
import urlparse
import bs4
import threading

test_obj = Scraper(single_page=True, workers_num=15)
test_obj.feed(['http://freebuf.com'])
time.sleep(5)

z = test_obj.get_result_urls_queue()
while True:
    try:
        print z.get(timeout=4)
    except:
        print "error"
        break
# imports needed to run this excerpt standalone (not shown in the original snippet)
import os
from Scraper import *

from FileDownloader import *
from ArchiveExtractor import *

current_folder = os.path.dirname(os.path.realpath(__file__))

brand_name = "HARTÔ"
skip_web_scraping = False
download_files = True

output_json = []
output_json_path = current_folder + "/exported_datas/" + brand_name.lower() + "_product_pages_data.json"

# init the scraper
scraper = Scraper(export_dir=current_folder + "/exported_datas",
                  download_dir=current_folder + "/downloads_product_pages")

# >>> brand data ----------------------------------------------------------------------------------------------------

# name that will be used in the database
scraper.set_brand_name(brand_name)

# homepage of the site to scrape
# sometimes the links are relative, so homepage + link must be combined to get the correct url
scraper.homepage = "https://www.hartodesign.fr/"

# first page to load, not necessarily the homepage
scraper.load_page(scraper.homepage)

# <<< brand data ----------------------------------------------------------------------------------------------------
def do_GET(self):
    mode = getValue("mode", self.requestline)
    message = "%s is not a mode" % mode

    if mode == "echo":
        message = "Hola Mundo"
    if mode == "init":
        message = getHTML(None)
    if mode == 'train':
        modelName = getValue("modelName", self.requestline)
        ml = Classifier()
        ml.train()
        ml.save(modelName)
        message = "Training Successful"
    if mode == "test":
        print(self.requestline)
        link = getValue("link", self.requestline)
        modelName = getValue("modelName", self.requestline)
        scraper = Scraper()
        scraper.scrape(link)
        ml = Classifier()
        ml.load(modelName)
        outputs = ml.test('./Articles/UserQuery.txt')
        result = biasCalculation(outputs)
        message = getHTML(result)
    # else:
    #     print(self.requestline)
    #     string = self.requestline
    #     string = string.replace("GET /", "")
    #     index = string.index(" ")
    #     file = open(string[:index],"rb")
    #     message = "hello"#file.read()
    #     return file(string)

    # Send response status code
    self.send_response(200)

    # Send headers
    self.send_header('Content-type', 'text/html')
    self.send_header("Access-Control-Allow-Origin", "*")
    self.send_header('Access-Control-Allow-Headers', 'Content-Type,Authorization')
    self.send_header('Access-Control-Allow-Methods', 'GET')
    self.end_headers()

    # Write content as utf-8 data
    self.wfile.write(bytes(message, "utf8"))
    return
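The getValue helper called throughout do_GET is not defined in this excerpt; a minimal sketch, assuming it pulls a named query parameter out of the raw HTTP request line (e.g. "GET /?mode=test&link=... HTTP/1.1"):

from urllib.parse import urlparse, parse_qs

def getValue(name, requestline):
    # requestline looks like "GET /?mode=test&... HTTP/1.1"; take the path component.
    path = requestline.split(" ")[1]
    params = parse_qs(urlparse(path).query)
    # parse_qs maps each parameter to a list of values; return the first, or None.
    return params.get(name, [None])[0]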
def scrape():
    scraper = Scraper()
    print('Done!')
import logging
import pymongo
import schedule
import time
from Scraper import *

logging.basicConfig(
    filename='scheduledscraperlog.log',
    format='%(asctime)s %(levelname)s %(name)s %(threadName)s : '
           '%(message)s',
    level=logging.INFO)

url = 'https://www.espn.com/nba/scoreboard'
scraper = Scraper(url)

mongo_url = 'mongodb+srv://dbBoris:[email protected]/test?retryWrites=true&w=majority'
mongo_client = pymongo.MongoClient(mongo_url, maxPoolSize=50, connect=False)
db = pymongo.database.Database(mongo_client, 'games_database')
collection = pymongo.collection.Collection(db, 'games_collection')

scraper.scrape()  # Initial scraping
logging.info("Scheduled scraper initialized.")
schedule.every(1).minutes.do(scraper.scrape)

while 1:
    schedule.run_pending()
    for game in scraper.data:
        # New games enter the database, while existing games (match both name and date) are updated (only if needed).
        if not db.collection.count_documents({
                'name': game.get('name'),
from os import listdir
import re
import json
from Scraper import *

scraper = Scraper()

brand_name = "VERPAN"

new_json = []

# new_format = {
#     "name": product name
#     "details": component name
#     "collection": collection name
#     "type": string
#     "designers": string[],
#     "description": string,
#     "brandWebsiteUrl": string
#     "width": int in mm,
#     "height": int in mm,
#     "depth": int in mm,
#     "seatHeight": int in mm,
#     "diameter": int in mm,
#     "weight": int in g,
#     "designedDate": 4-digit year,
#     "thumbnail": string,
#     "sourceFiles": string[]
# }

# value of type
# - Furniture
def update():
    # field initialization
    host = "https://hamstudy.org"
    first, last = (0, 5)  # control variable range for extracting specific links
    print("Web Host: ")
    scraper = Scraper(host)
    secondaryURL = scraper.href(last)
    allQuestions = []
    allanswers = []
    allcorrectAnswers = []
    print()

    # gets the different sections (Technician, General, Amateur Extra)
    first, last = (5, 8)
    sections = scraper.href(first, last)

    # loops through each section and gets the questions and answers
    for type in sections:
        typeHandle = type.replace('/', '')
        link = host + type
        scraper.modifyURL(link)
        first, last = (0, 17)
        extension = host + ''.join(scraper.href(21, 22))

        # open Main Links
        scraper.modifyURL(extension)
        first, last = (7, 17)
        mainLinks = scraper.href(first, last)

        # getting sublinks from mainLinks
        print("\nTop Links in " + typeHandle + ": ")
        first, last = (17, None)
        allLinks = []
        for extension in mainLinks:
            scraper.modifyURL(host + extension)
            subLinks = scraper.href(first, last)
            allLinks.append(subLinks)

        # clear files before adding contents
        open('Questions.txt', 'w').close()
        open('Answers.txt', 'w').close()
        open('CorrectAnswers.txt', 'w').close()

        # Opening all links
        print("\nOpening all links " + typeHandle + ": ")
        for links in allLinks:
            for link in links:
                scraper.modifyURL(host + link)
                sectionQuestions = scraper.get_web_element("div", "question")
                sectionAnswers = scraper.get_web_element("li", "answer")
                correctAnswers = scraper.get_web_element("li", "answer correct")

                with open("data/" + typeHandle + 'Questions.txt', 'a') as fileHandle:
                    for question in sectionQuestions:
                        fileHandle.write('%s\n' % question.text.strip())

                with open("data/" + typeHandle + 'Answers.txt', 'a') as fileHandle:
                    for answers in sectionAnswers:
                        fileHandle.write('%s\n' % answers.div.text)

                with open("data/" + typeHandle + 'CorrectAnswers.txt', 'a') as fileHandle:
                    for answer in correctAnswers:
                        fileHandle.write('%s\n' % answer.label.text[0].strip())
import json
import Scraper
#import CourseCatalog as CC
from TermMasterSchedule import Quarters

scraper = Scraper.Scraper()
data: dict = scraper.scrapeTMS(terms=["Fall Quarter 20-21"],
                               colleges=["Arts and Sciences"],
                               subjects=["GEO"])

assert (len(data) != 0), "No data was found!"

json.dump(data, open('temp_TMS.json', 'w'))
from Scraper import *
from config import *
import requests
from bs4 import BeautifulSoup

DDSM_page = requests.get(DDSM_url)
DDSM_html = BeautifulSoup(DDSM_page.text)
DDSM_table = DDSM_html.find('table')
DDSM_links = DDSM_table.findAll('a')

Scraper(DDSM_links)