Example #1
import argparse

import youtube_dl
# the original loads tags through a module aliased `metadata`; eyed3 matches
# the load()/.tag API used below, so it is assumed here
import eyed3 as metadata


def main():
    parser = argparse.ArgumentParser(
        description="Help with the process of putting a song in Apple Music.")
    parser.add_argument(
        "youtube",
        help="The YouTube link to the song's video; must be surrounded in quotes")
    parser.add_argument("--title",
                        nargs="+",
                        default=None,
                        help="The title of the music")
    parser.add_argument("--artist",
                        nargs="+",
                        default=None,
                        help="The artist of the music")
    parser.add_argument("--thumbnail",
                        action="store_true",
                        help="If the music should include art")
    parser.add_argument("--geckodriver",
                        default=None,
                        help="Path to geckodriver, must surround in quotes")

    arguments = parser.parse_args()

    arguments.title = " ".join(arguments.title)
    arguments.artist = " ".join(arguments.artist)

    name = f"{arguments.artist} - {arguments.title}"

    options = {
        "format": "bestaudio/best",
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "mp3",
            "preferredquality": "192",
        }],
        "outtmpl": f"{name}.%(ext)s",
    }

    with youtube_dl.YoutubeDL(options) as youtube:
        youtube.download([arguments.youtube])

    # load the downloaded mp3 so its tags can be edited
    music = metadata.load(f"{name}.mp3")

    if arguments.title:
        music.tag.title = arguments.title

    if arguments.artist:
        music.tag.artist = arguments.artist

    if arguments.thumbnail and arguments.geckodriver:
        from scraper import Scraper

        scraper = Scraper(arguments.geckodriver, music)
        scraper.begin(name=name)

    music.tag.save()
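The snippet defines main() but never calls it; a standard entry-point guard makes it runnable, and the invocation in the comment shows typical arguments (the script file name is hypothetical):

if __name__ == "__main__":
    main()

# Example invocation (script name is illustrative):
#   python apple_music_helper.py "https://www.youtube.com/watch?v=..." \
#       --title My Song --artist Some Artist \
#       --thumbnail --geckodriver "/usr/local/bin/geckodriver"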
Example #2
# assumes local modules that provide the Graph and Scraper classes used below
graph = Graph()
running = True

while running:
    print "Commands:"
    print "0: quit"
    print "1:(Actor/Movie to parse)"
    print "2:(load graph path)"
    print "3:(execute command)"
    # Python 2 example: raw_input reads a line; avoid shadowing built-in input()
    choice = raw_input("Command")
    print choice
    if choice[0] == '0':
        break
    if choice[0] == '1':
        test = Scraper('https://en.wikipedia.org/wiki/' + choice[2:], 50)
        test.set_speed(1)
        graph = test.begin()
    if choice[0] == '2':
        graph.open_json(choice[2:])
    if choice[0] == '3':
        # exec of raw user input is unsafe; fine only for local debugging
        cmd = "print "
        cmd += choice[2:]
        exec(cmd)
# commented-out ad-hoc tests, kept for reference:
# test_two = Scraper('https://en.wikipedia.org/wiki/Ryan_Reynolds', 30)
# test_two.set_speed(1)
#
# print str(Scraper.get_oldest_actors(graph, 5))
#
# print str(Scraper.get_movies(graph, 2009))
# print str(Scraper.get_actors(graph, 2009))
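Option 3 above pipes raw user input through exec, which will run anything typed at the prompt. A dictionary of whitelisted query commands is a safer dispatch pattern; this is a minimal Python 2 sketch assuming the Scraper query methods from the commented-out tests (SAFE_COMMANDS and run_command are illustrative names):

# dictionary dispatch over a whitelist of known queries (sketch)
SAFE_COMMANDS = {
    'oldest': lambda g, arg: Scraper.get_oldest_actors(g, int(arg)),
    'movies': lambda g, arg: Scraper.get_movies(g, int(arg)),
    'actors': lambda g, arg: Scraper.get_actors(g, int(arg)),
}

def run_command(graph, name, arg):
    handler = SAFE_COMMANDS.get(name)
    if handler is None:
        print "Unknown command: " + name
        return
    print str(handler(graph, arg))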
Example #3
import threading
import uuid
from time import strftime

import pika
import simplejson

# Scraper is provided by the surrounding BarkingOwl package (not shown here)


class ScraperWrapper(threading.Thread):

    def __init__(self, address='localhost', exchange='barkingowl', DEBUG=False):
        threading.Thread.__init__(self)

        self.uid = str(uuid.uuid4())
        self.address = address
        self.exchange = exchange
        self.DEBUG = DEBUG
        self.interval = 1

        # create the wrapped scraper instance
        self.scraper = Scraper(uid=self.uid, DEBUG=DEBUG)
        self.scraping = False

        # set up the message bus: one channel for publishing responses,
        # one for consuming requests.
        # note: type= and no_ack= match the older pika API this code was
        # written against; newer pika renamed them (exchange_type=, auto_ack=)
        self.respcon = pika.BlockingConnection(pika.ConnectionParameters(
            host=self.address))
        self.respchan = self.respcon.channel()
        self.respchan.exchange_declare(exchange=self.exchange, type='fanout')

        self.reqcon = pika.BlockingConnection(pika.ConnectionParameters(host=address))
        self.reqchan = self.reqcon.channel()
        self.reqchan.exchange_declare(exchange=exchange, type='fanout')
        result = self.reqchan.queue_declare(exclusive=True)
        queue_name = result.method.queue
        self.reqchan.queue_bind(exchange=exchange, queue=queue_name)
        self.reqchan.basic_consume(self.reqcallback, queue=queue_name, no_ack=True)

        if self.DEBUG:
            print "Scraper Wrapper INIT complete."

    def run(self):
        # set up callbacks
        self.scraper.setFinishedCallback(self.scraperFinishedCallback)
        self.scraper.setStartedCallback(self.scraperStartedCallback)
        self.scraper.setBroadcastDocCallback(self.scraperBroadcastDocCallback)

        # broadcast availability
        self.broadcastavailable()
        self.reqchan.start_consuming()

    def stop(self):
        self.scraper.stop()
        self.reqchan.stop_consuming()

    def broadcastavailable(self):
        if self.scraper.status['busy']:
            # currently scraping, so not available - don't broadcast
            return

        isodatetime = strftime("%Y-%m-%d %H:%M:%S")
        packet = {
            'availabledatetime': str(isodatetime)
        }
        payload = {
            'command': 'scraper_available',
            'sourceid': self.uid,
            'destinationid': 'broadcast',
            'message': packet
        }
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

        #
        # TODO: move this over to its own timer, no need to do it here.
        #
        # re-arm the availability broadcast unless the scraper has stopped
        if self.scraper.stopped():
            raise Exception("Scraper Wrapper Exiting")
        else:
            threading.Timer(self.interval, self.broadcastavailable).start()
        
    def broadcaststatus(self):
        isodatetime = strftime("%Y-%m-%d %H:%M:%S")
        packet = {
            'status': self.scraper.status,
            'urldata': self.scraper.status['urldata'],
            'statusdatetime': str(isodatetime)
        }
        payload = {
            'command': 'scraper_status',
            'sourceid': self.uid,
            'destinationid': 'broadcast',
            'message': packet
        }
        jbody = simplejson.dumps(payload)
        #time.sleep(.5)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

    def broadcastsimplestatus(self):
        isodatetime = strftime("%Y-%m-%d %H:%M:%S")

        if self.scraper.status['urldata'] == {}:
            targeturl = 'null'
        else:
            targeturl = self.scraper.status['urldata']['targeturl']

        packet = {
            'busy': self.scraper.status['busy'],
            'linkcount': self.scraper.status['linkcount'],
            'processedlinkcount': len(self.scraper.status['processed']),
            'badlinkcount': len(self.scraper.status['badlinks']),
            'targeturl': targeturl,
            'statusdatetime': str(isodatetime)
        }
        payload = {
            'command': 'scraper_status_simple',
            'sourceid': self.uid,
            'destinationid': 'broadcast',
            'message': packet
        }
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)

    def scraperFinishedCallback(self, payload):
        # relay the scraper's 'finished' event onto the message bus
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
        return

    def scraperStartedCallback(self, payload):
        # relay the scraper's 'started' event onto the message bus
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
        return

    def scraperBroadcastDocCallback(self, payload):
        # relay each document the scraper finds onto the message bus
        jbody = simplejson.dumps(payload)
        self.respchan.basic_publish(exchange=self.exchange, routing_key='', body=jbody)
        return

    # message handler for requests arriving on the bus
    def reqcallback(self, ch, method, properties, body):
        try:
            response = simplejson.loads(body)
            if self.DEBUG:
                print "Processing Message:\n\t{0}".format(response['command'])
            if response['command'] == 'url_dispatch':
                if response['destinationid'] == self.uid:
                    if not self.scraping:
                        # hand the dispatched URL data to the scraper and
                        # launch it (the thread only needs starting once)
                        self.scraper.seturldata(response['message'])
                        if not self.scraper.started:
                            self.scraper.start()
                        self.scraper.begin()
                        self.scraping = True

            elif response['command'] == 'scraper_finished':
                if response['sourceid'] == self.scraper.uid:
                    self.scraping = False

            elif response['command'] == 'get_status':
                self.broadcaststatus()

            elif response['command'] == 'get_status_simple':
                self.broadcastsimplestatus()

            elif response['command'] == 'shutdown':
                if response['destinationid'] == self.uid:
                    print "[{0}] Shutdown Received".format(self.uid)
                    self.stop()

            elif response['command'] == 'global_shutdown':
                print "Global Shutdown Received"
                self.stop()

        except Exception:
            if self.DEBUG:
                print "Message Error"