Code Example #1
File: database.py  Project: bofrim/Web_scraper
def collect(url, visited = None):
    '''
    Used to access a new page and put its data into a page instance.
    Controls the user's specified operations.
    Runs only if it needs to.
    Running time depends on factors such as internet speed and cached results.
    '''
    # Find the locations of the files
    file_path = os.getcwd()+'/data/'+getDomain(url)

    # Create an empty set if visited is not supplied
    if visited is None:
        if os.path.exists(file_path):
            visited = set(os.listdir(file_path))

        else:
            visited = set()

    # Check if there is data existing for the website
    if os.path.exists(file_path):
        # Check if the page has been visited
        if url.replace('/', '|')+'.txt' not in visited:
            scrape(url)
    else:
        scrape(url)

    return visited
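For context, the following is a minimal, self-contained sketch of the cache-before-scrape idea that collect() relies on: look for a cached copy of the page under a per-domain data directory and only fetch when none exists. The domain_of() and fetch_page() helpers and the exact directory layout are assumptions for illustration, not code from the project.

# Illustrative sketch only: domain_of() and fetch_page() are hypothetical
# stand-ins for the project's getDomain()/scrape() pair.
import os
import urllib.request
from urllib.parse import urlparse


def domain_of(url):
    # Hypothetical equivalent of the project's getDomain() helper.
    return urlparse(url).netloc


def fetch_page(url, visited=None):
    cache_dir = os.path.join(os.getcwd(), 'data', domain_of(url))
    cache_file = url.replace('/', '|') + '.txt'

    # Build the visited set from the cache directory if none was supplied.
    if visited is None:
        visited = set(os.listdir(cache_dir)) if os.path.exists(cache_dir) else set()

    if cache_file not in visited:
        # Not cached yet: fetch the page and store it on disk.
        os.makedirs(cache_dir, exist_ok=True)
        html = urllib.request.urlopen(url).read().decode('utf-8', errors='replace')
        with open(os.path.join(cache_dir, cache_file), 'w', encoding='utf-8') as f:
            f.write(html)
        visited.add(cache_file)

    return visited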
Code Example #2
def start_updater(DB_LOCATION, SCRAPE_PERIOD, SMTP_USERNAME, SMTP_PASSWORD,
                  webapp_context):
    # Used in a thread to scrape periodically.
    def scrape_and_update(webapp_context):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        while webapp_context["running"]:
            time.sleep(SCRAPE_PERIOD)
            db_connection = sqlite3.connect(DB_LOCATION)
            db_cursor = db_connection.cursor()
            previous_scrape_datetime = datetime.datetime.now()
            try:
                scrape(DB_LOCATION)
                email_new_urops(DB_LOCATION, previous_scrape_datetime,
                                SMTP_USERNAME, SMTP_PASSWORD)
                webapp_context["data_json"] = create_data_json(
                    DB_LOCATION, previous_scrape_datetime)
            except Exception:
                # Log the failure but keep the updater loop alive.
                traceback.print_exc()
            db_connection.commit()
            db_connection.close()

    # Make initial data_json, start scraping thread, before serving.
    previous_scrape_datetime = datetime.datetime.now()
    scrape(DB_LOCATION)
    email_new_urops(DB_LOCATION, previous_scrape_datetime, SMTP_USERNAME,
                    SMTP_PASSWORD)
    webapp_context["data_json"] = create_data_json(DB_LOCATION,
                                                   previous_scrape_datetime)
    updater = threading.Thread(target=scrape_and_update,
                               args=(webapp_context, ))
    updater.daemon = True
    updater.start()
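As a self-contained sketch of the same pattern, the periodic updater boils down to a daemon thread that polls a shared context dictionary. The refresh_data() function below is a placeholder for the scrape/email/create_data_json work and is not part of the original project.

# Self-contained sketch of the daemon-thread updater pattern shown above.
# refresh_data() is a placeholder, not code from the original project.
import threading
import time

SCRAPE_PERIOD = 60  # seconds between refreshes (assumed value)


def refresh_data():
    # Stand-in for scrape() + email_new_urops() + create_data_json().
    return {"updated_at": time.time()}


def start_background_updater(webapp_context):
    def loop():
        while webapp_context["running"]:
            time.sleep(SCRAPE_PERIOD)
            try:
                webapp_context["data_json"] = refresh_data()
            except Exception:
                # Keep the updater alive even if one refresh fails.
                pass

    webapp_context["data_json"] = refresh_data()  # initial fill before serving
    updater = threading.Thread(target=loop, daemon=True)
    updater.start()
    return updater


# Usage: the web app flips "running" to False on shutdown to stop the loop.
context = {"running": True, "data_json": None}
start_background_updater(context)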
Code Example #3
File: iterate.py  Project: wnaftw/uscrape
def iterate(urlList, n):
    if len(urlList) >= 1:
        for i in range(0, len(urlList)):
            # Skip links the scraper cannot handle (quotes in the URL, .js files, etc.)
            print "ATTEMPT: %s" % (urlList[i])
            if urlList[i][-1] == 'd':
                print "Not supported"
                continue
            elif urlList[i].find('"') != -1 or urlList[i].find(".js") != -1:
                print "Not supported"
                continue
            html = urllib2.urlopen(urlList[i]).read()
            scrape(html, urlList)
            if len(urlList) > n:
                break
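This uscrape example is Python 2 code (print statements, urllib2). A rough Python 3 equivalent of the bounded iteration loop might look like the sketch below; the scrape() stub stands in for the project's real scrape module, which appends newly discovered links to the list.

# Rough Python 3 sketch of the bounded link-iteration loop (illustrative only).
import urllib.request


def scrape(html, url_list):
    # Placeholder for the project's scrape(); the real one appends found links.
    pass


def iterate(url_list, n):
    for url in list(url_list):  # snapshot, matching the original range() behaviour
        print("ATTEMPT: %s" % url)
        # Skip links the scraper cannot handle, as in the original.
        if url.endswith('d') or '"' in url or '.js' in url:
            print("Not supported")
            continue
        html = urllib.request.urlopen(url).read().decode('utf-8', errors='replace')
        scrape(html, url_list)
        if len(url_list) > n:  # stop once enough URLs have been collected
            break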
Code Example #4
File: main.py  Project: JPDaly/Instant_Insta
def main():
    choice = input("Do you want to scrape (s) new images or use an existing (e) folder? (s/e): ")
    if choice == 's':
        folder = scrape()
    else:
        folder = None
    if input("\nWould you like to use classifier.py to remove outliers? (y/n): ") != 'n':
        classifier(folder)
    if input("\nWould you like to use average.py to generate an image? (y/n): ") != 'n':
        average(folder)
    return
Code Example #5
    'c++', 'vulkan', 'opengl', 'python 3', 'golang'
]

# print options
print("OPTIONS:")
i = 0
while i < len(options_list):
    print(', '.join(options_list[i:i + int((len(options_list) / 5))]))
    i = i + int((len(options_list) / 5))

mail_id = input("Enter your e-mail(gmail) id here: ")
password = input("Enter your password: ")
tech_need = input("Enter what you want from the above stack: ")
tech_need_list = []

# take all technologies user wants
while tech_need.lower() != 'exit':
    tech_need_list.append(tech_need.lower())
    tech_need = input("Enter what you want from the above stack: ")

# call to scrape function which return rows with required data
site_data = scrape(tech_need_list)

# If matching data was found, write it to a file and send the mail; otherwise do nothing
if len(site_data) != 0:
    writeFile(site_data)
    sendMail(mail_id, password)
    print("DONE!! :)")
else:
    print('NO matching data found for you :(')
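The sendMail() helper itself is not shown in this example. A hypothetical implementation using only the standard library might look like the following; the subject line and results file name are assumptions.

# Hypothetical sketch of a sendMail()-style helper (not the project's code).
import smtplib
from email.message import EmailMessage


def send_mail(mail_id, password, body_path='results.txt'):
    msg = EmailMessage()
    msg['Subject'] = 'Matching tech stack results'  # assumed subject
    msg['From'] = mail_id
    msg['To'] = mail_id
    with open(body_path, 'r', encoding='utf-8') as f:
        msg.set_content(f.read())

    # Gmail's SMTP-over-SSL endpoint; an app password is usually required.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(mail_id, password)
        server.send_message(msg)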
Code Example #6
File: Professor.py  Project: dhafen21/RateMyProfessor
 def get_data(self, url, wd):
     # Unpack the tuple returned by scrape() into the professor's data fields.
     self.data.class_name, self.data.quality, self.data.difficulty, self.data.comments, self.name, self.data.date = scrape(
         self.url, wd)
Code Example #7
File: app.py  Project: roopa90/Indeed-Job-Scraper-1
You can then download the full dataframe as an Excel sheet for convenience.

**NOTE: Parsing through all job descriptions can take some time (up to 30 seconds).**

'''

# Hacky way of allowing downloading
global_df = None

######################################################################################################################
# Creating app instance and designing layout #
######################################################################################################################

# create scrape instance
scraper = scrape()

# better stylesheet
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# Create app
app = dash.Dash(external_stylesheets=external_stylesheets)

# assign server instance
server = app.server

# Create layout
app.layout = html.Div(children=[
    dcc.Markdown(heading),
    dbc.Progress(id="progress", value=0, striped=True, animated=True),
    html.Div(children=[
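The layout above is cut off mid-definition in this excerpt. For orientation, a stripped-down version of the Dash scaffolding (without the scraper wiring or the Bootstrap progress bar) would look roughly like this; the heading text and layout contents are placeholders.

# Bare-bones sketch of the Dash app scaffolding (placeholder content only).
import dash
from dash import dcc, html

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
server = app.server  # WSGI server instance for deployment

app.layout = html.Div(children=[
    dcc.Markdown('# Indeed Job Scraper'),  # placeholder heading
    html.Div(id='results'),
])

if __name__ == '__main__':
    app.run(debug=True)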
Code Example #8
File: main.py  Project: wnaftw/uscrape
#UrlScraper

import urllib2
from scrape import *
from iterate import *

url = ""
#How to download images
f = open("test.gif", "wb")
f.write(urllib2.urlopen(url).read())
f.close()

urlList = []
site = urllib2.urlopen(url)
html = site.read()

scrape(html, urlList)

print len(urlList)

#for i in range(0, len(urlList)):
#    print urlList[i]

iterate(urlList, 150)

for i in range(0, len(urlList)):
    print urlList[i]
    
print len(urlList)

scrapeHTML(urlList)
Code Example #9
import time, dropbox, os, datetime
from dropbox.files import WriteMode
from dropbox.exceptions import ApiError
from scrape import *
from BellBot import *

listings = 'Listings.txt'
client = dropbox.Dropbox('key')
counter = 0

while True:
    if counter == 3:
        counter = 0
        try:
            print('scraping cattle exchange now')
            listings = scrape()
            print('getting cattle listings')
            get_listings_text(listings)
            print('scrape is complete')
        except Exception:
            print('exception occurred')

        # upload the listings file to the Dropbox folder
        with open(listings, 'rb') as f:
            dest_path = datetime.datetime.today().strftime('%d-%m-%Y')
            dest_path = '/Cowbell/{}_listings.txt'.format(dest_path)
            try:
                client.files_upload(f.read(), dest_path, mode=WriteMode('add'))
            except ApiError:
                print('API error occurred')
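For reference, the Dropbox upload at the end of this example can be isolated into a small, self-contained sketch; the access token and local file name are placeholders, just as they are in the original snippet.

# Minimal sketch of the dated Dropbox upload used above (placeholders only).
import datetime

import dropbox
from dropbox.files import WriteMode
from dropbox.exceptions import ApiError

ACCESS_TOKEN = 'key'            # placeholder token, as in the original
LISTINGS_FILE = 'Listings.txt'  # local listings file to upload

client = dropbox.Dropbox(ACCESS_TOKEN)
dest_path = '/Cowbell/{}_listings.txt'.format(
    datetime.datetime.today().strftime('%d-%m-%Y'))

with open(LISTINGS_FILE, 'rb') as f:
    try:
        client.files_upload(f.read(), dest_path, mode=WriteMode('add'))
    except ApiError as err:
        print('Dropbox API error:', err)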