Code example #1
File: crawler.py  Project: samistart/WebCrawler
def gather_links(page_url):
    html_string = ''
    # urlparse lives in urllib.parse, not urllib.request
    url_info = urllib.parse.urlparse(page_url)
    if url_info.netloc != Crawler.domain_name:
        return set()
    try:
        response = urllib.request.urlopen(page_url)
    except Exception:
        print("Cannot open page: " + page_url)
        return set()
    if 'text/html' in (response.getheader('Content-Type') or ''):
        html_bytes = response.read()
        soup = bs4.BeautifulSoup(html_bytes, "html.parser")
        if config.GENERATE_SITE_MAP:
            path = url_info.path
            Crawler.xml_writer.write(path)
        if config.DOWNLOAD_HTML:
            file_path = file_writing.get_file_path(Crawler.project_name, page_url)
            file_writing.create_dir_from_file_path(file_path)
            try:
                html_string = html_bytes.decode("utf-8")
                file_writing.write_file(file_path, html_string)
            except (UnicodeDecodeError, OSError):
                print("Cannot write to file: " + page_url)
        scraping.scrape(soup, Crawler.domain_name, Crawler.xml_writer)
        return scraping.get_links(soup, Crawler.base_url)
    # non-HTML responses contribute no links
    return set()
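
A minimal sketch of how gather_links might be driven by a breadth-first crawl loop; the queue and visited sets below are hypothetical and not part of the samistart/WebCrawler project:

queue = {Crawler.base_url}   # hypothetical frontier of URLs still to visit
visited = set()

while queue:
    page_url = queue.pop()
    visited.add(page_url)
    # gather_links returns an empty set for off-domain, unreachable or non-HTML pages
    for link in gather_links(page_url):
        if link not in visited:
            queue.add(link)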
Code example #2
def bing_the_query_field(query_field, bing_api_key, n_results):
    # Bings a query and returns the first n results, omitting URLs that
    # contain a word from the forbidden list
    forbidden_list = ["wikipedia", "bloomberg", "companiesintheuk", "duedil", "companycheck", "prnewswire", "google", "companieslist", "linkedin", "endole.co.uk", "tuugo", "companiesireland", "top1000", "directorsintheuk", "companydirectorcheck", "yell", "192.com", "facebook", "solocheck", "reuters.com", "idevon.co.uk", "slideshare"]

    bing_search_url = 'https://api.datamarket.azure.com/Data.ashx/Bing/Search/Web?Query='+ query_field + '&$format=json'
    #print bing_search_url

    bing_response = scrape(bing_search_url, bing_api_key)

    list_of_url = []
    if bing_response and 'd' in bing_response and 'results' in bing_response['d']:
        for result in bing_response['d']['results']:
            if not any(element in result['Url'] for element in forbidden_list):
                list_of_url.append(result['Url'])

    return list_of_url[:n_results]
Code example #3
def getScore(link):
    vidName = scrape(link)
    process_likes(vidName)
    score = 0
    video_comm = pd.read_csv(vidName)
    video_comm = video_comm.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
    video_comm.rename(columns={'0': 'comment', '1': 'likes'}, inplace=True)
    print('likes:')
    print(video_comm.get('likes'))
    num_likes = video_comm.get('likes').sum()
    #num_comments = video_comm['comment'].count()
    video_comm['likes'] = video_comm['likes'] + 1
    for index, comm in video_comm.iterrows():
        try:
            pol = TextBlob(comm['comment']).sentiment
            score_comm = pol.polarity * comm['likes']
            score += score_comm
        except Exception:
            # skip comments that cannot be analysed (e.g. NaN values)
            continue
    return score / num_likes


#Add links here
#print(getScore('https://www.youtube.com/watch?v=-niuhBmUPLU'))
#print(getScore('https://www.youtube.com/watch?v=fP17mIEv8lo'))
#print(getScore('https://www.youtube.com/watch?v=QD0IM5tfnVQ'))
#print(getScore('https://www.youtube.com/watch?v=rmQMKowvYeo'))
#print(getScore('https://www.youtube.com/watch?v=QPkXJvULrN8'))
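
To make the weighting in getScore concrete, here is a small self-contained check using made-up comments and like counts (not data from any real video): each polarity is weighted by likes + 1 and the sum is divided by the raw like total.

import pandas as pd
from textblob import TextBlob

toy = pd.DataFrame({'comment': ['great video', 'not my thing'], 'likes': [3, 0]})
num_likes = toy['likes'].sum()            # 3
toy['likes'] = toy['likes'] + 1           # weights become 4 and 1
weighted = sum(TextBlob(c).sentiment.polarity * w
               for c, w in zip(toy['comment'], toy['likes']))
print(weighted / num_likes)               # like-weighted polarity, normalised by the raw like count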
Code example #4
def search_result():
    if request.method == 'POST':
        productname = request.form.get('username')
        global data
        data = scrape(productname)
        return render_template('search_result.html', message=data)
    if request.method == 'GET':
        data = []
        return render_template('search_result.html', message=data)
Code example #5
File: app.py  Project: Gagandeepsingh10798/Projects
def index():
    data = str(request.args.get('query'))
    data = scrape(data)
    # response = app.response_class(
        # response=json.dumps(data),
        # status=200,
        # mimetype='application/json'
    # )
    # return response
    return render_template("index.html", datas=data)
Code example #6
File: test_file.py  Project: reyha/Web-Scraper
    def test_scrape(self):

        # Makes sure empty dicts are returned when neither bse nor nse is found for company. For example: SBI Magnum Express

        url = 'http://www.moneycontrol.com/india/stockpricequote/finance-investments/sbimagnumexpress/SBI06'
        company_url = requests.get(url)
        soup = BS(company_url.text, "html.parser")
        b, n = scrape(soup)
        self.assertEqual(b, {})
        self.assertEqual(n, {})
Code example #7
def main():
    if request.method == 'POST':
        company_name = request.form['cname']
        frequency = request.form['freq']
        start_time = request.form['stime']
        end_time = request.form['etime']
        company_url = search_url(company_name)

        if company_url:
            c_url = requests.get(company_url)
            soup = BS(c_url.text, "html.parser")

            # Returns bse and nse contents if present
            b, n = scrape(soup)

            # Adding info to Database
            bse_db = mongo.db.bse
            nse_db = mongo.db.nse

            if b:
                bse_entry = bse_db.insert_one({
                    'BSE Date': b['bse_date'], 'BSE Time': b['bse_time'],
                    'BSE Current Price': b['bse_current_price'], 'BSE absolute price': b['bse_abs_price'],
                    'BSE percentage': b['bse_per'], 'BSE Volume': b['bse_volume'],
                    'BSE Prev close': b['bse_prev_close'], 'BSE Open price': b['bse_open_price'],
                    'BSE bid price': b['bse_bid_price'], 'BSE offer price': b['bse_offer_price']})
            if n:
                nse_entry = nse_db.insert_one({
                    'NSE Date': n['nse_date'], 'NSE Time': n['nse_time'],
                    'NSE Current Price': n['nse_current_price'], 'NSE absolute price': n['nse_abs_price'],
                    'NSE percentage': n['nse_per'], 'NSE Volume': n['nse_volume'],
                    'NSE Prev close': n['nse_prev_close'], 'NSE Open price': n['nse_open_price'],
                    'NSE bid price': n['nse_bid_price'], 'NSE offer price': n['nse_offer_price']})

            # Job scheduling
            if frequency and start_time and end_time:

                # Check to ensure start time is before end time
                if start_time < end_time:
                    trigger = OrTrigger([
                        CronTrigger(hour=start_time + '-' + end_time,
                                    minute=frequency)
                    ])
                    scheduler.add_job(main, trigger)
                else:
                    error = "End time should be after start time"
                    return render_template('index.html', error=error)

            # pymongo collections cannot be truth-tested; check whether any quote data was scraped
            if b or n:
                return redirect(url_for('info'))

        else:
            error = "Sorry! Company not found."
            return render_template('index.html', error=error)

    return render_template('index.html')
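
A standalone sketch of the trigger built above, assuming APScheduler 3.x (the scheduler object itself is created elsewhere in the original app). Note the cron semantics: passing the raw form value, e.g. minute='15', fires once per hour at minute 15, whereas minute='*/15' fires every 15 minutes.

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.combining import OrTrigger
from apscheduler.triggers.cron import CronTrigger

scheduler = BackgroundScheduler()
# fire every 15 minutes between 09:00 and 17:59, i.e. hour='9-17'
trigger = OrTrigger([CronTrigger(hour='9-17', minute='*/15')])
scheduler.add_job(lambda: print("scrape tick"), trigger)
scheduler.start()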
Code example #8
def duedil_company_search(company_name, duedil_api_key):
    # Searches a company by its name
    # Requires scrape

    # Clean company name for use in the query URL
    clean_company_name = company_name.lower().replace(' ', '%20')

    # Do search
    search_url = 'http://duedil.io/v3/companies?filters={"name":"'+clean_company_name+'"}&api_key='+duedil_api_key

    search_response = scrape(search_url)

    if search_response:
        company_url_root = search_response["response"]["data"][0]["company_url"]
        company_url = company_url_root+'?api_key='+duedil_api_key+'&format=json'
        director_url = company_url_root+'/directors'+'?api_key='+duedil_api_key+'&format=json'
    else:
        return False

    # Company profile

    profile_response = scrape(company_url)

    if profile_response and 'response' in profile_response:
        company_profile = profile_response['response']
    else:
        return False

    director_response = scrape(director_url)

    if director_response and 'response' in director_response:
        company_profile['directors'] = director_response['response']['data']
    else:
        return False

    return company_profile
Code example #9
def books():
    if request.method == 'GET':
        return "hello"

    if request.method == 'POST':

        content2 = request.json
        name = content2['name']
        spage = content2['spage']
        epage = content2['epage']
        url = content2['url']

        data = {
            'pid': 0,
            'name': name,
            'url': 0,
            'start_page': 0,
            'end_page': 0,
            'goodreviews': 0,
            'badreviews': 0,
            'no_of_comm': 0,
            'avg': 0,
            'exit': 0
        }

        result = firebase.post(y, data)
        print(result)
        subPart = result.get('name')

        path = y + subPart
        url = str(url)
        good, bad, no_of_comm, avg = scrape(url, spage, epage)
        #whole code

        #a1 = ['car', 'bike', 'bhavik', 'truck', 'quality555', 'little', 'shabby', 'side', 'money', 'expecting', 'dollar',
        #    'snap', 'jumper', 'cable', 'chinese', 'knock', 'shop', 'harbor', 'freight', 'buck']

        send(url, spage, epage, good, bad, path, no_of_comm, avg)

        #new_obj2 = {
        #    'response': result
        #}

        # os.remove("temp.png")
        # return Response(response = image_url)
        try:
            return jsonify(result), 201
        except FileNotFoundError:
            abort(404)
Code example #10
def bing_companies(name, bing_api_key, blacklist=website_blacklist):
    """Bings a query, returns the n first items omits urls containing
    blacklisted words"""
    query_field = "'%s'" % urllib.quote(name)
    bing_search_url = \
        'https://api.datamarket.azure.com/Data.ashx/Bing/Search/Web?Query=' + \
        query_field + '&$format=json'

    response = scrape(bing_search_url, bing_api_key)

    list_of_urls = []
    if response and 'd' in response and 'results' in response['d']:
        for result in response['d']['results']:
            if not any(b in result['Url'] for b in blacklist):
                list_of_urls.append(result['Url'])

    return list_of_urls
Code example #11
def confirm():
    if request.method == 'POST':
        global ingList
        try:
            ingName = request.form['submitButton']
            ingList.remove(ingName)
        except Exception:
            try:
                ingList = nlp_parser.ingredient_getter(scraping.scrape(request.form['Name']))
            except Exception:
                try:
                    ingName = request.form['ing']
                    ingList.append(ingName)
                except Exception:
                    pass

        return render_template("confirm.html", ingList=ingList)
Code example #13
File: app.py  Project: Emilabbas679/scraper
def jobs():
    return jsonify(Jobs=scrape())
Code example #14
File: app.py  Project: bfgreene/spanish-dashboard
def index():
    #return jsonify(scrape())
    return render_template("index.html", articles=scrape(), word=scrapeWord())
Code example #15
def scrape():
    mars = mongo.db.mars_db
    mars_info = scraping.scrape()
    # Collection.update() was removed in pymongo 4; replace_one is the modern equivalent for a whole-document upsert
    mars.replace_one({}, mars_info, upsert=True)
    return "I think it worked"
Code example #16
import scraping

#selects each muscle group
for i in range(1, 19):
    scraping.scrape(i)
Code example #17
def index():
    return jsonify(Diet=scrape())
Code example #18
File: main.py  Project: sspinc/dac
                        choices=["create_db", "scrape", "scrape_once"])
    args = parser.parse_args()
    command = args.command

    configpath = os.getenv('DAC_CONFIG_PATH')

    with open(configpath, 'r') as config_file:
        config = json.load(config_file)

    return args.command, config, configpath


if __name__ == "__main__":
    logformat = "%(asctime)-15s %(name)-12s %(levelname)-8s %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=logformat)
    log = logging.getLogger("dac")
    logging.getLogger("urllib3.connectionpool").setLevel(logging.INFO)

    command, config, configpath = parse_args()

    log.info(f"Command: {command}")
    log.debug(f"Config loaded from: {configpath}")

    if command == "scrape":
        interval = float(config["scraper"].get("interval"))
        thread = start_scheduled_scraping(interval, configpath)
    elif command == "create_db":
        create_db()
    elif command == "scrape_once":
        scrape(configpath)
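
The snippet above starts mid-call inside parse_args; a plausible reconstruction of the omitted parser setup is sketched below. Only the choices list and the use of args.command are taken from the snippet; the argument name "command" and the description are assumptions.

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="dac entry point")  # hypothetical description
    parser.add_argument("command",
                        choices=["create_db", "scrape", "scrape_once"])
    args = parser.parse_args()
    ...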
Code example #19
File: app.py  Project: IdahK/web_scraping
def index():
    return jsonify(Elements = scrape())
Code example #20
def scrape():

    mars = mongo.db.mars
    mars_data = scraping.scrape()
    mars.replace_one({}, mars_data, upsert=True)
    return "Complete"
Code example #21
File: app.py  Project: vineet23/scrapeBL
def index():
    return jsonify(Cars=scrape())
Code example #22
def index():
    return jsonify(Products=scrape())
Code example #23
import glob
import json
import urllib.parse
import scraping
import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    values = []
    for filename in glob.glob('./data/wikipedia/*.html'):
        with open(filename) as fin:
            html = fin.read()
            text, title = scraping.scrape(html)
            print('scraped:', title)
            # a single '{}' placeholder is needed; '{{}}' would render a literal '{}'
            url = 'https://ja.wikipedia.org/wiki/{}'.format(
                urllib.parse.quote(title))
            values.append((text, json.dumps({'url': url, 'title': title})))
    datastore.load(values)

    print(list(datastore.get_all_ids(limit=-1)))
    datastore.close()
Code example #24
import sys
import pymongo
import scraping
import config

if __name__ == "__main__":

    uri = "mongodb://" + config.user + ":" + \
        config.password + "@ds243441.mlab.com:43441/gofundme"

    client = pymongo.MongoClient(uri)
    db = client.get_default_database()
    campaigns = db['campaigns']
    scraped_data = scraping.scrape()
    campaigns.insert_many(scraped_data)

    client.close()
Code example #25
import math
import scraping
import spotify_api
import spotipy
import sys

if len(sys.argv) > 3:
    username = sys.argv[1]
    playlist_name = sys.argv[2]
    apple_url = sys.argv[3]
else:
    print("Usage: %s username playlist_name apple_url" % (sys.argv[0],))
    sys.exit()

song_list = scraping.scrape(apple_url)

# for song in song_list:
# 	print song

token = spotify_api.authenticate(username)

tracks = []
failed = []

if token:
    sp = spotipy.Spotify(auth=token)
    sp.trace = False
    new_playlist = sp.user_playlist_create(username,
                                           playlist_name,
                                           public=False)
    if new_playlist:
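
The example is cut off after "if new_playlist:"; below is a hedged sketch of how it might continue, looking each song up on Spotify and adding the matches to the new playlist. The song_list query format and the 100-track batching (a likely reason for the math import) are assumptions, not part of the original snippet.

    for song in song_list:
        result = sp.search(q=song, type='track', limit=1)
        items = result['tracks']['items']
        if items:
            tracks.append(items[0]['id'])
        else:
            failed.append(song)

    # Spotify caps a single add at 100 tracks, hence the batching
    for i in range(int(math.ceil(len(tracks) / 100.0))):
        sp.user_playlist_add_tracks(username, new_playlist['id'],
                                    tracks[i * 100:(i + 1) * 100])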
Code example #26
File: app.py  Project: bopopescu/Scrapy-1
def index():
    return jsonify(Jobs=scrape())
Code example #27
File: api.py  Project: hugaba/api_sentiment_analysis
def graphs():
    initial_time = datetime.now()
    # 1. Get infos for scraping
    # Mandatory argument : Category
    category = request.args.get('category')

    # Optional argument
    # number of site to scrape per category, default 5 (0 for max)
    if request.args.get('num_of_site'):
        num_of_site = int(request.args.get('num_of_site'))
    else:
        num_of_site = 5
    # number of page to scrape per site, default 2 (0 for max)
    if request.args.get('num_page'):
        num_page = int(request.args.get('num_page'))
    else:
        num_page = 2
    # city where the scraping is desired (better with department number)
    if request.args.get('location'):
        location = request.args.get('location')
    else:
        location = 'no city'
    # model to use for scraping (one option 'camembert', else default model)
    model_to_test = ''
    if request.args.get('model'):
        model_to_test = 'camembert'

    print('\n', '#'*50)
    print(f' Start Analyse on {category} '.center(50, '#'))
    print('#'*50, '\n')

    # 2. Scrape trustpilot to get dataframe
    init_time = datetime.now()
    print(' Start scraping '.center(30, '#'))
    refs, df = scraping.scrape(category, location, num_of_site, num_page)
    time_elapsed = datetime.now() - init_time
    print(f'Scraping time : {time_elapsed}')
    if len(df) > 0:
        # 3. Preprocess dataframe before prediction
        init_time = datetime.now()
        print(' Start preprocess '.center(30, '#'))
        df = process.preprocess_df(df)
        time_elapsed = datetime.now() - init_time
        print(f'Preprocess time : {time_elapsed}')

        # 4. Predict sentiment and add it to dataframe
        init_time = datetime.now()
        print(' Start prediction '.center(30, '#'))
        if model_to_test == 'camembert':
            df = model.predict_camembert(df)
        else:
            df = model.predict(df)
        time_elapsed = datetime.now() - init_time
        print(f'Prediction time : {time_elapsed}')

        # 5. Apply postprocess to transform data into json
        init_time = datetime.now()
        print(' Start postprocess '.center(30, '#'))
        json_review = process.postprocess(df, refs)
        time_elapsed = datetime.now() - init_time
        print(f'Postprocess time : {time_elapsed}')
    else:
        print("No data found")
        json_review = "<h1>Pas de données</h1>"
    time_elapsed = datetime.now() - initial_time
    print(f'Total time elapsed : {time_elapsed}')
    return json_review
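
A hedged example of how the endpoint above might be called, assuming graphs() is registered at a /graphs route (the route decorator is not shown in the snippet) and the API runs locally:

import requests

# category is mandatory; the other query parameters fall back to the defaults handled above
resp = requests.get("http://localhost:5000/graphs", params={
    "category": "restaurants",   # hypothetical category value
    "num_of_site": 3,            # sites to scrape per category (default 5, 0 for max)
    "num_page": 1,               # pages per site (default 2, 0 for max)
    "location": "Paris",         # optional city filter
    "model": "camembert",        # presence of the model argument selects the camembert model
})
print(resp.text)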
Code example #28
File: app.py  Project: vineet23/scrapeBL
def index():
    return jsonify(Dogs=scrape())
Code example #29
def topGainLose():
    return jsonify(Stocks=scrape())
Code example #30
File: helloSoup.py  Project: ccahill1117/webmaps
def index():
    data = request.json
    print(data)
    return jsonify(Links=scrape(data))