def main():
    url = "https://news.ycombinator.com/"
    savePath = "/home/term1nal/Documents/HackerNews/"
    scraper = scrape.Scrape(url, savePath)
    # Executes the scrape
    scraper.hackScraper()
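# A minimal, hypothetical sketch of the Scrape class the example above assumes:
# a (url, savePath) constructor plus a hackScraper() method. The real class is
# not shown in these snippets; this version uses requests and BeautifulSoup,
# and the CSS selector is only an assumption about the Hacker News markup.
import os

import requests
from bs4 import BeautifulSoup


class Scrape:
    def __init__(self, url, savePath):
        self.url = url
        self.savePath = savePath

    def hackScraper(self):
        # Fetch the front page and write the story titles to a text file.
        response = requests.get(self.url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        titles = [a.get_text() for a in soup.select(".titleline > a")]
        os.makedirs(self.savePath, exist_ok=True)
        with open(os.path.join(self.savePath, "titles.txt"), "w", encoding="utf-8") as f:
            f.write("\n".join(titles))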
def __init__(self):
    self.ex = extract.Extract()
    self.sc = scrape.Scrape()
    self.fm = format.Format()
    self.name = 1  # The cases get a number for a name, starting with 1
    self.allowed = [
        "09", "08", "07", "06", "05", "04", "03",
        "02", "01", "00", "99", "98", "97", "96"
    ]
def get_flipkart_reviews(phones):
    scrape_obj = scrape.Scrape()
    fields = FLIPKART_REQUEST_FIELDS
    for phone in phones:
        fields['productId'] = phone[3]
        page = scrape_obj.scrape(FLIPKART_REVIEW_URL, fields, HEADERS)
        print('Fetching flipkart reviews for %s' % phone[0])
        if page.status == 200:
            extracted_reviews = scrape_obj.extract_flipkart_reviews(page)
            if not extracted_reviews:
                break
            mongo.update_phone_reviews(extracted_reviews, phone[0])
def main():
    CONFIG = config.init_config('application.ini')
    aws_access_key = CONFIG.get('aws', 'AWS_ACCESS_KEY')
    aws_secret_key = CONFIG.get('aws', 'AWS_SECRET_KEY')
    s3_bucket = CONFIG.get('aws', 'S3_BUCKET')
    tumblr_api_key = CONFIG.get('tumblr', 'API_KEY')
    refresh_period = CONFIG.getfloat('tumblr', 'REFRESH_PERIOD')
    tags = 'instagram'
    scraper = scrape.Scrape(aws_access_key, aws_secret_key, s3_bucket,
                            tumblr_api_key, tags, refresh_period=refresh_period)
    scraper.start()
def get_amazon_reviews(phones):
    scrape_obj = scrape.Scrape()
    for phone in phones:
        count = 1
        page = scrape_obj.scrape(
            AMAZON_BASE_URL + phone[2] + AMAZON_REVIEW_URL_1 + str(count), '', '')
        print('Fetching amazon reviews for %s' % phone[0])
        while page.status == 200:
            extracted_reviews = scrape_obj.extract_amazon_reviews(page)
            if not extracted_reviews:
                break
            mongo.update_phone_reviews(extracted_reviews, phone[0])
            count += 1
            page = scrape_obj.scrape(
                AMAZON_BASE_URL + phone[2] + AMAZON_REVIEW_URL_1 + str(count), '', '')
def main():
    phones = fetch_phone_list()
    scrape_obj = scrape.Scrape()
    print('Fetching phone specs from gsm arena : ')
    # TODO remove
    f = open(SPECS_STORAGE_PATH + SPECS_FILE_NAME, 'w', encoding='utf-8')
    for phone in phones:
        print('Fetching %s phone specs from gsm arena' % phone[0])
        page = scrape_obj.scrape(GSM_ARENA_BASE_URL + phone[1], '', '')
        specs = scrape_obj.extract_phone_specs(page)
        # TODO update db
        mongo.insert_phone_specs(specs)
        f.write(str(specs) + "\n")
    # TODO remove
    f.close()
def main():
    pp = pprint.PrettyPrinter()
    spreadsheet_object = spreadsheet.Spreadsheet("stats.json")
    scrape_object = scrape.Scrape()
    name = input("Name of hero: ").capitalize()
    heroes_data = scrape_object.scrape_hero(name)
    heroes_data = scrape_object.data_list(heroes_data, scrape_object.hero_name)
    heroes_data = scrape_object.list_into_dict(heroes_data)
    boon = input("Boon(default = neutral): ")
    bane = input("bane(default = neutral): ")
    stats = spreadsheet_object.hero_stats(heroes_data, boon, bane)
    list_stats = list(stats.values())
    list_stats.insert(0, 5)  # Hard-code rarity 5 into data
    list_stats.insert(1, scrape_object.hero_name)
    list_stats.append(boon.upper())
    list_stats.append(bane.upper())
    pp.pprint(list_stats)
    spreadsheet_object.insert_hero(list_stats)
def index(self):
    if 'doi' in request.args:
        global doi_s
        doi_s = str(request.args['doi'])
    else:
        return 'Error:'
    global out_db
    if doi_s == 'all':
        out_db = database.read_all()
        return redirect(url_for('ApiView:display_all'))
    try:
        doi.validate_doi(doi_s)
        domain = doi.get_real_url_from_doi(doi_s)
    except ValueError:
        return 'Invalid doi'
    doi_temp = database.check([doi_s])
    if doi_temp:
        scrap = scrape.Scrape()
        scrap.scrape([domain], current_app.config['DICT_OF_SPIDERS'])
    out_db = database.read([doi_s])
    return redirect(url_for('ApiView:display_all'))
def extract_case(self, html, all, year):
    """
    Extracts all the judgements from the case.
    Returns False if the case is not accessible.
    """
    sc = scrape.Scrape()
    raw_html = sc.simple_get(html)
    if year == "07" or year == "06":
        # 06 and 07 formatting requires special processing
        text, end = self.extract_body(raw_html, True)
    else:
        text, end = self.extract_body(raw_html, False)
    if text is None:
        return False
    else:
        all += text
    if end is not None:
        # Follow the continuation page and keep extracting recursively
        html = self.next_html(html)
        self.extract_case(html, all, year)
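# simple_get() is not defined in these snippets; the Extract examples assume it
# returns the raw HTML of a page, or None when the page cannot be fetched.
# A minimal sketch of such a helper, assuming requests is available (the real
# Scrape.simple_get may differ):
import requests


def simple_get(url):
    """Return the page body as bytes, or None if the request fails."""
    try:
        response = requests.get(url, timeout=10)
        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "html" in content_type:
            return response.content
        return None
    except requests.RequestException:
        return None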
def search_doi(self):
    global out_db, doi_s
    list_doi = []
    if request.method == 'POST':
        if 'doi' in request.form:
            list_doi = request.form['doi'].split(',')
        if 'file' in request.files:
            file = request.files['file']
            if file and self.allowed_file(file.filename):
                filename = secure_filename(file.filename)
                extension = file.filename.rsplit('.', 1)[1].lower()
                path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
                file.save(path)
                list_doi = self.upload_contents(extension, path)
            else:
                flash('Please upload only csv and json formats')
        list_doi = list(dict.fromkeys(list_doi))
        doi_s = list_doi
        domain = {}
        for i in list_doi:
            try:
                doi.validate_doi(i)
                domain[i] = doi.get_real_url_from_doi(i)
            except ValueError:
                flash(f'{i} : is not valid , please try again')
                doi_s.remove(i)
        if doi_s is None:
            return redirect(url_for('DOIView:index'))
        doi_temp = database.check(doi_s)
        if doi_temp:
            doi_ = doi_temp
            domains = [domain[i] for i in doi_ if i in domain]
            doi_temp.clear()
            scrap = scrape.Scrape()
            success = scrap.scrape(domains, app.config['DICT_OF_SPIDERS'])
            if success:
                for i in success:
                    print('i in succscc', i)
        out_db = database.read(doi_s)
    return render_template("search/search_doi.html", context=out_db)
def make_tweet(exclamations_file, adverbs_file, adjectives_file):
    s = scrape.Scrape()
    foodList_ldc = s.getDataLDC()[3]
    ## print(foodList_ldc)
    foodList_burt = s.getDataBurt()[3]
    ## print(foodList_burt)
    ex_file = open(exclamations_file)
    adv_file = open(adverbs_file)
    adj_file = open(adjectives_file)
    exclamations = ex_file.read().split(", ")
    adverbs = adv_file.read().split(", ")
    adjectives = adj_file.read().split(", ")
    # Pick a random food entry and random phrase parts for each dining hall
    food_index_ldc = random.randint(0, len(foodList_ldc) - 1)
    food_index_burt = random.randint(0, len(foodList_burt) - 1)
    ex_index_ldc = random.randint(0, len(exclamations) - 1)
    ex_index_burt = random.randint(0, len(exclamations) - 1)
    adv_index_ldc = random.randint(0, len(adverbs) - 1)
    adv_index_burt = random.randint(0, len(adverbs) - 1)
    adj_index_ldc = random.randint(0, len(adjectives) - 1)
    adj_index_burt = random.randint(0, len(adjectives) - 1)
    food_name_ldc = foodList_ldc[food_index_ldc][0].replace("\\", "")
    food_name_burt = foodList_burt[food_index_burt][0].replace("\\", "")
    station_name_ldc = foodList_ldc[food_index_ldc][1].replace(" ", "")
    station_name_burt = foodList_burt[food_index_burt][1].replace(" ", "")
    exclamation_ldc = exclamations[ex_index_ldc]
    exclamation_burt = exclamations[ex_index_burt]
    adverb_ldc = adverbs[adv_index_ldc]
    adverb_burt = adverbs[adv_index_burt]
    adjective_ldc = adjectives[adj_index_ldc]
    adjective_burt = adjectives[adj_index_burt]
    food_ldc = sanitize_food(food_name_ldc)
    food_burt = sanitize_food(food_name_burt)
    # Use "are" for plural food names, "is" otherwise
    article_ldc = " are " if food_ldc[-1] == "s" else " is "
    article_burt = " are " if food_burt[-1] == "s" else " is "
    tweet_ldc = (exclamation_ldc + "! I hear LDC's " + food_ldc + article_ldc +
                 adverb_ldc + " " + adjective_ldc + ". #" + station_name_ldc + " #LDC\n")
    tweet_burt = (exclamation_burt + "! I hear Burton's " + food_burt + article_burt +
                  adverb_burt + " " + adjective_burt + ". #" + station_name_burt +
                  " #WestSideBestSide\n")
    f = open("tweet.txt", 'w')
    f.write(tweet_ldc)
    f.write(tweet_burt)
    f.close()
    ex_file.close()
    adv_file.close()
    adj_file.close()
from dotenv import load_dotenv

import scrape
import db_insert

load_dotenv()

my_scrape = scrape.Scrape()
df = my_scrape.scrape()
# print(df.info())
db_insert.insert_df(df)
def main():
    scrape_object = scrape.Scrape()
    hero_name = input("Name of hero: ")
    data = scrape_object.scrape_hero(hero_name)
    data = scrape_object.data_list(data, scrape_object.hero_name)
    data = scrape_object.list_into_dict(data)
async def get_infomation():
    ret_str = ""
    for url, path in zip(urls, output_ID_path):
        scr = scrape.Scrape(url, path)
        ret_str += scr.get_elems_string()
    return ret_str
    else:
        text, end = self.extract_body(raw_html, False)
    if text is None:
        return False
    else:
        all += text
    if end is not None:
        html = self.next_html(html)
        self.extract_case(html, all, year)


if __name__ == "__main__":
    ex = Extract()
    sc = scrape.Scrape()
    error1 = "ERROR: Wrong number of links."
    error2 = "ERROR: Wrong links."
    print("Testing Extract:")

    # Testing extraction of links
    raw_html = sc.simple_get(
        "https://publications.parliament.uk/pa/ld/ldjudgmt.htm")
    links = ex.extract_links(raw_html)
    assert (len(links) == 835), error1
    assert (links[-1] == "http://www.parliament.uk/site-information/copyright/"), error2

    # Testing filtering of links