def main():
    url = "https://news.ycombinator.com/"
    savePath = "/home/term1nal/Documents/HackerNews/"
    scraper = scrape.Scrape(url, savePath)
    # Executes the scrape
    scraper.hackScraper()
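# A minimal, hypothetical sketch of the Scrape class the example above assumes:
# a (url, savePath) constructor plus a hackScraper() method. The real class is
# not shown in these snippets; this version uses requests and BeautifulSoup,
# and the CSS selector is only an assumption about the Hacker News markup.
import os

import requests
from bs4 import BeautifulSoup


class Scrape:
    def __init__(self, url, savePath):
        self.url = url
        self.savePath = savePath

    def hackScraper(self):
        # Fetch the front page and write the story titles to a text file.
        response = requests.get(self.url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        titles = [a.get_text() for a in soup.select(".titleline > a")]
        os.makedirs(self.savePath, exist_ok=True)
        with open(os.path.join(self.savePath, "titles.txt"), "w", encoding="utf-8") as f:
            f.write("\n".join(titles))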
def __init__(self):
    self.ex = extract.Extract()
    self.sc = scrape.Scrape()
    self.fm = format.Format()
    self.name = 1  # The cases get a number for a name, starting with 1
    self.allowed = [
        "09", "08", "07", "06", "05", "04", "03",
        "02", "01", "00", "99", "98", "97", "96"
    ]
def get_flipkart_reviews(phones):
    scrape_obj = scrape.Scrape()
    fields = FLIPKART_REQUEST_FIELDS
    for phone in phones:
        fields['productId'] = phone[3]
        page = scrape_obj.scrape(FLIPKART_REVIEW_URL, fields, HEADERS)
        print('Fetching flipkart reviews for %s' % phone[0])
        if page.status == 200:
            extracted_reviews = scrape_obj.extract_flipkart_reviews(page)
            if not extracted_reviews:
                break
            mongo.update_phone_reviews(extracted_reviews, phone[0])
def main():
    CONFIG = config.init_config('application.ini')
    aws_access_key = CONFIG.get('aws', 'AWS_ACCESS_KEY')
    aws_secret_key = CONFIG.get('aws', 'AWS_SECRET_KEY')
    s3_bucket = CONFIG.get('aws', 'S3_BUCKET')
    tumblr_api_key = CONFIG.get('tumblr', 'API_KEY')
    refresh_period = CONFIG.getfloat('tumblr', 'REFRESH_PERIOD')
    tags = 'instagram'
    scraper = scrape.Scrape(aws_access_key, aws_secret_key, s3_bucket,
                            tumblr_api_key, tags, refresh_period=refresh_period)
    scraper.start()
def get_amazon_reviews(phones):
    scrape_obj = scrape.Scrape()
    for phone in phones:
        count = 1
        page = scrape_obj.scrape(
            AMAZON_BASE_URL + phone[2] + AMAZON_REVIEW_URL_1 + str(count), '', '')
        print('Fetching amazon reviews for %s' % phone[0])
        while page.status == 200:
            extracted_reviews = scrape_obj.extract_amazon_reviews(page)
            if not extracted_reviews:
                break
            mongo.update_phone_reviews(extracted_reviews, phone[0])
            count += 1
            page = scrape_obj.scrape(
                AMAZON_BASE_URL + phone[2] + AMAZON_REVIEW_URL_1 + str(count), '', '')
def main():
    phones = fetch_phone_list()
    scrape_obj = scrape.Scrape()
    print('Fetching phone specs from gsm arena : ')
    # TODO remove
    f = open(SPECS_STORAGE_PATH + SPECS_FILE_NAME, 'w', encoding='utf-8')
    for phone in phones:
        print('Fetching %s phone specs from gsm arena' % phone[0])
        page = scrape_obj.scrape(GSM_ARENA_BASE_URL + phone[1], '', '')
        specs = scrape_obj.extract_phone_specs(page)
        # TODO update db
        mongo.insert_phone_specs(specs)
        f.write(str(specs) + "\n")
    # TODO remove
    f.close()
def main():
    pp = pprint.PrettyPrinter()
    spreadsheet_object = spreadsheet.Spreadsheet("stats.json")
    scrape_object = scrape.Scrape()
    name = input("Name of hero: ").capitalize()
    heroes_data = scrape_object.scrape_hero(name)
    heroes_data = scrape_object.data_list(heroes_data, scrape_object.hero_name)
    heroes_data = scrape_object.list_into_dict(heroes_data)
    boon = input("Boon(default = neutral): ")
    bane = input("bane(default = neutral): ")
    stats = spreadsheet_object.hero_stats(heroes_data, boon, bane)
    list_stats = list(stats.values())
    list_stats.insert(0, 5)  # Hard-code rarity 5 into data
    list_stats.insert(1, scrape_object.hero_name)
    list_stats.append(boon.upper())
    list_stats.append(bane.upper())
    pp.pprint(list_stats)
    spreadsheet_object.insert_hero(list_stats)
def index(self):
    if 'doi' in request.args:
        global doi_s
        doi_s = str(request.args['doi'])
    else:
        return 'Error:'
    global out_db
    if doi_s == 'all':
        out_db = database.read_all()
        return redirect(url_for('ApiView:display_all'))
    try:
        doi.validate_doi(doi_s)
        domain = doi.get_real_url_from_doi(doi_s)
    except ValueError:
        return 'Invalid doi'
    doi_temp = database.check([doi_s])
    if doi_temp:
        scrap = scrape.Scrape()
        scrap.scrape([domain], current_app.config['DICT_OF_SPIDERS'])
    out_db = database.read([doi_s])
    return redirect(url_for('ApiView:display_all'))
def extract_case(self, html, all, year):
    """
    Extracts all the judgements from the case.
    Returns False if the case is not accessible.
    """
    sc = scrape.Scrape()
    raw_html = sc.simple_get(html)
    if year == "07" or year == "06":
        # 06 and 07 formatting requires special processing
        text, end = self.extract_body(raw_html, True)
    else:
        text, end = self.extract_body(raw_html, False)
    if text is None:
        return False
    else:
        all += text
    if end is not None:
        # Follow the continuation page and keep extracting recursively
        html = self.next_html(html)
        self.extract_case(html, all, year)
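# simple_get() is not defined in these snippets; the Extract examples assume it
# returns the raw HTML of a page, or None when the page cannot be fetched.
# A minimal sketch of such a helper, assuming requests is available (the real
# Scrape.simple_get may differ):
import requests


def simple_get(url):
    """Return the page body as bytes, or None if the request fails."""
    try:
        response = requests.get(url, timeout=10)
        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "html" in content_type:
            return response.content
        return None
    except requests.RequestException:
        return None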
def search_doi(self):
    global out_db, doi_s
    list_doi = []
    if request.method == 'POST':
        if 'doi' in request.form:
            list_doi = request.form['doi'].split(',')
        if 'file' in request.files:
            file = request.files['file']
            if file and self.allowed_file(file.filename):
                filename = secure_filename(file.filename)
                extension = file.filename.rsplit('.', 1)[1].lower()
                path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
                file.save(path)
                list_doi = self.upload_contents(extension, path)
            else:
                flash('Please upload only csv and json formats')
        list_doi = list(dict.fromkeys(list_doi))
        doi_s = list_doi
        domain = {}
        for i in list_doi:
            try:
                doi.validate_doi(i)
                domain[i] = doi.get_real_url_from_doi(i)
            except ValueError:
                flash(f'{i} : is not valid , please try again')
                doi_s.remove(i)
        if doi_s is None:
            return redirect(url_for('DOIView:index'))
        doi_temp = database.check(doi_s)
        if doi_temp:
            doi_ = doi_temp
            domains = [domain[i] for i in doi_ if i in domain]
            doi_temp.clear()
            scrap = scrape.Scrape()
            success = scrap.scrape(domains, app.config['DICT_OF_SPIDERS'])
            if success:
                for i in success:
                    print('i in succscc', i)
        out_db = database.read(doi_s)
    return render_template("search/search_doi.html", context=out_db)
def make_tweet(exclamations_file, adverbs_file, adjectives_file):
    s = scrape.Scrape()
    foodList_ldc = s.getDataLDC()[3]
    ## print(foodList_ldc)
    foodList_burt = s.getDataBurt()[3]
    ## print(foodList_burt)
    ex_file = open(exclamations_file)
    adv_file = open(adverbs_file)
    adj_file = open(adjectives_file)
    exclamations = ex_file.read().split(", ")
    adverbs = adv_file.read().split(", ")
    adjectives = adj_file.read().split(", ")
    # Pick a random food entry and random phrase parts for each dining hall
    food_index_ldc = random.randint(0, len(foodList_ldc) - 1)
    food_index_burt = random.randint(0, len(foodList_burt) - 1)
    ex_index_ldc = random.randint(0, len(exclamations) - 1)
    ex_index_burt = random.randint(0, len(exclamations) - 1)
    adv_index_ldc = random.randint(0, len(adverbs) - 1)
    adv_index_burt = random.randint(0, len(adverbs) - 1)
    adj_index_ldc = random.randint(0, len(adjectives) - 1)
    adj_index_burt = random.randint(0, len(adjectives) - 1)
    food_name_ldc = foodList_ldc[food_index_ldc][0].replace("\\", "")
    food_name_burt = foodList_burt[food_index_burt][0].replace("\\", "")
    station_name_ldc = foodList_ldc[food_index_ldc][1].replace(" ", "")
    station_name_burt = foodList_burt[food_index_burt][1].replace(" ", "")
    exclamation_ldc = exclamations[ex_index_ldc]
    exclamation_burt = exclamations[ex_index_burt]
    adverb_ldc = adverbs[adv_index_ldc]
    adverb_burt = adverbs[adv_index_burt]
    adjective_ldc = adjectives[adj_index_ldc]
    adjective_burt = adjectives[adj_index_burt]
    food_ldc = sanitize_food(food_name_ldc)
    food_burt = sanitize_food(food_name_burt)
    # Use "are" for plural food names, "is" otherwise
    article_ldc = " are " if food_ldc[-1] == "s" else " is "
    article_burt = " are " if food_burt[-1] == "s" else " is "
    tweet_ldc = (exclamation_ldc + "! I hear LDC's " + food_ldc + article_ldc +
                 adverb_ldc + " " + adjective_ldc + ". #" + station_name_ldc + " #LDC\n")
    tweet_burt = (exclamation_burt + "! I hear Burton's " + food_burt + article_burt +
                  adverb_burt + " " + adjective_burt + ". #" + station_name_burt +
                  " #WestSideBestSide\n")
    f = open("tweet.txt", 'w')
    f.write(tweet_ldc)
    f.write(tweet_burt)
    f.close()
    ex_file.close()
    adv_file.close()
    adj_file.close()
from dotenv import load_dotenv

import scrape
import db_insert

load_dotenv()

my_scrape = scrape.Scrape()
df = my_scrape.scrape()
# print(df.info())
db_insert.insert_df(df)
def main():
    scrape_object = scrape.Scrape()
    hero_name = input("Name of hero: ")
    data = scrape_object.scrape_hero(hero_name)
    data = scrape_object.data_list(data, scrape_object.hero_name)
    data = scrape_object.list_into_dict(data)
async def get_infomation():
    ret_str = ""
    for url, path in zip(urls, output_ID_path):
        scr = scrape.Scrape(url, path)
        ret_str += scr.get_elems_string()
    return ret_str
    else:
        text, end = self.extract_body(raw_html, False)
    if text is None:
        return False
    else:
        all += text
    if end is not None:
        html = self.next_html(html)
        self.extract_case(html, all, year)


if __name__ == "__main__":
    ex = Extract()
    sc = scrape.Scrape()
    error1 = "ERROR: Wrong number of links."
    error2 = "ERROR: Wrong links."
    print("Testing Extract:")

    # Testing extraction of links
    raw_html = sc.simple_get(
        "https://publications.parliament.uk/pa/ld/ldjudgmt.htm")
    links = ex.extract_links(raw_html)
    assert (len(links) == 835), error1
    assert (links[-1] == "http://www.parliament.uk/site-information/copyright/"), error2

    # Testing filtering of links