def test_pretty_val_Equal(tester, expected_type, v, expected_v):
    pretty_v = Scraper.pretty_val_st(v, expected_type)
    tester.assertEqual(pretty_v, expected_v)
    if pretty_v is None:
        pass
    elif expected_type == date:
        tester.assertIsInstance(pretty_v, str)
    else:
        tester.assertIsInstance(pretty_v, expected_type)
def test_get_perustiedot_Controll(tester, company_id, part_of_expected_perustiedot):
    type_dict = {
        "porssi": str,
        "listattu": str,
        "kaupankayntitunnus": str,
        "isin-koodi": str,
        "toimialaluokka": str,
        "nimellisarvo": str,
        "kaupankaynti_valuutta": str,
        "toimiala": str,
        "markkina-arvo": float,
        "osakkeet_kpl": int
    }
    scraper = Scraper(company_id)
    perustiedot = scraper.get_perustiedot()
    tester.assertEqual(len(perustiedot), 10)
    for key in perustiedot:
        if perustiedot[key] is not None:
            tester.assertIsInstance(perustiedot[key], type_dict[key])
            if type_dict[key] == str:
                tester.assertEqual(perustiedot[key], part_of_expected_perustiedot[key])
def test_get_osinko_Controll(tester, company_id, one_expected_osinko):
    type_dict = {
        "vuosi": int,
        "irtoaminen": str,
        "oikaistu_euroina": float,
        "maara": float,
        "valuutta": str,
        "tuotto_%": float,
        "lisatieto": str
    }
    scraper = Scraper(company_id)
    osingot = scraper.get_osingot()
    matches = 0
    for top_key in osingot:
        tester.assertIsInstance(top_key, str)
        tester.assertEqual(len(osingot[top_key]), 7)
        if osingot[top_key]["irtoaminen"] == one_expected_osinko["irtoaminen"] \
           and osingot[top_key]["maara"] == one_expected_osinko["maara"]:
            tester.assertDictEqual(osingot[top_key], one_expected_osinko)
            matches += 1
        for key in osingot[top_key]:
            if osingot[top_key][key] is not None:
                tester.assertIsInstance(osingot[top_key][key], type_dict[key])
    tester.assertEqual(matches, 1)
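# A minimal sketch (not from the original suite) showing how the parameterised
# helpers above are meant to be driven from a unittest.TestCase: the test passes
# itself as `tester` so the helper can call its assert* methods. The company id
# "example-co" and every expected value below are hypothetical placeholders.
import unittest


class OsinkoHelperExample(unittest.TestCase):

    def test_one_company(self):
        expected_osinko = {
            "vuosi": 2020,             # placeholder year
            "irtoaminen": "01.01.20",  # placeholder ex-dividend date
            "oikaistu_euroina": 0.5,   # placeholder adjusted amount
            "maara": 0.5,              # placeholder amount
            "valuutta": "EUR",         # placeholder currency
            "tuotto_%": 3.0,           # placeholder yield
            "lisatieto": None,         # placeholder note
        }
        test_get_osinko_Controll(self, "example-co", expected_osinko)


if __name__ == "__main__":
    unittest.main()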
def urun_bilgileri():
    if request.method == "POST":
        global category
        category = request.form.get("category_selected")
        # urunler_df = pd.read_csv("Tüm_Ürünler.csv")
        # urunler_df.fillna("-", inplace=True)
        result_list = Scraper.getAll(category=category)
        urunler_df = result_list[1]
        urunler_df.fillna("-", inplace=True)
        global urunler_df_list
        urunler_df_list = result_list[0]
        return render_template("urun-bilgileri.html",
                               category=category,
                               urunler_df=urunler_df)
    else:
        # Send GET requests back to the home page
        return redirect(url_for("home"))
class Application:
    """ Main window of the program. """

    window_width = 1000
    window_height = 600
    border_width = 5
    border_color = 'black'

    __slots__ = (
        'root',
        'left_frame',
        'url_label',
        'url_entry',
        'check_button',
        'url_check_label',
        'start_dl_button',
        'mid_frame',
        'url_tracking_label',
        'url_tracking_text',
        'right_frame',
        'log_text',
        'bottom_frame',
        'download_tracking_label',
        'download_tracking_bar',
        'scraper',
        'driver',
        'login',
        'ig_url_re',
        'ig_profile_url_re',
        'general_img_re',
        'imgur_re',
        'youtube_re',
        'yt_re',
        'reddit_re',
        'reddit_fallback_re',
        'gfycat_re',
        'tumblr_re',
        'twitter_re',
        'exprs',
    )

    def __init__(self, root):
        self.root = root

        self.left_frame = tk.Frame()
        self.url_label = tk.Label()
        self.url_entry = tk.Entry()
        self.check_button = tk.Button()
        self.url_check_label = tk.Label()
        self.start_dl_button = tk.Button()
        self.setup_left_frame()

        self.mid_frame = tk.Frame()
        self.url_tracking_label = tk.Label()
        self.url_tracking_text = ScrollText()
        self.setup_mid_frame()

        self.right_frame = tk.Frame()
        self.log_text = ScrollText()
        self.setup_right_frame()

        self.bottom_frame = tk.Frame()
        self.download_tracking_label = tk.Label()
        self.download_tracking_bar = ttk.Progressbar()
        self.setup_bottom_frame()

        # Initialise classes here so we can pass the logging widget
        self.scraper = Scraper(self.log_text, self.download_tracking_label)
        self.driver = Driver(self.log_text)
        self.driver.start_driver()  # Start webdriver to be used for scraping
        self.login = None

        # Lots of regexes to check the validity of wanted URLs
        # Make sure only IG posts are specified, not user's pages
        self.ig_url_re = re.compile(r'^https://www\.instagram\.com/p/.+/')
        self.ig_profile_url_re = re.compile(
            r'^https://www\.instagram\.com/(\w+|\d+)/$')
        self.general_img_re = re.compile(
            r'^https?://.+\..+\..+\.(?:jpg|png|gif)')
        self.imgur_re = re.compile(
            r'^https?://imgur\.com/(?:.)+$(?<!(png|gif|jpg))')
        self.youtube_re = re.compile(
            r'https://(?:www\.)?youtube\.com/watch\?v=.+')
        self.yt_re = re.compile(r'https://youtu\.be/.+')
        self.reddit_re = re.compile(
            r'https?://(?:www|old)\.reddit\.com/(?:r|u|user)/(\w+)/.+')
        self.reddit_fallback_re = re.compile(
            r'https://v\.redd\.it/.+\?source=fallback')
        self.gfycat_re = re.compile(r'https://gfycat\.com/\w+$(?<!-)')
        self.tumblr_re = re.compile(
            r'https://(.+)\.tumblr\.com/post/(\d+)(?:/.+)?')
        self.twitter_re = re.compile(r'https://twitter\.com/.+/status/(\d+)')

        # Map URLs to the methods needed to extract the images in them
        # All of these methods take a single argument, the URL/text
        self.exprs = {
            self.ig_url_re: self.process_ig_url,
            self.ig_profile_url_re: self.process_ig_profile_url,
            self.general_img_re: self.process_general_url,
            self.imgur_re: self.process_imgur_url,
            self.youtube_re: self.process_yt_url,
            self.yt_re: self.process_yt_url,
            self.reddit_re: self.process_reddit_url,
            self.reddit_fallback_re: self.process_general_url,
            self.gfycat_re: self.process_gfycat_url,
            self.tumblr_re: self.process_tumblr_url,
            self.twitter_re: self.process_twitter_url,
        }

    def setup_left_frame(self):
        """ Set up the left frame of the application's window. """
""" self.left_frame = tk.Frame( self.root, bg=MID_GREY, width=self.window_width / 3, height=self.window_height - self.border_width * 40, highlightbackground=self.border_color, highlightcolor=self.border_color, highlightthickness=self.border_width, ) self.left_frame.grid(row=0, column=0) self.left_frame.grid_propagate( False) # Keep the frame from automatically resizing self.url_label = tk.Label(self.left_frame, bg=MID_GREY, text='Enter URLs:', font=('Arial', 15, 'bold')) self.url_label.place(relx=0.5, rely=0.3, anchor='center') self.url_entry = tk.Entry( self.left_frame, width=int(self.left_frame.winfo_reqwidth() * 0.1), borderwidth=3, ) self.url_entry.bind( '<Return>', lambda e: threading.Thread(target=self.process_input).start()) self.url_entry.place(relx=0.5, rely=0.4, anchor='center') self.check_button = tk.Button(self.left_frame, text='OK', bg='black', fg='white', activebackground=DARK_GREY, font=('Arial', 12), cursor='hand2') self.check_button.bind( '<ButtonRelease-1>', lambda e: threading.Thread(target=self.process_input).start()) self.check_button.place(relx=0.5, rely=0.5, anchor='center') self.url_check_label = tk.Label(self.left_frame, bg=MID_GREY, font=('Arial', 12)) self.url_check_label.place(relx=0.5, rely=0.6, anchor='center') self.start_dl_button = tk.Button(self.left_frame, text='Start Downloading', bg='black', fg='white', activebackground=DARK_GREY, font=('Arial', 12), cursor='hand2') self.start_dl_button.bind( '<ButtonRelease-1>', lambda e: threading.Thread(target=self.download_files).start()) self.start_dl_button.place(relx=0.5, rely=0.7, anchor='center') def setup_mid_frame(self): """ Set up the middle frame of the application's window. """ self.mid_frame = tk.Frame( self.root, bg=MID_GREY, width=self.window_width / 3, height=self.window_height - self.border_width * 40, highlightbackground=self.border_color, highlightcolor=self.border_color, highlightthickness=self.border_width, ) self.mid_frame.grid(row=0, column=1) self.mid_frame.grid_propagate(False) # Keep the frame border from disappearing by adding weights self.mid_frame.rowconfigure(1, weight=1) self.mid_frame.columnconfigure(0, weight=1) self.url_tracking_label = tk.Label( self.mid_frame, bg=MID_GREY, text='Saved URLs:', font=('Arial', 15, 'bold'), ) self.url_tracking_label.place(relx=0.5, rely=0.04, anchor='center') # Retroactively scale the mid_frame's first row to make space for the tracking label self.mid_frame.rowconfigure( 0, weight=1, minsize=self.url_tracking_label.winfo_reqheight()) self.url_tracking_text = ScrollText( self.mid_frame, bg=MID_GREY, font=('Arial', 10), borderwidth=0, ) self.url_tracking_text.grid(row=1, column=0, sticky='ew') def setup_right_frame(self): """ Set up the right frame of the application's window. """ self.right_frame = tk.Frame( self.root, bg=MID_GREY, width=self.window_width / 3, height=self.window_height - self.border_width * 40, highlightbackground=self.border_color, highlightcolor=self.border_color, highlightthickness=self.border_width, ) self.right_frame.grid(row=0, column=2) self.right_frame.grid_propagate(False) self.right_frame.columnconfigure(0, weight=1) self.right_frame.rowconfigure(0, weight=1) self.log_text = ScrollText( self.right_frame, bg=MID_GREY, font=('Arial', 10), borderwidth=0, ) self.log_text.grid(sticky='nsew') def setup_bottom_frame(self): """ Set up the bottom frame of the application's window. 
""" self.bottom_frame = tk.Frame( self.root, bg=MID_GREY, width=self.window_width, height=self.border_width * 40, highlightbackground=self.border_color, highlightcolor=self.border_color, highlightthickness=self.border_width, ) self.bottom_frame.grid(row=1, column=0, columnspan=3) self.bottom_frame.grid_propagate(False) self.download_tracking_label = tk.Label(self.bottom_frame, bg=MID_GREY, text='Downloaded 0 / 0 files', font=('Arial', 15, 'bold')) self.download_tracking_label.place(relx=0.5, rely=0.2, anchor='center') style = ttk.Style() # ('winnative', 'clam', 'alt', 'default', 'classic', 'vista', 'xpnative') style.theme_use('alt') style.configure('black.Horizontal.TProgressbar', foreground='red', background='black') self.download_tracking_bar = ttk.Progressbar( self.bottom_frame, style='black.Horizontal.TProgressbar', orient='horizontal', length=self.window_width * 0.8, mode='determinate', ) self.download_tracking_bar.place(relx=0.5, rely=0.5, anchor='center') def disable_input_widgets(self): """ Disable the interactive widgets around input and downloading. """ self.url_entry.configure(state='disabled') self.check_button.configure(state='disabled') self.start_dl_button.configure(state='disabled') def enable_input_widgets(self): """ Enable the interactive widgets around input and downloading. """ self.url_entry.configure(state='normal') self.check_button.configure(state='normal') self.start_dl_button.configure(state='normal') def process_input(self): """ Disable the input widgets and check/process the input, then enable the widgets again. """ text = self.url_entry.get().strip() if not text: return # Don't allow more input while current input is being processed self.disable_input_widgets() # Allow pasting multiple links at once, separated by spaces if len(text.split()) > 1: is_input_accepted = False for url in text.split(): is_input_accepted = self.check_url(text=url) # Sleep to not spam APIs time.sleep(0.5) else: is_input_accepted = self.check_url(text=text) self.enable_input_widgets() # Cannot delete text while widget is disabled if is_input_accepted is True: self.url_entry.delete(0, tk.END) def check_url(self, text=None): """ Check the text to see if it fits one of the specified URL regexes. Then process the URL as needed. """ if not text: return False # We only need to track Reddit URLs in JSON format if self.reddit_re.match(text) and not text.endswith('.json'): text += '.json' if not any(regex.match(text) for regex in self.exprs.keys()): self.url_check_label.configure(text='ERR: URL not accepted', fg='red') return False if text in self.scraper.tracking_links + self.scraper.display_links: self.url_check_label.configure(text='WARN: URL already added.', fg='brown') return False # In case a URL gets ctrl+v'd into the entry multiple times if any(link in text for link in self.scraper.tracking_links + self.scraper.display_links): self.url_check_label.configure(text='WARN: URL already added.', fg='brown') return False self.url_check_label.configure(text='OK: URL accepted', fg='black') self.process_url(text) # Signify that the method completed return True def process_url(self, url): """ Get the corresponding extraction method of a URL by matching a regex, then execute the method and update the tracking label. 
""" for regex in self.exprs.keys(): # Guaranteed to happen for at least one regex if regex.match(url): extraction_method = self.exprs[regex] extraction_method(url) break self.scraper.display_links.append(url) self.url_tracking_text.display_these_lines(self.scraper.display_links) self.log_text.newline('URL processing complete') self.log_text.newline('.') def process_general_url(self, url): """ Append a link directly pointing to an image to the lists as no further actions are needed. """ type_ = 'image' if url.startswith('https://v.redd.it/'): type_ = 'video' self.scraper.append_link(url, type_=type_) def process_ig_url(self, url): """ Prepare data and handle extraction of images of Instagram posts. """ self.driver.webdriver.get(url) self.log_text.newline(f'Got URL - {url}') soup = BeautifulSoup(self.driver.webdriver.page_source, features='html.parser') data = self.scraper.get_ig_data(soup) self.log_text.newline('Extracted JSON data') if self.scraper.is_private(data) and self.driver.is_logged_in is False: def show_root(_): """ Needed for the pos arg getting passed with tkinter bindings. """ self.root.deiconify() # self.process_url(url) self.process_ig_url(url) # Not unbinding here would lead to an infinite loop # of calling the above function again and again self.login.unbind('<Destroy>') self.log_text.newline('Login initiated') self.create_login_window() self.root.withdraw() self.login.bind('<Destroy>', show_root) return # Logging for IG links is done inside of this function already self.scraper.extract_ig_images(data) self.scraper.tracking_links.append(url) def process_ig_profile_url(self, url): """ Extract an Instagram user's profile name and get their avatar's URL from instadp.com. """ profile_name = self.ig_profile_url_re.match(url).group(1) instadp_url = f'https://www.instadp.com/fullsize/{profile_name}' self.driver.webdriver.get(instadp_url) self.log_text.newline(f'Got URL - {url}') soup = BeautifulSoup(self.driver.webdriver.page_source, features='html.parser') self.scraper.extract_ig_avatar(soup) def process_imgur_url(self, url): """ Prepare data needed for extracting images from an Imgur link and then actually extract them. """ self.driver.webdriver.get(url) self.log_text.newline(f'Got URL - {url}') soup = BeautifulSoup(self.driver.webdriver.page_source, features='html.parser') self.scraper.extract_imgur_images(soup) def process_yt_url(self, url): """ Simply call the scraper's method to keep the method class uniform here. """ self.scraper.extract_yt_thumbnail(url) def process_reddit_url(self, url): """ Get the JSON data of a Reddit post and extract the video link. NOTE: Video and audio are separated on Reddit, so the audio will be missing. """ self.driver.webdriver.get(url) self.log_text.newline(f'Got URL - {url}') soup = BeautifulSoup(self.driver.webdriver.page_source, features='html.parser') data_str = soup.find_all('pre')[0].text data = json.loads(data_str) post_url = self.scraper.extract_reddit_link(data) # Need to process the URL which a Reddit post points to # ... if it's not a self-post if url.replace('/.json', '/') == post_url: self.log_text.newline('Reddit post is a self-post, aborting') return self.check_url(text=post_url) def process_gfycat_url(self, url): """ Check to see if the entered Gfycat URL is valid. 
""" # Usually I would insist on doing everything with Selenium # But it's so f*****g slow with Gfycat (~5s to .get the URL) # that it's better to use requests -.- # With that being said, the commented out Selenium code does work # self.driver.webdriver.get(url) # self.log_text.newline(f'Got URL - {url}') # # logs = self.driver.webdriver.get_log('browser') # messages = [log['message'] for log in logs] # request_failed = ('Failed to load resource:' # ' the server responded with a status of 404') # # if any(request_failed in message for message in messages): # self.log_text.newline('Invalid response 404 for Gfycat URL') # return res = requests.get(url) self.log_text.newline(f'Got URL - {url}') if res.status_code != 200: self.log_text.newline(f'Unexpected response code' f' ({res.status_code}) for Gfycat URL') return self.scraper.extract_gfycat_video(url) def process_tumblr_url(self, url): """ Complete extra navigation step if necessary. Prep BeautifulSoup to be used in extraction. """ self.driver.webdriver.get(url) self.driver.log_text.newline(f'Got URL - {url}') self.driver.confirm_tumblr_gdpr() # Wait for page to reload while True: soup = BeautifulSoup(self.driver.webdriver.page_source, features='html.parser') if config.tumblr_ascii_logo not in str(soup): break time.sleep(0.2) self.scraper.extract_tumblr_links(soup) def process_twitter_url(self, url): """ Navigate to the Twitter URL and prep BeautifulSoup object. """ self.driver.webdriver.get(url) self.driver.log_text.newline(f'Got URL - {url}') soup = BeautifulSoup(self.driver.webdriver.page_source, features='html.parser') self.scraper.extract_twitter_images(soup) def download_files(self): """ Wrapper to call the scraper's download method, to avoid arg weirdness with tkinter widget bindings. """ if not self.scraper.download_links: return # Disable some widgets to not mess with running downloads self.disable_input_widgets() threading.Thread(target=self.scraper.download_files).start() # Intentionally block here to re-enable widgets only after this returns self.update_widgets() self.enable_input_widgets() def update_widgets(self): """ Update the download tracking widgets while downloading and reset them when the downloads are finished. """ self.download_tracking_bar['maximum'] = len( self.scraper.download_links) last_download = self.scraper.last_download finished_dls = 0 while finished_dls < len(self.scraper.download_links): if self.scraper.last_download != last_download: last_download = self.scraper.last_download finished_dls += 1 self.download_tracking_bar['value'] += 1 self.download_tracking_label.configure( text=f'Downloaded {finished_dls}' f' / {len(self.scraper.download_links)} files') self.scraper.download_links = [] self.scraper.display_links = [] self.download_tracking_bar['value'] = 0 self.download_tracking_label.configure(text='Downloaded 0 / 0 files') self.url_tracking_text.clear_text() self.log_text.newline('Reset tracking widgets') self.log_text.newline('.') def create_login_window(self): """ Create a login window. """ self.login = LoginWindow(self.driver) screen_width = self.root.winfo_screenwidth() screen_height = self.root.winfo_screenheight() window_x = int(screen_width / 2 - self.login.window_width * 0.5) window_y = int(screen_height * 0.25) self.login.title('Login') self.login.geometry( f'{self.login.window_width}x{self.login.window_height}' f'+{window_x}+{window_y}') self.login.resizable(width=False, height=False)
    def scrape_main(self, jb):
        # job = business-analyst
        # city = omaha-ne
        # local = raw_input("Local or Web (L/W): ")
        # job = raw_input("Job title with + as spaces: ")
        # -----need a method to scrub job title --------
        job = jb.split()
        local = "W"
        # city = ""
        # pages = 1

        path1 = settings.MEDIA_ROOT
        # path1 = ''
        path2 = '/jobs_' + jb.replace(" ", "_")
        path3 = '.txt'
        filename = path1 + path2 + path3

        # ===== CHECK IF FILE EXISTS ========
        if os.path.exists(filename):
            file = open(filename, 'a')  # Append to the existing file
        else:
            # ===== CREATE FILE IF IT DOESN'T EXIST ========
            file = open(filename, 'w+')
            header = ('Res' + ';' + "URL" + ';' + "Job_num" + ';' + "Job_Title" + ';'
                      + "Company" + ';' + "Location" + ';' + "End_Date" + ';'
                      + "Duration" + ';' + "Avg_Sal" + ';' + "Company_Prestige" + ';'
                      + "Work_Description" + '\n')
            file.write(header)

        # ===== GRAB LIST OF RESUMES FOR INPUTS ABOVE ======
        # if job[1]:
        #     file.write(job[0] + "---" + job[(len(job)-1)] + jb)
        #     resume_links = Links.objects.filter(job_name__icontains=job[1])
        # else:
        resume_links = Links.objects.filter(job_name__icontains=jb.replace(" ", "-"))

        # ======= GRAB SPECIFIC RESUME BY HYPERLINK AND SCRAPE DATA ============
        pers = 0
        threadlist = []
        s2 = Scraper()
        for res in resume_links:
            print('Resume' + res.job_url)
            pers = pers + 1

            # ---------- THREADING CODE -----------------
            # --- so that we dont kick off more than 10 threads ------
            if pers % 10 == 0:
                time.sleep(1)
            try:
                t = Thread(target=s2.person, args=(res.job_url, pers, file))
                t.start()
                threadlist.append(t)
            except Exception:
                time.sleep(1)
                try:
                    t = Thread(target=s2.person, args=(res.job_url, pers, file))
                    t.start()
                    threadlist.append(t)
                except Exception:
                    print("this person didnt work")

        # -- rejoining the threads -----
        for b in threadlist:
            b.join()
        file.close()

        # -------- DO PLOTTING -----------
        job_cluster = []
        if local == "L":
            print("local")
            # plots(filename) - ADD FOR LOCAL - https://bitbucket.org/njs/rnumpy/wiki/API
        elif local == "W":
            r = Rcode()
            r_data = r.rots(filename, jb)
            med = r_data['median']
            for i in range(1, (med + 2)):
                print("-----------JOB " + str(i) + " ----------------")
                plots = r_data['plot_files'][i - 1]
                # plot_listing.append(plots)
                job_list = []
                for j in range(1, 4):
                    print('--cluster ' + str(j) + "--")
                    job = r_data['jobz'][(i * j) - 1]
                    job1 = tuple(job)
                    title = job1[3][0]
                    company = job1[4][0]
                    sal = job1[8][0]
                    # {'title', 'company', 'sal'}
                    tup = [title, company, sal, plots]
                    job_list.append(tup)
                job_cluster.append(job_list)

        return {'stat': "done", 'jobs_cluster': job_cluster, 'med': range((med + 2))}

# When doing multithreading you need to define a function
# and give separate memory allocation for each thread.
# sc = ScrapeMain()
# sc.scrape_main('architect')
def test_get_sijoittajan_tunnuslukuja_Controll(tester, company_id):
    scraper = Scraper(company_id)
    sijoittajan_tunnuslukuja = scraper.get_sijoittajan_tunnuslukuja()
    assert_tulostietoja(tester, company_id, sijoittajan_tunnuslukuja, 12)
def test_get_maksuvalmius_Controll(tester, company_id):
    scraper = Scraper(company_id)
    maksuvalmius = scraper.get_maksuvalmius()
    assert_tulostietoja(tester, company_id, maksuvalmius, 3)
def test_get_vakavaraisuus_Controll(tester, company_id):
    scraper = Scraper(company_id)
    vakavaraisuus = scraper.get_vakavaraisuus()
    assert_tulostietoja(tester, company_id, vakavaraisuus, 6)
def test_get_name(self):
    for company_id in some_company_ids:
        scraper = Scraper(company_id)
        name = scraper.get_name()
        self.assertIsInstance(name, str)
        self.assertGreater(len(name), 2)
# -*- coding: utf-8 -*-
"""
@author: Muhammed
"""
from scraping import Scraper
from text_similarity import TextSimilarity
from flask import Flask, render_template, redirect, url_for, request
import pandas as pd

Scraper = Scraper()
TextSimilarity = TextSimilarity()

app = Flask(__name__)

categories = ["Cep Telefonu", "Dizüstü Bilgisayar", "Tablet"]
### import time ###


@app.route("/", methods=["GET", "POST"])
def home():
    return render_template("index.html", categories=categories)


@app.route("/urun-bilgileri", methods=["POST", "GET"])
def urun_bilgileri():
    if request.method == "POST":
def test_get_tunnuslukuja_Controll(tester, company_id):
    scraper = Scraper(company_id)
    tunnuslukuja = scraper.get_tunnuslukuja()
    tester.assertEqual(len(tunnuslukuja), 6)
    for key in tunnuslukuja:
        tester.assertIsInstance(tunnuslukuja[key], float)
def test_get_kuvaus(self):
    for company_id in some_company_ids:
        scraper = Scraper(company_id)
        kuvaus = scraper.get_kuvaus()
        self.assertIsInstance(kuvaus, str)
def test_get_kurssi(self):
    for company_id in some_company_ids:
        scraper = Scraper(company_id)
        kurssi = scraper.get_kurssi()
        self.assertIsInstance(kurssi, float)
import logging

import colorama
import yaml

# Local.
from scraping import Scraper

# Load config.
cp = "conf.yaml"  # Config Path string
with open(cp, encoding='utf-8') as cf:  # Config File object
    cd = yaml.load(cf, Loader=yaml.FullLoader)  # Config Dict

# Configure logging.
logging.basicConfig(filename=cd["TitleUrlLog"],
                    filemode="w",
                    level=logging.INFO,
                    format=f"\n {'-'*23} \n %(asctime)s %(message)s")
"""Overwrite log file on every interpreter (not script) launch."""

# Console.
colorama.init()
print(colorama.Fore.CYAN, end="")  # Set text color.
# print(colorama.Style.BRIGHT, end="")  # Set text brightness. Default: colorama.Style.NORMAL

s = Scraper(cd)
s.start()
print(f"Type {cd['QuitKw']} to quit.")
while s.is_alive():
    if input().strip().lower() == cd["QuitKw"]:
        s.quit()
        break

print(colorama.Style.RESET_ALL, end="")
colorama.deinit()
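# A minimal sketch of a conf.yaml that would satisfy the keys this script reads
# directly (TitleUrlLog and QuitKw). Any further keys the Scraper class expects
# are project-specific and not shown; the file name and values below are
# assumptions for illustration only.
import yaml

example_conf = {
    "TitleUrlLog": "title_url.log",  # assumed log file name
    "QuitKw": "q",                   # assumed quit keyword
}

with open("conf.example.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(example_conf, f, allow_unicode=True)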
def test_get_toiminnan_laajuus_Controll(tester, company_id):
    scraper = Scraper(company_id)
    toiminnan_laajuus = scraper.get_toiminnan_laajuus()
    assert_tulostietoja(tester, company_id, toiminnan_laajuus, 7)
def test_get_kannattavuus_Controll(tester, company_id):
    scraper = Scraper(company_id)
    kannattavuus = scraper.get_kannattavuus()
    assert_tulostietoja(tester, company_id, kannattavuus, 7)
        if not page.has_attrs():
            # Raising a bare string is invalid; wrap the message in an exception.
            raise RuntimeError("Can't load: url = %s" % (page.url,))

        self._add_page(page)

    def _add_page(self, page):
        if not page.is_valid():
            return
        for link_page in page.create_similar_pages():
            code = link_page.get_code()
            if code in self.codes:
                continue
            self.page_stack.append(link_page)
            self.codes.add(code)


if __name__ == '__main__':
    init_logger()

    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--lang', required=True)  # ja,us
    args = parser.parse_args()
    args = vars(args)
    lang = args['lang']

    scrape_target = ArtistScrapeTarget(lang)
    scraper = Scraper(scrape_target)
    scraper.run()