def homePageScrapper(html):
    soup = bs(html, 'lxml')
    targetSection = soup.find(id="mainResults")
    targetDivs = targetSection.findAll('div', {'class': 's-item-container'})
    links = {}
    for div in targetDivs:
        itemLink = div.find(
            'a',
            {'class': 'a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal'})
        itemName = div.find(
            'h2', {'class': 'a-size-base s-inline s-access-title a-text-normal'})
        links[itemName.text] = itemLink.get('href')
    file1 = open("data/homePageLinks.csv", "w")
    file2 = open("data/homePageLinks.txt", "w")
    file1.write('"Product Name","Link",\n')
    for key in links:
        fileEntry = '"' + key + '"' + ',' + '"' + links[key] + '"' + '\n'
        file1.write(fileEntry)
        file2.write(key + '~' + links[key] + '\n')
    file1.close()
    file2.close()
    print('\ncreated a file data/homePageLinks.csv\n')
    scraper("homePageLinks.txt")

def await_market_open(num):
    setup_folders()
    num += 1
    if num > 4:
        print("market not opening today")
        return
    print("checking time...")
    client = build_client()
    today = datetime.date.today()
    try:
        clock = client.get_hours_for_single_market(
            market=client.Markets.EQUITY, date=today).json()["equity"]["EQ"]
    except KeyError:
        clock = client.get_hours_for_single_market(
            market=client.Markets.EQUITY, date=today).json()["equity"]["equity"]
    # app starts right at 9:30 est from scheduler
    # if it's a trading day, start the app
    if clock["isOpen"] == True:
        print("Beginning process.")
        scraper()                  # assess('skip')
        # time.sleep( 1 )          # This whole process (from scrape to starting watchdog) takes about 2-5 minutes
        daily_trader('initial')    # so there's also inherently a delay between market open and when the app
        time.sleep(1)              # starts trading
        run_watchdog(0)            #
    else:
        print("market not open today.")
        sys.exit()

def main(audio_dir, json_dir, download_hours_ahead, audio_hours_retain, json_hours_retain):
    scraper.scraper(json_dir)
    logging.info("Scraper complete")
    sh.cleanup(audio_dir, audio_hours_retain)
    logging.info("Audio cleanup complete")
    sh.cleanup(json_dir, json_hours_retain)
    logging.info("JSON cleanup complete")
    downloader.downloader(download_hours_ahead, audio_dir, json_dir)
    logging.info("Downloader complete")

def main():
    """does the work, does not have the classes"""
    scraper.scraper()  # run the scraper
    with open('output.csv', 'r') as csv_file:  # open file until done reading
        lists = LinkedList.LinkedList()
        for row in csv_file:  # make the nodes from the .csv file and put them into a linked list
            state_text = repr(row.strip())
            lists.new_node(state_text)
    lists.print_list()

    # bools for loops, self-explanatory names
    has_state = False
    still_search = True
    while still_search:
        search_for = raw_input("\nEnter a letter to narrow search "
                               "(1: exit, 0: whole list): ")
        print " "
        if search_for == "0":  # print whole list
            lists.print_list()
        elif search_for == "1":  # quit
            print "Ending search.\n"
            still_search = False
        else:  # print only the states beginning with user inputted letter
            current_node = lists.head
            while current_node.next != None:  # navigate list until the end
                if current_node.cargo[0] == search_for.upper():  # if state starts with input, has_state = True
                    print current_node.cargo
                    has_state = True
                current_node = current_node.next
            if has_state == False:  # if no states are found
                print "No state or territory found."
            has_state = False

def run(self):
    count = 0
    while True:
        try:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp, self.mostCommonWords,
                                   self.icsSubDomains, self.longestPage,
                                   self.similarURLs)
            for scraped_url in scraped_urls:  # For each scraped url, add only if not discovered
                if scraped_url not in self.discoveredURLs:
                    self.discoveredURLs.add(scraped_url)
                    self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
            count += 1
            print("\n", count, "\n")
        except:
            print("IT BLEW UPPPPPPPP")
            pass

def run(self):
    # this is where we are gonna call the function to get the robot.txt
    record = Recorder()
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        if not self.can_fetch(tbd_url):
            continue
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls = scraper(tbd_url, resp)
        # adding data to recorder
        record.add_url(tbd_url)
        if resp.raw_response is not None and is_valid(tbd_url):
            record.add_words(resp.raw_response.content, tbd_url)
            record.save()
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
    record.finish_crawl_report()

def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        # Politeness. Check if diff is less than 500 milliseconds.
        current_time = int(round(time.time() * 1000))
        parsed = urlparse(tbd_url, allow_fragments=False)
        if parsed.netloc in self.time_visited:
            if current_time - self.time_visited[parsed.netloc] < 500:
                # print("sleeping for ", (500-(current_time-time_visited[parsed.netloc])-1) * .001)
                time.sleep(
                    ((500 - (current_time - self.time_visited[parsed.netloc])) + 10) * .001)
                current_time = int(round(time.time() * 1000))
        self.time_visited[parsed.netloc] = current_time
        resp = download(tbd_url, self.config, self.logger)
        if resp == False:
            continue
        else:
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp)
            # print("scraped_urls: ", scraped_urls)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)

def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        # if there is a url to download on the frontier
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        # put the response = download into a try except, in case there is a timeout
        # and resp doesn't equal anything
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(
            f"Downloaded {tbd_url}, status <{resp.status}>, "
            f"using cache {self.config.cache_server}.")
        # after getting the response from the webpage, the function will
        # store the information in the self.frontier
        self.frontier.store_page_text_content(resp, tbd_url)
        scraped_urls = scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
    self.frontier.close_files()
    print("number of unique pages is:", unique_pages(self.frontier.discovered_urls))
    print("longest page is:", longest_page(self.frontier.site_content))
    print("fifty most common words are here:", fifty_most_common_words(self.frontier.word_frequencies))
    print(ics_subdomain_frequencies(self.frontier.discovered_urls))

def index():
    if request.args:
        link = request.args['link']
        n_posts = int(request.args['n_posts'])
        print(f'i got the args they are {link, n_posts}')

        # result retrieval starts here
        raw_comments_df = scraper(link, n_posts)
        print('raw comments shape', raw_comments_df.shape)
        # print(raw_comments_df)

        preprocessed_comments_df = pd.DataFrame(
            {'text': raw_comments_df['text'].tolist()})
        preprocessed_comments_df['text'] = preprocessed_comments_df.text.apply(
            lambda x: str(preprocess(x)))
        preprocessed_comments_df = preprocessed_comments_df[
            preprocessed_comments_df['text'] != 'None']
        print('preproc comments shape', preprocessed_comments_df.shape)
        # print(preprocessed_comments_df)

        prediction = model_predict(preprocessed_comments_df)
        result = metrics(prediction)
        positive_n = result['positive_n']
        negative_n = result['negative_n']
        neutral_n = result['neutral_n']
        positive_index = result['positive_index']
        neutral_index = result['neutral_index']
        return render_template('result.html',
                               positive=positive_n,
                               negative=negative_n,
                               neutral=neutral_n,
                               positive_index=positive_index,
                               neutral_index=neutral_index)
    return render_template('index.html', links=[])

def main():
    # output some information
    print("=====================================================")
    print("Welcome to Letterboxd Friend Ranker!")
    print("Instructions: This program compares you and")
    print("your friend's film taste. Once all the data has")
    print("been scraped and scores have been computed,")
    print("a report will be generated. The lower the avg.")
    print("difference, the better. If you and a friend do")
    print("not share at least 30 watched films, a score will")
    print("not be computed.")
    print("=====================================================\n")

    # prompt for info
    file_name = ""
    username = input("Enter your Letterboxd username: "******"Scraping friend data...\n")
    friends = sc.scraper(username)
    print("Computing scores...\n")

    # compute commonality for each friend
    results = commonality(current_user, friends)

    # write report
    print("Generating report...\n")
    gr.generate_report(results, current_user)
    print("Done! View the report in the current directory!")

def BFS(root_link):
    q = Queue()
    seen = set()
    explored = []
    base_link = root_link[:root_link[:root_link.rfind('/')].rfind('/')]
    root_title = root_link[root_link.rfind('/') + 1:]
    root_node = Node(title=root_title, link=root_link)
    seen.add(root_title)
    q.put(root_node)
    while not q.empty():
        curr = q.get()
        next_links = scraper(curr.link)
        for link in next_links:
            if link[1] not in seen:
                next_node = Node(link=base_link + link[0], title=link[1], parent=curr)
                curr.add_child(next_node)
                q.put(next_node)
                seen.add(next_node.title)
        explored.append(curr)
    for node in explored:
        print("{} evolves into {}".format(node.title, node.list_children()))

def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        delta = datetime.timedelta(seconds=.5)
        # extract domain from url (does not account for toay.blah./blah/blah/)
        split = urlparse(tbd_url).netloc.split('.')
        domain = split[-3] + '.' + split[-2] + '.' + split[-1]
        print("DOMAIN: " + domain)
        # if we've accessed tbd_url domain within 500ms then sleep(500ms)
        if domain in last_seen and (
                datetime.datetime.now() - last_seen[domain] < delta):
            print("====SLEEPING====")
            time.sleep(.5)
        # store tbd_url accessed at current time.
        last_seen[domain] = datetime.datetime.now()
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls = scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)

def run(self):
    i = 0
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        try:
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp, self.state)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
        except HTTPError as err:
            self.logger.error(f"Downloaded {tbd_url}, hitting error {err}")
        self.frontier.mark_url_complete(tbd_url)
        if i % 1000 == 0:
            print(self.state['longest_page'])
            print_freq(self.state['word_rank'], 50)
            for domain, count in self.state['sub_domains'].items():
                print(domain, count)
            self.frontier.print_saved()
        i += 1

def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            printStats()
            break
        # We will ignore any network exceptions and retry.
        startTime = time.time()
        resp = None
        hasFailed = False
        while resp is None:
            try:
                resp = download(tbd_url, self.config, self.logger)
            except Exception as ex:
                hasFailed = True
                print(f"{ex}\nRetrying in 60 sec.")
                time.sleep(60)
        if hasFailed:
            with open("server-outages.rtf", "a+") as err:
                err.write(f"Server outage from: {startTime} to: {time.time()} duration: {round(time.time() - startTime)} sec.\n")
        self.logger.info(
            f"Downloaded {tbd_url}, status <{resp.status}>, "
            f"using cache {self.config.cache_server}.")
        scraped_urls = scraper(tbd_url, resp)
        if scraped_urls is not None:
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)

def run(self):
    while True:
        try:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                final_report()
                break
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp, self.logger)

            # Report the stats every so often
            self.sites_crawled += 1
            if self.sites_crawled >= 100:
                self.sites_crawled = 0
                self.logger.info(
                    "Downloaded 100 sites. Generating a report-so-far")
                final_report()

            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
        except Exception:
            # If the crawler runs into any exception, spit out the final report before re-raising the exception
            self.logger.info(
                "Worker caught an exception. Generating final report before exit.")
            final_report()
            raise

def main():
    scraper = sp.scraper()
    scraper.config_user("lukezim5", "playlist-read-private",
                        "3035253db48b41bc9d5054646ff3cac5",
                        "163471d7285c4fe58fb90573b5042bc6")
    scraper.get_featured_playlists()
    return

def main():
    """ Demonstrates capabilities of linkedlist class """
    scraper.scraper()

    # make the nodes from the .csv file and put them into a linked list
    with open('output.csv', 'r') as csv_file:
        lists = LinkedList.LinkedList()
        for row in csv_file:
            state_text = repr(row.strip())
            lists.new_node(state_text)
    lists.print_list()

    # searches for list values with matching first letter
    has_state = False
    still_search = True
    while still_search:
        search_for = raw_input("\nEnter a letter to narrow search "
                               "(1: exit, 0: whole list): ")
        print " "
        if search_for == "0":
            lists.print_list()
        elif search_for == "1":
            print "Ending search.\n"
            still_search = False
        else:
            # print only the states beginning with user inputted letter
            current_node = lists.head
            while current_node.next != None:
                if current_node.cargo[0] == search_for.upper():
                    print current_node.cargo
                    has_state = True
                current_node = current_node.next
            if has_state == False:
                print "No state or territory found."
            has_state = False

def main():
    print('Glassdoor Job Scraper.')
    job_title = input('Job search input: ')
    location = input('Location: ')
    while True:
        outcome = scraper(job_title, location, GD_USERNAME, GD_PASSWORD)
        if outcome:
            break

def scraperurl():
    # collect dict results from scraper
    JSON = scraper()
    # establish collection object
    collection = mongo.db.disease_collection
    # upsert json to collection
    collection.update({}, JSON, upsert=True)
    return redirect('/data', code=302)

def download():
    url = 'https://www.proxynova.com/proxy-server-list/elite-proxies/'
    soup = scraper(url)
    print(soup)
    with open('soup_file.txt', 'w') as f:
        json.dump(str(soup.prettify()), f)
    print('Success')

def handle(self, *args, **options):
    url = options['url']
    scraper.scraper(url, scraper_op)
    for i in golfers:
        # print first
        try:
            g = Golfer.objects.get(name=i)
        except Golfer.DoesNotExist:
            print i + " does not exist. Creating..."
            g = Golfer(name=i)
            g.save()
    name = raw_input('Name of event?: ')
    event = Event(name=name, active=True)
    event.save()

def scraperCallBack():
    print("GUI: executing scraper")
    stockList = scraper()
    print("GUI: executing merger")
    merger(stockList)
    print("GUI: executing overview update")
    overviewUpdate(stockList)
    print("GUI: Update request finished")
    messagebox.showinfo("Progress", "Update finished")

def main():
    args = arguments()
    browser = args.browser
    if browser.lower() == 'c' or browser.lower() == 'chrome':
        driver = driver_chrome()
    elif browser.lower() == 'f' or browser.lower() == 'firefox':
        driver = driver_firefox()
    print('\n-- RECLAME AQUI SCRAPER --')
    file = args.file
    id_page = args.id
    pages = args.pages
    conn, cursor = db_conn()
    coletor = url_collector(driver, file, id_page, pages, conn, cursor)
    scraper(driver, coletor, id_page, conn, cursor)
    driver.quit()

def homepage():
    st.title("Enter Amazon product url to fetch data")
    url = st.text_input("")
    if st.button('Fetch Data'):
        all_reviews_df, product_title = scraper.scraper(url)
        if all_reviews_df is not None:
            st.dataframe(all_reviews_df.head())
            title = preprocessing.product_name(product_title)
            all_reviews_df.to_csv(f"./Amazon Reviews/{title}.csv")
            preprocessing.clean_data(all_reviews_df, title)

def run(self):
    try:
        while True:
            self._lock()
            try:
                tbd_url = self.frontier.get_tbd_url()
                if tbd_url:
                    self.threads_in_processing.add(self.worker_id)
                elif tbd_url is None and len(self.threads_in_processing) == 0:
                    self.logger.info(
                        f"Frontier is empty. Stopping the Worker: {self.worker_id}")
                    break
            finally:
                self._unlock()

            if tbd_url is None or tbd_url == "":
                time.sleep(0.05)
                continue

            self.logger.info(
                f"Worker: {self.worker_id} Downloading: {tbd_url}")
            resp = download(tbd_url, self.config, self.logger)
            if resp.raw_response is None and resp.error.startswith("EMPTYCONTENT"):
                self.logger.error(f"{resp.error}, status <{resp.status}>")
            self.logger.info(
                f"Worker: {self.worker_id} Downloaded : {tbd_url}, status <{resp.status}>")
            scraped_urls = scraper(tbd_url, resp)
            new_urls_added = 0
            self._lock()
            try:
                for scraped_url in scraped_urls:
                    if self.frontier.add_url(scraped_url):
                        new_urls_added += 1
                self.frontier.mark_url_complete(tbd_url)
            finally:
                self._unlock()
            self.threads_in_processing.remove(self.worker_id)
            self.logger.info(
                f"Worker: {self.worker_id}, Added: {new_urls_added}, Remaining: {self.frontier.count_tbd_urls()}")
    except BaseException:
        self.logger.exception(
            f"Unexpected exception in Worker: {self.worker_id}")
    finally:
        if self.worker_id in self.threads_in_processing:
            self.threads_in_processing.remove(self.worker_id)
        self.logger.info(f"Worker: {self.worker_id} Stopped")

def pageScrapper(html, index):
    soup = bs(html, 'lxml')
    targetDivs = soup.findAll('div', {'class': 'a-section a-spacing-medium'})
    print(len(targetDivs))
    links = {}
    for div in targetDivs:
        itemLink = div.find('a', {'class': 'a-link-normal a-text-normal'})
        itemName = div.find('span', {'class': 'a-size-medium a-color-base a-text-normal'})
        links[itemName.text] = itemLink.get('href')
    file1 = open("data/pageLinks" + str(index) + ".csv", "w")
    file2 = open("data/pageLinks.txt", "w")
    file1.write('"Product Name","Link",\n')
    for key in links:
        fileEntry = '"' + key + '"' + ',' + '"' + links[key] + '"' + '\n'
        file1.write(fileEntry)
        file2.write(key + '~' + links[key] + '\n')
    file1.close()
    file2.close()
    print('\ncreated a file data/pageLinks' + str(index) + '.csv\n')
    scraper("pageLinks.txt")

def check_source(source, article):
    results = scraper.scraper(article['link'])
    if results is None:
        text1 = None
    else:
        text1 = results[0]
    text2 = article['full_text']
    assert text1 == text2

def animate(i):
    scraper = sc.scraper()
    site = scraper.scrapeWebsite(
        'https://www1.oanda.com/currency/live-exchange-rates/')
    parsed = scraper.parserHTML(site)
    scraper.writer(parsed, 'data.json')
    df = reader.readFile('data.json')
    newDF = reader.createData(df)
    a.clear()
    max = newDF[['GBP_USD']].max()
    min = newDF[['GBP_USD']].min()
    a.set_title('Current $ to £ Exchange Rates')
    newDF.plot(kind='line', x='TIME', y='GBP_USD', ax=a, ylim=[1.30, 1.32])

def main():
    path = "C:\\Users\Arthur\Documents\Studies\ParisDauphineUniversity\Master203\\Newsletter\\Scraper\\data\\"
    price_list = []
    for ticker, sec in TICKERS.items():
        print(ticker)
        price_list.append(scraper(ticker, sec))
    price_list = clean_data(price_list)
    ComputingTable = compute_data(price_list, path)
    TableStr = format_table(
        ComputingTable,
        "C:\\Users\Arthur\Documents\Studies\ParisDauphineUniversity\Master203\\Newsletter\\Scraper\\pictures\\")
    return TableStr

def email(item, min_price, max_price, id_list, chat_id):
    # Create the HTML version of your message
    # html = """\
    # <html>
    # <body>"""
    html = ""
    items = scraper.scraper(item, min_price, max_price, id_list)
    if len(items) != 0:
        for item in items:
            # html = html + """<p><b>%s</b><br>
            # html = '<b>%s</b> %s %s ><a href="%s">Ad Link</a>' % (item.title, item.price, item.date, "http://" + item.url)
            title = item.title
            price = item.price
            date = item.date
            url = item.url
            title = title.replace("*", " ")
            date = date.replace("< ", "")
            text = '<b>' + title + '</b>' + '\n' + price + '\n' + date + '\n' + '<a href="http://' + url + '">Ad link</a>'
            bot.send_message(chat_id=chat_id, text=text, parse_mode=telegram.ParseMode.HTML)
    # html = html + """</body></html>"""

    # sender_email = senderE
    # receiver_email = receiverE
    # password = password

    # message = MIMEMultipart("alternative")
    # message["Subject"] = "Kijiji Ad Alert!!!!"
    # message["From"] = sender_email
    # message["To"] = receiver_email

    # Turn these into plain/html MIMEText objects
    # part1 = MIMEText(text, "plain")
    # part2 = MIMEText(html, "html")

    # Add HTML/plain-text parts to MIMEMultipart message
    # The email client will try to render the last part first
    # message.attach(part1)
    # message.attach(part2)

    # Create secure connection with server and send email
    # context = ssl.create_default_context()
    # with smtplib.SMTP_SSL("smtp.gmail.com", 465, context=context) as server:
    #     server.login(sender_email, password)
    #     server.sendmail(
    #         sender_email, receiver_email, message.as_string()
    #     )
    return None

def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls, urlInfo, token_list = scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.reporter.add_words(tbd_url, token_list)
        self.frontier.mark_url_complete(tbd_url, urlInfo)

def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls = scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)

def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            print("************* REPORT ****************")
            print()
            print("Team Members:")
            print("    Kamaniya Sathish Kumar (56361951)")
            print("    Samhitha Tarra (69706915)")
            print("    Vani Anilkumar (36335618)")
            print()
            print("Number of Unique URLs:", scraper.count_unique_url)
            print()
            print("Longest URL:", scraper.longest_page)
            print("Number of Tokens in Longest URL:", scraper.num_words_longest_page)
            print()
            print("50 Most Common Words:")
            counter = 1
            for key, value in sorted(scraper.master_freq_dict.items(),
                                     key=lambda x: x[1], reverse=True):
                if counter <= 50:
                    print(str(counter) + ". " + key + " (" + str(value) + ")")
                    counter = int(counter)
                    counter += 1
                else:
                    break
            print()
            print("Subdomains in ics.uci.edu:")
            for tup, val in sorted(scraper.master_subdomain_dict.items(),
                                   key=lambda x: x[0]):
                url_string = ""
                url_string += tup[1] + "://" + tup[0] + ".ics.uci.edu,"
                print(url_string, val)
            print()
            print("************* REPORT ****************")
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(
            f"Downloaded {tbd_url}, status <{resp.status}>, "
            f"using cache {self.config.cache_server}.")
        scraped_urls = scraper.scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)

def imageLookupLoop():
    cacheDir = adGlobal.cacheDir
    image_type = "Action"
    maxImagesPerHost = 5
    searchType = adGlobal.searchType
    syslog.syslog("search method: " + searchType)
    if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
    hosts = []
    services = subprocess.check_output(["slptool", "findsrvs", "service:artdisplay.x"]).split('\n')
    if len(services) == 0:
        syslog.syslog("no available services")
        return
    for s in services:
        if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
        if slpDebug: syslog.syslog("slp s:" + s)
        loc = s.split(',')
        if loc[0] == '':
            continue
        if slpDebug: syslog.syslog("loc:" + str(loc))
        if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
        attr = subprocess.check_output(["slptool", "findattrs", loc[0]])
        host = {}
        host['ip'] = loc[0].split("//")[1]
        host['hasPanel'] = False
        host['isDispText'] = False
        if attr.find("hasPanel") != -1:
            if slpDebug: syslog.syslog(str(host) + ":host has panel")
            host['hasPanel'] = True
        if attr.find("isDispText") != -1:
            if slpDebug: syslog.syslog(str(host) + ":host is a text display")
            host['isDispText'] = True
        if slpDebug: syslog.syslog("slp host" + str(host))
        hosts.append(host)
    if slpDebug: syslog.syslog("hosts:" + str(hosts))
    if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
    images = []
    choices = []
    if searchType == "Archive":
        if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
        vars = archive.getArchive()
        images = vars[0]
        choices = vars[1]
        if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
    else:
        while len(images) < 20:
            images = []
            choices = []
            choices = words.getWords()
            images = scraper.scraper(choices[:])
            if len(images) == 1 and images[0] == "error":
                return
    syslog.syslog("select: " + choices[0] + " " + choices[1])
    copyList = {}
    if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
    for h in hosts:
        copyList[h['ip']] = {}
        copyList[h['ip']]['image'] = []
        copyList[h['ip']]['flag'] = []
        copyList[h['ip']]['text'] = None
        if h['hasPanel'] or h['isDispText']:
            if dispDebug: syslog.syslog("doing has panel or disp text")
            if len(choices) < 2:
                syslog.syslog("WARNING, choices array not loaded")
            else:
                textPath = cacheDir + "/" + adGlobal.textName
                adGlobal.mutex.acquire()
                f = open(textPath, 'w')
                f.write(choices[0] + '\n')
                f.write(choices[1] + '\n')
                f.close()
                adGlobal.mutex.release()
                copyList[h['ip']]['text'] = textPath
                if debug: syslog.syslog("textPath:" + textPath)
    hostCount = 0
    if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
    for image in images:
        if hosts[hostCount]['isDispText']:
            if dispDebug: syslog.syslog("skipping image store for:" + hosts[hostCount]['ip'])
            hostCount += 1
            if hostCount == len(hosts):
                hostCount = 0
        if searchType != "Archive":
            if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
            raw_img = getRawImage(image)
            if hangDebug: syslog.syslog("Hang debug:" + __file__ + " " + str(inspect.currentframe().f_lineno))
            if raw_img is None:
                syslog.syslog("raw_image = none", image)
                continue
            adGlobal.mutex.acquire()
            cntr = len([i for i in os.listdir(cacheDir) if image_type in i]) + 1
            adGlobal.mutex.release()
            if debug: syslog.syslog(str(cntr))
            imgPath = cacheDir + '/' + image_type + "_" + str(cntr) + ".jpg"
            adGlobal.mutex.acquire()
            f = open(imgPath, 'wb')
            f.write(raw_img)
            f.close()
            flgPath = cacheDir + '/' + image_type + "_" + str(cntr) + ".flg"
            f = open(flgPath, 'w')
            f.close()
            adGlobal.mutex.release()
            del raw_img
        else:
            adGlobal.mutex.acquire()
            try:
                cmd = ["cp", image, cacheDir]
                if debug: syslog.syslog("cmd:" + str(cmd))
                subprocess.check_output(cmd)
                imgPath = cacheDir + "/" + os.path.basename(image)
                i = os.path.basename(image)
                flgPath = cacheDir + "/" + image[:image.rindex(".")] + ".flg"
                cmd = ["touch", flgPath]
                if debug: syslog.syslog("cmd" + str(cmd))
                subprocess.check_output(cmd)
            except subprocess.CalledProcessError, e:
                syslog.syslog("archive file copy problem: " + ', '.join(cmd) + str(e.output))
                adGlobal.mutex.release()
                continue
            finally:

def parse_feed(url, debug=False, mode='streaming'):
    """
    params: url - string for the location of an RSS feed

    returns: writes all new articles in feed to mongo db with all relevant noozli data
    """
    feed = feedparser.parse(url)
    if len(feed['entries']) == 0:
        log.info("No items in feed: " + url)
        return None

    try:
        client = MongoClient('localhost', 27017)
    except Exception as e:
        log.warning('Failed to connect to mongo.')

    db = client.noozli
    if mode == 'prototype':
        collection = db.prototype
    else:
        collection = db.streaming

    if 'npr.org' in url:
        source = 'NPR - ' + feed['feed']['title']
    else:
        source = feed['feed']['title']

    # normalize fox news source
    if source == "FOXNews.com":
        source = "FOX News"

    # limit number of articles for test set
    if mode == 'prototype':
        if collection.find({'source': source}).count() > 100:
            log.info("Source '" + source + "' has " +
                     str(collection.find({'source': source}).count()) +
                     ' articles in database. Skipping.')
            return None

    log.info('parsing feed ' + url)

    articles = []
    count = 0
    for item in feed['entries']:
        if debug and count == 100:
            break

        # don't normalize abc urls, they don't work
        if 'abcnews.go.com' in item['link']:
            rss_link = item['link']
        else:
            # normalize urls
            p_url = urlparse(item['link'])
            rss_link = p_url.scheme + "://" + p_url.netloc + p_url.path
            strip_regex = re.compile('index\.html|index\.shtml')
            rss_link = strip_regex.sub('', rss_link)

        # Business Insider occasionally has bloomberg articles, skip them
        if source == 'Business Insider' and 'bloomberg.com' in rss_link:
            continue

        article_id = hashlib.md5(str.encode(rss_link)).hexdigest()

        if collection.find({'links': rss_link}).count() > 0:
            log.debug('Already found "' + source + '" article: ' + rss_link)
            continue

        title = item['title']

        try:
            if type(item['content']) == list:
                if len(item['content']) != 1:
                    log.info('RSS feed "content" field has more than 1 entry. ' + url)
                content = item['content'][0]['value']
            else:
                content = item['content']['value']
        except KeyError:
            try:
                content = item['summary']
            except KeyError:
                content = None

        try:
            author = item['author']
        except KeyError:
            author = None

        published = item['published']

        # add datetime stamp to data for ttl expiration
        date, zone = published.rsplit(' ', 1)
        try:
            published_dt = datetime.datetime.strptime(date, "%a, %d %b %Y %H:%M:%S")
        except:
            log.warning('unable to parse published time: ' + published)
            continue

        # add time zone awareness
        found_numerical_zone = False
        if len(zone) >= 5:
            search = re.search('(\+|\-)[0-9]+', zone)
            if search:
                found_numerical_zone = True
                zone = search.group()
        if zone == 'EDT' or zone == 'EST':
            published_tz = pytz.timezone('US/Eastern')
        elif zone == 'CDT' or zone == 'CST':
            published_tz = pytz.timezone('US/Central')
        elif zone == 'MDT' or zone == 'MST':
            published_tz = pytz.timezone('US/Mountain')
        elif zone == 'PDT' or zone == 'PST':
            published_tz = pytz.timezone('US/Western')
        elif zone == 'UTC' or zone == 'GMT':
            published_tz = pytz.timezone('UTC')
        elif found_numerical_zone:
            zone = zone.rstrip('0')
            if len(zone) > 1:
                published_dt += datetime.timedelta(hours=int(zone))
        else:
            log.warning('unable to determine timezone: ' + published)
            continue

        if not found_numerical_zone:
            published_dt = published_tz.localize(published_dt)
            comp_dt = published_dt.astimezone(pytz.utc)
        else:
            comp_dt = published_dt

        # check if article is old, skip
        try:
            age = datetime.datetime.utcnow() - comp_dt
        except TypeError:
            utc_tz = pytz.timezone('UTC')
            age = utc_tz.localize(datetime.datetime.utcnow()) - comp_dt
        if age.seconds + age.days * 86400 >= 2592000:
            log.debug('url ' + rss_link + ' more than 30 days old, skipping.')
            continue

        #
        # perform scrape (want to minimize this as much as possible, don't annoy hosts)
        #
        try:
            result = scraper.scraper(rss_link, debug)
        except UnicodeEncodeError:
            # sometimes finding non utf-8 unicode characters, for now skip
            result = None
            log.warning('non utf-8 character in ' + rss_link)

        full_text = None
        og_url = None
        if result is not None:
            full_text = result[0]
            image_url = result[1]
            display_text = result[2]
            og_url = result[3]

        # if og:url exists redefine the link and article_id hash, some rss feeds have badly formatted urls
        og_link = None
        if og_url is not None:
            p_url = urlparse(og_url)
            og_link = p_url.scheme + "://" + p_url.netloc + p_url.path
            strip_regex = re.compile('index\.html|index\.shtml')
            og_link = strip_regex.sub('', og_link)

            # recompute article id and check again for its existence in database (based on og:url tag)
            article_id = hashlib.md5(str.encode(og_link)).hexdigest()

        if og_link is None:
            links = [rss_link]
        elif rss_link != og_link:
            links = [og_link, rss_link]
        else:
            links = [og_link]

        if full_text is not None:
            if len(full_text) < 250:
                if og_url is not None:
                    log.debug('Text too short on ' + og_link + '. Skipping.')
                else:
                    log.debug('Text too short on ' + rss_link + '. Skipping.')
                continue

            # check to see if article is in DB based on the text
            # handles sources that get resyndicated (AP, CNN, ABC, etc.)
            matching_articles = collection.find({'full_text': full_text})
            if matching_articles.count() > 0:
                if og_url is not None:
                    log.debug('Already found "' + source + '" article by text: ' + og_link)
                else:
                    log.debug('Already found "' + source + '" article by text: ' + rss_link)

                # if text of article was already found, associate all links that point to that same article
                for art in matching_articles:
                    new_links = list(art['links'])
                    if rss_link not in new_links:
                        new_links.append(rss_link)
                    if og_link != rss_link and og_link not in new_links:
                        new_links.append(og_link)
                    if len(new_links) > len(art['links']):
                        collection.update(
                            {'_id': art['_id']},
                            {'$set': {'links': new_links}}
                        )
                continue

            # check og_link after checking if text was found in db, so we can add additional rss_links
            if collection.find({'links': og_link}).count() > 0:
                log.debug('Already found "' + source + '" article: ' + og_link)
                continue

            # if mode == 'prototype':
            topics = textrazor_categorization(full_text)
            # else:
            #     topics = None

            if image_url is not None:
                server_image_url = resize_and_store_image(image_url, article_id)
            else:
                server_image_url = None

            article_data = {'article_id': article_id,
                            'title': title,
                            'author': author,
                            'summary': content,
                            'full_text': full_text,
                            'display_text': display_text,
                            'image': server_image_url,
                            'links': links,
                            'published': published,
                            'timestamp': published_dt,
                            'topics': {'text_razor': topics},
                            'source': source}

            if debug:
                articles.append(article_data)
            else:
                log.debug('Adding "' + source + '" article: ' + links[0])
                collection.insert(article_data)
            count += 1
        else:
            if og_url is not None:
                log.warning("error: did not extract any text from - " + og_link)
            else:
                log.warning("error: did not extract any text from - " + rss_link)

            # append empty
            if debug:
                article_data = {'links': links,
                                'full_text': None,
                                'display_text': None,
                                'source': source}
                articles.append(article_data)

            # in streaming mode, append all links that failed to scrape to reduce website access
            if mode == 'streaming':
                if og_url is None:
                    links = [rss_link]
                else:
                    links = [rss_link, og_link]
                article_data = {'article_id': article_id,
                                'full_text': None,
                                'display_text': None,
                                'links': links,
                                'source': source,
                                'timestamp': published_dt}
                if not debug:
                    collection.insert(article_data)

    log.info('Added ' + str(count) + ' articles from ' + source)
    if debug:
        return articles

from scraper import scraper
import json
import os
import time


def diff(A, B):
    if len(A.keys()) > len(B.keys()):
        A, B = B, A
    for k in A.keys():
        if A[k] != B[k]:
            print '--', k, A[k]
            print '++', k, B[k]
            print ''


while True:
    try:
        if os.path.exists(os.path.join(os.getcwd(), 'database.json')):
            with open('database.json', 'r') as fin:
                old_database = json.load(fin)
            new_database = scraper(USERNAME, PASSWORD)
            if new_database != old_database:
                diff(old_database, new_database)
                with open('database.json', 'w') as fout:
                    json.dump(new_database, fout)
        else:
            new_database = scraper(USERNAME, PASSWORD)
            with open('database.json', 'w') as fout:
                json.dump(new_database, fout)
        time.sleep(60 * 60)
    except Exception as e:
        print e

import scraper

gi = scraper.scraper('instagram')  # source [e.g. "google", "twitter", "instagram"]
gi.set_outputDir('output/')        # output directory on local disk
gi.set_downloadLimit(10)           # specified number of images
gi.scrape("design")                # arg: tag, search query
gi.stop()                          # close the browser once scraping is done

def __init__(self, booksCallback=None):
    self.database = connector.connector(booksCallback)
    self.scraper = scraper.scraper()