def getDataFromLast7Dayz(symbol):
    isCreated = dbop.createTable(symbol)
    print "getting data from last 7 days for ", symbol
    msg = "getting data from last 7 days for " + symbol
    Log(msg)
    sc = Scrapper()
    result = sc.equityScrapper(symbol, selected=False, timeout=100)
def scrap_and_upload(vehicle_category):
    """Scrape listings for the given vehicle category and upload the resulting CSV to S3."""
    if vehicle_category is None:
        sys.exit("vehicle category cannot be null")
    vehicles = load_scrapping_links(vehicle_category)
    start_time = datetime.utcnow().strftime("%Y-%m-%d")
    create_directory("tmp")
    create_directory(f"tmp/{vehicle_category}")
    file_path = f"{DIR_NAME}/tmp/{vehicle_category}/{start_time}.csv"
    if os.path.exists(file_path):
        header = None
    else:
        header = ["Make", "Model", "Trim", "Year", "Mileage", "Price"]
    for make, model, urls in vehicles:
        for website_name, link in urls.items():
            if website_name == 'cg':
                urlsuffix = "#resultsPage="
            elif website_name == 'ed':
                urlsuffix = "?pagenumber="
            site_scrapper = Scrapper(website_name, link, urlsuffix, make, model, vehicle_category)
            site_scrapper.fetch_batch(NUM_OF_PAGES)
            if site_scrapper.listings:
                with open(file_path, "a") as csvfile:
                    write(csvfile, site_scrapper.listings, header)
                header = None
    if os.path.exists(file_path):
        s3_client = boto3.client('s3')
        s3_client.upload_file(file_path, DESTINATION_BUCKET, f"{vehicle_category}/{start_time}.csv")
def count(id):
    logger.info(f'Adding task for id: {id}')
    session = Session()
    task = session.query(Tasks).filter_by(id=id).first()
    res = Results(address=task.address, words_count=0, http_status_code=0)
    try:
        scrpr = Scrapper(task.address)
    except Exception:
        scrpr = None
    if scrpr:
        err = scrpr.get_page()
        if not err:
            task.http_status_code, matches = scrpr.count_matches()
            task.task_status = 'FINISHED'
            res = Results(address=task.address, words_count=matches,
                          http_status_code=task.http_status_code)
        else:
            print(err)
    session.add(res)
    session.commit()
    logger.info(task)
    logger.info(res)
def aule():
    aula = request.args.get('aula')
    settimanaDopo = request.args.get('settimanaDopo')
    # Convert the query-string value to a boolean
    if settimanaDopo == 'True':
        settimanaDopo = True
    else:
        settimanaDopo = False
    scrapper = Scrapper()
    dati = scrapper.cerca_orario_aule(aula, settimanaDopo)
    if dati is None:
        return "SETTIMANA DI VACANZA"
    ris = "Aula " + aula + "<br>"
    for giorni in dati:
        for giorno in giorni.values():
            if isinstance(giorno, str):
                ris += giorno + " "
            else:
                for materie in giorno:
                    for materia in materie.values():
                        if isinstance(materia, str):
                            ris += materia + " "
                        else:
                            for classe in materia:
                                ris += classe + " "
                ris += "<br>"
        ris += "<br>"
    return ris
class Main:
    def __init__(self):
        ap = argparse.ArgumentParser()
        ap.add_argument("-train", "--train", required=True,
                        help="whether to train a model or not")
        self.args = vars(ap.parse_args())
        self.scrapper = Scrapper()
        # self.dataManager = DataManager()
        self.filterImage = FilterImage()
        self.faceRecognition = FaceRecognition()
        self.emotionDetection = EmotionDetection()
        self.model = Model()

    def run(self):
        self.scrapper.scrape()
        # self.dataManager.manage()
        roi_face, face_image = self.faceRecognition.recognizeFace()
        self.filterImage.blurImage(roi_face)
        self.faceRecognition.checkFaceRatio()
        self.emotionDetection.detect()
        self.faceRecognition.recognizeMouth()
        if self.args['train'] == '1':
            save_weights_name = input("Enter the name for weights: ")
            self.model.train(save_weights_name, save_weights='TRUE')
        self.model.load_weights(config.WEIGHT_NAME)
        self.model.predict()
def extract_acts():
    scrapper = Scrapper(constants.base_url)
    # When the URL is requested without data, the search form is retrieved.
    home_page = scrapper.request({})
    acts_scrapper = ActsParser(home_page)
    acts_scrapper.parse()
    scrapper.save_data(acts_scrapper.acts, "acts.json")
def olx_bot():
    scrapper = Scrapper()
    if Scrapper.isExecution:
        print('O Programa já está sendo executado')
    else:
        scrapper.start()
        print('O Programa está sendo iniciado')
def olx_bot():
    scrapper = Scrapper()
    if Scrapper.isExecution:
        return render_template('running.html')
    else:
        scrapper.start()
        return render_template('sucess.html')
def fetch_data(url, callback):
    try:
        r = yield gen.Task(http_client.fetch, url)
        print "done"
        callback(Scrapper.Blog(url=url, content=r.body[:100]))
    except Exception:
        print "Something went wrong"
        callback(Scrapper.Blog())
def run(self):
    scrapper = Scrapper()
    linklist = scrapper.loadLink(self.rooturl)
    dbr = DB()
    dbr.rawlinks_save(linklist)
def run(self):
    scrapper = Scrapper()
    global folder_path
    global test_df
    test_df = generate_test_data(self.username, self.threshold)
    folder_path = scrapper.dowload_data(self.username, self.threshold)  # user_account="skyemcalpine"
    folder_path = folder_path.replace("\\", "/")
    print(folder_path)
    self.signals.result.emit(True)
def initialise_stats(self):
    if self.has_scrapper_links and self.has_valid_predictions:
        self.predictions = Prediction().initialise_prediction().get_all_prediction()
        self.driver_standings = Scrapper().initialise_links().scrape_driver()
        self.team_standings = Scrapper().initialise_links().scrape_constructor()
        return self
    else:
        print("Links and predictions not initialised properly")
        return self
def get_table_info(self):
    details_movie = None
    try:
        if self.download_url is not None:
            self.sc = Scrapper(self.download_url)
            details_movie = self.sc.get_movie_details()
    except Exception as e:
        print("Error initializing the Scrapper: " + str(e))
    if details_movie is not None:
        return details_movie
def main():
    """
    Instantiate the Requester and Scrapper classes, perform an initial
    request, then hand the response over to the scrapper.
    """
    requester = Requester()
    scrapper = Scrapper(requester)
    requested_response = requester.html_requester(constants.URL)
    category_list = scrapper.get_category_list(requested_response)
    scrapper.scrap_books_in_category(category_list, scrapper)
def test(self, username, threshold):
    scrapper = Scrapper()
    folder_path = scrapper.dowload_data(username, threshold)
    dataProcessor = DataProcessor(folder_path)
    data = dataProcessor.create_dataframe_input()
    # print(data)
    class_names = ['food and drink', 'entertainment', 'business and industry',
                   'family and relationships', 'fitness and wellness',
                   'hobbies and activities', 'shopping and fashion',
                   'sports and outdoors', 'technology']
    model_path = "./last_cnn_model.h5"
    cnnModel = CnnModel(class_names, model_path, data)
    model = cnnModel.load_model()
    test_generator = cnnModel.create_generator()
    prediction = cnnModel.getPrediction(model, test_generator)
    result = np.sum(prediction, axis=0)
    result *= (1 / len(prediction))
    return result
def create_app():
    app = Flask(__name__)
    CORS(app)

    from blueprints import npcs_blueprint
    from blueprints import gears_blueprint
    from blueprints import runes_blueprint
    from blueprints import biomes_blueprint
    from blueprints import bosses_blueprint
    from blueprints import outfits_blueprint
    from blueprints import pickups_blueprint
    from blueprints import enemies_blueprint
    from blueprints import mutations_blueprint
    from blueprints import achievements_blueprint

    app.register_blueprint(npcs_blueprint.bp)
    app.register_blueprint(gears_blueprint.bp)
    app.register_blueprint(runes_blueprint.bp)
    app.register_blueprint(biomes_blueprint.bp)
    app.register_blueprint(bosses_blueprint.bp)
    app.register_blueprint(outfits_blueprint.bp)
    app.register_blueprint(pickups_blueprint.bp)
    app.register_blueprint(enemies_blueprint.bp)
    app.register_blueprint(mutations_blueprint.bp)
    app.register_blueprint(achievements_blueprint.bp)

    app.scrapper_manager = Scrapper()

    @app.errorhandler(404)
    def route_not_found(error):
        app.logger.error(error)
        return 'Route not found.', 404

    return app
def scrapeURL():
    data = request.json
    url = data['url']
    response = dict()
    scrapper = None
    if urlExists(url, timeout=20, check_is_image=False):
        if isInCustomSites(url):
            scrapper = CustomScrapper()
            response['custom'] = True
        else:
            scrapper = Scrapper()
            response['custom'] = False
        image_or_data_urls = scrapper.scrape(url)
        if len(image_or_data_urls) > 0:
            response['success'] = True
            response['output'] = image_or_data_urls
            response['stats'] = scrapper.stats
        else:
            response['success'] = False
            response['output'] = "NO_IMAGES_FOUND"
    else:
        response['success'] = False
        response['output'] = "INVALID_URL"
    return response
def start_scraper():
    global SELENIUM
    global FILE_NAME
    global TEST
    kwargs = {
        'selenium': SELENIUM,
        'url': FILE_NAME,
        'test': TEST,
        'skip_after': 0,
        'skip_before': 0,
        'export': 'json'
    }
    if not TEST:
        print('test not enabled...')
        Scrapper(**kwargs).crawl()
    else:
        Scrapper.test()
def get():
    register_no = request.args.get('register_no')
    dob = request.args.get('dob')
    if register_no is None or dob is None:
        resp = make_response(json.dumps({'error': 'Request parameters are not in correct format.'}))
    else:
        if not check_regno(register_no) and not check_dob(dob):
            resp = make_response(json.dumps({'error': 'Invalid Register Number and Date of Birth.'}))
        elif not check_regno(register_no):
            resp = make_response(json.dumps({'error': 'Invalid Register Number.'}))
        elif not check_dob(dob):
            resp = make_response(json.dumps({'error': "Date of Birth is invalid."}))
        else:
            s = Scrapper(register_no, dob)
            json_data = s.get_json()
            resp = make_response(json_data)
    resp.mimetype = 'application/json'
    return resp
def check_prices():
    users = session.query(User).all()
    scrapper = Scrapper()
    items = session.query(Item).all()
    for item in items:
        scrapper.go_to(item.link)
        price = scrapper.get_price()
        title = scrapper.get_title()
        if not item.title:
            item.title = title
            session.commit()
        if item.price:
            change_percentage = (abs(price - item.price) / item.price) * 100.0
            if change_percentage >= 3:
                item.price = price
                session.commit()
                markup = InlineKeyboardMarkup(
                    [InlineKeyboardButton('Check', url=item.link)])
                for u in users:
                    try:
                        bot.send_message(
                            u.tg_id,
                            '<code>{}</code> price changed'.format(title),
                            parse_mode=ParseMode.HTML,
                            reply_markup=markup)
                    except Exception as e:
                        config.logger.error(
                            'Error sending a message: {}'.format(e))
        else:
            item.price = price
            session.commit()
def main():
    scrapper = Scrapper()
    merger = Merger()
    parser = Parser()
    client = MongoClient('localhost', 27017)
    db = client['Data']
    collection_socialmedia = db['socialmedia']
    # Begin real-time collecting
    while True:
        scrapper.scrap()
        merger.main()
        parser.main()
        sleep(3600)
        # Store the collected data in MongoDB
        f = open('/home/sartharion/Bureau/stage/POO/data.json', 'r')
        file_data = json.load(f)
        collection_socialmedia.delete_many({})
        collection_socialmedia.insert_many(file_data)
    client.close()
def communicate():
    transport = THTTPTornadoTransport()
    pfactory = TJSONProtocol.TJSONProtocolFactory()
    client = Scrapper.Client(transport, pfactory)
    futures = [client.scrape('http://google.com/') for i in xrange(100)]
    try:
        yield futures
    except Exception as e:
        print e
    io_loop.stop()
def book_download(query, book_name):
    data = Scrapper(query).parse_data()
    # Find the requested book in the scraped data and resolve its direct download link.
    try:
        book = list(
            filter(lambda book: book['Book']['title'] == book_name, data))[0]
        direct_dl = DownloadFetcher(book).get_direct_download()
        return jsonify({'book': book, 'download': direct_dl}), 200
    except Exception as e:
        print(e)
        print(book_name)
        return f"Error: specified book name not found for query = {query}", 404
def getDataFromLastYears(symbol, historic=False):
    # get the year from which we have to start scraping.
    year = int(props.get("startYear"))
    # check for company's db or create it if not created already.
    isCreated = dbop.createTable(symbol, historic)
    # this loop code will form dates and scrape data from the startYear say 2000
    # till last year's December say 2017.
    while year < currentYear:
        startMonth = 1
        endMonth = startMonth + 1
        while endMonth < 13:
            if not historic:
                result = formDateAndCallScrapper(startMonth, endMonth, year)
            else:
                result = formDateAndCallScrapper(startMonth, endMonth, year, historic=True)
            startMonth = endMonth + 1
            endMonth = startMonth + 1
        year += 1
    startDay = 1
    startMonth = 1
    endMonth = startMonth + 1
    limitMonth = int(datetime.datetime.now().strftime("%m"))  # Current month
    # now this loop is for the last slot of month/months which couldn't form 2 months pack.
    while endMonth < limitMonth:
        if not historic:
            result = formDateAndCallScrapper(startMonth, endMonth, year)
        else:
            result = formDateAndCallScrapper(startMonth, endMonth, year, historic=True)
        startMonth = endMonth + 1
        endMonth = startMonth + 1
    if limitMonth - startMonth == 0 or limitMonth - startMonth == 1:
        startDate = "0" + str(startDay) + "-0" + str(startMonth) + "-" + str(year)
        endDate = str(datetime.datetime.now().strftime("%d-%m-%Y"))
        print "start - ", startDate, " to end - ", endDate
        msg = "start - " + startDate + " to end - " + endDate
        Log(msg)
        if not historic:
            sc = Scrapper()
            result = sc.equityScrapper(symbol, startDate, endDate, selected=True, timeout=100)
        else:
            sc = Scrapper(historic=True)
            result = sc.historicScrapper(startDate, endDate)
def create_recipe():
    json_data = request.get_json()
    url = json_data.get('url')
    type_recipe = json_data.get('typeRecipe')
    print(f"Creating entry '{type_recipe}' for url: '{url}'")
    if type_recipe is None:
        raise ValueError("typeRecipe is empty")
    if url is None:
        raise ValueError("URL is empty")
    recipe = mongo.add_recipe(
        Scrapper(url=url, type_recipe=type_recipe).scrap())
    return {'success': True, 'recipe': recipe}
def Scrap():
    product = request.args.get('product')
    maxpages = request.args.get('max')
    website = request.args.get('website')
    if not maxpages:
        maxpages = 2
    print(product, maxpages)
    scrap = Scrapper()
    scrapped_data, csvfile = scrap.start(product, max=maxpages, website=website)
    record = Record(product=product,
                    created=datetime.today().strftime('%d_%m_%Y'),
                    pages=maxpages,
                    data=csvfile.split('/')[-1],
                    user=session.get('user'))
    db.session.add(record)
    db.session.commit()
    return jsonify(scrapped_data)
def communicate():
    transport = TAMQPTornadoTransport()
    pfactory = TJSONProtocol.TJSONProtocolFactory()
    client = Scrapper.Client(transport, pfactory)
    yield gen.Task(transport.open)
    futures = [client.scrape('http://google.com/') for i in xrange(100)]
    yield futures
    client._transport.close()
    io_loop.stop()
def check_and_scrap_reviews(self, hotel_name, platforms):
    for platform in platforms:
        if platform == 'TA':
            data = self.read_csv_to_list(
                "C:/Users/acfelk/Documents/IIT_Files/final year/FYP/fyp_workfiles/final_project/backend/drops/"
                + hotel_name + "-tripadvisor.csv")
            if data is None:
                # NOW CALL THE SCRAPPER TO SCRAP REVIEWS TO drops
                scrapper = Scrapper()
                scrapper.scrap_reviews(hotel_name, platform)
        if platform == 'BC':
            data = self.read_csv_to_list(
                "C:/Users/acfelk/Documents/IIT_Files/final year/FYP/fyp_workfiles/final_project/backend/drops/"
                + hotel_name + "-bookingscom.csv")
            if data is None:
                # NOW CALL THE SCRAPPER TO SCRAP REVIEWS TO drops
                scrapper = Scrapper()
                scrapper.scrap_reviews(hotel_name, platform)
def main(args):
    username = args.username
    cid = os.environ['SPOTIPY_CLIENT_ID']
    secret = os.environ['SPOTIPY_CLIENT_SECRET']
    redirect_uri = os.environ['SPOTIPY_REDIRECT_URI']
    content = args.content
    spoti = TrackExtractor(username, cid, secret, redirect_uri)
    sc = Scrapper()
    if content == 'all':
        ret_tracks, _, _ = spoti.all_tracks()
    elif content == 'playlists':
        ret_tracks, _, _ = spoti.tracks_in_all_playlists()
    elif content == 'saved_tracks':
        ret_tracks, _, _ = spoti.saved_tracks()
    else:
        print("Invalid content filter! Please enter one of ['all', 'playlists', 'saved_tracks']")
def formDateAndCallScrapper(startMonth, endMonth, year, historic=False):
    dates = dt.dateCreator(startMonth, endMonth, year)
    print "start - ", dates[0], " to end - ", dates[1]
    msg = "start - " + dates[0] + " to end - " + dates[1]
    Log(msg)
    if not historic:
        sc = Scrapper()
        return sc.equityScrapper(symbol, dates[0], dates[1], selected=True, timeout=100)
    else:
        sc = Scrapper(historic=True)
        return sc.historicScrapper(dates[0], dates[1])
bay=["33.52694833905606,44.61786288710962","33.52779437548921,44.6187406119569","33.52844095177134,44.61952376688691","33.52912847311098,44.6203916749758","33.52941621924846,44.62153677469976","33.52955130148957,44.62233144707166","33.5300525893401,44.6227438007253","33.53074849762842,44.62332258304048","33.53171789286472,44.62402774084964","33.5324194892887,44.62470767274025","33.53284911288601,44.62509717000224", "33.53319327158253,44.62545051188663", "33.53388568626423,44.6257351767263", "33.53436842166227,44.62606452934324", "33.53505441166816,44.62632008064779", "33.5353801786637,44.62659249116985", "33.53630376612985,44.62625791333705", "33.53648828981084,44.62599170149596", "33.53610642979442,44.62526522473693", "33.5357881089882,44.6248585430718", "33.53500799910345,44.62426344134919", "33.53465157533417,44.62368184867986", "33.53431369292202,44.62296420957523", "33.53382395865862,44.62231786828408", "33.53333558808044,44.6216463315828", "33.53246351873157,44.62075912758933", "33.53167225920291,44.61991668370528", "33.53115295169879,44.61933457914446", "33.53057732703983,44.61855221942759", "33.53004247447537,44.61794741478788", "33.52952835400872,44.61738465609927", "33.52893986428256,44.61685023094336", "33.52844685348632,44.61674350283417", "33.52770715079468,44.61675678646416","33.52693507427308,44.61711744970111","33.52694833905606,44.61786288710962"] Bay = [] for coord in bay : lon, lat = coord.split(",") Bay.append(Coordinates(float(lat), float(lon))) Field = Area(Bay) Field.__start_point__() caters = ["MOLODIZGNIY", "ADMIRAL LAZAREV", "SATURN", "ADMIRAL ISTOMIN", "V ADMIRAL KLOKACHEV", "NORD"] Caters = [] for c in caters: Caters.append(Ship(c)) Scrap = Scrapper() for cater in Caters : res = Scrap.scrape_ship(cater.name) if res == None: print cater.name, "Not found" else: v, cors, point = res print cater.name, "\t---\t", Field.__das_is_in__(point) print point print #print #for x in Field.points: # print x.latitude, x.longitude
def OptimizeChromosome(self, chromosome):
    chromosomeQuality = dict()
    for gene in chromosome:
        chromosomeQuality[gene] = self.CalculateCachedLinkQuality(gene)
    return sorted(chromosomeQuality, key=chromosomeQuality.get, reverse=True)

def Plot(self):
    raise NotImplementedError

def Display(self):
    raise NotImplementedError


gsa = GSA()
sc = Scrapper(str(input("Enter search query: ")), 20)
urls = set(sc.getLinks())
urlDict = dict()
for index, url in enumerate(urls):
    urlDict[index] = url
gsa.genes = urlDict.keys()
for key, value in urlDict.iteritems():
    features = sc.getFeatures(value)
    gsa.bounceRate[key] = features[0]
    gsa.pageView[key] = features[1]
    gsa.time[key] = features[2]
    gsa.searchVisit[key] = features[3]
    gsa.linkIn[key] = features[4]
def result(self):
    self.lblResult.setText("")
    data = Scrapper(self.date.year(), self.date.month(), self.date.day())
    self.lblResult.setText(str(data.getPrice()))
def update_from_ais(self):
    scrapper = Scrapper()
    data = scrapper.scrape_ship(self.name)
    return self.update(data)
import pickle
import sys
from functools import wraps


def retry(func):
    @wraps(func)
    def decorated(*args):
        result = func(*args)
        while not result:
            print("Retrying..")
            result = func(*args, retry=True)
        return result
    return decorated


if __name__ == "__main__":
    scrapper = Scrapper()
    nodes = set()
    nodes_sets = [pickle.load(open(filename)) for filename in sys.argv[1:]]
    nodes_info_filename = "all_nodes_info.pickle"
    try:
        nodes_info = pickle.load(open(nodes_info_filename))
        processed = set(nodes_info.keys())
    except IOError:
        nodes_info = {}
        processed = set()
    for nodes_set in nodes_sets:
        for number, node in enumerate(nodes_set - processed):
            nodes_info[node] = scrapper.get_node_info(node)
            processed.add(node)