def start_requests(self):
    """Yield the initial Skelbiu search request for the configured RE query.

    Sets ``self.i`` to 0, loads the stored query identified by
    ``self.re_query_id`` through ``get_re_query``, turns it into URL
    parameters with ``SkelbiuReQuery(...).generate()`` and yields a single
    ``scrapy.Request`` for the resulting search URL, with ``self.parse`` as
    the callback.

    Nothing is yielded (a warning is printed instead) when the query is not
    found or when ``generate()`` returns ``None``.

    Raises:
        ValueError: if ``self.re_query_id`` is ``None``. Because this is a
            generator, the error surfaces when iteration starts, not when
            the method is called.
    """
    self.i = 0
    if self.re_query_id is None:
        raise ValueError("No re_query_id passed to spider")

    query_from_db = get_re_query(int(self.re_query_id))
    if query_from_db is None:
        # don't scrape
        print("WARNING: Skelbiu: Not scraping (query not found):")
        print(query_from_db)
        return

    re_query = SkelbiuReQuery(query_from_db).generate()
    if re_query is None:
        # don't scrape
        print("WARNING: Skelbiu: Not scraping:")
        print(query_from_db)
        return

    url = "https://skelbiu.lt/skelbimai/?" + urlencode(re_query)
    print("STARTED CRAWLING SKELBIU")
    # A browser-like user agent is sent explicitly with the request.
    yield scrapy.Request(
        url=url,
        callback=self.parse,
        headers={
            "user-agent": (
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
            ),
        },
    )
def __init__(self, old_ads, scraped_ads, query):
    """Compare previously stored ads with freshly scraped ones for one RE query.

    Args:
        old_ads: iterable of ad mappings, each with an ``"id"`` key.
        scraped_ads: iterable of ad mappings, each with an ``"id"`` key.
        query: RE query mapping; its ``"id"`` key is used to re-fetch the
            query with ``get_re_query``.

    Side effects visible in this method: calls ``mark_re_ads_as_deleted``
    with the result of ``self.deleted_re_ads()``, prints progress output,
    and, when the re-fetched query has a truthy ``"was_scraped"`` and
    ``self.ad_changes`` is non-empty, stores the result of
    ``insert_message`` in ``self.msg_id``.
    """
    self.re_query: dict = query
    # Ads indexed by their "id" for lookup during comparison.
    self.old_ads = {}
    self.new_ads = {}
    # Starts empty; presumably filled by compare_re_ads() — TODO confirm.
    self.ad_changes = {}
    for ad in old_ads:
        self.old_ads[ad["id"]] = ad
    for ad in scraped_ads:
        self.new_ads[ad["id"]] = ad
    self.compare_re_ads()
    ads_to_delete = self.deleted_re_ads()
    print("ads_to_ mark as deleted:")
    print(ads_to_delete)
    mark_re_ads_as_deleted(ads_to_delete)
    print("RE_AD_CHANGES:")
    print(self.ad_changes)
    # Re-read the query from storage by id, replacing the one passed in.
    self.re_query = get_re_query(self.re_query["id"])
    # Only message the user when the query was scraped before and something changed.
    if self.re_query["was_scraped"] and len(self.ad_changes) != 0:
        msg = self.generate_message()
        self.msg_id = insert_message(self.re_query["user_id"], "Pasikeitė paieškos rezultatai", msg)
        # NOTE(review): the span between the two string literals below is
        # masked ("******") in this copy of the source, so the statement is
        # not valid Python as shown. Restore the original statements from
        # version control before changing this method.
        print("MESSAGE TO SEND TO THE USER:"******"NO RE AD CHANGES OR RE QUERY IS NEW")
def del_re_query(user_id, query_id):
    """Delete the RE query ``query_id`` on behalf of the authenticated caller.

    Returns:
        ``Response(status=404)`` when ``get_re_query(query_id)`` finds
        nothing; a JSON error body with status 403 when the query's
        ``"user_id"`` differs from the JWT identity's ``"user_id"`` and the
        JWT ``"group"`` is not ``"admin"``; ``Response(status=200)``
        otherwise.
    """
    jwt = get_jwt_identity()
    query = get_re_query(query_id)
    if query is None:
        return Response(status=404)
    # Only the owner of the query or a member of the "admin" group may proceed.
    if query["user_id"] != jwt["user_id"]:
        if jwt["group"] != "admin":
            return jsonify(
                {"error": "You can only access your own resources."}), 403
    # NOTE(review): deletion is keyed on the ``user_id`` argument, which this
    # function never compares with query["user_id"] — confirm that is intended.
    # The scraper-side delete runs only when delete_re_query() returned truthy.
    if delete_re_query(user_id, query_id):
        scraper_interface.delete_re_query(user_id, query_id)
    # NOTE(review): as laid out here, 200 is returned even when
    # delete_re_query() is falsy; confirm whether this return belongs inside
    # the branch above.
    return Response(status=200)
def put_query(user_id, query_id): jwt = get_jwt_identity() query = get_re_query(query_id) if query is None: if (res := validate_resource(user_id)) != True: return res