def new_links_update():
    """Match newly parsed flats against user filters and queue the hits.

    Does nothing when ``GlobalParser.flats_ids`` is empty. Otherwise the
    de-duplicated ids are looked up in the flats collection (only documents
    that carry ``location.metro``), each price is normalised through
    ``GlobalParser.fix_price``, and for every user document that has both
    ``max_price`` and ``metro_stations`` a message
    ``{'uid': ..., 'offers': [flat ids]}`` is handed to
    ``GlobalParser.offers_send_function`` if at least one flat matches.
    """
    if not len(GlobalParser.flats_ids):
        return

    logger.info("Filtering offers and sending them to queue")

    # Collapse duplicate ids before querying the database.
    unique_ids = set(GlobalParser.flats_ids)

    flats_query = {
        'id': {'$in': list(unique_ids)},
        'location.metro': {'$exists': True},
    }
    flats_projection = {'id': 1, 'location.metro.name': 1, 'price': 1}
    candidates = list(
        Databases.get_flats_db().find(flats_query, flats_projection))

    for candidate in candidates:
        candidate['price'] = GlobalParser.fix_price(candidate)

    # Only users that have configured both filters take part.
    users_query = {
        'max_price': {'$exists': True},
        'metro_stations': {'$exists': True},
    }
    users_projection = {'id': 1, 'max_price': 1, 'metro_stations': 1}
    subscribers = Databases.get_users_db().find(users_query, users_projection)

    for subscriber in subscribers:
        uid = subscriber['id']
        price_cap = subscriber['max_price']
        wanted_stations = set(subscriber['metro_stations'])

        # A flat matches when its lower-cased station name is one of the
        # user's stations and its price does not exceed the user's cap.
        # NOTE(review): presumably users store station names lower-cased;
        # confirm against the code that writes 'metro_stations'.
        matches = [
            candidate for candidate in candidates
            if candidate['location']['metro']['name'].lower() in wanted_stations
            and candidate['price'] <= price_cap
        ]
        if not matches:
            continue

        payload = {
            'uid': uid,
            'offers': [match['id'] for match in matches],
        }
        logger.debug("Sending {} offers to user {}".format(len(matches), uid))
        GlobalParser.offers_send_function(payload)
def check_suspicious():
    """Flag the cheapest flats of every metro station as suspicious.

    Loads all flats that have ``location.metro`` and whose
    ``seen_by_suspicious_validator`` is ``False``, normalises their prices
    with ``SuspiciousChecker.fix_price`` and groups them by lower-cased
    station name. In each group the
    ``ceil(len(group) * config.suspicious_fraction)`` cheapest flats are
    selected. All loaded flats are then marked
    ``seen_by_suspicious_validator = True`` and the selected ones
    ``suspicious = True``.
    """
    logger.info("Checking suspicious")
    db = Databases.get_flats_db()

    logger.debug("Getting target flats")
    target_flats = list(db.find(
        {'seen_by_suspicious_validator': False,
         'location.metro': {'$exists': True}},
        {'id': 1, 'price': 1, 'location.metro.name': 1}))
    logger.debug("Got {}".format(len(target_flats)))
    if not target_flats:
        # Nothing to validate: avoid two no-op round-trips to the database.
        return

    # Group in a single pass instead of rescanning every flat per station.
    flats_by_station = {}
    for flat in target_flats:
        flat['price'] = SuspiciousChecker.fix_price(flat)
        flat['metro'] = flat['location']['metro']['name']
        del flat['location']
        flats_by_station.setdefault(flat['metro'].lower(), []).append(flat)
    logger.debug("Fixed names")

    logger.debug("Getting suspicious flats")
    suspicious_flats = []
    for station_flats in flats_by_station.values():
        suspicious_num = math.ceil(
            len(station_flats) * config.suspicious_fraction)
        # list.sort is stable, so equally priced flats keep query order.
        station_flats.sort(key=lambda x: x['price'])
        suspicious_flats.extend(station_flats[:suspicious_num])

    target_flats_ids = [flat['id'] for flat in target_flats]
    suspicious_flats_ids = [flat['id'] for flat in suspicious_flats]

    logger.debug("Pushing to DB")
    db.update_many({'id': {'$in': target_flats_ids}},
                   {'$set': {'seen_by_suspicious_validator': True}})
    if suspicious_flats_ids:
        db.update_many({'id': {'$in': suspicious_flats_ids}},
                       {'$set': {'suspicious': True}})
def get_new_offers(url, time=config.cian_default_timeout):
    """Parse offers from *url*, store each distinct one and yield it.

    Offers come from ``get_offers(url, time)``. The first offer seen for a
    given ``id`` gets ``seen_by_suspicious_validator`` and ``suspicious``
    set to ``False``, is upserted into the flats collection (replacing any
    document with the same ``id``) and is yielded. Later offers with an
    already seen ``id`` are not stored or yielded; an error is logged if
    their content differs from the first occurrence.

    Args:
        url: search URL passed through to ``get_offers``.
        time: second argument passed through to ``get_offers``.

    Yields:
        The offer dict, including the two validator flags.
    """
    db = Databases.get_flats_db()
    # id -> snapshot of the offer exactly as parsed. The snapshot is taken
    # before the validator flags are added; comparing a fresh duplicate with
    # the flagged dict would always report a difference.
    parsed_by_id = {}
    for offer in get_offers(url, time):
        offer_id = offer['id']
        if offer_id in parsed_by_id:
            first_seen = parsed_by_id[offer_id]
            if first_seen != offer:
                logger.error(
                    "Different dicts: {}\n{}".format(offer, first_seen))
            continue

        parsed_by_id[offer_id] = offer.copy()
        offer['seen_by_suspicious_validator'] = False
        offer['suspicious'] = False
        db.find_one_and_replace({'id': offer_id}, offer, upsert=True)
        yield offer
    logger.info("Totally parsed {} real offers.".format(len(parsed_by_id)))
class InvitesManager:
    """Static helpers for creating, listing and consuming invite codes.

    Every invite is stored as a ``{'id': <code>}`` document in the
    collection returned by ``Databases.get_invites_db()``.
    """

    # Collection handle, resolved once when the class body is executed.
    db = Databases.get_invites_db()

    @staticmethod
    def generate_random_str(n):
        """Return *n* random characters drawn from ``A-Z0-9``.

        Uses ``random.SystemRandom`` as the source of randomness.
        """
        rng = random.SystemRandom()
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(rng.choice(alphabet) for _ in range(n))

    @staticmethod
    def pull_invite(invite):
        """Consume the invite code *invite*.

        Returns:
            ``True`` if a matching invite existed and was removed, ``False``
            otherwise.
        """
        # A single find-and-delete keeps lookup and removal in one operation,
        # so the same code cannot be found by two callers before it is gone.
        removed = InvitesManager.db.find_one_and_delete({'id': invite})
        return removed is not None

    @staticmethod
    def insert_invite(invite):
        """Store one invite code."""
        InvitesManager.db.insert_one({'id': invite})

    @staticmethod
    def insert_random_invites(count, len):
        """Store *count* freshly generated codes of length *len*.

        The parameter name ``len`` shadows the builtin inside this method;
        it is kept so that existing keyword callers continue to work.
        """
        for _ in range(count):
            InvitesManager.insert_invite(
                InvitesManager.generate_random_str(len))

    @staticmethod
    def insert_many_invites(invites):
        """Store every code from the iterable *invites* in one bulk insert."""
        InvitesManager.db.insert_many([{'id': invite} for invite in invites])

    @staticmethod
    def get_invites():
        """Return a cursor over all stored invite documents."""
        return InvitesManager.db.find()

    @staticmethod
    def get_invite():
        """Return one stored invite document, or ``None`` if there is none."""
        return InvitesManager.db.find_one()

    @staticmethod
    def get_invites_list(count=config.default_invites_count):
        """Return up to *count* invite codes, generating any that are missing.

        If fewer than *count* invites are stored, exactly the missing number
        of random codes (``config.invite_length`` characters each) is
        inserted first.
        """
        invites = [invite['id'] for invite in InvitesManager.get_invites()]
        # Top up relative to the requested count, not the configured default.
        missing = count - len(invites)
        if missing > 0:
            InvitesManager.insert_random_invites(missing, config.invite_length)
            invites = [invite['id'] for invite in InvitesManager.get_invites()]
        return invites[:count]
# Определяем по ним число страниц if num_of_offers == 0: return raw_offers = get_raw_offers(page_bs) yield from (parse_raw_offer(offer) for offer in raw_offers) num_of_offers -= len(raw_offers) i = 2 while num_of_offers > 0: url = change_params(raw_url, totime=url_time, p=i) raw_offers = get_raw_offers(safe_request(url)) logger.debug("Parsing {} page".format(i)) yield from (parse_raw_offer(offer) for offer in raw_offers) num_of_offers -= len(raw_offers) i += 1 if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description='CIAN parser by URL') parser.add_argument('url', type=str, help='URL to parse') parser.add_argument('-t', '--time', type=int, help='Set time of last parsing', default=360000000000000000000) args = parser.parse_args() db = Databases.get_flats_db() for info, info_id in get_offers(args.url, args.time): write_to_database(info_id, info, db)
def init():
    """Bind the users collection to ``User.db``.

    The collection is obtained from ``Databases.get_users_db()``.
    """
    users_collection = Databases.get_users_db()
    User.db = users_collection
# Получаем число предложений num_of_offers = get_count_of_offers(page_bs) logger.debug("Parsing {} offers".format(num_of_offers)) # Определяем по ним число страниц if num_of_offers == 0: return raw_offers = get_raw_offers(page_bs) yield from (parse_raw_offer(offer) for offer in raw_offers) num_of_offers -= len(raw_offers) i = 2 while num_of_offers > 0: url = change_params(raw_url, totime=url_time, p=i) raw_offers = get_raw_offers(safe_request(url)) logger.debug("Parsing {} page".format(i)) yield from (parse_raw_offer(offer) for offer in raw_offers) num_of_offers -= len(raw_offers) i += 1 if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description='CIAN parser by URL') parser.add_argument('url', type=str, help='URL to parse') parser.add_argument('-t', '--time', type=int, help='Set time of last parsing', default=360000000000000000000) args = parser.parse_args() db = Databases.get_flats_db() for info, info_id in get_offers(args.url, args.time): write_to_database(info_id, info, db)