def modify_tt_time(self, uid: str, amount: str):
    """
    Shifts every time stored on the TT with the specified uid by a signed duration.

    Nothing happens when no TT with that uid is stored.

    :param uid: uid of the TT whose times are shifted.
    :param amount: duration string handed to isodate.parse_duration; a leading '-' shifts the times earlier.
    """
    doc_id = generate_id_from_uid(uid)
    if not self.db.contains(doc_id=doc_id):
        return

    # Only the ``seconds`` attribute of the parsed duration is used for the offset.
    if amount[0] == '-':
        offset = -int(isodate.parse_duration(amount[1:]).seconds)
    else:
        offset = int(isodate.parse_duration(amount).seconds)

    def shifted(time_value):
        # Round-trip through seconds so the offset can be applied arithmetically.
        return self.convert_sec_to_time(
            self.convert_time_to_secs(time_value) + offset)

    tt = self.db.get(doc_id=doc_id)
    tt['origin_time'] = shifted(tt['origin_time'])
    tt['destination_time'] = shifted(tt['destination_time'])
    # The first four characters of the description are swapped for the new
    # origin time (assumes the description starts with a four-character
    # time - TODO confirm).
    tt['description'] = tt['origin_time'] + tt['description'][4:]
    if 'entry_time' in tt:
        tt['entry_time'] = shifted(tt['entry_time'])

    for loc in tt['locations']:
        for time_key in ('dep', 'arr'):
            if time_key in loc:
                loc[time_key] = shifted(loc[time_key])

    # Persist by replacing the stored document under the same id.
    self.db.remove(doc_ids=[doc_id])
    self.db.insert(table.Document(tt, doc_id=doc_id))
def add_tt_if_not_present(self, tt: dict):
    """
    Inserts the TT into the TT DB unless one with the same uid is already stored.

    :param tt: json TT to insert; its 'uid' entry determines the document id.
    """
    doc_id = generate_id_from_uid(tt['uid'])
    if self.db.contains(doc_id=doc_id):
        # An existing record is left untouched.
        return
    self.db.insert(table.Document(tt, doc_id=doc_id))
def add_seed_groups(self, seed_groups: list):
    """
    Stores the seed groups in the main header DB, replacing any stored previously.

    :param seed_groups: list of seed groups to store.
    """
    seed_groups_doc_id = 3  # fixed document id used for the seed groups record
    already_stored = self.db.contains(doc_id=seed_groups_doc_id)
    if already_stored:
        self.db.remove(doc_ids=[seed_groups_doc_id])
    record = table.Document({'seed_groups': seed_groups},
                            doc_id=seed_groups_doc_id)
    self.db.insert(record)
def add_categories_map(self, cat_map: dict):
    """
    Stores the map of xml train categories in the main header DB, replacing one stored previously.

    :param cat_map: map of train categories to store.
    """
    categories_doc_id = 2  # fixed document id used for the categories map record
    already_stored = self.db.contains(doc_id=categories_doc_id)
    if already_stored:
        self.db.remove(doc_ids=[categories_doc_id])
    record = table.Document({'categories_map': cat_map},
                            doc_id=categories_doc_id)
    self.db.insert(record)
def add_header(self, header: dict):
    """
    Stores the TT header in the main header DB, replacing one stored previously.

    :param header: Header to store.
    """
    header_doc_id = 1  # fixed document id used for the header record
    already_stored = self.db.contains(doc_id=header_doc_id)
    if already_stored:
        self.db.remove(doc_ids=[header_doc_id])
    record = table.Document(header, doc_id=header_doc_id)
    self.db.insert(record)
def add_tt(self, tt: dict):
    """
    Stores the TT in the TT DB, replacing any TT already stored under the same uid.

    :param tt: json TT to store; its 'uid' entry determines the document id.
    """
    doc_id = generate_id_from_uid(tt['uid'])
    already_stored = self.db.contains(doc_id=doc_id)
    if already_stored:
        # Drop the old record first so the insert below can reuse the id.
        self.db.remove(doc_ids=[doc_id])
    record = table.Document(tt, doc_id=doc_id)
    self.db.insert(record)
def add_rule_if_not_present(self, rule: dict):
    """
    Inserts the Rule into the Rules DB unless one with the same id is already stored.

    :param rule: Rule to insert; its document id comes from generate_rule_id.
    """
    doc_id = generate_rule_id(rule)
    if self.db.contains(doc_id=doc_id):
        # An existing rule is left untouched.
        return
    self.db.insert(table.Document(rule, doc_id=doc_id))
def add_rule(self, rule: dict):
    """
    Stores the Rule in the Rules DB, replacing any Rule already stored under the same id.

    :param rule: Rule to store; its document id comes from generate_rule_id.
    """
    doc_id = generate_rule_id(rule)
    already_stored = self.db.contains(doc_id=doc_id)
    if already_stored:
        # Drop the old record first so the insert below can reuse the id.
        self.db.remove(doc_ids=[doc_id])
    record = table.Document(rule, doc_id=doc_id)
    self.db.insert(record)
def update_destination_for_uids(self, uids: list, destination: str):
    """
    Sets a new destination name on every stored TT whose uid is listed.

    The description is rebuilt from the text before the first '- ' in the
    current description, followed by '- ' and the new destination. uids with
    no stored TT are skipped.

    :param uids: uids of the TTs to update.
    :param destination: new destination name.
    """
    separator = '- '
    for uid in uids:
        doc_id = generate_id_from_uid(uid)
        if not self.db.contains(doc_id=doc_id):
            continue

        tt = self.db.get(doc_id=doc_id)
        tt['destination_name'] = destination

        # split() drops the separator, so it is put back explicitly; without
        # it the description would lose its dash and update_origin_for_uids,
        # which splits the description on '-', could no longer process it.
        # NOTE(review): everything after the separator (e.g. a trailing
        # category) is still dropped, as before - confirm that is intended.
        origin_part = tt['description'].split(separator)[0]
        tt['description'] = origin_part + separator + destination

        # Persist by replacing the stored document under the same id.
        self.db.remove(doc_ids=[doc_id])
        self.db.insert(table.Document(tt, doc_id=doc_id))
def update_location_for_uids(self, uids: list, location_to_update: str, keys_to_update: dict):
    """
    Applies a set of key/value overrides to matching locations of every stored TT whose uid is listed.

    A location matches when location_to_update is contained in its 'location'
    entry. uids with no stored TT are skipped; a stored TT is rewritten even
    when none of its locations match.

    :param uids: uids of the TTs to update.
    :param location_to_update: value looked for inside each location's 'location' entry.
    :param keys_to_update: overrides to apply; keys are converted to str before being set.
    """
    for uid in uids:
        doc_id = generate_id_from_uid(uid)
        if not self.db.contains(doc_id=doc_id):
            continue

        tt = self.db.get(doc_id=doc_id)
        for loc in tt['locations']:
            if location_to_update not in loc['location']:
                continue
            for key, value in keys_to_update.items():
                loc[str(key)] = value

        # Persist by replacing the stored document under the same id.
        self.db.remove(doc_ids=[doc_id])
        self.db.insert(table.Document(tt, doc_id=doc_id))
def update_category_for_uids(self, uids: list, category: str):
    """
    Sets a new category on every stored TT whose uid is listed and rebuilds its description.

    The description becomes '<origin_time> <origin_name> - <destination_name> <category>'.
    uids with no stored TT are skipped.

    :param uids: uids of the TTs to update.
    :param category: new category.
    """
    description_template = '{} {} - {} {}'
    for uid in uids:
        doc_id = generate_id_from_uid(uid)
        if not self.db.contains(doc_id=doc_id):
            continue

        tt = self.db.get(doc_id=doc_id)
        tt['category'] = category
        tt['description'] = description_template.format(
            tt['origin_time'],
            tt['origin_name'],
            tt['destination_name'],
            tt['category'],
        )

        # Persist by replacing the stored document under the same id.
        self.db.remove(doc_ids=[doc_id])
        self.db.insert(table.Document(tt, doc_id=doc_id))
def put_tt_by_uid(self, uid: str, tt: dict) -> bool:
    """
    Replaces the stored TT that has the specified uid.

    :param uid: uid of the TT to replace.
    :param tt: TT to store in its place.
    :return: True if a TT with that uid was stored and has been replaced, False if there was none.
    """
    doc_id = generate_id_from_uid(uid)
    if not self.db.contains(doc_id=doc_id):
        # Nothing to overwrite; new records are never created here.
        return False

    self.db.remove(doc_ids=[doc_id])
    self.db.insert(table.Document(tt, doc_id=doc_id))
    return True
def update_origin_for_uids(self, uids: list, origin: str, origin_time: str):
    """
    Sets a new origin (and optionally origin time) on every stored TT whose uid is listed.

    The description is rebuilt as '<origin_time> <origin> -' followed by the
    destination part of the current description. uids with no stored TT are
    skipped.

    :param uids: uids of the TTs to update.
    :param origin: new origin name.
    :param origin_time: new origin time, or None to keep the stored one.
    """
    for uid in uids:
        doc_id = generate_id_from_uid(uid)
        if not self.db.contains(doc_id=doc_id):
            continue

        tt = self.db.get(doc_id=doc_id)
        tt['origin_name'] = origin
        if origin_time is not None:
            tt['origin_time'] = origin_time

        # Keep everything after the origin/destination separator. Splitting
        # on ' - ' first stops a hyphen inside a place name from being taken
        # for the separator; a bare '-' is only used as a fallback. A
        # description with no separator yields an empty destination part
        # instead of raising IndexError.
        description = tt['description']
        _, spaced_separator, destination_part = description.partition(' - ')
        if spaced_separator:
            destination_part = ' ' + destination_part
        else:
            destination_part = description.partition('-')[2]

        tt['description'] = '{} {} -'.format(
            tt['origin_time'], origin) + destination_part

        # Persist by replacing the stored document under the same id.
        self.db.remove(doc_ids=[doc_id])
        self.db.insert(table.Document(tt, doc_id=doc_id))
# fs = FinnScraper("https://www.finn.no/realestate/homes/search.html?page={}")

# Collect the ad codes to process; a failure is reported and treated as
# "nothing to scrape" so the shutdown path below still runs.
try:
    finn_codes = fs.get_ad_codes(npages=25, verbose=1)
except Exception as e:
    send_message(f'Error: {str(e)}\n{str(e.__traceback__.tb_frame)}')
    # Without this binding the length check below would raise NameError.
    finn_codes = []

if len(finn_codes) == 0:
    db.close()
    fs.close_driver()
    # SystemExit is raised directly; the exit() helper is only guaranteed to
    # exist when the site module has been loaded.
    raise SystemExit

ad_counter = 0
for finn_code in tqdm(finn_codes):
    # Ads already stored under this code are not scraped again.
    if db.contains(doc_id=finn_code):
        continue

    try:
        ad_dict = fs.parse_housing_page(finn_code)
        ad_dict['scraping_date'] = todays_date
    except Exception as e:
        # Report the failure and carry on with the remaining ads.
        send_message(f'Error: {str(e)}\n{str(e.__traceback__.tb_frame)}')
        continue

    db.insert(table.Document(ad_dict, doc_id=finn_code))
    ad_counter += 1

send_message(f'Added {ad_counter} new house data\nTotal Data: {len(db)}')
db.close()
fs.close_driver()