def main():
    """Backfill ``google_place_id`` on Company rows from the Mongo pro collection.

    Reads every distinct ``bbb_url`` from the SQL companies table, looks up the
    matching Mongo document, and feeds ``(url, column, value, logger)`` work
    items to a pool of SqlUpdateWorker threads that perform the SQL updates.
    Blocks until the queue is fully drained, then prints elapsed wall time.
    """
    ts = time()
    with session_scope() as session:
        bbb_urls = [x[0] for x in session.query(Company.bbb_url).distinct()]
    queue = Queue()
    # Daemon workers block on queue.get() forever; daemonizing lets the
    # process exit once queue.join() returns.
    for _ in range(8):
        worker = SqlUpdateWorker(queue)
        worker.daemon = True
        worker.start()
    for url in bbb_urls:
        pro = pro_coll.find_one({"bbb_url": url})
        if pro:
            g_id = pro["google_id"]
            # Sentinel strings from the matching step mean "no usable id";
            # normalize them to NULL.
            if g_id in ("NoPhoneMatch", "NoAddressMatch"):
                g_id = None
        else:
            # BUG FIX: previously this branch fell through to queue.put() with
            # g_id unassigned — NameError on the first missing url, or a stale
            # value from the previous iteration after that. Skip the update
            # instead of writing unknown data.
            logger.warning(f"NOT IN MONGO: {url}")
            continue
        queue.put((url, "google_place_id", g_id, logger))
    queue.join()
    print(f"Time: {time() - ts}")
def run(self):
    """Worker loop: write each Company's SQL primary key back to its Mongo doc.

    Consumes ``(url, column, value)`` items; ``column`` and ``value`` are
    unused here — the tuple shape is shared with the SQL-update worker.
    Runs forever as a daemon thread; ``task_done()`` is always called in the
    ``finally`` so ``queue.join()`` can return even if an item fails.
    """
    while True:
        try:
            url, column, value = self.queue.get()
            with session_scope() as session:
                # BUG FIX: the original bound this to `_id` but then printed
                # and stored an undefined `pro_id` (NameError on every item).
                # One consistent name fixes it.
                pro_id = (session.query(
                    Company.id).filter(Company.bbb_url == url).scalar())
                print(pro_id)
                # sys.argv[1] names the target environment (e.g. "staging"),
                # so the Mongo doc keeps one sql id field per environment.
                pro_coll.update_one(
                    {"bbb_url": url},
                    {"$set": {
                        f"{sys.argv[1]}_sql_id": pro_id
                    }})
        finally:
            self.queue.task_done()
def run(self):
    """Worker loop: apply ``(url, column, value)`` updates to Company rows.

    Pulls items off the shared queue forever. Every row whose ``bbb_url``
    matches — zero, one, or several duplicates — gets ``column`` set to
    ``value``, with one log line per updated row; a miss is logged as info.
    ``task_done()`` always fires so ``queue.join()`` can complete.
    """
    while True:
        try:
            url, column, value, logger = self.queue.get()
            with session_scope(logger=logger, mode="update") as session:
                matches = list(
                    session.query(Company).filter(Company.bbb_url == url))
                if not matches:
                    logger.info(f"{url} is not in prod db")
                else:
                    # One or many rows: identical per-row handling either way.
                    for row in matches:
                        logger.info(f"ID: {row.id} has updated {column}")
                        session.query(Company).filter(
                            Company.id == row.id).update({column: value})
        finally:
            self.queue.task_done()
continue
# NOTE(review): fragment of a larger interactive dedupe loop — the loop header,
# and the definitions of log_message/columns/dlicates/p_ind/keep_id/sql_ids,
# are outside this view. Prompt the operator for another merge attempt.
try_again = input("need another try?")
if try_again == "y":
    try:
        # merge_pro presumably folds duplicate-row data into the kept row and
        # extends the audit log message — TODO confirm against its definition.
        log_message = merge_pro(log_message, columns, dlicates, p_ind, keep_id,
                                sql_ids)
    except:
        # Bare except that only re-raises: effectively a no-op wrapper.
        raise
else:
    pass
## pop keep id before using sql_ids to delete duplicates
_ = sql_ids.pop(sql_ids.index(keep_id))
log_message = log_message + f"\tDELETING IDS - {sql_ids}"
# mlogger.warning(log_message)
### delete pros not used
with session_scope(logger=mlogger) as session:
    for i in sql_ids:
        log_message = log_message + f"\tDELETING - {i}"
        # assumes each id still exists; .first() returns None if not, and
        # session.delete(None) would raise — TODO confirm ids are live.
        del_pro = session.query(Company).filter(
            Company.id == i).first()
        session.delete(del_pro)
# log operation and clear the terminal
mlogger.warning(log_message)
clear()
count += 1
"in_production": True,
        "is_accredited": True
    }, {
        # NOTE(review): this is the tail of a pro_coll.find(filter, projection)
        # call whose opening is outside this view — the second dict is the
        # Mongo projection (suppress _id, return bbb_url + is_accredited).
        "_id": False,
        "bbb_url": True,
        "is_accredited": True
    },
)
]
##5988 unique bbb_urls in production according to mongo db
# check prod in heroku query tool!
for p in accredited_pros:
    # One short-lived session per pro; session_scope presumably commits on
    # exit, persisting the is_accredited mutation — TODO confirm.
    with session_scope() as session:
        query = list(
            session.query(Company).filter(
                Company.bbb_url == p["bbb_url"]).all())
        if len(query) == 0:
            logger.info(
                f"{p['bbb_url']} is not in prod db or has accreditation data")
        elif len(query) == 1:
            logger.info(f"{p['bbb_url']} has updated accreditation data!")
            company = query[0]
            company.is_accredited = True
        elif len(query) >= 2:
            # NOTE(review): duplicate rows are only logged — is_accredited is
            # NOT set on any of them here. Looks intentional (dedupe first),
            # but worth confirming.
            logger.info(f"{p['bbb_url']} has updated accreditation data!")
            logger.warning(
                f"{p['bbb_url']} has {len(query)} duplicate rows in the db")
# Root logger at DEBUG so all module logs are visible during the backfill run.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# map company table
# SQLAlchemy automap: reflected table names -> mapped ORM classes.
Company = Base.classes["companies_company"]
Service = Base.classes["commons_servicetype"]
CompanyService = Base.classes["companies_companyservicetype_service_type"]
## connect to pymongo
# MongoClient() with no args connects to localhost:27017 by default.
client = MongoClient()
db = client["ProData"]
pro_coll = db["ProDataCollection"]
# create conversion to dict to match my alpha version of the service name to the
# service_id in the db
with session_scope(logger) as session:
    services = [x._asdict() for x in session.query(Service.name, Service.id)]
# Normalized service name -> SQL id. clean_str/clean_special are project
# helpers; presumably they lowercase/strip punctuation — TODO confirm.
service_dict = {clean_special(clean_str(x["name"])): x["id"] for x in services}
# Patterns for scraping a prior load's log file.
# NOTE(review): sqlre matches exactly five digits, so 4- or 6-digit sql ids
# would be missed — confirm the id range before reuse.
sqlre = re.compile("(sql_id:)\s\d\d\d\d\d")
mongore = re.compile("(pro_mongo_id\s:)\s\w*")
namere = re.compile("('name':)\s'(.*?)'")
uploaded_ids = []
names = []
with open("./log_files/staging_accredited_load.log") as file:
    for row in file.readlines():
        if mongore.findall(row):
            # Line shape assumed to be "pro_mongo_id : <id>"; split on the
            # first ":" and keep the trimmed id.
            uploaded_ids.append(row.split(":")[1].strip())