def create_temp_ads(raw_ad_ids):
    session = DBSession()
    raw_ad_list = session.query(RawAd).filter(RawAd.id.in_(raw_ad_ids)).all()
    bulk_temp_ads = []
    bulk_props = []
    bulk_imgs = []
    processed = 0

    for raw_ad in raw_ad_list:
        try:
            temp_ad = raw_ad.map()
            temp_ad.id = raw_ad.id

            # Set temp_ad_id on every TempAdProperty and add it to the bulk list
            for prop_name, prop in temp_ad.properties.items():
                bulk_props.append({
                    "temp_ad_id": temp_ad.id,
                    "name": prop.name,
                    "value": prop.value
                })

            # Add the image records to the bulk list
            for image in temp_ad.images:
                bulk_imgs.append({
                    "temp_ad_id": temp_ad.id,
                    "external_path": image.external_path,
                })

            # Add the TempAd itself to the bulk list
            bulk_temp_ads.append({
                "id": temp_ad.id,
                "feed_in_location_id": temp_ad.feed_in_location_id,
                "feed_in_subcat_id": temp_ad.feed_in_subcat_id,
                "feed_in_id": raw_ad.feed_in_id
            })
            processed += 1
        except Exception as e:
            logger = logging.getLogger(__name__)
            logger.info("RawId: {0} {1} {2}".format(raw_ad.id, type(e).__name__, str(e)))

    if bulk_temp_ads:
        session.execute(TempAd.__table__.insert(), bulk_temp_ads)
    if bulk_props:
        session.execute(TempAdProperty.__table__.insert(), bulk_props)
    if bulk_imgs:
        session.execute(TempAdImage.__table__.insert(), bulk_imgs)

    session.commit()
    return processed
def run(num_workers=None, max_size=10000, chunk_size=1000):
    log_file_name = os.path.join(
        LOG_FOLDER,
        "{0}_feeds_transform.log".format(dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    # @TODO: Configuring logging this way may be a problem since
    # https://docs.python.org/3.5/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration

    pool = Pool(processes=num_workers)
    raw_ads = []
    last_id = 0
    processed = 0
    total_time = 0
    session = DBSession()

    # The process scans all RawAds already inserted
    while True:
        # Chunk the query result, since the set of raw ads is large
        raw_ads = session.query(RawAd.id).\
            filter(RawAd.id > last_id, RawAd.status == "P").\
            order_by(RawAd.id).\
            limit(max_size).\
            all()

        # If there are no more RawAds, break the loop
        if not raw_ads:
            logger.info("FINISHED. Total RawAds mapped: {0} in {1} secs".format(
                processed, round(total_time, 2)))
            break

        raw_ads = [raw_ad[0] for raw_ad in raw_ads]
        last_id = raw_ads[-1]
        chunked_raw_ads = chunk_list(raw_ads, chunk_size)

        start = time.time()
        results = pool.map_async(create_temp_ads, chunked_raw_ads).get()
        end = time.time() - start
        total_time += end
        processed += sum(results)
        logger.info("Processed: {0} in {1} secs".format(processed, round(end, 2)))
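# NOTE: chunk_list is used above but not defined in this section. A minimal
# sketch, assuming it simply splits a list into consecutive slices of at most
# chunk_size elements (one slice per worker task):
def chunk_list(items, chunk_size):
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]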
def __init__(self, feed_type):
    self.map_methods = {}
    self.feed_type = feed_type
    self.db_session = DBSession()
    try:
        self.root = feed_type.mapping.filter_by(method="ROOT").one().field
    except NoResultFound:
        raise FeedMappingException(
            "No ROOT has been created for the mapping of {0}".format(
                feed_type.id))
    except MultipleResultsFound:
        raise FeedMappingException(
            "More than one ROOT has been created for the mapping of {0}".format(
                feed_type.id))
def run(urls=None, feed_ids=None, num_workers=None):
    """
    params:
        urls         list of feed urls
        feed_ids     list of feed ids
        num_workers  Number of worker processes. If None, the multiprocessing
                     default (the machine's CPU count) is used.

    If urls and feed_ids are not provided, the process runs over all enabled
    feeds that have not been processed today.
    """
    log_file_name = os.path.join(
        LOG_FOLDER,
        "{0}_feeds_input.log".format(
            dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    # @TODO: Configuring logging this way may be a problem since
    # https://docs.python.org/3.5/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration

    pool = Pool(processes=num_workers)
    session = DBSession()

    if not urls and not feed_ids:
        result = session.query(FeedIn.id).\
            filter(
                FeedIn.last_processed_date < dtt.date.today(),
                FeedIn.enabled == '1'
            ).all()
        feed_ids = [t_id[0] for t_id in result]
    elif urls:
        # Get the list of feed_ids from the urls, when urls are passed
        result = session.query(FeedIn.id).\
            filter(FeedIn.url.in_(urls)).all()
        feed_ids = [t_id[0] for t_id in result]

    args_collection = [(feed_id, DOWNLOAD_FOLDER, 20) for feed_id in feed_ids]
    results = pool.map_async(__process_feed, args_collection).get()
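# NOTE: __process_feed is referenced above but not shown in this section.
# Since args_collection holds (feed_id, download_folder, timeout) tuples while
# process_feed below takes separate arguments, a plausible sketch (name and
# behavior are assumptions) is a thin wrapper that unpacks the tuple:
def __process_feed(args):
    feed_id, download_folder, timeout = args
    return process_feed(feed_id, download_folder, timeout=timeout)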
def process_feed(feed_id, download_folder, timeout=None):
    logger = logging.getLogger(__name__)
    session = DBSession()
    feed_in = session.query(FeedIn).get(feed_id)
    url = feed_in.url
    file_name = ""
    try:
        file_name = download_file(url, download_folder,
                                  ext=feed_in.ext or None, timeout=timeout)
        result = preprocess(file_name, feed_in.bulk_insert, ())
        for res in result:
            logger.info("{0} {1} {2} {3} {4} {5} {6}".format(
                res['status'], url, file_name, res['inserted'],
                res['old_ads'], res['repeated_ads'], res['e_msg'] or ""))
    except Exception as e:
        logger.info("{0} {1} {2} {3} {4} {5} {6}".format(
            type(e).__name__, url, file_name, 0, 0, 0, str(e)))