Example #1
def create_temp_ads(raw_ad_ids):
    logger = logging.getLogger(__name__)
    session = DBSession()
    raw_ad_list = session.query(RawAd).filter(RawAd.id.in_(raw_ad_ids)).all()
    
    bulk_temp_ads = []
    bulk_props = []
    bulk_imgs = []
    processed = 0
    for raw_ad in raw_ad_list:
        try:

            temp_ad = raw_ad.map()
            temp_ad.id = raw_ad.id
            
            # Collect property rows for bulk insert
            for prop in temp_ad.properties.values():
                bulk_props.append({"temp_ad_id": temp_ad.id, "name": prop.name, "value": prop.value})

            # Collect image rows (external paths) for bulk insert
            for image in temp_ad.images:
                bulk_imgs.append({
                    "temp_ad_id": temp_ad.id,
                    "external_path": image.external_path,
                })

            # Collect the TempAd row itself for bulk insert
            bulk_temp_ads.append({
                "id": temp_ad.id,
                "feed_in_location_id": temp_ad.feed_in_location_id,
                "feed_in_subcat_id": temp_ad.feed_in_subcat_id,
                "feed_in_id": raw_ad.feed_in_id
            })

            processed += 1

        except Exception as e:
            logger.info("RawId: {0} {1} {2}".format(raw_ad.id, type(e).__name__, str(e)))

    if bulk_temp_ads:
        session.execute(TempAd.__table__.insert(), bulk_temp_ads)
    
    if bulk_props:
        session.execute(TempAdProperty.__table__.insert(), bulk_props)

    if bulk_imgs:
        session.execute(TempAdImage.__table__.insert(), bulk_imgs)

    session.commit()

    return processed
Example #2
def run(num_workers=None, max_size=10000, chunk_size=1000):
    
    log_file_name = os.path.join(
        LOG_FOLDER, 
        "{0}_feeds_transform.log".format(dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    # @TODO: Logging this way may be unsafe with multiple processes; see
    # https://docs.python.org/3.5/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration


    pool = Pool(processes=num_workers)
    raw_ads = []
    last_id = 0
    processed = 0
    total_time = 0

    session = DBSession()
    # The process scans every RawAd with status "P", in ascending id order
    while True:
        
        # Query in chunks since the raw ads table is large
        raw_ads = session.query(RawAd.id).\
            filter(RawAd.id > last_id, RawAd.status == "P").\
            order_by(RawAd.id).\
            limit(max_size).\
            all()

        # If there are no more RawAds, stop the loop
        if not raw_ads:
            logger.info("FINISHED. Total RawAds mapped: {0} in {1} secs".format(processed, round(total_time, 2)))
            break

        raw_ads = [raw_ad[0] for raw_ad in raw_ads]
        last_id = raw_ads[-1]

        chunked_raw_ads = chunk_list(raw_ads, chunk_size)
        
        start = time.time()
        results = pool.map_async(create_temp_ads, chunked_raw_ads).get()
        end = time.time() - start
        total_time += end
        processed += sum(results)
        logger.info("Processed: {0} in {1} secs".format(processed, round(end, 2)))
Example #3
    def __init__(self, feed_type):
        self.map_methods = {}
        self.feed_type = feed_type
        self.db_session = DBSession()
        try:
            self.root = feed_type.mapping.filter_by(method="ROOT").one().field

        except NoResultFound:
            raise FeedMappingException(
                "No se ha creado ROOT para el mapeo de {0}".format(
                    feed_type.id))

        except MultipleResultsFound:
            raise FeedMappingException(
                "Se ha creado más de un ROOT para el mapeo de {0}".format(
                    feed_type.id))
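
The constructor above belongs to a mapper class whose name is not shown; a hedged usage sketch (the class name FeedMapper is an assumption) could be:

# Usage sketch only: FeedMapper is an assumed name for the class that owns the __init__ above.
try:
    mapper = FeedMapper(feed_type)
except FeedMappingException as exc:
    logging.getLogger(__name__).info(str(exc))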
Example #4
def run(urls=None, feed_ids=None, num_workers=None):
    """
    params:
        urls list of feed urls
        feed_ids list of feed ids
        num_workers: Number of workers process. If None It will be used 

    If urls and feed_ids are not provided the process will be run over all feeds enabled 
    where it was not processed today
    """

    log_file_name = os.path.join(
        LOG_FOLDER, "{0}_feeds_input.log".format(
            dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    # @TODO: Logging this way may be unsafe with multiple processes; see
    # https://docs.python.org/3.5/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration

    pool = Pool(processes=num_workers)

    session = DBSession()

    if not urls and not feed_ids:
        result = session.query(FeedIn.id).\
            filter(
                FeedIn.last_processed_date < dtt.date.today(),
                FeedIn.enabled == '1').all()
        feed_ids = [t_id[0] for t_id in result]

    elif urls:
        # Resolve the feed ids from the given urls
        result = session.query(FeedIn.id).\
            filter(FeedIn.url.in_(urls)).all()

        feed_ids = [t_id[0] for t_id in result]

    args_collection = [(feed_id, DOWNLOAD_FOLDER, 20) for feed_id in feed_ids]
    results = pool.map_async(__process_feed, args_collection).get()
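
The __process_feed worker passed to map_async is not shown here. Since map_async hands each worker one element of args_collection, a plausible sketch (an assumption, not the original implementation) unpacks the tuple and delegates to process_feed from Example #5:

def __process_feed(args):
    # Sketch only: unpack the (feed_id, download_folder, timeout) tuple built in run()
    # and delegate to process_feed (see Example #5).
    feed_id, download_folder, timeout = args
    return process_feed(feed_id, download_folder, timeout=timeout)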
Example #5
def process_feed(feed_id, download_folder, timeout=None):

    logger = logging.getLogger(__name__)
    session = DBSession()

    feed_in = session.query(FeedIn).get(feed_id)

    url = feed_in.url
    file_name = ""
    try:
        file_name = download_file(url,
                                  download_folder,
                                  ext=feed_in.ext or None,
                                  timeout=timeout)
        result = preprocess(file_name, feed_in.bulk_insert, ())
        for res in result:
            logger.info("{0} {1} {2} {3} {4} {5} {6}".format(
                res['status'], url, file_name, res['inserted'], res['old_ads'],
                res['repeated_ads'], res['e_msg'] or ""))

    except Exception as e:
        logger.info("{0} {1} {2} {3} {4} {5} {6}".format(
            type(e).__name__, url, file_name, 0, 0, 0, str(e)))
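
The download_file helper used in Example #5 is not included here. A minimal sketch, assuming it streams the feed to disk with requests and returns the local path (the file-naming logic is an assumption), might look like:

import os
import requests

def download_file(url, download_folder, ext=None, timeout=None):
    # Sketch only: stream the feed into download_folder and return the local file path.
    # The real helper may derive the file name and extension differently.
    file_name = os.path.join(download_folder, os.path.basename(url.rstrip("/")) + (ext or ""))
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    with open(file_name, "wb") as fh:
        for chunk in response.iter_content(chunk_size=8192):
            fh.write(chunk)
    return file_name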