Example #1
def run_img(tempids=None, num_workers=None, images_by_loop=1000):

    if tempids:
        images_query = DBSession.query(TempAdImage).\
                    filter(TempAdImage.temp_ad_id.in_(tempids))

    else:
        images_query = DBSession.query(TempAdImage).\
                    filter(TempAdImage.internal_path == None)

    last_id = 0

    pool = Pool(num_workers)
    
    while True:
        images = images_query.\
                    filter(TempAdImage.id > last_id).\
                    order_by(TempAdImage.id).\
                    limit(images_by_loop).\
                    all()
        if not images:
            break
            
        last_id = images[-1].id
        
        arguments = [image.external_path for image in images]
        
        for image, path in zip(images, pool.map(_download, arguments)):
            image.internal_path = path

        DBSession.commit()
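
Example #1 maps a _download worker over the external image paths, but that worker is not shown. Below is a minimal sketch of what it might look like, assuming the images are fetched over HTTP and stored under a hypothetical IMAGE_FOLDER; it is an illustration, not the project's actual implementation.

import os
import uuid
from urllib.request import urlretrieve

IMAGE_FOLDER = "/var/data/images"  # hypothetical destination folder


def _download(external_path):
    """Download one image and return its local (internal) path, or None on failure.

    Illustrative sketch only; the real _download used by run_img is not shown above.
    """
    file_name = "{0}{1}".format(uuid.uuid4().hex,
                                os.path.splitext(external_path)[1])
    internal_path = os.path.join(IMAGE_FOLDER, file_name)
    try:
        urlretrieve(external_path, internal_path)
        return internal_path
    except Exception:
        # Returning None leaves image.internal_path unset, so the image can be retried later
        return None

Since Pool.map pickles the callable, _download has to be a module-level function, which matches how run_img uses it.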
Example #2
def run(num_workers=None, max_size=10000, chunk_size=1000):
    
    log_file_name = os.path.join(
        LOG_FOLDER, 
        "{0}_feeds_transform.log".format(dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    # @TODO: Maybe logging this way is an error, since
    # https://docs.python.org/3.5/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration


    pool = Pool(processes=num_workers)
    raw_ads = []
    last_id = 0
    processed = 0
    total_time = 0

    session = DBSession()
    # The process scans every RawAd that has been inserted
    while True:
        
        # We need to chunk the query results since raw ads are large
        raw_ads = session.query(RawAd.id).\
            filter(RawAd.id > last_id, RawAd.status == "P").\
            order_by(RawAd.id).\
            limit(max_size).\
            all()

        # If there are no more RawAds, break the loop
        if not raw_ads:
            logger.info("FINISHED. Total RawAds mapped: {0} in {1} secs".format(processed, round(total_time, 2)))
            break

        raw_ads = [raw_ad[0] for raw_ad in raw_ads]
        last_id = raw_ads[-1]

        chunked_raw_ads = chunk_list(raw_ads, chunk_size)
        
        start = time.time()
        results = pool.map_async(create_temp_ads, chunked_raw_ads).get()
        end = time.time() - start
        total_time += end
        processed += sum(results)
        logger.info("Processed: {0} in {1} secs".format(processed, round(end, 2)))
Example #3
    def __init__(self, feed_type):
        self.map_methods = {}
        self.feed_type = feed_type
        self.db_session = DBSession()
        try:
            self.root = feed_type.mapping.filter_by(method="ROOT").one().field

        except NoResultFound:
            raise FeedMappingException(
                "No ROOT has been created for the mapping of {0}".format(
                    feed_type.id))

        except MultipleResultsFound:
            raise FeedMappingException(
                "More than one ROOT has been created for the mapping of {0}".format(
                    feed_type.id))
Example #4
def run(urls=None, feed_ids=None, num_workers=None):
    """
    params:
        urls list of feed urls
        feed_ids list of feed ids
        num_workers: Number of workers process. If None It will be used 

    If urls and feed_ids are not provided the process will be run over all feeds enabled 
    where it was not processed today
    """

    log_file_name = os.path.join(
        LOG_FOLDER, "{0}_feeds_input.log".format(
            dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    # @TODO: Maybe logging this way is an error, since
    # https://docs.python.org/3.5/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration

    pool = Pool(processes=num_workers)

    session = DBSession()

    if not urls and not feed_ids:
        result = session.query(FeedIn.id).\
            filter(
                FeedIn.last_processed_date < dtt.date.today(),
                FeedIn.enabled == '1').all()
        feed_ids = [t_id[0] for t_id in result]

    elif urls:
        # Get the list of feed ids from the given urls
        result = session.query(FeedIn.id).\
            filter(FeedIn.url.in_(urls)).all()

        feed_ids = [t_id[0] for t_id in result]

    args_collection = [(feed_id, DOWNLOAD_FOLDER, 20) for feed_id in feed_ids]
    results = pool.map_async(__process_feed, args_collection).get()
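
pool.map_async passes each (feed_id, DOWNLOAD_FOLDER, 20) tuple as a single argument to __process_feed, which is not shown. A plausible sketch, assuming it only unpacks the tuple and delegates to process_feed from Example #5 (that delegation is an assumption):

def __process_feed(args):
    """Unpack the argument tuple built in run() and delegate to process_feed.

    Pool.map_async passes one positional argument per item, so the tuple is
    unpacked manually; Pool.starmap would be an alternative.
    """
    feed_id, download_folder, timeout = args
    return process_feed(feed_id, download_folder, timeout=timeout)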
Example #5
def process_feed(feed_id, download_folder, timeout=None):

    logger = logging.getLogger(__name__)
    session = DBSession()

    feed_in = session.query(FeedIn).get(feed_id)

    url = feed_in.url
    file_name = ""
    try:
        file_name = download_file(url,
                                  download_folder,
                                  ext=feed_in.ext or None,
                                  timeout=timeout)
        result = preprocess(file_name, feed_in.bulk_insert, ())
        for res in result:
            logger.info("{0} {1} {2} {3} {4} {5} {6}".format(
                res['status'], url, file_name, res['inserted'], res['old_ads'],
                res['repeated_ads'], res['e_msg'] or ""))

    except Exception as e:
        logger.info("{0} {1} {2} {3} {4} {5} {6}".format(
            type(e).__name__, url, file_name, 0, 0, 0, str(e)))
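
The @TODO comments in Examples #2 and #4 question this logging setup because workers such as process_feed write to a FileHandler from separate processes, which can interleave or drop records. Below is a minimal sketch of the queue-based approach from the linked logging cookbook, using only the standard library; function and file names here are illustrative, not the project's code.

import logging
import logging.handlers
import multiprocessing


def worker_init(queue):
    """Runs once per pool worker: route every log record to the shared queue."""
    handler = logging.handlers.QueueHandler(queue)
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(handler)


if __name__ == "__main__":
    queue = multiprocessing.Manager().Queue(-1)

    file_handler = logging.FileHandler("feeds_input.log")
    file_handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))

    # A single listener thread in the parent process does all the file writes
    listener = logging.handlers.QueueListener(queue, file_handler)
    listener.start()

    pool = multiprocessing.Pool(processes=4, initializer=worker_init,
                                initargs=(queue,))
    # ... pool.map_async(__process_feed, args_collection).get() ...
    pool.close()
    pool.join()
    listener.stop()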
Example #6
class XmlAdMapper(AdMapper):
    def __init__(self, feed_type):
        self.map_methods = {}
        self.feed_type = feed_type
        self.db_session = DBSession()
        try:
            self.root = feed_type.mapping.filter_by(method="ROOT").one().field

        except NoResultFound:
            raise FeedMappingException(
                "No ROOT has been created for the mapping of {0}".format(
                    feed_type.id))

        except MultipleResultsFound:
            raise FeedMappingException(
                "More than one ROOT has been created for the mapping of {0}".format(
                    feed_type.id))

    def iter_from_file(self, file):
        self.__file_path = file
        self.__iteration_count = 0
        self.__file_was_cleaned = False  # Flag: whether the xml file was already cleaned, to avoid infinite loops
        self.__xml_parser = etree.iterparse(self.__file_path, tag=self.root)

    def get_raw_content(self):
        """ Returns a string with an ad's information in xml format.

            How does it work?
            The function iterates over each element with next(self.__xml_parser). If an XML
            syntax error is found, an XMLSyntaxError exception is raised. If the file has not
            been cleaned yet (self.__file_was_cleaned is False), we try to clean/fix it by
            calling clear_file(). We then restart the iterator, skip the elements that have
            already been returned, and call get_raw_content again. If XMLSyntaxError is raised
            again (self.__file_was_cleaned is True), it means clear_file() couldn't fix it, so
            we raise the exception to avoid infinite recursion.
        """
        try:
            event, element = next(self.__xml_parser)
            raw_content = etree.tostring(element,
                                         encoding="utf-8").decode("utf-8")
            element.clear()

            self.__iteration_count += 1

            return raw_content
        except etree.XMLSyntaxError as e:
            # If there is an invalid token or an invalid entity and the file wasn't cleaned yet
            if not self.__file_was_cleaned:
                clear_file(
                    self.__file_path)  # Removes invalid characters from file
                self.__file_was_cleaned = True
                self.__xml_parser = etree.iterparse(
                    self.__file_path, tag=self.root)  # Restart iterator
                # Jump elements self.__iteration_count times
                for i in range(self.__iteration_count):
                    next(self.__xml_parser)

                return self.get_raw_content()
            else:
                raise FeedParseException("El XML no se pudo reparar. " +
                                         type(e).__name__ + str(e))

    def __load_map_methods(self):
        """ Loads in memory all map methods and it params"""

        # @WARNING: Para que funcione ésta query, es necesario que esté
        # desactivada la opción ONLY_FULL_GROUP_BY de mysql activada como default a partir de la versión 5.7.5
        # Información técnica: https://dev.mysql.com/doc/refman/5.7/en/sql-mode.html#sqlmode_only_full_group_by
        # Solución: https://www.sitepoint.com/quick-tip-how-to-permanently-change-sql-mode-in-mysql/
        methods = self.db_session.query(
            FeedTypeMapping.method,
            func.group_concat(
                FeedTypeMapping.field.op('ORDER BY')(
                    FeedTypeMapping.param_order)),
            func.group_concat(
                FeedTypeMapping.default_value.op('ORDER BY')(
                    FeedTypeMapping.param_order))).filter(
                        FeedTypeMapping.feed_type == self.feed_type,
                        FeedTypeMapping.method.in_(
                            MAP_METHODS.keys())).group_by(
                                FeedTypeMapping.method).all()

        for method_name, xpaths, default_values in methods:
            additional_params = dict([
                (param.name, param.value)
                for param in self.feed_type.additional_params.filter_by(
                    method=method_name)
            ])

            map_method = MAP_METHODS[method_name](**additional_params)

            # With the below line we get
            # {"DESCRIPCION": (DescriptionMapMethod(template = "..."), [content/text(), bathrooms/text()])}
            self.map_methods[method_name] = (map_method, xpaths.split(","),
                                             default_values.split(","))

        return self

    def map(self, raw_ad):

        if not self.map_methods:
            self.__load_map_methods()

        xml = etree.fromstring(raw_ad.raw_content)

        mapped_properties = {}
        temp_ad = TempAd()

        for method_name, (map_method, xpaths,
                          default_values) in self.map_methods.items():
            args = [
                self.extract(xml, xpath) or default_value
                for xpath, default_value in zip(xpaths, default_values)
            ]

            mapped_properties.update(map_method.map(*args, raw_ad=raw_ad))

        temp_ad.set_properties(mapped_properties)

        return temp_ad

    def exec_method(self, method_name, raw_ad):
        if not self.map_methods:
            self.__load_map_methods()

        xml = etree.fromstring(raw_ad.raw_content)
        map_method = self.map_methods[method_name][0]
        xpaths = self.map_methods[method_name][1]

        args = [self.extract(xml, xpath) for xpath in xpaths]

        return map_method.map(*args, raw_ad=raw_ad)

    def extract(self, xml, xpath):
        """ Extract data from xml based on it xpath """
        data = xml.xpath(xpath)
        if len(data) == 1:
            data = data[0].strip()
        elif len(data) == 0:
            data = ""

        return data
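
get_raw_content calls a clear_file helper that "removes invalid characters from file" but is not shown. One possible sketch, stripping characters that are illegal in XML 1.0 and rewriting the file in place; the regex and the in-place rewrite are assumptions, not the project's actual implementation:

import re

# Everything outside the XML 1.0 character ranges (tab, LF, CR, and the normal Unicode planes)
_INVALID_XML_CHARS = re.compile(
    u'[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]')


def clear_file(file_path):
    """Rewrite the file with characters that are invalid in XML 1.0 removed."""
    with open(file_path, encoding="utf-8", errors="replace") as f:
        content = f.read()
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(_INVALID_XML_CHARS.sub("", content))

Note that this reads the whole feed into memory; large feeds would need a streaming variant.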
Example #7
def run(loader_name, sleep_time=0):

    log_file_name = os.path.join(
        LOG_FOLDER,
        "{0}_feeds_load.log".format(dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration

    loader = create_loader(loader_name)

    processed_temp_ads = []
    errors = 0
    loaded_ok = 0

    limit = 100

    query = DBSession.query(TempAd).filter(TempAd.is_ready,
                                           TempAd.ad_id == None)
    temp_ads = query.order_by(func.rand()).limit(limit).all()

    while True:

        processed_temp_ads += [temp_ad.id for temp_ad in temp_ads]

        if not temp_ads:
            logger.info("FINISHED. ads loaded OK: {0}. Errors: {1}".format(
                loaded_ok, errors))
            break

        ads_data = []

        for temp_ad in temp_ads:
            try:
                ad_data = prepare_ad_data(loader, temp_ad)
                ads_data.append(ad_data)
            except Exception as e:
                temp_ad.error_message = "Error while it is preparing data to load: " + str(
                    e)
                logger.info("{0} {1} {2}".format(temp_ad.id, temp_ad.ad_id,
                                                 temp_ad.error_message or ""))
                errors += 1

                pass

        # It loads the ads
        result = loader.load(ads_data)

        for res in result:
            temp_ad = DBSession.query(TempAd).get(res["id"])
            temp_ad.ad_id = res["ad_id"]
            temp_ad.error_message = res["error_message"]

            if res["error_message"]:
                errors += 1
            else:
                loaded_ok += 1

            logger.info("{0} {1} {2}".format(temp_ad.id, temp_ad.ad_id,
                                             temp_ad.error_message or ""))

        DBSession.commit()

        temp_ads = query.filter(~ TempAd.id.in_(processed_temp_ads)).\
                        order_by(func.rand()).\
                        limit(limit).\
                        all()

        # The process sleeps in order to avoid overloading the API server.
        time.sleep(sleep_time)
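
Example #7 assumes that create_loader returns an object whose load(ads_data) method yields, for each ad, a dict with "id", "ad_id" and "error_message" keys. A minimal sketch of that contract with a hypothetical HTTP loader; the class, endpoint and use of requests are illustrative, not the project's real loader:

import requests


class DummyApiLoader:
    """Illustrative loader: posts each prepared ad to an API and reports the result."""

    def __init__(self, endpoint):
        self.endpoint = endpoint

    def load(self, ads_data):
        results = []
        for ad_data in ads_data:
            try:
                response = requests.post(self.endpoint, json=ad_data, timeout=20)
                response.raise_for_status()
                results.append({"id": ad_data["id"],
                                "ad_id": response.json().get("ad_id"),
                                "error_message": None})
            except Exception as e:
                results.append({"id": ad_data["id"],
                                "ad_id": None,
                                "error_message": str(e)})
        return results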
Example #8
    def bulk_insert(self, file):
        """ Bulk insert from a file """
        
        self.feed_type.ad_mapper.iter_from_file(file)

        max_pending = 10000  # Max INSERTs pending to commit
        current_pending = 0  # Number of ads accumulated from the xml since the last bulk insert
        inserted_ads = 0

        info = {'status': None, 'file': file, 'inserted': None, 'e_msg': None}
        pending_raw_ads = []
        record_ids = []
        old_ads = 0
        repeated_ads = 0
        while True:
            try:
                raw_ad = RawAd()
                raw_ad.raw_content = self.feed_type.ad_mapper.get_raw_content()
                raw_ad.feed_in = self

                ######################## Begin - Filter section ################################
                # @TODO: Filters should be dynamic. E.g.: implement some kind of observer pattern
                date_info = self.feed_type.ad_mapper.exec_method("FECHA", raw_ad=raw_ad)
                days = (dtt.today() - dtt.strptime(date_info["date"], date_info["_format"])).days
                ######################## End - Filter section ################################


                if days > 30:
                    old_ads += 1
                    continue  # Skip the remaining code in the loop so we
                              # don't hit the database on every iteration


                ######################## Begin - Filter section ################################
                # @TODO: Filters should be dynamic. E.g.: implement some kind of observer pattern
                ad_id_in_feed = self.feed_type.ad_mapper.exec_method("ID", raw_ad=raw_ad)["_id_in_feed"]
                record_id = ad_id_in_feed + "," + self.feed_type.ad_mapper.exec_method("URL", raw_ad=raw_ad)["link"]
                ad_exists = DBSession.execute("SELECT 1 FROM fp_feeds_in_records WHERE id = :id", {"id": record_id}).first()
                ######################## End - Filter section ################################
                if ad_exists:
                    repeated_ads += 1
                else:
                    pending_raw_ads.append(
                        {
                            "raw_ad": raw_ad.raw_content,
                            "feed_in_id": self.id
                        })

                    record_ids.append({"id": record_id})

                    current_pending += 1
                    
                    if current_pending == max_pending:
                        self.__insert(pending_raw_ads, record_ids)

                        inserted_ads += current_pending
                        current_pending = 0

            except StopIteration:
                if current_pending != 0:
                    self.__insert(pending_raw_ads, record_ids)
                    
                    inserted_ads += current_pending
                    current_pending = 0

                # Update the feed's last processed date
                self.last_processed_date = date.today()
                DBSession.commit()
                        
                info['status'] = 'ok'
                info['inserted'] = inserted_ads
                info['repeated_ads'] = repeated_ads
                info['old_ads'] = old_ads

                return info

            except Exception as e:
                info['status'] = type(e).__name__
                info['inserted'] = inserted_ads
                info['e_msg'] = str(e)
                info['repeated_ads'] = repeated_ads
                info['old_ads'] = old_ads

                return info
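
The @TODO in bulk_insert asks for dynamic filters, e.g. via some kind of observer pattern. One possible shape for that idea, with hypothetical names and reusing the FECHA lookup from above; this is a sketch, not code from the project:

from datetime import datetime as dtt  # assumed import, consistent with dtt.strptime above


class FilterRegistry:
    """Illustrative registry of raw-ad filters; each filter returns a skip reason or None."""

    def __init__(self):
        self._filters = []

    def register(self, filter_func):
        self._filters.append(filter_func)
        return filter_func

    def first_rejection(self, raw_ad, ad_mapper):
        for filter_func in self._filters:
            reason = filter_func(raw_ad, ad_mapper)
            if reason:
                return reason
        return None


filters = FilterRegistry()


@filters.register
def too_old(raw_ad, ad_mapper):
    date_info = ad_mapper.exec_method("FECHA", raw_ad=raw_ad)
    days = (dtt.today() - dtt.strptime(date_info["date"], date_info["_format"])).days
    return "old_ad" if days > 30 else None

bulk_insert could then call filters.first_rejection(raw_ad, self.feed_type.ad_mapper) instead of hard-coding each check.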
Example #9
    def __insert(self, pending_raw_ads, record_ids):
        DBSession.execute(RawAd.__table__.insert(), pending_raw_ads)
        DBSession.execute("INSERT INTO fp_feeds_in_records (id) VALUES (:id)", record_ids)
        DBSession.commit()
        del pending_raw_ads[:]
        del record_ids[:]
Example #10
def create_temp_ads(raw_ad_ids):
    session = DBSession()
    raw_ad_list = session.query(RawAd).filter(RawAd.id.in_(raw_ad_ids)).all()
    
    bulk_temp_ads = []
    bulk_props = []
    bulk_imgs = []
    processed = 0
    for raw_ad in raw_ad_list:
        try:

            temp_ad = raw_ad.map()
            temp_ad.id = raw_ad.id
            
            # Collect a bulk-insert row for each TempAdProperty instance
            for prop_name, prop in temp_ad.properties.items():

                # Adding to bulk
                bulk_props.append({"temp_ad_id": temp_ad.id, "name": prop.name, "value": prop.value})

            # Queue image rows for bulk insert (the images themselves are downloaded later, e.g. by run_img)
            for image in temp_ad.images:
                # Adding to bulk
                bulk_imgs.append({
                    "temp_ad_id": temp_ad.id, 
                    "external_path": image.external_path, 
                })


            # Adding to bulk
            bulk_temp_ads.append({
                "id": temp_ad.id, 
                "feed_in_location_id": temp_ad.feed_in_location_id, 
                "feed_in_subcat_id": temp_ad.feed_in_subcat_id,
                "feed_in_id": raw_ad.feed_in_id 
            })

            processed += 1

        except Exception as e:
            logger = logging.getLogger(__name__)
            logger.info("RawId: {0} {1} {2}".format(raw_ad.id, type(e).__name__, str(e)))

    if bulk_temp_ads:
        session.execute(TempAd.__table__.insert(), bulk_temp_ads)
    
    if bulk_props:
        session.execute(TempAdProperty.__table__.insert(), bulk_props)

    if bulk_imgs:
        session.execute(TempAdImage.__table__.insert(), bulk_imgs)

    session.commit()

    return processed
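
Example #10 calls raw_ad.map(), which is not shown; given Example #6, it presumably delegates to the feed type's XmlAdMapper. A hedged sketch of that delegation (the attribute chain is an assumption based on the other examples, and this stand-in class is not the real ORM model):

class RawAd:  # illustrative stand-in for the real declarative model
    def __init__(self, id, raw_content, feed_in):
        self.id = id
        self.raw_content = raw_content
        self.feed_in = feed_in

    def map(self):
        """Build a TempAd from this raw ad via its feed type's ad mapper (assumed delegation)."""
        return self.feed_in.feed_type.ad_mapper.map(self)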