def run_img(tempids=None, num_workers=None, images_by_loop=1000):
    if tempids:
        images_query = DBSession.query(TempAdImage).\
            filter(TempAdImage.temp_ad_id.in_(tempids))
    else:
        images_query = DBSession.query(TempAdImage).\
            filter(TempAdImage.internal_path == None)

    last_id = 0
    pool = Pool(num_workers)

    while True:
        images = images_query.\
            filter(TempAdImage.id > last_id).\
            order_by(TempAdImage.id).\
            limit(images_by_loop).\
            all()

        if not images:
            break

        last_id = images[-1].id
        arguments = [image.external_path for image in images]

        for image, path in zip(images, pool.map(_download, arguments)):
            image.internal_path = path

        DBSession.commit()
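# NOTE: `_download` is referenced above but defined elsewhere in the project.
# A minimal sketch, assuming images are stored under an IMAGES_FOLDER constant
# (hypothetical name) and that the worker returns the stored path, or None on
# failure so the row stays eligible for a retry. The real implementation may differ.
import os
import urllib.request

IMAGES_FOLDER = "/tmp/images"  # assumption: the real project configures this elsewhere


def _download(external_path):
    """Hypothetical sketch: fetch an external image and return its internal path."""
    file_name = os.path.basename(external_path.split("?")[0])
    internal_path = os.path.join(IMAGES_FOLDER, file_name)
    try:
        urllib.request.urlretrieve(external_path, internal_path)
        return internal_path
    except Exception:
        return None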
def run(num_workers=None, max_size=10000, chunk_size=1000):
    log_file_name = os.path.join(
        LOG_FOLDER,
        "{0}_feeds_transform.log".format(dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    # @TODO: Configuring logging this way may be a mistake, since several worker
    # processes write to a single file. See:
    # https://docs.python.org/3.5/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration

    pool = Pool(processes=num_workers)
    raw_ads = []
    last_id = 0
    processed = 0
    total_time = 0
    session = DBSession()

    # The process scans every RawAd already inserted.
    while True:
        # Query results are fetched in chunks since raw ads are large.
        raw_ads = session.query(RawAd.id).\
            filter(RawAd.id > last_id, RawAd.status == "P").\
            order_by(RawAd.id).\
            limit(max_size).\
            all()

        # If there are no more RawAds, break the loop.
        if not raw_ads:
            logger.info("FINISHED. Total RawAds mapped: {0} in {1} secs".format(
                processed, round(total_time, 2)))
            break

        raw_ads = [raw_ad[0] for raw_ad in raw_ads]
        last_id = raw_ads[-1]
        chunked_raw_ads = chunk_list(raw_ads, chunk_size)

        start = time.time()
        results = pool.map_async(create_temp_ads, chunked_raw_ads).get()
        end = time.time() - start
        total_time += end
        processed += sum(results)

        logger.info("Processed: {0} in {1} secs".format(processed, round(end, 2)))
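# NOTE: `chunk_list` is used above but defined elsewhere in the project. A
# minimal sketch of the assumed helper, splitting a list into consecutive
# chunks of at most `chunk_size` items:
def chunk_list(items, chunk_size):
    """Hypothetical sketch: split `items` into chunks of `chunk_size`."""
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]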
def __init__(self, feed_type):
    self.map_methods = {}
    self.feed_type = feed_type
    self.db_session = DBSession()
    try:
        self.root = feed_type.mapping.filter_by(method="ROOT").one().field
    except NoResultFound:
        raise FeedMappingException(
            "No ROOT has been created for the mapping of {0}".format(
                feed_type.id))
    except MultipleResultsFound:
        raise FeedMappingException(
            "More than one ROOT has been created for the mapping of {0}".format(
                feed_type.id))
def run(urls=None, feed_ids=None, num_workers=None):
    """
    params:
        urls: list of feed urls
        feed_ids: list of feed ids
        num_workers: number of worker processes. If None, the multiprocessing
            default (the machine's CPU count) is used.

    If neither urls nor feed_ids is provided, the process runs over every
    enabled feed that has not been processed today.
    """
    log_file_name = os.path.join(
        LOG_FOLDER,
        "{0}_feeds_input.log".format(dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    # @TODO: Configuring logging this way may be a mistake, since several worker
    # processes write to a single file. See:
    # https://docs.python.org/3.5/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration

    pool = Pool(processes=num_workers)
    session = DBSession()

    if not urls and not feed_ids:
        result = session.query(FeedIn.id).\
            filter(
                FeedIn.last_processed_date < dtt.date.today(),
                FeedIn.enabled == '1'
            ).all()
        feed_ids = [t_id[0] for t_id in result]
    elif urls:
        # If urls are passed, get the corresponding list of feed ids.
        result = session.query(FeedIn.id).\
            filter(FeedIn.url.in_(urls)).all()
        feed_ids = [t_id[0] for t_id in result]

    args_collection = [(feed_id, DOWNLOAD_FOLDER, 20) for feed_id in feed_ids]
    results = pool.map_async(__process_feed, args_collection).get()
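# NOTE: `__process_feed` is referenced above but not shown here. Because
# `pool.map_async` passes a single argument to each worker call, the wrapper is
# assumed to unpack the (feed_id, download_folder, timeout) tuple and delegate
# to `process_feed` below; a minimal sketch under that assumption:
def __process_feed(args):
    """Hypothetical sketch: unpack the argument tuple and call process_feed."""
    feed_id, download_folder, timeout = args
    return process_feed(feed_id, download_folder, timeout=timeout)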
def process_feed(feed_id, download_folder, timeout=None):
    logger = logging.getLogger(__name__)
    session = DBSession()
    feed_in = session.query(FeedIn).get(feed_id)
    url = feed_in.url
    file_name = ""
    try:
        file_name = download_file(url, download_folder,
                                  ext=feed_in.ext or None, timeout=timeout)
        result = preprocess(file_name, feed_in.bulk_insert, ())
        for res in result:
            logger.info("{0} {1} {2} {3} {4} {5} {6}".format(
                res['status'], url, file_name, res['inserted'],
                res['old_ads'], res['repeated_ads'], res['e_msg'] or ""))
    except Exception as e:
        logger.info("{0} {1} {2} {3} {4} {5} {6}".format(
            type(e).__name__, url, file_name, 0, 0, 0, str(e)))
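# NOTE: `download_file` is called above but defined in another module. A minimal
# sketch of the assumed helper: it fetches the feed into `download_folder` and
# returns the local file name, forcing the extension when `ext` is given. The
# real implementation may differ.
import os
import urllib.request


def download_file(url, download_folder, ext=None, timeout=None):
    """Hypothetical sketch: download `url` into `download_folder`."""
    base = os.path.basename(url.split("?")[0]) or "feed"
    if ext:
        base = "{0}.{1}".format(os.path.splitext(base)[0], ext.lstrip("."))
    file_name = os.path.join(download_folder, base)
    with urllib.request.urlopen(url, timeout=timeout) as response, \
            open(file_name, "wb") as out:
        out.write(response.read())
    return file_name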
class XmlAdMapper(AdMapper):

    def __init__(self, feed_type):
        self.map_methods = {}
        self.feed_type = feed_type
        self.db_session = DBSession()
        try:
            self.root = feed_type.mapping.filter_by(method="ROOT").one().field
        except NoResultFound:
            raise FeedMappingException(
                "No ROOT has been created for the mapping of {0}".format(
                    feed_type.id))
        except MultipleResultsFound:
            raise FeedMappingException(
                "More than one ROOT has been created for the mapping of {0}".format(
                    feed_type.id))

    def iter_from_file(self, file):
        self.__file_path = file
        self.__iteration_count = 0
        # Flag that records whether the xml file was already cleaned, to avoid infinite loops.
        self.__file_was_cleaned = False
        self.__xml_parser = etree.iterparse(self.__file_path, tag=self.root)

    def get_raw_content(self):
        """
        Returns a string with one ad's information in xml format.

        How does it work?
        The function iterates over the elements with next(self.__xml_parser).
        If an XML syntax error is found, an XMLSyntaxError exception is raised.
        If the file has not been cleaned yet (self.__file_was_cleaned is False),
        we try to clean/fix it by calling clear_file(), restart the iterator,
        skip the elements that have already been returned and call
        get_raw_content again. If XMLSyntaxError is raised again
        (self.__file_was_cleaned is True), it means clear_file() could not fix
        the file, so the exception is re-raised to avoid infinite recursion.
        """
        try:
            event, element = next(self.__xml_parser)
            raw_content = etree.tostring(element, encoding="utf-8").decode("utf-8")
            element.clear()
            self.__iteration_count += 1
            return raw_content
        except etree.XMLSyntaxError as e:
            # There is an invalid token or an invalid entity and the file wasn't cleaned yet.
            if not self.__file_was_cleaned:
                clear_file(self.__file_path)  # Removes invalid characters from the file
                self.__file_was_cleaned = True
                self.__xml_parser = etree.iterparse(
                    self.__file_path, tag=self.root)  # Restart the iterator
                # Skip the elements already returned (self.__iteration_count of them).
                for i in range(self.__iteration_count):
                    next(self.__xml_parser)
                return self.get_raw_content()
            else:
                raise FeedParseException(
                    "The XML could not be repaired. " + type(e).__name__ + str(e))

    def __load_map_methods(self):
        """ Loads in memory all map methods and their params """
        # @WARNING: For this query to work, the mysql option ONLY_FULL_GROUP_BY
        # (enabled by default since version 5.7.5) must be disabled.
        # Technical information: https://dev.mysql.com/doc/refman/5.7/en/sql-mode.html#sqlmode_only_full_group_by
        # Solution: https://www.sitepoint.com/quick-tip-how-to-permanently-change-sql-mode-in-mysql/
        methods = self.db_session.query(
            FeedTypeMapping.method,
            func.group_concat(
                FeedTypeMapping.field.op('ORDER BY')(
                    FeedTypeMapping.param_order)),
            func.group_concat(
                FeedTypeMapping.default_value.op('ORDER BY')(
                    FeedTypeMapping.param_order))).filter(
                        FeedTypeMapping.feed_type == self.feed_type,
                        FeedTypeMapping.method.in_(
                            MAP_METHODS.keys())).group_by(
                                FeedTypeMapping.method).all()

        for method_name, xpaths, default_values in methods:
            additional_params = dict([
                (param.name, param.value)
                for param in self.feed_type.additional_params.filter_by(
                    method=method_name)
            ])
            map_method = MAP_METHODS[method_name](**additional_params)
            # With the line below we end up with something like
            # {"DESCRIPCION": (DescriptionMapMethod(template="..."),
            #                  ["content/text()", "bathrooms/text()"])}
            self.map_methods[method_name] = (map_method, xpaths.split(","),
                                             default_values.split(","))
        return self

    def map(self, raw_ad):
        if not self.map_methods:
            self.__load_map_methods()
        xml = etree.fromstring(raw_ad.raw_content)
        mapped_properties = {}
        temp_ad = TempAd()
        for method_name, (map_method, xpaths, default_values) in self.map_methods.items():
            args = [
                self.extract(xml, xpath) or default_value
                for xpath, default_value in zip(xpaths, default_values)
            ]
            mapped_properties.update(map_method.map(*args, raw_ad=raw_ad))
        temp_ad.set_properties(mapped_properties)
        return temp_ad

    def exec_method(self, method_name, raw_ad):
        if not self.map_methods:
            self.__load_map_methods()
        xml = etree.fromstring(raw_ad.raw_content)
        map_method = self.map_methods[method_name][0]
        xpaths = self.map_methods[method_name][1]
        args = [self.extract(xml, xpath) for xpath in xpaths]
        return map_method.map(*args, raw_ad=raw_ad)

    def extract(self, xml, xpath):
        """ Extracts data from the xml based on its xpath """
        data = xml.xpath(xpath)
        if len(data) == 1:
            data = data[0].strip()
        elif len(data) == 0:
            data = ""
        return data
def run(loader_name, sleep_time=0):
    log_file_name = os.path.join(
        LOG_FOLDER,
        "{0}_feeds_load.log".format(dtt.datetime.today().strftime("%Y-%m-%d")))

    # start - logging configuration
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_formatter = logging.Formatter('[%(asctime)s] %(message)s')
    logger_handler = logging.FileHandler(log_file_name)
    logger_handler.setFormatter(log_formatter)
    logger.addHandler(logger_handler)
    # end - logging configuration

    loader = create_loader(loader_name)
    processed_temp_ads = []
    errors = 0
    loaded_ok = 0
    limit = 100

    query = DBSession.query(TempAd).filter(TempAd.is_ready, TempAd.ad_id == None)
    temp_ads = query.order_by(func.rand()).limit(limit).all()

    while True:
        processed_temp_ads += [temp_ad.id for temp_ad in temp_ads]

        if not temp_ads:
            logger.info("FINISHED. ads loaded OK: {0}. Errors: {1}".format(
                loaded_ok, errors))
            break

        ads_data = []
        for temp_ad in temp_ads:
            try:
                ad_data = prepare_ad_data(loader, temp_ad)
                ads_data.append(ad_data)
            except Exception as e:
                temp_ad.error_message = ("Error while preparing data to load: "
                                         + str(e))
                logger.info("{0} {1} {2}".format(temp_ad.id, temp_ad.ad_id,
                                                 temp_ad.error_message or ""))
                errors += 1

        # Load the ads.
        result = loader.load(ads_data)
        for res in result:
            temp_ad = DBSession.query(TempAd).get(res["id"])
            temp_ad.ad_id = res["ad_id"]
            temp_ad.error_message = res["error_message"]
            if res["error_message"]:
                errors += 1
            else:
                loaded_ok += 1
            logger.info("{0} {1} {2}".format(temp_ad.id, temp_ad.ad_id,
                                             temp_ad.error_message or ""))

        DBSession.commit()

        temp_ads = query.filter(~TempAd.id.in_(processed_temp_ads)).\
            order_by(func.rand()).\
            limit(limit).\
            all()

        # Sleep between batches to avoid overloading the API server.
        time.sleep(sleep_time)
def bulk_insert(self, file):
    """ Bulk insert from a file """
    self.feed_type.ad_mapper.iter_from_file(file)

    max_pending = 10000  # Max INSERTs pending to commit
    current_pending = 0  # Number of ads processed from the xml since the last flush
    inserted_ads = 0
    info = {'status': None, 'file': file, 'inserted': None, 'e_msg': None}
    pending_raw_ads = []
    record_ids = []
    old_ads = 0
    repeated_ads = 0

    while True:
        try:
            raw_ad = RawAd()
            raw_ad.raw_content = self.feed_type.ad_mapper.get_raw_content()
            raw_ad.feed_in = self

            ######################## Begin - Filter section ################################
            # @TODO: Filters should be dynamic. E.g.: implement some kind of observer pattern
            date_info = self.feed_type.ad_mapper.exec_method("FECHA", raw_ad=raw_ad)
            days = (dtt.today() - dtt.strptime(date_info["date"],
                                               date_info["_format"])).days
            ######################## End - Filter section ##################################

            if days > 30:
                old_ads += 1
                # Skip the rest of the loop body; this way we don't hit the
                # database on every iteration.
                continue

            ######################## Begin - Filter section ################################
            # @TODO: Filters should be dynamic. E.g.: implement some kind of observer pattern
            id = self.feed_type.ad_mapper.exec_method("ID", raw_ad=raw_ad)["_id_in_feed"]
            record_id = id + "," + self.feed_type.ad_mapper.exec_method(
                "URL", raw_ad=raw_ad)["link"]
            ad_exists = DBSession.execute(
                "SELECT 1 FROM fp_feeds_in_records WHERE id = :id",
                {"id": record_id}).first()
            ######################## End - Filter section ##################################

            if ad_exists:
                repeated_ads += 1
            else:
                pending_raw_ads.append({
                    "raw_ad": raw_ad.raw_content,
                    "feed_in_id": self.id
                })
                record_ids.append({"id": record_id})
                current_pending += 1

            if current_pending == max_pending:
                self.__insert(pending_raw_ads, record_ids)
                inserted_ads += current_pending
                current_pending = 0

        except StopIteration:
            if current_pending != 0:
                self.__insert(pending_raw_ads, record_ids)
                inserted_ads += current_pending
                current_pending = 0

            # Update the feed's last processed date.
            self.last_processed_date = date.today()
            DBSession.commit()

            info['status'] = 'ok'
            info['inserted'] = inserted_ads
            info['repeated_ads'] = repeated_ads
            info['old_ads'] = old_ads
            return info

        except Exception as e:
            info['status'] = type(e).__name__
            info['inserted'] = inserted_ads
            info['e_msg'] = str(e)
            info['repeated_ads'] = repeated_ads
            info['old_ads'] = old_ads
            return info
def __insert(self, pending_raw_ads, record_ids):
    DBSession.execute(RawAd.__table__.insert(), pending_raw_ads)
    DBSession.execute("INSERT INTO fp_feeds_in_records (id) VALUES (:id)",
                      record_ids)
    DBSession.commit()
    del pending_raw_ads[:]
    del record_ids[:]
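# NOTE: For reference, executing an insert construct (or textual SQL) with a
# list of parameter dicts makes SQLAlchemy issue an executemany-style batched
# INSERT, which is why pending rows are accumulated and flushed in one call
# above. A minimal illustrative call with made-up values:
#
#   DBSession.execute(
#       RawAd.__table__.insert(),
#       [{"raw_ad": "<ad/>", "feed_in_id": 1},
#        {"raw_ad": "<ad/>", "feed_in_id": 2}])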
def create_temp_ads(raw_ad_ids):
    session = DBSession()
    raw_ad_list = session.query(RawAd).filter(RawAd.id.in_(raw_ad_ids)).all()

    bulk_temp_ads = []
    bulk_props = []
    bulk_imgs = []
    processed = 0

    for raw_ad in raw_ad_list:
        try:
            temp_ad = raw_ad.map()
            temp_ad.id = raw_ad.id

            # Set temp_ad_id on every TempAdProperty instance and add them to the bulk.
            for prop_name, prop in temp_ad.properties.items():
                bulk_props.append({
                    "temp_ad_id": temp_ad.id,
                    "name": prop.name,
                    "value": prop.value
                })

            # Images to download later; only the external path is stored for now.
            for image in temp_ad.images:
                bulk_imgs.append({
                    "temp_ad_id": temp_ad.id,
                    "external_path": image.external_path,
                })

            # Add the ad itself to the bulk.
            bulk_temp_ads.append({
                "id": temp_ad.id,
                "feed_in_location_id": temp_ad.feed_in_location_id,
                "feed_in_subcat_id": temp_ad.feed_in_subcat_id,
                "feed_in_id": raw_ad.feed_in_id
            })
            processed += 1
        except Exception as e:
            logger = logging.getLogger(__name__)
            logger.info("RawId: {0} {1} {2}".format(raw_ad.id,
                                                    type(e).__name__, str(e)))

    if bulk_temp_ads:
        session.execute(TempAd.__table__.insert(), bulk_temp_ads)
    if bulk_props:
        session.execute(TempAdProperty.__table__.insert(), bulk_props)
    if bulk_imgs:
        session.execute(TempAdImage.__table__.insert(), bulk_imgs)

    session.commit()
    return processed