class GooglePlayCrawler:
    """Entry point that launches the key word extractor and every feeder/crawler script."""

    def __init__(self):
        log_path = os.path.join(AppConfig.get_log_dir(), 'google_play_crawler.log')
        self.logger = Logger(log_path, 'google_play_crawler', 10 * 1024 * 1024, 2)

    def start(self):
        """Start the key word extractor, then each producer/consumer script pair in order.

        Script paths are resolved against the current working directory.
        """
        self.logger.info('Start key word extractor thread....')
        self._start_key_word_extractor()

        # (log message, sub-directories below the working directory, script name stem)
        stages = (
            ('Start category feeder...', ('feed', 'category'), 'category'),
            ('Start developer feeder...', ('feed', 'developer'), 'developer'),
            ('Start search feeder...', ('feed', 'search'), 'search'),
            ('Start similar feeder...', ('feed', 'similar'), 'similar'),
            ('Start app detail crawler...', ('crawler',), 'app'),
        )
        for banner, folders, stem in stages:
            self.logger.info(banner)
            producer, consumer = [
                os.path.join(os.getcwd(), *(folders + ('%s_%s.py' % (stem, role),)))
                for role in ('producer', 'consumer')
            ]
            # The producer is always launched before its consumer.
            self._start_process(producer)
            self._start_process(consumer)

    def _start_key_word_extractor(self):
        """Create a KeyWordExtractor that shares this logger and start it."""
        KeyWordExtractor(self.logger).start()

    def _start_process(self, file_path):
        """Run ``python <file_path>`` via Command.run_without_wait; failures are only logged."""
        try:
            Command.run_without_wait(['python', file_path])
        except Exception:
            self.logger.exception('Run script %s error' % file_path)
class KeyWordConsumer:
    """RabbitMQ consumer that runs a Google Play search for each key word it receives."""

    def __init__(self, log_file, log_name):
        self.logger = Logger(log_file, log_name, 10*1024*1024, 2)
        self._db_conn = None

    def start(self):
        """Connect to RabbitMQ and MySQL, then consume messages until a connection cannot be made.

        Returns early (after logging) when the RabbitMQ consumer or the
        database connection cannot be created.
        """
        rabbit_topic = RabbitTopic.init_rabbitmq_consumer(EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY], self.logger)
        if not rabbit_topic:
            self.logger.debug('Construct key word consumer error')
            return
        self._conn_db()
        if not self._db_conn:
            self.logger.exception('Connect to database error')
            return
        while 1:
            try:
                rabbit_topic.start_consuming(self._callback, QUEUE_NAME)
            except ConnectionClosed:
                self.logger.debug('Connection to rabbitmq server closed, re-connecting...')
                rabbit_topic = RabbitTopic.init_rabbitmq_consumer(EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY], self.logger)
                # The factory can return a falsy value (see the check above);
                # without this guard the next loop iteration would fail with
                # an uncaught AttributeError on None.
                if not rabbit_topic:
                    self.logger.debug('Re-connect to rabbitmq server failed, stop consuming')
                    return

    def _callback(self, channel, method, properties, key_word):
        """Search the store for ``key_word``, then ack the message and mark it consumed.

        The message is acknowledged and marked consumed even when the search
        raises, so a failing key word is not retried.
        """
        self.logger.info(os.linesep)
        self.logger.info('----> Get body message %s and start searching this key word...<----' % key_word)
        try:
            url = 'https://play.google.com/store/search?q=%s&c=apps' % key_word
            search_web_driver = SearchWebDriver(url, self._db_conn, self.logger)
            search_web_driver.search()
        except Exception:
            self.logger.exception('Search key word %s error' % key_word)
        channel.basic_ack(delivery_tag=method.delivery_tag)
        self.logger.info('Set key word %s as consumed' % key_word)
        self._set_key_word_consumed(key_word)

    def _conn_db(self):
        """Open the MySQL connection; on failure log it and leave ``_db_conn`` unset."""
        try:
            self._db_conn = util.conn_mysql_db()
        except Exception:
            self.logger.exception('Connect to database error')

    def _set_key_word_consumed(self, key_word):
        """Set the CONSUMED status on the ``key_word`` row; failures are only logged."""
        query = 'UPDATE key_word SET status=%s WHERE key_word=%s'
        try:
            MySQLDBUtil.update(query, (CONSUMED, key_word), self._db_conn)
        except Exception:
            self.logger.exception('Set key word %s as consumed error' % key_word)
class CategoryProducer:
    """Producer that publishes un-published categories from MySQL to RabbitMQ."""

    def __init__(self):
        self.logger = Logger(LOG_FILE, LOG_NAME, 10*1024*1024, 2)
        self._db_conn = None

    def start(self):
        """Publish categories in an endless loop, resetting the table once all are used.

        Returns early when the RabbitMQ producer or the database connection
        cannot be created, or when a re-connect to RabbitMQ fails.
        """
        rabbit_topic = RabbitTopic.init_rabbitmq_producer(EXCHANGE_NAME, self.logger)
        if not rabbit_topic:
            return
        self._conn_db()
        if not self._db_conn:
            self.logger.exception('Connect database error')
            return
        while 1:
            try:
                if self._is_no_more_records():
                    self.logger.info('There are no more records, wait...')
                    time.sleep(PRODUCE_WAIT_TIME)
                    # Still no more available records, then reset...
                    if self._is_no_more_records():
                        self.logger.info('Still there are no more records, reset all records...')
                        self._reset_category()
                category_list = self._fetch_category_list()
                for category in category_list:
                    if RabbitTopic.is_queue_full(QUEUE_NAME, QUEUE_LIMIT, self.logger):
                        self.logger.info('Queue %s is full, wait...' % QUEUE_NAME)
                        time.sleep(PRODUCE_WAIT_TIME)
                        # Skipped items keep their un-published status and are
                        # picked up again by a later fetch.
                        continue
                    try:
                        rabbit_topic.publish(ROUTING_KEY, category)
                        self.logger.info('Publish category %s and update the status' % category)
                        self._update_status(PUBLISHED, category)
                    except ConnectionClosed:
                        self.logger.debug('Connection to rabbitmq server closed, re-connecting...')
                        rabbit_topic = RabbitTopic.init_rabbitmq_producer(EXCHANGE_NAME, self.logger)
                        # A falsy producer would make every later publish fail
                        # with AttributeError in a tight loop; stop instead.
                        if not rabbit_topic:
                            self.logger.debug('Re-connect to rabbitmq server failed, stop producing')
                            return
            except Exception:
                self.logger.exception('Publish category error')

    def _conn_db(self):
        """Open the MySQL connection; on failure log it and leave ``_db_conn`` unset."""
        try:
            self._db_conn = util.conn_mysql_db()
        except Exception:
            self.logger.exception('Connect database error')

    def _fetch_category_list(self):
        """Return up to QUEUE_LIMIT un-published categories (empty list on error)."""
        category_list = []
        self.logger.info('Get un-published category list...')
        query = 'SELECT category FROM category WHERE status=%s LIMIT %s'
        try:
            results = MySQLDBUtil.fetch_multiple_rows(query, (UN_PUBLISHED, QUEUE_LIMIT), self._db_conn)
            for result in results:
                (category,) = result
                category_list.append(category)
        except Exception:
            self.logger.exception('Query un-published category error')
        return category_list

    def _is_no_more_records(self):
        """Return True only when the un-published count is exactly 0; False on error."""
        query = 'SELECT COUNT(*) FROM category WHERE status=%s'
        try:
            result = MySQLDBUtil.fetch_single_row(query, (UN_PUBLISHED,), self._db_conn)
            if result:
                (count,) = result
                if count == 0:
                    return True
        except Exception:
            self.logger.exception('Check available records error')
        return False

    def _reset_category(self):
        """Flip every category back to the un-published status."""
        try:
            self.logger.info('All category have been published, reset all to un-published status')
            self._update_status(UN_PUBLISHED)
        except Exception:
            self.logger.exception('Reset category table error')

    @staticmethod
    def _build_status_update(status, category=None):
        """Return ``(query, params)`` for a status update.

        Only ``category is None`` selects the table-wide update. An empty
        string is treated as a real key so it can never rewrite every row.
        """
        if category is None:
            return 'UPDATE category SET status=%s', (status,)
        return 'UPDATE category SET status=%s WHERE category=%s', (status, category)

    def _update_status(self, status, category=None):
        """Set ``status`` on one category, or on all rows when ``category`` is None."""
        query, params = self._build_status_update(status, category)
        try:
            MySQLDBUtil.update(query, params, self._db_conn)
        except Exception:
            self.logger.exception('Update record status')
class AppProducer:
    """Producer that publishes un-published package names from MySQL to RabbitMQ."""

    def __init__(self):
        self.logger = Logger(LOG_FILE, LOG_NAME, 10*1024*1024, 2)
        self._db_conn = None

    def start(self):
        """Publish package names in an endless loop.

        Returns early when the RabbitMQ producer or the database connection
        cannot be created, or when a re-connect to RabbitMQ fails.
        """
        rabbit_topic = RabbitTopic.init_rabbitmq_producer(EXCHANGE_NAME, self.logger)
        if not rabbit_topic:
            return
        self._conn_db()
        if not self._db_conn:
            self.logger.exception('Connect database error')
            return
        while 1:
            try:
                if self._is_no_more_records():
                    self.logger.info('There are no more available records, wait...')
                    time.sleep(PRODUCE_WAIT_TIME)
                package_list = self._fetch_package_list()
                for package_name in package_list:
                    if RabbitTopic.is_queue_full(QUEUE_NAME, QUEUE_LIMIT, self.logger):
                        self.logger.info('Queue %s is full, wait to consume...' % QUEUE_NAME)
                        time.sleep(PRODUCE_WAIT_TIME)
                        # Skipped items keep their un-published status and are
                        # picked up again by a later fetch.
                        continue
                    try:
                        rabbit_topic.publish(ROUTING_KEY, package_name)
                        self.logger.info('Publish package %s and update status' % package_name)
                        self._update_status(PUBLISHED, package_name)
                    except ConnectionClosed:
                        self.logger.debug('Connection to rabbitmq server closed, re-connecting...')
                        rabbit_topic = RabbitTopic.init_rabbitmq_producer(EXCHANGE_NAME, self.logger)
                        # A falsy producer would make every later publish fail
                        # with AttributeError in a tight loop; stop instead.
                        if not rabbit_topic:
                            self.logger.debug('Re-connect to rabbitmq server failed, stop producing')
                            return
            except Exception:
                self.logger.exception('Publish similar app package name error')

    def _init_rabbitmq(self):
        """Build a RabbitTopic producer directly; returns None when construction fails.

        NOTE(review): not called anywhere in this class; ``start`` uses
        ``RabbitTopic.init_rabbitmq_producer`` instead.
        """
        try:
            rabbit_topic = RabbitTopic(EXCHANGE_NAME)
            rabbit_topic.construct_producer()
        except Exception:
            self.logger.exception('Construct app producer error')
            return None
        return rabbit_topic

    def _conn_db(self):
        """Open the MySQL connection; on failure log it and leave ``_db_conn`` unset."""
        try:
            self._db_conn = util.conn_mysql_db()
        except Exception:
            self.logger.exception('Connect database error')

    def _fetch_package_list(self):
        """Return up to QUEUE_LIMIT un-published package names (empty list on error)."""
        package_list = []
        self.logger.info('Get un-published package list...')
        query = 'SELECT package_name FROM package_name WHERE status=%s LIMIT %s'
        try:
            results = MySQLDBUtil.fetch_multiple_rows(query, (UN_PUBLISHED, QUEUE_LIMIT), self._db_conn)
            for result in results:
                (package_name,) = result
                package_list.append(package_name)
        except Exception:
            self.logger.exception('Query un-used package name error')
        return package_list

    def _is_no_more_records(self):
        """Return True only when the un-published count is exactly 0; False on error."""
        query = 'SELECT COUNT(*) FROM package_name WHERE status=%s'
        try:
            result = MySQLDBUtil.fetch_single_row(query, (UN_PUBLISHED,), self._db_conn)
            if result:
                (count, ) = result
                if count == 0:
                    return True
        except Exception:
            self.logger.exception('Check if there is no more packages error')
        return False

    @staticmethod
    def _build_status_update(status, package_name=None):
        """Return ``(query, params)`` for a status update.

        Only ``package_name is None`` selects the table-wide update. An empty
        string is treated as a real key so it can never rewrite every row.
        """
        if package_name is None:
            return 'UPDATE package_name SET status=%s', (status,)
        return 'UPDATE package_name SET status=%s WHERE package_name=%s', (status, package_name)

    def _update_status(self, status, package_name=None):
        """Set ``status`` on one package, or on all rows when ``package_name`` is None."""
        query, params = self._build_status_update(status, package_name)
        try:
            MySQLDBUtil.update(query, params, self._db_conn)
        except Exception:
            self.logger.exception('Update record status')
class DeveloperProducer:
    """Producer that publishes un-published developers from MySQL to RabbitMQ."""

    def __init__(self):
        self.logger = Logger(LOG_FILE, LOG_NAME, 10 * 1024 * 1024, 2)
        self._db_conn = None

    def start(self):
        """Publish developers in an endless loop.

        Returns early when the RabbitMQ producer or the database connection
        cannot be created, or when a re-connect to RabbitMQ fails.
        """
        rabbit_topic = RabbitTopic.init_rabbitmq_producer(
            EXCHANGE_NAME, self.logger)
        if not rabbit_topic:
            return
        self._conn_db()
        if not self._db_conn:
            self.logger.exception('Connect database error')
            return
        while 1:
            try:
                if self._is_no_more_records():
                    self.logger.info('There are no more records, wait...')
                    time.sleep(PRODUCE_WAIT_TIME)
                developer_list = self._fetch_developer_list()
                for developer in developer_list:
                    if RabbitTopic.is_queue_full(QUEUE_NAME, QUEUE_LIMIT,
                                                 self.logger):
                        self.logger.info('Queue %s is full, wait...' % QUEUE_NAME)
                        time.sleep(PRODUCE_WAIT_TIME)
                        # Skipped items keep their un-published status and are
                        # picked up again by a later fetch.
                        continue
                    try:
                        rabbit_topic.publish(ROUTING_KEY, developer)
                        self.logger.info(
                            'Publish developer %s and update status' % developer)
                        self._update_status(PUBLISHED, developer)
                    except ConnectionClosed:
                        self.logger.debug(
                            'Connection to rabbitmq server closed, re-connecting...'
                        )
                        rabbit_topic = RabbitTopic.init_rabbitmq_producer(
                            EXCHANGE_NAME, self.logger)
                        # A falsy producer would make every later publish fail
                        # with AttributeError in a tight loop; stop instead.
                        if not rabbit_topic:
                            self.logger.debug(
                                'Re-connect to rabbitmq server failed, stop producing')
                            return
            except Exception:
                self.logger.exception('Publish developer error')

    def _conn_db(self):
        """Open the MySQL connection; on failure log it and leave ``_db_conn`` unset."""
        try:
            self._db_conn = util.conn_mysql_db()
        except Exception:
            self.logger.exception('Connect database error')

    def _fetch_developer_list(self):
        """Return up to QUEUE_LIMIT un-published developer names (empty list on error)."""
        developer_list = []
        self.logger.info('Get un-published developer list...')
        query = 'SELECT name FROM developer WHERE status=%s LIMIT %s'
        try:
            results = MySQLDBUtil.fetch_multiple_rows(
                query, (UN_PUBLISHED, QUEUE_LIMIT), self._db_conn)
            for result in results:
                (developer, ) = result
                developer_list.append(developer)
        except Exception:
            self.logger.exception('Query un-published developer error')
        return developer_list

    def _is_no_more_records(self):
        """Return True only when the un-published count is exactly 0; False on error."""
        query = 'SELECT COUNT(*) FROM developer WHERE status=%s'
        try:
            result = MySQLDBUtil.fetch_single_row(query, (UN_PUBLISHED, ),
                                                  self._db_conn)
            if result:
                (count, ) = result
                if count == 0:
                    return True
        except Exception:
            self.logger.exception('Check if there is no more developers error')
        return False

    @staticmethod
    def _build_status_update(status, developer=None):
        """Return ``(query, params)`` for a status update.

        Only ``developer is None`` selects the table-wide update. An empty
        string is treated as a real key so it can never rewrite every row.
        """
        if developer is None:
            return 'UPDATE developer SET status=%s', (status, )
        return 'UPDATE developer SET status=%s WHERE name=%s', (status, developer)

    def _update_status(self, status, developer=None):
        """Set ``status`` on one developer, or on all rows when ``developer`` is None."""
        query, params = self._build_status_update(status, developer)
        try:
            MySQLDBUtil.update(query, params, self._db_conn)
        except Exception:
            self.logger.exception('Update record status')
class DeveloperConsumer:
    """RabbitMQ consumer that queries the store page of each developer it receives."""

    def __init__(self, log_file, log_name):
        self.logger = Logger(log_file, log_name, 10 * 1024 * 1024, 2)
        self._db_conn = None

    def start(self):
        """Connect to RabbitMQ and MySQL, then consume messages until a connection cannot be made.

        Returns early (after logging) when the RabbitMQ consumer or the
        database connection cannot be created.
        """
        rabbit_topic = RabbitTopic.init_rabbitmq_consumer(
            EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY], self.logger)
        if not rabbit_topic:
            self.logger.debug('Construct developer consumer error')
            return
        self._conn_db()
        if not self._db_conn:
            self.logger.exception('Connect to database error')
            return
        while 1:
            try:
                rabbit_topic.start_consuming(self._callback, QUEUE_NAME)
            except ConnectionClosed:
                self.logger.debug(
                    'Connection to rabbitmq server closed, re-connecting...')
                rabbit_topic = RabbitTopic.init_rabbitmq_consumer(
                    EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY],
                    self.logger)
                # The factory can return a falsy value (see the check above);
                # without this guard the next loop iteration would fail with
                # an uncaught AttributeError on None.
                if not rabbit_topic:
                    self.logger.debug(
                        'Re-connect to rabbitmq server failed, stop consuming')
                    return

    def _callback(self, channel, method, properties, developer):
        """Query the developer's apps, then ack the message and mark it consumed.

        An all-digit value is queried through DEVELOPER_ID_HOST_URL, anything
        else through DEVELOPER_NAME_HOST_URL. The message is acknowledged and
        marked consumed even when the query raises.
        """
        self.logger.info(os.linesep)
        self.logger.info(
            '----> Get body message %s and start query this developer... <----'
            % developer)
        try:
            if developer.isdigit():
                self.logger.info('Developer info is all digit numbers')
                url = '%s=%s' % (DEVELOPER_ID_HOST_URL, developer)
            else:
                self.logger.info('Developer info is non digit numbers')
                url = '%s=%s' % (DEVELOPER_NAME_HOST_URL, developer)
            self.logger.info('Query developer apps with url %s' % url)
            developer_web_driver = DeveloperWebDriver(url, self._db_conn,
                                                      self.logger)
            developer_web_driver.query()
        except Exception:
            self.logger.exception('Query developer %s error' % developer)
        channel.basic_ack(delivery_tag=method.delivery_tag)
        self.logger.info('Set developer %s as consumed' % developer)
        self._set_developer_consumed(developer)

    def _conn_db(self):
        """Open the MySQL connection; on failure log it and leave ``_db_conn`` unset."""
        try:
            self._db_conn = util.conn_mysql_db()
        except Exception:
            self.logger.exception('Connect database error')

    def _set_developer_consumed(self, developer):
        """Set the CONSUMED status on the developer row; failures are only logged."""
        query = 'UPDATE developer SET status=%s WHERE name=%s'
        try:
            MySQLDBUtil.update(query, (CONSUMED, developer), self._db_conn)
        except Exception:
            self.logger.exception('Set devloper %s as consumed error' %
                                  developer)
class AppConsumer:
    """RabbitMQ consumer that downloads, parses and stores the detail page of each package."""

    def __init__(self, log_file, log_name):
        self.logger = Logger(log_file, log_name, 10*1024*1024, 2)
        self._mysql_db_conn = None
        self._mongo_db_conn = None

    def start(self):
        """Connect to RabbitMQ, MySQL and MongoDB, then consume messages.

        Returns early (after logging) when the RabbitMQ consumer or either
        database connection cannot be created, or when a re-connect fails.
        """
        rabbit_topic = RabbitTopic.init_rabbitmq_consumer(EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY], self.logger)
        if not rabbit_topic:
            self.logger.debug('Construct app consumer error')
            return
        self._conn_db()
        if not self._mysql_db_conn or not self._mongo_db_conn:
            self.logger.exception('Connect to database error')
            return
        while 1:
            try:
                rabbit_topic.start_consuming(self._callback, QUEUE_NAME)
            except ConnectionClosed:
                self.logger.debug('Connection to rabbitmq server closed, re-connecting...')
                rabbit_topic = RabbitTopic.init_rabbitmq_consumer(EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY], self.logger)
                # The factory can return a falsy value (see the check above);
                # without this guard the next loop iteration would fail with
                # an uncaught AttributeError on None.
                if not rabbit_topic:
                    self.logger.debug('Re-connect to rabbitmq server failed, stop consuming')
                    return

    def _callback(self, channel, method, properties, package_name):
        """Fetch and store the app detail for ``package_name``, then ack and mark it consumed.

        The message is acknowledged and marked consumed even when fetching,
        parsing or storing raises.
        """
        self.logger.info(os.linesep)
        self.logger.info('----> Get body message %s and start get app detail... <-----' % package_name)
        try:
            url = '%s=%s' % (APP_HOST_URL, package_name)
            self.logger.info('Query app detail with url %s' % url)
            app_detail = self._parse_web_content(url)
            if not app_detail:
                self.logger.info('App detail extraction fail')
            else:
                self.logger.info('Store app detail...')
                app_detail.package_name = package_name
                self._store_app_detail(app_detail)
                self.logger.info('Insert package name %s into similar app table...' % package_name)
                self._store_package_name_similar(package_name)
                self.logger.info('Store app description...')
                self._store_app_description(app_detail)
                self.logger.info('Store app developer...')
                self._store_app_developer(app_detail)
        except Exception:
            self.logger.exception('Query app detail %s error' % package_name)
        channel.basic_ack(delivery_tag=method.delivery_tag)
        self.logger.info('Set package name %s as consumed' % package_name)
        self._set_package_consumed(package_name)

    def _conn_db(self):
        """Open the MySQL and MongoDB connections; on failure log it and leave the rest unset."""
        try:
            self._mysql_db_conn = util.conn_mysql_db()
            self._mongo_db_conn = util.conn_mongo_db()
        except Exception:
            self.logger.exception('Connect database error')

    def _parse_web_content(self, url):
        """Download ``url`` and parse it into an app detail object.

        The lxml parser is tried first; if it raises, the BeautifulSoup
        parser is tried on the raw response bytes. Returns None when the
        download, decoding or both parsers fail, or the page is empty.
        """
        app_detail = None
        try:
            response = requests.get(url)
        except Exception:
            self.logger.exception('Get web content from url %s error' % url)
            return app_detail
        try:
            web_content = util.decode_utf8(response.content)
        except Exception:
            self.logger.exception('Decode web content error')
            return app_detail
        if not web_content:
            self.logger.debug('Web content is empty, no need to parse')
            return app_detail
        self.logger.info('Get web content successfully,try to parse it...')
        app_detail_lxml_parser = AppDetailLxmlParser(web_content, self.logger)
        try:
            app_detail_lxml_parser.parse()
            app_detail = app_detail_lxml_parser.app_detail
        except Exception:
            self.logger.exception('Use lxml to parse the web content error, try to use the backup one beautiful soup...')
            app_detail_b4_parser = AppDetailB4Parser(response.content, self.logger)
            try:
                app_detail_b4_parser.parse()
                app_detail = app_detail_b4_parser.app_detail
            except Exception:
                self.logger.exception('Use beautiful soup to parse the web content error')
        return app_detail

    def _store_app_detail(self, app_detail):
        """Insert the JSON form of ``app_detail`` into the ``app_detail`` Mongo collection."""
        if not app_detail:
            self.logger.debug('No app detail content, cannot store into database')
            return
        app_detail_json = app_detail.to_json()
        try:
            MongoDBUtil.insert(app_detail_json, self._mongo_db_conn, 'app_detail')
        except Exception:
            self.logger.exception('Store app detail content into mongo db error')

    def _run_insert(self, query, params, error_message):
        """Run a parameterised MySQL insert; log ``error_message`` and return False on failure."""
        try:
            MySQLDBUtil.insert(query, params, self._mysql_db_conn)
        except Exception:
            self.logger.exception(error_message)
            return False
        return True

    def _store_package_name_similar(self, package_name):
        """Queue ``package_name`` in the ``similar_app`` table (duplicates ignored)."""
        # Values travel as bound parameters, never formatted into the SQL text:
        # package names come off the message queue and are not trusted.
        query = 'INSERT IGNORE INTO similar_app (package_name) VALUES (%s)'
        self._run_insert(query, (package_name,),
                         'Store package name into similar app database fail')

    def _store_app_description(self, app_detail):
        """Store the joined app description in ``raw_text``; empty descriptions are skipped."""
        if not app_detail:
            return
        description = ' '.join(app_detail.description)
        # Quote stripping predates the parameterised insert; it is retained so
        # the stored text stays the same as before.
        description = description.replace('"', '').replace('\'', '')
        if not description:
            return
        query = 'INSERT INTO raw_text (text) VALUES (%s)'
        self._run_insert(query, (description,), 'Store app description error')

    def _store_app_developer(self, app_detail):
        """Store the developer taken from the ``id=`` part of the developer link.

        Nothing is stored when the link is missing or does not contain
        exactly one ``id=`` separator.
        """
        if not app_detail:
            return
        developer_link = app_detail.developer_link
        if not developer_link:
            return
        items = developer_link.split('id=')
        if len(items) != 2:
            return
        developer_name = items[-1]
        query = 'INSERT IGNORE INTO developer (name) VALUES (%s)'
        if self._run_insert(query, (developer_name,), 'Store app developer error'):
            self.logger.info('Stored app developer %s' % developer_name)

    def _set_package_consumed(self, package_name):
        """Set the CONSUMED status on the package row; failures are only logged."""
        query = 'UPDATE package_name SET status=%s WHERE package_name=%s'
        try:
            MySQLDBUtil.update(query, (CONSUMED, package_name), self._mysql_db_conn)
        except Exception:
            self.logger.exception('Set package name %s as consumed error' % package_name)
class SimilarConsumer:
    """RabbitMQ consumer that collects the package names listed on an app's similar-apps page."""

    def __init__(self, log_file, log_name):
        self.logger = Logger(log_file, log_name, 10 * 1024 * 1024, 2)
        self._db_conn = None

    def start(self):
        """Connect to RabbitMQ and MySQL, then consume messages until a connection cannot be made.

        Returns early (after logging) when the RabbitMQ consumer or the
        database connection cannot be created.
        """
        rabbit_topic = RabbitTopic.init_rabbitmq_consumer(
            EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY], self.logger)
        if not rabbit_topic:
            self.logger.debug('Construct similar consumer error')
            return
        self._conn_db()
        if not self._db_conn:
            self.logger.exception('Connect to database error')
            return
        while 1:
            try:
                rabbit_topic.start_consuming(self._callback, QUEUE_NAME)
            except ConnectionClosed:
                self.logger.debug(
                    'Connection to rabbitmq server closed, re-connecting...')
                rabbit_topic = RabbitTopic.init_rabbitmq_consumer(
                    EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY],
                    self.logger)
                # The factory can return a falsy value (see the check above);
                # without this guard the next loop iteration would fail with
                # an uncaught AttributeError on None.
                if not rabbit_topic:
                    self.logger.debug(
                        'Re-connect to rabbitmq server failed, stop consuming')
                    return

    def _callback(self, channel, method, properties, package_name):
        """Store the apps similar to ``package_name``, then ack and mark it consumed.

        The message is acknowledged and marked consumed even when the query
        or the store step raises.
        """
        self.logger.info(os.linesep)
        self.logger.info(
            '----> Get body message %s and start query apps similar to this... <----'
            % package_name)
        try:
            url = '%s=%s' % (SIMILAR_HOST_URL, package_name)
            self.logger.info('Query similar apps with url %s' % url)
            package_names = self._extract_package_names(url)
            self.logger.info('Store package names...')
            self._store_package_names(package_names)
        except Exception:
            self.logger.exception('Query similar apps %s error' % package_name)
        channel.basic_ack(delivery_tag=method.delivery_tag)
        self.logger.info('Set package name %s as consumed' % package_name)
        self._set_package_consumed(package_name)

    def _conn_db(self):
        """Open the MySQL connection; on failure log it and leave ``_db_conn`` unset."""
        try:
            self._db_conn = util.conn_mysql_db()
        except Exception:
            self.logger.exception('Connect database error')

    def _extract_package_names(self, url):
        """Return the set of package names linked from ``url``.

        Each name is the text after the last ``id=`` in the ``href`` of an
        ``<a class="title">`` element. An empty set is returned when the
        request fails or the response status is not OK.
        """
        package_names = set()
        try:
            response = requests.get(url)
        except Exception:
            self.logger.exception('Get content with url %s error' % url)
            return package_names
        if response.status_code == requests.codes.ok:
            html_tree = html.fromstring(response.content)
            app_links = html_tree.xpath('//a[@class="title"]/@href')
            for app_link in app_links:
                try:
                    package_names.add(app_link.split('id=')[-1])
                except Exception:
                    self.logger.exception(
                        'Extract package from link %s error' % app_link)
        else:
            self.logger.debug(
                'Access similar app url %s and returns wrong response code %d'
                % (url, response.status_code))
        return package_names

    @staticmethod
    def _build_package_insert(package_names):
        """Return ``(query, params)`` for a multi-row insert, or None when there is nothing to insert.

        One ``(%s)`` placeholder is emitted per name so the values are bound
        by the driver instead of being formatted into the SQL text.
        """
        names = tuple(package_names)
        if not names:
            return None
        placeholders = ','.join(['(%s)'] * len(names))
        query = 'INSERT IGNORE INTO package_name (package_name) VALUES ' + placeholders
        return query, names

    def _store_package_names(self, package_names):
        """Insert the scraped package names into ``package_name`` (duplicates ignored)."""
        # Names are scraped from a web page and must not be trusted as SQL text.
        statement = self._build_package_insert(package_names)
        if statement is None:
            return
        query, params = statement
        try:
            MySQLDBUtil.insert(query, params, self._db_conn)
        except Exception:
            self.logger.exception('Store package names into database fail')

    def _set_package_consumed(self, package_name):
        """Set the CONSUMED status on the ``similar_app`` row; failures are only logged."""
        query = 'UPDATE similar_app SET status=%s WHERE package_name=%s'
        try:
            MySQLDBUtil.update(query, (CONSUMED, package_name), self._db_conn)
        except Exception:
            self.logger.exception('Set package name %s as consumed error' %
                                  package_name)
class CategoryConsumer:
    """RabbitMQ consumer that crawls the "see more" pages of each category it receives."""

    def __init__(self, log_file, log_name):
        self.logger = Logger(log_file, log_name, 10 * 1024 * 1024, 2)
        self._db_conn = None

    def start(self):
        """Connect to RabbitMQ and MySQL, then consume messages until a connection cannot be made.

        Returns early (after logging) when the RabbitMQ consumer or the
        database connection cannot be created.
        """
        rabbit_topic = RabbitTopic.init_rabbitmq_consumer(
            EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY], self.logger)
        if not rabbit_topic:
            self.logger.debug('Construct category consumer error')
            return
        self._conn_db()
        if not self._db_conn:
            self.logger.exception('Connect to database error')
            return
        while 1:
            try:
                rabbit_topic.start_consuming(self._callback, QUEUE_NAME)
            except ConnectionClosed:
                self.logger.debug(
                    'Connection to rabbitmq server closed, re-connecting...')
                rabbit_topic = RabbitTopic.init_rabbitmq_consumer(
                    EXCHANGE_NAME, QUEUE_NAME, QUEUE_LIMIT, [ROUTING_KEY],
                    self.logger)
                # The factory can return a falsy value (see the check above);
                # without this guard the next loop iteration would fail with
                # an uncaught AttributeError on None.
                if not rabbit_topic:
                    self.logger.debug(
                        'Re-connect to rabbitmq server failed, stop consuming')
                    return

    def _callback(self, channel, method, properties, category):
        """Crawl every detail page of ``category``, then ack and mark it consumed.

        The message is always acknowledged and marked consumed: when the
        crawl succeeds, when it raises, and when no detail URLs were found.
        """
        self.logger.info(os.linesep)
        self.logger.info(
            '----> Get body message %s and start query this category... <----'
            % category)
        try:
            detail_urls = self._parse_detail_urls(category)
            if not detail_urls:
                # No early return here: returning would skip basic_ack below
                # and leave the message un-acknowledged.
                self.logger.debug('No detail category urls got')
            for detail_url in detail_urls:
                self.logger.info('Query detail category url %s' % detail_url)
                category_web_driver = CategoryWebDriver(
                    detail_url, self._db_conn, self.logger)
                category_web_driver.query()
                time.sleep(10)
        except Exception:
            self.logger.exception('Query category %s error' % category)
        channel.basic_ack(delivery_tag=method.delivery_tag)
        self.logger.info('Set category %s as consumed' % category)
        self._set_category_consumed(category)

    def _conn_db(self):
        """Open the MySQL connection; on failure log it and leave ``_db_conn`` unset."""
        try:
            self._db_conn = util.conn_mysql_db()
        except Exception:
            self.logger.exception('Connect database error')

    # see more
    def _parse_detail_urls(self, category):
        """Return the set of absolute "see more" URLs found on the category page.

        An empty set is returned when the request fails, the status is not
        OK, or no matching links exist.
        """
        detail_urls = set()
        category_url = '%s/%s' % (CATEGORY_HOST_URL, category)
        try:
            response = requests.get(category_url)
            if response.status_code == requests.codes.ok:
                # html is from lxml to parse html page
                html_tree = html.fromstring(response.content)
                category_detail_links = html_tree.xpath(
                    '//a[@class="see-more play-button small id-track-click apps id-responsive-see-more"]//@href'
                )
                for category_detail_link in category_detail_links:
                    category_detail_link = category_detail_link.strip()
                    detail_urls.add('https://play.google.com%s' %
                                    category_detail_link)
            else:
                self.logger.debug(
                    'Access category url %s and returns wrong response code %d'
                    % (category_url, response.status_code))
        except Exception:
            self.logger.exception('Get category web page error')
        return detail_urls

    def _set_category_consumed(self, category):
        """Set the CONSUMED status on the category row; failures are only logged."""
        query = 'UPDATE category SET status=%s WHERE category=%s'
        try:
            MySQLDBUtil.update(query, (CONSUMED, category), self._db_conn)
        except Exception:
            self.logger.exception('Set category %s as consumed error' %
                                  category)
from common.logger import Logger

log = Logger()

# Name of the host application whose manager imported successfully. It must be
# initialised up front: otherwise the check below raises NameError, not the
# intended ImportError, when neither manager can be imported.
app = None

try:
    from apps.mayaManager import MayaManager
    app = 'maya'
except ImportError:
    pass

try:
    from apps.houdiniManager import HoudiniManager
    # When both managers import, this assignment wins.
    app = 'houdini'
except ImportError:
    pass

if not app:
    msg = "Failed to load manager! Make sure the tool and its dependencies are in PYTHONPATH"
    log.exception(msg)
    raise ImportError(msg)


def run(lights_count, modes=None, radius=1000, blend=25):
    """Collect lights from the current selection and extract them.

    Args:
        lights_count: forwarded to ``manager.extractLights``.
        modes: forwarded to ``manager.extractLights``; defaults to a fresh
            empty list on every call.
        radius: forwarded to ``manager.extractLights``.
        blend: forwarded to ``manager.extractLights``.

    Returns:
        The MayaManager or HoudiniManager instance that did the extraction.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if modes is None:
        modes = []
    if app == 'maya':
        manager = MayaManager()
        manager.getLights(MayaManager.getSelection())
    else:
        manager = HoudiniManager()
        manager.getLights(HoudiniManager.getSelection())
    manager.extractLights(lights_count, modes, radius, blend)
    return manager