def __init__(self, target):
    """Prepare crawler state for one configured target.

    Args:
        target: Key looked up inside the ``"targets"`` section of the
            mapping returned by ``getConfig()``.

    Raises:
        Exception: If the ``"targets"`` section has no truthy entry for
            ``target`` (the same message is logged at error level first).
    """
    super().__init__()
    self.crawler = WebCrawler()

    known_targets = getConfig().get("targets", {})
    settings = known_targets.get(target)
    self.logger = getLogger(self.__class__.__name__)

    # Guard clause: nothing else can be initialised without a target entry.
    if not settings:
        self.logger.error("target is not found in config.")
        raise Exception("target is not found in config.")

    self.logger.info(f"Application is processing target {target}")
    self.target_config = settings

    # Numeric settings are passed through int() before being stored.
    global_settings = getConfig()["configs"]
    self.max_threads = int(global_settings["max_threads"])
    self.sleep_time = int(self.target_config["sleep"])

    # Both collections start empty.
    self.detail_urls = []
    self.items = []
def get_keywords(self):
    """Return the keywords extracted from this object's reviews.

    The text returned by ``self.get_reviews()`` is passed to
    ``analyse.extract_tags`` with ``withWeight=False``. The number of
    keywords requested is the ``topK`` entry of the ``task1`` config
    section, defaulting to 10 when that entry is absent.

    Returns:
        list: The extracted keywords, as a new list.
    """
    # Reviews are fetched before the config is read, matching the order in
    # which the arguments were originally evaluated.
    reviews = self.get_reviews()
    top_k = getConfig()['task1'].get('topK', 10)
    tags = analyse.extract_tags(reviews, topK=top_k, withWeight=False)
    # list() copies the result directly; an element-by-element
    # comprehension that changes nothing is unnecessary.
    return list(tags)
def __init__(self):
    """Load the configuration and create an SQS resource handle.

    The ``keyid``, ``key`` and ``region`` entries of the mapping returned
    by ``configure.getConfig()`` are used as the access key id, secret
    access key and region name of a ``boto3.Session``. The session's
    ``'sqs'`` resource is stored on ``self.sqs``.
    """
    self.configuration = configure.getConfig()
    settings = self.configuration

    aws_session = boto3.Session(
        aws_access_key_id=settings['keyid'],
        aws_secret_access_key=settings['key'],
        region_name=settings['region'],
    )
    self.sqs = aws_session.resource('sqs')
class RrysSpider(scrapy.Spider):
    """Spider that reads a list page and enriches each entry from its detail page.

    ``allowed_domains`` and ``start_urls`` are read from the ``"rrys"``
    section of the configuration when the class body is executed.
    """

    name = "rrys"
    allowed_domains = getConfig()[name]["allowed_domains"]
    start_urls = getConfig()[name]["start_urls"]

    def parse(self, response):
        """Yield one detail-page request for every list entry in ``response``.

        Each request carries a partially filled ``RrysItem`` (``seq``,
        ``title``, ``link``) in ``meta['item']`` and is handled by
        ``parse_detail``.
        """
        rows = response.xpath(
            "//body/div[@class=\"middle-box\"]/div[@class=\"w\"]/div[1]/div/ul/li"
        )
        for row in rows:
            item = RrysItem()

            seq_text = row.xpath("span/text()").extract_first()
            item['seq'] = int(seq_text.strip())

            title_text = row.xpath("a/text()").extract_first()
            item['title'] = title_text.strip()

            # The href is appended to a fixed site root to form the detail URL.
            href = row.xpath("a/@href").extract_first()
            item['link'] = "http://www.rrys2019.com" + href.strip()

            yield scrapy.Request(
                item['link'],
                meta={'item': item},
                callback=self.parse_detail,
            )

    def parse_detail(self, response):
        """Complete the item carried in ``response.meta`` and yield it.

        Fills ``ranking``, ``classification``, ``favorites`` and ``cover``
        from the detail page, in that order.
        """
        item = response.meta['item']

        # Ranking: keep only the digits of the extracted text.
        ranking_text = response.xpath(
            "//body/div[@class=\"middle-box\"]/div[@class=\"w\"]/div[1]/div[2]/div[1]/ul/li[1]/p/text()"
        ).extract_first().strip()
        item['ranking'] = int(re.sub(r"[^0-9]", "", ranking_text))

        # Classification: the icon URL with its fixed prefix and suffix
        # removed, upper-cased.
        level_icon_src = response.xpath(
            "//body/div[@class=\"middle-box\"]/div[@class=\"w\"]/div[1]/div[1]/div[2]/div[2]/div[@class=\"level-item\"]/img/@src"
        ).extract_first().strip()
        without_prefix = level_icon_src.replace(
            "http://js.jstucdn.com/images/level-icon/", "")
        item['classification'] = without_prefix.replace(
            "-big-1.png", "").upper()

        # NOTE(review): this XPath has no /text() step, so extract_first()
        # presumably returns the element's markup and every digit found in
        # it is kept -- confirm that this is intended.
        favorites_raw = response.xpath(
            "//li[@id=\"score_list\"]//div[1]/div[2]"
        ).extract_first().strip()
        item['favorites'] = int(re.sub(r"[^0-9]", "", favorites_raw))

        cover_src = response.xpath(
            "//body/div[@class=\"middle-box\"]/div[@class=\"w\"]/div[1]/div[1]/div[2]/div[1]/div[1]/a/img/@src"
        ).extract_first()
        item['cover'] = cover_src.strip()

        yield item
def __init__(self):
    """Create the logger and the ``pymysql`` connection pool.

    Connection settings (``host``, ``name``, ``user``, ``password``,
    ``charset``, ``use_unicode``) are taken from the ``db`` entry of the
    ``task2`` config section. Rows are returned through
    ``pymysql.cursors.DictCursor``.
    """
    self.logger = getLogger(self.__class__.__name__)

    # Read the section once: every parameter then comes from the same
    # configuration snapshot, and getConfig() is not called once per field.
    db_config = getConfig()['task2']['db']

    dbparams = {
        'host': db_config['host'],
        'db': db_config['name'],
        'user': db_config['user'],
        'passwd': db_config['password'],
        'charset': db_config['charset'],
        'cursorclass': pymysql.cursors.DictCursor,
        'use_unicode': db_config['use_unicode'],
    }
    # The first argument names the DB-API module the pool should use; the
    # keyword arguments carry the connection settings.
    self.__dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
def __init__(self, customer_id):
    """Look up the customer and load the discount settings.

    Args:
        customer_id: Identifier passed to ``Customer.get_customer``.

    Raises:
        Exception: If ``Customer.get_customer`` returns a falsy value.
    """
    super().__init__()
    self.__customer_id = customer_id
    self.__customer = Customer.get_customer(customer_id)
    if not self.__customer:
        raise Exception("Customer is not found.")

    def setting(key, fallback, convert):
        # Read one config entry (or its fallback) and convert it.
        return convert(getConfig().get(key, fallback))

    # Fallbacks are strings so they go through the same conversion as
    # configured values.
    self.vip_discount = setting('vip_discount', '0.8', float)
    self.vip_threshold = setting('vip_threshold', '200.0', float)
    self.vip_item_threshold = setting('vip_item_threshold', '10', int)
    self.vip_item_discount = setting('vip_item_discount', '0.85', float)
    self.generic_discount = setting('generic_discount', '0.9', float)
    self.generic_threshold = setting('generic_threshold', '200.0', float)
def __init__(self):
    """Set up the logger, the review URL and the web crawler.

    The review URL is the ``review_url`` entry of the ``task1`` config
    section.
    """
    super().__init__()

    logger_name = self.__class__.__name__
    self.logger = getLogger(logger_name)

    task_settings = getConfig()['task1']
    self.reviewUrl = task_settings['review_url']

    self.web_crawler = WebCrawler()
def __init__(self):
    """Set up the logger, comment URL, crawler, DB helper and comment list.

    The comment URL is the ``comment_url`` entry of the ``task2`` config
    section. The private comment list starts empty.
    """
    logger_name = self.__class__.__name__
    self.logger = getLogger(logger_name)

    task_settings = getConfig()['task2']
    self.commentUrl = task_settings['comment_url']

    self.web_crawler = WebCrawler()
    self.db_helper = DbHelper()
    self.__comments = []
def start(self):
    """Register the connection callbacks and connect to the configured broker.

    The address and port come from the ``broker_addr`` and ``broker_port``
    entries of ``configure.getConfig()``. The port is converted with
    ``int()`` before use.
    """
    # Callbacks are installed before connecting.
    self.client.on_connect = self.__onConnect
    self.client.on_disconnect = self.__onDisconnect

    broker_host = configure.getConfig()['broker_addr']
    broker_port = int(configure.getConfig()['broker_port'])
    self.client.connect(broker_host, broker_port)
def printed_currency(val):
    """Format ``val`` as a currency string with digit grouping.

    The locale name is the ``locale`` config entry, or ``'en_US'`` when it
    is absent.

    Note:
        Each call sets ``LC_ALL`` for the whole process and does not
        restore the previous locale.

    Args:
        val: Number to format.

    Returns:
        str: ``val`` formatted by ``locale.currency`` with ``grouping=True``.
    """
    locale_name = getConfig().get('locale', 'en_US')
    locale.setlocale(locale.LC_ALL, locale_name)
    formatted = locale.currency(val, grouping=True)
    return formatted
def getLogger(name):
    """Configure logging from the config and return the logger for ``name``.

    The level name (``log_level``) and format string (``log_format``) come
    from the ``configs`` config section. The level name is upper-cased and
    looked up in ``LOG_LEVELS``.

    Args:
        name: Logger name passed to ``logging.getLogger``.

    Returns:
        logging.Logger: The logger registered under ``name``.
    """
    level_name = getConfig()["configs"]["log_level"]
    message_format = getConfig()["configs"]["log_format"]

    # basicConfig is called on every invocation; per the logging docs it
    # does nothing once the root logger already has handlers.
    level = LOG_LEVELS[level_name.upper()]
    logging.basicConfig(level=level, format=message_format)

    return logging.getLogger(name)