def __init__(self):
    """Create the collaborators stored on the instance.

    Sets ``shop_extractor`` to a new ``ShopExtractor`` and ``archiver`` to a
    new ``S3Archiver``; both are constructed without arguments.
    """
    self.shop_extractor = ShopExtractor()
    self.archiver = S3Archiver()
class Miner:
    """Collect deal and shop records from the parsed deal files of a source.

    The class relies on ``self.shop_extractor`` for the list of deal files
    (``get_deal_file_list``), for the city-name -> city-index ``cache`` and
    for turning a raw shop into a list entry
    (``make_shop_entry_name_as_key``).
    """

    # City index the extractor cache uses for deals sold in the whole country.
    WHOLE_COUNTRY_INDEX = "283"

    def __init__(self):
        """Create the shop extractor and the S3 archiver used by the miner."""
        self.shop_extractor = ShopExtractor()
        self.archiver = S3Archiver()

    def construct_parser(self, source):
        """Return a new parser for ``source``, or ``None`` if it is unknown.

        The parser classes are only looked up in the branch that matches, so
        an unknown source never touches any of them.
        """
        if source == "dida":
            return DidaParser()
        if source == "dianping":
            return DianpingParser()
        if source == "ftuan":
            return FtuanParser()
        if source == "lashou":
            return LashouParser()
        if source == "manzuo":
            return ManzuoParser()
        if source == "meituan":
            return MeituanParser()
        if source == "nuomi":
            return NuomiParser()
        if source == "wowo":
            return WowoParser()
        if source == "wuba":
            return WubaParser()
        return None

    def _file_city_index(self, deals):
        """Return the city index assumed for every deal of one file.

        The first deal whose city is in the extractor cache and is not the
        whole-country index decides. If only whole-country deals are known,
        the whole-country index is returned; if no deal city is in the cache,
        the result is ``None``.
        """
        cache = self.shop_extractor.cache
        city_index = None
        for deal in deals:
            city_name = deal["deal_city"]
            if city_name not in cache:
                continue
            city_index = cache[city_name]
            if city_index != self.WHOLE_COUNTRY_INDEX:
                break
        return city_index

    def mine(self, source, mine_shop=True, mine_deal=True):
        """Mine deals and shops from every deal file of ``source``.

        Args:
            source: Source name understood by ``construct_parser``; it is also
                used as the directory that the deal file names are joined to.
            mine_shop: When true, fill the shop dictionary.
            mine_deal: When true, fill the deal dictionary.

        Returns:
            ``None`` when no parser exists for ``source``; otherwise the list
            ``[deal_set, shop_set]`` where ``deal_set`` maps
            ``"<city index>_<deal id>"`` to ``[price, sales_num]`` and
            ``shop_set`` maps ``"<city index>_<shop_entry[0]>"`` to
            ``[deal_id, entry[1], entry[2], entry[4], entry[5]]``.

        Files whose parsing raises are reported on stdout and skipped.
        """
        parser = self.construct_parser(source)
        if not parser:
            return None

        price_not_match = 0
        deal_set = {}
        shop_set = {}

        for filename in self.shop_extractor.get_deal_file_list(source):
            filepath = os.path.join(source, filename)
            print("processing %s %s" % (source, filename))
            try:
                parser.parse(filepath)
            except Exception as e:
                # Best effort: report the broken file (with the reason) and
                # carry on with the remaining ones.
                print("Error: miner.py %s %s %r" % (source, filename, e))
                continue

            # Assume all deals in the same file belong to the same city.
            city_index = self._file_city_index(parser.deals)
            cache = self.shop_extractor.cache

            for deal in parser.deals:
                sales_num = deal["sales_num"]
                price = deal["price"]

                # A deal keeps its own city unless that city is unknown or is
                # the whole country; a country-wide deal is still sold inside
                # the city of the current file.
                current_city_index = city_index
                deal_city = deal["deal_city"]
                if deal_city in cache:
                    own_index = cache[deal_city]
                    if own_index != self.WHOLE_COUNTRY_INDEX:
                        current_city_index = own_index

                # Prefix the deal id with the city information.
                deal_id = str(current_city_index) + "_" + deal["deal_id"]

                if mine_deal:
                    if deal_id not in deal_set:
                        deal_set[deal_id] = [price, sales_num]
                    else:
                        known_price, known_sales = deal_set[deal_id]
                        if price != known_price or sales_num != known_sales:
                            print(
                                "%s price or sales_num does not match!"
                                % (deal_id,)
                            )
                            price_not_match += 1

                if mine_shop:
                    for shop in deal["deal_sold_at"]:
                        shop_entry = (
                            self.shop_extractor.make_shop_entry_name_as_key(shop)
                        )
                        if not shop_entry:
                            continue
                        # NOTE(review): the shop key uses the file-level city
                        # index, not the per-deal one used for deal_id --
                        # confirm this is intended.
                        shop_entry[0] = str(city_index) + "_" + shop_entry[0]
                        shop_key = shop_entry[0]
                        if shop_key not in shop_set:
                            # Labelled "tel, addr, lat, lng" in the original
                            # code; index 3 of the entry is not stored.
                            shop_set[shop_key] = [
                                deal_id,
                                shop_entry[1],
                                shop_entry[2],
                                shop_entry[4],
                                shop_entry[5],
                            ]

        print("price or sales_num not match: %s" % price_not_match)
        return [deal_set, shop_set]