def check_source(self, source):
    """Classify every city name in a source's deal files as found or not found.

    Each file name returned by ``self.get_deal_file_list(source)`` is joined
    onto ``source`` and handed to the parser selected by ``source``.  Every
    deal's ``deal["deal_city"]`` value is split on commas; each stripped piece
    is looked up with ``self.get_city_info``.  Names whose lookup result has
    length zero are added to ``self.notfound_city_set``, all others to
    ``self.found_city_set``.

    ``source`` is expected to be one of "dida", "dianping", "ftuan",
    "lashou", "manzuo", "meituan", "nuomi", "wowo" or "wuba".  Any other
    value leaves the parser as ``None``, so parsing the first file raises
    ``AttributeError``.
    """
    parser_classes = {
        "dida": DidaParser,
        "dianping": DianpingParser,
        "ftuan": FtuanParser,
        "lashou": LashouParser,
        "manzuo": ManzuoParser,
        "meituan": MeituanParser,
        "nuomi": NuomiParser,
        "wowo": WowoParser,
        "wuba": WubaParser,
    }
    parser_class = parser_classes.get(source)
    parser = parser_class() if parser_class is not None else None

    for filename in self.get_deal_file_list(source):
        # run the corresponding parser; it exposes the result as parser.deals
        parser.parse(os.path.join(source, filename))

        # go through all deals
        for deal in parser.deals:
            # a single deal may list several comma-separated cities
            for raw_name in deal["deal_city"].split(","):
                # the name is only stripped, not decoded, before the lookup
                city = raw_name.strip()
                if len(self.get_city_info(city)) != 0:
                    self.found_city_set.add(city)
                else:
                    self.notfound_city_set.add(city)
def find_deal_num(self, source): print "prcessing", source deal_set = set() deal_num_per_city = dict() deal_per_city = dict() parser = None if source == "dida": parser = DidaParser() elif source == "dianping": parser = DianpingParser() elif source == "ftuan": parser = FtuanParser() elif source == "lashou": parser = LashouParser() elif source == "manzuo": parser = ManzuoParser() elif source == "meituan": parser = MeituanParser() elif source == "nuomi": parser = NuomiParser() elif source == "wowo": parser = WowoParser() elif source == "wuba": parser = WubaParser() filelist = self.get_deal_file_list(source) for filename in filelist: filepath = os.path.join(source, filename) try: parser.parse(filepath) except Exception, e: print "Error: find_deal_num.py", source, filename continue # get rid of this error file # assume all deals in the same file belong to the same city city_name = None for deal in parser.deals: if deal["deal_city"] in self.cache: city_name = deal["deal_city"] if self.cache[deal["deal_city"]] == "283": # if it is the whole country continue break for deal in parser.deals: current_city_name = None if deal["deal_city"] in self.cache: if self.cache[deal["deal_city"]] == "283": # although the deal is country-wise, but it still belongs to the current city current_city_name = city_name else: current_city_name = deal["deal_city"] else: current_city_name = city_name deal_id = deal["deal_id"] deal_set.add(deal_id) if not current_city_name: continue if current_city_name in deal_per_city: deal_per_city[current_city_name].add(deal_id) else: deal_per_city[current_city_name] = set() deal_per_city[current_city_name].add(deal_id)
def parse(self, source, created_time, filepath): parser = None if source == "dida": parser = DidaParser() elif source == "dianping": parser = DianpingParser() elif source == "ftuan": parser = FtuanParser() elif source == "lashou": parser = LashouParser() elif source == "manzuo": parser = ManzuoParser() elif source == "meituan": parser = MeituanParser() elif source == "nuomi": parser = NuomiParser() elif source == "wowo": parser = WowoParser() elif source == "wuba": parser = WubaParser() # run the corresponding parser parser.parse(filepath) # go through the deals until we find the city_id. # assumption: the city_id should be consistent for all deals in the current file city_id = None for deal in parser.deals: if not city_id: city_id = self.get_city_info(deal["deal_city"]) # query the city table if len(city_id) != 0: city_id = city_id[0]["city_id"] break for deal in parser.deals: deal_id = deal["deal_id"] sales_num = deal["sales_num"] price = deal["price"] start_time = deal["start_time"] end_time = deal["end_time"] deal_cate = deal["deal_cate"] deal_subcate = deal["deal_subcate"] current_city_id = self.get_city_info(deal["deal_city"]) if len(current_city_id) == 0: # if cannot find city id, use the global one. current_city_id = city_id else: current_city_id = current_city_id[0]["city_id"] print deal_id, current_city_id, sales_num, price, start_time, end_time, deal_cate, deal_subcate
def extract(self, source, output_file): parser = None if source == "dida": parser = DidaParser() elif source == "dianping": parser = DianpingParser() elif source == "lashou": parser = LashouParser() elif source == "meituan": parser = MeituanParser() elif source == "nuomi": parser = NuomiParser() elif source == "wowo": parser = WowoParser() elif source == "wuba": parser = WubaParser() else: print source, "does not have shop information" return shop_set = dict() filelist = self.get_deal_file_list(source) for filename in filelist: filepath = os.path.join(source, filename) print "processing", source, filename parser.parse(filepath) # assume all deals in the same file belong to the same city city_name = None for deal in parser.deals: if deal["deal_city"] in self.cache: city_name = deal["deal_city"] break for deal in parser.deals: deal_id = deal["deal_id"] deal_sold_at = deal["deal_sold_at"] for shop in deal_sold_at: # shop_entry = self.make_shop_entry_tel_as_key(shop) shop_entry = self.make_shop_entry_name_as_key(shop) if len(shop_entry) > 0: shop_entry[0] = self.cache[city_name] + "_" + shop_entry[0] if shop_entry[0] not in shop_set: shop_set[shop_entry[0]] = [ deal_id, city_name, shop_entry[1], shop_entry[2], shop_entry[3], shop_entry[4], shop_entry[5], ] # save the shop list fhandler = open(output_file, "w") for key in shop_set: shop_string = key for ele in shop_set[key]: if ele: shop_string = shop_string + "\t" + ele else: shop_string = shop_string + "\t-" fhandler.write(shop_string.encode("utf-8") + "\n") fhandler.close()