Example #1
0
	def check_source(self, source):
		"""Sort the city names found in ``source``'s deal files into known and unknown.

		Every file returned by ``self.get_deal_file_list(source)`` is parsed with
		the parser class matching ``source``.  Each deal's ``"deal_city"`` value
		is split on commas; every stripped name is passed to
		``self.get_city_info`` and added to ``self.found_city_set`` when the
		lookup returns a non-empty result, otherwise to
		``self.notfound_city_set``.

		Args:
			source: Name of the deal site.  It selects the parser and is also the
				directory the deal file names are joined onto.

		Raises:
			ValueError: If ``source`` is not one of the supported site names.
		"""
		if source == "dida": parser = DidaParser()
		elif source == "dianping": parser = DianpingParser()
		elif source == "ftuan": parser = FtuanParser()
		elif source == "lashou": parser = LashouParser()
		elif source == "manzuo": parser = ManzuoParser()
		elif source == "meituan": parser = MeituanParser()
		elif source == "nuomi": parser = NuomiParser()
		elif source == "wowo": parser = WowoParser()
		elif source == "wuba": parser = WubaParser()
		else:
			# Fail fast: without this, parser stayed None and the first file
			# failed with an AttributeError that did not mention the source.
			raise ValueError("unsupported deal source: %r" % (source,))

		for filename in self.get_deal_file_list(source):
			# run the corresponding parser
			parser.parse(os.path.join(source, filename))
			# go through all deals; one deal may list several comma-separated cities
			for deal in parser.deals:
				for deal_city in deal["deal_city"].split(","):
					deal_city = deal_city.strip()
					if self.get_city_info(deal_city):
						self.found_city_set.add(deal_city)
					else:
						self.notfound_city_set.add(deal_city)
Example #2
0
	def find_deal_num(self, source):
		print "prcessing", source
		deal_set = set()
		deal_num_per_city = dict()
		deal_per_city = dict()

		parser = None
		if source == "dida": parser = DidaParser()
		elif source == "dianping": parser = DianpingParser()
		elif source == "ftuan": parser = FtuanParser()
		elif source == "lashou": parser = LashouParser()
		elif source == "manzuo": parser = ManzuoParser()
		elif source == "meituan": parser = MeituanParser()
		elif source == "nuomi": parser = NuomiParser()
		elif source == "wowo": parser = WowoParser()
		elif source == "wuba": parser = WubaParser()

		filelist = self.get_deal_file_list(source)
		for filename in filelist:
			filepath = os.path.join(source, filename)
			try:
				parser.parse(filepath)
			except Exception, e:
				print "Error: find_deal_num.py", source, filename
				continue  # get rid of this error file

			# assume all deals in the same file belong to the same city
			city_name = None
			for deal in parser.deals:
				if deal["deal_city"] in self.cache:
					city_name = deal["deal_city"]
					if self.cache[deal["deal_city"]] == "283":  # if it is the whole country
						continue
					break

			for deal in parser.deals:
				current_city_name = None
				if deal["deal_city"] in self.cache:
					if self.cache[deal["deal_city"]] == "283":  # although the deal is country-wise, but it still belongs to the current city
						current_city_name = city_name
					else:
						current_city_name = deal["deal_city"]
				else:
					current_city_name = city_name

				deal_id = deal["deal_id"]
				deal_set.add(deal_id)
				if not current_city_name:
					continue
				if current_city_name in deal_per_city:
					deal_per_city[current_city_name].add(deal_id)
				else:
					deal_per_city[current_city_name] = set()
					deal_per_city[current_city_name].add(deal_id)
	def parse(self, source, created_time, filepath):
		parser = None
		if source == "dida": parser = DidaParser()
		elif source == "dianping": parser = DianpingParser()
		elif source == "ftuan": parser = FtuanParser()
		elif source == "lashou": parser = LashouParser()
		elif source == "manzuo": parser = ManzuoParser()
		elif source == "meituan": parser = MeituanParser()
		elif source == "nuomi": parser = NuomiParser()
		elif source == "wowo": parser = WowoParser()
		elif source == "wuba": parser = WubaParser()

		# run the corresponding parser
		parser.parse(filepath)

		# go through the deals until we find the city_id.
		# assumption: the city_id should be consistent for all deals in the current file
		city_id = None
		for deal in parser.deals:
			if not city_id:
				city_id = self.get_city_info(deal["deal_city"])  # query the city table
				if len(city_id) != 0:
					city_id = city_id[0]["city_id"]
					break

		for deal in parser.deals:
			deal_id      = deal["deal_id"]
			sales_num    = deal["sales_num"]
			price        = deal["price"]
			start_time   = deal["start_time"]
			end_time     = deal["end_time"]
			deal_cate    = deal["deal_cate"]
			deal_subcate = deal["deal_subcate"]
			current_city_id = self.get_city_info(deal["deal_city"])
			if len(current_city_id) == 0:   # if cannot find city id, use the global one.
				current_city_id = city_id
			else:
				current_city_id = current_city_id[0]["city_id"]
			print deal_id, current_city_id, sales_num, price, start_time, end_time, deal_cate, deal_subcate
Example #4
0
    def extract(self, source, output_file):
        parser = None
        if source == "dida":
            parser = DidaParser()
        elif source == "dianping":
            parser = DianpingParser()
        elif source == "lashou":
            parser = LashouParser()
        elif source == "meituan":
            parser = MeituanParser()
        elif source == "nuomi":
            parser = NuomiParser()
        elif source == "wowo":
            parser = WowoParser()
        elif source == "wuba":
            parser = WubaParser()
        else:
            print source, "does not have shop information"
            return

        shop_set = dict()
        filelist = self.get_deal_file_list(source)
        for filename in filelist:
            filepath = os.path.join(source, filename)
            print "processing", source, filename
            parser.parse(filepath)

            # assume all deals in the same file belong to the same city
            city_name = None
            for deal in parser.deals:
                if deal["deal_city"] in self.cache:
                    city_name = deal["deal_city"]
                    break

            for deal in parser.deals:
                deal_id = deal["deal_id"]
                deal_sold_at = deal["deal_sold_at"]
                for shop in deal_sold_at:
                    # 	shop_entry = self.make_shop_entry_tel_as_key(shop)
                    shop_entry = self.make_shop_entry_name_as_key(shop)
                    if len(shop_entry) > 0:
                        shop_entry[0] = self.cache[city_name] + "_" + shop_entry[0]
                        if shop_entry[0] not in shop_set:
                            shop_set[shop_entry[0]] = [
                                deal_id,
                                city_name,
                                shop_entry[1],
                                shop_entry[2],
                                shop_entry[3],
                                shop_entry[4],
                                shop_entry[5],
                            ]

                            # save the shop list
        fhandler = open(output_file, "w")
        for key in shop_set:
            shop_string = key
            for ele in shop_set[key]:
                if ele:
                    shop_string = shop_string + "\t" + ele
                else:
                    shop_string = shop_string + "\t-"
            fhandler.write(shop_string.encode("utf-8") + "\n")
        fhandler.close()