def __call__(self, input_file_path, output_file_path, limit=5):
    """Fetch the question for each input URL, convert it and dump the results.

    Args:
        input_file_path: JSON file holding a list of items; each item is
            read for its "url" and "answer_entity_ids" keys.
        output_file_path: Destination JSON file for the converted items.
        limit: Maximum number of leading input items to process. Defaults
            to 5, the value that used to be hard-coded; pass ``None`` to
            process every item. Expected to be non-negative.

    Items that fail at any stage are reported on stdout and skipped, so a
    single bad item does not abort the run.
    """
    # Slicing with ``None`` as the stop value keeps the whole list.
    input_data = common.loadJSON(input_file_path)[:limit]
    output_data = []
    # The context manager closes the bar even if something escapes the loop.
    with tqdm.tqdm(total=len(input_data)) as bar:
        for input_item in input_data:
            # Bound before the try so the handler can always report it,
            # even when the item has no "url" key at all.
            url = None
            try:
                url = input_item["url"]
                question = self.getQuestionFromURL(url)
                output_item = {
                    "question": question,
                    "url": url,
                    "answer_entity_ids": input_item["answer_entity_ids"],
                }
                # convert() is extended onto the list, so it is expected
                # to return an iterable of output records.
                output_data += self.convert(output_item)
            except Exception as e:
                print("Exception: %s on url %s" % (str(e), url))
            bar.update()
    common.dumpJSON(output_data, output_file_path)
def generate(input_dir_path, output_file_path):
    """Merge every JSON file under a directory into one grouped JSON file.

    Each file found recursively below ``input_dir_path`` is loaded and
    reduced to its id, name, categories (taken from "properties") and
    location ([latitude, longitude]). Records are grouped under the part
    of their id that precedes the first underscore, then keyed by full id.

    Args:
        input_dir_path: Directory to scan; must support the ``/`` operator
            with a string (e.g. a ``pathlib.Path``).
        output_file_path: Destination JSON file for the grouped records.
    """
    pattern = str(input_dir_path / "**/*.json")
    grouped = defaultdict(dict)
    for json_path in glob.glob(pattern, recursive=True):
        entity = common.loadJSON(json_path)
        entity_id = entity["id"]
        group_key = entity_id.split("_")[0]
        grouped[group_key][entity_id] = {
            "id": entity_id,
            "name": entity["name"],
            "categories": entity["properties"],
            "location": [entity["latitude"], entity["longitude"]],
        }
    common.dumpJSON(grouped, output_file_path)
def __call__(self, input_file_path, output_file_path, cities_file_path, limit=5):
    """Fetch the post behind each input URL, tag it with its city and dump.

    Args:
        input_file_path: JSON file holding a list of items; each item is
            read for its "url" and "answer_entity_ids" keys.
        output_file_path: Destination JSON file for the collected posts.
        cities_file_path: JSON file with the city lookup. It is indexed
            with an ``int``, so presumably a list -- confirm against the
            file that produces it.
        limit: Maximum number of leading input items to process. Defaults
            to 5, the value that used to be hard-coded; pass ``None`` to
            process every item. Expected to be non-negative.

    Items that fail at any stage are reported on stdout and skipped, so a
    single bad item does not abort the run.
    """
    cities = common.loadJSON(cities_file_path)
    # Slicing with ``None`` as the stop value keeps the whole list.
    input_data = common.loadJSON(input_file_path)[:limit]
    output_data = []
    # The context manager closes the bar even if something escapes the loop.
    with tqdm.tqdm(total=len(input_data)) as bar:
        for input_item in input_data:
            # Bound before the try so the handler can always report it,
            # even when the item has no "url" key at all.
            url = None
            try:
                url = input_item["url"]
                post = self.getPostFromURL(url)
                # The city index is the part of the first answer entity id
                # that precedes the first underscore.
                first_entity_id = input_item["answer_entity_ids"][0]
                post["city"] = cities[int(first_entity_id.split("_")[0])]
                output_data.append(post)
            except Exception as e:
                print("Exception: %s on url %s" % (str(e), url))
            bar.update()
    common.dumpJSON(output_data, output_file_path)
def __call__(self, city_urls_file_path, posts_urls_file_path):
    """Collect the post URLs of every city and dump them to a JSON file.

    Args:
        city_urls_file_path: JSON file mapping a city to its URL.
        posts_urls_file_path: Destination JSON file. Each city maps to a
            dict with "city_url" and "post_urls", in input order.
    """
    city_urls = common.loadJSON(city_urls_file_path)
    collected = OrderedDict()
    progress = tqdm.tqdm(total=len(city_urls))
    for city_name, url in city_urls.items():
        # Reset before each city; presumably a per-city counter used by
        # getPostURLsFromCityURL (not visible here) -- confirm.
        self.count = 0
        collected[city_name] = {
            "city_url": url,
            "post_urls": self.getPostURLsFromCityURL(city_url=url),
        }
        progress.update()
    progress.close()
    common.dumpJSON(collected, posts_urls_file_path)
def __call__(self, posts_urls_file_path, posts_file_path):
    """Download every listed post, tag it with its city and dump the result.

    Args:
        posts_urls_file_path: JSON file mapping a city to a dict that has
            a "post_urls" list.
        posts_file_path: Destination JSON file for the list of posts.

    A post that fails to download is reported on stdout and skipped; it
    used to be dropped silently, which hid how many posts were lost.
    """
    posts_urls = common.loadJSON(posts_urls_file_path)
    posts = []
    total_urls = sum(len(item["post_urls"]) for item in posts_urls.values())
    # The context manager closes the bar even if something escapes the loop.
    with tqdm.tqdm(total=total_urls) as bar:
        for city, item in posts_urls.items():
            for url in item["post_urls"]:
                try:
                    post = self.getPostFromURL(url)
                    post["city"] = city
                    posts.append(post)
                except Exception as e:
                    # Same report format as the other crawlers in this file.
                    print("Exception: %s on url %s" % (str(e), url))
                # Advance for failures too, so the bar reaches its total.
                bar.update()
    common.dumpJSON(posts, posts_file_path)
def __init__(self, city_entities_file_path) -> None:
    """Set the retry count and load the city entities from a JSON file.

    Args:
        city_entities_file_path: JSON file whose parsed content is stored
            as ``self.city_entities``.
    """
    # Presumably the number of attempts for a failing fetch; the code that
    # reads it is not visible here -- confirm.
    self.retries = 5
    entities = common.loadJSON(city_entities_file_path)
    self.city_entities = entities