# Shared imports for the snippets below (consolidated here for readability;
# in the repository each function presumably lives in its own module):
import itertools
import json
import time
from collections import OrderedDict
from datetime import datetime, timedelta
from pathlib import Path

import tqdm

import common  # project helper module providing loadJSON / dumpJSON


def __call__(self, city_urls_file_path: Path, city_post_urls_file_path: Path) -> None:
    # Collect the post URLs for every city and dump the mapping to JSON.
    city_post_urls = OrderedDict()
    with open(city_urls_file_path, encoding="utf-8") as f:
        city_urls = json.load(f)
    bar = tqdm.tqdm(total=len(city_urls))
    for city, city_url in city_urls.items():
        self.count = 0
        post_urls = self.getPostURLsFromCityURL(city_url=city_url)
        city_post_urls[city] = {
            "city_url": city_url,
            "post_urls": post_urls,
        }
        bar.update()
    bar.close()
    common.dumpJSON(city_post_urls, city_post_urls_file_path)
def __call__(self, city_urls_file_path, posts_urls_file_path):
    # Variant of the stage above that loads its input via the common helper.
    city_post_urls = OrderedDict()
    city_urls = common.loadJSON(city_urls_file_path)
    bar = tqdm.tqdm(total=len(city_urls))
    for city, city_url in city_urls.items():
        self.count = 0
        post_urls = self.getPostURLsFromCityURL(city_url=city_url)
        city_post_urls[city] = {
            "city_url": city_url,
            "post_urls": post_urls,
        }
        bar.update()
    bar.close()
    common.dumpJSON(city_post_urls, posts_urls_file_path)
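# The `common` helper module is referenced throughout but not shown. A minimal
# sketch of what its loadJSON / dumpJSON presumably do (hypothetical; the
# actual implementation may differ in details such as indentation):

def loadJSON(file_path):
    # Read a JSON file as UTF-8 and return the parsed object.
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)


def dumpJSON(obj, file_path):
    # Ensure the parent directory exists, then write the object as JSON,
    # keeping non-ASCII characters readable in the output files.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=4)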
def __call__(self, posts_urls_file_path, posts_file_path):
    # Fetch every post listed under every city; posts that fail to fetch or
    # parse are skipped, but the progress bar still advances per URL.
    posts_urls = common.loadJSON(posts_urls_file_path)
    posts = []
    bar = tqdm.tqdm(total=sum(len(item["post_urls"]) for item in posts_urls.values()))
    for city, item in posts_urls.items():
        for url in item["post_urls"]:
            try:
                post = self.getPostFromURL(url)
                post["city"] = city
                posts.append(post)
            except Exception:
                pass
            bar.update()
    bar.close()
    common.dumpJSON(posts, posts_file_path)
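# How the two scraping stages chain together, sketched with hypothetical
# class names (the real classes owning these __call__ methods are not shown):
# the first stage writes the city -> post-url mapping that the second reads.
#
#     scrape_urls = CityPostURLScraper()   # hypothetical name
#     scrape_urls(Path("city_urls.json"), Path("city_post_urls.json"))
#     scrape_posts = PostScraper()         # hypothetical name
#     scrape_posts(Path("city_post_urls.json"), Path("posts.json"))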
def __call__(self, input_file_path: Path, output_file_path: Path, cities_file_path: Path) -> None:
    with open(cities_file_path, encoding="utf-8") as f:
        cities = json.load(f)
    with open(input_file_path, encoding="utf-8") as f:
        input_data = json.load(f)
    output_data = []
    bar = tqdm.tqdm(total=len(input_data))
    for input_item in input_data:
        try:
            post = self.getPostFromURL(input_item["url"])
            # The city index is encoded before the underscore in the first
            # answer entity id.
            post["city"] = cities[int(input_item["answer_entity_ids"][0].split("_")[0])]
            output_data.append(post)
        except Exception as e:
            print("Exception: %s on url %s" % (str(e), input_item["url"]))
        bar.update()
    bar.close()
    common.dumpJSON(output_data, output_file_path)
def convert(processed_dir_path: Path, postprocessed_dir_path: Path,
            start_date: str, end_date: str, replace: bool, ignore: bool) -> None:
    # Dates arrive as DDMMYYYY strings, e.g. "01012020".
    start_date = datetime.strptime(start_date, "%d%m%Y")
    end_date = datetime.strptime(end_date, "%d%m%Y")

    def check(x):
        # Accept a post whose date falls inside [start_date, end_date];
        # unparseable dates are kept or dropped according to `ignore`.
        try:
            date = datetime.strptime(x.split(",")[0].strip(), "%d %b %Y")
            return start_date <= date <= end_date
        except ValueError:
            return ignore

    file_posts = {}
    for file_path in processed_dir_path.glob("**/*.json"):
        with open(file_path, encoding="utf-8") as f:
            file_posts[str(file_path)] = json.load(f)

    for file_path, processed_posts in file_posts.items():
        postprocessed_file_path = postprocessed_dir_path / Path(
            file_path).relative_to(processed_dir_path).with_suffix(
                ".postprocessed.json")
        if not replace and postprocessed_file_path.exists():
            print("Skipping file %s! DataFormat file path %s already exists!\n"
                  % (file_path, postprocessed_file_path))
            continue
        print("Processing file %s" % (file_path))
        # `convert` here presumably refers to a separate per-post converter
        # (defined elsewhere in the repository) that returns a list of items
        # for each accepted post; the lists are flattened below.
        postprocessed_posts = [
            convert(post) for post in processed_posts if check(post["date"])
        ]
        print("Accepted %d of %d posts"
              % (len(postprocessed_posts), len(processed_posts)))
        postprocessed_posts = list(
            itertools.chain.from_iterable(postprocessed_posts))
        common.dumpJSON(postprocessed_posts, postprocessed_file_path)
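# Standalone demonstration of the date-window filter above (stdlib only; the
# "%d %b %Y" format matches post dates such as "15 Mar 2020, 10:00"):
if __name__ == "__main__":
    start_date = datetime.strptime("01012020", "%d%m%Y")
    end_date = datetime.strptime("31122020", "%d%m%Y")
    for raw in ["15 Mar 2020, 10:00", "15 Mar 2019, 10:00", "not a date"]:
        try:
            date = datetime.strptime(raw.split(",")[0].strip(), "%d %b %Y")
            print(raw, "->", start_date <= date <= end_date)
        except ValueError:
            print(raw, "-> unparseable; kept only when ignore=True")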
def __call__(self, input_file_path: Path, output_file_path: Path) -> None:
    with open(input_file_path, encoding="utf-8") as f:
        input_data = json.load(f)[:25]  # only the first 25 items are processed
    output_data = []
    bar = tqdm.tqdm(total=len(input_data))
    for input_item in input_data:
        try:
            question = self.getQuestionFromURL(input_item["url"])
            output_item = {
                "question": question,
                "url": input_item["url"],
                "answer_entity_ids": input_item["answer_entity_ids"],
            }
            output_data.append(output_item)
        except Exception as e:
            print("Exception: %s on url %s" % (str(e), input_item["url"]))
        bar.update()
    bar.close()
    common.dumpJSON(output_data, output_file_path)
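# The fetch loops above drop an item on the first exception. If transient
# network errors matter, a small retry wrapper could be used around
# getPostFromURL / getQuestionFromURL (hypothetical helper, not part of the
# original code):

def fetchWithRetries(fetch, url, attempts=3, delay=1.0):
    # Try fetch(url) up to `attempts` times, sleeping between tries, and
    # re-raise the last exception so callers can still log the failure.
    for attempt in range(attempts):
        try:
            return fetch(url)
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(delay)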
def __call__(self, fetched_dir_path: Path, processed_dir_path: Path,
             mseq: bool, replace: bool) -> None:
    file_posts = {}
    for file_path in fetched_dir_path.glob("**/*.json"):
        with open(file_path, encoding="utf-8") as f:
            file_posts[str(file_path)] = json.load(f)

    for file_path, posts in file_posts.items():
        suffix_prefix = "mseq_" if mseq else ""
        logs_file_path = processed_dir_path / Path(file_path).relative_to(
            fetched_dir_path).with_suffix(".%slogs.json" % suffix_prefix)
        processed_file_path = processed_dir_path / Path(
            file_path).relative_to(fetched_dir_path).with_suffix(
                ".%sprocessed.json" % suffix_prefix)
        if not replace and processed_file_path.exists():
            print("Skipping file %s! Processed file path %s already exists!\n"
                  % (file_path, processed_file_path))
            continue

        # Record every url up front, before duplicates are nulled out below,
        # so the logs cover all posts.
        urls = [post["url"] for post in posts]
        statuses = ["OK"] * len(posts)
        processed_post_urls = set()
        for index, post in enumerate(posts):
            if post["url"] in processed_post_urls:
                posts[index] = None
                statuses[index] = "Duplicate post url"
            processed_post_urls.add(post["url"])

        print("Processing file %s" % (file_path))
        print("Running Processing Steps 1 -> 2 . . .")
        posts, statuses = self.process(posts, self.processors1, statuses)
        if mseq:
            print("Running MSEQ tagger on Posts . . .", end="\t")
            start = time.time()
            self.MSEQtagger(posts)
            end = time.time()
            print(str(timedelta(seconds=int(end - start))), "HH:MM:SS")
            print("Running Processing Steps 3 -> 4 . . .")
            posts, statuses = self.process(posts, self.processors2, statuses)
        else:
            print("Skipping Processing Step 3 (to enable use --mseq)")
            print("Running Processing Step 4 . . .")
            posts, statuses = self.process(posts, self.processors2[1:], statuses)

        logs = [{"url": url, "status": status}
                for url, status in zip(urls, statuses)]
        processed_posts = list(filter(lambda post: post is not None, posts))
        print("Accepted %d posts" % (len(processed_posts)))
        common.dumpJSON(logs, logs_file_path)
        common.dumpJSON(processed_posts, processed_file_path)
        print()
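# These entry points are presumably driven by a command-line script (the code
# mentions a --mseq flag). A minimal argparse wiring sketch, with hypothetical
# flag and class names (the real script may differ):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--fetched_dir_path", type=Path, required=True)
    parser.add_argument("--processed_dir_path", type=Path, required=True)
    parser.add_argument("--mseq", action="store_true")
    parser.add_argument("--replace", action="store_true")
    options = parser.parse_args()

    processor = Processor()  # hypothetical class owning the __call__ above
    processor(options.fetched_dir_path, options.processed_dir_path,
              options.mseq, options.replace)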