Example no. 1
0
    def __call__(self, city_urls_file_path: Path, city_post_urls_file_path: Path) -> None:
        """Collect post URLs for every city and dump the mapping as JSON.

        Args:
            city_urls_file_path: JSON file mapping city name -> city URL.
            city_post_urls_file_path: Output path for the resulting
                ``{city: {"city_url": ..., "post_urls": [...]}}`` mapping.
        """
        city_post_urls = OrderedDict()
        # Use the shared helper (consistent with the sibling crawler in this
        # file) instead of json.load(open(...)), which leaked the file handle.
        city_urls = common.loadJSON(city_urls_file_path)

        bar = tqdm.tqdm(total=len(city_urls))
        for city, city_url in city_urls.items():
            # Reset the per-city counter consulted by the URL fetcher.
            self.count = 0
            post_urls = self.getPostURLsFromCityURL(city_url=city_url)

            city_post_urls[city] = {
                "city_url": city_url,
                "post_urls": post_urls,
            }

            bar.update()

        bar.close()
        common.dumpJSON(city_post_urls, city_post_urls_file_path)
Example no. 2
0
    def __call__(self, city_urls_file_path, posts_urls_file_path):
        """Build a ``{city: {"city_url", "post_urls"}}`` mapping and dump it as JSON."""
        city_urls = common.loadJSON(city_urls_file_path)
        city_post_urls = OrderedDict()

        progress = tqdm.tqdm(total=len(city_urls))
        for city, url in city_urls.items():
            # Reset the per-city counter consulted by the URL fetcher.
            self.count = 0
            entry = {
                "city_url": url,
                "post_urls": self.getPostURLsFromCityURL(city_url=url),
            }
            city_post_urls[city] = entry
            progress.update()
        progress.close()

        common.dumpJSON(city_post_urls, posts_urls_file_path)
Example no. 3
0
    def __call__(self, posts_urls_file_path, posts_file_path):
        """Download every post referenced in the post-URLs file and dump them as JSON.

        A post that fails to download is skipped, but the failure is now
        reported instead of being silently swallowed.
        """
        posts_urls = common.loadJSON(posts_urls_file_path)

        posts = []
        bar = tqdm.tqdm(total=sum(len(item["post_urls"]) for item in posts_urls.values()))
        for city, item in posts_urls.items():
            for url in item["post_urls"]:
                try:
                    post = self.getPostFromURL(url)
                    post["city"] = city
                    posts.append(post)
                except Exception as e:
                    # Report and continue: one bad post must not abort the crawl.
                    # (Matches the error handling used by the other fetchers in
                    # this file; the original `pass` hid every failure.)
                    print("Exception: %s on url %s" % (str(e), url))

                bar.update()

        bar.close()
        common.dumpJSON(posts, posts_file_path)
Example no. 4
0
    def __call__(self, input_file_path: Path, output_file_path: Path, cities_file_path: Path) -> None:
        """Download the posts listed in the input file, tag each with its city,
        and dump the result as JSON.

        Args:
            input_file_path: JSON list of items with ``url`` and
                ``answer_entity_ids`` fields.
            output_file_path: Output JSON path for the downloaded posts.
            cities_file_path: JSON list mapping city index -> city name; the
                index is the prefix of the first answer entity id.
        """
        # Context managers close the handles promptly; the original
        # json.load(open(...)) leaked them.
        with open(cities_file_path, encoding="utf-8") as f:
            cities = json.load(f)
        with open(input_file_path, encoding="utf-8") as f:
            input_data = json.load(f)

        output_data = []
        bar = tqdm.tqdm(total=len(input_data))
        for input_item in input_data:
            try:
                post = self.getPostFromURL(input_item["url"])
                # City index is encoded as the prefix of the first entity id,
                # e.g. "12_345" -> cities[12].
                post["city"] = cities[int(input_item["answer_entity_ids"][0].split("_")[0])]
                output_data.append(post)
            except Exception as e:
                print("Exception: %s on url %s" % (str(e), input_item["url"]))

            bar.update()

        bar.close()
        common.dumpJSON(output_data, output_file_path)
Example no. 5
0
def convert(processed_dir_path: Path, postprocessed_dir_path: Path,
            start_date: str, end_date: str, replace: bool,
            ignore: bool) -> None:
    """Post-process every processed JSON file, keeping posts inside a date range.

    Args:
        processed_dir_path: Root directory scanned for ``**/*.json`` inputs.
        postprocessed_dir_path: Mirror directory receiving the
            ``*.postprocessed.json`` outputs.
        start_date: Inclusive lower bound, formatted ``DDMMYYYY``.
        end_date: Inclusive upper bound, formatted ``DDMMYYYY``.
        replace: When False, skip files whose output already exists.
        ignore: Verdict used for posts whose date cannot be parsed.
    """
    # NOTE(review): the original immediately overwrote both directory
    # parameters with fields of an undefined global `options` (a NameError,
    # and it made the parameters dead); the parameters are now used directly.
    # The annotations also claimed `int` for date strings fed to strptime.
    start = datetime.strptime(start_date, "%d%m%Y")
    end = datetime.strptime(end_date, "%d%m%Y")

    def check(raw_date):
        """Return True iff the post date (e.g. "12 Jan 2020, ...") is in range."""
        try:
            date = datetime.strptime(raw_date.split(",")[0].strip(), "%d %b %Y")
            return start <= date <= end
        except (ValueError, AttributeError):
            # Unparseable or missing date: the caller decides via `ignore`.
            return ignore

    file_posts = {}
    for file_path in processed_dir_path.glob("**/*.json"):
        with open(file_path, encoding="utf-8") as f:
            file_posts[str(file_path)] = json.load(f)

    for file_path, processed_posts in file_posts.items():
        postprocessed_file_path = postprocessed_dir_path / Path(
            file_path).relative_to(processed_dir_path).with_suffix(
                ".postprocessed.json")

        if not replace and postprocessed_file_path.exists():
            print(
                "Skipping file %s! DataFormat file path %s already exists!\n" %
                (file_path, postprocessed_file_path))
            continue

        print("Processing file %s" % (file_path))

        # NOTE(review): `convert` here appears to be a different, per-post
        # converter shadowed by this function's name (this function takes six
        # arguments) — confirm the intended callee at module level.
        postprocessed_posts = [
            convert(post) for post in processed_posts if check(post["date"])
        ]
        print("Accepted %d of %d posts" %
              (len(postprocessed_posts), len(processed_posts)))

        # Each per-post conversion yields a list; flatten before dumping.
        postprocessed_posts = list(
            itertools.chain.from_iterable(postprocessed_posts))
        common.dumpJSON(postprocessed_posts, postprocessed_file_path)
Example no. 6
0
    def __call__(self, input_file_path: Path, output_file_path: Path,
                 limit: int = 25) -> None:
        """Scrape the question text for the first ``limit`` input items.

        Args:
            input_file_path: JSON list of items with ``url`` and
                ``answer_entity_ids`` fields.
            output_file_path: Output JSON path for the scraped questions.
            limit: Number of leading items to process (the original
                hard-coded 25; kept as the default for compatibility).
        """
        # Context manager closes the handle; json.load(open(...)) leaked it.
        with open(input_file_path, encoding="utf-8") as f:
            input_data = json.load(f)[:limit]

        output_data = []

        bar = tqdm.tqdm(total=len(input_data))
        for input_item in input_data:
            try:
                question = self.getQuestionFromURL(input_item["url"])

                output_item = {
                    "question": question,
                    "url": input_item["url"],
                    "answer_entity_ids": input_item["answer_entity_ids"],
                }
                output_data.append(output_item)

            except Exception as e:
                print("Exception: %s on url %s" % (str(e), input_item["url"]))

            bar.update()

        bar.close()
        common.dumpJSON(output_data, output_file_path)
Example no. 7
0
    def __call__(self, fetched_dir_path: Path, processed_dir_path: Path,
                 mseq: bool, replace: bool) -> None:
        """Run the processing pipeline over every fetched JSON file.

        Args:
            fetched_dir_path: Root directory scanned for ``**/*.json`` inputs.
            processed_dir_path: Mirror directory receiving the
                ``*.logs.json`` / ``*.processed.json`` outputs.
            mseq: When True, run the MSEQ tagger plus processing step 3.
            replace: When False, skip files whose output already exists.
        """
        # NOTE(review): the original overwrote both directory parameters with
        # fields of an undefined global `options`; parameters are used directly.
        file_posts = {}
        for file_path in fetched_dir_path.glob("**/*.json"):
            with open(file_path, encoding="utf-8") as f:
                file_posts[str(file_path)] = json.load(f)

        for file_path, posts in file_posts.items():
            logs_file_path = processed_dir_path / Path(file_path).relative_to(
                fetched_dir_path).with_suffix(".%slogs.json" %
                                              ("mseq_" if mseq else ""))
            processed_file_path = processed_dir_path / Path(
                file_path).relative_to(fetched_dir_path).with_suffix(
                    ".%sprocessed.json" % ("mseq_" if mseq else ""))

            if not replace and processed_file_path.exists():
                print(
                    "Skipping file %s! Processed file path %s already exists!\n"
                    % (file_path, processed_file_path))
                continue

            statuses = ["OK"] * len(posts)

            # Capture urls BEFORE nulling duplicates: after a post is replaced
            # with None, post["url"] would raise TypeError.
            urls = [post["url"] for post in posts]

            # Drop duplicate posts, keeping the first occurrence of each url.
            processed_post_urls = set()
            for index, post in enumerate(posts):
                if post["url"] in processed_post_urls:
                    # BUG FIX: the original indexed with an undefined `i`,
                    # raising NameError on the first duplicate encountered.
                    posts[index] = None
                    statuses[index] = "Duplicate post url"
                processed_post_urls.add(post["url"])

            print("Processing file %s" % (file_path))

            print("Running Processing Steps 1 -> 2 . . .")
            posts, statuses = self.process(posts, self.processors1, statuses)

            if mseq:
                print("Running MSEQ tagger on Posts . . .", end="\t")
                start = time.time()
                self.MSEQtagger(posts)
                end = time.time()
                print(str(datetime.timedelta(seconds=int(end - start))),
                      "HH:MM:SS")

                print("Running Processing Steps 3 -> 4 . . .")
                posts, statuses = self.process(posts, self.processors2,
                                               statuses)
            else:
                # Without MSEQ tags, step 3 (the first processor2) is skipped.
                print("Skipping Processing Step 3 (to enable use --mseq)")
                print("Running Processing Step 4 . . .")
                posts, statuses = self.process(posts, self.processors2[1:],
                                               statuses)

            # One log entry per original post, paired with its final status.
            logs = [{
                "url": url,
                "status": status
            } for url, status in zip(urls, statuses)]
            processed_posts = list(filter(lambda post: post is not None,
                                          posts))

            print("Accepted %d posts" % (len(processed_posts)))
            common.dumpJSON(logs, logs_file_path)
            common.dumpJSON(processed_posts, processed_file_path)

            print()