def for_chinese(date):
    """Label each Chinese post with positive/negative semantic flags.

    Arguments:
        date {str} -- Must be prefixed with underscore, e.g. '20180303_'
    """
    # 1. Load the post objects generated by _2_remove_unrelated_data
    all_posts = load_posts('analysis/_2_remove_unrelated_data/chinese.json')

    # 2. Load positive and negative Chinese keyword processors
    positive_kp = load_semantic_keywords_processor(date, True, False, 'chinese')
    negative_kp = load_semantic_keywords_processor(date, False, True, 'chinese')

    # 3. Match those keywords against every post object.
    #    Truthiness of the extracted-keyword list replaces len(...) > 0.
    log("Labelling semantic of chinese post", 1)
    for p in all_posts:
        if positive_kp.extract_keywords(p["value"]):
            p["semantic_value"]["positive"] = True
        if negative_kp.extract_keywords(p["value"]):
            p["semantic_value"]["negative"] = True

    # 4. Save the labelled posts as chinese.json
    #    (plain string: the original f-string had no placeholder)
    save_posts(all_posts, 'analysis/_3_label_semantic/chinese.json')
def main(language):
    """Drop posts unrelated to any political keyword for *language*.

    Arguments:
        language {str} -- Language key used to locate the input/output
        JSON files, e.g. 'english' or 'chinese'.
    """
    log(f"Loading {language} posts", 1)

    # 1. Load the post data
    posts = load_posts(f'analysis/_1_process_raw_data/output/{language}.json')

    # 2. Load the labels, AKA keywords for political figure/party
    #    (example "Najib", "Pakatan Harapan")
    labels = get_labels(language)

    # 3. Match the labels with every post object (mutates posts in place)
    label_post(posts, labels)

    # 4. Keep only posts related to at least one keyword
    log("Removing unrelated posts", 1)
    purified = [x for x in posts if x['related_to']]

    # 5. Save the remaining post objects
    log(f"Number of removed posts = {len(posts) - len(purified)}", 1)
    save_posts(purified, f'analysis/_2_remove_unrelated_data/{language}.json')

    # 6. Optionally save the posts that matched no keyword, for inspection
    SAVE_DUMPED_POST = False
    if SAVE_DUMPED_POST:
        dumped = [x for x in posts if not x['related_to']]
        save_posts(
            dumped,
            f'analysis/_2_remove_unrelated_data/dumped_{language}.json')
def main(jobs, language):
    """Run every parsing job and persist the combined posts for *language*.

    Arguments:
        jobs -- Iterable of job objects; each job.run() returns a list of posts.
        language {str} -- Language key used in the output file name.
    """
    log(f"Parsing {language} posts", 1)

    # Collect the output of every job into a single flat list
    posts = []
    for job in jobs:
        posts.extend(job.run())

    save_posts(posts, f'analysis/_1_process_raw_data/output/{language}.json')
    log(f"Number of {language} posts created : " + str(len(posts)), 1)
def for_english():
    """Tag English posts with related political figures/parties and drop
    posts that match no keyword.
    """
    # this line is to check whether this new algorithm differs from the
    # previous algo:
    # all_posts = load_posts('analysis/_2_remove_unrelated_data/english.json')
    all_posts = load_posts('analysis/_1_process_raw_data/output/english.json')

    # Load the leader/party keyword files; `with` ensures the handles are
    # closed (the original json.load(open(...)) leaked them).
    with open('keywords/target/leader.json') as f:
        leaders = json.load(f)
    with open('keywords/target/party.json') as f:
        parties = json.load(f)
    combined = {**leaders, **parties}

    # Map each canonical keyword to itself plus its English aliases.
    # (The original looped and re-indexed combined[key] instead of using
    # the unpacked value.)
    keyword_dict = {
        key: [key] + value["alias_en"] for key, value in combined.items()
    }

    kp = KeywordProcessor()
    kp.add_keywords_from_dict(keyword_dict)

    # Tag every post with the unique set of matched keywords
    for p in all_posts:
        p["related_to"] = list(set(kp.extract_keywords(p["value"])))

    # Keep only posts related to at least one keyword
    purified = [x for x in all_posts if x['related_to']]
    log(f"Number of removed posts = {len(all_posts) - len(purified)}", 1)
    save_posts(purified, 'analysis/_2_remove_unrelated_data/english.json')
def main(language):
    """Split each post into sentence-level records for MongoDB import.

    Arguments:
        language {str} -- Language key used to locate the input/output
        JSON files.
    """
    # Compile once, outside the loop, instead of re-running re.sub with
    # raw patterns on every post.
    url_re = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)
    brace_re = re.compile(r'\{[^}]*\}')

    posts = load_posts(f'analysis/_2_remove_unrelated_data/{language}.json')
    tokenized_posts = []
    # enumerate replaces the hand-rolled post_id counter
    for post_id, p in enumerate(posts):
        # Every sentence copied from this post keeps a back-reference to it
        p["belongs_to"] = "p" + str(post_id)
        # NOTE(review): sibling labelling code subscripts semantic_value
        # like a dict ({"positive": ..., "negative": ...}); here it is a
        # plain string — confirm downstream consumers expect this.
        p["semantic_value"] = "unassigned"
        # Strip URLs and {...} fragments before sentence tokenization
        p["value"] = url_re.sub('', p["value"])
        p["value"] = brace_re.sub(' ', p["value"])
        for sentence in tokenize_post_into_sentence(p["value"]):
            copy = p.copy()
            copy["value"] = sentence
            tokenized_posts.append(copy)

    if GENERATE_SAMPLE:
        save_posts(
            tokenized_posts[:100],
            f'analysis/transform_format_for_mongodb/{language}_sample.json')
    save_posts(
        tokenized_posts,
        f'analysis/transform_format_for_mongodb/{language}.json')
from analysis.libs.load_posts import load_posts
from analysis.libs.save_posts import save_posts

# Apply fastText predictions (one "__label__X" per line) back onto the
# English posts, marking each as positive or negative.
posts = load_posts('analysis/using_fasttext/labelled_english_posts.json')

# fastText convention used here: "__label__2" is the positive class
POSITIVE_LABEL = "__label__2"

# splitlines() avoids the trailing empty entry that split('\n') leaves
# when the file ends with a newline.
with open('analysis/using_fasttext/predicted_label.txt') as file:
    labels = file.read().splitlines()

# Pair each post with its predicted label; assumes the prediction file
# has one line per post, in the same order — TODO confirm upstream.
for post, label in zip(posts, labels):
    key = "positive" if label == POSITIVE_LABEL else "negative"
    post["semantic_value"][key] = True

save_posts(posts, 'analysis/results/fasttext/english_analyzed.json')