def for_chinese(date):
    """Label every Chinese post with positive/negative semantic flags.

    Arguments:
        date {str} -- Must be suffixed with an underscore, e.g. '20180303_'
    """
    # 1. Load the post objects generated by _2_remove_unrelated_data
    all_posts = load_posts('analysis/_2_remove_unrelated_data/chinese.json')

    # 2. Load negative and positive Chinese keywords
    positive_kp = load_semantic_keywords_processor(date, True, False, 'chinese')
    negative_kp = load_semantic_keywords_processor(date, False, True, 'chinese')

    # 3. Match those keywords against every post object
    log("Labelling semantic of chinese post", 1)
    for p in all_posts:
        matching_positive_keywords = positive_kp.extract_keywords(p["value"])
        matching_negative_keywords = negative_kp.extract_keywords(p["value"])
        if len(matching_positive_keywords) > 0:
            p["semantic_value"]["positive"] = True
        if len(matching_negative_keywords) > 0:
            p["semantic_value"]["negative"] = True

    # 4. Save the labelled posts as chinese.json
    save_posts(all_posts, 'analysis/_3_label_semantic/chinese.json')
def save_posts(list_of_posts, file_name):
    log("Saving output to " + file_name, 1)
    with open(file_name, 'w+', encoding='utf-8') as file:
        if hasattr(list_of_posts[0], '__dict__'):
            # Posts are objects, so dump their attribute dictionaries
            json.dump([ob.__dict__ for ob in list_of_posts], file,
                      indent=4, ensure_ascii=False)
        else:
            # Posts are already plain dictionaries
            json.dump(list(list_of_posts), file, indent=4, ensure_ascii=False)
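# A minimal usage sketch for save_posts (not part of the pipeline): it shows that
# both plain dictionaries and objects carrying a __dict__ go through the same path.
# The Post class and the output file names below are illustrative assumptions only.
def _demo_save_posts():
    class Post:
        def __init__(self, value, date):
            self.value = value
            self.date = date

    # Plain dictionaries are dumped as-is
    save_posts([{"value": "hello", "date": "2018-03-03"}], 'demo_dicts.json')
    # Objects hit the hasattr(..., '__dict__') branch, so their attribute
    # dictionaries are dumped instead
    save_posts([Post("hello", "2018-03-03")], 'demo_objects.json')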
def main(language):
    log(f"Analyzing {language} source", 1)
    dir1 = '_1_process_raw_data/output'  # use dir1 to analyze the raw source
    dir2 = '_2_remove_unrelated_data'    # use dir2 to analyze the filtered source
    all_posts = load_posts(f'analysis/{dir1}/{language}.json')
    all_posts = [x for x in all_posts if x["date"] != '']
    standardized = standardize_date_format(all_posts)

    # Group post dates by source
    dic = {}
    date_list = []
    for p in standardized:
        source = p['source']
        date = p['date']
        if source not in dic:
            dic[source] = []
        if date > MIN_DATE:
            dic[source].append(date)
            date_list.append(date)
    date_list = filter_date(date_list)

    for source_name in dic:
        dic[source_name] = filter_date(dic[source_name])
        print(source_name)
        print(len(dic[source_name]))
        dic[source_name] = group_dates(MIN_DATE, MAX_DATE, dic[source_name])

    with open(f'analysis/_5_analyze_source/{language}_source.json', 'w') as outfile:
        print("Saving", language)
        json.dump(dic, outfile)
def parse_files_from(self, directory, parser):
    log("Parsing files from " + directory, 2)
    posts = []
    for file in os.listdir(directory):
        if not file.endswith('.csv') and not file.endswith('.json'):
            continue
        posts += parser(directory + file)
    return posts
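# A hedged usage sketch for parse_files_from, which is presumably a method of a
# job-style class since it takes self (self is unused in the body, so None is
# passed here). The directory and parse_facebook_csv parser below are invented
# for illustration; any callable returning a list of post dicts would work.
def _demo_parse_files_from():
    def parse_facebook_csv(file_path):
        # Trivial stand-in parser: one post dict per line of the file
        with open(file_path, encoding='utf-8') as f:
            return [{"value": line.strip(), "date": ""} for line in f]

    # Note that parse_files_from concatenates directory + file directly,
    # so the directory string must end with a slash
    return parse_files_from(None, 'data/facebook/', parse_facebook_csv)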
def main(jobs, language):
    posts = []
    log(f"Parsing {language} posts", 1)
    # Run every job
    for job in jobs:
        posts += job.run()
    save_posts(posts, f'analysis/_1_process_raw_data/output/{language}.json')
    log(f"Number of {language} posts created : " + str(len(posts)), 1)
def label_post(all_posts, all_labels):
    log("Labelling post", 1)
    for post in all_posts:
        post['related_to'] = []
        for label in all_labels:
            # This will make sure 'lks' would not match 'talks'
            pattern = f"(^|[^a-z]){label}([^a-z]|$)"
            if re.search(pattern, post['value']) is not None:
                if label not in post['related_to']:
                    post['related_to'].append(label)
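# A small sketch of how the word-boundary pattern in label_post behaves, using
# only the re module; the sample strings are made up, and the posts are assumed
# to be lowercased since the pattern only excludes lowercase letters.
def _demo_label_pattern():
    label = "lks"
    pattern = f"(^|[^a-z]){label}([^a-z]|$)"
    assert re.search(pattern, "lks spoke today") is not None   # standalone match at start
    assert re.search(pattern, "vote for lks!") is not None     # punctuation counts as a boundary
    assert re.search(pattern, "the talks continue") is None    # no match inside 'talks'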
def plot_wordcloud(word_freq, title):
    from wordcloud import WordCloud
    if 'chinese' in title:
        wc = WordCloud(
            font_path='/usr/share/fonts/opentype/noto/NotoSansCJK-Light.ttc'
        ).fit_words(word_freq)
    else:
        wc = WordCloud().fit_words(word_freq)
    import matplotlib.pyplot as plt
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    target_path = f'analysis/_6_analyze_keyword/{title}.png'
    log("Saving word cloud as " + target_path, 2)
    plt.savefig(target_path)
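# A minimal usage sketch for plot_wordcloud: fit_words expects a dict mapping
# each word to its frequency. The words and counts below are invented, and the
# analysis/_6_analyze_keyword/ output directory is assumed to exist; a title
# containing 'chinese' switches to the CJK-capable font.
def _demo_plot_wordcloud():
    plot_wordcloud({'rakyat': 120, 'harapan': 95, 'undi': 40},
                   'english_demo_keywordcloud')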
def for_english():
    # all_posts = load_posts('analysis/_2_remove_unrelated_data/english.json')
    # The line above is kept to check whether this new algorithm differs from the previous one
    all_posts = load_posts('analysis/_1_process_raw_data/output/english.json')
    leaders = json.load(open(f'keywords/target/leader.json'))
    parties = json.load(open(f'keywords/target/party.json'))
    combined = {**leaders, **parties}
    keyword_dict = {}
    for key, value in combined.items():
        keyword_dict[key] = [key] + combined[key]["alias_en"]
    kp = KeywordProcessor()
    kp.add_keywords_from_dict(keyword_dict)
    for p in all_posts:
        p["related_to"] = list(set(kp.extract_keywords(p["value"])))
    purified = [x for x in all_posts if len(x['related_to']) > 0]
    log(f"Number of removed posts = " + str(len(all_posts) - len(purified)), 1)
    save_posts(purified, f'analysis/_2_remove_unrelated_data/english.json')
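# A short sketch of the flashtext matching used above, with an invented alias
# dictionary. add_keywords_from_dict maps a clean name to the phrases that
# should resolve to it, which is why the loop above lists the key itself as
# one of its own aliases; extract_keywords then returns clean names only.
def _demo_keyword_processor():
    from flashtext import KeywordProcessor
    kp = KeywordProcessor()  # case-insensitive by default
    kp.add_keywords_from_dict({'najib': ['najib', 'najib razak', 'pm najib']})
    return kp.extract_keywords('PM Najib announced the budget')  # -> ['najib']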
def load_labels(file_path, language):
    log(f"Loading {language} labels from {file_path}", 1)
    result = []
    with open(file_path, encoding='utf8') as file:
        dic = json.load(file)
        for key in dic:
            if language == 'english':
                result.append(key)
                result += dic[key]['alias_en']
            elif language == 'chinese':
                x = dic[key]['name_cn']
                if x is not None:
                    result.append(x)
                result += dic[key]['alias_cn']
            else:
                raise Exception(
                    "Invalid language argument. Expected (chinese/english) but got " + language)
    log(f"{len(result)} {language} labels loaded", 2)
    return result
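# An illustrative sketch of the schema load_labels expects from the
# keywords/target/*.json files; the entry and file name below are invented,
# but the field names (alias_en, name_cn, alias_cn) mirror the keys read above.
def _demo_load_labels():
    sample = {
        "lim guan eng": {
            "alias_en": ["lge", "guan eng"],
            "name_cn": "林冠英",
            "alias_cn": ["冠英"]
        }
    }
    with open('demo_leader.json', 'w', encoding='utf8') as f:
        json.dump(sample, f, ensure_ascii=False)
    # English labels are the key itself plus its alias_en entries
    return load_labels('demo_leader.json', 'english')
    # -> ['lim guan eng', 'lge', 'guan eng']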
def main(language):
    log(f"Loading {language} posts", 1)
    # 1. Load the post data
    posts = load_posts(f'analysis/_1_process_raw_data/output/{language}.json')

    # 2. Load the labels, i.e. keywords for political figures/parties (e.g. "Najib", "Pakatan Harapan")
    labels = get_labels(language)

    # 3. Match the labels with every post object
    label_post(posts, labels)

    # 4. Remove posts that are not related to any keyword
    log(f"Removing unrelated posts", 1)
    purified = [x for x in posts if len(x['related_to']) > 0]

    # 5. Save the remaining post objects
    log(f"Number of removed posts = " + str(len(posts) - len(purified)), 1)
    save_posts(purified, f'analysis/_2_remove_unrelated_data/{language}.json')

    # 6. Optionally save the post objects that are not related to any keyword
    SAVE_DUMPED_POST = False
    if SAVE_DUMPED_POST:
        dumped = [x for x in posts if len(x['related_to']) <= 0]
        save_posts(
            dumped, f'analysis/_2_remove_unrelated_data/dumped_{language}.json')
def plot_hbar(y_labels, x_values, title):
    import matplotlib.pyplot as plt
    import numpy as np

    # Reset matplotlib to its default settings
    plt.rcdefaults()
    fig, ax = plt.subplots()
    fig.set_size_inches(19, 20)

    y_pos = np.arange(len(y_labels))
    # Random horizontal error bars (decorative only, no fixed seed)
    error = np.random.rand(len(y_labels))

    ax.barh(y_pos, x_values, xerr=error, align='center',
            color='green', ecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(y_labels)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Frequencies')
    ax.set_title(title)
    # plt.show()
    target_path = f'analysis/_6_analyze_keyword/{title}.png'
    log("Saving graph as " + target_path, 2)
    plt.savefig(target_path)
def main(language, number_of_keywords_to_be_shown, semantic_type):
    log(f'Analyzing {semantic_type} keywords for {language} data', 1)
    if semantic_type == 'positive':
        keyword_processor = load_semantic_keywords_processor('', True, False, language)
    elif semantic_type == 'negative':
        keyword_processor = load_semantic_keywords_processor('', False, True, language)
    else:
        raise Exception("Invalid argument")

    posts = load_posts(f'analysis/_3_label_semantic/{language}.json')
    dic = {}
    for p in posts:
        matching_keywords = keyword_processor.extract_keywords(p["value"])
        for word in matching_keywords:
            if word not in dic:
                dic[word] = []
            dic[word].append(p["origin"])
    json.dump(dic,
              open(f'analysis/_6_analyze_keyword/{language}_keyword_freq.json', 'w'),
              ensure_ascii=False)

    flattened_dic = {}
    for key in dic:
        flattened_dic[key] = len(dic[key])
    tuples = sorted(flattened_dic.items(), key=operator.itemgetter(1), reverse=True)
    y_labels = []
    x_values = []
    for t in tuples:
        y_labels.append(t[0])
        x_values.append(t[1])
    y_labels = y_labels[0: number_of_keywords_to_be_shown + 1]
    x_values = x_values[0: number_of_keywords_to_be_shown + 1]

    file_name = f'analysis/_6_analyze_keyword/{language}_top_{semantic_type}_keywords.txt'
    with open(file_name, 'w') as file:
        for value in y_labels:
            file.write(value + "\n")
    log(f'Top {number_of_keywords_to_be_shown} keywords are saved as {file_name}', 2)

    # log('Plotting graph', 2)
    # plot_hbar(y_labels, x_values, f'{semantic_type}_keyword_frequencies')
    # log('DONE', 1)
    log('Plotting word cloud', 2)
    plot_wordcloud(dict(zip(y_labels, x_values)),
                   f'{language}_{semantic_type}_keywordcloud')
def main(language):
    # 1. Load the post objects generated by _3_label_semantic, serialized to a
    #    string so that synonyms can be replaced textually
    post_str = str(load_posts(f'analysis/_3_label_semantic/{language}.json'))

    # 2. Replace synonyms (for example, replace 'Najib' and 'PM' with 'najib'),
    #    then evaluate the string back into a list of post dictionaries
    post_str = replace_synonyms(post_str, "../../keywords/target/")
    all_posts = eval(replace_synonyms(post_str, get_keywords(language)))

    # 3. Standardize the date format of every post object
    log(f"Standardizing date format of each {language} post", 0)
    standardized = standardize_date_format(all_posts)

    # 4. Sort the post objects based on date
    log(f"Sorting post based on date", 1)
    sorted_posts = sorted(standardized, key=lambda x: x['date'])

    # 5. Extract the data for the sorted posts
    extracted = extract_data(sorted_posts, START_DATE, END_DATE, language)

    # 6. Save the extracted data as XXXX_extracted.json
    log(f'Storing results to analysis/results/{language}_extracted.json', 1)
    json.dump(extracted,
              open(f'analysis/results/{language}_extracted.json', 'w'),
              ensure_ascii=False)
def load_posts(file_path):
    log("Loading posts from " + file_path, 1)
    with open(file_path, 'rb') as file:
        posts = json.load(file)
    return posts