def for_chinese(date):
    """Label the semantic value of every Chinese post.

    Arguments:
        date {str} -- Date string ending with an underscore, e.g. '20180303_'
    """
    # 1. Load the post objects generated by _2_remove_unrelated_data
    all_posts = load_posts('analysis/_2_remove_unrelated_data/chinese.json')

    # 2. Load negative and positive Chinese keywords
    positive_kp = load_semantic_keywords_processor(date, True, False, 'chinese')
    negative_kp = load_semantic_keywords_processor(date, False, True, 'chinese')

    # 3. Match those keywords against every post object
    log("Labelling semantic of chinese post", 1)
    for p in all_posts:
        matching_positive_keywords = positive_kp.extract_keywords(p["value"])
        matching_negative_keywords = negative_kp.extract_keywords(p["value"])
        if len(matching_positive_keywords) > 0:
            p["semantic_value"]["positive"] = True
        if len(matching_negative_keywords) > 0:
            p["semantic_value"]["negative"] = True

    # 4. Save the labelled posts as chinese.json
    save_posts(all_posts, 'analysis/_3_label_semantic/chinese.json')

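# load_semantic_keywords_processor is a project helper that is not part of this
# excerpt. A minimal sketch of what it could look like, assuming it builds a
# flashtext KeywordProcessor from keyword files; the file paths below are
# hypothetical, the real ones may differ:
from flashtext import KeywordProcessor

def load_semantic_keywords_processor_sketch(date, positive, negative, language):
    kp = KeywordProcessor()
    if positive:
        with open(f'keywords/semantic/{date}{language}_positive.txt', encoding='utf-8') as f:
            kp.add_keywords_from_list([line.strip() for line in f if line.strip()])
    if negative:
        with open(f'keywords/semantic/{date}{language}_negative.txt', encoding='utf-8') as f:
            kp.add_keywords_from_list([line.strip() for line in f if line.strip()])
    return kp
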
def main(language):
    log(f"Loading {language} posts", 1)
    # 1. Load the post data
    posts = load_posts(f'analysis/_1_process_raw_data/output/{language}.json')

    # 2. Load the labels, i.e. keywords for political figures/parties
    #    (for example "Najib", "Pakatan Harapan")
    labels = get_labels(language)

    # 3. Match the labels against every post object
    label_post(posts, labels)

    # 4. Remove posts that are not related to any keyword
    log("Removing unrelated posts", 1)
    purified = [x for x in posts if len(x['related_to']) > 0]

    # 5. Save the remaining post objects
    log("Number of removed posts = " + str(len(posts) - len(purified)), 1)
    save_posts(purified, f'analysis/_2_remove_unrelated_data/{language}.json')

    # 6. Optionally save the post objects that are not related to any keyword
    SAVE_DUMPED_POST = False
    if SAVE_DUMPED_POST:
        dumped = [x for x in posts if len(x['related_to']) <= 0]
        save_posts(
            dumped, f'analysis/_2_remove_unrelated_data/dumped_{language}.json')

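# get_labels and label_post are project helpers that are not shown in this excerpt.
# Judging from for_english() further below, get_labels most likely builds a
# {clean_name: [aliases]} dictionary from keywords/target/leader.json and
# keywords/target/party.json, and label_post tags each post with the keywords it
# mentions via flashtext; a rough sketch under those assumptions:
import json
from flashtext import KeywordProcessor

def get_labels_sketch(language):
    leaders = json.load(open('keywords/target/leader.json'))
    parties = json.load(open('keywords/target/party.json'))
    combined = {**leaders, **parties}
    # the per-language alias key is an assumption, e.g. "alias_en" / "alias_cn"
    alias_key = 'alias_en' if language == 'english' else 'alias_cn'
    return {name: [name] + entry[alias_key] for name, entry in combined.items()}

def label_post_sketch(posts, labels):
    kp = KeywordProcessor()
    kp.add_keywords_from_dict(labels)
    for p in posts:
        p['related_to'] = list(set(kp.extract_keywords(p['value'])))
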
import json

def main(language):
    log(f"Analyzing {language} source", 1)
    dir1 = '_1_process_raw_data/output'  # use dir1 to analyze the raw source
    dir2 = '_2_remove_unrelated_data'    # use dir2 to analyze the filtered source
    all_posts = load_posts(f'analysis/{dir1}/{language}.json')
    all_posts = [x for x in all_posts if x["date"] != '']
    standardized = standardize_date_format(all_posts)

    # Group the dates of the posts by source
    dic = {}
    date_list = []
    for p in standardized:
        source = p['source']
        date = p['date']
        if source not in dic:
            dic[source] = []
        if date > MIN_DATE:
            dic[source].append(date)
            date_list.append(date)

    date_list = filter_date(date_list)
    for source_name in dic:
        dic[source_name] = filter_date(dic[source_name])
        print(source_name)
        print(len(dic[source_name]))
        dic[source_name] = group_dates(MIN_DATE, MAX_DATE, dic[source_name])

    with open(f'analysis/_5_analyze_source/{language}_source.json', 'w') as outfile:
        print(f"Saving {language}")
        json.dump(dic, outfile)

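# group_dates and filter_date are project helpers not included in this excerpt. A
# plausible reading, given how the output is consumed, is that group_dates buckets a
# list of date strings into per-day counts between MIN_DATE and MAX_DATE; a minimal
# sketch under that assumption (the 'YYYY-MM-DD ...' date format is also assumed):
from collections import Counter
from datetime import date, timedelta

def group_dates_sketch(min_date, max_date, dates):
    """Return {'YYYY-MM-DD': count} for every day in [min_date, max_date]."""
    counts = Counter(d[:10] for d in dates)
    day = date.fromisoformat(min_date[:10])
    end = date.fromisoformat(max_date[:10])
    result = {}
    while day <= end:
        key = day.isoformat()
        result[key] = counts.get(key, 0)
        day += timedelta(days=1)
    return result
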
import json
import operator

def main(language, number_of_keywords_to_be_shown, semantic_type):
    log(f'Analyzing {semantic_type} keywords for {language} data', 1)
    if semantic_type == 'positive':
        keyword_processor = load_semantic_keywords_processor('', True, False, language)
    elif semantic_type == 'negative':
        keyword_processor = load_semantic_keywords_processor('', False, True, language)
    else:
        raise Exception("Invalid argument")

    posts = load_posts(f'analysis/_3_label_semantic/{language}.json')

    # Map each matching keyword to the origins of the posts that contain it
    dic = {}
    for p in posts:
        matching_keywords = keyword_processor.extract_keywords(p["value"])
        for word in matching_keywords:
            if word not in dic:
                dic[word] = []
            dic[word].append(p["origin"])
    json.dump(dic,
              open(f'analysis/_6_analyze_keyword/{language}_keyword_freq.json', 'w'),
              ensure_ascii=False)

    # Flatten to {keyword: frequency} and sort by frequency, descending
    flattened_dic = {}
    for key in dic:
        flattened_dic[key] = len(dic[key])
    tuples = sorted(flattened_dic.items(), key=operator.itemgetter(1), reverse=True)

    y_labels = []
    x_values = []
    for t in tuples:
        y_labels.append(t[0])
        x_values.append(t[1])
    y_labels = y_labels[0: number_of_keywords_to_be_shown + 1]
    x_values = x_values[0: number_of_keywords_to_be_shown + 1]

    file_name = f'analysis/_6_analyze_keyword/{language}_top_{semantic_type}_keywords.txt'
    with open(file_name, 'w') as file:
        for value in y_labels:
            file.write(value + "\n")
    log(f'Top keywords are saved as {file_name}', 2)

    # log('Plotting graph', 2)
    # plot_hbar(y_labels, x_values, f'{semantic_type}_keyword_frequencies')
    # log('DONE', 1)

    log('Plotting word cloud', 2)
    plot_wordcloud(dict(zip(y_labels, x_values)),
                   f'{language}_{semantic_type}_keywordcloud')

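# plot_wordcloud is a project helper that is not part of this excerpt. A minimal
# sketch of an equivalent, assuming the `wordcloud` package is used and that the
# result is written to a PNG named after the second argument (a Chinese-capable
# font_path would also be needed for the Chinese data):
from wordcloud import WordCloud

def plot_wordcloud_sketch(frequencies, name):
    wc = WordCloud(width=800, height=400, background_color='white')
    wc.generate_from_frequencies(frequencies)
    wc.to_file(f'{name}.png')
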
def test1():
    posts = load_posts('../_1_process_raw_data/output/sample_output.json')
    assert len(posts) == 2
    assert posts[0] == {
        'date': '2017-05-18 11:56:09',
        'value': "b'report: red granite in talks with doj to settle 1mdb-linked lawsuit'",
        'source': 'facebook',
        'related_to': None,
        'semantic_value': None
    }

import json
from flashtext import KeywordProcessor

def for_english():
    # all_posts = load_posts('analysis/_2_remove_unrelated_data/english.json')
    # The line above is for checking whether this new algorithm differs from the previous one
    all_posts = load_posts('analysis/_1_process_raw_data/output/english.json')

    # Build a keyword dictionary mapping each leader/party name to itself plus its English aliases
    leaders = json.load(open('keywords/target/leader.json'))
    parties = json.load(open('keywords/target/party.json'))
    combined = {**leaders, **parties}
    keyword_dict = {}
    for key, value in combined.items():
        keyword_dict[key] = [key] + combined[key]["alias_en"]

    kp = KeywordProcessor()
    kp.add_keywords_from_dict(keyword_dict)
    for p in all_posts:
        p["related_to"] = list(set(kp.extract_keywords(p["value"])))

    purified = [x for x in all_posts if len(x['related_to']) > 0]
    log("Number of removed posts = " + str(len(all_posts) - len(purified)), 1)
    save_posts(purified, 'analysis/_2_remove_unrelated_data/english.json')

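# A quick, self-contained illustration of how flashtext's KeywordProcessor resolves
# aliases to their canonical names (the names and aliases below are made up for the
# example; the real ones come from leader.json and party.json):
from flashtext import KeywordProcessor

kp_demo = KeywordProcessor()
kp_demo.add_keywords_from_dict({'Najib': ['najib razak', 'pm najib'],
                                'Pakatan Harapan': ['harapan', 'ph coalition']})
print(kp_demo.extract_keywords('pm najib met the harapan leaders'))
# ['Najib', 'Pakatan Harapan']
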
import re

def main(language):
    post_id = 0
    tokenized_posts = []
    posts = load_posts(f'analysis/_2_remove_unrelated_data/{language}.json')
    for p in posts:
        # Give every post a unique id and reset its semantic value
        p["belongs_to"] = "p" + str(post_id)
        post_id += 1
        p["semantic_value"] = "unassigned"

        # Strip URLs and {...} blocks from the post content
        p["value"] = re.sub(r'^https?:\/\/.*[\r\n]*', '', p["value"], flags=re.MULTILINE)
        p["value"] = re.sub(r'\{[^}]*\}', ' ', p["value"])

        # Split the post into sentences; each sentence becomes its own post object
        sentences = tokenize_post_into_sentence(p["value"])
        for s in sentences:
            copy = p.copy()
            copy["value"] = s
            tokenized_posts.append(copy)

    if GENERATE_SAMPLE:
        save_posts(tokenized_posts[:100],
                   f'analysis/transform_format_for_mongodb/{language}_sample.json')
    save_posts(tokenized_posts,
               f'analysis/transform_format_for_mongodb/{language}.json')

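# tokenize_post_into_sentence is not shown in this excerpt; a rough, regex-based
# sketch of what it might do (the real helper may use a proper sentence tokenizer
# such as nltk's sent_tokenize instead):
import re

def tokenize_post_into_sentence_sketch(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]
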
import json

def main(language):
    # 1. Load the post objects generated by _3_label_semantic
    post_str = str(load_posts(f'analysis/_3_label_semantic/{language}.json'))

    # 2. Replace synonyms (for example, replace 'Najib' and 'PM' with 'najib')
    post_str = replace_synonyms(post_str, "../../keywords/target/")
    all_posts = eval(replace_synonyms(post_str, get_keywords(language)))

    # 3. Standardize the date format of every post object
    log(f"Standardizing date format of each {language} post", 0)
    standardized = standardize_date_format(all_posts)

    # 4. Sort the post objects based on date
    log("Sorting posts based on date", 1)
    sorted_posts = sorted(standardized, key=lambda x: x['date'])

    # 5. Extract the data from the sorted posts
    extracted = extract_data(sorted_posts, START_DATE, END_DATE, language)

    # 6. Save the extracted data as {language}_extracted.json
    log(f'Storing results to analysis/results/{language}_extracted.json', 1)
    json.dump(extracted,
              open(f'analysis/results/{language}_extracted.json', 'w'),
              ensure_ascii=False)

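# Design note: the str()/eval() round trip above works because the post list only
# contains Python literals (dicts, lists, strings). ast.literal_eval performs the
# same round trip without executing arbitrary code, so it would be a safer drop-in;
# a small self-contained illustration:
import ast

demo_posts = [{'value': 'najib announced ...', 'date': '2018-03-03'}]
assert ast.literal_eval(str(demo_posts)) == demo_posts
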
from analysis.libs.load_posts import load_posts
from analysis.using_fasttext.flatten import flatten
from analysis.using_fasttext.labelize_using_fasttextformat import labelize_using_fasttextformat

posts = load_posts('analysis/_3_label_semantic/english.json')
labelled = labelize_using_fasttextformat(posts)
# flattened = flatten(posts)
print(labelled)

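# labelize_using_fasttextformat is a project helper; fastText's supervised format
# prefixes each training line with its label, e.g. "__label__2 some post text".
# A minimal sketch of producing that format from labelled posts (the field names
# and the label mapping are assumptions based on the surrounding code):
def labelize_sketch(posts):
    lines = []
    for p in posts:
        label = "__label__2" if p["semantic_value"].get("positive") else "__label__1"
        lines.append(f'{label} {p["value"]}')
    return lines
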
from analysis.libs.load_posts import load_posts
from analysis.libs.save_posts import save_posts

posts = load_posts('analysis/using_fasttext/labelled_english_posts.json')
positive = "__label__2"

# Read the labels predicted by fastText (one label per line) and copy each one
# back onto the corresponding post
with open('analysis/using_fasttext/predicted_label.txt') as file:
    labels = file.read().split('\n')
for i in range(0, len(posts)):
    posts[i]["semantic_value"]["positive" if labels[i] == positive else "negative"] = True

save_posts(posts, 'analysis/results/fasttext/english_analyzed.json')

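# predicted_label.txt is assumed to be produced beforehand by a trained fastText
# model; with the fasttext Python package that step might look roughly like this
# (the model path and input file name are assumptions, not part of this excerpt):
import fasttext

model = fasttext.load_model('analysis/using_fasttext/model.bin')          # hypothetical path
with open('analysis/using_fasttext/flattened_english_posts.txt') as f:    # hypothetical input
    texts = [line.strip() for line in f if line.strip()]
with open('analysis/using_fasttext/predicted_label.txt', 'w') as out:
    for text in texts:
        predicted_labels, probabilities = model.predict(text)   # e.g. (('__label__2',), array([0.97]))
        out.write(predicted_labels[0] + '\n')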