Example 1
def for_chinese(date):
    """[summary]
    
    Arguments:
        date {str} -- Must be prefixed with underscore, e.g. '20180303_' 
    """
    # 1. Load the post object generated by _2_remove_unrelated_data
    all_posts = load_posts('analysis/_2_remove_unrelated_data/chinese.json')

    # 2. Load negative and positive chinese keywords
    positive_kp = load_semantic_keywords_processor(date, True, False,
                                                   'chinese')
    negative_kp = load_semantic_keywords_processor(date, False, True,
                                                   'chinese')

    # 3. Match those keywords against every post object
    log("Labelling semantic of chinese post", 1)
    for p in all_posts:
        matching_positive_keywords = positive_kp.extract_keywords(p["value"])
        matching_negative_keywords = negative_kp.extract_keywords(p["value"])
        if matching_positive_keywords:
            p["semantic_value"]["positive"] = True
        if matching_negative_keywords:
            p["semantic_value"]["negative"] = True

    # 4. Save the labelled post as chinese.json
    save_posts(all_posts, 'analysis/_3_label_semantic/chinese.json')
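The helper load_semantic_keywords_processor is not shown in these examples. A minimal sketch of how it could look, assuming it is built on flashtext's KeywordProcessor (the class Example 8 imports) and that the keyword files live under a hypothetical keywords/semantic/ directory, one keyword per line:

from flashtext import KeywordProcessor

def load_semantic_keywords_processor(date, positive, negative, language):
    # Hypothetical sketch: the real loader and its file layout are not shown.
    # Assumes plain-text keyword files named like
    # keywords/semantic/20180303_positive_chinese.txt (the date argument
    # supplies the '20180303_' prefix).
    kp = KeywordProcessor()
    kinds = []
    if positive:
        kinds.append('positive')
    if negative:
        kinds.append('negative')
    for kind in kinds:
        path = f'keywords/semantic/{date}{kind}_{language}.txt'
        with open(path, encoding='utf-8') as file:
            kp.add_keywords_from_list(
                [line.strip() for line in file if line.strip()])
    return kp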
Example 2
def save_posts(list_of_posts, file_name):
    log("Saving output to " + file_name, 1)
    with open(file_name, 'w+', encoding='utf-8') as file:
        if list_of_posts and hasattr(list_of_posts[0], '__dict__'):
            json.dump([ob.__dict__ for ob in list_of_posts], file, indent=4, ensure_ascii=False)
        else:
            json.dump(list_of_posts, file, indent=4, ensure_ascii=False)
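A short usage sketch: plain dicts take the else-branch, while any object exposing __dict__ takes the first branch. The Post class here is hypothetical and exists only for the demonstration.

class Post:
    # Hypothetical class used only to exercise the __dict__ branch.
    def __init__(self, value, date):
        self.value = value
        self.date = date

save_posts([{"value": "hello", "date": "20180303"}], 'dict_posts.json')
save_posts([Post("hello", "20180303")], 'object_posts.json')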
Example 3
def main(language):
    log(f"Analyzing {language} source", 1)
    dir1 = '_1_process_raw_data/output'  # use dir1 to analyze raw source
    dir2 = '_2_remove_unrelated_data'  # use dir2 to analyze filtered source
    all_posts = load_posts(f'analysis/{dir1}/{language}.json')
    all_posts = [x for x in all_posts if x["date"] != '']
    standardized = standardize_date_format(all_posts)
    dic = {}
    date_list = []
    for p in standardized:
        source = p['source']
        date = p['date']
        if source not in dic:
            dic[source] = []
        if date > MIN_DATE:
            dic[source].append(date)
            date_list.append(date)
    date_list = filter_date(date_list)

    for source_name in dic:
        dic[source_name] = filter_date(dic[source_name])
        print(source_name)
        print(len(dic[source_name]))
        dic[source_name] = group_dates(MIN_DATE, MAX_DATE, dic[source_name])

    with open(f'analysis/_5_analyze_source/{language}_source.json',
              'w') as outfile:
        print("Saving ", {language})
        json.dump(dic, outfile)
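The helpers standardize_date_format, filter_date and group_dates are not shown in these examples. A minimal sketch of what group_dates could look like, assuming dates are 'YYYYMMDD' strings (which sort chronologically) and the goal is a per-date post count between MIN_DATE and MAX_DATE:

from collections import Counter

def group_dates(min_date, max_date, dates):
    # Hypothetical sketch: counts posts per date inside [min_date, max_date].
    counts = Counter(d for d in dates if min_date <= d <= max_date)
    return {date: counts[date] for date in sorted(counts)}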
Example 4
def parse_files_from(self, directory, parser):
    log("Parsing files from " + directory, 2)
    posts = []
    for file in os.listdir(directory):
        if not file.endswith(('.csv', '.json')):
            continue
        posts += parser(os.path.join(directory, file))
    return posts
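A usage sketch with a hypothetical JSON parser callback; 'loader' stands in for whatever object defines parse_files_from, and the directory path is made up:

import json

def parse_json_file(path):
    # Hypothetical callback: one file path in, a list of post dicts out.
    with open(path, encoding='utf-8') as file:
        return json.load(file)

posts = loader.parse_files_from('analysis/raw/', parse_json_file)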
Example 5
def main(jobs, language):
    posts = []
    log(f"Parsing {language} posts", 1)
    # Run every job
    for job in jobs:
        posts += job.run()
    save_posts(posts, f'analysis/_1_process_raw_data/output/{language}.json')
    log(f"Number of {language} posts created : " + str(len(posts)), 1)
Example 6
def label_post(all_posts, all_labels):
    log("Labelling post", 1)
    for post in all_posts:
        post['related_to'] = []
        for label in all_labels:
            # Word-boundary guard: ensures 'lks' does not match inside 'talks'
            pattern = f"(^|[^a-z]){re.escape(label)}([^a-z]|$)"
            if re.search(pattern, post['value']) is not None:
                if label not in post['related_to']:
                    post['related_to'].append(label)
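A quick check of the boundary pattern on made-up data:

posts = [{"value": "he talks a lot"}, {"value": "pm lks speech"}]
label_post(posts, ["lks"])
# 'lks' inside 'talks' is rejected by the (^|[^a-z])...([^a-z]|$) boundaries.
print([p["related_to"] for p in posts])  # [[], ['lks']]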
Example 7
def plot_wordcloud(word_freq, title):
    from wordcloud import WordCloud
    if 'chinese' in title:
        wc = WordCloud(font_path='/usr/share/fonts/opentype/noto/NotoSansCJK-Light.ttc').fit_words(word_freq)
    else:
        wc = WordCloud().fit_words(word_freq)
    import matplotlib.pyplot as plt
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    target_path = f'analysis/_6_analyze_keyword/{title}.png'
    log("Saving word cloud as " + target_path, 2)
    plt.savefig(target_path)
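A usage sketch; the frequencies are made up for illustration:

plot_wordcloud({'najib': 120, 'election': 75, 'pakatan': 60},
               'english_positive_keywordcloud')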
Example 8
def for_english():
    # all_posts = load_posts('analysis/_2_remove_unrelated_data/english.json')  # load this instead to check whether the new algorithm differs from the previous one
    all_posts = load_posts('analysis/_1_process_raw_data/output/english.json')
    with open('keywords/target/leader.json') as file:
        leaders = json.load(file)
    with open('keywords/target/party.json') as file:
        parties = json.load(file)
    combined = {**leaders, **parties}
    keyword_dict = {
        key: [key] + value["alias_en"]
        for key, value in combined.items()
    }

    kp = KeywordProcessor()
    kp.add_keywords_from_dict(keyword_dict)
    for p in all_posts:
        p["related_to"] = list(set(kp.extract_keywords(p["value"])))

    purified = [x for x in all_posts if len(x['related_to']) > 0]
    log(f"Number of removed posts = " + str(len(all_posts) - len(purified)), 1)

    save_posts(purified, 'analysis/_2_remove_unrelated_data/english.json')
Example 9
def load_labels(file_path, language):
    log(f"Loading {language} labels from {file_path}", 1)
    result = []
    with open(file_path, encoding='utf8') as file:
        dic = json.load(file)
        for key in dic:
            if language == 'english':
                result.append(key)
                result += dic[key]['alias_en']
            elif language == 'chinese':
                x = dic[key]['name_cn']
                if x is not None:
                    result.append(x)
                result += dic[key]['alias_cn']
            else:
                raise Exception(
                    "Invalid language argument. Expected (chinese/english) but got "
                    + language)
    log(f"{len(result)} {language} labels loaded", 2)
    return result
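Examples 8 and 9 imply the shape of keywords/target/leader.json and party.json. A hypothetical fragment, consistent with the fields both functions read (alias_en, name_cn, alias_cn); the actual names and aliases may differ:

# Hypothetical fragment of keywords/target/leader.json, shown as a dict.
leader_json = {
    "najib": {
        "alias_en": ["najib razak", "pm najib"],
        "name_cn": "纳吉",
        "alias_cn": ["纳吉布"]
    }
}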
Example 10
def main(language):
    log(f"Loading {language} posts", 1)

    # 1. Load the post data
    posts = load_posts(f'analysis/_1_process_raw_data/output/{language}.json')

    # 2. Load the labels, i.e. keywords for political figures/parties (e.g. "Najib", "Pakatan Harapan")
    labels = get_labels(language)

    # 3. Match the labels with every post object
    label_post(posts, labels)

    # 4. Remove posts that are not related to any keyword
    log("Removing unrelated posts", 1)
    purified = [x for x in posts if len(x['related_to']) > 0]

    # 5. Save the remaining post objects
    log(f"Number of removed posts = {len(posts) - len(purified)}", 1)
    save_posts(purified, f'analysis/_2_remove_unrelated_data/{language}.json')

    # 6. Optionally save the post objects that are not related to any keyword
    SAVE_DUMPED_POST = False
    if SAVE_DUMPED_POST:
        dumped = [x for x in posts if len(x['related_to']) == 0]
        save_posts(
            dumped, f'analysis/_2_remove_unrelated_data/dumped_{language}.json')
Example 11
def plot_hbar(y_labels, x_values, title):
    import matplotlib.pyplot as plt
    import numpy as np

    # Fix the random state so the synthetic error bars are reproducible
    np.random.seed(19680801)
    plt.rcdefaults()
    fig, ax = plt.subplots()
    fig.set_size_inches(19, 20)

    y_pos = np.arange(len(y_labels))
    error = np.random.rand(len(y_labels))  # synthetic error bars
    ax.barh(y_pos, x_values, xerr=error, align='center',
            color='green', ecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(y_labels)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Frequencies')
    ax.set_title(title)

    # plt.show()
    target_path = f'analysis/_6_analyze_keyword/{title}.png'
    log("Saving graph as " + target_path, 2)
    plt.savefig(target_path)
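A usage sketch; labels and values are made up for illustration:

plot_hbar(['najib', 'election', 'pakatan'], [120, 75, 60],
          'english_keyword_frequencies')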
Example 12
def main(language, number_of_keywords_to_be_shown, semantic_type):
    log(f'Analyzing {semantic_type} keywords for {language} data', 1)

    if semantic_type == 'positive':
        keyword_processor = load_semantic_keywords_processor('', True, False, language)
    elif semantic_type == 'negative':
        keyword_processor = load_semantic_keywords_processor('', False, True, language)
    else:
        raise Exception(
            "Invalid semantic_type argument. Expected (positive/negative) but got "
            + semantic_type)
    
    posts = load_posts(f'analysis/_3_label_semantic/{language}.json')
    dic = {}
    for p in posts:
        matching_keywords = keyword_processor.extract_keywords(p["value"])
        for word in matching_keywords:
            if word not in dic:
                dic[word] = []
            dic[word].append(p["origin"])

    with open(f'analysis/_6_analyze_keyword/{language}_keyword_freq.json',
              'w', encoding='utf-8') as file:
        json.dump(dic, file, ensure_ascii=False)

    flattened_dic = {key: len(value) for key, value in dic.items()}

    tuples = sorted(flattened_dic.items(),
                    key=operator.itemgetter(1), reverse=True)

    y_labels = []
    x_values = []
    for t in tuples:
        y_labels.append(t[0])
        x_values.append(t[1])

    y_labels = y_labels[:number_of_keywords_to_be_shown]
    x_values = x_values[:number_of_keywords_to_be_shown]

    file_name = f'analysis/_6_analyze_keyword/{language}_top_{semantic_type}_keywords.txt'
    with open(file_name, 'w', encoding='utf-8') as file:
        for value in y_labels:
            file.write(value + "\n")
    log(f'Top {number_of_keywords_to_be_shown} keywords are saved as {file_name}', 2)

    # log('Plotting graph', 2)
    # plot_hbar(y_labels, x_values, f'{semantic_type}_keyword_frequencies')
    # log('DONE', 1)

    log('Plotting word cloud', 2)
    plot_wordcloud(dict(zip(y_labels, x_values)), f'{language}_{semantic_type}_keywordcloud')
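An invocation sketch (the argument values are hypothetical):

main('english', 100, 'negative')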
Example 13
def main(language):
    # 1. Load the post object generated by _3_label_semantic
    post_str = str(load_posts(f'analysis/_3_label_semantic/{language}.json'))

    # 2. Replace synonyms (for example, replace 'Najib' or 'PM' with 'najib')
    post_str = replace_synonyms(post_str, "../../keywords/target/")
    # literal_eval safely parses the str()-serialized posts back into objects
    from ast import literal_eval
    all_posts = literal_eval(replace_synonyms(post_str, get_keywords(language)))

    # 3. Standardize the date format of every post object
    log(f"Standardizing date format of each {language} post", 0)
    standardized = standardize_date_format(all_posts)
    log(f"Sorting post based on date", 1)

    # 4. Sort the Post object based on date
    sorted_posts = sorted(standardized, key=lambda x: x['date'])

    # 5. Extract the data from the sorted posts
    extracted = extract_data(sorted_posts, START_DATE, END_DATE, language)
    log(f'Storing results to analysis/results/{language}_extracted.json', 1)

    # 6. Save the extracted data as XXXX_extracted.json
    with open(f'analysis/results/{language}_extracted.json', 'w',
              encoding='utf-8') as file:
        json.dump(extracted, file, ensure_ascii=False)
Example 14
def load_posts(file_path):
    log("Loading posts from " + file_path, 1)
    with open(file_path, encoding='utf-8') as file:
        return json.load(file)
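A round-trip sketch pairing load_posts with Example 2's save_posts (the file name is made up):

save_posts([{"value": "hello", "date": "20180303"}], 'tmp_posts.json')
assert load_posts('tmp_posts.json')[0]["value"] == "hello"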