Example 1
def for_chinese(date):
    """[summary]
    
    Arguments:
        date {str} -- Must be prefixed with underscore, e.g. '20180303_' 
    """
    # 1. Load the post object generated by _2_remove_unrelated_data
    all_posts = load_posts('analysis/_2_remove_unrelated_data/chinese.json')

    # 2. Load the negative and positive Chinese keyword processors
    positive_kp = load_semantic_keywords_processor(date, True, False,
                                                   'chinese')
    negative_kp = load_semantic_keywords_processor(date, False, True,
                                                   'chinese')

    # 3. Match those keywords against every post object
    log("Labelling semantic of chinese post", 1)
    for p in all_posts:
        matching_positive_keywords = positive_kp.extract_keywords(p["value"])
        matching_negative_keywords = negative_kp.extract_keywords(p["value"])
        if matching_positive_keywords:
            p["semantic_value"]["positive"] = True
        if matching_negative_keywords:
            p["semantic_value"]["negative"] = True

    # 4. Save the labelled posts as chinese.json
    save_posts(all_posts, 'analysis/_3_label_semantic/chinese.json')
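
The helper load_semantic_keywords_processor is called here but defined elsewhere in the repo. A minimal sketch of what it might look like, assuming one-keyword-per-line files under a hypothetical keywords/semantic/ directory and the flashtext library (whose KeywordProcessor API Example 6 calls directly):

from flashtext import KeywordProcessor

def load_semantic_keywords_processor(date, positive, negative, language):
    # Hypothetical sketch: build a flashtext matcher from plain-text
    # keyword files; the real file layout may differ.
    kp = KeywordProcessor()
    semantic_types = []
    if positive:
        semantic_types.append('positive')
    if negative:
        semantic_types.append('negative')
    for semantic_type in semantic_types:
        # e.g. keywords/semantic/20180303_positive_chinese.txt (assumed path)
        path = f'keywords/semantic/{date}{semantic_type}_{language}.txt'
        with open(path, encoding='utf-8') as f:
            for line in f:
                keyword = line.strip()
                if keyword:
                    kp.add_keyword(keyword)
    return kp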
Example 2
def main(language):
    log(f"Loading {language} posts", 1)

    # 1. Load the post data
    posts = load_posts(f'analysis/_1_process_raw_data/output/{language}.json')

    # 2. Load the labels, i.e. keywords for political figures/parties (e.g. "Najib", "Pakatan Harapan")
    labels = get_labels(language)

    # 3. Match the labels with every post object
    label_post(posts, labels)

    # 4. Remove posts that are not related to any keyword
    log("Removing unrelated posts", 1)
    purified = [x for x in posts if len(x['related_to']) > 0]

    # 5. Save the remaining post objects
    log(f"Number of removed posts = {len(posts) - len(purified)}", 1)
    save_posts(purified, f'analysis/_2_remove_unrelated_data/{language}.json')

    # 6. Optionally save the post objects that are not related to any keyword
    SAVE_DUMPED_POST = False
    if SAVE_DUMPED_POST:
        dumped = [x for x in posts if len(x['related_to']) == 0]
        save_posts(
            dumped, f'analysis/_2_remove_unrelated_data/dumped_{language}.json')
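
label_post and get_labels are not shown in this excerpt. A minimal sketch of the matching step label_post might perform, assuming flashtext (as in Example 6) and assuming get_labels returns a dict mapping each canonical name to its aliases; both are assumptions, not confirmed by the source:

from flashtext import KeywordProcessor

def label_post(posts, labels):
    # Hypothetical sketch: tag each post with the canonical labels
    # (e.g. "Najib") whose aliases appear in the post text.
    kp = KeywordProcessor()
    kp.add_keywords_from_dict(labels)  # assumed shape: {canonical: [alias, ...]}
    for p in posts:
        p["related_to"] = list(set(kp.extract_keywords(p["value"])))

This mirrors the inline matching loop in Example 6, which uses exactly this KeywordProcessor pattern.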
Example 3
def main(language):
    log(f"Analyzing {language} source", 1)
    dir1 = '_1_process_raw_data/output'  # use dir1 to analyze raw source
    dir2 = '_2_remove_unrelated_data'  # use dir2 to analyze filtered source
    all_posts = load_posts(f'analysis/{dir1}/{language}.json')
    all_posts = [x for x in all_posts if x["date"] != '']
    standardized = standardize_date_format(all_posts)
    dic = {}
    date_list = []
    for p in standardized:
        source = p['source']
        date = p['date']
        if source not in dic:
            dic[source] = []
        if date > MIN_DATE:
            dic[source].append(date)
            date_list.append(date)
    date_list = filter_date(date_list)

    for source_name in dic:
        dic[source_name] = filter_date(dic[source_name])
        print(source_name)
        print(len(dic[source_name]))
        dic[source_name] = group_dates(MIN_DATE, MAX_DATE, dic[source_name])

    with open(f'analysis/_5_analyze_source/{language}_source.json',
              'w') as outfile:
        print("Saving ", {language})
        json.dump(dic, outfile)
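
group_dates and filter_date are also defined elsewhere. A minimal sketch of what group_dates might do, assuming dates are ISO 'YYYY-MM-DD' strings (consistent with the string comparison against MIN_DATE above) and that the goal is a per-day post count; both are assumptions:

from datetime import date, timedelta

def group_dates(min_date, max_date, dates):
    # Hypothetical sketch: count how many entries fall on each day
    # between min_date and max_date, inclusive.
    day = date.fromisoformat(min_date)
    end = date.fromisoformat(max_date)
    counts = {}
    while day <= end:
        counts[day.isoformat()] = 0
        day += timedelta(days=1)
    for d in dates:
        if d in counts:
            counts[d] += 1
    return counts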
Example 4
def main(language, number_of_keywords_to_be_shown, semantic_type):
    log(f'Analyzing {semantic_type} keywords for {language} data', 1)

    if semantic_type == 'positive':
        keyword_processor = load_semantic_keywords_processor('', True, False, language)
    elif semantic_type == 'negative':
        keyword_processor = load_semantic_keywords_processor('', False, True, language)
    else:
        raise ValueError(f"Invalid semantic_type: {semantic_type}")
    
    posts = load_posts(f'analysis/_3_label_semantic/{language}.json')
    dic = {}
    for p in posts:
        matching_keywords = keyword_processor.extract_keywords(p["value"])
        for word in matching_keywords:
            if word not in dic:
                dic[word] = []
            dic[word].append(p["origin"])

    with open(f'analysis/_6_analyze_keyword/{language}_keyword_freq.json',
              'w') as outfile:
        json.dump(dic, outfile, ensure_ascii=False)

    flattened_dic = {}
    for key in dic:
        flattened_dic[key] = len(dic[key])

    tuples = sorted(flattened_dic.items(),
                    key=operator.itemgetter(1), reverse=True)

    y_labels = []
    x_values = []
    for t in tuples:
        y_labels.append(t[0])
        x_values.append(t[1])

    y_labels = y_labels[:number_of_keywords_to_be_shown]
    x_values = x_values[:number_of_keywords_to_be_shown]

    file_name = f'analysis/_6_analyze_keyword/{language}_top_{semantic_type}_keywords.txt'
    with open(file_name, 'w') as file:
        for value in y_labels:
            file.write(value + "\n")
    log(f'Top {number_of_keywords_to_be_shown} keywords are saved as {file_name}', 2)

    # log('Plotting graph', 2)
    # plot_hbar(y_labels, x_values, f'{semantic_type}_keyword_frequencies')
    # log('DONE', 1)

    log('Plotting word cloud', 2)
    plot_wordcloud(dict(zip(y_labels, x_values)), f'{language}_{semantic_type}_keywordcloud')
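
plot_wordcloud is not shown either. A minimal sketch of one possible implementation on top of the wordcloud package; the package choice and output path are assumptions, and Chinese text would additionally need a CJK-capable font_path:

from wordcloud import WordCloud

def plot_wordcloud(frequencies, file_name):
    # Hypothetical sketch: render a word cloud from a {keyword: count}
    # dict and save it as a PNG next to the other analysis output.
    wc = WordCloud(width=800, height=400, background_color='white')
    wc.generate_from_frequencies(frequencies)
    wc.to_file(f'analysis/_6_analyze_keyword/{file_name}.png')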
Example 5
def test1():
    posts = load_posts('../_1_process_raw_data/output/sample_output.json')
    assert len(posts) == 2
    assert posts[0] == {
        'date': '2017-05-18 11:56:09',
        'value':
        "b'report: red granite in talks with doj to settle 1mdb-linked lawsuit'",
        'source': 'facebook',
        'related_to': None,
        'semantic_value': None
    }
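
load_posts and save_posts appear in every example but are never defined. A minimal sketch, assuming posts are serialized as a plain JSON array (the exact serialization is not confirmed by the source):

import json

def load_posts(path):
    # Hypothetical sketch: read a JSON array of post dicts.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def save_posts(posts, path):
    # Hypothetical sketch: write the posts back out, keeping
    # non-ASCII text (e.g. Chinese) readable in the file.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(posts, f, ensure_ascii=False)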
Example 6
def for_english():
    # all_posts = load_posts('analysis/_2_remove_unrelated_data/english.json')  # to check whether this new algorithm differs from the previous one
    all_posts = load_posts('analysis/_1_process_raw_data/output/english.json')
    leaders = json.load(open('keywords/target/leader.json'))
    parties = json.load(open('keywords/target/party.json'))
    combined = {**leaders, **parties}
    keyword_dict = {}
    for key, value in combined.items():
        keyword_dict[key] = [key] + value["alias_en"]

    kp = KeywordProcessor()
    kp.add_keywords_from_dict(keyword_dict)
    for p in all_posts:
        p["related_to"] = list(set(kp.extract_keywords(p["value"])))

    purified = [x for x in all_posts if len(x['related_to']) > 0]
    log(f"Number of removed posts = " + str(len(all_posts) - len(purified)), 1)

    save_posts(purified, f'analysis/_2_remove_unrelated_data/english.json')
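
The structure of keywords/target/leader.json and party.json is only implied by the alias_en lookup above. A hypothetical entry consistent with that lookup (and with the 'Najib'/'PM' synonym example in Example 8), shown as a Python literal; the real files may carry more fields, e.g. aliases for other languages:

leaders = {
    "Najib": {
        "alias_en": ["PM"]  # assumed: each entry maps a canonical name to English aliases
    }
}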
Example 7
def main(language):
    post_id = 0
    tokenized_posts = []
    posts = load_posts(f'analysis/_2_remove_unrelated_data/{language}.json')
    for p in posts:
        p["belongs_to"] = "p" + str(post_id)
        post_id += 1
        p["semantic_value"] = "unassigned"
        p["value"] = re.sub(r'^https?:\/\/.*[\r\n]*', '',  p["value"], flags=re.MULTILINE)
        p["value"] = re.sub(r'\{[^}]*\}', ' ', p["value"])
        sentences= tokenize_post_into_sentence(p["value"])
        for s in sentences:
            copy= p.copy()
            copy["value"]= s
            tokenized_posts.append(copy)

    if GENERATE_SAMPLE:
        save_posts(tokenized_posts[:100],
                   f'analysis/transform_format_for_mongodb/{language}_sample.json')
    save_posts(tokenized_posts,
               f'analysis/transform_format_for_mongodb/{language}.json')
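
tokenize_post_into_sentence is defined elsewhere in the repo. A minimal sketch assuming NLTK's Punkt sentence tokenizer is acceptable; the project may instead use a simple regex split, especially for Chinese text:

import nltk

def tokenize_post_into_sentence(text):
    # Hypothetical sketch: split a post into sentences.
    # Requires a one-off nltk.download('punkt') beforehand.
    return nltk.sent_tokenize(text)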
Example 8
def main(language):
    # 1. Load the post objects generated by _3_label_semantic
    #    (stringified so replace_synonyms can rewrite the whole dataset at once)
    post_str = str(load_posts(f'analysis/_3_label_semantic/{language}.json'))

    # 2. Replace synonyms, (for example: replace 'Najib','PM' with 'najib')
    post_str = replace_synonyms(post_str, "../../keywords/target/")
    all_posts = eval(replace_synonyms(post_str, get_keywords(language)))

    # 3. Standardize date format of every Post object
    log(f"Standardizing date format of each {language} posts", 0)
    standardized = standardize_date_format(all_posts)
    log(f"Sorting post based on date", 1)

    # 4. Sort the Post object based on date
    sorted_posts = sorted(standardized, key=lambda x: x['date'])

    # 5. Extract the data from the sorted posts
    extracted = extract_data(sorted_posts, START_DATE, END_DATE, language)
    log(f'Storing results to analysis/results/{language}_extracted.json', 1)

    # 6. Save the extracted data as XXXX_extracted.json
    json.dump(extracted,
              open(f'analysis/results/{language}_extracted.json', 'w'),
              ensure_ascii=False)
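
standardize_date_format is shared with Example 3 but never shown. A minimal sketch, assuming the raw posts carry mixed timestamp formats and that ISO 'YYYY-MM-DD' strings are the target (both assumptions; ISO strings would make the string-based sorting and comparisons in these examples correct):

from dateutil import parser

def standardize_date_format(posts):
    # Hypothetical sketch: normalize every post's date so dates
    # sort and compare correctly as plain strings.
    for p in posts:
        p['date'] = parser.parse(p['date']).strftime('%Y-%m-%d')
    return posts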
Example 9
from analysis.libs.load_posts import load_posts
from analysis.using_fasttext.flatten import flatten
from analysis.using_fasttext.labelize_using_fasttextformat import labelize_using_fasttextformat

posts = load_posts('analysis/_3_label_semantic/english.json')
labelled = labelize_using_fasttextformat(posts)
# flattened = flatten(posts)
print(labelled)
# Second snippet: apply fastText's predicted labels back onto the posts
from analysis.libs.load_posts import load_posts
from analysis.libs.save_posts import save_posts

posts = load_posts('analysis/using_fasttext/labelled_english_posts.json')
positive = "__label__2"
with open('analysis/using_fasttext/predicted_label.txt') as file:
    labels = file.read().split('\n')
    for i in range(len(posts)):
        label = "positive" if labels[i] == positive else "negative"
        posts[i]["semantic_value"][label] = True

save_posts(posts, 'analysis/results/fasttext/english_analyzed.json')
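
labelize_using_fasttextformat is imported above but not shown. fastText's supervised format expects each training line to start with a __label__ prefix, so a minimal sketch might look like this; the mapping of positive posts to __label__2 is an assumption inferred from the constant in the second snippet:

def labelize_using_fasttextformat(posts):
    # Hypothetical sketch: emit one fastText-formatted training line
    # per post, deriving the label from the post's semantic_value.
    lines = []
    for p in posts:
        semantic = p.get("semantic_value")
        is_positive = isinstance(semantic, dict) and semantic.get("positive")
        lines.append(f'{"__label__2" if is_positive else "__label__1"} {p["value"]}')
    return lines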