Example #1
def for_chinese(date):
    """Label each Chinese post as semantically positive and/or negative.

    Arguments:
        date {str} -- Must end with an underscore, e.g. '20180303_'
    """
    # 1. Load the post object generated by _2_remove_unrelated_data
    all_posts = load_posts('analysis/_2_remove_unrelated_data/chinese.json')

    # 2. Load the positive and negative Chinese keyword processors
    positive_kp = load_semantic_keywords_processor(date, True, False,
                                                   'chinese')
    negative_kp = load_semantic_keywords_processor(date, False, True,
                                                   'chinese')

    # 3. Match those keywords against every post object
    log("Labelling semantics of Chinese posts", 1)
    for p in all_posts:
        matching_positive_keywords = positive_kp.extract_keywords(p["value"])
        matching_negative_keywords = negative_kp.extract_keywords(p["value"])
        if matching_positive_keywords:
            p["semantic_value"]["positive"] = True
        if matching_negative_keywords:
            p["semantic_value"]["negative"] = True

    # 4. Save the labelled posts as chinese.json
    save_posts(all_posts, 'analysis/_3_label_semantic/chinese.json')
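The helper load_semantic_keywords_processor is not shown in this corpus. A minimal sketch of what it could look like, assuming the keyword lists live in per-language JSON files keyed by the dated prefix and that matching uses flashtext (the file layout and library choice are assumptions, not the project's confirmed implementation):

import json

from flashtext import KeywordProcessor


def load_semantic_keywords_processor(date, positive, negative, language):
    """Build a keyword processor from dated keyword files (sketch)."""
    kp = KeywordProcessor()
    if positive:
        # Hypothetical path; the real file names may differ
        with open(f'keywords/semantic/{date}positive_{language}.json') as f:
            kp.add_keywords_from_list(json.load(f))
    if negative:
        with open(f'keywords/semantic/{date}negative_{language}.json') as f:
            kp.add_keywords_from_list(json.load(f))
    return kp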
Example #2
def main(language):
    log(f"Loading {language} posts", 1)

    # 1. Load the post data
    posts = load_posts(f'analysis/_1_process_raw_data/output/{language}.json')

    # 2. Load the labels, i.e. keywords for political figures/parties (e.g. "Najib", "Pakatan Harapan")
    labels = get_labels(language)

    # 3. Match the labels with every post object
    label_post(posts, labels)

    # 4. Remove posts that are not related to any keyword
    log("Removing unrelated posts", 1)
    purified = [x for x in posts if len(x['related_to']) > 0]

    # 5. Save the remaining post objects
    log(f"Number of removed posts = {len(posts) - len(purified)}", 1)
    save_posts(purified, f'analysis/_2_remove_unrelated_data/{language}.json')

    # 6. Optionally save the post objects that are not related to any keyword
    SAVE_DUMPED_POST = False
    if SAVE_DUMPED_POST:
        dumped = [x for x in posts if len(x['related_to']) <= 0]
        save_posts(
            dumped, f'analysis/_2_remove_unrelated_data/dumped_{language}.json')
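label_post and get_labels are project helpers not shown here. A plausible sketch of label_post, assuming get_labels returns a dict mapping each canonical name to its aliases and that matching again uses flashtext (both are assumptions):

from flashtext import KeywordProcessor


def label_post(posts, labels):
    """Fill each post's 'related_to' with the labels whose keywords match (sketch)."""
    kp = KeywordProcessor()
    kp.add_keywords_from_dict(labels)  # e.g. {'Najib': ['Najib', 'najib razak'], ...}
    for p in posts:
        p['related_to'] = list(set(kp.extract_keywords(p['value'])))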
Example #3
def main(jobs, language):
    posts = []
    log(f"Parsing {language} posts", 1)
    # Run every job
    for job in jobs:
        posts += job.run()
    save_posts(posts, f'analysis/_1_process_raw_data/output/{language}.json')
    log(f"Number of {language} posts created: {len(posts)}", 1)
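Each job is expected to expose a run() method that returns a list of post dicts. A hypothetical example of such a job (the class name, file format, and column name are all assumptions):

import csv


class ParseCsvJob:
    """Hypothetical job: parse one raw CSV file into post dicts."""

    def __init__(self, path):
        self.path = path

    def run(self):
        with open(self.path, newline='') as f:
            # Assumes the raw CSV has a 'text' column
            return [{'value': row['text']} for row in csv.DictReader(f)]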
Example #4
def for_english():
    # all_posts = load_posts('analysis/_2_remove_unrelated_data/english.json')  # sanity check: compare this algorithm's output with the previous one's
    all_posts = load_posts('analysis/_1_process_raw_data/output/english.json')
    leaders = json.load(open('keywords/target/leader.json'))
    parties = json.load(open('keywords/target/party.json'))
    combined = {**leaders, **parties}
    keyword_dict = {}
    for key, value in combined.items():
        keyword_dict[key] = [key] + value["alias_en"]

    kp = KeywordProcessor()
    kp.add_keywords_from_dict(keyword_dict)
    for p in all_posts:
        p["related_to"] = list(set(kp.extract_keywords(p["value"])))

    purified = [x for x in all_posts if len(x['related_to']) > 0]
    log(f"Number of removed posts = {len(all_posts) - len(purified)}", 1)

    save_posts(purified, 'analysis/_2_remove_unrelated_data/english.json')
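KeywordProcessor here matches the flashtext API: add_keywords_from_dict maps a clean name to its aliases, and extract_keywords returns the clean names found in the text. A quick standalone demonstration, assuming KeywordProcessor is indeed flashtext's:

from flashtext import KeywordProcessor

kp = KeywordProcessor()
kp.add_keywords_from_dict({'Najib': ['Najib', 'najib razak'],
                           'Pakatan Harapan': ['Pakatan Harapan', 'PH']})
# Matching is case-insensitive by default; clean names are returned
print(kp.extract_keywords('najib razak met PH leaders'))
# ['Najib', 'Pakatan Harapan']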
Example #5
def main(language):
    post_id = 0
    tokenized_posts = []
    posts = load_posts(f'analysis/_2_remove_unrelated_data/{language}.json')
    for p in posts:
        p["belongs_to"] = "p" + str(post_id)
        post_id += 1
        p["semantic_value"] = "unassigned"
        # Strip URLs and {...} placeholders from the post body
        p["value"] = re.sub(r'^https?://.*[\r\n]*', '', p["value"], flags=re.MULTILINE)
        p["value"] = re.sub(r'\{[^}]*\}', ' ', p["value"])
        sentences = tokenize_post_into_sentence(p["value"])
        for s in sentences:
            sentence_post = p.copy()
            sentence_post["value"] = s
            tokenized_posts.append(sentence_post)

    if GENERATE_SAMPLE:
        save_posts(tokenized_posts[:100],
                   f'analysis/transform_format_for_mongodb/{language}_sample.json')
    save_posts(tokenized_posts,
               f'analysis/transform_format_for_mongodb/{language}.json')
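tokenize_post_into_sentence is another unshown helper. A minimal sketch, assuming a simple punctuation-based split (the real implementation may well use a proper sentence tokenizer such as NLTK's):

import re


def tokenize_post_into_sentence(text):
    """Split a post into sentences on ., !, ? and line breaks (sketch)."""
    sentences = re.split(r'(?<=[.!?])\s+|[\r\n]+', text)
    return [s.strip() for s in sentences if s.strip()]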
Example #6
from analysis.libs.load_posts import load_posts
from analysis.libs.save_posts import save_posts

posts = load_posts('analysis/using_fasttext/labelled_english_posts.json')
positive = "__label__2"  # fastText label treated as positive sentiment
with open('analysis/using_fasttext/predicted_label.txt') as file:
    # splitlines() avoids a spurious empty label from a trailing newline
    labels = file.read().splitlines()
    for i in range(len(posts)):
        posts[i]["semantic_value"][("positive" if labels[i] == positive else
                                    "negative")] = True

save_posts(posts, 'analysis/results/fasttext/english_analyzed.json')
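predicted_label.txt is expected to hold one fastText label per post, in the same order as labelled_english_posts.json. A sketch of how it could be produced with the fasttext Python bindings (the training file name and location are assumptions):

import fasttext

from analysis.libs.load_posts import load_posts

# Hypothetical training file containing '__label__X <text>' lines
model = fasttext.train_supervised(input='analysis/using_fasttext/train.txt')

posts = load_posts('analysis/using_fasttext/labelled_english_posts.json')
with open('analysis/using_fasttext/predicted_label.txt', 'w') as out:
    for p in posts:
        # predict() processes one line at a time, so strip newlines first
        labels, _ = model.predict(p['value'].replace('\n', ' '))
        out.write(labels[0] + '\n')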