def populate_features_labels(annot_type,
                             embed_dim,
                             use_topic_only,
                             use_accomodation_features=False):
    """Build the feature matrix, treatment vector and outcome arrays.

    One row is produced per entry of ``triples[annot_type]`` (as returned by
    ``load_dicts``). Each entry is indexed as
    ``(post, post, post, annotation value, discussion id)``.

    Args:
        annot_type: Key selecting which list of triples to process.
        embed_dim: Width of the per-topic embedding slot in a feature row;
            also forwarded to ``load_document_proportions``.
        use_topic_only: When true, each row is just a one-hot topic
            indicator of width ``len(topic_indices)``.
        use_accomodation_features: When true, the ``'accomodation'``
            category group is appended to the sentiment categories.

    Returns:
        A ``(features, treatments, outcome_map)`` tuple where ``features``
        has shape ``(num_triples, dim)``, ``treatments`` holds 1 for rows
        whose annotation value is greater than 1 (0 otherwise), and
        ``outcome_map`` maps every key of the LIWC category groups to an
        array of ``compute_outcome`` results, one per row.
    """
    wc.load_dictionary(wc.default_dictionary_filename())

    discussion_posts, triples = load_dicts()
    post_embeddings = load_document_proportions(embed_dim)
    category_types = load_liwc_cat_groups()

    # Sentiment categories scored on the first post of every triple.
    sent_cats = category_types['possent'] + category_types['negsent']
    if use_accomodation_features:
        sent_cats += category_types['accomodation']

    num_topics = len(topic_indices)
    annotated = triples[annot_type]
    num_triples = len(annotated)
    num_sent = len(sent_cats)

    # Full layout: one embed_dim-wide slot per topic, then the sentiment
    # vector at the tail of the row.
    full_dim = embed_dim * num_topics + num_sent
    dim = num_topics if use_topic_only else full_dim

    features = np.zeros((num_triples, dim))
    treatments = np.zeros(num_triples)
    outcome_map = {group: np.zeros(num_triples) for group in category_types}

    for row, record in enumerate(annotated):
        post_a = record[0]
        post_b = record[1]
        post_c = record[2]
        annotation = record[3]
        discussion_id = record[4]

        is_treated = 1 if annotation > 1 else 0

        thread = discussion_posts[discussion_id]
        topic = thread[post_a]['topic']

        pair_embedding = np.hstack(
            [post_embeddings[post_a], post_embeddings[post_b]])

        liwc_a = wc.score_text(thread[post_a]['text'])
        liwc_c = wc.score_text(thread[post_c]['text'])
        sent_vec = get_liwc_vector(liwc_a, sent_cats)

        topic_col = topic_indices[topic]
        if not use_topic_only:
            # NOTE(review): the destination slice is embed_dim wide while
            # the source concatenates two post embeddings -- confirm the
            # vectors from load_document_proportions are sized to fit.
            start = topic_col * embed_dim
            features[row, start:start + embed_dim] = pair_embedding
            features[row, dim - num_sent:] = sent_vec
        else:
            features[row, topic_col] = 1

        treatments[row] = is_treated

        # Outcomes compare the LIWC scores of the first and third post.
        for group, cats in category_types.items():
            outcome_map[group][row] = compute_outcome(liwc_a, liwc_c, cats)

    return features, treatments, outcome_map
def main():
    """Run ``get_liwc_scores`` over ``sample_input.csv`` and save the result.

    Rows are read from ``sample_input.csv``, the ``text`` column is handed
    to ``get_liwc_scores``, and the scores are written to
    ``sample_output.csv`` under a header of ``"text"`` followed by the
    returned category list.
    """
    source_csv, text_column, target_csv = (
        'sample_input.csv', 'text', 'sample_output.csv')

    # The dictionary has to be loaded before wc is passed to the scorer.
    wc.load_dictionary(wc.default_dictionary_filename())

    scores, categories = get_liwc_scores(wc, read_csv(source_csv), text_column)
    header = ["text"] + categories
    write_csv(target_csv, scores, header)
def main(infname, outfname):
    """Run ``get_liwc_scores`` over the rows of *infname*, write to *outfname*.

    Args:
        infname: Path passed to ``read_file`` to obtain the input rows.
        outfname: Path passed to ``write_csv`` for the scored output; the
            category list returned by ``get_liwc_scores`` is used as the
            header.
    """
    # The dictionary has to be loaded before wc is passed to the scorer.
    wc.load_dictionary(wc.default_dictionary_filename())

    rows = read_file(infname)
    scores, categories = get_liwc_scores(wc, rows)
    write_csv(outfname, scores, categories)
Example #4
0
import nltk
import re
import word_category_counter
import data_helper
import os, sys
from word2vec_extractor import Word2vecExtractor
# Directory names used by this module (relative paths).
DATA_DIR = "data"
LIWC_DIR = "liwc"

# Import-time side effect: the LIWC dictionary is loaded as soon as this
# module is imported, using LIWC_DIR as the argument.
word_category_counter.load_dictionary(LIWC_DIR)

# Path handed to Word2vecExtractor the first time embeddings are requested.
w2vecmodel = "data/glove-w2v.txt"
# Lazily created extractor; stays None until get_word_embedding_features()
# builds it on first call.
w2v = None




def get_word_embedding_features(text):
    """Return ``get_doc2vec_feature_dict(text)`` from the shared extractor.

    The module-level ``w2v`` extractor is created from ``w2vecmodel`` on the
    first call (a progress message is printed) and reused afterwards.
    """
    global w2v
    extractor = w2v
    if extractor is None:
        print("loading word vectors ...", w2vecmodel)
        # Store the new extractor in the module global so later calls reuse it.
        extractor = w2v = Word2vecExtractor(w2vecmodel)
    return extractor.get_doc2vec_feature_dict(text)



# Feature-set names gathered in a single set; ordering is not significant.
FEATURE_SETS = {
    "only_liwc",
    "word_embedding",
    "word_features",
    "word_pos_features",
    "word_pos_liwc_features",
}
Example #5
0
import shutil
import os
import sys
import word_category_counter as wc
from collections import Counter, defaultdict
import csv

# Path of the input CSV, taken from the first command-line argument.
sent_file = sys.argv[1]

# Read every row of the input file into memory as a list of lists.
# The 'U' mode flag was removed in Python 3.11 (open() raises ValueError),
# so the file is opened with newline='' instead, which is what the csv
# module asks for so that it can handle line endings itself.
with open(sent_file, newline='') as csvfile:
    sent_input = list(csv.reader(csvfile))

wc.load_dictionary(wc.default_dictionary_filename())

# Output table, seeded with its header row.
csv_op = [[
    "Filename", "Sentence", "Positive Emotion", "Negative Emotion", "Sadness",
    "Anger", "Anxiety"
]]
for pair in sent_input:
    name = pair[0]
    sentence = pair[1]
    liwc = wc.score_text(sentence)
    if liwc["Positive Emotion"] > liwc["Negative Emotion"]:
        x = 1
    elif liwc["Positive Emotion"] < liwc["Negative Emotion"]:
        x = -1
    else:
        x = 0