def populate_features_labels(annot_type, embed_dim, use_topic_only, use_accomodation_features=False):
    """Build the feature matrix, treatment vector, and outcome map for all
    (p1, p2, p3) triples of the given annotation type.

    Each triple contributes one row: the p1/p2 post embeddings placed in the
    topic's slot plus p1's LIWC sentiment vector, or a one-hot topic
    indicator when ``use_topic_only`` is set.  Treatment is 1 when the
    annotation value exceeds 1.  One outcome (p1-vs-p3 LIWC change) is
    computed per category group.

    Returns:
        (features, treatments, outcome_map) — numpy array of shape
        (num_triples, dim), numpy vector of 0/1 treatments, and a dict
        mapping category-group name -> numpy outcome vector.
    """
    wc.load_dictionary(wc.default_dictionary_filename())
    discussion_posts, triples = load_dicts()
    post_embeddings = load_document_proportions(embed_dim)
    category_types = load_liwc_cat_groups()

    # Sentiment feature columns; optionally extended with accommodation
    # categories (key spelling preserved from the upstream data).
    sent_cats = category_types['possent'] + category_types['negsent']
    if use_accomodation_features:
        sent_cats += category_types['accomodation']

    num_topics = len(topic_indices)
    annot_triples = triples[annot_type]
    num_triples = len(annot_triples)

    # Full layout: one embed_dim-wide slot per topic, then sentiment columns.
    # Topic-only layout: just a one-hot topic indicator.
    dim = num_topics if use_topic_only else embed_dim * num_topics + len(sent_cats)

    features = np.zeros((num_triples, dim))
    outcome_map = {ct: np.zeros(num_triples) for ct in category_types}
    treatments = np.zeros(num_triples)

    for idx, (p1, p2, p3, annot_val, did) in enumerate(annot_triples):
        treatment = 1 if annot_val > 1 else 0
        topic = discussion_posts[did][p1]['topic']
        embed = np.hstack([post_embeddings[p1], post_embeddings[p2]])

        p1_liwc = wc.score_text(discussion_posts[did][p1]['text'])
        p3_liwc = wc.score_text(discussion_posts[did][p3]['text'])
        p1_sent_vec = get_liwc_vector(p1_liwc, sent_cats)

        tidx = topic_indices[topic]
        if use_topic_only:
            features[idx][tidx] = 1
        else:
            # Embeddings go into this topic's slot; sentiment fills the tail.
            # NOTE(review): the sentiment write is assumed to belong to the
            # embedding layout only (dim reserves len(sent_cats) columns only
            # in this branch) — confirm against the original formatting.
            features[idx, tidx * embed_dim:(tidx + 1) * embed_dim] = embed
            features[idx, dim - len(sent_cats):] = p1_sent_vec

        treatments[idx] = treatment
        for ct in category_types:
            outcome_map[ct][idx] = compute_outcome(p1_liwc, p3_liwc, category_types[ct])

    return features, treatments, outcome_map
def main(ip_filename='sample_input.csv', col_name='text', op_filename='sample_output.csv'):
    """Score each row of a CSV with LIWC and write the results.

    Generalized from hard-coded paths to defaulted parameters; calling
    ``main()`` with no arguments behaves exactly as before.

    Args:
        ip_filename: input CSV path (default 'sample_input.csv').
        col_name: name of the column holding the text to score.
        op_filename: output CSV path (default 'sample_output.csv').
    """
    # The LIWC dictionary must be loaded before any scoring call.
    wc.load_dictionary(wc.default_dictionary_filename())
    ip_rows = read_csv(ip_filename)
    ip_scores, category_list = get_liwc_scores(wc, ip_rows, col_name)
    # Output header is the text column followed by one column per category.
    write_csv(op_filename, ip_scores, ["text"] + category_list)
def main(infname, outfname):
    """Score every row of *infname* with LIWC and write scores to *outfname*.

    Args:
        infname: path to the input file of texts to score.
        outfname: path for the output CSV of per-category LIWC scores.
    """
    # The LIWC dictionary must be loaded before any scoring call.
    wc.load_dictionary(wc.default_dictionary_filename())
    rows = read_file(infname)
    scores, category_list = get_liwc_scores(wc, rows)
    write_csv(outfname, scores, category_list)
import nltk
import re
import word_category_counter
import data_helper
import os, sys
from word2vec_extractor import Word2vecExtractor

DATA_DIR = "data"
LIWC_DIR = "liwc"

# Load the LIWC dictionary once at import time.
# NOTE(review): load_dictionary is handed a directory name here, while other
# call sites pass wc.default_dictionary_filename() — confirm the API accepts
# a directory path.
word_category_counter.load_dictionary(LIWC_DIR)

w2vecmodel = "data/glove-w2v.txt"
# Lazily-initialized Word2vecExtractor; None until the first feature request.
w2v = None


def get_word_embedding_features(text):
    """Return the doc2vec-style feature dict for *text*.

    The GloVe word-vector model is loaded on first call and cached in the
    module-level ``w2v`` singleton, so only the first call pays the load cost.
    """
    global w2v
    if w2v is None:
        print("loading word vectors ...", w2vecmodel)
        w2v = Word2vecExtractor(w2vecmodel)
    return w2v.get_doc2vec_feature_dict(text)


# Names of the feature configurations the pipeline recognizes.
FEATURE_SETS = {
    "word_pos_features",
    "word_features",
    "word_pos_liwc_features",
    "only_liwc",
    "word_embedding",
}
import shutil
import os
import sys
import word_category_counter as wc
from collections import Counter, defaultdict
import csv

# Input: a CSV of (filename, sentence) pairs given as the first CLI argument.
sent_file = sys.argv[1]
sent_input = []
# Fix: the 'U' (universal-newline) open-mode flag was removed in Python 3.11;
# the csv module's documented recipe is to open with newline='' instead.
with open(sent_file, newline='') as csvfile:
    label_reader = csv.reader(csvfile)
    for row in label_reader:
        sent_input += [row]

# The LIWC dictionary must be loaded before any scoring call.
wc.load_dictionary(wc.default_dictionary_filename())

# Header row for the output table of LIWC sentiment scores.
csv_op = [[
    "Filename", "Sentence", "Positive Emotion", "Negative Emotion",
    "Sadness", "Anger", "Anxiety"
]]
for pair in sent_input:
    name = pair[0]
    sentence = pair[1]
    liwc = wc.score_text(sentence)
    # x is the sign of the positive-vs-negative emotion comparison:
    # 1 = net positive, -1 = net negative, 0 = tied.
    if liwc["Positive Emotion"] > liwc["Negative Emotion"]:
        x = 1
    elif liwc["Positive Emotion"] < liwc["Negative Emotion"]:
        x = -1
    else:
        x = 0