def process_subjectivity_file(filename, stem):
    """
    Load subjectivity score lookup

    Every non-blank line is split on single spaces. The third field and the
    last field are each expected to look like ``key=value``: the value of the
    third field is taken as the word, the value of the last field as its
    polarity label.

    :params:
        filename (str) : path to file
        stem (bool) : stem word if true
    :return:
        scores (dict) : word-scores lookup; "negative" maps to -1,
                        "positive" maps to 1, any other label maps to 0.
                        The dict also carries a "__dict_name__" entry.
    :raises:
        IndexError : if a non-blank line has fewer than three fields or the
                     inspected fields contain no "="
    """
    # Any label that is not listed here is scored as neutral (0).
    polarity_scores = {"negative": -1, "positive": 1}

    scores = {"__dict_name__": "subj scores"}
    with open(filename, "r") as f:
        for line in f:
            # Skip empty AND whitespace-only lines; the latter used to fall
            # through to the field parsing below and raise IndexError.
            if not line.strip():
                continue
            fields = line.split(" ")
            word = fields[2].split("=")[1]
            # The last field still carries the trailing newline, hence strip().
            polarity = fields[-1].split("=")[1].strip()
            scores[stem_word(word, stem)] = polarity_scores.get(polarity, 0)
    return scores
def get_bigram_sentiments(bigrams_path, stem):
    """
    Build the bigram sentiment lookup table

    Each line of the input must split (on whitespace) into exactly five
    fields; the first two are the words of the bigram and the third is the
    score. The remaining two fields are read but not used.

    :params:
        bigrams_path (str) : path to tweet bigram sentiments
        stem (bool) : stem word if true
    :returns:
        bigram_sentiments (dict) : key = (word1, word2) tuple, value = float
                                   score; also carries a "__dict_name__" entry
    """
    lookup = {"__dict_name__": "bigram sentiments"}
    # The explicit encoding is needed: reading fails on Windows without it.
    with open(bigrams_path, encoding="utf-8") as handle:
        for first, second, raw_score, _pos, _neg in map(str.split, handle):
            pair = (stem_word(first, stem), stem_word(second, stem))
            lookup[pair] = float(raw_score)
    return lookup
def get_pos_neg_words(pos_file, neg_file, stem):
    """
    Create set of positive and negative words

    Lines starting with ';' are treated as comments and skipped, as are
    blank lines. Every remaining line is stripped and passed through
    ``stem_word``.

    :params:
        pos_file (str) : positive words of the Opinion Lexicon.
            https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
        neg_file (str) : negative words of the Opinion Lexicon.
            https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
        stem (bool) : stem word if true
    :return:
        (pos_vocab, neg_vocab) (tuple of sets) : words read from pos_file
            and words read from neg_file, in that order
    """

    def load_vocab(path):
        # NOTE(review): no explicit encoding is passed, matching the previous
        # behaviour -- confirm the lexicon files' encoding before pinning one.
        with open(path) as infile:  # context manager closes the handle
            vocab = set()
            for raw in infile:
                word = raw.strip()
                # Blank lines would otherwise add the empty string as a word.
                if word and not raw.startswith(';'):
                    vocab.add(stem_word(word, stem))
            return vocab

    # Previously the two files were crossed: pos_file ended up in neg_vocab
    # and neg_file in pos_vocab, so the returned tuple was swapped.
    pos_vocab = load_vocab(pos_file)
    neg_vocab = load_vocab(neg_file)
    return pos_vocab, neg_vocab
def get_clusters(cluster_path, stem):
    """
    Build the word -> cluster lookup table

    Each line of the input must consist of exactly three tab-separated
    fields: the cluster, the word, and a third field that is read but not
    used.

    :params:
        cluster_path (str) : path to tweet clusters
        stem (bool) : stem word if true
    :returns:
        clusters (dict) : key = word, value = cluster
    """
    word_to_cluster = {}
    # The explicit encoding is needed: reading fails on Windows without it.
    with open(cluster_path, encoding="utf-8") as handle:
        for row in handle:
            cluster_id, token, _unused = row.split('\t')
            word_to_cluster[stem_word(token, stem)] = cluster_id
    return word_to_cluster
def get_negation_list(stem):
    """
    Return the set of common negation words.

    Kept as a function in case we want to add more words or switch to a
    'negation lexicon' later.

    :params:
        stem (bool) : stem word if true
    :return:
        neg_set (set) : common negation words (stemmed when stem is true)
    """
    negations = {
        'none', 'hasnt', 'couldnt', 'nowhere', 'havent', 'dont', 'cant',
        'didnt', 'arent', 'never', 'not', 'nothing', 'nobody', 'wouldnt',
        'hadnt', 'shouldnt', 'noone', 'aint', 'isnt', 'neither', 'wont',
        'doesnt', 'no',
    }
    if not stem:
        return negations
    return {stem_word(term, stem) for term in negations}