def get_init_data(model_file, ark_file, dict_filepath, twit_dict_file):
    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)
    all_dictionaries = Dictionaries(dict_filepath)

    twit_sets = []
    stopwords = get_stopwords()
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(
        twit_dict_file)

    # build one distantly-supervised identity set per count cutoff, dropping
    # terms that are stopwords (with or without the trailing " person")
    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
            tw_distant_supervision_identity_dat.tot > v].term.values)
        twit_id = {t for t in twit_id
                   if t not in stopwords and t.replace(" person", "") not in stopwords}
        twit_sets.append([twit_id, "twit_identities_" + str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])

    return model, all_dictionaries, ark_clusters, \
        [t[0] for t in twit_sets], [t[1] for t in twit_sets]
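# Hedged usage sketch (added for illustration, not part of the original module):
# how the values returned by get_init_data above might be unpacked. All file
# paths below are hypothetical placeholders.
def _example_get_init_data():
    model, dictionaries, ark_clusters, twit_term_sets, twit_set_names = get_init_data(
        model_file="/path/to/word2vec_vectors.txt",         # hypothetical path
        ark_file="/path/to/ark_clusters.txt",               # hypothetical path
        dict_filepath="/path/to/identity_dictionaries/",    # hypothetical path
        twit_dict_file="/path/to/distant_supervision.tsv")  # hypothetical path
    # twit_term_sets / twit_set_names are parallel lists: one term set per count
    # cutoff, plus the expert non-identity terms and the stopword set
    return dict(zip(twit_set_names, twit_term_sets))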
def get_dictionary_features(filename, dictionary_location,
                            twitter_distance_supervision_file_location, twitter_cutoffs):
    twit_sets = []
    stopwords = get_stopwords()

    if twitter_distance_supervision_file_location:
        tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(
            twitter_distance_supervision_file_location)
        for v in twitter_cutoffs:
            twit_id = set(tw_distant_supervision_identity_dat[
                tw_distant_supervision_identity_dat.tot > v].term.values)
            twit_id = {t for t in twit_id
                       if t not in stopwords and t.replace(" person", "") not in stopwords}
            twit_sets.append([twit_id, "twit_identities_" + str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])

    all_dicts = Dictionaries(dictionary_location)
    return look_in_dict(filename, all_dicts,
                        [t[0] for t in twit_sets], [t[1] for t in twit_sets])
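# Hedged usage sketch (added for illustration, not original code): the first
# argument of get_dictionary_features is passed straight through to look_in_dict,
# which elsewhere in this codebase receives parsed sentences (lists of
# DependencyParseObject). The paths and cutoffs below are hypothetical.
def _example_get_dictionary_features(parsed_sentences):
    return get_dictionary_features(
        parsed_sentences,
        dictionary_location="/path/to/identity_dicts/",      # hypothetical dictionary directory
        twitter_distance_supervision_file_location="/path/to/distant_supervision.tsv",
        twitter_cutoffs=[10, 100, 1000])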
def get_init_data(model_file, ark_file):
    from gensim.models.word2vec import Word2Vec
    from pkg_resources import resource_filename, resource_listdir

    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)

    # load the identity / non-identity dictionaries packaged with twitter_dm
    files = [resource_filename('twitter_dm', 'data/identity_dictionaries/identity/' + x)
             for x in resource_listdir('twitter_dm', 'data/identity_dictionaries/identity/')]
    files += [resource_filename('twitter_dm', 'data/identity_dictionaries/non_identity_words/' + x)
              for x in resource_listdir('twitter_dm', 'data/identity_dictionaries/non_identity_words/')]
    all_dictionaries = Dictionaries(list_of_files=files)

    twit_sets = []
    stopwords = get_stopwords()
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(None)

    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
            tw_distant_supervision_identity_dat.tot > v].term.values)
        twit_id = {t for t in twit_id
                   if t not in stopwords and t.replace(" person", "") not in stopwords}
        twit_sets.append([twit_id, "twit_identities_" + str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])

    return model, all_dictionaries, ark_clusters, \
        [t[0] for t in twit_sets], [t[1] for t in twit_sets]
def get_twitter_distant_supervision_identity_dat(filename):
    stopwords = get_stopwords()

    # headerless tab-separated input: one (rule, term, count) row per extraction
    identity_dat = pd.DataFrame.from_csv(filename, sep='\t', header=None,
                                         encoding="utf8").reset_index()
    identity_dat.columns = ['rule', 'term', 'count']
    identity_dat.term = identity_dat.term.apply(get_cleaned_text)
    identity_dat = identity_dat[identity_dat.term != '']
    identity_dat = identity_dat[identity_dat.term != ' person']
    identity_dat = identity_dat[identity_dat.term != 'person']

    # sum counts per (term, rule), then pivot so each rule becomes a column
    grouped_data = identity_dat.groupby(["term", "rule"]).sum().reset_index()
    identity_dat = grouped_data.pivot_table(values='count', index='term',
                                            columns='rule', fill_value=0)\
        .sort(inplace=False, columns='i', ascending=False)\
        .reset_index()

    # total count across the rule columns; used as the cutoff value by callers
    identity_dat['tot'] = (identity_dat.i + identity_dat.y + identity_dat.h +
                           identity_dat.s + identity_dat.p)
    identity_dat.sort("tot", inplace=True, ascending=False)

    # drop stopword terms (with or without the trailing " person")
    identity_dat = identity_dat[identity_dat.term.apply(
        lambda x: x not in stopwords and x.replace(" person", "") not in stopwords)]
    return identity_dat.reset_index()
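# Hedged illustration (added, not original code): the distant-supervision file
# read above is expected to be a headerless tab-separated file of
# rule<TAB>term<TAB>count rows, where the rule column takes (at least) the values
# i, y, h, s and p that get pivoted into columns and summed into `tot`. The path
# and cutoff below are hypothetical placeholders.
def _example_distant_supervision_cutoff(path="/path/to/distant_supervision.tsv"):
    identity_dat = get_twitter_distant_supervision_identity_dat(path)
    # keep only terms whose total count across all rules exceeds the cutoff
    return set(identity_dat[identity_dat.tot > 1000].term.values)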
__author__ = 'kennyjoseph'

import numpy as np
from evaluation import evaluate
from util import get_cleaned_text
from sklearn.feature_extraction.text import CountVectorizer
from functools import partial
from sklearn.linear_model import LogisticRegression
from create_features import *
import codecs
from collections import defaultdict
from twitter_dm.utility.tweet_utils import get_stopwords

npcat = partial(np.concatenate, axis=1)
stopwords = get_stopwords()


def run_all_on_test_ids(fold, test_ids, word_vector_model, features_from_conll,
                        dict_for_filter,
                        eval_params=[0.25, 0.3, 0.35, 0.4, .45, .5, .6],
                        cutoff_params=[.0001],
                        use_filtered_params=[True],
                        datasets_to_use=['full'],  # 'x', 'wv', 'x_wv', 'all_wv'
                        regularization_params=[.6]):
    return_info = []
    models = []
    predictions = []
__author__ = 'kennyjoseph'

import re
from nltk.stem import WordNetLemmatizer
from util import get_cleaned_text
from twitter_dm.utility.tweet_utils import get_stopwords
from nltk.corpus import wordnet as wn
import sys

stopwords = get_stopwords()
wordnet_lemmatizer = WordNetLemmatizer()

# import inflect
# inflect_engine = inflect.engine()

# irregular plurals mapped to their singular forms
singular_map = {"children": "child",
                "men": "man",
                "women": "woman",
                "people": "person"}


class DependencyParseObject:
    def __init__(self, full_line=None, object_ids=[], term_map=None,
                 do_lemmatize=True, do_singular=True):
        self.cpostag = None
        self.features = []
        self.all_original_ids = []
        if full_line is not None:
            line = full_line.split("\t")
            self.line = line
            self.id = int(line[0])
            self.text = line[1]
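# Hedged illustration (added, not from the original source): DependencyParseObject
# is built from one tab-separated token row of a CoNLL-style dependency parse,
# e.g. "1<TAB>teachers<TAB>...". Only the first two fields (integer token id and
# token text) are visible in this truncated fragment; the remainder of __init__
# presumably consumes the other columns, so real rows carry more fields than the
# two shown here.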
def run_baseline_on_conll_file(conll_filename, path_to_dicts, output_filename):
    features_from_conll, blah = get_all_features(conll_filename, None, None, None, None)
    labels, features, obj_inds = \
        configure_features_for_wordvectors_and_remove_twitterner(features_from_conll)[0:3]

    ## for dictionary-based evaluation
    stopwords = get_stopwords()
    data = read_grouped_by_newline_file(conll_filename)
    dependency_parses = []
    for x in data:
        dependency_parses.append([DependencyParseObject(o) for o in x])

    # get all the dictionaries together
    p_look_in_dict = partial(look_in_dict, sets=[stopwords], set_names=["stopwords"])
    act_dict = p_look_in_dict(dependency_parses,
                              Dictionaries(os.path.join(path_to_dicts, 'identities.txt')))
    wordnet_dict = p_look_in_dict(dependency_parses,
                                  Dictionaries(os.path.join(path_to_dicts, 'wordnet_identities.txt')))
    racial_dict = p_look_in_dict(dependency_parses,
                                 Dictionaries(os.path.join(path_to_dicts, 'racial_slur_identities.txt')))
    national_dict = p_look_in_dict(dependency_parses,
                                   Dictionaries(os.path.join(path_to_dicts, 'national_identities.txt')))
    job_dict = p_look_in_dict(dependency_parses,
                              Dictionaries(os.path.join(path_to_dicts, 'job_identities.txt')))
    all_ds = Dictionaries(os.path.join(path_to_dicts, '*identities.txt'))
    all_dict = p_look_in_dict(dependency_parses, all_ds)

    # get the bootstrapped dictionary together
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(
        BOOTSTRAPPED_DICTIONARY_LOCATION)
    stopwords = get_stopwords()
    twit_sets = []
    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
            tw_distant_supervision_identity_dat.tot > v].term.values)
        twit_id = twit_id - stopwords
        twit_sets.append([twit_id, "twit_identities" + str(v)])

    all_random_ids = get_test_ids(conll_filename, 0, -1, -1)
    y = np.array(labels)
    output_file = open(output_filename, "w")

    # test all the basic dicts
    for d in [['act_dict', act_dict],
              ['racial_dict', racial_dict],
              ['nat_dict', national_dict],
              ['job_dict', job_dict],
              ['wordnet_dict', wordnet_dict],
              ['all_dict', all_dict]]:
        preds = get_isin_array(d[1], obj_inds)
        out = evaluate(.4, y, preds, obj_inds, all_random_ids, print_eval=True)
        output_file.write(tsn([d[0]] + out[1:]))

    # test the bootstrapped dicts
    for twit_set, twit_set_id in twit_sets:
        d = look_in_dict(dependency_parses,
                         sets=[twit_set, stopwords],
                         set_names=["twit_identities", "stopwords"])
        out = evaluate(.4, y, get_isin_array(d, obj_inds), obj_inds,
                       all_random_ids, print_eval=True)
        output_file.write(tsn([twit_set_id + "_alone"] + out[1:]))

        d = look_in_dict(dependency_parses, all_ds,
                         [twit_set, stopwords], [twit_set_id, "stopwords"])
        out = evaluate(.4, y, get_isin_array(d, obj_inds), obj_inds,
                       all_random_ids, print_eval=True)
        output_file.write(tsn([twit_set_id + "_w_all"] + out[1:]))

    output_file.close()
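# Hedged usage sketch (added for illustration, not original code):
# run_baseline_on_conll_file above scores each dictionary baseline at a fixed 0.4
# threshold and writes one tab-separated result row per dictionary. All paths
# below are hypothetical placeholders.
def _example_run_baseline():
    run_baseline_on_conll_file(
        conll_filename="/path/to/labeled_data.conll",        # hypothetical annotated CoNLL file
        path_to_dicts="/path/to/dictionaries/",              # hypothetical dir of *identities.txt files
        output_filename="/path/to/baseline_results.tsv")     # hypothetical output path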