def get_init_data(model_file, ark_file, dict_filepath, twit_dict_file):

    # nb: in gensim >= 1.0 this loader is KeyedVectors.load_word2vec_format
    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)
    all_dictionaries = Dictionaries(dict_filepath)
    twit_sets = []
    stopwords = get_stopwords()
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(
        twit_dict_file)

    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[(
            tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = {
            t
            for t in twit_id
            if t not in stopwords and t.replace(" person", "") not in stopwords
        }
        twit_sets.append([twit_id, "twit_identities_" + str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])

    return model, all_dictionaries, ark_clusters, [t[0] for t in twit_sets], [
        t[1] for t in twit_sets
    ]
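# Hedged usage sketch (not from the original source): every path below is a
# hypothetical placeholder; it only illustrates how the five return values unpack.
#
#   model, all_dicts, ark_clusters, id_sets, id_set_names = get_init_data(
#       "vectors.w2v.txt",             # word2vec text-format vectors
#       "ark_clusters.txt",            # ARK Twitter word-cluster file
#       "dictionaries/",               # passed straight to Dictionaries(...)
#       "twitter_identity_counts.tsv"  # distant-supervision term counts
#   )
#   # id_sets[i] is a set of terms and id_set_names[i] is its label,
#   # e.g. "twit_identities_1000", "expert_non", "stopword".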
def get_dictionary_features(filename, dictionary_location,
                            twitter_distance_supervision_file_location,
                            twitter_cutoffs):

    twit_sets = []
    stopwords = get_stopwords()
    if twitter_distance_supervision_file_location:
        tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(
            twitter_distance_supervision_file_location)

        for v in twitter_cutoffs:
            twit_id = set(tw_distant_supervision_identity_dat[(
                tw_distant_supervision_identity_dat.tot > v)].term.values)
            twit_id = {
                t
                for t in twit_id if t not in stopwords
                and t.replace(" person", "") not in stopwords
            }
            twit_sets.append([twit_id, "twit_identities_" + str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])

    all_dicts = Dictionaries(dictionary_location)
    return look_in_dict(filename, all_dicts, [t[0] for t in twit_sets],
                        [t[1] for t in twit_sets])
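# Hedged usage sketch (not from the original source); the paths are hypothetical
# and the cutoffs mirror the thresholds used in get_init_data above.
#
#   dict_features = get_dictionary_features(
#       "tweets.conll",                 # input file handed to look_in_dict
#       "dictionaries/",                # location passed to Dictionaries(...)
#       "twitter_identity_counts.tsv",  # or None to skip the twitter identity sets
#       twitter_cutoffs=[10, 100, 1000, 10000, 50000])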
def get_init_data(model_file, ark_file):
    # imported locally so the snippet stands alone; resource_filename /
    # resource_listdir are used below
    from gensim.models.word2vec import Word2Vec
    from pkg_resources import resource_filename, resource_listdir

    # nb: in gensim >= 1.0 this loader is KeyedVectors.load_word2vec_format
    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)

    files = [resource_filename('twitter_dm', 'data/identity_dictionaries/identity/'+x) for x in
             resource_listdir('twitter_dm', 'data/identity_dictionaries/identity/')]

    files += [resource_filename('twitter_dm', 'data/identity_dictionaries/non_identity_words/'+x) for x in
             resource_listdir('twitter_dm', 'data/identity_dictionaries/non_identity_words/')]

    all_dictionaries = Dictionaries(list_of_files=files)
    twit_sets = []
    stopwords = get_stopwords()

    # nb: this variant passes None, so it assumes a version of
    # get_twitter_distant_supervision_identity_dat with a default file location
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(None)

    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
                      (tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = {t for t in twit_id
                   if t not in stopwords and t.replace(" person", "") not in stopwords}
        twit_sets.append([twit_id, "twit_identities_" + str(v)])

    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])

    return (model, all_dictionaries, ark_clusters,
            [t[0] for t in twit_sets], [t[1] for t in twit_sets])
def get_twitter_distant_supervision_identity_dat(filename):
    stopwords = get_stopwords()
    # pd.DataFrame.from_csv was removed from pandas; read_csv yields the same
    # three raw columns without needing reset_index()
    identity_dat = pd.read_csv(filename, sep='\t', header=None, encoding="utf8")
    identity_dat.columns = ['rule', 'term', 'count']
    identity_dat.term = identity_dat.term.apply(get_cleaned_text)
    identity_dat = identity_dat[identity_dat.term != '']
    identity_dat = identity_dat[identity_dat.term != ' person']
    identity_dat = identity_dat[identity_dat.term != 'person']
    grouped_data = identity_dat.groupby(["term", "rule"]).sum().reset_index()
    # one row per term, one zero-filled column per rule ('i', 'y', 'h', 's', 'p')
    identity_dat = grouped_data.pivot_table(values='count',
                                            index='term', columns='rule',
                                            fill_value=0)\
                               .sort_values(by='i', ascending=False)\
                               .reset_index()
    identity_dat['tot'] = (identity_dat.i + identity_dat.y + identity_dat.h +
                           identity_dat.s + identity_dat.p)
    # DataFrame.sort was removed; sort_values is the replacement
    identity_dat = identity_dat.sort_values("tot", ascending=False)
    identity_dat = identity_dat[identity_dat.term.apply(
        lambda x: x not in stopwords and x.replace(" person", "") not in stopwords)]
    return identity_dat.reset_index()
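# Assumed input format (inferred from the code above, not documented in the
# source): a headerless TSV of rule \t term \t count, where the rules summed
# into 'tot' are 'i', 'y', 'h', 's' and 'p'. A made-up example:
#
#   i   police officer   5210
#   y   teacher          880
#   p   nurse            1204
#
# After pivoting, each term gets one row of per-rule counts plus a 'tot'
# column, which the cutoff loops above threshold (e.g. tot > 1000).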
__author__ = 'kennyjoseph'
import numpy as np
from evaluation import evaluate
from util import get_cleaned_text
from sklearn.feature_extraction.text import CountVectorizer
from functools import partial
from sklearn.linear_model import LogisticRegression
from create_features import *
import codecs
from collections import defaultdict
from twitter_dm.utility.tweet_utils import get_stopwords

npcat = partial(np.concatenate, axis=1)
stopwords = get_stopwords()

def run_all_on_test_ids(fold,
                        test_ids,
                        word_vector_model,
                        features_from_conll,
                        dict_for_filter,
                        eval_params=[0.25, 0.3, 0.35, 0.4, .45, .5, .6],
                        cutoff_params=[.0001],
                        use_filtered_params=[True],
                        datasets_to_use=['full'],  # also: 'x', 'wv', 'x_wv', 'all_wv'
                        regularization_params=[.6]):

    return_info = []
    models = []
    predictions = []
__author__ = 'kennyjoseph'

import re
from nltk.stem import WordNetLemmatizer
from util import get_cleaned_text
from twitter_dm.utility.tweet_utils import get_stopwords
from nltk.corpus import wordnet as wn
import sys
stopwords = get_stopwords()

wordnet_lemmatizer = WordNetLemmatizer()
#import inflect

#inflect_engine = inflect.engine()

singular_map = {"children": "child",
                "men": "man",
                "women": "woman",
                "people": "person"}

class DependencyParseObject:
    # nb: object_ids=[] is a shared mutable default; callers that mutate it
    # should pass a fresh list
    def __init__(self, full_line=None, object_ids=[], term_map=None,
                 do_lemmatize=True, do_singular=True):
        self.cpostag = None
        self.features = []
        self.all_original_ids = []
        if full_line is not None:
            line = full_line.split("\t")
            self.line = line
            self.id = int(line[0])
            self.text = line[1]
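# Inferred from the constructor above (the rest of the class is truncated here):
# full_line is one tab-separated token row in a CoNLL-style dependency parse,
# with the token id in column 0 and the surface text in column 1. A hypothetical
# row "3\tteachers\t..." would give self.id == 3 and self.text == "teachers".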
def run_baseline_on_conll_file(conll_filename, path_to_dicts, output_filename):

    features_from_conll, _ = get_all_features(conll_filename, None, None,
                                              None, None)
    labels, features, obj_inds = configure_features_for_wordvectors_and_remove_twitterner(
        features_from_conll)[0:3]
    
    ## for dictionary-based evaluation
    stopwords = get_stopwords()

    data = read_grouped_by_newline_file(conll_filename)
    dependency_parses = []
    for x in data:
        dependency_parses.append([DependencyParseObject(o) for o in x])

    # get all the dictionaries together
    p_look_in_dict = partial(look_in_dict, sets=[stopwords], set_names=["stopwords"])
    act_dict = p_look_in_dict(
        dependency_parses, Dictionaries(os.path.join(path_to_dicts, 'identities.txt')))
    wordnet_dict = p_look_in_dict(
        dependency_parses, Dictionaries(os.path.join(path_to_dicts, 'wordnet_identities.txt')))
    racial_dict = p_look_in_dict(
        dependency_parses, Dictionaries(os.path.join(path_to_dicts, 'racial_slur_identities.txt')))
    national_dict = p_look_in_dict(
        dependency_parses, Dictionaries(os.path.join(path_to_dicts, 'national_identities.txt')))
    job_dict = p_look_in_dict(
        dependency_parses, Dictionaries(os.path.join(path_to_dicts, 'job_identities.txt')))

    all_ds = Dictionaries(os.path.join(path_to_dicts, '*identities.txt'))
    all_dict = p_look_in_dict(dependency_parses, all_ds)
    
    # get the bootstrapped dictionary together
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(BOOTSTRAPPED_DICTIONARY_LOCATION)
    stopwords = get_stopwords()
    twit_sets = []
    
    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
                      (tw_distant_supervision_identity_dat.tot > v)].term.values)
        twit_id = twit_id - stopwords
        twit_sets.append([twit_id, "twit_identities" + str(v)])
    
    all_random_ids = get_test_ids(conll_filename, 0, -1, -1)
    
    y = np.array(labels)

    output_file = open(output_filename, "w")

    #test all the basic dicts
    for d in [['act_dict', act_dict],
              ['racial_dict', racial_dict],
              ['nat_dict', national_dict],
              ['job_dict', job_dict],
              ['wordnet_dict', wordnet_dict],
              ['all_dict', all_dict]]:
        preds = get_isin_array(d[1], obj_inds)
        out = evaluate(.4, y, preds, obj_inds, all_random_ids, print_eval=True)
        output_file.write(tsn([d[0]] + out[1:]))
    
    # test the bootstrapped dicts
    for twit_set, twit_set_id in twit_sets:
        d = look_in_dict(dependency_parses, sets=[twit_set, stopwords],
                         set_names=["twit_identities", "stopwords"])
        out = evaluate(.4, y, get_isin_array(d, obj_inds), obj_inds,
                       all_random_ids, print_eval=True)
        output_file.write(tsn([twit_set_id + "_alone"] + out[1:]))
        d = look_in_dict(dependency_parses, all_ds, [twit_set, stopwords],
                         [twit_set_id, "stopwords"])
        out = evaluate(.4, y, get_isin_array(d, obj_inds), obj_inds,
                       all_random_ids, print_eval=True)
        output_file.write(tsn([twit_set_id + "_w_all"] + out[1:]))

    output_file.close()
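# Hedged usage sketch (not from the original source): the paths are hypothetical.
# BOOTSTRAPPED_DICTIONARY_LOCATION is assumed to be defined in the surrounding
# module, since the function reads it directly.
#
#   run_baseline_on_conll_file(
#       "annotated_tweets.conll",  # CoNLL file with gold identity labels
#       "dictionaries/",           # directory containing *identities.txt files
#       "baseline_results.tsv")    # tab-separated evaluation rows, one per dictionary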