Ejemplo n.º 1
0
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    """Build a PPMI matrix from stored counts and write it to disk.

    The counts are loaded as an un-normalized "Explicit" representation,
    marginal row/column distributions are derived from them, and the
    result of ``make_ppmi_mat`` is exported to ``out_path + ".bin"``.
    The word index of the count representation is pickled to
    ``out_path + "-index.pkl"``.

    Args:
        count_path: Location handed to ``create_representation``.
        out_path: Prefix for the two output files.
        smooth: Fraction of the total count mass added to every marginal sum.
        cds: When true, column sums are raised to the power 0.75 before
            being normalized.
        normalize: Forwarded to ``make_ppmi_mat`` (the counts themselves are
            always loaded with ``normalize=False``).
        neg: Forwarded to ``make_ppmi_mat``.
    """
    explicit = create_representation("Explicit", count_path, normalize=False)
    count_mat = explicit.m
    word_index = explicit.wi
    # Turn the relative smoothing factor into an absolute additive constant.
    smooth = count_mat.sum() * smooth

    # Row marginals.
    row_totals = count_mat.sum(1) + smooth
    row_probs = row_totals / row_totals.sum()

    # Column marginals, optionally flattened by the 0.75 exponent.
    col_totals = count_mat.sum(0) + smooth
    if cds:
        col_totals = np.power(col_totals, 0.75)
    col_probs = col_totals / col_totals.sum()

    ppmi_mat = make_ppmi_mat(
        count_mat, row_probs, col_probs, smooth,
        neg=neg, normalize=normalize)

    # The sparse exporter is a Cython module that is compiled on import.
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from socialsent3.representations import sparse_io

    bin_path = out_path + ".bin"
    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data,
                             bin_path)
    index_path = out_path + "-index.pkl"
    util.write_pickle(word_index, index_path)
Ejemplo n.º 2
0
def worker(proc_num, queue):
    """Consume years from ``queue`` and pickle bootstrapped polarities.

    For every year taken from the queue, the words returned by
    ``vocab.pos_words(year, "jj")`` that also occur in that year's SVD
    embedding are scored with ``polarity_induction_methods.bootstrap``
    (random-walk scoring) and the result is written to
    ``constants.POLARITIES + year + '-coha-adj-boot.pkl'``.
    The function returns once the queue is empty.

    Args:
        proc_num: Identifier printed with the progress messages.
        queue: Queue of years; polled without blocking.
    """
    while True:
        # Random pause of up to ten seconds before each poll
        # (presumably to stagger concurrent workers -- TODO confirm).
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return

        positive_seeds, negative_seeds = seeds.adj_seeds()
        year = str(year)
        print(proc_num, "On year", year)

        candidates = vocab.pos_words(year, "jj")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        # Only words that actually have a vector can be scored.
        candidates = candidates.intersection(set(embed.iw))

        scoring_vocab = candidates.union(positive_seeds).union(negative_seeds)
        sub_embed = embed.get_subembed(scoring_vocab)
        boot_settings = dict(
            score_method=polarity_induction_methods.random_walk,
            num_boots=50,
            n_procs=20,
            return_all=True,
            beta=0.9,
            nn=25)
        polarities = polarity_induction_methods.bootstrap(
            sub_embed, positive_seeds, negative_seeds, **boot_settings)

        target = constants.POLARITIES + year + '-coha-adj-boot.pkl'
        util.write_pickle(polarities, target)
Ejemplo n.º 3
0
def worker(proc_num, queue, iter):
    """Consume years from ``queue`` and score synthetic adjective data.

    For each year, the "ADJ" words from ``vocab.pos_words`` that occur in
    the year's SVD embedding are used to build a synthetic test embedding
    via ``make_synthetic_data``; polarities are then induced with the
    random-walk method and pickled to
    ``constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl'``.
    The function returns once the queue is empty.

    Args:
        proc_num: Identifier printed with the progress messages.
        queue: Queue of years; polled without blocking.
        iter: Passed as ``seed_offset`` to ``make_synthetic_data`` and used
            in the output file name.
    """
    while True:
        # Random pause of up to ten seconds before each poll.
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return

        # Re-seed NumPy's global generator for this task.
        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)

        adjectives = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(adjectives))
        # Keep only adjectives that have a vector in this year's embedding.
        adjectives = adjectives.intersection(set(embed.iw))
        print(year, len(adjectives))

        weight = _make_weight(float(year))
        print(year, weight)

        # The same sub-embedding is supplied for both embedding arguments.
        sub_embed = embed.get_subembed(adjectives)
        synthetic = make_synthetic_data(sub_embed, sub_embed, adjectives,
                                        weight, seed_offset=iter)

        polarities = evaluate_methods.run_method(
            positive_seeds,
            negative_seeds,
            synthetic,
            method=polarity_induction_methods.random_walk,
            beta=0.9,
            nn=25,
            **evaluate_methods.DEFAULT_ARGUMENTS)

        target = (constants.POLARITIES + year + '-synth-adj-coha-'
                  + str(iter) + '.pkl')
        util.write_pickle(polarities, target)
Ejemplo n.º 4
0
def main(args):
    """Cluster sentences by polarity-based representations and report metrics.

    Reads tab-separated ``label<TAB>sentence`` files from ``args.TRAIN`` and
    ``args.TEST``, induces word polarities with ``random_walk`` over the
    embeddings at ``args.EMBED``, stores them as CSV, converts every
    sentence with ``sent2rep`` and fits a two-cluster k-means model on the
    training representations before evaluating both splits.

    Args:
        args: Object exposing the ``TRAIN``, ``TEST`` and ``EMBED``
            attributes.
    """

    def _read_split(path):
        # Column 0 holds the label ('Positive' -> 1, anything else -> 0),
        # column 1 the sentence text.
        sentences, labels = [], []
        with open(path, 'rt') as handle:
            for line in handle:
                fields = line.split('\t')
                labels.append(int(fields[0] == 'Positive'))
                sentences.append(fields[1].strip())
        return sentences, labels

    def _encode(sentences, word_polarities):
        # Convert sentences one by one, showing a progress counter that is
        # rewritten in place.
        total = len(sentences)
        encoded = []
        for position, sentence in enumerate(sentences, start=1):
            print('\t%d/%d' % (position, total), end='\r')
            encoded.append(sent2rep(sentence, word_polarities))
        print()
        return encoded

    print('Loading data...')
    train_sents, train_labels = _read_split(args.TRAIN)
    test_sents, test_labels = _read_split(args.TEST)

    pos_seeds, neg_seeds = seeds.review_seeds()
    print('Creating word vectors...')
    embeddings = create_representation("FULL", args.EMBED, 100, limit=30000)
    print('Calculating polarities...')
    polarities = random_walk(
        embeddings, pos_seeds, neg_seeds,
        beta=0.99, nn=10, sym=True, arccos=True)

    print('Storing polarities...')
    dict2csv(polarities, path='./data/polarities/default.csv')

    print('Creating training sentence representations...')
    train_reps = _encode(train_sents, polarities)

    print('Creating testing sentence representations...')
    test_reps = _encode(test_sents, polarities)

    km = KMeans(n_clusters=2, verbose=1, max_iter=10000)
    train_preds = km.fit_predict(train_reps)
    test_preds = km.predict(test_reps)

    for heading, gold, predicted in (
            ('\nMetrics on train set:', train_labels, train_preds),
            ('\nMetrics on test set:', test_labels, test_preds)):
        print(heading)
        evaluate(gold, predicted)
Ejemplo n.º 5
0
def evaluate_methods():
    """Run label propagation on the "inquirer" lexicon and report results.

    Loads the "inquirer" and "kuperman" lexicons, restricts the evaluation
    vocabulary to non-seed words available in the GIGA embeddings, induces
    polarities with ``label_propagate_probabilistic`` and evaluates them
    (using "kuperman" as ``tau_lexicon``). The polarities are pickled to
    ``tmp/gi-cc-walk-pols.pkl``.
    """
    print("Getting evalution words..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    candidates = set(lexicon.keys())

    # WordNet-based scores: words absent from that lexicon are given an
    # explicit zero score.
    qwn = lexicons.load_lexicon("qwn-scores")
    for entry in lexicon:
        if entry not in qwn:
            qwn[entry] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    requested_vocab = candidates.union(positive_seeds).union(negative_seeds)
    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS, requested_vocab)
    candidates = candidates.intersection(set(common_embed.iw))

    # Seed words are excluded from the evaluation set.
    eval_words = [
        entry for entry in candidates
        if not (entry in positive_seeds or entry in negative_seeds)
    ]
    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("SentProp:")
    propagation_vocab = set(eval_words).union(negative_seeds).union(
        positive_seeds)
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(propagation_vocab),
        method=polarity_induction_methods.label_propagate_probabilistic,
        beta=0.99,
        nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
Ejemplo n.º 6
0
from socialsent3 import seeds
from socialsent3 import lexicons
from socialsent3.polarity_induction_methods import random_walk
from socialsent3.evaluate_methods import binary_metrics
from socialsent3.representations.representation_factory import create_representation

if __name__ == "__main__":
    # Script entry point: binary evaluation of random-walk polarities on the
    # "inquirer" lexicon (neutral entries removed), using the vectors in
    # imdb.en.vec. To try the GloVe vectors instead, point the path below at
    # socialsent3/data/example_embeddings/glove.6B.100d.txt.
    for banner in (
            "Evaluting SentProp with 300 dimensional fastText embeddings",
            "Evaluting only binary classification performance on General Inquirer lexicon"):
        print(banner)

    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()

    wanted_vocab = set(lexicon.keys()).union(pos_seeds).union(neg_seeds)
    embeddings = create_representation(
        "GIGA", "socialsent3/data/example_embeddings/imdb.en.vec",
        wanted_vocab)

    # Every embedded word that is not itself a seed is evaluated.
    eval_words = [
        token for token in embeddings.iw
        if not (token in pos_seeds or token in neg_seeds)
    ]

    # Random walk with 10 neighbours and beta=0.99.
    polarities = random_walk(
        embeddings, pos_seeds, neg_seeds,
        beta=0.99, nn=10, sym=True, arccos=True)

    acc, auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    for template, value in (
            ("Accuracy with best threshold: {:0.2f}", acc),
            ("ROC AUC: {:0.2f}", auc),
            ("Average precision score: {:0.2f}", avg_per)):
        print(template.format(value))
Ejemplo n.º 7
0
def hyperparam_eval():
    """Grid-search hyperparameters for random-walk and densify scoring.

    The evaluation vocabulary is the set of "bingliu" lexicon words that are
    present in both the common (GIGA) and the 1990 SVD embeddings, minus the
    seed words. For every hyperparameter combination the method is run and
    evaluated first on the common embeddings ("Common") and then on the
    historical ones ("Hist").
    """
    print("Getting evaluation words and embeddings")
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    candidates = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        candidates.union(positive_seeds).union(negative_seeds))
    candidates = candidates.intersection(set(common_embed.iw))

    hist_embed = create_representation(
        "SVD", constants.SVD_EMBEDDINGS + "1990")
    candidates = candidates.intersection(set(hist_embed.iw))

    # Seed words are never part of the evaluation set.
    eval_words = [
        token for token in candidates
        if not (token in positive_seeds or token in negative_seeds)
    ]

    # Each configuration is evaluated on these two embeddings, in this order.
    labelled_embeddings = (("Common", common_embed), ("Hist", hist_embed))

    print("SentProp...")
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            for label, source in labelled_embeddings:
                print(label)
                sub_embed = source.get_subembed(
                    set(eval_words).union(negative_seeds).union(
                        positive_seeds))
                polarities = run_method(
                    positive_seeds,
                    negative_seeds,
                    sub_embed,
                    method=polarity_induction_methods.random_walk,
                    nn=nn,
                    beta=beta,
                    **DEFAULT_ARGUMENTS)
                evaluate(polarities, lexicon, eval_words)

    print("Densify...")
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
            print("LR : ", lr, "Reg: ", reg)
            for label, source in labelled_embeddings:
                print(label)
                sub_embed = source.get_subembed(
                    set(eval_words).union(negative_seeds).union(
                        positive_seeds))
                polarities = run_method(
                    positive_seeds,
                    negative_seeds,
                    sub_embed,
                    method=polarity_induction_methods.densify,
                    lr=lr,
                    regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
                evaluate(polarities, lexicon, eval_words, tern=False)
Ejemplo n.º 8
0
def evaluate_twitter_methods():
    """Evaluate polarity induction on the "twitter" lexicon.

    The "twitter" lexicon is padded with a random sample of zero-valued
    words from the "inquirer" lexicon. The "140-scores" lexicon is evaluated
    directly, followed by two bootstrapped runs on the Twitter embeddings:
    one scored with ``densify`` (also pickled to ``twitter-test.pkl``) and
    one scored with ``random_walk``.
    """
    np.random.seed(0)

    print("Getting evaluation words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # Pad the lexicon with zero-valued words sampled from "inquirer". The
    # sample size is (zero-valued / non-zero-valued count in "inquirer")
    # times the current lexicon size.
    neutral_pool = [entry for entry in gi if gi[entry] == 0]
    neutral_ratio = float(len(neutral_pool)) / (len(gi) - len(neutral_pool))
    sampled_neutrals = np.random.choice(
        neutral_pool, int(neutral_ratio * len(lexicon)))
    for entry in sampled_neutrals:
        lexicon[entry] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    # Number of seed words that have an embedding.
    all_seeds = set(positive_seeds).union(negative_seeds)
    print(len(all_seeds.intersection(embed.iw)))

    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [
        entry for entry in lexicon
        if entry in s140_words
        and not (entry in positive_seeds or entry in negative_seeds)
        and entry in embed_words
    ]

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    print("Sentiment 140")
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print()

    # Bootstrapped run scored with densify.
    print("SentProp")
    densify_polarities = run_method(
        positive_seeds,
        negative_seeds,
        embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        lr=0.01,
        regularization_strength=0.5,
        **DEFAULT_ARGUMENTS)
    util.write_pickle(densify_polarities, "twitter-test.pkl")
    evaluate(densify_polarities, lexicon, eval_words, tau_lexicon=scores)

    # Bootstrapped run scored with the random walk.
    print("SentProp")
    walk_polarities = run_method(
        positive_seeds,
        negative_seeds,
        embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9,
        nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(walk_polarities, lexicon, eval_words, tau_lexicon=scores)
Ejemplo n.º 9
0
def evaluate_finance_methods():
    """Evaluate polarity induction methods on the "finance" lexicon.

    The "finance" lexicon is padded with a random sample of zero-valued
    words from the "inquirer" lexicon. Evaluation words are lexicon entries
    present in both the stock SVD embeddings and the common (GIGA)
    embeddings, excluding seed words. Four bootstrapped runs are evaluated:
    graph propagation and PMI on the stock counts, then the default
    bootstrap scoring and ``densify`` on the stock embeddings.
    """
    np.random.seed(0)
    print("Getting evalution words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("finance", remove_neutral=True)

    # Pad with zero-valued words from "inquirer"; the sample size is
    # (zero-valued / non-zero-valued count in "inquirer") * len(lexicon).
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.finance_seeds()
    stock_embed = create_representation("SVD", constants.STOCK_EMBEDDINGS)
    stock_counts = create_representation("Explicit", constants.STOCK_COUNTS)
    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))

    stock_words = set(stock_embed.iw)
    # Build the vocabulary from ``.iw`` like every other embedding in this
    # file, rather than relying on the embedding object being iterable.
    common_words = set(common_embed.iw)
    eval_words = [
        word for word in lexicon
        if word in stock_words and word in common_words
        and word not in positive_seeds and word not in negative_seeds
    ]

    stock_counts = stock_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))

    # NOTE(review): the printed label mentions "1990s Fic embeddings" but
    # this run uses the stock counts -- confirm whether the label is stale.
    print("Velikovich with 1990s Fic embeddings")
    # NOTE(review): the counts are normalized in place here, so the PMI run
    # below also sees normalized counts -- confirm this is intended.
    stock_counts.normalize()
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        stock_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=None)
    print()

    print("PMI")
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            stock_counts,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.pmi,
                            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
    print()

    # No score_method is given here, so bootstrap uses its own default.
    print("SentProp with stock embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        stock_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        beta=0.9,
        nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)

    print("Densifier with stock embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        stock_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
Ejemplo n.º 10
0
def evaluate_adj_methods():
    """Evaluate polarity induction methods on adjectives.

    Evaluation words are "inquirer" lexicon entries that are "ADJ" words
    for 1990 (per ``vocab.pos_words``), present in both the common (GIGA)
    and the historical SVD embeddings, and not seed words. Results of
    several methods are evaluated against the lexicon with "kuperman" as
    ``tau_lexicon``; the "qwn-scores" lexicon is evaluated directly as well.

    NOTE(review): the printed labels refer to "1990s" embeddings and the
    counts are loaded for "1990", but the SVD embeddings are loaded from
    ``constants.COHA_EMBEDDINGS + "2000"`` -- confirm which is intended.
    """
    print("Getting evalution words and embeddings..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())
    adjs = vocab.pos_words("1990", "ADJ")

    # Load the WordNet-based lexicon and pad it with zeros for missing
    # words (since these are implicitly zero for this method).
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.adj_seeds()

    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD",
                                       constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit",
                                        constants.COUNTS + "1990",
                                        normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    embed_words = [
        word for word in adjs if word in hist_words and word in common_words
    ]
    # Set view of embed_words so the membership test below is O(1) per word
    # instead of a linear scan of the list.
    embed_word_set = set(embed_words)
    eval_words = [
        word for word in eval_words if word in embed_word_set
        and word not in positive_seeds and word not in negative_seeds
    ]

    hist_counts = hist_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)

    print("Evaluating with ", len(eval_words), "out of", len(lexicon))
    print("Embeddings with ", len(embed_words))

    # PMI runs on the counts before they are normalized further below.
    print("PMI")
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            hist_counts,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.pmi,
                            boot_size=6,
                            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    # Evaluate the padded WordNet-based scores directly.
    print()
    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print("Dist with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_embed.get_subembed(
            embed_word_set.union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.dist,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Densifier with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_embed.get_subembed(
            embed_word_set.union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    # No score_method is given here, so bootstrap uses its own default.
    print("SentProp with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_embed.get_subembed(
            embed_word_set.union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        nn=25,
        beta=0.9,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Velikovich with 1990s Fic embeddings")
    # Normalizes the count representation in place before graph propagation.
    hist_counts.normalize()
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        hist_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("SentProp with CC")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            embed_word_set.union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.99,
        nn=10,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print("Densifier with CC")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            embed_word_set.union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
Ejemplo n.º 11
0
def main(args):
    """Cluster sentences over a polarity-filtered vocabulary and report metrics.

    Reads tab-separated ``label<TAB>sentence`` files, induces word
    polarities with ``random_walk``, filters them with
    ``filter_polarities`` and stores them as CSV. Sentences are then
    represented either by TF-IDF over the filtered vocabulary or by
    ``sent2rep``, optionally reduced with LSA, and clustered with a
    two-cluster k-means model fitted on the training split.

    All feature transformers (TF-IDF, LSA) are fitted on the training split
    only and merely applied to the test split, so that both splits live in
    the same feature space as the fitted k-means model.

    Args:
        args: Object exposing ``TRAIN`` and ``TEST`` (input file paths),
            ``EMBED`` (passed to ``create_representation``), ``CUTOFF``
            (passed to ``filter_polarities``), ``ALGO`` (``'tf-idf'``
            selects the TF-IDF branch, anything else ``sent2rep``) and
            ``LSA`` (number of SVD components; ``0`` disables LSA).
    """
    print('Loading data...')
    # Column 0 holds the label ('Positive' -> 1, anything else -> 0),
    # column 1 the sentence text.
    train_sents, train_labels = [], []
    with open(args.TRAIN, 'rt') as f:
        for line in f:
            cols = line.split('\t')
            train_labels.append(1 if cols[0] == 'Positive' else 0)
            train_sents.append(cols[1].strip())

    test_sents, test_labels = [], []
    with open(args.TEST, 'rt') as f:
        for line in f:
            cols = line.split('\t')
            test_labels.append(1 if cols[0] == 'Positive' else 0)
            test_sents.append(cols[1].strip())

    pos_seeds, neg_seeds = seeds.review_seeds()
    print('Creating word vectors...')
    embeddings = create_representation("FULL", args.EMBED, 100,
                                       limit=50000)
    print('Calculating polarities...')
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
                             sym=True, arccos=True)

    print('Filtering polarities...')
    polarities = filter_polarities(polarities, args.CUTOFF)

    print('Storing polarities...')
    dict2csv(polarities, path='./data/polarities/filtered.csv')

    word_list = list(polarities.keys())
    train_reps, test_reps = [], []

    if args.ALGO == 'tf-idf':
        tfidf = TfidfVectorizer(vocabulary=word_list, tokenizer=nltk.word_tokenize)

        print('Creating training sentence representations...')
        train_reps = tfidf.fit_transform(train_sents)

        print('Creating testing sentence representations...')
        # Apply the IDF weights learned on the training split; refitting on
        # the test split would weight the two splits differently.
        test_reps = tfidf.transform(test_sents)
    else:
        print('Creating training sentence representations...')
        for i, sent in enumerate(train_sents):
            print('\t%d/%d' % (i + 1, len(train_sents)), end='\r')
            rep = sent2rep(sent, word_list=word_list)
            train_reps.append(rep)
        print()

        print('Creating testing sentence representations...')
        for i, sent in enumerate(test_sents):
            print('\t%d/%d' % (i + 1, len(test_sents)), end='\r')
            rep = sent2rep(sent, word_list=word_list)
            test_reps.append(rep)
        print()

    if args.LSA != 0:
        print('Transforming w/ LSA...')
        svd = TruncatedSVD(args.LSA)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)

        train_reps = lsa.fit_transform(train_reps)
        # Project the test split with the components fitted on the training
        # split; a second fit would yield an unrelated latent space and make
        # the k-means predictions below meaningless.
        test_reps = lsa.transform(test_reps)

    km = KMeans(n_clusters=2, verbose=1, max_iter=10000)
    train_preds = km.fit_predict(train_reps)
    test_preds = km.predict(test_reps)

    print('\nMetrics on train set:')
    evaluate(train_labels, train_preds)
    print('\nMetrics on test set:')
    evaluate(test_labels, test_preds)