Example 1
from scipy.stats import shapiro, mannwhitneyu  # SciPy tests used below


def kuhn_munkres_stats():
    # read_clean_dataset, read_pickle_file and _feature_file_map are project helpers
    d = read_clean_dataset()
    km = read_pickle_file(_feature_file_map['kuhn_munkres'])
    # Attach the Kuhn-Munkres score and split it by headline stance
    d['kuhn_munkres'] = km['Kuhn-Munkres']

    f = d[d.articleHeadlineStance == 'for']['kuhn_munkres']
    a = d[d.articleHeadlineStance == 'against']['kuhn_munkres']
    o = d[d.articleHeadlineStance == 'observing']['kuhn_munkres']

    # Test each group for normality (Shapiro-Wilk)
    _, pf = shapiro(f)
    _, po = shapiro(o)
    _, pa = shapiro(a)

    # None of the groups is normally distributed
    print(f"""Test for normality (K-M):
                    1) For: {pf}
                    2) Observing : {po}
                    3) Against : {pa}""")

    # Pairwise Mann-Whitney U tests (non-parametric)
    _, p_fa = mannwhitneyu(f, a)
    _, p_fo = mannwhitneyu(f, o)
    _, p_oa = mannwhitneyu(o, a)

    print(f"""P-values (K-M):
                1) For - Against: {p_fa}
                2) Observing - Against: {p_oa}
                3) For - Observing: {p_fo}""")
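
For background (not part of the project's code): the Kuhn-Munkres score read above comes from the Hungarian algorithm for optimal assignment, which SciPy exposes as linear_sum_assignment. A minimal toy sketch:

import numpy as np
from scipy.optimize import linear_sum_assignment

# Toy cost matrix: linear_sum_assignment implements the Kuhn-Munkres
# (Hungarian) algorithm; how the project derives its per-pair score
# from such an alignment is not shown in this excerpt.
cost = np.array([[4, 1, 3],
                 [2, 0, 5],
                 [3, 2, 2]])
rows, cols = linear_sum_assignment(cost)
print(cost[rows, cols].sum())  # minimal total assignment cost: 1 + 2 + 2 = 5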
Example 2
def root_dist_stats():
    d = read_clean_dataset()

    rd = read_pickle_file(_feature_file_map['root_dist'])

    # For each distance feature: split by headline stance and run the tests
    for feature in ['refute_dist', 'hedge_dist']:

        d[feature] = rd[feature]
        f = d[d.articleHeadlineStance == 'for'][feature]
        a = d[d.articleHeadlineStance == 'against'][feature]
        o = d[d.articleHeadlineStance == 'observing'][feature]

        # Test each group for normality (Shapiro-Wilk)
        _, pf_r = shapiro(f)
        _, po_r = shapiro(o)
        _, pa_r = shapiro(a)

        print(f"""Test for normality ({feature}):
                        1) For: {pf_r}
                        2) Observing : {po_r}
                        3) Against : {pa_r}""")

        # Pairwise Mann-Whitney U tests (non-parametric)
        _, p_fa = mannwhitneyu(f, a)
        _, p_fo = mannwhitneyu(f, o)
        _, p_oa = mannwhitneyu(o, a)

        print(f"""P-values ({feature}):
                    1) For - Against: {p_fa}
                    2) Observing - Against: {p_oa}
                    3) For - Observing: {p_fo}""")
Example 3
def word2vec_stats():
    d = read_clean_dataset()
    w2v = read_pickle_file(_feature_file_map['word2vec'])
    # Attach the average word2vec similarity and split it by headline stance
    d['w2v'] = w2v.avg_similarity

    f = d[d.articleHeadlineStance == 'for']['w2v']
    a = d[d.articleHeadlineStance == 'against']['w2v']
    o = d[d.articleHeadlineStance == 'observing']['w2v']

    # Test each group for normality (Shapiro-Wilk)
    _, pf = shapiro(f)
    _, po = shapiro(o)
    _, pa = shapiro(a)

    # None of the groups is normally distributed
    print(f"""Test for normality (W2V):
                1) For: {pf}
                2) Observing : {po}
                3) Against : {pa}""")

    # Pairwise Mann-Whitney U tests (non-parametric)
    _, p_fa = mannwhitneyu(f, a)
    _, p_fo = mannwhitneyu(f, o)
    _, p_oa = mannwhitneyu(o, a)

    print(f"""P-values (W2V):
            1) For - Against: {p_fa}
            2) Observing - Against: {p_oa}
            3) For - Observing: {p_fo}""")
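
Examples 1-3 repeat one statistical recipe: attach a feature column, split it by headline stance, check each group for normality with Shapiro-Wilk, and compare the groups pairwise with Mann-Whitney U. A hedged sketch of a helper that factors this pattern out (the function name and anything not visible in the excerpts are assumptions):

from scipy.stats import shapiro, mannwhitneyu


def stance_feature_stats(d, column, label):
    """Sketch of the pattern shared by Examples 1-3. `d` is the clean
    dataset with an `articleHeadlineStance` column and the feature
    already attached as `column`."""
    groups = {
        stance: d[d.articleHeadlineStance == stance][column]
        for stance in ('for', 'against', 'observing')
    }

    # Shapiro-Wilk: a small p-value rejects normality
    normality = {s: shapiro(g)[1] for s, g in groups.items()}
    print(f'Test for normality ({label}): {normality}')

    # Non-parametric pairwise comparisons
    for s1, s2 in [('for', 'against'), ('observing', 'against'),
                   ('for', 'observing')]:
        _, p = mannwhitneyu(groups[s1], groups[s2])
        print(f'P-value ({label}) {s1} - {s2}: {p}')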
Example 4
    # Note: the original signature used mutable defaults (features=[], settings={}),
    # a Python pitfall: default objects are shared across all calls.
    def __init__(self, index=0, features=None, classifier="", settings=None, test="", hyperparameters_grid=None):
        self.id = index
        self.features = features if features is not None else []
        self.test = test
        self.classifier = classifier
        self.trainingSettings = settings if settings is not None else {}
        self.model = None
        self.hyperparameters_grid = hyperparameters_grid
        self.labels = read_clean_dataset()['articleHeadlineStance']
        self.featureMatrix = self.constructFeaturesMatrix()
        # Stratified 80/20 train/test split with a fixed seed for reproducibility
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.featureMatrix, self.labels,
                                                                                test_size=0.2, random_state=0,
                                                                                stratify=self.labels)
        self.results = self.trainOnData()
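
For context: train_test_split is scikit-learn's, and stratify=self.labels keeps the stance proportions identical in the train and test splits. A self-contained toy illustration (data invented for the demo):

from collections import Counter

from sklearn.model_selection import train_test_split

X = list(range(20))
y = ['for'] * 12 + ['against'] * 8

X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y)

print(Counter(y_tr))  # Counter({'for': 9, 'against': 6})
print(Counter(y_te))  # Counter({'for': 3, 'against': 2})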
Example 5
        avg_similarities.append(avg_sim)
        prod_similarities.append(prod_sim)
        i += 1
        if i % 50 == 0:
            print(
                f'[{i}] Sim between {claim} ||||| {headline} --> ({avg_sim}/{prod_sim})'
            )
    # After computing all similarities, collect them in a new dataframe
    d = pd.DataFrame()
    d['avg_similarity'] = avg_similarities
    d['prod_similarity'] = prod_similarities
    return d


if __name__ == '__main__':
    # Assumes `import spacy`, `import pandas as pd` and PICKLED_FEATURES_PATH
    # are defined earlier in the module (not part of this excerpt).

    # Local directory holding the word vectors
    VECTOR_DIR = "../../../wse/vec"

    # Load the clean dataset
    df = read_clean_dataset()

    # Load the vectors (vectors are number 3 from https://fasttext.cc/docs/en/english-vectors.html)
    print('Loading vectors')
    nlp = spacy.load(VECTOR_DIR)

    print('Loaded vectors')
    similarity_df = claim_to_headline_sim(df, nlp)

    print('Saving features to', PICKLED_FEATURES_PATH)
    similarity_df.to_pickle(PICKLED_FEATURES_PATH + "word2vec.pkl")
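
The top of claim_to_headline_sim, where avg_sim and prod_sim are computed, is not part of the excerpt. A hypothetical reconstruction of the per-pair computation, assuming token-level spaCy similarities aggregated by mean and product (the aggregation rule is an assumption, not taken from the source):

import numpy as np

def pair_similarities(claim_doc, headline_doc):
    # Hypothetical: cosine similarity for every claim/headline token pair
    sims = [t1.similarity(t2)
            for t1 in claim_doc for t2 in headline_doc
            if t1.has_vector and t2.has_vector]
    if not sims:
        return 0.0, 0.0
    # Note: the product can underflow towards 0 for long texts
    return float(np.mean(sims)), float(np.prod(sims))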
Example 6
        for word in words:
            if graph.has_node(word):
                # Keep the smallest root-to-word path length seen so far
                min_dist = min(
                    nx.shortest_path_length(graph, source=root, target=word),
                    min_dist)
    return min_dist


# The dependency graph is a tree with words as nodes: if word A depends on
# word B in the sentence, there is an edge from B to A.
def create_dependency_graph(sentence):
    '''Creates the dependency graph for a sentence parsed with StanfordNLP'''
    edges = []
    root = ''
    for token in sentence.dependencies:
        dep = token[0].text.lower()  # governor word
        if dep != 'root':
            edges.append((dep, token[2].text))  # edge: governor -> dependent
        else:
            root = token[2].text  # the dependent of 'root' is the sentence root
    # An undirected graph suffices here: only path lengths to the root are needed
    return nx.Graph(edges), root
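
A hedged usage sketch for create_dependency_graph, assuming the stanfordnlp pipeline named in the docstring (the example sentence is invented):

import networkx as nx
import stanfordnlp

# Assumes the English models were fetched once via stanfordnlp.download('en')
nlp = stanfordnlp.Pipeline()
doc = nlp("Robert Plant reunites with Led Zeppelin.")

graph, root = create_dependency_graph(doc.sentences[0])
print(root)  # the sentence's syntactic root, e.g. 'reunites'
print(nx.shortest_path_length(graph, source=root, target='Zeppelin'))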


dataset = read_clean_dataset()  # Read the dataset
dataset = apply_lower_case(dataset)
dataset = apply_strip(dataset)

dataset = extract_root_dist(dataset)  # adds 'refute_dist' and 'hedge_dist'

root_dists = dataset[['refute_dist', 'hedge_dist']]
root_dists.to_pickle(PICKLED_FEATURES_PATH + "root_dist.pkl")
Example 7
import pandas as pd
from scipy import stats
from scipy.stats import ttest_ind_from_stats


def q_counts():
    # Precomputed per-stance statistics for each Q feature: (mean, std, n_samples)
    f = {
        'q_ends': (0.00885, 0.09388, 1238),
        'q_contains': (0.022617, 0.14874, 1238)
    }
    o = {
        'q_ends': (0.090437, 0.286955, 962),
        'q_contains': (0.133056, 0.339812, 962)
    }
    a = {
        'q_ends': (0.025316, 0.157284, 395),
        'q_contains': (0.075949, 0.265253, 395)
    }

    d = read_clean_dataset()
    q = read_pickle_file(_feature_file_map['Q'])
    q['Stance'] = d.articleHeadlineStance

    # Run pooled two-sample t-tests from the summary statistics
    for feature in ['q_ends', 'q_contains']:
        mean_f, std_f, n_f = f[feature]
        mean_a, std_a, n_a = a[feature]
        mean_o, std_o, n_o = o[feature]
        # Run the actual test
        _, p_fo = ttest_ind_from_stats(mean1=mean_f,
                                       std1=std_f,
                                       nobs1=n_f,
                                       mean2=mean_o,
                                       std2=std_o,
                                       nobs2=n_o)
        _, p_fa = ttest_ind_from_stats(mean1=mean_f,
                                       std1=std_f,
                                       nobs1=n_f,
                                       mean2=mean_a,
                                       std2=std_a,
                                       nobs2=n_a)
        _, p_ao = ttest_ind_from_stats(mean1=mean_a,
                                       std1=std_a,
                                       nobs1=n_a,
                                       mean2=mean_o,
                                       std2=std_o,
                                       nobs2=n_o)

        print(f"""P-values ({feature})
                    1) For - Against: {p_fa}
                    2) Observing - Against: {p_ao}
                    3) For - Observing: {p_fo}""")

        # Chi-square test for dependency between feature and stance
        contingency_table = pd.crosstab(q['Stance'], q[feature], margins=False)

        chi2_stat, p_val, dof, ex = stats.chi2_contingency(contingency_table)

        print("\n")
        print(f"""=== Chi2 Stat ({feature}) ===""")
        print(chi2_stat)
        print("\n")
        print("===Degrees of Freedom===")
        print(dof)
        print("\n")
        print("===P-Value===")
        print(p_val)
        print("\n")
        print("===Contingency Table===")
        print(ex)
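
One hedged caveat on the t-tests above: ttest_ind_from_stats defaults to a pooled-variance test (equal_var=True). With groups as unbalanced as 1238 vs. 395 samples and visibly different standard deviations, Welch's variant may be the safer choice, e.g. for the 'q_ends' For-vs-Observing comparison:

from scipy.stats import ttest_ind_from_stats

# Welch's t-test (equal_var=False) on the 'q_ends' summary statistics above
_, p = ttest_ind_from_stats(mean1=0.00885, std1=0.09388, nobs1=1238,
                            mean2=0.090437, std2=0.286955, nobs2=962,
                            equal_var=False)
print(p)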