# Beispiel #1
# 0
# Accumulators for per-year results; each starts empty and is appended to
# later in the analysis.
allParamSizesForYearsPossible = []
allRsForYearsUsed = []
allRsForYearsPossible = []

 
############################################################
if __name__ == "__main__":    

    # Best-effort cleanup: remove the stale compiled GSSUtility.pyc and
    # reload GU so edits to the .py source take effect.  get_ipython() and
    # reload() only exist under IPython / Python 2, so any failure here is
    # deliberately swallowed.
    try:
        get_ipython().magic(u'rm ../GSSUtility.pyc # remove this file because otherwise it will be used instead of the updated .py file')
        reload(GU)
    except:
        pass


    # Load the GSS data container from the shared data directory.
    pathToData = '../../Data/'
    dataCont = GU.dataContainer(pathToData)
    
    # Select articles with known GSS years used; GSSYearsPossible, central
    # IVs and linear-model constraints are all switched off here.
    articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=False,                                         centralIVs=False, nextYearBound=0, linearModels=False)            
    print 'len of articleClasses:', len(articlesToUse)
#     raw_input('...')
    
    
    # define the storage containers for outputs: two comparison groups (the
    # labels suggest models evaluated on the last GSS year used vs. the
    # first "future" GSS year) and the outcome measures recorded for each.
    group1 = 'on last GSS year'
    group2 = 'on first "future" GSS year'   
    groups = [group1, group2]
    outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues',  'numTotal',                 'propSig_CentralVars', 'paramSizesNormed_CentralVars', 'pvalues_CentralVars']

    output = defaultdict(dict)
    output['metadata'] = {'article_id':[]}
    # NOTE(review): this fragment appears truncated here — the loop body is
    # missing from this chunk of the file.
    for group in groups:
             per article.

inputs:

outputs:

@author: Misha

"""

import GSSUtility as GU  # this also imports a whole bunch of other modules

if __name__ == "__main__":

    # Load the GSS data container from the shared data directory.
    pathToData = "../../Data/"
    dataCont = GU.dataContainer(pathToData)

    # Keep only articles with known GSS years used and flagged central IVs.
    articlesToUse = GU.filterArticles(
        dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=False, centralIVs=True
    )
    print "len of articleClasses:", len(articlesToUse)
    # Pause so the article count can be inspected before the run continues.
    raw_input("...")

    # define the storage containers for outputs: one dict of outcome lists
    # per comparison group ("onDataUsed" vs. "onFutureYear").
    group1 = "onDataUsed"
    group2 = "onFutureYear"
    output = defaultdict(dict)
    groups = [group1, group2]
    # NOTE(review): this fragment is truncated here — the outcomes list is
    # cut off mid-literal in this chunk of the file.
    outcomes = [
        "propSig",
        "paramSizesNormed",
"""
import GSSUtility as GU

# *********************************************************
# Per-year result accumulators, initialised empty; the analysis code that
# follows appends to them.
allPropsForYearsUsed, allPropsForYearsPossible = [], []
allParamSizesForYearsUsed, allParamSizesForYearsPossible = [], []
allRsForYearsUsed = []
allRsForYearsPossible = []


############################################################
if __name__ == "__main__":

    # Load the GSS data container from the shared data directory.
    pathToData = "../../Data/"
    dataCont = GU.dataContainer(pathToData)

    # Articles must have known GSS years used AND possible, central IVs and
    # linear models.  nextYearBound=3 — presumably bounds how far ahead the
    # next GSS year may lie; confirm against GU.filterArticles.
    articlesToUse = GU.filterArticles(
        dataCont.articleClasses,
        GSSYearsUsed=True,
        GSSYearsPossible=True,
        centralIVs=True,
        nextYearBound=3,
        linearModels=True,
    )
    print "len of articleClasses:", len(articlesToUse)
    # Pause so the article count can be inspected before the run continues.
    raw_input("...")

    # define the storage containers for outputs: comparison groups labelled
    # "onDataUsed" vs. "onNextYear".
    # NOTE(review): this fragment appears truncated after group2.
    group1 = "onDataUsed"
    group2 = "onNextYear"
import random
from scipy.stats import pearsonr, ttest_ind, ttest_rel
import time
from collections import Counter
from collections import defaultdict
#from GSSUtility import *
import GSSUtility as GU


    
    
############################################################
if __name__ == "__main__":

    # Load the GSS data container from the shared data directory.
    pathToData = '../../Data/'
    dataCont = GU.dataContainer(pathToData)

    # Text file used to log cognate substitutions as they are made.
    tempCognateOutput = open(pathToData + 'tempCognateOutput.txt', 'w')

    # (variable, cognate) pairs, collected so the most common substitutions
    # can be inspected afterwards.
    variableCognateTuples = []

    # Storage for outputs: one empty list per (group, outcome) pair.
    groups = ['group1', 'group2']
    outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues', 'numTotal']
    output = defaultdict(dict)
    for grp in groups:
        output[grp] = {measure: [] for measure in outcomes}
            
# In[ ]:

#*********************************************************
# Result accumulators, one per metric/grouping; all start empty.
allPropsForYearsUsed, allPropsForYearsPossible = [], []
allParamSizesForYearsUsed, allParamSizesForYearsPossible = [], []
allRsForYearsUsed, allRsForYearsPossible = [], []

 
############################################################
if __name__ == "__main__":    
    
    # Load the GSS data container from the shared data directory.
    pathToData = '../../Data/'
    dataCont = GU.dataContainer(pathToData)
    
    # Articles with known GSS years used and a known publication year.
    articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=False, centralIVs=False, yearPublished=True)            
    print 'len of articleClasses:', len(articlesToUse)
#     raw_input('...')

    # One row per GSS year (1972-2012), one column per outcome measure.
    # After casting to object dtype, every cell is overwritten with its own
    # independent empty list so per-year results can be appended.
    YEARS = range(1972, 2013)
    outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues',                 'propSig_CentralVars', 'paramSizesNormed_CentralVars', 'pvalues_CentralVars']
    output = pd.DataFrame(np.empty((len(YEARS), len(outcomes))), columns=outcomes, index=YEARS)    
    output = output.astype(object)
    for row in output.iterrows():
        for col in range(len(row[1])):
            row[1][col] = []
    
    #for article in random.sample(articlesToUse, 150):
    # NOTE(review): this fragment is truncated here — the loop body is
    # missing from this chunk of the file.
    for article in articlesToUse:
sys.path.append('../')    
import GSSUtility as GU
import statsmodels.formula.api as smf
from pandas.rpy import common as com


# In[2]:

get_ipython().magic(u'rm ../GSSUtility.pyc # remove this file because otherwise it will be used instead of the updated .py file')
reload(GU)


# In[3]:

pathToData = '../../Data/'
dataCont = GU.dataContainer(pathToData)


# In[3]:

def independent_columns(A, tol = 1e-05):
    """
    Return an array composed of independent columns of A.

    Note the answer may not be unique; this function returns one of many
    possible answers.

    http://stackoverflow.com/q/13312498/190597 (user1812712)
    http://math.stackexchange.com/a/199132/1140 (Gerry Myerson)
    http://mail.scipy.org/pipermail/numpy-discussion/2008-November/038705.html
        (Anne Archibald)
# Beispiel #7
# 0
df[df.year_published.notnull()].groupby('year_published').count().head()


# In[148]:

grouped = df[df.year_published.notnull()].groupby('year_published')
grouped.get_group(2004).head()


# # Number of variables over time

# In[52]:

# ---------------------------------------------------------------------------
# Build a per-article table of variable counts (DVs, IVs, controls) together
# with the publication year, indexed by article id.
# ---------------------------------------------------------------------------
pathToData = '../../Data/'
dataCont = GU.dataContainer(pathToData)

# BUG FIX: the result used to be bound to 'articlesClasses' (typo) while the
# loop below iterates 'articleClasses' — a NameError at runtime unless that
# name happened to be defined elsewhere.  The names now match.
articleClasses = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True)

df = pd.DataFrame(columns=['aid', 'yearpublished', 'dvs', 'ivs', 'controls', 'total'])
for a in articleClasses:
    # One object-dtype row per article; 'total' is a placeholder (0).
    df.loc[a.articleID, :] = np.array([a.articleID, a.yearPublished, a.DVs, a.IVs, a.controls, 0], dtype=object)

# Drop rows without a publication year, then normalise dtypes and re-index
# the frame by article id.
df = df[df.yearpublished.notnull()]
df.yearpublished = df.yearpublished.astype(int)
df.aid = df.aid.astype(int)
df.index = df.aid

# Replace each variable list with its length, i.e. counts per article.
df.dvs = [len(v) for k, v in df.dvs.iteritems()]
df.ivs = [len(v) for k, v in df.ivs.iteritems()]
df.controls = [len(v) for k, v in df.controls.iteritems()]
filename: minianalysis__did_not_use_all_available_data.py

description: Compare models run on data used and on earlier data not used
inputs:

outputs:

@author: Misha

"""

if __name__ == "__main__":    
    
    # Load the GSS data container from the shared data directory.
    pathToData = '../../Data/'
    dataCont = GU.dataContainer(pathToData)
    
    # group1 = models run on original data   
    # group2 = models run on unused data (but available at time of publication)
    groups = ['group1', 'group2']    
    outcomes = ['%_of_coeffs_signif.', 'avg_coeff_size', 'Rs', 'adj_Rs', 'avg_p-value']

    # One float column per (outcome, group) pair.
    output = pd.DataFrame(columns=pd.MultiIndex.from_product([outcomes, groups]), dtype=float)
#     output = output.astype(object)
#     output.columns
    # Articles with known years used AND possible that left some available
    # GSS years unused (unusedGSSYears=True).
    articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=True, \
                                      unusedGSSYears=True, centralIVs=False)            
#     for article in random.sample(articlesToUse, 10):
    # NOTE(review): this fragment is truncated here — the loop body is
    # missing from this chunk of the file.
    for article in articlesToUse:
    #for article in [a for a in articleClasses if a.articleID == 6755]:
    
"""
import sys
sys.path.append('../')
import GSSUtility as GU
import random
from collections import defaultdict
import numpy as np
import cPickle as cp

 
############################################################
if __name__ == "__main__":    
    
    # Load the GSS data container and the pickled variables-by-year lookup.
    pathToData = '../../Data/'
    dataCont = GU.dataContainer(pathToData)
    VARS_BY_YEAR = cp.load(open(pathToData + 'VARS_BY_YEAR.pickle'))
    
    # Articles with known years used AND possible and a known publication
    # year.  nextYearBound=3 — presumably bounds how far ahead the next GSS
    # year may lie; confirm against GU.filterArticles.
    articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=True, centralIVs=False, nextYearBound=3, yearPublished=True)            
    print 'len of articleClasses:', len(articlesToUse)
    # Pause so the article count can be inspected before the run continues.
    raw_input('...')
    
    # define the storage containers for outputs: for each of the two groups
    # (labelled 'randomVariables1'/'randomVariables2') and each GSS year
    # 1972-2013, a defaultdict(list) to collect outcome values.
    group1 = 'randomVariables1'
    group2 = 'randomVariables2'    
    output = defaultdict(dict)
    groups = [group1, group2]
    outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues']   
    for year in range(1972,2014):
        for group in groups:
            output[group][year] = defaultdict(list)