allParamSizesForYearsPossible = []
allRsForYearsUsed, allRsForYearsPossible = [], []

############################################################
if __name__ == "__main__":

    try:
        # remove the stale .pyc, otherwise it will be used instead of the updated .py file
        get_ipython().magic(u'rm ../GSSUtility.pyc')
        reload(GU)
    except:
        pass

    pathToData = '../../Data/'
    dataCont = GU.dataContainer(pathToData)
    articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=False,
                                      centralIVs=False, nextYearBound=0, linearModels=False)
    print 'len of articleClasses:', len(articlesToUse)
#    raw_input('...')

    # define the storage containers for outputs
    group1 = 'on last GSS year'
    group2 = 'on first "future" GSS year'
    groups = [group1, group2]
    outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues', 'numTotal',
                'propSig_CentralVars', 'paramSizesNormed_CentralVars', 'pvalues_CentralVars']
    output = defaultdict(dict)
    output['metadata'] = {'article_id': []}
    for group in groups:
        for outcome in outcomes:
            output[group][outcome] = []
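    # Hypothetical downstream comparison (not part of the original script):
    # once the per-article lists above are filled, the two groups of models
    # could be compared with a paired test -- a minimal sketch, assuming the
    # lists stay article-aligned and equal in length.
    from scipy.stats import ttest_rel
    if output[group1]['Rs'] and output[group2]['Rs']:
        t, p = ttest_rel(output[group1]['Rs'], output[group2]['Rs'])
        print 'paired t-test on R^2: t = %.3f, p = %.4f' % (t, p)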
per article.

inputs:
outputs:

@author: Misha
"""
import GSSUtility as GU  # this also imports a whole bunch of other modules
from collections import defaultdict

if __name__ == "__main__":

    pathToData = "../../Data/"
    dataCont = GU.dataContainer(pathToData)
    articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True,
                                      GSSYearsPossible=False, centralIVs=True)
    print "len of articleClasses:", len(articlesToUse)
    raw_input("...")

    # define the storage containers for outputs
    group1 = "onDataUsed"
    group2 = "onFutureYear"
    output = defaultdict(dict)
    groups = [group1, group2]
    outcomes = ["propSig", "paramSizesNormed",
""" import GSSUtility as GU # ********************************************************* allPropsForYearsUsed = [] allPropsForYearsPossible = [] allParamSizesForYearsUsed = [] allParamSizesForYearsPossible = [] allRsForYearsUsed, allRsForYearsPossible = [], [] ############################################################ if __name__ == "__main__": pathToData = "../../Data/" dataCont = GU.dataContainer(pathToData) articlesToUse = GU.filterArticles( dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=True, centralIVs=True, nextYearBound=3, linearModels=True, ) print "len of articleClasses:", len(articlesToUse) raw_input("...") # define the storage containers for outputs group1 = "onDataUsed" group2 = "onNextYear"
import random
from scipy.stats import pearsonr, ttest_ind, ttest_rel
import time
from collections import Counter
from collections import defaultdict
#from GSSUtility import *
import GSSUtility as GU

############################################################
if __name__ == "__main__":

    pathToData = '../../Data/'
    dataCont = GU.dataContainer(pathToData)

    # file for storing (variable, cognate) tuples, to see which substitutions
    # are made most often
    tempCognateOutput = open(pathToData + 'tempCognateOutput.txt', 'w')
    variableCognateTuples = []

    # define the storage containers for outputs
    output = defaultdict(dict)
    groups = ['group1', 'group2']
    outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues', 'numTotal']
    for group in groups:
        for outcome in outcomes:
            output[group][outcome] = []
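    # Hypothetical end-of-run summary (not part of the original fragment):
    # tally the (variable, cognate) substitutions collected above and write
    # the most common ones to the temp file -- a minimal sketch only.
    for (variable, cognate), count in Counter(variableCognateTuples).most_common(50):
        tempCognateOutput.write('%s\t%s\t%d\n' % (variable, cognate, count))
    tempCognateOutput.close()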
# In[ ]:

#*********************************************************
allPropsForYearsUsed = []
allPropsForYearsPossible = []
allParamSizesForYearsUsed = []
allParamSizesForYearsPossible = []
allRsForYearsUsed, allRsForYearsPossible = [], []

############################################################
if __name__ == "__main__":

    pathToData = '../../Data/'
    dataCont = GU.dataContainer(pathToData)
    articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=False,
                                      centralIVs=False, yearPublished=True)
    print 'len of articleClasses:', len(articlesToUse)
#    raw_input('...')

    YEARS = range(1972, 2013)
    outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues',
                'propSig_CentralVars', 'paramSizesNormed_CentralVars', 'pvalues_CentralVars']

    # one DataFrame cell per (year, outcome), each holding a list of results
    output = pd.DataFrame(np.empty((len(YEARS), len(outcomes))), columns=outcomes, index=YEARS)
    output = output.astype(object)
    for row in output.iterrows():
        for col in range(len(row[1])):
            row[1][col] = []

    #for article in random.sample(articlesToUse, 150):
    for article in articlesToUse:
sys.path.append('../')
import GSSUtility as GU
import statsmodels.formula.api as smf
from pandas.rpy import common as com


# In[2]:

# remove the stale .pyc, otherwise it will be used instead of the updated .py file
get_ipython().magic(u'rm ../GSSUtility.pyc')
reload(GU)


# In[3]:

pathToData = '../../Data/'
dataCont = GU.dataContainer(pathToData)


# In[3]:

def independent_columns(A, tol=1e-05):
    """
    Return an array composed of independent columns of A.

    Note the answer may not be unique; this function returns one of many
    possible answers.

    http://stackoverflow.com/q/13312498/190597 (user1812712)
    http://math.stackexchange.com/a/199132/1140 (Gerry Myerson)
    http://mail.scipy.org/pipermail/numpy-discussion/2008-November/038705.html (Anne Archibald)
    """
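    # The body is truncated in this fragment. A minimal sketch of one common
    # implementation, following the QR-based approach discussed at the links
    # above (assumes numpy is imported as np); this may not be the exact
    # version used in this notebook.
    Q, R = np.linalg.qr(A)
    independent = np.where(np.abs(R.diagonal()) > tol)[0]
    return A[:, independent]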
df[df.year_published.notnull()].groupby('year_published').count().head()


# In[148]:

grouped = df[df.year_published.notnull()].groupby('year_published')
grouped.get_group(2004).head()


# # Number of variables over time

# In[52]:

pathToData = '../../Data/'
dataCont = GU.dataContainer(pathToData)
articleClasses = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True)

df = pd.DataFrame(columns=['aid', 'yearpublished', 'dvs', 'ivs', 'controls', 'total'])
for a in articleClasses:
    df.loc[a.articleID, :] = np.array([a.articleID, a.yearPublished, a.DVs, a.IVs, a.controls, 0],
                                      dtype=object)

df = df[df.yearpublished.notnull()]
df.yearpublished = df.yearpublished.astype(int)
df.aid = df.aid.astype(int)
df.index = df.aid

# replace each variable list with its length (count of variables per article)
df.dvs = [len(v) for k, v in df.dvs.iteritems()]
df.ivs = [len(v) for k, v in df.ivs.iteritems()]
df.controls = [len(v) for k, v in df.controls.iteritems()]
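# In[ ]:

# Hypothetical follow-up (not in the original fragment): fill in the `total`
# column and average the counts by publication year -- one rough view of the
# number of variables used over time.
df.total = df.dvs + df.ivs + df.controls
df.groupby('yearpublished')[['dvs', 'ivs', 'controls', 'total']].mean().head()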
filename: minianalysis__did_not_use_all_available_data.py
description: Compare models run on data used and on earlier data not used
inputs:
outputs:

@author: Misha
"""

if __name__ == "__main__":

    pathToData = '../../Data/'
    dataCont = GU.dataContainer(pathToData)

    # group1 = models run on original data
    # group2 = models run on unused data (but available at time of publication)
    groups = ['group1', 'group2']
    outcomes = ['%_of_coeffs_signif.', 'avg_coeff_size', 'Rs', 'adj_Rs', 'avg_p-value']
    output = pd.DataFrame(columns=pd.MultiIndex.from_product([outcomes, groups]), dtype=float)
#    output = output.astype(object)
#    output.columns

    articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=True,
                                      unusedGSSYears=True, centralIVs=False)

#    for article in random.sample(articlesToUse, 10):
    for article in articlesToUse:
    #for article in [a for a in articleClasses if a.articleID == 6755]:
""" import sys sys.path.append('../') import GSSUtility as GU import random from collections import defaultdict import numpy as np import cPickle as cp ############################################################ if __name__ == "__main__": pathToData = '../../Data/' dataCont = GU.dataContainer(pathToData) VARS_BY_YEAR = cp.load(open(pathToData + 'VARS_BY_YEAR.pickle')) articlesToUse = GU.filterArticles(dataCont.articleClasses, GSSYearsUsed=True, GSSYearsPossible=True, centralIVs=False, nextYearBound=3, yearPublished=True) print 'len of articleClasses:', len(articlesToUse) raw_input('...') # define the storage containers for outputs group1 = 'randomVariables1' group2 = 'randomVariables2' output = defaultdict(dict) groups = [group1, group2] outcomes = ['propSig', 'paramSizesNormed', 'Rs', 'adjRs', 'pvalues'] for year in range(1972,2014): for group in groups: output[group][year] = defaultdict(list)