print '\n===================\nProcessing article:', article.articleID RHS = article.IVs + article.controls dfoutput = pd.DataFrame(index=article.DVs, columns=outcomes) for DV in article.DVs: # print DV, '~', RHS maxYearUsed = max(article.GSSYearsUsed) futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed] nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound) # log.write('id'+str(article.articleID)+' year '+str(maxYearUsed)) resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, RHS, custom_data=custom_data); # models run on max year of data used if not resOnDataUsed: continue # # log.write('id'+str(article.articleID)+' year '+str(nextYear)) # resOnNextYear = GU.runModel(dataCont, nextYear, DV, RHS); # models run on min year of future data # if not resOnNextYear: continue # # Checks on which results to record # if len(resOnDataUsed.params) != len(resOnNextYear.params): # print 'The number of variables in original model is different from the number in model on future years. Skipping.' # continue # the condition below means that i don't care about models in which orig var isn't stat. sig. # if results.pvalues[-1] > 0.05: continue # results = [resOnDataUsed, resOnNextYear]
td = defaultdict(dict) for group in groups: td[group]['numTotal'] = 0.0 # td[group]['coeffsSig'] = [] td[group]['numSig'] = 0.0 # proportions of significant coeffs # td[group]['paramSizes'] = [] td[group]['paramSizesNormed'] = [] td[group]['Rs'] = [] td[group]['adjRs'] = [] td[group]['pvalues'] = [] """ LHS = article.IVs + article.controls for DV in article.DVs: maxYearUsed = max(article.GSSYearsUsed) resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, LHS) # models run on max year of data used if not resOnDataUsed: continue # Now do future years futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed] for futureYear in futureYearsPossible: resOnFutureYear = GU.runModel(dataCont, futureYear, DV, LHS) # models run on min year of future data if not resOnFutureYear: continue # Checks on which results to record if len(resOnDataUsed.params) != len(resOnFutureYear.params): print "The number of variables in original model is different from the number in cognate model. Skipping." continue
# for article in articlesToUse: # for article in [a for a in articlesToUse if a.articleID == 6755]: print "Processing article:", article.articleID RHS = article.IVs + article.controls for DV in article.DVs: print DV, "~", RHS maxYearUsed = max(article.GSSYearsUsed) futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed] nextYear = min( futureYearsPossible ) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound) resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, RHS) # models run on max year of data used if not resOnDataUsed: continue resOnNextYear = GU.runModel(dataCont, nextYear, DV, RHS) # models run on min year of future data if not resOnNextYear: continue # Checks on which results to record if len(resOnDataUsed.params) != len(resOnNextYear.params): print "The number of variables in original model is different from the number in model on future years. Skipping." continue # the condition below means that i don't care about models in which orig var isn't stat. sig. # if results.pvalues[-1] > 0.05: continue results = [resOnDataUsed, resOnNextYear]
# Now let's estimate the models for DV in article.DVs: for year in GSSYearsWithCognate: # group 2 models (with cognates) group = 'group2' print 'Running cognate models' cognateLHS = originalLHS[:] cognateLHS.remove(cIV) cognateLHS.append(cognate) # need to put it in list otherwise it treats each letter as an element print 'Substituting', cIV, 'with cognate', cognate #time.sleep(2) #raw_input('Press Enter') resultsCognate = GU.runModel(dataCont, year, DV, cognateLHS) if not resultsCognate: continue # results will be None if the formula cant be estimated print DV, '~', cognateLHS, 'on year', year # RUN MODELS FROM GROUP 1 ############################################ # group 1 group = 'group1' print 'Running original models.' # make sure cIV is last in the list of variables originalLHS.remove(cIV) originalLHS.append(cIV) results = GU.runModel(dataCont, year, DV, originalLHS) if not results: continue # results will be None if the formula cant be estimated
if cognate == DV: continue # sometimes the cognate suggested by GU.identifyCognates is the DV for year in GSSYearsWithCognate: # group 2 models (with cognates) print 'Running cognate models' cognateLHS = originalLHS[:] # the "[:]" makes a deep copy ? cognateLHS.remove(cIV) cognateLHS.append(cognate) # need to put it in list otherwise it treats each letter as an element # print 'Substituting', cIV, 'with cognate', cognate #time.sleep(2) #raw_input('Press Enter') result_cog = GU.runModel(dataCont, year, DV, cognateLHS) if not result_cog: continue # results will be None if the formula cant be estimated # print DV, '~', cognateLHS, 'on year', year # RUN MODELS FROM GROUP 1 ############################################ # group 1 print 'Running original models.' # # make sure cIV is last in the list of variables originalLHS.remove(cIV) originalLHS.append(cIV) result_orig = GU.runModel(dataCont, year, DV, originalLHS) if not result_orig: continue # results will be None if the formula cant be estimated results = [result_orig, result_cog]
#for article in random.sample(articlesToUse, 150): for article in articlesToUse: #for article in [a for a in articlesToUse if a.articleID == 6755]: print 'Processing article:', article.articleID RHS = article.IVs + article.controls for DV in article.DVs: # print DV, '~', RHS for yearUsed in article.GSSYearsUsed: res = GU.runModel(dataCont, yearUsed, DV, RHS) # models run on max year of data used if not res: continue # the lines below no longer work because i'm using both continuous and dummies!! centralVars = [] for civ in article.centralIVs: if 'standardize(%s, ddof=1)' % (civ) in res.params.index: centralVars.append('standardize(%s, ddof=1)' % (civ)) else: for col in res.params.index: if 'C(' + civ + ')' in col: centralVars.append(col) # print 'IVs:', article.IVs # print 'centralVas:', centralVars # raw_input('...')
# In[40]: print 'Running article:', article.articleID for DV in article.DVs: print DV, '~', RHS # RHS.remove('AGEWED') # futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed] # nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound) # log.write('id'+str(article.articleID)+' year '+str(maxYearUsed)) resOnDataUsed = GU.runModel(dataCont, max(article.GSSYearsUsed), DV, RHS, custom_data=custom_data, standardized=False) # models run on max year of data used if not resOnDataUsed: continue # # log.write('id'+str(article.articleID)+' year '+str(nextYear)) # resOnNextYear = GU.runModel(dataCont, nextYear, DV, RHS); # models run on min year of future data # if not resOnNextYear: continue # # Checks on which results to record # if len(resOnDataUsed.params) != len(resOnNextYear.params): # print 'The number of variables in original model is different from the number in model on future years. Skipping.' # continue # the condition below means that i don't care about models in which orig var isn't stat. sig. # if results.pvalues[-1] > 0.05: continue # results = [resOnDataUsed, resOnNextYear]
# In[20]:

# Disabled experiment: multiple imputation through R's `mi` package.
# rcode='''
# library(mi)
# mydf = %s
# IMP = mi(mydf, n.imp=2, n.iter=6, max.minutes=1)
# imp1 <- mi.data.frame(IMP, m = 1)
# ''' % com.convert_to_r_dataframe(design).r_repr()
# r(rcode)
# com.convert_robj(r['imp1'])


# In[45]:

# Single model on the 1973 data, using only the article's IVs (no controls).
res = GU.runModel(dataCont, 1973, DV, article.IVs)


# In[46]:

# Bare expression: in a notebook cell this displays the summary.
res.summary()


# In[43]:

# For each year the article used, pull the DV and IV columns for that year
# and build the model formula from them.
for year in article.GSSYearsUsed:
    wantedColumns = [DV] + article.IVs
    design = dataCont.df.loc[year, wantedColumns]
    # design = design.fillna(design.mean())
    formula = GU.createFormula(dataCont, design)
    # results = smf.ols(formula2, data=design.dropna()).fit()
td[group]['Rs'] = [] td[group]['adj_Rs'] = [] td[group]['avg_p-value'] = [] td[group]['%_of_coeffs_signif.'] = [] RHS = article.IVs + article.controls for DV in article.DVs: # RUN MODELS FROM GROUP 1 ############################################ # group 1: models on original data group = 'group1' for year in article.GSSYearsUsed: print 'Run models on original data' res_orig = GU.runModel(dataCont, year, DV=DV, IVs=RHS) if not res_orig: continue # results will be None if the formula cant be estimated print DV, '~', ' + '.join(RHS), 'on year', year # save the (temporary) results td[group]['Rs'].append(res_orig.rsquared) td[group]['adj_Rs'].append(res_orig.rsquared_adj) td[group]['numSig'] += float(len([p for p in res_orig.pvalues[1:] if p < 0.05])) # start at 1 because don't want to count the constant td[group]['avg_coeff_size'].append(np.mean(np.abs(res_orig.params[1:]))) # get the absolute value of the standardized coefficients and take the mean td[group]['avg_p-value'].append(np.mean(res_orig.pvalues[1:])) td[group]['numTotal'] += len(res_orig.params[1:]) td[group]['%_of_coeffs_signif.'].append( float(len([p for p in res_orig.pvalues[1:] if p < 0.05])) / len(res_orig.params[1:]) ) # RUN MODELS FROM GROUP 2 ############################################ # group2: models run on unused (early) data
# In[92]: print 'Running article:', article.articleID for DV in article.DVs: print DV, '~', RHS # RHS.remove('AGEWED') # futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed] # nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound) # log.write('id'+str(article.articleID)+' year '+str(maxYearUsed)) resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, RHS, custom_data=custom_data, standardize=False) # models run on max year of data used if not resOnDataUsed: continue # # log.write('id'+str(article.articleID)+' year '+str(nextYear)) # resOnNextYear = GU.runModel(dataCont, nextYear, DV, RHS); # models run on min year of future data # if not resOnNextYear: continue # # Checks on which results to record # if len(resOnDataUsed.params) != len(resOnNextYear.params): # print 'The number of variables in original model is different from the number in model on future years. Skipping.' # continue # the condition below means that i don't care about models in which orig var isn't stat. sig. # if results.pvalues[-1] > 0.05: continue # results = [resOnDataUsed, resOnNextYear]
#for article in random.sample(articlesToUse, 150): for article in articlesToUse: #for article in [a for a in articlesToUse if a.articleID == 6755]: print 'Processing article:', article.articleID RHS = article.IVs + article.controls maxYearUsed = max(article.GSSYearsUsed) futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed] nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound) for DV in article.DVs: print DV, '~', RHS res = GU.runModel(dataCont, nextYear, DV, RHS) # models run on max year of data used if not res: continue # the lines below no longer work because i'm using both continuous and dummies!! centralVars = [] for civ in article.centralIVs: if 'standardize(%s, ddof=1)' % (civ) in res.params.index: centralVars.append('standardize(%s, ddof=1)' % (civ)) else: for col in res.params.index: if 'C(' + civ + ')' in col: centralVars.append(col) print 'IVs:', article.IVs print 'centralVas:', centralVars # raw_input('...')
# for article in articlesToUse: #for article in [a for a in articlesToUse if a.articleID == 6755]: print 'Processing article:', article.articleID maxYearUsed = max(article.GSSYearsUsed) RHS_random1 = random.sample(set(VARS_BY_YEAR[maxYearUsed])-set(article.DVs), len(article.IVs+ article.controls)) RHS_random2 = random.sample(set(VARS_BY_YEAR[maxYearUsed])-set(article.DVs), len(article.IVs+ article.controls)) for DV in article.DVs: print DV, '~', RHS_random1 # futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed] # nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound) resRandom1 = GU.runModel(dataCont, maxYearUsed, DV, RHS_random1) # models run on max year of data used if not resRandom1: continue resRandom2 = GU.runModel(dataCont, maxYearUsed, DV, RHS_random2) # models run on min year of future data if not resRandom2: continue # Checks on which results to record if len(resRandom1.params) != len(resRandom2.params): print 'The number of variables in original model is different from the number in model on future years. Skipping.' continue # the condition below means that i don't care about models in which orig var isn't stat. sig. # if results.pvalues[-1] > 0.05: continue results = [resRandom1, resRandom2] # the lines below no longer work because i'm using both continuous and dummies!!
for article in articlesToUse: #for article in [a for a in articlesToUse if a.articleID == 6755]: print 'Processing article:', article.articleID maxYearUsed = max(article.GSSYearsUsed) RHS = article.IVs + article.controls RHS_random = random.sample(VARS_BY_YEAR[maxYearUsed], len(RHS)) for DV in article.DVs: print DV, '~', RHS # futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed] # nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound) resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, RHS) # models run on max year of data used if not resOnDataUsed: continue resRandom = GU.runModel(dataCont, maxYearUsed, DV, RHS_random) # models run on min year of future data if not resOnNextYear: continue # Checks on which results to record if len(resOnDataUsed.params) != len(resRandom.params): print 'The number of variables in original model is different from the number in model on future years. Skipping.' continue # the condition below means that i don't care about models in which orig var isn't stat. sig. # if results.pvalues[-1] > 0.05: continue results = [resOnDataUsed, resRandom] # the lines below no longer work because i'm using both continuous and dummies!! centralVars = []