Beispiel #1
0
    
    # Per-article excerpt: for every dependent variable, fit the model on the
    # latest GSS year the article used, then reset the per-group accumulators.
    print '\n===================\nProcessing article:', article.articleID

    # Regressors for every model: the article's IVs followed by its controls.
    RHS = article.IVs + article.controls

    # One row per DV and one column per entry of `outcomes`; it is not filled in
    # anywhere within this excerpt.
    dfoutput = pd.DataFrame(index=article.DVs, columns=outcomes)

    for DV in article.DVs:
#             print DV, '~', RHS
        # NOTE: none of the next three values depends on DV, so each iteration
        # recomputes the same numbers.
        maxYearUsed = max(article.GSSYearsUsed)
        futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
        nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound); min() raises ValueError on an empty list. nextYear is only referenced by the commented-out code below.

#             log.write('id'+str(article.articleID)+' year '+str(maxYearUsed))

        resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, RHS, custom_data=custom_data); # models run on max year of data used
        # A falsy result skips this DV entirely (presumably None when the model
        # cannot be estimated -- confirm against GU.runModel).
        if not resOnDataUsed: continue

# #             log.write('id'+str(article.articleID)+' year '+str(nextYear))           
#         resOnNextYear = GU.runModel(dataCont, nextYear, DV, RHS); # models run on min year of future data
#         if not resOnNextYear: continue

#         # Checks on which results to record                
#         if len(resOnDataUsed.params) != len(resOnNextYear.params):
#             print 'The number of variables in original model is different from the number in model on future years. Skipping.'                    
#             continue

        # the condition below means that i don't care about models in which orig var isn't stat. sig.
#            if results.pvalues[-1] > 0.05: continue
#         results = [resOnDataUsed, resOnNextYear]
        # Fresh accumulator for this DV: td[group] maps a metric name to a
        # running total (float) or to a list of per-model values.
        td = defaultdict(dict)
        for group in groups:             
            td[group]['numTotal'] = 0.0
    #        td[group]['coeffsSig'] = []
            td[group]['numSig'] = 0.0   # running count, started at 0.0 -- NOTE(review): the original comment called this "proportions of significant coeffs"; confirm which is meant
    #        td[group]['paramSizes'] = []
            td[group]['paramSizesNormed'] = []
            td[group]['Rs'] = []
            td[group]['adjRs'] = []
            td[group]['pvalues'] = []
        """
        LHS = article.IVs + article.controls

        for DV in article.DVs:
            maxYearUsed = max(article.GSSYearsUsed)
            resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, LHS)  # models run on max year of data used
            if not resOnDataUsed:
                continue

            # Now do future years
            futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
            for futureYear in futureYearsPossible:
                resOnFutureYear = GU.runModel(dataCont, futureYear, DV, LHS)  # models run on min year of future data
                if not resOnFutureYear:
                    continue

                # Checks on which results to record
                if len(resOnDataUsed.params) != len(resOnFutureYear.params):
                    print "The number of variables in original model is different from the number in cognate model. Skipping."
                    continue
        # for article in articlesToUse:
        # for article in [a for a in articlesToUse if a.articleID == 6755]:

        print "Processing article:", article.articleID

        RHS = article.IVs + article.controls

        for DV in article.DVs:
            print DV, "~", RHS
            maxYearUsed = max(article.GSSYearsUsed)
            futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
            nextYear = min(
                futureYearsPossible
            )  # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound)

            resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, RHS)  # models run on max year of data used
            if not resOnDataUsed:
                continue
            resOnNextYear = GU.runModel(dataCont, nextYear, DV, RHS)  # models run on min year of future data
            if not resOnNextYear:
                continue

            # Checks on which results to record
            if len(resOnDataUsed.params) != len(resOnNextYear.params):
                print "The number of variables in original model is different from the number in model on future years. Skipping."
                continue

            # the condition below means that i don't care about models in which orig var isn't stat. sig.
            #            if results.pvalues[-1] > 0.05: continue
            results = [resOnDataUsed, resOnNextYear]
        # Now let's estimate the models
        # For each DV and each year in GSSYearsWithCognate, fit a "group2" model in
        # which the central IV (cIV) is replaced by its cognate, then a "group1"
        # model on the original variable list with cIV moved to the end.
        for DV in article.DVs:            
            for year in GSSYearsWithCognate:        

                # group 2 models (with cognates)
                group = 'group2'
                print 'Running cognate models'
                
                # Slice copy, so the remove/append below leave originalLHS untouched.
                cognateLHS = originalLHS[:]
                cognateLHS.remove(cIV)  # list.remove raises ValueError if cIV is not in the list
                cognateLHS.append(cognate) # append adds the cognate as one element (extending the list with a bare string would add it character by character)
                print 'Substituting', cIV, 'with cognate', cognate
                #time.sleep(2)
                #raw_input('Press Enter')                
                
                resultsCognate = GU.runModel(dataCont, year, DV, cognateLHS)          
                # NOTE: this `continue` also skips the original (group 1) model
                # for the same year.
                if not resultsCognate: continue # results will be None if the formula cant be estimated
                print DV, '~', cognateLHS, 'on year', year
                 
                # RUN MODELS FROM GROUP 1 ############################################  
                # group 1
                group = 'group1'
                print 'Running original models.'
                
                # make sure cIV is last in the list of variables
                # (this mutates originalLHS in place; after the first pass the
                # remove/append pair leaves the order unchanged)
                originalLHS.remove(cIV)
                originalLHS.append(cIV) 
    
                results = GU.runModel(dataCont, year, DV, originalLHS)                     
                if not results: continue # results will be None if the formula cant be estimated
                
            
            # Skip the whole comparison when the suggested cognate is the DV itself.
            if cognate == DV: continue # sometimes the cognate suggested by GU.identifyCognates is the DV
                
            # For every year, fit the cognate specification and the original
            # specification and keep the two fits together as a pair.
            for year in GSSYearsWithCognate:        

                # group 2 models (with cognates)
                print 'Running cognate models'
                
                cognateLHS = originalLHS[:] # "[:]" makes a SHALLOW copy: a new list holding the same element objects. That is enough here because only the list is modified (remove/append), never its elements.
                cognateLHS.remove(cIV)  # list.remove raises ValueError if cIV is not in the list
                cognateLHS.append(cognate) # append adds the cognate as one element (extending the list with a bare string would add it character by character)
#                 print 'Substituting', cIV, 'with cognate', cognate
                #time.sleep(2)
                #raw_input('Press Enter')                
                
                result_cog = GU.runModel(dataCont, year, DV, cognateLHS)          
                # NOTE: a failed cognate fit also skips the original fit for this year.
                if not result_cog: continue # results will be None if the formula cant be estimated
#                 print DV, '~', cognateLHS, 'on year', year
                 
                # RUN MODELS FROM GROUP 1 ############################################  
                # group 1
                print 'Running original models.'
                
#                 # make sure cIV is last in the list of variables
                # Moves cIV to the end of originalLHS in place; cognateLHS was
                # copied before this point, so on the first pass its order can
                # differ from originalLHS.
                originalLHS.remove(cIV)
                originalLHS.append(cIV) 
    
                result_orig = GU.runModel(dataCont, year, DV, originalLHS)                     
                if not result_orig: continue # results will be None if the formula cant be estimated

                # Pair order is [original, cognate]; rebound on every year that
                # reaches this line.
                results = [result_orig, result_cog]
    
    #for article in random.sample(articlesToUse, 150):
    # For every article and DV, fit the model on EACH year the article used and
    # collect the fitted-term names that correspond to the article's central IVs.
    for article in articlesToUse:
    #for article in [a for a in articlesToUse if a.articleID == 6755]:
    
        print 'Processing article:', article.articleID
        
        # Regressors: the article's IVs followed by its controls.
        RHS = article.IVs + article.controls
        
        for DV in article.DVs: 

#             print DV, '~', RHS
            
            for yearUsed in article.GSSYearsUsed:

                res = GU.runModel(dataCont, yearUsed, DV, RHS) # model run on this particular year used (every year in GSSYearsUsed, not only the max)
                if not res: continue
                         
                # NOTE(review): the original comment here read "the lines below no
                # longer work because i'm using both continuous and dummies!!",
                # yet the code below looks for both the 'standardize(...)' and
                # the 'C(...)' term forms -- confirm whether that warning is stale.
                centralVars = []            
                for civ in article.centralIVs:
                    # If the standardized term for civ is among res.params.index, record it.
                    if 'standardize(%s, ddof=1)' % (civ) in res.params.index:
                        centralVars.append('standardize(%s, ddof=1)' % (civ))
                    else: 
                        # Otherwise record every label containing 'C(<civ>)'
                        # (substring match); nothing is recorded if none matches.
                        for col in res.params.index:
                            if 'C(' + civ + ')' in col:
                                centralVars.append(col)
     
#                 print 'IVs:', article.IVs
#                 print 'centralVas:', centralVars
    #            raw_input('...')
# In[40]:

# Notebook cell: fit each DV of the current `article` on the latest year the
# article used. `article`, `RHS`, `dataCont` and `custom_data` are not assigned
# in this cell; they must already be bound by earlier cells.
print 'Running article:', article.articleID

for DV in article.DVs:
    print DV, '~', RHS
#     RHS.remove('AGEWED')

#         futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
#         nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound)

#             log.write('id'+str(article.articleID)+' year '+str(maxYearUsed))

    # NOTE(review): this call passes `standardized=False`, while a later cell in
    # this file passes `standardize=False` to the same function -- confirm which
    # keyword GU.runModel actually accepts.
    resOnDataUsed = GU.runModel(dataCont, max(article.GSSYearsUsed), DV, RHS, 
                                custom_data=custom_data,
                                standardized=False) # models run on max year of data used
    # Last live statement of the loop body: with everything below commented out
    # the result is not used further, and it is rebound for the next DV.
    if not resOnDataUsed: continue

# #             log.write('id'+str(article.articleID)+' year '+str(nextYear))           
#         resOnNextYear = GU.runModel(dataCont, nextYear, DV, RHS); # models run on min year of future data
#         if not resOnNextYear: continue

#         # Checks on which results to record                
#         if len(resOnDataUsed.params) != len(resOnNextYear.params):
#             print 'The number of variables in original model is different from the number in model on future years. Skipping.'                    
#             continue

    # the condition below means that i don't care about models in which orig var isn't stat. sig.
#            if results.pvalues[-1] > 0.05: continue
#         results = [resOnDataUsed, resOnNextYear]
# In[20]:

# rcode='''
#     library(mi)
#     mydf = %s
#     IMP = mi(mydf, n.imp=2, n.iter=6, max.minutes=1)
#     imp1 <- mi.data.frame(IMP, m = 1)
# ''' % com.convert_to_r_dataframe(design).r_repr()
# r(rcode)
# com.convert_robj(r['imp1'])


# In[45]:

# Ad-hoc check: fit one model on the hard-coded year 1973 using only the
# article's IVs (no controls). `DV` is not assigned in this cell, so it is
# whatever value an earlier cell left bound.
res=GU.runModel(dataCont, 1973, DV, article.IVs)


# In[46]:

# Display the fitted model's summary as the cell output.
res.summary()


# In[43]:

# For each year the article used, slice out the DV and IV columns and build a
# model formula from them.
for year in article.GSSYearsUsed:
    # Rows labelled `year`, columns [DV] + IVs (presumably df is indexed by survey year -- confirm).
    design = dataCont.df.loc[year, [DV] + article.IVs]
#     design = design.fillna(design.mean())
    # NOTE: `formula` is not used by any live code in this cell; the commented-out
    # fit below refers to `formula2`, which is not defined here.
    formula = GU.createFormula(dataCont, design)
#     results = smf.ols(formula2, data=design.dropna()).fit()
            td[group]['Rs'] = []
            td[group]['adj_Rs'] = []
            td[group]['avg_p-value'] = []
            td[group]['%_of_coeffs_signif.'] = []


        # Regressors: the article's IVs followed by its controls.
        RHS = article.IVs + article.controls

        for DV in article.DVs:            

            # RUN MODELS FROM GROUP 1 ############################################  
            # group 1: models on original data
            # One fit per year the article used; each successful fit adds its
            # summary statistics to td['group1'].
            group = 'group1'          
            for year in article.GSSYearsUsed:       
                print 'Run models on original data'
                res_orig = GU.runModel(dataCont, year, DV=DV, IVs=RHS)          
                if not res_orig: continue # results will be None if the formula cant be estimated                 
                print DV, '~', ' + '.join(RHS), 'on year', year
                
                # save the (temporary) results                   
                # NOTE(review): 'numSig', 'numTotal' and 'avg_coeff_size' are
                # updated below, but their initialisation is not visible in this
                # excerpt -- confirm they are created before this loop.
                td[group]['Rs'].append(res_orig.rsquared)
                td[group]['adj_Rs'].append(res_orig.rsquared_adj)
                td[group]['numSig'] += float(len([p for p in res_orig.pvalues[1:] if p < 0.05])) # start at 1 because don't want to count the constant
                td[group]['avg_coeff_size'].append(np.mean(np.abs(res_orig.params[1:]))) # get the absolute value of the standardized coefficients and take the mean 
                td[group]['avg_p-value'].append(np.mean(res_orig.pvalues[1:]))
                td[group]['numTotal'] += len(res_orig.params[1:])
                # Share of non-constant coefficients with p < 0.05 for this fit; the
                # division raises ZeroDivisionError if the model has no parameter
                # besides the first one.
                td[group]['%_of_coeffs_signif.'].append( 
                      float(len([p for p in res_orig.pvalues[1:] if p < 0.05])) / len(res_orig.params[1:]) )
            
            # RUN MODELS FROM GROUP 2 ############################################  
            # group2: models run on unused (early) data
Beispiel #10
0

# In[92]:

# Notebook cell: fit each DV of the current `article` on `maxYearUsed`.
# `article`, `RHS`, `maxYearUsed`, `dataCont` and `custom_data` are not assigned
# in this cell; they must already be bound by earlier cells.
print 'Running article:', article.articleID

for DV in article.DVs:
    print DV, '~', RHS
#     RHS.remove('AGEWED')

#         futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
#         nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound)

#             log.write('id'+str(article.articleID)+' year '+str(maxYearUsed))

    # NOTE(review): this call passes `standardize=False`, while an earlier cell in
    # this file passes `standardized=False` to the same function -- confirm which
    # keyword GU.runModel actually accepts.
    resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, RHS, custom_data=custom_data, standardize=False) # models run on max year of data used
    # Last live statement of the loop body: with everything below commented out
    # the result is not used further, and it is rebound for the next DV.
    if not resOnDataUsed: continue

# #             log.write('id'+str(article.articleID)+' year '+str(nextYear))           
#         resOnNextYear = GU.runModel(dataCont, nextYear, DV, RHS); # models run on min year of future data
#         if not resOnNextYear: continue

#         # Checks on which results to record                
#         if len(resOnDataUsed.params) != len(resOnNextYear.params):
#             print 'The number of variables in original model is different from the number in model on future years. Skipping.'                    
#             continue

    # the condition below means that i don't care about models in which orig var isn't stat. sig.
#            if results.pvalues[-1] > 0.05: continue
#         results = [resOnDataUsed, resOnNextYear]
    
    #for article in random.sample(articlesToUse, 150):
    # For every article, fit each DV on the first GSS year AFTER the latest year
    # the article used, and collect the fitted-term names that correspond to the
    # article's central IVs.
    for article in articlesToUse:
    #for article in [a for a in articlesToUse if a.articleID == 6755]:
    
        print 'Processing article:', article.articleID     
        RHS = article.IVs + article.controls

        maxYearUsed = max(article.GSSYearsUsed)
        futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
        nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound); min() raises ValueError on an empty list
          
        for DV in article.DVs:
            print DV, '~', RHS
                       
            res = GU.runModel(dataCont, nextYear, DV, RHS) # model run on nextYear, the earliest possible year after the max year used
            if not res: continue
                     
            # NOTE(review): the original comment here read "the lines below no
            # longer work because i'm using both continuous and dummies!!", yet
            # the code below looks for both the 'standardize(...)' and the
            # 'C(...)' term forms -- confirm whether that warning is stale.
            centralVars = []            
            for civ in article.centralIVs:
                # If the standardized term for civ is among res.params.index, record it.
                if 'standardize(%s, ddof=1)' % (civ) in res.params.index:
                    centralVars.append('standardize(%s, ddof=1)' % (civ))
                else: 
                    # Otherwise record every label containing 'C(<civ>)'
                    # (substring match); nothing is recorded if none matches.
                    for col in res.params.index:
                        if 'C(' + civ + ')' in col:
                            centralVars.append(col)
 
            print 'IVs:', article.IVs
            print 'centralVas:', centralVars
#            raw_input('...')
#    for article in articlesToUse:
    #for article in [a for a in articlesToUse if a.articleID == 6755]:
    
        # Fit each DV of the article on two independently drawn random sets of
        # regressors, both on the latest year the article used, and keep the
        # two fits as a pair.
        print 'Processing article:', article.articleID

        maxYearUsed = max(article.GSSYearsUsed)
        
        # Two independent draws, each with as many variables as the article's
        # IVs + controls, taken from that year's variables minus the DVs.
        # NOTE: random.sample on a set works in Python 2 but is rejected by
        # Python 3.11+, which requires a sequence.
        RHS_random1 = random.sample(set(VARS_BY_YEAR[maxYearUsed])-set(article.DVs), len(article.IVs+ article.controls))
        RHS_random2 = random.sample(set(VARS_BY_YEAR[maxYearUsed])-set(article.DVs), len(article.IVs+ article.controls))
        
        for DV in article.DVs:
            print DV, '~', RHS_random1  

#            futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
#            nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound)            
            resRandom1 = GU.runModel(dataCont, maxYearUsed, DV, RHS_random1) # first random specification, run on the max year of data used
            if not resRandom1: continue
            resRandom2 = GU.runModel(dataCont, maxYearUsed, DV, RHS_random2) # second random specification, run on the SAME year (not on a future year)
            if not resRandom2: continue
            
            # Checks on which results to record                
            # Keep the pair only when both fits have the same number of
            # parameters. NOTE: the printed message still talks about "original"
            # and "future years", although both models here are random draws.
            if len(resRandom1.params) != len(resRandom2.params):
                print 'The number of variables in original model is different from the number in model on future years. Skipping.'                    
                continue
            
            # the condition below means that i don't care about models in which orig var isn't stat. sig.
#            if results.pvalues[-1] > 0.05: continue
            results = [resRandom1, resRandom2]
            
            # the lines below no longer work because i'm using both continuous and dummies!!
            
    for article in articlesToUse:
    #for article in [a for a in articlesToUse if a.articleID == 6755]:
    
        print 'Processing article:', article.articleID

        maxYearUsed = max(article.GSSYearsUsed)
        
        RHS = article.IVs + article.controls
        RHS_random = random.sample(VARS_BY_YEAR[maxYearUsed], len(RHS))
        
        for DV in article.DVs:
            print DV, '~', RHS  

#            futureYearsPossible = [yr for yr in article.GSSYearsPossible if yr > maxYearUsed]
#            nextYear = min(futureYearsPossible) # the arguments of GU.filterArticles function ensure that there is a suitable future year (within bound)            
            resOnDataUsed = GU.runModel(dataCont, maxYearUsed, DV, RHS) # models run on max year of data used
            if not resOnDataUsed: continue
            resRandom = GU.runModel(dataCont, maxYearUsed, DV, RHS_random) # models run on min year of future data
            if not resOnNextYear: continue
            
            # Checks on which results to record                
            if len(resOnDataUsed.params) != len(resRandom.params):
                print 'The number of variables in original model is different from the number in model on future years. Skipping.'                    
                continue
            
            # the condition below means that i don't care about models in which orig var isn't stat. sig.
#            if results.pvalues[-1] > 0.05: continue
            results = [resOnDataUsed, resRandom]
            
            # the lines below no longer work because i'm using both continuous and dummies!!
            centralVars = []