Example 1
def Analyze_SubTopic_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    grid_search reveals little variation: a linear kernel with C=10 or C=100 seems best, but the effect is small.
    Number of cases doesn't matter for MNB because most categories have small samples (the largest is 6,000).
    '''
    print 'SUBTOPIC ANALYSIS'

    #num=1000
    #metadata=ImportMeta(-1)
    path = 'Twitter/Data/'
    PREDS = {}
    for cat in set([line[1] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        if cat == 'Student' or cat == 'indUnk':
            args = {'n_iter': 20, 'test_size': .9, 'random_state': 0}
        else:
            args = CVargs.copy()
        print 'RUNNING ', cat, ' SUBTOPIC SCORES'
        f = 'Twitter_' + cat + '_Topic_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        vec = np.array([[float(l) for l in line[1:]]
                        for line in data])  #feature matrix; column 0 of each line is the id
        labels = np.array([metadata[line[0]][0]
                           for line in data])  # if 'age' not in line])
        IDX = np.array([line[0] for line in data])
        vec, labels, IDX = balance(vec, labels, IDX, bidx)
        Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **args)
        print 'standardizing scores'
        preds = {}
        for k, score in Preds.iteritems():
            if np.inf in score:
                # drop the infs, then stand in for +inf with the largest
                # finite fold score before averaging
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                if not x:
                    continue  # every fold was inf; skip this id
                x.append(max(x))
                preds[k] = np.mean(x)
            elif -np.inf in score:
                # same idea for -inf, using the smallest finite score
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                if not x:
                    continue
                x.append(min(x))
                preds[k] = np.mean(x)
            else:
                preds[k] = np.mean(score)
        m = np.mean(preds.values())
        sd = np.std(preds.values())
        for k, score in preds.iteritems():
            preds[k] = (score - m) / sd
        PREDS.update(preds)
    Write_Scores(PREDS, ['id', 'subtopic_score'], outfile)
    return
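The standardize-scores block above (inf handling plus z-scoring) is repeated nearly verbatim in every Analyze_* function below. A minimal sketch of a shared helper that could replace it, assuming each value in Preds is a list of per-fold scores (the name standardize_preds is hypothetical):

import numpy as np

def standardize_preds(Preds):
    # Sketch of a shared helper mirroring the repeated block above.
    # Infs are dropped, then stood in for by the most extreme finite
    # fold score; the per-id means are then z-scored.
    preds = {}
    for k, score in Preds.items():
        score = np.asarray(score, dtype=float)
        finite = score[~np.isinf(score)]
        if np.isinf(score).any():
            if finite.size == 0:
                continue  # every fold was inf; skip this id
            cap = finite.max() if np.inf in score else finite.min()
            preds[k] = np.mean(np.append(finite, cap))
        else:
            preds[k] = score.mean()
    vals = list(preds.values())
    m, sd = np.mean(vals), np.std(vals)
    return dict((k, (v - m) / sd) for k, v in preds.items())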
Example 2
def Analyze_KBest_Scores(metadata, bidx, cl, outfile, **CVargs):
    filename = 'KBest'
    vec, ids, words = importArray(filename)

    labels = np.array([metadata[idx][0]
                       for idx in ids])  # if 'age' not in line])
    IDX = np.array(ids)
    #
    #filename='Twitter/Data/Twitter_KBest_Scores.csv'
    #data=ImportCSVFeatureData(filename,-1)
    #print 'drawing samples'
    #vec=np.array([[float(l) for l in line[1:]] for line in data])   #exclude cases where sex is unknown
    #labels=np.array([metadata[line[0]][0] for line in data])# if 'age' not in line])
    #IDX=np.array([line[0] for line in data])

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    for k, score in Preds.iteritems():
        if np.inf in score:
            # drop the infs, then stand in for +inf with the largest
            # finite fold score before averaging
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue  # every fold was inf; skip this id
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -np.inf in score:
            # same idea for -inf, using the smallest finite score
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())

    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Brown/Results/Raw_Preds.csv'
    Write_Scores(preds, ['id', 'kbest_score'], outfile)
    return
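balance() is defined elsewhere in this module. Given its call signature, a plausible sketch, assuming bidx seeds the sampler and classes are downsampled to the smallest class size (both assumptions):

import numpy as np

def balance(vec, labels, IDX, bidx):
    # Sketch only: the real balance() lives elsewhere. Downsamples each
    # class to the size of the smallest one, keeping all rows aligned.
    rng = np.random.RandomState(bidx)
    classes, counts = np.unique(labels, return_counts=True)
    n = counts.min()
    keep = np.concatenate([rng.choice(np.where(labels == c)[0], n,
                                      replace=False) for c in classes])
    return vec[keep], labels[keep], IDX[keep]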
Example 3
def Analyze_Nonword_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    Check rows 1535 and 15349 for inf data. Should no longer need to recode
    columns 8 and 12 (herndanV and LnM); that is now handled in Make_Twitter_Data.
    '''
    print 'NONWORD ANALYSIS'

    #metadata=ImportMeta(-1)
    filename = 'Twitter/Data/Twitter_Nonword_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]]
                    for line in data])  #feature matrix; column 0 is the id
    #vec[:,8]=vec[:,8]*-1    #herndanV is always neg (changed in Make_Twitter_Data now)
    #vec[:,12]=vec[:,12]*-1   #LnM is always neg (changed in Make_Twitter_Data now)
    labels = np.array([metadata[line[0]][0]
                       for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])
    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    for k, score in Preds.iteritems():
        if np.inf in score:
            # drop the infs, then stand in for +inf with the largest
            # finite fold score before averaging
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue  # every fold was inf; skip this id
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -np.inf in score:
            # same idea for -inf, using the smallest finite score
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())
    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Nonwords_Preds.csv'
    Write_Scores(preds, ['id', 'nonword_score'], outfile)

    return
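The docstring's note about inf rows can be checked directly before cross-validating; a quick sketch over the vec built above:

import numpy as np

# Report any rows (and their columns) still containing inf, e.g. the
# rows 1535 and 15349 flagged in the docstring.
bad_rows = np.unique(np.where(np.isinf(vec))[0])
for r in bad_rows:
    print('row %d has inf in columns %s' % (r, np.where(np.isinf(vec[r]))[0]))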
Example 4
def Analyze_Raw_Topic_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    grid_search shows C>=1 is ideal. Accuracy remains at 71% for sample
    sizes from 500 through 10,000.
    '''
    print 'RAW TOPIC ANALYSIS'

    #metadata=ImportMeta(-1)
    filename = 'Twitter/Data/Raw_Topic_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]]
                    for line in data])  #feature matrix; column 0 is the id
    labels = np.array([metadata[line[0]][0]
                       for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    for k, score in Preds.iteritems():
        if np.inf in score:
            # drop the infs, then stand in for +inf with the largest
            # finite fold score before averaging
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue  # every fold was inf; skip this id
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -np.inf in score:
            # same idea for -inf, using the smallest finite score
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())

    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Raw_Topic_Preds.csv'
    Write_Scores(preds, ['id', 'rawTopic_score'], outfile)

    return
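The grid search the docstring refers to was presumably along these lines; a sketch with scikit-learn's GridSearchCV (the parameter grid is an assumption, and older scikit-learn versions ship this class as sklearn.grid_search.GridSearchCV):

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hypothetical reconstruction of the grid search behind the docstring's
# 'C>=1 is ideal' finding.
param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10, 100, 1000]}
search = GridSearchCV(SVC(), param_grid, cv=3)
search.fit(vec, labels)
print(search.best_params_, search.best_score_)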
Example 5
def Analyze_Raw(metadata, bidx, cl, outfile, **CVargs):
    '''
    MNB maxes out at 69-70% accuracy at 3,000 texts (600 training) and does not improve after that.
    SVM: grid search showed the ideal is a linear kernel with C=1, 10, or 100; it maxes out at 74% accuracy for 3,000 texts (76% at 10,000).
    
    '''
    print 'running Raw analysis'

    #metadata=ImportMeta(-1)
    filename = 'Raw'
    vec, ids, words = importArray(filename)
    print 'drawing samples'
    #vec=data[0:,1:] #grab all but zeroth column
    #labels=data[0:,0]   #grab all of zeroth column
    labels = np.array([metadata[idx][0]
                       for idx in ids])  # if 'age' not in line])
    IDX = np.array(ids)

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    for k, score in Preds.iteritems():
        if np.inf in score:
            # drop the infs, then stand in for +inf with the largest
            # finite fold score before averaging
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue  # every fold was inf; skip this id
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -np.inf in score:
            # same idea for -inf, using the smallest finite score
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())

    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Raw_Preds.csv'
    Write_Scores(preds, ['id', 'raw_score'], outfile)
    return
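Classifiers.CrossValidate is external to this file. Given that CVargs carries n_iter, test_size, and random_state, a plausible sketch built on a shuffle split with a current scikit-learn (every name here is an assumption). Note that MNB's log-odds can be +/-inf, which would explain the inf handling above:

import numpy as np
from sklearn.model_selection import ShuffleSplit

def CrossValidate(vec, labels, IDX, cl, n_iter=3, test_size=.9, random_state=0):
    # Sketch of the external Classifiers.CrossValidate: repeated
    # shuffle-splits, collecting one score per test appearance of each id.
    Preds = {}
    ss = ShuffleSplit(n_splits=n_iter, test_size=test_size,
                      random_state=random_state)
    for train, test in ss.split(vec):
        cl.fit(vec[train], labels[train])
        if hasattr(cl, 'decision_function'):
            scores = cl.decision_function(vec[test])
        else:
            lp = cl.predict_log_proba(vec[test])
            scores = lp[:, 1] - lp[:, 0]  # log-odds; can be +/-inf
        for idx, s in zip(IDX[test], scores):
            Preds.setdefault(idx, []).append(s)
    return Preds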
Example 6
def Analyze_Individual(metadata, bidx, cl, outfile, **CVargs):
    '''
    grid search shows C>=1 is optimal
    accuracy is unrelated to sample size (remains 84-89% throughout)
    '''
    print 'INDIVIDUAL ANALYSIS'
    #metadata=ImportMeta(-1)
    filename = 'Twitter/Data/Twitter_Individual_Scores.txt'
    data = ImportFeatureData(filename, -1)
    vec = np.array([line[2:] for line in data
                    if line[1] != 1.0])  #exclude cases where sex is never mentioned
    labels = np.array([metadata[line[0]][0] for line in data
                       if line[1] != 1.0])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data if line[1] != 1.0])

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'standardizing scores'
    preds = {}
    for k, score in Preds.iteritems():
        if np.inf in score:
            # drop the infs, then stand in for +inf with the largest
            # finite fold score before averaging
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue  # every fold was inf; skip this id
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -np.inf in score:
            # same idea for -inf, using the smallest finite score
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())
    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='Individual_Preds.csv'
    Write_Scores(preds, ['id', 'indiv_score'], outfile)

    return
Example 7
def Analyze_LIWC(metadata, bidx, cl, outfile, **CVargs):
    filename = 'Twitter/Data/Twitter_LIWC_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]]
                    for line in data])  #feature matrix; column 0 is the id
    labels = np.array([metadata[line[0]][0]
                       for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])

    vec, labels, IDX = balance(vec, labels, IDX, bidx)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    preds = {}
    for k, score in Preds.iteritems():
        if np.inf in score:
            # drop the infs, then stand in for +inf with the largest
            # finite fold score before averaging
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue  # every fold was inf; skip this id
            x.append(max(x))
            preds[k] = np.mean(x)
        elif -np.inf in score:
            # same idea for -inf, using the smallest finite score
            x = list(np.array(score)[np.logical_not(np.isinf(score))])
            if not x:
                continue
            x.append(min(x))
            preds[k] = np.mean(x)
        else:
            preds[k] = np.mean(score)
    m = np.mean(preds.values())
    sd = np.std(preds.values())

    for k, score in preds.iteritems():
        preds[k] = (score - m) / sd
    #fname='LIWC_Preds.csv'
    Write_Scores(preds, ['id', 'liwc_score'], outfile)
    return
Example 8
def hybridTrial(metadata):
    '''
    Combines two of the feature sets above and tests whether the combination
    changes their collective and individual predictability.
    Raw * Raw Topics = ? * .72 = .71 (no change in NB score)
    Subtopics * Raw Topics = .65 * .72 = .69
    '''
    print 'import raw topic scores'
    filename = 'Twitter/Data/Raw_Topic_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]]
                    for line in data])  #feature matrix; column 0 is the id
    labels = np.array([metadata[line[0]][0]
                       for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])
    print 'CV for RAW TOPICS'
    CVargs = {'n_iter': 3, 'test_size': .9, 'random_state': 0}
    cl = mnb()
    #Preds=Classifiers.CrossValidate(vec,labels,IDX,cl,**CVargs)

    print 'importing subtopic scores'
    path = 'Twitter/Data/'
    preds = {}
    Data = []
    for cat in set([line[2] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        print 'RUNNING ', cat, ' SUBTOPIC SCORES'
        f = 'Twitter_' + cat + '_Topic_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        Data.append(data)
        #for line in data:
        #    for idx in IDX:
        #        if line[0]==idx:
        #            rvec.append(line)
        #            break
        #vec=np.array([[float(l) for l in line[1:]] for line in data])   #exclude cases where sex is unknown
        #labels=np.array([metadata[line[0]][0] for line in data])# if 'age' not in line])
        #IDX=np.array([line[0] for line in data])
    print 're-sorting cases to align with labels'
    rvec = [[] for i in IDX]

    for data in Data:
        ids_in_file = set(line[0] for line in data)
        for i, idx in enumerate(IDX):
            if idx in ids_in_file:
                for line in data:
                    if idx == line[0]:
                        rvec[i] += [float(l) for l in line[1:]]
                        break  # one row per id; the original 'continue' kept scanning
            else:
                rvec[i] += [0.0 for l in data[0][1:]]  #zero-fill missing ids

    #used to align RAW
    #for idx in IDX:
    #    if line[0]==idx:
    #        rvec.append(line[1:])
    #        break
    #rlabels.append(metadata[str(int(idx))][0])

    rvec = np.append(vec, np.array(rvec), axis=1)

    print 'crossvalidate testing COMBINATION'
    CVargs = {'n_iter': 3, 'test_size': .9, 'random_state': 0}
    cl = mnb()
    Preds = Classifiers.CrossValidate(rvec, labels, IDX, cl, **CVargs)

    cl = ensemble.AdaBoostClassifier(n_estimators=10)
    Preds = Classifiers.CrossValidate(rvec, labels, IDX, cl, **CVargs)

    return
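The id-alignment loop above rescans each file for every id; an equivalent sketch using a per-file id-to-row dict (linear rather than quadratic), assuming the same zero-fill for missing ids:

import numpy as np

# Same alignment as the loop in hybridTrial, with a dict lookup per file:
# rows missing an id are zero-filled, then the blocks are stacked
# column-wise onto the raw-topic features.
blocks = []
for data in Data:
    lookup = dict((line[0], [float(v) for v in line[1:]]) for line in data)
    width = len(data[0]) - 1
    blocks.append(np.array([lookup.get(idx, [0.0] * width) for idx in IDX]))
rvec = np.hstack([vec] + blocks)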
Example 9
def Analyze_Behavior_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    grid_search across all categories agrees that C=10,000 with a linear or rbf kernel is optimal.
    Sample size doesn't matter because the number of texts per category maxes out at 6,000.
    '''
    print 'BEHAVIOR ANALYSIS'

    #num=1000
    #metadata=ImportMeta(-1)

    path = 'Twitter/Data/'
    PREDS = {}
    for cat in set([line[1] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        #if cat == 'Student' or cat == 'indUnk':
        #    args = {'n_iter': 20, 'test_size': .9, 'random_state': 0}
        args = CVargs.copy()
        print 'RUNNING ', cat, ' BEHAVIOR SCORES'
        f = 'Twitter_Behavior_' + cat + '_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        vec = np.array([[float(l) for l in line[1:]]
                        for line in data])  #feature matrix; column 0 of each line is the id
        labels = np.array([metadata[line[0]][0]
                           for line in data])  # if 'age' not in line])
        IDX = np.array([line[0] for line in data])
        vec, labels, IDX = balance(vec, labels, IDX, bidx)
        Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **args)
        preds = {}
        for k, score in Preds.iteritems():
            if np.inf in score:
                # drop the infs, then stand in for +inf with the largest
                # finite fold score before averaging
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                if not x:
                    continue  # every fold was inf; skip this id
                x.append(max(x))
                preds[k] = np.mean(x)
            elif -np.inf in score:
                # same idea for -inf, using the smallest finite score
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                if not x:
                    continue
                x.append(min(x))
                preds[k] = np.mean(x)
            else:
                preds[k] = np.mean(score)
        m = np.mean(preds.values())
        sd = np.std(preds.values())

        for k, score in preds.iteritems():
            preds[k] = (score - m) / sd
        PREDS.update(preds)
    #fname='Behavior_Preds.csv'
    Write_Scores(PREDS, ['id', 'behavior_score'], outfile)

    return
Example 10
def Analyze_Structure_Scores(metadata, bidx, cl, outfile, **CVargs):
    '''
    SVM - linear kernel with C=10 is best. Accuracy maxes out around 3,000 texts at ~62% but doesn't grow much from 500.
    MNB - sample size appears unrelated to accuracy; it hovers at 58-62% throughout.
    '''
    print 'STRUCTURE ANALYSIS'

    #metadata=ImportMeta(-1)
    path = 'Twitter/Data/'
    PREDS = dict((k, []) for k in metadata)  #one score list per id

    for cat in set([line[1] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        #if cat == 'Student' or cat == 'indUnk':    #For big categories, use different test conditions
        #    args = {'n_iter': 20, 'test_size': .9, 'random_state': 0}
        args = CVargs.copy()
        print 'RUNNING ', cat, ' STRUCTURE SCORES'
        f = 'Twitter_Structure_' + cat + '_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        vec = np.array([[float(l) for l in line[1:]]
                        for line in data])  #feature matrix; column 0 of each line is the id
        labels = np.array([metadata[line[0]][0]
                           for line in data])  # if 'age' not in line])
        IDX = np.array([line[0] for line in data])
        vec, labels, IDX = balance(vec, labels, IDX, bidx)
        Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **args)
        preds = {}
        for k, score in Preds.iteritems():
            if np.inf in score:
                # drop the infs, then stand in for +inf with the largest
                # finite fold score before averaging
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                if not x:
                    continue  # every fold was inf; skip this id
                x.append(max(x))
                preds[k] = np.mean(x)
            elif -np.inf in score:
                # same idea for -inf, using the smallest finite score
                x = list(np.array(score)[np.logical_not(np.isinf(score))])
                if not x:
                    continue
                x.append(min(x))
                preds[k] = np.mean(x)
            else:
                preds[k] = np.mean(score)
        m = np.mean(preds.values())
        sd = np.std(preds.values())

        for k, score in preds.iteritems():
            PREDS[k].append((score - m) / sd)
    #Bagging: average each id's per-category z-scores into a single masculine/feminine score
    for k, scores in PREDS.iteritems():
        PREDS[k] = np.mean(scores)
    Write_Scores(PREDS, ['id', 'struct_score'], outfile)

    return
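One caveat with the bagging average above: ids that never appear in any category file keep an empty score list, and np.mean([]) is nan. A minimal sketch of the averaging step with that guard, which could replace the final loop before Write_Scores:

import numpy as np

# Drop ids that received no category scores; np.mean of an empty list
# is nan and would otherwise propagate into the output file.
PREDS = dict((k, np.mean(v)) for k, v in PREDS.items() if len(v) > 0)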