Example #1
def textAnalysis(paramList):
    startTime=time.time()
    groupId=paramList[0]
    fileList=paramList[1]
    targetWordCount=paramList[2]
    cocoWindow=paramList[3]
    svdInt=paramList[4]
    cvWindow=paramList[5]
    simCount=paramList[6]
    #Get list of subfiles
    subFileList=[x[1] for x in fileList if x[0]==groupId[0] and x[2]==groupId[1]]
    
    tokenList = sd.tokenize(subFileList)
    
    ########################
    ###Sentiment Analysis###
    ########################
    sentimentList=sa.sentimentLookup(tokenList)
    
    ########################################
    ###POS Tagging and Judgement Analysis###
    ########################################
    
    judgementList=[sp.judgements(sp.readText(fileName)) for fileName in subFileList]
    judgementAvg=list(np.mean(np.array(judgementList),axis=0))
    
    txtString=' '.join([sp.readText(fileName) for fileName in subFileList])
    wordList=sp.targetWords(txtString,targetWordCount)
    
    #######################            
    ###Semantic analysis###
    #######################
    
    #Get word coCo
    CoCo, TF, docTF = sd.coOccurence(tokenList,cocoWindow)
    
    #Get DSM
    DSM=sd.DSM(CoCo,svdInt)
    
    #Get context vectors
    #Bring in wordlist
    
    wordList=[stemmer.stem(word) for word in wordList]
    CVDict=sd.contextVectors(tokenList, DSM, wordList, cvWindow)
    
    #Run cosine sim
    cosineSimilarity=sd.averageCosine(CVDict,subFileList,wordList,simCount)
    avgSD=np.mean([x[1] for x in cosineSimilarity])
    
    ############################
    ###Network Quantification###
    ############################
    avgEVC=nq.getNetworkQuant(DSM,wordList)
    
    endTime=time.time()
    timeRun=endTime-startTime
    print('finished running '+'_'.join(groupId)+' in '+str(timeRun)+' seconds')
    sys.stdout.flush()
    #Append outputs to masterOutput
    return(['_'.join(groupId)]+[len(subFileList),timeRun]+sentimentList+judgementAvg+[avgSD]+[avgEVC])   
def runMaster(rawPath,groupList,crossValidate,groupSize,testSplit,targetWordCount,cocoWindow,svdInt,cvWindow,simCount):
    ###############################
    #####Raw File List Extract#####
    ###############################

    rawFileList=[]
    for groupId in groupList:
        for dirpath, dirnames, filenames in os.walk(rawPath+groupId+'/raw'):
            for filename in [f for f in filenames ]:
                if '.txt' in filename:
                    rawFileList.append([groupId,os.path.join(dirpath, filename)])

    #Make output directory
#    runDirectory='./pythonOutput/'+ time.strftime("%c")
#    runDirectory='./pythonOutput/cocowindow_'+str(cocoWindow)+time.strftime("%c").replace(' ','_')
    runDirectory='./pythonOutput/cocowindow_'+str(cocoWindow)
    os.makedirs(runDirectory)
    end=time.time()
    print('finished loading packages after '+str(end-start)+' seconds')
    sys.stdout.flush()
    
    
    #Perform analysis for each fold in cross validation
    for fold in range(crossValidate):                
        ###############################                
        #####Set up random binning#####
        ###############################
                        
        #Loop through each group and create sub bins
        fileList=[]
        for groupId in groupList:
            subGroup=[x for x in rawFileList if groupId == x[0]]
            randomSample=list(np.random.choice(range(len(subGroup)),size=len(subGroup),replace=False))
            splitIndex=int((1-testSplit)*len(subGroup))
            #Randomly assign each file to a train/test bin (use a new name to avoid shadowing the groupId loop variable)
            binLabels=['train'+ "%02d" %int(i/groupSize) if i<splitIndex else 'test'+ "%02d" %int((i-splitIndex)/groupSize) for i in randomSample]
            
            fileList=fileList+[[subGroup[i][0],subGroup[i][1],binLabels[i]] for i in range(len(subGroup))]
        
        fileDF=pd.DataFrame(fileList,columns=['group','filepath','subgroup'])
    
      
        
        #Get set of subgroups
        subgroupList=[ list(y) for y in set((x[0],x[2]) for x in fileList) ]
        
        #Make output directory
        outputDirectory=runDirectory+'/run'+str(fold)
        os.makedirs(outputDirectory)
        
        #Print file splits to runDirectory
        fileDF.to_csv(outputDirectory+'/fileSplits.csv')

        end=time.time()
        print('finished randomly creating subgroups after '+str(end-start)+' seconds')
        sys.stdout.flush()        
        
        ################################
        #####Perform group analysis#####
        ################################
        
        #Set up population coCo and DSM
        trainFileList=[x[1] for x in fileList if 'train' in x[2]]
        tokenList = sd.tokenize(trainFileList) 
        
        #create DSM
        CoCo, TF, docTF = sd.coOccurence(tokenList,cocoWindow)
        
        #Get DSM
        DSM=sd.DSM(CoCo,svdInt)          
        
        #Run calculation
#        masterOutput=[textAnalysis(x) for x in paramList]  
        #Parameter list order must match how textAnalysis unpacks it: groupId, fileList, targetWordCount, cocoWindow, svdInt, cvWindow, simCount
        masterOutput=Parallel(n_jobs=3)(delayed(textAnalysis)([x,fileList,targetWordCount,cocoWindow,svdInt,cvWindow,simCount]) for x in subgroupList)
        #Create output file
        outputDF=pd.DataFrame(masterOutput,columns=['groupId','files','timeRun','perPos','perNeg','perPosDoc','perNegDoc','judgementCount','judgementFrac','avgSD','avgEVC'])
        outputDF.to_csv(outputDirectory+'/masterOutput.csv')
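
#A minimal sketch of how runMaster might be invoked. The rawPath and groupList
#values are placeholders inferred from the file paths shown in later examples,
#crossValidate is a hypothetical choice, and the remaining settings mirror the
#values listed in Example #5.
runMaster(rawPath='./data_dsicap/',
          groupList=['DorothyDay','WBC'],
          crossValidate=5,
          groupSize=10,
          testSplit=0.1,
          targetWordCount=10,
          cocoWindow=6,
          svdInt=50,
          cvWindow=6,
          simCount=1000)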
Example #3
preCrashDateList=datelist(date(2005,1,1),date(2006,12,31))
preCrashFiles=metaFileDF[metaFileDF['datetime'].isin(preCrashDateList)].filePath
crashDateList=datelist(date(2007,1,1),date(2008,12,31))
crashFiles=metaFileDF[metaFileDF['datetime'].isin(crashDateList)].filePath
#Assumed correction: the original duplicated the 2007-2008 crash window here; the post-crash window presumably follows it
postCrashDateList=datelist(date(2009,1,1),date(2010,12,31))
postCrashFiles=metaFileDF[metaFileDF['datetime'].isin(postCrashDateList)].filePath

allFiles=list(preCrashFiles)+list(crashFiles)+list(postCrashFiles)
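
#The datelist helper used above is not shown in this excerpt. A minimal sketch,
#assuming it simply enumerates every calendar date between the two endpoints
#(inclusive):
from datetime import date, timedelta

def datelist(startDate, endDate):
    #Return every date from startDate through endDate, inclusive
    return [startDate + timedelta(days=i) for i in range((endDate - startDate).days + 1)]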

if runSample:
    import random
    allFiles = random.sample(allFiles, 200)

print('starting tokenization')
sys.stdout.flush()
tokens = sd.tokenize(allFiles)
# Filter Tokens List to only Documents that contain a Target Word
tokens = sd.retainRelevantDocs(tokens, wordList)
print('finished tokenization')
sys.stdout.flush()
#Subset tokens only for attack dates
#attackTokens={key:value for key, value in rawTokens.items() if key in allFiles}

preCrashFiles = [doc for doc in preCrashFiles if doc in tokens.keys()]
crashFiles = [doc for doc in crashFiles if doc in tokens.keys()]
postCrashFiles = [doc for doc in postCrashFiles if doc in tokens.keys()]


#Get word coCo
print('starting word coco')
sys.stdout.flush()
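
#The excerpt cuts off before the co-occurrence step. Based on the identical step
#in Examples #1 and #6, the next call presumably builds the co-occurrence matrix
#from the filtered tokens; cocoWindow=6 mirrors the setting shown in Example #5.
cocoWindow = 6  #assumed value, taken from Example #5
CoCo, TF, docTF = sd.coOccurence(tokens, cocoWindow)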
Example #4
@author: nmvenuti
Sentiment Analysis
"""

#Import packages
import sys
import numpy as np
sys.path.append('./python/')
#import syntacticParsing as sp
import semanticDensity as sd

#Get sentimentWord dict and remove duplicates. Store in lists
posFilePath='./refData/positive-words.txt'
negFilePath='./refData/negative-words.txt'
sentDict=sd.tokenize([posFilePath,negFilePath])
posWords=list(set(sentDict[posFilePath]))
negWords=list(set(sentDict[negFilePath]))

#Define functions

tokens=sd.tokenize(['./data_dsicap/DorothyDay/raw/2.html.txt','./data_dsicap/DorothyDay/raw/3.html.txt'])
def sentimentLookup(tokens):
    fileSentiment=[]
    #Get sentiment for each document
    for filename in tokens.keys():
        
        #initialize counters
        wordCount=0.0
        posCount=0.0
        negCount=0.0
Example #5
'./data_dsicap/WBC/raw/WestboroBaptist_Sermon_20100328.pdf.txt',
'./data_dsicap/WBC/raw/WestboroBaptist_Sermon_20090510.pdf.txt'
]



groupSize=10
testSplit=0.1
targetWordCount=10
cocoWindow=6
svdInt=50
cvWindow=6
simCount=1000


tokenList = sd.tokenize(subFileList)

########################
###Sentiment Analysis###
########################
sentimentList=sa.sentimentLookup(tokenList)

########################################
###POS Tagging and Judgement Analysis###
########################################

judgementList=[sp.judgements(sp.readText(fileName)) for fileName in subFileList]
judgementAvg=list(np.mean(np.array(judgementList),axis=0))

txtString=' '.join([sp.readText(fileName) for fileName in subFileList])
wordList=sp.targetWords(txtString,targetWordCount)
Example #6
def textAnalysis(paramList):
    startTime = time.time()
    groupId = paramList[0]
    fileList = paramList[1]
    targetWordCount = paramList[2]
    cocoWindow = paramList[3]
    svdInt = paramList[4]
    cvWindow = paramList[5]
    simCount = paramList[6]
    startCount = paramList[7]
    netAngle = paramList[8]

    #Get list of subfiles
    subFileList = [
        x[1] for x in fileList if x[0] == groupId[0] and x[2] == groupId[1]
    ]

    tokenList = sd.tokenize(subFileList)

    ########################
    ###Sentiment Analysis###
    ########################
    sentimentList = sa.sentimentLookup(tokenList)

    ########################################
    ###POS Tagging and Judgement Analysis###
    ########################################

    judgementList = [
        sp.judgements(sp.readText(fileName)) for fileName in subFileList
    ]
    judgementAvg = list(np.mean(np.array(judgementList), axis=0))

    txtString = ' '.join([sp.readText(fileName) for fileName in subFileList])
    wordList = sp.targetWords(txtString, targetWordCount, startCount)

    #######################
    ###Semantic analysis###
    #######################

    #Get word coCo
    CoCo, TF, docTF = sd.coOccurence(tokenList, cocoWindow)

    #Get DSM
    DSM = sd.DSM(CoCo, svdInt)

    #Get context vectors
    #Bring in wordlist

    wordList = [stemmer.stem(word) for word in wordList]
    CVDict = sd.contextVectors(tokenList, DSM, wordList, cvWindow)

    #Run cosine sim
    cosineSimilarity = sd.averageCosine(CVDict, subFileList, wordList,
                                        simCount)
    avgSD = np.mean([x[1] for x in cosineSimilarity])

    ############################
    ###Network Quantification###
    ############################
    avgEVC = nq.getNetworkQuant(DSM, wordList, netAngle)

    endTime = time.time()
    timeRun = endTime - startTime
    print('finished running ' + '_'.join(groupId) + ' in ' + str(timeRun) +
          ' seconds')
    sys.stdout.flush()

    #Delete and garbage collect
    del CoCo, TF, docTF, DSM, CVDict, cosineSimilarity
    gc.collect()
    #Append outputs to masterOutput
    return (['_'.join(groupId)] + [len(subFileList), timeRun] + sentimentList +
            judgementAvg + [avgSD] + [avgEVC])
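
#A minimal sketch of a serial call to this version of textAnalysis. The list
#order mirrors how the function unpacks paramList above; the groupId pair and
#the startCount and netAngle values are hypothetical placeholders, fileList is
#assumed to be built as in Example #1's runMaster, and the remaining settings
#mirror the values shown in Example #5.
paramList = [
    ['DorothyDay', 'train00'],  #groupId: [group, subgroup] pair (hypothetical)
    fileList,                   #list of [group, filepath, subgroup] rows
    10,                         #targetWordCount
    6,                          #cocoWindow
    50,                         #svdInt
    6,                          #cvWindow
    1000,                       #simCount
    0,                          #startCount (hypothetical)
    30,                         #netAngle (hypothetical)
]
outputRow = textAnalysis(paramList)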
Example #7
preAttackFiles=metaFileDF[metaFileDF['datetime'].isin(preAttackDateList)].filePath
postAttackDateList=datelist(date(2001,9,11),date(2003,9,11))
postAttackFiles=metaFileDF[metaFileDF['datetime'].isin(postAttackDateList)].filePath

print('pre',len(preAttackFiles))
print('post',len(postAttackFiles))
sys.stdout.flush()
allFiles=list(preAttackFiles)+list(postAttackFiles)

if runSample:
    import random
    allFiles = random.sample(allFiles, 200)

print('starting tokenization')
sys.stdout.flush()
attackTokens = sd.tokenize(allFiles)
# Filter Tokens List to only Documents that contain a Target Word
attackTokens = sd.retainRelevantDocs(attackTokens, attackWordList)
print('finished tokenization')
sys.stdout.flush()
#Subset tokens only for attack dates
#attackTokens={key:value for key, value in rawTokens.items() if key in allFiles}

preAttackFiles = [doc for doc in preAttackFiles if doc in attackTokens.keys()]
postAttackFiles = [doc for doc in postAttackFiles if doc in attackTokens.keys()]
print('pre',len(preAttackFiles))
print('post',len(postAttackFiles))
print('all',len(allFiles))
print('alltokens',len(attackTokens.keys()))

#Get word coCo
Example #8
@author: nmvenuti
Sentiment Analysis
"""

#Import packages
import sys
import numpy as np
sys.path.append('./python/')
#import syntacticParsing as sp
import semanticDensity as sd

#Get sentimentWord dict and remove duplicates. Store in lists
posFilePath = './refData/positive-words.txt'
negFilePath = './refData/negative-words.txt'
sentDict = sd.tokenize([posFilePath, negFilePath])
posWords = list(set(sentDict[posFilePath]))
negWords = list(set(sentDict[negFilePath]))

#Define functions

tokens = sd.tokenize([
    './data_dsicap/DorothyDay/raw/2.html.txt',
    './data_dsicap/DorothyDay/raw/3.html.txt'
])


def sentimentLookup(tokens):
    fileSentiment = []
    #Get sentiment for each document
    for filename in tokens.keys():
Example #9
@author: nmvenuti
Sentiment Analysis
"""

#Import packages
import sys
import numpy as np
sys.path.append('./prototype_python/')
#import syntacticParsing as sp
import semanticDensity as sd

#Get sentimentWord dict and remove duplicates. Store in lists
posFilePath='./refData/positive-words.txt'
negFilePath='./refData/negative-words.txt'
sentDict=sd.tokenize([posFilePath,negFilePath])
posWords=list(set(sentDict[posFilePath]))
negWords=list(set(sentDict[negFilePath]))

#Define functions

def sentimentLookup(tokens):
    fileSentiment=[]
    #Get sentiment for each document
    for filename in tokens.keys():
        
        #initialize counters
        wordCount=0.0
        posCount=0.0
        negCount=0.0
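        #The original excerpt ends here. The lines below are a sketch of how the
        #body might continue, assuming the posWords/negWords lists defined above
        #and the perPos/perNeg/perPosDoc/perNegDoc outputs used in Example #1;
        #the original logic is not shown, so treat this as an illustration only.
        for word in tokens[filename]:
            wordCount+=1.0
            if word in posWords:
                posCount+=1.0
            elif word in negWords:
                negCount+=1.0
        if wordCount>0:
            fileSentiment.append([filename,posCount/wordCount,negCount/wordCount])

    #Aggregate to corpus level: average word-level fractions plus the share of
    #documents that lean positive or negative
    perPos=float(np.mean([x[1] for x in fileSentiment]))
    perNeg=float(np.mean([x[2] for x in fileSentiment]))
    perPosDoc=float(np.mean([1.0 if x[1]>x[2] else 0.0 for x in fileSentiment]))
    perNegDoc=float(np.mean([1.0 if x[2]>x[1] else 0.0 for x in fileSentiment]))
    return [perPos,perNeg,perPosDoc,perNegDoc]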