def textAnalysis(paramList):
    startTime = time.time()
    groupId = paramList[0]
    fileList = paramList[1]
    targetWordCount = paramList[2]
    cocoWindow = paramList[3]
    svdInt = paramList[4]
    cvWindow = paramList[5]
    simCount = paramList[6]

    #Get list of subfiles
    subFileList = [x[1] for x in fileList if x[0] == groupId[0] and x[2] == groupId[1]]
    tokenList = sd.tokenize(subFileList)

    ########################
    ###Sentiment Analysis###
    ########################
    sentimentList = sa.sentimentLookup(tokenList)

    ########################################
    ###POS Tagging and Judgement Analysis###
    ########################################
    judgementList = [sp.judgements(sp.readText(fileName)) for fileName in subFileList]
    judgementAvg = list(np.mean(np.array(judgementList), axis=0))
    txtString = ' '.join([sp.readText(fileName) for fileName in subFileList])
    wordList = sp.targetWords(txtString, targetWordCount)

    #######################
    ###Semantic analysis###
    #######################
    #Get word co-occurrence counts
    CoCo, TF, docTF = sd.coOccurence(tokenList, cocoWindow)

    #Get DSM
    DSM = sd.DSM(CoCo, svdInt)

    #Get context vectors
    #Bring in wordlist (stemmed to match the DSM vocabulary)
    wordList = [stemmer.stem(word) for word in wordList]
    CVDict = sd.contextVectors(tokenList, DSM, wordList, cvWindow)

    #Run cosine similarity
    cosineSimilarity = sd.averageCosine(CVDict, subFileList, wordList, simCount)
    avgSD = np.mean([x[1] for x in cosineSimilarity])

    ############################
    ###Network Quantification###
    ############################
    avgEVC = nq.getNetworkQuant(DSM, wordList)

    endTime = time.time()
    timeRun = endTime - startTime
    print('finished running ' + '_'.join(groupId) + ' in ' + str(timeRun) + ' seconds')
    sys.stdout.flush()

    #Append outputs to masterOutput
    return (['_'.join(groupId)] + [len(subFileList), timeRun] + sentimentList
            + judgementAvg + [avgSD] + [avgEVC])
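#Hedged usage sketch (not from the source): scoring a single subgroup with
#textAnalysis. subgroupList entries are [group, subgroup] pairs and fileList rows
#are [group, filepath, subgroup], as built in runMaster below; the group name and
#parameter values here are illustrative assumptions.
exampleParams = [['DorothyDay', 'train00'],   #groupId: [group, subgroup]
                 fileList,                    #[group, filepath, subgroup] rows
                 10,                          #targetWordCount
                 6,                           #cocoWindow
                 50,                          #svdInt
                 6,                           #cvWindow
                 1000]                        #simCount
exampleRow = textAnalysis(exampleParams)      #one row of masterOutput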
def runMaster(rawPath, groupList, crossValidate, groupSize, testSplit,
              targetWordCount, cocoWindow, svdInt, cvWindow, simCount):
    ###############################
    #####Raw File List Extract#####
    ###############################
    rawFileList = []
    for groupId in groupList:
        for dirpath, dirnames, filenames in os.walk(rawPath + groupId + '/raw'):
            for filename in filenames:
                if '.txt' in filename:
                    rawFileList.append([groupId, os.path.join(dirpath, filename)])

    #Make output directory
    # runDirectory='./pythonOutput/'+ time.strftime("%c")
    # runDirectory='./pythonOutput/cocowindow_'+str(cocoWindow)+time.strftime("%c").replace(' ','_')
    runDirectory = './pythonOutput/cocowindow_' + str(cocoWindow)
    os.makedirs(runDirectory)

    end = time.time()
    print('finished loading packages after ' + str(end - start) + ' seconds')
    sys.stdout.flush()

    #Perform analysis for each fold in cross validation
    for fold in range(crossValidate):
        ###############################
        #####Set up random binning#####
        ###############################
        #Loop through each group and create sub bins
        fileList = []
        for groupId in groupList:
            subGroup = [x for x in rawFileList if groupId == x[0]]
            randomSample = list(np.random.choice(range(len(subGroup)), size=len(subGroup), replace=False))
            splitIndex = int((1 - testSplit) * len(subGroup))
            subgroupLabels = ['train' + "%02d" % int(i / groupSize) if i < splitIndex
                              else 'test' + "%02d" % int((i - splitIndex) / groupSize)
                              for i in randomSample]
            fileList = fileList + [[subGroup[i][0], subGroup[i][1], subgroupLabels[i]]
                                   for i in range(len(subGroup))]

        fileDF = pd.DataFrame(fileList, columns=['group', 'filepath', 'subgroup'])

        #Get set of subgroups
        subgroupList = [list(y) for y in set((x[0], x[2]) for x in fileList)]

        #Make output directory
        outputDirectory = runDirectory + '/run' + str(fold)
        os.makedirs(outputDirectory)

        #Print file splits to runDirectory
        fileDF.to_csv(outputDirectory + '/fileSplits.csv')

        end = time.time()
        print('finished randomly creating subgroups ' + str(end - start) + ' seconds')
        sys.stdout.flush()

        ################################
        #####Perform group analysis#####
        ################################
        #Set up population coCo and DSM from the training files
        trainFileList = [x[1] for x in fileList if 'train' in x[2]]
        tokenList = sd.tokenize(trainFileList)

        #create DSM
        CoCo, TF, docTF = sd.coOccurence(tokenList, cocoWindow)

        #Get DSM
        DSM = sd.DSM(CoCo, svdInt)

        #Run calculation
        # masterOutput=[textAnalysis(x) for x in paramList]
        masterOutput = Parallel(n_jobs=3)(
            delayed(textAnalysis)([x, fileList, targetWordCount, cocoWindow, svdInt, cvWindow, simCount])
            for x in subgroupList)

        #Create output file
        outputDF = pd.DataFrame(masterOutput,
                                columns=['groupId', 'files', 'timeRun', 'perPos', 'perNeg', 'perPosDoc',
                                         'perNegDoc', 'judgementCount', 'judgementFrac', 'avgSD', 'avgEVC'])
        outputDF.to_csv(outputDirectory + '/masterOutput.csv')
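#Hedged usage sketch (not from the source): invoking runMaster with illustrative
#arguments. The group names come from paths seen elsewhere in these scripts and
#the numeric values mirror the manual walkthrough below; rawPath is assumed to
#contain one '<group>/raw' folder of .txt files per group.
if __name__ == '__main__':
    runMaster(rawPath='./data_dsicap/',
              groupList=['DorothyDay', 'WBC'],
              crossValidate=5,        #number of random train/test folds
              groupSize=10,           #files per train/test subgroup
              testSplit=0.1,          #fraction of files held out as test
              targetWordCount=10,
              cocoWindow=6,
              svdInt=50,
              cvWindow=6,
              simCount=1000)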
preCrashDateList = datelist(date(2005, 1, 1), date(2006, 12, 31))
preCrashFiles = metaFileDF[metaFileDF['datetime'].isin(preCrashDateList)].filePath

crashDateList = datelist(date(2007, 1, 1), date(2008, 12, 31))
crashFiles = metaFileDF[metaFileDF['datetime'].isin(crashDateList)].filePath

#post-crash window assumed to be 2009-2010; the source repeated the 2007-2008 crash window here
postCrashDateList = datelist(date(2009, 1, 1), date(2010, 12, 31))
postCrashFiles = metaFileDF[metaFileDF['datetime'].isin(postCrashDateList)].filePath

allFiles = list(preCrashFiles) + list(crashFiles) + list(postCrashFiles)

if runSample == True:
    import random
    allFiles = random.sample(allFiles, 200)

print('starting tokenization')
sys.stdout.flush()

tokens = sd.tokenize(allFiles)

# Filter tokens list to only documents that contain a target word
tokens = sd.retainRelevantDocs(tokens, wordList)

print('finished tokenization')
sys.stdout.flush()

#Keep only files whose documents survived tokenization and filtering
#attackTokens={key:value for key, value in rawTokens.items() if key in allFiles}
preCrashFiles = [doc for doc in preCrashFiles if doc in tokens.keys()]
crashFiles = [doc for doc in crashFiles if doc in tokens.keys()]
postCrashFiles = [doc for doc in postCrashFiles if doc in tokens.keys()]

#Get word co-occurrence
print('starting word coco')
sys.stdout.flush()
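#datelist() is used above but not defined in these excerpts. A minimal sketch of
#what it presumably does (return every date in the closed interval), assuming
#datetime.date inputs:
from datetime import date, timedelta

def datelist(startDate, endDate):
    """Return all dates from startDate through endDate inclusive."""
    return [startDate + timedelta(days=i)
            for i in range((endDate - startDate).days + 1)]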
@author: nmvenuti
Sentiment Analysis
"""

#Import packages
import sys
import numpy as np

sys.path.append('./python/')
#import syntacticParsing as sp
import semanticDensity as sd

#Get sentimentWord dict and remove duplicates. Store in lists
posFilePath = './refData/positive-words.txt'
negFilePath = './refData/negative-words.txt'
sentDict = sd.tokenize([posFilePath, negFilePath])
posWords = list(set(sentDict[posFilePath]))
negWords = list(set(sentDict[negFilePath]))

#Define test tokens
tokens = sd.tokenize(['./data_dsicap/DorothyDay/raw/2.html.txt',
                      './data_dsicap/DorothyDay/raw/3.html.txt'])

def sentimentLookup(tokens):
    fileSentiment = []
    #Get sentiment for each document
    for filename in tokens.keys():
        #initialize counters
        wordCount = 0.0
        posCount = 0.0
        negCount = 0.0
    './data_dsicap/WBC/raw/WestboroBaptist_Sermon_20100328.pdf.txt',
    './data_dsicap/WBC/raw/WestboroBaptist_Sermon_20090510.pdf.txt'
]

groupSize = 10
testSplit = 0.1
targetWordCount = 10
cocoWindow = 6
svdInt = 50
cvWindow = 6
simCount = 1000

tokenList = sd.tokenize(subFileList)

########################
###Sentiment Analysis###
########################
sentimentList = sa.sentimentLookup(tokenList)

########################################
###POS Tagging and Judgement Analysis###
########################################
judgementList = [sp.judgements(sp.readText(fileName)) for fileName in subFileList]
judgementAvg = list(np.mean(np.array(judgementList), axis=0))
txtString = ' '.join([sp.readText(fileName) for fileName in subFileList])
wordList = sp.targetWords(txtString, targetWordCount)
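#The excerpt stops after the judgement step. Assuming this manual walkthrough
#mirrors textAnalysis above (and that stemmer is defined elsewhere, e.g. an NLTK
#Porter stemmer), the semantic-density portion would continue roughly as:
CoCo, TF, docTF = sd.coOccurence(tokenList, cocoWindow)   #co-occurrence within a 6-word window
DSM = sd.DSM(CoCo, svdInt)                                #50-dimensional SVD space
wordList = [stemmer.stem(word) for word in wordList]      #stem targets to match the DSM vocabulary
CVDict = sd.contextVectors(tokenList, DSM, wordList, cvWindow)
cosineSimilarity = sd.averageCosine(CVDict, subFileList, wordList, simCount)
avgSD = np.mean([x[1] for x in cosineSimilarity])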
def textAnalysis(paramList):
    startTime = time.time()
    groupId = paramList[0]
    fileList = paramList[1]
    targetWordCount = paramList[2]
    cocoWindow = paramList[3]
    svdInt = paramList[4]
    cvWindow = paramList[5]
    simCount = paramList[6]
    startCount = paramList[7]
    netAngle = paramList[8]

    #Get list of subfiles
    subFileList = [x[1] for x in fileList if x[0] == groupId[0] and x[2] == groupId[1]]
    tokenList = sd.tokenize(subFileList)

    ########################
    ###Sentiment Analysis###
    ########################
    sentimentList = sa.sentimentLookup(tokenList)

    ########################################
    ###POS Tagging and Judgement Analysis###
    ########################################
    judgementList = [sp.judgements(sp.readText(fileName)) for fileName in subFileList]
    judgementAvg = list(np.mean(np.array(judgementList), axis=0))
    txtString = ' '.join([sp.readText(fileName) for fileName in subFileList])
    wordList = sp.targetWords(txtString, targetWordCount, startCount)

    #######################
    ###Semantic analysis###
    #######################
    #Get word co-occurrence counts
    CoCo, TF, docTF = sd.coOccurence(tokenList, cocoWindow)

    #Get DSM
    DSM = sd.DSM(CoCo, svdInt)

    #Get context vectors
    #Bring in wordlist (stemmed to match the DSM vocabulary)
    wordList = [stemmer.stem(word) for word in wordList]
    CVDict = sd.contextVectors(tokenList, DSM, wordList, cvWindow)

    #Run cosine similarity
    cosineSimilarity = sd.averageCosine(CVDict, subFileList, wordList, simCount)
    avgSD = np.mean([x[1] for x in cosineSimilarity])

    ############################
    ###Network Quantification###
    ############################
    avgEVC = nq.getNetworkQuant(DSM, wordList, netAngle)

    endTime = time.time()
    timeRun = endTime - startTime
    print('finished running ' + '_'.join(groupId) + ' in ' + str(timeRun) + ' seconds')
    sys.stdout.flush()

    #Delete and garbage collect
    del CoCo, TF, docTF, DSM, CVDict, cosineSimilarity
    gc.collect()

    #Append outputs to masterOutput
    return (['_'.join(groupId)] + [len(subFileList), timeRun] + sentimentList
            + judgementAvg + [avgSD] + [avgEVC])
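#The stemmer used above is never defined in these excerpts; a minimal sketch of
#the assumed setup (NLTK's Porter stemmer), plus the nine-element paramList this
#extended version of textAnalysis expects. The startCount and netAngle values
#are illustrative assumptions, not values from the source.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

exampleParams = [['WBC', 'test00'],   #groupId: [group, subgroup]
                 fileList,            #[group, filepath, subgroup] rows
                 10,                  #targetWordCount
                 6,                   #cocoWindow
                 50,                  #svdInt
                 6,                   #cvWindow
                 1000,                #simCount
                 0,                   #startCount (assumed)
                 30]                  #netAngle (assumed)
exampleRow = textAnalysis(exampleParams)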
preAttackFiles = metaFileDF[metaFileDF['datetime'].isin(preAttackDateList)].filePath

postAttackDateList = datelist(date(2001, 9, 11), date(2003, 9, 11))
postAttackFiles = metaFileDF[metaFileDF['datetime'].isin(postAttackDateList)].filePath

print('pre', len(preAttackFiles))
print('post', len(postAttackFiles))
sys.stdout.flush()

allFiles = list(preAttackFiles) + list(postAttackFiles)

if runSample == True:
    import random
    allFiles = random.sample(allFiles, 200)

print('starting tokenization')
sys.stdout.flush()

attackTokens = sd.tokenize(allFiles)

# Filter tokens list to only documents that contain a target word
attackTokens = sd.retainRelevantDocs(attackTokens, attackWordList)

print('finished tokenization')
sys.stdout.flush()

#Subset tokens only for attack dates
#attackTokens={key:value for key, value in rawTokens.items() if key in allFiles}
preAttackFiles = [doc for doc in preAttackFiles if doc in attackTokens.keys()]
postAttackFiles = [doc for doc in postAttackFiles if doc in attackTokens.keys()]

print('pre', len(preAttackFiles))
print('post', len(postAttackFiles))
print('all', len(allFiles))
print('alltokens', len(attackTokens.keys()))

#Get word co-occurrence
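#The excerpt ends at the co-occurrence step. Assuming the attack corpus follows
#the same pipeline as the group analysis above (and that cocoWindow, svdInt,
#cvWindow, simCount, and stemmer are set earlier in the script), the next steps
#would look roughly like:
CoCo, TF, docTF = sd.coOccurence(attackTokens, cocoWindow)   #population co-occurrence counts
DSM = sd.DSM(CoCo, svdInt)                                   #shared semantic space for both periods
stemmedWordList = [stemmer.stem(word) for word in attackWordList]
CVDict = sd.contextVectors(attackTokens, DSM, stemmedWordList, cvWindow)
preCosine = sd.averageCosine(CVDict, list(preAttackFiles), stemmedWordList, simCount)
postCosine = sd.averageCosine(CVDict, list(postAttackFiles), stemmedWordList, simCount)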
@author: nmvenuti
Sentiment Analysis
"""

#Import packages
import sys
import numpy as np

sys.path.append('./prototype_python/')
#import syntacticParsing as sp
import semanticDensity as sd

#Get sentimentWord dict and remove duplicates. Store in lists
posFilePath = './refData/positive-words.txt'
negFilePath = './refData/negative-words.txt'
sentDict = sd.tokenize([posFilePath, negFilePath])
posWords = list(set(sentDict[posFilePath]))
negWords = list(set(sentDict[negFilePath]))

#Define functions
def sentimentLookup(tokens):
    fileSentiment = []
    #Get sentiment for each document
    for filename in tokens.keys():
        #initialize counters
        wordCount = 0.0
        posCount = 0.0
        negCount = 0.0
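#The excerpt truncates inside sentimentLookup. A standalone sketch (an assumption,
#not the author's code) of how the lookup plausibly finishes, matching the
#perPos/perNeg/perPosDoc/perNegDoc columns written by runMaster:
def sentimentLookupSketch(tokens, posWords, negWords):
    fileSentiment = []
    for filename in tokens.keys():
        wordCount, posCount, negCount = 0.0, 0.0, 0.0
        for word in tokens[filename]:
            wordCount += 1
            if word in posWords:
                posCount += 1
            elif word in negWords:
                negCount += 1
        if wordCount > 0:
            #per-document positive and negative word rates
            fileSentiment.append([posCount / wordCount, negCount / wordCount])
    #Corpus-level outputs: average word-level rates and the share of documents
    #leaning positive or negative
    perPos = np.mean([x[0] for x in fileSentiment])
    perNeg = np.mean([x[1] for x in fileSentiment])
    perPosDoc = np.mean([1.0 if x[0] > x[1] else 0.0 for x in fileSentiment])
    perNegDoc = np.mean([1.0 if x[1] > x[0] else 0.0 for x in fileSentiment])
    return [perPos, perNeg, perPosDoc, perNegDoc]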