#!/usr/bin/python2.7
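"""Extract per-document statistics (word, paragraph, and part-of-speech
counts) from every .txt file under corpus/ and append one CSV row per
file to corpusData.csv; files already listed there are skipped."""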
from os import listdir
from string import letters, digits
from csv import reader as csvReader
from nltk import pos_tag
from nltk.tokenize import sent_tokenize as st
from nltk.tokenize import word_tokenize as wt
from nltk.tokenize import WhitespaceTokenizer

def mean(l):
    # float() so the average is not truncated by Python 2 integer division
    return float(sum(l)) / len(l) if len(l) > 0 else 0
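

# Map Penn Treebank POS tags onto WordNet's single-letter POS codes
# ('n' noun, 'v' verb, 'a' adjective, 'r' adverb); other tags map to None.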
def convertTag(tag):
    nounTags = ['NN', 'NNP', 'NNS', 'NNPS']
    verbTags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adjTags = ['JJ', 'JJR', 'JJS']
    advTags = ['RB', 'RBR', 'RBS']
    if tag in nounTags:
        return 'n'
    elif tag in verbTags:
        return 'v'
    elif tag in adjTags:
        return 'a'
    elif tag in advTags:
        return 'r'
    else:
        return None
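
# e.g. convertTag('NNS') -> 'n', convertTag('VBD') -> 'v', convertTag('DT') -> None
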
resultsFile = open('corpusData.csv', 'r')
# files that we have already looked at (first column of each existing row)
skipOver = [row[0] for row in csvReader(resultsFile, delimiter=',')]
# close the read handle and reopen in append mode; reusing a single handle
# for both reading and appending was causing problems
resultsFile.close()
resultsFile = open('corpusData.csv', 'a+')
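
# process every .txt file in the corpus directory that hasn't been seen yet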
for f in listdir('corpus/'):
    if f[-4:] == ".txt" and f not in skipOver:
        fileName = f
        F = open('corpus/' + f)
        text = F.read()
        F.close()
        alphanum = letters + digits
        # split on blank lines; drop empty chunks and the trailing partial one
        paragraphs = [s for s in text.split("\n\n") if s != ""][:-1]
        numParagraphs = len(paragraphs)
        # average paragraph size
        wst = WhitespaceTokenizer()
        paraWordCounts = [len(wst.tokenize(p)) for p in paragraphs]
        # the approximate number of words in the document
        numWords = sum(paraWordCounts)
        # the average number of words per paragraph
        avgParagraphLen = mean(paraWordCounts)
        # rejoin the paragraphs
        text = ' '.join(paragraphs)
        # POS-tagged word list for the text: sentence-split, then tokenize and
        # tag each sentence, then flatten into one list of (word, tag) pairs
        text = [word for subl in [pos_tag(wt(s)) for s in st(text)] for word in subl]
        # remove symbols by checking the first character of each word
        text = [word for word in text if word[0][0] in alphanum]
        # lowercase the words and convert Penn Treebank tags to WordNet tags
        text = [(word[0].lower(), convertTag(word[1])) for word in text]
        # drop words whose tag did not map to a WordNet POS (convertTag gave None)
        text = [word for word in text if word[1]]
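
        # tally each of the four open word classes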
        nouns = [word for word in text if word[1] == 'n']
        numNouns = len(nouns)
        verbs = [word for word in text if word[1] == 'v']
        numVerbs = len(verbs)
        adjectives = [word for word in text if word[1] == 'a']
        numAdjectives = len(adjectives)
        adverbs = [word for word in text if word[1] == 'r']
        numAdverbs = len(adverbs)
        numTargetWords = numNouns + numVerbs + numAdjectives + numAdverbs
        finalOutput = '%s,%s,%s,%s,%s,%s,%s,%s,%s' % (fileName, numWords, numParagraphs, avgParagraphLen, numNouns, numVerbs, numAdjectives, numAdverbs, numTargetWords)
        print finalOutput
        # append the row to the csv file
        resultsFile.write(finalOutput + '\n')
    else:
        print 'Skipping %s' % f
resultsFile.close()
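
# Each appended row has the form:
# fileName,numWords,numParagraphs,avgParagraphLen,numNouns,numVerbs,numAdjectives,numAdverbs,numTargetWords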