# -*- coding: utf-8 -*-
"""
Created on Sat May 30 16:14:18 2020
@author: Philip Combiths
Phon DPA Script Auxiliar Functions
"""
from __future__ import absolute_import
from __future__ import print_function

import io
import os
import sys
from collections import Counter
from contextlib import contextmanager

import pandas as pd
# May also require xlrd install as dependency for pandas
import regex as re
import unicodecsv as csv  # used in place of stdlib csv for utf-8 output
from six.moves import input

# Establish origin directory and context navigation.
# abspath() guards against an empty dirname when the script is run from
# its own folder (os.chdir('') would raise an error).
os.chdir(os.path.dirname(os.path.abspath(sys.argv[0])))
owd = os.getcwd()

@contextmanager
def enter_dir(newdir):
    """Temporarily change the working directory; restore it on exit."""
    prevdir = os.getcwd()
    try:
        yield os.chdir(newdir)
    finally:
        os.chdir(prevdir)


@contextmanager
def change_dir(newdir):
    """Like enter_dir(), but expands '~' in the target path first."""
    prevdir = os.getcwd()
    try:
        yield os.chdir(os.path.expanduser(newdir))
    finally:
        os.chdir(prevdir)
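
# Example: both context managers restore the previous working directory even
# if the body raises. A minimal sketch (assumes an 'excel' subdirectory
# exists next to this script):
#
#     with enter_dir('excel'):
#         xls_files = os.listdir(os.getcwd())
#     # back in the original directory here
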
excludeList = ['(clock)', '(eat it)', '(pole)',
               '(pulling)', '(sweatshirt)', '(that one)', '(that)', '(thunder)',
               'ziggy', 'pitch', 'quɑrter', 'nose', "'fire'"]
excludeListSpaces = [' (clock)', ' (eat it)', ' (pole)',
                     ' (pulling)', ' (sweatshirt)', ' (that one)', ' (that)',
                     ' (thunder)', ' ziggy', ' pitch ', ' quɑrter',
                     ' nose', " 'fire'"]
removeList = excludeList + ["(incomplete transcription)", "ɴʀ", "NR", r"\[\]",
                            "", "ᵗ", "□", "tuntun", "goʊːt", "ʃiz"]

def accessExcelDict(xlsDirName):
    """
    From a directory of xls files in the current working directory, returns
    a dictionary of dictionaries containing each Excel sheet/tab as a
    pandas DataFrame.

    Parameters:
        xlsDirName : a directory name of xls files named '####_PHON.xls'
    Returns:
        xlsDict : a dict {#### : dict{sheet : DataFrame}}
    """
    xlsDict = {}
    with enter_dir(xlsDirName):
        print('Reading xls files to pandas DataFrames...')
        # For each file in the xls directory...
        for file in os.listdir():
            # Read Excel file as dict of pandas DataFrames, keyed by sheet name
            try:
                data_xls = pd.read_excel(file, None)
            except Exception:
                print(sys.exc_info()[1])
                print('Unable to read {}'.format(file))
                continue
            # Extract participant number from file name
            name = file[:file.find('_')]
            xlsDict.update({name: data_xls})
    print('DataFrames generated')
    return xlsDict
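
# Example (hypothetical participant ID '1001' and probe sheet name 'PKP';
# substitute keys that exist in your data):
#
#     xlsDict = accessExcelDict('excel')
#     df = xlsDict['1001']['PKP']   # one probe sheet as a DataFrame
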
def accessExcelGenerator(sheetSelection='probes'):
    """
    Generator. Continuation of accessExcelDict(). From a dictionary of
    dictionaries of DataFrames, iterates through each DataFrame.

    Parameters:
        sheetSelection : str indicating which Excel sheets to extract.
            'probes' : (default) every probe sheet
            'allsheets' : every sheet, including 'Copyright', 'Probe Schedule'
            a probe name : matches and extracts only the given probe
    Yields tuple(partID, sheet, dfSheet)
    """
    xlsDict = accessExcelDict('excel')
    if sheetSelection == 'probes':  # Extract probe sheets only
        for partID in xlsDict:
            for sheet in xlsDict[partID]:
                if sheet == 'Copyright' or sheet == 'Probe Schedule':
                    continue
                dfSheet = xlsDict[partID][sheet]
                yield partID, sheet, dfSheet
    elif sheetSelection == 'allsheets':  # Extract all sheets
        for partID in xlsDict:
            for sheet in xlsDict[partID]:
                dfSheet = xlsDict[partID][sheet]
                yield partID, sheet, dfSheet
    else:  # Extract specified sheet only
        for partID in xlsDict:
            for sheet in xlsDict[partID]:
                if sheet == sheetSelection:
                    dfSheet = xlsDict[partID][sheet]
                    yield partID, sheet, dfSheet
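
# Example: iterate over every probe sheet without naming files by hand:
#
#     for partID, sheet, df in accessExcelGenerator('probes'):
#         print(partID, sheet, df.shape)
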
def genRawCSV():
    """
    From a directory of xls files from the Developmental Phonologies Archive
    (DPA; Gierut, 2015), extracts probe transcription data and exports it as
    csv files, organized by participant ID.

    Requires:
        'excel' directory containing DPA xls files in current working
        directory
    Generates:
        'rawCSV' directory containing data in csv files, in current working
        directory
    """
    # Set default directory to location of script
    os.chdir(os.path.dirname(os.path.abspath(sys.argv[0])))
    cwd = os.getcwd()
    ## Create raw csv files
    print('********Create raw, untranslated csv files from xls files********')
    # Preset xls_dir; if the preset path is missing, get user input
    xls_dir = os.path.join(cwd, r'excel')
    root_dir = cwd
    if not os.path.isdir(xls_dir):
        xls_dir = os.path.normpath(input(
            'xls directory not specified. Enter xls directory path: '))
    with change_dir(os.path.normpath(xls_dir)):
        print("XLS Directory set to: ", os.getcwd())
        # For each file in directory xls_dir...
        for file in os.listdir(xls_dir):
            # Read Excel file as dictionary of pandas DataFrames (data_xls),
            # keyed by sheet name
            try:
                data_xls = pd.read_excel(file, None)
            except Exception:
                print(sys.exc_info()[1])
                print('Unable to read {}'.format(file))
                continue
            # Extract participant number from file name
            name = file[:file.find('_')]
            with change_dir(os.path.normpath(root_dir)):
                # Create new subdirectory to place csv files
                try:
                    os.makedirs(os.path.join('rawCSV', name))
                except FileExistsError:
                    print('rawCSV/{} directory already created.'.format(name))
                # Change to new subdirectory
                with change_dir(os.path.join(root_dir, 'rawCSV', name)):
                    print("Working in directory: ", os.getcwd())
                    # For each sheet in file...
                    for sheet in data_xls:
                        # Exclude 'Probe Schedule' and 'Copyright' sheets
                        if sheet == 'Probe Schedule' or sheet == 'Copyright':
                            continue
                        # Save DataFrame for sheet to csv:
                        # name by sheet, encode as UTF-8, omit row index
                        data_xls[sheet].to_csv(sheet + '.csv',
                                               encoding='utf-8', index=False)
            print('{} raw csv files complete'.format(name))
    print("All raw csv files created in rawCSV folder")
def participantTranscriptConv():
    """
    Searches 'Probe Schedule' sheets in directory of xls files for
    transcription notes. Generates notes by participant as a csv file.

    Returns generated DataFrame
    """
    dfNotes = pd.DataFrame({'CA': [],
                            'Probe': [],
                            'Participant': []})
    for partID, sheet, df in accessExcelGenerator('Probe Schedule'):
        # .copy() avoids modifying a view of the original DataFrame
        notesOnly = df[df['CA'].str.contains("Note")].copy()
        notesOnly['Participant'] = partID
        # DataFrame.append was removed in pandas 2.0; concat instead
        dfNotes = pd.concat([dfNotes, notesOnly], ignore_index=True)
    dfNotes = dfNotes[['Participant', 'Probe']]
    dfNotes = dfNotes.rename(columns={'Probe': 'Convention'})
    try:
        os.makedirs('info')
    except FileExistsError:
        pass
    dfNotes.to_csv(os.path.join('info', 'transcriptionNotes.csv'),
                   encoding='utf-8', index=False)
    print("'transcriptionNotes.csv' created in 'info' folder.")
    return dfNotes
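
# Example: collect transcription conventions across participants:
#
#     dfNotes = participantTranscriptConv()
#     dfNotes.head()   # columns: Participant, Convention
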
def combiningStrip(text):
    """
    From a string, removes combining diacritics and modifiers.

    Parameters:
        text : string
    Requires regex module as re
    Returns string with combining characters removed
    """
    assert type(text) is str
    unicodeBlockList = [r'\p{InCombining_Diacritical_Marks_for_Symbols}',
                        r'\p{InSuperscripts_and_Subscripts}',
                        r'\p{InCombining_Diacritical_Marks}',
                        r'\p{InSpacing_Modifier_Letters}',
                        r'\p{InCombining_Diacritical_Marks_Extended}',
                        r'\p{InCombining_Diacritical_Marks_Supplement}']
    additionalChars = [r'ᴸ', r'ᵇ', r':', r'<', r'←', r'=', r"'", r"‚"]
    pattern = r'(' + r'|'.join(unicodeBlockList + additionalChars) + r')'
    pattern = re.compile(pattern)
    result = re.subn(pattern, '', text)
    return result[0]
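
# Example: the aspiration modifier 'ʰ' (Spacing Modifier Letters) and the
# combining tilde are both stripped, leaving base characters only:
#
#     combiningStrip('kʰæ̃t')   # -> 'kæt'
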
def reDiac():
    """
    Generates regex pattern to locate diacritics.

    Requires regex module as re
    Returns compiled regex pattern
    """
    unicodeBlockList = [r'\p{InCombining_Diacritical_Marks_for_Symbols}',
                        r'\p{InSuperscripts_and_Subscripts}',
                        r'\p{InCombining_Diacritical_Marks}',
                        r'\p{InSpacing_Modifier_Letters}',
                        r'\p{InCombining_Diacritical_Marks_Extended}',
                        r'\p{InCombining_Diacritical_Marks_Supplement}']
    additionalChars = [r'ᴸ', r'ᵇ', r':', r'<', r'←', r'=', r"'", r"‚", r"ᵊ"]
    pattern = r'(' + r'|'.join(unicodeBlockList + additionalChars) + r')'
    pattern = re.compile(pattern)
    return pattern
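
# Example: count the diacritics in a transcription:
#
#     pattern = reDiac()
#     len(re.findall(pattern, 'kʰæ̃t'))   # -> 2 ('ʰ' plus the combining tilde)
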
def extractSegments(segmentType):
    """
    Given user-specified segmentType, searches all whitespace-separated
    transcriptions in xls files, returns unique results as a list, and saves
    the result as csv in 'info' directory.

    Parameters:
        segmentType : str
            'phones' for all unitary and multi-component phones with diacritics
            'compounds' for compound phones only
            'characters' for all characters
    Requires accessExcelDict(), combiningStrip()
    Returns list of unique results and saves as csv in 'info' directory
    """
    assert segmentType in ['phones', 'compounds', 'characters',
                           'full_compounds'], """
    segmentType must be specified as:
        'phones' for all unitary and multi-component phones with diacritics
        'compounds' for base compound phones only
        'full_compounds' for compound phones with diacritics
        'characters' for all characters"""
    xlsDict = accessExcelDict('excel')
    result = set()
    for xls in xlsDict:
        for sheet in xlsDict[xls]:
            # Skip Copyright and Probe Schedule sheets
            if sheet == 'Copyright' or sheet == 'Probe Schedule':
                continue
            # Define working Excel tab as DataFrame
            dfSheet = xlsDict[xls][sheet]
            for col in dfSheet.columns:
                if col == 'Target' or col == 'Word':
                    continue
                # Remove items from removeList (treated as regex patterns,
                # as in the original; note flags must be passed by keyword)
                for item in removeList:
                    dfSheet[col] = dfSheet[col].str.replace(
                        item, '', flags=re.UNICODE, regex=True)
                if segmentType == 'phones':
                    dfSheetIPA = dfSheet[col].str.findall(
                        r'\S+', re.UNICODE)
                if segmentType == 'compounds':
                    dfSheetIPA = dfSheet[col].map(
                        lambda x: combiningStrip(str(x)))
                    dfSheetIPA = dfSheetIPA.str.findall(
                        r'\S{2,}', re.UNICODE)
                if segmentType == 'full_compounds':
                    dfSheetIPA = dfSheet[col].str.findall(
                        r'(?<!̂)\S{2,}', re.UNICODE)
                if segmentType == 'characters':
                    dfSheetIPA = dfSheet[col].str.findall(
                        r'\S', re.UNICODE)
                for item in dfSheetIPA:
                    if type(item) == str:
                        result.add(item)
                    if type(item) == list:
                        for i in item:
                            result.add(i)
        print(f'{xls} searched')
    # Sort list by length
    result = sorted(list(result), key=len)
    # Save result to csv in 'info' directory
    try:
        os.makedirs('info')
        print('Created:', os.path.join(os.getcwd(), 'info'))
    except FileExistsError:
        pass
    with open(os.path.join('info', f'{segmentType}.csv'), 'wb') as csvOutput:
        writer = csv.writer(csvOutput, encoding='utf-8')
        for e in result:
            writer.writerow([e])
    print(f"'{segmentType}.csv' created in 'info' directory.")
    return result
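
# Example: write every unique whitespace-delimited phone to 'info/phones.csv'
# and keep the sorted list for inspection:
#
#     phones = extractSegments('phones')
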
def multProdsCount(csvDir='csv'):
    """
    Searches a directory of csv files and sums the number of "multiple
    productions".

    Parameters:
        csvDir : str indicating csv directory to search. Default 'csv'
    Returns multiple productions count as float and prints to console.
    """
    # Row format assumed: six comma-separated fields followed by a decimal
    # count such as ',2.0,'
    pattern = re.compile(r',.*,.*,.*,.*,.*,(\d\.\d),')
    mpCount = 0
    with enter_dir(csvDir):
        # Create list of csv files in directory
        csv_files = os.listdir(os.getcwd())
        # Loop through files in directory
        print('Searching all csv files in directory...')
        for cur_csv in csv_files:
            # Open csv file in read mode with UTF-8 encoding
            with io.open(cur_csv, mode='r', encoding='utf-8') as current_csv:
                # Create string variable from csv
                csv_str = current_csv.read()
                result = re.findall(pattern, csv_str)
                for numStr in result:
                    mpCount += float(numStr)
    print(mpCount)
    return mpCount
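
# Example (assumes a 'csv' directory of translated probe files whose seventh
# field is the NumProductions value):
#
#     total = multProdsCount('csv')   # prints and returns the summed count
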
def extractMultProds(csvDir='csv'):
    """
    Searches a directory of csv files for rows containing a "multiple
    productions" value and copies the matching rows to a csv file in the
    'info' directory.

    Parameters:
        csvDir : str indicating csv directory to search. Default 'csv'
    Returns list of matching rows.
    """
    pattern = re.compile(r'(.*,.*,.*,.*,.*,.*,)(\d\.\d)(,.*)')
    matchRows = []
    with enter_dir(csvDir):
        # Create list of csv files in directory
        csv_files = os.listdir(os.getcwd())
        # Loop through files in directory
        print('Searching all csv files in directory...')
        for cur_csv in csv_files:
            # Open csv file in read mode with UTF-8 encoding
            with io.open(cur_csv, mode='r', encoding='utf-8') as current_csv:
                # Create string variable from csv
                csv_str = current_csv.read()
                result = re.findall(pattern, csv_str)
                for match in result:
                    # Rejoin the captured groups into the full row
                    matchRow = ''.join(match)
                    matchRows.append(matchRow)
    with enter_dir('info'):
        with io.open(f'{csvDir}_mult_prod_matches.csv', 'wb') as f:
            writer = csv.writer(f)
            for row in matchRows:
                writer.writerow([row])
    return matchRows
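
# Example: copy every multiple-production row from './csv' into
# 'info/csv_mult_prod_matches.csv' for manual review:
#
#     rows = extractMultProds('csv')
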
def postProcessingReplacements(csvDir='csv'):
    """
    Applies the row replacements listed in 'dicts/replacements_table.csv' to
    every csv file in csvDir. Returns a Counter of original rows replaced.
    """
    # Read replacements table
    with open('dicts/replacements_table.csv', mode='r', encoding='utf-8') as f:
        lines = f.readlines()
    # Remove trailing whitespace and commas from rows
    lines = [i.strip().strip(',') for i in lines]
    # Fix NumProductions numbers
    ### If replacements are added with other values for NumProductions,
    ### repeat this step for each
    lines = [i.replace(",2,", ",2.0,") for i in lines]
    # Remove empty rows
    lines = [i for i in lines if i]
    # Find row index for original and replacement rows
    origIndex = lines.index('Original rows go here:')
    replIndex = lines.index('Replacement rows go here:')
    originals = lines[origIndex + 2:replIndex]
    replacements = lines[replIndex + 2:]
    assert len(originals) == len(replacements), \
        "ERROR: Different number of originals and replacements"
    replList = [(originals[i], replacements[i]) for i in range(len(originals))]
    print("****************************************************************")
    print("Post-processing...")
    print(f"Replacing {len(originals)} lines in csv files...")
    counter = Counter()
    with enter_dir(csvDir):
        for fName in os.listdir(os.getcwd()):
            if fName.endswith('.csv'):
                with open(fName, mode='r', encoding='utf-8') as curCSV:
                    csvStr = curCSV.read()
                revCSVStr = csvStr
                for repl in replList:
                    if repl[0] in csvStr:
                        counter.update([repl[0]])
                        revCSVStr = revCSVStr.replace(repl[0], repl[1])
                # Rewrite the file only if a replacement was made
                if revCSVStr != csvStr:
                    with open(fName, mode='w', encoding='utf-8') as curCSV:
                        curCSV.write(revCSVStr)
    # Check that all replacements were made. Print warning to console.
    for line in originals:
        if line not in counter:
            print("\t*********************************************")
            print("\tWARNING: Not all replacements were made.")
            print("\tCheck replacements_table.csv for accuracy.")
            print("\tLine not replaced:")
            print("\t" + line)
    print("****************************************************************")
    print("Post-processing complete!")
    return counter

### Testing
#replCounter = postProcessingReplacements(csvDir = 'csv')
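# A possible end-to-end sequence (commented out; assumes the 'excel'
# directory of DPA workbooks described above, plus a translated 'csv'
# directory for the count and replacement steps):
# genRawCSV()
# dfNotes = participantTranscriptConv()
# phones = extractSegments('phones')
# mpCount = multProdsCount(csvDir='csv')
# replCounter = postProcessingReplacements(csvDir='csv')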