Ejemplo n.º 1
0
    import unicodedata
    newname = unicodedata.normalize('NFKD',
                                    weirdname).encode('ASCII',
                                                      'ignore').decode()
    return newname


# Derive an ASCII-only usable first name for every athlete record and store it
# on the record itself. Iterating the records directly avoids the
# range(len(...)) index bookkeeping.
for athlete in olydata:
    raw_name = extract_usable_name(athlete['Athlete'])
    athlete['usable_name'] = no_weird_characters(raw_name)

#namecaller={}
#for x in range(len(olydata)):
#	call=olydata[x]['usable_name']
#	namecaller[call]=olydata[x]
from gender import detect_gender

# Attach the detector's verdict ('ratio' and 'gender') to every athlete record.
# The records are mutated in place, so direct iteration is sufficient.
for athlete in olydata:
    result = detect_gender(athlete['usable_name'])
    athlete['ratio'] = result['ratio']
    athlete['gender'] = result['gender']

# Persist the classified athlete records as indented JSON.
olympicpath = join(datadirect, 'olympic_athletes_classified.json')
# The context manager guarantees the file is flushed and closed, even if
# json.dump raises part-way through.
with open(olympicpath, 'w') as writejson:
    json.dump(olydata, writejson, indent=2)
print("Writing to olympic_athletes_classified.json file.")
Ejemplo n.º 2
0
from csv import DictReader, DictWriter

# Column order of the classified CSV (handed to DictWriter as `fieldnames`).
CLASSIFIED_DATA_HEADERS = [
    'firstname',
    'lastname',
    'born_year',
    'died_year',
    'categories',
    'years',
    'gender',
    'usable_name',
    'ratio',
]

def extract_usable_name(namestr):
    """Return ``namestr`` unchanged, or ``''`` if it contains a period.

    Any string with a '.' anywhere in it is rejected wholesale; everything
    else (including the empty string) is passed through as-is.
    """
    return '' if '.' in namestr else namestr

# Classify every wrangled row and stream it into the classified CSV.
# Both files live in one `with` so the output is flushed and closed even if a
# row fails; newline='' is what the csv module expects for files it writes
# (otherwise rows end in \r\r\n on Windows).
with open(CLASSIFIED_DATA_FILENAME, 'w', newline='') as w, \
        open(WRANGLED_DATA_FILENAME) as r:
    # Set up the new data file
    dw = DictWriter(w, fieldnames=CLASSIFIED_DATA_HEADERS)
    dw.writeheader()

    # Load the non-gender-classified rows
    datarows = list(DictReader(r))
    ct = 0  # stays 0 when there are no rows
    for ct, row in enumerate(datarows, start=1):
        usablename = extract_usable_name(row['firstname'])
        print("Row:", ct, "extracting --", usablename, "-- from:", row['firstname'])
        gender_result = detect_gender(usablename)
        # now add usable_name and gender data to each row
        row['usable_name'] = usablename
        row['gender'] = gender_result['gender']
        row['ratio'] = gender_result['ratio']
        dw.writerow(row)
Ejemplo n.º 3
0
	datarows = list(DictReader(f))



# Open the classified CSV for writing and emit its header row.
# newline='' lets the csv module control line endings itself (avoids blank
# rows on Windows). The handle stays open for the row-writing loop below.
wf = open(CLASSIFIED_DATA_FILENAME, 'w', newline='')
cwf = DictWriter(wf, fieldnames=CLASSIFIED_HEADERS)
cwf.writeheader()


def extract_usable_name(name_string):
	"""Return the first given name from a 'Last, First Middle' style string.

	The text after the last ', ' is taken as the given-names part, and its
	first space-separated token is returned. A string without ', ' is treated
	entirely as given names; an empty string yields ''.
	"""
	given_names = name_string.split(', ')[-1]
	# Only the first token is wanted; the original also computed the last
	# token into an unused local, which has been dropped.
	return given_names.split(' ')[0]

# Classify each row by its cited name and append it to the output CSV.
linecount = 0  # remains 0 if datarows is empty
for linecount, row in enumerate(datarows, start=1):
	the_name = row['cited_name']
	usable_name = extract_usable_name(the_name)
	print(linecount, " -- Extracted", usable_name, "from", the_name)

	genderdict = detect_gender(usable_name)
	# Fill in the three derived columns, then write the row out.
	row.update({
		'gender': genderdict['gender'],
		'ratio': genderdict['ratio'],
		'usable_name': usable_name,
	})
	cwf.writerow(row)
Ejemplo n.º 4
0
# Input (wrangled) and output (classified) CSV locations, both under DATA_DIR.
DATA_DIR = 'tempdata'
WRANGLED_DATA_FILENAME = join(DATA_DIR, 'wrangled_data.csv')
# NOTE: despite the name, this list already contains the derived columns
# ('Gender', 'Ratio', 'Usable Name') and is used as the fieldnames of the
# classified CSV writer further down.
WRANGLED_HEADERS = ['Last Name', 'Firstish Name', 'Position Title', 'Department' , 'Employee Annual Salary', 'Gender', 'Ratio', 'Usable Name']
CLASSIFIED_DATA_FILENAME = join(DATA_DIR, 'classified_data.csv')

# Load every wrangled row into memory as a dict keyed by the CSV header.
# The handle is closed explicitly after the classification loop.
wrangledFile = open(WRANGLED_DATA_FILENAME, 'r')
datarows = list(DictReader(wrangledFile))

def extract_usable_name(firstishName):
    """Return the text of ``firstishName`` that precedes its first space.

    A string without spaces is returned whole; a string starting with a
    space (or an empty string) yields ''.
    """
    head, _sep, _tail = firstishName.partition(' ')
    return head

# Classify each employee row in place and collect the rows in dicList.
# The salary string is converted to a float along the way.
dicList = []
for employee in datarows:
    usable_name = extract_usable_name(employee['Firstish Name'])
    result = detect_gender(usable_name)
    employee.update({
        'Gender': result['gender'],
        'Ratio': result['ratio'],
        'Usable Name': usable_name,
    })
    salary_text = employee['Employee Annual Salary']
    employee['Employee Annual Salary'] = float(salary_text)
    dicList.append(employee)

# All rows are already in memory, so the input handle can be released.
wrangledFile.close()


# Open the classified CSV for writing. newline='' leaves line endings to the
# csv module (prevents blank rows on Windows).
wfile = open(CLASSIFIED_DATA_FILENAME, 'w', newline='')
# turn it into a DictWriter object, and tell it what the fieldnames are
wcsv = DictWriter(wfile, fieldnames=WRANGLED_HEADERS)
# write the headers row
wcsv.writeheader()
Ejemplo n.º 5
0
from gender import detect_gender
# Locations of the wrangled input CSV (read below) and of the classified
# output CSV (written below).
DATA_DIR = 'tempdata'
# NOTE(review): repeats the 'tempdata' literal instead of deriving it from
# DATA_DIR; keep the two in sync if the folder is ever renamed.
WRANGLED_DIR = 'tempdata/wrangled'
WRANGLED_DATA_PATH = join(WRANGLED_DIR, 'wranglednames.csv')
CLASSIFIED_DATA_FILENAME = join(DATA_DIR, 'classified_data.csv')


def extractable_usable_name(name):
    """Return the part of ``name`` before its first space.

    Names without a space come back unchanged; '' stays ''.
    """
    first_token, _, _ = name.partition(' ')
    return first_token


# Column order of the classified CSV.
classified_headers = [
    'year', 'name', 'description', 'usable_name', 'gender', 'ratio'
]

# Read the wrangled rows, classify each by first name, and write them out.
# A single `with` closes both files even if a row fails; newline='' leaves
# line endings to the csv module (prevents blank rows on Windows).
with open(CLASSIFIED_DATA_FILENAME, 'w', newline='') as w, \
        open(WRANGLED_DATA_PATH) as r:
    dw = DictWriter(w, fieldnames=classified_headers)
    dw.writeheader()

    datarows = list(DictReader(r))
    ct = 0  # stays 0 when there are no rows
    for ct, row in enumerate(datarows, start=1):
        usable_name = extractable_usable_name(row['name'])
        print("Row:", ct, "extracting --", usable_name, "--from", row['name'])
        gender_result = detect_gender(usable_name)
        row['usable_name'] = usable_name
        row['gender'] = gender_result['gender']
        row['ratio'] = gender_result['ratio']
        dw.writerow(row)
Ejemplo n.º 6
0
		result+=name[n]
	return result.strip(" ")

#Reads the wrangled CSV and appends one classified dict per data row to all_names.
#The `with` block closes the input file once every row has been consumed.
with open(DATA_PATH, 'r', encoding="latin1") as wrangled_file:
	my_reader = csv.reader(wrangled_file, skipinitialspace=True)
	next(my_reader, None) #Skips headers
	for line in my_reader:
		year = line[0]
		category = line[1]
		name = line[2]
		country = line[3]
		field = line[4]
		motivation = line[5]
		usable_name = extract_usable_name(name)
		#Classify once per row and reuse the result for both fields
		gender_result = detect_gender(usable_name)
		gender = gender_result['gender']
		ratio = gender_result['ratio']
		namesdict = {'year': year, 'category': category, 'name': name,
		'country': country, 'field': field, 'motivation': motivation,
		'gender': gender, 'ratio': ratio, 'usable_name': usable_name}
		all_names.append(namesdict)


#Creates new file
# Column order of the classified CSV; matches the keys of each namesdict.
HEADERS = ['year', 'category', 'name', 'country', 'field', 'motivation', 'gender', 'ratio', 'usable_name']
# newline='' leaves line endings to the csv module (prevents blank rows on Windows).
wfile = open(CLASSIFIED_DATA_PATH, 'w', newline='')
wcsv = csv.DictWriter(wfile, fieldnames=HEADERS)
# write the headers row
wcsv.writeheader()

for entry in all_names:
Ejemplo n.º 7
0
from os.path import exists, join
from gender import extract_usable_name, detect_gender
import csv

# Input (wrangled) and output (classified) CSV locations.
DATA_DIR = 'tempdata'
SJ_DATA_FILENAME = join(DATA_DIR, 'wrangled_data.csv')
C_FILENAME = join(DATA_DIR, 'classified_data.csv')
# Column order of the classified CSV.
C_HEADERS = [
    'name', 'gender', 'ratio', 'females', 'males', 'total', 'compensation'
]

# Load all rows up front; the context manager closes the handle that the
# original inline open() call left dangling. newline='' is the csv module's
# recommended mode for reading as well.
with open(SJ_DATA_FILENAME, newline='') as rfile:
    sj_data = list(csv.DictReader(rfile))

# Build one output record per employee: the dict returned by detect_gender,
# extended with the employee's total compensation as a float.
classified_data = []

for person in sj_data:
    result = detect_gender(extract_usable_name(person['Employee Name']))
    pay_text = person['Total Pay & Benefits']
    result['compensation'] = float(pay_text)
    classified_data.append(result)

# Write the header and all classified records in one go. The context manager
# closes the file even if a row is rejected by DictWriter; newline='' leaves
# line endings to the csv module (prevents blank rows on Windows).
with open(C_FILENAME, 'w', newline='') as wfile:
    wcsv = csv.DictWriter(wfile, fieldnames=C_HEADERS)
    wcsv.writeheader()
    wcsv.writerows(classified_data)
Ejemplo n.º 8
0
from os.path import exists, join
from gender import extract_usable_name, detect_gender
import csv


# Input (wrangled) and output (classified) CSV locations.
DATA_DIR = 'tempdata'
SJ_DATA_FILENAME = join(DATA_DIR, 'wrangled_data.csv')
C_FILENAME = join(DATA_DIR, 'classified_data.csv')
# Column order of the classified CSV.
C_HEADERS = ['name', 'gender', 'ratio', 'females', 'males', 'total', 'compensation']

# Read every row into memory, then release the file handle immediately
# (the original inline open() was never closed). newline='' is the csv
# module's recommended mode for reading as well.
with open(SJ_DATA_FILENAME, newline='') as rfile:
    sj_data = list(csv.DictReader(rfile))

# One record per employee: detect_gender's result dict plus a float
# 'compensation' taken from the 'Total Pay & Benefits' column.
classified_data = []

for person in sj_data:
    employee_name = person['Employee Name']
    result = detect_gender(extract_usable_name(employee_name))
    result['compensation'] = float(person['Total Pay & Benefits'])
    classified_data.append(result)
    
    
# Emit the header followed by every classified record. `with` guarantees the
# file is closed even on error; newline='' leaves line endings to the csv
# module (prevents blank rows on Windows).
with open(C_FILENAME, 'w', newline='') as wfile:
    wcsv = csv.DictWriter(wfile, fieldnames=C_HEADERS)
    wcsv.writeheader()
    wcsv.writerows(classified_data)

Ejemplo n.º 9
0
# Build the path of the wrangled input CSV from the configured folders.
filename = 'wrangled_data.csv'
foldername = os.path.join(directory, subdirectory)
pathname = os.path.join(foldername, filename)

# Output column names; they match the keys of the per-legislator dicts built
# in the read loop below.
wrangledHeaders = ['title', 'first_name', 'last_name', 'party', 'age', 'gender', 'ratio', 'usable_name']

# Parse the wrangled CSV by hand and build one classified dict per legislator.
# lineCount counts every line read (header included); the header line itself
# is skipped. The `with` block closes the input file when the loop finishes.
# NOTE(review): the plain split(',') does not understand quoted fields that
# contain commas; csv.reader would.
lineCount = 0
legislatorList = []
with open(pathname, 'r') as openFile:
    print('Reading data from', pathname)
    for line in openFile:
        if lineCount > 0:
            legislatorInfo = line.strip().split(',')
            firstname = extract_usable_name(legislatorInfo[1])
            genderInfo = detect_gender(firstname)
            infoDict = {
                'title': legislatorInfo[0],
                'first_name': legislatorInfo[1],
                'last_name': legislatorInfo[2],
                'party': legislatorInfo[3],
                'age': legislatorInfo[4],
                'gender': genderInfo['gender'],
                'ratio': genderInfo['ratio'],
                'usable_name': firstname,
            }
            legislatorList.append(infoDict)
        lineCount += 1

# The classified output is written into the same folder as the wrangled input.
newFileName = 'classified_data.csv'
newFilePath = os.path.join(foldername, newFileName)
# NOTE(review): opened without a context manager, so whoever writes to
# csvFile must close it. If it is handed to csv.writer it should be opened
# with newline='' — TODO confirm against the code that follows.
csvFile = open(newFilePath, 'w')
Ejemplo n.º 10
0
	import unicodedata
	newname=unicodedata.normalize('NFKD', weirdname).encode('ASCII', 'ignore').decode()
	return newname

# Store an ASCII-only usable first name on every athlete record.
# Direct iteration replaces the range(len(...)) index loop.
for athlete in olydata:
	raw_name = extract_usable_name(athlete['Athlete'])
	athlete['usable_name'] = no_weird_characters(raw_name)



#namecaller={}
#for x in range(len(olydata)):
#	call=olydata[x]['usable_name']
#	namecaller[call]=olydata[x]
from gender import detect_gender

# Record the detector's 'ratio' and 'gender' on every athlete record in place.
for athlete in olydata:
	result = detect_gender(athlete['usable_name'])
	athlete['ratio'] = result['ratio']
	athlete['gender'] = result['gender']

# Save the classified athlete records as indented JSON.
olympicpath = join(datadirect, 'olympic_athletes_classified.json')
# `with` flushes and closes the file even if json.dump raises.
with open(olympicpath, 'w') as writejson:
	json.dump(olydata, writejson, indent=2)
print("Writing to olympic_athletes_classified.json file.")

Ejemplo n.º 11
0
# Read "name,term,university" lines and write them back out with a detected
# gender appended as a fourth column.
thefile = join(folder, 'presidents.txt')

with open(thefile, 'r') as infile:
    pres_dict = {}

    # newline='' leaves line endings to the csv module (prevents blank rows
    # on Windows).
    with open('data/classified.csv', 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['name', 'term', 'university', 'gender'])

        #read through each line
        for line in infile:
            # Split once and reuse the result. Exactly three fields are
            # expected, otherwise the unpacking raises ValueError.
            temp = line.strip().split(',')
            pres, term, univ = temp
            univ = univ.strip()

            first_name = pres.split(' ')[0]
            res = detect_gender(first_name)
            temp.append(res['gender'])
            writer.writerow(temp)

# # This part is just to check if we have got all the names
# # in the file.

# # count = 0
# # for school in pres_dict:
# # 	print(school, 'has had', len(pres_dict[school]), 'presidents')
# # 	print('\n')
# # 	count += 1
# # print('Looked at', count, 'schools')

# schools_analyzed = {}
Ejemplo n.º 12
0
    full_filename = join(DATA_DIR, fname)

    joe = open(full_filename, 'r')
    First_rows = list(DictReader(joe))
    joe.close()

    classified_headers = list(First_rows[0]) + ['gender', 'ratio', 'usable_name']
    classified_filename = join(CLASSIFIED_DIR, fname)
    print("About to classify", len(First_rows), 'rows into the file:', classified_filename)

    outfile = open(classified_filename, 'w')
    output_csv = DictWriter(outfile, fieldnames=classified_headers)
    output_csv.writeheader()
    xc = 0
    for row in First_rows:
        xc += 1
        first_name = row['First']
        print("On row", xc, first_name)
        if "N/A" in first_name:
            pass
        else:
            usablename = extract_usable_name(first_name)
            get = detect_gender(usablename)
            row['gender'] = get['gender']
            row['ratio'] = get['ratio']
            row['usable_name'] = usablename
            # 
            output_csv.writerow(row)
    outfile.close()
#Dan: I used California Colleges as a template. Thank you for sharing it. 
Ejemplo n.º 13
0
# Output columns: every input column plus the three derived ones.
classified_headers = list(salary_rows[0]) + ['gender', 'ratio', 'usable_name']

classified_filename = join(DATA_DIR, fname)
print("About to classify", len(salary_rows), 'rows into the file:',
      classified_filename)

# `with` closes the output even if a row fails part-way through; newline=''
# leaves line endings to the csv module (prevents blank rows on Windows).
with open(classified_filename, 'w', newline='') as outfile:
    output_csv = DictWriter(outfile, fieldnames=classified_headers)
    output_csv.writeheader()

    xc = 0  # stays 0 when there are no rows
    for xc, row in enumerate(salary_rows, start=1):
        first_name = row['First Name']
        print("On row", xc, first_name)
        # skip rows whose row['First Name'] contains "Not provided"
        if "Not provided" in first_name:
            continue
        usablename = extract_usable_name(first_name)
        xresult = detect_gender(usablename)
        row['gender'] = xresult['gender']
        row['ratio'] = xresult['ratio']
        row['usable_name'] = usablename
        # Drop the first character of the salary text
        # (presumably a currency symbol — TODO confirm against the data).
        row['Base Salary'] = row['Base Salary'][1:]
        # write to the csv file
        output_csv.writerow(row)
Ejemplo n.º 14
0
    'usable_name', 'detected_gender', 'ratio'
]


def extract_usable_name(namestr):
    """Return the first space-separated token free of '[' and '.'.

    Tokens containing a bracket or a period are passed over; if no token
    qualifies, the empty string is returned.
    """
    candidates = (
        part for part in namestr.split(' ')
        if '[' not in part and '.' not in part
    )
    return next(candidates, "")


# Classify every wrangled row by first name and stream it to the output CSV.
# Managing both files in one `with` closes the output (never closed before)
# even if a row fails.
with open(classifyfilename, 'w', newline='') as f, open(wranglefilename) as r:
    fwrite = csv.DictWriter(f, fieldnames=classifyheaders)
    fwrite.writeheader()

    datarows = list(csv.DictReader(r))
    ct = 0  # stays 0 when there are no rows
    for ct, row in enumerate(datarows, start=1):
        usablename = extract_usable_name(row['firstname'])
        print("Row:", ct, "extracting --", usablename, "-- from:",
              row['firstname'])
        genderresults = detect_gender(usablename)
        row['usable_name'] = usablename
        row['detected_gender'] = genderresults['gender']
        row['ratio'] = genderresults['ratio']
        fwrite.writerow(row)
Ejemplo n.º 15
0
    return result.strip(" ")


#Reads the wrangled CSV and appends one classified dict per data row to all_names.
#The `with` block closes the input file once every row has been consumed.
with open(DATA_PATH, 'r', encoding="latin1") as wrangled_file:
    my_reader = csv.reader(wrangled_file, skipinitialspace=True)
    next(my_reader, None)  #Skips headers
    for line in my_reader:
        year = line[0]
        category = line[1]
        name = line[2]
        country = line[3]
        field = line[4]
        motivation = line[5]
        usable_name = extract_usable_name(name)
        #Classify once per row and reuse the result for both fields
        gender_result = detect_gender(usable_name)
        gender = gender_result['gender']
        ratio = gender_result['ratio']
        namesdict = {
            'year': year,
            'category': category,
            'name': name,
            'country': country,
            'field': field,
            'motivation': motivation,
            'gender': gender,
            'ratio': ratio,
            'usable_name': usable_name
        }
        all_names.append(namesdict)

#Creates new file
Ejemplo n.º 16
0
    "detected_gender",
    "ratio",
]


def extract_usable_name(namestr):
    """Pick the first token of ``namestr`` that has neither "[" nor ".".

    The string is split on single spaces; tokens containing a bracket or a
    period are skipped. Returns "" when nothing qualifies.
    """
    usable = [
        token
        for token in namestr.split(" ")
        if not ("[" in token or "." in token)
    ]
    return usable[0] if usable else ""


# Classify every wrangled row by first name and stream it to the output CSV.
# Both files share one `with`, so the output handle (previously never closed)
# is released even if a row fails.
with open(classifyfilename, "w", newline="") as f, open(wranglefilename) as r:
    fwrite = csv.DictWriter(f, fieldnames=classifyheaders)
    fwrite.writeheader()

    datarows = list(csv.DictReader(r))
    ct = 0  # stays 0 when there are no rows
    for ct, row in enumerate(datarows, start=1):
        usablename = extract_usable_name(row["firstname"])
        print("Row:", ct, "extracting --", usablename, "-- from:", row["firstname"])
        genderresults = detect_gender(usablename)
        row["usable_name"] = usablename
        row["detected_gender"] = genderresults["gender"]
        row["ratio"] = genderresults["ratio"]
        fwrite.writerow(row)