def inheritancePossible():
    """
    Checks whether inheritance could plausibly account for the age of the
    oldest sample, given the saturation constraint of its nuclide.

    The oldest sample's excess age is measured twice: first against the
    youngest age in the data set, then against the mean age. Each excess is
    compared with an approximate saturation age, computed here as three
    divided by the nuclide's decay constant.

    Returns a SimResult:
      * stable nuclide                  -> Applic.ct / Validity.accept
      * excess over youngest < limit    -> Applic.ct / Validity.sound
      * excess over mean < limit        -> Applic.dt / Validity.sound
      * otherwise                       -> Applic.cf / Validity.sound
    """
    name = 'amount of inheritance required to explain spread is possible'

    def verdict(applic, validity, summary, detail):
        # Every outcome shares the same name and confidence construction.
        return SimResult(confidence.Confidence(applic, validity),
                         name, summary, detail)

    eldest = calculations.calcMaxSample(__getAge())
    nuclide = eldest['nuclide']

    # A stable nuclide has no decay entry to derive a saturation age from,
    # so it is handled before the division below.
    if Nuclide.stable(nuclide):
        return verdict(confidence.Applic.ct, confidence.Validity.accept,
                       'nuclide is stable',
                       'nuclide is ' + str(nuclide))

    saturation = 3.0 / Nuclide.decay[nuclide]

    # Excess of the oldest sample over the youngest one.
    overYoungest = eldest[__getAge()] - calculations.calcMin(__getAge())
    if overYoungest < saturation:
        return verdict(confidence.Applic.ct, confidence.Validity.sound,
                       'total spread of ' + str(overYoungest) +
                       ' is less than saturation age of about ' +
                       str(saturation),
                       'saturation: ' + str(saturation) +
                       ', spread: ' + str(overYoungest))

    # Excess of the oldest sample over the mean age.
    overMean = eldest[__getAge()] - calculations.calcMean(__getAge())
    if overMean < saturation:
        return verdict(confidence.Applic.dt, confidence.Validity.sound,
                       'total amount > mean age of ' + str(overMean) +
                       ' is less than saturation age of about ' +
                       str(saturation),
                       'saturation: ' + str(saturation) +
                       ', spread: ' + str(overMean))

    return verdict(confidence.Applic.cf, confidence.Validity.sound,
                   'spread in ages is impossible via inheritance',
                   'saturation (about): ' + str(saturation) +
                   ', spread: ' + str(overMean))
def centralAgreement(sample):
    """
    Checks whether we can remove samples at the edges to get a reasonably
    large group with a single central tendency.

    Samples are sorted by age and the one lying furthest from the current
    mean (always the youngest or the oldest) is dropped repeatedly until
    engine.buildArgument reports a strong argument for "no process" or no
    samples remain.

    Parameters:
      sample -- the sample being evaluated; if it is still in the trimmed
                list when the loop stops, it did not need to be removed.

    Returns a SimResult:
      * the negated confidence of the initial argument when "no process" is
        already strongly supported before removing anything;
      * Applic.cf / Validity.sound when `sample` survives the trimming;
      * otherwise a confidence derived from the fraction of samples kept.

    Side effects: samples.sampleList is temporarily sorted and truncated in
    place so that the engine sees the reduced data set. It is rebound to a
    copy of the original list on every exit path, including when an
    exception is raised during the trimming.

    Notes kept from the original author:
      * should pay attention to whether removed samples have similar ages
        to each other or not;
      * this should be run once and the results saved, instead of being run
        again for every single sample.

    NOTE(review): an empty samples.sampleList still ends in a
    ZeroDivisionError when the kept fraction is computed -- confirm what the
    intended result is for that case.
    """
    name = "removal of youngest and oldest samples allows argument for 'no process'"

    # First check that we need to remove samples at all.
    arg = engine.buildArgument(conclusions.Conclusion("no process"))
    if arg.getSingleConfidence().isStrongly(True):
        return SimResult(-arg.getSingleConfidence(), name,
                         "good argument for 'no process' before removing any samples",
                         arg)

    savedSamples = samples.sampleList[:]
    try:
        # key= is the portable spelling of the old cmp= comparison and
        # yields the same ascending-by-age order.
        samples.sampleList.sort(key=lambda entry: entry[__getAge()])

        while len(samples.sampleList) > 0:
            mean = calculations.calcMean(__getAge())
            younger = mean - samples.sampleList[0][__getAge()]
            older = samples.sampleList[-1][__getAge()] - mean

            # Drop whichever extreme sits further from the mean.
            if younger > older:
                del samples.sampleList[0]
            else:
                del samples.sampleList[-1]

            arg = engine.buildArgument(conclusions.Conclusion("no process"))
            if arg.getSingleConfidence().isStrongly(True):
                break

        if sample in samples.sampleList:
            return SimResult(
                confidence.Confidence(confidence.Applic.cf,
                                      confidence.Validity.sound),
                name,
                str(sample) + " does not need to be removed to have good argument for 'no process'",
                arg)

        # Fraction of the original samples that had to be kept.
        reduction = len(samples.sampleList) / float(len(savedSamples))
    finally:
        # Always restore the shared list, even if the engine or the
        # calculations raised part-way through the trimming.
        samples.sampleList = savedSamples

    conf = __getConfidence((.2, .35, .6, .8, .9), reduction,
                           confidence.Validity.prob)

    return SimResult(conf, name,
                     "removed " + str(100 - (reduction * 100)) +
                     "% of samples before finding a good argument for 'no process'",
                     arg)
# Demographic labels searched for in the CSV header, in lookup order.
_DEMOGRAPHIC_LABELS = (
    'ethnicity', 'gender', 'religion', 'income',
    'age', 'education', 'location', 'sex',
)

# Labels whose cells are parsed as numbers; every other label is tallied
# as a string.
_NUMERIC_LABELS = ('age', 'sex', 'income')

# Failures that mean "this file could not be read", as opposed to a
# programming error that should surface.
_CSV_READ_ERRORS = (IOError, OSError, UnicodeDecodeError, csv.Error)


def _load_csv(file_path):
    """Return ``(header, data_rows)`` read from the CSV file at *file_path*."""
    with open(file_path, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        # An empty file has no header row; treat it as having no columns.
        header = next(csvreader, [])
        return header, list(csvreader)


def _tally(counts, value):
    """Increment the occurrence count of *value* in the dict *counts*."""
    counts[value] = counts.get(value, 0) + 1


def _numeric_summary(values, title):
    """Return the count / variance / mean / histogram entries for *values*."""
    variance = calculations.calcVariance(values)
    mean = calculations.calcMean(values)
    histogram = calculations.calcHistogram(values, title)
    return [
        "Count: " + str(len(values)),
        "Variance: " + str(variance),
        "Mean: " + str(mean),
        histogram,
    ]


def readCSV(path, filename):
    """
    Summarise the demographic columns of a CSV file.

    The file is first looked for at ``path + filename``. If that cannot be
    read, ``path`` is walked recursively and every readable file called
    ``filename`` found on the way contributes its data rows. In that case
    the header of the last file read is the one used to locate columns.

    A column is selected when its header contains one of the demographic
    labels (ethnicity, gender, religion, income, age, education, location,
    sex) as a substring. Age, sex and income cells are parsed as floats;
    cells that are not numeric, such as blanks, are skipped. The remaining
    columns are tallied per distinct string value. Location values
    containing a "." are ignored.

    Parameters:
      path     -- directory prefix; concatenated directly with `filename`
                  for the first attempt, so it normally ends in a separator.
      filename -- name of the CSV file.

    Returns:
      dict -- may contain the keys "Age", "Sex" and "Income" (lists of
      summary strings plus whatever calculations.calcHistogram returns,
      present only when more than one value was parsed), "Ethnicity"
      (value -> count) and "Gender" (value -> count, plus two advisory
      entries when exactly two distinct genders occur).

    Side effects: prints the matched column indices and labels.

    NOTE(review): location, religion and education are tallied but, as in
    the original implementation, not included in the returned dict.
    """
    # --- load -------------------------------------------------------------
    fields = []
    rows = []
    try:
        fields, rows = _load_csv(path + filename)
    except _CSV_READ_ERRORS:
        # Fall back to searching `path` and all of its sub-folders.
        for dirpath, _dirnames, _filenames in os.walk(path):
            try:
                found_fields, found_rows = _load_csv(
                    os.path.join(dirpath, filename))
            except _CSV_READ_ERRORS:
                continue
            fields = found_fields
            rows.extend(found_rows)

    # --- locate demographic columns ----------------------------------------
    # Each entry pairs a column index with the label it matched, so the two
    # can never get out of step.
    # NOTE(review): matching is case-sensitive, so a header such as "Age"
    # is not picked up -- confirm whether that is intended.
    columns = []
    for index, field in enumerate(fields):
        for label in _DEMOGRAPHIC_LABELS:
            if label in field:
                columns.append((index, label))

    print([index for index, _ in columns])
    print([label for _, label in columns])

    # --- collect values -----------------------------------------------------
    numeric = dict((label, []) for label in _NUMERIC_LABELS)
    tallies = dict(
        (label, {}) for label in _DEMOGRAPHIC_LABELS
        if label not in _NUMERIC_LABELS
    )

    for row in rows:
        for index, label in columns:
            if index >= len(row):
                # Short row: this column is simply missing.
                continue
            value = row[index]
            if label in numeric:
                try:
                    numeric[label].append(float(value))
                except ValueError:
                    # Blank or non-numeric cell; leave it out of the stats.
                    continue
            elif label == 'location':
                # Values containing "." are excluded from location counts.
                if "." not in value:
                    _tally(tallies[label], value)
            else:
                _tally(tallies[label], value)

    # --- build the report ---------------------------------------------------
    labelDict = {}  # dict to return to runTerminalCommands

    age = numeric['age']
    if len(age) > 1:
        labelDict["Age"] = _numeric_summary(age, "Age")

    sex = numeric['sex']
    if len(sex) > 1:
        sexCount = calculations.calcBreakDown(sex)
        sexVar = calculations.calcVariance(sex)
        sexMean = calculations.calcMean(sex)
        uniqueValues = calculations.calcUniqueValues(sex)
        sexHist = calculations.calcHistogram(sex, "Sex")
        sexSummary = [
            "Breakdown: " + sexCount,
            "Count: " + str(len(sex)),
            "Variance: " + str(sexVar),
            "Mean: " + str(sexMean),
            sexHist,
        ]
        if uniqueValues <= 2:
            # Two or fewer distinct sex values: add the advisory lines.
            sexSummary.extend([
                "Recommendations: " + "You only included " +
                str(uniqueValues) + " Sex data points.",
                "If you did not include Intersex or Transgender people, consider how this might impact your results.",
            ])
        labelDict["Sex"] = sexSummary

    income = numeric['income']
    if len(income) > 1:
        labelDict["Income"] = _numeric_summary(income, "Income")

    ethnicity = tallies['ethnicity']
    if ethnicity:
        labelDict['Ethnicity'] = ethnicity

    gender = tallies['gender']
    if gender:
        if len(gender) == 2:
            # The advisory text is stored alongside the counts, as before.
            gender.update({
                'Recommendations: ': "You only have two genders in your data.",
                '-': "Consider how not including other genders might bias your results and lead to erasure.",
            })
        labelDict['Gender'] = gender

    return labelDict