Beispiel #1
0
def inheritancePossible():
    """
    Tests whether the gap between the oldest sample and the rest of the data is small
    enough to be attributed to inheritance, given the saturation limit of the nuclide.

    The gap is measured twice: first against the minimum value returned by
    calculations.calcMin, then against the mean returned by calculations.calcMean.
    The first gap that falls below the saturation figure decides the result. If
    neither does, the returned result reports that inheritance cannot explain the
    spread. A stable nuclide short-circuits the whole check.
    """

    name = 'amount of inheritance required to explain spread is possible'
    oldest = calculations.calcMaxSample(__getAge())
    nuclide = oldest['nuclide']

    # Guard clause: the saturation figure below is derived from the decay table,
    # which is not consulted for a stable nuclide.
    if Nuclide.stable(nuclide):
        verdict = confidence.Confidence(confidence.Applic.ct, confidence.Validity.accept)
        return SimResult(verdict, name, 'nuclide is stable', 'nuclide is ' + str(nuclide))

    saturation = 3.0 / Nuclide.decay[nuclide]
    saturationText = str(saturation)

    # (reference calculation, applicability on success, opening words of the message),
    # tried in this order.
    attempts = (
        (calculations.calcMin, confidence.Applic.ct, 'total spread of '),
        (calculations.calcMean, confidence.Applic.dt, 'total amount > mean age of '),
    )

    for reference, applic, opening in attempts:
        spread = oldest[__getAge()] - reference(__getAge())
        if spread < saturation:
            spreadText = str(spread)
            return SimResult(
                confidence.Confidence(applic, confidence.Validity.sound),
                name,
                opening + spreadText + ' is less than saturation age of about ' + saturationText,
                'saturation: ' + saturationText + ', spread: ' + spreadText)

    # Neither gap fits below saturation; `spread` still holds the mean-based gap here.
    return SimResult(confidence.Confidence(confidence.Applic.cf, confidence.Validity.sound),
                     name, 'spread in ages is impossible via inheritance',
                     'saturation (about): ' + saturationText + ', spread: ' + str(spread))
Beispiel #2
0
def centralAgreement(sample):
    """
    Checks whether we can remove samples at the edges to get a reasonably large group with
    a single central tendency.

    Procedure:
      1. Build the 'no process' argument on the full sample list. If it is already
         strong, return the negated confidence straight away.
      2. Otherwise sort the global samples.sampleList by the field named by
         __getAge() and repeatedly drop whichever end (first or last element) lies
         further from the current mean, rebuilding the argument after each removal,
         until the argument becomes strong or no samples remain.
      3. If `sample` survived the trimming, report that it did not need to be removed.
         Otherwise derive a confidence from the fraction of samples that survived.

    samples.sampleList is temporarily mutated while this runs. It is restored in a
    `finally` clause, so a failure inside the engine or the calculations cannot
    leave the global list sorted and truncated.

    Should pay attention to whether removed samples have similar ages to each other or not

    Also, this should be run once and the results saved, instead of being run again for every
    single sample...

    NOTE(review): an empty sample list would reach a division by zero when computing
    the surviving fraction -- confirm callers never invoke this without samples.
    """
    name = "removal of youngest and oldest samples allows argument for 'no process'"

    # first let's check that we need to remove samples
    arg = engine.buildArgument(conclusions.Conclusion("no process"))
    if arg.getSingleConfidence().isStrongly(True):
        return SimResult(-arg.getSingleConfidence(), name,
                         "good argument for 'no process' before removing any samples",
                         arg)

    savedSamples = samples.sampleList[:]

    try:
        # key= gives the same ordering as the old cmp= comparator and also works on
        # Python 3, where cmp= and the cmp() builtin no longer exist.
        samples.sampleList.sort(key=lambda s: s[__getAge()])

        while len(samples.sampleList) > 0:
            mean = calculations.calcMean(__getAge())
            younger = mean - samples.sampleList[0][__getAge()]
            older = samples.sampleList[-1][__getAge()] - mean
            # drop the extreme that sits further from the mean (ties drop the last one)
            if younger > older:
                del samples.sampleList[0]
            else:
                del samples.sampleList[-1]
            arg = engine.buildArgument(conclusions.Conclusion("no process"))
            if arg.getSingleConfidence().isStrongly(True):
                break

        if sample in samples.sampleList:
            return SimResult(
                confidence.Confidence(confidence.Applic.cf, confidence.Validity.sound), name,
                str(sample) + " does not need to be removed to have good argument for 'no process'",
                arg)

        # fraction of the original samples still present once the trimming stopped
        reduction = len(samples.sampleList) / float(len(savedSamples))
    finally:
        # always hand the untouched copy back, whichever way we leave
        samples.sampleList = savedSamples

    # presumably maps the surviving fraction onto a confidence level via the listed
    # thresholds -- see __getConfidence for the exact rule
    conf = __getConfidence((.2, .35, .6, .8, .9), reduction, confidence.Validity.prob)

    return SimResult(conf, name, "removed " + str(100 - (reduction * 100)) +
                     "% of samples before finding a good argument for 'no process'", arg)
Beispiel #3
0
# Labels searched for (as case-sensitive substrings) in the CSV header, mapped to how
# the matching column is treated: 'int' columns are parsed as numbers, 'str' columns
# hold free-text categories.
_DEMOGRAPHIC_KINDS = {
    'ethnicity': 'str',
    'gender': 'str',
    'religion': 'str',
    'income': 'int',
    'age': 'int',
    'education': 'str',
    'location': 'str',
    'sex': 'int'
}


def _load_csv(file_path):
    """Read one CSV file; return (header, data_rows), or None if it cannot be used.

    A file that is missing, unreadable, malformed or has no header row yields None.
    """
    try:
        with open(file_path, 'r') as csvfile:
            reader = csv.reader(csvfile)
            header = next(reader, None)
            if header is None:
                return None
            return header, list(reader)
    except (OSError, IOError, csv.Error, UnicodeDecodeError):
        return None


def _parse_number(text):
    """Return `text` converted to float, or None when it is not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def readCSV(path, filename):
    """Summarise the demographic columns of a CSV file.

    The file is first looked up at ``path + filename``. If that cannot be read,
    `path` is walked recursively and every readable file called `filename` found
    underneath it contributes its data rows (the header of the last one read is used
    to locate the columns).

    Header cells are matched against the labels in _DEMOGRAPHIC_KINDS by
    case-sensitive substring test. Numeric columns (age, sex, income) are parsed
    with float(); cells that are not numeric are skipped instead of aborting the
    run. Ethnicity and gender cells are tallied per distinct value.

    Returns a dict that may contain:
      "Age" / "Income": list of "Count", "Variance", "Mean" strings plus the value
          returned by calculations.calcHistogram (only with more than one value).
      "Sex": as above with a leading "Breakdown" entry, plus a recommendation when
          calculations.calcUniqueValues reports two or fewer.
      "Ethnicity" / "Gender": dict of value -> occurrence count. A Gender dict with
          exactly two distinct values additionally receives two advisory entries.
    An empty dict is returned when no file or no matching column is found.

    NOTE: location, religion and education columns are recognised but are not part
    of the returned summary yet.
    """
    loaded = _load_csv(path + filename)
    if loaded is not None:
        fields, rows = loaded
    else:
        # if there is a folder in dataFiles, walk through that folder too
        fields, rows = [], []
        for dirpath, _dirnames, _filenames in os.walk(path):
            found = _load_csv(os.path.join(dirpath, filename))
            if found is not None:
                fields = found[0]
                rows.extend(found[1])

    # (column position, label) for every header cell that mentions a known label
    matched = []
    for position, field in enumerate(fields):
        for label in _DEMOGRAPHIC_KINDS:
            if label in field:
                matched.append((position, label))
    print([position for position, _ in matched])
    print([label for _, label in matched])

    numbers = {'age': [], 'sex': [], 'income': []}
    Ethnicity = {}
    Gender = {}
    tallies = {'ethnicity': Ethnicity, 'gender': Gender}

    for row in rows:
        # Pair each column position with its own label. Using the position as an
        # index into the list of matches picks the wrong column or raises IndexError.
        for position, label in matched:
            if position >= len(row):
                continue  # short row: this column is absent
            value = row[position]
            if _DEMOGRAPHIC_KINDS[label] == 'int':
                number = _parse_number(value)
                if number is not None:
                    numbers[label].append(number)
            elif label in tallies:
                counts = tallies[label]
                counts[value] = counts.get(value, 0) + 1

    age = numbers['age']
    sex = numbers['sex']
    income = numbers['income']

    labelDict = {}  # dict to return to runTerminalCommands
    # TODO : do analysis on string values
    if len(age) > 1:
        ageVar = calculations.calcVariance(age)
        ageMean = calculations.calcMean(age)
        ageHist = calculations.calcHistogram(age, "Age")
        labelDict["Age"] = [
            "Count: " + str(len(age)), "Variance: " + str(ageVar),
            "Mean: " + str(ageMean), ageHist
        ]
    if len(sex) > 1:
        sexCount = calculations.calcBreakDown(sex)
        sexVar = calculations.calcVariance(sex)
        sexMean = calculations.calcMean(sex)
        uniqueValues = calculations.calcUniqueValues(sex)
        sexHist = calculations.calcHistogram(sex, "Sex")
        sexSummary = [
            "Breakdown: " + sexCount, "Count: " + str(len(sex)),
            "Variance: " + str(sexVar), "Mean: " + str(sexMean),
            sexHist
        ]
        if uniqueValues <= 2:  # check if only included 2 or less sex data points
            sexSummary.append("Recommendations: " + "You only included " +
                              str(uniqueValues) + " Sex data points.")
            sexSummary.append(
                "If you did not include Intersex or Transgender people, consider how this might impact your results."
            )
        labelDict["Sex"] = sexSummary
    if len(income) > 1:
        incomeVar = calculations.calcVariance(income)
        incomeMean = calculations.calcMean(income)
        incomeHist = calculations.calcHistogram(income, "Income")
        labelDict["Income"] = [
            "Count: " + str(len(income)), "Variance: " + str(incomeVar),
            "Mean: " + str(incomeMean), incomeHist
        ]
    if len(Ethnicity) > 0:
        labelDict['Ethnicity'] = Ethnicity
    if len(Gender) > 0:
        if len(Gender) == 2:
            Gender.update({
                'Recommendations: ':
                "You only have two genders in your data.",
                '-':
                "Consider how not including other genders might bias your results and lead to erasure."
            })
        labelDict['Gender'] = Gender

    return labelDict