Example #1
import calculate
import ptable  # table formatter used by this snippet

def summary_stats(data_list):
    """
    Accepts a sample of numbers and returns a pretty
    print out of a variety of descriptive statistics.
    """
    mean = calculate.mean(data_list)
    median = calculate.median(data_list)
    mode = calculate.mode(data_list)
    n = len(data_list)
    max_ = max(data_list)
    min_ = min(data_list)
    range_ = calculate.range(data_list)
    standard_deviation = calculate.standard_deviation(data_list)
    variation_coefficient = calculate.variation_coefficient(data_list)

    table = ptable.indent(
        [
            ['Statistic', 'Value'],
            ['n', str(n)],
            ['mean', str(mean)],
            ['median', str(median)],
            ['mode', str(mode)],
            ['maximum', str(max_)],
            ['minimum', str(min_)],
            ['range', str(range_)],
            ['standard deviation', str(standard_deviation)],
            ['variation coefficient', str(variation_coefficient)],
        ],
        hasHeader=True,
        separateRows=False,
        prefix="| ", postfix=" |",
    )
    print(table)
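
A hypothetical call, assuming the calculate and ptable helpers above are importable:

summary_stats([1, 2, 2, 3, 5])
# prints a two-column table with n, mean, median, mode, maximum, minimum,
# range, standard deviation and variation coefficient for the sample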
Example #2
import calculate

def summary_stats(data_list):
    """
    Accepts a sample of numbers and prints
    a variety of descriptive statistics.
    """
    mean = calculate.mean(data_list)
    median = calculate.median(data_list)
    mode = calculate.mode(data_list)
    n = len(data_list)
    max_ = max(data_list)
    min_ = min(data_list)
    range_ = calculate.range(data_list)
    standard_deviation = calculate.standard_deviation(data_list)
    
    print """
Summary statistics
==================

n:        %s
max:        %s
min:        %s
range:        %s
mean:        %s
median:        %s
mode:        %s
std:        %s
""" % (n, max_, min_, range_, mean, median, mode, standard_deviation)
Example #3
import math

import calculate

def standard_deviation(data_list):
    """
	Returns the standard deviation of a list of numbers.
	
	h3. Documentation
	
		http://en.wikipedia.org/wiki/Standard_deviation
	
	"""
    # materialize as a list of floats so the data can be iterated more than once
    data_list = list(map(float, data_list))
    mean = calculate.mean(data_list)
    deviations = [i - mean for i in data_list]
    deviations_squared = [math.pow(i, 2) for i in deviations]
    mean_deviation = calculate.mean(deviations_squared)
    standard_deviation = math.sqrt(mean_deviation)
    return standard_deviation
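
A quick sanity check for the function above against the standard library, which also offers a population standard deviation (assuming calculate.mean behaves like statistics.mean):

import statistics

data = [2.0, 3.0, 3.0, 4.0]
print(standard_deviation(data))   # 0.7071067811865476
print(statistics.pstdev(data))    # 0.7071067811865476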
Example #4
import calculate

def variation_coefficient(data_list):
    """
    Accepts a list of values and returns the variation coefficient,
    which is a normalized measure of the distribution.

    This is the sort of thing you can use to compare the standard deviation
    of sets that are measured in different units.

    Note that it uses our "population" standard deviation as part of the
    calculation, not a "sample standard deviation.

    h3. Example usage

        >>> import calculate
        >>> calculate.variation_coefficient([1, 2, -2, 4, -3])
        6.442049363362563

    h3. Documentation

        * "coefficient of variation":http://en.wikipedia.org/wiki/\
Coefficient_of_variation
    """
    # Convert all the values to floats and test to make sure
    # there aren't any strings in there
    try:
        data_list = list(map(float, data_list))
    except ValueError:
        raise ValueError('Input values must contain numbers')
    std = calculate.standard_deviation(data_list)
    mean = calculate.mean(data_list)
    return std / mean
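
Because the coefficient of variation is unitless, it can compare the spread of datasets measured in different units. A hypothetical example, assuming the functions above are exposed through the calculate module:

import calculate

heights_cm = [150.0, 160.0, 170.0, 180.0]
weights_kg = [50.0, 60.0, 70.0, 80.0]
# same absolute spread, but relative to very different means
print(calculate.variation_coefficient(heights_cm))  # ~0.0678
print(calculate.variation_coefficient(weights_kg))  # ~0.1720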
Example #5
    def has_sound_spike(self):
        """
        Find the standard deviation of the past 10 minutes.
        Send out a tweet if there are any signals greater than two standard deviations
        in the past 10 seconds.
        """
        ten_minutes = timezone.localtime(
            timezone.now()) - datetime.timedelta(minutes=10)
        ten_seconds = timezone.localtime(
            timezone.now()) - datetime.timedelta(seconds=10)
        signals_past_ten_min = self.signal_set.filter(
            timestamp__lt=timezone.localtime(timezone.now()),
            timestamp__gte=ten_minutes)
        if signals_past_ten_min.count() > 0:
            voltages = list(
                signals_past_ten_min.values_list(
                    'voltage', flat=True).order_by('voltage'))
            avg = calculate.mean(voltages)
            std_dev = calculate.standard_deviation(voltages)
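            # spike threshold: mean plus two standard deviations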
            twice_std_dev = (std_dev * 2) + avg
            signals_past_10_secs = signals_past_ten_min.filter(
                timestamp__gte=ten_seconds, voltage__gte=twice_std_dev)

            # Return the voltage of the highest signal if there has been a spike,
            # or return False
            if signals_past_10_secs.count() > 0:
                signals_past_10_secs = list(
                    signals_past_10_secs.values_list(
                        'voltage', flat=True).order_by('-voltage'))
                return signals_past_10_secs[0]
            else:
                return False
        else:
            return False
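
The same threshold logic, stripped of the Django ORM, looks like this. A minimal sketch, assuming plain lists of voltages for the 10-minute window and the most recent 10 seconds:

import statistics

def has_spike(window_voltages, recent_voltages):
    """Return the highest recent voltage above mean + 2 std, else False."""
    if not window_voltages:
        return False
    threshold = statistics.mean(window_voltages) + 2 * statistics.pstdev(window_voltages)
    spikes = [v for v in recent_voltages if v >= threshold]
    return max(spikes) if spikes else False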
Example #6
    def has_sound_spike(self):
        """
        Find the standard deviation of the past 10 minutes.
        Send out a tweet if there are any signals greater than two standard deviations
        in the past 10 seconds.
        """
        ten_minutes = timezone.localtime(timezone.now()) - datetime.timedelta(minutes=10)
        ten_seconds = timezone.localtime(timezone.now()) - datetime.timedelta(seconds=10)
        signals_past_ten_min = self.signal_set.filter(
            timestamp__lt=timezone.localtime(timezone.now()), timestamp__gte=ten_minutes
        )
        if signals_past_ten_min.count() > 0:
            voltages = list(signals_past_ten_min.values_list("voltage", flat=True).order_by("voltage"))
            avg = calculate.mean(voltages)
            std_dev = calculate.standard_deviation(voltages)
            twice_std_dev = (std_dev * 2) + avg
            signals_past_10_secs = signals_past_ten_min.filter(timestamp__gte=ten_seconds, voltage__gte=twice_std_dev)

            # Return the voltage of the highest signal if there has been a spike,
            # or return False
            if signals_past_10_secs.count() > 0:
                signals_past_10_secs = list(signals_past_10_secs.values_list("voltage", flat=True).order_by("-voltage"))
                return signals_past_10_secs[0]
            else:
                return False
        else:
            return False
Example #7
import math

import calculate

def standard_deviation(data_list):
    """
    Accepts a list of values and returns the standard deviation.

    Standard deviation measures how widely dispersed the values are
    from the mean. A lower value means the data tend to be bunched
    close to the average. A higher value means they tend to be further
    away.

    This is a "population" calculation that assumes that you are submitting
    all of the values, not a sample.

    h3. Example usage

        >>> import calculate
        >>> calculate.standard_deviation([2,3,3,4])
        0.70710678118654757
        >>> calculate.standard_deviation([-2,3,3,40])
        16.867127793432999

    h3. Documentation

        "standard deviation":http://en.wikipedia.org/wiki/Standard_deviation

    """
    # Convert all the values to floats and test to make sure
    # there aren't any strings in there
    try:
        data_list = list(map(float, data_list))
    except ValueError:
        raise ValueError('Input values must contain numbers')

    # Find the mean
    mean = calculate.mean(data_list)

    # Create a new list containing the distance from mean
    # for each value in the sample
    deviations = [i - mean for i in data_list]

    # Square the distances
    deviations_squared = [math.pow(i, 2) for i in deviations]

    # Take the average of those squares
    mean_deviation = calculate.mean(deviations_squared)

    # And then take the square root of the mean to find the standard deviation
    return math.sqrt(mean_deviation)
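
Walking through the first doctest above: for [2, 3, 3, 4] the mean is 3, the deviations are [-1, 0, 0, 1], the squared deviations are [1, 0, 0, 1], their mean is 0.5, and sqrt(0.5) ≈ 0.70710678, matching the documented result.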
Example #8
from decimal import Decimal

import calculate

def get_avg_unemployment(data, start_year=2013, end_year=2015):
    avgs = {}
    while start_year <= end_year:
        avg = calculate.mean([
            Decimal(rate.get(str(start_year))) for
            rate in data if rate.get(str(start_year))])
        avgs[str(start_year)] = avg
        start_year += 1
    return avgs
Example #9
def get_avg_unemployment(data, start_year=2013, end_year=2015):
    avgs = {}
    while start_year <= end_year:
        avg = calculate.mean([
            Decimal(rate.get(str(start_year))) for rate in data
            if rate.get(str(start_year))
        ])
        avgs[str(start_year)] = avg
        start_year += 1
    return avgs
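
A hypothetical call, assuming each row of `data` maps year strings to unemployment rates and that calculate.mean averages the Decimal values (years missing from a row are skipped by the `if` filter):

rows = [
    {"2013": "7.4", "2014": "6.2", "2015": "5.3"},
    {"2013": "8.0", "2015": "5.7"},  # no 2014 figure, so it is skipped
]
print(get_avg_unemployment(rows))
# e.g. {'2013': Decimal('7.7'), '2014': Decimal('6.2'), '2015': Decimal('5.5')}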
Example #10
def main(): 
    print()
    print("------------------------------------------------------------------------------------------------------------------------")
    print(" # Feature Tester")
    print(" # Purpose : This program is used to test generated feature set.")
    print(" # You have to manually configure in feature_tester.py as follows")
    print(" #   [1] list_feature : Feature to be tested")
    print(" #   [2] row_to_read_file_input : Number of rows in the file mapping between samples and their gene expression to be read")  
    print(" #   [3] file_training_input : A file contains mapping between samples and their gene expression") 
    print(" #   [4] file_training_output : A file contains mapping between samples and their health status")  
    print(" #   [5] rows_to_read_file_pathway : Number of rows in the file mapping between pathways and their member genes to be read")  
    print(" #   [6] file_ref_name : A file mapping between gene probe id and gene entrez id")  
    print(" #   [7] file_to_convert_name : A file contains mapping between samples and their gene expression")  
    print(" #   [8] file_pathway_name : A file mapping between pathways and their member genes") 
    print(" # These files must follow a required format shown in file_format.pdf")
    print(" #")
    print(" # You will be asked to provide related files and required information about them including ")
    print(" #   [1] Number of folds")
    print(" #")
    print(" # You will be asked for the name of an output file.")
    print("------------------------------------------------------------------------------------------------------------------------")
    print()

    # list of features to be tested
    # example : epoch 7 in mean_no_normalize_10_10_10
    list_feature = [
        'BIOCARTA_INTRINSIC_PATHWAY',
        'REACTOME_REGULATION_OF_MRNA_STABILITY_BY_PROTEINS_THAT_BIND_AU_RICH_ELEMENTS',
        'PID_SMAD2_3NUCLEAR_PATHWAY',
        'REACTOME_G1_S_SPECIFIC_TRANSCRIPTION',
        'REACTOME_RESOLUTION_OF_AP_SITES_VIA_THE_SINGLE_NUCLEOTIDE_REPLACEMENT_PATHWAY',
        'KEGG_BASAL_TRANSCRIPTION_FACTORS',
        'REACTOME_EXTENSION_OF_TELOMERES',
        'PID_A6B1_A6B4_INTEGRIN_PATHWAY',
        'REACTOME_LIPID_DIGESTION_MOBILIZATION_AND_TRANSPORT',
        'REACTOME_BASE_FREE_SUGAR_PHOSPHATE_REMOVAL_VIA_THE_SINGLE_NUCLEOTIDE_REPLACEMENT_PATHWAY',
        'REACTOME_ABC_FAMILY_PROTEINS_MEDIATED_TRANSPORT',
        'PID_MET_PATHWAY',
        'KEGG_SPLICEOSOME',
        'BIOCARTA_TOLL_PATHWAY',
        'PID_AVB3_OPN_PATHWAY',
        'REACTOME_CELL_CYCLE_MITOTIC',
        'REACTOME_FORMATION_OF_THE_HIV1_EARLY_ELONGATION_COMPLEX',
        'REACTOME_DNA_STRAND_ELONGATION',
        'REACTOME_CYCLIN_E_ASSOCIATED_EVENTS_DURING_G1_S_TRANSITION_',
        'BIOCARTA_SPPA_PATHWAY',
        'REACTOME_APC_CDC20_MEDIATED_DEGRADATION_OF_NEK2A',
        'REACTOME_INHIBITION_OF_THE_PROTEOLYTIC_ACTIVITY_OF_APC_C_REQUIRED_FOR_THE_ONSET_OF_ANAPHASE_BY_MITOTIC_SPINDLE_CHECKPOINT_COMPONENTS',
        'PID_HIF1A_PATHWAY',
        'BIOCARTA_PTEN_PATHWAY',
        'REACTOME_GRB2_SOS_PROVIDES_LINKAGE_TO_MAPK_SIGNALING_FOR_INTERGRINS_',
        'PID_RETINOIC_ACID_PATHWAY',
    ]

    # prepare data
    # default row_to_read = 22283
    row_to_read_file_input = 22283
    file_training_input = pd.read_csv("GSE2034-22071 (edited).csv", nrows = row_to_read_file_input)
    file_training_output = pd.read_csv("mapping_sample_to_class_full.csv", usecols = ['GEO asscession number', 'relapse (1=True)'])

    # files to be used to get pathways and their gene expression
    # default rows_to_read_file_pathway = 1329
    rows_to_read_file_pathway = 1329
    file_ref_name = "accession_number_to_entrez_id.csv"
    file_to_convert_name = "GSE2034-22071 (edited).csv"
    file_pathway_name = "c2.cp.v6.2.entrez.gmt.csv"
    file_pathway = pd.read_csv(file_pathway_name, nrows = rows_to_read_file_pathway)

    # get gene order id with its name
    list_gene_name = []
    for i in range(0, row_to_read_file_input):
        # add element with this format (gene_order_id, gene_name)
        gene_name = []
        gene_name.append(i)
        gene_name.append(file_training_input.loc[i, "ID_REF"])
        list_gene_name.append(gene_name)
    
    # get list of pathway name
    list_pathway_name = []
    for i in range(0, rows_to_read_file_pathway):
        pathway_name = []
        pathway_name.append(i)
        pathway_name.append(file_pathway.loc[i, "PATHWAY_NAME"])
        list_pathway_name.append(pathway_name)
    
    # consider non-relapse and relapse (not in specific period of time)
    sample_relapse = file_training_output.loc[file_training_output['relapse (1=True)'].isin(['1'])]
    sample_no_relapse = file_training_output.loc[file_training_output['relapse (1=True)'].isin(['0'])]

    # add GEO accession number to each list
    list_sample_relapse = []
    for element in sample_relapse.loc[:, 'GEO asscession number']:
        list_sample_relapse.append(element)

    list_sample_no_relapse = []
    for element in sample_no_relapse.loc[:, 'GEO asscession number']:
        list_sample_no_relapse.append(element)
    
    # shuffle data so that chunk membership does not depend on sample order
    random.shuffle(list_sample_relapse)
    print("list_sample_relapse SIZE = " + str(len(list_sample_relapse)))
    random.shuffle(list_sample_no_relapse)
    print("list_sample_no_relapse SIZE = " + str(len(list_sample_no_relapse)))

    # get number of folds
    while True:
        num_of_folds = input("Number of folds: ")
        if (num_of_folds.isnumeric() == False):
            print("WARNING : Input must be numeric")
        elif(int(num_of_folds) > len(list_sample_relapse)):
            print("WARNING : Number of folds exceeds the size of the 1st dataset")
        elif(int(num_of_folds) > len(list_sample_no_relapse)):
            print("WARNING : Number of folds exceeds the size of the 2nd dataset")
        elif(int(num_of_folds) <= 1):
            print("WARNING : Number of folds cannot lower than or equal to 1")
        else:
            break
    num_of_folds = int(num_of_folds)

    # get output file's name
    file_name = input("Name of output file : ")

    # prepare a text file for results to be written to
    result_file = open(str(file_name) + ".txt", "w+")

    print("Process : Creating collection to collect samples and their genes' expression")
    # create dictionary used to collect pathways of each sample
    samples_relapse = {}
    samples_no_relapse = {}

    # get all pathways of all samples in class 'relapse'
    for element_index in range(0, len(list_sample_relapse)):
        print()
        print("Creating pathways for sample " + str(element_index + 1) + " relapse is in progress ...")
        print(str(len(list_sample_relapse) - (element_index + 1)) + " samples left")
        print()

        sample = []
        sample_name = list_sample_relapse[element_index]
        pathways = calculate.getPathway(file_ref_name, file_to_convert_name, file_pathway_name, sample_name, rows_to_read_file_pathway, normalize = False)

        sample.append(sample_name)
        sample.append(pathways)
        samples_relapse[element_index] = sample

    for element_index in range(0, len(list_sample_no_relapse)):
        print()
        print("Creating pathways for sample " + str(element_index + 1) + " non-relapse is in progress ...")
        print(str(len(list_sample_no_relapse) - (element_index + 1)) + " samples left")
        print()

        sample = []
        sample_name = list_sample_no_relapse[element_index]
        pathways = calculate.getPathway(file_ref_name, file_to_convert_name, file_pathway_name, sample_name, rows_to_read_file_pathway, normalize = False)

        sample.append(sample_name)
        sample.append(pathways)
        samples_no_relapse[element_index] = sample
    
    print("Process : Creating collections of samples with their pathways' activity ...")
    
    # create collections of samples with their pathways
    # data will be collected in this format
    # { GSM1234, {0: ['KEGG_GLYCOLYSIS_GLUCONEOGENESIS', [[55902, 0.0], [2645, 0.0], ...}}
    samples_relapse_pathway_activity = {}
    samples_no_relapse_pathway_activity = {}
    for samples_index in range(0, len(samples_relapse)):
        sample = []
        list_pathway = []
        for pathway_index in range(0, len(samples_relapse[samples_index][1])):
            list_gene_expression_in_pathway = []
            pathway = []
            for gene_index in range(0, len(samples_relapse[samples_index][1][pathway_index][1])):
                gene_expression = samples_relapse[samples_index][1][pathway_index][1][gene_index][1]
                list_gene_expression_in_pathway.append(gene_expression)

            # data to collect as pathway activity
            pathway_name = samples_relapse[samples_index][1][pathway_index][0]
            pathway_activity = calculate.mean(list_gene_expression_in_pathway)

            pathway.append(pathway_name)
            pathway.append(pathway_activity)
            list_pathway.append(pathway)
        
        sample_name = samples_relapse[samples_index][0]
        sample.append(sample_name)
        sample.append(list_pathway)
        samples_relapse_pathway_activity[samples_index] = sample

    for samples_index in range(0, len(samples_no_relapse)):
        sample = []
        list_pathway = []
        for pathway_index in range(0, len(samples_no_relapse[samples_index][1])):
            list_gene_expression_in_pathway = []
            pathway = []
            for gene_index in range(0, len(samples_no_relapse[samples_index][1][pathway_index][1])):
                gene_expression = samples_no_relapse[samples_index][1][pathway_index][1][gene_index][1]
                list_gene_expression_in_pathway.append(gene_expression)

            # data to collect as pathway activity
            pathway_name = samples_no_relapse[samples_index][1][pathway_index][0]
            pathway_activity = calculate.mean(list_gene_expression_in_pathway)

            pathway.append(pathway_name)
            pathway.append(pathway_activity)
            list_pathway.append(pathway)
        
        sample_name = samples_no_relapse[samples_index][0]
        sample.append(sample_name)
        sample.append(list_pathway)
        samples_no_relapse_pathway_activity[samples_index] = sample
    
    # create list of indexes used to indicate the position in the list
    list_index_samples_relapse = []
    list_index_samples_no_relapse = []

    for index in range(0, len(list_sample_relapse)):
        list_index_samples_relapse.append(index)
    
    for index in range(0, len(list_sample_no_relapse)):
        list_index_samples_no_relapse.append(index)
    
    # shuffle so the split changes from epoch to epoch
    random.shuffle(list_index_samples_relapse)
    random.shuffle(list_index_samples_no_relapse)

    # split data into k parts
    chunk_relapse_size = math.ceil(len(list_index_samples_relapse) / num_of_folds)
    chunk_no_relapse_size = math.ceil(len(list_index_samples_no_relapse) / num_of_folds)

    chunk_list_relapse = list(calculate.chunks(list_index_samples_relapse, chunk_relapse_size))
    print("number of chunks in chunk_list_relapse = " + str(len(chunk_list_relapse)))

    chunk_list_no_relapse = list(calculate.chunks(list_index_samples_no_relapse, chunk_no_relapse_size))
    print("number of in chunk_list_no_relapse  = " + str(len(chunk_list_no_relapse)))

    check_valid, num_of_chunks = calculate.checkEqualListSize(chunk_list_relapse, chunk_list_no_relapse)

    if (check_valid == True):
        # randomly pick the index of the chunk to be used for testing
        chunk_test_index = random.randint(0, num_of_chunks - 1)

        # separating data into testing and training dataset
        # get testing set
        chunk_test_relapse = chunk_list_relapse[chunk_test_index]
        chunk_test_no_relapse = chunk_list_no_relapse[chunk_test_index]

        # get training set of this fold
        chunk_train_relapse = []
        for chunk_train_relapse_index in range(0, num_of_chunks):
            if (chunk_list_relapse[chunk_train_relapse_index] is not chunk_test_relapse):
                chunk_train_relapse.append(chunk_list_relapse[chunk_train_relapse_index])
        print("chunk train relapse size = " + str(len(chunk_train_relapse)))

        chunk_train_no_relapse = []
        for chunk_train_no_relapse_index in range(0, num_of_chunks):
            if (chunk_list_no_relapse[chunk_train_no_relapse_index] is not chunk_test_no_relapse):
                chunk_train_no_relapse.append(chunk_list_no_relapse[chunk_train_no_relapse_index])
        print("chunk train no relapse size = " + str(len(chunk_train_no_relapse)))

        # merge training data of each class
        list_train_relapse = []
        for i in range(0, len(chunk_train_relapse)):
            list_train_relapse.extend(chunk_train_relapse[i])
        print("size of list_train_relapse : " + str(len(list_train_relapse)))

        list_train_no_relapse = []
        for i in range(0, len(chunk_train_no_relapse)):
            list_train_no_relapse.extend(chunk_train_no_relapse[i])
        print("size of list_train_no_relapse : " + str(len(list_train_no_relapse)))

        # making classifier
        # get pathways' activity of members in the feature set
        # for class 'relapse'
        list_sample_relapse_pathway_activity_classifier = []
        list_pathway_name_classifier_relapse = []
        for sample_index in range(0, len(list_train_relapse)):
            list_pathway_activity = []
            sample_index_in_list = list_train_relapse[sample_index]
            for feature in list_feature:
                for pathway_index in range(0, len(samples_relapse_pathway_activity[sample_index_in_list][1])):
                    pathway_name = samples_relapse_pathway_activity[sample_index_in_list][1][pathway_index][0]
                        
                    if (pathway_name == feature):
                        pathway_activity = samples_relapse_pathway_activity[sample_index_in_list][1][pathway_index][1]
                        list_pathway_activity.append(pathway_activity)
                        if(pathway_name not in list_pathway_name_classifier_relapse):
                            list_pathway_name_classifier_relapse.append(pathway_name)

            list_sample_relapse_pathway_activity_classifier.append(list_pathway_activity)
        result_file.write("feature set (" + str(len(list_feature)) + ") : \n")
        result_file.write(str(list_feature))
        result_file.write("\n")
        result_file.write("pathway name in class 'relapse' (" + str(len(list_pathway_name_classifier_relapse))+ ") : ")
        result_file.write(str(list_pathway_name_classifier_relapse))
        result_file.write("\n")

        # for class 'non-relapse'
        list_sample_no_relapse_pathway_activity_classifier = []
        list_pathway_name_classifier_no_relapse = []
        for sample_index in range(0, len(list_train_no_relapse)):
            list_pathway_activity = []
            sample_index_in_list = list_train_no_relapse[sample_index]
            for feature in list_feature:
                for pathway_index in range(0, len(samples_no_relapse_pathway_activity[sample_index_in_list][1])):
                    pathway_name = samples_no_relapse_pathway_activity[sample_index_in_list][1][pathway_index][0]
                                                
                    if (pathway_name == feature):
                        pathway_activity = samples_no_relapse_pathway_activity[sample_index_in_list][1][pathway_index][1]
                        list_pathway_activity.append(pathway_activity)
                        if(pathway_name not in list_pathway_name_classifier_no_relapse):
                            list_pathway_name_classifier_no_relapse.append(pathway_name)

            list_sample_no_relapse_pathway_activity_classifier.append(list_pathway_activity)
        result_file.write("pathway name in class 'non-relapse' (" + str(len(list_pathway_name_classifier_no_relapse)) + ") : ")
        result_file.write(str(list_pathway_name_classifier_no_relapse))
        result_file.write("\n")

        # prepare testing set
        # each sample contains only pathways in the feature set
        # for class 'relapse'
        list_sample_relapse_pathway_activity_testing_set = []
        for sample_index in range(0, len(chunk_test_relapse)):
            list_pathway_activity = []
            sample_index_in_list = chunk_test_relapse[sample_index]
            for feature in list_feature:
                for pathway_index in range(0, len(samples_relapse_pathway_activity[sample_index_in_list][1])):
                    pathway_name = samples_relapse_pathway_activity[sample_index_in_list][1][pathway_index][0]
                    
                    if (pathway_name == feature):
                        pathway_activity = samples_relapse_pathway_activity[sample_index_in_list][1][pathway_index][1]
                        list_pathway_activity.append(pathway_activity)

            list_sample_relapse_pathway_activity_testing_set.append(list_pathway_activity)
        
        # for class 'non-relapse'
        list_sample_no_relapse_pathway_activity_testing_set = []
        for sample_index in range(0, len(chunk_test_no_relapse)):
            list_pathway_activity = []
            sample_index_in_list = chunk_test_no_relapse[sample_index]
            for feature in list_feature:
                for pathway_index in range(0, len(samples_no_relapse_pathway_activity[sample_index_in_list][1])):
                    pathway_name = samples_no_relapse_pathway_activity[sample_index_in_list][1][pathway_index][0]
                    
                    if (pathway_name == feature):
                        pathway_activity = samples_no_relapse_pathway_activity[sample_index_in_list][1][pathway_index][1]
                        list_pathway_activity.append(pathway_activity)

            list_sample_no_relapse_pathway_activity_testing_set.append(list_pathway_activity)
        
        # merge testing data of both classes to be used in LDA
        list_sample_all_pathway_activity_testing_set = []
        list_sample_all_pathway_activity_testing_set.extend(list_sample_relapse_pathway_activity_testing_set)
        list_sample_all_pathway_activity_testing_set.extend(list_sample_no_relapse_pathway_activity_testing_set)

        # get sample names of the testing set
        list_sample_relapse_name_testing_set = []
        for index in range(0, len(chunk_test_relapse)):
            sample_index_in_list = chunk_test_relapse[index]
            list_sample_relapse_name_testing_set.append(samples_relapse[sample_index_in_list][0])

        list_sample_no_relapse_name_testing_set = []
        for index in range(0, len(chunk_test_no_relapse)):
            sample_index_in_list = chunk_test_no_relapse[index]
            list_sample_no_relapse_name_testing_set.append(samples_no_relapse[sample_index_in_list][0])
        
        # merge sample names of both classes
        list_sample_name_testing_set = []
        list_sample_name_testing_set.extend(list_sample_relapse_name_testing_set)
        list_sample_name_testing_set.extend(list_sample_no_relapse_name_testing_set)

        # create list of desired output
        file_desired_outputs_testing = file_training_output.loc[file_training_output['GEO asscession number'].isin(list_sample_name_testing_set)]
        file_desired_outputs_testing['sample_id'] = file_desired_outputs_testing['GEO asscession number'].apply(lambda name: list_sample_name_testing_set.index(name)) 
        file_desired_outputs_testing = file_desired_outputs_testing.sort_values(by = ['sample_id'])
        file_desired_outputs_testing.drop(columns = 'sample_id', inplace = True)

        list_desired_outputs_testing = []
        for element in file_desired_outputs_testing.loc[:, 'relapse (1=True)']:
            list_desired_outputs_testing.append(element)
        
        # linear discrimination analysis
        list_actual_outputs_testing = calculate.lda(list_sample_all_pathway_activity_testing_set, list_sample_relapse_pathway_activity_classifier, list_sample_no_relapse_pathway_activity_classifier)

        # calculate the ROC AUC score
        auc_score = roc_auc_score(list_desired_outputs_testing, list_actual_outputs_testing)

        result_file.write("list_sample_name_testing_set (" + str(len(list_sample_name_testing_set)) + ") : " + str(list_sample_name_testing_set) +"\n")
        result_file.write("list_desired_outputs_testing (" + str(len(list_desired_outputs_testing)) + ") : \n")
        result_file.write(str(list_desired_outputs_testing))
        result_file.write("\n")
        result_file.write("list_actual_outputs_testing (" + str(len(list_actual_outputs_testing)) + ") : \n")
        result_file.write(str(list_actual_outputs_testing))
        result_file.write("\n")
        result_file.write("AUC score : " + str(auc_score) + "\n")
    result_file.close()   
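
The script leans on a few helpers from the calculate module. calculate.chunks, for instance, presumably splits a list into consecutive fixed-size pieces for the k-fold split; a minimal sketch of such a helper:

def chunks(items, size):
    """Yield consecutive chunks of `size` items; the last chunk may be shorter."""
    for i in range(0, len(items), size):
        yield items[i:i + size]

# e.g. list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]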
Example #11
def standard_deviation_ellipses(
    geoqueryset,
    point_attribute_name='point',
    num_of_std=1,
    fix_points=True
):
    """
    Accepts a GeoQuerySet and generates one or more standard deviation ellipses
    demonstrating the geospatial distribution of where its points occur.

    Returns a one-to-many list of the ellipses as Polygon objects.

    The standard deviation ellipse illustrates the average variation in
    the distance of points from the mean center, as well as their direction.

    By default, the function expects the Point field on your model to be
    called 'point'.

    If the point field is called something else, change the kwarg
    'point_attribute_name' to whatever your field might be called.

    Also by default, the function will nudge slightly apart any identical
    points and only return the first standard deviation ellipse. If you'd like
    to change that behavior, change the corresponding kwargs.

    h3. Example usage

        >>> import calculate
        >>> calculate.standard_deviation_ellipses(qs)
        [<Polygon object at 0x77a1c34>]

    h3. Dependencies

        * "django":http://www.djangoproject.com/
        * "geodjango":http://www.geodjango.org/
        * "psql ellipse() function":http://postgis.refractions.net/support/\
wiki/index.php?plpgsqlfunctions

    h3. Documentation

        * "standard deviation ellipse":http://www.spatialanalysisonline.com/\
output/html/Directionalanalysisofpointdatasets.html
        * "This code is translated from SQL by Francis Dupont":http://\
postgis.refractions.net/pipermail/postgis-users/2008-June/020354.html
    """
    if not isinstance(geoqueryset, GeoQuerySet):
        error = 'First parameter must be a GeoQuerySet. You submitted a %s'
        raise TypeError(error % type(geoqueryset))

    n = len(geoqueryset)

    if n < 3:
        return [None]

    if fix_points:
        geoqueryset = calculate.nudge_points(
            geoqueryset,
            point_attribute_name=point_attribute_name
        )

    avg_x = calculate.mean([
        abs(getattr(p, point_attribute_name).x) for p in geoqueryset
    ])
    avg_y = calculate.mean([
        abs(getattr(p, point_attribute_name).y) for p in geoqueryset
    ])
    center_x = calculate.mean([
        getattr(p, point_attribute_name).x for p in geoqueryset
    ])
    center_y = calculate.mean([
        getattr(p, point_attribute_name).y for p in geoqueryset
    ])

    sum_square_diff_avg_x = sum([
        math.pow(
            (abs(getattr(p, point_attribute_name).x) - avg_x),
            2
        ) for p in geoqueryset
    ])
    sum_square_diff_avg_y = sum([
        math.pow(
            (abs(getattr(p, point_attribute_name).y) - avg_y),
            2
        ) for p in geoqueryset
    ])
    sum_diff_avg_x_y = sum([
        (abs(getattr(p, point_attribute_name).x) - avg_x) *
        (abs(getattr(p, point_attribute_name).y) - avg_y)
        for p in geoqueryset
    ])
    sum_square_diff_avg_x_y = sum([
        math.pow(
            (abs(getattr(p, point_attribute_name).x) - avg_x) *
            (abs(getattr(p, point_attribute_name).y) - avg_y),
            2
        ) for p in geoqueryset
    ])
    constant = math.sqrt(
        math.pow((sum_square_diff_avg_x - sum_square_diff_avg_y), 2) +
        (4 * sum_square_diff_avg_x_y)
    )
    theta = math.atan(
        (sum_square_diff_avg_x - sum_square_diff_avg_y + constant) /
        (2 * sum_diff_avg_x_y)
    )

    stdx_sum_x_y_cos_sin_theta = sum([
        math.pow(
            (
                (
                    (getattr(p, point_attribute_name).x - center_x) *
                    math.cos(theta)
                ) -
                (
                    (getattr(p, point_attribute_name).y - center_y) *
                    math.sin(theta)
                )
            ),
            2
        ) for p in geoqueryset
    ])
    stdy_sum_x_y_sin_cos_theta = sum([
        math.pow(
            (
                (
                    (getattr(p, point_attribute_name).x - center_x) *
                    math.sin(theta)
                ) -
                (
                    (getattr(p, point_attribute_name).y - center_y) *
                    math.cos(theta)
                )
            ),
            2
        ) for p in geoqueryset
    ])

    stdx = math.sqrt((2 * stdx_sum_x_y_cos_sin_theta) / (n - 2))
    stdy = math.sqrt((2 * stdy_sum_x_y_sin_cos_theta) / (n - 2))

    results = []
    from django.db import connection
    cursor = connection.cursor()
    while num_of_std:
        sql = "SELECT ellipse(%s, %s, (%s * %s), (%s * %s), %s, 40);" % (
            center_x,
            center_y,
            num_of_std,
            stdx,
            num_of_std,
            stdy,
            theta
        )
        cursor.execute(sql)
        results.append(fromstr(cursor.fetchall()[0][0], srid=4326))
        num_of_std -= 1
    return results
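
A hypothetical call, assuming a GeoDjango model Incident whose point field is named "location" and a database with the PostGIS ellipse() function installed:

qs = Incident.objects.all()  # Incident is a hypothetical model
ellipses = calculate.standard_deviation_ellipses(
    qs,
    point_attribute_name='location',
    num_of_std=2,
)
# ellipses[0] spans two standard deviations, ellipses[1] spans one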
Example #12
def main():
    # record start time
    start_time = time.time()

    print()
    print(
        "------------------------------------------------------------------------------------------------------------------------"
    )
    print(" # Method : Gene-Based Classification")
    print(" # Experiment : Within Dataset")
    print(
        " # You will be asked to provide related files and required information about them including "
    )
    print(
        " #   [1] A file containing a mapping between gene probe IDs and samples")
    print(
        " #   [2] Number of rows to read from the file mapping gene probe IDs to samples"
    )
    print(" #   [3] A file containing a mapping between samples and their class")
    print(
        " # These files must follow the required format shown in file_format.pdf"
    )
    print(" #")
    print(
        " # You will be asked to provide required information to conduct an experiment including"
    )
    print(" #   [1] Number of epochs")
    print(" #   [2] Number of folds")
    print(" #   [3] Number of top-ranked feature")
    print(" #")
    print(" # You will be asked for the name of an output file.")
    print(
        "------------------------------------------------------------------------------------------------------------------------"
    )
    print()

    # prepare variables
    file_training_input_name = None

    row_to_read = None

    file_training_output_name = None

    epoch = None
    num_of_folds = None
    number_of_ranked_gene = None

    file_name = None

    print(" # Enter required information about the first dataset ")
    print(
        " 1. Enter name of a file containing a mapping between probe IDs and samples "
    )
    file_training_input_name = add_ons.getFile()
    print()

    print(" 2. Enter number of rows of this file to be read ")
    while True:
        row_to_read = input(" Number of rows : ")
        if (row_to_read.isnumeric() == False):
            print(" WARNING : Number of rows must be numeric.")
        elif (int(row_to_read) < 1):
            print("WARNING : Number of rows cannot be lower than 1.")
        else:
            break
    row_to_read = int(row_to_read)
    print()

    print(
        " 3. Enter name of a file containing mapping between samples and their class"
    )
    file_training_output_name = add_ons.getFile()
    print()

    # prepare data
    # row_to_read = 22283
    # file_training_input = pd.read_csv("GSE2034-22071 (edited).csv", nrows = row_to_read)
    file_training_input = pd.read_csv(file_training_input_name,
                                      nrows=row_to_read)

    # consider non-relapse and relapse (not in specific period of time)
    file_training_output = pd.read_csv(
        file_training_output_name,
        usecols=['GEO asscession number', 'relapse (1=True)'])

    # this will be used in calculating lda
    training_input = file_training_input
    training_output = file_training_output

    # get gene order id with its name
    list_gene_name = []
    for i in range(0, row_to_read):
        # add element with this format (gene_order_id, gene_name)
        gene_name = []
        gene_name.append(i)
        gene_name.append(file_training_input.loc[i, "ID_REF"])
        list_gene_name.append(gene_name)

    # separate data into 2 classes
    # consider non-relapse and relapse (not in specific period of time)
    sample_relapse = file_training_output.loc[
        file_training_output['relapse (1=True)'].isin(['1'])]
    sample_no_relapse = file_training_output.loc[
        file_training_output['relapse (1=True)'].isin(['0'])]
    # print(sample_no_relapse)

    # add GEO accession number to each list
    list_sample_relapse = []
    for element in sample_relapse.loc[:, 'GEO asscession number']:
        list_sample_relapse.append(element)
    # print(list_sample_relapse)

    list_sample_no_relapse = []
    for element in sample_no_relapse.loc[:, 'GEO asscession number']:
        list_sample_no_relapse.append(element)

    # shuffle data so that chunk membership does not depend on sample order
    random.shuffle(list_sample_relapse)
    random.shuffle(list_sample_no_relapse)

    print(" # Enter required information to conduct an experiment")
    print(" 1. Enter number of epochs ")
    while True:
        epoch = input(" Epochs : ")

        if (epoch.isnumeric() == False):
            print(" WARNING : Number of epochs must be numeric.")
        elif (int(epoch) <= 0):
            print(" WARINING : Number of epochs must be greater than 0.")
        else:
            break
    print()

    print(" 2. Enter number of folds ")
    while True:
        num_of_folds = input(" Number of folds: ")
        if (num_of_folds.isnumeric() == False):
            print(" WARNING : Number of folds must be numeric")

        # these conditions are not available in mock-up
        elif (int(num_of_folds) > len(list_sample_relapse)):
            print(
                "WARNING : Number of folds exceeds the number of samples in class relapse"
            )
        elif (int(num_of_folds) > len(list_sample_no_relapse)):
            print(
                "WARNING : Number of folds exceeds the number of samples in class non-relapse"
            )

        elif (int(num_of_folds) <= 1):
            print(" WARNING : Number of folds cannot lower than or equal to 1")
        else:
            break
    num_of_folds = int(num_of_folds)
    print()

    print(" 3. Enter number of top-ranked features")
    while True:
        number_of_ranked_gene = input(" Number of top-ranked features: ")
        if (number_of_ranked_gene.isnumeric() == False):
            print(" WARNING : Number of top-ranked features must be numeric.")

        # these conditions are not available in mock-up
        elif (int(number_of_ranked_gene) > row_to_read):
            print(
                " WARNING : Number of top-ranked features must not exceed the number of genes available in the first file."
            )

        elif (int(number_of_ranked_gene) <= 0):
            print(
                " WARNING : Number of top-ranked features must be greater than 0."
            )
        else:
            break
    print()

    file_name = input(" # Enter name of an output file : ")

    # prepare text file for results to be written in
    result_file = open("./result/" + str(file_name) + ".txt", "w+")

    # record file name
    result_file.write("Dataset : " + str(file_training_input_name) + "\n")
    result_file.write("Number of epochs : " + str(epoch) + "\n")
    result_file.write("Number of folds : " + str(num_of_folds) + "\n")
    result_file.write("Number of top-ranked features : " +
                      str(number_of_ranked_gene) + "\n")
    result_file.write("\n")

    print(" Number of samples in class relapse : " +
          str(len(list_sample_relapse)))
    print(" Number of samples in class non-relapse : " +
          str(len(list_sample_no_relapse)))

    # list used to collect average auc score of each epoch
    list_avg_auc_each_epoch = []

    # list to collect feature counter
    list_feature_counter = []

    for epoch_count in range(0, int(epoch)):
        start_epoch_time = time.time()
        result_file.write("#################################### Epoch : " +
                          str(epoch_count + 1) +
                          " ####################################\n")
        print("#################################### Epoch : " +
              str(epoch_count + 1) + " ####################################\n")
        # split data into k parts
        chunk_relapse_size = math.ceil(len(list_sample_relapse) / num_of_folds)
        chunk_no_relapse_size = math.ceil(
            len(list_sample_no_relapse) / num_of_folds)

        chunk_list_relapse = list(
            calculate.chunks(list_sample_relapse, chunk_relapse_size))
        print(" Number of chunks in class relapse : " +
              str(len(chunk_list_relapse)))

        chunk_list_no_relapse = list(
            calculate.chunks(list_sample_no_relapse, chunk_no_relapse_size))
        print(" Number of chunks in class non-relapse  = " +
              str(len(chunk_list_no_relapse)))
        print()

        check_valid, num_of_chunks = calculate.checkEqualListSize(
            chunk_list_relapse, chunk_list_no_relapse)

        # list to collect maximum AUC in each fold
        list_max_auc = []

        # list and variable to track feature set that has the best auc score
        auc_score_max = 0
        list_feature_set_max_auc = []
        list_auc_score = []

        print(" # Process : Cross-validation")
        # do only if number of chunks of both datasets are equal
        if (check_valid == True):
            for first_layer_test_index in range(0, num_of_chunks):
                feature_set = []
                feature_set_name = []
                top_n_genes_name_for_eval = []
                # keep testing data from each class
                first_layer_test_relapse = chunk_list_relapse[
                    first_layer_test_index]
                first_layer_test_no_relapse = chunk_list_no_relapse[
                    first_layer_test_index]
                print("\n------------------------------------------ K : " +
                      str(first_layer_test_index + 1) + " of Epoch " +
                      str(epoch_count + 1) +
                      " --------------------------------")
                print(" Samples in class relapse used as testing set :" +
                      str(first_layer_test_relapse) + "\n")
                print(" Samples in class non-relapse used as testing set : " +
                      str(first_layer_test_no_relapse) + "\n")
                print()

                # find training data
                # first layer
                first_layer_train_relapse = []
                for first_layer_train_index in range(0, num_of_chunks):
                    if (chunk_list_relapse[first_layer_train_index]
                            is not first_layer_test_relapse):
                        first_layer_train_relapse.append(
                            chunk_list_relapse[first_layer_train_index])

                first_layer_train_no_relapse = []
                for first_layer_train_index in range(0, num_of_chunks):
                    if (chunk_list_no_relapse[first_layer_train_index]
                            is not first_layer_test_no_relapse):
                        first_layer_train_no_relapse.append(
                            chunk_list_no_relapse[first_layer_train_index])

                # merge all element in the same class
                second_list_sample_relapse = []
                for i in range(0, len(first_layer_train_relapse)):
                    second_list_sample_relapse.extend(
                        first_layer_train_relapse[i])
                print(" Samples in class relapse used as trainning set = " +
                      str(second_list_sample_relapse) + "\n")

                second_list_sample_no_relapse = []
                for i in range(0, len(first_layer_train_no_relapse)):
                    second_list_sample_no_relapse.extend(
                        first_layer_train_no_relapse[i])
                print(" Samples in class non-relapse used as training set : " +
                      str(second_list_sample_no_relapse) + "\n")

                # split the training set into a marker evaluation set and a
                # feature selection set by dividing it into 3 parts
                print(" Process : Feature selection")
                print(
                    "\n #### divide training set into 3 parts (2/3 for marker evaluation and 1/3 for feature selection) ####"
                )
                second_num_of_fold = 3
                second_chunk_relapse_size = math.ceil(
                    len(second_list_sample_relapse) / second_num_of_fold)
                second_chunk_no_relapse_size = math.ceil(
                    len(second_list_sample_no_relapse) / second_num_of_fold)

                second_chunk_list_relapse = list(
                    calculate.chunks(second_list_sample_relapse,
                                     second_chunk_relapse_size))

                second_chunk_list_no_relapse = list(
                    calculate.chunks(second_list_sample_no_relapse,
                                     second_chunk_no_relapse_size))

                second_check_valid, second_num_of_chunks = calculate.checkEqualListSize(
                    second_chunk_list_relapse, second_chunk_list_no_relapse)

                # do only if number of chunks of both datasets are equal
                if (second_check_valid == True):
                    second_layer_test_index = random.randint(
                        0, second_num_of_chunks - 1)

                    # keep testing data from each class
                    second_layer_test_relapse = second_chunk_list_relapse[
                        second_layer_test_index]
                    second_layer_test_no_relapse = second_chunk_list_no_relapse[
                        second_layer_test_index]
                    print(
                        " Samples in class relapse used as feature selection set : "
                        + str(second_layer_test_relapse) + "\n")
                    print(
                        " Samples in class non-relapse used as feature selection set : "
                        + str(second_layer_test_no_relapse))
                    print()

                    # separate training dataset from testing dataset to use in t-test ranking
                    second_layer_train_relapse = []
                    for second_layer_train_index in range(
                            0, second_num_of_chunks):
                        if (second_chunk_list_relapse[second_layer_train_index]
                                is not second_layer_test_relapse):
                            second_layer_train_relapse.append(
                                second_chunk_list_relapse[
                                    second_layer_train_index])

                    second_layer_train_no_relapse = []
                    for second_layer_train_index in range(
                            0, second_num_of_chunks):
                        if (second_chunk_list_no_relapse[
                                second_layer_train_index]
                                is not second_layer_test_no_relapse):
                            second_layer_train_no_relapse.append(
                                second_chunk_list_no_relapse[
                                    second_layer_train_index])

                    # prepare dataset for t-test
                    # merge all samples in the same class
                    ttest_list_sample_relapse = []
                    for i in range(0, len(second_layer_train_relapse)):
                        ttest_list_sample_relapse.extend(
                            second_layer_train_relapse[i])
                    print(
                        " Samples in class relapse used as marker evaluation set : "
                        + str(ttest_list_sample_relapse) + "\n")

                    ttest_list_sample_no_relapse = []
                    for i in range(0, len(second_layer_train_no_relapse)):
                        ttest_list_sample_no_relapse.extend(
                            second_layer_train_no_relapse[i])
                    print(
                        " Samples in class non-relapse used as marker evaluation set : "
                        + str(ttest_list_sample_no_relapse) + "\n")

                    # get gene expression for each gene from samples with relapse
                    list_gene_exp_relapse = []
                    for i in range(0, row_to_read):
                        gene_exp_relapse = []
                        for column in file_training_input.loc[
                                i, ttest_list_sample_relapse]:
                            gene_exp_relapse.append(column)
                        list_gene_exp_relapse.append(gene_exp_relapse)

                    # get gene expression for each gene from samples with no relapse
                    list_gene_exp_no_relapse = []
                    for i in range(0, row_to_read):
                        gene_exp_no_relapse = []
                        for column in file_training_input.loc[
                                i, ttest_list_sample_no_relapse]:
                            gene_exp_no_relapse.append(column)
                        list_gene_exp_no_relapse.append(gene_exp_no_relapse)

                    print(" # Process : Calculating t-score")
                    # conducting t-test
                    ttest_result = []
                    for i in range(0, row_to_read):
                        score = []
                        # get absolute magnitude of t-test value
                        abs_ttest_value = math.fabs(
                            stats.ttest_ind(list_gene_exp_relapse[i],
                                            list_gene_exp_no_relapse[i],
                                            equal_var=False)[0])
                        p_value = stats.ttest_ind(list_gene_exp_relapse[i],
                                                  list_gene_exp_no_relapse[i],
                                                  equal_var=False)[1]
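                        # note: p_value is computed here but not used in the ranking below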
                        # add element with this format (gene_order_id, ttest_value)
                        score.append(i)
                        score.append(abs_ttest_value)
                        ttest_result.append(score)
                    # rank elements by their absolute t-score in descending order
                    ttest_result.sort(key=lambda x: x[1], reverse=True)

                    # create list of ranked gene
                    ranked_gene = []
                    for i in range(0, len(ttest_result)):
                        gene_order_id = ttest_result[i][0]

                        ranked_gene.append(list_gene_name[gene_order_id][1])

                    # show top ranked feature
                    top_n_genes_name = []
                    print(" #### t-score ranking ####")
                    for i in range(0, int(number_of_ranked_gene)):
                        top_n_genes_name.append(ranked_gene[i])
                        print(" " + str(ranked_gene[i]) + " => " +
                              " t-score : " + str(ttest_result[i][1]))
                    print()

                    # rank gene id of each sample in training data
                    # for class 'relapse'
                    # print("#### class 'Relapse' ####")
                    col_to_read_relapse = ["ID_REF"]
                    col_to_read_relapse.extend(ttest_list_sample_relapse)
                    file_training_input_relapse = pd.read_csv(
                        "GSE2034-22071 (edited).csv",
                        nrows=row_to_read,
                        usecols=col_to_read_relapse)
                    top_n_genes_relapse = file_training_input_relapse.loc[
                        file_training_input_relapse['ID_REF'].isin(
                            top_n_genes_name)]
                    top_n_genes_relapse['gene_id'] = top_n_genes_relapse[
                        'ID_REF'].apply(
                            lambda name: top_n_genes_name.index(name))
                    top_n_genes_relapse_sorted = top_n_genes_relapse.sort_values(
                        by=['gene_id'])
                    top_n_genes_relapse_sorted.drop(columns='gene_id',
                                                    inplace=True)

                    top_n_genes_relapse_sorted_train = top_n_genes_relapse_sorted
                    top_n_genes_relapse_sorted_train.drop(columns='ID_REF',
                                                          inplace=True)

                    # for class 'no relapse'
                    # print("#### class 'no Relapse' ####")
                    col_to_read_no_relapse = ["ID_REF"]
                    col_to_read_no_relapse.extend(ttest_list_sample_no_relapse)
                    file_training_input_no_relapse = pd.read_csv(
                        "GSE2034-22071 (edited).csv",
                        nrows=row_to_read,
                        usecols=col_to_read_no_relapse)
                    top_n_genes_no_relapse = file_training_input_no_relapse.loc[
                        file_training_input_no_relapse['ID_REF'].isin(
                            top_n_genes_name)]
                    top_n_genes_no_relapse['gene_id'] = top_n_genes_no_relapse[
                        'ID_REF'].apply(
                            lambda name: top_n_genes_name.index(name))
                    top_n_genes_no_relapse_sorted = top_n_genes_no_relapse.sort_values(
                        by=['gene_id'])
                    top_n_genes_no_relapse_sorted.drop(columns='gene_id',
                                                       inplace=True)

                    top_n_genes_no_relapse_sorted_train = top_n_genes_no_relapse_sorted
                    top_n_genes_no_relapse_sorted_train.drop(columns='ID_REF',
                                                             inplace=True)

                    # Preparing testing data for feature selection
                    second_layer_test_all = []
                    second_layer_test_all.extend(second_layer_test_relapse)
                    second_layer_test_all.extend(second_layer_test_no_relapse)

                    # output for testing data
                    # sort gene order of testing data
                    col_to_read_second_layer_test_gene = ["ID_REF"]
                    col_to_read_second_layer_test_gene.extend(
                        second_layer_test_all)
                    second_layer_test_gene = pd.read_csv(
                        "GSE2034-22071 (edited).csv",
                        nrows=row_to_read,
                        usecols=col_to_read_second_layer_test_gene)
                    second_layer_top_n_test = second_layer_test_gene.loc[
                        second_layer_test_gene['ID_REF'].isin(
                            top_n_genes_name)]
                    second_layer_top_n_test[
                        'gene_id'] = second_layer_top_n_test['ID_REF'].apply(
                            lambda name: top_n_genes_name.index(name))
                    second_layer_top_n_test_sorted = second_layer_top_n_test.sort_values(
                        by=['gene_id'])
                    second_layer_top_n_test_sorted.drop(columns='gene_id',
                                                        inplace=True)

                    top_n_test_sorted = second_layer_top_n_test_sorted
                    top_n_test_sorted.drop(columns='ID_REF', inplace=True)

                    # use top-rank feature as the first feature in lda classifier
                    # prepare list for input
                    # list of all input data (testing data)
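                    # (rows read from the CSV are gene-major; the transpose
                    # below yields one list of expression values per sample)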
                    list_second_layer_top_n_test_sorted = []
                    for column in range(0, len(top_n_test_sorted)):
                        list_each_sample = []
                        for element in top_n_test_sorted.iloc[column]:
                            list_each_sample.append(element)
                        list_second_layer_top_n_test_sorted.append(
                            list_each_sample)
                    list_second_layer_top_n_test_sorted = list(
                        np.transpose(list_second_layer_top_n_test_sorted))

                    # output for testing data
                    second_layer_test_output = training_output.loc[
                        training_output['GEO asscession number'].isin(
                            second_layer_test_all)].copy()
                    # sorting data according to its order in testing data
                    list_sample_to_read = list(
                        second_layer_top_n_test_sorted.columns.values)

                    second_layer_test_output[
                        'sample_id'] = second_layer_test_output[
                            'GEO asscession number'].apply(
                                lambda name: list_sample_to_read.index(name))
                    second_layer_test_output = second_layer_test_output.sort_values(
                        by=['sample_id'])
                    second_layer_test_output.drop(columns='sample_id',
                                                  inplace=True)
                    # create list of output
                    list_desired_output = []
                    for element in second_layer_test_output.loc[:,
                                                                'relapse (1=True)']:
                        list_desired_output.append(element)

                    # list of gene expression and sample of class 'relapse'
                    list_top_n_gene_relapse_sorted = []
                    for column in range(0,
                                        len(top_n_genes_relapse_sorted_train)):
                        list_each_sample = []
                        for element in top_n_genes_relapse_sorted_train.iloc[
                                column]:
                            list_each_sample.append(element)
                        list_top_n_gene_relapse_sorted.append(list_each_sample)
                    list_top_n_gene_relapse_sorted = list(
                        np.transpose(list_top_n_gene_relapse_sorted))

                    # list of gene expression and sample of class 'no relapse'
                    list_top_n_gene_no_relapse_sorted = []
                    for column in range(
                            0, len(top_n_genes_no_relapse_sorted_train)):
                        list_each_sample = []
                        for element in top_n_genes_no_relapse_sorted_train.iloc[
                                column]:
                            list_each_sample.append(element)
                        list_top_n_gene_no_relapse_sorted.append(
                            list_each_sample)
                    list_top_n_gene_no_relapse_sorted = list(
                        np.transpose(list_top_n_gene_no_relapse_sorted))

                    print(" # Process : Sequential Forward Selection (SFS)")
                    # find set of genes to be used as a feature
                    check_finish = False
                    count_iteration = 1
                    gene_order = [0]
                    list_auc = []
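                    # Greedy sequential forward selection: starting from the
                    # top-ranked gene (index 0), try adding each remaining
                    # candidate gene, score the enlarged set with LDA + AUC
                    # on the held-out second-layer fold, and keep the best
                    # candidate after each pass.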
                    while not check_finish:
                        if (count_iteration >= int(number_of_ranked_gene)):
                            check_finish = True
                        else:
                            max_auc_score = 0
                            gene_index_in_list = None
                            for i in range(0, int(number_of_ranked_gene)):
                                gene_order_test = deepcopy(gene_order)
                                gene_order_test.extend([i])
                                # select gene to be used in lda
                                input_relapse = []
                                for sample_index in range(
                                        0,
                                        len(list_top_n_gene_relapse_sorted)):
                                    list_each_sample = []
                                    for element_id in range(
                                            0,
                                            len(list_top_n_gene_relapse_sorted[
                                                sample_index])):
                                        if (element_id in gene_order_test):
                                            list_each_sample.append(
                                                list_top_n_gene_relapse_sorted[
                                                    sample_index][element_id])
                                    input_relapse.append(list_each_sample)
                                # print(input_relapse)

                                input_no_relapse = []
                                for sample_index in range(
                                        0,
                                        len(list_top_n_gene_no_relapse_sorted)
                                ):
                                    list_each_sample = []
                                    for element_id in range(
                                            0,
                                            len(list_top_n_gene_no_relapse_sorted[
                                                sample_index])):
                                        if (element_id in gene_order_test):
                                            list_each_sample.append(
                                                list_top_n_gene_no_relapse_sorted[
                                                    sample_index][element_id])
                                    input_no_relapse.append(list_each_sample)
                                # print(input_no_relapse)

                                input_testing_data = []
                                for sample_index in range(
                                        0,
                                        len(list_second_layer_top_n_test_sorted
                                            )):
                                    list_each_sample = []
                                    for element_id in range(
                                            0,
                                            len(list_second_layer_top_n_test_sorted[
                                                sample_index])):
                                        if (element_id in gene_order_test):
                                            list_each_sample.append(
                                                list_second_layer_top_n_test_sorted[
                                                    sample_index][element_id])
                                    input_testing_data.append(list_each_sample)

                                list_actual_output = calculate.lda(
                                    input_testing_data, input_relapse,
                                    input_no_relapse)

                                # calculate AUC score
                                auc_score = roc_auc_score(
                                    list_desired_output, list_actual_output)

                                if (auc_score > max_auc_score):
                                    max_auc_score = auc_score
                                    gene_index_in_list = i
                                    # print(max_auc_score)
                                    if max_auc_score not in list_auc:
                                        list_auc.append(max_auc_score)
                            # do not add a gene that is already in the feature set
                            if (gene_index_in_list is not None
                                    and gene_index_in_list not in gene_order):
                                gene_order.extend([gene_index_in_list])
                            count_iteration += 1

                    list_max_auc.append(max(list_auc))
                    gene_order.sort()

                    # get gene_name
                    gene_order_name = []
                    for element in gene_order:
                        gene_order_name.append(top_n_genes_name[element])

                    # copy required data to be used in evaluation
                    top_n_genes_name_for_eval = deepcopy(top_n_genes_name)
                    feature_set = deepcopy(gene_order)
                    feature_set_name = deepcopy(gene_order_name)

                # count feature frequency
                if (int(epoch) > 1):
                    for feature_index in range(0, len(feature_set_name)):
                        # if the feature counter list is empty
                        if not list_feature_counter:
                            feature_counter = []
                            feature_name = feature_set_name[feature_index]
                            feature_frequency = 1

                            feature_counter.append(feature_name)
                            feature_counter.append(feature_frequency)

                            list_feature_counter.append(feature_counter)
                        else:
                            feature_name = feature_set_name[feature_index]

                            # check if this feature exists in the feature counter list
                            check_found = False
                            for feature_counter_index in range(
                                    0, len(list_feature_counter)):
                                feature_counter_name = list_feature_counter[
                                    feature_counter_index][0]

                                if (feature_name == feature_counter_name):
                                    feature_frequency = list_feature_counter[
                                        feature_counter_index][1]
                                    feature_frequency += 1

                                    list_feature_counter[
                                        feature_counter_index][
                                            1] = feature_frequency
                                    check_found = True
                                    break

                            # if this feature does not exist in the feature counter list
                            if not check_found:
                                feature_counter = []
                                feature_name = feature_set_name[feature_index]
                                feature_frequency = 1

                                feature_counter.append(feature_name)
                                feature_counter.append(feature_frequency)

                                list_feature_counter.append(feature_counter)

                # preparing data for evaluation and creating classifier
                # for class 'relapse'
                print(" # Process : Prepare classifiers and testing data")
                col_to_read_relapse_for_eval = ["ID_REF"]
                col_to_read_relapse_for_eval.extend(second_list_sample_relapse)
                file_training_input_relapse_for_eval = pd.read_csv(
                    "GSE2034-22071 (edited).csv",
                    nrows=row_to_read,
                    usecols=col_to_read_relapse_for_eval)
                top_n_genes_relapse_for_eval = file_training_input_relapse_for_eval.loc[
                    file_training_input_relapse_for_eval['ID_REF'].isin(
                        feature_set_name)].copy()
                top_n_genes_relapse_for_eval[
                    'gene_id'] = top_n_genes_relapse_for_eval['ID_REF'].apply(
                        lambda name: feature_set_name.index(name))
                top_n_genes_relapse_sorted_for_eval = top_n_genes_relapse_for_eval.sort_values(
                    by=['gene_id'])
                top_n_genes_relapse_sorted_for_eval.drop(columns='gene_id',
                                                         inplace=True)
                top_n_genes_relapse_sorted_for_eval.drop(columns='ID_REF',
                                                         inplace=True)
                # print(top_n_genes_relapse_sorted_for_eval)

                # for class 'no relapse'
                col_to_read_no_relapse_for_eval = ["ID_REF"]
                col_to_read_no_relapse_for_eval.extend(
                    second_list_sample_no_relapse)
                file_training_input_no_relapse_for_eval = pd.read_csv(
                    "GSE2034-22071 (edited).csv",
                    nrows=row_to_read,
                    usecols=col_to_read_no_relapse_for_eval)
                top_n_genes_no_relapse_for_eval = file_training_input_no_relapse_for_eval.loc[
                    file_training_input_no_relapse_for_eval['ID_REF'].isin(
                        feature_set_name)].copy()
                top_n_genes_no_relapse_for_eval[
                    'gene_id'] = top_n_genes_no_relapse_for_eval[
                        'ID_REF'].apply(
                            lambda name: feature_set_name.index(name))
                top_n_genes_no_relapse_sorted_for_eval = top_n_genes_no_relapse_for_eval.sort_values(
                    by=['gene_id'])
                top_n_genes_no_relapse_sorted_for_eval.drop(columns='gene_id',
                                                            inplace=True)
                top_n_genes_no_relapse_sorted_for_eval.drop(columns='ID_REF',
                                                            inplace=True)
                # print(top_n_genes_no_relapse_sorted_for_eval)

                first_layer_test_all = []
                first_layer_test_all.extend(first_layer_test_relapse)
                first_layer_test_all.extend(first_layer_test_no_relapse)
                # print(first_layer_test_all)

                col_to_read_first_layer_test_gene = ["ID_REF"]
                col_to_read_first_layer_test_gene.extend(first_layer_test_all)
                first_layer_test_gene = pd.read_csv(
                    "GSE2034-22071 (edited).csv",
                    nrows=row_to_read,
                    usecols=col_to_read_first_layer_test_gene)
                first_layer_top_n_test = first_layer_test_gene.loc[
                    first_layer_test_gene['ID_REF'].isin(feature_set_name)].copy()
                first_layer_top_n_test['gene_id'] = first_layer_top_n_test[
                    'ID_REF'].apply(lambda name: feature_set_name.index(name))
                first_layer_top_n_test_sorted = first_layer_top_n_test.sort_values(
                    by=['gene_id'])
                first_layer_top_n_test_sorted.drop(columns='gene_id',
                                                   inplace=True)

                top_n_test_sorted_for_eval = first_layer_top_n_test_sorted
                top_n_test_sorted_for_eval.drop(columns='ID_REF', inplace=True)

                # prepare list for input
                # list of all input data (testing data)
                list_first_layer_top_n_test_sorted = []
                for column in range(0, len(top_n_test_sorted_for_eval)):
                    list_each_sample = []
                    for element in top_n_test_sorted_for_eval.iloc[column]:
                        list_each_sample.append(element)

                    list_first_layer_top_n_test_sorted.append(list_each_sample)
                list_first_layer_top_n_test_sorted = list(
                    np.transpose(list_first_layer_top_n_test_sorted))

                # output for testing data
                first_layer_test_output = training_output.loc[training_output[
                    'GEO asscession number'].isin(first_layer_test_all)].copy()

                # sorting data according to its order in testing data
                list_sample_to_read_for_eval = list(
                    first_layer_top_n_test_sorted.columns.values)
                first_layer_test_output['sample_id'] = first_layer_test_output[
                    'GEO asscession number'].apply(
                        lambda name: list_sample_to_read_for_eval.index(name))
                first_layer_test_output = first_layer_test_output.sort_values(
                    by=['sample_id'])
                first_layer_test_output.drop(columns='sample_id', inplace=True)

                # create list of output
                list_desired_output_for_eval = []
                for element in first_layer_test_output.loc[:,
                                                           'relapse (1=True)']:
                    list_desired_output_for_eval.append(element)

                # list of gene expression and sample of class 'relapse' for evaluation
                list_top_n_gene_relapse_sorted_for_eval = []
                for column in range(0,
                                    len(top_n_genes_relapse_sorted_for_eval)):
                    list_each_sample = []
                    for element in top_n_genes_relapse_sorted_for_eval.iloc[
                            column]:
                        list_each_sample.append(element)
                    list_top_n_gene_relapse_sorted_for_eval.append(
                        list_each_sample)
                list_top_n_gene_relapse_sorted_for_eval = list(
                    np.transpose(list_top_n_gene_relapse_sorted_for_eval))

                # list of gene expression and sample of class 'no relapse' for evaluation
                list_top_n_gene_no_relapse_sorted_for_eval = []
                for column in range(
                        0, len(top_n_genes_no_relapse_sorted_for_eval)):
                    list_each_sample = []
                    for element in top_n_genes_no_relapse_sorted_for_eval.iloc[
                            column]:
                        list_each_sample.append(element)
                    list_top_n_gene_no_relapse_sorted_for_eval.append(
                        list_each_sample)
                list_top_n_gene_no_relapse_sorted_for_eval = list(
                    np.transpose(list_top_n_gene_no_relapse_sorted_for_eval))

                # calculate lda to get actual output
                input_relapse_for_eval = []
                for sample_index in range(
                        0, len(list_top_n_gene_relapse_sorted_for_eval)):
                    list_each_sample = []
                    for element_id in range(
                            0,
                            len(list_top_n_gene_relapse_sorted_for_eval[
                                sample_index])):
                        if (element_id in feature_set):
                            list_each_sample.append(
                                list_top_n_gene_relapse_sorted_for_eval[
                                    sample_index][element_id])
                    input_relapse_for_eval.append(list_each_sample)

                input_no_relapse_for_eval = []
                for sample_index in range(
                        0, len(list_top_n_gene_no_relapse_sorted_for_eval)):
                    list_each_sample = []
                    for element_id in range(
                            0,
                            len(list_top_n_gene_no_relapse_sorted_for_eval[
                                sample_index])):
                        if (element_id in feature_set):
                            list_each_sample.append(
                                list_top_n_gene_no_relapse_sorted_for_eval[
                                    sample_index][element_id])
                    input_no_relapse_for_eval.append(list_each_sample)

                input_testing_data_for_eval = []
                for sample_index in range(
                        0, len(list_first_layer_top_n_test_sorted)):
                    list_each_sample = []
                    for element_id in range(
                            0,
                            len(list_first_layer_top_n_test_sorted[
                                sample_index])):
                        if (element_id in feature_set):
                            list_each_sample.append(
                                list_first_layer_top_n_test_sorted[
                                    sample_index][element_id])
                    input_testing_data_for_eval.append(list_each_sample)

                list_actual_output_for_eval = calculate.lda(
                    input_testing_data_for_eval, input_relapse_for_eval,
                    input_no_relapse_for_eval)

                # calculate AUC score
                auc_score_for_eval = roc_auc_score(
                    list_desired_output_for_eval, list_actual_output_for_eval)
                list_auc_score.append(auc_score_for_eval)

                print("#### Evaluation of " + str(first_layer_test_index + 1) +
                      " - fold ####")
                print(" Feature Set : " + str(feature_set_name))
                print(" Actual Output : " + str(list_actual_output_for_eval))
                print(" Desired Output : " + str(list_desired_output_for_eval))
                print(" AUC ROC score = " + str(auc_score_for_eval))

                # track the feature set which gives the maximum AUC score
                if (auc_score_for_eval > auc_score_max):
                    list_feature_set_max_auc = deepcopy(feature_set_name)
                    auc_score_max = auc_score_for_eval

                # write output to an output file
                result_file.write("Fold : " + str(first_layer_test_index + 1) +
                                  "\n")
                result_file.write("Feature Set : " + str(feature_set_name) +
                                  "\n")
                result_file.write("Actual Output : " +
                                  str(list_actual_output_for_eval) + "\n")
                result_file.write("Desired Output : " +
                                  str(list_desired_output_for_eval) + "\n")
                result_file.write("AUC ROC Score from testing : " +
                                  str(auc_score_for_eval) + "\n")
                result_file.write("\n")

        list_avg_auc_each_epoch.append(calculate.mean(list_auc_score))

        # record ending time of this iteration
        end_epoch_time = time.time()
        time_elapse_epoch_second = end_epoch_time - start_epoch_time
        time_elapse_epoch_minute = time_elapse_epoch_second / 60
        time_elapse_epoch_hour = time_elapse_epoch_minute / 60

        time_elapse_epoch_minute = round(time_elapse_epoch_minute, 2)
        time_elapse_epoch_hour = round(time_elapse_epoch_hour, 2)

        result_file.write("\n#### Summary ####\n")
        result_file.write("Average AUC score : " +
                          str(calculate.mean(list_auc_score)) + "\n")
        result_file.write("AUC score from feature selection in each fold : " +
                          str(list_max_auc) + "\n")
        result_file.write(
            "Size of feature set which gives the highest AUC score from testing : "
            + str(len(list_feature_set_max_auc)))
        result_file.write("\n")
        result_file.write(
            "Feature set which gives the highest AUC score from testing : " +
            "\n")
        result_file.write(str(list_feature_set_max_auc))
        result_file.write("\n")
        result_file.write("Time Elapse : " + str(time_elapse_epoch_minute) +
                          " minutes (" + str(time_elapse_epoch_hour) +
                          " hours)\n")
        print(" Time Elapse : " + str(time_elapse_epoch_minute) +
              " minutes (" + str(time_elapse_epoch_hour) + " hours)\n")
        print(" AUC score from feature selection in each fold  = " +
              str(list_max_auc))

    # calculate the mean over all epochs
    mean_over_all_epoch = calculate.mean(list_avg_auc_each_epoch)
    print(" Average AUC score over " + str(epoch) + " epochs : " +
          str(mean_over_all_epoch))
    result_file.write("\n")
    result_file.write("Average AUC score over " + str(epoch) + " epochs : " +
                      str(mean_over_all_epoch) + "\n")
    result_file.write("\n")

    # rank feature frequency
    if (len(list_feature_counter) < 10):
        num_of_top_frequent_pathway = len(list_feature_counter)
    else:
        # default number of features to be shown is 10
        num_of_top_frequent_pathway = 10

    # rank pathway frequency in descending order
    list_feature_counter.sort(key=lambda x: x[1], reverse=True)

    # add top pathways to a list to be shown
    list_top_pathway_frequency = []
    for top_pathway_index in range(0, num_of_top_frequent_pathway):
        list_top_pathway_frequency.append(
            list_feature_counter[top_pathway_index])

    print(" Feature frequency : ")
    result_file.write("\n")
    result_file.write("Feature frequency :\n")
    for index in range(0, len(list_top_pathway_frequency)):
        feature_name = list_top_pathway_frequency[index][0]
        feature_frequency = list_top_pathway_frequency[index][1]

        print(" " + str(index + 1) + ". " + str(feature_name) + " : " +
              str(feature_frequency))
        result_file.write(
            str(index + 1) + ". " + str(feature_name) + " : " +
            str(feature_frequency) + "\n")

    print()
    result_file.write("\n")

    # record end time
    end_time = time.time()
    time_elapse_second = end_time - start_time
    time_elapse_minute = time_elapse_second / 60
    time_elapse_hour = time_elapse_minute / 60

    time_elapse_minute = round(time_elapse_minute, 2)
    time_elapse_hour = round(time_elapse_hour, 2)

    print(" Total Time Elapse : " + str(time_elapse_minute) + " minutes (" +
          str(time_elapse_hour) + " hours)")
    result_file.write("Total Time Elapse : " + str(time_elapse_minute) +
                      " minutes (" + str(time_elapse_hour) + " hours)\n")

    result_file.close()
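
To see the selection loop in isolation, here is a minimal, self-contained sketch of the same greedy sequential forward selection idea, using scikit-learn's LinearDiscriminantAnalysis in place of calculate.lda. The synthetic arrays and every name below are illustrative assumptions, not taken from the script above.

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
X_train = rng.normal(size=(60, 20))      # 60 samples x 20 ranked genes
y_train = np.array([0, 1] * 30)          # balanced binary labels
X_test = rng.normal(size=(20, 20))
y_test = np.array([0, 1] * 10)

selected = [0]                           # start from the top-ranked gene
best_auc = 0.0
for _ in range(X_train.shape[1] - 1):
    best_candidate = None
    for i in range(X_train.shape[1]):
        if i in selected:
            continue
        trial = selected + [i]
        clf = LinearDiscriminantAnalysis().fit(X_train[:, trial], y_train)
        auc = roc_auc_score(y_test, clf.decision_function(X_test[:, trial]))
        if auc > best_auc:
            best_auc, best_candidate = auc, i
    if best_candidate is None:           # stop when no candidate improves AUC
        break
    selected.append(best_candidate)

print("selected gene indices:", sorted(selected), "best AUC:", round(best_auc, 3))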
Ejemplo n.º 14
    def test_mean(self):
        self.assertEqual(calculate.mean([1, 2, 3]), 2.0)
        self.assertEqual(calculate.mean([1, 99]), 50.0)
        self.assertEqual(calculate.mean([2, 3, 3]), 2.6666666666666665)
        self.assertRaises(ValueError, calculate.mean, ['a', 0.2, 3])
def standard_deviation_ellipses(geoqueryset,
                                point_attribute_name='point',
                                num_of_std=1,
                                fix_points=True):
    """
	Accepts a GeoQuerySet and generates one or more standard deviation ellipses
	demonstrating the geospatial distribution of where its points occur.
	
	Returns a one-to-many list of the ellipses as Polygon objects. 
	
	The standard deviation ellipse illustrates the average variation in 
	the distance of points from the mean center, as well as their direction.
	
	By default, the function expects the Point field on your model to be called 'point'.
	
	If the point field is called something else, change the kwarg 'point_attribute_name'
	to whatever your field might be called.
	
	Also by default, the function will nudge slightly apart any identical points and 
	only return the first standard deviation ellipse. If you'd like to change that behavior,
	change the corresponding kwargs.
	
	h3. Example usage
	
		>> import calculate
		>> calculate.standard_deviation_ellipses(qs)
		[<Polygon object at 0x77a1c34>]
	
	h3. Dependencies
	
		* "django":http://www.djangoproject.com/
		* "geodjango":http://www.geodjango.org/
		* "psql ellipse() function":http://postgis.refractions.net/support/wiki/index.php?plpgsqlfunctions
	
	h3. Documentation

		* "standard deviation ellipse":http://www.spatialanalysisonline.com/output/html/Directionalanalysisofpointdatasets.html
		* "This code is translated from SQL by Francis Dupont":http://postgis.refractions.net/pipermail/postgis-users/2008-June/020354.html
		
	"""
    if not isinstance(geoqueryset, GeoQuerySet):
        raise TypeError(
            'First parameter must be a Django GeoQuerySet. You submitted a %s object'
            % type(geoqueryset))

    n = len(geoqueryset)

    if n < 3:
        return [None]

    if fix_points:
        calculate.nudge_points(geoqueryset,
                               point_attribute_name=point_attribute_name)

    avg_x = calculate.mean(
        [abs(getattr(p, point_attribute_name).x) for p in geoqueryset])
    avg_y = calculate.mean(
        [abs(getattr(p, point_attribute_name).y) for p in geoqueryset])
    center_x = calculate.mean(
        [getattr(p, point_attribute_name).x for p in geoqueryset])
    center_y = calculate.mean(
        [getattr(p, point_attribute_name).y for p in geoqueryset])

    sum_square_diff_avg_x = sum([
        math.pow((abs(getattr(p, point_attribute_name).x) - avg_x), 2)
        for p in geoqueryset
    ])
    sum_square_diff_avg_y = sum([
        math.pow((abs(getattr(p, point_attribute_name).y) - avg_y), 2)
        for p in geoqueryset
    ])
    sum_diff_avg_x_y = sum([(abs(getattr(p, point_attribute_name).x) - avg_x) *
                            (abs(getattr(p, point_attribute_name).y) - avg_y)
                            for p in geoqueryset])
    sum_square_diff_avg_x_y = sum([
        math.pow((abs(getattr(p, point_attribute_name).x) - avg_x) *
                 (abs(getattr(p, point_attribute_name).y) - avg_y), 2)
        for p in geoqueryset
    ])
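    # The next two statements derive the rotation angle (theta) of the
    # ellipse's major axis from the variance and covariance sums above.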
    constant = math.sqrt(
        math.pow((sum_square_diff_avg_x - sum_square_diff_avg_y), 2) +
        (4 * sum_square_diff_avg_x_y))
    theta = math.atan(
        (sum_square_diff_avg_x - sum_square_diff_avg_y + constant) /
        (2 * sum_diff_avg_x_y))

    stdx_sum_x_y_cos_sin_theta = sum([
        math.pow((((getattr(p, point_attribute_name).x - center_x) *
                   math.cos(theta)) -
                  ((getattr(p, point_attribute_name).y - center_y) *
                   math.sin(theta))), 2) for p in geoqueryset
    ])
    stdy_sum_x_y_sin_cos_theta = sum([
        math.pow((((getattr(p, point_attribute_name).x - center_x) *
                   math.sin(theta)) -
                  ((getattr(p, point_attribute_name).y - center_y) *
                   math.cos(theta))), 2) for p in geoqueryset
    ])

    stdx = math.sqrt((2 * stdx_sum_x_y_cos_sin_theta) / (n - 2))
    stdy = math.sqrt((2 * stdy_sum_x_y_sin_cos_theta) / (n - 2))

    results = []
    from django.db import connection
    cursor = connection.cursor()
    while num_of_std:
        cursor.execute(
            "SELECT ellipse(%s, %s, (%s * %s), (%s * %s), %s, 40);",
            (center_x, center_y, num_of_std, stdx, num_of_std, stdy, theta))
        results.append(fromstr(cursor.fetchall()[0][0], srid=4326))
        num_of_std -= 1
    return results
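
For readers without a GeoDjango stack, here is a minimal, dependency-free sketch of the same ellipse math applied to plain (x, y) tuples. The function name and the sample points are illustrative assumptions, not part of the calculate API; it returns the center, rotation angle, and axis lengths rather than a Polygon.

import math

def sd_ellipse_params(points):
    # Mirrors the sums computed by standard_deviation_ellipses above.
    n = len(points)
    xs = [abs(x) for x, y in points]
    ys = [abs(y) for x, y in points]
    avg_x, avg_y = sum(xs) / n, sum(ys) / n
    center_x = sum(x for x, y in points) / n
    center_y = sum(y for x, y in points) / n
    ssd_x = sum((x - avg_x) ** 2 for x in xs)
    ssd_y = sum((y - avg_y) ** 2 for y in ys)
    sd_xy = sum((x - avg_x) * (y - avg_y) for x, y in zip(xs, ys))
    ssd_xy = sum(((x - avg_x) * (y - avg_y)) ** 2 for x, y in zip(xs, ys))
    constant = math.sqrt((ssd_x - ssd_y) ** 2 + 4 * ssd_xy)
    theta = math.atan((ssd_x - ssd_y + constant) / (2 * sd_xy))
    stdx = math.sqrt(2 * sum(
        ((x - center_x) * math.cos(theta) -
         (y - center_y) * math.sin(theta)) ** 2 for x, y in points) / (n - 2))
    stdy = math.sqrt(2 * sum(
        ((x - center_x) * math.sin(theta) -
         (y - center_y) * math.cos(theta)) ** 2 for x, y in points) / (n - 2))
    return (center_x, center_y), theta, stdx, stdy

print(sd_ellipse_params([(1.0, 2.0), (2.5, 3.5), (4.0, 1.0), (3.0, 5.0)]))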
Ejemplo n.º 16
def percentile(data_list, value, kind='weak'):
    """
    Accepts a sample of values and a single number to compare to it
    and determine its percentile rank.

    A percentile of, for example, 80 means that 80 percent of the
    scores in the sequence are below the given score.

    In the case of gaps or ties, the exact definition depends on the type
    of the calculation stipulated by the "kind" keyword argument.

    There are three kinds of percentile calculations provided here. The
    default is "weak".

        1. "weak"

            Corresponds to the definition of a cumulative
            distribution function, with the result generated
            by returning the percentage of values less than
            or equal to the provided value.

        2. "strict"

            Similar to "weak", except that only values that are
            less than the given score are counted. This can often
            produce a result much lower than "weak" when the provided
            score is occurs many times in the sample.

        3. "mean"

            The average of the "weak" and "strict" scores.

    h3. Example usage

        >>> import calculate
        >>> calculate.percentile([1, 2, 3, 4], 3)
        75.0
        >>> calculate.percentile([1, 2, 3, 3, 4], 3, kind='strict')
        40.0
        >>> calculate.percentile([1, 2, 3, 3, 4], 3, kind='weak')
        80.0
        >>> calculate.percentile([1, 2, 3, 3, 4], 3, kind='mean')
        60.0

    h3. Documentation

        * "Percentile rank":http://en.wikipedia.org/wiki/Percentile_rank

    h3. Credits

        This function is a modification of scipy.stats.percentileofscore. The
        only major difference is that I eliminated the numpy dependency, and
        omitted the rank kwarg option until I can find time to translate
        the numpy parts out.
    """
    # Convert all the values to floats and test to make sure
    # there aren't any strings in there
    try:
        data_list = list(map(float, data_list))
    except ValueError:
        raise ValueError('Input values must contain numbers')

    # Find the number of values in the sample
    n = float(len(data_list))

    if kind == 'strict':
        # If the selected method is strict, count the number of values
        # below the provided one and then divide by n
        return len([i for i in data_list if i < value]) / n * 100

    elif kind == 'weak':
        # If the selected method is weak, count the number of values
        # equal to or below the provided one and then divide by n
        return len([i for i in data_list if i <= value]) / n * 100

    elif kind == 'mean':
        # If the selected method is mean, take the strict and weak
        # results and average them.
        strict = len([i for i in data_list if i < value]) / n * 100
        weak = len([i for i in data_list if i <= value]) / n * 100
        return calculate.mean([strict, weak])
    else:
        raise ValueError("The kind kwarg must be 'strict', 'weak' or 'mean'. \
You can also opt to leave it out and rely on the default method.")
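
As a quick illustration of how the tie handling differs, the sketch below compares the three kinds on a tie-heavy sample; the sample values are arbitrary, and "mean" is, by definition, the average of "strict" and "weak".

import calculate

sample = [1, 2, 3, 3, 3, 4]
strict = calculate.percentile(sample, 3, kind='strict')  # 2/6 -> 33.33...
weak = calculate.percentile(sample, 3, kind='weak')      # 5/6 -> 83.33...
mean = calculate.percentile(sample, 3, kind='mean')
assert mean == calculate.mean([strict, weak])
print(strict, weak, mean)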