Example no. 1
0
def compute_normalization(infasta, kmer_length):
    """Compute k-mer normalization statistics for a FASTA input.

    Parameters
    ----------
    infasta : FASTA input accepted by ``kmer_counts.BasicCounter``
    kmer_length : int
        k-mer size passed through as ``k``.

    Returns
    -------
    tuple
        ``(mean, std)`` as exposed by the counter after counting.
    """
    normalizer = kmer_counts.BasicCounter(
        infasta,
        outfile=None,
        k=kmer_length,
        label=True,
        silent=True,
        binary=True,
        mean=True,
        std=True,
    )
    # Counting populates the .mean and .std attributes read below.
    normalizer.make_count_file()
    return normalizer.mean, normalizer.std
Example no. 2
0
def compute_normalization_and_frequency(infasta,
                                        kmer_length,
                                        return_normalized=True,
                                        outfile=None):
    """Count k-mer frequencies and return them with their normalization stats.

    Parameters
    ----------
    infasta : FASTA input accepted by ``kmer_counts.BasicCounter``
    kmer_length : int
        k-mer size passed through as ``k``.
    return_normalized : bool, optional
        When False, the standardization is undone in place before returning.
    outfile : optional
        Forwarded to the counter; destination for the count file.

    Returns
    -------
    tuple
        ``(mean, std, counts, names)``.
    """
    freq_counter = kmer_counts.BasicCounter(
        infasta,
        outfile=outfile,
        k=kmer_length,
        label=True,
        silent=True,
        binary=False,
        mean=True,
        std=True,
    )
    freq_counter.make_count_file()

    if not return_normalized:
        # Invert the standardization in place: x -> x * std + mean.
        freq_counter.counts *= freq_counter.std
        freq_counter.counts += freq_counter.mean

    sequence_names = get_names_from_counter(freq_counter)
    return freq_counter.mean, freq_counter.std, freq_counter.counts, sequence_names
Example no. 3
0
def run_seekr_algorithm(parameters):
    """
    Launch the SEEKR algorithm using the parameters from the Web Service and return a zip file of the results

    The logic in this method tries to avoid recalculating k-mers and normalization for performance reasons.  The logic
    otherwise would be simpler -
    1) Calculate the normalization (mean and std dev)
    2) Calculate the frequencies of the user set and then apply the normalization from step 1
    3) Calculate the frequencies of the comparison set if it exists and apply the normalization from step 1
    4) Calculate the Pearson's R correlations between sequences in the user set and the comparison set.
        If no comparison set exists, calculate the correlations between the user set sequences

    In any of these steps, if we already have a precomputed value, we will load that instead of performing the computation.

    Returns
    -------
    tuple
        ``(counts_text, pearsons_file_in_memory)`` — the counts CSV contents as
        text and the similarity matrix serialized to .npy bytes.

    Raises
    ------
    SeekrServerError
        If no normalization set is provided or the requested normalization is
        not valid for the selected comparison mode.

    Notes
    -----
    numpy's corrcoef is an efficient way to calculate Pearson's correlations, but since its implementation computes a
    covariance matrix, the output is always a square matrix.  So if we had 10 sequences in a user set and compare
    against 10,000 sequences in a comparision set, numpy.corrcoef will calculate a matrix that is 10,010x10,010.
    The pearson function called supports non-square matrices and is thus used for comparing against the comparision set.
    e.g. it's matrix would be 10x10,000.

    """
    outfile = 'test1.csv'
    mean_std_loaded = False
    names = None
    comparison_names = None

    # Precomputed normalization (mean/std .npy files) takes priority over
    # recomputing the statistics below.
    normalization_path = get_precomputed_normalization_path(parameters)
    if normalization_path is not None:
        mean = np.load(normalization_path[0])
        std = np.load(normalization_path[1])
        mean_std_loaded = True

    normal_set = parameters['normal_set']
    if normal_set is None:
        raise SeekrServerError('No normalization set Provided')
    comparison_set = None
    if 'comparison_set' in parameters:
        comparison_set = parameters['comparison_set']

    if 'comparison_set_files' in parameters:
        # Comparison set uploaded as a file: k-mers must be counted for both sets.
        if normal_set == skr_config.SETTING_USER_SET:
            (mean, std, counts, names) = compute_normalization_and_frequency(
                infasta=TextIOWrapper(parameters['user_set_files']),
                kmer_length=parameters['kmer_length'], outfile=outfile)
            counter = kmer_counts.BasicCounter(infasta=TextIOWrapper(parameters['comparison_set_files']), outfile=None,
                                               k=parameters['kmer_length'],
                                               label=True, silent=True, binary=False, mean=mean, std=std)
            comparison_counts = counter.make_count_file()
            comparison_names = get_names_from_counter(counter)
        elif normal_set == skr_config.SETTING_COMPARISION_SET:
            (mean, std, comparison_counts, comparison_names) = compute_normalization_and_frequency(
                infasta=TextIOWrapper(parameters['comparison_set_files']),
                kmer_length=parameters['kmer_length'])
            counter = kmer_counts.BasicCounter(infasta=TextIOWrapper(parameters['user_set_files']), outfile=outfile,
                                               k=parameters['kmer_length'],
                                               label=True, silent=True, binary=False, mean=mean, std=std)
            counts = counter.make_count_file()
            names = get_names_from_counter(counter)
        elif mean_std_loaded:
            counter = kmer_counts.BasicCounter(infasta=TextIOWrapper(parameters['user_set_files']), outfile=outfile,
                                               k=parameters['kmer_length'],
                                               label=True, silent=True, binary=False, mean=mean, std=std)
            counts = counter.make_count_file()

            comparision_counter = kmer_counts.BasicCounter(infasta=TextIOWrapper(parameters['comparison_set_files']),
                                                           outfile=None,
                                                           k=parameters['kmer_length'],
                                                           label=True, silent=True, binary=False, mean=mean, std=std)
            comparison_counts = comparision_counter.make_count_file()
            names = get_names_from_counter(counter)
            comparison_names = get_names_from_counter(comparision_counter)
        else:
            raise SeekrServerError('Normalization for Comparision Set File is not valid')

        # pearson handles the non-square user-set x comparison-set matrix.
        similarity = pearson(counts, comparison_counts)
    elif comparison_set is not None and len(comparison_set) > 0 and comparison_set != 'user_set':
        # Named comparison set: reuse precomputed (unnormalized) frequencies.
        unnormalized_frequency_path, names_path = get_precomputed_frequency_path(comparison_set,
                                                                                 parameters['kmer_length'])
        assert unnormalized_frequency_path is not None and names_path is not None

        if normal_set == skr_config.SETTING_USER_SET:
            (mean, std, counts, names) = compute_normalization_and_frequency(
                infasta=TextIOWrapper(parameters['user_set_files']),
                kmer_length=parameters['kmer_length'], outfile=outfile)
            # BUG FIX: the original code additionally built a BasicCounter from
            # parameters['comparison_set_files'] here.  That key cannot be
            # present in this branch (see the enclosing elif), so the lookup
            # would raise KeyError -- and the counter's output was never used.
            # The comparison counts come from the precomputed frequencies:
            comparison_counts = _unnormalized_frequency_to_normalized(unnormalized_frequency_path, mean, std)
            comparison_names = load_names_from_path(names_path)
        elif normal_set == skr_config.SETTING_COMPARISION_SET:
            raise SeekrServerError('')
        elif mean_std_loaded:
            counter = kmer_counts.BasicCounter(infasta=TextIOWrapper(parameters['user_set_files']), outfile=outfile,
                                               k=parameters['kmer_length'],
                                               label=True, silent=True, binary=False, mean=mean, std=std)
            counts = counter.make_count_file()

            comparison_counts = _unnormalized_frequency_to_normalized(unnormalized_frequency_path, mean, std)
            names = get_names_from_counter(counter)
            comparison_names = load_names_from_path(names_path)
        else:
            raise SeekrServerError('No normalization set Provided')

        similarity = pearson(counts, comparison_counts)
    else:
        # No comparison set: correlate the user set against itself.
        if mean_std_loaded:
            counter = kmer_counts.BasicCounter(infasta=TextIOWrapper(parameters['user_set_files']), outfile=outfile,
                                               k=parameters['kmer_length'],
                                               label=True, silent=True, binary=False, mean=mean, std=std)
            counts = counter.make_count_file()
        elif normal_set == skr_config.SETTING_USER_SET:
            counter = kmer_counts.BasicCounter(infasta=TextIOWrapper(parameters['user_set_files']), outfile=outfile,
                                               k=parameters['kmer_length'],
                                               label=True, silent=True, binary=False)
            counts = counter.make_count_file()
        else:
            raise SeekrServerError('Normalization type is not valid')

        names = get_names_from_counter(counter)
        # Square self-comparison: corrcoef is appropriate here.
        similarity = np.corrcoef(counts)

    #TODO refactor - original code saved to csv on disk - move this to a separate operation
    with open(outfile) as csvFile:
        counts_text = csvFile.read()

    # Serialize the similarity matrix to .npy bytes without touching disk.
    bytes_io = BytesIO()
    np.save(bytes_io, similarity)
    bytes_io.seek(0)
    pearsons_file_in_memory = bytes_io.read()

    return counts_text, pearsons_file_in_memory