Esempio n. 1
0
    # Load a second alignment table (peptide x virus identity scores) and
    # merge it with the previously loaded one.
    # NOTE(review): `f`, `orig_aln`, `peptide_lib`, `flex_array` and
    # `dependent_peptides` are bound earlier in the enclosing scope, outside
    # this fragment.
    new_aln = f.flat_file_to_df([0, 1, 15])

    # Original alignment rows are named "<prefix>_<peptide_id>"; keep only the
    # peptide id so both tables share the same index scheme before concat.
    orig_aln.index = [i.split('_')[1] for i in orig_aln.index]
    aln_df = pd.concat([orig_aln, new_aln])
    aln_df.fillna(0, inplace=True)
    #################

    # filter out subset viruses and those that have less than 10 peptides >=80% pos
    aln_df = aln_df.reindex(peptide_lib).fillna(0)
    print("filled NA", flush=True)
    # Binarise: True where alignment identity >= 80%, False elsewhere.
    binary_b = aln_df[aln_df >= 80].fillna(0)
    binary_b = pd.DataFrame(index=binary_b.index,
                            columns=binary_b.columns,
                            data=binary_b.values,
                            dtype=bool)
    binary_b = flex_array.array(binary_b).filter_aln(ref_seq='../ref_seq/')
    aln_df = aln_df.loc[:, binary_b.
                        columns]  #use the resulting columns from filtering the binary matrix
    # Rebuild the boolean mask from the (column-filtered) score matrix so the
    # mask and the scores stay aligned.
    binary_b = pd.DataFrame(index=aln_df.index,
                            columns=aln_df.columns,
                            data=aln_df.values,
                            dtype=bool)
    print("filtered", flush=True)

    # dictionary of list of alignments for each virus. keys: virus strings, values: Series objects containing only nonzero alignments
    virus_aln = {i: aln_df.loc[binary_b[i], i] for i in binary_b.columns}
    # above, but with only the xr peptides (nonzero yet < 80% identity;
    # presumably cross-reactive -- confirm against flex_array usage)
    virus_xr = {i: virus_aln[i][virus_aln[i] < 80] for i in binary_b.columns}
    # only 'evidence' peptides (reassigned variable name)
    virus_aln = {i: virus_aln[i][virus_aln[i] >= 80] for i in binary_b.columns}
    dep_pep = dependent_peptides()
Esempio n. 2
0
    def run_analysis(self):
        """Run the per-sample binomial virus-assignment analysis.

        For every sample column of the z-score table: select hits above the
        Z threshold, optionally reduce them to independent (non-overlapping)
        peptides, reassign counts per virus via ``binom_reassign``, apply a
        per-sample Benjamini-Hochberg correction, and write the count,
        peptide, p-value and summary files under ``self.par['sub_dir']``.

        Returns:
            None. All results are written to disk.
        """
        zdf = flex_array.standard_df(self.par['zscore_file'])

        # Binary peptide-by-virus alignment matrix, aligned to the z-score rows.
        binary_b = flex_array.sparse_aln_df(self.par['file_aln'])
        binary_b = flex_array.array(binary_b).filter_aln()
        binary_b = binary_b.reindex(zdf.index).fillna(0)

        # Result containers: rows are viruses, columns are samples.
        sum_df = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
        glob_unique = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
        pep_df = pd.DataFrame(np.nan, index=list(binary_b), columns=list(zdf))
        p_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        padjust_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        orig_p = pd.DataFrame(index=list(binary_b), columns=list(zdf))

        # dtype pinned to float (the historical pandas default): recent pandas
        # would otherwise create object-dtype Series with a warning, changing
        # the str() formatting of the values written to the summary file.
        hits_series = pd.Series(index=list(zdf), dtype=float)
        nonoverlap_hits_series = pd.Series(index=list(zdf), dtype=float)
        samples = list(zdf.columns)

        nonoverlap_dict = {}

        # DataFrame.iteritems() was removed in pandas 2.0; items() is the
        # drop-in replacement.
        for sample_name, column in zdf.items():
            hits = column[column >= self.par['Z_threshold']].copy()
            if self.par['use_filter']:
                nonoverlap_hits = flex_array.array(hits).gen_ind_hits(
                    self.dependent_pep, self.par['graph_dir'],
                    samples.index(sample_name))
            else:
                nonoverlap_hits = hits.copy()
            input_num = len(nonoverlap_hits)
            hits_series[sample_name] = len(hits)
            nonoverlap_hits_series[sample_name] = input_num
            nonoverlap_dict[sample_name] = list(nonoverlap_hits.index)
            print("%s:\thits=%s, nonoverlapped=%s" %
                  (sample_name, len(hits), input_num))

            if input_num > 0:
                zb_df = binary_b.loc[nonoverlap_hits.index]
                (collapse_zb, glob_array, sim_tag, p_series,
                 orig_pseries) = flex_array.array(zb_df).binom_reassign(
                     input_num, self.par['dir_ref_seq'],
                     self.par['p_threshold'], self.par['x_threshold'],
                     self.par['organism'])
                sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
                glob_unique[sample_name] = glob_array.apply(sum,
                                                            axis=0) + sim_tag
                pep_df[sample_name] = collapse_zb.apply(
                    lambda x: flex_array.array(x).names_string(0.001), axis=0)
                p_df[sample_name] = p_series
                orig_p[sample_name] = orig_pseries

        # Output prefix: z-score file name without path and extension.
        file_head = self.par['sub_dir'] + self.par['zscore_file'].split(
            '/')[-1].split('.')[0]
        if self.par['organism']:
            file_head += '_organism_'
        else:
            file_head += '_species_'

        #Write log file
        params.file_IO(self.par['sub_dir'] + 'parameters.log',
                       sep='=').dict_to_file(self.par)

        #Write analysis files
        sum_df.to_csv(file_head + 'total-counts.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        glob_unique.to_csv(file_head + 'unique-counts.txt',
                           sep='\t',
                           header=True,
                           index_label='Specie')
        pep_df.to_csv(file_head + 'peptides.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        p_df.to_csv(file_head + 'p-values.txt',
                    sep='\t',
                    header=True,
                    index_label='Specie')
        orig_p.to_csv(file_head + 'orig-p-values.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')

        # Benjamini-Hochberg adjustment, per sample, over the finite p-values
        # only (NaN rows stay NaN).
        for i in p_df.columns:
            pvals = np.array(p_df[i].values)
            if not pd.isnull(pvals).all():
                mask = [j for j in np.where(np.isfinite(pvals))[0]]
                pval_corrected = np.empty(pvals.shape)
                pval_corrected.fill(np.nan)
                pval_corrected[mask] = multipletests(pvals[mask],
                                                     method='fdr_bh')[1]
                padjust_df[i] = pval_corrected
        padjust_df.to_csv(file_head + 'p-adjusted.txt',
                          sep='\t',
                          header=True,
                          index_label='Specie')

        #Write independent peptides file (context manager guarantees close)
        with open(self.par['sub_dir'] + 'independent_peptides.txt', 'w') as f:
            for i in samples:
                f.write(i)
                for j in nonoverlap_dict[i]:
                    f.write('\t' + str(j))
                f.write('\n')

        #Write summary file: one row per (sample, virus) passing both the BH
        #threshold and the minimum assigned-count threshold.
        with open(self.par['sub_dir'] + 'results_summary.txt', 'w') as f:
            f.write(
                "Sample name\tVirus\tBH p-value\tRaw p-value\tOrig p-value\tAssigned counts\t"
            )
            f.write(
                "Assigned peptides\tTotal sample hits\tTotal filtered sample hits\n"
            )
            for i in samples:
                BH = padjust_df[i]
                BH = BH[BH < self.par['bh_threshold']]
                p_value = p_df[i][BH.index]
                orig_pvalue = orig_p[i][BH.index]
                counts = sum_df[i][BH.index]
                peptides = pep_df[i][BH.index]

                for j in BH.index:
                    if counts[j] > self.par['x_threshold']:
                        f.write(i + '\t')
                        f.write(j + '\t' + str(BH[j]) + '\t')
                        f.write(
                            str(p_value[j]) + '\t' + str(orig_pvalue[j]) +
                            '\t')
                        f.write(
                            str(counts[j]) + '\t' + str(peptides[j]) + '\t')
                        f.write(
                            str(hits_series[i]) + '\t' +
                            str(nonoverlap_hits_series[i]) + '\n')
        print("End of run.")
        return None
Esempio n. 3
0
    def run_analysis(self):
        """Run the per-sample binomial virus-assignment analysis in parallel.

        Builds the alignment score matrix from the two tblastn flat files,
        binarises/filters it, selects per-sample hits above the Z threshold,
        runs ``flex_array.binom_reassign`` for all samples in parallel
        (joblib), then applies a per-sample Benjamini-Hochberg correction and
        writes the count/peptide/p-value tables and the summary file.

        Returns:
            None. All results are written to disk.
        """
        zdf = flex_array.standard_df(self.par['zscore_file'])

        # Merge the original and the new tblastn alignment tables into one
        # peptide-by-virus identity-score DataFrame.
        f = params.file_IO(
            '../ref_seq/pep_against_human_viruses.tblastn.species.noseg.WS3.max_target_seqs100000.180625 (1).m8',
            '\t')
        orig_aln = f.flat_file_to_df([0, 1, 15])
        f = params.file_IO(
            '../ref_seq/new_pep_against_human_viruses.tblastn.species.noseg.WS3.max_target_seqs100000.180625 (1).m8',
            '\t')
        new_aln = f.flat_file_to_df([0, 1, 15])

        # Original rows are named "<prefix>_<peptide_id>"; keep the id only so
        # both tables share the same index scheme.
        orig_aln.index = [i.split('_')[1] for i in orig_aln.index]
        aln_df = pd.concat([orig_aln, new_aln])
        aln_df.fillna(0, inplace=True)

        # Binarise at >= 80% identity, filter subset viruses, and restrict the
        # score matrix to the surviving virus columns.
        binary_b = aln_df[aln_df >= 80].fillna(0)
        binary_b = pd.DataFrame(index=binary_b.index,
                                columns=binary_b.columns,
                                data=binary_b.values,
                                dtype=bool)
        binary_b = flex_array.array(binary_b).filter_aln(
            ref_seq=self.par['dir_ref_seq'])
        binary_b = binary_b.reindex(zdf.index).fillna(0)
        aln_df = aln_df.loc[:, binary_b.columns]

        # Result containers: rows are viruses, columns are samples.
        sum_df = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
        glob_unique = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
        pep_df = pd.DataFrame(np.nan, index=list(binary_b), columns=list(zdf))
        p_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        n_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        padjust_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        orig_p = pd.DataFrame(index=list(binary_b), columns=list(zdf))
        filter_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))

        # dtype pinned to float (the historical pandas default): recent pandas
        # would otherwise create object-dtype Series with a warning, changing
        # the str() formatting of the values written to the summary file.
        hits_series = pd.Series(index=list(zdf), dtype=float)
        nonoverlap_hits_series = pd.Series(index=list(zdf), dtype=float)
        samples = list(zdf.columns)

        nonoverlap_dict = {}

        # Inputs for the parallel binom_reassign stage, keyed by sample name.
        parallel_dict1 = {}  # sample -> alignment-score rows of its hits
        parallel_dict2 = {}  # sample -> non-overlapping hits Series

        # DataFrame.iteritems() was removed in pandas 2.0; items() is the
        # drop-in replacement.
        for sample_name, column in zdf.items():
            hits = column[column >= self.par['Z_threshold']].copy()
            if self.par['use_filter']:
                nonoverlap_hits = flex_array.gen_ind_hits(
                    hits, self.dependent_pep, self.par['graph_dir'],
                    samples.index(sample_name))
                input_num = len(nonoverlap_hits)
            else:
                # Keep all hits, but still report the filtered count.
                nonoverlap_hits = hits.copy()
                input_num = len(
                    flex_array.gen_ind_hits(hits, self.dependent_pep,
                                            self.par['graph_dir'],
                                            samples.index(sample_name)))
            hits_series[sample_name] = len(hits)
            nonoverlap_hits_series[sample_name] = input_num
            nonoverlap_dict[sample_name] = list(nonoverlap_hits.index)
            print("%s:\thits=%s, nonoverlapped=%s" %
                  (sample_name, len(hits), input_num))

            if input_num > 0:
                parallel_dict1[sample_name] = aln_df.loc[nonoverlap_hits.index]
                parallel_dict2[sample_name] = nonoverlap_hits

        sample_names = list(parallel_dict1.keys())
        zb_dfs = [parallel_dict1[s] for s in sample_names]
        hit_series_list = [parallel_dict2[s] for s in sample_names]

        # One binom_reassign job per sample, fanned out across all cores.
        results = Parallel(n_jobs=-1)(
            delayed(flex_array.binom_reassign)
            (zb_df, nonoverlap_hits, sample_name, self.dependent_pep,
             self.par['dir_ref_seq'], self.par['p_threshold'],
             self.par['x_threshold'], self.par['organism'])
            for zb_df, nonoverlap_hits, sample_name in zip(
                zb_dfs, hit_series_list, sample_names))

        # Unpack each per-sample result tuple into the result containers.
        for (collapse_zb, glob_array, sim_tag, p_series, orig_pseries,
             filter_series, sample_name, n_series) in results:
            sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
            glob_unique[sample_name] = glob_array.apply(sum, axis=0) + sim_tag
            pep_df[sample_name] = collapse_zb.apply(
                lambda x: flex_array.array(x).names_string(0.001), axis=0)
            n_df[sample_name] = n_series
            p_df[sample_name] = p_series
            orig_p[sample_name] = orig_pseries
            filter_df[sample_name] = filter_series

        # Output prefix: z-score file name without path and extension.
        file_head = self.par['sub_dir'] + self.par['zscore_file'].split(
            '/')[-1].split('.')[0]
        if self.par['organism']:
            file_head += '_organism_'
        else:
            file_head += '_species_'

        #Write log file
        params.file_IO(self.par['sub_dir'] + 'parameters.log',
                       sep='=').dict_to_file(self.par)

        #Write analysis files
        sum_df.to_csv(file_head + 'total-counts.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        glob_unique.to_csv(file_head + 'unique-counts.txt',
                           sep='\t',
                           header=True,
                           index_label='Specie')
        pep_df.to_csv(file_head + 'peptides.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        p_df.to_csv(file_head + 'p-values.txt',
                    sep='\t',
                    header=True,
                    index_label='Specie')
        orig_p.to_csv(file_head + 'orig-p-values.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        filter_df.to_csv(file_head + 'virus-filter.txt',
                         sep='\t',
                         header=True,
                         index_label='Specie')

        # Benjamini-Hochberg adjustment, per sample, over the finite p-values
        # only (NaN rows stay NaN).
        for i in p_df.columns:
            pvals = np.array(p_df[i].values)
            if not pd.isnull(pvals).all():
                mask = [j for j in np.where(np.isfinite(pvals))[0]]
                pval_corrected = np.empty(pvals.shape)
                pval_corrected.fill(np.nan)
                pval_corrected[mask] = multipletests(pvals[mask],
                                                     method='fdr_bh')[1]
                padjust_df[i] = pval_corrected
        padjust_df.to_csv(file_head + 'p-adjusted.txt',
                          sep='\t',
                          header=True,
                          index_label='Specie')

        #Write independent peptides file (context manager guarantees close)
        with open(self.par['sub_dir'] + 'independent_peptides.txt', 'w') as f:
            for i in samples:
                f.write(i)
                for j in nonoverlap_dict[i]:
                    f.write('\t' + str(j))
                f.write('\n')

        #Write summary file: one row per (sample, virus) passing both the BH
        #threshold and the minimum filtered-count threshold.
        with open(file_head + 'results_summary.txt', 'w') as f:
            f.write(
                "Sample name\tVirus\tBH p-value\tRaw p-value\tOrig p-value\tAssigned counts\tFiltered Assigned Counts\t"
            )
            f.write(
                "Assigned peptides\tTotal significant peptides\tRanking N\tTotal sample hits\tTotal filtered sample hits\n"
            )
            for i in samples:
                BH = padjust_df[i]
                BH = BH[BH < self.par['bh_threshold']]
                p_value = p_df[i][BH.index]
                n_value = n_df[i][BH.index]
                filter_value = filter_df[i][BH.index]
                orig_pvalue = orig_p[i][BH.index]
                counts = sum_df[i][BH.index]
                peptides = pep_df[i][BH.index]

                # The union of reported peptides is identical for every virus
                # row of this sample; compute it once here instead of once per
                # written row (was recomputed inside the inner loop).
                pep_set = set()
                for k in BH.index:
                    pep_set = pep_set.union(set(peptides[k].split(';')))

                for j in BH.index:
                    if filter_value[j] > self.par['x_threshold']:
                        f.write(i + '\t')
                        f.write(j + '\t' + str(BH[j]) + '\t')
                        f.write(
                            str(p_value[j]) + '\t' + str(orig_pvalue[j]) +
                            '\t')
                        f.write(
                            str(counts[j]) + '\t' + str(filter_value[j]) +
                            '\t' + str(peptides[j]) + '\t')
                        f.write(str(len(pep_set)) + '\t')
                        f.write(str(n_value[j]) + '\t')
                        f.write(
                            str(hits_series[i]) + '\t' +
                            str(nonoverlap_hits_series[i]) + '\n')
        print("End of run.")
        return None
Esempio n. 4
0
    def probability_ref(self):
        """Write the probability tables later used in binomial assessments.

        Produces three CSVs under ``self.par['dir_ref_seq']``: per-virus
        totals, pairwise probabilities from peptides unique to one virus of a
        pair, and pairwise probabilities from peptides shared by the pair —
        all based on counts of independent (dependency-filtered) aligned
        peptides.

        Returns:
            None. Results are written to disk.
        """
        # Peptide library: first column of the annotation file (header line
        # and trailing line skipped, matching the original file layout).
        peptide_lib = []
        with open(self.par['file_annotation'], 'r') as viral_peptidome:
            next(viral_peptidome)
            for line in viral_peptidome:
                items = line.split('\t')
                peptide_lib.append(str(items[0]))
        peptide_lib = peptide_lib[:-1]

        binary_b = flex_array.sparse_aln_df(self.par['file_aln'])
        binary_b = binary_b.reindex(peptide_lib).fillna(0)
        binary_b = flex_array.array(binary_b).filter_aln(
            ref_seq=self.par['dir_ref_seq'])
        binary_b = pd.DataFrame(index=binary_b.index,
                                columns=binary_b.columns,
                                data=binary_b.values,
                                dtype=bool)

        # Per-virus boolean Series holding only that virus's aligned peptides.
        virus_aln = {
            i: binary_b.loc[binary_b[i], i]
            for i in binary_b.columns
        }
        dep_pep = self.dependent_peptides()
        virus_tot_filter = [
            flex_array.gen_ind_hits(virus_aln[i], dep_pep) for i in virus_aln
        ]

        # Independent-peptide count per virus.
        virus_sums = pd.Series(index=binary_b.columns,
                               data=[len(i) for i in virus_tot_filter])
        # dtype pinned to float: recent pandas would otherwise default an
        # empty Series to object dtype (with a warning).
        first_round_prob = pd.Series(index=binary_b.columns, dtype=float)
        viruses = list(binary_b.columns)
        for i in first_round_prob.index:
            print("Virus " + str(viruses.index(i)))
            # Denominator excludes the peptides dropped by the dependency
            # filter for this virus.
            first_round_prob[i] = virus_sums[i] / (
                len(peptide_lib) - (len(virus_aln[i]) - virus_sums[i]))
        first_round_prob.to_csv(self.par['dir_ref_seq'] +
                                "total_probabilities_20180524.csv",
                                header=False,
                                index=True)
        print("First probability file generated.")

        second_round_prob = pd.DataFrame(index=viruses, columns=viruses)
        third_round_prob = pd.DataFrame(index=viruses, columns=viruses)

        virus_pairs = list(combo(viruses, 2))

        def calc_pair(pair):
            # For one virus pair, return two dicts keyed by (row, col):
            # d1 - probability from peptides unique to the column virus,
            # d2 - probability from peptides shared by the pair.
            i, j = pair
            d1 = {}
            d2 = {}
            i_index = set(virus_aln[i].index)
            j_index = set(virus_aln[j].index)
            shared_index = i_index & j_index
            shared = virus_aln[i].loc[list(shared_index)]
            if len(shared) == 0:
                # No overlap: unique probability falls back to the per-virus
                # total; shared probability is zero.
                d1[(i, j)] = first_round_prob[j]
                d1[(j, i)] = first_round_prob[i]
                d2[(i, j)] = 0.0
                d2[(j, i)] = 0.0
            else:
                # .loc with a raw set raises in modern pandas, so index with
                # lists.
                unique_j = virus_aln[j].loc[list(j_index - shared_index)]
                filter_unique_j = flex_array.gen_ind_hits(unique_j, dep_pep)
                d1[(i, j)] = len(filter_unique_j) / (
                    len(peptide_lib) - len(shared) -
                    (len(unique_j) - len(filter_unique_j)))
                unique_i = virus_aln[i].loc[list(i_index - shared_index)]
                filter_unique_i = flex_array.gen_ind_hits(unique_i, dep_pep)
                d1[(j, i)] = len(filter_unique_i) / (
                    len(peptide_lib) - len(shared) -
                    (len(unique_i) - len(filter_unique_i)))
                filter_shared = flex_array.gen_ind_hits(shared, dep_pep)
                d2[(i, j)] = len(filter_shared) / (
                    len(peptide_lib) - len(unique_i) - len(unique_j) -
                    (len(shared) - len(filter_shared)))
                d2[(j, i)] = d2[(i, j)]

            return d1, d2

        results = Parallel(n_jobs=-1, verbose=100000)(delayed(calc_pair)(pair)
                                                      for pair in virus_pairs)
        m1, m2 = zip(*results)

        # BUG FIX: the parallel results were previously discarded, so the
        # second and third tables were written out all-NaN. Merge the
        # per-pair dicts into the DataFrames before writing (the diagonal is
        # intentionally left unset, as in the original sequential code).
        for d1 in m1:
            for (a, b), val in d1.items():
                second_round_prob.loc[a, b] = val
        for d2 in m2:
            for (a, b), val in d2.items():
                third_round_prob.loc[a, b] = val

        second_round_prob.to_csv(self.par['dir_ref_seq'] +
                                 "unique_probabilities_20180524.csv",
                                 header=True,
                                 index=True)
        print("Second probability file generated.")

        third_round_prob.to_csv(self.par['dir_ref_seq'] +
                                "shared_probabilities_20180524.csv",
                                header=True,
                                index=True)
        print("Third (and last) probability file generated.")

        return None
Esempio n. 5
0
    def probability_ref(self):
        """Write the probability tables later used in binomial assessments.

        Produces ``total_probabilities.csv``, ``unique_probabilities.csv``
        and ``shared_probabilities.csv`` under ``self.par['dir_ref_seq']``,
        derived from the binary peptide-by-virus alignment matrix.

        Returns:
            None. Results are written to disk.
        """
        # Peptide library: first column of the annotation file (header line
        # and trailing line skipped), sorted numerically by id.
        peptide_lib = []
        with open(self.par['file_annotation'], 'r') as viral_peptidome:
            next(viral_peptidome)
            for line in viral_peptidome:
                items = line.split('\t')
                peptide_lib.append(str(items[0]))
        peptide_lib = peptide_lib[:-1]
        peptide_lib.sort(key=int)

        binary_b = flex_array.binary_aln_df(self.par['file_aln'])
        binary_b = flex_array.array(binary_b).filter_aln()
        binary_b = binary_b.reindex(peptide_lib).fillna(0)

        # Per-virus aligned-peptide counts and the first (total) probability.
        virus_sums = binary_b.apply(np.count_nonzero, axis=0)
        first_round_prob = virus_sums / len(peptide_lib)
        first_round_prob.to_csv(self.par['dir_ref_seq'] +
                                "total_probabilities.csv",
                                header=False,
                                index=True)
        print("First probability file generated.")

        viruses = list(binary_b.columns)
        # Pairwise overlap counts as a single matrix product — same values as
        # the former O(n^2) Python double-loop of per-pair np.dot calls.
        virus_intersections = binary_b.T.dot(binary_b)

        # unique[r, c] = peptides of virus c not shared with virus r.
        virus_unique = pd.DataFrame(index=viruses, columns=viruses)
        for v in virus_intersections.columns:
            virus_unique[v] = virus_sums[v] - virus_intersections[v]

        # Elementwise: unique count over the library minus the pair overlap.
        second_round_prob = virus_unique / (len(peptide_lib) -
                                            virus_intersections)
        second_round_prob.to_csv(self.par['dir_ref_seq'] +
                                 "unique_probabilities.csv",
                                 header=True,
                                 index=True)
        print("Second probability file generated.")

        # Elementwise: overlap count over the library minus both unique sets
        # (unique.T supplies unique[c, r] for each cell [r, c]).
        third_round_prob = virus_intersections / (
            len(peptide_lib) - virus_unique - virus_unique.T)
        third_round_prob.to_csv(self.par['dir_ref_seq'] +
                                "shared_probabilities.csv",
                                header=True,
                                index=True)
        print("Third (and last) probability file generated.")

        return None


# End