Ejemplo n.º 1
0
def write_to_data(data, dp, mode):
    """Store per-key Fisher p-values and patient counts in ``data``.

    For every key ``h`` of ``dp`` two right-tailed Fisher exact tests are
    run (one on the "related" counts, one on the "unrelated" counts) and
    the results are written into ``data[h]`` under keys built from ``mode``.

    :param data: dict of dicts, modified in place; ``data[h]`` must already
        exist for every key ``h`` of ``dp``.
    :param dp: dict mapping ``h`` to a dict that provides ``'data'`` (an
        iterable of patients) plus the counts ``'related_pat_a'``,
        ``'unrelated_pat_a'``, ``'related_pat_h'`` and ``'unrelated_pat_h'``.
    :param mode: string inserted into the names of the written keys.

    Relies on the module-level names ``patients_hpo`` and ``fisher``.
    """

    def count_unrelated(patients):
        # How many of the given patients are listed as unrelated.
        return sum(1 for p in patients if p in patients_hpo['unrelated'])

    def right_tail(total, with_h, with_g, with_both):
        # 2x2 table cells: neither / h only / g only / both.
        return fisher.pvalue(
            total - with_h - with_g + with_both,
            with_h - with_both,
            with_g - with_both,
            with_both).right_tail

    # "g" counts: patients seen under any key of dp.
    all_patients = set()
    for entry in dp.itervalues():
        all_patients.update(entry['data'])
    related_pat_g = len(all_patients)
    unrelated_pat_g = count_unrelated(all_patients)

    for h, entry in dp.iteritems():
        # "gh" counts: patients seen under this particular key.
        key_patients = set(entry['data'])
        related_pat_gh = len(key_patients)
        unrelated_pat_gh = count_unrelated(key_patients)

        related_tail = right_tail(
            entry['related_pat_a'], entry['related_pat_h'],
            related_pat_g, related_pat_gh)
        unrelated_tail = right_tail(
            entry['unrelated_pat_a'], entry['unrelated_pat_h'],
            unrelated_pat_g, unrelated_pat_gh)

        target = data[h]
        target['related_' + mode + '_p_val'] = related_tail
        target['unrelated_' + mode + '_p_val'] = unrelated_tail
        target['related_' + mode + '_pat_g'] = related_pat_g
        target['unrelated_' + mode + '_pat_g'] = unrelated_pat_g
        target['related_' + mode + '_pat_gh'] = related_pat_gh
        target['unrelated_' + mode + '_pat_gh'] = unrelated_pat_gh
Ejemplo n.º 2
0
def table_maker(subset, ind1, ind2, row_labels, col_labels, title):
    """
    `subset` provides a subsetted boolean of items to consider.  If no subset,
    you can use all with `np.ones_like(ind1) == 1`

    `ind1` is used to subset rows, e.g., log2fc > 0.  This is used for rows, so
    row_label might be ['upregulated', 'others']

    `ind2` is used to subset cols.  For example, col_labels would be
    ['bound', 'unbound']
    """
    table = [
        sum(subset & ind1 & ind2),
        sum(subset & ind1 & ~ind2),
        sum(subset & ~ind1 & ind2),
        sum(subset & ~ind1 & ~ind2)
    ]
    print
    print title
    print '-' * len(title)
    print print_2x2_table(table, row_labels=row_labels, col_labels=col_labels)
    print print_row_perc_table(table,
                               row_labels=row_labels,
                               col_labels=col_labels)
    print print_col_perc_table(table,
                               row_labels=row_labels,
                               col_labels=col_labels)
    print fisher.pvalue(*table)
Ejemplo n.º 3
0
def table_maker(subset, ind1, ind2, row_labels, col_labels, title):
    """
    `subset` provides a subsetted boolean of items to consider.  If no subset,
    you can use all with `np.ones_like(ind1) == 1`

    `ind1` is used to subset rows, e.g., log2fc > 0.  This is used for rows, so
    row_label might be ['upregulated', 'others']

    `ind2` is used to subset cols.  For example, col_labels would be
    ['bound', 'unbound']
    """
    table = [
        sum(subset & ind1 & ind2),
        sum(subset & ind1 & ~ind2),
        sum(subset & ~ind1 & ind2),
        sum(subset & ~ind1 & ~ind2)
    ]
    print
    print title
    print '-' * len(title)
    print print_2x2_table(table, row_labels=row_labels, col_labels=col_labels)
    print print_row_perc_table(
        table, row_labels=row_labels, col_labels=col_labels)
    print print_col_perc_table(
        table, row_labels=row_labels, col_labels=col_labels)
    print fisher.pvalue(*table)
Ejemplo n.º 4
0
def test(comps, genome, motif, rcounts, nums): # rcounts
    """Run right-tailed Fisher tests on real and permuted counts for one motif.

    For every region type in ("r1", "r2", "r3") and every event class in
    ("s", "e") the class counts are compared against the control class "c".

    :param comps: used only in the printed header line
    :param genome: used only in the printed header line
    :param motif: used only in the printed header line
    :param rcounts: dict of counts keyed by "<rt>.<class>" for the real data
        and "<rt>.<class>.p<k>" for permutation k; missing keys count as 0
    :param nums: dict of totals keyed by "<rt>.<class>"; missing keys count
        as 0
    :returns: dict mapping "<rt>.<event_class>" to ``[pval, pemp, g]`` where
        ``pval`` is the right-tail p-value on the real data, ``pemp`` is the
        list of right-tail p-values over the
        ``rnamotifs2.config.perms`` permutations and ``g`` is the value
        labelled "information gain" below
    """
    print "%s.%s.%s: fisher test on real and perm data" % (comps, genome, motif)
    results = {}
    for rt in ["r1", "r2", "r3"]:
        for event_class in ["s", "e"]:
            val_class = rcounts.get("%s.%s" % (rt, event_class), 0)
            val_control = rcounts.get("%s.%s" % (rt, "c"), 0)
            num_class = nums.get("%s.%s" % (rt, event_class), 0)
            num_control = nums.get("%s.%s" % (rt, "c"), 0)
            # 2x2 table: (with, class) (with, control) / (without, class) (without, control)
            val = pvalue(val_class, val_control, num_class-val_class, num_control-val_control).right_tail
            results["%s.%s" % (rt, event_class)] = val

            # information gain (g at the end)
            # i is the base-2 entropy of the class/control split of the totals.
            # NOTE(review): math.log raises ValueError when num_class or
            # num_control is 0, and the division raises ZeroDivisionError
            # when both are 0 (both default to 0 for missing keys) -- confirm
            # callers always supply non-zero totals.
            i1 = (num_class/float(num_class+num_control))*math.log(num_class/float(num_class+num_control), 2)
            i2 = (num_control/float(num_class+num_control))*math.log(num_control/float(num_class+num_control), 2)
            i = -(i1+i2)
            # c1_num is first the size of the "with" group (clamped to at
            # least 1 so it is a safe denominator) and is then overwritten
            # with that group's weighted entropy contribution.
            c1_num = max(1, float(val_class + val_control)) # dont allow it to be 0
            if (val_class/c1_num)>0 and val_control/c1_num>0:
                c1_num = -( (val_class/c1_num)*math.log(val_class/c1_num, 2) + (val_control/c1_num)*math.log(val_control/c1_num, 2) )
                c1_num = (val_class+val_control)/float(num_class+num_control) * c1_num
            else:
                # a group containing only one of the two classes contributes 0
                c1_num = 0
            # c2_num: same two-step computation for the "without" group.
            c2_num = max(1, float( (num_class-val_class) + (num_control-val_control) ) )  # dont allow it to be 0
            if (num_class-val_class)/c2_num>0 and (num_control-val_control)/c2_num>0:
                c2_num = -( ((num_class-val_class)/c2_num)*math.log((num_class-val_class)/c2_num, 2) + ((num_control-val_control)/c2_num)*math.log((num_control-val_control)/c2_num, 2) )
                c2_num = (num_class+num_control-(val_class+val_control))/float(num_class+num_control) * c2_num
            else:
                c2_num = 0
            # gain = entropy before the split minus the weighted entropies after it
            g = i - (c1_num + c2_num)

            #print rt, event_class, val_class, num_class
            #print rt, "c", val_control, num_control
            #print

            results["%s.%s.g" % (rt, event_class)] = g
            # Same Fisher test on every permutation; the totals come from the
            # per-permutation class distribution rnamotifs2.perm.ec_dist[p]
            # and are not split by region type.
            for p in range(0, rnamotifs2.config.perms):
                val_class = rcounts.get("%s.%s.p%s" % (rt, event_class, p), 0)
                val_control = rcounts.get("%s.%s.p%s" % (rt, "c", p), 0)
                num_class = rnamotifs2.perm.ec_dist[p].get(event_class, 0)
                num_control = rnamotifs2.perm.ec_dist[p].get("c", 0)
                val = pvalue(val_class, val_control, num_class-val_class, num_control-val_control).right_tail
                results["%s.%s.p%s" % (rt, event_class, p)] = val

    # Regroup the flat results dict into one [pval, pemp, g] entry per
    # (region type, event class) pair.
    test_results = {}
    for rt in ["r1", "r2", "r3"]:
        for event_class in ["s", "e"]:
            pval = results["%s.%s" % (rt, event_class)]
            pemp = [results["%s.%s.p%s" % (rt, event_class, p)] for p in range(0, rnamotifs2.config.perms)]
            g = results["%s.%s.g" % (rt, event_class)]
            test_results["%s.%s" % (rt, event_class)] = [pval, pemp, g]
    return test_results
Ejemplo n.º 5
0
def calculate_fisher(row_vals, col_vals, test_type):
	"""Return one tail of Fisher's exact test for a prepared 2x2 table.

	:param row_vals: pair passed as the first two arguments of fisher.pvalue
	:param col_vals: pair passed as the last two arguments of fisher.pvalue
	:param test_type: 1 for the two-tailed, 2 for the left-tailed and 3 for
		the right-tailed p-value
	:raises TypeError: if test_type is none of 1, 2 or 3
	"""
	row_val_1, row_val_2 = row_vals
	col_val_1, col_val_2 = col_vals

	# Resolve the requested tail first so an invalid test_type is rejected
	# before the test itself is computed.
	tail_name = None
	for code, name in ((1, 'two_tail'), (2, 'left_tail'), (3, 'right_tail')):
		if test_type == code:
			tail_name = name
			break
	if tail_name is None:
		raise TypeError

	result = fisher.pvalue(row_val_1, row_val_2, col_val_1, col_val_2)
	return getattr(result, tail_name)
Ejemplo n.º 6
0
def es_apa(data_s, data_e, data_c):
    """Score the "s" and "e" classes against the control class "c".

    For every position of the three equally ordered count sequences the
    score is ``-2 * ln(p)``, where ``p`` is the right-tail Fisher p-value of
    the class count versus the control count; class totals are read from
    ``rnamotifs2.data.dist``.

    :returns: ``(logs, loge)``; each is a list holding a single list of
        scores, for "s" and "e" respectively
    """
    totals = rnamotifs2.data.dist
    num_s, num_e, num_c = totals["s"], totals["e"], totals["c"]

    def score(val, num, val_c):
        # -2 * ln(right-tail p) for one class against the control
        tail = pvalue(val, val_c, num - val, num_c - val_c).right_tail
        return -2 * math.log(tail)

    scores_s = []
    scores_e = []
    for val_s, val_e, val_c in zip(data_s, data_e, data_c):  # there are 4 areas
        score_s = score(val_s, num_s, val_c)
        score_e = score(val_e, num_e, val_c)
        scores_s.append(score_s)
        scores_e.append(score_e)
    return [scores_s], [scores_e]
Ejemplo n.º 7
0
def modifyStrelkaRow(record,fixIndels=True):
    """Add read counts and derived statistics to a Strelka VCF record.

    Writes NORMREF, NORMALT, TUMREF, TUMALT, TUMVAF, TUMVARFRACTION and
    LOG_FISHER into ``record.INFO``.  Counts of ``record.samples[0]`` go to
    the NORM* keys and counts of ``record.samples[1]`` to the TUM* keys.

    :param record: a pyVCF record object
    :param fixIndels: for records handled as indels, strip '.' from REF and
        reject the record when an ALT is not a ``vcf.model._Substitution``
    :returns: the modified record, or None for a rejected indel record
    """
    info = record.INFO

    def first_count(sample_index, field):
        # First element of the named per-sample field.
        return getattr(record.samples[sample_index].data, field)[0]

    def tumour_alt_share(other_key):
        # TUMALT / (TUMALT + other), or 0 when the denominator is zero.
        try:
            return float(info['TUMALT']) / (info['TUMALT'] + info[other_key])
        except ZeroDivisionError:
            return 0

    def log_fisher():
        # -log10 of the two-tailed Fisher p-value of tumour vs normal counts.
        test = fisher.pvalue(info['TUMREF'], info['TUMALT'],
                             info['NORMREF'], info['NORMALT'])
        return -math.log10(test.two_tail)

    if not (record.is_snp or record.ALT[0] is None):
        # Indel-style record: TAR holds the reference and TIR the alternate
        # counts.
        info['NORMREF'] = first_count(0, 'TAR')
        info['NORMALT'] = first_count(0, 'TIR')
        info['TUMREF'] = first_count(1, 'TAR')
        info['TUMALT'] = first_count(1, 'TIR')
        info['TUMVAF'] = tumour_alt_share('TUMREF')
        info['TUMVARFRACTION'] = tumour_alt_share('NORMALT')
        if fixIndels:
            record.REF = record.REF.replace('.', '')
            for allele in record.ALT:
                if not isinstance(allele, vcf.model._Substitution):
                    return None
        info['LOG_FISHER'] = log_fisher()
        return record

    # SNV-style record: counts live in the "<base>U" fields.
    ref = record.REF
    alt = record.ALT[0]
    info['NORMREF'] = first_count(0, ref + 'U')
    info['TUMREF'] = first_count(1, ref + 'U')
    if alt is None:
        # strelka sometimes reports a non-passing variant with no "ALT"
        # allele (no change)
        info['NORMALT'] = 0
        info['TUMALT'] = 0
        info['TUMVAF'] = 0
        info['TUMVARFRACTION'] = 0
    else:
        alt = str(alt)
        info['NORMALT'] = first_count(0, alt + 'U')
        info['TUMALT'] = first_count(1, alt + 'U')
        info['TUMVAF'] = tumour_alt_share('TUMREF')
        info['TUMVARFRACTION'] = tumour_alt_share('NORMALT')
    info['LOG_FISHER'] = log_fisher()
    return record
Ejemplo n.º 8
0
def cap(size1, size2):
    """
    Find the population value at which we begin to obtain a significant
    value. This may need to be modified depending on the training size.

    For n = 1..9 a two-tailed Fisher test is run on the table
    (larger, smaller - n, 0, n), where larger/smaller are the bigger and
    smaller of the two sizes.

    Previously the ``size1 < size2`` case returned on the first iteration
    without looking at the p-value, so it always yielded 1; both orderings
    now apply the same significance check.

    :param size1: first group size
    :param size2: second group size
    :returns: the smallest n in 1..9 whose p-value is <= 0.05, or None when
        no such n exists
    """
    lowest_sig = 0.05
    if size1 < size2:
        larger, smaller = size2, size1
    else:
        larger, smaller = size1, size2
    for n in range(1, 10):
        sig = pvalue(larger, smaller - n, 0, n).two_tail
        if sig <= lowest_sig:
            return n
    return None
Ejemplo n.º 9
0
	def single_maker_allelic_association(self, phenotype_list=[], genotype_list=[]):
		"""
			Computes a single marker allelic association p-value for
			lists of phenotypes and genotypes of equal length.

			Each genotype is a comma-separated string of alleles.  Alleles of
			samples whose phenotype equals 1 are counted as case alleles, all
			others as control alleles.

			Returns None when the lists differ in length, the chi-square
			contingency p-value when more than two distinct alleles are seen,
			1 when fewer than two distinct alleles are seen, and the
			two-tailed Fisher exact p-value otherwise.

			The bi-allelic case always builds the full 2x2 table from the
			counters.  The previous code substituted a literal 0 by position,
			but the allele order comes from an unordered set, so a real count
			could be dropped; it also returned 1 when cases and controls were
			each fixed for a different allele.
		"""
		# Make sure phenotype and genotype lists are same size
		if len(phenotype_list) != len(genotype_list):
			return None

		case_alleles = []
		control_alleles = []
		for phenotype, genotype in zip(phenotype_list, genotype_list):
			target = case_alleles if phenotype == 1 else control_alleles
			target.extend(genotype.split(","))

		# Getting set of alleles and their counts
		case_counts = Counter(case_alleles)
		control_counts = Counter(control_alleles)
		allele = list(set(chain(case_counts, control_counts)))

		# Implementing slow scipy chi-square if we have more than two alleles
		if len(allele) > 2:
			table = np.zeros(shape=(2, len(allele)))
			for i, name in enumerate(allele):
				table[0, i] = case_counts[name]
				table[1, i] = control_counts[name]

			chi2, p, dof, ex = chi2_contingency(table)
			return p

		# Nothing to test with zero or one observed allele
		if len(allele) < 2:
			return 1

		# Running fast Fisher's algorithm otherwise; a Counter returns 0 for
		# an allele a group never carries, so no special cases are needed.
		first, second = allele
		return pvalue(case_counts[first], control_counts[first],
			case_counts[second], control_counts[second]).two_tail
    def _find_sequence_p_values_with_fisher(self, sequence_presence_matrix,
                                            is_first_class):
        """Return one right-tailed Fisher p-value per sequence vector.

        Each vector of ``sequence_presence_matrix`` is cross-tabulated
        against the boolean mask ``is_first_class``.  Vectors whose sum is
        not greater than 1 are not tested and receive
        ``SequenceFilterHelper.INVALID_P_VALUE`` instead.
        """
        p_values = []

        for sequence_vector in sequence_presence_matrix:
            if not sequence_vector.sum() > 1:
                p_values.append(SequenceFilterHelper.INVALID_P_VALUE)
                continue

            present_in_first = np.logical_and(sequence_vector, is_first_class)
            first_class_present = np.sum(sequence_vector[present_in_first])

            present_in_second = np.logical_and(
                sequence_vector, np.logical_not(is_first_class))
            second_class_present = np.sum(sequence_vector[present_in_second])

            first_class_absent = np.sum(
                np.logical_and(is_first_class, sequence_vector == 0))
            second_class_absent = np.sum(
                np.logical_and(np.logical_not(is_first_class),
                               sequence_vector == 0))

            test = fisher.pvalue(first_class_present, second_class_present,
                                 first_class_absent, second_class_absent)
            p_values.append(test.right_tail)

        return p_values
Ejemplo n.º 11
0
def getSub(ref, fread, dics):
    """Return the smallest substitution p-value at one position, as a string.

    :param ref: reference base (any case)
    :param fread: dict of observed counts per base, e.g. keys A, C, G, T
    :param dics: dict mapping a substitution such as 'AG' to the factor used
        to derive the expected counts
    :returns: the string form of the lowest p-value over all non-reference
        bases with a non-zero count, or '1.0' when there is none

    Depends on the module-level names ``exfisher``, ``pvalue``,
    ``FishersExactTest`` and ``getTail``.
    """
    ref_base = ref.upper()
    nref = fread[ref_base]

    # One entry per candidate: (substitution, obs1, obs2, exp1, exp2, p-value)
    res = []
    for base in fread:
        if base == ref_base or fread[base] == 0:
            continue
        name = ref_base + base
        obs1 = nref
        obs2 = fread[base]
        exp1 = int(dics[name] * (obs1 + obs2))
        exp2 = (obs1 + obs2) - exp1
        if exfisher:
            pval = pvalue(obs1, obs2, exp1, exp2)
        else:
            pval = FishersExactTest([[exp1, exp2], [obs1, obs2]])
        pval = getTail(pval)
        res.append((name, obs1, obs2, exp1, exp2, str(pval)))

    if not res:
        return '1.0'
    if len(res) == 1:
        return res[0][5]
    # First entry holding the numerically smallest p-value.
    return min(res, key=lambda entry: float(entry[-1]))[5]
Ejemplo n.º 12
0
 def ethinic_filter_gNOMAD(self):
     """
     Flag SNPs whose allele counts differ between gnomAD populations.

     Every line of ``self.gnomad_file`` is tab-separated: the first field
     identifies the SNP and the second holds comma-separated entries, one
     per population, each made of two space-separated integer counts.  The
     first 8 entries are compared pairwise (28 combinations) with a
     two-tailed Fisher exact test.  A SNP is flagged as soon as one pair
     reaches p <= 0.05 / 28, because such a SNP could separate samples by
     ethnicity rather than by disease status.

     The index of the ``self.col_translator`` rows whose 'chrm_pos_ref_alt'
     value was flagged is stored in ``self.filtered_columns``.
     """
     cutoff = 0.05 / 28
     flagged = []
     with open(self.gnomad_file, 'r') as handle:
         for line_num, line in enumerate(handle, 1):
             columns = line.split("\t")
             chrm_info = columns[0]
             populations = columns[1].split(",")
             # 8 for number of ethn. in gnomad
             genotype_list = [populations[k] for k in range(8)]
             for first, second in itertools.combinations(genotype_list, 2):
                 first_counts = first.split(" ")
                 second_counts = second.split(" ")
                 n11 = int(first_counts[0])
                 n12 = int(first_counts[1])
                 n21 = int(second_counts[0])
                 n22 = int(second_counts[1])
                 if pvalue(n11, n21, n12, n22).two_tail <= cutoff:
                     flagged.append(chrm_info)
                     break
             print(line_num / 15008010, end='\r')  # percent complete
     translator = self.col_translator
     self.filtered_columns = translator[
         translator['chrm_pos_ref_alt'].isin(flagged)].index
def get_separator_words(toks1):
    """
    Finds the words that separate a list of tokens from a background corpus
    Basically this generates a list of informative/interesting words in a set
    toks1 is a list of words
    Returns a list of separator words

    A word is kept when it occurs more than twice in toks1, its two-tailed
    Fisher p-value against the background corpus is below .001, its relative
    frequency is more than twice the background one, it is not an English
    stop word and it is longer than 5 characters.

    The background frequency distribution is cached as a pickle at
    ESSAY_COR_TOKENS_PATH and rebuilt from ESSAY_CORPUS_PATH when missing.
    """
    tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
    if os.path.isfile(ESSAY_COR_TOKENS_PATH):
        # NOTE: pickle executes code on load; this cache file must only ever
        # be written by this function, never taken from an untrusted source.
        with open(ESSAY_COR_TOKENS_PATH, 'rb') as cache_file:
            toks2 = pickle.load(cache_file)
    else:
        with open(ESSAY_CORPUS_PATH) as corpus_file:
            essay_corpus = corpus_file.read()
        essay_corpus = sub_chars(essay_corpus)
        toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
        with open(ESSAY_COR_TOKENS_PATH, 'wb') as cache_file:
            pickle.dump(toks2, cache_file)

    # Use the public N() accessor: the private _N attribute is only a lazily
    # filled cache in recent NLTK versions and may still be None.
    tok1_total = tab_toks1.N()
    tok2_total = toks2.N()

    sep_words = []
    for word in tab_toks1.keys():
        tok1_present = tab_toks1[word]
        if tok1_present > 2:
            tok2_present = toks2[word]
            fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
            if fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2:
                sep_words.append(word)

    if not sep_words:
        return sep_words
    # Load the stop-word list once instead of once per candidate word.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    return [w for w in sep_words if w not in stop_words and len(w) > 5]
Ejemplo n.º 14
0
def create_count_table(df, Cyt1, Cyt2):
    """Test the co-occurrence of two columns with Fisher's exact test.

    Both columns are binarised (every value > 0 becomes 1) and the rows are
    counted in the four cells "both", "only B", "only A" and "neither".
    Rows whose binarised values are neither 0 nor 1 (for example negative
    or missing values) are not counted in any cell.

    :param df: DataFrame containing the columns ``Cyt1`` and ``Cyt2``
    :param Cyt1: name of the first column ("A")
    :param Cyt2: name of the second column ("B")
    :returns: ``(two-tailed p-value, Cyt1, Cyt2)``
    """
    # Materialise the pairs: a bare zip() is a one-shot iterator on Python 3
    # and is not accepted by every pandas version.
    pairs = list(zip(df[Cyt1], df[Cyt2]))
    df_cont = pd.DataFrame(pairs, columns=['A', 'B'])
    # create contingency table
    df_cont[df_cont > 0] = 1

    a_on, a_off = df_cont['A'] == 1, df_cont['A'] == 0
    b_on, b_off = df_cont['B'] == 1, df_cont['B'] == 0

    # get counts for each condition
    AandB = int((a_on & b_on).sum())
    A = int((a_on & b_off).sum())
    B = int((a_off & b_on).sum())
    none = int((a_off & b_off).sum())

    # Fishers exact test
    p = pvalue(AandB, B, A, none)
    return p.two_tail, Cyt1, Cyt2
Ejemplo n.º 15
0
def dnds_stat(estimations):
    '''Return estimations of windows with dN/dS > 1.

    ``estimations`` is a list of dicts with (at least) the keys
    'file name', 'dN', 'dS', 'N', 'S' and 'whole gene'.  The 'dS' of the
    first usable entry of each file name is used as the gene-wide dS for
    the entries that follow it.

    For every entry that has no 'nan' value and a non-zero gene-wide dS the
    key 'dN/dS(whole gene)' is added.  Entries flagged as whole gene
    ('whole gene' == '1') or with 'dN/dS(whole gene)' > 1 additionally get
    a two-tailed Fisher 'p-value' and are returned.  The dicts are modified
    in place.  An empty input yields an empty list.
    '''
    filtered_estimations = []
    if not estimations:
        return filtered_estimations

    fname = estimations[0]['file name']
    genedS = float(estimations[0]['dS'])
    for est in estimations:
        if 'nan' in est.values():
            continue
        if est['file name'] != fname:
            # first usable entry of a new file: take its dS as gene-wide dS
            fname = est['file name']
            genedS = float(est['dS'])
        if genedS == 0:
            continue
        est['dN/dS(whole gene)'] = float(est['dN']) / genedS

        # process numbers with fisher module
        if est['whole gene'] == '1' or est['dN/dS(whole gene)'] > 1:
            # int(): round() returns a float on Python 2, but the test
            # expects integer counts
            n = int(round(float(est['dN']) * float(est['N'])))
            N = int(round(float(est['N']))) - n
            s = int(round(float(est['dS']) * float(est['S'])))
            S = int(round(float(est['S']))) - s
            est['p-value'] = pvalue(n, N, s, S).two_tail
            filtered_estimations.append(est)
    return filtered_estimations
Ejemplo n.º 16
0
 def get_vocab(self, input_text, input_scores, max_features):
     """Select vocabulary terms whose presence is associated with a score.

     ``input_text`` is transformed with ``self.vectorizer1``.  For every
     score value ``s`` in ``xrange(0, max(input_scores))`` the documents are
     split into those scored exactly ``s`` and all others, and every column
     of the transformed matrix gets a two-tailed Fisher p-value comparing
     present (> 0) and missing (== 0) counts between the two groups.  The
     feature names of the lowest p-values are collected per score.

     :param input_text: documents passed to ``self.vectorizer1.transform``
     :param input_scores: one score per document
     :param max_features: budget that is divided by ``max(input_scores)`` to
         obtain the number of features kept per score value
     :returns: de-duplicated list of the selected feature names

     NOTE(review): written for Python 2 (``xrange``) and an old pandas
     (``DataFrame.sort``) -- confirm the target environment.
     """
     train_mat = self.vectorizer1.transform(input_text)
     # NOTE(review): input_score_med and new_scores are never used below.
     input_score_med = np.median(input_scores)
     new_scores = [0 if i<=input_score_med else 1 for i in input_scores]
     ind_max_features = math.floor(max_features/max(input_scores))
     all_vocab = []
     # One dense 1-D array per column (term) of the transformed matrix.
     all_cols = [np.asarray(train_mat.getcol(i).todense().transpose())[0] for i in xrange(0,train_mat.shape[1])]
     # The upper bound is exclusive, so the maximum score itself is not visited.
     for s in xrange(0,max(input_scores)):
         sel_inds = [i for i in xrange(0,len(input_scores)) if input_scores[i]==s]
         out_inds = [i for i in xrange(0,len(input_scores)) if input_scores[i]!=s]
         pvalues = []
         for i in xrange(0,len(all_cols)):
             lcol = all_cols[i]
             # "good" = documents with score s, "bad" = every other document
             good_lcol = lcol[sel_inds]
             bad_lcol = lcol[out_inds]
             good_lcol_present = len(good_lcol[good_lcol > 0])
             good_lcol_missing = len(good_lcol[good_lcol == 0])
             bad_lcol_present = len(bad_lcol[bad_lcol > 0])
             bad_lcol_missing = len(bad_lcol[bad_lcol == 0])
             pval = pvalue(good_lcol_present, bad_lcol_present, good_lcol_missing, bad_lcol_missing)
             pvalues.append(pval.two_tail)
         col_inds = list(xrange(0,train_mat.shape[1]))
         # Pair every column index with its p-value and order by p-value.
         p_frame = pd.DataFrame(np.array([col_inds, pvalues]).transpose(), columns=["inds", "pvalues"])
         p_frame = p_frame.sort(['pvalues'], ascending=True)
         getVar = lambda searchList, ind: [searchList[int(i)] for i in ind]
         # Keep the ind_max_features + 2 columns with the lowest p-values.
         vocab = getVar(self.vectorizer1.get_feature_names(), p_frame['inds'][:ind_max_features+2])
         all_vocab.append(vocab)
     # Merge the per-score selections and drop duplicates.
     return list(set(list(chain.from_iterable(all_vocab))))
Ejemplo n.º 17
0
    def fisherTest(tab, alternative='two-sided'):
        """Fisher's exact test on a 2x2 contingency table.

        Wrapper around fisher.pvalue found in:
        Fast Fisher's Exact Test (Haibao Tang, Brent Pedersen)
        https://pypi.python.org/pypi/fisher/

        Test is performed in C (100x speed-up)

        Parameters
        ----------
        tab : list of lists or 2x2 ndarray
            Each element should contain counts
        alternative : string
            Specfies the alternative hypothesis (similar to scipy.fisher_exact)
            Options: 'two-sided', 'less', 'greater'

        Returns
        -------
        OR : float
            Odds-ratio associated with the 2 x 2 table. It is computed with
            floating point division; when the off-diagonal product is zero
            it is ``inf`` (or ``nan`` if the diagonal product is zero too)
            instead of raising ZeroDivisionError.
        p : float
            P-value associated with the test and the alternative hypothesis

        Raises
        ------
        ValueError
            If `alternative` is not one of the documented options
            (previously None was returned silently)."""

        tail_by_alternative = {'two-sided': 'two_tail',
                               'less': 'left_tail',
                               'greater': 'right_tail'}
        if alternative not in tail_by_alternative:
            raise ValueError("alternative must be 'two-sided', 'less' or "
                             "'greater', got %r" % (alternative,))

        a, b = tab[0][0], tab[0][1]
        c, d = tab[1][0], tab[1][1]
        res = fisher.pvalue(a, b, c, d)

        # float() avoids integer (floor) division of the counts on Python 2
        numerator = float(a) * float(d)
        denominator = float(b) * float(c)
        if denominator == 0:
            OR = float('nan') if numerator == 0 else float('inf')
        else:
            OR = numerator / denominator

        return (OR, getattr(res, tail_by_alternative[alternative]))
Ejemplo n.º 18
0
    def calculate_differential_methylation_fisher_exact(self, weighted = False):
        """Return the two-tailed Fisher p-value of control vs affected methylation.

        Methylation and coverage are summed over ``self.cpgs``.  For each
        group the ratio ``methylation sum / coverage sum`` is treated as a
        percentage and converted back into methylated and unmethylated
        counts, which form the 2x2 table that is tested.

        :param weighted: sum ``weighted_methylation_control`` /
            ``weighted_methylation_affected`` instead of ``meth_control`` /
            ``meth_affected``
        :returns: two-tailed p-value
        :raises ZeroDivisionError: if a coverage sum is zero (for example
            when ``self.cpgs`` is empty)
        """
        if weighted:
            control_attr = 'weighted_methylation_control'
            affected_attr = 'weighted_methylation_affected'
        else:
            control_attr = 'meth_control'
            affected_attr = 'meth_affected'

        sum_meth_control = 0
        sum_meth_affected = 0
        sum_cov_control = 0
        sum_cov_affected = 0
        for cpg in self.cpgs:
            sum_meth_control += getattr(cpg, control_attr)
            sum_meth_affected += getattr(cpg, affected_attr)
            sum_cov_control += cpg.cov_control
            sum_cov_affected += cpg.cov_affected

        control = sum_meth_control / sum_cov_control
        affected = sum_meth_affected / sum_cov_affected
        control_methylated = sum_cov_control * control / 100
        control_unmethylated = sum_cov_control - control_methylated
        affected_methylated = sum_cov_affected * affected / 100
        affected_unmethylated = sum_cov_affected - affected_methylated
        try:
            #Try to use the much faster fisher module from http://pypi.python.org/pypi/fisher/
            p = fisher_exact.pvalue(control_methylated, control_unmethylated, affected_methylated, affected_unmethylated)
            pvalue = p.two_tail
        except Exception:
            # Fall back to scipy when the fast module is unavailable or
            # rejects the values.  Catching Exception rather than using a
            # bare except lets KeyboardInterrupt and SystemExit propagate.
            oddsratio, pvalue = stats.fisher_exact([(control_methylated, control_unmethylated), (affected_methylated, affected_unmethylated)], alternative='two-sided')
        return pvalue
Ejemplo n.º 19
0
    def find_label_associated_sequence_p_values(
            comparison_data: ComparisonData, repertoires: List[Repertoire],
            label: Label):
        """Return one right-tailed Fisher p-value per sequence vector.

        A repertoire belongs to the first class when its metadata value for
        ``label.name`` equals ``label.positive_class``.  Every item vector
        that ``comparison_data`` yields for the given repertoires is
        cross-tabulated against that class mask.  Vectors whose sum is not
        greater than 1 are not tested and receive
        ``SequenceFilterHelper.INVALID_P_VALUE`` instead.
        """
        label_values = np.array(
            [repertoire.metadata[label.name] for repertoire in repertoires])
        is_first_class = label_values == label.positive_class

        identifiers = [repertoire.identifier for repertoire in repertoires]

        p_values = []
        for sequence_vector in comparison_data.get_item_vectors(identifiers):
            if not sequence_vector.sum() > 1:
                p_values.append(SequenceFilterHelper.INVALID_P_VALUE)
                continue

            present_in_first = np.logical_and(sequence_vector, is_first_class)
            first_class_present = np.sum(sequence_vector[present_in_first])

            present_in_second = np.logical_and(
                sequence_vector, np.logical_not(is_first_class))
            second_class_present = np.sum(sequence_vector[present_in_second])

            first_class_absent = np.sum(
                np.logical_and(is_first_class, sequence_vector == 0))
            second_class_absent = np.sum(
                np.logical_and(np.logical_not(is_first_class),
                               sequence_vector == 0))

            test = fisher.pvalue(first_class_present, second_class_present,
                                 first_class_absent, second_class_absent)
            p_values.append(test.right_tail)

        return p_values
Ejemplo n.º 20
0
def get_separator_words(toks1):
    """
    Finds the words that separate a list of tokens from a background corpus
    Basically this generates a list of informative/interesting words in a set
    toks1 is a list of words
    Returns a list of separator words

    A word is kept when it occurs more than twice in toks1, its two-tailed
    Fisher p-value against the background corpus is below .001, its relative
    frequency is more than twice the background one, it is not an English
    stop word and it is longer than 5 characters.

    The background frequency distribution is cached as a pickle at
    ESSAY_COR_TOKENS_PATH and rebuilt from ESSAY_CORPUS_PATH when missing.
    """
    tab_toks1 = nltk.FreqDist(word.lower() for word in toks1)
    if os.path.isfile(ESSAY_COR_TOKENS_PATH):
        # NOTE: pickle executes code on load; this cache file must only ever
        # be written by this function, never taken from an untrusted source.
        with open(ESSAY_COR_TOKENS_PATH, 'rb') as cache_file:
            toks2 = pickle.load(cache_file)
    else:
        with open(ESSAY_CORPUS_PATH) as corpus_file:
            essay_corpus = corpus_file.read()
        essay_corpus = sub_chars(essay_corpus)
        toks2 = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(essay_corpus))
        with open(ESSAY_COR_TOKENS_PATH, 'wb') as cache_file:
            pickle.dump(toks2, cache_file)

    # Use the public N() accessor: the private _N attribute is only a lazily
    # filled cache in recent NLTK versions and may still be None.
    tok1_total = tab_toks1.N()
    tok2_total = toks2.N()

    sep_words = []
    for word in tab_toks1.keys():
        tok1_present = tab_toks1[word]
        if tok1_present > 2:
            tok2_present = toks2[word]
            fish_val = pvalue(tok1_present, tok2_present, tok1_total, tok2_total).two_tail
            if fish_val < .001 and tok1_present / float(tok1_total) > (tok2_present / float(tok2_total)) * 2:
                sep_words.append(word)

    if not sep_words:
        return sep_words
    # Load the stop-word list once instead of once per candidate word.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    return [w for w in sep_words if w not in stop_words and len(w) > 5]
def run_compare(flat, adir, bdir, context, window, binary, pvalue_cutoff, ratio_range):
    """Print GFF3 features comparing methylation of two samples per window.

    For every sequence id in ``flat.seqids`` the C and T count arrays that
    ``bin_setup`` returns for samples ``a`` and ``b`` are summed over
    consecutive windows of ``window`` positions.  A two-tailed Fisher exact
    test is run on the four sums and one tab-separated feature line per
    retained window is written to stdout, after a "##gff-version 3" header.

    :param flat: object providing ``seqids``, ``fasta`` and
        ``get_features_in_region``
    :param adir: passed to ``bin_setup`` for sample a
    :param bdir: passed to ``bin_setup`` for sample b
    :param context: passed to ``bin_setup``
    :param window: window size
    :param binary: when true the score column is set to 0/1 from the
        p-value; when false, windows with a p-value above ``pvalue_cutoff``
        or a score outside ``ratio_range`` are skipped
    :param pvalue_cutoff: p-value threshold
    :param ratio_range: (low, high) bounds for the scaled difference
    """
    #fh = open('fisher.different.%s.%ibp.gff' % (context, window), 'w')
    fh = sys.stdout
    print >>sys.stderr, "writing to:", fh.name
    print >>fh, "##gff-version 3"
    for chr in flat.seqids:
        try:
            bp_max = len(flat.fasta[chr])
        except KeyError:
            print >>sys.stderr, chr, "not found. skipping"
            continue
        (a_cs, a_ts, a_mask), (b_cs, b_ts, b_mask) = bin_setup(chr, adir, bdir, context)
        for start in xrange(0, bp_max + window, window):
            # clamp the last window to the sequence end; skip empty windows
            end = min(start + window, bp_max)
            if start == end: continue
            a_t_count = a_ts[start:end].sum()
            a_c_count = a_cs[start:end].sum()
            b_t_count = b_ts[start:end].sum()
            b_c_count = b_cs[start:end].sum()

            p = pvalue(a_t_count, a_c_count, b_t_count, b_c_count)
            pv = float(p.two_tail)

            if not binary and pv > pvalue_cutoff: continue
            # NOTE(review): `f` is neither a parameter nor a local of this
            # function; presumably a module-level object, or `flat` was
            # meant -- confirm.
            gc = f.fasta[chr][start:end].upper()
            gc = gc.count("G") + gc.count("C")

            # if a_tot or b_tot == 0, then use 'na'
            # NOTE(review): the fallback below is 0, not None, so the 'na'
            # handling further down cannot trigger as written.
            a_tot = float(a_c_count + a_t_count)
            a_methyl = (a_c_count / a_tot) if a_tot != 0 else 0#None

            b_tot = float(b_c_count + b_t_count)
            b_methyl = (b_c_count / b_tot) if b_tot !=0 else 0#None
            #strand = "+" if a_methyl > b_methyl else "-"
            strand = "."
            # TODO: use absolute?
            plot = a_methyl - b_methyl if not None in (a_methyl, b_methyl) else 'na'

            # scale by total.
            # NOTE(review): the divisor is zero when both methylation
            # fractions are 0 -- confirm how such windows should be handled.
            plot = plot / (a_methyl + b_methyl)

            #print plot, a_methyl, b_methyl
            #if plot == 'na': continue
            if binary:
                if plot != 'na':
                    # NOTE(review): this line is a comparison whose result is
                    # discarded, not an assignment; `plot` is unchanged here.
                    plot == 1 if (ratio_range[0] <= plot <= ratio_range[1]) else 0
            else:
                if not (ratio_range[0] <= plot <= ratio_range[1]):
                    #print >>sys.stderr, "skipping because of ratio range."
                    continue

            # in binary mode the score is 1 for windows at or below the
            # p-value cutoff and 0 otherwise
            if binary and plot != 'na': plot = 0 if pv > pvalue_cutoff else 1

            attrs="p=%.3G;ac=%i;at=%i;bc=%i;bt=%i;gc=%i;plot=%.3G" % \
                        (pv, a_c_count, a_t_count, b_c_count, b_t_count, gc, plot)
            # append the accessions of the features overlapping this window
            accns = flat.get_features_in_region(chr, start + 1, end)
            accns = [a["accn"] for a in accns]
            if accns:
                attrs +=";accns=" + ",".join(accns)
            print >>fh, "\t".join(map(str, [chr, "methylation", "dmc", start + 1, end, plot, strand, ".", attrs]))
Ejemplo n.º 22
0
def contingent(intervals, domain_name, nodoms_only=False):
    """
    Test whether variants are distributed differently inside a domain.

    intervals should be all intervals in all genes that contain the domain.
    Each interval is read for `domain`, `gene`, `mafs` (comma-separated, one
    entry counted per variant) and `starts` / `ends` (comma-separated integer
    coordinates).

    Intervals whose domain equals `domain_name` feed the domain counts. The
    background ("gene") counts come from intervals with domain "." when
    `nodoms_only` is true, otherwise from every interval outside
    `domain_name`.

    Returns a tuple of: the two-tailed Fisher p-value, the ratio of
    variants-per-base in the domain to variants-per-base in the background,
    a "gene:<variants>/<bases>,dom:<variants>/<bases>" summary string, and
    the set of gene names seen.
    """
    import fisher

    n_domain_variants, n_gene_variants = 0, 0
    n_domain_bases, n_gene_bases = 0, 0
    gene = set()
    # Single pass, so `intervals` may be any iterable (including a generator).
    for iv in intervals:
        gene.add(iv.gene)
        # Materialise the coordinates: they are traversed twice, and a lazy
        # `map` object (Python 3) would be exhausted after the first pass,
        # silently turning the sanity check below into a no-op.
        starts = [int(s) for s in iv.starts.split(",")]
        ends = [int(e) for e in iv.ends.split(",")]
        assert all(e > s for s, e in zip(starts, ends)), domain_name
        n_bases = sum(e - s for s, e in zip(starts, ends))

        is_domain = iv.domain == domain_name
        if nodoms_only:
            is_background = iv.domain == "."
        else:
            is_background = iv.domain != domain_name

        if is_domain or is_background:
            n_variants = len(iv.mafs.split(","))
            if is_domain:
                n_domain_variants += n_variants
            if is_background:
                n_gene_variants += n_variants

        # For base counts the domain takes precedence over the background.
        if is_domain:
            n_domain_bases += n_bases
        elif is_background:
            n_gene_bases += n_bases

    tbl = "gene:%d/%d,dom:%d/%d" % (n_gene_variants, n_gene_bases, n_domain_variants, n_domain_bases)

    p = fisher.pvalue(n_gene_bases, n_gene_variants, n_domain_bases, n_domain_variants)

    # `or 1` guards keep both densities defined when a count is zero.
    denom = float(n_gene_variants) / (n_gene_bases or 1) or 1
    return p.two_tail, (float(n_domain_variants) / (n_domain_bases or 1)) / denom, tbl, gene
Ejemplo n.º 23
0
    def fisherTest(tab, alternative='two-sided'):
        """Fisher's exact test on a 2x2 contingency table.

        Wrapper around fisher.pvalue found in:
        Fast Fisher's Exact Test (Haibao Tang, Brent Pedersen)
        https://pypi.python.org/pypi/fisher/

        Test is performed in C (100x speed-up)

        Parameters
        ----------
        tab : list of lists or 2x2 ndarray
            Each element should contain counts
        alternative : string
            Specifies the alternative hypothesis (similar to scipy.fisher_exact)
            Options: 'two-sided', 'less', 'greater'

        Returns
        -------
        OR : float
            Odds-ratio associated with the 2 x 2 table. When the off-diagonal
            product is zero this is inf (or nan if the diagonal product is
            zero as well) instead of raising ZeroDivisionError.
        p : float
            P-value associated with the test and the alternative hypothesis

        Raises
        ------
        ValueError
            If `alternative` is not one of the supported options."""

        a, b = tab[0][0], tab[0][1]
        c, d = tab[1][0], tab[1][1]
        res = fisher.pvalue(a, b, c, d)

        numerator = a * d
        denominator = b * c
        if denominator:
            # float() forces true division under Python 2 as well.
            OR = float(numerator) / denominator
        elif numerator:
            OR = float('inf')
        else:
            OR = float('nan')

        if alternative == 'two-sided':
            return (OR, res.two_tail)
        elif alternative == 'less':
            return (OR, res.left_tail)
        elif alternative == 'greater':
            return (OR, res.right_tail)
        raise ValueError(
            "alternative must be 'two-sided', 'less' or 'greater', got %r"
            % (alternative,))
Ejemplo n.º 24
0
    def calculate_differential_methylation_fisher_exact(self, weighted=False):
        """Return a two-sided Fisher exact p-value for control vs. affected.

        Sums methylation and coverage over every CpG in `self.cpgs`. With
        `weighted` set, the `weighted_methylation_*` attributes are summed
        instead of `meth_*`; coverage always comes from `cov_*`.

        The summed methylation is divided by the summed coverage and then by
        100 to obtain methylated counts (presumably the methylation values
        are percentages -- confirm against the CpG class). The unmethylated
        counts are the remaining coverage.

        The fast `fisher` module is tried first; on any failure (including
        the module not being available) scipy's `stats.fisher_exact` is used.
        """
        sum_meth_control = 0
        sum_meth_affected = 0
        sum_cov_control = 0
        sum_cov_affected = 0
        for cpg in self.cpgs:
            if weighted:
                sum_meth_control += cpg.weighted_methylation_control
                sum_meth_affected += cpg.weighted_methylation_affected
            else:
                sum_meth_control += cpg.meth_control
                sum_meth_affected += cpg.meth_affected
            # Coverage is accumulated the same way in both modes.
            sum_cov_control += cpg.cov_control
            sum_cov_affected += cpg.cov_affected

        control = sum_meth_control / sum_cov_control
        affected = sum_meth_affected / sum_cov_affected
        control_methylated = sum_cov_control * control / 100
        control_unmethylated = sum_cov_control - control_methylated
        affected_methylated = sum_cov_affected * affected / 100
        affected_unmethylated = sum_cov_affected - affected_methylated
        try:
            #Try to use the much faster fisher module from http://pypi.python.org/pypi/fisher/
            p = fisher_exact.pvalue(control_methylated, control_unmethylated,
                                    affected_methylated, affected_unmethylated)
            pvalue = p.two_tail
        except Exception:
            # Deliberately broad: covers a missing module (NameError) as well
            # as the module rejecting the inputs, but no longer swallows
            # KeyboardInterrupt / SystemExit.
            oddsratio, pvalue = stats.fisher_exact(
                [(control_methylated, control_unmethylated),
                 (affected_methylated, affected_unmethylated)],
                alternative='two-sided')
        return pvalue
Ejemplo n.º 25
0
def control_comparison(control_cohort, gene_id, sample_hits, sample_size, inheritance_mode, variant_filter, quality_filter):
    """
    Compare the number of hits in a sample against a control cohort.

    Counts the individuals of `control_cohort` returned by
    `get_individuals_with_inheritance_in_gene` for `gene_id` and
    `inheritance_mode` (with `variant_filter` and `quality_filter` applied),
    then runs Fisher's exact test on
    (sample_hits, sample_size, control_hits, control cohort size).

    Returns a dict with 'control_hits' and 'fisher_2sided_pvalue'.
    """
    population_datastore = get_population_datastore()
    cohort = population_datastore.get_control_cohort(control_cohort)
    indivs_with_inheritance, _gene_variation = get_individuals_with_inheritance_in_gene(
        population_datastore,
        get_reference(),
        cohort,
        inheritance_mode,
        gene_id,
        variant_filter=variant_filter,
        quality_filter=quality_filter
        )
    control_hits = len(indivs_with_inheritance)
    # The size must describe the same cohort the hits were counted in; using
    # the default cohort's size here mixed two populations whenever a
    # non-default `control_cohort` was requested.
    control_size = population_datastore.get_control_cohort_size(control_cohort)
    fisher_results = fisher.pvalue(
        sample_hits,
        sample_size,
        control_hits,
        control_size
    )
    return {
        'control_hits': control_hits,
        'fisher_2sided_pvalue': fisher_results.two_tail,
    }
Ejemplo n.º 26
0
def hypergeom_test(patients, anno_dict, anno_pats, categories=None, sign_thr=0.05):
    """Find the annotation category most enriched among `patients`.

    e.g. whether bicluster membership is associated with group membership.

    :param patients: iterable of patient ids forming the bicluster
    :param anno_dict: mapping category -> set of annotated patient ids
    :param anno_pats: set of all annotated patient ids (the universe)
    :param categories: categories to test; all keys of `anno_dict` when
        omitted or empty
    :param sign_thr: a category is only considered when its right-tailed
        Fisher p-value is below this threshold
    :return: tuple (best_p_val, best_fold_enrichment, best_overlap,
        enriched_cat). `best_p_val` is -log10 of the p-value of the selected
        category; when no category is selected the tuple is
        (0.05, 0, 0, "NA").
    """
    patient_set = set(patients)
    in_bicluster = patient_set.intersection(anno_pats)
    outside_bicluster = anno_pats.difference(patient_set)
    # NOTE: the running best is kept on the -log10 scale but starts at 0.05;
    # that starting value is also what is returned when nothing is enriched,
    # so it is left unchanged for callers.
    best_p_val = 0.05
    enriched_cat = "NA"
    best_fold_enrichment = 0
    best_overlap = 0
    if categories is None or len(categories) == 0:
        categories = anno_dict.keys()
    for category in categories:
        in_group = anno_dict[category]
        outside_group = anno_pats.difference(in_group)
        # define group membership
        overlap = len(in_bicluster.intersection(in_group))
        outside_both = len(outside_bicluster.intersection(outside_group))
        in_bicluster_outside_group = len(in_bicluster.intersection(outside_group))
        outside_bicluster_in_group = len(set(outside_bicluster).intersection(set(in_group)))
        # right-sided exact Fisher's test
        p_val = pvalue(overlap, in_bicluster_outside_group,
                       outside_bicluster_in_group, outside_both).right_tail

        # Honour the caller-supplied threshold (previously hard-coded to 0.05).
        if p_val < sign_thr:
            expected_overlap = float(len(in_group)) / len(anno_pats) * len(in_bicluster)
            fold_enrichment = float(overlap) / expected_overlap
            log_neg_pval = -np.log10(p_val)
            if best_p_val < log_neg_pval:
                best_p_val = log_neg_pval
                enriched_cat = category
                best_fold_enrichment = fold_enrichment
                best_overlap = overlap
    return best_p_val, best_fold_enrichment, best_overlap, enriched_cat
Ejemplo n.º 27
0
def calculate_fisher(PATH_PEAKS, biocond, ip_data, input_data, library_size_ip,
                     library_size_input, window_cutoff):
    """Join IP and input window coverage tables and add a Fisher p-value.

    Reads `PATH_PEAKS + 'Fisher_' + ip_data + '.txt'` and the matching
    `input_data` file (each with one header line, fields separated by a tab
    or by spaces, window id in the first field and coverage in the second)
    and writes `PATH_PEAKS + 'Fisher_' + biocond + '.txt'`.

    Every output row is the IP row, followed by the input row without its
    window id, the IP/input coverage ratio (the IP coverage itself when the
    input coverage is 0) and the right-tail p-value of
    `pvalue(ip_cov, library_size_ip, input_cov, library_size_input)`.
    Windows whose IP coverage does not exceed `window_cutoff` get a p-value
    of 1 without running the test.

    Raises KeyError if an IP window id is missing from the input file.
    """
    # A tab or a run of one or more spaces. The previous pattern used ' *',
    # which also matches the empty string; since Python 3.7 re.split splits
    # on empty matches, which broke every field lookup.
    field_sep = re.compile(r'\t| +')

    with open(PATH_PEAKS + 'Fisher_' + ip_data + '.txt', "r") as ipfile, \
            open(PATH_PEAKS + 'Fisher_' + input_data + '.txt', "r") as inputFile, \
            open(PATH_PEAKS + 'Fisher_' + biocond + '.txt', "w") as bed_file:
        header = [
            'WindowId', 'Windowcov', 'Windowcov_Input', 'Ratio_windowcov',
            'pvalue'
        ]
        bed_file.write('\t'.join(header) + '\n')
        # Skip the header line of both inputs.
        ipfile.readline()
        inputFile.readline()

        # Index the input rows by window id.
        window_name_to_row = dict()
        for rowInput in inputFile:
            window_id_input = field_sep.split(rowInput)[0]
            window_name_to_row[window_id_input] = rowInput

        # Column layout of both inputs:
        # WindowId 0
        # Windowcov 1
        # RPM 2
        for index, row_ip in enumerate(ipfile, start=1):
            window_id_ip = field_sep.split(row_ip)[0]
            # Drop only the leading id field. A blanket str.replace() would
            # also eat the id wherever it happens to occur inside a number.
            input_parts = field_sep.split(window_name_to_row[window_id_ip],
                                          maxsplit=1)
            row_input = input_parts[1].strip() if len(input_parts) > 1 else ''
            new_row = row_ip.strip() + '\t' + row_input

            # Calc ratio window
            ip_fields = [field for field in field_sep.split(row_ip) if field]
            window_cov = float(ip_fields[1].strip())
            window_input_cov = float(field_sep.split(row_input)[0].strip())
            if window_input_cov == 0:
                new_row += '\t' + str(window_cov)
            else:
                ratio_windows = window_cov / window_input_cov
                new_row += '\t' + str(ratio_windows)

            # Calc fisher-test
            if window_cov > window_cutoff:
                p = pvalue(int(window_cov), library_size_ip,
                           int(window_input_cov), library_size_input)
                new_row += '\t' + str(p.right_tail)
            else:
                new_row += '\t1'

            # write file
            bed_file.write(new_row + '\n')
            if index % 1000000 == 0:
                print('Windows ' + str(index), '/25000000')
    print("Fisher test calculated")
Ejemplo n.º 28
0
def calculate_fisher(row_vals, col_vals, test_type):
    """Run Fisher's exact test on a prepared contingency table.

    `row_vals` and `col_vals` are two-element sequences supplying the four
    counts. `test_type` selects the reported p-value: 1 for two-tailed,
    2 for left-tailed, 3 for right-tailed. Any other value raises TypeError.
    """
    first_row, second_row = row_vals
    first_col, second_col = col_vals

    tail_by_type = {1: 'two_tail', 2: 'left_tail', 3: 'right_tail'}
    # Reject unknown selectors before doing any work.
    if test_type not in tail_by_type:
        raise TypeError

    result = fisher.pvalue(first_row, second_row, first_col, second_col)
    return getattr(result, tail_by_type[test_type])
Ejemplo n.º 29
0
def fisherExact(line, idx):
    """Append Fisher exact test p-values to a bed line.

    The four counts are read from `line` at the positions given by the first
    four entries of `idx` and converted to int. One new field holding the
    left-tail and right-tail p-values (each rounded to 4 decimals and joined
    by a tab) is appended to `line` in place; `line` is then returned.
    """
    a, b, c, d = [int(line[idx[pos]]) for pos in range(4)]
    result = fisher.pvalue(a, b, c, d)
    tails = (result.left_tail, result.right_tail)
    line.append('\t'.join(str(round(tail, 4)) for tail in tails))
    return line
Ejemplo n.º 30
0
def heatmap_v2(chromosomes,pop_counts, num_variants, population_dict,frequency_range, exclude,
                p_value, muted_dir,tag= '',output= 'pval',row= 24, col= 4, test= 'fisher'):
    '''
    Cell-wise pairwise comparison of two count matrices.

    - v2: count matrices are provided in the pop_counts dictionary, which
      must hold exactly two entries; the first key is the reference
      population, the second the population being compared.

    For every cell a 2x2 table [[count, num_variants] for pop, then refpop]
    is tested with chi2_contingency (test == 'chi2') or Fisher's exact test
    (two-tailed, any other `test` value).

    Returns (ratio_grid, (sig_x, sig_y)). ratio_grid holds the p-value when
    output == 'pval', otherwise the ratio of proportions. Cells with an
    empty table row are set to nan and flagged; cells with an empty table
    column are set to 1. sig_x / sig_y collect the centres (index + 0.5) of
    flagged cells and of cells whose p-value is below `p_value`.
    '''
    # Result is not used further here; the call is kept as in the original.
    files = read_exclude() if exclude else {}

    refpop, pop = list(pop_counts.keys())

    ratio_grid = np.zeros((row, col))
    sig_x, sig_y = [], []

    for i in range(row):
        for j in range(col):
            pop_hits = pop_counts[pop][i][j]
            ref_hits = pop_counts[refpop][i][j]
            pop_total = num_variants[pop]
            ref_total = num_variants[refpop]

            chi_array = np.array([
                [pop_hits, pop_total],
                [ref_hits, ref_total],
            ])
            row_sums = np.sum(chi_array, axis=1)
            col_sums = np.sum(chi_array, axis=0)

            # A population without any counts: undefined, but still flagged.
            if row_sums[0] == 0 or row_sums[1] == 0:
                ratio_grid[i][j] = np.nan
                sig_x.append(j + 0.5)
                sig_y.append(i + 0.5)
                continue

            # Nothing observed in either population for this column.
            if col_sums[0] == 0 or col_sums[1] == 0:
                ratio_grid[i][j] = 1
                continue

            if test == 'chi2':
                _, this_pval, _, _ = chi2_contingency(chi_array)
            else:
                this_pval = pvalue(pop_hits, pop_total,
                                   ref_hits, ref_total).two_tail

            if output == 'pval':
                ratio_grid[i][j] = this_pval
            else:
                ratio_grid[i][j] = (pop_hits * ref_total /
                                    (pop_total * ref_hits))

            if this_pval < p_value:
                sig_x.append(j + 0.5)
                sig_y.append(i + 0.5)

    return ratio_grid, (sig_x, sig_y)
Ejemplo n.º 31
0
def calc_gene_overlap_pval(bic, bic2, N):
    """Right-tailed Fisher p-value for the gene overlap of two biclusters.

    `bic["genes"]` and `bic2["genes"]` are sets of genes; `N` is the size of
    the gene universe used to derive the "in neither set" cell.
    """
    genes_a, genes_b = bic["genes"], bic2["genes"]
    shared = len(genes_a.intersection(genes_b))
    only_a = len(genes_a.difference(genes_b))
    only_b = len(genes_b.difference(genes_a))
    in_neither = N - shared - only_a - only_b
    return pvalue(shared, only_a, only_b, in_neither).right_tail
Ejemplo n.º 32
0
def _replicate_counts(key, ufi, dfi, circ_by_sample, sample, linear_reps, n_reps):
    """Collect per-replicate circular counts and flanking linear counts."""
    circ_counts, linear_counts = [], []
    for rep in range(n_reps):
        circ, linear = 0, 0
        # Linear junction counts are only looked up when the circRNA itself
        # was observed in this replicate.
        if key in circ_by_sample[sample][rep]:
            circ = circ_by_sample[sample][rep][key]
            if ufi in linear_reps[rep]:  ## upstream junction count exists
                linear = linear_reps[rep][ufi]
            if dfi in linear_reps[rep]:  ## downstream junction count exists
                linear += linear_reps[rep][dfi]
        circ_counts.append(circ)
        linear_counts.append(linear)
    return circ_counts, linear_counts


def computeFisherExact(
):
    """Compute a two-tailed Fisher's exact test per circRNA.

    Replicates are summed. For every key of the module-level
    `combined_circ_d` the per-replicate counts of both samples are stored in
    `counts_d[key]` as [cc_1, lc_1, cc_2, lc_2] and the two-tailed p-value in
    `fp[key]`. Afterwards the sorted keys of `fp` are appended to
    `sortedKey` (used for the FDR calculation).
    """
    for k in combined_circ_d:  ## for all unique circRNAs
        introns = combined_circ_d[k]
        ## upstream and downstream flanking introns; "noKey" stands in when
        ## only one intron is listed
        ufi = introns[0]
        dfi = introns[1] if len(introns) == 2 else "noKey"

        cc_1, lc_1 = _replicate_counts(k, ufi, dfi, c_dic, S1, lcs1, numC1)
        cc_2, lc_2 = _replicate_counts(k, ufi, dfi, c_dic, S2, lcs2, numC2)
        counts_d[k] = [cc_1, lc_1, cc_2, lc_2]

        ## circular counts are doubled before entering the table
        p = fisher.pvalue(2 * sum(cc_1), sum(lc_1), 2 * sum(cc_2), sum(lc_2))
        ## the result also offers p.left_tail and p.right_tail
        fp[k] = p.two_tail

    ## sort keys for fdr calculation
    sortedKey.extend(sorted(fp))
    logging.debug("Done computing two-tail fisher exact test")
Ejemplo n.º 33
0
    def fisherTest(tab, alternative='two-sided'):
        """Fisher's exact test on a 2x2 contingency table via fisher.pvalue.

        `tab` is a 2x2 nested sequence (or array) of counts. `alternative`
        is one of 'two-sided', 'less' or 'greater' and selects the two-,
        left- or right-tailed p-value.

        Returns (OR, p): the sample odds ratio and the selected p-value.
        The odds ratio is inf when the off-diagonal product is zero (nan if
        the diagonal product is zero too) rather than raising
        ZeroDivisionError. Raises ValueError for an unknown `alternative`.
        """
        (a, b), (c, d) = (tab[0][0], tab[0][1]), (tab[1][0], tab[1][1])
        res = fisher.pvalue(a, b, c, d)

        cross_main = a * d
        cross_off = b * c
        if cross_off:
            # float() keeps the ratio exact under Python 2 integer division.
            OR = float(cross_main) / cross_off
        else:
            OR = float('inf') if cross_main else float('nan')

        if alternative == 'two-sided':
            return (OR, res.two_tail)
        elif alternative == 'less':
            return (OR, res.left_tail)
        elif alternative == 'greater':
            return (OR, res.right_tail)
        raise ValueError(
            "alternative must be 'two-sided', 'less' or 'greater', got %r"
            % (alternative,))
Ejemplo n.º 34
0
def hypergeom(m, n, n1, n2):
    """
    Right-tailed Fisher p-value for the overlap of two gene sets.

    From Fury et al., www.nslij-genetics.org/wli/pub/ieee-embs06.pdf

    :param m: overlapping genes
    :param n: total genes that could be sampled
    :param n1: number of genes in set 1
    :param n2: number of genes in set 2
    """
    table = _fury_table(m, n, n1, n2)
    result = fisher.pvalue(*table)
    return result.right_tail
Ejemplo n.º 35
0
def count_F(s1, s2):
    """Two-tailed Fisher p-value for the association of two 0/1 sequences.

    The sequences are paired element-wise and each pair (x, y) is encoded as
    x + 10 * y, so the codes 0, 1, 10 and 11 identify the four cells of the
    2x2 table. The cell counts are passed to fisher.pvalue as floats, in the
    order 0, 1, 10, 11.
    """
    codes = [x + 10 * y for x, y in zip(s1, s2)]
    table = [float(codes.count(cell)) for cell in (0, 1, 10, 11)]
    return fisher.pvalue(table[0], table[1], table[2], table[3]).two_tail
Ejemplo n.º 36
0
def significance_on_tuple(sig_tuple):
    """Left-tailed Fisher p-value for the co-occurrence of two words.

    `sig_tuple` has six entries; the first two are ignored, the rest are the
    occurrence count of word 1, the occurrence count of word 2, their
    co-occurrence count and the number of documents.
    """
    _first, _second, w1_occurrence, w2_occurrence, cooccurrences, n_docs = sig_tuple

    both = cooccurrences
    only_w2 = w2_occurrence - cooccurrences
    only_w1 = w1_occurrence - cooccurrences
    neither = n_docs - w1_occurrence - w2_occurrence + cooccurrences

    result = fisher.pvalue(both, only_w2, only_w1, neither)
    return result.left_tail
Ejemplo n.º 37
0
def _fisherStrandBias(record):
    try:
        A = record.genotype('TUMOR')['ALT_F1R2']
        B = record.genotype('TUMOR')['ALT_F2R1']
        C = record.genotype('TUMOR')['REF_F1R2']
        D = record.genotype('TUMOR')['REF_F2R1']
        FSB = -math.log10(fisher.pvalue(A,B,C,D).two_tail)
    except:
        FSB = '.'
    return(FSB)
Ejemplo n.º 38
0
def fisherExact(line, idx):
    """Run Fisher's exact test on four columns of a bed line.

    `idx` lists the positions in `line` of the four table counts. The left-
    and right-tail p-values, rounded to 4 decimals and separated by a tab,
    are appended to `line` as one extra field. Returns the same `line`.
    """
    columns = (idx[0], idx[1], idx[2], idx[3])
    counts = [int(line[column]) for column in columns]
    fet = fisher.pvalue(*counts)
    left = str(round(fet.left_tail, 4))
    right = str(round(fet.right_tail, 4))
    line.append(left + '\t' + right)
    return line
Ejemplo n.º 39
0
def hypergeom(m, n, n1, n2):
    """
    Overlap significance of two gene sets (right tail of Fisher's test).

    From Fury et al., www.nslij-genetics.org/wli/pub/ieee-embs06.pdf

    :param m: overlapping genes
    :param n: total genes that could be sampled
    :param n1: number of genes in set 1
    :param n2: number of genes in set 2
    """
    a, b, c, d = _fury_table(m, n, n1, n2)
    return fisher.pvalue(a, b, c, d).right_tail
Ejemplo n.º 40
0
    def _test_hypergeom(m, n, n1, n2):
        """Check the Fisher right tail against a value derived from R's phyper."""
        # Reference value: difference of two R `phyper` evaluations.
        upper = r.phyper(min(n1, n2), n1, n - n1, n2)[0]
        lower = r.phyper(m - 1, n1, n - n1, n2)[0]
        R_pval = upper - lower
        f_pval = fisher.pvalue(*_fury_table(m, n, n1, n2)).right_tail

        # compared after formatting to 10 decimal places
        R_str = '%.10f' % R_pval
        f_str = '%.10f' % f_pval

        # Single pre-formatted argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('R: %s Fisher: %s' % (R_str, f_str))
        assert R_str == f_str
Ejemplo n.º 41
0
def hg_test(rlist1, rlist2, t_u, t_w, return_items=False):
    """Test the overlap between the tops of two ranked lists.

    Takes the first `t_u` items of `rlist1` and the first `t_w` items of
    `rlist2`; the universe size is len(rlist1).

    Returns (p_val, enrichment, overlap): the right-tailed Fisher p-value,
    the observed overlap divided by t_u * t_w / N, and either the
    overlapping items (`return_items` true) or their number.
    """
    N = len(rlist1)
    top_u = set(rlist1[:t_u])
    top_w = set(rlist2[:t_w])
    overlap = top_u.intersection(top_w)
    overlap_size = len(overlap)

    table = (overlap_size,
             t_u - overlap_size,
             t_w - overlap_size,
             N + overlap_size - t_u - t_w)
    p_val = pvalue(*table).right_tail

    expected = float((t_u) * (t_w)) / N
    enrichment = float(overlap_size) / expected

    if return_items:
        return p_val, enrichment, overlap
    return p_val, enrichment, overlap_size
Ejemplo n.º 42
0
def find_threshold(t_u, t_w, N, significance_thr):
    '''Find min. possible overlap still passing the significance threshold, given t_u,t_w and N.

    Overlaps are tried from min(t_u, t_w) downwards and the scan stops at
    the first one whose right-tailed p-value is not below
    `significance_thr`. Returns N when no overlap passes.
    '''
    smallest_passing = N
    for overlap in range(min(t_u, t_w), 0, -1):
        tail = pvalue(overlap, t_u - overlap, t_w - overlap,
                      N - t_u - t_w + overlap).right_tail
        if not (tail < significance_thr):
            break
        smallest_passing = overlap
    return smallest_passing
Ejemplo n.º 43
0
    def _test_hypergeom(m, n, n1, n2):
        """Assert that fisher's right tail matches the R-derived value."""
        phyper_args = (n1, n - n1, n2)
        R_pval = r.phyper(min(n1, n2), *phyper_args)[0] \
                - r.phyper(m - 1, *phyper_args)[0]
        fury_cells = _fury_table(m, n, n1, n2)
        f_pval = fisher.pvalue(*fury_cells).right_tail

        # both values are rendered with 10 decimal places before comparing
        R_str, f_str = ('%.10f' % R_pval), ('%.10f' % f_pval)

        # One formatted string gives identical output with the Python 2
        # print statement and the Python 3 print function.
        print('R: %s Fisher: %s' % (R_str, f_str))
        assert R_str == f_str
Ejemplo n.º 44
0
def fisher_chi(study_c, study_p, control_c, control_p):
    """Test study vs. control counts and report the direction of the effect.

    Takes study count, population count, control count and control total.
    Fisher's exact test (two-tailed) is used while both `study_p` and
    `control_p` are at most 1000; otherwise R's `chisq.test` is run on the
    2x2 matrix and element [2] of its result is used (presumably the
    p-value -- confirm against the R documentation).

    Returns (p, enrichment) where enrichment is 'e' if the study proportion
    exceeds the control proportion and 'p' otherwise.
    """
    small_enough = study_p <= 1000 and control_p <= 1000
    if small_enough:
        exact = fisher.pvalue(study_c, study_p, control_c, control_p)
        res = [exact.two_tail]
    else:
        counts = ro.IntVector([study_c, study_p, control_c, control_p])
        table = ro.r['matrix'](counts, 2, 2)
        res = ro.r['chisq.test'](table)[2]

    study_rate = 1.0 * study_c / study_p
    control_rate = 1.0 * control_c / control_p
    if study_rate > control_rate:
        enrichment = 'e'
    else:
        enrichment = 'p'
    return res[0], enrichment
Ejemplo n.º 45
0
    def run_study_genome_v2(self, association_2_count_dict_foreground,
                            association_2_count_dict_background, foreground_n,
                            background_n):
        """
        Run a right-tailed Fisher's exact test for every foreground association.

        ###################################################
        # contingency table general variable names:
        #     foreground       |     background     |
        # -------------------------------------------------
        # +   a = foregr_count |   c = backgr_count |   r1
        # -------------------------------------------------
        # -     b              |       d            |   r2
        # -------------------------------------------------
        #     foregr_n         |     backgr_n       |    n

        Returns the DataFrame (columns "term", "p_value", "foreground_count")
        after passing it through multiple_testing.BH_fast_v3. If an
        association is missing from the background counts, an error message
        is stored in self.args_dict["ERROR_association_2_count"] and None is
        returned.
        """
        fisher_dict = {}  # memo: identical tables are only tested once
        len_dict = len(association_2_count_dict_foreground)
        # NOTE(review): 'U13' silently truncates terms longer than 13
        # characters; the width was derived from the data with:
        # cat Functions_table_STRING.txt | cut -f 2 | awk '{print length, $0}' | sort -nr | head -1
        term_arr = np.empty(
            (len_dict, ), dtype=np.dtype('U13')
        )
        p_value_arr = np.zeros(shape=(len_dict, ), dtype="float64")
        # int64 instead of int8: an 8-bit counter overflows as soon as an
        # association has more than 127 foreground hits.
        foreground_count_arr = np.zeros(shape=(len_dict, ), dtype="int64")

        for i, (association, foreground_count) in enumerate(
                association_2_count_dict_foreground.items()):
            try:
                background_count = association_2_count_dict_background[
                    association]
            except KeyError:
                self.args_dict[
                    "ERROR_association_2_count"] = "ERROR retrieving counts for association {} please contact [email protected] with this error message".format(
                        association)
                return None
            a = foreground_count  # number of proteins associated with given GO-term
            b = foreground_n - foreground_count  # number of proteins not associated with GO-term
            c = background_count
            d = background_n - background_count
            if d < 0:
                d = 0
            ### enriched or overrepresented --> right_tail or greater (but foreground and background are switched)
            try:
                p_val_uncorrected = fisher_dict[(a, b, c, d)]
            except KeyError:
                p_val_uncorrected = pvalue(a, b, c, d).right_tail
                fisher_dict[(a, b, c, d)] = p_val_uncorrected
            term_arr[i] = association
            p_value_arr[i] = p_val_uncorrected
            foreground_count_arr[i] = foreground_count

        df = pd.DataFrame()
        df["term"] = term_arr
        df["p_value"] = p_value_arr
        df["foreground_count"] = foreground_count_arr
        df = multiple_testing.BH_fast_v3(df)
        return df
Ejemplo n.º 46
0
    def run_study_genome(self, association_2_count_dict_foreground,
                         association_2_count_dict_background, foreground_n,
                         background_n):
        """
        Right-tailed Fisher's exact test per foreground association.

        ###################################################
        # contingency table general variable names:
        #     foreground       |     background     |
        # -------------------------------------------------
        # +   a = foregr_count |   c = backgr_count |   r1
        # -------------------------------------------------
        # -     b              |       d            |   r2
        # -------------------------------------------------
        #     foregr_n         |     backgr_n       |    n

        Builds a DataFrame with the columns "term", "p_value" and
        "foreground_count" and returns it after
        multiple_testing.BH_fast_v3. A foreground association without a
        background count stores an error message in
        self.args_dict["ERROR_association_2_count"] and returns None.
        """
        p_value_cache = {}  # table -> p-value, so equal tables are tested once
        terms, p_values, counts = [], [], []
        description_list = []  # kept from the original; never filled
        for association, foreground_count in association_2_count_dict_foreground.items(
        ):
            try:
                background_count = association_2_count_dict_background[
                    association]
            except KeyError:
                self.args_dict[
                    "ERROR_association_2_count"] = "ERROR retrieving counts for association {} please contact [email protected] with this error message".format(
                        association)
                return None

            # a / b: proteins with / without the term in the foreground,
            # c / d: the same for the background (d is clamped at 0).
            remaining_background = background_n - background_count
            if remaining_background < 0:
                remaining_background = 0
            table = (foreground_count,
                     foreground_n - foreground_count,
                     background_count,
                     remaining_background)

            ### enriched or overrepresented --> right_tail or greater (but foreground and background are switched)
            if table not in p_value_cache:
                p_value_cache[table] = pvalue(*table).right_tail
            p_val_uncorrected = p_value_cache[table]

            terms.append(association)
            p_values.append(p_val_uncorrected)
            # foreground ids are added later, after FDR filtering
            counts.append(foreground_count)

        df = pd.DataFrame({
            "term": terms,
            "p_value": p_values,
            "foreground_count": counts
        })
        df = multiple_testing.BH_fast_v3(df)
        return df
Ejemplo n.º 47
0
def fisher_chi(study_c, study_p, control_c, control_p):
    """Return a p-value and an enrichment flag for study vs. control.

    Takes study count, population count, control count and control total.
    While both totals are at most 1000 the two-tailed Fisher exact p-value
    is used. Otherwise the counts go through R's `chisq.test` and element
    [2] of the result is read (presumably its p-value -- confirm against
    the R documentation).

    The flag is "e" when study_c / study_p is larger than
    control_c / control_p, else "p".
    """
    if study_p > 1000 or control_p > 1000:
        r_counts = ro.IntVector([study_c, study_p, control_c, control_p])
        r_table = ro.r["matrix"](r_counts, 2, 2)
        res = ro.r["chisq.test"](r_table)[2]
    else:
        res = [fisher.pvalue(study_c, study_p, control_c, control_p).two_tail]

    study_fraction = 1.0 * study_c / study_p
    control_fraction = 1.0 * control_c / control_p
    enrichment = "e" if study_fraction > control_fraction else "p"
    return res[0], enrichment
Ejemplo n.º 48
0
def rtest(comps, genome, motif, rcounts, nums): # rcounts
    """Fisher test and information gain for a motif, on real and permuted data.

    Prints a progress line naming `comps`, `genome` and `motif`.

    `rcounts` supplies the motif counts ("t" for the class, "c" for the
    control, plus per-permutation keys "<event_class>.p<i>" and "c.p<i>");
    `nums` supplies the group sizes under "t" and "c". Missing keys count
    as 0.

    Returns a tuple (fisher, p_emp, ig): the right-tail Fisher p-value on the
    real data, the list of right-tail p-values for each of the
    `rnamotifs2.config.perms` permutations, and the information gain.

    NOTE(review): `event_class` is not defined in this function; it is
    presumably a module-level name -- confirm. The group sizes are assumed
    to be non-zero: a zero size reaches math.log(0) or a division by zero.
    """
    print "%s.%s.%s: fisher test on real and perm data" % (comps, genome, motif)
    val_class = rcounts.get("t", 0)
    val_control = rcounts.get("c", 0)
    #num_class = nums.get("t.all", 0) # v_18
    #num_control = nums.get("c.all", 0) # v_18
    num_class = nums.get("t", 0) # v_17
    num_control = nums.get("c", 0) # v_17
    num_all = float(num_class+num_control)
    val_all = float(val_class+val_control)

    # 2x2 table: (with motif: class, control), (without motif: class, control)
    fisher = pvalue(val_class, val_control, num_class-val_class, num_control-val_control).right_tail

    # information gain (ig at the end)
    # i: entropy of the class/control split over all events
    i1 = (num_class/num_all)*math.log(num_class/num_all, 2)
    i2 = (num_control/num_all)*math.log(num_control/num_all, 2)
    i = -(i1+i2)
    # c1_num: weighted entropy of the events that have the motif;
    # set to 0 when either group has no such events
    c1_num = max(1, val_all) # dont allow it to be 0
    if (val_class/c1_num)>0 and val_control/c1_num>0:
        c1_num = -( (val_class/c1_num)*math.log(val_class/c1_num, 2) + (val_control/c1_num)*math.log(val_control/c1_num, 2) )
        c1_num = val_all/num_all * c1_num
    else:
        c1_num = 0
    # c2_num: weighted entropy of the events that lack the motif
    c2_num = max(1, float( (num_class-val_class) + (num_control-val_control) ) )  # dont allow it to be 0
    if (num_class-val_class)/c2_num>0 and (num_control-val_control)/c2_num>0:
        c2_num = -( ((num_class-val_class)/c2_num)*math.log((num_class-val_class)/c2_num, 2) + ((num_control-val_control)/c2_num)*math.log((num_control-val_control)/c2_num, 2) )
        c2_num = (num_all-val_all)/num_all * c2_num
    else:
        c2_num = 0
    ig = i - (c1_num + c2_num)

    # same Fisher test on every permutation; group sizes come from
    # rnamotifs2.perm.ec_dist[p]
    p_emp = []
    for p in range(0, rnamotifs2.config.perms):
        val_class = rcounts.get("%s.p%s" % (event_class, p), 0)
        val_control = rcounts.get("c.p%s" % p, 0)
        num_class = rnamotifs2.perm.ec_dist[p].get(event_class, 0)
        num_control = rnamotifs2.perm.ec_dist[p].get("c", 0)
        val = pvalue(val_class, val_control, num_class-val_class, num_control-val_control).right_tail
        p_emp.append(val)

    return (fisher, p_emp, ig)
Ejemplo n.º 49
0
def fisher_right_tail(intersection_size, gse_size, query_size, module_size):
    """
    Right-tailed Fisher p-value for the overlap of a query and a module.

    The 2x2 table is built from the overlap, the module members outside the
    overlap, the query members outside the overlap, and what is left of
    `gse_size` after removing the query and the module-only members.

    :type query_size: int
    :type gse_size: int
    :type intersection_size: int
    :type module_size: int
    """
    module_only = module_size - intersection_size
    query_only = query_size - intersection_size
    remainder = gse_size - query_size - module_only
    result = fisher.pvalue(intersection_size, module_only, query_only,
                           remainder)
    return result.right_tail
Ejemplo n.º 50
0
def main():
    """Run a GO-term enrichment test for a gene list.

    Options (parsed by `cmdparameter`): the GO annotation file, the gene
    list file, an optional output file (stdout when empty), the p-value
    threshold and a verbose flag.

    The annotation file is tab-separated with one header line; column 3
    holds the comma-separated genes of the term, column 4 the term's gene
    count and column 5 the total gene count. Only listed genes that appear
    in the annotation are considered. For every term containing at least
    one of them a two-tailed Fisher p-value and the fold enrichment `fracT`
    are computed; terms with fracT > 1 and p <= threshold are written out.

    When an output file was given, `multipleTest.sh -f <output>` is run on
    it afterwards.
    """
    options, args = cmdparameter(sys.argv)
    #-----------------------------------
    go_file = options.go
    gene_file = options.gene
    output = options.output
    pvalue_thresh = float(options.pvalue)
    verbose = options.verbose
    fh_out = open(output, 'w') if output else sys.stdout
    try:
        #-----------------------------------
        # Every gene mentioned in the annotation file. The header line is
        # included here as well, exactly as before.
        with open(go_file) as go_fh:
            annoGeneD = set(gene for line in go_fh
                            for gene in line.split('\t')[2].split(','))
        with open(gene_file) as gene_fh:
            geneD = set(line.strip() for line in gene_fh)
        geneD = geneD.intersection(annoGeneD)
        geneD_len = len(geneD)
        #--------------------------------
        with open(go_file) as go_fh:
            for line_no, line in enumerate(go_fh):
                lineL = line.strip().split('\t')
                if line_no == 0:
                    # Header: reuse the first two column names of the input.
                    fh_out.write(
                        "%s\t%s\tTargetGene\tTargetCount\tTargetTotal\tp\tfracT\n"
                        % (lineL[0], lineL[1]))
                    continue
                go_gene = lineL[2].split(',')
                anno_gene = [gene for gene in go_gene if gene in geneD]
                if not anno_gene:
                    continue
                termCount = int(lineL[3])
                totalCount = int(lineL[4])
                annoCount = len(anno_gene)
                p = pvalue(annoCount, termCount-annoCount,
                        geneD_len-annoCount,
                        totalCount-geneD_len+annoCount-termCount)
                p = p.two_tail
                # observed fraction of the list in the term, relative to the
                # fraction of all genes in the term
                fracT = annoCount * 1.0 / geneD_len / termCount * totalCount
                if fracT > 1 and p <= pvalue_thresh:
                    fh_out.write(
                        "%s\t%s\t%s\t%d\t%d\t%f\t%f\n"
                        % (lineL[0], lineL[1], ','.join(anno_gene),
                            annoCount, geneD_len, p, fracT))
        #-------------END reading file----------
    finally:
        # Close the output even when parsing fails; never close stdout.
        if output:
            fh_out.close()
    if output:
        # NOTE(review): `output` is interpolated into a shell command
        # unquoted; paths containing spaces or shell metacharacters will
        # misbehave.
        os.system("multipleTest.sh -f %s" % output)
    if verbose:
        sys.stderr.write(
            "--Successful %s\n" % strftime(timeformat, localtime()))
Ejemplo n.º 51
0
def _fisher(rule):
    '''
    Fisher's exact test for one rule.

    X is "covered by the rule", Y is "belongs to the rule's target class".
    The counts are taken from `rule.coverage`, `rule.distribution` and the
    knowledge base (`rule.kb.examples`, `rule.kb.distribution`).

    Returns the object produced by `pvalue` for the 2x2 table
    (X and Y, X and not Y, not X and Y, not X and not Y).
    '''
    N = float(len(rule.kb.examples))
    nX = float(rule.coverage)
    nY = rule.kb.distribution[rule.target]
    nXY = rule.distribution[rule.target]
    nXnotY = nX - nXY
    nnotXY = nY - nXY
    # The four cells must add up to N, so the covered-and-positive cell has
    # to be subtracted as well (it was previously left in, inflating this
    # cell by nXY).
    nnotXnotY = N - nXY - nXnotY - nnotXY
    return pvalue(nXY, nXnotY, nnotXY, nnotXnotY)
Ejemplo n.º 52
0
def calc_overlap_pval_J(bic, bic2, all_samples):
    """Sample overlap of two biclusters: Fisher p-value and Jaccard index.

    When the two biclusters have different "direction" values, the samples
    of `bic2` are replaced by their complement within `all_samples` before
    comparing.

    Returns (p_val, J): the right-tailed Fisher p-value and the size of the
    intersection divided by the size of the union.
    """
    first = bic["samples"]
    second = bic2["samples"]
    if bic["direction"] != bic2["direction"]:
        second = all_samples.difference(second)

    shared = len(first.intersection(second))
    first_only = len(first.difference(second))
    second_only = len(second.difference(first))
    union_size = shared + first_only + second_only
    neither = len(all_samples) - shared - first_only - second_only

    p_val = pvalue(shared, first_only, second_only, neither).right_tail
    J = 1.0 * shared / union_size
    return p_val, J
Ejemplo n.º 53
0
def filtering(control_file, affected_file, filtered_control_file, filtered_affected_file, max_pvalue = None, min_cov = None, max_cov = None, min_delta_methylation = None, filter_quantil = None):
    """
    Copy the sites that pass every enabled filter from two paired files.

    Both input files are read in lockstep. Every line must consist of six
    tab-separated fields (chrom, start, end, coverage, methylation, strand)
    and line i of both files must describe the same site. Methylation is
    divided by 100 to obtain a fraction, so it is presumably a percentage.

    control_file, affected_file -- paths of the two input files
    filtered_control_file, filtered_affected_file -- writable file objects;
        surviving lines are written unchanged and both objects are closed
        before returning
    max_pvalue -- drop sites whose two-sided Fisher p-value is larger
    min_cov, max_cov -- drop sites whose coverage in either file is outside
        these bounds
    min_delta_methylation -- drop sites whose absolute methylation difference
        is smaller
    filter_quantil -- probability; drop sites whose coverage exceeds that
        quantile of the coverage column of their own file

    A filter is disabled when its argument is None. A summary line is written
    to stdout. The process exits through ``sys.exit`` when a pair of lines
    does not describe the same site.
    """
    # izip only exists on Python 2; the builtin zip is already lazy on 3.
    try:
        from itertools import izip as lazy_zip
    except ImportError:
        lazy_zip = zip

    control_quantil = None
    affected_quantil = None
    if filter_quantil:
        control_quantil = mquantiles( np.loadtxt(control_file, delimiter='\t', usecols=(3,)), prob = [filter_quantil])[0]
        affected_quantil = mquantiles( np.loadtxt(affected_file, delimiter='\t', usecols=(3,)), prob = [filter_quantil])[0]

    # Counted explicitly so the summary also works for empty input files.
    total_sites = 0
    non_filtered_sites = 0
    with open(control_file) as control_handle:
        with open(affected_file) as affected_handle:
            for control_line, affected_line in lazy_zip(control_handle, affected_handle):
                total_sites += 1
                c_chrom, c_start, c_end, c_cov, c_meth, c_strand = control_line.strip().split('\t')
                a_chrom, a_start, a_end, a_cov, a_meth, a_strand = affected_line.strip().split('\t')
                # Explicit comparison instead of assert: asserts vanish under -O.
                if (c_chrom, c_start, c_end, c_strand) != (a_chrom, a_start, a_end, a_strand):
                    sys.exit('That file needs intersected inputfiles, so that each site is present in both files, affected and control.\n %s : %s \n %s : %s \n %s : %s \n %s : %s \n' % (c_chrom, a_chrom, c_start, a_start, c_end, a_end, c_strand, a_strand))

                c_cov, c_meth, a_cov, a_meth = [float(value) for value in (c_cov, c_meth, a_cov, a_meth)]
                if min_cov is not None and (a_cov < min_cov or c_cov < min_cov):
                    continue
                if max_cov is not None and (a_cov > max_cov or c_cov > max_cov):
                    continue
                if min_delta_methylation is not None and abs(a_meth - c_meth) < min_delta_methylation:
                    continue
                if filter_quantil and (c_cov > control_quantil or a_cov > affected_quantil):
                    continue

                if max_pvalue is not None:
                    control_methylated = c_cov * c_meth / 100
                    control_unmethylated = c_cov - control_methylated
                    affected_methylated = a_cov * a_meth / 100
                    affected_unmethylated = a_cov - affected_methylated
                    try:
                        #Try to use the much faster fisher module from http://pypi.python.org/pypi/fisher/
                        p = fisher_exact.pvalue(control_methylated, control_unmethylated, affected_methylated, affected_unmethylated)
                        site_pvalue = p.two_tail
                    except Exception:
                        # Deliberate best effort: any failure of the fast
                        # module (including it not being importable) falls
                        # back to scipy.
                        oddsratio, site_pvalue = stats.fisher_exact([(control_methylated, control_unmethylated), (affected_methylated, affected_unmethylated)], alternative='two-sided')

                    if site_pvalue > max_pvalue:
                        continue

                non_filtered_sites += 1
                filtered_control_file.write(control_line)
                filtered_affected_file.write(affected_line)

    sys.stdout.write( "%s from %s filtered.\n" % (total_sites - non_filtered_sites, total_sites) )
    filtered_affected_file.close()
    filtered_control_file.close()
Ejemplo n.º 54
0
def main():
    """
    Append a Fisher exact test p-value to every data line of a table.

    Command line arguments (read from ``sys.argv``):
      1. filename          -- whitespace-separated table; the second and third
                              field of each data line are the two counts
      2. total_first_col   -- total belonging to the first count
      3. total_second_col  -- total belonging to the second count
      4. tail (optional)   -- 'two_tail' (default), 'left_tail' or 'right_tail'
      5. head (optional)   -- number of leading header lines, default 1; they
                              are echoed with an extra 'p' column
      6. scale (optional)  -- integer divisor applied to counts and totals,
                              meant for counts larger than 4294967296

    Each data line is printed to stdout followed by the requested p-value of
    ``pvalue(q, m, total_first_col - q, total_second_col - m)``. Lines whose
    two counts are both zero are dropped. With fewer than three arguments a
    usage message is written to stderr and the process exits with status 0;
    an unknown tail name exits with status 1.
    """
    lensysargv = len(sys.argv)
    if lensysargv < 4:
        sys.stderr.write("Print the result to screen\n")
        sys.stderr.write(
            'Using python %s filename total_first_col '
            'total_second_col two_tail[left_tail, right_tail] '
            'head[number of lines needs to skip, default 1] '
            'divide_a_value[used when your count larger '
            'than 4294967296]\n' % sys.argv[0])
        sys.exit(0)
    #-----------------------------------
    filename = sys.argv[1]
    total_first_col = int(sys.argv[2])
    total_second_col = int(sys.argv[3])

    valid_tails = ('two_tail', 'left_tail', 'right_tail')
    tail = sys.argv[4] if lensysargv > 4 else "two_tail"
    if tail not in valid_tails:
        # Previously a misspelt tail silently produced header-only output.
        sys.stderr.write("Unknown tail %r, expected one of: %s\n"
                         % (tail, ', '.join(valid_tails)))
        sys.exit(1)

    head = int(sys.argv[5]) if lensysargv > 5 else 1

    if lensysargv > 6:
        scale = int(sys.argv[6])
        # Floor division spelled out; identical to "/" on Python 2 integers.
        total_first_col = total_first_col // scale
        total_second_col = total_second_col // scale
    else:
        scale = 1
    #-----------------------------------------------
    with open(filename) as table:
        for line in table:
            line = line.rstrip()
            if head:
                sys.stdout.write("%s\t%s\n" % (line, 'p'))
                head -= 1
                continue
            #----------------------------------
            lineL = line.split()
            q = int(lineL[1]) // scale
            m = int(lineL[2]) // scale
            if q == 0 and m == 0:
                continue
            p = pvalue(q, m, total_first_col-q, total_second_col-m)
            # tail was validated above, so it names an attribute of p.
            sys.stdout.write("%s\t%s\n" % (line, getattr(p, tail)))
Ejemplo n.º 55
0
    def temporal_scan(
            baseline_filters, target_filters, analysis_start, analysis_end,
            keylist = None, cur_window = 7, ref_window = 91, lag = 0, constant_baseline = False,
            index = None, time_field = None):
        """
        Run a Fisher test for every day between analysis_start and analysis_end.

        Counts are requested from ``EventDetector.get_counts`` starting
        ``cur_window + lag + ref_window - 1`` days before ``analysis_start``
        and ending at ``analysis_end``. The configuration file
        'config/tad.cfg' is loaded first when ``EventDetector.cfg`` is unset.

        Returns a list with one entry per analysed day:
        ``[date, baseline_ref, target_ref, baseline_cur, target_cur,
        left_tail, two_tail, right_tail]``.

        Raises ``Exception`` when ``get_counts`` returns a string (used as
        the message) or an empty result.
        """
        if EventDetector.cfg is None:
            EventDetector.load_configuration('config/tad.cfg')

        history_days = cur_window + lag + ref_window - 1
        fetch_start = analysis_start - dt.timedelta(days = history_days)

        counts = EventDetector.get_counts(
                fetch_start, analysis_end, baseline_filters, target_filters,
                keylist, index, time_field, constant_baseline)
        if isinstance(counts, str):
            raise Exception(counts)
        if len(counts) == 0:
            raise Exception('ERROR: No results returned. Valid analysis range specified?')

        ref_kernel = np.ones(ref_window)
        cur_kernel = np.ones(cur_window)
        n_days = (analysis_end - analysis_start).days + 1

        # Correlating with an all-ones kernel gives sliding-window sums
        # (np.correlate's default mode). The reference windows use the first
        # n_days sums, the current windows the last n_days sums.
        baseline_ref = np.correlate(counts['baseline'], ref_kernel)[:n_days]
        target_ref = np.correlate(counts['target'], ref_kernel)[:n_days]
        baseline_cur = np.correlate(counts['baseline'], cur_kernel)[-n_days:]
        target_cur = np.correlate(counts['target'], cur_kernel)[-n_days:]

        results = []
        for offset in range(n_days):
            ref_b = baseline_ref[offset]
            ref_t = target_ref[offset]
            cur_b = baseline_cur[offset]
            cur_t = target_cur[offset]
            test = pvalue(ref_b, ref_t, cur_b, cur_t)
            day = analysis_start + dt.timedelta(days = offset)
            results.append([
                day, ref_b, ref_t, cur_b, cur_t,
                test.left_tail, test.two_tail, test.right_tail])

        return results
Ejemplo n.º 56
0
def test_discrete(a, b):
    """
    Find the category most enriched between ``a`` and ``b``.

    Every category found in ``flatten(a)`` is tested with a right-tail
    Fisher exact test on the counts that ``get_counts`` reports for ``a``
    and ``b``. The smallest p-value is multiplied by the number of tests
    (Bonferroni correction).

    Returns ``(corrected_p_value, category)``.
    """
    scored = []
    for category in set(flatten(a)):
        a_pos, a_neg = get_counts(a, category)
        b_pos, b_neg = get_counts(b, category)
        # Only enrichment is of interest, hence the right tail.
        enrichment = fisher.pvalue(a_pos, a_neg, b_pos, b_neg)
        scored.append((enrichment.right_tail, category))

    best_p, best_category = min(scored)
    return best_p * len(scored), best_category
Ejemplo n.º 57
0
 def _enrichment(self, genes1, genes2, part=''):
     '''
     Run an enrichment test between two gene sets for every annotation.

     For each entry of ``self.annot`` the genes of the annotation are
     intersected with ``genes1`` and ``genes2`` (both must be sets). The
     counts, the two-tailed Fisher p-value and the odds ratio are stored in
     ``self.gsea`` under the key ``annotation + '|' + part``.
     '''
     size1 = len(genes1)
     size2 = len(genes2)
     results = self.gsea
     suffix = '|' + part
     for ann, annot_genes in self.annot.iteritems():
         annotated = set(annot_genes)
         hits1 = len(annotated & genes1)
         hits2 = len(annotated & genes2)
         rest1 = size1 - hits1
         rest2 = size2 - hits2
         # The Fisher test dominates the running time of this loop.
         fisher_p = pvalue(hits1, rest1, hits2, rest2).two_tail
         odds = _get_odd_ratio(hits1, hits2, rest1, rest2)
         results[ann + suffix] = {
             'p1': hits1,
             'n1': rest1,
             'p2': hits2,
             'n2': rest2,
             'pv': fisher_p,
             'odd': odds,
         }
Ejemplo n.º 58
0
def get_vocab(text, score, max_feats=750, max_feats2=200):
    """
    Uses a fisher test to find words that are significant in that they separate
    high scoring essays from low scoring essays.
    text is a list of input essays.
    score is a list of scores, with score[n] corresponding to text[n]
    max_feats is the maximum number of features to consider in the first pass
    max_feats2 is the maximum number of features to consider in the second (final) pass
    Returns a list of words that constitute the significant vocabulary

    Essays scoring at or above the median (a median of 0 is replaced by 1)
    form the "good" group, all others the "bad" group. ``score`` itself is
    never modified.
    """
    vectorizer = CountVectorizer(ngram_range=(1,2), max_features=max_feats)
    term_counts = vectorizer.fit_transform(text)

    # Builtin int replaces numpy.int, which was only an alias for it and has
    # been removed from NumPy.
    set_score = numpy.asarray(score, dtype=int)
    med_score = numpy.median(set_score)
    if med_score == 0:
        med_score = 1
    # Derive the group masks as new arrays. Writing the 0/1 labels into
    # set_score in place could overwrite the caller's array and mislabelled
    # every essay whenever the median was negative.
    is_good = set_score >= med_score
    is_bad = ~is_good

    fish_vals = []
    for col_num in range(term_counts.shape[1]):
        loop_vec = term_counts.getcol(col_num).toarray()
        good_loop_vec = loop_vec[is_good]
        bad_loop_vec = loop_vec[is_bad]
        good_loop_present = len(good_loop_vec[good_loop_vec > 0])
        good_loop_missing = len(good_loop_vec[good_loop_vec == 0])
        bad_loop_present = len(bad_loop_vec[bad_loop_vec > 0])
        bad_loop_missing = len(bad_loop_vec[bad_loop_vec == 0])
        fish_val = pvalue(good_loop_present, bad_loop_present, good_loop_missing, bad_loop_missing).two_tail
        fish_vals.append(fish_val)

    # NOTE(review): the threshold is the p-value at index max_feats2 and the
    # comparison below is inclusive, so at least max_feats2 + 1 features are
    # kept (more on ties). Left unchanged; confirm whether that is intended.
    cutoff = 1
    if len(fish_vals) > max_feats2:
        cutoff = sorted(fish_vals)[max_feats2]

    feature_names = vectorizer.get_feature_names()
    vocab = [name for name, fish_val in zip(feature_names, fish_vals) if fish_val <= cutoff]

    return vocab
Ejemplo n.º 59
0
	def allelic_association_comb(self, phenotype_list,genotype_list):
		"""
		Test every allele against all other alleles combined.

		phenotype_list -- one phenotype per sample; the value 1 is counted as
			a case, anything else as a control
		genotype_list -- one genotype string per sample with the alleles
			separated by commas; an empty string is treated as two copies
			of the allele ""

		Returns a list with one dict per distinct allele holding "a_al" (the
		allele), "b_al" (always "All Others") and "p_val" (the two-tailed
		Fisher p-value rounded to four decimals).
		"""
		allele_list = []
		alleles = set()
		for g in genotype_list:
			if g == "":
				allele_list.append(["",""])
				alleles.add("")
			else:
				parts = g.split(",")
				allele_list.append(parts)
				alleles.update(parts)

		associations = []
		for a_al in alleles:
			a_al_case = 0
			a_al_control = 0
			b_al_case = 0
			b_al_control = 0
			for phenotype, genotype in zip(phenotype_list,allele_list):
				for allele in genotype:
					if allele == a_al:
						if phenotype == 1:
							a_al_case += 1
						else:
							a_al_control += 1
					else:
						# Only alleles different from a_al belong to the
						# "All Others" column. The previous test compared
						# the whole genotype list with the allele string,
						# which is always unequal, so every allele was
						# counted here.
						if phenotype == 1:
							b_al_case += 1
						else:
							b_al_control += 1

			p_val = pvalue(a_al_case, a_al_control,b_al_case, b_al_control)

			associations.append({
				"a_al" : a_al,
				"b_al" : "All Others",
				"p_val" : round(p_val.two_tail,4)
			})
		return associations
Ejemplo n.º 60
0
 def get_vocab(self, input_text, input_scores):
     """
     Return the feature names ranked by how well they separate the scores.

     ``input_text`` is transformed with ``self.vectorizer1``. Texts whose
     score is above the median form the "good" group, the others the "bad"
     group. For every feature a two-tailed Fisher test compares presence
     and absence between both groups.

     Returns the names of the (at most) 2000 features with the smallest
     p-values, in ascending order of p-value.
     """
     train_mat = self.vectorizer1.transform(input_text)
     input_score_med = np.median(input_scores)
     new_scores = np.asarray(
         [0 if i<=input_score_med else 1 for i in input_scores], dtype=int)
     # The row indices of both groups do not depend on the column, so they
     # are computed once instead of once per feature.
     good_rows = np.flatnonzero(new_scores == 1)
     bad_rows = np.flatnonzero(new_scores == 0)

     pvalues = []
     for i in range(train_mat.shape[1]):
         lcol = np.asarray(train_mat.getcol(i).todense().transpose())[0]
         good_lcol = lcol[good_rows]
         bad_lcol = lcol[bad_rows]
         good_lcol_present = len(good_lcol[good_lcol > 0])
         good_lcol_missing = len(good_lcol[good_lcol == 0])
         bad_lcol_present = len(bad_lcol[bad_lcol > 0])
         bad_lcol_missing = len(bad_lcol[bad_lcol == 0])
         pval = pvalue(good_lcol_present, bad_lcol_present, good_lcol_missing, bad_lcol_missing)
         pvalues.append(pval.two_tail)

     # A stable argsort replaces the DataFrame.sort() round trip, which no
     # longer exists in current pandas; ties keep their column order.
     best_cols = np.argsort(pvalues, kind='mergesort')[:2000]
     feature_names = self.vectorizer1.get_feature_names()
     vocab = [feature_names[int(col)] for col in best_cols]
     return vocab