Example #1
0
    def test_arith_flex_series(self):
        df = self.simple

        row = df.xs('a')
        col = df['two']
        # after arithmetic refactor, add truediv here
        ops = ['add', 'sub', 'mul', 'mod']
        for op in ops:
            f = getattr(df, op)
            op = getattr(operator, op)
            assert_frame_equal(f(row), op(df, row))
            assert_frame_equal(f(col, axis=0), op(df.T, col).T)

        # special case for some reason
        assert_frame_equal(df.add(row, axis=None), df + row)

        # cases which will be refactored after big arithmetic refactor
        assert_frame_equal(df.div(row), df / row)
        assert_frame_equal(df.div(col, axis=0), (df.T / col).T)

        # broadcasting issue in GH7325
        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='int64')
        expected = DataFrame([[nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = df.div(df[0], axis='index')
        assert_frame_equal(result, expected)

        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='float64')
        expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = df.div(df[0], axis='index')
        assert_frame_equal(result, expected)
Example #2
0
def propNoteGraph(data_test,b_u,b_i,mu,L,R):
    """Tabulate, per rating 1..5, the hit rate of the model and of the baseline.

    For every row ``r`` of ``data_test`` three ratings are formed:

    * baseline (``NAIF``): ``round(mu + b_u[user] + b_i[item])``;
    * model (``ALGO``): ``round(baseline_mean + dot(L[user], R[item]))``,
      clipped to the range 1..5;
    * reference: ``round(baseline_mean + data_test[r, 2])``.

    Parameters
    ----------
    data_test : 2-D array
        ``data_test[r, 0]`` indexes ``b_u`` and the rows of ``L``,
        ``data_test[r, 1]`` indexes ``b_i`` and the rows of ``R``, and
        ``data_test[r, 2]`` is added to the baseline mean to obtain the
        reference rating (presumably the residual rating -- confirm with the
        caller).
    b_u, b_i : indexable
        Bias terms looked up with the first / second column of ``data_test``.
    mu : float
        Constant added to both bias terms.
    L, R : 2-D arrays
        Factor matrices; one row of each is combined with ``np.dot``.

    Returns
    -------
    DataFrame
        Indexed by rating 1..5 with columns ``'ALGO'`` and ``'NAIF'``, each
        holding ``BON / (BON + MAUVAIS)`` for that rating (NaN where nothing
        was counted).

    Raises
    ------
    KeyError
        If a reference rating falls outside 1..5 (it is not clipped).
    """
    ratings = np.arange(1, 6)
    outcome_cols = ['BON', 'MAUVAIS']  # French: good / bad predictions
    notes = DataFrame(np.zeros([5, 2]), index=ratings, columns=outcome_cols)
    notes_naif = DataFrame(np.zeros([5, 2]), index=ratings, columns=outcome_cols)

    for r in range(data_test.shape[0]):
        user, item = data_test[r, 0], data_test[r, 1]
        mean = mu + b_u[user] + b_i[item]

        # int(...) makes the rounded values usable as integer index labels
        # regardless of whether round() hands back an int or a numpy float.
        r_pred = int(round(mean + np.dot(L[user, :], R[item, :])))
        r_pred = max(1, min(5, r_pred))
        r_true = int(round(mean + data_test[r, 2]))
        r_naif = int(round(mean))

        # .at[row, col] updates the frame itself; the former chained form
        # (frame.COL[row] += 1) may modify a temporary copy instead.
        if r_naif == r_true:
            notes_naif.at[r_true, 'BON'] += 1
        else:
            notes_naif.at[r_true, 'MAUVAIS'] += 1

        if r_pred == r_true:
            notes.at[r_true, 'BON'] += 1
        else:
            # NOTE(review): misses are filed under the *predicted* rating
            # here but under the *true* rating for the baseline above, so the
            # two proportions use different denominators. Kept as-is; confirm
            # whether r_true was intended.
            notes.at[r_pred, 'MAUVAIS'] += 1

    notes_naif_prop = notes_naif.div(notes_naif.sum(axis=1), axis=0)
    notes_prop = notes.div(notes.sum(axis=1), axis=0)

    notes_naif_VS_algo = pd.concat([notes_prop.BON, notes_naif_prop.BON], axis=1)
    notes_naif_VS_algo.columns = ['ALGO', 'NAIF']
    return notes_naif_VS_algo
Example #3
0
def hmm_build(alphabet, aln, threshold, sigma):
    '''Build a profile HMM with pseudocounts from a multiple alignment.

    alphabet  -- emitted symbols (columns of the emission matrix)
    aln       -- aligned sequences, with '-' marking a gap
    threshold -- a column whose gap fraction is below this value is a match
                 column; every other column is an insertion column
    sigma     -- pseudocount added to every structurally allowed transition
                 and to every emission of the insert and match states

    Returns (transitions, emissions) as DataFrames indexed by the states
    S, I0, M1, D1, I1, ..., Mk, Dk, Ik, E, with rows scaled to sum to one
    (rows without any counts stay at zero) and values rounded to 3 decimals.
    '''

    def _normalise_rows(frame):
        # the tiny offset keeps all-zero rows from dividing by zero
        return frame.div(frame.sum(axis=1) + 1e-10, axis=0).round(3)

    columns = list(zip(*aln))
    n_seqs = len(aln)

    # one flag per alignment column: True for match columns
    is_match = [col.count('-') / n_seqs < threshold for col in columns]
    k = is_match.count(True)   # number of match states

    # state names
    states = ['S', 'I0']
    for idx in range(1, k + 1):
        states += ['M%d' % idx, 'D%d' % idx, 'I%d' % idx]
    states.append('E')

    # count matrices
    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states)

    for seq in aln:  # thread every sequence through the model
        position = 0
        previous = 'S'
        for symbol, in_match_col in zip(seq, is_match):
            gap = symbol == '-'
            if in_match_col:
                position += 1
                state = ('D' if gap else 'M') + str(position)
            elif gap:
                continue   # gap in an insertion column: no state is visited
            else:
                state = 'I' + str(position)

            if not gap:
                emissions.loc[state, symbol] += 1
            transitions.loc[previous, state] += 1
            previous = state

        transitions.loc[previous, 'E'] += 1

    # scale rows to [0, 1]
    transitions = _normalise_rows(transitions)
    emissions = _normalise_rows(emissions)

    # add pseudocounts
    transitions.iloc[:2, 1:4] += sigma        # S, I0 -> I0, M1, D1
    transitions.iloc[-4:-1, -2:] += sigma     # Mk, Dk, Ik -> Ik, E
    for block in range(1, k):                 # Mi, Di, Ii -> Ii, Mi+1, Di+1
        first = 3 * block
        transitions.iloc[first - 1:first + 2, first + 1:first + 4] += sigma
    for block in range(k):                    # I(i), M(i+1) emit every symbol
        emissions.iloc[3 * block + 1:3 * block + 3, :] += sigma
    emissions.iloc[-2, :] += sigma            # Ik

    # scale again
    transitions = _normalise_rows(transitions)
    emissions = _normalise_rows(emissions)

    return transitions, emissions
def to_molecular(df: pd.DataFrame, renorm=True):
    """
    Convert mass quantities to molar quantities of the same order.
    E.g.:
    mass% --> mol%
    mass-ppm --> mol-ppm

    Every column of `df` is divided by the mass that `pt.formula` reports
    for the column label. With `renorm` set, the result is additionally
    passed through `renormalise` before being returned.
    """
    molar = df.div([pt.formula(component).mass for component in df.columns])
    return renormalise(molar) if renorm else molar
Example #5
0
def hmm_build(alphabet, aln, threshold):
    '''Build a profile HMM (no pseudocounts) from a multiple alignment.

    alphabet  -- emitted symbols (columns of the emission matrix)
    aln       -- aligned sequences, with '-' marking a gap
    threshold -- a column whose gap fraction is below this value is a match
                 column; every other column is an insertion column

    Returns (transitions, emissions) as DataFrames indexed by the states
    S, I0, M1, D1, I1, ..., Mk, Dk, Ik, E. Each row is divided by its sum
    (rows without counts stay at zero) and rounded to 3 decimals.
    '''
    n_seqs = len(aln)

    # gap fraction of every alignment column decides match vs. insertion
    gap_fractions = [col.count('-') / n_seqs for col in zip(*aln)]
    match_flags = [fraction < threshold for fraction in gap_fractions]
    k = match_flags.count(True)   # number of match states

    # state names: S, I0, then (Mi, Di, Ii) for i = 1..k, then E
    states = ['S', 'I0']
    for idx in range(1, k + 1):
        states.extend(prefix + str(idx) for prefix in 'MDI')
    states.append('E')

    # count matrices
    transitions = DataFrame(data=0.0, columns=states, index=states)
    emissions = DataFrame(data=0.0, columns=alphabet, index=states)

    for seq in aln:  # follow each sequence's path through the states
        match_no = 0
        prev_state = 'S'
        for symbol, is_match_col in zip(seq, match_flags):
            emits = symbol != '-'
            if is_match_col:
                match_no += 1
                kind = 'M' if emits else 'D'
            elif emits:
                kind = 'I'
            else:
                continue   # gap inside an insertion column: nothing happens

            state = kind + str(match_no)
            if emits:
                emissions.loc[state, symbol] += 1
            transitions.loc[prev_state, state] += 1
            prev_state = state

        transitions.loc[prev_state, 'E'] += 1

    # normalize rows; the small offset avoids dividing all-zero rows by zero
    transition_totals = transitions.sum(axis=1) + 1e-10
    emission_totals = emissions.sum(axis=1) + 1e-10
    transitions = transitions.div(transition_totals, axis=0).round(3)
    emissions = emissions.div(emission_totals, axis=0).round(3)

    return transitions, emissions
def generate_probability_vector_result(output_path):
    """Write row-normalised point-to-cluster distances to ``probability.csv``.

    Reads ``clusters.csv`` and ``points.csv`` (both without a header row)
    from ``output_path``, computes the Euclidean distance from every point
    to every cluster row, divides each point's distances by their sum and
    writes the result to ``output_path + '/probability.csv'``.

    The first column of ``clusters.csv`` is used as the row index and is
    excluded from the distance computation.

    NOTE(review): the written values are normalised *distances*, so a larger
    value means a farther cluster -- confirm that this is the intended
    meaning of "probability" for the consumers of the file.
    """
    cluster_frame = pd.read_csv(output_path + '/clusters.csv', header=None)
    # DataFrame.ix was removed in pandas 1.0. With header=None the column
    # labels are 0..n-1, so positional .iloc selects exactly the same data.
    cluster_frame = cluster_frame.set_index(cluster_frame.iloc[:, 0]).iloc[:, 1:]
    cluster_array = cluster_frame.values

    points_frame = pd.read_csv(output_path + '/points.csv', header=None)
    points_array = points_frame.values

    # euclidean_distances yields (clusters x points); transpose so that each
    # row describes one point
    distance_matrix = pw.euclidean_distances(cluster_array, points_array)
    distance_matrix = distance_matrix.T
    distance_frame = DataFrame(distance_matrix)

    # scale every row so that it sums to one
    distance_frame = distance_frame.div(distance_frame.sum(axis=1), axis=0)
    distance_frame.to_csv(output_path + '/probability.csv')
Example #7
0
matrices of transition and emission probabilities.
'''

from pandas import DataFrame
from io import StringIO

# Input layout: emitted string, alphabet, hidden path and state names,
# separated by lines of dashes. The context manager closes the file handle
# as soon as the text has been read.
with open('rosalind_ba10h.txt') as infile:
    f = infile.read().rstrip().split('--------\n')
x = list(f[0].rstrip())
alphabet = f[1].rstrip().split()
path = list(f[2].rstrip())
states = f[3].rstrip().split()

transitions = DataFrame(data=0.0, index=states, columns=states)
emissions   = DataFrame(data=0.0, index=states, columns=alphabet)

# count every observed state -> state step along the hidden path
for t in zip(path[:-1], path[1:]):
    transitions.loc[t] += 1

# count every (state, emitted symbol) pair
for a in zip(path, x):
    emissions.loc[a] += 1

# turn counts into row-wise frequencies; the small offset keeps rows without
# any counts at zero instead of dividing by zero
transitions = transitions.div(transitions.sum(1) + 1e-10, axis=0).round(3)
emissions = emissions.div(emissions.sum(1) + 1e-10, axis=0).round(3)

# render both matrices as tab-separated text, separated by a dashed line
f = StringIO()
transitions.to_csv(f, sep='\t', float_format='%g')
f.write('--------\n')
emissions.to_csv(f, sep='\t', float_format='%g')

# the context manager flushes and closes the output file deterministically
with open('rosalind_ba10h_sub.txt', 'wt') as outfile:
    outfile.write(f.getvalue().rstrip())
Example #8
0
                   columns=list('abc'))
frame2 = DataFrame(np.arange(1,10).reshape(3,3),
                   columns=list('abc'))
print(frame1)
print(frame2)

# frame addition
add = frame1.add(frame2)
print(add)

# frame subtraction
sub = frame2.sub(frame1)
print(sub)

# frame division: div = frame2 / frame1
div = frame2.div(frame1)
print(div) # inf: when the denominator is 0

# frame multiplication
mul = frame1.mul(frame2)
print(mul)

# row-wise / column-wise sum, mean, maximum and minimum

sum1 = mul.sum(axis = 1) # per row
sum2 = mul.sum(axis = 0) # per column
print('행 단위 합계:\n',sum1)
print('열 단위 합계:\n',sum2)


avg1 = mul.mean(axis = 1) # per-row mean