Example #1
def do_svm(rfam_id):
    rfam = Rfam(use_website = True)
    rnas, organisms, consensus_2d = rfam.get_entry(rfam_id = 'RF%05u'%rfam_id)
    #a matrix for each stem-loop
    stem_loop_descriptions = []
    for i, rna in enumerate(rnas):
        #print to_bn(consensus_2d, len(rna))
        #print rna.sequence
        ss = base_pairs_to_secondary_structure(rna, consensus_2d)
        ss.find_junctions()
        ss.find_stem_loops()

        if i == 0:
            print ss.stem_loops    

        for index, stem_loop in enumerate(ss.stem_loops):
            stem_loop_description = None
            if index >= len(stem_loop_descriptions):
                stem_loop_description = {}
                stem_loop_descriptions.append(stem_loop_description)
            else:
                stem_loop_description =  stem_loop_descriptions[index]       
            #print stem_loop
            for helix in stem_loop['helices']:
                location = helix['location']
                #we extract the sequence for each strand and we remove the gaps
                strand_1 = rna.sequence[location[0][0]-1:location[0][1]].replace('-','')
                strand_2 = rna.sequence[location[1][0]-1:location[1][1]].replace('-','')
                #if len(strand_1) != len(strand_2):
                #    print "not fully conserved helix"
                #    print rna.sequence[location[0][0]-1:location[0][1]], rna.sequence[location[1][0]-1:location[1][1]]
                l = stem_loop_description.get(helix['name']+'_strand_1', [])
                l.append(len(strand_1))
                stem_loop_description[helix['name']+'_strand_1'] = l
                l = stem_loop_description.get(helix['name']+'_strand_2', [])
                l.append(len(strand_2))
                stem_loop_description[helix['name']+'_strand_2'] = l
            for inner_loop in stem_loop['inner_loops']:
                for single_strand in inner_loop['single_strands']:
                    #we extract the sequence for this single-strand and we remove the gaps
                    seq = rna.sequence[single_strand['location'][0]-1:single_strand['location'][1]].replace('-','')
                    #print single_strand['name']
                    l = stem_loop_description.get("inner_loop_%s"%single_strand['name'], [])
                    l.append(len(seq))
                    stem_loop_description["inner_loop_%s"%single_strand['name']] = l

            apical_loop = stem_loop['apical_loop']['single_strands'][0]
            seq = rna.sequence[apical_loop['location'][0]-1:apical_loop['location'][1]].replace('-','')
            l = stem_loop_description.get("apical_loop_%s"%apical_loop['name'], [])
            l.append(len(seq))
            stem_loop_description["apical_loop_%s"%apical_loop['name']] = l
    
    for stem_loop_description in stem_loop_descriptions:
        df = DataFrame(stem_loop_description)
        columns = df.columns
        print columns
        print df.as_matrix(columns)
Example #2
def Impute(data_as_DataFrame, kNNGraph, Method = IgnoringNan.mean, target = None ):
    """Impute(data_as_DataFrame,Graph) -> pandas DataFrame with nan's imputed
    
    Imputation is via Graph Neighborhoods of kNNGraph
    Method is applied to the neighborhood array of values for each
    vertex that has a NaN
    
    Note: data_as_DataFrame can also be a numpy array 
    """
    
    try:
        data_as_DataFrame.columns
        data_as_DataFrame.index
    
        DFrame = data_as_DataFrame.copy()
    except:
        DFrame = DataFrame( data_as_DataFrame )
        
    cols = DFrame.columns
    inds = DFrame.index
    Data = DFrame.as_matrix()
    
    m,n = DFrame.shape
    for i in range(m):
        nbrs = kNNGraph.neighbors(i)
        for j in range(n):
            if( isnan( Data[i,j] ) ):
                 DFrame.set_value( inds[i],cols[j], int( Method( array( [Data[nbr,j] for nbr in nbrs] ) ) ) )
    return DFrame
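A minimal usage sketch for Impute, assuming the function above plus a hypothetical graph object that only provides the neighbors(i) method it relies on (the real kNNGraph class is not shown here); np.nanmean stands in for IgnoringNan.mean, and an old pandas with as_matrix is assumed.

import numpy as np
from pandas import DataFrame

class ToyGraph:
    # hypothetical stand-in: every row's neighbors are all the other rows
    def __init__(self, n):
        self.n = n
    def neighbors(self, i):
        return [j for j in range(self.n) if j != i]

raw = DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})
filled = Impute(raw, ToyGraph(len(raw)), Method=np.nanmean)
print(filled)  # each NaN is replaced by the (int-cast) mean of its neighbors' values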
Example #3
 def transform(self):
     subjectlist = map(lambda x:int(x[1:3]),self.subjsess_list)
     feedbacksesslist = map(lambda x:int(x[8:10]),self.subjsess_list)
     X = DataFrame()
     xsubj = []
     xsess = []
     xfeedbacknum = []
     xstartpos = []
     # xstartpostime = []    # time isn't really improving accuracy
     for findex in range(len(self.flist)):
         x_df = read_csv(self.flist[findex])
         fb_indices = x_df[x_df["FeedBackEvent"] == 1].index.tolist()
         # starttime_indices = x_df["Time"].iloc[fb_indices]
         del x_df
         fb_nums = range(len(fb_indices))
         subj_nums = [subjectlist[findex]]*len(fb_indices)
         sess_nums = [feedbacksesslist[findex]]*len(fb_indices)
         xsubj.extend(subj_nums)
         xsess.extend(sess_nums)
         xfeedbacknum.extend(fb_nums)
         xstartpos.extend(fb_indices)
         # xstartpostime.extend(starttime_indices)
     X["subject"] = xsubj
     X["sess"] = xsess
     X["feedback_num"] = xfeedbacknum
     X["start_pos"] = xstartpos
     # X["start_pos_time"] = xstartpostime
     return X.as_matrix()
Example #4
    def __init__(self, notation=None, resolution=0.01, parameters=None):

        self.parameters = {"beta_r": 0.5,
                           "beta_e": 1e-5,
                           "c": 1.0,
                           "alpha": 0.05,
                           "phi": 50.0,
                           "gamma": 0.2,
                           "r0": 0.05,
                           "A": 5.0,
                           "mu_poisson": 3.0,
                           "mu_wald": 0.68,
                           "lambda_wald": 0.93}
        if parameters is not None:
            self.parameters.update(parameters)

        # TODO(David): Completely change this, probably, because it sucks
        self.resolution = resolution
        self.original_notation = notation
        self.notation = notation  # in case we need to convert point events
        self.array_notation = notation  # numpy array for jit
        self.parse_notation()
        self.convert_notation()

        # TODO(David): Fix for multiple cycle types
        df_cycle_types = DataFrame(self.array_notation[:, :, 0], columns=["event", "id"])
        df_cycle_types = df_cycle_types[(df_cycle_types.event==STIMULUS_ON) | (df_cycle_types.event==STIMULUS_OFF)].drop_duplicates()
        df_cycle_types["row"] = arange(0, df_cycle_types.shape[0])

        self.time_marker_info = df_cycle_types.as_matrix()
        self.n_time_markers = self.time_marker_info.shape[0]
        self.time_marker_row = 2
Example #5
def transform_target_vector(df: pd.DataFrame, binary=False) -> np.array:
    """Only used on data with known labels otherwise it will fail"""
    binarize = lambda x: 1 if x > 0 else 0
    detect = lambda x: x if np.isnan(x) else binarize(x)
    if binary:
        df.returnQuantity = df.returnQuantity.apply(detect)
    return np.squeeze(df.as_matrix(columns=['returnQuantity'])).astype(np.float32)
Example #6
    def test_as_matrix(self):
        frame = self.frame
        mat = frame.as_matrix()

        frameCols = frame.columns
        for i, row in enumerate(mat):
            for j, value in enumerate(row):
                col = frameCols[j]
                if np.isnan(value):
                    assert np.isnan(frame[col][i])
                else:
                    assert value == frame[col][i]

        # mixed type
        mat = self.mixed_frame.as_matrix(['foo', 'A'])
        assert mat[0, 0] == 'bar'

        df = DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]})
        mat = df.as_matrix()
        assert mat[0, 0] == 1j

        # single block corner case
        mat = self.frame.as_matrix(['A', 'B'])
        expected = self.frame.reindex(columns=['A', 'B']).values
        assert_almost_equal(mat, expected)
Example #7
def gonzales(data , k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choosing a random point as the first center

    #center0 =     points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 =     points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #looping k-1 time to have k centers
    for k_cycle in range(1,k+1):
        # variables to track the next center: the point lying farthest from its closest existing center
        max_distance = 0 
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            # variables to track the closest center for this point
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0 ,1]) , p.as_matrix(columns=[0 ,1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index   ])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0 ,1])
Example #8
def standard_som():
   path = "data/dorothea_clean.csv"
   #path = "research/data/housing.data"
   table = pd.read_csv(path, header=None)
   table_new = table.dropna(axis=0)
   data = DataFrame.as_matrix(table_new)

   my_som = SOM(20, 20, 5000)
   lattice = my_som.calc(data)
   u_matrix = create_u_matrix(lattice)
   plt.matshow(u_matrix.T, fignum=100, cmap='viridis')
   plt.show()
Example #9
def construct_data_matrix(input_file_name, output_filename_python, output_filename_matlab, has_header=True):
    data = dict()  # used to store the data matrix Z
    data_dates = dict()  # used to store the dates used in the data matrix
    # response_vector = dict()  # response vector y

    ticker = None
    with open(input_file_name, 'r') as infile:
        if has_header:
            infile.readline()

        for l in csv.reader(infile):

            cur_date = datetime.strptime(l[PRICE_CHANGE_FIELDS.Date.zvalue], PRICE_CHANGE_DATE_FORMAT)

            # Filter everything that's not from 2014 H2
            if cur_date.year != 2014 or cur_date.month < 6:
                continue

            # Filter lines without an analyst target price
            try:
                Pi = float(l[PRICE_CHANGE_FIELDS.New_Target.zvalue])  # Analyst prediction = Pi / Peoq
            except ValueError:
                continue

            Peoq = float(l[PRICE_CHANGE_FIELDS.Current_Price.zvalue])
            Pf = float(l[PRICE_CHANGE_FIELDS.Price_in_a_year.zvalue])  # true values y = Pf / Peoq
            analyst = l[PRICE_CHANGE_FIELDS.Firm.zvalue]

            if ticker != l[PRICE_CHANGE_FIELDS.Ticker.zvalue]:
                ticker = l[PRICE_CHANGE_FIELDS.Ticker.zvalue]
                data[ticker] = dict()
                data_dates[ticker] = dict()
                data[ticker][RESPONSE_LABEL] = Pf /Peoq

            # Don't assume data is ordered latest first
            if analyst not in data[ticker] or data_dates[ticker][analyst] < cur_date:
                data[ticker][analyst] = Pi / Peoq
                data_dates[ticker][analyst] = cur_date

            # if ticker not in response_vector:
            #     response_vector[ticker] = Pf / Peoq

    pickle.dump(data, open(output_filename_python, 'wb'))
    # Z = pickle.load(open('data/data_matrix.pkl'))

    df = DataFrame(data).T.fillna(0)
    column_labels = df.columns.tolist()  # Firms
    row_labels = df.T.columns.tolist()  # Tickers
    # df['Citigroup Inc.']
    # df.T['AAPL']

    savedict = {"data": df.as_matrix(), "column_labels": column_labels, "row_labels": row_labels}
    sio.savemat(output_filename_matlab, savedict)
Example #10
def find_top_k_chars(recording, model, k=3):
    x_test, labels = preprocess_data(recording)
    # Convert this test data to a DataFrame
    test_dat = DataFrame(x_test)

    # Center and scale the data
    center_scale(test_dat)

    # This is a dictionary with a unique id as key and labels as value
    id_to_char = dict(zip(range(len(model.classes_[0])), model.classes_[0]))

    # Predict using the rf model and return the predicted characters
    test_prediction = model.predict_proba(test_dat.as_matrix())
    return dict(zip(labels, top_k_prediction(test_prediction, id_to_char)))
Example #11
 def __init__(self):
     # Load data
     self.public_sector = read_table(
         "data/dhmosia_kthria/dhmosia_kthria_attikis.csv",
         index_col="ktirio_ypiresia", sep=';')
     self.stops = read_csv("data/oasa/stops.txt")
     self.routes = read_csv("data/oasa/routes.txt",
                            index_col="route_short_name")
     self.stop_times = read_csv("data/oasa/stop_times.txt",
                                index_col="stop_id")
     # Get only the coordinates of the bus stops to make the
     # "training" data
     self.coordinates = DataFrame.as_matrix(
         self.stops, columns=["stop_lon", "stop_lat"])
     self.nbrs = NearestNeighbors().fit(self.coordinates)
Example #12
def encode_features(df: pd.DataFrame, ft: str) -> csr_matrix:
    """Encode categorical features"""
    if ft not in set(encode_label + encode_int):
        return csr_matrix(df.as_matrix(columns=[ft]))

    label_enc = LabelEncoder()
    one_hot_enc = OneHotEncoder(sparse=True)

    if ft in encode_label:
        V = df[ft].as_matrix().T
        V_lab = label_enc.fit_transform(V).reshape(-1, 1)
        V_enc = one_hot_enc.fit_transform(V_lab)
        return V_enc

    if ft in encode_int:
        V = df[ft].as_matrix().reshape(-1, 1)
        V_enc = one_hot_enc.fit_transform(V)
        return V_enc
Example #13
def kmeans_scikit(data , k):
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    mat = points_list.as_matrix()
    print(mat)
    # Using sklearn
    km = sklearn.cluster.KMeans(n_clusters=k)
    km.fit(mat)
    # Get cluster assignment labels
    labels = km.labels_
    print(labels)
    print('==============')
    print(km.predict([[20 ,-15]]))
    # Format results as a DataFrame
    #results = pd.DataFrame([points_list.index,labels]).T
    points_list['labels'] = labels
    points_list.plot(kind='scatter', x=0, y=1    , c='labels'  )
    plt.show()
    print(points_list)
Example #14
def __kalman(df: pd.DataFrame) -> pd.DataFrame:
    """
    :return: Kalman smooth these columns: ['masl', 'lat', 'lon', 'dmasl', 'dlat', 'dlon']
    """
    columns = ['masl', 'lat', 'lon', 'dmasl', 'dlat', 'dlon']
    trans_mat = np.array([
        [1, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
    ])
    obs_mat = np.array([
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
    ])
    kf = pykalman.KalmanFilter(
        transition_matrices=trans_mat,
        transition_covariance=1.0e-4 * np.eye(6),
        observation_matrices=obs_mat,
        observation_covariance=1.0e-1 * np.eye(6),
        initial_state_mean=[df.masl[0], df.lat[0], df.lon[0], df.dmasl[0], df.dlat[0], df.dlon[0]],
        initial_state_covariance=1.0e-3 * np.eye(6),
    )
    x = df.as_matrix(columns=columns)

    (state_means, state_covs) = kf.em(x, n_iter=6).smooth(x)

    df['k_masl'] = state_means[:, 0]
    df['k_lat'] = state_means[:, 1]
    df['k_lon'] = state_means[:, 2]
    df['k_dmasl'] = state_means[:, 3]
    df['k_dlat'] = state_means[:, 4]
    df['k_dlong'] = state_means[:, 5]
    return df
Example #15
from sklearn.naive_bayes import MultinomialNB
#from sklearn.naive_bayes import MutinomialNB
data=DataFrame({'petal1':[],'petal2':[],'sepal1':[],'sepal2':[]})
tar=DataFrame({'target':[]})

md=genfromtxt('/home/shubh/Downloads/iris.data',delimiter=',',dtype=None)
i=0
for row in md:
    d={'petal1':[md[i][0]],'petal2':[md[i][1]],'sepal1':[md[i][2]],'sepal2':[md[i][3]]}
    t={'target':[md[i][4]]}
    d2=DataFrame(d)
    t2=DataFrame(t)
    data=data.append(d2)
    tar=tar.append(t2)
    i=i+1

#print data.head()
target=tar['target'].values
feed=data.as_matrix(['petal1','petal2','sepal1','sepal2'])
#print feed

cls=MultinomialNB()
cls.fit(feed,target)
a=[4.8,3.0,1.4,0.3]
ts=cls.predict(a)
print ts



#print target
Example #16



"""
Main
"""
data = sensorData.join(campusData, how = 'inner', rsuffix = 'Time_Stamp')
# Change attributes order
data = DataFrame(data, columns = ['Temperature', 'Humidity', 'WindSpeed', 
	'SolarRadiation', 'Temp1', 'Hum1', 'Temp2', 'Hum2'])
# Drops any Nan Values after removal of outliers (Precautionary)
data = data.dropna(axis = 0, how = 'any', 
						thresh = None, subset = None, inplace = False)

trainData = data.as_matrix(columns = ['Temperature', 'Humidity', 
										'WindSpeed', 'SolarRadiation'])
resultData = data.as_matrix(columns = ['Temp1'])
maxValue = max(resultData)
minValue = min(resultData)
resultData = (resultData - minValue) / (maxValue - minValue)
(w0To1, w1To2) = learnProcess(trainData, resultData)









Example #17
DoubleDensity1 = APM.randint(8,20, 2*n1)-0.5
DoubleDensity1[n1:] = APM.randint( 8,15,n1 )
DDy = [ APM.randint(0,9)+ APM.rand() for i in range(2*n1)]

Class0 = array( [ APM.randint(4,13, n0),    randint(4,9,n0)-0.5, [0 for i in range(n0) ] ] )
Class1 = array( [ DoubleDensity1, DDy, [1 for i in range(2*n1) ] ] )
ClassU = array( [ APM.randint(8,12,nu), APM.randint(3,6,nu) + APM.randint(0,5,nu)*0.2 , [nan for i in range(nu) ]] )

Exdat = zeros( (3,n0+2*n1+nu)  )

Exdat[:,:n0] = Class0
Exdat[:,n0:(n0+2*n1)] = Class1
Exdat[:,(n0+2*n1):] = ClassU

Example3 = DataFrame( Exdat.T, columns = ['x','y','class'] )
Example3Data = Example3.as_matrix()

n0 = 50
n1 = 200
nu =  5

Class0 = array( [ [ (APM.rand()+ 7)*cos(2*pi*i/n0) for i in range(n0)], [ (APM.rand()+ 7)*sin(2*pi*i/n0) for i in range(n0)], [0 for i in range(n0) ] ] )
Class1 = array( [ [ (4*APM.rand()+ 2)*cos(2*pi*i/n1) for i in range(n1)], [ (4*APM.rand()+ 2)*sin(2*pi*i/n1) for i in range(n1)], [1 for i in range(n1) ] ] )
ClassU = array( [ [ (6.1+i/nu)*cos(2*pi*i/nu) for i in range(nu)], [ (6.1+i/nu)*sin(2*pi*i/nu) for i in range(nu)], [nan for i in range(nu) ]] )

Exdat = zeros( (3,n0+n1+nu)  )

Exdat[:,:n0] = Class0
Exdat[:,n0:(n0+n1)] = Class1
Exdat[:,(n0+n1):] = ClassU
Example #18
ndata = DataFrame()
for col in data.columns:
    encoder.fit(uniques[col])
    ndata[col] = encoder.transform(data[col])

# Simple Exploratory Analysis
# Bar plots
for col in data.columns:
    agg = data.groupby([col, 'class']).count()
    sns.set_style("whitegrid")
    ax = sns.barplot(x=agg.ix[:, [0]].index.values, y=agg.ix[:, [0]].values.T[0])
    plt.title("Distribution of " + col)
    plt.show()

# PCA
pca_cal(standardize_dataset(ndata.as_matrix()), data['class'], data.columns, title="PCA with normalization")

# Separate dataset into train and test sets
kf = KFold(n=len(ndata), n_folds=10, shuffle=True)
train, test = kf.get_indices()
s = Score()

total_train_error = []
total_test_error = []

for k in range(1, 200, 3):
    train_error = []
    test_error = []
    for i in range(10):
        print "round %d %d" % (k, i)
        train_data = ndata.ix[train[i]]
Example #19
data['min'] = D.min(axis=1)
data['max'] = D.max(axis=1)
data['median'] = D.median(axis=1)
data['sum'] = D.sum(axis=1)
data['skewness'] = D.skew(axis=1)
#data['mode'] = D.astype(int).mode(axis=1)
data['mad'] = D.mad(axis=1)
data['var'] = D.var(axis=1)
data['sem'] = D.sem(axis=1)
data['kurt'] = D.kurt(axis=1)
data['quantile_25'] = D.quantile(0.25,axis=1)
data['quantile_50'] = D.quantile(0.50,axis=1)
data['quantile_75'] = D.quantile(0.75,axis=1)

#data = data/data.max(axis=0) #normalize wrt max
df = data.as_matrix()

# Build the pairwise L1 distance matrix between feature rows.
dist = np.zeros([df.shape[0],df.shape[0]])
for i in range(df.shape[0]):
    for j in range(df.shape[0]):
        dist[i,j] = np.sum(np.abs(df[i]-df[j]))

# Compute and plot first dendrogram.
fig = plt.figure(figsize=(20,20))
ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
Y = sch.linkage(dist, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])
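As a side note on Example #19: the double loop above computes a pairwise L1 (cityblock) distance matrix; under the same assumptions (df already a 2-D numpy array of features), scipy's pdist/squareform would give the same matrix in two lines.

from scipy.spatial.distance import pdist, squareform

# condensed pairwise cityblock (L1) distances, expanded to a full square matrix
dist = squareform(pdist(df, metric='cityblock'))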
Example #20
        if int(info[0]) == item_id:
            f.close()
            return info[1]
        line = f.readline()
    f.close()
    return None

if __name__ == '__main__':
    # Load the data
    original_data = np.loadtxt("u.data", delimiter="\t")
    original_users = original_data[:,0]
    original_items = original_data[:,1]
    score = original_data[:,2]
    # Remove duplicates from the lists of user ids and movie ids
    users = np.unique(original_users)
    items = np.unique(original_items)
    # Build the user x item DataFrame
    # Initialize every element to 0
    df = DataFrame(np.zeros((len(users), len(items))), index=users, columns=items)
    # Fill in the rating scores
    for i in range(len(original_users)):
        df.ix[original_users[i], original_items[i]] = score[i]
    data = df.as_matrix()
    MF = MatrixFactorization()
    MF.fit(data)
    index = convert_user_id_to_index(users, 2)  # get the index corresponding to the user id
    item_index, rate = MF.predict(data, index)  # get the index of the item recommended for that user index
    item_id = convert_index_to_item_id(items, item_index)   # get the item id corresponding to the item index
    print(get_item_name(item_id), rate)  # print the name of the recommended item
    print("error=%d"%MF.get_result_error(data))  # print the squared error between R and U^T V
Example #21
obj2 = Series([4, 5, 6, 7], index=['d', 'b', 'a', 'c'])

print obj2

print "obj2.index:", obj2

print obj2['a']

print obj2['d']


'''
Getting column data
'''

'''
DataFrame.as_matrix(columns=None)
'''


import pandas as pd
data1 = pd.DataFrame(...)  # initialize an arbitrary DataFrame with 3 columns
data1.columns = ['a', 'b', 'c']

1.
data1['b']
# this retrieves the values of the 2nd column (column b)

2.
data1.b
# same effect as 1: retrieves the 2nd column (column b)
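For context: DataFrame.as_matrix, used throughout these examples, was deprecated in pandas 0.23 and removed in pandas 1.0. A minimal sketch of the old call next to its current equivalents:

import pandas as pd

data1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})

# old API (pandas < 0.23): selected columns as a numpy array
# mat = data1.as_matrix(columns=['b', 'c'])

# current equivalents
mat = data1[['b', 'c']].to_numpy()   # or data1[['b', 'c']].values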
Example #22
 def cluster_comments(self):
     """
     Clusters comments based on their timestamps and
     assigns cluster-membership as attribute to nodes.
     """
     the_nodes = self.graph.nodes()
     if len(the_nodes) < 7:
         logging.warning(
             "Skipped clustering for %s, only %i comments",
             self.post_title,
             len(the_nodes))
         for node in the_nodes:
             self.graph.node[node]['cluster_id'] = None
     else:
         com_ids, stamps = zip(
             *((node, data["com_timestamp"])
               for node, data in self.graph.nodes_iter(data=True)))
         data = DataFrame(
             {'timestamps': stamps}, index=com_ids).sort_values(
                 by='timestamps')
         epoch = data.ix[0, 'timestamps']
         data['timestamps'] = data['timestamps'].apply(
             lambda timestamp: (timestamp - epoch).total_seconds())
         # TODO: identify outliers (or sparse end of stamps) and set
         # cluster-id to None
         # (find sparse "end" by using the time-stamps as index
         # and ones as data,
         # pd.rolling_sum() for 1 day-window and create mask based on < 2)
         # then create cluster_data for data[data["cluster_id"] != None]
         cluster_data = data.as_matrix()
         # TODO: need more refined way to set quantile
         # more concise, but see if the check for 0 bandwidth is still needed
         for quantile in [.05, .3, 1, False]:
             if quantile:
                 try:
                     bandwidth = estimate_bandwidth(
                         cluster_data, quantile=quantile)
                 except ValueError:
                     logging.info(
                         "Estimation with quantile %f failed", quantile)
                 else:
                     break
             else:
                 logging.warning("Could not cluster %s", self.post_title)
                 sys.exit(1)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             try:
                 mshift = MeanShift(bandwidth=bandwidth, bin_seeding=False)
                 mshift.fit(cluster_data)
             except ValueError:
                 mshift = MeanShift(bandwidth=0.5, bin_seeding=False)
                 mshift.fit(cluster_data)
             labels = mshift.labels_
             unique_labels = np.sort(np.unique(labels))
             logging.info("Found %i clusters in %s",
                          len(unique_labels), self.post_title)
         try:
             assert len(labels) == len(cluster_data)
             # assert (unique_labels == np.arange(len(unique_labels))).all()
         except AssertionError as err:
             logging.warning("Mismatch cluster-labels: %s", err)
             print(unique_labels)
             print(labels)
         data['cluster_id'] = labels
         for com_id in data.index:
             self.graph.node[com_id]['cluster_id'] = data.ix[
                 com_id, 'cluster_id']
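A stripped-down sketch of the MeanShift clustering step above, assuming the timestamps have already been converted to seconds since the first comment (the graph bookkeeping and the quantile fallback loop are omitted):

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

# toy 1-D data: seconds since the first comment, in three rough bursts
seconds = np.array([0, 20, 45, 60, 90,
                    3600, 3620, 3700, 3750,
                    86400, 86410, 86500], dtype=float).reshape(-1, 1)

bandwidth = estimate_bandwidth(seconds, quantile=0.3)
mshift = MeanShift(bandwidth=bandwidth, bin_seeding=False)
labels = mshift.fit_predict(seconds)
print(labels)  # one cluster id per comment timestamp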