Example #1
def do_svm(rfam_id):
    rfam = Rfam(use_website = True)
    rnas, organisms, consensus_2d = rfam.get_entry(rfam_id = 'RF%05u'%rfam_id)
    #a matrix for each stem-loop
    stem_loop_descriptions = []
    for i, rna in enumerate(rnas):
        #print to_bn(consensus_2d, len(rna))
        #print rna.sequence
        ss = base_pairs_to_secondary_structure(rna, consensus_2d)
        ss.find_junctions()
        ss.find_stem_loops()

        if i == 0:
            print ss.stem_loops    

        for index, stem_loop in enumerate(ss.stem_loops):
            stem_loop_description = None
            if index >= len(stem_loop_descriptions):
                stem_loop_description = {}
                stem_loop_descriptions.append(stem_loop_description)
            else:
                stem_loop_description =  stem_loop_descriptions[index]       
            #print stem_loop
            for helix in stem_loop['helices']:
                location = helix['location']
                #we extract the sequence for each strand and we remove the gaps
                strand_1 = rna.sequence[location[0][0]-1:location[0][1]].replace('-','')
                strand_2 = rna.sequence[location[1][0]-1:location[1][1]].replace('-','')
                #if len(strand_1) != len(strand_2):
                #    print "not fully conserved helix"
                #    print rna.sequence[location[0][0]-1:location[0][1]], rna.sequence[location[1][0]-1:location[1][1]]
                l = stem_loop_description.get(helix['name']+'_strand_1', [])
                l.append(len(strand_1))
                stem_loop_description[helix['name']+'_strand_1'] = l
                l = stem_loop_description.get(helix['name']+'_strand_2', [])
                l.append(len(strand_2))
                stem_loop_description[helix['name']+'_strand_2'] = l
            for inner_loop in stem_loop['inner_loops']:
                for single_strand in inner_loop['single_strands']:
                    #we extract the sequence for this single-strand and we remove the gaps
                    seq = rna.sequence[single_strand['location'][0]-1:single_strand['location'][1]].replace('-','')
                    #print single_strand['name']
                    l = stem_loop_description.get("inner_loop_%s"%single_strand['name'], [])
                    l.append(len(seq))
                    stem_loop_description["inner_loop_%s"%single_strand['name']] = l

            apical_loop = stem_loop['apical_loop']['single_strands'][0]
            seq = rna.sequence[apical_loop['location'][0]-1:apical_loop['location'][1]].replace('-','')
            l = stem_loop_description.get("apical_loop_%s"%apical_loop['name'], [])
            l.append(len(seq))
            stem_loop_description["apical_loop_%s"%apical_loop['name']] = l
    
    for stem_loop_description in stem_loop_descriptions:
        df = DataFrame(stem_loop_description)
        columns = df.columns
        print columns
        print df.as_matrix(columns)
Example #2
def ListReading(filname, sname):
    table = read_excel(filname, sheetname=sname, header=0)
    Matrix = DataFrame.as_matrix(table)
    n = len(Matrix)
    m = len(Matrix[0])

    NPOS = {}
    Bond = {}
    C0 = {}
    C1 = {}

    for i in range(1, n):
        for j in range(1, m):
            if i == 1:
                NPOS[str(Matrix[0, j])] = Matrix[i, j]
            elif i == 2:
                Bond[(str(Matrix[0, j]), 'Sin')] = Matrix[i, j]
            elif i == 3:
                Bond[(str(Matrix[0, j]), 'Dou')] = Matrix[i, j]
            elif i == 4:
                C0[str(Matrix[0, j])] = Matrix[i, j]
            elif i == 5:
                C1[str(Matrix[0, j])] = Matrix[i, j]

    return NPOS, Bond, C0, C1
Example #3
    def __get_news_matrix(self, news: pd.DataFrame, max_len=1):
        ordered_cols = [
            'datetime', 'symbol', 'title', 'actual', 'forecast', 'previous',
            'symbol_pair', 'preceding_price'
        ]
        news = news[ordered_cols]

        all_titles = self.prep_data_provider.get_all_titles()
        all_pairs = self.prep_data_provider.get_currency_pair_strings()
        all_currencies = self.prep_data_provider.get_all_currencies()

        news = news.reset_index(drop=True)
        #
        # print(news.iloc[11717].tolist())
        # print(news.iloc[11718].tolist())
        # print(news.iloc[11719].tolist())

        # news = self.one_hot_from_all_items(news, 'preceding_price', all_labels)
        news['preceding_price'] = news['preceding_price'].apply(lambda x: x * 10)
        news = self.one_hot_from_all_items(news, 'symbol', all_currencies)
        news = self.one_hot_from_all_items(news, 'symbol_pair', all_pairs)
        news = self.one_hot_from_all_items(news, 'title', all_titles)

        # print(news.iloc[11717].tolist())
        # print(news.iloc[11718].tolist())
        # print(news.iloc[11719].tolist())
        # quit()
        news = news.drop('datetime', 1)
        return news.as_matrix()[:max_len]
Example #4
    def pre_process(x_train: pd.DataFrame, ind: int):

        def labels(shape: Tuple, target_shape: Tuple):
            # create one-hot image-shape array of labels for a picture
            array = np.ndarray((shape[0], shape[1], 2))
            array[:, :, :] = [1, 0]
            array[y1:y2 + 1, x1:x2 + 1, :] = [0, 1]
            image = cv2.resize(array, (target_shape[1], target_shape[0]))
            return np.reshape(image, (-1, 2))

        # Read image from data
        x_train = x_train.as_matrix()
        file = x_train[ind, 0]
        x1, y1, x2, y2 = x_train[ind, 1], x_train[ind, 2], x_train[ind, 3], x_train[ind, 4]
        p = re.compile('dayClip\d+')
        span = p.search(file).span()
        clip = file[span[0]: span[1]]
        formatted = file.replace(clip, clip + "/frames/" + clip)
        img = cv2.imread("./" + formatted)
        shape = img.shape
        # Crop
        cropped = img[0:shape[0] // 2]
        # Resize
        resized = cv2.resize(cropped, (400, 200), interpolation=cv2.INTER_AREA)
        # Blur
        blurred = cv2.GaussianBlur(resized, (5, 5), 0)
        # Convert color space
        final_image = cv2.cvtColor(blurred, cv2.COLOR_BGR2YUV)
        return final_image, labels(cropped.shape, resized.shape)
Example #5
 def transform(self):
     subjectlist = map(lambda x:int(x[1:3]),self.subjsess_list)
     feedbacksesslist = map(lambda x:int(x[8:10]),self.subjsess_list)
     X = DataFrame()
     xsubj = []
     xsess = []
     xfeedbacknum = []
     xstartpos = []
     # xstartpostime = []    # time isn't really improving accuracy
     for findex in range(len(self.flist)):
         x_df = read_csv(self.flist[findex])
         fb_indices = x_df[x_df["FeedBackEvent"] == 1].index.tolist()
         # starttime_indices = x_df["Time"].iloc[fb_indices]
         del x_df
         fb_nums = range(len(fb_indices))
         subj_nums = [subjectlist[findex]]*len(fb_indices)
         sess_nums = [feedbacksesslist[findex]]*len(fb_indices)
         xsubj.extend(subj_nums)
         xsess.extend(sess_nums)
         xfeedbacknum.extend(fb_nums)
         xstartpos.extend(fb_indices)
         # xstartpostime.extend(starttime_indices)
     X["subject"] = xsubj
     X["sess"] = xsess
     X["feedback_num"] = xfeedbacknum
     X["start_pos"] = xstartpos
     # X["start_pos_time"] = xstartpostime
     return X.as_matrix()
Example #6
    def transform(self, x):
        """Combine the matrix in X with the selected function.

        Args:
            x: array of shape [n_samples, n_features]

        Returns:
            array of shape [n_samples, n_features + 1]
            with an extra feature containing the combined matrix.

        """
        df = x
        is_df = False

        if self.columns and self.column_name:
            if not isinstance(x, DataFrame):
                columns = [str(c_i) for c_i in range(x.shape[1])]
                self.column_name = str(x.shape[1] + 1)

                df = DataFrame(data=x, columns=columns)

                is_df = True

            df = combine_matrix(df,
                                columns=self.columns,
                                column_result=self.column_name,
                                func=self.op)

        return df if not is_df else df.as_matrix(
            columns=df.columns.sort_values())
Example #7
def ParamReading(filen, sname):

    from sets import Set

    table = read_excel(filen, sheetname=sname, header=0)
    Matrix = DataFrame.as_matrix(table)
    n = len(Matrix)
    m = len(Matrix[0])

    SMI = Set()  #Smiles set
    Second = Set()
    L = {}  #Property dictionary
    #Smiles data
    for i in range(1, n):
        xs = []

        #Set addition--------
        for j in range(0, 3):
            xs.append(str(Matrix[i, j]))
        SMI.add(tuple(xs))

        #List addition------
        for j in range(3, m):
            if isnan(Matrix[i, j]) == False:
                L[tuple(xs), str(Matrix[0, j])] = Matrix[i, j]

    #Second index information
    for j in range(3, m):
        Second.add(str(Matrix[0, j]))
    return SMI, Second, L
Example #8
 def transform(self):
     subjectlist = map(lambda x: int(x[1:3]), self.subjsess_list)
     feedbacksesslist = map(lambda x: int(x[8:10]), self.subjsess_list)
     X = DataFrame()
     xsubj = []
     xsess = []
     xfeedbacknum = []
     xstartpos = []
     # xstartpostime = []    # time isn't really improving accuracy
     for findex in range(len(self.flist)):
         x_df = read_csv(self.flist[findex])
         fb_indices = x_df[x_df["FeedBackEvent"] == 1].index.tolist()
         # starttime_indices = x_df["Time"].iloc[fb_indices]
         del x_df
         fb_nums = range(len(fb_indices))
         subj_nums = [subjectlist[findex]] * len(fb_indices)
         sess_nums = [feedbacksesslist[findex]] * len(fb_indices)
         xsubj.extend(subj_nums)
         xsess.extend(sess_nums)
         xfeedbacknum.extend(fb_nums)
         xstartpos.extend(fb_indices)
         # xstartpostime.extend(starttime_indices)
     X["subject"] = xsubj
     X["sess"] = xsess
     X["feedback_num"] = xfeedbacknum
     X["start_pos"] = xstartpos
     # X["start_pos_time"] = xstartpostime
     return X.as_matrix()
Example #9
def Impute(data_as_DataFrame, kNNGraph, Method = IgnoringNan.mean, target = None ):
    """Impute(data_as_DataFrame,Graph) -> pandas DataFrame with nan's imputed
    
    Imputation is via Graph Neighborhoods of kNNGraph
    Method is applied to each neighborhood array of values for a 
    vertex with a NaN
    
    Note: data_as_DataFrame can also be a numpy array 
    """
    
    try:
        data_as_DataFrame.columns
        data_as_DataFrame.index
    
        DFrame = data_as_DataFrame.copy()
    except:
        DFrame = DataFrame( data_as_DataFrame )
        
    cols = DFrame.columns
    inds = DFrame.index
    Data = DFrame.as_matrix()
    
    m,n = DFrame.shape
    for i in range(m):
        nbrs = kNNGraph.neighbors(i)
        for j in range(n):
            if( isnan( Data[i,j] ) ):
                 DFrame.set_value( inds[i],cols[j], int( Method( array( [Data[nbr,j] for nbr in nbrs] ) ) ) )
    return DFrame
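A minimal usage sketch for Impute, assuming only that the graph argument exposes a neighbors(i) method; the GridGraph stub, the toy frame, and the use of np.nanmean in place of IgnoringNan.mean are illustrative assumptions:

import numpy as np
from pandas import DataFrame

class GridGraph:
    """Hypothetical stand-in for kNNGraph: each row's neighbours are the adjacent rows."""
    def __init__(self, n):
        self.n = n
    def neighbors(self, i):
        return [j for j in (i - 1, i + 1) if 0 <= j < self.n]

frame = DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
imputed = Impute(frame, GridGraph(len(frame)), Method=np.nanmean)
print(imputed)  # each NaN becomes the integer-cast mean of its graph neighbours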
Example #10
    def __init__(self, notation=None, resolution=0.01, parameters=None):

        self.parameters = {"beta_r": 0.5,
                           "beta_e": 1e-5,
                           "c": 1.0,
                           "alpha": 0.05,
                           "phi": 50.0,
                           "gamma": 0.2,
                           "r0": 0.05,
                           "A": 5.0,
                           "mu_poisson": 3.0,
                           "mu_wald": 0.68,
                           "lambda_wald": 0.93}
        if parameters is not None:
            self.parameters.update(parameters)

        # TODO(David): Completely change this, probably, because it sucks
        self.resolution = resolution
        self.original_notation = notation
        self.notation = notation  # in case we need to convert point events
        self.array_notation = notation  # numpy array for jit
        self.parse_notation()
        self.convert_notation()

        # TODO(David): Fix for multiple cycle types
        df_cycle_types = DataFrame(self.array_notation[:, :, 0], columns=["event", "id"])
        df_cycle_types = df_cycle_types[(df_cycle_types.event==STIMULUS_ON) | (df_cycle_types.event==STIMULUS_OFF)].drop_duplicates()
        df_cycle_types["row"] = arange(0, df_cycle_types.shape[0])

        self.time_marker_info = df_cycle_types.as_matrix()
        self.n_time_markers = self.time_marker_info.shape[0]
        self.time_marker_row = 2
Example #11
def transform_target_vector(df: pd.DataFrame, binary=False) -> np.array:
    """Only used on data with known labels otherwise it will fail"""
    binarize = lambda x: 1 if x > 0 else 0
    detect = lambda x: x if np.isnan(x) else binarize(x)
    if binary:
        df.returnQuantity = df.returnQuantity.apply(detect)
    return np.squeeze(df.as_matrix(columns=['returnQuantity'])).astype(np.float32)
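A small sketch of how this helper might be called; the orders frame is hypothetical and only needs the returnQuantity column:

import numpy as np
import pandas as pd

orders = pd.DataFrame({'returnQuantity': [0, 2, 1, 0]})
y = transform_target_vector(orders, binary=True)
print(y)  # [0. 1. 1. 0.] as float32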
Example #12
    def test_as_matrix(self):
        frame = self.frame
        mat = frame.as_matrix()

        frameCols = frame.columns
        for i, row in enumerate(mat):
            for j, value in enumerate(row):
                col = frameCols[j]
                if np.isnan(value):
                    assert np.isnan(frame[col][i])
                else:
                    assert value == frame[col][i]

        # mixed type
        mat = self.mixed_frame.as_matrix(['foo', 'A'])
        assert mat[0, 0] == 'bar'

        df = DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]})
        mat = df.as_matrix()
        assert mat[0, 0] == 1j

        # single block corner case
        mat = self.frame.as_matrix(['A', 'B'])
        expected = self.frame.reindex(columns=['A', 'B']).values
        assert_almost_equal(mat, expected)
Example #13
def buildTransferEntropyMatrix(fname, debug=False):
    #print("Starting JVM and importing Java Objects...")
    print(fname)

    teCalcClass = JPackage("infodynamics.measures.continuous.gaussian"
                           ).TransferEntropyCalculatorGaussian
    teCalc = teCalcClass()
    seCalcClass = JPackage(
        "infodynamics.measures.continuous.kernel").EntropyCalculatorKernel
    seCalc = seCalcClass()
    #print("Success!")

    raw_df = pd.read_table(fname, delimiter='\t')
    data = DataFrame.as_matrix(raw_df)[:, 1:-1]
    mat = np.negative(np.ones((data.shape[1], data.shape[1])))
    print("Building matrix for " + fname)
    for i in range(mat.shape[0]):
        if debug:
            print("Row: " + str(i) + ", Time: " + str(time.time()))
        for j in range(mat.shape[1]):
            if i != j:
                c1 = data[:, i]
                c2 = data[:, j]
                mat[i, j] = getTransferEntropy(teCalcClass, teCalc, c1, c2)
            else:
                c = data[:, i]
                mat[i, j] = getShannonEntropy(seCalc, c)  #Keep in bits.
    if data.shape[1] == 265:
        mat = np.insert(mat, (25, 132, 176), 0.0, axis=0)
        mat = np.insert(mat, (25, 132, 176), 0.0, axis=1)
    #shutdownJVM()
    return mat
Example #14
def build_wordencoder(embeddings: pd.DataFrame, transform: Callable[[str], str]) \
        -> TextEncoder:
    """
    Create a word-level encoder: a Callable, mapping strings into integer arrays.
    Encoders dispatch on input type: if you pass a single string, you will get
    a 1D array, if you pass an Iterable of strings, you will get a 2D array,
    where row i encodes the i-th string in the Iterable.
    :param embeddings: a dataframe of word vectors indexed by words. The last
    vector (row) is used to encode OOV words.
    :param transform: normalisation applied to each word before lookup (e.g. lower-casing)
    :return: a word-level TextEncoder
    """
    wordmap = {word: i for i, word in enumerate(embeddings.index)}
    if not wordmap:
        raise ValueError('empty `embeddings`')
    if not all(isinstance(word, str) for word in wordmap):
        raise ValueError('`embeddings` can be indexed by strings alone')
    oov = wordmap[embeddings.index[-1]]
    vectors = embeddings.as_matrix()

    def index(word: str) -> int:
        if not word:
            raise ValueError("can't encode empty words")
        return wordmap.get(transform(word), oov)

    def wordencoder(target: Union[str, Iterable[str]]) -> np.ndarray:
        if isinstance(target, str):
            return vectors[index(target)]
        indices = list(map(index, target))
        if not indices:
            raise ValueError('there are no `target`s')
        return np.vstack(vectors[indices])

    return wordencoder
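A usage sketch under the docstring's assumptions, with a toy embeddings frame whose last row (the hypothetical '<unk>' entry) doubles as the OOV vector:

import numpy as np
import pandas as pd

embeddings = pd.DataFrame(np.arange(12, dtype=float).reshape(3, 4),
                          index=['cat', 'dog', '<unk>'])
encode = build_wordencoder(embeddings, str.lower)

print(encode('Cat'))              # 1D vector for 'cat' (transform lower-cases the input)
print(encode(['dog', 'wombat']))  # 2D array; the unknown word falls back to the OOV row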
Example #15
    def orient_undirected_graph(self,
                                data,
                                graph,
                                CItest="gaussian",
                                method_indep='pcalg',
                                alpha=0.01,
                                njobs=SETTINGS.NB_JOBS,
                                verbose=False,
                                **kwargs):
        """Run PC on an undirected graph."""
        # Building setup w/ arguments.
        self.arguments['{CITEST}'] = self.CI_tests[CItest]
        self.arguments['{METHOD_INDEP}'] = self.method_indep[method_indep]
        self.arguments['{DIRECTED}'] = 'TRUE'
        self.arguments['{ALPHA}'] = str(alpha)
        self.arguments['{NJOBS}'] = str(njobs)
        self.arguments['{VERBOSE}'] = str(verbose).upper()

        fe = DataFrame(nx.adj_matrix(graph, weight=None).todense())
        fg = DataFrame(1 - fe.as_matrix())

        results = self.run_pc(data,
                              fixedEdges=fe,
                              fixedGaps=fg,
                              verbose=verbose)

        return nx.relabel_nodes(nx.DiGraph(results),
                                {idx: i
                                 for idx, i in enumerate(data.columns)})
Example #16
def gonzales(data, k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choosing a random point as the first center

    #center0 =     points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance', 'center'], axis=1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #loop k times to collect the centers (the extra center appended in the last pass is dropped below)
    for k_cycle in range(1, k + 1):
        # variables to save the next center to be chosen, based on the maximum distance from a point to its closest center
        max_distance = 0
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center
        for indexp, p in points_list.iterrows():
            #variables to track the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(
                    center.as_matrix(columns=[0, 1]),
                    p.as_matrix(columns=[0, 1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp

        centers_list = centers_list.append(
            points_list.ix[[next_cluster], :distance_column_index])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1, inplace=True)

    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0, 1])
Example #17
def _to_xy(df: pd.DataFrame, target: str):
    """Converts a Pandas dataframe to the x,y inputs that TensorFlow needs"""
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    dummies = df[target]
    return df.as_matrix(result).astype(np.float32), dummies.as_matrix().flatten().astype(int)
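A minimal sketch of the expected input, assuming a hypothetical frame with two feature columns and an integer label column:

import numpy as np
import pandas as pd

df = pd.DataFrame({'f1': [0.1, 0.2, 0.3],
                   'f2': [1.0, 0.5, 0.25],
                   'label': [0, 1, 0]})
x, y = _to_xy(df, target='label')
# x: float32 matrix of every non-target column, y: flat int vector of labels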
Example #18
def transform_target_vector(df: pd.DataFrame, binary=False) -> np.array:
    """Only used on data with known labels otherwise it will fail"""
    binarize = lambda x: 1 if x > 0 else 0
    detect = lambda x: x if np.isnan(x) else binarize(x)
    if binary:
        df.returnQuantity = df.returnQuantity.apply(detect)
    return np.squeeze(df.as_matrix(columns=['returnQuantity'])).astype(
        np.float32)
Example #19
def getData(training_file, time_steps, split_percent):
    print("Train on file",training_file)

    series = read_csv(training_file)

    file_vector = getFileVector(training_file)
    #file_vector_size = len(file_vector)

    print("File vector ", file_vector)
    file_vector_size = len(file_vector)
    
    # Convert series to dataframe to perform shift and generate test set
    series_dataframe = DataFrame(series)
    shifted_dataframe = series_dataframe.shift(1)
    shifted_dataframe.fillna(0, inplace=True)

    # DataFrames to Numpy arrays
    series_dataframe_narray =  series_dataframe.as_matrix()
    shifted_dataframe_narray = shifted_dataframe.as_matrix()

    # Create small samples of time_steps size
    split_length = time_steps
    num_samples = shifted_dataframe_narray.shape[0]-split_length

    file_vector_narray = np.array(file_vector).reshape(1,1,-1)

    #print("File vector reshaped before np repeat ", file_vector_narray.shape)

    file_vector_repeat = np.repeat(np.repeat(file_vector_narray,num_samples,axis=0),split_length,axis=1)

    #print("X_samples shape repeated ", file_vector_repeat.shape)
    num_features = file_vector_size+1

    X_samples = np.zeros([num_samples, split_length, num_features])
    y_samples = np.zeros([num_samples, 1])

    #print("X_samples shape ", X_samples.shape)

    for i in range(num_samples) :
        #print("X samples shape ", X_samples[i].shape)
        #print("shifted ", shifted_dataframe_narray[i:i+split_length,0].reshape(-1,1).shape)
        X_samples[i] = np.append(file_vector_repeat[i],shifted_dataframe_narray[i:i+split_length,0].reshape(-1,1),axis=1)

        #print("X_samples ", X_samples[i])
        y_samples[i] = series_dataframe_narray[i+split_length,0]

    # Add additional dimension to feed it into tensorflow. It needs 3D tensor for LSTM
    #X_samples = np.reshape(X_samples,[X_samples.shape[0],X_samples.shape[1],1])


    # Split into training and test data
    split = int(split_percent*num_samples)
    X_train_samples = X_samples[:split]
    y_train_samples = y_samples[:split]
    X_test_samples = X_samples[split:]
    y_test_samples = y_samples[split:]

    return X_train_samples, y_train_samples, X_test_samples, y_test_samples
Example #20
def gonzales(data , k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choosing a random point as the first center

    #center0 =     points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 =     points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #loop k times to collect the centers (the extra center appended in the last pass is dropped below)
    for k_cycle in range(1,k+1):
        # variables to save the next center to be chosen, based on the maximum distance from a point to its closest center
        max_distance = 0 
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            #variables to track the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0 ,1]) , p.as_matrix(columns=[0 ,1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index   ])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0 ,1])
Example #21
def df2array(stock_df: pd.DataFrame,
             X_feats: List[str],
             y_feat: str,
             rescale=False):
    dataX = stock_df.as_matrix(X_feats)
    dataY = stock_df.as_matrix([y_feat]).reshape(-1)
    dataY = np.sign(np.sign(dataY) + 1.0)  # float => label

    dataX = dataX[np.isfinite(dataY), :]
    dataY = dataY[np.isfinite(dataY)]

    dataX = np.nan_to_num(dataX)

    if rescale:
        X_mean = np.mean(dataX, axis=0)
        X_std = np.std(dataX, axis=0)
        dataX = (dataX - X_mean[np.newaxis, :]) / X_std[np.newaxis, :]
    return dataX, dataY
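A hedged usage sketch with hypothetical feature names; it shows the NaN handling implemented above (rows with a NaN label dropped, NaN features zeroed, optional z-scoring):

import numpy as np
import pandas as pd

stock_df = pd.DataFrame({
    'ret_1d': [0.01, -0.02, 0.005, np.nan],
    'vol_5d': [0.12, 0.15, np.nan, 0.10],
    'next_ret': [0.02, -0.01, 0.0, np.nan],   # column converted to 0/1 labels
})
X, y = df2array(stock_df, X_feats=['ret_1d', 'vol_5d'], y_feat='next_ret', rescale=True)
print(X.shape, y)  # (3, 2) and the labels [1. 0. 1.]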
Example #22
def pickleRawData():
    f = open(RAW_FILE_NAME,'r')
    middata = []  # [highest, lowest, close, volume, time]
    rawdata = f.readlines()
    f.close()
    # rawdata = rawdata[-2500:]
    print('read and format data:\n')
    for line in tqdm(rawdata):
        cell = line.split(',')
        time = cell[0]+' '+cell[1]        
        highest = float(cell[3])
        lowest = float(cell[4])
        close = float(cell[5])
        volume = float(cell[6])
        time = makeTime(time)
        middata.append([highest, lowest, close, volume, time])

    datalen = len(middata)
    assert( datalen> SAMPLE_LENGTH)
    num = datalen - SAMPLE_LENGTH +1
    lastdata = []
    print('read completed! now enter core processing:\n')
    for i in tqdm(range(num)):
        d = middata[i:i+SAMPLE_LENGTH]
        df = DataFrame(d)
        max_v = max(df[0])
        min_v = min(df[1])
        f_k = lambda x: (GRID_HIGH*(x-min_v)/(max_v-min_v))
        df['h'] = df[0].apply(f_k)
        df['l'] = df[1].apply(f_k)
        df['c'] = df[2].apply(f_k)
        df['v'] = df[3]
        df['t'] = df[4]
        df['v'].astype('float')
        for i in range(5):
            del df[i]
        # print(df)
        matrix = df.as_matrix()
        matrix = matrix.transpose()
        matrix = matrix/GRID_HIGH
        # print(matrix)
        # exit()
        lastdata.append(matrix)

    lastlen = len(lastdata)
    assert(lastlen > PREDICT_LENGTH)
    lastnum = lastlen - PREDICT_LENGTH
    outxy = []
    print('core completed!, make output tuple:\n')
    for i in tqdm(range(lastnum)):
        outxy.append((lastdata[i],lastdata[i+PREDICT_LENGTH]))

    random.shuffle(outxy)
    # print(outxy)

    with open(filename(), 'wb') as f:
        pickle.dump(outxy, f)
Example #23
 def fit(self, X: pd.DataFrame, w: np.ndarray):
     if len(X) == 0:
         raise NotEnoughParticles("Fitting not possible.")
     self._X_arr = X.as_matrix()
     sample_cov = smart_cov(self._X_arr, w)
     dim = sample_cov.shape[0]
     eff_sample_size = 1 / (w**2).sum()
     bw_factor = self.bandwidth_selector(eff_sample_size, dim)
     self.cov = sample_cov * bw_factor**2 * self.scaling
     self.normal = st.multivariate_normal(cov=self.cov, allow_singular=True)
Example #24
 def fit(self, X: pd.DataFrame, w: np.ndarray):
     if len(X) == 0:
         raise NotEnoughParticles("Fitting not possible.")
     self._X_arr = X.as_matrix()
     cov = smart_cov(self._X_arr, w)
     effective_sample_size = len(X) / (1 + w.var())
     dimension = cov.shape[0]
     self.cov = cov * self.bandwidth_selector(effective_sample_size,
                                              dimension) * self.scaling
     self.normal = st.multivariate_normal(cov=self.cov, allow_singular=True)
Example #25
def unfinished_tasks(all_configurations, data_frame: pd.DataFrame,
                     conf_columns: List[str]):
    done_configs = data_frame.as_matrix(conf_columns)
    done_configs = {tuple(config) for config in done_configs}

    all_as_set = {tuple(config) for config in all_configurations}
    assert done_configs <= all_as_set
    print('found {}/{} is done.'.format(len(set(done_configs) & all_as_set),
                                        len(all_as_set)))
    print('there are {} finished settings not in all_configurations.'.format(
        len(done_configs - all_as_set)))
    return list(all_as_set - done_configs)
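A small usage sketch with a hypothetical hyper-parameter grid and a results frame containing the runs finished so far:

import pandas as pd

all_configurations = [(lr, bs) for lr in (0.1, 0.01) for bs in (16, 32)]
results = pd.DataFrame({'lr': [0.1, 0.01], 'batch_size': [16, 32], 'accuracy': [0.90, 0.92]})

todo = unfinished_tasks(all_configurations, results, conf_columns=['lr', 'batch_size'])
print(todo)  # the two (lr, batch_size) pairs not yet present in `results`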
Example #26
 def fit(self, news_df: pd.DataFrame, batch_size: int, epochs: int, company_alias: Dict = None, verbose: int = 0):
     news_df_prs = tokenize_csv_file(news_df, should_replace_company=True, should_remove_NE=True,
                                     should_remove_numbers=False, company_alias=company_alias)
     self.emb_matrix = self.text_converter.fit(news_df_prs, 'text')
     self.model = nn_model(self.emb_matrix)
     X = self.text_converter.convert(news_df_prs, 'text')
     y = news_df.as_matrix(['sentiment'])
     print('Embedding Matrix Size: {}\nTraining Data Size, X: {}, Y:{}'.format(np.shape(self.emb_matrix),
                                                                               np.shape(X),
                                                                               np.shape(y)))
     print('Start Training...')
     self.model.fit(X, y, batch_size=batch_size, epochs=epochs, verbose=verbose)
Example #27
def construct_data_matrix(input_file_name, output_filename_python, output_filename_matlab, has_header=True):
    data = dict()  # used to store the data matrix Z
    data_dates = dict()  # used to store the dates used in the data matrix
    # response_vector = dict()  # response vector y

    ticker = None
    with open(input_file_name, 'r') as infile:
        if has_header:
            infile.readline()

        for l in csv.reader(infile):

            cur_date = datetime.strptime(l[PRICE_CHANGE_FIELDS.Date.zvalue], PRICE_CHANGE_DATE_FORMAT)

            # Filter everything that's not from 2014 H2
            if cur_date.year != 2014 or cur_date.month < 6:
                continue

            # Filter lines without an analyst target price
            try:
                Pi = float(l[PRICE_CHANGE_FIELDS.New_Target.zvalue])  # Analyst prediction = Pi / Peoq
            except ValueError:
                continue

            Peoq = float(l[PRICE_CHANGE_FIELDS.Current_Price.zvalue])
            Pf = float(l[PRICE_CHANGE_FIELDS.Price_in_a_year.zvalue])  # true values y = Pf / Peoq
            analyst = l[PRICE_CHANGE_FIELDS.Firm.zvalue]

            if ticker != l[PRICE_CHANGE_FIELDS.Ticker.zvalue]:
                ticker = l[PRICE_CHANGE_FIELDS.Ticker.zvalue]
                data[ticker] = dict()
                data_dates[ticker] = dict()
                data[ticker][RESPONSE_LABEL] = Pf /Peoq

            # Don't assume data is ordered latest first
            if analyst not in data[ticker] or data_dates[ticker][analyst] < cur_date:
                data[ticker][analyst] = Pi / Peoq
                data_dates[ticker][analyst] = cur_date

            # if ticker not in response_vector:
            #     response_vector[ticker] = Pf / Peoq

    pickle.dump(data, open(output_filename_python, 'wb'))
    # Z = pickle.load(open('data/data_matrix.pkl'))

    df = DataFrame(data).T.fillna(0)
    column_labels = df.columns.tolist()  # Firms
    row_labels = df.T.columns.tolist()  # Tickers
    # df['Citigroup Inc.']
    # df.T['AAPL']

    savedict = {"data": df.as_matrix(), "column_labels": column_labels, "row_labels": row_labels}
    sio.savemat(output_filename_matlab, savedict)
Example #28
def standard_som():
   path = "data/dorothea_clean.csv"
   #path = "research/data/housing.data"
   table = pd.read_csv(path, header=None)
   table_new = table.dropna(axis=0)
   data = DataFrame.as_matrix(table_new)

   my_som = SOM(20, 20, 5000)
   lattice = my_som.calc(data)
   u_matrix = create_u_matrix(lattice)
   plt.matshow(u_matrix.T, fignum=100, cmap='viridis')
   plt.show()
Example #29
    def orient_directed_graph(self, data, dag, alg='HC', **kwargs):
        """ Improve a directed acyclic graph using CGNN

        :param data: data
        :param dag: directed acyclic graph to optimize
        :param alg: type of algorithm
        :param log: Save logs of the execution
        :return: improved directed acyclic graph
        """
        data = DataFrame(scale(data.as_matrix()), columns=data.columns)
        alg_dic = {'HC': hill_climbing, 'tabu': tabu_search}
        return alg_dic[alg](dag, data, self.infer_graph, **kwargs)
Example #30
def cal_cor():
    cor = list()
    df = DataFrame()
    input = open('raw_data.pkl', 'rb')
    df = pickle.load(input)
    data = np.matrix(df)
    data = (data.T)[1:, :-1]
    rate = df.as_matrix(['ReturnRate'])[1:]
    #filter cor
    a = []
    for i in xrange(data.shape[0]):
        cor.append(np.corrcoef(rate.T, data[i])[0][1])
        if abs(cor[i]) < 0.03:
            a.append(i)
            print 'del:' + index[i] + ':' + str(cor[i])
        else:
            print 'keep:----' + index[i] + ':' + str(cor[i]) + '----'
    data = np.delete(data, a, axis=0)
    #delete filtered features in data
    a.reverse()
    index_cor = copy(index)
    for i in a:
        index_cor.pop(i)

    output = open('filter_cor.pkl', 'wb')
    pack = {}
    pack['result'] = rate
    pack['data'] = data
    pack['index'] = index_cor

    pickle.dump(pack, output)
    output.close()

    # plotting
    if True:
        fig = figure()
        # all six features at once
        for i in xrange(len(pack['index'])):
            plot(pack['data'].tolist()[i], label=pack['index'][i])
        plot(pack['result'].T.tolist()[0], linewidth=2.5, label='result')
        legend(loc='upper left')
        savefig(
            'C:\\Projects\\FuzzyNeuro\\FuzzyNeuro\\20170320\\1\\raw_data.png',
            dpi=200)
        # plot each of the six features separately
        for i in xrange(len(pack['index'])):
            figure()
            plot(pack['data'].tolist()[i], label=pack['index'][i])
            legend(loc='upper left')
            savefig('C:\\Projects\\FuzzyNeuro\\FuzzyNeuro\\20170320\\1\\' +
                    pack['index'][i] + '_raw.png',
                    dpi=200)
Example #31
def CorReading(filname, sname):
    table = read_excel(filname, sheetname=sname, header=0)
    Matrix = DataFrame.as_matrix(table)
    n = len(Matrix)
    m = len(Matrix[0])

    L = {}

    for i in range(1, n):
        for j in range(1, m):
            if isnan(Matrix[i, j]) == False:
                L[(str(Matrix[0, j]), str(Matrix[i, 0]))] = Matrix[i, j]
    return L
Example #32
def cleanTrain():
    if not DEBUG:
        pdtest = pd.read_csv('test_ver2.csv/test_ver2.csv', delimiter=',')
        pdtrain = pd.read_csv('train_ver2.csv/train_ver2.csv', delimiter=',')

        pickle.dump(pdtrain, open(r'RawTrain.pickle', "wb"))
        pickle.dump(pdtest, open(r'RawTest.pickle', 'wb'))

    if DEBUG:
        train = pd.read_csv('train_ver2.csv/train_ver2.csv', delimiter=',')
        train2 = DataFrame.as_matrix(train)

    return pdtrain
Example #33
def sequential_forward_selection(clf, X: pd.DataFrame, y: pd.DataFrame,
                                 k) -> list:
    """
    Calculate, for each number of features up to k, the best feature set.
    As in the tutorial, the larger sets contain the smaller ones.

    :return: a tuple (bestIndexes, bestFeatures, bestScores): the selected column indexes, their names,
        and the cross-validation score reached at each selection round.
    """

    X = X.loc[:, X.columns != 'Unnamed: 0']
    base = [feature for feature in X.keys()]
    bestIndexes = dict()
    bestScores = dict()
    X = X.as_matrix()
    y = y.as_matrix().ravel()

    for i in range(k):
        bestScore = 0
        for j in range(0, len(base)):
            if j in bestIndexes.values():
                continue
            currIndexes = [bestIndexes[l] for l in range(i)]
            currIndexes.append(j)
            currX = X[:, currIndexes]
            tempScore = metrics.accuracy_score(
                y, cross_val_predict(clf, currX, y, cv=3))
            if tempScore > bestScore:
                bestScore = tempScore
                bestIndexes[i] = j
                bestScores[i] = bestScore

    indexByOrder = []
    bestFeatures = []
    print(bestScores)
    for l in bestIndexes.keys():
        indexByOrder.append(bestIndexes[l])
        bestFeatures.append(base[bestIndexes[l]])

    return bestIndexes, bestFeatures, bestScores
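A usage sketch on a standard dataset, assuming the cross_val_predict and metrics imports of the example are in scope; the classifier choice is arbitrary:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.DataFrame(iris.target)

indexes, features, scores = sequential_forward_selection(DecisionTreeClassifier(), X, y, k=2)
print(features, scores)  # the two greedily selected features and their CV accuracies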
Example #34
def generateMatrix():
    db=pymysql.connect("localhost","root","root","bookRec")
    cursor=db.cursor()
    sql="select user_id,book_id,score from comment limit 1000;"   #取出所有的用户id,书籍id,评分
    cursor.execute(sql)
    data=cursor.fetchall()
    db.close()
    frame=DataFrame()               # build the matrix
    n=0
    for item in data:
        n+=1
        print(n)               # add values to the matrix; it expands automatically
        frame.loc[item[1],item[0]]=item[2]
    return mat(frame.as_matrix())
Example #35
def find_top_k_chars(recording, model, k=3):
    x_test, labels = preprocess_data(recording)
    # Convert this test data to a dataframe
    test_dat = DataFrame(x_test)

    # Center and scale the data
    center_scale(test_dat)

    # This is a dictionary with a unique id as key and labels as value
    id_to_char = dict(zip(range(len(model.classes_[0])), model.classes_[0]))

    # Predict using the rf model and return the predicted characters
    test_prediction = model.predict_proba(test_dat.as_matrix())
    return dict(zip(labels, top_k_prediction(test_prediction, id_to_char)))
Example #36
def pca(df: DataFrame, file_path: str, eigenvalues_condition: Callable[[float],
                                                                       bool]):
    """
    Transforms a dataset into one with fewer dimensions via PCA and saves the result to a csv file.
    Implementation based on the document 'A tutorial on Principal Components Analysis' by Lindsay I Smith

    :param df: dataset with only numeric attributes and without the target attribute
    :param file_path: relative path to the csv file where the result will be saved
    :param eigenvalues_condition: boolean function used to filter the eigenvalues (and with them the associated
        eigenvectors) that will be used to build the row_feature_vector matrix (see the document).
    """

    # the first step is skipped, assuming the data already satisfies the preconditions

    # second step: subtract the means
    row_data_adjust = DataFrame()
    means = []
    for a in df.columns.values:
        means.append(df[a].mean())
    for (i, a) in enumerate(df.columns.values):
        row_data_adjust[a] = df[a] - means[i]

    # third step: compute the covariance matrix
    C = row_data_adjust.cov()

    # fourth step: compute the eigenvalues and eigenvectors of the covariance matrix
    U, Sigma, V = randomized_svd(C.as_matrix(),
                                 n_components=C.shape[0],
                                 n_iter=5,
                                 random_state=None)

    # fifth step: choose the components that form the feature vector
    order = (-Sigma).argsort()
    Sigma = Sigma[order]
    U = U[:, order]
    filtered_indices = [
        i for i in range(len(Sigma)) if eigenvalues_condition(Sigma[i])
    ]
    row_feature_vector = U[:, filtered_indices].transpose()

    # sixth step: derive the new dataset
    row_data_adjust = row_data_adjust.as_matrix()\
        .transpose()
    # noinspection PyUnresolvedReferences
    final_data = np.matmul(row_feature_vector, row_data_adjust)
    final_data = final_data.transpose()

    # save the result to a csv
    final_data = DataFrame(final_data)
    final_data.to_csv(file_path, index=False, encoding='utf-8')
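A usage sketch: a toy frame with two correlated columns and one noise column, a hypothetical output path, and a threshold that keeps only eigenvalues above 0.5:

import numpy as np
from pandas import DataFrame

rng = np.random.RandomState(0)
x = rng.normal(size=100)
df = DataFrame({"f1": x,
                "f2": 2 * x + rng.normal(scale=0.1, size=100),
                "f3": rng.normal(size=100)})

pca(df, "reduced.csv", lambda eigenvalue: eigenvalue > 0.5)
# reduced.csv now holds the projection onto the retained principal components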
Example #37
 def __init__(self):
     # Load data
     self.public_sector = read_table(
         "data/dhmosia_kthria/dhmosia_kthria_attikis.csv",
         index_col="ktirio_ypiresia", sep=';')
     self.stops = read_csv("data/oasa/stops.txt")
     self.routes = read_csv("data/oasa/routes.txt",
                            index_col="route_short_name")
     self.stop_times = read_csv("data/oasa/stop_times.txt",
                                index_col="stop_id")
     # Get only the coordinates of the bus stops to make the
     # "training" data
     self.coordinates = DataFrame.as_matrix(
         self.stops, columns=["stop_lon", "stop_lat"])
     self.nbrs = NearestNeighbors().fit(self.coordinates)
Example #38
def kmeans_scikit(data , k):
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    mat = points_list.as_matrix()
    print(mat)
    # Using sklearn
    km = sklearn.cluster.KMeans(n_clusters=k)
    km.fit(mat)
    # Get cluster assignment labels
    labels = km.labels_
    print(labels)
    print('==============')
    print(km.predict([[20 ,-15]]))
    # Format results as a DataFrame
    #results = pd.DataFrame([points_list.index,labels]).T
    points_list['labels'] = labels
    points_list.plot(kind='scatter', x=0, y=1    , c='labels'  )
    plt.show()
    print(points_list)
Example #39
def encode_features(df: pd.DataFrame, ft: str) -> csr_matrix:
    """Encode categorical features"""
    if ft not in set(encode_label + encode_int):
        return csr_matrix(df.as_matrix(columns=[ft]))

    label_enc = LabelEncoder()
    one_hot_enc = OneHotEncoder(sparse=True)

    if ft in encode_label:
        V = df[ft].as_matrix().T
        V_lab = label_enc.fit_transform(V).reshape(-1, 1)
        V_enc = one_hot_enc.fit_transform(V_lab)
        return V_enc

    if ft in encode_int:
        V = df[ft].as_matrix().reshape(-1, 1)
        V_enc = one_hot_enc.fit_transform(V)
        return V_enc
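A hedged sketch of how the encoder might be driven, assuming encode_label and encode_int are the module-level column lists the function refers to (hypothetical values here):

import pandas as pd
from scipy.sparse import hstack

encode_label = ['color']        # string categories: LabelEncoder then OneHotEncoder
encode_int = ['size_code']      # integer categories: OneHotEncoder directly

df = pd.DataFrame({'color': ['red', 'blue', 'red'],
                   'size_code': [1, 3, 2],
                   'price': [9.5, 12.0, 7.25]})

X = hstack([encode_features(df, ft) for ft in ['color', 'size_code', 'price']])
print(X.shape)  # (3, 6): 2 one-hot colours + 3 one-hot size codes + 1 raw numeric column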
Example #40
    def _train(self,
               model_dates: pd.Series,
               train_data: pd.DataFrame) -> ERModel:
        train_start_date = model_dates.trainStart
        train_end_date = model_dates.trainEnd

        time_line = train_data.index
        train_data = train_data.as_matrix()

        left = bisect.bisect_left(time_line, train_start_date)
        right = bisect.bisect_left(time_line, train_end_date)

        y = train_data[left:right, 0]
        x = train_data[left:right, 1:]

        model = ERModel()
        model.fit(x, y)
        return model
Example #41
def kmeans_scikit(data, k):
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    mat = points_list.as_matrix()
    print(mat)
    # Using sklearn
    km = sklearn.cluster.KMeans(n_clusters=k)
    km.fit(mat)
    # Get cluster assignment labels
    labels = km.labels_
    print(labels)
    print('==============')
    print(km.predict([[20, -15]]))
    # Format results as a DataFrame
    #results = pd.DataFrame([points_list.index,labels]).T
    points_list['labels'] = labels
    points_list.plot(kind='scatter', x=0, y=1, c='labels')
    plt.show()
    print(points_list)
Example #42
def __kalman(df: pd.DataFrame) -> pd.DataFrame:
    """
    :return: Kalman smooth these columns: ['masl', 'lat', 'lon', 'dmasl', 'dlat', 'dlon']
    """
    columns = ['masl', 'lat', 'lon', 'dmasl', 'dlat', 'dlon']
    trans_mat = np.array([
        [1, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
    ])
    obs_mat = np.array([
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
    ])
    kf = pykalman.KalmanFilter(
        transition_matrices=trans_mat,
        transition_covariance=1.0e-4 * np.eye(6),
        observation_matrices=obs_mat,
        observation_covariance=1.0e-1 * np.eye(6),
        initial_state_mean=[df.masl[0], df.lat[0], df.lon[0], df.dmasl[0], df.dlat[0], df.dlon[0]],
        initial_state_covariance=1.0e-3 * np.eye(6),
    )
    x = df.as_matrix(columns=columns)

    (state_means, state_covs) = kf.em(x, n_iter=6).smooth(x)

    df['k_masl'] = state_means[:, 0]
    df['k_lat'] = state_means[:, 1]
    df['k_lon'] = state_means[:, 2]
    df['k_dmasl'] = state_means[:, 3]
    df['k_dlat'] = state_means[:, 4]
    df['k_dlong'] = state_means[:, 5]
    return df
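A usage sketch with a hypothetical synthetic track; the frame needs the six columns listed above, so the first differences are added explicitly:

import numpy as np
import pandas as pd

n = 50
t = np.arange(n, dtype=float)
track = pd.DataFrame({'masl': 100 + t,
                      'lat': 59.0 + 1e-4 * t,
                      'lon': 10.0 + 1e-4 * t})
for col in ('masl', 'lat', 'lon'):
    track['d' + col] = track[col].diff().fillna(0.0)

smoothed = __kalman(track)  # adds the k_masl, k_lat, k_lon, ... smoothed columns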
Example #43
 def cluster_comments(self):
     """
     Clusters comments based on their timestamps and
     assigns cluster-membership as attribute to nodes.
     """
     the_nodes = self.graph.nodes()
     if len(the_nodes) < 7:
         logging.warning(
             "Skipped clustering for %s, only %i comments",
             self.post_title,
             len(the_nodes))
         for node in the_nodes:
             self.graph.node[node]['cluster_id'] = None
     else:
         com_ids, stamps = zip(
             *((node, data["com_timestamp"])
               for node, data in self.graph.nodes_iter(data=True)))
         data = DataFrame(
             {'timestamps': stamps}, index=com_ids).sort_values(
                 by='timestamps')
         epoch = data.ix[0, 'timestamps']
         data['timestamps'] = data['timestamps'].apply(
             lambda timestamp: (timestamp - epoch).total_seconds())
         # TODO: identify outliers (or sparse end of stamps) and set
         # cluster-id to None
         # (find sparse "end" by using the time-stamps as index
         # and ones as data,
         # pd.rolling_sum() for 1 day-window and create mask based on < 2)
         # then create cluster_data for data[data["cluster_id"] != None]
         cluster_data = data.as_matrix()
         # TODO: need more refined way to set quantile
          # more concise, but see if the check for 0 bandwidth is still needed
         for quantile in [.05, .3, 1, False]:
             if quantile:
                 try:
                     bandwidth = estimate_bandwidth(
                         cluster_data, quantile=quantile)
                 except ValueError:
                     logging.info(
                         "Estimation with quantile %f failed", quantile)
                 else:
                     break
             else:
                 logging.warning("Could not cluster %s", self.post_title)
                 sys.exit(1)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             try:
                 mshift = MeanShift(bandwidth=bandwidth, bin_seeding=False)
                 mshift.fit(cluster_data)
             except ValueError:
                 mshift = MeanShift(bandwidth=0.5, bin_seeding=False)
                 mshift.fit(cluster_data)
             labels = mshift.labels_
             unique_labels = np.sort(np.unique(labels))
             logging.info("Found %i clusters in %s",
                          len(unique_labels), self.post_title)
         try:
             assert len(labels) == len(cluster_data)
             # assert (unique_labels == np.arange(len(unique_labels))).all()
         except AssertionError as err:
             logging.warning("Mismatch cluster-labels: %s", err)
             print(unique_labels)
             print(labels)
         data['cluster_id'] = labels
         for com_id in data.index:
             self.graph.node[com_id]['cluster_id'] = data.ix[
                 com_id, 'cluster_id']
Example #44
"""
Main
"""
data = sensorData.join(campusData, how = 'inner', rsuffix = 'Time_Stamp')
# Change attributes order
data = DataFrame(data, columns = ['Temperature', 'Humidity', 'WindSpeed', 
	'SolarRadiation', 'Temp1', 'Hum1', 'Temp2', 'Hum2'])
# Drops any Nan Values after removal of outliers (Precautionary)
data = data.dropna(axis = 0, how = 'any', 
						thresh = None, subset = None, inplace = False)

trainData = data.as_matrix(columns = ['Temperature', 'Humidity', 
										'WindSpeed', 'SolarRadiation'])
resultData = data.as_matrix(columns = ['Temp1'])
maxValue = max(resultData)
minValue = min(resultData)
resultData = (resultData - minValue) / (maxValue - minValue)
(w0To1, w1To2) = learnProcess(trainData, resultData)
Example #45
DoubleDensity1 = APM.randint(8,20, 2*n1)-0.5
DoubleDensity1[n1:] = APM.randint( 8,15,n1 )
DDy = [ APM.randint(0,9)+ APM.rand() for i in range(2*n1)]

Class0 = array( [ APM.randint(4,13, n0),    randint(4,9,n0)-0.5, [0 for i in range(n0) ] ] )
Class1 = array( [ DoubleDensity1, DDy, [1 for i in range(2*n1) ] ] )
ClassU = array( [ APM.randint(8,12,nu), APM.randint(3,6,nu) + APM.randint(0,5,nu)*0.2 , [nan for i in range(nu) ]] )

Exdat = zeros( (3,n0+2*n1+nu)  )

Exdat[:,:n0] = Class0
Exdat[:,n0:(n0+2*n1)] = Class1
Exdat[:,(n0+2*n1):] = ClassU

Example3 = DataFrame( Exdat.T, columns = ['x','y','class'] )
Example3Data = Example3.as_matrix()

n0 = 50
n1 = 200
nu =  5

Class0 = array( [ [ (APM.rand()+ 7)*cos(2*pi*i/n0) for i in range(n0)], [ (APM.rand()+ 7)*sin(2*pi*i/n0) for i in range(n0)], [0 for i in range(n0) ] ] )
Class1 = array( [ [ (4*APM.rand()+ 2)*cos(2*pi*i/n1) for i in range(n1)], [ (4*APM.rand()+ 2)*sin(2*pi*i/n1) for i in range(n1)], [1 for i in range(n1) ] ] )
ClassU = array( [ [ (6.1+i/nu)*cos(2*pi*i/nu) for i in range(nu)], [ (6.1+i/nu)*sin(2*pi*i/nu) for i in range(nu)], [nan for i in range(nu) ]] )

Exdat = zeros( (3,n0+n1+nu)  )

Exdat[:,:n0] = Class0
Exdat[:,n0:(n0+n1)] = Class1
Exdat[:,(n0+n1):] = ClassU
Example #46
from sklearn.naive_bayes import MultinomialNB
#from sklearn.naive_bayes import MutinomialNB
data=DataFrame({'petal1':[],'petal2':[],'sepal1':[],'sepal2':[]})
tar=DataFrame({'target':[]})

md=genfromtxt('/home/shubh/Downloads/iris.data',delimiter=',',dtype=None)
i=0
for row in md:
    d={'petal1':[md[i][0]],'petal2':[md[i][1]],'sepal1':[md[i][2]],'sepal2':[md[i][3]]}
    t={'target':[md[i][4]]}
    d2=DataFrame(d)
    t2=DataFrame(t)
    data=data.append(d2)
    tar=tar.append(t2)
    i=i+1

#print data.head()
target=tar['target'].values
feed=data.as_matrix(['petal1','petal2','sepal1','sepal2'])
#print feed

cls=MultinomialNB()
cls.fit(feed,target)
a=[4.8,3.0,1.4,0.3]
ts=cls.predict(a)
print ts



#print target
Example #47
        if int(info[0]) == item_id:
            f.close()
            return info[1]
        line = f.readline()
    f.close()
    return None

if __name__ == '__main__':
    # load the data
    original_data = np.loadtxt("u.data", delimiter="\t")
    original_users = original_data[:,0]
    original_items = original_data[:,1]
    score = original_data[:,2]
    # remove duplicates from the lists of user ids and movie ids
    users = np.unique(original_users)
    items = np.unique(original_items)
    # create a user x item DataFrame
    # initialize every element to 0
    df = DataFrame(np.zeros((len(users), len(items))), index=users, columns=items)
    # fill in the ratings
    for i in range(len(original_users)):
        df.ix[original_users[i], original_items[i]] = score[i]
    data = df.as_matrix()
    MF = MatrixFactorization()
    MF.fit(data)
    index = convert_user_id_to_index(users, 2)  # get the index corresponding to the user id
    item_index, rate = MF.predict(data, index)  # get the index of the item to recommend for that user index
    item_id = convert_index_to_item_id(items, item_index)   # get the item id corresponding to the item index
    print(get_item_name(item_id), rate) # print the name of the recommended item
    print("error=%d"%MF.get_result_error(data)) # print the squared error between R and U^T V
Example #48
obj2 = Series([4, 5, 6, 7], index=['d', 'b', 'a', 'c'])

print obj2

print "obj2.index:", obj2

print obj2['a']

print obj2['d']


'''
Retrieving column data
'''

'''
DataFrame.as_matrix(columns=None)
'''


import pandas as pd
data1 = pd.DataFrame(...)  # initialize an arbitrary DataFrame with 3 columns
data1.columns = ['a', 'b', 'c']

# 1.
data1['b']
# this retrieves the values of the second column (column b)

# 2.
data1.b
# same effect as 1: retrieves the second column (column b)
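A short sketch tying the two access styles above back to as_matrix (deprecated since pandas 0.23); the modern equivalent uses .values or .to_numpy():

import pandas as pd

data1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b', 'c'])
col_b_old = data1.as_matrix(columns=['b'])   # 2D array of column b on older pandas
col_b_new = data1[['b']].values              # same result on current pandas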
Example #49
ndata = DataFrame()
for col in data.columns:
    encoder.fit(uniques[col])
    ndata[col] = encoder.transform(data[col])

# Simple Exploratory Analysis
# Bar plots
for col in data.columns:
    agg = data.groupby([col, 'class']).count()
    sns.set_style("whitegrid")
    ax = sns.barplot(x=agg.ix[:, [0]].index.values, y=agg.ix[:, [0]].values.T[0])
    plt.title("Distribution of " + col)
    plt.show()

# PCA
pca_cal(standardize_dataset(ndata.as_matrix()), data['class'], data.columns, title="PCA with normalization")

# Separate the dataset into train and test sets
kf = KFold(n=len(ndata), n_folds=10, shuffle=True)
train, test = kf.get_indices()
s = Score()

total_train_error = []
total_test_error = []

for k in range(1, 200, 3):
    train_error = []
    test_error = []
    for i in range(10):
        print "round %d %d" % (k, i)
        train_data = ndata.ix[train[i]]
Example #50
data['min'] = D.min(axis=1)
data['max'] = D.max(axis=1)
data['median'] = D.median(axis=1)
data['sum'] = D.sum(axis=1)
data['skewness'] = D.skew(axis=1)
#data['mode'] = D.astype(int).mode(axis=1)
data['mad'] = D.mad(axis=1)
data['var'] = D.var(axis=1)
data['sem'] = D.sem(axis=1)
data['kurt'] = D.kurt(axis=1)
data['quantile_25'] = D.quantile(0.25,axis=1)
data['quantile_50'] = D.quantile(0.50,axis=1)
data['quantile_75'] = D.quantile(0.75,axis=1)

#data = data/data.max(axis=0) #normalize wrt max
df = data.as_matrix()

# Compute the pairwise L1 (Manhattan) distance matrix between rows.
dist = np.zeros([df.shape[0],df.shape[0]])
for i in range(df.shape[0]):
    for j in range(df.shape[0]):
        dist[i,j] = np.sum(np.abs(df[i]-df[j]))

# Compute and plot first dendrogram.
fig = plt.figure(figsize=(20,20))
ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
Y = sch.linkage(dist, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])