import networkx as nx
import numpy as np
from pandas import DataFrame


def calc_distance_matrix(G, max_distance=None):
    """Returns a matrix containing the shortest distance
    between all nodes in a network

    Parameters
    ----------
    G : graph
       A NetworkX graph

    max_distance : float or None, optional (default=None)
       The maximum possible distance value in the network.
       If None, max_distance is the longest shortest path between
       two nodes of the network (the graph diameter)

    Returns
    -------
    dist_matrix : pandas.DataFrame
      An NxN DataFrame of shortest-path distances.

    Notes
    -----
    Along the diagonal, the values are all 0.
    Unconnected nodes have a distance of max_distance to other nodes.
    """

    # Network (collaborator) distance; all_pairs_shortest_path_length returns
    # an iterator of (node, distance_dict) pairs in NetworkX 2.x, so wrap it in dict()
    dist_matrix = dict(nx.all_pairs_shortest_path_length(G))
    nodes = list(G.nodes())
    dist_matrix = DataFrame(dist_matrix, index=nodes, columns=nodes)
    if max_distance is None:
        max_distance = float(dist_matrix.max().max())
    # The unconnected ones are infinitely far from the rest; fill them with max_distance
    dist_matrix = dist_matrix.fillna(max_distance)
    diag_idx = np.diag_indices(len(dist_matrix), ndim=2)
    dist_matrix.values[diag_idx] = 0
    return dist_matrix
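
# A minimal usage sketch (not from the original source): a small path graph plus
# one isolated node, to show the diagonal and the max_distance fill for unreachable pairs.
import networkx as nx

G = nx.path_graph(4)   # nodes 0-1-2-3
G.add_node(99)         # isolated node, unreachable from the rest

dist = calc_distance_matrix(G)
print(dist)
# Diagonal entries are 0; distances involving node 99 are filled with the
# largest finite shortest-path length (here 3, the graph diameter).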
def clean(numpy_array):  # load your csv data here in numpy_array
    data = ut.preprocessData(numpy_array)

    # numpy array into pandas DataFrame
    df = pd.DataFrame(data)
    df = df.astype('float16')

    # generate preprocessed csv file
    # df.to_csv('preprocessed_data.csv', sep=',', index=False)

    # normalize data to [0, 1] using X_norm = (X - Xmin) / (Xmax - Xmin)
    df_norm = (df - df.min()) / (df.max() - df.min())
    # columns with zero range divide by zero and become NaN; flag them with -1
    df_norm = df_norm.fillna(-1)

    # generate normalized csv
    # df_norm.to_csv('normalized_data.csv', sep=',', index=False)

    # .as_matrix() was removed from pandas; use .to_numpy() instead
    return df_norm.to_numpy()
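
# A small illustration (made-up data, ut.preprocessData skipped) of why the
# fillna(-1) step above matters: a constant column has max - min == 0.
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [5.0, 5.0, 5.0]})
df_norm = (df - df.min()) / (df.max() - df.min())
print(df_norm['b'])        # all NaN because of the division by zero
print(df_norm.fillna(-1))  # constant columns are flagged with -1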
from pandas import DataFrame


class LogAggregate:
    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def get_median(self, *args, **kwargs):
        # dict.has_key() no longer exists in Python 3; use the `in` operator
        if 'group_by' in kwargs:
            return self.dataset.groupby(kwargs['group_by']).median()[kwargs['key']]
        else:
            return self.dataset.median()[kwargs['key']]

    def get_average(self, *args, **kwargs):
        if 'group_by' in kwargs:
            return self.dataset.groupby(kwargs['group_by']).mean()[kwargs['key']]
        else:
            return self.dataset.mean()[kwargs['key']]

    def get_min(self, *args, **kwargs):
        if 'group_by' in kwargs:
            return self.dataset.groupby(kwargs['group_by']).min()[kwargs['key']]
        else:
            return self.dataset.min()[kwargs['key']]

    def get_max(self, *args, **kwargs):
        if 'group_by' in kwargs:
            return self.dataset.groupby(kwargs['group_by']).max()[kwargs['key']]
        else:
            return self.dataset.max()[kwargs['key']]

    def get_count(self, *args, **kwargs):
        if 'group_by' in kwargs:
            return self.dataset.groupby(kwargs['group_by']).count()[kwargs['key']]
        else:
            return self.dataset.count()[kwargs['key']]
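
# A usage sketch (hypothetical column names, not from the original source):
logs = LogAggregate({
    'service': ['api', 'api', 'web', 'web'],
    'latency_ms': [120, 80, 200, 160],
})
print(logs.get_count(key='latency_ms'))                        # non-null count
print(logs.get_average(key='latency_ms', group_by='service'))  # per-service mean
print(logs.get_max(key='latency_ms', group_by='service'))      # per-service max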
import numpy as np
from pandas import DataFrame
from sklearn import metrics


def compute_confusion_matrix(target, predicted, normalize=True, sort=True):
    """ returns a confusion matrix as a data frame with labels
    Parameters:
        target (array): The true (target) values.
        predicted (array): The predicted values.
        normalize (bool): If True, normalize each row to sum to 1.
        sort (bool): If True, sort rows and columns by their maximum value.
    Returns (DataFrame): df with the confusion matrix.
    """

    # Determine the unique values in the target list, sort them and assign as labels.
    labels = np.unique(list(target))
    labels.sort()

    # Compute the confusion matrix, place into a data frame and normalize if desired.
    confusion = metrics.confusion_matrix(target, predicted, labels=labels)
    confusion = DataFrame(confusion, index=labels, columns=labels)
    if normalize:
        confusion = confusion.apply(lambda x: x / np.sum(x), axis=1)

    # If sort is True: find the max value in each row, then reorder the confusion matrix.
    if sort:
        # Get the max values, order them, and use that order on both axes.
        max_values = confusion.max(axis=1)
        max_values = max_values.sort_values(ascending=False)
        order = max_values.index
        confusion = confusion.loc[order, order]
    return confusion
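
# A usage sketch with a tiny, made-up label set (not from the original source):
y_true = ['cat', 'dog', 'dog', 'cat', 'bird']
y_pred = ['cat', 'dog', 'cat', 'cat', 'bird']
cm = compute_confusion_matrix(y_true, y_pred, normalize=True, sort=True)
print(cm)  # rows are true labels, columns are predicted labels, each row sums to 1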
Example #5
    def __generate_trace(self, objectives: DataFrame, metadata: list = None, legend: str = '', normalize: bool = False,
                         **kwargs):
        number_of_objectives = objectives.shape[1]

        if normalize:
            objectives = (objectives - objectives.min()) / (objectives.max() - objectives.min())

        marker = dict(
            color='rgb(127, 127, 127)',
            size=3,
            symbol='x',
            line=dict(
                color='rgb(204, 204, 204)',
                width=1
            ),
            opacity=0.8
        )
        marker.update(**kwargs)

        if number_of_objectives == 2:
            trace = go.Scattergl(
                x=objectives[0],
                y=objectives[1],
                mode='markers',
                marker=marker,
                name=legend,
                customdata=metadata
            )
        elif number_of_objectives == 3:
            trace = go.Scatter3d(
                x=objectives[0],
                y=objectives[1],
                z=objectives[2],
                mode='markers',
                marker=marker,
                name=legend,
                customdata=metadata
            )
        else:
            dimensions = list()
            for column in objectives:
                dimensions.append(
                    dict(range=[0, 1],
                         label=self.axis_labels[column:column+1][0] if self.axis_labels[column:column+1] else None,
                         values=objectives[column])
                )

            trace = go.Parcoords(
                line=dict(color='blue'),
                dimensions=dimensions,
                name=legend,
            )

        return trace
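
# Standalone sketch of the fallback branch above (more than three objectives):
# normalize the columns and build a parallel-coordinates trace. The data and the
# 'f{}' labels are hypothetical; plotly is assumed to be installed.
import pandas as pd
import plotly.graph_objs as go

objectives = pd.DataFrame([[1.0, 3.0, 0.5, 10.0],
                           [2.0, 1.0, 0.8, 5.0],
                           [3.0, 2.0, 0.1, 7.5]])
objectives = (objectives - objectives.min()) / (objectives.max() - objectives.min())

dimensions = [dict(range=[0, 1], label='f{}'.format(i), values=objectives[i])
              for i in objectives]
trace = go.Parcoords(line=dict(color='blue'), dimensions=dimensions)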
Example #6
    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
                         index=date_range('1/1/2000', periods=1000))

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        s = Series([1])
        result = s.item()
        assert result == 1
        assert s.item() == s.iloc[0]

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype='float64')
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order='F'), s.values.ravel(order='F'))

        # compress
        # GH 6658
        s = Series([0, 1., -1], index=list('abc'))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=['b']))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Index(dtype=object) as the same as original
        exp = Series([], dtype='float64', index=Index([], dtype='object'))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1., -1], index=[.1, .2, .3])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=[.2]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Float64Index as the same as original
        exp = Series([], dtype='float64', index=Index([], dtype='float64'))
        tm.assert_series_equal(result, exp)
def _to_labels(probabilities: pd.DataFrame) -> pd.Series:
    labels = probabilities.idxmax(axis='columns')

    # Find places where there are multiple maximum values
    max_probabilities = probabilities.max(axis='columns')
    is_max: pd.DataFrame = probabilities.eq(max_probabilities, axis='rows')
    number_of_max: pd.Series = is_max.sum(axis='columns')
    multiple_max: pd.Series = number_of_max.gt(1)
    # Set those locations as an 'undecided' label
    labels[multiple_max] = 'undecided'
    # TODO: emit a warning if any are set to 'undecided'

    return labels
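
# A usage sketch (made-up probabilities, not from the original source);
# the second row ties on its maximum, so it becomes 'undecided'.
import pandas as pd

probabilities = pd.DataFrame(
    {'cat': [0.7, 0.5, 0.2], 'dog': [0.2, 0.5, 0.3], 'bird': [0.1, 0.0, 0.5]}
)
print(_to_labels(probabilities))
# 0          cat
# 1    undecided
# 2         bird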
Example #8
    def test_fillna_dict_series(self):
        df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan],
                        'b': [1, 2, 3, np.nan, np.nan],
                        'c': [np.nan, 1, 2, 3, 4]})

        result = df.fillna({'a': 0, 'b': 5})

        expected = df.copy()
        expected['a'] = expected['a'].fillna(0)
        expected['b'] = expected['b'].fillna(5)
        assert_frame_equal(result, expected)

        # it works
        result = df.fillna({'a': 0, 'b': 5, 'd': 7})

        # Series treated same as dict
        result = df.fillna(df.max())
        expected = df.fillna(df.max().to_dict())
        assert_frame_equal(result, expected)

        # disable this for now
        with pytest.raises(NotImplementedError, match='column by column'):
            df.fillna(df.max(1), axis=1)
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7):
    
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype = float)

    for n in range(N):
        sample_tickers = list(random.choice(tickers, sample_size, replace = False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)

    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis = 1)
    result['Std'] = summary.std(axis = 1)
    result['Median'] = summary.median(axis = 1)
    result['Max'] = summary.max(axis = 1)
    result['Min'] = summary.min(axis = 1)

    return (result, summary)
    def _extract_wpa(self, document):

        # a set cannot be used directly as a DataFrame index, so convert to a list
        verbs = list(set(self._get_verbs(document, with_tags=False)))

        count_verbs = len(verbs)
        count_pa_words = len(PA_WORDS)

        wpa_similarity_frame = DataFrame(
            np.empty((count_verbs, count_pa_words)), index=verbs, columns=PA_WORDS
        )

        for verb in verbs:
            for pa_word in PA_WORDS:
                # WordNet synset names are 1-indexed, e.g. 'buy.v.01' (not '.v.0')
                synset_1 = Synset('{}.v.01'.format(Word(pa_word).lemmatize('v')))
                synset_2 = Synset('{}.v.01'.format(Word(verb).lemmatize('v')))
                # use .loc rather than chained indexing to set the cell
                wpa_similarity_frame.loc[verb, pa_word] = synset_2.wup_similarity(synset_1)

        wpa_max_columns = wpa_similarity_frame.max()

        return max(wpa_max_columns)
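
# Minimal sketch of the Wu-Palmer call this relies on (assumes NLTK with the
# 'wordnet' corpus downloaded; the example verbs are arbitrary):
from nltk.corpus import wordnet as wn

buy = wn.synset('buy.v.01')
order = wn.synset('order.v.01')
# Wu-Palmer similarity lies in (0, 1]; higher means closer in the WordNet hierarchy
print(buy.wup_similarity(order))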
Example #11
  def predict(self, prediction_data):
    preds = DataFrame(prediction_data)
    col_names = prediction_data.keys()
    tally_dict = {}
    # one tally column per distinct predicted level (class label)
    for level in unique(preds):
      tally_dict[level] = [0 for x in range(preds.shape[0])]
    for row in preds.iterrows():
      index, data = row
      for col_name, elem in zip(col_names, data):
        tally_dict[elem][index] += self.weights[col_name]
    tally_df = DataFrame(tally_dict)
    max_val = [int(round(x)) for x in tally_df.max(1).tolist()]
    max_level = []
    for row in tally_df.index:
      # .ix was removed from pandas; .loc works here because the index is the default RangeIndex
      int_vals = [int(round(x)) for x in tally_df.loc[row].tolist()]
      is_max = [x == max_val[row] for x in int_vals]
      if sum(is_max) > 1:
        max_level.append(None)
      else:
        max_level.append(tally_df.columns[is_max][0])
    return max_level
df_app_cat = df_app_cat.sort_values(by="avg")


# In[286]:

plt.plot(df_app_cat["avg"])


# In[287]:

plt.plot(df_app_cat["avg"], "bo", df_app_cat["avg"], "k")


# In[288]:

df_app_cat.max()


# In[289]:

t1["app_cat_high"] = 0
t2["app_cat_high"] = 0
test["app_cat_high"] = 0
# use .loc for the boolean assignment to avoid chained-assignment problems
t1.loc[t1["app_category"] == "fc6fa53d", "app_cat_high"] = 1
t2.loc[t2["app_category"] == "fc6fa53d", "app_cat_high"] = 1
test.loc[test["app_category"] == "fc6fa53d", "app_cat_high"] = 1


# In[292]:

validation_check2(feature_cols, ["app_cat_high"])
c    36
'''

print('lambda (anonymous functions) and how to apply them')
print(frame)
'''
   A  B  C
a  0  1  2
b  3  4  5
c  6  7  8
'''
print(frame.max())
'''
A    6
B    7
C    8
'''
f = lambda x: x.max() - x.min()
print(frame.apply(f))  # applied to each column
'''
A    6
B    6
C    6
'''
print(frame.apply(f, axis=1))  # applied to each row
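
# The frame used above is not constructed in this excerpt; this sketch rebuilds it
# from the printed output so the apply() calls can actually be run.
import numpy as np
from pandas import DataFrame

frame = DataFrame(np.arange(9).reshape(3, 3),
                  index=['a', 'b', 'c'], columns=['A', 'B', 'C'])
f = lambda x: x.max() - x.min()
print(frame.apply(f))          # per-column range: 6 for A, B and C
print(frame.apply(f, axis=1))  # per-row range: 2 for a, b and c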
dframe1 = DataFrame(arr, index=["A", "B"], columns=["One", "Two", "Three"])
dframe1

# Sum method
dframe1.sum()  # skips null (NaN) values
dframe1.sum(axis=1)  # sum across rows

# Min method
dframe1.min()  # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row

dframe1.idxmin()  # index label of the minimum value in each column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()  # accumulates along each column's values

# Describe method
dframe1.describe()  # summary statistics of the dataframe (by column)

# correlation and covariance
# pandas.io.data was removed from pandas; use the pandas-datareader package instead
import pandas_datareader.data as pdweb

import datetime

prices = pdweb.get_data_yahoo(
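
# The get_data_yahoo call above is cut off in this excerpt. A self-contained sketch
# of .corr() and .cov() on made-up prices (the tickers and numbers are hypothetical):
import numpy as np
from pandas import DataFrame

prices = DataFrame({'AAA': [10.0, 10.5, 10.2, 10.8],
                    'BBB': [20.0, 19.5, 19.8, 20.4]})
returns = prices.pct_change()
print(returns.corr())  # pairwise correlation of the return columns
print(returns.cov())   # pairwise covariance of the return columns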
Example #15
def pca(x, y=None, ylev=None,
        nlab=0, lsize=10, lalpha=1,
        center="both", scale="none",
        legend=True, cname="variable",
        color=None):
    if not isinstance(color, dict):
        color = None
    # .ix was removed from pandas; use .loc / .iloc below
    xForSvd = x.loc[:, x.std(axis=0) > 0]
    xsvd = svdForPca(xForSvd, center, scale)
    svdRowPlot = DataFrame(
        xsvd[0][:, 0:2],
        index = xForSvd.index,
        columns = ["PC1", "PC2"]
    )
    svdRowPlot = svdRowPlot.divide(svdRowPlot.max(axis=0) -
                                   svdRowPlot.min(axis=0), axis=1)
    svdColPlot = DataFrame(
        numpy.transpose(xsvd[2][0:2, :]),
        index = xForSvd.columns,
        columns = ["PC1", "PC2"]
    )
    svdColPlot = svdColPlot.divide(svdColPlot.max(axis=0) -
                                   svdColPlot.min(axis=0), axis=1)
    if nlab > 0:
        svdColPlotMag = (svdColPlot**2).sum(axis=1)
        svdColPlotMag = svdColPlotMag.sort_values(ascending=False)
        svdColPlot = svdColPlot.loc[svdColPlotMag.index]
        svdColPlot["label"] = ""
        # label the nlab variables with the largest loadings
        svdColPlot.iloc[0:nlab, svdColPlot.columns.get_loc("label")] = \
                svdColPlot.index[0:nlab]
    if legend:
        ax = plt.subplot(111)
    plt.plot(svdColPlot["PC1"], svdColPlot["PC2"],
             "o", color=(0, 0, 0, 0.1), markersize=5,
             label=cname)
    if nlab > 0:
        for i in range(nlab):
            plt.text(svdColPlot["PC1"].iloc[i],
                     svdColPlot["PC2"].iloc[i],
                     svdColPlot["label"].iloc[i],
                     fontsize = lsize,
                     color = (0, 0, 0, lalpha),
                     label = None)
    if y is not None:
        if ylev is None:
            ylev = y.unique()
        for level in ylev:
            # boolean mask over the rows of svdRowPlot (y is assumed to align with x)
            mask = (y == level).values
            if color is not None and level in color:
                plt.plot(svdRowPlot.loc[mask, "PC1"],
                         svdRowPlot.loc[mask, "PC2"],
                         "o",
                         markersize = 8,
                         label = level,
                         color = color[level])
            else:
                plt.plot(svdRowPlot.loc[mask, "PC1"],
                         svdRowPlot.loc[mask, "PC2"],
                         "o",
                         markersize = 8,
                         label = level)
    else:
        plt.plot(svdRowPlot["PC1"], svdRowPlot["PC2"],
                 "o", markersize=8)
    if legend:
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width*0.8, box.height])
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5), numpoints=1)
    plt.show()
Example #16
# after preparing the data, time to plot it:

for new_counter in range(file_counter + 1):
    Qbers = final_data[(final_data["Dataset"] == new_counter) & (final_data["Qber"] > 0)]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = Qbers["Qber"].mean()
    x1_std_dev = Qbers["Qber"].std()
    # preparing proper time:
    x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1]

    Raws = final_data[(final_data["Dataset"] == new_counter) & (final_data["Raw key"] > 0)]
    x2_average = Raws["Raw key"].mean()
    x2_median = Raws["Raw key"].median()
    x2_max = Raws["Raw key"].max()

    # drop points more than 20% below the maximum raw key rate
    Raws = Raws[Raws["Raw key"] < (x2_max - (x2_max / 100) * 20)]

    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()

    print(x2_average)
    # once again correcting the counter:
    x2[:] = [x - quelle_initialTimestamps[new_counter] for x in x2]
    # Two subplots, the axes array is 1-d http://matplotlib.org/examples/pylab_examples/subplots_demo.html
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].grid()
    axarr[0].plot(x1, y1)
Example #17
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

data['food'].map(lambda x: meat_to_animal[x.lower()])

# Data normalization
datafile = 'd:/data/normalization_data.xls'  # parameter initialization
data = pd.read_excel(datafile, header=None)  # read the data

(data - data.min()) / (data.max() - data.min())  # min-max normalization
(data - data.mean()) / data.std()  # zero-mean (z-score) normalization
data / 10**np.ceil(np.log10(data.abs().max()))  # decimal scaling normalization


### Replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
data

data.replace(-999, np.nan)

data.replace([-999, -1000], np.nan)

data.replace([-999, -1000], [np.nan, 0])

data.replace({-999: np.nan, -1000: 0})
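
# normalization_data.xls is not included here; a self-contained sketch of the same
# three normalizations on made-up numbers (column names and values are hypothetical):
import numpy as np
import pandas as pd

data = pd.DataFrame({0: [78, 144, 95, 69], 1: [521, -600, -457, 596]})

print((data - data.min()) / (data.max() - data.min()))    # min-max: values in [0, 1]
print((data - data.mean()) / data.std())                  # z-score: mean 0, std 1
print(data / 10 ** np.ceil(np.log10(data.abs().max())))   # decimal scaling: |values| < 1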
    def _extract_ado_loc_org(self, document):
        """
            PA word has dependent object of PO category (ADO)

            In a PI post, a purchase action is targeted towards a consumable object.
            This is reflected in the dependency structure of the text.

            In a PI post, the consumable object is usually the directly
            dependent object of the purchase action verb.

            If there is a PA word in the text and it has a dependent object belonging to a PO category, ADO = 1, otherwise ADO = 0

        """

        # 1. Identify if there is a PA word (or a very similar one).
        # 2. Identify if this PA word has an object.
        # 3. Identify if this object belongs to the PO category.
        # 4. If the 3 statements above are true, return ADO = 1, else ADO = 0.

        s = pattern.en.parsetree(document, relations=True, lemmata=True)

        ADO = 0
        LOC = 0
        ORG = 0

        # Extract VERBs
        # Find out if they are ACTION VERBs or not
        # For each found, find if it has an object which is its direct dependent
        # For each object found, find if it belongs to a PO category
        # For each object found, find its NER tag (LOC, ORG)

        for sentence in s:

            for chunk in sentence.chunks:

                if chunk.type == 'VP':

                    print('Chunk    : ', chunk)
                    print('Subject  : ', chunk.subject)
                    print('Object   : ', chunk.object)
                    print('String   : ', chunk.string)
                    print('Tagged   : ', chunk.tagged)
                    print('Role     : ', chunk.role)
                    print('Relation : ', chunk.relation)
                    print('Related  : ', chunk.related)

                    # Does it have an object?
                    if chunk.object is not None:

                        # Get the verbs!  (list() is needed in Python 3, where filter() is lazy)
                        verbs_and_tags = list(filter(lambda x: x[1] in VERB_TAGS, chunk.tagged))
                        print('Verbs    : ', verbs_and_tags)
                        verbs = [verb[0] for verb in verbs_and_tags]

                        # Are they PA words?
                        count_verbs = len(verbs)
                        count_pa_words = len(PA_WORDS)

                        wpa_similarity_frame = DataFrame(
                            np.empty((count_verbs, count_pa_words)), index=verbs, columns=PA_WORDS
                        )

                        for verb in verbs:
                            for pa_word in PA_WORDS:
                                # WordNet synset names are 1-indexed, e.g. 'buy.v.01'
                                synset_1 = Synset('{}.v.01'.format(Word(pa_word).lemmatize('v')))
                                synset_2 = Synset('{}.v.01'.format(Word(verb).lemmatize('v')))
                                wpa_similarity_frame.loc[verb, pa_word] = synset_2.wup_similarity(synset_1)

                        wpa_max_columns = wpa_similarity_frame.max()

                        wpa = max(wpa_max_columns)

                        if wpa >= 0.7:
                            # Get the nouns from the object

                            head_noun = chunk.object.head

                            # Do they belong to a PO category?

                            if head_noun:  # check if head belongs to PO category
                                print('Head    : ', head_noun)

                                ADO = 1  # FIXME: implement this properly by determining whether head_noun belongs to a PO category

                                # IMPORTANT:
                                # Try and compile your own list of Consumable and Non-Consumable Categories
                                # as well as the words that belong to them.
                                # Freebase isn't available and Google Knowledge base seems not applicable.

                                print('Next PP   : ', chunk.object.next('PP'))
                                if chunk.object.next('PP') is not None:
                                    print('Next NP   : ', chunk.object.next('PP').next('NP'))

                                    word = chunk.object.next('PP').next('NP').head

                                    ner_tagged = stanford_tagger.tag([word.string.title()])
                                    print('NER      : ', ner_tagged)
                                    print()

                                    loc_matches = list(filter(lambda w: w[1] in NER_LOC_TAGS, ner_tagged))
                                    print('NER_LOC_TAGS:   ', loc_matches)
                                    if len(loc_matches) > 0:
                                        LOC = 1
                                    else:
                                        LOC = 0

                                    org_matches = list(filter(lambda w: w[1] in NER_ORG_TAGS, ner_tagged))
                                    print('NER_ORG_TAGS:   ', org_matches)
                                    if len(org_matches) > 0:
                                        ORG = 1
                                    else:
                                        ORG = 0

                                return {'ADO': ADO, 'ORG': ORG, 'LOC': LOC}

                    print()

        print()

        return {'ADO': ADO, 'ORG': ORG, 'LOC': LOC}