def cell_type_corr(self):
    """Compute the correlation matrix between cell types."""
    print("The correlation matrix for cell types is:\n")
    # Stack the 0 h and 24 h measurements for each cell type into one
    # column; pd.concat replaces the removed Series.append.
    celltype = DataFrame(
        {
            "HL60": pd.concat([self.data["HL60_0_hrs"], self.data["HL60_24_hrs"]]),
            "U937": pd.concat([self.data["U937_0_hrs"], self.data["U937_24_hrs"]]),
            "Jurkat": pd.concat([self.data["Jurkat_0_hrs"], self.data["Jurkat_24_hrs"]]),
            "NB4": pd.concat([self.data["NB4_0_hrs"], self.data["NB4_24_hrs"]]),
        }
    )
    print(celltype.corr())
    print("\n")
def main():
    data = utils.read_data_from_csv("data/winequality-red.csv")

    for attribute in data[0].keys():
        for name, func in BIN_FUNCTIONS.items():  # iteritems is Python 2 only
            plot_histogram(data, attribute, func, name)

    data_frame = DataFrame(data)
    plot_scatter_matrix(data_frame)
    plot_parallel_coordinates(data_frame)

    plot_pca_projection(data)
    plot_pca_projection(data, normalized=True)

    plot_mds(data)

    data_frame.corr(method="pearson").to_csv("build/pearson.csv")
    data_frame.corr(method="kendall").to_csv("build/kendall.csv")
Example #3
print(df5)

import math


def int_float_squares(series):
    return pd.Series({"int_sq": series["int_col"] ** 2, "flt_sq": series["float_col"] ** 2})


print(df.apply(int_float_squares, axis=1))

### 7. Basic Stats ###

print(df.describe())
print(df.cov())
print(df.corr())

### 8. Merge and Join ###

print(df)
other = DataFrame({"str_col": ["a", "b"], "some_val": [1, 2]})
print(other)
print(pd.merge(df, other, on="str_col", how="inner"))
print(pd.merge(df, other, on="str_col", how="outer"))
print(pd.merge(df, other, on="str_col", how="left"))
print(pd.merge(df, other, on="str_col", how="right"))

### 9. Plot ###

plot_df = DataFrame(np.random.randn(1000, 2), columns=["x", "y"])
plot_df["y"] = plot_df["y"].map(lambda x: x + 1)
def preprocess(directory, n_entries):

    hdf_path = directory.get_path("logs.h5", temp=False)
    print("hdf_path: %s" % hdf_path)

    store = HDFStore(hdf_path)
    print("Keys: %s" % store.keys())
    print(store)
    store.close()
    df = pd.read_hdf(hdf_path, "logs")

    # df = directory.load('logs.h5')
    print("df: %s" % df)

    if n_entries >= 0:
        df = df[:n_entries]

    secs = (df.index.max() - df.index.min()).total_seconds()
    hours = secs / 3600
    levels = df.level.unique()

    print "%.1f hours of logs" % hours

    print "%d log entries/hour" % int(len(df) / hours)
    print "%.1f thousand log entries/hour" % (int(len(df) / hours) / 1000.0)
    print df.shape, df.columns
    for level in levels:
        print "%-5s : %5d" % (level, len(df[df.level == level]))
    print "df : %s" % str(df.shape)

    if False:

        def get_peak(counts):
            """Return the index of the peak value in Series counts."""
            if len(counts) == 0:
                return None
            return counts.idxmax()  # indmax was a typo; idxmax returns the index of the max

    start_time, end_time = df.index.min(), df.index.max()
    print "orginal: start_time, end_time = %s, %s" % (start_time, end_time)

    # Start time and end time trunctated to whole minutes
    start_time = truncate_to_minutes(start_time + timedelta(minutes=2))
    end_time = truncate_to_minutes(end_time - timedelta(minutes=2))
    print "cleaned: start_time, end_time = %s, %s" % (start_time, end_time)

    details = get_details(df)
    directory.save("details", details)

    # The counts for each 1 minute bin
    minute_counts = get_minute_counts(df, start_time, end_time)
    print "minute_counts: %s\n%s" % (type(minute_counts), minute_counts.describe())
    print "total entries: %s" % minute_counts.sum()

    level_counts = {level: get_minute_counts(df[df.level == level], start_time, end_time) for level in levels}

    # level_peaks = {level: get_peak(level_counts[level])  for level in levels}
    # print 'level_peaks: %s' % level_peaks

    if False:
        unique_files = df.file.unique()
        print "%d source files" % len(unique_files)
        for i, fl in enumerate(sorted(unique_files)[:5]):
            print "%3d: %s" % (i, fl)

        directory.save("unique_files", unique_files)

    #
    # Get all the unique log messages
    #
    level_file_line = df.groupby(["level", "file", "line"])
    lfl_size = level_file_line.size()
    lfl_sorted = lfl_size.sort_values(ascending=False)  # Series.order was removed; sort_values replaces it
    print("lfl_sorted: %s" % str(lfl_sorted.shape))

    # directory.save('level_file_line', tuple(level_file_line))
    directory.save("lfl_sorted", lfl_sorted)

    # file:line uniquely identifies each level,file,line
    # Construct mappings in both directions
    lfl_to_string = OrderedDict(((lvl, fl, ln), "%s:%d" % (fl, ln)) for lvl, fl, ln in lfl_sorted.index)
    string_to_lfl = OrderedDict(("%s:%d" % (fl, ln), (lvl, fl, ln)) for lvl, fl, ln in lfl_sorted.index)
    print "string_to_lfl: %s" % len(string_to_lfl)

    # [((level,file,line),count)] sorted by count in descending order
    entry_types_list = list(zip(lfl_sorted.index, lfl_sorted))  # list() because zip is lazy in Python 3

    # {(level,file,line) : count}
    entry_types = OrderedDict(entry_types_list)
    directory.save("entry_types", entry_types)
    print "entry_types: %s" % len(entry_types)

    #
    # Build the correlation table
    #
    threshold = min(100, len(df) // 1000)
    lfl_freq_dict = {
        s: get_minute_counts(df[(df.file == fl) & (df.line == ln)], start_time, end_time)
        for s, (lvl, fl, ln) in string_to_lfl.items()
        if len(df[(df.file == fl) & (df.line == ln)]) >= threshold
    }
    print "++++"
    lfl_freq = DataFrame(lfl_freq_dict, columns=string_to_lfl.keys())
    directory.save("lfl_freq", lfl_freq)

    lfl_freq_corr = lfl_freq.corr()
    directory.save("lfl_freq_corr", lfl_freq_corr)
    print "lfl_freq_corr: %s" % str(lfl_freq_corr.shape)
Example #5
    favcorrs = []
    unfavcorrs = []
    diffcorrs = []
    approvcorrs = []
    disappcorrs = []
    appdiffcorrs = []
    ovotecorrs = []
    rvotecorrs = []
    votediffcorrs = []
    lags = []

    for x in range(-90, 90):  # xrange is Python 2 only
        data["lag"] = data.ma_sentiment.shift(x)
        lags.append(x)
        corrs = data.corr()["lag"]  # compute the correlation matrix once per lag
        favcorrs.append(corrs["favorable"])
        diffcorrs.append(corrs["difference"])
        unfavcorrs.append(corrs["unfavorable"])

    favcorrs = Series(favcorrs, index=lags)
    unfavcorrs = Series(unfavcorrs, index=lags)
    diffcorrs = Series(diffcorrs, index=lags)

    lagged_corrs = DataFrame({"favorable": favcorrs, "unfavorable": unfavcorrs, "difference": diffcorrs})
    lagged_corrs.to_csv(os.path.join("data", "lexicon_lagged_corrs" + str(k) + ".csv"), sep="\t")
    lglist.append(lagged_corrs)

sm7_laggedcorrs = lglist[0]
sm15_laggedcorrs = lglist[1]
sm30_laggedcorrs = lglist[2]

y = data.ma_sent.dropna()
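A small follow-on sketch (assumed, not in the original): the lag at which a series correlates most strongly is the idxmax of the corresponding column.

best_lag = sm7_laggedcorrs["favorable"].idxmax()
print("peak favorable correlation at lag %d" % best_lag)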
Example #6
def fill_per_peptide_correlations(protein_records):
    per_peptide_correlation_parameter_labels = [
        "{0} per peptide correlation (Pearson)".format(name) for name in per_peptide_correlation_parameter_names
    ]

    total_received_peptides_number = 0
    total_missed_peptides_number = 0
    for protein_record in protein_records:
        total_received_peptides_number += len(protein_record.received_peptide_records)
        total_missed_peptides_number += len(protein_record.missed_peptide_records)

    total_received_pairs_number = total_received_peptides_number * (total_received_peptides_number - 1) // 2
    received_per_peptide_correlations = DataFrame(
        zeros((total_received_pairs_number, len(per_peptide_correlation_parameter_labels)), dtype=float64),
        columns=per_peptide_correlation_parameter_labels,
    )
    total_missed_pairs_number = total_missed_peptides_number * (total_missed_peptides_number - 1) // 2
    missed_per_peptide_correlations = DataFrame(
        zeros((total_missed_pairs_number, len(per_peptide_correlation_parameter_labels)), dtype=float64),
        columns=per_peptide_correlation_parameter_labels,
    )

    received_kidera_factors = DataFrame(
        zeros((len(kidera_factor_names), total_received_peptides_number), dtype=float64)
    )
    missed_kidera_factors = DataFrame(zeros((len(kidera_factor_names), total_missed_peptides_number), dtype=float64))

    received_acid_percents = DataFrame(
        zeros((len("AGVMDYNSWLFIKPQCERTH"), total_received_peptides_number), dtype=float64)
    )
    missed_acid_percents = DataFrame(zeros((len("AGVMDYNSWLFIKPQCERTH"), total_missed_peptides_number), dtype=float64))

    received_acid_compounds = DataFrame(
        zeros((len(amino_acid_group_names), total_received_peptides_number), dtype=float64)
    )
    missed_acid_compounds = DataFrame(zeros((len(amino_acid_group_names), total_missed_peptides_number), dtype=float64))

    # received_charges = []
    # missed_charges = []

    received_hydrophobic_moments = DataFrame(
        zeros((len(hydrophobic_moments_names), total_received_peptides_number), dtype=float64)
    )
    missed_hydrophobic_moments = DataFrame(
        zeros((len(hydrophobic_moments_names), total_missed_peptides_number), dtype=float64)
    )

    secondary_structure_fraction_names = ["Helix", "Turn", "Sheet"]
    received_secondary_structure_fractions = DataFrame(
        zeros((len(secondary_structure_fraction_names), total_received_peptides_number), dtype=float64)
    )
    missed_secondary_structure_fractions = DataFrame(
        zeros((len(secondary_structure_fraction_names), total_missed_peptides_number), dtype=float64)
    )

    label = "Filling received peptides array-like parameter lists: "
    show_progress(label, 35, 0.0)
    index = 1
    for protein_record in protein_records:
        for received_peptide_record in protein_record.received_peptide_records:
            # Columns are peptides, rows are parameters; assign with .loc
            # because chained indexing (df[col][row] = v) is deprecated.
            for kidera_factor_index, kidera_factor in enumerate(
                received_peptide_record.peptide_parameters.kidera_factors
            ):
                received_kidera_factors.loc[kidera_factor_index, index - 1] = kidera_factor["value"]

            for acid_index, acid in enumerate("AGVMDYNSWLFIKPQCERTH"):
                received_acid_percents.loc[acid_index, index - 1] = (
                    received_peptide_record.peptide_parameters.amino_acid_percents[acid]
                )

            for group_index, group in enumerate(received_peptide_record.peptide_parameters.amino_acids_composition):
                received_acid_compounds.loc[group_index, index - 1] = group["percent"]

            # charges = []
            # for charge in received_peptide_record.peptide_parameters.charges:
            #     charges.append(charge['charge'])
            # received_charges.append(charges)

            moment_index = 0
            for moment in received_peptide_record.peptide_parameters.hydrophobic_moments:
                if moment["name"] != "Polygly-polypro helix":
                    received_hydrophobic_moments.loc[moment_index, index - 1] = moment["moment"]
                    moment_index += 1  # the original incremented group_index here by mistake

            for fraction_index, fraction in enumerate(
                received_peptide_record.peptide_parameters.secondary_structure_fraction
            ):
                received_secondary_structure_fractions.loc[fraction_index, index - 1] = fraction["value"]

            show_progress(label, 35, index / total_received_peptides_number)
            index += 1
    print()

    label = "Filling missed peptides array-like parameter lists: "
    show_progress(label, 35, 0.0)
    index = 1
    for protein_record in protein_records:
        for missed_peptide_record in protein_record.missed_peptide_records:
            for kidera_factor_index, kidera_factor in enumerate(
                missed_peptide_record.peptide_parameters.kidera_factors
            ):
                missed_kidera_factors.loc[kidera_factor_index, index - 1] = kidera_factor["value"]

            for acid_index, acid in enumerate("AGVMDYNSWLFIKPQCERTH"):
                missed_acid_percents.loc[acid_index, index - 1] = (
                    missed_peptide_record.peptide_parameters.amino_acid_percents[acid]
                )

            for group_index, group in enumerate(missed_peptide_record.peptide_parameters.amino_acids_composition):
                missed_acid_compounds.loc[group_index, index - 1] = group["percent"]

            # charges = []
            # for charge in missed_peptide_record.peptide_parameters.charges:
            #     charges.append(charge['charge'])
            # missed_charges.append(charges)

            moment_index = 0
            for moment in missed_peptide_record.peptide_parameters.hydrophobic_moments:
                if moment["name"] != "Polygly-polypro helix":
                    missed_hydrophobic_moments.loc[moment_index, index - 1] = moment["moment"]
                    moment_index += 1  # the original incremented group_index here by mistake

            for fraction_index, fraction in enumerate(
                missed_peptide_record.peptide_parameters.secondary_structure_fraction
            ):
                missed_secondary_structure_fractions.loc[fraction_index, index - 1] = fraction["value"]

            show_progress(label, 35, index / total_missed_peptides_number)
            index += 1
    print()

    print("Calculating Kidera factors per peptide Pearson correlation (received peptides): ", end="")
    received_per_peptide_correlations[
        "Kidera factors per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(received_kidera_factors.corr(method="pearson"), "Kidera factors")
    print("done")

    print("Calculating Kidera factors per peptide Pearson correlation (missed peptides): ", end="")
    missed_per_peptide_correlations[
        "Kidera factors per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(missed_kidera_factors.corr(method="pearson"), "Kidera factors")
    print("done")

    print("Calculating amino acid percents per peptide Pearson correlation (received peptides): ", end="")
    received_per_peptide_correlations[
        "Amino acid percents per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(received_acid_percents.corr(method="pearson"), "Amino acid percents")
    print("done")

    print("Calculating amino acid percents per peptide Pearson correlation (missed peptides): ", end="")
    missed_per_peptide_correlations[
        "Amino acid percents per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(missed_acid_percents.corr(method="pearson"), "Amino acid percents")
    print("done")

    print("Calculating amino acid compositions per peptide Pearson correlation (received peptides): ", end="")
    received_per_peptide_correlations[
        "Amino acid compositions per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(received_acid_compounds.corr(method="pearson"), "Amino acid compositions")
    print("done")

    print("Calculating amino acid compositions per peptide Pearson correlation (missed peptides): ", end="")
    missed_per_peptide_correlations[
        "Amino acid compositions per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(missed_acid_compounds.corr(method="pearson"), "Amino acid compositions")
    print("done")

    #
    # label = 'Calculating charges Kendall correlation (missed peptides): '
    # show_progress(label, 40, 0.0)
    # index = 1
    # for first_charges in range(0, len(missed_charges)):
    #     for second_charges in range(first_charges + 1, len(missed_charges)):
    #         missed['Charges per peptide correlation (Kendall)'].append(
    #             statistics.kendalltau(missed_charges[first_charges], missed_charges[second_charges]).correlation)
    #     show_progress(label, 40, index / len(missed_charges))
    #     index += 1
    # print()

    print("Calculating hydrophobic moments per peptide Pearson correlation (received peptides): ", end="")
    received_per_peptide_correlations[
        "Hydrophobic moments per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(received_hydrophobic_moments.corr(method="pearson"), "Hydrophobic moments")
    print("done")

    print("Calculating hydrophobic moments per peptide Pearson correlation (missed peptides): ", end="")
    missed_per_peptide_correlations[
        "Hydrophobic moments per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(missed_hydrophobic_moments.corr(method="pearson"), "Hydrophobic moments")
    print("done")

    print("Calculating secondary structure fractions per peptide Pearson correlation (received peptides): ", end="")
    received_per_peptide_correlations[
        "Secondary structure fractions per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(
        received_secondary_structure_fractions.corr(method="pearson"), "Secondary structure fractions"
    )
    print("done")

    print("Calculating secondary structure fractions per peptide Pearson correlation (missed peptides): ", end="")
    missed_per_peptide_correlations[
        "Secondary structure fractions per peptide correlation (Pearson)"
    ] = convert_correlation_matrix_to_serie(
        missed_secondary_structure_fractions.corr(method="pearson"), "Secondary structure fractions"
    )
    print("done")

    return received_per_peptide_correlations, missed_per_peptide_correlations
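convert_correlation_matrix_to_serie is not shown in this snippet; a minimal sketch consistent with how it is used above (assumption: it flattens the upper triangle of a peptide-by-peptide correlation matrix into a flat Series, one entry per peptide pair):

import numpy as np
from pandas import Series

def convert_correlation_matrix_to_serie(correlation_matrix, parameter_name):
    values = correlation_matrix.values
    rows, cols = np.triu_indices_from(values, k=1)  # pairs above the diagonal
    return Series(values[rows, cols], name=parameter_name)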
Example #7
obj.rank()
obj.rank(method="first")  # 'average', 'min', 'max', 'first'

frame.rank(axis=1)

# descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=list("abcd"), columns=["one", "two"])
df.describe()
# skipna=True, mean, std, var, sum,
# max, min, argmax, argmin, idxmax, idxmin,
# cumsum, cumprod, diff, pct_change

# Correlation and Covariance
df = DataFrame(np.random.randn(100, 3), columns=list("abc"))
df.corr()
df.cov()
df.corrwith(df["a"])

# unique values, value counts, membership
obj = Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
uniques = obj.unique()
obj.value_counts()
mask = obj.isin(["b", "c"])
obj[mask]

# deal with missing data
df = DataFrame(np.random.randn(7, 3))
df.loc[:4, 1] = np.nan  # .ix was removed from pandas; label-based .loc keeps the inclusive slice
df.loc[:2, 2] = np.nan
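A plausible continuation of the missing-data section (assumed; the original snippet is truncated here):

print(df.dropna())           # drop rows containing any NaN
print(df.fillna(0))          # replace NaN with a constant
print(df.fillna(df.mean()))  # or with per-column means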
Example #8
df = DataFrame(
    {"int_col": [1, 2, 6, 8, -1], "float_col": [0.1, 0.2, 0.2, 10.1, None], "str_col": ["a", "b", None, "c", "a"]}
)

print(df)

# Extract just the float_col and int_col
# print(df.loc[:, ['float_col', 'int_col']])

print(df[["float_col", "int_col"]])

# test indexing

# print(df[df['float_col'] > 0.15])

# Handy!
print(df.describe())

# Covariance between 'suitable' (numeric) columns; numeric_only is
# required in pandas >= 2.0 because str_col is non-numeric:
print(df.cov(numeric_only=True))

# Correlation between columns.
print(df.corr(numeric_only=True))

values = df.values[:, :-1]
print("VALUES:\n", values)

print(type(values))  # numpy ndarray
print(type(values[1, 1]))  # a Python scalar (the array has object dtype)

# convert them all to float32 (some were int)
print(df.values[:, :-1].astype(np.float32))
Example #9
"""Given the data matrix A =
        [
            x1  x2   x3
            1   2    2
            2   3    1
            3   3.5  0.5
        ]

    calculate var, cov, corr, cov-matrix, corr-matrix
"""

data = {"x1": [1, 2], "x2": [2, 3], "x3": [2, 1]}
A = DataFrame(data)
print(A)

data1 = Series([3, 3.5, 0.5], index=["x1", "x2", "x3"])
# DataFrame.append was removed in pandas 2.0; concat the transposed row instead
A = pd.concat([A, data1.to_frame().T], ignore_index=True)
print("\n", A, "\n")

var_x1 = A["x1"].var()
var_x2 = A["x2"].var()
var_x3 = A["x3"].var()

print "var_x1: %f" % var_x1
print "var_x2: %f" % var_x2
print "var_x3: %f" % var_x3

cov_matrix = A.cov()
print "\ncov_matrix:\n", cov_matrix

corr_matrix = A.corr()
print "\ncorr_matrix:\n", corr_matrix

print "\nA['x1'].corr(A['x2']): %f" % A["x1"].corr(A["x2"])
print "A['x1'].corr(A['x3']): %f" % A["x1"].corr(A["x3"])