Example #1
    def test_to_csv_from_csv1(self):

        with ensure_clean("__tmp_to_csv_from_csv1__") as path:
            self.frame["A"][:5] = nan

            self.frame.to_csv(path)
            self.frame.to_csv(path, columns=["A", "B"])
            self.frame.to_csv(path, header=False)
            self.frame.to_csv(path, index=False)

            # test roundtrip
            self.tsframe.to_csv(path)
            recons = DataFrame.from_csv(path)

            assert_frame_equal(self.tsframe, recons)

            self.tsframe.to_csv(path, index_label="index")
            recons = DataFrame.from_csv(path, index_col=None)
            assert len(recons.columns) == len(self.tsframe.columns) + 1

            # no index
            self.tsframe.to_csv(path, index=False)
            recons = DataFrame.from_csv(path, index_col=None)
            assert_almost_equal(self.tsframe.values, recons.values)

            # corner case
            dm = DataFrame({"s1": Series(lrange(3), lrange(3)), "s2": Series(lrange(2), lrange(2))})
            dm.to_csv(path)
            recons = DataFrame.from_csv(path)
            assert_frame_equal(dm, recons)
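Note: DataFrame.from_csv, used throughout these examples, was deprecated in pandas 0.21 and removed in pandas 1.0. On modern pandas, read_csv with from_csv's old defaults (first column as the index, index parsed as dates) gives a roughly equivalent round-trip; a minimal sketch, assuming df and path as above:

import pandas as pd

df.to_csv(path)
# index_col=0 and parse_dates=True reproduce from_csv's old defaults
recons = pd.read_csv(path, index_col=0, parse_dates=True)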
Example #2
    def test_to_csv_from_csv2(self):

        with ensure_clean("__tmp_to_csv_from_csv2__") as path:

            # duplicate index
            df = DataFrame(np.random.randn(3, 3), index=["a", "a", "b"], columns=["x", "y", "z"])
            df.to_csv(path)
            result = DataFrame.from_csv(path)
            assert_frame_equal(result, df)

            midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
            df = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"])
            df.to_csv(path)
            result = DataFrame.from_csv(path, index_col=[0, 1, 2], parse_dates=False)
            # TODO: from_csv names the index ['Unnamed: 1', 'Unnamed: 2']; should it?
            assert_frame_equal(result, df, check_names=False)

            # column aliases
            col_aliases = Index(["AA", "X", "Y", "Z"])
            self.frame2.to_csv(path, header=col_aliases)
            rs = DataFrame.from_csv(path)
            xp = self.frame2.copy()
            xp.columns = col_aliases

            assert_frame_equal(xp, rs)

            self.assertRaises(ValueError, self.frame2.to_csv, path, header=["AA", "X"])
Example #3
 def _addDataFrameByPath(self, path):
     try:
         new_df = DataFrame.from_csv(path, index_col=[0], sep=";", parse_dates=False)
         if new_df.shape[1] == 1 or new_df.shape[1] == 0:
             new_df = DataFrame.from_csv(path, index_col=[0], sep=",", parse_dates=False)
     except IndexError:  # index columns not recognized
         new_df = DataFrame.from_csv(path, index_col=[0], sep=",", parse_dates=False)
     new_df.path = path
     self.data_frames[path] = new_df
     self.dataFilesList.addItem(path)
     return new_df
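The try/except above falls back from ";" to "," when the first read yields too few columns. An alternative is to sniff the delimiter up front with the standard library; a minimal sketch (an illustrative variant, not the original author's code):

import csv
import pandas as pd

def read_with_sniffed_delimiter(path):
    # let csv.Sniffer pick between ";" and "," from a sample of the file
    with open(path, newline="") as f:
        sample = f.read(4096)
    dialect = csv.Sniffer().sniff(sample, delimiters=";,")
    return pd.read_csv(path, sep=dialect.delimiter, index_col=0)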
Example #4
def load_data(received_file_name, missed_file_name):
    received = DataFrame.from_csv(received_file_name)
    received["received"] = Series([1.0 for i in range(0, len(received))], index=received.index)
    missed = DataFrame.from_csv(missed_file_name)
    missed["received"] = Series([0.0 for i in range(0, len(missed))], index=missed.index)

    print("received peptides loaded: {0}".format(len(received)))
    print("missed peptides loaded: {0}".format(len(missed)))
    total = concat([received, missed])
    print("total peptides: {0}".format(len(total)))

    return total
Example #5
    def test_to_csv_headers(self):
        # GH6186, the presence or absence of `index` incorrectly
        # causes to_csv to have different header semantics.
        from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
        to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"])
        with ensure_clean("__tmp_to_csv_headers__") as path:
            from_df.to_csv(path, header=["X", "Y"])
            recons = DataFrame.from_csv(path)
            assert_frame_equal(to_df, recons)

            from_df.to_csv(path, index=False, header=["X", "Y"])
            recons = DataFrame.from_csv(path)
            recons.reset_index(inplace=True)
            assert_frame_equal(to_df, recons)
Example #6
 def loadEntryCodes(self, overwrite=False):
     projectsDF = DataFrame.from_csv(
         os.path.join(self.pdbCodesDatesStorageFileLocation, self.CODE_STORAGE_FILE_NAME)
     )
     projectsDF["date"] = to_datetime(projectsDF["date"])
     projectsDF = projectsDF.copy().dropna()
     self.entryCodes = projectsDF
Example #7
    def test_to_csv_dtnat(self):
        # GH3437
        from pandas import NaT

        def make_dtnat_arr(n, nnat=None):
            if nnat is None:
                nnat = int(n * 0.1)  # 10%
            s = list(date_range("2000", freq="5min", periods=n))
            if nnat:
                for i in np.random.randint(0, len(s), nnat):
                    s[i] = NaT
                i = np.random.randint(100)
                s[-i] = NaT
                s[i] = NaT
            return s

        chunksize = 1000
        # N=35000
        s1 = make_dtnat_arr(chunksize + 5)
        s2 = make_dtnat_arr(chunksize + 5, 0)

        # s3 = make_dtnat_arr(chunksize + 5, 0)
        with ensure_clean("1.csv") as pth:
            df = DataFrame(dict(a=s1, b=s2))
            df.to_csv(pth, chunksize=chunksize)
            recons = DataFrame.from_csv(pth)._convert(datetime=True, coerce=True)
            assert_frame_equal(df, recons, check_names=False, check_less_precise=True)
Example #8
def list_calls_with_permissions(file, permission_map_file):
    """ List all API calls which require a permissions in file (according the
        mapping from Felt et al. CSS 2011 in APICalls.txt).
    """

    df = DataFrame.from_csv(permission_map_file, sep="\t")
    a, d, dx = AnalyzeAPK(file)
    for method in d.get_methods():
        for i in method.get_instructions():
            if i.get_name()[:6] == "invoke":
                # get method desc
                call = i.get_output(0).split(",")[-1].strip()
                # remove return value
                call = call[: call.index(")") + 1]
                # split in class and method
                call = call.split("->")
                method_class = get_type(call[0])
                ins_method, params = call[1].split("(")
                params = ",".join(parse_parameters(params.replace(")", "")))
                apicall = "{0}.{1}({2})".format(method_class, ins_method, params)
                try:
                    print(df.loc[apicall]["Permission(s)"])
                    print(apicall)
                except KeyError:
                    pass
Example #9
    def setState(self, state):
        from io import StringIO

        for combo in self._getColumnCombos() + self._getRowCombos():
            combo.setState(state.popleft())
        self.displayedValueComboBox.setState(state.popleft())
        self.data_frames = {}

        data_frame_paths = state.popleft()
        self.dataFilesList.clear()
        for data_frame_path in data_frame_paths:
            self._addDataFrameByPath(data_frame_path)

        filter_widget_state = state.popleft()
        self.filterWidget.setState(filter_widget_state)
        self.filterWidget.setEnabled(True)

        data_frame_string = state.popleft()
        if data_frame_string:
            f = StringIO(data_frame_string)
            self.dataFrameView.setDataFrame(DataFrame.from_csv(f))
            self.dataDisplayed = True
            self.transposeViewButton.setEnabled(True)
        else:
            pass
Example #10
def load_data(data_file):
    df = DF.from_csv(data_file)
    columns = list(df.columns.values)
    if "Date" in columns:
        df["Date"] = to_datetime(df["Date"])

    return df
Example #11
def loadTrain(filename):
    traindf = DataFrame.from_csv(filename, sep="\t", index_col=False)
    traindf1 = traindf[["essay_id", "essay_set", "essay", "rater1_domain1", "rater2_domain1", "domain1_score"]]
    filtercase = (
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
        "j",
        "k",
        "l",
        "m",
        "n",
        "o",
        "p",
        "q",
        "r",
        "s",
        "t",
        "u",
        "v",
        "w",
        "x",
        "y",
        "z",
        ".",
    )
    X = traindf1["essay"]
    Y = traindf1["rater1_domain1"]
    return X, Y
Example #12
    def _output_failure_rate_of_each_key(self, key_name, all_keys, value_key):
        """
		需求1:算出失误率表格
		"""
        target_filename = self._output_directory + key_name + "_" + "failure_rate.csv"

        pivoted_table_failure = pivot_table(
            self._failure_total_data, values=value_key, index=[key_name], aggfunc="sum", fill_value=None, dropna=False
        )
        pivoted_table_reserve = pivot_table(
            self._reservation_total_data,
            values=value_key,
            index=[key_name],
            aggfunc="sum",
            fill_value=None,
            dropna=False,
        )
        self._failure_panels[key_name] = pivoted_table_failure
        self._reservation_panels[key_name] = pivoted_table_reserve

        if os.path.isfile(target_filename):
            failure_rate_table = DataFrame.from_csv(target_filename)
            self._failure_rate_panels[key_name] = failure_rate_table
            return

        failure_rate_table = pivoted_table_failure.divide(pivoted_table_reserve, axis=0, level=None, fill_value=None)

        # failure_rate_table=self._find_max_index_in_1d_panel(failure_rate_table)

        failure_rate_table.to_csv(target_filename)
        self._failure_rate_panels[key_name] = failure_rate_table
        pass
Example #13
def smooth_graph(csv_name):
    df = DataFrame.from_csv(csv_name, parse_dates=False)

    xs = np.array([])
    for j in range(len(df.index)):
        xs = np.append(xs, time.mktime((datetime.datetime.strptime(df.index[j], "%Y-%m-%d")).timetuple()))

    xnew = np.linspace(xs.min(), xs.max(), 300)

    for k in range(df.shape[1]):
        ys = np.array(df[df.columns[k]])
        mark_smooth = spline(xs, ys, xnew)
        plt.plot(xnew, mark_smooth, label=df.columns[k])

    plt.title("Grades")
    plt.xlabel("Date")
    plt.ylabel("Mark")

    plt.legend(loc="best", title="Period")

    plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment="right")

    plt.grid()

    save_fig()
    plt.show()
Example #14
def load_data(path):
    print "...loading data"
    train_df = DataFrame.from_csv(path + "train.csv", index_col=False).fillna(0).astype(int)
    test_df = DataFrame.from_csv(path + "test.csv", index_col=False).fillna(0).astype(int)
    if debug_mode == False:
        train_set = [train_df.values[0:35000, 1:] / 255.0, train_df.values[0:35000, 0]]
        valid_set = [train_df.values[35000:, 1:] / 255.0, train_df.values[35000:, 0]]
    else:
        train_set = [train_df.values[0:3500, 1:] / 255.0, train_df.values[0:3500, 0]]
        valid_set = [train_df.values[3500:4000, 1:] / 255.0, train_df.values[3500:4000, 0]]
    test_set = test_df.values / 255.0
    # print train_set[0][:10][:10],'\n',train_set[1][:10],'\n',valid_set[0][-10:][:10],'\n',valid_set[1][-10:],'\n',test_set[0][10:][:10]
    test_set_x = theano.shared(np.asarray(test_set, dtype=theano.config.floatX), borrow=True)
    valid_set_x, valid_set_y = shared_dataset(valid_set, borrow=True)
    train_set_x, train_set_y = shared_dataset(train_set, borrow=True)
    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), test_set_x]
    return rval
Example #15
    def test_to_csv_withcommas(self):

        # Commas inside fields should be correctly escaped when saving as CSV.
        df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]})

        with ensure_clean("__tmp_to_csv_withcommas__.csv") as path:
            df.to_csv(path)
            df2 = DataFrame.from_csv(path)
            assert_frame_equal(df2, df)
Example #16
def generate_plot_folder(all_files_in_external_data, all_years_in_table, l):
    features = [
        "PopulationDensity",
        "AreaLand",
        "TotalPopulation",
        "Median Age",
        "MinorityPercentage",
        "PercentageUnemoployment",
        "MedianhouseholdIncome",
        "HouseholdsGiniIndex",
        "Market_Val",
        "TotalMidMarchEmployees",
        "TotalAnnualPayroll($1,000)",
        "TotalNumberofEstablishments",
    ]
    # 'TotalPopulationPerFunctionArea',
    # 'WorkersPerFunctionArea']
    x = df.from_csv(all_files_in_external_data)
    y = df.from_csv(all_years_in_table)
    if all_years_in_table.split("_")[1] == "ZIPCODE":
        newtable = y.merge(x, how="left", left_on="ZIPCODE", right_on="ZipCode")
    else:
        newtable = y.merge(x, how="left", on="CENSUSTRACT")
    for i in features:
        x = sm.add_constant(newtable[i])
        y = newtable[l]
        res = sm.OLS(y, x).fit()
        R2 = res.rsquared
        plt.scatter(x[i], y, color="blue")
        plt.plot(x[i], res.fittedvalues, color="red", linewidth=4)
        plt.ylabel(l)
        plt.xlabel(i + "<R^2 = " + str(R2) + ">")
        plt.title(l + " vs " + i + " ".join(all_years_in_table.split("_")[1:4]))
        if not os.path.exists("_".join(all_years_in_table.split("_")[1:4])):
            os.makedirs("_".join(all_years_in_table.split("_")[1:4]))
        plt.savefig(
            "_".join(all_years_in_table.split("_")[1:4])
            + "/"
            + l
            + "_vs_"
            + i
            + "_".join(all_years_in_table.split("_")[1:4])
        )
        plt.clf()
Example #17
    def __do_the_stats(self):
        """周期性调用"""

        file_list = program_top.utilities.my_dir.browse_dir(self._input_directory, ".csv")

        fail_file_filter = lambda x: ("fail" in x and "total" in x)
        pure_file_filter = lambda x: ("reserve" in x and "total" in x)

        fail_file_list = list(filter(fail_file_filter, file_list))
        pure_file_list = list(filter(pure_file_filter, file_list))

        self._clear_up_before_data_update()

        self._failure_total_data = DataFrame.from_csv(fail_file_list[0])
        self._reservation_total_data = DataFrame.from_csv(pure_file_list[0])
        self._total_reservation_count = self._reservation_total_data.sum(numeric_only=True).values.tolist()[0]

        keys_of_interest = ["supplier", "channel", "hotel_supplier"]
        value_key = "sum_count"
        total_supplier_lists = self._reservation_total_data["supplier"].unique().tolist()
        self._active_hotel_counts = hotel_searcher_ref.get_active_hotel_lists(total_supplier_lists)

        self._get_two_key_vs_failure_rate(keys_of_interest[0], keys_of_interest[1], value_key=value_key)
        self._get_two_key_vs_importance_panel(keys_of_interest[0], keys_of_interest[1], value_key=value_key)
        self._output_two_key_vs_caution_score(keys_of_interest[0], keys_of_interest[1], value_key=value_key)

        # self._output_altitude_plot(keys_of_interest[0], keys_of_interest[1], self._double_key_failure_rate_panels[keys_of_interest[0], keys_of_interest[1]])

        for each_key in keys_of_interest:
            self._output_failure_rate_of_each_key(each_key, keys_of_interest, value_key)
            self._get_reservation_importance(each_key, keys_of_interest, value_key)
            self._output_caution_score_of_each_key(each_key, keys_of_interest, value_key)
            pass

            # self._output_failure_rate_vs_reservation_scattered_point_plot(keys_of_interest[2])
            # self._output_caution_score_vs_reservation_importance_scattered_point_plot(keys_of_interest[2])
            # self._cut_data_into_categorical_group(keys_of_interest[2],'failure_rate',50,self._failure_rate_panels[keys_of_interest[2]])
            # self._cut_data_into_categorical_group(keys_of_interest[2],'caution_score',50,self._caution_score_panel[keys_of_interest[2]])
        self._write_logs("订单失误和警告分数统计完成")
        from program_top.utilities.process_and_main_function.my_engine import my_engine
        from program_top.components import s3_gate_uploader

        my_engine(s3_gate_uploader)  # create an instance that uploads data to S3
        pass
Example #18
def do_query(ttl_file, query, rdf_format="turtle", serialize_format="csv", output_df=True):
    g = rdflib.Graph()
    g.parse(ttl_file, format=rdf_format)
    result = g.query(query)
    result = result.serialize(format=serialize_format)
    if output_df == True:
        result = StringIO(result)
        return DataFrame.from_csv(result, sep=",")
    else:
        return result
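A hypothetical usage sketch (the file name and query are placeholders):

q = "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 5"
df = do_query("data.ttl", q)  # DataFrame with columns s, p, o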
Example #19
def parse_table_data(lines):
    """Parse list of lines from SOFT file into DataFrame

    :param lines: iterable -- iterator over lines
    :returns: pandas.DataFrame -- table data

    """
    # keep only lines that do not start with SOFT control symbols (^, !, #)
    data = "\n".join([i.rstrip() for i in lines if i[0] not in ("^", "!", "#")])
    return DataFrame.from_csv(StringIO(data), index_col=None, sep="\t")
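A hypothetical usage sketch: the ^/!/# metadata lines are dropped and the remainder is parsed as tab-separated data.

lines = [
    "^SAMPLE = GSM0",
    "!Sample_title = example",
    "ID_REF\tVALUE",
    "A\t1.5",
    "B\t2.0",
]
table = parse_table_data(lines)  # two rows, columns ID_REF and VALUE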
Example #20
 def __init__(self, strategy, account, directory="data"):
     self.strategy = strategy
     self.account = account
     self.files = os.listdir(directory)
     self.data = {}
     self.dates = None
     self.actions = {}
     for filename in self.files:
         rawData = DataFrame.from_csv("%s/%s" % (directory, filename))
         self.data[filename] = rawData
         self.dates = self.dates.union(rawData.index) if self.dates is not None else rawData.index
Example #21
    def test_to_csv_bug(self):
        f1 = StringIO("a,1.0\nb,2.0")
        df = DataFrame.from_csv(f1, header=None)
        newdf = DataFrame({"t": df[df.columns[0]]})

        with ensure_clean() as path:
            newdf.to_csv(path)

            recons = read_csv(path, index_col=0)
            # don't check_names as t != 1
            assert_frame_equal(recons, newdf, check_names=False)
Example #22
def train_word_vector(source, dict, wordvec):
    utils.jieba_add_dict(dict)
    comments_df = DataFrame.from_csv(source, sep="\t")
    document = []
    for line in comments_df["comment"].values:
        line = utils.remove_punctuation(line)
        cutted_line = jieba.cut(line)
        document.append(list(cutted_line))
    model = gensim.models.Word2Vec(document)
    print "saving word vector model"
    model.save(wordvec)
    return model
Example #23
    def from_files(path, run_id, wafer_id):
        fn_base = os.path.join(path, "{0}_{1:02}".format(run_id, wafer_id))

        try:
            df = DataFrame(
                {
                    11: DataFrame.from_csv(fn_base + ".11", header=None, sep="\t", index_col=None, parse_dates=False)[
                        1
                    ],
                    12: DataFrame.from_csv(fn_base + ".12", header=None, sep="\t", index_col=None, parse_dates=False)[
                        1
                    ],
                    15: DataFrame.from_csv(fn_base + ".15", header=None, sep="\t", index_col=None, parse_dates=False)[
                        1
                    ],
                    6: DataFrame.from_csv(fn_base + ".6", header=None, sep="\t", index_col=None, parse_dates=False)[1],
                    7: DataFrame.from_csv(fn_base + ".7", header=None, sep="\t", index_col=None, parse_dates=False)[1],
                    8: DataFrame.from_csv(fn_base + ".8", header=None, sep="\t", index_col=None, parse_dates=False)[1],
                }
            )
        except Exception:
            return None

        m = re.search("/(normal|abnormal)", path)
        if m is None:
            return None

        label = 1 if m.group(1) == "abnormal" else -1

        return WaferRun(run_id, wafer_id, label, df)
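The six near-identical reads above can be collapsed into one dict comprehension; a sketch using read_csv (since from_csv is gone from modern pandas), with the same channel numbers and fn_base as the original:

from pandas import DataFrame, read_csv

channels = [11, 12, 15, 6, 7, 8]
df = DataFrame({
    # column 1 of each per-channel file, keyed by channel number
    ch: read_csv("{0}.{1}".format(fn_base, ch), header=None, sep="\t")[1]
    for ch in channels
})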
Example #24
    def __init__(self, retirement_age=65, discount_rate=0.775, calculate_through_year=2115):
        # FIXME: Eventually, this will transition to a Database rather than file systems.
        csv_folder = os.path.join(settings.BASE_DIR, "apps", "calculator", "onetime_scripts", "csv_data")

        activemembers_file = os.path.join(csv_folder, "ILPR_1.0_activemembers.csv")  # Constant over time; varies by age
        annuitants_file = os.path.join(csv_folder, "ILPR_1.0_annuitants.csv")  # Constant over time; varies by age
        income_file = os.path.join(csv_folder, "ILPR_1.0_income.csv")  # Varies over time; constant by age
        payment_file = os.path.join(csv_folder, "ILPR_1.0_payment.csv")  # Varies over time; constant by age
        servicetable_file = os.path.join(csv_folder, "ILPR_1.0_servicetable.csv")  # Varies by years of service
        yearsofservice_file = os.path.join(
            csv_folder, "ILPR_1.0_yearsofservice.csv"
        )  # Constant over time; varies by age

        self.activemembers_df = DataFrame.from_csv(activemembers_file)
        self.annuitants_df = DataFrame.from_csv(annuitants_file)
        self.income_df = DataFrame.from_csv(income_file)
        self.payment_df = DataFrame.from_csv(payment_file)
        self.servicetable_df = DataFrame.from_csv(servicetable_file)
        self.yearsofservice_df = DataFrame.from_csv(yearsofservice_file)

        self.base_year = 2014
        self.life_expectancy = 85
        self.life_expectancy_after_retirement = self.life_expectancy - retirement_age
        self.percentage_of_final_salary = 0.0167
        self.discount_rate = discount_rate
        self.inflation_rate = 0.03
        self.calculate_through_year = calculate_through_year
Example #25
def get_returns():

    path = "C:\\Users\\vishr_000\\Documents\\GitHub\\ManagedFutures"
    os.chdir(path)
    df_prices = DataFrame.from_csv("Futures Data.csv", header=0)
    df_prices = df_prices.convert_objects(convert_numeric=True)
    df_prices = df_prices.replace(0, np.NaN)
    df_prices = df_prices.fillna(method="pad")

    df_returns = df_prices / df_prices.shift(1)

    returns_index = df_returns.cumprod()
    return df_prices, df_returns
Example #26
def load_test_labels(csv_path):
    subject_to_df = defaultdict(list)
    d = DataFrame.from_csv(csv_path, index_col=None)
    for i in d.index:
        clip = d["clip"][i]
        preictal = d["preictal"][i]

        subject_name = "_".join(clip.split("_", 2)[:2])
        subject_to_df[subject_name].append((clip, preictal))

    for subject_name, subject_data in subject_to_df.items():
        subject_to_df[subject_name] = DataFrame(subject_data, columns=["clip", "preictal"])
    return subject_to_df
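A sketch of the same grouping done inside pandas: derive the subject name from the first two underscore-separated tokens of "clip", then group (assumes the same csv_path and columns as above):

import pandas as pd

d = pd.read_csv(csv_path)
subjects = d["clip"].str.split("_").str[:2].str.join("_")
subject_to_df = {
    name: grp[["clip", "preictal"]].reset_index(drop=True)
    for name, grp in d.groupby(subjects)
}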
Example #27
    def test_to_csv_from_csv_w_all_infs(self):

        # test roundtrip with inf, -inf, nan, as full columns and mix
        self.frame["E"] = np.inf
        self.frame["F"] = -np.inf

        with ensure_clean() as path:
            self.frame.to_csv(path)
            recons = DataFrame.from_csv(path)

            # TODO to_csv drops column name
            assert_frame_equal(self.frame, recons, check_names=False)
            assert_frame_equal(np.isinf(self.frame), np.isinf(recons), check_names=False)
Example #28
def fix_sentiments():
    """Add the 'sender' column so I know who to make a connection to.
    """
    df = DataFrame.from_csv("sentiments.csv")
    texts = []
    with open("texts/filenames.txt", "r") as filenames:
        fn_list = map(str.strip, [filename for filename in filenames])
        fn_list = map(lambda x: "texts/texts/" + x, fn_list)
        for fn in fn_list:
            texts.append(get_texts(fn))  # returns TextMessage object
    texts = map(lambda x: getattr(x, "sender"), [item for sublist in texts for item in sublist])
    df["sender"] = Series(texts, index=df.index)
    df.to_csv("sentiments_fixed.csv", encoding="utf-8")
Example #29
def read_csv_into_patterns(absolute_csv_filename, pattern):
    if os.path.isfile(absolute_csv_filename):
        target_frame = DataFrame.from_csv(absolute_csv_filename)
        target_list = target_frame.to_dict(pattern)
        return target_list

    else:
        target_string = ",".join([absolute_csv_filename, ""])

        utilities.write_log(target_string)

        return []
    pass
Example #30
    def test_to_csv_from_csv_w_some_infs(self):

        # test roundtrip with inf, -inf, nan, as full columns and mix
        self.frame["G"] = np.nan
        f = lambda x: [np.inf, np.nan][np.random.rand() < 0.5]
        self.frame["H"] = self.frame.index.map(f)

        with ensure_clean() as path:
            self.frame.to_csv(path)
            recons = DataFrame.from_csv(path)

            # TODO to_csv drops column name
            assert_frame_equal(self.frame, recons, check_names=False)
            assert_frame_equal(np.isinf(self.frame), np.isinf(recons), check_names=False)