Example #1
def multivarAnalysis():

	x = np.arange(-180,181)
	y = np.arange(-89,90)
	xvec = [xi for xi in x for yi in y]
	yvec = [yi for xi in x for yi in y]
	sali = [np.nan for i in xvec]
	temp = copy.copy(sali)
	sili = copy.copy(sali)
	nitr = copy.copy(sali)
	phos = copy.copy(sali)
	bath = copy.copy(sali)
	prod = copy.copy(sali)
	sali = fillList(sali,readNetcdf('gmtplots/netcdffiles/salinity.nc',['lon','lat','z'],1,noCheck,notNan))
	temp = fillList(temp,readNetcdf('gmtplots/netcdffiles/temperature.nc',['lon','lat','z'],1,noCheck,notNan))
	sili = fillList(sili,readNetcdf('gmtplots/netcdffiles/silicateAnnual.nc',['lon','lat','z'],1,noCheck,notNan))
	nitr = fillList(nitr,readNetcdf('gmtplots/netcdffiles/nitrateAnnual.nc',['lon','lat','z'],1,noCheck,notNan))
	phos = fillList(phos,readNetcdf('gmtplots/netcdffiles/phosphateAnnual.nc',['lon','lat','z'],1,noCheck,notNan))
	bath = fillList(bath,readNetcdf('gmtplots/netcdffiles/bathymetryMasked.nc',['lon','lat','z'],1,noCheck,notNan))
	prodSummer = readNetcdf('gmtplots/netcdffiles/summerAll.nc',['lon','lat','z'],1,noCheck,notNan)
	prodWinter = readNetcdf('gmtplots/netcdffiles/winterAll.nc',['lon','lat','z'],1,noCheck,notNan)
	prodSummer['classif'].extend(prodWinter['classif'])
	prodSummer['lon'].extend(prodWinter['lon'])
	prodSummer['lat'].extend(prodWinter['lat'])
	prod = fillList(prod,prodSummer)
	#don't print a row if anything in it is missing (NaN)
	rows = [[i+1,xvec[i],yvec[i],sali[i],temp[i],sili[i],nitr[i],phos[i],bath[i],prod[i]] for i in range(len(xvec))]
	stats = DataFrame([row for row in rows if not np.isnan(row).any()], index = None , columns = None)
	stats.to_csv('lithologyStats.m',sep = '\t', index = None , columns = None)
Example #2
    def test_to_csv_with_single_column(self):
        # see gh-18676, https://bugs.python.org/issue32255
        #
        # Python's CSV library adds an extraneous '""'
        # before the newline when the NaN value is in
        # the first row. Otherwise, only the newline
        # character is added. This behavior is inconsistent
        # and was patched in https://bugs.python.org/pull_request4672.
        df1 = DataFrame([None, 1])
        expected1 = """\
""
1.0
"""
        with tm.ensure_clean('test.csv') as path:
            df1.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected1

        df2 = DataFrame([1, None])
        expected2 = """\
1.0
""
"""
        with tm.ensure_clean('test.csv') as path:
            df2.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected2
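A minimal standalone sketch of the csv-module behavior described in the comment above (my illustration, not part of the pandas test): a lone empty field is quoted so the row is not mistaken for a blank line.

import csv
import io

buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow([''])   # lone empty field -> written as '""'
writer.writerow([1.0])  # ordinary numeric field -> written bare
print(repr(buf.getvalue()))  # expected: '""\r\n1.0\r\n'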
Example #3
def predict(fname_specialists='net-specialists.pickle'):
    with open(fname_specialists, 'rb') as f:
        specialists = pickle.load(f)

    X = load2d(test=True)[0]
    y_pred = np.empty((X.shape[0], 0))

    for model in specialists.values():
        y_pred1 = model.predict(X)
        y_pred = np.hstack([y_pred, y_pred1])

    columns = ()
    for cols in specialists.keys():
        columns += cols

    y_pred2 = y_pred * 48 + 48
    y_pred2 = y_pred2.clip(0, 96)
    df = DataFrame(y_pred2, columns=columns)

    lookup_table = read_csv(os.path.expanduser(FLOOKUP))
    values = []

    for index, row in lookup_table.iterrows():
        values.append((
            row['RowId'],
            df.loc[row.ImageId - 1, row.FeatureName],  # .ix was removed in modern pandas; .loc works on the default RangeIndex
            ))

    now_str = datetime.now().isoformat().replace(':', '-')
    submission = DataFrame(values, columns=('RowId', 'Location'))
    filename = 'submission-{}.csv'.format(now_str)
    submission.to_csv(filename, index=False)
    print("Wrote {}".format(filename))
Example #4
 def test_to_csv_wide_frame_formatting(self):
     # Issue #8621
     df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
     with ensure_clean() as filename:
         df.to_csv(filename, header=False, index=False)
         rs = read_csv(filename, header=None)
         assert_frame_equal(rs, df)
Example #5
def twitter_daily_aggregate(retrievaldate):

	#Date Retrieval
	d=[]
	dt = parser.parse(retrievaldate) + timedelta(days=-1)
	d.append(dt)
	d.append(d[-1] + timedelta(days=1))

	#DataFrame Init
	ctrend = DataFrame()
	while d[-1] < datetime.utcnow(): 
		print('processing', d[-1], '..........')
		#Daily Mention Count
		mnts = twitter_count(d, mentions)

		#User Follower Count
		usrs =  twitter_follower(d,users)
		#Join
		trend = mnts.join(usrs)
		trend['Date'] = Period(d[-1],'D')
		#Append to DataFrame
		ctrend = concat([ctrend,trend])
		#Extend Dates
		d.append(d[-1] + timedelta(days=1))
	#Fill NAs
	ctrend = ctrend.fillna(0)
	#Save
	print('printing the file')
	ctrend.to_csv('twitter_trend.csv')
	return ctrend
Example #6
    def test_to_csv_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC)

        result = buf.getvalue()
        expected = ('"A","B"\n'
                    '1,"foo"\n'
                    '2,"bar"\n'
                    '3,"baz"\n')

        self.assertEqual(result, expected)

        # quoting of Windows line terminators when an encoding is given
        # (gh-3503)
        text = 'a,b,c\n1,"test \r\n",3\n'
        df = pd.read_csv(StringIO(text))
        buf = StringIO()
        df.to_csv(buf, encoding='utf-8', index=False)
        self.assertEqual(buf.getvalue(), text)

        # testing if quoting parameter is passed through with multi-indexes
        # related to issue #7791
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
        df = df.set_index(['a', 'b'])
        expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
        self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)
Example #7
    def test_to_csv_from_csv1(self):

        with ensure_clean('__tmp_to_csv_from_csv1__') as path:
            self.frame['A'][:5] = nan

            self.frame.to_csv(path)
            self.frame.to_csv(path, columns=['A', 'B'])
            self.frame.to_csv(path, header=False)
            self.frame.to_csv(path, index=False)

            # test roundtrip
            self.tsframe.to_csv(path)
            recons = DataFrame.from_csv(path)

            assert_frame_equal(self.tsframe, recons)

            self.tsframe.to_csv(path, index_label='index')
            recons = DataFrame.from_csv(path, index_col=None)
            assert(len(recons.columns) == len(self.tsframe.columns) + 1)

            # no index
            self.tsframe.to_csv(path, index=False)
            recons = DataFrame.from_csv(path, index_col=None)
            assert_almost_equal(self.tsframe.values, recons.values)

            # corner case
            dm = DataFrame({'s1': Series(lrange(3), lrange(3)),
                            's2': Series(lrange(2), lrange(2))})
            dm.to_csv(path)
            recons = DataFrame.from_csv(path)
            assert_frame_equal(dm, recons)
Example #8
    def test_importItems(self):
        wrongFields = [{"a": "What is your gender?",
                      "b": 0.7,
                      "c": "radio",
                      "d": 0.3,
                      "e": "Male, Female, Other",
                      "f": 'vert'}]

        wrongOptions = [{"questionText": "What is your gender?",
                      "questionWidth": 0.7,
                      "type": "radio",
                      "responseWidth": 0.3,
                      "options": "Other",
                      "layout": 'vert',
                      "index": 0}]

        df = DataFrame(self.questions)
        df.to_excel(fileName_xlsx, index=False)
        df.to_csv(fileName_csv, index=False)

        # Check wrong field error
        with pytest.raises(NameError):
            self.survey = Form(self.win, items=wrongFields, size=(1.0, 0.3), pos=(0.0, 0.0), autoLog=False)

        # Check options for list of dicts
        with pytest.raises(ValueError):
            self.survey = Form(self.win, items=wrongOptions, size=(1.0, 0.3), pos=(0.0, 0.0), autoLog=False)

        # Check csv
        self.survey = Form(self.win, items=fileName_csv,
                           size=(1.0, 0.3), pos=(0.0, 0.0), autoLog=False)
        # Check Excel
        self.survey = Form(self.win, items=fileName_xlsx,
                           size=(1.0, 0.3), pos=(0.0, 0.0), randomize=False, autoLog=False)
Example #9
    def test_to_csv_decimal(self):
        # GH 781
        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

        expected_default = ',col1,col2,col3\n0,1,a,10.1\n'
        assert df.to_csv() == expected_default

        expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n'
        assert df.to_csv(decimal=',', sep=';') == expected_european_excel

        expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n'
        assert df.to_csv(float_format='%.2f') == expected_float_format_default

        expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
        assert df.to_csv(decimal=',', sep=';',
                         float_format='%.2f') == expected_float_format

        # GH 11553: testing if decimal is taken into account for '0.0'
        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
        expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
        assert df.to_csv(index=False, decimal='^') == expected

        # same but for an index
        assert df.set_index('a').to_csv(decimal='^') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected
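A round-trip sketch building on the European-format output above (my addition, not part of the test): pandas.read_csv accepts the same decimal and sep arguments, so the semicolon-separated output can be parsed back.

import io
import pandas as pd

df = pd.DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})
text = df.to_csv(decimal=',', sep=';')  # ';col1;col2;col3\n0;1;a;10,1\n'
back = pd.read_csv(io.StringIO(text), sep=';', decimal=',', index_col=0)
assert back['col3'].iloc[0] == 10.1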
Example #10
def predict(fname_specialists='net2.pickle'):
    with open(fname_specialists, 'rb') as f:
        net = pickle.load(f)

        X = load2d(test=True)[0]

        y_pred = net.predict(X)

        y_pred2 = y_pred * 48 + 48
        y_pred2 = y_pred2.clip(0, 96)

        df = DataFrame(y_pred2)

        lookup_table = read_csv(os.path.expanduser(FLOOKUP))
        values = []

        for index, row in lookup_table.iterrows():
            values.append((
                row.RowId,
                y_pred2[int(row.ImageId)-1][int(row.RowId)%30-1]
                ))


        submission = DataFrame(values, columns=('RowId','Location'))
        filename = 'submission1.csv'

        submission.to_csv(filename, index=False)
        print("Wrote {}".format(filename))
Example #11
def getFeatures(filename):
    csvfile = pd.read_csv(filename)  # Reading .csv files containing tweets.
    tweet_ids = csvfile["id_str"]  # Copying the 'id_str' attribute values to a variable.
    length = len(tweet_ids)  # Getting the length of 'tweet_ids'.

    df = DataFrame(d, index=[0])  # Creating a DataFrame ('d' is assumed to be a feature dict defined at module level)

    twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
    ACCESS_TOKEN = twitter.obtain_access_token()
    twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)
    # Generating Access Token

    for i in range(0, length):
        status = twitter.show_status(id=tweet_ids[i])
        d["id"] = status["id_str"].encode("utf-8")
        d["created_at"] = status["created_at"].encode("utf-8")
        d["from_user"] = status["user"]["screen_name"].encode("utf-8")
        d["followers_count"] = status["user"]["followers_count"]
        d["friends_count"] = status["user"]["friends_count"]
        d["statuses_count"] = status["user"]["statuses_count"]
        d["verified"] = status["user"]["verified"]
        d["location"] = 0 if (len(status["user"]["location"].encode("utf-8")) == 0) else 1
        d["text"] = status["text"].encode("utf-8")
        d["retweet_count"] = status["retweet_count"]
        d["favorite_count"] = status["favorite_count"]
        d["hashtag_count"] = len(status["entities"]["hashtags"])
        d["url_count"] = len(status["entities"]["urls"])
        d["mentions_count"] = len(status["entities"]["user_mentions"])
        if len(status["entities"]["urls"]) > 0:
            for x in range(0, len(status["entities"]["urls"])):
                d["links"] += status["entities"]["urls"][x]["expanded_url"].encode("utf-8") + "  "
        df = pd.concat([df, DataFrame(d, index=[0])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
        df.to_csv("NSamples.csv")  # Saving file to disk (rewritten on every iteration)
        d["links"] = ""
    print "\nAll Done!"
def predict(test_set: DataFrame, model: LogisticRegression, reg, filename):
    test_df = test_set.filter(regex=reg)
    test_np = test_df.values  # as_matrix() was removed in pandas 1.0
    predictions = model.predict(test_np)
    result = DataFrame({'PassengerId': test_set['PassengerId'].values,
                        'Survived': predictions.astype(np.int32)})
    result.to_csv(filename, index=False)
Example #13
def main():
    ##  Set default trace out dir, and get wide kernel names
    trace_out_dir = path.join(dir_script, "../output/trace")
    wide_kernel_names = get_wide_kernel_names_trace(trace_out_dir)

    ##  Get wide bench names
    wide_bench_names = get_wide_bench_names(wide_kernel_names)

    duration_frame = DataFrame(index = wide_bench_names)
    duration_root_dir = path.join(dir_script, "../log")

    duration_frame['base_model'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'base_model'))
    duration_frame['model'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'model'))
    duration_frame['model_compare'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'opt_break/trace_off'))
    duration_frame['base_trace'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'base_trace'))
    duration_frame['trace'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'trace'))
    duration_frame['profiler'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'profiler'))
    duration_frame['sim'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'sim'))
    
    duration_frame['opt_break_trace_off'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'opt_break/trace_off'))

    breakdown_frame_index_wide_bench_name(duration_frame)

    duration_out_file = path.join(dir_script, "../output/duration.csv")
    duration_frame.to_csv(duration_out_file)
Example #14
def submission(fname_net="net4.pickle"):
    with open(fname_net, "rb") as f:
        net = pickle.load(f)  # net = specialists

    X = load.load2d(test=True)[0]
    y_pred = net.predict(X)
    print "Finish predict test file"
    columns = "left_eye_center_x,left_eye_center_y,right_eye_center_x,right_eye_center_y,left_eye_inner_corner_x,left_eye_inner_corner_y,left_eye_outer_corner_x,left_eye_outer_corner_y,right_eye_inner_corner_x,right_eye_inner_corner_y,right_eye_outer_corner_x,right_eye_outer_corner_y,left_eyebrow_inner_end_x,left_eyebrow_inner_end_y,left_eyebrow_outer_end_x,left_eyebrow_outer_end_y,right_eyebrow_inner_end_x,right_eyebrow_inner_end_y,right_eyebrow_outer_end_x,right_eyebrow_outer_end_y,nose_tip_x,nose_tip_y,mouth_left_corner_x,mouth_left_corner_y,mouth_right_corner_x,mouth_right_corner_y,mouth_center_top_lip_x,mouth_center_top_lip_y,mouth_center_bottom_lip_x,mouth_center_bottom_lip_y"
    columns = columns.split(",")

    y_pred = y_pred * 48 + 48
    y_pred = y_pred.clip(0, 96)
    df = DataFrame(y_pred, columns=columns)

    lookup_table = read_csv(os.path.expanduser("./data/IdLookupTable.csv"))
    values = []

    for index, row in lookup_table.iterrows():
        values.append((row["RowId"], df.loc[row.ImageId - 1, row.FeatureName]))  # .ix was removed in modern pandas

    now_str = datetime.now().isoformat().replace(":", "-")
    submission = DataFrame(values, columns=("RowId", "Location"))
    filename = "submission-{}.csv".format(now_str)
    submission.to_csv(filename, index=False)
    print ("Wrote {}".format(filename))
Example #15
def arrange_aggregates(cumsums, symbols, aggs):
    for i in symbols:
        cumsums[i] = cumsums[i].iloc[:, 0:5]  # .ix was removed in modern pandas

    cols = cumsums['ATL'].columns.tolist()
    cols2 = aggs['ATL'].columns.tolist()
    cols3 = (aggs['ATL'].columns + '1').tolist()
    cols.extend(cols2)
    cols.extend(cols3)
    ATL = DataFrame(columns = cols)

    for team in symbols:
        for Date in cumsums[team]['Date']:
            Opponent = cumsums[team].loc[cumsums[team]['Date'] == Date, 'Opponent'].all()
            cumsums_temp = cumsums[team].loc[cumsums[team]['Date'] == Date]
            cumsums_temp = cumsums_temp.reset_index()
            team_temp = aggs[team]
            opponent_temp = DataFrame(aggs[Opponent])
            opponent_temp.columns = cols3
            atl = pd.concat([cumsums_temp, team_temp, opponent_temp], axis = 1)
            atl = atl.drop('index', axis=1)
            atl.columns = cols
            ATL = pd.concat([ATL, atl], axis = 0)

        print(team)

    ATL.to_csv('final.csv', sep=',', index=False)
Example #16
def sanity_check():
    """Check that the interaction and bookkeeping is OK.

    Set the agent's epsilon to 0.99, so that almost all
    actions are selected uniformly at random. The action value
    for each context should then track the expected reward
    for that context.
    """
    print('Running a contextual bandit experiment')
    cb = ContextualBandit()
    ca = ContextualAgent(cb, epsilon=0.99)
    steps = 10000
    for _ in range(steps):
        ca.run()
    rewards = np.array(cb.actions)
    df = DataFrame(ca.log, columns=('context', 'action', 'reward', 'Q(c,a)'))
    fn = 'sanity_check.csv'
    df.to_csv(fn, index=False)
    print('Sequence written in', fn)
    print()
    for context, prob in cb.contexts.items():
        print(context, ': ')
        print('samp : ', ca.Q[context])
        print(' teo : ', prob * rewards - (1 - prob) * rewards)
        print()
    globals().update(locals())
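For reference, a small sketch of the theoretical value printed on the ' teo : ' line (my reading of the code above, assuming each context pays +reward with probability prob and -reward otherwise):

def expected_reward(p, r):
    # E[reward] = p * r + (1 - p) * (-r) = (2 * p - 1) * r
    return p * r - (1 - p) * r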
Example #17
    def write_bar_as_csv(bt):

        CloseTime = bt.range_bar.CloseTime[:]
        High = bt.range_bar.High[:]
        Low = bt.range_bar.Low[:]
        Open = bt.range_bar.Open[:]
        Close = bt.range_bar.Close[:]

        CloseTime.reverse()
        High.reverse()
        Low.reverse()
        Open.reverse()
        Close.reverse()

        range_bar_df = DataFrame({'Date': CloseTime,
                                  'H': High,
                                  'L': Low,
                                  'O': Open,
                                  'C': Close}, columns=['Date', 'H', 'L', 'O', 'C'])

        strat = bt.strategies[list(bt.strategies.keys())[0]]  # keys() is not subscriptable in Python 3

        for indicator_name in strat.indicators:
            curr_indicator = strat.indicators[indicator_name].val
            curr_indicator.reverse()
            range_bar_df[indicator_name] = curr_indicator

        print "Writing to: {}".format(bt.bar_data_root)
        range_bar_df.to_csv(path_or_buf=bt.bar_data_root, index=False)
Example #18
def main(train_file, test_file):
  #print "loading data.."
  csv.field_size_limit(1310720)
  # NOTE: the original hardcodes these paths; the train_file/test_file parameters are unused.
  trainreader = csv.reader(open('/home/kiran/kdd/train.csv'))
  projectid, traindata_old = zip(*trainreader)

  testreader = csv.reader(open('/home/kiran/kdd/test.csv'))
  projectid, testdata_old = zip(*testreader)


  # remove stopwords
  traindata = []
  testdata = []
  for observation in traindata_old:
      traindata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))
  for observation in testdata_old:
      testdata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))

  tfv = CountVectorizer(binary=1, ngram_range=(1, 1))
  X_all = traindata + testdata
  lentrain = len(traindata)
  tfv.fit(X_all)
  X_all = tfv.transform(X_all)
  X = X_all[:lentrain]
  X_test = X_all[lentrain:]
  scipy.io.mmwrite('x_train_bin_1gram.mtx', X, field='real')
  scipy.io.mmwrite('x_test_bin_1gram.mtx', X_test, field='real')
  myCols = tfv.get_feature_names()  # use get_feature_names_out() on scikit-learn >= 1.0
  myCols = DataFrame(myCols)
  myCols.to_csv('bin_1gram.csv', index=False)
Example #19
    def test_to_csv_dtnat(self):
        # GH3437
        from pandas import NaT

        def make_dtnat_arr(n, nnat=None):
            if nnat is None:
                nnat = int(n * 0.1)  # 10%
            s = list(date_range('2000', freq='5min', periods=n))
            if nnat:
                for i in np.random.randint(0, len(s), nnat):
                    s[i] = NaT
                i = np.random.randint(100)
                s[-i] = NaT
                s[i] = NaT
            return s

        chunksize = 1000
        # N=35000
        s1 = make_dtnat_arr(chunksize + 5)
        s2 = make_dtnat_arr(chunksize + 5, 0)

        # s3 = make_dtnat_arr(chunksize + 5, 0)
        with ensure_clean('1.csv') as pth:
            df = DataFrame(dict(a=s1, b=s2))
            df.to_csv(pth, chunksize=chunksize)
            recons = DataFrame.from_csv(pth)._convert(datetime=True,
                                                      coerce=True)
            assert_frame_equal(df, recons, check_names=False,
                               check_less_precise=True)
Example #20
def predict(subject, data_path, model_path, submission_path):
    patient_filenames = [filename for filename in os.listdir(model_path) if
                         subject in filename and filename.endswith('.pickle')]
    for filename in patient_filenames:
        print(filename)

        d = load_test_data(data_path, subject)
        x, id = d['x'], d['id']

        with open(model_path + '/' + filename, 'rb') as f:
            state_dict = pickle.load(f)  # cPickle in the original (Python 2 only)

        scalers = state_dict['scalers']
        x, _ = scale_across_time(x, x_test=None, scalers=scalers) if state_dict['params']['scale_time'] \
            else scale_across_features(x, x_test=None, scalers=scalers)

        cnn = ConvNet(state_dict['params'])
        cnn.set_weights(state_dict['weights'])
        test_proba = cnn.get_test_proba(x)

        ans = list(zip(id, test_proba))  # zip() is lazy in Python 3; materialize it

        df = DataFrame(data=ans, columns=['clip', 'preictal'])
        csv_name = '.'.join(filename.split('.')[:-1]) if '.' in filename else filename
        df.to_csv(submission_path + '/' + csv_name + '.csv', index=False, header=True)
Example #21
    def test_to_csv_from_csv2(self):

        with ensure_clean('__tmp_to_csv_from_csv2__') as path:

            # duplicate index
            df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                           columns=['x', 'y', 'z'])
            df.to_csv(path)
            result = DataFrame.from_csv(path)
            assert_frame_equal(result, df)

            midx = MultiIndex.from_tuples(
                [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
            df = DataFrame(np.random.randn(3, 3), index=midx,
                           columns=['x', 'y', 'z'])
            df.to_csv(path)
            result = DataFrame.from_csv(path, index_col=[0, 1, 2],
                                        parse_dates=False)
            # TODO: from_csv names the index ['Unnamed: 1', 'Unnamed: 2']; should it?
            assert_frame_equal(result, df, check_names=False)

            # column aliases
            col_aliases = Index(['AA', 'X', 'Y', 'Z'])
            self.frame2.to_csv(path, header=col_aliases)
            rs = DataFrame.from_csv(path)
            xp = self.frame2.copy()
            xp.columns = col_aliases

            assert_frame_equal(xp, rs)

            self.assertRaises(ValueError, self.frame2.to_csv, path,
                              header=['AA', 'X'])
Example #22
def simple_commands():
    # Initial set of baby names and birth rates.
    names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
    births = [968, 155, 77, 578, 973]
    # Merge the name and birth lists into a single list of (name, births) tuples.
    BabyDataSet = list(zip(names, births))
    # Export the BabyDataSet into a DataFrame (similar to a SQL table).
    df = DataFrame(data = BabyDataSet, columns = ['Name', 'Births'])
    # Export the data frame into a CSV file (with and without headers).
    file_nh = 'dat_births_1880_without_header.csv'
    file_wh = 'dat_births_1880_with_header.csv'
    df.to_csv(file_nh, index = False, header = False)
    df.to_csv(file_wh, index = True, header = True)
    # Read data from the CSV file without headers and assign new labels.
    df_nh = pd.read_csv(file_nh, header = None, names = ['Anda', 'La_osa'])
    print('\nFrom file without labels:')
    print(df_nh)
    # Read data from the CSV file with headers.
    df_wh = pd.read_csv(file_wh, header = 0, index_col = 0)
    print('\nFrom file with labels:')
    print(df_wh)
    # Check columns data types.
    print('\nChecking the column data types:')
    print(df_wh.dtypes)
    # Find the name with the highest birthrate.
    print('\nFinding the name with the highest birthrate:')
    sort = df_wh.sort_values(['Births'], ascending = False)
    print(sort.head(1))     # <- Select the 1st row of the sorted data frame.
    print(sort[1:2])
    # Find the largest value within the 'Births' column of the original array.
    print('\nThe largest value within the column \'Births\' is:')
    print(df_wh['Births'].max())
Example #23
def getIndexChangeRate(startDate,endDate):
    # Same computation for each market index: sh, sz, zxb, cyb.
    df_list = []
    for mkt in ['sh','sz','zxb','cyb']:
        df = ts.get_hist_data(mkt,start =startDate,end = endDate ).reset_index()
        df['gap'] = df['high'] - df['low']
        df['gap_rate'] = df['gap']/df['close']*100
        df['mkt'] = mkt
        df_list.append(df)
    df_result = pd.concat(df_list)  # DataFrame.append was removed in pandas 2.0

    fileName = r'D:\stock\index_changeRate_' +startDate+'_' + endDate + '.csv'
    df_result = df_result.loc[:,['date','mkt','close','volume','price_change','p_change','gap','gap_rate']]
    df_result = df_result.sort_values(by='date',ascending=False)  # sort_index(by=...) was removed
    df_result.to_csv(fileName,index = False)
Example #24
def __extract_single_features(feature):
    """
    Creates two files (one with the unprocessed questions, one with the singular feature) containing questions that have the feature

    Arguments:
        feature: The feature(s) to look for

    Returns:
         tuple (pandas.DataFrame, pandas.DataFrame): Tuple that contains the dataframe with
          updated unprocessed questions (those that contains the given feature), and the
          other dataframe that has the features added to its question text
    """
    up_name = "UP_" + feature.strip()
    new_index = old_index = 0
    path = const.FILEPATH_TRAINING_DATA + FILENAME_START
    unprocessed_df = load_training_data(path, False, exclude_site_tags=True)
    feature_df = pd.read_csv(__get_filename(const.FILEPATH_FEATURE_DETECTOR, feature), index_col=0)  # DataFrame.from_csv was removed in pandas 1.0
    new_up_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    new_feat_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    for question in feature_df[const.QUESTION_TEXT_KEY]:
        if feature in question:
            new_feat_dataframe.loc[new_index] = feature_df.loc[old_index].copy()
            new_up_dataframe.loc[new_index] = unprocessed_df.loc[old_index].copy()
            new_index += 1
        old_index += 1
    new_up_dataframe.to_csv(__get_filename(NEW_PATH, up_name), encoding='utf-8')
    new_feat_dataframe.to_csv(__get_filename(NEW_PATH, feature), encoding='utf-8')
    return new_up_dataframe, new_feat_dataframe  # the docstring above promises this tuple
Example #25
class matchbox:
    def __init__(self, articlepaths):
        self.num_exports = 0
        self.num_articles_total = len(articlepaths)
        self.num_articles_matched = 0
        self.num_matches = 0
        self.dataframe = DataFrame()
        self.init_time = time.strftime("%Y-%m-%d_%H-%M-%S_")

    def update(self, matches):
        self.dataframe = pd.concat([self.dataframe, DataFrame(matches)], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
        self.num_articles_matched += 1
        self.num_matches += len(matches)
        print('Matched {} places in article {} of {} ({:.2%} complete). '
              'Total: {}.'.format(len(matches),
                                  self.num_articles_matched,
                                  self.num_articles_total,
                                  self.num_articles_matched / self.num_articles_total,
                                  self.num_matches))

    def empty_into_csv(self):
        self.num_exports += 1
        outname = outdir + self.init_time + 'pubs_aegypti_' + str(self.num_exports) + '.csv'
        self.dataframe.to_csv(outname, encoding='utf-8')
        print('Wrote matches from chunk {} to {}.'.format(self.num_exports, outname))
        del self.dataframe
        self.dataframe = DataFrame()
Example #26
def __extract_multiple_features(feature1, feature2, filename):
    """
    Creates two files (one with the unprocessed questions, one with the singular feature) containing questions that have the features

    Arguments:
        feature1: The first feature to look for
        feature2: The second feature to look for
        filename (str): File containing the features

    """
    new_index = old_index = 0
    up_name = "UP_" + filename.strip()
    path = const.FILEPATH_TRAINING_DATA + FILENAME_START
    unprocessed_df = load_training_data(path, False, exclude_site_tags=True)
    feature_df = pd.read_csv(__get_filename(const.FILEPATH_FEATURE_DETECTOR, filename), index_col=0)  # DataFrame.from_csv was removed in pandas 1.0
    new_up_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    new_feat_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    for question in feature_df[const.QUESTION_TEXT_KEY]:
        if feature1 in question or feature2 in question:
            new_feat_dataframe.loc[new_index] = feature_df.loc[old_index].copy()
            new_up_dataframe.loc[new_index] = unprocessed_df.loc[old_index].copy()
            new_index += 1
        old_index += 1
    new_up_dataframe.to_csv(__get_filename(NEW_PATH, up_name), encoding='utf-8')
    new_feat_dataframe.to_csv(__get_filename(NEW_PATH, filename), encoding='utf-8')
Example #27
    def test_to_csv_compression(self, compression_only,
                                read_infer, to_infer):
        # see gh-15008
        compression = compression_only

        if compression == "zip":
            pytest.skip("{compression} is not supported "
                        "for to_csv".format(compression=compression))

        # We'll complete file extension subsequently.
        filename = "test."

        if compression == "gzip":
            filename += "gz"
        else:
            # xz --> .xz
            # bz2 --> .bz2
            filename += compression

        df = DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression=to_compression)
            result = pd.read_csv(path, index_col=0,
                                 compression=read_compression)
            tm.assert_frame_equal(result, df)
Example #28
    def test_to_csv_from_csv2(self):

        with ensure_clean('__tmp_to_csv_from_csv2__') as path:

            # duplicate index
            df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                           columns=['x', 'y', 'z'])
            df.to_csv(path)
            result = self.read_csv(path)
            assert_frame_equal(result, df)

            midx = MultiIndex.from_tuples(
                [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
            df = DataFrame(np.random.randn(3, 3), index=midx,
                           columns=['x', 'y', 'z'])

            df.to_csv(path)
            result = self.read_csv(path, index_col=[0, 1, 2],
                                   parse_dates=False)
            assert_frame_equal(result, df, check_names=False)

            # column aliases
            col_aliases = Index(['AA', 'X', 'Y', 'Z'])
            self.frame2.to_csv(path, header=col_aliases)

            rs = self.read_csv(path)
            xp = self.frame2.copy()
            xp.columns = col_aliases
            assert_frame_equal(xp, rs)

            msg = "Writing 4 cols but got 2 aliases"
            with pytest.raises(ValueError, match=msg):
                self.frame2.to_csv(path, header=['AA', 'X'])
Example #29
    def test_passing_dtype(self):
        # see gh-6607
        df = DataFrame(np.random.rand(5, 2).round(4), columns=list('AB'),
                       index=['1A', '1B', '1C', '1D', '1E'])

        with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
            df.to_csv(path)

            # see gh-3795: passing 'str' as the dtype
            result = self.read_csv(path, dtype=str, index_col=0)
            expected = df.astype(str)
            tm.assert_frame_equal(result, expected)

            # for parsing, interpret object as str
            result = self.read_csv(path, dtype=object, index_col=0)
            tm.assert_frame_equal(result, expected)

            # we expect all object columns, so need to
            # convert to test for equivalence
            result = result.astype(float)
            tm.assert_frame_equal(result, df)

            # invalid dtype
            self.assertRaises(TypeError, self.read_csv, path,
                              dtype={'A': 'foo', 'B': 'float64'},
                              index_col=0)

        # see gh-12048: empty frame
        actual = self.read_csv(StringIO('A,B'), dtype=str)
        expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
        tm.assert_frame_equal(actual, expected)
Example #30
def commentsForStory(objectId, log):
   try:
      url = 'https://hn.algolia.com/api/v1/items/%d' % (objectId)

      req = urllib.request.Request(url)
      response = urllib.request.urlopen(req)
      data = json.loads(response.read().decode("utf-8"))
   except (KeyboardInterrupt, SystemExit):
      raise
   except IOError as e:
      message = '%d: %s' % (e.code, e.reason)
      log[str(objectId)] = message
      print(message)
      return

   tree = commentTree(data)
   commentRecords = preorderTraversalIgnoreRoot(tree)

   if len(commentRecords) == 0:
      log[str(objectId)] = NO_COMMENTS
      return

   columns = ['id', 'author', 'text', 'points', 'created_at', 'parent_id', 'story_id']
   df = DataFrame(columns = columns, index = numpy.arange(len(commentRecords)))
   for index, comment in enumerate(commentRecords):
      df.iloc[index] = comment  # .ix was removed in modern pandas

   df.to_csv("comments-by-story/comments-%d.csv" % objectId, encoding='utf-8', index=False)
   log[str(objectId)] = SUCCESS