def commentsForStory(objectId, log):
   try:
      url = 'https://hn.algolia.com/api/v1/items/%d' % (objectId)

      req = urllib.request.Request(url)
      response = urllib.request.urlopen(req)
      data = json.loads(response.read().decode("utf-8"))
   except (KeyboardInterrupt, SystemExit):
      raise
   except IOError as e:
      message = '%d: %s' % (e.code, e.reason)
      log[str(objectId)] = message
      print(message)
      return

   tree = commentTree(data)
   commentRecords = preorderTraversalIgnoreRoot(tree)

   if len(commentRecords) == 0:
      log[str(objectId)] = NO_COMMENTS
      return

   columns = ['id', 'author', 'text', 'points', 'created_at', 'parent_id', 'story_id']
   df = DataFrame(columns = columns, index = numpy.arange(len(commentRecords)))
   for index, comment in enumerate(commentRecords):
      df.loc[index] = comment

   df.to_csv("comments-by-story/comments-%d.csv" % objectId, encoding='utf-8', index=False)
   log[str(objectId)] = SUCCESS
Example #2
    def test_passing_dtype(self):
        # see gh-6607
        df = DataFrame(np.random.rand(5, 2).round(4), columns=list(
            'AB'), index=['1A', '1B', '1C', '1D', '1E'])

        with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
            df.to_csv(path)

            # see gh-3795: passing 'str' as the dtype
            result = self.read_csv(path, dtype=str, index_col=0)
            expected = df.astype(str)
            tm.assert_frame_equal(result, expected)

            # for parsing, interpret object as str
            result = self.read_csv(path, dtype=object, index_col=0)
            tm.assert_frame_equal(result, expected)

            # we expect all object columns, so need to
            # convert to test for equivalence
            result = result.astype(float)
            tm.assert_frame_equal(result, df)

            # invalid dtype
            self.assertRaises(TypeError, self.read_csv, path,
                              dtype={'A': 'foo', 'B': 'float64'},
                              index_col=0)

        # see gh-12048: empty frame
        actual = self.read_csv(StringIO('A,B'), dtype=str)
        expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
        tm.assert_frame_equal(actual, expected)
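
The dtype=str round trip above can be reproduced in isolation; a minimal sketch, assuming only pandas is available, using an in-memory buffer instead of a temp file:

import io
import pandas as pd

# write to an in-memory buffer, then read everything back as strings
df = pd.DataFrame({'A': [1.5, 2.5], 'B': [3, 4]}, index=['r1', 'r2'])
result = pd.read_csv(io.StringIO(df.to_csv()), dtype=str, index_col=0)
assert (result.dtypes == object).all()          # every column comes back as object/str
pd.testing.assert_frame_equal(result, df.astype(str))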
Example #3
    def test_to_csv_compression(self, compression_only,
                                read_infer, to_infer):
        # see gh-15008
        compression = compression_only

        if compression == "zip":
            pytest.skip("{compression} is not supported "
                        "for to_csv".format(compression=compression))

        # We'll complete file extension subsequently.
        filename = "test."

        if compression == "gzip":
            filename += "gz"
        else:
            # xz --> .xz
            # bz2 --> .bz2
            filename += compression

        df = DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression=to_compression)
            result = pd.read_csv(path, index_col=0,
                                 compression=read_compression)
            tm.assert_frame_equal(result, df)
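
The same round trip can be seen without the test fixtures; a small sketch, assuming only pandas, where compression="infer" picks gzip from the ".gz" suffix on both the write and the read:

import pandas as pd

df = pd.DataFrame({"A": [1]})
df.to_csv("demo.csv.gz", compression="infer")            # gzip inferred from the extension
result = pd.read_csv("demo.csv.gz", index_col=0, compression="infer")
pd.testing.assert_frame_equal(result, df)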
Example #4
def multivarAnalysis():

	x = np.arange(-180,181)
	y = np.arange(-89,90)
	xvec = [xi for xi in x for yi in y]
	yvec = [yi for xi in x for yi in y]
	sali = [np.nan for i in xvec]
	temp = copy.copy(sali)
	sili = copy.copy(sali)
	nitr = copy.copy(sali)
	phos = copy.copy(sali)
	bath = copy.copy(sali)
	prod = copy.copy(sali)
	sali = fillList(sali,readNetcdf('gmtplots/netcdffiles/salinity.nc',['lon','lat','z'],1,noCheck,notNan))
	temp = fillList(temp,readNetcdf('gmtplots/netcdffiles/temperature.nc',['lon','lat','z'],1,noCheck,notNan))
	sili = fillList(sili,readNetcdf('gmtplots/netcdffiles/silicateAnnual.nc',['lon','lat','z'],1,noCheck,notNan))
	nitr = fillList(nitr,readNetcdf('gmtplots/netcdffiles/nitrateAnnual.nc',['lon','lat','z'],1,noCheck,notNan))
	phos = fillList(phos,readNetcdf('gmtplots/netcdffiles/phosphateAnnual.nc',['lon','lat','z'],1,noCheck,notNan))
	bath = fillList(bath,readNetcdf('gmtplots/netcdffiles/bathymetryMasked.nc',['lon','lat','z'],1,noCheck,notNan))
	prodSummer = readNetcdf('gmtplots/netcdffiles/summerAll.nc',['lon','lat','z'],1,noCheck,notNan)
	prodWinter = readNetcdf('gmtplots/netcdffiles/winterAll.nc',['lon','lat','z'],1,noCheck,notNan)
	prodSummer['classif'].extend(prodWinter['classif'])
	prodSummer['lon'].extend(prodWinter['lon'])
	prodSummer['lat'].extend(prodWinter['lat'])
	prod = fillList(prod,prodSummer)
	# do not print if there is nothing in the list
	stats = DataFrame([[i+1,xvec[i],yvec[i],sali[i],temp[i],sili[i],nitr[i],phos[i],bath[i],prod[i]] for i in range(len(xvec)) if not np.isnan([i+1,xvec[i],yvec[i],sali[i],temp[i],sili[i],nitr[i],phos[i],bath[i],prod[i]]).any()], index = None , columns = None)
	stats.to_csv('lithologyStats.m',sep = '\t', index = None , columns = None)
Example #5
def predict(fname_specialists='net-specialists.pickle'):
    with open(fname_specialists, 'rb') as f:
        specialists = pickle.load(f)

    X = load2d(test=True)[0]
    y_pred = np.empty((X.shape[0], 0))

    for model in specialists.values():
        y_pred1 = model.predict(X)
        y_pred = np.hstack([y_pred, y_pred1])

    columns = ()
    for cols in specialists.keys():
        columns += cols

    y_pred2 = y_pred * 48 + 48
    y_pred2 = y_pred2.clip(0, 96)
    df = DataFrame(y_pred2, columns=columns)

    lookup_table = read_csv(os.path.expanduser(FLOOKUP))
    values = []

    for index, row in lookup_table.iterrows():
        values.append((
            row['RowId'],
            df.loc[row.ImageId - 1, row.FeatureName],
            ))

    now_str = datetime.now().isoformat().replace(':', '-')
    submission = DataFrame(values, columns=('RowId', 'Location'))
    filename = 'submission-{}.csv'.format(now_str)
    submission.to_csv(filename, index=False)
    print("Wrote {}".format(filename))
Example #6
class matchbox:
    def __init__(self, articlepaths):
        self.num_exports = 0
        self.num_articles_total = len(articlepaths)
        self.num_articles_matched = 0
        self.num_matches = 0
        self.dataframe = DataFrame()
        self.init_time = time.strftime("%Y-%m-%d_%H-%M-%S_")

    def update(self, matches):
        self.dataframe = self.dataframe.append(matches, ignore_index=True)
        self.num_articles_matched += 1
        self.num_matches += len(matches)
        print('Matched {} places in article {} of {} ({:.2%} complete). '
              'Total: {}.'.format(len(matches),
                                          self.num_articles_matched,
                                          self.num_articles_total,
                                          self.num_articles_matched / self.num_articles_total,
                                          self.num_matches))

    def empty_into_csv(self):
        self.num_exports += 1
        outname = outdir + self.init_time + 'pubs_aegypti_' + str(self.num_exports) + '.csv'
        self.dataframe.to_csv(outname, encoding='utf-8')
        print('Wrote matches from chunk {} to {}.'.format(self.num_exports, outname))
        del self.dataframe
        self.dataframe = DataFrame()
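
DataFrame.append in a loop (as used in update above) was deprecated and removed in pandas 2.0; a sketch of an append-free accumulator, assuming matches is anything DataFrame() accepts as in the original, that collects chunks in a list and concatenates once at export time:

import pandas as pd

class matchbox_concat:
    """Collect match chunks in a plain list and concatenate once on export."""
    def __init__(self):
        self.chunks = []

    def update(self, matches):
        self.chunks.append(pd.DataFrame(matches))

    def empty_into_csv(self, outname):
        if self.chunks:
            pd.concat(self.chunks, ignore_index=True).to_csv(outname, encoding='utf-8')
        self.chunks = []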
Example #7
    def test_to_csv_with_single_column(self):
        # see gh-18676, https://bugs.python.org/issue32255
        #
        # Python's CSV library adds an extraneous '""'
        # before the newline when the NaN-value is in
        # the first row. Otherwise, only the newline
        # character is added. This behavior is inconsistent
        # and was patched in https://bugs.python.org/pull_request4672.
        df1 = DataFrame([None, 1])
        expected1 = """\
""
1.0
"""
        with tm.ensure_clean('test.csv') as path:
            df1.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected1

        df2 = DataFrame([1, None])
        expected2 = """\
1.0
""
"""
        with tm.ensure_clean('test.csv') as path:
            df2.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected2
Example #8
def getIndexChangeRate(startDate,endDate):    
    df_result = DataFrame()
    df = ts.get_hist_data('sh',start =startDate,end = endDate ).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap']/df['close']*100
    df['mkt'] = 'sh'
    df_result = df_result.append(df)
    
    df = ts.get_hist_data('sz',start =startDate,end = endDate ).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap']/df['close']*100
    df['mkt'] = 'sz'
    df_result = df_result.append(df)
    
    df = ts.get_hist_data('zxb',start =startDate,end = endDate ).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap']/df['close']*100
    df['mkt'] = 'zxb'
    df_result = df_result.append(df)
    
    df = ts.get_hist_data('cyb',start =startDate,end = endDate ).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap']/df['close']*100
    df['mkt'] = 'cyb'
    df_result = df_result.append(df)
    
    fileName = r'D:\stock\index_changeRate_' +startDate+'_' + endDate + '.csv'
    df_result = df_result.loc[:,['date','mkt','close','volume','price_change','p_change','gap','gap_rate']]
    df_result = df_result.sort_values(by='date', ascending=False)
    df_result.to_csv(fileName,index = False)
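
The four per-market blocks above differ only in the market code; a loop-based sketch of the same function, assuming import pandas as pd and import tushare as ts as in the original, and that get_hist_data returns the same columns for each code:

import pandas as pd
import tushare as ts

def getIndexChangeRate_loop(startDate, endDate):
    frames = []
    for mkt in ['sh', 'sz', 'zxb', 'cyb']:
        df = ts.get_hist_data(mkt, start=startDate, end=endDate).reset_index()
        df['gap'] = df['high'] - df['low']
        df['gap_rate'] = df['gap'] / df['close'] * 100
        df['mkt'] = mkt
        frames.append(df)
    df_result = pd.concat(frames, ignore_index=True)
    df_result = df_result.loc[:, ['date', 'mkt', 'close', 'volume', 'price_change',
                                  'p_change', 'gap', 'gap_rate']]
    df_result = df_result.sort_values(by='date', ascending=False)
    fileName = r'D:\stock\index_changeRate_' + startDate + '_' + endDate + '.csv'
    df_result.to_csv(fileName, index=False)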
Example #9
def twitter_daily_aggregate(retrievaldate):

	#Date Retrieval
	d=[]
	dt = parser.parse(retrievaldate) + timedelta(days=-1)
	d.append(dt)
	d.append(d[-1] + timedelta(days=1))

	#DataFrame Init
	ctrend = DataFrame()
	while d[-1] < datetime.utcnow(): 
		print 'processing ', d[-1], ' ..........'
		#Daily Mention Count
		mnts = twitter_count(d, mentions)

		#User Follower Count
		usrs =  twitter_follower(d,users)
		#Join
		trend = mnts.join(usrs)
		trend['Date'] = Period(d[-1],'D')
		#Append to DataFrame
		ctrend = concat([ctrend,trend])
		#Extend Dates
		d.append(d[-1] + timedelta(days=1))
	#Join DataFrames and Fill NAs
	ctrend =  ctrend.fillna(0)
	#Save
	print 'printing the file'
	ctrend.to_csv('twitter_trend.csv')
	return ctrend
Example #10
    def test_to_csv_from_csv2(self):

        with ensure_clean('__tmp_to_csv_from_csv2__') as path:

            # duplicate index
            df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                           columns=['x', 'y', 'z'])
            df.to_csv(path)
            result = DataFrame.from_csv(path)
            assert_frame_equal(result, df)

            midx = MultiIndex.from_tuples(
                [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
            df = DataFrame(np.random.randn(3, 3), index=midx,
                           columns=['x', 'y', 'z'])
            df.to_csv(path)
            result = DataFrame.from_csv(path, index_col=[0, 1, 2],
                                        parse_dates=False)
            # TODO from_csv names index ['Unnamed: 1', 'Unnamed: 2'] should it
            # ?
            assert_frame_equal(result, df, check_names=False)

            # column aliases
            col_aliases = Index(['AA', 'X', 'Y', 'Z'])
            self.frame2.to_csv(path, header=col_aliases)
            rs = DataFrame.from_csv(path)
            xp = self.frame2.copy()
            xp.columns = col_aliases

            assert_frame_equal(xp, rs)

            self.assertRaises(ValueError, self.frame2.to_csv, path,
                              header=['AA', 'X'])
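
The MultiIndex part of this round trip, isolated and using the current reader (DataFrame.from_csv has since been removed); a minimal sketch assuming only pandas and numpy:

import io
import numpy as np
import pandas as pd

midx = pd.MultiIndex.from_tuples([('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
df = pd.DataFrame(np.random.randn(3, 3), index=midx, columns=['x', 'y', 'z'])
result = pd.read_csv(io.StringIO(df.to_csv()), index_col=[0, 1, 2])
# the unnamed index levels come back as 'Unnamed: *', hence check_names=False
pd.testing.assert_frame_equal(result, df, check_names=False)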
Example #11
 def test_to_csv_wide_frame_formatting(self):
     # Issue #8621
     df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
     with ensure_clean() as filename:
         df.to_csv(filename, header=False, index=False)
         rs = read_csv(filename, header=None)
         assert_frame_equal(rs, df)
Example #12
    def test_to_csv_dtnat(self):
        # GH3437
        from pandas import NaT

        def make_dtnat_arr(n, nnat=None):
            if nnat is None:
                nnat = int(n * 0.1)  # 10%
            s = list(date_range('2000', freq='5min', periods=n))
            if nnat:
                for i in np.random.randint(0, len(s), nnat):
                    s[i] = NaT
                i = np.random.randint(100)
                s[-i] = NaT
                s[i] = NaT
            return s

        chunksize = 1000
        # N=35000
        s1 = make_dtnat_arr(chunksize + 5)
        s2 = make_dtnat_arr(chunksize + 5, 0)

        # s3=make_dtnjat_arr(chunksize+5,0)
        with ensure_clean('1.csv') as pth:
            df = DataFrame(dict(a=s1, b=s2))
            df.to_csv(pth, chunksize=chunksize)
            recons = DataFrame.from_csv(pth)._convert(datetime=True,
                                                      coerce=True)
            assert_frame_equal(df, recons, check_names=False,
                               check_less_precise=True)
Example #13
    def test_to_csv_from_csv1(self):

        with ensure_clean('__tmp_to_csv_from_csv1__') as path:
            self.frame['A'][:5] = nan

            self.frame.to_csv(path)
            self.frame.to_csv(path, columns=['A', 'B'])
            self.frame.to_csv(path, header=False)
            self.frame.to_csv(path, index=False)

            # test roundtrip
            self.tsframe.to_csv(path)
            recons = DataFrame.from_csv(path)

            assert_frame_equal(self.tsframe, recons)

            self.tsframe.to_csv(path, index_label='index')
            recons = DataFrame.from_csv(path, index_col=None)
            assert(len(recons.columns) == len(self.tsframe.columns) + 1)

            # no index
            self.tsframe.to_csv(path, index=False)
            recons = DataFrame.from_csv(path, index_col=None)
            assert_almost_equal(self.tsframe.values, recons.values)

            # corner case
            dm = DataFrame({'s1': Series(lrange(3), lrange(3)),
                            's2': Series(lrange(2), lrange(2))})
            dm.to_csv(path)
            recons = DataFrame.from_csv(path)
            assert_frame_equal(dm, recons)
Example #14
    def write_bar_as_csv(bt):

        CloseTime = bt.range_bar.CloseTime[:]
        High = bt.range_bar.High[:]
        Low = bt.range_bar.Low[:]
        Open = bt.range_bar.Open[:]
        Close = bt.range_bar.Close[:]

        CloseTime.reverse()
        High.reverse()
        Low.reverse()
        Open.reverse()
        Close.reverse()

        range_bar_df = DataFrame({'Date': CloseTime,
                                  'H': High,
                                  'L': Low,
                                  'O': Open,
                                  'C': Close}, columns=['Date', 'H', 'L', 'O', 'C'])

        strat = bt.strategies[list(bt.strategies.keys())[0]]

        for indicator_name in strat.indicators:
            curr_indicator = strat.indicators[indicator_name].val
            curr_indicator.reverse()
            range_bar_df[indicator_name] = curr_indicator

        print "Writing to: {}".format(bt.bar_data_root)
        range_bar_df.to_csv(path_or_buf=bt.bar_data_root, index=False)
Example #15
    def test_to_csv_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC)

        result = buf.getvalue()
        expected = ('"A","B"\n'
                    '1,"foo"\n'
                    '2,"bar"\n'
                    '3,"baz"\n')

        self.assertEqual(result, expected)

        # quoting windows line terminators, presents with encoding?
        # #3503
        text = 'a,b,c\n1,"test \r\n",3\n'
        df = pd.read_csv(StringIO(text))
        buf = StringIO()
        df.to_csv(buf, encoding='utf-8', index=False)
        self.assertEqual(buf.getvalue(), text)

        # testing if quoting parameter is passed through with multi-indexes
        # related to issue #7791
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
        df = df.set_index(['a', 'b'])
        expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
        self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)
Example #16
    def test_to_csv_from_csv2(self):

        with ensure_clean('__tmp_to_csv_from_csv2__') as path:

            # duplicate index
            df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                           columns=['x', 'y', 'z'])
            df.to_csv(path)
            result = self.read_csv(path)
            assert_frame_equal(result, df)

            midx = MultiIndex.from_tuples(
                [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
            df = DataFrame(np.random.randn(3, 3), index=midx,
                           columns=['x', 'y', 'z'])

            df.to_csv(path)
            result = self.read_csv(path, index_col=[0, 1, 2],
                                   parse_dates=False)
            assert_frame_equal(result, df, check_names=False)

            # column aliases
            col_aliases = Index(['AA', 'X', 'Y', 'Z'])
            self.frame2.to_csv(path, header=col_aliases)

            rs = self.read_csv(path)
            xp = self.frame2.copy()
            xp.columns = col_aliases
            assert_frame_equal(xp, rs)

            msg = "Writing 4 cols but got 2 aliases"
            with pytest.raises(ValueError, match=msg):
                self.frame2.to_csv(path, header=['AA', 'X'])
Example #17
def arrange_aggregates(cumsums, symbols, aggs):
    for i in symbols:
        cumsums[i] = cumsums[i].iloc[:, 0:5]

    cols = cumsums['ATL'].columns.tolist()
    cols2 = aggs['ATL'].columns.tolist()
    cols3 = (aggs['ATL'].columns + '1').tolist()
    cols.extend(cols2)
    cols.extend(cols3)
    ATL = DataFrame(columns = cols)

    for team in symbols:
        for Date in cumsums[team]['Date']:
            Opponent = cumsums[team].loc[cumsums[team]['Date'] == Date, 'Opponent'].all()
            cumsums_temp = cumsums[team].loc[cumsums[team]['Date'] == Date]
            cumsums_temp = cumsums_temp.reset_index()
            team_temp = aggs[team]
            oppenent_temp = DataFrame(aggs[Opponent])
            oppenent_temp.columns = cols3
            atl = pd.concat([cumsums_temp, team_temp, oppenent_temp], axis = 1)
            atl = atl.drop('index', axis=1)
            atl.columns = cols
            ATL = pd.concat([ATL, atl], axis = 0)

        print(team)

    ATL.to_csv('final.csv', sep=',', index=False)
def __extract_single_features(feature):
    """
    Creates two files (one with unprocessed, and one with singular feature) containing questions that have the feature

    Arguments:
        feature: The feature(s) to look for

    Returns:
         tuple (pandas.DataFrame, pandas.DataFrame): Tuple that contains the dataframe with
          updated unprocessed questions (those that contain the given feature), and the
          other dataframe that has the features added to its question text
    """
    up_name = "UP_" + feature.strip()
    new_index = old_index = 0
    path = const.FILEPATH_TRAINING_DATA + FILENAME_START
    unprocessed_df = load_training_data(path, False, exclude_site_tags=True)
    feature_df = DataFrame.from_csv(__get_filename(const.FILEPATH_FEATURE_DETECTOR, feature))
    new_up_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    new_feat_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    for question in feature_df[const.QUESTION_TEXT_KEY]:
        if feature in question:
            new_feat_dataframe.loc[new_index] = feature_df.loc[old_index].copy()
            new_up_dataframe.loc[new_index] = unprocessed_df.loc[old_index].copy()
            new_index += 1
        old_index += 1
    new_up_dataframe.to_csv(__get_filename(NEW_PATH, up_name), encoding='utf-8')
    new_feat_dataframe.to_csv(__get_filename(NEW_PATH, feature), encoding='utf-8')
def __extract_multiple_features(feature1, feature2, filename):
    """
    Creates two files (one with unprocessed, and one with singular feature) containing questions that have the features

    Arguments:
        feature1: The first feature to look for
        feature2: The second feature to look for
        filename (str): File containing the features

    """
    new_index = old_index = 0
    up_name = "UP_" + filename.strip()
    path = const.FILEPATH_TRAINING_DATA + FILENAME_START
    unprocessed_df = load_training_data(path, False, exclude_site_tags=True)
    feature_df = DataFrame.from_csv(__get_filename(const.FILEPATH_FEATURE_DETECTOR, filename))
    new_up_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    new_feat_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    for question in feature_df[const.QUESTION_TEXT_KEY]:
        if feature1 in question:
            new_feat_dataframe.loc[new_index] = feature_df.loc[old_index].copy()
            new_up_dataframe.loc[new_index] = unprocessed_df.loc[old_index].copy()
            new_index += 1
        elif feature2 in question:
            new_feat_dataframe.loc[new_index] = feature_df.loc[old_index].copy()
            new_up_dataframe.loc[new_index] = unprocessed_df.loc[old_index].copy()
            new_index += 1
        old_index += 1
    new_up_dataframe.to_csv(__get_filename(NEW_PATH, up_name), encoding='utf-8')
    new_feat_dataframe.to_csv(__get_filename(NEW_PATH, filename), encoding='utf-8')
def predict(subject, data_path, model_path, submission_path):
    patient_filenames = [filename for filename in os.listdir(model_path) if
                         subject in filename and filename.endswith('.pickle')]
    for filename in patient_filenames:
        print filename

        d = load_test_data(data_path, subject)
        x, id = d['x'], d['id']

        with open(model_path + '/' + filename, 'rb') as f:
            state_dict = cPickle.load(f)

        scalers = state_dict['scalers']
        x, _ = scale_across_time(x, x_test=None, scalers=scalers) if state_dict['params']['scale_time'] \
            else scale_across_features(x, x_test=None, scalers=scalers)

        cnn = ConvNet(state_dict['params'])
        cnn.set_weights(state_dict['weights'])
        test_proba = cnn.get_test_proba(x)

        ans = zip(id, test_proba)

        df = DataFrame(data=ans, columns=['clip', 'preictal'])
        csv_name = '.'.join(filename.split('.')[:-1]) if '.' in filename else filename
        df.to_csv(submission_path + '/' + csv_name + '.csv', index=False, header=True)
def simple_commands():
    # Initial set of baby names and birth rates.
    names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
    births = [968, 155, 77, 578, 973]
    # Merge name and birth lists into a single list of tuples (kind of a dict).
    BabyDataSet = list(zip(names, births))
    # Export the BabyDataSet into a DataFrame (similar to a SQL table).
    df = DataFrame(data = BabyDataSet, columns = ['Name', 'Births'])
    # Export the data frame into a CSV file (with and without headers).
    file_nh = 'dat_births_1880_without_header.csv'
    file_wh = 'dat_births_1880_with_header.csv'
    df.to_csv(file_nh, index = False, header = False)
    df.to_csv(file_wh, index = True, header = True)
    # Read data from the CSV file without headers and assign new labels.
    df_nh = pd.read_csv(file_nh, header = None, names = ['Anda', 'La_osa'])
    print('\nFrom file without labels:')
    print(df_nh)
    # Read data from the CSV file with headers.
    df_wh = pd.read_csv(file_wh, header = 0, index_col = 0)
    print('\nFrom file with labels:')
    print(df_wh)
    # Check columns data types.
    print('\nChecking the column data types:')
    print(df_wh.dtypes)
    # Find the name with the highest birthrate.
    print('\nFinding the name with the highest birthrate:')
    sort = df_wh.sort_values(['Births'], ascending = False)
    print(sort.head(1))     # <- Select the 1st row of the sorted data frame.
    print(sort[1:2])
    # Find the largest value within the 'Births' column of the original array.
    print('\nThe largest value within the column \'Births\' is:')
    print(df_wh['Births'].max())
Example #22
def sanity_check():
    """Check that the interaction and bookkeeping is OK.

    Set the agent to epsilon equal to 0.99. This makes
    almost all the actions to be selected uniformly at random.
    The action value for each context should follow the expected
    reward for each context.
    """
    print('Running a contextual bandit experiment')
    cb = ContextualBandit()
    ca = ContextualAgent(cb, epsilon=0.99)
    steps = 10000
    for _ in range(steps):
        ca.run()
    rewards = np.array(cb.actions)
    df = DataFrame(ca.log, columns=('context', 'action', 'reward', 'Q(c,a)'))
    fn = 'sanity_check.csv'
    df.to_csv(fn, index=False)
    print('Sequence written in', fn)
    print()
    for context, prob in cb.contexts.items():
        print(context, ': ')
        print('samp : ', ca.Q[context])
        print(' teo : ', prob * rewards - (1 - prob) * rewards)
        print()
    globals().update(locals())
Example #23
    def test_importItems(self):
        wrongFields = [{"a": "What is your gender?",
                      "b": 0.7,
                      "c": "radio",
                      "d": 0.3,
                      "e": "Male, Female, Other",
                      "f": 'vert'}]

        wrongOptions = [{"questionText": "What is your gender?",
                      "questionWidth": 0.7,
                      "type": "radio",
                      "responseWidth": 0.3,
                      "options": "Other",
                      "layout": 'vert',
                      "index": 0}]

        df = DataFrame(self.questions)
        df.to_excel(fileName_xlsx, index=False)
        df.to_csv(fileName_csv, index=False)

        # Check wrong field error
        with pytest.raises(NameError):
            self.survey = Form(self.win, items=wrongFields, size=(1.0, 0.3), pos=(0.0, 0.0), autoLog=False)

        # Check options for list of dicts
        with pytest.raises(ValueError):
            self.survey = Form(self.win, items=wrongOptions, size=(1.0, 0.3), pos=(0.0, 0.0), autoLog=False)

        # Check csv
        self.survey = Form(self.win, items=fileName_csv,
                           size=(1.0, 0.3), pos=(0.0, 0.0), autoLog=False)
        # Check Excel
        self.survey = Form(self.win, items=fileName_xlsx,
                           size=(1.0, 0.3), pos=(0.0, 0.0), randomize=False, autoLog=False)
Example #24
def predict(fname_specialists='net2.pickle'):
    with open(fname_specialists, 'rb') as f:
        net = pickle.load(f)

        X = load2d(test=True)[0]

        y_pred = net.predict(X)

        y_pred2 = y_pred * 48 + 48
        y_pred2 = y_pred2.clip(0, 96)

        df = DataFrame(y_pred2)

        lookup_table = read_csv(os.path.expanduser(FLOOKUP))
        values = []

        for index, row in lookup_table.iterrows():
            values.append((
                row.RowId,
                y_pred2[int(row.ImageId)-1][int(row.RowId)%30-1]
                ))


        submission = DataFrame(values, columns=('RowId','Location'))
        filename = 'submission1.csv'

        submission.to_csv(filename, index=False)
        print("Wrote {}".format(filename))
Example #25
def getFeatures(filename):
    csvfile = pd.read_csv(filename)  # Reading .csv files containing tweets.
    tweet_ids = csvfile["id_str"]  # Copying the 'id_str' attribute values to a item.
    length = len(tweet_ids)  # Getting the length of 'tweet_ids'.

    df = DataFrame(d, index=[0])  # Creating a DataFrame

    twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
    ACCESS_TOKEN = twitter.obtain_access_token()
    twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)
    # Generating Access Token

    for i in range(0, length):
        status = twitter.show_status(id=tweet_ids[i])
        d["id"] = status["id_str"].encode("utf-8")
        d["created_at"] = status["created_at"].encode("utf-8")
        d["from_user"] = status["user"]["screen_name"].encode("utf-8")
        d["followers_count"] = status["user"]["followers_count"]
        d["friends_count"] = status["user"]["friends_count"]
        d["statuses_count"] = status["user"]["statuses_count"]
        d["verified"] = status["user"]["verified"]
        d["location"] = 0 if (len(status["user"]["location"].encode("utf-8")) == 0) else 1
        d["text"] = status["text"].encode("utf-8")
        d["retweet_count"] = status["retweet_count"]
        d["favorite_count"] = status["favorite_count"]
        d["hashtag_count"] = len(status["entities"]["hashtags"])
        d["url_count"] = len(status["entities"]["urls"])
        d["mentions_count"] = len(status["entities"]["user_mentions"])
        if len(status["entities"]["urls"]) > 0:
            for x in range(0, len(status["entities"]["urls"])):
                d["links"] += status["entities"]["urls"][x]["expanded_url"].encode("utf-8") + "  "
        df = df.append(d, ignore_index=True)
        df.to_csv("NSamples.csv")  # Saving file to disk
        d["links"] = ""
    print "\nAll Done!"
def main():
    ##  Set default trace out dir, and get wide kernel names
    trace_out_dir = path.join(dir_script, "../output/trace")
    wide_kernel_names = get_wide_kernel_names_trace(trace_out_dir)

    ##  Get wide bench names
    wide_bench_names = get_wide_bench_names(wide_kernel_names)

    duration_frame = DataFrame(index = wide_bench_names)
    duration_root_dir = path.join(dir_script, "../log")

    duration_frame['base_model'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'base_model'))
    duration_frame['model'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'model'))
    duration_frame['model_compare'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'opt_break/trace_off'))
    duration_frame['base_trace'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'base_trace'))
    duration_frame['trace'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'trace'))
    duration_frame['profiler'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'profiler'))
    duration_frame['sim'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'sim'))
    
    duration_frame['opt_break_trace_off'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'opt_break/trace_off'))

    breakdown_frame_index_wide_bench_name(duration_frame)

    duration_out_file = path.join(dir_script, "../output/duration.csv")
    duration_frame.to_csv(duration_out_file)
def predict(test_set: DataFrame, model: LogisticRegression, reg, filename):
    test_df = test_set.filter(regex=reg)
    test_np = test_df.as_matrix()
    predictions = model.predict(test_np)
    result = DataFrame({'PassengerId': test_set['PassengerId'].as_matrix(),
                        'Survived': predictions.astype(np.int32)})
    result.to_csv(filename, index=False)
Example #28
    def test_to_csv_decimal(self):
        # GH 781
        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

        expected_default = ',col1,col2,col3\n0,1,a,10.1\n'
        assert df.to_csv() == expected_default

        expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n'
        assert df.to_csv(decimal=',', sep=';') == expected_european_excel

        expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n'
        assert df.to_csv(float_format='%.2f') == expected_float_format_default

        expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
        assert df.to_csv(decimal=',', sep=';',
                         float_format='%.2f') == expected_float_format

        # GH 11553: testing if decimal is taken into account for '0.0'
        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
        expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
        assert df.to_csv(index=False, decimal='^') == expected

        # same but for an index
        assert df.set_index('a').to_csv(decimal='^') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected
Example #29
def submission(fname_net="net4.pickle"):
    with open(fname_net, "rb") as f:
        net = pickle.load(f)  # net = specialists

    X = load.load2d(test=True)[0]
    y_pred = net.predict(X)
    print "Finish predict test file"
    columns = "left_eye_center_x,left_eye_center_y,right_eye_center_x,right_eye_center_y,left_eye_inner_corner_x,left_eye_inner_corner_y,left_eye_outer_corner_x,left_eye_outer_corner_y,right_eye_inner_corner_x,right_eye_inner_corner_y,right_eye_outer_corner_x,right_eye_outer_corner_y,left_eyebrow_inner_end_x,left_eyebrow_inner_end_y,left_eyebrow_outer_end_x,left_eyebrow_outer_end_y,right_eyebrow_inner_end_x,right_eyebrow_inner_end_y,right_eyebrow_outer_end_x,right_eyebrow_outer_end_y,nose_tip_x,nose_tip_y,mouth_left_corner_x,mouth_left_corner_y,mouth_right_corner_x,mouth_right_corner_y,mouth_center_top_lip_x,mouth_center_top_lip_y,mouth_center_bottom_lip_x,mouth_center_bottom_lip_y"
    columns = columns.split(",")

    y_pred = y_pred * 48 + 48
    y_pred = y_pred.clip(0, 96)
    df = DataFrame(y_pred, columns=columns)

    lookup_table = read_csv(os.path.expanduser("./data/IdLookupTable.csv"))
    values = []

    for index, row in lookup_table.iterrows():
        values.append((row["RowId"], df.ix[row.ImageId - 1][row.FeatureName]))

    now_str = datetime.now().isoformat().replace(":", "-")
    submission = DataFrame(values, columns=("RowId", "Location"))
    filename = "submission-{}.csv".format(now_str)
    submission.to_csv(filename, index=False)
    print ("Wrote {}".format(filename))
def main(train_file, test_file):
  #print "loading data.."
  csv.field_size_limit(1310720)
  trainreader = csv.reader (open( '/home/kiran/kdd/train.csv' ))
  projectid, traindata_old = zip (*trainreader)  

  testreader = csv.reader (open ('/home/kiran/kdd/test.csv'))
  projectid, testdata_old = zip (*testreader)


  # remove stopwords
  traindata = []
  testdata = []
  for observation in traindata_old:
      traindata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))
  for observation in testdata_old:
      testdata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))

  tfv = CountVectorizer (binary=1,ngram_range=(1, 1))
  X_all = traindata + testdata
  lentrain = len(traindata)
  tfv.fit(X_all)
  X_all = tfv.transform(X_all)
  X = X_all[:lentrain]
  X_test = X_all[lentrain:]
  scipy.io.mmwrite ('x_train_bin_1gram.mtx', X, field = 'real')
  scipy.io.mmwrite ('x_test_bin_1gram.mtx', X_test, field = 'real')
  myCols = tfv.get_feature_names ()
  myCols = DataFrame (myCols)
  myCols.to_csv ('bin_1gram.csv', index=False)
Example #31
        callbacks=callbacks_list)

# make a prediction
yhat = model.predict(test_X)
yhat = yhat[:, 0]
allset = np.column_stack((yhat, yhat, yhat))  # stack the 1-D prediction into 3 columns so the 3-feature scaler can invert it
yhat = scaler.inverse_transform(allset)[:, [2]]

pred = DataFrame(yhat)
pred.columns = ["pred"]
real = GRN_MTR.iloc[(len(GRN_MTR) - len(pred)):, :]
real.columns = ["real"]
ambas = pd.concat([pred.set_index(real.index), real],
                  axis=1,
                  ignore_index=False)
ambas['diff'] = ambas.real - ambas.pred
ambas = DataFrame(ambas)
ambas.columns = ["pred", "real", "diff"]

# Save results
plt.figure()
plot = ambas.plot(figsize=(30,5),title="Prediction vs real "+"test "\
        +args.yi+"/"+args.yf+" "+args.mi+"/"+args.mf)
fig = plot.get_figure()
fig.savefig("/figs/Prediction_vs_real_test_"+args.yi+"-"+args.yf\
        +"_"+args.mi+"-"+args.mf+".png")

ambas.to_csv("/data/Prediction_vs_real_test_"+args.yi+\
        "_"+args.yf+"_"+args.mi+"_"+args.mf+".csv",index=False)
 def write_results_to_file(self):
     print(logstamp(), "WRITING RESULTS...")
     df = DataFrame(self.results)
     df.index.name = "row_id"
     df.index = df.index + 1
     df.to_csv(self.local_results_filepath)
Example #33
def dump_results(
    dframe: pd.DataFrame,
    csvfile: Optional[str] = None,
    yamlfile: Optional[str] = None,
    resinsightfile: Optional[str] = None,
    ertfile: Optional[str] = None,
) -> None:
    """Dump dataframe with ERT observations to CSV and/or YML
    format to disk. Writes to stdout if filenames are "-". Skips
    export if filenames are empty or None.

    Args:
        dframe
        csvfile: Filename
        yamlfile: Filename
        resinsightfile: Filename
        ertfile: Filename
    """

    if not (csvfile or yamlfile or resinsightfile or ertfile):
        logger.warning("No output filenames provided")
    if csvfile:
        if csvfile != __MAGIC_STDOUT__:
            logger.info("Writing observations as CSV to %s", csvfile)
            dframe.to_csv(csvfile, index=False)
        else:
            # Ignore pipe errors when writing to stdout:
            signal.signal(signal.SIGPIPE, signal.SIG_DFL)
            dframe.to_csv(sys.stdout, index=False)

    if yamlfile:
        obs_dict_for_yaml = df2obsdict(dframe)
        if not obs_dict_for_yaml and not dframe.empty:
            logger.error("None of your observations are supported in YAML")
        yaml_str = yaml.safe_dump(obs_dict_for_yaml)

        if yamlfile != __MAGIC_STDOUT__:
            logger.info(
                "Writing observations in YAML (webviz) format to file: %s",
                yamlfile,
            )
            with open(yamlfile, "w") as f_handle:
                f_handle.write(yaml_str)
        else:
            print(yaml_str)

    if resinsightfile:
        ri_dframe = df2resinsight_df(dframe)
        if resinsightfile != __MAGIC_STDOUT__:
            logger.info(
                "Writing observations in ResInsight format to CSV-file: %s",
                resinsightfile,
            )
            ri_dframe.to_csv(resinsightfile, index=False, sep=";")
        else:
            # Ignore pipe errors when writing to stdout:
            signal.signal(signal.SIGPIPE, signal.SIG_DFL)
            ri_dframe.to_csv(sys.stdout, index=False, sep=";")

    if ertfile:
        ertobs_str = df2ertobs(dframe)
        if ertfile != __MAGIC_STDOUT__:
            with open(ertfile, "w") as f_handle:
                logger.info("Writing ERT observation format to %s", ertfile)
                f_handle.write(ertobs_str)
        else:
            print(ertobs_str)
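
The "-" convention used throughout dump_results reduces to a small helper; a sketch under the assumption that only the CSV branch is needed:

import sys
import pandas as pd

def write_csv(dframe: pd.DataFrame, target: str) -> None:
    """Write to stdout when the target filename is '-', else to the named file."""
    if target == "-":
        dframe.to_csv(sys.stdout, index=False)
    else:
        dframe.to_csv(target, index=False)

write_csv(pd.DataFrame({"obs": [1.2, 3.4]}), "-")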
Example #34
            prev_price = current_price

        if continuous_increase_counter == 0:
            i += 1
        else:
            i += continuous_increase_counter

    return pivots_list

def convert_pivot_sequences_to_training_seq(directory, filename_with_seq, filename_with_ohlc):
    pivot_sequences = read_csv(directory + filename_with_seq, sep=';', encoding='utf-8', index_col=0)
    ohlc = read_csv(directory + filename_with_ohlc, sep=';', encoding='utf-8', index_col=0)



py.init_notebook_mode(connected=True)
# file_path = save_all_trades('LTCUSD')
# print_ohlc_from_csv('result/LTCUSD/', 'LTCUSD_2017_10_11_12_01_03.txt', 'LTCUSD', mode='markers')

result = find_pivot_sequences('result/LTCUSD/', 'LTCUSD_2017_bid.csv')
data = DataFrame(result, columns=['start_index', 'max_index', 'length', 'pivot_price', 'current_time'])
data.to_csv('result/LTCUSD/LTCUSD_2017_pivots.csv', sep=';', encoding='utf-8')

training_sequences = convert_pivot_sequences_to_training_seq('result/LTCUSD/', 'LTCUSD_2017_pivots.csv')
training = DataFrame(training_sequences, columns=['start', 'max', 'length', 'pivot_price', 'current_time'])
training.to_csv('result/LTCUSD/LTCUSD_2017_pivots.csv', sep=';', encoding='utf-8')

#result = find_longest_continious_sequence('result/LTCUSD/', 'LTCUSD_2017_10_11_12_01_03_bid.csv', 'LTCUSD')
print("Finish")
Example #35
def save_disclosure_data(df: DataFrame):
    df.to_csv("../data/pfd_final.csv")
Example #36

train_corpus = clean_text(df_train)

cv = CountVectorizer(max_features=1500)
X_train = cv.fit_transform(train_corpus).toarray()
Y_train = df_train.iloc[:, 2]

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

test_corpus = clean_text(df_test)
X_test = cv.transform(test_corpus).toarray()  # reuse the vocabulary fitted on the training corpus
Y_pred = classifier.predict(X_test)

from pandas import DataFrame
pred = DataFrame(Y_pred)
pred.to_csv(r'C:\Users\kajah\Desktop\result.csv')

#from sklearn.naive_bayes import GaussianNB
#classifier = GaussianNB()
#classifier.fit(X_train,Y_train)
#
#Y_pred = classifier.predict(X_test)
#from sklearn.svm import SVC
#classifier = SVC(kernel = 'linear', random_state = 0)
#classifier.fit(X_train, Y_train)
## Predicting the test set results
#Y_pred = classifier.predict(X_test)
Example #37
    for i in friends:
        value = i[var]
        variable.append(value)
    return variable


# Call the functions to get each variable, then save the data to a CSV file (saved to the desktop)
NickName = get_var("NickName")
Sex = get_var('Sex')
Province = get_var('Province')
City = get_var('City')
Signature = get_var('Signature')
from pandas import DataFrame
data = {
    'NickName': NickName,
    'Sex': Sex,
    'Province': Province,
    'City': City,
    'Signature': Signature
}
frame = DataFrame(data)
frame.to_csv('data2.csv', index=True, encoding='utf8')
import re
siglist = []
for i in friends:
    signature = i["Signature"].strip().replace("span", "").replace(
        "class", "").replace("emoji", "")
    rep = re.compile("1f\d+\w*|[<>/=]")
    signature = rep.sub("", signature)
    siglist.append(signature)
text = "".join(siglist)
# God Player
God = [200, 60]  # God[0] = 200 and God[1] = 60
# Combining Datasets
SecondB = SecondB_PCA.join(SecondB_Info)
#print(FirstB['pc1'][0])

# create list of the best Players
Closiest_to_God = np.empty([len(SecondB), 1])
count = 0
while count < len(SecondB):
    Closiest_to_God[count][0] = sqrt(((SecondB['pc1'][count] - God[0])**2) +
                                     ((SecondB['pc2'][count] - God[1])**2))
    #Closiest_to_God[count][1] = FirstB_Names['Players'][count+1]
    #print(Closiest_to_God[count][0])
    #    print(count)
    count += 1
    if count >= len(SecondB):
        break
print(len(Closiest_to_God))
print(len(SecondB_Info))
Closiest_to_God = np.hstack((Closiest_to_God, SecondB_Info[1:]))
# Attaches the names of each player to their Euclidean distance to God

Closiest_to_God = Closiest_to_God[Closiest_to_God[:, 0].argsort()]
print(Closiest_to_God)

##Export to CSV
dataset = DataFrame(Closiest_to_God)
df = dataset.to_csv(r'export/SecondB_List.csv', index=None, header=True)
print(dataset)
Example #39
# coding: utf-8

import pandas as pd
import numpy as np
from pandas import DataFrame

lgb_sub = pd.read_csv("../buptloc/sub_lgb_12_0527_1140.csv")

lgb_sub = DataFrame(lgb_sub)

for j in lgb_sub.columns:
    for k in range(len(lgb_sub)):
        if lgb_sub[j][k] < 0:
            lgb_sub[j][k] = - lgb_sub[j][k]

lgb_sub.to_csv("../buptloc/sub_lgb_12_0527_1140.csv")
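
The nested loop above flips the sign of every negative cell one element at a time; on a purely numeric frame the same result is DataFrame.abs(), so an equivalent sketch (assuming every column in the file is numeric) is:

import pandas as pd

lgb_sub = pd.read_csv("../buptloc/sub_lgb_12_0527_1140.csv")
lgb_sub = lgb_sub.abs()      # same effect as the element-wise loop
lgb_sub.to_csv("../buptloc/sub_lgb_12_0527_1140.csv")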
Example #40
 def _write_loop_block(self, df: pd.DataFrame):
     self.write_loopheader(df)
     df.to_csv(self.filename, mode='a', sep=self.sep, header=False, index=False,
               float_format=self.float_format, na_rep=self.na_rep)
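
mode='a' with header=False is what lets several loop blocks share one file; a standalone sketch, assuming a plain CSV target, that writes the header once and then appends further chunks:

import pandas as pd

chunks = [pd.DataFrame({'a': [1, 2]}), pd.DataFrame({'a': [3, 4]})]
chunks[0].to_csv('chunked.csv', index=False)                    # first chunk writes the header
for chunk in chunks[1:]:
    chunk.to_csv('chunked.csv', mode='a', header=False, index=False)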
def _1(data: pd.DataFrame) -> QualityFilterStatsFmt:
    ff = QualityFilterStatsFmt()
    data.to_csv(str(ff))
    return ff
Example #42
                        categoria = 4
                        # done

                    if 'decretada a liquidacao extrajudicial' in pub:
                        categoria = 5

                    #if 'recuperacao judicial' in pub:
                    #if 'processamento da recuperacao judicial' in pub and ('concedido o processamento' in pub or 'plano de recuperacao' in pub):
                    if 'processamento da recuperacao judicial' in pub:
                        categoria = 6

                    if categoria != 0:
                        id += 0
                        dict[pub] = categoria
                        #print (id.__str__()+"\n\nPUB =>"+i.__str__()+"\nCATEGORIA IDENTIFICADA => "+categoria.__str__())
                        if (id == -300):
                            break
            if id == -300:
                break
        if id == -300:
            break
    if id == -300:
        break

print("Arquivos lidos {0}".format(arqs_lidos))
print("Pubs lidas {0}".format(pubs_lidas))
df = DataFrame(list(dict.items()), columns=['pub', 'categoria'])
print(df.categoria.value_counts())
export_csv = df.to_csv(r'export_dataframe_13112019.csv',
                       index=None,
                       header=True)
Example #43
def write_xgb_predictions(predictions: pd.DataFrame, summary_file):
    predictions = flatten_predictions(predictions)
    output = predictions.to_csv(index=False, float_format="%f")

    with open(summary_file, "wt") as fobj:
        fobj.write(output)
Example #44
df_4["Parch_C"] = df_4["Parch"].astype('category')
df_4["Titel_C"] = df_4["Titel"].astype('category')
df_4["Age_G"] = pd.cut(
    df_4["Age"], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 120],
    labels=[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 120])
df_4["Age_E"] = pd.cut(df_4["Age"], [0, 15, 20, 120], labels=[15, 20, 120])
df_4["Age_Er"] = pd.cut(df_4["Age"], [0, 15, 120], labels=[15, 120])
df_4.dtypes

t, Xtest = dmatrices(
    'survived ~ Parch + SibSp + C(Pclass) + C(Titel) +C(Alone)',
    df_2,
    return_type="dataframe")

Xtest = Xtest.rename(
    columns={
        'C(Pclass)[T.2]': 'midClass',
        'C(Pclass)[T.3]': 'lowClass',
        'C(Sex)[T.male]': 'male',
        'C(Age_Er)[T.120]': 'Erwachsen'
    })

yPredict = svm_Model.predict(Xtest)
myResult = DataFrame()
myResult["PassengerId"] = df_4["PassengerId"]
myResult["Survived"] = yPredict
myResult.dtypes
myResult["Survived"] = myResult["Survived"].astype('int64')
myResult = myResult.sort_values(['PassengerId'])
myResult.to_csv("H:/DATA/Python/Titanic/myResult.csv", index=False)
Example #45
    def load_df(
        self,
        df: pandas.DataFrame,
        table: str,
        field_dict: Optional[Dict[Any, Any]] = None,
        delimiter: str = ',',
        encoding: str = 'utf8',
        pandas_kwargs: Any = None,
        **kwargs: Any,
    ) -> None:
        """
        Loads a pandas DataFrame into hive.

        Hive data types will be inferred if not passed but column names will
        not be sanitized.

        :param df: DataFrame to load into a Hive table
        :type df: pandas.DataFrame
        :param table: target Hive table, use dot notation to target a
            specific database
        :type table: str
        :param field_dict: mapping from column name to hive data type.
            Note that it must be OrderedDict so as to keep columns' order.
        :type field_dict: collections.OrderedDict
        :param delimiter: field delimiter in the file
        :type delimiter: str
        :param encoding: str encoding to use when writing DataFrame to file
        :type encoding: str
        :param pandas_kwargs: passed to DataFrame.to_csv
        :type pandas_kwargs: dict
        :param kwargs: passed to self.load_file
        """
        def _infer_field_types_from_df(df: pandas.DataFrame) -> Dict[Any, Any]:
            dtype_kind_hive_type = {
                'b': 'BOOLEAN',  # boolean
                'i': 'BIGINT',  # signed integer
                'u': 'BIGINT',  # unsigned integer
                'f': 'DOUBLE',  # floating-point
                'c': 'STRING',  # complex floating-point
                'M': 'TIMESTAMP',  # datetime
                'O': 'STRING',  # object
                'S': 'STRING',  # (byte-)string
                'U': 'STRING',  # Unicode
                'V': 'STRING',  # void
            }

            order_type = OrderedDict()
            for col, dtype in df.dtypes.iteritems():
                order_type[col] = dtype_kind_hive_type[dtype.kind]
            return order_type

        if pandas_kwargs is None:
            pandas_kwargs = {}

        with TemporaryDirectory(prefix='airflow_hiveop_') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, mode="w") as f:
                if field_dict is None:
                    field_dict = _infer_field_types_from_df(df)

                df.to_csv(
                    path_or_buf=f,
                    sep=delimiter,
                    header=False,
                    index=False,
                    encoding=encoding,
                    date_format="%Y-%m-%d %H:%M:%S",
                    **pandas_kwargs,
                )
                f.flush()

                return self.load_file(filepath=f.name,
                                      table=table,
                                      delimiter=delimiter,
                                      field_dict=field_dict,
                                      **kwargs)
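
The dtype-kind mapping inside _infer_field_types_from_df can be exercised on its own; a small sketch, assuming only pandas, of what it infers for a mixed frame:

from collections import OrderedDict
import pandas as pd

df = pd.DataFrame({
    'flag': [True, False],
    'n': [1, 2],
    'x': [0.5, 1.5],
    'ts': pd.to_datetime(['2021-01-01', '2021-01-02']),
    'name': ['a', 'b'],
})
kind_to_hive = {'b': 'BOOLEAN', 'i': 'BIGINT', 'u': 'BIGINT', 'f': 'DOUBLE',
                'M': 'TIMESTAMP', 'O': 'STRING'}
field_dict = OrderedDict((col, kind_to_hive[dtype.kind]) for col, dtype in df.dtypes.items())
# flag -> BOOLEAN, n -> BIGINT, x -> DOUBLE, ts -> TIMESTAMP, name -> STRING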
Example #46
from pandas import DataFrame
import sqlite3 as sql

with sql.connect("app.db") as con:
    con.row_factory = sql.Row

    #Query 1 - Whether or not a given username, password combination is valid: right username
    cur = con.cursor()
    cur.execute(
        "SELECT user.uid FROM user WHERE user.uid = 1 AND user.password = '******'"
    )
    rows = cur.fetchall()
    df = DataFrame(rows)
    df.to_csv(r'./app/csv/01test.csv', index=False)
    cur.close()

    # Query 2 - Whether or not a given username, password combination is valid: wrong username
    cur = con.cursor()
    cur.execute(
        "SELECT user.uid FROM user WHERE user.uid = 1 AND user.password = '******'"
    )
    rows = cur.fetchall()
    df = DataFrame(rows)
    df.to_csv(r'./app/csv/02test.csv', index=False)
    cur.close()

    # Query 3 - Airlines whose name or abbreviation contain some substring
    cur = con.cursor()
    cur.execute(
        "SELECT * FROM airline WHERE airline.name LIKE '%Delta%' OR airline.iata LIKE '%Delta'"
    )
from pandas import DataFrame
from pyhive import hive
import boto3
import json

s3 = boto3.client('s3')
s3.download_file('aws-logs-723293022411-us-east-1',
                 'bootstrap-test/hiveConf.json', 'hiveConf.json')

f = open("hiveConf.json", "r")

jsonFile = json.load(f)

cursor = hive.connect(jsonFile["hiveConf"]["host"]).cursor()
cursor.execute(jsonFile["hiveConf"]["execute1"])
cursor.execute(jsonFile["hiveConf"]["execute2"])
cursor.execute(jsonFile["hiveConf"]["execute3"])
df = DataFrame(cursor.fetchall())
df.to_csv(jsonFile["hiveConf"]["saveTo"])
    clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=20,
                        max_depth=4,
                        objective='binary:logistic')
    folds = KFold(10, True, 1)
    #getshapvalue(instances, y, clf, vdim)
    auc_score, accuracy, sensitivity, specificity, MCC = getCrossValidation(
        instances, y, vdim, clf, folds)

    print('results for feature:' + featurename)
    print(
        '****AUC score:%.3f, accuracy:%.3f, sensitivity:%.3f, specificity:%.3f, MCC:%.3f****'
        % (auc_score, accuracy, sensitivity, specificity, MCC))

    toc = time.clock()
    print('The prediction time: %.3f minutes' % ((toc - tic) / 60.0))
    print(
        '###############################################################################\n'
    )

    # output result
    results = DataFrame({'Feature': [featurename], \
                         'AUC': [auc_score], \
                         'ACC': [accuracy], \
                         'SN': [sensitivity], \
                         'SP': [specificity], \
                         'MCC': [MCC]}
                        )
    results = results[['Feature', 'AUC', 'ACC', 'SN', 'SP', 'MCC']]
    results.to_csv(featurename + 'Results1.csv', index=False)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x = np.array(df2['TipoMaturità'])
y = np.array(df2['VotoDiploma'])
z = np.array(df2['CFU1'])
ax.set_xlabel('TipoMaturità')
ax.set_ylabel('VotoDiploma')
ax.set_zlabel('CFU1')

ax.scatter(x,y,z, marker="s", c=kmeans.labels_.astype(float), s=50, cmap="RdBu", alpha=0.5 )

plt.show()
"""


df_final.to_csv("./ListaStudentCluster.csv", sep=',', index=False,)
ClusterFile = [tuple(row) for row in csv.reader(open("./ListaStudentCluster.csv", 'r'))] # FIX THE PATHS SO THAT THIS FILE CAN BE REACHED



for i in range(1, len(ClusterFile)):

    if int(ClusterFile[i][4]) == 0:
        count0 = count0 +1
        if int(ClusterFile[i][3]) > 0:
            countFc0 = countFc0 + 1
    if int(ClusterFile[i][4]) == 1:
        count1 = count1 +1
        if int(ClusterFile[i][3]) > 0:
            countFc1 = countFc1 + 1
"""
Example #50
    def upload_predictions(self,
                           file_path: str = "predictions.csv",
                           tournament: int = 8,
                           model_id: str = None,
                           df: pd.DataFrame = None) -> str:
        """Upload predictions from file.
        Will read TRIGGER_ID from the environment if this model is enabled with
        a Numerai Compute cluster setup by Numerai CLI.

        Args:
            file_path (str): CSV file with predictions that will get uploaded
            tournament (int): ID of the tournament (optional, defaults to 8)
                -- DEPRECATED there is only one tournament nowadays
            model_id (str): Target model UUID (required for accounts with
                multiple models)
            df (pandas.DataFrame): pandas DataFrame to upload, if function is
                given df and file_path, df will be uploaded.

        Returns:
            str: submission_id

        Example:
            >>> api = NumerAPI(secret_key="..", public_id="..")
            >>> model_id = api.get_models()['uuazed']
            >>> api.upload_predictions("prediction.cvs", model_id=model_id)
            '93c46857-fed9-4594-981e-82db2b358daf'
            >>> # upload from pandas DataFrame directly:
            >>> api.upload_predictions(df=predictions_df, model_id=model_id)
        """
        self.logger.info("uploading predictions...")

        # write the pandas DataFrame as a binary buffer if provided
        buffer_csv = None

        if df is not None:
            buffer_csv = BytesIO(df.to_csv(index=False).encode())
            buffer_csv.name = file_path

        auth_query = '''
            query($filename: String!
                  $tournament: Int!
                  $modelId: String) {
                submission_upload_auth(filename: $filename
                                       tournament: $tournament
                                       modelId: $modelId) {
                    filename
                    url
                }
            }
            '''
        arguments = {
            'filename': os.path.basename(file_path),
            'tournament': tournament,
            'modelId': model_id
        }
        submission_resp = self.raw_query(auth_query,
                                         arguments,
                                         authorization=True)
        submission_auth = submission_resp['data']['submission_upload_auth']

        # get compute id if available and pass it along
        headers = {"x_compute_id": os.getenv("NUMERAI_COMPUTE_ID")}
        with open(file_path, 'rb') if df is None else buffer_csv as fh:
            requests.put(submission_auth['url'],
                         data=fh.read(),
                         headers=headers)
        create_query = '''
            mutation($filename: String!
                     $tournament: Int!
                     $modelId: String
                     $triggerId: String) {
                create_submission(filename: $filename
                                  tournament: $tournament
                                  modelId: $modelId
                                  triggerId: $triggerId) {
                    id
                }
            }
            '''
        arguments = {
            'filename': submission_auth['filename'],
            'tournament': tournament,
            'modelId': model_id,
            'triggerId': os.getenv('TRIGGER_ID', None)
        }
        create = self.raw_query(create_query, arguments, authorization=True)
        submission_id = create['data']['create_submission']['id']
        return submission_id
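
The in-memory upload path above (BytesIO over df.to_csv) avoids a temp file entirely; a reduced sketch with a placeholder URL standing in for the presigned submission_auth['url']:

from io import BytesIO
import pandas as pd
import requests

df = pd.DataFrame({'id': ['n1', 'n2'], 'prediction': [0.4, 0.6]})
buffer_csv = BytesIO(df.to_csv(index=False).encode())
buffer_csv.name = 'predictions.csv'
# 'https://example.com/upload' is a placeholder for the presigned upload URL
requests.put('https://example.com/upload', data=buffer_csv.read())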
def store_csv(df: pd.DataFrame, fpath: str):
    """ store as csv """
    # save as csv
    fpath = fpath + ".csv"
    df.to_csv(fpath, index=False)
    def test_to_csv_quoting(self):
        df = DataFrame({
            'c_bool': [True, False],
            'c_float': [1.0, 3.2],
            'c_int': [42, np.nan],
            'c_string': ['a', 'b,c'],
        })

        expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,"b,c"
"""
        result = df.to_csv()
        assert result == expected

        result = df.to_csv(quoting=None)
        assert result == expected

        result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
        assert result == expected

        expected = """\
"","c_bool","c_float","c_int","c_string"
"0","True","1.0","42.0","a"
"1","False","3.2","","b,c"
"""
        result = df.to_csv(quoting=csv.QUOTE_ALL)
        assert result == expected

        # see gh-12922, gh-13259: make sure changes to
        # the formatters do not break this behaviour
        expected = """\
"","c_bool","c_float","c_int","c_string"
0,True,1.0,42.0,"a"
1,False,3.2,"","b,c"
"""
        result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
        assert result == expected

        msg = "need to escape, but no escapechar set"
        tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                               quoting=csv.QUOTE_NONE)
        tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                               quoting=csv.QUOTE_NONE,
                               escapechar=None)

        expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,b!,c
"""
        result = df.to_csv(quoting=csv.QUOTE_NONE,
                           escapechar='!')
        assert result == expected

        expected = """\
,c_bool,c_ffloat,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,bf,c
"""
        result = df.to_csv(quoting=csv.QUOTE_NONE,
                           escapechar='f')
        assert result == expected

        # see gh-3503: quoting Windows line terminators
        # presents with encoding?
        text = 'a,b,c\n1,"test \r\n",3\n'
        df = pd.read_csv(StringIO(text))
        buf = StringIO()
        df.to_csv(buf, encoding='utf-8', index=False)
        assert buf.getvalue() == text

        # xref gh-7791: make sure the quoting parameter is passed through
        # with multi-indexes
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
        df = df.set_index(['a', 'b'])
        expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
        assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
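# Outside the test harness the same quoting modes apply directly to
# DataFrame.to_csv. A short sketch (the frame below is a stand-in, not the test data):
import csv

import pandas as pd

frame = pd.DataFrame({"c_string": ["a", "b,c"], "c_float": [1.0, 3.2]})

quoted_all = frame.to_csv(quoting=csv.QUOTE_ALL)            # every field quoted
quoted_nonnum = frame.to_csv(quoting=csv.QUOTE_NONNUMERIC)  # only non-numeric fields quoted
unquoted = frame.to_csv(quoting=csv.QUOTE_NONE, escapechar="\\")  # no quotes; embedded commas escaped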
Beispiel #53
0
def df_to_tsv(df: pd.DataFrame, outfile: str, sep="\t", index=False) -> None:
    df.to_csv(outfile, sep=sep, index=index)
Beispiel #54
0
def load(self, df: pd.DataFrame, output_path: str) -> None:
    # Export data
    df.to_csv(output_path, index=False)
Beispiel #55
0
def homeSourceFilter(communityName, communityLink):
    """
    // 小区房源基本信息存储list
    """
    addresses = []
    floods = []
    followInfoes = []
    """
    // 获取首页
    """
    res = requests_get(communityLink, timeout=(20, 60))
    soup = BeautifulSoup(res.text, 'lxml')
    """
    // 获取分页信息
    """
    if len(soup.select('div[class="page-box house-lst-page-box"]')) == 0:
        print("{0}:未找到分页信息".format(communityName))
        raise Exception("未找到分页信息")
    else:
        page_data = soup.select(
            'div[class="page-box house-lst-page-box"]')[0]['page-data']
        page_data = eval(page_data)
        totalPage = page_data["totalPage"]
    """
    // 分页处理
    // 涵盖只有单页的情况
    """
    for pageIndex in range(1, totalPage + 1):
        if (pageIndex > 1):
            insertIndex = communityLink.rfind('/', 0, -2)
            pageLink = communityLink[:insertIndex + 1] + 'pg' + str(
                pageIndex) + communityLink[insertIndex + 1:]
            res = requests_get(pageLink, timeout=(20, 60))
            soup = BeautifulSoup(res.text, 'lxml')
        """
        // 通过li标签定位房源
        // 每个房源有三个class属性存有所需信息: address, flood, followInfo
        """
        houseResourceList = soup.select('li[class="clear"]')

        for houseResource in houseResourceList:
            """
            // address
            """
            address = houseResource.find_all('div', class_="address")
            """
            // flood
            """
            flood = houseResource.find_all('div', class_="flood")
            """
            // followInfo
            """
            followInfo = houseResource.find_all('div', class_="followInfo")

            addresses.append("".join(list(address[0].stripped_strings)))
            floods.append("".join(list(flood[0].stripped_strings)))
            followInfoes.append("".join(list(followInfo[0].stripped_strings)))

    allHomeSourceInfo = DataFrame({
        "address": addresses,
        "flood": floods,
        "followInfo": followInfoes
    })
    allHomeSourceInfo.to_csv(os_getcwd() + "\\" + "AllCommunity\\" +
                             communityName + ".csv",
                             encoding='gbk')
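# The pagination above works by splicing 'pg<N>' into the community URL right
# before its last path segment. A small sketch of that splice on a hypothetical
# Lianjia-style URL (the URL below is made up):
def page_link(community_link, page_index):
    # mirror the rfind-based splice used in homeSourceFilter
    insert_index = community_link.rfind('/', 0, -2)
    return (community_link[:insert_index + 1] + 'pg' + str(page_index)
            + community_link[insert_index + 1:])

# page_link('https://example.lianjia.com/ershoufang/c123456/', 2)
# -> 'https://example.lianjia.com/ershoufang/pg2c123456/'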
Beispiel #56
0
        os.makedirs(newpath.rsplit('/',1)[0])
    shutil.copy(fullpath, newpath)
    

def copy_bag_file(fullpath):
    FLY_ID, FMF_TIME, GROUP = parse_fmftime(fullpath.rsplit('/',1)[0])
    BAG_FILE = match_fmf_and_bag(FMF_TIME)
    shutil.copy(BAG_FILE, (POOL_DIR + 'BAGS/'))

baglist = []
for bag in glob.glob(SEARCH_DIR + 'BAGS/*.bag'):
    bagtimestamp = parse_bagtime(bag)
    baglist.append((bag, bagtimestamp))
bagframe = DataFrame(baglist, columns=['Filepath', 'Timestamp'])
bagframe.index = pd.to_datetime(bagframe['Timestamp'])
bagframe = bagframe.sort_index()

    
filelist = []  # collect the matched experiment directories
for matching_dir in glob.glob(SEARCH_DIR + SEARCH_TERM):
    filelist.append(matching_dir)
    for fn in find_files(matching_dir, 'frame_by_frame_synced.pickle'):
        copy_with_dirs(fn)
        copy_with_dirs(fn.rsplit('/',1)[0] + '/tracking_info.pickle')
        copy_with_dirs(fn.rsplit('/',1)[0] + '/wingdata.pickle')
        copy_bag_file(fn)



fileDF = DataFrame(filelist)
fileDF.to_csv(POOL_DIR + 'filelist.txt', sep='\n', header=None, index=None)
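# match_fmf_and_bag is referenced above but not shown in this excerpt. Assuming
# it returns the bag whose timestamp lies closest to FMF_TIME, one plausible
# stand-in (a guess, not the original implementation) could reuse the
# timestamp-indexed bagframe built above:
def nearest_bag_for(fmf_time):
    # absolute time difference between every bag timestamp and the fmf time
    deltas = (bagframe.index.to_series() - pd.to_datetime(fmf_time)).abs()
    # the label of the smallest difference picks the closest bag
    return bagframe.loc[deltas.idxmin(), 'Filepath']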
Beispiel #57
0
def _4(data: pd.DataFrame) -> (SILVATaxidMapFormat):
    ff = SILVATaxidMapFormat()
    with ff.open() as fh:
        data.to_csv(fh, sep='\t', header=True)
    return ff
data = DataFrame(columns=columns)

for i in Copa_America:
    url=Copa_America[i][0]
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    table_body = soup.find('tbody')
    for j,row in enumerate(table_body.findAll('tr')):
        td = row.findAll('td')
        picture = td[0].find('img').get('data-src')
        pid = td[0].find('img').get('id')
        nationality = td[1].find('a').get('title')
        flag_img = td[1].find('img').get('data-src')
        name = td[1].findAll('a')[1].text
        age = int(td[2].text.strip())
        overall = int(td[3].text.strip())
        potential = int(td[4].text.strip())
        posicion = td[5].find("span").text.strip()
        contrato = td[5].find("div").text[-11:]
        value = td[7].text.strip()
        wage = td[7].text.strip()
        special = int(td[10].text.strip())
        player_data = DataFrame([[pid, name, age, picture, nationality, flag_img, overall,
                                      potential, posicion, contrato, value, wage, special]])
        player_data.columns = columns
        data = data.append(player_data, ignore_index=True)

data = data.drop_duplicates()
data.to_csv('Copa_america.csv', encoding='utf-8')
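# DataFrame.append, used row by row above, is deprecated in recent pandas. An
# equivalent pattern collects plain lists and builds the frame once at the end.
# A minimal sketch of that pattern with toy data and an illustrative subset of
# the columns used above:
import pandas as pd

toy_columns = ['id', 'name', 'overall']
rows = []  # one list per player instead of one single-row DataFrame per player
for pid, name, overall in [('158023', 'L. Messi', 94), ('20801', 'Cristiano Ronaldo', 93)]:
    rows.append([pid, name, overall])

data = pd.DataFrame(rows, columns=toy_columns).drop_duplicates()
data.to_csv('Copa_america_sketch.csv', encoding='utf-8')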
Beispiel #59
0
def _1(data: pd.DataFrame) -> (SILVATaxonomyFormat):
    ff = SILVATaxonomyFormat()
    with ff.open() as fh:
        data.to_csv(fh, sep='\t', header=False)
    return ff
Beispiel #60
0
    csv_name = 'querycsv'
    startNo = ''
    endNo = ''
    singleY = ''
    for op, value in opts:
        if op == '-s':
            startNo = value
        elif op == '-e':
            endNo = value
        elif op == '-y':
            singleY = value
        elif op == '-h':
            historical_data.usage()
            sys.exit()
    if debug: print(startNo)
    if debug: print(endNo)
    if debug: print(singleY)
    rs = []
    if startNo != '' and endNo != '':
        rs = historical_data.get_data_indentifier_range(startNo, endNo)
        csv_name = startNo + '-' + endNo + '-data'
    elif singleY != '':
        rs = historical_data.get_one_year_data(singleY)
        csv_name = singleY + '-data'
    else:
        rs = historical_data.get_all_data()
        csv_name = 'ALL-data'
    df = DataFrame(rs)
    df.to_csv('./result/' + csv_name + '.csv', index=False)
    print(df)
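# The excerpt above starts after the command-line parsing; the opts it iterates
# over would typically come from getopt. A hedged sketch of how that omitted
# parsing might look, using the flag set (-s, -e, -y, -h) referenced above:
import getopt
import sys

try:
    # -s/-e take a start/end identifier, -y a single year, -h prints usage
    opts, args = getopt.getopt(sys.argv[1:], 's:e:y:h')
except getopt.GetoptError as err:
    print(err)
    sys.exit(2)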