Code Example #1
File: test_parsers.py Project: bshanks/pandas
    def test_iterator(self):
        reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)

        df = read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.get_chunk(3)
        assert_frame_equal(chunk, df[:3])

        last_chunk = reader.get_chunk(5)
        assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        assert_frame_equal(chunks[0], df[:2])
        assert_frame_equal(chunks[1], df[2:4])
        assert_frame_equal(chunks[2], df[4:])

        treader = read_table(StringIO(self.data1), sep=',', index_col=0,
                             iterator=True)
        self.assert_(isinstance(treader, TextParser))
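
For reference, a minimal self-contained sketch of the iterator pattern exercised above, using invented inline data (assumes a reasonably recent pandas; not taken from the project's test suite):

from io import StringIO
from pandas import read_csv

data = "idx,A,B\n0,1,2\n1,3,4\n2,5,6\n3,7,8\n"

# iterator=True returns a TextFileReader instead of a DataFrame
reader = read_csv(StringIO(data), index_col=0, iterator=True)
print(reader.get_chunk(2))  # first two rows
print(reader.get_chunk(2))  # remaining two rows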
Code Example #2
File: test_network.py Project: AkiraKane/pandas
 def test_parse_public_s3_bucket_chunked(self):
     # Read with a chunksize
     chunksize = 5
     local_tips = read_csv(tm.get_data_path('tips.csv'))
     for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
         if comp == 'bz2' and compat.PY2:
             # The Python 2 C parser can't read bz2 from S3.
             self.assertRaises(ValueError, read_csv,
                               's3://pandas-test/tips.csv' + ext,
                               compression=comp)
         else:
             df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                  chunksize=chunksize, compression=comp)
             self.assertEqual(df_reader.chunksize, chunksize)
             for i_chunk in [0, 1, 2]:
                 # Read a couple of chunks and make sure we see them
                 # properly.
                 df = df_reader.get_chunk()
                 self.assertTrue(isinstance(df, DataFrame))
                 self.assertFalse(df.empty)
                 true_df = local_tips.iloc[
                     chunksize * i_chunk: chunksize * (i_chunk + 1)]
                 # Chunking doesn't preserve row numbering
                 true_df = true_df.reset_index().drop('index', axis=1)
                 tm.assert_frame_equal(true_df, df)
Code Example #3
    def test_deprecated_args(self, engine, kwargs):
        data = "1,2,3"
        arg, _ = list(kwargs.items())[0]

        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False):
            read_csv(StringIO(data), engine=engine, **kwargs)
Code Example #4
File: plots.py Project: aykol/MPWorks
def sg1_vs_sg2_plotly():
    """plot SG #1 vs #2 via plotly"""
    out_fig = Figure()
    bisectrix = Scatter(x=[0,230], y=[0,230], mode='lines', name='bisectrix', showlegend=False)
    inmatdb_df = read_csv('mpworks/check_snl/results/bad_snlgroups_2_in_matdb.csv')
    inmatdb_text = map(','.join, zip(
        inmatdb_df['task_id 1'], inmatdb_df['task_id 2']
    ))
    inmatdb_trace = Scatter(
        x=inmatdb_df['sg_num 2'].as_matrix(), y=inmatdb_df['sg_num 1'].as_matrix(),
        text=inmatdb_text, mode='markers', name='in MatDB'
    )
    notinmatdb_df = read_csv('mpworks/check_snl/results/bad_snlgroups_2_notin_matdb.csv')
    notinmatdb_text = map(','.join, zip(
        map(str, notinmatdb_df['snlgroup_id 1']), map(str, notinmatdb_df['snlgroup_id 2'])
    ))
    notinmatdb_trace = Scatter(
        x=notinmatdb_df['sg_num 2'].as_matrix()+0.1,
        y=notinmatdb_df['sg_num 1'].as_matrix()+0.1,
        text=notinmatdb_text, mode='markers', name='not in MatDB'
    )
    out_fig['data'] = Data([bisectrix, notinmatdb_trace, inmatdb_trace])
    out_fig['layout'] = Layout(
        hovermode='closest',
        title='Spacegroup Assignment Comparison of matching Canonical SNLs',
        xaxis=XAxis(showgrid=False, title='SG #2', range=[0,230]),
        yaxis=YAxis(showgrid=False, title='SG #1', range=[0,230]),
    )
    filename = 'spacegroup_canonicals_'
    filename += datetime.datetime.now().strftime('%Y-%m-%d') 
    py.plot(out_fig, filename=filename, auto_open=False)
    py.image.save_as(out_fig, 'canonicals_spacegroups.png')
Code Example #5
File: test_parsers.py Project: MikeLindenau/pandas
    def test_multi_index_no_level_names(self):
        data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""

        data2 = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""

        lines = data.split('\n')
        no_header = '\n'.join(lines[1:])
        names = ['A', 'B', 'C', 'D']
        df = read_csv(StringIO(no_header), index_col=[0, 1], names=names)
        expected = read_csv(StringIO(data), index_col=[0, 1])
        assert_frame_equal(df, expected)

        # 2 implicit first cols
        df2 = read_csv(StringIO(data2))
        assert_frame_equal(df2, df)
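
A minimal sketch of the index_col-list behaviour shown above, with invented data (a hedged illustration, not part of the original test):

from io import StringIO
from pandas import read_csv

data = "index1,index2,A,B\nfoo,one,1,2\nfoo,two,3,4\nbar,one,5,6\n"

# a list passed to index_col builds a MultiIndex from those columns
df = read_csv(StringIO(data), index_col=[0, 1])
print(df.index)  # MultiIndex built from 'index1' and 'index2'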
Code Example #6
File: solution.py Project: leoldn/hackerrank
def init_test_dataframes():
    inp_df = read_csv(test_data_address_in, sep=',')
    inp_df.columns = [inp_columns]
    trg_df = read_csv(test_data_address_target, sep=',')
    trg_df.columns = [trg_column]
    inp_df = inp_df.replace(mapping.keys(), mapping.values())
    return inp_df, trg_df
Code Example #7
File: test_parsers.py Project: MikeLindenau/pandas
    def test_pass_names_with_index(self):
        lines = self.data1.split('\n')
        no_header = '\n'.join(lines[1:])

        # regular index
        names = ['index', 'A', 'B', 'C', 'D']
        df = read_csv(StringIO(no_header), index_col=0, names=names)
        expected = read_csv(StringIO(self.data1), index_col=0)
        assert_frame_equal(df, expected)

        # multi index
        data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
        lines = data.split('\n')
        no_header = '\n'.join(lines[1:])
        names = ['index1', 'index2', 'A', 'B', 'C', 'D']
        df = read_csv(StringIO(no_header), index_col=[0, 1], names=names)
        expected = read_csv(StringIO(data), index_col=[0, 1])
        assert_frame_equal(df, expected)

        df = read_csv(StringIO(data), index_col=['index1', 'index2'])
        assert_frame_equal(df, expected)
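
A short self-contained sketch of supplying names= for a header-less file, as the test above does (invented data; assumes a current pandas):

from io import StringIO
from pandas import read_csv

raw = "one,1,2\ntwo,3,4\n"

# names= provides the header the file lacks; index_col picks the index column
df = read_csv(StringIO(raw), names=["label", "A", "B"], index_col=0)
print(df)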
Code Example #8
File: test_network.py Project: AllenDowney/pandas
 def test_parse_public_s3n_bucket(self):
     # Read from AWS s3 as "s3n" URL
     df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
     assert isinstance(df, DataFrame)
     assert not df.empty
     tm.assert_frame_equal(read_csv(
         tm.get_data_path('tips.csv')).iloc[:10], df)
Code Example #9
File: add_peaks.py Project: karmel/vespucci
def import_peaks(options, file_name, peaks_file_name):
    """
    Given a peak file, save a temp table and store peak data.
    """
    AtlasPeak.create_table(file_name)
    # AtlasPeak.set_table_name('peak_' + file_name)

    if not options.not_homer:
        # Find header row
        header_row = None
        f = open(peaks_file_name)
        for i, line in enumerate(f):
            if line[:7] == "#PeakID":
                header_row = i
                break
        if header_row is None:
            raise Exception("There is no header in this Homer peak file!")
        data = read_csv(peaks_file_name, sep="\t", header=i)
    else:
        data = read_csv(peaks_file_name, sep="\t", header=None)

    for _, row in data.iterrows():
        if not options.not_homer:
            peak = AtlasPeak.init_from_homer_row(row)
        else:
            peak = getattr(AtlasPeak, "init_from_{0}_row".format(options.not_homer))(row)

        if peak:
            peak.save()

    AtlasPeak.add_indices()
Code Example #10
File: test_excel.py Project: Libardo1/pandas
    def test_parse_cols_str(self):
        _skip_if_no_openpyxl()
        _skip_if_no_xlrd()

        suffix = ["", "x"]

        for s in suffix:

            pth = os.path.join(self.dirpath, "test.xls%s" % s)
            xls = ExcelFile(pth)

            df = xls.parse("Sheet1", index_col=0, parse_dates=True, parse_cols="A:D")
            df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
            df2 = df2.reindex(columns=["A", "B", "C"])
            df3 = xls.parse("Sheet2", skiprows=[1], index_col=0, parse_dates=True, parse_cols="A:D")
            tm.assert_frame_equal(df, df2, check_names=False)  # TODO add index to xls, read xls ignores index name ?
            tm.assert_frame_equal(df3, df2, check_names=False)
            del df, df2, df3

            df = xls.parse("Sheet1", index_col=0, parse_dates=True, parse_cols="A,C,D")
            df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
            df2 = df2.reindex(columns=["B", "C"])
            df3 = xls.parse("Sheet2", skiprows=[1], index_col=0, parse_dates=True, parse_cols="A,C,D")
            tm.assert_frame_equal(df, df2, check_names=False)  # TODO add index to xls file
            tm.assert_frame_equal(df3, df2, check_names=False)
            del df, df2, df3

            df = xls.parse("Sheet1", index_col=0, parse_dates=True, parse_cols="A,C:D")
            df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
            df2 = df2.reindex(columns=["B", "C"])
            df3 = xls.parse("Sheet2", skiprows=[1], index_col=0, parse_dates=True, parse_cols="A,C:D")
            tm.assert_frame_equal(df, df2, check_names=False)
            tm.assert_frame_equal(df3, df2, check_names=False)
Code Example #11
File: test_network.py Project: ivannz/pandas
 def test_parse_public_s3a_bucket(self):
     # Read from AWS s3 as "s3a" URL
     df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
     self.assertTrue(isinstance(df, DataFrame))
     self.assertFalse(df.empty)
     tm.assert_frame_equal(read_csv(
         tm.get_data_path('tips.csv')).iloc[:10], df)
Code Example #12
File: test_unsupported.py Project: aechase/pandas
    def test_deprecated_args(self):
        data = '1,2,3'

        # deprecated arguments with non-default values
        deprecated = {
            'as_recarray': True,
            'buffer_lines': True,
            'compact_ints': True,
            'skip_footer': True,
            'use_unsigned': True,
        }

        engines = 'c', 'python'

        for engine in engines:
            for arg, non_default_val in deprecated.items():
                if engine == 'c' and arg == 'skip_footer':
                    # unsupported --> exception is raised
                    continue

                if engine == 'python' and arg == 'buffer_lines':
                    # unsupported --> exception is raised
                    continue

                with tm.assert_produces_warning(
                        FutureWarning, check_stacklevel=False):
                    kwargs = {arg: non_default_val}
                    read_csv(StringIO(data), engine=engine,
                             **kwargs)
Code Example #13
File: test_textreader.py Project: ChenXiukun/pandas
    def test_empty_field_eof(self):
        data = 'a,b,c\n1,2,3\n4,,'

        result = TextReader(StringIO(data), delimiter=',').read()

        expected = {0: np.array([1, 4]),
                    1: np.array(['2', ''], dtype=object),
                    2: np.array(['3', ''], dtype=object)}
        assert_array_dicts_equal(result, expected)

        # GH5664
        a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
                      columns=list('abcd'),
                      index=[1, 1])
        c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan],
                       [8, 9, 10, 11], [13, 14, nan, nan]],
                      columns=list('abcd'),
                      index=[0, 5, 7, 12])

        for _ in range(100):
            df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
                          names=['a'], engine='c')
            assert_frame_equal(df, a)

            df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2),
                          names=list("abcd"), engine='c')
            assert_frame_equal(df, b)

            df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
                          names=list('abcd'), engine='c')
            assert_frame_equal(df, c)
Code Example #14
File: test_parsers.py Project: MikeLindenau/pandas
 def test_multiple_date_col_named_components(self):
     xp = read_csv(StringIO(self.ts_data), parse_dates={'nominal': [1,2]},
                   index_col='nominal')
     colspec = {'nominal' : ['date', 'nominalTime']}
     df = read_csv(StringIO(self.ts_data), parse_dates=colspec,
                   index_col='nominal')
     assert_frame_equal(df, xp)
Code Example #15
File: test_parsers.py Project: MikeLindenau/pandas
    def test_na_value_dict(self):
        data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""

        df = read_csv(StringIO(data),
                      na_values={'A': ['foo'], 'B': ['bar']})
        expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
                              'B': [np.nan, 'foo', np.nan, 'foo'],
                              'C': [np.nan, 'foo', np.nan, 'foo']})
        assert_frame_equal(df, expected)

        data = """\
a,b,c,d
0,NA,1,5
"""
        xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0])
        xp.index.name = 'a'
        df = read_csv(StringIO(data), na_values={}, index_col=0)
        assert_frame_equal(df, xp)

        xp = DataFrame({'b': [np.nan], 'd': [5]},
                       MultiIndex.from_tuples([(0, 1)]))
        df = read_csv(StringIO(data), na_values={}, index_col=[0, 2])
        assert_frame_equal(df, xp)

        xp = DataFrame({'b': [np.nan], 'd': [5]},
                       MultiIndex.from_tuples([(0, 1)]))
        df = read_csv(StringIO(data), na_values={}, index_col=['a', 'c'])
        assert_frame_equal(df, xp)
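
For reference, a minimal sketch of the per-column na_values dict used above (invented data; the default NA markers such as 'NA' still apply on top of it):

from io import StringIO
from pandas import read_csv

data = "A,B,C\nfoo,bar,NA\nbar,foo,foo\n"

# the dict maps column names to extra strings treated as NaN in that column only
df = read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
print(df)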
Code Example #16
File: test_parsers.py Project: MikeLindenau/pandas
    def test_index_col_named(self):
        no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

        h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
        data = h + no_header
        #import pdb; pdb.set_trace()
        rs = read_csv(StringIO(data), index_col='ID')
        xp = read_csv(StringIO(data), header=0).set_index('ID')
        assert_frame_equal(rs, xp)

        self.assertRaises(ValueError, read_csv, StringIO(no_header),
                          index_col='ID')

        data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
        names = ['a', 'b', 'c', 'd', 'message']
        xp = DataFrame({'a' : [1, 5, 9], 'b' : [2, 6, 10], 'c' : [3, 7, 11],
                        'd' : [4, 8, 12]},
                       index=Index(['hello', 'world', 'foo'], name='message'))
        rs = read_csv(StringIO(data), names=names, index_col=['message'])
        assert_frame_equal(xp, rs)
        self.assert_(xp.index.name == rs.index.name)

        rs = read_csv(StringIO(data), names=names, index_col='message')
        assert_frame_equal(xp, rs)
        self.assert_(xp.index.name == rs.index.name)
Code Example #17
File: test_parsers.py Project: MikeLindenau/pandas
    def test_parse_dates_column_list(self):
        from pandas.core.datetools import to_datetime

        data = '''date;destination;ventilationcode;unitcode;units;aux_date
01/01/2010;P;P;50;1;12/1/2011
01/01/2010;P;R;50;1;13/1/2011
15/01/2010;P;P;50;1;14/1/2011
01/05/2010;P;P;50;1;15/1/2011'''

        expected = read_csv(StringIO(data), sep=";", index_col=range(4))

        lev = expected.index.levels[0]
        expected.index.levels[0] = lev.to_datetime(dayfirst=True)
        expected['aux_date'] = to_datetime(expected['aux_date'],
                                           dayfirst=True)
        expected['aux_date'] = map(Timestamp, expected['aux_date'])
        self.assert_(isinstance(expected['aux_date'][0], datetime))

        df = read_csv(StringIO(data), sep=";", index_col = range(4),
                      parse_dates=[0, 5], dayfirst=True)
        assert_frame_equal(df, expected)

        df = read_csv(StringIO(data), sep=";", index_col = range(4),
                      parse_dates=['date', 'aux_date'], dayfirst=True)
        assert_frame_equal(df, expected)
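
A minimal sketch of dayfirst date parsing, as exercised above (invented data; assumes a recent pandas where parse_dates takes a list of column labels):

from io import StringIO
from pandas import read_csv

data = "date;value\n01/02/2010;1\n15/01/2010;2\n"

# dayfirst=True makes 01/02/2010 parse as 1 February 2010
df = read_csv(StringIO(data), sep=";", parse_dates=["date"], dayfirst=True)
print(df["date"])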
Code Example #18
File: test_parsers.py Project: MikeLindenau/pandas
    def test_sniff_delimiter(self):
        text = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
        data = read_csv(StringIO(text), index_col=0, sep=None)
        self.assert_(data.index.equals(Index(['foo', 'bar', 'baz'])))

        data2 = read_csv(StringIO(text), index_col=0, delimiter='|')
        assert_frame_equal(data, data2)

        text = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
        data3 = read_csv(StringIO(text), index_col=0, sep=None, skiprows=2)
        assert_frame_equal(data, data3)

        # can't get this to work on Python 3
        if not py3compat.PY3:
            text = u"""ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
""".encode('utf-8')
            data4 = read_csv(BytesIO(text), index_col=0, sep=None, skiprows=2,
                             encoding='utf-8')
            assert_frame_equal(data, data4)
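
A self-contained sketch of delimiter sniffing with sep=None (invented data; engine='python' is passed explicitly because only the python engine can sniff):

from io import StringIO
from pandas import read_csv

text = "index|A|B\nfoo|1|2\nbar|3|4\n"

# sep=None lets the python engine sniff the delimiter via csv.Sniffer
df = read_csv(StringIO(text), sep=None, engine="python", index_col=0)
print(df)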
Code Example #19
    def savePrediction8(self):
        """
        save the predicted coordinates for the 8-feature set into a csv
        file to upload
        """
        # transform predictions
        prediction = self.prediction * 48 + 48
        prediction = prediction.clip(0, 96)

        # read id list
        idset = read_csv(os.path.expanduser(self.fIdList))

        outputPrediction = []
        mapping = {1:1, 2:2, 3:3, 4:4, 21:5, 22:6, 29:7, 30:8}

        for i in range(len(idset)):
            # we only predict the second part of the set of images.
            # so we need to shift by 592
            # TODO(tobias): shift the images in IdList_8.csv
            ImageID = idset['ImageId'][i]-592
            Feature = idset['FeatureName'][i]
            newFeatureId = mapping[Feature]
            outputPrediction.append(prediction[ImageID, newFeatureId-1])

        # read output list
        outputset = read_csv(os.path.expanduser(self.fOutputList))

        # fill output list with predictions
        outputset['Location'] = outputPrediction

        # write output list to disk
        outputset.to_csv(os.path.expanduser(self.fOutFile), index=False)
Code Example #20
File: test_parsers.py Project: MikeLindenau/pandas
    def test_iterator(self):
        reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)
        df = read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.get_chunk(3)
        assert_frame_equal(chunk, df[:3])

        last_chunk = reader.get_chunk(5)
        assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        assert_frame_equal(chunks[0], df[:2])
        assert_frame_equal(chunks[1], df[2:4])
        assert_frame_equal(chunks[2], df[4:])

        # pass skiprows
        parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
        chunks = list(parser)
        assert_frame_equal(chunks[0], df[1:3])

        # test bad parameter (skip_footer)
        reader = read_csv(StringIO(self.data1), index_col=0, iterator=True,
                          skip_footer=True)
        self.assertRaises(ValueError, reader.get_chunk, 3)

        treader = read_table(StringIO(self.data1), sep=',', index_col=0,
                             iterator=True)
        self.assert_(isinstance(treader, TextParser))
Code Example #21
File: test_network.py Project: qdxt/python
 def test_infer_s3_compression(self, s3_resource):
     for ext in ['', '.gz', '.bz2']:
         df = read_csv('s3://pandas-test/tips.csv' + ext,
                       engine='python', compression='infer')
         assert isinstance(df, DataFrame)
         assert not df.empty
         tm.assert_frame_equal(read_csv(
             tm.get_data_path('tips.csv')), df)
Code Example #22
File: test_network.py Project: ivannz/pandas
 def test_infer_s3_compression(self):
     for ext in ['', '.gz', '.bz2']:
         df = read_csv('s3://pandas-test/tips.csv' + ext,
                       engine='python', compression='infer')
         self.assertTrue(isinstance(df, DataFrame))
         self.assertFalse(df.empty)
         tm.assert_frame_equal(read_csv(
             tm.get_data_path('tips.csv')), df)
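
A local, hedged sketch of compression='infer' that avoids S3 entirely (the file name is made up; read_csv infers gzip from the '.gz' suffix):

import gzip
from pandas import read_csv

# write a tiny gzip-compressed CSV, then let read_csv infer the compression
with gzip.open("tips_sample.csv.gz", "wt") as fh:
    fh.write("total_bill,tip\n16.99,1.01\n10.34,1.66\n")

df = read_csv("tips_sample.csv.gz", compression="infer")
print(df)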
Code Example #23
File: test_network.py Project: ivannz/pandas
 def test_parse_public_s3_bucket_nrows(self):
     for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
         df = read_csv('s3://pandas-test/tips.csv' +
                       ext, nrows=10, compression=comp)
         self.assertTrue(isinstance(df, DataFrame))
         self.assertFalse(df.empty)
         tm.assert_frame_equal(read_csv(
             tm.get_data_path('tips.csv')).iloc[:10], df)
Code Example #24
File: test_network.py Project: ivannz/pandas
    def test_s3_fails(self):
        with tm.assertRaises(IOError):
            read_csv('s3://nyqpug/asdf.csv')

        # Receive a permission error when trying to read a private bucket.
        # It's irrelevant here that this isn't actually a table.
        with tm.assertRaises(IOError):
            read_csv('s3://cant_get_it/')
Code Example #25
File: test_unsupported.py Project: aechase/pandas
    def test_nrows_and_chunksize(self):
        data = 'a b c'
        msg = "cannot be used together yet"

        for engine in ('c', 'python'):
            with tm.assertRaisesRegexp(NotImplementedError, msg):
                read_csv(StringIO(data), engine=engine,
                         nrows=10, chunksize=5)
Code Example #26
File: test_network.py Project: AllenDowney/pandas
 def test_parse_public_s3_bucket_nrows_python(self):
     for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
         df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python',
                       nrows=10, compression=comp)
         assert isinstance(df, DataFrame)
         assert not df.empty
         tm.assert_frame_equal(read_csv(
             tm.get_data_path('tips.csv')).iloc[:10], df)
Code Example #27
File: test_parsers.py Project: smc77/pandas
    def test_csv_custom_parser(self):
        data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
        df = read_csv(StringIO(data), date_parser=lambda x: datetime.strptime(x, "%Y%m%d"))
        expected = read_csv(StringIO(data), parse_dates=True)
        assert_frame_equal(df, expected)
Code Example #28
File: test_unsupported.py Project: aechase/pandas
    def test_mangle_dupe_cols_false(self):
        # see gh-12935
        data = 'a b c\n1 2 3'
        msg = 'is not supported'

        for engine in ('c', 'python'):
            with tm.assertRaisesRegexp(ValueError, msg):
                read_csv(StringIO(data), engine=engine,
                         mangle_dupe_cols=False)
Code Example #29
File: test_parsers.py Project: MikeLindenau/pandas
    def test_read_chunksize_named(self):
        reader = read_csv(StringIO(self.data1), index_col='index', chunksize=2)
        df = read_csv(StringIO(self.data1), index_col='index')

        chunks = list(reader)

        assert_frame_equal(chunks[0], df[:2])
        assert_frame_equal(chunks[1], df[2:4])
        assert_frame_equal(chunks[2], df[4:])
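
A minimal sketch of chunked reading with chunksize, mirroring the test above on invented data (assumes a recent pandas):

from io import StringIO
import pandas as pd
from pandas import read_csv

data = "index,A,B\n0,1,2\n1,3,4\n2,5,6\n3,7,8\n4,9,10\n"

# chunksize=2 yields DataFrames of at most two rows each
reader = read_csv(StringIO(data), index_col="index", chunksize=2)
chunks = list(reader)
print([len(c) for c in chunks])  # [2, 2, 1]
print(pd.concat(chunks).equals(read_csv(StringIO(data), index_col="index")))  # True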
Code Example #30
File: test_parsers.py Project: MikeLindenau/pandas
    def test_parse_dates_implicit_first_col(self):
        data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
        df = read_csv(StringIO(data), parse_dates=True)
        expected = read_csv(StringIO(data), index_col=0, parse_dates=True)
        self.assert_(isinstance(df.index[0], (datetime, np.datetime64, Timestamp)))
        assert_frame_equal(df, expected)
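
A standalone sketch of parsing the index column as dates, as in the test above (invented data; with index_col=0 and parse_dates=True the index becomes a DatetimeIndex):

from io import StringIO
from pandas import read_csv

data = "20090101,a,1\n20090102,b,2\n"

df = read_csv(StringIO(data), header=None, index_col=0, parse_dates=True)
print(df.index)  # DatetimeIndex(['2009-01-01', '2009-01-02'], ...)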
Code Example #31
 def read_csv(self, file):
     return read_csv(file, parse_dates=True)
Code Example #32
lcbc = args.lencbc
lumi = args.lenumi
umifirst = args.umifirst
cbcfile = args.cbcfile
hd = args.cbchd

#### Define input fastq files ####
fq1 = fqr + '_R1.fastq.gz'
fq2 = fqr + '_R2.fastq.gz'

if not os.path.isfile(fq1) or not os.path.isfile(fq2):
    print 'fastq files not found'
    sys.exit()

#### Read barcodes ####
dbc = read_csv(cbcfile, sep = '\t', index_col=0, header = None)
if not all([len(idx)==lcbc for idx in dbc.index]):
    sys.exit('barcode length provided does not match reference set')
d = {idx: dbc.loc[idx,1] for idx in dbc.index}
bc2sample = expandBCset(d, hd)

#### Do the job ####

fout = open(fqr + '_cbc.fastq', 'w+')
nt = 0
ns = 0
with gzip.open(fq1) as f1, gzip.open(fq2) as f2: 
    for idx, (l1, l2) in enumerate(it.izip(f1, f2)):
        l1, l2 = l1.rstrip().rsplit(' ')[0], l2.rstrip().rsplit(' ')[0]
        l = np.mod(idx,4)
        if l == 0:
Code Example #33
        "RIFLDIY02_N.B": 'Swap2Y',
        "RIFLDIY03_N.B": 'Swap3Y',
        "RIFLDIY04_N.B": 'Swap4Y',
        "RIFLDIY05_N.B": 'Swap5Y',
        "RIFLDIY07_N.B": 'Swap7Y',
        "RIFLDIY10_N.B": 'Swap10Y',
        "RIFLDIY30_N.B": 'Swap30Y',
        "RILSPDEPM01_N.B": 'Libor1M',
        "RILSPDEPM03_N.B": 'Libor3M',
        "RILSPDEPM06_N.B": 'Libor6M'
    }

    # Parse the file: skip the first 5 rows, headers are on row 6,
    # ND and NC indicate missing values, first column is the index and contains
    # dates
    df_libor = read_csv(fname,
                        header=5,
                        skiprows=range(5),
                        na_values=['ND', 'NC'],
                        index_col=0,
                        parse_dates=True)

    # Convert column names to simple labels
    df_libor = df_libor.rename(columns=columns_dic)

    good_rows = df_libor.apply(good_row, axis=1)

    df_libor_clean = df_libor[good_rows]

    df_libor_clean.save('data/df_libor.pkl')
Code Example #34
    def _parse_level_0cs(fp):
        """
        Parses an EVE Level 0CS file.
        """
        is_missing_data = False  # boolean to check for missing data
        missing_data_val = np.nan
        header = []
        fields = []
        line = fp.readline()
        # Read header at top of file
        while line.startswith(";"):
            header.append(line)
            if '; Missing data:' in line:
                is_missing_data = True
                missing_data_val = line.split(':')[1].strip()

            line = fp.readline()

        meta = MetaDict()
        for hline in header:
            if hline == '; Format:\n' or hline == '; Column descriptions:\n':
                continue
            elif ('Created' in hline) or ('Source' in hline):
                meta[hline.split(':',
                                 1)[0].replace(';',
                                               ' ').strip()] = hline.split(':', 1)[1].strip()
            elif ':' in hline:
                meta[hline.split(':')[0].replace(';', ' ').strip()] = hline.split(':')[1].strip()

        fieldnames_start = False
        for hline in header:
            if hline.startswith("; Format:"):
                fieldnames_start = False
            if fieldnames_start:
                fields.append(hline.split(":")[0].replace(';', ' ').strip())
            if hline.startswith("; Column descriptions:"):
                fieldnames_start = True

        # Next line is YYYY DOY MM DD
        date_parts = line.split(" ")

        year = int(date_parts[0])
        month = int(date_parts[2])
        day = int(date_parts[3])

        def parser(x):
            # Parse date column (HHMM)
            return datetime(year, month, day, int(x[0:2]), int(x[2:4]))

        data = read_csv(fp, sep=r"\s+", names=fields,
                        index_col=0, date_parser=parser, header=None, engine='python')
        if is_missing_data:  # If missing data specified in header
            data[data == float(missing_data_val)] = np.nan

        # Add the units data
        units = OrderedDict([('XRS-B proxy', u.W/u.m**2),
                             ('XRS-A proxy', u.W/u.m**2),
                             ('SEM proxy', u.W/u.m**2),
                             ('0.1-7ESPquad', u.W/u.m**2),
                             ('17.1ESP', u.W/u.m**2),
                             ('25.7ESP', u.W/u.m**2),
                             ('30.4ESP', u.W/u.m**2),
                             ('36.6ESP', u.W/u.m**2),
                             ('darkESP', u.ct),
                             ('121.6MEGS-P', u.W/u.m**2),
                             ('darkMEGS-P', u.ct),
                             ('q0ESP', u.dimensionless_unscaled),
                             ('q1ESP', u.dimensionless_unscaled),
                             ('q2ESP', u.dimensionless_unscaled),
                             ('q3ESP', u.dimensionless_unscaled),
                             ('CMLat', u.deg),
                             ('CMLon', u.deg)])
        # Todo: check units used.
        return data, meta, units
Code Example #35
File: eve.py Project: tarang727/sunpy
 def _parse_average_csv(fp):
     """Parses an EVE Averages file."""
     return "", read_csv(fp, sep=",", index_col=0, parse_dates=True)
Code Example #36
File: test_unsupported.py Project: yuyisky88/pandas
    def test_c_engine(self):
        # see gh-6607
        data = 'a b c\n1 2 3'
        msg = 'does not support'

        # specify C engine with unsupported options (raise)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data),
                     engine='c',
                     sep=None,
                     delim_whitespace=False)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c', sep=r'\s')
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c', sep='\t', quotechar=chr(128))
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine='c', skipfooter=1)

        # specify C-unsupported options without python-unsupported options
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=r'\s')
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep='\t', quotechar=chr(128))
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), skipfooter=1)

        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""
        msg = 'Error tokenizing data'

        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), sep='\\s+')
        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), engine='c', sep='\\s+')

        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands=',,')
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands='')

        msg = "Only length-1 line terminators supported"
        data = 'a,b,c~~1,2,3~~4,5,6'
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), lineterminator='~~')
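
A hedged sketch of why the tests above force or warn about the engine choice: a regex separator is only handled by the python engine (invented data):

from io import StringIO
from pandas import read_csv

data = "a;b,c\n1;2,3\n4;5,6\n"

# a multi-character / regex separator needs engine='python';
# with no engine given pandas warns and falls back, and engine='c' raises
df = read_csv(StringIO(data), sep=r"[;,]", engine="python")
print(df)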
Code Example #37
File: clipboards.py Project: wesbarnett/pandas
def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_csv.

    Parameters
    ----------
    sep : str, default '\s+'
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    **kwargs
        See read_csv for the full argument list.

    Returns
    -------
    DataFrame
        A parsed DataFrame object.
    """
    encoding = kwargs.pop("encoding", "utf-8")

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace("-", "") != "utf8":
        raise NotImplementedError(
            "reading from clipboard only supports utf-8 encoding")

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_csv

    text = clipboard_get()

    # Try to decode (if needed, as "text" might already be a string here).
    try:
        text = text.decode(
            kwargs.get("encoding") or get_option("display.encoding"))
    except AttributeError:
        pass

    # Excel copies into clipboard with \t separation
    # inspect no more than the first 10 lines; if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split("\n")[:-1][:10]

    # Need to remove leading white space, since read_csv
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4

    counts = {x.lstrip(" ").count("\t") for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = "\t"
        # check the number of leading tabs in the first line
        # to account for index columns
        index_length = len(lines[0]) - len(lines[0].lstrip(" \t"))
        if index_length != 0:
            kwargs.setdefault("index_col", list(range(index_length)))

    # Edge case where sep is specified to be None, return to default
    if sep is None and kwargs.get("delim_whitespace") is None:
        sep = r"\s+"

    # Regex separator currently only works with python engine.
    # Default to python if separator is multi-character (regex)
    if len(sep) > 1 and kwargs.get("engine") is None:
        kwargs["engine"] = "python"
    elif len(sep) > 1 and kwargs.get("engine") == "c":
        warnings.warn(
            "read_clipboard with regex separator does not work properly with c engine."
        )

    return read_csv(StringIO(text), sep=sep, **kwargs)
Code Example #38
import tensorflow as tf
import numpy as np
from pandas.io.parsers import read_csv

# Multivariate linear regression: the model to use when several explanatory variables influence the target
model = tf.global_variables_initializer()

data = read_csv('cancer5.csv', sep=',')
xy = np.array(data, dtype=np.float32)

x_data = xy[:, :-1]
y_data = xy[:, [-1]]  # status value

X = tf.placeholder(tf.float32, shape=[None, 7])
Y = tf.placeholder(tf.float32, shape=[None, 1])
W = tf.Variable(tf.random_normal([7, 1]), name="weight")
b = tf.Variable(tf.random_normal([1]), name="bias")

hypothesis = tf.matmul(X, W) + b
cost = tf.reduce_mean(tf.square(hypothesis - Y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.00005)
train = optimizer.minimize(cost)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for step in range(100001):
    cost_, hypo_, _ = sess.run([cost, hypothesis, train],
                               feed_dict={
                                   X: x_data,
                                   Y: y_data
Code Example #39
File: USD_Deposit_Swap.py Project: surfmaverick/pyql
        "RIFLDIY02_N.B": ('Swap2Y', 2),
        "RIFLDIY03_N.B": ('Swap3Y', 3),
        "RIFLDIY04_N.B": ('Swap4Y', 4),
        "RIFLDIY05_N.B": ('Swap5Y', 5),
        "RIFLDIY07_N.B": ('Swap7Y', 7),
        "RIFLDIY10_N.B": ('Swap10Y', 10),
        "RIFLDIY30_N.B": ('Swap30Y', 30),
        "RILSPDEPM01_N.B": ('Libor1M', 1.0 / 12),
        "RILSPDEPM03_N.B": ('Libor3M', 3.0 / 12),
        "RILSPDEPM06_N.B": ('Libor6M', 6.0 / 12)
    }

    # convert buffer to data frame
    df_libor = read_csv(buff,
                        sep=',',
                        header=True,
                        index_col=0,
                        parse_dates=True,
                        skiprows=[0, 1, 2, 3, 4]).transpose()

    # rename rows and columns with better names
    col_name_dic = {k: columns_dic[k][0] for k in columns_dic.keys()}
    df_libor = df_libor.rename(index=col_name_dic,
                               columns={df_libor.columns[0]: 'Rate'})

    # dictionary of maturities
    col_mat_dic = {columns_dic[k][0]:columns_dic[k][1] \
                   for k in columns_dic.keys()}

    # add maturity column
    df_libor['Maturity'] = [col_mat_dic[k] for k in df_libor.index]
Code Example #40
# Takes a (_, 30) prediction array and writes a .csv submission file in the Kaggle format

import os
from datetime import datetime
from pandas import DataFrame
from pandas.io.parsers import read_csv

FLOOKUP = "data/IdLookupTable.csv"

lookup_table = read_csv(os.path.expanduser(FLOOKUP))
features = ["left_eye_center", "right_eye_center", "left_eye_inner_corner", "left_eye_outer_corner", "right_eye_inner_corner", "right_eye_outer_corner", "left_eyebrow_inner_end", "left_eyebrow_outer_end", "right_eyebrow_inner_end", "right_eyebrow_outer_end", "nose_tip", "mouth_left_corner", "mouth_right_corner", "mouth_center_top_lip", "mouth_center_bottom_lip"]
columns = [[elem + "_x", elem + "_y"] for elem in features]
columns = [elem for l in columns for elem in l]

def write_submission(y_pred):
    y_pred2 = y_pred * 48 + 48
    y_pred2 = y_pred2.clip(0, 96)
    df = DataFrame(y_pred2, columns=columns)

    values = []

    for index, row in lookup_table.iterrows():
        values.append((
            row['RowId'],
            df.ix[row.ImageId - 1][row.FeatureName],
            ))

    now_str = datetime.now().isoformat().replace(':', '-')
    submission = DataFrame(values, columns=('RowId', 'Location'))
    filename = 'submission/submission-{}.csv'.format(now_str)
    submission.to_csv(filename, index=False)
Code Example #41
File: HDclustering.py Project: pythseq/scScarTrace
import sys, os
from pandas.io.parsers import read_csv
import numpy as np
import pandas as pd
import sklearn.cluster
import matplotlib.pyplot as plt
from Colors import *

try:
    df = read_csv(sys.argv[1], sep='\t', index_col=0)
    outfile = sys.argv[2]
    pdfplot = sys.argv[3]
except:
    sys.exit(
        'Please, give path to _df.txt file (full name); root for output file; produce pdf plot (y/n)'
    )

hcl = df['hclust']
del df['hclust']

clones = {}
for cell in df.index:
    #    scars = '-'.join(df.columns[df.loc[cell] > 0])
    scars = '-'.join(df.columns[df.loc[cell] > 3.5])
    if scars in clones:
        clones[scars].append(cell)
    else:
        clones[scars] = [cell]

dfnew = pd.DataFrame()
i = 0
Code Example #42
File: Pr21.py Project: gonzsa04/Machine-Learning
def load_csv(file_name):
    """carga el fichero csv especificado y lo devuelve en un array de numpy"""
    valores = read_csv(file_name, header=None).values

    return valores.astype(float)
Code Example #43
File: pet.py Project: ghisvail/clinica
def read_psf_information(pvc_psf_tsv, subject_ids, session_ids, pet_tracer):
    """Read PSF information from TSV file.

    Args:
        pvc_psf_tsv: TSV file containing participant_id, session_id, acq_label,
            psf_x, psf_y & psf_z columns
        subject_ids: list of participant IDs (e.g. ['sub-CLNC01', 'sub-CLNC01'])
        session_ids: list of session IDs (e.g. ['ses-M00', 'ses-M18'])
        pet_tracer: Tracer we want to select in acq_label column. Other tracers
            will not be read in this function

    Example of pvc_psf_tsv:

    participant_id    session_id     acq_label     psf_x    psf_y    psf_z
    sub-CLNC01        ses-M00        FDG           8        9        10
    sub-CLNC01        ses-M18        FDG           8        9        10
    sub-CLNC01        ses-M00        AV45          7        6        5
    sub-CLNC02        ses-M00        FDG           8        9        10
    sub-CLNC03        ses-M00        FDG           8        9        10

    Returns:
        PSF information following [subject_ids, session_ids] order
    """
    import os

    from pandas.io.parsers import read_csv

    if not os.path.isfile(pvc_psf_tsv):
        raise FileNotFoundError(
            f"Could not find the psf_tsv file {pvc_psf_tsv}")
    try:
        psf_df = read_csv(pvc_psf_tsv, sep="\t")
    except (IOError, UnicodeDecodeError):
        raise RuntimeError("An error while reading {pvc_psf_tsv} happened")

    if any(elem not in [
            "participant_id", "session_id", "acq_label", "psf_x", "psf_y",
            "psf_z"
    ] for elem in list(psf_df.columns)):
        raise IOError(
            f"The file {pvc_psf_tsv} must contain the following columns (separated by tabulations):\n"
            f"participant_id, session_id, acq_label, psf_x, psf_y, psf_z\n"
            f"{str(list(psf_df.columns))}\n"
            f"Pay attention to the spaces (there should be none).")

    subjects_psf = list(psf_df.participant_id)
    sessions_psf = list(psf_df.session_id)
    pet_tracer_psf = list(psf_df.acq_label)
    idx_reordered = []
    for i, sub in enumerate(subject_ids):
        current_ses = session_ids[i]
        idx_sub = [
            j for j in range(len(subjects_psf))
            if (sub == subjects_psf[j]) and (current_ses == sessions_psf[j])
            and (pet_tracer == pet_tracer_psf[j])
        ]
        if len(idx_sub) == 0:
            raise RuntimeError(
                f"Subject {sub} with session {current_ses} and tracer {pet_tracer} "
                f"that you want to proceed was not found in the TSV file containing "
                f"PSF specifications ({pvc_psf_tsv}).")
        if len(idx_sub) > 1:
            raise RuntimeError(
                f"Subject {sub} with session {current_ses} and tracer {pet_tracer} "
                f"that you want to proceed was found multiple times "
                f"in the TSV file containing PSF specifications ({pvc_psf_tsv})."
            )
        idx_reordered.append(idx_sub[0])

    psf_x = list(psf_df.psf_x)
    psf_y = list(psf_df.psf_y)
    psf_z = list(psf_df.psf_z)
    iterables_psf = [[psf_x[i], psf_y[i], psf_z[i]] for i in idx_reordered]
    return iterables_psf
Code Example #44
File: mobile_price.py Project: sgavil/AAyMineria
def load_data(file_name):
    values = read_csv(file_name, header=None).values
    return values.astype(float)
Code Example #45
def tips_df(datapath):
    """DataFrame with the tips dataset."""
    return read_csv(datapath("io", "data", "csv", "tips.csv"))
Code Example #46
File: bachoo.py Project: vyvydkf628/predict
import tensorflow as tf
import numpy as np
from pandas.io.parsers import read_csv

model = tf.global_variables_initializer()

data = read_csv('C:/bachoo/price data.csv', sep=',')
xy = np.array(data, dtype=np.float32)
x_data = xy[:, 1:-1]
y_data = xy[:, [-1]]
X = tf.placeholder(tf.float32, shape=[None, 4])
Y = tf.placeholder(tf.float32, shape=[None, 1])
W = tf.Variable(tf.random_normal([4, 1]), name="weight")
b = tf.Variable(tf.random_normal([1]), name="bias")
hypothesis = tf.matmul(X, W) + b
cost = tf.reduce_mean(tf.square(hypothesis - Y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.00005)
train = optimizer.minimize(cost)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for step in range(100001):
    cost_, hypo_, _ = sess.run([cost, hypothesis, train],
                               feed_dict={
                                   X: x_data,
                                   Y: y_data
                               })
    if step % 500 == 0:
        print("#", step, " 손실비용 ", cost_)
        print("- 배추가격 :", hypo_[0])
saver = tf.train.Saver()
save_path = saver.save(sess, "./saved.cpkt")
Code Example #47
 def test_read_s3_with_hash_in_key(self, tips_df, s3so):
     # GH 25945
     result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so)
     tm.assert_frame_equal(tips_df, result)
Code Example #48
    def _parse_level_0cs(filepath):
        """
        Parses an EVE Level 0CS file.
        """
        is_missing_data = False  # boolean to check for missing data
        missing_data_val = np.nan
        header = []
        fields = []
        with codecs.open(filepath, mode='rb', encoding='ascii') as fp:
            line = fp.readline()
            # Read header at top of file
            while line.startswith(";"):
                header.append(line)
                if '; Missing data:' in line:
                    is_missing_data = True
                    missing_data_val = line.split(':')[1].strip()

                line = fp.readline()

        meta = MetaDict()
        for hline in header:
            if hline == '; Format:\n' or hline == '; Column descriptions:\n':
                continue
            elif ('Created' in hline) or ('Source' in hline):
                meta[hline.split(':', 1)[0].replace(
                    ';', ' ').strip()] = hline.split(':', 1)[1].strip()
            elif ':' in hline:
                meta[hline.split(':')[0].replace(
                    ';', ' ').strip()] = hline.split(':')[1].strip()

        fieldnames_start = False
        for hline in header:
            if hline.startswith("; Format:"):
                fieldnames_start = False
            if fieldnames_start:
                fields.append(hline.split(":")[0].replace(';', ' ').strip())
            if hline.startswith("; Column descriptions:"):
                fieldnames_start = True

        # Next line is YYYY DOY MM DD
        date_parts = line.split(" ")
        year = int(date_parts[0])
        month = int(date_parts[2])
        day = int(date_parts[3])

        data = read_csv(filepath,
                        delim_whitespace=True,
                        names=fields,
                        comment=';',
                        dtype={'HHMM': int})
        # First line is YYYY DOY MM DD
        data = data.iloc[1:, :]
        data['Hour'] = data['HHMM'] // 100
        data['Minute'] = data['HHMM'] % 100
        data = data.drop(['HHMM'], axis=1)

        data['Year'] = year
        data['Month'] = month
        data['Day'] = day

        datecols = ['Year', 'Month', 'Day', 'Hour', 'Minute']
        data['Time'] = to_datetime(data[datecols])
        data = data.set_index('Time')
        data = data.drop(datecols, axis=1)

        if is_missing_data:  # If missing data specified in header
            data[data == float(missing_data_val)] = np.nan

        # Add the units data
        units = OrderedDict([('XRS-B proxy', u.W / u.m**2),
                             ('XRS-A proxy', u.W / u.m**2),
                             ('SEM proxy', u.W / u.m**2),
                             ('0.1-7ESPquad', u.W / u.m**2),
                             ('17.1ESP', u.W / u.m**2),
                             ('25.7ESP', u.W / u.m**2),
                             ('30.4ESP', u.W / u.m**2),
                             ('36.6ESP', u.W / u.m**2), ('darkESP', u.ct),
                             ('121.6MEGS-P', u.W / u.m**2),
                             ('darkMEGS-P', u.ct),
                             ('q0ESP', u.dimensionless_unscaled),
                             ('q1ESP', u.dimensionless_unscaled),
                             ('q2ESP', u.dimensionless_unscaled),
                             ('q3ESP', u.dimensionless_unscaled),
                             ('CMLat', u.deg), ('CMLon', u.deg)])
        # Todo: check units used.
        return data, meta, units
Code Example #49
File: test_unsupported.py Project: yuyisky88/pandas
    def test_deprecated_args(self, engine, kwargs):
        data = "1,2,3"
        arg, _ = list(kwargs.items())[0]

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            read_csv(StringIO(data), engine=engine, **kwargs)
Code Example #50
File: test_excel.py Project: sarahcodes/pandas
 def read_csv(self, *args, **kwds):
     kwds = kwds.copy()
     kwds['engine'] = 'python'
     return read_csv(*args, **kwds)
Code Example #51
from pandas.io.parsers import read_csv
import numpy
import pandas
from numpy.random import seed
from numpy.random import rand
from numpy.random import random_integers
df = read_csv("F:/data/WHO.csv")
print("Dataframe", df)
print("Shape", df.shape)
print("Length", len(df))
print("Column Headers", df.columns)
print("Index", df.index)
print("Values", df.values)

country_col = df["Country"]
print("Type df", type(df))
print("Type country col", type(country_col))
print("Series shape", country_col.shape)
print("Series index", country_col.index)
print("Series values", country_col.values)
print("Series name", country_col.name)
print("Last 2 countries", country_col[-2:])
print("Last 2 countries type", type(country_col[-2:]))

last_col = df.columns[-1]
print("df signs\n", numpy.sign(df[last_col]))

df1 = pandas.DataFrame({
    'Weather': ['cold', 'hot', 'cold', 'hot'],
    'Food': ['soup', 'soup', 'icecream', 'chocolate'],
    'Price': 10 * rand(4),
Code Example #52
 def test_empty_csv_input(self):
     # GH14867
     df = read_csv(StringIO(), chunksize=20, header=None,
                   names=['a', 'b', 'c'])
     assert isinstance(df, TextFileReader)
Code Example #53
import os
from pandas.io.parsers import read_csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, Activation, Dropout, Flatten
from keras.optimizers import RMSprop

FTRAIN = '/Users/tian/Documents/CNN/mnist/train.csv'
FTEST = '/Users/tian/Documents/CNN/mnist/test.csv'

train_df = read_csv(os.path.expanduser(FTRAIN))
# 42000, 785
# int64(785)
test_df = read_csv(os.path.expanduser(FTEST))
# 28000, 784
# int64(784)

X_train = train_df[train_df.columns[1:]].values
X_train = X_train.astype(np.float32) / 255

y_train = train_df[train_df.columns[0]]
y_train = y_train.astype("category")

X_test = test_df.values.astype(np.float32) / 255


def plot_samples(data=X_train, label=y_train, limit=41000):
Code Example #54
def salaries_table(datapath):
    """DataFrame with the salaries dataset"""
    return read_csv(datapath("io", "parser", "data", "salaries.csv"), sep="\t")
Code Example #55
 def test_read_dta9(self):
     expected = read_csv(self.csv9, parse_dates=True, sep='\t')
     parsed = self.read_dta(self.dta9)
     tm.assert_frame_equal(parsed, expected)
Code Example #56
File: Practica1.grafica.py Project: nesi73/AA
def carga_csv(file_name):
    """carga"""
    valores = read_csv(file_name, header=None).values
    return valores.astype(float)
Code Example #57
def feature_fusion():
    # Name of train file
    trainName = '/SortedTrain.csv'
    # Path of a folder contains all single feature categories
    savePath = COMBINED_PATH_CSV
    path = SAVED_PATH_CSV + 'train/'
    os.chdir(path)
    # Read the name of each folder
    featureCategoryFolders = io.get_files_in_directory(path,
                                                       file_extension='csv')
    #featureCategoryFolders = featureCategoryFolders[1:]
    print 'Folders: ', featureCategoryFolders
    # Load train files
    featureCategoriesLen = len(featureCategoryFolders)
    class_lable = read_csv(TRAIN_ID_PATH, delimiter=',')
    new_idx = np.argsort(class_lable.ix[:, 0])
    class_lable = class_lable.ix[new_idx, 1]
    #print(class_lable[700:800])
    class_lable = class_lable.reset_index(drop=True)
    singleTrains = list()
    for k in range(featureCategoriesLen):
        dataSet = read_csv(featureCategoryFolders[k], delimiter=',')
        data = dataSet  #.ix[:, :]
        #data.astype(np.float)
        #print(class_lable[0:data.shape[0]-1])
        singleTrain = dataSetInformaion(
            featureCategoryFolders[k][featureCategoryFolders[k].rfind('/') +
                                      1:-4], data,
            class_lable[0:data.shape[0]])
        singleTrains.append(singleTrain)

    remainingFeatureCategoriesIndices = np.ones(len(featureCategoryFolders))
    finalDataSets = list()
    # Main loop of combination
    for A in range(featureCategoriesLen):

        minimumLogLoss = 100
        minimumDataSet = None
        minimumIndex = -1
        processingDataSet = None
        # Main loop of single evaluation
        for k in range(featureCategoriesLen):
            nameOfDataSet = ''
            if remainingFeatureCategoriesIndices[k] == 0:
                continue
            if len(finalDataSets) != 0:
                # Join datasets
                dataSet1 = finalDataSets[len(finalDataSets) - 1].data
                # print dataSet1
                dataSet2 = singleTrains[k].data
                # print dataSet2
                result = pd.concat([dataSet1, dataSet2], axis=1, join='inner')

                processingDataSet = dataSetInformaion(
                    finalDataSets[len(finalDataSets) - 1].dataSetName + '+' +
                    singleTrains[k].dataSetName, result,
                    singleTrains[k].classLabel)
                # print result
                # classLabel = result.ix[:, -1]
                # data = result.ix[:, :-1]
                # nameOfDataSet =
            else:
                processingDataSet = singleTrains[k]
                # classLabel = singleTrains[k].data
                # data = singleTrains[k].classLabel
                # nameOfDataSet = featureCategoryFolders[k]
            accuracies = []
            logLosses = []

            # print 'DataSet', str(k), '=========================', 'cross validation result'
            # # For each fold in cross validation
            # rng = np.random.RandomState(31337)
            # kF = KFold(classLabel.shape[0], n_folds=2, shuffle=True, random_state=rng)
            # for trainIndex, testIndex in kF:
            #     trainKF = data.ix[trainIndex,:]
            #     trainID = classLabel.ix[trainIndex]
            #     xgbModel = xgb.XGBClassifier().fit(trainKF,trainID)
            #     actualLabels = classLabel.ix[testIndex]
            #
            #     predictProbability = xgbModel.predict_proba(data.ix[testIndex,:])
            #     logLoss = multiclass_log_loss(actualLabels,predictProbability)
            #     logLosses.append(logLoss)
            #
            #     predictedLabels = xgbModel.predict(data.ix[testIndex,:])
            #     #print(confusion_matrix(actualLabels, predictedLabels))
            #     acc = accuracy_score(actualLabels, predictedLabels)
            #     accuracies.append(acc)
            #
            # accuraciesMean = np.mean(accuracies)
            # loglossesMean = np.mean(logLosses)
            # print accuraciesMean
            # print loglossesMean
            #
            # FeatureCategoriesAccuracy.append(accuraciesMean)
            # FeatureCategoriesLogLoss.append(loglossesMean)

            print 'DataSet', str(processingDataSet.dataSetName
                                 ), '=========================', 'train result'
            # For each fold in cross validation

            xgbModel = xgb.XGBClassifier().fit(processingDataSet.data,
                                               processingDataSet.classLabel)
            predictProbability = xgbModel.predict_proba(processingDataSet.data)
            logLoss = multiclass_log_loss(processingDataSet.classLabel,
                                          predictProbability)
            #logLoss = float("%.3f"% logLoss)
            predictedLabels = xgbModel.predict(processingDataSet.data)
            acc = accuracy_score(processingDataSet.classLabel, predictedLabels)
            print acc
            print logLoss
            print processingDataSet.data.shape

            if logLoss < minimumLogLoss:
                minimumDataSet = processingDataSet
                minimumIndex = k
                minimumLogLoss = logLoss
                # featureCategoriesAccuracy.append(acc)
                # featureCategoriesLogLoss.append(logLoss)

        print 'Final Round ', A, '========================='
        # Minimum_Index = np.where(featureCategoriesLogLoss == np.min(featureCategoriesLogLoss))
        # print featureCategoriesLogLoss
        # finalDataSets.append(singleTrains[Minimum_Index[0]])
        finalDataSets.append(minimumDataSet)
        remainingFeatureCategoriesIndices[minimumIndex] = 0
    finalSets = [set.dataSetName for set in finalDataSets]
    print ','.join(finalSets)
    # Save the combined datasets
    os.chdir(savePath)
    for ds in finalDataSets:
        jointFile = pd.concat([ds.data, ds.classLabel], axis=1, join='inner')
        if not os.path.exists(savePath + ds.dataSetName):
            os.makedirs(savePath + ds.dataSetName)
        jointFile.to_csv(COMBINED_PATH_CSV + ds.dataSetName + '/NewTrain.csv',
                         sep=',',
                         index=False)
    try:
        del singleTrains, singleTrain, processingDataSet, jointFile, finalSets, \
            remainingFeatureCategoriesIndices, featureCategoryFolders, dataSet, dataSet1, dataSet2, data, class_lable
    except NameError:
        pass
    print 'All combinations saved!!!'
    print 'Run cross-validation ...'
    featureCombinationsFinalAccuracy = 0
    featureCombinationsFinalDataSet = None
    featureCombinationsLogLossMin = 100
    for ds in finalDataSets:
        print 'DataSet', ds.dataSetName, '=========================', 'cross validation result'
        # For each fold in cross validation; reset the per-fold scores so the
        # mean for this dataset is not mixed with earlier datasets' folds
        accuracies = []
        logLosses = []
        rng = np.random.RandomState(31337)
        kF = KFold(ds.classLabel.shape[0],
                   n_folds=5,
                   shuffle=True,
                   random_state=rng)
        for trainIndex, testIndex in kF:
            trainKF = ds.data.ix[trainIndex, :]
            trainID = ds.classLabel.ix[trainIndex]
            xgbModel = xgb.XGBClassifier().fit(trainKF, trainID)
            actualLabels = ds.classLabel.ix[testIndex]

            predictProbability = xgbModel.predict_proba(
                ds.data.ix[testIndex, :])
            logLoss = multiclass_log_loss(actualLabels, predictProbability)
            logLosses.append(logLoss)

            predictedLabels = xgbModel.predict(ds.data.ix[testIndex, :])
            # print(confusion_matrix(actualLabels, predictedLabels))
            acc = accuracy_score(actualLabels, predictedLabels)
            accuracies.append(acc)

        accuraciesMean = np.mean(accuracies)
        loglossesMean = np.mean(logLosses)
        print accuraciesMean
        print loglossesMean

        if loglossesMean < featureCombinationsLogLossMin:
            featureCombinationsFinalDataSet = ds
            featureCombinationsFinalAccuracy = accuraciesMean
            featureCombinationsLogLossMin = loglossesMean
    print 'Final Result ---------------------------------'
    print featureCombinationsFinalDataSet.dataSetName, featureCombinationsFinalAccuracy, featureCombinationsLogLossMin
コード例 #58
0
        "RIFLDIY01_N.B": 'Swap1Y',
        "RIFLDIY02_N.B": 'Swap2Y',
        "RIFLDIY03_N.B": 'Swap3Y',
        "RIFLDIY04_N.B": 'Swap4Y',
        "RIFLDIY05_N.B": 'Swap5Y',
        "RIFLDIY07_N.B": 'Swap7Y',
        "RIFLDIY10_N.B": 'Swap10Y',
        "RIFLDIY30_N.B": 'Swap30Y',
        "RILSPDEPM01_N.B": 'Libor1M',
        "RILSPDEPM03_N.B": 'Libor3M',
        "RILSPDEPM06_N.B": 'Libor6M'
    }

    # the data converter is applied to all columns
    # excluding the index column (0)

    dc_dict = {i: dataconverter for i in range(1, len(columns_dic) + 1)}

    df_libor = read_csv(fname,
                        sep=',',
                        header=0,
                        index_col=0,
                        parse_dates=True,
                        converters=dc_dict,
                        skiprows=[0, 1, 2, 3, 4])

    df_libor = df_libor.rename(columns=columns_dic)
    good_rows = df_libor.apply(good_row, axis=1)
    df_libor_good = df_libor[good_rows]
    df_libor_good.to_pickle(os.path.join('..', 'data', 'df_libor.pkl'))
コード例 #59
0
import pandas as pd
import pandas.io.parsers as pd_par
import numpy as np
import math
import copy
import QSTK.qstkutil.qsdateutil as du
import datetime as dt
import QSTK.qstkutil.DataAccess as da
import QSTK.qstkutil.tsutil as tsu

startCash = 50000
orderFile = "order_h4-6.csv"
valueFile = "value_h4-6.csv"

orderDF = pd_par.read_csv(orderFile, header=None)
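# Note: the old pandas releases this QSTK-era script targets auto-name
# header-less columns 'X.1', 'X.2', ...; current pandas uses integer labels.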

# Getting the Symbols from the .csv file
ls_symbols = list(set(orderDF['X.4'].values))

# Need to sort the trades DF by increasing date
orderDF = orderDF.sort(['X.1', 'X.2', 'X.3'])

# Getting the start and end dates from the .csv file
dt_start = dt.datetime(int(orderDF.head(1)['X.1']),
                       int(orderDF.head(1)['X.2']),
                       int(orderDF.head(1)['X.3']))
dt_end = dt.datetime(int(orderDF.tail(1)['X.1']),
                     int(orderDF.tail(1)['X.2']),
                     int(orderDF.tail(1)['X.3']) + 1)

# Getting market data
dataobj = da.DataAccess('Yahoo', cachestalltime=0)
ls_keys = ['close', 'actual_close']
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt.timedelta(hours=16))
コード例 #60
0
# Imports assumed by the snippet below; the test functions `bfu` and `bfc`
# referenced further down are defined elsewhere in the original module.
from __future__ import division

from collections import defaultdict

from numpy import log2, percentile, zeros
from pandas import DataFrame, Series, read_csv


def summarize_topics(filenames, test, selection, dist, max_phrase_len,
                     min_phrase_count):
    """
    """

    state = read_csv(filenames[0],
                     compression='gzip',
                     skiprows=2,
                     usecols=[0, 4, 5],
                     header=0,
                     names=['doc', 'word', 'topic'],
                     sep=' ')
    state['word'] = state['word'].astype(str)

    topics = read_csv(filenames[1],
                      sep='(?: |\t)',
                      engine='python',
                      index_col=0,
                      header=None,
                      names=(['alpha'] + [x for x in xrange(1, 202)]))
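    # Estimate the overall topic distribution: average the smoothed
    # per-document posteriors, use the empirical token-topic proportions,
    # or fall back to the normalized alpha prior.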
    if dist == 'average-posterior':
        topics['prob'] = zeros(len(topics))
        for _, df in state.groupby('doc'):
            topics['prob'] += (
                topics['alpha'].add(df.groupby('topic').size(), fill_value=0) /
                (topics['alpha'].sum() + len(df)))
        topics['prob'] /= state['doc'].nunique()
    elif dist == 'empirical':
        topics['prob'] = state.groupby('topic')['word'].count() / len(state)
    else:
        topics['prob'] = topics['alpha'] / topics['alpha'].sum()


#    assert topics['prob'].sum() >= 1-1e-15
#    assert topics['prob'].sum() <= 1+1e-15

    num_topics = len(topics)

    phrases = dict()

    #print >> sys.stderr, 'Creating candidate n-grams...'

    ngram = dict([(l, l * ['']) for l in xrange(1, max_phrase_len + 1)])
    doc = dict([(l, l * [-1]) for l in xrange(1, max_phrase_len + 1)])
    topic = dict([(l, l * [-1]) for l in xrange(1, max_phrase_len + 1)])

    counts = dict([(l, defaultdict(lambda: zeros(num_topics + 2, dtype=int)))
                   for l in xrange(1, max_phrase_len + 1)])
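    # counts[l] maps each l-gram to a vector of length num_topics + 2:
    # per-topic counts of occurrences whose tokens all share one topic,
    # the total of those ('same'), and all within-document occurrences ('all')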

    for _, row in state.iterrows():
        for l in xrange(1, max_phrase_len + 1):
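            # ngram[l], doc[l] and topic[l] are sliding windows holding the
            # last l words, together with their documents and topic assignments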

            ngram[l] = ngram[l][1:] + [row['word']]
            doc[l] = doc[l][1:] + [row['doc']]
            topic[l] = topic[l][1:] + [row['topic']]

            if len(set(doc[l])) == 1:
                if len(set(topic[l])) == 1:
                    counts[l][tuple(ngram[l])][row['topic']] += 1
                    counts[l][tuple(ngram[l])][num_topics] += 1
                counts[l][tuple(ngram[l])][num_topics + 1] += 1

    for l in xrange(1, max_phrase_len + 1):

        ngrams = DataFrame.from_records(
            [[' '.join(x), ' '.join(x[:-1]), ' '.join(x[1:])] + y.tolist()
             for x, y in counts[l].items()],
            columns=(['ngram', 'prefix', 'suffix'] + range(num_topics) +
                     ['same', 'all']))
        counts[l] = ngrams

        #        tmp = state.groupby('doc')['doc'].count()
        #        tmp = (len(state) - tmp[tmp < l].sum() - len(tmp[tmp >= l]) * (l - 1))
        #        assert ngrams['all'].sum() == tmp
        #        assert (sum(ngrams[range(0, num_topics)].sum(axis=1) ==
        #                    ngrams['same']) == len(ngrams))

        #print >> sys.stderr, 'Selecting %d-gram phrases...' % l

        if l == 1:
            phrases[l] = set(
                ngrams[ngrams['all'] >= min_phrase_count]['ngram'])
            continue

        n = ngrams['all'].sum()

        if test == bfu or test == bfc:
            alpha = 1.0
            alpha_sum = 4 * alpha
            beta = alpha_sum / n

        prefix_cache = ngrams.groupby('prefix')['all'].sum()
        suffix_cache = ngrams.groupby('suffix')['all'].sum()

        #        assert prefix_cache.sum() == ngrams['all'].sum()
        #        assert suffix_cache.sum() == ngrams['all'].sum()

        scores = len(ngrams) * [None]

        for idx, row in ngrams[ngrams['prefix'].isin(phrases[l - 1])
                               & ngrams['suffix'].isin(phrases[l - 1]) &
                               (ngrams['all'] >= min_phrase_count)].iterrows():
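            # 2x2 contingency table for the test: a = this n-gram's count,
            # b = same suffix with a different prefix, c = same prefix with a
            # different suffix, d = all remaining n-grams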

            a = row['all']

            a_plus_b = suffix_cache[row['suffix']]
            a_plus_c = prefix_cache[row['prefix']]

            b = a_plus_b - a
            c = a_plus_c - a
            d = n - a_plus_b - c

            args = [a, b, c, d, n, a_plus_b, a_plus_c]

            if test == bfu:
                args += [alpha, alpha_sum, beta]
            elif test == bfc:
                args += [alpha, alpha_sum]

            scores[idx] = test(*args)

        ngrams['score'] = scores

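        # Bayes-factor tests keep phrases whose factor is at most 1/10; the
        # other tests compare the score against 10.83, roughly the chi-square
        # critical value at p = 0.001 with one degree of freedom.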
        if test == bfu or test == bfc:
            keep = ngrams['score'] <= (1.0 / 10)
        else:
            keep = ngrams['score'] > 10.83

        if selection == 'none':
            phrases[l] = set(ngrams[keep]['ngram'])
        else:
            if l == 2:
                phrases[l] = dict(ngrams[keep].set_index('ngram')['score'])
            else:
                m = 2 if selection == 'bigram' else l - 1
                if test == bfu or test == bfc:
                    tmp = set([
                        k for k, v in phrases[m].items() if v <= percentile(
                            sorted(phrases[m].values(), reverse=True),
                            (1.0 - 1.0 / 2**l) * 100)
                    ])
                else:
                    tmp = set([
                        k for k, v in phrases[m].items()
                        if v >= percentile(sorted(phrases[m].values()),
                                           (1.0 - 1.0 / 2**l) * 100)
                    ])
                if selection == 'bigram':
                    keep &= Series([
                        all([
                            ' '.join(bigram) in tmp
                            for bigram in zip(words, words[1:])
                        ]) for words in
                        [ngram.split() for ngram in ngrams['ngram']]
                    ])
                    phrases[l] = set(ngrams[keep]['ngram'])
                else:
                    keep &= (ngrams['prefix'].isin(tmp)
                             & ngrams['suffix'].isin(tmp))
                    phrases[l] = dict(ngrams[keep].set_index('ngram')['score'])

        ngrams.drop(['prefix', 'suffix', 'score'], axis=1, inplace=True)

    if selection == 'bigram':
        phrases[2] = set(phrases[2].keys())
    elif selection == 'n-1-gram':
        for l in xrange(2, max_phrase_len + 1):
            phrases[l] = set(phrases[l].keys())

    scores = defaultdict(lambda: defaultdict(float))

    for l in xrange(1, max_phrase_len + 1):

        ngrams = counts[l]
        n = ngrams['same'].sum()
        ngrams['prob'] = ngrams['same'] / n

        for topic in xrange(num_topics):

            n_topic = ngrams[topic].sum()
            p_topic = topics['prob'][topic]
            p_not_topic = 1.0 - p_topic

            for _, row in ngrams[(ngrams['ngram'].isin(phrases[l]))
                                 & (ngrams[topic] > 0)].iterrows():

                p_phrase = row['prob']
                p_topic_g_phrase = row[topic] / row['same']
                p_topic_g_not_phrase = ((n_topic - row[topic]) /
                                        (n - row['same']))

                p_not_phrase = 1.0 - p_phrase
                p_not_topic_g_phrase = 1.0 - p_topic_g_phrase
                p_not_topic_g_not_phrase = 1.0 - p_topic_g_not_phrase

                a = 0.0

                if p_topic_g_phrase != 0.0:
                    a += (p_topic_g_phrase *
                          (log2(p_topic_g_phrase) - log2(p_topic)))
                if p_not_topic_g_phrase != 0.0:
                    a += (p_not_topic_g_phrase *
                          (log2(p_not_topic_g_phrase) - log2(p_not_topic)))

                b = 0.0

                if p_topic_g_not_phrase != 0.0:
                    b += (p_topic_g_not_phrase *
                          (log2(p_topic_g_not_phrase) - log2(p_topic)))
                if p_not_topic_g_not_phrase != 0.0:
                    b += (p_not_topic_g_not_phrase *
                          (log2(p_not_topic_g_not_phrase) - log2(p_not_topic)))

                scores[topic][row['ngram']] = p_phrase * a + p_not_phrase * b

    for topic, row in topics.iterrows():
        print '---Topic %d---' % (topic)
        print '\n'.join([
            '%s\t%f' % (x, y) for x, y in sorted(
                scores[topic].items(), key=(lambda x: x[1]), reverse=True)
        ]) + '\n'

    return