def test_iterator(self):
    reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)
    df = read_csv(StringIO(self.data1), index_col=0)

    chunk = reader.get_chunk(3)
    assert_frame_equal(chunk, df[:3])

    last_chunk = reader.get_chunk(5)
    assert_frame_equal(last_chunk, df[3:])

    # pass list
    lines = list(csv.reader(StringIO(self.data1)))
    parser = TextParser(lines, index_col=0, chunksize=2)

    df = read_csv(StringIO(self.data1), index_col=0)

    chunks = list(parser)
    assert_frame_equal(chunks[0], df[:2])
    assert_frame_equal(chunks[1], df[2:4])
    assert_frame_equal(chunks[2], df[4:])

    treader = read_table(StringIO(self.data1), sep=',', index_col=0,
                         iterator=True)
    self.assert_(isinstance(treader, TextParser))

def test_parse_public_s3_bucket_chunked(self):
    # Read with a chunksize
    chunksize = 5
    local_tips = read_csv(tm.get_data_path('tips.csv'))
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        if comp == 'bz2' and compat.PY2:
            # The Python 2 C parser can't read bz2 from S3.
            self.assertRaises(ValueError, read_csv,
                              's3://pandas-test/tips.csv' + ext,
                              compression=comp)
        else:
            df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                 chunksize=chunksize, compression=comp)
            self.assertEqual(df_reader.chunksize, chunksize)
            for i_chunk in [0, 1, 2]:
                # Read a couple of chunks and make sure we see them
                # properly.
                df = df_reader.get_chunk()
                self.assertTrue(isinstance(df, DataFrame))
                self.assertFalse(df.empty)
                true_df = local_tips.iloc[
                    chunksize * i_chunk: chunksize * (i_chunk + 1)]
                # Chunking doesn't preserve row numbering
                true_df = true_df.reset_index().drop('index', axis=1)
                tm.assert_frame_equal(true_df, df)

def test_deprecated_args(self, engine, kwargs):
    data = "1,2,3"
    arg, _ = list(kwargs.items())[0]

    with tm.assert_produces_warning(
            FutureWarning, check_stacklevel=False):
        read_csv(StringIO(data), engine=engine, **kwargs)

def sg1_vs_sg2_plotly():
    """plot SG #1 vs #2 via plotly"""
    out_fig = Figure()
    bisectrix = Scatter(x=[0, 230], y=[0, 230], mode='lines',
                        name='bisectrix', showlegend=False)
    inmatdb_df = read_csv(
        'mpworks/check_snl/results/bad_snlgroups_2_in_matdb.csv')
    inmatdb_text = map(','.join, zip(
        inmatdb_df['task_id 1'], inmatdb_df['task_id 2']
    ))
    inmatdb_trace = Scatter(
        x=inmatdb_df['sg_num 2'].as_matrix(),
        y=inmatdb_df['sg_num 1'].as_matrix(),
        text=inmatdb_text, mode='markers', name='in MatDB'
    )
    notinmatdb_df = read_csv(
        'mpworks/check_snl/results/bad_snlgroups_2_notin_matdb.csv')
    notinmatdb_text = map(','.join, zip(
        map(str, notinmatdb_df['snlgroup_id 1']),
        map(str, notinmatdb_df['snlgroup_id 2'])
    ))
    notinmatdb_trace = Scatter(
        x=notinmatdb_df['sg_num 2'].as_matrix() + 0.1,
        y=notinmatdb_df['sg_num 1'].as_matrix() + 0.1,
        text=notinmatdb_text, mode='markers', name='not in MatDB'
    )
    out_fig['data'] = Data([bisectrix, notinmatdb_trace, inmatdb_trace])
    out_fig['layout'] = Layout(
        hovermode='closest',
        title='Spacegroup Assignment Comparison of matching Canonical SNLs',
        xaxis=XAxis(showgrid=False, title='SG #2', range=[0, 230]),
        yaxis=YAxis(showgrid=False, title='SG #1', range=[0, 230]),
    )
    filename = 'spacegroup_canonicals_'
    filename += datetime.datetime.now().strftime('%Y-%m-%d')
    py.plot(out_fig, filename=filename, auto_open=False)
    py.image.save_as(out_fig, 'canonicals_spacegroups.png')

def test_multi_index_no_level_names(self):
    data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""

    data2 = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""

    lines = data.split('\n')
    no_header = '\n'.join(lines[1:])
    names = ['A', 'B', 'C', 'D']
    df = read_csv(StringIO(no_header), index_col=[0, 1], names=names)
    expected = read_csv(StringIO(data), index_col=[0, 1])
    assert_frame_equal(df, expected)

    # 2 implicit first cols
    df2 = read_csv(StringIO(data2))
    assert_frame_equal(df2, df)

def init_test_dataframes():
    inp_df = read_csv(test_data_address_in, sep=',')
    inp_df.columns = [inp_columns]
    trg_df = read_csv(test_data_address_target, sep=',')
    trg_df.columns = [trg_column]
    inp_df = inp_df.replace(mapping.keys(), mapping.values())
    return inp_df, trg_df

def test_pass_names_with_index(self):
    lines = self.data1.split('\n')
    no_header = '\n'.join(lines[1:])

    # regular index
    names = ['index', 'A', 'B', 'C', 'D']
    df = read_csv(StringIO(no_header), index_col=0, names=names)
    expected = read_csv(StringIO(self.data1), index_col=0)
    assert_frame_equal(df, expected)

    # multi index
    data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    lines = data.split('\n')
    no_header = '\n'.join(lines[1:])
    names = ['index1', 'index2', 'A', 'B', 'C', 'D']
    df = read_csv(StringIO(no_header), index_col=[0, 1], names=names)
    expected = read_csv(StringIO(data), index_col=[0, 1])
    assert_frame_equal(df, expected)

    df = read_csv(StringIO(data), index_col=['index1', 'index2'])
    assert_frame_equal(df, expected)

def test_parse_public_s3n_bucket(self):
    # Read from AWS s3 as "s3n" URL
    df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
    assert isinstance(df, DataFrame)
    assert not df.empty
    tm.assert_frame_equal(read_csv(
        tm.get_data_path('tips.csv')).iloc[:10], df)

def import_peaks(options, file_name, peaks_file_name):
    """
    Given a peak file, save a temp table and store peak data.
    """
    AtlasPeak.create_table(file_name)
    # AtlasPeak.set_table_name('peak_' + file_name)

    if not options.not_homer:
        # Find header row
        header_row = None
        with open(peaks_file_name) as f:
            for i, line in enumerate(f):
                if line[:7] == "#PeakID":
                    header_row = i
                    break
        if header_row is None:
            raise Exception("There is no header in this Homer peak file!")
        data = read_csv(peaks_file_name, sep="\t", header=header_row)
    else:
        data = read_csv(peaks_file_name, sep="\t", header=None)

    for _, row in data.iterrows():
        if not options.not_homer:
            peak = AtlasPeak.init_from_homer_row(row)
        else:
            peak = getattr(
                AtlasPeak,
                "init_from_{0}_row".format(options.not_homer))(row)
        if peak:
            peak.save()

    AtlasPeak.add_indices()

def test_parse_cols_str(self):
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    suffix = ["", "x"]

    for s in suffix:
        pth = os.path.join(self.dirpath, "test.xls%s" % s)
        xls = ExcelFile(pth)

        df = xls.parse("Sheet1", index_col=0, parse_dates=True,
                       parse_cols="A:D")
        df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
        df2 = df2.reindex(columns=["A", "B", "C"])
        df3 = xls.parse("Sheet2", skiprows=[1], index_col=0,
                        parse_dates=True, parse_cols="A:D")
        tm.assert_frame_equal(df, df2, check_names=False)
        # TODO add index to xls, read xls ignores index name ?
        tm.assert_frame_equal(df3, df2, check_names=False)
        del df, df2, df3

        df = xls.parse("Sheet1", index_col=0, parse_dates=True,
                       parse_cols="A,C,D")
        df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
        df2 = df2.reindex(columns=["B", "C"])
        df3 = xls.parse("Sheet2", skiprows=[1], index_col=0,
                        parse_dates=True, parse_cols="A,C,D")
        tm.assert_frame_equal(df, df2, check_names=False)
        # TODO add index to xls file
        tm.assert_frame_equal(df3, df2, check_names=False)
        del df, df2, df3

        df = xls.parse("Sheet1", index_col=0, parse_dates=True,
                       parse_cols="A,C:D")
        df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
        df2 = df2.reindex(columns=["B", "C"])
        df3 = xls.parse("Sheet2", skiprows=[1], index_col=0,
                        parse_dates=True, parse_cols="A,C:D")
        tm.assert_frame_equal(df, df2, check_names=False)
        tm.assert_frame_equal(df3, df2, check_names=False)

def test_parse_public_s3a_bucket(self):
    # Read from AWS s3 as "s3a" URL
    df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
    self.assertTrue(isinstance(df, DataFrame))
    self.assertFalse(df.empty)
    tm.assert_frame_equal(read_csv(
        tm.get_data_path('tips.csv')).iloc[:10], df)

def test_deprecated_args(self):
    data = '1,2,3'

    # deprecated arguments with non-default values
    deprecated = {
        'as_recarray': True,
        'buffer_lines': True,
        'compact_ints': True,
        'skip_footer': True,
        'use_unsigned': True,
    }

    engines = 'c', 'python'

    for engine in engines:
        for arg, non_default_val in deprecated.items():
            if engine == 'c' and arg == 'skip_footer':
                # unsupported --> exception is raised
                continue

            if engine == 'python' and arg == 'buffer_lines':
                # unsupported --> exception is raised
                continue

            with tm.assert_produces_warning(
                    FutureWarning, check_stacklevel=False):
                kwargs = {arg: non_default_val}
                read_csv(StringIO(data), engine=engine, **kwargs)

def test_empty_field_eof(self):
    data = 'a,b,c\n1,2,3\n4,,'

    result = TextReader(StringIO(data), delimiter=',').read()

    expected = {0: np.array([1, 4]),
                1: np.array(['2', ''], dtype=object),
                2: np.array(['3', ''], dtype=object)}
    assert_array_dicts_equal(result, expected)

    # GH5664
    a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c'])
    b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]],
                  columns=list('abcd'),
                  index=[1, 1])
    c = DataFrame([[1, 2, 3, 4],
                   [6, nan, nan, nan],
                   [8, 9, 10, 11],
                   [13, 14, nan, nan]],
                  columns=list('abcd'),
                  index=[0, 5, 7, 12])

    for _ in range(100):
        df = read_csv(StringIO('a,b\nc\n'), skiprows=0,
                      names=['a'], engine='c')
        assert_frame_equal(df, a)

        df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2),
                      names=list("abcd"), engine='c')
        assert_frame_equal(df, b)

        df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'),
                      names=list('abcd'), engine='c')
        assert_frame_equal(df, c)

def test_multiple_date_col_named_components(self):
    xp = read_csv(StringIO(self.ts_data),
                  parse_dates={'nominal': [1, 2]},
                  index_col='nominal')
    colspec = {'nominal': ['date', 'nominalTime']}
    df = read_csv(StringIO(self.ts_data), parse_dates=colspec,
                  index_col='nominal')
    assert_frame_equal(df, xp)

def test_na_value_dict(self):
    data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""

    df = read_csv(StringIO(data),
                  na_values={'A': ['foo'], 'B': ['bar']})
    expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
                          'B': [np.nan, 'foo', np.nan, 'foo'],
                          'C': [np.nan, 'foo', np.nan, 'foo']})
    assert_frame_equal(df, expected)

    data = """\
a,b,c,d
0,NA,1,5
"""
    xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0])
    xp.index.name = 'a'
    df = read_csv(StringIO(data), na_values={}, index_col=0)
    assert_frame_equal(df, xp)

    xp = DataFrame({'b': [np.nan], 'd': [5]},
                   MultiIndex.from_tuples([(0, 1)]))
    df = read_csv(StringIO(data), na_values={}, index_col=[0, 2])
    assert_frame_equal(df, xp)

    xp = DataFrame({'b': [np.nan], 'd': [5]},
                   MultiIndex.from_tuples([(0, 1)]))
    df = read_csv(StringIO(data), na_values={}, index_col=['a', 'c'])
    assert_frame_equal(df, xp)

def test_index_col_named(self):
    no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

    h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
    data = h + no_header
    rs = read_csv(StringIO(data), index_col='ID')
    xp = read_csv(StringIO(data), header=0).set_index('ID')
    assert_frame_equal(rs, xp)

    self.assertRaises(ValueError, read_csv, StringIO(no_header),
                      index_col='ID')

    data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
    names = ['a', 'b', 'c', 'd', 'message']
    xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10],
                    'c': [3, 7, 11], 'd': [4, 8, 12]},
                   index=Index(['hello', 'world', 'foo'], name='message'))
    rs = read_csv(StringIO(data), names=names, index_col=['message'])
    assert_frame_equal(xp, rs)
    self.assert_(xp.index.name == rs.index.name)

    rs = read_csv(StringIO(data), names=names, index_col='message')
    assert_frame_equal(xp, rs)
    self.assert_(xp.index.name == rs.index.name)

def test_parse_dates_column_list(self):
    from pandas.core.datetools import to_datetime
    data = '''date;destination;ventilationcode;unitcode;units;aux_date
01/01/2010;P;P;50;1;12/1/2011
01/01/2010;P;R;50;1;13/1/2011
15/01/2010;P;P;50;1;14/1/2011
01/05/2010;P;P;50;1;15/1/2011'''

    expected = read_csv(StringIO(data), sep=";", index_col=range(4))
    lev = expected.index.levels[0]
    expected.index.levels[0] = lev.to_datetime(dayfirst=True)
    expected['aux_date'] = to_datetime(expected['aux_date'],
                                       dayfirst=True)
    expected['aux_date'] = map(Timestamp, expected['aux_date'])
    self.assert_(isinstance(expected['aux_date'][0], datetime))

    df = read_csv(StringIO(data), sep=";", index_col=range(4),
                  parse_dates=[0, 5], dayfirst=True)
    assert_frame_equal(df, expected)

    df = read_csv(StringIO(data), sep=";", index_col=range(4),
                  parse_dates=['date', 'aux_date'], dayfirst=True)
    assert_frame_equal(df, expected)

def test_sniff_delimiter(self):
    text = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    data = read_csv(StringIO(text), index_col=0, sep=None)
    self.assert_(data.index.equals(Index(['foo', 'bar', 'baz'])))

    data2 = read_csv(StringIO(text), index_col=0, delimiter='|')
    assert_frame_equal(data, data2)

    text = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    data3 = read_csv(StringIO(text), index_col=0, sep=None, skiprows=2)
    assert_frame_equal(data, data3)

    # can't get this to work on Python 3
    if not py3compat.PY3:
        text = u"""ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
""".encode('utf-8')
        data4 = read_csv(BytesIO(text), index_col=0, sep=None,
                         skiprows=2, encoding='utf-8')
        assert_frame_equal(data, data4)

def savePrediction8(self):
    """
    save the predicted coordinates for the 8-feature set
    into a csv file to upload
    """
    # transform predictions
    prediction = self.prediction * 48 + 48
    prediction = prediction.clip(0, 96)

    # read id list
    idset = read_csv(os.path.expanduser(self.fIdList))

    outputPrediction = []
    mapping = {1: 1, 2: 2, 3: 3, 4: 4, 21: 5, 22: 6, 29: 7, 30: 8}
    for i in range(len(idset)):
        # we only predict the second part of the set of images,
        # so we need to shift by 592
        # TODO(tobias): shift the images in IdList_8.csv
        ImageID = idset['ImageId'][i] - 592
        Feature = idset['FeatureName'][i]
        newFeatureId = mapping[Feature]
        outputPrediction.append(prediction[ImageID, newFeatureId - 1])

    # read output list
    outputset = read_csv(os.path.expanduser(self.fOutputList))
    # fill output list with predictions
    outputset['Location'] = outputPrediction
    # write output list to disk
    outputset.to_csv(os.path.expanduser(self.fOutFile), index=False)

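# A minimal sketch of the coordinate transform used in savePrediction8
# above, assuming (as the clip to [0, 96] suggests) that the network
# emits keypoints scaled to [-1, 1]: multiplying by 48 and adding 48
# maps them back onto the 96x96 pixel grid, and clipping guards against
# out-of-range outputs. The raw values below are hypothetical.
import numpy as np

raw = np.array([[-1.0, 0.0], [0.5, 1.2]])   # stand-in network outputs
pixels = (raw * 48 + 48).clip(0, 96)
print(pixels)  # [[ 0. 48.] [72. 96.]] -- 1.2 is clipped to 96
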
def test_iterator(self):
    reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)
    df = read_csv(StringIO(self.data1), index_col=0)

    chunk = reader.get_chunk(3)
    assert_frame_equal(chunk, df[:3])

    last_chunk = reader.get_chunk(5)
    assert_frame_equal(last_chunk, df[3:])

    # pass list
    lines = list(csv.reader(StringIO(self.data1)))
    parser = TextParser(lines, index_col=0, chunksize=2)

    df = read_csv(StringIO(self.data1), index_col=0)

    chunks = list(parser)
    assert_frame_equal(chunks[0], df[:2])
    assert_frame_equal(chunks[1], df[2:4])
    assert_frame_equal(chunks[2], df[4:])

    # pass skiprows
    parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
    chunks = list(parser)
    assert_frame_equal(chunks[0], df[1:3])

    # test bad parameter (skip_footer)
    reader = read_csv(StringIO(self.data1), index_col=0,
                      iterator=True, skip_footer=True)
    self.assertRaises(ValueError, reader.get_chunk, 3)

    treader = read_table(StringIO(self.data1), sep=',', index_col=0,
                         iterator=True)
    self.assert_(isinstance(treader, TextParser))

def test_infer_s3_compression(self, s3_resource):
    for ext in ['', '.gz', '.bz2']:
        df = read_csv('s3://pandas-test/tips.csv' + ext,
                      engine='python', compression='infer')
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')), df)

def test_infer_s3_compression(self):
    for ext in ['', '.gz', '.bz2']:
        df = read_csv('s3://pandas-test/tips.csv' + ext,
                      engine='python', compression='infer')
        self.assertTrue(isinstance(df, DataFrame))
        self.assertFalse(df.empty)
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')), df)

def test_parse_public_s3_bucket_nrows(self):
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext,
                      nrows=10, compression=comp)
        self.assertTrue(isinstance(df, DataFrame))
        self.assertFalse(df.empty)
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')).iloc[:10], df)

def test_s3_fails(self):
    with tm.assertRaises(IOError):
        read_csv('s3://nyqpug/asdf.csv')

    # Receive a permission error when trying to read a private bucket.
    # It's irrelevant here that this isn't actually a table.
    with tm.assertRaises(IOError):
        read_csv('s3://cant_get_it/')

def test_nrows_and_chunksize(self):
    data = 'a b c'
    msg = "cannot be used together yet"

    for engine in ('c', 'python'):
        with tm.assertRaisesRegexp(NotImplementedError, msg):
            read_csv(StringIO(data), engine=engine,
                     nrows=10, chunksize=5)

def test_parse_public_s3_bucket_nrows_python(self):
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext,
                      engine='python', nrows=10, compression=comp)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(read_csv(
            tm.get_data_path('tips.csv')).iloc[:10], df)

def test_csv_custom_parser(self):
    data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    df = read_csv(StringIO(data),
                  date_parser=lambda x: datetime.strptime(x, "%Y%m%d"))
    expected = read_csv(StringIO(data), parse_dates=True)
    assert_frame_equal(df, expected)

def test_mangle_dupe_cols_false(self):
    # see gh-12935
    data = 'a b c\n1 2 3'
    msg = 'is not supported'

    for engine in ('c', 'python'):
        with tm.assertRaisesRegexp(ValueError, msg):
            read_csv(StringIO(data), engine=engine,
                     mangle_dupe_cols=False)

def test_read_chunksize_named(self):
    reader = read_csv(StringIO(self.data1), index_col='index',
                      chunksize=2)
    df = read_csv(StringIO(self.data1), index_col='index')

    chunks = list(reader)

    assert_frame_equal(chunks[0], df[:2])
    assert_frame_equal(chunks[1], df[2:4])
    assert_frame_equal(chunks[2], df[4:])

def test_parse_dates_implicit_first_col(self):
    data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
    df = read_csv(StringIO(data), parse_dates=True)
    expected = read_csv(StringIO(data), index_col=0, parse_dates=True)
    self.assert_(isinstance(df.index[0],
                            (datetime, np.datetime64, Timestamp)))
    assert_frame_equal(df, expected)

def read_csv(self, file):
    return read_csv(file, parse_dates=True)

lcbc = args.lencbc
lumi = args.lenumi
umifirst = args.umifirst
cbcfile = args.cbcfile
hd = args.cbchd

#### Define input fastq files ####
fq1 = fqr + '_R1.fastq.gz'
fq2 = fqr + '_R2.fastq.gz'
if not os.path.isfile(fq1) or not os.path.isfile(fq2):
    print 'fastq files not found'
    sys.exit()

#### Read barcodes ####
dbc = read_csv(cbcfile, sep='\t', index_col=0, header=None)
if not all([len(idx) == lcbc for idx in dbc.index]):
    sys.exit('barcode length provided does not match reference set')
d = {idx: dbc.loc[idx, 1] for idx in dbc.index}
bc2sample = expandBCset(d, hd)

#### Do the job ####
fout = open(fqr + '_cbc.fastq', 'w+')
nt = 0
ns = 0
with gzip.open(fq1) as f1, gzip.open(fq2) as f2:
    for idx, (l1, l2) in enumerate(it.izip(f1, f2)):
        l1, l2 = l1.rstrip().rsplit(' ')[0], l2.rstrip().rsplit(' ')[0]
        l = np.mod(idx, 4)
        if l == 0:

"RIFLDIY02_N.B": 'Swap2Y', "RIFLDIY03_N.B": 'Swap3Y', "RIFLDIY04_N.B": 'Swap4Y', "RIFLDIY05_N.B": 'Swap5Y', "RIFLDIY07_N.B": 'Swap7Y', "RIFLDIY10_N.B": 'Swap10Y', "RIFLDIY30_N.B": 'Swap30Y', "RILSPDEPM01_N.B": 'Libor1M', "RILSPDEPM03_N.B": 'Libor3M', "RILSPDEPM06_N.B": 'Libor6M' } # Parse the file: skip the first 5 rows, headers are on row 6, # ND and NC indicate missing values, first column is the index and contains # dates df_libor = read_csv(fname, header=5, skiprows=range(5), na_values=['ND', 'NC'], index_col=0, parse_dates=True) # Convert column names to simple labels df_libor = df_libor.rename(columns=columns_dic) good_rows = df_libor.apply(good_row, axis=1) df_libor_clean = df_libor[good_rows] df_libor_clean.save('data/df_libor.pkl')
def _parse_level_0cs(fp):
    """
    Parses an EVE Level 0CS file.
    """
    is_missing_data = False  # boolean to check for missing data
    missing_data_val = np.nan
    header = []
    fields = []
    line = fp.readline()
    # Read header at top of file
    while line.startswith(";"):
        header.append(line)
        if '; Missing data:' in line:
            is_missing_data = True
            missing_data_val = line.split(':')[1].strip()
        line = fp.readline()

    meta = MetaDict()
    for hline in header:
        if hline == '; Format:\n' or hline == '; Column descriptions:\n':
            continue
        elif ('Created' in hline) or ('Source' in hline):
            meta[hline.split(':', 1)[0].replace(';', ' ').strip()] = \
                hline.split(':', 1)[1].strip()
        elif ':' in hline:
            meta[hline.split(':')[0].replace(';', ' ').strip()] = \
                hline.split(':')[1].strip()

    fieldnames_start = False
    for hline in header:
        if hline.startswith("; Format:"):
            fieldnames_start = False
        if fieldnames_start:
            fields.append(hline.split(":")[0].replace(';', ' ').strip())
        if hline.startswith("; Column descriptions:"):
            fieldnames_start = True

    # Next line is YYYY DOY MM DD
    date_parts = line.split(" ")
    year = int(date_parts[0])
    month = int(date_parts[2])
    day = int(date_parts[3])

    def parser(x):
        # Parse date column (HHMM)
        return datetime(year, month, day, int(x[0:2]), int(x[2:4]))

    data = read_csv(fp, sep=r"\s+", names=fields, index_col=0,
                    date_parser=parser, header=None, engine='python')
    if is_missing_data:  # If missing data specified in header
        data[data == float(missing_data_val)] = np.nan

    # Add the units data
    units = OrderedDict([('XRS-B proxy', u.W/u.m**2),
                         ('XRS-A proxy', u.W/u.m**2),
                         ('SEM proxy', u.W/u.m**2),
                         ('0.1-7ESPquad', u.W/u.m**2),
                         ('17.1ESP', u.W/u.m**2),
                         ('25.7ESP', u.W/u.m**2),
                         ('30.4ESP', u.W/u.m**2),
                         ('36.6ESP', u.W/u.m**2),
                         ('darkESP', u.ct),
                         ('121.6MEGS-P', u.W/u.m**2),
                         ('darkMEGS-P', u.ct),
                         ('q0ESP', u.dimensionless_unscaled),
                         ('q1ESP', u.dimensionless_unscaled),
                         ('q2ESP', u.dimensionless_unscaled),
                         ('q3ESP', u.dimensionless_unscaled),
                         ('CMLat', u.deg),
                         ('CMLon', u.deg)])
    # Todo: check units used.
    return data, meta, units

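# A tiny self-contained check of the HHMM date-parsing idea used by the
# `parser` closure in _parse_level_0cs above (the year/month/day and
# column value here are hypothetical, not taken from a real EVE file):
from datetime import datetime

year, month, day = 2014, 1, 30
x = "0237"  # an HHMM column value as read from the file
t = datetime(year, month, day, int(x[0:2]), int(x[2:4]))
print(t)  # 2014-01-30 02:37:00
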
def _parse_average_csv(fp):
    """Parses an EVE Averages file."""
    return "", read_csv(fp, sep=",", index_col=0, parse_dates=True)

def test_c_engine(self):
    # see gh-6607
    data = 'a b c\n1 2 3'
    msg = 'does not support'

    # specify C engine with unsupported options (raise)
    with pytest.raises(ValueError, match=msg):
        read_csv(StringIO(data), engine='c',
                 sep=None, delim_whitespace=False)
    with pytest.raises(ValueError, match=msg):
        read_csv(StringIO(data), engine='c', sep=r'\s')
    with pytest.raises(ValueError, match=msg):
        read_csv(StringIO(data), engine='c', sep='\t',
                 quotechar=chr(128))
    with pytest.raises(ValueError, match=msg):
        read_csv(StringIO(data), engine='c', skipfooter=1)

    # specify C-unsupported options without python-unsupported options
    with tm.assert_produces_warning(parsers.ParserWarning):
        read_csv(StringIO(data), sep=None, delim_whitespace=False)
    with tm.assert_produces_warning(parsers.ParserWarning):
        read_csv(StringIO(data), sep=r'\s')
    with tm.assert_produces_warning(parsers.ParserWarning):
        read_csv(StringIO(data), sep='\t', quotechar=chr(128))
    with tm.assert_produces_warning(parsers.ParserWarning):
        read_csv(StringIO(data), skipfooter=1)

    text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
    msg = 'Error tokenizing data'

    with pytest.raises(ParserError, match=msg):
        read_csv(StringIO(text), sep='\\s+')
    with pytest.raises(ParserError, match=msg):
        read_csv(StringIO(text), engine='c', sep='\\s+')

    msg = "Only length-1 thousands markers supported"
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    with pytest.raises(ValueError, match=msg):
        read_csv(StringIO(data), thousands=',,')
    with pytest.raises(ValueError, match=msg):
        read_csv(StringIO(data), thousands='')

    msg = "Only length-1 line terminators supported"
    data = 'a,b,c~~1,2,3~~4,5,6'
    with pytest.raises(ValueError, match=msg):
        read_csv(StringIO(data), lineterminator='~~')

def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_csv.

    Parameters
    ----------
    sep : str, default '\s+'
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    **kwargs
        See read_csv for the full argument list.

    Returns
    -------
    DataFrame
        A parsed DataFrame object.
    """
    encoding = kwargs.pop("encoding", "utf-8")

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace("-", "") != "utf8":
        raise NotImplementedError(
            "reading from clipboard only supports utf-8 encoding")

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_csv

    text = clipboard_get()

    # Try to decode (if needed, as "text" might already be a string here).
    try:
        text = text.decode(
            kwargs.get("encoding") or get_option("display.encoding"))
    except AttributeError:
        pass

    # Excel copies into clipboard with \t separation.
    # Inspect no more than the first 10 lines; if they all contain an
    # equal number (>0) of tabs, infer that this came from Excel and
    # set 'sep' accordingly.
    lines = text[:10000].split("\n")[:-1][:10]

    # Need to remove leading white space, since read_csv accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = {x.lstrip(" ").count("\t") for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = "\t"
        # check the number of leading tabs in the first line
        # to account for index columns
        index_length = len(lines[0]) - len(lines[0].lstrip(" \t"))
        if index_length != 0:
            kwargs.setdefault("index_col", list(range(index_length)))

    # Edge case where sep is specified to be None, return to default
    if sep is None and kwargs.get("delim_whitespace") is None:
        sep = r"\s+"

    # Regex separator currently only works with python engine.
    # Default to python if separator is multi-character (regex)
    if len(sep) > 1 and kwargs.get("engine") is None:
        kwargs["engine"] = "python"
    elif len(sep) > 1 and kwargs.get("engine") == "c":
        warnings.warn(
            "read_clipboard with regex separator does not work "
            "properly with c engine.")

    return read_csv(StringIO(text), sep=sep, **kwargs)

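# A minimal usage sketch for read_clipboard above. It assumes an
# Excel-style, tab-separated block of cells is already on the system
# clipboard, which is why the calls are left commented out:
#
#     df = read_clipboard()           # tab separation is auto-detected
#     df = read_clipboard(sep=",")    # or force a specific delimiter
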
import tensorflow as tf
import numpy as np
from pandas.io.parsers import read_csv

# Multivariate linear regression: the model to use when several
# explanatory variables influence the outcome.
model = tf.global_variables_initializer()

data = read_csv('cancer5.csv', sep=',')

xy = np.array(data, dtype=np.float32)

x_data = xy[:, :-1]
y_data = xy[:, [-1]]  # the "status" value

X = tf.placeholder(tf.float32, shape=[None, 7])
Y = tf.placeholder(tf.float32, shape=[None, 1])
W = tf.Variable(tf.random_normal([7, 1]), name="weight")
b = tf.Variable(tf.random_normal([1]), name="bias")

hypothesis = tf.matmul(X, W) + b
cost = tf.reduce_mean(tf.square(hypothesis - Y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.00005)
train = optimizer.minimize(cost)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for step in range(100001):
    cost_, hypo_, _ = sess.run([cost, hypothesis, train],
                               feed_dict={X: x_data, Y: y_data

"RIFLDIY02_N.B": ('Swap2Y', 2), "RIFLDIY03_N.B": ('Swap3Y', 3), "RIFLDIY04_N.B": ('Swap4Y', 4), "RIFLDIY05_N.B": ('Swap5Y', 5), "RIFLDIY07_N.B": ('Swap7Y', 7), "RIFLDIY10_N.B": ('Swap10Y', 10), "RIFLDIY30_N.B": ('Swap30Y', 30), "RILSPDEPM01_N.B": ('Libor1M', 1.0 / 12), "RILSPDEPM03_N.B": ('Libor3M', 3.0 / 12), "RILSPDEPM06_N.B": ('Libor6M', 6.0 / 12) } # convert buffer to data frame df_libor = read_csv(buff, sep=',', header=True, index_col=0, parse_dates=True, skiprows=[0, 1, 2, 3, 4]).transpose() # rename rows and columns with better names col_name_dic = {k: columns_dic[k][0] for k in columns_dic.keys()} df_libor = df_libor.rename(index=col_name_dic, columns={df_libor.columns[0]: 'Rate'}) # dictionary of maturities col_mat_dic = {columns_dic[k][0]:columns_dic[k][1] \ for k in columns_dic.keys()} # add maturity column df_libor['Maturity'] = [col_mat_dic[k] for k in df_libor.index]
# Takes a (_, 30) prediction array and writes a .csv submission file
# in the Kaggle format

import os
from datetime import datetime

from pandas import DataFrame
from pandas.io.parsers import read_csv

FLOOKUP = "data/IdLookupTable.csv"
lookup_table = read_csv(os.path.expanduser(FLOOKUP))

features = ["left_eye_center", "right_eye_center",
            "left_eye_inner_corner", "left_eye_outer_corner",
            "right_eye_inner_corner", "right_eye_outer_corner",
            "left_eyebrow_inner_end", "left_eyebrow_outer_end",
            "right_eyebrow_inner_end", "right_eyebrow_outer_end",
            "nose_tip", "mouth_left_corner", "mouth_right_corner",
            "mouth_center_top_lip", "mouth_center_bottom_lip"]

columns = [[elem + "_x", elem + "_y"] for elem in features]
columns = [elem for l in columns for elem in l]


def write_submission(y_pred):
    y_pred2 = y_pred * 48 + 48
    y_pred2 = y_pred2.clip(0, 96)
    df = DataFrame(y_pred2, columns=columns)

    values = []
    for index, row in lookup_table.iterrows():
        values.append((
            row['RowId'],
            df.ix[row.ImageId - 1][row.FeatureName],
        ))

    now_str = datetime.now().isoformat().replace(':', '-')
    submission = DataFrame(values, columns=('RowId', 'Location'))
    filename = 'submission/submission-{}.csv'.format(now_str)
    submission.to_csv(filename, index=False)

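# A usage sketch for write_submission above. The shape is an assumption
# based on the (_, 30) format the header comment mentions, and the random
# array is a stand-in for real network output in [-1, 1]; the call is
# commented out because it needs data/IdLookupTable.csv to exist:
#
#     import numpy as np
#     y_pred = np.random.uniform(-1, 1, size=(1783, 30))
#     write_submission(y_pred)  # writes submission/submission-<ts>.csv
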
import sys, os
from pandas.io.parsers import read_csv
import numpy as np
import pandas as pd
import sklearn.cluster
import matplotlib.pyplot as plt
from Colors import *

try:
    df = read_csv(sys.argv[1], sep='\t', index_col=0)
    outfile = sys.argv[2]
    pdfplot = sys.argv[3]
except:
    sys.exit(
        'Please, give path to _df.txt file (full name); '
        'root for output file; produce pdf plot (y/n)'
    )

hcl = df['hclust']
del df['hclust']

clones = {}
for cell in df.index:
    # scars = '-'.join(df.columns[df.loc[cell] > 0])
    scars = '-'.join(df.columns[df.loc[cell] > 3.5])
    if scars in clones:
        clones[scars].append(cell)
    else:
        clones[scars] = [cell]

dfnew = pd.DataFrame()
i = 0

def load_csv(file_name):
    """Load the given CSV file and return it as a NumPy float array."""
    valores = read_csv(file_name, header=None).values
    return valores.astype(float)

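# A usage sketch for load_csv above, assuming a hypothetical headerless,
# all-numeric file in the working directory:
#
#     datos = load_csv("ex1data1.csv")
#     X, y = datos[:, :-1], datos[:, -1]  # split features from target
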
def read_psf_information(pvc_psf_tsv, subject_ids, session_ids, pet_tracer):
    """Read PSF information from TSV file.

    Args:
        pvc_psf_tsv: TSV file containing participant_id, session_id,
            acq_label, psf_x, psf_y & psf_z columns
        subject_ids: list of participant IDs
            (e.g. ['sub-CLNC01', 'sub-CLNC01'])
        session_ids: list of session IDs (e.g. ['ses-M00', 'ses-M18'])
        pet_tracer: Tracer we want to select in the acq_label column.
            Other tracers will not be read by this function.

    Example of pvc_psf_tsv:
        participant_id    session_id    acq_label    psf_x    psf_y    psf_z
        sub-CLNC01        ses-M00       FDG          8        9        10
        sub-CLNC01        ses-M18       FDG          8        9        10
        sub-CLNC01        ses-M00       AV45         7        6        5
        sub-CLNC02        ses-M00       FDG          8        9        10
        sub-CLNC03        ses-M00       FDG          8        9        10

    Returns:
        PSF information following [subject_ids, session_ids] order
    """
    import os

    from pandas.io.parsers import read_csv

    if not os.path.isfile(pvc_psf_tsv):
        raise FileNotFoundError(
            f"Could not find the psf_tsv file {pvc_psf_tsv}")
    try:
        psf_df = read_csv(pvc_psf_tsv, sep="\t")
    except (IOError, UnicodeDecodeError):
        raise RuntimeError(
            f"An error happened while reading {pvc_psf_tsv}")

    if any(elem not in
           ["participant_id", "session_id", "acq_label",
            "psf_x", "psf_y", "psf_z"]
           for elem in list(psf_df.columns)):
        raise IOError(
            f"The file {pvc_psf_tsv} must contain the following columns "
            f"(separated by tabulations):\n"
            f"participant_id, session_id, acq_label, psf_x, psf_y, psf_z\n"
            f"{str(list(psf_df.columns))}\n"
            f"Pay attention to the spaces (there should be none).")

    subjects_psf = list(psf_df.participant_id)
    sessions_psf = list(psf_df.session_id)
    pet_tracer_psf = list(psf_df.acq_label)

    idx_reordered = []
    for i, sub in enumerate(subject_ids):
        current_ses = session_ids[i]
        idx_sub = [
            j for j in range(len(subjects_psf))
            if (sub == subjects_psf[j])
            and (current_ses == sessions_psf[j])
            and (pet_tracer == pet_tracer_psf[j])
        ]
        if len(idx_sub) == 0:
            raise RuntimeError(
                f"Subject {sub} with session {current_ses} and tracer "
                f"{pet_tracer} that you want to proceed was not found "
                f"in the TSV file containing PSF specifications "
                f"({pvc_psf_tsv}).")
        if len(idx_sub) > 1:
            raise RuntimeError(
                f"Subject {sub} with session {current_ses} and tracer "
                f"{pet_tracer} that you want to proceed was found "
                f"multiple times in the TSV file containing PSF "
                f"specifications ({pvc_psf_tsv}).")
        idx_reordered.append(idx_sub[0])

    psf_x = list(psf_df.psf_x)
    psf_y = list(psf_df.psf_y)
    psf_z = list(psf_df.psf_z)
    iterables_psf = [[psf_x[i], psf_y[i], psf_z[i]]
                     for i in idx_reordered]

    return iterables_psf

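# A usage sketch for read_psf_information above, matching the TSV layout
# shown in its docstring (the file path and IDs here are hypothetical):
#
#     psf = read_psf_information(
#         "psf.tsv",
#         subject_ids=["sub-CLNC01", "sub-CLNC01"],
#         session_ids=["ses-M00", "ses-M18"],
#         pet_tracer="FDG",
#     )
#     # psf == [[8, 9, 10], [8, 9, 10]] -- one [psf_x, psf_y, psf_z]
#     # triple per (subject, session) pair, in input order
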
def load_data(file_name):
    values = read_csv(file_name, header=None).values
    return values.astype(float)

def tips_df(datapath):
    """DataFrame with the tips dataset."""
    return read_csv(datapath("io", "data", "csv", "tips.csv"))

import tensorflow as tf
import numpy as np
from pandas.io.parsers import read_csv

model = tf.global_variables_initializer()

data = read_csv('C:/bachoo/price data.csv', sep=',')
xy = np.array(data, dtype=np.float32)

x_data = xy[:, 1:-1]
y_data = xy[:, [-1]]

X = tf.placeholder(tf.float32, shape=[None, 4])
Y = tf.placeholder(tf.float32, shape=[None, 1])
W = tf.Variable(tf.random_normal([4, 1]), name="weight")
b = tf.Variable(tf.random_normal([1]), name="bias")

hypothesis = tf.matmul(X, W) + b
cost = tf.reduce_mean(tf.square(hypothesis - Y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.00005)
train = optimizer.minimize(cost)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for step in range(100001):
    cost_, hypo_, _ = sess.run([cost, hypothesis, train],
                               feed_dict={X: x_data, Y: y_data})
    if step % 500 == 0:
        print("#", step, " loss: ", cost_)
        print("- cabbage price:", hypo_[0])

saver = tf.train.Saver()
save_path = saver.save(sess, "./saved.cpkt")

def test_read_s3_with_hash_in_key(self, tips_df, s3so):
    # GH 25945
    result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so)
    tm.assert_frame_equal(tips_df, result)

def _parse_level_0cs(filepath):
    """
    Parses an EVE Level 0CS file.
    """
    is_missing_data = False  # boolean to check for missing data
    missing_data_val = np.nan
    header = []
    fields = []
    with codecs.open(filepath, mode='rb', encoding='ascii') as fp:
        line = fp.readline()
        # Read header at top of file
        while line.startswith(";"):
            header.append(line)
            if '; Missing data:' in line:
                is_missing_data = True
                missing_data_val = line.split(':')[1].strip()
            line = fp.readline()

    meta = MetaDict()
    for hline in header:
        if hline == '; Format:\n' or hline == '; Column descriptions:\n':
            continue
        elif ('Created' in hline) or ('Source' in hline):
            meta[hline.split(':', 1)[0].replace(
                ';', ' ').strip()] = hline.split(':', 1)[1].strip()
        elif ':' in hline:
            meta[hline.split(':')[0].replace(
                ';', ' ').strip()] = hline.split(':')[1].strip()

    fieldnames_start = False
    for hline in header:
        if hline.startswith("; Format:"):
            fieldnames_start = False
        if fieldnames_start:
            fields.append(hline.split(":")[0].replace(';', ' ').strip())
        if hline.startswith("; Column descriptions:"):
            fieldnames_start = True

    # Next line is YYYY DOY MM DD
    date_parts = line.split(" ")
    year = int(date_parts[0])
    month = int(date_parts[2])
    day = int(date_parts[3])

    data = read_csv(filepath, delim_whitespace=True, names=fields,
                    comment=';', dtype={'HHMM': int})
    # First line is YYYY DOY MM DD
    data = data.iloc[1:, :]
    data['Hour'] = data['HHMM'] // 100
    data['Minute'] = data['HHMM'] % 100
    data = data.drop(['HHMM'], axis=1)
    data['Year'] = year
    data['Month'] = month
    data['Day'] = day

    datecols = ['Year', 'Month', 'Day', 'Hour', 'Minute']
    data['Time'] = to_datetime(data[datecols])
    data = data.set_index('Time')
    data = data.drop(datecols, axis=1)

    if is_missing_data:  # If missing data specified in header
        data[data == float(missing_data_val)] = np.nan

    # Add the units data
    units = OrderedDict([('XRS-B proxy', u.W / u.m**2),
                         ('XRS-A proxy', u.W / u.m**2),
                         ('SEM proxy', u.W / u.m**2),
                         ('0.1-7ESPquad', u.W / u.m**2),
                         ('17.1ESP', u.W / u.m**2),
                         ('25.7ESP', u.W / u.m**2),
                         ('30.4ESP', u.W / u.m**2),
                         ('36.6ESP', u.W / u.m**2),
                         ('darkESP', u.ct),
                         ('121.6MEGS-P', u.W / u.m**2),
                         ('darkMEGS-P', u.ct),
                         ('q0ESP', u.dimensionless_unscaled),
                         ('q1ESP', u.dimensionless_unscaled),
                         ('q2ESP', u.dimensionless_unscaled),
                         ('q3ESP', u.dimensionless_unscaled),
                         ('CMLat', u.deg),
                         ('CMLon', u.deg)])
    # Todo: check units used.
    return data, meta, units

def read_csv(self, *args, **kwds):
    kwds = kwds.copy()
    kwds['engine'] = 'python'
    return read_csv(*args, **kwds)

from pandas.io.parsers import read_csv
import numpy
import pandas
from numpy.random import seed
from numpy.random import rand
from numpy.random import random_integers

df = read_csv("F:/data/WHO.csv")

print("Dataframe", df)
print("Shape", df.shape)
print("Length", len(df))
print("Column Headers", df.columns)
print("Index", df.index)
print("Values", df.values)

country_col = df["Country"]
print("Type df", type(df))
print("Type country col", type(country_col))

print("Series shape", country_col.shape)
print("Series index", country_col.index)
print("Series values", country_col.values)
print("Series name", country_col.name)

print("Last 2 countries", country_col[-2:])
print("Last 2 countries type", type(country_col[-2:]))

last_col = df.columns[-1]
print("df signs\n", numpy.sign(df[last_col]))

df1 = pandas.DataFrame({
    'Weather': ['cold', 'hot', 'cold', 'hot'],
    'Food': ['soup', 'soup', 'icecream', 'chocolate'],
    'Price': 10 * rand(4),

def test_empty_csv_input(self):
    # GH14867
    df = read_csv(StringIO(), chunksize=20, header=None,
                  names=['a', 'b', 'c'])
    assert isinstance(df, TextFileReader)

import os

from pandas.io.parsers import read_csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import (Input, Dense, Convolution2D, MaxPooling2D,
                          Activation, Dropout, Flatten)
from keras.optimizers import RMSprop

FTRAIN = '/Users/tian/Documents/CNN/mnist/train.csv'
FTEST = '/Users/tian/Documents/CNN/mnist/test.csv'

train_df = read_csv(os.path.expanduser(FTRAIN))  # 42000, 785 # int64(785)
test_df = read_csv(os.path.expanduser(FTEST))    # 28000, 784 # int64(784)

X_train = train_df[train_df.columns[1:]].values
X_train = X_train.astype(np.float32) / 255
y_train = train_df[train_df.columns[0]]
y_train = y_train.astype("category")
X_test = test_df.values.astype(np.float32) / 255


def plot_samples(data=X_train, label=y_train, limit=41000):

def salaries_table(datapath):
    """DataFrame with the salaries dataset"""
    return read_csv(datapath("io", "parser", "data", "salaries.csv"),
                    sep="\t")

def test_read_dta9(self):
    expected = read_csv(self.csv9, parse_dates=True, sep='\t')
    parsed = self.read_dta(self.dta9)
    tm.assert_frame_equal(parsed, expected)

def carga_csv(file_name):
    """Load the given CSV file as a float NumPy array."""
    valores = read_csv(file_name, header=None).values
    return valores.astype(float)

def feature_fusion():
    # Name of train file
    trainName = '/SortedTrain.csv'
    # Path of a folder that contains all single feature categories
    savePath = COMBINED_PATH_CSV
    path = SAVED_PATH_CSV + 'train/'
    os.chdir(path)

    # Read the name of each folder
    featureCategoryFolders = io.get_files_in_directory(
        path, file_extension='csv')
    # featureCategoryFolders = featureCategoryFolders[1:]
    print 'Folders: ', featureCategoryFolders

    # Load train files
    featureCategoriesLen = len(featureCategoryFolders)
    class_lable = read_csv(TRAIN_ID_PATH, delimiter=',')
    new_idx = np.argsort(class_lable.ix[:, 0])
    class_lable = class_lable.ix[new_idx, 1]
    # print(class_lable[700:800])
    class_lable = class_lable.reset_index(drop=True)

    singleTrains = list()
    for k in range(featureCategoriesLen):
        dataSet = read_csv(featureCategoryFolders[k], delimiter=',')
        data = dataSet  # .ix[:, :]
        # data.astype(np.float)
        # print(class_lable[0:data.shape[0]-1])
        singleTrain = dataSetInformaion(
            featureCategoryFolders[k][
                featureCategoryFolders[k].rfind('/') + 1:-4],
            data, class_lable[0:data.shape[0]])
        singleTrains.append(singleTrain)

    remainingFeatureCategoriesIndices = np.ones(len(featureCategoryFolders))
    finalDataSets = list()

    # Main loop of combination
    for A in range(featureCategoriesLen):
        minimumLogLoss = 100
        minimumDataSet = None
        minimumIndex = -1
        processingDataSet = None

        # Main loop of single evaluation
        for k in range(featureCategoriesLen):
            nameOfDataSet = ''
            if remainingFeatureCategoriesIndices[k] == 0:
                continue
            if len(finalDataSets) != 0:
                # Join datasets
                dataSet1 = finalDataSets[len(finalDataSets) - 1].data
                # print dataSet1
                dataSet2 = singleTrains[k].data
                # print dataSet2
                result = pd.concat([dataSet1, dataSet2], axis=1,
                                   join='inner')
                processingDataSet = dataSetInformaion(
                    finalDataSets[len(finalDataSets) - 1].dataSetName +
                    '+' + singleTrains[k].dataSetName,
                    result, singleTrains[k].classLabel)
                # print result
                # classLabel = result.ix[:, -1]
                # data = result.ix[:, :-1]
                # nameOfDataSet =
            else:
                processingDataSet = singleTrains[k]
                # classLabel = singleTrains[k].data
                # data = singleTrains[k].classLabel
                # nameOfDataSet = featureCategoryFolders[k]

            accuracies = []
            logLosses = []

            # print 'DataSet', str(k), '=========================', \
            #     'cross validation result'
            # # For each fold in cross validation
            # rng = np.random.RandomState(31337)
            # kF = KFold(classLabel.shape[0], n_folds=2, shuffle=True,
            #            random_state=rng)
            # for trainIndex, testIndex in kF:
            #     trainKF = data.ix[trainIndex, :]
            #     trainID = classLabel.ix[trainIndex]
            #     xgbModel = xgb.XGBClassifier().fit(trainKF, trainID)
            #     actualLabels = classLabel.ix[testIndex]
            #
            #     predictProbability = xgbModel.predict_proba(
            #         data.ix[testIndex, :])
            #     logLoss = multiclass_log_loss(actualLabels,
            #                                   predictProbability)
            #     logLosses.append(logLoss)
            #
            #     predictedLabels = xgbModel.predict(data.ix[testIndex, :])
            #     # print(confusion_matrix(actualLabels, predictedLabels))
            #     acc = accuracy_score(actualLabels, predictedLabels)
            #     accuracies.append(acc)
            #
            # accuraciesMean = np.mean(accuracies)
            # loglossesMean = np.mean(logLosses)
            # print accuraciesMean
            # print loglossesMean
            #
            # FeatureCategoriesAccuracy.append(accuraciesMean)
            # FeatureCategoriesLogLoss.append(loglossesMean)

            print 'DataSet', str(processingDataSet.dataSetName), \
                '=========================', 'train result'
            # Train on the full data and evaluate in-sample
            xgbModel = xgb.XGBClassifier().fit(processingDataSet.data,
                                               processingDataSet.classLabel)
            predictProbability = xgbModel.predict_proba(
                processingDataSet.data)
            logLoss = multiclass_log_loss(processingDataSet.classLabel,
                                          predictProbability)
            # logLoss = float("%.3f" % logLoss)
            predictedLabels = xgbModel.predict(processingDataSet.data)
            acc = accuracy_score(processingDataSet.classLabel,
                                 predictedLabels)
            print acc
            print logLoss
            print processingDataSet.data.shape
            if logLoss < minimumLogLoss:
                minimumDataSet = processingDataSet
                minimumIndex = k
                minimumLogLoss = logLoss
            # featureCategoriesAccuracy.append(acc)
            # featureCategoriesLogLoss.append(logLoss)

        print 'Final Round ', A, '========================='
        # Minimum_Index = np.where(featureCategoriesLogLoss ==
        #                          np.min(featureCategoriesLogLoss))
        # print featureCategoriesLogLoss
        # finalDataSets.append(singleTrains[Minimum_Index[0]])
        finalDataSets.append(minimumDataSet)
        remainingFeatureCategoriesIndices[minimumIndex] = 0

    finalSets = [set.dataSetName for set in finalDataSets]
    print ','.join(finalSets)

    # Save the combined datasets
    os.chdir(savePath)
    for ds in finalDataSets:
        jointFile = pd.concat([ds.data, ds.classLabel], axis=1,
                              join='inner')
        if not os.path.exists(savePath + ds.dataSetName):
            os.makedirs(savePath + ds.dataSetName)
        jointFile.to_csv(COMBINED_PATH_CSV + ds.dataSetName +
                         '/NewTrain.csv', sep=',', index=False)

    try:
        del singleTrains, singleTrain, processingDataSet, jointFile, \
            finalSets, remainingFeatureCategoriesIndices, \
            featureCategoryFolders, dataSet, dataSet1, dataSet2, data, \
            class_lable
    except:
        pass

    print 'All combinations saved!!!'
    print 'Run cross-validation ...'

    featureCombinationsFinalAccuracy = 0
    featureCombinationsFinalDataSet = None
    featureCombinationsLogLossMin = 100
    for ds in finalDataSets:
        print 'DataSet', ds.dataSetName, '=========================', \
            'cross validation result'
        # For each fold in cross validation
        accuracies = []  # reset per dataset (the original leaked these
        logLosses = []   # lists from the combination loop above)
        rng = np.random.RandomState(31337)
        kF = KFold(ds.classLabel.shape[0], n_folds=5, shuffle=True,
                   random_state=rng)
        for trainIndex, testIndex in kF:
            trainKF = ds.data.ix[trainIndex, :]
            trainID = ds.classLabel.ix[trainIndex]
            xgbModel = xgb.XGBClassifier().fit(trainKF, trainID)
            actualLabels = ds.classLabel.ix[testIndex]

            predictProbability = xgbModel.predict_proba(
                ds.data.ix[testIndex, :])
            logLoss = multiclass_log_loss(actualLabels, predictProbability)
            logLosses.append(logLoss)

            predictedLabels = xgbModel.predict(ds.data.ix[testIndex, :])
            # print(confusion_matrix(actualLabels, predictedLabels))
            acc = accuracy_score(actualLabels, predictedLabels)
            accuracies.append(acc)

        accuraciesMean = np.mean(accuracies)
        loglossesMean = np.mean(logLosses)
        print accuraciesMean
        print loglossesMean
        if loglossesMean < featureCombinationsLogLossMin:
            featureCombinationsFinalDataSet = ds
            featureCombinationsFinalAccuracy = accuraciesMean
            featureCombinationsLogLossMin = loglossesMean

    print 'Final Result ---------------------------------'
    print featureCombinationsFinalDataSet.dataSetName, \
        featureCombinationsFinalAccuracy, featureCombinationsLogLossMin

"RIFLDIY01_N.B": 'Swap1Y', "RIFLDIY02_N.B": 'Swap2Y', "RIFLDIY03_N.B": 'Swap3Y', "RIFLDIY04_N.B": 'Swap4Y', "RIFLDIY05_N.B": 'Swap5Y', "RIFLDIY07_N.B": 'Swap7Y', "RIFLDIY10_N.B": 'Swap10Y', "RIFLDIY30_N.B": 'Swap30Y', "RILSPDEPM01_N.B": 'Libor1M', "RILSPDEPM03_N.B": 'Libor3M', "RILSPDEPM06_N.B": 'Libor6M' } # the data converter is applied to all columns # excluding the index column (0) dc_dict = {i: dataconverter for i in range(1, len(columns_dic) + 1)} df_libor = read_csv(fname, sep=',', header=0, index_col=0, parse_dates=True, converters=dc_dict, skiprows=[0, 1, 2, 3, 4]) df_libor = df_libor.rename(columns=columns_dic) good_rows = df_libor.apply(good_row, axis=1) df_libor_good = df_libor[good_rows] df_libor_good.to_pickle(os.path.join('..', 'data', 'df_libor.pkl'))
import pandas as pd
import pandas.io.parsers as pd_par
import numpy as np
import math
import copy
import QSTK.qstkutil.qsdateutil as du
import datetime as dt
import QSTK.qstkutil.DataAccess as da
import QSTK.qstkutil.tsutil as tsu

startCash = 50000
orderFile = "order_h4-6.csv"
valueFile = "value_h4-6.csv"

orderDF = pd_par.read_csv(orderFile, header=None)

# Getting the Symbols from the .csv file
ls_symbols = list(set(orderDF['X.4'].values))

# Need to sort the trades DF by increasing date
orderDF = orderDF.sort(['X.1', 'X.2', 'X.3'])

# Getting the start and end dates from the .csv file
dt_start = dt.datetime(
    orderDF.head(1)['X.1'], orderDF.head(1)['X.2'],
    orderDF.head(1)['X.3'])
dt_end = dt.datetime(
    orderDF.tail(1)['X.1'], orderDF.tail(1)['X.2'],
    orderDF.tail(1)['X.3'] + 1)

# Getting market data
dataobj = da.DataAccess('Yahoo', cachestalltime=0)
ls_keys = ['close', 'actual_close']
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt.timedelta(hours=16))

def summarize_topics(filenames, test, selection, dist, max_phrase_len,
                     min_phrase_count):
    """
    """

    state = read_csv(filenames[0], compression='gzip', skiprows=2,
                     usecols=[0, 4, 5], header=0,
                     names=['doc', 'word', 'topic'], sep=' ')
    state['word'] = state['word'].astype(str)

    topics = read_csv(filenames[1], sep='(?: |\t)', engine='python',
                      index_col=0, header=None,
                      names=(['alpha'] + [x for x in xrange(1, 202)]))

    if dist == 'average-posterior':
        topics['prob'] = zeros(len(topics))
        for _, df in state.groupby('doc'):
            topics['prob'] += (
                topics['alpha'].add(df.groupby('topic').size(),
                                    fill_value=0) /
                (topics['alpha'].sum() + len(df)))
        topics['prob'] /= state['doc'].nunique()
    elif dist == 'empirical':
        topics['prob'] = state.groupby('topic')['word'].count() / len(state)
    else:
        topics['prob'] = topics['alpha'] / topics['alpha'].sum()

    # assert topics['prob'].sum() >= 1-1e-15
    # assert topics['prob'].sum() <= 1+1e-15

    num_topics = len(topics)

    phrases = dict()

    # print >> sys.stderr, 'Creating candidate n-grams...'

    ngram = dict([(l, l * ['']) for l in xrange(1, max_phrase_len + 1)])
    doc = dict([(l, l * [-1]) for l in xrange(1, max_phrase_len + 1)])
    topic = dict([(l, l * [-1]) for l in xrange(1, max_phrase_len + 1)])

    counts = dict([(l, defaultdict(lambda: zeros(num_topics + 2,
                                                 dtype=int)))
                   for l in xrange(1, max_phrase_len + 1)])

    for _, row in state.iterrows():
        for l in xrange(1, max_phrase_len + 1):
            ngram[l] = ngram[l][1:] + [row['word']]
            doc[l] = doc[l][1:] + [row['doc']]
            topic[l] = topic[l][1:] + [row['topic']]
            if len(set(doc[l])) == 1:
                if len(set(topic[l])) == 1:
                    counts[l][tuple(ngram[l])][row['topic']] += 1
                    counts[l][tuple(ngram[l])][num_topics] += 1
                counts[l][tuple(ngram[l])][num_topics + 1] += 1

    for l in xrange(1, max_phrase_len + 1):
        ngrams = DataFrame.from_records(
            [[' '.join(x), ' '.join(x[:-1]), ' '.join(x[1:])] + y.tolist()
             for x, y in counts[l].items()],
            columns=(['ngram', 'prefix', 'suffix'] + range(num_topics) +
                     ['same', 'all']))
        counts[l] = ngrams

        # tmp = state.groupby('doc')['doc'].count()
        # tmp = (len(state) - tmp[tmp < l].sum() -
        #        len(tmp[tmp >= l]) * (l - 1))
        # assert ngrams['all'].sum() == tmp
        # assert (sum(ngrams[range(0, num_topics)].sum(axis=1) ==
        #             ngrams['same']) == len(ngrams))

        # print >> sys.stderr, 'Selecting %d-gram phrases...' % l

        if l == 1:
            phrases[l] = set(
                ngrams[ngrams['all'] >= min_phrase_count]['ngram'])
            continue

        n = ngrams['all'].sum()

        if test == bfu or test == bfc:
            alpha = 1.0
            alpha_sum = 4 * alpha
            beta = alpha_sum / n

        prefix_cache = ngrams.groupby('prefix')['all'].sum()
        suffix_cache = ngrams.groupby('suffix')['all'].sum()

        # assert prefix_cache.sum() == ngrams['all'].sum()
        # assert suffix_cache.sum() == ngrams['all'].sum()

        scores = len(ngrams) * [None]

        for idx, row in ngrams[ngrams['prefix'].isin(phrases[l - 1]) &
                               ngrams['suffix'].isin(phrases[l - 1]) &
                               (ngrams['all'] >=
                                min_phrase_count)].iterrows():
            a = row['all']
            a_plus_b = suffix_cache[row['suffix']]
            a_plus_c = prefix_cache[row['prefix']]
            b = a_plus_b - a
            c = a_plus_c - a
            d = n - a_plus_b - c

            args = [a, b, c, d, n, a_plus_b, a_plus_c]

            if test == bfu:
                args += [alpha, alpha_sum, beta]
            elif test == bfc:
                args += [alpha, alpha_sum]

            scores[idx] = test(*args)

        ngrams['score'] = scores

        if test == bfu or test == bfc:
            keep = ngrams['score'] <= (1.0 / 10)
        else:
            keep = ngrams['score'] > 10.83

        if selection == 'none':
            phrases[l] = set(ngrams[keep]['ngram'])
        else:
            if l == 2:
                phrases[l] = dict(ngrams[keep].set_index('ngram')['score'])
            else:
                m = 2 if selection == 'bigram' else l - 1
                if test == bfu or test == bfc:
                    tmp = set([k for k, v in phrases[m].items()
                               if v <= percentile(
                                   sorted(phrases[m].values(),
                                          reverse=True),
                                   (1.0 - 1.0 / 2**l) * 100)])
                else:
                    tmp = set([k for k, v in phrases[m].items()
                               if v >= percentile(
                                   sorted(phrases[m].values()),
                                   (1.0 - 1.0 / 2**l) * 100)])
                if selection == 'bigram':
                    keep &= Series([
                        all([' '.join(bigram) in tmp
                             for bigram in zip(words, words[1:])])
                        for words in [ngram.split()
                                      for ngram in ngrams['ngram']]])
                    phrases[l] = set(ngrams[keep]['ngram'])
                else:
                    keep &= (ngrams['prefix'].isin(tmp) &
                             ngrams['suffix'].isin(tmp))
                    phrases[l] = dict(
                        ngrams[keep].set_index('ngram')['score'])

        ngrams.drop(['prefix', 'suffix', 'score'], axis=1, inplace=True)

    if selection == 'bigram':
        phrases[2] = set(phrases[2].keys())
    elif selection == 'n-1-gram':
        for l in xrange(2, max_phrase_len + 1):
            phrases[l] = set(phrases[l].keys())

    scores = defaultdict(lambda: defaultdict(float))

    for l in xrange(1, max_phrase_len + 1):
        ngrams = counts[l]
        n = ngrams['same'].sum()
        ngrams['prob'] = ngrams['same'] / n
        for topic in xrange(num_topics):
            n_topic = ngrams[topic].sum()
            p_topic = topics['prob'][topic]
            p_not_topic = 1.0 - p_topic
            for _, row in ngrams[(ngrams['ngram'].isin(phrases[l])) &
                                 (ngrams[topic] > 0)].iterrows():
                p_phrase = row['prob']
                p_topic_g_phrase = row[topic] / row['same']
                p_topic_g_not_phrase = ((n_topic - row[topic]) /
                                        (n - row['same']))
                p_not_phrase = 1.0 - p_phrase
                p_not_topic_g_phrase = 1.0 - p_topic_g_phrase
                p_not_topic_g_not_phrase = 1.0 - p_topic_g_not_phrase
                a = 0.0
                if p_topic_g_phrase != 0.0:
                    a += (p_topic_g_phrase *
                          (log2(p_topic_g_phrase) - log2(p_topic)))
                if p_not_topic_g_phrase != 0.0:
                    a += (p_not_topic_g_phrase *
                          (log2(p_not_topic_g_phrase) -
                           log2(p_not_topic)))
                b = 0.0
                if p_topic_g_not_phrase != 0.0:
                    b += (p_topic_g_not_phrase *
                          (log2(p_topic_g_not_phrase) - log2(p_topic)))
                if p_not_topic_g_not_phrase != 0.0:
                    b += (p_not_topic_g_not_phrase *
                          (log2(p_not_topic_g_not_phrase) -
                           log2(p_not_topic)))
                scores[topic][row['ngram']] = (p_phrase * a +
                                               p_not_phrase * b)

    for topic, row in topics.iterrows():
        print '---Topic %d---' % (topic)
        print '\n'.join(['%s\t%f' % (x, y) for x, y in
                         sorted(scores[topic].items(),
                                key=(lambda x: x[1]),
                                reverse=True)]) + '\n'

    return