def test_to_csv_compression(self, compression):
    s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
               name='X')

    with ensure_clean() as filename:
        s.to_csv(filename, compression=compression, header=True)

        # test the round trip - to_csv -> read_csv
        rs = pd.read_csv(filename, compression=compression,
                         index_col=0, squeeze=True)
        assert_series_equal(s, rs)

        # explicitly ensure file was compressed
        f = tm.decompress_file(filename, compression=compression)
        text = f.read().decode('utf8')
        assert s.name in text
        f.close()

        f = tm.decompress_file(filename, compression=compression)
        assert_series_equal(s, pd.read_csv(f, index_col=0, squeeze=True))
        f.close()
def get_id(sec):
    '''Scrape the internal HTML ID for the film'''
    global sec_to_id
    if sec_to_id.empty:
        try:
            sec_to_id = Series.from_csv('hsx_security_to_id', header=0)
        except:
            print('Security -> ID table not found, making a new one')
            with open('hsx_security_to_id', 'w') as f:
                f.write('security,id\n')
            sec_to_id = Series.from_csv('hsx_security_to_id', header=0)
    if sec not in sec_to_id:
        r = requests.get('http://www.hsx.com/security/view/{}'.format(sec))
        # extract the numeric security id from the page's inline scripts
        soup = BeautifulSoup(r.text, 'html.parser')
        try:
            script = soup.findAll('script')[4].text.split('\n')
            sec_id = script[3].split('=')[2]
            sec_id = sec_id.split('"')[0]
        except:
            print("Cannot find id for {}".format(sec))
            return -1
        sec_to_id[sec] = int(sec_id)
        sec_to_id.to_csv('hsx_security_to_id', header=['id'],
                         index_label='security')
    return sec_to_id[sec]
def fastq_length_plot(fastq, plotname, writename): ''' fastq length plot ''' dict_length = defaultdict(int) with open(fastq) as handle: for record in SeqIO.parse(handle, "fastq"): l = len(record.seq) dict_length[l] += 1 for i in range(0, 150): if i in dict_length.keys(): pass else: dict_length[i] = 0 df = Series(dict_length) df = df.sort_index() df.to_csv(writename, header=False, sep="\t", float_format="%.0f") df.plot(kind="bar", color="#990000", fontsize=15, width=1) plt.xlim(-1, 61) plt.xticks(range(0, 61, 10), ("0", "10", "20", "30", "40", "50", "60"), rotation=0) plt.savefig(plotname, bbox_inches='tight') plt.close()
def setup(): global count_matrix global translations x = 0 if isfile(setup_status_file): x = int(open(setup_status_file).read()) if x < 1: sys.stdout.write('Creating Database') create_db() sys.stdout.write('\rCreating Movie Table') create_table_movies() seed_table_movies() with open(setup_status_file, "w") as out: out.write('1') if x < 2: sys.stdout.write('\rLoading Data') data = read_csv(movie_data_file + '.csv', delimiter=',') sys.stdout.write('\rFormatting Data') data['soup'] = data.apply(create_soup, axis=1) sys.stdout.write('\rCalculating Counts') count_matrix = CountVectorizer().fit_transform(data['soup']) save_npz(dataframe_file, count_matrix) sys.stdout.write('\rCalculating Indices ') id_to_index = Series(data.index, index=data['id']) id_to_index.to_csv(translations_file, encoding='utf-8', header=True) sys.stdout.write('\r ') with open(setup_status_file, "w") as out: out.write('2') count_matrix = load_npz(dataframe_file) translations = read_csv(translations_file) sys.stdout.write('\rSetup Complete\n')
def generate(cls, intents: list, dir_path: str): path = Path(dir_path) cls.wordDict.reset() intents_df = cls.intents.generateIntentDataFrame(intents) classes_s = cls.intents.generateIntentClasses(intents) intents_df.to_csv(str(path / 'intents_df.csv'), index=False) keys = cls.intents.getKeys() for value in intents_df[keys['patterns']].values: cls.wordDict.load(value.split(' ')) wordDict_df = cls.wordDict.getDataFrame() wordDict_df.to_csv(path / 'wordDict_df.csv', index=False) transformedWordDict_s = Encoder.transformSeries(wordDict_df['words'], as_is=False, set_series=True) transformedWordDict_s.to_csv(path / 'transformedWordDict_s.csv', index=False) train_x_df = Encoder.encode(intents_df['patterns'], transformedWordDict_s) train_x_df.to_csv(path / 'train_x_df.csv', index=False) train_y_df = Encoder.oneHotEncode(intents_df['tag']) train_y_df.to_csv(path / 'train_y_df.csv', index=False) classes_s = Series(train_y_df.columns) classes_s.to_csv(path / 'classes_s.csv', index=False)
def disaster_message_tf_idf(max_workers): input = File_manager('preprocessed', 'disasterMessage') tf_idf = File_manager('analyzed', 'disasterMessageTFIDF') idf = File_manager('analyzed', 'disasterMessageIDF') cmpr = idf.compare_version(input.ver) new_ver = input.ver.copy() if new_ver['disasterMessage'] == '0': return if idf.ver[cmpr[0]] == new_ver['disasterMessage'] and len(cmpr) == 1: return new_ver['disasterMessageTFIDF'] = new_ver['disasterMessage'] del new_ver['disasterMessage'] tf_idf.update_version(new_ver) new_ver['disasterMessageIDF'] = new_ver['disasterMessageTFIDF'] del new_ver['disasterMessageTFIDF'] idf.update_version(new_ver) preprocessed_data = read_csv(input.path) docs = preprocessed_data['tokens'] tfidfv = TfidfVectorizer( lowercase=False, token_pattern=r'(?u)[^┃]+?(?=┃|$)' ).fit(docs) vocabs = sorted(tfidfv.vocabulary_, key=tfidfv.vocabulary_.get) tf_idf_data = DataFrame(tfidfv.transform(docs).toarray(), columns=vocabs) idf_data = Series(tfidfv.idf_, index=vocabs).sort_values() tf_idf_data.to_csv(tf_idf.path, index=False) idf_data.to_csv(idf.path, header=False)
def main():
    if os.path.isdir(SRC):
        from pandas import Series
        from natsort import natsorted
        img_list = get_image_paths(SRC, FTYPE)
        fname_luma_list = get_all_lumas(img_list)
        sorted_by_fname = natsorted(fname_luma_list, key=lambda x: x[0])
        if STRIPPATHS is True:
            fl_series = Series(
                {os.path.basename(f): l for f, l in sorted_by_fname})
        else:
            fl_series = Series({f: l for f, l in sorted_by_fname})
        fl_series.to_csv(DST, header=['average_luminance'],
                         index_label='filename')
    elif os.path.isfile(SRC):
        print(average_luma(SRC))
    else:
        print("Specify correct arguments")
        sys.exit()
def main():
    try:
        path = "/home/longbai/commentdata/UserInfo.txt"
        usercolumns = ['userid', 'useremail', 'anonymous', 'avatar']
        userdfreader = pd.read_csv(path, header=None, sep='#V_V#A_A#',
                                   iterator=True, encoding='utf-8',
                                   names=usercolumns, engine='python')
        # userdf = pd.read_csv(path, header=None, sep='#V_V#A_A#', encoding='utf-8', names=usercolumns, engine='python')
        goon = True
        c = Series()
        total = 0
        while goon:
            try:
                infochunk = userdfreader.get_chunk(1000000)
                total += 1
                if not infochunk.empty:
                    emailseries = infochunk['useremail'].value_counts()
                    repeatemail = emailseries[emailseries > 1]
                    if not repeatemail.empty:
                        # combine counts across chunks; plain `+` would leave
                        # NaN for emails missing from either side
                        c = c.add(repeatemail, fill_value=0)
                        print('repeat email length is {}, total={}'.format(
                            len(c), total))
            except StopIteration:
                print('..........over............')
                goon = False
        print('.....save file start.........')
        c.to_csv('useremail.csv', encoding='utf-8', mode='a')
        print('.....save file over.........')
    except Exception as e:
        print(e)
def save_sets(X_train: pd.DataFrame, X_val: pd.DataFrame, y_train: pd.Series,
              y_val: pd.Series, X_test: pd.DataFrame, location,
              suffix: str = '', file_type: str = 'csv') -> None:
    """Persist the train/validation/test splits under `location`."""
    # Cater for empty suffix and not having a dangling underscore
    if suffix:
        suffix = f'_{suffix}'
    if file_type == 'csv':
        X_train.to_csv(location / f'X_train{suffix}.{file_type}', index=False)
        X_val.to_csv(location / f'X_val{suffix}.{file_type}', index=False)
        y_train.to_csv(location / f'y_train{suffix}.{file_type}', index=False)
        y_val.to_csv(location / f'y_val{suffix}.{file_type}', index=False)
        X_test.to_csv(location / f'X_test{suffix}.{file_type}', index=False)
    elif file_type == 'parquet':
        X_train.to_parquet(location / f'X_train{suffix}.{file_type}', index=False)
        X_val.to_parquet(location / f'X_val{suffix}.{file_type}', index=False)
        # Series has no to_parquet, so write the targets as one-column frames
        y_train.to_frame().to_parquet(location / f'y_train{suffix}.{file_type}', index=False)
        y_val.to_frame().to_parquet(location / f'y_val{suffix}.{file_type}', index=False)
        X_test.to_parquet(location / f'X_test{suffix}.{file_type}', index=False)
def create_vocab(trainqa_path, answerset_path, vocab_path): """Create the 4000 vocabulary based on questions in train split. 3999 most frequent words and 1 <UNK>. Args: trainqa_path: path to train_qa.json. vocab_path: vocabulary file. """ vocab = dict() train_qa = pd.read_json(trainqa_path) # remove question whose answer is not in answerset answerset = pd.read_csv(answerset_path, header=None)[0] train_qa = train_qa[train_qa['answer'].isin(answerset)] questions = train_qa['question'].values for q in questions: words = q.rstrip('?').split() for word in words: if len(word) >= 2: vocab[word] = vocab.get(word, 0) + 1 vocab = Series(vocab) vocab.sort_values(ascending=False, inplace=True) vocab = DataFrame(vocab.iloc[0:3999]) vocab.loc['<UNK>'] = [0] vocab.to_csv(vocab_path, columns=[], header=False)
def saveDictionaryToFile(my_dict, file_name):
    '''This function will write the values of a dictionary into a csv,
    BUT it will also append the mean value as the last row'''
    data = Series(my_dict, index=my_dict.keys())
    mean_value = data.mean()
    data['AVG'] = mean_value
    data.sort_index(axis=0, inplace=True)
    data.to_csv(file_name)
def count_objects(filename):
    with open(f'in/{filename}.geojson', "r", encoding="utf-8") as f:
        dct = json.load(f)
    s = Series(map(lambda x: x['geometry']['type'],
                   dct['features'])).value_counts()
    path_out = f"out/object_count.csv"
    s.to_csv(path_out)
    return path_out
def save_preprocessed(
    df: pd.DataFrame, target: pd.Series, dataset_in_path: str, target_in_path: str
) -> None:
    dataset_out_path = get_output_path(dataset_in_path)
    target_out_path = get_output_path(target_in_path)

    df.to_csv(dataset_out_path, index=False)
    target.to_csv(target_out_path, index=False)
def test_to_csv_float_format(self):
    with ensure_clean() as filename:
        ser = Series([0.123456, 0.234567, 0.567567])
        ser.to_csv(filename, float_format='%.2f')

        rs = Series.from_csv(filename)
        xp = Series([0.12, 0.23, 0.57])
        assert_series_equal(rs, xp)
def detect(data, args): in_file = data['r2_path'] out_prefix = data['sample_id'] out_file = out_prefix + "_polyA.dat.gz" out_name_false = out_prefix + "_none.dat.gz" counts = Counter() num_line = 0 logger.my_logger.info("reading file %s" % in_file) logger.my_logger.info("creating files %s %s" % (out_file, out_name_false)) data['detect'] = out_file if os.path.exists(out_file): return data with file_transaction(out_file) as tx_out_file: with open_fastq(in_file) as handle, gzip.open(tx_out_file, 'w') as out, gzip.open(out_name_false, 'w') as out_false: for line in handle: #print line num_line += 1 if num_line % 1000000 == 0: logger.my_logger.info("read %s lines:" % num_line) if line.startswith("@HISEQ"): #print line name = line.strip() seq = handle.next().strip() handle.next().strip() qual = handle.next().strip() find = _adapter(seq, qual) #print "%s %s" % (seq, find) if find: seq, qual = find ns = poly_A_percentage(seq) #ns = polyA(seq) if ns: if ns[1]-ns[0] >= 6: #print "positions are" + str(ns[0]) + ".." + str(ns[1]) mod = seq[:ns[0]] seq_polyA = seq[ns[0]:ns[1]] seq_gene = seq[ns[1]:] qual_polyA = qual[ns[0]:ns[1]] qual_gene = qual[ns[1]:] #print "%s\t%s\t%s\t%s\t%s\t%s\n" % (name,mod,sf,qf) out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (name, ns[0], ns[1], mod, seq_polyA, qual_polyA, seq_gene, qual_gene)) counts['polyA'] += 1 if len(mod) > 0: counts['mod'] += 1 else: counts['shortA'] += 1 out_false.write("%s\t%s\t%s\t%s\n" % ("shortA", name, seq, qual)) else: counts['noA'] += 1 out_false.write("%s\t%s\t%s\t%s\n" % ("None", name, seq, qual)) else: out_false.write("%s\t%s\t%s\t%s\n" % ("No_tag", name, seq, qual)) counts['notag'] += 1 with file_transaction(out_prefix + ".stat") as tx_stat_file: df = Series(counts) df.to_csv(tx_stat_file, sep="\t") logger.my_logger.info("%s" % counts) return data
def predict_lightgbm_model(
    model,
    test_x: pd.DataFrame,
    test_id: pd.Series,
):
    test_pred = model.predict(test_x)
    # test_id is a Series, so build a two-column frame (id column plus the
    # predictions) instead of assigning a "column" to the Series itself
    submission = test_id.to_frame()
    submission["SalePrice"] = test_pred
    submission.to_csv("submission.csv", index=False)
    return submission
def test_to_csv_float_format(self):
    with tm.ensure_clean() as filename:
        ser = Series([0.123456, 0.234567, 0.567567])
        ser.to_csv(filename, float_format="%.2f", header=False)

        rs = self.read_csv(filename)
        xp = Series([0.12, 0.23, 0.57])
        tm.assert_series_equal(rs, xp)
def test_to_csv_unicode_index(self):
    buf = StringIO()
    s = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")])

    s.to_csv(buf, encoding="UTF-8")
    buf.seek(0)

    s2 = self.read_csv(buf, index_col=0, encoding="UTF-8")
    assert_series_equal(s, s2)
def test_to_csv_float_format(self):
    with ensure_clean() as filename:
        ser = Series([0.123456, 0.234567, 0.567567])
        ser.to_csv(filename, float_format="%.2f")

        rs = self.read_csv(filename)
        xp = Series([0.12, 0.23, 0.57])
        assert_series_equal(rs, xp)
def test_to_csv_unicode_index(self):
    buf = StringIO()
    s = Series(["\u05d0", "d2"], index=["\u05d0", "\u05d1"])

    s.to_csv(buf, encoding="UTF-8", header=False)
    buf.seek(0)

    s2 = self.read_csv(buf, index_col=0, encoding="UTF-8")
    tm.assert_series_equal(s, s2)
def load_ipo_info():
    """Download per-stock IPO data from NetEase Finance (网易财经)."""
    cfg = ConfigParser()
    cfg.read('config.ini')
    ipo_info_url = cfg.get('ipo_info', 'ipo_info_url')
    db_path = Path(cfg.get('factor_db', 'db_path'), cfg.get('ipo_info', 'db_path'))
    # Read the codes of all listed stocks
    # data_api = DataApi(addr='tcp://data.tushare.org:8910')
    # data_api.login('13811931480', 'eyJhbGciOiJIUzI1NiJ9.eyJjcmVhdGVfdGltZSI6IjE1MTI4Nzk0NTI2MjkiLCJpc3MiOiJhdXRoMCIsImlkIjoiMTM4MTE5MzE0ODAifQ.I0SXsA1bK--fbGu0B5Is2xdKOjALAeWBJRX6GdVmUL8')
    # df_stock_basics, msg = data_api.query(view='jz.instrumentInfo',
    #                                       fields='status,list_date,name,market',
    #                                       filter='inst_type=1&status=&market=SH,SZ&symbol=',
    #                                       data_format='pandas')
    # if msg != '0,':
    #     print('Failed to read the market stock codes.')
    #     return
    # df_stock_basics.symbol = df_stock_basics.symbol.map(lambda x: x.split('.')[0])
    df_stock_basics = Utils.get_stock_basics(all=True)
    # Iterate over the stocks and download their IPO info
    df_ipo_info = DataFrame()
    for _, stock_info in df_stock_basics.iterrows():
        # Skip stocks whose IPO data already exists
        if db_path.joinpath('%s.csv' % stock_info.symbol).exists():
            continue
        print('Downloading IPO data for %s.' % stock_info.symbol)
        ipo_info_header = []
        ipo_info_data = []
        secu_code = Utils.code_to_symbol(stock_info.symbol)
        url = ipo_info_url % stock_info.symbol[2:]
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'html.parser')
        tags = soup.find_all(name='h2')
        for tag in tags:
            if tag.get_text().strip() == 'IPO资料':
                ipo_table = tag.find_next(name='table')
                for tr in ipo_table.find_all(name='tr'):
                    tds = tr.find_all(name='td')
                    name = tds[0].get_text().replace(' ', '').replace(
                        '\n', '').replace('\r', '')
                    value = tds[1].get_text().replace(' ', '').replace(
                        ',', '').replace('\n', '').replace('\r', '')
                    ipo_info_header.append(name)
                    ipo_info_data.append(value)
                ipo_info = Series(ipo_info_data, index=ipo_info_header)
                ipo_info['代码'] = secu_code
                ipo_info.to_csv(db_path.joinpath('%s.csv' % secu_code))
                df_ipo_info = df_ipo_info.append(ipo_info, ignore_index=True)
                break
    if not df_ipo_info.empty:
        df_ipo_info.to_csv(db_path.joinpath('ipo_info.csv'), index=False,
                           mode='a', header=False)
def _to_csv(data: pandas.Series, filename: str) -> None:
    LOGGER.info('Writing %s.', filename)
    step = 0.00000001
    data = pandas.DataFrame({
        'Second': numpy.arange(0.0, step * len(data), step)[:len(data)],
        'Volt': data,
    })
    data.to_csv(filename, index=False)
def _updateSettingsFile(self, new_settings):
    if isfile(self.settings_file):
        old_settings = Series.from_csv(self.settings_file)
        for setting, value in new_settings.iteritems():
            if type(value) == list:
                # represent lists as csv encapsulated in quotes
                value = ','.join(map(str, value))
            old_settings[setting] = value
    else:
        old_settings = Series(new_settings)
    old_settings.to_csv(self.settings_file)
def write_dict(dict_, key_name, value_name, file_path):
    series = Series(dict_, name=value_name)
    series.index.name = key_name
    if not file_path.endswith(".tsv"):
        file_path += ".tsv"
    series.to_csv(file_path, sep="\t")
def saveMultipleDictionaryToFile(all_dicts, file_name, names):
    '''This function will write the values of several dictionaries into a csv
    (one column per dictionary, named via `names`), BUT it will also append
    the mean value as the last row'''
    columns = {}
    for name, my_dict in zip(names, all_dicts):
        data = Series(my_dict, index=my_dict.keys())
        data['AVG'] = data.mean()
        columns[name] = data
    DataFrame(columns).sort_index(axis=0).to_csv(file_name)
def _write_results(self, res_df: pd.Series, sim_func, pred_score, lambda_param):
    sim_func = SIMILARITY_DICT.get(sim_func, sim_func)
    dir_path = dp.ensure_dir(
        f'{self.output_dir}/raw/{sim_func}/{self.predictor}/predictions/')
    file_name = f'predictions-{pred_score}+lambda+{lambda_param}'
    res_df.to_csv(path_or_buf=f'{dir_path}/{file_name}', index=True, sep=' ',
                  float_format='%f', header=False)
def test_logit_regression(results): # lib_path = os.popen("pwd").read()[:-1] + "/lib" # sys.path.append(lib_path) test_data['Survived'] = 1.223 print test_data compared_results = ka.predict(test_data, results, 'Logit') # Use your model to make prediction on our test set. print compared_results compared_results = Series(compared_results) # convert our model to a series for easy output compared_results.to_csv("logitregres.csv")
def test_to_csv_interval_index(self):
    # GH 28210
    s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3))

    with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path:
        s.to_csv(path, header=False)
        result = self.read_csv(path, index_col=0, squeeze=True)

        # can't roundtrip intervalindex via read_csv
        # so check string repr (GH 23595)
        expected = s.copy()
        expected.index = expected.index.astype(str)

        tm.assert_series_equal(result, expected)
def messages_data(soup,message_csv): messages = scrape_element(soup, 'messages', '.Message') msg_lengths = [] pd.set_option('display.max_colwidth', -1) for k, v in messages.items(): msg_lengths.append(len(v)) text = Series(str(np.array(v.encode('utf-8')))) print text text.to_csv(message_csv, sep=',', header=False, index=False, mode='a') df_msg_lgth = DataFrame(msg_lengths) df_msg_describe = DataFrame(df_msg_lgth.describe()).T cols = df_msg_describe.columns df_msg_describe.columns = ['msg_' + c for c in cols] return df_msg_describe
def save_cell_labels(cell_labels: pd.Series, fpath: str, sep: str = '\t') -> None:
    """Save cell labels to plain-text file."""
    if sep == '\t':
        delimited_str = 'tab-delimited'
    elif sep == ',':
        delimited_str = 'comma-delimited'
    else:
        delimited_str = '"%s"-delimited' % sep
    cell_labels.to_csv(fpath, sep=sep)
    _LOGGER.info('Saved labels for %d cells to %s plain-text file.',
                 cell_labels.size, delimited_str)
def write_dict(dict_, filepath, key_name, value_name):
    """
    Write dictionary as 2 column table.
    :param dict_: dict;
    :param filepath: str;
    :param key_name: str;
    :param value_name: str;
    :return: None
    """
    s = Series(dict_)
    s.index.name = key_name
    s.name = value_name
    s.to_csv(filepath, sep='\t')
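# A minimal usage sketch for the helper above (hypothetical inputs): writes
# 'letter_counts.tsv' as a two-column, tab-separated table with "letter" as
# the key column and "count" as the value column.
write_dict({'a': 3, 'b': 1}, 'letter_counts.tsv', 'letter', 'count')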
def sperm_RNA(bowtie_out_combined, prefix): ''' get tsRNA, rsRNA, piRNA and profile ''' dic_miR = defaultdict(int) dic_miR_iso = defaultdict(int) dic_tsRNA = defaultdict(int) dic_rsRNA = defaultdict(int) dic_piR = defaultdict(int) dic_piR_cluster = defaultdict(int) with open(bowtie_out_combined) as handle: for line in handle: seg = line.split() count = int(seg[0].split("-")[1]) ### miRNA if re.search("miRNA", seg[2]): pass #mir = seg[2].split("|")[0] #dic_miR[mir]+=count ### no 5p isoform #if seg[3] == "0": # dic_miR_iso[mir]+=count ### tsRNA elif re.search("tsRNA", seg[2]): tsR = seg[4] + "|" + seg[2] dic_tsRNA[tsR] += count ### rsRNA elif re.search("rsRNA", seg[2]): rsR = seg[4] + "|" + seg[2] dic_rsRNA[rsR] += count ### piRNA elif re.search("piRNA", seg[2]): dic_piR[seg[4]] += count pir_c = seg[2].split("|")[0] dic_piR_cluster[pir_c] += count ### tsRNA #dic_tsRNA = dict(sorted(dic_tsRNA.items(), key=lambda d:d[1], reverse=True)) tsRNA_out = prefix + ".tsRNA_counts.txt" tsRNA_series = Series(dic_tsRNA) tsRNA_series.to_csv(tsRNA_out, header=False, sep='\t') ### rsRNA rsRNA_out = prefix + ".rsRNA_counts.txt" rsRNA_series = Series(dic_rsRNA) rsRNA_series.to_csv(rsRNA_out, header=False, sep='\t') ### piRNA piRNA_out = prefix + ".piRNA_seq.txt" piRNA_series = Series(dic_piR) piRNA_series.to_csv(piRNA_out, header=False, sep='\t') piRNA_out2 = prefix + ".piRNA_cluster.txt" piRNA_series2 = Series(dic_piR_cluster) piRNA_series2.to_csv(piRNA_out2, header=False, sep='\t')
def test_to_csv_path_is_none(self):
    # GH 8215
    # Series.to_csv() was returning None, inconsistent with
    # DataFrame.to_csv() which returned string
    s = Series([1, 2, 3])
    csv_str = s.to_csv(path=None)
    assert isinstance(csv_str, str)
def test_to_csv_path_is_none(self):
    # GH 8215
    # Series.to_csv() was returning None, inconsistent with
    # DataFrame.to_csv() which returned string
    s = Series([1, 2, 3])
    csv_str = s.to_csv(path_or_buf=None, header=False)
    assert isinstance(csv_str, str)
def test_to_csv_from_csv_categorical(self):
    # CSV with categoricals should result in the same output as when one
    # would add a "normal" Series/DataFrame.
    s = Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
    s2 = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
    res = StringIO()
    s.to_csv(res)
    exp = StringIO()
    s2.to_csv(exp)
    self.assertEqual(res.getvalue(), exp.getvalue())

    df = DataFrame({"s": s})
    df2 = DataFrame({"s": s2})
    res = StringIO()
    df.to_csv(res)
    exp = StringIO()
    df2.to_csv(exp)
    self.assertEqual(res.getvalue(), exp.getvalue())
def test_to_csv_compression(self, compression):
    s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
               name='X')

    with ensure_clean() as filename:
        s.to_csv(filename, compression=compression, header=True)

        # test the round trip - to_csv -> read_csv
        rs = pd.read_csv(filename, compression=compression,
                         index_col=0, squeeze=True)
        assert_series_equal(s, rs)

        # explicitly ensure file was compressed
        with tm.decompress_file(filename, compression=compression) as fh:
            text = fh.read().decode('utf8')
            assert s.name in text

        with tm.decompress_file(filename, compression=compression) as fh:
            assert_series_equal(s, pd.read_csv(fh, index_col=0,
                                               squeeze=True))
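# An editor's sketch, not part of the original test: on newer pandas the
# `squeeze` keyword of read_csv is gone (deprecated in 1.4, removed in 2.0),
# so the equivalent round trip squeezes the one-column frame explicitly.
# The file name below is hypothetical.
import pandas as pd
s = pd.Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X')
s.to_csv('example.csv.gz', compression='gzip', header=True)
rs = pd.read_csv('example.csv.gz', compression='gzip', index_col=0).squeeze('columns')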
def test_to_csv_from_csv_categorical(self):
    # CSV with categoricals should result in the same output
    # as when one would add a "normal" Series/DataFrame.
    s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
    s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
    res = StringIO()
    s.to_csv(res, header=False)
    exp = StringIO()
    s2.to_csv(exp, header=False)
    assert res.getvalue() == exp.getvalue()

    df = DataFrame({"s": s})
    df2 = DataFrame({"s": s2})
    res = StringIO()
    df.to_csv(res)
    exp = StringIO()
    df2.to_csv(exp)
    assert res.getvalue() == exp.getvalue()
else: tf.ix[fila,word] = tf.ix[fila,word] + 1 tf.ix[fila] = tf.ix[fila] / len(tokens) fila = fila + 1 print "Fila: ", fila #print tf print "TF MATRIX LISTO" idf = Series() #print idf.index for term in termslist.keys(): apariciones = termslist[term] totaldoc = data.shape[0] argumento = totaldoc / (1 + apariciones) #print argumento test = Series({term : math.log(argumento)}) idf = idf.add(test, fill_value=0) #print idf print "IDF LISTO" gc.collect() for i, row in tf.iterrows(): print i tf.ix[i] = row.multiply(idf) #gc.collect() #print tf #print idf #tfidf = tf.apply(lambda x: x.multiply(idf), axis = 1) #print tfidf tf.to_csv('tfidf.csv') idf.to_csv('idf.csv')
} grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=1, scoring='roc_auc', cv=3) grid_search.fit(scaled_X_train, y_train) print 'Best score: %.3f'%grid_search.best_score_ print 'Best parameters set:' best_parameters = grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print '\t%s: %r' %(param_name, best_parameters[param_name]) predictions = grid_search.predict(scaled_X_test) print classification_report(y_test, predictions) for param_name in parameters.keys(): clf_args[param_name[5:]] = best_parameters[param_name] print 'clf_args:', clf_args final_scaler = preprocessing.StandardScaler() scaled_final_train_df = final_scaler.fit_transform(final_train_df) scaled_final_test_df = final_scaler.transform(final_test_df) classifier = RandomForestClassifier(**clf_args) classifier.fit(scaled_final_train_df, final_targets_df) output = classifier.predict_proba(scaled_final_test_df) output_probabilities = [round(x[1], 3) for x in output] S = Series(output_probabilities, index=Ids) S.to_csv('Santander_randomForest_results.csv', header=True, index_label= ['ID', 'TARGET'])
# -*- coding: utf-8 -*- import numpy as np import pandas as pd from pandas import DataFrame, Series import json import requests ranking_url = 'https://itunes.apple.com/jp/rss/topfreeapplications/limit=100/json' raw_ranking = requests.get(ranking_url).json() ranking = [{'id': entry['id']['attributes']['im:id'], 'name': entry['im:name']}for entry in raw_ranking['feed']['entry']] data = [] for element in ranking: review_url = 'https://itunes.apple.com/jp/rss/customerreviews/id={0}/json'.format(element['id']) raw_reviews = requests.get(review_url).json() try: reviews = [review['content']['label'] for review in raw_reviews['feed']['entry'][1:50]] for review in reviews: data.append({'id': element['id'], 'name': element['name'], 'review': review.encode('utf-8')}) except KeyError: continue data = Series(data) data.to_csv('reviews.csv')
import numpy as np  # needed for the dtype casts and arange below
import pandas as pd
from pandas import Series, DataFrame
from sklearn.neighbors import KNeighborsClassifier

train_file = pd.read_csv('train.csv')
test_file = pd.read_csv('test.csv')
train_df = DataFrame(train_file)
test_df = DataFrame(test_file)

# make separate data frame for digits and take out of training set
target_df = train_df.label
train_df = train_df.drop(['label'], axis=1)

train_data = train_df.values.astype(np.uint8)
target_data = target_df.values.astype(np.uint8)
test_data = test_df.values.astype(np.uint8)

n_neighbors, weights = 20, 'distance'
clf = KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(train_data, target_data)
print('Starting k-neighbors...')
output = clf.predict(test_data)

ImageIds = np.arange(1, 28001)
S = Series(output, index=ImageIds, dtype=np.uint8)
S.to_csv('kNeighbors_results.csv', header=True,
         index_label=['ImageId', 'Label'])
print(data.to_csv(sys.stdout, na_rep='NULL'))
print('\n')
print(data.to_csv(sys.stdout, index=False, header=False))
print('\n')
print(data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c']))
print('\n')

dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('data/tseries.csv')
print('\n')

print(Series.from_csv('data/tseries.csv', parse_dates=True))
print('\n')
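# Editor's note (an assumption about newer pandas, not in the original
# script): Series.from_csv was deprecated in 0.21 and removed in 1.0; the
# equivalent read with the current API is
ts2 = pd.read_csv('data/tseries.csv', index_col=0, parse_dates=True,
                  header=None).squeeze('columns')
print(ts2)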
def _1(data: pd.Series) -> AlphaDiversityFormat:
    ff = AlphaDiversityFormat()
    with ff.open() as fh:
        data.to_csv(fh, sep='\t', header=True)
    return ff
# show that predicted survival probabilities are # inversely correlated with Age fig.add_subplot(224, axisbg="#DBDBDB") plt.scatter(res.predict(),x.Age , alpha=a) plt.grid(True, linewidth=0.15) plt.title("The Change of Survival Probability by Age") plt.xlabel("Predicted chance of survival") plt.ylabel("Age") ################################ ## Part 4: RUN ON TEST SET ## ################################ test_data = pd.read_csv("data/test.csv") # "Add our independent variable to our test data. (It's usually left # blank by Kaggle because it's the value you're trying to predict.)" # 1.23 is just a random value, it could have been anything test_data['Survived'] = 1.23 # use model to make prediction on test set compared_results = ka.predict(test_data, results, 'Logit') # convert model to Series for easy output compared_results = Series(compared_results) # output and submit to Kaggle compared_results.to_csv("data/output/logitregres.csv")
######################################################################## parsed=parse(urlopen('http://nymag.com/daily/intelligencer/2013/04/bloombergs-vip-terminal-tweeters.html')) doc=parsed.getroot() links=doc.findall('.//a') links[15:20] lnk=links[28] lnk lnk.get('href') lnk.text_content() urls=[lnk.get('href') for lnk in doc.findall('.//a')] temp=Series(urls[103:205]) for i in range(0,len(temp)): temp[i]=temp[i].replace('//www.twitter.com/','') temp.to_csv("nymag_tweets.csv") ######################################################################## ######################################################################## parsed=parse(urlopen('http://www.businessinsider.com/the-best-finance-people-on-twitter-2012-4?op=1')) doc=parsed.getroot() links=doc.findall('.//a') links[15:20] lnk=links[28] lnk lnk.get('href') lnk.text_content() urls=[lnk.get('href') for lnk in doc.findall('.//a')] str_url='https://twitter.com/#!/'
# gender and class (again it will be gender + 1 as was in GenderClass # we also take family size + 1 in order to distinguish men and women that are alone) train_df['GenderFamilySize'] = (train_df.Gender + 1) * (train_df.FamilySize + 1) test_df['GenderFamilySize'] = (test_df.Gender + 1) * (test_df.FamilySize + 1) # passenger 1044 is a 3rd class male 60.5yrs old who embarked at Southampton # and he is missing his fare value, from data the mean fare for 3rd class # man embarking at Southampton is 13.307149 test_df.loc[ (test_df.Fare.isnull()), 'Fare'] = 13.307149 train_df = train_df.drop(['SibSp', 'Parch', 'Embarked', 'GenderFamilySize'], axis=1) test_df = test_df.drop(['SibSp', 'Parch', 'Embarked', 'GenderFamilySize'], axis=1) # We are left with columns for 'Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Gender', 'AgeFill', 'GenderClass', 'FamilySize', 'GenderFamilySize' train_data = train_df.values test_data = test_df.values # begin the random forest forest = RandomForestClassifier(n_estimators = 100) forest = forest.fit(train_data[0::,1::], train_data[0::, 0]) output = forest.predict(test_data) PassengerIds = np.arange(892, 1310) S = Series(output, index=PassengerIds, dtype=int) S.to_csv('titanic_results.csv', header=True, index_label=['PassengerId','Survived'])
           linewidth=3)
plt.ylabel('Density of Prox1 in Hilus', size=20)
plt.xticks(size=20, rotation=0)
plt.yticks(size=14, rotation=0)

# Density Bar Graph SW and C57
DensityTableSW_C57 = Series([Density[Dictionary['C57_p30']].mean(), Density[Dictionary['SW_p30']].mean(), Density[Dictionary['C57']].mean(), Density[Dictionary['SW']].mean()],
                            index=['C57 P30', 'SW P30', 'C57 P60', 'SW'])
SW_p30_Error = Density[Dictionary['SW_p30']].std()/sqrt(Density[Dictionary['SW_p30']].count())
SW_Error = Density[Dictionary['SW']].std()/sqrt(Density[Dictionary['SW']].count())
plt.figure()
DensityTableSW_C57.plot(kind='bar', yerr=[C57_p30_Error, SW_p30_Error, C57_Error, SW_Error], color='y')
plt.ylabel('Density of Prox1 in Hilus')
plt.xticks(rotation=0)
DensityTableSW_C57.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure4 Strains\New folder\C57_SW_Density.csv')

## Plotting Hilus area Bar Graph
Table_Area_SW_C57 = Series([HilusAreaSum[Dictionary['C57_p16']].mean(), HilusAreaSum[Dictionary['C57_p30']].mean(), HilusAreaSum[Dictionary['SW_p30']].mean(), HilusAreaSum[Dictionary['C57']].mean(), HilusAreaSum[Dictionary['SW']].mean()],
                           index=['C57 P16', 'C57 P30', 'SW P30', 'C57 P60', 'SW'])
Table_Area_SW_C57_STD = Series([HilusAreaSum[Dictionary['C57_p16']].std(), HilusAreaSum[Dictionary['C57_p30']].std(), HilusAreaSum[Dictionary['SW_p30']].std(), HilusAreaSum[Dictionary['C57']].std(), HilusAreaSum[Dictionary['SW']].std()],
                               index=['C57 P16', 'C57 P30', 'SW P30', 'C57 P60', 'SW'])
Table_Area_SW_C57_Error = Table_Area_SW_C57_STD/SQRT
plt.figure()
Table_Area_SW_C57.plot(kind='bar', yerr=Table_Area_SW_C57_Error, color='y')
plt.ylabel('Hilus Area')
plt.xticks(rotation=0)

## Plotting Total Cells Bar Graph
Table_Cells_SW_C57 = Series([TotalCellsSum[Dictionary['C57_p16']].mean(), TotalCellsSum[Dictionary['C57_p30']].mean(), TotalCellsSum[Dictionary['SW_p30']].mean(), TotalCellsSum[Dictionary['C57']].mean(), TotalCellsSum[Dictionary['SW']].mean()],
                            index=['C57 P16', 'C57 P30', 'SW P30', 'C57 P60', 'SW'])
Table_Cells_SW_C57_STD = Series([TotalCellsSum[Dictionary['C57_p16']].std(), TotalCellsSum[Dictionary['C57_p30']].std(), TotalCellsSum[Dictionary['SW_p30']].std(), TotalCellsSum[Dictionary['C57']].std(), TotalCellsSum[Dictionary['SW']].std()],
                                index=['C57 P16', 'C57 P30', 'SW P30', 'C57 P60', 'SW'])
# Specify the number of rows to read
df10 = pd.read_csv('resources/ex5.csv', nrows=10)
# print df10

# Write to csv: missing values written as NaN, row labels omitted,
# column labels used as the header
df7.to_csv('resources/write1.csv', sep=',', na_rep='Nan', index=False)
# Write only the specified columns, in the specified order
df7.to_csv('resources/write1.csv', sep=',', na_rep='Nan', index=False,
           columns=['b', 'c', 'a'])

# Reading and writing a Series
dates1 = pd.date_range('1/1/2000', '1/1/2016')
s = Series(dates1, index=np.arange(dates1.size))
# print s
s.to_csv('resources/write2.csv', sep=',')
s1 = Series.from_csv('resources/write2.csv')
# print s1

# csv
f = open('resources/write1.csv')
reader = csv.reader(f)
lines = list(reader)
header, values = lines[0], lines[1:]
data_dic = {k: v for k, v in zip(header, zip(*values))}
# print data_dic
# Embarked s3fa_col = (dfn.Pclass == 3).mul(dfn.Sex == 'female').mul(dfn.Embarked == 'S').mul(dfn.Title > 0) s3fa_fn = lambda x: 0.5 if x else -0.5 s3fa_col = s3fa_col.map(s3fa_fn) s3fa_col.name = 'S3FA' dfne = pd.concat([dfn, s3fa_col], axis=1) # Result cols = ['C0', 'C1', 'C2', 'Gender', 'Title', 'S3FA'] return dfne[cols] df = pd.read_csv('data/train.csv') mdf = munge(df) X = mdf y = df['Survived'] tuned_parameters = {'penalty': ['l1', 'l2'], 'C': np.logspace(-2, 0, 5), 'max_iter': np.logspace(2, 3, 5)} clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=5, n_jobs=4) clf.fit(X, y) print(clf.best_estimator_) for params, mean_score, scores in clf.grid_scores_: print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)) test_df = pd.read_csv('data/test.csv') res = clf.predict(munge(test_df)) res = Series(res, name='Survived', index=test_df.index) res = pd.concat([test_df, res], axis=1)[['PassengerId', 'Survived']] res.to_csv('data/out-1-lr.csv', index=False)
return (filepath, trace) def sim_busy_times(trace, cpus, interval): data = {num_cores: trace.cpu.simultaneously_busy_time(num_cores, cpus=list(cpus), interval=INTERVAL) for num_cores in xrange(len(cpus)+1)} total_duration = trace.duration if not INTERVAL else INTERVAL.duration return Series(data=data.values(), index=data.keys(), name=trace.filename) / total_duration _files = glob.glob(r'{path}\*{file_ext}'.format(path=PATH, file_ext=FILE_EXT)) F_DICT = {_fp: os.path.split(_fp)[1].split('.')[0] for _fp in _files} little_idle_dict = defaultdict(list) big_idle_dict = defaultdict(list) for _file in _files: fp, trace = parse_file(_file) for cpu in ALL_CPUS: for item in trace.cpu.lpm_intervals(cpu=cpu, interval=INTERVAL): if item.cpu in BIG_CPUS: big_idle_dict[item.state].append(item.interval.duration) elif item.cpu in LITTLE_CPUS: little_idle_dict[item.state].append(item.interval.duration) for k, v in little_idle_dict.iteritems(): results = Series(v)*1e6 results.to_csv(r'{path}\LITTLE_C{idx}.csv'.format(path=PATH, idx=k)) for k, v in big_idle_dict.iteritems(): results = Series(v)*1e6 results.to_csv(r'{path}\BIG_C{idx}.csv'.format(path=PATH, idx=k))
if __name__ == "__main__": train_dir = '/Users/ray/Downloads/trainResized' test_dir = '/Users/ray/Downloads/testResized' train_labels_filepath = '/Users/ray/Downloads/trainLabels.csv' # download the features per image (train) train_features = generate_features(train_dir) # download the labels train_labels = generate_labels(train_labels_filepath) # merge the features with labels train_data = pd.merge(left=train_labels, right=train_features, left_on='ID', right_index=True) # train the model forest_classifier = RandomForestClassifier(n_estimators=100) training_input = train_data.ix[:, 2:].values target_values = train_data['Class'].apply(lambda x: ord(x)).values forest_model = forest_classifier.fit(training_input, target_values) # download the features per image (test) test_features = generate_features(test_dir) # predict the test test_labels_raw = forest_model.predict(test_features) test_labels = Series(test_labels_raw, index=test_features.index).apply(lambda x: chr(x)) # build output for the test test_labels.name = 'Class' test_labels.to_csv('/Users/ray/Downloads/result.csv', index_label='Id', header=True)
# 'xgb__learning_rate': (0.01, 0.03, 0.05), # 'xgb__colsample_bytree': (0.8, 0.85) # } # # grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=1, scoring='roc_auc', cv=3) # grid_search.fit(scaled_X_train, y_train) # print 'Best score: %.3f'%grid_search.best_score_ # print 'Best parameters set:' # best_parameters = grid_search.best_estimator_.get_params() # for param_name in sorted(parameters.keys()): # print '\t%s: %r' %(param_name, best_parameters[param_name]) # # predictions = grid_search.predict(scaled_X_test) # print classification_report(y_test, predictions) # # for param_name in parameters.keys(): # xgb_args[param_name[5:]] = best_parameters[param_name] # # print 'xgb_args:', xgb_args final_scaler = preprocessing.StandardScaler() scaled_final_train_df = final_scaler.fit_transform(final_train_df) scaled_final_test_df = final_scaler.transform(final_test_df) classifier = XGBClassifier(**xgb_args) classifier.fit(scaled_final_train_df, final_targets_df) output = classifier.predict_proba(scaled_final_test_df)[:,1] S = Series(output, index=Ids) S.to_csv('Santander_xgboost_results_1.csv', header=True, index_label=['ID', 'TARGET'])
title_col.name = "Title" dfn = pd.concat([df, title_col], axis=1) # Embarked s3fa_col = (dfn.Pclass == 3).mul(dfn.Sex == "female").mul(dfn.Embarked == "S").mul(dfn.Title > 0) s3fa_fn = lambda x: 0.5 if x else -0.5 s3fa_col = s3fa_col.map(s3fa_fn) s3fa_col.name = "S3FA" dfne = pd.concat([dfn, s3fa_col], axis=1) # Result cols = ["C0", "C1", "C2", "Gender", "Title", "S3FA"] return dfne[cols] df = pd.read_csv("../input/train.csv") mdf = munge(df) X = mdf y = df["Survived"] tuned_parameters = {"penalty": ["l1", "l2"], "C": np.logspace(-2, 0, 5), "max_iter": np.logspace(2, 3, 5)} clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=5, n_jobs=4) clf.fit(X, y) print(clf.best_estimator_) for params, mean_score, scores in clf.grid_scores_: print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)) test_df = pd.read_csv("../input/test.csv") res = clf.predict(munge(test_df)) res = Series(res, name="Survived", index=test_df.index) res = pd.concat([test_df, res], axis=1)[["PassengerId", "Survived"]] res.to_csv("out-1-lr.csv", index=False)
if not os.path.isfile("coverages.csv") : print "compute coverages" if 'coverage' in locals() : del coverage handle = open(file, "rU") for record in tqdm(SeqIO.parse(handle, "fasta")) : seq = str(record.seq) l = len(seq) if 'coverage' not in locals(): coverage = [0]*l for (i,c) in enumerate(seq): if c not in ['.','-']: coverage[i] = coverage[i] +1 coverage=Series(coverage) coverage.to_csv("coverages.csv",index=False) handle.close() else : print "import coverages" coverage = Series.from_csv("coverages.csv",header=-1, index_col=False) print "compute median-ish things" medians = [] means = [] maxs = [] mins = [] lens = [] left = [] right = [] unsure = [] handle = open(file, "rU")
def main(): out_dir = os.path.dirname(__file__) ex1_path = study.DATA_DIR + '/ch06/ex1.csv' cat(ex1_path) df = pd.read_csv(ex1_path) p(df) p(pd.read_table(ex1_path, sep=',')) p('header less---------------------') ex2_path = study.DATA_DIR + '/ch06/ex2.csv' cat(ex2_path) names = ['a','b', 'c', 'd', 'message'] p(pd.read_csv(ex2_path, header=None)) p(pd.read_csv(ex2_path, names=names)) p(pd.read_csv(ex2_path, names=names, index_col='message')) p('hierarchy index---------------------') mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv' cat(mindex_path) p(pd.read_csv(mindex_path, index_col=['key1', 'key2'])) p('separate by regex-------------') ex3_path = study.DATA_DIR + '/ch06/ex3.csv' cat(ex3_path) p(pd.read_csv(ex3_path, sep='\s+')) p('skip rows-----------') ex4_path = study.DATA_DIR + '/ch06/ex4.csv' cat(ex4_path) p(pd.read_csv(ex4_path, skiprows=[0,2,3])) p('N/A------------------') ex5_path = study.DATA_DIR + '/ch06/ex5.csv' cat(ex5_path) result = pd.read_csv(ex5_path) p(result) p(pd.isnull(result)) result = pd.read_csv(ex5_path, na_values=['NULL', '12']) # 12 is NA p(result) p('N/A dict------------------') sentinels = {'message': ['foo', 'NA'], 'something': ['two']} p(sentinels) p(pd.read_csv(ex5_path, na_values=sentinels)) p('6.1.1 read data chunk size---------------------') ex6_path = study.DATA_DIR + '/ch06/ex6.csv' p(pd.read_csv(ex6_path).count()) p(pd.read_csv(ex6_path, nrows=5)) chunker = pd.read_csv(ex6_path, chunksize=1000) p(chunker) tot = Series([]) for piece in chunker: tot = tot.add(piece['key'].value_counts(), fill_value=0) tot.order(ascending=False) p(tot[:10]) p('6.1.2 write---------------------') data = pd.read_csv(ex5_path) p(data) ex5_out_path = out_dir + '/ex5_out.csv' data.to_csv(ex5_out_path) cat(ex5_path) data.to_csv(sys.stdout, index=False, header=False) print '' data.to_csv(sys.stdout, index=False, cols=list('abc')) print '' p('Series--------------') tseries_out_path = out_dir + '/tseries_out.csv' dates = pd.date_range('1/1/2000', periods=7) ts = Series(np.arange(7), index=dates) ts.to_csv(tseries_out_path) cat(tseries_out_path) p(Series.from_csv(tseries_out_path, parse_dates=True)) p('6.1.3 csv-------------------------') ex7_path = study.DATA_DIR + '/ch06/ex7.csv' cat(ex7_path) f = open(ex7_path) reader = csv.reader(f) for line in reader: print line lines = list(csv.reader(open(ex7_path))) header, values = lines[0], lines[1:] data_dict = {h: v for h,v in zip(header, zip(*values))} p(data_dict) my_data_out_path = out_dir + '/mydata.csv' with open(my_data_out_path, 'w') as fp: writer = csv.writer(fp, dialect=my_dialect) writer.writerow(('one', 'two', 'three')) writer.writerow(('1', '2', '3')) writer.writerow(('4', '5', '6')) writer.writerow(('7', '8', '9')) cat(my_data_out_path) p('6.1.4 JSON-------------------------') obj = """ {"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Katie", "age": 33, "pet": "Cisco"}] } """ result = json.loads(obj) p(result) asjson = json.dumps(result) p(asjson) siblings = DataFrame(result['siblings'], columns=['name', 'age']) p(siblings) p('6.1.4 XML/HTML Web Scraping-------------------------') url = '' #'http://finance.yahoo.com/q/op?s=AAPL+Options' if not url is '': parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options')) doc = parsed.getroot() p([lnk.get('href') for lnk in doc.findall('.//a')][-10:]) tables = doc.findall('.//table') p(parse_options_data(tables[9])[:5]) p(parse_options_data(tables[13])[:5]) p('6.1.5 
Read XML-------------------------') xml_path = out_dir + '/Performance_MNR.xml' xml_content =""" <INDICATOR> <INDICATOR_SEQ>373889</INDICATOR_SEQ> <PARENT_SEQ></PARENT_SEQ> <AGENCY_NAME>MEtro-North Railroad</AGENCY_NAME> <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME> <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION> <PERIOD_YEAR>2011</PERIOD_YEAR> <PERIOD_MONTH>12</PERIOD_MONTH> <CATEGORY>Service Indicators</CATEGORY> <FREQUENCY>M</FREQUENCY> <DESIRED_CHANGE>U</DESIRED_CHANGE> <INDICATOR_UNIT>%</INDICATOR_UNIT> <DECIMAL_PLACES>1</DECIMAL_PLACES> <YTD_TARGET>97.00</YTD_TARGET> <YTD_ACTUAL></YTD_ACTUAL> <MONTHLY_TARGET>97.00</MONTHLY_TARGET> <MONTHLY_ACTUAL></MONTHLY_ACTUAL> </INDICATOR> """ if not os.path.exists(xml_path): with open(xml_path, 'w') as f: f.write(xml_content) parsed = objectify.parse(open(xml_path)) root = parsed.getroot() data = [] skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_SEQ', 'DECIMAL_PLACES'] p(dir(root)) for elt in root: # .INDICATOR: el_data = {} for child in elt.getchildren(): if child.tag in skip_fields: continue el_data[child.tag] = child.pyval data.append(el_data) perf = DataFrame(data) p(perf) tag = '<a href="http://google.com">Google</a>' root = objectify.parse(StringIO.StringIO(tag)).getroot() p(root) p(root.get('href')) p(root.text)
"silent": 1, "thread": 1, "seed": 1301 } num_boost_round = 1000 print("Train a XGBoost model") X_train, X_valid = train_test_split(train, test_size=0.01, random_state=10) y_train = np.log1p(X_train.Sales) y_valid = np.log1p(X_valid.Sales) dtrain = xgb.DMatrix(X_train[features], y_train) dvalid = xgb.DMatrix(X_valid[features], y_valid) watchlist = [(dtrain, 'train'), (dvalid, 'eval')] gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True) print("Validating") predict = gbm.predict(xgb.DMatrix(X_valid[features])) error = rmspe(X_valid.Sales.values, np.expm1(predict)) print('RMSPE: {:.6f}'.format(error)) print("Make predictions on the test set") dtest = xgb.DMatrix(test[features]) ytest = gbm.predict(dtest) sub = Series() sub = sub.append(Series(np.expm1(ytest), index = test.Id)) sub = sub.append(Series(0, index = closedId)) # Make Submission sub = pd.DataFrame({"Id": sub.index, "Sales": sub.values}) sub.to_csv("xgboost_submission2.csv", index=False)