def _check_extension_sheets(self, ext):
    """Round-trip self.frame and self.tsframe through one workbook with two
    sheets (extension ``ext``), then verify sheet contents and names."""
    tmp_path = '__tmp_to_excel_from_excel_sheets__.' + ext
    self.frame['A'][:5] = nan
    # exercise the single-sheet writer with a few option combinations
    for kwargs in ({}, {'cols': ['A', 'B']}, {'header': False},
                   {'index': False}):
        self.frame.to_excel(tmp_path, 'test1', **kwargs)
    # Test writing to separate sheets
    xl_writer = ExcelWriter(tmp_path)
    self.frame.to_excel(xl_writer, 'test1')
    self.tsframe.to_excel(xl_writer, 'test2')
    xl_writer.save()
    # read both sheets back and compare against the originals
    xl_file = ExcelFile(tmp_path)
    for sheet, expected in (('test1', self.frame), ('test2', self.tsframe)):
        roundtrip = xl_file.parse(sheet, index_col=0)
        tm.assert_frame_equal(expected, roundtrip)
    # the workbook must hold exactly the two sheets, in insertion order
    np.testing.assert_equal(2, len(xl_file.sheet_names))
    np.testing.assert_equal('test1', xl_file.sheet_names[0])
    np.testing.assert_equal('test2', xl_file.sheet_names[1])
    os.remove(tmp_path)
def _check_extension_sheets(self, ext):
    """Round-trip self.frame and self.tsframe through a two-sheet workbook
    with extension ``ext``, then verify sheet contents and names."""
    path = "__tmp_to_excel_from_excel_sheets__." + ext
    # introduce some NaNs so missing-value handling is exercised
    self.frame["A"][:5] = nan
    # smoke-test the single-sheet writer with several option combinations
    self.frame.to_excel(path, "test1")
    self.frame.to_excel(path, "test1", cols=["A", "B"])
    self.frame.to_excel(path, "test1", header=False)
    self.frame.to_excel(path, "test1", index=False)
    # Test writing to separate sheets
    writer = ExcelWriter(path)
    self.frame.to_excel(writer, "test1")
    self.tsframe.to_excel(writer, "test2")
    writer.save()
    # read both sheets back and compare with the originals
    reader = ExcelFile(path)
    recons = reader.parse("test1", index_col=0)
    tm.assert_frame_equal(self.frame, recons)
    recons = reader.parse("test2", index_col=0)
    tm.assert_frame_equal(self.tsframe, recons)
    # the workbook must contain exactly the two sheets, in order
    np.testing.assert_equal(2, len(reader.sheet_names))
    np.testing.assert_equal("test1", reader.sheet_names[0])
    np.testing.assert_equal("test2", reader.sheet_names[1])
    os.remove(path)
def _check_extension_sheets(self, ext):
    """Round-trip self.frame and self.tsframe through a two-sheet workbook
    with extension ``ext``, then verify sheet contents and names."""
    path = '__tmp_to_excel_from_excel_sheets__.' + ext
    # introduce some NaNs so missing-value handling is exercised
    self.frame['A'][:5] = nan
    # smoke-test the single-sheet writer with several option combinations
    self.frame.to_excel(path,'test1')
    self.frame.to_excel(path,'test1', cols=['A', 'B'])
    self.frame.to_excel(path,'test1', header=False)
    self.frame.to_excel(path,'test1', index=False)
    # Test writing to separate sheets
    writer = ExcelWriter(path)
    self.frame.to_excel(writer,'test1')
    self.tsframe.to_excel(writer,'test2')
    writer.save()
    # read both sheets back and compare with the originals
    reader = ExcelFile(path)
    recons = reader.parse('test1',index_col=0)
    tm.assert_frame_equal(self.frame, recons)
    recons = reader.parse('test2',index_col=0)
    tm.assert_frame_equal(self.tsframe, recons)
    # the workbook must contain exactly the two sheets, in order
    np.testing.assert_equal(2, len(reader.sheet_names))
    np.testing.assert_equal('test1', reader.sheet_names[0])
    np.testing.assert_equal('test2', reader.sheet_names[1])
    os.remove(path)
def export_to(self, file_path, batchsize=100): self.xls_writer = ExcelWriter(file_path) # get record count record_count = self._query_mongo(count=True) # query in batches and for each batch create an XLSDataFrameWriter and # write to existing xls_writer object start = 0 header = True while start < record_count: cursor = self._query_mongo(self.filter_query, start=start, limit=batchsize) data = self._format_for_dataframe(cursor) # write all cursor's data to their respective sheets for section_name, section in self.sections.iteritems(): records = data[section_name] # TODO: currently ignoring nested repeats so ignore sections that have 0 records if len(records) > 0: columns = section["columns"] + self.EXTRA_COLUMNS writer = XLSDataFrameWriter(records, columns) writer.write_to_excel(self.xls_writer, section_name, header=header, index=False) header = False # increment counter(s) start += batchsize self.xls_writer.save()
def export_to(self, file_path, batchsize=1000):
    """Stream the form's records into an Excel workbook at ``file_path``.

    Data is fetched from mongo in slices of ``batchsize``; every section is
    written to its own sheet through one shared ExcelWriter instance.
    """
    self.xls_writer = ExcelWriter(file_path)
    record_count = self._query_mongo(count=True)
    offset = 0
    header = True
    while offset < record_count:
        batch_cursor = self._query_mongo(self.filter_query, start=offset,
                                         limit=batchsize)
        batch_data = self._format_for_dataframe(batch_cursor)
        for sheet_name, section in self.sections.iteritems():
            rows = batch_data[sheet_name]
            # nested repeats are not handled yet, so some sections are empty
            if len(rows) == 0:
                continue
            cols = section["columns"]
            # rewrite slash-separated column paths with the configured
            # group delimiter when it differs from the default
            if self.group_delimiter != DEFAULT_GROUP_DELIMITER:
                cols = [self.group_delimiter.join(c.split("/"))
                        for c in cols]
            XLSDataFrameWriter(rows, cols + self.EXTRA_COLUMNS).write_to_excel(
                self.xls_writer, sheet_name, header=header, index=False)
            header = False
        offset += batchsize
        # short pause between batches
        time.sleep(0.1)
    self.xls_writer.save()
def export_to(self, file_path):
    """Write every section's records to its own sheet in ``file_path``."""
    self.xls_writer = ExcelWriter(file_path)
    # fetch everything in a single query; batching is deferred — see TODOs
    # TODO: batching will not work as expected since indexes are calculated
    # based on the current batch; a new batch would re-calculate indexes and,
    # going into the same excel file, produce duplicates. Possible solution:
    # keep track of the last index from each section.
    # TODO: for every repeat, the index should be re-calculated
    cursor = self._query_mongo(self.filter_query)
    data = self._format_for_dataframe(cursor)
    for section in self.sections:
        sheet = section["name"]
        rows = data[sheet]
        # TODO: nested repeat data currently yields no records; skip those
        if len(rows) == 0:
            continue
        frame_writer = XLSDataFrameWriter(
            rows, section["columns"] + self.EXTRA_COLUMNS)
        frame_writer.write_to_excel(self.xls_writer, sheet,
                                    header=True, index=False)
    self.xls_writer.save()
def to_excel(self, path, na_rep=''):
    """
    Write each DataFrame in Panel to a separate excel sheet

    Parameters
    ----------
    path : string
        File path to write the workbook to; a new ExcelWriter is always
        created for it
    na_rep : string, default ''
        Missing data representation
    """
    from pandas.io.parsers import ExcelWriter
    writer = ExcelWriter(path)
    # one sheet per item, named after the item's string form
    for item, df in self.iteritems():
        name = str(item)
        df.to_excel(writer, name, na_rep=na_rep)
    writer.save()
def to_excel(self, path, na_rep=''):
    """
    Write each DataFrame held by this Panel to its own sheet of an
    Excel workbook.

    Parameters
    ----------
    path : string
        Destination file path; a new ExcelWriter is created for it.
    na_rep : string, default ''
        Missing data representation
    """
    from pandas.io.parsers import ExcelWriter
    writer = ExcelWriter(path)
    # sheet names come from the stringified panel items
    for item, frame in self.iteritems():
        frame.to_excel(writer, str(item), na_rep=na_rep)
    writer.save()
def export_to(self, file_path, batchsize=1000): self.xls_writer = ExcelWriter(file_path) # get record count record_count = self._query_mongo(count=True) # query in batches and for each batch create an XLSDataFrameWriter and # write to existing xls_writer object start = 0 header = True while start < record_count: cursor = self._query_mongo(self.filter_query, start=start, limit=batchsize) data = self._format_for_dataframe(cursor) # write all cursor's data to their respective sheets for section_name, section in self.sections.iteritems(): records = data[section_name] # TODO: currently ignoring nested repeats # so ignore sections that have 0 records if len(records) > 0: # use a different group delimiter if needed columns = section["columns"] if self.group_delimiter != DEFAULT_GROUP_DELIMITER: columns = [ self.group_delimiter.join(col.split("/")) for col in columns ] columns = columns + self.EXTRA_COLUMNS writer = XLSDataFrameWriter(records, columns) writer.write_to_excel(self.xls_writer, section_name, header=header, index=False) header = False # increment counter(s) start += batchsize time.sleep(0.1) self.xls_writer.save()
# big = big.drop('AnnStaticRet', 1) # big = big.drop('AnnCapitalRet', 1) # big['AnnStaticRet'] = new_ind.AnnStaticRet.values # big['AnnCapitalRet'] = new_ind.AnnCapitalRet.values today_str = str(str(month) + str(day) + str(year)) big = big.rename(columns={'Last': 'OptionPrice', 'industry': 'Industry'}) xlsx = '.xlsx' csv = '.csv' file_name = 'All_covered_call' + today_str sectors = big.Sector.unique().astype(str) name_xl = file_name + xlsx writer = ExcelWriter(name_xl) big.to_excel(writer, sheet_name='All Sectors') summary = big.groupby(['Sector', 'Industry']).mean() summary.to_excel(writer, sheet_name='Sector Summary') for i in sectors: to_save = big[big.Sector == i] name = i.replace('/', '-') to_save.to_excel(writer, sheet_name=name) writer.save() name_cs = file_name + csv big.to_csv(name_cs)
class XLSDataFrameBuilder(AbstractDataFrameBuilder):
    """
    Generate structures from mongo and DataDictionary for a DataFrameXLSWriter

    This builder can choose to query the data in batches and write to a single
    ExcelWriter object using multiple instances of DataFrameXLSWriter
    """
    # bookkeeping columns appended to every sheet so repeat rows can be
    # joined back to their parent row
    INDEX_COLUMN = u"_index"
    PARENT_TABLE_NAME_COLUMN = u"_parent_table_name"
    PARENT_INDEX_COLUMN = u"_parent_index"
    EXTRA_COLUMNS = [INDEX_COLUMN, PARENT_TABLE_NAME_COLUMN,
                     PARENT_INDEX_COLUMN]
    SHEET_NAME_MAX_CHARS = 30
    # presumably the xls (Excel 97) format caps enforced by the writer —
    # TODO confirm exact limits
    XLS_SHEET_COUNT_LIMIT = 255
    XLS_COLUMN_COUNT_MAX = 255
    # per-section key that tracks the running row index
    CURRENT_INDEX_META = 'current_index'

    def __init__(self, username, id_string, filter_query=None):
        super(XLSDataFrameBuilder, self).__init__(username, id_string,
                                                  filter_query)

    def _setup(self):
        """Extend base setup by computing the per-sheet section layout."""
        super(XLSDataFrameBuilder, self)._setup()
        # need to split columns, with repeats in individual sheets and
        # everything else on the default sheet
        self._generate_sections()

    def export_to(self, file_path, batchsize=100):
        """Export all sections to an xls workbook at ``file_path``, querying
        mongo in slices of ``batchsize``."""
        self.xls_writer = ExcelWriter(file_path)
        # get record count
        record_count = self._query_mongo(count=True)
        # query in batches and for each batch create an XLSDataFrameWriter and
        # write to existing xls_writer object
        start = 0
        header = True
        while start < record_count:
            cursor = self._query_mongo(self.filter_query, start=start,
                                       limit=batchsize)
            data = self._format_for_dataframe(cursor)
            # write all cursor's data to their respective sheets
            for section_name, section in self.sections.iteritems():
                records = data[section_name]
                # TODO: currently ignoring nested repeats so ignore sections
                # that have 0 records
                if len(records) > 0:
                    columns = section["columns"] + self.EXTRA_COLUMNS
                    writer = XLSDataFrameWriter(records, columns)
                    writer.write_to_excel(self.xls_writer, section_name,
                                          header=header, index=False)
                    # NOTE(review): header goes False after the first
                    # non-empty section — later sheets get no header row;
                    # confirm this is intended
                    header = False
            # increment counter(s)
            start += batchsize
        self.xls_writer.save()

    def _format_for_dataframe(self, cursor):
        """
        Format each record for consumption by a dataframe

        returns a dictionary with the key being the name of the sheet,
        and values a list of dicts to feed into a DataFrame
        """
        data = dict((section_name, [])
                    for section_name in self.sections.keys())
        default_section = self.sections[self.survey_name]
        default_columns = default_section["columns"]
        for record in cursor:
            # from record, we'll end up with multiple records, one for each
            # section we have
            # add records for the default section
            self._add_data_for_section(data[self.survey_name], record,
                                       default_columns, self.survey_name)
            parent_index = default_section[self.CURRENT_INDEX_META]
            for sheet_name, section in self.sections.iteritems():
                # skip default section i.e survey name
                if sheet_name != self.survey_name:
                    xpath = section["xpath"]
                    columns = section["columns"]
                    # TODO: handle nested repeats -ignoring nested repeats for
                    # now which will not be in the top level record, perhaps
                    # nest sections as well so we can recurs in and get them
                    if record.has_key(xpath):
                        repeat_records = record[xpath]
                        # NOTE(review): num_repeat_records is never used
                        num_repeat_records = len(repeat_records)
                        for repeat_record in repeat_records:
                            self._add_data_for_section(data[sheet_name],
                                                       repeat_record, columns,
                                                       sheet_name,
                                                       parent_index,
                                                       self.survey_name)
        return data

    def _add_data_for_section(self, data_section, record, columns,
                              section_name, parent_index = -1,
                              parent_table_name = None):
        """Append one row (a dict keyed by column xpath) to ``data_section``
        and advance the section's running index."""
        data_section.append({})
        self.sections[section_name][self.CURRENT_INDEX_META] += 1
        index = self.sections[section_name][self.CURRENT_INDEX_META]
        #data_section[len(data_section)-1].update(record) # we could simply do
        # this but end up with duplicate data from repeats
        # find any select multiple(s) and add additional columns to record
        record = self._split_select_multiples(record, self.select_multiples)
        # alt, precision
        self._split_gps_fields(record, self.gps_fields)
        for column in columns:
            data_value = None
            try:
                data_value = record[column]
            except KeyError:
                # a record may not have responses for some elements simply
                # because they were not captured
                pass
            data_section[len(data_section)-1].update({column: data_value})
        # attach bookkeeping columns linking this row to its parent
        data_section[len(data_section)-1].update({
            XLSDataFrameBuilder.INDEX_COLUMN: index,
            XLSDataFrameBuilder.PARENT_INDEX_COLUMN: parent_index,
            XLSDataFrameBuilder.PARENT_TABLE_NAME_COLUMN: parent_table_name})

    def _generate_sections(self):
        """
        Split survey questions into separate sections for each xls sheet and
        columns for each section
        """
        # clear list
        self.sections = OrderedDict()
        self.survey_name, survey_xpath = survey_name_and_xpath_from_dd(
            self.dd)
        # generate a unique and valid xls sheet name
        self.survey_name = get_valid_sheet_name(self.survey_name,
                                                self.sections.keys())
        # setup the default section
        self._create_section(self.survey_name, survey_xpath, False)
        # dict of select multiple elements
        self.select_multiples = {}
        # get form elements to split repeats into separate section/sheets and
        # everything else in the default section
        for e in self.dd.get_survey_elements():
            # check for a Section or sub-classes of
            if isinstance(e, Section):
                # always default to the main sheet
                sheet_name = self.survey_name
                # if a repeat we use its name
                if isinstance(e, RepeatingSection):
                    sheet_name = e.name
                    sheet_name = get_valid_sheet_name(sheet_name,
                                                      self.sections.keys())
                    self._create_section(sheet_name,
                                         e.get_abbreviated_xpath(), True)
                # for each child add to survey_sections
                for c in e.children:
                    if isinstance(c, Question) and not \
                            question_types_to_exclude(c.type)\
                            and not c.bind.get(u"type") == \
                            MULTIPLE_SELECT_BIND_TYPE:
                        self._add_column_to_section(sheet_name, c)
                    elif c.bind.get(u"type") == MULTIPLE_SELECT_BIND_TYPE:
                        self.select_multiples[c.get_abbreviated_xpath()] = \
                            [option.get_abbreviated_xpath()
                             for option in c.children]
                        # if select multiple, get its choices and make them
                        # columns
                        for option in c.children:
                            self._add_column_to_section(sheet_name, option)
                    # split gps fields within this section
                    if c.bind.get(u"type") == GEOPOINT_BIND_TYPE:
                        # add columns for geopoint components
                        for xpath in\
                                self.dd.get_additional_geopoint_xpaths(
                                c.get_abbreviated_xpath()):
                            self._add_column_to_section(sheet_name, xpath)
        self.get_exceeds_xls_limits()

    def get_exceeds_xls_limits(self):
        """Return (and lazily cache) whether the layout exceeds xls sheet or
        column limits."""
        if not hasattr(self, "exceeds_xls_limits"):
            self.exceeds_xls_limits = False
            if len(self.sections) > self.XLS_SHEET_COUNT_LIMIT:
                self.exceeds_xls_limits = True
            else:
                for section in self.sections.itervalues():
                    if len(section["columns"]) > self.XLS_COLUMN_COUNT_MAX:
                        self.exceeds_xls_limits = True
                        break
        return self.exceeds_xls_limits

    def _create_section(self, section_name, xpath, is_repeat):
        """Register an empty section (sheet) with a zeroed running index."""
        # NOTE(review): index is computed but never used
        index = len(self.sections)
        self.sections[section_name] = {"name": section_name, "xpath": xpath,
                                       "columns": [], "is_repeat": is_repeat,
                                       self.CURRENT_INDEX_META: 0}

    def _add_column_to_section(self, sheet_name, column):
        """Append ``column`` (a SurveyElement or xpath string) to the named
        section, skipping duplicates."""
        section = self.sections[sheet_name]
        xpath = None
        if isinstance(column, SurveyElement):
            xpath = column.get_abbreviated_xpath()
        elif isinstance(column, basestring):
            xpath = column
        assert(xpath)
        # make sure column is not already in list
        if xpath not in section["columns"]:
            section["columns"].append(xpath)
class XLSDataFrameBuilder(AbstractDataFrameBuilder):
    """
    Generate structures from mongo and DataDictionary for a DataFrameXLSWriter

    This builder can choose to query the data in batches and write to a single
    ExcelWriter object using multiple instances of DataFrameXLSWriter
    """
    # bookkeeping columns appended to every sheet so repeat rows can be
    # joined back to their parent row
    INDEX_COLUMN = u"_index"
    PARENT_TABLE_NAME_COLUMN = u"_parent_table_name"
    PARENT_INDEX_COLUMN = u"_parent_index"
    EXTRA_COLUMNS = [
        INDEX_COLUMN, PARENT_TABLE_NAME_COLUMN, PARENT_INDEX_COLUMN
    ]
    SHEET_NAME_MAX_CHARS = 30
    # presumably the xls (Excel 97) format caps enforced by the writer —
    # TODO confirm exact limits
    XLS_SHEET_COUNT_LIMIT = 255
    XLS_COLUMN_COUNT_MAX = 255
    # per-section key that tracks the running row index
    CURRENT_INDEX_META = 'current_index'

    def __init__(self, username, id_string, filter_query=None,
                 group_delimiter=DEFAULT_GROUP_DELIMITER,
                 split_select_multiples=True):
        super(XLSDataFrameBuilder, self).__init__(username, id_string,
                                                  filter_query,
                                                  group_delimiter,
                                                  split_select_multiples)

    def _setup(self):
        """Extend base setup by computing the per-sheet section layout."""
        super(XLSDataFrameBuilder, self)._setup()
        # need to split columns, with repeats in individual sheets and
        # everything else on the default sheet
        self._generate_sections()

    def export_to(self, file_path, batchsize=1000):
        """Export all sections to an xls workbook at ``file_path``, querying
        mongo in slices of ``batchsize``."""
        self.xls_writer = ExcelWriter(file_path)
        # get record count
        record_count = self._query_mongo(count=True)
        # query in batches and for each batch create an XLSDataFrameWriter and
        # write to existing xls_writer object
        start = 0
        header = True
        while start < record_count:
            cursor = self._query_mongo(self.filter_query, start=start,
                                       limit=batchsize)
            data = self._format_for_dataframe(cursor)
            # write all cursor's data to their respective sheets
            for section_name, section in self.sections.iteritems():
                records = data[section_name]
                # TODO: currently ignoring nested repeats so ignore sections
                # that have 0 records
                if len(records) > 0:
                    # use a different group delimiter if needed
                    columns = section["columns"]
                    if self.group_delimiter != DEFAULT_GROUP_DELIMITER:
                        columns = [
                            self.group_delimiter.join(col.split("/"))
                            for col in columns
                        ]
                    columns = columns + self.EXTRA_COLUMNS
                    writer = XLSDataFrameWriter(records, columns)
                    writer.write_to_excel(self.xls_writer, section_name,
                                          header=header, index=False)
                    # NOTE(review): header goes False after the first
                    # non-empty section — later sheets get no header row;
                    # confirm this is intended
                    header = False
            # increment counter(s)
            start += batchsize
            # brief pause between batches, presumably to ease load on mongo —
            # TODO confirm
            time.sleep(0.1)
        self.xls_writer.save()

    def _format_for_dataframe(self, cursor):
        """
        Format each record for consumption by a dataframe

        returns a dictionary with the key being the name of the sheet,
        and values a list of dicts to feed into a DataFrame
        """
        data = dict(
            (section_name, []) for section_name in self.sections.keys())
        main_section = self.sections[self.survey_name]
        main_sections_columns = main_section["columns"]
        for record in cursor:
            # from record, we'll end up with multiple records, one for each
            # section we have
            # add records for the default section
            self._add_data_for_section(data[self.survey_name], record,
                                       main_sections_columns,
                                       self.survey_name)
            parent_index = main_section[self.CURRENT_INDEX_META]
            for sheet_name, section in self.sections.iteritems():
                # skip default section i.e survey name
                if sheet_name != self.survey_name:
                    xpath = section["xpath"]
                    columns = section["columns"]
                    # TODO: handle nested repeats -ignoring nested repeats for
                    # now which will not be in the top level record, perhaps
                    # nest sections as well so we can recurs in and get them
                    if record.has_key(xpath):
                        repeat_records = record[xpath]
                        # NOTE(review): num_repeat_records is never used
                        num_repeat_records = len(repeat_records)
                        for repeat_record in repeat_records:
                            self._add_data_for_section(data[sheet_name],
                                                       repeat_record, columns,
                                                       sheet_name,
                                                       parent_index,
                                                       self.survey_name)
        return data

    def _add_data_for_section(self, data_section, record, columns,
                              section_name, parent_index=-1,
                              parent_table_name=None):
        """Append one row (a dict keyed by — possibly re-delimited — column
        xpath) to ``data_section`` and advance the section's running index."""
        data_section.append({})
        self.sections[section_name][self.CURRENT_INDEX_META] += 1
        index = self.sections[section_name][self.CURRENT_INDEX_META]
        #data_section[len(data_section)-1].update(record) # we could simply do
        # this but end up with duplicate data from repeats
        if self.split_select_multiples:
            # find any select multiple(s) and add additional columns to record
            record = self._split_select_multiples(record,
                                                  self.select_multiples)
        # alt, precision
        self._split_gps_fields(record, self.gps_fields)
        for column in columns:
            data_value = None
            try:
                data_value = record[column]
            except KeyError:
                # a record may not have responses for some elements simply
                # because they were not captured
                pass
            # key the value by the column path, rewritten with the configured
            # group delimiter when it differs from the default
            data_section[len(data_section
                             ) - 1].update(
                {
                    self.group_delimiter.join(
                        column.split('/'))
                    if self.group_delimiter != DEFAULT_GROUP_DELIMITER
                    else column: data_value
                })
        # attach bookkeeping columns linking this row to its parent
        data_section[len(data_section) - 1].update({
            XLSDataFrameBuilder.INDEX_COLUMN: index,
            XLSDataFrameBuilder.PARENT_INDEX_COLUMN: parent_index,
            XLSDataFrameBuilder.PARENT_TABLE_NAME_COLUMN: parent_table_name
        })
        # add ADDITIONAL_COLUMNS
        data_section[len(data_section) - 1].update(
            dict([(column,
                   record[column] if record.has_key(column) else None)
                  for column in self.ADDITIONAL_COLUMNS]))

    def _generate_sections(self):
        """
        Split survey questions into separate sections for each xls sheet and
        columns for each section
        """
        # clear list
        self.sections = OrderedDict()
        # dict of select multiple elements
        self.select_multiples = {}
        survey_element = self.dd.survey
        self.survey_name = get_valid_sheet_name(survey_element.name,
                                                self.sections.keys())
        self._create_section(self.survey_name,
                             survey_element.get_abbreviated_xpath(), False)
        # build sections
        self._build_sections_recursive(self.survey_name, self.dd.get_survey())
        for section_name in self.sections:
            self.sections[section_name]['columns'] += self.ADDITIONAL_COLUMNS
        self.get_exceeds_xls_limits()

    def _build_sections_recursive(self, section_name, element,
                                  is_repeating=False):
        """Builds a section's children and recurses any repeating sections to
        build those as a separate section
        """
        for child in element.children:
            # if a section, recurse
            if isinstance(child, Section):
                new_is_repeating = isinstance(child, RepeatingSection)
                new_section_name = section_name
                # if its repeating, build a new section
                if new_is_repeating:
                    new_section_name = get_valid_sheet_name(
                        child.name, self.sections.keys())
                    self._create_section(new_section_name,
                                         child.get_abbreviated_xpath(), True)
                self._build_sections_recursive(new_section_name, child,
                                               new_is_repeating)
            else:
                # add to survey_sections
                if isinstance(child, Question) and not \
                        question_types_to_exclude(child.type)\
                        and not child.bind.get(u"type") == \
                        MULTIPLE_SELECT_BIND_TYPE:
                    self._add_column_to_section(section_name, child)
                elif child.bind.get(u"type") == MULTIPLE_SELECT_BIND_TYPE:
                    self.select_multiples[child.get_abbreviated_xpath()] = \
                        [option.get_abbreviated_xpath()
                         for option in child.children]
                    # if select multiple, get its choices and make them
                    # columns
                    if self.split_select_multiples:
                        for option in child.children:
                            self._add_column_to_section(section_name, option)
                    else:
                        self._add_column_to_section(section_name, child)
                # split gps fields within this section
                if child.bind.get(u"type") == GEOPOINT_BIND_TYPE:
                    # add columns for geopoint components
                    for xpath in\
                            self.dd.get_additional_geopoint_xpaths(
                            child.get_abbreviated_xpath()):
                        self._add_column_to_section(section_name, xpath)

    def get_exceeds_xls_limits(self):
        """Return (and lazily cache) whether the layout exceeds xls sheet or
        column limits."""
        if not hasattr(self, "exceeds_xls_limits"):
            self.exceeds_xls_limits = False
            if len(self.sections) > self.XLS_SHEET_COUNT_LIMIT:
                self.exceeds_xls_limits = True
            else:
                for section in self.sections.itervalues():
                    if len(section["columns"]) > self.XLS_COLUMN_COUNT_MAX:
                        self.exceeds_xls_limits = True
                        break
        return self.exceeds_xls_limits

    def _create_section(self, section_name, xpath, is_repeat):
        """Register an empty section (sheet) with a zeroed running index."""
        # NOTE(review): index is computed but never used
        index = len(self.sections)
        self.sections[section_name] = {
            "name": section_name,
            "xpath": xpath,
            "columns": [],
            "is_repeat": is_repeat,
            self.CURRENT_INDEX_META: 0
        }

    def _add_column_to_section(self, sheet_name, column):
        """Append ``column`` (a SurveyElement or xpath string) to the named
        section, skipping duplicates."""
        section = self.sections[sheet_name]
        xpath = None
        if isinstance(column, SurveyElement):
            xpath = column.get_abbreviated_xpath()
        elif isinstance(column, basestring):
            xpath = column
        assert (xpath)
        # make sure column is not already in list
        if xpath not in section["columns"]:
            section["columns"].append(xpath)
temp_frame2 = temp_frame2.dropna() if month == 0: final_frame = final_frame.join(temp_frame2, how='right') else: final_frame = pd.concat([final_frame, temp_frame2]) except: pass print 'Just finished ticker %s of %s' % (ticker, num_tickers) today = str( str(dt.datetime.now().month) + str(dt.datetime.now().day) + str(dt.datetime.now().year)) file_name = 'NASDAQ_covered_call' + today xlsx = '.xlsx' csv = '.csv' name_xl = file_name + xlsx name_cs = file_name + csv writer = ExcelWriter(file_name) final_frame.to_excel(writer, sheet_name='Covered Call') writer.save() final_frame.to_csv(name_cs) end_time = time() elapsed_time = end_time - start_time print elapsed_time