コード例 #1
0
ファイル: pdexcel.py プロジェクト: Algomorph/moore_scraper
 def __init__(self, db_filename = "moore_grants_database.xlsx",
              report_prefix = "report", 
              sheet_name = "grants",
              index_column = "url",
              report_only_new = True):
     '''
     Constructor
     '''
     if(not os.path.isfile(db_filename)):
         #generate a blank writable excel sheet from scratch
         field_names = [field_name for field_name in Grant.fields]
         writer = ExcelWriter(db_filename)
         profile_dataframe = pd.DataFrame(columns = field_names)
         profile_dataframe.to_excel(writer,sheet_name)
         writer.save()
         writer.close()
     
     self.report_filename = (report_prefix + "_" 
                             + str(datetime.today())[:19]
                             .replace(":","_").replace(" ","[") + "].xlsx")
     # kept for posterity, in case only the date component is needed and 
     # we don't care about overwrites
     # self.report_filename = report_prefix + "_" + str(date.today())
     self.db_filename = db_filename
     self.sheet_name = sheet_name
     self.dataframe = pd.read_excel(db_filename,sheet_name, index_col = index_column)
     self.usaved_sol_counter = 0
     self.added_counter = 0
     self.added_items = set()
     self.index_column = index_column
コード例 #2
0
ファイル: pdexcel.py プロジェクト: Algomorph/fbo_scraper
 def __init__(self, db_filename = "fbo_solicitations.xlsx",
              report_prefix = "report", 
              sol_sheet_name = "solicitations",
              filtered_sheet_name = "filtered_solicitations",
              index_column = "sponsor_number",
              report_only_new = True):
     '''
     Constructor
     '''
     if(not os.path.isfile(db_filename)):
         #generate a blank writable excel sheet from scratch
         field_names = [field_name for field_name in Opportunity.fields]
         field_names.remove("filtered")
         writer = ExcelWriter(db_filename)
         sol_df = pd.DataFrame(columns = field_names)
         filtered_df = pd.DataFrame(columns = field_names)
         sol_df.to_excel(writer,sol_sheet_name)
         filtered_df.to_excel(writer,filtered_sheet_name)
         writer.save()
         writer.close()
     
     self.report_filename = (report_prefix + "_" 
                             + str(datetime.today())[:19]
                             .replace(":","_").replace(" ","[") + "].xlsx")
     #kept for posterity, in case only the date component is needed and we don't care about overwrites
     #self.report_filename = report_prefix + "_" + str(date.today())
     self.db_filename = db_filename
     self.sol_sheet_name = sol_sheet_name
     self.filtered_sheet_name = filtered_sheet_name
     self.sol_df = pd.read_excel(db_filename,sol_sheet_name, index_col = index_column)
     self.filtered_df = pd.read_excel(db_filename,filtered_sheet_name, index_col = index_column)
     self.usaved_sol_counter = 0
     self.sol_counter = 0
     self.added_items = set()
コード例 #3
0
ファイル: pdexcel.py プロジェクト: Algomorph/moore_scraper
 def save_all(self):
     '''
     Dumps all solicitations in both databases to an excel file
     '''
     print "\n\n========  Saving {:s}  ========".format(self.sheet_name)
     writer = ExcelWriter(self.db_filename)
     self.dataframe.to_excel(writer,self.sheet_name,merge_cells=False)
     writer.save()
     writer.close()
     print "========  Done saving.  ========\n"
コード例 #4
0
    def test_excel_date_datetime_format(self, engine, ext, path):
        # see gh-4133
        #
        # Excel output format strings
        df = DataFrame(
            [
                [date(2014, 1, 31), date(1999, 9, 24)],
                [
                    datetime(1998, 5, 26, 23, 33, 4),
                    datetime(2014, 2, 28, 13, 5, 13)
                ],
            ],
            index=["DATE", "DATETIME"],
            columns=["X", "Y"],
        )
        df_expected = DataFrame(
            [
                [datetime(2014, 1, 31),
                 datetime(1999, 9, 24)],
                [
                    datetime(1998, 5, 26, 23, 33, 4),
                    datetime(2014, 2, 28, 13, 5, 13)
                ],
            ],
            index=["DATE", "DATETIME"],
            columns=["X", "Y"],
        )

        with tm.ensure_clean(ext) as filename2:
            writer1 = ExcelWriter(path)
            writer2 = ExcelWriter(
                filename2,
                date_format="DD.MM.YYYY",
                datetime_format="DD.MM.YYYY HH-MM-SS",
            )

            df.to_excel(writer1, "test1")
            df.to_excel(writer2, "test1")

            writer1.close()
            writer2.close()

            reader1 = ExcelFile(path)
            reader2 = ExcelFile(filename2)

            rs1 = pd.read_excel(reader1, "test1", index_col=0)
            rs2 = pd.read_excel(reader2, "test1", index_col=0)

            tm.assert_frame_equal(rs1, rs2)

            # Since the reader returns a datetime object for dates,
            # we need to use df_expected to check the result.
            tm.assert_frame_equal(rs2, df_expected)
コード例 #5
0
ファイル: pdexcel.py プロジェクト: Algomorph/fbo_scraper
 def save_all(self):
     '''
     Dumps all solicitations in both databases to an excel file,
     into two separate spreadsheets: one for filtered items, the other
     for the remaining (relevant) items
     '''
     print "\n\n========  Saving solicitations...  ========"
     writer = ExcelWriter(self.db_filename)
     self.sol_df.to_excel(writer,self.sol_sheet_name,merge_cells=False)
     self.filtered_df.to_excel(writer,self.filtered_sheet_name,merge_cells=False)
     writer.save()
     writer.close()
     print "========  Done saving.  ========\n"
コード例 #6
0
ファイル: pdexcel.py プロジェクト: JLaws0n/fbo_scraper
 def save_all(self):
     '''
     Dumps all solicitations in both databases to an excel file,
     into two separate spreadsheets: one for filtered items, the other
     for the remaining (relevant) items
     '''
     print "\n\n========  Saving solicitations...  ========"
     writer = ExcelWriter(self.db_filename)
     self.sol_df.to_excel(writer, self.sol_sheet_name, merge_cells=False)
     self.filtered_df.to_excel(writer,
                               self.filtered_sheet_name,
                               merge_cells=False)
     writer.save()
     writer.close()
     print "========  Done saving.  ========\n"
コード例 #7
0
ファイル: pdexcel.py プロジェクト: Algomorph/moore_scraper
 def generate_report(self):
     '''
     Generates a separate excel report, consisting of non-award-type notices
     that are not yet overdue
     '''
     print "\n\n========  Generating report...  ========"
     df = self.dataframe.copy()
     ix = pd.Series([(True if ix in self.added_items else False ) 
                                   for ix in df.index ],
                                   index=df.index)
     report_df = df[ix == True]
     
     writer = ExcelWriter(self.report_filename)
     report_df.to_excel(writer,self.sheet_name,merge_cells=False)
     writer.save()
     writer.close()
     
     print "========  Report Generated as " + self.report_filename + " ========\n"
コード例 #8
0
ファイル: test_excel.py プロジェクト: thorwhalen/pandas
    def test_excel_date_datetime_format(self):
        _skip_if_no_xlrd()
        df = DataFrame(
            [[date(2014, 1, 31), date(1999, 9, 24)],
             [
                 datetime(1998, 5, 26, 23, 33, 4),
                 datetime(2014, 2, 28, 13, 5, 13)
             ]],
            index=['DATE', 'DATETIME'],
            columns=['X', 'Y'])
        df_expected = DataFrame(
            [[datetime(2014, 1, 31),
              datetime(1999, 9, 24)],
             [
                 datetime(1998, 5, 26, 23, 33, 4),
                 datetime(2014, 2, 28, 13, 5, 13)
             ]],
            index=['DATE', 'DATETIME'],
            columns=['X', 'Y'])

        with ensure_clean(self.ext) as filename1:
            with ensure_clean(self.ext) as filename2:
                writer1 = ExcelWriter(filename1)
                writer2 = ExcelWriter(filename2,
                                      date_format='DD.MM.YYYY',
                                      datetime_format='DD.MM.YYYY HH-MM-SS')

                df.to_excel(writer1, 'test1')
                df.to_excel(writer2, 'test1')

                writer1.close()
                writer2.close()

                reader1 = ExcelFile(filename1)
                reader2 = ExcelFile(filename2)

                rs1 = reader1.parse('test1', index_col=None)
                rs2 = reader2.parse('test1', index_col=None)

                tm.assert_frame_equal(rs1, rs2)

                # since the reader returns a datetime object for dates, we need
                # to use df_expected to check the result
                tm.assert_frame_equal(rs2, df_expected)
コード例 #9
0
ファイル: pdexcel.py プロジェクト: Algomorph/nsf_scraper
 def generate_report(self):
     '''
     Generates a separate excel report, consisting of non-award-type notices
     that are not yet overdue
     '''
     print "\n\n========  Generating report...  ========"
     today = datetime.today()
     df = self.sol_df.copy()
     df["new"] = pd.Series([(True if ix in self.added_items else False ) 
                                   for ix in df.index ],
                                   index=df.index)
     
     report_df = df[(df["proposal_due_date"] >= today) | (df["proposal_due_date"] == None)]
     
     writer = ExcelWriter(self.report_filename)
     report_df.to_excel(writer,self.sol_sheet_name,merge_cells=False)
     writer.save()
     writer.close()
     
     print "========  Report Generated as " + self.report_filename + " ========\n"
コード例 #10
0
ファイル: pdexcel.py プロジェクト: Algomorph/fbo_scraper
 def generate_report(self):
     '''
     Generates a separate excel report, consisting of non-award-type notices
     that are not yet overdue
     '''
     print "\n\n========  Generating report...  ========"
     today = datetime.today()
     df = self.sol_df.copy()
     df["new"] = pd.Series([(1 if ix in self.added_items else 0 ) 
                                   for ix in df.index ],
                                   index=df.index)
     df["dd"] = [datetime.strptime(dt, "%m/%d/%Y") for dt in df["deadline_date"].values]
     report_df = df[(df["dd"] >= today) 
                             & (df["announcement_type"] != "Award")]
     
     writer = ExcelWriter(self.report_filename)
     report_df.to_excel(writer,self.sol_sheet_name,merge_cells=False)
     writer.save()
     writer.close()
     
     print "========  Report Generated as " + self.report_filename + " ========\n"
コード例 #11
0
ファイル: pdexcel.py プロジェクト: JLaws0n/fbo_scraper
    def __init__(self,
                 db_filename="fbo_solicitations.xlsx",
                 report_prefix="report",
                 sol_sheet_name="solicitations",
                 filtered_sheet_name="filtered_solicitations",
                 index_column="sponsor_number",
                 report_only_new=True):
        '''
        Constructor
        '''
        if (not os.path.isfile(db_filename)):
            #generate a blank writable excel sheet from scratch
            field_names = [field_name for field_name in Opportunity.fields]
            field_names.remove("filtered")
            writer = ExcelWriter(db_filename)
            sol_df = pd.DataFrame(columns=field_names)
            filtered_df = pd.DataFrame(columns=field_names)
            sol_df.to_excel(writer, sol_sheet_name)
            filtered_df.to_excel(writer, filtered_sheet_name)
            writer.save()
            writer.close()

        self.report_filename = (
            report_prefix + "_" +
            str(datetime.today())[:19].replace(":", "_").replace(" ", "[") +
            "].xlsx")
        #kept for posterity, in case only the date component is needed and we don't care about overwrites
        #self.report_filename = report_prefix + "_" + str(date.today())
        self.db_filename = db_filename
        self.sol_sheet_name = sol_sheet_name
        self.filtered_sheet_name = filtered_sheet_name
        self.sol_df = pd.read_excel(db_filename,
                                    sol_sheet_name,
                                    index_col=index_column)
        self.filtered_df = pd.read_excel(db_filename,
                                         filtered_sheet_name,
                                         index_col=index_column)
        self.usaved_sol_counter = 0
        self.sol_counter = 0
        self.added_items = set()
コード例 #12
0
ファイル: test_writers.py プロジェクト: forking-repos/pandas
    def test_excel_date_datetime_format(self, engine, ext):
        # see gh-4133
        #
        # Excel output format strings
        df = DataFrame([[date(2014, 1, 31),
                         date(1999, 9, 24)],
                        [datetime(1998, 5, 26, 23, 33, 4),
                         datetime(2014, 2, 28, 13, 5, 13)]],
                       index=["DATE", "DATETIME"], columns=["X", "Y"])
        df_expected = DataFrame([[datetime(2014, 1, 31),
                                  datetime(1999, 9, 24)],
                                 [datetime(1998, 5, 26, 23, 33, 4),
                                  datetime(2014, 2, 28, 13, 5, 13)]],
                                index=["DATE", "DATETIME"], columns=["X", "Y"])

        with ensure_clean(ext) as filename2:
            writer1 = ExcelWriter(self.path)
            writer2 = ExcelWriter(filename2,
                                  date_format="DD.MM.YYYY",
                                  datetime_format="DD.MM.YYYY HH-MM-SS")

            df.to_excel(writer1, "test1")
            df.to_excel(writer2, "test1")

            writer1.close()
            writer2.close()

            reader1 = ExcelFile(self.path)
            reader2 = ExcelFile(filename2)

            rs1 = pd.read_excel(reader1, "test1", index_col=0)
            rs2 = pd.read_excel(reader2, "test1", index_col=0)

            tm.assert_frame_equal(rs1, rs2)

            # Since the reader returns a datetime object for dates,
            # we need to use df_expected to check the result.
            tm.assert_frame_equal(rs2, df_expected)
コード例 #13
0
ファイル: test_excel.py プロジェクト: 5i7788/pandas
    def test_excel_date_datetime_format(self):
        _skip_if_no_xlrd()
        df = DataFrame([[date(2014, 1, 31),
                         date(1999, 9, 24)],
                        [datetime(1998, 5, 26, 23, 33, 4),
                         datetime(2014, 2, 28, 13, 5, 13)]],
                       index=['DATE', 'DATETIME'], columns=['X', 'Y'])
        df_expected = DataFrame([[datetime(2014, 1, 31),
                                  datetime(1999, 9, 24)],
                                 [datetime(1998, 5, 26, 23, 33, 4),
                                  datetime(2014, 2, 28, 13, 5, 13)]],
                                index=['DATE', 'DATETIME'], columns=['X', 'Y'])

        with ensure_clean(self.ext) as filename1:
            with ensure_clean(self.ext) as filename2:
                writer1 = ExcelWriter(filename1)
                writer2 = ExcelWriter(filename2,
                  date_format='DD.MM.YYYY',
                  datetime_format='DD.MM.YYYY HH-MM-SS')

                df.to_excel(writer1, 'test1')
                df.to_excel(writer2, 'test1')

                writer1.close()
                writer2.close()

                reader1 = ExcelFile(filename1)
                reader2 = ExcelFile(filename2)

                rs1 = reader1.parse('test1', index_col=None)
                rs2 = reader2.parse('test1', index_col=None)

                tm.assert_frame_equal(rs1, rs2)

                # since the reader returns a datetime object for dates, we need
                # to use df_expected to check the result
                tm.assert_frame_equal(rs2, df_expected)
コード例 #14
0
ファイル: pdexcel.py プロジェクト: Algomorph/nsf_scraper
    def __init__(self, db_filename = "nsf_solicitations.xlsx",
                 report_prefix = "report", 
                 sol_sheet_name = "solicitations",
                 filtered_sheet_name = "filtered_solicitations",
                 index_column = "pims_id"):
        '''
        Constructor
        '''
        if(not os.path.isfile(db_filename)):
            #generate a blank writable excel sheet from scratch
            field_names = [field_name for field_name in NsfSolicitation.fields]
            field_names.remove("filtered")
            writer = ExcelWriter(db_filename)
            sol_df = pd.DataFrame(columns = field_names)
            filtered_df = pd.DataFrame(columns = field_names)
            sol_df.to_excel(writer,sol_sheet_name)
            filtered_df.to_excel(writer,filtered_sheet_name)
            writer.save()
            writer.close()
        
        self.report_filename = (report_prefix + "_" 
                                + str(datetime.today())[:19]
                                .replace(":","_").replace(" ","[") + "].xlsx")

        self.db_filename = db_filename
        self.sol_sheet_name = sol_sheet_name
        self.filtered_sheet_name = filtered_sheet_name
        self.sol_df = pd.read_excel(db_filename,sol_sheet_name, index_col = index_column)
        self.filtered_df = pd.read_excel(db_filename,filtered_sheet_name, index_col = index_column)
        self.usaved_sol_counter = 0
        self.sol_counter = 0
        self.added_items = set()
        self.solicitation_numbers = set()
        for sn in self.sol_df["solicitation_number"].values:
            self.solicitation_numbers.add(sn)
        self.index_column = index_column
コード例 #15
0
ファイル: pdexcel.py プロジェクト: JLaws0n/fbo_scraper
    def generate_report(self):
        '''
        Generates a separate excel report, consisting of non-award-type notices
        that are not yet overdue
        '''
        print "\n\n========  Generating report...  ========"
        today = datetime.today()
        df = self.sol_df.copy()
        df["new"] = pd.Series([(1 if ix in self.added_items else 0)
                               for ix in df.index],
                              index=df.index)
        df["dd"] = [
            datetime.strptime(dt, "%m/%d/%Y")
            for dt in df["deadline_date"].values
        ]
        report_df = df[(df["dd"] >= today)
                       & (df["announcement_type"] != "Award")]

        writer = ExcelWriter(self.report_filename)
        report_df.to_excel(writer, self.sol_sheet_name, merge_cells=False)
        writer.save()
        writer.close()

        print "========  Report Generated as " + self.report_filename + " ========\n"
コード例 #16
0
    def write(
        self,
        writer,
        sheet_name="Sheet1",
        startrow=0,
        startcol=0,
        freeze_panes=None,
        engine=None,
        storage_options: StorageOptions = None,
    ):
        """
        writer : path-like, file-like, or ExcelWriter object
            File path or existing ExcelWriter
        sheet_name : string, default 'Sheet1'
            Name of sheet which will contain DataFrame
        startrow :
            upper left cell row to dump area_data frame
        startcol :
            upper left cell column to dump area_data frame
        freeze_panes : tuple of integer (length 2), default None
            Specifies the one-based bottommost row and rightmost column that
            is to be frozen
        engine : string, default None
            write engine to use if writer is a path - you can also set this
            via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
            and ``io.excel.xlsm.writer``.

            .. deprecated:: 1.2.0

                As the `xlwt <https://pypi.org/project/xlwt/>`__ package is no longer
                maintained, the ``xlwt`` engine will be removed in a future
                version of pandas.

        {storage_options}

            .. versionadded:: 1.2.0
        """
        from pandas.io.excel import ExcelWriter

        num_rows, num_cols = self.df.shape
        if num_rows > self.max_rows or num_cols > self.max_cols:
            raise ValueError(
                f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} "
                f"Max sheet size is: {self.max_rows}, {self.max_cols}")

        formatted_cells = self.get_formatted_cells()
        if isinstance(writer, ExcelWriter):
            need_save = False
        else:
            # pandas\io\formats\excel.py:808: error: Cannot instantiate
            # abstract class 'ExcelWriter' with abstract attributes 'engine',
            # 'save', 'supported_extensions' and 'write_cells'  [abstract]
            writer = ExcelWriter(  # type: ignore[abstract]
                writer,
                engine=engine,
                storage_options=storage_options)
            need_save = True

        try:
            writer.write_cells(
                formatted_cells,
                sheet_name,
                startrow=startrow,
                startcol=startcol,
                freeze_panes=freeze_panes,
            )
        finally:
            # make sure to close opened file handles
            if need_save:
                writer.close()
コード例 #17
0
    def write(
        self,
        writer,
        sheet_name="Sheet1",
        startrow=0,
        startcol=0,
        freeze_panes=None,
        engine=None,
        storage_options: StorageOptions = None,
    ):
        """
        writer : path-like, file-like, or ExcelWriter object
            File path or existing ExcelWriter
        sheet_name : string, default 'Sheet1'
            Name of sheet which will contain DataFrame
        startrow :
            upper left cell row to dump data frame
        startcol :
            upper left cell column to dump data frame
        freeze_panes : tuple of integer (length 2), default None
            Specifies the one-based bottommost row and rightmost column that
            is to be frozen
        engine : string, default None
            write engine to use if writer is a path - you can also set this
            via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
            and ``io.excel.xlsm.writer``.
        storage_options : dict, optional
            Extra options that make sense for a particular storage connection, e.g.
            host, port, username, password, etc., if using a URL that will
            be parsed by ``fsspec``, e.g., starting "s3://", "gcs://".

            .. versionadded:: 1.2.0
        """
        from pandas.io.excel import ExcelWriter

        num_rows, num_cols = self.df.shape
        if num_rows > self.max_rows or num_cols > self.max_cols:
            raise ValueError(
                f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} "
                f"Max sheet size is: {self.max_rows}, {self.max_cols}")

        formatted_cells = self.get_formatted_cells()
        if isinstance(writer, ExcelWriter):
            need_save = False
        else:
            # pandas\io\formats\excel.py:808: error: Cannot instantiate
            # abstract class 'ExcelWriter' with abstract attributes 'engine',
            # 'save', 'supported_extensions' and 'write_cells'  [abstract]
            writer = ExcelWriter(  # type: ignore[abstract]
                writer,
                engine=engine,
                storage_options=storage_options)
            need_save = True

        try:
            writer.write_cells(
                formatted_cells,
                sheet_name,
                startrow=startrow,
                startcol=startcol,
                freeze_panes=freeze_panes,
            )
        finally:
            # make sure to close opened file handles
            if need_save:
                writer.close()
コード例 #18
0
                standardized = get_val(row.standardized)
                encoding = get_val(row.encoding)
                if not downloaded and len(downloaded) < 1:
                    downloaded, encoding = spider.start_single(
                        row.uri, Path.joinpath(folder_download, downloaded))
                    task_content.loc[index, C.REQ_DOWNLOAD] = downloaded.name
                    task_content.loc[index, C.REQ_ENCODING] = encoding
                else:
                    downloaded = folder_download.joinpath(downloaded)
                if not standardized and len(standardized) < 1:
                    standardized = std.parse_one(downloaded=downloaded,
                                                 from_lang=src_language,
                                                 to_lang=dst_language,
                                                 encoding=encoding)
                    task_content.loc[index,
                                     C.REQ_STANDARDIZED] = standardized.name
            except Exception as e:
                exc_type, exc_val, _ = sys.exc_info()
                if type(e) == r.exceptions.ConnectionError:
                    lg.error(f'{dir(e)}')
                    lg.error(f'errno:{e.errno} - strerror:{e.strerror}')
                    task_content.loc[
                        index,
                        C.REQ_ERROR] = f'{type(e)}:{e.errno}:{e.strerror}'
                lg.error(f"Exception occurred: {exc_type}\n", exc_info=True)
        ew = ExcelWriter(str(pth))
        task_content.to_excel(ew, index=False, encoding='utf-8')
        ew.save()
        ew.close()
        lg.info(f'** finished file:{pth}')