Python ParserXlsの例、file_parser.ParserXls Pythonの例

コード例 #1

0

ファイルを表示

 def __init__(self, inTest=False):
     """Configure the provider for the UIBM sample workbook and load its data.

     Args:
         inTest: Flag stored unchanged on the instance as ``inTest``.
     """
     self.inTest = inTest
     self.sheet_name = 0
     self.file_path = self.root_path + r"UIBM/"
     self.file_parser = ParserXls(self.file_path + "UIBMSourceSample.xlsx")
     self.df = self.get_dataframe_merged()

     # All 16 column positions default to 'object'; only the exceptions
     # are spelled out explicitly.
     type_by_position = dict.fromkeys(range(16), 'object')
     type_by_position.update({0: 'int', 2: 'date', 5: 'int', 15: 'bool'})
     self.column_types = type_by_position

     # Every column position maps to False.
     self.column_constraints = dict.fromkeys(range(16), False)

コード例 #2

0

ファイルを表示

class ContrattiRete(DataProvider):
    """Data provider for the "Contratti rete" Excel source.

    The constructor opens the "Elenco" and "Sogg. Giu." sheets;
    ``preprocessing_merge_sheets`` stacks them into a single DataFrame.
    """

    # Positional column mapping between the two sheets:
    # (column position in "Sogg. Giu.", column position in "Elenco").
    # Positions are evaluated after the "SoggettoGiuridico" column has been
    # appended to both frames.
    _SHEET_COLUMN_PAIRS = (
        (0, 0), (1, 1), (4, 2), (9, 3), (10, 5), (12, 6), (13, 7),
        (14, 8), (15, 10), (16, 11), (17, 12), (18, 13), (19, 14),
        (20, 18),
    )

    def __init__(self, file_name):
        """Open both source sheets of the given workbook.

        Args:
            file_name: Name of the Excel file; it is appended to the
                inherited ``file_path`` plus the "Contratti rete/" folder.
        """
        self.file_name = file_name
        self.file_path = self.file_path + r"Contratti rete/" + self.file_name

        self.parser = ParserXls(self.file_path)
        self.elenco = self.parser.open_file(sheet_name="Elenco")
        print("Aperto foglio 'elenco' in ContrattiRete.elenco")
        self.sogg_giuridico = self.parser.open_file(sheet_name="Sogg. Giu.")
        print("Aperto foglio 'Sogg. Giu.' in ContrattiRete.sogg_giuridico")

    def preprocessing_merge_sheets(self) -> pd.DataFrame:
        """
        Preprocess the "Contratti di rete" source file.

        The rows of the "Sogg. Giu." sheet are renumbered so that their
        'progr.' values continue after the largest 'progr.' of "Elenco",
        both frames receive a "SoggettoGiuridico" column ('NO' for "Elenco",
        'SI' for "Sogg. Giu."), and the mapped columns of the second sheet
        are renamed to the "Elenco" names before being appended.

        Returns:
            pd.DataFrame -- the "Elenco" rows followed by the aligned
            "Sogg. Giu." rows, with a fresh 0..n-1 index.
        """
        # Work on typed copies of the two sheets
        first_sheet = self.elenco.astype({'progr.': np.int64})
        second_sheet = self.sogg_giuridico.astype({'progr.': np.int64})

        # Shift the progressive number so it continues after the first sheet
        offset = first_sheet['progr.'].max()
        second_sheet['progr.'] = second_sheet['progr.'] + offset

        # Flag column. A scalar is broadcast to every row of its own frame;
        # sizing the 'SI' values from the first sheet's index left NaN in the
        # second sheet whenever it had more rows than the first one.
        first_sheet = first_sheet.assign(SoggettoGiuridico='NO')
        second_sheet = second_sheet.assign(SoggettoGiuridico='SI')

        first_columns = first_sheet.columns.tolist()
        second_columns = second_sheet.columns.tolist()

        # Second-sheet column name -> first-sheet column name
        rename_map = {
            second_columns[source]: first_columns[target]
            for source, target in self._SHEET_COLUMN_PAIRS
        }

        # Select the mapped columns first and rename afterwards, so unmapped
        # columns can never collide with a renamed one and no assumption is
        # made about the length of the column names.
        aligned_second = second_sheet.loc[:, list(rename_map)].rename(
            columns=rename_map)

        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        return pd.concat([first_sheet, aligned_second],
                         ignore_index=True,
                         sort=False)

コード例 #3

0

ファイルを表示

    def __init__(self, file_name):
        """Open the "Elenco" and "Sogg. Giu." sheets of the given workbook.

        Args:
            file_name: Name of the Excel file; it is appended to the
                existing ``file_path`` plus the "Contratti rete/" folder.
        """
        self.file_name = file_name
        self.file_path += r"Contratti rete/" + file_name

        workbook = ParserXls(self.file_path)
        self.parser = workbook

        # (attribute to fill, sheet to open, progress message)
        sheets_to_load = (
            ("elenco", "Elenco",
             "Aperto foglio 'elenco' in ContrattiRete.elenco"),
            ("sogg_giuridico", "Sogg. Giu.",
             "Aperto foglio 'Sogg. Giu.' in ContrattiRete.sogg_giuridico"),
        )
        for attribute, sheet, message in sheets_to_load:
            setattr(self, attribute, workbook.open_file(sheet_name=sheet))
            print(message)

コード例 #4

0

ファイルを表示

class RatingLegalita(DataProvider):
    """Data provider for the "RatingLegalita" Excel source."""

    def __init__(self, inTest=False):
        """Open the workbook (skipping its first row) and set column metadata.

        Args:
            inTest: Flag stored unchanged on the instance as ``inTest``.
        """
        self.inTest = inTest
        self.sheet_name = 0
        self.file_path = self.root_path + r"RatingLegalita/"
        self.file_parser = ParserXls(self.file_path + "27mar2020.xlsx")
        self.df = self.file_parser.open_file(skiprows=1)

        # Eight columns, all 'object' except the two date columns.
        type_by_position = dict.fromkeys(range(8), 'object')
        type_by_position[4] = 'date'
        type_by_position[7] = 'date'
        self.column_types = type_by_position

        # One False entry per declared column.
        self.column_constraints = dict.fromkeys(type_by_position, False)

    def filter_fiscalcodes_dataframe(self,
                                     inplace=False
                                     ) -> Union[None, pd.DataFrame]:
        """Delegate to the parent implementation using column 1.

        Args:
            inplace: Forwarded unchanged to the parent method.

        Returns:
            Whatever the parent ``filter_fiscalcodes_dataframe`` returns.
        """
        fiscal_code_column = 1
        return super().filter_fiscalcodes_dataframe(fiscal_code_column,
                                                    inplace=inplace)

    def update_rating_column_with_spaces(self):
        """Add a space between the asterisk and the rating.

        Example: '*++' -> '* ++'

        Works on the second-to-last column of ``self.df``; values that are
        not split into a list (for instance missing values) are left as is.
        """

        def _rejoin(parts):
            # Only the split results are lists; anything else passes through.
            if isinstance(parts, list):
                return "* ".join(parts)
            return parts

        split_ratings = self.df.iloc[:, -2].str.rsplit("*", n=1)
        self.df.iloc[:, -2] = split_ratings.map(_rejoin)

コード例 #5

0

ファイルを表示

def get_dataframes_from_file(parser: ParserXls) -> list:
    '''
    Open every sheet exposed by the parser.

    Returns one entry per name in ``parser.sheet_names``, in the same order,
    each being the result of ``parser.open_file(sheet_name=...)``.
    '''
    return [parser.open_file(sheet_name=name) for name in parser.sheet_names]

コード例 #6

0

ファイルを表示

    def __init__(self, inTest=False):
        """
        ContrattiRete data provider

        Stores the workbook location and column metadata, then tries to load
        the "NuovoElenco" sheet through ``open_dataframe_from_sheet_name``.

        Keyword Arguments:
            inTest {bool} -- opening test file (default: {False})
        """
        self.inTest = inTest
        self.sheet_name = "NuovoElenco"
        self.file_path = self.root_path + r"ContrattiRete/"
        self.file_parser = ParserXls(self.file_path + "ContrattiRete.xlsx")

        # Nineteen columns, all 'object' except positions 0 and 2.
        type_by_position = dict.fromkeys(range(19), 'object')
        type_by_position.update({0: 'int', 2: 'date'})
        self.column_types = type_by_position

        # Only positions 3, 4 and 7 are flagged True.
        flagged_positions = (3, 4, 7)
        self.column_constraints = {
            position: position in flagged_positions
            for position in range(len(type_by_position))
        }

        self.unique_column_names = ['numero repertorio', 'numero atto', 'c.f.']

        self.open_dataframe_from_sheet_name(sheet_name=self.sheet_name)

コード例 #7

0

ファイルを表示

 def __init__(self, inTest=False):
     """Open the RatingLegalita workbook and set the column metadata.

     Args:
         inTest: Flag stored unchanged on the instance as ``inTest``.
     """
     self.inTest = inTest
     self.sheet_name = 0
     self.file_path = self.root_path + r"RatingLegalita/"
     self.file_parser = ParserXls(self.file_path + "27mar2020.xlsx")
     # The first row of the sheet is skipped when reading.
     self.df = self.file_parser.open_file(skiprows=1)

     # Eight columns, all 'object' except the two date columns.
     type_by_position = dict.fromkeys(range(8), 'object')
     type_by_position[4] = 'date'
     type_by_position[7] = 'date'
     self.column_types = type_by_position

     # One False entry per declared column.
     self.column_constraints = dict.fromkeys(type_by_position, False)

コード例 #8

0

ファイルを表示

    def open_source(self):
        """
        Load sheet ``self.sheet_name`` of the Excel file into ``self.df``.

        The extension must start with "xls" and ``self.file_path`` must be an
        existing file; otherwise an AssertionError carrying the matching
        message is raised.

        NOTE(review): these checks are ``assert`` statements, so they are
        skipped when Python runs with -O.
        """
        assert self.file_ext.startswith("xls"), \
            TypeError("Wrong file extension!")
        assert os.path.isfile(self.file_path), \
            FileExistsError("File not found!")

        workbook = ParserXls(self.file_path)
        self.df = workbook.open_file(sheet_name=self.sheet_name)

コード例 #9

0

ファイルを表示

def get_laws_from_file(file_name: str) -> set:
    '''
    Collect the laws contained in a FinanziamentiFVG Excel file.

    Every sheet of the file is opened and the resulting dataframes are
    handed to ``get_sets_from_dataframes``, whose result is returned as is
    (no set difference is computed here).
    '''
    sheet_frames = get_dataframes_from_file(ParserXls(file_name))
    return get_sets_from_dataframes(sheet_frames)

コード例 #10

0

ファイルを表示

class BrevettiIta(DataProvider):
    """Data provider for the UIBM sample workbook."""

    def __init__(self, inTest=False):
        """Open the workbook, merge all of its sheets and set column metadata.

        Args:
            inTest: Flag stored unchanged on the instance as ``inTest``.
        """
        self.inTest = inTest
        self.sheet_name = 0
        self.file_path = self.root_path + r"UIBM/"
        self.file_parser = ParserXls(self.file_path + "UIBMSourceSample.xlsx")
        self.df = self.get_dataframe_merged()

        # All 16 column positions default to 'object'; only the exceptions
        # are spelled out explicitly.
        type_by_position = dict.fromkeys(range(16), 'object')
        type_by_position.update({0: 'int', 2: 'date', 5: 'int', 15: 'bool'})
        self.column_types = type_by_position

        # Every column position maps to False.
        self.column_constraints = dict.fromkeys(range(16), False)

    def get_dataframe_merged(self) -> pd.DataFrame:
        """
        Open every sheet of the Excel file and return them stacked in a
        single DataFrame with a fresh 0..n-1 index.
        """
        sheet_frames = [
            self.file_parser.open_file(sheet_name=name)
            for name in self.file_parser.sheet_names
        ]
        return pd.concat(sheet_frames,
                         ignore_index=True,
                         verify_integrity=True)

コード例 #11

0

ファイルを表示

 def __init__(self, inTest=False):
     """Point the provider at the Insiel workbook.

     Args:
         inTest: Flag stored unchanged on the instance as ``inTest``.
     """
     self.inTest = inTest
     source_dir = self.root_path + r"Insiel/"
     self.file_path = source_dir
     self.file_parser = ParserXls(source_dir + "Insiel.xlsx")

コード例 #12

0

ファイルを表示

class ContrattiRete(DataProvider):
    """Data provider for the "ContrattiRete" Excel workbook.

    On construction the provider tries to open the "NuovoElenco" sheet. When
    that fails, the data is rebuilt from the "Elenco" and "Sogg. Giu." sheets
    (see ``preprocessing``) and written back through the file parser.
    """

    def __init__(self, inTest=False):
        """
        ContrattiRete data provider

        Keyword Arguments:
            inTest {bool} -- opening test file (default: {False})
        """
        self.inTest = inTest
        self.sheet_name = "NuovoElenco"
        self.file_path = self.root_path + r"ContrattiRete/"
        self.file_parser = ParserXls(self.file_path + "ContrattiRete.xlsx")

        self.column_types = {
            0: 'int',
            1: 'object',
            2: 'date',
            3: 'object',
            4: 'object',
            5: 'object',
            6: 'object',
            7: 'object',
            8: 'object',
            9: 'object',
            10: 'object',
            11: 'object',
            12: 'object',
            13: 'object',
            14: 'object',
            15: 'object',
            16: 'object',
            17: 'object',
            18: 'object'
        }
        # Every column starts as False; positions 3, 4 and 7 are flagged True.
        self.column_constraints = {
            i: False
            for i in range(len(self.column_types))
        }
        self.column_constraints[3] = True
        self.column_constraints[4] = True
        self.column_constraints[7] = True

        # Columns whose combined values are used to detect duplicates.
        self.unique_column_names = ['numero repertorio', 'numero atto', 'c.f.']

        self.open_dataframe_from_sheet_name(sheet_name=self.sheet_name)

    def open_dataframe_from_sheet_name(self, sheet_name):
        """
        Load ``sheet_name`` into ``self.df``, falling back to preprocessing.

        Arguments:
            sheet_name -- name of the sheet to open with the file parser

        If the sheet cannot be opened, a notice is printed and
        ``preprocessing`` is run to build ``self.df`` instead.
        """
        try:
            df = self.file_parser.open_file(sheet_name=sheet_name)
        except Exception:
            # Deliberate best-effort fallback. ``Exception`` (not a bare
            # ``except``) keeps KeyboardInterrupt/SystemExit propagating.
            print(
                "Il file dev'essere elaborato. Dopo l'elaborazione, ricordati di eseguire i passaggi:\
                \n\t1. rinominare il foglio 'Elenco' in 'VecchioElenco'\
                \n\t2. rinominare il foglio 'NuovoElenco' in 'Elenco'\n")
            self.preprocessing()
        else:
            self.df = df

    def preprocessing(self):
        """
        Processing steps:

            1. Append the contracts found in the "Sogg. Giu." sheet
                to the dataframe self.df

            2. Keep only the contracts selected by the fiscal-code filter

            3. Save the resulting DataFrames into the file
        """
        self.old_df = self.file_parser.open_file(sheet_name="Elenco")
        self.df = self.old_df.astype({'progr.': np.int64})
        self.df_to_append = self.file_parser.open_file(
            sheet_name="Sogg. Giu.").astype({'progr.': np.int64})

        print("Accodamento contratti che sono soggetto giuridico...")
        self.append_dataframe()

        print("Selezione dei soli codici fiscali presenti in I2FVG...")
        self.set_filtred_fiscal_codes_dataframe()

        if not self.is_valid_data_provider():
            self.update_duplicates_sheet()

        self.update_preprocessed_sheet()

    def append_dataframe(self):
        """
        Append self.df_to_append to self.df after aligning its columns
        """
        self.updated_columns_from_sheets()

        self.align_df_to_append()

        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        self.df = pd.concat([self.df, self.df_to_append],
                            ignore_index=True,
                            sort=False)

    def updated_columns_from_sheets(self):
        """
        Update the columns of both DataFrames before they are merged:
        'progr.' of self.df_to_append continues after the maximum 'progr.'
        of self.df, and a "SoggettoGiuridico" column is added to both
        ("NO" for self.df, "SI" for self.df_to_append)
        """
        step = self.df['progr.'].max()
        self.df_to_append.loc[:, 'progr.'] += step

        new_column_name = "SoggettoGiuridico"
        self.df[new_column_name] = "NO"
        self.df_to_append[new_column_name] = "SI"

    def align_df_to_append(self):
        """
        Rename the columns of self.df_to_append to the names used by self.df
        and keep only the mapped ones
        """
        mapping_dict = self.get_mapped_columns_from_sheets()

        # Unmapped columns carry a one-character placeholder name (see
        # get_mapped_columns_from_sheets), so the length test drops them.
        column_selection = [
            column for column in mapping_dict.values() if len(column) > 1
        ]

        self.df_to_append = self.df_to_append.rename(
            columns=mapping_dict).loc[:, column_selection]

    def get_mapped_columns_from_sheets(self) -> dict:
        """
        Return the mapping dictionary between the two dataframes

        Keys are the column names of self.df_to_append. Mapped columns point
        to the corresponding self.df column name; every other column points
        to the first character of its own name, used as a placeholder.
        """
        elenco_cols1 = self.df.columns.tolist()
        elenco_cols2 = self.df_to_append.columns.tolist()
        cols = {col: col[0] for col in elenco_cols2}

        # Pair dictionary keys and values by position to rename the columns
        cols[elenco_cols2[0]] = elenco_cols1[0]
        cols[elenco_cols2[1]] = elenco_cols1[1]
        cols[elenco_cols2[4]] = elenco_cols1[2]
        cols[elenco_cols2[9]] = elenco_cols1[3]
        cols[elenco_cols2[10]] = elenco_cols1[5]
        cols[elenco_cols2[12]] = elenco_cols1[6]
        cols[elenco_cols2[13]] = elenco_cols1[7]
        cols[elenco_cols2[14]] = elenco_cols1[8]
        cols[elenco_cols2[15]] = elenco_cols1[10]
        cols[elenco_cols2[16]] = elenco_cols1[11]
        cols[elenco_cols2[17]] = elenco_cols1[12]
        cols[elenco_cols2[18]] = elenco_cols1[13]
        cols[elenco_cols2[19]] = elenco_cols1[14]
        cols[elenco_cols2[20]] = elenco_cols1[18]

        return cols

    def set_filtred_fiscal_codes_dataframe(self) -> None:
        """
        Restrict self.df to the contracts matched by the fiscal-code filter

        The rows returned by ``filter_fiscalcodes_dataframe(cf_column=7)``
        select the contracts; every row of self.df belonging to one of
        those contracts is kept and the index is renumbered from 0.
        """
        selected_dataframe = self.filter_fiscalcodes_dataframe(cf_column=7)

        selection_filter = self.get_contratti_filter(selected_dataframe)

        # drop=True discards the old index directly instead of materialising
        # it as an 'index' column that has to be dropped afterwards.
        self.df = self.df.loc[selection_filter].reset_index(drop=True)

    def get_contratti_filter(self,
                             selected_dataframe: pd.DataFrame) -> pd.Series:
        """
        Build the row filter selecting the contracts of selected_dataframe

        Arguments:
            selected_dataframe {pd.DataFrame} -- rows whose
                "numero repertorio" and "numero atto" values identify
                the contracts to keep

        Returns:
            pd.Series -- boolean series over self.df, True where both the
                "numero repertorio" and the "numero atto" of the row appear
                in selected_dataframe
        """
        numero_repertorio_list = selected_dataframe["numero repertorio"] \
            .drop_duplicates().tolist()
        numero_atto_list = selected_dataframe["numero atto"] \
            .drop_duplicates().tolist()

        numero_repertorio_filter = self.df["numero repertorio"].isin(
            numero_repertorio_list)
        numero_atto_filter = self.df["numero atto"].isin(numero_atto_list)

        return numero_repertorio_filter & numero_atto_filter

    def is_valid_data_provider(self) -> bool:
        """
        Return True when self.df contains no duplicates, False otherwise
        """
        df_duplicates = self.get_duplicates_dataframe()

        return df_duplicates.empty

    def update_duplicates_sheet(self):
        """
        Save the duplicated rows into the "Duplicati" sheet
        """
        duplicates = self.get_duplicates_dataframe()
        # Keep the original row index so rows can be dropped after cleaning
        duplicates["indice"] = duplicates.index

        print(
            "Sono stati trovati duplicati. Tutti i duplicati sono salvati nel foglio 'Duplicati'"
        )
        self.write_new_dataframe_into_file_parser(duplicates,
                                                  sheet_name="Duplicati")

        print(
            f"Trovati n. {duplicates.shape[0]} duplicati da ripulire manualmente "
            + "contrassegnando i contratti non validi da dover eliminare")

    def get_duplicates_dataframe(self) -> pd.DataFrame:
        """
        Return pandas.DataFrame of duplicates values

        Returns:
            pd.DataFrame -- copy of the original DataFrame only with duplicates
        """
        is_duplicate_filter = self.get_duplicates_bool_series()
        return self.df.loc[is_duplicate_filter].copy()

    def get_duplicates_bool_series(self) -> pd.Series:
        """
        Return the boolean series for duplicate selection

        Returns:
            pd.Series -- True for every row of self.df whose values in
                self.unique_column_names occur more than once
                (all occurrences are marked)
        """
        return self.df.duplicated(subset=self.unique_column_names, keep=False)

    def write_new_dataframe_into_file_parser(self, df: pd.DataFrame,
                                             sheet_name: str):
        """
        Write a DataFrame into a sheet of the parsed file

        Arguments:
            df {pd.DataFrame} -- data to write
            sheet_name {str} -- name of the destination sheet

        A failure is reported with a printed hint instead of being raised.
        """
        try:
            self.file_parser.write_new_sheet_into_file(df,
                                                       sheet_name=sheet_name)
        except Exception:
            # Best-effort write. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt/SystemExit propagating.
            print("Cannot write the new sheet into the same file! \
                    \nHint: save the file in xlsx format")

    def update_preprocessed_sheet(self):
        """
        Save the preprocessed DataFrame into the "NuovoElenco" sheet and
        remind the user to rename the sheets manually afterwards
        """
        print("Salvataggio nel foglio 'NuovoElenco' dei contratti di rete\n" +
              "Ricordati di rinominare i fogli prima di consegnare il file")
        self.write_new_dataframe_into_file_parser(self.df,
                                                  sheet_name="NuovoElenco")

コード例 #13

0

ファイルを表示

Script per controllare il file fonte dei Contratti di Rete

Questo script è stato creato per poi consolidare il data provider

"""
# %% Setup
from file_parser import ParserXls
import pandas as pd
import numpy as np

# Path of the Contratti di Rete workbook inspected by this script.
FILE_PATH = r"../data/ContrattiRete/ContrattiRete_3Apr2020.xlsx"

# %% Open the file; 'contratti' is used as the abbreviation for "Contratti di rete"
try:
    contratti_parser = ParserXls(FILE_PATH)
except Exception as e:
    # Print a hint about the file path, then propagate the original error.
    print("Problemi con il percorso del file")
    raise e

# %% Read the file
contratti_df = contratti_parser.open_file(sheet_name='Elenco')
contratti_df.info()

# %% Count duplicates
# Columns whose combined values are checked for duplicates.
column_name_constraints = ['numero repertorio', 'numero atto', 'c.f.']

# keep=False marks every occurrence of a duplicated combination.
contratti_duplicates_filter = contratti_df \
                                .duplicated(subset = column_name_constraints,
                                            keep = False)
contratti_duplicates_df = contratti_df.loc[contratti_duplicates_filter]

コード例 #14

0

ファイルを表示

 def setUpClass(cls):
     """Create the shared logger and parser, then read the test workbook.

     ``sheet_name=None`` is forwarded unchanged to ``ParserXls.open_file``.
     """
     cls.logger = TestLogger("ParserXls")
     cls.parser = ParserXls(file_path=cls.root_dir + "test_file.xlsx")
     cls.file_fonte = cls.parser.open_file(sheet_name=None)

コード例 #15

0

ファイルを表示

import pandas as pd
import numpy as np
import os

# %% Change directory
# Move to the parent directory so the relative "data/..." paths resolve.
os.chdir("..")

# %% Open the matching table used to cross-reference the id with the CF
db = DatabaseConnector()
imprese_match = db.get_dataframe_from_table("SVC_Imprese_Match")

# %% Open the patents file
file_fonte_name = r"data/PATSTAT/2020_05_12_brevetti.xlsx"

file_fonte = ParserXls(file_fonte_name)
file_fonte_df = file_fonte.open_file(sheet_name=0)

# %% Join the data to obtain the fiscal code (CF) in the source file
# "idimpresa" is cast to int64 and renamed to "IDEsterno", the merge key.
file_fonte_df = file_fonte_df.astype({
    "idimpresa": "int64"
}).rename(columns={"idimpresa": "IDEsterno"})

# Left join: every source row is kept, matched or not.
merged_df = file_fonte_df.merge(imprese_match, on="IDEsterno", how="left")

# %% Save the file with the matched rows only
# Rows without a "CF" value (no match) are dropped before writing.
merged_df.dropna(subset=["CF"]).to_excel(
    r"data/PATSTAT/2020_05_12_brevetti_da_testare.xlsx", index=False)

# Remaining work: spot-check a sample of 10 patents

コード例 #16

0

ファイルを表示

import pandas as pd
from file_parser import ParserXls
from idb import DatabaseConnector
import os
# Move to the parent directory so the relative "data/..." paths resolve.
os.chdir("..")

# %% Parameters
# Location of the workbook and names of the sheets that are opened below.
BASE_DIR = r"data/FinanziamentiUE/2020/"
FILE_NAME = "FinanziamentiUE_08_05_2020.xlsx"
ORGANIZATION_SHEET = "organizations"
PROJECTS_SHEET = "projects"
# Table names; they are not used in the part of the script shown here.
ORGANIZATION_TBL_NAME = "DATA_FinanziamentiUE_Impresa"
PROJECTS_TBL_NAME = "DATA_FinanziamentiUE_Progetto"

# %% Loading
finanziamentiUE_parser = ParserXls(BASE_DIR + FILE_NAME)
# Bare expression: shows the sheet names when the cell is run interactively.
finanziamentiUE_parser.sheet_names

# %% Open organizations
finanziamentiUE_organizations_df = finanziamentiUE_parser.open_file(
    ORGANIZATION_SHEET)
finanziamentiUE_organizations_df.info()

# %% Open projects
finanziamentiUE_projects_df = finanziamentiUE_parser.open_file(PROJECTS_SHEET)
finanziamentiUE_projects_df.info()

# %% Open DB connection
db = DatabaseConnector()

# %% Open Organizazions tbl