def test_file_path_e2e(dataframe):
    """Write *dataframe* to a temp Avro file by path and read it back."""
    tmp = NamedTemporaryFile()
    pdx.to_avro(tmp.name, dataframe)
    roundtripped = pdx.read_avro(tmp.name)
    # Cast the datetime column back to nanosecond resolution so the dtype
    # matches the fixture before comparing.
    roundtripped['DateTime64'] = roundtripped['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(roundtripped, dataframe)
def test_buffer_e2e(dataframe):
    """Round-trip through an in-memory buffer: write to disk, read via BytesIO."""
    tmp = NamedTemporaryFile()
    pdx.to_avro(tmp.name, dataframe)
    with open(tmp.name, 'rb') as fh:
        payload = fh.read()
    result = pdx.read_avro(BytesIO(payload))
    # Normalise the datetime dtype so the frames compare equal.
    result['DateTime64'] = result['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(result, dataframe)
def main():
    """Build a small demo frame, save it to OUTPUT_PATH as Avro, read & print."""
    demo = pd.DataFrame({
        "Boolean": [True, False, True, False],
        "Float64": np.random.randn(4),
        "Int64": np.random.randint(0, 10, 4),
        "String": ['foo', 'bar', 'foo', 'bar'],
        "DateTime64": [pd.Timestamp('20190101'), pd.Timestamp('20190102'),
                       pd.Timestamp('20190103'), pd.Timestamp('20190104')],
    })
    pdx.to_avro(OUTPUT_PATH, demo)
    # Read the file straight back to show the round-trip worked.
    print(pdx.read_avro(OUTPUT_PATH))
def test_dataframe_kwargs(dataframe):
    """Exercise read_avro's keyword arguments: columns, exclude, index."""
    tmp = NamedTemporaryFile()
    pdx.to_avro(tmp.name, dataframe)

    def read_back(**kwargs):
        # Read with the given kwargs, normalising the datetime dtype when
        # the column survives the selection.
        out = pdx.read_avro(tmp.name, **kwargs)
        if 'DateTime64' in out.columns:
            out['DateTime64'] = out['DateTime64'].astype(
                np.dtype('datetime64[ns]'))
        return out

    # include columns
    keep = ['Boolean', 'Int64']
    assert_frame_equal(read_back(columns=keep), dataframe[keep])

    # exclude columns
    drop = ['String', 'Boolean']
    assert_frame_equal(read_back(exclude=drop), dataframe.drop(drop, axis=1))

    # specify index
    assert_frame_equal(read_back(index='String'), dataframe.set_index('String'))
def combine_files(json_filepath, arvo_filepath, csv_filepath, output_filepath):
    """Combines the three files, eliminates duplicates and it sorts the
    resulting dataset by City Name.

    Creates a csv file in output_filepath and returns a dataframe with
    its content.

    Args:
        json_filepath: path to the JSON input file.
        arvo_filepath: path to the Avro input file.
        csv_filepath: path to the CSV input file.
        output_filepath: path where the combined CSV is written.

    Returns:
        pd.DataFrame: the combined dataset as re-read from output_filepath.
    """
    # reading all the three files.
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported replacement and concatenates in one pass.
    df = pd.concat([
        pd.read_json(json_filepath),
        pd.read_csv(csv_filepath),
        pdx.read_avro(arvo_filepath),
    ])
    # dropping duplicates
    df = df.drop_duplicates()
    # sorting by Name
    df = df.sort_values(by='Name')
    # writing to csv
    df.to_csv(output_filepath)
    return pd.read_csv(output_filepath)
def converter_csv_to_avro(INPUT_PATH, OUTPUT_PATH, converter_to_datetime):
    """Read a CSV, convert the listed columns to datetime, and save as Avro.

    Args:
        INPUT_PATH: path to the source CSV file.
        OUTPUT_PATH: path where the Avro file is written.
        converter_to_datetime: iterable of column names to coerce to datetime.
    """
    df = pd.read_csv(INPUT_PATH)
    # Transform string columns to datetime
    for col in converter_to_datetime:
        df[col] = pd.to_datetime(df[col])
    print(df.info())
    pdx.to_avro(OUTPUT_PATH, df)  # Converter
    saved = pdx.read_avro(OUTPUT_PATH)  # Only read to control
    print(saved)
def _read_avro(tmpfile, remove_timezone_from_type=True, *args, **kwargs):
    """Reads a DataFrame from an Avro file.

    Args:
        tmpfile (tempfile.NamedTemporaryFile): Connection to the file to
            be read from.
        remove_timezone_from_type (bool): when True, tz-aware UTC datetime
            columns are converted to naive datetimes.

    Returns:
        pd.DataFrame: The DataFrame read from Avro.
    """
    # Pandavro reading from a tempfile if it hasn't been closed post-writing
    # raises a 'ValueError', so re-open the same path by name instead.
    # Will work on unix-like systems, but not Windows.
    with open(tmpfile.name, 'rb') as handle:
        frame = pdx.read_avro(handle, *args, **kwargs)
    if remove_timezone_from_type:
        tz_cols = frame.columns[frame.dtypes == 'datetime64[ns, UTC]']
        frame[tz_cols] = frame[tz_cols].apply(
            lambda col: col.dt.tz_convert(None))
    return frame
def test_file_path_e2e(dataframe):
    """Writing then reading the same path must reproduce the frame exactly."""
    tmp = NamedTemporaryFile()
    pdx.to_avro(tmp.name, dataframe)
    assert_frame_equal(pdx.read_avro(tmp.name), dataframe)
def test_buffer_e2e(dataframe):
    """Reading from an in-memory BytesIO buffer must match the original frame."""
    tmp = NamedTemporaryFile()
    pdx.to_avro(tmp.name, dataframe)
    with open(tmp.name, 'rb') as fh:
        buffer = BytesIO(fh.read())
    assert_frame_equal(pdx.read_avro(buffer), dataframe)
"Boolean": [True, False, True, False], "Float64": np.random.randn(4), "Int64": np.random.randint(0, 10, 4), "String": ['foo', 'bar', 'foo', 'bar'], "DateTime64": [ pd.Timestamp('20190101'), pd.Timestamp('20190102'), pd.Timestamp('20190103'), pd.Timestamp('20190104') ] }) pdx.to_avro("SampleAvro2.avro", df) saved = pdx.read_avro("SampleAvro.avro") print(saved) vv = pd.read_csv("Sample2.csv", header=0) bb = pd.read_csv("Sample.tsv", sep='\t', header=None) aa = pd.read_excel('Sample.xls') aa2 = pd.read_excel('Sample.xlsx') aa.columns = ['a', 'b', 'c', 'd', 'e', 'f'] ''' FileName = '/home/contactrkk_gmail/1D311A1E02824594/AllKindOfStuff/ML/Salaries.csv' FileType = FileName.split(".") FileType = FileType[len(FileType)-1].lower() observations = pd.read_csv('Salaries.csv') ''' from urllib.request import urlopen
# Training-data shards produced upstream; one Avro file per shard.
TEMP_FILES = os.listdir("TrainTemp/")
NUM_EPOCHS = 1
BATCH_SIZE = 1000000
# Hyper-parameters handed to the estimator's model_fn.
PARAMS = {
    "user_feats_size": 8222243,
    "item_feats_size": 343419,
    "embed_size": 150,
    "optimizer_type": "Adam",
    "learning_rate": 0.05,
    "l2_reg": 0.0001}

if __name__ == "__main__":
    # load data
    data = pd.concat([pdx.read_avro("TrainTemp/" + f) for f in TEMP_FILES])
    # Keep only the (user_index, item_indexs) pair as a 2-column ndarray.
    data = np.hstack((
        data["user_index"].values[:, None],
        data["item_indexs"].values[:, None]))
    # NOTE(review): _create_training_data, model_fn and MODEL_DIR are
    # defined elsewhere in this file — confirm before moving this block.
    _create_training_data(data)
    config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(
            device_count={"GPU": 1}, log_device_placement=True),
        log_step_count_steps=100,
        save_summary_steps=10,
        keep_checkpoint_max=3)
    BPR = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=MODEL_DIR, params=PARAMS, config=config)
def read(self, key: str) -> pd.DataFrame:
    """Load the Avro file at *key* into a DataFrame."""
    return pandavro.read_avro(key)
import os
import pandavro

# Load the example Avro file and print its contents.
df = pandavro.read_avro("./data/csv/example.avro")
print(df)
def LoadData(self, FileName, HeaderMissing="No"):
    # Load FileName into a pandas DataFrame, dispatching on its extension.
    # Supports excel,csv,tsv,xml,json,orc,parquet,avro.
    # HeaderMissing="Yes" means the file has no header row, so the tabular
    # readers are called with header=None.
    import pandas as pd
    FileType = FileName.split(".")
    FileType = FileType[len(FileType) - 1].lower()  # lower-cased extension
    if FileType == 'xls':
        if HeaderMissing == "Yes":
            return pd.read_excel(FileName, header=None)
        else:
            return pd.read_excel(FileName)
    if FileType == 'xlsx':
        if HeaderMissing == "Yes":
            return pd.read_excel(FileName, header=None)
        else:
            return pd.read_excel(FileName)
    if FileType == 'csv':
        if HeaderMissing == "Yes":
            return pd.read_csv(FileName, header=None)
        else:
            return pd.read_csv(FileName)
    if FileType == 'tsv':
        if HeaderMissing == "Yes":
            return pd.read_csv(FileName, header=None, sep='\t')
        else:
            return pd.read_csv(FileName, sep='\t')
    if FileType == 'orc':
        # Columnar formats go through pyarrow and are converted to pandas.
        import pyarrow.orc as orc
        return orc.ORCFile(FileName).read().to_pandas()
    if FileType == 'parquet':
        import pyarrow.parquet as parquet
        return parquet.ParquetFile(FileName).read().to_pandas()
    if FileType == 'avro':
        import pandavro as pdx
        return pdx.read_avro(FileName)
    if FileType == 'json':
        import json
        from flatten_json import flatten
        # NOTE(review): pandas.io.json.json_normalize was deprecated in
        # pandas 1.0 in favour of pandas.json_normalize — confirm the
        # pandas version pinned for this project.
        from pandas.io.json import json_normalize
        with open(FileName) as RequiredFile:
            # NOTE(review): this rebinds the name `json` from the module to
            # the parsed document, shadowing the import from here on.
            json = json.load(RequiredFile)
        if isinstance(json, dict):
            if (len(json) > 1):
                # Multi-key object: flatten nested keys, then normalise.
                DataFrame = json_normalize(flatten(json))
            else:
                # Single-key wrapper object: normalise its sole value.
                DataFrame = json_normalize(list(json.values())[0])
        else:
            # Top-level list: flatten each element into one row.
            FlattenedData = (flatten(_json) for _json in json)
            DataFrame = pd.DataFrame(FlattenedData)
        return DataFrame
    if FileType == 'xml':
        import xml.etree.ElementTree as et
        RootElement = et.parse(FileName).getroot()
        RootElementTag = RootElement.tag
        # Column names are composites: "___" separates an element tag from
        # an attribute name, "__" separates a parent tag from a child tag,
        # so they can be split back apart when rows are built below.
        RootElementAttributes = []
        for Item in RootElement.keys():
            if "__" + RootElementTag + "___" + Item not in RootElementAttributes:
                RootElementAttributes.append(
                    "__" + RootElementTag + "___" + Item)
        CoreElement = []
        CoreElementAttributes = []
        CoreNodes = []
        CoreNodesAttributes = []
        FinalColumns = []
        # First pass: discover every child tag, attribute and grandchild
        # node so the full column set is known before any row is built.
        for CE in RootElement:
            if CE.tag not in CoreElement:
                CoreElement.append(CE.tag)
            for Item in CE.keys():
                if CE.tag + "___" + Item not in CoreElementAttributes:
                    CoreElementAttributes.append(CE.tag + "___" + Item)
            for Item in list(CE):
                if CE.tag + "__" + Item.tag not in CoreNodes:
                    CoreNodes.append(CE.tag + "__" + Item.tag)
                for Item_ in Item.keys():
                    if CE.tag + "__" + Item.tag + "___" + Item_ not in CoreNodesAttributes:
                        CoreNodesAttributes.append(
                            CE.tag + "__" + Item.tag + "___" + Item_)
        RootElementAttributes = sorted(RootElementAttributes)
        CoreElement = sorted(CoreElement)
        CoreElementAttributes = sorted(CoreElementAttributes)
        CoreNodes = sorted(CoreNodes)
        CoreNodesAttributes = sorted(CoreNodesAttributes)
        FinalColumns = FinalColumns + RootElementAttributes + CoreElementAttributes + CoreNodes + CoreNodesAttributes
        FinalColumns = sorted(FinalColumns)
        DataFrame = pd.DataFrame(columns=FinalColumns)
        # Second pass: one row per direct child of the root element.
        for CE in RootElement:
            DataRow = []
            for Item in RootElementAttributes:
                DataRow.append(RootElement.attrib.get(
                    Item.split("___")[1]))
            for Item in CoreElementAttributes:
                DataRow.append(CE.attrib.get(Item.split("___")[1]))
            for Item in CoreNodes:
                if CE is not None and CE.find(
                        Item.split("__")[1]) is not None:
                    DataRow.append(CE.find(Item.split("__")[1]).text)
                else:
                    DataRow.append(None)
                CoreNodesAttributesFiltered = [
                    Value for Value in CoreNodesAttributes
                    if Value.split("___")[0] == Item
                ]
                for CNAF in CoreNodesAttributesFiltered:
                    DataRow.append(
                        CE.find(Item.split("__")[1]).attrib.get(
                            CNAF.split("___")[1]))
                #print(CE.find(Item.split("__")[1]).attrib)
                #print("**********")
                #print(CoreNodesAttributesFiltered)
                #print("----------------")
            #print(DataRow)
            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # pd.concat is the modern replacement — confirm pandas pin.
            DataFrame = DataFrame.append(pd.Series(DataRow,
                                                   index=FinalColumns),
                                         ignore_index=True)
        return DataFrame