def _process_data(cls, self):
    """Convert from xml, clean, and process."""
    df = pd.read_xml(self.get.content, xpath='.//item')
    col_list = []
    for col in df.columns:
        if '}' in str(col):
            # Strip the XML namespace prefix, e.g. '{ns}HaltTime' -> 'HaltTime'
            col_list.append(col.split('}')[1])
        else:
            col_list.append(col)
    df.columns = col_list
    df.drop(columns=['description', 'PauseThresholdPrice'], inplace=True)
    self.df = df

    if self.path.exists():
        df_prev = pd.read_parquet(self.path)
        subset = ['HaltTime', 'IssueSymbol']
        df_all = (pd.concat([df_prev, df])
                  .reset_index(drop=True)
                  .drop_duplicates(subset=subset))
        write_to_parquet(df_all, self.path)
    else:
        write_to_parquet(df, self.path)
def __init__(self): """Create new String Graph Repository object.""" super().__init__() # We load the data that cannot be automatically scraped self._data = compress_json.local_load("monarch_initiative.json") # The arguments keys used to load this graph general_kwargs = { "sources_column": "subject", "destinations_column": "object", "edge_list_edge_types_column": "predicate", "nodes_column": "id", "node_list_node_types_column": "category", "node_types_separator": "|", "name": "Monarch" } # We extend the data through scraping the Google Bucket base_url = "https://storage.googleapis.com/monarch-ingest/" xml = pd.read_xml(base_url).fillna("NaN") xml = xml[xml.Key.str.endswith("/monarch-kg.tar.gz")] for path in xml.Key: version = path.split("/")[0] self._data["Monarch"][version] = { "urls": [base_url + path], "arguments": { "edge_path": "monarch-kg/monarch-kg_edges.tsv", "node_path": "monarch-kg/monarch-kg_nodes.tsv", **general_kwargs } }
def read_xml(
    path_or_buffer,
    xpath="./*",
    namespaces=None,
    elems_only=False,
    attrs_only=False,
    names=None,
    encoding="utf-8",
    parser="lxml",
    stylesheet=None,
    compression="infer",
    storage_options=None,
) -> DataFrame:
    ErrorMessage.default_to_pandas("read_xml")
    Engine.subscribe(_update_engine)
    return DataFrame(
        pandas.read_xml(
            path_or_buffer,
            xpath=xpath,
            namespaces=namespaces,
            elems_only=elems_only,
            attrs_only=attrs_only,
            names=names,
            encoding=encoding,
            parser=parser,
            stylesheet=stylesheet,
            compression=compression,
            storage_options=storage_options,
        )
    )
def __init__(self, name, layout):
    self.name = name
    self.name_snake_case = to_snake_case(name)
    self.fields = ParamLayout.dedup_fields(
        ParamLayout.group_bitfields([
            Field(i) for i in pd.read_xml(layout, xpath='./Fields/*')['Def']
        ]))
def get_rss_feed(cls, self):
    """Request (and retry once) to get RSS data from the SEC."""
    get = requests.get(self.url, headers=self.headers)
    if get.status_code >= 400:
        get = requests.get(self.url, headers=self.headers)
        if get.status_code >= 400:
            help_print_arg('SEC RSS Feed: 2nd get request failed')
    self.df = pd.read_xml(get.content, xpath='.//item')
def test_parse_diagnostics():
    topics: pd.DataFrame = pd.read_xml(DIAGNOSTICS_XML, xpath=".//topic")
    words: pd.DataFrame = pd.read_xml(DIAGNOSTICS_XML, xpath=".//word")
    # Previous untangle-based parsing, kept for reference:
    # diags: untangle.Element = untangle.parse(DIAGNOSTICS_XML)
    # topics: pd.DataFrame = pd.DataFrame([t.attributes for t in diags.model.topic]).set_index('id')
    # words: pd.DataFrame = pd.DataFrame(
    #     [
    #         {
    #             **{'topic_id': t['id']},
    #             **w.attributes,
    #         }
    #         for t in diags.model.topic
    #         for w in t.word
    #     ]
    # )
    assert words is not None
    assert topics is not None
def _read_and_load_descriptors(engine: engine.base.Engine, descriptor_type: str) -> None:
    descriptor = f"{descriptor_type}Descriptor"
    file_path = os.path.join("..", "..", "extension", "Descriptors", f"{descriptor}.xml")
    df = pd.read_xml(file_path)  # type: ignore
    with engine.connect() as connection:
        for _, row in df.iterrows():
            sql = _prepare_descriptor_sql(row, SCHEMA_LMSX, descriptor)
            connection.execute(text(sql))
def opener(path, ext):
    if ext == 'csv':
        df = pd.read_csv(path)
    elif ext == "json":
        df = pd.read_json(path)
    elif ext == "html":
        # read_html returns a list of DataFrames; keep the first table
        df = pd.read_html(path)[0]
    elif ext == "xlsx":
        df = pd.read_excel(path)
    elif ext == "xml":
        df = pd.read_xml(path)
    else:
        raise ValueError(f"Unsupported extension: {ext}")
    return df
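# Usage sketch for opener(), assuming 'records.csv' and 'records.xml' are real,
# parseable files (hypothetical names, not part of the original snippet):
csv_df = opener("records.csv", "csv")
xml_df = opener("records.xml", "xml")
print(csv_df.shape, xml_df.shape)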
def read_xml(cls, *args, **kwargs) -> __qualname__:
    try:
        df = pd.read_xml(*args, **kwargs)
    except pd.errors.EmptyDataError:
        # TODO: Figure out what EmptyDataError means here
        return cls.new_df()
    # see to_xml for why these fixes are needed
    if "__xml_is_empty_" in df.reset_index().columns:
        # TODO: Is returning an empty frame ok here?
        return cls.new_df()
    elif "__xml_index_" in df.columns:
        df = df.drop(columns=["__xml_index_"])
    return cls._convert_typed(df)
def live_boas(request):
    """Generates a list of Bid-Offer Acceptances from the live BMRS datafeed."""
    boa_feed = requests.get(LIVE_BOA_URL.format(ELEXON_KEY))
    current_boas = pd.read_xml(boa_feed.text, xpath=".//item")

    response = HttpResponse(
        content_type='text/csv',
        # headers={'Content-Disposition': 'attachment; filename="regional_generation_bytype.csv"'},
    )
    writer = csv.writer(response, quoting=csv.QUOTE_NONE)
    writer.writerow(['bmu_id', 'bmu_type', 'intensity'])
    for key, current_BOA in current_boas.to_dict(orient='index').items():
        bmu = BMU.objects.get(id=current_BOA['bmuName'])
        writer.writerow([current_BOA['bmuName'], bmu.ft, 0])
    return response
def load_topic_diagnostics(self) -> pd.DataFrame:
    """Loads the MALLET topic diagnostics file into a dataframe.

    See: https://mallet.cs.umass.edu/diagnostics.php
    """
    try:
        topics: pd.DataFrame = (
            pd.read_xml(self.diagnostics_filename(), xpath=".//topic")
            .rename(
                columns={
                    'id': 'topic_id',
                    'tokens': 'n_tokens',
                }
            )
            .set_index('topic_id')
        )
        return topics
    except Exception as ex:
        logger.error(f"load_topic_diagnostics: {ex}")
        return None
def find_xmls(path):
    """Parse every .xml file in the given directory into a DataFrame."""
    fis = [p for p in Path(path).iterdir() if p.suffix == ".xml"]
    return [pd.read_xml(f) for f in fis]
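# Usage sketch for find_xmls() (hypothetical directory name; assumes it exists and
# contains well-formed .xml files that pandas can parse):
frames = find_xmls("data/xml_exports")
print(f"parsed {len(frames)} files")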
def fileGathering():
    dataDict = {}
    while True:
        fileType = input(
            "What's your dataset's file type? .xml/.csv (other types could be requested for development): "
        )
        fileRoute = input(
            "File name or path (if the file sits directly in the DataToGO folder, the name alone is enough; include the extension): "
        )
        tag = input(
            "Give the data a tag. The tag will help you find the right data later: "
        )
        if fileType == ".csv":
            dataDict[tag] = pd.read_csv(fileRoute, thousands=",")
        elif fileType == ".xml":
            dataDict[tag] = pd.read_xml(fileRoute, thousands=",")
        elif fileType == ".txt":
            handle = open(fileRoute)
            data = ""
            for line in handle:
                data += line
            dataDict[tag] = data
        command = input("More files? (y/n): ")
        if command == "y":
            continue
        elif command == "n":
            break
        else:
            print("Wrong input. Treated as 'no'.")
            break
    print("Data import complete. Here's the summary.")
    print("Number of datasets loaded: %d" % len(dataDict))
    print("Data tags:")
    for tag in dataDict.keys():
        print(tag)
    return dataDict
import requests
import pandas as pd

url = 'https://www.sec.gov/cgi-bin/browse-edgar?company=&CIK=&type=4&owner=include&count=100&action=getcurrent'
headers = {
    'User-Agent': 'Rogue Technology Ventures [email protected]',
    'Referer': 'https://www.sec.gov/structureddata/rss-feeds-submitted-filings',
    'Host': 'www.sec.gov',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control': 'no-cache',
    'Accept-Language': 'en-GB,en;q=0.5'
}
get = requests.get(url, headers=headers)

url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=4&company=&dateb=&owner=include&start=0&count=100&output=atom'

# Skip the feed header rows, then split each entry title into form / filer details
df = pd.read_xml(get.content).loc[6:]
df = pd.concat([df, df['title'].str.split('-', expand=True)], axis=1)
df.rename(columns={0: 'form', 1: 'pre_split'}, inplace=True)
df_test = pd.concat([df, df['pre_split'].str.split(r'[()]', n=3, expand=True)], axis=1)

cols_to_drop = ['rel', 'href', 'name', 'email', 'link', 'category', 'pre_split', 2]
df_test.drop(columns=cols_to_drop, inplace=True)

cols_to_rename = {0: 'person/entity', 1: 'p/e/#', 3: 'p/e/desc'}
df_test.rename(columns=cols_to_rename, inplace=True)

cols_to_rename = {1: 'date', 3: 'accNo'}
def read_xml(**kwargs) -> pd.DataFrame:
    return pd.read_xml(**kwargs)
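# Minimal sketch of calling the keyword-only wrapper above (the XML string and
# element names are hypothetical, not from the original snippet):
import io
import pandas as pd

sample = io.StringIO("<rows><row><a>1</a><b>x</b></row><row><a>2</a><b>y</b></row></rows>")
df = read_xml(path_or_buffer=sample, xpath=".//row")
print(df)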
from bs4 import BeautifulSoup
import pandas as pd

directory = r"D:\OneDrive - IESE Business School\Documentos\Amazon Project\Amazon Project - Data\Patents and trademarks\asb19550103-20201231-01.xml"

# Read the data inside the xml file into a DataFrame
data = pd.read_xml(directory)

# Earlier BeautifulSoup-based approach, kept for reference:
# with open(directory, 'r') as f:
#     data = f.read()
# # Pass the stored data to the BeautifulSoup parser and keep the returned object
# Bs_data = BeautifulSoup(data, "xml")
# # Find all instances of tag `unique`
# b_unique = Bs_data.find_all('unique')
# # print(b_unique)
# # Use find() to extract attributes of the first instance of the tag
# b_name = Bs_data.find('child', {'name': 'Frank'})
# # print(b_name)
import sys
import glob
import os
import pandas as pd
import numpy as np
import math
import serial.tools.list_ports
from datetime import datetime

matrix_sheet = pd.read_xml('config.nth', xpath='//CapacitorData/CapacitorDataClass')
matrix_sheet['CoilTurns'] = [
    9, 9, 9, 9, 9, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18
]
# Hard-coded fallback matrix, kept for reference:
# matrix_sheet = pd.DataFrame(
#     np.array([
#         [9, 200, 161.3, 23.5, 24.6, 23],
#         [9, 88, 242, 26, 21.8, 20],
#         [9, 26, 405.45, 31.2, 20.5, 16],
#         [9, 15, 572.6, 31.8, 17.3, 14],
#         [9, 6.2, 924.6, 31.4, 13.4, 12],
#         [17, 200, 105.2, 26.77, 17, 25],
#         [17, 88, 158.12, 26.25, 13.55, 17],
#         [17, 26, 264.8, 36, 14.1, 17],
#         [17, 15, 373.85, 40.4, 13.5, 16],
#         [17, 6.2, 602.5, 43.4, 12, 12],
#         [18, 200, 161.5, 36.2, 28, 46],
#         [18, 88, 242.3, 40.2, 23.85, 39],
#         [18, 26, 406, 40.9, 20, 32],
#         [18, 15, 573.5, 46.8, 19, 29],