Example #1
    def _process_data(cls, self):
        """Convert from xml, clean, and process."""
        df = pd.read_xml(self.get.content, xpath='.//item')

        col_list = []
        for col in df.columns:
            if '}' in str(col):
                # print(col.split('}')[1])
                col_list.append(col.split('}')[1])
            else:
                col_list.append(col)

        df.columns = col_list
        df.drop(columns=['description', 'PauseThresholdPrice'], inplace=True)

        self.df = df

        if self.path.exists():
            df_prev = pd.read_parquet(self.path)
            subset = ['HaltTime', 'IssueSymbol']
            df_all = (pd.concat(
                [df_prev,
                 df]).reset_index(drop=True).drop_duplicates(subset=subset))
            write_to_parquet(df_all, self.path)
        else:
            write_to_parquet(df, self.path)
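The namespace-stripping loop above can also be written as a single list comprehension over the columns; a minimal sketch, assuming df is the frame returned by pd.read_xml in this example:

# equivalent cleanup: keep only the local tag name after the closing namespace brace
df.columns = [str(col).split('}')[-1] for col in df.columns]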
Example #2
 def __init__(self):
     """Create new String Graph Repository object."""
     super().__init__()
     # We load the data that cannot be automatically scraped
     self._data = compress_json.local_load("monarch_initiative.json")
     # The arguments keys used to load this graph
     general_kwargs = {
         "sources_column": "subject",
         "destinations_column": "object",
         "edge_list_edge_types_column": "predicate",
         "nodes_column": "id",
         "node_list_node_types_column": "category",
         "node_types_separator": "|",
         "name": "Monarch"
     }
     # We extend the data through scraping the Google Bucket
     base_url = "https://storage.googleapis.com/monarch-ingest/"
     xml = pd.read_xml(base_url).fillna("NaN")
     xml = xml[xml.Key.str.endswith("/monarch-kg.tar.gz")]
     for path in xml.Key:
         version = path.split("/")[0]
         self._data["Monarch"][version] = {
             "urls": [base_url + path],
             "arguments": {
                 "edge_path": "monarch-kg/monarch-kg_edges.tsv",
                 "node_path": "monarch-kg/monarch-kg_nodes.tsv",
                 **general_kwargs
             }
         }
Example #3
def read_xml(
    path_or_buffer,
    xpath="./*",
    namespaces=None,
    elems_only=False,
    attrs_only=False,
    names=None,
    encoding="utf-8",
    parser="lxml",
    stylesheet=None,
    compression="infer",
    storage_options=None,
) -> DataFrame:
    ErrorMessage.default_to_pandas("read_xml")
    Engine.subscribe(_update_engine)
    return DataFrame(
        pandas.read_xml(
            path_or_buffer,
            xpath=xpath,
            namespaces=namespaces,
            elems_only=elems_only,
            attrs_only=attrs_only,
            names=names,
            encoding=encoding,
            parser=parser,
            stylesheet=stylesheet,
            compression=compression,
            storage_options=storage_options,
        )
    )
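A hedged usage sketch for this wrapper (the file name inventory.xml and the xpath are hypothetical); the keyword arguments mirror pandas.read_xml, so a call looks the same as it would against pandas:

df = read_xml("inventory.xml", xpath=".//item", parser="lxml")
print(df.head())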
Example #4
 def __init__(self, name, layout):
     self.name = name
     self.name_snake_case = to_snake_case(name)
     self.fields = ParamLayout.dedup_fields(
         ParamLayout.group_bitfields([
             Field(i)
             for i in pd.read_xml(layout, xpath='./Fields/*')['Def']
         ]))
Example #5
    def get_rss_feed(cls, self):
        """Request and retry to get data from sec."""
        get = requests.get(self.url, headers=self.headers)
        if get.status_code >= 400:
            get = requests.get(self.url, headers=self.headers)
            if get.status_code >= 400:
                help_print_arg('SEC RSS Feed: 2nd get request failed')

        self.df = pd.read_xml(get.content, xpath='.//item')
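The two-attempt pattern above can be folded into a small retry helper; a minimal sketch, assuming the same headers dict and passing resp.content to pd.read_xml exactly as the snippet does (fetch_rss_frame and max_tries are illustrative names, not part of the original code):

import requests
import pandas as pd

def fetch_rss_frame(url, headers, max_tries=2):
    # retry the GET a fixed number of times before giving up
    for _ in range(max_tries):
        resp = requests.get(url, headers=headers)
        if resp.status_code < 400:
            # parse the feed items the same way the snippet above does
            return pd.read_xml(resp.content, xpath='.//item')
    raise RuntimeError(f'RSS request failed with status {resp.status_code}')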
Example #6
def test_parse_diagnostics():

    topics: pd.DataFrame = pd.read_xml(DIAGNOSTICS_XML, xpath=".//topic")
    words: pd.DataFrame = pd.read_xml(DIAGNOSTICS_XML, xpath=".//word")

    # diags: untangle.Element = untangle.parse(DIAGNOSTICS_XML)
    # topics: pd.DataFrame = pd.DataFrame([t.attributes for t in diags.model.topic]).set_index('id')
    # words: pd.DataFrame = pd.DataFrame(
    #     [
    #         {
    #             **{'topic_id': t['id']},
    #             **w.attributes,
    #         }
    #         for t in diags.model.topic
    #         for w in t.word
    #     ]
    # )

    assert words is not None
    assert topics is not None
Example #7
def _read_and_load_descriptors(engine: engine.base.Engine,
                               descriptor_type: str) -> None:
    descriptor = f"{descriptor_type}Descriptor"

    file_path = os.path.join("..", "..", "extension", "Descriptors",
                             f"{descriptor}.xml")
    df = pd.read_xml(file_path)  # type: ignore

    with engine.connect() as connection:
        for _, row in df.iterrows():
            sql = _prepare_descriptor_sql(row, SCHEMA_LMSX, descriptor)
            connection.execute(text(sql))
Example #8
def opener(path, ext):
    if ext == 'csv':
        df = pd.read_csv(path)
    elif ext == "json":
        df = pd.read_json(path)
    elif ext == "html":
        df = pd.read_html(path)[0]  # read_html returns a list of tables; keep the first
    elif ext == "xlsx":
        df = pd.read_excel(path)
    elif ext == "xml":
        df = pd.read_xml(path)

    return df
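A table-driven variant of the same dispatch; a sketch, assuming pandas is imported as pd, with open_table as an illustrative name (read_html is left out because it returns a list of DataFrames rather than a single frame):

import pandas as pd

READERS = {
    'csv': pd.read_csv,
    'json': pd.read_json,
    'xlsx': pd.read_excel,
    'xml': pd.read_xml,
}

def open_table(path, ext):
    # look up the reader for this extension; fail loudly on unknown types
    reader = READERS.get(ext)
    if reader is None:
        raise ValueError(f'unsupported extension: {ext}')
    return reader(path)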
Example #9
 def read_xml(cls, *args, **kwargs) -> __qualname__:
     try:
         df = pd.read_xml(*args, **kwargs)
     except pd.errors.EmptyDataError:
         # TODO: Figure out what EmptyDataError means
         # df = pd.DataFrame()
         return cls.new_df()
     # see to_xml for why these fixes are needed
     if "__xml_is_empty_" in df.reset_index().columns:
         # TODO: This ok?
         # df = pd.DataFrame()
         return cls.new_df()
     elif "__xml_index_" in df.columns:
         df = df.drop(columns={"__xml_index_"})
     return cls._convert_typed(df)
Example #10
def live_boas(request):
    """
    generates list of Bid-Offer Acceptances from live BMRS datafeed
    """
    boa_feed = requests.get(LIVE_BOA_URL.format(ELEXON_KEY))
    current_boas = pd.read_xml(boa_feed.text, xpath=".//item")

    response = HttpResponse(
        content_type='text/csv',
        # headers={'Content-Disposition': 'attachment; filename="regional_generation_bytype.csv"'},
    )
    writer = csv.writer(response, quoting=csv.QUOTE_NONE)
    writer.writerow(['bmu_id', 'bmu_type', 'intensity'])
    for key, current_BOA in current_boas.to_dict(orient='index').items():
        bmu = BMU.objects.get(id=current_BOA['bmuName'])
        writer.writerow([current_BOA['bmuName'], bmu.ft, 0])
    return response
Example #11
    def load_topic_diagnostics(self) -> pd.DataFrame:
        """Loads MALLET topic diagnostics item into dataframe
        See: https://mallet.cs.umass.edu/diagnostics.php
        """
        try:

            topics: pd.DataFrame = (
                pd.read_xml(self.diagnostics_filename(), xpath=".//topic")
                .rename(
                    columns={
                        'id': 'topic_id',
                        'tokens': 'n_tokens',
                    }
                )
                .set_index('topic_id')
            )

            return topics

        except Exception as ex:
            logger.error(f"load_topic_diagnostics: {ex}")
            return None
Example #12
def find_xmls(path):
    fis = [p for p in Path(path).iterdir() if p.suffix == ".xml"]
    return [pd.read_xml(f) for f in fis]
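The list returned by find_xmls can be combined into one frame when the files share a layout; a minimal usage sketch (the directory name is hypothetical):

frames = find_xmls('xml_dumps/')
combined = pd.concat(frames, ignore_index=True)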
Example #13
def fileGathering():

    dataDict = {}

    while True:

        fileType = input(
            "What's your dataset's file type? .xml/.csv(other types could be requested for development) \n"
            "분석하고자 하는 데이터가 담긴 파일이 어떤 타입입니까? (파일 이름 뒤 점과 알파벳 3글자) .xml/.csv(다른 타입은 개발중입니다): "
        )

        fileRoute = input(
            "File Name or Route: \n파일 경로 및 파일 이름을 적어주세요. DataToGO 퐅더 안에 있고 다른 폴더 안에 안 들어가 있다면 이름만 입력하시면 됩니다. 이름은 파일 타입까지 적어주세요: "
        )

        tag = input(
            "Give the data a tag. The tag will help you find the right data: \n이 데이터에 태그를 붙여주세요. 태그를 알고 있으면 쉽게 데이터를 찾을 수 있습니다: "
        )

        if fileType == ".csv":

            dataDict[tag] = pd.read_csv(fileRoute, thousands=",")

        elif fileType == ".xml":

            dataDict[tag] = pd.read_xml(fileRoute, thousands=",")

        elif fileType == ".txt":

            handle = open(fileRoute)

            data = ""

            for line in handle:

                data += line

        command = input("More Files? (y/n):\n더 파일을 불러와야 합니까? (예:y/아니요:n): ")

        if command == "y":

            continue

        elif command == "n":

            break

        else:

            print("Wrong input. Considered as No.\n잘못 입력하셨습니다. 종료로 간주합니다.")

            break

    print(
        "Data import complete. Here's the Summary. \n데이터 불러오기 성공. 불러오기 요약입니다.")
    print("불러온 데이터 개수: %d" % len(dataDict))
    print("데이터 태그:")

    for tag in dataDict.keys():

        print(tag)

    return dataDict
Example #14
import requests
import pandas as pd

url = 'https://www.sec.gov/cgi-bin/browse-edgar?company=&CIK=&type=4&owner=include&count=100&action=getcurrent'
headers = ({
    'User-Agent': 'Rogue Technology Ventures [email protected]',
    'Referer':
    'https://www.sec.gov/structureddata/rss-feeds-submitted-filings',
    'Host': 'www.sec.gov',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control': 'no-cache',
    'Accept-Language': 'en-GB,en;q=0.5'
})
get = requests.get(url, headers=headers)

url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=4&company=&dateb=&owner=include&start=0&count=100&output=atom'
get = requests.get(url, headers=headers)  # fetch the Atom feed before parsing it
df = pd.read_xml(get.content).loc[6:]

df = pd.concat([df, df['title'].str.split('-', expand=True)], axis=1)
df.rename(columns={0: 'form', 1: 'pre_split'}, inplace=True)
df_test = pd.concat([df, df['pre_split'].str.split(r'[()]', n=3, expand=True)],
                    axis=1)
cols_to_drop = [
    'rel', 'href', 'name', 'email', 'link', 'category', 'pre_split', 2
]
df_test.drop(columns=cols_to_drop, inplace=True)

cols_to_rename = {0: 'person/entity', 1: 'p/e/#', 3: 'p/e/desc'}
df_test.rename(columns=cols_to_rename, inplace=True)

cols_to_rename = {1: 'date', 3: 'accNo'}
Example #15
 def read_xml(**kwargs) -> pd.DataFrame:
     return pd.read_xml(**kwargs)
Example #16
from bs4 import BeautifulSoup
import pandas as pd

directory = r"D:\OneDrive - IESE Business School\Documentos\Amazon Project\Amazon Project - Data\Patents and trademarks\asb19550103-20201231-01.xml"
data = pd.read_xml(directory)


# Reading the data inside the xml
# file to a variable under the name
# data
# directory = r"D:\OneDrive - IESE Business School\Documentos\Amazon Project\Amazon Project - Data\Patents and trademarks\asb19550103-20201231-01.xml"
# with open(directory, 'r') as f:
#     data = f.read()

# # Passing the stored data inside
# # the beautifulsoup parser, storing
# # the returned object
# Bs_data = BeautifulSoup(data, "xml")
#
# # Finding all instances of tag
# # `unique`
# b_unique = Bs_data.find_all('unique')
#
# print(b_unique)
#
# # Using find() to extract attributes
# # of the first instance of the tag
# b_name = Bs_data.find('child', {'name': 'Frank'})
#
# print(b_name)
#
Example #17
import sys
import glob
import os
import pandas as pd
import numpy as np
import math
import serial.tools.list_ports
from datetime import datetime

matrix_sheet = pd.read_xml('config.nth',
                           xpath='//CapacitorData/CapacitorDataClass')
matrix_sheet['CoilTurns'] = [
    9, 9, 9, 9, 9, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18
]

# matrix_sheet = pd.DataFrame(
#     np.array([
#         [9, 200, 161.3,     23.5,   24.6,   23],
#         [9, 88,  242,       26,     21.8,   20],
#         [9, 26,  405.45,    31.2,   20.5,   16],
#         [9, 15,  572.6,     31.8,   17.3,   14],
#         [9, 6.2, 924.6,     31.4,   13.4,   12],
#         [17, 200, 105.2,     26.77,  17,     25],
#         [17, 88,  158.12,    26.25,  13.55,  17],
#         [17, 26,  264.8,     36,     14.1,   17],
#         [17, 15,  373.85,    40.4,   13.5,   16],
#         [17, 6.2, 602.5,     43.4,   12,     12],
#         [18, 200, 161.5,     36.2,   28,     46],
#         [18, 88,  242.3,     40.2,   23.85,  39],
#         [18, 26, 406,       40.9,   20,     32],
#         [18, 15,  573.5,     46.8,   19,     29],