Example no. 1
def read_monthly(infile, dd):
    """
    Parameters
    ----------

    infile: str
    dd: DataFrame

    Returns
    -------
    df: DataFrame

    """
    widths = dd.length.tolist()
    logger.info("Reading monthly {}".format(infile))

    if isinstance(infile, StringIO):
        df = pd.read_fwf(infile, widths=widths, names=dd.id.values)
    elif infile.endswith('.zip'):
        archive = zipfile.ZipFile(infile)
        filename = archive.namelist()[0]
        parent_dir = str(Path(infile).parent)
        colspec = pd.concat([dd.start - 1, dd.end], axis=1)
        colspec = colspec.values.tolist()

        with ensure_cleanup_zip(archive, filename, parent_dir):
            df = pd.read_fwf(os.path.join(parent_dir, filename),
                             colspecs=colspec, names=dd.id.values)
    # TODO: Fix stripping of 0s
    return df
Example no. 2
def load_names():
    #Last names
    last_names = pd.read_fwf('Names/dist.all.last', header=None, widths=[14,7,7,7])
    first_male = pd.read_fwf('Names/dist.male.first', header=None, widths=[14,7,7,7])
    first_female = pd.read_fwf('Names/dist.female.first', header=None, widths=[14,7,7,7])
    subset_last_name = last_names[last_names[2]<=70]
    subset_first_male = first_male[first_male[2]<=80]
    subset_first_female = first_female[first_female[2]<=80]
    names = pd.concat([subset_last_name[0], subset_first_male[0], subset_first_female[0]], ignore_index=True)
    return names
Example no. 3
def parsing(filename, T_set='False',form='wtreg'):
    """filename must be in path/TxYYMMDD format. Returns Pandas dataframe 

    The log file will be run through a checker to make sure that there are 
    no bad lines.

    Thresholds will be converted from hex format to dBm
    
    If T_set is set to 'True' only the thresholds, latitudes, longitudes and 
    altitudes will be returned with the station identifier as a suffix, 
    otherwise the entire log file will be parsed.
    """
    check_log(filename,form)
    if os.path.isfile(filename):
        dateparse = lambda x: pd.to_datetime(x, format='%m/%d/%y %H:%M:%S')
        namelist = ['ID','Datetime','Version','Threshold','?',
                                   'Triggers','GPS_Number','GPS_Mode','Temp',
                                   'Lat','Lon','Alt']
        if form=='wtreg':
            widths_list = [1,18,4,5,12,7,3,3,3,9,10,8]
            collist = [1,3,9,10,11]
        if form=='old7':
            widths_list = [1,18,4,5,7,7,3,3,3,9,10,8]
            collist = [1,3,9,10,11]
        if form=='newok':
            widths_list = [1,18,4,5,12,7,3,3,4,4,9,10,8]
            collist = [1,3,10,11,12]
            namelist = ['ID','Datetime','Version','Threshold','???',
                       'Triggers','GPS_Number','GPS_Mode','Temp','Batt',
                       'Lat','Lon','Alt']
        if T_set=='True':
            df = pd.read_fwf(filename, 
                            widths=widths_list,
                            names=namelist,
                            usecols=collist,
                            parse_dates = [0],
                            date_parser = dateparse,
                            na_values='\n')
            station=filename[-7]
            df['Threshold'] = df['Threshold'].apply(hex2page)
            df=df.rename(columns = {'Threshold':'Threshold_%s'%station,
                                    'Lat':'Lat_%s'%station,
                                    'Lon':'Lon_%s'%station,
                                    'Alt':'Alt_%s'%station})
        else:
            df = pd.read_fwf(filename, 
                            widths=widths_list,
                            names=namelist,
                            parse_dates = [1],
                            date_parser = dateparse,
                            na_values='\n')
            df['Threshold'] = df['Threshold'].apply(hex2page)
        df=df.set_index('Datetime')
        return df
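A hedged usage sketch (the log path below is made up; per the docstring it must follow the path/TxYYMMDD pattern, and the character after the 'T' becomes the station suffix when T_set is 'True'):

# Hypothetical file name: station 'a', 1 April 2018, default 'wtreg' column layout.
thresholds = parsing('logs/Ta180401', T_set='True')
print(thresholds.columns)             # e.g. Threshold_a, Lat_a, Lon_a, Alt_a

full_log = parsing('logs/Ta180401')   # entire log, indexed by Datetime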
Example no. 4
def run():
    session = get_session()
    class AveragePriceData(Model):
        # ['footnote_codes', 'item_name', 'end_year', 'area_name', 'begin_year', 'area_code', 'item_code', 'begin_period', 'end_period']
        series_id = Text(primary_key=True)
        footnote_codes = Text()
        item_name = Text()
        begin_year = Integer()
        end_year = Integer()
        area_name = Text()
        area_code = Text()
        item_code = Text()
        begin_period = Text()
        end_period = Text()

    sync_table(AveragePriceData)

    # read the master data (ap.series)
    series = pandas.read_csv(path.format("ap/ap.series"), sep='\t', skiprows=1,
                             names=["series_id", "area_code", "item_code", "footnote_codes", "begin_year",
                                    "begin_period", "end_year", "end_period"] )

    # not sure why i'm getting extra spaces, cleaning that up
    series["item_code"] = series["item_code"].map(lambda x: str(x).strip())
    series.set_index("series_id", inplace=True)

    # load areas
    area = pandas.read_fwf(path.format("ap/ap.area"), widths=[4,100], names=["area_code", "area_name"], skiprows=2)
    area.set_index("area_code", inplace=True)

    footnotes = pandas.read_fwf(path.format("ap/ap.footnote"), skiprows=1, widths=[1,100], names=["footnote_code", "footnote_text"])
    footnotes.set_index("footnote_code", inplace=True)

    items = pandas.read_fwf(path.format("ap/ap.item"), widths=[7, 100], skiprows=2, names=["item_code", "item_name"])
    items.set_index("item_code", inplace=True)

    result = series.join(area, on="area_code").join(items, on="item_code")
    print(result.head(5))

    for k, v in result.iterrows():
        vals = v.to_dict()
        vals["series_id"] = k
        try:
            AveragePriceData.create(**vals)
        except Exception as e:
            print(e)
            print(vals)
            break
        print "Created {}".format(k)
Example no. 5
def _parse_ghcnd_stnmeta(fpath_stns, fpath_stninv, elems, start_end=None, bbox=None):
        
    stns = pd.read_fwf(fpath_stns, colspecs=[(0, 11), (12, 20), (21, 30),
                                             (31, 37), (38, 40), (41, 71),
                                             (2, 3), (76, 79)], header=None,
                       names=['station_id', 'latitude', 'longitude',
                              'elevation', 'state', 'station_name',
                              'network_code', 'hcn_crn_flag'])
    stns['station_name'] = stns.station_name.apply(unicode, errors='ignore')
    stns['provider'] = 'GHCND'
    stns['sub_provider'] = (stns.network_code.apply(lambda x: _NETWORK_CODE_TO_SUBPROVIDER[x]))

    if bbox is not None:

        mask_bnds = ((stns.latitude >= bbox.south) & 
                     (stns.latitude <= bbox.north) & 
                     (stns.longitude >= bbox.west) & 
                     (stns.longitude <= bbox.east))

        stns = stns[mask_bnds].copy()

    stn_inv = pd.read_fwf(fpath_stninv, colspecs=[(0, 11), (31, 35), (36, 40),
                                          (41, 45)],
                          header=None, names=['station_id', 'elem', 'start_year',
                                              'end_year'])
    stn_inv['elem'] = stn_inv.elem.str.lower()
    stn_inv = stn_inv[stn_inv.elem.isin(elems)]
    stn_inv = stn_inv.groupby('station_id').agg({'end_year': np.max,
                                                 'start_year': np.min})
    stn_inv = stn_inv.reset_index()

    stns = pd.merge(stns, stn_inv, on='station_id')

    if start_end is not None:

        start_date, end_date = start_end

        mask_por = (((start_date.year <= stns.start_year) & 
                     (stns.start_year <= end_date.year)) | 
                    ((stns.start_year <= start_date.year) & 
                     (start_date.year <= stns.end_year)))

        stns = stns[mask_por].copy()

    stns = stns.reset_index(drop=True)
    stns = stns.set_index('station_id', drop=False)

    return stns
Example no. 6
def _basis_set_order(chunk, mapr, sets):
    # Gaussian only prints the atom center
    # and label once for all basis functions
    first = len(chunk[0]) - len(chunk[0].lstrip(' ')) + 1
    df = pd.read_fwf(six.StringIO('\n'.join(chunk)),
                     widths=[first, 4, 3, 2, 4], header=None)
    df[1].fillna(method='ffill', inplace=True)
    df[1] = df[1].astype(np.int64) - 1
    df[2].fillna(method='ffill', inplace=True)
    df.rename(columns={1: 'center', 3: 'N', 4: 'ang'}, inplace=True)
    df['N'] = df['N'].astype(np.int64) - 1
    if 'XX' in df['ang'].values:
        df[['L', 'l', 'm', 'n']] = df['ang'].map({'S': [0, 0, 0, 0],
                'XX': [2, 2, 0, 0], 'XY': [2, 1, 1, 0], 'XZ': [2, 1, 0, 1],
                'YY': [2, 0, 2, 0], 'YZ': [2, 0, 1, 1], 'ZZ': [2, 0, 0, 2],
                'PX': [1, 1, 0, 0], 'PY': [1, 0, 1, 0], 'PZ': [1, 0, 0, 1],
                }).apply(tuple).apply(pd.Series)
    else:
        df['L'] = df['ang'].str[:1].str.lower().map(lmap).astype(np.int64)
        df['ml'] = df['ang'].str[1:]
        df['ml'].update(df['ml'].map({'': 0, 'X': 1, 'Y': -1, 'Z': 0}))
        df['ml'] = df['ml'].astype(np.int64)
    cnts = {key: -1 for key in range(10)}
    pcen, pl, pn, shfns = 0, 0, 1, []
    for cen, n, l, seht in zip(df['center'], df['N'], df['L'],
                               df['center'].map(sets)):
        if not pcen == cen: cnts = {key: -1 for key in range(10)}
        if (pl != l) or (pn != n) or (pcen != cen): cnts[l] += 1
        shfns.append(mapr[(seht, l)][cnts[l]])
        pcen, pl, pn = cen, l, n
    df['shell'] = shfns
    df.drop([0, 2, 'N', 'ang'], axis=1, inplace=True)
    df['frame'] = 0
    return df
def parse_classification_report(classification_report):
    """Parse a sklearn classification report to a dict."""
    return pd.read_fwf(
        StringIO(classification_report),
        index_col=0,
        colspecs=[(0, 12), (12, 22), (22, 32), (32, 42), (42, 52)]
    ).dropna()
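A hedged usage sketch that feeds the parser real classification_report text from scikit-learn; note that the hard-coded colspecs above assume a particular report layout, so the slice positions may need adjusting for other scikit-learn versions:

from sklearn.metrics import classification_report

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]
report = classification_report(y_true, y_pred)
print(report)

scores = parse_classification_report(report)
print(scores)      # per-class precision / recall / f1-score / support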
Example no. 8
def ReadFemResp1995():
    """Reads respondent data from NSFG Cycle 5.

    returns: DataFrame
    """
    dat_file = '1995FemRespData.dat.gz'
    names = ['a_doi', 'timesmar', 'mardat01', 'bdaycenm', 'post_wt']
    colspecs = [(12359, 12363),
                (3538, 3540),
                (11758, 11762),
                (13, 16),
                (12349, 12359)]
    df = pandas.read_fwf(dat_file, 
                         compression='gzip', 
                         colspecs=colspecs, 
                         names=names)

    df['cmmarrhx'] = df.mardat01
    df['cmbirth'] = df.bdaycenm
    df['cmintvw'] = df.a_doi
    df['finalwgt'] = df.post_wt

    df.timesmar.replace([98, 99], np.nan, inplace=True)
    df['evrmarry'] = (df.timesmar > 0).astype(int)

    CleanData(df)
    return df
Example no. 9
def load_dataframe(fobj, compression='gzip'):
    """Given an open file for `hip_main.dat.gz`, return a parsed dataframe.

    If your copy of ``hip_main.dat`` has already been unzipped, pass the
    optional argument ``compression=None``.

    """
    try:
        from pandas import read_fwf
    except ImportError:
        raise ImportError(PANDAS_MESSAGE)

    names, colspecs = zip(
        ('hip', (2, 14)),
        ('magnitude', (41, 46)),
        ('ra_degrees', (51, 63)),
        ('dec_degrees', (64, 76)),
        ('parallax_mas', (79, 86)),  # TODO: have Star load this
        ('ra_mas_per_year', (87, 95)),
        ('dec_mas_per_year', (96, 104)),
    )

    df = read_fwf(fobj, colspecs, names=names, compression=compression)
    df = df.assign(
        ra_hours = df['ra_degrees'] / 15.0,
        epoch_year = 1991.25,
    )
    return df.set_index('hip')
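A hedged usage sketch, assuming the Hipparcos catalogue file mentioned in the docstring has been downloaded to the working directory:

# hip_main.dat.gz is the gzipped Hipparcos main catalogue; pass compression=None
# if you have already unzipped it to hip_main.dat.
with open('hip_main.dat.gz', 'rb') as fobj:
    df = load_dataframe(fobj)

print(df[['magnitude', 'ra_degrees', 'dec_degrees', 'ra_hours']].head())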
Example no. 10
File: ex_k.py Project: jacqk/Pyml
def classify_1(k, training_set):
    a, test_files = subprocess.getstatusoutput('ls ./testDigits')
    test_files = test_files.split('\n')
    
    count = np.empty(len(test_files))
    
    
    for i, test_file in enumerate(test_files):
        test = pd.read_fwf('./testDigits/' + test_file,\
        header=None, widths=[1] * 32)
        
        result_list = []
        for key in training_set.keys():
            diff = (training_set[key] == test).sum().sum()
            result_list.append([diff, key[0]])
        
        result_frame = DataFrame(result_list, columns=['distance', 'number'])
        indice = result_frame['distance'].argsort()[-k:]
    
        result = result_frame.loc[indice, 'number'].value_counts().index[0]
        count[i] = result == test_file[0]
        print(test_file[0], result, count[i])
        
    p = sum(count)/float(count.size)
    print(count.size, 'test case', sum(count), 'right', p)
        
    return p
    
#train_file_digit()
#print classify_1(30, train_file_digit())
def process_sst(txtfile):
    """
    Read mhl_sst_data from a CSV file (in current directory, unless
    otherwise specified) and convert it to a netCDF file.
    If successful, the name of the saved file is returned.
    """

    extension = txtfile[-4:]
    if extension == '.TXT':
        format = 'old'
        colspecs = colspecs_old
        header = 7
        skip_rows = 7
        names = names_old
    elif extension == '.txt':
        format = 'new'
        colspecs = colspecs_new
        header = 9
        skip_rows = 9
        names = names_new

    # CSV file to extract array of formatted data
    data = pandas.read_fwf(txtfile, colspecs=colspecs,
                           names=names, header=header, skiprows=skip_rows)
    # convert time from string to decimal time, IMOS compliant
    (dtime, time) = convert_to_utc(data['Date_Time'], format)
    # use source filename to get deployment number.
    # extract spatial info from summary file
    site_code_short = os.path.basename(txtfile)[:3]
    spatial_data = get_spatial_data(txtfile, site_code_short, format)
    # generate NetCDF
    create_mhl_sst_ncfile(txtfile, site_code_short,
                          data, time, dtime, spatial_data)
Example no. 12
    def parse_momatrix(self):
        """
        Parse the :class:`~exatomic.core.orbital.MOMatrix` dataframe.

        Note:
            Must supply 'print "final vectors" "final vectors analysis"' for momatrix
        """
        key0 = "Final MO vectors"
        key1 = "center of mass"
        found = self.find(key0, key1)
        if found[key0]:
            start = found[key0][0][0] + 6
            end = found[key1][0][0] - 1
            c = pd.read_fwf(StringIO("\n".join(self[start:end])), widths=(6, 12, 12, 12, 12, 12, 12),
                            names=list(range(7)))
            self.c = c
            idx = c[c[0].isnull()].index.values
            c = c[~c.index.isin(idx)]
            del c[0]
            nbas = len(self.basis_set_order)
            n = c.shape[0]//nbas
            coefs = []
            # The for loop below is like numpy.array_split(df, n); using numpy.array_split
            # with dataframes seemed to have strange results where splits had wrong sizes?
            for i in range(n):
                coefs.append(c.iloc[i*nbas:(i+1)*nbas, :].astype(float).dropna(axis=1).values.ravel("F"))
            c = np.concatenate(coefs)
            del coefs
            orbital, chi = _square_indices(len(self.basis_set_order))
            self.momatrix = MOMatrix.from_dict({'coef': c, 'chi': chi, 'orbital': orbital, 'frame': 0})
def stations():
    stations = read_fwf("tests/ghcnd-stations.txt", header=None,
                        colspecs=[(0,11), (12, 20), (21, 30), (31,37), (38,40), (41,71), (72,75), (76,79), (80,85)],
                        names=["station_id", "lat", "long", "elevation", "state", "name", "gsn_flag", "hcn_flag", "wmo_id"])

    stations['wmo_id'] = stations['wmo_id'].astype(str)
    return stations
Example no. 14
def read(filename):
    """ 
      
        reads a fwf file into a pandas data frame
        
        Usage: data = read(filename)
        
        Note: ('ETA','DwellTime','Activity' are floats as they contain NaNs
                and cannot be automatically converted to integers)
    """    
    
    # creating the widths of each column
    widths = [12,21,12,12,12,9,12,12,21,255,255,25,25,12,12,12,12,17,51,51,12]
    
    # creating colspecs (the half-open [start, end) interval of each column)
    cumsum = [sum(widths[:i+1]) for i in range(len(widths))]
    
    # excluding the commas
    cumsum0 = [0]+cumsum[:-1]
    cumsum_short = [item-1 for item in cumsum]
    colspecs = list(zip(cumsum0,cumsum_short))
    
    # reading the file
    try:
        data = pd.read_fwf(filename,colspecs = colspecs,skiprows = [1])
    except(IOError):
        print('This file does not exist. Please, check the filename or the directory.')
        sys.exit()
    
    # specifying the data types explicitly
    data = data.astype('object')
    numeric_list = ['LON','LAT','ETA','DwellTime']
    data[numeric_list] = data[numeric_list].astype('float')
    return(data)
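The colspec arithmetic above is easier to see with a short, made-up width list; each field is cut one character before the next field starts, which is what drops the trailing comma separators:

widths = [3, 5, 4]                                            # made-up example
cumsum = [sum(widths[:i+1]) for i in range(len(widths))]      # [3, 8, 12]
cumsum0 = [0] + cumsum[:-1]                                   # field starts: [0, 3, 8]
cumsum_short = [item - 1 for item in cumsum]                  # field ends:   [2, 7, 11]
colspecs = list(zip(cumsum0, cumsum_short))                   # [(0, 2), (3, 7), (8, 11)]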
Example no. 15
def parse_monthly(fp, cache=True, nrows=None):
    with open('interesting.json') as f:
        col_map = json.load(f)
        month = zip_fp_to_month(fp)
        hdf_key = month_to_hdf_key(month)
        dd_key = month_to_dd_key(month)
        span = dd_key_to_span(dd_key)
        cm = col_map[span]

    if cache:
        with pd.HDFStore('data/store.h5') as store:
            if hdf_key in store:
                return None

    data_fp = extract(fp)
    try:
        dd = month_to_dd(month)
        subset = dd.loc[dd.field.isin(cm.keys())]
        names = list(subset.field)
        colspecs = (subset[['start', 'end']]
                    .assign(start=lambda df: df.start - 1)
                    .values.tolist())
        # TALK: run through with subset first (nrows=100)

        df = (pd.read_fwf(data_fp, colspecs=colspecs, names=names,
                          usecols=names, nrows=nrows)
                .rename(columns=cm)
                .sort_values(['mis']))
        key = month_to_hdf_key(month)
        with pd.HDFStore(STOREPATH) as store:
            store.append(key, df, format='table', data_columns=True)
    except:
        os.remove(data_fp)
        raise
    os.remove(data_fp)
Example no. 16
 def refreshGetij(self,dt):
     Logger.info("refreshGetij: ")
     # http://getij.rws.nl/export.cfm?format=txt&from=02-04-2016&to=08-04-2016&uitvoer=1&interval=10&lunarphase=yes&location=SCHEVNGN&Timezone=MET_DST&refPlane=NAP&graphRefPlane=NAP
     getijPars = {
         'format' : 'txt',
         'from' : strftime("%d-%m-%Y"),
         'to' : strftime("%d-%m-%Y"),
         'uitvoer': 1,
         'interval': 10,
         'lunarphase': 'yes',
         'location': 'SCHEVNGN',
         'Timezone':'MET_DST',
         'refPlane':'NAP',
         'graphRefPlane':'NAP'
     }
     r = requests.get("http://getij.rws.nl/export.cfm",params=getijPars)
     Logger.info(r.text)
     dataFile = StringIO(r.text)
     df = pd.read_fwf(dataFile,widths=[17,6,3],names=['timestamp','height','unit'],skiprows=14)
     self.loggerDataframe(df.head())
     dataFile.close()
     for index, row in df.iterrows():
         try:
             self.ws.addObservation(pd.Timestamp(row['timestamp']).tz_localize('MET'),'getij',float(row['height']),'getij.rws.nl')
         except ValueError:
             Logger.info("oops")
             pass
         except AttributeError:
             Logger.info("oops")
             pass
Example no. 17
def get_janus_epimetheus_resonances():
    w = [len("              Janus1"), len(" reson"), len("  Resonance radius R")]

    def get_janos_epi_order(reso):
        a, b = reso.split(":")
        return int(a) - int(b)

    fname = pr.resource_filename("pyciss", "data/ring_janus_epimetheus_resonances.txt")
    with open(fname) as f:
        jan_epi_resonances = pd.read_fwf(
            f, skiprows=15, header=0, widths=w, skipfooter=1
        )

    # replace column names
    jan_epi_resonances.columns = ["moon", "reson", "radius"]

    # calculate order from resonance name
    jan_epi_resonances["order"] = jan_epi_resonances.reson.map(get_janos_epi_order)

    def func(x):
        "Remove space from resonce string"
        return ":".join(i.strip() for i in x.split(":"))

    jan_epi_resonances.reson = jan_epi_resonances.reson.map(func)

    # calculate name for axes display
    jan_epi_resonances["name"] = (
        jan_epi_resonances.moon + " " + jan_epi_resonances.reson
    )
    return jan_epi_resonances
Example no. 18
def get_flare_catalog_fromfile(data_path):
    """ program to read in GOES h-alpha and x-ray flare information from file"""
    """ usage: [ha, xray]=get_flare_catalog; ha is a dict"""
    """ ha['location'][300] prints the 300th location"""
    """ keys are ha.keys() -- station_num, group_num, initial_time, final_time"""
    """ peak_time, optical_importance, optical_brightness, xray_class, """
    """ xray_size, NOAA_AR """
    #define data file location
#    ha_file=data_path+"/ha.txt"
    xray_file=data_path+"/xray.txt"
    print("Getting from file, years are only those downloaded")
    print("Reading X-ray flares from: ", xray_file)
    #code to read in xray data
    names=["data code", "station code", "year", "month", "day", "init_ind", "init_time", "final_ind", "final_time", "peak_ind", 
           "peak_time", "location", "optical", "something", "xray_class", "xray_size", "station", "blank", "NOAA_AR", "etc"]

    widths=[2, 3, 2, 2, 2, 2, 4, 1, 4, 1, 4, 7, 3, 22, 1, 3, 8, 8, 6, 24]

    xray_df=pd.read_fwf(xray_file, widths=widths, header=None, names=names, parse_dates=[[2, 3, 4]])
    #translates dates to datetime
    xray_df["location"]=[x if str(x)[0]=="N" or str(x)[0]=="S" else None for x in xray_df["location"]]
    xray_df["init_date"]=create_datetime(xray_df["year_month_day"], xray_df["init_time"])
    xray_df["peak_date"]=create_datetime(xray_df["year_month_day"], xray_df["peak_time"])
    xray_df["final_date"]=create_datetime(xray_df["year_month_day"], xray_df["final_time"])

    xray_df=xray_df[["init_date", "peak_date", "final_date", "location", "xray_class", "xray_size", "NOAA_AR"]]
    
    #remove all the lines that don't have either a valid peak data or a valid init date (both have to be lacking)
#    print("len before", len(xray_df))
#    xray_df = xray_df[np.isfinite(xray_df['peak_date']) | np.insfinite(xray_df['init_date'])]  
#    print("len after", len(xray_df))
    ha_df="not yet implemented"
    return (xray_df, ha_df)
Example no. 19
def download_station_data(station_id):

    #station_id = 'USC00167344'
    
    station_url = "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/all/" + station_id + ".dly"

    # Changing position 1 and 2 ( values being 4 and 2 (4 being the year, and 2 being the month))
    # Combining them to a width of 6, instead of two widths of 4 and 2.
    widths = [11, 4, 2, 4] + [5, 1, 1, 1] * 31

    # starts at 1 to exclude the station ID from the data
    # 1, 2, 3 are the year, month and measurement type.
    cols = [1, 2, 3, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64,
         68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124]

    #cols = [1, 2, 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
    #    67, 71, 75, 79, 83, 87, 91, 95, 99, 103, 107, 111, 115, 119, 123]

    names = ['year', 'month', 'data_type', 1, 2, 3, 4, 5, 6, 7, 8, 9,
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]

    df = pd.read_fwf(station_url, widths=widths, usecols=cols, 
        header=None, names=names, parse_dates=[['year', 'month']], index_col=0, na_values='-9999')

    return df
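A hedged usage sketch, reusing the station id from the comment at the top of the function (this reads straight from NOAA's FTP server, so it needs network access):

df = download_station_data('USC00167344')

# One row per (year_month, element); columns 1..31 hold the daily values.
tmax = df[df['data_type'] == 'TMAX']
print(tmax.head())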
def main(fileNameS, year):
    pathS = os.path.join(config.rawDataPathS, 'unemployment_statistics')
    conversionD = {'state_fips_code': (lambda x: str(x)),
                   'county_fips_code': (lambda x: str(x)),
                   'year': (lambda x: int(x)),
                   'labor_force': (lambda x: int(str(x).replace(',', ''))),
                   'employed': (lambda x: int(str(x).replace(',', ''))),
                   'unemployed_level': (lambda x: int(str(x).replace(',', '')))}
    tableDF = pd.read_fwf(os.path.join(pathS, fileNameS),
                          converters=conversionD,                          
                          names=['laus_code', 'state_fips_code',
                                 'county_fips_code', 'county_and_state',
                                 'year', 'labor_force', 'employed',
                                 'unemployed_level', 'unemployed_rate'],
                          skipfooter=3,
                          skiprows=6,
                          widths=[18, 7, 6, 50, 4, 14, 13, 11, 9])
    tableDF.loc[:, 'fips_code'] = (tableDF.state_fips_code + 
                                   tableDF.county_fips_code).astype(int)
    
    # Select relevant columns and set index
    finalDF = tableDF.loc[:, ['fips_code', 'unemployed_rate']]
    finalDF.columns = ['FIPS', 'URate' + str(year)]
    finalDF = finalDF.sort_values('FIPS')
    finalDF = finalDF.set_index('FIPS')
    
    return finalDF
Example no. 21
    def handle_data(self, data):
        '''
        Function to parse data between \<pre\> tags

        @param data: Input data
        '''
        if self.in_pre_tag == True and self.read_data == True:
            self.data_dict[self.label] = pd.read_fwf(StringIO(data), widths=[7,7,7,7,7,7,7,7,7,7,7],
                                                     header=0, skiprows=[0,1,3,4])

            split_data = data.split('\n')
            headings = split_data[2].split()
            units = split_data[3].split()

            self.metadata_dict[self.label] = OrderedDict()
            self.metadata_dict[self.label]['units'] = [(heading, unit) for heading, unit in zip(headings, units)]
            self.read_data = False

            self.tmp = data

        elif self.in_pre_tag == True and self.read_data == False:


            station_metadata_dict = OrderedDict()
            for line in data.splitlines():
                if line != '':
                    metadata = line.split(':')
                    station_metadata_dict[metadata[0].strip()] = metadata[1].strip()

            self.metadata_dict[self.label]['metadata'] = station_metadata_dict
            self.read_data = True

        elif self.read_data == True and self.in_header == True:
            self.label = data.strip()
Example no. 22
    def ps(self, args=None, options='', all=True, verbose=True,
           as_frame='auto', raise_on_error=True):
        if args is None:
            args = ''
        if all:
            args += 'A'
        if verbose:
            args += 'f'
        if len(args) > 0 and args[0] != '-':
            args = '-' + args

        results = self.wait(('ps %s %s' % (args, options)).strip(),
                            raise_on_error=raise_on_error)

        if as_frame == 'auto':
            as_frame = has_pandas

        if as_frame:
            if not has_pandas:
                raise ImportError("Unable to import pandas")
            df = pd.read_fwf(StringIO(results))
            cmd_loc = df.columns.get_loc('CMD')
            if cmd_loc < len(df.columns):
                col = df.iloc[:, cmd_loc].fillna('')
                for i in range(cmd_loc + 1, len(df.columns)):
                    col = col + df.iloc[:, i].fillna('')
                df['CMD'] = col
            return df

        return results
Example no. 23
def index_to_df(indexpath, label, convert_times=True):
    """The main reader function for PDS Indexfiles.

    In conjunction with an IndexLabel object that figures out the column widths,
    this reader should work for all PDS TAB files.

    Parameters
    ----------
    indexpath : str or pathlib.Path
        The path to the index TAB file.
    label : pdstools.IndexLabel object
        Label object that has both the column names and the column widths as
        attributes 'colnames' and 'colspecs'.
    convert_times : bool
        Whether to convert columns with "TIME" in the name (unless "COUNT" is
        also in the name) to datetime.
    """
    indexpath = Path(indexpath)
    df = pd.read_fwf(
        indexpath, header=None, names=label.colnames, colspecs=label.colspecs
    )
    if convert_times:
        for column in [i for i in df.columns if "TIME" in i and "COUNT" not in i]:
            if column == "LOCAL_TIME":
                # don't convert local time
                continue
            print(f"Converting times for column {column}.")
            try:
                df[column] = pd.to_datetime(df[column])
            except ValueError:
                df[column] = pd.to_datetime(
                    df[column], format=utils.nasa_dt_format_with_ms
                )
        print("Done.")
    return df
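A hedged usage sketch; how the IndexLabel object is constructed is not shown here, so the line building it below is only a placeholder for whatever pdstools provides:

# Placeholder: obtain an IndexLabel with 'colnames' and 'colspecs' attributes
# from the PDS label that accompanies the index table.
label = IndexLabel('cumindex.lbl')          # hypothetical constructor call

df = index_to_df('cumindex.tab', label)                            # TIME columns parsed
df_raw = index_to_df('cumindex.tab', label, convert_times=False)   # TIME columns left as strings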
Example no. 24
def read_raso_fwf(pid, path):
    """Read a fixed-width format radiosonde file.
    
    These are the ones containing the climatology that was also used by
    Giovanni Massaro and Daniel Meyer.
    """
    colspecs = [(8, 17), (17, 26), (26, 36), (43, 49)]
    names = ["p", "z", "T", "Td"]
    def errfloat(x):
        return None if "/" in x else float(x)
    file = filename(path)
    valid = (dt.datetime
            .strptime(file, "%Y%m%d_%H%M.reduced.txt")
            .replace(tzinfo=dt.timezone.utc)
            .timestamp()
            )
    df = pd.read_fwf(path, colspecs=colspecs, names=names,
            converters={n: errfloat for n in names},
            skiprows=1)
    df["T"] = 273.15 + df["T"]
    df["Td"] = 273.15 + df["Td"]
    ps = pd.Series(np.repeat(pid, len(df)), name="profile")
    # Calculate specific humidity and cloud water content
    qvap = fml.qvap(df["p"], df["Td"])
    qliq = fml.qliq(df["z"], df["p"], df["T"], df["Td"])
    data = pd.concat([ps, df, qvap, qliq], axis=1).to_numpy().tolist()
    cloudy = 1 if (qliq > 0).any() else 0
    return pid, data, valid, cloudy, file
Example no. 25
def parse_form_13f(fname):

    # note: the 'OTHER MANAGERS' field is absent from the fixed width column
    # definitions, presumably because no data is present in these documents
    # for this field.
    # The assertion will catch an unexpected number of columns.
    conformed_period_of_report, filed_as_of_date, no_of_columns = \
            parse_form_13f_head(fname)

    assert no_of_columns == len(Form13F.column_names), \
        'Not enough column_names/columns'

    # construct pandas.DataFrame from fixed width file, use the 0th column as
    # the label for the row (security)
    data_frame = pandas.read_fwf(
        fname,
        # ranges of the fixed width columns
        colspecs=[(0, 29), (29, 45), (45, 57), (57, 64), (64, 73), (73, 79),
                  (79, 92), (92, 112), (112, 123), (123, 132)],
        skiprows=[0, 1, 2, 3, 4],
        index_col=0,
        names=Form13F.column_names,
    )

    # drop label if all values in row are Na
    data_frame = data_frame.dropna(how='all')

    # for each column apply a function on each of the rows which strips strings
    # of tabs, newlines and spaces
    data_frame = data_frame.apply(lambda x: x.apply(
        lambda x: x.strip() if isinstance(x, str) else x)
    )

    return Form13F(fname, conformed_period_of_report, filed_as_of_date,
                   data_frame)
Example no. 26
def readHypo71Sum(sumfile):
    """
    Read a summary file from hypoinverse in the y2k compliant hypo71 
    format
    
    Parameters
    ----------
    sumfile : str
        Path to the sum file

    Returns
    -------
    DataFrame populated with sumfile info
    """
    fw = [(0, 20), (19, 22), (22, 23), (23, 28), (28, 32), (32, 33), (33, 38),
          (38, 45), (52, 55), (55, 59), (59, 64), (64, 69), (69, 74), (74, 79)]
    cols = ['ds', 'latd', 'latc', 'latm', 'lond', 'lonc', 'lonm', 'depth',
            'numphase', 'azgap', 'stadist', 'rms', 'horerr', 'vererr']
    toDrop = ['ds', 'latd', 'latc', 'latm', 'lond', 'lonc', 'lonm']
    df = pd.read_fwf(sumfile, colspecs=fw, names=cols)

    latmul = [1 if x else -1 for x in df['latc'].isnull()]
    df['lat'] = np.multiply((df['latd'] + df['latm'] / 60.), latmul)
    lonmul = [1 if x else -1 for x in df['lonc'].isnull()]
    df['lon'] = np.multiply((df['lond'] + df['lonm'] / 60.), lonmul)
    utcs = [obspy.UTCDateTime(x.replace(' ', '')) for x in df.ds]
    irisws = [x.format_iris_web_service().replace(':', '-') for x in utcs]
    times = [x.timestamp for x in utcs]
    names = [x.split('.')[0] for x in irisws]
    df['times'] = times
    df['names'] = names
    df.drop(toDrop, axis=1, inplace=True)
    return df
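A hedged usage sketch with a hypothetical summary-file path:

events = readHypo71Sum('hypoinverse/run1.sum')    # hypothetical path
print(events[['names', 'times', 'lat', 'lon', 'depth', 'rms']].head())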
Example no. 27
    def _read_stns(self):

        if self.download_updates and not self._download_run:

            self.download_local()

        stns = pd.read_fwf(
            os.path.join(self.path_ushcn_data, "ushcn-v2.5-stations.txt"),
            colspecs=[(0, 11), (12, 20), (21, 30), (31, 37), (38, 40), (41, 71)],
            header=None,
            names=["station_id", "latitude", "longitude", "elevation", "state", "station_name"],
        )
        stns["station_name"] = stns.station_name.apply(unicode, errors="ignore")
        stns["provider"] = "USHCN"
        stns["sub_provider"] = ""

        if self.bbox is not None:

            mask_bnds = (
                (stns.latitude >= self.bbox.south)
                & (stns.latitude <= self.bbox.north)
                & (stns.longitude >= self.bbox.west)
                & (stns.longitude <= self.bbox.east)
            )

            stns = stns[mask_bnds].copy()

        stns = stns.set_index("station_id", drop=False)

        return stns
Example no. 28
def main():
    mpl.rcParams['font.size'] = 11
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.usetex'] = 'True'
    pgf_preamble = (r'\usepackage{/home/luismi/doc/2016/'
        + r'spaceapps/latex_stylesheet}')
    mpl.rcParams['text.latex.preamble'] = pgf_preamble
    data = []
    for i in range(1, 6):
        data.append(pd.read_fwf('data/rocket/{}0000.csv'.format(i)))
    height = []
    p_c = []
    isp = []
    for d in data:
        height += (h(d.p.values[2:] * 1E5) / 1E3).tolist()
        isp += d.isp.values[2:].tolist()
    for i in range(1, 6):
        p_c += len(data[0].values[2:]) * [i * 10]
    triang = mpl.tri.Triangulation(p_c, height)
    fig, ax = new_ax()
    c1 = ax.tricontourf(triang, isp, np.arange(2900, 3151, 25), 
        cmap=mpl.cm.viridis)
    c2 = ax.tricontour(triang, isp, np.arange(2900, 3151, 50), 
        colors='k', linestyles='--')
    cb = plotty.colorbar(c1, pad=0.15, fraction=0.1)
    cb.set_label(r'$Isp$ [\si{\metre\per\second}]')
    ax.clabel(c2, fmt=r'%.0f \si{\metre\per\second}')
    ax.set_xlabel(r'$p_c$ [\si{\mega\pascal}]')
    ax.set_ylabel(r'$h$ [\si{\kilo\metre}]')
    plotty.candy(ax, ncol=2, cb=True, pad=0.15, fraction=0.1)
    ax.xaxis.set_ticks([10, 20, 30, 40])
    ax.yaxis.set_ticks([-4., -3., -2., -1., 0.])
    fig.savefig('img/isp.pdf')
def run_pandas():
    """
    Load records into pandas data frame.

    * PyPy: OK
    * Source: https://github.com/pydata/pandas
    * Docs: amazing
    * Independent: no
    * Small: no
    * Can specify column data types: yes
    * Can read in chunks: yes
    * Can skip columns: yes
    * Can stream: yes but it won't be a DataFrame
    * Return type: DataFrame
    * Memory usage: about 60Mb
    * Timing: around 0.5 sec
    """
    zp = pd.read_fwf(
        'data/ZIP.DAT',
        widths=[5, 2, 28, 1, 5, 7, 8, 3, 6, 1, 1, 4, 4, 3],
        names=['zip_code', 'state_code', 'city_name', 'type', 'county_fips',
               'lat', 'lon', 'area_code', 'fin_code', 'last_line',
               'facility', 'msa_code', 'pmsa_code', 'filler'],
        usecols=[0, 1, 2, 4, 5, 6, 7, 11, 12],
        converters={'zip_code': str, 'county_fips': str, 'area_code': str,
                    'msa_code': str, 'pmsa_code': str},
        header=None,
        skiprows=2
    )
    print('Records:', len(zp))
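The "can read in chunks" point in the docstring refers to the chunksize argument; a hedged sketch of the same ZIP.DAT read done incrementally:

def run_pandas_chunked(chunksize=50000):
    """Same fixed-width layout as run_pandas, but iterate chunk by chunk."""
    reader = pd.read_fwf(
        'data/ZIP.DAT',
        widths=[5, 2, 28, 1, 5, 7, 8, 3, 6, 1, 1, 4, 4, 3],
        names=['zip_code', 'state_code', 'city_name', 'type', 'county_fips',
               'lat', 'lon', 'area_code', 'fin_code', 'last_line',
               'facility', 'msa_code', 'pmsa_code', 'filler'],
        usecols=[0, 1, 2, 4, 5, 6, 7, 11, 12],
        converters={'zip_code': str, 'county_fips': str, 'area_code': str,
                    'msa_code': str, 'pmsa_code': str},
        header=None,
        skiprows=2,
        chunksize=chunksize,       # returns an iterator of DataFrames
    )
    total = sum(len(chunk) for chunk in reader)
    print('Records:', total)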
Example no. 30
 def read_sica_file(self, file_path):
     arquivo = pd.read_fwf(file_path)
     print(arquivo)
core_file_extpath = glob.glob(path + "/*.xls")
core_fileext = [os.path.split(h)[1] for h in core_file_extpath]
core_file = [os.path.splitext(os.path.basename(j))[0] for j in core_fileext]
if len(core_file) != 0:
    core_file_s = core_file[0]
else:
    core_file_s = input("Excel file not found - please manually type sample name:")

#Output excel file is created

writer = pd.ExcelWriter(core_file_s+" summary.xlsx")

for file in filenames:

    title = str(file)+" region in "+str(core_file_s)
    sn = pd.read_fwf(file+".par", delim_whitespace=True)

    #Confirms file read
    print("Collected "+file)

    #calculates percentage integration and confirms totals add up to 100%
    columns = (list(sn.columns))
    ints = list(sn[columns[3]])
    int_percents = []
    total_int = sum(ints)
    for integration in ints:
        int_percents.append(round(((integration/total_int)*100), 4))
    sn["Percent Integration"] = int_percents
    
    #Total Integration Percentage Sum should be 100 as a sanity check
    sn["Total Integration Percentage Sum"] = sum(int_percents)
Example no. 32
# Define data file field widths
field_widths = ([4, 2] + [1] * 12 + [2] + [1] * 2 + [2] + [1] * 2 + [2] * 2 +
                [1] * 2 + [4] * 2 + [2] * 2 + [4, 2] + [4] * 2 + [5] + [4] * 4)
# Data file field names
field_names = [
    'd1', 'd2', 'd3', 'worklic', 'd5', 'nwlic', 'd7', 'd8', 'd9', 'cars',
    'd11', 'd12', 'd13', 'sex', 'd15', 'household_position',
    'driving_licences', 'occupation', 'd19', 'd20', 'd21', 'mode', 'd23',
    'd24', 'pt_owalk_h', 'pt_dwalk_h', 'origin_zone', 'destination_zone',
    'pt_tot', 'pt_lines', 'pt_owalk', 'pt_dwalk', 'dist', 'car_time',
    'park_orig', 'park_dest', 'pt_wait'
]

# Read data
data = pd.read_fwf('./grenoble.dat',
                   widths=field_widths,
                   header=None,
                   names=field_names)

# Clean data
# Drop unnecessary columns
data = data.drop(columns=[
    'd1', 'd2', 'd3', 'd5', 'd7', 'd8', 'd9', 'd11', 'd12', 'd13', 'd15',
    'd19', 'd20', 'd21', 'd23', 'd24', 'destination_zone', 'pt_dwalk_h',
    'pt_dwalk', 'park_orig', 'pt_wait'
])
# Set Nan to zero
data = data.applymap(lambda x: 0.0 if np.isnan(x) else x)
# Convert all data to integers
data = data.applymap(lambda x: int(x))

Example no. 33
def get_average_payload(comm):
    all_payloads_list = attributes_for_full[(
        attributes_for_full['2'] == comm)]['8'].tolist()
    return sum(all_payloads_list) / len(all_payloads_list)


def get_average_tare_weight(comm):
    all_tare_weight_list = attributes_for_full[(
        attributes_for_full['2'] == comm)]['9'].tolist()
    return sum(all_tare_weight_list) / len(all_tare_weight_list)


# using cost.dat
attributes_for_full = pandas.read_fwf(
    cost_dat_output,
    colspecs=split_width([5, 5, 10, 10, 10, 10, 10, 5, 5, 10]),
    header=None)
# attributes_for_full.columns = [['RR-Code','Commod.','TrainCost/hr','Cost/gross-ton-mile',
# 'terminal-processing-cost/car,fixed','terminal-cost/car-hr','transfer-cost/car','car-payload',
# 'car-tare-wt','Gross Car Weight','Cars per Train','Gross Train Weight']]

attributes_for_full.columns = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '12']
attributes_for_full = attributes_for_full[['1', '2', '8', '9']]

for i in range(1, no_of_commodity + 1):
    attributes_for_empty = pandas.DataFrame({
        '1': [0],
        '2': [i],
        '8': [get_average_payload(i)],
Example no. 34
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

data = pd.read_fwf('brain_body.txt')
brain = data[['Brain']]
body = data[['Body']]

# Create linear regression object
regr = linear_model.LinearRegression()

regr.fit(brain.head(3), body.head(3))

predicted_data = regr.predict(pd.DataFrame({"values": [3.385]}))

challenge_dataframe = pd.read_table('challenge_dataset.txt',
                                    delim_whitespace=False,
                                    names=("testing", ))
challenge_dataframe_xvalues = pd.DataFrame(
    challenge_dataframe.testing.str.split(",").tolist(),
    columns=["xvalue", "yvalue"])[[0]]

print(challenge_dataframe_xvalues.head())

challenge_dataframe_yvalues = pd.DataFrame(
    challenge_dataframe.testing.str.split(",").tolist(),
    columns=["xvalue", "yvalue"])[[1]]
print(predicted_data.shape)
print(predicted_data)
print(challenge_dataframe_yvalues.shape)
print(challenge_dataframe_yvalues.head())
  -h --help                     Show this screen

weather_cleanse takes in a CSV file and writes to an out_file that is a CSV.
a command should thus be:
"python weather_cleanse.py input_file.csv out_file.csv"
'''

from docopt import docopt

import pandas as pd

ARGS = docopt(__doc__)
print(ARGS)
saved_cols = [1, 2, 3, 4]

station_df = pd.read_fwf('ghcnd-stations.txt')
station_df.columns = [
    "station_id", 'long', 'lat', 'elevation', 'location', 'gsn_flag',
    'hcn_flag', 'wmo_id'
]
station_df2 = station_df[['station_id', 'location']]
NYC_list = []
for row in station_df2.itertuples():
    if 'NEW YORK CNTRL PK TWR' in row.location or 'NEW YORK LAGUARDIA AP' in row.location or 'NEW YORK JFK INTL AP' in row.location:
        NYC_list.append([row.station_id, row.location])

station_df_result = pd.DataFrame(NYC_list, columns=['station_id', 'location'])

df = pd.read_csv(ARGS['FILE_IN'])
df.columns = ['station_id', 'date', 'condition', 'value', 'E', 'F', 'G', 'H']
import pandas as pd
from Potential import *
from Graph import *
from HybridLBPLogVersion import HybridLBP
from EPBPLogVersion import EPBP
import numpy as np
from show_image import show_images

import time

row = 50
col = 50

data = pd.read_fwf('../Data/noisyImage.dat', header=None)
m = data.iloc[0:row, 0:col].values
m = m * 100

# show_images((m,), vmin=-30, vmax=130)

domain = Domain((-30, 130), continuous=True)

evidence = [None] * (col * row)
for i in range(row):
    for j in range(col):
        evidence[i * col + j] = RV(domain, m[i, j])

rvs = []
for _ in range(row * col):
    rvs.append(RV(domain))

fs = []
Example no. 37
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

# read data
dataframe = pd.read_fwf('Lesson1/data/brain_body.txt')
x_values = dataframe[['Brain']]
y_values = dataframe[['Body']]

# train model on data
body_reg = linear_model.LinearRegression()
body_reg.fit(x_values, y_values)

# visualize results
plt.scatter(x_values, y_values)
plt.plot(x_values, body_reg.predict(x_values))
plt.show()
import os
import pandas as pd
from plotnine import *

#This file reads the fide ranking files from 2005 to 2021 and shows the proportion of women in chess and the
#mean rating per sex

#FIDE files have 2 different formats. Files with each format are in a different folder
#Read files with old format
old_names = os.listdir(os.path.join("data", "old_format"))

df_list_number = []#Stores the number of women
df_list_elo = []#Stores the mean rating

for name in old_names:
    year = name[3:5]#Get year from file name
    data = pd.read_fwf(os.path.join('data', 'old_format', name), delimiter=' ')
    #Split the data in men and women
    data_w = data.loc[(data.Flag == "w") | (data.Flag == "wi")]
    data_m = data.loc[(data.Flag != "w") & (data.Flag != "wi")]
    #Calculate percentage of men and women
    percent_w = data_w.shape[0] / data.shape[0] *100
    percent_m = 100 - percent_w
    #Calculate the mean rating of men and women
    elo_w = data_w.iloc[:,3 ].mean()
    elo_m = data_m.iloc[:,3 ].mean()
    #Add info to list as dictionary
    df_list_number.append({"year": "20" + str(year), "percent": percent_w, "Sex": "Women"})
    df_list_number.append({"year": "20" + str(year), "percent": percent_m, "Sex": "Men"})
    df_list_elo.append({"year": "20" + str(year), "elo": elo_w, "Sex": "Women"})
    df_list_elo.append({"year": "20" + str(year), "elo": elo_m, "Sex": "Men"})
Example no. 39
from metpy.units import units


###########################################

# Change default to be better for skew-T
plt.rcParams['figure.figsize'] = (9, 9)

###########################################

# Upper air data can be obtained using the siphon package, but for this example we will use
# some of MetPy's sample data.

col_names = ['pressure', 'height', 'temperature', 'dewpoint', 'direction', 'speed']

df = pd.read_fwf(get_test_data('jan20_sounding.txt', as_file_obj=False),
                 skiprows=5, usecols=[0, 1, 2, 3, 6, 7], names=col_names)

df['u_wind'], df['v_wind'] = mpcalc.wind_components(df['speed'],
                                                    np.deg2rad(df['direction']))

# Drop any rows with all NaN values for T, Td, winds
df = df.dropna(subset=('temperature', 'dewpoint', 'direction', 'speed',
                       'u_wind', 'v_wind'), how='all').reset_index(drop=True)

###########################################
# We will pull the data out of the example dataset into individual variables and
# assign units.

p = df['pressure'].values * units.hPa
T = df['temperature'].values * units.degC
Td = df['dewpoint'].values * units.degC
Example no. 40
                print("Error: geocode failed on input %s with message %s" %
                      (loc, e))
    df = pd.DataFrame(np.array(coordinate).reshape(-1, 3))
    df.columns = ["city", "latitude", "longitude"]
    return df


df = coordinate(list_city)
# the process of getting latitude and longitude might crash sometimes, so we need to store the data after we get all the
# latitude and longitude
tfile = open('coordinate.txt', 'a')
tfile.write(df.to_string())
tfile.close()

# we can use the coordinate directly from the text file
data = pd.read_fwf('coordinate.txt')
'''create Map object'''
m = folium.Map([35.8781, -100.6298], zoom_start=5)

# mark each city as a point
for index, row in data.iterrows():
    folium.CircleMarker(
        location=[float(row['latitude']),
                  float(row['longitude'])],
        radius=3,
        popup=row['city'],
        fill_color='#ffe6e6',  # divvy color
    ).add_to(m)

# plot heatmap
m.add_children(
Example no. 41
def read_crn(filename, map_variables=True):
    """Read a NOAA USCRN fixed-width file into a pandas dataframe.

    The CRN network consists of over 100 meteorological stations covering the
    U.S. and is described in [1]_ and [2]_. The primary goal of CRN is to
    provide long-term measurements of temperature, precipitation, and soil
    moisture and temperature. Additionally, global horizontal irradiance (GHI)
    is measured at each site using a photodiode pyranometer.

    Parameters
    ----------
    filename: str, path object, or file-like
        filepath or url to read for the fixed-width file.
    map_variables: boolean, default: True
        When true, renames columns of the Dataframe to pvlib variable names
        where applicable. See variable :const:`VARIABLE_MAP`.

    Returns
    -------
    data: Dataframe
        A dataframe with DatetimeIndex and all of the variables in the
        file.

    Notes
    -----
    CRN files contain 5 minute averages labeled by the interval ending
    time. Here, missing data is flagged as NaN, rather than the lowest
    possible integer for a field (e.g. -999 or -99). Air temperature is in
    deg C and wind speed is in m/s at a height of 1.5 m above ground level.

    Variables corresponding to standard pvlib variables are by default renamed,
    e.g. `SOLAR_RADIATION` becomes `ghi`. See the
    :const:`pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping.

    CRN files occasionally have a set of null characters on a line
    instead of valid data. This function drops those lines. Sometimes
    these null characters appear on a line of their own and sometimes
    they occur on the same line as valid data. In the latter case, the
    valid data will not be returned. Users may manually remove the null
    characters and reparse the file if they need that line.

    References
    ----------
    .. [1] U.S. Climate Reference Network
       `https://www.ncdc.noaa.gov/crn/qcdatasets.html
       <https://www.ncdc.noaa.gov/crn/qcdatasets.html>`_

    .. [2] Diamond, H. J. et. al., 2013: U.S. Climate Reference Network
       after one decade of operations: status and assessment. Bull.
       Amer. Meteor. Soc., 94, 489-498. :doi:`10.1175/BAMS-D-12-00170.1`
    """

    # read in data
    # TODO: instead of parsing as strings and then post-processing, switch to
    # pd.read_fwf(..., dtype=dict(zip(HEADERS, DTYPES)), skip_blank_lines=True)
    # when our minimum pandas >= 1.2.0 (skip_blank_lines bug for <1.2.0).
    # As a workaround, parse all values as strings, then drop NaN, then cast
    # to the appropriate dtypes, and mask "sentinel" NaN (e.g. -9999.0)
    data = pd.read_fwf(filename,
                       header=None,
                       names=HEADERS,
                       widths=WIDTHS,
                       dtype=str)

    # drop empty (bad) lines
    data = data.dropna(axis=0, how='all')

    # can't set dtypes in read_fwf because int cols can't contain NaN, so
    # do it here instead
    data = data.astype(dict(zip(HEADERS, DTYPES)))

    # finally, replace -999 values with NaN
    data = data.replace(NAN_DICT, value=np.nan)

    # set index
    # UTC_TIME does not have leading 0s, so must zfill(4) to comply
    # with %H%M format
    dts = data[['UTC_DATE', 'UTC_TIME']].astype(str)
    dtindex = pd.to_datetime(dts['UTC_DATE'] + dts['UTC_TIME'].str.zfill(4),
                             format='%Y%m%d%H%M',
                             utc=True)
    data = data.set_index(dtindex)

    if map_variables:
        data = data.rename(columns=VARIABLE_MAP)

    return data
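A hedged usage sketch; the filename below only illustrates the USCRN sub-hourly naming scheme and is not a real download:

fname = 'CRNS0101-05-2021-NY_Millbrook_3_W.txt'   # hypothetical local copy of a sub-hourly CRN file
df = read_crn(fname)
print(df['ghi'].head())     # SOLAR_RADIATION renamed to 'ghi' because map_variables defaults to True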
Example no. 42
if not (args.text or args.data):
    parser.error("No action specified, must be either --text or --data")

# Get router text and split into sections
status_text = get_router_adsl_status_text(IP_ADDR)
sections = extract_sections(status_text)

if args.data:
    # Import pandas here, as it is quite slow (1-2 seconds) to import
    import pandas as pd

    # Get dataframe from port section, fixed width
    port_stringio = StringIO("\n".join(sections['port']))
    port_df = pd.read_fwf(
        port_stringio,
        index_col=0,  # First column is index
        skipinitialspace=True,  # Remove extra whitespace
        delimiter=" :"  # Both space and colon are delimiters
    )

    # Convert to list if only one value given
    if isinstance(args.data, str):
        args.data = [args.data]
    # Return each value
    for n, k in enumerate(args.data):
        if n > 0:
            print(":", end="")  # Add seperator

        # Get correct series, upstream or downstream
        direction = "Upstream" if k.endswith("up") else "Downstream"
        series = port_df[direction]
        # Get row
Example no. 43
"""
Created on Wed Nov  6 17:35:27 2019

@author: Ahmad Aiman Mohd Nazir
"""
#import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#read txt file
f = open("Conductivity(raw).txt", "r")
df = pd.read_fwf('Conductivity(raw).txt', sep=" ", header=None)
df.columns = ['xk', 'yk']

#plot the raw data
plt.scatter(df['xk'], df['yk'], label='raw data')
plt.xlabel('T')
plt.ylabel('Conductivity')
plt.title('Conductivity versus T')
plt.legend(loc='upper right')
plt.show()

#calculate the corresponding values for linear regression
sumx = sum(df['xk'])
sumx2 = sum((df['xk'])**2)
sumy = sum(df['yk'])
sumxy = sum(df['xk'] * df['yk'])
print('\nSum of \nX: {:.4f}\nY: {:.4f}\nX^2: {:.4f}\nXY: {:.4f}'.format(
    sumx, sumy, sumx2, sumxy))
Example no. 44
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEqual(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError,
                              lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({
                0.998: 2,
                1.5: 1,
                2.0: 0,
                2.5: 1
            },
                          index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({
                0.998: 0.5,
                1.5: 0.25,
                2.0: 0.0,
                2.5: 0.25
            },
                           index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = [
                'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'
            ]
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(
                s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEqual(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join([
                'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'
            ])
            f = StringIO(txt)
            df = pd.read_fwf(f,
                             widths=[6, 8, 3],
                             names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy())

            idx = pd.to_datetime([
                '2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z',
                '2009-01-01 00:00:00X'
            ])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array([
                '2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z',
                '2008-09-09 00:00:00Z'
            ],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assertTrue(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEqual(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            tm.assert_series_equal(result, expected_s)

            result = s.value_counts(dropna=False)
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT
                            or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEqual(s.nunique(), 3)
            self.assertEqual(s.nunique(dropna=False), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td)

            result = td.value_counts()
            expected_s = Series([6], index=[86400000000000])
            self.assertEqual(result.index.dtype, 'int64')
            tm.assert_series_equal(result, expected_s)

            # get nanoseconds to compare
            expected = np.array([86400000000000])
            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEqual(td.nunique(), 1)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2)
            result2 = td2.value_counts()

            self.assertEqual(result2.index.dtype, 'int64')
            tm.assert_series_equal(result2, expected_s)

            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEqual(td.nunique(), 1)
Esempio n. 45
0
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

#read data
dataframe = pd.read_fwf('D:/1brain_body.txt')
x_values = dataframe[['Brain']]
y_values = dataframe[['Body']]

#train model on data
body_reg = linear_model.LinearRegression()
body_reg.fit(x_values, y_values)

#visualize results
plt.scatter(x_values, y_values)
plt.plot(x_values, body_reg.predict(x_values))
plt.show()
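
# Optional follow-up (illustrative addition, not part of the original):
# predict the body weight for a hypothetical brain weight of 50
# (same units as the input file).
print(body_reg.predict([[50]]))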
Esempio n. 46
0
import numpy as np
import pandas as pd

tar_gz_file_name = 'GRGS_anomaly.tar.gz'

# remove the existing extracted files and freshly extract again
#call('rm GSM-* 2> /dev/null',shell=True)
#call('tar xvzf '+tar_gz_file_name , shell=True)
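# A pure-Python alternative to the commented-out shell extraction above
# (sketch only, kept commented like the original calls):
#   import tarfile
#   with tarfile.open(tar_gz_file_name, 'r:gz') as tar:
#       tar.extractall()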

my_file = open(names_file, 'r')
raw = my_file.read()
my_file.close()

# list of all the file names
names = raw.split()

# The original data is not delimited; columns are separated according to the fixed widths below
mywidths = [8, 5, 3, 19, 19, 11, 11, 14, 14]

filename = 'GSM-2_2011113-2011122_0010_GRGS_0080_03v3.anomaly'
data = pd.read_fwf(filename, widths=mywidths, header=None, skiprows=3)
npway = np.genfromtxt(filename, delimiter=mywidths, skip_header=3)
np_data = np.asarray(data)
new = data.to_numpy()
nnew = new[:, 1:7]

print(npway[:, 1:7])
print(nnew)
np.savetxt('testfile.txt',
           npway[:, 1:7],
           delimiter=' ',
           fmt='%d %d %1.12e %1.12e %1.4e %1.4e')
print('saved the file : testfile.txt')
Esempio n. 47
0
                    write_file.write(r.read())


merge_books("Tigrigna", ti_books)
merge_books("Amharic", am_books)
merge_books("English", en_books)

# Creating a parallel Corpus

ti = pd.read_csv('Scrapped/Tigrigna/All.txt', delimiter="\n", header=None)
ti.columns = ["Tigrigna"]

en = pd.read_csv('Scrapped/English/All.txt', delimiter="\n", header=None)
en.columns = ["English"]

data = pd.concat([en, ti], axis=1)
print(data.head())

data.to_csv("en_ti.csv", index=False)

am = pd.read_fwf('Scrapped/Amharic/All.txt', delimiter="\n", header=None)
am.columns = ["Amharic"]

#reset 'data' dataframe
data = []

data = pd.concat([en, am], axis=1)
print(data.head())

data.to_csv("en_am.csv", index=False)
Esempio n. 48
0
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# importing data set

train_set = pd.read_fwf('kmeans_data.txt', header = None)

# K-means

# Feature Transformation (convert to polar coordinates)
def my_transform(x, y):
    r = np.sqrt(x**2 + y**2)
    phi = np.arctan2(y, x)
    return(r, phi)

def my_inv_transform(r, phi):
    x = r * np.cos(phi)
    y = r * np.sin(phi)
    return(x, y)
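
# Quick round-trip sanity check of the two helpers above (illustrative
# addition, not part of the original example):
_r, _phi = my_transform(3.0, 4.0)   # r = 5.0, phi = arctan2(4, 3)
assert np.allclose(my_inv_transform(_r, _phi), (3.0, 4.0))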

# Transform the original data
for i in range(np.shape(train_set)[0]):
  x = train_set[0][i]
  y = train_set[1][i]
  (train_set[0][i], train_set[1][i]) = my_transform(x, y)
  
# initialize means
mu_1 = train_set[0][0]
mu_2 = train_set[0][1]
Esempio n. 49
0
tofile_name = 'binary'  # name of the binary output file
data.tofile(tofile_name)  # export the array to a binary file
fromfile_data = np.fromfile(tofile_name, dtype='float32')  # read the binary file back
print(fromfile_data)  # print the data

####################################################################
# 3. Read data with pandas read_csv, read_fwf and read_table
import pandas as pd  # import the pandas library
csv_data = pd.read_csv('csv_data.csv',
                       names=['col1', 'col2', 'col3', 'col4',
                              'col5'])  # read the CSV data
print(csv_data)  # print the data

import pandas as pd  # import the pandas library
fwf_data = pd.read_fwf('fwf_data',
                       widths=[5, 5, 5, 5],
                       names=['col1', 'col2', 'col3', 'col4'])  # read the fixed-width data
print(fwf_data)  # print the data

import pandas as pd  # import the pandas library
table_data = pd.read_table('table_data.txt',
                           sep=';',
                           names=['col1', 'col2', 'col3', 'col4',
                                  'col5'])  # read the delimited table data
print(table_data)  # print the data

####################################################################
# 2.2.2 Get operational data from Excel
import xlrd  # import the xlrd library

# open the file
Esempio n. 50
0
    def process_visits(self) -> None:
        col_breaks = [
            (0, 16), (16, 24), (24, 32), (32, 40), (40, 43), (43, 48),
            (48, 50), (50, 52), (52, 57), (57, 62), (62, 64), (64, 73),
            (73, 82), (82, 91), (91, 100), (100, 109), (109, 118), (118, 127),
            (127, 136), (136, 145), (145, 154), (154, 162), (162, 170),
            (170, 178), (178, 186), (186, 194), (194, 202), (202, 203),
            (203, 206), (206, 208), (208, 212), (212, 216), (216, 218),
            (218, 220), (220, 221), (221, 231), (231, 232)
        ]

        col_names = [
            'MemberId', 'ServiceDate', 'AdmissionDate', 'DischargeDate',
            'CoveredDays', 'CPT', 'CptMod1', 'CptMod2', 'HCPCS', 'CPT2',
            'Cpt2Mod', 'PrincipalIcdDiagnosis', 'IcdDiagnosis2',
            'IcdDiagnosis3', 'IcdDiagnosis4', 'IcdDiagnosis5', 'IcdDiagnosis6',
            'IcdDiagnosis7', 'IcdDiagnosis8', 'IcdDiagnosis9',
            'IcdDiagnosis10', 'PrincipalIcdProcedure', 'IcdProcedure2',
            'IcdProcedure3', 'IcdProcedure4', 'IcdProcedure5', 'IcdProcedure6',
            'IcdIdentifier', 'DRG', 'DischargeStatus', 'UbRevenue',
            'UbBillType', 'NumberOfTimes', 'CmsPlaceOfService', 'ClaimStatus',
            'ProviderId', 'SupplementalData'
        ]

        # plain str/object dtypes (the original np.unicode / np.object
        # aliases are deprecated and removed in recent NumPy versions)
        col_types = {
            'MemberId': str,
            'ServiceDate': object,
            'AdmissionDate': object,
            'DischargeDate': object,
            'CoveredDays': str,
            'CPT': str,
            'CptMod1': str,
            'CptMod2': str,
            'HCPCS': str,
            'CPT2': str,
            'Cpt2Mod': str,
            'PrincipalIcdDiagnosis': str,
            'IcdDiagnosis2': str,
            'IcdDiagnosis3': str,
            'IcdDiagnosis4': str,
            'IcdDiagnosis5': str,
            'IcdDiagnosis6': str,
            'IcdDiagnosis7': str,
            'IcdDiagnosis8': str,
            'IcdDiagnosis9': str,
            'IcdDiagnosis10': str,
            'PrincipalIcdProcedure': str,
            'IcdProcedure2': str,
            'IcdProcedure3': str,
            'IcdProcedure4': str,
            'IcdProcedure5': str,
            'IcdProcedure6': str,
            'IcdIdentifier': str,
            'DRG': str,
            'DischargeStatus': str,
            'UbRevenue': str,
            'UbBillType': str,
            'NumberOfTimes': str,
            'CmsPlaceOfService': str,
            'ClaimStatus': str,
            'ProviderId': str,
            'SupplementalData': str
        }

        self.log.info('Reading input file')
        df = pd.read_fwf(self.config.read_value('setup',
                                                'visit.input.filename'),
                         colspecs=col_breaks,
                         names=col_names,
                         dtype=col_types,
                         parse_dates=[1, 2, 3])

        for index, rows in df.iterrows():
            v = {
                'ServiceDate':
                self.process_date(rows.ServiceDate),
                'AdmissionDate':
                self.process_date(rows.AdmissionDate),
                'DischargeDate':
                self.process_date(rows.DischargeDate),
                'CoveredDays':
                self.validate_missing_field(rows.CoveredDays),
                'CPT':
                self.validate_missing_field(rows.CPT),
                'CptMod1':
                self.validate_missing_field(rows.CptMod1),
                'CptMod2':
                self.validate_missing_field(rows.CptMod2),
                'HCPCS':
                self.validate_missing_field(rows.HCPCS),
                'CPT2':
                self.validate_missing_field(rows.CPT2),
                'Cpt2Mod':
                self.validate_missing_field(rows.Cpt2Mod),
                'PrincipalIcdDiagnosis':
                self.validate_missing_field(rows.PrincipalIcdDiagnosis),
                'IcdDiagnosis2':
                self.validate_missing_field(rows.IcdDiagnosis2),
                'IcdDiagnosis3':
                self.validate_missing_field(rows.IcdDiagnosis3),
                'IcdDiagnosis4':
                self.validate_missing_field(rows.IcdDiagnosis4),
                'IcdDiagnosis5':
                self.validate_missing_field(rows.IcdDiagnosis5),
                'IcdDiagnosis6':
                self.validate_missing_field(rows.IcdDiagnosis6),
                'IcdDiagnosis7':
                self.validate_missing_field(rows.IcdDiagnosis7),
                'IcdDiagnosis8':
                self.validate_missing_field(rows.IcdDiagnosis8),
                'IcdDiagnosis9':
                self.validate_missing_field(rows.IcdDiagnosis9),
                'IcdDiagnosis10':
                self.validate_missing_field(rows.IcdDiagnosis10),
                'PrincipalIcdProcedure':
                self.validate_missing_field(rows.PrincipalIcdProcedure),
                'IcdProcedure2':
                self.validate_missing_field(rows.IcdProcedure2),
                'IcdProcedure3':
                self.validate_missing_field(rows.IcdProcedure3),
                'IcdProcedure4':
                self.validate_missing_field(rows.IcdProcedure4),
                'IcdProcedure5':
                self.validate_missing_field(rows.IcdProcedure5),
                'IcdProcedure6':
                self.validate_missing_field(rows.IcdProcedure6),
                'IcdIdentifier':
                self.validate_missing_field(rows.IcdIdentifier),
                'DRG':
                self.validate_missing_field(rows.DRG),
                'DischargeStatus':
                self.validate_missing_field(rows.DischargeStatus),
                'UbRevenue':
                self.validate_missing_field(rows.UbRevenue),
                'UbBillType':
                self.validate_missing_field(rows.UbBillType),
                'NumberOfTimes':
                self.validate_missing_field(rows.NumberOfTimes),
                'CmsPlaceOfService':
                self.validate_missing_field(rows.CmsPlaceOfService),
                'ClaimStatus':
                self.validate_missing_field(rows.ClaimStatus),
                'ProviderId':
                self.validate_missing_field(rows.ProviderId),
                'SupplementalData':
                self.validate_missing_field(rows.SupplementalData),
            }

            v['AggregatedCodes'] = self.aggregate_codes(v)

            member_id = rows.MemberId
            if member_id not in self.visits:
                self.visits[member_id] = [v]
            else:
                self.visits[member_id].append(v)
Esempio n. 51
0
def recalculate_avg_hours(file_paths, age_bins):
    '''
    --------------------------------------------------------------------
    Creates a dataframe of all of the requested months, recalculates the
    working hours variable and calculates a weighted average number of
    hours worked for each age bin across the entire time period.
    --------------------------------------------------------------------
    INPUTS:
    age_bins   = (S,) vector, beginning cutoff ages for each age bin
    file_paths = list, location of file for each requested month

    OTHER FUNCTIONS AND FILES CALLED BY THIS FUNCTION: None

    OBJECTS CREATED WITHIN FUNCTION:
    names          = length 6 tuple, names for each column in data file
    colspecs       = length 6 tuple, tuples for indexes for each column
    list_months_df = list, dataframes for each month of data
    month_df       = dataframe, data read from data file
    df             = dataframe, concatenated dataframe of data from all months
    TotWklyHours   = series, contains weighted averages per age bin
    df_hrs_age     = dataframe, weighted averages of weekly hours per age bin

    FILES CREATED BY THIS FUNCTION: None

    RETURNS: df_hrs_age
    --------------------------------------------------------------------
    '''
    names = ('HWHHWGT', 'PRTAGE', 'PRTFAGE', 'PEHRUSL1', 'PEHRUSL2',
             'PEHRFTPT')
    colspecs = ((46, 56), (121, 123), (123, 124), (217, 219), (219, 221),
                (221, 223))

    list_months_df = []
    for filename in file_paths:
        month_df = pd.read_fwf(filename,
                               colspecs=colspecs,
                               header=None,
                               names=names,
                               index_col=False)
        list_months_df.append(month_df)

    # concatenate all dataframes
    df = pd.concat(list_months_df)

    # Drop all observations that:
    #   1) have no hours in either response (PEHRUSL1=-1) and (PEHRUSL2=-1)
    #   2) have [(PEHRUSL1=-1), (PEHRUSL2=-4), and (PEHRFTPT!=1)] or
    #           [(PEHRUSL1=-4), (PEHRUSL2=-1), and (PEHRFTPT!=1)]
    #   3) have age that is top-coded (PRTFAGE=1)
    df = df[(
        (df['PEHRUSL1'] >= 0) | (df['PEHRUSL2'] >= 0) | (df['PEHRFTPT'] == 1))
            & (df['PRTFAGE'] == 0)]

    # Create empty total weekly hours series that has the index from df
    TotWklyHours = pd.Series(data=np.nan * np.ones(df.shape[0]),
                             index=df.index)

    # Assume that observations that report at least 35 hours of work in the
    # typical week (PEHRFTPT=1) but report either n/a hours (-1) or varying
    # hours (-4) have a supply of 35.0 hours per week
    TotWklyHours[(df['PEHRUSL1'] < 0) & (df['PEHRUSL2'] < 0) &
                 (df['PEHRFTPT'] == 1)] = 35.0

    # Assume that observations that report at least 35 hours of work in the
    # typical week (PEHRFTPT=1) but report only positive hours in job 1
    # (PEHRUSL1>=0) and report n/a or varying hours in job 2 (PEHRUSL2<0)
    # have a supply of the maximum of PEHRUSL1 and 35.0
    TotWklyHours[(df['PEHRUSL1'] >= 0) & (df['PEHRUSL2'] < 0) &
                 (df['PEHRFTPT'] == 1)] = np.maximum(35.0, df['PEHRUSL1'])

    # Assume that observations that report at least 35 hours of work in the
    # typical week (PEHRFTPT=1) but report n/a or varying hours in job 1
    # (PEHRUSL1<0) and report only positive hours in job 2
    # (PEHRUSL2>=0) have a supply of the maximum of PEHRUSL2 and 35.0
    TotWklyHours[(df['PEHRUSL1'] < 0) & (df['PEHRUSL2'] >= 0) &
                 (df['PEHRFTPT'] == 1)] = np.maximum(35.0, df['PEHRUSL2'])

    # Observations that report only positive hours in job 1 (PEHRUSL1>=0)
    # and report n/a or varying hours in job 2 (PEHRUSL2<0) and do not
    # report at least 35 hours of work in the typical week (PEHRFTPT!=1)
    # have hours given by PEHRUSL1
    TotWklyHours[(df['PEHRUSL1'] >= 0) & (df['PEHRUSL2'] < 0) &
                 (df['PEHRFTPT'] != 1)] = df['PEHRUSL1']

    # Observations that report n/a or varying hours in job 1 (PEHRUSL1<0)
    # and report only positive hours in job 2 (PEHRUSL2>=0) and do not
    # report at least 35 hours of work in the typical week (PEHRFTPT!=1)
    # have hours given by PEHRUSL2
    TotWklyHours[(df['PEHRUSL1'] < 0) & (df['PEHRUSL2'] >= 0) &
                 (df['PEHRFTPT'] != 1)] = df['PEHRUSL2']

    # Observations that report positive hours in job 1 (PEHRUSL1>=0) and
    # positive hours in job 2 (PEHRUSL2>=0) and report at least 35 hours of
    # work in the typical week (PEHRFTPT=1) have hours given by the maximum
    # of PEHRUSL1+PEHRUSL2 and 35.0
    TotWklyHours[(df['PEHRUSL1'] >= 0) & (df['PEHRUSL2'] >= 0) &
                 (df['PEHRFTPT'] == 1)] = np.maximum(
                     35.0, df['PEHRUSL1'] + df['PEHRUSL2'])

    # Observations that report positive hours in job 1 (PEHRUSL1>=0) and
    # positive hours in job 2 (PEHRUSL2>=0) and do not report at least 35
    # hours of work in the typical week (PEHRFTPT!=1) have hours given by
    # PEHRUSL1+PEHRUSL2
    TotWklyHours[(df['PEHRUSL1'] >= 0) & (df['PEHRUSL2'] >= 0) &
                 (df['PEHRFTPT'] != 1)] = df['PEHRUSL1'] + df['PEHRUSL2']

    # Add TotWklyHours to DataFrame
    df['TotWklyHours'] = TotWklyHours

    # mark and group according to bin
    if age_bins is not None:
        age_bins = np.append(age_bins, 80)
        age_bins = list(age_bins)
        df['age_bin'] = pd.cut(df['PRTAGE'], age_bins)
        # print('df HWHHWGT=0=', df['HWHHWGT'][df['HWHHWGT'] == 0].shape)
        df_hrs_age = \
            df.groupby('age_bin').apply(lambda x:
                                        np.average(x.TotWklyHours,
                                                   weights=x.HWHHWGT))
    # group according to age
    else:
        df_hrs_age = \
            df.groupby('PRTAGE').apply(lambda x:
                                       np.average(x.TotWklyHours,
                                                  weights=x.HWHHWGT))

    return df_hrs_age
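
# Hedged usage sketch (not part of the original example): the CPS extract
# paths and the age-bin cutoffs below are hypothetical placeholders.
#   file_paths = ['cpsb201701.dat', 'cpsb201702.dat']
#   age_bins = np.array([16, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75])
#   df_hrs_age = recalculate_avg_hours(file_paths, age_bins)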
"""

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
import pandas as pd
from keras.preprocessing import text as keras_text, sequence as keras_seq
from sklearn.model_selection import train_test_split
from keras.layers import LSTM, GRU, Dropout

#Preparing training data
raw = pd.read_fwf(r'D:/sap/offline_challenge_to_send/xtrain_obfuscated.txt', header=None)
xtrain_obfuscated = pd.read_fwf(r'D:/sap/offline_challenge_to_send/xtrain_obfuscated.txt', header=None)
ytrain = pd.read_fwf(r'D:/sap/offline_challenge_to_send/ytrain.txt',header=None)
xtrain_obfuscated['label']=ytrain[0]
xtrain_obfuscated.rename(columns={0:'text'}, inplace=True)

#Reading test file
xtest_obfuscated = pd.read_fwf(r'D:/sap/offline_challenge_to_send/xtest_obfuscated.txt',header=None)
xtest_obfuscated.rename(columns={0:'text'}, inplace=True)

#One-hot encoding on training data
xtrain_encoded = pd.get_dummies(xtrain_obfuscated, columns=['label'])

#df_encoded_copy=df_encoded.copy()

#List sentences train
Esempio n. 53
0
def read_schedule(path):
    return pd.read_fwf(path,
                       widths=[8, 1],
                       names=["lookup", "schedule"],
                       header=None,
                       dtype=str)
Esempio n. 54
0
    def test_value_counts_datetime64(self, klass):

        # GH 3002, datetime64[ns]
        # don't test names though
        txt = "\n".join([
            'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
            'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'
        ])
        f = StringIO(txt)
        df = pd.read_fwf(f,
                         widths=[6, 8, 3],
                         names=["person_id", "dt", "food"],
                         parse_dates=["dt"])

        s = klass(df['dt'].copy())
        s.name = None
        idx = pd.to_datetime([
            '2010-01-01 00:00:00', '2008-09-09 00:00:00', '2009-01-01 00:00:00'
        ])
        expected_s = Series([3, 2, 1], index=idx)
        tm.assert_series_equal(s.value_counts(), expected_s)

        expected = np_array_datetime64_compat([
            '2010-01-01 00:00:00', '2009-01-01 00:00:00', '2008-09-09 00:00:00'
        ],
                                              dtype='datetime64[ns]')
        if isinstance(s, Index):
            tm.assert_index_equal(s.unique(), DatetimeIndex(expected))
        else:
            tm.assert_numpy_array_equal(s.unique(), expected)

        assert s.nunique() == 3

        # with NaT
        s = df['dt'].copy()
        s = klass([v for v in s.values] + [pd.NaT])

        result = s.value_counts()
        assert result.index.dtype == 'datetime64[ns]'
        tm.assert_series_equal(result, expected_s)

        result = s.value_counts(dropna=False)
        expected_s[pd.NaT] = 1
        tm.assert_series_equal(result, expected_s)

        unique = s.unique()
        assert unique.dtype == 'datetime64[ns]'

        # numpy_array_equal cannot compare pd.NaT
        if isinstance(s, Index):
            exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT])
            tm.assert_index_equal(unique, exp_idx)
        else:
            tm.assert_numpy_array_equal(unique[:3], expected)
            assert pd.isna(unique[3])

        assert s.nunique() == 3
        assert s.nunique(dropna=False) == 4

        # timedelta64[ns]
        td = df.dt - df.dt + timedelta(1)
        td = klass(td, name='dt')

        result = td.value_counts()
        expected_s = Series([6], index=[Timedelta('1day')], name='dt')
        tm.assert_series_equal(result, expected_s)

        expected = TimedeltaIndex(['1 days'], name='dt')
        if isinstance(td, Index):
            tm.assert_index_equal(td.unique(), expected)
        else:
            tm.assert_numpy_array_equal(td.unique(), expected.values)

        td2 = timedelta(1) + (df.dt - df.dt)
        td2 = klass(td2, name='dt')
        result2 = td2.value_counts()
        tm.assert_series_equal(result2, expected_s)
Esempio n. 55
0
import glob
import pandas as pd

# set directory to read files from
source_dir = "C:\\Users\\abhij\\Documents\\Career\\013 - Wire Wheel\\coding_challenge\\coding_challenge\\gfz-data"
output_file = "C:\\Users\\abhij\\Documents\\Career\\013 - Wire Wheel\\coding_challenge\\coding_challenge\\challenge1.csv"

file_list = glob.glob(source_dir + '/*.TAB')

# create data frame, empty list, for loop to iterate through file_list, thus adding correct rows from each file to a master dataframe
frame = pd.DataFrame()
list_ = []
#setting column widths b/c of use of pd.read_fwf {fixed width file}
col_widths = [(0, 6), (8, 10), (11, 13), (14, 16), (17, 19), (20, 23),
              (24, 26), (27, 29), (30, 32), (42, 45)]

for filename_ in file_list:

    df = pd.read_fwf(filename_, colspecs=col_widths, header=None)

    df = df.dropna(axis=0, subset=[4])
    df = df.dropna(axis=0, subset=[8])
    list_.append(df)
''' #Test on a single file
df = pd.read_fwf('C:\\Users\\abhij\\Documents\\Career\\013 - Wire Wheel\\coding_challenge\\coding_challenge\\gfz-data\\kp9706.tab', colspecs = col_widths, header = None)

#df = df[(df[8]>0)]
df = df.dropna(axis=0, subset=[4])
df = df.dropna(axis=0, subset=[8])
list_.append(df)
'''

frame = pd.concat(list_)
#provide column names since none provided in files
Esempio n. 56
0
columns = [(0, 15), (16, 56), (57, 60), (61, 65), (66, 74), (75, 105),
           (106, 115), (116, 124), (125, 134), (135, 144), (145, 154),
           (155, 163), (164, 173), (174, 183), (184, 193), (194, 202),
           (203, 212), (213, 222), (223, 229), (230, 236), (237, 243),
           (244, 250), (251, 252)]

if not os.path.exists(exceldir):
    os.mkdir(exceldir)

for txtfile in os.listdir(indir):
    if txtfile.endswith(".txt") or txtfile.endswith(".TXT"):
        # remove null
        copyfile(indir + '/' + txtfile, indir + '/' + txtfile + '.ORI')
        fi = open(indir + '/' + txtfile, 'r')
        data = fi.read()
        fi.close()

        data = data.replace('\x00', '')
        data = data.replace('\x0A\x0A', '\x0A')
        #data = filter(lambda x: not re.match(r'^*$', x), data)
        fo = open(indir + '/' + txtfile, 'w')
        fo.write(data)
        fo.close()

for csvfile in os.listdir(indir):
    if csvfile.endswith(".txt") or csvfile.endswith(".TXT"):
        df = pd.read_fwf(indir + '/' + csvfile, colspecs=columns)
        df = df.fillna(' ')
        excelfile = exceldir + '/' + os.path.basename(csvfile) + '.xlsx'
        df.to_excel(excelfile, index=False)
Esempio n. 57
0
import pandas as pa
from sklearn import linear_model
import matplotlib.pyplot as plt

dataframe = pa.read_fwf('linear_regression_demo/brain_body.txt')
x_values = dataframe[['Brain']]
y_values = dataframe[['Body']]

bodyreg = linear_model.LinearRegression()
bodyreg.fit(x_values, y_values)
print "x_values"
print x_values
print "y_values"
print y_values

predict = bodyreg.predict(x_values)
print(predict)
plt.scatter(x_values, y_values)
plt.plot(x_values, predict)
plt.show()
Esempio n. 58
0
from collections import Counter
import operator
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')
"""# LOAD EVENTOS"""

txt_event = '/content/drive/My Drive/TFM/03_DATASETS/eventos_2.rpt'
widths = [
    10, 39, 12, 16, 12, 39, 20, 10, 41, 16, 50, 39, 17, 18, 39, 41, 41, 41, 41,
    41, 30, 39
]

dfevent = pd.read_fwf(txt_event,
                      widths=widths,
                      header=1,
                      index_col=None)
rowcl, colcl = dfevent.shape
dfevent = dfevent[0:(rowcl - 3)]

new_header = [
    'TipoEvento', 'CodigoEvento', 'FechaEvento', 'UsuarioEvento', 'HoraEvento',
    'ClienteEvento', 'CodigoPostalEvento', 'PaísEvento', 'RepresentanteEvento',
    'TipoPortesEvento', 'FormaPagoEvento', 'PlazoPagoEvento',
    'SkuArticuloEvento', 'TipoArticuloEvento', 'FamiliaArticuloEvento',
    'SubfamiliaArticuloEvento', 'CantidadArticuloEvento',
    'AlmacenArticuloEvento', 'TarifaArticuloEvento', 'DescuentoArticuloEvento',
    'MotivoEvento', 'CosteEvento'
]
dfevent.columns = new_header
Esempio n. 59
0
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 14 12:31:13 2018

@author: Dipika
"""

import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

#read data
dataframe = pd.read_fwf('C:\\Dipika\\Resumes\\Siraj\\week1_0\\brain_body.txt')
x_values = dataframe[['Brain']]
y_values = dataframe[['Body']]

#train model on data
body_reg = linear_model.LinearRegression()
body_reg.fit(x_values, y_values)

#visualize results
plt.scatter(x_values, y_values)
plt.plot(x_values, body_reg.predict(x_values))
plt.show()

Esempio n. 60
-1
def get_fw_csv_data(filename, widths, header=False, remote=False, **kwargs):

    if remote:
        theory = CSVReader(FileName=[filename])
        theory.HaveHeaders = 0
        theory.MergeConsecutiveDelimiters = 1
        theory.UseStringDelimiter = 0
        theory.DetectNumericColumns = 1
        theory.FieldDelimiterCharacters = ' '
        theory.UpdatePipeline()

        theory_client = servermanager.Fetch(theory)

        table = Table(theory_client)

        data = table.RowData

    else:
        import pandas as pd
        if not header:
            data = pd.read_fwf(filename, sep=' ', header=None,
                               widths=widths, **kwargs)
        else:
            data = pd.read_fwf(filename, sep=' ', widths=widths, **kwargs)

    return data