Example #1
def denormalize(zip_name):
    """
    Denormalize diary and hhold NFS data.

    We do this so we don't have to do this join every time.
    It's not that expensive to store.

    Write it to CSV and to feather format.
    """

    path, ext = os.path.splitext(zip_name)
    if not os.path.exists(path):
        zip_file = zipfile.ZipFile(zip_name)
        zip_file.extractall(path=path)

    fnames = glob.glob(pjoin(path, '*'))
    diary_name = glob.fnmatch.filter(fnames,
                                     '*diary data.txt')[0]

    hhold_name = glob.fnmatch.filter(fnames,
                                     '*household data.txt')[0]

    diary = pd.read_csv(diary_name, sep="\t")

    hhold = pd.read_csv(hhold_name, sep="\t",
                        usecols=hhold_cols)
    dta = diary.merge(hhold)
    dta.to_csv(path.rstrip('/') + '.csv', index=False)
    if has_feather:
        feather.write_dataframe(dta, path.rstrip('/') + '.feather')
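The excerpt leans on several module-level names; a minimal sketch of the setup it assumes (none of this appears in the original):

# Assumed context for the excerpt above: pjoin, hhold_cols and has_feather
# are module-level names, and feather may not be installed everywhere.
import glob
import os
import zipfile
from os.path import join as pjoin

import pandas as pd

hhold_cols = None  # hypothetical: list the household columns you need here

try:
    import feather
    has_feather = True
except ImportError:
    has_feather = False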
Example #2
    def _ft(self, tblname, dbname=None, type=None, df=None):
        if type is None:
            type = self.type
        if dbname is None:
            dbname = self.name
        if df is None:
            # return the dataframe if it exists
            df = ft.read_dataframe(
                os.path.expanduser(
                    os.path.join(cf.options.basedir, "databases", "{}.{}.{}.ft".format(type, dbname, tblname))
                )
            )
            if "idx" in df.columns.values:
                df.set_index("idx", drop=True, inplace=True)
                df.index.name = None
            return df

        else:
            if not (df.index.dtype_str == "int64") and not (df.empty):
                df = df.copy()
                df["idx"] = df.index
            ft.write_dataframe(
                df,
                os.path.expanduser(
                    os.path.join(cf.options.basedir, "databases", "{}.{}.{}.ft".format(type, dbname, tblname))
                ),
            )
            if "idx" in df.columns.values:
                del df
            return
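Feather stores only columns, which is why _ft smuggles a non-int64 index through an "idx" column. A standalone sketch of the same round trip, assuming the feather-format package (file name illustrative):

import feather
import pandas as pd

df = pd.DataFrame({"value": [1.0, 2.0]}, index=["a", "b"])

# write: copy the index into a column, as _ft does for non-int64 indexes
out = df.copy()
out["idx"] = out.index
feather.write_dataframe(out.reset_index(drop=True), "demo.ft")

# read: restore the saved column as the index and clear its name, as above
back = feather.read_dataframe("demo.ft")
back.set_index("idx", drop=True, inplace=True)
back.index.name = None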
Example #3
def parse_all(nrows=None):
    # debug
    d = glob_files()
    dd = dict()
    for x in d:
        # df = pd.read_csv(x['filename'], dtype='str', header=None, skiprows=x['skiprows'])
        df = pd.read_csv(x['filename'], header=None, skiprows=x['skiprows'])
        # if header == ',PDG Application Service,FR,DFL,TP,DLG,OS(W),OS(NPW),OS(P),OS(NPP),SIMS,OC1,OC2,Total':
        #     header = 'Account Customer,FR,DFL,TP,DLG,OS(W),OS(NPW),OS(P),OS(NPP),SIMS,OC1,OC2,Total'
        # elif header == ',Application Service,FR,DFL,TP,DLG,Total':
        #     header = 'Account Customer,FR,DFL,TP,DLG,Total'
        # elif header == 'Local Authority,FR,DFL,TP,DLG,OS(W),OS(NPW),OS(P),OS(NPP),SIMS,OC1,OC2,Total':
        #     header = 'Account Customer,FR,DFL,TP,DLG,OS(W),OS(NPW),OS(P),OS(NPP),SIMS,OC1,OC2,Total'
        # elif header == 'Local Authority,FR,DFL,TP,DLG,OS(W),OS(NPW),OS(P),OS(NPP),SIMS,OC1,OC2,Total,':
        #     header = 'Account Customer,FR,DFL,TP,DLG,OS(W),OS(NPW),OS(P),OS(NPP),SIMS,OC1,OC2,Total,'
        # elif header == ',PDG Application Service,OS(W),OS(NPW),OS(P),OS(NPP),Total':
        #     header = 'Account Customer,OS(W),OS(NPW),OS(P),OS(NPP),Total'
        # assert header.startswith('Account Customer,') or header.startswith('Region'), 'got {}'.format(header)
        df = _mangle_with_header_etc(df, x)
        if x['name'] not in dd:
            dd[x['name']] = list()
        dd[x['name']].append(df)
    for k in dd:
        df = pd.concat(dd[k])
        kk = k.lower().replace(' ', '_')
        filename = kk + '.feather'
        print("writing {} {}".format(filename, df.shape))
        feather.write_dataframe(df, filename)
    return dd
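Each concatenated group lands in its own feather file named after its lowercased, underscore-joined key; a hypothetical read-back (file name illustrative):

import feather

# e.g. a group named "Local Authority" is written as local_authority.feather
df = feather.read_dataframe('local_authority.feather')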
Example #4
def mergeFeathers(files, mergedFilename, writeCSV, deleteSource=True):
    data = [feather.read_dataframe(f) for f in files if not f == '']
    if len(data) > 0:
        df = pd.concat(data, sort=False, axis=0, ignore_index=True, copy=False)
    else:
        print('mergeFeathers: No files to merge!')
        return ''
    
    if writeCSV:
        df.to_csv(mergedFilename)
    else:
        try:
            feather.write_dataframe(df, mergedFilename)
        except:
            print('Error writing merged feather: Trying CSV')
            print(df.shape)
            traceback.print_exc()
            try:
                df.to_csv(mergedFilename.replace('.feather', '.csv'))
            except:
                print('Error writing merged CSV: Writing list of unmerged temp files.')
                with open(mergedFilename.replace('.feather', '.csv'), 'w') as fh:
                    for f in files:
                        fh.write(f + '\n')
                deleteSource = False
    if deleteSource:
        for f in files:
            if not f == '':
                try:
                    os.remove(f)
                except:
                    print('Could not delete merged temp file: %s' % f)
    return mergedFilename
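A hypothetical call, with an empty string standing in for a sample that produced no temp file (the list comprehension above skips those):

# File names are illustrative; deleteSource defaults to True, so the
# temp files are removed once the merge succeeds.
merged = mergeFeathers(['batch1_tmp.feather', '', 'batch2_tmp.feather'],
                       'merged.feather', writeCSV=False)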
Example #5
def extract_svs(in_file, depth, chroms):
    """Create CSV file of structural variants of interest, for Circos plots.
    """
    allowed_chroms = set([str(x) for x in range(1, 23)])
    # print(allowed_chroms + 'X')
    # allowed_chroms = set(allowed_chroms.append('X'))

    df = pd.DataFrame(columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2", "file", "caller", "svtype"])

    log.debug("Building dataframe from VCF file...")

    callers = _find_svcaller(in_file)
    for idx, caller in enumerate(callers):
        for p1, p2, svtype in parse_svs(in_file, depth):
            if len(chroms) == 0 or (p1[0] in chroms or p2[0] in chroms):
                if p1[0] in allowed_chroms and p2[0] in allowed_chroms:
                    row = pd.Series({"chrom1": p1[0], "start1": p1[1], "end1": p1[2], 
                                     "chrom2": p2[0], "start2": p2[1], "end2": p2[2], 
                                     "file": in_file, "caller": caller, "svtype": svtype})
                    df = df.append(row, ignore_index=True)

    try:
        out_file = os.path.join(OUTPUT_DIR, os.path.basename(in_file))
        log.info("Exporting to interoperable feather file {}.feather".format(out_file))
        feather.write_dataframe(df, "{}.feather".format(out_file))
    except feather.ext.FeatherError:
        log.error("Failed to serialize feather object (most likely empty source dataframe)")
Example #6
def test_factor_rep():
    fpath1 = util.random_path()
    fpath2 = util.random_path()

    rcode = """
library(feather)

iris <- read_feather("{0}")
iris$Species <- as.factor(as.character(iris$Species))
write_feather(iris, "{1}")
""".format(fpath1, fpath2)
    tmp_paths = []

    try:
        iris = pd.read_csv('iris.csv')
        levels = ['setosa', 'versicolor', 'virginica']

        iris['Species'] = pd.Categorical(iris['Species'], categories=levels)

        feather.write_dataframe(iris, fpath1)
        util.run_rcode(rcode)

        result = feather.read_dataframe(fpath2)

        tmp_paths.extend([fpath1, fpath2])
        assert_frame_equal(result, iris)
    finally:
        util.remove_paths(tmp_paths)
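The round trip works because pandas Categorical columns map to R factors in the feather format; a smaller, pure-Python sketch of the property under test (path illustrative):

import feather
import pandas as pd

levels = ['setosa', 'versicolor', 'virginica']
df = pd.DataFrame({'Species': pd.Categorical(['setosa', 'virginica'],
                                             categories=levels)})
feather.write_dataframe(df, 'cat_demo.feather')
assert feather.read_dataframe('cat_demo.feather')['Species'].dtype == 'category'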
Example #7
    def __to_feather__(self, uri: str):
        if "feather" not in DataObject.registered_types:
            raise RuntimeError("Cannot convert to feather.")
        import feather

        feather.write_dataframe(self.inner_data, uri)
        return DataObject.registered_types["feather"].from_uri(uri, source=self)
Example #8
def import_data_set(in_path, sep=r"\s", name=None):
    data = pd.read_csv(in_path, sep=sep, header=None)
    if name is None:
        name = os.path.basename(in_path)
        name = os.path.splitext(name)
        name = name[0] + ".data"
    feather.write_dataframe(data, join(out_path, name))
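out_path and join come from the enclosing module; a hedged sketch of the assumed surroundings:

# Assumed module-level context for import_data_set above.
import os
from os.path import join

import feather
import pandas as pd

out_path = 'converted'  # hypothetical destination directory
os.makedirs(out_path, exist_ok=True)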
Example #9
    def test_overwritten_file(self):
        path = random_path()

        num_values = 100
        np.random.seed(0)

        values = np.random.randint(0, 10, size=num_values)
        feather.write_dataframe(pd.DataFrame({"ints": values}), path)

        df = pd.DataFrame({"ints": values[0 : num_values // 2]})
        self._check_pandas_roundtrip(df, path=path)
Example #10
    def _check_pandas_roundtrip(self, df, expected=None):
        path = random_path()
        self.test_files.append(path)
        feather.write_dataframe(df, path)
        if not os.path.exists(path):
            raise Exception('file not written')

        result = feather.read_dataframe(path)
        if expected is None:
            expected = df

        assert_frame_equal(result, expected)
Example #11
def maybe_parse(path):
    feather_file = path + ".feather"
    if os.path.exists(feather_file):
        print("loading %s from cache" % path)
        df = feather.read_dataframe(feather_file)
        df = df.set_index("ut_ms")
        return df
    else:
        print("parsing %s" % path)
        df = parse(path)
        feather.write_dataframe(df.reset_index(), feather_file)
        return df
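Feather cannot carry the index itself, hence the reset_index on write and set_index on read. The same cache pattern as a self-contained sketch, with parse standing in for any expensive loader:

import os

import feather
import pandas as pd

def cached(path, parse):
    feather_file = path + ".feather"
    if os.path.exists(feather_file):
        # cache hit: restore the index that was flattened on write
        return feather.read_dataframe(feather_file).set_index("ut_ms")
    df = parse(path)  # expensive parse happens only on a cache miss
    feather.write_dataframe(df.reset_index(), feather_file)
    return df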
Example #12
def save_df(df, path, index=False):
    if path == '-' or path is None:
        print(default_csv_writer(df, None, index=index))
    elif file_format(path) != 'feather':
        default_csv_writer(df, path, index=index)
    elif featherpmm and feather:
        featherpmm.write_dataframe(featherpmm.Dataset(df, name='verification'),
                                   path)
    elif feather:
        feather.write_dataframe(df, path)
    else:
        raise Exception('The Python feather module is not installed.\n'
                        'Use:\n    pip install feather-format\n'
                        'to add capability.\n')
Example #13
def mergeSamples(batchFolder, extractionFunc, extractionKwargs, matchStr='*.feather', test=False, metaCols=None, filters=None):
    """Go through each feather file (sample) in a batch folder,
    apply the analysis function, and merge together."""
    mDf = pd.read_csv(opj(batchFolder, 'metadata.csv'))
    featherList = glob(opj(batchFolder, matchStr))
    featherLU = matchSamples(batchFolder, matchStr=matchStr, test=test)
    
    if metaCols is not None:
        if 'sample_name' not in metaCols:
            metaCols.append('sample_name')
        mDf = mDf[metaCols]
    
    mDf = mDf.set_index('sample_name')
    feathers = []
    i = 1
    print('Extracting from batch %s (%s)' % (batchFolder, time.ctime()))
    sttime = time.time()
    for sample_name, fn in featherLU.items():
        filterOut = False
        if filters is not None:
            """Keep only samples whose meta data matches all of the filters"""
            filterOut = False
            for col, valList in filters.items():
                if mDf.loc[sample_name, col] not in valList:
                    filterOut = True
                    break
        if not filterOut:
            f = feather.read_dataframe(fn)
            # print('Extracting from sample %s (%d of %d)' % (sample_name, i, len(featherLU)))
            try:
                x = extractionFunc(f, **extractionKwargs)
                x.loc[:, 'sample_name'] = sample_name
            except:
                print('Error extracting from batch %s, sample %s (%d)' % (batchFolder, sample_name, i))
                print(x.shape)
                print(x.head())
                traceback.print_exc()
            feathers.append(x)
        i += 1
    if len(feathers) > 0:
        outDf = pd.merge(pd.concat(feathers, axis=0), mDf.reset_index(), how='left', left_on='sample_name', right_on='sample_name')
        print('Finished batch %s (%1.0f minutes)' % (batchFolder, (time.time() - sttime) / 60), flush=True)

        """Write to a temporary merge file and return filename"""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.feather', prefix='merged_tmp_', dir=batchFolder, delete=False) as fh:
            tmpFilename = fh.name
        feather.write_dataframe(outDf, tmpFilename)
    else:
        tmpFilename = ''
    return tmpFilename
Example #14
File: awj.py Project: tacaswell/awj
    def __setitem__(self, key, df):
        fn = self._filename_from_key(key)
        feather.write_dataframe(df, fn)
        self._fn_cache[key] = fn
        self._sz_cache[key] = os.stat(fn).st_size
        if key in self._heap_map:
            self._heap_map[key][0] = time.time()
            # ensure the heap invariant
            heapq.heapify(self._heap)
        else:
            heap_entry = [time.time(), key]
            self._heap_map[key] = heap_entry
            heapq.heappush(self._heap, heap_entry)

        self.__prune_files()
Example #15
def to_feather(df, path):
    """
    Write a DataFrame to the feather-format

    Parameters
    ----------
    df : DataFrame
    path : string
        File path

    """
    path = _stringify_path(path)
    if not isinstance(df, DataFrame):
        raise ValueError("feather only support IO with DataFrames")

    feather = _try_import()
    valid_types = {'string', 'unicode'}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        raise ValueError("feather does not support serializing {} "
                         "for the index; you can .reset_index()"
                         "to make the index into column(s)".format(
                             type(df.index)))

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError("feather does not support serializing a "
                         "non-default index for the index; you "
                         "can .reset_index() to make the index "
                         "into column(s)")

    if df.index.name is not None:
        raise ValueError("feather does not serialize index meta-data on a "
                         "default index")

    # validate columns
    # ----------------

    # must have value column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    feather.write_dataframe(df, path)
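All three index checks above point at the same remedy; a short sketch of preparing a frame so the validation passes (data illustrative):

import pandas as pd

df = pd.DataFrame({"x": [1, 2]}, index=pd.Index(["a", "b"], name="key"))
# A named string index would be rejected by the checks above; fold it
# into an ordinary column so only the default RangeIndex remains.
df = df.reset_index()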
Example #16
    def test_num_rows_attr(self):
        df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
        path = random_path()
        self.test_files.append(path)
        feather.write_dataframe(df, path)

        reader = feather.FeatherReader(path)
        assert reader.num_rows == len(df)

        df = pd.DataFrame({})
        path = random_path()
        self.test_files.append(path)
        feather.write_dataframe(df, path)

        reader = feather.FeatherReader(path)
        assert reader.num_rows == 0
Example #17
def read_dtas(fname):
    savename = fname.split(".")[0] + ".feather"

    with open(fname, "rb") as pa:
        df = pd.read_stata(pa)

    df2 = df.copy()
    cols = []
    for i in range(len(df.columns)):
        try:
            cols.append(df.columns[i].encode('latin-1').decode('gb18030'))
        except:
            cols.append("")

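    # the gb18030 decode fails for column 22, so the loop above stored "";
    # restore the (partially garbled) name by hand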
    if cols[22] == "":
        cols[22] = '登记注册机关级鸸ど绦姓管聿棵'

    df2.columns = cols

    print(df2.columns)
    ##E.g.:
    #Index(['_组织机构代码', '_单位详细名称', '_行业代码', '_主要业务活动1', '_主要业务活动2', '_主要业务活动3',
    #       '_行政区划代码', '_省', '_地', '_县', '_乡', '_地址', '_街道办事处', '_法定代表人', '_开业年',
    #       '_开业月', '_区号', '_固定电话', '_分机号', '_传真号码', '_传真分机号', '_邮政编码',
    #       '登记注册机关级鸸ど绦姓管聿棵', '_登记注册号工商行政管理部门', '_登记注册机关级别编制部门', '_登记注册号编制部门',
    #       '_登记注册机关级别民政部门', '_登记注册号民政部门', '_登记注册机关级别国家税务部门', '_登记注册号国家税务部门',
    #       '_登记注册机关级别地方税务部门', '_登记注册号地方税务部门', '_登记注册机关级别其他', '_登记注册号其他', '_登记注册类型',
    #       '_企业控股情况', '_隶属关系', '_企业营业状态', '_执行会计制度类别', '_代码', '_名称', '_年初存货',
    #       '_年初产成品', '_流动资产合计', '_应收账款', '_存货', '_产成品', '_固定资产合计', '_固定资产原价',
    #       '_累计折旧', '_本年折旧', '_资产总计', '_流动负债合计', '_应付账款', '_非流动负债合计', '_负债合计',
    #       '_所有者权益合计', '_实收资本', '_国家资本', '_集体资本', '_法人资本', '_个人资本', '_港澳台资本',
    #       '_外商资本', '_营业收入', '_主营业务收入', '_营业成本', '_主营业务成本', '_营业税金及附加',
    #       '_主营业务税金及附加', '_其他业务利润', '_销售费用', '_管理费用', '_税金', '_财务费用', '_利息收入',
    #       '_利息支出', '_资产减值损失', '_公允价值变动收益', '_投资收益', '_营业利润', '_营业外收入', '_补贴收入',
    #       '_营业外支出', '_利润总额', '_应交所得税', '_应付职工薪酬', '_应交增值税', '_工业总产值', '_工业销售产值',
    #       '_出口交货值'],
    #      dtype='object')

    for i, col in enumerate(cols):
        print(i, col)
        if df2[col].dtype == 'object':
            print("Object type found, attempting conversion")
            df2[col] = df2[col].apply(lambda x: x.encode('latin-1').decode('gb18030'))

    feather.write_dataframe(df2, savename)
Example #18
    def _load_data(self, data_file, columns, usecols):
        fth_file = data_file + '.fth'
        if not os.path.exists(fth_file):
            logging.info('convert csv file to feather')
            df_tmp = pd.read_csv(data_file,
                                 sep='\t',
                                 names=columns,
                                 usecols=usecols)
            logging.info('csv data shape {}'.format(df_tmp.shape))
            feather.write_dataframe(df_tmp, fth_file)
            df_tmp.head()
        logging.info('loading data {}'.format(fth_file))
        df_data = feather.read_dataframe(fth_file,
                                         columns=columns,
                                         use_threads=True)
        logging.info('data shape {}'.format(df_data.shape))
        # print(df_data.head()['package_name'])
        return df_data
Example #19
def concat_pieces(pieces, fname, featherfile, statafile, label, dblabel,
                  add2db, user, password, host):
    #add_frame_to_db(all_float_pieces,'FLOAT_VARS')
    if len(pieces) > 0:
        df = pd.concat(pieces, ignore_index=True,
                       sort=False)  #changed  20180716 (sort=False)
        df = order(df, ['RSSD9001', 'RSSD9999', 'year', 'qid'])
        df.to_csv(fname, index=False, sep="^")
        feather.write_dataframe(df, featherfile)
        if statafile != "0":
            if check_file_exists(statafile, "w"):
                df.to_stata(statafile)
                makelables(df, statafile)
        print('\n%s has a count of %s.' % (label, df["qid"].count()))
        if add2db == 1:
            add_frame_to_db(df, dblabel.upper(), user, password, host)
    else:
        print(label + ' is empty')
Example #20
def dataframe_to_display(data_frame):
    """Serialize a DataFrame to JSON as a list of feather-encoded byte values.
    """
    # Write the array to a temporary file
    filepath = tempfile.mkstemp()[1]
    feather.write_dataframe(data_frame, filepath)

    # Read the temporary file as bytes
    array_data = open(filepath, 'rb').read()
    os.remove(filepath)

    # Convert raw bytes to a list of ints
    array_bytes_as_ints = []
    for d in array_data:
        array_bytes_as_ints.append(d)

    # Return the JSON representation of the list of ints
    return json.dumps(array_bytes_as_ints)
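Whatever consumes this JSON reverses the steps; a hedged sketch of the decode side (not part of the original snippet):

import json
import tempfile

import feather

def display_to_dataframe(json_payload):
    # Rebuild the feather file from the JSON list of ints, then read it back.
    raw = bytes(json.loads(json_payload))
    with tempfile.NamedTemporaryFile(suffix='.feather', delete=False) as fh:
        fh.write(raw)
    return feather.read_dataframe(fh.name)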
Example #21
def gen_df_bec_by_hs6():
    df_bec_by_hs6_path_str = r'SourceMaterial\df_bec_by_hs6.feather'
    df_bec_by_hs6_path = Path(df_bec_by_hs6_path_str)

    if df_bec_by_hs6_path.exists():
        df_bec_by_hs6 = feather.read_dataframe(df_bec_by_hs6_path)
    else:
        df_bec_raw = gen_df_bec_hs()
        df_bec_by_hs6 = pd.DataFrame()
        for index, row in df_bec_raw.iterrows():
            df = pd.DataFrame()
            df['hs6'] = pd.Series(row['hs6'].split(','))
            df['category'] = row['大類']
            df_bec_by_hs6 = pd.concat([df_bec_by_hs6, df])

        df_bec_by_hs6 = df_bec_by_hs6.drop_duplicates(['hs6'])
        feather.write_dataframe(df_bec_by_hs6, df_bec_by_hs6_path_str)
    return df_bec_by_hs6
Example #22
    def _check_pandas_roundtrip(self,
                                df,
                                expected=None,
                                path=None,
                                columns=None):
        if path is None:
            path = random_path()

        self.test_files.append(path)
        feather.write_dataframe(df, path)
        if not os.path.exists(path):
            raise Exception('file not written')

        result = feather.read_dataframe(path, columns)
        if expected is None:
            expected = df

        assert_frame_equal(result, expected)
Example #23
def rewrite_as_feather_file():
    with open('spo2_records.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        csv_lines = [row for row in csv_reader][1:]
        num_total = len(csv_lines)
        data_list = list()
        for i, row in enumerate(csv_lines[:10]):
            record = row[1]
            patient = int(record[1:7])
            with open('SpO2_and_hypoxemia_labels/' + record + '.txt',
                      'r') as json_file:
                data = json.load(json_file)
                data['patient'] = patient
                data['experiences_hypoxemia'] = np.any(
                    np.asarray(data['hypoxemia']) == 1)
                data_list.append(data)
        df = pd.DataFrame(data_list)
        feather.write_dataframe(df, 'spo2_hypoxemia.feather')
Example #24
def save_to_r_dataset(df, path):
    """Convert pandas dataframe to r dataframe.

    Parameters
    ----------
    df : dataframe
        Pandas dataframe.
    path : str
        Path to save.

    Returns
    -------
    None

    """
    feather.write_dataframe(df, path)
    return None
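On the R side such a file is opened with feather::read_feather; a hypothetical call from Python:

import pandas as pd

save_to_r_dataset(pd.DataFrame({'x': [1, 2, 3]}), 'for_r.feather')
# then in R: df <- feather::read_feather("for_r.feather")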
Example #25
def feather_clean(in_directory):
    """ Utility function to clean feather files"""
    # in_directory = UpdateSP500Data.TOP_LEVEL_PATH / 'feather'
    Path.is_dir(in_directory)
    all_files = os.listdir(in_directory)
    for item in all_files:
        if item.endswith('.feather'):
            # Remove options with strikes at 5$
            option_df = feather.read_dataframe(in_directory / item)
            idx = option_df['strike'] == 5
            option_df = option_df.drop(option_df.index[idx])
            # # Remove Quarterly options
            # idx2 = option_df['root'] == 'SPXQ'
            # option_df = option_df.drop(option_df.index[idx2])
            # # Remove Monthly options
            # idx2 = option_df['root'] == 'SPXM'
            # option_df = option_df.drop(option_df.index[idx2])
            feather.write_dataframe(option_df, str(in_directory / item))
Example #26
def convert_to_feather(file_path, out_path):
    '''Convert the Environment & Climate Change Canada csv files into feather files, to allow for faster processing

    Parameters
    ----------
    file_path : string
        file path to the csv files provided by Environment & Climate Change Canada, not including the name of the file 
    out_path : string
        where you want the new feather file to be written to in the computer, not including the new file name
    '''
    for station_name in os.listdir(file_path):
        file = file_path + station_name
        df = pd.read_csv(file,
                         sep=',',
                         engine='c',
                         low_memory=False,
                         encoding='latin1')
        feather.write_dataframe(df, out_path + station_name[:-4] + '.feather')
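Both path arguments above must already end in a separator; a variant built on os.path.join avoids that assumption (a sketch, not the original):

import os

import feather
import pandas as pd

def convert_to_feather_joined(file_path, out_path):
    for station_name in os.listdir(file_path):
        df = pd.read_csv(os.path.join(file_path, station_name),
                         sep=',', engine='c', low_memory=False,
                         encoding='latin1')
        base = os.path.splitext(station_name)[0]  # safer than name[:-4]
        feather.write_dataframe(df, os.path.join(out_path, base + '.feather'))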
Example #27
def import_solar_data():
    file = request.files['file']
    file_name = os.path.splitext(file.filename)[0]
    file_path = 'data/temp/' + file_name
    file.save(file_path)

    solar_data = generic_data_to_dataframe(file_path)

    # Check if the file format is in the correct format
    try:
        solar_data.rename(columns={solar_data.columns[0]: 'Datetime'},
                          inplace=True)
    except:
        return jsonify({'error': 'Invalid data format.'})

    feather.write_dataframe(solar_data,
                            'data/solar_profiles/' + file_name + '.feather')
    return jsonify({'message': "Successfully imported file."})
Example #28
File: plot.py Project: jni/prin
def main(argv):
    args = _argument_parser().parse_args(argv)
    if args.data_frame is not None and os.path.exists(args.data_frame):
        df = feather.read_dataframe(args.data_frame)
    else:
        from . import parsers
        parser = getattr(parsers, args.format).parser
        print('reading network data')
        network = parser(args.datafile, max_num_nodes=args.max_num_nodes)
        print('extracting data')
        df = network_properties(network,
                                in_degree_threshold=args.in_degree_threshold,
                                pagerank_threshold=args.pagerank_threshold,
                                damping=args.damping)
    if args.data_frame is not None:
        feather.write_dataframe(df, args.data_frame)
    print('preparing plots')
    bokeh_plot(df, output=args.output_file, loglog=args.loglog)
Example #29
def main(data_input, cluster_input, output):

    basicConfig(level=INFO,
                handlers=[
                    StreamHandler(),
                    FileHandler('{}.log'.format(output), mode='w')
                ])

    # Load the data.

    info('Loading data')

    data = feather.read_dataframe(data_input)

    info('Result: {}'.format(data.shape))

    info('Loading clusters')

    clusters = pd.read_csv(cluster_input, index_col='subject_id', squeeze=True)

    info('Result: {}'.format(clusters.shape))

    # Filter the data.

    info('Filtering data')

    data = data.loc[data['visit_id'] == 1].drop('visit_id',
                                                axis=1).set_index('subject_id')

    data = data.loc[clusters.index]

    # Merge the cluster assignments in.

    info('Merging clusters')

    data['classification'] = clusters

    # Write the output.

    info('Writing output')

    data['classification'] = data['classification'].astype('category')

    feather.write_dataframe(data.reset_index(), output)
Example #30
    def _check_pandas_roundtrip(self, df, expected=None, path=None, columns=None, null_counts=None):
        if path is None:
            path = random_path()

        self.test_files.append(path)
        feather.write_dataframe(df, path)
        if not os.path.exists(path):
            raise Exception("file not written")

        result = feather.read_dataframe(path, columns)
        if expected is None:
            expected = df

        assert_frame_equal(result, expected)

        if null_counts is None:
            null_counts = np.zeros(len(expected.columns))

        np.testing.assert_array_equal(self._get_null_counts(path, columns), null_counts)
Example #31
def process_all(allrundirs):
    allcats = []

    for rundir in allrundirs:
        runname = rundir.split('/')[0]
        print('Loading {} ... '.format(runname), end='')
        sys.stdout.flush()
        allcats.append(load_catalog(rundir, runname))
        print('{:,}'.format(len(allcats[-1])))

    print('Loading finished. Saving merged table... ', end='')
    sys.stdout.flush()
    allcats = pd.concat(allcats, axis=0).sort_values(
        by=['run_name', 'start_time', 'channel']).reset_index(drop=True)
    print('({:,} reads total)'.format(len(allcats)))

    feather.write_dataframe(allcats, 'sequencing_summary.feather')

    print('Done.')
Example #32
def subset_data(session_id, pvalue_slider_value, foldchange_slider_value,
                basemean_slider_value, cluster_dropdown_value,
                go_dropdown_value, organism_type):
    if session_id is None:
        raise dash.exceptions.PreventUpdate()
    else:
        df = feather.read_dataframe('temp_data_files/' + session_id)
        df = df.rename(index=str, columns={'symbol': 'gene_ID'})
        if cluster_dropdown_value is not None:
            df = df[df['cluster'] == cluster_dropdown_value]
        if go_dropdown_value is None:
            pass
        elif len(go_dropdown_value) == 0:
            pass
        else:
            print("go dropdown triggered")
            # df = df[df['gene_ID'].isin(go_assocs.golist_to_collapsed_gene_list(go_dropdown_value))]
            if organism_type == 'mouse':
                df = df[df['gene_ID'].isin(
                    mouse_go_assocs.golist_to_collapsed_gene_list(
                        go_dropdown_value))]

        print(go_dropdown_value)
        # print(cluster_dropdown_value)
        # print(go_dropdown_value)
        if pvalue_slider_value is not None:
            min_slider = pvalue_slider_value[0]
            max_slider = pvalue_slider_value[1]
            df = df[df['neg_log10_padj'].between(min_slider, max_slider)]
        if foldchange_slider_value is not None:
            min_slider = foldchange_slider_value[0]
            max_slider = foldchange_slider_value[1]
            df = df[df['log2FoldChange'].between(min_slider, max_slider)]
        if basemean_slider_value is not None:
            min_slider = basemean_slider_value[0]
            max_slider = basemean_slider_value[1]
            # Handle exception for scRNAseq data case where no basemean
            try:
                df = df[df['log10basemean'].between(min_slider, max_slider)]
            except:
                pass
    feather.write_dataframe(df, 'temp_data_files/' + session_id + '_subset')
    return None
Example #33
def __saveToS32(obj, bucket, s3path, prefix=""):
    clz = obj.__class__.__name__
    lastobject = None
    if isinstance(obj, pandas.core.frame.DataFrame):
        import feather as ft
        ft.write_dataframe(obj, "/tmp/{}_lastobject.feather".format(prefix))
        lastobject = "{}_lastobject.feather".format(prefix)
    elif clz == "dict":
        import json
        with open("/tmp/{}_lastobject.json".format(prefix), "w") as outfile:
            json.dump(obj, outfile)
            lastobject = "{}_lastobject.json".format(prefix)
    CHUNK = 52428800
    if lastobject is not None:
        import math, os
        import boto
        from boto.s3.connection import S3Connection
        from boto.s3.key import Key
        c = boto.connect_s3()
        b = c.get_bucket(bucket)
        source_size = os.stat("/tmp/{}".format(lastobject)).st_size
        keyname = "{}/{}".format(s3path, lastobject)
        if source_size >= CHUNK:
            from filechunkio import FileChunkIO
            ## multipart upload
            ## http://boto.cloudhackers.com/en/latest/s3_tut.html#storing-large-data
            chunk_count = int(math.ceil(source_size / float(CHUNK)))
            mp = b.initiate_multipart_upload(keyname)
            try:
                for i in range(chunk_count):
                    offset = CHUNK * i
                    bytes = min(CHUNK, source_size - offset)
                    with FileChunkIO("/tmp/{}".format(lastobject),
                                     'r',
                                     offset=offset,
                                     bytes=bytes) as fp:
                        mp.upload_part_from_file(fp, part_num=i + 1)
                mp.complete_upload()
            except:
                mp.cancel_upload()
                raise
        else:
            k = Key(b)
            k.key = keyname
            k.set_contents_from_filename("/tmp/{}".format(lastobject))
Example #34
def main(basic_input, medication_input, joint_injection_input, joint_input,
         output, filter_output):

    basicConfig(level=INFO,
                handlers=[
                    StreamHandler(),
                    FileHandler('{}.log'.format(output), mode='w')
                ])

    basic_data, medication_data, joint_injection_data, joint_data = load_data(
        basic_input, medication_input, joint_injection_input, joint_input)

    info('Generating masks')

    masks_basic = get_basic_masks(basic_data)

    masks_medications = get_medication_masks(medication_data,
                                             joint_injection_data)

    mask_joints = get_joint_count_masks(joint_data)

    mask_all = masks_basic['basic_combined'] & masks_medications[
        'medications_combined'] & mask_joints['joint_count']

    masks_all = masks_basic.join(masks_medications,
                                 how='outer').join(mask_joints, how='outer')

    masks_all['all_combined'] = mask_all

    info('{} patients will be retained'.format(mask_all.sum()))

    info('Filtering data')

    data = joint_data.set_index('subject_id').loc[mask_all.index[
        mask_all == True]].reset_index()

    info('Writing outputs')

    data.info()

    feather.write_dataframe(data, output)

    masks_all.to_csv(filter_output)
Example #35
def write_conslengths_feather(strand_lengths, strand_ids, seq_id):
    strand_ids = [Counter(x) for x in strand_ids]
    for x in range(0, 23):
        if len(strand_ids[x]) == 0:
            strand_ids[x] = 0
        else:
            temp_size = []
            for y in range(0, x):
                try:
                    temp_size.append(strand_ids[x][y])
                except:
                    temp_size.append(0)
            strand_ids[x] = temp_size

    strand_lengths = [Counter(x) for x in strand_lengths]
    for x in range(0, 23):
        if len(strand_lengths[x]) == 0:
            strand_lengths[x] = 0
        else:
            temp_size = []
            for y in range(1, x + 1):
                try:
                    temp_size.append(strand_lengths[x][y])
                except:
                    temp_size.append(0)
            strand_lengths[x] = temp_size
    #print(strand_lengths)

    barrel_sizes = [8, 10, 12, 14, 16, 18, 22]
    export_lengths = np.zeros([7, 22])
    export_IDs = np.zeros([7, 22])
    for value in range(0, len(barrel_sizes)):
        for x in range(0, barrel_sizes[value]):
            export_lengths[value][x] = strand_lengths[barrel_sizes[value]][x]
            export_IDs[value][x] = strand_ids[barrel_sizes[value]][x]
    print(export_IDs)
    #print(export_lengths)
    export_IDs = pandas.DataFrame(export_IDs)
    feather.write_dataframe(export_IDs.copy(),
                            "data/ConsStrandIDs%s.feather" % seq_id)
    export_lengths = pandas.DataFrame(export_lengths)
    feather.write_dataframe(export_lengths.copy(),
                            "data/ConsLengths%s.feather" % seq_id)
Example #36
def main(location, output_file, supported_loc={"BC", "WA"}):
    """Downloads data from the url and saves the dataframe in feather format

    Parameters
    ===========
    location: str
        Location id(s) of the page to fetch the data. For example 'BC', 'WA'
    output_file: str
        File name along with the path for saving the data
    supported_loc: set
        Set containing strings of all supported location ids.
    """


    loc_ids = location.split(" ")
    location_df = list()


    for loc_id in loc_ids:
        if loc_id not in supported_loc:
            raise Exception("{} location not supported. Location should be from: ".format(loc_id) + str(supported_loc))
        try:
            url = "http://www.nuforc.org/webreports/ndxl"+loc_id+".html"
            location_df.append(pd.read_html(url)[0])
        except:
            raise Exception("URL " + url + " is not reachable")


    aliens_df = pd.concat(location_df, ignore_index=True)

    if output_file.split(".")[-1] == "feather":
        try:
            feather.write_dataframe(aliens_df, output_file)
        except:
            raise NotADirectoryError(output_file + " path does not exist.")
    elif output_file.split(".")[-1] == "csv":
        try:
            aliens_df.to_csv(output_file, index=False)
        except:
            raise NotADirectoryError(output_file + " path does not exist.")
    else:
        raise Exception("File format not supported")
Example #37
def appData():

    #fetch tables from energydata.uct.ac.za
    apikey = input(
        'Enter your APIKEY from http://energydata.uct.ac.za/user/YOUR_USERNAME: ')
    headers = {'Authorization': apikey}
    ckan = ckanapi.RemoteCKAN('http://energydata.uct.ac.za/',
                              apikey=apikey,
                              get_only=True)

    tables = ckan.action.package_show(id='dlr-database-tables-94-14')
    for i in range(0, len(tables['resources'])):
        name = tables['resources'][i]['name']
        print('... fetching ' + name + ' from energydata.uct.ac.za')
        r_url = tables['resources'][i]['url']
        # Download resources from data portal
        request = urllib.request.Request(r_url, headers=headers)
        with urllib.request.urlopen(request) as response, open(
                os.path.join(csv_table, name + '.csv'), 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        table = pd.read_csv(os.path.join(csv_table, name + '.csv'))
        #write profiles to disk
        feather.write_dataframe(table,
                                os.path.join(feather_table, name + '.feather'))

    profiles = ckan.action.package_show(id='dlr-seasonal-adtd-profiles')
    for i in range(0, len(profiles['resources'])):
        name = profiles['resources'][i]['name']
        print('... fetching ' + profiles['resources'][i]['name'] +
              ' from energydata.uct.ac.za')
        r_url = profiles['resources'][i]['url']
        # Download resources from data portal
        request = urllib.request.Request(r_url, headers=headers)
        with urllib.request.urlopen(request) as response, open(
                os.path.join(csv_adtd, name + '.csv'), 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        adtd = pd.read_csv(os.path.join(csv_adtd, name + '.csv'))
        #write profiles to disk
        feather.write_dataframe(adtd,
                                os.path.join(feather_adtd, name + '.feather'))
    return
Example #38
def get_current_stus_ada(save_path, **context):

    students = feather.read_dataframe("{0}/students.feather".format(save_path))
    attend_student = feather.read_dataframe(
        "{0}/attend_student.feather".format(save_path))

    current_students = students[students.enroll_status == 0].copy()
    current_students = pd.DataFrame(current_students['student_number'])

    attend_student_current = pd.merge(attend_student,
                                      current_students,
                                      on="student_number",
                                      how="inner")

    attend_student_current_grouped = attend_student_current.groupby(
        ['student_number', 'lastfirst', 'grade_level', 'school_abbrev'],
        as_index=False)

    attend_student_ytd = (attend_student_current_grouped.aggregate({
        'enrolled':
        'sum',
        'present':
        'sum',
        'absent':
        'sum'
    }))

    attend_student_ytd = attend_student_ytd.assign(
        ada=attend_student_ytd.present / attend_student_ytd.enrolled * 100)

    attend_student_ytd['ada_rank'] = (attend_student_ytd.groupby(
        ['school_abbrev', 'grade_level'],
        group_keys=False)['ada'].rank("dense", ascending=True))

    attend_student_ytd = attend_student_ytd.sort_values(
        ['school_abbrev', 'grade_level', 'ada_rank'])

    write_path = "{0}/attend_student_ytd.feather".format(save_path)

    feather.write_dataframe(attend_student_ytd, write_path)

    return write_path
Example #39
def saveFeatherFullData(output_npy,
                        label_npy,
                        u_dates,
                        lakename,
                        trial,
                        PGRNN=True,
                        includeTest=False):
    # convert predictions/labels numpy arrays into pandas dataframe and save as feather
    # @output_npy = prediction matrix (depths x days)
    # @label_npy = label matrix (depth x days)
    # @u_dates = numpy array of unique dates (np.datetime64 type)
    # @lakename = string nhd id (str)
    # @n_hid = number of best hidden units for experiment (str or int)
    # @realization = realization index from randomization (str or int)
    # @l1_norm (optional) = if l1 norm is used in hyperparameter optimization this is the value
    trial = str(trial)
    output_df = pd.DataFrame({'date': u_dates})
    label_df = pd.DataFrame({'date': u_dates})
    n_test_dates = u_dates.shape[0]
    n_depths = output_npy.shape[0]
    for i in range(0, n_depths):
        data = np.empty((n_test_dates))
        data[:] = np.nan
        new_col = pd.DataFrame({'depth_' + str(i): output_npy[i, :]})
        new_col2 = pd.DataFrame({'depth_' + str(i): label_npy[i, :]})
        output_df = pd.concat([output_df, new_col], axis=1)
        label_df = pd.concat([label_df, new_col2], axis=1)
    pg = ''
    if PGRNN:
        pg = 'PGRNN'
    else:
        pg = 'RNN'

    o_path = '../../scripts/manylakes2/outputs_full/' + lakename + pg + '_output_' + 'trial' + trial + '.feather'
    l_path = '../../scripts/manylakes2/labels/' + lakename + '_label.feather'

    # save em
    exists = os.path.isfile(l_path)

    if not exists:
        feather.write_dataframe(label_df, l_path)
    feather.write_dataframe(output_df, o_path)
Example #40
def csv_to_feather(csv, featherFileName, featherOutLoc=None):
    ''' 
  Inputs: 
      csv - (string) csv file; Location of .csv file + fileName; INCLUDE .csv
      featherFileName - (string) name to save feather file as;   INCLUDE .feather
      featherOutLoc - (string) folder to save feather file to
      
  Output: 
      Returns None; creates .feather file at specified location, defaults to current working directory
 
  Input example:
  r'H:\Data\HF data\heart failure data.csv'
  '''

    dataframe = pd.read_csv(csv)

    if featherOutLoc is None:
        feather.write_dataframe(dataframe, featherFileName)
    else:
        feather.write_dataframe(dataframe, featherOutLoc + featherFileName)
Example #41
def main():
    #get args:
    args = sys.argv
    in_file = args[1]

    checkExists(in_file)

    print("Reading in file: ", in_file)
    df = pandas.read_csv(in_file)
    print("Successfully read in file: ", in_file)

    #write feather:
    out_file_name = os.path.splitext(in_file)[0]
    out_file_name += ".feather"
    print("Writing file: ", out_file_name)
    try:
        feather.write_dataframe(df, out_file_name)
        print("Successfully wrote file: ", out_file_name)
    except:
        print("File not written.")
Example #42
    def _check_pandas_roundtrip(self, df, expected=None, path=None,
                                columns=None, null_counts=None):
        if path is None:
            path = random_path()

        self.test_files.append(path)
        feather.write_dataframe(df, path)
        if not os.path.exists(path):
            raise Exception('file not written')

        result = feather.read_dataframe(path, columns)
        if expected is None:
            expected = df

        assert_frame_equal(result, expected)

        if null_counts is None:
            null_counts = np.zeros(len(expected.columns))

        np.testing.assert_array_equal(self._get_null_counts(path, columns), null_counts)
Example #43
def save_data_to_R(outpath, header, row, data):
    if data is None:
        return
    if '.feather' not in outpath:
        outpath += '.feather'

    import pandas as pd
    try:
        import feather
    except ImportError as e:
        raise RuntimeError(
            "Cannot export to R, require python package 'feather-format'")

    row = np.array(row)
    if isinstance(header, string_types):
        header = header.split(',')
    header = np.array(header)
    df = pd.DataFrame(data=data, index=row, columns=header, dtype=data.dtype)

    feather.write_dataframe(df, outpath)
Example #44
def averageTrialsToFinalOutput(lakename,
                               realization,
                               best_hid,
                               best_norm='NA',
                               trials=2,
                               PGRNN=True):
    # for a given lake and realization of randomly chosen observations, compile the experiment results into an averaged prediction
    # @lakename = string nhd id
    # @best hid = number of best hidden units for experiment
    # @best norm (optional) = if l1 norm is used in hyperparameter optimization this is the value
    # @trials (not implemented yet) = number of trials per experiment setup
    # @PGRNN = True if PGRNN, False if RNN

    realization = str(realization)
    pg = ''
    if PGRNN:
        pg = 'PGRNN'
    else:
        pg = 'RNN'
    o_path1 = '../../scripts/manylakes/outputs' + realization + '/' + lakename + pg + '_output_' + 'nhid' + str(
        best_hid) + '_norm' + str(best_norm) + '_trial0.feather'
    o_path2 = '../../scripts/manylakes/outputs' + realization + '/' + lakename + pg + '_output_' + 'nhid' + str(
        best_hid) + '_norm' + str(best_norm) + '_trial1.feather'
    merge_path = '../../scripts/manylakes/outputs' + realization + '/' + lakename + pg + '_output_' + 'nhid' + str(
        best_hid) + '_norm' + str(best_norm) + '_BESTmerged.feather'
    merge_path2 = '../../scripts/manylakes/outputs' + realization + '/' + lakename + pg + '_output_' + 'nhid' + str(
        best_hid) + '_norm' + str(best_norm) + '_BESTmerged2.feather'
    obs1 = pd.read_feather(o_path1)
    obs2 = pd.read_feather(o_path2)
    obs_merged = obs1.copy()
    obs_merged2 = pd.DataFrame().reindex_like(obs1)  #

    obs_merged.values[:,
                      1:] = (obs1.values[:, 1:] +
                             obs2.values[:, 1:]) / 2  # average the two trials
    feather.write_dataframe(obs_merged, merge_path)

    obs_merged2.values[:, 1:] = (obs1.values[:, 1:] + obs2.values[:, 1:]) / 2
    obs_merged2['date'] = pd.to_datetime(obs1['date'])
    # obs_merged2.values[:,0] = obs1.values[:,0]
    feather.write_dataframe(obs_merged2, merge_path2)
Example #45
def save_to_r_dataset(df, path, save_as_csv=False):
    """Convert pandas dataframe to r dataframe.

    Parameters
    ----------
    df : dataframe
        Pandas dataframe.
    path : str
        Path to save.

    Returns
    -------
    None

    """
    if save_as_csv:
        df.to_csv(path, index=False)
    else:
        feather.write_dataframe(df, path)
    return None
Example #46
def parse_files(cols_to_keep=['ts', 'url', 'languages']):
    i = 1
    master_df = pd.DataFrame()

    for file in os.listdir(index_folder):
        if 'cdx-' in file:
            if i % 10 == 0 or i == 1:
                sys.stdout.write('\rParsing file {} out of {}'.format(i, len(os.listdir(index_folder))))
                sys.stdout.flush()

            file_path = os.path.join(index_folder, file)
            df = _parse_index_file(file_path, cols_to_keep)
            master_df = pd.concat([master_df, df], sort=False)
            i += 1

    print('Parsing complete! {} total records extracted.'.format(len(master_df)))

    logging.debug('Saving DataFrame...')
    data_path = '../data/raw/'
    master_file = 'cc_urls_' + yearmonth
    feather.write_dataframe(master_df, os.path.join(data_path, master_file))
Example #47
def main(input, output):

    basicConfig(level=INFO,
                handlers=[
                    StreamHandler(),
                    FileHandler('{}.log'.format(output), mode='w')
                ])

    # Concatenate away.

    info('Concatenating data')

    data = (feather.read_dataframe(x) for x in input)

    concatenated = pd.concat(data)

    # Write the data.

    info('Writing data')

    feather.write_dataframe(concatenated, output)
Example #48
def write_player_id_file():
    """
    Writes the player id file to disk in feather format

    This file maps player IDs to names, positions, handedness, teams, and jersey numbers. Using IDs is a way to avoid
    having to correct the numerous spelling inconsistencies in the data.
    """
    import feather
    try:
        PLAYER_IDS.sort_values(by="ID", inplace=True)
    except UnboundLocalError:
        PLAYER_IDS = get_player_id_file()
        PLAYER_IDS.sort_values(by="ID", inplace=True)
    PLAYER_IDS['#'] = PLAYER_IDS['#'].astype(int)
    PLAYER_IDS['ID'] = PLAYER_IDS['ID'].astype(str)
    PLAYER_IDS['Name'] = PLAYER_IDS['Name'].astype(str)
    PLAYER_IDS['Pos'] = PLAYER_IDS['Pos'].astype(str)
    PLAYER_IDS['Team'] = PLAYER_IDS['Team'].astype(str)
    PLAYER_IDS['Hand'] = PLAYER_IDS['Hand'].astype(str)
    PLAYER_IDS = PLAYER_IDS.drop_duplicates()
    feather.write_dataframe(PLAYER_IDS, PLAYER_ID_FILE)
Example #49
File: kernel.py Project: BoPeng/SOS
def _R_repr(obj):
    if isinstance(obj, bool):
        return 'TRUE' if obj else 'FALSE'
    elif isinstance(obj, (int, float, str)):
        return repr(obj)
    elif isinstance(obj, Sequence):
        if len(obj) == 0:
            return 'c()'
        # if the data is of homogeneous type, let us use c()
        # otherwise use list()
        # this can be confusion but list can be difficult to handle
        if homogeneous_type(obj):
            return 'c(' + ','.join(_R_repr(x) for x in obj) + ')'
        else:
            return 'list(' + ','.join(_R_repr(x) for x in obj) + ')'
    elif obj is None:
        return 'NULL'
    elif isinstance(obj, dict):
        return 'list(' + ','.join('{}={}'.format(x, _R_repr(y)) for x,y in obj.items()) + ')'
    elif isinstance(obj, set):
        return 'list(' + ','.join(_R_repr(x) for x in obj) + ')'
    else:
        import numpy
        import pandas
        if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\
                numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32, \
                numpy.float64)):
            return repr(obj)
        elif isinstance(obj, numpy.matrixlib.defmatrix.matrix):
            try:
                import feather
            except ImportError:
                raise UsageError('The feather-format module is required to pass numpy matrix as R matrix. '
                    'See https://github.com/wesm/feather/tree/master/python for details.')
            feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name
            feather.write_dataframe(pandas.DataFrame(obj).copy(), feather_tmp_)
            return 'data.matrix(read_feather("{}"))'.format(feather_tmp_)
        elif isinstance(obj, numpy.ndarray):
            return 'c(' + ','.join(_R_repr(x) for x in obj) + ')'
        elif isinstance(obj, pandas.DataFrame):
            try:
                import feather
            except ImportError:
                raise UsageError('The feather-format module is required to pass pandas DataFrame as R data.frame. '
                    'See https://github.com/wesm/feather/tree/master/python for details.')
            feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name
            try:
                data = obj.copy()
                feather.write_dataframe(data, feather_tmp_)
            except:
                # if data cannot be written, we try to manipulate data
                # frame to have consistent types and try again
                for c in data.columns:
                    if not homogeneous_type(data[c]):
                        data[c] = [str(x) for x in data[c]]
                feather.write_dataframe(data, feather_tmp_)
            return 'read_feather("{}")'.format(feather_tmp_)
        else:
            return repr('Unsupported datatype {}'.format(short_repr(obj)))
Example #50
    try:
        sim_data = [x.get(timeout = 300) for x in res]
    except mp.TimeoutError:
        print("Simulation failed: timeout.")
        del(pool)
        sys.exit(1)
    print("Pool.map finished for risk_model %d" % risk_model)
    for sid in sim_data:
        iter_data = iter_data.append(sid, ignore_index = True)
    print("Finishing job for risk model %d" % risk_model)
    return iter_data

eu_results = do_simulations(100, risk_model = 0)
eu_iter_data = eu_results
# eu_iter_data.to_csv('eu_iter_data_p.csv')
feather.write_dataframe(eu_iter_data, 'eu_iter_data_p.feather')

regret_results = do_simulations(100, risk_model = 1)
regret_iter_data = regret_results
# regret_iter_data.to_csv('regret_iter_data_p.csv')
feather.write_dataframe(regret_iter_data, 'regret_iter_data_p.feather')

prospect_results = do_simulations(100, risk_model = 2)
prospect_iter_data = prospect_results
#prospect_iter_data.to_csv('prospect_iter_data_p.csv')
feather.write_dataframe(prospect_iter_data, 'prospect_iter_data_p.feather')

mixed_results = do_simulations(100, risk_model = 3)
mixed_iter_data = mixed_results
# mixed_iter_data.to_csv('mixed_iter_data_p.csv')
feather.write_dataframe(mixed_iter_data, 'mixed_iter_data_p.feather')
Example #51
    try:
        bikedata['dtstoptime'] = pd.to_datetime(bikedata.stoptime, format="%m/%d/%Y %H:%M:%S")
    except ValueError:
        try:
            bikedata['dtstoptime'] = pd.to_datetime(bikedata.stoptime, format="%m/%d/%Y %H:%M")
        except ValueError:
            bikedata['dtstoptime'] = pd.to_datetime(bikedata.stoptime, format="%Y-%m-%d %H:%M:%S")

    # Set the startdate and stopdate - minutes and seconds reset to 0 (in the following format - 2016-03-01 06:00:00)
    # This has been done so that we could aggregate departures and arrivals per hour to identify bike usage
    bikedata['dtstartdatehour'] = bikedata.dtstarttime.apply(lambda x:x.replace(minute=0,second=0))
    bikedata['dtstopdatehour'] = bikedata.dtstoptime.apply(lambda x:x.replace(minute=0,second=0))
    bikedata['startdatehour'] = bikedata.dtstartdatehour.apply(lambda x:x.strftime('%Y-%m-%d %H:%M:%S'))
    # bikedata['startdate'] = bikedata.dtstarttime.dt.date.apply(lambda x:x.strftime('%Y-%m-%d'))

    # datetime.datetime columns are not supported by Feather
    # So, deleting the column after pre-processing.
    # TODO: Can we do without converting the dates in the first place?
    bikedata.drop('dtstarttime', axis=1, inplace=True)
    bikedata.drop('dtstoptime', axis=1, inplace=True)

    print(fname, ' - ', len(bikedata))

    listFiles.append(bikedata)

df = pd.concat(listFiles)
print(len(df))
feather.write_dataframe(df, dirname + feather_output_filename)

print("Successfully written into feather format")
Example #52
import datetime
import feather
import pandas
import sys

if __name__ == '__main__':

    _, type_, date = sys.argv

    csv_file = '{}.{}.csv'.format(type_, date)
    df = pandas.read_csv(csv_file, index_col=0, parse_dates=True).reset_index()

    feather_file = '{}.{}.feather'.format(type_, date)
    feather.write_dataframe(df, feather_file)

    print('{} {} {} done'.format(datetime.datetime.now(), type_, date))
Example #53
import pandas
import feather

# Read flights data and select flights to O'Hare
flights = pandas.read_csv("tests/testthat/data/flights.csv")
flights = flights[flights['dest'] == "ORD"]

# Select carrier and delay columns and drop rows with missing values
flights = flights[['carrier', 'dep_delay', 'arr_delay']]
flights = flights.dropna()
flights.head(10)

# Write to feather file for reading from R
feather.write_dataframe(flights, "tests/testthat/data/flights.feather")
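A quick Python-side check that the file round-trips before the R tests pick it up (a sketch; the R side reads it with feather::read_feather):

import feather

check = feather.read_dataframe("tests/testthat/data/flights.feather")
assert list(check.columns) == ['carrier', 'dep_delay', 'arr_delay']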
Example #54
    co is, on average, 2.4 days after ci
    """
    df.date_time = pd.to_datetime(df.date_time, errors='coerce')
    df.srch_ci = pd.to_datetime(df.srch_ci, errors='coerce')
    df.srch_co = pd.to_datetime(df.srch_co, errors='coerce')
    df.srch_ci = df.srch_ci.fillna(df.date_time + timedelta(days=35))
    df.srch_co = df.srch_co.fillna(df.srch_ci + timedelta(days=2))
    return df

print(78*'=')
print("Reading train...")
df_train = feather.read_dataframe('../data/train_only_booked.feather')
print("Creating Features for Train...")
df_train_features = create_features(df_train, train=True)
print("Writing Feather...")
feather.write_dataframe(df_train_features, '../data/train_only_booked_features.feather')
gc.collect()

print(78*'=')
print("Reading holdout...")
df_hold = feather.read_dataframe('../data/holdout.feather')
print("Munging Holdout")
df_hold_feat = create_features(df_hold)
print("Writing Feather...")
feather.write_dataframe(df_hold_feat, '../data/holdout_features.feather')
gc.collect()

print(78*'=')
print("Reading LB Test...")
df_lb = feather.read_dataframe('../data/test.feather')
print("Creating Features for Public_LB")
Example #55
import feather, pandas, numpy as np, datetime

Abool = np.array([True,True,False])
Aint8 = np.array([1,1,0], dtype=np.int8)
Aint16 = np.array([1,1,0], dtype=np.int16)
Aint32 = np.array([1,1,0], dtype=np.int32)
Aint64 = np.array([1,1,0], dtype=np.int64)
Auint8 = np.array([1,1,0], dtype=np.uint8)
Auint16 = np.array([1,1,0], dtype=np.uint16)
Auint32 = np.array([1,1,0], dtype=np.uint32)
Auint64 = np.array([1,1,0], dtype=np.uint64)

Afloat32 = np.array([1.0, "NaN", 0.0], dtype=np.float32)
Afloat64 = np.array(["Inf", 1.0, 0.0], dtype=np.float64)

Autf8 = ["hey","there","sailor"]
Abinary = [b"hey",b"there",b"sailor"]

# Adate = [datetime.datetime(2016,1,1).date(),datetime.datetime(2016,1,2).date(),datetime.datetime(2016,1,3).date()]
Adatetime = [datetime.datetime(2016,1,1),datetime.datetime(2016,1,2),datetime.datetime(2016,1,3)]

Acat = pandas.Categorical(["a","b","c"], categories=["a","b","c","d"],ordered=False)  # don't conform to Arrow!
Acatordered = pandas.Categorical(["d","e","f"], categories=["d","e","f"],ordered=True)  # don't conform to Arrow!

df = pandas.DataFrame({
    "Abool": Abool, "Aint8": Aint8, "Aint16": Aint16, "Aint32": Aint32,
    "Aint64": Aint64, "Auint8": Auint8, "Auint16": Auint16,
    "Auint32": Auint32, "Auint64": Auint64, "Afloat32": Afloat32,
    "Afloat64": Afloat64, "Autf8": Autf8, "Abinary": Abinary,
    "Adatetime": Adatetime, "Acat": Acat, "Acatordered": Acatordered})

feather.write_dataframe(df,  "/home/test.feather")
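Reading the file back is the other half of this compatibility check; a minimal sketch (same path as above):

import feather

df2 = feather.read_dataframe("/home/test.feather")
print(df2.dtypes)  # inspect which dtypes survive the round trip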
Example #56
import os
import uuid

import numpy as np
import pandas as pd
from pandas.util.testing import assert_frame_equal

import feather

nrows = 4000000
ncols = 100

data = np.random.randn(nrows)

df = pd.DataFrame({'c{0}'.format(i): data
                   for i in range(ncols)})

def guid():
    return uuid.uuid4().hex

path = 'test_{0}.feather'.format(guid())

try:
    feather.write_dataframe(df, path)
    df2 = feather.read_dataframe(path)
    assert_frame_equal(df, df2)
finally:
    try:
        os.remove(path)
    except os.error:
        pass
Example #57
def convert_feather(df, output_filename):
    feather.write_dataframe(df, output_filename)
Example #58
def generate_test_df(df_shop):
    df_shop_test = pd.DataFrame()
    shop_id = df_shop.shop_id.iloc[0]
    df_shop_test['shop_id'] = [shop_id] * 14

    df_shop_test['day'] = pd.date_range('2016-11-01', '2016-11-14')

    days_max = df_shop.days_from_beginning.max()
    df_shop_test['days_from_beginning'] = np.arange(days_max + 1, days_max + 15)
    df_shop_test['pays_count'] = np.nan
    df_shop_test['week_id'] = [0] * 7 + [-1] * 7
    df_shop_test['biweek_id'] = 0

    return df_shop_test


dfs = []

for i in tqdm(shops):
    df_shop = df_pays[df_pays.shop_id == i]
    dfs.append(df_shop)
    df_shop_test = generate_test_df(df_shop)
    dfs.append(df_shop_test)


df_pays = pd.concat(dfs).reset_index(drop=1)

df_pays['dow'] = df_pays.day.dt.dayofweek.astype('uint8')
df_pays['is_weekend'] = df_pays.dow.isin([5, 6])

feather.write_dataframe(df_pays, 'data/df_pays_na_test.feather')
Example #59
def nnd_hotdeck_using_feather(receiver = None, donor = None, matching_variables = None, z_variables = None):
    """
    Not working
    """
    import feather

    assert receiver is not None and donor is not None
    assert matching_variables is not None

    temporary_directory_path = os.path.join(config_files_directory, 'tmp')
    assert os.path.exists(temporary_directory_path)
    receiver_path = os.path.join(temporary_directory_path, 'receiver.feather')
    donor_path = os.path.join(temporary_directory_path, 'donor.feather')
    feather.write_dataframe(receiver, receiver_path)
    feather.write_dataframe(donor, donor_path)
    if isinstance(matching_variables, str):
        match_vars = '"{}"'.format(matching_variables)
    elif len(matching_variables) == 1:
        match_vars = '"{}"'.format(matching_variables[0])
    else:
        match_vars = '"{}"'.format('todo')

    r_script = """
rm(list=ls())
gc()
devtools::install_github("wesm/feather/R")
library(feather)
library(StatMatch)

receiver <- read_feather("{receiver_path}")
donor <- read_feather("{donor_path}")
summary(receiver)
summary(donor)

# variables
receiver = as.data.frame(receiver)
donor = as.data.frame(donor)
gc()
match_vars = {match_vars}
# don_class = c("sexe")
out.nnd <- NND.hotdeck(
  data.rec = receiver, data.don = donor, match.vars = match_vars
  )

# out.nndsummary(out.nnd$mtc.ids)
# head(out.nnd$mtc.ids, 10)
# head(receiver, 10)

fused.nnd.m <- create.fused(
    data.rec = receiver, data.don = donor,
    mtc.ids = out.nnd$mtc.ids,
    z.vars = "{z_variables}"
    )
summary(fused.nnd.m)
""".format(
        receiver_path = receiver_path,
        donor_path = donor_path,
        match_vars = match_vars,
        z_variables = z_variables,
        )
    print(r_script)
Example #60
import os

import pandas as pd
import feather

os.getcwd()
fp = os.getcwd().replace("feature_eng", "")

train = feather.read_dataframe(fp + "data/train.feather")

df = pd.pivot_table(
    train,
    values="Demanda_uni_equil",
    index=[
        "Cliente_ID",
        "Producto_ID",
        "Agencia_ID",
        "Canal_ID",
        "Ruta_SAK",
        "Venta_uni_hoy",
        "Venta_hoy",
        "Dev_uni_proxima",
        "Dev_proxima",
    ],
    columns="Semana",
)

df = df.reset_index()

feather.write_dataframe(df, fp + "data/week_split_train.feather")