def main():
    parser = ArgumentParser()
    parser.add_argument('--spark', dest='spark', action='store_true', default=False)
    args = parser.parse_args()

    doc_freq = read_json('../../data/doc_freq.json', typ='series')
    N = doc_freq['TOTAL_DOCS']
    idf_vector = np.log10(N / doc_freq)

    for line in sys.stdin:
        url, tf_vector = line.split()
        tf_vector = read_json(tf_vector, typ='series')

        # Calculate tf-idf from the tf vector and the doc_freq vector.
        tf_idf_vector = tf_vector.multiply(idf_vector, fill_value=0)

        # Normalize the tf-idf vector; this is important for the clustering
        # done later on.
        tf_idf_vector = tf_idf_vector / tf_idf_vector.sum()

        if NONZERO_ONLY:
            # Keep only the tf-idf terms that actually occur in this document,
            # since this is going to be a sparse vector.
            output_vector = tf_idf_vector[tf_idf_vector.nonzero()[0]]
        else:
            output_vector = tf_idf_vector

        s = StringIO()
        output_vector.to_json(s)

        if args.spark:
            indices_zip = list(zip(tf_idf_vector.nonzero()[0].tolist(),
                                   output_vector.tolist()))
            print('%s\t%s' % (url, indices_zip))
        else:
            print('%s\t%s' % (url, s.getvalue()))

def test_good_kwargs():
    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
    with tm.assert_produces_warning(None):
        tm.assert_frame_equal(
            df, read_json(df.to_json(orient="split"), orient="split")
        )
        tm.assert_frame_equal(
            df, read_json(df.to_json(orient="columns"), orient="columns")
        )
        tm.assert_frame_equal(
            df, read_json(df.to_json(orient="index"), orient="index")
        )

def test_deprecated_kwargs():
    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
    buf = df.to_json(orient="split")
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_frame_equal(df, read_json(buf, "split"))
    buf = df.to_json(orient="columns")
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_frame_equal(df, read_json(buf, "columns"))
    buf = df.to_json(orient="index")
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_frame_equal(df, read_json(buf, "index"))

def get_data_df_from_JSON_Data(ls_filenames):
    # Use the list of filenames, find the json dataset, and return the data
    # frame that holds all the json data.
    data_df = read_json(open(ls_filenames[0], 'r'), orient='records')
    if len(ls_filenames) >= 2:
        for i in range(1, len(ls_filenames)):
            data_df = data_df.append(
                read_json(open(ls_filenames[i], 'r'), orient='records'),
                ignore_index=True)
    return data_df

def publish():
    form = Form()
    if form.validate_on_submit():
        try:
            # TODO: add this to config instead
            json_filepath = os.path.join('C:\\', 'Users', 'michhar', 'Documents',
                                         'MLADS', 'data', 'MessyDoc-8f814e3f2a78.json')
            json_key = json.load(open(json_filepath))
            credentials = SignedJwtAssertionCredentials(json_key['client_email'],
                                                        json_key['private_key'].encode(),
                                                        config.SCOPE)
            gc = gspread.authorize(credentials)

            # TODO: create a worksheet if not there, also put this in config
            wksheet = gc.open("SSF_Crop_Master_2012_Master_crop_master").worksheet('latest')

            # make a client connection
            client = document_client.DocumentClient(config.DOCUMENTDB_HOST,
                                                    {'masterKey': config.DOCUMENTDB_KEY})

            # Read databases and get our working database
            db = next((data for data in client.ReadDatabases()
                       if data['id'] == config.DOCDB_DATABASE))

            # Read collections and get the "master collection"
            coll_master = next((coll for coll in client.ReadCollections(db['_self'])
                                if coll['id'] == config.DOCDB_COLLECTION_MASTER))

            master_doc = next((doc for doc in client.ReadDocuments(coll_master['_self'])
                               if doc['id'] == config.DOCDB_MASTER_DOC))

            master_data_df = read_json(master_doc['data'])
            headers = read_json(master_doc['data_headers'])
            master_data_df.columns = headers

            # update all cells in the master google doc with data in the master doc from the db
            # this takes a minute or two (maybe put into a separate view function)
            update_worksheet(wksheet, master_data_df)

            return render_template('results.html',
                                   masterlink='https://docs.google.com/spreadsheets/d/1MKcDtjI5E-iNv9tU2KcA5yJWWgaSTh5j2IjPYOp9lic/pubhtml',
                                   title='Results',
                                   year=datetime.now().year,
                                   message='Success! Your data has been stored and the master sheet updated here ')
        except gspread.SpreadsheetNotFound as e:
            return render_template('error_page.html',
                                   title='Something went wrong!',
                                   year=datetime.now().year,
                                   message='''The spreadsheet was not found.
                                   Please ensure you have enabled Google Drive API and created
                                   a new set of credentials.''',
                                   link='http://gspread.readthedocs.org/en/latest/oauth2.html')

def get_group(self, key):
    new_query = 'select value t.grps from (%s) t where grp_id=%s;' % (
        self.query[:-1], str(key))
    results = json.dumps(af.AFrame.send_request(new_query)[0])
    grp = json.read_json(results)['grp']
    df = pd.DataFrame(grp.tolist())
    return df

def collect(self):
    results = af.AFrame.send_request(self._query)
    json_str = json.dumps(results)
    result = pd.DataFrame(data=json.read_json(json_str))
    if '_uuid' in result.columns:
        result.drop('_uuid', axis=1, inplace=True)
    return result

def data_label_processing():
    # Importing label names
    image_name_df = pd.read_excel(IMG_LBL_PATH, sheet_name='Sheet1')
    image_name_df.columns = ['species', 'label_id', 'label']
    species = []
    for idx in image_name_df.index:
        if pd.isnull(image_name_df.at[idx, 'species']):
            image_name_df.at[idx, 'species'] = save_class
        else:
            save_class = image_name_df.at[idx, 'species']
        species.append(save_class)
    imn_df = image_name_df.reset_index(drop=True)

    # Importing image names
    image_id_df = read_json(IMG_JSON_PATH)
    image_id_df.columns = ['label_id', 'image_name']
    imi_df = image_id_df.reset_index(drop=True)

    # Combining labels and image names
    images_df = pd.merge(imn_df, imi_df, how='inner', on='label_id')
    plant_df = images_df.sample(frac=1).reset_index(drop=True)
    return plant_df

def parseDataset():
    """ parses the dataset and creates a list of dictionaries,
    similar to a baseline dictionary """
    with open(dataset) as datasetFileHandler:
        datarows = datasetFileHandler.readlines()
    for datarow in datarows:
        dataframe = pd.read_json(datarow)
        outputDict = getBaselineDictionary()
        # cleanse DF as per reqs
        outputDict["business_id"] = dataframe.business_id[0]
        for timeline, number in dataframe.checkin_info.iteritems():
            try:
                hour, day = timeline.split("-")
                if day in weekends:
                    outputDict["num_of_visits_in_weekends"] += number
                if day in weekdays:
                    outputDict["num_of_visits_in_weekdays"] += number
                if hour in morning:
                    outputDict["num_of_morning_visits"] += number
                if hour in afternoon:
                    outputDict["num_of_afternoon_visits"] += number
                if hour in evening:
                    outputDict["num_of_evening_visits"] += number
            except:
                pass
        outputDataFramesList.append(outputDict)

def load(test=False, cols=COLS):
    """Loads data from FTEST if *test* is True, otherwise from FTRAIN.
    Pass a list of *cols* if you're only interested in a subset of the
    target columns.
    """
    fname = FTEST if test else FTRAIN
    df = read_json(os.path.expanduser(fname))  # load pandas dataframe

    df['price_per_bathroom'] = np.log((df['price'] + 1) / (df['bathrooms'] + 1))
    df['price_per_bedroom'] = np.log((df['price'] + 1) / (df['bedrooms'] + 1))
    df['price'] = np.log(df['price'] + 1)
    df['created'] = df['created'].apply(
        lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    df['day_of_month'] = df['created'].apply(lambda x: x.day)
    df['hour'] = df['created'].apply(
        lambda x: x.hour + (x.minute + x.second / 60.0) / 60.0)
    df['day_of_week'] = df['created'].apply(lambda x: x.weekday())
    df['desc_len'] = df['description'].apply(
        lambda desc: len([x for x in re.split(r'\W+', desc) if len(x) > 0]))
    df['num_features'] = df['features'].apply(len)
    df['features_len'] = df['features'].apply(
        lambda feats: sum([len([x for x in re.split(r'\W+', feat) if len(x) > 0])
                           for feat in feats]))
    df['num_photos'] = df['photos'].apply(len)

    # force all coordinates within NYC area
    df['longitude'] = df['longitude'].apply(bound(-74.3434, -73.62))
    df['latitude'] = df['latitude'].apply(bound(40.4317, 41.0721))

    print(df.count())  # prints the number of values for each column

    if not test:  # only FTRAIN has any target columns
        df = df.dropna()  # drop all rows that have missing values in them
        X = np.array(df[cols], dtype=np.float32)
        y = np.array(get_dummies(df['interest_level'])[OUTPUT_COLS], np.float32)
        X, y = shuffle(X, y, random_state=42)  # shuffle train data
    else:
        X = np.array(df[cols], dtype=np.float32)
        y = df['listing_id'].as_matrix()

    return X, y

def head(self, num=5):
    new_query = self.query[:-1] + ' limit %d;' % num
    results = af.AFrame.send_request(new_query)
    json_str = json.dumps(results)
    result = pd.DataFrame(data=json.read_json(json_str))
    if '_uuid' in result.columns:
        result.drop('_uuid', axis=1, inplace=True)
    return result

def get_curve_fitter(self):
    group_df = read_json(self.raw_data)
    group_df.sort(["percent_inhib", "concentration"], inplace=True)
    curve_fitter = IC50CurveFit(main_group_df=group_df)
    curve_fitter.get_fit(self, constrained=True)
    return curve_fitter

def post_region(module, method):
    """
    To make a POST using CURL to the flask dev server:

    Fisher-Jenks using the Hartigan Olympic time example
    curl -i -H "Content-Type: application/json" -X POST -d '{"args":["[12, 10.8, 11, 10.8, 10.8, 10.6, 10.8, 10.3, 10.3,10.3,10.4,10.5,10.2,10.0,9.9]"], "kwargs":{"k":5}}' http://localhost:5000/ap/esda/fisher_jenks/

    or

    Sample Jenks Caspall using the same example - note that sample percentage
    is not passed.
    curl -i -H "Content-Type: application/json" -X POST -d '{"args":["[12, 10.8, 11, 10.8, 10.8, 10.6, 10.8, 10.3, 10.3,10.3,10.4,10.5,10.2,10.0,9.9]"], "kwargs":{"k":5}}' http://localhost:5000/ai/esda/jenks_caspall_sampled/
    """
    if not request.json:
        response = {'status': 'error', 'data': {}}
        standarderror['data'] = 'Post datatype was not json'
        return jsonify(standarderror), 400
    else:
        response = {'status': 'success', 'data': {}}

    # Setup the call, the args and the kwargs
    call = funcs[module][method]

    # Parse the args
    args = request.json['args']
    print type(args)
    validargs = []
    for a in args:
        # Literal eval to get the native python type
        va = json.loads(a)
        #va = ast.literal_eval(a)
        # If it is a list, cast to a numpy ndarray via pandas json io
        # This should go to a decorator on the PySAL side at some point
        if isinstance(va, list):
            va = read_json(a)
            validargs.append(va.values.ravel())

    # Check for and parse the kwargs
    try:
        kwargs = request.json['kwargs']
        validkwargs = {}
        validkwargs = ast.literal_eval(str(kwargs))
    except:
        pass

    # Make the call and get the return items
    funcreturn = vars(call(*validargs, **validkwargs))
    for k, v in funcreturn.iteritems():
        if isinstance(v, np.ndarray):
            funcreturn[k] = v.tolist()
        elif isinstance(v, ps.W):
            print "W OBJ"

    response['data'] = funcreturn
    return jsonify(response)

def trending(self):
    tweets = read_json(self.tweets_file, lines=True)
    top_trending_words = pd.Series(
        ' '.join(tweets.text).lower().split()).value_counts()[:50]

    # get used hashtags per tweet
    hashtags = json_normalize(tweets['entities'], 'hashtags', errors='ignore')
    hashtags['text'] = hashtags['text'].str.lower()
    top_trending_hashtags = hashtags['text'].value_counts()[:50]

    _logger.debug("top trending words: \n%s" % top_trending_words)
    _logger.debug("top trending hashtags: \n%s" % top_trending_hashtags)
    return (top_trending_hashtags, top_trending_words)

def toPandas(self, sample: int = 0):
    from pandas.io import json

    if self._dataset is None:
        raise ValueError('no dataset specified')
    else:
        dataset = self._dataverse + '.' + self._dataset
        if sample > 0:
            query = 'select value t from %s t limit %d;' % (dataset, sample)
        else:
            query = 'select value t from %s t;' % dataset
        result = self.send_request(query)
        data = json.read_json(json.dumps(result))
        df = pd.DataFrame(data)
        if '_uuid' in df.columns:
            df.drop('_uuid', axis=1, inplace=True)
        return df

def load_json(self, path, rename=None, index=None):
    '''Read a json file as a pandas dataframe.

    Parameters
    ----------
    rename : list of string tuples (new old), optional
        columns to rename
    index : string, optional
        post-rename column to use as the row label.
    '''
    data = pj.read_json(path, orient='records')

    Cache.rename_columns(data, rename)

    if index is not None:
        data.set_index([index], inplace=True)

    return data

def save_predictions(clf, train_data, target):
    """
    Uses classifier to predict the targets for test_data, and stores the
    predictions in a .csv file suitable for uploading to kaggle.com.
    """
    test_data = sanitize_train_data(read_json('test.json'))

    # The request_id attribute should be the second column.
    # Force the predictions to 0/1 instead of True/False.
    rows = zip(
        test_data['request_id'].tolist(),
        clf.fit(train_data, target).predict(test_data).astype(int).tolist(),
    )

    with open('test_predictions.csv', 'w') as f:
        f.write('request_id,requester_received_pizza\n')
        for row in rows:
            f.write(('%s,%s\n' % row).encode('utf-8'))

def load_json(self, path, rename=None, index=None):
    '''Read a json file as a pandas dataframe.

    Parameters
    ----------
    rename : list of string tuples (new old), optional
        columns to rename
    index : string, optional
        post-rename column to use as the row label.
    '''
    data = pj.read_json(path, orient='records')

    self.rename_columns(data, rename)

    if index is not None:
        data.set_index([index], inplace=True)

    return data

def all_entries():
    # connect to the db
    client = Cloudant(user, password, url=url, connect=True)
    db = client.create_database(db_name, throw_on_exists=False)

    # get all docs
    docs = list(map(lambda doc: doc, db))

    # put them into a dataframe
    fdocs = json_normalize(docs)
    fdocs = DataFrame(fdocs, columns=['date', 'component', 'data', '_id'])
    fdocs['date'] = to_datetime(fdocs['date'])
    fdocs = fdocs.reset_index(drop=True)
    fdocs.sort_values(['date', 'component'])

    # get the components
    components = fdocs['component'].unique().tolist()

    data = [None] * len(fdocs)
    for i, row in fdocs.iterrows():
        tmp = read_json(fdocs.loc[i, 'data'], orient='index')
        tmp = tmp.reset_index()
        data[i] = tmp
        fdocs.loc[i, 'data'] = i

    # make a list of same size as components
    complist = [None] * len(components)
    for i in range(len(components)):
        # drop everything but relevant info
        tmp = fdocs.drop(fdocs[fdocs.component != components[i]].index)
        # drop duplicates
        tmp = tmp.drop_duplicates(subset=['date'], keep='first', inplace=False)
        # sort them
        tmp = tmp.sort_values(['date'], ascending=[False])
        # re-index the dataframe
        tmp = tmp.reset_index(drop=True)
        # put the dataframe into the list
        complist[i] = tmp

    # disconnect from db
    client.disconnect()
    return render_template('entries.html', entries=complist, data=data)

def get_results_for_datapoint(self):
    '''When ordered by concentration, take the nth group and do an average -
    only used for the export to beehive function'''
    raw_dataframe = read_json(self.raw_data)
    raw_dataframe = raw_dataframe.groupby("concentration", sort=True)
    index = 0
    sorted_groups = reversed(sorted([g for g in raw_dataframe.groups]))
    for index, group in enumerate(sorted_groups):
        concentration = group
        df = raw_dataframe.get_group(group)
        inhibition = df["percent_inhib"].mean()
        inhibition_error = df["percent_inhib"].std()
        realind = index + 1
        yield [(u" Compound Concentration %d (uM) (Compound Concentration Range) " % realind, concentration,),
               (u" Compound Concentration %d Inhibition (%%) (Compound Concentration Range) " % realind, inhibition * 100,),
               (u" Compound Concentration %d Error (%%) (Compound Concentration Range) " % realind, inhibition_error * 100,)]
    while index < 11:
        index += 1
        # Fill in any missing values up to 12 columns in total
        realind = index + 1
        yield [(u" Compound Concentration %d (uM) (Compound Concentration Range) " % realind, "",),
               (u" Compound Concentration %d Inhibition (%%) (Compound Concentration Range) " % realind, "",),
               (u" Compound Concentration %d Error (%%) (Compound Concentration Range) " % realind, "",)]

os.chdir("/home/iap13/wcx/bandgapdata/bgdata") list_name = os.listdir() from featurebox.data.impot_element_table import element_table name_and_abbr = element_table.iloc[[0, 1], :] element_table = element_table.iloc[2:, :] elemen = element_table[[ 'electronegativity(martynov&batsanov)', 'electron number' ]] dict_all = [] for i in tqdm(list_name): try: a = json.read_json(i, orient='index', typ='series') cif_str = a["Structure_rlx"] del a["Structure_rlx"] POSCAR = Poscar.from_string(cif_str) ele_den = POSCAR.structure.composition.total_electrons / POSCAR.structure.volume composition_mp = POSCAR.structure.composition ncom = POSCAR.structure.composition.to_data_dict[ 'unit_cell_composition'].values() sym_amt = composition_mp.get_el_amt_dict() syms = sorted(sym_amt.keys(), key=lambda sym: get_el_sp(sym).X) formula = {s: formula_double_format(sym_amt[s], False) for s in syms} departElementProPFeature = DepartElementFeaturizer(
def get_info(_id):
    X0, X1 = 0, RIGHT - LEFT
    Y0, Y1 = 0, TOP - BOTTOM

    # connect to the db
    client = Cloudant(user, password, url=url, connect=True)
    db = client.create_database(db_name, throw_on_exists=False)

    # get the document from the db
    doc = db[_id]
    H = simplejson.loads(doc['krige_data'])
    z = read_json(doc['data'], orient='index')
    date = doc['date']

    # set standard minmax, these will be changed based on component
    maxvalue = 100
    minvalue = 0

    # set the min/max values according to what's unhealthy
    # max will be the unhealthy value, and in turn be red on the generated image
    # values are based on https://uk-air.defra.gov.uk/air-pollution/daqi?view=more-info&pollutant=no2 , accessed 11.05.2018
    if doc['component'] == 'PM2.5':
        maxvalue = 60
        minvalue = 0
    if doc['component'] == 'PM10':
        maxvalue = 85
        minvalue = 0
    if doc['component'] == 'NO2':
        maxvalue = 420
        minvalue = 0

    buffr = io.BytesIO()
    fig, ax = subplots()
    fig.dpi = 400
    ax.imshow(H, cmap=my_cmap, vmin=minvalue, vmax=maxvalue, origin='lower',
              interpolation='nearest', alpha=0.7, extent=[X0, X1, Y0, Y1])
    sc = ax.scatter(z.x, z.y, cmap=my_cmap, vmin=minvalue, vmax=maxvalue,
                    c=z.value, linewidths=0.75, s=50)
    plt.colorbar(sc)
    fig.suptitle('component: ' + str(z['component'].iloc[0]) + ', date: ' + str(date),
                 fontsize=14)
    fig.savefig(buffr, dpi=400)
    buffr.seek(0)

    # disconnect from db
    client.disconnect()
    return send_file(buffr, mimetype='image/png')

"wb") fo.write('[' + ''.join(match_data) + ']') filename = fo.name fo.close() # open .json file again to parse the data readable for the dataFrame in the future json_file = open(filename, 'r') content = json_file.read() json_file.close() content = content.replace("}{", "},\n{") d = open(filename, 'w') d.write(content) d.close() e = open(filename, 'r') data_df = read_json(e, orient='records') data_df['year'] = data_df["dateTime"].apply( lambda x: get_splitted_column_for_datetime('year', x)) data_df['month'] = data_df["dateTime"].apply( lambda x: get_splitted_column_for_datetime('month', x)) data_df['day'] = data_df["dateTime"].apply( lambda x: get_splitted_column_for_datetime('day', x)) data_df['hr'] = data_df["dateTime"].apply( lambda x: get_splitted_column_for_datetime('hr', x)) data_df['min'] = data_df["dateTime"].apply( lambda x: get_splitted_column_for_datetime('min', x)) data_df['sec'] = data_df["dateTime"].apply( lambda x: get_splitted_column_for_datetime('sec', x)) del data_df["dateTime"] e.close()
def collaborate():
    form = SetupForm()
    if form.validate_on_submit():
        # gather user info
        if request.method == 'POST':
            creds = request.files['credfile']
            email = form.email.data
        else:
            return render_template('collaborate.html',
                                   form=form,
                                   title='Collaborate',
                                   year=datetime.now().year)
        try:
            # create connection to google doc
            json_key = json.loads(creds.read())
            credentials = SignedJwtAssertionCredentials(json_key['client_email'],
                                                        json_key['private_key'].encode(),
                                                        config.SCOPE)
            gc = gspread.authorize(credentials)
            wksheet = gc.open(form.docname.data).worksheet('crop master')

            # get contents as a list of dicts
            contents = wksheet.get_all_values()
            df = pd.DataFrame(contents)

            # Tidy up
            # - make some column names based on the first 5 rows
            # - grab data and label columns
            headers = df.iloc[3:5, :]
            newheaders = headers.sum(axis=0)
            newdf = df.iloc[7:-1, :].copy()
            newdf.columns = newheaders

            # TODO: tidy up more
            # - unnecessary columns
            # - add user/farmer name as a multiindex

            # convert to json
            # records orientation will result in a list like [{column -> value}, ... , {column -> value}]
            jsonrecord = newdf.to_json(orient='values')

            # going to use 'individuals' collection
            # does it exist? if not make it, if so just add this doc

            # make a client connection
            client = document_client.DocumentClient(config.DOCUMENTDB_HOST,
                                                    {'masterKey': config.DOCUMENTDB_KEY})

            # Read databases and get our working database
            db = next((data for data in client.ReadDatabases()
                       if data['id'] == config.DOCDB_DATABASE))

            # Read collections and get the "user collection"
            coll_user = next((coll for coll in client.ReadCollections(db['_self'])
                              if coll['id'] == config.DOCDB_COLLECTION_USER))

            # create or update user using upsert API
            doc = client.UpsertDocument(coll_user['_self'], {
                'id': form.email.data,
                'timestamp': datetime.now().strftime('%c'),
                'data': jsonrecord,
                'data_headers': newheaders.to_json(orient='values')})

            # Read collections and get the "master collection"
            coll_master = next((coll for coll in client.ReadCollections(db['_self'])
                                if coll['id'] == config.DOCDB_COLLECTION_MASTER))

            doc_definition = {
                'id': config.DOCDB_MASTER_DOC,
                'timestamp': datetime.now().strftime('%c'),
                'data_headers': newheaders.to_json(orient='values')}

            # get all user docs
            user_docs = client.ReadDocuments(coll_user['_self'])

            # gather data in user docs into one dataframe
            user_data_dfs = []
            for doc in user_docs:
                user_data_dfs.append(pd.DataFrame(read_json(doc['data'])))
            user_data_concatd = pd.concat(user_data_dfs)

            # convert to json
            master_records = user_data_concatd.to_json(orient='values')

            # add data to the doc definition
            doc_definition['data'] = master_records

            # upsert master with the doc definition
            doc = client.UpsertDocument(coll_master['_self'], doc_definition)
        except gspread.SpreadsheetNotFound as e:
            return render_template('error_page.html',
                                   title='Something went wrong!',
                                   year=datetime.now().year,
                                   message='''The spreadsheet was not found.
                                   Please ensure you have enabled Google Drive API and created
                                   a new set of credentials.''',
                                   link='http://gspread.readthedocs.org/en/latest/oauth2.html')

    return redirect(url_for('.publish'))

def wrap(self, fn, path, cache,
         save_as_json=True,
         return_dataframe=False,
         index=None,
         rename=None,
         **kwargs):
    '''make an rma query, save it and return the dataframe.

    Parameters
    ----------
    fn : function reference
        makes the actual query using kwargs.
    path : string
        where to save the data
    cache : boolean
        True will make the query, False just loads from disk
    save_as_json : boolean, optional
        True (default) will save data as json, False as csv
    return_dataframe : boolean, optional
        True will cast the return value to a pandas dataframe, False (default) will not
    index : string, optional
        column to use as the pandas index
    rename : list of string tuples, optional
        (new, old) columns to rename
    kwargs : objects
        passed through to the query function

    Returns
    -------
    dict or DataFrame
        data type depends on return_dataframe option.

    Notes
    -----
    Column renaming happens after the file is reloaded for json
    '''
    if cache is True:
        json_data = fn(**kwargs)

        if save_as_json is True:
            ju.write(path, json_data)
        else:
            df = pd.DataFrame(json_data)
            self.rename_columns(df, rename)

            if index is not None:
                df.set_index([index], inplace=True)

            df.to_csv(path)

    # read it back in
    if save_as_json is True:
        if return_dataframe is True:
            data = pj.read_json(path, orient='records')
            self.rename_columns(data, rename)
            if index is not None:
                data.set_index([index], inplace=True)
        else:
            data = ju.read(path)
    elif return_dataframe is True:
        data = pd.DataFrame.from_csv(path)
    else:
        raise ValueError(
            'save_as_json=False cannot be used with return_dataframe=False')

    return data

def wrap(self, fn, path, cache,
         save_as_json=True,
         return_dataframe=False,
         index=None,
         rename=None,
         **kwargs):
    '''make an rma query, save it and return the dataframe.

    Parameters
    ----------
    fn : function reference
        makes the actual query using kwargs.
    path : string
        where to save the data
    cache : boolean
        True will make the query, False just loads from disk
    save_as_json : boolean, optional
        True (default) will save data as json, False as csv
    return_dataframe : boolean, optional
        True will cast the return value to a pandas dataframe, False (default) will not
    index : string, optional
        column to use as the pandas index
    rename : list of string tuples, optional
        (new, old) columns to rename
    kwargs : objects
        passed through to the query function

    Returns
    -------
    dict or DataFrame
        data type depends on return_dataframe option.

    Notes
    -----
    Column renaming happens after the file is reloaded for json
    '''
    if cache == True:
        json_data = fn(**kwargs)

        if save_as_json == True:
            ju.write(path, json_data)
        else:
            df = pd.DataFrame(json_data)
            self.rename_columns(df, rename)

            if index is not None:
                df.set_index([index], inplace=True)

            df.to_csv(path)

    # read it back in
    if save_as_json == True:
        if return_dataframe == True:
            data = pj.read_json(path, orient='records')
            self.rename_columns(data, rename)
            if index != None:
                data.set_index([index], inplace=True)
        else:
            data = ju.read(path)
    elif return_dataframe == True:
        data = pd.DataFrame.from_csv(path)
    else:
        raise ValueError('save_as_json=False cannot be used with return_dataframe=False')

    return data

import os

from matplotlib import rcParams

NUM_TERMS = 20
NUM_CLUSTERS = 20


class ClusterComponent(object):
    def __init__(self, word, index, score):
        self.word = word
        self.index = index
        self.score = score


if __name__ == '__main__':
    rcParams.update({'figure.autolayout': True})
    doc_freq = read_json('../../data/doc_freq.json', typ='series')
    doc_freq_index = doc_freq.index
    n = NUM_CLUSTERS
    cluster_ls = []
    average_cluster_length_ls = []
    while n >= 2:
        df = pd.DataFrame.from_csv('../../data/clusters_%d.txt' % (n),
                                   header=None, index_col=None)
        # Create a new directory.
        dir_name_path = '../../data/clusters/%d' % (n)
        os.makedirs(dir_name_path, exist_ok=True)
        word_clusters = []
        for index, row in df.iterrows():
            t = row.nlargest(NUM_TERMS)
            w_words = pd.Series()
            for item_index, item_value in t.iteritems():
                word = doc_freq_index[item_index]

def _parse_options_data(jd):
    return read_json(jd)

def cache_json_dataframe():
    return {
        'writer': ju.write,
        'reader': lambda p: pj.read_json(p, orient='records')
    }

from __future__ import division

from nltk import SpaceTokenizer
from nltk.corpus import stopwords
from pandas.io.json import read_json
from pywordcloud import pywordcloud
from pprint import pprint
import re
from sklearn.feature_extraction.text import TfidfVectorizer

train_data = read_json('train.json')
tokenizer = SpaceTokenizer()
stop_words = stopwords.words('english')

replacements = {
    "'ve": ' have',
    "'ll": ' will',
}


def preprocess(s):
    return str.join(' ', tokenizer.tokenize(s))


tdf2 = TfidfVectorizer(ngram_range=(1, 3),
                       stop_words=stop_words,
                       max_features=30,
                       preprocessor=preprocess)

data = train_data['request_text'].map(lambda x: x.lower())
tdf2.fit(data).transform(data)
pprint(tdf2.get_feature_names())

#text = ' '.join([x.encode('utf-8') for x in train_data['request_title']])
# pywordcloud.create(text, outfile="output.html", uppercase=False,

def uploaded_file():
    df = read_json(session['df'])
    df.sort_values('pos', inplace=True)

    # construct Bokeh plot of k-distance sensitivity and obtain js scripting.
    ksens_df = read_json(session['k_sensitivity'])
    ksens_df.sort_values('k', inplace=True)
    ksens = bokeh_lines(ksens_df['k'].values.tolist(),
                        ys=[ksens_df[x].values.tolist() for x in ['nupop_tpr', 'nuclstm_tpr']],
                        labels=['nupop', 'nuclstm'],
                        x_label='distance (bp)',
                        y_label='sensitivity (%)')
    ksens_script, ksens_div = components(ksens)

    # get model comparison summary
    summary_df = read_json(session['summary'])

    # keep and sort the features we'd be interested in viewing.
    feature_names = list(set(df.columns.tolist()) - set(['Chr', 'pos', 'seq']))
    feature_names.sort()

    if request.method == 'POST':
        # obtain desired features to analyze
        selected_features = request.form.getlist('feature_names')

        if selected_features:
            selected_features.sort()

            # determine max of y-axis
            try:
                y_max = float(request.form['y_max'])
            except:
                y_max = max(df[selected_features].max())

            # get start/end positions of chromosome to load (only seq_len will be displayed at once)
            start = max(int(request.form['start_position']), 0)
            end = int(request.form['end_position'])

            # construct Bokeh base position plot and obtain js scripting to send to ui.
            pos = base_position_plot(df,
                                     features=selected_features,
                                     start=start,
                                     end=end,
                                     y_max=y_max,
                                     seq_len=1000,
                                     plot_width=900,
                                     plot_height=600)
            pos_script, pos_div = components(pos)

            # obtain correlation matrix of selected features.
            corr = df[selected_features].corr()

            print('k-sensitivity script:\n{0}'.format(ksens_script))
            print('k-sensitivity div:\n{0}'.format(ksens_div))

            return render_template('chromosome.html',
                                   chromosome=session['chromosome'],
                                   feature_names=feature_names,
                                   selected_features=selected_features,
                                   summary_table=summary_df.to_html(index=False),
                                   pos_script=pos_script,
                                   pos_div=pos_div,
                                   ksens_script=ksens_script,
                                   ksens_div=ksens_div,
                                   corr_table=corr.to_html())

    return render_template('chromosome.html',
                           chromosome=session['chromosome'],
                           feature_names=feature_names,
                           selected_features=None,
                           summary_table=summary_df.to_html(index=False),
                           ksens_script=ksens_script,
                           ksens_div=ksens_div)

def _load_dataframe(fname):
    df = read_json(os.path.expanduser(fname))
    df['created'] = df['created'].apply(
        lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    return df

def run():
    ### data handling ###
    print('reading files')
    train_data_raw = read_json('data/train.json')  ## pandas data frame
    test_data_raw = read_json('data/test.json')    ## pandas data frame

    ### train classifiers ###
    # nlp classifier
    print 'getting nlp scores'
    y_train = train_data_raw["requester_received_pizza"]
    #nlp_clf_title = RawDataClassifier(NLPClassifier (), NLPEngineer('request_title', max_features_ = 1000))
    #nlp_clf_title.fit(train_data_raw, y_train)

    # metadata classifier
    print 'getting meta data scores'
    meta_clf = ensemble.GradientBoostingClassifier(n_estimators=30)
    nlp_clf = NLPClassifier()
    nlp_clf2 = NLPClassifier()

    estimators = [meta_clf, nlp_clf, nlp_clf2]

    meta_engineer = MetadataEngineer()
    X_meta_train = meta_engineer.transform(train_data_raw)

    nlp_engineer = NLPEngineer('request_text_edit_aware', max_features_=5000)
    X_nlp_train = nlp_engineer.transform(train_data_raw)

    nlp_engineer2 = NLPEngineer('request_title', max_features_=5000)
    X_nlp_train2 = nlp_engineer2.transform(train_data_raw)

    input_train = [X_meta_train, X_nlp_train, X_nlp_train2]

    skf = list(cross_validation.StratifiedKFold(y_train, 10))
    stacking = Stacking(LogisticRegression, estimators, skf, raw=True)
    stacking.fit(input_train, y_train)

    X_meta_test = meta_engineer.transform(test_data_raw)
    X_nlp_test = nlp_engineer.transform(test_data_raw)
    X_nlp_test2 = nlp_engineer2.transform(test_data_raw)
    input_test = [X_meta_test, X_nlp_test, X_nlp_test2]

    y_test_pred = stacking.predict_proba(input_test)[:, 1]
    test_ids = test_data_raw['request_id']

    print 'writing to file'
    fcsv = open('raop_prediction.csv', 'w')
    fcsv.write("request_id,requester_received_pizza\n")
    for index in range(len(y_test_pred)):
        theline = str(test_ids[index]) + ',' + str(y_test_pred[index]) + '\n'
        fcsv.write(theline)
    fcsv.close()