def main():
    parser = ArgumentParser()
    parser.add_argument('--spark', dest='spark', action='store_true', default=False)
    args = parser.parse_args()
    
    doc_freq = read_json('../../data/doc_freq.json', typ='series')
    N = doc_freq['TOTAL_DOCS']
    idf_vector = np.log10(N / doc_freq)
    for line in sys.stdin:
        url, tf_vector = line.split()
        tf_vector = read_json(tf_vector, typ='series')
        
        # Calculate tf-idf from tf vector and doc_freq vector 
        tf_idf_vector = tf_vector.multiply(idf_vector, fill_value=0)
        
        # Normalize the tf-idf vector; this is important for the clustering
        # done later on.
        tf_idf_vector = tf_idf_vector / tf_idf_vector.sum()
        
        if NONZERO_ONLY:
            # Keep only the tf-idf entries for terms that actually occur in this
            # document, since the vector is effectively sparse.
            output_vector = tf_idf_vector[tf_idf_vector.nonzero()[0]]
        else:
            output_vector = tf_idf_vector
            
        s = StringIO()
        
        output_vector.to_json(s)
        if args.spark:
            indices_zip = list(zip(tf_idf_vector.nonzero()[0].tolist(),output_vector.tolist()))
            print('%s\t%s' % (url,indices_zip))
        else:
            print('%s\t%s' % (url, s.getvalue()))
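For context, the mapper above relies on several module-level names that the snippet does not show; a minimal sketch of the assumed imports and the NONZERO_ONLY flag (the exact import paths are an assumption):

from argparse import ArgumentParser
from io import StringIO
import sys

import numpy as np
from pandas import read_json

# Assumed module-level flag: emit only the nonzero tf-idf entries per document.
NONZERO_ONLY = True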
Example #2
def test_good_kwargs():
    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
    with tm.assert_produces_warning(None):
        tm.assert_frame_equal(df, read_json(df.to_json(orient="split"), orient="split"))
        tm.assert_frame_equal(
            df, read_json(df.to_json(orient="columns"), orient="columns")
        )
        tm.assert_frame_equal(df, read_json(df.to_json(orient="index"), orient="index"))
Example #3
def test_deprecated_kwargs():
    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
    buf = df.to_json(orient="split")
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_frame_equal(df, read_json(buf, "split"))
    buf = df.to_json(orient="columns")
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_frame_equal(df, read_json(buf, "columns"))
    buf = df.to_json(orient="index")
    with tm.assert_produces_warning(FutureWarning):
        tm.assert_frame_equal(df, read_json(buf, "index"))
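The FutureWarning comes from passing orient positionally; spelling it as a keyword, as in test_good_kwargs above, is the supported form (a small self-contained sketch):

import pandas as pd
from pandas import read_json

df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]})
roundtripped = read_json(df.to_json(orient="split"), orient="split")  # keyword form, no warning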
Example #4
def get_data_df_from_JSON_Data(ls_filenames):
    # use the list of filenames, find the json dataset, and return the data frame
    # that has all the json data

    data_df = read_json(open(ls_filenames[0], 'r'), orient='records')
    if (len(ls_filenames) >= 2):
        for i in range(1, len(ls_filenames)):
            data_df = data_df.append(read_json(open(ls_filenames[i], 'r'),
                                               orient='records'),
                                     ignore_index=True)

    return data_df
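DataFrame.append, used in the loop above, was deprecated and later removed from pandas; a sketch of the same aggregation with pd.concat (the function name here is just illustrative):

import pandas as pd
from pandas import read_json

def concat_json_files(ls_filenames):
    # Read each JSON file of records and stack them into a single data frame.
    frames = [read_json(open(fname, 'r'), orient='records') for fname in ls_filenames]
    return pd.concat(frames, ignore_index=True)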
Example #5
def publish():
    form = Form()
    if form.validate_on_submit():
        try:
            # TODO: add this to config instead
            json_filepath = os.path.join('C:\\', 'Users', 'michhar', 'Documents', 'MLADS', 'data', 'MessyDoc-8f814e3f2a78.json')
            json_key = json.load(open(json_filepath))

            credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'].encode(), config.SCOPE)

            gc = gspread.authorize(credentials)

            # TODO: create a worksheet if not there, also put this in config
            wksheet = gc.open("SSF_Crop_Master_2012_Master_crop_master").worksheet('latest')

            # make a client connection
            client = document_client.DocumentClient(config.DOCUMENTDB_HOST, {'masterKey': config.DOCUMENTDB_KEY})

            # Read databases and get our working database
            db = next((data for data in client.ReadDatabases() if data['id'] == config.DOCDB_DATABASE))

            # Read collections and get the "user collection"
            coll_master = next((coll for coll in client.ReadCollections(db['_self']) if coll['id'] == config.DOCDB_COLLECTION_MASTER))


            master_doc = next((doc for doc in client.ReadDocuments(coll_master['_self']) if doc['id'] == config.DOCDB_MASTER_DOC))
            master_data_df = read_json(master_doc['data'])
            headers = read_json(master_doc['data_headers'])
            master_data_df.columns = headers

            # update all cells in master google doc with data in master doc from db
            # this takes a minute or two (maybe put into a separate view function)
            update_worksheet(wksheet, master_data_df)

            return render_template('results.html',
                        masterlink = 'https://docs.google.com/spreadsheets/d/1MKcDtjI5E-iNv9tU2KcA5yJWWgaSTh5j2IjPYOp9lic/pubhtml',
                        title = 'Results',
                        year = datetime.now().year,
                        message = 'Success! Your data has been stored and the master sheet updated here ')

        except gspread.SpreadsheetNotFound as e:

            return render_template('error_page.html',
                                   title = 'Something went wrong!',
                                   year = datetime.now().year,
                                   message = '''The spreadsheet was not found.
                                   Please ensure you have enabled Google Drive API and
                                   created a new set of credentials.''',
                                   link = 'http://gspread.readthedocs.org/en/latest/oauth2.html')
Example #6
    def get_group(self, key):
        new_query = 'select value t.grps from (%s) t where grp_id=%s;' % (
            self.query[:-1], str(key))
        results = json.dumps(af.AFrame.send_request(new_query)[0])
        grp = json.read_json(results)['grp']
        df = pd.DataFrame(grp.tolist())
        return df
Example #7
    def collect(self):
        results = af.AFrame.send_request(self._query)
        json_str = json.dumps(results)
        result = pd.DataFrame(data=json.read_json(json_str))
        if '_uuid' in result.columns:
            result.drop('_uuid', axis=1, inplace=True)
        return result
Example #8
def data_label_processing():
    
    #Importing label names
    image_name_df = pd.read_excel(IMG_LBL_PATH, sheet_name='Sheet1')
    image_name_df.columns = ['species', 'label_id', 'label']

    species = []
    for idx in image_name_df.index:
        if (pd.isnull(image_name_df.at[idx,'species'])):
            image_name_df.at[idx,'species'] = save_class
        else:
            save_class = image_name_df.at[idx,'species']
            species.append(save_class)
    imn_df = image_name_df.reset_index(drop=True)

    #Importing image names
    image_id_df = read_json(IMG_JSON_PATH)
    image_id_df.columns = ['label_id', 'image_name']
    imi_df = image_id_df.reset_index(drop=True)

    #Combining labels and image names
    images_df = pd.merge(imn_df, imi_df, how='inner', on='label_id')
    plant_df = images_df.sample(frac=1).reset_index(drop=True)

    return plant_df
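The manual loop that carries the last seen species forward can also be expressed with pandas' ffill; a sketch assuming the same three-column layout:

import pandas as pd

image_name_df = pd.read_excel(IMG_LBL_PATH, sheet_name='Sheet1')
image_name_df.columns = ['species', 'label_id', 'label']

# Forward-fill the blank 'species' cells and collect the distinct species names.
image_name_df['species'] = image_name_df['species'].ffill()
species = image_name_df['species'].dropna().unique().tolist()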
Example #9
def parseDataset():
	"""
	parses the dataset and creates list of dictionaries, similar to a baseline dictionary
	"""
	with open(dataset) as datasetFileHandler:
		datarows = datasetFileHandler.readlines()
		for datarow in datarows:
			dataframe = pd.read_json(datarow)
			outputDict = getBaselineDictionary()

			# cleanse DF as per reqs
			outputDict["business_id"] = dataframe.business_id[0]

			for timeline, number in dataframe.checkin_info.iteritems():
				try:
					hour, day = timeline.split("-")
					if (day in weekends):
						outputDict["num_of_visits_in_weekends"] += number
					if (day in weekdays):
						outputDict["num_of_visits_in_weekdays"] += number
					if (hour in morning):
						outputDict["num_of_morning_visits"] += number
					if (hour in afternoon):
						outputDict["num_of_afternoon_visits"] += number
					if (hour in evening):
						outputDict["num_of_evening_visits"] += number
				except:
					pass

			outputDataFramesList.append(outputDict)
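Series.iteritems, used in the inner loop, was removed in pandas 2.0; on current versions the same iteration uses .items() (a self-contained sketch with made-up check-in keys):

import pandas as pd

checkin_info = pd.Series({"9-5": 3, "21-6": 1})   # hypothetical "<hour>-<day>" keys
for timeline, number in checkin_info.items():     # drop-in replacement for .iteritems()
    hour, day = timeline.split("-")
    print(hour, day, number)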
Example #11
def load(test = False, cols = COLS):
    """Loads data from FTEST if *test* is True, otherwise from FTRAIN.
    Pass a list of *cols* if you're only interested in a subset of the
    target columns.
    """
    fname = FTEST if test else FTRAIN
    df = read_json(os.path.expanduser(fname)) # load pandas dataframe

    df['price_per_bathroom'] = np.log((df['price']+1)/(df['bathrooms']+1))
    df['price_per_bedroom'] = np.log((df['price']+1)/(df['bedrooms']+1))
    df['price'] = np.log(df['price'] + 1)
    df['created'] = df['created'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    df['day_of_month'] = df['created'].apply(lambda x: x.day)
    df['hour'] = df['created'].apply(lambda x: x.hour + (x.minute + x.second / 60.0) / 60.0)
    df['day_of_week'] = df['created'].apply(lambda x: x.weekday())
    df['desc_len'] = df['description'].apply(lambda desc: len([x for x in re.split(r'\W+', desc) if len(x) > 0]))
    df['num_features'] = df['features'].apply(len)
    df['features_len'] = df['features'].apply(lambda feats: sum([len([x for x in re.split(r'\W+', feat) if len(x) > 0]) for feat in feats]))
    df['num_photos'] = df['photos'].apply(len)
    # force all coordinates within NYC area
    df['longitude'] = df['longitude'].apply(bound(-74.3434, -73.62))
    df['latitude'] = df['latitude'].apply(bound(40.4317, 41.0721))

    print(df.count())  # prints the number of values for each column
    
    if not test:  # only FTRAIN has any target columns
        df = df.dropna()  # drop all rows that have missing values in them
        X = np.array(df[cols], dtype = np.float32)
        y = np.array(get_dummies(df['interest_level'])[OUTPUT_COLS], np.float32)
        X, y = shuffle(X, y, random_state=42)  # shuffle train data
    else:
        X = np.array(df[cols], dtype = np.float32)
        y = df['listing_id'].as_matrix()

    return X, y
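Series.as_matrix, used for the test branch above, was removed in pandas 1.0; .to_numpy() is the current equivalent (a minimal sketch with stand-in data):

import pandas as pd

df = pd.DataFrame({'listing_id': [101, 102, 103]})  # hypothetical stand-in rows
y = df['listing_id'].to_numpy()                     # replaces df['listing_id'].as_matrix()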
Example #12
    def head(self, num=5):
        new_query = self.query[:-1] + ' limit %d;' % num
        results = af.AFrame.send_request(new_query)
        json_str = json.dumps(results)
        result = pd.DataFrame(data=json.read_json(json_str))
        if '_uuid' in result.columns:
            result.drop('_uuid', axis=1, inplace=True)
        return result
Example #13
    def get_curve_fitter(self):

        group_df = read_json(self.raw_data)

        group_df.sort(["percent_inhib","concentration"], inplace=True)
        curve_fitter = IC50CurveFit(main_group_df=group_df)
        curve_fitter.get_fit(self, constrained=True)
        return curve_fitter
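DataFrame.sort was removed in pandas 0.20; the same ordering is now spelled sort_values (a sketch with stand-in rows):

import pandas as pd

group_df = pd.DataFrame({"percent_inhib": [0.4, 0.1],
                         "concentration": [10.0, 1.0]})  # hypothetical rows
group_df = group_df.sort_values(["percent_inhib", "concentration"])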
Example #14
def post_region(module,method):
    """
    To make a POST using CURL to the flask dev server:
    Fisher-Jenks using the Hartigan Olympic time example
    curl -i -H "Content-Type: application/json" -X POST -d '{"args":["[12, 10.8, 11, 10.8, 10.8, 10.6, 10.8, 10.3, 10.3,10.3,10.4,10.5,10.2,10.0,9.9]"], "kwargs":{"k":5}}' http://localhost:5000/ap/esda/fisher_jenks/
    or
    Sample Jenks Caspall using the same example - note that sample
     percentage is not passed.
    curl -i -H "Content-Type: application/json" -X POST -d '{"args":["[12, 10.8, 11, 10.8, 10.8, 10.6, 10.8, 10.3, 10.3,10.3,10.4,10.5,10.2,10.0,9.9]"], "kwargs":{"k":5}}'  http://localhost:5000/ai/esda/jenks_caspall_sampled/
    """
    if not request.json:
        response = {'status': 'error', 'data': 'Post datatype was not json'}
        return jsonify(response), 400
    else:
        response = {'status':'success','data':{}}

        #Setup the call, the args and the kwargs
        call = funcs[module][method]

        #Parse the args
        args = request.json['args']
        print type(args)
        validargs = []
        for a in args:
            #Literal eval to get the native python type
            va = json.loads(a)
            #va = ast.literal_eval(a)
            #If it is a list, cast to a numpy ndarray via pandas json io
            #This should go to a decorator on the PySAL side at some point
            if isinstance(va, list):
                va = read_json(a)
            validargs.append(va.values.ravel())

        #Check for and parse the kwargs
        validkwargs = {}
        try:
            kwargs = request.json['kwargs']
            validkwargs = ast.literal_eval(str(kwargs))
        except:
            pass

        #Make the call and get the return items
        funcreturn = vars(call(*validargs, **validkwargs))
        for k, v in funcreturn.iteritems():
            if isinstance(v, np.ndarray):
                funcreturn[k] = v.tolist()
            elif isinstance(v, ps.W):
                print "W OBJ"

        response['data'] = funcreturn

        return jsonify(response)
Example #16
    def trending(self):
        tweets = read_json(self.tweets_file, lines=True)

        top_trending_words = pd.Series(' '.join(tweets.text).lower().split()).value_counts()[:50]

        # get used hashtags per tweet
        hashtags = json_normalize(tweets['entities'], 'hashtags',  errors='ignore')
        hashtags['text'] = hashtags['text'].str.lower()
        top_trending_hashtags = hashtags['text'].value_counts()[:50]

        _logger.debug("top trending words: \n%s" % top_trending_words)
        _logger.debug("top trending hashtags: \n%s" % top_trending_hashtags)
        return (top_trending_hashtags, top_trending_words)
Example #17
    def toPandas(self, sample: int = 0):
        from pandas.io import json

        if self._dataset is None:
            raise ValueError('no dataset specified')
        else:
            dataset = self._dataverse + '.' + self._dataset
            if sample > 0:
                query = 'select value t from %s t limit %d;' % (dataset,
                                                                sample)
            else:
                query = 'select value t from %s t;' % dataset
            result = self.send_request(query)
            data = json.read_json(json.dumps(result))
            df = pd.DataFrame(data)
            if '_uuid' in df.columns:
                df.drop('_uuid', axis=1, inplace=True)
            return df
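If send_request already returns a list of JSON objects, as the json.dumps call suggests, the round trip through a string is avoidable; a sketch of building the frame directly (the sample result is made up):

import pandas as pd

result = [{"name": "a", "_uuid": 1}, {"name": "b", "_uuid": 2}]  # hypothetical query result
df = pd.DataFrame(result)
if '_uuid' in df.columns:
    df.drop('_uuid', axis=1, inplace=True)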
Example #18
    def load_json(self, path, rename=None, index=None):
        '''Read a json file as a pandas dataframe.

        Parameters
        ----------
        rename : list of string tuples (new old), optional
            columns to rename
        index : string, optional
            post-rename column to use as the row label.
        '''
        data = pj.read_json(path, orient='records')

        Cache.rename_columns(data, rename)

        if index is not None:
            data.set_index([index], inplace=True)

        return data
Example #19
def save_predictions(clf, train_data, target):
    """
    Uses classifier to predict the targets for test_data, and stores the
    predictions in a .csv file suitable for uploading to kaggle.com.
    """
    test_data = sanitize_train_data(read_json('test.json'))

    # The request_id attribute should be the second column.
    # Force the predictions to 0/1 instead of True/False.
    rows = zip(
        test_data['request_id'].tolist(),
        clf.fit(train_data, target).predict(test_data).astype(int).tolist(),
    )

    with open('test_predictions.csv', 'w') as f:
        f.write('request_id,requester_received_pizza\n')

        for row in rows:
            f.write('%s,%s\n' % row)  # file is opened in text mode, so write str directly
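The same file can also be produced with the csv module, which keeps the text handling and quoting explicit (a sketch with made-up rows):

import csv

rows = [(101, 1), (102, 0)]  # hypothetical (request_id, prediction) pairs

with open('test_predictions.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['request_id', 'requester_received_pizza'])
    writer.writerows(rows)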
Example #20
    def load_json(self,
                  path,
                  rename=None,
                  index=None):
        '''Read a json file as a pandas dataframe.
        
        Parameters
        ----------
        rename : list of string tuples (new old), optional
            columns to rename
        index : string, optional
            post-rename column to use as the row label.
        '''
        data = pj.read_json(path, orient='records')

        self.rename_columns(data, rename)
        
        if index is not None:        
            data.set_index([index], inplace=True)

        return data
Example #21
def all_entries():
    #connect to the db
    client = Cloudant(user, password, url=url, connect=True)
    db = client.create_database(db_name, throw_on_exists=False)

    #get all docs
    docs = list(map(lambda doc: doc, db))
    #put them into a dataframe
    fdocs = json_normalize(docs)
    fdocs = DataFrame(fdocs, columns=['date', 'component', 'data', '_id'])
    fdocs['date'] = to_datetime(fdocs['date'])
    fdocs = fdocs.reset_index(drop=True)
    fdocs = fdocs.sort_values(['date', 'component'])  # assign back: sort_values is not in-place by default
    #get the components
    components = fdocs['component'].unique().tolist()

    data = [None] * len(fdocs)
    for i, row in fdocs.iterrows():
        tmp = read_json(fdocs.loc[i, 'data'], orient='index')
        tmp = tmp.reset_index()
        data[i] = tmp
        fdocs.loc[i, 'data'] = i

    #make a list of same size as components
    complist = [None] * len(components)
    for i in range(len(components)):
        #drop everything but relevant info
        tmp = fdocs.drop(fdocs[fdocs.component != components[i]].index)
        #drop duplicates
        tmp = tmp.drop_duplicates(subset=['date'], keep='first', inplace=False)
        #sort them
        tmp = tmp.sort_values(['date'], ascending=[False])
        #re index the dataframe
        tmp = tmp.reset_index(drop=True)
        #put the dataframe into the list
        complist[i] = tmp
    #disconnect from db
    client.disconnect()
    return render_template('entries.html', entries=complist, data=data)
Example #22
 def get_results_for_datapoint(self):
     '''When ordered by concentration, take the nth group and do an average - only used for the 
     export to beehive function'''
     raw_dataframe = read_json(self.raw_data)
     raw_dataframe = raw_dataframe.groupby("concentration", sort=True)
     index = 0
     sorted_groups = reversed(sorted([g for g in raw_dataframe.groups]))
     for index, group in enumerate(sorted_groups):
         concentration = group
         df = raw_dataframe.get_group(group)
         inhibition = df["percent_inhib"].mean()
         inhibition_error = df["percent_inhib"].std()
         realind = index +1
         yield [(u"  Compound Concentration %d (uM) (Compound Concentration Range) " % realind, concentration ,),
                  (u"  Compound Concentration %d Inhibition (%%) (Compound Concentration Range) " % realind, inhibition * 100,),
                  (u"  Compound Concentration %d Error (%%) (Compound Concentration Range) " % realind, inhibition_error *100,)]
     while index < 11:
         index += 1
         #Fill in any missing values up to 12 columns in total
         realind = index +1
         yield [(u"  Compound Concentration %d (uM) (Compound Concentration Range) " % realind , "" ,),
              (u"  Compound Concentration %d Inhibition (%%) (Compound Concentration Range) " % realind, "" ,),
              (u"  Compound Concentration %d Error (%%) (Compound Concentration Range) " % realind, "" ,)]
Example #23
os.chdir("/home/iap13/wcx/bandgapdata/bgdata")
list_name = os.listdir()

from featurebox.data.impot_element_table import element_table

name_and_abbr = element_table.iloc[[0, 1], :]
element_table = element_table.iloc[2:, :]
elemen = element_table[[
    'electronegativity(martynov&batsanov)', 'electron number'
]]

dict_all = []
for i in tqdm(list_name):
    try:
        a = json.read_json(i, orient='index', typ='series')
        cif_str = a["Structure_rlx"]
        del a["Structure_rlx"]

        POSCAR = Poscar.from_string(cif_str)
        ele_den = POSCAR.structure.composition.total_electrons / POSCAR.structure.volume
        composition_mp = POSCAR.structure.composition

        ncom = POSCAR.structure.composition.to_data_dict[
            'unit_cell_composition'].values()

        sym_amt = composition_mp.get_el_amt_dict()
        syms = sorted(sym_amt.keys(), key=lambda sym: get_el_sp(sym).X)
        formula = {s: formula_double_format(sym_amt[s], False) for s in syms}

        departElementProPFeature = DepartElementFeaturizer(
Example #24
def get_info(_id):

    X0, X1 = 0, RIGHT - LEFT
    Y0, Y1 = 0, TOP - BOTTOM

    #connect to the db
    client = Cloudant(user, password, url=url, connect=True)
    db = client.create_database(db_name, throw_on_exists=False)
    #get the document from the db
    doc = db[_id]
    H = simplejson.loads(doc['krige_data'])
    z = read_json(doc['data'], orient='index')
    date = doc['date']

    #set standard minmax, these will be changed based on component
    maxvalue = 100
    minvalue = 0

    #set the min/max values according to what's unhealthy
    #max will be the unhealthy value, and in turn be red on the generated image
    #we set values based on https://uk-air.defra.gov.uk/air-pollution/daqi?view=more-info&pollutant=no2 , accessed 11.05.2018
    if (doc['component'] == 'PM2.5'):
        maxvalue = 60
        minvalue = 0
    if (doc['component'] == 'PM10'):
        maxvalue = 85
        minvalue = 0
    if (doc['component'] == 'NO2'):
        maxvalue = 420
        minvalue = 0

    buffr = io.BytesIO()
    fig, ax = subplots()
    fig.dpi = 400
    ax.imshow(H,
              cmap=my_cmap,
              vmin=minvalue,
              vmax=maxvalue,
              origin='lower',
              interpolation='nearest',
              alpha=0.7,
              extent=[X0, X1, Y0, Y1])

    sc = ax.scatter(z.x,
                    z.y,
                    cmap=my_cmap,
                    vmin=minvalue,
                    vmax=maxvalue,
                    c=z.value,
                    linewidths=0.75,
                    s=50)
    plt.colorbar(sc)

    fig.suptitle('component: ' + str(z['component'].iloc[0]) + ', date: ' +
                 str(date),
                 fontsize=14)
    fig.savefig(buffr, dpi=400)

    buffr.seek(0)
    #disconnect from db
    client.disconnect()
    return send_file(buffr, mimetype='image/png')
Example #25
            "wb")
        fo.write('[' + ''.join(match_data) + ']')
        filename = fo.name
        fo.close()

        # open .json file again to parse the data readable for the dataFrame in the future
        json_file = open(filename, 'r')
        content = json_file.read()
        json_file.close()
        content = content.replace("}{", "},\n{")
        d = open(filename, 'w')
        d.write(content)
        d.close()

        e = open(filename, 'r')
        data_df = read_json(e, orient='records')
        data_df['year'] = data_df["dateTime"].apply(
            lambda x: get_splitted_column_for_datetime('year', x))
        data_df['month'] = data_df["dateTime"].apply(
            lambda x: get_splitted_column_for_datetime('month', x))
        data_df['day'] = data_df["dateTime"].apply(
            lambda x: get_splitted_column_for_datetime('day', x))
        data_df['hr'] = data_df["dateTime"].apply(
            lambda x: get_splitted_column_for_datetime('hr', x))
        data_df['min'] = data_df["dateTime"].apply(
            lambda x: get_splitted_column_for_datetime('min', x))
        data_df['sec'] = data_df["dateTime"].apply(
            lambda x: get_splitted_column_for_datetime('sec', x))
        del data_df["dateTime"]
        e.close()
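The six apply calls that split dateTime can also be written with pandas' datetime accessor, assuming the column parses with to_datetime (a sketch; the project's get_splitted_column_for_datetime helper remains the reference for its exact rules):

import pandas as pd

data_df = pd.DataFrame({'dateTime': ['2016-01-02 03:04:05']})  # hypothetical stand-in

dt = pd.to_datetime(data_df['dateTime'])
data_df['year'], data_df['month'], data_df['day'] = dt.dt.year, dt.dt.month, dt.dt.day
data_df['hr'], data_df['min'], data_df['sec'] = dt.dt.hour, dt.dt.minute, dt.dt.second
data_df = data_df.drop(columns=['dateTime'])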
Example #26
def collaborate():
    form = SetupForm()
    if form.validate_on_submit():
        # gather user info
        if request.method == 'POST':
            creds = request.files['credfile']
            email = form.email.data
        else:
            return render_template('collaborate.html',
                                   form = form,
                                   title = 'Collaborate',
                                   year = datetime.now().year)

        try:
            # create connection to google doc
            json_key = json.loads(creds.read())
            credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'].encode(), config.SCOPE)
            gc = gspread.authorize(credentials)
            wksheet = gc.open(form.docname.data).worksheet('crop master')

            # get contents as a list of dicts
            contents = wksheet.get_all_values()

            df = pd.DataFrame(contents)

            # Tidy up
            #   - make some columns names based on first 5 rows
            #   - grab data and label columns
            headers = df.iloc[3:5,:]
            newheaders = headers.sum(axis=0)
            newdf = df.iloc[7:-1, :].copy()
            newdf.columns = newheaders

            # TODO: tidy up more
            #  - unnecessary columns
            #  - add user/farmer name as a multiindex

            # convert to json
            # records orientation will result in list like [{column -> value}, ... , {column -> value}]
            jsonrecord = newdf.to_json(orient = 'values')

            # going to use 'individuals' collection
            # does it exist?  if not make it, if so just add this doc

            # make a client connection
            client = document_client.DocumentClient(config.DOCUMENTDB_HOST, {'masterKey': config.DOCUMENTDB_KEY})

            # Read databases and get our working database
            db = next((data for data in client.ReadDatabases() if data['id'] == config.DOCDB_DATABASE))

            # Read collections and get the "user collection"
            coll_user = next((coll for coll in client.ReadCollections(db['_self']) if coll['id'] == config.DOCDB_COLLECTION_USER))

            # create or update user using upsert API
            doc = client.UpsertDocument(coll_user['_self'],
                                        { 'id': form.email.data,
                                          'timestamp': datetime.now().strftime('%c'),
                                          'data': jsonrecord,
                                          'data_headers': newheaders.to_json(orient = 'values')})

            # Read collections and get the "master collection"
            coll_master = next((coll for coll in client.ReadCollections(db['_self']) 
                                if coll['id'] == config.DOCDB_COLLECTION_MASTER))

            doc_definition = { 'id': config.DOCDB_MASTER_DOC,
                               'timestamp': datetime.now().strftime('%c'),
                               'data_headers': newheaders.to_json(orient = 'values')}

            # get all user docs
            user_docs = client.ReadDocuments(coll_user['_self'])

            # gather data in user docs into one dataframe
            user_data_dfs = []
            for doc in user_docs:
                user_data_dfs.append(pd.DataFrame(read_json(doc['data'])))
            user_data_concatd = pd.concat(user_data_dfs)

            # convert to json
            master_records = user_data_concatd.to_json(orient = 'values')

            # add data to the doc definition
            doc_definition['data'] = master_records

            # upsert master with the doc definition
            doc = client.UpsertDocument(coll_master['_self'], doc_definition)


        except gspread.SpreadsheetNotFound as e:
            return render_template('error_page.html',
                                   title = 'Something went wrong!',
                                   year = datetime.now().year,
                                   message = '''The spreadsheet was not found.
                                   Please ensure you have enabled Google Drive API and
                                   created a new set of credentials.''',
                                   link = 'http://gspread.readthedocs.org/en/latest/oauth2.html')
                                   
        return redirect(url_for('.publish'))
Example #27
    def wrap(self, fn, path, cache,
             save_as_json=True,
             return_dataframe=False,
             index=None,
             rename=None,
             **kwargs):
        '''make an rma query, save it and return the dataframe.

        Parameters
        ----------
        fn : function reference
            makes the actual query using kwargs.
        path : string
            where to save the data
        cache : boolean
            True will make the query, False just loads from disk
        save_as_json : boolean, optional
            True (default) will save data as json, False as csv
        return_dataframe : boolean, optional
            True will cast the return value to a pandas dataframe, False (default) will not
        index : string, optional
            column to use as the pandas index
        rename : list of string tuples, optional
            (new, old) columns to rename
        kwargs : objects
            passed through to the query function

        Returns
        -------
        dict or DataFrame
            data type depends on return_dataframe option.

        Notes
        -----
        Column renaming happens after the file is reloaded for json
        '''
        if cache is True:
            json_data = fn(**kwargs)

            if save_as_json is True:
                ju.write(path, json_data)
            else:
                df = pd.DataFrame(json_data)
                self.rename_columns(df, rename)

                if index is not None:
                    df.set_index([index], inplace=True)

                df.to_csv(path)

        # read it back in
        if save_as_json is True:
            if return_dataframe is True:
                data = pj.read_json(path, orient='records')
                self.rename_columns(data, rename)
                if index is not None:
                    data.set_index([index], inplace=True)
            else:
                data = ju.read(path)
        elif return_dataframe is True:
            data = pd.DataFrame.from_csv(path)
        else:
            raise ValueError(
                'save_as_json=False cannot be used with return_dataframe=False')

        return data
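pd.DataFrame.from_csv, used in the csv branch above, was removed in pandas 1.0; read_csv with an explicit index column reads the cache back the same way (a minimal sketch):

import pandas as pd

path = 'cache.csv'  # hypothetical path, written by df.to_csv(path) above
data = pd.read_csv(path, index_col=0, parse_dates=True)  # replacement for pd.DataFrame.from_csv(path)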
Example #28
 def wrap(self, fn, path, cache,
          save_as_json=True,
          return_dataframe=False,
          index=None,
          rename=None,
          **kwargs):
     '''make an rma query, save it and return the dataframe.
     
     Parameters
     ----------
     fn : function reference
         makes the actual query using kwargs.
     path : string
         where to save the data
     cache : boolean
         True will make the query, False just loads from disk
     save_as_json : boolean, optional
         True (default) will save data as json, False as csv
     return_dataframe : boolean, optional
         True will cast the return value to a pandas dataframe, False (default) will not 
     index : string, optional
         column to use as the pandas index
     rename : list of string tuples, optional
         (new, old) columns to rename
     kwargs : objects
         passed through to the query function
     
     Returns
     -------
     dict or DataFrame
         data type depends on return_dataframe option.
     
     Notes
     -----
     Column renaming happens after the file is reloaded for json
     '''
     if cache == True:
         json_data = fn(**kwargs)
         
         if save_as_json == True:
             ju.write(path, json_data)
         else:
             df = pd.DataFrame(json_data)
             self.rename_columns(df, rename)
             
             if index is not None:
                 df.set_index([index], inplace=True)
     
             df.to_csv(path)
 
     # read it back in
     if save_as_json == True:
         if return_dataframe == True:
             data = pj.read_json(path, orient='records')
             self.rename_columns(data, rename)
             if index != None:
                 data.set_index([index], inplace=True)
         else:
             data = ju.read(path)
     elif return_dataframe == True:
         data = pd.DataFrame.from_csv(path)
     else:
         raise ValueError('save_as_json=False cannot be used with return_dataframe=False')
     
     return data
Example #29
import os
from matplotlib import rcParams

NUM_TERMS = 20
NUM_CLUSTERS = 20

class ClusterComponent(object):
    
    def __init__(self, word, index, score):
        self.word = word
        self.index = index
        self.score = score

if __name__ == '__main__':
    rcParams.update({'figure.autolayout':True})
    doc_freq = read_json('../../data/doc_freq.json', typ='series')
    doc_freq_index = doc_freq.index
    n = NUM_CLUSTERS
    cluster_ls = []
    average_cluster_length_ls = []
    while n >= 2:
        df = pd.DataFrame.from_csv('../../data/clusters_%d.txt' %(n), header=None, index_col=None)
        # Create a new directory.
        dir_name_path = '../../data/clusters/%d' % (n)
        os.makedirs(dir_name_path, exist_ok=True)
        word_clusters = []
        for index, row in df.iterrows():
            t = row.nlargest(NUM_TERMS)
            w_words = pd.Series()
            for item_index, item_value in t.iteritems():
                word = doc_freq_index[item_index]
Example #30
def _parse_options_data(jd):
    return read_json(jd)
Example #31
    def cache_json_dataframe():
        return {
            'writer': ju.write,
            'reader': lambda p: pj.read_json(p, orient='records')
        }
Example #32
from __future__ import division
from nltk import SpaceTokenizer
from nltk.corpus import stopwords
from pandas.io.json import read_json
from pywordcloud import pywordcloud
from pprint import pprint
import re
from sklearn.feature_extraction.text import TfidfVectorizer

train_data = read_json('train.json')

tokenizer = SpaceTokenizer()
stop_words = stopwords.words('english')
replacements = {
    "'ve": ' have',
    "'ll": ' will',
}


def preprocess(s):
    return str.join(' ', tokenizer.tokenize(s))

tdf2 = TfidfVectorizer(ngram_range=(1, 3), stop_words=stop_words,
                       max_features=30, preprocessor=preprocess)

data = train_data['request_text'].map(lambda x: x.lower())
tdf2.fit(data).transform(data)
pprint(tdf2.get_feature_names())

#text = ' '.join([x.encode('utf-8') for x in train_data['request_title']])
# pywordcloud.create(text, outfile="output.html", uppercase=False,
Example #33
def uploaded_file():
    df = read_json(session['df'])
    df.sort_values('pos', inplace=True)

    # construct Bokeh plot of k-distance sensitivity and obtain js scripting.
    ksens_df = read_json(session['k_sensitivity'])
    ksens_df.sort_values('k', inplace=True)
    ksens = bokeh_lines(
        ksens_df['k'].values.tolist(),
        ys=[ksens_df[x].values.tolist() for x in ['nupop_tpr', 'nuclstm_tpr']],
        labels=['nupop', 'nuclstm'],
        x_label='distance (bp)',
        y_label='sensitivity (%)')

    ksens_script, ksens_div = components(ksens)

    # get model comparison summary
    summary_df = read_json(session['summary'])

    # keep and sort the features we'd be interested in viewing.
    feature_names = list(set(df.columns.tolist()) - set(['Chr', 'pos', 'seq']))
    feature_names.sort()

    if request.method == 'POST':

        # obtain desired features to analyze
        selected_features = request.form.getlist('feature_names')

        if selected_features:
            selected_features.sort()

            # determine max of y-axis
            try:
                y_max = float(request.form['y_max'])
            except:
                y_max = max(df[selected_features].max())

            # get start/end positions of chromosome to load (only seq_len will be displayed at once)
            start = max(int(request.form['start_position']), 0)
            end = int(request.form['end_position'])

            # construct Bokeh base position plot and obtain js scripting to send to ui.
            pos = base_position_plot(df,
                                     features=selected_features,
                                     start=start,
                                     end=end,
                                     y_max=y_max,
                                     seq_len=1000,
                                     plot_width=900,
                                     plot_height=600)

            pos_script, pos_div = components(pos)

            # obtain correlation matrix of selected features.
            corr = df[selected_features].corr()

            print('k-sensitivity script:\n{0}'.format(ksens_script))
            print('k-sensitivity div:\n{0}'.format(ksens_div))

            return render_template(
                'chromosome.html',
                chromosome=session['chromosome'],
                feature_names=feature_names,
                selected_features=selected_features,
                summary_table=summary_df.to_html(index=False),
                pos_script=pos_script,
                pos_div=pos_div,
                ksens_script=ksens_script,
                ksens_div=ksens_div,
                corr_table=corr.to_html())

    return render_template('chromosome.html',
                           chromosome=session['chromosome'],
                           feature_names=feature_names,
                           selected_features=None,
                           summary_table=summary_df.to_html(index=False),
                           ksens_script=ksens_script,
                           ksens_div=ksens_div)
Example #35
def _load_dataframe(fname):
    df = read_json(os.path.expanduser(fname))
    df['created'] = df['created'].apply(
        lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    return df
def run():
    ### data handling ###       
    print('reading files')
    train_data_raw = read_json('data/train.json')   ## pandas data frame
    test_data_raw = read_json('data/test.json')     ## pandas data frame
     
    ###  train classifiers  ###
    
    #nlp classifier   
    
    print('getting nlp scores')
    
    y_train = train_data_raw["requester_received_pizza"]     
    
    #nlp_clf_title = RawDataClassifier(NLPClassifier (), NLPEngineer('request_title', max_features_ = 1000))
    #nlp_clf_title.fit(train_data_raw, y_train)    
    
    #metadata classifier
    
    print('getting meta data scores')
    
    meta_clf = ensemble.GradientBoostingClassifier(n_estimators = 30)
    nlp_clf = NLPClassifier ()
    nlp_clf2 = NLPClassifier ()
    estimators = [meta_clf, nlp_clf,nlp_clf2]
 
    meta_engineer = MetadataEngineer()
    X_meta_train = meta_engineer.transform(train_data_raw)
    
    nlp_engineer = NLPEngineer('request_text_edit_aware', max_features_ = 5000)
    X_nlp_train = nlp_engineer.transform(train_data_raw)
    
    nlp_engineer2 = NLPEngineer('request_title', max_features_ = 5000)
    X_nlp_train2 = nlp_engineer2.transform(train_data_raw)
    
    input_train = [X_meta_train,X_nlp_train,X_nlp_train2]
    
    skf = list(cross_validation.StratifiedKFold(y_train, 10))
    stacking = Stacking(LogisticRegression, estimators,
                 skf, raw = True
                 )
    
    stacking.fit(input_train, y_train)
    
    X_meta_test = meta_engineer.transform(test_data_raw)  
    X_nlp_test = nlp_engineer.transform(test_data_raw)
    X_nlp_test2 = nlp_engineer2.transform(test_data_raw)    
    input_test = [X_meta_test,X_nlp_test,X_nlp_test2]    
            
    y_test_pred = stacking.predict_proba(input_test)[:, 1]
    
    test_ids=test_data_raw['request_id']    

    print('writing to file')
    
    fcsv = open('raop_prediction.csv','w')
    fcsv.write("request_id,requester_received_pizza\n")
    for index in range(len(y_test_pred)):
        theline = str(test_ids[index]) + ',' + str(y_test_pred[index])+'\n'
        fcsv.write(theline)
    
    fcsv.close()