Example #1
def get_poisson_distribution(date_range, country_code, global_min, global_max):
    """
    Args:
        date_range (pandas.core.series.Series): The date range of country data for the poisson distribution to be applied to.
        country_code (string): The country code of the country being explored.
        global_min (pandas.core.series.Series): A time series list of the global minimum tendencies for tor users.
        global_max (pandas.core.series.Series): A time series list of the global maximum tendencies for tor users.

    Returns:
        pandas.Series: the country code with the plausible min/max range and, when
            both dates are available, the user count plus the censorship/spike event flags.
    """
    current_date = date_range[0]
    comparison_date = date_range[1]
    #print(date_range)

    # If there is not a global min or a global max on the day in question then don't even try
    if pd.isnull(global_min[date_range.name]) or pd.isnull(global_max[date_range.name]):
        return pd.Series({"country":country_code,"min":None, "max":None})

    # We can't do this without both dates
    if np.isnan(comparison_date) or np.isnan(current_date):
        return pd.Series({"country":country_code,"min":None, "max":None})
    else:
        down_score = 0
        up_score = 0
        # poisson.ppf(plausible_range, shape_params)
        min_range = global_min[date_range.name] * poisson.ppf(1-0.9999, comparison_date)
        max_range = global_max[date_range.name] * poisson.ppf(0.9999, comparison_date)
        if current_date < min_range:
            down_score = 1
        if current_date > max_range:
            up_score = 1

        return pd.Series({"country":country_code,"min":min_range, "max":max_range, "users":current_date, "event_censor":down_score, "event_spike":up_score})
Example #2
def remove_nans(divisions):
    """ Remove nans from divisions

    These sometime pop up when we call min/max on an empty partition

    Examples
    --------
    >>> remove_nans((np.nan, 1, 2))
    [1, 1, 2]
    >>> remove_nans((1, np.nan, 2))
    [1, 2, 2]
    >>> remove_nans((1, 2, np.nan))
    [1, 2, 2]
    """
    divisions = list(divisions)

    for i in range(len(divisions) - 2, -1, -1):
        if pd.isnull(divisions[i]):
            divisions[i] = divisions[i + 1]

    for i in range(len(divisions) - 1, -1, -1):
        if not pd.isnull(divisions[i]):
            for j in range(i + 1, len(divisions)):
                divisions[j] = divisions[i]
            break

    return divisions
Example #3
    def test_gen_drawdown_table(self, px, expected_peak,
                                expected_valley, expected_recovery,
                                expected_duration):
        rets = px.pct_change().iloc[1:]

        drawdowns = timeseries.gen_drawdown_table(rets, top=1)
        self.assertTrue(
            pd.isnull(
                drawdowns.loc[
                    0,
                    'peak date'])) if expected_peak is None \
            else self.assertEqual(drawdowns.loc[0, 'peak date'],
                                  expected_peak)
        self.assertTrue(
            pd.isnull(
                drawdowns.loc[0, 'valley date'])) \
            if expected_valley is None else self.assertEqual(
                drawdowns.loc[0, 'valley date'],
                expected_valley)
        self.assertTrue(
            pd.isnull(
                drawdowns.loc[0, 'recovery date'])) \
            if expected_recovery is None else self.assertEqual(
                drawdowns.loc[0, 'recovery date'],
                expected_recovery)
        self.assertTrue(
            pd.isnull(drawdowns.loc[0, 'duration'])) \
            if expected_duration is None else self.assertEqual(
                drawdowns.loc[0, 'duration'], expected_duration)
Example #4
def getTrackCountryOfOrigin(billboard_df_final):
    geolocator = Nominatim()
    track_state_of_origin = []
    track_country_of_origin = []
    for index_artist, row in billboard_df_final.iterrows():
        if (not pd.isnull(row['latitude'])) & (not pd.isnull(row['longitude'])):
            try:
                location = geolocator.reverse(str(row['latitude']) +',' + str(row['longitude']), language='en')
                state = location.raw['address']['state']
                if state == "Puerto Rico":
                    country = "Puerto Rico"
                else:
                    country = location.raw['address']['country']
                    if country == "The Netherlands":
                        country = "Netherlands"
            except:
                print(row["Artist(s)"])
                country = "" 
                state = ""
        else:
            country = ""
            state = ""
        
        track_country_of_origin.append(country)
        if country == "United States of America":
            track_state_of_origin.append(state)
        else:
            track_state_of_origin.append("")

    return [track_country_of_origin, track_state_of_origin]
Example #5
def wmd(d1,d2):
    if pd.isnull(d1) or pd.isnull(d2):
        return 1
    d1 = emd_standardize(d1)
    d2 = emd_standardize(d2)
    vect = CountVectorizer(stop_words="english").fit([d1, d2])
    names = vect.get_feature_names()
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    W_ = []
    for i in range(0,len(names)):
        try:
            W_.append(model[names[i]])
        except KeyError:
            W_.append(np.zeros(300))
    D_ = cosine_distances(W_)
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 = v_1 +1
    v_2 = v_2 +1
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    D_ = D_.astype(np.double)
    D_ = D_ +1
    D_ /= D_.max()
    wmd = emd(v_1, v_2, D_)
    return wmd
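# wmd() above relies on names defined elsewhere in its module: `model` (a word-vector
# lookup such as gensim KeyedVectors), `emd` (earth mover's distance, e.g. the pyemd
# package), `emd_standardize` (a text-cleaning helper), plus sklearn's CountVectorizer
# and cosine_distances. A hedged sketch of that assumed setup:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from pyemd import emd                      # assumed EMD implementation
# from gensim.models import KeyedVectors   # assumed source of 300-d word vectors
# model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin",
#                                            binary=True)

def emd_standardize(text):
    # assumed behaviour: lower-case and keep only letters and spaces
    return "".join(ch if ch.isalpha() or ch.isspace() else " " for ch in text).lower()

# with a loaded model:  wmd("a small brown dog", "a little brown puppy")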
Example #6
    def test_replace2(self):
        N = 100
        ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N),
                        dtype=object)
        ser[:5] = np.nan
        ser[6:10] = 'foo'
        ser[20:30] = 'bar'

        # replace list with a single value
        rs = ser.replace([np.nan, 'foo', 'bar'], -1)

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -1).all()
        assert (rs[20:30] == -1).all()
        assert (pd.isnull(ser[:5])).all()

        # replace with different values
        rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3})

        assert (rs[:5] == -1).all()
        assert (rs[6:10] == -2).all()
        assert (rs[20:30] == -3).all()
        assert (pd.isnull(ser[:5])).all()

        # replace with different values with 2 lists
        rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3])
        tm.assert_series_equal(rs, rs2)

        # replace inplace
        ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True)
        assert (ser[:5] == -1).all()
        assert (ser[6:10] == -1).all()
        assert (ser[20:30] == -1).all()
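# The replace() variants the test above exercises, shown standalone on made-up data
# (a sketch, not part of the original test):
import numpy as np
import pandas as pd

s = pd.Series([np.nan, 'foo', 'bar', 1.0], dtype=object)
print(s.replace([np.nan, 'foo', 'bar'], -1).tolist())            # [-1, -1, -1, 1.0]
print(s.replace({np.nan: -1, 'foo': -2, 'bar': -3}).tolist())    # [-1, -2, -3, 1.0]
print(s.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]).tolist())  # [-1, -2, -3, 1.0]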
Example #7
def test_conversions(data_missing):

    # astype to object series
    df = pd.DataFrame({'A': data_missing})
    result = df['A'].astype('object')
    expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A')
    tm.assert_series_equal(result, expected)

    # convert to object ndarray
    # we assert that we are exactly equal
    # including type conversions of scalars
    result = df['A'].astype('object').values
    expected = np.array([np.nan, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for r, e in zip(result, expected):
        if pd.isnull(r):
            assert pd.isnull(e)
        elif is_integer(r):
            # PY2 can be int or long
            assert r == e
            assert is_integer(e)
        else:
            assert r == e
            assert type(r) == type(e)
Example #8
def assert_timestamp_and_datetime_equal(result,
                                        expected,
                                        path=(),
                                        msg='',
                                        allow_datetime_coercions=False,
                                        compare_nat_equal=True,
                                        **kwargs):
    """
    Branch for comparing python datetime (which includes pandas Timestamp) and
    np.datetime64 as equal.

    Raises unless ``allow_datetime_coercions`` is passed as True.
    """
    assert allow_datetime_coercions or type(result) == type(expected), (
        "%sdatetime types (%s, %s) don't match and "
        "allow_datetime_coercions was not set.\n%s" % (
            _fmt_msg(msg),
            type(result),
            type(expected),
            _fmt_path(path),
        )
    )

    result = pd.Timestamp(result)
    expected = pd.Timestamp(expected)
    if compare_nat_equal and pd.isnull(result) and pd.isnull(expected):
        return

    assert_equal.dispatch(object, object)(
        result,
        expected,
        path=path,
        **kwargs
    )
Example #9
    def test_constructor(self):

        # explicit construction
        index = Float64Index([1, 2, 3, 4, 5])
        self.assertIsInstance(index, Float64Index)
        expected = np.array([1, 2, 3, 4, 5], dtype='float64')
        self.assert_numpy_array_equal(index.values, expected)
        index = Float64Index(np.array([1, 2, 3, 4, 5]))
        self.assertIsInstance(index, Float64Index)
        index = Float64Index([1., 2, 3, 4, 5])
        self.assertIsInstance(index, Float64Index)
        index = Float64Index(np.array([1., 2, 3, 4, 5]))
        self.assertIsInstance(index, Float64Index)
        self.assertEqual(index.dtype, float)

        index = Float64Index(np.array([1., 2, 3, 4, 5]), dtype=np.float32)
        self.assertIsInstance(index, Float64Index)
        self.assertEqual(index.dtype, np.float64)

        index = Float64Index(np.array([1, 2, 3, 4, 5]), dtype=np.float32)
        self.assertIsInstance(index, Float64Index)
        self.assertEqual(index.dtype, np.float64)

        # nan handling
        result = Float64Index([np.nan, np.nan])
        self.assertTrue(pd.isnull(result.values).all())
        result = Float64Index(np.array([np.nan]))
        self.assertTrue(pd.isnull(result.values).all())
        result = Index(np.array([np.nan]))
        self.assertTrue(pd.isnull(result.values).all())
Example #10
def load_data():
    # Read file content
    training_file_content = pd.read_csv(TRAINING_FILE_PATH)
    testing_file_content = pd.read_csv(TESTING_FILE_PATH)
    combined_file_content = pd.concat([training_file_content, testing_file_content])

    # Manipulate file content
    X = combined_file_content.drop([ID_COLUMN_NAME, LABEL_COLUMN_NAME], axis=1).as_matrix()
    categorical_features_mask_list = []
    for column_vector in X.T:
        valid_elements_mask = np.logical_not(pd.isnull(column_vector))
        if np.can_cast(type(column_vector[valid_elements_mask][0]), np.float):
            categorical_features_mask_list.append(False)
            min_value = np.min(column_vector[valid_elements_mask])
            column_vector[np.logical_not(valid_elements_mask)] = min_value - 1
        else:
            categorical_features_mask_list.append(True)
            column_vector[np.logical_not(valid_elements_mask)] = "Missing"
            column_vector[:] = perform_categorization(column_vector)
    encoder = OneHotEncoder(categorical_features=categorical_features_mask_list)
    X = encoder.fit_transform(X).toarray()

    # Separate the data set
    Y = combined_file_content[LABEL_COLUMN_NAME].as_matrix()
    ID = combined_file_content[ID_COLUMN_NAME].as_matrix()
    test_data_mask = pd.isnull(Y)
    X_train = X[np.logical_not(test_data_mask)]
    Y_train = Y[np.logical_not(test_data_mask)]
    X_test = X[test_data_mask]
    ID_test = ID[test_data_mask]

    return X_train, Y_train, X_test, ID_test
Example #11
        def _check_fill(meth, op, a, b, fill_value=0):
            exp_index = a.index.union(b.index)
            a = a.reindex(exp_index)
            b = b.reindex(exp_index)

            amask = isnull(a)
            bmask = isnull(b)

            exp_values = []
            for i in range(len(exp_index)):
                if amask[i]:
                    if bmask[i]:
                        exp_values.append(nan)
                        continue
                    exp_values.append(op(fill_value, b[i]))
                elif bmask[i]:
                    if amask[i]:
                        exp_values.append(nan)
                        continue
                    exp_values.append(op(a[i], fill_value))
                else:
                    exp_values.append(op(a[i], b[i]))

            result = meth(a, b, fill_value=fill_value)
            expected = Series(exp_values, exp_index)
            assert_series_equal(result, expected)
Example #12
 def _write_data_dates(self):
     convert_dates = self._convert_dates
     data = self.datarows
     byteorder = self._byteorder
     TYPE_MAP = self.TYPE_MAP
     MISSING_VALUES = self.MISSING_VALUES
     typlist = self.typlist
     for row in data:
         #row = row.squeeze().tolist() # needed for structured arrays
         for i, var in enumerate(row):
             typ = ord(typlist[i])
             #NOTE: If anyone finds this terribly slow, there is
             # a vectorized way to convert dates, see genfromdta for going
             # from int to datetime and reverse it. will copy data though
             if i in convert_dates:
                 var = _datetime_to_stata_elapsed(var, self.fmtlist[i])
             if typ <= 244:  # we've got a string
                 if isnull(var):
                     var = ""  # missing string
                 if len(var) < typ:
                     var = _pad_bytes(var, len(var) + 1)
                 self._write(var)
             else:
                 if isnull(var):  # this only matters for floats
                     var = MISSING_VALUES[typ]
                 self._write(struct.pack(byteorder+TYPE_MAP[typ], var))
Example #13
    def compute(self, df, chunk_rows=None):
        assert self.columns
        for column in self.columns:
            if column not in df.columns:
                df[column] = numpy.nan
        rows_to_annotate = pandas.isnull(df[self.columns[0]])
        for column in self.columns[1:]:
            rows_to_annotate = rows_to_annotate | pandas.isnull(df[column])

        while rows_to_annotate.sum() > 0:
            if chunk_rows:
                this_chunk_rows = rows_to_annotate & (
                    rows_to_annotate.cumsum() <= chunk_rows)
            else:
                this_chunk_rows = rows_to_annotate

            num_remaining = rows_to_annotate.sum()
            logging.info("%s: %d / %d (%0.1f%%) remaining. Processing %d rows."
                % (
                    self.name,
                    num_remaining,
                    len(rows_to_annotate),
                    num_remaining * 100.0 / len(rows_to_annotate),
                    this_chunk_rows.sum()))

            rows_to_annotate = rows_to_annotate & (~ this_chunk_rows)
            
            if this_chunk_rows.sum() > 0:
                start = time.time()
                df.ix[this_chunk_rows, self.columns] = self.process_chunk(
                    df.ix[this_chunk_rows].copy())[self.columns]
                logging.info("Processed in %f0.2 sec" % (time.time() - start))
            yield this_chunk_rows.sum()
Example #14
    def test_multiple_children_both_missing(self, entityset, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        # test all instances in neither child
        sessions = entityset['sessions']

        # add row to sessions to create session with no events
        sessions.update_data(extra_session_df)

        entityset.entity_from_dataframe(entity_id="wishlist_log",
                                        dataframe=wishlist_df,
                                        index='id',
                                        make_index=True,
                                        time_index='datetime')
        relationship = Relationship(entityset['sessions']['id'],
                                    entityset['wishlist_log']['session_id'])
        entityset.add_relationship(relationship)
        entityset.add_last_time_indexes()
        sessions = entityset['sessions']

        # wishlist has 2 newer events and one is NaT
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
        true_sessions_lti[6] = pd.NaT

        assert len(sessions.last_time_index) == 7
        sorted_lti = sessions.last_time_index.sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
Example #15
 def test_leaf_no_time_index(self, entityset):
     entityset.add_last_time_indexes()
     stores = entityset['stores']
     true_lti = pd.Series([None for x in range(6)], dtype='datetime64[ns]')
     assert len(true_lti) == len(stores.last_time_index)
     for v1, v2 in zip(stores.last_time_index, true_lti):
         assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
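# The null-tolerant comparison repeated in these last_time_index tests could be
# factored into a small helper; a sketch (not featuretools code):
import pandas as pd

def nat_equal(v1, v2):
    """True when both values are missing (NaT/NaN) or when they compare equal."""
    return (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2

# nat_equal(pd.NaT, pd.NaT) -> True; nat_equal(pd.NaT, pd.Timestamp("2011-04-09")) -> False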
Example #16
    def test_multiple_children_left_missing(self, entityset, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        # test all instances in right child
        sessions = entityset['sessions']

        # add row to sessions so not all session instances are in log
        sessions.update_data(extra_session_df)

        # add row to wishlist df so new session instance in in wishlist_log
        row_values = {'session_id': 6,
                      'datetime': pd.Timestamp("2011-04-11 11:11:11"),
                      'product_id': 'toothpaste'}
        row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
        df = wishlist_df.append(row)
        entityset.entity_from_dataframe(entity_id="wishlist_log",
                                        dataframe=df,
                                        index='id',
                                        make_index=True,
                                        time_index='datetime')
        relationship = Relationship(entityset['sessions']['id'],
                                    entityset['wishlist_log']['session_id'])
        entityset.add_relationship(relationship)
        entityset.add_last_time_indexes()

        # now wishlist_log has newer events for 3 session ids
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
        true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

        assert len(sessions.last_time_index) == 7
        sorted_lti = sessions.last_time_index.sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
Example #17
    def test_multiple_children_all_combined(self, entityset, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        # test some instances in right, some in left, all when combined
        sessions = entityset['sessions']

        # add row to sessions so not all session instances are in log
        sessions.update_data(extra_session_df)

        # add row to wishlist_log so extra session has child instance
        row_values = {'session_id': 6,
                      'datetime': pd.Timestamp("2011-04-11 11:11:11"),
                      'product_id': 'toothpaste'}
        row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
        df = wishlist_df.append(row)

        # drop instance 4 so wishlist_log does not have session id 3 instance
        df.drop(4, inplace=True)
        entityset.entity_from_dataframe(entity_id="wishlist_log",
                                        dataframe=df,
                                        index='id',
                                        make_index=True,
                                        time_index='datetime')
        relationship = Relationship(entityset['sessions']['id'],
                                    entityset['wishlist_log']['session_id'])
        entityset.add_relationship(relationship)
        entityset.add_last_time_indexes()

        # wishlist has newer events for 2 sessions
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

        assert len(sessions.last_time_index) == 7
        sorted_lti = sessions.last_time_index.sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
Example #18
    def filter_pair(self, lstring, rstring):
        """Checks if the input strings get dropped by the overlap filter.

        Args:
            lstring,rstring (string): input strings

        Returns:
            A flag indicating whether the string pair is dropped (boolean).
        """

        # If one of the inputs is missing, then check the allow_missing flag.
        # If it is set to True, then pass the pair. Else drop the pair.
        if pd.isnull(lstring) or pd.isnull(rstring):
            return (not self.allow_missing)

        # check for empty string
        if (not lstring) or (not rstring):
            return True

        # tokenize input strings 
        ltokens = self.tokenizer.tokenize(lstring)
        rtokens = self.tokenizer.tokenize(rstring)
 
        num_overlap = overlap(ltokens, rtokens) 

        if COMP_OP_MAP[self.comp_op](num_overlap, self.overlap_size):
            return False
        else:
            return True
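# filter_pair() above depends on pieces defined elsewhere in its package:
# self.tokenizer (e.g. a whitespace tokenizer), overlap() (count of shared tokens),
# and COMP_OP_MAP (comparison-operator string -> predicate). A minimal sketch of
# what those assumed pieces might look like (not the library's actual code):
import operator

COMP_OP_MAP = {'>': operator.gt, '>=': operator.ge, '=': operator.eq,
               '<': operator.lt, '<=': operator.le}

def overlap(ltokens, rtokens):
    # number of distinct tokens the two token lists have in common
    return len(set(ltokens) & set(rtokens))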
Example #19
def zeitsci_grant_normalize_wrapper(x):
    from_year = np.NaN
    if pd.isnull(x['GrantYear']):
        if pd.isnull(x['StartDate']):
            return np.NaN
        else:
            if len([i for i in x['StartDate'].split("/") if len(i) == 4]) != 1:
                return np.NaN
            else:
                from_year = [i for i in x['StartDate'].split("/") if len(i) == 4][0]
    else:
        from_year = x['GrantYear']

    input_dict = {
        'amount': x['Amount'],
        'block': x['OrganizationBlock'],
        'amount_cur': x['FundCurrency'],
        'from_year': int(from_year)
    }
    if any(pd.isnull(i) for i in input_dict.values()):
        return np.NaN

    return zeitsci_normalize(input_dict['amount']
                             , input_dict['block']
                             , input_dict['amount_cur']
                             , input_dict['from_year'])
Example #20
def __load_dataset__(log, enroll_ids, base_date):
    """get all instances in this time window"""
    X = IO.load_enrollments().set_index('enrollment_id')\
        .ix[enroll_ids].reset_index()
    for f in features.METHODS:
        X_ = f.extract(base_date)
        if X_ is None:
            print('%s returns None' % repr(f.__name__))
            continue
        if np.any(pd.isnull(X_)):
            raise RuntimeError('%s can generate NA(s)' % repr(f.__name__))

        X = pd.merge(X, X_, how='left', on='enrollment_id')
        if np.any(pd.isnull(X)):
            raise RuntimeError('%s does not generate features of all '
                               'enrollments' % repr(f.__name__))

    active_eids = set(log[(log['time'] > base_date) &
                          (log['time'] <= base_date + timedelta(days=10))]
                         ['enrollment_id'])
    y = [int(eid not in active_eids) for eid in enroll_ids]

    del X['enrollment_id']
    del X['username']
    del X['course_id']
    return X.as_matrix(), np.array(y, dtype=np.int)
Example #21
def omega_ratio(dataframe, MAR):
    '''Calculate the Omega ratio of each index in the dataframe

    Args:
        dataframe is the monthly-return dataframe passed by the concat_data() function;
            each column is an index to be evaluated
        MAR is the minimum acceptable return, used for calculating the excess return

    Returns:
        This method returns the Omega ratio dataframe for each index across different
        year lengths (12, 36, 60, 84, 120 and 180 months) plus a since-inception column
    '''
    year_list = [12,36,60,84,120,180]
    Omega_df = pd.DataFrame(index = dataframe.columns)
    # Force all nan in dataframe to be np.nan
    dataframe = dataframe.fillna(np.nan)
    # Calculation
    for i in year_list:
        for j in dataframe.columns:
            # Since np.nan+np.array cannot exclude the NaN scenario (due to the >MAR condition), we need to manually check for NaN
            if np.prod(~pd.isnull(dataframe[j].iloc[-i:]))==0: 
                Omega_df.loc[j,'%d_Months' % i] = np.nan
            elif np.prod(~pd.isnull(dataframe[j].iloc[-i:]))!=0:
                Omega_df.loc[j,'%d_Months' % i] = np.sum(dataframe[j].iloc[-i:][dataframe[j].iloc[-i:]>MAR]-MAR**(1/12))\
                                            /-np.sum(dataframe[j].iloc[-i:][dataframe[j].iloc[-i:]<MAR]-MAR**(1/12))
    for j in dataframe.columns:
        Inception = int(np.count_nonzero(~np.isnan(dataframe[j])))
        Omega_df.loc[j,'Since Inception'] = np.sum(dataframe[j].iloc[-Inception:][dataframe[j].iloc[-Inception:]>MAR]-MAR**(1/12),axis=0)\
                                            /-np.sum(dataframe[j].iloc[-Inception:][dataframe[j].iloc[-Inception:]<MAR]-MAR**(1/12),axis=0)
    return Omega_df
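# A minimal usage sketch for omega_ratio() on synthetic monthly returns (the column
# name, the random data, and MAR=0.0 are assumptions made up for illustration):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
monthly_returns = pd.DataFrame(
    {'Index_A': rng.normal(0.005, 0.04, 240)},
    index=pd.period_range('2000-01', periods=240, freq='M'))
print(omega_ratio(monthly_returns, MAR=0.0))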
Example #22
    def test_minmax(self):

        # monotonic
        idx1 = TimedeltaIndex(['1 days', '2 days', '3 days'])
        self.assertTrue(idx1.is_monotonic)

        # non-monotonic
        idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT'])
        self.assertFalse(idx2.is_monotonic)

        for idx in [idx1, idx2]:
            self.assertEqual(idx.min(), Timedelta('1 days')),
            self.assertEqual(idx.max(), Timedelta('3 days')),
            self.assertEqual(idx.argmin(), 0)
            self.assertEqual(idx.argmax(), 2)

        for op in ['min', 'max']:
            # Return NaT
            obj = TimedeltaIndex([])
            self.assertTrue(pd.isnull(getattr(obj, op)()))

            obj = TimedeltaIndex([pd.NaT])
            self.assertTrue(pd.isnull(getattr(obj, op)()))

            obj = TimedeltaIndex([pd.NaT, pd.NaT, pd.NaT])
            self.assertTrue(pd.isnull(getattr(obj, op)()))
Example #23
def cosine(arr1, arr2):
    if arr1 is None or arr2 is None:
        return np.NaN
    if pd.isnull(arr1) or pd.isnull(arr2):
        return np.NaN
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.cosine(arr1, arr2)
Example #24
def normalized_price(price_df):
    """
    Return the normalized price of a series

    :ARGS:

        price_df: :class:`pandas.Series` or :class:`pandas.DataFrame`

    :RETURNS:
        
        same as the input
    """
    if isinstance(price_df, pandas.Series):

        if pandas.isnull(price_df).any():
            print "This series contains null values"
            return
        else:
            return price_df.div(price_df[0])
    
    elif isinstance(price_df, pandas.DataFrame):
        if pandas.isnull(price_df).any().any():
            print "This DataFrame contains null values"
            return
        else:
            return price_df.div(price_df.iloc[0, :] )
    else:
        print "Input must be pandas.Series or pandas.DataFrame"
        return
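# A quick usage sketch with made-up prices (assumes the function above is in scope):
import pandas
prices = pandas.Series([100.0, 101.5, 99.0])
print(normalized_price(prices))   # 1.000, 1.015, 0.990 -- each value divided by the first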
Example #25
    def get_address_line(self, index, address1, city, state, zip_code):
        # required: print message for exception

        if not pd.isnull(address1):
            #address = street
            address1 = str(re.sub(r'[^\x00-\x7f]', r' ', address1.strip()))
            #address1 = ' '.join(address1.split())
            print(1, address1)
            #print type(address)
        else:
            raise ValueError('Missing street value at row ' + str(index) + '.')
            #address = ''

        if not pd.isnull(city):
            city_name = str(city)
        else:
            raise ValueError('Missing city value at row ' + str(index) + '.')
            #city_name =''

        if not pd.isnull(zip_code):
            zip = str(zip_code)
            #print zip
            #print type(zip)

        else:
            raise ValueError('Missing zip code value at row ' + str(index) + '.')

        #print address
        #print type(address)

        final_line = address1 + ", " + city_name + ', ' + config.state_abbreviation_upper + ' ' + zip
        final_line = ' '.join(final_line.split())
        #print index, final_line
        return final_line
Example #26
    def _get_daily_spot_value(self, asset, column, dt):
        reader = self._get_pricing_reader('daily')
        if column == "last_traded":
            last_traded_dt = reader.get_last_traded_dt(asset, dt)

            if isnull(last_traded_dt):
                return pd.NaT
            else:
                return last_traded_dt
        elif column in OHLCV_FIELDS:
            # don't forward fill
            try:
                return reader.get_value(asset, dt, column)
            except NoDataOnDate:
                return np.nan
        elif column == "price":
            found_dt = dt
            while True:
                try:
                    value = reader.get_value(
                        asset, found_dt, "close"
                    )
                    if not isnull(value):
                        if dt == found_dt:
                            return value
                        else:
                            # adjust if needed
                            return self.get_adjusted_value(
                                asset, column, found_dt, dt, "minute",
                                spot_value=value
                            )
                    else:
                        found_dt -= self.trading_calendar.day
                except NoDataOnDate:
                    return np.nan
Example #27
 def oncall(self, controller, currentevlog, currentresplog):
     # important to super here to reset starttime
     super(NBackEvent, self).oncall(controller, currentevlog, currentresplog)
     if self.nshift == 0:
         currentname = self.name
     else:
         try:
             currentname = currentevlog.iloc[self.nshift]['name']
         except IndexError:
             currentname = numpy.nan
         except:
             raise
     try:
         previousname = currentevlog.iloc[self.nshift-self.nback]['name']
     except IndexError:
         previousname = numpy.nan
     except:
         raise
     self.wasrep = 0.
     if pandas.isnull(currentname) or pandas.isnull(previousname):
         self.wasrep = numpy.nan
     elif currentname == previousname:
         self.wasrep = 1.
     if self.verbose:
         print('current=%s\t last=%s\twasrep=%s'
               % (currentname, previousname, self.wasrep))
     # so now we just reassign the keys
     if self.wasrep:
         self.correct = self.repkey[:]
         self.incorrect = self.notrepkey[:]
     else:
         self.correct = self.notrepkey[:]
         self.incorrect = self.repkey[:]
     return
Example #28
    def test_minmax(self):
        for tz in self.tz:
            # monotonic
            idx1 = pd.DatetimeIndex([pd.NaT, '2011-01-01', '2011-01-02',
                                     '2011-01-03'], tz=tz)
            self.assertTrue(idx1.is_monotonic)

            # non-monotonic
            idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03',
                                     '2011-01-02', pd.NaT], tz=tz)
            self.assertFalse(idx2.is_monotonic)

            for idx in [idx1, idx2]:
                self.assertEqual(idx.min(), pd.Timestamp('2011-01-01', tz=tz))
                self.assertEqual(idx.max(), pd.Timestamp('2011-01-03', tz=tz))

        for op in ['min', 'max']:
            # Return NaT
            obj = DatetimeIndex([])
            self.assertTrue(pd.isnull(getattr(obj, op)()))

            obj = DatetimeIndex([pd.NaT])
            self.assertTrue(pd.isnull(getattr(obj, op)()))

            obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT])
            self.assertTrue(pd.isnull(getattr(obj, op)()))
Example #29
def read_rdata(rdata_fullpath, table_name):
    """
    Returns the pandas DataFrame
    """
    from rpy2.robjects import pandas2ri, r
    pandas2ri.activate()

    # we want forward slashes for R
    rdata_fullpath_forR = rdata_fullpath.replace("\\", "/")
    print "Loading %s" % rdata_fullpath_forR
    
    # read in the data from the R session with python
    r['load'](rdata_fullpath_forR)
    # check that it's there
    table_df = pandas2ri.ri2py(r['model_summary'])

    # fillna
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0: print "  Found %5d NA values in column %s" % (nullcount, col)
    table_df = table_df.fillna(0)
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0: print "  -> Found %5d NA values in column %s" % (nullcount, col)
    
    print "Read %d lines from %s" % (len(table_df), rdata_fullpath)
    return table_df
Example #30
def metacsv_dataframe_to_dataarray(dataframe, names=None, attrs=None):

    global xr
    if xr is None:
        _import_xarray()

    dataframe = dataframe.copy()

    if attrs is None:
        attrs = dataframe.attrs

    coords = dataframe.coords.copy()

    dataframe.index.names = [
        str(ind) if not pd.isnull(ind) else 'ind_{}'.format(i)
            for i, ind in enumerate(dataframe.index.names)]

    if dataframe.coords == None:
        coords.update({c: None for c in dataframe.index.names})

    dataframe.columns.names = [
        str(c) if not pd.isnull(c) else 'coldim_{}'.format(i)
            for i, c in enumerate(dataframe.columns.names)]

    colnames = dataframe.columns.names
    series = dataframe._constructor_sliced(dataframe.stack(colnames))
    coords.update({c: None for c in colnames})

    series.coords.update(coords)
    return metacsv_series_to_dataarray(series, attrs=attrs)
Example #31
dev_prev_365 = stock_data['Close'].rolling(window=365, min_periods=2).std()
stock_data['Ratio_past5_365'] = stock_data['Avg_day_5'] / dev_prev_365

# Shifting the indexes of the dataframe by 1 period
stock_data.shift()
print(stock_data.head(10))
print(stock_data.tail(10))

# Removing rows with dates on or before 1951-01-02
stock_data = stock_data[
    stock_data['Date'] > datetime(year=1951, month=1, day=2)]
print(stock_data.head(5))

# Removing NAN values from the dataset
stock_data = stock_data.dropna(axis=0)
print(pd.isnull(stock_data).sum())

# Splitting the dataset into train and test data
train = stock_data[stock_data['Date'] < datetime(year=2013, month=1, day=1)]
test = stock_data[stock_data['Date'] > datetime(year=2013, month=1, day=1)]
print(train.tail(3))
print(test.head(3))

# Training a linear regression model to predict the Close stock prices
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

linear_model = LinearRegression()
model_fit = linear_model.fit(
    train[['Avg_day_5', 'Sdev_day_5', 'Ratio_past5_365']], train['Close'])
predict_train = model_fit.predict(
    train[['Avg_day_5', 'Sdev_day_5', 'Ratio_past5_365']])
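# The excerpt breaks off here; a hedged sketch of how such a script typically
# continues, reusing the mean_absolute_error import above (the test prediction
# and the printed errors are assumptions, not the original author's code):
predict_test = model_fit.predict(
    test[['Avg_day_5', 'Sdev_day_5', 'Ratio_past5_365']])
print(mean_absolute_error(train['Close'], predict_train))
print(mean_absolute_error(test['Close'], predict_test))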
Example #32
def main(xlsx_path, out_path, action, viral_submission=False):

    # PARSE STUDIES
    #################
    xl_sheet = read_ena_xlsx_sheet(xlsx_path, sheet_name="ENA_study")
    if xl_sheet.shape[0] < 1:
        raise ValueError("No entries found in studies sheet")
    studies_col = ["alias", "title", "study_type", "study_abstract"]
    try:
        studies_dict = extract_data(xl_sheet, studies_col)
    except AssertionError as e:
        print("Sheet ENA_study: ", e)
        raise

    # PARSE SAMPLES
    #################
    xl_sheet = read_ena_xlsx_sheet(xlsx_path, sheet_name="ENA_sample")
    if xl_sheet.shape[0] < 1:
        raise ValueError("No entries found in samples")
    if viral_submission:
        samples_cols = [
            "alias",
            "title",
            "scientific_name",
            "sample_description",
            "geographic location (country and/or sea)",
            "host common name",
            "host health state",
            "host sex",
            "host scientific name",
            "collector name",
            "collection date",
            "collecting institution",
            "isolate",
        ]
    else:
        samples_cols = ["alias", "title", "scientific_name", "sample_description"]
    try:
        samples_dict = extract_data(xl_sheet, samples_cols)
    except AssertionError as e:
        print("Sheet ENA_sample: ", e)
        raise

    # PARSE EXPERIMENTS
    #################
    xl_sheet = read_ena_xlsx_sheet(xlsx_path, sheet_name="ENA_experiment")
    if xl_sheet.shape[0] < 1:
        raise ValueError("No experiments found in experiments sheet")
    exp_columns = [
        "alias",
        "title",
        "study_alias",
        "sample_alias",
        "design_description",
        "library_name",
        "library_strategy",
        "library_source",
        "library_selection",
        "library_layout",
        "insert_size",
        "library_construction_protocol",
        "platform",
        "instrument_model",
    ]
    try:
        experiments_dict = extract_data(xl_sheet, exp_columns)
    except AssertionError as e:
        print("Sheet ENA_experiment: ", e)
        raise

    # PARSE RUNS SHEET
    #################
    xl_sheet = read_ena_xlsx_sheet(xlsx_path, sheet_name="ENA_run")
    if xl_sheet.shape[0] < 1:
        raise ValueError("No entries found in runs sheet")
    run_cols = ["alias", "experiment_alias", "file_name", "file_format"]
    try:
        runs_dict = extract_data(xl_sheet, run_cols, unique_key="file_name")
    except AssertionError as e:
        print("Sheet ENA_run: ", e)
        raise

    # DROP COMMENTS
    ###############
    studies_dict = {
        k: v
        for k, v in studies_dict.items()
        if k in set([v["study_alias"] for k, v in experiments_dict.items()])
    }
    assert bool(studies_dict), "No entries found in studies"
    experiments_dict = {
        k: v
        for k, v in experiments_dict.items()
        if v["study_alias"] in studies_dict.keys()
    }
    assert bool(experiments_dict), "No entries found in experiments"
    samples_dict = {
        k: v
        for k, v in samples_dict.items()
        if k in set([v["sample_alias"] for k, v in experiments_dict.items()])
    }
    assert bool(samples_dict), "No entries found in samples"
    runs_dict = {
        k: v
        for k, v in runs_dict.items()
        if v["experiment_alias"] in experiments_dict.keys()
    }
    assert bool(runs_dict), "No entries found in runs"

    # WRITE HEADERS TO TABLES
    studies_table = open(pathlib.Path(out_path) / "studies.tsv", "w")
    studies_table.write(
        "\t".join(
            [
                "alias",
                "status",
                "accession",
                "title",
                "study_type",
                "study_abstract",
                "pubmed_id",
                "submission_date",
            ]
        )
        + "\n"
    )
    samples_table = open(pathlib.Path(out_path) / "samples.tsv", "w")
    if viral_submission:
        samples_table.write(
            "\t".join(
                [
                    "alias",
                    "status",
                    "accession",
                    "title",
                    "scientific_name",
                    "taxon_id",
                    "sample_description",
                    "collection_date",
                    "geographic_location",
                    "host_common_name",
                    "host_subject_id",
                    "host_health_state",
                    "host_sex",
                    "host_scientific_name",
                    "collector_name",
                    "collecting_institution",
                    "isolate",
                    "submission_date",
                ]
            )
            + "\n"
        )
    else:
        samples_table.write(
            "\t".join(
                [
                    "alias",
                    "status",
                    "accession",
                    "title",
                    "scientific_name",
                    "taxon_id",
                    "sample_description",
                    "submission_date",
                ]
            )
            + "\n"
        )

    experiments_table = open(pathlib.Path(out_path) / "experiments.tsv", "w")
    experiments_table.write(
        "\t".join(
            [
                "alias",
                "status",
                "accession",
                "title",
                "study_alias",
                "sample_alias",
                "design_description",
                "library_name",
                "library_strategy",
                "library_source",
                "library_selection",
                "library_layout",
                "insert_size",
                "library_construction_protocol",
                "platform",
                "instrument_model",
                "submission_date",
            ]
        )
        + "\n"
    )

    runs_table = open(pathlib.Path(out_path) / "runs.tsv", "w")
    runs_table.write(
        "\t".join(
            [
                "alias",
                "status",
                "accession",
                "experiment_alias",
                "file_name",
                "file_format",
                "file_checksum",
                "submission_date",
            ]
        )
        + "\n"
    )
    action = action

    # WRITE  DICTIONARIES TO TABLE FILES

    # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
    # dt_oobj = datetime.now(tz=None)
    # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
    for study_alias, study in studies_dict.items():
        # study_alias = study_alias + '_' + timestamp
        studies_table.write(
            "\t".join(
                [
                    study_alias,
                    action,
                    "ENA_accession",
                    study["title"],
                    study["study_type"],
                    study["study_abstract"],
                    "",
                    "ENA_submission_data",
                ]
            )
            + "\n"
        )  # assuming no pubmed_id
    for sample_alias, sample in samples_dict.items():
        # sample_alias = sample_alias + '_' + timestamp
        if viral_submission:
            if sample["collector name"] == "":
                sample["collector name"] = "unknown"
            samples_table.write(
                "\t".join(
                    [
                        sample_alias,
                        action,
                        "ena_accession",
                        sample["title"],
                        sample["scientific_name"],
                        "tax_id_updated_by_ENA",
                        sample["sample_description"],
                        sample["collection date"],
                        sample["geographic location (country and/or sea)"],
                        sample["host common name"],
                        "host subject id",
                        sample["host health state"],
                        sample["host sex"],
                        sample["host scientific name"],
                        sample["collector name"],
                        sample["collecting institution"],
                        sample["isolate"],
                        "ENA_submission_date",
                    ]
                )
                + "\n"
            )
        else:
            samples_table.write(
                "\t".join(
                    [
                        sample_alias,
                        action,
                        "ena_accession",
                        sample["title"],
                        sample["scientific_name"],
                        "tax_id_updated_by_ENA",
                        sample["sample_description"],
                    ]
                )
                + "\n"
            )
        for exp_alias, exp in experiments_dict.items():
            # should I check here if any experiment has a study or sample alias that is incorrect?
            # (not listed in the samples or study dict)
            # process the experiments for this sample
            if exp["sample_alias"] == sample_alias:
                if pd.isnull(exp["library_name"]):
                    if exp["sample_alias"] in exp_alias:
                        lib_alias = exp_alias
                    else:
                        lib_alias = exp_alias + "_" + exp["sample_alias"]
                else:
                    lib_alias = exp["library_name"]
                experiments_table.write(
                    "\t".join(
                        [
                            exp_alias,
                            action,
                            "ena_accession",
                            exp["title"],
                            exp["study_alias"],
                            sample_alias,
                            exp["design_description"],
                            lib_alias,
                            exp["library_strategy"],
                            exp["library_source"],
                            exp["library_selection"],
                            exp["library_layout"].lower(),
                            str(int(exp["insert_size"])),
                            exp["library_construction_protocol"],
                            exp["platform"],
                            exp["instrument_model"],
                            "submission_date_ENA",
                        ]
                    )
                    + "\n"
                )
                for file_name, run in runs_dict.items():
                    if run["experiment_alias"] == exp_alias:
                        runs_table.write(
                            "\t".join(
                                [
                                    run["alias"],
                                    action,
                                    "ena_run_accession",
                                    exp_alias,
                                    file_name,
                                    FILE_FORMAT,
                                    "file_checksum",
                                    "submission_date_ENA",
                                ]
                            )
                            + "\n"
                        )
    studies_table.close()
    samples_table.close()
    experiments_table.close()
    runs_table.close()
Example #33
# -*- coding: utf-8 -*-
""" 
@author:Administrator
@file: test_something_doubt.py
@time: 2018/8/16
"""
import pandas as pd

data = pd.read_csv('./month_6_1.csv', header=0)
data_null_len = len(data[pd.isnull(data['bedrooms'])])
print(data_null_len)

# Another way to find missing values: the condition data[col] == data[col] keeps only non-missing rows, because NaN never compares equal to itself.
data_new = data.loc[data['bedrooms'] == data['bedrooms']]

print(data_new.shape[0])
print(data.shape[0])
Example #34
def _set_pctChg(pctChg):
    if pd.isnull(pctChg):
        return None
    else:
        return pctChg
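# A quick usage sketch with hypothetical values (assumes the function above is in scope):
import numpy as np
import pandas as pd

raw = pd.Series([0.53, np.nan, -1.27])
print([_set_pctChg(v) for v in raw])   # [0.53, None, -1.27]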
Example #35
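# The snippet below assumes two DataFrames DF1 and DF2 sharing a key column 'AA';
# a minimal, made-up setup so both comparison methods can be run end to end:
import pandas as pd

DF1 = pd.DataFrame({'AA': [1, 2, 3], 'BB': ['x', 'y', 'z']})
DF2 = pd.DataFrame({'AA': [2, 3, 4], 'BB': ['y', 'z', 'w']})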
# method 1 uses the pandas isin function
not_in_DF1_method1 = DF2[~DF2['AA'].isin(DF1['AA'])]
print(not_in_DF1_method1)

not_in_DF2_method1 = DF1[~DF1['AA'].isin(DF2['AA'])]
print(not_in_DF2_method1)

# method 2 is more generic but produces the same results
DF1list = [True] * DF1.shape[0]
DF2list = [True] * DF2.shape[0]

DF1.loc[:, 'inDF1'] = DF1list
DF2.loc[:, 'inDF2'] = DF2list

bigDF = pd.merge(DF1, DF2, how="outer")

not_in_DF1_method2 = bigDF.drop('inDF2', axis=1)
not_in_DF1_method2 = not_in_DF1_method2[pd.isnull(not_in_DF1_method2).any(
    axis=1)]
not_in_DF1_method2 = not_in_DF1_method2.drop('inDF1', axis=1)

print(not_in_DF1_method2)

not_in_DF2_method2 = bigDF.drop('inDF1', axis=1)
not_in_DF2_method2 = not_in_DF2_method2[pd.isnull(not_in_DF2_method2).any(
    axis=1)]
not_in_DF2_method2 = not_in_DF2_method2.drop('inDF2', axis=1)

print(not_in_DF2_method2)
Example #36
print('movies:\n', movies.head())
print('movies columns:\n', movies.columns)

# 1. Merge the datasets
credits.rename(columns={'movie_id': 'id'}, inplace=True)
print('credit columns:\n', credits.columns)
# Merge the two on movie id and title
data = pd.merge(left=credits, right=movies, on=['id', 'title'], how='outer')

# Get the merged movie information

print('data columns:\n', data.columns)
print('data shape:\n', data.shape)

# Check for missing values
res_nul = pd.isnull(data).sum()
print('res_nul:\n', res_nul)

# Get the title of the movie with the missing release date
bool_mask_name_nul = pd.isnull(data.loc[:, 'release_date'])
# Show the missing movie's title
movies_names = data.loc[bool_mask_name_nul, 'original_title'].values[0]
print('movies_names:\n', movies_names)
# America Is Still the Place

# Fill in the missing release date
data.loc[bool_mask_name_nul, 'release_date'] = '2014-06-01'
# Get the movies with missing runtime
bool_mask_runtime = pd.isnull(data.loc[:, 'runtime'])
movie_unruntime = data.loc[bool_mask_runtime, 'original_title'].values
print('movies unnames:\n', movie_unruntime)
Example #37
def _apply_predicate(op, val, col_stats):
    # Sanitize operator
    if op not in {"=", "==", "!=", "<", "<=", ">", ">=", "in", "not in"}:
        raise ValueError(f"'{op}' is not a valid operator in predicates.")

    col_min = col_stats.get("minimum", None)
    col_max = col_stats.get("maximum", None)
    col_sum = col_stats.get("sum", None)

    # Apply operator
    if op == "=" or op == "==":
        if _apply_filter_not_eq(val, col_stats):
            return False
        # TODO: Replace pd.isnull with
        # cudf.isnull once it is implemented
        if pd.isnull(val) and not col_stats["has_null"]:
            return False
        if not _apply_filter_bool_eq(val, col_stats):
            return False
    elif op == "!=":
        if (
            col_min is not None
            and col_max is not None
            and val == col_min
            and val == col_max
        ):
            return False
        if _apply_filter_bool_eq(val, col_stats):
            return False
    elif col_min is not None and (
        (op == "<" and val <= col_min) or (op == "<=" and val < col_min)
    ):
        return False
    elif col_max is not None and (
        (op == ">" and val >= col_max) or (op == ">=" and val > col_max)
    ):
        return False
    elif (
        col_sum is not None
        and op == ">"
        and (
            (col_min is not None and col_min >= 0 and col_sum <= val)
            or (col_max is not None and col_max <= 0 and col_sum >= val)
        )
    ):
        return False
    elif (
        col_sum is not None
        and op == ">="
        and (
            (col_min is not None and col_min >= 0 and col_sum < val)
            or (col_max is not None and col_max <= 0 and col_sum > val)
        )
    ):
        return False
    elif op == "in":
        if (col_max is not None and col_max < min(val)) or (
            col_min is not None and col_min > max(val)
        ):
            return False
        if all(_apply_filter_not_eq(elem, col_stats) for elem in val):
            return False
    elif op == "not in" and col_min is not None and col_max is not None:
        if any(elem == col_min == col_max for elem in val):
            return False
        col_range = None
        if isinstance(col_min, int):
            col_range = range(col_min, col_max)
        elif isinstance(col_min, datetime.datetime):
            col_range = pd.date_range(col_min, col_max)
        if col_range and all(elem in val for elem in col_range):
            return False
    return True
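# A usage sketch for _apply_predicate() with an assumed col_stats layout (inferred
# from the keys the function reads; the _apply_filter_* helpers it calls on other
# paths are defined elsewhere in the same module):
stats = {"minimum": 0, "maximum": 10, "sum": 55, "has_null": False}
print(_apply_predicate(">", 12, stats))   # False -> no row in this stripe can match
print(_apply_predicate(">", 5, stats))    # True  -> the stripe may still contain matches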
Example #38
def assemble_initial_source_list(catalog_vnum):
    """
    Given LIST_OF_LISTS_STARTER_v0.5.csv , exported from
    /doc/list_of_cluster_member_lists.ods, clean and concatenate the cluster
    members. Flatten the resulting list on source_ids, joining the cluster,
    age, and bibcode columns into comma-separated strings.
    """

    metadf = pd.read_csv(
        os.path.join(clusterdatadir, 'LIST_OF_LISTS_STARTER_V0.6.csv')
    )
    metadf['bibcode'] = metadf.ads_link.str.extract("abs\/(.*)\/")

    N_stars_in_lists = []
    Nstars_with_age_in_lists = []
    dfs = []

    # for each table, concatenate into a dataframe of source_id, cluster,
    # log10age ("age").
    for ix, r in metadf.iterrows():

        print(79*'-')
        print(f'Beginning {r.reference_id}...')

        csvpath = os.path.join(clusterdatadir, r.csv_path)
        assert os.path.exists(csvpath)

        df = pd.read_csv(csvpath)

        df['reference_id'] = r.reference_id
        df['reference_bibcode'] = r.bibcode
        if 'HATSandHATNcandidates' in r.reference_id:
            df['reference_bibcode'] = 'JoelHartmanPrivComm'

        colnames = df.columns

        #
        # every CSV file needs a Gaia DR2 "source_id" column
        #
        if "source" in colnames:
            df = df.rename(
                columns={"source":"source_id"}
            )

        #
        # every CSV file needs a "cluster name" name column
        #
        if "assoc" in colnames:
            df = df.rename(
                columns={"assoc":"cluster"} # moving groups
            )

        colnames = df.columns

        if "cluster" not in colnames:
            msg = (
                f'WRN! for {r.reference_id} did not find "cluster" column. '+
                f'Appending the reference_id ({r.reference_id}) as the cluster ID.'
            )
            print(msg)

            df['cluster'] = r.reference_id

        #
        # every CSV file needs an "age" column, which can be null, but
        # preferably is populated.
        #
        if "age" not in colnames:

            if r.reference_id in [
                'CantatGaudin2018a', 'CantatGaudin2020a', 'CastroGinard2020',
                'GaiaCollaboration2018lt250', 'GaiaCollaboration2018gt250'
            ]:

                # get clusters and ages from CG20b; use them as the reference
                cg20bpath = os.path.join(
                    clusterdatadir,
                    "v05/CantatGaudin20b_cut_cluster_source_age.csv"
                )
                df_cg20b = pd.read_csv(cg20bpath)
                cdf_cg20b = df_cg20b.drop_duplicates(subset=['cluster','age'])[
                    ['cluster', 'age']
                ]

                # cleaning steps
                if r.reference_id == 'CastroGinard2020':
                    df['cluster'] = df.cluster.str.replace('UBC', 'UBC_')

                elif r.reference_id in [
                    'GaiaCollaboration2018lt250',
                    'GaiaCollaboration2018gt250'
                ]:
                    df['cluster'] = df.cluster.str.replace('NGC0', 'NGC_')
                    df['cluster'] = df.cluster.str.replace('NGC', 'NGC_')
                    df['cluster'] = df.cluster.str.replace('IC', 'IC_')
                    df['cluster'] = df.cluster.str.replace('Stock', 'Stock_')
                    df['cluster'] = df.cluster.str.replace('Coll', 'Collinder_')
                    df['cluster'] = df.cluster.str.replace('Trump02', 'Trumpler_2')
                    df['cluster'] = df.cluster.str.replace('Trump', 'Trumpler_')

                _df = df.merge(cdf_cg20b, how='left', on=['cluster'])
                assert len(_df) == len(df)

                df['age'] = _df['age']
                print(
                    f'For {r.reference_id} got {len(df[~pd.isnull(df.age)])}/{len(df)} finite ages via CantatGaudin2020b crossmatch on cluster ID.'
                )

                del _df

            elif (
                ('Zari2018' in r.reference_id)
                or
                ('Oh2017' in r.reference_id)
                or
                ('Ujjwal2020' in r.reference_id)
                or
                ('CottenSong' in r.reference_id)
                or
                ('HATSandHATNcandidates' in r.reference_id)
                or
                ('SIMBAD' in r.reference_id)
                or
                ('Gagne2018' in r.reference_id)
            ):
                age = np.ones(len(df))*np.nan
                df['age'] = age

            else:
                age_mapper = lambda k: AGE_LOOKUP[k]
                age = df.cluster.apply(age_mapper)
                df['age'] = age

        N_stars_in_lists.append(len(df))
        Nstars_with_age_in_lists.append(len(df[~pd.isnull(df.age)]))
        dfs.append(df)

        assert (
            'source_id' in df.columns
            and
            'cluster' in df.columns
            and
            'age' in df.columns
        )

    metadf["Nstars"] = N_stars_in_lists
    metadf["Nstars_with_age"] = Nstars_with_age_in_lists

    # concatenation.
    nomagcut_df = pd.concat(dfs)
    assert np.sum(metadf.Nstars) == len(nomagcut_df)

    # clean ages
    sel = (nomagcut_df.age == -np.inf)
    nomagcut_df.loc[sel,'age'] = np.nan
    nomagcut_df['age'] = np.round(nomagcut_df.age,2)

    #
    # merge duplicates, and ','-join the cluster id strings, age values
    #
    scols = ['source_id', 'cluster', 'age', 'reference_id', 'reference_bibcode']
    nomagcut_df = nomagcut_df[scols].sort_values(by='source_id')

    for c in nomagcut_df.columns:
        nomagcut_df[c] = nomagcut_df[c].astype(str)

    print(79*'-')
    print('Beginning aggregation (takes ~2-3 minutes for v0.5)...')
    _ = nomagcut_df.groupby('source_id')
    df_agg = _.agg({
        "cluster": list,
        "age": list,
        "reference_id": list,
        "reference_bibcode": list
    })

    u_sourceids = np.unique(nomagcut_df.source_id)
    N_sourceids = len(u_sourceids)
    assert len(df_agg) == N_sourceids

    df_agg["source_id"] = df_agg.index

    # turn the lists to comma separated strings.
    outdf = pd.DataFrame({
        "source_id": df_agg.source_id,
        "cluster": [','.join(map(str, l)) for l in df_agg['cluster']],
        "age": [','.join(map(str, l)) for l in df_agg['age']],
        "mean_age": [np.round(np.nanmean(np.array(l).astype(float)),2) for l in df_agg['age']],
        "reference_id": [','.join(map(str, l)) for l in df_agg['reference_id']],
        "reference_bibcode": [','.join(map(str, l)) for l in df_agg['reference_bibcode']],
    })

    outpath = os.path.join(
        clusterdatadir, f'list_of_lists_keys_paths_assembled_v{catalog_vnum}.csv'
    )
    metadf.to_csv(outpath, index=False)
    print(f'Made {outpath}')

    outpath = os.path.join(
        clusterdatadir, f'cdips_targets_v{catalog_vnum}_nomagcut.csv'
    )
    outdf.to_csv(outpath, index=False)
    print(f'Made {outpath}')
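# Hedged sketch of the aggregated output written above (made-up values): a
# source_id that appears in two membership lists ends up as one row whose
# cluster, age, and reference fields are comma-joined strings, e.g.
#   source_id: 123456789012345678
#   cluster:   "IC_2602,IC_2602"
#   age:       "7.8,8.0"       -> mean_age: 7.9
#   reference_id: "CantatGaudin2018a,CantatGaudin2020a"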
def get_sample_text(condition: str,
                    sample_type: str,
                    cell_type: str = None,
                    replicate: str = None,
                    lane: str = None):
    """ Construct the text for a sample name by concatenating the respective
    properties of the sample.
    
    The format of the name is:
        [cell-type, ] <condition>, <sample_type> [(<replicate>)] [(lane: <lane>)]
        
    The optional parts are skipped if the respective value is None.
    
    Parameters
    ----------
    condition: string
        The name of the condition for the sample, e.g., "sham.cm"
        
    sample_type: string
        The type of the sample, e.g., "riboseq"
        
    cell_type: string
        The type of cell (tissue, etc.) from which the sample came, e.g., "cm"
        
    replicate: string
        An identifier for the (biological) replicate, e.g., "mouse-403"
        
    lane: string
        An identifier for the lane of the sample, e.g., "2"
        
    Returns
    -------
    sample_name_text: string, or None
        The name, constructed as indicated above. If condition is None, NaN, or
        a zero-length string, then None is returned.
    """
    # pd.isnull(condition) already covers the case where condition is None
    if pd.isnull(condition):
        return None

    if len(condition) == 0:
        return None

    sample_name = ""

    if cell_type is not None:
        sample_name = "{}{}, ".format(sample_name, cell_type)

    sample_name = "{}{}, {} ".format(sample_name, str(condition),
                                     str(sample_type))

    if replicate is not None:
        sample_name = "{}({}) ".format(sample_name, str(replicate))

    if lane is not None:
        sample_name = "{}(lane: {})".format(sample_name, str(lane))

    sample_name = sample_name.strip()
    return sample_name
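# Usage sketch with made-up inputs, in the doctest style used elsewhere in this
# file (assumes get_sample_text is importable in the session):
# >>> get_sample_text("sham.cm", "riboseq", cell_type="cm",
# ...                 replicate="mouse-403", lane="2")
# 'cm, sham.cm, riboseq (mouse-403) (lane: 2)'
# >>> get_sample_text(None, "riboseq") is None
# True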
Exemple #40
0
def get_target_catalog(catalog_vnum, VERIFY=1):
    """
    1. Assemble the target catalog (down to arbitrary brightness; i.e, just
    clean and concatenate).
    2. Manually async query the Gaia database based on those source_ids.
    3. Verify the result, and merge and write it.
    """

    csvpath = os.path.join(
        clusterdatadir, f'cdips_targets_v{catalog_vnum}_nomagcut.csv'
    )
    if not os.path.exists(csvpath):
        assemble_initial_source_list(catalog_vnum)

    df = pd.read_csv(csvpath)

    # made by assemble_initial_source_list above.
    metapath = os.path.join(
        clusterdatadir, f'list_of_lists_keys_paths_assembled_v{catalog_vnum}.csv'
    )
    metadf = pd.read_csv(metapath)

    if VERIFY:
        # one-time verification
        verify_target_catalog(df, metadf)

    # e.g., cdips_v05_1-result.vot.gz
    votablepath = os.path.join(
        clusterdatadir, f'cdips_v{str(catalog_vnum).replace(".","")}_1-result.vot.gz'
    )
    if not os.path.exists(votablepath):
        temppath = os.path.join(clusterdatadir, f'v{str(catalog_vnum).replace(".","")}_sourceids.csv')
        df['source_id'].to_csv(
            temppath,
            index=False
        )
        print(f'Wrote {temppath}')
        querystr = (
            "SELECT top 2000000 g.source_id, g.ra, g.dec, g.parallax, "+
            "g.parallax_error, g.pmra, g.pmdec, g.phot_g_mean_mag, "+
            "g.phot_rp_mean_mag, g.phot_bp_mean_mag FROM "+
            f"user_lbouma.v{str(catalog_vnum).replace('.','')}_sourceids as u, gaiadr2.gaia_source AS g WHERE "+
            "u.source_id=g.source_id "
        )
        print('Now you must go to https://gea.esac.esa.int/archive/, login, and run')
        print(querystr)
        assert 0
        # # NOTE: the naive implementation below doesn't work, probably because of a
        # # sync/async issue. given_source_ids_get_gaia_data now raises an
        # # error # if n_max exceeds 5e4, because the ~70k items that WERE
        # # returned are duds.
        # cols = (
        #     'g.source_id, g.ra, g.dec, g.parallax, g.parallax_error, g.pmra, '
        #     'g.pmdec, g.phot_g_mean_mag, g.phot_rp_mean_mag, g.phot_bp_mean_mag'
        # )
        # gdf = given_source_ids_get_gaia_data(
        #     np.array(df.source_id.astype(np.int64)),
        #     f'cdips_targets_v{catalog_vnum}',
        #     n_max=int(2e6), overwrite=False,
        #     enforce_all_sourceids_viable=True, whichcolumns=cols,
        #     gaia_datarelease='gaiadr2'
        # )

    gdf = given_votable_get_df(votablepath, assert_equal='source_id')

    if not len(gdf) == len(df):
        print(79*"*")
        print('WRN!')
        print(f'Expected {len(df)} matches in Gaia DR2')
        print(f'Got {len(gdf)} matches in Gaia DR2')
        print(79*"*")
        verify_gaia_xmatch(df, gdf, metadf)

    # every queried source_id should have a result. the two that do not are
    # EsplinLuhman2019, 377 matches to 443 stars, and Gagne2018c, 914 matches
    # to 916 stars. this is 68 missing stars out of 1.5 million. we'll be okay.
    # so, do the merge using the GAIA xmatch results as the base.
    mdf = gdf.merge(df, on='source_id', how='left')


    #
    # update metadf with new info.
    #
    N_stars_in_lists = []
    Nstars_with_age_in_lists = []
    N_sel0 = []
    N_sel1 = []
    N_sel2 = []
    for ix, r in metadf.iterrows():

        csvpath = os.path.join(clusterdatadir, r.csv_path)
        assert os.path.exists(csvpath)
        _df = pd.read_csv(csvpath)
        if 'source_id' not in _df.columns:
            _df = _df.rename(columns={"source":"source_id"})

        _sel = mdf.source_id.isin(_df.source_id)
        N_stars_in_lists.append(len(mdf[_sel]))
        _selage = (~pd.isnull(mdf.age)) & _sel
        Nstars_with_age_in_lists.append(len(mdf[_selage]))

        _sel0 = (
            _sel
            &
            (mdf.phot_rp_mean_mag < 16)
        )

        _sel1 = (
            _sel
            &
            (
                (mdf.phot_rp_mean_mag < 16)
                |
                ((mdf.parallax/mdf.parallax_error > 5) & (mdf.parallax > 10))
            )
        )

        _sel2 = _sel1 & (mdf.mean_age > -1)

        N_sel0.append(len(mdf[_sel0]))
        N_sel1.append(len(mdf[_sel1]))
        N_sel2.append(len(mdf[_sel2]))

    metadf["N_gaia"] = N_stars_in_lists
    metadf["N_gaia_withage"] = Nstars_with_age_in_lists
    metadf["N_Rplt16"] = N_sel0
    metadf["N_Rplt16_orclose"] = N_sel1
    metadf["N_Rplt16_orclose_withage"] = N_sel2
    metadf['Nstars_m_Ngaia'] = metadf.Nstars - metadf.N_gaia

    #
    # save the output
    #
    csvpath = os.path.join(
        clusterdatadir, f'cdips_targets_v{catalog_vnum}_nomagcut_gaiasources.csv'
    )
    if not os.path.exists(csvpath):
        mdf.to_csv(csvpath, index=False)
        print(f'Wrote {csvpath}')
    else:
        print(f'Found {csvpath}')

    metapath = os.path.join(
        clusterdatadir,
        f'list_of_lists_keys_paths_assembled_v{catalog_vnum}_gaiasources.csv'
    )
    if not os.path.exists(metapath):
        metadf.sort_values(by='Nstars', ascending=False).to_csv(metapath, index=False)
        print(f'Wrote {metapath}')
    else:
        print(f'Found {metapath}')

    # Rp<16
    csvpath = os.path.join(
        clusterdatadir, f'cdips_targets_v{catalog_vnum}_gaiasources_Rplt16.csv'
    )
    if not os.path.exists(csvpath):
        sel = (mdf.phot_rp_mean_mag < 16)
        smdf = mdf[sel]
        smdf.to_csv(csvpath, index=False)
        print(f'Wrote {csvpath}')
    else:
        print(f'Found {csvpath}')

    # Rp<16 or close
    csvpath = os.path.join(
        clusterdatadir, f'cdips_targets_v{catalog_vnum}_gaiasources_Rplt16_orclose.csv'
    )
    if not os.path.exists(csvpath):
        sel =  (
            (mdf.phot_rp_mean_mag < 16)
            |
            (
              (mdf.parallax/mdf.parallax_error > 5) & (mdf.parallax > 10)
            )
        )
        smdf = mdf[sel]
        smdf.to_csv(csvpath, index=False)
        print(f'Wrote {csvpath}')
    else:
        print(f'Found {csvpath}')
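# Hedged usage sketch: build the v0.5 target catalog end to end (assumes
# clusterdatadir is populated and that, on the first pass, the manual Gaia
# archive query described above has produced the *_1-result.vot.gz file).
# >>> get_target_catalog(catalog_vnum=0.5, VERIFY=1)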
import pandas as pd
import matplotlib.pyplot as plt

csv_lines = [] #preallocate list for extracted csv lines

path_to_csv = "data/dataset.csv"

raw_data = pd.read_csv(path_to_csv, sep=';')

# encode as nominal
#raw_data.user.unique(); raw_data.user = raw_data.user.map({'debora':0, 'katia':1, 'wallace':2, 'jose_carlos':3}); raw_data.user.unique()
raw_data.gender.unique(); raw_data.gender = raw_data.gender.map({'Woman':1, 'Man':0}); raw_data.gender.unique()

raw_data['how_tall_in_meters'] = raw_data['how_tall_in_meters'].str.replace(',', '.')
raw_data['body_mass_index'] = raw_data['body_mass_index'].str.replace(',', '.')

raw_data[pd.isnull(raw_data).any(axis=1)]

raw_data.isnull().values.any(); raw_data.isnull().sum().sum()

raw_data.drop(raw_data[raw_data.z4 == "-14420-11-2011 04:50:23,713"].index.values, inplace=True) # row 122076 -> (165633, 19)
raw_data.z4 = pd.to_numeric(raw_data.z4, errors='raise'); raw_data.dtypes # z4 object -> int64
raw_data.columns

raw_data[pd.isnull(raw_data).any(axis=1)]

raw_data = raw_data[raw_data['body_mass_index'].notnull()]

raw_data[pd.isnull(raw_data).any(axis=1)]

raw_data.isnull().values.any(); raw_data.isnull().sum().sum()
Exemple #42
0
def infer_schema(_data,
                 fname,
                 output_root='',
                 sample_size=1.0,
                 type_threshold=0.5,
                 n_jobs=1,
                 base_schema=None,
                 base_schema_feature_colname='column',
                 base_schema_dtype_colname='type'):
    """
	Infer data types for all columns for the input table

	Parameters
	----------
	_data: pandas DataFrame
		data table to infer
	fname: string
		the output file name
	output_root: string, default=''
		the root directory for the output file
	sample_size: int or float(<= 1.0), default=1.0
		int: number of sample rows to infer the data type (useful for large tables)
		float: sample size in percentage
	type_threshold: float(<= 1.0), default=0.5
		threshold for inferring data type
	n_jobs: int, default=1
		the number of jobs to run in parallel
	base_schema: pandas DataFrame, default=None
		data schema to base on
	base_schema_feature_colname: string
		feature_colname in base schema
	base_schema_dtype_colname: string
		dtype_colname in base schema
	"""

    # copy raw data table
    data = _data.copy()

    # open a new workbook to store all result
    wb = openpyxl.Workbook()
    ws = wb['Sheet']
    ws.title = 'schema'

    # calculate sample size
    if sample_size <= 1.0:
        sample_size = int(data.shape[0] * sample_size)

    # dictionary to store dropna sample data values
    data_dropna_sample_values = {}
    for col in data.columns.values:
        if len(data[col].dropna()) <= sample_size:
            data_dropna_sample_values[col] = data[col].dropna().values
        else:
            data = data.sample(sample_size).reset_index(drop=True)
            data_dropna_sample_values[col] = data[col].dropna().values

    # use data_dropna_sample_values to infer data type for each column
    _n_jobs = np.min([n_jobs, len(data.columns.values)])
    type_infos = Parallel(n_jobs=_n_jobs)(delayed(_infer_dtype)(
        data_dropna_sample_values[col], col, type_threshold)
                                          for col in data.columns.values)
    type_infos_df = pd.DataFrame(type_infos)[['column', 'type']]

    # dtype mapping for basic stat calculation
    data_types = {}
    for col in data.columns.values:
        data_types[col] = type_infos_df.loc[type_infos_df['column'] == col,
                                            'type'].values[0]

    # get basic statistic information for all columns
    stat_infos = Parallel(n_jobs=_n_jobs)(delayed(_cal_column_stat)(
        data_dropna_sample_values[col], col, data_types[col])
                                          for col in data.columns.values)
    stat_infos_df = pd.DataFrame(stat_infos)

    # merge dtype information with stat information
    full_infos_df = type_infos_df.merge(stat_infos_df, on='column', how='left')
    full_infos_df = full_infos_df[[
        'column', 'type', 'sample_value', 'sample_num_uni', 'sample_min',
        'sample_median', 'sample_max', 'sample_std'
    ]]

    # if base_schema is provided, we can compare with base schema
    if base_schema is not None:
        base_schema = base_schema.rename(
            columns={
                base_schema_feature_colname: 'base_column',
                base_schema_dtype_colname: 'base_type'
            })[['base_column', 'base_type']]
        full_infos_df = full_infos_df.merge(base_schema,
                                            left_on='column',
                                            right_on='base_column',
                                            how='outer')

        # compare with the base schema
        full_infos_df['base_column'] = full_infos_df['base_column'].apply(
            lambda x: 'column not in base table' if pd.isnull(x) else x)
        full_infos_df['column'] = full_infos_df['column'].apply(
            lambda x: 'column not in current table' if pd.isnull(x) else x)

        # reorder the column
        full_infos_df = full_infos_df[[
            'column', 'base_column', 'type', 'base_type', 'sample_value',
            'sample_num_uni', 'sample_min', 'sample_median', 'sample_max',
            'sample_std'
        ]]

    # add data validation for type column
    val_type = DataValidation(type="list",
                              formula1='"key,numeric,str,date"',
                              allow_blank=False)
    ws.add_data_validation(val_type)

    # get col_name, excel column mapping
    column_mapping = {}
    for i, col in enumerate(full_infos_df.columns):
        column_mapping[col] = xlsxwriter.utility.xl_col_to_name(i)

    # write everything into the worksheet
    for r_idx, r in enumerate(
            dataframe_to_rows(full_infos_df, index=False, header=True)):
        ws.append(r)
        for cell_idx, cell in enumerate(
                ws.iter_cols(max_col=ws.max_column,
                             min_row=ws.max_row,
                             max_row=ws.max_row)):
            cell = cell[0]
            if r_idx != 0:
                val_type.add(ws['%s%d' % (column_mapping['type'], ws.max_row)])
                if cell_idx == 0:
                    cell.font = Font(bold=True)
            else:
                cell.style = 'Accent5'

    # add conditional formatting
    red_fill = PatternFill(bgColor="FFC7CE")
    red_font = Font(color="9C0006")
    green_fill = PatternFill(bgColor="C6EFCE")
    green_font = Font(color="006100")
    blue_fill = PatternFill(bgColor="9ECAE1")
    blue_font = Font(color="08306B")
    orange_fill = PatternFill(bgColor="FDD0A2")
    orange_font = Font(color="A63603")
    purple_fill = PatternFill(bgColor="DADAEB")
    purple_font = Font(color="3F007D")

    # red highlight if there is any inconsistent between base and the target
    if base_schema is not None:
        col1 = column_mapping['column']
        col2 = column_mapping['base_column']
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col1, col1, ws.max_row),
            FormulaRule(formula=['%s2<>%s2' % (col1, col2)],
                        stopIfTrue=True,
                        fill=red_fill,
                        font=red_font))

        ws.conditional_formatting.add(
            '%s2:%s%d' % (col2, col2, ws.max_row),
            FormulaRule(formula=['%s2<>%s2' % (col1, col2)],
                        stopIfTrue=True,
                        fill=red_fill,
                        font=red_font))

        col1 = column_mapping['type']
        col2 = column_mapping['base_type']
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col1, col1, ws.max_row),
            FormulaRule(formula=['%s2<>%s2' % (col1, col2)],
                        stopIfTrue=True,
                        fill=red_fill,
                        font=red_font))

        ws.conditional_formatting.add(
            '%s2:%s%d' % (col2, col2, ws.max_row),
            FormulaRule(formula=['%s2<>%s2' % (col1, col2)],
                        stopIfTrue=True,
                        fill=red_fill,
                        font=red_font))

    # yellow highlight for the type column (which needs to be modified)
    ws['%s1' % (column_mapping['type'])].style = 'Neutral'

    # green highlight for the key type and red highlight for the error type
    type_cols = [column_mapping['type']]
    if 'base_type' in column_mapping.keys():
        type_cols.append(column_mapping['base_type'])

    for col in type_cols:
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="error"' % (col)],
                        stopIfTrue=True,
                        fill=red_fill,
                        font=red_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="key"' % (col)],
                        stopIfTrue=True,
                        fill=green_fill,
                        font=green_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="numeric"' % (col)],
                        stopIfTrue=True,
                        fill=blue_fill,
                        font=blue_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="str"' % (col)],
                        stopIfTrue=True,
                        fill=orange_fill,
                        font=orange_font))
        ws.conditional_formatting.add(
            '%s2:%s%d' % (col, col, ws.max_row),
            FormulaRule(formula=['%s2="date"' % (col)],
                        stopIfTrue=True,
                        fill=purple_fill,
                        font=purple_font))

    # red highlight for sample_num_uni = 0 or 1 (no or only one unique value)
    ws.conditional_formatting.add(
        '%s2:%s%d' % (column_mapping['sample_num_uni'],
                      column_mapping['sample_num_uni'], ws.max_row),
        FormulaRule(formula=['%s2=0' % (column_mapping['sample_num_uni'])],
                    stopIfTrue=True,
                    fill=red_fill,
                    font=red_font))
    ws.conditional_formatting.add(
        '%s2:%s%d' % (column_mapping['sample_num_uni'],
                      column_mapping['sample_num_uni'], ws.max_row),
        FormulaRule(formula=['%s2=1' % (column_mapping['sample_num_uni'])],
                    stopIfTrue=True,
                    fill=red_fill,
                    font=red_font))

    # adjust the column format for the worksheet
    _adjust_ws(ws=ws, row_height=20)

    wb.save(filename=os.path.join(output_root, 'data_schema_%s.xlsx' %
                                  (fname)))
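# Hedged usage sketch with a made-up table and output path: infer column types
# from a 50% row sample on a single worker and write the schema workbook.
# >>> demo = pd.DataFrame({"id": [1, 2, 3], "score": [0.1, 0.5, 0.9]})
# >>> infer_schema(demo, fname="demo", output_root=".", sample_size=0.5)
# This should save './data_schema_demo.xlsx' with one row per column, its
# inferred type, and the basic sample statistics computed above.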
Exemple #43
0
dataset["Cabin"].describe()

# In[ ]:

dataset["Cabin"].isnull().sum()

# In[ ]:

dataset["Cabin"][dataset["Cabin"].notnull()].head()

# In[ ]:

# Replace the Cabin number by the type of cabin 'X' if not
dataset["Cabin"] = pd.Series(
    [i[0] if not pd.isnull(i) else 'X' for i in dataset['Cabin']])

# The first letter of the cabin indicates the deck; I chose to keep only this information, since it indicates the probable location of the passenger on the Titanic.

# In[ ]:

# Show the counts of observations in each categorical bin using bars.
g = sns.countplot(dataset["Cabin"],
                  order=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'X'])

# In[ ]:

g = sns.catplot(y="Survived",
                x="Cabin",
                data=dataset,
                kind="bar",
Exemple #44
0
    def amortize(self, df):
        """
        Input a day-wise sparse dataframe.
        Return an amortized dataframe.
    
        Parameters
        ----------
        df : dataframe
            A sparse dataframe with date as its index.
            
            e.g.
              DATE  Brent Oil Futures Historical Data - Price
        2010-01-01                                        NaN
        2010-01-02                                        NaN
        2010-01-03                                        NaN
        2010-01-04                                      80.12
        2010-01-05                                      80.59
    
        Par : dictionary
            Customized parameters imported from 'parameters.py'.
    
        Raises
        ------
        ValueError
            Raised when the amortization contains NaN.
    
        Returns
        -------
        df : dataframe
            A dataframe with no NaN and date as its index.
            
            e.g.
              DATE  Brent Oil Futures Historical Data - Price
        2010-01-01                                      80.12
        2010-01-02                                      80.12
        2010-01-03                                      80.12
        2010-01-04                                      80.12
        2010-01-05                                      80.59
    
        """

        display, verbose = True, True
        if display:
            feature_ctr, unab_amort_list = 0, []

        df = df.copy()

        for col in df.columns:
            # if verbose:
            #     print(col)

            index = np.where(df[col].notnull())[0]
            if index.size >= 2:
                amortization = [df[col].iloc[index[0]]] * (index[0] - 0)
                for i in range(len(index) - 1):
                    amortization.extend(
                        np.linspace(float(df[col].iloc[index[i]]),
                                    float(df[col].iloc[index[i + 1]]),
                                    index[i + 1] - index[i],
                                    endpoint=False))

                    if np.any(pd.isnull(amortization)):
                        print(i)
                        raise ValueError(f'{col} contains NaN')

                amortization.extend([df[col].iloc[index[i + 1]]] *
                                    (len(df[col]) - 1 - index[i + 1] + 1))

                df[col] = amortization

                # Make sure all values are converted into number
                df[col] = df[col].astype(float)

                if np.any(pd.isnull(df[col])):
                    print('null', col)
                    raise ValueError

                if display:
                    feature_ctr += 1

            elif index.size < 2:
                if display:
                    unab_amort_list.append(col)
                if verbose:
                    print(f'Unable to amortize {col}')

                df.drop(columns=col, inplace=True)

        return df
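# Hedged usage sketch mirroring the docstring example above (assumes `obj` is an
# instance of the surrounding class):
# >>> sparse = pd.DataFrame(
# ...     {"Brent Oil Futures Historical Data - Price":
# ...      [np.nan, np.nan, np.nan, 80.12, 80.59]},
# ...     index=pd.date_range("2010-01-01", periods=5, name="DATE"))
# >>> dense = obj.amortize(sparse)
# The three leading NaNs are backfilled with 80.12; interior gaps, had there
# been any, would be filled by linear interpolation between surrounding quotes,
# so no NaN remains in the returned frame.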
Exemple #45
0
def UserTimeLine_JsonLoad(input_str, index_line):
    '''
    Parse one line of a user-timeline JSON dump into a Tweet_OBJ dict.
    Returns (flag_TidTimeAuthor, Tweet_OBJ).
    '''
    # main data structure, contains information from each tweet
    Tweet_OBJ = col.defaultdict()
    # tweet
    Tweet_OBJ['tweet_time'] = None  # pd.timestamp
    Tweet_OBJ['tweet_id'] = None  # all id are id_str
    Tweet_OBJ['text'] = ""  # make sure text is onto a single line.....
    Tweet_OBJ['lang'] = ""
    Tweet_OBJ[
        'coordinates'] = ""  # Null, or {"coordinates":[-75.14310264,40.05701649],"type":"Point"}

    Tweet_OBJ['reply_to_userID'] = "-1"  # set of tuple(userID, userName)
    Tweet_OBJ['quoted_status_id'] = "-1"
    Tweet_OBJ[
        'in_reply_to_status_id'] = "-1"  # set of tweetID (set of strings)

    Tweet_OBJ['retweet_count'] = 0
    Tweet_OBJ['favorite_count'] = 0

    # user
    Tweet_OBJ['user_id'] = "-1"  # all id are id_str
    Tweet_OBJ['user_name'] = ""
    Tweet_OBJ['user_verified'] = False  # default

    Tweet_OBJ['user_followers'] = 0  # default 0
    Tweet_OBJ['user_friends'] = 0
    Tweet_OBJ['user_favourites'] = 0
    Tweet_OBJ['user_listed'] = 0
    Tweet_OBJ['user_statuses'] = 0

    # hash tag
    Tweet_OBJ['Tag'] = set()  # set of strings
    # mentions
    Tweet_OBJ['mentioned_userID'] = set()  # set of tuple(userID, userName)

    #################################################################################
    # json load, extract tweet time and id_str
    flag_TidTimeAuthor = True  # flag for tweet id, time and author
    try:
        # load json
        tweet_json = json.loads(input_str)
    except ValueError:
        print "Line: {}, json loads Error".format(index_line)
        flag_TidTimeAuthor = False
    else:
        # extract date-time from mainbody
        try:
            time_str = tweet_json['created_at']
            tweet_id = tweet_json['id_str']
        except ValueError:
            flag_TidTimeAuthor = False
            pass
        except KeyError:
            flag_TidTimeAuthor = False
            pass
        else:
            # convert to pandas timestamp
            try:
                time_dt = pd.to_datetime(time_str)
                if pd.isnull(time_dt):
                    flag_TidTimeAuthor = False
                    print "Line: {}, date-time is NaT".format(index_line)
            except ValueError:
                flag_TidTimeAuthor = False
                print "Line: {}, date-time convertion failed".format(
                    index_line)
                pass
            else:
                # upload to RetD_TimeUserTag
                if flag_TidTimeAuthor:
                    Tweet_OBJ['tweet_time'] = time_dt
                    Tweet_OBJ['tweet_id'] = tweet_id

    #################################################################################
    # extract user information sub-json
    if flag_TidTimeAuthor:
        try:
            user_json = tweet_json['user']
        except ValueError:
            flag_TidTimeAuthor = False
            pass
        except KeyError:
            flag_TidTimeAuthor = False
            pass
        else:
            # extract user statistics
            try:
                user_id = user_json['id_str']
                user_name = user_json['screen_name']
                if len(user_name) > 253:
                    user_name = user_name[:250]
                user_followers = user_json['followers_count']
                user_friends = user_json['friends_count']
            except ValueError:
                flag_TidTimeAuthor = False
                pass
            except KeyError:
                flag_TidTimeAuthor = False
                pass
            else:
                if flag_TidTimeAuthor:
                    Tweet_OBJ['user_id'] = user_id
                    Tweet_OBJ['user_name'] = user_name
                    Tweet_OBJ['user_followers'] = user_followers
                    Tweet_OBJ['user_friends'] = user_friends

    #################################################################################
    # extract tweet direct information
    if flag_TidTimeAuthor:

        # extract coordinates information
        try:
            geo_json = tweet_json['coordinates']
            coordinates = str(geo_json['coordinates'])
        except ValueError:
            pass
        except KeyError:
            pass
        except AttributeError:
            pass
        except TypeError:
            pass
        else:
            Tweet_OBJ['coordinates'] = coordinates

        # extract lang information
        try:
            lang = tweet_json['lang']
        except ValueError:
            pass
        except KeyError:
            pass
        except AttributeError:
            pass
        except TypeError:
            pass
        else:
            Tweet_OBJ['lang'] = lang

        # extract retweet_count information
        try:
            retweet_count = tweet_json['retweet_count']
        except ValueError:
            pass
        except KeyError:
            pass
        except AttributeError:
            pass
        except TypeError:
            pass
        else:
            Tweet_OBJ['retweet_count'] = retweet_count

        # extract favorite_count information
        try:
            favorite_count = tweet_json['favorite_count']
        except ValueError:
            pass
        except KeyError:
            pass
        except AttributeError:
            pass
        except TypeError:
            pass
        else:
            Tweet_OBJ['favorite_count'] = favorite_count

        # extract reply_to_user information
        try:
            reply_userID_str = tweet_json['in_reply_to_user_id_str']
            # if the id is null, .isdigit() raises (caught below); if it is not all digits, it returns False
            flag_idstr = reply_userID_str.isdigit()
        except ValueError:
            pass
        except KeyError:
            pass
        except AttributeError:
            pass
        except TypeError:
            pass
        else:
            if flag_idstr == True:
                Tweet_OBJ['reply_to_userID'] = reply_userID_str

        # extract in_reply_to_status_id information
        try:
            reply_tweetID_str = tweet_json['in_reply_to_status_id_str']
            # if the id is null, .isdigit() raises (caught below); if it is not all digits, it returns False
            flag_idstr = reply_tweetID_str.isdigit()
        except ValueError:
            pass
        except KeyError:
            pass
        except AttributeError:
            pass
        except TypeError:
            pass
        else:
            if flag_idstr == True:
                Tweet_OBJ['in_reply_to_status_id'] = reply_tweetID_str

        # extract quoted_status_id information
        try:
            quoted_status_id = tweet_json['quoted_status_id']
            # if the id is null, .isdigit() raises (caught below); if it is not all digits, it returns False
            flag_idstr = quoted_status_id.isdigit()
        except ValueError:
            pass
        except KeyError:
            pass
        except AttributeError:
            pass
        except TypeError:
            pass
        else:
            if flag_idstr == True:
                Tweet_OBJ['quoted_status_id'] = quoted_status_id

    #################################################################################
    # extract tags from entities
    if flag_TidTimeAuthor:
        # extract tags from entities
        tag_list = set([])  # eliminate repeating tags
        try:
            entities_json = tweet_json['entities']
            Hashtags_json = entities_json['hashtags']
        except ValueError:
            pass
        except KeyError:
            pass
        except TypeError:
            pass
        else:
            for entry in Hashtags_json:
                try:
                    # Lower-casing the tag is essential: MySQL can't distinguish
                    # upper from lower case when the string is later used as a
                    # table name, which would cause confusion in the analysis.
                    tag_text = str(entry['text']).lower()
                    if len(tag_text) > 253:
                        tag_text = tag_text[:250]
                    tag_list.add(tag_text)
                except ValueError:
                    pass
                except KeyError:
                    pass
                except TypeError:
                    pass
            # end of for
            for item in tag_list:
                Tweet_OBJ['Tag'].add(item)

    #############################################################
    # extract text
    if flag_TidTimeAuthor:
        # extract date-time from mainbody
        try:
            text_str = tweet_json['text']
            text_str = transASC(text_str)
            text_str = removeUtf(text_str)
            text_str = text_str.replace("'", "")
            text_str = parse_MultiLine_text(text_str)
        except ValueError:
            pass
        except KeyError:
            pass
        else:
            Tweet_OBJ['text'] = text_str

    #############################################################
    # extract mentioned_userID
    if flag_TidTimeAuthor:
        # extract entities and user_mentions
        try:
            usermentions_json = entities_json['user_mentions']
        except ValueError:
            pass
        except KeyError:
            pass
        except TypeError:
            pass
        else:
            for entry in usermentions_json:
                try:
                    Tweet_OBJ['mentioned_userID'].add(entry['id_str'])
                except ValueError:
                    pass
                except KeyError:
                    pass
                except TypeError:
                    pass

    #############################################################
    return flag_TidTimeAuthor, Tweet_OBJ
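# Hedged usage sketch (assumes `line` holds one raw tweet JSON string and `idx`
# is its line number in the dump being parsed):
# >>> ok, tweet = UserTimeLine_JsonLoad(line, idx)
# >>> if ok:
# ...     print(tweet['tweet_id'], tweet['tweet_time'], sorted(tweet['Tag']))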
Exemple #46
0
def save_graph_xml(
    data,
    filepath=None,
    node_tags=settings.osm_xml_node_tags,
    node_attrs=settings.osm_xml_node_attrs,
    edge_tags=settings.osm_xml_way_tags,
    edge_attrs=settings.osm_xml_way_attrs,
    oneway=False,
    merge_edges=True,
    edge_tag_aggs=None,
):
    """
    Save graph to disk as an OSM-formatted XML .osm file.

    Note: for large networks this function can take a long time to run. Before
    using this function, make sure you configured OSMnx as described in the
    example below when you created the graph.

    Example
    -------
    >>> import osmnx as ox
    >>> utn = ox.settings.useful_tags_node
    >>> oxna = ox.settings.osm_xml_node_attrs
    >>> oxnt = ox.settings.osm_xml_node_tags
    >>> utw = ox.settings.useful_tags_way
    >>> oxwa = ox.settings.osm_xml_way_attrs
    >>> oxwt = ox.settings.osm_xml_way_tags
    >>> utn = list(set(utn + oxna + oxnt))
    >>> utw = list(set(utw + oxwa + oxwt))
    >>> ox.config(all_oneway=True, useful_tags_node=utn, useful_tags_way=utw)
    >>> G = ox.graph_from_place('Piedmont, CA, USA', network_type='drive')
    >>> ox.save_graph_xml(G, filepath='./data/graph1.osm')

    Parameters
    ----------
    data : networkx multi(di)graph OR a length 2 iterable of nodes/edges
        geopandas GeoDataFrames
    filepath : string
        path to the .osm file including extension. if None, use default data
        folder + graph.osm
    node_tags : list
        osm node tags to include in output OSM XML
    node_attrs: list
        osm node attributes to include in output OSM XML
    edge_tags : list
        osm way tags to include in output OSM XML
    edge_attrs : list
        osm way attributes to include in output OSM XML
    oneway : bool
        the default oneway value used to fill this tag where missing
    merge_edges : bool
        if True merges graph edges such that each OSM way has one entry
        and one entry only in the OSM XML. Otherwise, every OSM way
        will have a separate entry for each node pair it contains.
    edge_tag_aggs : list of length-2 string tuples
        useful only if merge_edges is True, this argument allows the user
        to specify edge attributes to aggregate such that the merged
        OSM way entry tags accurately represent the sum total of
        their component edge attributes. For example, if the user
        wants the OSM way to have a "length" attribute, the user must
        specify `edge_tag_aggs=[('length', 'sum')]` in order to tell
        this method to aggregate the lengths of the individual
        component edges. Otherwise, the length attribute will simply
        reflect the length of the first edge associated with the way.

    Returns
    -------
    None
    """
    # default filepath if none was provided
    if filepath is None:
        filepath = os.path.join(settings.data_folder, "graph.osm")

    # if save folder does not already exist, create it
    folder, filename = os.path.split(filepath)
    if not folder == "" and not os.path.exists(folder):
        os.makedirs(folder)

    if not settings.all_oneway:
        raise UserWarning("In order for save_graph_osm to behave properly "
                          "the graph must have been created with the "
                          "`all_oneway` setting set to True.")

    try:
        gdf_nodes, gdf_edges = data
    except ValueError:
        gdf_nodes, gdf_edges = utils_graph.graph_to_gdfs(
            data, node_geometry=False, fill_edge_geometry=False)

    # rename columns per osm specification
    gdf_nodes.rename(columns={
        "osmid": "id",
        "x": "lon",
        "y": "lat"
    },
                     inplace=True)
    if "id" in gdf_edges.columns:
        gdf_edges = gdf_edges[[col for col in gdf_edges if col != "id"]]
    if "uniqueid" in gdf_edges.columns:
        gdf_edges = gdf_edges.rename(columns={"uniqueid": "id"})
    else:
        gdf_edges = gdf_edges.reset_index().rename(columns={"index": "id"})

    # add default values for required attributes
    for table in (gdf_nodes, gdf_edges):
        table["uid"] = "1"
        table["user"] = "******"
        table["version"] = "1"
        table["changeset"] = "1"
        table["timestamp"] = "2017-01-01T00:00:00Z"

    # convert all datatypes to str
    gdf_nodes = gdf_nodes.applymap(str)
    gdf_edges = gdf_edges.applymap(str)

    # misc. string replacements to meet OSM XML spec
    if "oneway" in gdf_edges.columns:

        # fill blank oneway tags with default (False)
        gdf_edges.loc[pd.isnull(gdf_edges["oneway"]), "oneway"] = oneway
        gdf_edges.loc[:, "oneway"] = gdf_edges["oneway"].astype(str)
        gdf_edges.loc[:, "oneway"] = (gdf_edges["oneway"].str.replace(
            "False", "no").replace("True", "yes"))

    # initialize XML tree with an OSM root element then append nodes/edges
    root = etree.Element("osm", attrib={"version": "1", "generator": "OSMnx"})
    root = _append_nodes_xml_tree(root, gdf_nodes, node_attrs, node_tags)
    root = _append_edges_xml_tree(root, gdf_edges, edge_attrs, edge_tags,
                                  edge_tag_aggs, merge_edges)

    # write to disk
    etree.ElementTree(root).write(filepath)
    utils.log(f'Saved graph as .osm file at "{filepath}"')
if __name__ == "__main__":
    titanic_train_data = pd.read_csv(r"data/train.csv").drop(['Ticket'],
                                                             axis=1)
    titanic_test_data = pd.read_csv(r"data/test.csv").drop(['Ticket'], axis=1)
    all_data = titanic_train_data.append(titanic_test_data).drop(['Survived'],
                                                                 axis=1)
    survived_data = titanic_train_data['Survived']

    freq_port = all_data.Embarked.dropna().mode()[0]
    all_data.loc[:, 'Embarked'] = all_data['Embarked'].fillna(freq_port)

    all_data = ap.correct_age(all_data)

    all_data.loc[:, 'Cabin'] = all_data['Cabin'].map(lambda x: 'U'
                                                     if pd.isnull(x) else x[0])
    all_data.Cabin.replace(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'U'],
                           [0, 0, 0, 0, 0, 0, 0, 0, 1],
                           inplace=True)

    all_data = tp.preprocessing(all_data)
    to_predict = all_data[survived_data.shape[0]:]

    train_x, test_x, train_y, test_y = train_test_split(
        all_data[0:survived_data.shape[0]], survived_data, test_size=0.2)

    kfold = model_selection.KFold(n_splits=10, random_state=seed)

    DecisionTreeClassifierModel = DecisionTreeClassifier(max_features=10,
                                                         min_samples_leaf=6,
                                                         criterion='gini',
# Rename columns
ra_c.rename(columns={'country': 'country_c', 'times_2014_r': 'times_2014_r_c',
                     'cwur_2014_r': 'cwur_2014_r_c', 'sh_2014_r': 'sh_2014_r_c'}, inplace=True)
ra_s.rename(columns={'country': 'country_s', 'times_2014_r': 'times_2014_r_s',
                     'cwur_2014_r': 'cwur_2014_r_s', 'sh_2014_r': 'sh_2014_r_s'}, inplace=True)

# Merging the data based on top 100 universities from each ranking
rank_analysis_sct = pd.merge(ra_t, 
                             pd.merge(ra_c, 
                              ra_s, on = 'university_name', how = 'outer'), 
                                    on = 'university_name', how = 'outer')

# Ensuring country column is not blank for universities not present in all 3 rankings
for i in range(len(rank_analysis_sct)):
    if pd.isnull(rank_analysis_sct.loc[i, 'country']):
        rank_analysis_sct.loc[i, 'country'] = str(rank_analysis[rank_analysis['university_name'] ==
            rank_analysis_sct.loc[i, 'university_name']].iloc[0]['country'])


# Ensuring rank column is not blank for universities not present in all 3 rankings
rank_analysis_sct['times_2014_r'] = rank_analysis_sct['times_2014_r'].replace(np.nan, rank_analysis_sct['times_2014_r_c'])
rank_analysis_sct['times_2014_r'] = rank_analysis_sct['times_2014_r'].replace(np.nan, rank_analysis_sct['times_2014_r_s'])

rank_analysis_sct['cwur_2014_r'] = rank_analysis_sct['cwur_2014_r'].replace(np.nan, rank_analysis_sct['cwur_2014_r_c'])
rank_analysis_sct['cwur_2014_r'] = rank_analysis_sct['cwur_2014_r'].replace(np.nan, rank_analysis_sct['cwur_2014_r_s'])

rank_analysis_sct['sh_2014_r'] = rank_analysis_sct['sh_2014_r'].replace(np.nan, rank_analysis_sct['sh_2014_r_c'])
rank_analysis_sct['sh_2014_r'] = rank_analysis_sct['sh_2014_r'].replace(np.nan, rank_analysis_sct['sh_2014_r_s'])

# Replace NaN items (universities which do not exist in a ranking) with a rank of 700 to ensure they are placed farther away
def get_ingredient(df_tab, idx, sen_type):
    ingr = df_tab.loc[idx, 'ingr_' + sen_type]
    if pd.isnull(ingr):
        return None
    else:
        return ingr
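# Hedged usage sketch with a made-up table: fetch the 'ingr_subj' cell of a row,
# or None when the cell is missing.
# >>> tab = pd.DataFrame({'ingr_subj': ['flour', np.nan]})
# >>> get_ingredient(tab, 0, 'subj')
# 'flour'
# >>> get_ingredient(tab, 1, 'subj') is None
# True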
def table_to_vcf(input_table_path, output_vcf_path=None):
    # validate args
    if not os.path.isfile(input_table_path):
        sys.exit("ERROR: %s not found" % input_table_path)

    # read input table. low_memory allows dtypes to be inferred
    t = pd.read_table(input_table_path, low_memory=False)

    missing_columns = {"chrom", "pos", "ref", "alt"} - set(t.columns)
    if missing_columns:
        sys.exit("ERROR: %s is missing columns: %s" %
                 (input_table_path, str(missing_columns)))

    if output_vcf_path is None:
        output_vcf_path = input_table_path.replace(".tsv", "") + ".vcf.gz"

    print("Writing output to %s" % output_vcf_path)

    with gzopen(output_vcf_path, "w") as f:
        f.write("""##source=ClinVar
##INFO=<ID=MUT,Number=1,Type=String,Description="MUT">
##INFO=<ID=MUT,Number=1,Type=String,Description="MEASURESET_ID">
##INFO=<ID=MUT,Number=1,Type=String,Description="SYMBOL">
##INFO=<ID=MUT,Number=1,Type=String,Description="CLINICAL_SIGNIFICANCE">
##INFO=<ID=MUT,Number=1,Type=String,Description="REVIEW_STATUS">
##INFO=<ID=MUT,Number=1,Type=String,Description="ALL_SUBMITTERS">
##INFO=<ID=MUT,Number=1,Type=String,Description="ALL_TRAITS">
##INFO=<ID=MUT,Number=1,Type=String,Description="ALL_PMIDS">
##INFO=<ID=MUT,Number=1,Type=String,Description="ALL_PATHOGENIC">
##INFO=<ID=MUT,Number=1,Type=String,Description="ALL_CONFLICTED">
""")
        f.write("\t".join(
            ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]) +
                "\n")
        for i, table_row in t.iterrows():
            vcf_row = []
            vcf_row.append(table_row["chrom"])
            vcf_row.append(table_row["pos"])
            vcf_row.append('.')  # ID
            vcf_row.append(table_row["ref"])
            vcf_row.append(table_row["alt"])
            vcf_row.append('.')  # QUAL
            vcf_row.append('.')  # FILTER

            info_field = collections.OrderedDict()

            # from VCF spec:
            #    INFO - additional information: (String, no white-space, semi-colons, or equals-signs permitted; commas are
            #    permitted only as delimiters for lists of values) INFO fields are encoded as a semicolon-separated series of short
            #    keys with optional values in the format: <key>=<data>[,data].
            for key in [
                    "mut", "measureset_id", "symbol", "clinical_significance",
                    "review_status", "all_submitters", "all_traits",
                    "all_pmids", "pathogenic", "conflicted"
            ]:
                if pd.isnull(table_row[key]):
                    continue
                value = str(table_row[key])
                value = re.sub(r'\s*[,;]\s*', '|',
                               value)  # replace , or ; with |
                value = value.replace("=", " eq ").replace(" ", "_")

                info_field[key.upper()] = value
            vcf_row.append(";".join(
                [key + "=" + value for key, value in info_field.items()]))

            f.write("\t".join(map(str, vcf_row)) + "\n")
            f.flush()

    print("Done")
# Read data
df_og = pd.read_csv("Titanic.csv")

## Preprocess data
# Set working dataframe to original dataframe
df = df_og

# Find out whether there are any duplicated rows
print("There are "+ str(df.duplicated().sum()) + " duplicated rows")

# Remove duplicates
df = df.drop_duplicates()

# Find out whether there is any missing data (NaNs)
print("The total missing data is \n{} ".format(pd.isnull(df).sum()))

# Deal with missing data:
# Remove rows with missing data
#df = df.dropna(how='any',axis=0)
# or
# Impute values to replace NaN in the whole dataset, since there are no missing values in the target dataframe
for c in df.columns:
    if (pd.isnull(df[c]).sum()!=0):
        df[c].fillna(df[c].mean(),inplace=True)
# Imputing is more accurate in our case

# Drop 'useless' columns: name of passenger (meaningless), index (meaningless), and sex (represented twice)
df = df.drop(['Name','Unnamed: 0','Sex'],axis=1)

# Get dummy of features (categorical to numerical)
Exemple #52
0
def _set_values_series(dfs):
    return set(dfs[~pd.isnull(dfs)])
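# e.g. (made-up Series): _set_values_series(pd.Series([1.0, np.nan, 2.0, 2.0]))
# returns the set of non-null values, {1.0, 2.0}.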
Exemple #53
0
def tolist(*args):
    q = [[j for j in i if not (pd.isnull(j))] for i in args]
    return q
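# e.g. (made-up inputs): tolist([1, np.nan, 2], ['a', None]) drops the nulls
# from each argument and returns [[1, 2], ['a']].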
Exemple #54
0
def look_at_daily_bets(game_df, odds_df, date, options):
    """
    """
    # Convert the date to an epoch dt
    this_epch = Tutils.tme2epoch(date, "%Y%m%d")

    # sort by time downloaded and remove duplicates, keeping the most recent one
    odds_df = odds_df.sort_values("Time")
    recent_odds_df = odds_df.drop_duplicates(
        subset=(["Away_Team", "Home_Team", "Game_Time"]), keep='last')

    # Maps team names to dataframe of that team
    team_df_map = NBA_utils.make_team_df_map(game_df)
    all_teams = sorted(list(team_df_map.keys()))

    stat_map = {}
    # Add whatever columns we want to look at
    #d = 3
    #r = 8
    d = 3
    for r in range(1, 20):
        pct_counts = add_col_and_print_threshold_counts(
            game_df, d, r, team_df_map)
        key = "%d %d" % (d, r)
        stat_map[key] = pct_counts

    #
    if options.plot_file:
        plot_over_under_pcts(stat_map, options.plot_file)
        plot_col_running_sum(game_df, "OU_HIT_3_avg_8", options.plot_file)

    #
    # Add the running 3game average to look at
    #
    num_games = 3
    home_col = "HomeTeamAvg%sGames" % num_games
    game_df[home_col] = game_df.apply(
        lambda row: NBA_utils.calculate_team_avg_xdays_game_df(
            row['HomeTeam'], num_games, row, team_df_map[row['HomeTeam']]),
        axis=1)
    away_col = "AwayTeamAvg%sGames" % num_games
    game_df[away_col] = game_df.apply(
        lambda row: NBA_utils.calculate_team_avg_xdays_game_df(
            row['AwayTeam'], num_games, row, team_df_map[row['AwayTeam']]),
        axis=1)

    #
    # Load up model information
    #
    model = pickle.load(open(model_file, "rb"))
    # Load in the info file
    info_file = model_file.replace(".sav", ".info")
    info_F = open(info_file, "r")
    info_lines = info_F.readlines()
    # Get the predictors
    predictors = MODEL_utils.read_predictors_from_info(info_lines)
    training_end_date = MODEL_utils.read_end_date_from_info(info_lines)
    game_df = add_model_predictors(predictors, game_df, this_epch, team_df_map)

    game_df.to_csv("game_df.csv")
    #
    # Get the vegas lines for this date and iterate over them
    #
    day_df = recent_odds_df[recent_odds_df['GameDateEpoch'] == this_epch]
    for ind, row in day_df.iterrows():
        print("Info on game %s at %s. Over_Under: %f" %
              (row['Away_Team'], row['Home_Team'],
               row['Over_under_VI Consensus']))

        ###########################################################
        #
        # Unnecessary info print statements
        #
        # Look at over/unders for this team in either home or away games
        print_team_over_unders(game_df, row, 'Away_Team')
        print_team_over_unders(game_df, row, 'Home_Team')
        # Look at last 3 away games for away team
        print_last_x_games(game_df, this_epch, 10, 'RemappedAwayTeam',
                           'Away_Team', row, 'away')
        print_last_x_games(game_df, this_epch, 10, 'RemappedHomeTeam',
                           'Home_Team', row, 'home')
        ###########################################################

        # Get game_df maps for this team
        away_team_df = game_df[
            (game_df['RemappedAwayTeam'] == row['Away_Team']) |
            (game_df['RemappedHomeTeam'] == row['Away_Team'])]
        if (len(away_team_df) == 0):
            print("Can't make map for team: '%s'" % row['Away_Team'])
            continue
        home_team_df = game_df[
            (game_df['RemappedAwayTeam'] == row['Home_Team']) |
            (game_df['RemappedHomeTeam'] == row['Home_Team'])]
        if (len(home_team_df) == 0):
            print("Can't make map for team: %s" % row['Home_Team'])
            continue

        #
        # THIS IS THE KEY RIGHT NOW
        #
        if away_team_df.iloc[-1]['RemappedAwayTeam'] == row['Away_Team']:
            away_team_avg = away_team_df.iloc[-1]['AwayTeamAvg3Games']
        else:
            away_team_avg = away_team_df.iloc[-1]['HomeTeamAvg3Games']
        print("\t%s %s : %f" %
              (row['Away_Team'], 'TeamAvg3Games', away_team_avg))

        if home_team_df.iloc[-1]['RemappedAwayTeam'] == row['Home_Team']:
            home_team_avg = home_team_df.iloc[-1]['AwayTeamAvg3Games']
        else:
            home_team_avg = home_team_df.iloc[-1]['HomeTeamAvg3Games']
        print("\t%s %s : %f" %
              (row['Home_Team'], 'TeamAvg3Games', home_team_avg))
        avg_points = home_team_avg + away_team_avg
        print("\t%f" % (avg_points))
        #
        # If the teams have been averaging at least 9 points over the O/U, it's usually over
        #
        if avg_points - abs(row['Over_under_Open']) >= 9:
            print("  --YOOO BET ON THE OVER HERE: BEEN AVERAGING %d!!!" %
                  (avg_points))
        ###########################################
        # Add modeled_points
        ###########################################

        # Get home team modeled points
        ht_preds_list = []
        at_preds_list = []
        for p in predictors:
            ht_pred_col = "%s_%s" % ('HomeTeam', p)
            at_pred_col = "%s_%s" % ('AwayTeam', p)

            if p.startswith("Opp"):
                #
                # Need different logic for getting opponents' averages
                #
                # Get home Predictor
                if away_team_df.iloc[-1]['RemappedHomeTeam'] == row[
                        'Away_Team']:
                    if pd.isnull(away_team_df.iloc[-1][at_pred_col]):
                        ht_preds_list.append(np.nan)
                    else:
                        ht_preds_list.append(
                            away_team_df.iloc[-1][at_pred_col])
                else:
                    if pd.isnull(away_team_df.iloc[-1][ht_pred_col]):
                        ht_preds_list.append(np.nan)
                    else:
                        ht_preds_list.append(
                            away_team_df.iloc[-1][ht_pred_col])
                # Get Away predictors
                if home_team_df.iloc[-1]['RemappedHomeTeam'] == row[
                        'Home_Team']:
                    if pd.isnull(home_team_df.iloc[-1][at_pred_col]):
                        at_preds_list.append(np.nan)
                    else:
                        at_preds_list.append(
                            home_team_df.iloc[-1][at_pred_col])
                else:
                    if pd.isnull(home_team_df.iloc[-1][ht_pred_col]):
                        at_preds_list.append(np.nan)
                    else:
                        at_preds_list.append(
                            home_team_df.iloc[-1][ht_pred_col])

            else:
                # Get home Predictor
                if home_team_df.iloc[-1]['RemappedHomeTeam'] == row[
                        'Home_Team']:
                    if pd.isnull(home_team_df.iloc[-1][ht_pred_col]):
                        ht_preds_list.append(np.nan)
                    else:
                        ht_preds_list.append(
                            home_team_df.iloc[-1][ht_pred_col])
                else:
                    if pd.isnull(home_team_df.iloc[-1][at_pred_col]):
                        ht_preds_list.append(np.nan)
                    else:
                        ht_preds_list.append(
                            home_team_df.iloc[-1][at_pred_col])
                # Get Away predictors
                if away_team_df.iloc[-1]['RemappedHomeTeam'] == row[
                        'Away_Team']:
                    if pd.isnull(away_team_df.iloc[-1][ht_pred_col]):
                        at_preds_list.append(np.nan)
                    else:
                        at_preds_list.append(
                            away_team_df.iloc[-1][ht_pred_col])
                else:
                    if pd.isnull(away_team_df.iloc[-1][at_pred_col]):
                        at_preds_list.append(np.nan)
                    else:
                        at_preds_list.append(
                            away_team_df.iloc[-1][at_pred_col])

        ht_preds = np.array(ht_preds_list)
        at_preds = np.array(at_preds_list)
        if np.nan not in ht_preds_list:
            ht_fcst = model.predict(ht_preds.reshape(1, -1))[0]
        else:
            ht_fcst = -9999
        if np.nan not in at_preds_list:
            at_fcst = model.predict(at_preds.reshape(1, -1))[0]
        else:
            at_fcst = -9999
        print(ht_preds)
        print("\tModeled HomeTeam points: %s" % ht_fcst)
        print(at_preds)
        print("\tModeled AwayTeam points: %s" % at_fcst)
        modeled_OU = ht_fcst + at_fcst
        # If the modeled O/U is between 2 and 5 points under the vegas line, bet the under
        print(row['Over_under_VI Consensus'])
        if (((abs(row['Over_under_VI Consensus']) - modeled_OU) > 2) &
            ((abs(row['Over_under_VI Consensus']) - modeled_OU) < 5)):
            print("  --YOOO BET ON THE UNDER HERE: Model expecting: %d!!!" %
                  (modeled_OU))

        if ((modeled_OU - 9) > abs(row['Over_under_VI Consensus'])):
            print("  --YOOO BET ON THE OVER HERE: Model expecting: %d!!!" %
                  (modeled_OU))

        # If modeled is more than 2 under AND average is more than 2 under.  Bet.
        #if (((abs(row['Over_under_VI Consensus'])-modeled_OU) > 2) and
        #    ((abs(row['Over_under_VI Consensus'])-avg_points) > 2)):
        #    print ("  --YOOO BET ON THE UNDER HERE: Model expecting %d AND avg saying %d!!!" % (modeled_OU, avg_points))
        print("-------------------------------------------------------")
    def __init__(self):
        print('loading model ...')
        self.nlp = spacy.load('de_core_news_lg', exclude=['ner'])
        self.nlp.add_pipe('ner', source=spacy.load('de_core_news_lg'))
        ruler = self.nlp.add_pipe('entity_ruler', before='ner')
        ruler.from_disk("data/custom.jsonl")
        ruler.to_disk("data/.swap/entity_ruler")
        additional_df = pd.read_excel(inputSheet, 'additional', usecols=['entity', 'variations', 'type'])
        self.additional_dict = dict()
        for _, r in additional_df.iterrows():
            k = r['entity']
            t = r['type']
            self.additional_dict[k.lower()] = (k, t)
            if not pd.isnull(r['variations']):
                for v in re.split(r'\s*;\s*', r['variations']):
                    self.additional_dict[v.lower()] = (k, t)
        ignore_df = pd.read_excel(inputSheet, 'ignore', usecols=['entity'])
        self.ignore_set = set([r['entity'].lower() for _, r in ignore_df.iterrows()])
        print('parsing transcripts ...')
        self.timecode_pattern = re.compile(r'[\(\[].{2}:.{2}[\)\]]|[\(\[].{1,2}:.{2}:.{2}[\)\]]')
        self.entities_dict = dict()
        self.segments = []
        self.entities = []
        transcript_id: int
        transcript_path: str
        for transcript_id in range(1, 5):
            print(f'... {transcript_id}')
            transcript_path = f'data/transcript-{transcript_id}.txt'
            with open(transcript_path, 'r') as file:
                transcript = file.read()
            starts, segments = self.split_transcript(transcript)
            for i, segment in enumerate(segments):
                entities = self.get_entities(segment)
                for e in entities:
                    if e[0] not in self.entities_dict:
                        self.entities_dict[e[0]] = set()
                    if e[0].lower() in self.additional_dict:
                        # correct category?
                        self.entities_dict[e[0]] = set([self.additional_dict[e[0].lower()][1]])
                    else:
                        self.entities_dict[e[0]].add(e[1])
                entities = [e[0] for e in entities]
                self.segments.append(Segment(transcript_id, starts[i], segment, entities))

        with open(entitiesFile, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['entity', 'type'])
            for e in sorted(self.entities_dict.keys()):
                print(e)
                writer.writerow([e, ';'.join(sorted(list(self.entities_dict[e])))])

        with open(segmentsFile, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['video', 'start', 'text', 'entities'])
            for s in self.segments:
                writer.writerow([s.video, s.start, s.text, ';'.join(sorted(s.entities))])

        dfe = pd.read_csv(entitiesFile)
        dfe.sort_values('type', inplace=True)
        dfe.to_excel(extractionSheet, sheet_name='entities', index=False)

        # DataFrame.append was removed in pandas 2.0; concatenate instead.
        dfm = pd.concat([
            pd.read_excel(inputSheet, 'entities', usecols=['entity', 'variations', 'wikidata', 'sapa', 'type', 'image']),
            pd.read_excel(extractionSheet, 'entities', usecols=['entity', 'type']),
        ], ignore_index=True)
        length2 = len(dfm)
        dfm.sort_values('entity', inplace=True)
        dfm.drop_duplicates(keep='first', inplace=True, subset=['entity'])
        length3 = len(dfm)
        print(f'Total after merge: {length2} | Removed duplicates: {length2 - length3}')
        dfm.to_excel(outputSheet, sheet_name='entities', index=False)

        dfi = pd.read_excel(inputSheet, 'ignore', usecols=['entity'])
        dfa = pd.read_excel(inputSheet, 'additional', usecols=['entity', 'variations', 'type'])

        with pd.ExcelWriter(outputSheet, engine='openpyxl', mode='a') as writer:
            dfi.to_excel(writer, sheet_name='ignore', index=False)
            dfa.to_excel(writer, sheet_name='additional', index=False)
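# The constructor above relies on a Segment container and a split_transcript
# helper that are not shown in this excerpt. A minimal sketch of what they
# might look like, inferred from how they are used (the field names come from
# the CSV writer above; the splitting logic is an assumption).
import re
from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class Segment:
    video: int                  # transcript id of the source file
    start: str                  # timecode at which the segment begins
    text: str                   # raw segment text
    entities: List[str] = field(default_factory=list)

def split_transcript(transcript: str, timecode_pattern: re.Pattern) -> Tuple[List[str], List[str]]:
    """Split a transcript into (starts, segments) at each timecode marker."""
    starts = [m.group(0) for m in timecode_pattern.finditer(transcript)]
    segments = [s.strip() for s in timecode_pattern.split(transcript) if s.strip()]
    return starts, segments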
Exemple #56
0
def upload(df,
           gfile="/New Spreadsheet",
           wks_name=None,
           col_names=True,
           row_names=True,
           clean=True,
           credentials=None,
           start_cell='A1',
           df_size=False,
           new_sheet_dimensions=(1000, 100),
           input_option='USER_ENTERED'):
    '''
        Upload given Pandas DataFrame to Google Drive and returns
        gspread Worksheet object

        :param df: Pandas DataFrame
        :param gfile: path to Google Spreadsheet or gspread ID
        :param wks_name: worksheet name
        :param col_names: passing top row to column names for Pandas DataFrame
        :param row_names: passing left column to row names for Pandas DataFrame
        :param clean: clean all data in worksheet before uploading
        :param credentials: provide own credentials
        :param start_cell: specify where to insert the DataFrame; default is A1
        :param df_size:
            -If True and worksheet name does NOT exist, will create
            a new worksheet that is the size of the df; otherwise, by default,
            creates sheet of 1000x100 cells.
            -If True and worksheet does exist, will resize larger or smaller to
            fit new dataframe.
            -If False and dataframe is larger than existing sheet, will resize
            the sheet larger.
            -If False and dataframe is smaller than existing sheet, does not resize.
        :param new_sheet_dimensions: tuple of (row, cols) for size of a new sheet
        :param input_option: Determines how input data should be interpreted.
            (see ValueInputOption GoogleSheet API)
        :param conv_string: If True, converts dataframe to str before pushing to Google Sheet
        :type df: class 'pandas.core.frame.DataFrame'
        :type gfile: str
        :type wks_name: str
        :type col_names: bool
        :type row_names: bool
        :type clean: bool
        :type credentials: class 'oauth2client.client.OAuth2Credentials'
        :type start_cell: str
        :type df_size: bool
        :type new_sheet_dimensions: tuple
        :type conv_string: bool
        :returns: gspread Worksheet
        :rtype: class 'gspread.models.Worksheet'

        :Example:

            >>> from df2gspread import df2gspread as d2g
            >>> import pandas as pd
            >>> df = pd.DataFrame([1, 2, 3])
            >>> wks = d2g.upload(df, wks_name='Example worksheet')
            >>> wks.title
            'Example worksheet'
    '''
    # access credentials
    credentials = get_credentials(credentials)
    # auth for gspread
    gc = gspread.authorize(credentials)

    # If gfile is already a spreadsheet ID this succeeds; otherwise resolve
    # the path/name to an ID.
    try:
        gc.open_by_key(gfile).__repr__()
        gfile_id = gfile
    except Exception:
        gfile_id = get_file_id(credentials, gfile, write_access=True)

    # Tuple of rows, cols in the dataframe.
    # If user did not explicitly specify to resize sheet to dataframe size
    # then for new sheets set it to new_sheet_dimensions, which is by default 1000x100
    if df_size:
        new_sheet_dimensions = (len(df), len(df.columns))
    wks = get_worksheet(gc,
                        gfile_id,
                        wks_name,
                        write_access=True,
                        new_sheet_dimensions=new_sheet_dimensions)
    if clean:
        wks = clean_worksheet(wks, gfile_id, wks_name, credentials)

    start_col = re.split(r'(\d+)', start_cell)[0].upper()
    start_row = re.split(r'(\d+)', start_cell)[1]
    start_row_int, start_col_int = gspread.utils.a1_to_rowcol(start_cell)

    # find last index and column name (A B ... Z AA AB ... AZ BA)
    num_rows = len(df.index) + 1 if col_names else len(df.index)
    last_idx_adjust = start_row_int - 1
    last_idx = num_rows + last_idx_adjust

    num_cols = len(df.columns) + 1 if row_names else len(df.columns)
    last_col_adjust = start_col_int - 1
    last_col_int = num_cols + last_col_adjust
    last_col = re.split(
        r'(\d+)', (gspread.utils.rowcol_to_a1(1, last_col_int)))[0].upper()

    # If user requested to resize sheet to fit dataframe, go ahead and
    # resize larger or smaller to better match new size of pandas dataframe.
    # Otherwise, leave it the same size unless the sheet needs to be expanded
    # to accommodate a larger dataframe.
    if df_size:
        wks.resize(rows=len(df.index) + col_names,
                   cols=len(df.columns) + row_names)
    if len(df.index) + col_names + last_idx_adjust > wks.row_count:
        wks.add_rows(
            len(df.index) - wks.row_count + col_names + last_idx_adjust)
    if len(df.columns) + row_names + last_col_adjust > wks.col_count:
        wks.add_cols(
            len(df.columns) - wks.col_count + row_names + last_col_adjust)

    # Define first cell for rows and columns
    first_col = re.split(r'(\d+)', (gspread.utils.rowcol_to_a1(
        1, start_col_int + 1)))[0].upper() if row_names else start_col
    first_row = str(start_row_int + 1) if col_names else start_row

    # Addition of col names
    if col_names:
        cell_list = wks.range('%s%s:%s%s' %
                              (first_col, start_row, last_col, start_row))
        for idx, cell in enumerate(cell_list):
            cell.value = df.columns.astype(str)[idx]
        wks.update_cells(cell_list)

    # Addition of row names
    if row_names:
        cell_list = wks.range('%s%s:%s%d' %
                              (start_col, first_row, start_col, last_idx))
        for idx, cell in enumerate(cell_list):
            cell.value = df.index.astype(str)[idx]
        wks.update_cells(cell_list)

    # convert df values to string
    # df = df.applymap(str)

    # Addition of cell values
    cell_list = wks.range('%s%s:%s%d' %
                          (first_col, first_row, last_col, last_idx))
    for j, idx in enumerate(df.index):
        for i, col in enumerate(df.columns.values):
            if not pd.isnull(df[col][idx]):
                cell_list[i + j * len(df.columns.values)].value = df[col][idx]

    wks.update_cells(cell_list, value_input_option=input_option)
    return wks
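# A short usage sketch for the upload() function above (not part of the
# original); the spreadsheet path and worksheet name are placeholders, and
# credentials resolution follows the docstring defaults.
import pandas as pd

def upload_example():
    """Write a small frame to a worksheet sized to the data, starting at B2."""
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    wks = upload(df,
                 gfile='/New Spreadsheet',      # path or spreadsheet ID
                 wks_name='Example worksheet',
                 start_cell='B2',               # insert the frame at B2
                 df_size=True)                  # size the sheet to the frame
    return wks.title                            # -> 'Example worksheet'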
    def test_fred_nan(self):
        start = datetime(2010, 1, 1)
        end = datetime(2013, 1, 27)
        df = web.DataReader("DFII5", "fred", start, end)
        assert pd.isnull(df.loc["2010-01-01"][0])
Exemple #58
0
def strategy(sdk):
    ###################
    #  Stock strategy #
    ###################
    tradeDateFlag = sdk.getGlobal('TRADEDATEFLAG')
    tradeDateFlag += 1
    sdk.setGlobal('TRADEDATEFLAG', tradeDateFlag)

    if tradeDateFlag % HOLDINGPERIOD == 0:

        stockList = sdk.getStockList()
        stop = sdk.getFactorData("LZ_CN_STKA_QUOTE_TCLOSE")[-1]  # 获取最近的收盘价因子矩阵

        profit = np.array(
            sdk.getFactorData("LZ_CN_STKA_FIN_IND_EBITPS")[-21])  ###息税前利润
        data_in = Factors(sdk, -21)
        stop_1 = data_in[-4 - 29]
        stop = data_in[-3 - 29]

        industry_new = sdk.getFactorData("LZ_CN_STKA_INDU_ZX")[-1]
        #   dtl = SMB
        #   lyst = [i for i in range(0,len(SMB)) if SMB[i]>np.median(SMB)+5*np.std(SMB)]
        #   dtl[lyst] = np.median(SMB)+5*np.std(SMB)
        #   lyst = [i for i in range(0,len(SMB)) if SMB[i]<np.median(SMB)-5*np.std(SMB)]
        #   dtl[lyst] = np.median(SMB)-5*np.std(SMB)
        #   lyst = [i for i in range(0,len(SMB)) if pd.isnull(SMB[i])==True]
        #   dtl[lyst] = mean_data[industry_new[lyst]]
        #   SMB = dtl

        data_mat = Factors(sdk, -21)
        #data_mat.append(np.array(sdk.getFactorData("LZ_CN_STKA_VAL_A_TCAP")[-21]))
        #data_mat.append(industry_new)

        for i in range(0, len(data_mat) - 30):
            sum_data = [float(0) for ii in range(0, 30)]
            num_data = [float(0) for ii in range(0, 30)]
            whole_sum = 0
            whole_num = 0
            for j in range(0, len(data_mat[i])):
                if not pd.isnull(
                        data_mat[i][j]
                ) and data_mat[i][j] < 1.0e+20 and data_mat[i][j] > 0.001:
                    sum_data[int(industry_new[j])] += data_mat[i][j]
                    num_data[int(industry_new[j])] += 1
                    whole_sum += data_mat[i][j]
                    whole_num += 1
            if whole_num == 0:
                whole_num += 1
            mean_data = np.array(sum_data) / np.array(num_data)
            for j in range(0, len(mean_data)):
                if pd.isnull(mean_data[j]):
                    mean_data[j] = whole_sum / whole_num
            for j in range(0, len(data_mat[i])):
                if pd.isnull(
                        data_mat[i]
                    [j]) or data_mat[i][j] > 1.0e+20 or data_mat[i][j] < 0.001:
                    data_mat[i][j] = mean_data[int(industry_new[j])]
            median = np.median(data_mat[i])
            sd = np.std(data_mat[i])
            for j in range(0, len(data_mat[i])):
                if data_mat[i][j] > median + 5 * sd:
                    data_mat[i][j] = median + 5 * sd
                if data_mat[i][j] < median - 5 * sd:
                    data_mat[i][j] = median - 5 * sd

        XX = []
        X = data_mat[-2 - 29]
        for j in range(0, len(X)):
            X[j] = math.log(X[j])
        # print X[j]
        XX.append(X)
        #XX.append(data_mat[-1])
        for i in range(0, 30):
            XX.append(data_mat[-i - 1])
        X = XX
        data_mat = data_mat[:10]
        X = np.transpose(X)

        for i in range(0, len(data_mat)):
            lm = linear_model.LinearRegression()
            lm.fit(X, data_mat[i])
            data_mat[i] = data_mat[i] - lm.predict(X)
            data_mat[i] = data_mat[i] / len(data_mat[i])
            # normalise to [0, 1]
            mat_max = max(data_mat[i])
            mat_min = min(data_mat[i])
            for j in range(0, len(data_mat[i])):
                data_mat[i][j] = (data_mat[i][j] - mat_min) / (mat_max -
                                                               mat_min)
            #print data_mat[i]

        data_mat = np.transpose(data_mat)
        new_data_mat = []
        label = []
        profit = np.array(stop) / np.array(stop_1) - 1
        profit1 = [i for i in profit if not pd.isnull(i)]
        for i in range(0, len(stop_1)):
            if profit[i] > np.percentile(profit1, 70):
                label.append(1)
                new_data_mat.append(data_mat[i])
                continue
            if profit[i] <= np.percentile(profit1, 30):
                label.append(0)
                new_data_mat.append(data_mat[i])
                continue

        data_mat = new_data_mat
        # range objects cannot be shuffled in place; materialise a list first
        sequ = list(range(len(data_mat)))
        random.shuffle(sequ)
        data_mat_t = []
        label_t = []
        for i in sequ:
            data_mat_t.append(data_mat[i])
            label_t.append(label[i])
        data_mat = data_mat_t
        label = label_t

        data_new = Factors(sdk, -1)
        #data_new.append(np.array(sdk.getFactorData("LZ_CN_STKA_VAL_A_TCAP")[-1]))
        #data_new.append(industry_new)
        industry_new = sdk.getFactorData("LZ_CN_STKA_INDU_ZX")[-1]

        for i in range(0, len(data_new) - 30):
            sum_data = [0 for ii in range(0, 30)]
            num_data = [0 for ii in range(0, 30)]
            whole_sum = 0
            whole_num = 0
            for j in range(0, len(data_new[i])):
                if not pd.isnull(
                        data_new[i][j]
                ) and data_new[i][j] < 1.0e+20 and data_new[i][j] > 0.001:
                    sum_data[int(industry_new[j])] += data_new[i][j]
                    num_data[int(industry_new[j])] += 1
                    whole_sum += data_new[i][j]
                    whole_num += 1
            if whole_num == 0:
                whole_num += 1
            mean_data = np.array(sum_data) / np.array(num_data)
            for j in range(0, len(mean_data)):
                if pd.isnull(mean_data[j]):
                    mean_data[j] = whole_sum / whole_num
            for j in range(0, len(data_new[i])):
                if pd.isnull(
                        data_new[i]
                    [j]) or data_new[i][j] > 1.0e+20 or data_new[i][j] < 0.001:
                    data_new[i][j] = mean_data[int(industry_new[j])]
            median = np.median(data_new[i])
            sd = np.std(data_new[i])
            for j in range(0, len(data_new[i])):
                if data_new[i][j] > median + 5 * sd:
                    data_new[i][j] = median + 5 * sd
                if data_new[i][j] < median - 5 * sd:
                    data_new[i][j] = median - 5 * sd
        XX = []
        X = data_new[-2 - 29]
        for j in range(0, len(X)):
            X[j] = math.log(X[j])
        # print X[j]
        XX.append(X)
        for i in range(0, 30):
            XX.append(data_new[-i - 1])
        X = XX
        data_new = data_new[:10]
        X = np.transpose(X)

        for i in range(0, len(data_new)):
            lm = linear_model.LinearRegression()
            lm.fit(X, data_new[i])
            data_new[i] = data_new[i] - lm.predict(X)
            data_new[i] = data_new[i] / len(data_new[i])
            # normalise to [0, 1]
            new_max = max(data_new[i])
            new_min = min(data_new[i])
            for j in range(0, len(data_new[i])):
                data_new[i][j] = (data_new[i][j] - new_min) / (new_max -
                                                               new_min)
            #print data_new[i]

        data_new = np.transpose(data_new)

        finger = 10
        ll = len(data_mat) // finger  # integer fold size for slicing
        max_score = 0
        for i in range(0, 10):
            test_set = np.array(data_mat[ll * i:ll * (i + 1)])
            test_lab = np.array(label[ll * i:ll * (i + 1)])
            train_set = np.array(data_mat[:ll * i] + data_mat[ll * (i + 1):])
            train_lab = np.array(label[:ll * i] + label[ll * (i + 1):])
            #train_lab = np.array(train_lab)
            head = len(train_set[0])
            #print "head=%i" %head
            #print "sample number=%i" %len(train_set)
            tail = 1
            array1 = [head, head * 3, head * 10, head * 3, tail]  # network layer sizes
            print("array1=%s" % array1)
            trainer, net = NetworkTrain(array1, train_set, train_lab)
            score = NetworkTest(trainer, net, train_set, train_lab)
            if score > max_score:
                max_score = score
                machine = trainer

        print "Optimized OOB Score: %f" % max_score

        Y_random = np.random.binomial(data_new.shape[0], 0.5, size=20000)
        # Predict with the best-scoring trainer selected in the loop above.
        predicted = NetworkPredict(machine, data_new, Y_random)

        # Create Random Forest object
        #model= RandomForestClassifier(n_estimators=10)
        # Train the model using the training sets and check score
        #model.fit(data_mat, label)
        #Predict Output
        #predicted= model.predict(data_new)

        WholeDict = dict(zip(stockList, predicted))

        stockToBuy = []
        buy_sq = []
        stockToSell = []

        for key in WholeDict.keys():
            if WholeDict[key] == 1:
                stockToBuy.append(key)
                buy_sq.append(WholeDict[key])
            if WholeDict[key] != 1:
                stockToSell.append(key)

        buyDict = dict(zip(stockToBuy, buy_sq))
        buyDict_Sorted = sorted(buyDict.items(),
                                key=lambda asd: asd[1],
                                reverse=True)

        stockToBuy = []

        for i in range(0, len(buyDict_Sorted)):
            stockToBuy.append(buyDict_Sorted[i][0])

        #Date = sdk.getNowDate()
        #sell_plan[Date] = stockToBuy

        #ii=0
        #selldate=''
        #for key in sell_plan.keys():
        #    d1 = datetime.datetime.strptime(key, '%Y%m%d')
        #    d2 = datetime.datetime.strptime(Date, '%Y%m%d')
        #    if d2-d1==10:
        #        ii=1
        #        buydate=key

        #if ii==1 :
        #    for i in range(0,len(sell_plan[buydate])):
        #        stockToSell.append(sell_plan[buydate][i])

        stockToBuy = stockToBuy[:HOLDINGNUMBER]
        # Update holdings and lock in the sell pool
        stockToSell = getPositionList(sdk)
        #stockToSell1 = getPositionList(sdk)
        #stockToSell = [val for val in stockToSell1 if val in stockToSell]
        # Sell stocks
        quotes = sdk.getQuotes(stockToSell)
        stockToSell = list(set(stockToSell)
                           & set(quotes.keys()))  # keep only sell candidates that have quotes

        #     print tradeDateFlag
        #     print stockToBuy
        #     print stockToSell
        #     print "\n"

        if stockToSell != []:
            pass

        bar = {}
        for s in stockToSell:
            bar[s] = quotes[s].open
        position = getPositionDict(sdk)
        if stockToSell != []:
            sellStockList(sdk, stockToSell, bar)  # sell at the open price
        # Update holdings
        stockPositionList = getPositionList(sdk)
        # Lock in the buy pool
        quotes = sdk.getQuotes(stockToBuy)  # get quote data for the stock list
        stockToBuy = list(set(stockToBuy)
                          & set(quotes.keys()))  # keep only buy candidates that have quotes
        bar = {}
        for s in stockToBuy:
            bar[s] = quotes[s].open
        position = getPositionDict(sdk)
        buyStockList(sdk, stockToBuy, bar)  # buy at the open price
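# The nested loops in the strategy above implement industry-mean imputation
# (overall mean as a fallback) followed by clipping at median +/- 5 standard
# deviations. A vectorized pandas sketch of the same per-factor step, for
# illustration only (factor and industry are equal-length 1-D arrays; pandas'
# sample std is used where the loops use np.std):
import numpy as np
import pandas as pd

def clean_factor(factor, industry):
    s = pd.Series(np.asarray(factor, dtype=float))
    ind = pd.Series(np.asarray(industry))
    invalid = s.isnull() | (s > 1.0e20) | (s < 0.001)
    valid = s.mask(invalid)                                       # invalid entries become NaN
    ind_mean = valid.groupby(ind).transform('mean').fillna(valid.mean())
    s = s.mask(invalid, ind_mean)                                 # impute with the industry mean
    median, sd = s.median(), s.std()
    return s.clip(median - 5 * sd, median + 5 * sd).to_numpy()    # winsorize at median +/- 5*sd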
Exemple #59
0
def custom_heuristic(file_path):
    '''
    You are given a list of Titanic passengers and their associated
    information. More information about the data can be seen at the link below:
    http://www.kaggle.com/c/titanic-gettingStarted/data

    For this exercise, you need to write a custom heuristic that will take
    in some combination of the passenger's attributes and predict if the passenger
    survived the Titanic disaster.

    Can your custom heuristic beat 80% accuracy?
    
    The available attributes are:
    Pclass          Passenger Class
                    (1 = 1st; 2 = 2nd; 3 = 3rd)
    Name            Name
    Sex             Sex
    Age             Age
    SibSp           Number of Siblings/Spouses Aboard
    Parch           Number of Parents/Children Aboard
    Ticket          Ticket Number
    Fare            Passenger Fare
    Cabin           Cabin
    Embarked        Port of Embarkation
                    (C = Cherbourg; Q = Queenstown; S = Southampton)
                    
    SPECIAL NOTES:
    Pclass is a proxy for socioeconomic status (SES)
    1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower

    Age is in years; fractional if age less than one
    If the age is estimated, it is in the form xx.5

    With respect to the family relation variables (i.e. SibSp and Parch)
    some relations were ignored. The following are the definitions used
    for SibSp and Parch.

    Sibling:  brother, sister, stepbrother, or stepsister of passenger aboard Titanic
    Spouse:   husband or wife of passenger aboard Titanic (mistresses and fiancees ignored)
    Parent:   mother or father of passenger aboard Titanic
    Child:    son, daughter, stepson, or stepdaughter of passenger aboard Titanic
    
    Write your prediction back into the "predictions" dictionary. The
    key of the dictionary should be the passenger's id (which can be accessed
    via passenger["PassengerId"]) and the associating value should be 1 if the
    passenger survvied or 0 otherwise. 

    For example, if a passenger is predicted to have survived:
    passenger_id = passenger['PassengerId']
    predictions[passenger_id] = 1

    And if a passenger is predicted to have perished in the disaster:
    passenger_id = passenger['PassengerId']
    predictions[passenger_id] = 0
    
    You can also look at the Titanic data that you will be working with
    at the link below:
    https://www.dropbox.com/s/r5f9aos8p9ri9sa/titanic_data.csv
    '''

    cols = [
        'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'
    ]
    cols.remove('PassengerId')
    cols.remove('Survived')
    cols.remove('Name')
    # Ticket?
    cols.remove('Ticket')

    # Bucketize age
    df = pandas.read_csv(file_path)
    df['Age'].fillna(-1, inplace=True)
    df['AgeBucket'] = pandas.Series('', index=df.index)
    for idx, row in df.iterrows():
        age = row['Age']
        age_bucket = 'c'
        if age < 18:
            age_bucket = 'young'
        elif age < 25:
            age_bucket = 'midyoung'
        elif age < 40:
            age_bucket = 'midmid'
        else:
            age_bucket = 'old'
        df.loc[idx, 'AgeBucket'] = age_bucket
    # After AgeBucket is added, replace the Age with Bucket
    cols.remove('Age')
    cols.append('AgeBucket')

    # Replace cabin with first char
    df['Deck'] = pandas.Series('', index=df.index)
    for idx, row in df.iterrows():
        cabin = row['Cabin']
        if not pandas.isnull(cabin):
            df.loc[idx, 'Deck'] = cabin[0]
    cols.remove('Cabin')
    cols.append('Deck')

    # Bucketize fare
    df['Fare'].fillna(-1, inplace=True)
    df['FareBucket'] = pandas.Series(0, index=df.index)
    for idx, row in df.iterrows():
        fare = row['Fare']
        # Cap to make bucketizing look nicer
        if fare > 100:
            df.loc[idx, 'Fare'] = 100

        fare_bucket = ''
        if fare <= 10:
            fare_bucket = 10
        elif fare <= 20:
            fare_bucket = 20
        elif fare <= 30:
            fare_bucket = 30
        elif fare <= 40:
            fare_bucket = 40
        else:
            fare_bucket = 100
        df.loc[idx, 'FareBucket'] = fare_bucket
    cols.remove('Fare')
    cols.append('FareBucket')

    # Print data relations
    features = []
    for i, coli in enumerate(cols):
        for j, colj in enumerate(cols):
            if i <= j:
                features.append([coli, colj])
                # features.append([coli, colj, 'Sex'])
    features = []
    #x = df[(df['Sex'] == 'male') & (df['Deck'] == 'E') & (df['AgeBucket'] == 'midmid')]

    survivor_threshold = 0.8
    base_threshold = 0
    for f in features:
        print "--------------------------------------------------------------"
        print f
        predictions = {}

        for passenger_index, passenger in df.iterrows():
            key = ''
            for k in f:
                v = passenger[k]
                key = key + ' ' + str(v)

            predictions.setdefault(key, [0, 0])
            predictions[key][0] += passenger['Survived']
            predictions[key][1] += 1

        # Print the stats for features list
        # print ', '.join(df.columns)
        # print predictions
        for k in sorted(predictions.keys()):
            v = predictions[k]
            survivor = 1.0 * v[0] / v[1]
            base = v[1]
            if survivor < survivor_threshold or base < base_threshold or 'female' in k:
                continue
            print('%s => %.2f (%d)' % (k, (1.0 * v[0] / v[1]), v[1]))

    # Observations:
    # PREDICTION female in [1, 2] class => 97, 92%
    # PREDICTION female with SbSp <= 2 [0.79, 0.75, 0.77]
    # PREDICTION female not from S [69%, against C, Q]
    # PREDICTION decks: B, D, E
    # TODO: fare: bucketize

    total_survivors = 0
    predictions = {}
    # df = pandas.read_csv(file_path)
    for passenger_index, passenger in df.iterrows():
        passenger_id = passenger['PassengerId']
        survivor = 0

        sex = passenger['Sex']
        if sex == 'female':
            # 1 or 2nd class?
            if passenger['Pclass'] in [1, 2]:
                survivor = 1
            # Embarked in 'C' (Cherbourg)?
            if passenger['Embarked'] == 'C':
                survivor = 1
            if passenger['Deck'] in ['B', 'C', 'D', 'E']:
                survivor = 1
            if passenger['FareBucket'] == 100:
                survivor = 1

        # Bunch of findings
        if passenger['Pclass'] == 2 and passenger['Parch'] == 2:
            survivor = 1
        if passenger['SibSp'] == 1 and passenger['Deck'] in ['B', 'D']:
            survivor = 1
        if passenger['Parch'] == 2 and passenger['AgeBucket'] == 'midyoung':
            survivor = 1
        if passenger['Embarked'] == 'C' and passenger['Deck'] == 'D':
            survivor = 1
        if passenger['AgeBucket'] == 'midmid' and passenger['Deck'] in [
                'B', 'D'
        ]:
            survivor = 1
        if passenger['Sex'] == 'male' and passenger['Deck'] in [
                'E'
        ] and passenger['AgeBucket'] == 'midmid':
            survivor = 1

        predictions[passenger_id] = survivor
        if survivor:
            total_survivors = total_survivors + 1

    print('prediction rate: ', total_survivors, len(predictions))
    accurate = 0
    for _, passenger in df.iterrows():
        passenger_id = passenger['PassengerId']
        prediction = predictions[passenger_id]
        if prediction == passenger['Survived']:
            accurate = accurate + 1
    #survived = sum(df['Survived'] == 1)
    print('accuracy: ', (1.0 * accurate / len(predictions)))

    return predictions


#custom_heuristic('./kaggle_titanic_train.csv')

# SUBMITTED CODE:
#     passenger_id = passenger['PassengerId']
#
#     # Set custom columns
#     passenger['Deck'] = ''
#     if not pandas.isnull(passenger['Cabin']):
#         passenger['Deck'] = passenger['Cabin'][0]
#
#     passenger['AgeBucket'] = ''
#     age = passenger['Age']
#     age_bucket = 'c'
#     if age < 18:
#         age_bucket = 'young'
#     elif age < 25:
#         age_bucket = 'midyoung'
#     elif age < 40:
#         age_bucket = 'midmid'
#     else:
#         age_bucket = 'old'
#     passenger['AgeBucket'] = age_bucket
#
#
#     survivor = 0
#
#     sex = passenger['Sex']
#     if sex == 'female':
#         # 1 or 2nd class?
#         if passenger['Pclass'] in [1, 2]:
#             survivor = 1
#         # Embarked in 'C' (Cherbourg)?
#         if passenger['Embarked'] == 'C':
#             survivor = 1
#         if passenger['Deck'] in ['B', 'C', 'D', 'E']:
#             survivor = 1
#         if passenger['Fare'] >= 40:
#             survivor = 1
#
#     # Bunch of findings
#     if passenger['Pclass'] == 2 and passenger['Parch'] == 2:
#         survivor = 1
#     if passenger['SibSp'] == 1 and passenger['Deck'] in ['B', 'D']:
#         survivor = 1
#     if passenger['Parch'] == 2 and passenger['AgeBucket'] == 'midyoung':
#         survivor = 1
#     if passenger['Embarked'] == 'C' and passenger['Deck'] == 'D':
#         survivor = 1
#     if passenger['AgeBucket'] == 'midmid' and passenger['Deck'] in ['B', 'D']:
#         survivor = 1
#     if passenger['Sex'] == 'male' and passenger['Deck'] in ['E'] and passenger['AgeBucket'] == 'midmid':
#         survivor = 1
#
#     predictions[passenger_id] = survivor
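# The same rules can be expressed as boolean masks instead of a per-row loop.
# A sketch (not part of the original) assuming df already carries the derived
# Deck, AgeBucket and FareBucket columns built in custom_heuristic above:
def heuristic_mask(df):
    """Vectorized version of the survival rules used above; returns 0/1 per row."""
    female = df['Sex'] == 'female'
    survived = (
        (female & df['Pclass'].isin([1, 2]))
        | (female & (df['Embarked'] == 'C'))
        | (female & df['Deck'].isin(['B', 'C', 'D', 'E']))
        | (female & (df['FareBucket'] == 100))
        | ((df['Pclass'] == 2) & (df['Parch'] == 2))
        | ((df['SibSp'] == 1) & df['Deck'].isin(['B', 'D']))
        | ((df['Parch'] == 2) & (df['AgeBucket'] == 'midyoung'))
        | ((df['Embarked'] == 'C') & (df['Deck'] == 'D'))
        | ((df['AgeBucket'] == 'midmid') & df['Deck'].isin(['B', 'D']))
        | ((df['Sex'] == 'male') & (df['Deck'] == 'E') & (df['AgeBucket'] == 'midmid'))
    )
    return survived.astype(int)

# Usage: predictions = dict(zip(df['PassengerId'], heuristic_mask(df)))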
    def _initialize_custom_data(self):
        windfarm = self.config["array_system_design"]["location_data"]

        self.location_data = extract_library_specs(
            "cables", windfarm, file_type="csv"
        )

        # Make sure no required columns are missing
        missing = set(self.COLUMNS).difference(self.location_data.columns)
        if missing:
            raise ValueError(
                f"The following columns must be included in the location data: {missing}"
            )

        self._format_windfarm_data()

        # Ensure there is no missing data in required columns
        missing_data_cols = [
            c
            for c in self.REQUIRED
            if pd.isnull(self.location_data[c]).sum() > 0
        ]
        if missing_data_cols:
            raise ValueError(f"Missing data in columns: {missing_data_cols}!")

        # Ensure there is no missing data in optional columns
        missing_data_cols = [
            c
            for c in self.OPTIONAL
            # `|` binds tighter than `==`, so the comparison must be
            # parenthesised to mean "is null OR equals zero".
            if (
                pd.isnull(self.location_data[c])
                | (self.location_data[c] == 0)
            ).sum()
            > 0
        ]
        if missing_data_cols:
            message = (
                f"Missing data in columns {missing_data_cols}; "
                "all values will be calculated."
            )
            warnings.warn(message)

        # Ensure the number of turbines matches what's expected
        if self.location_data.shape[0] != self.system.num_turbines:
            raise ValueError(
                f"The provided number of turbines ({self.location_data.shape[0]}) "
                f"does not match the plant data ({self.system.num_turbines})."
            )

        n_coords = self.location_data.groupby(
            ["turbine_latitude", "turbine_longitude"]
        ).ngroups
        duplicates = self.location_data.shape[0] - n_coords
        if duplicates > 0:
            raise ValueError(
                f"There are {duplicates} rows with duplicate coordinates."
            )

        # Ensure the number of turbines on a string is within the limits
        longest_string = self.location_data["order"].unique().size
        self.num_strings = self.location_data.groupby(
            ["substation_id", "string"]
        ).ngroups
        if longest_string > self.num_turbines_full_string:
            raise ValueError(
                "Strings can't contain more than "
                f"{self.num_turbines_full_string} turbines."
            )
        else:
            self.num_turbines_full_string = longest_string
            del self.num_turbines_partial_string
            del self.num_partial_strings