def test_isfloat_returns_True_if_given_float_string_padded_or_not(x, y, z):
    """A finite float's repr is accepted, with or without surrounding spaces."""
    assume(not math.isnan(x))
    assume(not math.isinf(x))
    padded = ' ' * y + repr(x) + ' ' * z
    assert fastnumbers.isfloat(repr(x))
    assert fastnumbers.isfloat(repr(x), str_only=True)
    assert fastnumbers.isfloat(padded)
def load_data_for_nn():
    """Load the Slack train set and build padded, one-hot-labelled
    train/validation sequences for the neural network.

    Returns:
        Tuple ``(X_train, train_labels, X_val, val_labels)`` where the X
        arrays are integer sequences padded to ``MAX_SEQUENCE_LENGTH`` and
        the labels are one-hot over 12 channel classes.
    """
    data = pd.read_csv(os.path.join(DIR_TRAIN, 'train_set.csv'),
                       usecols=range(1, 11),
                       parse_dates=['timestamp', 'thread_timestamp'])
    data = data[data.channel.isin([
        'career', 'big_data', 'deep_learning', 'kaggle_crackers',
        'lang_python', 'lang_r', 'nlp', 'theory_and_practice', 'welcome',
        'bayesian', '_meetings', 'datasets'
    ]) & data.main_msg]
    # Time-based split: messages up to 2017-04-01 train, the rest validation.
    date_before = date(2017, 4, 1)
    train = data[data['timestamp'] <= date_before]
    val = data[data['timestamp'] > date_before]
    train_data = train[['channel', 'text']].reset_index()[['channel', 'text']]
    train_data['channel'] = train_data.channel.map(MAPPINGS)
    train_data = train_data.sort_values('channel').reset_index()[[
        'channel', 'text'
    ]]
    val_data = val[['channel', 'text']].reset_index()[['channel', 'text']]
    val_data['channel'] = val_data.channel.map(MAPPINGS)
    val_data = val_data.sort_values('channel').reset_index()[[
        'channel', 'text'
    ]]
    # Strip mentions (<...>), emoji codes (:...:), quote markers and e-mail
    # addresses, then collapse whitespace.  FIX: raw strings — the original
    # used '\S'/'\s'/'\.' in plain strings, which are invalid escape
    # sequences (SyntaxWarning in modern Python).
    train_data.text = train_data.text.astype(str) \
        .apply(lambda x: re.sub(r'(<\S+>:?)|(\s?:\S+:\s?)|(>)|([\w\.]*@[\w\.]*)', ' ', x)) \
        .apply(lambda x: re.sub(r'\s+', ' ', x))
    # Drop rows that are purely numeric or too short to be meaningful text.
    train_data = train_data[~train_data.text.apply(
        lambda x: isfloat(x) or isint(x) or len(x) < 20)]
    val_data.text = val_data.text.astype(str) \
        .apply(lambda x: re.sub(r'(<\S+>:?)|(\s?:\S+:\s?)|(>)|([\w\.]*@[\w\.]*)', ' ', x)) \
        .apply(lambda x: re.sub(r'\s+', ' ', x))
    val_data = val_data[~val_data.text.apply(
        lambda x: isfloat(x) or isint(x) or len(x) < 20)]
    train_text = train_data['text'].astype(str).apply(lambda x: x.lower())
    train_labels = np.asarray(train_data['channel'], dtype='int8')
    val_text = val_data['text'].astype(str).apply(lambda x: x.lower())
    val_labels = np.asarray(val_data['channel'], dtype='int8')
    vocab, vocab_size = create_vocab_set()
    X_train = text2sequence(train_text, vocab)
    X_val = text2sequence(val_text, vocab)
    X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH, value=0)
    X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH, value=0)
    train_labels = to_categorical(train_labels, num_classes=12)
    val_labels = to_categorical(val_labels, num_classes=12)
    return X_train, train_labels, X_val, val_labels
def readfile(filedir):
    """Read a CSV of car data and return ``(prices, hps)`` sorted by price.

    Column 21 holds horsepower and column 25 holds price.  Rows whose value
    is unparsable or outside a plausible range (hp in (20, 300), price in
    (1, 60)) are imputed with the column average, paired with the row's
    other value.
    """
    with open(filedir, "r") as f:
        prices = []
        hps = []
        missing_hp = []     # rows whose hp was invalid / out of range
        missing_price = []  # rows whose price was invalid / out of range
        for line in f:
            line = line.split(',')
            # default=None marks parse failure explicitly (the original used
            # default=0 and probed it with fn.isfloat, relying on 0 being int).
            hp = fn.fast_float(line[21], default=None)
            price = fn.fast_float(line[25], default=None)
            if hp is not None and 20 < hp < 300:
                hps.append(hp)
            else:
                missing_hp.append(line)
            if price is not None and 1 < price < 60:
                prices.append(price)
            else:
                missing_price.append(line)
        avghp = sum(hps) / len(hps)
        avgprice = sum(prices) / len(prices)
        # Impute the average hp; price comes from column 25.
        # FIX: the original read line[21] (the hp column) here.
        for line in missing_hp:
            hps.append(avghp)
            prices.append(fn.fast_float(line[25]))
        # Impute the average price; hp comes from column 21.
        # FIX: the original read line[25] (the price column) here.
        for line in missing_price:
            hps.append(fn.fast_float(line[21]))
            prices.append(avgprice)
        # FIX: zip() is an iterator in Python 3 and has no .sort();
        # sorted() works on both Python 2 and 3.
        pairs = sorted(zip(prices, hps))
        prices = [p for p, h in pairs]
        hps = [h for p, h in pairs]
        return prices, hps
def load_data():
    """Load the train set, keep the 100 most active users, and return a
    time-based split ``(train_text, train_labels, val_text, val_labels)``.

    Labels are dense integer ids assigned to the top-100 users in order of
    activity.
    """
    data = pd.read_csv('../data/train_set.csv',
                       usecols=range(1, 11),
                       parse_dates=['timestamp', 'thread_timestamp'])
    data = data[data.channel.isin([
        'career', 'big_data', 'deep_learning', 'kaggle_crackers',
        'lang_python', 'lang_r', 'nlp', 'theory_and_practice', 'welcome',
        'bayesian', '_meetings', 'datasets'
    ]) & data.main_msg]
    users_100 = list(data.user_id.value_counts()[:100].index)
    data = data[data["user_id"].isin(users_100)]
    # Map each of the top-100 user ids to a dense label 0..99.
    mappings = {value: c for c, value in enumerate(users_100)}
    # Time-based split: messages up to 2017-04-01 train, the rest validation.
    date_before = date(2017, 4, 1)
    train = data[data['timestamp'] <= date_before]
    val = data[data['timestamp'] > date_before]
    train_data = train[['user_id', 'text']].reset_index()[['user_id', 'text']]
    train_data['user_id'] = train_data.user_id.map(mappings)
    train_data = train_data.sort_values('user_id').reset_index()[[
        'user_id', 'text'
    ]]
    val_data = val[['user_id', 'text']].reset_index()[['user_id', 'text']]
    val_data['user_id'] = val_data.user_id.map(mappings)
    val_data = val_data.sort_values('user_id').reset_index()[[
        'user_id', 'text'
    ]]
    # Strip mentions, quote markers and e-mails, then collapse whitespace.
    # FIX: raw strings — the original used '\S'/'\s'/'\.' in plain strings,
    # which are invalid escape sequences (SyntaxWarning in modern Python).
    train_data.text = train_data.text.astype(str) \
        .apply(lambda x: re.sub(r'(<\S+>:?)|(>)|([\w\.]*@[\w\.]*)', ' ', x)) \
        .apply(lambda x: re.sub(r'\s+', ' ', x))
    # Drop rows that are purely numeric or too short to be meaningful text.
    train_data = train_data[~train_data.text.apply(
        lambda x: isfloat(x) or isint(x) or len(x) < 20)]
    val_data.text = val_data.text.astype(str) \
        .apply(lambda x: re.sub(r'(<\S+>:?)|(>)|([\w\.]*@[\w\.]*)', ' ', x)) \
        .apply(lambda x: re.sub(r'\s+', ' ', x))
    val_data = val_data[~val_data.text.apply(
        lambda x: isfloat(x) or isint(x) or len(x) < 20)]
    train_text = train_data['text'].astype(str).apply(lambda x: x.lower())
    train_labels = np.asarray(train_data['user_id'], dtype='int8')
    val_text = val_data['text'].astype(str).apply(lambda x: x.lower())
    val_labels = np.asarray(val_data['user_id'], dtype='int8')
    return train_text, train_labels, val_text, val_labels
def func(value):
    """Classify ``value`` into a primitive data-type name.

    Returns the detected type string ("bool", "int", "float", "date",
    "array", "string" or "null"), or — when ``get_type`` is False — a bool
    telling whether the detected type equals ``data_type``.

    NOTE(review): ``get_type`` and ``data_type`` are free variables resolved
    from the enclosing scope — confirm against the caller.
    """
    if isinstance(value, bool):
        detected = "bool"
    elif fastnumbers.isint(value):
        detected = "int"
    elif fastnumbers.isfloat(value):
        detected = "float"
    elif isinstance(value, str):
        # Strings may encode richer types; probe them in priority order.
        if str_to_boolean(value):
            detected = "bool"
        elif str_to_date(value):
            detected = "date"
        elif str_to_array(value):
            detected = "array"
        else:
            detected = "string"
    else:
        detected = "null"
    if get_type is False:
        return detected == data_type
    return detected
def __repr__(self):
    """Render the question in a readable form, e.g. "Is color == red?"."""
    # Numeric values produce threshold questions; everything else equality.
    op = ">=" if fn.isfloat(self.value) else "=="
    return "Is %s %s %s?" % (DATASET_HEADERS[self.column], op, str(self.value))
def _infer_type(value):
    """Map a raw value to a one-byte type tag: b"i" int, b"f" float, b"s" string."""
    #if not value or f4py.is_missing_value(value):
    #    return None
    if fastnumbers.isint(value):
        return b"i"
    return b"f" if fastnumbers.isfloat(value) else b"s"
def load_data_gbm():
    """Load the Slack train set for the GBM model and return a time-based
    split ``(train_text, val_text, train_labels, val_labels)``."""
    data = pd.read_csv(os.path.join(dir_train, 'train_set.csv'),
                       usecols=range(1, 11),
                       parse_dates=['timestamp', 'thread_timestamp'])
    data = data[data.channel.isin([
        'career', 'big_data', 'deep_learning', 'kaggle_crackers',
        'lang_python', 'lang_r', 'nlp', 'theory_and_practice', 'welcome',
        'bayesian', '_meetings', 'datasets'
    ]) & data.main_msg]
    # Time-based split: messages up to 2017-04-01 train, the rest validation.
    date_before = date(2017, 4, 1)
    train = data[data['timestamp'] <= date_before]
    val = data[data['timestamp'] > date_before]
    train_data = train[['channel', 'text']].reset_index()[['channel', 'text']]
    train_data['channel'] = train_data.channel.map(mappings)
    train_data = train_data.sort_values('channel').reset_index()[[
        'channel', 'text'
    ]]
    val_data = val[['channel', 'text']].reset_index()[['channel', 'text']]
    val_data['channel'] = val_data.channel.map(mappings)
    val_data = val_data.sort_values('channel').reset_index()[[
        'channel', 'text'
    ]]
    # Strip mentions, emoji codes, quote markers and e-mails, then collapse
    # whitespace.  FIX: raw strings — the original used '\S'/'\s'/'\.' in
    # plain strings, which are invalid escape sequences in modern Python.
    train_data.text = train_data.text.astype(str) \
        .apply(lambda x: re.sub(r'(<\S+>:?)|(\s?:\S+:\s?)|(>)|([\w\.]*@[\w\.]*)', ' ', x)) \
        .apply(lambda x: re.sub(r'\s+', ' ', x))
    # Drop rows that are purely numeric or too short to be meaningful text.
    train_data = train_data[~train_data.text.apply(
        lambda x: isfloat(x) or isint(x) or len(x) < 20)]
    val_data.text = val_data.text.astype(str) \
        .apply(lambda x: re.sub(r'(<\S+>:?)|(\s?:\S+:\s?)|(>)|([\w\.]*@[\w\.]*)', ' ', x)) \
        .apply(lambda x: re.sub(r'\s+', ' ', x))
    val_data = val_data[~val_data.text.apply(
        lambda x: isfloat(x) or isint(x) or len(x) < 20)]
    train_text = train_data['text'].astype(str).apply(lambda x: x.lower())
    train_labels = np.asarray(train_data['channel'], dtype='int8')
    val_text = val_data['text'].astype(str).apply(lambda x: x.lower())
    val_labels = np.asarray(val_data['channel'], dtype='int8')
    return train_text, val_text, train_labels, val_labels
def parse_argval(val: str):
    """Best-effort conversion of a string argument to a Python value.

    Tries, in order: non-negative int, float, the literal ``"set()"``,
    JSON; falls back to returning the string unchanged.
    """
    if val.isdigit():
        return int(val)
    if isfloat(val):
        return float(val)
    if val == "set()":
        return set()
    try:
        parsed = orjson.loads(val)
    except ValueError:
        return val
    return parsed
def guess_type(val, empty_as_null: bool) -> ColumnType:
    """Guess the column type of a single value.

    Args:
        val: a cell value — None, int, float or str.
        empty_as_null: treat whitespace-only strings as NULL.

    Returns:
        ColumnType.NULL, ColumnType.NUMBER or ColumnType.STRING.
    """
    if val is None:
        return ColumnType.NULL
    assert isinstance(val, (int, float, str)), "Invalid column data"
    # FIX: fastnumbers.isfloat(5) is False for a plain Python int, so the
    # original fell through to the string branch and crashed on val.strip().
    if isinstance(val, (int, float)) or fastnumbers.isfloat(val):
        return ColumnType.NUMBER
    if len(val.strip()) == 0 and empty_as_null:
        return ColumnType.NULL
    return ColumnType.STRING
def get_quantity(s):
    """Parse "5", "5 eV", "5+/-1 eV" or "5(1) eV" into a Measurement.

    A bare number gets a NaN uncertainty appended so every input flows
    through the same ``ufloat_fromstr`` path.  Returns None when the value
    part cannot be parsed.
    """
    parts = s.split()
    # Pad to exactly two slots: [value, unit-or-None].
    while len(parts) < 2:
        parts.append(None)
    if isfloat(parts[0]):
        parts[0] += "+/-nan"
    try:
        parts[0] = ufloat_fromstr(parts[0])
        return ureg.Measurement(*parts)
    except ValueError:
        return None
def norm_val(val, empty_as_null: bool) -> Union[bytes, int, float, None]:
    """Normalize a raw cell value.

    Numbers and numeric strings become floats, empty strings optionally
    become None, and anything else is returned UTF-8 encoded (stripped).
    """
    if val is None:
        return None
    if fastnumbers.isint(val) or fastnumbers.isfloat(val):
        # fastnumbers.float is the drop-in replacement for builtin float().
        return fastnumbers.float(val)
    val = val.strip()
    if empty_as_null and not val:
        return None
    return val.encode("utf-8", "ignore")
def infer(value):
    """Infer a Spark data type from a value.

    :param value: value to be inferred
    :return: Spark data type
    """
    if value is None:
        dtype = "null"
    elif is_bool(value):
        dtype = "bool"
    elif isint(value):
        dtype = "int"
    elif isfloat(value):
        dtype = "float"
    elif is_list(value):
        # Infer element type from the first item.
        dtype = ArrayType(infer(value[0]))
    elif is_datetime(value):
        dtype = "datetime"
    elif is_date(value):
        dtype = "date"
    elif is_binary(value):
        dtype = "binary"
    elif is_str(value):
        # Strings that encode richer types are still stored as strings.
        if str_to_boolean(value):
            dtype = "bool"
        elif str_to_date(value):
            dtype = "string"  # date
        elif str_to_array(value):
            dtype = "string"  # array
        else:
            dtype = "string"
    return get_spark_dtypes_object(dtype)
def to_spark(value):
    """Infer a Spark data type from a value.

    :param value: value to be inferred
    :return: Spark data type
    """
    if value is None:
        dtype = "null"
    elif is_bool_value(value):
        dtype = "bool"
    elif fastnumbers.isint(value):
        dtype = "int"
    elif fastnumbers.isfloat(value):
        dtype = "float"
    elif is_list_value(value):
        # Infer element type from the first item.
        dtype = ArrayType(to_spark(value[0]))
    elif is_datetime(value):
        dtype = "datetime"
    elif is_date(value):
        dtype = "date"
    elif is_binary(value):
        dtype = "binary"
    elif is_str(value):
        # Strings that encode richer types are still stored as strings.
        if is_bool_str(value):
            dtype = "bool"
        elif is_datetime(value):
            dtype = "string"  # date
        elif is_list_str(value):
            dtype = "string"  # array
        else:
            dtype = "string"
    return parse_spark_class_dtypes(dtype)
def parse_column_type(name, values):
    """Classify a column as b"i" (ID), b"d" (discrete) or b"n" (numeric).

    The b"Sample" column is always an ID column.  Empty and b"NA" cells are
    ignored when deciding.
    """
    if name == b"Sample":
        return b"i"
    non_missing = [v for v in values if v != b"" and v != b"NA"]
    unique = set(non_missing)
    # any() replaces the original flag-and-break loop.
    if any(not fastnumbers.isfloat(v) for v in unique):
        # Non-numeric and all-unique looks like an identifier column.
        return b"i" if len(unique) == len(non_missing) else b"d"
    return b"n"
def test_isfloat_returns_False_if_given_non_number_string(x):
    """Strings that are not numbers at all are rejected."""
    assume(not a_number(x))
    assert not fastnumbers.isfloat(x)
def test_isfloat_returns_False_for_nan_string_unless_allow_nan_is_True():
    """'nan' strings need allow_nan=True to be accepted, in any case/sign."""
    assert not fastnumbers.isfloat('nan')
    for nan_str in ('nan', '-NaN'):
        assert fastnumbers.isfloat(nan_str, allow_nan=True)
def test_isfloat_given_unicode_non_numeral_returns_False(x):
    """A unicode character that is not a numeral is not a float."""
    assert not fastnumbers.isfloat(x)
def test_isfloat_given_unicode_of_more_than_one_char_returns_False(x):
    """Multi-character unicode numeral strings are rejected."""
    assume(not a_number(x))
    assert not fastnumbers.isfloat(x)
def test_isfloat_returns_False_if_given_string_and_num_only_is_True(x):
    """num_only=True rejects every string, even a valid float repr."""
    assume(not math.isnan(x))
    assume(not math.isinf(x))
    assert not fastnumbers.isfloat(repr(x), num_only=True)
def test_isfloat_given_unicode_numeral_returns_True(x):
    """A single unicode numeral is a float, padded or not."""
    assert fastnumbers.isfloat(x)
    # Padded with spaces as well.
    assert fastnumbers.isfloat(u' ' + x + u' ')
def test_isfloat_returns_True_if_given_int_string_padded_or_not(x, y, z):
    """An int's repr also qualifies as a float string, padded or not."""
    padded = ' ' * y + repr(x) + ' ' * z
    assert fastnumbers.isfloat(repr(x))
    assert fastnumbers.isfloat(repr(x), str_only=True)
    assert fastnumbers.isfloat(padded)
def str_to_decimal(_value):
    """Return True when ``_value`` parses as a float."""
    # isfloat already returns a bool; the original wrapped it in the
    # redundant ``True if ... else False``.
    return fastnumbers.isfloat(_value)
def test_isfloat_returns_True_if_given_float(x):
    """Actual float objects pass, including with num_only=True."""
    assert fastnumbers.isfloat(x)
    assert fastnumbers.isfloat(x, num_only=True)
def test_isfloat_returns_False_if_given_float_and_str_only_is_True(x):
    """str_only=True rejects non-string inputs, even real floats."""
    assert not fastnumbers.isfloat(x, str_only=True)
def test_isfloat_returns_False_if_given_int(x):
    """Python int objects are not floats."""
    assert not fastnumbers.isfloat(x)
def test_isfloat():
    """End-to-end checks of fastnumbers.isfloat across input categories."""
    # Float numbers: accepted unless str_only is requested.
    assert fastnumbers.isfloat(-367.3268)
    assert not fastnumbers.isfloat(-367.3268, str_only=True)
    assert fastnumbers.isfloat(-367.3268, num_only=True)
    # Signed float strings (second positional arg is str_only).
    assert fastnumbers.isfloat("+367.3268")
    assert fastnumbers.isfloat("+367.3268", True)
    assert not fastnumbers.isfloat("+367.3268", num_only=True)
    # Exponent notation and padded whitespace.
    assert fastnumbers.isfloat("-367.3268e207")
    assert fastnumbers.isfloat(" -367.04 ")
    # Int objects are not floats, but int strings are.
    assert not fastnumbers.isfloat(499)
    assert fastnumbers.isfloat("-499")
    assert fastnumbers.isfloat(" +3001 ")
    # Big integers and their string forms.
    assert not fastnumbers.isfloat(35892482945872302493)
    assert fastnumbers.isfloat("35892482945872302493")
    # The return value is a real bool, not just truthy/falsy.
    assert fastnumbers.isfloat(4029) is False
    assert fastnumbers.isfloat(4029.0) is True
    assert fastnumbers.isfloat(4029.0, str_only=True) is False
    assert fastnumbers.isfloat("4029") is True
    assert fastnumbers.isfloat("4029", True) is True
    # Invalid input types never raise, they return False.
    assert not fastnumbers.isfloat(["hey"])
    # Invalid strings, including numbers embedded in text.
    assert not fastnumbers.isfloat("not_a_number")
    assert not fastnumbers.isfloat("26.8 lb")
    # Infinity requires allow_inf=True.
    assert not fastnumbers.isfloat("inf")
    assert fastnumbers.isfloat("inf", allow_inf=True)
    assert fastnumbers.isfloat("-infinity", allow_inf=True)
    assert fastnumbers.isfloat("-INFINITY", allow_inf=True)
    # NaN requires allow_nan=True.
    assert not fastnumbers.isfloat("nAn")
    assert fastnumbers.isfloat("nan", allow_nan=True)
    assert fastnumbers.isfloat("-NaN", allow_nan=True)
    # A lone sign, 'e' or '.' is not a number.
    assert not fastnumbers.isfloat("+")
    assert not fastnumbers.isfloat("-")
    assert not fastnumbers.isfloat("e")
    assert not fastnumbers.isfloat(".")
    # Unicode numerals (circled, superscript, fraction, Roman).
    assert fastnumbers.isfloat(u"⑦")
    assert fastnumbers.isfloat(u"⁸")
    assert fastnumbers.isfloat(u"⅔")
    assert fastnumbers.isfloat(u"Ⅴ")
date_before = date(2017, 4, 1) train = data[data['timestamp'] <= date_before] val = data[data['timestamp'] > date_before] train_data = train[['channel', 'text']].reset_index()[['channel', 'text']] train_data['channel'] = train_data.channel.map(mappings) train_data = train_data.sort_values('channel').reset_index()[[ 'channel', 'text' ]] val_data = val[['channel', 'text']].reset_index()[['channel', 'text']] val_data['channel'] = val_data.channel.map(mappings) val_data = val_data.sort_values('channel').reset_index()[['channel', 'text']] train_data = train_data[~train_data.text. apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)] val_data = val_data[~val_data.text. apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)] train_text = train_data['text'].astype(str).apply(lambda x: x.lower()) train_labels = np.asarray(train_data['channel'], dtype='int8') val_text = val_data['text'].astype(str).apply(lambda x: x.lower()) val_labels = np.asarray(val_data['channel'], dtype='int8') train_text = train_text \ .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(>)|([\w\.]*@[\w\.]*)', ' ', x)) \ .apply(lambda x: re.sub('\s+', ' ', x)) val_text = val_text \ .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(>)|([\w\.]*@[\w\.]*)', ' ', x)) \
def test_isfloat_returns_False_for_inf_string_unless_allow_inf_is_True():
    """'inf' strings need allow_inf=True to be accepted, in any case/sign."""
    assert not fastnumbers.isfloat('inf')
    for inf_str in ('inf', '-INFINITY'):
        assert fastnumbers.isfloat(inf_str, allow_inf=True)
def crawl_nbastats_by_year(year, champion_team_name='dal', num_player_of_interest=10): champion_team_name = team_abbr_full[champion_team_name] url_root = 'http://espn.go.com/nba/team/stats/_/name/' best_stats = [] champion_teamstats = pd.DataFrame() non_champion_teamstats = [] non_champion_teamlist = [] for team_abbr, team_name in zip(teams_abbr, teams_full_name): catogory = '/cat/avgMinutes/' # ordering player with their avg. minutes URL = url_root + team_abbr + '/year/' + str(year) + category + team_name print 'parsing ' + URL + ' ...' request = urllib2.Request(URL) response = urllib2.urlopen(request) if response.url != URL: print 'no response on this address, redirect to: ', response.url continue response = response.read() soup = BeautifulSoup(response, 'html.parser') players = soup.findAll('tr', {'class': re.compile('^player-')}) stat_labels = soup.findAll('tr', {'class': ['colhead']}) total_labels = soup.findAll('tr', {'class': ['total']}) print soup.title.string #print '1: ', total_labels[0].select('td') #print '2: ', total_labels[1].select('td') player_list = [] player_dict = {} team_stats = OrderedDict() # avoid dict sorting the keys when adding them # Initialise 30 statistics for the team stats = ['', ''] stats[0] = stat_labels[0].select('td') # Table 1: game statistics stats[1] = stat_labels[1].select('td') # Table 2: shooting statistics stat_labels = stats for stat in stats[0]: team_stats[stat.get_text()] = 0.0 for stat in stats[1]: team_stats[stat.get_text()] = 0.0 numOfPlayer = len(players) / 2 # teams with players fewer than 10 are not included in the study if numOfPlayer < num_player_of_interest: print 'warning: players less than ' + str(num_player_of_interest) + ' !' 
continue player_namelist = [] for i, player in enumerate(players, 0): if i == numOfPlayer: break player_stats = player.findAll('td') player_namelist.append(player_stats[0].get_text().encode('ascii', 'ignore')) team_stats = pd.DataFrame(np.zeros([numOfPlayer, len(team_stats.keys())]), \ index=player_namelist, columns=team_stats.keys()) team_stats = team_stats.drop('PLAYER', 1) for i, player in enumerate(players, 0): player_idx = i % numOfPlayer j = i / numOfPlayer player_stats = player.findAll('td') # iterate over players within a team stat = np.zeros(len(player_stats)) for stat_label, player_stat in zip(stat_labels[j], player_stats): x = player_stat.get_text().encode('ascii', 'ignore') if isfloat(x) == True: x = float(x) team_stats.set_value(player_namelist[player_idx], stat_label.get_text(), x) '''filename = team_name + '_' + str(year) + '.csv' print 'saving ' + filename, ' ...' team_stats.to_csv(filename)''' # keep track of champion team with specified year team_stats.index.name = 'Players' team_stats.columns.name = 'Statistics' if team_name == champion_team_name: champion_teamstats = team_stats else: non_champion_teamstats.append(team_stats) non_champion_teamlist.append(team_name) # keep track of the best of each statistics if len(best_stats) == 0: best_stats = team_stats.max(axis=0, numeric_only=True).as_matrix() else: team_stats = team_stats.max(axis=0, numeric_only=True).as_matrix() # only take max if all the entries in 'team_stats' are non-nan if not np.isnan(team_stats).any(): best_stats = np.maximum(best_stats, team_stats) # element-wise max # normalise the stats by dividing the champion team's stats by the best stats among all teams if (not champion_teamstats.empty) and (len(best_stats) != 0): champion_teamstats = champion_teamstats.loc[:, 'GP'::].divide(best_stats, axis='columns') for team_stat, team_name in zip(non_champion_teamstats, non_champion_teamlist): team_stat = team_stat.loc[:, 'GP'::].divide(best_stats, axis='columns') team_stat = 
team_stat.iloc[0:num_player_of_interest] filename = 'non_champions/' + str(year) + '_' + team_name + '.csv' team_stat.to_csv(filename) return champion_teamstats
def test_isfloat_with_no_arguments_fails():
    """An unknown keyword argument raises TypeError.

    NOTE(review): the test name says "no arguments" but the body actually
    checks an invalid keyword — kept as-is to preserve test discovery.
    """
    with raises(TypeError):
        fastnumbers.isfloat(5, invalid='dummy')
def is_decimal(value):
    """True when ``value`` is a float or float string; NaN strings count."""
    return fastnumbers.isfloat(value, allow_nan=True)