def getWordsDictionary(self, data, full_text = False):
    """Return the vocabulary of *data* and its word-frequency histogram.

    :param data: iterable of cell values. When ``full_text`` is True each
        cell is tokenized with ``splitRecursive``/``WORD_SEPARATORS``;
        otherwise every cell is treated as a single token.
    :param full_text: treat cells as free text and split them into words.
    :return: tuple ``(x, histogram)`` where ``x`` is the list of distinct
        tokens in first-occurrence order and ``histogram`` is
        ``{'x': x, 'y': counts}`` with counts parallel to ``x``.
    """
    # NOTE: the original version carried an unused `splitter` lambda whose
    # non-list branch recursed unconditionally (infinite recursion if ever
    # called); it has been removed as dead code.
    if full_text:
        # get all words in every cell, then count over the flattened list
        words = []
        for cell in data:
            words += splitRecursive(cell, WORD_SEPARATORS)
    else:
        # each cell is already a single token
        words = data
    # Single O(n) counting pass; the original used `words.count(i)` per
    # element, which is O(n^2). A dict preserves first-occurrence key order,
    # matching the original comprehension's result exactly.
    hist = {}
    for token in words:
        hist[token] = hist.get(token, 0) + 1
    x = list(hist.keys())
    histogram = {'x': x, 'y': list(hist.values())}
    return x, histogram
def get_text_histogram(data):
    """Tokenize every cell of *data* and return the histogram of all words.

    Each cell is split with ``splitRecursive`` using ``WORD_SEPARATORS``;
    the flattened word list is passed to ``get_hist``.
    """
    all_words = [
        word
        for cell in data
        for word in splitRecursive(cell, WORD_SEPARATORS)
    ]
    return get_hist(all_words)
def get_words_histogram(data, is_full_text=False):
    """Return a histogram of the tokens appearing in *data*.

    :param data: iterable of cell values. When ``is_full_text`` is True each
        cell is tokenized with ``splitRecursive``/``WORD_SEPARATORS``;
        otherwise every cell counts as a single token.
    :param is_full_text: treat cells as free text and split them into words.
    :return: ``{'x': tokens, 'y': counts}`` with tokens in first-occurrence
        order and counts parallel to them.
    """
    # NOTE: the original version carried an unused `splitter` lambda whose
    # non-list branch recursed unconditionally (infinite recursion if ever
    # called); it has been removed as dead code.
    if is_full_text:
        # get all words in every cell, then count over the flattened list
        words = []
        for cell in data:
            words += splitRecursive(cell, WORD_SEPARATORS)
    else:
        words = data
    # Single O(n) counting pass; the original used `.count(i)` per element,
    # which is O(n^2). Dict insertion order matches the original result.
    hist = {}
    for token in words:
        hist[token] = hist.get(token, 0) + 1
    return {'x': list(hist.keys()), 'y': list(hist.values())}
def norm(value, cell_stats):
    """Encode a single cell value as a list of floats, dispatched on the
    column's data type (``cell_stats[KEYS.DATA_TYPE]``).

    Encodings produced:
      NUMERIC   -> [normalized magnitude, sign bit, is-not-null flag]
      DATE      -> [normalized timestamp, year, month, day, minute, second,
                    is-not-null flag]
      TEXT      -> one-hot vector over cell_stats['dictionary'] plus extra
                   slots (rare word, null), or [] if no dictionary
      FULL_TEXT -> list of dictionary indices framed by start/end markers,
                   or [] if no dictionary

    :param value: raw cell value; None/empty/NaN-like values map to a
        type-specific null encoding.
    :param cell_stats: per-column statistics dict. Keys read here:
        KEYS.DATA_TYPE, and depending on type: 'max', 'min',
        'dictionaryAvailable', 'dictionary'.
    """
    if cell_stats[KEYS.DATA_TYPE] == DATA_TYPES.NUMERIC:
        # Treat the common textual/None spellings of "missing" as null.
        if (str(value) in [str(''), str(' '), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA'] or (
                value == None or value == '' or value == '\n' or value == '\r')):
            return [0, 0, 0]  # null: zero magnitude, zero sign, flag 0
        # Min-max scale; fall back to dividing by max when the range is
        # degenerate, and to the raw value when max is also 0.
        if cell_stats['max'] - cell_stats['min'] != 0:
            normalizedValue = (value - cell_stats['min']) / \
                (cell_stats['max'] - cell_stats['min'])
        elif cell_stats['max'] != 0:
            normalizedValue = value / cell_stats['max']
        else:
            normalizedValue = value
        # if normalizedValue > 10:
        #     raise ValueError('Something is wrong with normalized value')
        # Magnitude and sign are encoded as separate components; OFFSET keeps
        # the magnitude strictly positive so 0 stays distinguishable from null.
        sign = 1 if normalizedValue >= 0 else 0
        normalizedValue = abs(normalizedValue) + OFFSET
        return [normalizedValue, sign, 1.0]
    if cell_stats[KEYS.DATA_TYPE] == DATA_TYPES.DATE:
        # [timestamp, year, month, day, minute, second, is null]
        if (str(value) in [str(''), str(' '), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA'] or (
                value == None or value == '' or value == '\n' or value == '\r')):
            ret = [0]*7
            # NOTE(review): ret[-1] is already 0 here, so this assignment is a
            # no-op; the header comment suggests the last slot is an "is null"
            # flag — confirm whether this was meant to be 1.
            ret[-1] = 0
            return ret
        try:
            timestamp = int(parseDate(value).timestamp())
        except:
            # Unparseable date -> same all-zero "null" vector as above.
            ret = [0] * 7
            ret[-1] = 0
            return ret
        date = datetime.datetime.fromtimestamp(timestamp)
        date_max = datetime.datetime.fromtimestamp(cell_stats['max'])
        date_min = datetime.datetime.fromtimestamp(cell_stats['min'])
        attrs = ['year', 'month', 'day', 'minute', 'second']
        # Fixed calendar upper bounds for the cyclic components; 'year' falls
        # through to the observed min/max instead.
        maxes = {'day': 31, 'minute': 60, 'second': 60, 'month': 12}
        norm_vals = []
        if cell_stats['max'] - cell_stats['min'] != 0:
            norm_vals.append(
                (timestamp - cell_stats['min']) / (cell_stats['max'] - cell_stats['min'])
            )
        else:
            # NOTE(review): unlike the NUMERIC branch there is no guard for
            # cell_stats['max'] == 0 here — division by zero if max is 0;
            # confirm upstream guarantees a nonzero max for dates.
            norm_vals.append(
                timestamp / cell_stats['max']
            )
        for k_attr in attrs:
            curr = getattr(date, k_attr)
            if k_attr in maxes:
                d_max = maxes[k_attr]
                d_min = 0
            else:
                d_max = getattr(date_max, k_attr)
                d_min = getattr(date_min, k_attr)
            # NOTE(review): the degenerate-range fallback divides by d_max
            # with no zero guard — confirm d_max cannot be 0 here.
            if d_max - d_min != 0:
                norm_vals.append((curr - d_min) / (d_max - d_min))
            else:
                norm_vals.append((curr) / (d_max))
        norm_vals.append(1.0)  # is-not-null flag
        return norm_vals
    if cell_stats[KEYS.DATA_TYPE] == DATA_TYPES.TEXT:
        # is it a word
        if cell_stats['dictionaryAvailable']:
            # vector covers all dictionary words plus extra slots (rare word
            # and null), per TEXT_ENCODING_EXTRA_LENGTH
            vector_length = len(cell_stats['dictionary']) + TEXT_ENCODING_EXTRA_LENGTH
            arr = [0] * vector_length
            arr[-1] = 1.0
            if value in [None, '']:
                # Null value: zero out the last slot and return the otherwise
                # empty vector.
                # NOTE(review): the original comment said "set null as 1" but
                # the code writes 0 — confirm which encoding downstream
                # consumers expect.
                arr[vector_length - 1] = 0
                return arr
            # Otherwise build a one-hot vector; a word missing from the
            # dictionary maps to the dedicated rare-word slot.
            try:
                index = cell_stats['dictionary'].index(value)
            except:
                index = vector_length - 2
            arr[index] = 1
            return arr
        else:
            return []
    if cell_stats[KEYS.DATA_TYPE] == DATA_TYPES.FULL_TEXT:
        if (str(value) in [str(''), str(' '), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA'] or (
                value == None or value == '' or value == '\n' or value == '\r')):
            return [FULL_TEXT_NONE_VALUE]
        # is it a full text
        if cell_stats['dictionaryAvailable']:
            # all the words in the dictionary plus extra marker slots, per
            # FULL_TEXT_ENCODING_EXTRA_LENGTH
            vector_length = len(cell_stats['dictionary']) + FULL_TEXT_ENCODING_EXTRA_LENGTH
            # Encode the text as a flat sequence of dictionary indices framed
            # by start/end markers.
            values = splitRecursive(value, WORD_SEPARATORS)
            array_of_arrays = []
            # NOTE(review): first_word is computed but never used — confirm
            # whether it is leftover from an earlier encoding scheme.
            first_word = vector_length - 4
            array_of_arrays += [FULL_TEXT_IS_START]
            for word in values:
                # A word missing from the dictionary maps to the
                # FULL_TEXT_UN_FREQUENT marker index.
                try:
                    index = cell_stats['dictionary'].index(word)
                except:
                    index = FULL_TEXT_UN_FREQUENT
                array_of_arrays += [index]
            array_of_arrays += [FULL_TEXT_IS_END]
            # return [array_of_arrays]  # TODO: ask about this
            return array_of_arrays
        else:
            return []