def __init__(self, where, queryables=None, encoding=None, scope_level=0):
    where = _validate_where(where)

    self.encoding = encoding
    self.condition = None
    self.filter = None
    self.terms = None
    self._visitor = None

    # capture the environment if needed
    local_dict = DeepChainMap()

    if isinstance(where, Expr):
        local_dict = where.env.scope
        where = where.expr

    elif isinstance(where, (list, tuple)):
        for idx, w in enumerate(where):
            if isinstance(w, Expr):
                local_dict = w.env.scope
            else:
                w = _validate_where(w)
                where[idx] = w
        where = ' & '.join(map('({})'.format, com.flatten(where)))  # noqa

    self.expr = where
    self.env = Scope(scope_level + 1, local_dict=local_dict)

    if queryables is not None and isinstance(self.expr, str):
        self.env.queryables.update(queryables)
        self._visitor = ExprVisitor(self.env, queryables=queryables,
                                    parser='pytables', engine='pytables',
                                    encoding=encoding)
        self.terms = self.parse()
def __init__(self, lhs, rhs, truediv, *args, **kwargs):
    super(Div, self).__init__('/', lhs, rhs, *args, **kwargs)

    if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
        raise TypeError("unsupported operand type(s) for {0}:"
                        " '{1}' and '{2}'".format(self.op,
                                                  lhs.return_type,
                                                  rhs.return_type))

    if truediv or PY3:
        _cast_inplace(com.flatten(self), np.float_)
def __init__(self, lhs, rhs, truediv, *args, **kwargs):
    super().__init__('/', lhs, rhs, *args, **kwargs)

    if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
        raise TypeError("unsupported operand type(s) for {0}:"
                        " '{1}' and '{2}'".format(self.op,
                                                  lhs.return_type,
                                                  rhs.return_type))

    # do not upcast float32s to float64 un-necessarily
    acceptable_dtypes = [np.float32, np.float_]
    _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_)
def _align(terms):
    """Align a set of terms"""
    try:
        # flatten the parse tree (a nested list, really)
        terms = list(com.flatten(terms))
    except TypeError:
        # can't iterate so it must just be a constant or single variable
        if isinstance(terms.value, pd.core.generic.NDFrame):
            typ = type(terms.value)
            return typ, _zip_axes_from_type(typ, terms.value.axes)
        return np.result_type(terms.type), None

    # if all resolved variables are numeric scalars
    if all(term.isscalar for term in terms):
        return _result_type_many(*(term.value for term in terms)).type, None

    # perform the main alignment
    typ, axes = _align_core(terms)
    return typ, axes
def __init__(self, where, queryables=None, encoding=None, scope_level=0):
    where = _validate_where(where)

    self.encoding = encoding
    self.condition = None
    self.filter = None
    self.terms = None
    self._visitor = None

    # capture the environment if needed
    local_dict = DeepChainMap()

    if isinstance(where, Expr):
        local_dict = where.env.scope
        where = where.expr

    elif isinstance(where, (list, tuple)):
        for idx, w in enumerate(where):
            if isinstance(w, Expr):
                local_dict = w.env.scope
            else:
                w = _validate_where(w)
                where[idx] = w
        where = " & ".join(map("({})".format, com.flatten(where)))  # noqa

    self.expr = where
    self.env = Scope(scope_level + 1, local_dict=local_dict)

    if queryables is not None and isinstance(self.expr, str):
        self.env.queryables.update(queryables)
        self._visitor = ExprVisitor(
            self.env,
            queryables=queryables,
            parser="pytables",
            engine="pytables",
            encoding=encoding,
        )
        self.terms = self.parse()
def build_dataset_with_keras(self, max_vocab_size=50000):
    text = self.get_raw_text()
    words = text_to_word_sequence(
        text, lower=True, filters='\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    words = self.remove_stopwords(words)
    max_vocab_size = min(max_vocab_size, len(set(words)))

    # initialize Tokenizer with 'UNK' as the out-of-vocabulary token.
    # Since Keras reserves the 0th index for padding sequences, the index for 'UNK'
    # will be 1st index
    # max_vocab_size + 1 because Keras reserves the 0th index
    tokenizer = Tokenizer(num_words=max_vocab_size + 1,
                          oov_token='UNK',
                          lower=True,
                          filters='\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    sentences = self.parse_file_into_sentences()
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)

    # for downstream compatibility
    flatted_sequences = list(flatten(sequences))

    count = tokenizer.word_counts

    # for downstream compatibility
    filtered_count = {}
    dictionary = {}
    for k, v in tokenizer.word_index.items():
        if v <= max_vocab_size:
            if k == 'UNK':
                filtered_count['UNK'] = 0
                dictionary['UNK'] = 1
                continue
            filtered_count[k] = count[k]
            dictionary[k] = v
        else:
            filtered_count['UNK'] += 1

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    # for downstream compatibility
    count_as_tuples = list(
        zip(list(filtered_count.keys()), list(filtered_count.values())))

    assert max_vocab_size == len(count_as_tuples)

    return flatted_sequences, count_as_tuples, dictionary, reverse_dictionary
def report_corpus(corpus: list, name: str):
    pos_class_df = DataFrame(
        [pos for pos in flatten([[part[1] for part in line] for line in corpus])],
        columns=["POS"])

    grouped_by_pos_df = pos_class_df.groupby(
        "POS"
    ).size(
    ).to_frame(
        name="frequency"
    ).reset_index(
    ).sort_values(
        by="frequency"
    )

    grouped_by_pos_df.to_excel("./reports/{}-class-stats.xlsx".format(name))

    grouped_by_pos_df["POS"] = grouped_by_pos_df.apply(
        lambda row: "{} ({})".format(row["POS"], row["frequency"]), axis=1)

    sns.barplot(
        data=grouped_by_pos_df,
        x="frequency",
        y="POS",
        orient="h",
        saturation=1,
        palette="tab10",
    ).get_figure(
    ).savefig(
        "./reports/{}-corpus-barplot.svg".format(name)
    )
def getPatientData(self, patientInfo, categoryKey, dataKey):
    """Collect the values stored under dataKey for every entry in
    patientInfo[categoryKey]; return NaN when no data is available."""
    dfValue = []

    # return NaN if no data
    if len(patientInfo[categoryKey]) == 0:
        return np.nan

    for iEntry in np.arange(0, len(patientInfo[categoryKey])):
        if dataKey in patientInfo[categoryKey][iEntry]:
            dfValue.append(patientInfo[categoryKey][iEntry][dataKey])

    # return NaN if no results
    if len(dfValue) == 0:
        return np.nan

    return set(list(flatten(dfValue)))
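# A minimal usage sketch (hypothetical data; the method above is assumed to live on a
# loader object, here called `loader`).  Entries without the requested key are skipped,
# nested lists are flattened, and duplicates collapse into a set:
patientInfo = {
    "patient_drug": [
        {"drugindication": ["HYPERTENSION"]},
        {"drugindication": ["DIABETES", "HYPERTENSION"]},
        {"route": "ORAL"},
    ]
}
# loader.getPatientData(patientInfo, "patient_drug", "drugindication")
# -> {'HYPERTENSION', 'DIABETES'}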
def return_type(self):
    # clobber types to bool if the op is a boolean operator
    if self.op in (_cmp_ops_syms + _bool_ops_syms):
        return np.bool_
    return _result_type_many(*(term.type for term in com.flatten(self)))
]
rule_names = [
    i.split(' ')[0] for i in input[0].replace('departure ', 'd_').replace(
        'arrival ', 'a_').replace('-', ' ').split('\n')
]
my_ticket = list(map(int, input[1].split('\n')[1].split(',')))
tickets = [
    list(map(int, i.split(','))) + [True] for i in input[2].split('\n')[1:]
]
rules = {
    k: [i for i in range(v[0], v[1] + 1)] + [j for j in range(v[2], v[3] + 1)]
    for k, v in zip(rule_names, all_rules)
}
valids = sorted(
    [i for i in range(min(flatten(all_rules)), max(flatten(all_rules)) + 1)])

valid_tickets = list()
for t in tickets:
    for n in t[:-1]:
        if n not in valids:
            t[-1] = False
    if t[-1]:
        valid_tickets.append(t[:-1])

impossible = defaultdict(list)
possible = defaultdict(list)
definitely = defaultdict(int)
for i in range(len(rules)):
    for t in valid_tickets:
        for rule, includes in rules.items():
            if t[i] not in includes:
random_fits = []
for i in range(0, len(unemploy_log_diffset)):
    random_fits.append(
        np.exp((pm.auto_arima(unemploy_log_diffset[i],
                              start_p=1, start_q=1,
                              max_p=3, max_q=3, m=12,
                              n_jobs=-1,
                              error_action='ignore',
                              suppress_warnings=True,  # memory issues cause warnings
                              stepwise=False,
                              random=True,
                              random_state=42,
                              n_fits=10)).predict(n_periods=13)))
    print(i)

# add back to the data to prepare for prophet
from pandas.core.common import flatten

future_regressor_sets = {}
for i in set(future_dates):
    for j in range(0, len(next_13)):
        future_regressor_sets[i] = list(
            flatten(([unemploy_log_diffset[i]['Unemployment_rate']], random_fits[j])))

test = future_dates

# put the data in a format that prophet will understand -- ds and unemployment rates
# including future values
for key in test:
    test[key]['Unemployment_rate'] = pd.Series(future_regressor_sets[key])

# for key in test:
#     test[key]['Unemployment_rate'] = pd.Series(future_regressor_sets[key])
# test.values()

# use prophet to get those predictions
test_f = {}
for i in test:
    test_f[i] = prophet_models[i].predict(test[i])
def __init__(self, lhs, rhs, truediv, *args, **kwargs):
    super(Div, self).__init__('/', lhs, rhs, *args, **kwargs)

    if truediv or PY3:
        _cast_inplace(com.flatten(self), np.float_)
print('Fix input file')
print(zedd)
print('original input')

xs, cv, kf = run_kf(data=zedd,
                    dim_of_measurements=dim_of_measurements,
                    measured_var=(measured_var),
                    covar=(covar),
                    process_model=(process_model),
                    white_noise_var=white_noise_var,
                    dt=dt,
                    sensor_covar=(sensor_covar),
                    measurement_function=(measurement_function))

x, p = run_smoother(kf, xs, cv)

final_x = []
for i in x:
    final_x.append(list(flatten(i)))

print(final_x)
print('type: ', type(final_x))
print('FINAL_X', final_x[0])
print(final_x[0][0])
print(type(p))
# print('==============+P============', p[0])

process_files.process_output(final_x, p, output_loc)
# Kalman.visualise(x, p, zedd, real)
                         header=None)
id_map = dict(zip(id_map_csv.iloc[:, 1], id_map_csv.iloc[:, 0]))
del id_map_csv
gc.collect()
## Getting ID Mapping Files

## Get Training Embeddings IDs
train_feature_dict_dir = "/data/recsys2020/history_nn/TrainChunk*"
train_chunks_dirs = list(sorted(glob.glob(train_feature_dict_dir)))

for i, file in enumerate(train_chunks_dirs):
    print(i, file)
    with open(file, 'rb') as f:
        chunk = joblib.load(f)
    setid = set(flatten(chunk['tweet_ids']))
    print('setid', len(setid))
    setengagenum = set(np.unique(chunk['engagement_histories']))
    print('setengagenum', len(setengagenum))
    setengage = {id_map[k] for k in setengagenum if k in id_map}
    print('setengage', len(setengage))
    setid.update(setengage)
    print('setid', len(setid))
    ids_chunk = np.array(list(setid))
    with open("/data/recsys2020/history_nn/TrainEmbID" + str(i), "wb") as f2:
        joblib.dump(ids_chunk, f2)
    del ids_chunk
    del setid
    del setengagenum
def names(self):
    """Get the names in an expression"""
    if is_term(self.terms):
        return frozenset([self.terms.name])
    return frozenset(term.name for term in com.flatten(self.terms))
        ele2 = ele1.replace(in_[1], out_[1])
        ele3 = ele2.replace(in_[2], out_[2])
        ele4 = ele3.replace(in_[3], out_[3])
        ele5 = ele4.split('::')
        lis_.append(ele5)
    return lis_


####################################
## read in Selenium scrape data in json format
with open('groupOverview_l.json') as f:
    data = json.loads("[" + f.read().replace("][", "],\n[") + "]")

# print(data[0][2])

group = pd.Series(flatten([data[i][0] for i in range(len(data))]))
OverviewText = pd.Series(flatten([data[i][1] for i in range(len(data))]))
BGA = pd.Series([data[i][2] for i in range(len(data))])

TypeNames_ = []
Method_ = []
Location_ = []
OverallScore_ = []
for ind in range(len(data)):
    TML = multReplaceStr(data[ind][3])
    TypeNames_.append(
        [x.strip(' ') for x in (TML[i][0] for i in range(len(TML)))])
    TypeNames = pd.Series(TypeNames_)
    Method_.append(
        [
            x.upper().strip(' ').replace('JIGS', 'JIG').replace(
def return_type(self):
    # clobber types to bool if the op is a boolean operator
    if self.op in (_cmp_ops_syms + _bool_ops_syms):
        return np.bool_
    return result_type_many(*(term.type for term in com.flatten(self)))
def tripadvisor(urll):
    html = requests.get(urll).text
    soup = BeautifulSoup(html, "html.parser")
    data = soup.findAll('div', attrs={'class': '_6sUF3jUd'})

    hotel_name = []
    hotel_link = []
    for div in data:
        links = div.findAll('h2')
        for a in links:
            hotel_name.append(a.text)
    for div in data:
        links = div.findAll('a', attrs={'class': '_1QKQOve4'})
        for a in links:
            hotel_link.append("https://www.tripadvisor.fr" + a['href'])

    hotel_data = pd.DataFrame(list(zip(hotel_name, hotel_link)),
                              columns=['Hotel Name', 'Hotel Link'])

    # url = 'https://www.tripadvisor.fr/Attraction_Review-g196629-d12447489-Reviews-Canyoning_Saint_Lary-Saint_Lary_Soulan_Hautes_Pyrenees_Occitanie.html'
    # url = 'https://www.tripadvisor.fr/Attraction_Review-g1841271-d13280719-Reviews-Escape_Dimension_La_Croisee_des_Mondes-Saleilles_Perpignan_Pyrenees_Orientales_.html'

    mail = []
    phone = []
    website = []
    address = []
    for i in hotel_data['Hotel Link']:
        html_data = requests.get(i).text
        data = re.search(r'window\.__WEB_CONTEXT__=(\{.*?\});', html_data).group(1)
        data = json.loads(data.replace('pageManifest', '"pageManifest"'))
        soup = BeautifulSoup(html_data, "html.parser")

        def get_emails(val):
            if isinstance(val, dict):
                for k, v in val.items():
                    if k == 'email':
                        if v:
                            yield v
                    else:
                        yield from get_emails(v)
            elif isinstance(val, list):
                for v in val:
                    yield from get_emails(v)

        def get_phones(val):
            if isinstance(val, dict):
                for k, v in val.items():
                    if k == 'phone':
                        if v:
                            yield v
                    else:
                        yield from get_phones(v)
            elif isinstance(val, list):
                for v in val:
                    yield from get_phones(v)

        def get_websites(val):
            if isinstance(val, dict):
                for k, v in val.items():
                    if k == 'website':
                        if v:
                            yield v
                    else:
                        yield from get_websites(v)
            elif isinstance(val, list):
                for v in val:
                    yield from get_websites(v)

        try:
            for email in get_emails(data):
                email = base64.b64decode(email).decode('utf-8')
                email = re.search(r'mailto:(.*)_', email).group(1)
                mail1 = []
                mail1.append(email)
                mail.append(mail1[0])
        except:
            for email in get_emails(data):
                mail1 = []
                mail1.append(email)
                mail.append(mail1[0])
        # print(mail[0])

        try:
            for email in get_phones(data):
                email = base64.b64decode(email).decode('utf-8')
                phone1 = []
                phone1.append(email)
                phone.append(phone1[0])
        except:
            for email in get_phones(data):
                phone1 = []
                phone1.append(email)
                phone.append(phone1[0])
        # print(phone[0])

        try:
            for email in get_websites(data):
                email = base64.b64decode(email).decode('utf-8')
                website1 = []
                website1.append(email)
                website.append(website1[0])
        except:
            for email in get_websites(data):
                website1 = []
                website1.append(email)
                website.append(website1[0])
        # print(website[0])

        try:
            divs = soup.findAll("div", attrs={'class': 'LjCWTZdN'})
            div = divs[0]
            for span in div:
                address1 = []
                address1.append(span.text)
                address.append(address1)
            # try:
            #     for span in soup.findAll("div", attrs={'class': '_2hDw2pmg'}):
            #         address1 = []
            #         address1.append(span.text)
            #         address.append(address1[0])
            # except:
            #     address.append('not available')
        except:
            address.append('not available')

    address = list(flatten(address))
    hotel_data['Hotel Website'] = website
    hotel_data['Hotel Phone'] = phone
    hotel_data['Hotel Email'] = mail
    hotel_data['Hotel Address'] = address
    return hotel_data
def melt(
    frame: "DataFrame",
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index: bool = True,
) -> "DataFrame":
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if value_name in frame.columns:
        warnings.warn(
            "This dataframe has a column name that matches the 'value_name' column "
            "name of the resulting Dataframe. "
            "In the future this will raise an error, please set the 'value_name' "
            "parameter of DataFrame.melt to a unique name.",
            FutureWarning,
            stacklevel=3,
        )

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'id_vars' are not present "
                               f"in the DataFrame: {list(missing)}")
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'value_vars' are not present in "
                               f"the DataFrame: {list(missing)}")
        if col_level is not None:
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars)
        else:
            idx = frame.columns.get_indexer(id_vars + value_vars)
        frame = frame.iloc[:, idx]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [
                    f"variable_{i}" for i in range(len(frame.columns.names))
                ]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = cast("Series", concat([id_data] * K, ignore_index=True))
        else:
            id_data = np.tile(id_data._values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame._values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(
            frame.columns._get_level_values(i)).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        result.index = tile_compat(frame.index, K)

    return result
        currentRsi = float("{:.2f}".format(stock.rsi[-1]))
        if currentRsi > 75:
            data.append(str(currentRsi) + " 🔥")
        elif currentRsi < 35:
            data.append(str(currentRsi) + " 🧊")
        else:
            data.append(currentRsi)

        chartLink = "https://finance.yahoo.com/quote/" + ticker + "/chart?p=" + ticker
        data.append(chartLink)
        allData.append(data)

        # Shows chart only if current RSI is greater than or less than 70 or 30 respectively
        if currentRsi < 30 or currentRsi > 70:
            stock.graph(MAarr)

    except Exception as e:
        print('Error: ', str(e))

print(
    tabulate(allData,
             headers=flatten([
                 'Stock', 'Price', [str(x) + " MA" for x in MAarr], "RSI", "chart"
             ])))
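# A small illustration (with a hypothetical MAarr_demo) of why flatten is used for the
# tabulate headers above: it turns the mixed list of strings and a nested list of
# moving-average labels into one flat header row.
from pandas.core.common import flatten

MAarr_demo = [20, 50, 200]
print(list(flatten(['Stock', 'Price', [str(x) + " MA" for x in MAarr_demo], "RSI", "chart"])))
# -> ['Stock', 'Price', '20 MA', '50 MA', '200 MA', 'RSI', 'chart']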
def _expand_colspan_rowspan(self, rows, fill_rowspan=True):
    """Given a list of rows, return a list of rows that properly handle
    colspan/rowspan.

    Discussion on behavior of fill_rowspan in #17073

    Parameters
    ----------
    rows : list of rows, each of which is a list of elements in that row
    fill_rowspan : boolean
        Should a rowspan fill every item in the rowspan (True) or only the
        bottommost element (False)? Default is True.

    Returns
    -------
    res : list of rows, each of which is a list of elements in that row,
        respecting colspan/rowspan
    """
    res = []
    saved_span = []
    for row in rows:
        extracted_row = self._extract_td(row)
        cols_text = [
            _remove_whitespace(self._text_getter(col)) for col in extracted_row
        ]
        col_colspans = [
            int(col.get('colspan', 1)) for col in extracted_row
        ]
        col_rowspans = [
            int(col.get('rowspan', 1)) for col in extracted_row
        ]
        # expand cols using col_colspans
        # maybe this can be done with a list comprehension, dunno
        cols = list(
            zip(
                list(
                    flatten(
                        lmap(lambda text_nc: [text_nc[0]] * text_nc[1],
                             list(zip(cols_text, col_colspans))))),
                list(
                    flatten(
                        lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0],
                             list(zip(col_colspans, col_rowspans)))))))
        # cols is now a list of (text, number of rows)

        # now insert any previous rowspans
        for (col, (text, nr)) in saved_span:
            cols.insert(col, (text, nr))

        # save next saved_span
        def advance_item_to_next_row(item):
            (col, (text, nr)) = item
            if nr == 1:
                return None
            else:
                # only keep the text around if fill_rowspan is set
                return (col, (text if fill_rowspan else '', nr - 1))

        saved_span = lfilter(
            lambda i: i is not None,
            lmap(advance_item_to_next_row, list(enumerate(cols))))
        cols = [text for (text, nr) in cols]

        # generate cols with text only
        if any([col != '' for col in cols]):
            res.append(cols)

    return res
    for k in max_types
}
print('normed all maximum loss types (stoch+cont): \n', normed_max_types, '\n')

print('all maximum loss types (stoch): \n', max_types_no_cont, '\n')
normed_max_types_no_cont = {
    k: np.round(max_types_no_cont[k] / sum(max_types_no_cont.values()), 2)
    for k in max_types_no_cont
}
print('normed all maximum loss types (stoch): \n', normed_max_types_no_cont, '\n')


# In[9]:


bins = np.logspace(-9, 1, 30)
plt.hist(list(flatten(opening_typ_dict['stoch_opening'])),
         bins=bins,
         histtype='step',
         label='stoch op')
plt.hist(list(flatten(opening_typ_dict['cont_opening'])),
         bins=bins,
         histtype='step',
         label='cont op')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('opening angle of each interaction in degree')
plt.ylabel('counts')
plt.legend()
events_s = len([i for i in opening_typ_dict['stoch_opening'] if len(i) > 0])
events_c = len([i for i in opening_typ_dict['cont_opening'] if len(i) > 0])
plt.title('stoch events: {}, cont events: {}'.format(events_s, events_c))
def operand_types(self):
    return frozenset(term.type for term in com.flatten(self))
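# For reference, pandas.core.common.flatten (used as com.flatten throughout these
# expression helpers) walks arbitrarily nested iterables and treats strings as atoms.
# A tiny illustration with plain nested lists rather than the expression Op objects above:
from pandas.core.common import flatten

print(list(flatten([1, [2, [3, 'abc']], (4,)])))
# -> [1, 2, 3, 'abc', 4]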
for root, dirs, files in os.walk(all_paths_dir):
    for file in files:
        if file.endswith(".pkl"):
            all_paths_filenames.append(os.path.join(root, file))

img_paths_train = []
mask_paths_train = []
img_labels_train = []

for train_path in all_paths_filenames[:-1]:
    with open(train_path, 'rb') as alp:
        all_paths_splits = pickle.load(alp)
    img_paths_train.append(all_paths_splits['image paths'])
    mask_paths_train.append(all_paths_splits['mask paths'])
    img_labels_train.append(all_paths_splits['image labels'])

img_paths_train = list(flatten(img_paths_train))
mask_paths_train = list(flatten(mask_paths_train))
img_labels_train = list(flatten(img_labels_train))

with open(all_paths_filenames[-1], 'rb') as alp:
    all_paths_splits = pickle.load(alp)

img_paths_valid = all_paths_splits['image paths']
mask_paths_valid = all_paths_splits['mask paths']
img_labels_valid = all_paths_splits['image labels']

print("Total training samples : ", len(img_paths_train))
print("Total validation samples : ", len(img_paths_valid))


class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
def ComputeVals(backWindowVal, frontWindowVal, dfAOI, df, polys, patient, mode):
    # Define the back window and front window; not strictly necessary but it helps for clarity
    backWindow = backWindowVal
    frontWindow = frontWindowVal

    # Create a marker to split into consecutive AOI groups
    dfAOI['marker'] = (dfAOI['InfoUnit'] != dfAOI['InfoUnit'].shift()).cumsum()

    # Add the first and last index of every group into our df_master
    df_master = dfAOI.index.to_series().groupby(dfAOI['marker']).agg(
        ['first', 'last']).reset_index()

    # Add a new column with the patient name to our df_master
    df_master['PATIENT'] = patient

    # Find the RecordingTimestamp for every first and last index in our df_master
    # and store them in their respective df's
    df_firstIndexTime = df.loc[df_master['first'], ['RecordingTimestamp']]
    df_lastIndexTime = df.loc[df_master['last'], ['RecordingTimestamp']]

    # Add the RecordingTimestamps to our df_master
    df_master = df_master.assign(
        recordingTimeStampStart=df_firstIndexTime.values,
        recordingTimeStampEnd=df_lastIndexTime.values)

    # Assign the targets for looking back and front by subtracting the back window
    # and adding the front window, then assigning them to our df_master
    df_master = df_master.assign(
        timeTargetBack=df_master['recordingTimeStampStart'].values - backWindow,
        timeTargetFront=df_master['recordingTimeStampEnd'].values + frontWindow)

    # Create a copy of the index in the main dataframe to avoid losing the indices later
    df['copy_index'] = df.index

    # Use merge_asof to find the closest RecordingTimestamp in the main dataframe
    # behind/ahead of the target times stored in df_master. The result keeps only the
    # rows of the main dataframe closest to our targets; since we carry the copy of the
    # index, we still know the correct location in the main df after the merge. It is
    # basically a left join matched on the nearest value instead of equality.
    df_backward = pd.merge_asof(df_master,
                                df,
                                left_on='timeTargetBack',
                                right_on='RecordingTimestamp',
                                direction='backward')
    df_forward = pd.merge_asof(df_master,
                               df,
                               left_on='timeTargetFront',
                               right_on='RecordingTimestamp',
                               direction='forward')

    # Assign the copied indices, which are the correct ones, to our df_master
    df_master = df_master.assign(
        backWindowIndex=df_backward['copy_index'].values)
    df_master = df_master.assign(
        frontWindowIndex=df_forward['copy_index'].values)

    # Assign the info unit values for every index in the df_master
    # (this could have been done earlier)
    df_master = df_master.assign(InfoUnits=df.loc[df_master['first'],
                                                  'InfoUnit'].values)

    # Create a list to store the smaller dataframes from the back to front windows
    df_list = []

    # Zip the values to create a point in the main dataframe; useful later when
    # checking whether a fixation falls inside a polygon
    df['point'] = list(
        zip(df['FixationPointX..MCSpx.'], df['FixationPointY..MCSpx.']))

    # Append all the smaller dataframes to the list
    for index, row in df_master.iterrows():
        df_list.append(df.loc[row['backWindowIndex']:row['frontWindowIndex']]
                       .drop_duplicates('FixationIndex'))

    # Iterate through df_list and create a new dataframe of True/False values based on
    # which polygon every row lands in; append this to a list of hits.
    hitsList = []
    for x in df_list:
        pointList = list(map(myPoint, (x['point'].values)))
        _pnts = pointList
        pnts = gp.GeoDataFrame(geometry=_pnts)
        hitsList.append(
            pnts.assign(
                **{key: pnts.within(geom) for key, geom in polys.items()}))

    resList = []
    # Iterate through the True/False dataframes and keep only the truth values as a list
    for data in hitsList:
        res = list(
            flatten(
                pd.DataFrame(data.columns.where(
                    data == True).tolist()).values.tolist()))
        ans = [x for x in res if not isinstance(x, float)]
        noDuplicate = pd.Series(ans).drop_duplicates()
        resList.append(list(noDuplicate))

    # Store the hits in the df_master
    hitName = 'HITS' + '_' + str(backWindow)
    df_master[hitName] = resList
    hitToCSV = df_master[['PATIENT', 'InfoUnits', hitName]]

    # If we only ask for hits, just return the hits dataframe
    if (mode == 'hits'):
        return hitToCSV
    # Otherwise we run in full mode and also calculate the latencies
    elif (mode == 'full'):
        df_latency = df_master

        # Drop undefined AOI's
        # df_latency.drop(df_latency.loc[df_latency['InfoUnits']=='KITCHEN'].index, inplace=True)
        # df_latency.drop(df_latency.loc[df_latency['InfoUnits']=='EXTERIOR'].index, inplace=True)
        # df_latency.drop(df_latency.loc[df_latency['InfoUnits']=='CUPBOARD'].index, inplace=True)

        # Call the computeLatency function for back and front
        latencyValuesBack = computeLatency(df, polys, df_latency, 'back')
        latencyValuesFront = computeLatency(df, polys, df_latency, 'front')

        df_master['BackLatency'] = latencyValuesBack[0]
        df_master['BackIndex'] = latencyValuesBack[1]
        df_master['FrontLatency'] = latencyValuesFront[0]
        df_master['FrontIndex'] = latencyValuesFront[1]

        latencyToCSV = df_master[[
            'PATIENT', 'InfoUnits', 'BackLatency', 'FrontLatency'
        ]]
        patientResult = pd.merge(latencyToCSV,
                                 hitToCSV,
                                 right_index=True,
                                 left_index=True)
        patientResult.reset_index()
        resultsToCSV = patientResult.drop(['PATIENT_y', 'InfoUnits_y'], axis=1)
        resultsToCSV = resultsToCSV.rename(columns={
            'PATIENT_x': 'Patient',
            'InfoUnits_x': 'InfoUnits'
        })
        return resultsToCSV
def score_web_text(tokens):
    # only use the text scraped from the website
    # table of pools of words for the different company types, divided by SIC codes too;
    # see in DATASETS the differences and the references
    table_of_pool_words = [
        pool_words_R_ext, pool_words_I_ext, pool_words_DS_ext, pool_words_OS_ext,
        pool_words_RS_ext, pool_words_DA_ext, pool_words_MSP_ext, pool_words_CC_ext,
        pool_words_MS_ext, pool_words_VD_ext, pool_words_TC_ext
    ]

    # transform the words into a dataframe
    words_dataframe = pd.DataFrame(table_of_pool_words)
    pool_set = list(flatten(table_of_pool_words))

    unique_words = set()
    for x in pool_set:
        unique_words.add(x)

    gen_weights = np.zeros(len(unique_words))
    counter = 0
    for x in unique_words:
        id = np.where(words_dataframe == x)[0]
        gen_weights[counter] = len(id)
        counter += 1

    list_words = list(unique_words)
    points = np.asarray(list_words)

    table_of_dataframes = []
    # loop over each pool of words
    for iweight in range(len(table_of_pool_words)):
        id_pool = return_indices_of_a(list_words, table_of_pool_words[iweight])
        # here I created the pool and the weight for each word for each company type
        weight = 1. / gen_weights[id_pool]
        new_pool = points[id_pool]
        pool = pd.DataFrame([new_pool, weight])
        table_of_dataframes.append(pool)

    # actual sum of all the weights in each pool of words
    value_total = np.zeros(len(table_of_pool_words))
    for itotal in range(len(table_of_pool_words)):
        dataf = table_of_dataframes[itotal]
        value_total[itotal] = np.sum(dataf.iloc[1])

    n_str = len(tokens)
    flag_check = []
    if n_str == 0:
        print('!raised awareness! zero element in the analyzed text')
        flags_checks = np.zeros(len(table_of_pool_words))
    elif n_str > 0:
        if n_str == 1:
            print('!raised awareness! Only 1 element in the analyzed text')
        for ipool in range(len(table_of_pool_words)):
            dataf = table_of_dataframes[ipool]
            id_0 = return_indices_of_a(dataf.iloc[0], tokens)
            values = np.sum(dataf.iloc[1][id_0])
            flag_check.append(values / value_total[ipool])
        flags_checks = 100 * np.asarray(flag_check)

    return flags_checks
# I will run the distance_coord function for every element (DataFrame) in the gdf_list.
total_distance = []
for item in gdf_list:
    total_distance.append(distance_coord(item))


# In[21]:


# I will flatten the total_distance list of lists to a single flat list.
total_distance = list(flatten(total_distance))


# In[22]:


# I will create a function to compute the tmin, tmax and timedelta for every user.
def compute_tmin_tmax_timedelta(df):
    'This function computes the min, max timestamp and the timedelta for every user'
    tmin = df['datetime'].min()
    tmax = df['datetime'].max()
    timedelta = tmax - tmin
    return tmin, tmax, timedelta
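# A minimal usage sketch of compute_tmin_tmax_timedelta (hypothetical data; the real
# per-user frames come from gdf_list above and carry a 'datetime' column).
import pandas as pd

_demo = pd.DataFrame({
    'datetime': pd.to_datetime(['2020-01-01 08:00', '2020-01-01 09:30', '2020-01-02 10:00'])
})
tmin, tmax, timedelta = compute_tmin_tmax_timedelta(_demo)
# tmin      -> Timestamp('2020-01-01 08:00:00')
# tmax      -> Timestamp('2020-01-02 10:00:00')
# timedelta -> Timedelta('1 days 02:00:00')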
if args.overleaf:
    clear_directory(document_path)
    fetch_overleaf(args.overleaf, document_path)

temp_path = create_temporary_copy(document_path)

conf_source_path = os.path.join(document_path, "variables.json")
with open(conf_source_path) as f:
    conf_source = json.load(f)

# DataFrame initialisation
cols = conf_source["booleans"] + \
    list(conf_source["numbers"].keys()) + \
    list(conf_source["enums"].keys()) + \
    list(flatten(conf_source["choices"])) + \
    ["nbPages", "space"]
df = pd.DataFrame(columns=cols)

file_path = os.path.join(temp_path, filename)

inject_space_indicator(file_path)

# LaTeX bbl pregeneration
generate_bbl(file_path)

# ----------------------------------------
# PDF generation
# ----------------------------------------

if args.config:
    row = generate_pdf(json.loads(args.config), filename, temp_path)
    pdf_name = filename + ".pdf"
    shutil.copyfile(os.path.join(temp_path, pdf_name),
def emo_analysis(input_text):
    tokenizer = BertTokenizer.from_pretrained(
        "monologg/bert-base-cased-goemotions-original")
    model = BertForMultiLabelClassification.from_pretrained(
        "monologg/bert-base-cased-goemotions-original")

    goemotions = MultiLabelPipeline(model=model,
                                    tokenizer=tokenizer,
                                    threshold=0.3)

    # check the results
    ##print(goemotions(texts))

    # The original input essay (input_text) is applied here.
    re_text = input_text.split(".")

    # data preprocessing
    def cleaning(datas):
        fin_datas = []
        for data in datas:
            # replace everything except alphabetic characters with a space
            only_english = re.sub('[^a-zA-Z]', ' ', data)
            # append the cleaned data to the list
            fin_datas.append(only_english)
        return fin_datas

    texts = cleaning(re_text)

    # extract only the detected emotion labels
    emo_re = goemotions(texts)
    emo_all = []
    for list_val in range(0, len(emo_re)):
        ##print(emo_re[list_val]['labels'], emo_re[list_val]['scores'])
        #emo_all.append((emo_re[list_val]['labels'], emo_re[list_val]['scores']))
        # keep only the labels; the scores are dropped
        #emo_all.append(emo_re[list_val]['scores'])
        emo_all.append((emo_re[list_val]['labels']))

    from pandas.core.common import flatten

    # flatten the nested list of labels
    flat_list = list(flatten(emo_all))

    # excluding neutral, collect every emotion found in the input sentences
    # and work out which emotions are present
    unique = []
    for r in flat_list:
        if r == 'neutral':
            pass
        else:
            unique.append(r)

    # count frequencies and sort by frequency
    from collections import Counter
    count = Counter(unique)
    words = dict(count.most_common())

    ###### word cloud implementation: start #####
    # # total number of analyzable emotions - the original BERT GoEmotions model extracts 28 emotions
    # total_num_emotion_analyzed = 28

    # ########## wordCloud setup ########
    # from wordcloud import WordCloud
    # import matplotlib.pyplot as plt
    # import nltk
    # from nltk.corpus import stopwords
    # %matplotlib inline
    # import matplotlib
    # from IPython.display import set_matplotlib_formats
    # matplotlib.rc('font', family='Malgun Gothic')
    # set_matplotlib_formats('retina')
    # matplotlib.rc('axes', unicode_minus=False)

    # # display the key emotions of the text as a word cloud
    # # (the largest words are the most frequent analysis results)
    # wordcloud = WordCloud(background_color='white',
    #                       colormap="Accent_r",
    #                       width=1500, height=1000).generate_from_frequencies(words)
    # plt.imshow(wordcloud)
    # plt.axis('off')
    # plt.show()
    ###### word cloud implementation: end #####

    # return the key emotion values expressed in the essay
    return words
def __init__(self, lhs, rhs, truediv=True, *args, **kwargs):
    super(Div, self).__init__('/', lhs, rhs, *args, **kwargs)

    if truediv or PY3:
        _cast_inplace(com.flatten(self), np.float_)
# Split products into terms: Tokenize.
products['products_mod'] = products['products_mod'].str.split()

# # Merge the synonyms per se
# departments_synonyms = departments_synonyms.groupby('department')['synonyms'].apply(list)
# departments_synonyms = pd.merge(departments, departments_synonyms, on="department", how='outer').fillna('')

# Merge the department and aisle names into the dataframe.
products = pd.merge(products, departments, on="department_id", how='outer')
products = pd.merge(products, aisles, on="aisle_id", how='outer')

# https://stackoverflow.com/a/43898233/3780957
# https://stackoverflow.com/a/57225427/3780957
# Remove synonyms here in the list
products['products_mod'] = products[['products_mod', 'aisle', 'department']].values.tolist()
products['products_mod'] = products['products_mod'].apply(lambda x: list(flatten(x)))

# %%
# Stemming and lemmatisation of the product name
# https://stackoverflow.com/a/24663617/3780957
# https://stackoverflow.com/a/25082458/3780957
# https://en.wikipedia.org/wiki/Lemmatisation

lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')

products['products_lemma'] = products['products_mod'].apply(lambda row: [lemma.lemmatize(item) for item in row])
products['products_lemma'] = products['products_lemma'].apply(lambda row: [sno.stem(item) for item in row])

# %%
## EDA ----
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
) -> DataFrame:
    # TODO: what about the existing index?
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, ABCMultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'id_vars' are not present"
                               " in the DataFrame: {missing}"
                               "".format(missing=list(missing)))
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'value_vars' are not present in"
                               " the DataFrame: {missing}"
                               "".format(missing=list(missing)))
        frame = frame.loc[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, ABCMultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [
                    "variable_{i}".format(i=i) for i in range(len(frame.columns.names))
                ]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = concat([id_data] * K, ignore_index=True)
        else:
            id_data = np.tile(id_data.values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame.values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(
            frame.columns._get_level_values(i)).repeat(N)

    return frame._constructor(mdata, columns=mcolumns)
    intro_sel = intro[id_sic[element]]
    if intro_sel:
        pool = Pool(os.cpu_count())
        # cleaning process of the introduction
        pool.apply_async(worker, (
            intro_sel,
            all_token,
        ))
        pool.close()
        pool.join()
        all_tks.append(all_token)

token_dataframe = pd.DataFrame(list(flatten(all_tks)))
# here you can create the dataframe for N-Grams
# bigrams_dataframe = pd.DataFrame(bi_grams)
print(type(token_dataframe), len(token_dataframe))

counts_words = []
for toks in range(len(token_dataframe.iloc[0])):
    value_token = token_dataframe[toks].value_counts()
    # count each singular word
    counts_words.append(value_token)

Counted_words = pd.DataFrame(counts_words)
Counted_words.fillna(value=0, inplace=True)
def return_type(self):
    # clobber types to bool if the op is a boolean operator
    if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS):
        return np.bool_
    return result_type_many(*(term.type for term in com.flatten(self)))
meals_ds['Price Range'] = meals_ds['Price Range'].replace(
    to_replace=dict_price)  # replace all values by dictionary

print('Fill empty values of median by town')
for city in city_set:
    median_ = meals_ds[meals_ds['City'] == city]['Price Range'].median()
    indexes = meals_ds.loc[meals_ds['City'] == city].index
    meals_ds.loc[indexes, 'Price Range'] = meals_ds.loc[indexes,
                                                        'Price Range'].fillna(median_)

# prepare and analyze Cuisine Style
print('Creating cuisine style set')
meals_ds['Cuisine Style'] = meals_ds['Cuisine Style'].apply(
    str_to_list)  # Turn all values in column into list
cuisine_set = set(list(flatten(meals_ds['Cuisine Style'].dropna().values))
                  )  # Create a set with cuisine styles

print('Create dummies for Cuisine Style')
for item in cuisine_set:
    meals_ds[item] = meals_ds['Cuisine Style'].apply(find_item)

print('Fill empty values by most popular cuisine')
# Group cuisines by city and fill empty values by most popular cuisine in the city
for city in city_set:
    city_cuisine = pop_cuisine(city)
    indexes = meals_ds.loc[meals_ds['City'] == city].index
    # Dummy value for most popular cuisine is 1. For other cuisines value is 0
    meals_ds.loc[indexes, city_cuisine] = meals_ds.loc[indexes,
                                                       city_cuisine].fillna(1)
    meals_ds.loc[indexes, cuisine_set] = meals_ds.loc[indexes,
def flatten_to_unique(input_list):
    return_list = []
    for item in set(flatten(input_list)):
        return_list.append({item})
    return return_list
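# A minimal usage sketch of flatten_to_unique (hypothetical input).  Because the
# function iterates over a set, the order of the returned singleton sets is not stable.
nested = [['a', 'b'], ['b', ['c', 'a']]]
print(flatten_to_unique(nested))
# -> e.g. [{'a'}, {'b'}, {'c'}] in some order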