def get_datasets_political_parties(path='data/political-data/'):
    """
    Load political-party data: one text file per class under ``path``.

    Each file's stripped lines become samples; the class label is the
    file's index in the sorted directory listing and the file name is the
    class name.

    Parameters
    ----------
    path : str
        Directory containing one text file per party/class.

    Returns
    -------
    dict with keys 'data' (flat list of stripped lines), 'target'
    (parallel list of integer class labels) and 'target_names'
    (file names, indexed by class label).
    """
    # Sort the listing so class indices are deterministic across runs
    # (os.listdir order is filesystem-dependent).
    file_names = sorted(os.listdir(path))
    print(file_names)
    datasets = {'data': [], 'target': [], 'target_names': []}
    for class_value, input_file in enumerate(file_names):
        # Context manager closes the handle (previously leaked).
        with open(path + input_file, "r") as fh:
            data = [s.strip() for s in fh.readlines()]
        datasets['data'].append(data)
        datasets['target'].append([class_value] * len(data))
        datasets['target_names'].append(input_file)
    datasets['data'] = utils.flatten_list(datasets['data'])
    datasets['target'] = utils.flatten_list(datasets['target'])
    print('The Target Names: ', datasets['target_names'])
    return datasets
def create_d2d_sparse_matrix(i2d, drug_to_interactions):
    """
    Build a symmetric drug-drug interaction matrix.

    Parameters
    ----------
    i2d : array-like
        Index -> drug id mapping; inverted via array_to_dict.
    drug_to_interactions : dict
        Maps a drug id to an iterable of drug ids it interacts with.

    Returns
    -------
    Dense (number_of_drugs x number_of_drugs) float matrix with 1 where an
    interaction exists, symmetrised with max so m[i, j] == m[j, i].
    """
    d2i = array_to_dict(i2d)
    number_of_drugs = len(d2i)
    print('creating matrix')
    # Sort once so rows and cols are derived from the same ordering
    # (the original sorted the items twice).
    interactions = sorted(drug_to_interactions.items())
    rows = flatten_list([[d2i[drug]] * len(targets)
                         for drug, targets in interactions])
    cols = [d2i[t] for t in flatten_list([targets for _, targets in interactions])]
    print('number of valid interactions:', len(cols))
    assert len(rows) == len(cols)
    data = [1] * len(cols)
    m = csr_matrix((data, (rows, cols)),
                   shape=(number_of_drugs, number_of_drugs),
                   dtype='f')
    print('m shape:', m.shape, 'm non zeros:', m.nnz)
    m = m.todense()
    # Vectorised symmetrisation: count asymmetric upper-triangle entries,
    # then take the element-wise max with the transpose. Replaces the
    # original O(n^2) Python double loop with C-speed numpy ops.
    count_non_sym = int(np.triu(np.asarray(m != m.T), k=1).sum())
    m = np.maximum(m, m.T)
    print('non sym count (matrix was made sym using max):', count_non_sym)
    assert np.allclose(m, m.T, atol=1e-8)  # matrix is symmetric
    return m
def get_tags_list(df_path):
    """
    Collect the set of BIO tags used across all dataset splits.

    Arg:
        df_path: directory holding the *_df_opinion.tsv split files.

    Returns:
        tag_values: unique tags plus a trailing 'PAD' entry.
        tag2idx: mapping tag -> integer index.
        entities: entity list produced by prep_df on the combined frame.
    """
    split_names = ('train_df_opinion.tsv', 'dev_df_opinion.tsv',
                   'test_syn_df_opinion.tsv', 'test_dia_df_opinion.tsv')
    # One frame spanning train/dev/both test splits.
    full_df = pd.concat([pd.read_csv(df_path + name, delimiter='\t')
                         for name in split_names])
    # prepare labels
    _, entities = prep_df(full_df)
    full_df = bio_tagging_df(full_df)
    labels = full_df.bio_tags.values
    unlisted = [list(chain.from_iterable(lab)) for lab in labels]
    flat = [flatten_list(lab) for lab in unlisted]
    # create tags
    tag_values = list(set(flatten_list([list(set(tags)) for tags in flat])))
    tag_values.append('PAD')
    tag2idx = {tag: idx for idx, tag in enumerate(tag_values)}
    return tag_values, tag2idx, entities
def get_sentences_biotags(tokenizer, sentences, labels, max_len):
    '''
    Tokenize flattened sentences together with their BIO tags.

    Args:
        tokenizer: tokenizer passed through to tokenize_and_preserve_labels
        sentences: text column from data
        labels: label column from data
        max_len: maximal sequence length
    '''
    # Un-nest one level, then flatten each sentence / label sequence.
    flat_sentences = [flatten_list(list(chain.from_iterable(sent)))
                      for sent in sentences]
    flat_labels = [flatten_list(list(chain.from_iterable(lab)))
                   for lab in labels]
    pairs = [tokenize_and_preserve_labels(tokenizer, sent, labs, max_len)
             for sent, labs in zip(flat_sentences, flat_labels)]
    tokenized_texts = [pair[0] for pair in pairs]
    tokenized_labels = [pair[1] for pair in pairs]
    return tokenized_texts, tokenized_labels
def validation_epoch_end(self, outputs, prefix="val") -> Dict:
    """Aggregate per-step outputs into epoch-level ROUGE/loss metrics."""
    self.step_count += 1
    # Mean of every tracked loss across the epoch's steps.
    losses = {name: torch.stack([step[name] for step in outputs]).mean()
              for name in self.loss_names}
    loss = losses["loss"]
    rouges = {name: np.array([step[name] for step in outputs]).mean()
              for name in ROUGE_KEYS + ["gen_time", "summ_len"]}
    rouge_tensor: torch.FloatTensor = torch.tensor(rouges["rouge2"]).type_as(loss)
    rouges.update({name: value.item() for name, value in losses.items()})
    losses.update(rouges)
    metrics = {f"{prefix}_avg_{name}": value for name, value in losses.items()}
    metrics["step_count"] = self.step_count
    self.save_metrics(metrics, prefix)  # writes to self.metrics_save_path
    preds = flatten_list([step["preds"] for step in outputs])
    target = flatten_list([step["target"] for step in outputs])
    return {
        "log": metrics,
        "preds": preds,
        f"{prefix}_loss": loss,
        f"{prefix}_rouge": rouge_tensor,
        "target": target,
    }
def leave_one_group_out_cv_single_time_point(X, y, group_names, train_predict_fn, use_features=None):
    """
    Leave-one-group-out cross-validation at a single time point.

    Parameters
    ----------
    X, y : pd.DataFrame
        Feature/label frames; both carry a 'group' column, X a 'Trial'
        column and y a 'label' column.
    group_names : array-like
        Group assignment whose unique values define the folds.
    train_predict_fn : callable
        Called as train_predict_fn(X_train=..., X_test=..., y_train=...);
        returns (predicted_probs, predicted_class).
    use_features : optional
        Column subset applied after the group split.

    Returns
    -------
    pd.DataFrame with one row per held-out trial.
    """
    if not isinstance(X, pd.DataFrame) or not isinstance(y, pd.DataFrame):
        raise KeyError(
            "leave_one_group_out_cv expects X and y to be data frames.")
    targets, probs, classes, trials, group_ids = [], [], [], [], []
    for held_out in np.unique(group_names):
        # Split rows by the held-out group.
        train_set = X[X['group'] != held_out]
        test_set = X[X['group'] == held_out]
        train_labels = y[y['group'] != held_out]
        test_labels = y[y['group'] == held_out]
        trials.append(list(test_set["Trial"]))
        if use_features is not None:
            train_set = train_set.loc[:, use_features]
            test_set = test_set.loc[:, use_features]
        y_test = np.asarray(test_labels["label"])
        predicted_probs, predicted_class = train_predict_fn(
            X_train=np.asarray(train_set),
            X_test=np.asarray(test_set),
            y_train=np.asarray(train_labels["label"]))
        targets.append(y_test)
        probs.append(predicted_probs)
        classes.append(predicted_class)
        group_ids.append([held_out] * len(predicted_class))
    return pd.DataFrame({
        "Group": flatten_list(group_ids),
        "Trial": flatten_list(trials),
        "Target": flatten_list(targets),
        "Predicted Probability": flatten_list(probs),
        "Predicted Class": flatten_list(classes)
    })
def get_all_words_in_path(path):
    """Collect every snake_case word fragment from names found under path."""
    trees = get_trees(path)
    word_names = remove_magic(flatten_list([get_all_names(tree)
                                            for tree in trees]))

    def split_snake_case_name_to_words(name):
        # Drop empty fragments produced by leading/trailing underscores.
        return [part for part in name.split('_') if part]

    return flatten_list([split_snake_case_name_to_words(word_name)
                         for word_name in word_names])
def create_subject_arrays(self, double_precision=True):
    '''
    Create arrays with errors per subject and per num_target; also create
    an array with the precision per subject and num_target directly.

    Fills self.dataset with, among others:
      - errors_subject_nitems / errors_all_subject_nitems /
        errors_nontarget_subject_nitems: object arrays indexed by
        (subject, n_items) holding per-trial error arrays.
      - precision_subject_nitems_*: float arrays of precision estimates
        under four different correction settings (see compute_precision
        keyword combinations below).
      - response_subject_nitems / item_angle_subject_nitems: raw responses
        and item angles per (subject, n_items) cell.
      - *_nitems variants: the same quantities pooled over subjects.

    NOTE(review): `double_precision` is only referenced by the
    commented-out scaling block below, so it currently has no effect.
    NOTE(review): uses `np.object` (removed in NumPy >= 1.24) and `xrange`,
    i.e. this code targets Python 2 with an old NumPy — confirm before
    porting.
    '''
    unique_subjects = np.unique(self.dataset['subject'])
    unique_n_items = np.unique(self.dataset['n_items'])
    # Object arrays: each (subject, n_items) cell holds a ragged array.
    self.dataset['errors_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)
    self.dataset['errors_all_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)
    self.dataset['errors_nontarget_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)
    # Float arrays pre-filled with NaN so un-filled cells are detectable.
    self.dataset['precision_subject_nitems_bays'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size))
    self.dataset['precision_subject_nitems_theo'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size))
    self.dataset['precision_subject_nitems_theo_nochance'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size))
    self.dataset['precision_subject_nitems_bays_notreatment'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size))
    self.dataset['response_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)
    self.dataset['item_angle_subject_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object)
    for n_items_i, n_items in enumerate(unique_n_items):
        for subject_i, subject in enumerate(unique_subjects):
            # Boolean mask selecting this subject's unmasked trials at
            # this set size. NOTE(review): if no 'masked' key exists the
            # .get default False makes the comparison trivially True.
            ids_filtered = ((self.dataset['subject']==subject) & (self.dataset['n_items'] == n_items) & (self.dataset.get('masked', False) == False)).flatten()
            # Get the errors: column 0 is the target error, columns 1:
            # are the non-target errors.
            self.dataset['errors_subject_nitems'][subject_i, n_items_i] = self.dataset['errors_all'][ids_filtered, 0]
            self.dataset['errors_all_subject_nitems'][subject_i, n_items_i] = self.dataset['errors_all'][ids_filtered]
            self.dataset['errors_nontarget_subject_nitems'][subject_i, n_items_i] = self.dataset['errors_all'][ids_filtered, 1:]
            # Get the responses and correct item angles
            self.dataset['response_subject_nitems'][subject_i, n_items_i] = self.dataset['response'][ids_filtered]
            self.dataset['item_angle_subject_nitems'][subject_i, n_items_i] = self.dataset['item_angle'][ids_filtered]
            # Compute the precision under four correction settings.
            self.dataset['precision_subject_nitems_bays'][subject_i, n_items_i] = self.compute_precision(self.dataset['errors_subject_nitems'][subject_i, n_items_i], remove_chance_level=True, correct_orientation=True, use_wrong_precision=True)
            self.dataset['precision_subject_nitems_theo'][subject_i, n_items_i] = self.compute_precision(self.dataset['errors_subject_nitems'][subject_i, n_items_i], remove_chance_level=False, correct_orientation=True, use_wrong_precision=False)
            self.dataset['precision_subject_nitems_theo_nochance'][subject_i, n_items_i] = self.compute_precision(self.dataset['errors_subject_nitems'][subject_i, n_items_i], remove_chance_level=True, correct_orientation=False, use_wrong_precision=False)
            self.dataset['precision_subject_nitems_bays_notreatment'][subject_i, n_items_i] = self.compute_precision(self.dataset['errors_subject_nitems'][subject_i, n_items_i], remove_chance_level=False, correct_orientation=True, use_wrong_precision=True)
    # if double_precision:
    #     precision_subject_nitems *= 2.
    #     precision_subject_nitems_theo *= 2.
    #     # self.dataset['precision_subject_nitems_theo_nochance'] *= 2.
    #     # self.dataset['precision_subject_nitems_bays_notreatment'] *= 2.
    # Pool errors over subjects for each set size.
    self.dataset['errors_nitems'] = np.array([utils.flatten_list(self.dataset['errors_subject_nitems'][:, n_item_i]) for n_item_i in xrange(unique_n_items.size)])
    self.dataset['errors_all_nitems'] = np.array([utils.flatten_list(self.dataset['errors_all_subject_nitems'][:, n_item_i]) for n_item_i in xrange(unique_n_items.size)])
    self.dataset['errors_nontarget_nitems'] = self.dataset['errors_all_nitems'][:, :, 1:]
    # Average the per-subject precisions across subjects.
    self.dataset['precision_nitems_bays'] = np.mean(self.dataset['precision_subject_nitems_bays'], axis=0)
    self.dataset['precision_nitems_theo'] = np.mean(self.dataset['precision_subject_nitems_theo'], axis=0)
    self.dataset['precision_nitems_theo_nochance'] = np.mean(self.dataset['precision_subject_nitems_theo_nochance'], axis=0)
    self.dataset['precision_nitems_bays_notreatment'] = np.mean(self.dataset['precision_subject_nitems_bays_notreatment'], axis=0)
def get_all_words_in_path(path):
    """Returns list of all words"""
    trees = [tree for tree in get_trees(path) if tree]
    all_names = flatten_list([get_all_names(tree) for tree in trees])
    # Drop dunder names such as __init__ / __str__.
    function_names = [name for name in all_names
                      if not (name.startswith('__') and name.endswith('__'))]

    def split_snake_case_name_to_words(name):
        return [part for part in name.split('_') if part]

    return flatten_list([split_snake_case_name_to_words(name)
                         for name in function_names])
def fix_sentence(self, s_tripleset, template, tag2ent):
    """
    Reconcile a sentence's triple set with its tagged template.

    Drops triples whose head/tail entity is unknown to tag2ent or whose
    tag does not occur in the template, substitutes entities that no
    longer participate in any triple back into the template text, and
    counts templates whose remaining tags disagree with the surviving
    entities (coreference cases).

    Returns (filtered s_tripleset, patched template, tag->entity mapping
    restricted to entities still used by some triple).
    """
    # Invert the tag -> entity mapping for entity lookups.
    ent2tags = {v: k for k, v in tag2ent.items()}
    # s_tripleset must meet "head && tail are in template && tag2ent"
    bad_triples = set()
    for triple_ix, triple in enumerate(s_tripleset):
        # Check both the head (triple[0]) and tail (triple[-1]) entity.
        for ent in [triple[0], triple[-1]]:
            if ent in ent2tags:
                if ent2tags[ent] not in template:
                    bad_triples.add(triple_ix)
                    continue
            else:
                bad_triples.add(triple_ix)
                continue
    s_tripleset = [
        triple for triple_ix, triple in enumerate(s_tripleset)
        if triple_ix not in bad_triples
    ]
    # tag2ent are entities only in triple_entities
    triple_entities = set(
        flatten_list([(triple[0], triple[-1]) for triple in s_tripleset]))
    tag2tri_ent = {
        k: v
        for k, v in tag2ent.items() if v in triple_entities
    }
    # templates only have triple_entities: inline the surface form of
    # entities that no longer back any triple.
    for tag, ent in tag2ent.items():
        if ent not in triple_entities:
            ent = ent.replace('_', ' ')
            template = template.replace(tag, ent)
    # Tags remaining in the template should exactly match the surviving
    # tag set; a mismatch is counted as a coreference case.
    if {word for word in template.split() if 'AGENT' in word or 'BRIDGE' in word or 'PATIENT' in word} \
            != set(tag2tri_ent.keys()):
        self.cnt_corefs += 1
    assert set(tag2tri_ent.values()) == triple_entities
    '''
    TODO: Erroraneous case:
    train.csv:7123:"Ayam penyet mainIngredients Squeezed"" or ""smashed"" fried chicken served with sambal",PATIENT_2 is PATIENT_3 .,"Fried chicken is Squeezed"" or ""smashed"" fried chicken served with sambal .",The chicken is smashed and served hot with sambal .,"Ayam penyet Fried chicken Squeezed"" or ""smashed"" fried chicken served with sambal",AGENT_1 PATIENT_2 PATIENT_3,ROOT mainIngredients mainIngredients_inv,mainIngredients,"[0, 2]","[2, 2, 8]","{""AGENT_1"": ""Ayam penyet"", ""PATIENT_2"": ""Fried chicken"", ""PATIENT_3"": ""Squeezed\"" or \""smashed\"" fried chicken served with sambal""}","[[0, 4], [4, 2], [2, 5], [5, 0]]","Ayam penyet <ENT_SEP> Fried chicken <ENT_SEP> Squeezed"" or ""smashed"" fried chicken served with sambal <ENT_REL_SEP> mainIngredients <REL_TRP_SEP> 0 2 0","Ayam penyet mainIngredients Squeezed"" or ""smashed"" fried chicken served with
    sambal <ENT_TGT_SEP> PATIENT_2 is PATIENT_3 . <TGT_TXT_SEP> The chicken is smashed and served hot with sambal ."
    train.csv:7359:Bakewell tart ingredient Frangipane,AGENT_1 contains PATIENT_3 .,Bakewell pudding contains Frangipane .,It contains frangipane .,Bakewell pudding Bakewell tart Frangipane,AGENT_1 BRIDGE_2 PATIENT_3,ROOT ingredient ingredient_inv,ingredient,"[1, 2]","[2, 2, 1]","{""AGENT_1"": ""Bakewell pudding"", ""BRIDGE_2"": ""Bakewell tart"", ""PATIENT_3"": ""Frangipane""}","[[1, 4], [4, 2], [2, 5], [5, 1]]",Bakewell pudding <ENT_SEP> Bakewell tart <ENT_SEP> Frangipane <ENT_REL_SEP> ingredient <REL_TRP_SEP> 1 2 0,Bakewell tart ingredient Frangipane <ENT_TGT_SEP> AGENT_1 contains PATIENT_3 . <TGT_TXT_SEP> It contains frangipane .
    {
        "sent": "demarce short stories in the the grantville gazettes precede eric flint novels .",
        "graph": [
            {
                "truth": "precededBy",
                "pred": "precededBy",
                "ent0_ent1": "1634: the bavarian crisis ENT0_END demarce short stories in the the grantville gazettes"
            },
            {
                "truth": "<unk>",
                "pred": "author",
                "ent0_ent1": "1634: the bavarian crisis ENT0_END eric flint"
            }
        ]
    }
    '''
    return s_tripleset, template, tag2tri_ent
def recurse_files(self, folder):
    """Recursively collect file paths under folder, skipping dot-entries."""
    if not isdir(folder):
        # Base case: a plain file is returned as a single-element list.
        return [folder]
    children = [folder + '/' + name for name in listdir(folder)
                if not name.startswith('.')]
    return flatten_list([self.recurse_files(child) for child in children])
def validation_epoch_end(self, outputs, prefix="val") -> Dict:
    """Aggregate per-step outputs into epoch-level metrics for `prefix`."""
    self.step_count += 1
    losses = {name: torch.stack([step[name] for step in outputs]).mean()
              for name in self.loss_names}
    loss = losses["loss"]
    generative_metrics = {name: np.array([step[name] for step in outputs]).mean()
                          for name in self.metric_names + ["gen_time", "gen_len"]}
    # The validation metric may be generative (ROUGE/BLEU-style) or one
    # of the tracked losses.
    if self.val_metric in generative_metrics:
        metric_val = generative_metrics[self.val_metric]
    else:
        metric_val = losses[self.val_metric]
    metric_tensor: torch.FloatTensor = torch.tensor(metric_val).type_as(loss)
    generative_metrics.update({name: value.item()
                               for name, value in losses.items()})
    losses.update(generative_metrics)
    all_metrics = {f"{prefix}_avg_{name}": value
                   for name, value in losses.items()}
    all_metrics["step_count"] = self.step_count
    self.metrics[prefix].append(
        all_metrics)  # callback writes this to self.metrics_save_path
    preds = flatten_list([step["preds"] for step in outputs])
    return {
        "log": all_metrics,
        "preds": preds,
        f"{prefix}_loss": loss,
        f"{prefix}_{self.val_metric}": metric_tensor,
    }
def validation_epoch_end(self, outputs, prefix="val") -> Dict:
    """Aggregate step outputs into epoch metrics, syncing across ranks.

    Averages tracked losses and generative metrics over the epoch's
    steps, all-reduces the validation metric when torch.distributed is
    active, persists the metrics and returns them together with the
    flattened predictions.
    """
    self.step_count += 1
    # Mean of every tracked loss across steps.
    losses = {
        k: torch.stack([x[k] for x in outputs]).mean()
        for k in self.loss_names
    }
    loss = losses["loss"]
    gen_metrics = {
        k: np.array([x[k] for x in outputs]).mean()
        for k in self.metric_names + ["gen_time", "gen_len"]
    }
    metrics_tensor: torch.FloatTensor = torch.tensor(
        gen_metrics[self.val_metric]).type_as(loss)
    gen_metrics.update({k: v.item() for k, v in losses.items()})
    # fix for https://github.com/PyTorchLightning/pytorch-lightning/issues/2424
    if dist.is_initialized():
        # Average the metric over all workers so every rank logs the
        # same value.
        dist.all_reduce(metrics_tensor, op=dist.ReduceOp.SUM)
        metrics_tensor = metrics_tensor / dist.get_world_size()
        gen_metrics.update({self.val_metric: metrics_tensor.item()})
    losses.update(gen_metrics)
    metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()}
    metrics["step_count"] = self.step_count
    self.save_metrics(metrics, prefix)  # writes to self.metrics_save_path
    preds = flatten_list([x["preds"] for x in outputs])
    return {
        "log": metrics,
        "preds": preds,
        f"{prefix}_loss": loss,
        f"{prefix}_{self.val_metric}": metrics_tensor
    }
def cross_validate_time_point(X, y, trial_folds, train_predict_fn, use_features=None):
    """
    Cross-validate at a single time point using pre-assigned trial folds.

    Parameters
    ----------
    X : pd.DataFrame or ndarray of features.
    y : array-like of labels.
    trial_folds : array-like fold id per trial; unique values define folds.
    train_predict_fn : callable(X_train=, X_test=, y_train=) returning
        (predicted_probs, predicted_class).
    use_features : optional column subset (DataFrame input only).

    Returns
    -------
    pd.DataFrame with one row per trial.
    """
    targets, probs, classes, trials, fold_ids = [], [], [], [], []
    for fold in np.unique(trial_folds):
        train_idx = np.where(trial_folds != fold)[0]
        test_idx = np.where(trial_folds == fold)[0]
        if isinstance(X, pd.DataFrame):
            if use_features is not None:
                X = X.loc[:, use_features]
            X_train = np.asarray(X.iloc[train_idx])
            X_test = np.asarray(X.iloc[test_idx])
        else:
            X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        # Fit model and predict the held-out fold.
        predicted_probs, predicted_class = train_predict_fn(
            X_train=X_train, X_test=X_test, y_train=y_train)
        trials.append(test_idx)
        targets.append(y_test)
        probs.append(predicted_probs)
        classes.append(predicted_class)
        fold_ids.append([fold] * len(test_idx))
    return pd.DataFrame({
        "Fold": flatten_list(fold_ids),
        "Trial": flatten_list(trials),
        "Target": flatten_list(targets),
        "Predicted Probability": flatten_list(probs),
        "Predicted Class": flatten_list(classes)
    })
def __init__(self, set: DataSetType):
    """Load the RDF files for the chosen split and build the dataset."""
    self.data_set_type = set.value
    raw_dir = path.join(path.dirname(path.realpath(__file__)), "raw",
                        set.value)
    files = self.recurse_files(raw_dir)
    data = flatten_list([RDFFileReader(f).data for f in files])
    super().__init__(data,
                     misspelling=misspelling,
                     rephrase=(rephrase, rephrase_if_must))
def get_membership(self, partition_vector=None, flatten=False):
    """
    Group positions by their partition value.

    Parameters
    ----------
    partition_vector : sequence, optional
        Partition value per position; defaults to self.partition_vector.
        Checked with ``is None`` so an explicitly passed empty vector is
        honoured — the previous truthiness test (``or``) silently fell
        back to self.partition_vector for any falsy argument.
    flatten : bool
        When True, return the flattened concatenation of the groups.

    Returns
    -------
    List of tuples of positions, largest group first; a flat list when
    flatten=True.
    """
    pvec = self.partition_vector if partition_vector is None else partition_vector
    groups = defaultdict(list)
    for position, value in enumerate(pvec):
        groups[value].append(position)
    result = [tuple(g) for g in
              sorted(groups.values(), key=len, reverse=True)]
    return flatten_list(result) if flatten else result
def simulate_from_result(self, partition_object, lsf=False, ntimes=1, **kwargs):
    """
    Simulates a set of records using parameters estimated when
    calculating concatenated trees from the Partition object.
    """
    inds = partition_object.get_membership()
    if lsf and ntimes > 1:
        per_ind = [self.simulate(ind, lsf=lsf, ntimes=ntimes)
                   for ind in inds]
        # Regroup so each element holds one complete simulation run.
        return [flatten_list(run) for run in zip(*per_ind)]
    return [flatten_list([self.simulate(ind, **kwargs) for ind in inds])
            for _ in range(ntimes)]
def pos_tags(self):
    """Return part-of-speech tags, for the entire document.

    >>> Analysis("I am fine. How are you?").pos_tags()
    ... # doctest: +NORMALIZE_WHITESPACE
    [('I', 'PRP'), ('am', 'VBP'), ('fine', 'NN'), ('.', '.'),
     ('How', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('?', '.')]
    """
    tags_per_sentence = self.pos_tags_by_sentence()
    return flatten_list(tags_per_sentence)
def get_membership(self, partition_vector=None, flatten=False):
    """Group positions sharing a partition value, largest groups first.

    Falls back to self.partition_vector when no (truthy) vector is given;
    returns the groups as tuples, or a single flat list when flatten=True.
    """
    pvec = partition_vector or self.partition_vector
    buckets = defaultdict(list)
    for idx, val in enumerate(pvec):
        buckets[val].append(idx)
    grouped = sorted(buckets.values(), key=len, reverse=True)
    result = [tuple(bucket) for bucket in grouped]
    if flatten:
        return flatten_list(result)
    return result
def get_verbs_in_path(path):
    """Returns list of all verbs"""
    trees = [tree for tree in get_trees(path) if tree]
    function_names = get_function_names(get_nodes(trees))
    print('%s functions extracted' % len(function_names))
    return flatten_list([get_verbs_from_function_name(name)
                         for name in function_names])
def get_all_mushrooms_ids():
    """Gets all mushrooms identifiers"""
    # One list of ids per (family, genre) pair, then flattened.
    per_genre = [get_mushrooms_per_genre(genre)
                 for family in get_families()
                 for genre in get_genres_per_family(family)]
    return utils.flatten_list(per_genre)
def export_compatibility_list():
    """Convert each raw compatibility-list JSON file into a .txt file."""
    with os.scandir('output/compatibility_list/raw') as it:
        for entry in it:
            if entry.name.endswith('.json'):
                data = load_json(entry.path)
                # NOTE(review): `data` is loaded but never used and
                # `output_data` stays empty, so every .txt is written from
                # an empty list — this looks unfinished; confirm the
                # intended transformation from `data` into `output_data`.
                output_data = []
                with open(
                        'output/compatibility_list/' +
                        entry.name.replace('.json', '.txt'), 'w') as f:
                    # NOTE(review): file.write expects a str; if
                    # flatten_list returns a list this raises TypeError —
                    # presumably a join/str conversion is missing.
                    f.write(flatten_list(output_data))
def find_good_lines(self, line, lines, z=0, separation=0, redge=0.05, ledge=None): """ Find any good lines in the Spectrum. Parameters ---------- line : :obj:`str` Search for good lines of this kind. lines : :obj:`str` Compare against this kind of lines. z : :obj:`float` Redshift correction to apply to the rest frequencies. separation : :obj:`float` Minimum separation between lines to be considered good. redge : :obj:`float` The line frequency should be this far """ # If no value is given for the left edge, use the same as for the right edge if ledge == None: ledge = redge # Find the lines within the Spectrum corresponding to the desired line ns, rf = self.find_lines(line, z) # Find other lines in the Spectrum ofs = [] for l in lines: n, f = self.find_lines(l, z) ofs.append(list(f)) fofs = utils.flatten_list(ofs) # Loop over lines checking that their separation from the other lines # is larger than separation. for i,f in enumerate(rf): diff = [abs(of - f) if of != f else separation+1 for of in fofs] if all(d > separation for d in diff) and \ f >= self.x.compressed().min() + self.bw*ledge and \ f <= self.x.compressed().max() - self.bw*redge: try: self.good_lines[line].append(ns[i]) self.good_lines[line+'_freq'].append(rf[i]) except KeyError: self.good_lines[line] = [ns[i]] self.good_lines[line+'_freq'] = [rf[i]] try: self.good_lines[line] self.good_lines[line+'_freq'] except KeyError: self.good_lines[line] = [] self.good_lines[line+'_freq'] = []
def get_df(sols_lst, panel, noise):
    "Extract dataframe with fpr and tpr from list of solutions"
    rates = [(fpr(sol.imap, REF_IMAP), tpr(sol.imap, REF_IMAP))
             for sol in utils.flatten_list(sols_lst)]
    # Sort by false-positive rate.
    rates.sort(key=lambda pair: pair[0])
    x, y = zip(*rates)
    return pd.DataFrame({
        "fpr": x,
        "tpr": y,
        "panel": [panel] * len(x),
        "noise": [noise] * len(x)
    })
def get_datasets_political_parties(path='data/tobacco_full/'):
    """
    Load per-class text data: one file per class under ``path``.

    Non-empty stripped lines become samples; the class label is the
    file's index in the sorted directory listing and the file name its
    class name.

    Parameters
    ----------
    path : str
        Directory containing one text file per class.

    Returns
    -------
    dict with keys 'data' (flat list of stripped non-empty lines),
    'target' (parallel list of integer class labels) and 'target_names'
    (file names, indexed by class label).
    """
    file_names = sorted(os.listdir(path))
    print('arr', file_names)
    datasets = {'data': [], 'target': [], 'target_names': []}
    for class_value, input_file in enumerate(file_names):
        # Context manager closes the handle (previously leaked).
        with open(path + input_file, "r") as fh:
            raw_lines = fh.readlines()
        print('Data in each file', input_file, len(raw_lines))
        data = [s.strip() for s in raw_lines if len(s.strip()) > 0]  # ignoring empty lines
        datasets['data'].append(data)
        datasets['target'].append([class_value] * len(data))
        datasets['target_names'].append(input_file)
    datasets['data'] = utils.flatten_list(datasets['data'])
    datasets['target'] = utils.flatten_list(datasets['target'])
    return datasets
def find_good_lines(self, line, lines, z=0, separation=0, redge=0.05, ledge=None): """ Find any good lines in the Spectrum. Parameters ---------- line : :obj:`str` Search for good lines of this kind. lines : :obj:`str` Compare against this kind of lines. z : :obj:`float` Redshift correction to apply to the rest frequencies. separation : :obj:`float` Minimum separation between lines to be considered good. redge : :obj:`float` The line frequency should be this far """ # If no value is given for the left edge, use the same as for the right edge if ledge == None: ledge = redge # Find the lines within the Spectrum corresponding to the desired line ns, rf = self.find_lines(line, z) # Find other lines in the Spectrum ofs = [] for l in lines: n, f = self.find_lines(l, z) ofs.append(list(f)) fofs = utils.flatten_list(ofs) # Loop over lines checking that their separation from the other lines # is larger than separation. for i,f in enumerate(rf): diff = [abs(of - f) if of != f else separation+1 for of in fofs] if all(d > separation for d in diff) and \ f >= self.x.compressed().min() + self.bw*ledge and \ f <= self.x.compressed().max() - self.bw*redge: try: self.good_lines[line].append(ns[i]) except KeyError: self.good_lines[line] = [ns[i]] try: self.good_lines[line] except KeyError: self.good_lines[line] = []
def set_run_transforms(run):
    """Collect de-duplicated transforms from all models into run['transforms']."""
    # Bucket each model's transforms by their 'type' field.
    for model in run['models']:
        nested = {'ts': [], 'spatial': [], 'transform': []}
        for transform in model['transforms']:
            nested[transform['type']].append(transform)
        model['transforms_nested'] = nested
    ts, spatial, transforms = [], [], []
    for model in run['models']:
        ts.append(model['transforms_nested']['ts'])
        spatial.append(model['transforms_nested']['spatial'])
        transforms.append(model['transforms_nested']['transform'])
    ts = utils.drop_duplicates_from_list_of_dicts(utils.flatten_list(ts))
    spatial = utils.drop_duplicates_from_list_of_dicts(utils.flatten_list(spatial))
    transforms = utils.drop_duplicates_from_list_of_dicts(utils.flatten_list(transforms))
    # The 'type' key is redundant once transforms are bucketed.
    for transform in ts + spatial + transforms:
        del transform['type']
    run['transforms'] = {
        'ts': ts,
        'spatial': spatial,
        'transforms': transforms
    }
    return run
def is_field(_td, _fields, verbose=False):
    '''
    This function checks whether fields are in a dict.

    Parameters
    ----------
    _td : dict / list of dict
        dict of trial data.
    _fields : str / list of str
        Fields in the trial data dict.
    verbose : bool, optional
        Describe what's happening in the code. The default is False.

    Returns
    -------
    return_val : bool
        Return whether fields are in the trial data dict or not.
    '''
    # Normalise a bare dict to a one-element list.
    if type(_td) is dict:
        _td = [_td]
    if type(_td) is not list:
        raise Exception('ERROR: _td must be a list of dictionaries!')
    # Normalise a bare string to a one-element list.
    if type(_fields) is str:
        _fields = [_fields]
    if type(_fields) is not list:
        raise Exception('ERROR: _str must be a list of strings!')
    # Flatten list of fields
    _fields = flatten_list(_fields)
    found_all = True
    for idx, trial in enumerate(_td):
        for field in _fields:
            if field not in trial.keys():
                found_all = False
                if verbose:
                    print('Field {} not in dict #{}'.format(field, idx))
    return found_all
def get_labels(self):
    """Gets the list of unique labels for this dataset

    Returns
    -------
    label_list : list of strings
        Sorted (alphabetical/numerical) list of labels as strings
    """
    if self.labels or not self.labelled_data:
        return self.labels
    all_labels = []
    # Could add in _test.csv but later might use meta learning so some text labels we
    # don't want to train/evaluate on...
    for set_type in ("_train.csv", "_dev.csv"):
        file_path = os.path.join(self.data_dir, self.data_name + set_type)
        _, _, labels = self._read_csv_or_df(file_path)
        all_labels.extend(flatten_list(labels))
    # Deduplicate first, then stringify and sort.
    return sorted(map(str, set(all_labels)))
def get_overlaps(chunks):
    """Return chunks plus variants shifted by +/- 1 token at start and end."""
    chunks = [list(chunk) for chunk in chunks]
    # get overlaps of +/- 1 token: four independent shifted copies.
    plus1_starts = copy.deepcopy(chunks)
    minus1_starts = copy.deepcopy(chunks)
    plus1_ends = copy.deepcopy(chunks)
    minus1_ends = copy.deepcopy(chunks)
    for i, chunk in enumerate(chunks):
        plus1_starts[i][1] = int(chunk[1] + 1)
        plus1_ends[i][2] = int(chunk[2] + 1)
        # pay attention with - 1: only shrink chunks spanning > 1 token.
        if chunk[1] != chunk[2]:
            minus1_starts[i][1] = int(chunk[1] - 1)
            minus1_ends[i][2] = int(chunk[2] - 1)
    variants = [chunks, plus1_starts, minus1_starts, plus1_ends, minus1_ends]
    return [tuple(chunk) for chunk in flatten_list(variants)]
def Make(self, *code, **kwargs):
    """Compile `code` and compose it after self; optionally flatten list
    results and tag the unit with a return type."""
    # pop() both option kwargs so they are not forwarded anywhere else.
    _return_type = kwargs.pop('_return_type', None)
    flatten = kwargs.pop('flatten', None)
    g, refs = dsl.Compile(code, self._refs)
    f = utils.compose2(g, self)
    if flatten:
        def flatten_f(x):
            return utils.flatten_list(x) if type(x) is list else x
        f = utils.compose2(flatten_f, f)
    return self.__unit__(f, refs, _return_type=_return_type)
def plot_normalized_distribution_over_time(raw_data_df):
    """Plot, per field, the mean month-normalized activity as bar charts
    on a 4x3 grid of subplots."""
    fig, axes = plt.subplots(4, 3, figsize=(16, 16))
    flat_axes = utils.flatten_list(axes)
    for i, (field_name, ax) in enumerate(zip(data.FIELD_NAMES, flat_axes)):
        # Columns belonging to this field (column name encodes field+month).
        x1 = raw_data_df[[
            _ for _ in raw_data_df.columns
            if utils.split_field_month(_)[0] == field_name
        ]]
        # Normalize each row to sum to 1, then average across rows.
        x2 = x1.div(x1.sum(axis=1), axis=0)
        x3 = x2.mean()
        x3.index = pd.DatetimeIndex(x3.index.map(utils.map_to_month)) \
            .strftime("%Y-%m")
        # First three bars grey ('g'), the rest blue.
        x3.plot(kind="bar", ax=ax, color="ggg" + "b" * (len(x3) - 3))
        ax.set_title(data.FIELD_NAMES_MAP[field_name], fontsize=18)
        ax.set_xticks([])
    # Hack
    # NOTE(review): this deliberately reuses `x3` (and `i`) leaked from
    # the loop above to label only the bottom row of subplots — it
    # assumes all fields share the same month index.
    for ax in flat_axes[-3:]:
        ax.set_xticks(range(len(x3.index)))
        ax.set_xticklabels(
            [_ if i % 3 == 0 else "" for (i, _) in enumerate(x3.index)])
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=60)
    fig.suptitle("Normalized Distribution of Activity over Time", fontsize=30)
async def scrape_lap_records(http_session: aiohttp.ClientSession,
                             track_id: int,
                             vehicle_id: int) -> List[LapRecordTuple]:
    """
    Scrape all lap-record pages for a track/vehicle combination.

    Parameters
    ----------
    http_session : aiohttp.ClientSession
        Session used for every request.
    track_id, vehicle_id : int
        Identify the leaderboard to scrape.

    Returns
    -------
    List of LapRecordTuple, one per record across all result pages.

    Raises
    ------
    ValueError
        If the site reports an invalid track/vehicle combination.
    """
    first_soup = await _prepare_soup(http_session, track_id, vehicle_id)
    if first_soup.find("p", class_="error"):
        raise ValueError("invalid track_id and vehicle_id combination")
    number_of_pages = _get_number_of_pages(first_soup)
    if number_of_pages == 0:
        # Bug fix: logger.debug() without a message raises TypeError.
        logger.debug(
            f"No records found for track={track_id} and vehicle={vehicle_id}")
        return []
    # Scrape pages 2..N concurrently; the already-fetched first page is
    # scraped alongside them.
    tasks = []
    for page_n in range(2, number_of_pages + 1):
        tasks.append(
            _request_and_scrape_soup(http_session, track_id, vehicle_id,
                                     page_n))
    tasks.append(_scrape_soup(first_soup))
    results = flatten_list(await asyncio.gather(*tasks))
    logger.debug(
        f"Found {len(results)} records for track={track_id} and vehicle={vehicle_id}"
    )
    return results
def create_df_node(df: DataFrame, node_attributes) -> DataFrame:
    """
    Creating df_node from df, base on node_attributes

    :param df: dataframe
    :param node_attributes: dict, e.g. {'NodeID1': ['LABELx', 'NAME', 'ATT']},
    :return: df_node dataframe
    """
    df_node = None
    for item in node_attributes.items():
        # Flatten (key, [attrs...]) into one column list, dropping falsy entries.
        node_columns = [col for col in flatten_list(item) if col]
        _nodeID = node_columns[0]
        selected = df.select(node_columns)
        for col in node_columns:
            # Strip the trailing marker char, e.g. "idEntity:ID", ":LABEL".
            selected = selected.withColumnRenamed(f"{col}", f"{col[:-1]}")
        df_node = selected if df_node is None else df_node.union(selected)
    return df_node.dropDuplicates()
def create_subject_arrays(self, double_precision=True):
    '''
    Build per-(subject, n_items, trecall) arrays of errors, responses,
    item angles, targets/nontargets and sample counts, plus precision
    arrays derived from them, all stored in self.dataset.

    Also builds pooled per-(n_items, trecall) versions by concatenating
    every subject's data.

    double_precision: kept for interface compatibility.
        # NOTE(review): the doubling code is commented out upstream, so
        # this parameter is currently unused -- confirm before removing.
    '''
    unique_subjects = np.unique(self.dataset['subject'])
    unique_n_items = np.unique(self.dataset['n_items'])

    # Per-subject containers, indexed [subject_i, n_items_i, trecall_i].
    # dtype=np.object because each cell holds a variable-length array.
    self.dataset['errors_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['errors_all_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['errors_nontarget_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
    # Scalar-valued containers start as NaN so unfilled cells are visible.
    self.dataset['sizes_subject_nitems_trecall'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))
    self.dataset['precision_subject_nitems_trecall_bays'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))
    self.dataset['precision_subject_nitems_trecall_theo'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))
    self.dataset['precision_subject_nitems_trecall_theo_nochance'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))
    self.dataset['precision_subject_nitems_trecall_bays_notreatment'] = np.nan*np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size))
    self.dataset['response_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['item_angle_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['target_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['nontargets_subject_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object)

    # Pooled-over-subjects containers, indexed [n_items_i, trecall_i].
    self.dataset['errors_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['errors_all_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['errors_nontarget_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['response_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['item_angle_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['target_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['nontargets_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object)
    self.dataset['precision_nitems_trecall_bays'] = np.nan*np.empty((unique_n_items.size, unique_n_items.size))
    self.dataset['precision_nitems_trecall_theo'] = np.nan*np.empty((unique_n_items.size, unique_n_items.size))
    self.dataset['precision_nitems_trecall_theo_nochance'] = np.nan*np.empty((unique_n_items.size, unique_n_items.size))
    self.dataset['precision_nitems_trecall_bays_notreatment'] = np.nan*np.empty((unique_n_items.size, unique_n_items.size))

    for n_items_i, n_items in enumerate(unique_n_items):
        for subject_i, subject in enumerate(unique_subjects):
            for trecall_i, trecall in enumerate(np.arange(1, n_items+1)):
                # Boolean mask for this (subject, set size, probe) cell;
                # trials flagged 'masked' are excluded when that field exists.
                ids_filtered = ((self.dataset['subject']==subject) & (self.dataset['n_items'] == n_items) & (self.dataset['probe'] == trecall) & (self.dataset.get('masked', False) == False)).flatten()
                # NOTE(review): a comment in the original mentions inverting
                # the trecall storage order (0 -> last item probed), but the
                # inversion line itself was commented out -- trecall_i is the
                # plain enumerate index here.

                # Errors: all columns, then split into target vs nontarget.
                self.dataset['errors_all_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.dataset['errors_all'][ids_filtered]
                self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['errors_nontarget_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.extract_target_nontargets_columns(self.dataset['errors_all_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], trecall)

                # Responses and the presented item angles.
                # TODO (lmatthey) trecall here is inverted, should really fix it somehow...
                self.dataset['response_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.dataset['response'][ids_filtered].flatten()
                self.dataset['item_angle_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.dataset['item_angle'][ids_filtered]

                # Target item and nontarget items, split the same way.
                self.dataset['target_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['nontargets_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.extract_target_nontargets_columns(self.dataset['item_angle'][ids_filtered], trecall)

                # Number of samples in this cell.
                self.dataset['sizes_subject_nitems_trecall'][subject_i, n_items_i, trecall_i] = self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i].size

                # Precision under the different treatment conventions.
                self.dataset['precision_subject_nitems_trecall_bays'][subject_i, n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], remove_chance_level=True, correct_orientation=True, use_wrong_precision=True)
                self.dataset['precision_subject_nitems_trecall_theo'][subject_i, n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], remove_chance_level=False, correct_orientation=True, use_wrong_precision=False)
                self.dataset['precision_subject_nitems_trecall_theo_nochance'][subject_i, n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], remove_chance_level=True, correct_orientation=False, use_wrong_precision=False)
                self.dataset['precision_subject_nitems_trecall_bays_notreatment'][subject_i, n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], remove_chance_level=False, correct_orientation=True, use_wrong_precision=True)

    # Pool all subjects' data per (n_items, trecall).
    for n_items_i, n_items in enumerate(unique_n_items):
        for trecall_i, trecall in enumerate(np.arange(1, n_items+1)):
            self.dataset['errors_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['errors_subject_nitems_trecall'][:, n_items_i, trecall_i]))
            self.dataset['errors_all_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['errors_all_subject_nitems_trecall'][:, n_items_i, trecall_i]))
            self.dataset['errors_nontarget_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['errors_nontarget_subject_nitems_trecall'][:, n_items_i, trecall_i]))

            # Responses, target, nontarget.
            self.dataset['response_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['response_subject_nitems_trecall'][:, n_items_i, trecall_i]))
            self.dataset['target_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['target_subject_nitems_trecall'][:, n_items_i, trecall_i]))
            self.dataset['nontargets_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['nontargets_subject_nitems_trecall'][:, n_items_i, trecall_i]))
            self.dataset['item_angle_nitems_trecall'][n_items_i, trecall_i] = np.array(utils.flatten_list(self.dataset['item_angle_subject_nitems_trecall'][:, n_items_i, trecall_i]))

            # Precision over the pooled errors of all subjects
            # (not the average of per-subject precisions).
            self.dataset['precision_nitems_trecall_bays'][n_items_i, trecall_i] = self.compute_precision(self.dataset['errors_nitems_trecall'][n_items_i, trecall_i], remove_chance_level=True, correct_orientation=True, use_wrong_precision=True)

    # The remaining pooled precisions ARE subject averages, unlike bays above.
    self.dataset['precision_nitems_trecall_theo'] = np.mean(self.dataset['precision_subject_nitems_trecall_theo'], axis=0)
    self.dataset['precision_nitems_trecall_theo_nochance'] = np.mean(self.dataset['precision_subject_nitems_trecall_theo_nochance'], axis=0)
    self.dataset['precision_nitems_trecall_bays_notreatment'] = np.mean(self.dataset['precision_subject_nitems_trecall_bays_notreatment'], axis=0)
def get_list_csv(collection=coll):
    """Return the rows yielded by get_meta_csv(collection) as one flat list."""
    meta_rows = list(get_meta_csv(collection))
    return flatten_list(meta_rows)
# Map legacy ILS field codes to handlers producing lists of MARC field
# dicts ({'tag', 'ind1', 'ind2', 'subs'}).  Unknown codes map to None via
# the defaultdict factory, so callers can test "mapping[code] is None".
mapping = defaultdict(lambda: None)
mapping['ACQNO'] = lambda x: [{'tag': '100', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# mapping['ACTIV'] = lambda x: [{'tag': '101', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
# Repeatable fields emit one dict per subfield value from split_subfields.
mapping['AU'] = lambda x: [{'tag': '102', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': e}} for e in split_subfields(x)]
mapping['CITED'] = lambda x: [{'tag': '103', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': e}} for e in split_subfields(x)]
mapping['COPY'] = lambda x: [{'tag': '104', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# mapping['CTI'] = lambda x: [{'tag': '105', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
mapping['DES'] = lambda x: [{'tag': '106', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['ED'] = lambda x: [{'tag': '107', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# mapping['EXCLM'] = lambda x: [{'tag': '108', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
mapping['EXP'] = lambda x: [{'tag': '109', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# FREQ values are normalized through the fix_FREQ table, falling back to
# the raw value when no fix is registered.
mapping['FREQ'] = lambda x: [{'tag': '110', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': fix_FREQ.get(x, x)}}]
mapping['HOLD'] = lambda x: [{'tag': '111', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': e}} for e in split_subfields(x)]
mapping['ISSN'] = lambda x: [{'tag': '112', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['ISSUE'] = process_ISSUE  # was: lambda x: [{'tag': '113', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
# LANG: split on subfields and '/', then translate each language name
# (title-cased) through the `languages` lookup table.
mapping['LANG'] = lambda x: [{'tag': '114', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': languages[e.title()]}} for e in flatten_list([a.split('/') for a in split_subfields(x)])]
mapping['OS'] = lambda x: [{'tag': '115', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['PDATE'] = lambda x: [{'tag': '116', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# PNOTE: rejoin '\n |' continuation lines into one '; '-separated note.
mapping['PNOTE'] = lambda x: [{'tag': '117', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': '; '.join(x.strip(whitespace+'|').split('\n |'))}}]
mapping['PSTAT'] = lambda x: [{'tag': '118', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['PUB'] = lambda x: [{'tag': '119', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['REG'] = lambda x: [{'tag': '120', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# NOTE(review): STAMP -> process_UP and UP -> process_STAMP look swapped;
# confirm this crossover is intentional before touching it.
mapping['STAMP'] = process_UP('121')
mapping['ROUTE'] = process_ROUTE
mapping['SUB'] = lambda x: [{'tag': '123', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['TI'] = lambda x: [{'tag': '124', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['TNOTE'] = lambda x: [{'tag': '125', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['TYPE'] = lambda x: [{'tag': '126', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
# mapping['VADDR'] = lambda x: [{'tag': '127', 'ind1': ' ', 'ind2': ' ', 'subs': {'a' : x}}]
mapping['VCODE'] = lambda x: [{'tag': '128', 'ind1': ' ', 'ind2': ' ', 'subs': {'a': x}}]
mapping['UP'] = process_STAMP
def fit_mixture_model(self): unique_subjects = np.unique(self.dataset['subject']) unique_n_items = np.unique(self.dataset['n_items']) # Initialize empty arrays em_fits_keys = ['kappa', 'mixt_target', 'mixt_nontargets', 'mixt_nontargets_sum', 'mixt_random', 'train_LL', 'K', 'aic', 'bic'] self.dataset['em_fits'] = dict() for k in em_fits_keys: self.dataset['em_fits'][k] = np.nan*np.empty(self.dataset['probe'].size) self.dataset['em_fits']['resp_target'] = np.nan*np.empty(self.dataset['probe'].size) self.dataset['em_fits']['resp_nontarget'] = np.nan*np.empty(self.dataset['probe'].size) self.dataset['em_fits']['resp_random'] = np.nan*np.empty(self.dataset['probe'].size) self.dataset['em_fits_subjects_nitems_trecall'] = np.empty((unique_subjects.size, unique_n_items.size, unique_n_items.size), dtype=np.object) self.dataset['em_fits_nitems_trecall'] = np.empty((unique_n_items.size, unique_n_items.size), dtype=np.object) self.dataset['em_fits_subjects_nitems'] = np.empty((unique_subjects.size, unique_n_items.size), dtype=np.object) # for subject_i, subject in enumerate(unique_subjects): # self.dataset['em_fits_subjects_nitems_trecall'][subject] = dict() # for n_items_i, n_items in enumerate(unique_n_items): # self.dataset['em_fits_subjects_nitems_trecall'][subject][n_items] = dict() self.dataset['em_fits_nitems_trecall_mean'] = dict(mean=dict(), std=dict(), values=dict()) # Compute mixture model fits per n_items, subject and trecall for n_items_i, n_items in enumerate(unique_n_items): for subject_i, subject in enumerate(unique_subjects): for trecall_i, trecall in enumerate(np.arange(1, n_items + 1)): ids_filtered = ((self.dataset['subject']==subject) & (self.dataset['n_items'] == n_items) & (self.dataset['probe'] == trecall) & (self.dataset.get('masked', False) == False)).flatten() # Invert the order of storage, 0 -> last item probed, 1 -> second to last item probe, etc... 
# trecall_i = n_items - trecall print "Fit mixture model, %d items, subject %d, trecall %d, %d datapoints (%d)" % (n_items, subject, trecall, np.sum(ids_filtered), self.dataset['sizes_subject_nitems_trecall'][subject_i, n_items_i, trecall_i]) params_fit = em_circular_mixture_to_use.fit(self.dataset['response_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['target_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['nontargets_subject_nitems_trecall'][subject_i, n_items_i, trecall_i]) params_fit['mixt_nontargets_sum'] = np.sum(params_fit['mixt_nontargets']) # print self.dataset['response'][ids_filtered, 0].shape, self.dataset['item_angle'][ids_filtered, 0].shape, self.dataset['item_angle'][ids_filtered, 1:].shape # cross_valid_outputs = em_circularmixture.cross_validation_kfold(self.dataset['response'][ids_filtered, 0], self.dataset['item_angle'][ids_filtered, 0], self.dataset['item_angle'][ids_filtered, 1:], K=10, shuffle=True, debug=False) # params_fit = cross_valid_outputs['best_fit'] resp = em_circular_mixture_to_use.compute_responsibilities(self.dataset['response_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['target_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], self.dataset['nontargets_subject_nitems_trecall'][subject_i, n_items_i, trecall_i], params_fit) for k, v in params_fit.iteritems(): self.dataset['em_fits'][k][ids_filtered] = v # params_fit['responsibilities'] = resp self.dataset['em_fits']['resp_target'][ids_filtered] = resp['target'] self.dataset['em_fits']['resp_nontarget'][ids_filtered] = np.sum(resp['nontargets'], axis=1) self.dataset['em_fits']['resp_random'][ids_filtered] = resp['random'] self.dataset['em_fits_subjects_nitems_trecall'][subject_i, n_items_i, trecall_i] = params_fit # Do not look at trecall (weird but whatever) params_fit = em_circular_mixture_to_use.fit(np.array(utils.flatten_list(self.dataset['response_subject_nitems_trecall'][subject_i, n_items_i, 
:n_items_i+1])), np.array(utils.flatten_list(self.dataset['target_subject_nitems_trecall'][subject_i, n_items_i, :n_items_i+1])), np.array(utils.flatten_list(self.dataset['nontargets_subject_nitems_trecall'][subject_i, n_items_i, :n_items_i+1]))) self.dataset['em_fits_subjects_nitems'][subject_i, n_items_i] = params_fit for n_items_i, n_items in enumerate(unique_n_items): for k in ['mean', 'std', 'values']: self.dataset['em_fits_nitems_trecall_mean'][k][n_items] = dict() for trecall_i, trecall in enumerate(np.arange(1, n_items + 1)): for k in ['mean', 'std', 'values']: self.dataset['em_fits_nitems_trecall_mean'][k][n_items][trecall] = dict() ## Now compute mean/std em_fits per n_items, trecall # Refit the model mixing all subjects together (not sure how we could get sem, 1-held?) params_fit = em_circular_mixture_to_use.fit(self.dataset['response_nitems_trecall'][n_items_i, trecall_i], self.dataset['target_nitems_trecall'][n_items_i, trecall_i], self.dataset['nontargets_nitems_trecall'][n_items_i, trecall_i]) self.dataset['em_fits_nitems_trecall'][n_items_i, trecall_i] = params_fit # Need to extract the values for a subject/nitems pair, for all keys of em_fits. Annoying dictionary indexing needed for key in em_fits_keys: fits_persubjects = [self.dataset['em_fits_subjects_nitems_trecall'][subject_i, n_items_i, trecall_i][key] for subject in np.unique(unique_subjects)] self.dataset['em_fits_nitems_trecall_mean']['mean'][n_items][trecall][key] = np.mean(fits_persubjects) self.dataset['em_fits_nitems_trecall_mean']['std'][n_items][trecall][key] = np.std(fits_persubjects) self.dataset['em_fits_nitems_trecall_mean']['values'][n_items][trecall][key] = fits_persubjects ## Construct array versions of the em_fits_nitems mixture proportions, for convenience self.construct_arrays_em_fits()
def main():
    """Export all item records (from the barcode dict ``d``) as MARC binary
    and human-readable text dumps, skipping staff-paper records."""
    global d
    # Lazily load the barcode -> copies mapping on first run.
    if not d:
        d = get_items('BARCD')
    total = float(len(d))
    i = 1
    fb = open(export_dir+'ITEMS.marc.dat', 'wb')  # binary MARC output
    ft = open(export_dir+'ITEMS.marc.txt', 'w')   # human-readable dump
    print 'Exporting items...'
    item_count = 0
    for (recid, copies) in d.items():
        if not is_staff_paper(recid):
            record = Record()
            # 999 carries the source record id ($a) and catalogue ID ($b).
            id_field = Field(tag='999', indicators=[' ', ' '], subfields=['a', recid, 'b', ALL[recid].get('ID', '')])
            record.add_ordered_field(id_field)
            for c in copies.items():
                # Keep only the copy attributes that have a registered
                # fixer in items_fix, applying it to the value.
                aux = [(e[0], items_fix[e[0]](e[1])) for e in c[1].items() if e[0] in items_fix]
                # 945: one field per copy; $b is the copy id, followed by
                # the flattened (code, fixed value) pairs.
                item_field = Field(tag='945', indicators=[' ', ' '], subfields=['b', c[0]]+flatten_list(aux))
                record.add_ordered_field(item_field)
                item_count = item_count + 1
            fb.write(record.as_marc())
            ft.write(str(record) + '\n==================\n')
        # Progress is updated per record, including skipped ones.
        update_progress(i*100/total)
        i = i + 1
    print "\nRecords:\t" + str(int(total))
    print "Items: \t" + str(item_count)
    fb.close()
    ft.close()
# NOTE(review): fragment of a larger entry point -- the preceding lines
# (argument parsing and the tool-support check that conditionally guards
# this sys.exit) are outside this chunk.
sys.exit("Selected mutation tool reports that it doesn't support the current project.")
print("Creating mutants...")
mdir, mutants = master.mutate()
print("Scoring mutants in parallel...")
# Split the mutant list into one chunk per scorer process.
divided_mutants = divide(mutants, args.scorers)
# functools.partial instead of a lambda below, as the latter can't be pickled
toolfun = functools.partial(load_tool, args.mutation_tool)
scorefun = functools.partial(local_scorer.create_and_score, toolfun, cwd, mdir)
with Pool(processes=args.scorers) as pool:
    # chunksize=1: each worker takes one pre-divided chunk at a time.
    nested_results = pool.map(scorefun, divided_mutants, 1)
results = ScoringResult(flatten_list(nested_results))
if not args.benchmark:
    print("Loading mutant metadata from the filesystem...")
    results.add_metadata(cwd, mdir)
if args.ci_mode:
    # CI gate: pass/fail against the configured score threshold.
    passed = results.percentage_score >= args.ci_threshold
    reporter = load_reporter(args.reporter, results, passed, args.ci_threshold)
else:
    reporter = load_reporter(args.reporter, results)
print("Reporting mutation testing results...")
reporter.report()
# Exit nonzero only when CI mode is on and the threshold was missed.
sys.exit(1 if args.ci_mode and not passed else 0)
def find_lines(self, line, z=0, verbose=False):
    """
    Finds if there are any lines of a given type in the frequency range.
    The line frequencies are corrected for redshift.

    Parameters
    ----------
    line : :obj:`string`
        Line type to search for.
    z : :obj:`float`
        Redshift to apply to the rest frequencies.
    verbose : :obj:`bool`
        Verbose output?

    Returns
    -------
    n : :obj:`numpy.array`
        Principal quantum numbers.
    reference_frequencies : :obj:`numpy.array`
        Reference frequencies of the lines inside the spectrum in MHz.
        The frequencies are redshift corrected.

    See Also
    --------
    crrlpy.crrls.load_ref : Describes the format of line and the available ones.

    Examples
    --------
    >>> from crrlpy.spec import Spectrum
    >>> freq = [10, 11]
    >>> temp = [1, 1]
    >>> spec = Spectrum(freq, temp)
    >>> ns, rf = spec.find_lines('RRL_CIalpha')
    >>> ns
    array([ 843.,  844.,  845.,  846.,  847.,  848.,  849.,  850.,  851.,
            852.,  853.,  854.,  855.,  856.,  857.,  858.,  859.,  860.,
            861.,  862.,  863.,  864.,  865.,  866.,  867.,  868.,  869.])
    """
    if not isinstance(line, str):
        raise ValueError('line should be a string')
    # Load the reference frequencies.
    qn, restfreq = crrls.load_ref(line)
    # Correct rest frequencies for redshift.
    reffreq = restfreq/(1.0 + z)
    # Check which lines lie within the sub band.  self.x is a masked
    # array; compressed() drops masked channels, so [0]/[-1] are the
    # usable band edges.
    mask_ref = (self.x.compressed()[0] < reffreq) & \
               (self.x.compressed()[-1] > reffreq)
    reffreqs = reffreq[mask_ref]
    refqns = qn[mask_ref]
    # Cache the found lines the first time this line type is seen.
    # NOTE(review): since the key is absent inside this branch, the
    # append always raises KeyError and the except path runs -- the
    # append path looks dead.  Confirm before simplifying.
    if not line in self.lines.keys():
        try:
            self.lines[line].append(refqns)
            self.lines[line+'_freq'].append(reffreqs)
        except KeyError:
            self.lines[line] = [refqns]
            self.lines[line+'_freq'] = [reffreqs]
        self.lines[line] = utils.flatten_list(self.lines[line])
        self.lines[line+'_freq'] = utils.flatten_list(self.lines[line+'_freq'])
    nlin = len(reffreqs)
    if verbose:
        print "Found {0} {1} lines within the subband.".format(nlin, line)
        if nlin > 1:
            print "Corresponding to n values: {0}--{1}".format(refqns[0], refqns[-1])
        elif nlin == 1:
            print "Corresponding to n value {0} and frequency {1} MHz".format(refqns[0], reffreqs[0])
    return refqns, reffreqs
def best_points_allT(result_dist_to_use):
    """Collect the best points for every T in T_space into one flat array."""
    per_T_points = []
    for T in T_space:
        per_T_points.append(best_points_T(result_dist_to_use, T))
    return np.array(utils.flatten_list(per_T_points))
def find_lines(self, line, z=0, verbose=False):
    """
    Finds if there are any lines of a given type in the frequency range.
    The line frequencies are corrected for redshift.

    Parameters
    ----------
    line : :obj:`string`
        Line type to search for.
    z : :obj:`float`
        Redshift to apply to the rest frequencies.
    verbose : :obj:`bool`
        Verbose output?

    Returns
    -------
    n : :obj:`numpy.array`
        Principal quantum numbers.
    reference_frequencies : :obj:`numpy.array`
        Reference frequencies of the lines inside the spectrum in MHz.
        The frequencies are redshift corrected.

    See Also
    --------
    crrlpy.crrls.load_ref : Describes the format of line and the available ones.

    Examples
    --------
    >>> from crrlpy.spec import Spectrum
    >>> freq = [10, 11]
    >>> temp = [1, 1]
    >>> spec = Spectrum(freq, temp)
    >>> ns, rf = spec.find_lines('RRL_CIalpha')
    >>> ns
    array([ 843.,  844.,  845.,  846.,  847.,  848.,  849.,  850.,  851.,
            852.,  853.,  854.,  855.,  856.,  857.,  858.,  859.,  860.,
            861.,  862.,  863.,  864.,  865.,  866.,  867.,  868.,  869.])
    """
    if not isinstance(line, str):
        raise ValueError('line should be a string')
    # Load the reference frequencies.
    qn, restfreq = crrls.load_ref(line)
    # Correct rest frequencies for redshift.
    reffreq = restfreq/(1.0 + z)
    # Check which lines lie within the sub band.  self.x is a masked
    # array; compressed() drops masked channels, so [0]/[-1] are the
    # usable band edges.
    mask_ref = (self.x.compressed()[0] < reffreq) & \
               (self.x.compressed()[-1] > reffreq)
    reffreqs = reffreq[mask_ref]
    refqns = qn[mask_ref]
    # Cache the found quantum numbers the first time this line type is
    # seen.  NOTE(review): since the key is absent inside this branch,
    # the append always raises KeyError and the except path runs -- the
    # append path looks dead.  A sibling variant of this method also
    # caches the frequencies; this one does not.
    if not line in self.lines.keys():
        try:
            self.lines[line].append(refqns)
        except KeyError:
            self.lines[line] = [refqns]
        self.lines[line] = utils.flatten_list(self.lines[line])
    nlin = len(reffreqs)
    if verbose:
        print "Found {0} {1} lines within the subband.".format(nlin, line)
        if nlin > 1:
            print "Corresponding to n values: {0}--{1}".format(refqns[0], refqns[-1])
        elif nlin == 1:
            print "Corresponding to n value {0} and frequency {1} MHz".format(refqns[0], reffreqs[0])
    return refqns, reffreqs