def main():
    """Parse every sentence of a pickled dataset with the Stanford parser.

    The input pickle holds a ``(x_matrix, y_vector)`` pair. ``x_matrix`` is
    walked as documents -> paragraphs -> sentences; each sentence is decoded
    from UTF-8 and parsed, and the resulting trees are stored with the same
    nesting. The pair ``(parsed_matrix, y_vector)`` is pickled to the output
    file.
    """
    args = utils.read_arguments(__doc__)

    # Load the instances and their labels.
    x_matrix, y_vector = utils.pickle_from_file(args['input_filename'])

    # Build the Stanford parser once; it is reused for every sentence.
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8')

    parsed_matrix = []
    for doc_idx, document in tqdm(enumerate(x_matrix), total=len(x_matrix)):
        parsed_document = []
        for par_idx, paragraph in enumerate(document):
            parsed_paragraph = []
            for sent_idx, sentence in enumerate(paragraph):
                try:
                    text = six.text_type(sentence.decode('utf-8'))
                    trees = list(parser.raw_parse(text))
                    parsed_paragraph.append(trees)
                except UnicodeDecodeError:
                    logging.warning(
                        'Skip sentence {}-{}-{} for unicode error'.format(
                            doc_idx, par_idx, sent_idx))
                    # NOTE(review): the label is removed from the
                    # document-level list using the sentence position inside
                    # the paragraph, and earlier pops shift later positions.
                    # Confirm this matches the layout of y_vector.
                    y_vector[doc_idx].pop(sent_idx)
            parsed_document.append(parsed_paragraph)
        parsed_matrix.append(parsed_document)

    # Persist the parse trees together with the (possibly trimmed) labels.
    logging.info('Saving {} documents'.format(len(parsed_matrix)))
    utils.pickle_to_file((parsed_matrix, y_vector), args['output_filename'])
    logging.info('All operations finished')
def main():
    """Main function of the script.

    Extracts labeled sentences from every input file, optionally converts
    the sentences into a numeric feature matrix, replaces each label by its
    index in the sorted list of known classes and pickles the pair
    ``(x_train, labels)`` to the output file.
    """
    args = utils.read_arguments(__doc__)
    sentences = []
    labels = []
    for filename in get_input_files(args['input_dirpath'], r'.*txt',
                                    int(args['limit'])):
        with LabeledSentencesExtractor(filename) as instance_extractor:
            labeled_sentences = instance_extractor.get_labeled_sentences()
            sentences.append(labeled_sentences[0])
            labels.append(labeled_sentences[1])

    if not args['raw_text']:
        # Process sentences as vectors
        feature_extractor = FeatureExtractor()
        x_train = feature_extractor.get_matrix(sentences)
        logging.info('Saving numeric matrix with shape {}'.format(
            x_train.shape))
    else:
        x_train = sentences
        logging.info('Saving raw text, {} documents'.format(len(sentences)))

    # Convert labels to their numeric ids, in place. The id of a label is its
    # position in the alphabetically sorted list of classes; an unknown label
    # raises ValueError.
    unique_labels = sorted(['Claim', 'MajorClaim', 'Premise', 'None'])
    counts = dict.fromkeys(unique_labels, 0)
    for document_labels in labels:
        for paragraph_labels in document_labels:
            for label_index, label in enumerate(paragraph_labels):
                paragraph_labels[label_index] = unique_labels.index(label)
                counts[label] += 1
    logging.info('Classes used (sorted) {}'.format(unique_labels))
    logging.info('\t Counts {}'.format(counts))

    utils.pickle_to_file((x_train, labels), args['output_filename'])
def save_to_file(self, directory_name, name=None):
    """Pickle ``self.model`` into a file inside ``directory_name``.

    Args:
        directory_name (string): Directory where the file is written. It is
            created through ``utils.safe_mkdir`` before saving.
        name (string, optional): When given, the file is called
            ``<name>_model.p``; otherwise it is called ``model.p``.
    """
    basename = 'model.p' if name is None else '{}_model.p'.format(name)
    filename = os.path.join(directory_name, basename)
    utils.safe_mkdir(directory_name)
    utils.pickle_to_file(self.model, filename)
def save_to_files(self, directory_name, name=None):
    """Pickle the dataset indices into ``directory_name``.

    Args:
        directory_name (string): Name of the directory where the file is
            written.
        name (string, optional): Extra name forwarded to
            ``_get_objective_filename`` when building the file name.
    """
    indices_path = self._get_objective_filename(
        directory_name, 'indices', name)
    utils.pickle_to_file(self.indices, indices_path)
def save_to_files(self, directory_name, name=None):
    """Save the parent dataset files plus the sample indices.

    Args:
        directory_name (string): Name of the directory where the files are
            written. It is created through ``utils.safe_mkdir`` first.
        name (string, optional): Extra name used when building the file name
            of the sample indices.
    """
    utils.safe_mkdir(directory_name)
    # NOTE(review): `name` is not forwarded to the parent implementation, so
    # the files written by the parent never carry it. Confirm against the
    # matching load method before changing the on-disk naming.
    super(BaseSampledDataset, self).save_to_files(directory_name)
    sample_indices = self._sample_indices
    sample_indices_path = self._get_objective_filename(
        directory_name, 'sample_indices', name)
    utils.pickle_to_file(sample_indices, sample_indices_path)
def main():
    """Main function of the script.

    Builds one document per ``*clean*.txt`` file found under the input
    directory, using the labels read from the labels directory and keyed by
    the file's base name, and pickles the list of documents.
    """
    args = utils.read_arguments(__doc__)
    filenames = list(traverse_directory(args["input_dirpath"], '*clean*.txt'))
    labels_from_json = get_all_labels_from_json(args["labels_dirpath"])

    documents = []
    for filename in tqdm(filenames):
        with AnnotatedIBMFactory(filename) as instance_extractor:
            # Labels are indexed by the last component of the path.
            filename_key = filename.rsplit("/", 1)[-1]
            documents.append(
                instance_extractor.build_document(
                    labels_from_json[filename_key]))

    utils.pickle_to_file(documents, args['output_file'])
def get_graph(graph_filename, category_filename):
    """Return the category hierarchy graph, loading or building it.

    If ``graph_filename`` points to an existing file, the graph is read from
    that pickle. Otherwise a new directed graph is built by adding the
    subcategories of every category listed in ``category_filename`` and, when
    ``graph_filename`` is given, the result is pickled there for later runs.

    Args:
        graph_filename: Path of the pickled graph, or a falsy value to skip
            both loading and saving.
        category_filename: Path of the file listing the root categories.

    Returns:
        The hierarchy graph.
    """
    if graph_filename and os.path.isfile(graph_filename):
        print('Reading pickled graph')
        return utils.pickle_from_file(graph_filename)

    hierarchy_graph = networkx.DiGraph()
    categories = utils.get_categories_from_file(category_filename)
    print('Downloading categories')
    for category_name in tqdm(categories):
        utils.add_subcategories(category_name, hierarchy_graph)
    if graph_filename:
        print('Saving graph')
        # The graph goes to its own file; writing it to category_filename
        # would overwrite the category list that was just read.
        utils.pickle_to_file(hierarchy_graph, graph_filename)
    return hierarchy_graph
def main():
    """Extend the inverted mapping to graph nodes that have no entry.

    Loads a pickled mapping and a pickled graph, inverts the mapping and, for
    every graph node whose entry is empty, merges in the entries of its
    oldest ancestors. The result is pickled as a plain dict.
    """
    arguments = docopt(__doc__, version=1.0)
    mapping = utils.pickle_from_file(arguments['<mapping_filename>'])
    graph = utils.pickle_from_file(arguments['<graph_filename>'])

    # Presumably a defaultdict-like object: it is indexed with arbitrary
    # nodes and converted with dict() before saving.
    yago_to_lkif = invert_mapping(mapping)
    for node in graph.nodes():
        if len(yago_to_lkif[node]) == 0:
            # Nodes without a direct entry inherit from their ancestors.
            for ancestor in get_oldest_ancestors(node, graph):
                yago_to_lkif[node].update(yago_to_lkif[ancestor])

    utils.pickle_to_file(dict(yago_to_lkif), arguments['<output_file>'])
def main(relation, limit, offset, directory_name):
    """Query the YAGO endpoint for one relation and pickle the response.

    Args:
        relation: Name of the YAGO relation linking an entity to a movie.
        limit: Value used for the LIMIT clause of the query.
        offset: Value used for the OFFSET clause of the query.
        directory_name: Directory where the result is stored. The file is
            called ``<relation>-<offset>.pickle``.
    """
    utils.safe_mkdir(directory_name)

    query_template = (
        'SELECT DISTINCT ?related ?wikiPage WHERE { '
        '?movie rdf:type <http://yago-knowledge.org/resource/%s> . '
        '?related <http://yago-knowledge.org/resource/%s> ?movie . '
        '?related <http://yago-knowledge.org/resource/hasWikipediaUrl> '
        '?wikiPage } LIMIT %s OFFSET %s')
    query = query_template % (MOVIE_CATEGORY_NAME, relation, limit, offset)

    response = utils.query_sparql(query, utils.YAGO_ENPOINT_URL)
    print('Reading {} objects.'.format(len(response)))

    output_path = os.path.join(
        directory_name, '{}-{}.pickle'.format(relation, offset))
    utils.pickle_to_file(response, output_path)
def main():
    """Extract CoNLL feature dictionaries from pickled documents.

    Loads the documents from the input pickle, runs a feature extractor with
    structural, syntactic and lexical features enabled, and pickles the
    extracted instances to the output file.
    """
    args = utils.read_arguments(__doc__)

    print('Loading documents')
    documents = utils.pickle_from_file(args['input_filename'])

    extractor = conll_feature_extractor.ConllFeatureExtractor(
        use_structural=True, use_syntactic=True, use_lexical=True)

    # Each instance is a sentence, represented as a list with one feature
    # dictionary per word.
    utils.pickle_to_file(
        extractor.get_feature_dict(documents), args['output_filename'])
    print('All operations completed')
def main():
    """Main function of the script.

    Builds one annotated document per input file, optionally parses its text
    with the lexicalized Stanford parser, and pickles the list of documents.
    """
    args = utils.read_arguments(__doc__)
    filenames = get_input_files(args['input_dirpath'], r'.*txt',
                                int(args['limit']))

    # The parser is only instantiated when parse trees were requested.
    parser = None
    if args['parse_trees']:
        parser = LexicalizedStanfordParser(
            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
            encoding='utf8')

    documents = []
    for filename in tqdm(filenames):
        with AnnotatedDocumentFactory(filename) as instance_extractor:
            document = instance_extractor.build_document()
            if args['parse_trees']:
                document.parse_text(parser)
            documents.append(document)

    utils.pickle_to_file(documents, args['output_file'])
def tag_dataset(partition_name):
    """Tag one partition, save the predictions and print a report.

    Relies on names from the enclosing scope (``args``, ``dataset_name``,
    ``data``, ``model``, ``attention_model``, ``label_encoding``).

    The predictions are written as a tab-separated file with the columns
    Token, True, Predicted and Sentence. When ``attention_model`` is set, the
    attention returned by the model is pickled next to the predictions.

    Args:
        partition_name: Key of the partition inside ``data[dataset_name]``.
            Any name containing 'dev' is reported as 'dev'; everything else
            is reported as 'test'.
    """
    short_name = 'dev' if 'dev' in partition_name else 'test'
    output_filename = os.path.join(
        args.output_dirname,
        'predictions_{}_{}_{}.conll'.format(
            args.experiment_name, dataset_name, short_name))
    partition = data[dataset_name][partition_name]

    if attention_model is None:
        tags = model.tagSentences(partition)
    else:
        tags, attention = model.predict(partition, return_attention=True)
        attention_filename = os.path.join(
            args.output_dirname,
            'attention_{}_{}_{}.p'.format(
                args.experiment_name, dataset_name, short_name))
        utils.pickle_to_file(attention, attention_filename)
        # The attention is no longer needed once it has been saved.
        del attention

    true_labels = []
    rows = []
    for sentence_idx, (sentence, predicted_tags) in enumerate(
            zip(partition, tags[dataset_name])):
        token_triples = zip(sentence['raw_tokens'],
                            sentence[args.target_column],
                            predicted_tags)
        for token, true_label_id, predicted_label in token_triples:
            if token != 'PADDING_TOKEN':
                true_label = label_encoding[true_label_id]
                true_labels.append(true_label)
                rows.append(
                    (token, true_label, predicted_label, sentence_idx))

    frame = pandas.DataFrame(
        rows, columns=['Token', 'True', 'Predicted', 'Sentence'])
    frame.to_csv(output_filename, sep='\t', index=False)

    print(
        metrics.classification_report(
            true_labels, numpy.concatenate(tags[dataset_name])))
def main():
    """Main function of the script.

    Reads the input file line by line, grouping lines into documents: a line
    for which ``_is_new_document`` is true closes the document accumulated so
    far and starts a new one. Documents are optionally parsed with the
    lexicalized Stanford parser, saved partially every 10 documents and
    pickled to the output file at the end.

    When a positive ``limit`` is given, reading stops as soon as that many
    documents have been created, and the output holds exactly that many.
    """
    args = utils.read_arguments(__doc__)
    documents = []
    text_buffer = []
    if args['parse_trees']:
        parser = LexicalizedStanfordParser(
            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
            encoding='utf8')
    else:
        parser = None
    limit = int(args['limit'])

    with open(args['input_file'], 'r') as input_file:
        for line in input_file:
            if _is_new_document(line) and text_buffer:
                try:
                    documents.append(_create_document(text_buffer, parser))
                except Exception as e:
                    # Best effort: report the failing document and go on.
                    print('Creation failed for document: {}'.format(
                        text_buffer[0]))
                    print(e)
                if limit > 0 and len(documents) >= limit:
                    # Drop the pending header line. Otherwise the flush
                    # after the loop would build one extra, truncated
                    # document and exceed the limit.
                    text_buffer = []
                    break
                text_buffer = [line]
                print('Adding case {} {}'.format(len(documents),
                                                 line.strip()))
                if len(documents) % 10 == 0:
                    # Partial save
                    utils.pickle_to_file(documents, args['output_file'])
            else:
                text_buffer.append(line)

    # Flush the last document, which has no following header to close it.
    if text_buffer:
        documents.append(_create_document(text_buffer, parser))

    print('{} documents processed'.format(len(documents)))
    utils.pickle_to_file(documents, args['output_file'])
    print('All task finished')
def map_count(channel, count_name, temp_dir):
    """Merge the per-job partial counters of one count and report the total.

    Job ids are received from ``channel`` until ``None`` arrives. For each
    job, the pickled partial counter ``<temp_dir>/map_<job_id>_<index>`` is
    loaded, added to the running total and deleted. The total report is then
    written to ``<temp_dir>/out_<index>`` and the value returned by
    ``pickle_to_file`` is sent back through the channel.

    Args:
        channel: Object with ``receive()`` and ``send()`` used to get job ids
            and to return the result.
        count_name: Name of the count to merge; must be in ``count_names``.
        temp_dir: Directory holding the temporary map files.
    """
    total = LogCounter(count_name)
    # The position of the count is part of every file name; compute it once.
    count_index = count_names.index(count_name)

    job_id = channel.receive()
    while job_id is not None:
        temp_file_name = "%s/map_%d_%d" % (temp_dir, job_id, count_index)
        # Close the file before removing it; an open handle blocks deletion
        # on some platforms and leaks descriptors otherwise.
        # NOTE: pickle.load must only be used on files this pipeline wrote.
        with open(temp_file_name, 'rb') as temp_file:
            total.add_counter(pickle.load(temp_file))
        os.remove(temp_file_name)
        job_id = channel.receive()

    channel.send(
        pickle_to_file(total.report(),
                       "%s/out_%d" % (temp_dir, count_index)))
# Run the cross-validated predictiveness estimation once per entry of
# S_lst[1:], pairing entry i with ensemble_funcs[i]. zip(*...) regroups the
# per-call results into four parallel tuples.
# NOTE(review): `start`, `v_none`, `preds_none`, `ic_none`, `Z_aug_lst`,
# `z_counts` and `p` are defined outside this fragment.
v_lst, preds_lst, ic_lst, folds_lst = zip(
    *(mp.cv_predictiveness(data_1,
                           S_lst[1:][i],
                           measure_func,
                           ensemble_funcs[i],
                           V=5,
                           stratified=True,
                           na_rm=True,
                           type=pred_type) for i in range(len(S_lst[1:]))))
end = time.time()
print("Estimating predictiveness took " + str(end - start) + " seconds")
# Put the separately computed "none" results in front of the estimates, so
# position 0 holds the "none" entry and positions 1.. follow S_lst[1:].
v_lst_all = [v_none] + list(v_lst)
preds_lst_all = [preds_none] + list(preds_lst)
ic_lst_all = [ic_none] + list(ic_lst)
# Save the three lists; file names combine the measure and estimator type.
# args.output_dir is used as a plain string prefix, so it presumably has to
# end with a path separator.
uts.pickle_to_file(
    v_lst_all, args.output_dir + 'vs_' + args.measure + '_est_' +
    args.estimator_type + '.pkl')
uts.pickle_to_file(
    preds_lst_all, args.output_dir + 'preds_' + args.measure + '_est_' +
    args.estimator_type + '.pkl')
uts.pickle_to_file(
    ic_lst_all, args.output_dir + 'ics_' + args.measure + '_est_' +
    args.estimator_type + '.pkl')
## set up Z, v, W, G, c_n matrices
Z = np.array(Z_aug_lst)
v = np.array(v_lst_all)
# W: diagonal matrix holding z_counts normalized to sum to one.
W = np.diag(z_counts / np.sum(z_counts))
# G: two rows of length p + 1; the first is (1, 0, ..., 0), the second is
# all ones.
G = np.vstack((np.append(1, np.zeros(p)), np.ones(p + 1)))
# c_n: the "none" value and the last entry of v_lst_all (index len(v_lst)).
c_n = np.array([v_none, v_lst_all[len(v_lst)]])
## --------------------------------------------------
def main(args=sys.argv[1:]):
    """Build the processed ICU dataset and its companion files.

    Reads the outcomes file and the per-subject files under ``set-a/``,
    computes the configured features for every subject, drops features with
    too many missing values, centers the covariates, optionally splits the
    data into train and test sets, and writes the dataset pickle, the column
    names, the variable importance group names and the NaN fill
    configuration into the data directory.

    Args:
        args: Positional arguments: ``args[0]`` is the train size (read as a
            float; values below 1 trigger a random split), ``args[1]`` is the
            random seed, and ``args[2]`` is the data directory, used as a
            plain string prefix (so it presumably must end with a path
            separator). The default is captured from ``sys.argv`` when the
            function is defined.
    """
    train_size = float(args[0])
    seed = int(args[1])
    icu_data_dir = args[2]

    # Read the y data
    outcomes = pd.read_csv(icu_data_dir + "Outcomes-a.txt")
    subject_outcomes = outcomes[["RecordID", "In-hospital_death"]]

    # Create a dictionary of features for each subject
    # Using a dictionary because some of the features don't appear in all subjects...
    value_range = {}  # this is just for printing out ranges of the values
    file_folder = icu_data_dir + "set-a/"
    all_subject_features = {}
    # Only the first MAX_PROCESS entries returned by os.listdir are processed.
    for idx, filename in enumerate(os.listdir(file_folder)[:MAX_PROCESS]):
        df = pd.read_csv("%s%s" % (file_folder, filename))
        # Time is split on ":" into two integer parts and replaced by
        # first_part * 60 + second_part.
        df["hour"] = np.array([time.split(":")[0] for time in df.Time.values],
                              dtype=int)
        df["minute"] = np.array(
            [time.split(":")[1] for time in df.Time.values], dtype=int)
        df.Time = df.hour * 60 + df.minute

        # The Value of the first row is used as the record id.
        record_id = int(df.loc[0].Value)
        subject_features = {"RecordID": record_id}
        for feat_name, process_func_list in FEATURES.items():
            # Features using WEIGHTED_MEAN keep strictly positive values
            # only; the others also keep zeros. Negative values are always
            # dropped.
            if WEIGHTED_MEAN in process_func_list:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value > 0)]
            else:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value >= 0)]
            # Subjects without this measurement simply lack the feature.
            if sub_df.shape[0] == 0:
                continue
            # Track the global min/max of each feature across subjects.
            if feat_name not in value_range:
                value_range[feat_name] = [
                    sub_df.Value.min(),
                    sub_df.Value.max()
                ]
            else:
                value_range[feat_name][0] = min(value_range[feat_name][0],
                                                sub_df.Value.min())
                value_range[feat_name][1] = max(value_range[feat_name][1],
                                                sub_df.Value.max())

            # One output feature per processing function, named
            # "<feature>:<function name>".
            for func in process_func_list:
                value = func(sub_df)
                # Print the offending data before the assertion fails.
                if not np.isfinite(value):
                    print(value, feat_name, func.__name__)
                    print(sub_df)
                assert np.isfinite(value)
                full_feature_name = "%s:%s" % (feat_name, func.__name__)
                subject_features[full_feature_name] = value

        # Extra feature: ratio of the PaO2 mean to the FiO2 mean, only when
        # both are measured and the FiO2 mean is positive.
        fio2_df = df.loc[df.Parameter == "FiO2"]
        pao2_df = df.loc[df.Parameter == "PaO2"]
        if fio2_df.shape[0] and pao2_df.shape[0]:
            fio2_mean = _get_mean(fio2_df)
            pao2_mean = _get_mean(pao2_df)
            if fio2_mean > 0:
                subject_features["O2:_get_ratio"] = pao2_mean / fio2_mean

        all_subject_features[idx] = subject_features

    for k, v in value_range.items():
        print(k, v)

    # One row per subject; features missing for a subject become NaN.
    subjects_x = pd.DataFrame.from_dict(all_subject_features, orient="index")

    ## if a covariate has > 30% missing data, remove it
    # Columns with a NaN proportion of exactly 0.3 are removed as well.
    prop_nan = subjects_x.apply(lambda x: np.mean(np.isnan(x)))
    print('Features filtered for proportion of NA values >= 0.3')
    print(prop_nan >= 0.3)
    tmp = subjects_x.loc[:, prop_nan < 0.3]
    subjects_x = tmp

    # Merge the X and Y data
    icu_subjects = subjects_x.merge(subject_outcomes, on="RecordID")
    death_resp = icu_subjects["In-hospital_death"]
    icu_subjects = icu_subjects.drop(columns=["RecordID"])

    # Grab column names
    column_names = list(icu_subjects.columns.values)
    print(column_names)

    # icu_subjects = icu_subjects.as_matrix()
    icu_subjects = icu_subjects.loc[:, column_names].values

    # Center the x covariates
    # The last column is treated as the outcome: its centering term is
    # zeroed, and the assertion checks it still equals the response.
    centering_term = np.nanmean(icu_subjects, axis=0)
    centering_term[-1] = 0
    icu_subjects -= centering_term
    assert np.all(death_resp == icu_subjects[:, -1])

    # randomly split the data
    if train_size < 1:
        mats = train_test_split(icu_subjects,
                                train_size=train_size,
                                test_size=1.0 - train_size,
                                random_state=seed)
        x_train = mats[0][:, :-1]
        y_train = mats[0][:, -1:]
        x_test = mats[1][:, :-1]
        y_test = mats[1][:, -1:]
    else:
        # No split: the test set is the same data as the training set.
        x_train = icu_subjects[:, :-1]
        y_train = icu_subjects[:, -1:]
        x_test = x_train
        y_test = y_train
    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)

    # Save the data
    icu_data = data_generator.Dataset(x_train=x_train,
                                      y_train=y_train,
                                      x_test=x_test,
                                      y_test=y_test)

    ## save off as a pickle
    icu_processed_file = icu_data_dir + "icu_data_processed.pkl"
    pickle_to_file(icu_data, icu_processed_file)

    # Write "<index>, <name>" for every covariate column (outcome excluded).
    icu_column_file = icu_data_dir + "icu_data_column_names.txt"
    with open(icu_column_file, "w") as f:
        for i, col in enumerate(column_names[:-1]):
            f.write("%d, %s\n" % (i, col))

    # Group the covariate columns; the helper also returns the group names
    # and the NaN fill configuration saved below.
    feature_group_list, vi_group_names, nan_fill_config = _process_feature_groups(
        column_names[:-1])
    print(
        "Copy paste this for creating the variable importance groups argument!"
    )
    print("--var-import-idx %s" % ";".join(feature_group_list))
    icu_vi_name_file = icu_data_dir + "icu_data_var_import_names.csv"
    vi_group_name_df = pd.DataFrame.from_dict(vi_group_names, orient="index")
    vi_group_name_df.to_csv(icu_vi_name_file)

    nan_config_file = icu_data_dir + "nan_fill_config.json"
    with open(nan_config_file, 'w') as f:
        json.dump(nan_fill_config, f)
def prepare_dataset(embeddings_path,
                    datasets,
                    output_dirpath,
                    freq_threshold_unk_tokens=50,
                    reduce_embeddings=False,
                    value_transformations=None,
                    pad_onetoken_sentence=True):
    """Preprocess datasets and embeddings into a single pickled file.

    Reads the pre-trained embeddings (text format) from embeddings_path,
    builds the token and casing mappings, converts the train, dev and test
    partitions of every dataset and pickles everything into output_dirpath.
    The output file name joins the sorted dataset names and the first 10
    characters of the embeddings file name.

    Args:
        embeddings_path: Full path to the pre-trained embeddings file. The
            file must be in text format.
        datasets: A dictionary where the keys are the dataset names and the
            values are the specifications of each dataset. The
            specifications are also dicts; the keys columns, commentSymbol
            and dirpath are read here. dirpath is the directory that holds
            train.txt, dev.txt and test.txt.
        output_dirpath: Path to the directory where the pickled file is
            stored. It is created if needed.
        freq_threshold_unk_tokens: Frequency threshold for adding unknown
            words, forwarded to the embeddings reader.
        reduce_embeddings: Forwarded to the embeddings reader; set to True
            to load only the embeddings needed for training.
        value_transformations: Column specific value transformations.
        pad_onetoken_sentence: True to pad one-token sentences (needed for
            the CRF classifier).
    """
    utils.safe_mkdir(output_dirpath)

    embeddings_name = os.path.basename(embeddings_path)[:10]
    name_parts = sorted(datasets.keys())
    name_parts.append(embeddings_name)
    dataset_name = "_".join(name_parts)
    output_filename = os.path.join(output_dirpath, dataset_name + '.p')

    casing_to_idx = preprocessing.getCasingVocab()
    embeddings, word_to_idx = preprocessing.readEmbeddings(
        embeddings_path, datasets, freq_threshold_unk_tokens,
        reduce_embeddings)
    mappings = {'tokens': word_to_idx, 'casing': casing_to_idx}

    processed = {}
    result = {
        'embeddings': embeddings,
        'mappings': mappings,
        'datasets': datasets,
        'data': processed
    }

    for name, dataset in datasets.items():
        # Partitions are always handed over in train, dev, test order.
        paths = [
            os.path.join(dataset['dirpath'], partition_file)
            for partition_file in ('train.txt', 'dev.txt', 'test.txt')
        ]
        print(paths)
        processed[name] = preprocessing.createPklFiles(
            paths, mappings, dataset['columns'], dataset['commentSymbol'],
            value_transformations, pad_onetoken_sentence)

    utils.pickle_to_file(result, output_filename)
    print("DONE - Embeddings file saved: {}".format(output_filename))
def save_entities(category_name, entities, directory_name):
    """Pickle ``entities`` as ``<category_name>.pickle`` in a directory.

    Args:
        category_name: Name used as the base of the output file name.
        entities: Object to pickle.
        directory_name: Directory where the file is written.
    """
    output_path = os.path.join(
        directory_name, '{}.pickle'.format(category_name))
    utils.pickle_to_file(entities, output_path)