Example No. 1
def main():
    """Main function of script."""
    args = utils.read_arguments(__doc__)

    # Read dataset. Each row of x_matrix is a document (paragraphs of sentences).
    x_matrix, y_vector = utils.pickle_from_file(args['input_filename'])

    # Get Stanford model
    parser = StanfordParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        encoding='utf8')
    # Get parse trees.
    parsed_matrix = []
    for index, document in tqdm(enumerate(x_matrix), total=len(x_matrix)):
        parsed_document = []
        for paragraph_index, paragraph in enumerate(document):
            parsed_paragraph = []
            for sentence_index, sentence in enumerate(paragraph):
                try:
                    parsed_paragraph.append(
                        list(
                            parser.raw_parse(
                                six.text_type(sentence.decode('utf-8')))))
                except UnicodeDecodeError:
                    logging.warning(
                        'Skip sentence {}-{}-{} for unicode error'.format(
                            index, paragraph_index, sentence_index))
                    y_vector[index].pop(sentence_index)
            parsed_document.append(parsed_paragraph)
        parsed_matrix.append(parsed_document)

    # Save output
    logging.info('Saving {} documents'.format(len(parsed_matrix)))
    utils.pickle_to_file((parsed_matrix, y_vector), args['output_filename'])
    logging.info('All operations finished')
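
All of these examples rely on a project-local utils module that is not shown. A minimal sketch of what pickle_to_file and pickle_from_file presumably wrap (plain pickle.dump/pickle.load on a binary file) is given below; the exact signatures in each project may differ.

import pickle


def pickle_to_file(obj, filename):
    """Serialize obj to filename with pickle."""
    with open(filename, 'wb') as file_:
        pickle.dump(obj, file_)


def pickle_from_file(filename):
    """Load a pickled object from filename."""
    with open(filename, 'rb') as file_:
        return pickle.load(file_)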
Example No. 2
def main():
    """Main function of the script."""
    args = utils.read_arguments(__doc__)
    sentences = []
    labels = []
    for filename in get_input_files(args['input_dirpath'], r'.*txt',
                                    int(args['limit'])):
        with LabeledSentencesExtractor(filename) as instance_extractor:
            labeled_sentences = instance_extractor.get_labeled_sentences()
            sentences.append(labeled_sentences[0])
            labels.append(labeled_sentences[1])

    if not args['raw_text']:
        # Process sentences as vectors
        feature_extractor = FeatureExtractor()
        x_train = feature_extractor.get_matrix(sentences)
        logging.info('Saving numeric matrix with shape {}'.format(
            x_train.shape))
    else:
        x_train = sentences
        logging.info('Saving raw text, {} documents'.format(len(sentences)))

    # Convert labels to numeric vector
    unique_labels = sorted(['Claim', 'MajorClaim', 'Premise', 'None'])
    counts = dict.fromkeys(unique_labels, 0)
    y_vector = []
    for document_labels in labels:
        for paragraph_labels in document_labels:
            for label_index, label in enumerate(paragraph_labels):
                paragraph_labels[label_index] = unique_labels.index(label)
                counts[label] += 1
    logging.info('Classes used (sorted) {}'.format(unique_labels))
    logging.info('\t Counts {}'.format(counts))

    utils.pickle_to_file((x_train, labels), args['output_filename'])
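
The label-encoding loop above rewrites labels in place and tallies class counts. For a flat list of labels the same mapping can be sketched with a dict comprehension and collections.Counter (the input below is purely illustrative):

from collections import Counter

unique_labels = sorted(['Claim', 'MajorClaim', 'Premise', 'None'])
label_to_index = {label: index for index, label in enumerate(unique_labels)}

flat_labels = ['Claim', 'None', 'Premise', 'Claim']  # illustrative input
encoded = [label_to_index[label] for label in flat_labels]
counts = Counter(flat_labels)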
Example No. 3
    def save_to_file(self, directory_name, name=None):
        if name is not None:
            filename = os.path.join(directory_name, '{}_model.p'.format(name))
        else:
            filename = os.path.join(directory_name, 'model.p')

        utils.safe_mkdir(directory_name)
        utils.pickle_to_file(self.model, filename)
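
utils.safe_mkdir is not shown in these snippets; it presumably just creates the directory when it is missing, along the lines of:

import os


def safe_mkdir(directory_name):
    """Create directory_name (and parents) if it does not exist yet."""
    os.makedirs(directory_name, exist_ok=True)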
Example No. 4
    def save_to_files(self, directory_name, name=None):
        """Saves all dataset files into a directory.

        Args:
            directory_name (string): Name of directory to save files.
            name (string, optional): additional name to add to the dataset
                filenames.
        """
        filename = self._get_objective_filename(directory_name, 'indices', name)
        utils.pickle_to_file(self.indices, filename)
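
_get_objective_filename belongs to the dataset class and is not shown. Judging only from the call sites in Examples 4 and 5, it likely composes a path from the directory, an objective name and the optional suffix, roughly:

import os


def _get_objective_filename(directory_name, objective, name=None):
    # Hypothetical reconstruction based on how the method is called above.
    if name is not None:
        basename = '{}_{}.p'.format(objective, name)
    else:
        basename = '{}.p'.format(objective)
    return os.path.join(directory_name, basename)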
Example No. 5
    def save_to_files(self, directory_name, name=None):
        """Saves all the sample files into the directory directory_name.

        Args:
            directory_name (string): Name of directory to save files.
            name (string, optional): additional name to add to the dataset
                filenames.
        """
        utils.safe_mkdir(directory_name)
        super(BaseSampledDataset, self).save_to_files(directory_name)
        utils.pickle_to_file(self._sample_indices, self._get_objective_filename(
            directory_name, 'sample_indices', name))
Example No. 6
def main():
    """Main fuction of the script."""
    args = utils.read_arguments(__doc__)
    documents = []
    filenames = list(traverse_directory(args["input_dirpath"], '*clean*.txt'))
    labels_dirname = args["labels_dirpath"]
    labels_from_json = get_all_labels_from_json(labels_dirname)
    for filename in tqdm(filenames):
        with AnnotatedIBMFactory(filename) as instance_extractor:
            filename_key = filename.split("/")[-1]
            document = instance_extractor.build_document(
                labels_from_json[filename_key])
            documents.append(document)
    utils.pickle_to_file(documents, args['output_file'])
Example No. 7
def get_graph(graph_filename, category_filename):
    if graph_filename and os.path.isfile(graph_filename):
        print('Reading pickled graph')
        hierarchy_graph = utils.pickle_from_file(graph_filename)
    else:
        hierarchy_graph = networkx.DiGraph()
        categories = utils.get_categories_from_file(category_filename)
        print('Downloading categories')
        for category_name in tqdm(categories):
            utils.add_subcategories(category_name, hierarchy_graph)
        if graph_filename:
            print('Saving graph')
            utils.pickle_to_file(hierarchy_graph, graph_filename)
    return hierarchy_graph
Example No. 8
def main():
    args = docopt(__doc__, version=1.0)
    mapping = utils.pickle_from_file(args['<mapping_filename>'])
    graph = utils.pickle_from_file(args['<graph_filename>'])

    yago_to_lkif = invert_mapping(mapping)

    for node in graph.nodes():
        if len(yago_to_lkif[node]) != 0:
            continue
        for ancestor in get_oldest_ancestors(node, graph):
            yago_to_lkif[node].update(yago_to_lkif[ancestor])

    utils.pickle_to_file(dict(yago_to_lkif), args['<output_file>'])
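
invert_mapping and get_oldest_ancestors come from the surrounding project. A plausible sketch of the inversion step, assuming mapping goes from LKIF class names to collections of YAGO nodes, is:

from collections import defaultdict


def invert_mapping(mapping):
    """Map each YAGO node to the set of LKIF classes that point to it."""
    yago_to_lkif = defaultdict(set)
    for lkif_class, yago_nodes in mapping.items():
        for node in yago_nodes:
            yago_to_lkif[node].add(lkif_class)
    return yago_to_lkif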
Example No. 9
def main(relation, limit, offset, directory_name):
    """Main script function."""

    utils.safe_mkdir(directory_name)

    query = """SELECT DISTINCT ?related ?wikiPage WHERE {
        ?movie rdf:type <http://yago-knowledge.org/resource/%s> .
        ?related <http://yago-knowledge.org/resource/%s> ?movie .
        ?related <http://yago-knowledge.org/resource/hasWikipediaUrl> ?wikiPage
        } LIMIT %s OFFSET %s""" % (MOVIE_CATEGORY_NAME, relation, limit,
                                   offset)
    response = utils.query_sparql(query, utils.YAGO_ENPOINT_URL)
    print('Reading {} objects.'.format(len(response)))
    filename = '{}-{}.pickle'.format(relation, offset)
    utils.pickle_to_file(response, os.path.join(directory_name, filename))
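
utils.query_sparql wraps a call to the YAGO SPARQL endpoint. One way it could be implemented is sketched below with SPARQLWrapper and JSON results; the real helper (and its return format) may differ:

from SPARQLWrapper import SPARQLWrapper, JSON


def query_sparql(query, endpoint_url):
    """Run query against endpoint_url and return the result bindings."""
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return results['results']['bindings']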
Example No. 10
def main():
    """Main function of script"""
    args = utils.read_arguments(__doc__)
    print('Loading documents')
    documents = utils.pickle_from_file(args['input_filename'])

    transformer = conll_feature_extractor.ConllFeatureExtractor(
        use_structural=True, use_syntactic=True, use_lexical=True)
    # Extract instances and labels. Each instance is a sentence, represented
    # as a list of feature dictionaries, one per word.
    instances = transformer.get_feature_dict(documents)

    utils.pickle_to_file(instances, args['output_filename'])

    print('All operations completed')
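
For reference, the instances produced by get_feature_dict are expected to look roughly like the structure below: one list per sentence, one feature dictionary per word (feature names here are purely illustrative).

instances = [
    [{'token:lower': 'the', 'postag': 'DT'},
     {'token:lower': 'claim', 'postag': 'NN'}],
]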
Example No. 11
def main():
    """Main fuction of the script."""
    args = utils.read_arguments(__doc__)
    documents = []
    filenames = get_input_files(args['input_dirpath'], r'.*txt',
                                int(args['limit']))
    if args['parse_trees']:
        parser = LexicalizedStanfordParser(
            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
            encoding='utf8')
    else:
        parser = None
    for filename in tqdm(filenames):
        with AnnotatedDocumentFactory(filename) as instance_extractor:
            document = instance_extractor.build_document()
            if args['parse_trees']:
                document.parse_text(parser)
            documents.append(document)

    utils.pickle_to_file(documents, args['output_file'])
Example No. 12
    def tag_dataset(partition_name):
        partition_name_short = 'dev' if 'dev' in partition_name else 'test'
        output_filename = os.path.join(
            args.output_dirname,
            'predictions_{}_{}_{}.conll'.format(args.experiment_name,
                                                dataset_name,
                                                partition_name_short))

        if attention_model is not None:
            tags, attention = model.predict(data[dataset_name][partition_name],
                                            return_attention=True)
            attention_filename = os.path.join(
                args.output_dirname,
                'attention_{}_{}_{}.p'.format(args.experiment_name,
                                              dataset_name,
                                              partition_name_short))
            utils.pickle_to_file(attention, attention_filename)
            del attention
        else:
            tags = model.tagSentences(data[dataset_name][partition_name])
        true_labels = []
        result = []

        for idx, (sentence, sentence_labels) in enumerate(
                zip(data[dataset_name][partition_name], tags[dataset_name])):
            for token, true_label_id, predicted_label in zip(
                    sentence['raw_tokens'], sentence[args.target_column],
                    sentence_labels):
                if token == 'PADDING_TOKEN':
                    continue
                true_label = label_encoding[true_label_id]
                true_labels.append(true_label)
                result.append((token, true_label, predicted_label, idx))

        result = pandas.DataFrame(
            result, columns=['Token', 'True', 'Predicted', 'Sentence'])
        result.to_csv(output_filename, sep='\t', index=False)
        print(
            metrics.classification_report(
                true_labels, numpy.concatenate(tags[dataset_name])))
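
The predictions file written above is a tab-separated table with Token, True, Predicted and Sentence columns, so it can be read back for error analysis with pandas (the filename below is hypothetical):

import pandas

predictions = pandas.read_csv('predictions_exp_dataset_dev.conll', sep='\t')
errors = predictions[predictions['True'] != predictions['Predicted']]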
Example No. 13
def main():
    """Main function of the script."""
    args = utils.read_arguments(__doc__)
    documents = []
    text_buffer = []
    if args['parse_trees']:
        parser = LexicalizedStanfordParser(
            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
            encoding='utf8')
    else:
        parser = None
    limit = int(args['limit'])

    with open(args['input_file'], 'r') as input_file:
        for line in input_file:
            if _is_new_document(line) and len(text_buffer) > 0:
                try:
                    documents.append(_create_document(text_buffer, parser))
                except Exception as e:
                    print('Creation failed for document: {}'.format(
                        text_buffer[0]))
                    print(e)
                text_buffer = [line]
                print('Adding case {} {}'.format(len(documents), line.strip()))
                if limit > 0 and len(documents) >= limit:
                    break

                if len(documents) % 10 == 0:  # Partial save
                    utils.pickle_to_file(documents, args['output_file'])

            else:
                text_buffer.append(line)

    if len(text_buffer) > 0:
        documents.append(_create_document(text_buffer, parser))
    print('{} documents processed'.format(len(documents)))

    utils.pickle_to_file(documents, args['output_file'])
    print('All tasks finished')
Example No. 14
def map_count(channel, count_name, temp_dir):
    total = LogCounter(count_name)

    count = 0
    job_id = channel.receive()
    while job_id is not None:
        # Merge the partial counter pickled by this map job, then clean up.
        temp_file_name = "%s/map_%d_%d" % (temp_dir, job_id,
                                           count_names.index(count_name))
        total.add_counter(pickle.load(open(temp_file_name, 'rb')))
        count += 1
        os.remove(temp_file_name)
        job_id = channel.receive()

    channel.send(pickle_to_file(total.report(), "%s/out_%d" % (
        temp_dir, count_names.index(count_name))))
v_lst, preds_lst, ic_lst, folds_lst = zip(
    *(mp.cv_predictiveness(data_1,
                           S_lst[1:][i],
                           measure_func,
                           ensemble_funcs[i],
                           V=5,
                           stratified=True,
                           na_rm=True,
                           type=pred_type) for i in range(len(S_lst[1:]))))
end = time.time()
print("Estimating predictiveness took " + str(end - start) + " seconds")
v_lst_all = [v_none] + list(v_lst)
preds_lst_all = [preds_none] + list(preds_lst)
ic_lst_all = [ic_none] + list(ic_lst)
uts.pickle_to_file(
    v_lst_all, args.output_dir + 'vs_' + args.measure + '_est_' +
    args.estimator_type + '.pkl')
uts.pickle_to_file(
    preds_lst_all, args.output_dir + 'preds_' + args.measure + '_est_' +
    args.estimator_type + '.pkl')
uts.pickle_to_file(
    ic_lst_all, args.output_dir + 'ics_' + args.measure + '_est_' +
    args.estimator_type + '.pkl')
## set up Z, v, W, G, c_n matrices
Z = np.array(Z_aug_lst)
v = np.array(v_lst_all)
W = np.diag(z_counts / np.sum(z_counts))
G = np.vstack((np.append(1, np.zeros(p)), np.ones(p + 1)))
c_n = np.array([v_none, v_lst_all[len(v_lst)]])

## --------------------------------------------------
def main(args=sys.argv[1:]):
    train_size = float(args[0])
    seed = int(args[1])
    icu_data_dir = args[2]

    # Read the y data
    outcomes = pd.read_csv(icu_data_dir + "Outcomes-a.txt")
    subject_outcomes = outcomes[["RecordID", "In-hospital_death"]]

    # Create a dictionary of features for each subject
    # Using a dictionary because some of the features don't appear in all subjects...
    value_range = {}  # this is just for printing out ranges of the values
    file_folder = icu_data_dir + "set-a/"
    all_subject_features = {}
    for idx, filename in enumerate(os.listdir(file_folder)[:MAX_PROCESS]):
        df = pd.read_csv("%s%s" % (file_folder, filename))
        df["hour"] = np.array([time.split(":")[0] for time in df.Time.values],
                              dtype=int)
        df["minute"] = np.array(
            [time.split(":")[1] for time in df.Time.values], dtype=int)
        df.Time = df.hour * 60 + df.minute

        record_id = int(df.loc[0].Value)
        subject_features = {"RecordID": record_id}
        for feat_name, process_func_list in FEATURES.items():
            if WEIGHTED_MEAN in process_func_list:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value > 0)]
            else:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value >= 0)]

            if sub_df.shape[0] == 0:
                continue
            if feat_name not in value_range:
                value_range[feat_name] = [
                    sub_df.Value.min(), sub_df.Value.max()
                ]
            else:
                value_range[feat_name][0] = min(value_range[feat_name][0],
                                                sub_df.Value.min())
                value_range[feat_name][1] = max(value_range[feat_name][1],
                                                sub_df.Value.max())

            for func in process_func_list:
                value = func(sub_df)
                if not np.isfinite(value):
                    print(value, feat_name, func.__name__)
                    print(sub_df)
                assert np.isfinite(value)
                full_feature_name = "%s:%s" % (feat_name, func.__name__)
                subject_features[full_feature_name] = value

        fio2_df = df.loc[df.Parameter == "FiO2"]
        pao2_df = df.loc[df.Parameter == "PaO2"]
        if fio2_df.shape[0] and pao2_df.shape[0]:
            fio2_mean = _get_mean(fio2_df)
            pao2_mean = _get_mean(pao2_df)
            if fio2_mean > 0:
                subject_features["O2:_get_ratio"] = pao2_mean / fio2_mean

        all_subject_features[idx] = subject_features

    for k, v in value_range.items():
        print(k, v)

    subjects_x = pd.DataFrame.from_dict(all_subject_features, orient="index")

    ## if a covariate has > 30% missing data, remove it
    prop_nan = subjects_x.apply(lambda x: np.mean(np.isnan(x)))
    print('Features filtered for proportion of NA values >= 0.3')
    print(prop_nan >= 0.3)
    tmp = subjects_x.loc[:, prop_nan < 0.3]
    subjects_x = tmp

    # Merge the X and Y data
    icu_subjects = subjects_x.merge(subject_outcomes, on="RecordID")
    death_resp = icu_subjects["In-hospital_death"]
    icu_subjects = icu_subjects.drop(columns=["RecordID"])

    # Grab column names
    column_names = list(icu_subjects.columns.values)
    print(column_names)
    # icu_subjects = icu_subjects.as_matrix()
    icu_subjects = icu_subjects.loc[:, column_names].values

    # Center the x covariates
    centering_term = np.nanmean(icu_subjects, axis=0)
    centering_term[-1] = 0
    icu_subjects -= centering_term
    assert np.all(death_resp == icu_subjects[:, -1])

    # randomly split the data
    if train_size < 1:
        mats = train_test_split(icu_subjects,
                                train_size=train_size,
                                test_size=1.0 - train_size,
                                random_state=seed)
        x_train = mats[0][:, :-1]
        y_train = mats[0][:, -1:]
        x_test = mats[1][:, :-1]
        y_test = mats[1][:, -1:]
    else:
        x_train = icu_subjects[:, :-1]
        y_train = icu_subjects[:, -1:]
        x_test = x_train
        y_test = y_train

    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)

    # Save the data
    icu_data = data_generator.Dataset(x_train=x_train,
                                      y_train=y_train,
                                      x_test=x_test,
                                      y_test=y_test)

    ## save off as a pickle
    icu_processed_file = icu_data_dir + "icu_data_processed.pkl"
    pickle_to_file(icu_data, icu_processed_file)

    icu_column_file = icu_data_dir + "icu_data_column_names.txt"
    with open(icu_column_file, "w") as f:
        for i, col in enumerate(column_names[:-1]):
            f.write("%d, %s\n" % (i, col))

    feature_group_list, vi_group_names, nan_fill_config = _process_feature_groups(
        column_names[:-1])
    print(
        "Copy paste this for creating the variable importance groups argument!"
    )
    print("--var-import-idx %s" % ";".join(feature_group_list))
    icu_vi_name_file = icu_data_dir + "icu_data_var_import_names.csv"
    vi_group_name_df = pd.DataFrame.from_dict(vi_group_names, orient="index")
    vi_group_name_df.to_csv(icu_vi_name_file)

    nan_config_file = icu_data_dir + "nan_fill_config.json"
    with open(nan_config_file, 'w') as f:
        json.dump(nan_fill_config, f)
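
FEATURES, WEIGHTED_MEAN and the _get_* helpers are defined elsewhere in that script. The loop above only assumes that FEATURES maps a parameter name to a list of aggregation functions applied to its sub-dataframe, roughly like this illustrative sketch:

# Illustrative only; the real script defines its own helpers and feature list.
def _get_mean(sub_df):
    return sub_df.Value.mean()


def _get_max(sub_df):
    return sub_df.Value.max()


FEATURES = {
    'HR': [_get_mean, _get_max],
    'FiO2': [_get_mean],
}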
Example No. 17
def prepare_dataset(embeddings_path,
                    datasets,
                    output_dirpath,
                    freq_threshold_unk_tokens=50,
                    reduce_embeddings=False,
                    value_transformations=None,
                    pad_onetoken_sentence=True):
    """Preprocess dataset and embeddings.

    Reads the pre-trained embeddings (in text format) from embeddings_path
    and prepares them to be used with the LSTM network. Unknown words in the
    training data are added if they appear at least freq_threshold_unk_tokens
    times.

    Args:
        embeddings_path: Full path to the pre-trained embeddings file.
            File must be in text format.
        datasets: A dictionary where the keys are the dataset names and the
            values are the specification for the dataset. The specifications are
            also dicts, with the keys columns, labels, evaluate, commentSymbol
            and dirpath. dirpath contains the path to the directory where the
            three partitions of the dataset (train, test, dev) are stored in
            txt format.
        output_dirpath: Path to the directory where the resulting pickled file
            is stored.
        freq_threshold_unk_tokens: Unknown words are added if they occur more
            than freq_threshold_unk_tokens times in the train set.
        reduce_embeddings: If True, only the embeddings needed for training
            are loaded.
        value_transformations: Column-specific value transformations.
        pad_onetoken_sentence: If True, pad sentences that contain a single
            token (needed for the CRF classifier).
    """
    utils.safe_mkdir(output_dirpath)
    embeddings_name = os.path.basename(embeddings_path)[:10]
    dataset_name = "_".join(sorted(datasets.keys()) + [embeddings_name])
    output_filename = os.path.join(output_dirpath, dataset_name + '.p')

    casing2Idx = preprocessing.getCasingVocab()
    embeddings, word2Idx = preprocessing.readEmbeddings(
        embeddings_path, datasets, freq_threshold_unk_tokens,
        reduce_embeddings)

    mappings = {'tokens': word2Idx, 'casing': casing2Idx}
    result = {
        'embeddings': embeddings,
        'mappings': mappings,
        'datasets': datasets,
        'data': {}
    }

    for name, dataset in datasets.items():
        trainData = os.path.join((dataset['dirpath']), 'train.txt')
        devData = os.path.join((dataset['dirpath']), 'dev.txt')
        testData = os.path.join((dataset['dirpath']), 'test.txt')
        paths = [trainData, devData, testData]
        print(paths)

        result['data'][name] = preprocessing.createPklFiles(
            paths, mappings, dataset['columns'], dataset['commentSymbol'],
            value_transformations, pad_onetoken_sentence)

    utils.pickle_to_file(result, output_filename)

    print("DONE - Embeddings file saved: {}".format(output_filename))
Example No. 18
def save_entities(category_name, entities, directory_name):
    filename = '{}.pickle'.format(category_name)
    utils.pickle_to_file(entities, os.path.join(directory_name, filename))
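
Reading one of these per-category pickles back simply reverses the call (the path below is hypothetical):

import os
import pickle

with open(os.path.join('entities', 'Movie.pickle'), 'rb') as file_:
    movie_entities = pickle.load(file_)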