Code Example #1
File: test_random.py  Project: colinsongf/pythia
import numpy as np

# NOTE: label_sample is assumed to be imported from the project's sampling
# module; the import statement is not part of this excerpt.
def test_random_resampling():
    data = [ { 'key': True, 'data': 123 },
             { 'key': True, 'data': 123 },
             { 'key': True, 'data': 123 },
             { 'key': True, 'data': 123 },
             { 'key': True, 'data': 123 },
             { 'key': False, 'data': 123 },
             { 'key': False, 'data': 123 },
            ]
    def get_state(seed):
        return np.random.RandomState(seed)
    sampled_data = label_sample(data, 'key', random_state=get_state(41))
    sampled_data2 = label_sample(data, 'key', random_state=get_state(41))
    assert sampled_data == sampled_data2
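This test only checks determinism: two identically seeded RandomState objects must produce the same sample. The sketch below shows what a label-balanced sampler of this shape could look like. label_sample_sketch is a hypothetical stand-in that only downsamples to the minority-class size, whereas the project's label_sample (see the later examples) also supports oversampling and replacement.

import numpy as np

def label_sample_sketch(data, label_key, random_state):
    """Downsample every label class to the minority-class size,
    deterministically with respect to the supplied RandomState."""
    # Group records by their label value
    groups = {}
    for record in data:
        groups.setdefault(record[label_key], []).append(record)
    # Target size is the smallest class size
    target = min(len(group) for group in groups.values())
    sampled = []
    for group in groups.values():
        chosen = random_state.choice(len(group), size=target, replace=False)
        sampled.extend(group[i] for i in chosen)
    return sampled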
Code Example #2
File: data_gen.py  Project: colinsongf/pythia
import copy
import os

import h5py
import numpy as np

# NOTE: the project-internal helpers used below (sampling, normalize, tokenize,
# bow, st, lda, w2v, run_cnn, wordonehot, gen_mem_net_observations,
# get_first_and_last_sentence, add_to_hdf5, logger) come from data_gen.py's own
# imports, which are not part of this excerpt.
def gen_observations(all_clusters,
                     lookup_order,
                     document_data,
                     features,
                     parameters,
                     vocab,
                     full_vocab,
                     encoder_decoder,
                     lda_model,
                     tf_session,
                     w2v_model,
                     hdf5_path=None,
                     dtype=np.float32):
    '''
    Generates observations for each cluster found in the JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model
        hdf5_path (str, optional): path of an HDF5 file to which features and labels are written incrementally
        dtype (np.dtype): dtype used for the stored feature vectors

    Returns:
        data (list): for each observation, the features of the document vs. the corpus, which may include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the label for each document, where one is novel and zero is duplicate
        ids (list): identifiers of the form "C<cluster_id>_P<post_id>" for each observation
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()
    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'
    # Truncate any existing files at save location, or return early if
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        open(hdf5_path, 'w').close()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [
            x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {
                'novelty': next_doc['novelty'],
                'data': copy.copy(observations)
            }
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling is requested, sample both classes up to the larger
    # class size, with replacement
    # Otherwise, sample both classes down to the smaller class size,
    # with or without replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        if resampling_parameters.get('over', False):
            desired_size = None
            resampling_parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if resampling_parameters.get('replacement', False):
            replacement = True
        else:
            replacement = False
        logger.debug("Replacement: {}, Desired size: {}".format(
            replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(
            len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty",
                                       replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here because docs will appear multiple times
    # across observations

    clusterids = []
    postids = []
    for case in corpus:

        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [
            normalize.xml_normalize(body_text) for body_text in case_docs_raw
        ]
        case_docs_no_stop_words = [
            normalize.normalize_and_remove_stop_words(body_text)
            for body_text in case_docs_raw
        ]
        # Create ids for individual data points
        postid = [record['post_id'] for record in case['data']][-1]
        postids.append(postid)
        clusterid = [record['cluster_id'] for record in case['data']][0]
        clusterids.append(clusterid)
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)
        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full,
                features['mem_net'], vocab, full_vocab, w2v_model,
                encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words,
                                      bkgd_text_no_stop_words,
                                      bkgd_docs_no_stop_words, vocab,
                                      features['bow'], feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words,
                                      bkgd_text_no_stop_words, vocab,
                                      lda_model, features['lda'],
                                      feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized,
                                      w2v_model, features['w2v'],
                                      feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(
                    normalize.xml_normalize(doc_raw),
                    normalize.xml_normalize(bkgd_text_raw), tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw,
                                             full_vocab,
                                             features['wordonehot'],
                                             feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors,
                                             axis=0).astype(dtype)
            # Fail catastrophically on zero vector (not sure if we need this)
            #assert not (feature_vectors < 0.0001).all()
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

        # save to HDF5 if desired
        if hdf5_path is not None and len(data) % hdf5_save_frequency == 0:
            with h5py.File(hdf5_path, 'a') as h5:
                data_np = np.array(data)
                labels_np = np.reshape(np.array(labels), (-1, 1))
                add_to_hdf5(h5, data_np, data_key)
                add_to_hdf5(h5, labels_np, labels_key, np.uint8)
                labels = list()
                data = list()
    # Save off any remainder
    if hdf5_path is not None and len(data) > 0:
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(data)
            labels_np = np.reshape(np.array(labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    ids = [
        "C" + str(clusterid) + "_P" + str(postid)
        for clusterid, postid in zip(clusterids, postids)
    ]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    else:
        return data, labels, ids
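The add_to_hdf5 helper called above is not part of this excerpt. Below is a minimal sketch of such an append helper, assuming it stores rows in a resizable dataset keyed by name (add_to_hdf5_sketch is a hypothetical body, not the pythia implementation):

import h5py
import numpy as np

def add_to_hdf5_sketch(h5, array, key, dtype=np.float32):
    """Append the rows of `array` to a resizable dataset named `key`
    in the already-open HDF5 file `h5`."""
    if key not in h5:
        # First flush: create a dataset that can grow along axis 0
        maxshape = (None,) + array.shape[1:]
        h5.create_dataset(key, data=array.astype(dtype), maxshape=maxshape)
    else:
        dset = h5[key]
        old_rows = dset.shape[0]
        dset.resize(old_rows + array.shape[0], axis=0)
        dset[old_rows:] = array.astype(dtype)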
Code Example #3
File: data_gen.py  Project: pcallier/pythia
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab, encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in the JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data (list): for each observation, the features of the document vs. the corpus, which may include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the label for each document, where one is novel and zero is duplicate
        ids (list): identifiers of the form "C<cluster_id>_P<post_id>" for each observation
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.','?','!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]

        observations = [document_data[sorted_entries[0]]]

        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = { 'novelty' : next_doc['novelty'],
                    'data' : copy.copy(observations) }
            corpus_unprocessed.append(labeled_observation)
    
    # Resample if necessary
    # If oversampling is requested, sample both classes up to the larger
    # class size, with replacement
    # Otherwise, sample both classes down to the smaller class size,
    # with or without replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)  
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here because docs will appear multiple times
    # across observations
    
    clusterids = []
    postids = []
    for case in corpus:
        
        # Create raw and normalized document arrays
        case_docs_raw = [ record['body_text'] for record in case['data'] ]
        case_docs_normalized = [ normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw ]
        # Create ids for individual data points
        postid = [record['post_id'] for record in case['data'] ][-1]
        postids.append(postid)
        clusterid = [ record['cluster_id'] for record in case['data'] ][0]
        clusterids.append(clusterid)
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized) 
        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'], vocab, full_vocab, w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized, vocab, features['bow'], feature_vectors)

            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab, lda_model, features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels
    
    ids = ["C" + str(clusterid) + "_P" + str(postid) for clusterid, postid in zip(clusterids,postids)]

   
    if 'mem_net' in features:
        return mem_net_features, labels, ids
    else:
        return data, labels, ids
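This revision reads the resampling flags from the top level of the parameters dict (the later revisions in Code Examples #2 and #5 nest them under a 'resampling' sub-dict). Hypothetical parameter dicts matching the keys this version checks, shown only for illustration:

# Oversample: both classes are sampled up to the larger class size;
# 'replacement' is forced to True by the code above.
params_oversample = {
    'seed': 41,
    'resampling': True,   # only key presence is checked, not the value
    'over': True,
}

# Downsample: both classes are sampled down to the smaller class size;
# include 'replacement' to sample with replacement, omit it to sample without.
params_downsample = {
    'seed': 41,
    'resampling': True,
    'replacement': True,
}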
Code Example #4
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab, encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in the JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data (list): for each observation, the features of the document vs. the corpus, which may include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the label for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.','?','!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]

        observations = [document_data[sorted_entries[0]]]

        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = { 'novelty' : next_doc['novelty'],
                    'data' : copy.copy(observations) }
            corpus_unprocessed.append(labeled_observation)
    
    # Resample if necessary
    # If oversampling is requested, sample both classes up to the larger
    # class size, with replacement
    # Otherwise, sample both classes down to the smaller class size,
    # with or without replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)  
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here because docs will appear multiple times
    # across observations
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [ record['body_text'] for record in case['data'] ]
        case_docs_normalized = [ normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw ]
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'], vocab, full_vocab, w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized, vocab, features['bow'], feature_vectors)

            if 'st' in features:
                sentences = [ get_first_and_last_sentence(doc) for doc in bkgd_docs_raw ]
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab, lda_model, features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    if 'mem_net' in features:
        return mem_net_features, labels
    else:
        return data, labels
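A hypothetical invocation sketch for this version, assuming only bag-of-words features are requested so the skip-thoughts, LDA, TensorFlow, and word2vec models can be passed as None, and assuming all_clusters, lookup_order, document_data, vocab, and full_vocab have been built by earlier pipeline stages (the contents of features['bow'] are also an assumption, not shown in this excerpt):

features = {'bow': {}}        # per-feature options are defined elsewhere in the pipeline
parameters = {'seed': 41}     # no resampling requested
data, labels = gen_observations(all_clusters, lookup_order, document_data,
                                features, parameters, vocab, full_vocab,
                                encoder_decoder=None, lda_model=None,
                                tf_session=None, w2v_model=None)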
Code Example #5
File: data_gen.py  Project: Lab41/pythia
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab, encoder_decoder, lda_model, tf_session, w2v_model, hdf5_path=None, dtype=np.float32):
    '''
    Generates observations for each cluster found in the JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model
        hdf5_path (str, optional): path of an HDF5 file to which features and labels are written incrementally
        dtype (np.dtype): dtype used for the stored feature vectors

    Returns:
        data (list): for each observation, the features of the document vs. the corpus, which may include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or one-hot CNN encoding
        labels (list): the label for each document, where one is novel and zero is duplicate
        ids (list): identifiers of the form "C<cluster_id>_P<post_id>" for each observation
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.','?','!']

    corpus_unprocessed = list()
    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'
    # Truncate any existing files at save location, or return early if 
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        open(hdf5_path, 'w').close()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = { 'novelty' : next_doc['novelty'],
                    'data' : copy.copy(observations) }
            corpus_unprocessed.append(labeled_observation)
    
    # Resample if necessary
    # If oversampling is requested, sample both classes up to the larger
    # class size, with replacement
    # Otherwise, sample both classes down to the smaller class size,
    # with or without replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        if resampling_parameters.get('over', False):
            desired_size = None
            resampling_parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if resampling_parameters.get('replacement', False):
            replacement = True
        else:
            replacement = False
        logger.debug("Replacement: {}, Desired size: {}".format(replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)  
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here because docs will appear multiple times
    # across observations
    
    clusterids = []
    postids = []
    for case in corpus:
        
        # Create raw and normalized document arrays
        case_docs_raw = [ record['body_text'] for record in case['data'] ]
        case_docs_normalized = [ normalize.xml_normalize(body_text) for body_text in case_docs_raw ]
        case_docs_no_stop_words = [ normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw ]
        # Create ids for individual data points
        postid = [record['post_id'] for record in case['data'] ][-1]
        postids.append(postid)
        clusterid = [ record['cluster_id'] for record in case['data'] ][0]
        clusterids.append(clusterid)
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized) 
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)
        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'], vocab, full_vocab, w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words, bkgd_text_no_stop_words,
                    bkgd_docs_no_stop_words, vocab, features['bow'], feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words, bkgd_text_no_stop_words, vocab, lda_model, features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(normalize.xml_normalize(doc_raw), normalize.xml_normalize(bkgd_text_raw), tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0).astype(dtype)
            # Fail catastrophically on zero vector (not sure if we need this)
            #assert not (feature_vectors < 0.0001).all() 
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)
        
        # save to HDF5 if desired
        if hdf5_path is not None and len(data) % hdf5_save_frequency == 0:
            with h5py.File(hdf5_path, 'a') as h5:
                data_np = np.array(data)
                labels_np = np.reshape(np.array(labels), (-1, 1))
                add_to_hdf5(h5, data_np, data_key)
                add_to_hdf5(h5, labels_np, labels_key, np.uint8)
                labels = list()
                data = list()
    # Save off any remainder
    if hdf5_path is not None and len(data) > 0:
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(data)
            labels_np = np.reshape(np.array(labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels
    
    ids = ["C" + str(clusterid) + "_P" + str(postid) for clusterid, postid in zip(clusterids,postids)]

   
    if 'mem_net' in features: 
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    else:
        return data, labels, ids
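When hdf5_path is given, the features and labels are flushed to the 'data' and 'labels' datasets every hdf5_save_frequency observations, and the function returns the file path instead of in-memory lists. A minimal read-back sketch, assuming add_to_hdf5 appends rows along the first axis of those datasets (load_observations is a hypothetical helper, not part of pythia):

import h5py
import numpy as np

def load_observations(hdf5_path):
    """Load the accumulated feature matrix and label vector written by gen_observations."""
    with h5py.File(hdf5_path, 'r') as h5:
        data = np.asarray(h5['data'])
        labels = np.asarray(h5['labels']).ravel()  # stored as (n, 1) uint8
    return data, labels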