Example #1
def process_sent(doc,
                 word2vec,
                 vocab,
                 ivocab,
                 word_vector_size,
                 to_return="word2vec",
                 silent=False,
                 encoder_decoder=None,
                 vocab_dict=None):
    # Avoid a shared mutable default argument
    if vocab_dict is None:
        vocab_dict = {}
    document_vector = []

    if to_return == "word2vec":
        document_vector = [
            process_word(w,
                         word2vec,
                         vocab,
                         ivocab,
                         word_vector_size,
                         to_return,
                         silent=True) for w in doc
        ]
    elif to_return == "skip_thought":
        sentences = punkt_sentences(doc)
        norm_sentences = [normalize.xml_normalize(s) for s in sentences]
        document_vector = [sk.encode(encoder_decoder, norm_sentences)]
    elif to_return == "one_hot":
        data_gen.run_onehot(doc, vocab_dict)

    return document_vector
Example #2
def build_w2v(trainingdata, min_count=5, window=5, size=100, workers=3, pretrained=False, **kwargs):
    '''
    Fits a Word2Vec word embedding model based on the training corpus sentences.

    Args:
        trainingdata (list): A list containing the training corpus as parsed JSON text
        min_count (int): ignore all words with total frequency lower than this number
        window (int): maximum distance between the current and predicted word within a sentence
        size (int): dimensionality of the feature vectors
        workers (int): use this many worker threads to train the model (faster training on multicore machines)
        pretrained (bool): if True, load Google's pretrained GoogleNews vectors instead of training a new model

    Returns:
        Word2Vec: A pretrained Word2Vec model from Google or a Word2Vec model fit to the training data sentences
    '''

    # Suppress gensim's INFO messages
    logging.getLogger("gensim").setLevel(logging.WARNING)

    # Use Google's pretrained Word2Vec model
    if pretrained:
        # Look at environment variable 'PYTHIA_MODELS_PATH' for user-defined model location
        # If environment variable is not defined, use current working directory
        if os.environ.get('PYTHIA_MODELS_PATH') is not None:
            path_to_models = os.environ.get('PYTHIA_MODELS_PATH')
        else:
            path_to_models = os.path.join(os.getcwd(), 'models')
        # Make the directory for the models unless it already exists
        try:
            os.makedirs(path_to_models)
        except OSError as exception:
            if exception.errno != errno.EEXIST: raise
        # Look for Google's trained Word2Vec model as a binary or zipped file; Return error and quit if not found
        if os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin")):
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        elif os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz")):
            with gzip.open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz"), 'rb') as f_in:
                with open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(
                os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        else:
            print("""Error: Google's pretrained Word2Vec model GoogleNews-vectors-negative300.bin was not found in %s
Set 'pretrained=False' or download/unzip GoogleNews-vectors-negative300.bin.gz
from https://code.google.com/archive/p/word2vec/ into %s""" % (path_to_models,path_to_models), file=sys.stderr)
            quit()

    # Train a Word2Vec model with the corpus
    else:
        sentencearray = []
        for entry in trainingdata:
            sentences = tokenize.punkt_sentences(xml_normalize(entry['body_text']))
            for sentence in sentences:
                words = tokenize.word_punct_tokens(sentence)
                sentencearray.append(words)

        w2v_model = gensim.models.Word2Vec(sentencearray, min_count=min_count, window=window, size=size, workers=workers)

    return w2v_model
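Note: this example targets an older gensim API. In gensim 1.0 and later the binary loader lives on KeyedVectors, and gensim 4.x renamed Word2Vec's size parameter to vector_size. A minimal sketch of the equivalent calls under those newer versions:

# Sketch only: the same operations with a newer gensim release (>= 4.0 assumed)
import gensim

# Load Google's pretrained GoogleNews vectors
w2v_vectors = gensim.models.KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin", binary=True)

# Train on your own tokenized sentences
w2v_model = gensim.models.Word2Vec(
    sentences=[["a", "tokenized", "sentence"], ["another", "one"]],
    min_count=1, window=5, vector_size=100, workers=3)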
Example #3
def process_sent(doc, word2vec, vocab, ivocab, word_vector_size, to_return="word2vec", silent=False, encoder_decoder=None, vocab_dict=None):
    # Avoid a shared mutable default argument
    if vocab_dict is None:
        vocab_dict = {}
    document_vector = []

    if to_return=="word2vec":
        document_vector = [process_word(w, word2vec, vocab, ivocab , word_vector_size, to_return, silent=True) for w in doc]
    elif to_return=="skip_thought":
        sentences = punkt_sentences(doc)
        norm_sentences = [normalize.xml_normalize(s) for s in sentences]
        document_vector = [ sk.encode(encoder_decoder, norm_sentences)]
    elif to_return=="one_hot":
        data_gen.run_onehot(doc, vocab_dict)

    return document_vector
Example #4
def run_w2v_matrix(w2v_model, doc, w2v_params, mask_mode):

    # Determine whether to use all sentences or only the first and last sentence
    if w2v_params.get('mem_w2v_mode', False):
        w2v_mode = w2v_params['mem_w2v_mode']
    else:
        w2v_mode = 'all'

    if w2v_mode == 'all':
        sentences = tokenize.punkt_sentences(doc)
    else:
        sentences = get_first_and_last_sentence(doc)

    normalizedsentences = []

    sentence_mask = []
    for sentence in sentences:
        words = tokenize.word_punct_tokens(sentence)
        if len(sentence_mask) > 0:
            prev_mask = sentence_mask[-1]
        else:
            prev_mask = -1
        sentence_mask.append(prev_mask + len(words))
        normalizedsentences.append(words)

    wordvectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:
        for word in phrase:
            wordvector = None
            try:
                wordvector_ = w2v_model[word]
                wordvector = [float(w) for w in wordvector_]
            except KeyError:
                # Out-of-vocabulary word: fall back to a seeded random vector
                wordvector = w2v_model.seeded_vector(np.random.rand())
            if wordvector is not None:
                wordvectorarray.append(wordvector)

    if mask_mode == 'sentence':
        mask = sentence_mask
    else:
        mask = np.array([index for index, w in enumerate(wordvectorarray)],
                        dtype=np.int32)

    if len(wordvectorarray) - 1 != mask[-1]:
        print(mask)
        print(np.array(wordvectorarray).shape)
        raise ValueError("Word vector count does not match the final mask index")

    return np.vstack(wordvectorarray), mask
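For reference, a minimal standalone sketch of the sentence-mask bookkeeping used above: the mask records, for each sentence, the index of its last word in the flattened word stream.

# Illustration only: plain lists, no model required
tokenized_sentences = [["the", "cat", "sat", "."], ["it", "purred", "."]]
sentence_mask = []
for words in tokenized_sentences:
    prev_mask = sentence_mask[-1] if sentence_mask else -1
    sentence_mask.append(prev_mask + len(words))
print(sentence_mask)  # [3, 6] -> last-word index of each sentence in the flattened stream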
Example #5
def run_w2v_matrix(w2v_model, doc, w2v_params, mask_mode):

    # Determine whether to use all sentences or only the first and last sentence
    if w2v_params.get('mem_w2v_mode', False):
        w2v_mode = w2v_params['mem_w2v_mode']
    else:
        w2v_mode = 'all'

    if w2v_mode == 'all':
        sentences = tokenize.punkt_sentences(doc)
    else:
        sentences = get_first_and_last_sentence(doc)

    normalizedsentences = []

    sentence_mask = []
    for sentence in sentences:
        words = tokenize.word_punct_tokens(sentence)
        if len(sentence_mask) > 0:
            prev_mask = sentence_mask[-1]
        else:
            prev_mask = -1
        sentence_mask.append(prev_mask + len(words))
        normalizedsentences.append(words)

    wordvectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:
        for word in phrase:
            wordvector = None
            try:
                wordvector_ = w2v_model[word]
                wordvector = [float(w) for w in wordvector_]
            except KeyError:
                # Out-of-vocabulary word: fall back to a seeded random vector
                wordvector = w2v_model.seeded_vector(np.random.rand())
            if wordvector is not None:
                wordvectorarray.append(wordvector)

    if mask_mode == 'sentence':
        mask = sentence_mask
    else:
        mask = np.array([index for index, w in enumerate(wordvectorarray)], dtype=np.int32)

    if len(wordvectorarray) - 1 != mask[-1]:
        print(mask)
        print(np.array(wordvectorarray).shape)
        raise ValueError("Word vector count does not match the final mask index")

    return np.vstack(wordvectorarray), mask
Example #6
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])
    first_and_last = [first, last]
    return first_and_last
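A standalone sketch of the same idea, assuming tokenize.punkt_sentences wraps NLTK's Punkt sentence tokenizer and that the normalization step lowercases the text (both are assumptions about this project's helpers):

import nltk  # requires the 'punkt' tokenizer data: nltk.download('punkt')

def first_and_last_sentence_sketch(doc):
    # Split into sentences, then return a lowercased first and last sentence
    sentences = nltk.sent_tokenize(doc)
    return [sentences[0].lower(), sentences[-1].lower()]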
Example #7
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])
    first_and_last = [first, last]
    return first_and_last
Example #8
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])

    # Protect against scenario where last sentence is mistakenly returned by parser as empty list
    if len(last) == 0:
        i = -2
        while len(last) == 0:
            last = normalize.xml_normalize(sentences[i])
            i -= 1

    first_and_last = [first, last]
    return first_and_last
Example #9
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])

    # Protect against scenario where last sentence is mistakenly returned by parser as empty list
    if len(last) == 0:
        i = -2
        while len(last) == 0:
            last = normalize.xml_normalize(sentences[i])
            i -= 1

    first_and_last = [first, last]
    return first_and_last
Example #10
def test_punkt():
    """Test sentence tokenization"""

    assert tokenize.punkt_sentences("S1. S2. S3! S4!!!") == ["S1.", "S2.", "S3!", "S4!!", "!"]
    assert tokenize.punkt_sentences("S1.      S4!!!") == ["S1.", "S4!!", "!"]
Example #11
def gen_observations(all_clusters,
                     lookup_order,
                     document_data,
                     features,
                     parameters,
                     vocab,
                     full_vocab,
                     encoder_decoder,
                     lda_model,
                     tf_session,
                     w2v_model,
                     hdf5_path=None,
                     dtype=np.float32):
    '''
    Generates observations for each cluster found in JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model
        hdf5_path (str): optional path to an HDF5 file for incrementally saving features and labels
        dtype (np.dtype): numpy dtype used for the feature vectors

    Returns:
        data (list): contains, for each observation, the features of the document vs the corpus, which could include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or onehot cnn encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()
    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'
    # Truncate any existing files at save location, or return early if
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        open(hdf5_path, 'w').close()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [
            x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {
                'novelty': next_doc['novelty'],
                'data': copy.copy(observations)
            }
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        if resampling_parameters.get('over', False):
            desired_size = None
            resampling_parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if resampling_parameters.get('replacement', False):
            replacement = True
        else:
            replacement = False
        logger.debug("Replacement: {}, Desired size: {}".format(
            replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(
            len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty",
                                       replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times
    # across observations

    clusterids = []
    postids = []
    for case in corpus:

        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [
            normalize.xml_normalize(body_text) for body_text in case_docs_raw
        ]
        case_docs_no_stop_words = [
            normalize.normalize_and_remove_stop_words(body_text)
            for body_text in case_docs_raw
        ]
        #create ids for individual data points
        postid = [record['post_id'] for record in case['data']][-1]
        postids.append(postid)
        clusterid = [record['cluster_id'] for record in case['data']][0]
        clusterids.append(clusterid)
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)
        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full,
                features['mem_net'], vocab, full_vocab, w2v_model,
                encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words,
                                      bkgd_text_no_stop_words,
                                      bkgd_docs_no_stop_words, vocab,
                                      features['bow'], feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words,
                                      bkgd_text_no_stop_words, vocab,
                                      lda_model, features['lda'],
                                      feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized,
                                      w2v_model, features['w2v'],
                                      feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(
                    normalize.xml_normalize(doc_raw),
                    normalize.xml_normalize(bkgd_text_raw), tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw,
                                             full_vocab,
                                             features['wordonehot'],
                                             feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors,
                                             axis=0).astype(dtype)
            # Fail catastrophically on zero vector (not sure if we need this)
            #assert not (feature_vectors < 0.0001).all()
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

        # save to HDF5 if desired
        if hdf5_path is not None and len(data) % hdf5_save_frequency == 0:
            with h5py.File(hdf5_path, 'a') as h5:
                data_np = np.array(data)
                labels_np = np.reshape(np.array(labels), (-1, 1))
                add_to_hdf5(h5, data_np, data_key)
                add_to_hdf5(h5, labels_np, labels_key, np.uint8)
                labels = list()
                data = list()
    # Save off any remainder
    if hdf5_path is not None and len(data) > 0:
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(data)
            labels_np = np.reshape(np.array(labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    ids = [
        "C" + str(clusterid) + "_P" + str(postid)
        for clusterid, postid in zip(clusterids, postids)
    ]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    else:
        return data, labels, ids
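The add_to_hdf5 helper used above is not shown here; a minimal sketch of what such an append helper could look like with h5py resizable datasets (an assumption, the project's actual implementation may differ):

import h5py
import numpy as np

def add_to_hdf5(h5, array, key, dtype=np.float32):
    # Sketch: create a resizable dataset on first use, then append rows along axis 0
    if key not in h5:
        maxshape = (None,) + array.shape[1:]
        h5.create_dataset(key, data=array.astype(dtype), maxshape=maxshape)
    else:
        dset = h5[key]
        start = dset.shape[0]
        dset.resize(start + array.shape[0], axis=0)
        dset[start:] = array.astype(dtype)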
Example #12
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full,
                             mem_net_params, vocab, full_vocab, w2v_model,
                             encoder_decoder):
    '''
    Generates observations to be fed into the mem_net code

    Args:
        raw_doc (string): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where the end of each input is;
            this can be per word or at the end of each sentence
    '''

    # Use the specified mask mode where available
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
    else:
        mask_mode = 'sentence'

    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
    else:
        embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure the document and corpus each contain at least two sentences; if not, duplicate the single sentence
        if len(sentences_full) == 1:
            #print("short corpus")
            sentences_full.extend(sentences_full)
        if len(doc_sentences) == 1:
            #print("short doc")
            doc_sentences.extend(doc_sentences)
        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors

    elif embed_mode == 'onehot':
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab = full_vocab

        # Preprocess and tokenize bkgd documents
        corpus_tokens = tokenize.word_punct_tokens(
            normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)
        # Get sentence mask indices
        # Ensure that the vocabulary includes punctuation so sentence boundaries can be found
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys()
        sentence_mask = get_mask(corpus_indices,
                                 onehot_vocab,
                                 max_length=max_length)
        # One-hot encode documents w/ masks, and query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded,
                                    onehot_vocab,
                                    min_length,
                                    max_length,
                                    already_encoded=True)
        # Tokenize and  one-hot encode query document
        doc_vectors = run_onehot(
            tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)),
            onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode == 'sentence':
            doc_masks = sentence_mask
        else:
            doc_masks = [index for index, w in enumerate(doc_input)]

    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus,
                                                   mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params,
                                        mask_mode)

        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    return doc_input, doc_questions, doc_masks
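For reference, a generic illustration of one-hot encoding a sequence of token indices; the project's run_onehot presumably adds the min_length/max_length padding and truncation handled above, and may use a different layout, so this is only a sketch of the core idea:

import numpy as np

def onehot_sketch(token_indices, vocab_size):
    # One row per vocabulary entry, one column per token
    encoded = np.zeros((vocab_size, len(token_indices)), dtype=np.float32)
    encoded[token_indices, np.arange(len(token_indices))] = 1.0
    return encoded

# Example: three tokens drawn from a five-word vocabulary
print(onehot_sketch([0, 3, 1], 5).shape)  # (5, 3)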
Example #13
def test_punkt():
    """Test sentence tokenization"""

    assert tokenize.punkt_sentences("S1. S2. S3! S4!!!") == ["S1.", "S2.", "S3!", "S4!!", "!"]
    assert tokenize.punkt_sentences("S1.      S4!!!") == ["S1.", "S4!!", "!"]
Example #14
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab, encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (???): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data (list): contains, for each observation, the features of the document vs the corpus, which could include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or onehot cnn encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.','?','!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]

        observations = [document_data[sorted_entries[0]]]

        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)
    
    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to 
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)  
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times 
    # across observations
    
    clusterids = []
    postids = []
    for case in corpus:
        
        # Create raw and normalized document arrays
        case_docs_raw = [ record['body_text'] for record in case['data'] ]
        case_docs_normalized = [ normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw ]
        #create ids for individual data points
        postid = [record['post_id'] for record in case['data'] ][-1]
        postids.append(postid)
        clusterid = [ record['cluster_id'] for record in case['data'] ][0]
        clusterids.append(clusterid)
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized) 
        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'], vocab, full_vocab, w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized, vocab, features['bow'], feature_vectors)

            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab, lda_model, features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels
    
    ids = ["C" + str(clusterid) + "_P" + str(postid) for clusterid, postid in zip(clusterids,postids)]

   
    if 'mem_net' in features:
        return mem_net_features, labels, ids
    else:
        return data, labels, ids
Example #15
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full, mem_net_params, vocab, full_vocab, w2v_model, encoder_decoder):
    '''
    Generates observations to be fed into the mem_net code

    Args:
        raw_doc (string): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where the end of each input is;
            this can be per word or at the end of each sentence
    '''

    # Use the specified mask mode where available
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
    else: mask_mode = 'sentence'

    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
    else: embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure the document and corpus each contain at least two sentences; if not, duplicate the single sentence
        if len(sentences_full) == 1:
            #print("short corpus")
            sentences_full.extend(sentences_full)
        if len(doc_sentences) == 1:
            #print("short doc")
            doc_sentences.extend(doc_sentences)
        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors


    elif embed_mode == 'onehot':
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab = full_vocab

        # Preprocess and tokenize bkgd documents
        corpus_tokens = tokenize.word_punct_tokens(normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)
        # Get sentence mask indices
        # Ensure that the vocabulary includes punctuation so sentence boundaries can be found
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys()
        sentence_mask = get_mask(corpus_indices, onehot_vocab, max_length=max_length)
        # One-hot encode documents w/ masks, and query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded, onehot_vocab, min_length, max_length, already_encoded=True)
        # Tokenize and  one-hot encode query document
        doc_vectors = run_onehot(tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)), 
                                    onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode == 'sentence':
            doc_masks = sentence_mask
        else:
            doc_masks = [index for index, w in enumerate(doc_input)]


    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus, mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params, mask_mode)

        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    return doc_input, doc_questions, doc_masks
Example #16
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab, encoder_decoder, lda_model, tf_session, w2v_model, hdf5_path=None, dtype=np.float32):
    '''
    Generates observations for each cluster found in JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model
        hdf5_path (str): optional path to an HDF5 file for incrementally saving features and labels
        dtype (np.dtype): numpy dtype used for the feature vectors

    Returns:
        data (list): contains, for each observation, the features of the document vs the corpus, which could include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or onehot cnn encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.','?','!']

    corpus_unprocessed = list()
    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'
    # Truncate any existing files at save location, or return early if 
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        open(hdf5_path, 'w').close()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)
    
    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to 
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        if resampling_parameters.get('over', False):
            desired_size = None
            resampling_parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if resampling_parameters.get('replacement', False):
            replacement = True
        else:
            replacement = False
        logger.debug("Replacement: {}, Desired size: {}".format(replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)  
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times 
    # across observations
    
    clusterids = []
    postids = []
    for case in corpus:
        
        # Create raw and normalized document arrays
        case_docs_raw = [ record['body_text'] for record in case['data'] ]
        case_docs_normalized = [ normalize.xml_normalize(body_text) for body_text in case_docs_raw ]
        case_docs_no_stop_words = [ normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw ]
        #create ids for individual data points
        postid = [record['post_id'] for record in case['data'] ][-1]
        postids.append(postid)
        clusterid = [ record['cluster_id'] for record in case['data'] ][0]
        clusterids.append(clusterid)
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized) 
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)
        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'], vocab, full_vocab, w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words, bkgd_text_no_stop_words,
                    bkgd_docs_no_stop_words, vocab, features['bow'], feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words, bkgd_text_no_stop_words, vocab, lda_model, features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(normalize.xml_normalize(doc_raw), normalize.xml_normalize(bkgd_text_raw), tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0).astype(dtype)
            # Fail catastrophically on zero vector (not sure if we need this)
            #assert not (feature_vectors < 0.0001).all() 
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)
        
        # save to HDF5 if desired
        if hdf5_path is not None and len(data) % hdf5_save_frequency == 0:
            with h5py.File(hdf5_path, 'a') as h5:
                data_np = np.array(data)
                labels_np = np.reshape(np.array(labels), (-1, 1))
                add_to_hdf5(h5, data_np, data_key)
                add_to_hdf5(h5, labels_np, labels_key, np.uint8)
                labels = list()
                data = list()
    # Save off any remainder
    if hdf5_path is not None and len(data) > 0:
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(data)
            labels_np = np.reshape(np.array(labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels
    
    ids = ["C" + str(clusterid) + "_P" + str(postid) for clusterid, postid in zip(clusterids,postids)]

   
    if 'mem_net' in features: 
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    else:
        return data, labels, ids
Example #17
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab, encoder_decoder, lda_model, tf_session, w2v_model):
    '''
    Generates observations for each cluster found in JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (???): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data (list): contains, for each observation, the features of the document vs the corpus, which could include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or onehot cnn encoding
        labels (list): the labels for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.','?','!']

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]

        observations = [document_data[sorted_entries[0]]]

        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {'novelty': next_doc['novelty'],
                                   'data': copy.copy(observations)}
            corpus_unprocessed.append(labeled_observation)
    
    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to 
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        if 'over' in parameters:
            desired_size = None
            parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if 'replacement' in parameters:
            replacement = True
        else:
            replacement = False
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)  
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times 
    # across observations
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [ record['body_text'] for record in case['data'] ]
        case_docs_normalized = [ normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw ]
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'], vocab, full_vocab, w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw, bkgd_docs_normalized, vocab, features['bow'], feature_vectors)

            if 'st' in features:
                sentences = [ get_first_and_last_sentence(doc) for doc in bkgd_docs_raw ]
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized, vocab, lda_model, features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(doc_normalized, bkgd_text_normalized, tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    if 'mem_net' in features:
        return mem_net_features, labels
    else:
        return data, labels