Code Example #1
def wordonehot(doc, corpus, vocab, transformations, feature, min_length=None, max_length=None):
    # Normalize and tokenize the text before sending it into the one-hot encoder
    norm_doc = tokenize.word_punct_tokens(normalize.xml_normalize(doc))
    norm_corpus = tokenize.word_punct_tokens(normalize.xml_normalize(corpus))
    doc_onehot = run_onehot(norm_doc, vocab, min_length, max_length)
    corpus_onehot = run_onehot(norm_corpus, vocab, min_length, max_length)
    feature = gen_feature([doc_onehot, corpus_onehot], transformations, feature)
    return feature
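
A minimal usage sketch for the example above. It assumes the pythia helpers referenced in the body (tokenize, normalize, run_onehot, gen_feature) are importable from the surrounding module; the document text, vocabulary, and transformation spec below are purely illustrative.

# Hedged usage sketch: the inputs are made up, not taken from the repository
doc = "The quick brown fox jumps over the lazy dog."
corpus = "Earlier posts about foxes and dogs form the background corpus."
vocab = {"the": 0, "quick": 1, "brown": 2, "fox": 3, "dog": 4}  # token -> index
transformations = []   # whatever gen_feature expects; empty here for illustration
feature = []           # running list of feature vectors, extended by gen_feature
feature = wordonehot(doc, corpus, vocab, transformations, feature)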
Code Example #2
File: data_gen.py Project: pcallier/pythia
def wordonehot(doc, corpus, vocab, transformations, feature, min_length=None, max_length=None):
    # Normalize and tokenize the text before sending it into the one-hot encoder
    norm_doc = tokenize.word_punct_tokens(normalize.xml_normalize(doc))
    norm_corpus = tokenize.word_punct_tokens(normalize.xml_normalize(corpus))
    doc_onehot = run_onehot(norm_doc, vocab, min_length, max_length)
    corpus_onehot = run_onehot(norm_corpus, vocab, min_length, max_length)
    feature = gen_feature([doc_onehot, corpus_onehot], transformations, feature)
    return feature
Code Example #3
def analyze_clusters(all_clusters, lookup_order, documentData):
    tasks = []
    lil_spacy = " "
    #Iterate through clusters found in JSON file, do feature assessments,
    #build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [
            x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]

        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus with new document vocabulary
        corpus = lil_spacy.join(word_punct_tokens(xml_normalize(first_doc)))

        # #check to make sure there are at least two sentences - important when using the sentence mask
        # sentences = punkt_sentences(first_doc)
        # if len(sentences) ==1:
        #     break

        #corpus = normalize_and_remove_stop_words(first_doc)

        # # Store a list of sentences in the cluster at each iteration
        # sentences = []
        # sentences += (data_gen.get_first_and_last_sentence(first_doc))
        task = {"C": "", "Q": "", "A": ""}
        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            #normalize and remove stop words from doc
            doc = lil_spacy.join(word_punct_tokens(xml_normalize(raw_doc)))
            #doc = normalize_and_remove_stop_words(raw_doc)

            # #check to make sure there are at least two sentences - important when using the sentence mask
            # sentences = punkt_sentences(raw_doc)
            # if len(sentences) ==1:
            #     break

            if documentData[index]["novelty"]:
                novelty = True
            else:
                novelty = False

            task["C"] += corpus
            task["Q"] = doc
            task["A"] = novelty
            tasks.append(task.copy())
            corpus += doc

    return tasks
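
For orientation, a sketch of the inputs the function above expects, inferred only from how they are indexed in the body (word_punct_tokens and xml_normalize are assumed to be imported from pythia's tokenize/normalize modules); the values are hypothetical.

# Hypothetical inputs, shaped the way analyze_clusters indexes them
all_clusters = {0}                           # iterable of cluster ids
lookup_order = {0: [(0, 0), (1, 1)]}         # cluster id -> [(arrival order, doc index), ...]
documentData = [
    {"body_text": "First post in the cluster.", "novelty": True},
    {"body_text": "A follow-up post on the same topic.", "novelty": False},
]
tasks = analyze_clusters(all_clusters, lookup_order, documentData)
# each task is {"C": rolling corpus so far, "Q": the new document, "A": its novelty flag}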
Code Example #4
File: utils.py Project: Lab41/pythia
def analyze_clusters(all_clusters, lookup_order, documentData):
    tasks = []
    lil_spacy = " "
    #Iterate through clusters found in JSON file, do feature assessments,
    #build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]

        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus with new document vocabulary
        corpus = lil_spacy.join(word_punct_tokens(xml_normalize(first_doc)))

        # #check to make sure there are at least two sentences - important when using the sentence mask
        # sentences = punkt_sentences(first_doc)
        # if len(sentences) ==1:
        #     break

        #corpus = normalize_and_remove_stop_words(first_doc)

        # # Store a list of sentences in the cluster at each iteration
        # sentences = []
        # sentences += (data_gen.get_first_and_last_sentence(first_doc))
        task = {"C": "","Q": "", "A": ""}
        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            #normalize and remove stop words from doc
            doc = lil_spacy.join(word_punct_tokens(xml_normalize(raw_doc)))
            #doc = normalize_and_remove_stop_words(raw_doc)

            # #check to make sure there are at least two sentences - important when using the sentence mask
            # sentences = punkt_sentences(raw_doc)
            # if len(sentences) ==1:
            #     break

            if documentData[index]["novelty"]:
                novelty=True
            else:
                novelty=False

            task["C"] += corpus
            task["Q"] = doc
            task["A"] = novelty
            tasks.append(task.copy())
            corpus+=doc

    return tasks
Code Example #5
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])
    first_and_last = [first, last]
    return first_and_last
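
A short usage sketch, assuming pythia's tokenize.punkt_sentences and normalize.xml_normalize are importable (NLTK's punkt sentence model is needed by the tokenizer); the input text is made up.

# Hypothetical document with three sentences
doc = "The <b>first</b> sentence. Something in the middle. The last sentence!"
first, last = get_first_and_last_sentence(doc)
# first -> normalized form of "The <b>first</b> sentence."
# last  -> normalized form of "The last sentence!"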
Code Example #6
File: observations.py Project: dav009/pythia
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])
    first_and_last = [first, last]
    return first_and_last
Code Example #7
File: tensorflow_cnn.py Project: colinsongf/pythia
    def prep_news_data(self, vocab, min_length, max_length):
        from sklearn.datasets import fetch_20newsgroups
        newsgroups = fetch_20newsgroups()

        documents = [
            data_gen.run_onehot(normalize.xml_normalize(text), vocab,
                                min_length, max_length)
            for text in newsgroups.data
        ]
        labels = newsgroups.target

        #encode the labels in a dictionary
        unique_labels = np.unique(labels)
        i = 0
        unique_label_dict = {}
        for u_c in unique_labels:
            unique_label_dict[u_c] = i
            i += 1

        hot_labels = []
        n_classes = len(unique_labels)
        for c in labels:
            cluster_vect = np.zeros(n_classes, dtype=int)
            cluster_vect[unique_label_dict[c]] = 1
            hot_labels.append(cluster_vect.tolist())

        return documents, hot_labels, n_classes
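
As an aside, the two label-encoding loops above can be condensed into a comprehension; the sketch below should be behaviour-equivalent, assuming numpy is imported as np and labels holds integer class ids as in the example.

# Condensed one-hot label encoding, equivalent to the loops above
unique_labels = np.unique(labels)                                   # sorted unique class ids
unique_label_dict = {u_c: i for i, u_c in enumerate(unique_labels)}
n_classes = len(unique_labels)
hot_labels = [np.eye(n_classes, dtype=int)[unique_label_dict[c]].tolist() for c in labels]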
Code Example #8
def parse_file(inData, f):
    documents = ""
    tasks = []
    for i, line in enumerate(inData):
        #print(i, line)
        line = line.strip()
        try:
            post = json.loads(line)  # make sure we can parse the json
        except Exception:
            print("Error with file " + f)
            #continue
        text = post["body_text"]
        text = xml_normalize(text)  # call function from pythia normalize
        novelty = post["novelty"]
        task = {"C": "", "Q": "", "A": ""}
        if i < 1:
            documents += text  # add the first document before setting any tasks
        elif i < 200:
            task["C"] += documents  # add the next 200 documents as a task with the new document as a question.
            task["Q"] = text
            task["A"] = novelty
            tasks.append(task.copy())
            documents += text
    return tasks
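
A usage sketch under the assumption that inData is an iterable of JSON lines (for example an open file) whose objects carry body_text and novelty fields, which is all the code above relies on; json and xml_normalize are assumed to be imported in parse_file's module, and the file name is a placeholder.

# Hypothetical invocation; every line of the file is one JSON-encoded post
with open("cluster_0001.json") as in_file:
    tasks = parse_file(in_file, "cluster_0001.json")
# tasks is a list of {"C": accumulated documents, "Q": the new document, "A": its novelty flag}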
Code Example #9
def build_w2v(trainingdata, min_count=5, window=5, size=100, workers=3, pretrained=False, **kwargs):
    '''
    Fits a Word2Vec topic model based on the training corpus sentences.

    Args:
        trainingdata (list): A list containing the training corpus as parsed JSON text
        min_count (int): ignore all words with total frequency lower than this number
        window (int): maximum distance between the current and predicted word within a sentence
        size (int): dimensionality of the feature vectors
        workers (int): use this many worker threads to train the model (faster training with multicore machines)

    Returns:
        Word2Vec: A pretrained Word2Vec model from Google or a Word2Vec model fit to the training data sentences
    '''

    # Suppress gensim's INFO messages
    logging.getLogger("gensim").setLevel(logging.WARNING)

    # Use Google's pretrained Word2Vec model
    if pretrained:
        # Look at environment variable 'PYTHIA_MODELS_PATH' for user-defined model location
        # If environment variable is not defined, use current working directory
        if os.environ.get('PYTHIA_MODELS_PATH') is not None:
            path_to_models = os.environ.get('PYTHIA_MODELS_PATH')
        else:
            path_to_models = os.path.join(os.getcwd(), 'models')
        # Make the directory for the models unless it already exists
        try:
            os.makedirs(path_to_models)
        except OSError as exception:
            if exception.errno != errno.EEXIST: raise
        # Look for Google's trained Word2Vec model as a binary or zipped file; Return error and quit if not found
        if os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin")):
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        elif os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz")):
            with gzip.open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz"), 'rb') as f_in:
                with open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(
                os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        else:
            print("""Error: Google's pretrained Word2Vec model GoogleNews-vectors-negative300.bin was not found in %s
Set 'pretrained=False' or download/unzip GoogleNews-vectors-negative300.bin.gz
from https://code.google.com/archive/p/word2vec/ into %s""" % (path_to_models,path_to_models), file=sys.stderr)
            quit()

    # Train a Word2Vec model with the corpus
    else:
        sentencearray = []
        for entry in trainingdata:
            sentences = tokenize.punkt_sentences(xml_normalize(entry['body_text']))
            for sentence in sentences:
                words = tokenize.word_punct_tokens(sentence)
                sentencearray.append(words)

        w2v_model = gensim.models.Word2Vec(sentencearray, min_count=min_count, window=window, size=size, workers=workers)

    return w2v_model
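
A usage sketch for the locally trained branch, assuming trainingdata is a list of parsed JSON posts with a body_text field (as the loop above expects) and that gensim plus pythia's tokenize/normalize helpers are importable.

# Hypothetical training data mirroring the parsed JSON posts used above
trainingdata = [
    {"body_text": "First post. It has two sentences."},
    {"body_text": "Another post with a little more text in it."},
]
w2v_model = build_w2v(trainingdata, min_count=1, window=5, size=50, workers=1, pretrained=False)
# w2v_model is a gensim Word2Vec model fit to the sentences of the training posts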
Code Example #10
File: data_gen.py Project: colinsongf/pythia
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])

    # Protect against scenario where last sentence is mistakenly returned by parser as empty list
    if len(last) == 0:
        i = -2
        while len(last) == 0:
            last = normalize.xml_normalize(sentences[i])
            i -= 1

    first_and_last = [first, last]
    return first_and_last
Code Example #11
File: data_gen.py Project: pcallier/pythia
def get_first_and_last_sentence(doc):
    '''
    Finds the first and last sentence of a document and normalizes them.

    Args:
        doc (str): the text of the document (before any preprocessing)

    Returns:
        array: the first and last sentence after normalizing
    '''
    sentences = tokenize.punkt_sentences(doc)
    first = normalize.xml_normalize(sentences[0])
    last = normalize.xml_normalize(sentences[-1])

    # Protect against scenario where last sentence is mistakenly returned by parser as empty list
    if len(last)==0:
        i = -2
        while len(last)==0:
            last = normalize.xml_normalize(sentences[i])
            i-=1

    first_and_last = [first, last]
    return first_and_last
Code Example #12
File: preprocess.py Project: codeaudit/pythia
def gen_full_vocab(corpus_dict,
                   full_vocab_type='word',
                   full_vocab_size=1000,
                   stem=False,
                   full_char_vocab="",
                   token_include={'.', ',', '!', '?'},
                   **kwargs):
    '''
    Generates a dictionary of words to be used as the vocabulary in features that utilize bag of words.
    This vocab contains stop words and punctuation

    Args:
        corpus_dict (OrderedDict): An ordered list of the most frequently occurring tokens in the corpus
        full_vocab_size (int): the number of words to be used in the vocabulary

    Returns:
        dict: a dictionary of size full_vocab_size that contains the most frequent normalized words in the corpus (including stop words)
    '''

    vocabdict = dict()
    if full_vocab_type == 'char':
        index = 0
        for c in full_char_vocab:
            vocabdict[c] = index
            index += 1

    else:
        index = 0
        vocabdict = dict()
        for word in corpus_dict:
            if len(vocabdict) < full_vocab_size:
                cleantext = xml_normalize(word, stem)
                if cleantext != '':
                    if not cleantext in vocabdict:
                        vocabdict[cleantext] = index
                        index += 1
            else:
                break

    #For each of these we need to ensure that the punctuation or list of tokens we desire is in the dictionary
    for t in token_include:
        if t not in vocabdict.keys():
            vocabdict[t] = index
            index += 1

    return vocabdict
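
A brief calling sketch for the word-level branch above; corpus_dict is assumed to be an ordered mapping of tokens keyed by frequency (as the docstring describes), and the tokens shown are hypothetical.

from collections import OrderedDict

# Most frequent tokens first, as produced earlier in the preprocessing pipeline
corpus_dict = OrderedDict([("the", 500), ("fox", 120), ("dog", 80)])
vocab = gen_full_vocab(corpus_dict, full_vocab_type='word', full_vocab_size=100)
# vocab maps each kept normalized token, plus '.', ',', '!', '?', to an integer index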
Code Example #13
File: preprocess.py Project: Lab41/pythia
def gen_full_vocab(corpus_dict, full_vocab_type='word', full_vocab_size=1000, full_vocab_stem=False, full_char_vocab="", token_include = {'.',',','!','?'}, **kwargs):
    '''
    Generates a dictionary of words to be used as the vocabulary in features that utilize bag of words.
    This vocab contains stop words and punctuation

    Args:
        corpus_dict (OrderedDict): An ordered list of the most frequently occurring tokens in the corpus
        full_vocab_size (int): the number of words to be used in the vocabulary

    Returns:
        dict: a dictionary of size full_vocab_size that contains the most frequent normalized words in the corpus (including stop words)
    '''

    vocabdict = dict()
    if full_vocab_type=='character':
        index=0
        for c in full_char_vocab:
            vocabdict[c] = index
            index+= 1

    else:
        index = 0
        vocabdict = dict()
        for word in corpus_dict:
            if len(vocabdict) < full_vocab_size:
                cleantext = xml_normalize(word, full_vocab_stem)
                if cleantext != '':
                    if not cleantext in vocabdict:
                        vocabdict[cleantext] = index
                        index+=1
            else: break

    #For each of these we need to ensure that the punctuation or list of tokens we desire is in the dictionary
    for t in token_include:
        if t not in vocabdict.keys():
            vocabdict[t] = index
            index+=1

    return vocabdict
Code Example #14
File: tensorflow_cnn.py Project: Lab41/pythia
    def prep_news_data(self, vocab, min_length, max_length):
        from sklearn.datasets import fetch_20newsgroups
        newsgroups= fetch_20newsgroups()

        documents = [data_gen.run_onehot(normalize.xml_normalize(text), vocab, min_length, max_length)
                     for text in newsgroups.data]
        labels = newsgroups.target

        #encode the labels in a dictionary
        unique_labels = np.unique(labels)
        i = 0
        unique_label_dict = {}
        for u_c in unique_labels:
            unique_label_dict[u_c] = i
            i +=1

        hot_labels = []
        n_classes = len(unique_labels)
        for c in labels:
            cluster_vect = np.zeros(n_classes, dtype=int)
            cluster_vect[unique_label_dict[c]]=1
            hot_labels.append(cluster_vect.tolist())

        return documents, hot_labels, n_classes
Code Example #15
File: utils.py Project: Lab41/pythia
def parse_file(inData, f):
    documents = ""
    tasks = []
    for i, line in enumerate(inData):
        #print(i, line)
        line = line.strip()
        try:
            post = json.loads(line) # make sure we can parse the json
        except Exception:
            print("Error with file " +  f)
            #continue
        text = post["body_text"]
        text = xml_normalize(text) # call function from pythia normalize
        novelty = post["novelty"]
        task = {"C": "","Q": "", "A": ""}
        if i < 1:
            documents += text # add the first document before setting any tasks
        elif i < 200:
            task["C"] += documents # add the next 200 documents as a task with the new document as a question.
            task["Q"] = text
            task["A"] = novelty
            tasks.append(task.copy())
            documents += text
    return tasks
Code Example #16
File: data_gen.py Project: colinsongf/pythia
def gen_observations(all_clusters,
                     lookup_order,
                     document_data,
                     features,
                     parameters,
                     vocab,
                     full_vocab,
                     encoder_decoder,
                     lda_model,
                     tf_session,
                     w2v_model,
                     hdf5_path=None,
                     dtype=np.float32):
    '''
    Generates observations for each cluster found in JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data(list): contains for each observation the features of the document vs corpus, which could include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or onehot cnn encoding
        labels(list): the labels for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.', '?', '!']

    corpus_unprocessed = list()
    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'
    # Truncate any existing files at save location, or return early if
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        open(hdf5_path, 'w').close()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [
            x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = {
                'novelty': next_doc['novelty'],
                'data': copy.copy(observations)
            }
            corpus_unprocessed.append(labeled_observation)

    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        if resampling_parameters.get('over', False):
            desired_size = None
            resampling_parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if resampling_parameters.get('replacement', False):
            replacement = True
        else:
            replacement = False
        logger.debug("Replacement: {}, Desired size: {}".format(
            replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(
            len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty",
                                       replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times
    # across observations

    clusterids = []
    postids = []
    for case in corpus:

        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [
            normalize.xml_normalize(body_text) for body_text in case_docs_raw
        ]
        case_docs_no_stop_words = [
            normalize.normalize_and_remove_stop_words(body_text)
            for body_text in case_docs_raw
        ]
        #create ids for individual data points
        postid = [record['post_id'] for record in case['data']][-1]
        postids.append(postid)
        clusterid = [record['cluster_id'] for record in case['data']][0]
        clusterids.append(clusterid)
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)
        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full,
                features['mem_net'], vocab, full_vocab, w2v_model,
                encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words,
                                      bkgd_text_no_stop_words,
                                      bkgd_docs_no_stop_words, vocab,
                                      features['bow'], feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words,
                                      bkgd_text_no_stop_words, vocab,
                                      lda_model, features['lda'],
                                      feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized,
                                      w2v_model, features['w2v'],
                                      feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(
                    normalize.xml_normalize(doc_raw),
                    normalize.xml_normalize(bkgd_text_raw), tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw,
                                             full_vocab,
                                             features['wordonehot'],
                                             feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors,
                                             axis=0).astype(dtype)
            # Fail catastrophically on zero vector (not sure if we need this)
            #assert not (feature_vectors < 0.0001).all()
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)

        # save to HDF5 if desired
        if hdf5_path is not None and len(data) % hdf5_save_frequency == 0:
            with h5py.File(hdf5_path, 'a') as h5:
                data_np = np.array(data)
                labels_np = np.reshape(np.array(labels), (-1, 1))
                add_to_hdf5(h5, data_np, data_key)
                add_to_hdf5(h5, labels_np, labels_key, np.uint8)
                labels = list()
                data = list()
    # Save off any remainder
    if hdf5_path is not None and len(data) > 0:
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(data)
            labels_np = np.reshape(np.array(labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels

    ids = [
        "C" + str(clusterid) + "_P" + str(postid)
        for clusterid, postid in zip(clusterids, postids)
    ]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    else:
        return data, labels, ids
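
For reference, a sketch of the run-parameter keys that gen_observations actually reads above (seed, hdf5_save_frequency, hdf5_use_existing, and the optional resampling block); the values shown are illustrative only.

# Illustrative parameters dict, limited to keys referenced in gen_observations
parameters = {
    "seed": 41,                    # seeds the NumPy RandomState used for resampling
    "hdf5_save_frequency": 100,    # flush data/labels to HDF5 every N observations
    "hdf5_use_existing": False,    # if True and the HDF5 file exists, return its path unchanged
    "resampling": {                # optional; omit this key to skip resampling
        "over": True,              # oversample up to the larger class size
        "replacement": True,       # sample with replacement
    },
}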
Code Example #17
File: data_gen.py Project: colinsongf/pythia
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full,
                             mem_net_params, vocab, full_vocab, w2v_model,
                             encoder_decoder):
    '''
    Generates observations to be fed into the mem_net code

    Args:
        raw_doc (string): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where the end of each input is;
            this can be per word or at the end of each sentence
     '''

    # Use the specified mask mode where available
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
    else:
        mask_mode = 'sentence'

    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
    else:
        embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure that the document and corpus are long enough and if not make them be long enough
        if len(sentences_full) == 1:
            #print("short corpus")
            sentences_full.extend(sentences_full)
        if len(doc_sentences) == 1:
            #print("short doc")
            doc_sentences.extend(doc_sentences)
        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors

    elif embed_mode == 'onehot':
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab = full_vocab

        # Preprocess and tokenize bkgd documents
        corpus_tokens = tokenize.word_punct_tokens(
            normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)
        # Get sentence mask indices
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys()  # ensure that you are using a vocabulary w/ punctuation
        sentence_mask = get_mask(corpus_indices,
                                 onehot_vocab,
                                 max_length=max_length)
        # One-hot encode documents w/ masks, and query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded,
                                    onehot_vocab,
                                    min_length,
                                    max_length,
                                    already_encoded=True)
        # Tokenize and  one-hot encode query document
        doc_vectors = run_onehot(
            tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)),
            onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode == 'sentence':
            doc_masks = sentence_mask
        else:
            doc_masks = [index for index, w in enumerate(doc_input)]

    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus,
                                                   mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params,
                                        mask_mode)

        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    return doc_input, doc_questions, doc_masks
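
Similarly, a sketch of a mem_net_params dict restricted to the keys the function above checks (embed_mode, mask_mode, onehot_min_len, onehot_max_len); the values are illustrative.

# Illustrative mem_net feature spec; only keys read by gen_mem_net_observations
mem_net_params = {
    "embed_mode": "onehot",     # one of 'skip_thought', 'onehot', 'word2vec' (the default)
    "mask_mode": "sentence",    # 'sentence' masks at sentence boundaries, anything else masks per entry
    "onehot_max_len": 1000,     # optional bounds passed through to run_onehot
    "onehot_min_len": 10,
}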
Code Example #18
File: data_gen.py Project: pcallier/pythia
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full, mem_net_params, vocab, full_vocab, w2v_model, encoder_decoder):
    '''
    Generates observations to be fed into the mem_net code

    Args:
        raw_doc (string): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where the end of each input is;
            this can be per word or at the end of each sentence
     '''

    # Use the specified mask mode where available
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
    else: mask_mode = 'sentence'

    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
    else: embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure that the document and corpus are long enough and if not make them be long enough
        if len(sentences_full)==1:
            #print("short corpus")
            sentences_full.extend(sentences_full)
        if len(doc_sentences)==1:
            #print("short doc")
            doc_sentences.extend(doc_sentences)
        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors


    elif embed_mode == 'onehot':
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab=full_vocab

        # Preprocess and tokenize bkgd documents
        corpus_tokens = tokenize.word_punct_tokens(normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)
        # Get sentence mask indices
        assert {'.',',','!','?'} <= onehot_vocab.keys()  # ensure that you are using a vocabulary w/ punctuation
        sentence_mask = get_mask(corpus_indices, onehot_vocab, max_length=max_length)
        # One-hot encode documents w/ masks, and query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded, onehot_vocab, min_length, max_length, already_encoded=True)
        # Tokenize and  one-hot encode query document
        doc_vectors = run_onehot(tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)), 
                                    onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode=='sentence':
            doc_masks = sentence_mask
        else: doc_masks = [index for index, w in enumerate(doc_input)]


    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus, mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params, mask_mode)

        if len(corpus_vectors)>0 and len(doc_vectors)>0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    return doc_input, doc_questions, doc_masks
Code Example #19
def test_xml_normalize():
    assert normalize.xml_normalize(
        "my link is <http://link.com>. Enough.") == 'my link is . enough.'
Code Example #20
File: utils.py Project: Lab41/pythia
def analyze_clusters2(all_clusters, lookup_order, documentData, vector_type, word2vec, word_vector_size, param, in_dict={}):
    #This is mostly cut and paste from data_gen but with some differences
    #TODO in the future fold this into data_gen more....but would need somewhat extensive changes there

    # Prepare to store results of feature assessments
    tasks = []

    #initialize vocab and ivocab to empty dictionaries
    if 'vocab' in in_dict:
        print("using a vocab")
        vocab = in_dict['vocab']
    else:
        vocab = {}
    if 'ivocab' in in_dict:
        ivocab = in_dict['ivocab']
    else:
        ivocab = {}
    if 'word2vec' in in_dict:
        built_word2vec = in_dict['word2vec']
    else:
        built_word2vec = word2vec.copy()

    inputs = []
    answers = []
    input_masks = []
    questions = []

    #Iterate through clusters found in JSON file, do feature assessments,
    #build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]

        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus with new document vocabulary
        corpus = xml_normalize(first_doc)
        built_word2vec, vocab, ivocab = build_vocab(first_doc, built_word2vec, vocab, ivocab, word_vector_size)

        # Store a list of sentences in the cluster at each iteration
        sentences = []
        sentences += (data_gen.get_first_and_last_sentence(first_doc))
        task = {"C": "","Q": "", "A": ""}
        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            #normalize and remove stop words from doc
            doc = xml_normalize(raw_doc)
            built_word2vec, vocab, ivocab = build_vocab(doc, built_word2vec, vocab, ivocab, word_vector_size)

            if documentData[index]["novelty"]:
                novelty=True
                answers.append(1)
            else:
                novelty=False
                answers.append(0)

            inp_vector = [process_word(word = w,
                                        word2vec = built_word2vec,
                                        vocab = vocab,
                                        ivocab = ivocab,
                                        to_return = vector_type, silent=True) for w in corpus]

            question_rep = [process_word(word = w,
                                        word2vec = built_word2vec,
                                        vocab = vocab,
                                        ivocab = ivocab,
                                        to_return = vector_type, silent=True) for w in corpus]

            # task["C"] += corpus
            # task["Q"] = doc
            # task["A"] = novelty
            # tasks.append(task.copy())
            # corpus+=doc
            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(question_rep).astype(floatX))
            input_masks.append(np.array([index for index, w in enumerate(doc)], dtype=np.int32))
    print(len(inputs), len(questions), len(answers), len(input_masks))

    results = {}
    results['vocab'] = vocab
    results['ivocab'] = ivocab
    results['word2vec'] = built_word2vec
    results[param+'_inputs'] = inputs
    results[param+'_questions'] = questions
    results[param+'_answers'] = answers
    results[param+'_input_masks'] = input_masks

    return results
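
A note on the optional in_dict argument above: only the vocab, ivocab, and word2vec keys are consulted, and they match keys in the returned results dict, so a caller resuming from an earlier run might pass something like the following (purely illustrative).

# Illustrative in_dict for a second pass, reusing mappings returned in a previous results dict
in_dict = {
    "vocab": prior_results["vocab"],
    "ivocab": prior_results["ivocab"],
    "word2vec": prior_results["word2vec"],
}
results = analyze_clusters2(all_clusters, lookup_order, documentData, vector_type,
                            word2vec, word_vector_size, param, in_dict=in_dict)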
Code Example #21
File: test_normalize.py Project: Lab41/pythia
def test_xml_normalize():
    assert normalize.xml_normalize("my link is <http://link.com>. Enough.") == 'my link is . enough.'
Code Example #22
def analyze_clusters2(all_clusters,
                      lookup_order,
                      documentData,
                      vector_type,
                      word2vec,
                      word_vector_size,
                      param,
                      in_dict={}):
    #This is mostly cut and paste from data_gen but with some differences
    #TODO in the future fold this into data_gen more....but would need somewhat extensive changes there

    # Prepare to store results of feature assessments
    tasks = []

    #initialize vocab and ivocab to empty dictionaries
    if 'vocab' in in_dict:
        print("using a vocab")
        vocab = in_dict['vocab']
    else:
        vocab = {}
    if 'ivocab' in in_dict:
        ivocab = in_dict['ivocab']
    else:
        ivocab = {}
    if 'word2vec' in in_dict:
        built_word2vec = in_dict['word2vec']
    else:
        built_word2vec = word2vec.copy()

    inputs = []
    answers = []
    input_masks = []
    questions = []

    #Iterate through clusters found in JSON file, do feature assessments,
    #build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [
            x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]

        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus with new document vocabulary
        corpus = xml_normalize(first_doc)
        built_word2vec, vocab, ivocab = build_vocab(first_doc, built_word2vec,
                                                    vocab, ivocab,
                                                    word_vector_size)

        # Store a list of sentences in the cluster at each iteration
        sentences = []
        sentences += (data_gen.get_first_and_last_sentence(first_doc))
        task = {"C": "", "Q": "", "A": ""}
        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            #normalize and remove stop words from doc
            doc = xml_normalize(raw_doc)
            built_word2vec, vocab, ivocab = build_vocab(
                doc, built_word2vec, vocab, ivocab, word_vector_size)

            if documentData[index]["novelty"]:
                novelty = True
                answers.append(1)
            else:
                novelty = False
                answers.append(0)

            inp_vector = [
                process_word(word=w,
                             word2vec=built_word2vec,
                             vocab=vocab,
                             ivocab=ivocab,
                             to_return=vector_type,
                             silent=True) for w in corpus
            ]

            question_rep = [
                process_word(word=w,
                             word2vec=built_word2vec,
                             vocab=vocab,
                             ivocab=ivocab,
                             to_return=vector_type,
                             silent=True) for w in corpus
            ]

            # task["C"] += corpus
            # task["Q"] = doc
            # task["A"] = novelty
            # tasks.append(task.copy())
            # corpus+=doc
            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(question_rep).astype(floatX))
            input_masks.append(
                np.array([index for index, w in enumerate(doc)],
                         dtype=np.int32))
    print(len(inputs), len(questions), len(answers), len(input_masks))

    results = {}
    results['vocab'] = vocab
    results['ivocab'] = ivocab
    results['word2vec'] = built_word2vec
    results[param + '_inputs'] = inputs
    results[param + '_questions'] = questions
    results[param + '_answers'] = answers
    results[param + '_input_masks'] = input_masks

    return results
Code Example #23
File: data_gen.py Project: Lab41/pythia
def gen_observations(all_clusters, lookup_order, document_data, features, parameters, vocab, full_vocab, encoder_decoder, lda_model, tf_session, w2v_model, hdf5_path=None, dtype=np.float32):
    '''
    Generates observations for each cluster found in JSON file and calculates the specified features.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data(list): contains for each observation the features of the document vs corpus, which could include:
            tfidf sum, cosine similarity, bag of words vectors, skip thoughts, lda, w2v, or onehot cnn encoding
        labels(list): the labels for each document, where one is novel and zero is duplicate
    '''

    # Prepare to store results of feature assessments
    data = list()
    labels = list()
    # mem_net_features is used when the mem_net algorithm is run
    # It consists of inputs, labels (answers), input_masks and questions for each entry
    mem_net_features = {}
    inputs = []
    input_masks = []
    questions = []
    # Sentence punctuation delimiters
    punkt = ['.','?','!']

    corpus_unprocessed = list()
    # HDF5-related parameters
    hdf5_save_frequency=parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'
    # Truncate any existing files at save location, or return early if 
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        open(hdf5_path, 'w').close()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            labeled_observation = { 'novelty' : next_doc['novelty'],
                    'data' : copy.copy(observations) }
            corpus_unprocessed.append(labeled_observation)
    
    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to 
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        if resampling_parameters.get('over', False):
            desired_size = None
            resampling_parameters['replacement'] = True
        else:
            desired_size = -np.Inf
        if resampling_parameters.get('replacement', False):
            replacement = True
        else:
            replacement = False
        logger.debug("Replacement: {}, Desired size: {}".format(replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty", replacement, desired_size, random_state)  
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times 
    # across observations
    
    clusterids = []
    postids = []
    for case in corpus:
        
        # Create raw and normalized document arrays
        case_docs_raw = [ record['body_text'] for record in case['data'] ]
        case_docs_normalized = [ normalize.xml_normalize(body_text) for body_text in case_docs_raw ]
        case_docs_no_stop_words = [ normalize.normalize_and_remove_stop_words(body_text) for body_text in case_docs_raw ]
        #create ids for individual data points
        postid = [record['post_id'] for record in case['data'] ][-1]
        postids.append(postid)
        clusterid = [ record['cluster_id'] for record in case['data'] ][0]
        clusterids.append(clusterid)
        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]
        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized) 
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)
        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(doc_raw, bkgd_text_raw, bkgd_sentences_full, features['mem_net'], vocab, full_vocab, w2v_model, encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)

        else:

            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words, bkgd_text_no_stop_words,
                    bkgd_docs_no_stop_words, vocab, features['bow'], feature_vectors)
            if 'st' in features:
                sentences = []
                for doc in bkgd_docs_raw:
                    for item in get_first_and_last_sentence(doc):
                        sentences.append(item)
                feature_vectors = st(doc_raw, sentences, encoder_decoder, features['st'], feature_vectors)

            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words, bkgd_text_no_stop_words, vocab, lda_model, features['lda'], feature_vectors)

            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized, w2v_model, features['w2v'], feature_vectors)

            if 'cnn' in features:
                feature_vectors = run_cnn(normalize.xml_normalize(doc_raw), normalize.xml_normalize(bkgd_text_raw), tf_session)

            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw, full_vocab, features['wordonehot'], feature_vectors)

            # Save features and label
            feature_vectors = np.concatenate(feature_vectors, axis=0).astype(dtype)
            # Fail catastrophically on zero vector (not sure if we need this)
            #assert not (feature_vectors < 0.0001).all() 
            data.append(feature_vectors)
        if case["novelty"]:
            labels.append(1)
        else:
            labels.append(0)
        
        # save to HDF5 if desired
        if hdf5_path is not None and len(data) % hdf5_save_frequency == 0:
            with h5py.File(hdf5_path, 'a') as h5:
                data_np = np.array(data)
                labels_np = np.reshape(np.array(labels), (-1, 1))
                add_to_hdf5(h5, data_np, data_key)
                add_to_hdf5(h5, labels_np, labels_key, np.uint8)
                labels = list()
                data = list()
    # Save off any remainder
    if hdf5_path is not None and len(data) > 0:
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(data)
            labels_np = np.reshape(np.array(labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    mem_net_features['inputs'] = inputs
    mem_net_features['questions'] = questions
    mem_net_features['input_masks'] = input_masks
    mem_net_features['answers'] = labels
    
    ids = ["C" + str(clusterid) + "_P" + str(postid) for clusterid, postid in zip(clusterids,postids)]

   
    if 'mem_net' in features: 
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    else:
        return data, labels, ids