def wordonehot(doc, corpus, vocab, transformations, feature, min_length=None, max_length=None):
    # Normalize and tokenize the text before sending it into the one-hot encoder
    norm_doc = tokenize.word_punct_tokens(normalize.xml_normalize(doc))
    norm_corpus = tokenize.word_punct_tokens(normalize.xml_normalize(corpus))
    doc_onehot = run_onehot(norm_doc, vocab, min_length, max_length)
    corpus_onehot = run_onehot(norm_corpus, vocab, min_length, max_length)
    feature = gen_feature([doc_onehot, corpus_onehot], transformations, feature)
    return feature
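# Illustrative sketch (not from the original module) of the kind of matrix run_onehot is
# expected to hand back to wordonehot above, assuming `vocab` maps token -> column index.
# The helper name `toy_onehot` is hypothetical, and the exact orientation/padding used by
# run_onehot may differ; only numpy is required here.
import numpy as np

def toy_onehot(tokens, vocab):
    """Return a len(vocab) x len(tokens) one-hot matrix for in-vocabulary tokens."""
    onehot = np.zeros((len(vocab), len(tokens)), dtype=np.float32)
    for col, token in enumerate(tokens):
        if token in vocab:
            onehot[vocab[token], col] = 1.0
    return onehot

# toy_onehot(["the", "cat", "sat"], {"the": 0, "cat": 1, "sat": 2, "dog": 3})
# yields a 4 x 3 matrix with a single 1 per column.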
def analyze_clusters(all_clusters, lookup_order, documentData):
    tasks = []
    lil_spacy = " "

    # Iterate through clusters found in JSON file, do feature assessments,
    # build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:

        # Determine arrival order in this cluster
        sortedEntries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]
        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus with new document vocabulary
        corpus = lil_spacy.join(word_punct_tokens(xml_normalize(first_doc)))

        # #check to make sure there are at least two sentences - important when using the sentence mask
        # sentences = punkt_sentences(first_doc)
        # if len(sentences) == 1:
        #     break

        #corpus = normalize_and_remove_stop_words(first_doc)

        # # Store a list of sentences in the cluster at each iteration
        # sentences = []
        # sentences += (data_gen.get_first_and_last_sentence(first_doc))

        task = {"C": "", "Q": "", "A": ""}

        for index in sortedEntries[1:]:

            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            # Normalize and tokenize doc
            doc = lil_spacy.join(word_punct_tokens(xml_normalize(raw_doc)))
            #doc = normalize_and_remove_stop_words(raw_doc)

            # #check to make sure there are at least two sentences - important when using the sentence mask
            # sentences = punkt_sentences(raw_doc)
            # if len(sentences) == 1:
            #     break

            if documentData[index]["novelty"]:
                novelty = True
            else:
                novelty = False

            task["C"] += corpus
            task["Q"] = doc
            task["A"] = novelty
            tasks.append(task.copy())

            corpus += doc

    return tasks
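# Simplified sketch (hypothetical data, not from the original corpus) of the task structure
# analyze_clusters builds: "C" is the rolling corpus seen so far, "Q" is the newly arrived
# document, and "A" is its novelty label. Unlike the original, this sketch builds a fresh
# dict per step rather than reusing and accumulating a single task dict.
def toy_rolling_tasks(ordered_docs, novelty_labels):
    tasks, corpus = [], ordered_docs[0]
    for doc, novelty in zip(ordered_docs[1:], novelty_labels[1:]):
        tasks.append({"C": corpus, "Q": doc, "A": novelty})
        corpus += doc
    return tasks

# toy_rolling_tasks(["a b. ", "c d. ", "a b. "], [False, True, False])
# -> [{"C": "a b. ", "Q": "c d. ", "A": True},
#     {"C": "a b. c d. ", "Q": "a b. ", "A": False}]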
def build_w2v(trainingdata, min_count=5, window=5, size=100, workers=3, pretrained=False, **kwargs):
    '''
    Fits a Word2Vec topic model based on the training corpus sentences.

    Args:
        trainingdata (list): A list containing the training corpus as parsed JSON text
        min_count (int): ignore all words with total frequency lower than this number
        window (int): maximum distance between the current and predicted word within a sentence
        size (int): dimensionality of the feature vectors
        workers (int): use this many worker threads to train the model (faster training with multicore machines)
        pretrained (bool): load Google's pretrained GoogleNews-vectors-negative300 model instead of training on the corpus

    Returns:
        Word2Vec: A pretrained Word2Vec model from Google or a Word2Vec model fit to the training data sentences
    '''

    # Suppress gensim's INFO messages
    logging.getLogger("gensim").setLevel(logging.WARNING)

    # Use Google's pretrained Word2Vec model
    if pretrained:
        # Look at environment variable 'PYTHIA_MODELS_PATH' for user-defined model location
        # If environment variable is not defined, use current working directory
        if os.environ.get('PYTHIA_MODELS_PATH') is not None:
            path_to_models = os.environ.get('PYTHIA_MODELS_PATH')
        else:
            path_to_models = os.path.join(os.getcwd(), 'models')

        # Make the directory for the models unless it already exists
        try:
            os.makedirs(path_to_models)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise

        # Look for Google's trained Word2Vec model as a binary or zipped file; report an error and exit if not found
        if os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin")):
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(
                os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        elif os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz")):
            with gzip.open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz"), 'rb') as f_in:
                with open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(
                os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        else:
            print("""Error: Google's pretrained Word2Vec model GoogleNews-vectors-negative300.bin was not found in %s
Set 'pretrained=False' or download/unzip GoogleNews-vectors-negative300.bin.gz
from https://code.google.com/archive/p/word2vec/ into %s""" % (path_to_models, path_to_models), file=sys.stderr)
            sys.exit(1)

    # Train a Word2Vec model with the corpus
    else:
        sentencearray = []
        for entry in trainingdata:
            sentences = tokenize.punkt_sentences(xml_normalize(entry['body_text']))
            for sentence in sentences:
                words = tokenize.word_punct_tokens(sentence)
                sentencearray.append(words)

        w2v_model = gensim.models.Word2Vec(sentencearray, min_count=min_count, window=window, size=size, workers=workers)

    return w2v_model
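# Quick training sketch on toy sentences (not the project corpus). It assumes the same
# pre-4.0 gensim API used above: the `size=` keyword and direct `model[word]` lookups;
# in gensim >= 4.0 the keyword is `vector_size` and lookups go through `model.wv`.
import gensim

toy_sentences = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "sat", "on", "the", "log"],
]
toy_model = gensim.models.Word2Vec(toy_sentences, min_count=1, window=2, size=10, workers=1)
# toy_model["cat"] returns a 10-dimensional vector in pre-4.0 gensim.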
def run_w2v_matrix(w2v_model, doc, w2v_params, mask_mode):

    # Determine whether only the first and last sentences or all sentences will be used
    if w2v_params.get('mem_w2v_mode', False):
        w2v_mode = w2v_params['mem_w2v_mode']
    else:
        w2v_mode = 'all'

    if w2v_mode == 'all':
        sentences = tokenize.punkt_sentences(doc)
    else:
        sentences = get_first_and_last_sentence(doc)

    normalizedsentences = []
    sentence_mask = []

    for sentence in sentences:
        words = tokenize.word_punct_tokens(sentence)
        if len(sentence_mask) > 0:
            prev_mask = sentence_mask[-1]
        else:
            prev_mask = -1
        # Each mask entry is the index of the last word vector in that sentence
        sentence_mask.append(prev_mask + len(words))
        normalizedsentences.append(words)

    wordvectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:
        for word in phrase:
            wordvector = None
            try:
                wordvector_ = w2v_model[word]
                wordvector = [float(w) for w in wordvector_]
            except KeyError:
                # Out-of-vocabulary words get a random seeded vector
                wordvector = w2v_model.seeded_vector(np.random.rand())
            if wordvector is not None:
                wordvectorarray.append(wordvector)

    if mask_mode == 'sentence':
        mask = sentence_mask
    else:
        mask = np.array([index for index, w in enumerate(wordvectorarray)], dtype=np.int32)

    # Sanity check: the last mask entry must point at the final word vector
    if len(wordvectorarray) - 1 != mask[-1]:
        print(mask)
        print(np.array(wordvectorarray).shape)
        raise ValueError("Word-vector count does not match the final mask index")

    return np.vstack(wordvectorarray), mask
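# Worked example (toy token counts) of the cumulative sentence mask built above:
# each entry is the index of the last word vector belonging to that sentence.
# The helper name `toy_sentence_mask` is hypothetical.
def toy_sentence_mask(tokenized_sentences):
    mask, prev = [], -1
    for words in tokenized_sentences:
        prev += len(words)
        mask.append(prev)
    return mask

# toy_sentence_mask([["a", "b", "c"], ["d", "e"]]) -> [2, 4]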
def run_w2v(w2v_model, doc, w2v):
    '''
    Calculates Word2Vec vectors for a document using the first and last sentences of the document

    Args:
        w2v_model (gensim.Word2Vec): Trained Word2Vec model
        doc (str): the text of the document
        w2v (dict): Dictionary of Word2Vec parameters as set in master_pipeline. The dictionary will include
            keys for the model building parameters min_count, window, size, workers and pretrained.
            The dict may also have optional boolean keys for the feature operations append, difference,
            product and cos.

    Returns:
        documentvector (list): List of Word2Vec vectors averaged across words and concatenated across sentences
    '''

    # Get first and last sentences of document, break down sentences into words and remove stop words
    sentences = get_first_and_last_sentence(doc)
    normalizedsentences = []

    for sentence in sentences:
        words = normalize.remove_stop_words(tokenize.word_punct_tokens(sentence))
        normalizedsentences.append(words)

    wordvectorarray = []
    sentencevectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:
        for word in phrase:
            try:
                wordvector = w2v_model[word]
            except KeyError:
                continue
            wordvectorarray.append(wordvector)

        # Only calculate mean and append to sentence vector array if one or more word vectors were found
        if len(wordvectorarray) > 0:
            sentencevectorarray.append(np.mean(wordvectorarray, axis=0))

    # Only concatenate if both sentences were added to sentence vector array, otherwise append array of zeroes
    if len(sentencevectorarray) == 2:
        documentvector = np.concatenate(sentencevectorarray)
    elif len(sentencevectorarray) == 1:
        documentvector = np.concatenate((sentencevectorarray[0], np.zeros(w2v['size'])))
    else:
        documentvector = np.zeros(w2v['size'] * 2)

    return documentvector
def run_w2v(w2v_model, doc, w2v):
    '''
    Calculates Word2Vec vectors for a document using the first and last sentences of the document

    Args:
        w2v_model (gensim.Word2Vec): Trained Word2Vec model
        doc (str): the text of the document
        w2v (dict): Dictionary of Word2Vec parameters as set in master_pipeline. The dictionary will include
            keys for the model building parameters min_count, window, size, workers and pretrained.
            The dict may also have optional boolean keys for the feature operations append, difference,
            product and cos.

    Returns:
        documentvector (list): List of Word2Vec vectors averaged across sentences
    '''

    # Get first and last sentences of document, break down sentences into words and remove stop words
    sentences = get_first_and_last_sentence(doc)
    normalizedsentences = []

    for sentence in sentences:
        words = normalize.remove_stop_words(tokenize.word_punct_tokens(sentence))
        normalizedsentences.append(words)

    wordvectorarray = []
    sentencevectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:
        for word in phrase:
            wordvector = None
            try:
                wordvector = w2v_model[word]
            except KeyError:
                # Skip words that are not in the Word2Vec vocabulary
                pass
            if wordvector is not None:
                wordvectorarray.append(wordvector)

        # Only calculate mean and append to sentence vector array if one or more word vectors were found
        if len(wordvectorarray) > 0:
            sentencevectorarray.append(np.mean(wordvectorarray, axis=0))

    # Only calculate mean if one or more sentences were added to sentence vector array, otherwise return array of zeroes
    if len(sentencevectorarray) > 0:
        documentvector = np.mean(sentencevectorarray, axis=0)
    else:
        documentvector = np.zeros(w2v['size'])

    return documentvector
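# Self-contained numpy sketch of the pooling idea behind run_w2v: word vectors are averaged
# within a sentence, sentence vectors are then averaged into a document vector, with a
# zero-vector fallback when nothing was found. Toy vectors of size 3; the helper name
# `toy_document_vector` is hypothetical.
import numpy as np

def toy_document_vector(sentence_word_vectors, size=3):
    sentence_means = [np.mean(words, axis=0) for words in sentence_word_vectors if len(words) > 0]
    return np.mean(sentence_means, axis=0) if sentence_means else np.zeros(size)

# toy_document_vector([[np.ones(3), 3 * np.ones(3)], [np.zeros(3)]]) -> array([1., 1., 1.])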
def tfidf_sum(doc, corpus_array, vocab):
    '''
    Calculates L1 normalized TFIDF summation as Novelty Score for new document against corpus.

    Credit to http://cgi.di.uoa.gr/~antoulas/pubs/ntoulas-novelty-wise.pdf

    Args:
        doc (str): the text (normalized and without stop words) of the new document
        corpus_array (list): the text (normalized and without stop words) of each document in the
            cluster's corpus, with the new document expected as the last entry
        vocab: the vocabulary passed to the TfidfVectorizer (mapping or iterable of terms)

    Returns:
        float: the normalized TFIDF summation
    '''
    doc_array = tokenize.word_punct_tokens(doc)
    doc_length = len(doc_array)

    vectorizer = TfidfVectorizer(norm=None, vocabulary=vocab)
    tfidf = vectorizer.fit_transform(corpus_array)
    vector_values = tfidf.toarray()
    tfidf_score = np.sum(vector_values[-1]) / doc_length

    return tfidf_score
def tfidf_sum(doc, corpus_array, vocab, feature):
    '''
    Calculates L1 normalized TFIDF summation as Novelty Score for new document against corpus.

    Credit to http://cgi.di.uoa.gr/~antoulas/pubs/ntoulas-novelty-wise.pdf

    Args:
        doc (str): the text (normalized and without stop words) of the new document
        corpus_array (list): the text (normalized and without stop words) of each document in the
            cluster's corpus, with the new document expected as the last entry
        vocab: the vocabulary passed to the TfidfVectorizer (mapping or iterable of terms)
        feature (list): the running list of feature arrays to append the score to

    Returns:
        list: the feature list with the normalized TFIDF summation appended
    '''
    doc_array = tokenize.word_punct_tokens(doc)
    doc_length = len(doc_array)

    vectorizer = TfidfVectorizer(norm=None, vocabulary=vocab)
    tfidf = vectorizer.fit_transform(corpus_array)
    vector_values = tfidf.toarray()
    tfidf_score = np.sum(vector_values[-1]) / doc_length

    feature.append(np.array([tfidf_score]))

    return feature
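# Self-contained sketch of the L1-normalized TF-IDF summation score computed above, using a
# toy corpus (the last entry plays the role of the incoming document, mirroring
# vector_values[-1]). Only scikit-learn and numpy are required.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

toy_corpus = ["the cat sat", "the dog sat", "a novel quantum topic"]
toy_tfidf = TfidfVectorizer(norm=None).fit_transform(toy_corpus).toarray()
toy_doc_length = len(toy_corpus[-1].split())
toy_score = np.sum(toy_tfidf[-1]) / toy_doc_length
# Documents full of corpus-rare terms score higher than near-duplicates of earlier documents.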
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full, mem_net_params, vocab, full_vocab, w2v_model, encoder_decoder):
    '''
    Generates observations to be fed into the mem_net code

    Args:
        raw_doc (str): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the full vocabulary of the data set (including punctuation), used for one-hot encoding
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where the end of each input is;
            this can be per word or at the end of each sentence
    '''

    # Use the specified mask mode where available
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
    else:
        mask_mode = 'sentence'

    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
    else:
        embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure that the document and corpus are long enough and if not make them be long enough
        if len(sentences_full) == 1:
            #print("short corpus")
            sentences_full.extend(sentences_full)
        if len(doc_sentences) == 1:
            #print("short doc")
            doc_sentences.extend(doc_sentences)

        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors

    elif embed_mode == 'onehot':
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab = full_vocab

        # Preprocess and tokenize background documents
        corpus_tokens = tokenize.word_punct_tokens(normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)

        # Get sentence mask indices
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys()  # ensure that you are using a vocabulary w/ punctuation
        sentence_mask = get_mask(corpus_indices, onehot_vocab, max_length=max_length)

        # One-hot encode documents w/ masks, and query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded, onehot_vocab, min_length, max_length, already_encoded=True)

        # Tokenize and one-hot encode query document
        doc_vectors = run_onehot(tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)),
                                 onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode == 'sentence':
            doc_masks = sentence_mask
        else:
            doc_masks = [index for index, w in enumerate(doc_input)]

    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus, mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params, mask_mode)

        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    return doc_input, doc_questions, doc_masks
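# Illustrative parameter dicts (keys taken from the branches above) showing how
# gen_mem_net_observations selects its embedding and mask behaviour; absent keys fall back
# to 'word2vec' embeddings and 'sentence' masks. The specific values are hypothetical.
example_onehot_params = {"embed_mode": "onehot", "mask_mode": "word", "onehot_max_len": 5000}
example_default_params = {}  # -> embed_mode 'word2vec', mask_mode 'sentence'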
def run_w2v_elemwise(w2v_model, doc, w2v, operation):
    '''
    Calculates Word2Vec vectors for a document using the first and last sentences of the document.
    Examines vector elements and retains maximum, minimum or absolute value for each vector element.

    Args:
        w2v_model (gensim.Word2Vec): Trained Word2Vec model
        doc (str): the text of the document
        w2v (dict): Dictionary of Word2Vec parameters as set in master_pipeline. The dictionary will include
            keys for the model building parameters min_count, window, size, workers and pretrained.
            The dict may also have optional boolean keys for the feature operations append, difference,
            product and cos.
        operation (str): element wise operation of max, min or abs

    Returns:
        documentvector (list): Word2Vec vectors with min/max/abs element values for a sentence,
            which are then concatenated across sentences
    '''

    # Get first and last sentences of document, break down sentences into words and remove stop words
    sentences = get_first_and_last_sentence(doc)
    normalizedsentences = []

    for sentence in sentences:
        words = normalize.remove_stop_words(tokenize.word_punct_tokens(sentence))
        normalizedsentences.append(words)

    sentencevectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:

        # Set up comparison vector based on requested operation
        if operation == 'max':
            vectorlist = np.full(w2v['size'], -np.inf)
        elif operation == 'min':
            vectorlist = np.full(w2v['size'], np.inf)
        elif operation == 'abs':
            vectorlist = np.zeros(w2v['size'])

        # Determine word vector and evaluate elements against comparison vector
        for word in phrase:
            try:
                wordvector = w2v_model[word]
            except KeyError:
                continue
            if operation == 'max':
                vectorlist = np.where(wordvector > vectorlist, wordvector, vectorlist)
            elif operation == 'min':
                vectorlist = np.where(wordvector < vectorlist, wordvector, vectorlist)
            elif operation == 'abs':
                vectorlist = np.where(abs(wordvector) > vectorlist, abs(wordvector), vectorlist)

        # Remove any infinity values from special cases (ex: 1 word sentence and word not in word2vec model)
        vectorlist = np.where(np.isinf(vectorlist), 0, vectorlist)

        sentencevectorarray.append(vectorlist)

    # Only concatenate if both sentences were added to sentence vector array, otherwise append array of zeroes
    if len(sentencevectorarray) == 2:
        documentvector = np.concatenate(sentencevectorarray)
    elif len(sentencevectorarray) == 1:
        documentvector = np.concatenate((sentencevectorarray[0], np.zeros(w2v['size'])))
    else:
        documentvector = np.zeros(w2v['size'] * 2)

    return documentvector
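# Tiny numpy illustration of the element-wise retention used above: np.where keeps, per
# dimension, whichever value wins the comparison (here a running maximum). Toy vectors only.
import numpy as np

running_max = np.full(3, -np.inf)
for vec in (np.array([0.1, -0.5, 2.0]), np.array([1.0, -2.0, 0.3])):
    running_max = np.where(vec > running_max, vec, running_max)
# running_max -> array([ 1. , -0.5,  2. ])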
def test_word_punct():
    """Test regex-based word and punctuation tokenization"""
    assert tokenize.word_punct_tokens("Who are you??? Stop, now!") == \
        ["Who", "are", "you", "???", "Stop", ",", "now", "!"]