Example #1
def wordonehot(doc, corpus, vocab, transformations, feature, min_length=None, max_length=None):
    # Normalize and tokenize the text before sending it into the one-hot encoder
    norm_doc = tokenize.word_punct_tokens(normalize.xml_normalize(doc))
    norm_corpus = tokenize.word_punct_tokens(normalize.xml_normalize(corpus))
    doc_onehot = run_onehot(norm_doc, vocab, min_length, max_length)
    corpus_onehot = run_onehot(norm_corpus, vocab, min_length, max_length)
    feature = gen_feature([doc_onehot, corpus_onehot], transformations, feature)
    return feature
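A minimal usage sketch, assuming the project's tokenize, normalize, run_onehot and gen_feature helpers are importable alongside wordonehot; the vocabulary is a hypothetical token-to-index mapping, and the transformations argument is simply passed through to gen_feature (its exact format is defined there and not shown here).

# Hypothetical inputs: vocab maps tokens to indices; transformations is
# forwarded untouched to gen_feature.
doc = "a new report about solar power"
corpus = "earlier reports about wind and solar energy"
vocab = {"solar": 0, "power": 1, "wind": 2, "energy": 3, "report": 4}
feature = wordonehot(doc, corpus, vocab, transformations=[], feature=[], max_length=50)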
Example #2
def analyze_clusters(all_clusters, lookup_order, documentData):
    tasks = []
    lil_spacy = " "
    #Iterate through clusters found in JSON file, do feature assessments,
    #build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [
            x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]

        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus with new document vocabulary
        corpus = lil_spacy.join(word_punct_tokens(xml_normalize(first_doc)))

        # #check to make sure there are at least two sentences - important when using the sentence mask
        # sentences = punkt_sentences(first_doc)
        # if len(sentences) ==1:
        #     break

        #corpus = normalize_and_remove_stop_words(first_doc)

        # # Store a list of sentences in the cluster at each iteration
        # sentences = []
        # sentences += (data_gen.get_first_and_last_sentence(first_doc))
        task = {"C": "", "Q": "", "A": ""}
        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            #normalize and remove stop words from doc
            doc = lil_spacy.join(word_punct_tokens(xml_normalize(raw_doc)))
            #doc = normalize_and_remove_stop_words(raw_doc)

            # #check to make sure there are at least two sentences - important when using the sentence mask
            # sentences = punkt_sentences(raw_doc)
            # if len(sentences) ==1:
            #     break

            if documentData[index]["novelty"]:
                novelty = True
            else:
                novelty = False

            task["C"] += corpus
            task["Q"] = doc
            task["A"] = novelty
            tasks.append(task.copy())
            corpus += doc

    return tasks
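A hedged sketch of the inputs this function appears to expect, using hypothetical data: documentData is indexed by position, lookup_order maps each cluster id to (arrival_order, document_index) pairs, and each document carries a boolean novelty flag.

# Hypothetical inputs for a single two-document cluster.
documentData = [
    {"body_text": "First article about the storm.", "novelty": True},
    {"body_text": "Follow-up article repeating the storm details.", "novelty": False},
]
lookup_order = {"cluster_1": [(0, 0), (1, 1)]}
all_clusters = ["cluster_1"]

tasks = analyze_clusters(all_clusters, lookup_order, documentData)
# Each task dict holds the rolling corpus "C", the query document "Q",
# and the novelty label "A".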
Example #3
def analyze_clusters(all_clusters, lookup_order, documentData):
    tasks = []
    lil_spacy = " "
    #Iterate through clusters found in JSON file, do feature assessments,
    #build a rolling corpus from ordered documents for each cluster
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sortedEntries = [x[1] for x in sorted(lookup_order[cluster], key=lambda x: x[0])]

        first_doc = documentData[sortedEntries[0]]["body_text"]

        # Set corpus to first doc in this cluster and prepare to update corpus with new document vocabulary
        corpus = lil_spacy.join(word_punct_tokens(xml_normalize(first_doc)))

        # #check to make sure there are at least two sentences - important when using the sentence mask
        # sentences = punkt_sentences(first_doc)
        # if len(sentences) ==1:
        #     break

        #corpus = normalize_and_remove_stop_words(first_doc)

        # # Store a list of sentences in the cluster at each iteration
        # sentences = []
        # sentences += (data_gen.get_first_and_last_sentence(first_doc))
        task = {"C": "","Q": "", "A": ""}
        for index in sortedEntries[1:]:
            # Find next document in order
            raw_doc = documentData[index]["body_text"]

            #normalize and remove stop words from doc
            doc = lil_spacy.join(word_punct_tokens(xml_normalize(raw_doc)))
            #doc = normalize_and_remove_stop_words(raw_doc)

            # #check to make sure there are at least two sentences - important when using the sentence mask
            # sentences = punkt_sentences(raw_doc)
            # if len(sentences) ==1:
            #     break

            if documentData[index]["novelty"]:
                novelty = True
            else:
                novelty = False

            task["C"] += corpus
            task["Q"] = doc
            task["A"] = novelty
            tasks.append(task.copy())
            corpus += doc

    return tasks
Example #4
def build_w2v(trainingdata, min_count=5, window=5, size=100, workers=3, pretrained=False, **kwargs):
    '''
    Fits a Word2Vec topic model based on the training corpus sentences.

    Args:
        trainingdata (list): A list containing the training corpus as parsed JSON text
        min_count (int): ignore all words with total frequency lower than this number
        window (int): maximum distance between the current and predicted word within a sentence
        size (int): dimensionality of the feature vectors
        workers (int): use this many worker threads to train the model (faster training with multicore machines)
        pretrained (bool): if True, load Google's pretrained GoogleNews-vectors-negative300 model instead of training a new one

    Returns:
        Word2Vec: A pretrained Word2Vec model from Google or a Word2Vec model fit to the training data sentences
    '''

    # Suppress gensim's INFO messages
    logging.getLogger("gensim").setLevel(logging.WARNING)

    # Use Google's pretrained Word2Vec model
    if pretrained:
        # Look at environment variable 'PYTHIA_MODELS_PATH' for user-defined model location
        # If environment variable is not defined, use current working directory
        if os.environ.get('PYTHIA_MODELS_PATH') is not None:
            path_to_models = os.environ.get('PYTHIA_MODELS_PATH')
        else:
            path_to_models = os.path.join(os.getcwd(), 'models')
        # Make the directory for the models unless it already exists
        try:
            os.makedirs(path_to_models)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
        # Look for Google's trained Word2Vec model as a binary or zipped file; Return error and quit if not found
        if os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin")):
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        elif os.path.isfile(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz")):
            with gzip.open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin.gz"), 'rb') as f_in:
                with open(os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            w2v_model = gensim.models.Word2Vec.load_word2vec_format(
                os.path.join(path_to_models, "GoogleNews-vectors-negative300.bin"), binary=True)
        else:
            print("""Error: Google's pretrained Word2Vec model GoogleNews-vectors-negative300.bin was not found in %s
Set 'pretrained=False' or download/unzip GoogleNews-vectors-negative300.bin.gz
from https://code.google.com/archive/p/word2vec/ into %s""" % (path_to_models,path_to_models), file=sys.stderr)
            quit()

    # Train a Word2Vec model with the corpus
    else:
        sentencearray = []
        for entry in trainingdata:
            sentences = tokenize.punkt_sentences(xml_normalize(entry['body_text']))
            for sentence in sentences:
                words = tokenize.word_punct_tokens(sentence)
                sentencearray.append(words)

        w2v_model = gensim.models.Word2Vec(sentencearray, min_count=min_count, window=window, size=size, workers=workers)

    return w2v_model
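A hedged usage sketch, assuming training data in the JSON-derived form the function reads (a list of dicts with a 'body_text' field) and the pre-1.0 gensim API used above.

# Hypothetical training data; each entry only needs a 'body_text' key.
trainingdata = [
    {"body_text": "The quick brown fox jumps over the lazy dog."},
    {"body_text": "A slow red fox naps beside the lazy dog."},
]
w2v_model = build_w2v(trainingdata, min_count=1, window=3, size=50, workers=1)
# With pretrained=True the function instead loads
# GoogleNews-vectors-negative300.bin from $PYTHIA_MODELS_PATH (or ./models).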
Example #5
def run_w2v_matrix(w2v_model, doc, w2v_params, mask_mode):

    #determine if the first and last sentences will be taken or all sentences
    if w2v_params.get('mem_w2v_mode', False):
        w2v_mode = w2v_params['mem_w2v_mode']
    else:
        w2v_mode = 'all'

    if w2v_mode == 'all':
        sentences = tokenize.punkt_sentences(doc)
    else:
        sentences = get_first_and_last_sentence(doc)

    normalizedsentences = []

    sentence_mask = []
    for sentence in sentences:
        words = tokenize.word_punct_tokens(sentence)
        if len(sentence_mask) > 0:
            prev_mask = sentence_mask[-1]
        else:
            prev_mask = -1
        sentence_mask.append(prev_mask + len(words))
        normalizedsentences.append(words)

    wordvectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:
        for word in phrase:
            try:
                wordvector_ = w2v_model[word]
                wordvector = [float(w) for w in wordvector_]
            except KeyError:
                # Fall back to a random seeded vector for out-of-vocabulary words
                wordvector = w2v_model.seeded_vector(np.random.rand())
            wordvectorarray.append(wordvector)

    if mask_mode == 'sentence':
        mask = sentence_mask
    else:
        mask = np.array([index for index, w in enumerate(wordvectorarray)],
                        dtype=np.int32)

    if len(wordvectorarray) - 1 != mask[-1]:
        print(mask)
        print(np.array(wordvectorarray).shape)
        raise ValueError("Final mask index does not match the number of word vectors")

    return np.vstack(wordvectorarray), mask
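A hedged usage sketch, assuming w2v_model is the model returned by build_w2v above; mem_w2v_mode chooses between all sentences and just the first and last, and mask_mode selects sentence-level or word-level mask indices.

# Hypothetical call using the keys this function reads.
w2v_params = {"mem_w2v_mode": "all"}
vectors, mask = run_w2v_matrix(w2v_model, "First sentence. Second sentence.",
                               w2v_params, mask_mode="sentence")
# vectors is a (num_words, vector_size) array; with mask_mode='sentence' the
# mask holds the index of the last word of each sentence.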
Example #6
def run_w2v_matrix(w2v_model, doc, w2v_params, mask_mode):

    #determine if the first and last sentences will be taken or all sentences
    if w2v_params.get('mem_w2v_mode', False):
        w2v_mode = w2v_params['mem_w2v_mode']
    else:
        w2v_mode = 'all'

    if w2v_mode == 'all':
        sentences = tokenize.punkt_sentences(doc)
    else:
        sentences = get_first_and_last_sentence(doc)

    normalizedsentences = []

    sentence_mask = []
    for sentence in sentences:
        words = tokenize.word_punct_tokens(sentence)
        if len(sentence_mask) > 0:
            prev_mask = sentence_mask[-1]
        else:
            prev_mask = -1
        sentence_mask.append(prev_mask + len(words))
        normalizedsentences.append(words)

    wordvectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:
        for word in phrase:
            try:
                wordvector_ = w2v_model[word]
                wordvector = [float(w) for w in wordvector_]
            except KeyError:
                # Fall back to a random seeded vector for out-of-vocabulary words
                wordvector = w2v_model.seeded_vector(np.random.rand())
            wordvectorarray.append(wordvector)

    if mask_mode == 'sentence':
        mask = sentence_mask
    else:
        mask = np.array([index for index, w in enumerate(wordvectorarray)], dtype=np.int32)

    if len(wordvectorarray) - 1 != mask[-1]:
        print(mask)
        print(np.array(wordvectorarray).shape)
        raise ValueError("Final mask index does not match the number of word vectors")

    return np.vstack(wordvectorarray), mask
Example #7
def run_w2v(w2v_model, doc, w2v):
    '''
      Calculates Word2Vec vectors for a document using the first and last sentences of the document

      Args:
          w2v_model (gensim.Word2Vec): Trained Word2Vec model
          doc (str): the text of the document
          w2v (dict): Dictionary of Word2Vec parameters as set in master_pipeline. The dictionary
           will include keys for the model building parameters min_count, window, size, workers and pretrained.
           The dict may also have optional boolean keys for the feature operations append, difference, product and cos.

      Returns:
          documentvector (list): List of Word2Vec vectors averaged across words and concatenated across sentences
      '''

    # Get first and last sentences of document, break down sentences into words and remove stop words
    sentences = get_first_and_last_sentence(doc)
    normalizedsentences = []

    for sentence in sentences:
        words = normalize.remove_stop_words(tokenize.word_punct_tokens(sentence))
        normalizedsentences.append(words)

    wordvectorarray = []
    sentencevectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:
        for word in phrase:
            try:
                wordvector = w2v_model[word]
            except KeyError:
                continue
            wordvectorarray.append(wordvector)

        # Only calculate mean and append to sentence vector array if one or more word vectors were found
        if len(wordvectorarray) > 0:
            sentencevectorarray.append(np.mean(wordvectorarray, axis=0))

    # Only concatenate if both sentences were added to sentence vector array, otherwise append array of zeroes
    if len(sentencevectorarray) == 2:
        documentvector = np.concatenate(sentencevectorarray)
    elif len(sentencevectorarray) == 1:
        documentvector = np.concatenate((sentencevectorarray[0], np.zeros(w2v['size'])))
    else:
        documentvector = np.zeros(w2v['size']*2)
    return documentvector
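A hedged usage sketch, assuming w2v_model was trained with size=50 (for example by the build_w2v sketch above); the w2v dict only needs the 'size' key for this function.

# Hypothetical call; the result concatenates the first- and last-sentence means.
w2v = {"size": 50}
docvec = run_w2v(w2v_model, "Opening sentence about storms. Closing sentence about damage.", w2v)
# docvec always has length 2 * w2v['size']; zero padding fills in when fewer
# than two sentence vectors could be built.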
Example #8
def run_w2v(w2v_model, doc, w2v):
    '''
      Calculates Word2Vec vectors for a document using the first and last sentences of the document

      Args:
          w2v_model (gensim.Word2Vec): Trained Word2Vec model
          doc (str): the text of the document
          w2v (dict): Dictionary of Word2Vec parameters as set in master_pipeline. The dictionary
           will include keys for the model building parameters min_count, window, size, workers and pretrained.
           The dict may also have optional boolean keys for the feature operations append, difference, product and cos.

      Returns:
          documentvector (list): List of Word2Vec vectors averaged across sentences
      '''

    # Get first and last sentences of document, break down sentences into words and remove stop words
    sentences = get_first_and_last_sentence(doc)
    normalizedsentences = []

    for sentence in sentences:
        words = normalize.remove_stop_words(tokenize.word_punct_tokens(sentence))
        normalizedsentences.append(words)

    wordvectorarray = []
    sentencevectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:
        for word in phrase:
            try:
                wordvector = w2v_model[word]
            except KeyError:
                continue
            wordvectorarray.append(wordvector)

        # Only calculate mean and append to sentence vector array if one or more word vectors were found
        if len(wordvectorarray) > 0:
            sentencevectorarray.append(np.mean(wordvectorarray, axis=0))

    # Only calculate mean if one or more sentences were added to sentence vector array, otherwise return array of zeroes
    if len(sentencevectorarray) > 0:
        documentvector = np.mean(sentencevectorarray, axis=0)
    else:
        documentvector = np.zeros(w2v['size'])
    return documentvector
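This variant averages the sentence vectors instead of concatenating them, so under the same assumptions as the previous sketch (a model trained with size=50) the result has length w2v['size'] rather than twice that.

w2v = {"size": 50}
docvec = run_w2v(w2v_model, "Opening sentence about storms. Closing sentence about damage.", w2v)
# docvec has length w2v['size']; it is all zeros when no word vectors were found.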
Example #9
def tfidf_sum(doc, corpus_array, vocab):
    '''
    Calculates L1 normalized TFIDF summation as Novelty Score for new document against corpus.
    
    Credit to http://cgi.di.uoa.gr/~antoulas/pubs/ntoulas-novelty-wise.pdf
    
    Args:
        doc (str): the text (normalized and without stop words) of the document
        corpus_array (list): the normalized text of each document in the cluster's corpus, with the current document last
        vocab (dict): the vocabulary used to build the TF-IDF vectors
    
    Returns:
        float: the normalized TFIDF summation
    '''
    doc_array = tokenize.word_punct_tokens(doc)
    doc_length = len(doc_array)
    vectorizer = TfidfVectorizer(norm=None, vocabulary=vocab)
    tfidf = vectorizer.fit_transform(corpus_array)
    vector_values = tfidf.toarray()
    tfidf_score = np.sum(vector_values[-1])/doc_length
    return tfidf_score
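A hedged usage sketch with hypothetical data: corpus_array is a list of normalized document strings for the cluster with the current document last (the code reads vector_values[-1] as the current document's row), and vocab fixes the TfidfVectorizer vocabulary.

# Hypothetical normalized documents; the current document is the last entry.
corpus_array = ["wind energy growth", "solar panel cost", "solar panel cost decline"]
vocab = {"wind": 0, "energy": 1, "growth": 2, "solar": 3, "panel": 4, "cost": 5, "decline": 6}
score = tfidf_sum(corpus_array[-1], corpus_array, vocab)
# score is the summed TF-IDF weight of the new document, divided by its token count.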
Example #10
def tfidf_sum(doc, corpus_array, vocab, feature):
    '''
    Calculates L1 normalized TFIDF summation as Novelty Score for new document against corpus.

    Credit to http://cgi.di.uoa.gr/~antoulas/pubs/ntoulas-novelty-wise.pdf

    Args:
        doc (str): the text (normalized and without stop words) of the document
        corpus_array (list): the normalized text of each document in the cluster's corpus, with the current document last
        vocab (dict): the vocabulary used to build the TF-IDF vectors
        feature (list): the running list of feature arrays for this document

    Returns:
        list: the feature list with the normalized TFIDF summation appended
    '''
    doc_array = tokenize.word_punct_tokens(doc)
    doc_length = len(doc_array)
    vectorizer = TfidfVectorizer(norm=None, vocabulary=vocab)
    tfidf = vectorizer.fit_transform(corpus_array)
    vector_values = tfidf.toarray()
    tfidf_score = np.sum(vector_values[-1])/doc_length
    feature.append(np.array([tfidf_score]))
    return feature
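This variant takes the running feature list and appends the score as a one-element array instead of returning the raw float; reusing the hypothetical corpus_array and vocab from the sketch after the previous example:

feature = tfidf_sum(corpus_array[-1], corpus_array, vocab, feature=[])
# feature now ends with np.array([tfidf_score]) for this document.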
Example #11
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full,
                             mem_net_params, vocab, full_vocab, w2v_model,
                             encoder_decoder):
    '''
    Generates observations to be fed into the mem_net code

    Args:
        raw_doc (string): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the full vocabulary of the data set, including punctuation, used for one-hot encoding
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where the end of each input is
            this can be per word for the end of a sentence
     '''

    # Use the specified mask mode where available
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
    else:
        mask_mode = 'sentence'

    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
    else:
        embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure that the document and corpus are long enough and if not make them be long enough
        if len(sentences_full) == 1:
            #print("short corpus")
            sentences_full.extend(sentences_full)
        if len(doc_sentences) == 1:
            #print("short doc")
            doc_sentences.extend(doc_sentences)
        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors

    elif embed_mode == 'onehot':
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab = full_vocab

        # Preprocess and tokenize bkgd documents
        corpus_tokens = tokenize.word_punct_tokens(
            normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)
        # Get sentence mask indices
        # ensure that you are using a vocabulary w/ punctuation
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys()
        sentence_mask = get_mask(corpus_indices,
                                 onehot_vocab,
                                 max_length=max_length)
        # One-hot encode documents w/ masks, and query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded,
                                    onehot_vocab,
                                    min_length,
                                    max_length,
                                    already_encoded=True)
        # Tokenize and one-hot encode query document
        doc_vectors = run_onehot(
            tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)),
            onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode == 'sentence':
            doc_masks = sentence_mask
        else:
            doc_masks = [index for index, w in enumerate(doc_input)]

    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus,
                                                   mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params,
                                        mask_mode)

        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    return doc_input, doc_questions, doc_masks
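A hedged sketch of a mem_net_params configuration covering the keys this function reads; the values are illustrative, and the remaining arguments (raw_doc, raw_corpus, sentences_full, vocab, full_vocab, w2v_model, encoder_decoder) are assumed to come from the caller's pipeline.

# Illustrative configuration; only the keys read above are shown.
mem_net_params = {
    "embed_mode": "word2vec",   # or 'skip_thought' / 'onehot'
    "mask_mode": "sentence",    # any other value falls back to per-word masks
    # "onehot_min_len": 5,      # only consulted when embed_mode == 'onehot'
    # "onehot_max_len": 1000,
}
doc_input, doc_questions, doc_masks = gen_mem_net_observations(
    raw_doc, raw_corpus, sentences_full, mem_net_params,
    vocab, full_vocab, w2v_model, encoder_decoder)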
Example #12
def gen_mem_net_observations(raw_doc, raw_corpus, sentences_full, mem_net_params, vocab, full_vocab, w2v_model, encoder_decoder):
    '''
    Generates observations to be fed into the mem_net code

    Args:
        raw_doc (string): the raw document text
        raw_corpus (str): the raw corpus text
        sentences_full (list): list of all sentences in the corpus
        mem_net_params (dict): the specified features to be calculated for mem_net
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the full vocabulary of the data set, including punctuation, used for one-hot encoding
        w2v_model: the word2vec model of the data set
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors

    Returns:
        doc_input (array): the corpus data, known in mem_nets as the input
        doc_questions: the document data, known in mem_nets as the question
        doc_masks: the mask for the input data - tells mem_net where the end of each input is
            this can be per word for the end of a sentence
     '''

    # Use the specified mask mode where available
    if mem_net_params.get('mask_mode', False):
        mask_mode = mem_net_params["mask_mode"]
    else:
        mask_mode = 'sentence'

    if mem_net_params.get('embed_mode', False):
        embed_mode = mem_net_params['embed_mode']
    else:
        embed_mode = 'word2vec'

    if embed_mode == 'skip_thought':
        from src.featurizers.skipthoughts import skipthoughts as sk
        doc_sentences = tokenize.punkt_sentences(raw_doc)

        # Ensure that the document and corpus are long enough and if not make them be long enough
        if len(sentences_full) == 1:
            #print("short corpus")
            sentences_full.extend(sentences_full)
        if len(doc_sentences) == 1:
            #print("short doc")
            doc_sentences.extend(doc_sentences)
        corpus_vectors = sk.encode(encoder_decoder, sentences_full)
        doc_vectors = sk.encode(encoder_decoder, doc_sentences)

        # Since each entry is a sentence, we use the index of each entry for the mask
        # We cannot use a word mode in this embedding
        doc_masks = [index for index, w in enumerate(corpus_vectors)]
        doc_questions = doc_vectors
        doc_input = corpus_vectors


    elif embed_mode == 'onehot':
        min_length = None
        max_length = None
        if mem_net_params.get('onehot_min_len', False):
            min_length = mem_net_params['onehot_min_len']
        if mem_net_params.get('onehot_max_len', False):
            max_length = mem_net_params['onehot_max_len']
        onehot_vocab = full_vocab

        # Preprocess and tokenize bkgd documents
        corpus_tokens = tokenize.word_punct_tokens(normalize.xml_normalize(raw_corpus))
        corpus_tokens = strip_to_vocab(corpus_tokens, onehot_vocab)
        corpus_indices = encode_doc(corpus_tokens, onehot_vocab)
        # Get sentence mask indices
        assert {'.', ',', '!', '?'} <= onehot_vocab.keys()  # ensure that you are using a vocabulary w/ punctuation
        sentence_mask = get_mask(corpus_indices, onehot_vocab, max_length=max_length)
        # One-hot encode documents w/ masks, and query document
        corpus_encoded = onehot_encode(corpus_indices, len(onehot_vocab))
        corpus_vectors = run_onehot(corpus_encoded, onehot_vocab, min_length, max_length, already_encoded=True)
        # Tokenize and one-hot encode query document
        doc_vectors = run_onehot(tokenize.word_punct_tokens(normalize.xml_normalize(raw_doc)),
                                    onehot_vocab, min_length, max_length)

        doc_questions = doc_vectors.T
        doc_input = corpus_vectors.T

        if mask_mode == 'sentence':
            doc_masks = sentence_mask
        else:
            doc_masks = [index for index, w in enumerate(doc_input)]


    elif embed_mode == 'word2vec':
        corpus_vectors, doc_masks = run_w2v_matrix(w2v_model, raw_corpus, mem_net_params, mask_mode)
        doc_vectors, _ = run_w2v_matrix(w2v_model, raw_doc, mem_net_params, mask_mode)

        if len(corpus_vectors) > 0 and len(doc_vectors) > 0:
            doc_questions = doc_vectors
            doc_input = corpus_vectors

    return doc_input, doc_questions, doc_masks
Example #13
def run_w2v_elemwise(w2v_model, doc, w2v, operation):
    '''
      Calculates Word2Vec vectors for a document using the first and last sentences of the document
      Examines vector elements and retains maximum, minimum or absolute value for each vector element

      Args:
          w2v_model (gensim.Word2Vec): Trained Word2Vec model
          doc (str): the text of the document
          w2v (dict): Dictionary of Word2Vec parameters as set in master_pipeline. The dictionary
           will include keys for the model building parameters min_count, window, size, workers and pretrained.
           The dict may also have optional boolean keys for the feature operations append, difference, product and cos.
          operation (str): element wise operation of max, min or abs
      Returns:
          documentvector (list): Word2Vec vectors with min/max/abs element values for a sentence, which are then
          concatenated across sentences
      '''
    # Get first and last sentences of document, break down sentences into words and remove stop words

    sentences = get_first_and_last_sentence(doc)
    normalizedsentences = []

    for sentence in sentences:
        words = normalize.remove_stop_words(tokenize.word_punct_tokens(sentence))
        normalizedsentences.append(words)

    sentencevectorarray = []

    # Look up word vectors in trained Word2Vec model and build array of word vectors and sentence vectors
    for phrase in normalizedsentences:

        # Set up comparison vector based on requested operation
        if operation == 'max':
            vectorlist = np.full(w2v['size'], -np.inf)
        elif operation == 'min':
            vectorlist = np.full(w2v['size'], np.inf)
        elif operation == 'abs':
            vectorlist = np.zeros(w2v['size'])

        # Determine word vector and evaluate elements against comparison vector
        for word in phrase:
            try:
                wordvector = w2v_model[word]
            except KeyError:
                continue
            if operation == 'max':
                vectorlist = np.where(wordvector > vectorlist, wordvector, vectorlist)
            elif operation == 'min':
                vectorlist = np.where(wordvector < vectorlist, wordvector, vectorlist)
            elif operation == 'abs':
                vectorlist = np.where(abs(wordvector) > vectorlist, abs(wordvector), vectorlist)

        # Remove any infinity values from special cases (ex: 1 word sentence and word not in word2vec model)
        vectorlist = np.where(np.isinf(vectorlist), 0, vectorlist)

        sentencevectorarray.append(vectorlist)

    # Only concatenate if both sentences were added to sentence vector array, otherwise append array of zeroes
    if len(sentencevectorarray) == 2:
        documentvector = np.concatenate(sentencevectorarray)
    elif len(sentencevectorarray) == 1:
        documentvector = np.concatenate((sentencevectorarray[0], np.zeros(w2v['size'])))
    else:
        documentvector = np.zeros(w2v['size']*2)
    return documentvector
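A hedged usage sketch, again assuming a model trained with size=50; with operation='max' each element of a sentence vector is the largest value seen for that dimension across the sentence's words, and the two sentence vectors are concatenated.

w2v = {"size": 50}
docvec = run_w2v_elemwise(w2v_model, "Opening sentence about storms. Closing sentence about damage.", w2v, operation="max")
# docvec has length 2 * w2v['size']; 'min' and 'abs' follow the same pattern.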
Example #14
def test_word_punct():
    """Test regex-based word and punctuation tokenization"""

    assert tokenize.word_punct_tokens("Who are you??? Stop, now!") == \
        ["Who", "are", "you", "???", "Stop", ",", "now", "!"]