Example #1
def run_pipeline(path: str,
                 s_ngram: int,
                 s_step: int,
                 window: int = 20,
                 no_components: int = 100,
                 learning_rate: float = 0.05,
                 glove_epochs: int = 10,
                 no_threads: int = 4,
                 verbose: bool = True) -> np.ndarray:
    """ Run the entire pipeline:
    
    1. tokenize parsed data.
    2. build embeddings.
    3. counter-fit embeddings.
    4. return a 3d embedding structure.

    :param path: the path to the preprocessed session notes.
    :param s_ngram: the size of the ngrams.
    :param s_step: the number of tokens to step over between consecutive ngrams.
    :param window: the context window size used when training the GloVe embeddings.
    :param no_components: number of dimensions for each embedding.
    :param learning_rate: the learning rate for the model.
    :param glove_epochs: the number of training iterations.
    :param no_threads: number of threads to use for processing.
    :param verbose: toggle extra print output.
    :returns: the final 3-D embedding structure.
    
    """

    # tokenize the data
    token_set = tokenizer.tokenize(path, s_ngram, s_step, True)

    # train the glove object and create embeddings for each word
    glove_object = train_glove(token_set,
                               window=window,
                               no_components=no_components,
                               learning_rate=learning_rate,
                               glove_epochs=glove_epochs,
                               no_threads=no_threads,
                               verbose=verbose)

    # save the embeddings and vocabulary for counter-fitting
    save_embeddings('./pipeline/bash_files/word_vectors/vectors.txt',
                    glove_object.word_vectors, token_set.word2idx)
    save_vocabulary('./pipeline/bash_files/word_vectors/vocabulary.txt',
                    [word for word in token_set.word2idx.keys()])

    # perform counter-fitting procedure
    cf_embeddings = counter_fit()

    # get a list of sentences per venture/session pair
    token_set = token_set.collapse_speakers()
    sentences = np.array(token_set.all_tokens.tolist())

    # build the final embedding structure
    embed_struct = build_embed_struct(sentences, cf_embeddings)

    return embed_struct
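A minimal usage sketch for run_pipeline; the CSV path and the ngram/step sizes below are hypothetical placeholders rather than values taken from the project:

# hypothetical call; the path and sizes are placeholders
embed_struct = run_pipeline('path/to/preprocessed_session_notes.csv',
                            s_ngram=5,
                            s_step=1)
print(len(embed_struct))  # one entry per collapsed venture/session sentence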
Example #2
def test_grammar(file_text, grammar_enum, grammar_symbol_enum):
    tokens = tokenize(file_text)
    print_tokens(tokens)
    print()

    rule_table = build_rule_table(grammar_enum, grammar_symbol_enum)
    print_rule_table(rule_table)
    print()

    ast = build_abstract_syntax_tree(rule_table, tokens, grammar_enum,
                                     grammar_symbol_enum)
    print_ast(ast)
Example #3
def tokenize_and_stem(text):
    # tokenize
    tokens = tokenizer.tokenize(text)

    # remove punctuation
    tokens = remove_punctuation(tokens)

    # remove propers
    tokens = remove_lowercase_noun(tokens)

    # stem
    tokens = stem_words(tokens)

    # return the processed tokens
    return tokens
Example #4
def get_tokens(params, **kwargs):
    """Convenience function for getting data, labels from params.

    Handles oversampling based on the Params object.

    :param params: a Params object with model params
    :param kwargs: extra keyword arguments forwarded to random_oversample
    """
    tokenset = tokenize(os.path.join(DATA_PATH, SESSION_NOTES_CSV),
                        **params.to_dict())
    if params.oversample:
        data, labels = tokenset.random_oversample(params.sample_type, **kwargs)
    else:
        data = tokenset.get_all_data()
        labels = tokenset.get_all_labels()
    return data, labels
Example #5
def optimizer(parameter_space: Dict) -> float:
    """Optimizer function for hyperopt training of TOKEN_GRU.

    :param parameter_space: Dict of hyperopt parameters

    :returns: negative f1_score
    """
    params = make_params(model_type=GenericGRU.TOKEN_GRU, **parameter_space)
    tokenset = tokenize(os.path.join(DATA_PATH, SESSION_NOTES_CSV),
                        **params.to_dict())
    rs = ShuffleSplit(n_splits=1,
                      train_size=0.7,
                      test_size=0.3,
                      random_state=17)
    ventures = tokenset.get_venture_set()
    train_inds, test_inds = next(iter(rs.split(ventures)))
    fit_predict = make_TokenGRU(params)
    return -fit_predict(tokenset, ventures[train_inds], ventures[test_inds])
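Because optimizer returns the negative f1 score, it can be minimized directly with hyperopt's fmin. A minimal sketch, assuming a hypothetical search space (the real parameter names and ranges are whatever make_params expects):

from hyperopt import fmin, hp, tpe

# hypothetical search space; the actual keys/ranges live with make_params
parameter_space = {
    'learning_rate': hp.loguniform('learning_rate', -7, -2),
    'hidden_size': hp.choice('hidden_size', [64, 128, 256]),
}

# minimize the negative f1 score returned by optimizer
best = fmin(fn=optimizer, space=parameter_space, algo=tpe.suggest, max_evals=50)
print(best)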
Example #6
def test_build_embed_struct(s_ngram, no_components):
    """Function to test build_embed_struct in evc_tool/domain/model/embedding.py.

    :param s_ngram: n_gram size
    :param no_components: number of dimensions per token
    """

    token_set = tokenizer.tokenize(
        'pipeline/data_loader/stage_three_plus_contractions_preprocessed_session_notes.csv',
        s_ngram, 1, True)
    glove_object = embedding.train(token_set, no_components=no_components)
    embed_struct = embedding.build_embed_struct(token_set, glove_object)

    no_sentences = len(token_set.collapse_speakers().all_tokens.tolist())

    # check if the final dimensions of the embed_struct are correct
    assert len(embed_struct) == no_sentences
    assert len(embed_struct[0]) == 2259
    assert len(embed_struct[0][0]) == no_components
Example #7
import sys
import numpy as np
np.set_printoptions(threshold=np.inf)

sys.path.append('../tokenizer')
from tokenizer import tokenizer

if __name__ == '__main__':

    # create TokenSet
    token_set = tokenizer.tokenize('data/examples/truncated.csv', 433, 1, True)

    # get venture/session/token data
    venture_data = token_set.get_all_ventures().numpy()
    session_data = token_set.get_all_sessions().numpy()
    token_data = token_set.get_all_data().numpy()

    # get boolean location of each venture/session pair
    vs_pairs = {}
    for v in set(venture_data):
        venture_bool = [int(x == v) for x in venture_data]
        for s in set(session_data):
            session_bool = [int(x == s) for x in session_data]
            vs_pairs[(v, s)] = np.logical_and(venture_bool, session_bool) == 1

    # get actual token values for each venture/session pair, and collapse
    # each row of tokens into a single array
    vs_data = {}
    for key, value in vs_pairs.items():
        vs_data[key] = token_data[value].flatten()
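The nested loops above build a boolean mask for every venture/session combination before indexing token_data. The same grouping can be done in a single pass with a defaultdict; a sketch assuming the venture_data, session_data and token_data arrays created above (pairs that never occur simply do not appear in the dict):

from collections import defaultdict

# group the token rows by (venture, session) in a single pass
grouped = defaultdict(list)
for venture, session, tokens in zip(venture_data, session_data, token_data):
    grouped[(venture, session)].extend(tokens)
vs_data_alt = {key: np.array(value) for key, value in grouped.items()}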
Example #8
def sentiment(phrase):
    if not isinstance(phrase, str):
        raise TypeError(
            'sentiment: input phrase must be a string, instead found: ' +
            str(type(phrase)))

    # Early exit.
    tokenizedPhrase = tokenize(phrase)
    if len(tokenizedPhrase) == 0:
        return {'score': 0, 'normalizedScore': 0}

    # Sentiment Score.
    ss = 0
    # Hash Tags SS.
    hss = 0
    # Number of sentiment containing hashtags and words encountered.
    sentiHashtags = 0
    sentiWords = 0
    # Number of words encountered.
    words = 0
    wc = 0
    # Helpers: current token, temporary sentiment score, and negation flag.
    tss = 0
    precedingNegation = False
    for k in range(len(tokenizedPhrase)):
        if wc == 2:
            wc = 0
            continue
        tkn = tokenizedPhrase[k]
        t = tkn['value']
        tss = 0
        if tokenizedPhrase[k]['tag'] == 'punctuation':
            precedingNegation = False

        if tkn['tag'] == 'emoji':
            try:
                tss = emojis[t]
            except KeyError:
                pass
            if tss:
                ss += tss
                tkn['score'] = tss
                sentiWords += 1
            words += 1
        elif tkn['tag'] == 'emoticon':
            try:
                tss = emoticons[t]
            except KeyError:
                pass
            if tss:
                ss += tss
                tkn['score'] = tss
                sentiWords += 1
            words += 1
        elif tkn['tag'] == 'hashtag':
            if t[1:].lower() in afinn: tss = afinn[t[1:].lower()]
            if tss:
                tkn['score'] = tss
                hss += tss
                sentiHashtags += 1
        elif tkn['tag'] == 'word':
            t = t.lower()
            wc = 1
            # Check for a bigram configuration, i.e. tokens at `k` and `k + 1`, and
            # compute the sentiment score in `tss`; comparisons are case insensitive.
            next_word = (tokenizedPhrase[k + 1]['value'].lower()
                         if k < len(tokenizedPhrase) - 1 else None)
            if t in affin2Grams and next_word in affin2Grams[t]:
                tss = affin2Grams[t][next_word]
                tkn['grouped'] = 1
                # A matched bigram counts as two words.
                wc = 2
            else:
                # Unigram lookup; default to 0 when the word is not in the lexicon.
                tss = afinn.get(t, 0)

            # flip the score if the negation flag is set
            if precedingNegation:
                tss = -tss
                tkn['negation'] = True

            # set the negation flag when a negation word is encountered
            if t in negations and wc == 1:
                precedingNegation = True

            ss += tss
            # skipping the second word of a matched bigram is handled by the
            # `wc == 2` check at the top of the loop
            if tss:
                tkn['score'] = tss
                sentiWords += 1

            # Update number of words accordingly.
            words += wc


    return {
        'score': ss + hss,
        'normalizedScore': round(
            normalize(hss, ss, sentiHashtags, sentiWords, words), 2),
        'tokenizedPhrase': tokenizedPhrase
    }
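A quick usage sketch for sentiment; the phrase is arbitrary, and the lexicons used above (afinn, affin2Grams, emojis, emoticons, negations) are assumed to be loaded by the surrounding module:

result = sentiment('not bad, actually a pretty #awesome day :)')
print(result['score'], result['normalizedScore'])
for tkn in result['tokenizedPhrase']:
    if 'score' in tkn:
        print(tkn['value'], tkn['tag'], tkn['score'])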
Example #9
def test_filter(ventures, sessions, labels, cohorts, sites, speakers):

    # create token set from example data
    token_set = tokenizer.tokenize('data/examples/truncated.csv', 5, 1, True)

    # get data from each KeyMapper
    venture_data = token_set.venture_mapper.get_data()
    session_data = token_set.session_mapper.get_data()
    funding_data = token_set.funding_mapper.get_data()
    cohort_data = token_set.cohort_mapper.get_data()
    site_data = token_set.site_mapper.get_data()
    speaker_data = token_set.speaker_mapper.get_data()

    ### TEST filter with single input ###

    # get number of remaining tokens after we filter
    # for selected input, each independent of the others
    n_ventures = len(venture_data)
    if set(ventures).intersection(venture_data):
        n_ventures = 0
        for v in ventures:
            if v in venture_data:
                n_ventures += venture_data.count(v)
    n_sessions = len(session_data)
    if set(sessions).intersection(session_data):
        n_sessions = 0
        for s in sessions:
            if s in session_data:
                n_sessions += session_data.count(s)
    n_funding = len(funding_data)
    if set(labels).intersection(funding_data):
        n_funding = 0
        for l in labels:
            if l in funding_data:
                n_funding += funding_data.count(l)
    n_cohorts = len(cohort_data)
    if set(cohorts).intersection(cohort_data):
        n_cohorts = 0
        for c in cohorts:
            if c in cohort_data:
                n_cohorts += cohort_data.count(c)
    n_sites = len(site_data)
    if set(sites).intersection(site_data):
        n_sites = 0
        for s in sites:
            if s in site_data:
                n_sites += site_data.count(s)
    n_speakers = len(speaker_data)
    if set(speakers).intersection(speaker_data):
        n_speakers = 0
        for s in speakers:
            if s in speaker_data:
                n_speakers += speaker_data.count(s)

    # check if the numbers calculated above correspond with what
    # filter_data returns if called with only a single input parameter
    venture_set = token_set.filter_data(ventures=ventures)
    assert n_ventures == len(venture_set.venture_mapper.get_data())
    session_set = token_set.filter_data(sessions=sessions)
    assert n_sessions == len(session_set.session_mapper.get_data())
    funding_set = token_set.filter_data(labels=labels)
    assert n_funding == len(funding_set.funding_mapper.get_data())
    cohort_set = token_set.filter_data(cohorts=cohorts)
    assert n_cohorts == len(cohort_set.cohort_mapper.get_data())
    site_set = token_set.filter_data(sites=sites)
    assert n_sites == len(site_set.site_mapper.get_data())
    speaker_set = token_set.filter_data(speakers=speakers)
    assert n_speakers == len(speaker_set.speaker_mapper.get_data())

    ### TEST filter with multiple inputs ###

    # get number of remaining tokens after we filter
    # for selected input, independently at first...
    venture_bool = np.ones(len(venture_data))
    if set(ventures).intersection(venture_data):
        venture_bool = np.zeros(len(venture_data))
        for v in ventures:
            venture_bool = np.logical_or(venture_bool,
                                         [int(x == v) for x in venture_data])

    session_bool = np.ones(len(session_data))
    if set(sessions).intersection(session_data):
        session_bool = np.zeros(len(session_data))
        for s in sessions:
            session_bool = np.logical_or(session_bool,
                                         [int(x == s) for x in session_data])

    funding_bool = np.ones(len(funding_data))
    if set(labels).intersection(funding_data):
        funding_bool = np.zeros(len(funding_data))
        for l in labels:
            funding_bool = np.logical_or(funding_bool,
                                         [int(x == l) for x in funding_data])

    cohort_bool = np.ones(len(cohort_data))
    if set(cohorts).intersection(cohort_data):
        cohort_bool = np.zeros(len(cohort_data))
        for c in cohorts:
            cohort_bool = np.logical_or(cohort_bool,
                                        [int(x == c) for x in cohort_data])

    site_bool = np.ones(len(site_data))
    if set(sites).intersection(site_data):
        site_bool = np.zeros(len(site_data))
        for s in sites:
            site_bool = np.logical_or(site_bool,
                                      [int(x == s) for x in site_data])

    speaker_bool = np.ones(len(speaker_data))
    if set(speakers).intersection(speaker_data):
        speaker_bool = np.zeros(len(speaker_data))
        for s in speakers:
            speaker_bool = np.logical_or(speaker_bool,
                                         [int(x == s) for x in speaker_data])

    # ... now combine all booleans generated above to get final count of remaining data
    n_all = np.logical_and(venture_bool, session_bool)
    n_all = np.logical_and(n_all, funding_bool)
    n_all = np.logical_and(n_all, cohort_bool)
    n_all = np.logical_and(n_all, site_bool)
    n_all = np.logical_and(n_all, speaker_bool)

    count = n_all.tolist().count(1)

    # call filter data with multiple inputs, and check if the token counts calculated
    # above correspond with the counts after running the filter method
    token_set = token_set.filter_data(ventures, sessions, labels, cohorts,
                                      sites, speakers)

    assert count == len(token_set.venture_mapper.get_data())
    assert count == len(token_set.session_mapper.get_data())
    assert count == len(token_set.funding_mapper.get_data())
    assert count == len(token_set.cohort_mapper.get_data())
    assert count == len(token_set.site_mapper.get_data())
    assert count == len(token_set.speaker_mapper.get_data())
Example #10
def test_jrr_token(self):
    r = re.compile(r'\s+', re.MULTILINE)
    self.assertEqual(
        r.sub('', tokenize(expected_input)),
        r.sub('', expected_output),
    )
Example #11
def extractFeatures(text, lang=['en','en-US']):
	
	hashtags = techniques.extractHashtags(text)
	urls = techniques.extractURL(text)
	atUsers = techniques.extractAtUser(text)
	text = techniques.removeHashtags(text)
	text = techniques.removeAtUser(text)
	text = techniques.replaceURL(text, '')

	porter = PorterStemmer()

	stop_words = set(stopwords.words('english'))
	word_tokens = word_tokenize(text) 
	filtered_words = [] 
	functionWords = []
	  
	for w in word_tokens:
		if w not in stop_words:
			filtered_words.append(w)
		else:
			functionWords.append(w)
	
	#print(filtered_words)
	
	countElongated = techniques.countElongated(text)
	#sentenceWords = techniques.words(text)
	filtered_Sentence = ' '.join(filtered_words)
	originalText = text

	countCaps = techniques.countAllCaps(text)
	

	s = aspell.Speller('lang', lang[0])
	lang_tool = language_check.LanguageTool(lang[1])

	aspellDelta = []
	for word in filtered_words:
		suggest = s.suggest(word)
		if(len(suggest) > 0):
			suggest = suggest[0]
		else:
			suggest = word
		aspellDelta.append(ls.distance(word, suggest))

	langCheckDelta = []
	correctedSentence = techniques.words(language_check.correct(filtered_Sentence, lang_tool.check(filtered_Sentence)))
	word = 0
	while word < min(len(correctedSentence), len(filtered_words)):
		langCheckDelta.append(ls.distance(correctedSentence[word], filtered_words[word]))
		word += 1

	aspellDelta = sum(aspellDelta) / max(1,len(aspellDelta))
	langCheckDelta = sum(langCheckDelta) / max(1,len(langCheckDelta))
	sentenceSpellDelta = (aspellDelta + langCheckDelta) / 2.0

	sentence = ' '.join(correctedSentence)
	sentenceLength = len(filtered_words)
	sentenceWordLength = sum([len(x) for x in filtered_words]) / max(1,sentenceLength)

	stemmed_words = []
	for word in filtered_words:
		if word not in stop_words: 
			stemmed_words.append(porter.stem(word))
		else:
			functionWords.append(word)
		
	stemmed_sentence = ' '.join(stemmed_words)

	tokenizer = RegexpTokenizer("[a-zA-Z]+")
	tokenizedText = tokenizer.tokenize(stemmed_sentence)
	tokenizedChar = [c for c in ' '.join(tokenizedText)]

	sentenceCharTrigrams = [ ''.join(grams) for grams in ngrams(tokenizedChar, 3)]
	sentenceWordBigrams = [ ' '.join(grams) for grams in ngrams(tokenizedText, 2)]
	sentenceWordUnigrams = [ ' '.join(grams) for grams in ngrams(tokenizedText, 1)]
	functionWords = [ ' '.join(grams) for grams in ngrams(functionWords, 1)]

	return {
		'correctedSentence': sentence,
		'originalSentence': originalText,
		'filteredSentence': filtered_Sentence,
		'stemmedSentence': stemmed_sentence,
		'elongated': countElongated,
		'caps': countCaps,
		'textLength': sentenceLength,
		'sentenceWordLength': sentenceWordLength,
		'spellDelta': sentenceSpellDelta,
		'charTrigrams': sentenceCharTrigrams,
		'wordBigrams': sentenceWordBigrams,
		'wordUnigrams': sentenceWordUnigrams,
		'POSBigrams': '',
		'functionWords': functionWords,
		'hashtag': hashtags,
		'url': urls
	}
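A usage sketch for extractFeatures; the tweet text is arbitrary, and the NLTK corpora, aspell dictionary and language_check tool are assumed to be installed as the surrounding imports require:

features = extractFeatures('Loving this sooooo much!!! #nlp @someone http://example.com')
print(features['hashtag'], features['url'])
print(features['elongated'], features['caps'], features['spellDelta'])
print(features['wordBigrams'][:5])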
Example #12
def test_tokenizer_tokenSet_eq():

    token_setA = tokenizer.tokenize('data/examples/exampleA.csv', 5, 1, True)
    token_setB = tokenizer.tokenize('data/examples/exampleB.csv', 5, 1, True)

    token_setC = tokenizer.tokenize('data/examples/exampleC.csv', 5, 1, True)
    token_setD = tokenizer.tokenize('data/examples/exampleD.csv', 5, 1, True)
    token_setE = tokenizer.tokenize('data/examples/exampleE.csv', 5, 1, True)
    token_setF = tokenizer.tokenize('data/examples/exampleF.csv', 5, 1, True)
    token_setG = tokenizer.tokenize('data/examples/exampleG.csv', 5, 1, True)
    token_setH = tokenizer.tokenize('data/examples/exampleH.csv', 5, 1, True)
    token_setI = tokenizer.tokenize('data/examples/exampleI.csv', 5, 1, True)

    token_setJ = tokenizer.tokenize('data/examples/exampleA.csv', 3, 1, True)
    token_setK = tokenizer.tokenize('data/examples/exampleA.csv', 5, 2, True)
    token_setL = tokenizer.tokenize('data/examples/exampleA.csv', 5, 1, False)

    assert token_setA == token_setB  # identical
    assert token_setA != token_setC  # different cohort
    assert token_setA != token_setD  # different site
    assert token_setA != token_setE  # different session
    assert token_setA != token_setF  # different venture ID
    assert token_setA != token_setG  # different speaker ID
    assert token_setA != token_setH  # different comment
    assert token_setA != token_setI  # one row different
    assert token_setA != token_setJ  # different ngram size
    assert token_setA != token_setK  # different step size
    assert token_setA != token_setL  # end token vs no end token