def run_pipeline(path: str, s_ngram: int, s_step: int, window: int = 20,
                 no_components: int = 100, learning_rate: float = 0.05,
                 glove_epochs: int = 10, no_threads: int = 4,
                 verbose: bool = True) -> np.ndarray:
    """
    Run the entire pipeline:
        1. tokenize parsed data.
        2. build embeddings.
        3. counter-fit embeddings.
        4. return a 3-D embedding structure.

    :param path: the path to the preprocessed session notes.
    :param s_ngram: the size of the ngrams.
    :param s_step: the size of steps to skip between each ngram.
    :param window: the size of the GloVe context window.
    :param no_components: number of dimensions for each embedding.
    :param learning_rate: the learning rate for the model.
    :param glove_epochs: the number of training iterations.
    :param no_threads: number of threads to use for processing.
    :param verbose: toggle extra print output.
    """
    # tokenize the data
    token_set = tokenizer.tokenize(path, s_ngram, s_step, True)

    # train the glove object and create embeddings for each word
    glove_object = train_glove(token_set, window=window, no_components=no_components,
                               learning_rate=learning_rate, glove_epochs=glove_epochs,
                               no_threads=no_threads, verbose=verbose)

    # save the embeddings and vocabulary for counter-fitting
    save_embeddings('./pipeline/bash_files/word_vectors/vectors.txt',
                    glove_object.word_vectors, token_set.word2idx)
    save_vocabulary('./pipeline/bash_files/word_vectors/vocabulary.txt',
                    list(token_set.word2idx.keys()))

    # perform the counter-fitting procedure
    cf_embeddings = counter_fit()

    # get a list of sentences per venture/session pair
    token_set = token_set.collapse_speakers()
    sentences = np.array(token_set.all_tokens.tolist())

    # build the final embedding structure
    embed_struct = build_embed_struct(sentences, cf_embeddings)

    return embed_struct
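# A minimal usage sketch for run_pipeline above, assuming it and its helpers
# (tokenizer, train_glove, counter_fit, ...) are importable in this scope.
# The CSV path and n-gram settings below are illustrative placeholders, not
# values taken from the project.
if __name__ == '__main__':
    embed_struct = run_pipeline(
        path='data/preprocessed_session_notes.csv',  # placeholder path
        s_ngram=5,
        s_step=1,
        no_components=100,
    )
    # expected layout: (venture/session sentences, tokens per sentence, no_components)
    print(len(embed_struct), len(embed_struct[0]), len(embed_struct[0][0]))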
def test_grammar(file_text, grammar_enum, grammar_symbol_enum):
    """Tokenize file_text, build the rule table and AST, and print each stage."""
    tokens = tokenize(file_text)
    print_tokens(tokens)
    print()

    rule_table = build_rule_table(grammar_enum, grammar_symbol_enum)
    print_rule_table(rule_table)
    print()

    ast = build_abstract_syntax_tree(rule_table, tokens, grammar_enum, grammar_symbol_enum)
    print_ast(ast)
def tokenize_and_stem(text):
    # tokenize the raw text
    tokens = tokenizer.tokenize(text)
    # remove punctuation tokens
    tokens = remove_punctuation(tokens)
    # remove proper nouns
    tokens = remove_lowercase_noun(tokens)
    # stem the remaining tokens
    tokens = stem_words(tokens)
    return tokens
def get_tokens(params, **kwargs):
    """Convenience function for getting data and labels from params.

    Handles oversampling based on the Params object.

    :param params: a Params object with model params
    :param kwargs: forwarded to random_oversample when oversampling is enabled
    """
    tokenset = tokenize(os.path.join(DATA_PATH, SESSION_NOTES_CSV), **params.to_dict())
    if params.oversample:
        data, labels = tokenset.random_oversample(params.sample_type, **kwargs)
    else:
        data = tokenset.get_all_data()
        labels = tokenset.get_all_labels()
    return data, labels
def optimizer(parameter_space: Dict) -> float:
    """Optimizer function for hyperopt training of TOKEN_GRU.

    :param parameter_space: Dict of hyperopt parameters
    :returns: negative f1_score
    """
    params = make_params(model_type=GenericGRU.TOKEN_GRU, **parameter_space)
    tokenset = tokenize(os.path.join(DATA_PATH, SESSION_NOTES_CSV), **params.to_dict())

    rs = ShuffleSplit(n_splits=1, train_size=0.7, test_size=0.3, random_state=17)
    ventures = tokenset.get_venture_set()
    train_inds, test_inds = next(iter(rs.split(ventures)))

    fit_predict = make_TokenGRU(params)
    return -fit_predict(tokenset, ventures[train_inds], ventures[test_inds])
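# Sketch of wiring the objective above into hyperopt's fmin. The search-space
# keys and ranges here are hypothetical placeholders, not the project's actual
# hyperparameter configuration.
from hyperopt import Trials, fmin, hp, tpe

parameter_space = {
    'learning_rate': hp.loguniform('learning_rate', -7, -2),  # placeholder range
    'hidden_size': hp.choice('hidden_size', [64, 128, 256]),  # placeholder values
}
trials = Trials()
best = fmin(fn=optimizer, space=parameter_space, algo=tpe.suggest,
            max_evals=50, trials=trials)
print(best)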
def test_build_embed_struct(s_ngram, no_components):
    """Function to test build_embed_struct in evc_tool/domain/model/embedding.py.

    :param s_ngram: n_gram size
    :param no_components: number of dimensions per token
    """
    token_set = tokenizer.tokenize(
        'pipeline/data_loader/stage_three_plus_contractions_preprocessed_session_notes.csv',
        s_ngram, 1, True)
    glove_object = embedding.train(token_set, no_components=no_components)
    embed_struct = embedding.build_embed_struct(token_set, glove_object)
    no_sentences = len(token_set.collapse_speakers().all_tokens.tolist())

    # check that the final dimensions of the embed_struct are correct
    assert len(embed_struct) == no_sentences
    assert len(embed_struct[0]) == 2259
    assert len(embed_struct[0][0]) == no_components
import sys

import numpy as np

np.set_printoptions(threshold=np.inf)

sys.path.append('../tokenizer')
from tokenizer import tokenizer

if __name__ == '__main__':
    # create TokenSet
    token_set = tokenizer.tokenize('data/examples/truncated.csv', 433, 1, True)

    # get venture/session/token data
    venture_data = token_set.get_all_ventures().numpy()
    session_data = token_set.get_all_sessions().numpy()
    token_data = token_set.get_all_data().numpy()

    # get the boolean location of each venture/session pair
    vs_pairs = {}
    for v in set(venture_data):
        venture_bool = [int(x == v) for x in venture_data]
        for s in set(session_data):
            session_bool = [int(x == s) for x in session_data]
            vs_pairs[(v, s)] = np.logical_and(venture_bool, session_bool) == 1

    # get the actual token values for each venture/session pair, and collapse
    # each row of tokens into a single array
    vs_data = {}
    for key, value in vs_pairs.items():
        vs_data[key] = token_data[value].flatten()
def sentiment(phrase):
    if type(phrase) is not str:
        raise Exception(
            'sentiment: input phrase must be a string, instead found: ' + str(type(phrase)))

    tokenizedPhrase = tokenize(phrase)
    # Early exit.
    if len(tokenizedPhrase) == 0:
        return {'score': 0, 'normalizedScore': 0}

    # Sentiment score from words, emojis, and emoticons.
    ss = 0
    # Hashtags' sentiment score.
    hss = 0
    # Number of sentiment-containing hashtags and words encountered.
    sentiHashtags = 0
    sentiWords = 0
    # Number of words encountered, and word count for the current token.
    words = 0
    wc = 0
    # Temporary sentiment score for the current token.
    tss = 0
    precedingNegation = False

    for k in range(len(tokenizedPhrase)):
        # Skip the second token of a bigram scored on the previous iteration.
        if wc == 2:
            wc = 0
            continue

        tkn = tokenizedPhrase[k]
        t = tkn['value']
        tss = 0

        if tkn['tag'] == 'punctuation':
            precedingNegation = False

        if tkn['tag'] == 'emoji':
            try:
                tss = emojis[t]
            except KeyError:
                pass
            if tss:
                ss += tss
                tkn['score'] = tss
                sentiWords += 1
            words += 1
        elif tkn['tag'] == 'emoticon':
            try:
                tss = emoticons[t]
            except KeyError:
                pass
            if tss:
                ss += tss
                tkn['score'] = tss
                sentiWords += 1
            words += 1
        elif tkn['tag'] == 'hashtag':
            if t[1:].lower() in afinn:
                tss = afinn[t[1:].lower()]
            if tss:
                tkn['score'] = tss
                hss += tss
                sentiHashtags += 1
        elif tkn['tag'] == 'word':
            t = t.lower()
            wc = 1
            # Check for a bigram configuration, i.e. tokens at `k` and `k + 1`
            # (lower-cased for case-insensitive comparison); otherwise fall back
            # to the unigram AFINN score.
            if (k < len(tokenizedPhrase) - 1 and t in affin2Grams
                    and tokenizedPhrase[k + 1]['value'].lower() in affin2Grams[t]):
                tss = affin2Grams[t][tokenizedPhrase[k + 1]['value'].lower()]
                tkn['grouped'] = 1
                # Will have to count 2 words; the `wc == 2` check at the top of
                # the loop skips the paired token on the next iteration.
                wc = 2
            else:
                tss = afinn.get(t, 0)

            # Flip the score if the preceding word was a negation.
            if precedingNegation:
                tss = -tss
                tkn['negation'] = True
            # Mark the negation flag when a negation word is encountered.
            if t in negations and wc == 1:
                precedingNegation = True

            ss += tss
            if tss:
                tkn['score'] = tss
                sentiWords += 1
            # Update the number of words accordingly.
            words += wc

    return {
        'score': (ss + hss),
        'normalizedScore': round(normalize(hss, ss, sentiHashtags, sentiWords, words), 2),
        'tokenizedPhrase': tokenizedPhrase
    }
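# Example of calling sentiment() above; the phrase is made up and the comments
# only describe the structure of the returned dictionary, not actual scores.
result = sentiment('Not bad at all, I love it! #happy')
print(result['score'])            # combined word, emoji, and hashtag score
print(result['normalizedScore'])  # score normalized by word count, rounded to 2 places
for tkn in result['tokenizedPhrase']:
    print(tkn['value'], tkn['tag'], tkn.get('score'))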
def test_filter(ventures, sessions, labels, cohorts, sites, speakers):
    # create the token set from example data
    token_set = tokenizer.tokenize('data/examples/truncated.csv', 5, 1, True)

    # get data from each KeyMapper
    venture_data = token_set.venture_mapper.get_data()
    session_data = token_set.session_mapper.get_data()
    funding_data = token_set.funding_mapper.get_data()
    cohort_data = token_set.cohort_mapper.get_data()
    site_data = token_set.site_mapper.get_data()
    speaker_data = token_set.speaker_mapper.get_data()

    ### TEST filter with single input ###

    # get the number of remaining tokens after we filter
    # for each selected input, each independent of the others
    n_ventures = len(venture_data)
    if set(ventures).intersection(venture_data):
        n_ventures = sum(venture_data.count(v) for v in ventures if v in venture_data)

    n_sessions = len(session_data)
    if set(sessions).intersection(session_data):
        n_sessions = sum(session_data.count(s) for s in sessions if s in session_data)

    n_funding = len(funding_data)
    if set(labels).intersection(funding_data):
        n_funding = sum(funding_data.count(l) for l in labels if l in funding_data)

    n_cohorts = len(cohort_data)
    if set(cohorts).intersection(cohort_data):
        n_cohorts = sum(cohort_data.count(c) for c in cohorts if c in cohort_data)

    n_sites = len(site_data)
    if set(sites).intersection(site_data):
        n_sites = sum(site_data.count(s) for s in sites if s in site_data)

    n_speakers = len(speaker_data)
    if set(speakers).intersection(speaker_data):
        n_speakers = sum(speaker_data.count(s) for s in speakers if s in speaker_data)

    # check that the numbers calculated above correspond with what
    # filter_data returns when called with only a single input parameter
    venture_set = token_set.filter_data(ventures=ventures)
    assert n_ventures == len(venture_set.venture_mapper.get_data())
    session_set = token_set.filter_data(sessions=sessions)
    assert n_sessions == len(session_set.session_mapper.get_data())
    funding_set = token_set.filter_data(labels=labels)
    assert n_funding == len(funding_set.funding_mapper.get_data())
    cohort_set = token_set.filter_data(cohorts=cohorts)
    assert n_cohorts == len(cohort_set.cohort_mapper.get_data())
    site_set = token_set.filter_data(sites=sites)
    assert n_sites == len(site_set.site_mapper.get_data())
    speaker_set = token_set.filter_data(speakers=speakers)
    assert n_speakers == len(speaker_set.speaker_mapper.get_data())

    ### TEST filter with multiple inputs ###

    # get the number of remaining tokens after we filter
    # for each selected input, independently at first...
    venture_bool = np.ones(len(venture_data))
    if set(ventures).intersection(venture_data):
        venture_bool = np.zeros(len(venture_data))
        for v in ventures:
            venture_bool = np.logical_or(venture_bool, [int(x == v) for x in venture_data])

    session_bool = np.ones(len(session_data))
    if set(sessions).intersection(session_data):
        session_bool = np.zeros(len(session_data))
        for s in sessions:
            session_bool = np.logical_or(session_bool, [int(x == s) for x in session_data])

    funding_bool = np.ones(len(funding_data))
    if set(labels).intersection(funding_data):
        funding_bool = np.zeros(len(funding_data))
        for l in labels:
            funding_bool = np.logical_or(funding_bool, [int(x == l) for x in funding_data])

    cohort_bool = np.ones(len(cohort_data))
    if set(cohorts).intersection(cohort_data):
        cohort_bool = np.zeros(len(cohort_data))
        for c in cohorts:
            cohort_bool = np.logical_or(cohort_bool, [int(x == c) for x in cohort_data])

    site_bool = np.ones(len(site_data))
    if set(sites).intersection(site_data):
        site_bool = np.zeros(len(site_data))
        for s in sites:
            site_bool = np.logical_or(site_bool, [int(x == s) for x in site_data])

    speaker_bool = np.ones(len(speaker_data))
    if set(speakers).intersection(speaker_data):
        speaker_bool = np.zeros(len(speaker_data))
        for s in speakers:
            speaker_bool = np.logical_or(speaker_bool, [int(x == s) for x in speaker_data])

    # ... now combine all booleans generated above to get the final count of remaining data
    n_all = np.logical_and(venture_bool, session_bool)
    n_all = np.logical_and(n_all, funding_bool)
    n_all = np.logical_and(n_all, cohort_bool)
    n_all = np.logical_and(n_all, site_bool)
    n_all = np.logical_and(n_all, speaker_bool)
    count = n_all.tolist().count(1)

    # call filter_data with multiple inputs, and check that the token counts calculated
    # above correspond with the counts after running the filter method
    token_set = token_set.filter_data(ventures, sessions, labels, cohorts, sites, speakers)
    assert count == len(token_set.venture_mapper.get_data())
    assert count == len(token_set.session_mapper.get_data())
    assert count == len(token_set.funding_mapper.get_data())
    assert count == len(token_set.cohort_mapper.get_data())
    assert count == len(token_set.site_mapper.get_data())
    assert count == len(token_set.speaker_mapper.get_data())
def test_jrr_token(self):
    r = re.compile(r'\s+', re.MULTILINE)
    self.assertEqual(
        r.sub('', tokenize(expected_input)),
        r.sub('', expected_output),
    )
def extractFeatures(text, lang=['en', 'en-US']):
    # extract and then strip Twitter-specific markup
    hashtags = techniques.extractHashtags(text)
    urls = techniques.extractURL(text)
    atUsers = techniques.extractAtUser(text)
    text = techniques.removeHashtags(text)
    text = techniques.removeAtUser(text)
    text = techniques.replaceURL(text, '')

    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)

    # split tokens into content words and function (stop) words
    filtered_words = []
    functionWords = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_words.append(w)
        else:
            functionWords.append(w)

    countElongated = techniques.countElongated(text)
    filtered_Sentence = ' '.join(filtered_words)
    originalText = text
    countCaps = techniques.countAllCaps(text)

    # spell-check with aspell and language_check, recording the edit distance
    # between each word and its suggested correction
    s = aspell.Speller('lang', lang[0])
    lang_tool = language_check.LanguageTool(lang[1])
    aspellDelta = []
    for word in filtered_words:
        suggest = s.suggest(word)
        suggest = suggest[0] if len(suggest) > 0 else word
        aspellDelta.append(ls.distance(word, suggest))

    langCheckDelta = []
    correctedSentence = techniques.words(
        language_check.correct(filtered_Sentence, lang_tool.check(filtered_Sentence)))
    word = 0
    while word < min(len(correctedSentence), len(filtered_words)):
        langCheckDelta.append(ls.distance(correctedSentence[word], filtered_words[word]))
        word += 1

    aspellDelta = sum(aspellDelta) / max(1, len(aspellDelta))
    langCheckDelta = sum(langCheckDelta) / max(1, len(langCheckDelta))
    sentenceSpellDelta = (aspellDelta + langCheckDelta) / 2.0

    sentence = ' '.join(correctedSentence)
    sentenceLength = len(filtered_words)
    sentenceWordLength = sum([len(x) for x in filtered_words]) / max(1, sentenceLength)

    # stem the content words
    stemmed_words = []
    for word in filtered_words:
        if word not in stop_words:
            stemmed_words.append(porter.stem(word))
        else:
            functionWords.append(word)
    stemmed_sentence = ' '.join(stemmed_words)

    # build character trigrams and word n-grams from the stemmed sentence
    tokenizer = RegexpTokenizer("[a-zA-Z]+")
    tokenizedText = tokenizer.tokenize(stemmed_sentence)
    tokenizedChar = [c for c in ' '.join(tokenizedText)]
    sentenceCharTrigrams = [''.join(grams) for grams in ngrams(tokenizedChar, 3)]
    sentenceWordBigrams = [' '.join(grams) for grams in ngrams(tokenizedText, 2)]
    sentenceWordUnigrams = [' '.join(grams) for grams in ngrams(tokenizedText, 1)]
    functionWords = [' '.join(grams) for grams in ngrams(functionWords, 1)]

    return {
        'correctedSentence': sentence,
        'originalSentence': originalText,
        'filteredSentence': filtered_Sentence,
        'stemmedSentence': stemmed_sentence,
        'elongated': countElongated,
        'caps': countCaps,
        'textLength': sentenceLength,
        'sentenceWordLength': sentenceWordLength,
        'spellDelta': sentenceSpellDelta,
        'charTrigrams': sentenceCharTrigrams,
        'wordBigrams': sentenceWordBigrams,
        'wordUnigrams': sentenceWordUnigrams,
        'POSBigrams': '',
        'functionWords': functionWords,
        'hashtag': hashtags,
        'url': urls,
    }
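# Example call to extractFeatures() above; the tweet text is made up, and the
# keys printed below simply mirror the dictionary returned by the function
# (aspell, language_check, and the techniques module must be available).
features = extractFeatures('Loooove this #NLP talk!!! http://example.com @speaker')
print(features['hashtag'])     # extracted hashtags
print(features['spellDelta'])  # average edit distance to spelling corrections
print(features['charTrigrams'][:5])
print(features['wordBigrams'][:5])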
def test_tokenizer_tokenSet_eq():
    token_setA = tokenizer.tokenize('data/examples/exampleA.csv', 5, 1, True)
    token_setB = tokenizer.tokenize('data/examples/exampleB.csv', 5, 1, True)
    token_setC = tokenizer.tokenize('data/examples/exampleC.csv', 5, 1, True)
    token_setD = tokenizer.tokenize('data/examples/exampleD.csv', 5, 1, True)
    token_setE = tokenizer.tokenize('data/examples/exampleE.csv', 5, 1, True)
    token_setF = tokenizer.tokenize('data/examples/exampleF.csv', 5, 1, True)
    token_setG = tokenizer.tokenize('data/examples/exampleG.csv', 5, 1, True)
    token_setH = tokenizer.tokenize('data/examples/exampleH.csv', 5, 1, True)
    token_setI = tokenizer.tokenize('data/examples/exampleI.csv', 5, 1, True)
    token_setJ = tokenizer.tokenize('data/examples/exampleA.csv', 3, 1, True)
    token_setK = tokenizer.tokenize('data/examples/exampleA.csv', 5, 2, True)
    token_setL = tokenizer.tokenize('data/examples/exampleA.csv', 5, 1, False)

    assert token_setA == token_setB  # identical
    assert token_setA != token_setC  # different cohort
    assert token_setA != token_setD  # different site
    assert token_setA != token_setE  # different session
    assert token_setA != token_setF  # different venture ID
    assert token_setA != token_setG  # different speaker ID
    assert token_setA != token_setH  # different comment
    assert token_setA != token_setI  # one row different
    assert token_setA != token_setJ  # different ngram size
    assert token_setA != token_setK  # different step size
    assert token_setA != token_setL  # end token vs no end token