from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.utils.nlp import polarity

# Sample sentences that are tokenized and scored below.
sentences = [
    "So there is no way for me to plug it in here in the US unless I go by a converter.",
    "Good case, Excellent value.",
    "Works great!",
    'The design is very odd, as the ear "clip" is not very comfortable at all.',
    "Needless to say, I wasted my money."
]

# Preprocessing pipeline: the tokenizer is built first and its bound
# ``tokenize`` method is handed to the pre-processor.
social_tokenizer = SocialTokenizer(lowercase=True)
text_processor = TextPreProcessor(
    fix_text=True,
    unpack_contractions=True,
    tokenizer=social_tokenizer.tokenize,
)

# Run every sentence through the pipeline; each result is a token list.
tokenized_sentences = list(text_processor.pre_process_docs(sentences))

# Print one line per sentence: the polarity value with four decimals, a tab,
# then the tokens joined by single spaces.
for sent in tokenized_sentences:
    sentence_polarity, _ = polarity(sent)
    score_column = "{:.4f}\t".format(sentence_polarity)
    print(score_column + " ".join(sent))
def yelpInstanceConstructFromTrain(self, paramFpathInTrainTxt,
                                   paramFpathOutToken2IndexDict,
                                   paramFpathOutIndex2TokenDict,
                                   paramFpathOutTrainParams,
                                   paramFpathOutTrainInstance):
    '''
    tokenize the train text, build the vocabulary tables, and write the
    windowed count-vector training instances
    ===================================================
    parameters:
    -----------
    paramFpathInTrainTxt: input text file, one document per line
    paramFpathOutToken2IndexDict: output json file, map token to index
    paramFpathOutIndex2TokenDict: output json file, map index to token
    paramFpathOutTrainParams: output text file holding the number of
        training instances, the document sequence length and the
        vocabulary size
    paramFpathOutTrainInstance: output text file, one training instance
        per line

    return:
    -----------
    None
    '''

    # read in the train.txt, one stripped line per document; the context
    # manager closes the file even if reading fails
    with open(paramFpathInTrainTxt, 'rt', encoding='utf8') \
            as fpointerInTrainTxt:
        listTrainTxt = [aLine.strip() for aLine in fpointerInTrainTxt]

    # ----------initialize TextPreProcessor
    # NOTE: the keyword is 'normalize'. It used to be spelled 'normailze',
    # which is not an option name, so the normalization list was never
    # applied. Token output therefore differs from files produced with the
    # old spelling; regenerate dependent artefacts together.
    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", "emphasis",
            "censored"
        },
        fix_html=True,
        segmenter="english",
        corrector="english",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])
    # ----------Initialize TextPreProcessor

    listTrainTxtTokenized = \
        list(text_processor.pre_process_docs(listTrainTxt))
    listTrainTxt = None  # release the raw text

    # ----------save the vocabulary table,
    # calculate and save the parameters

    # filter top 20,000 tokens by frequency; the stable sort keeps
    # first-seen order between tokens with equal counts
    dictVocabulary2Freq = dict()
    for listTokens in listTrainTxtTokenized:
        for aToken in listTokens:
            dictVocabulary2Freq[aToken] = \
                dictVocabulary2Freq.get(aToken, 0) + 1
    list_k_v_top_20000 = sorted(dictVocabulary2Freq.items(),
                                key=operator.itemgetter(1),
                                reverse=True)[0:20000]
    set_top_20000 = {k for k, _ in list_k_v_top_20000}
    dictVocabulary2Freq = None
    list_k_v_top_20000 = None

    # calculate maxDocumentSize and vocabularySize; indexes are handed out
    # in order of first appearance of each kept token
    maxDocumentSize = 0
    dictVocabulary2Index = dict()
    dictIndex2Vocabulary = dict()
    for listTokens in listTrainTxtTokenized:
        maxDocumentSize = max(maxDocumentSize, len(listTokens))
        for aToken in listTokens:
            # filter rare words, reduce vocabulary size
            if aToken not in set_top_20000:
                continue
            if aToken not in dictVocabulary2Index:
                tokenNewIndex = len(dictVocabulary2Index)
                dictVocabulary2Index[aToken] = tokenNewIndex
                dictIndex2Vocabulary[tokenNewIndex] = aToken
    vocabularySize = len(dictVocabulary2Index)

    # write both lookup tables as json
    with open(paramFpathOutToken2IndexDict, 'wt', encoding='utf8') \
            as fp4jsonoutput:
        json.dump(dictVocabulary2Index, fp4jsonoutput, ensure_ascii=False)

    with open(paramFpathOutIndex2TokenDict, 'wt', encoding='utf8') \
            as fp4jsonoutput:
        json.dump(dictIndex2Vocabulary, fp4jsonoutput, ensure_ascii=False)

    # dictVocabulary2Index is still needed below, only the reverse map
    # can be released here
    dictIndex2Vocabulary = None

    str4write = 'TrainingInstances: %d\n' % len(listTrainTxtTokenized)\
        + 'DocumentSeqLen: %d\n' % maxDocumentSize\
        + 'VocabularySize: %d\n' % vocabularySize
    with open(paramFpathOutTrainParams, 'wt', encoding='utf8') \
            as fpointerOutParams:
        fpointerOutParams.write(str4write)
    # ----------calculate and save the parameters

    # ----------construct training instances and perform padding
    print('Hello1')

    def _tokenlist_to_traininstance(tokenlist):
        '''
        from tokenlist to padded instance list

        every token position contributes two count vectors of length
        vocabularySize: one for the token itself and one for the tokens
        in the window around it; the instance is then padded with
        all-zero vectors up to 2 * maxDocumentSize vectors
        '''
        tokenlist_size = len(tokenlist)
        traininginstance = list()
        for n in range(tokenlist_size):
            # ----------split tokenlist section
            # clamp only the lower bound: a negative start would wrap
            # around, while an end past the list is cut off by slicing
            window_start = max(0, n - HALF_WINDOW_SIZE)
            tokenlist_section = tokenlist[window_start:
                                          n + HALF_WINDOW_SIZE]

            # ----------calculate tokenlist multiterm
            # tokens filtered out of the vocabulary have no index; they
            # are skipped (counted nowhere) instead of raising KeyError
            countlist_center = [0] * vocabularySize
            center_index = dictVocabulary2Index.get(tokenlist[n])
            if center_index is not None:
                countlist_center[center_index] += 1
            traininginstance.append(countlist_center)

            countlist_window = [0] * vocabularySize
            for atoken in tokenlist_section:
                atoken_index = dictVocabulary2Index.get(atoken)
                if atoken_index is not None:
                    countlist_window[atoken_index] += 1
            traininginstance.append(countlist_window)

        # ----------padding: two zero vectors per missing position
        padding_vector_count = 2 * (maxDocumentSize - tokenlist_size)
        traininginstance.extend(
            [0] * vocabularySize for _ in range(padding_vector_count))

        return traininginstance

    def _traininstance_to_string(traininstance):
        '''
        from traininstance to a string

        each count vector is written as a space, then its space-joined
        values; the line ends with a newline
        '''
        return ''.join(
            ' ' + ' '.join(map(str, acountlist_vocab))
            for acountlist_vocab in traininstance) + '\n'

    with open(paramFpathOutTrainInstance, 'wt', encoding='utf8') \
            as fpointerOutTrainInstance:
        for aTrainTxtTokenized in listTrainTxtTokenized:
            aTrainInstance = _tokenlist_to_traininstance(
                aTrainTxtTokenized)
            fpointerOutTrainInstance.write(
                _traininstance_to_string(aTrainInstance))

    return None
def compute_elmo_rep(model_dir, input_list, mtype='BiLSTMAttention'):
    '''
    Given a list of documents, return a list of embedded documents
    each element in list is [sentence len] * [word embedding dim]
    ===================================================
    parameters:
    -----------
    model_dir: directory of the trained model; the weights are loaded
        from '<model_dir>/model'
    input_list: list of raw document strings
    mtype: not used inside this function; kept so existing callers
        keep working

    return:
    -----------
    numpy array taken from the first entry of the model's
    'elmo_representations' output
    '''
    config = DefaultConfig(
    )  # Just take the default config to do the prediction work
    config.set_attrs({'batch_size': 8})
    model_path = '%s/model' % model_dir

    # NOTE: the keyword is 'normalize'. It used to be spelled 'normailze',
    # which is not an option name, so the normalization list was never
    # applied. Tokenization therefore differs from runs made with the old
    # spelling; check it matches how the loaded model's data was prepared.
    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", "emphasis",
            "censored"
        },
        fix_html=True,
        segmenter="english",
        corrector="english",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    listTokenized = list(text_processor.pre_process_docs(input_list))
    print('After tokenization:')
    print(listTokenized)

    # e.g. batch_to_ids([['I', 'am', 'a', 'sentense'], ['A', 'sentense']])
    tensorTokenizedCharEncoded = batch_to_ids(listTokenized)

    arrayTokenizedCharEncoded = tensorTokenizedCharEncoded.numpy().astype(
        numpy.int32)

    x = Variable(torch.from_numpy(arrayTokenizedCharEncoded).long(),
                 requires_grad=False)
    if config.on_cuda:
        x = x.cuda()
    else:
        x = x.cpu()

    # 1024 is the Elmo size, the concatenated hidden size is supposed to
    # Elmo size, however, any size is OK
    elmo_size = 1024
    bilstm_hidden_size = elmo_size // 2
    bilstm_concat_size = bilstm_hidden_size * 2
    # attention size should be a smoothed representation of character-emb;
    # with the values above this evaluates to 1024
    attention_size = (bilstm_concat_size // elmo_size * elmo_size
                      + bilstm_concat_size % elmo_size)

    model = biLSTMAttention.BiLSTMAttention(
        param_document_seq_len=tensorTokenizedCharEncoded.size(
            1),  # 300 in our model
        param_character_embedding_len=tensorTokenizedCharEncoded.size(
            2),  # it depends on the setting
        param_bilstm_hidden_size=bilstm_hidden_size,
        param_attention_size=attention_size,
        param_class_count=5,
        param_options_file=config.options_file,
        param_weight_file=config.weight_file)

    print('Loading trained model')
    # here, load and save are defined in biLSTMAttention.py
    # load <=> model.load_state_dict( torch.load(path) )
    # save <=> torch.save( model.state_dict(), path )
    # an other way:
    # model = torch.load( path ) # has 2 field, if torch.save( model, path ),
    # then both ['state_dict'] and ['struct'] != None
    # torch.save( model, path )
    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()

    # inference only: switch to evaluation mode so that any dropout layers
    # are disabled and the returned representation is deterministic
    # (assumes BiLSTMAttention is a torch.nn.Module, as the
    # load_state_dict / cuda / cpu usage above suggests)
    model.eval()

    elmo_dict = model.forward_obtainTrainedElmoRep(x)

    # since num_output_representations = 1, so len(list_elmo_rep) = 1,
    # if num_output_representations == 2, then will produce 2 same
    # elmo_representations of [batch_size, seq_len, wordembedding_len]
    elmo_rep = elmo_dict['elmo_representations'][0]

    arr_elmo_rep = elmo_rep.data.cpu().numpy()
    return arr_elmo_rep