def perprocessing(tdic):
    """Tokenize, normalize, lowercase and Porter-stem every tweet in *tdic*.

    Args:
        tdic: mapping of tweet id -> sequence whose [0] is the ground-truth
            label and [1] is the raw tweet text.

    Returns:
        dict mapping tweet id -> (ground_truth_label, cleaned_text), where
        cleaned_text is the space-joined, stemmed token stream after
        ``textPreprocessor01.replaceall`` placeholder substitution.

    NOTE(review): this file defines several ``perprocessing`` variants; at
    import time the last definition wins — confirm which one callers expect.
    """
    new_dic = {}
    # Build the stemmer once; the original constructed a new PorterStemmer
    # for every single token, which is pure loop-invariant overhead.
    stemmer = nltk.stem.PorterStemmer()
    for tweet_id, record in tdic.items():
        gt = record[0]
        # Two-stage tokenization: raw-tweet tokenizer, tagger normalization,
        # then a second tokenize pass over the normalized text.
        raw = ' '.join(twokenize.tokenizeRawTweetText(record[1]))
        text = twokenize.normalizeTextForTagger(raw)
        tokens = twokenize.tokenize(text)
        # Lowercase and stem each token in one pass.
        stemmed = [stemmer.stem(word.lower()) for word in tokens]
        newtext = ' '.join(stemmed)
        # Replace URLs, emoticons, etc. with canonical placeholder tokens.
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[tweet_id] = gt, newtext
    return new_dic
def perprocessing(tdic):
    """Tokenize, normalize, lowercase and lemmatize tweets; collect POS features.

    Args:
        tdic: mapping of tweet id -> sequence whose [0] is the ground-truth
            label and [1] is the raw tweet text.

    Returns:
        Tuple of:
        - dict mapping tweet id -> (ground_truth_label, cleaned_text), where
          cleaned_text is the space-joined lemmatized token stream after
          ``textPreprocessor01.replaceall`` placeholder substitution;
        - numpy array of the per-tweet POS feature rows produced by ``lemma``.
    """
    new_dic = {}
    POS_feature = []
    for tweet_id, record in tdic.items():
        gt = record[0]
        # Two-stage tokenization: raw-tweet tokenizer, tagger normalization,
        # then a second tokenize pass over the normalized text.
        raw = ' '.join(twokenize.tokenizeRawTweetText(record[1]))
        text = twokenize.normalizeTextForTagger(raw)
        tokens = twokenize.tokenize(text)
        lowered = [word.lower() for word in tokens]
        # lemma() returns (lemmatized_tokens, pos_feature_row) — presumably a
        # 2-element result; indexing matches the original's usage.
        afterlemma = lemma(lowered)
        POS_feature.append(afterlemma[1])
        newtext = ' '.join(afterlemma[0])
        # Replace URLs, emoticons, etc. with canonical placeholder tokens.
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[tweet_id] = gt, newtext
    # Debug prints from the original loop were removed; they spammed stdout
    # once per tweet.
    return new_dic, np.array(POS_feature)
def perprocessing(tdic):
    """Tokenize and normalize each tweet without stemming or lemmatization.

    Args:
        tdic: mapping of tweet id -> sequence whose [0] is the ground-truth
            label and [1] is the raw tweet text.

    Returns:
        dict mapping tweet id -> (ground_truth_label, cleaned_text), where
        cleaned_text is the space-joined token stream after
        ``textPreprocessor01.replaceall`` placeholder substitution.
    """
    cleaned = {}
    for key, record in tdic.items():
        label = record[0]
        # Raw-tweet tokenization followed by tagger normalization.
        joined = ' '.join(twokenize.tokenizeRawTweetText(record[1]))
        normalized = twokenize.normalizeTextForTagger(joined)
        # Re-tokenize the normalized text and substitute placeholder tokens
        # (URLs, emoticons, ...) before storing.
        retokenized = ' '.join(twokenize.tokenize(normalized))
        cleaned[key] = label, textPreprocessor01.replaceall(retokenized)
    return cleaned