import re
import string


def sent_list(docs, splitStr='__label__'):
    """Split fastText-style lines ("__label__<label> <text>") into
    [text, sentiment] pairs, lowercasing, masking URLs and stripping
    punctuation along the way."""
    sent_analysis = []
    for i in range(1, len(docs)):    # starts at 1, so docs[0] is skipped
        text = str(docs[i])          # was `lines[i]`: undefined name; index docs instead
        splitText = text.split(splitStr)
        secHalf = splitText[1]
        sentiment = secHalf[0]       # first character after the label prefix
        # drop the label character plus following space, and the trailing newline
        text = secHalf[2:len(secHalf) - 1].lower()
        # Mask URLs before stripping punctuation; once the dots are gone the
        # patterns below can no longer match (note the <url> placeholder itself
        # loses its angle brackets in the punctuation pass).
        if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
        # str.translate returns a new string; the original discarded the result
        text = text.translate(str.maketrans('', '', string.punctuation))
        sent_analysis.append([text, sentiment])
    return sent_analysis
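# A minimal usage sketch for sent_list; the sample lines below are
# hypothetical fastText-style "__label__<label> <text>" strings.
docs = [
    "header line (skipped, since the loop starts at index 1)",
    "__label__1 loved it, trailer at www.example.com\n",
    "__label__0 terrible plot and worse acting.\n",
]
for text, sentiment in sent_list(docs):
    print(sentiment, text)
# e.g. "1 loved it trailer at url" and "0 terrible plot and worse acting"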
import sys

import nltk

if sys.version_info < (3,):
    from string import maketrans
else:
    maketrans = str.maketrans


def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """Converts a text to a sequence of POS-tagged words (or tokens).

    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).

    # Returns
        A list of (word, POS tag) tuples.
    """
    if lower:
        text = text.lower()
    # On Python 2 a unicode string needs a dict translation table; on
    # Python 3 str.maketrans builds the equivalent mapping.
    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_map = dict((ord(c), unicode(split)) for c in filters)
    else:
        translate_map = maketrans(filters, split * len(filters))
    text = text.translate(translate_map)
    seq = text.split()
    return nltk.pos_tag(seq)
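# Short usage sketch; assumes the NLTK tagger model has been downloaded
# (nltk.download('averaged_perceptron_tagger') on first use).
tagged = text_to_word_sequence("The quick, brown fox jumps!")
print(tagged)   # e.g. [('the', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ...]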
import re

from nltk.stem import SnowballStemmer


def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()           # project-local helper returning the stopword list
    stemmer = SnowballStemmer('english')  # build once, outside the loop
    cleaned_text_list = []
    for text in input_text_list:
        # NOTE: the original called text.translate(string.punctuation), which is
        # not a valid Python 3 translation table (it remaps control characters to
        # punctuation instead of removing anything). The regex rules below do the
        # real punctuation handling, so that step is dropped.
        # Convert words to lower case
        text = text.lower()
        # Clean the text: drop characters outside A-Za-z0-9 and a few punctuation marks
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        # Expand common contractions
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        # Normalize punctuation and spacing
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        # Normalize a few abbreviations
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t.co/[A-Za-z]{10}", " ", text)
        # Remove stopwords once before stemming...
        text = text.split()
        text = [word for word in text if word not in stoplist]
        stemmed_words = [stemmer.stem(word) for word in text]
        # ...and again after stemming
        cleanwordlist = [word for word in stemmed_words if word not in stoplist]
        text = " ".join(cleanwordlist)
        cleaned_text_list.append(text)
    return cleaned_text_list
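# Hypothetical usage; read_stopwords is project-local, so a minimal stand-in
# is defined here purely for illustration.
def read_stopwords():
    return {'the', 'a', 'is'}

print(_remove_pattern_2(["What's the weather?", "I've 2k followers!"]))
# roughly: ['what weather', 'i have 2000 follow !']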
try:
    from keras.preprocessing.text import base_filter  # legacy Keras 1.x helper
except ImportError:
    def base_filter():
        # fallback returning the same default filter string the Keras helper used
        return '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'


def bigram_text_to_word_sequence(text, bigram, filters=base_filter(),
                                 lower=False, split=" "):
    """filters: sequence of characters to filter out"""
    if lower:
        text = text.lower()
    # string.maketrans was removed in Python 3; str.maketrans builds the table
    text = text.translate(str.maketrans(filters, split * len(filters)))
    seq = text.split(split)
    sentences = [_f for _f in seq if _f]
    return bigram(sentences)
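# Hypothetical usage with gensim's Phraser as the bigram callable; the training
# sentences are illustrative only.
from gensim.models.phrases import Phrases, Phraser

train = [["new", "york", "is", "big"], ["new", "york", "never", "sleeps"]]
phraser = Phraser(Phrases(train, min_count=1, threshold=1))
tokens = bigram_text_to_word_sequence("new york is big",
                                      bigram=lambda toks: phraser[toks])
print(tokens)   # e.g. ['new_york', 'is', 'big']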
def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    if lower:
        text = text.lower()
    # On Python 3 every str is unicode, so the original Python 2
    # `type(text) == unicode` branch is unnecessary; str.maketrans
    # builds the same character-to-character table.
    translate_table = str.maketrans(filters, split * len(filters))
    text = text.translate(translate_table)
    seq = text.split(split)
    return [i for i in seq if i]
from textblob import TextBlob
from textblob.exceptions import NotTranslated


def translate(comment):
    # Decode bytes input, then translate the text to English via TextBlob;
    # if the text is already English (or untranslatable), keep it as-is.
    if hasattr(comment, "decode"):
        comment = comment.decode("utf-8")
    text = TextBlob(comment)
    try:
        text = text.translate(to="en")
    except NotTranslated:
        pass
    return str(text)
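# Usage sketch; TextBlob.translate calls the Google Translate web API, so it
# needs network access (and was removed in newer TextBlob releases).
print(translate("bonjour tout le monde"))   # e.g. 'hello everyone'
print(translate(b"already english"))        # bytes input is decoded first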
import re


def _remove_pattern_2(input_text_list):
    # Variant of the cleaner above: same regex normalization, but the stopword
    # removal and stemming steps are intentionally left disabled.
    cleaned_text_list = []
    for text in input_text_list:
        # NOTE: the original called text.translate(string.punctuation), which is
        # not a valid Python 3 translation table; the regex rules below handle
        # the punctuation, so that step is dropped.
        # Convert words to lower case
        text = text.lower()
        # Clean the text: drop characters outside A-Za-z0-9 and a few punctuation marks
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        # Expand common contractions
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        # Normalize punctuation and spacing
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        # Normalize a few abbreviations
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        cleaned_text_list.append(text)
    return cleaned_text_list
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


def clean_text(text):
    text = str(text)
    # NOTE: the original text.translate(string.punctuation) call is not a valid
    # Python 3 translation table; punctuation is handled by the regexes below.
    ## Convert words to lower case and split them
    text = text.lower().split()
    ## Remove stop words (and any token shorter than three characters)
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops and len(w) >= 3]
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text
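# Usage sketch; requires the NLTK stopword list (nltk.download('stopwords')).
print(clean_text("I've watched some movies!"))
# roughly: "i have watch movi !"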
import sys

if sys.version_info < (3,):
    from string import maketrans
else:
    maketrans = str.maketrans


def corenlp_tokenize_enpbt(text,
                           filters="!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n",
                           lower=True, split=" "):
    if lower:
        text = text.lower()
    # Map every filter character to the split character (Python 2/3 compatible)
    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_map = dict((ord(c), unicode(split)) for c in filters)
    else:
        translate_map = maketrans(filters, split * len(filters))
    text = text.translate(translate_map)
    # Tokenize with the shared CoreNLP client configured in the project settings
    ann = settings.CORENLP_CLIENT.annotate(text)
    return [x.word for x in ann.sentencelessToken]
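# Hypothetical wiring for settings.CORENLP_CLIENT; the original settings module
# is project-local. stanza's CoreNLPClient is assumed here as the client whose
# annotate() returns a protobuf Document carrying sentencelessToken when
# sentence splitting is not run, as the function above expects.
from stanza.server import CoreNLPClient

class settings:
    CORENLP_CLIENT = CoreNLPClient(annotators=['tokenize'],  # tokenize only, no ssplit
                                   timeout=30000)

print(corenlp_tokenize_enpbt("Hello, CoreNLP world!"))
# e.g. ['hello', 'corenlp', 'world']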
def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    if lower:
        text = text.lower()
    # In Python 3 every str is already unicode, so no Python 2 type check is
    # needed; the dict maps each filter character to the split character.
    # str.maketrans(filters, split * len(filters)) would build the same table,
    # since `split * n` is just n copies of the split character.
    translate_table = {ord(c): ord(t)
                       for c, t in zip(filters, split * len(filters))}
    text = text.translate(translate_table)
    seq = text.split(split)
    return [i for i in seq if i]
def remove_punctuation(text):
    """Custom function to remove the punctuation."""
    # PUNCT_TO_REMOVE is expected to be defined elsewhere,
    # typically as string.punctuation.
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
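# Usage sketch; PUNCT_TO_REMOVE is not defined in the snippet above, so
# string.punctuation is assumed here as a stand-in.
import string

PUNCT_TO_REMOVE = string.punctuation
print(remove_punctuation("Hello, world!"))   # -> 'Hello world'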