def tokenize_text(body):
    """Strip HTML, lowercase, tokenize on word characters, and drop English stopwords."""
    body = body.lower()
    body = BeautifulSoup(body, "html.parser").get_text()
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    body = tokenizer.tokenize(body)
    stop_words = set(corpus.stopwords.words('english'))  # build the set once, not per word
    return [word for word in body if word not in stop_words]
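# A minimal check of tokenize_text (not part of the original snippet); it assumes
# the imports the function body implies.
from bs4 import BeautifulSoup
from nltk import tokenize, corpus

print(tokenize_text("<p>The Quick Brown Fox jumps!</p>"))
# -> ['quick', 'brown', 'fox', 'jumps']  ("the" is dropped as a stopword)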
def prep_sequence(seq, lookup, size):
    '''Converts a sequence to a vector by summing the embeddings of its tokens.'''
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(seq)
    sentence_vec = np.zeros((size,))
    for token in tokens:
        if token in lookup:
            sentence_vec += lookup[token]
    return torch.from_numpy(sentence_vec).float()
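# Illustrative call with a toy embedding table (values assumed, not from the
# source); tokens missing from `lookup` are simply skipped.
import numpy as np
import torch

lookup = {'good': np.array([1.0, 0.0, 0.0]), 'movie': np.array([0.0, 1.0, 0.0])}
print(prep_sequence('a good movie', lookup, 3))
# -> tensor([1., 1., 0.])  ('a' has no embedding, so it contributes nothing)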
def longest_sentence_length(data1, data2):
    max_length = 0
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    for sent_list in [data1['str1'], data1['str2'], data2['str1'], data2['str2']]:
        for sent in sent_list:
            tokens = tokenizer.tokenize(sent)
            if len(tokens) > max_length:
                max_length = len(tokens)
    return max_length
def prep_sequence(seq, lookup, max_len, gpu=False):
    '''Converts a sequence to a fixed-length vector of token indices.'''
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(seq)
    seq = [lookup[s] for s in tokens]
    while len(seq) < max_len:
        seq.append(0)  # pad with index 0 up to max_len
    if gpu:
        return torch.LongTensor(seq).cuda()
    else:
        return torch.LongTensor(seq)
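# Toy call (index values assumed; `import torch` in scope): indices are padded
# with zeros up to max_len. Note the lookup is indexed directly, so a token
# absent from `lookup` raises a KeyError.
print(prep_sequence('good movie', {'good': 1, 'movie': 2}, max_len=5))
# -> tensor([1, 2, 0, 0, 0])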
def get_tokens(text):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    stw = set(stopwords.words('spanish'))
    tokens_no_sw = [x.lower() for x in tokens if x.lower() not in stw]
    tokens_no_sw = Counter(tokens_no_sw)
    # L2-normalize the term counts so the result is a unit vector
    norma = sum([tokens_no_sw[i] * tokens_no_sw[i] for i in tokens_no_sw])
    norma = sqrt(norma)
    for i in tokens_no_sw:
        tokens_no_sw[i] = tokens_no_sw[i] / norma
    return tokens_no_sw
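# Rough usage sketch (example input assumed): because the counts are
# L2-normalized, two such Counters can be compared with a plain dot product
# to get cosine similarity.
print(get_tokens('el gato y el perro'))
# -> Counter({'gato': 0.707..., 'perro': 0.707...}); 'el' and 'y' are Spanish stopwords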
def tokenize_code(text):
    """Tokenize code strings.

    This extracts runs of word characters (letters, digits, underscores),
    discarding whitespace and punctuation.

    Args:
        text: A code string to be tokenized.

    Returns:
        A list of strings representing the tokens in the code.
    """
    return tokenize.RegexpTokenizer(r'\w+').tokenize(text)
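# Quick demonstration (not in the original; assumes `from nltk import tokenize`):
# operators and punctuation are discarded, while identifiers and digits survive.
print(tokenize_code("for i in range(10): total += i"))
# -> ['for', 'i', 'in', 'range', '10', 'total', 'i']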
def tokenize_comment(comment, voc, voc_index):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    stemmer = stem.SnowballStemmer('russian')
    stop_words = set(corpus.stopwords.words('russian'))  # build once, not per sentence
    result = []
    for sent in tokenize.sent_tokenize(comment):
        filtered = [word for word in tokenizer.tokenize(sent)
                    if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
        for word in stemmed:
            if voc.get(word) is None:
                voc[word] = voc_index
                voc_index += 1
        result += stemmed
    return voc_index, result
def read_data(files_loc, s_type='positive'):
    content_list = []
    tag_list = []
    s_tag_list = []
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    files_loc = files_loc + "*.txt"
    stemmer = SnowballStemmer("english")
    for f in glob.glob(files_loc):
        tag_list.append(f)
        s_tag_list.append(s_type)
        with open(f, 'r', encoding='utf-8') as text:
            raw = text.read()
            content = re.sub(r'\d+', '', raw)
            content = tokenizer.tokenize(content)
            content = list(map(lambda x: x.lower(), content))
            content = list(map(lambda x: stemmer.stem(x), content))
            content_list.append(content)
    return content_list, tag_list, s_tag_list
def preprocess(self, text):
    # `unicode(...)` in the original is Python 2; decode defensively on Python 3
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='replace')
    text = text.replace("\t", " ")
    text = text.replace("\r", " ")
    text = text.replace("\n", " ")
    text = text.replace("'", " ")
    text = text.strip('\t\n\r')
    # NLTK: remove punctuation and numbers (gaps=True splits on matches of the pattern)
    tokenizer = tokenize.RegexpTokenizer(r'[-.?!,":;()|0-9 ]', gaps=True)
    word_list = tokenizer.tokenize(text)
    # NLTK: remove English stop words and stem the rest
    stemmer = PorterStemmer()
    filtered_words = [
        stemmer.stem(w) for w in word_list
        if w not in stopwords.words('english')
        and w != '' and w != ' ' and w != '\n' and len(w) > 1
    ]
    return ' '.join(filtered_words)
Topic word extraction
'''
import warnings  # used to silence noisy model warnings
warnings.filterwarnings('ignore', category=UserWarning)
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc

doc = []
with open('../data2/topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])
tokenizer = tk.RegexpTokenizer(r'\w+')       # regex-based tokenization
stopwords = nc.stopwords.words('english')    # stop words to remove
stemmer = sb.SnowballStemmer('english')      # stemmer
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        if token not in stopwords:
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)
dic = gc.Dictionary(lines_tokens)
bow = []
for line_tokens in lines_tokens:
    row = dic.doc2bow(line_tokens)  # build the bag-of-words row
    bow.append(row)
import warnings warnings.filterwarnings("ignore", category=UserWarning) # 忽略用户级警告 import nltk.tokenize as nltoken import nltk.corpus as sc import nltk.stem.snowball as snowball import gensim.models.ldamodel as gm import gensim.corpora as gc doc = list() with open(os.path.dirname(__file__) + "/data/topic.txt") as f: for line in f.readlines(): doc.append(line) tokenizer = nltoken.RegexpTokenizer(r"\w+") # 见空白就拆开 stopwords = sc.stopwords.words("english") # 停止词汇 stemmer = snowball.SnowballStemmer("english") lines_tokens = list() # 被拆开的句子是一个行列式, 一行表示一句, 一列表示一个单词 for line in doc: tokens = tokenizer.tokenize(line.lower()) line_tokens = list() for token in tokens: if token not in stopwords: token = stemmer.stem(token) line_tokens.append(token) line_tokens.append(line_tokens) dic = gc.Dictionary(lines_tokens) # 构建词典, 用于构建词袋
def __init__(self):
    """Start up code that should be run once in order to set up for
    separating sentences by topic"""
    # making sure user has required resources
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    # list of stop words to remove, courtesy of https://gist.github.com/sebleier/554280
    self._stop_words = [
        "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't", "have", "haven", "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "ll", "m", "ma", "me", "mightn", "mightn't", "more", "most", "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "shan't", "she", "she's", "should", "should've", "shouldn", "shouldn't", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "won't", "wouldn", "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", "would", "able", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "afterwards", "ah", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "announce", "another", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "arent", "arise", "around", "aside", "ask", "asking", "auth", "available", "away", "awfully", "b", "back", "became", "become", "becomes", "becoming", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "believe", "beside", "besides", "beyond", "biol", "brief", "briefly", "c", "ca", "came", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "couldnt", "date", "different", "done", "downwards", "due", "e", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "former", "formerly", "forth", "found", "four", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h",
"happens", "hardly", "hed", "hence", "hereafter", "hereby", "herein", "heres", "hereupon", "hes", "hi", "hid", "hither", "home", "howbeit", "however", "hundred", "id", "ie", "im", "immediate", "immediately", "importance", "important", "inc", "indeed", "index", "information", "instead", "invention", "inward", "itd", "it'll", "j", "k", "keep", "keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "made", "mainly", "make", "makes", "many", "may", "maybe", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "moreover", "mostly", "mr", "mrs", "much", "mug", "must", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "nobody", "non", "none", "nonetheless", "noone", "normally", "nos", "noted", "nothing", "nowhere", "obtain", "obtained", "obviously", "often", "oh", "ok", "okay", "old", "omitted", "one", "ones", "onto", "ord", "others", "otherwise", "outside", "overall", "owing", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "said", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "shed", "shes", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "sufficiently", "suggest", "sup", "sure", "take", "taken", "taking", "tell", "tends", "th", "thank", "thanks", "thanx", "thats", "that've", "thence", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "thereto", "thereupon", "there've", "theyd", "theyre", "think", "thou", "though", "thoughh", "thousand", "throug", "throughout", "thru", "thus", "til", "tip", "together", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twice", "two", "u", "un", "unfortunately", "unless", "unlike", "unlikely", "unto", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "v", "value", "various", "'ve", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "wasnt", "way", "wed", "welcome", "went", "werent", "whatever", "what'll", "whats", "whence", "whenever", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "whim", "whither", "whod", "whoever", "whole", "who'll", "whomever", "whos", "whose", "widely", "willing", "wish", "within", "without", "wont", "words", "world", "wouldnt", "www", "x", "yes", "yet", "youd", "youre", "z", 
"zero", "a's", "ain't", "allow", "allows", "apart", "appear", "appreciate", "appropriate", "associated", "best", "better", "c'mon", "c's", "cant", "changes", "clearly", "concerning", "consequently", "consider", "considering", "corresponding", "course", "currently", "definitely", "described", "despite", "entirely", "exactly", "example", "going", "greetings", "hello", "help", "hopefully", "ignored", "inasmuch", "indicate", "indicated", "indicates", "inner", "insofar", "it'd", "keep", "keeps", "novel", "presumably", "reasonably", "second", "secondly", "sensible", "serious", "seriously", "sure", "t's", "third", "thorough", "thoroughly", "three", "well", "wonder" ] # set pattern to tokenize sentence to words self._word_tokenizer = tokenize.RegexpTokenizer( r"[a-zA-Z]{3,}(?:-[a-zA-Z]+)+|[a-zA-Z]{3,}(?:'t)?") # creates lemmatizer self._lemmatizer = WordNetLemmatizer() self._topicNum = 2
def tokenize_code(text): """A very basic procedure for tokenizing code strings.""" return tokenize.RegexpTokenizer(r'\w+').tokenize(text)
def set_up(self):
    self.tokenizer = tokenize.RegexpTokenizer(self.pattern)
"wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" } neg_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b') tzer = tokenize.RegexpTokenizer(r'[A-Za-z_]+') #remove url, unicodes, emojis pat1 = r'@[A-Za-z0-9]+' pat2 = r'https?://[^ ]+' pat3 = r'www\.[^ ]+' pat4 = r'\\u[^ ]+' combined_pat = r'|'.join((pat1, pat2, pat3, pat4)) re_pat = re.compile(combined_pat) tweets = x.tolist() clean_tweets = [] for t in tweets: tweet = tweet_cleaner(t) clean_tweets.append(remove_underscores(tweet)) x = df.text.apply(tweet_cleaner).apply(remove_underscores)
# parameters
batch_size = 500
epoch = 100
hidden_dim = 50
embedding_dim = 50
label_size = 5
max_num = 200
learning_rate = 10**(-3)
layer = 2
use_gpu = torch.cuda.is_available()
tokenizer = tokenize.RegexpTokenizer(r"\w+")
stop_words = set(stopwords.words('english'))

# remove stop words (not used right now)
def text_clean(sentence):
    words = tokenizer.tokenize(sentence)
    words = [w.lower() for w in words if w not in stop_words]
    return words

# load pre-trained embedding matrix
def load_glove_into_dict(glove_path):
    embeddings_ix = {}
    with open(glove_path) as glove_file:
def normalize(s):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(s.lower().strip())
    return ' '.join([stemmer.stem(w) for w in words])
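# Hedged example (assumes a module-level `stemmer`, e.g. NLTK's PorterStemmer,
# which the function references but does not define):
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(normalize('  Running, runs and RUNNER!  '))
# -> 'run run and runner' with the Porter stemmer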
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc

doc = []
with open('../data/topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])
tokenizer = tk.RegexpTokenizer(r'\w+')
stopwords = nc.stopwords.words('english')
stemmer = sb.SnowballStemmer('english')
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        if token not in stopwords:
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)
dic = gc.Dictionary(lines_tokens)
bow = []
for line_tokens in lines_tokens:
    row = dic.doc2bow(line_tokens)
    bow.append(row)
n_topics = 2
model = gm.LdaModel(bow, num_topics=n_topics, id2word=dic, passes=25)
topics = model.print_topics(num_topics=n_topics, num_words=4)
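# Possible follow-up (not in the source): print_topics returns
# (topic_id, formatted_word_weights) pairs, so the topics can be inspected with:
for topic_id, words in topics:
    print('Topic %d: %s' % (topic_id, words))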
def _tokenize(sentences):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = []
    for sentence in sentences:
        tokens += tokenizer.tokenize(sentence)
    return tokens
def tokenize(self, corpus):
    tokenizer = tokenizers.RegexpTokenizer(r'\w+')
    corpus = corpus.apply(lambda x: tokenizer.tokenize(x))
    return corpus
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if w not in stopwords.words('english')]
    return filtered_words
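# Minimal check (assumes `from nltk import tokenize` and
# `from nltk.corpus import stopwords`):
print(preprocess('This is a simple test sentence'))
# -> ['simple', 'test', 'sentence']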
def word_split(s):
    # use a non-capturing group: RegexpTokenizer relies on re.findall, and a
    # capturing group would make it return only the group matches
    t = nltk_tokenize.RegexpTokenizer(r'[\w-]+(?:\.[\w-]+)*')
    return t.tokenize(s)
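# Example (not from the source; assumes `import nltk.tokenize as nltk_tokenize`):
# dotted and hyphenated names stay whole.
print(word_split('call os.path.join with file-name'))
# -> ['call', 'os.path.join', 'with', 'file-name']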
def punctuation_remove(text):
    tzer = tokenize.RegexpTokenizer(r'[A-Za-z0-9_]+')
    tokenized = tzer.tokenize(text)
    return ' '.join(tokenized)
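# Quick check (not in the original; assumes `from nltk import tokenize`):
# apostrophes split words, since only [A-Za-z0-9_] runs are kept.
print(punctuation_remove("Hello, world! It's 2020."))
# -> 'Hello world It s 2020'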
def tweet_cleaner(text):
    contractions_dict = {
        "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am", "I've": "I have",
        "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
        "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is", "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
        "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"
    }
    neg_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')
    tzer = tokenize.RegexpTokenizer(r'[A-Za-z_]+')
    # remove url, unicodes, emojis
    pat1 = r'@[A-Za-z0-9]+'
    pat2 = r'https?://[^ ]+'
    pat3 = r'www\.[^ ]+'
    pat4 = r'\\u[^ ]+'
    combined_pat = r'|'.join((pat1, pat2, pat3, pat4))
    re_pat = re.compile(combined_pat)
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text().lower()  # lowercase the whole thing here
    bomgone = souped.replace('�', ' ')
    re_cleaned = re_pat.sub(' ', bomgone)
    neg_handled = neg_pattern.sub(lambda x: contractions_dict[x.group()], re_cleaned)
    tokenized = tzer.tokenize(neg_handled)
    return " ".join(tokenized)
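# Illustrative call (assumes `re`, `BeautifulSoup` (with lxml installed) and
# `from nltk import tokenize` are in scope, as the function body implies):
print(tweet_cleaner("@user I can't wait!! https://t.co/xyz"))
# -> 'i cannot wait' (mention and URL stripped, contraction expanded)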
import pickle
import logging
import os
from os.path import dirname
from gensim.models import Word2Vec
from nltk import tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Tokenizer that keeps only word-character runs (drops punctuation).
alpha_regex_tokenizer = tokenize.RegexpTokenizer(r'\w+')
# Vader based sentiment analyzer
sid = SentimentIntensityAnalyzer()
# All reviews seen until now.
overall_reviews = []

def predict(review):
    logging.debug("Predicting review..")
    sentences = clean(review)
    overall_scores = []
    for sentence in sentences:
        logging.debug(
            "Predicting sentiment for {sentence}".format(sentence=sentence))
        scores = sid.polarity_scores(sentence)
        overall_scores.append(scores)
    return overall_scores

def clean(review):
import re
from nltk import tokenize

TOKENIZER = tokenize.RegexpTokenizer(
    r'[\w\'\-]+|(?:[\.,\/#!$\"\?%\^&\*;:{}=\-_`~()\[\]])')
URL_REGEX = re.compile(r'(https?:\/\/[^ )]+)', re.MULTILINE)

def _clean(text):
    # match source code (lines starting with 4 empty spaces)
    text_cleaned = re.sub(r'^\ {4,}.*', '', text, flags=re.MULTILINE)
    text_cleaned = text_cleaned.replace('\n', '')
    text_cleaned = text_cleaned.replace('\r', '')
    return text_cleaned

def _tokens(text):
    tokens = TOKENIZER.tokenize(text)
    filtered = [token for token in tokens if token.isalpha()]
    return filtered

class ReadabilityAnalyzer(object):
    def __init__(self, question):
        self.raw = URL_REGEX.sub('URL', question)
        self.cleaned = _clean(self.raw)

    @property
    def tokens(self):
        return _tokens(self.cleaned)

    @property
    def sents(self):
        return tokenize.sent_tokenize(self.cleaned)
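# Sketch of usage (not from the source):
ra = ReadabilityAnalyzer('Why does https://example.com return 404? My regex fails.')
print(ra.tokens)  # alphabetic tokens only, with the URL collapsed to 'URL'
print(ra.sents)   # sentences from nltk's sent_tokenize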
# uncompyle6 version 3.2.4
# Python bytecode 3.5 (3350)
# Decompiled from: Python 3.5.2 (default, Nov 12 2018, 13:43:14)
# [GCC 5.4.0 20160609]
# Embedded file name: /home/riccardo/Lev_dist.py
# Compiled at: 2018-12-19 15:46:22
# Size of source mod 2**32: 1675 bytes
import stringdist
from nltk import tokenize, pos_tag
from nltk.corpus import stopwords
import time

Personaltokenizer = tokenize.RegexpTokenizer('\\w+')
possible_results = []

def evaluate_query(query, R, pos, vocab):
    start = time.clock()  # note: time.clock() was removed in Python 3.8; time.perf_counter() is the modern equivalent
    if len(query) <= 85 and len(query) > 0:
        query = query.lower()
        query = Personaltokenizer.tokenize(query)
        print('Query as inserted lower case: ', query)
        filtered_query = [
            word for word in query
            if word not in stopwords.words('english')
        ]
        print('Filtered query : ', filtered_query)
        filtered_tagged_query = pos_tag(filtered_query)
        print('Tagged query : ', filtered_tagged_query)
def init_nltk():
    global tokenizer
    global tagger
    tokenizer = tokenize.RegexpTokenizer(r'\w+|[^\w\s]+')
    tagger = UnigramTagger(brown.tagged_sents())
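# Possible usage (assumes `from nltk import tokenize`,
# `from nltk.tag import UnigramTagger` and `from nltk.corpus import brown`;
# training on the full Brown corpus takes a moment):
init_nltk()
print(tagger.tag(tokenizer.tokenize('The cat sat.')))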
Topic word extraction: an application of Latent Dirichlet Allocation (LDA)
'''
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm  # Latent Dirichlet Allocation model
import gensim.corpora as gc

doc = []
with open('topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])
tokenizer = tk.RegexpTokenizer(r'\w+')      # regex matching the token pattern
stopwords = nc.stopwords.words('english')   # stop words: words with no real semantic contribution
stemmer = sb.SnowballStemmer('english')
lines_tokens = []
for line in doc:
    tokens = tokenizer.tokenize(line.lower())
    line_tokens = []
    for token in tokens:
        if token not in stopwords:
            token = stemmer.stem(token)
            line_tokens.append(token)
    lines_tokens.append(line_tokens)
# Dictionary: encapsulates the mapping between normalized words and their integer ids.
dic = gc.Dictionary(lines_tokens)
# e.g. Dictionary(121 unique tokens: ['cryptographi', 'lot', 'spent', 'studi', 'time']...)
database = "tmdb") movie_dataframe2 = sqlio.read_sql_query(sql, dbConnection) except (Exception , psycopg2.Error) as dbError : print ("Error:", dbError) finally: if(dbConnection): dbConnection.close() import nltk.sentiment.vader from nltk.corpus import stopwords import nltk.tokenize as nt from nltk.stem import PorterStemmer # Tokenization word_tokens = [] tokenizer = nt.RegexpTokenizer(r'\w+') for i in range(0, len(movie_dataframe)): word_tokens.append(tokenizer.tokenize(movie_dataframe['content'][i])) #Stop word removal stop_words = set(stopwords.words('english')) movie_dataframe['word_list'] = movie_dataframe['content'].apply(lambda x: [item for item in x.split() if item not in stop_words]) movie_dataframe['sentiment_scores'] = "" movie_dataframe['total_score'] = ""