def clean_text(t):
    """Return ``t`` lower-cased with punctuation and English stop words removed.

    :param t: raw text to normalise
    :return: the surviving word tokens joined by single spaces
    """
    # Load the stop-word list once; calling stopwords.words() inside the
    # filter re-reads it for every single token.
    english_stops = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    kept = [token for token in tokenizer.tokenize(t.lower())
            if token not in english_stops]
    return " ".join(kept)
def startToken(input):
    """Read a text file and return its lower-cased word tokens.

    :param input: path of the file to read (name kept for caller compatibility)
    :return: list of tokens matched by the word pattern, in file order
    """
    tokenizer = RegexpTokenizer(r'(\w+)')
    # The context manager closes the handle even if reading fails.
    with open(input, 'r') as handle:
        text = handle.read()
    return tokenizer.tokenize(text.lower())
def gen_vocab(vocab_fname, path):
    """Build a vocabulary list from a comma-separated concept file.

    Every line of the UTF-8 file ``path + vocab_fname`` is lower-cased and
    split on commas; empty entries are dropped.  Each word token found inside
    a concept phrase is added to the vocabulary as well.

    :param vocab_fname: file name of the concept list
    :param path: directory prefix, concatenated verbatim with ``vocab_fname``
    :return: list of unique concepts and unigrams (order is not preserved)
    """
    print("\ngen_vocab:{}".format(vocab_fname))

    vocab = set()
    # Close the handle deterministically; it used to be left open.
    with codecs.open(path + vocab_fname, 'r', "utf-8") as f:
        for line in f.readlines():
            vocab.update(line.lower().strip("\n").split(','))
    # Blank lines and trailing commas produce empty concepts.
    vocab.discard('')

    # u'\\w+' is the same pattern as the former ur'\w+' literal, but it also
    # parses on Python 3.
    tokenizer = RegexpTokenizer(u'\\w+')
    unigrams = set()
    for phrase in vocab:
        unigrams.update(tokenizer.tokenize(phrase))
    vocab.update(unigrams)
    return list(vocab)
def gen_counts(path_corpus, list_corpus):
    """Count the word tokens of every corpus file.

    :param path_corpus: directory prefix, concatenated verbatim with each name
    :param list_corpus: file names to read (UTF-8, undecodable bytes ignored)
    :return: numpy array whose i-th entry is the token count of
        ``list_corpus[i]``
    """
    counts_corpus = np.zeros(len(list_corpus))
    # Same pattern as the former ur'\w+' literal, valid on Python 2 and 3.
    tokenizer = RegexpTokenizer(u'\\w+')
    every = 500  # print progress once per this many files

    # enumerate() gives each file its own slot.  list.index() was O(n) per
    # file and wrote every duplicate name into the first occurrence's slot.
    for position, fname in enumerate(list_corpus):
        with codecs.open(path_corpus + fname, 'r', "utf-8",
                         errors="ignore") as fp:
            txt = fp.read().lower()
        counts_corpus[position] = len(tokenizer.tokenize(txt))
        if position % every == 0:
            print(position)
    return counts_corpus
def filter_sentence(sentence):
    """Tokenise ``sentence``, drop stop words and stem what remains.

    Uses the module-level ``stop_words`` collection and ``snowball_stemmer``.

    :param sentence: text to process
    :return: list of stemmed tokens in their original order
    """
    word_pattern = RegexpTokenizer(r'\w+')
    stems = []
    for token in word_pattern.tokenize(sentence):
        if token in stop_words:
            continue
        stems.append(snowball_stemmer.stem(token))
    return stems
def analyze_dataset():
    """Plot the sentence-length distribution of the Pang & Lee review data.

    Reads the first tab-separated column of the negative and the positive
    review file, removes ASCII punctuation, counts the word tokens of each
    sentence, prints the number of sentences and hands the sorted
    ``(length, frequency)`` pairs to ``plot``.
    """
    source_paths = (
        '/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-negative.txt',
        '/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-positive.txt',
    )
    l_sentences = []
    for source_path in source_paths:
        with open(source_path) as handle:
            for row in reader(handle, dialect='excel-tab'):
                l_sentences.append(row[0])

    # Build the punctuation set once instead of once per character.
    punctuation_chars = set(string.punctuation)
    word_tokenizer = RegexpTokenizer(r'\w+')
    sentence_lengths = []
    for sent in l_sentences:
        stripped = ''.join(ch for ch in sent if ch not in punctuation_chars)
        sentence_lengths.append(len(word_tokenizer.tokenize(stripped)))

    # print(x) and dict.items() behave the same on Python 2 and 3 here.
    print(len(sentence_lengths))
    d_lengths = Counter(sentence_lengths)
    lengths = sorted(d_lengths.items(),
                     key=lambda key_value: int(key_value[0]))
    plot(lengths)
def frequencyAnalyse(polarised_tweets: Dict):
    """Count adjectives that occur in clearly positive or negative tweets.

    Each value of ``polarised_tweets`` is indexed as ``[0]`` for the tweet text
    and ``[1]`` for its polarity score.  An adjective (POS tag ``JJ``) of at
    least three characters that is not an English stop word is registered as
    positive when the polarity is above 0.2 and as negative when it is below
    -0.2.  Once registered, every later occurrence of the word in any tweet
    increments its counter.

    The negative words are printed in descending order of frequency.

    :param polarised_tweets: mapping whose values are ``(text, polarity, ...)``
    :return: tuple ``(positive_words, negative_words)`` of word -> count dicts
    """
    positive_words = {}
    negative_words = {}
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))

    for key in polarised_tweets:
        tweet = polarised_tweets[key]
        polarity = tweet[1]
        for token, tag in nltk.pos_tag(tokenizer.tokenize(tweet[0])):
            # Counters are keyed by the lower-cased word, so look them up the
            # same way.  Testing the raw token meant a capitalised occurrence
            # missed the existing entry and reset its count to 1.
            word = token.lower()
            if word in positive_words:
                positive_words[word] += 1
                continue
            if word in negative_words:
                negative_words[word] += 1
                continue
            if len(word) < 3 or word in stop_words:
                continue
            if tag == 'JJ':
                if polarity > 0.2:  # positive tweet
                    positive_words[word] = 1
                elif polarity < -0.2:  # negative tweet
                    negative_words[word] = 1

    for w in sorted(negative_words, key=negative_words.get, reverse=True):
        print(w, negative_words[w])
    return (positive_words, negative_words)
def tokenize(self, attr):
    """Tokenise one attribute of this plugin.

    ``title`` and ``description`` are split into (optionally hyphenated) words
    and ``cve`` into CVE identifiers whose spaces are replaced by hyphens.
    Tokens are lower-cased; tokens equal to an English stop word are dropped.

    :param attr: attribute name, one of ``title``, ``description``, ``cve``,
        ``cwe``, ``refs`` or ``dsk``
    :return: list of tokens; an empty list for accepted attributes that have
        no token pattern; an explanatory string for unknown attribute names
    """
    accepted = {
        'title': self.title,
        'description': self.description,
        'cve': self.cve,
        'cwe': self.cwe,
        'refs': self.refs,
        'dsk': self.dsk,
    }
    matcher = {
        'title': r'\w+[-\w+]*',
        'description': r'\w+[-\w+]*',
        'cve': r'CVE[\s|-]\d+[\s|-]\d+',
    }
    if attr not in accepted:
        return 'It is not possible to tokenize this plugin attribute.'
    if attr not in matcher:
        # cwe / refs / dsk are accepted but have no pattern; looking one up
        # raised KeyError, so return the empty default result instead.
        return []

    tokenizer = RegexpTokenizer(matcher[attr])
    stop = set(stopwords.words('english'))
    # NOTE(review): the stop-word test runs before lower-casing, so it is
    # case-sensitive ("The" is kept) - confirm this is intended.
    if attr == 'cve':
        found = tokenizer.tokenize(','.join(accepted[attr]))
        return [item.lower().replace(' ', '-')
                for item in found if item not in stop]
    found = tokenizer.tokenize(accepted[attr])
    return [item.lower() for item in found if item not in stop]
def tokenize(self, string):
    """Normalise spacing and phone numbers in ``string``, then tokenise it.

    :param string: text to split
    :return: list of tokens produced by the regular-expression tokenizer
    """
    # Remove unnecessary spaces.
    collapsed = re.compile(r' +').sub(' ', string)

    # Harmonise phone numbers: five two-digit groups, dot-separated.
    phone_number = re.compile(
        r'(?P<sep1>0[0-9])( |/+|\-|\\+)(?P<sep2>[0-9]{2})( |/+|\.|\-|\\+)(?P<sep3>[0-9]{2})( |/+|\.|\-|\\+)(?P<sep4>[0-9]{2})( |/+|\.|\-|\\+)(?P<sep5>[0-9]{2})'
    )
    normalised = phone_number.sub(
        r'\g<sep1>.\g<sep2>.\g<sep3>.\g<sep4>.\g<sep5>', collapsed)

    # Tokenisation.
    # The tokenizer automatically drops these characters when isolated: `^ ° ¤ ¨
    # Recognised as a token:
    # - e-mail address
    # - web site, domain name, user name, etc.
    # - reduced phone number
    # - compound noun
    # - ordinary word
    # - punctuation
    splitter = RegexpTokenizer(
        r'''([Aa]ujourd'hui|\w+'|[a-zA-ZÀ-Ÿà-ÿ0-9_\.\-]+@[a-zA-ZÀ-Ÿà-ÿ0-9\-\.]+\.[a-zA-ZÀ-Ÿà-ÿ0-9]+|[a-zA-ZÀ-Ÿà-ÿ0-9:@%/;$~_?\+\-=\\\.&\|£€]+[a-zA-ZÀ-Ÿà-ÿ0-9#@%/$~_?\+\-=\\&\|£€]+|[\wÀ-Ÿà-ÿ]+[/\-][\wÀ-Ÿà-ÿ]+|[\wÀ-Ÿà-ÿ0-9]+|\.\.\.|[\(\)\[\]\{\}\"\'\.,;\:\?!\-\_\*\#\§=+<>/\\])'''
    )
    return splitter.tokenize(normalised)
def summarize(text):
    """Return an extractive summary made of the highest-scoring sentences.

    Word frequencies (English stop words excluded) are normalised by the most
    frequent word.  Each sentence shorter than 30 space-separated words is
    scored with the summed frequency of its words; up to 7 top sentences are
    joined with spaces.

    :param text: document to summarise
    :return: the summary, or an empty string when ``text`` contains no
        countable words
    """
    import heapq

    tokenizer = RegexpTokenizer(r'\w+')
    stop_set = set(nltk.corpus.stopwords.words('english'))

    # Count lower-cased words: sentences are scored with lower-cased tokens,
    # so a case-sensitive table would never match capitalised words.
    word_frequencies = {}
    for word in tokenizer.tokenize(text.lower()):
        if word not in stop_set:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1
    if not word_frequencies:
        # max() of an empty sequence raises ValueError.
        return ''

    max_freq = float(max(word_frequencies.values()))
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_freq

    sentence_scores = {}
    for sent in nltk.sent_tokenize(text):
        # The length limit depends on the sentence only; test it once.
        if len(sent.split(' ')) >= 30:
            continue
        for word in tokenizer.tokenize(sent.lower()):
            if word in word_frequencies:
                sentence_scores[sent] = (sentence_scores.get(sent, 0)
                                         + word_frequencies[word])

    summary_sentences = heapq.nlargest(7, sentence_scores,
                                       key=sentence_scores.get)
    return ' '.join(summary_sentences)
def get_documents_text(act_id, **kwargs):
    """
    Returns the concatenated, tag-stripped text of all documents related to
    act_id, with Italian stop words removed.

    :param act_id: value matched against ``opp_documento.atto_id``
    :param kwargs: must contain ``db``, an open MySQLdb connection
    :return: the remaining words joined by single spaces
    """
    db_conn = kwargs['db']
    italian_stops = set(stopwords.words('italian'))

    sql = """ select d.testo from opp_documento as d where d.atto_id=%s """
    cursor = db_conn.cursor(MySQLdb.cursors.DictCursor)
    try:
        # Parameters go in a sequence; a bare scalar is not accepted by every
        # driver version.
        cursor.execute(sql, (act_id,))
        rows = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    # Strip HTML tags from the texts, if present, and concatenate them.
    # NOTE: ``unicode`` makes this function Python 2 only.
    testo = u''.join(unicode(strip_tags(row['testo'])) for row in rows)

    # Remove stop words (raw string: same pattern, no invalid escape).
    tokenizer = RegexpTokenizer(r"[\w]+")
    words = tokenizer.tokenize(testo)
    return " ".join(word for word in words
                    if word.lower() not in italian_stops)
def add_to_index(self, document, doc_id):
    """Add the lemma frequencies of one document to ``self.global_index``.

    The text in ``document['data']`` is split into lower-cased word tokens,
    lemmatised with the ``Russian`` pipeline (``ner`` and ``parser`` disabled)
    and, for every lemma, ``(doc_id, count)`` is appended to the lemma's
    posting list.  Documents with more than 10e5 tokens are skipped.

    :param document: mapping with the raw text under the key ``'data'``
    :param doc_id: identifier stored in the posting lists
    :return: None
    """
    raw_text = document['data']
    nlp = Russian()
    word_tokenizer = RegexpTokenizer(r'\w+')
    tokens = [piece.lower() for piece in word_tokenizer.tokenize(raw_text)]
    tmp_text = ' '.join(tokens)
    if len(tokens) > 10e5:
        return

    self.doc_iter += 1
    nlp.max_length = 10e7
    parsed = nlp(tmp_text, disable=['ner', 'parser'])
    lemmas = [item.lemma_ for item in parsed]

    for lemma, count in FreqDist(lemmas).most_common():
        if lemma not in self.global_index:
            self.global_index[lemma] = []
        self.global_index[lemma].append((doc_id, count))
def nltk_lemmatize_preprocessing(X):
    """Lower-case, de-URL, stop-word-filter and lemmatise every text in ``X``.

    Each text is split into sentences; every sentence becomes its remaining
    words joined by spaces and followed by ``'. '``.

    :param X: iterable of strings
    :return: list with one processed string per input text
    """
    english_stops = set(stopwords.words('english'))
    word_tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    processed = []
    for text in X:
        # Drop http(s)/ftp URLs after lower-casing.
        cleaned = re.sub(r"(?i)(?:https?|ftp)://[\n\S]+", "", text.lower())
        pieces = []
        for sent in sent_tokenize(cleaned):
            content_words = [w for w in word_tokenizer.tokenize(sent)
                             if w not in english_stops]
            lemmas = []
            for word, treebank_tag in pos_tag(content_words):
                wordnet_tag = get_wordnet_pos(treebank_tag)
                # Words without a WordNet POS are kept unchanged.
                if wordnet_tag != '':
                    word = lemmatizer.lemmatize(word, wordnet_tag)
                lemmas.append(word)
            pieces.append(' '.join(lemmas) + '. ')
        processed.append(''.join(pieces))
    return processed
def processText(self,Estr):
    """Turn raw (possibly HTML) English text into a list of content words.

    :param Estr: input text
    :return: lower-case alphabetic tokens longer than two characters that are
        not English stop words
    """
    # 1. Remove HTML tags.
    without_tags = re.sub(r'<[^>]*>', ' ', Estr)

    # 2. Remove punctuation and every other non-letter character.
    letters_only = RegexpTokenizer(r'[a-z]+')
    words = letters_only.tokenize(str(without_tags).lower())

    # 3. Remove stop words, using NLTK's English stop-word list.
    en_stop = stopwords.words('english')

    # 4. Filter by length (applied together with step 3).
    return [word for word in words if word not in en_stop and len(word) > 2]
def test():
    """Run the trained network on the Gettysburg Address and print 5 topics.

    Uses the module-level ``N``, ``words`` and ``network``.  The address is
    tokenised, stop words are removed, the relative frequency of every
    vocabulary word is fed to the network and the five most activated
    vocabulary words that occur in the address are printed.
    """
    print('In testing.')
    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. 
It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""

    tokenizer = RegexpTokenizer(r'\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg)

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)
    dist = FreqDist(samples)

    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)
    pred = network.forward(V).w

    # Rank the output indices by activation without touching ``pred``.
    # Deleting entries from ``pred`` shifted the remaining indices, so later
    # ``words[idx]`` lookups named the wrong word, and the loop failed once
    # ``pred`` ran out.  The sort is stable, so ties keep index order.
    ranked = sorted(range(len(pred)), key=lambda idx: pred[idx], reverse=True)
    topics = []
    for idx in ranked:
        topic = words[idx]
        if topic in gettysburg_tokens:
            topics.append(topic)
            if len(topics) == 5:
                break

    print('Topics of the Gettysburg Address:')
    print(topics)
def Identify_badword_inclusion(self, post_new1):
    """Record the sentences of a post that contain a listed bad word.

    Each word token is stemmed with the Porter stemmer and compared with the
    comma-separated entries of ``BadWords.txt``.  For every match the
    containing sentence is stored at ``self.badword_sen[self.iter]`` and
    ``self.iter`` is incremented.

    :param post_new1: text of the post
    :return: ``self.badword_sen``
    """
    sentences = sent_tokenize(post_new1)
    # Read and split the dictionary once; the handle is closed on exit and
    # the split is no longer repeated for every token.
    with open("../../../resources/text_resources/BadWords.txt", "r") as handle:
        bad_words = handle.read().split(',')

    tokenizer = RegexpTokenizer(r"[\w']+")
    stemmer = PorterStemmer()
    for sent in sentences:
        for token in tokenizer.tokenize(sent):
            stem = stemmer.stem(token)
            # One record per matching dictionary entry.
            for bad in bad_words:
                if bad == stem:
                    self.badword_sen[self.iter] = sent
                    self.iter += 1
    return self.badword_sen
def get_recall_unordered(recall_list, query_list):
    """
    Determine the fraction of queries in the recall list that were captured by
    the query list without concern for word order, capitalization, or
    punctuation. Differences in apostrophes (single quotes) will still be
    considered.

    :param recall_list: list containing all queries from a recall data set
    :param query_list: list containing all automatically generated queries
    :return: tuple (fraction of recall queries captured, number captured);
        (0, 0) when the recall list is empty
    """
    if len(recall_list) == 0:
        return 0, 0

    tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

    def signature(query):
        # sorted() returns a new list; its result used to be thrown away, so
        # word order was never actually ignored.
        return tuple(sorted(tokenizer.tokenize(query.lower())))

    generated = {signature(q) for q in query_list}
    num = sum(1 for q in recall_list if signature(q) in generated)
    return float(num) / float(len(recall_list)), num
def clean_text(text, stop_words):
    '''Normalise text for modelling.

    Lower-cases the text, keeps alphabetic words (optionally with one
    apostrophe part), expands contractions, removes stop words before and
    after lemmatisation and drops words shorter than three letters.

    :param text: raw text
    :param stop_words: collection of words to remove
    :return: the cleaned words joined by single spaces
    '''
    # Lower-case and unify typographic apostrophes.
    lowered = text.lower().replace("’", "'")

    # First pass: keep only word-like tokens.
    word_tokenizer = RegexpTokenizer("([a-z]+(?:'[a-z]+)?)")
    raw_words = word_tokenizer.tokenize(lowered)

    # Replace contractions by their full form.
    expansions = load_dict_contractions()
    expanded = ' '.join(expansions[w] if w in expansions else w
                        for w in raw_words)

    lemmatizer = WordNetLemmatizer()
    kept = []
    for word in nltk.word_tokenize(expanded):
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        # A lemma may itself be a stop word, or too short to keep.
        if lemma in stop_words or len(lemma) <= 2:
            continue
        kept.append(lemma)
    return ' '.join(kept)
def create_bag_of_words(document_list):
    """
    Build a bag-of-words representation of the given documents: each document
    is lower-cased, split into word tokens (which removes punctuation) and
    stripped of English stop words.

    :type document_list: list[str]
    :param document_list: documents to process
    :rtype: list[list[str]]
    :return: one token list per document, in input order
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    cached_stop_words = set(stopwords.words("english"))

    lowered_docs = [document.lower() for document in document_list]
    return [
        [word for word in word_tokenizer.tokenize(entry)
         if word not in cached_stop_words]
        for entry in lowered_docs
    ]
def __init__(self):
    """Create the sentence tokenizer, the English helper and the MeCab tagger.

    :raises McLanguageException: if MeCab cannot be initialised or if the
        self-test that follows initialisation fails
    """
    super().__init__()

    self.__japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )
    self.__english_language = EnglishLanguage()

    dictionary_dir = JapaneseLanguage._mecab_ipadic_neologd_path()
    try:
        dicdir_arg = f'--dicdir={dictionary_dir}'
        node_format_arg = f'--node-format=%m{self.__MECAB_TOKEN_POS_SEPARATOR}%h{self.__EOL_SEPARATOR}'
        eos_format_arg = f'--eos-format={self.__MECAB_EOS_MARK}{self.__EOL_SEPARATOR}'
        command_line = ' '.join(
            [dicdir_arg, '--rcfile=/dev/null', node_format_arg, eos_format_arg])
        self.__mecab = MeCab.Tagger(command_line)
    except Exception as ex:
        raise McLanguageException("Unable to initialize MeCab: %s" % str(ex))

    # Quick self-test: MeCab, its dictionaries and the Python class must be
    # installed and working.
    self_test_failure = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
    try:
        probe_words = self.split_sentence_to_words('pythonが大好きです')
    except Exception:
        raise McLanguageException(self_test_failure)
    if len(probe_words) < 2 or probe_words[1] != '大好き':
        raise McLanguageException(self_test_failure)
def lcs_norm_word(answer_text, source_text):
    '''Computes the longest common subsequence of words in two texts; returns
    a normalized value.

    :param answer_text: The pre-processed text for an answer text
    :param source_text: The pre-processed text for an answer's associated
        source text
    :return: LCS length divided by the number of words in the answer; 0.0 when
        the answer contains no words'''
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    answer_tokens = tokenizer.tokenize(answer_text)
    source_tokens = tokenizer.tokenize(source_text)

    if not answer_tokens:
        # Nothing to normalise by; dividing by zero used to yield NaN.
        return 0.0

    # Row/column 0 stand for the empty prefix, hence the "+ 1".
    lcs_matrix = np.zeros((len(source_tokens) + 1, len(answer_tokens) + 1))
    for i, source_word in enumerate(source_tokens, start=1):
        for j, answer_word in enumerate(answer_tokens, start=1):
            if answer_word == source_word:
                lcs_matrix[i][j] = lcs_matrix[i - 1][j - 1] + 1
            else:
                lcs_matrix[i][j] = max(lcs_matrix[i - 1][j],
                                       lcs_matrix[i][j - 1])
    return lcs_matrix[-1][-1] / len(answer_tokens)
def tensor_vec_pipline(data, word_index, max_len):
    """Convert the ``sentence`` column of ``data`` into a matrix of word ids.

    :param data: object providing ``len()`` and ``iterrows()`` whose rows have
        a ``'sentence'`` entry (presumably a pandas DataFrame - confirm)
    :param word_index: mapping from word to integer id
    :param max_len: number of columns of the result
    :return: int32 numpy array of shape ``(len(data), max_len)``; cells that
        are never written stay 0
    """
    #Create data maxtrix to be fed to the keras model
    print("Creating data to feed to tensorflow")
    df_len = len(data)
    indexing_matrix = np.zeros((df_len, max_len), dtype = 'int32')
    # r_inc is the output row, advanced once per input row.
    r_inc = 0
    tokenizer = RegexpTokenizer(r'\w+')
    for index, row in data.iterrows():
        sentence = row['sentence']
        sen_tokenize = tokenizer.tokenize(sentence)
        # c_inc is the next free column; it only advances after a successful write.
        c_inc = 0
        for word in sen_tokenize:
            try:
                indexing_matrix[r_inc][c_inc] = word_index[word]
            except Exception as e:
                # Any failure (unknown word, or column index past max_len)
                # skips the word without advancing the column.
                # NOTE(review): the original indentation was lost; ``continue``
                # is assumed to belong to the except block - confirm.
                #print(e, word)
                # NOTE(review): str(KeyError(word)) includes quotes, so this
                # comparison looks like it never matches for a plain dict.
                if (str(e) == word):
                    indexing_matrix[r_inc][c_inc] = 0
                continue
            c_inc = c_inc + 1
        r_inc = r_inc + 1
    print("Run complete")
    return indexing_matrix
def get_word_tokens(self) -> List[int]:
    """Return one value per source file, derived from its word tokens.

    Every item of ``self.get_sourcecode`` is tokenised with
    ``self.WORD_PATTERN`` and the token list is passed to ``self.__len__``.

    :return: list of the values returned by ``self.__len__``, in file order
    """
    return [
        self.__len__(RegexpTokenizer(self.WORD_PATTERN).tokenize(source))
        for source in self.get_sourcecode
    ]
def read_all_txt_orig(directory):
    """Collect the normalised captions of every ``.txt`` file in a directory.

    Each non-trivial line is lower-cased, reduced to its alphanumeric tokens,
    stripped of non-ASCII characters and re-joined with single spaces.

    :param directory: folder whose ``.txt`` files are read
    :return: list of cleaned captions, each terminated by a newline
    """
    all_s = []
    # Picks out sequences of alphanumeric characters as tokens and drops
    # everything else.  Built once instead of once per caption.
    tokenizer = RegexpTokenizer(r'\w+')

    for file in os.listdir(directory):
        if not file.endswith(".txt"):
            continue
        with open(os.path.join(directory, file)) as f:
            captions = f.read().split('\n')

        for cap in captions:
            if len(cap) <= 1:
                continue
            cap = cap.replace("\ufffd\ufffd", " ")
            tokens = tokenizer.tokenize(cap.lower())
            if not tokens:
                # Report captions that contain no usable token.
                print('cap', cap)
                continue

            tokens_new = []
            for t in tokens:
                if t == 'thisbirdhasadarkgreybelly':
                    # Diagnostic marker, kept so stdout is unchanged.
                    print(123)
                t = t.encode('ascii', 'ignore').decode('ascii')
                if t:
                    tokens_new.append(t)
            all_s.append(" ".join(tokens_new) + "\n")
    return all_s
def regex_tokenizer(self, sent, whole_sent=False):
    """Tokenise a sentence and re-attach periods to abbreviations.

    :param sent: sentence to tokenise
    :param whole_sent: when true, return every token; otherwise drop tokens
        found in ``self.stop_words`` and tokens of two characters or fewer
    :return: list of tokens
    """
    # Raw string: same pattern, but without invalid escape sequences in the
    # Python literal.
    splitter = RT(r'\w+|\[M:.*?\]|[\(\)\.\,;\?\!]|\S+')
    tokens = splitter.tokenize(sent)

    # Combine abbreviations with their period; the separation is a result of
    # tokenizing the sentence.  ``last`` shrinks with every merge.
    i = 0
    last = len(tokens) - 1
    while i < last:
        if re.match(self.abbrev_pattern, tokens[i]) and tokens[i + 1] == '.':
            tokens[i:i + 2] = [tokens[i] + tokens[i + 1]]
            last -= 1
        i += 1

    if whole_sent:
        # Return the entire tokenised sentence.
        return list(tokens)
    # Return the tokenised sentence minus stop words and short words.
    return [t for t in tokens if t not in self.stop_words and len(t) > 2]
def __init__(self, text):
    """Store the text and build the keyword graph.

    :param text: document from which keywords are extracted
    """
    # print(x) with one argument prints the same on Python 2 and 3.
    print('Extracting keywords...')
    self.text = text
    # Missing entries default to 0.
    self.graph = defaultdict(int)
    # Raw string: same pattern, no invalid escape in the literal.
    self.tokenizer = RegexpTokenizer(r'\w+')
    self.make_graph()
def __init__(self, bioasq_json, context_token_limit=-1, types=None,
             include_synonyms=False, include_answer_spans=True):
    """
    Creates the BioAsqSquadBuilder.

    :param bioasq_json: The BioASQ JSON object.
    :param context_token_limit: If larger than 0, contexts will only be added
        as long as the token limit is not exceeded.
    :param types: Question types to include; defaults to
        ``["factoid", "list"]`` when None.
    :param include_synonyms: If True, the answers object is a list of lists
        (which is NOT the SQuAD format) with the outer list containing the
        answers (i.e., correct answers of the list question) and inner list
        containing the synonyms. If False, the answers object is a flat list
        and only one synonym is included.
    :param include_answer_spans: Whether to include exact answers. If True,
        questions that are not extractive are skipped.
    """
    self._bioasq_json = bioasq_json
    self._types = ["factoid", "list"] if types is None else types

    # Tokens are either word-character runs or single non-space symbols.
    self._tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')

    self._context_token_limit = context_token_limit
    self._include_synonyms = include_synonyms
    self._include_answer_spans = include_answer_spans

    self._paragraphs = None
    self._stats = {"contexts_truncated": 0, "max_context_length": 0}
def analyze_dataset():
    """Plot how often each sentence length occurs in the Pang & Lee data.

    The first tab-separated field of every row in the negative and positive
    review files is treated as a sentence.  ASCII punctuation is removed, the
    word tokens are counted, the number of sentences is printed and the
    ``(length, frequency)`` pairs, sorted by length, are passed to ``plot``.
    """
    l_sentences = []
    with open('/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-negative.txt') as file1:
        l_sentences.extend(row[0] for row in reader(file1, dialect='excel-tab'))
    with open('/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-positive.txt') as file2:
        l_sentences.extend(row[0] for row in reader(file2, dialect='excel-tab'))

    # One set for all sentences; it used to be rebuilt for every character.
    punctuation_chars = set(string.punctuation)
    tokenizer2 = RegexpTokenizer(r'\w+')

    def word_count(sentence):
        cleaned = ''.join(ch for ch in sentence if ch not in punctuation_chars)
        return len(tokenizer2.tokenize(cleaned))

    token_counts = [word_count(sent) for sent in l_sentences]
    total_sent = len(token_counts)
    d_lengths = Counter(token_counts)

    # print(x) and dict.items() work identically on Python 2 and 3 here.
    print(total_sent)
    lengths = sorted(d_lengths.items(),
                     key=lambda key_value: int(key_value[0]))
    plot(lengths)
def clean_test_data(test_data, test_labels):
    """
    Read the test files and prepare them for prediction.

    The first line of the data file is skipped.  Every other line is split
    into word tokens; purely numeric tokens, the token "ml" and English stop
    words are dropped and the rest is lower-cased and joined with spaces.

    :param test_data: test data file
    :param test_labels: test labels file
    :return: tuple (list of cleaned lines, list of stripped labels)
    """
    with open(test_data, encoding="utf8", errors="ignore") as data:
        stop_words = set(stopwords.words('english'))
        tokenizer = RegexpTokenizer(r'\w+')
        # NOTE(review): the "ml" and stop-word tests see the original casing.
        xtest = [
            " ".join(
                word.lower()
                for word in tokenizer.tokenize(line)
                if not word.isdigit() and word != "ml" and word not in stop_words
            )
            for line in data.readlines()[1:]
        ]

    with open(test_labels, encoding="utf8", errors="ignore") as labels:
        ytest = [label.strip() for label in labels.readlines()]

    return xtest, ytest
def get_tokens(dict_element):
    """Tokenise the query, title and description of one record.

    Stop words are removed from the title and the description (not from the
    query) as initial cleanup for feature extraction.  All tokens are
    lower-cased.

    :param dict_element: mapping with the keys ``'query'``,
        ``'product_description'`` and ``'product_title'``
    :return: tuple ``(query_tokens, title_tokens, desc_tokens)`` of lists
    """
    query = dict_element['query']
    desc = dict_element['product_description']
    title = dict_element['product_title']
    stop = set(stopwords.words('english'))

    # Every group is non-capturing: RegexpTokenizer is built on re.findall,
    # which returns tuples of groups rather than the matched text as soon as
    # the pattern contains a capturing group.
    pattern = r'''(?x)              # set flag to allow verbose regexps
          (?:[A-Z]\.)+                # abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?          # numbers, incl. currency and percentages
        | \w+(?:[-']\w+)*             # words w/ optional internal hyphens/apostrophe
        | @(?:(?:\w)+(?:[-']\w+))*
        | [+/\-@&*]                   # special characters with meanings
        '''
    tokenizer = RegexpTokenizer(pattern)

    def lowered(text, drop_stop_words):
        tokens = [token.lower() for token in tokenizer.tokenize(text)]
        if drop_stop_words:
            tokens = [token for token in tokens if token not in stop]
        return tokens

    # A real list (not a lazy map object) so all three results share a type.
    query_tokens = lowered(query, False)
    desc_tokens = lowered(desc, True)
    title_tokens = lowered(title, True)
    return query_tokens, title_tokens, desc_tokens
def read_and_clean_training_data(file):
    """
    Read the training data and split every row into text and label.

    The first line is skipped.  Each remaining line must contain exactly one
    tab separating the text from the label.  The text is tokenised; numeric
    tokens, the token "ml" and English stop words are dropped and the rest is
    lower-cased and joined with spaces.

    :param file: path of the tab-separated training file
    :return: tuple (list of cleaned texts, list of labels)
    """
    x_train = []
    y_train = []
    with open(file, encoding="utf8", errors="ignore") as f:
        stop_words = set(stopwords.words('english'))
        rows = f.readlines()
        tokenizer = RegexpTokenizer(r'\w+')
        for row in rows[1:]:
            data, label = row.split('\t')
            # NOTE(review): the "ml" and stop-word tests see the original casing.
            kept = [
                word.lower()
                for word in tokenizer.tokenize(data)
                if not word.isdigit() and word != "ml" and word not in stop_words
            ]
            x_train.append(" ".join(kept))
            y_train.append(label.strip())
    return x_train, y_train
def _get_ngram_features(infile, ngram_size):
    """
    Returns a dictionary containing ngrams and counts observed in a given file

    The file is read as tab-separated values; its first line is skipped and
    the third column of every other line is used as the text.

    :param infile: file to be analysed
    :param ngram_size: ngram size
    :return: dict of ngrams/counts
    """
    # Tokenizer which removes punctuation.
    tokenizer = RegexpTokenizer(r'\w+')
    # Stop words to drop before lemmatisation.
    stops = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    d_ngrams = defaultdict(int)

    with open(infile) as tsv:
        file_reader = reader(tsv, dialect="excel-tab")
        # Skip the title line.  The next() builtin works on Python 2.6+ and 3,
        # whereas the .next() method exists on Python 2 only.
        next(file_reader)
        for line in file_reader:
            words = tokenizer.tokenize(line[2])
            lemmas = [lemmatizer.lemmatize(word)
                      for word in words if word not in stops]
            for ngram in ngrams(lemmas, ngram_size):
                d_ngrams[ngram] += 1
    return d_ngrams
def french_tokenizer(text):
    """Return the words of ``text`` that have at least two word characters.

    :param text: text to tokenise
    :return: list of tokens (no lemmatisation is applied)
    """
    from nltk import RegexpTokenizer

    return RegexpTokenizer(r"(?u)\b\w\w+\b").tokenize(text)
def prep_text_to_stem(text):
    """
    Remove unwanted parts such as digits and the words of the stop list, and
    append '#' to the end of every word to make single-letter stems easier to
    handle.

    The stop list consists of the 300 most frequent tokens of the input.

    :param text: iterable of values; items that are not ``str`` are ignored
    :return: the processed tokens joined by single spaces
    """
    strings = [item for item in text if type(item) is str]
    tokenizer = RegexpTokenizer(r'\w+', flags=re.UNICODE)
    tokens = tokenizer.tokenize(' '.join(strings).lower())

    # A set gives O(1) membership tests.
    stop_list = {word for word, _count in Counter(tokens).most_common(300)}
    stop_list.add('series([],')

    new_tokens = []
    for token in tokens:
        if token in stop_list:
            continue
        token = ''.join(letter for letter in token if not letter.isdigit())
        for pun in punct:
            # str.replace returns a new string; the result used to be
            # discarded, so punctuation was never removed.
            token = token.replace(pun, '')
        new_tokens.append(token + '#')
    return ' '.join(new_tokens)
def analyze_articles():
    """Plot the sentence-length distribution of the JSON article collection.

    Every article is split into sentences with the Punkt tokenizer, ASCII
    punctuation is removed and the word tokens of each sentence are counted.
    The number of sentences is printed and the ``(length, frequency)`` pairs,
    sorted numerically by length, are passed to ``plot``.
    """
    json_document = _read_json_articles()
    l_articles = [
        json_document[i]['_source']['content']
        for i in range(len(json_document))
    ]

    # Chunks the given text into sentences.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer2 = RegexpTokenizer(r'\w+')
    # Build the punctuation set once instead of once per character.
    punctuation_chars = set(string.punctuation)

    d_lengths = defaultdict(int)  # keyed by the length as a string
    total_sent = 0
    for article in l_articles:
        sentence_lengths = []
        for sent in tokenizer.tokenize(article):
            cleaned = ''.join(ch for ch in sent if ch not in punctuation_chars)
            sentence_lengths.append(len(tokenizer2.tokenize(cleaned)))
        total_sent += len(sentence_lengths)
        for length, occurrences in Counter(sentence_lengths).items():
            d_lengths[str(length)] += occurrences

    # print(x) and dict.items() behave the same on Python 2 and 3 here.
    print(total_sent)
    lengths = sorted(d_lengths.items(),
                     key=lambda key_value: int(key_value[0]))
    plot(lengths)
def preprocessing(self):
    """Load the comment data set and add a normalised text column.

    Comments are lower-cased, tokenised, stripped of French stop words,
    stemmed, lemmatised and re-joined with spaces into the column
    ``lemmatiser_com``.

    :return: DataFrame with the columns ``comment``, ``lemmatiser_com`` and
        ``sentiment`` (also stored in ``self.df``)
    """
    self.df = pd.read_csv('static/models/resampled_comments_1.csv')
    # .copy(): the column assignments below must not write into a slice of
    # self.df (chained assignment / SettingWithCopyWarning).
    self.comments = self.df[['comment', 'rating', 'sentiment']].copy()
    self.comments['comment'] = self.comments['comment'].map(
        lambda x: x.lower())

    toknizer = RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')
    stop_words = set(stopwords.words('french'))
    stemmer = SnowballStemmer(language='french')
    lemmatizer = FrenchLefffLemmatizer()

    token = self.comments['comment'].apply(toknizer.tokenize)
    stop_token = token.apply(
        lambda items: [item for item in items if item not in stop_words])
    stemm = stop_token.apply(lambda items: [stemmer.stem(y) for y in items])
    lemm = stemm.apply(lambda items: [lemmatizer.lemmatize(y) for y in items])

    # Join per row without relying on the index labels being 0..n-1.
    self.comments['lemmatiser_com'] = lemm.apply(' '.join)

    data = self.comments[['comment', 'lemmatiser_com', 'sentiment']]
    self.df = pd.DataFrame(data)
    return self.df
def text2sents(text, lemmatize=False, stemmer=None):
    """
    Convert a text into a list of sentences made of normalized words.

    :param text: text to process
    :param lemmatize: if true, words are POS-tagged and lemmatized,
        otherwise they are stemmed
    :param stemmer: stemmer to use; if None, PorterStemmer is used. Only
        applied if lemmatize is false
    :return: list of lists of words, one inner list per sentence
    """
    sentences = sent_tokenize(text)
    word_tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tagger = PerceptronTagger()

        def normalize(words):
            return [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                    for word, tag in ((pair[0], pair[1])
                                      for pair in tagger.tag(words))]
    else:
        active_stemmer = PorterStemmer() if stemmer is None else stemmer

        def normalize(words):
            return [active_stemmer.stem(word) for word in words]

    return [normalize(word_tokenizer.tokenize(sentence))
            for sentence in sentences]
def french_tokenizer(text):
    """Split ``text`` into tokens of two or more word characters.

    :param text: text to tokenise
    :return: list of matched tokens; lemmatisation is not performed
    """
    from nltk import RegexpTokenizer

    word_splitter = RegexpTokenizer(r"(?u)\b\w\w+\b")
    return word_splitter.tokenize(text)
def prepare_text(text: pd.Series) -> list:
    """
    Naive approach to text cleaning. Strip out HTML, then do relatively strict
    preparation (lemmatization, stopwords).

    For every document the distinct lower-case tokens that are purely
    alphabetic, at least four characters long and not English stop words are
    lemmatized; lemmas shorter than four characters are dropped.

    :param text: series of all relevant text data
    :return: list with one list of lemmas per document; lemmas appear in
        order of first occurrence
    """
    # First, remove HTML tags.
    wo_html = text.apply(lambda x: BeautifulSoup(x, "lxml").text)

    tokenizer = RegexpTokenizer(r'\w+')
    stopword_set = set(stopwords.words('english'))
    lmtzr = WordNetLemmatizer()
    alphabetic = re.compile('^[a-zA-Z]+$')

    clean_text = []
    pbar = tqdm(range(len(text)), desc='clean_text')
    try:
        for d in wo_html:
            # De-duplicate in first-occurrence order.  A set difference made
            # the token order vary from run to run.
            distinct = []
            seen = set()
            for token in tokenizer.tokenize(d.lower()):
                if token not in seen and token not in stopword_set:
                    seen.add(token)
                    distinct.append(token)

            filtered_tokens = [token for token in distinct
                               if alphabetic.search(token) and len(token) >= 4]
            stems = [lmtzr.lemmatize(t) for t in filtered_tokens]
            clean_text.append([stem for stem in stems if len(stem) > 3])
            pbar.update()
    finally:
        # Always release the progress bar, even if a document fails.
        pbar.close()
    return clean_text
def tokenize(text):
    """Split a body of text into word tokens.

    :param text: the text to split
    :return: list of the runs of word characters found in ``text``
    """
    return RegexpTokenizer(r'\w+').tokenize(text)
def tokenize(self, text):
    """
    Tokenise text with an NLTK RegexpTokenizer built from ``self.pattern``.

    :param text: text to split
    :return: list of tokens
    """
    return RegexpTokenizer(self.pattern).tokenize(text)
def tokenize(self, text):
    """
    Split ``text`` into tokens.

    This tokenizer uses the nltk RegexpTokenizer, configured with
    ``self.pattern``.

    :param text: text to split
    :return: tokens
    """
    pattern_tokenizer = RegexpTokenizer(self.pattern)
    return pattern_tokenizer.tokenize(text)
def __call__(self, doc ):
    """Tokenise ``doc`` into lemmatised, lower-cased alphabetic words.

    Words found in the English stop-word list or in the extra list below are
    dropped.  The stop-word test is applied to the lemma before lower-casing.

    :param doc: text to tokenise
    :return: list of tokens
    """
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus import stopwords

    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    words = [self.wnl.lemmatize(t) for t in tokenizer.tokenize(doc)]

    mystops = (u'youtube', u'mine', u'this', u'that', 'facebook', 'com',
               'google', 'www', 'http', 'https')
    # Keep the stop words in a set: converting it to a list made every
    # membership test linear.
    stop_words = set(stopwords.words('english'))
    stop_words.update(mystops)
    return [i.lower() for i in words if i not in stop_words]
def tokenize_and_stem(doc):
    """Return the Porter stems of the content words of ``doc``.

    Tokens are lower-cased; English stop words and tokens of two characters
    or fewer are discarded before stemming.

    :param doc: text to process
    :return: list of stems in document order
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')  # English stop-word list
    p_stemmer = PorterStemmer()

    stems = []
    for token in word_tokenizer.tokenize(doc):
        lowered = token.lower()
        if lowered in en_stop or len(token) <= 2:
            continue
        stems.append(p_stemmer.stem(lowered))
    return stems
def gen_doc_term_counts(path_corpus, list_corpus, list_vocab):
    """Generate a document-term matrix for a corpus and a common vocabulary.

    :param path_corpus: directory prefix, concatenated verbatim with each name
    :param list_corpus: file names of the documents (UTF-8, undecodable bytes
        ignored)
    :param list_vocab: vocabulary terms; ``gen_regex_c`` supplies one compiled
        regex per term
    :return: tuple ``(doc_term, counts_corpus)``: ``doc_term[i, j]`` is the
        number of matches of term j in document i and ``counts_corpus[i]`` is
        the number of word tokens in document i
    """
    print("\ngen_doc_term_counts:{}".format(path_corpus))
    num_docs = len(list_corpus)
    num_terms = len(list_vocab)
    doc_term = np.zeros((num_docs, num_terms))
    counts_corpus = np.zeros(num_docs)

    # Dict of compiled regexes, keyed by vocabulary term.
    re_c_vocab = gen_regex_c(list_vocab)
    # Same pattern as the former ur'\w+' literal, valid on Python 2 and 3.
    tokenizer = RegexpTokenizer(u'\\w+')

    every = 50  # print progress once per this many documents
    start = timeit.default_timer()
    for i, fname in enumerate(list_corpus):
        with codecs.open(path_corpus + fname, 'r', "utf-8",
                         errors="ignore") as fp:
            txt = fp.read().lower()

        counts_corpus[i] = len(tokenizer.tokenize(txt))

        for j, term in enumerate(list_vocab):
            # The second argument of a compiled pattern's findall() is the
            # start position, not flags: passing re.UNICODE (32) skipped the
            # first 32 characters of every document.
            doc_term[i, j] = len(re_c_vocab[term].findall(txt))

        if i % every == 0:
            print(i, round(timeit.default_timer() - start, 2))
    return (doc_term, counts_corpus)
def __call__(self, doc, string_tokenize='[a-zA-Z0-9]+'):
    """Tokenize *doc* and expand the tokens with their WordNet lemma names.

    The document is split with the ``string_tokenize`` pattern, each token
    is passed through ``self.wnl.lemmatize`` and lower-cased, and stop
    words are removed. The surviving tokens are then joined with the
    lower-cased lemma names of every WordNet synset of every token, and
    stop words are removed from the result again.

    :param doc: text to tokenize
    :param string_tokenize: regular expression describing one token
    :return: sorted list of unique lower-cased tokens and synonyms (the
        previous set-based order was arbitrary; sorting makes it
        reproducible)
    """
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus import stopwords
    from nltk.corpus import wordnet as wn

    tokenizer = RegexpTokenizer(string_tokenize)

    extra_stops = (u'youtube', u'mine', u'this', u'that')
    # Kept as a set so each membership test is O(1).
    stop_words = set(stopwords.words('english'))
    stop_words.update(extra_stops)

    # Lower-case BEFORE the stop-word test: the stop list is lower-case, so
    # testing the raw token let capitalised stop words ("The") through.
    base_tokens = set()
    for raw_token in tokenizer.tokenize(doc):
        token = self.wnl.lemmatize(raw_token).lower()
        if token not in stop_words:
            base_tokens.add(token)

    expanded = set(base_tokens)
    for token in base_tokens:
        for synset in wn.synsets(token):
            for lemma in synset.lemmas():
                # Lemma names may be capitalised; normalise before the final
                # stop-word filter for the same reason as above.
                expanded.add(lemma.name().lower())

    return sorted(expanded - stop_words)
class StemTokenizer(object):
    """Callable that splits a document into word tokens and stems each one."""

    def __init__(self):
        # stemmer applied to every token
        self.wnl = PorterStemmer()
        # word tokenizer: runs of word characters between word boundaries
        self.mytokenizer = RegexpTokenizer('\\b\\w+\\b')

    def __call__(self, doc):
        """Return the list of stems of the word tokens found in *doc*."""
        stems = []
        for token in self.mytokenizer.tokenize(doc):
            stems.append(self.wnl.stem(token))
        return stems
class StemTokenizer(object):
    """Tokenizer object: calling it yields the stemmed word tokens of a document."""

    def __init__(self):
        # NLTK is imported lazily, only when an instance is created.
        from nltk import RegexpTokenizer
        from nltk.stem import PorterStemmer

        self.wnl = PorterStemmer()
        self.mytokenizer = RegexpTokenizer('\\b\\w+\\b')

    def __call__(self, doc):
        """Tokenize *doc* into words and return their Porter stems as a list."""
        word_tokens = self.mytokenizer.tokenize(doc)
        return list(map(self.wnl.stem, word_tokens))
def build_vector(text, neutral):
    """Build the set of lemmatized words that represents *text*.

    When ``neutral`` is true the tokens are POS-tagged first, and words whose
    tag is in the forbidden list below are left out of the vector. Otherwise
    every token is lemmatized and kept.
    """
    # We tokenize the text
    words = RegexpTokenizer(r'\w+').tokenize(text)

    if not neutral:
        return {lemmatize(word) for word in words}

    # Tags whose words are excluded from the neutral vector
    forbidden_pos = ['RB', 'RBS', 'RBR', 'CC', 'CD', 'DT', 'EX', 'IN', 'LS',
                     'PDT', 'PRP', 'PRP$', 'RP', 'SYM', 'TO', 'WDT', 'WP',
                     'WP$', ]

    # We add the POS tag, then build the document vector from the rest
    return {
        lemmatize(word)
        for word, tag in pos_tag(words)
        if tag not in forbidden_pos
    }
def analyze_articles():
    """Plot the distribution of sentence lengths (in words) over all articles.

    Articles are read with ``_read_json_articles``; the text of each one is
    taken from its ``['_source']['content']`` entry. Every article is split
    into sentences with the punkt English model, punctuation is stripped,
    and the ``\\w+`` tokens of each sentence are counted. The total number
    of sentences is printed, and the list of ``(length, occurrences)``
    pairs, sorted by numeric length, is passed to ``plot``. Lengths are
    kept as strings in that list, as before.
    """
    json_document = _read_json_articles()
    # Indexed access is kept on purpose: the container type returned by
    # _read_json_articles() is not visible from this function.
    l_articles = [json_document[i]['_source']['content']
                  for i in range(len(json_document))]

    # chunk the given text into sentences
    sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    word_tokenizer = RegexpTokenizer(r'\w+')
    # Built once; it used to be rebuilt for every single character.
    punctuation_chars = set(string.punctuation)

    d_lengths = defaultdict(int)
    total_sent = 0
    for article in l_articles:
        sentence_lengths = []
        for sent in sentence_splitter.tokenize(article):
            # clean sentences from punctuation
            cleaned = ''.join(ch for ch in sent if ch not in punctuation_chars)
            sentence_lengths.append(len(word_tokenizer.tokenize(cleaned)))

        total_sent += len(sentence_lengths)
        for length, occurrences in Counter(sentence_lengths).items():
            d_lengths[str(length)] += occurrences

    # print() with one argument and dict.items() behave the same on
    # Python 2 and Python 3, unlike the print statement and iteritems().
    print(total_sent)
    lengths = sorted(d_lengths.items(), key=lambda key_value: int(key_value[0]))
    plot(lengths)
def tokenizeWords(corpus_root):
    """Build labelled word-feature lists for every file under *corpus_root*.

    Each file is lower-cased, split into ``\\w+`` tokens, stripped of stop
    words with ``removeAllStopWords`` and converted with ``word_feats``.
    The result is paired with the label ``'neg'`` when the root path
    contains "negative" and ``'pos'`` when it contains "positive". The
    positive list is printed.

    NOTE(review): both lists are locals and are discarded when the function
    returns; presumably they were meant to be returned or stored. Confirm
    against the callers before changing the return value.
    """
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    tokenizer = RegexpTokenizer(r'\w+')

    def _features(fileid):
        # One document -> feature structure, shared by both labels.
        words = tokenizer.tokenize(wordlists.raw(fileid).lower())
        return word_feats(removeAllStopWords(words))

    if "negative" in corpus_root:
        negfeats = [(_features(f), 'neg') for f in wordlists.fileids()]

    if "positive" in corpus_root:
        posfeats = [(_features(f), 'pos') for f in wordlists.fileids()]
        # Printed only where posfeats is bound: printing it unconditionally
        # would raise UnboundLocalError for a root without "positive" in it.
        print(posfeats)
def test():
    """Smoke test: find the fetched tweet most similar to the last one.

    Fetches tweets for two hashtags, treats the last document as the query,
    turns every document into a code vector (``FreqDist`` -> ``volumize``
    -> ``doc_code``) and prints the highest cosine similarity together with
    the document that achieved it.
    """
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    # %-formatting produces the same output under Python 2 and Python 3,
    # unlike the former print statement.
    print('Query: %s' % (documents[-1],))

    # Raw string: '\w' in a normal literal is an invalid escape sequence
    # (deprecated, and a SyntaxWarning on recent Python versions).
    tokenizer = RegexpTokenizer(r'\w+')

    vols = []
    for doc in documents:
        lowered_tokens = (token.lower() for token in tokenizer.tokenize(doc))
        samples = [
            word for word in lowered_tokens
            if word not in ENGLISH_STOP_WORDS and word not in punctuation
        ]
        vols.append(volumize(FreqDist(samples)))

    # every document except the query (the last one)
    vectors = [doc_code(v) for v in vols[:-1]]
    query_vec = doc_code(vols[-1])

    sims = [cos(v, query_vec) for v in vectors]
    m = max(sims)
    print('%s %s' % (m, documents[sims.index(m)]))
def create_bag_of_words(document_list):
    """
    Builds a bag-of-words representation of the given documents. Each
    document is lower-cased and split into word tokens (which drops the
    punctuation); only the tokens tagged as nouns (tag starting with 'NN')
    that are not stop words are kept.

    :type document_list: list[str]
    :param document_list: the documents to process
    :rtype: list[list[str]]
    :return: for every document, the list of its remaining noun tokens
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    pos_tagger = nltk.PerceptronTagger()

    # English stop words plus the fragments left behind when contractions
    # ("didn't", "we've", ...) are split on the apostrophe.
    ignored_words = set(stopwords.words("english")) | {
        't', 'didn', 'doesn', 'haven', 'don', 'aren', 'isn', 've', 'll',
        'couldn', 'm', 'hasn', 'hadn', 'won', 'shouldn', 's', 'wasn',
        'wouldn'}

    lowered_documents = [document.lower() for document in document_list]

    return [
        [word
         for word, tag in pos_tagger.tag(word_tokenizer.tokenize(text))
         if tag.startswith('NN') and word not in ignored_words]
        for text in lowered_documents
    ]
def __init__(self):
    """Constructor.

    Sets up the sentence tokenizer, the English fallback language and the
    Jieba tokenizer, checks that the Jieba dictionaries exist, loads them,
    and runs a short self-test.

    :raises McLanguageException: if the dictionary directory or either
        dictionary file is missing, if Jieba cannot load them, or if the
        self-test does not produce the expected words
    """

    super().__init__()

    # Text -> sentence tokenizer for Chinese text
    self.__chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    self.__english_language = EnglishLanguage()

    self.__jieba = JiebaTokenizer()

    if not os.path.isdir(self.__DICT_PATH):
        raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

    if not os.path.isfile(self.__JIEBA_DICT_PATH):
        raise McLanguageException(
            "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
        )
    if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
        raise McLanguageException(
            "User dictionary not found in jieba dictionary directory: %s" % self.__DICT_PATH
        )

    try:
        self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
        self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH))
    except Exception as ex:
        # Chain explicitly so the original traceback is reported as the cause.
        raise McLanguageException("Unable to initialize jieba: %s" % str(ex)) from ex

    # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working.
    # The message used to name MeCab (a different tokenizer), which pointed
    # whoever hit this error at the wrong dependency.
    jieba_exc_message = "Jieba self-test failed; make sure that Jieba is built and dictionaries are accessible."
    try:
        test_words = self.split_sentence_to_words('python課程')
    except Exception as ex:
        raise McLanguageException(jieba_exc_message) from ex
    else:
        # The second word of the test sentence must come back unchanged.
        if len(test_words) < 2 or test_words[1] != '課程':
            raise McLanguageException(jieba_exc_message)
def __init__(self):
    """Constructor.

    Creates the sentence tokenizer, the English fallback language and the
    MeCab tagger, then runs a short self-test. Raises McLanguageException
    when MeCab cannot be initialized or the self-test fails.
    """

    super().__init__()

    # Text -> sentence tokenizer
    self.__japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    self.__english_language = EnglishLanguage()

    dictionary_dir = JapaneseLanguage._mecab_ipadic_neologd_path()

    try:
        # The argument string is built inside the "try" so that a failure
        # while formatting it is reported the same way as a MeCab failure.
        tagger_template = (
            '--dicdir=%(dictionary_path)s '
            '--node-format=%%m%(token_pos_separator)s%%h\\n '
            '--eos-format=%(eos_mark)s\\n'
        )
        tagger_arguments = tagger_template % {
            'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
            'eos_mark': self.__MECAB_EOS_MARK,
            'dictionary_path': dictionary_dir,
        }
        self.__mecab = MeCab.Tagger(tagger_arguments)
    except Exception as ex:
        raise McLanguageException("Unable to initialize MeCab: %s" % str(ex))

    # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
    mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
    try:
        test_words = self.split_sentence_to_words('pythonが大好きです')
    except Exception:
        raise McLanguageException(mecab_exc_message)

    # Reached only when the split itself succeeded: check its second word.
    if len(test_words) < 2 or test_words[1] != '大好き':
        raise McLanguageException(mecab_exc_message)
def stemming(doc):
    """Split *doc* into word tokens and return the list of their stems."""
    stemmer = PorterStemmer()
    word_tokens = RegexpTokenizer('\\b\\w+\\b').tokenize(doc)

    stems = []
    for token in word_tokens:
        stems.append(stemmer.stem(token))
    return stems
def __init__(self):
    """Initialise the base RegexpTokenizer with this tokenizer's token pattern."""
    # Alternatives, tried left to right:
    #   \w+                   a run of word characters
    #   \s+                   a run of whitespace
    #   \[,.]+                a literal '[', a ',', any one character, then
    #                         one or more ']'
    #   \,+                   a run of commas
    #   [\{\}\-\<\>\=]+       a run of braces, '-', '<', '>' or '='
    #   (?!\')[^\w\s]         any other single symbol except an apostrophe
    # NOTE(review): the third alternative looks like a typo for the character
    # class [,.]+ (a run of commas/periods) - confirm before changing, since
    # it alters tokenization.
    token_pattern = r'\w+|\s+|\[,.]+|\,+|[\{\}\-\<\>\=]+|(?!\')[^\w\s]'
    RegexpTokenizer.__init__(self, token_pattern)
def __init__(self):
    """Initialise the base RegexpTokenizer with this tokenizer's token pattern."""
    # Alternatives, tried left to right:
    #   \w+\[.,]+                     word characters followed by a literal
    #                                 '[', any one character, a ',' and one
    #                                 or more ']'
    #   [\[\]\(\)\{\}"\-\<\>\=]+      a run of brackets, parentheses, braces,
    #                                 '"', '-', '<', '>' or '='
    #   [^\w\s]                       any other single symbol
    # NOTE(review): as written, no alternative matches a plain word; the
    # first one looks like a typo for  \w+|[.,]+  - confirm before changing,
    # since it alters tokenization.
    token_pattern = r'\w+\[.,]+|[\[\]\(\)\{\}"\-\<\>\=]+|[^\w\s]'
    RegexpTokenizer.__init__(self, token_pattern)
import pickle

import nltk
from nltk import RegexpTokenizer
from nltk.corpus import cess_esp as cess

# My sentences
sentence = "hola, hola, soy Pedro ¿como te llamas?."
tokenizer = RegexpTokenizer(r'\w+')
tokenized_words = tokenizer.tokenize(sentence)

# Dec train/test
cess_sents = cess.tagged_sents()

# The index at which the corpus is split is cached on disk so that every run
# uses the same train/test partition.
# NOTE: pickle must only be used on trusted files; this one is written by
# this script itself.
try:
    with open('test_pickles/test_data.pickle', 'rb') as fa:
        div = pickle.load(fa)
except FileNotFoundError:
    # No cached split yet: use the first 90% of the sentences for training
    print("dumping train/test")
    div = len(cess_sents)*90//100
    with open('test_pickles/test_data.pickle', 'wb') as fb:
        pickle.dump(div, fb)

train = cess_sents[:div]
# The test set starts AT div: the former [div+1:] slice silently dropped the
# sentence at index div from both sets.
test = cess_sents[div:]

#####
#