def detect_language(comment):
    """
    To detect the language, we compare a comment to the stopword lists of each
    language. The language that shares the most stopwords with the comment is
    most likely the language the comment is written in. This is obviously not
    foolproof; a well-written comment will be detected far more reliably than a
    comment written in slang or with poor grammar. Ultimately, this likely
    favours comments that are more valuable because of their structure. In
    addition, languages that are easily distinguished from English can be
    detected, making it possible to compare the language of a comment to the
    actual content that is annotated in Hypothes.is, since most users won't
    understand comments in a different language anyway.
    """
    # First we tokenize the comment
    tokens = wordpunct_tokenize(comment)
    words = [word.lower() for word in tokens]

    languages_ratios = {}

    # Then we compare the words to the most frequent stopwords per language
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        # Calculate the language score
        languages_ratios[language] = len(common_elements)

    # Get the key with the highest value
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    return most_rated_language
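# A minimal usage sketch for detect_language() above, assuming nltk is installed
# and the 'stopwords' corpus has been downloaded (e.g. via nltk.download('stopwords'));
# the imports and the sample comment are illustrative assumptions, not part of the
# original snippet.
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

print(detect_language("This is a short, well written comment about the annotated text."))
# Expected to print 'english', the language whose stopword list overlaps most with the comment.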
def calculate_language_scores(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokenized_words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
def generate(languages):
    """
    Generate a dict of language stopwords from nltk.corpus for the specified
    languages, e.g.:

        language_stopwords: {'english': {'a', 'an'}, 'french': {'un', 'une'}}

    @param languages: languages specified, e.g. ['english', 'french']
    @type languages: list

    @return: a dict of stopwords corresponding to the specified languages
    @rtype: dict
    """
    language_stopwords = {}
    for language in stopwords.fileids():
        if language in languages:
            stopwords_set = set(stopwords.words(language))
            language_stopwords[language] = stopwords_set
    return language_stopwords
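# A minimal usage sketch for generate() above, assuming the nltk 'stopwords'
# corpus is available; the chosen language list is an illustrative assumption.
from nltk.corpus import stopwords

language_stopwords = generate(['english', 'french'])
print(sorted(language_stopwords.keys()))       # ['english', 'french']
print('the' in language_stopwords['english'])  # True: 'the' is an English stopword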
def _calculate_languages_ratios(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """
    languages_ratios = {}

    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # Compute, per language included in nltk, the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
def hello_world():
    if request.method == 'POST':
        print "Request: ", request
        print "Form: ", request.form
        print "Files: ", request.files

        archive = zipfile.ZipFile(request.files.get("solution"))
        with archive.open("extra.txt") as solution:
            languages_ratios = {}
            tokens = nltk.wordpunct_tokenize(solution.read().decode('utf-8'))
            words_list = [word.lower() for word in tokens]
            words_set = set(words_list)
            print "Words_set: ", words_set

            for language in stopwords.fileids():
                stopwords_set = set(stopwords.words(language))
                common_elements = words_set.intersection(stopwords_set)
                if common_elements:
                    languages_ratios[language] = len(common_elements)
            print "Language ratios: ", languages_ratios

            # 50% of the mark for the dominant language being English
            mark = 50 if max(languages_ratios, key=languages_ratios.get) == 'english' else 0
            print "Mark for lang: ", mark

            # 50% of the mark for length, scaled linearly up to 200 words
            words_count = len(words_list)
            print "Words count: ", words_count
            mark += (float(words_count) / 200) * 50 if words_count < 200 else 50
            print "Total Mark: ", mark

            req = requests.post(request.form["url"], data={"mark": int(mark)})
    return ''
def split_count(sentence):
    # Split the sentence and count where each word comes from by keeping,
    # per language, the set of stopwords it has in common with the sentence.
    vocab_list = []
    languages_ratios = {}
    split = wordpunct_tokenize(sentence)      # tokenizes the input
    words = [word.lower() for word in split]  # lowercases every token in the list split
    lang_dict = {}

    for language in stopwords.fileids():      # iterate through the built-in list of languages
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)                # creates a set of words
        vocab_list = words
        common_element = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_element)  # this determines the score
        lang_dict[language] = common_element  # works as intended, but could be cleaner

    # The main and secondary languages are the two highest-scoring entries
    ranked = sorted(languages_ratios, key=languages_ratios.get, reverse=True)
    main_language, secondary_lang = ranked[0], ranked[1]

    print "this is the set for main lang:", lang_dict.get(main_language), "\n"
    print "this is the set for second lang:", lang_dict.get(secondary_lang), "\n"
    print "this is vocab_list: ", vocab_list, "\n"
    print "this is DICT: ", lang_dict
    print "ORIGINAL SENTENCE: ", sentence
def calcularValoresDeIdioma(contenido):
    languages_ratios = {}
    tokens = wordpunct_tokenize(contenido)
    words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)

    return languages_ratios
def calculate_languages_ratios(text):
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
def detectLanguage(self, text):
    languages_scores = {}
    tokens = word_tokenize(text)
    words = [word.lower() for word in tokens]

    # Compute per language included in nltk number of unique stopwords
    # appearing in analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_scores[language] = len(common_elements)  # language "score"

    return max(languages_scores, key=languages_scores.get)
def check_language(self, word_list):
    """source: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/"""
    languages_ratios = {}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(word_list)
        # Check similarity
        common_elements = words_set.intersection(stopwords_set)
        # Save as ratio
        languages_ratios[language] = len(common_elements)
    # Get language with most similarities
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    return most_rated_language
def _calculate_languages_ratios(text):
    text = str(text)  # assuring we receive a string
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # Compute per language included in nltk the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
def cal():
    text = sys.stdin.read()
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)

    ratios = languages_ratios
    most = max(ratios, key=ratios.get)
    print(most)
    # if most == "english":
def language_detector(string):
    tokens = wordpunct_tokenize(string)
    words = [word.lower() for word in tokens]

    # compute language scores
    languages_ratios = {}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    return most_rated_language
def _calculate_languages_ratios(self, text):
    # Compute the likelihood that a text is written in each language and
    # return a dictionary that looks like {'french': 2, 'english': 4, 'dutch': 0}
    languages_ratios = {}
    tokens = self.getWords(text)

    # Count, per language, the number of stopwords that appear.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokens)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # number of stopword occurrences per language

    return languages_ratios
def lang_likelihood(self, document):
    '''This method computes the language likelihood using the algorithm and
    tokenizer from NLTK.'''
    languages_likelihood = {}

    tokens = wordpunct_tokenize(document)
    words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_likelihood[language] = len(common_elements)  # language "score"

    return languages_likelihood
def _calculate_languages_ratios(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """
    languages_ratios = {}

    '''
    nltk.wordpunct_tokenize() splits all punctuation into separate tokens

    >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    '''
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # Compute per language included in nltk the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        if language == "portuguese":
            # Treat a few domain-specific Portuguese words as additional stopwords
            lista = stopwords.words(language)
            lista.extend(['Fatec', 'fatec', 'Palmeiras', 'palmeiras',
                          'Dilma', 'dilma', 'Copa', 'copa'])
            stopwords_set = set(lista)
        else:
            stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
def main():
    # Step 1: tokenize the words so we have clean tokens to match against the stopword lists
    print "\n -----------------------------\n"
    split = wordpunct_tokenize("hola como estas, espero que estes bien")
    print split
    print "\n -----------------------------\n"

    # Now let's get serious
    languages_ratios = {}
    tokens = wordpunct_tokenize("hola como estas?")
    words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_element = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_element)  # this determines the score

    print languages_ratios
def calculate_language_scores(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in analyzed text.
    :rtype: dict(str -> int)
    """
    # Split the text into separate tokens, using natural language punctuation signs.
    words = {word.lower() for word in wordpunct_tokenize(text)}

    # Return the number of stopwords found per language.
    return {
        language: len(words.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }
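# A minimal usage sketch for calculate_language_scores() above, assuming the
# nltk 'stopwords' corpus is available; the sample sentence is an illustrative
# assumption. It also shows how a caller can turn the per-language scores into
# a single detected language.
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

scores = calculate_language_scores("Ceci est une phrase avec des mots en francais.")
detected = max(scores, key=scores.get)  # language with the largest stopword overlap
print(detected)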
def identify_language(text):
    """
    Identify a language, given a text of that language.

    Parameters
    ----------
    text : str

    Returns
    -------
    list of tuples (ISO 639-3, score)

    Examples
    --------
    >>> identify_language('Ich gehe zur Schule.')
    [('deu', 0.8)]
    """
    languages_ratios = []

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # Check how many stopwords of the languages NLTK knows appear in the
    # provided text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        score = len(common_elements)
        languages_ratios.append((language, score))

    # Normalize
    sum_scores = float(sum(el[1] for el in languages_ratios))
    languages_ratios = [(_nltk_to_iso369_3(el[0]), el[1]) for el in languages_ratios]
    if sum_scores > 0:
        languages_ratios = [(el[0], el[1] / sum_scores) for el in languages_ratios]

    return sorted(languages_ratios, key=lambda n: n[1], reverse=True)
def detect_lang(self, text):
    """
    Return the detected language.

    Args:
        text: input text

    Returns:
        the detected language string
    """
    language_ratio = {}
    words = wordpunct_tokenize(text)

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_words = words_set.intersection(stopwords_set)
        language_ratio[language] = len(common_words)

    detected_lang = max(language_ratio, key=language_ratio.get)
    return detected_lang
def capitalize(text):
    """
    Text capitalizator for Python 2.
    """
    if isinstance(text, str):
        text = text.decode("utf-8")

    if set(text) & CYRILLIC_ALPHABET:
        language = "russian"
    else:
        words = set(wordpunct_tokenize(text.lower()))
        language = max(
            stopwords.fileids(),
            key=lambda lang: len(words & PRECALCULATED_LANGSETS[lang])
        )

    class_ = EnglishCapitalization
    if language == "russian":
        class_ = RussianCapitalization
    elif language == "spanish":
        class_ = SpanishCapitalization
    elif language == "dutch":
        class_ = DutchCapitalization

    return class_().capitalize(text)
def _calculate_languages_ratios(words):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

    @param words: Tokenized words of the text whose language is to be detected
    @type words: list

    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """
    languages_ratios = {}

    # Compute per language included in nltk the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set([i.encode("utf-8") for i in stopwords.words(language)])
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
def _get_available_languages(self):
    """Get available languages by listing nltk's stopwords files."""
    return stopwords.fileids()
def construct_ui(self):
    self.setWindowTitle('TextAnalysis')
    self.mainWidget = QtWidgets.QWidget()
    self.setCentralWidget(self.mainWidget)
    self.statusBar = QtWidgets.QStatusBar()
    self.setStatusBar(self.statusBar)
    layout = QtWidgets.QGridLayout()
    self.mainWidget.setLayout(layout)

    # text input
    layout.addWidget(QtWidgets.QLabel("Paste text to analyze or "), 0, 0, 1, 2)
    self.button_file = QtWidgets.QPushButton("open file")
    layout.addWidget(self.button_file, 0, 2, 1, 1)
    self.textin = QtWidgets.QPlainTextEdit(
        "👍 and 📋 and 👌 and 😋 and 👍 again. In an interview on BBC Radio 4’s Today programme, Raab argued that leaving the EU without a deal would not be a problem, partly because the general agreement on tariffs and trade (Gatt) could be applied to create a standstill on tariffs with the EU. Mark Carney, the governor of the Bank of England, and Liam Fox, the trade secretary, have said it is not possible for the UK to trigger this unilaterally. But Raab said Carney was not a lawyer and claimed that legally it could be done and the question is whether there is the political will."
    )
    layout.addWidget(self.textin, 1, 0, 1, 4)
    self.separatorLine1 = QtWidgets.QFrame()
    self.separatorLine1.setFrameShape(QtWidgets.QFrame.HLine)
    self.separatorLine1.setFrameShadow(QtWidgets.QFrame.Plain)
    self.separatorLine1.setLineWidth(1)
    layout.addWidget(self.separatorLine1, 2, 0, 1, 4)

    # emoji interface
    layout.addWidget(QtWidgets.QLabel("Create emoji statistics:"), 3, 0, 1, 4)
    self.button_emoji = QtWidgets.QPushButton("analyze")
    layout.addWidget(self.button_emoji, 4, 0, 1, 1)
    self.textout_emoji = QtWidgets.QPlainTextEdit()
    self.textout_emoji.setPlainText(
        "output goes here (copy and paste into e.g. a spreadsheet)")
    layout.addWidget(self.textout_emoji, 5, 0, 1, 4)
    self.separatorLine2 = QtWidgets.QFrame()
    self.separatorLine2.setFrameShape(QtWidgets.QFrame.HLine)
    self.separatorLine2.setFrameShadow(QtWidgets.QFrame.Plain)
    self.separatorLine2.setLineWidth(1)
    layout.addWidget(self.separatorLine2, 6, 0, 1, 4)

    # ngram interface
    layout.addWidget(QtWidgets.QLabel("Create ngram statistics:"), 7, 0, 1, 4)
    layout.addWidget(QtWidgets.QLabel("Stopwords:"), 8, 0, 1, 1)
    self.languagelist = QtWidgets.QComboBox()
    self.languagelist.addItem("- none -")
    for item in stopwords.fileids():
        self.languagelist.addItem(item)
    layout.addWidget(self.languagelist, 8, 1, 1, 1)
    layout.addWidget(QtWidgets.QLabel("Windowsize:"), 8, 2, 1, 1)
    self.windowsize = QtWidgets.QLineEdit()
    self.windowsize.setMaxLength(1)
    self.windowsize.setText("2")
    layout.addWidget(self.windowsize, 8, 3, 1, 1)
    self.button_ngrams = QtWidgets.QPushButton("analyze")
    layout.addWidget(self.button_ngrams, 9, 0, 1, 1)
    self.textout_ngrams = QtWidgets.QPlainTextEdit()
    self.textout_ngrams.setPlainText(
        "output goes here (copy and paste into e.g. a spreadsheet)")
    layout.addWidget(self.textout_ngrams, 10, 0, 1, 4)

    # event binding
    self.button_file.clicked.connect(self.opentextfile)
    self.button_emoji.clicked.connect(self.emojistats)
    self.button_ngrams.clicked.connect(self.start_ngrams)
#!/usr/bin/env python

import nltk
import numpy as np
import sys

from nltk.corpus import stopwords
from review_data import read_reviews

###############################################################################

languages = stopwords.fileids()
stopword_sets = [set(stopwords.words(lang)) for lang in languages]
target_languages = [u'english', u'spanish']

###############################################################################

def detect_language(tokens):
    token_set = set(tokens)
    lang_scores = []
    for stopword_set in stopword_sets:
        common_words = stopword_set & token_set
        lang_scores.append(len(common_words))
    best_index = np.argmax(lang_scores)
    best_lang = languages[best_index]
    return best_lang
from nltk.corpus import stopwords

english_stops = set(stopwords.words('english'))
words = ["Can't", 'is', 'a', 'contraction']
print [word for word in words if word not in english_stops]
print stopwords.fileids()
import math
import operator
import re
from pprint import pprint

import numpy as np
from nltk.corpus import stopwords
from SequenceMining import GspSearch

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

### LANGUAGE RECOGNITION ###

if __name__ == '__main__':
    # These are the available languages with stopwords from NLTK
    languages = stopwords.fileids()

    # Fill the dictionary of languages, to avoid unnecessary function calls
    print("Loading stop words...", end='\r')
    try:
        dict_list = np.load('stopwords.npy').item()
    except:
        dict_list = {}
        for lang in languages:
            dict_list[lang] = {}
            for stop_word in stopwords.words(lang):
                dict_list[lang][stop_word] = 0
        np.save('stopwords.npy', dict_list)
    print("Loaded stop words.   ")
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\s+', gaps=True)
english_stopwords = set(stopwords.words('english'))

words = tokenizer.tokenize("Can't is a contraction.")
print(words)

filtered = [word for word in words if word not in english_stopwords]
print(filtered)

# stopwords of all languages
# print(stopwords.words())

# languages
print(stopwords.fileids())
def run(self, table):
    # concatenate all cells' contents
    cell_content = ' '.join(map(attrgetter('content'), table.cells()))
    input_string = cell_content

    # add text from additional fields
    for field_name in self.additional_fields:
        if field_name in table.table_data:
            input_string += table.table_data[field_name]

    # tokenize string and extract lower case words
    tokens = wordpunct_tokenize(input_string)
    words = [word.lower() for word in tokens]

    # iterate over all languages in nltk and match their stopwords
    # to the words in our input string.
    # larger intersection -> higher score
    languages_ratios = {}
    for language in stopwords.fileids():
        # the language's stopwords
        stopwords_set = set(stopwords.words(language))
        # our words
        words_set = set(words)
        # intersection between the sets
        common_elements = len(words_set.intersection(stopwords_set))
        if common_elements > 0:
            languages_ratios[language] = common_elements

    # get top <n> languages
    top_n_languages = sorted(
        languages_ratios.items(),
        key=itemgetter(1),
        reverse=True
    )[:self.top_n]

    # sum all language scores to normalize the individual scores
    language_score_sum = sum(map(itemgetter(1), top_n_languages))

    limit_match = len(self.limit) == 0

    # add annotations for each identified language
    for language in top_n_languages:
        language_name, language_score = language
        normalized_score = language_score / language_score_sum

        if language_name in self.limit and normalized_score >= self.limit[language_name]:
            limit_match = True

        table.annotations.append({
            'source': 'preprocessing',
            'task': 'LanguageDetection',
            'language': language_name,
            'score': normalized_score,
        })

    return limit_match
    A multi-class classifier chooses one of many possible labels. A multi-binary
    classifier chooses zero or more labels by combining multiple binary
    classifiers, 1 for each label.''')
classifier_group.add_argument('--binary', action='store_true', default=False,
    help='train a binary classifier, or a multi-binary classifier if --multi is also given')
classifier_group.add_argument('--multi', action='store_true', default=False,
    help='train a multi-class classifier, or a multi-binary classifier if --binary is also given')

feat_group = parser.add_argument_group('Feature Extraction',
    'The default is to lowercase every word, strip punctuation, and use stopwords')
feat_group.add_argument('--bigrams', action='store_true', default=False,
    help='include bigrams as features')
feat_group.add_argument('--no-lowercase', action='store_true', default=False,
    help="don't lowercase every word")
feat_group.add_argument('--filter-stopwords', default='no',
    choices=['no'] + stopwords.fileids(),
    help='language stopwords to filter, defaults to "no" to keep stopwords')
feat_group.add_argument('--punctuation', action='store_true', default=False,
    help="don't strip punctuation")

score_group = parser.add_argument_group('Feature Scoring',
    'The default is no scoring, all words are included as features')
score_group.add_argument('--score_fn', default='chi_sq',
    choices=[f for f in dir(BigramAssocMeasures) if not f.startswith('_')],
    help='scoring function for information gain and bigram collocations, defaults to chi_sq')
score_group.add_argument('--min_score', default=0, type=int,
    help='minimum score for a word to be included, default is 0 to include all words')
score_group.add_argument('--max_feats', default=0, type=int,
    help='maximum number of words to include, ordered by highest score, default is 0 to include all words')

eval_group = parser.add_argument_group('Classifier Evaluation',
from nltk.corpus import stopwords
from six import text_type

###############################################################################

# This small hack (?) helps NLTK to find its files.
NLTK_DATA.path[0:0] = [NLTK_PATH]

###############################################################################

PRECALCULATED_LANGSETS = {}
for _language in stopwords.fileids():
    stopwords_set = set(
        text_type(wrd, "utf-8") for wrd in stopwords.words(_language)
    )
    stopwords_set = (wordpunct_tokenize(word) for word in stopwords_set)
    PRECALCULATED_LANGSETS[_language] = set(chain.from_iterable(stopwords_set))

CYRILLIC_ALPHABET = text_type("ЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬЬБЮ", "utf-8")
CYRILLIC_ALPHABET = frozenset(CYRILLIC_ALPHABET + CYRILLIC_ALPHABET.lower())

###############################################################################

class Capitalization(object):
    """
feat_group = parser.add_argument_group(
    'Feature Extraction',
    'The default is to lowercase every word, strip punctuation, and use stopwords'
)

feat_group.add_argument('--ngrams', nargs='+', type=int,
                        help='use n-grams as features.')
feat_group.add_argument('--no-lowercase', action='store_true', default=False,
                        help="don't lowercase every word")
feat_group.add_argument(
    '--filter-stopwords',
    default='no',
    choices=['no'] + stopwords.fileids(),
    help='language stopwords to filter, defaults to "no" to keep stopwords')
feat_group.add_argument('--punctuation', action='store_true', default=False,
                        help="don't strip punctuation")
feat_group.add_argument(
    '--value-type',
    default='bool',
    choices=('bool', 'int', 'float'),
    help='''Data type of values in featuresets. The default is bool, which ignores
    word counts. Use int to get word and/or ngram counts.''')

score_group = parser.add_argument_group(
    'Feature Scoring',
for review in tokenized_docs:
    new_review = []
    for token in review:
        # "re" is short for regular expression; "sub" is short for substitute, i.e. replacement
        # re.sub(pattern, repl, string, count=0, flags=0)
        # x.sub()
        new_token = x.sub(u'', token)  # parts matched by the regex are replaced with the empty string (u'')
        if not new_token == u'':
            new_review.append(new_token)
    if len(new_review) == 0:  # skip empty reviews
        continue
    tokenized_docs_no_punctuation.append(new_review)

# breakpoint
print(tokenized_docs_no_punctuation)

print("Stopword lists under nltk_data\\corpora\\stopwords\\*")
lang = stopwords.fileids()
print("Languages:", lang)
stops = set(stopwords.words("english"))
print("English stopwords:", stops)

words = ["Don't", 'hesitate', 'to', 'ask', 'questions']
out = [word for word in words if word not in stops]
print(out)

# Similarity (comparison) measures; see the source in distance.py
out = edit_distance('relate', 'relation')  # dynamic programming method (y = x mapping)
print(out)

X = set([10, 20, 30, 40])
Y = set([20, 30, 60])
print(jaccard_distance(X, Y))  # set intersection method: |intersection| / |union|

# The Smith-Waterman algorithm (nlp/Smith-Waterman-Algorithm-Example.gif) comes from
# bioinformatics, where it is used to match protein or DNA sequences by finding the
# segments of two sequences that are highly similar.
def topics(df, model="lda", language=False, save=False):
    """ Run either LDA or NMF on a Dutch document.

    This is a simple implementation and only used for "fun" purposes.
    It is not so much about finding the very best topics, but topics
    that are good enough.

    Parameters:
    -----------
    df : pandas dataframe
        Pandas dataframe that contains the raw messages

    model : str, default "lda"
        Which model to use for topic modelling. Either "lda" or "nmf"
        works for now

    language : str, default False
        Language of the stopwords to remove; must be one of the languages
        available in nltk's stopwords corpus

    save : bool, default False
        Whether to append the resulting topics to results/topic_<model>.txt
    """
    if save:
        file = open(f"results/topic_{model}.txt", "a")
    else:
        file = None

    # Prepare stopwords
    try:
        stopwords = nltk_stopwords.words(language)
    except:
        languages = nltk_stopwords.fileids()
        raise Exception(
            f"Please select one of the following languages: {languages}")

    # Create Topics
    for user in df.User.unique():
        print("#" * len(user) + "########", file=file)
        print("### " + user + " ###", file=file)
        print("#" * len(user) + "########\n", file=file)

        data_samples = df[df.User == user].Message_Only_Text
        data_samples = data_samples.tolist()

        if model == "lda":
            # Extracting Features
            tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                            stop_words=stopwords)
            tf = tf_vectorizer.fit_transform(data_samples)

            # Fitting LDA
            topic_model = LatentDirichletAllocation(n_components=5, max_iter=5,
                                                    learning_method='online',
                                                    learning_offset=50.,
                                                    random_state=0)
            topic_model.fit(tf)
            feature_names = tf_vectorizer.get_feature_names()
        else:
            # NMF uses tf-idf
            tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                               stop_words=stopwords)
            tfidf = tfidf_vectorizer.fit_transform(data_samples)
            feature_names = tfidf_vectorizer.get_feature_names()

            # Run NMF
            topic_model = NMF(n_components=5, random_state=1, alpha=.1,
                              l1_ratio=.5, init='nndsvd')
            topic_model.fit(tfidf)

        print("\nTopics in {} model:".format(model), file=file)
        print_top_words(topic_model, feature_names, 7, file=file)
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 11 21:39:24 2018

@author: Ahmad
"""

import nltk
from nltk.corpus import stopwords

sw = stopwords.words("english")
sw[0]
len(sw)

stopwords_count = set(stopwords.words("english"))
len(stopwords_count)
stopwords_count

stopwords.fileids()

sw_a = stopwords.words("spanish")
sw[0]
set(stopwords.words("spanish"))

# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

import nltk
from nltk.stem.snowball import SnowballStemmer
import string

words = "Hi Everyone If you can read this message youre properly using parseOutText Please proceed to the next part of the project"

stemmer = SnowballStemmer("english")
stemmer.stem("Hi")

words_list = words.split()
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import names, stopwords, words

print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS

print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS

print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
print(names.words('female.txt'))  # doctest: +ELLIPSIS

from nltk.corpus import cmudict

print(cmudict.entries()[653:659])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# Load the entire cmudict corpus into a Python dictionary:
transcr = cmudict.dict()
print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()])
def preprocess(args):
    path = './dataset/'
    doc_df = pd.read_csv(path + 'documents.csv')
    doc_dict = dict()

    # Fill nan (float) as none (str)
    doc_df = doc_df.fillna('')

    Languages = list(stopwords.fileids())

    # Build remove terms
    remove_title = [
        '[Text]', 'Language:', '<F P=105>', '</F>', 'Article Type:BFN',
        '<F P=106>', 'Article Type:CSO', '[Excerpt]', '[Editorial Report]',
        '[passage omitted]', 'ONLY <F P=103>', '<F P=104>'
    ]
    remove_term = [
        '.', '"', '--', '\'s', '<', '>', '[', ']', '`', ',', ':', '/', '\\',
        '{', '}', '-', '(', ')'
    ]
    my_stopwords = [
        'mr', 'he\'d', 'also', 'every', 'would', 'without', 'per',
        'yesterday', 'however', 'could', 'since', 'many', 'must', 'well',
        'still', 'today', 'people', 'next'
    ]

    print('Stopwords removing processing\n')

    # Build dictionary
    for doc in doc_df.iloc:
        temp_str = doc['doc_text']

        # Choosing the fileids of stopwords, initial: english
        Lang_tmp = 'english'
        Lang_flag = False
        if ('Language: <F P=105>' in temp_str):
            Lang_flag = True
            Lang_tmp = temp_str.split('Language: <F P=105>')[1].split()[0]
            if (not (Lang_tmp.lower() in Languages)):
                Lang_flag = False

        # Removing meaningless words
        for t in remove_title:
            if (t in temp_str):
                temp_str = temp_str.replace(t, '')
        for w in remove_term:
            if (w in temp_str):
                temp_str = temp_str.replace(w, '')

        # Removing stopwords
        temp = temp_str.split()
        tmp_len = len(temp)
        for t in range(tmp_len):
            temp[t] = temp[t].lower()
        temp = Counter(temp)
        for m in my_stopwords:  # My stopwords set for all doc
            if (m in temp):
                del temp[m]
        for s in stopwords.words('english'):  # english stopwords for all doc
            if (s in temp):
                del temp[s]
        if (Lang_flag and Lang_tmp != 'English'):
            for s in stopwords.words(Lang_tmp.lower()):
                if (s in temp):
                    del temp[s]

        # Save to dict
        doc_dict[doc['doc_id']] = temp

    print('.pkl file output\n')

    # File output
    pickle_out = open(args.doc_dict_path, 'wb')
    pickle.dump(doc_dict, pickle_out)
    pickle_out.close()

    print('Done\n')

    return doc_dict
# Pulling the data. Each page contains 20 tweets
while (cnt <= lim):
    new_tweets = api.user_timeline(screen_name=handle, page=cnt)
    all_tweets.extend(new_tweets)
    cnt += 1

# Pull out only the Tweets from all the other data
op = [tweet.text.encode("utf-8") for tweet in all_tweets]

for tweets in op:
    # Language Filter
    text = wordpunct_tokenize(tweets)
    lang_count = {}
    words = [word.lower() for word in text]
    for language in stopwords.fileids():
        stop_set = set(stopwords.words(language))
        word_set = set(words)
        common = word_set.intersection(stop_set)
        lang_count[language] = len(common)
    lang = max(lang_count, key=lang_count.get)

    # Store in DB
    if (lang == "english"):
        print tweets
        saveFile = open('final.csv', 'a')
        saveFile.write(tweets)
        saveFile.write('\n')
        saveFile.close()
#!/usr/bin/python3
# coding: utf-8

from nltk.corpus import stopwords

##################################################################
## Quick test
print(type(stopwords))  # <class 'nltk.corpus.reader.wordlist.WordListCorpusReader'>
print(len(stopwords.fileids()))  # 21; 21 languages are supported, stored under ~/nltk_data/corpora/stopwords/; Chinese is currently not supported
print(stopwords.fileids())  # ['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish', 'turkish']
print(len(stopwords.words('english')))  # 179 stopwords
print(stopwords.words('english')[:5])   # ['i', 'me', 'my', 'myself', 'we']
print(stopwords.words('chinese')[:5])   # raises an error in this NLTK version, since 'chinese' is not among the fileids above
help="""the group of words that represents a single training instance, the default is to use entire files""", ) corpus_group.add_argument( "--fraction", default=1.0, type=float, help="""The fraction of the corpus to use for testing coverage""" ) feat_group = parser.add_argument_group( "Feature Extraction", "The default is to lowercase every word, strip punctuation, and use stopwords" ) feat_group.add_argument("--ngrams", nargs="+", type=int, help="use n-grams as features.") feat_group.add_argument("--no-lowercase", action="store_true", default=False, help="don't lowercase every word") feat_group.add_argument( "--filter-stopwords", default="no", choices=["no"] + stopwords.fileids(), help='language stopwords to filter, defaults to "no" to keep stopwords', ) feat_group.add_argument("--punctuation", action="store_true", default=False, help="don't strip punctuation") args = parser.parse_args() ################### ## corpus reader ## ################### reader_args = [] reader_kwargs = {} if args.cat_pattern: reader_args.append(args.cat_pattern)
try:
    from nltk.tokenize import wordpunct_tokenize  # RE-based tokenizer which splits text on whitespace and punctuation (except for underscore)
except ImportError:
    print('[!] You need to install nltk (http://nltk.org/index.html)')

test_tokens = wordpunct_tokenize(text)
test_tokens

#### stopwords #######
from nltk.corpus import stopwords

stopwords.readme().replace('\n', ' ')  # Since this is raw text, we need to replace \n's with spaces for it to be readable.

stopwords.fileids()  # Most corpora consist of a set of files, each containing a piece of text. A list of identifiers for these files is accessed via fileids().

stopwords.words('english')[:10]
len(stopwords.words('english'))

###
## We loop through the stopword lists of all languages and check how many of each
## language's stopwords our test text contains. The text is then classified as the
## language for which it has the most stopwords; a sketch completing the loop
## follows after this snippet.
language_ratios = {}

test_words = [word.lower() for word in test_tokens]  # lowercase all tokens
test_words_set = set(test_words)

for language in stopwords.fileids():
    stopwords_set = set(stopwords.words(language))  # For some languages, e.g. Russian, it would be a wise idea to tokenize the stop words by punctuation too.
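# A minimal sketch completing the classification loop above (restated in full),
# assuming the test_words_set and language_ratios variables defined in that
# snippet; the scoring logic mirrors the other detectors in this collection.
for language in stopwords.fileids():
    stopwords_set = set(stopwords.words(language))
    common_elements = test_words_set.intersection(stopwords_set)
    language_ratios[language] = len(common_elements)  # language "score"

# The text is classified as the language with the highest stopword overlap.
most_rated_language = max(language_ratios, key=language_ratios.get)
print(most_rated_language)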
from nltk.tokenize import word_tokenize
import random
from nltk.corpus import names
import nltk.data
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
import nltk
from nltk.corpus import stopwords
import nltk.corpus

# 1. Write a Python NLTK program to list down all the corpus names.
dir(nltk.corpus)
print("\nAvailable corpus names:")
print(dir(nltk.corpus))

# 2. Write a Python NLTK program to get a list of common stop words in various languages in Python.
print(stopwords.fileids())

# 3. Write a Python NLTK program to check the list of stopwords in various languages.
# From Wikipedia:
# In computing, stop words are words which are filtered out before or after processing of natural
# language data (text). Though "stop words" usually refers to the most common words in a language,
# there is no single universal list of stop words used by all natural language processing tools,
# and indeed not all tools even use such a list. Some tools specifically avoid removing these stop
# words to support phrase search.
# Any group of words can be chosen as the stop words for a given purpose. For some search engines,
# these are some of the most common, short function words, such as the, is, at, which, and on. In
# this case, stop words can cause problems when searching for phrases that include them,
# particularly in names such as "The Who", "The The", or "Take That". Other search engines remove
# some of the most common words, including lexical words such as "want", from a query in order to
# improve performance.

result = set(stopwords.words('english'))
print("List of stopwords in English:")
print(result)

print("\nList of stopwords in Arabic:")
result = set(stopwords.words('arabic'))
print(result)

print("\nList of stopwords in Azerbaijani:")
result = set(stopwords.words('azerbaijani'))
print(result)

print("\nList of stopwords in Danish:")
import nltk
from nltk.corpus import stopwords

print stopwords.fileids()

stops = set(stopwords.words('english'))
print stops

words = "Don't hesitate to ask questions".split(" ")
print [word for word in words if word not in stops]


def para_fraction(text):
    s = set(stopwords.words('english'))
    para = [w for w in text if w.lower() not in s]
    return len(para) / float(len(text))  # float() so the fraction is not truncated by integer division


print nltk.corpus.reuters.words()
print para_fraction(nltk.corpus.reuters.words())
print para_fraction(nltk.corpus.inaugural.words())
## Using default (treebank) tokenizers
para = "Hello World. It's good to see you. Thanks for buying this book."
print(sent_tokenize(para))
print(word_tokenize(para))  ## punctuation is treated as a separate word

## Alternative word tokenizer
tokenizer = WordPunctTokenizer()  ## Punctuation is a separate word
print(tokenizer.tokenize(para))

## Make your own sentence tokenizer - based on unsupervised learning
text = webtext.raw('overheard.txt')             ## Read corpus example
sent_tokenizer1 = PunktSentenceTokenizer(text)  ## Train tokenizer
sent1 = sent_tokenizer1.tokenize(text)          ## Use new tokenizer
sent = sent_tokenize(text)                      ## Old tokenizer

#### check difference between tokenizers
print("Default tokenizer:\n", sent[678])   ## Fails to tokenize sentences properly
print("Learned tokenizer:\n", sent1[678])  ## Works well

#### STOPWORDS ######################################################################
from nltk.corpus import stopwords

## Find languages
stopwords.fileids()  ## Which languages
stopwords.words('english')[1:10]

## Filtering stop words
stopset = set(stopwords.words('english'))
words = ["Can't", 'is', 'a', 'contraction']
print([word for word in words if word not in stopset])