from nltk.corpus import stopwords


def is_stopword(word):
    """Return True if ``word`` is an English or a Russian stop word."""
    # Use the word lists, not the raw corpus text: a substring test against
    # the raw file would wrongly match fragments of longer words.
    english_stopwords = stopwords.words('english')
    russian_stopwords = stopwords.words('russian')
    is_english_stopword = word in english_stopwords
    try:
        # The Russian list is Unicode, so decode byte strings first (Python 2).
        word = word.decode('utf-8')
        return is_english_stopword or word in russian_stopwords
    except UnicodeError:
        return is_english_stopword
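# A minimal usage sketch, not part of the original snippet. It assumes the
# NLTK "stopwords" corpus has already been fetched via nltk.download('stopwords');
# the sample words are illustrative only.
print(is_stopword('the'))     # English stop word -> True
print(is_stopword('python'))  # not a stop word in either language -> False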
import os
import sys
from time import time

from nltk.corpus import machado, stopwords
from nltk.stem import RSLPStemmer
from nltk.stem.snowball import PortugueseStemmer


def create_indexes():
    # Portuguese stop words; drop the empty string left after the final newline.
    stopwords_pt = stopwords.raw('portuguese').decode('utf-8').split('\n')[:-1]
    snowball_stemmer = PortugueseStemmer()
    rslp_stemmer = RSLPStemmer()
    # ``Index`` is the project's own inverted-index class. "with-stopwords"
    # indexes filter nothing; "without-stopwords" indexes filter out the
    # Portuguese stop word list.
    indexes = {
        'no-stemmer-with-stopwords': Index(stemmer=None, stopwords=[]),
        'no-stemmer-without-stopwords': Index(stemmer=None, stopwords=stopwords_pt),
        'snowball-with-stopwords': Index(stemmer=snowball_stemmer, stopwords=[]),
        'snowball-without-stopwords': Index(stemmer=snowball_stemmer, stopwords=stopwords_pt),
        'rslp-with-stopwords': Index(stemmer=rslp_stemmer, stopwords=[]),
        'rslp-without-stopwords': Index(stemmer=rslp_stemmer, stopwords=stopwords_pt),
    }
    for index_name, index in indexes.iteritems():
        index.name = index_name

    # Add every document of the Machado de Assis corpus to every index.
    filenames = machado.fileids()
    index_count = len(indexes)
    total_iterations = len(filenames) * index_count
    counter = 1
    for filename in filenames:
        contents = machado.raw(filename)
        for index_name, index in indexes.iteritems():
            info = '[{:05d}/{:05d}] Adding document "{}" to index "{}" ... '\
                   .format(counter, total_iterations, filename, index_name)
            sys.stdout.write(info)
            start = time()
            index.add_document(filename, contents)
            end = time()
            sys.stdout.write('OK ({:09.5f}s)\n'.format(end - start))
            counter += 1

    # Serialize each index to disk.
    if not os.path.exists('data'):
        os.mkdir('data')
    counter = 1
    for index_name, index in indexes.iteritems():
        info = '[{:02d}/{:02d}] Dumping index "{}" ... '.format(counter, index_count,
                                                                index_name)
        sys.stdout.write(info)
        start = time()
        index.dump('data/{}.pickle'.format(index_name))
        end = time()
        sys.stdout.write('OK ({:09.5f}s)\n'.format(end - start))
        counter += 1
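# An illustrative aside, not part of the original script: compare the two
# Portuguese stemmers used above on a few sample words, which is the reason the
# script builds separate "snowball" and "rslp" index variants. The RSLP data is
# fetched with nltk.download('rslp'); the sample words are arbitrary.
snowball = PortugueseStemmer()
rslp = RSLPStemmer()
for sample in [u'livros', u'meninas', u'correndo']:
    print('{}: snowball={}, rslp={}'.format(sample, snowball.stem(sample),
                                            rslp.stem(sample)))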
import string

from nltk import WordNetLemmatizer, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

wl = WordNetLemmatizer()
sw = stopwords.words("english")
vocab = {}


def get_tag(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def get_lem(text):
    """Lemmatize a list of (word, treebank_tag) pairs into a single string."""
    t = ""
    for m in text:
        t += wl.lemmatize(m[0].lower(), get_tag(m[1])) + " "
    return t.strip()


def preprocess(text):
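    # The body of preprocess() is truncated in the original snippet; the lines
    # below are only a plausible sketch that wires together the helpers defined
    # above: tokenize, drop stop words and punctuation, POS-tag, then lemmatize.
    tokens = word_tokenize(text)
    tokens = [t for t in tokens
              if t.lower() not in sw and t not in string.punctuation]
    return get_lem(pos_tag(tokens))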
"""
Idea I: Let us count the stop words.
Idea II: Let us use word bigrams.
"""
# Idea I
from nltk.tokenize import wordpunct_tokenize

sentence = ("Zalando SE is a European e-commerce company based in Berlin, "
            "Germany. The company follows a platform approach, offering "
            "Fashion and Lifestyle products to customers in 17 European "
            "markets. Zalando was founded in Germany in 2008. Swedish "
            "company Kinnevik is the largest owner with 32%.")
tokens = wordpunct_tokenize(sentence)
print(tokens)

# Explore the stop word corpus
from nltk.corpus import stopwords

print(stopwords.readme().replace("\n", " "))

# German stop words
print(stopwords.raw("german").replace("\n", " "))

# How many stop words for English and German?
print(len(stopwords.words(["english", "german"])))

# Classify the language by counting stop words
language_ratios = {}
test_words = [word.lower() for word in tokens]  # tokens of the sentence above
test_words_set = set(test_words)
for language in stopwords.fileids():
    # For some languages it would be wise to tokenize the stop words by punctuation too.
    stopwords_set = set(stopwords.words(language))
    common_elements = test_words_set.intersection(stopwords_set)
    language_ratios[language] = len(common_elements)
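# A small hedged follow-up, not in the original snippet: turn the per-language
# stop word counts gathered above into a language guess for the sample sentence.
most_likely_language = max(language_ratios, key=language_ratios.get)
print(language_ratios)
print(most_likely_language)  # expected to be "english" here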