#!/usr/bin/env python2.7
import sys
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import common

# English stopwords and stemmer used to normalise natural-language text.
stopwords_set = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Known languages and their keyword set, loaded via the repo's common helpers.
langs = common.read_langs('langs.txt')
keywords_set = common.read_keywords(langs)


def tokenize(text):
    """Tokenize natural-language text: lowercase, drop short, non-alphabetic,
    and stopword tokens, then stem what remains."""
    words = [word.lower() for word in nltk.word_tokenize(text)]
    words = [w for w in words if len(w) >= 3]
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if w not in stopwords_set]
    words = [stemmer.stem(w) for w in words]
    return words


def tokenize_code(code):
    """Tokenize source code: lowercase the tokens and keep only those that
    appear in the known-keyword set."""
    words = [word.lower() for word in nltk.word_tokenize(code)]
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if w in keywords_set]
    return words
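

# Illustrative usage sketch (not part of the original script): runs the two
# tokenizers on hand-written sample strings. Assumes langs.txt and whatever
# keyword files common.read_keywords() expects are present alongside this file.
if __name__ == '__main__':
    sample_text = "Parsing natural language questions about programming languages"
    sample_code = "def add(a, b): return a + b"
    print(tokenize(sample_text))
    print(tokenize_code(sample_code))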