Example #1
#!/usr/bin/env python2.7

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import common

# Resources initialized once at import time.
stopwords_set = set(stopwords.words('english'))  # English stopword list
stemmer = SnowballStemmer('english')             # Snowball stemmer for English
langs = common.read_langs('langs.txt')           # language names read from langs.txt
keywords_set = common.read_keywords(langs)       # keyword set for those languages

def tokenize(text):
    """Tokenize natural-language text: lowercase, filter, and stem."""
    words = [word.lower() for word in word_tokenize(text)]
    words = [w for w in words if len(w) >= 3]              # drop very short tokens
    words = [w for w in words if w.isalpha()]              # keep purely alphabetic tokens
    words = [w for w in words if w not in stopwords_set]   # drop English stopwords
    words = [stemmer.stem(w) for w in words]               # reduce each word to its stem
    return words

def tokenize_code(code):
    """Tokenize source code, keeping only known language keywords."""
    words = [word.lower() for word in word_tokenize(code)]
    words = [w for w in words if w.isalpha()]
    words = [w for w in words if w in keywords_set]  # keep recognized keywords only
    return words
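
For reference, a minimal usage sketch. It assumes the NLTK 'punkt' and 'stopwords' data have been downloaded (via nltk.download('punkt') and nltk.download('stopwords')) and that langs.txt and the common module are available as above; the sample inputs and expected outputs are illustrative only.

if __name__ == '__main__':
    # Natural-language text: short tokens and stopwords are dropped, the rest stemmed.
    print(tokenize("Parsing natural language is harder than it looks."))
    # e.g. ['pars', 'natur', 'languag', 'harder', 'look']

    # Source code: only tokens present in keywords_set survive.
    print(tokenize_code("for x in items: return x"))
    # e.g. ['for', 'in', 'return'], depending on the contents of keywords_set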