def prelim(filename):
    """Tokenize, deduplicate and lemmatize the text in *filename*, then classify it.

    Reads the whole file, decodes it leniently, splits it into word tokens,
    removes duplicate tokens, lemmatizes each token as a verb, and returns
    whatever class ``code_for_classification.findClass`` predicts for the
    resulting token list.
    """
    ### Initialize tokenizer, stopset and lemmatizer ###
    # The \w+ regex drops punctuation as a side effect of matching only
    # word characters (nltk.word_tokenize would be an alternative).
    tokenizer = RegexpTokenizer(r'\w+')
    # NOTE(review): stopset is built but never applied below -- the
    # stop-word-filtering variants were deliberately left disabled by the
    # original author; kept so a missing stopwords corpus still fails here.
    stopset = set(stopwords.words('english'))
    lemm = WordNetLemmatizer()

    # Use a context manager so the file handle is closed deterministically
    # (the original leaked the handle until garbage collection).
    with open(filename, 'r') as f:
        text = f.read()
    text = unicode(text, errors='replace')  # Python 2: decode bytes, replacing undecodable chars

    tokens = tokenizer.tokenize(text)
    # Deduplicate: classification appears to be based on token presence,
    # not frequency (the original author flagged this as a doubt as well).
    tokens = list(set(tokens))
    # Lemmatize every token as a verb; stop-word removal stays disabled to
    # preserve the original behavior.
    tokens = [lemm.lemmatize(w, 'v') for w in tokens]

    import code_for_classification
    return code_for_classification.findClass(tokens)
import code_for_classification
import string
from sys import argv

# Usage: script <dataset> <gold-label-output> <predicted-label-output>
# Each dataset line is "<label> <word> <word> ...": column 0 is the gold
# label, the remaining columns are the document to classify.
script, filename, correctoutput, myoutput = argv

# Read the dataset up front; the context manager closes the handle even if
# a later step raises (the original leaked all three file handles on error).
with open(filename, 'r') as target:
    lines = target.readlines()

# Write predicted and gold labels to parallel files, one label per line,
# so the two outputs can be diffed/scored afterwards.
with open(correctoutput, 'w') as target_correct, open(myoutput, 'w') as target_my:
    for line in lines:
        words = line.split()
        target_my.write(code_for_classification.findClass(words[1:]) + "\n")
        target_correct.write(words[0] + "\n")
import code_for_classification
import string
from sys import argv

# Usage: script <file-to-classify>
# BUG FIX: the original did `sys, argv = argv`, which rebound the names
# `sys` and `argv` to the script name and the input path -- shadowing the
# stdlib module name and making the code actively misleading. Unpack into
# honest names instead.
script, filename = argv

# Context manager closes the handle deterministically (original leaked it).
with open(filename, 'r') as f:
    text = f.read()

### Initialize tokenizer, stopset and stemmer ###
# The \w+ regex drops punctuation as a side effect of matching only word
# characters (nltk.word_tokenize would be an alternative).
tokenizer = RegexpTokenizer(r'\w+')
stopset = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()

tokens = tokenizer.tokenize(text)
# Deduplicate: classification appears to be based on token presence,
# not frequency.
tokens = list(set(tokens))
# Stem each surviving token and drop English stop words.
tokens = [porter_stemmer.stem(w) for w in tokens if w not in stopset]

# Single-argument print(...) emits identical output under Python 2 and
# keeps the file parseable under Python 3.
print(tokens)
print(code_for_classification.findClass(tokens))