Code example #1
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import code_for_classification


def prelim(filename):
    ### Initialize tokenizer, stopset and lemmatizer ###
    # This regex drops punctuation as well; alternatively, word_tokenize could have been used.
    tokenizer = RegexpTokenizer(r'\w+')
    stopset = set(stopwords.words('english'))
    lemm = WordNetLemmatizer()
    ##################################################
    # Read the file, replacing any undecodable bytes.
    with open(filename, 'r', errors='replace') as f:
        text = f.read()

    tokens = tokenizer.tokenize(text)  # tokenize the text

    tokens = list(set(tokens))  # deduplicate; doubt about this step in the algorithm

    # Stemming (PorterStemmer) plus stop-word removal was tried as an alternative:
    # tokens = [porter_stemmer.stem(w) for w in tokens if w not in stopset]
    tokens = [lemm.lemmatize(w, 'v') for w in tokens]  # lemmatize each token as a verb

    return code_for_classification.findClass(tokens)
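
The code_for_classification module itself is not shown in these examples. A minimal hypothetical stub of findClass, assuming it maps a token list to a class-label string (the label names and keyword sets below are invented purely for illustration), would let the examples run end to end:

# Hypothetical stand-in for the unshown code_for_classification module.
# Assumes findClass(tokens) returns a class-label string; here it simply
# picks the label whose (invented) keyword set overlaps the tokens most.
LABELS = {
    'sports': {'game', 'team', 'score'},
    'politics': {'vote', 'election', 'party'},
}

def findClass(tokens):
    token_set = set(tokens)
    return max(LABELS, key=lambda label: len(LABELS[label] & token_set))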
Code example #2
import code_for_classification
from sys import argv

script, filename, correctoutput, myoutput = argv

# Each input line holds the true label followed by the words of the document.
with open(filename, 'r') as target:
    lines = target.readlines()

target_correct = open(correctoutput, 'w')
target_my = open(myoutput, 'w')

for line in lines:
    words = line.split()
    # Classify everything after the label and record the prediction.
    target_my.write(code_for_classification.findClass(words[1:]) + "\n")
    # Record the true label (the first token on the line).
    target_correct.write(words[0] + "\n")

target_correct.close()
target_my.close()
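
Assuming this script is saved as evaluate.py (the file names here are hypothetical), it would be invoked with the labeled input file plus the two output paths:

python evaluate.py labeled_input.txt correct_labels.txt predicted_labels.txt

The two resulting label files can then be compared line by line, for example:

# Hypothetical follow-up: compare the two output files to measure accuracy.
with open('correct_labels.txt') as f1, open('predicted_labels.txt') as f2:
    pairs = [(a.strip(), b.strip()) for a, b in zip(f1, f2)]
print(sum(1 for a, b in pairs if a == b) / len(pairs))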

Code example #3
import code_for_classification
from sys import argv

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

script, filename = argv  # file to be classified is given on the command line
with open(filename, 'r', errors='replace') as f:
    text = f.read()

### Initialize tokenizer, stopset and stemmer ###
# This regex drops punctuation as well; alternatively, word_tokenize could have been used.
tokenizer = RegexpTokenizer(r'\w+')
stopset = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()
##################################################

tokens = tokenizer.tokenize(text)  # tokenize the text

tokens = list(set(tokens))  # deduplicate; doubt about this step in the algorithm

# Lemmatization (WordNetLemmatizer) was tried as an alternative:
# tokens = [lemm.lemmatize(w, 'v') for w in tokens]
tokens = [porter_stemmer.stem(w) for w in tokens if w not in stopset]  # stem and remove stop words

print(tokens)
print(code_for_classification.findClass(tokens))
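
Assuming this script is saved as classify_file.py (a hypothetical name), it would be run as:

python classify_file.py document.txt

Note that example #1 lemmatizes without removing stop words, while this script stems and removes stop words; whichever preprocessing is chosen at classification time should match the preprocessing used when the model behind code_for_classification.findClass was built, or the token forms will not line up with the model's vocabulary.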