from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


class StemmerTokenizer:
    """Callable tokenizer: lowercase, tokenize, then Porter-stem each token."""

    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, string):
        tokens = word_tokenize(string.lower())
        return [self.stemmer.stem(t) for t in tokens]
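# A minimal usage sketch (assumes NLTK's 'punkt' tokenizer data is available);
# a callable like this can also be passed as the tokenizer of a vectorizer.
tok = StemmerTokenizer()
print(tok("The runners were running happily"))
# roughly: ['the', 'runner', 'were', 'run', 'happili']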
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


def preprocess(text):
    """Tokenize, lowercase, and Porter-stem, dropping stopwords, punctuation, and digits."""
    stemmer = PorterStemmer()
    stop = set(stopwords.words("english"))
    result = word_tokenize(text)
    result = [stemmer.stem(word.lower())
              for word in result
              if word not in stop
              and word not in string.punctuation
              and word not in string.digits]
    return result
from collections import defaultdict

import nltk
from nltk.stem import PorterStemmer


def dialogue_act_features(post):
    words = nltk.word_tokenize(post)
    sentences = nltk.sent_tokenize(post)
    features = {
        'word_diversity': len(words) / len(set(words)),
    }

    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(w) for w in words]

    # words
    for word in set(stemmed_words):
        features['contains(%s)' % word.lower()] = True

    # check for presence/absence of specific words
    check_words = [
        'who', 'what', 'where', 'why', 'how',  # question words
        'love', 'hate', 'despis',              # emotional words (?)
    ]
    for word in check_words:
        features['contains(%s)' % word] = word in stemmed_words

    # punctuation
    for punctuation in ['?', '!', '!!', '?!', '"', '...', '.']:
        features['punctuation_count(%s)' % punctuation] = post.count(punctuation)

    # skip parts of speech for now - slow, not helping much
    return features

    # get counts for parts of speech
    pos_count = defaultdict(int)
    for sentence in sentences:
        # tokenize the sentence into words and tag parts of speech
        sentence_words = nltk.word_tokenize(sentence)
        # - using the nltk parts-of-speech tagger for now
        #   (other options may be faster/more accurate)
        pos_sentence = nltk.pos_tag(sentence_words)
        for word, pos in pos_sentence:
            pos_count['pos_%s' % pos] += 1

    # include final counts by part of speech in the features
    features.update(pos_count)
    return features
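# Usage sketch: the returned dict can feed an NLTK classifier directly;
# `labeled_posts` is a hypothetical list of (post_text, act_label) pairs.
print(dialogue_act_features("Who would love this? I hate waiting..."))
# featuresets = [(dialogue_act_features(p), label) for p, label in labeled_posts]
# classifier = nltk.NaiveBayesClassifier.train(featuresets)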
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


class PreProcess:
    def __init__(self):
        self.tokenizer = word_tokenize
        self.stemmer = PorterStemmer()
        self.punct = string.punctuation
        self.digits = string.digits
        self.stop = set(stopwords.words("english"))

    def process_sent(self, snt):
        tokens = self.tokenizer(snt)
        return [self.stemmer.stem(wrd.lower())
                for wrd in tokens
                if wrd not in self.stop
                and wrd not in self.digits
                and wrd not in self.punct]

    def process(self, snts):
        return [self.process_sent(snt) for snt in snts]
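# Minimal usage sketch (assumes the NLTK stopword and 'punkt' data are installed):
pp = PreProcess()
print(pp.process(["The cats are running fast!", "Dogs barked at 3 strangers."]))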
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer


# (method excerpt; self.stopword is defined elsewhere in the class)
def stem(self, input_text):
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    stemmed_text = []
    lemmatizer = WordNetLemmatizer()  # created but not used below
    stemmer = PorterStemmer()

    text = tokenizer.tokenize(str(input_text))
    filtered_text = self.stopword(text)
    for word in filtered_text:
        if word.isalpha():
            if len(word) > 4:
                stemmed_text.append(stemmer.stem(word).lower())
            else:
                stemmed_text.append(word.lower())

    # drop very short tokens; filtering into a new list avoids the bug of
    # removing items from a list while iterating over it
    stemmed_text = [word for word in stemmed_text if len(word) >= 3]
    return stemmed_text
""" General approach: serach for control-type structures which may be ambigious (with raising) then search for those verbs to see if they exist in "There[ex] VERB" contexts e.g we find "John seems to be beside himself today" so we search for "/[tT]here/ . (/VB/ < /^(seem)/)" if this returns any results, "seem" must be a raising verb """ from pdb import set_trace import runTregex as tx from nltk.stem import PorterStemmer ps = PorterStemmer() treebank_dir = "/home/chase/CompLing/stanford-tregex-2012-03-09/treebank_3/parsed/mrg/wsj/" unfiltered = set() for t in trees: unfiltered.add(ps.stem_word(t.matchTree.leaves()[0]).lower()) # this takes forever and isn't really too effective... for word in unfiltered: pat = "(/[Tt]here/ > EX) . /^%s/"%word reload(tx) trees = tx.Treebank(treebank_dir, pat) trees.run() if len(trees) > 0: print word
# (excerpt from inside a loop over the lines of a Wikipedia XML dump; `stem`
#  is a PorterStemmer instance, `pattern` is a regex compiled earlier, and
#  `list` accumulates the lines of the current <text> element)
        True_iD = True
        continue
    if inPage and line.find("<text") != -1:
        inText = True
        continue
    if inPage and True_iD and line.find("<id>") != -1:
        iD.append(line[len("<id>"):-len("</id>")])
        True_iD = False
    if inPage and line.find("/text") != -1:
        inText = False
        text = ' '.join(list)

        # tokenizing the text of each XML page
        temp = text.decode("utf-8", "ignore")
        temp = temp.replace(u'\ufeff', ' ')
        temp_1 = re.sub(pattern, " ", temp)
        temp_1 = temp_1.lower()

        res = []
        for x in temp_1.split():
            if x not in stopwords.words('english'):
                res.append(x)

        clean_text = " ".join(stem.stem(word) for word in res)
        tokens = nltk.word_tokenize(clean_text)
        cnt = Counter(tokens)
        print("[[%s]]\t[[%.0f]]" % (dict(cnt), int(iD[0])))
        list = []
        continue
import nltk
from nltk.corpus import state_union, stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

# example_sentence is assumed to be defined earlier
stop_words = set(stopwords.words("english"))
words = word_tokenize(example_sentence)
filter_sentence = [w for w in words if w not in stop_words]
print(filter_sentence)

##### STEMMER EXAMPLE #####
ps = PorterStemmer()
example_words = ["pythone", "pythoner", "pythoning", "pythoned", "pythonly"]
for w in example_words:
    print(ps.stem(w))

##### SENTENCE TOKENIZER EXAMPLE #####
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for w in tokenized:
            words = nltk.word_tokenize(w)
    except Exception as e:
        print(str(e))
from nltk.stem import PorterStemmer


def __stem_tokens(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]
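# Usage sketch; at module level the leading double underscore only affects
# `from module import *`, so the helper can be called directly:
print(__stem_tokens(["connected", "connecting", "connection"]))
# -> ['connect', 'connect', 'connect']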
import ast
import sys

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def blah(str_inp):
    print str_inp
    reload(sys)
    sys.setdefaultencoding('utf-8')

    porter = PorterStemmer()
    stop_words = set(stopwords.words("english"))  # load stopwords

    tag = open("hashtags.txt", "r")
    text = open("tweet_text_lower.txt", "r")

    from collections import defaultdict
    distinct_terms = defaultdict(int)
    hash_tags = defaultdict(int)

    count = -1
    ht_index_dict = {}
    ht_index_list = []
    x_size = 0
    count = 0

    # build an index of hashtag combinations (one label per tweet)
    with open("hashtags.txt", "r") as f:
        x = []
        for line in f:
            x.append(ast.literal_eval(line.strip()))
        for ht in x:
            x_size += 1
            ht_str = ""
            for ele in ht:
                ht_str += str(ele)
            if ht_str in ht_index_dict:
                ht_index_list.append(ht_index_dict[ht_str])
            else:
                ht_index_dict[ht_str] = count
                count += 1
                ht_index_list.append(ht_index_dict[ht_str])
            if (x_size == 1197):
                break
        # print ("x_size" + str(x_size))
    # print x
    # count = -1
    # for a in x:
    #     for t in a:
    #         if t.lower() not in hash_tags:
    #             count += 1
    #             hash_tags[t.lower()] = count
    tagsize = count

    # dictionary of terms
    count = -1
    with open("tweet_text_lower.txt", "r") as f:
        for rec in f:
            for word in rec.split():
                # print word
                if word[0] != '#':
                    if word.strip() not in stop_words:
                        # print word.strip()
                        if porter.stem(word.strip()) not in distinct_terms:
                            count += 1
                            distinct_terms[porter.stem(word.strip())] = count
        # print f
    dictsize = count
    # print len(distinct_terms), count
    # dictionary of hashtags
    # print len(hash_tags), count

    count = 0
    count2 = 0
    line_num = 0
    with open("tweet_text_lower.txt", "r") as f:
        for i, l in enumerate(f):
            pass
        line_num = i + 1

    mat = [[0] * (dictsize + 1) for i in range(1197)]  # replace by line_num + 1
    # mat = [[0]]
    # print mat
    # print text

    # term-frequency matrix: one row per tweet, one column per stemmed term
    for (line, i) in zip(text, range(1197)):
        # print line
        for word in line.split():
            # print word
            if (word[0] != '#'):
                if word.strip() not in stop_words:
                    # for tags in ast.literal_eval(hash_.strip()):
                    #     print distinct_terms[word] + hash_tags[tags]
                    mat[i][distinct_terms[porter.stem(word.lower())]] += 1
    # print mat

    # document frequency of each term
    document_freq = [0] * (dictsize + 1)
    # print document_freq
    for i in distinct_terms:
        for j in range(1197):
            if mat[j][distinct_terms[i]] >= 1:
                document_freq[distinct_terms[i]] += 1
    # print line_num
    # print document_freq
    # for t in distinct_terms:
    #     if distinct_terms[t] == 1:
    #         print t
    # print tagsize
    # print len(distinct_terms)

    # TF-IDF weighting (float constant so the IDF factor is not truncated
    # by integer division)
    import math
    for i in distinct_terms:
        for j in range(1197):
            mat[j][distinct_terms[i]] = math.log(
                1 + mat[j][distinct_terms[i]]) * (math.log(
                    1197.0 / (document_freq[distinct_terms[i]] + 1)))
            # print mat[distinct_terms[i]][hash_tags[j]]

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import make_multilabel_classification
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import SVC
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.decomposition import PCA
    from sklearn.cross_decomposition import CCA
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    from scipy import spatial
    from sklearn.metrics.pairwise import cosine_similarity

    # print line_num
    clf = KNeighborsClassifier(n_neighbors=4)
    # mat = CCA(n_components=2).fit(mat, x).transform(mat)
    # print feature
    # clf = OneVsRestClassifier(SVC(kernel='poly'))
    # mat = np.array(mat)
    # print len(x)
    # print len(mat)
    # print x[0]
    # test = ["not"] * 1197
    # print test
    clf.fit(mat, ht_index_list)
    # print type(mat), type(Y)

    # build the TF-IDF vector for the query string
    query = [0] * (dictsize + 1)
    for word in str_inp.split():
        if word[0] != '#':
            if word not in stop_words:
                if porter.stem(word) in distinct_terms:
                    query[distinct_terms[porter.stem(word)]] += 1
    for word in distinct_terms:
        query[distinct_terms[word]] = math.log(
            1 + query[distinct_terms[word]]) * (math.log(
                1197.0 / (document_freq[distinct_terms[word]] + 1)))

    res_index = clf.predict([query])  # predict expects a 2-D array (one row per sample)
    # print res_index
    for key in ht_index_dict:
        if (ht_index_dict[key] == res_index[0]):
            print key
            break
    return key
import csv
import json
import sys

from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier


class TitleSim:

    def __init__(self, features_conf, features_deleted):
        print 'Start initialization'

        # initial model training
        features = features_deleted + features_conf
        target = [0 for x in range(len(features_deleted))] + \
                 [1 for x in range(len(features_conf))]
        self.classifier = RandomForestClassifier(n_estimators=50,
                                                 verbose=2,
                                                 n_jobs=1,
                                                 min_samples_split=10,
                                                 random_state=1)
        self.classifier.fit(features, target)

        # loading relational data which will be used
        paths = json.loads(open("SETTINGS.json").read())
        paper_doc = paths["paper_doc"]
        self.paper = dict([(entry[0], entry[1])
                           for entry in csv.reader(open(paper_doc))])

        # loading setting file
        self.paths = json.loads(open("SETTINGS.json").read())

        # loading word map of titles
        self.wordmap = self.load_titlemap()

        # do other initializations
        self.stemmer = PorterStemmer()
        print 'End initialization'

    def label_predict(self, fea_dict):
        # fea_dict is a dictionary whose key is 'user id'
        prob_dict = {}
        for key in fea_dict:
            features = [feature[1:] for feature in fea_dict[key]]
            predictions = self.classifier.predict_proba(features)[:, 1]
            prob_dict[key] = [(item[0], prob)
                              for item, prob in zip(fea_dict[key], predictions)]
        return prob_dict

    def load_titlemap(self):
        return dict([(entry[0], entry[1]) for entry in
                     csv.reader(open(self.paths["title_wordmap"]))])

    def calsim(self, author_doc, pairs):
        # calculate the similarity between titles
        title_features = []
        for pair in pairs:
            if pair[0] not in author_doc:
                print 'Key error.'
                sys.exit(1)
            title_features.append(self.calpairsim(author_doc[pair[0]], pair[1]))
        return title_features

    def calpairsim(self, doclist, target_doc):
        author_words = {}
        for doc in doclist:
            words = self.paper[doc].lower().split(' ')
            for word in words:
                stemmed_word = self.stemmer.stem(word)
                if stemmed_word in self.wordmap:
                    if stemmed_word in author_words:
                        author_words[stemmed_word] += 1
                    else:
                        author_words[stemmed_word] = 1

        doc_words = {}
        words = self.paper[target_doc].lower().split(' ')
        for word in words:
            stemmed_word = self.stemmer.stem(word)
            if stemmed_word in self.wordmap:
                if stemmed_word in doc_words:
                    doc_words[stemmed_word] += 1
                else:
                    doc_words[stemmed_word] = 1

        # number of common words
        comm_num = len(set(author_words.keys()) & set(doc_words.keys()))

        # pearson coefficient
        if (len(set(author_words.keys())) + len(set(doc_words.keys()))) != 0:
            pearson = comm_num * 1.0 / (len(set(author_words.keys())) +
                                        len(set(doc_words.keys())))
        else:
            pearson = 0.0
        return [comm_num, pearson]