def process_query(queryt, dictionaryword, dictionarytitleq, dictionaryc):
    querydt = queryt.split(',')
    querytt = None
    query = None
    queryc = None
    for i in querydt:
        if 'T:' in i:
            querytt = i.split(':')[1]
        elif 'B:' in i:
            query = i.split(':')[1]
        elif 'C:' in i:
            queryc = i.split(':')[1]
        else:
            query = i
            querytt = i
    if query is not None:
        queryterms = re.split("[^a-zA-Z]+", query)
        for word in queryterms:
            word = word.lower()
            if word not in stopword.stopword:
                word = stem(word)
                if word not in dictionaryword:
                    dictionaryword[word] = 1
                else:
                    dictionaryword[word] += 1
    if querytt is not None:
        queryterms = re.split("[^a-zA-Z]+", querytt)
        for word in queryterms:
            word = word.lower()
            if word not in stopword.stopword:
                word = stem(word)
                if word not in dictionarytitleq:
                    dictionarytitleq[word] = 1
                else:
                    dictionarytitleq[word] += 1
    if queryc is not None:
        queryterms = re.split("[^a-zA-Z]+", queryc)
        for word in queryterms:
            word = word.lower()
            if word not in stopword.stopword:
                word = stem(word)
                if word not in dictionaryc:  # was "not in dictionarytitleq", which could KeyError on the += branch
                    dictionaryc[word] = 1
                else:
                    dictionaryc[word] += 1
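# A minimal usage sketch for process_query above: it assumes the comma-separated
# "T:" / "B:" / "C:" field convention seen in the parser, and that re, stem and
# stopword.stopword are available in this module (all of that is inferred, not
# confirmed by the original source).
body_counts, title_counts, category_counts = {}, {}, {}
process_query("T:information retrieval,B:porter stemming algorithm,C:nlp",
              body_counts, title_counts, category_counts)
print(body_counts)      # stemmed body terms, e.g. {'porter': 1, 'stem': 1, 'algorithm': 1}
print(title_counts)     # stemmed title terms with their counts
print(category_counts)  # stemmed category terms with their counts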
def createDocAndTokenHashes_WithStemming():
    print("Creating Document Hash and Token Hash with stemming....")
    files = os.listdir("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\ap89_collection")
    doc_id = 1
    token_increment = 1
    doc_hash = dict()
    token_hash = dict()
    for file in files:
        with open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\ap89_collection\\" + file) as f:
            doc_file = f.read()
        docs = re.findall(r'<DOC>(.*?)</DOC>', doc_file, re.DOTALL)
        if docs:
            for doc in docs:
                li_doc_no = re.findall(r'<DOCNO>(.*?)</DOCNO>', doc)
                li_texts = re.findall(r'<TEXT>(.*?)</TEXT>', doc, re.DOTALL)
                doc_no = ''.join(map(str, li_doc_no))
                texts = ''.join(map(str, li_texts))
                dlen = 0
                for m in re.finditer(r'\w+(\.?\w+)*', texts.lower()):
                    token_noStem = m.group(0)
                    token = stem(token_noStem)
                    dlen += 1
                    if token not in token_hash.keys():
                        token_hash[token] = token_increment
                        token_increment += 1
                doc_hash[doc_no] = (doc_id, dlen)
                doc_id += 1
    print("Docs= " + str(doc_id - 1))
    print("Tokens with stemming= " + str(token_increment - 1))
    pickle.dump(doc_hash, open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Doc_Hash_withStemming", "wb"))
    pickle.dump(token_hash, open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Token_Hash_withStemming", "wb"))
def get_feature_list():
    feature_list = set()
    for line in open("./../data/featurelist.txt"):
        word = line.strip()
        feature_list.add(stem(word.lower()))
    return feature_list
def text_processing(texts):
    if isinstance(texts, list):
        clean = []
        for st in texts:
            clean.extend([
                stem(w) for w in st.casefold().translate(translator).split()
                if w not in stop_words
            ])
        return clean
    clean = [
        stem(w) for w in texts.casefold().translate(translator).split()
        if w not in stop_words
    ]
    return clean
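# text_processing relies on module-level names translator, stop_words and stem.
# The values below are assumptions for illustration only (Python 3 str.maketrans,
# a toy stopword set, and the Porter stemmer from the stemming package).
import string
from stemming.porter import stem

translator = str.maketrans('', '', string.punctuation)  # strip punctuation after casefolding
stop_words = {'the', 'a', 'an', 'and', 'of', 'to'}

print(text_processing("The Porter stemmer, running quickly!"))
# e.g. ['porter', 'stemmer', 'run', 'quickli']
print(text_processing(["first document.", "second document!"]))  # a list input is flattened into one token list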
def getP(self, review):
    words = re.split('[^A-Za-z0-9_\'\"?!]+', review)
    words = filter(None, words)
    words = [stem(word.lower()) for word in words]
    words = set(words)
    pos = (0.0 + self.pos_doc) / (self.pos_doc + self.neg_doc)
    for word in words:
        if len(word) == 1 and word != '?' and word != '!':
            continue
        if word in self.positive:
            pos *= self.positive[word]
        else:
            pos *= (1.0 / self.pos_doc)
    neg = (0.0 + self.neg_doc) / (self.pos_doc + self.neg_doc)
    for word in words:
        if len(word) == 1 and word != '?' and word != '!':
            continue
        if word in self.negative:
            neg *= self.negative[word]
        else:
            neg *= (1.0 / self.neg_doc)
    print "pos:" + str(pos)
    print "neg:" + str(neg)
    if pos > neg:
        print 'pos'
        return 1
    else:
        print 'neg'
        return 0
def read(self, file_name, timelength):
    with open(file_name, 'rb') as f:
        data = json.load(f)
    vocabulary = {}
    tempLine = []
    lineno = 0
    for line in data:
        temp = {}
        cmnt = re.split(r'[^A-Za-z]+', line["message"])
        words = []
        for word in cmnt:
            word = word.lower()
            if word not in self.stopWords:
                word = stem(word)
                words.append(word)
                if word not in vocabulary:
                    vocabulary[word] = 0
        if len(words) != 0:
            temp["time"] = line["time"]
            temp["lineno"] = lineno
            temp["text"] = words
            tempLine.append(temp)
        lineno += 1
    lines = sorted(tempLine, key=lambda e: e.__getitem__('time'))
    self.store(lines, timelength)
    return lines, timelength, vocabulary
def getP(self, review):
    words = re.split('[^A-Za-z0-9_\']+', review)
    words = filter(None, words)
    words = [stem(word.lower()) for word in words]
    words = set(words)
    pos = (0.0 + self.pos_count) / (self.pos_count + self.neg_count)
    for word in words:
        if word in self.positive:
            pos *= self.positive[word]
        else:
            pos *= (1.0 / self.pos_count)
    neg = (0.0 + self.neg_count) / (self.pos_count + self.neg_count)
    for word in words:
        if word in self.negative:
            neg *= self.negative[word]
        else:
            neg *= (1.0 / self.neg_count)
    print "pos:" + str(pos)
    print "neg:" + str(neg)
    if pos > neg:
        return 1
    else:
        return 0
def build_queries():
    file = "query_desc.51-100.short.txt"
    with open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\" + file) as f:
        queries = f.readlines()
    q_dict = dict()
    for query in queries:
        new_query = query.split(' ')
        li_query_terms = []
        for m in re.finditer(r'\w+(\.?\w+)*', new_query[1].lower()):
            li_query_terms.append(m.group(0))
        edit_query = ' '.join(map(str, li_query_terms[3:]))
        q_dict[new_query[0][:-1]] = edit_query
    stop_file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\stoplist.txt", 'r')
    stop_words = []
    for line in stop_file:
        line = line[:-1]
        stop_words.append(line)
    fin_query = ''
    query_dict = dict()
    for key in q_dict.keys():
        fin_query = ' '.join([stem(word) for word in q_dict[key].split() if word not in stop_words])
        query_dict[key] = fin_query
    return query_dict
def build_queries():
    file = "query_desc.51-100.short.txt"
    with open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\" + file) as f:
        queries = f.readlines()
    q_dict = dict()
    for query in queries:
        new_query = query.split('.')
        new_query.pop()
        if len(new_query) > 0 and new_query[0] not in q_dict:
            edit_query = new_query[1].split()
            edit_query = edit_query[3:]
            edit_query = ' '.join(edit_query)
            edit_query = edit_query.replace(',', '')
            edit_query = edit_query.replace('"', '')
            edit_query = edit_query.replace('-', ' ')
            q_dict[new_query[0]] = edit_query
    stop_fname = "stoplist.txt"
    stop_file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\" + stop_fname, 'r')
    stop_words = []
    for line in stop_file:
        line = line[:-1]
        stop_words.append(line)
    fin_query = ''
    query_dict = dict()
    for key in q_dict.keys():
        fin_query = ' '.join([stem(word) for word in q_dict[key].split() if word not in stop_words])
        query_dict[key] = fin_query
    return query_dict
def ans_52():
    with open('nlp_51.txt', mode='r') as f:
        words = f.read()
    words = words.split('\n')
    result = '\n'.join(
        [word + '\t' + stem(word) for word in words if len(word) != 0])  # skip empty lines (was len(words), always true)
    # list(map(lambda x: [x+'\t'+stem(x)], words))
    print(result[:300])
def wordCleanUp(word, irrDict=None):
    """ stem and change verbs to present tense """
    w = stem(word.lower())
    try:
        return irrDict[w]
    except (TypeError, KeyError):
        return w
def read_csv_file(self, filename):
    csv_file = csv.DictReader(open(filename, 'rb'), delimiter=',', quotechar='"')
    self.positive = dict()
    # self.pos_alpha = dict()
    self.negative = dict()
    # self.neg_alpha = dict()
    self.pos_doc = 0.0
    self.neg_doc = 0.0
    for line in csv_file:
        review = line['Review']
        category = line['Category']
        words = re.split('[^A-Za-z0-9_\'\"?!]+', review)
        words = filter(None, words)
        words = [stem(word.lower()) for word in words]
        words = set(words)
        if category == '1':
            self.pos_doc += 1.0
            temp_words = set()
            for word in words:
                if word == '\'' or word == '\"':
                    continue
                temp_words.add(word)
            for word in temp_words:
                if self.positive.has_key(word):
                    self.positive[word] += 1
                else:
                    self.positive[word] = 1
        elif category == '0':
            self.neg_doc += 1.0
            temp_words = set()
            for word in words:
                if word == '\'' or word == '\"':
                    continue
                temp_words.add(word)
            for word in temp_words:
                if self.negative.has_key(word):
                    self.negative[word] += 1
                else:
                    self.negative[word] = 1
    # inter_words = set(self.positive.keys())&set(self.negative.keys())
    for word in self.positive.keys():
        if len(word) == 1 and word != '?' and word != '!':
            del self.positive[word]
            continue
    for word in self.negative.keys():
        if len(word) == 1 and word != '?' and word != '!':
            del self.negative[word]
            continue
    for word in self.positive.keys():
        self.positive[word] = self.positive[word] / self.pos_doc
    for word in self.negative.keys():
        self.negative[word] = self.negative[word] / self.neg_doc
def stemm_word(word):
    """
    Use Porter stemmer to stem words.
    :param word: String
    :return: Stemmed word
    """
    return stem(word)
def build_table():
    table = defaultdict(Counter)
    csvs = [f for f in os.listdir('.') if f.endswith('.csv')]
    for fname in csvs:
        with open(fname, 'rb') as f:
            reader = csv.reader(f)
            for username, tweet in reader:
                for s in (stem(t) for t in tokenize(tweet)):
                    table[username][s] += 1
    return table
def tokenize(message):
    """ Finds all the words in message and returns them as a unique set """
    message = message.lower()
    all_words = re.findall("[a-z0-9']+", message)
    words = [stem(word) for word in all_words]
    return set(words)
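# Illustrative call for tokenize above, assuming re and a Porter stem() are
# imported in its module; the exact stems depend on the stemmer used.
print(tokenize("Stemming reduces related words: runs, running, ran!"))
# e.g. {'stem', 'reduc', 'relat', 'word', 'run', 'ran'}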
def build_table():
    table = defaultdict(Counter)
    dirs = [fname for fname in os.listdir('.') if fname.endswith('.csv')]
    for fname in dirs:
        with open(fname, 'r') as f:
            reader = csv.reader(f)
            for username, tweet in reader:
                for token in tokenize(tweet.lower()):
                    if token not in stopwords:
                        table[username.lower()][stem(token)] += 1
    return table
def proc_data(data):
    for (i, desc) in data:
        if pd.isnull(desc):
            desc = 'none'
        desc = desc.translate(trans_mask).lower().split(' ')
        ln_list = []
        for word, cnt in Counter(desc).iteritems():
            word = word.strip()
            word = stem(word)
            if word in skip:
                continue
            if word == '':
                continue
            ln_list.append("%i:%i" % (vocab[word], cnt))
        yield str(len(ln_list)) + ' ' + ' '.join(ln_list) + '\n'
def getIrrDict(path=r".\wordlists\irr.txt"):
    ''' irregular verbs stemming '''
    irr = dict()
    with open(path, 'r') as f:
        for l in f:
            w0 = l.split()[0].lower()
            for w in l.strip().split():
                w1 = stem(w.lower())
                if w1 != w0:
                    irr[w1] = w0
    return irr
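# Sketch pairing getIrrDict with the wordCleanUp helper shown earlier. The
# irr.txt line format (a base verb followed by its irregular forms, e.g.
# "go went gone" / "take took taken") is an assumption inferred from the parsing code.
irr = getIrrDict()                         # e.g. {'went': 'go', 'gone': 'go', 'took': 'take', 'taken': 'take'}
print(wordCleanUp("Went", irrDict=irr))    # -> 'go'   (irregular form mapped back to its base)
print(wordCleanUp("Taking", irrDict=irr))  # -> 'take' (not in the table, so the Porter stem is returned)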
def calculate_cosine_similarity(alert_type):
    cos_numerator_sum = 0
    cos_denominator_local_count = 0
    cos_denominator_news_count = 0
    for local_word in alert_type:
        for news_word in getKeywords(article):
            if stem(local_word) == stem(news_word[0]):
                cos_numerator_sum = cos_numerator_sum + news_word[1]
                cos_denominator_local_count += 1
                cos_denominator_news_count_temp = news_word[1] * news_word[1]
                cos_denominator_news_count = cos_denominator_news_count + cos_denominator_news_count_temp
    cos_denominator_sum = math.sqrt(cos_denominator_news_count) * math.sqrt(cos_denominator_local_count)
    cos_similarity = 0
    if cos_denominator_sum != 0:
        cos_similarity = cos_numerator_sum / cos_denominator_sum
    return cos_similarity
def extract_features(data, dict_features):
    data_one_x = np.zeros(len(dict_features) + 1, dtype=np.float64)
    data_one_x[0] = 1
    for word in data.split(' '):
        word = word.strip()
        if is_stop_word(word):
            continue
        word = porter.stem(word)
        try:
            data_one_x[dict_features[word]] = 1
        except KeyError:
            # the stemmed word is not a known feature; ignore it (was a bare except)
            pass
    return data_one_x
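# Hypothetical call for extract_features: dict_features is assumed to map a
# stemmed token to a 1-based feature index (index 0 is the bias the function sets),
# and numpy, the module's porter stemmer and is_stop_word are assumed to be in scope.
import numpy as np

dict_features = {'porter': 1, 'stemmer': 2, 'algorithm': 3}  # assumed example vocabulary
x = extract_features('the porter stemming algorithm', dict_features)
print(x)  # -> [1. 1. 0. 1.]: bias, plus the stems that appear in dict_features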
def preprocess(text):
    '''
    preprocesses the given text before feeding it to the classification system
    '''
    p = re.compile(r'<script.*?</script>', re.DOTALL)
    text = p.sub('', text)
    p = re.compile(r'<a href.*?</a>', re.DOTALL)
    # TODO use some better name for this
    text = p.sub('hyperlinkk', text)
    text = remove_spaces(remove_tags(text))
    text = text.lower()
    p = re.compile(r'[^a-z\s]', re.DOTALL)
    text = p.sub('', text)
    stemmed_text = ''
    for word in text.split():
        stemmed_text += stem(word) + " "
    return stemmed_text
def addToIndex(wordList, docID, t):
    '''
    Removes all the non-ASCII words and then performs stemming and then
    adds in the index at appropriate location.
    '''
    for word in wordList:
        word = word.strip().encode('utf-8')
        if word.isalpha() and len(word) > 3 and word not in stopWords:
            # Stemming the Words
            word = stem(word)
            if word not in stopWords:
                if word in invertedIndex:
                    if docID in invertedIndex[word]:
                        if t in invertedIndex[word][docID]:
                            invertedIndex[word][docID][t] += 1
                        else:
                            invertedIndex[word][docID][t] = 1
                    else:
                        invertedIndex[word][docID] = {t: 1}
                else:
                    invertedIndex[word] = dict({docID: {t: 1}})
def read_csv_file(self, filename):
    csv_file = csv.DictReader(open(filename, 'rb'), delimiter=',', quotechar='"')
    self.positive = dict()
    # self.pos_alpha = dict()
    self.negative = dict()
    # self.neg_alpha = dict()
    self.pos_count = 0.0
    self.neg_count = 0.0
    for line in csv_file:
        review = line['Review']
        category = line['Category']
        words = re.split('[^A-Za-z0-9_\']+', review)
        words = filter(None, words)
        words = [stem(word.lower()) for word in words]
        words = set(words)
        if category == '1':
            self.pos_count += 1.0
            for word in words:
                if word == '\'':
                    continue
                if self.positive.has_key(word):
                    self.positive[word] += 1
                else:
                    self.positive[word] = 1
        elif category == '0':
            self.neg_count += 1.0
            for word in words:
                if word == '\'':
                    continue
                if self.negative.has_key(word):
                    self.negative[word] += 1
                else:
                    self.negative[word] = 1
    inter_words = set(self.positive.keys()) & set(self.negative.keys())
    for word in self.positive.keys():
        self.positive[word] = self.positive[word] / self.pos_count
    for word in self.negative.keys():
        self.negative[word] = self.negative[word] / self.neg_count
def stemming(paragraph):
    temp = split(paragraph)
    for i in range(len(temp)):
        temp[i] = stem(temp[i])
    temp_x = ' '.join(temp)
    return temp_x
import string
import urllib
from stemming.porter import stem

news = urllib.urlopen('https://ceiba.ntu.edu.tw/course/35d27d/content/28.txt')
text = news.read()

# lowercase
text = text.lower()

# tokenization
text = text.translate(string.maketrans("", ""), string.punctuation)
words = text.split()

# Porter's stemmer
stemlist = []
for x in words:
    stemlist.append(stem(x))

# stopword removal
page = urllib.urlopen('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words')
stopwords_list = page.read().split()
result = [x for x in stemlist if x not in stopwords_list]

# write to file
outstring = " ".join(result)
output = open("output1.txt", "wb+")
output.write(outstring)
output.close()
def query_search(queries, secondary_index, number_of_files):
    ranking = {}
    queries[3] = re.sub(r"[-.]", '', str(queries[3]))
    queries[2] = re.sub(r"[-.]", '', str(queries[2]))
    queries[1] = re.sub(r"[-.]", '', str(queries[1]))
    queries[0] = re.sub(r"[-.]", '', str(queries[0]))
    queryb = re.split('[^a-zA-Z0-9]', str(queries[3]))
    queryc = re.split('[^a-zA-Z0-9]', str(queries[2]))
    queryi = re.split('[^a-zA-Z0-9]', str(queries[1]))
    queryt = re.split('[^a-zA-Z0-9]', str(queries[0]))
    if (queryb):
        for word in queryb:
            if (word == ''):
                continue
            temp = word.lower()
            word = stem(word) + ";p"
            rank = None
            # print word
            rank = findi(word, secondary_index, number_of_files)
            for row in rank:
                if (row in ranking):
                    ranking[int(row)] += rank[int(row)]
                else:
                    ranking[int(row)] = rank[int(row)]
            leng = {}
            word = stem(temp) + ";t"
            rank = {}
            rank = findi(word, secondary_index, number_of_files)
            for row in rank:
                if (row in ranking):
                    ranking[int(row)] += log10(30) * rank[int(row)]
                else:
                    ranking[int(row)] = log10(30) * rank[int(row)]
            word = stem(temp) + ";i"
            rank = {}
            rank = findi(word, secondary_index, number_of_files)
            for row in rank:
                if (row in ranking):
                    ranking[int(row)] += rank[int(row)]
                else:
                    ranking[int(row)] = rank[int(row)]
    if (queryc):
        for word in queryc:
            if (word == ''):
                continue
            word = stem(word.lower()) + ";t"
            leng = findi(word, secondary_index, number_of_files)
            for row in leng:
                if (row in ranking):
                    ranking[row] += leng[row]
                else:
                    ranking[row] = leng[row]
    if (queryt):
        for word in queryt:
            if (word == ''):
                continue
            word = stem(word.lower()) + ";t"
            leng = findi(word, secondary_index, number_of_files)
            for row in leng:
                if (row in ranking):
                    ranking[row] += leng[row]
                else:
                    ranking[row] = leng[row]
    if (queryi):
        for word in queryi:
            if (word == ''):
                continue
            word = stem(word.lower()) + ";i"
            # print word
            leng = findi(word, secondary_index, number_of_files)
            for row in leng:
                if (row in ranking):
                    ranking[row] += leng[row]
                else:
                    ranking[row] = leng[row]
    return ranking
def createPartialIndexes_WithStemnStop():
    doc_itr = 0
    catalog = {}
    file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\Partial_Indexer_withStemnStop", "a")
    file.close()
    doc_hash = pickle.load(open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Doc_Hash_withStemnStop", "rb"))
    token_hash = pickle.load(open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Token_Hash_withStemnStop", "rb"))
    files = os.listdir("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\ap89_collection")
    dblock_hash = {}
    stop_file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\stoplist.txt", 'r')
    stop_words = []
    for line in stop_file:
        line = line[:-1]
        stop_words.append(line)
    for file in files:
        with open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\ap89_collection\\" + file) as f:
            doc_file = f.read()
        docs = re.findall(r'<DOC>(.*?)</DOC>', doc_file, re.DOTALL)
        if docs:
            for doc in docs:
                if (doc_itr % 1000 == 0 and doc_itr != 0):
                    # Dump to index
                    indexfile_content = ''
                    file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Partial_Indexer_withStemnStop", 'a')
                    indexfile_offset = file.tell()
                    file.close()
                    for token_id in dblock_hash.keys():
                        token_content = str(token_id) + "{"
                        for doc_id in dblock_hash[token_id].keys():
                            token_content += str(doc_id) + ": (" + dblock_hash[token_id][doc_id] + ");"
                        token_content += "}"
                        if token_id in catalog.keys():
                            tkn_ent_list = catalog[token_id]
                            tkn_ent_list.append((str(indexfile_offset) + "," + str(len(token_content))))
                            catalog[token_id] = tkn_ent_list
                        else:
                            catalog[token_id] = [(str(indexfile_offset) + "," + str(len(token_content)))]
                        indexfile_content += token_content
                        indexfile_offset += len(token_content)
                    file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Partial_Indexer_withStemnStop", 'a')
                    file.write(indexfile_content)
                    file.close()
                    dblock_hash = {}
                    print("Dumped " + str(doc_itr) + " Documents after Stemming and with Stop Words removed")
                li_doc_no = re.findall(r'<DOCNO>(.*?)</DOCNO>', doc)
                li_texts = re.findall(r'<TEXT>(.*?)</TEXT>', doc, re.DOTALL)
                doc_no = ''.join(map(str, li_doc_no))
                doc_id = doc_hash[doc_no][0]
                texts = ''.join(map(str, li_texts))
                texts = texts.lower()
                tkn_cnt = 0
                for m in re.finditer(r'\w+(\.?\w+)*', texts.lower()):
                    token_noStem = m.group(0)
                    if token_noStem not in stop_words:
                        token = stem(token_noStem)
                        tkn_cnt += 1
                        token_id = token_hash[token]
                        if token_id in dblock_hash.keys():
                            if doc_id in dblock_hash[token_id].keys():
                                dblock_hash[token_id][doc_id] += "," + str(tkn_cnt)
                            else:
                                dblock_hash[token_id][doc_id] = str(tkn_cnt)
                        else:
                            dblock_hash[token_id] = {}
                            dblock_hash[token_id][doc_id] = str(tkn_cnt)
                doc_itr += 1
    # Dump the last chunk of dblocks to index
    indexfile_content = ''
    file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Partial_Indexer_withStemnStop", 'a')
    indexfile_offset = file.tell()
    file.close()
    for token_id in dblock_hash.keys():
        token_content = str(token_id) + "{"
        for doc_no in dblock_hash[token_id].keys():
            token_content += str(doc_no) + ": (" + dblock_hash[token_id][doc_no] + ");"
        token_content += "}"
        if token_id in catalog.keys():
            tkn_ent_list = catalog[token_id]
            tkn_ent_list.append((str(indexfile_offset) + "," + str(len(token_content))))
            catalog[token_id] = tkn_ent_list
        else:
            catalog[token_id] = [(str(indexfile_offset) + "," + str(len(token_content)))]
        indexfile_content += token_content
        indexfile_offset += len(token_content)
    file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Partial_Indexer_withStemnStop", 'a')
    file.write(indexfile_content)
    file.close()
    pickle.dump(catalog, open(
        "C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Dummy_Catalog_withStemnStop", "wb"))
    print("Dumped All Documents after Stemming and with Stop Words removed")
def stemming(a_list_of_words):
    stemmed = [stem(words) for words in a_list_of_words]
    return stemmed
def queryNormal(queryWords):
    wordsToSearch = list()
    for word in queryWords:
        word = word.lower().strip()
        if word not in stopWords:
            word = stem(word)
            if word.isalpha() and len(word) >= 3 and word not in stopWords:
                wordsToSearch.append(word)
    globalSearch = dict(list())
    for word in wordsToSearch:
        loc = bisect(secondaryIndex, word)
        startFlag = False
        if loc - 1 >= 0 and secondaryIndex[loc - 1] == word:
            startFlag = True
        if loc - 1 != 0:
            loc -= 1
        if loc + 1 == len(secondaryIndex) and secondaryIndex[loc] == word:
            loc += 1
        primaryFile = "finalIndex/index" + str(loc) + ".txt"
        file = open(primaryFile, "r")
        data = file.read()
        if startFlag:
            startIndex = data.find(word + "=")
        else:
            startIndex = data.find("\n" + word + "=")
        endIndex = data.find("\n", startIndex + 1)
        reqLine = data[startIndex:endIndex]
        pl = reqLine.split("=")[1].split(",")
        numDoc = len(pl)
        idf = log10(noDocs / numDoc)
        for i in pl:
            docID, entry = i.split(":")
            if docID in globalSearch:
                globalSearch[docID].append(entry + "_" + str(idf))
            else:
                globalSearch[docID] = [entry + "_" + str(idf)]
    lengthFreq = dict(dict())
    regEx = re.compile(r'(\d+|\s+)')
    for k in globalSearch:
        weightedFreq = 0
        n = len(globalSearch[k])
        for x in globalSearch[k]:
            x, idf = x.split("_")
            x = x.split("#")
            for y in x:
                lis = regEx.split(y)
                tagType, freq = lis[0], lis[1]
                if tagType == "t":
                    weightedFreq += int(freq) * 1000
                elif tagType == "i" or tagType == "c" or tagType == "r" or tagType == "e":
                    weightedFreq += int(freq) * 50
                elif tagType == "b":
                    weightedFreq += int(freq)
        if n in lengthFreq:
            lengthFreq[n][k] = float(log10(1 + weightedFreq)) * float(idf)
        else:
            lengthFreq[n] = {k: float(log10(1 + weightedFreq)) * float(idf)}
    count = 0
    flag = False
    # resultList = []
    K = 10
    for k, v in sorted(lengthFreq.items(), reverse=True):
        for k1, v1 in sorted(v.items(), key=itemgetter(1), reverse=True):
            print docTitleMap[k1]
            count += 1
            if count == K:
                flag = True
                break
        if flag:
            break
def tokenize(s):
    return [stem(w) for w in re.findall(r'\w+', s.lower()) if w not in stopwords]
        create_doc_term()
        create_term_info()
        avg_doclen = sum_doclen / num_doc
        ignore_file = open("ignore_tokens.txt")
        ignore_tokens = list(ignore_file.read().split())
        query_list = []
        xmldoc = minidom.parse('topics.xmls')
        itemlist = xmldoc.getElementsByTagName('topic')
        for s in itemlist:
            qnumber = s.attributes['number'].value
            Q = s.getElementsByTagName('query').item(0).childNodes[0].data.lower()
            Qwords = Q.split(' ')
            stem_words = []
            query_str = ''
            for w in Qwords:
                if w not in ignore_tokens:
                    stem_word = stem(w)
                    stem_words.append(stem_word)
            query_str = ' '.join(stem_words)
            query = [qnumber, query_str]
            query_list.append(query)
        Rank(query_list)
    else:
        # closes an argument check from earlier in the script (not shown in this fragment)
        incorrect_usage()
else:
    incorrect_usage()
def stemming(a_list_of_words):
    # return a list of words
    return [stem(i) for i in a_list_of_words]
def getStemmed(self, word):
    return stem(word)
def stemming(lst):
    stem_lst = []
    for wrd in lst:
        stem_lst.append(wrd + "\t" + stem(wrd))
    return stem_lst
import re
from stemming import porter


def get_sentence():
    reg = re.compile(r'(.*?[.;:?!])\s([A-Z].*?\.)')
    with open('nlp.txt') as f:
        for l in f:
            l = l.strip()
            match = reg.split(l)
            if len(match) > 0:
                for line in match:
                    if len(line) > 0:
                        yield line.strip()


if __name__ == '__main__':
    for line in get_sentence():
        print(''.join([
            porter.stem(word.strip(',.')) + '\n'
            for word in line.split(' ')
        ]))
            inrevision = False
            ns = 0
            l_hitext = []
            dicw = {}
        elif tname == 'revision':
            # Do not pick up on revision id's
            inrevision = True
        elif tname == 'redirect':
            dicw = {}
            rtitle = elem.attrib
            if rtitle is not None:
                l_hitext = re.split("[^a-zA-Z]+", rtitle['title'])
                for word in l_hitext:
                    word = word.lower()
                    if word not in stopword.stopword:
                        word = stem(word)
                        if word not in dicw:
                            dicw[word] = 1
                        else:
                            dicw[word] += 1  # was "dicw[word]+1", which discarded the increment
                for key in dicw:
                    if key not in invit:
                        invit[key] = id + 't' + str(dicw[key]) + ':'
                    else:
                        invit[key] = invit[key] + (id + 't' + str(dicw[key]) + ':')
            l_hitext = []
            dicw = {}
        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision:
    'either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,'
    'him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,'
    'likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,'
    'on,only,or,other,our,own,rather,said,say,says,she,should,since,so,'
    'some,than,that,the,their,them,then,there,these,they,this,tis,to,too,'
    'twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,'
    'will,with,would,yet,you,your').lower().split(',')


def is_stopword(string):
    return string.lower() in stop_words


if __name__ == '__main__':
    word_counter = Counter()
    with open('sentiment.txt') as f:
        for line in f:
            for word in line[3:].split(' '):
                word = word.strip()
                if is_stopword(word):
                    continue
                word = porter.stem(word)
                if word != '!' and word != '?' and len(word) <= 1:
                    continue
                word_counter.update([word])
    features = [word for word, count in word_counter.items() if count >= 6]
    with open('features.txt', 'w') as f:
        print(*features, sep='\n', file=f)
def main():
    for word in sys.stdin:
        word = word.strip()
        s_word = porter.stem(word)
        s2_word = porter2.stem(word)
        print "{}\t{}\t{}".format(word, s_word, s2_word)
# wor=list(set(words))
target = [x for x in temp if x not in commonwords]
target = gram_1(target)
length_target = sum(target[i][1] for i in range(len(target)))
'''
sentences = []
for i in range(1, 476):
    file = open('C:/Users/prakash chand gupta/Desktop/AI PROJECT/Articles2/' + str(i) + '.txt', 'r')
    p = file.read().replace("'", ' ').replace("`", '').replace(";", '').replace(":", '').replace("!", '.').replace('"', "").replace('\xe2', "").replace('\x80', "").replace('\x9d', "").replace('\x91', "").replace('\x92', "").replace('\x93', "").replace('\x94', "").replace('\x95', "").replace('\x96', "").replace('\x97', "").replace('\x98', "").replace('\x99', "").lower()
    p = unicodedata.normalize('NFKD', unicode(p, errors='ignore')).encode('ascii', 'ignore')
    temp = split(p)
    temp2 = [x for x in temp if x not in commonwords]
    for i in range(len(temp2)):
        temp2[i] = stem(temp2[i])
    temp3 = [x for x in temp2 if x not in commonwords]
    temp3 = gram_1(temp3)
    sentences.append(temp3)
'''
source_distance = []
target_distance = []
'''
'''
for i in range(len(sentences)):
    length_temp = sum(sentences[i][j][1] for j in range(len(sentences[i])))
    count = 0
    if len(sentences[i]) < len(source):
        for k in range(len(sentences[i])):
            for j in range(len(source)):
def _stem(self, term):
    return porter.stem(term)
        'C:/Users/prakash chand gupta/Desktop/AI PROJECT/Articles2/' + str(i) + '.txt', 'r')
    p = file.read().replace("'", ' ').replace("`", '').replace(";", '').replace(":", '').replace("!", '.').replace('"', "").replace('\xe2', "").replace('\x80', "").replace('\x9d', "").replace('\x91', "").replace('\x92', "").replace('\x93', "").replace('\x94', "").replace('\x95', "").replace('\x96', "").replace('\x97', "").replace('\x98', "").replace('\x99', "").lower()
    p = unicodedata.normalize('NFKD', unicode(p, errors='ignore')).encode('ascii', 'ignore')
    temp = split(p)
    temp2 = [x for x in temp if x not in commonwords]
    for i in range(len(temp2)):
        temp2[i] = stem(temp2[i])
    temp3 = [x for x in temp2 if x not in commonwords]
    temp3 = gram_1(temp3)
    sentences.append(temp3)
'''
source_distance = []
target_distance = []
'''
'''
for i in range(len(sentences)):
    length_temp = sum(sentences[i][j][1] for j in range(len(sentences[i])))
    count = 0
    if len(sentences[i]) < len(source):
        for k in range(len(sentences[i])):
            for j in range(len(source)):
                if sentences[i][k][0] == source[j][0] and sentences[i][k][0] != '':
def stemming(paragraph):
    temp = split(paragraph)
    for i in range(len(temp)):
        temp[i] = stem(temp[i])
    temp_x = ' '.join(temp)
    return temp_x
def tokenize(doc, stopwords):
    reg = re.compile(r'(\w+(?:\.?\w+)*)')
    return map(lambda t: stem(t),
               filter(lambda t: t not in stopwords, reg.findall(doc.lower())))
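# Small usage sketch for tokenize above; the stopword set is an assumption, and
# list() is only needed on Python 3, where map returns a lazy iterator.
stopwords = {'the', 'of', 'and'}
tokens = tokenize("The stemming of related words and running code.", stopwords)
print(list(tokens))  # e.g. ['stem', 'relat', 'word', 'run', 'code']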
def analyze_term(term, stopwords):
    if term in stopwords:
        return None
    else:
        return stem(term.lower())
f_docids = open("docids.txt", "w")
f_termids = open("termids.txt", "w")
f_doc_index = open("doc_index.txt", "w")
f_doc_read = open("doc_index.txt", "r")
for file in os.listdir(path):
    docid_str = file + '\t' + str(docid) + '\n'
    f_docids.write(docid_str)
    count = 1
    wordList = tokenize(file)
    doc_words = collections.OrderedDict()
    stem_list = []
    for word in wordList:
        stem_word = stem(word[0])
        if stem_word not in doc_words.keys() and word[0] not in ignore_tokens:
            doc_words[stem_word] = 1
            if stem_word not in all_tokens.keys():
                all_tokens.update({stem_word: termid})
                termid_str = ''.join([stem_word, '\t', str(termid), '\n'])
                f_termids.write(termid_str)
                termid += 1
        stem_list.append(stem_word)
    for token in doc_words:
        doc_idx_str = ''.join([str(docid), '\t', str(all_tokens[token])])
        position = 1
        for s in stem_list:
            if token == s:
                doc_idx_str = ''.join([doc_idx_str.strip(), '\t', str(position)])