def process_query(queryt, dictionaryword, dictionarytitleq, dictionaryc):
    querydt = queryt.split(',')
    querytt = None
    query = None
    queryc = None
    for i in querydt:
        if 'T:' in i:
            querytt = i.split(':')
            querytt = querytt[1]
        elif 'B:' in i:
            query = i.split(':')
            query = query[1]
        elif 'C:' in i:
            queryc = i.split(':')
            queryc = queryc[1]
        else:
            query = i
            querytt = i
    if query is not None:
        queryterms = re.split("[^a-zA-Z]+", query)
        for word in queryterms:
            word = word.lower()
            if word not in stopword.stopword:
                word = stem(word)
                if word not in dictionaryword:
                    dictionaryword[word] = 1
                else:
                    dictionaryword[word] += 1
    if querytt is not None:
        queryterms = re.split("[^a-zA-Z]+", querytt)
        for word in queryterms:
            word = word.lower()
            if word not in stopword.stopword:
                word = stem(word)
                if word not in dictionarytitleq:
                    dictionarytitleq[word] = 1
                else:
                    dictionarytitleq[word] += 1
    if queryc is not None:
        queryterms = re.split("[^a-zA-Z]+", queryc)
        for word in queryterms:
            word = word.lower()
            if word not in stopword.stopword:
                word = stem(word)
                if word not in dictionaryc:
                    dictionaryc[word] = 1
                else:
                    dictionaryc[word] += 1
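A minimal usage sketch for process_query (hypothetical query string and empty dictionaries; it assumes the re module, the stopword.stopword collection and the Porter stem function used above are available):

# Hypothetical call: the query mixes a title part (T:), a body part (B:) and a
# category part (C:), separated by commas, which is the format parsed above.
dictionaryword, dictionarytitleq, dictionaryc = {}, {}, {}
process_query("T:world cup,B:football matches,C:sports",
              dictionaryword, dictionarytitleq, dictionaryc)
# Each dictionary now holds stemmed term frequencies for its field,
# e.g. dictionaryword might end up as {'footbal': 1, 'match': 1}.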
Example #2
def createDocAndTokenHashes_WithStemming():

    print("Creating Document Hash and Token Hash with stemming....")
    files = os.listdir("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\ap89_collection")
    doc_id=1
    token_increment = 1
    doc_hash = dict()
    token_hash = dict()
    for file in files:
        with open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\ap89_collection\\"+file) as f:
            doc_file = f.read()
        docs=re.findall(r'<DOC>(.*?)</DOC>',doc_file,re.DOTALL)        
        if docs:
            for doc in docs:
                li_doc_no=re.findall(r'<DOCNO>(.*?)</DOCNO>',doc)
                li_texts=re.findall(r'<TEXT>(.*?)</TEXT>',doc,re.DOTALL)
                doc_no= ''.join(map(str, li_doc_no))
                texts=''.join(map(str, li_texts))
                dlen=0
                for m in re.finditer(r'\w+(\.?\w+)*', texts.lower()):
                    token_noStem = m.group(0)
                    token= stem(token_noStem)
                    dlen+=1
                    if token not in token_hash.keys():
                        token_hash[token] = token_increment
                        token_increment+=1
                doc_hash[doc_no]=(doc_id,dlen)
                doc_id+=1
    print("Docs= "+str(doc_id-1))
    print("Tokens with stemming= "+str(token_increment-1))
    pickle.dump( doc_hash, open( "C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Doc_Hash_withStemming", "wb" ) )
    pickle.dump( token_hash, open( "C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Token_Hash_withStemming", "wb" ) )
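The two pickles written above can be reloaded later; a brief sketch using the same paths (the example values in the comments are illustrative only):

import pickle

# Reload the document hash dumped above.
with open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Doc_Hash_withStemming", "rb") as f:
    doc_hash = pickle.load(f)
# doc_hash maps DOCNO -> (doc_id, document length), e.g. {'AP890101-0001': (1, 437)},
# while Token_Hash_withStemming maps each stemmed token to its integer token id.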
Example #3
def get_feature_list():
    feature_list = set()
    for line in open("./../data/featurelist.txt"):
        word = line.strip()
        feature_list.add(stem(word.lower()))

    return feature_list
Example #4
def text_processing(texts):
    if isinstance(texts, list):
        clean = []
        for st in texts:
            clean.extend([
                stem(w) for w in st.casefold().translate(translator).split()
                if w not in stop_words
            ])
        return clean

    clean = [
        stem(w) for w in texts.casefold().translate(translator).split()
        if w not in stop_words
    ]

    return clean
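text_processing relies on two module-level objects that the snippet does not show, translator and stop_words; a sketch of how they are presumably built:

import string

# Assumed setup for text_processing (not part of the original snippet):
# a translation table that strips punctuation and a stop-word collection.
translator = str.maketrans('', '', string.punctuation)
stop_words = {'the', 'a', 'an', 'and', 'or', 'of', 'to'}  # placeholder set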
	def getP(self, review):
		words = re.split('[^A-Za-z0-9_\'\"?!]+',review)

		words = filter(None,words)
		words = [stem(word.lower()) for word in words]
		words = set(words)

		pos = (0.0+self.pos_doc)/(self.pos_doc+self.neg_doc)
		for word in words:

			if len(word) == 1 and word != '?' and word != '!':
				continue
			if word in self.positive:
				pos *= self.positive[word]
			else:
				pos *= (1.0/self.pos_doc)
		neg = (0.0+self.neg_doc)/(self.pos_doc+self.neg_doc)
		for word in words:

			if len(word) == 1 and word != '?' and word != '!':
				continue
			if word in self.negative:
				neg *= self.negative[word]
			else:
				neg *= (1.0/self.neg_doc)

		print "pos:"+str(pos)
		print "neg:"+str(neg)
		if pos > neg:
			print 'pos'
			return 1
		else:
			print 'neg'
			return 0
Example #6
    def read(self, file_name, timelength):

        with open(file_name, 'rb') as f:
            data = json.load(f)

        vocabulary = {}
        tempLine = []
        lineno = 0
        for line in data:
            temp = {}

            cmnt = re.split(r'[^A-Za-z]+', line["message"])

            words = []
            for word in cmnt:
                word = word.lower()
                if word not in self.stopWords:
                    word = stem(word)
                    words.append(word)
                    if word not in vocabulary:
                        vocabulary[word] = 0

            if len(words) != 0:
                temp["time"] = line["time"]
                temp["lineno"] = lineno
                temp["text"] = words
                tempLine.append(temp)
            lineno += 1

        lines = sorted(tempLine, key=lambda e: (e.__getitem__('time')))
        self.store(lines, timelength)
        return lines, timelength, vocabulary
	def getP(self, review):
		words = re.split('[^A-Za-z0-9_\']+',review)
		words = filter(None,words)
		words = [stem(word.lower()) for word in words]
		words = set(words)

		pos = (0.0+self.pos_count)/(self.pos_count+self.neg_count)
		for word in words:
			if word in self.positive:
				pos *= self.positive[word]
			else:
				pos *= (1.0/self.pos_count)
		neg = (0.0+self.neg_count)/(self.pos_count+self.neg_count)
		for word in words:
			if word in self.negative:
				neg *= self.negative[word]
			else:
				neg *= (1.0/self.neg_count)

		print "pos:"+str(pos)
		print "neg:"+str(neg)
		if pos > neg:
			return 1
		else:
			return 0
Example #8
def build_queries():
    
    file = "query_desc.51-100.short.txt"
    with open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\"+file) as f:
        queries = f.readlines()
    q_dict = dict()

    for query in queries:    
        new_query = query.split('   ')
        li_query_terms=[]
        for m in re.finditer(r'\w+(\.?\w+)*', new_query[1].lower()):
                li_query_terms.append(m.group(0))
        edit_query = ' '.join(map(str, li_query_terms[3:]))
        q_dict[new_query[0][:-1]] = edit_query

    stop_file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\stoplist.txt",'r')
    stop_words = []

    for line in stop_file:
        line=line[:-1]
        stop_words.append(line)
    fin_query=''
    query_dict=dict()
    for key in q_dict.keys():    
        fin_query=' '.join([stem(word) for word in q_dict[key].split() if word not in stop_words])    
        query_dict[key]=fin_query
    return query_dict
def build_queries():
    
    file = "query_desc.51-100.short.txt"
    with open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\"+file) as f:
        queries = f.readlines()
    q_dict = dict()
    
    for query in queries:    
        new_query = query.split('.')
        new_query.pop()
        if len(new_query) > 0 and new_query[0] not in q_dict:
            edit_query=new_query[1].split()        
            edit_query = edit_query[3:]
            edit_query = ' '.join(edit_query)
            edit_query = edit_query.replace(',','')
            edit_query = edit_query.replace('"','')
            edit_query = edit_query.replace('-',' ')
            q_dict[new_query[0]] = edit_query
                
    stop_fname = "stoplist.txt"
    stop_file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\"+stop_fname,'r')
    stop_words = []

    for line in stop_file:
        line=line[:-1]
        stop_words.append(line)
    fin_query=''
    query_dict=dict()
    for key in q_dict.keys():    
        fin_query=' '.join([stem(word) for word in q_dict[key].split() if word not in stop_words])    
        query_dict[key]=fin_query
    return query_dict
Example #10
def ans_52():
    with open('nlp_51.txt', mode='r') as f:
        words = f.read()
    words = words.split('\n')
    result = '\n'.join(
        [word + '\t' + stem(word) for word in words if len(word) != 0])

    # list(map(lambda x: [x+'\t'+stem(x)], words))
    print(result[:300])
def wordCleanUp(word, irrDict=None):
    """
    stem and change verbs to present tense
    """
    w = stem(word.lower())
    try:
        return irrDict[w]
    except (TypeError, KeyError):
        return w
	def read_csv_file(self, filename):
		csv_file = csv.DictReader(open(filename,'rb'), delimiter=',',quotechar='"')
		self.positive = dict()
		#self.pos_alpha = dict()
		self.negative = dict()
		#self.neg_alpha = dict()
		self.pos_doc = 0.0
		self.neg_doc = 0.0
		for line in csv_file:
			review = line['Review']
			category  =line['Category']
			words = re.split('[^A-Za-z0-9_\'\"?!]+',review)
			words = filter(None,words)
			words = [stem(word.lower()) for word in words]
			words = set(words)
			if category == '1':

				self.pos_doc +=1.0
				temp_words = set()
				for word in words:
					if word == '\'' or word == '\"':
						continue
					temp_words.add(word)
				for word in temp_words:
					if self.positive.has_key(word):
						self.positive[word] +=1
					else:
						self.positive[word] = 1
			elif category == '0':
				
				self.neg_doc += 1.0
				temp_words = set()
				for word in words:
					if word == '\'' or word == '\"':
						continue
					temp_words.add(word)
				for word in temp_words:
					if self.negative.has_key(word):
						self.negative[word] += 1
					else:
						self.negative[word] = 1
		#inter_words = set(self.positive.keys())&set(self.negative.keys())

		for word in self.positive.keys():
			if len(word) == 1 and word != '?' and word != '!':
				del self.positive[word]
				continue

		for word in self.negative.keys():
			if len(word) == 1 and word != '?' and word != '!':
				del self.negative[word]
				continue

		for word in self.positive.keys():
			self.positive[word] = self.positive[word]/self.pos_doc
		for word in self.negative.keys():
			self.negative[word] = self.negative[word]/self.neg_doc
Example #13
def stemm_word(word):
    """
        Use Porter stemmer to stem words.

    :param word: String

    :return: Stemmed word
    """
    return stem(word)
Example #14
def build_table():
    table = defaultdict(Counter)
    csvs = [f for f in os.listdir('.') if f.endswith('.csv')]
    for fname in csvs:
        with open(fname, 'rb') as f:
            reader = csv.reader(f)
            for username, tweet in reader:
                for s in (stem(t) for t in tokenize(tweet)):
                    table[username][s] += 1
    return table
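build_table assumes each CSV row is a (username, tweet) pair; a hypothetical input layout and call:

# Hypothetical CSV layout expected by build_table (two columns, no header):
#   alice,stemming keeps the vocabulary small
#   bob,porter stemming is fast
table = build_table()
# table['alice'] is then a Counter of stemmed tokens from alice's tweets,
# e.g. Counter({'stem': 1, 'keep': 1, 'vocabulari': 1, ...}).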
Example #15
def tokenize(message):
    
    """
    Finds all the words in message and returns them as a
    unique set
    """

    message = message.lower()
    all_words = re.findall("[a-z0-9']+", message)
    words = [stem(word) for word in all_words]
    return set(words)
Example #16
def build_table():
    table = defaultdict(Counter)
    dirs = [fname for fname in os.listdir('.') if fname.endswith('.csv')]
    for fname in dirs:
        with open(fname, 'r') as f:
            reader = csv.reader(f)
            for username, tweet in reader:
                for token in tokenize(tweet.lower()):
                    if token not in stopwords:
                        table[username.lower()][stem(token)] += 1

    return table
Example #17
 def proc_data(data):
     for (i, desc) in data:
         if pd.isnull(desc): desc = 'none'
         desc = desc.translate(trans_mask).lower().split(' ')
         ln_list = []
         for word, cnt in Counter(desc).iteritems():
             word = word.strip()
             word = stem(word)
             if word in skip: continue
             if word == '': continue
             ln_list.append("%i:%i" % (vocab[word], cnt))
         yield str(len(ln_list)) + ' ' + ' '.join(ln_list) + '\n'
def getIrrDict(path=r".\wordlists\irr.txt"):
    '''
    Build a mapping from the stem of each irregular verb form to the
    verb's base form (irregular-verb stemming).
    '''
    irr = dict()
    with open(path, 'r') as f:
        for l in f:
            w0 = l.split()[0].lower()
            for w in l.strip().split():
                w1 = stem(w.lower())
                if w1 != w0:
                    irr[w1] = w0

    return irr
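getIrrDict pairs with wordCleanUp above: it maps the stem of every irregular form to the verb's base form. A sketch, assuming irr.txt lists one verb per line with the base form first (the file itself is not shown):

# Assumed line format of wordlists\irr.txt, base form first:
#   go went gone
#   take took taken
irr = getIrrDict()
print(wordCleanUp('went', irr))  # -> 'go' (provided 'went' is listed in irr.txt)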
Example #19
def calculate_cosine_similarity(alert_type):
	cos_numerator_sum = 0
	cos_denominator_local_count = 0 
	cos_denominator_news_count = 0 

	for local_word in alert_type: 
		for news_word in getKeywords(article): 
			if stem(local_word) == stem(news_word[0]):
				cos_numerator_sum = cos_numerator_sum + news_word[1]
				cos_denominator_local_count+=1
				cos_denominator_news_count_temp = news_word[1]*news_word[1]
				cos_denominator_news_count = cos_denominator_news_count + cos_denominator_news_count_temp

	cos_denominator_sum = math.sqrt(cos_denominator_news_count) * math.sqrt(cos_denominator_local_count)

	cos_similarity = 0


	if cos_denominator_sum != 0: 
		cos_similarity = cos_numerator_sum / cos_denominator_sum


	return cos_similarity
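A small worked check of the arithmetic above (hypothetical weights; getKeywords and article are globals not shown in the snippet). With two matching stems whose news keyword weights are 3 and 4, the function computes:

import math

# numerator: sum of the matching news-keyword weights
cos_numerator_sum = 3 + 4                                  # = 7
# denominator: sqrt(sum of squared news weights) * sqrt(number of local matches)
cos_denominator_sum = math.sqrt(3*3 + 4*4) * math.sqrt(2)  # = 5 * 1.414... ~ 7.07
print(cos_numerator_sum / cos_denominator_sum)             # ~ 0.99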
Example #20
def extract_features(data, dict_features):
    data_one_x = np.zeros(len(dict_features) + 1, dtype=np.float64)
    data_one_x[0] = 1
    for word in data.split(' '):
        word = word.strip()
        if is_stop_word(word):
            continue

        word = porter.stem(word)
        try:
            data_one_x[dict_features[word]] = 1
        except KeyError:
            # word is not in the feature dictionary; skip it
            pass
    return data_one_x
Example #21
def preprocess(text):
    '''
    preprocesses the given text before feeding it to the classification
    system
    '''
    p = re.compile(r'<script.*?</script>', re.DOTALL)
    text = p.sub('', text)
    p = re.compile(r'<a href.*?</a>', re.DOTALL)
    #TODO use some better name for this
    text = p.sub('hyperlinkk', text)
    text = remove_spaces(remove_tags(text))
    text = text.lower()
    p = re.compile(r'[^a-z\s]', re.DOTALL)
    text = p.sub('', text)
    
    stemmed_text = ''
    for word in text.split():
        stemmed_text += stem(word) + " "
    return stemmed_text
Example #22
def addToIndex(wordList, docID, t):
    '''
    Removes all the non-ASCII words and then performs stemming and then adds in the index at appropriate location.
    '''
    for word in wordList:
        word = word.strip().encode('utf-8')
        if word.isalpha() and len(word) > 3 and word not in stopWords:
            # Stemming the Words
            word = stem(word)
            if word not in stopWords:
                if word in invertedIndex:
                    if docID in invertedIndex[word]:
                        if t in invertedIndex[word][docID]:
                            invertedIndex[word][docID][t] += 1
                        else:
                            invertedIndex[word][docID][t] = 1
                    else:
                        invertedIndex[word][docID] = {t: 1}
                else:
                    invertedIndex[word] = dict({docID: {t: 1}})
	def read_csv_file(self, filename):
		csv_file = csv.DictReader(open(filename,'rb'), delimiter=',',quotechar='"')
		self.positive = dict()
		#self.pos_alpha = dict()
		self.negative = dict()
		#self.neg_alpha = dict()
		self.pos_count = 0.0
		self.neg_count = 0.0
		for line in csv_file:
			review = line['Review']
			category  =line['Category']
			words = re.split('[^A-Za-z0-9_\']+',review)
			words = filter(None,words)
			words = [stem(word.lower()) for word in words]
			words = set(words)
			if category == '1':

				self.pos_count +=1.0
				for word in words:
					if word == '\'':
						continue
					if self.positive.has_key(word):
						self.positive[word] += 1
					else:
						self.positive[word] = 1
			elif category == '0':
				
				self.neg_count += 1.0
				for word in words:
					if word == '\'':
						continue
					if self.negative.has_key(word):
						self.negative[word] += 1
					else:
						self.negative[word] = 1
		inter_words = set(self.positive.keys())&set(self.negative.keys())
		for word in self.positive.keys():
			self.positive[word] = self.positive[word]/self.pos_count
		for word in self.negative.keys():
			self.negative[word] = self.negative[word]/self.neg_count
Example #24
def stemming(paragraph):
	temp=split(paragraph)
	for i in range(len(temp)):
		temp[i]=stem(temp[i])
	temp_x = ' '.join(temp)
	return temp_x
Example #25
import string
import urllib
from stemming.porter import stem

news = urllib.urlopen('https://ceiba.ntu.edu.tw/course/35d27d/content/28.txt')
str = news.read()

# lowercase
str = str.lower()

# tokenization
str = str.translate(string.maketrans("",""),string.punctuation)
list = str.split()

# Porter's stemmer
stemlist = []
for x in list:
	stemlist.append(stem(x))

# stopword removal
page = urllib.urlopen('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words')
stopwords_list = page.read().split()
result = [x for x in stemlist if x not in stopwords_list]

# write to file
outstring = " ".join(result)
output = open("output1.txt", "wb+")
output.write(outstring)
output.close()
def query_search(queries, secondary_index, number_of_files):
    ranking = {}
    queries[3] = re.sub(r"[-.]", '', str(queries[3]))
    queries[2] = re.sub(r"[-.]", '', str(queries[2]))
    queries[1] = re.sub(r"[-.]", '', str(queries[1]))
    queries[0] = re.sub(r"[-.]", '', str(queries[0]))
    queryb = re.split('[^a-zA-Z0-9]', str(queries[3]))
    queryc = re.split('[^a-zA-Z0-9]', str(queries[2]))
    queryi = re.split('[^a-zA-Z0-9]', str(queries[1]))
    queryt = re.split('[^a-zA-Z0-9]', str(queries[0]))

    if (queryb):
        for word in queryb:
            if (word == ''):
                continue
            temp = word.lower()
            word = stem(temp) + ";p"
            rank = None
            #print word
            rank = findi(word, secondary_index, number_of_files)
            for row in rank:
                if (row in ranking):
                    ranking[int(row)] += rank[int(row)]
                else:
                    ranking[int(row)] = rank[int(row)]

            leng = {}
            word = stem(temp) + ";t"
            rank = {}
            rank = findi(word, secondary_index, number_of_files)

            for row in rank:
                if (row in ranking):
                    ranking[int(row)] += log10(30) * rank[int(row)]
                else:
                    ranking[int(row)] = log10(30) * rank[int(row)]
            word = stem(temp) + ";i"
            rank = {}
            rank = findi(word, secondary_index, number_of_files)
            for row in rank:
                if (row in ranking):
                    ranking[int(row)] += rank[int(row)]
                else:
                    ranking[int(row)] = rank[int(row)]

    if (queryc):
        for word in queryc:
            if (word == ''):
                continue
            word = stem(word.lower()) + ";t"
            leng = findi(word, secondary_index, number_of_files)
            for row in leng:
                if (row in ranking):
                    ranking[row] += leng[row]
                else:
                    ranking[row] = leng[row]

    if (queryt):
        for word in queryt:
            if (word == ''):
                continue
            word = stem(word.lower()) + ";t"
            leng = findi(word, secondary_index, number_of_files)
            for row in leng:
                if (row in ranking):
                    ranking[row] += leng[row]
                else:
                    ranking[row] = leng[row]

    if (queryi):
        for word in queryi:
            if (word == ''):
                continue
            word = stem(word.lower()) + ";i"
            #print word
            leng = findi(word, secondary_index, number_of_files)
            for row in leng:
                if (row in ranking):
                    ranking[row] += leng[row]
                else:
                    ranking[row] = leng[row]

    return ranking
Example #27
def createPartialIndexes_WithStemnStop():
    
    doc_itr=0
    catalog={}
    file=open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\Partial_Indexer_withStemnStop","a")
    file.close()
    doc_hash = pickle.load( open( "C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Doc_Hash_withStemnStop", "rb" ) )
    token_hash = pickle.load( open( "C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Token_Hash_withStemnStop", "rb" ) )
    files = os.listdir("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\ap89_collection")
    dblock_hash={}
    stop_file = open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\stoplist.txt",'r')
    stop_words = []
    for line in stop_file:
        line=line[:-1]
        stop_words.append(line)
    for file in files:
        with open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\ap89_collection\\"+file) as f:
            doc_file = f.read()
        docs=re.findall(r'<DOC>(.*?)</DOC>',doc_file,re.DOTALL)
        if docs:
            for doc in docs:
                if (doc_itr%1000 == 0 and doc_itr!=0):
                    #Dump to index
                    indexfile_content = ''
                    file=open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Partial_Indexer_withStemnStop",'a')
                    indexfile_offset = file.tell()
                    file.close()
                    
                    for token_id in dblock_hash.keys():
                        token_content = str(token_id)+"{"
                        for doc_id in dblock_hash[token_id].keys():
                            token_content += str(doc_id)+": ("+ dblock_hash[token_id][doc_id] + ");"
                        token_content += "}"                        
                        if token_id in catalog.keys():
                            tkn_ent_list = catalog[token_id]
                            tkn_ent_list.append((str(indexfile_offset)+","+str(len(token_content))))
                            catalog[token_id] = tkn_ent_list
                        else:
                            catalog[token_id]=[(str(indexfile_offset)+","+str(len(token_content)))]
                        indexfile_content += token_content
                        indexfile_offset+=len(token_content)
                    file=open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Partial_Indexer_withStemnStop",'a')
                    file.write(indexfile_content)
                    file.close()
                    dblock_hash={}
                    print("Dumped " +str(doc_itr)+ " Documents after Stemming and with Stop Words removed")
                    
                li_doc_no=re.findall(r'<DOCNO>(.*?)</DOCNO>',doc)
                li_texts=re.findall(r'<TEXT>(.*?)</TEXT>',doc,re.DOTALL)
                doc_no= ''.join(map(str, li_doc_no))
                doc_id = doc_hash[doc_no][0]
                texts=''.join(map(str, li_texts))
                texts=texts.lower()
                tkn_cnt=0
                for m in re.finditer(r'\w+(\.?\w+)*', texts.lower()):
                    token_noStem = m.group(0)
                    if token_noStem not in stop_words:
                        token= stem(token_noStem)
                        tkn_cnt+=1
                        token_id=token_hash[token]
                        if token_id in dblock_hash.keys():
                            if doc_id in dblock_hash[token_id].keys():
                                dblock_hash[token_id][doc_id]+= ","+str(tkn_cnt)
                            else:
                                dblock_hash[token_id][doc_id]=str(tkn_cnt)
                        else:
                            dblock_hash[token_id]={}
                            dblock_hash[token_id][doc_id]=str(tkn_cnt)
                
                doc_itr+=1
    #Dump the last chunk of dblocks to index
    indexfile_content = ''
    file=open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Partial_Indexer_withStemnStop",'a')
    indexfile_offset = file.tell()
    file.close()
    
    for token_id in dblock_hash.keys():
        token_content = str(token_id)+"{"
        for doc_no in dblock_hash[token_id].keys():
            token_content += str(doc_no)+": ("+ dblock_hash[token_id][doc_no] + ");"
        token_content += "}"                        
        if token_id in catalog.keys():
            tkn_ent_list = catalog[token_id]
            tkn_ent_list.append((str(indexfile_offset)+","+str(len(token_content))))
            catalog[token_id] = tkn_ent_list
        else:
            catalog[token_id]=[(str(indexfile_offset)+","+str(len(token_content)))]
        indexfile_content += token_content
        indexfile_offset+=len(token_content)
    file=open("C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Partial_Indexer_withStemnStop",'a')
    file.write(indexfile_content)
    file.close()
    pickle.dump( catalog, open( "C:\\Users\\hp\\Desktop\\IR_Documents\\AP_DATA\\HW2_Data\\Dummy_Catalog_withStemnStop", "wb" ) )
    print("Dumped All Documents after Stemming and with Stop Words removed")
Example #28
def stemming(a_list_of_words):
    stemmed = [stem(words) for words in a_list_of_words]
    return stemmed
Example #29
def queryNormal(queryWords):
    wordsToSearch = list()
    for word in queryWords:
        word = word.lower().strip()
        if word not in stopWords:
            word = stem(word)
        if word.isalpha() and len(word) >= 3 and word not in stopWords:
            wordsToSearch.append(word)
    globalSearch = dict(list())
    for word in wordsToSearch:
        loc = bisect(secondaryIndex, word)
        startFlag = False
        if loc - 1 >= 0 and secondaryIndex[loc - 1] == word:
            startFlag = True
            if loc - 1 != 0:
                loc -= 1
            if loc + 1 == len(secondaryIndex) and secondaryIndex[loc] == word:
                loc += 1
        primaryFile = "finalIndex/index" + str(loc) + ".txt"
        file = open(primaryFile, "r")
        data = file.read()
        if startFlag:
            startIndex = data.find(word + "=")
        else:
            startIndex = data.find("\n" + word + "=")
        endIndex = data.find("\n", startIndex + 1)
        reqLine = data[startIndex:endIndex]
        pl = reqLine.split("=")[1].split(",")
        numDoc = len(pl)
        idf = log10(noDocs / numDoc)
        for i in pl:
            docID, entry = i.split(":")
            if docID in globalSearch:
                globalSearch[docID].append(entry + "_" + str(idf))
            else:
                globalSearch[docID] = [entry + "_" + str(idf)]
    lengthFreq = dict(dict())
    regEx = re.compile(r'(\d+|\s+)')
    for k in globalSearch:
        weightedFreq = 0
        n = len(globalSearch[k])
        for x in globalSearch[k]:
            x, idf = x.split("_")
            x = x.split("#")
            for y in x:
                lis = regEx.split(y)
                tagType, freq = lis[0], lis[1]
                if tagType == "t":
                    weightedFreq += int(freq) * 1000
                elif tagType == "i" or tagType == "c" or tagType == "r" or tagType == "e":
                    weightedFreq += int(freq) * 50
                elif tagType == "b":
                    weightedFreq += int(freq)
        if n in lengthFreq:
            lengthFreq[n][k] = float(log10(1 + weightedFreq)) * float(idf)
        else:
            lengthFreq[n] = {k: float(log10(1 + weightedFreq)) * float(idf)}
    count = 0
    flag = False
    # resultList = []
    K = 10
    for k, v in sorted(lengthFreq.items(), reverse=True):
        for k1, v1 in sorted(v.items(), key=itemgetter(1), reverse=True):
            print docTitleMap[k1]
            count += 1
            if count == K:
                flag = True
                break
        if flag:
            break
Example #30
def tokenize(s):
    return [stem(w) for w in re.findall('\w+', s.lower()) if w not in stopwords]
      create_doc_term()
      create_term_info()

      avg_doclen = sum_doclen/num_doc

      ignore_file = open("ignore_tokens.txt")
      ignore_tokens = list(ignore_file.read().split())

      query_list = []
      xmldoc = minidom.parse('topics.xmls')
      itemlist = xmldoc.getElementsByTagName('topic')
      for s in itemlist:
         qnumber = s.attributes['number'].value
         Q = s.getElementsByTagName('query').item(0).childNodes[0].data.lower()
         Qwords = Q.split(' ')
         stem_words = []
         query_str = ''
         for w in Qwords:
            if w not in ignore_tokens:
               stem_word = stem(w)
               stem_words.append(stem_word)
         query_str = ' '.join(stem_words)
         query = [qnumber, query_str]
         query_list.append(query)

      Rank(query_list)
   else:
      incorrect_usage()
else:
   incorrect_usage()
Example #32
 def stemming(a_list_of_words):        
     #return a list of words
     return [stem(i) for i in a_list_of_words]
Example #33
	def getStemmed (self, word):
		return stem(word)
Example #34
def stemming(lst):
    stem_lst = []
    for wrd in lst:
        stem_lst.append(wrd + "\t" + stem(wrd))
    return stem_lst
Example #35
import re

from stemming import porter


def get_sentence():
    reg = re.compile(r'(.*?[.;:?!])\s([A-Z].*?\.)')
    with open('nlp.txt') as f:
        for l in f:
            l = l.strip()
            match = reg.split(l)
            if len(match) > 0:
                for line in match:
                    if len(line) > 0:
                        yield (line.strip())


if __name__ == '__main__':
    for line in get_sentence():
        print(''.join([
            porter.stem(word.strip(',.')) + '\n' for word in line.split(' ')
        ]))
Example #36
 def stemming(a_list_of_words):
     #return a list of words
     return [stem(i) for i in a_list_of_words]
Example #37
         inrevision = False
         ns = 0
         l_hitext=[]
         dicw={}
     elif tname == 'revision':
         # Do not pick up on revision id's
         inrevision = True
     elif tname == 'redirect':
         dicw={}
         rtitle=elem.attrib;
         if rtitle is not None:
             l_hitext=re.split("[^a-zA-Z]+", rtitle['title'])
         for word in l_hitext:
             word=word.lower()
             if word not in stopword.stopword:
                 word=stem(word)
                 if word not in dicw:
                     dicw[word]=1
                 else:
                     dicw[word]+=1
         for key in dicw:
             if key not in invit:
                 invit[key]=id+'t'+str(dicw[key])+':'
             else:
                 invit[key]=invit[key]+(id+'t'+str(dicw[key])+':')
         l_hitext=[]
         dicw={}
 else:
     if tname == 'title':
         title = elem.text
     elif tname == 'id' and not inrevision:
Example #38
    'either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,'
    'him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,'
    'likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,'
    'on,only,or,other,our,own,rather,said,say,says,she,should,since,so,'
    'some,than,that,the,their,them,then,there,these,they,this,tis,to,too,'
    'twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,'
    'will,with,would,yet,you,your').lower().split(',')


def is_stopword(string):
    return string.lower() in stop_words


if __name__ == '__main__':
    word_counter = Counter()
    with open('sentiment.txt') as f:
        for line in f:
            for word in line[3:].split(' '):
                word = word.strip()
                if is_stopword(word):
                    continue
                word = porter.stem(word)
                if word != '!' and word != '?' and len(word) <= 1:
                    continue

                word_counter.update([word])

    features = [word for word, count in word_counter.items() if count >= 6]
    with open('features.txt', 'w') as f:
        print(*features, sep='\n', file=f)
Example #39
def main():
    for word in sys.stdin:
        word = word.strip()
        s_word = porter.stem(word)
        s2_word = porter2.stem(word)
        print "{}\t{}\t{}".format(word, s_word, s2_word)
#	wor=list(set(words))

target = [x for x in temp if x not in commonwords]
target = gram_1(target)
length_target = sum(target[i][1] for i in range(len(target)))
'''

sentences =[]
for i in range(1,476):
    file=open('C:/Users/prakash chand gupta/Desktop/AI PROJECT/Articles2/'+str(i)+'.txt','r')
    p=file.read().replace("'",' ').replace("`",'').replace(";",'').replace(":",'').replace("!",'.').replace('"',"").replace('\xe2',"").replace('\x80',"").replace('\x9d',"").replace('\x91',"").replace('\x92',"").replace('\x93',"").replace('\x94',"").replace('\x95',"").replace('\x96',"").replace('\x97',"").replace('\x98',"").replace('\x99',"").lower()
    p=unicodedata.normalize('NFKD', unicode(p,errors='ignore')).encode('ascii','ignore')
    temp = split(p)
    temp2 = [x for x in temp if x not in commonwords]
    for i in range(len(temp2)):
        temp2[i]=stem(temp2[i])
    temp3 = [x for x in temp2 if x not in commonwords]
    temp3 = gram_1(temp3)
    sentences.append(temp3)

'''
source_distance=[]
target_distance=[]
'''
'''
for i in range(len(sentences)):
    length_temp = sum(sentences[i][j][1] for j in range(len(sentences[i])))
    count=0
    if len(sentences[i]) < len(source):
        for k in range(len(sentences[i])):
                   for j in range(len(source)):
Example #41
 def _stem(self, term):
     return porter.stem(term)
Example #42
def stemming(lst):
    stem_lst = []
    for wrd in lst:
        stem_lst.append(wrd + "\t" + stem(wrd))
    return stem_lst
        'C:/Users/prakash chand gupta/Desktop/AI PROJECT/Articles2/' + str(i) +
        '.txt', 'r')
    p = file.read().replace("'", ' ').replace("`", '').replace(
        ";", '').replace(":", '').replace("!", '.').replace('"', "").replace(
            '\xe2', "").replace('\x80', "").replace('\x9d', "").replace(
                '\x91', "").replace('\x92', "").replace('\x93', "").replace(
                    '\x94',
                    "").replace('\x95', "").replace('\x96', "").replace(
                        '\x97', "").replace('\x98', "").replace('\x99',
                                                                "").lower()
    p = unicodedata.normalize('NFKD', unicode(p, errors='ignore')).encode(
        'ascii', 'ignore')
    temp = split(p)
    temp2 = [x for x in temp if x not in commonwords]
    for i in range(len(temp2)):
        temp2[i] = stem(temp2[i])
    temp3 = [x for x in temp2 if x not in commonwords]
    temp3 = gram_1(temp3)
    sentences.append(temp3)
'''
source_distance=[]
target_distance=[]
'''
'''
for i in range(len(sentences)):
    length_temp = sum(sentences[i][j][1] for j in range(len(sentences[i])))
    count=0
    if len(sentences[i]) < len(source):
        for k in range(len(sentences[i])):
                   for j in range(len(source)):
                               if sentences[i][k][0] == source[j][0] and sentences[i][k][0]!='':
Example #44
def stemming(paragraph):
    temp = split(paragraph)
    for i in range(len(temp)):
        temp[i] = stem(temp[i])
    temp_x = ' '.join(temp)
    return temp_x
Example #45
def tokenize(doc, stopwords):
    reg = re.compile(r'(\w+(?:\.?\w+)*)')
    return map(lambda t: stem(t),
               filter(lambda t: t not in stopwords, reg.findall(doc.lower())))
Example #46
def analyze_term(term, stopwords):
    if term in stopwords:
        return None
    else:
        return stem(term.lower())
f_docids = open("docids.txt", "w")
f_termids = open("termids.txt", "w")
f_doc_index = open("doc_index.txt", "w")
f_doc_read = open("doc_index.txt", "r")

for file in os.listdir(path):
   docid_str = file + '\t' + str(docid) + '\n'
   f_docids.write(docid_str)
   count = 1
   wordList = tokenize (file)

   doc_words = collections.OrderedDict()
   stem_list = []
   for word in wordList:
      stem_word = stem(word[0])
      if stem_word not in doc_words.keys() and word[0] not in ignore_tokens:
         doc_words[stem_word] = 1
         if stem_word not in all_tokens.keys():
            all_tokens.update({stem_word : termid})
            termid_str = ''.join([stem_word, '\t', str(termid), '\n'])
            f_termids.write(termid_str)
            termid += 1
      stem_list.append(stem_word)

   for token in doc_words:
      doc_idx_str = ''.join([str(docid), '\t', str(all_tokens[token])])
      position = 1
      for s in stem_list:
         if token == s:
            doc_idx_str = ''.join([doc_idx_str.strip(), '\t', str(position)])