def tokenize(file_data):
	file_data = file_data.lower()	#lowercasing
	file_data = re.sub(r"(\w)'(\w)", r'\1~\2', file_data)	#mark intra-word apostrophes with '~'
	file_data = re.sub("'", ' ', file_data)	#drop remaining apostrophes
	file_data = re.sub('[^0-9a-zA-Z~]+', ' ', file_data)	#ignore punctuation, keeping the '~' marker

	file_words = file_data.split()	#split the file into single words

	count = 0
	for word in file_words:
		file_words[count] = stem(stem(word))
		count+=1
	return file_words
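# Minimal usage sketch for tokenize() above (hypothetical input; assumes `re` is
# imported and `stem` is the same Porter stemmer used throughout these examples):
# tokenize("High-speed aircraft at supersonic speeds")
# -> a list of lowercased, double-stemmed tokens such as ['high', 'speed', 'aircraft', ...]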
Example 2
    def getWeightsForQuery(self, query):
        '''
            returns the idf weights of the query terms

            parameters
            ----------
            query : string
                    the query
            returns
            -------
            token_weights: dict of string -> float
                           dictionary mapping each term of the query
                           to its idf weight
        '''
        df_dict = self.indexer.get_df()
        N = self.indexer.N
        tokens = query.split(" ")
        processed_query = [porter.stem(t.lower()) for t in tokens]
        q = list(set(processed_query))  # unique stemmed terms
        token_weights = dict()
        for t in q:
            if t in df_dict:
                token_weights[t] = math.log((1+N)/(1+df_dict[t]))
            else:
                token_weights[t] = 0
        return token_weights
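# Worked example of the smoothed idf formula used above (hypothetical numbers):
# with N = 1000 documents and a term occurring in df = 9 of them,
# idf = log((1 + 1000) / (1 + 9)) = log(100.1) ≈ 4.61.
import math
print(math.log((1 + 1000) / (1 + 9)))   # ≈ 4.606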
Example 3
def replace_word_by_stems(wordList):
  
    stemList = []
    for word in wordList:
        stemList.append(stem(word))

    return set(stemList)
def stem(tokens):
    """
    now we could do this in python. We coudl also call the same function that
    exists server-side, or even run an embedded javascript interpreter. See
    http://groups.google.com/group/mongodb-user/browse_frm/thread/728c4376c3013007/b5ac548f70c8b3ca
    """
    return [porter.stem(tok) for tok in tokens]
Example 5
 def getWeightsForStem(self,stem):
     stem = porter.stem(stem.lower())
     dicoRes=dict()
     indexInverse = self.indexObject.getTfsForDoc(stem)
     for doc in self.index:
         if doc in indexInverse:
             dicoRes[doc]=1+math.log(indexInverse[doc])
     return dicoRes
Example 6
def Query_processing(query):

    query_tokens = tokenizer.tokenize(query)
    query_tokens = [w.casefold() for w in query_tokens]
    query_tokens = [w for w in query_tokens if w not in stop_words]
    query_tokens = [ps.stem(w) for w in query_tokens]

    return query_tokens, []
Example 7
 def e_morphological_feats(self):
     D = {}
     etymology = retrieve_etymology(self.lemma)
     D["e_latin_root"] = has_ancestor_in_lang("lat", etymology)  # check wiktionary
     D["e_length_dist_lemma_form"] = len(self.word) - len(self.lemma)
     stem, steps = porter.stem(self.word)
     D["e_length_dist_stem_form"] = len(self.word) - len(stem)
     D["e_inflectional_morphemes_count"] = steps 
     return D
Example 8
 def getTextRepresentation(self,text):
     tab=re.findall(r"\w+",text,re.UNICODE)
     
     tab=[i.lower() for i in tab]
     
     ret=Counter(tab)
     
     ret={porter.stem(a):b for (a,b) in ret.items()  if a not in self.stopWords}
     return ret
Example 9
 def getTextRepresentation(self,text):
     tab=re.findall(r"\w+",text,re.UNICODE)
     
     tab = [porter.stem(word) for word in tab if word not in self.stopWords]
     ret=Counter(tab)
     
     # Bug in the commented-out version below: stemming after counting maps
     # different inflections (e.g. "program" and "programs") to the same stem,
     # so one count overwrites the other instead of being summed.
     #ret={porter.stem(a):b for (a,b) in ret.items()  if a not in self.stopWords}
     return dict(ret)
Example 10
 def getWeightsForStem(self,stem):
     stem =  porter.stem(stem.lower())
     dicoRes=dict()
     indexInverse = self.indexObject.getTfsForDoc(stem)
     for doc in self.index:
         if doc in indexInverse:
             idf=math.log((1+len(self.index))/(1+len(indexInverse)))
             dicoRes[doc]=(1+math.log(indexInverse[doc]))*idf
     return dicoRes
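# Worked example of the tf-idf weight computed above (hypothetical numbers):
# with len(self.index) = 100 documents, a stem that occurs in 4 of them
# (len(indexInverse) = 4) and a term frequency of 3 in a given document,
# weight = (1 + log(3)) * log((1 + 100) / (1 + 4)) ≈ 2.10 * 3.01 ≈ 6.31.
import math
print((1 + math.log(3)) * math.log((1 + 100) / (1 + 4.0)))   # ≈ 6.31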
Example 11
def parse_query(content, stopwords):
    tempbagofwords = content.replace("-", " ").split()
    bagofwords=[]
    for word in tempbagofwords:
        word = word.lower()
        if word not in stopwords:
            word = porter.stem(word)
            bagofwords.append(word)
    return bagofwords
Example 12
    def query(self, querystring):
        '''
            split the query string into terms, stem each term and collect
            its posting list
        '''
        query_no_puct = cfc_tools.replace_punctuation(querystring)
        query_termlist = cfc_tools.remove_multiple_space(query_no_puct).strip().split()

        acc = list()
        for term in query_termlist:
            term_processed = porter.stem(term)
            acc.append((term, self[term_processed].doclist))  # look up the stemmed term in the index
        return acc
Example 13
    def getTextRepresentation(self, text):
        tab = re.findall(r"\w+", text, re.UNICODE)

        tab = [porter.stem(word.lower()) for word in tab]
        tab = [word for word in tab if word not in self.stopWords]

        ret = Counter(tab)

        return dict(ret)
Example 14
 def e_morphological_feats(self):
     D = {}
     etymology = retrieve_etymology(self.lemma)
     D["e_latin_root"] = has_ancestor_in_lang("lat",
                                              etymology)  # check wiktionary
     D["e_length_dist_lemma_form"] = len(self.word) - len(self.lemma)
     stem, steps = porter.stem(self.word)
     D["e_length_dist_stem_form"] = len(self.word) - len(stem)
     D["e_inflectional_morphemes_count"] = steps
     return D
Example 15
def queryPreprocessing(query):
    """
    Preprocess d'une query sous forme de string pour renvoyer un tableau de mots
    """
    arrayQuery = "".join(c for c in query if c.isalnum() or c.isspace()).split()
    #print(arrayQuery)
    res=[]
    for word in arrayQuery:
        word = porter.stem(word.lower()) #stem the words, since the index stores stemmed terms
        res.append(word)
    return res
Example 16
 def getTextRepresentation(self,text):
     """ Return the stemmed representation of a string.
     :param text: string, input text
     :return: dictionary of {stem: frequency}
     """
     
     tab=re.findall(r"\w+",text,re.UNICODE)
     tab=[word.lower() for word in tab]
     tab = [porter.stem(word) for word in tab if word not in self.stopWords]
     ret=Counter(tab)
     return dict(ret)
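# Minimal usage sketch for the method above (hypothetical call; assumes an object
# `model` whose `stopWords` attribute is a set of stop words, with `re`, `Counter`
# and `porter` imported as in these examples):
# model.getTextRepresentation("the aircraft flies faster than other aircraft")
# -> roughly {'aircraft': 2, 'fli': 1, 'faster': 1}, depending on the stop-word list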
Example 17
def parse_document_content(content, stopwords):
    tempbagofwords = content.replace("-", " ").split()
    bag_of_words_dict=dict()
    global number_of_terms
    for word in tempbagofwords:
        word = word.lower()
        if word not in stopwords:
            number_of_terms += 1
            word = porter.stem(word)
            if word in bag_of_words_dict:
                bag_of_words_dict[word] += 1
            else:
                bag_of_words_dict[word] = 1
    return bag_of_words_dict
def Stemming():

    for Tokenobj in TokenPages:
        Tokenobj.title = [ps.stem(w) for w in Tokenobj.title]
        Tokenobj.infobox = [ps.stem(w) for w in Tokenobj.infobox]
        Tokenobj.category = [ps.stem(w) for w in Tokenobj.category]
        Tokenobj.links = [ps.stem(w) for w in Tokenobj.links]
        Tokenobj.ref = [ps.stem(w) for w in Tokenobj.ref]
        Tokenobj.body = [ps.stem(w) for w in Tokenobj.body]
Example 19
def evaluation_modele_vectoriel(D,qry):
    
    qry = [porter.stem(mot) for mot in qry.split()]
    ind = index(D)
    
    score = dict()
    
    for doc in ind[0]:
        score[doc]=0
    
    for mot in qry:
        for docu in ind[1][mot]:
            score[int(docu)]+=int(ind[1][mot][docu])
            
    return score
Example 20
def evaluation_modele_binaire(D,qry):
    
    ind = index(D)[0]
    
    score = dict()
    for i in ind:
        score[i]=0
        for j in qry.split():
            if porter.stem(j) in ind[i]:
                score[i]+=1
        if len(qry.split())==score[i]:
            score[i] = 1 
        else:
            score[i] = 0
    return score
def input_query():
    _result = []
    qq = []
    q = raw_input("Please Enter Your Query:" + '\n')
    x = int(raw_input("Please enter how many document IDs you want:" + '\n'))
    q = re.findall(r"[\w']+", q)
    for i in range(0, len(q)):
        q[i] = q[i].lower()
        q[i] = porter.stem(q[i])
        if (q[i] in list_stop):
            continue
        qq.append(q[i])
    #print qq #################
    _result = score_phrase(qq, x)
    user(_result)
Example 22
def create_index_doc(nb):
    global allwords
    # Reading document and conversion to lower case
    words = re.findall(r'\w+', open('doc' + str(nb) + '.txt').read().lower())

    # Reading stop words
    stop_words = re.findall(r'\w+', open('stop_words.txt').read().lower())

    # Deleting stop words + normalization
    words = [porter.stem(word) for word in words if word not in stop_words]

    #Saving all the words of all the documents
    allwords = set(list(allwords) + words)

    # Creation of dict
    return dict(collections.Counter(words))
def ParseRawData(data,stopWordFile):
	'''tokenize, stem and remove stop words;
	return the parsed file name
	'''
	logging.info('begin parse raw data')
	parsedDatafile = '/parsedPatent.txt'
	writefile = open(directory + parsedDatafile,'w')
	# open stop word file
	# stopwordlist = [word for word in open(stopWordFile).read().split(',')]
	for line in open(data,'r'):
		stemmed_line =[stem(word) for word in cleanTokenizeText(line)]
		# nostop_word_list = [word for word in stemmed_line if word not in stopwordlist]
		writefile.write(' '.join(stemmed_line)+'\n')
	writefile.close()
	logging.info('finished parsing, store the parsed file into ./TFIDF_LDA/parsedPatent.txt')	
	return parsedDatafile
Example 24
def create_black_list():
    fr=None
    tmp=[]
    fr=open("balcklist_word.txt",'rb')
    #s=[]
    for line in fr:
        line= re.findall(r"[\w']+",line)
        for i in range(0,len(line)):
            line[i]=line[i].lower()
            line[i] = porter.stem(line[i])
            if(line[i] in list_stop):
                continue
            black_list.append(line[i])
            tmp.append(line[i])
        black_list_s.append(tmp)
        tmp=[]
    black_list[:] = OrderedDict.fromkeys(black_list)  # de-duplicate the global list in place, preserving order
Example 25
def Pre_Processing(Abstract):

    # case folding
    Abstract = Abstract.lower()

    #split into words
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    words_tokens = tokenizer.tokenize(Abstract)

    #stop word removal
    stop_words = set(stopwords.words('english'))
    words_tokens = [token for token in words_tokens if token not in stop_words]

    #stemming
    words_tokens = [pt.stem(token) for token in words_tokens]

    return words_tokens
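# Minimal setup sketch for Pre_Processing above (assumes NLTK is installed and the
# English stop-word corpus has been fetched with nltk.download('stopwords')):
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

pt = PorterStemmer()   # the stemmer referenced as `pt` inside Pre_Processing
print(Pre_Processing("Retrieval of relevant documents and stemming"))
# -> e.g. ['retriev', 'relev', 'document', 'stem']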
Example 26
def create_black_list():
    fr = None
    tmp = []
    fr = open("balcklist_word.txt", 'rb')
    #s=[]
    for line in fr:
        line = re.findall(r"[\w']+", line)
        for i in range(0, len(line)):
            line[i] = line[i].lower()
            line[i] = porter.stem(line[i])
            if (line[i] in list_stop):
                continue
            black_list.append(line[i])
            tmp.append(line[i])
        black_list_s.append(tmp)
        tmp = []
    black_list[:] = OrderedDict.fromkeys(black_list)  # de-duplicate the global list in place, preserving order
Example 27
def ParseRawData(data, stopWordFile):
    '''tokenize, stem and remove stop words;
    return the parsed file name
    '''
    logging.info('begin parse raw data')
    parsedDatafile = '/parsedPatent.txt'
    writefile = open(directory + parsedDatafile, 'w')
    # open stop word file
    # stopwordlist = [word for word in open(stopWordFile).read().split(',')]
    for line in open(data, 'r'):
        stemmed_line = [stem(word) for word in cleanTokenizeText(line)]
        # nostop_word_list = [word for word in stemmed_line if word not in stopwordlist]
        writefile.write(' '.join(stemmed_line) + '\n')
    writefile.close()
    logging.info(
        'finished parsing, store the parsed file into ./TFIDF_LDA/parsedPatent.txt'
    )
    return parsedDatafile
Example 28
def Field_Query_Processing(query):

    query_tokens = []
    fields = []

    tokens = query.split(',')
    for token in tokens:
        field_tokens = token.split(':')
        field = field_tokens[0].strip()
        value_tokens = field_tokens[1].split(" ")
        for val in value_tokens:
            fields.append(field)
            query_tokens.append(val.strip())

    query_tokens = [w.casefold() for w in query_tokens]
    query_tokens = [w for w in query_tokens if w not in stop_words]
    query_tokens = [ps.stem(w) for w in query_tokens]

    return query_tokens, fields
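# Usage sketch for the field-query parser above (hypothetical query string; assumes
# `ps`, `stop_words` and the surrounding globals are set up as in the other examples):
# Field_Query_Processing("title:solar power, body:energy storage")
# -> roughly (['solar', 'power', 'energi', 'storag'], ['title', 'title', 'body', 'body'])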
Example 29
def doQuery(query, index, k):
    q = query.split()
    
    #tokenize query
    j = 0
    while(j < len(q)):
        token = tokenize(q[j])
        if(token == ''):
            del q[j]    # do not advance j: the next token has shifted into position j
        else:
            q[j] = porter.stem(token)
            j += 1
    
    #tier1sim is list of tuples of the form (docID, similarity) 
    #similarity is the similarity between document number docID and query q.
    tier1Sim = []
    
    i = 0
    while(i < index.numDocuments):
        tier1Sim.append((i + 1, index.computeSimilarity(q, i + 1, index.tier1)))
        i += 1
        
    sortedSim = sorted(tier1Sim, key = lambda tup: tup[1], reverse = True)
    
    sortedSim = sortedSim[0:k]
    simTier2 = []
    
    i = 0
    while(i < len(sortedSim)):
        docID = sortedSim[i][0]
        simTier2.append((docID, index.computeSimilarity(q, docID, index.tier2)))
        i += 1
        
    resultListTups = sorted(simTier2, key = lambda tup: tup[1], reverse = True)
    
    resultList = []
    i = 0
    while(i < len(resultListTups)):
        resultList.append(resultListTups[i][0])
        i += 1
    
    return resultList
Example 30
    def getTextRepresentation(self, text):
        """
            Permet d'obtenir une représentation d'un texte, On va connaitre le nombre de tous les mots présents dans
        le texte, mot vide exclu

        :type text: String
        :param text: Le texte dont on veux avoir la representation
        :return: un dictionnaire qui reprensente texte, c'est a dire que les mots sont lemmatisé et compté par occurence
                {mot1 : n1, mot2 : n2, ...}
        """
        mots = re.findall(
            r"\w+", text,
            re.UNICODE)  # On recupére une liste tous les mots du texte
        mots = [i.lower() for i in mots]  # On met tous les mots en minuscule

        compte_mots = Counter(mots)
        resultat = {
            porter.stem(a): b
            for (a, b) in compte_mots.items() if a not in self.stopWords
        }

        return resultat
def read_doc(position):
    _str = str()
    with open('DOCUMENT.txt', 'rb') as f:
        f.seek(position)
        line = f.readline()
        end = line.find("</BODY>")
        while (end == -1):
            _str = _str + line
            line = f.readline()
            end = line.find("</BODY>")
        _str = _str + line[:end]
    f.close()
    _string = []
    _str = re.findall(r"[\w']+", _str)
    #print len(_str)
    for i in range(0, len(_str)):
        _str[i] = _str[i].lower()
        _str[i] = porter.stem(_str[i])
        if (_str[i] in list_stop):
            continue
        _string.append(_str[i])
    return _string
Example 32
    def getWeightsForQuery(self, query):
        '''
            returns 0-1 weights for the query terms

            parameters
            ----------
            query : string
                    the query
            returns
            -------
            token_weights: dict of string -> int
                           dictionary mapping each term of the query
                           to the weight 1
        '''
        query_lower = query.lower()
        tokens = list(set(query_lower.split(" ")))
        token_weights = dict()

        for t in tokens:
            token_weights[porter.stem(t)] = 1

        return token_weights
Example 33
def read_doc(position):
    _str=str()
    with open('DOCUMENT.txt','rb') as f:
        f.seek(position)
        line=f.readline()
        end=line.find("</BODY>")
        while(end==-1):
            _str = _str + line
            line=f.readline()
            end=line.find("</BODY>")
        _str = _str + line[:end]
    f.close()
    _string =[]   
    _str = re.findall(r"[\w']+",_str)
    #print len(_str)
    for i in range(0,len(_str)):
        _str[i]= _str[i].lower()
        _str[i] = porter.stem(_str[i])
        if(_str[i] in list_stop):
            continue
        _string.append(_str[i])       
    return _string
Example 34
    def getWeightsForQuery(self, query):
        '''
            returns the tf weights of the query terms

            parameters
            ----------
            query : string
                    the query
            returns
            -------
            token_weights: dict of string -> int
                           dictionary mapping each term of the query
                           to its tf weight
        '''
        tokens = query.split(" ")
        processed_query = [porter.stem(t.lower()) for t in tokens]
        token_weights = dict(collections.Counter(processed_query))
        return token_weights
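# Quick illustration of the tf counting above on hypothetical, already-stemmed tokens:
import collections
print(dict(collections.Counter(['fli', 'fli', 'machin'])))   # {'fli': 2, 'machin': 1}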
Example 35
def processPhrase(query, negFlag):
	# global docList
	global docID
	global inv_index
	global rank
	global STOPWORDS
	global phrase_pos
	word_docs={}
	phrase_words=query.split()	#split the phrase into single words
	deletionList=[]
	for find_stop in phrase_words:	#remove stopwords from the phrase
		if find_stop in STOPWORDS:
			deletionList.append(find_stop)
	for find_stop in deletionList:
		phrase_words.remove(find_stop)
	count=0
	for word in phrase_words:
		phrase_words[count]=stem(stem(word))
		count+=1
	phrase_length=len(phrase_words)
	for word in phrase_words:
		word=stem(stem(word))
		if word not in inv_index:
			return []
		if word in inv_index:
			for docs in inv_index[word]:
				if word not in word_docs:
					word_docs[word]=[docs[0]]
				else:
					word_docs[word].append(docs[0])

	combinedList=[]	#stores documents in which all words of the phrase are present
	actualList=[]	#stores documents in which all words are present at adjacent positions
	count=0
	for docs in word_docs.values():	#forming the combined list
		if count == 0:
			combinedList=docs
		else:
			combinedList=list(set(combinedList) & set(docs))
		count+=1

	for doc in combinedList:	#processing docs in the combined list for adjacent positions of words
		phrase_count=0
		filename=docID[doc]
		words=getDocWords(filename)
		word=phrase_words[0]
		postings=inv_index[word]
		for entry in postings:	#takes a word in the phrase and checks for adjacent words in the document
			if entry[0] != doc:
				continue
			phrase_start=0
			for position in entry[1]:
				check="true"
				for i in range(0,phrase_length):	#iterates over phrase length to check adjacent positions
					if position+i<len(words):
						if stem(stem(words[position+i])) != phrase_words[i]:
							check="false"
							break
						elif position+i<len(words) and stem(stem(words[position+i])) == phrase_words[i]:
							continue
					else:
						check="false"
				if check == "true":
					if doc not in actualList:
						actualList.append(doc)
					phrase_start=position
					phrase_count+=1
		if phrase_count != 0:	#negation flag is to handle negation of phrases
			if negFlag == "false":
				if not doc in phrase_pos:	#rank documents other than the phrase for negation queries
					phrase_pos[doc]=phrase_start #position of the phrase for snippets
				if doc not in rank:
					rank[doc]=phrase_count*len(phrase_words)
				else:
					rank[doc]+=(phrase_count*len(phrase_words))
		# if phrase_count==0:
		# 	combinedList.remove(doc)
	#the result is the intersection of the real list and combined list
	combinedList=intersect(actualList,combinedList)	
	return combinedList
Example 36
def main_dictionary():
    m=[]
    current_milli_time = lambda: int(round(time.time() * 1000))
    num_doc=[]
    doc_id=1
    b_start = 0
    b_end = 0
    tmp=str()
    total_temp_files=0
    #num_line=0
    last_tick=current_milli_time()
    print 'reading document file...'
    total_records=0
    
    fr = open('DOCUMENT.txt','r')
    ss=fr.tell()
    num_line=len(fr.readlines())
    fr.seek(ss)
    position=ss
    line=fr.readline()
    _num=0
    while(_num<num_line):
        if (b_start==0):
            start=line.find("<BODY>")
            if (start > -1):
                tmp = tmp + line[start+6:]
                pp=position
                b_start = 1
                
        if (b_start==1):
            end=line.find("</BODY>")
            if(end == -1):
                tmp = tmp + line
            if (end != -1):
                tmp = tmp + line[:end]
                b_start = 0
                b_end =1
            
        if(b_end == 1):
            tmp = re.findall(r"[\w']+",tmp)
            nd=0
            for i in range(0,len(tmp)):
                tmp[i]=tmp[i].lower()
                tmp[i] = porter.stem(tmp[i])
                if(tmp[i] in list_stop):
                    continue
                m.append((tmp[i],[(doc_id,i)]))
                nd+=1
            num_doc.append((pp,nd))
            b_end=0
            tmp=str()
            doc_id = doc_id + 1
        if sys.getsizeof(m)>4*1024*1024:
            total_temp_files=total_temp_files+1
            fname='partial-temp-'+str(total_temp_files)+'.dat'
            print 'saving temporary file ',fname,'...'
            m=sort_data(m)
            write_file(fname,m,int(0),len(m))
            total_records=total_records+len(m)
            m=[]
        position=fr.tell()
        line=fr.readline()
        _num+=1
    total_temp_files=total_temp_files+1
    fname='partial-temp-'+str(total_temp_files)+'.dat'
    print 'saving temporary file ',fname,'...'
    m=sort_data(m)
    write_file(fname,m,int(0),len(m))
    total_records=total_records+len(m)
    m=[]

    now_tick=current_milli_time()
    print 'read ',total_records,' records in ',(now_tick-last_tick),' ms.',' and saved ',total_temp_files,' temp files...'
    last_tick=now_tick

    if total_temp_files==1:
        os.rename('partial-temp-1.dat','inverted-index.dat')
    else:
        merge_files('partial-temp-1.dat','partial-temp-2.dat','merge-temp.dat')
        os.remove('partial-temp-1.dat')
        os.remove('partial-temp-2.dat')
        for i in range(3,total_temp_files+1):
            os.rename('merge-temp.dat','merge-temp-2.dat')
            print 'merging partial-temp-',str(i),'.dat...'
            merge_files('merge-temp-2.dat','partial-temp-'+str(i)+'.dat','merge-temp.dat')
            os.remove('merge-temp-2.dat')
            os.remove('partial-temp-'+str(i)+'.dat')
        os.rename('merge-temp.dat','inverted-index.dat')
        now_tick=current_milli_time()
        print 'merged inverted-index.dat in ',(now_tick-last_tick),' ms.'
        last_tick=now_tick
            
        
    print 'done!'

    unmerged_files()
    num_document(num_doc)
    print 'done'
Example 37
# -*- coding: utf-8 -*-
"""
common.py: provides common functions for 'text' module.
"""

import porter
import re

re_sentence = re.compile(r'[^?!.;]+[?!.;]?\n?')
re_word = re.compile(r"[\w]+")

strip_html = lambda s: re.sub(r'</?[^<>]*/?>', ' ', s)

get_stem = lambda w: porter.stem(w.lower())


def remove_camel_case(text):
    text = re.sub(r'([a-z])([A-Z])', '\\1 \\2', text)
    text = re.sub(r'([0-9])([a-zA-Z])', '\\1 \\2', text)
    text = re.sub(r'([a-zA-Z])([0-9])', '\\1 \\2', text)
    return text
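# A quick illustration of the three substitutions above (hypothetical input):
# remove_camel_case("porterStemmer2000x") -> "porter Stemmer 2000 x"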


def is_bad(word):
    return re.match('^[0-9]+', word) is not None


def iter_words(text):

    sentences = re.finditer(re_sentence, text)
Example 38
def extract_terms(text):
    term_list = [stem(term)
                 for term in splitter.split(text.lower()) 
                 if ((len(term)>1) and (term not in STOP_WORDS))]
    return list(frozenset(term_list))
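# Minimal sketch of the module-level helpers this function relies on (hypothetical
# definitions, shown only to make the example self-contained):
import re
from porter import stem
splitter = re.compile(r'[^a-z0-9]+')        # split on runs of non-alphanumerics
STOP_WORDS = {'the', 'of', 'and', 'a', 'in'}
# extract_terms("The stemming of words") -> a list such as ['stem', 'word']
# (order unspecified, since the terms round-trip through a frozenset)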
Example 39
def countWord(doc):
    return dict(Counter([porter.stem(i.lower()) for i in doc.split() if i.lower() not in motvide]))
Example 40
def counter(phrase):
    porter_stemer = textRepresenter.PorterStemmer()
    #or textRepresenter.getTextRepresentation
    l = (porter.stem(w.lower()) for w in phrase.split(" ") if w != "")
    l = (w for w in l if w not in porter_stemer.stopWords)
    return dict(collections.Counter(l).items())
def tokenize(word):
    regex = re.compile('[^a-zA-Z]+')
    w = regex.sub('', word)
    w = w.lower()
    w = porter.stem(w)
    return w
Example 42
def main():
	global docID
	global inv_index
	global path
	global phraseList
	global query_result
	global docList
	global negList
	global tf
	global df
	global allDocs
	global rank
	global phrase_pos
	global phraseList
	global snippetDone
	while(True):
		loadIndex()
		path = inv_index['cranfield_corpus_path']
		loadDocID()
		query=raw_input("\nEnter your query or a special command (tf/df/freq/title/doc/author/similar) (Enter goodbye to exit)\n")
		start = time.time()
		makeAllDocList()
		#convert entire query to lower case
		query = query.lower()
		if query == 'goodbye':
			print "Thank you for using this information retrieval system.."
			break
		#shlex is used to split phrases and words separately
		query_words=shlex.split(query)
		count = 0
		#stemming of the query
		for word in query_words:
			if " " not in word:
				query_words[count] = stem(stem(word))
			count+=1
		phraseList=[]
		negPhraseList=[]
		#process special query 'similar'
		if query_words[0] == 'similar':
			flag="false"
			for word in inv_index.keys():
				dist = levenshtein(query_words[1],word)
				if dist==1:
					flag="true"
					print word
			if flag == "false":
				print "No similar words found"
			query_result={}	#stores the result of the query for further processing
			docList=[]
			df={}	
			negList=[]	
			tf={}	
			allDocs=[]	
			rank={}	
			phrase_pos={}	
			phraseList=[]	
			snippetDone=[]
			continue
		#process special query tf/df/freq/doc/title/author
		if query_words[0] == 'tf' or query_words[0] == 'df' or query_words[0] == 'freq' or query_words[0] == 'doc' or query_words[0] == 'titl' or query_words[0] == 'author':
			negFlag="false"
			specialQuery(query_words)
			query_result={}	#stores the result of the query for further processing
			docList=[]
			df={}	
			negList=[]	
			tf={}	
			allDocs=[]	
			rank={}	
			phrase_pos={}	
			phraseList=[]	
			snippetDone=[]
			continue
		#process each word in the query
		for word in query_words:
			#single word
			if word in inv_index and word[0] != '!':
				postings=inv_index[word]
				docListing(word, postings)
			#single word with negation
			elif word[0]=='!' and word[1:] in inv_index and " " not in word:
				postings=inv_index[word[1:]]
				negativeListing(word[1:], postings)
			#phrase with negation
			elif " " in word and word[0]=='!':
				negPhraseList=negPhraseList+processPhrase(word[1:], "true")
			#phrase without negation
			elif " " in word:
				phraseList=phraseList+processPhrase(word, "false")
		#take set difference with universal set for phrase negation
		if len(negPhraseList) != 0:
			negPhraseList=difference(allDocs,negPhraseList)
			for doc in negPhraseList:
				if doc not in rank:
					rank[doc]=1
				else: 
					rank[doc]+=1
		#set operations on results
		docSet=posUnion() # Set A
		docSet=list(set(docSet) | set(phraseList)) # A = A U B
		negSet=negUnion()	#Set C
		negSet=list(set(negSet) | set(negPhraseList))	#C = C U D
		resultSet=union(docSet,negSet)	# RESULT = A U C = (A U B) U (C U D)
		#Display snippets if results have been found
		if len(resultSet)!=0:
			displaySnippets(resultSet, query_words)
		else:
			print "No match found!"
		print "Number of results:", len(resultSet)
		# print resultSet
		end = time.time()
		print "\nTook", (end-start), "seconds to give the results"

		#Re-initialize global variables for the next query
		query_result={}	#stores the result of the query for further processing
		docList=[]
		df={}	
		negList=[]	
		tf={}	
		allDocs=[]	
		rank={}	
		phrase_pos={}	
		phraseList=[]	
		snippetDone=[]
Example 43
def textRank(df, docNum, abstractLen):

    #Sentence tokenizer
    s = str(df[docNum])
    sentences = sent_tokenize(str(s))
    bagOfWords = {}
    i = 0

    #For each sentence, clean
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        finalTokens = {}
        for token in tokens:
            token = token.casefold()
            #Bag of Words
            if token not in stopWords and (token.isalpha()
                                           or token.isnumeric()):
                token = porter.stem(token)
                finalTokens[token] = 1
        if len(finalTokens) > 0:
            bagOfWords[i] = finalTokens
        i += 1

    #Matrix N*N (N = no. of sentences)
    #Jaccard similarity for each sentence with each sentence (i,j)
    lenSecSent = 0
    i = 0
    j = 0
    similarityMatrix = np.zeros((len(sentences), len(sentences)))
    for sentence in bagOfWords:
        BOW1 = bagOfWords[sentence]
        for secondSentence in bagOfWords:
            if sentence != secondSentence:
                BOW2 = bagOfWords[secondSentence]
                lenSecSent = len(BOW2)
                j = 0
                for token in BOW1:
                    if token in BOW2:
                        j += 1
            similarityMatrix[sentence][secondSentence] = j / (len(BOW1) +
                                                              lenSecSent - j)
            similarityMatrix[secondSentence][sentence] = j / (len(BOW1) +
                                                              lenSecSent - j)

    #Page rank function
    similarityMatrix = scipy.sparse.csr_matrix(similarityMatrix)
    nx_graph = nx.from_scipy_sparse_matrix(similarityMatrix)
    scores = pagerank(nx_graph, max_iter=1000)

    #Sorting for top documents
    rankedSentences = sorted(((scores[i], s) for i, s in enumerate(sentences)),
                             reverse=True)

    abstract = ""
    i = 0
    for s in rankedSentences:
        if i < abstractLen:
            # abstract.append(rankedSentences[i][1])
            abstract += rankedSentences[i][1] + " "
        i += 1
    return abstract
def main_dictionary():
    m = []
    current_milli_time = lambda: int(round(time.time() * 1000))
    num_doc = []
    doc_id = 1
    b_start = 0
    b_end = 0
    tmp = str()
    total_temp_files = 0
    #num_line=0
    last_tick = current_milli_time()
    print 'reading document file...'
    total_records = 0

    fr = open('DOCUMENT.txt', 'r')
    ss = fr.tell()
    num_line = len(fr.readlines())
    fr.seek(ss)
    position = ss
    line = fr.readline()
    _num = 0
    while (_num < num_line):
        if (b_start == 0):
            start = line.find("<BODY>")
            if (start > -1):
                tmp = tmp + line[start + 6:]
                pp = position
                b_start = 1

        if (b_start == 1):
            end = line.find("</BODY>")
            if (end == -1):
                tmp = tmp + line
            if (end != -1):
                tmp = tmp + line[:end]
                b_start = 0
                b_end = 1

        if (b_end == 1):
            tmp = re.findall(r"[\w']+", tmp)
            nd = 0
            for i in range(0, len(tmp)):
                tmp[i] = tmp[i].lower()
                tmp[i] = porter.stem(tmp[i])
                if (tmp[i] in list_stop):
                    continue
                m.append((tmp[i], [(doc_id, i)]))
                nd += 1
            num_doc.append((pp, nd))
            b_end = 0
            tmp = str()
            doc_id = doc_id + 1
        if sys.getsizeof(m) > 4 * 1024 * 1024:
            total_temp_files = total_temp_files + 1
            fname = 'partial-temp-' + str(total_temp_files) + '.dat'
            print 'saving temporary file ', fname, '...'
            m = sort_data(m)
            write_file(fname, m, int(0), len(m))
            total_records = total_records + len(m)
            m = []
        position = fr.tell()
        line = fr.readline()
        _num += 1
    total_temp_files = total_temp_files + 1
    fname = 'partial-temp-' + str(total_temp_files) + '.dat'
    print 'saving temporary file ', fname, '...'
    m = sort_data(m)
    write_file(fname, m, int(0), len(m))
    total_records = total_records + len(m)
    m = []

    now_tick = current_milli_time()
    print 'read ', total_records, ' records in ', (
        now_tick -
        last_tick), ' ms.', ' and saved ', total_temp_files, ' temp files...'
    last_tick = now_tick

    if total_temp_files == 1:
        os.rename('partial-temp-1.dat', 'inverted-index.dat')
    else:
        merge_files('partial-temp-1.dat', 'partial-temp-2.dat',
                    'merge-temp.dat')
        os.remove('partial-temp-1.dat')
        os.remove('partial-temp-2.dat')
        for i in range(3, total_temp_files + 1):
            os.rename('merge-temp.dat', 'merge-temp-2.dat')
            print 'merging partial-temp-', str(i), '.dat...'
            merge_files('merge-temp-2.dat', 'partial-temp-' + str(i) + '.dat',
                        'merge-temp.dat')
            os.remove('merge-temp-2.dat')
            os.remove('partial-temp-' + str(i) + '.dat')
        os.rename('merge-temp.dat', 'inverted-index.dat')
        now_tick = current_milli_time()
        print 'merged inverted-index.dat in ', (now_tick - last_tick), ' ms.'
        last_tick = now_tick

    print 'done!'

    unmerged_files()
    num_document(num_doc)
    print 'done'
Example 45
def apply_stem(termslist):
    '''replace each term of the list with its Porter stem, in place'''
    for i, term in enumerate(termslist):
        termslist[i] = porter.stem(term)
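# Usage sketch for apply_stem above (assumes `porter` is the same stemmer module
# used throughout these examples):
# terms = ['stemming', 'algorithms']
# apply_stem(terms)    # terms is modified in place
# print(terms)         # e.g. ['stem', 'algorithm']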