def tokenize(file_data):
    file_data = file_data.lower()                        # lowercase
    file_data = re.sub('[^0-9a-zA-Z]+', ' ', file_data)  # replace punctuation and other non-alphanumerics with spaces
    file_words = file_data.split()                       # split the text into single words
    # double-stem each word so query terms match the doubly-stemmed index
    for count, word in enumerate(file_words):
        file_words[count] = stem(stem(word))
    return file_words
def getWeightsForQuery(self, query):
    '''
    Returns the idf weights of the query terms.

    Parameters
    ----------
    query : string
        the query

    Returns
    -------
    token_weights : dict of string -> float
        dictionary mapping each query term to its idf weight
    '''
    df_dict = self.indexer.get_df()
    N = self.indexer.N
    tokens = query.split(" ")
    processed_query = [porter.stem(t.lower()) for t in tokens]
    token_weights = dict()
    for t in processed_query:
        if t in df_dict:
            token_weights[t] = math.log((1 + N) / (1 + df_dict[t]))
        else:
            token_weights[t] = 0
    return token_weights
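# A minimal standalone sketch of the smoothed idf formula used in
# getWeightsForQuery above, idf(t) = log((1 + N) / (1 + df(t))); the collection
# size N and the df values below are made-up, illustrative numbers.
import math

def smoothed_idf(N, df):
    # the +1 in numerator and denominator avoids division by zero for unseen terms
    return math.log((1 + N) / (1 + df))

for term, df in [("engine", 12), ("the", 990), ("zyzzyva", 0)]:
    print(term, round(smoothed_idf(1000, df), 3))
# rare terms get large weights, very frequent terms get weights close to 0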
def replace_word_by_stems(wordList):
    stemList = []
    for word in wordList:
        stemList.append(stem(word))
    return set(stemList)
def stem(tokens):
    """
    We can do this in Python. We could also call the same function that exists
    server-side, or even run an embedded JavaScript interpreter.
    See http://groups.google.com/group/mongodb-user/browse_frm/thread/728c4376c3013007/b5ac548f70c8b3ca
    """
    return [porter.stem(tok) for tok in tokens]
def getWeightsForStem(self, stem):
    stem = porter.stem(stem.lower())
    dicoRes = dict()
    indexInverse = self.indexObject.getTfsForDoc(stem)
    for doc in self.index:
        if doc in indexInverse:
            dicoRes[doc] = 1 + math.log(indexInverse[doc])
    return dicoRes
def Query_processing(query):
    query_tokens = tokenizer.tokenize(query)
    query_tokens = [w.casefold() for w in query_tokens]
    query_tokens = [w for w in query_tokens if w not in stop_words]
    query_tokens = [ps.stem(w) for w in query_tokens]
    return query_tokens, []
def e_morphological_feats(self):
    D = {}
    etymology = retrieve_etymology(self.lemma)
    D["e_latin_root"] = has_ancestor_in_lang("lat", etymology)  # check wiktionary
    D["e_length_dist_lemma_form"] = len(self.word) - len(self.lemma)
    stem, steps = porter.stem(self.word)
    D["e_length_dist_stem_form"] = len(self.word) - len(stem)
    D["e_inflectional_morphemes_count"] = steps
    return D
def getTextRepresentation(self, text):
    tab = re.findall(r"\w+", text, re.UNICODE)
    tab = [i.lower() for i in tab]
    ret = Counter(tab)
    ret = {porter.stem(a): b for (a, b) in ret.items() if a not in self.stopWords}
    return ret
def getTextRepresentation(self, text):
    tab = re.findall(r"\w+", text, re.UNICODE)
    tab = [porter.stem(word) for word in tab if word not in self.stopWords]
    ret = Counter(tab)
    # Counting before stemming loses counts: "programs" and "program" both stem
    # to "program", so the commented-out version keeps only one of the counts.
    # ret = {porter.stem(a): b for (a, b) in ret.items() if a not in self.stopWords}
    return dict(ret)
def getWeightsForStem(self, stem):
    stem = porter.stem(stem.lower())
    dicoRes = dict()
    indexInverse = self.indexObject.getTfsForDoc(stem)
    for doc in self.index:
        if doc in indexInverse:
            idf = math.log((1 + len(self.index)) / (1 + len(indexInverse)))
            dicoRes[doc] = (1 + math.log(indexInverse[doc])) * idf
    return dicoRes
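# A standalone sketch of the (1 + log tf) * idf weight computed per document in
# getWeightsForStem above; tf, N and df are illustrative values, not taken from
# any real index.
import math

def log_tf_idf(tf, N, df):
    # dampened term frequency times smoothed inverse document frequency
    return (1 + math.log(tf)) * math.log((1 + N) / (1 + df))

print(round(log_tf_idf(tf=3, N=1000, df=12), 3))   # roughly 2.1 * 4.34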
def parse_query(content, stopwords):
    tempbagofwords = content.replace("-", " ").split()
    bagofwords = []
    for word in tempbagofwords:
        word = word.lower()
        if word not in stopwords:
            word = porter.stem(word)
            bagofwords.append(word)
    return bagofwords
def query(self, querystring):
    '''Look up each stemmed query term and collect its posting list.'''
    query_no_puct = cfc_tools.replace_punctuation(querystring)
    query_termlist = cfc_tools.remove_multiple_space(query_no_puct).strip().split()
    acc = list()
    for term in query_termlist:
        term_processed = porter.stem(term)
        acc.append((term, self[term_processed].doclist))
    return acc
def getTextRepresentation(self, text):
    tab = re.findall(r"\w+", text, re.UNICODE)
    tab = [porter.stem(word.lower()) for word in tab]
    tab = [word for word in tab if word not in self.stopWords]
    ret = Counter(tab)
    ret = {a: b for (a, b) in ret.items()}
    return ret
def queryPreprocessing(query):
    """
    Preprocess a query given as a string and return a list of words.
    """
    arrayQuery = "".join(c for c in query if c.isalnum() or c.isspace()).split()
    #print(arrayQuery)
    res = []
    for word in arrayQuery:
        word = porter.stem(word.lower())  # stem the words because the index stores stems
        res.append(word)
    return res
def getTextRepresentation(self, text):
    """
    Return the stemmed representation of a string.

    :param text: string, input text
    :return: dictionary of {stem: frequency}
    """
    tab = re.findall(r"\w+", text, re.UNICODE)
    tab = [word.lower() for word in tab]
    tab = [porter.stem(word) for word in tab if word not in self.stopWords]
    ret = Counter(tab)
    ret = {a: b for (a, b) in ret.items()}
    return ret
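# A standalone version of the {stem: frequency} representation built by the
# getTextRepresentation variants above, assuming NLTK's PorterStemmer and a tiny
# hand-picked stopword set in place of the classes' own porter module and
# self.stopWords.
import re
from collections import Counter
from nltk.stem.porter import PorterStemmer

STOPWORDS = {"the", "a", "of", "and"}
stemmer = PorterStemmer()

def text_representation(text):
    words = [w.lower() for w in re.findall(r"\w+", text, re.UNICODE)]
    stems = [stemmer.stem(w) for w in words if w not in STOPWORDS]
    return dict(Counter(stems))

print(text_representation("The programs and the program of programming"))
# e.g. {'program': 3}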
def parse_document_content(content, stopwords):
    tempbagofwords = content.replace("-", " ").split()
    bag_of_words_dict = dict()
    global number_of_terms
    for word in tempbagofwords:
        word = word.lower()
        if word not in stopwords:
            number_of_terms += 1
            word = porter.stem(word)
            if word in bag_of_words_dict:
                bag_of_words_dict[word] += 1
            else:
                bag_of_words_dict[word] = 1
    return bag_of_words_dict
def Stemming():
    for Tokenobj in TokenPages:
        Tokenobj.title = [ps.stem(w) for w in Tokenobj.title]
        Tokenobj.infobox = [ps.stem(w) for w in Tokenobj.infobox]
        Tokenobj.category = [ps.stem(w) for w in Tokenobj.category]
        Tokenobj.links = [ps.stem(w) for w in Tokenobj.links]
        Tokenobj.ref = [ps.stem(w) for w in Tokenobj.ref]
        Tokenobj.body = [ps.stem(w) for w in Tokenobj.body]
def evaluation_modele_vectoriel(D, qry):
    qry = [porter.stem(mot) for mot in qry.split()]
    ind = index(D)
    score = dict()
    for doc in ind[0]:
        score[doc] = 0
    for mot in qry:
        for docu in ind[1][mot]:
            score[int(docu)] += int(ind[1][mot][docu])
    return score
def evaluation_modele_binaire(D, qry):
    ind = index(D)[0]
    score = dict()
    for i in ind:
        score[i] = 0
        for j in qry.split():
            if porter.stem(j) in ind[i]:
                score[i] += 1
        if len(qry.split()) == score[i]:
            score[i] = 1
        else:
            score[i] = 0
    return score
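# A small self-contained illustration of the binary model evaluated above: a
# document scores 1 only if every stemmed query term occurs in it, 0 otherwise.
# The stemmer and the toy index (doc id -> set of stems) are assumptions for the
# example, not structures from the source code.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def binary_scores(index, query):
    terms = [stemmer.stem(w) for w in query.lower().split()]
    return {doc: int(all(t in stems for t in terms)) for doc, stems in index.items()}

toy_index = {1: {"engin", "jet", "nois"}, 2: {"jet", "wing"}}
print(binary_scores(toy_index, "jet engines"))   # {1: 1, 2: 0}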
def input_query():
    _result = []
    qq = []
    q = raw_input("Please enter your query:" + '\n')
    x = int(raw_input("Please enter how many document ids you want:" + '\n'))
    q = re.findall(r"[\w']+", q)
    for i in range(0, len(q)):
        q[i] = q[i].lower()
        q[i] = porter.stem(q[i])
        if (q[i] in list_stop):
            continue
        qq.append(q[i])
    #print qq
    #################
    _result = score_phrase(qq, x)
    user(_result)
def create_index_doc(nb):
    global allwords
    # Reading document and conversion to lower case
    words = re.findall(r'\w+', open('doc' + str(nb) + '.txt').read().lower())
    # Reading stop words
    stop_words = re.findall(r'\w+', open('stop_words.txt').read().lower())
    # Deleting stop words + normalization
    words = [porter.stem(word) for word in words if word not in stop_words]
    # Saving all the words of all the documents
    allwords = set(list(allwords) + words)
    # Creation of dict
    return dict(collections.Counter(words))
def ParseRawData(data, stopWordFile):
    '''Tokenize, stem, and remove stop words.
    Return the parsed file name.
    '''
    logging.info('begin parse raw data')
    parsedDatafile = '/parsedPatent.txt'
    writefile = open(directory + parsedDatafile, 'w')
    # open stop word file
    # stopwordlist = [word for word in open(stopWordFile).read().split(',')]
    for line in open(data, 'r'):
        stemmed_line = [stem(word) for word in cleanTokenizeText(line)]
        # nostop_word_list = [word for word in stemmed_line if word not in stopwordlist]
        writefile.write(' '.join(stemmed_line) + '\n')
    writefile.close()
    logging.info('finished parsing, store the parsed file into ./TFIDF_LDA/parsedPatent.txt')
    return parsedDatafile
def create_black_list():
    global black_list
    tmp = []
    fr = open("balcklist_word.txt", 'rb')
    #s=[]
    for line in fr:
        line = re.findall(r"[\w']+", line)
        for i in range(0, len(line)):
            line[i] = line[i].lower()
            line[i] = porter.stem(line[i])
            if (line[i] in list_stop):
                continue
            black_list.append(line[i])
            tmp.append(line[i])
        black_list_s.append(tmp)
        tmp = []
    # deduplicate while preserving order
    black_list = list(OrderedDict.fromkeys(black_list))
def Pre_Processing(Abstract):
    # case folding
    Abstract = Abstract.lower()
    # split into words
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    words_tokens = tokenizer.tokenize(Abstract)
    # stop word removal
    stop_words = set(stopwords.words('english'))
    words_tokens = [token for token in words_tokens if token not in stop_words]
    # stemming
    words_tokens = [pt.stem(token) for token in words_tokens]
    return words_tokens
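# A quick usage sketch for Pre_Processing above, assuming the NLTK imports the
# function relies on (RegexpTokenizer, stopwords, and a PorterStemmer bound to
# the name pt); the sample abstract is invented.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

pt = PorterStemmer()

sample = "Experimental investigation of the aerodynamics of a wing in a slipstream."
print(Pre_Processing(sample))
# e.g. ['experiment', 'investig', 'aerodynam', 'wing', 'slipstream']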
def Field_Query_Processing(query):
    query_tokens = []
    fields = []
    tokens = query.split(',')
    for token in tokens:
        field_tokens = token.split(':')
        field = field_tokens[0].strip()
        value_tokens = field_tokens[1].split(" ")
        for val in value_tokens:
            fields.append(field)
            query_tokens.append(val.strip())
    query_tokens = [w.casefold() for w in query_tokens]
    query_tokens = [w for w in query_tokens if w not in stop_words]
    query_tokens = [ps.stem(w) for w in query_tokens]
    return query_tokens, fields
def doQuery(query, index, k):
    q = query.split()  # tokenize query
    j = 0
    while (j < len(q)):
        token = tokenize(q[j])
        if (token == ''):
            del q[j]
            j -= 1
        else:
            q[j] = porter.stem(token)
        j += 1
    # tier1Sim is a list of tuples of the form (docID, similarity);
    # similarity is the similarity between document number docID and query q.
    tier1Sim = []
    i = 0
    while (i < index.numDocuments):
        tier1Sim.append((i + 1, index.computeSimilarity(q, i + 1, index.tier1)))
        i += 1
    sortedSim = sorted(tier1Sim, key=lambda tup: tup[1], reverse=True)
    sortedSim = sortedSim[0:k]
    simTier2 = []
    i = 0
    while (i < len(sortedSim)):
        docID = sortedSim[i][0]
        simTier2.append((docID, index.computeSimilarity(q, docID, index.tier2)))
        i += 1
    resultListTups = sorted(simTier2, key=lambda tup: tup[1], reverse=True)
    resultList = []
    i = 0
    while (i < len(resultListTups)):
        resultList.append(resultListTups[i][0])
        i += 1
    return resultList
def getTextRepresentation(self, text):
    """
    Build a representation of a text: the count of every word occurring in it,
    stop words excluded.

    :type text: String
    :param text: the text whose representation we want
    :return: a dictionary representing the text, i.e. the words are stemmed and
             counted by occurrence {word1: n1, word2: n2, ...}
    """
    mots = re.findall(r"\w+", text, re.UNICODE)  # collect every word of the text
    mots = [i.lower() for i in mots]             # lower-case every word
    compte_mots = Counter(mots)
    resultat = {
        porter.stem(a): b
        for (a, b) in compte_mots.items() if a not in self.stopWords
    }
    return resultat
def read_doc(position):
    _str = str()
    with open('DOCUMENT.txt', 'rb') as f:
        f.seek(position)
        line = f.readline()
        end = line.find("</BODY>")
        while (end == -1):
            _str = _str + line
            line = f.readline()
            end = line.find("</BODY>")
        _str = _str + line[:end]
    _string = []
    _str = re.findall(r"[\w']+", _str)
    #print len(_str)
    for i in range(0, len(_str)):
        _str[i] = _str[i].lower()
        _str[i] = porter.stem(_str[i])
        if (_str[i] in list_stop):
            continue
        _string.append(_str[i])
    return _string
def getWeightsForQuery(self, query):
    '''
    Returns 0-1 weights for the query terms.

    Parameters
    ----------
    query : string
        the query

    Returns
    -------
    token_weights: dict of string -> int
        dictionary mapping each query term to the weight 1
    '''
    query_lower = query.lower()
    tokens = list(set(query_lower.split(" ")))
    token_weights = dict()
    for t in tokens:
        token_weights[porter.stem(t)] = 1
    return token_weights
def getWeightsForQuery(self, query):
    '''
    Returns the tf weights of the query terms.

    Parameters
    ----------
    query : string
        the query

    Returns
    -------
    token_weights: dict of string -> int
        dictionary mapping each query term to its tf weight
    '''
    tokens = query.split(" ")
    processed_query = [porter.stem(t.lower()) for t in tokens]
    c = collections.Counter(processed_query)
    tokens = np.array(list(c.keys()))
    token_counts = np.array(list(c.values()))
    token_weights = dict()
    for i in range(len(tokens)):
        token_weights[tokens[i]] = token_counts[i]
    return token_weights
def processPhrase(query, negFlag):
    # global docList
    global docID
    global inv_index
    global rank
    global STOPWORDS
    global phrase_pos
    word_docs = {}
    phrase_words = query.split()  # split the phrase into single words
    deletionList = []
    for find_stop in phrase_words:  # remove stopwords from the phrase
        if find_stop in STOPWORDS:
            deletionList.append(find_stop)
    for find_stop in deletionList:
        phrase_words.remove(find_stop)
    count = 0
    for word in phrase_words:
        phrase_words[count] = stem(stem(word))
        count += 1
    phrase_length = len(phrase_words)
    for word in phrase_words:
        word = stem(stem(word))
        if word not in inv_index:
            return []
        if word in inv_index:
            for docs in inv_index[word]:
                if word not in word_docs:
                    word_docs[word] = [docs[0]]
                else:
                    word_docs[word].append(docs[0])
    combinedList = []  # stores documents in which all words of the phrase are present
    actualList = []    # stores documents in which all words are present at adjacent positions
    count = 0
    for docs in word_docs.values():  # forming the combined list
        if count == 0:
            combinedList = docs
        else:
            combinedList = list(set(combinedList) & set(docs))
        count += 1
    for doc in combinedList:  # processing docs in the combined list for adjacent positions of words
        phrase_count = 0
        filename = docID[doc]
        words = getDocWords(filename)
        word = phrase_words[0]
        postings = inv_index[word]
        for entry in postings:  # takes a word in the phrase and checks for adjacent words in the document
            if entry[0] != doc:
                continue
            phrase_start = 0
            for position in entry[1]:
                check = "true"
                for i in range(0, phrase_length):  # iterates over phrase length to check adjacent positions
                    if position + i < len(words):
                        if stem(stem(words[position + i])) != phrase_words[i]:
                            check = "false"
                            break
                    elif position + i < len(words) and stem(stem(words[position + i])) == phrase_words[i]:
                        continue
                    else:
                        check = "false"
                if check == "true":
                    if doc not in actualList:
                        actualList.append(doc)
                    phrase_start = position
                    phrase_count += 1
        if phrase_count != 0:  # negation flag is to handle negation of phrases
            if negFlag == "false":
                if doc not in phrase_pos:  # rank documents other than the phrase for negation queries
                    phrase_pos[doc] = phrase_start  # position of the phrase for snippets
                if doc not in rank:
                    rank[doc] = phrase_count * len(phrase_words)
                else:
                    rank[doc] += (phrase_count * len(phrase_words))
        # if phrase_count==0:
        #     combinedList.remove(doc)
    # the result is the intersection of the real list and combined list
    combinedList = intersect(actualList, combinedList)
    return combinedList
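# A compact, self-contained sketch of the adjacency check that processPhrase
# performs: given a positional inverted index (term -> {doc: [positions]}), a
# phrase matches a document when its terms occur at consecutive positions. The
# toy index and the absence of stemming/stopword removal are simplifications.
def phrase_matches(pos_index, phrase_terms):
    if not phrase_terms or any(t not in pos_index for t in phrase_terms):
        return []
    matches = []
    for doc, positions in pos_index[phrase_terms[0]].items():
        for p in positions:
            if all(doc in pos_index[t] and (p + i) in pos_index[t][doc]
                   for i, t in enumerate(phrase_terms)):
                matches.append(doc)
                break
    return matches

toy_index = {
    "slipstream": {1: [4], 2: [7]},
    "effect": {1: [5], 2: [2]},
}
print(phrase_matches(toy_index, ["slipstream", "effect"]))   # [1]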
def main_dictionary():
    m = []
    current_milli_time = lambda: int(round(time.time() * 1000))
    num_doc = []
    doc_id = 1
    b_start = 0
    b_end = 0
    tmp = str()
    total_temp_files = 0
    #num_line=0
    last_tick = current_milli_time()
    print 'reading document file...'
    total_records = 0
    fr = open('DOCUMENT.txt', 'r')
    ss = fr.tell()
    num_line = len(fr.readlines())
    fr.seek(ss)
    position = ss
    line = fr.readline()
    _num = 0
    while (_num < num_line):
        if (b_start == 0):
            start = line.find("<BODY>")
            if (start > -1):
                tmp = tmp + line[start + 6:]
                pp = position
                b_start = 1
        if (b_start == 1):
            end = line.find("</BODY>")
            if (end == -1):
                tmp = tmp + line
            if (end != -1):
                tmp = tmp + line[:end]
                b_start = 0
                b_end = 1
        if (b_end == 1):
            tmp = re.findall(r"[\w']+", tmp)
            nd = 0
            for i in range(0, len(tmp)):
                tmp[i] = tmp[i].lower()
                tmp[i] = porter.stem(tmp[i])
                if (tmp[i] in list_stop):
                    continue
                m.append((tmp[i], [(doc_id, i)]))
                nd += 1
            num_doc.append((pp, nd))
            b_end = 0
            tmp = str()
            doc_id = doc_id + 1
            if sys.getsizeof(m) > 4 * 1024 * 1024:
                total_temp_files = total_temp_files + 1
                fname = 'partial-temp-' + str(total_temp_files) + '.dat'
                print 'saving temporary file ', fname, '...'
                m = sort_data(m)
                write_file(fname, m, int(0), len(m))
                total_records = total_records + len(m)
                m = []
        position = fr.tell()
        line = fr.readline()
        _num += 1
    total_temp_files = total_temp_files + 1
    fname = 'partial-temp-' + str(total_temp_files) + '.dat'
    print 'saving temporary file ', fname, '...'
    m = sort_data(m)
    write_file(fname, m, int(0), len(m))
    total_records = total_records + len(m)
    m = []
    now_tick = current_milli_time()
    print 'read ', total_records, ' records in ', (now_tick - last_tick), ' ms.', ' and saved ', total_temp_files, ' temp files...'
    last_tick = now_tick
    if total_temp_files == 1:
        os.rename('partial-temp-1.dat', 'inverted-index.dat')
    else:
        merge_files('partial-temp-1.dat', 'partial-temp-2.dat', 'merge-temp.dat')
        os.remove('partial-temp-1.dat')
        os.remove('partial-temp-2.dat')
        for i in range(3, total_temp_files + 1):
            os.rename('merge-temp.dat', 'merge-temp-2.dat')
            print 'merging partial-temp-', str(i), '.dat...'
            merge_files('merge-temp-2.dat', 'partial-temp-' + str(i) + '.dat', 'merge-temp.dat')
            os.remove('merge-temp-2.dat')
            os.remove('partial-temp-' + str(i) + '.dat')
        os.rename('merge-temp.dat', 'inverted-index.dat')
    now_tick = current_milli_time()
    print 'merged inverted-index.dat in ', (now_tick - last_tick), ' ms.'
    last_tick = now_tick
    print 'done!'
    unmerged_files()
    num_document(num_doc)
    print 'done'
# -*- coding: utf-8 -*-
"""
common.py: provides common functions for 'text' module.
"""
import porter
import re

re_sentence = re.compile(r'[^?!.;]+[?!.;]?\n?')
re_word = re.compile(r"[\w]+")

strip_html = lambda s: re.sub(r'</?[^<>]*/?>', ' ', s)
get_stem = lambda w: porter.stem(w.lower())

def remove_camel_case(text):
    text = re.sub(r'([a-z])([A-Z])', '\\1 \\2', text)
    text = re.sub(r'([0-9])([a-zA-Z])', '\\1 \\2', text)
    text = re.sub(r'([a-zA-Z])([0-9])', '\\1 \\2', text)
    return text

def is_bad(word):
    return re.match('^[0-9]+', word) is not None

def iter_words(text):
    sentences = re.finditer(re_sentence, text)
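# A quick illustration of strip_html and remove_camel_case from common.py above,
# re-declared here so the snippet runs on its own; the input string is invented.
import re

strip_html = lambda s: re.sub(r'</?[^<>]*/?>', ' ', s)

def remove_camel_case(text):
    text = re.sub(r'([a-z])([A-Z])', '\\1 \\2', text)
    text = re.sub(r'([0-9])([a-zA-Z])', '\\1 \\2', text)
    text = re.sub(r'([a-zA-Z])([0-9])', '\\1 \\2', text)
    return text

print(remove_camel_case(strip_html("<p>getStemCount2 returns the stemCount</p>")))
# ' get Stem Count 2 returns the stem Count '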
def extract_terms(text):
    term_list = [stem(term) for term in splitter.split(text.lower())
                 if ((len(term) > 1) and (term not in STOP_WORDS))]
    return list(frozenset(term_list))
def countWord(doc):
    return dict(Counter([porter.stem(i.lower()) for i in doc.split()
                         if i.lower() not in motvide]))
def counter(phrase):
    porter_stemer = textRepresenter.PorterStemmer()  # or textRepresenter.getTextRepresentation
    l = (porter.stem(w.lower()) for w in phrase.split(" ") if w != "")
    l = (w for w in l if w not in porter_stemer.stopWords)
    return dict(collections.Counter(l).items())
def tokenize(word):
    regex = re.compile('[^a-zA-Z]+')
    w = regex.sub('', word)
    w = w.lower()
    w = porter.stem(w)
    return w
def main():
    global docID
    global inv_index
    global path
    global phraseList
    global query_result
    global docList
    global negList
    global tf
    global df
    global allDocs
    global rank
    global phrase_pos
    global snippetDone
    while (True):
        loadIndex()
        path = inv_index['cranfield_corpus_path']
        loadDocID()
        query = raw_input("\nEnter your query or a special command (tf/df/freq/title/doc/author/similar) (Enter goodbye to exit)\n")
        start = time.time()
        makeAllDocList()
        # convert the entire query to lower case
        query = query.lower()
        if query == 'goodbye':
            print "Thank you for using this information retrieval system.."
            break
        # shlex is used to split phrases and words separately
        query_words = shlex.split(query)
        count = 0
        # stemming of the query
        for word in query_words:
            if " " not in word:
                query_words[count] = stem(stem(word))
            count += 1
        phraseList = []
        negPhraseList = []
        # process the special query 'similar'
        if query_words[0] == 'similar':
            flag = "false"
            for word in inv_index.keys():
                dist = levenshtein(query_words[1], word)
                if dist == 1:
                    flag = "true"
                    print word
            if flag == "false":
                print "No similar words found"
            query_result = {}  # stores the result of the query for further processing
            docList = []
            df = {}
            negList = []
            tf = {}
            allDocs = []
            rank = {}
            phrase_pos = {}
            phraseList = []
            snippetDone = []
            continue
        # process the special queries tf/df/freq/doc/title/author
        if query_words[0] == 'tf' or query_words[0] == 'df' or query_words[0] == 'freq' or query_words[0] == 'doc' or query_words[0] == 'titl' or query_words[0] == 'author':
            negFlag = "false"
            specialQuery(query_words)
            query_result = {}  # stores the result of the query for further processing
            docList = []
            df = {}
            negList = []
            tf = {}
            allDocs = []
            rank = {}
            phrase_pos = {}
            phraseList = []
            snippetDone = []
            continue
        # process each word in the query
        for word in query_words:
            # single word
            if word in inv_index and word[0] != '!':
                postings = inv_index[word]
                docListing(word, postings)
            # single word with negation
            elif word[0] == '!' and word[1:] in inv_index and " " not in word:
                postings = inv_index[word[1:]]
                negativeListing(word[1:], postings)
            # phrase with negation
            elif " " in word and word[0] == '!':
                negPhraseList = negPhraseList + processPhrase(word[1:], "true")
            # phrase without negation
            elif " " in word:
                phraseList = phraseList + processPhrase(word, "false")
        # take the set difference with the universal set for phrase negation
        if len(negPhraseList) != 0:
            negPhraseList = difference(allDocs, negPhraseList)
            for doc in negPhraseList:
                if doc not in rank:
                    rank[doc] = 1
                else:
                    rank[doc] += 1
        # set operations on results
        docSet = posUnion()                               # Set A
        docSet = list(set(docSet) | set(phraseList))      # A = A U B
        negSet = negUnion()                               # Set C
        negSet = list(set(negSet) | set(negPhraseList))   # C = C U D
        resultSet = union(docSet, negSet)                 # RESULT = A U C = (A U B) U (C U D)
        # display snippets if results have been found
        if len(resultSet) != 0:
            displaySnippets(resultSet, query_words)
        else:
            print "No match found!"
        print "Number of results:", len(resultSet)
        # print resultSet
        end = time.time()
        print "\nTook", (end - start), "seconds to give the results"
        # re-initialize global variables for the next query
        query_result = {}  # stores the result of the query for further processing
        docList = []
        df = {}
        negList = []
        tf = {}
        allDocs = []
        rank = {}
        phrase_pos = {}
        phraseList = []
        snippetDone = []
def textRank(df, docNum, abstractLen):
    # sentence tokenizer
    s = str(df[docNum])
    sentences = sent_tokenize(str(s))
    bagOfWords = {}
    i = 0
    # for each sentence, clean
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        finalTokens = {}
        for token in tokens:
            token = token.casefold()
            # bag of words
            if token not in stopWords and (token.isalpha() or token.isnumeric()):
                token = porter.stem(token)
                finalTokens[token] = 1
        if len(finalTokens) > 0:
            bagOfWords[i] = finalTokens
        i += 1
    # matrix N*N (N = no. of sentences)
    # Jaccard similarity for each sentence with each sentence (i, j)
    lenSecSent = 0
    i = 0
    j = 0
    similarityMatrix = np.zeros((len(sentences), len(sentences)))
    for sentence in bagOfWords:
        BOW1 = bagOfWords[sentence]
        for secondSentence in bagOfWords:
            if sentence != secondSentence:
                BOW2 = bagOfWords[secondSentence]
                lenSecSent = len(BOW2)
                j = 0
                for token in BOW1:
                    if token in BOW2:
                        j += 1
                similarityMatrix[sentence][secondSentence] = j / (len(BOW1) + lenSecSent - j)
                similarityMatrix[secondSentence][sentence] = j / (len(BOW1) + lenSecSent - j)
    # PageRank function
    similarityMatrix = scipy.sparse.csr_matrix(similarityMatrix)
    nx_graph = nx.from_scipy_sparse_matrix(similarityMatrix)
    scores = pagerank(nx_graph, max_iter=1000)
    # sorting for top sentences
    rankedSentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    abstract = ""
    i = 0
    for s in rankedSentences:
        if i < abstractLen:
            # abstract.append(rankedSentences[i][1])
            abstract += rankedSentences[i][1] + " "
        i += 1
    return abstract
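# A tiny standalone sketch of the Jaccard similarity that textRank computes
# between two sentences' bags of words: the size of the intersection divided by
# the size of the union. The two example token sets are made up.
def jaccard(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if (a or b) else 0.0

s1 = {"jet", "engin", "nois"}
s2 = {"jet", "nois", "reduct"}
print(jaccard(s1, s2))   # 2 shared / 4 distinct = 0.5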
def apply_stem(termslist):
    '''Stem every term of termslist in place.'''
    for i, term in enumerate(termslist):
        termslist[i] = porter.stem(term)