Python loadUrlfile Examples

Programming Language: Python

Namespace/Package Name: finverted

Method/Function: loadUrlfile

Examples at hotexamples.com: 2

Python loadUrlfile - 2 examples found. These are the top-rated real-world Python examples of finverted.loadUrlfile, extracted from open source projects. You can rate examples to help us improve their quality.

Example #1

Show file

 def creatIndex(self):
     """Build the inverted index from the ``*.tmp`` files and save it.

     Every entry of ``<rootpath>file`` whose name contains ``.tmp`` is read
     line by line.  Each line is a sequence of tokens, every token being
     terminated by ``###``.  Tokens found in ``fm.termdict`` are mapped to
     their term id, and the document id of the file (looked up in
     ``self.fhashkeyToid`` by the part of the file name before ``.tmp``)
     is added to ``self.index[term_id]``.

     The resulting index is written to ``<rootpath>file/termid``, one line
     per term: ``termid###docid###docid###...###``.

     Side effects: changes the process working directory to
     ``<rootpath>file`` and replaces ``self.fhashkeyToid``.
     """
     self.fhashkeyToid = finverted.loadUrlfile(fstd.rootpath+'file/url')
     fm = fmmseg.fmmseg()
     fm.loadTermfile()

     filenames = os.listdir(fstd.rootpath+'file')
     # The names returned by listdir are relative, so they are opened
     # relative to the data directory.
     os.chdir(fstd.rootpath+'file')
     for filename in filenames:
         fpos = filename.find('.tmp')
         if fpos == -1:
             continue
         print(filename)
         hashkey = filename[:fpos]
         # Check the mapping before opening the file so that no handle is
         # left open when the document is unknown.
         if hashkey not in self.fhashkeyToid:
             continue
         docid = self.fhashkeyToid[hashkey]
         with open(filename, 'r') as fp:
             for line in fp:
                 # rstrip keeps the last character of a final line that has
                 # no trailing newline.  Text after the last '###' is not a
                 # terminated token and is dropped.
                 tokens = line.rstrip('\n').split('###')[:-1]
                 for token in tokens:
                     if token == '':
                         # An empty token ends the processing of this line.
                         break
                     if token not in fm.termdict:
                         continue
                     term_id = fm.termdict[token]
                     self.index.setdefault(term_id, set()).add(docid)

     with open(fstd.rootpath+'file/termid', 'w') as fout:
         for term_id, docids in self.index.items():
             fields = [str(term_id)]
             fields.extend(str(doc) for doc in docids)
             # Every field, including the last one, is followed by '###'.
             fout.write('###'.join(fields) + '###\n')

Example #2

Show file

    def MergeIndex(self):
        """Merge newly added documents into the stored inverted index.

        Steps:

        1. Reload ``self.fhashkeyToid`` from ``<rootpath>file/url``.
        2. Load the existing postings from ``<rootpath>file/termid`` (lines
           of the form ``termid###docid###docid###...###``) into
           ``self.index``.
        3. For every name listed in ``<rootpath>file/newurl`` call
           ``fm.segmentAFile(name + '.text')``, then ``fm.mergeTermJieba()``.
        4. Read each ``name + '.tmp'`` file (tokens terminated by ``###``)
           and add the document id to the posting set of every token found
           in ``fm.termdict``.
        5. Rewrite ``<rootpath>file/termid`` with the merged index.

        Side effects: changes the process working directory to
        ``<rootpath>file`` and replaces ``self.fhashkeyToid``.

        Raises:
            ValueError: if there are new URLs but the stored index holds no
                document id to assign to them (see the review note below).
        """
        self.fhashkeyToid = finverted.loadUrlfile(fstd.rootpath+'file/url')

        docid = None
        with open(fstd.rootpath+'file/termid', 'r') as fp:
            for line in fp:
                line = line.rstrip('\n')
                if not line:
                    # A blank line carries no term id; skip it.
                    continue
                fields = line.split('###')
                termid = int(fields[0])
                self.index[termid] = set()
                # The text after the last '###' is not a terminated doc id.
                for field in fields[1:-1]:
                    docid = int(field)
                    self.index[termid].add(docid)

        # Segment the new files.
        os.chdir(fstd.rootpath+'file')
        fm = fmmseg.fmmseg()
        fm.loadTermfile()
        with open(fstd.rootpath+'file/newurl', 'r') as furl:
            # rstrip keeps the last character of a final line that has no
            # trailing newline; blank lines are ignored.
            new_urls = [name for name in
                        (line.rstrip('\n') for line in furl) if name]

        if new_urls and docid is None:
            raise ValueError('no document id available in file/termid '
                             'to assign to the new URLs')
        for url in new_urls:
            fm.segmentAFile(url+'.text')
            # NOTE(review): every new URL receives the same id, namely the
            # last document id parsed from file/termid.  This looks
            # unintended (new documents would normally get fresh ids) but
            # is kept as-is because the id allocation scheme is not visible
            # here -- confirm against finverted.loadUrlfile.
            self.fhashkeyToid[url] = docid

        fm.mergeTermJieba()

        # Index the new files.
        for url in new_urls:
            filename = url+'.tmp'
            print(filename)
            hashkey = filename[:filename.find('.tmp')]
            # Check the mapping before opening the file so that no handle
            # is left open when the document is unknown.
            if hashkey not in self.fhashkeyToid:
                print("-----> "+hashkey + "not in")
                continue
            docid = self.fhashkeyToid[hashkey]
            with open(filename, 'r') as fp:
                for line in fp:
                    # Text after the last '###' is not a terminated token.
                    tokens = line.rstrip('\n').split('###')[:-1]
                    for token in tokens:
                        if token == '':
                            # An empty token ends this line.
                            break
                        if token not in fm.termdict:
                            continue
                        term_id = fm.termdict[token]
                        self.index.setdefault(term_id, set()).add(docid)

        print('索引建好了')
        with open(fstd.rootpath+'file/termid', 'w') as fout:
            for term_id, docids in self.index.items():
                fields = [str(term_id)]
                fields.extend(str(doc) for doc in docids)
                # Every field, including the last one, is followed by '###'.
                fout.write('###'.join(fields) + '###\n')