def main():
    global offset
    global countFile
    if len(sys.argv) != 3:  # check arguments
        print "Usage :: python wikiIndexer.py sample.xml /output"
        sys.exit(0)
    parser = xml.sax.make_parser()  # SAX Parser
    handler = WikiHandler()
    parser.setContentHandler(handler)
    parser.parse(sys.argv[1])
    with open(sys.argv[2] + '/numberOfFiles.txt', 'wb') as f:
        f.write(str(count))
    offset = writeIntoFile(sys.argv[2], index, dict_Id, countFile, offset)
    countFile += 1
    mergeFiles(sys.argv[2], countFile)
    titleOffset = []
    with open(sys.argv[2] + '/title.txt', 'rb') as f:
        titleOffset.append('0')
        for line in f:
            titleOffset.append(str(len(line)))
    titleOffset = titleOffset[:-1]
    with open(sys.argv[2] + '/titleoffset.txt', 'wb') as f:
        f.write('\n'.join(titleOffset))
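# --- Sketch (the real mergeFiles is not shown in this listing): one plausible
# heap-based k-way merge of the partial files index0.txt .. index<countFile-1>.txt.
# Assumptions: each partial file stores one 'term-postings' line per term and is
# already sorted by term (the same layout assumed in the writeIntoFile sketch
# further down), and the merged output goes to mergedIndex.txt; the project's
# actual merge and on-disk format may differ.
import heapq

def mergeFiles(folder, countFile):
    files = [open(folder + '/index' + str(i) + '.txt') for i in range(countFile)]
    # Each stream yields [term, postings] pairs in term order.
    streams = [(line.split('-', 1) for line in f) for f in files]
    with open(folder + '/mergedIndex.txt', 'w') as out:
        currentTerm, postings = None, []
        for term, posting in heapq.merge(*streams):
            if term != currentTerm:
                # A new term starts: write out the postings collected so far.
                if currentTerm is not None:
                    out.write(currentTerm + '-' + ';'.join(postings) + '\n')
                currentTerm, postings = term, []
            postings.append(posting.strip())
        if currentTerm is not None:
            out.write(currentTerm + '-' + ';'.join(postings) + '\n')
    for f in files:
        f.close()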
def main():
    global offset, countFile, OUTPUT_FOLDER
    if len(sys.argv) != 3:
        print "Usage :: python wikiIndexer.py sample.xml ./output"
        sys.exit(0)
    OUTPUT_FOLDER = sys.argv[2]

    # SAX Parser
    parser = xml.sax.make_parser()
    handler = WikiHandler()
    parser.setContentHandler(handler)
    parser.parse(sys.argv[1])

    with open(OUTPUT_FOLDER + '/numberOfFiles.txt', 'wb') as f:
        f.write(str(count))

    # Flush whatever is still buffered in memory, then merge the partial index files.
    offset = writeIntoFile(OUTPUT_FOLDER, index, dict_Id, countFile, offset)
    countFile += 1
    mergeFiles(OUTPUT_FOLDER, countFile)

    # Record the cumulative byte offset of every line of title.txt so that a
    # title can later be fetched with a single seek.
    titleOffset = []
    with open(OUTPUT_FOLDER + '/title.txt', 'rb') as f:
        titleOffset.append('0')
        for line in f:
            titleOffset.append(str(int(titleOffset[-1]) + len(line)))
    titleOffset = titleOffset[:-1]
    with open(OUTPUT_FOLDER + '/titleoffset.txt', 'wb') as f:
        f.write('\n'.join(titleOffset))
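# --- Sketch (not part of the original indexer): how titleoffset.txt is meant
# to be consumed at query time. Line N of titleoffset.txt holds the byte
# position of document N's line in title.txt, so one seek fetches the title.
# Assumes document ids start at 0 and follow the line order of title.txt;
# lookupTitle is a hypothetical helper.
def lookupTitle(outputFolder, docId):
    with open(outputFolder + '/titleoffset.txt', 'r') as f:
        position = int(f.read().split('\n')[docId])  # byte offset of docId's line
    with open(outputFolder + '/title.txt', 'r') as f:
        f.seek(position)  # jump straight to the right line
        return f.readline().strip()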
def createIndex(self, title, text, infoBox, category, externalLink):
    global index, dict_Id, countFile, offset, count, OUTPUT_FOLDER
    vocabularyList = list(
        set(title) | set(text) | set(infoBox) | set(category) | set(externalLink))
    t, b, i, c, e = float(len(title)), float(len(text)), float(
        len(infoBox)), float(len(category)), float(len(externalLink))
    for key in vocabularyList:
        string = str(count) + ' '
        for (contentType, contentLen) in [(title, t), (text, b), (infoBox, i),
                                          (category, c), (externalLink, e)]:
            try:
                if SCORE_TYPE == "freq":
                    string += str(int(contentType[key])) + ' '
                elif SCORE_TYPE == "freq_ratio":
                    string += str(round(contentType[key] / contentLen, 3)) + ' '
                else:
                    print("ERROR: Unknown scoring type")
            except ZeroDivisionError:  # the field is empty, so the ratio is 0
                string += '0.0 '
        index[key].append(string)
    count += 1
    if count % WRITE_PAGES_TO_FILE == 0:
        print(
            f"Pages Processed: {count} | Writing the partial index to disk ...."
        )
        offset = writeIntoFile(OUTPUT_FOLDER, index, dict_Id, countFile, offset)
        index = defaultdict(list)
        dict_Id = {}
        countFile += 1
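# --- Sketch (the real writeIntoFile is not shown in this listing): one
# plausible implementation consistent with the call above. Assumptions: each
# partial index is dumped to index<countFile>.txt as sorted 'term-posting;posting;...'
# lines, dict_Id maps document id -> page title, and offset is the running byte
# offset of title.txt; the project's actual on-disk format may differ.
def writeIntoFile(folder, index, dict_Id, countFile, offset):
    lines = []
    for term in sorted(index):
        lines.append(term + '-' + ';'.join(index[term]))
    with open(folder + '/index' + str(countFile) + '.txt', 'w') as f:
        f.write('\n'.join(lines))
    # Append id -> title pairs so title.txt stays aligned with document ids.
    with open(folder + '/title.txt', 'a') as f:
        for docId in sorted(dict_Id):
            line = str(docId) + ' ' + dict_Id[docId] + '\n'
            f.write(line)
            offset += len(line)
    return offset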
def createIndex(self, title, text, infoBox, category, externalLink):
    # add tokens generated to index
    global index
    global dict_Id
    global countFile
    global offset
    global count
    vocabularyList = list(set(title.keys() + text.keys() + infoBox.keys() +
                              category.keys() + externalLink.keys()))
    t = float(len(title))
    b = float(len(text))
    i = float(len(infoBox))
    c = float(len(category))
    e = float(len(externalLink))
    for key in vocabularyList:
        string = str(count) + ' '
        try:
            string += str(round(title[key] / t, 4)) + ' '
        except ZeroDivisionError:
            string += '0.0 '
        try:
            string += str(round(text[key] / b, 4)) + ' '
        except ZeroDivisionError:
            string += '0.0 '
        try:
            string += str(round(infoBox[key] / i, 4)) + ' '
        except ZeroDivisionError:
            string += '0.0 '
        try:
            string += str(round(category[key] / c, 4)) + ' '
        except ZeroDivisionError:
            string += '0.0 '
        try:
            string += str(round(externalLink[key] / e, 4))
        except ZeroDivisionError:
            string += '0.0'
        index[key].append(string)
    count += 1
    if count % 5000 == 0:
        print count
        offset = writeIntoFile(sys.argv[2], index, dict_Id, countFile, offset)
        index = defaultdict(list)
        dict_Id = {}
        countFile += 1
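# --- Sketch (assumed caller, not part of the original file): the five
# per-field frequency maps passed to createIndex can be built with
# collections.Counter. If the field maps are Counters (or defaultdicts),
# missing keys yield 0, which is consistent with only ZeroDivisionError being
# caught above. The real project presumably also removes stop words and stems;
# termCounts is a hypothetical helper name.
import re
from collections import Counter

def termCounts(raw):
    # Lowercase and split on anything that is not a letter or digit.
    tokens = re.split(r'[^a-z0-9]+', raw.lower())
    return Counter(t for t in tokens if t)

# e.g. termCounts('Alan Turing') -> Counter({'alan': 1, 'turing': 1});
# the handler would then call
# self.createIndex(termCounts(titleText), termCounts(bodyText), ...)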