Esempio n. 1
0
def main():
    """Build the on-disk index for a Wikipedia XML dump.

    Command line: ``python wikiIndexer.py <dump.xml> <output_dir>``.

    Steps, in order:
      1. Parse ``sys.argv[1]`` with a SAX parser driven by ``WikiHandler``.
      2. Write the module-level ``count`` to ``numberOfFiles.txt``.
      3. Flush the index still held in memory with ``writeIntoFile`` and
         merge all partial index files with ``mergeFiles``.
      4. Write ``titleoffset.txt``: one line per line of ``title.txt``
         holding the byte offset at which that line starts.
    """
    global offset
    global countFile
    if len(sys.argv) != 3:                                            # check arguments
        print("Usage :: python wikiIndexer.py sample.xml /output")
        sys.exit(0)

    parser = xml.sax.make_parser()                                    # SAX parser
    handler = WikiHandler()
    parser.setContentHandler(handler)
    parser.parse(sys.argv[1])
    with open(sys.argv[2] + '//numberOfFiles.txt', 'wb') as f:
        f.write(str(count))

    # Flush whatever is still buffered in memory, then merge every partial file.
    offset = writeIntoFile(sys.argv[2], index, dict_Id, countFile, offset)
    countFile += 1
    mergeFiles(sys.argv[2], countFile)

    # Record where each title line starts. The offsets must be cumulative:
    # the previous code stored each line's own length, which is not a
    # position in the file. The running total after the final line (the
    # end-of-file position) is intentionally never recorded.
    titleOffset = []
    position = 0
    with open(sys.argv[2] + '//title.txt', 'rb') as f:
        for line in f:
            titleOffset.append(str(position))
            position += len(line)                                     # binary mode: length in bytes

    with open(sys.argv[2] + '//titleoffset.txt', 'wb') as f:
        f.write('\n'.join(titleOffset))
Esempio n. 2
0
def main():
    """Index a Wikipedia XML dump into OUTPUT_FOLDER.

    Command line: ``python wikiIndexer.py <dump.xml> <output_dir>``.

    The dump is parsed with a SAX parser driven by ``WikiHandler``. The
    module-level ``count`` is then written to ``numberOfFiles.txt``, the
    index still held in memory is flushed with ``writeIntoFile``, the
    partial index files are merged with ``mergeFiles``, and finally
    ``titleoffset.txt`` is written with the starting byte offset of every
    line of ``title.txt``.
    """
    global offset, countFile, OUTPUT_FOLDER

    argumentCount = len(sys.argv)
    if argumentCount != 3:
        print("Usage :: python wikiIndexer.py sample.xml ./output")
        sys.exit(0)
    OUTPUT_FOLDER = sys.argv[2]

    # Stream the dump through the SAX content handler.
    saxParser = xml.sax.make_parser()
    saxParser.setContentHandler(WikiHandler())
    saxParser.parse(sys.argv[1])

    with open(OUTPUT_FOLDER + '/numberOfFiles.txt', 'wb') as countOut:
        countOut.write(str(count))

    # Flush the last partial index, then merge all partial files.
    offset = writeIntoFile(OUTPUT_FOLDER, index, dict_Id, countFile, offset)
    countFile += 1
    mergeFiles(OUTPUT_FOLDER, countFile)

    # Starting byte offset of each title line; the end-of-file position
    # reached after the last line is not recorded.
    titleOffset = []
    position = 0
    with open(OUTPUT_FOLDER + '/title.txt', 'rb') as titleIn:
        for line in titleIn:
            titleOffset.append(str(position))
            position += len(line)

    with open(OUTPUT_FOLDER + '/titleoffset.txt', 'wb') as offsetOut:
        offsetOut.write('\n'.join(titleOffset))
Esempio n. 3
0
    def createIndex(self, title, text, infoBox, category, externalLink):
        """Add the current page's per-term scores to the in-memory index.

        Each argument maps a term to its occurrence count in one field of
        the page. The mappings are presumably defaultdict-like counters: a
        term absent from a field is looked up directly and only
        ZeroDivisionError is handled -- TODO confirm against the caller.

        For every term in the page vocabulary one posting string is
        appended to ``index[term]``. It holds the page number (``count``)
        followed by one score per field, in the order title, text, infoBox,
        category, externalLink, each followed by a single space. The score
        is the raw frequency when ``SCORE_TYPE == "freq"`` and the frequency
        divided by the field's distinct-term count, rounded to 3 places,
        when ``SCORE_TYPE == "freq_ratio"``.

        After every ``WRITE_PAGES_TO_FILE`` pages the partial index is
        written out with ``writeIntoFile`` and the in-memory structures are
        reset.
        """
        # `dict_Id` must be declared global under the exact name used below.
        # It was previously declared as `dict_id`, so the assignment in the
        # flush branch made `dict_Id` a local: reading it raised
        # UnboundLocalError and the module-level dict was never reset.
        global index, dict_Id, countFile, offset, count, OUTPUT_FOLDER

        # set().union(...) iterates the keys of each mapping and works on
        # both Python 2 and 3; `keys() + keys()` raises TypeError on
        # Python 3, which the f-string below requires.
        # NOTE(review): `category` is not part of the vocabulary, so terms
        # that occur only in categories get no posting -- confirm whether
        # that is intentional.
        vocabularyList = list(set().union(title, text, infoBox, externalLink))

        # Sizes are captured before the loop: looking up a missing term in a
        # defaultdict inserts it, which would otherwise inflate them.
        fields = [
            (title, float(len(title))),
            (text, float(len(text))),
            (infoBox, float(len(infoBox))),
            (category, float(len(category))),
            (externalLink, float(len(externalLink))),
        ]

        for key in vocabularyList:
            parts = [str(count) + ' ']
            for (contentType, contentLen) in fields:
                try:
                    if SCORE_TYPE == "freq":
                        parts.append(str(int(contentType[key])) + ' ')
                    elif SCORE_TYPE == "freq_ratio":
                        parts.append(
                            str(round(contentType[key] / contentLen, 3)) + ' ')
                    else:
                        print("ERROR: Unknown scoring type")
                except ZeroDivisionError:
                    # Empty field: emit a zero of the configured score type.
                    parts.append(str(SCORE_TYPE_TYPE(0)) + ' ')
            index[key].append(''.join(parts))

        count += 1
        if count % WRITE_PAGES_TO_FILE == 0:
            print(
                f"Pages Processed: {count} | Writing the partial index to disk ...."
            )
            offset = writeIntoFile(OUTPUT_FOLDER, index, dict_Id, countFile,
                                   offset)
            index = defaultdict(list)
            dict_Id = {}
            countFile += 1
Esempio n. 4
0
def main():
    """Run the indexer end to end on a Wikipedia XML dump.

    Command line: ``python wikiIndexer.py <dump.xml> <output_dir>``.

    The dump named by ``sys.argv[1]`` is parsed with a SAX parser whose
    content handler is ``WikiHandler``. Afterwards the module-level
    ``count`` is saved to ``numberOfFiles.txt``, the remaining in-memory
    index is written with ``writeIntoFile``, all partial files are merged
    with ``mergeFiles``, and ``titleoffset.txt`` is produced with the byte
    offset at which each line of ``title.txt`` begins.
    """
    global offset
    global countFile
    if len(sys.argv) != 3:                                            # check arguments
        print("Usage :: python wikiIndexer.py sample.xml /output")
        sys.exit(0)

    parser = xml.sax.make_parser()                                    # SAX parser
    handler = WikiHandler()
    parser.setContentHandler(handler)
    parser.parse(sys.argv[1])
    with open(sys.argv[2] + '//numberOfFiles.txt', 'wb') as f:
        f.write(str(count))

    # Write out the final partial index before merging all of them.
    offset = writeIntoFile(sys.argv[2], index, dict_Id, countFile, offset)
    countFile += 1
    mergeFiles(sys.argv[2], countFile)

    # Offsets have to accumulate. Storing each line's own length (as this
    # code used to) does not give a position in the file. Appending the
    # offset before advancing yields exactly one entry per title line and
    # leaves out the end-of-file position.
    titleOffset = []
    start = 0
    with open(sys.argv[2] + '//title.txt', 'rb') as f:
        for line in f:
            titleOffset.append(str(start))
            start += len(line)                                        # binary mode: length in bytes

    with open(sys.argv[2] + '//titleoffset.txt', 'wb') as f:
        f.write('\n'.join(titleOffset))
Esempio n. 5
0
  def createIndex(self, title, text, infoBox, category, externalLink):
    """Add the tokens generated for the current page to the index.

    Each argument maps a term to its count in one field of the page. For
    every term found in any field, a posting is appended to ``index[term]``
    of the form ``"<count> <title> <text> <infoBox> <category> <externalLink>"``.
    Each score is the term's count divided by the number of distinct terms
    in that field, rounded to 4 places, or ``0.0`` when the field is empty.

    After every 5000 pages the partial index is written out with
    ``writeIntoFile`` and the in-memory structures are reset.
    """
    global index, dict_Id, countFile, offset, count

    fields = (title, text, infoBox, category, externalLink)
    # Field sizes are taken before any lookup below can touch the mappings.
    sizes = [float(len(field)) for field in fields]

    vocabulary = set(title.keys()+text.keys()+infoBox.keys()+category.keys()+externalLink.keys())
    for term in vocabulary:
      scores = []
      for field, size in zip(fields, sizes):
        try:
          scores.append(str(round(field[term]/size, 4)))
        except ZeroDivisionError:
          scores.append('0.0')
      index[term].append(str(count) + ' ' + ' '.join(scores))

    count += 1
    if count % 5000 != 0:
      return

    # Flush threshold reached: report progress, write the partial index, reset.
    print(count)
    offset = writeIntoFile(sys.argv[2], index, dict_Id, countFile, offset)
    index = defaultdict(list)
    dict_Id = {}
    countFile += 1
Esempio n. 6
0
  def createIndex(self, title, text, infoBox, category, externalLink):
    """Record the current page's term scores in the in-memory index.

    The five arguments map each term to its count in the matching field of
    the page. One posting string is appended to ``index[term]`` per term:
    the page number ``count``, then the title, text, infoBox, category and
    externalLink scores, separated by single spaces. A score is the term's
    count divided by the field's distinct-term count, rounded to 4 places;
    an empty field contributes ``0.0``.

    When ``count`` reaches a multiple of 5000 the partial index is handed
    to ``writeIntoFile`` and ``index`` / ``dict_Id`` start over empty.
    """
    global index, dict_Id, countFile, offset, count

    allFields = [title, text, infoBox, category, externalLink]
    # Denominators are fixed up front, before the per-term lookups run.
    denominators = [float(len(mapping)) for mapping in allFields]

    allTerms = set(title.keys()+text.keys()+infoBox.keys()+category.keys()+externalLink.keys())
    for word in allTerms:
      posting = [str(count)]
      for position, mapping in enumerate(allFields):
        try:
          ratio = str(round(mapping[word]/denominators[position], 4))
        except ZeroDivisionError:
          ratio = '0.0'
        posting.append(ratio)
      index[word].append(' '.join(posting))

    count += 1
    if count % 5000 == 0:
      # Periodic flush: report progress, write the partial index, reset.
      print(count)
      offset = writeIntoFile(sys.argv[2], index, dict_Id, countFile, offset)
      index, dict_Id = defaultdict(list), {}
      countFile += 1