Esempio n. 1
0
 def parse(self):
     """Parse the file at ``self.queryDocPath`` into a list of queries.

     The file is read line by line; each stripped line is dispatched on
     its prefix:

     * ``query``       -- starts a new ``Query(self.idf)``; the text after
       the first colon is passed to ``set_query``.
     * ``url``         -- starts a new ``Document`` attached to the current
       query; the whole line is passed to ``set_url``.
     * ``title``       -- text after the first colon goes to ``set_title``.
     * ``body_hits``   -- first token after the colon is the term, the
       remaining tokens are passed on as its hits.
     * ``body_length`` -- text after the colon goes to ``set_body_length``.
     * ``anchor_text`` -- the anchor text; the *following* line carries
       the count after its colon.

     Lines with any other prefix are ignored and blank lines are skipped.
     ``calculate_vectors()`` is called on each document once all of its
     lines have been consumed.

     Side effects: resets and recomputes ``self.docs``, ``self.titles``,
     ``self.avg_anchor_length``, ``self.avg_title_length``,
     ``self.avg_body_length`` and ``self.avg_anchors_per_doc``.
     ``self.anchors`` is reset to 0 but not updated here.  Averages are
     0.0 when the file contains no documents (or no titles).

     Returns:
         The list of ``Query`` objects in file order.

     Raises:
         IndexError: if a line that needs a ``:``-separated value has no
             colon, or a ``body_hits`` line has no term.
         ValueError: if an anchor count is not an integer.
     """
     queries = []
     c_query = None
     c_doc = None

     # Reset every accumulator so repeated calls do not double-count.
     self.avg_anchor_length = 0
     self.avg_title_length = 0
     self.avg_body_length = 0
     self.avg_anchors_per_doc = 0
     self.docs = 0
     self.titles = 0
     self.anchors = 0

     # The context manager guarantees the file is closed even on a parse error.
     with open(self.queryDocPath, 'r') as f:
         for raw_line in f:
             line = raw_line.strip()
             if not line:
                 # A blank line is not end-of-file; keep reading.
                 continue

             if line.startswith('query'):
                 c_query = Query(self.idf)
                 queries.append(c_query)
                 # maxsplit=1 keeps any further colons inside the value.
                 c_query.set_query(line.split(':', 1)[1])
             elif line.startswith('url'):
                 self.docs += 1
                 # The previous document is complete once a new one starts.
                 if c_doc is not None:
                     c_doc.calculate_vectors()
                 c_doc = Document(c_query)
                 c_query.add_doc(c_doc)
                 # URLs contain colons, so the full line is handed over.
                 c_doc.set_url(line)
             elif line.startswith('title'):
                 c_doc.set_title(line.split(':', 1)[1])
                 self.avg_title_length += len(c_doc.title_terms)
                 self.titles += 1
             elif line.startswith('body_hits'):
                 tokens = line.split(':', 1)[1].strip().split()
                 c_doc.add_body_hits(tokens[0], tokens[1:])
             elif line.startswith('body_length'):
                 c_doc.set_body_length(line.split(':', 1)[1])
                 self.avg_body_length += c_doc.body_length
             elif line.startswith('anchor_text'):
                 text = line.split(':', 1)[1].strip()
                 # The count lives on the next line; consume it here so the
                 # main loop never sees it.
                 count_line = next(f, '')
                 count = count_line.split(':', 1)[1].strip()
                 c_doc.add_anchor_text(text, count)
                 # NOTE(review): both accumulators receive the same
                 # increment, so they end up equal -- confirm intended.
                 self.avg_anchor_length += int(count)
                 self.avg_anchors_per_doc += int(count)

     # The last document has no following 'url' line to trigger this.
     if c_doc is not None:
         c_doc.calculate_vectors()

     # We calculate avg anchor length as follows:
     # Consider all words in anchor text for a doc as one BIG document, so
     # count up all occurrences of anchor words and divide them by the
     # number of docs.
     if self.docs:
         self.avg_anchor_length = self.avg_anchor_length * 1.0 / self.docs
         self.avg_body_length = self.avg_body_length * 1.0 / self.docs
         self.avg_anchors_per_doc = self.avg_anchors_per_doc * 1.0 / self.docs
     else:
         # No documents: avoid ZeroDivisionError and report zero averages.
         self.avg_anchor_length = 0.0
         self.avg_body_length = 0.0
         self.avg_anchors_per_doc = 0.0

     if self.titles:
         self.avg_title_length = self.avg_title_length * 1.0 / self.titles
     else:
         self.avg_title_length = 0.0

     return queries