def startElement(self, name, attrs):
    """Record the tag that just opened and update the parsing state for it.

    name  -- tag name of the element being opened
    attrs -- element attributes (not used here)
    """
    self.current = name

    if name == "bug_id":
        # the high-level element
        self.isBugId = True
    elif name == "comment":
        self.isComment = True
        self.saveLine = ""
        if self.isProduct:
            # every comment of a tracked product gets a fresh data object
            self.gdo = GnomeDataObject(GnomeDataObject.BUG)
            # approximate, might be off by a few
            self.gdo.setRSN(self.bug_number)
    elif name in ('text', 'bug_when'):
        # both tags collect character data, so start from an empty buffer
        self.buffer = ''
def query(self, query_string):
    """Run *query_string* and store every returned row as a MAIL data object.

    Each row is read through the keys ``first_date``, ``subject`` and
    ``message_body``; subject and body are joined with a single space and
    used as the event text. Every resulting object is handed to
    ``self.store_tokens``.

    ``self.cursor`` is closed before returning, including when the query,
    the row handling or the storing raises, so a failed run does not leave
    the cursor open.
    """
    try:
        self.cursor.execute(query_string)
        for row in self.cursor.fetchall():
            date = row['first_date']
            text = row['subject'] + ' ' + row['message_body']

            node = GnomeDataObject(GnomeDataObject.MAIL)
            node.setDate(date)
            node.setEvent(text)
            node.setRSN(-1)  # -1 indicates no RSN retrieved
            self.store_tokens(node)
    finally:
        self.cursor.close()
def parse_line(self):
    """Turn every ``logentry`` element of ``self.f`` into a stored SVN object.

    For each entry the ``revision`` attribute becomes the RSN, the ``date``
    child is parsed into a datetime and the ``msg`` child becomes the event
    text. A missing or empty ``msg`` is stored as an empty string. Each
    object is passed to ``self.store_tokens``.
    """
    # getiterator() is the older spelling of iter(); prefer iter() when the
    # tree offers it and fall back for old ElementTree versions.
    walk = getattr(self.f, 'iter', None) or self.f.getiterator

    for entry in walk('logentry'):
        revNum = entry.attrib.get('revision')

        # keep only 'YYYY-MM-DDTHH:MM:SS'; this removes the fractional
        # seconds and the TZ info
        strdate = entry.find('date').text[0:19]
        date = datetime.strptime(strdate, '%Y-%m-%dT%H:%M:%S')

        # find() returns None when there is no <msg>, and an empty <msg/>
        # has text None; both cases mean "no message".
        msg_node = entry.find('msg')
        msg = msg_node.text if msg_node is not None else None
        if msg is None:
            msg = ""

        n = GnomeDataObject(GnomeDataObject.SVN)
        n.setDate(date)
        n.setRSN(revNum)
        n.setEvent(msg)
        self.store_tokens(n)
class BugContentHandler(xml.sax.ContentHandler):
    """SAX content handler that processes Gnome bugzilla XML events.

    While a bug's ``product`` is one of ``self.products``, every ``comment``
    element is turned into a ``GnomeDataObject`` (date taken from
    ``bug_when``, event text from ``text``) and written to the database
    when the comment closes.
    """

    # Patterns used on every SAX callback; compiled once for the class.
    # Lines matching this are stack-trace noise and are not buffered.
    _STACK_TRACE_RE = re.compile(
        r"No symbol table info available|#[0-9]+ 0x[0-9]+.*")
    _BUG_ID_RE = re.compile(r'\d+')
    # a Gnome product name starts with a word character, has anything else
    # following and ends with an alphanumeric
    _PRODUCT_RE = re.compile(r'\w+.*\w')
    # a date is any word that starts with a digit, : or -, has stuff in the
    # middle, and ends with a digit, : or -
    _DATE_RE = re.compile(r'[\d\-:]+.+[\d\-:]')

    _INSERT_SQL = ("INSERT INTO t_data (rsn, msr_date, msr_type, event, product) "
                   "VALUES (%s, %s, %s, %s, %s)")

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.data = open('out.txt', 'w')
        # SAX element flags
        self.isProduct = False
        self.isBugId = False
        self.isComment = False
        self.products = [u'EKIGA', u'DESKBAR-APPLET', u'TOTEM',
                         u'EVOLUTION', u'METACITY', u'EVINCE', u'EMPATHY',
                         u'NAUTILUS']
        self.current = "none"
        self.cur_product = ''
        self.gdo = None
        self.saveLine = ""
        self.bugCount = 0
        self.buffer = ''
        self.bug_number = 0
        # database handles, created on first use by store_tokens()
        self.storedb = None
        self.store_cursor = None

    def startDocument(self):
        print("Beginning parsing")

    def endDocument(self):
        print("Parsing complete")

    def startElement(self, name, attrs):
        """Remember the open tag and set the matching state flag/buffer."""
        self.current = name
        if name == "bug_id":  # the high-level element
            self.isBugId = True
        if name == "comment":
            self.isComment = True
            self.saveLine = ""
            if self.isProduct:
                self.gdo = GnomeDataObject(GnomeDataObject.BUG)  # a new GDO
                # approximate, might be off by a few
                self.gdo.setRSN(self.bug_number)
        if name in ('text', 'bug_when'):
            self.buffer = ''  # an empty buffer for each text element

    def endElement(self, name):
        """Clear the state for the closing tag and flush collected data."""
        if name == "bug_id":
            self.isBugId = False
        if name == 'bug':
            self.isProduct = False  # reset if set to true from last bug
        if name == "comment":
            self.isComment = False
            self.saveLine = " "
            if self.isProduct:
                self.write_db()
        if name == 'text' and self.isProduct and self.isComment:
            event = self.buffer.replace('\n', ' ').strip()
            self.gdo.setEvent(event)
            self.buffer = ' '
        if name == 'bug_when' and self.isProduct and self.isComment:
            mat_obj = self._DATE_RE.search(self.buffer)
            if mat_obj is not None:
                try:
                    date = datetime.datetime.strptime(mat_obj.group(),
                                                      '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    print('Error on bug date in current bug')
                    # placeholder used when the timestamp cannot be parsed
                    date = datetime.date(1900, 1, 1)
                self.gdo.setDate(date)
            self.buffer = ''

    def characters(self, content):
        """Collect character data for the element named in ``self.current``.

        NOTE: a SAX parser may deliver one element's text in several
        chunks; the bug id and product checks below look at each chunk on
        its own.
        """
        if self.isProduct and self.current in ('bug_when', 'text'):
            # ignore the chunk if it looks like a stack trace
            if self._STACK_TRACE_RE.search(content) is None:
                # performance: this is creating a lot of string objects
                self.buffer = self.buffer + ' ' + content

        if self.isBugId:
            bug_id = self._BUG_ID_RE.search(content)
            if bug_id is not None:
                self.bug_number = bug_id.group()
                print("Parsing bug # " + str(self.bug_number))

        if self.current == 'product':
            prodmatch = self._PRODUCT_RE.search(content)
            if prodmatch is not None:
                prodname = prodmatch.group()
                # TODO make sure this detects our product correctly
                if prodname.upper() in self.products:
                    self.isProduct = True
                    self.cur_product = prodname
                else:
                    self.isProduct = False

    def get_data(self):
        """Return the output file object opened in ``__init__``."""
        return self.data

    def write_db(self):
        """store the data in the mysql db"""
        self.store_tokens(self.gdo)

    def write_txt(self):
        """store the data in a text file"""
        out_string = unicode(self.gdo)
        self.data.write(
            "\n\n\n******************* new bug report *****************************\n\n\n"
        )
        self.data.write(out_string.encode('iso-8859-1', 'replace'))

    def connect_store(self, db_name):
        """Open a connection to *db_name* and keep it plus a cursor on self."""
        # NOTE(review): the password is hard-coded; consider moving it to
        # configuration.
        self.storedb = MySQLdb.connect(passwd="hello", db=db_name,
                                       cursorclass=DictCursor)
        self.store_cursor = self.storedb.cursor()

    def store_tokens(self, node):
        """Insert *node* together with the current product into ``t_data``.

        The connection to the ``data_objects`` database is opened on the
        first call and reused afterwards, instead of reconnecting for every
        row. A successful insert is committed right away so it is not lost
        on tables that use transactions.
        """
        if getattr(self, 'storedb', None) is None:
            self.connect_store("data_objects")
        try:
            self.store_cursor.execute(
                self._INSERT_SQL,
                (node.getRSN(), node.getDate(), node.getType(),
                 node.getEvent(), self.cur_product))
        except ValueError:
            print('Error in query syntax')
        else:
            self.storedb.commit()