def startElement(self, name, attrs):
    """Record the element that just opened and update the SAX state flags.

    name  -- tag name of the element being opened
    attrs -- attributes of the element (not used here)
    """
    self.current = name

    if name == "bug_id":  # the high-level element
        self.isBugId = True
    elif name == "comment":
        self.isComment = True
        self.saveLine = ""
        if self.isProduct:
            # Start a fresh data object for this comment; the bug number
            # is approximate and might be off by a few.
            self.gdo = GnomeDataObject(GnomeDataObject.BUG)
            self.gdo.setRSN(self.bug_number)
    elif name in ('text', 'bug_when'):
        # Each text / bug_when element starts with an empty buffer.
        self.buffer = ''
    # 'product' needs no work when the element opens.
 def query(self, query_string):
     """Run *query_string* and store one MAIL data object per returned row.

     Each row is read through the keys 'first_date', 'subject' and
     'message_body'; subject and body are joined with a single space to
     form the event text.  The cursor is closed when the method finishes,
     whether or not an error occurred, so a failing query or a malformed
     row no longer leaks it.
     """
     try:
         self.cursor.execute(query_string)
         for row in self.cursor.fetchall():
             date = row['first_date']
             subj = row['subject']
             body = row['message_body']
             node = GnomeDataObject(GnomeDataObject.MAIL)
             node.setDate(date)
             node.setEvent(subj + ' ' + body)
             node.setRSN(-1)  # -1 indicates no RSN retrieved
             self.store_tokens(node)
     finally:
         # Always release the cursor, even if the loop above raised.
         self.cursor.close()
Example #3
0
 def parse_line(self):
     """Store one SVN data object for every 'logentry' element in self.f.

     For each entry the 'revision' attribute becomes the RSN, the 'date'
     child becomes the date and the 'msg' child becomes the event text.
     A missing 'msg' element and an empty one both yield "" as the event
     text.
     NOTE(review): assumes self.f is an ElementTree-style tree -- confirm.
     """
     for entry in self.f.getiterator('logentry'):
         rev_num = entry.attrib.get('revision')
         # Keep only 'YYYY-MM-DDTHH:MM:SS'; this removes the milliseconds
         # and TZ info.
         stamp = entry.find('date').text[0:19]
         date = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
         # findtext() gives None when there is no <msg> element and an
         # empty string when it has no text; normalise both to "".
         msg = entry.findtext('msg') or ""
         node = GnomeDataObject(GnomeDataObject.SVN)
         node.setDate(date)
         node.setRSN(rev_num)
         node.setEvent(msg)
         self.store_tokens(node)
 def query(self, query_string):
     """Run *query_string* and store one MAIL data object per returned row.

     Each row is read through the keys 'first_date', 'subject' and
     'message_body'; subject and body are joined with a single space to
     form the event text.  The cursor is closed when the method finishes,
     whether or not an error occurred, so a failing query or a malformed
     row no longer leaks it.
     """
     try:
         self.cursor.execute(query_string)
         for row in self.cursor.fetchall():
             date = row['first_date']
             subj = row['subject']
             body = row['message_body']
             node = GnomeDataObject(GnomeDataObject.MAIL)
             node.setDate(date)
             node.setEvent(subj + ' ' + body)
             node.setRSN(-1)  # -1 indicates no RSN retrieved
             self.store_tokens(node)
     finally:
         # Always release the cursor, even if the loop above raised.
         self.cursor.close()
 def parse_line(self):
     """Store one SVN data object for every 'logentry' element in self.f.

     For each entry the 'revision' attribute becomes the RSN, the 'date'
     child becomes the date and the 'msg' child becomes the event text.
     A missing 'msg' element and an empty one both yield "" as the event
     text.
     NOTE(review): assumes self.f is an ElementTree-style tree -- confirm.
     """
     for entry in self.f.getiterator('logentry'):
         rev_num = entry.attrib.get('revision')
         # Keep only 'YYYY-MM-DDTHH:MM:SS'; this removes the milliseconds
         # and TZ info.
         stamp = entry.find('date').text[0:19]
         date = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
         # findtext() gives None when there is no <msg> element and an
         # empty string when it has no text; normalise both to "".
         msg = entry.findtext('msg') or ""
         node = GnomeDataObject(GnomeDataObject.SVN)
         node.setDate(date)
         node.setRSN(rev_num)
         node.setEvent(msg)
         self.store_tokens(node)
class BugContentHandler(xml.sax.ContentHandler):
    """SAX content handler that processes Gnome bugzilla XML events.

    While a bug whose product is listed in ``self.products`` is being
    parsed, every ``comment`` element is turned into a GnomeDataObject
    (event text from ``text``, date from ``bug_when``) and written to the
    database when the comment closes.
    """

    # Database that store_tokens() always writes to.
    _STORE_DB = "data_objects"

    # Patterns are compiled once here rather than on every SAX callback.
    # A date is any run that starts with a digit, ':' or '-', has stuff in
    # the middle, and ends with a digit, ':' or '-'.
    _DATE_RE = re.compile(r'[\d\-:]+.+[\d\-:]')
    # Lines matching this are stack-trace noise and are not buffered.
    _STACK_TRACE_RE = re.compile(
        "No symbol table info available|#[0-9]+ 0x[0-9]+.*")
    _BUG_ID_RE = re.compile(r'\d+')
    # A Gnome product name starts with a word character, has anything
    # else following, and ends with a word character.
    _PRODUCT_RE = re.compile(r'\w+.*\w')

    # Placeholder stored when a comment's timestamp cannot be read.
    _UNKNOWN_DATE = datetime.date(1900, 1, 1)

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.data = open('out.txt', 'w')
        self.isProduct = False
        self.isBugId = False
        self.isComment = False  # SAX element flags
        self.products = [u'EKIGA', u'DESKBAR-APPLET', u'TOTEM',
                         u'EVOLUTION', u'METACITY', u'EVINCE', u'EMPATHY',
                         u'NAUTILUS']
        self.current = "none"
        self.cur_product = ''
        self.gdo = None
        self.saveLine = ""
        self.bugCount = 0
        self.buffer = ''
        self.bug_number = 0
        # Name of the database the current store connection points at,
        # or None while no connection has been opened.
        self._store_db_name = None

    def startDocument(self):
        print("Beginning parsing")

    def endDocument(self):
        print("Parsing complete")

    def startElement(self, name, attrs):
        """Record the element that just opened and update the state flags."""
        self.current = name
        if name == "bug_id":  # the high-level element
            self.isBugId = True
        elif name == "comment":
            self.isComment = True
            self.saveLine = ""
            if self.isProduct:
                self.gdo = GnomeDataObject(GnomeDataObject.BUG)  # a new GDO
                # approximate, might be off by a few
                self.gdo.setRSN(self.bug_number)
        elif name in ('text', 'bug_when'):
            self.buffer = ''  # an empty buffer for each text element

    def endElement(self, name):
        """Finish the element that just closed.

        Closing ``comment`` writes the current data object to the database
        (tracked products only); closing ``text`` / ``bug_when`` inside a
        tracked comment moves the buffered characters into the data object
        as its event text / date.
        """
        if name == "bug_id":
            self.isBugId = False
        elif name == 'bug':
            self.isProduct = False  # reset if set to true from last bug
        elif name == "comment":
            self.isComment = False
            self.saveLine = " "
            if self.isProduct:
                self.write_db()
        elif name == 'text':
            if self.isProduct and self.isComment:
                event = self.buffer.replace('\n', ' ').strip()
                self.gdo.setEvent(event)
                self.buffer = ' '
        elif name == 'bug_when':
            if self.isProduct and self.isComment:
                self.gdo.setDate(self._parse_bug_date(self.buffer))
                self.buffer = ''

    def _parse_bug_date(self, text):
        """Return the datetime found in *text*, or the 1900-01-01 placeholder.

        The placeholder is returned both when no date-like run is present
        (previously this left the date unbound and raised) and when the
        run does not match '%Y-%m-%d %H:%M:%S'.
        """
        found = self._DATE_RE.search(text)
        if found is None:
            return self._UNKNOWN_DATE
        try:
            return datetime.datetime.strptime(found.group(),
                                              '%Y-%m-%d %H:%M:%S')
        except ValueError:
            print('Error on bug date in current bug')
            return self._UNKNOWN_DATE

    def characters(self, content):
        """Consume character data for the element named by self.current."""
        if self.isProduct and self.current in ('bug_when', 'text'):
            # Ignore the chunk if it looks like stack-trace output.
            if self._STACK_TRACE_RE.search(content) is None:
                # performance: this is creating a lot of string objects
                self.buffer = self.buffer + ' ' + content

        if self.isBugId:
            bug_id = self._BUG_ID_RE.search(content)
            if bug_id is not None:
                self.bug_number = bug_id.group()
                print("Parsing bug # " + str(self.bug_number))

        if self.current == 'product':
            prodmatch = self._PRODUCT_RE.search(content)
            if prodmatch is not None:
                prodname = prodmatch.group()
                # str.upper() replaces the free function upper(), so the
                # check no longer depends on a name imported elsewhere.
                if prodname.upper() in self.products:
                    # TODO make sure this detects our product correctly
                    self.isProduct = True
                    self.cur_product = prodname
                else:
                    self.isProduct = False

    def get_data(self):
        """Return the output file object opened in __init__."""
        return self.data

    def write_db(self):
        """store the data in the mysql db"""
        self.store_tokens(self.gdo)

    def write_txt(self):
        """store the data in a text file"""
        out_string = unicode(self.gdo)
        self.data.write(
            "\n\n\n******************* new bug report *****************************\n\n\n"
        )
        # we've parsed a bug, so add the completed bug event to our output
        self.data.write(out_string.encode('iso-8859-1', 'replace'))

    def connect_store(self, db_name):
        """Open a connection and cursor on *db_name* for storing data."""
        # NOTE(review): the password is hard-coded here -- consider moving
        # it to configuration.
        self.storedb = MySQLdb.connect(passwd="hello",
                                       db=db_name,
                                       cursorclass=DictCursor)
        self.store_cursor = self.storedb.cursor()
        self._store_db_name = db_name

    def store_tokens(self, node):
        """Insert *node* into t_data, tagged with the current product.

        The connection to the storage database is opened on first use and
        then reused; previously a new connection was opened for every row
        and never committed.
        """
        if getattr(self, '_store_db_name', None) != self._STORE_DB:
            self.connect_store(self._STORE_DB)
        try:
            self.store_cursor.execute(
                "INSERT INTO t_data (rsn, msr_date, msr_type, event, product) VALUES (%s, %s, %s, %s, %s)",
                (node.getRSN(), node.getDate(), node.getType(),
                 node.getEvent(), self.cur_product))
        except ValueError:
            print('Error in query syntax')
        else:
            # Make the insert durable on transactional tables.
            self.storedb.commit()