Ejemplo n.º 1
0
    def do_remove_bad_reports(cls, config_file):
        """Delete the reports flagged as droppable from the bug collections.

        Loads the configuration, walks every record returned by
        ``IRText.get_iterator(None)``, collects the ids of the records whose
        description satisfies ``IRText.is_drop_report`` and removes those ids
        from the text, tfidf and duplicate collections.

        Args:
            config_file: configuration file handed to ``IRConfig.load``.
        """
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        import ir_mongodb_helper
        from ir_text import IRText

        config = IRConfig.get_instance()
        config.load(config_file)
        bug_id_name = config.get('bug_id_name')
        bug_description_name = config.get('bug_description_name')
        text_cursor = IRText.get_iterator(None)
        remove_ids = []

        def iter_text(item):
            if IRText.is_drop_report(item[bug_description_name]):
                remove_ids.append(item[bug_id_name])
                IRLog.get_instance().println(
                    'Remove report#=%d' % item[bug_id_name], 3)

        IRProgressBar.execute_iteration_for_cursor(text_cursor, iter_text)

        # remove from all database
        def remove_from_collection(collection_cfg_name):
            collection = ir_mongodb_helper.IRCollection(
                'bug_db_name', collection_cfg_name, 'a')
            # Filter on the configured id field -- the same key the ids were
            # read from above -- instead of a hard-coded 'bug_id', so the
            # removal still matches when the field name is configured
            # differently.
            collection.remove({bug_id_name: {'$in': remove_ids}})
            collection.close()

        for collection_cfg_name in ('bug_text_collection_name',
                                    'bug_tfidf_collection_name',
                                    'bug_duplicate_collection_name'):
            remove_from_collection(collection_cfg_name)
Ejemplo n.º 2
0
    def cache_all_data(cls):
        """Load all document count into memory.

        Reads every record of the document-count collection and stores, per
        term, a ``(summary, description)`` tuple in the class cache. A value
        missing from a record is stored as 0.
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        # config
        config = IRConfig.get_instance()
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        term_name = config.get('bug_term_name')

        cls.__is_cache = True
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'r')

        def iter_document_count(term):
            summary = term.get(summary_name, 0)
            description = term.get(description_name, 0)
            cls.__cache_document_count[term[term_name]] = \
                    (summary, description)

        IRProgressBar.execute_iteration_for_cursor(
            documentcount_collection.find({}), iter_document_count,
            "Caching Document Count")
        # Iteration is finished; release the collection handle instead of
        # leaving it open.
        documentcount_collection.close()
Ejemplo n.º 3
0
 def batch_generate_term_count(cls):
     """Compute term counts for every stored text and persist them.

     Each record produced by ``IRText.get_iterator({})`` is passed through
     ``cls.calculate_term_count``; the two results are inserted, together
     with the bug id, into the term-count collection, which is indexed on
     the bug id afterwards.
     """
     from ir_log import IRProgressBar
     from ir_text import IRText
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection

     # field names (with fallbacks) read from the configuration
     settings = IRConfig.get_instance()
     id_key = settings.get('bug_id_name', 'bug_id')
     summary_key = settings.get('bug_summary_name', 'summ')
     description_key = settings.get('bug_description_name', 'desc')

     target = IRCollection(
         'bug_db_name', 'bug_termcount_collection_name', 'w')

     def store_term_count(record):
         counts = cls.calculate_term_count(
             record[summary_key], record[description_key])
         summary_counts, description_counts = counts
         document = {
             id_key: record[id_key],
             summary_key: summary_counts,
             description_key: description_counts,
         }
         target.insert(document)

     IRProgressBar.execute_iteration_for_cursor(
         IRText.get_iterator({}), store_term_count,
         "From Text to Term Count")
     index_spec = [(id_key, IRCollection.ASCENDING)]
     target.create_index(index_spec)
     target.close()
Ejemplo n.º 4
0
    def test_execute_iteration_for_dict(self):
        """Run ``IRProgressBar.execute_iteration_for_dict`` over a small dict.

        The callback prints each element it is given followed by the value
        stored under that element in the dictionary.
        """
        from ir_log import IRProgressBar

        dictionary = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5}

        def func(ele):
            # A single pre-formatted argument prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print('%s %s' % (ele, dictionary[ele]))

        IRProgressBar.execute_iteration_for_dict(dictionary, func, 'Dict')
Ejemplo n.º 5
0
    def batch_generate_tfidf(cls):
        """Compute TFIDF for every term-count record and store the results.

        The document-count cache is filled first. Each record from the
        term-count iterator then has ``cls.calculate_tfidf`` applied to its
        summary and description values, and the results are inserted into
        the tfidf collection, which is indexed on the bug id at the end.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_document_count import IRDocumentCount
        from ir_term_count import IRTermCount

        # configuration values
        settings = IRConfig.get_instance()
        id_key = settings.get('bug_id_name')
        summary_key = settings.get('bug_summary_name')
        description_key = settings.get('bug_description_name')
        algorithm = settings.get('tfidf_algorithm')

        # collections
        IRDocumentCount.cache_all_data()
        output = IRCollection(
            'bug_db_name', 'bug_tfidf_collection_name', 'w')

        cursor = IRTermCount.get_iterator()
        total_bugs = cursor.count()

        def weigh(record, field_key):
            # TFIDF of one field of one record
            return cls.calculate_tfidf(record[field_key], field_key,
                                       total_bugs, None, algorithm)

        def store_tfidf(record):
            summary_weights = weigh(record, summary_key)
            description_weights = weigh(record, description_key)
            document = {
                id_key: record[id_key],
                summary_key: summary_weights,
                description_key: description_weights,
            }
            output.insert(document)

        IRProgressBar.execute_iteration_for_cursor(
            cursor, store_tfidf, "Calculating TFIDF")
        output.create_index([(id_key, IRCollection.ASCENDING)])
        output.close()
Ejemplo n.º 6
0
    def cache_all_data(cls):
        """Read the whole text collection into the class-level caches.

        After the call one cache maps each bug id to its
        ``(summary, description)`` pair and another maps each bug id to the
        value stored under the stacktrace key.
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        # field names from the configuration
        settings = IRConfig.get_instance()
        id_key = settings.get('bug_id_name')
        summary_key = settings.get('bug_summary_name')
        description_key = settings.get('bug_description_name')
        stacktrace_key = settings.get('bug_stacktrace_name')

        cls.set_is_cache(True)
        source = IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'r')
        # start from empty caches
        cls.__cache_summary_description = {}
        cls.__cache_stacktrace = {}

        def remember(record):
            text_pair = (record[summary_key], record[description_key])
            cls.__cache_summary_description[record[id_key]] = text_pair
            cls.__cache_stacktrace[record[id_key]] = record[stacktrace_key]

        IRProgressBar.execute_iteration_for_cursor(
            source.find(), remember, 'Caching Text Data')
        source.close()
Ejemplo n.º 7
0
    def test_execute_iteration_for_dict(self):
        """Exercise ``IRProgressBar.execute_iteration_for_dict`` on a dict.

        The callback prints every element it receives next to the value the
        dictionary holds for it.
        """
        from ir_log import IRProgressBar

        dictionary = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5}

        def func(ele):
            # Formatting into one string keeps the output identical under
            # the Python 2 print statement and the Python 3 print function.
            print('%s %s' % (ele, dictionary[ele]))

        IRProgressBar.execute_iteration_for_dict(dictionary, func, 'Dict')
Ejemplo n.º 8
0
    def __store_to_mongodb(cls, bug2group, group2bug):
        """Store duplicate group information into Mongodb.

        Writes one record per bug (bug id and its group id) into the
        duplicate collection and one record per group (group id and the
        number of bugs in it) into the duplicate group count collection.
        Indexes are created after the inserts.

        Args:
            bug2group: dict, {bug_id -> group_id}
            group2bug: dict, {group_id -> [bug_id]}
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        bug_group_name = config.get('bug_group_name')
        duplicate_collection = IRCollection('bug_db_name',
                                            'bug_duplicate_collection_name',
                                            'w')

        def iter_bug_group(bug):
            duplicate_collection.insert({
                bug_id_name: bug,
                bug_group_name: bug2group[bug]
            })

        IRProgressBar.execute_iteration_for_dict(bug2group, iter_bug_group,
                                                 "Store to db")
        duplicate_collection.create_index([(bug_id_name,
                                            IRCollection.ASCENDING)])
        duplicate_collection.create_index([(bug_group_name,
                                            IRCollection.ASCENDING)])
        duplicate_collection.close()

        # duplicate group size collection
        # (the group key is the same 'bug_group_name' entry read above; the
        # former line-counting loop was dropped because its result was
        # never used)
        group_size_name = config.get('bug_group_size')
        duplicate_group_count_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_group_count_collection_name', 'w')

        def iter_group_bug(group):
            duplicate_group_count_collection.insert({
                bug_group_name: group,
                group_size_name: len(group2bug[group])
            })

        IRProgressBar.execute_iteration_for_dict(group2bug, iter_group_bug,
                                                 'Store Index')
        duplicate_group_count_collection.create_index([
            (bug_group_name, IRCollection.ASCENDING)
        ])
        duplicate_group_count_collection.close()
Ejemplo n.º 9
0
    def test_execute_iteration_for_cursor(self):
        """Run ``IRProgressBar.execute_iteration_for_cursor`` on a cursor.

        Connects to MongoDB on 127.0.0.1:27017 and prints every record of
        the ``text`` collection in the ``bug_gnome_test`` database.
        """
        from ir_log import IRProgressBar
        import pymongo

        # NOTE(review): pymongo.Connection is gone in PyMongo 3+ (MongoClient
        # replaces it) -- confirm the PyMongo version this test runs against.
        con = pymongo.Connection('127.0.0.1', 27017)
        col = con['bug_gnome_test']['text']

        def func(ele):
            # Parenthesised single argument: valid and identical in output
            # under both Python 2 and Python 3.
            print(ele)

        IRProgressBar.execute_iteration_for_cursor(col.find(), func, 'Text')
Ejemplo n.º 10
0
    def test_execute_iteration_for_file(self):
        """Run ``IRProgressBar.execute_iteration_for_file`` on a temp file.

        Writes five short lines to ``test.tmp``, then reopens the file for
        reading and prints every element passed to the callback.
        """
        from ir_log import IRProgressBar

        # ``with`` closes both handles; the read handle used to stay open,
        # and the local name no longer shadows the ``file`` builtin.
        with open('test.tmp', 'w') as out_file:
            out_file.write('\n'.join(['One', 'Two', 'III', '4', '5']))

        def func(ele):
            # Works as a Python 2 statement and as a Python 3 function call.
            print(ele)

        with open('test.tmp', 'r') as in_file:
            IRProgressBar.execute_iteration_for_file(in_file, func, 'Text')
Ejemplo n.º 11
0
    def test_execute_iteration_for_cursor(self):
        """Exercise ``IRProgressBar.execute_iteration_for_cursor``.

        Opens the ``text`` collection of the ``bug_gnome_test`` database on
        127.0.0.1:27017 and prints each record handed to the callback.
        """
        from ir_log import IRProgressBar
        import pymongo

        # NOTE(review): PyMongo 3+ removed pymongo.Connection in favour of
        # MongoClient -- verify the installed PyMongo version.
        con = pymongo.Connection('127.0.0.1', 27017)
        col = con['bug_gnome_test']['text']

        def func(ele):
            # Single parenthesised argument keeps this portable between the
            # Python 2 print statement and the Python 3 print function.
            print(ele)

        IRProgressBar.execute_iteration_for_cursor(
            col.find(), func, 'Text')
Ejemplo n.º 12
0
    def test_execute_iteration_for_file(self):
        """Exercise ``IRProgressBar.execute_iteration_for_file``.

        Creates ``test.tmp`` with five short lines, reopens it for reading
        and prints each element the progress-bar helper passes to the
        callback.
        """
        from ir_log import IRProgressBar

        # Context managers guarantee both handles are closed (the reader was
        # previously leaked) and avoid shadowing the ``file`` builtin.
        with open('test.tmp', 'w') as writer:
            writer.write('\n'.join(['One', 'Two', 'III', '4', '5']))

        def func(ele):
            # Same output under Python 2 and Python 3.
            print(ele)

        with open('test.tmp', 'r') as reader:
            IRProgressBar.execute_iteration_for_file(reader, func, 'Text')
Ejemplo n.º 13
0
    def batch_generate_document_count(cls):
        """Batch calculate term count over documents.

        Input is from mongodb, termcount collection. For every term, counts
        in how many records it appears under the summary key and under the
        description key, then writes one record per term to the
        document-count collection.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_term_count import IRTermCount

        config = IRConfig.get_instance()
        term_name = config.get('bug_term_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        # Calculate document count and stored in document_count
        document_count = {}

        def iter_term_count(bug):
            # Summary terms first, then description terms; each occurrence
            # of a term in a field bumps that field's counter by one.
            for field_name in (summary_name, description_name):
                for term in bug[field_name]:
                    if term not in document_count:
                        document_count[term] = {
                            term_name: term,
                            summary_name: 0,
                            description_name: 0
                        }
                    document_count[term][field_name] += 1

        IRProgressBar.execute_iteration_for_cursor(
            IRTermCount.get_iterator({}), iter_term_count,
            "Counting Document Count")
        # Write to db
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'w')

        def write_to_mongo(term):
            documentcount_collection.insert(document_count[term])

        IRProgressBar.execute_iteration_for_dict(document_count,
                                                 write_to_mongo,
                                                 "Write to database")
        # Index the field the records actually carry: they are keyed by
        # term and contain no bug id, so the former bug-id index covered
        # nothing.
        documentcount_collection.create_index([(term_name,
                                                IRCollection.ASCENDING)])
        documentcount_collection.close()
Ejemplo n.º 14
0
 def cache_all_data(cls):
     """Load all TFIDF into memory.

     Reads every record of the tfidf collection and caches, per bug id, the
     pair of values stored under the summary and description keys.
     """
     from ir_log import IRProgressBar
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection

     config = IRConfig.get_instance()
     bug_id_name = config.get('bug_id_name')
     summary_name = config.get('bug_summary_name')
     description_name = config.get('bug_description_name')
     tfidf_collection = IRCollection(
         'bug_db_name', 'bug_tfidf_collection_name', 'r')
     cls.set_is_cache(True)
     cls.__cache = {}

     def iter_tfidf(bug):
         cls.__cache[bug[bug_id_name]] = (bug[summary_name],
                                          bug[description_name])

     IRProgressBar.execute_iteration_for_cursor(tfidf_collection.find(),
                                                iter_tfidf, "Caching TFIDF")
     # Iteration is finished; close the collection rather than leaking it.
     tfidf_collection.close()
Ejemplo n.º 15
0
 def cache_all_data(cls):
     """Cache the term-count records of every bug in memory.

     Each record yielded by ``cls.get_iterator({})`` is stored in the class
     cache under its bug id, as the pair of values found under the summary
     and description keys.
     """
     from ir_log import IRProgressBar
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection

     cls.__is_cache = True

     # field names, with fallbacks when the configuration has no entry
     settings = IRConfig.get_instance()
     id_key = settings.get('bug_id_name', 'bug_id')
     summary_key = settings.get('bug_summary_name', 'summ')
     description_key = settings.get('bug_description_name', 'desc')

     def remember(record):
         entry = (record[summary_key], record[description_key])
         cls.__cache_term_count[record[id_key]] = entry

     IRProgressBar.execute_iteration_for_cursor(
         cls.get_iterator({}), remember, "Caching Term Count")
Ejemplo n.º 16
0
    def parse_dump_dup_file(cls, dump_dup_file=None):
        """Generate duplicate group database from dump dup_file

        Every line is split on ``|`` and its first two fields are read as
        integer bug ids (origin, target). Bugs are grouped per origin, the
        groups are numbered from 0, and both mappings are passed to
        ``__store_to_mongodb``.

        Args:
            dump_dup_file: str, path of the dump file. When None, the path
                is read from the 'bug_dump_dup_filename' config entry.
        """

        from ir_log import IRLog
        from ir_log import IRProgressBar
        # Imported locally, as the other methods do, so the default-path
        # branch does not depend on a module-level import.
        from ir_config import IRConfig

        if dump_dup_file is None:
            dump_dup_file = IRConfig.get_instance(). \
                    get('bug_dump_dup_filename')

        groups = {}
        # ``with`` closes the file even when a malformed line raises.
        with open(dump_dup_file, 'r') as in_file:
            # count the lines
            IRLog.get_instance().println('Counting line number of info level0')
            line_count = sum(1 for _ in in_file)
            in_file.seek(0)

            progress_bar = IRProgressBar(line_count, 'Read sql duplicate file',
                                         False, 0, 1)
            for line_num, line in enumerate(in_file, 1):
                progress_bar.set_value(line_num)
                info = line.strip().split("|")
                origin = int(info[0])
                target = int(info[1])
                # a group always starts with its origin bug
                members = groups.setdefault(origin, [origin])
                if target not in members:
                    members.append(target)

        bug2group = {}
        group2bug = {}
        for index, group in enumerate(groups.values()):
            group2bug[index] = group
            for bug in group:
                bug2group[bug] = index

        cls.__store_to_mongodb(bug2group, group2bug)
Ejemplo n.º 17
0
    def show_distribution_on_product_and_create_ts(cls):
        """Show the distribution of create time and number of products on
        each duplicate group.

        For every distinct group id, looks up the basic record of each bug
        in the group and logs the difference between the largest and
        smallest creation timestamp together with the number of distinct
        products. Bugs without a basic record are skipped.
        """
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        bug2group_collection = IRCollection('bug_db_name',
                                            'bug_duplicate_collection_name',
                                            'r')
        basic_collection = IRCollection('bug_db_name',
                                        'bug_basic_collection_name', 'r')
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        group_name = config.get('bug_group_name')
        product_name = config.get('bug_product_name')
        create_ts_name = config.get('bug_create_ts_name')

        group_ids = bug2group_collection.distinct(group_name)
        progress_bar = IRProgressBar(len(group_ids), "group", False, 0, 1)
        for group_num, group_id in enumerate(group_ids, 1):
            progress_bar.set_value(group_num)
            bugs = bug2group_collection.find({group_name: group_id})
            # sentinels; they survive unchanged when no bug of the group
            # has a basic record
            min_ts = 9999999999
            max_ts = -1000
            product_set = set()
            for bug in bugs:
                basic = basic_collection.find({bug_id_name: bug[bug_id_name]})
                if basic.count() == 0:
                    continue
                # Fetch the record once; presumably this is a PyMongo
                # cursor, where every index access issues its own query.
                record = basic[0]
                ts = record[create_ts_name]
                # ts
                if ts > max_ts:
                    max_ts = ts
                if ts < min_ts:
                    min_ts = ts
                # product
                product_set.add(record[product_name])
            IRLog.get_instance().println('ts span:%d;product number:%d' \
                    % (max_ts - min_ts, len(product_set)), 2)

        # Release both handles, as the other methods do once they are done
        # with a collection.
        bug2group_collection.close()
        basic_collection.close()
Ejemplo n.º 18
0
    def parse_dump_basic_file(cls, dump_filename=None):
        # Not finished yet
        """Extract basic information mysql dump and insert into mongo db

        Each line of the dump is turned into a (bug id, product, creation
        timestamp) record and inserted into the basic collection, which is
        indexed on the bug id afterwards.

        dump_filename: str, Filename of dump file. If this parameter
            is not given, dump_filename will be fetched from
            config file
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        product_name = config.get('bug_product_name', 'product')
        create_ts_name = config.get('bug_create_ts_name', 'ts')

        collection = IRCollection('bug_db_name', 'bug_basic_collection_name',
                                  'w')

        # load and insert text file
        if dump_filename is None:
            dump_filename = config.get('bug_dump_basic_filename')

        def iter_for_line(line):
            # TODO here
            bug_id, product, ts = cls.__extract_basic_from_dump_file_line__(
                line)

            collection.insert({
                bug_id_name: int(bug_id),
                product_name: product,
                create_ts_name: int(ts)
            })

        # ``with`` closes the dump file even if a line fails to parse.
        with open(dump_filename, 'r') as in_file:
            IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                                     'Parsing Dump Basic')
        collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
        collection.close()
Ejemplo n.º 19
0
    def show_distribution_on_product_and_create_ts(cls):
        """Show the distribution of create time and number of products on
        each duplicate group.

        Iterates over the distinct group ids of the duplicate collection.
        For each group it reads the basic record of every member bug and
        logs the spread between the newest and oldest creation timestamp
        plus the count of distinct products. Member bugs that have no basic
        record are ignored.
        """
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        bug2group_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        basic_collection = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'r')
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        group_name = config.get('bug_group_name')
        product_name = config.get('bug_product_name')
        create_ts_name = config.get('bug_create_ts_name')

        group_ids = bug2group_collection.distinct(group_name)
        progress_bar = IRProgressBar(len(group_ids), "group", False, 0, 1)
        for group_num, group_id in enumerate(group_ids, 1):
            progress_bar.set_value(group_num)
            bugs = bug2group_collection.find({group_name: group_id})
            # initial bounds; left untouched when the group has no bug with
            # a basic record
            min_ts = 9999999999
            max_ts = -1000
            product_set = set()
            for bug in bugs:
                basic = basic_collection.find({bug_id_name: bug[bug_id_name]})
                if basic.count() == 0:
                    continue
                # Read the first record a single time; assuming a PyMongo
                # cursor, each ``basic[0]`` would be a separate query.
                record = basic[0]
                ts = record[create_ts_name]
                # ts
                if ts > max_ts:
                    max_ts = ts
                if ts < min_ts:
                    min_ts = ts
                # product
                product_set.add(record[product_name])
            IRLog.get_instance().println('ts span:%d;product number:%d' \
                    % (max_ts - min_ts, len(product_set)), 2)

        # Close both collections now that all reads are done; they were
        # previously left open.
        bug2group_collection.close()
        basic_collection.close()
Ejemplo n.º 20
0
    def parse_dump_dup_file(cls, dump_dup_file=None):
        """Generate duplicate group database from dump dup_file

        Lines are split on ``|``; the first field is the origin bug id and
        the second the target bug id, both read as integers. Targets are
        collected per origin, each collected group receives a running index
        starting at 0, and the resulting mappings are stored through
        ``__store_to_mongodb``.

        Args:
            dump_dup_file: str, path of the dump file. When None, the path
                comes from the 'bug_dump_dup_filename' config entry.
        """

        from ir_log import IRLog
        from ir_log import IRProgressBar
        # Local import, matching the other methods, so that resolving the
        # default path never hits an undefined name.
        from ir_config import IRConfig

        if dump_dup_file is None:
            dump_dup_file = IRConfig.get_instance(). \
                    get('bug_dump_dup_filename')

        groups = {}
        # The context manager closes the file on every exit path, including
        # a ValueError/IndexError raised by a malformed line.
        with open(dump_dup_file, 'r') as in_file:
            # count the lines
            IRLog.get_instance().println('Counting line number of info level0')
            line_count = sum(1 for _ in in_file)
            in_file.seek(0)

            progress_bar = IRProgressBar(
                line_count, 'Read sql duplicate file', False, 0, 1)
            for line_num, line in enumerate(in_file, 1):
                progress_bar.set_value(line_num)
                info = line.strip().split("|")
                origin = int(info[0])
                target = int(info[1])
                # the origin bug is always the first member of its group
                members = groups.setdefault(origin, [origin])
                if target not in members:
                    members.append(target)

        bug2group = {}
        group2bug = {}
        for index, group in enumerate(groups.values()):
            group2bug[index] = group
            for bug in group:
                bug2group[bug] = index

        cls.__store_to_mongodb(bug2group, group2bug)
Ejemplo n.º 21
0
    def cache_all_data(cls):
        """Load all TFIDF into memory.

        Every record of the tfidf collection is cached under its bug id as
        the pair of values stored under the summary and description keys.
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        tfidf_collection = IRCollection('bug_db_name',
                                        'bug_tfidf_collection_name', 'r')
        cls.set_is_cache(True)
        cls.__cache = {}

        def iter_tfidf(bug):
            cls.__cache[bug[bug_id_name]] = (bug[summary_name],
                                             bug[description_name])

        IRProgressBar.execute_iteration_for_cursor(tfidf_collection.find(),
                                                   iter_tfidf, "Caching TFIDF")
        # Done reading; close the collection instead of leaving it open.
        tfidf_collection.close()
Ejemplo n.º 22
0
    def __store_to_mongodb(cls, bug2group, group2bug):
        """Store duplicate group information into Mongodb.

        Inserts a (bug id, group id) record for every bug into the
        duplicate collection and a (group id, group size) record for every
        group into the duplicate group count collection, creating the
        indexes once the inserts are done.

        Args:
            bug2group: dict, {bug_id -> group_id}
            group2bug: dict, {group_id -> [bug_id]}
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        bug_group_name = config.get('bug_group_name')
        duplicate_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'w')

        def iter_bug_group(bug):
            duplicate_collection.insert({bug_id_name: bug,
                                         bug_group_name: bug2group[bug]})

        IRProgressBar.execute_iteration_for_dict(bug2group, iter_bug_group,
                                                 "Store to db")
        duplicate_collection.create_index(
            [(bug_id_name, IRCollection.ASCENDING)])
        duplicate_collection.create_index(
            [(bug_group_name, IRCollection.ASCENDING)])
        duplicate_collection.close()

        # duplicate group size collection
        # (reuses the 'bug_group_name' value fetched above; the unused
        # line-counting loop that stood here was removed)
        group_size_name = config.get('bug_group_size')
        duplicate_group_count_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_group_count_collection_name',
            'w')

        def iter_group_bug(group):
            duplicate_group_count_collection.insert(
                {bug_group_name: group,
                 group_size_name: len(group2bug[group])})

        IRProgressBar.execute_iteration_for_dict(group2bug, iter_group_bug,
                                                 'Store Index')
        duplicate_group_count_collection.create_index(
            [(bug_group_name, IRCollection.ASCENDING)])
        duplicate_group_count_collection.close()
Ejemplo n.º 23
0
    def parse_dump_file(cls, dump_filename=None):
        """Extract text from mysql dump and insert into mongo db

        Each line of the dump is split into bug id, summary and description
        and inserted into the text collection, which is indexed on the bug
        id afterwards.

        dump_filename: str, Filename of dump file. If this parameter
            is not given, dump_filename will be fetched from
            config file
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        # get key name
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        summary_name = config.get('bug_summary_name', 'summ')
        description_name = config.get('bug_description_name', 'desc')
        # collection
        collection = IRCollection('bug_db_name', 'bug_text_collection_name',
                                  'w')

        # load and insert text file
        if dump_filename is None:
            dump_filename = config.get('bug_dump_text_filename')

        def iter_for_line(line):
            bug_id, summary, description = \
                    cls.__extract_summary_and_description_from_dump_file_line(line)
            collection.insert({
                bug_id_name: int(bug_id),
                summary_name: summary,
                description_name: description
            })

        # ``with`` guarantees the dump file is closed even when a line
        # cannot be parsed or inserted.
        with open(dump_filename, 'r') as in_file:
            IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                                     'Parsing Dump')
        collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
        collection.close()
Ejemplo n.º 24
0
    def batch_generate_tfidf(cls):
        """Calculate and store the TFIDF of every term-count record.

        Fills the document-count cache, then applies ``cls.calculate_tfidf``
        to the summary and description values of each record from the
        term-count iterator and inserts the results into the tfidf
        collection. The collection is indexed on the bug id when all
        records are written.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_document_count import IRDocumentCount
        from ir_term_count import IRTermCount

        # read the configuration once
        conf = IRConfig.get_instance()
        key_id = conf.get('bug_id_name')
        key_summary = conf.get('bug_summary_name')
        key_description = conf.get('bug_description_name')
        algorithm_name = conf.get('tfidf_algorithm')

        # set up input and output
        IRDocumentCount.cache_all_data()
        sink = IRCollection('bug_db_name', 'bug_tfidf_collection_name', 'w')
        records = IRTermCount.get_iterator()
        record_total = records.count()

        def process(record):
            weights = {}
            # summary is computed before description
            for field in (key_summary, key_description):
                weights[field] = cls.calculate_tfidf(
                    record[field], field, record_total, None, algorithm_name)
            sink.insert({
                key_id: record[key_id],
                key_summary: weights[key_summary],
                key_description: weights[key_description]
            })

        IRProgressBar.execute_iteration_for_cursor(
            records, process, "Calculating TFIDF")
        ascending_on_id = [(key_id, IRCollection.ASCENDING)]
        sink.create_index(ascending_on_id)
        sink.close()
Ejemplo n.º 25
0
    def parse_dump_basic_file(cls, dump_filename=None):
        # Not finished yet
        """Extract basic information mysql dump and insert into mongo db

        Every line of the dump yields a bug id, a product and a creation
        timestamp; these are inserted into the basic collection, which is
        indexed on the bug id once the whole file has been processed.

        dump_filename: str, Filename of dump file. If this parameter
            is not given, dump_filename will be fetched from
            config file
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        product_name = config.get('bug_product_name', 'product')
        create_ts_name = config.get('bug_create_ts_name', 'ts')

        collection = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'w')

        # load and insert text file
        if dump_filename is None:
            dump_filename = config.get('bug_dump_basic_filename')

        def iter_for_line(line):
            # TODO here
            bug_id, product, ts = cls.__extract_basic_from_dump_file_line__(line)

            collection.insert({bug_id_name: int(bug_id),
                               product_name: product,
                               create_ts_name: int(ts)})

        # The context manager closes the dump file on every exit path,
        # including an exception raised while handling a line.
        with open(dump_filename, 'r') as in_file:
            IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                                     'Parsing Dump Basic')
        collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
        collection.close()
Ejemplo n.º 26
0
    def batch_generate_document_count(cls):
        """Count, for every term, how many bug records contain it.

        Input is every record yielded by IRTermCount.get_iterator({}). For
        each term the number of records listing it under the summary field
        and under the description field is tallied separately; one document
        per term is then inserted into the collection configured as
        'bug_documentcount_collection_name'.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_term_count import IRTermCount

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        term_name = config.get('bug_term_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')

        # term -> document that is written to the database afterwards
        document_count = {}

        def bump(term, field_name):
            # Create the per-term document on first sight, then count.
            if term not in document_count:
                document_count[term] = {term_name: term, summary_name: 0,
                                        description_name: 0}
            document_count[term][field_name] += 1

        def iter_term_count(bug):
            # Summary terms are handled before description terms.
            for field_name in (summary_name, description_name):
                for term in bug[field_name]:
                    bump(term, field_name)

        term_cursor = IRTermCount.get_iterator({})
        IRProgressBar.execute_iteration_for_cursor(
            term_cursor, iter_term_count, "Counting Document Count")

        # Persist the tallies
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'w')

        def write_to_mongo(term):
            documentcount_collection.insert(document_count[term])

        IRProgressBar.execute_iteration_for_dict(
            document_count, write_to_mongo, "Write to database")
        # NOTE(review): the inserted documents carry no bug id field, so an
        # index on the term field may have been intended here - confirm.
        documentcount_collection.create_index(
            [(bug_id_name, IRCollection.ASCENDING)])
        documentcount_collection.close()
Ejemplo n.º 27
0
 def cache_all_data(cls):
     """Load all document counts into memory.

     Every document of the collection configured as
     'bug_documentcount_collection_name' is stored in the class-level
     cache as term -> (summary count, description count). A count field
     that is missing from a document is cached as 0.
     """
     from ir_log import IRProgressBar
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection
     # config
     config = IRConfig.get_instance()
     summary_name = config.get('bug_summary_name')
     description_name = config.get('bug_description_name')
     term_name = config.get('bug_term_name')

     cls.__is_cache = True
     documentcount_collection = IRCollection(
         'bug_db_name', 'bug_documentcount_collection_name', 'r')

     def iter_document_count(term):
         cls.__cache_document_count[term[term_name]] = (
             term.get(summary_name, 0), term.get(description_name, 0))

     try:
         IRProgressBar.execute_iteration_for_cursor(
             documentcount_collection.find({}), iter_document_count,
             "Caching Document Count")
     finally:
         # Close the collection once caching has finished or failed;
         # previously it was never closed.
         documentcount_collection.close()
Ejemplo n.º 28
0
    def test_progress_bar(self):
        """Step two IRProgressBar instances through the values 0..1000."""
        from ir_log import IRLog
        from ir_log import IRProgressBar

        # (title, remaining positional constructor arguments)
        bar_setups = (
            ('ProgressBar Output Not Verbose', (False, 0, 1)),
            ('ProgressBar Output Verbose', (True, 1, 0)),
        )

        IRLog.get_instance().start_log(True)
        for title, extra_args in bar_setups:
            bar = IRProgressBar(1000, title, *extra_args)
            assert bar is not None
            for value in range(0, 1001):
                bar.set_value(value)
        IRLog.get_instance().start_log()
Ejemplo n.º 29
0
 def cache_all_data(cls):
     """Read the whole text collection into the class-level caches.

     For every document of the collection configured as
     'bug_text_collection_name' the (summary, description) pair and the
     stacktrace are stored in two dictionaries keyed by bug id.
     """
     from ir_log import IRProgressBar
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection
     # field names come from the configuration
     config = IRConfig.get_instance()
     bug_id_name = config.get('bug_id_name')
     summary_name = config.get('bug_summary_name')
     description_name = config.get('bug_description_name')
     stacktrace_name = config.get('bug_stacktrace_name')
     # switch caching on, then start from empty caches
     cls.set_is_cache(True)
     text_collection = IRCollection(
         'bug_db_name', 'bug_text_collection_name', 'r')
     cls.__cache_summary_description = {}
     cls.__cache_stacktrace = {}

     def remember_bug(bug):
         texts = (bug[summary_name], bug[description_name])
         bug_id = bug[bug_id_name]
         cls.__cache_summary_description[bug_id] = texts
         cls.__cache_stacktrace[bug_id] = bug[stacktrace_name]

     all_bugs = text_collection.find()
     IRProgressBar.execute_iteration_for_cursor(
         all_bugs, remember_bug, 'Caching Text Data')
     text_collection.close()
Ejemplo n.º 30
0
    def parse_dump_file(cls, dump_filename=None):
        """Extract text from mysql dump and insert into mongo db

        Every line of the dump is parsed into a (bug id, summary,
        description) triple, which is inserted into the collection
        configured as 'bug_text_collection_name'. Once all lines are
        inserted, an ascending index on the bug id field is created.

        Args:
            dump_filename: str, Filename of dump file. If this parameter
                is not given, dump_filename will be fetched from the
                config entry 'bug_dump_text_filename'.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        # get key name
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        summary_name = config.get('bug_summary_name', 'summ')
        description_name = config.get('bug_description_name', 'desc')
        # collection
        collection = IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'w')

        def iter_for_line(line):
            bug_id, summary, description = \
                    cls.__extract_summary_and_description_from_dump_file_line(line)
            collection.insert({bug_id_name: int(bug_id),
                               summary_name: summary,
                               description_name: description})

        # try/finally: the collection is closed even when opening the dump
        # or handling one of its lines raises.
        try:
            if dump_filename is None:
                dump_filename = config.get('bug_dump_text_filename')
            # The with-block closes the dump file on success and on error.
            with open(dump_filename, 'r') as in_file:
                IRProgressBar.execute_iteration_for_file(
                    in_file, iter_for_line, 'Parsing Dump')
            collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
        finally:
            collection.close()
Ejemplo n.º 31
0
    def test_progress_bar(self):
        """Create two progress bars and set each to every value 0..1000."""
        from ir_log import IRLog
        from ir_log import IRProgressBar

        def sweep(title, *bar_args):
            # Build one bar and walk it over its whole range.
            bar = IRProgressBar(1000, title, *bar_args)
            assert bar is not None
            value = 0
            while value <= 1000:
                bar.set_value(value)
                value += 1

        IRLog.get_instance().start_log(True)
        sweep('ProgressBar Output Not Verbose', False, 0, 1)
        sweep('ProgressBar Output Verbose', True, 1, 0)
        IRLog.get_instance().start_log()
Ejemplo n.º 32
0
    def parse_info_level0(cls, info_level0_filename=None):
        """Generate duplicate group database from info level0.

        The file is scanned line by line: a '<bug_id>' line selects the
        current bug, a '<resolution>INCOMPLETE' line marks it as incomplete
        and a '<dup_id>' line links the current bug with another bug.
        Linked bugs are gathered into groups; incomplete bugs are removed
        from their groups afterwards and both mappings are handed to
        cls.__store_to_mongodb.

        Args:
            info_level0_filename: str, If not given, the parameter will be
                loaded from config file.
        """

        from ir_log import IRLog
        from ir_log import IRProgressBar
        max_group_id = 0
        # bug id -> group id, and group id -> list of member bug ids
        bug2group = {}
        group2bug = {}
        incomplete_bug = []
        cur_bug = -1
        is_cur_incomplete = False
        if info_level0_filename is None:
            from ir_config import IRConfig
            info_level0_filename = IRConfig.get_instance(). \
                    get('bug_info_level0_filename')
        # The with-block closes the file even when parsing raises.
        with open(info_level0_filename, 'r') as in_file:
            # count the lines
            IRLog.get_instance().println('Counting line number of info level0')
            line_count = sum(1 for line in in_file)
            in_file.seek(0)

            progress_bar = IRProgressBar(line_count, 'Read info level0',
                                         False, 0, 1)
            # The lines may contain useful information: bug_id, resolution
            # and dup_id
            # bug_id: current bug
            # resolution: the resolution of the current bug
            # dup_id: the duplicate of current bug
            # strategy:
            # 1. drop when resolution is INCOMPLETE
            # 2. (1) if both dup_id and cur_bug are in no group, assign a new
            #        group id for them
            #    (2) if only one of dup_id or cur_bug is in a group, assign
            #        the group id to the other
            #    (3) if both of them are in (different) groups, merge the
            #        groups
            #    (4) if both are already in the same group, nothing changes
            for line_num, line in enumerate(in_file, 1):
                progress_bar.set_value(line_num)
                line = line.strip()
                if line.startswith('<bug_id>'):
                    cur_bug = int(cls.__get_contain(line))
                    is_cur_incomplete = False
                elif line.startswith('<resolution>INCOMPLETE'):
                    is_cur_incomplete = True
                    incomplete_bug.append(cur_bug)
                elif line.startswith('<dup_id>'):
                    if is_cur_incomplete:
                        # ignore this one
                        continue
                    dup_bug = int(cls.__get_contain(line))
                    # -1 means "not in any group"; real ids start at 0
                    cur_bug_group = bug2group.get(cur_bug, -1)
                    dup_bug_group = bug2group.get(dup_bug, -1)
                    if cur_bug_group == -1 and dup_bug_group == -1:
                        # (1) assign a new group id
                        group_id = max_group_id
                        max_group_id += 1
                        bug2group[cur_bug] = group_id
                        bug2group[dup_bug] = group_id
                        group2bug[group_id] = [cur_bug, dup_bug]
                    elif cur_bug_group == dup_bug_group:
                        # (4) already grouped together. Appending again
                        # would list dup_bug twice in the group, and the
                        # single remove() below would leave a stale entry.
                        continue
                    elif cur_bug_group == -1:
                        # (2) cur_bug joins the group of dup_bug
                        group2bug[dup_bug_group].append(cur_bug)
                        bug2group[cur_bug] = dup_bug_group
                    elif dup_bug_group == -1:
                        # (2) dup_bug joins the group of cur_bug
                        group2bug[cur_bug_group].append(dup_bug)
                        bug2group[dup_bug] = cur_bug_group
                    else:
                        # (3) merge small group to the large
                        conserve_group = cur_bug_group
                        remove_group = dup_bug_group
                        if len(group2bug[cur_bug_group]) < \
                                len(group2bug[dup_bug_group]):
                            conserve_group = dup_bug_group
                            remove_group = cur_bug_group
                        moved_bugs = group2bug.pop(remove_group)
                        for bug in moved_bugs:
                            bug2group[bug] = conserve_group
                        group2bug[conserve_group].extend(moved_bugs)
        # remove incomplete bugs
        for bug in incomplete_bug:
            if bug in bug2group:
                group = bug2group.pop(bug)
                group2bug[group].remove(bug)
                if not group2bug[group]:
                    del group2bug[group]

        cls.__store_to_mongodb(bug2group, group2bug)
Ejemplo n.º 33
0
    def parse_info_level1(cls, info_level1_filename=None):
        """Extract text and insert into mongo db

        Every line is parsed into (bug id, summary, description,
        resolution, creation timestamp, product). Lines without a
        resolution or with resolution "INCOMPLETE" are skipped, and so are
        reports for which cls.is_drop_report(description) is true. For the
        remaining reports the text fields go to the collection configured
        as 'bug_text_collection_name' and the basic fields to the one
        configured as 'bug_basic_collection_name'; both get indexed once
        the whole file has been read.

        Args:
            info_level1_filename: str, Filename of info level1. If this
                parameter is not given, bug_info_level1_filename will be
                fetched from config file
        """

        import pymongo
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_gnome_st_tools import IRSTTools
        # get config
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        summary_name = config.get('bug_summary_name', 'summ')
        description_name = config.get('bug_description_name', 'desc')
        stacktrace_name = config.get('bug_stacktrace_name')
        create_ts_name = config.get('bug_create_ts_name')
        product_name = config.get('bug_product_name')
        # collections
        collection = IRCollection('bug_db_name', 'bug_text_collection_name',
                                  'w')
        collection_basic = IRCollection('bug_db_name',
                                        'bug_basic_collection_name', 'w')
        community_name = config.get('community')

        def func_each_line(line):
            bug_id, summary, description, resolution, create_ts, product = \
                    cls.__extract_information_from_info_level1_line(line)

            if resolution is None or resolution == "INCOMPLETE":
                return
            # post process description
            description, stacktrace = \
                    cls.extract_raw_description_info(description,
                                                     community_name)
            # drop the report whose description containing stacktrace info
            if cls.is_drop_report(description):
                IRLog.get_instance().println('Drop report#=%d because it '\
                        'contains unrecognizable stacktrace.' % bug_id, 3)
                return

            collection.insert({
                bug_id_name: bug_id,
                summary_name: summary,
                description_name: description,
                stacktrace_name: stacktrace
            })
            collection_basic.insert({
                bug_id_name: bug_id,
                create_ts_name: create_ts,
                product_name: product
            })

        # try/finally: both collections are closed even when opening the
        # file or handling one of its lines raises.
        try:
            # load and insert text file
            if info_level1_filename is None:
                info_level1_filename = config.get('bug_info_level1_filename')
            # The with-block closes the input file on success and on error.
            with open(info_level1_filename, 'r') as in_file:
                IRProgressBar.execute_iteration_for_file(
                    in_file, func_each_line, "Parsing Infolevel 1")
            collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
            collection_basic.create_index([
                (bug_id_name, IRCollection.ASCENDING),
                (create_ts_name, IRCollection.ASCENDING),
                (product_name, IRCollection.ASCENDING)])
        finally:
            collection.close()
            collection_basic.close()
Ejemplo n.º 34
0
    def parse_info_level0(cls, info_level0_filename=None):
        """Generate duplicate group database from info level0.

        The file is scanned line by line: a '<bug_id>' line selects the
        current bug, a '<resolution>INCOMPLETE' line marks it as incomplete
        and a '<dup_id>' line links the current bug with another bug.
        Linked bugs are gathered into groups; incomplete bugs are removed
        from their groups afterwards and both mappings are handed to
        cls.__store_to_mongodb.

        Args:
            info_level0_filename: str, If not given, the parameter will be
                loaded from config file.
        """

        from ir_log import IRLog
        from ir_log import IRProgressBar
        max_group_id = 0
        # bug id -> group id, and group id -> list of member bug ids
        bug2group = {}
        group2bug = {}
        incomplete_bug = []
        cur_bug = -1
        is_cur_incomplete = False
        if info_level0_filename is None:
            from ir_config import IRConfig
            info_level0_filename = IRConfig.get_instance(). \
                    get('bug_info_level0_filename')
        # The with-block closes the file even when parsing raises.
        with open(info_level0_filename, 'r') as in_file:
            # count the lines
            IRLog.get_instance().println('Counting line number of info level0')
            line_count = sum(1 for line in in_file)
            in_file.seek(0)

            progress_bar = IRProgressBar(line_count, 'Read info level0',
                                         False, 0, 1)
            # The lines may contain useful information: bug_id, resolution
            # and dup_id
            # bug_id: current bug
            # resolution: the resolution of the current bug
            # dup_id: the duplicate of current bug
            # strategy:
            # 1. drop when resolution is INCOMPLETE
            # 2. (1) if both dup_id and cur_bug are in no group, assign a new
            #        group id for them
            #    (2) if only one of dup_id or cur_bug is in a group, assign
            #        the group id to the other
            #    (3) if both of them are in (different) groups, merge the
            #        groups
            #    (4) if both are already in the same group, nothing changes
            for line_num, line in enumerate(in_file, 1):
                progress_bar.set_value(line_num)
                line = line.strip()
                if line.startswith('<bug_id>'):
                    cur_bug = int(cls.__get_contain(line))
                    is_cur_incomplete = False
                elif line.startswith('<resolution>INCOMPLETE'):
                    is_cur_incomplete = True
                    incomplete_bug.append(cur_bug)
                elif line.startswith('<dup_id>'):
                    if is_cur_incomplete:
                        # ignore this one
                        continue
                    dup_bug = int(cls.__get_contain(line))
                    # -1 means "not in any group"; real ids start at 0
                    cur_bug_group = bug2group.get(cur_bug, -1)
                    dup_bug_group = bug2group.get(dup_bug, -1)
                    if cur_bug_group == -1 and dup_bug_group == -1:
                        # (1) assign a new group id
                        group_id = max_group_id
                        max_group_id += 1
                        bug2group[cur_bug] = group_id
                        bug2group[dup_bug] = group_id
                        group2bug[group_id] = [cur_bug, dup_bug]
                    elif cur_bug_group == dup_bug_group:
                        # (4) already grouped together. Appending again
                        # would list dup_bug twice in the group, and the
                        # single remove() below would leave a stale entry.
                        continue
                    elif cur_bug_group == -1:
                        # (2) cur_bug joins the group of dup_bug
                        group2bug[dup_bug_group].append(cur_bug)
                        bug2group[cur_bug] = dup_bug_group
                    elif dup_bug_group == -1:
                        # (2) dup_bug joins the group of cur_bug
                        group2bug[cur_bug_group].append(dup_bug)
                        bug2group[dup_bug] = cur_bug_group
                    else:
                        # (3) merge small group to the large
                        conserve_group = cur_bug_group
                        remove_group = dup_bug_group
                        if len(group2bug[cur_bug_group]) < \
                                len(group2bug[dup_bug_group]):
                            conserve_group = dup_bug_group
                            remove_group = cur_bug_group
                        moved_bugs = group2bug.pop(remove_group)
                        for bug in moved_bugs:
                            bug2group[bug] = conserve_group
                        group2bug[conserve_group].extend(moved_bugs)
        # remove incomplete bugs
        for bug in incomplete_bug:
            if bug in bug2group:
                group = bug2group.pop(bug)
                group2bug[group].remove(bug)
                if not group2bug[group]:
                    del group2bug[group]

        cls.__store_to_mongodb(bug2group, group2bug)
Ejemplo n.º 35
0
    def parse_info_level1(cls, info_level1_filename=None):
        """Extract text and insert into mongo db

        Every line is parsed into (bug id, summary, description,
        resolution, creation timestamp, product). Lines without a
        resolution or with resolution "INCOMPLETE" are skipped, and so are
        reports for which cls.is_drop_report(description) is true. For the
        remaining reports the text fields go to the collection configured
        as 'bug_text_collection_name' and the basic fields to the one
        configured as 'bug_basic_collection_name'; both get indexed once
        the whole file has been read.

        Args:
            info_level1_filename: str, Filename of info level1. If this
                parameter is not given, bug_info_level1_filename will be
                fetched from config file
        """

        import pymongo
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_gnome_st_tools import IRSTTools
        # get config
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        summary_name = config.get('bug_summary_name', 'summ')
        description_name = config.get('bug_description_name', 'desc')
        stacktrace_name = config.get('bug_stacktrace_name')
        create_ts_name = config.get('bug_create_ts_name')
        product_name = config.get('bug_product_name')
        # collections
        collection = IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'w')
        collection_basic = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'w')
        community_name = config.get('community')

        def func_each_line(line):
            bug_id, summary, description, resolution, create_ts, product = \
                    cls.__extract_information_from_info_level1_line(line)

            if resolution is None or resolution == "INCOMPLETE":
                return
            # post process description
            description, stacktrace = \
                    cls.extract_raw_description_info(description,
                                                     community_name)
            # drop the report whose description containing stacktrace info
            if cls.is_drop_report(description):
                IRLog.get_instance().println('Drop report#=%d because it '\
                        'contains unrecognizable stacktrace.' % bug_id, 3)
                return

            collection.insert({bug_id_name: bug_id,
                               summary_name: summary,
                               description_name: description,
                               stacktrace_name: stacktrace})
            collection_basic.insert({bug_id_name: bug_id,
                                     create_ts_name: create_ts,
                                     product_name: product})

        # try/finally: both collections are closed even when opening the
        # file or handling one of its lines raises.
        try:
            # load and insert text file
            if info_level1_filename is None:
                info_level1_filename = config.get('bug_info_level1_filename')
            # The with-block closes the input file on success and on error.
            with open(info_level1_filename, 'r') as in_file:
                IRProgressBar.execute_iteration_for_file(
                    in_file, func_each_line, "Parsing Infolevel 1")
            collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
            collection_basic.create_index([
                (bug_id_name, IRCollection.ASCENDING),
                (create_ts_name, IRCollection.ASCENDING),
                (product_name, IRCollection.ASCENDING)])
        finally:
            collection.close()
            collection_basic.close()