def do_remove_bad_reports(cls, config_file):
    """Remove reports whose description is flagged as droppable.

    Scans every report in the text collection, collects the ids that
    IRText.is_drop_report rejects, and deletes those ids from the
    text, tfidf and duplicate collections.

    Args:
        config_file: str, path of the config file to load.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    import ir_mongodb_helper
    from ir_text import IRText

    config = IRConfig.get_instance()
    config.load(config_file)
    bug_id_name = config.get('bug_id_name')
    bug_description_name = config.get('bug_description_name')

    text_cursor = IRText.get_iterator(None)
    remove_ids = []
    def iter_text(item):
        if IRText.is_drop_report(item[bug_description_name]):
            remove_ids.append(item[bug_id_name])
            IRLog.get_instance().println('Remove report#=%d' % \
                    item[bug_id_name], 3)
    IRProgressBar.execute_iteration_for_cursor(text_cursor, iter_text)

    # remove from all database
    def remove_from_collection(collection_cfg_name):
        collection = ir_mongodb_helper.IRCollection(
                'bug_db_name', collection_cfg_name, 'a')
        # BUGFIX: query by the configured id key instead of the
        # hard-coded 'bug_id' literal, so a non-default bug_id_name
        # still removes the right documents.
        collection.remove({bug_id_name: {'$in': remove_ids}})
        collection.close()
    remove_from_collection('bug_text_collection_name')
    remove_from_collection('bug_tfidf_collection_name')
    remove_from_collection('bug_duplicate_collection_name')
def cache_all_data(cls):
    """Read every document-count record into the in-memory cache.

    Fills cls.__cache_document_count with
    {term -> (summary_count, description_count)} and marks the cache
    as loaded.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    cfg = IRConfig.get_instance()
    summary_name = cfg.get('bug_summary_name')
    description_name = cfg.get('bug_description_name')
    term_name = cfg.get('bug_term_name')

    cls.__is_cache = True
    documentcount_collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'r')

    def iter_document_count(term):
        # a missing field means the term never occurred in that section
        summary = term.get(summary_name, 0)
        description = term.get(description_name, 0)
        cls.__cache_document_count[term[term_name]] = (summary, description)

    IRProgressBar.execute_iteration_for_cursor(
        documentcount_collection.find({}),
        iter_document_count, "Caching Document Count")
def batch_generate_term_count(cls):
    """Compute term counts for every text document and persist them.

    Reads each bug's summary/description via IRText, runs
    cls.calculate_term_count on them, and writes the resulting bags
    of words to the termcount collection, indexed by bug id.
    """
    from ir_log import IRProgressBar
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    cfg = IRConfig.get_instance()
    bug_id_name = cfg.get('bug_id_name', 'bug_id')
    summary_name = cfg.get('bug_summary_name', 'summ')
    description_name = cfg.get('bug_description_name', 'desc')

    termcount_collection = IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'w')

    def iter_text(bug):
        summary_bow, description_bow = cls.calculate_term_count(
            bug[summary_name], bug[description_name])
        termcount_collection.insert({bug_id_name: bug[bug_id_name],
                                     summary_name: summary_bow,
                                     description_name: description_bow})

    IRProgressBar.execute_iteration_for_cursor(
        IRText.get_iterator({}), iter_text, "From Text to Term Count")
    termcount_collection.create_index(
        [(bug_id_name, IRCollection.ASCENDING)])
    termcount_collection.close()
def test_execute_iteration_for_dict(self): from ir_log import IRProgressBar dictionary = {1:1, 2:2, 3:3, 4:4, 5:5} def func(ele): print ele, dictionary[ele] IRProgressBar.execute_iteration_for_dict(dictionary, func, 'Dict')
def batch_generate_tfidf(cls):
    """Compute TFIDF vectors for every bug and persist them.

    Iterates the term-count collection, converts each bug's summary
    and description counts to TFIDF via cls.calculate_tfidf, and
    stores the vectors in the tfidf collection, indexed by bug id.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_document_count import IRDocumentCount
    from ir_term_count import IRTermCount

    cfg = IRConfig.get_instance()
    bug_id_name = cfg.get('bug_id_name')
    summary_name = cfg.get('bug_summary_name')
    description_name = cfg.get('bug_description_name')
    tfidf_algorithm = cfg.get('tfidf_algorithm')

    # document counts are read for every bug; keep them in memory
    IRDocumentCount.cache_all_data()
    tfidf_collection = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'w')

    termcount_iterator = IRTermCount.get_iterator()
    bug_count = termcount_iterator.count()

    def iter_term_count(bug):
        summary_tfidf = cls.calculate_tfidf(
            bug[summary_name], summary_name,
            bug_count, None, tfidf_algorithm)
        description_tfidf = cls.calculate_tfidf(
            bug[description_name], description_name,
            bug_count, None, tfidf_algorithm)
        tfidf_collection.insert({bug_id_name: bug[bug_id_name],
                                 summary_name: summary_tfidf,
                                 description_name: description_tfidf})

    IRProgressBar.execute_iteration_for_cursor(
        termcount_iterator, iter_term_count, "Calculating TFIDF")
    tfidf_collection.create_index(
        [(bug_id_name, IRCollection.ASCENDING)])
    tfidf_collection.close()
def cache_all_data(cls):
    """Load every text record into memory.

    Fills cls.__cache_summary_description with
    {bug_id -> (summary, description)} and cls.__cache_stacktrace
    with {bug_id -> stacktrace}.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    cfg = IRConfig.get_instance()
    bug_id_name = cfg.get('bug_id_name')
    summary_name = cfg.get('bug_summary_name')
    description_name = cfg.get('bug_description_name')
    stacktrace_name = cfg.get('bug_stacktrace_name')

    cls.set_is_cache(True)
    text_collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r')
    cls.__cache_summary_description = {}
    cls.__cache_stacktrace = {}

    def iter_func(bug):
        bug_id = bug[bug_id_name]
        cls.__cache_summary_description[bug_id] = \
                (bug[summary_name], bug[description_name])
        cls.__cache_stacktrace[bug_id] = bug[stacktrace_name]

    IRProgressBar.execute_iteration_for_cursor(
        text_collection.find(), iter_func, 'Caching Text Data')
    text_collection.close()
def test_execute_iteration_for_dict(self): from ir_log import IRProgressBar dictionary = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} def func(ele): print ele, dictionary[ele] IRProgressBar.execute_iteration_for_dict(dictionary, func, 'Dict')
def __store_to_mongodb(cls, bug2group, group2bug):
    """Store duplicate group information into Mongodb.

    Writes one {bug_id, group_id} document per bug into the duplicate
    collection, and one {group_id, size} document per group into the
    duplicate-group-count collection.

    Args:
        bug2group: dict, {bug_id -> group_id}
        group2bug: dict, {group_id -> [bug_id]}
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    bug_id_name = IRConfig.get_instance().get('bug_id_name')
    bug_group_name = IRConfig.get_instance().get('bug_group_name')
    duplicate_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'w')

    def iter_bug_group(bug):
        duplicate_collection.insert({
            bug_id_name: bug,
            bug_group_name: bug2group[bug]})
    IRProgressBar.execute_iteration_for_dict(bug2group, iter_bug_group,
                                             "Store to db")
    duplicate_collection.create_index(
        [(bug_id_name, IRCollection.ASCENDING)])
    duplicate_collection.create_index(
        [(bug_group_name, IRCollection.ASCENDING)])
    duplicate_collection.close()

    # duplicate group size collection
    group_name = IRConfig.get_instance().get('bug_group_name')
    group_size_name = IRConfig.get_instance().get('bug_group_size')
    duplicate_group_count_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_group_count_collection_name', 'w')
    # BUGFIX: removed a dead loop that counted group2bug entries into
    # a line_num variable which was never used.
    def iter_group_bug(group):
        duplicate_group_count_collection.insert({
            group_name: group,
            group_size_name: len(group2bug[group])})
    IRProgressBar.execute_iteration_for_dict(group2bug, iter_group_bug,
                                             'Store Index')
    duplicate_group_count_collection.create_index(
        [(group_name, IRCollection.ASCENDING)])
    duplicate_group_count_collection.close()
def test_execute_iteration_for_cursor(self): from ir_log import IRProgressBar import pymongo con = pymongo.Connection('127.0.0.1', 27017) col = con['bug_gnome_test']['text'] def func(ele): print ele IRProgressBar.execute_iteration_for_cursor(col.find(), func, 'Text')
def test_execute_iteration_for_file(self): from ir_log import IRProgressBar file = open('test.tmp', 'w') file.write('\n'.join(['One', 'Two', 'III', '4', '5'])) file.close() def func(ele): print ele file = open('test.tmp', 'r') IRProgressBar.execute_iteration_for_file(file, func, 'Text')
def test_execute_iteration_for_cursor(self): from ir_log import IRProgressBar import pymongo con = pymongo.Connection('127.0.0.1', 27017) col = con['bug_gnome_test']['text'] def func(ele): print ele IRProgressBar.execute_iteration_for_cursor( col.find(), func, 'Text')
def batch_generate_document_count(cls):
    """Batch calculate term count over documents.

    For each term, counts in how many bugs' summaries and in how many
    bugs' descriptions it appears (input: the termcount collection),
    then writes one document per term to the documentcount collection.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_term_count import IRTermCount

    cfg = IRConfig.get_instance()
    term_name = cfg.get('bug_term_name')
    summary_name = cfg.get('bug_summary_name')
    description_name = cfg.get('bug_description_name')

    # Accumulate counts in memory: term -> counts document.
    document_count = {}

    def iter_term_count(bug):
        # The summary and description sections are counted identically,
        # so handle both fields in one loop instead of duplicating it.
        for field in (summary_name, description_name):
            for term in bug[field]:
                if term not in document_count:
                    document_count[term] = {term_name: term,
                                            summary_name: 0,
                                            description_name: 0}
                document_count[term][field] += 1

    IRProgressBar.execute_iteration_for_cursor(
        IRTermCount.get_iterator({}), iter_term_count,
        "Counting Document Count")

    # Write to db
    documentcount_collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'w')
    def write_to_mongo(term):
        documentcount_collection.insert(document_count[term])
    IRProgressBar.execute_iteration_for_dict(document_count, write_to_mongo,
                                             "Write to database")
    # BUGFIX: index the term field; the inserted documents contain no
    # bug id field, so the previous bug-id index was useless.
    documentcount_collection.create_index(
        [(term_name, IRCollection.ASCENDING)])
    documentcount_collection.close()
def cache_all_data(cls):
    """Read every TFIDF record into the in-memory cache.

    Fills cls.__cache with {bug_id -> (summary_tfidf, description_tfidf)}.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    cfg = IRConfig.get_instance()
    bug_id_name = cfg.get('bug_id_name')
    summary_name = cfg.get('bug_summary_name')
    description_name = cfg.get('bug_description_name')

    tfidf_collection = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'r')
    cls.set_is_cache(True)
    cls.__cache = {}

    def iter_tfidf(bug):
        cls.__cache[bug[bug_id_name]] = \
                (bug[summary_name], bug[description_name])

    IRProgressBar.execute_iteration_for_cursor(
        tfidf_collection.find(), iter_tfidf, "Caching TFIDF")
def cache_all_data(cls):
    """Read every term-count record into the in-memory cache.

    Fills cls.__cache_term_count with
    {bug_id -> (summary_counts, description_counts)} and marks the
    cache as loaded.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    cls.__is_cache = True
    cfg = IRConfig.get_instance()
    bug_name = cfg.get('bug_id_name', 'bug_id')
    summary_name = cfg.get('bug_summary_name', 'summ')
    description_name = cfg.get('bug_description_name', 'desc')

    def iter_term_count(bug):
        cls.__cache_term_count[bug[bug_name]] = \
                (bug[summary_name], bug[description_name])

    IRProgressBar.execute_iteration_for_cursor(
        cls.get_iterator({}), iter_term_count, "Caching Term Count")
def parse_dump_dup_file(cls, dump_dup_file=None):
    """Generate duplicate group database from dump dup_file.

    Each line of the dump file is 'origin|target'; the targets listed
    under one origin form one duplicate group.

    Args:
        dump_dup_file: str, path of the dump file. Falls back to the
            'bug_dump_dup_filename' config entry when None.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    if dump_dup_file is None:
        # BUGFIX: IRConfig was referenced here without being imported,
        # raising NameError whenever the default path was needed.
        from ir_config import IRConfig
        dump_dup_file = IRConfig.get_instance(). \
                get('bug_dump_dup_filename')
    in_file = open(dump_dup_file, 'r')
    # count the lines first so the progress bar knows its range
    IRLog.get_instance().println('Counting line number of info level0')
    line_count = sum(1 for line in in_file)
    in_file.seek(0)
    progress_bar = IRProgressBar(line_count, 'Read sql duplicate file',
                                 False, 0, 1)
    line_num = 0
    groups = {}
    for line in in_file:
        line_num += 1
        progress_bar.set_value(line_num)
        info = line.strip().split("|")
        origin = int(info[0])
        target = int(info[1])
        if origin not in groups:
            groups[origin] = [origin]
        if target not in groups[origin]:
            groups[origin].append(target)
    in_file.close()
    # assign consecutive group ids and build both directions of the map
    index = 0
    bug2group = {}
    group2bug = {}
    for key, group in groups.items():
        group2bug[index] = group
        for bug in group:
            bug2group[bug] = index
        index += 1
    cls.__store_to_mongodb(bug2group, group2bug)
def show_distribution_on_product_and_create_ts(cls):
    """Show the distribution of create time and number of products on
    each duplicate group.

    For every duplicate group, logs the create-timestamp span and the
    number of distinct products among its bugs.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    bug2group_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    bug_id_name = IRConfig.get_instance().get('bug_id_name')
    group_name = IRConfig.get_instance().get('bug_group_name')
    product_name = IRConfig.get_instance().get('bug_product_name')
    create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')
    group_ids = bug2group_collection.distinct(group_name)
    progress_bar = IRProgressBar(len(group_ids), "group", False, 0, 1)
    group_num = 0
    for group_id in group_ids:
        group_num += 1
        progress_bar.set_value(group_num)
        bugs = bug2group_collection.find({group_name: group_id})
        # sentinels chosen so any real timestamp replaces them
        min_ts = 9999999999
        max_ts = -1000
        product_set = set()
        for bug in bugs:
            bug_id = bug[bug_id_name]
            basic = basic_collection.find({bug_id_name: bug_id})
            if basic.count() == 0:
                # no basic record for this bug; skip it
                continue
            ts = basic[0][create_ts_name]
            product = basic[0][product_name]
            if ts > max_ts:
                max_ts = ts
            if ts < min_ts:
                min_ts = ts
            product_set.add(product)
        IRLog.get_instance().println('ts span:%d;product number:%d' \
                % (max_ts - min_ts, len(product_set)), 2)
    # BUGFIX: release the read collections (they were never closed).
    bug2group_collection.close()
    basic_collection.close()
def parse_dump_basic_file(cls, dump_filename=None):
    # Not finished yet
    """Extract basic information from a mysql dump and insert into mongo db.

    Args:
        dump_filename: str, filename of the dump file. Falls back to
            the 'bug_dump_basic_filename' config entry when None.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    cfg = IRConfig.get_instance()
    bug_id_name = cfg.get('bug_id_name', 'bug_id')
    product_name = cfg.get('bug_product_name', 'product')
    create_ts_name = cfg.get('bug_create_ts_name', 'ts')
    collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'w')

    # load and insert text file
    if dump_filename is None:
        dump_filename = cfg.get('bug_dump_basic_filename')
    in_file = open(dump_filename, 'r')

    def iter_for_line(line):
        # TODO here
        bug_id, product, ts = \
                cls.__extract_basic_from_dump_file_line__(line)
        collection.insert({bug_id_name: int(bug_id),
                           product_name: product,
                           create_ts_name: int(ts)})

    IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                             'Parsing Dump Basic')
    in_file.close()
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection.close()
def show_distribution_on_product_and_create_ts(cls):
    """Show the distribution of create time and number of products on
    each duplicate group.

    Logs, per duplicate group, the span between the earliest and the
    latest create timestamp and the count of distinct products.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    bug2group_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    bug_id_name = IRConfig.get_instance().get('bug_id_name')
    group_name = IRConfig.get_instance().get('bug_group_name')
    product_name = IRConfig.get_instance().get('bug_product_name')
    create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')
    group_ids = bug2group_collection.distinct(group_name)
    progress_bar = IRProgressBar(len(group_ids), "group", False, 0, 1)
    group_num = 0
    for group_id in group_ids:
        group_num += 1
        progress_bar.set_value(group_num)
        bugs = bug2group_collection.find({group_name : group_id})
        # sentinels chosen so any real timestamp replaces them
        min_ts = 9999999999
        max_ts = -1000
        product_set = set()
        for bug in bugs:
            bug_id = bug[bug_id_name]
            basic = basic_collection.find({bug_id_name : bug_id})
            if basic.count() == 0:
                # no basic record for this bug; skip it
                continue
            ts = basic[0][create_ts_name]
            product = basic[0][product_name]
            if ts > max_ts:
                max_ts = ts
            if ts < min_ts:
                min_ts = ts
            product_set.add(product)
        IRLog.get_instance().println('ts span:%d;product number:%d' \
                % (max_ts - min_ts, len(product_set)), 2)
    # BUGFIX: close the read collections (they were left open before).
    bug2group_collection.close()
    basic_collection.close()
def parse_dump_dup_file(cls, dump_dup_file = None):
    """Generate duplicate group database from dump dup_file.

    Lines are 'origin|target' pairs; every origin collects its targets
    into one duplicate group.

    Args:
        dump_dup_file: str, path of the dump file. Falls back to the
            'bug_dump_dup_filename' config entry when None.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    if None == dump_dup_file:
        # BUGFIX: IRConfig was used without an import in this branch,
        # raising NameError whenever the default filename was needed.
        from ir_config import IRConfig
        dump_dup_file = IRConfig.get_instance(). \
                get('bug_dump_dup_filename')
    in_file = open(dump_dup_file, 'r')
    # count the lines so the progress bar has a total
    IRLog.get_instance().println('Counting line number of info level0')
    line_count = sum(1 for line in in_file)
    in_file.seek(0)
    progress_bar = IRProgressBar(line_count, 'Read sql duplicate file',
                                 False, 0, 1)
    line_num = 0
    groups = {}
    for line in in_file:
        line_num += 1
        progress_bar.set_value(line_num)
        line = line.strip()
        info = line.split("|")
        origin = int(info[0])
        target = int(info[1])
        if not origin in groups:
            groups[origin] = [origin]
        if not target in groups[origin]:
            groups[origin].append(target)
    in_file.close()
    # number the groups and build both directions of the mapping
    index = 0
    bug2group = {}
    group2bug = {}
    for key, group in groups.items():
        group2bug[index] = group
        for bug in group:
            bug2group[bug] = index
        index += 1
    cls.__store_to_mongodb(bug2group, group2bug)
def cache_all_data(cls):
    """Load all TFIDF records into the class-level cache.

    After this call cls.__cache maps
    bug_id -> (summary_tfidf, description_tfidf).
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')

    tfidf_collection = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'r')
    cls.set_is_cache(True)
    cls.__cache = {}

    def iter_tfidf(bug):
        pair = (bug[summary_name], bug[description_name])
        cls.__cache[bug[bug_id_name]] = pair

    IRProgressBar.execute_iteration_for_cursor(
        tfidf_collection.find(), iter_tfidf, "Caching TFIDF")
def __store_to_mongodb(cls, bug2group, group2bug):
    """Store duplicate group information into Mongodb.

    Persists the bug->group mapping and a per-group size document.

    Args:
        bug2group: dict, {bug_id -> group_id}
        group2bug: dict, {group_id -> [bug_id]}
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    bug_id_name = IRConfig.get_instance().get('bug_id_name')
    bug_group_name = IRConfig.get_instance().get('bug_group_name')
    duplicate_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'w')

    def iter_bug_group(bug):
        duplicate_collection.insert({bug_id_name : bug,
                                     bug_group_name : bug2group[bug]})
    IRProgressBar.execute_iteration_for_dict(bug2group, iter_bug_group,
                                             "Store to db")
    duplicate_collection.create_index(
        [(bug_id_name, IRCollection.ASCENDING)])
    duplicate_collection.create_index(
        [(bug_group_name, IRCollection.ASCENDING)])
    duplicate_collection.close()

    # duplicate group size collection
    group_name = IRConfig.get_instance().get('bug_group_name')
    group_size_name = IRConfig.get_instance().get('bug_group_size')
    duplicate_group_count_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_group_count_collection_name', 'w')
    # BUGFIX: dropped a dead loop that only incremented an unused
    # line_num counter over group2bug.
    def iter_group_bug(group):
        duplicate_group_count_collection.insert(
            {group_name : group,
             group_size_name : len(group2bug[group])})
    IRProgressBar.execute_iteration_for_dict(group2bug, iter_group_bug,
                                             'Store Index')
    duplicate_group_count_collection.create_index(
        [(group_name, IRCollection.ASCENDING)])
    duplicate_group_count_collection.close()
def parse_dump_file(cls, dump_filename=None):
    """Extract text from a mysql dump and insert it into mongo db.

    Args:
        dump_filename: str, filename of the dump file. Falls back to
            the 'bug_dump_text_filename' config entry when None.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    # get key name
    cfg = IRConfig.get_instance()
    bug_id_name = cfg.get('bug_id_name', 'bug_id')
    summary_name = cfg.get('bug_summary_name', 'summ')
    description_name = cfg.get('bug_description_name', 'desc')

    # collection
    collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'w')

    # load and insert text file
    if dump_filename is None:
        dump_filename = cfg.get('bug_dump_text_filename')
    in_file = open(dump_filename, 'r')

    def iter_for_line(line):
        bug_id, summary, description = \
                cls.__extract_summary_and_description_from_dump_file_line(line)
        collection.insert({bug_id_name: int(bug_id),
                           summary_name: summary,
                           description_name: description})

    IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                             'Parsing Dump')
    in_file.close()
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection.close()
def batch_generate_tfidf(cls):
    """Batch calculate TFIDF for all bugs and write to the tfidf collection."""
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_document_count import IRDocumentCount
    from ir_term_count import IRTermCount

    # get config
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')
    tfidf_algorithm = config.get('tfidf_algorithm')

    # prepare collections; document counts are consulted per bug,
    # so load them into memory once
    IRDocumentCount.cache_all_data()
    tfidf_collection = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'w')

    # batch calculate tfidf
    termcount_iterator = IRTermCount.get_iterator()
    bug_count = termcount_iterator.count()

    def iter_term_count(bug):
        summ_vec = cls.calculate_tfidf(bug[summary_name], summary_name,
                                       bug_count, None, tfidf_algorithm)
        desc_vec = cls.calculate_tfidf(bug[description_name],
                                       description_name,
                                       bug_count, None, tfidf_algorithm)
        tfidf_collection.insert({bug_id_name: bug[bug_id_name],
                                 summary_name: summ_vec,
                                 description_name: desc_vec})

    IRProgressBar.execute_iteration_for_cursor(
        termcount_iterator, iter_term_count, "Calculating TFIDF")
    tfidf_collection.create_index(
        [(bug_id_name, IRCollection.ASCENDING)])
    tfidf_collection.close()
def parse_dump_basic_file(cls, dump_filename = None):
    # Not finished yet
    """Extract basic information (id, product, create ts) from a mysql
    dump and insert it into mongo db.

    Args:
        dump_filename: str, filename of the dump file. Falls back to
            the 'bug_dump_basic_filename' config entry when None.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name', 'bug_id')
    product_name = config.get('bug_product_name', 'product')
    create_ts_name = config.get('bug_create_ts_name', 'ts')
    collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'w')

    # load and insert text file
    if dump_filename is None:
        dump_filename = config.get('bug_dump_basic_filename')
    in_file = open(dump_filename, 'r')

    def iter_for_line(line):
        # TODO here
        bug_id, product, ts = cls.__extract_basic_from_dump_file_line__(line)
        record = {bug_id_name : int(bug_id),
                  product_name: product,
                  create_ts_name : int(ts)}
        collection.insert(record)

    IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                             'Parsing Dump Basic')
    in_file.close()
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection.close()
def batch_generate_document_count(cls):
    """Batch calculate term count over documents.

    Input is the termcount collection; output is one document per term
    recording in how many summaries and descriptions it appears.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_term_count import IRTermCount

    config = IRConfig.get_instance()
    term_name = config.get('bug_term_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')

    # Calculate document count and store it in document_count
    document_count = {}

    def iter_term_count(bug):
        # summary and description use the same counting logic;
        # loop over the two fields instead of repeating the code
        for field in (summary_name, description_name):
            for term in bug[field]:
                if not term in document_count:
                    document_count[term] = {term_name: term,
                                            summary_name: 0,
                                            description_name: 0}
                document_count[term][field] += 1

    IRProgressBar.execute_iteration_for_cursor(IRTermCount.get_iterator({}),
                                               iter_term_count,
                                               "Counting Document Count")
    # Write to db
    documentcount_collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'w')
    def write_to_mongo(term):
        documentcount_collection.insert(document_count[term])
    IRProgressBar.execute_iteration_for_dict(document_count, write_to_mongo,
                                             "Write to database")
    # BUGFIX: the stored documents have no bug id field, so index the
    # term field rather than the (absent) bug id.
    documentcount_collection.create_index(
        [(term_name, IRCollection.ASCENDING)])
    documentcount_collection.close()
def cache_all_data(cls):
    """Load all document counts into memory.

    Populates cls.__cache_document_count with
    {term -> (summary_count, description_count)}.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    # config
    config = IRConfig.get_instance()
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')
    term_name = config.get('bug_term_name')

    cls.__is_cache = True
    documentcount_collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'r')

    def iter_document_count(term):
        # absent fields mean zero occurrences in that section
        if summary_name in term:
            summary = term[summary_name]
        else:
            summary = 0
        if description_name in term:
            description = term[description_name]
        else:
            description = 0
        cls.__cache_document_count[term[term_name]] = (summary, description)

    IRProgressBar.execute_iteration_for_cursor(
        documentcount_collection.find({}),
        iter_document_count, "Caching Document Count")
def test_progress_bar(self):
    """Exercise IRProgressBar in non-verbose and verbose modes."""
    from ir_log import IRLog
    from ir_log import IRProgressBar
    IRLog.get_instance().start_log(True)
    bar = IRProgressBar(1000, 'ProgressBar Output Not Verbose', False, 0, 1)
    assert bar is not None
    for value in range(0, 1001):
        bar.set_value(value)
    bar = IRProgressBar(1000, 'ProgressBar Output Verbose', True, 1, 0)
    assert bar is not None
    for value in range(0, 1001):
        bar.set_value(value)
    IRLog.get_instance().start_log()
def cache_all_data(cls):
    """Load all text data into memory.

    Builds the summary/description cache and the stacktrace cache,
    keyed by bug id, then closes the text collection.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    # get config
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')
    stacktrace_name = config.get('bug_stacktrace_name')

    # caching data
    cls.set_is_cache(True)
    text_collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r')
    cls.__cache_summary_description = {}
    cls.__cache_stacktrace = {}

    def iter_func(bug):
        key = bug[bug_id_name]
        cls.__cache_summary_description[key] = (bug[summary_name],
                                                bug[description_name])
        cls.__cache_stacktrace[key] = bug[stacktrace_name]

    IRProgressBar.execute_iteration_for_cursor(
        text_collection.find(), iter_func, 'Caching Text Data')
    text_collection.close()
def parse_dump_file(cls, dump_filename = None):
    """Extract text from a mysql dump and insert it into mongo db.

    Args:
        dump_filename: str, filename of the dump file. When None, the
            'bug_dump_text_filename' config entry is used instead.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    # get key name
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name', 'bug_id')
    summary_name = config.get('bug_summary_name', 'summ')
    description_name = config.get('bug_description_name', 'desc')

    # collection
    collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'w')

    # load and insert text file
    if None == dump_filename:
        dump_filename = config.get('bug_dump_text_filename')
    in_file = open(dump_filename, 'r')

    def iter_for_line(line):
        bug_id, summary, description = \
                cls.__extract_summary_and_description_from_dump_file_line(line)
        record = {bug_id_name : int(bug_id),
                  summary_name: summary,
                  description_name : description}
        collection.insert(record)

    IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                             'Parsing Dump')
    in_file.close()
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection.close()
def test_progress_bar(self):
    """Drive a 1000-step progress bar through both output modes."""
    from ir_log import IRLog
    from ir_log import IRProgressBar
    IRLog.get_instance().start_log(True)
    title = 'ProgressBar Output Not Verbose'
    quiet_bar = IRProgressBar(1000, title, False, 0, 1)
    assert quiet_bar is not None
    for step in range(0, 1001):
        quiet_bar.set_value(step)
    title = 'ProgressBar Output Verbose'
    verbose_bar = IRProgressBar(1000, title, True, 1, 0)
    assert verbose_bar is not None
    for step in range(0, 1001):
        verbose_bar.set_value(step)
    IRLog.get_instance().start_log()
def parse_info_level0(cls, info_level0_filename = None):
    """Generate duplicate group database from info level0.

    Args:
        info_level0_filename: str, If not given, the parameter will be
            loaded from config file.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    # max_group_id is the next fresh group id to hand out
    max_group_id = 0
    bug2group = {}          # bug_id -> group_id
    group2bug = {}          # group_id -> [bug_id]
    incomplete_bug = []     # bugs whose resolution is INCOMPLETE
    cur_bug = -1
    is_cur_incomplete = False
    if None == info_level0_filename:
        from ir_config import IRConfig
        info_level0_filename = IRConfig.get_instance(). \
                get('bug_info_level0_filename')
    in_file = open(info_level0_filename, 'r')
    # count the lines first so the progress bar has a total
    IRLog.get_instance().println('Counting line number of info level0')
    line_count = sum(1 for line in in_file)
    in_file.seek(0)
    progress_bar = IRProgressBar(line_count, 'Read info level0',
                                 False, 0, 1)
    line_num = 0
    # The lines may contain useful information: bug_id, resolution and
    # dup_id.
    #   bug_id:     the current bug
    #   resolution: the resolution of the current bug
    #   dup_id:     the duplicate of the current bug
    # Strategy:
    #   1. drop when resolution is INCOMPLETE
    #   2. on a <dup_id> line:
    #      (1) if both dup_id and cur_bug are in no group, assign a new
    #          group id for them
    #      (2) if only one of dup_id or cur_bug is in a group, assign
    #          that group id to the other
    #      (3) if both are in (different) groups, merge the groups
    for line in in_file:
        line_num += 1
        progress_bar.set_value(line_num)
        line = line.strip()
        if line.startswith('<bug_id>'):
            cur_bug = int(cls.__get_contain(line))
            is_cur_incomplete = False
        elif line.startswith('<resolution>INCOMPLETE'):
            is_cur_incomplete = True
            incomplete_bug.append(cur_bug)
        elif line.startswith('<dup_id>'):
            if is_cur_incomplete:
                # ignore this one
                continue
            dup_bug = int(cls.__get_contain(line))
            # -1 means "not in any group yet"
            cur_bug_group = -1
            dup_bug_group = -1
            if cur_bug in bug2group:
                cur_bug_group = bug2group[cur_bug]
            if dup_bug in bug2group:
                dup_bug_group = bug2group[dup_bug]
            if cur_bug_group == -1 and dup_bug_group == -1:
                # (1) assign a new group id
                group_id = max_group_id
                max_group_id += 1
                bug2group[cur_bug] = group_id
                bug2group[dup_bug] = group_id
                group2bug[group_id] = [cur_bug, dup_bug]
            elif cur_bug_group != -1 and dup_bug_group != -1 \
                    and cur_bug_group != dup_bug_group:
                # (3) merge small group to the large
                conserve_group = cur_bug_group
                remove_group = dup_bug_group
                if group2bug[cur_bug_group].__len__() < \
                        group2bug[dup_bug_group].__len__():
                    conserve_group = dup_bug_group
                    remove_group = cur_bug_group
                for bug in group2bug[remove_group]:
                    bug2group[bug] = conserve_group
                group2bug[conserve_group].extend(
                        group2bug.pop(remove_group))
            else:
                # (2) assign the group id of whichever bug has one
                if cur_bug_group == -1:
                    group2bug[dup_bug_group].append(cur_bug)
                    bug2group[cur_bug] = dup_bug_group
                else:
                    group2bug[cur_bug_group].append(dup_bug)
                    bug2group[dup_bug] = cur_bug_group
    in_file.close()
    # remove incomplete bugs from the maps; drop groups emptied by this
    for bug in incomplete_bug:
        if bug in bug2group:
            group = bug2group[bug]
            bug2group.__delitem__(bug)
            group2bug[group].remove(bug)
            if group2bug[group].__len__() == 0:
                del group2bug[group]
    cls.__store_to_mongodb(bug2group, group2bug)
def parse_info_level1(cls, info_level1_filename=None):
    """Extract report text from an info level1 file and insert into mongodb.

    Each line yields one report; INCOMPLETE reports and reports whose
    description contains an unrecognizable stacktrace are dropped. Text
    fields go to the text collection, basic fields (create timestamp,
    product) to the basic collection; both are indexed on bug id.

    Args:
        info_level1_filename: str, filename of info level1. If this
            parameter is not given, 'bug_info_level1_filename' will be
            fetched from the config file.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    # NOTE(review): the original imported pymongo and IRSTTools here but
    # never used either name in this method; both removed.
    # get config (single instance lookup instead of one per key)
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name', 'bug_id')
    summary_name = config.get('bug_summary_name', 'summ')
    description_name = config.get('bug_description_name', 'desc')
    stacktrace_name = config.get('bug_stacktrace_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')
    # collections, opened for writing
    collection = IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'w')
    collection_basic = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'w')
    community_name = config.get('community')
    # load and insert text file
    if info_level1_filename is None:
        info_level1_filename = config.get('bug_info_level1_filename')

    def func_each_line(line):
        # One report per line; guard clauses keep the happy path flat.
        bug_id, summary, description, resolution, create_ts, product = \
                cls.__extract_information_from_info_level1_line(line)
        if resolution is None or resolution == "INCOMPLETE":
            return
        # post process description: split off the stacktrace part
        description, stacktrace = \
                cls.extract_raw_description_info(description, community_name)
        # drop the report whose description contains an unusable stacktrace
        if cls.is_drop_report(description):
            from ir_log import IRLog
            IRLog.get_instance().println('Drop report#=%d because it '
                    'contains unrecognizable stacktrace.' % bug_id, 3)
            return
        collection.insert({
            bug_id_name: bug_id,
            summary_name: summary,
            description_name: description,
            stacktrace_name: stacktrace})
        collection_basic.insert({
            bug_id_name: bug_id,
            create_ts_name: create_ts,
            product_name: product})

    # 'with' guarantees the file is closed even if a line handler raises.
    with open(info_level1_filename, 'r') as in_file:
        IRProgressBar.execute_iteration_for_file(in_file, func_each_line,
                                                 "Parsing Infolevel 1")
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection_basic.create_index([
            (bug_id_name, IRCollection.ASCENDING),
            (create_ts_name, IRCollection.ASCENDING),
            (product_name, IRCollection.ASCENDING)])
    collection.close()
    collection_basic.close()
def parse_info_level0(cls, info_level0_filename=None):
    """Build the duplicate-group database from an info level0 file.

    Args:
        info_level0_filename: str, If not given, the parameter will be
            loaded from config file.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar

    if info_level0_filename is None:
        from ir_config import IRConfig
        info_level0_filename = \
                IRConfig.get_instance().get('bug_info_level0_filename')

    next_group_id = 0
    group_of = {}     # bug id -> group id
    members_of = {}   # group id -> [bug ids]
    incomplete = []   # bugs seen with resolution INCOMPLETE
    current_bug = -1
    current_is_incomplete = False

    in_file = open(info_level0_filename, 'r')
    # First sweep just counts lines so the progress bar knows its total.
    IRLog.get_instance().println('Counting line number of info level0')
    total_lines = sum(1 for _ in in_file)
    in_file.seek(0)
    bar = IRProgressBar(total_lines, 'Read info level0', False, 0, 1)
    # Relevant tags per line: <bug_id>, <resolution>, <dup_id>.
    # Grouping strategy:
    #   * skip duplicate links of INCOMPLETE reports
    #   * neither bug grouped yet  -> open a fresh group for the pair
    #   * exactly one grouped      -> pull the other into that group
    #   * both grouped differently -> merge the smaller into the larger
    for number, raw in enumerate(in_file, 1):
        bar.set_value(number)
        tag_line = raw.strip()
        if tag_line.startswith('<bug_id>'):
            current_bug = int(cls.__get_contain(tag_line))
            current_is_incomplete = False
        elif tag_line.startswith('<resolution>INCOMPLETE'):
            current_is_incomplete = True
            incomplete.append(current_bug)
        elif tag_line.startswith('<dup_id>'):
            if current_is_incomplete:
                continue  # duplicates of INCOMPLETE bugs are ignored
            partner = int(cls.__get_contain(tag_line))
            group_a = group_of.get(current_bug, -1)
            group_b = group_of.get(partner, -1)
            if group_a == -1 and group_b == -1:
                # neither grouped: open a fresh group for the pair
                fresh = next_group_id
                next_group_id += 1
                group_of[current_bug] = fresh
                group_of[partner] = fresh
                members_of[fresh] = [current_bug, partner]
            elif group_a != -1 and group_b != -1 and group_a != group_b:
                # both grouped: merge the smaller group into the larger
                keep, drop = group_a, group_b
                if len(members_of[keep]) < len(members_of[drop]):
                    keep, drop = drop, keep
                for member in members_of[drop]:
                    group_of[member] = keep
                members_of[keep].extend(members_of.pop(drop))
            elif group_a == -1:
                # only the partner is grouped: join its group
                members_of[group_b].append(current_bug)
                group_of[current_bug] = group_b
            else:
                # only the current bug is grouped: pull the partner in
                members_of[group_a].append(partner)
                group_of[partner] = group_a
    in_file.close()
    # Strip INCOMPLETE bugs out of any group they landed in.
    for bug in incomplete:
        if bug in group_of:
            gid = group_of.pop(bug)
            members_of[gid].remove(bug)
            if len(members_of[gid]) == 0:
                del members_of[gid]
    cls.__store_to_mongodb(group_of, members_of)
def parse_info_level1(cls, info_level1_filename=None):
    """Extract text and insert into mongo db.

    info_level1_filename: str, Filename of info level1. If this
        parameter is not given, bug_info_level1_filename will be
        fetched from config file.
    """
    import pymongo
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_gnome_st_tools import IRSTTools

    cfg = IRConfig.get_instance()
    # Field names come from the config; the core ones have fallbacks.
    bug_id_name = cfg.get('bug_id_name', 'bug_id')
    summary_name = cfg.get('bug_summary_name', 'summ')
    description_name = cfg.get('bug_description_name', 'desc')
    stacktrace_name = cfg.get('bug_stacktrace_name')
    create_ts_name = cfg.get('bug_create_ts_name')
    product_name = cfg.get('bug_product_name')
    # Target collections, opened for writing.
    collection = IRCollection('bug_db_name',
                              'bug_text_collection_name', 'w')
    collection_basic = IRCollection('bug_db_name',
                                    'bug_basic_collection_name', 'w')
    community_name = cfg.get('community')
    # Resolve the input filename, then stream it line by line.
    if info_level1_filename is None:
        info_level1_filename = cfg.get('bug_info_level1_filename')
    in_file = open(info_level1_filename, 'r')

    def handle_line(line):
        # One report per line; bail out early on unusable reports.
        bug_id, summary, description, resolution, create_ts, product = \
                cls.__extract_information_from_info_level1_line(line)
        if resolution is None or resolution == "INCOMPLETE":
            return
        # Split the stacktrace out of the raw description.
        description, stacktrace = \
                cls.extract_raw_description_info(description,
                                                 community_name)
        # Drop reports whose description still contains stacktrace junk.
        if cls.is_drop_report(description):
            from ir_log import IRLog
            IRLog.get_instance().println('Drop report#=%d because it '
                    'contains unrecognizable stacktrace.' % bug_id, 3)
            return
        collection.insert({bug_id_name: bug_id,
                           summary_name: summary,
                           description_name: description,
                           stacktrace_name: stacktrace})
        collection_basic.insert({bug_id_name: bug_id,
                                 create_ts_name: create_ts,
                                 product_name: product})

    IRProgressBar.execute_iteration_for_file(in_file, handle_line,
                                             "Parsing Infolevel 1")
    in_file.close()
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection_basic.create_index([(bug_id_name, IRCollection.ASCENDING),
                                   (create_ts_name, IRCollection.ASCENDING),
                                   (product_name, IRCollection.ASCENDING)])
    collection.close()
    collection_basic.close()