def identify_bugs_in_commits(input_file, output_file=None, project=None):
    """
    Collect the commits of an xml log in which bug ids were identified.

    Each child of the parsed root is passed, together with project, to
    get_bugs_in_commit. When that yields at least one id, an
    <identified_bugs> element holding one <bug bug_id="..."/> per id is
    appended to the commit, and the commit is appended to a new <log> root.

    Returns the new <log> root when output_file is None. Otherwise the
    pretty-printed tree is handed to store_data_into_file and nothing is
    returned.
    """
    logger.info('Identifying bugs in commits')
    logger.debug('Identifying for project: %s',project)

    # Parse the xml-file that holds the commits
    logger.debug('Loading the file: %s', input_file)
    commits_parser = etree.XMLParser(remove_blank_text=True, huge_tree=True)
    commits_root = etree.parse(input_file, parser=commits_parser).getroot()
    logger.debug('Total number of commits: %s', len(commits_root))

    # Keep only the commits for which at least one bug id was found
    bug_commits_root = etree.Element('log')
    for commit_node in commits_root:
        bug_ids = get_bugs_in_commit(commit_node, project)
        bugs_node = etree.Element('identified_bugs')
        for bug_id in bug_ids:
            bug_node = etree.SubElement(bugs_node, 'bug')
            bug_node.set('bug_id', bug_id)
        if len(bugs_node) == 0:
            # Nothing identified: the commit is left out of the result
            continue
        commit_node.append(bugs_node)
        bug_commits_root.append(commit_node)
    logger.debug('Number of commits with identified-bugs: %s', len(bug_commits_root))

    if output_file is None:
        return bug_commits_root

    serialized_tree = etree.tostring(bug_commits_root, encoding='unicode', pretty_print=True)
    store_data_into_file(serialized_tree, output_file)
def do_main():
    """
    Parse the command line and run the requested actions.

    Options:
      --repo-type {git,hg}     required.
      --clone-repos F DIR      calls clone_git_repositories(F, DIR).
      --extract-commits R OUT  calls get_hg_logs_as_xml(R), serialises the
                               result and passes it to store_data_into_file
                               together with OUT.

    Both actions may be requested in the same invocation; cloning runs first.

    NOTE(review): args.repo_type is parsed but never consulted below. Cloning
    always goes through clone_git_repositories and extraction always goes
    through get_hg_logs_as_xml, whatever type was given. Confirm whether a
    dispatch on the repo type is intended.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--clone-repos",
        metavar=("<file-repos-urls>", "<output-dir>"),
        nargs=2,
        help="Clone the repositories listed in file-repos-urls and store them into the output-dir directory",
    )
    parser.add_argument(
        "--repo-type",
        choices=["git", "hg"],
        required=True,
        # Fixed typo: the help used to read "version-contro".
        help="Specify the type of version-control to work with",
    )
    parser.add_argument(
        "--extract-commits",
        metavar=("<repo-path>", "<output-file>"),
        nargs=2,
        # Fixed: the two concatenated halves used to produce "and and".
        help="Extract the commits of the repository and store it as a xml into output-file",
    )
    args = parser.parse_args()

    # Execute the options
    if args.clone_repos:
        file_repos_urls, output_dir = args.clone_repos
        clone_git_repositories(file_repos_urls, output_dir)

    if args.extract_commits:
        repo_path, output_file = args.extract_commits
        xml_tree = get_hg_logs_as_xml(repo_path)
        tree_as_string = etree.tostring(xml_tree, encoding="unicode", pretty_print=True)
        store_data_into_file(tree_as_string, output_file)
def extract_non_bug_related_commits(input_file, output_file=None, project=None):
    """
    Collect the commits of an xml log that are NOT related to bugs.

    Each child of the parsed root is passed, together with project, to
    commit_is_bug_related. The commits for which it answers falsy are
    appended to a new <log> root.

    Returns the new <log> root when output_file is None. Otherwise the
    pretty-printed tree is handed to store_data_into_file and nothing is
    returned.
    """
    logger.info('Extracting the commits that are NOT related to bugs')
    if project is not None:
        logger.debug('Extracting for project: %s',project)

    # Parse the xml-file that holds the commits
    logger.debug('Loading the file: %s', input_file)
    commits_parser = etree.XMLParser(remove_blank_text=True, huge_tree=True)
    commits_root = etree.parse(input_file, parser=commits_parser).getroot()
    logger.debug('Total number of commits: %s', len(commits_root))

    # Keep only the commits that are not related to bugs
    non_bug_root = etree.Element('log')
    for commit_node in commits_root:
        if commit_is_bug_related(commit_node, project):
            continue
        non_bug_root.append(commit_node)
    logger.debug('Number of NON-bug-related-commits: %s', len(non_bug_root))

    if output_file is None:
        return non_bug_root

    serialized_tree = etree.tostring(non_bug_root, encoding='unicode', pretty_print=True)
    store_data_into_file(serialized_tree, output_file)
def _escape_tag_section(raw_tag, rex_tag_data, tag, new_tag, line_id):
    """
    Return one raw tag-section re-serialised with its content xml-escaped.

    raw_tag is the text of the section (lines joined with a newline),
    rex_tag_data the compiled pattern that splits it into attributes and
    content, and line_id the number of the line on which the section ended
    (only used in error messages).

    Raises Exception when raw_tag does not match the pattern; any error raised
    while escaping the content is logged and re-raised unchanged.
    """
    match = rex_tag_data.search(raw_tag)
    if match is None:
        error_msg = ('Error matching the tag: %s '
                     'at line: %s, with raw-content:\n%s')%(tag, line_id, raw_tag)
        logger.error(error_msg)
        raise Exception(error_msg)
    # Only one alternative of the pattern matches, so at most one of the two
    # attribute groups is set.
    attrs_1, unescaped_content, attrs_2 = match.groups()
    attrs_data = attrs_1 or attrs_2 or ''

    # Convert the raw-content into a valid-xml-content by letting etree
    # serialise it as the text of a throw-away element.
    try:
        unescaped_content = escape_xml_illegal_chars(unescaped_content, ' ')
        dummy_element = etree.Element(new_tag)
        dummy_element.text = unescaped_content
    except Exception:
        error_msg = ('Error escaping the tag: %s '
                     'at line: %s, with content:\n%s')%(tag, line_id, unescaped_content)
        logger.error(error_msg)
        # Bare raise keeps the original traceback.
        raise
    escaped_tag = etree.tounicode(dummy_element)

    # Add the attribute-info back into the opening tag
    if attrs_data.strip() != '':
        attrs_data = escape_xml_illegal_chars(attrs_data, ' ')
        attrs_data = escape(attrs_data)

    return escaped_tag.replace(u'<%s>'%new_tag, u'<%s%s>'%(new_tag, attrs_data))


def escape_tag_from_invalid_xml_file(filename_invalid_xml, tag, new_tag=None, new_filename_xml=None):
    """
    Escape a section (tag) in a invalid xml file.
    Optionally, it renames the section with new_tag

    The file is read line by line as UTF-8. A section starts on a line that
    begins with "<tag" and ends either on that same line (when the closing tag
    follows the opening one) or on the first following line that ends with
    "</tag>". The content of every section is xml-escaped and the section is
    re-emitted as <new_tag ...attributes...>content</new_tag>; all other lines
    are copied through unchanged.

    The result is passed to store_data_into_file together with
    new_filename_xml, which defaults to filename_invalid_xml.

    A section that is still open when the file ends cannot be escaped: its
    lines are kept as they are and a warning is logged.

    Raises Exception when a collected section does not match the tag pattern.
    """
    # Local import so the block does not rely on a module-level import of io.
    import io

    logger.debug('Escaping the tag: "%s" in file: %s', tag, filename_invalid_xml)
    if new_tag is None:
        new_tag = tag
    closing_tag = u'</%s>' % tag
    # re.escape: the tag is a literal name, not a pattern.
    rex_tag_data = re.compile((r'^<{0}(.*?)>(.*?)</{0}>|'
                               r'^<{0}(.*?)>').format(re.escape(tag)),
                              re.S|re.M)

    content_buffer = []
    tag_buffer = []
    line_id = 0
    # io.open decodes to unicode on both Python 2 and 3; newline='\n' keeps
    # lines split on '\n' only, without translating other line endings.
    with io.open(filename_invalid_xml, 'r', encoding='utf-8', newline='\n') as f:
        for line_id, line in enumerate(f, 1):
            line = line.replace(u'\n', u'', 1)

            if not tag_buffer:
                # No section is being collected: does one start here?
                match = rex_tag_data.search(line)
                if match is None:
                    content_buffer.append(line)
                    continue
                tag_buffer.append(line)
                # The content group is only set when the closing tag was
                # found on this same line.
                if match.group(2) is None:
                    continue
            else:
                # Inside a section: collect until a line ends with the
                # closing tag.
                tag_buffer.append(line)
                if not line.endswith(closing_tag):
                    continue

            # The section is complete: escape it right away, so the next line
            # is examined on its own (it may start another section).
            escaped_tag = _escape_tag_section(u'\n'.join(tag_buffer), rex_tag_data,
                                              tag, new_tag, line_id)
            content_buffer.extend(escaped_tag.splitlines())
            tag_buffer = []

    if tag_buffer:
        # Unterminated section at end of file: keep the lines instead of
        # silently dropping them.
        logger.warning('Tag: "%s" is not closed at the end of file: %s; '
                       'keeping its %s line(s) unescaped',
                       tag, filename_invalid_xml, len(tag_buffer))
        content_buffer.extend(tag_buffer)

    # Store it again
    if new_filename_xml is None:
        new_filename_xml = filename_invalid_xml
    store_data_into_file(u'\n'.join(content_buffer), new_filename_xml)