Esempio n. 1
0
def main():
    """Entry point: parse command-line arguments, extract identifier data
    from the XML index, process each downloaded reference page, and write
    the DuckDuckGo fathead output.txt file.
    """
    parser = argparse.ArgumentParser(prog='index2ddg.py')

    parser.add_argument(
        'index',
        type=str,
        help='The path to the XML index containing identifier data')

    parser.add_argument(
        'reference',
        type=str,
        help='The path to the downloaded reference (reference directory in '
        'the downloaded archive)')

    parser.add_argument('output',
                        type=str,
                        help='The path to destination output.txt file')

    parser.add_argument(
        '--split_code_snippets',
        action='store_true',
        default=False,
        help='Puts each declaration into a separate code snippet.')

    parser.add_argument(
        '--max_code_lines',
        type=int,
        default=6,
        help='Maximum number of lines of code to show in abstract')

    parser.add_argument(
        '--max_sentences',
        type=int,
        default=1,
        help='Maximum number of sentences to use for the description')

    parser.add_argument(
        '--max_characters',
        type=int,
        default=200,
        help='Maximum number of characters to use for the description')

    parser.add_argument(
        '--max_paren_chars',
        type=int,
        default=40,
        help='Maximum size of parenthesized text in the description. '
        'Parenthesized chunks longer than that is removed, unless they '
        'are within <code>, <b> or <i> tags')

    parser.add_argument('--debug',
                        action='store_true',
                        default=False,
                        help='Enables debug mode.')

    parser.add_argument(
        '--debug_ident',
        type=str,
        default=None,
        help='Processes only the identifiers that match debug_ident')

    parser.add_argument(
        '--debug_abstracts_path',
        type=str,
        default=None,
        help='Path to print the abstracts before newline stripping occurs')
    args = parser.parse_args()

    # --debug switches the program to debug mode (everything is printed to
    # stdout); --debug_ident restricts processing to the identifiers that
    # match the given string.
    debug = DDGDebug(args.debug, args.debug_ident, args.debug_abstracts_path)

    index_file = args.index
    output_file = args.output

    # a map that stores information about location and type of identifiers;
    # it's a two-level map: full_link -> { full_name -> ITEM_TYPE_* value }
    ident_map = {}

    # get a list of pages to analyze
    tr = Index2DuckDuckGoList(ident_map)
    tr.transform_file(index_file)

    # get a mapping between titles and pages: { title -> filename }
    link_map = build_link_map(args.reference)

    # create a list of processing instructions for each page
    proc_ins = get_processing_instructions(ident_map, link_map)

    # sort proc_ins (and the idents within each page) to produce a
    # deterministically ordered output.txt
    proc_ins = sorted(proc_ins.values(), key=lambda x: x['link'])

    for page in proc_ins:
        page['idents'] = sorted(page['idents'].values(),
                                key=lambda x: x['ident'])

    redirects = []

    # context manager guarantees the output file is flushed and closed even
    # if processing raises (the original left the file handle open)
    with open(output_file, 'w', encoding='utf-8') as out:
        for page in proc_ins:
            idents = page['idents']
            link = page['link']
            fn = page['fn']

            if debug.should_skip_ident([i['ident'] for i in idents]):
                continue

            root = e.parse(os.path.join(args.reference, fn),
                           parser=html.HTMLParser())

            for ident in idents:
                process_identifier(out,
                                   redirects,
                                   root,
                                   link,
                                   ident['ident'],
                                   ident['type'],
                                   args,
                                   debug=debug)

        output_redirects(out, redirects)

    if debug.enabled:
        print('=============================')
        print('Numbers of lines used:')
        for i, l in enumerate(debug.stat_line_nums):
            print(str(i) + ': ' + str(l) + ' result(s)')
Esempio n. 2
0
        IndexTransform.process_item_hook(self, el, full_name, full_link)

# Collect the pages that need to be analyzed.
tr = Index2DuckDuckGoList()
tr.transform(index_file)

# Enumerate the reference pages that exist on disk.
html_files = [
    os.path.join(dirpath, name)
    for dirpath, _dirs, names in os.walk('reference')
    for name in fnmatch.filter(names, '*.html')
]

# Build the mapping between page titles and files.
# link_map = dict { title -> filename }
link_map = build_link_map('reference')

# Group identifiers by destination file: one processing-instruction
# record per page.
proc_ins = {}

for link, link_idents in items.items():
    if link not in link_map:
        continue
    fn = link_map[link]
    page = proc_ins.setdefault(fn, {'fn': fn, 'link': link, 'idents': {}})
    for ident, ident_type in link_idents.items():
        page['idents'][ident] = {'ident': ident, 'type': ident_type}

# Flatten to a list so output.txt can be produced in order.
proc_ins = list(proc_ins.values())

# Collect the pages that need to be analyzed.
tr = Index2DuckDuckGoList()
tr.transform(index_file)

# Enumerate the reference pages that exist on disk.
html_files = []
for dirpath, _dirs, names in os.walk('reference'):
    html_files.extend(os.path.join(dirpath, name)
                      for name in fnmatch.filter(names, '*.html'))

# Build the mapping between page titles and files.
# link_map = dict { title -> filename }
link_map = build_link_map('reference')

# Group identifiers by destination file: one processing-instruction
# record per page.
proc_ins = {}

for link, link_idents in items.items():
    if link not in link_map.mapping:
        continue
    fn = link_map.mapping[link]
    page = proc_ins.setdefault(fn, {'fn': fn, 'link': link, 'idents': {}})
    for ident, ident_type in link_idents.items():
        page['idents'][ident] = {'ident': ident, 'type': ident_type}
def main():
    """Entry point: parse command-line arguments, extract identifier data
    from the XML index, process each downloaded reference page, and write
    the DuckDuckGo fathead output.txt file.
    """
    parser = argparse.ArgumentParser(prog='index2ddg.py')
    parser.add_argument('index', type=str,
                        help='The path to the XML index containing identifier data')
    parser.add_argument('reference', type=str,
                        help=('The path to the downloaded reference (reference '
                              'directory in the downloaded archive)'))
    parser.add_argument('output', type=str,
                        help='The path to destination output.txt file')
    parser.add_argument('--split_code_snippets', action='store_true', default=False,
                        help='Puts each declaration into a separate code snippet.')
    parser.add_argument('--max_code_lines', type=int, default=6,
                        help='Maximum number of lines of code to show in abstract')
    parser.add_argument('--max_sentences', type=int, default=1,
                        help='Maximum number of sentences to use for the description')
    parser.add_argument('--max_characters', type=int, default=200,
                        help='Maximum number of characters to use for the description')
    parser.add_argument('--max_paren_chars', type=int, default=40,
                        help='Maximum size of parenthesized text in the description. '
                        'Parenthesized chunks longer than that is removed, unless '
                        'they are within <code>, <b> or <i> tags')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Enables debug mode.')
    parser.add_argument('--debug_ident', type=str, default=None,
                        help='Processes only the identifiers that match debug_ident')
    parser.add_argument('--debug_abstracts_path', type=str, default=None,
                        help='Path to print the abstracts before newline stripping occurs')
    args = parser.parse_args()

    # --debug switches the program to debug mode (everything is printed to
    # stdout); --debug_ident restricts processing to the identifiers that
    # match the given string.
    debug = DDGDebug(args.debug, args.debug_ident, args.debug_abstracts_path)

    index_file = args.index
    output_file = args.output

    # a map that stores information about location and type of identifiers;
    # it's a two-level map: full_link -> { full_name -> ITEM_TYPE_* value }
    ident_map = {}

    # get a list of pages to analyze
    tr = Index2DuckDuckGoList(ident_map)
    tr.transform(index_file)

    # get a list of existing pages
    html_files = get_html_files(args.reference)

    # get a mapping between titles and pages: { title -> filename }
    link_map = build_link_map(args.reference)

    # create a list of processing instructions for each page
    proc_ins = get_processing_instructions(ident_map, link_map)

    # sort proc_ins (and the idents within each page) to produce a
    # deterministically ordered output.txt
    proc_ins = sorted(proc_ins.values(), key=lambda x: x['link'])

    for page in proc_ins:
        page['idents'] = sorted(page['idents'].values(),
                                key=lambda x: x['ident'])

    redirects = []

    # context manager guarantees the output file is flushed and closed even
    # if processing raises (the original left the file handle open)
    with open(output_file, 'w', encoding='utf-8') as out:
        for page in proc_ins:
            idents = page['idents']
            link = page['link']
            fn = page['fn']

            if debug.should_skip_ident([i['ident'] for i in idents]):
                continue

            root = e.parse(os.path.join(args.reference, fn),
                           parser=html.HTMLParser())

            for ident in idents:
                process_identifier(out, redirects, root, link,
                                   ident['ident'], ident['type'],
                                   args, debug=debug)

        output_redirects(out, redirects)

    if debug.enabled:
        print('=============================')
        print('Numbers of lines used:')
        for i, l in enumerate(debug.stat_line_nums):
            print(str(i) + ': ' + str(l) + ' result(s)')
Esempio n. 5
0
    else:
        print("Loader file " + fn + " does not match any known files")
        sys.exit(1)

    rename_file(root, fn, new_fn)

# Rename filenames that conflict on case-insensitive filesystems.
# TODO: perform this automatically
rename_file('output/reference/en/cpp/numeric/math', 'NAN.html', 'NAN.2.html')
rename_file('output/reference/en/c/numeric/math', 'NAN.html', 'NAN.2.html')

# Clean the FAQ pages.
clean_faq('output')

# Generate the link map while all the needed information is still present.
build_link_map()

# Find the files that need to be preprocessed.
html_files = []
for root, dirnames, filenames in os.walk('output/reference/'):
    for filename in fnmatch.filter(filenames, '*.html'):
        html_files.append(os.path.join(root, filename))

# Temporary fix (disabled):
# r1 = re.compile(r'<style[^<]*?<[^<]*?MediaWiki:Geshi\.css[^<]*?<\/style>', re.MULTILINE)

# Matches src/href attributes so links to files in rename_map can be fixed.
# Raw strings are used for the patterns: the original "<!--(.|\s)*?-->" relied
# on the invalid escape sequence "\s", which raises a SyntaxWarning on
# Python >= 3.12 (the compiled pattern is byte-identical).
rlink = re.compile(r'((?:src|href)=")([^"]*)(")')

# Matches HTML comments, non-greedily, across newlines.
html_comment = re.compile(r"<!--(.|\s)*?-->")