def Write(self, outfile):
    """Serialize the entity cache as XML into an open file object.

    ``Fortify()`` is invoked first; the element returned by
    ``self.history.ToElement()`` is then wrapped in an ``ElementTree`` and
    written to ``outfile``.
    """
    self.Fortify()
    root = self.history.ToElement()
    ElementTree(root).write(outfile)
Beispiel #2
0
def gencix(major, minor):
    """Generate ``perl-<major>.<minor>.cix`` in the current directory.

    Steps performed:
      1. Run ``ci2.py scan`` over ``/tmp/ActivePerl-<major>.<minor>/perl/lib``,
         redirecting its output to ``activeperl-<major>.<minor>.cix``.
      2. Collect every child of each scanned ``file`` element (with a
         non-empty ``src`` attribute removed) under one new
         ``<file lang="Perl" path="perl.cix">`` element.
      3. Call ``genPerlStdCIX`` with that element and the perlfunc.pod path.
      4. Remove ``variable`` elements whose ``attributes`` value contains
         ``__local__``.
      5. Prettify the tree and write it, after an XML declaration, to
         ``perl-<major>.<minor>.cix``.

    ``major`` and ``minor`` are formatted with ``%d`` and so must be numbers.

    If the scan command fails, a message is printed and the process exits
    with a non-zero status.
    """
    # First generate first pass at the CILE over all of the lib tree
    scan_output = "activeperl-%d.%d.cix" % (major, minor)
    command = "python ../../../ci2.py scan -n -r -p -l Perl -T /tmp/ActivePerl-%d.%d/perl/lib -i \"*.pm\"> %s" % (
        major, minor, scan_output)
    retval = os.system(command)
    if retval != 0:
        print "Error scanning ActivePerl library"
        # On Unix os.system() returns a wait()-encoded status (exit code in
        # the high byte). Passing it straight to sys.exit() can truncate to
        # 0 and report success, so decode it and never exit with 0 here.
        sys.exit((retval >> 8) or 1)
    #
    # Grab the output of that scan

    root = parse(scan_output).getroot()

    newroot = Element("codeintel", version="2.0")
    cixfile = SubElement(newroot,
                         "file",
                         lang="Perl",
                         mtime=str(int(time.time())),
                         path=os.path.basename('perl.cix'))

    for file_elem in root.getiterator('file'):
        print >> sys.stderr, "Processing", file_elem.get('path')
        for blob in file_elem:
            if blob.get("src"):
                # Don't want the src string.
                del blob.attrib["src"]
            cixfile.append(blob)

    # NOTE(review): the first argument is an element and the second a path;
    # confirm this matches the genPerlStdCIX signature in use.
    genPerlStdCIX(
        cixfile,
        "/tmp/ActivePerl-%d.%d/perl/lib/pod/perlfunc.pod" % (major, minor))

    # Elements do not know their parent, so build a child -> parent map.
    parent_map = dict((c, p) for p in cixfile.getiterator() for c in p)
    # Snapshot the matches first: the loop removes elements from the tree.
    for variable in list(newroot.getiterator('variable')):
        attributes = variable.get('attributes')
        if attributes and '__local__' in attributes:
            parent_map[variable].remove(variable)

    # Generate the CIX.
    print >> sys.stderr, "Prettying"
    prettify(newroot)
    tree = ElementTree(newroot)
    #fname = '../../../lib/codeintel2/stdlibs/perl-%d.%d.cix' % (major, minor)
    fname = 'perl-%d.%d.cix' % (major, minor)
    #os.system('p4 edit %s' % fname)
    stream = open(fname, "w")
    try:
        print >> sys.stderr, "Writing"
        stream.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        tree.write(stream)
    finally:
        # Close the output file even if serialization fails.
        stream.close()
Beispiel #3
0
def gencix(major, minor):
    """Generate ``perl-<major>.<minor>.cix`` in the current directory.

    The function shells out to ``ci2.py scan`` over
    ``/tmp/ActivePerl-<major>.<minor>/perl/lib`` (output redirected to
    ``activeperl-<major>.<minor>.cix``), gathers the children of every
    scanned ``file`` element under a single ``<file path="perl.cix">``
    element, calls ``genPerlStdCIX`` with that element and the perlfunc.pod
    path, removes ``variable`` elements whose ``attributes`` value contains
    ``__local__``, then prettifies and writes the result.

    ``major`` and ``minor`` are formatted with ``%d`` and so must be numbers.

    If the scan command fails, a message is printed and the process exits
    with a non-zero status.
    """
    # First generate first pass at the CILE over all of the lib tree
    scan_output = "activeperl-%d.%d.cix" % (major, minor)
    command = "python ../../../ci2.py scan -n -r -p -l Perl -T /tmp/ActivePerl-%d.%d/perl/lib -i \"*.pm\"> %s" % (major, minor, scan_output)
    retval = os.system(command)
    if retval != 0:
        print "Error scanning ActivePerl library"
        # os.system() returns a wait()-encoded status on Unix (exit code in
        # the high byte); handing it to sys.exit() unchanged can wrap to 0.
        # Decode it and guarantee a non-zero exit status.
        sys.exit((retval >> 8) or 1)
    #
    # Grab the output of that scan

    root = parse(scan_output).getroot()

    newroot = Element("codeintel", version="2.0")
    cixfile = SubElement(newroot, "file", lang="Perl",
                         mtime=str(int(time.time())),
                         path=os.path.basename('perl.cix'))

    for file_elem in root.getiterator('file'):
        print >> sys.stderr, "Processing", file_elem.get('path')
        for blob in file_elem:
            if blob.get("src"):
                # Don't want the src string.
                del blob.attrib["src"]
            cixfile.append(blob)

    # NOTE(review): an element and a path are passed here; confirm this
    # matches the genPerlStdCIX signature in use.
    genPerlStdCIX(cixfile, "/tmp/ActivePerl-%d.%d/perl/lib/pod/perlfunc.pod" % (major, minor))

    # Map each element to its parent so matches can be detached below.
    parent_map = dict((c, p) for p in cixfile.getiterator() for c in p)
    # Iterate over a snapshot because the loop mutates the tree.
    for variable in list(newroot.getiterator('variable')):
        attributes = variable.get('attributes')
        if attributes and '__local__' in attributes:
            parent_map[variable].remove(variable)

    # Generate the CIX.
    print >>sys.stderr, "Prettying"
    prettify(newroot)
    tree = ElementTree(newroot)
    #fname = '../../../lib/codeintel2/stdlibs/perl-%d.%d.cix' % (major, minor)
    fname = 'perl-%d.%d.cix' % (major, minor)
    #os.system('p4 edit %s' % fname)
    stream = open(fname, "w")
    try:
        print >>sys.stderr, "Writing"
        stream.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        tree.write(stream)
    finally:
        # Always release the file handle, even when writing raises.
        stream.close()
Beispiel #4
0
    def __call__(self, **kwargs):
        """Return the export of the current context as an XML string.

        An ``IExport`` multi-adapter is looked up for ``(context, request)``.
        When none is found, only the XML declaration is returned and no
        response header is set. Otherwise the adapter's ``body`` is
        serialized after the declaration and the response content type is
        set to ``text/xml``.
        """
        declaration = """<?xml version="1.0" encoding="UTF-8"?>"""
        exporter = queryMultiAdapter((self.context, self.request), IExport)
        if not exporter:
            return declaration

        # The body may not be an ElementTree yet; wrap it so it can be written.
        tree = exporter.body
        if not isinstance(tree, ElementTree):
            tree = ElementTree(tree)

        buf = StringIO()
        tree.write(buf)
        document = declaration + buf.getvalue()

        self.request.response.setHeader('content-type', 'text/xml')
        return document
Beispiel #5
0
def genPerlStdCIX(filename, stream):
    """Write CIX (XML) describing Perl's built-in functions to `stream`.

    The function list is parsed out of the "Alphabetical Listing of Perl
    Functions" section of perlfunc.pod, whose text is obtained by running
    `p4 print -q` on a hard-coded depot path.

    Parameters:
        filename -- only its basename is used, as the `path` attribute of
                    the generated `file` element.
        stream   -- object with a `write` method; receives the XML
                    declaration followed by the serialized tree.

    Raises:
        Error      -- if the status collected from the `p4` command is
                      truthy.
        ValueError -- (from `list.index`) if the section heading is not
                      found in the POD text.

    Nothing is returned.
    """
    log.debug("genPerlStdCIX(filename=%r, stream=%r)", filename, stream)

    root = Element("codeintel", version="2.0")
    cixfile = SubElement(root, "file", lang="Perl",
                         mtime=str(int(time.time())),
                         path=os.path.basename(filename))

    # Process Perl's built-ins out of perlfunc.pod.
    # The `if 1` makes the perforce branch the only one ever taken; the
    # `else` branch (read a local perlfunc.pod) is kept as a manual switch.
    if 1:
        p4path = "//depot/main/Apps/Gecko/src/Core/pod/perlfunc.pod"
        cmd = "p4 print -q %s" % p4path
        i,o,e = os.popen3(cmd)
        lines = o.read().splitlines(0)
        # NOTE(review): confirm that close() on the popen3 stderr handle
        # actually reports the command's exit status; if it always returns
        # None the error check below can never fire.
        i.close(); o.close(); retval = e.close()
        if retval:
            raise Error("error running: %s" % cmd)
    else:
        lines = open("perlfunc.pod", 'r').read().splitlines(0)

    # Parse the "Alphabetical Listing of Perl Functions" into a list of
    # 'blocks' where each block is one command-"=item" block.
    start = lines.index("=head2 Alphabetical Listing of Perl Functions")
    blocks = []
    block = None  # the block currently being accumulated, if any
    level = 0     # current =over/=back nesting depth
    def parseItem(line):
        """Split an "=item SIG" line into (name, sig).

        `sig` is everything after the first whitespace-separated word;
        `name` is `sig` up to the first space, tab, newline, "(" or "/".
        """
        sig = line.split(None, 1)[1]
        name = re.split("[ \t\n(/]", sig, 1)[0]
        return name, sig
    # Note: `i` here rebinds the name used for the popen3 stdin handle above.
    for i, line in enumerate(lines[start:]):
        if line.startswith("=over"):
            level += 1
        if line.startswith("=back"):
            level -= 1
            if level == 0: # done the 'Alphabetical Listing' section
                if block: blocks.append(block)
                break
    
        if level > 1:
            # Inside a nested =over list: keep the line verbatim as part of
            # the current block's text (nested =item lines included).
            if block:
                block["lines"].append(line)
        elif block is None and not line.startswith("=item"):
            # Still before the first =item of the listing.
            continue
        elif block is None and line.startswith("=item"):
            block = {}
            name, sig = parseItem(line)
            block = {
                "name": name,
                "sigs": [sig],
                "lines": []
            }
        elif line.startswith("=item"):
            name, sig = parseItem(line)
            if name == block["name"]:
                # Another signature for the same function.
                block["sigs"].append(sig)
            else:
                # A new function starts: flush the previous block.
                blocks.append(block)
                block = {
                    "name": name,
                    "sigs": [sig],
                    "lines": []
                }
        else:
            if not block["lines"] and not line.strip():
                pass # drop leading empty lines
            elif not line.strip() and block["lines"] and \
               not block["lines"][-1].strip():
                pass # collapse multiple blank lines
            else:
                block["lines"].append(line)
    #pprint(blocks)

    # Process the blocks into a list of command info dicts.
    def podrender(pod):
        """Replace POD inline markup in `pod` with plain-text equivalents.

        F<x> -> x, I<x> -> *x*, C<x> -> x (single-quoted if it contains a
        space), L<x> -> quoted link text (see `linkrepl`).
        """
        rendered = pod
        rendered = re.sub("F<(.*?)>", r"\1", rendered)
        rendered = re.sub("I<(.*?)>", r"*\1*", rendered)
        def quoteifspaced(match):
            # Quote code spans only when they contain a space.
            if ' ' in match.group(1):
                return "'%s'" % match.group(1)
            else:
                return match.group(1)
        rendered = re.sub("C<(.*?)>", quoteifspaced, rendered)
        def linkrepl(match):
            # "page/section" becomes "section in 'page'"; a single leading
            # "/" is stripped first; anything else is just single-quoted.
            content = match.group(1)
            if content.startswith("/"): content = content[1:]
            if "/" in content:
                page, section = content.split("/", 1)
                content = "%s in '%s'" % (section, page)
            else:
                content = "'%s'" % content
            return content
        rendered = re.sub("L<(.*?)>", linkrepl, rendered)
        return rendered

    # These perl built-ins are grouped in perlfunc.pod.
    commands = []
    WIDTH = 60 # desc field width
    syscalls = """
        getpwnam getgrnam gethostbyname getnetbyname getprotobyname 
        getpwuid getgrgid getservbyname gethostbyaddr getnetbyaddr 
        getprotobynumber getservbyport getpwent getgrent gethostent
        getnetent getprotoent getservent setpwent setgrent sethostent 
        setnetent setprotoent setservent endpwent endgrent endhostent
        endnetent endprotoent endservent
    """.split()
    calltip_skips = "sub use require".split()
    # Note: `lines` is rebound per block below, replacing the POD line list.
    for block in blocks:
        name, sigs, lines = block["name"], block["sigs"], block["lines"]
        if name == "-X": # template for -r, -w, -f, ...
            # Each line of the form "    -<char>\t<description>" in the -X
            # block becomes its own command, with "-X" in the signatures
            # replaced by the concrete operator.
            pattern = re.compile(r"^    (-\w)\t(.*)$")
            tlines = [line for line in lines if pattern.match(line)]
            for tline in tlines:
                tname, tdesc = pattern.match(tline).groups()
                tsigs = [s.replace("-X", tname) for s in sigs]
                command = {"name": tname, "sigs": tsigs,
                           "desc": textwrap.fill(tdesc, WIDTH)}
                commands.append(command)
        elif name in ("m", "q", "qq", "qr", "qx", "qw", "s", "tr", "y"):
            # Quote-like operators: signatures and description come from
            # this hand-written table rather than from the POD text. In each
            # entry the lines before the first blank line are signatures and
            # the lines after it form the description.
            operators = {
                "m":  """\
                    m/PATTERN/cgimosx
                    /PATTERN/cgimosx

                    Searches a string for a pattern match, and in scalar
                    context returns true if it succeeds, false if it fails.
                      """,
                "q":  """\
                    q/STRING/
                    'STRING'

                    A single-quoted, literal string.
                      """,
                "qq": """\
                    qq/STRING/
                    "STRING"

                    A double-quoted, interpolated string.
                      """,
                "qr": """\
                    qr/STRING/imosx

                    Quotes (and possibly compiles) STRING as a regular
                    expression.
                      """,
                "qx": """\
                    qx/STRING/
                    `STRING`

                    A string which is (possibly) interpolated and then
                    executed as a system command.
                      """,
                "qw": """\
                    qw/STRING/

                    Evaluates to a list of the words extracted out of STRING,
                    using embedded whitespace as the word delimiters.
                      """,
                "s":  """\
                    s/PATTERN/REPLACEMENT/egimosx

                    Searches a string for a pattern, and if found, replaces
                    that pattern with the replacement text and returns the
                    number of substitutions made. Otherwise it returns the
                    empty string.
                      """,
                "tr": """\
                    tr/SEARCHLIST/REPLACEMENTLIST/cds
                    y/SEARCHLIST/REPLACEMENTLIST/cds

                    Transliterates all occurrences of the characters found in
                    the search list with the corresponding character in the
                    replacement list. It returns the number of characters
                    replaced or deleted.
                      """,
                "y":  """\
                    tr/SEARCHLIST/REPLACEMENTLIST/cds
                    y/SEARCHLIST/REPLACEMENTLIST/cds

                    Transliterates all occurrences of the characters found in
                    the search list with the corresponding character in the
                    replacement list. It returns the number of characters
                    replaced or deleted.
                      """,
            }
            sigs = []
            desclines = None  # becomes a list once the blank separator is seen
            for line in operators[name].splitlines(0):
                if desclines is not None:
                    desclines.append(line.strip())
                elif not line.strip():
                    desclines = []
                else:
                    sigs.append(line.strip())
            command = {"name": name, "sigs": sigs,
                       "desc": textwrap.fill(' '.join(desclines), WIDTH)}
            commands.append(command)
        elif name in syscalls:
            # System-call wrappers share one generic description; the get*
            # functions additionally get a sketch of their return values.
            desc = "Performs the same function as the '%s' system call." % name
            desc = textwrap.fill(desc, WIDTH)
            # Keyed by name prefix; "%s" is filled with the first signature.
            getterListContext = {
                "getpw":    "\n"
                            "  ($name,$passwd,$uid,$gid,$quota,$comment,\n"
                            "   $gcos,$dir,$shell,$expire) = %s",
                "getgr":    "\n  ($name,$passwd,$gid,$members) = %s",
                "gethost":  "\n  ($name,$aliases,$addrtype,$length,@addrs) = %s",
                "getnet":   "\n  ($name,$aliases,$addrtype,$net) = %s",
                "getproto": "\n  ($name,$aliases,$proto) = %s",
                "getserv":  "\n  ($name,$aliases,$port,$proto) = %s",
            }
            # Keyed by full function name; "%s" is filled with the first
            # signature.
            getterScalarContext = {
                "getgrent":         "$name = %s",
                "getgrgid":         "$name = %s",
                "getgrnam":         "$gid = %s",
                "gethostbyaddr":    "$name = %s",
                "gethostbyname":    "$addr = %s",
                "gethostent":       "$name = %s",
                "getnetbyaddr":     "$name = %s",
                "getnetbyname":     "$net = %s",
                "getnetent":        "$name = %s",
                "getprotobyname":   "$num = %s",
                "getprotobynumber": "$name = %s",
                "getprotoent":      "$name = %s",
                "getpwent":         "$name = %s",
                "getpwnam":         "$uid = %s",
                "getpwuid":         "$name = %s",
                "getservbyname":    "$num = %s",
                "getservbyport":    "$name = %s",
                "getservent":       "$name = %s",
            }
            # Every prefix starts with "get", so the set*/end* functions
            # keep only the generic description.
            for prefix, template in getterListContext.items():
                if name.startswith(prefix):
                    desc += template % sigs[0]
                    if name in getterScalarContext:
                        desc += "\nin list context or:\n  "\
                                + getterScalarContext[name] % sigs[0]
            command = {"name": name, "desc": desc, "sigs": sigs}
            commands.append(command)
        elif name == "shmread":
            # Hand-written description; signatures still come from the POD.
            desc = """\
                Reads the System V shared memory segment ID
                starting at position POS for size SIZE by attaching to it,
                copying out, and detaching from it.
            """
            desc = ' '.join([ln.strip() for ln in desc.splitlines(0)])
            command = {"name": name, "sigs": sigs,
                       "desc": textwrap.fill(desc, WIDTH)}
            commands.append(command)
        elif name == "shmwrite":
            # Hand-written description; signatures still come from the POD.
            desc = """\
                Writes the System V shared memory segment ID
                starting at position POS for size SIZE by attaching to it,
                copying in, and detaching from it.
            """
            desc = ' '.join([ln.strip() for ln in desc.splitlines(0)])
            command = {"name": name, "sigs": sigs,
                       "desc": textwrap.fill(desc, WIDTH)}
            commands.append(command)
        elif name in calltip_skips:
            continue # just drop the sub calltip: annoying
        else:
            # Parsing the description from the full description:
            # Pull out the first sentence up to a maximum of three lines
            # and one paragraph. If the first *two* sentences fit on the
            # first line, then use both.
            # (The three-line cap itself is applied further down, when the
            # `doc` attribute is generated.)
            desc = ""
            # A "sentence" is a run of non-period characters ending in
            # ". " or in a period at the end of the line.
            sentencePat = re.compile(r"([^\.]+(?:\. |\.$))")
            if name in ("dbmclose", "dbmopen"):
                # Skip the first paragraph: "[This function...superceded by"
                lines = lines[lines.index('')+1:]
            elif name == "do":
                # Skip the first sentence: "Not really a function."
                end = sentencePat.match(lines[0]).span()[1]
                lines[0] = lines[0][end:].lstrip()
            for i, line in enumerate(lines):
                # A blank line ends the first paragraph.
                if not line.strip(): break
                sentences = sentencePat.findall(line)
                if not sentences:
                    # No sentence ends on this line: accumulate it and keep
                    # reading.
                    desc += line + ' '
                    continue
                elif i == 0 and len(sentences) > 1:
                    desc += ' '.join([s.strip() for s in sentences[:2]])
                else:
                    desc += sentences[0].strip()
                break
            command = {"name": name, "sigs": sigs,
                       "desc": textwrap.fill(podrender(desc), WIDTH)}
            commands.append(command)
    #for command in commands:
    #    print
    #    print banner(command["name"], '-')
    #    print '\n'.join(command["sigs"])
    #    print
    #    print command["desc"]
    
    # Generate the CIX for each function.
    module_elt = SubElement(cixfile, "scope", ilk="blob", name="*") # "built-ins" module
    for command in commands:
        name, sigs, desc = command["name"], command["sigs"], command["desc"]
        func_elt = SubElement(module_elt, "scope", ilk="function", name=name)
        if sigs:
            func_elt.set("signature", '\n'.join(sigs))
        if desc:
            # Keep at most the first three lines of the wrapped description.
            doclines = desc.split('\n')[:3]
            #doclines = parseDocSummary(doclines)
            doc = '\n'.join(doclines)
            func_elt.set("doc", doc)

    # Generate the CIX.
    prettify(root)
    tree = ElementTree(root)
    stream.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    tree.write(stream)
Beispiel #6
0
def main(options, args):
    """Run the preparation steps selected by ``options``.

    ``args`` is not used. The reference lexicon (``options.lexicon``) is
    always loaded; the optional inputs ``options.subliminal_lexicon``,
    ``options.subliminal_g2p`` and ``options.g2pModel`` are loaded when set.
    Each output is produced only when the corresponding attribute is set:

      * ``write_lexicon``   -- extended lexicon (XML)
      * ``write_tokens``    -- list of LM tokens (or phonemes)
      * ``write_events``    -- sequence model events read from ``options.text``
      * ``write_counts``    -- sequence model counts read from ``options.text``
      * ``write_fragments`` -- OOV words and their fragmentation
      * ``write_lm_text``   -- modified LM training text

    Raises:
        ValueError: if ``options.model_type`` is ``'flat-hybrid'`` and
            ``options.range_type`` is neither ``'fragments'`` nor
            ``'words'`` (only checked when events or counts are requested).
    """
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = {orth for orth, phon in lexicon}

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None

    # NOTE: unpickling executes arbitrary code from the file; only load
    # model files from trusted sources.
    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        # Pickle data is binary: open in 'rb' mode and close the handle.
        with open(options.subliminal_g2p, 'rb') as modelFile:
            subliminalG2p = pickle.load(modelFile)
    else:
        subliminalG2p = None

    if options.g2pModel:
        print('loading g2p model ...')
        with open(options.g2pModel, 'rb') as modelFile:
            model = pickle.load(modelFile)
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))

        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        vocabulary.add(filter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            phonemes.discard('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set-up LM event generator
    # NOTE(review): for any other model_type ``events`` stays unbound and
    # steps 5 and 6 below would fail with a NameError.
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                # ``assert ValueError(...)`` can never fail (an exception
                # instance is truthy); an invalid range type must be raised.
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in fragments.keys():
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(" ".join(modWords), file=f)
Beispiel #7
0
# Demo script: parse an Atom document, add a link to its first entry, run a
# few lookups and print the re-serialized result.
# NOTE(review): `content` is defined outside this excerpt; presumably it holds
# the Atom feed as a string -- confirm.

# Parse the same text twice: once into an ElementTree (via a file-like
# wrapper) and once directly into an element with XML().
etree = ElementTree(file=StringIO.StringIO(content))
feed = XML(content)

print etree
print feed

#print len(feed)
#print feed[0]
#print feed.keys()

# Namespace URI used to build fully-qualified "{namespace}tag" names.
ATOM = "http://www.w3.org/2005/Atom"

# Take the first Atom <entry> (IndexError if there is none) and append a new
# <link rel="source" href="..."> child to it.
entry = etree.getiterator('{%s}entry'%ATOM)[0]
new_lin = SubElement(entry, '{%s}link'%ATOM)
new_lin.set('rel', 'source')
new_lin.set('href', 'http://somthing.org')

# First <title> match of the tree-level findall (IndexError if none).
title = etree.findall('{%s}title'%ATOM)[0]
print tostring(title)

# Look up a tag named "missing" and print whatever findall returns.
missing = etree.findall('{%s}missing'%ATOM)
print missing

# Print the `rel` attribute of every <link> matched by the "//" path,
# falling back to 'alternate' when the attribute is absent.
for e in etree.findall('//{%s}link'%ATOM):
    print e.get('rel', 'alternate')

# Serialize the (modified) tree into an in-memory buffer and print it.
s = StringIO.StringIO()
etree.write(s)
s.seek(0)
print s.getvalue()
Beispiel #8
0
def main(options, args):
    # 1. load reference lexicon
    print 'loading reference lexicon ...'
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = set([ orth for orth, phon in lexicon ])

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
	print 'loading subliminal lexicon ...'
	subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
	subliminalLexicon = None

    if options.subliminal_g2p:
	print 'loading subliminal g2p model ...'
	subliminalG2p = pickle.load(open(options.subliminal_g2p))
    else:
	subliminalG2p = None

    if options.g2pModel:
	print 'loading g2p model ...'
	model = pickle.load(open(options.g2pModel))
	oldSize, newSize = model.strip()
	print 'stripped number of multigrams from %d to %d' % (oldSize, newSize)

	fragmentizer = Fragmentizer(model)
	if subliminalLexicon:
	    fragmentizer.addSupervised(subliminalLexicon)
	if subliminalG2p:
	    fragmentizer.addSupervised(subliminalG2p)
	graphones = model.sequitur.symbols()
	graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
	model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
	print 'creating extended lexicon ...'
	xmlLexicon = ElementTree(file = options.lexicon)
	if options.model_type == 'phonemes':
	    changeSyntaticToPhonetic(xmlLexicon)
	else:
	    addGraphonesToLexicon(xmlLexicon, graphones)
	xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
	vocabulary.add(ifilter(isLmToken, knownWords), soft=True)
    if graphones:
	vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
	f = gOpenOut(options.write_tokens, defaultEncoding)
	if options.model_type == 'phonemes':
	    phonemes = set(p for orth, phon in lexicon for p in phon)
	    phonemes.add('#1')
	    if 'si' in phonemes: phonemes.remove('si')
	    for p in sorted(phonemes):
		print >> f, p
	else:
	    for w in vocabulary:
		if w is not None:
		    print >> f, w

    # 5./6. set-up LM event generator
    if options.write_counts or options.write_events:
	order = options.order - 1
	if options.model_type == 'flat-hybrid':
	    events = HybridEventGenerator(knownWords, fragmentizer, order)
	    if options.range_type == 'fragments':
		events.setFragmentRange()
	    elif options.range_type == 'words':
		events.setTrueWordRange()
	    else:
		assert ValueError(options.range_type)
	elif options.model_type == 'fragments':
	    events = OovEventGenerator(knownWords, fragmentizer, order)
	elif options.model_type == 'phonemes':
	    events = PhonemeEventGenerator(lexicon, order)

    # 5. create modified LM training corpus counts
    if options.write_events:
	print 'creating sequence model events ...'
	f = gOpenOut(options.write_events, defaultEncoding)
	for event, count in events(gOpenIn(options.text, defaultEncoding)):
	    print >> f, repr(event), '\t', count

    # 6. count LM events
    if options.write_counts:
	print 'creating sequence model counts ...'
	counts = mGramCounts.SimpleMultifileStorage()
	counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
	mGramCounts.TextStorage.write(gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print 'dumping fragments ...'
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments =  events(gOpenIn(options.text, defaultEncoding))
        for event in fragments.keys():
            print >> f, event, '\t', ' '.join(fragments[event])

    # 8. dump modified LM training text
    if options.write_lm_text:
        print 'dumping modified LM training text ...'
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords =  events.modifyLmText(words)
            print >> f, " ".join(modWords)
Beispiel #9
0
def genPerlStdCIX(filename, stream):
    """Generate the CIX for Perl's built-in functions and write it to `stream`.

    The function list is parsed from the "Alphabetical Listing of Perl
    Functions" section of perlfunc.pod, which is fetched with `p4 print`.
    Each built-in becomes a function scope (with "signature" and "doc"
    attributes when available) inside a single blob scope named "*".

    Parameters:
        filename -- only its basename is used, as the "path" attribute of
            the generated <file> element.
        stream -- writable file-like object; receives an XML declaration
            followed by the serialized tree.

    Raises:
        Error -- if the `p4` command cannot be started or exits non-zero.
        ValueError -- if the "Alphabetical Listing" heading is missing
            from the POD text.
    """
    # Function-scope import so this block does not depend on the file's
    # import header.
    import subprocess

    log.debug("genPerlStdCIX(filename=%r, stream=%r)", filename, stream)

    root = Element("codeintel", version="2.0")
    cixfile = SubElement(root,
                         "file",
                         lang="Perl",
                         mtime=str(int(time.time())),
                         path=os.path.basename(filename))

    # Process Perl's built-ins out of perlfunc.pod.
    if 1:
        p4path = "//depot/main/Apps/Gecko/src/Core/pod/perlfunc.pod"
        cmd = "p4 print -q %s" % p4path
        # os.popen3() gives no access to the child's exit status, so a
        # failing p4 went unnoticed. communicate() drains both pipes (no
        # deadlock on a chatty stderr) and makes returncode available.
        try:
            proc = subprocess.Popen(["p4", "print", "-q", p4path],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    universal_newlines=True)
            output, _errors = proc.communicate()
        except OSError:
            raise Error("error running: %s" % cmd)
        if proc.returncode:
            raise Error("error running: %s" % cmd)
        lines = output.splitlines()
    else:
        # Developer fallback: read a local copy instead of asking p4.
        podfile = open("perlfunc.pod", 'r')
        try:
            lines = podfile.read().splitlines()
        finally:
            podfile.close()

    # Parse the "Alphabetical Listing of Perl Functions" into a list of
    # 'blocks' where each block is one command-"=item" block.
    start = lines.index("=head2 Alphabetical Listing of Perl Functions")
    blocks = []
    block = None
    level = 0  # current =over/=back nesting depth

    def parseItem(line):
        """Return (name, signature) for an "=item ..." line."""
        sig = line.split(None, 1)[1]
        name = re.split("[ \t\n(/]", sig, 1)[0]
        return name, sig

    for line in lines[start:]:
        if line.startswith("=over"):
            level += 1
        if line.startswith("=back"):
            level -= 1
            if level == 0:  # done the 'Alphabetical Listing' section
                if block:
                    blocks.append(block)
                break

        if level > 1:
            # Nested list inside a function's description: keep verbatim.
            if block:
                block["lines"].append(line)
        elif block is None:
            # Everything before the first "=item" is ignored.
            if line.startswith("=item"):
                name, sig = parseItem(line)
                block = {"name": name, "sigs": [sig], "lines": []}
        elif line.startswith("=item"):
            name, sig = parseItem(line)
            if name == block["name"]:
                # Another calling form of the same function.
                block["sigs"].append(sig)
            else:
                blocks.append(block)
                block = {"name": name, "sigs": [sig], "lines": []}
        else:
            if not block["lines"] and not line.strip():
                pass  # drop leading empty lines
            elif not line.strip() and block["lines"] and \
                    not block["lines"][-1].strip():
                pass  # collapse multiple blank lines
            else:
                block["lines"].append(line)
    # pprint(blocks)

    # Process the blocks into a list of command info dicts.
    def podrender(pod):
        """Flatten the F<>, I<>, C<> and L<> POD markup in `pod` to text."""
        rendered = pod
        rendered = re.sub("F<(.*?)>", r"\1", rendered)
        rendered = re.sub("I<(.*?)>", r"*\1*", rendered)

        def quoteifspaced(match):
            if ' ' in match.group(1):
                return "'%s'" % match.group(1)
            else:
                return match.group(1)

        rendered = re.sub("C<(.*?)>", quoteifspaced, rendered)

        def linkrepl(match):
            content = match.group(1)
            if content.startswith("/"):
                content = content[1:]
            if "/" in content:
                page, section = content.split("/", 1)
                content = "%s in '%s'" % (section, page)
            else:
                content = "'%s'" % content
            return content

        rendered = re.sub("L<(.*?)>", linkrepl, rendered)
        return rendered

    # These perl built-ins are grouped in perlfunc.pod.
    commands = []
    WIDTH = 60  # desc field width
    syscalls = """
        getpwnam getgrnam gethostbyname getnetbyname getprotobyname
        getpwuid getgrgid getservbyname gethostbyaddr getnetbyaddr
        getprotobynumber getservbyport getpwent getgrent gethostent
        getnetent getprotoent getservent setpwent setgrent sethostent
        setnetent setprotoent setservent endpwent endgrent endhostent
        endnetent endprotoent endservent
    """.split()
    calltip_skips = "sub use require".split()

    # The tables and patterns below do not depend on the block being
    # processed, so they are built once instead of once per block.

    # Matches one file-test line of the "-X" template: "    -r<TAB>desc".
    filetestPat = re.compile(r"^    (-\w)\t(.*)$")
    sentencePat = re.compile(r"([^\.]+(?:\. |\.$))")

    # Hand-written calltips for the quote-like operators. Layout of each
    # entry: signature lines, one blank line, then description lines.
    operators = {
        "m":
        """\
                    m/PATTERN/cgimosx
                    /PATTERN/cgimosx

                    Searches a string for a pattern match, and in scalar
                    context returns true if it succeeds, false if it fails.
                      """,
        "q":
        """\
                    q/STRING/
                    'STRING'

                    A single-quoted, literal string.
                      """,
        "qq":
        """\
                    qq/STRING/
                    "STRING"

                    A double-quoted, interpolated string.
                      """,
        "qr":
        """\
                    qr/STRING/imosx

                    Quotes (and possibly compiles) STRING as a regular
                    expression.
                      """,
        "qx":
        """\
                    qx/STRING/
                    `STRING`

                    A string which is (possibly) interpolated and then
                    executed as a system command.
                      """,
        "qw":
        """\
                    qw/STRING/

                    Evaluates to a list of the words extracted out of STRING,
                    using embedded whitespace as the word delimiters.
                      """,
        "s":
        """\
                    s/PATTERN/REPLACEMENT/egimosx

                    Searches a string for a pattern, and if found, replaces
                    that pattern with the replacement text and returns the
                    number of substitutions made. Otherwise it returns the
                    empty string.
                      """,
        "tr":
        """\
                    tr/SEARCHLIST/REPLACEMENTLIST/cds
                    y/SEARCHLIST/REPLACEMENTLIST/cds

                    Transliterates all occurrences of the characters found in
                    the search list with the corresponding character in the
                    replacement list. It returns the number of characters
                    replaced or deleted.
                      """,
        "y":
        """\
                    tr/SEARCHLIST/REPLACEMENTLIST/cds
                    y/SEARCHLIST/REPLACEMENTLIST/cds

                    Transliterates all occurrences of the characters found in
                    the search list with the corresponding character in the
                    replacement list. It returns the number of characters
                    replaced or deleted.
                      """,
    }

    # Return-value templates appended to the description of the get*
    # system-call wrappers; "%s" is filled with the first signature.
    getterListContext = {
        "getpw": "\n"
        "  ($name,$passwd,$uid,$gid,$quota,$comment,\n"
        "   $gcos,$dir,$shell,$expire) = %s",
        "getgr": "\n  ($name,$passwd,$gid,$members) = %s",
        "gethost":
        "\n  ($name,$aliases,$addrtype,$length,@addrs) = %s",
        "getnet": "\n  ($name,$aliases,$addrtype,$net) = %s",
        "getproto": "\n  ($name,$aliases,$proto) = %s",
        "getserv": "\n  ($name,$aliases,$port,$proto) = %s",
    }
    getterScalarContext = {
        "getgrent": "$name = %s",
        "getgrgid": "$name = %s",
        "getgrnam": "$gid = %s",
        "gethostbyaddr": "$name = %s",
        "gethostbyname": "$addr = %s",
        "gethostent": "$name = %s",
        "getnetbyaddr": "$name = %s",
        "getnetbyname": "$net = %s",
        "getnetent": "$name = %s",
        "getprotobyname": "$num = %s",
        "getprotobynumber": "$name = %s",
        "getprotoent": "$name = %s",
        "getpwent": "$name = %s",
        "getpwnam": "$uid = %s",
        "getpwuid": "$name = %s",
        "getservbyname": "$num = %s",
        "getservbyport": "$name = %s",
        "getservent": "$name = %s",
    }

    for block in blocks:
        name, sigs, lines = block["name"], block["sigs"], block["lines"]
        if name == "-X":  # template for -r, -w, -f, ...
            # Expand the template into one command per file-test operator.
            for tline in lines:
                tmatch = filetestPat.match(tline)
                if not tmatch:
                    continue
                tname, tdesc = tmatch.groups()
                tsigs = [s.replace("-X", tname) for s in sigs]
                command = {
                    "name": tname,
                    "sigs": tsigs,
                    "desc": textwrap.fill(tdesc, WIDTH)
                }
                commands.append(command)
        elif name in operators:
            sigs = []
            desclines = None
            for line in operators[name].splitlines():
                if desclines is not None:
                    desclines.append(line.strip())
                elif not line.strip():
                    # First blank line ends the signatures.
                    desclines = []
                else:
                    sigs.append(line.strip())
            command = {
                "name": name,
                "sigs": sigs,
                "desc": textwrap.fill(' '.join(desclines), WIDTH)
            }
            commands.append(command)
        elif name in syscalls:
            desc = "Performs the same function as the '%s' system call." % name
            desc = textwrap.fill(desc, WIDTH)
            for prefix, template in getterListContext.items():
                if name.startswith(prefix):
                    desc += template % sigs[0]
                    if name in getterScalarContext:
                        desc += "\nin list context or:\n  "\
                                + getterScalarContext[name] % sigs[0]
            command = {"name": name, "desc": desc, "sigs": sigs}
            commands.append(command)
        elif name == "shmread":
            desc = """\
                Reads the System V shared memory segment ID
                starting at position POS for size SIZE by attaching to it,
                copying out, and detaching from it.
            """
            desc = ' '.join([ln.strip() for ln in desc.splitlines()])
            command = {
                "name": name,
                "sigs": sigs,
                "desc": textwrap.fill(desc, WIDTH)
            }
            commands.append(command)
        elif name == "shmwrite":
            desc = """\
                Writes the System V shared memory segment ID
                starting at position POS for size SIZE by attaching to it,
                copying in, and detaching from it.
            """
            desc = ' '.join([ln.strip() for ln in desc.splitlines()])
            command = {
                "name": name,
                "sigs": sigs,
                "desc": textwrap.fill(desc, WIDTH)
            }
            commands.append(command)
        elif name in calltip_skips:
            continue  # just drop the sub calltip: annoying
        else:
            # Parsing the description from the full description:
            # Pull out the first sentence up to a maximum of three lines
            # and one paragraph. If the first *two* sentences fit on the
            # first line, then use both.
            desc = ""
            if name in ("dbmclose", "dbmopen"):
                # Skip the first paragraph: "[This function...superceded by"
                lines = lines[lines.index('') + 1:]
            elif name == "do":
                # Skip the first sentence: "Not really a function."
                end = sentencePat.match(lines[0]).span()[1]
                lines[0] = lines[0][end:].lstrip()
            for i, line in enumerate(lines):
                if not line.strip():
                    break  # end of the first paragraph
                sentences = sentencePat.findall(line)
                if not sentences:
                    # No sentence ends on this line: keep accumulating.
                    desc += line + ' '
                    continue
                elif i == 0 and len(sentences) > 1:
                    desc += ' '.join([s.strip() for s in sentences[:2]])
                else:
                    desc += sentences[0].strip()
                break
            command = {
                "name": name,
                "sigs": sigs,
                "desc": textwrap.fill(podrender(desc), WIDTH)
            }
            commands.append(command)
    # for command in commands:
    #    print
    #    print banner(command["name"], '-')
    #    print '\n'.join(command["sigs"])
    #    print
    #    print command["desc"]

    # Generate the CIX for each function.
    module_elt = SubElement(cixfile, "scope", ilk="blob",
                            name="*")  # "built-ins" module
    for command in commands:
        name, sigs, desc = command["name"], command["sigs"], command["desc"]
        func_elt = SubElement(module_elt, "scope", ilk="function", name=name)
        if sigs:
            func_elt.set("signature", '\n'.join(sigs))
        if desc:
            # Keep at most the first three lines of the description.
            doclines = desc.split('\n')[:3]
            # doclines = parseDocSummary(doclines)
            doc = '\n'.join(doclines)
            func_elt.set("doc", doc)

    # Generate the CIX.
    prettify(root)
    tree = ElementTree(root)
    stream.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    tree.write(stream)
Beispiel #10
0
                # Add one <person> child per speaker; _p_id is incremented
                # before use and becomes the element's "id" attribute.
                for _p in _person_list:
                    _p_id += 1
                    _person = SubElement(_persons, "person", id="%d" % _p_id)
                    _person.text = _p
            # Create the <links> container under the current talk.
            _links = SubElement(_talk, "links")
            # Add one <link> per _url_dict entry, in sorted key order: the
            # key becomes the link text and the value its href attribute.
            _keys = _url_dict.keys()
            _keys.sort()
            for _k in _keys:
                SubElement(_links, "link", href=_url_dict[_k]).text = _k

# Render the whole tree to a temporary file first; it is read back and
# post-processed below.
_tree = ElementTree(element=_root)
_tree.write("pycon_dst_tmp.xml", encoding="utf-8")

from BeautifulSoup import BeautifulSoup

# NOTE(review): neither handle is closed in the visible part of the script;
# presumably that happens further down -- confirm.
_f_tmp = open("pycon_dst_tmp.xml")
_f_dst = open("pycon_dst.xml", "wb")

# Dummy replacement of unexpected chars: each pair of numeric character
# references below holds the two UTF-8 bytes of an accented letter
# (e.g. 195, 169 == 0xC3 0xA9 == "é") written as separate references,
# and is replaced by the letter itself.
_content = _f_tmp.read()
_content = _content.replace("&#195;&#169;", "é")
_content = _content.replace("&#195;&#174;", "î")
_content = _content.replace("&#195;&#167;", "ç")
_content = _content.replace("&#195;&#161;", "á")
_content = _content.replace("&#195;&#177;", "ñ")
# _content = _content.replace("&#;&#;", "")
et = ElementTree()
    
#transform categories
print "Transforming Categories..."
for cat_fname in os.listdir(cat_path):
    fpath = os.path.join(cat_path, cat_fname)
    et.parse(fpath)
    version = et.getroot().get("version")
    if not version:
        print "\tTransforming %s..." % cat_fname
        root = Element("category",
                        {"version": "1.1",
                         "name": et.find("name").text.strip(),
                         "description": et.find("description").text.strip()})
        et = ElementTree(root)
        et.write(fpath, indent=True)
    elif version == "1.0":
        print "\tTransforming %s..." % cat_fname
        root = Element("category",
                        {"version": "1.1",
                         "name": et.getroot().get("name"),
                         "description": et.getroot().get("description")})
        et = ElementTree(root)
        et.write(fpath, indent=True)
    else:
        print "\tSkipping %s - Not the version this script was written to transform." % cat_fname
    
#transform components
print "Transforming Components..."
for comp_fname in os.listdir(comp_path):
    fpath = os.path.join(comp_path, comp_fname)
Beispiel #12
0
#transform categories
print "Transforming Categories..."
for cat_fname in os.listdir(cat_path):
    fpath = os.path.join(cat_path, cat_fname)
    et.parse(fpath)
    version = et.getroot().get("version")
    if not version:
        print "\tTransforming %s..." % cat_fname
        root = Element(
            "category", {
                "version": "1.1",
                "name": et.find("name").text.strip(),
                "description": et.find("description").text.strip()
            })
        et = ElementTree(root)
        et.write(fpath, indent=True)
    elif version == "1.0":
        print "\tTransforming %s..." % cat_fname
        root = Element(
            "category", {
                "version": "1.1",
                "name": et.getroot().get("name"),
                "description": et.getroot().get("description")
            })
        et = ElementTree(root)
        et.write(fpath, indent=True)
    else:
        print "\tSkipping %s - Not the version this script was written to transform." % cat_fname

#transform components
print "Transforming Components..."
Beispiel #13
0
for vf in votesfiles:
	print vf
	try:
		votetree=ElementTree(file=vf)
		voteroot=votetree.getroot()
		date=voteroot.get('date')
		m=re.match('(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})',date)
		if not m:
			print "internal error in date format"
			sys.exit()
		mgd=m.groupdict()
		mgd.update({'date':date})
		acts=votetree.findall('//royal_assent/act')
		if len(acts)>0:
			assent=Element('assent',mgd)
			for j in range(len(acts)):
				assent.insert(j,acts[j])
			topelement.insert(i,assent)
			i=i+1
	except xml.parsers.expat.ExpatError, errorinst:
		print errorinst
		print "XML parsing error in %s" % vf, sys.exc_info()[0]
	


top=ElementTree(topelement)

top.write('allvotes.xml')

	
def WriteElementToFile(element, fileName):
	
	# wrap it in an ElementTree instance, and save as XML
	tree = ElementTree(element)
	tree.write(fileName, encoding="utf-8")
Beispiel #15
0
topelement = Element('top')
i = 1

for vf in votesfiles:
    print vf
    try:
        votetree = ElementTree(file=vf)
        voteroot = votetree.getroot()
        date = voteroot.get('date')
        m = re.match('(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})', date)
        if not m:
            print "internal error in date format"
            sys.exit()
        mgd = m.groupdict()
        mgd.update({'date': date})
        acts = votetree.findall('//royal_assent/act')
        if len(acts) > 0:
            assent = Element('assent', mgd)
            for j in range(len(acts)):
                assent.insert(j, acts[j])
            topelement.insert(i, assent)
            i = i + 1
    except xml.parsers.expat.ExpatError, errorinst:
        print errorinst
        print "XML parsing error in %s" % vf, sys.exc_info()[0]

top = ElementTree(topelement)

top.write('allvotes.xml')